diff --git a/glot-contrastive-final-lora/README.md b/glot-contrastive-final-lora/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9e142ce83a38d62751eee08226da0acc7c10eae5
--- /dev/null
+++ b/glot-contrastive-final-lora/README.md
@@ -0,0 +1,206 @@
+---
+base_model: ./glot-mlm-adapted
+library_name: peft
+tags:
+- base_model:adapter:./glot-mlm-adapted
+- lora
+- transformers
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/glot-contrastive-final-lora/adapter_config.json b/glot-contrastive-final-lora/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..846b0d8ef2ad7bc4b0b04decd5769b6250d7be73
--- /dev/null
+++ b/glot-contrastive-final-lora/adapter_config.json
@@ -0,0 +1,37 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "./glot-mlm-adapted",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "query",
+ "value"
+ ],
+ "target_parameters": null,
+ "task_type": "FEATURE_EXTRACTION",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/glot-contrastive-final-lora/adapter_model.safetensors b/glot-contrastive-final-lora/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..dba4d5dd074dc3d6c4bc4d4f36793beac178e2c3
--- /dev/null
+++ b/glot-contrastive-final-lora/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ba05d9cb007251d29a6f02fdd92f56fa1beb8f9e0676686472daf07c4e9f478
+size 2365824
diff --git a/glot-contrastive-final-lora/checkpoint-1000/README.md b/glot-contrastive-final-lora/checkpoint-1000/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9e142ce83a38d62751eee08226da0acc7c10eae5
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-1000/README.md
@@ -0,0 +1,206 @@
+---
+base_model: ./glot-mlm-adapted
+library_name: peft
+tags:
+- base_model:adapter:./glot-mlm-adapted
+- lora
+- transformers
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/glot-contrastive-final-lora/checkpoint-1000/adapter_config.json b/glot-contrastive-final-lora/checkpoint-1000/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..846b0d8ef2ad7bc4b0b04decd5769b6250d7be73
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-1000/adapter_config.json
@@ -0,0 +1,37 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "./glot-mlm-adapted",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "query",
+ "value"
+ ],
+ "target_parameters": null,
+ "task_type": "FEATURE_EXTRACTION",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/glot-contrastive-final-lora/checkpoint-1000/adapter_model.safetensors b/glot-contrastive-final-lora/checkpoint-1000/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..8c6f622d6f49815caea6fb659ebe020c89f378ea
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-1000/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9a256288c756ea19f50dedef87a2b9786971da4a587298a49f74d6f7686b0572
+size 2365824
diff --git a/glot-contrastive-final-lora/checkpoint-1000/optimizer.pt b/glot-contrastive-final-lora/checkpoint-1000/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..1bc165023ccee4f4afe4d93551ef9e2f43dc826e
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-1000/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:192e794a1d2f26b8d41b4ac6d8d1d65d67318fcf2e777ca7722bceabd58f6fb6
+size 4760395
diff --git a/glot-contrastive-final-lora/checkpoint-1000/rng_state.pth b/glot-contrastive-final-lora/checkpoint-1000/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..3fc18f310321470d8a0ba51339a5c8840edcb27d
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-1000/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dbe2c991b46d5f8c63a3e3c3773a3bf7d45c1bcb99de1418411217d641560e12
+size 14645
diff --git a/glot-contrastive-final-lora/checkpoint-1000/scheduler.pt b/glot-contrastive-final-lora/checkpoint-1000/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..39d88f920629c8004eb7888895c9b25772a0f6f1
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-1000/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce27a0a34a8d4759fb8e422039bad599131740613f03bb839ba3688bec3369a7
+size 1465
diff --git a/glot-contrastive-final-lora/checkpoint-1000/sentencepiece.bpe.model b/glot-contrastive-final-lora/checkpoint-1000/sentencepiece.bpe.model
new file mode 100644
index 0000000000000000000000000000000000000000..41409bcc76133d6e92fdd5909093d24e32662ba0
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-1000/sentencepiece.bpe.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a313a26470baedaede322622492f2a542aa41527ddc5d40de444e945ad3c613
+size 7658320
diff --git a/glot-contrastive-final-lora/checkpoint-1000/special_tokens_map.json b/glot-contrastive-final-lora/checkpoint-1000/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..68171d1ff68b731a33d119708476692c094a466b
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-1000/special_tokens_map.json
@@ -0,0 +1,15 @@
+{
+ "bos_token": "<s>",
+ "cls_token": "<s>",
+ "eos_token": "</s>",
+ "mask_token": {
+ "content": "<mask>",
+ "lstrip": true,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<pad>",
+ "sep_token": "</s>",
+ "unk_token": "<unk>"
+}
diff --git a/glot-contrastive-final-lora/checkpoint-1000/tokenizer_config.json b/glot-contrastive-final-lora/checkpoint-1000/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..2d4a02cfefad7bc6d993905187616acfc547f17d
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-1000/tokenizer_config.json
@@ -0,0 +1,57 @@
+{
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "3": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "401144": {
+ "content": "<mask>",
+ "lstrip": true,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "cls_token": "<s>",
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "mask_token": "<mask>",
+ "model_max_length": 512,
+ "pad_token": "<pad>",
+ "sep_token": "</s>",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "XLMRobertaTokenizer",
+ "unk_token": "<unk>",
+ "use_fast": true
+}
diff --git a/glot-contrastive-final-lora/checkpoint-1000/trainer_state.json b/glot-contrastive-final-lora/checkpoint-1000/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a8ba71f054efc1fce20afc03cc54fb58a0a853e
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-1000/trainer_state.json
@@ -0,0 +1,1434 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 5.714285714285714,
+ "eval_steps": 5,
+ "global_step": 1000,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.02857142857142857,
+ "grad_norm": 0.1407003551721573,
+ "learning_rate": 0.00029965714285714283,
+ "loss": 0.9726,
+ "step": 5
+ },
+ {
+ "epoch": 0.05714285714285714,
+ "grad_norm": 0.26689061522483826,
+ "learning_rate": 0.0002992285714285714,
+ "loss": 0.9633,
+ "step": 10
+ },
+ {
+ "epoch": 0.08571428571428572,
+ "grad_norm": 0.8670485615730286,
+ "learning_rate": 0.0002988,
+ "loss": 0.9013,
+ "step": 15
+ },
+ {
+ "epoch": 0.11428571428571428,
+ "grad_norm": 0.9785467386245728,
+ "learning_rate": 0.00029837142857142853,
+ "loss": 0.6942,
+ "step": 20
+ },
+ {
+ "epoch": 0.14285714285714285,
+ "grad_norm": 1.3083932399749756,
+ "learning_rate": 0.0002979428571428571,
+ "loss": 0.4472,
+ "step": 25
+ },
+ {
+ "epoch": 0.17142857142857143,
+ "grad_norm": 1.6103293895721436,
+ "learning_rate": 0.0002975142857142857,
+ "loss": 0.3782,
+ "step": 30
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 2.6353416442871094,
+ "learning_rate": 0.0002970857142857143,
+ "loss": 0.3732,
+ "step": 35
+ },
+ {
+ "epoch": 0.22857142857142856,
+ "grad_norm": 0.9949072003364563,
+ "learning_rate": 0.0002966571428571428,
+ "loss": 0.3506,
+ "step": 40
+ },
+ {
+ "epoch": 0.2571428571428571,
+ "grad_norm": 1.280673861503601,
+ "learning_rate": 0.0002962285714285714,
+ "loss": 0.3346,
+ "step": 45
+ },
+ {
+ "epoch": 0.2857142857142857,
+ "grad_norm": 0.7681456208229065,
+ "learning_rate": 0.0002958,
+ "loss": 0.2832,
+ "step": 50
+ },
+ {
+ "epoch": 0.3142857142857143,
+ "grad_norm": 1.0000813007354736,
+ "learning_rate": 0.0002953714285714285,
+ "loss": 0.2603,
+ "step": 55
+ },
+ {
+ "epoch": 0.34285714285714286,
+ "grad_norm": 1.0222399234771729,
+ "learning_rate": 0.0002949428571428571,
+ "loss": 0.2507,
+ "step": 60
+ },
+ {
+ "epoch": 0.37142857142857144,
+ "grad_norm": 0.896902322769165,
+ "learning_rate": 0.0002945142857142857,
+ "loss": 0.2556,
+ "step": 65
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 0.9035541415214539,
+ "learning_rate": 0.00029408571428571426,
+ "loss": 0.2402,
+ "step": 70
+ },
+ {
+ "epoch": 0.42857142857142855,
+ "grad_norm": 1.4886469841003418,
+ "learning_rate": 0.00029365714285714285,
+ "loss": 0.2376,
+ "step": 75
+ },
+ {
+ "epoch": 0.45714285714285713,
+ "grad_norm": 0.8951187133789062,
+ "learning_rate": 0.0002932285714285714,
+ "loss": 0.2276,
+ "step": 80
+ },
+ {
+ "epoch": 0.4857142857142857,
+ "grad_norm": 0.7876377105712891,
+ "learning_rate": 0.00029279999999999996,
+ "loss": 0.2537,
+ "step": 85
+ },
+ {
+ "epoch": 0.5142857142857142,
+ "grad_norm": 1.0927226543426514,
+ "learning_rate": 0.00029237142857142855,
+ "loss": 0.2152,
+ "step": 90
+ },
+ {
+ "epoch": 0.5428571428571428,
+ "grad_norm": 1.4946355819702148,
+ "learning_rate": 0.00029194285714285713,
+ "loss": 0.2441,
+ "step": 95
+ },
+ {
+ "epoch": 0.5714285714285714,
+ "grad_norm": 0.7082991600036621,
+ "learning_rate": 0.0002915142857142857,
+ "loss": 0.2708,
+ "step": 100
+ },
+ {
+ "epoch": 0.6,
+ "grad_norm": 0.670010507106781,
+ "learning_rate": 0.00029108571428571424,
+ "loss": 0.2396,
+ "step": 105
+ },
+ {
+ "epoch": 0.6285714285714286,
+ "grad_norm": 0.9797312021255493,
+ "learning_rate": 0.00029065714285714283,
+ "loss": 0.2275,
+ "step": 110
+ },
+ {
+ "epoch": 0.6571428571428571,
+ "grad_norm": 1.5220463275909424,
+ "learning_rate": 0.0002902285714285714,
+ "loss": 0.2114,
+ "step": 115
+ },
+ {
+ "epoch": 0.6857142857142857,
+ "grad_norm": 1.3326867818832397,
+ "learning_rate": 0.00028979999999999994,
+ "loss": 0.241,
+ "step": 120
+ },
+ {
+ "epoch": 0.7142857142857143,
+ "grad_norm": 1.1195529699325562,
+ "learning_rate": 0.0002893714285714285,
+ "loss": 0.2389,
+ "step": 125
+ },
+ {
+ "epoch": 0.7428571428571429,
+ "grad_norm": 0.7551061511039734,
+ "learning_rate": 0.0002889428571428571,
+ "loss": 0.2162,
+ "step": 130
+ },
+ {
+ "epoch": 0.7714285714285715,
+ "grad_norm": 1.018908977508545,
+ "learning_rate": 0.0002885142857142857,
+ "loss": 0.1924,
+ "step": 135
+ },
+ {
+ "epoch": 0.8,
+ "grad_norm": 2.123642921447754,
+ "learning_rate": 0.0002880857142857143,
+ "loss": 0.2174,
+ "step": 140
+ },
+ {
+ "epoch": 0.8285714285714286,
+ "grad_norm": 0.7585068941116333,
+ "learning_rate": 0.0002876571428571428,
+ "loss": 0.2006,
+ "step": 145
+ },
+ {
+ "epoch": 0.8571428571428571,
+ "grad_norm": 1.64150869846344,
+ "learning_rate": 0.0002872285714285714,
+ "loss": 0.1905,
+ "step": 150
+ },
+ {
+ "epoch": 0.8857142857142857,
+ "grad_norm": 0.9126951694488525,
+ "learning_rate": 0.0002868,
+ "loss": 0.2312,
+ "step": 155
+ },
+ {
+ "epoch": 0.9142857142857143,
+ "grad_norm": 0.7278801202774048,
+ "learning_rate": 0.00028637142857142856,
+ "loss": 0.2077,
+ "step": 160
+ },
+ {
+ "epoch": 0.9428571428571428,
+ "grad_norm": 0.8931339383125305,
+ "learning_rate": 0.00028594285714285715,
+ "loss": 0.1951,
+ "step": 165
+ },
+ {
+ "epoch": 0.9714285714285714,
+ "grad_norm": 1.0831843614578247,
+ "learning_rate": 0.0002855142857142857,
+ "loss": 0.2103,
+ "step": 170
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 1.3750063180923462,
+ "learning_rate": 0.00028508571428571426,
+ "loss": 0.2396,
+ "step": 175
+ },
+ {
+ "epoch": 1.0285714285714285,
+ "grad_norm": 0.8338337540626526,
+ "learning_rate": 0.00028465714285714285,
+ "loss": 0.2404,
+ "step": 180
+ },
+ {
+ "epoch": 1.0571428571428572,
+ "grad_norm": 1.2879024744033813,
+ "learning_rate": 0.0002842285714285714,
+ "loss": 0.2117,
+ "step": 185
+ },
+ {
+ "epoch": 1.0857142857142856,
+ "grad_norm": 1.6751821041107178,
+ "learning_rate": 0.00028379999999999996,
+ "loss": 0.1796,
+ "step": 190
+ },
+ {
+ "epoch": 1.1142857142857143,
+ "grad_norm": 0.9864417910575867,
+ "learning_rate": 0.00028337142857142854,
+ "loss": 0.1993,
+ "step": 195
+ },
+ {
+ "epoch": 1.1428571428571428,
+ "grad_norm": 1.0174155235290527,
+ "learning_rate": 0.00028294285714285713,
+ "loss": 0.2068,
+ "step": 200
+ },
+ {
+ "epoch": 1.1714285714285715,
+ "grad_norm": 1.029832124710083,
+ "learning_rate": 0.0002825142857142857,
+ "loss": 0.2015,
+ "step": 205
+ },
+ {
+ "epoch": 1.2,
+ "grad_norm": 0.7745446562767029,
+ "learning_rate": 0.00028208571428571424,
+ "loss": 0.2129,
+ "step": 210
+ },
+ {
+ "epoch": 1.2285714285714286,
+ "grad_norm": 2.5578622817993164,
+ "learning_rate": 0.0002816571428571428,
+ "loss": 0.2224,
+ "step": 215
+ },
+ {
+ "epoch": 1.2571428571428571,
+ "grad_norm": 2.4185051918029785,
+ "learning_rate": 0.0002812285714285714,
+ "loss": 0.2276,
+ "step": 220
+ },
+ {
+ "epoch": 1.2857142857142856,
+ "grad_norm": 1.4176461696624756,
+ "learning_rate": 0.0002808,
+ "loss": 0.1781,
+ "step": 225
+ },
+ {
+ "epoch": 1.3142857142857143,
+ "grad_norm": 0.709326982498169,
+ "learning_rate": 0.0002803714285714286,
+ "loss": 0.2177,
+ "step": 230
+ },
+ {
+ "epoch": 1.342857142857143,
+ "grad_norm": 0.8170766830444336,
+ "learning_rate": 0.0002799428571428571,
+ "loss": 0.1769,
+ "step": 235
+ },
+ {
+ "epoch": 1.3714285714285714,
+ "grad_norm": 1.3850761651992798,
+ "learning_rate": 0.0002795142857142857,
+ "loss": 0.2262,
+ "step": 240
+ },
+ {
+ "epoch": 1.4,
+ "grad_norm": 1.0064373016357422,
+ "learning_rate": 0.0002790857142857143,
+ "loss": 0.196,
+ "step": 245
+ },
+ {
+ "epoch": 1.4285714285714286,
+ "grad_norm": 1.9635728597640991,
+ "learning_rate": 0.0002786571428571428,
+ "loss": 0.2029,
+ "step": 250
+ },
+ {
+ "epoch": 1.457142857142857,
+ "grad_norm": 16.20791244506836,
+ "learning_rate": 0.0002782285714285714,
+ "loss": 0.3925,
+ "step": 255
+ },
+ {
+ "epoch": 1.4857142857142858,
+ "grad_norm": 1.4363322257995605,
+ "learning_rate": 0.0002778,
+ "loss": 0.3684,
+ "step": 260
+ },
+ {
+ "epoch": 1.5142857142857142,
+ "grad_norm": 0.9379534721374512,
+ "learning_rate": 0.00027737142857142856,
+ "loss": 0.2265,
+ "step": 265
+ },
+ {
+ "epoch": 1.5428571428571427,
+ "grad_norm": 0.8453512787818909,
+ "learning_rate": 0.00027694285714285714,
+ "loss": 0.1976,
+ "step": 270
+ },
+ {
+ "epoch": 1.5714285714285714,
+ "grad_norm": 2.316664695739746,
+ "learning_rate": 0.0002765142857142857,
+ "loss": 0.23,
+ "step": 275
+ },
+ {
+ "epoch": 1.6,
+ "grad_norm": 1.0548444986343384,
+ "learning_rate": 0.00027608571428571426,
+ "loss": 0.1823,
+ "step": 280
+ },
+ {
+ "epoch": 1.6285714285714286,
+ "grad_norm": 3.7894928455352783,
+ "learning_rate": 0.00027565714285714284,
+ "loss": 0.1962,
+ "step": 285
+ },
+ {
+ "epoch": 1.657142857142857,
+ "grad_norm": 2.3081610202789307,
+ "learning_rate": 0.00027522857142857143,
+ "loss": 0.2087,
+ "step": 290
+ },
+ {
+ "epoch": 1.6857142857142857,
+ "grad_norm": 0.9311438202857971,
+ "learning_rate": 0.0002748,
+ "loss": 0.1597,
+ "step": 295
+ },
+ {
+ "epoch": 1.7142857142857144,
+ "grad_norm": 1.1881247758865356,
+ "learning_rate": 0.00027437142857142854,
+ "loss": 0.1764,
+ "step": 300
+ },
+ {
+ "epoch": 1.7428571428571429,
+ "grad_norm": 1.30265212059021,
+ "learning_rate": 0.0002739428571428571,
+ "loss": 0.1647,
+ "step": 305
+ },
+ {
+ "epoch": 1.7714285714285714,
+ "grad_norm": 0.6832175850868225,
+ "learning_rate": 0.0002735142857142857,
+ "loss": 0.1638,
+ "step": 310
+ },
+ {
+ "epoch": 1.8,
+ "grad_norm": 1.8740538358688354,
+ "learning_rate": 0.00027308571428571424,
+ "loss": 0.1803,
+ "step": 315
+ },
+ {
+ "epoch": 1.8285714285714287,
+ "grad_norm": 9.821504592895508,
+ "learning_rate": 0.0002726571428571428,
+ "loss": 0.226,
+ "step": 320
+ },
+ {
+ "epoch": 1.8571428571428572,
+ "grad_norm": 1.0889750719070435,
+ "learning_rate": 0.0002722285714285714,
+ "loss": 0.1822,
+ "step": 325
+ },
+ {
+ "epoch": 1.8857142857142857,
+ "grad_norm": 0.9660868048667908,
+ "learning_rate": 0.0002718,
+ "loss": 0.1842,
+ "step": 330
+ },
+ {
+ "epoch": 1.9142857142857141,
+ "grad_norm": 0.6329234838485718,
+ "learning_rate": 0.0002713714285714286,
+ "loss": 0.1488,
+ "step": 335
+ },
+ {
+ "epoch": 1.9428571428571428,
+ "grad_norm": 3.601266384124756,
+ "learning_rate": 0.0002709428571428571,
+ "loss": 0.1887,
+ "step": 340
+ },
+ {
+ "epoch": 1.9714285714285715,
+ "grad_norm": 1.1441439390182495,
+ "learning_rate": 0.0002705142857142857,
+ "loss": 0.184,
+ "step": 345
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.8586034774780273,
+ "learning_rate": 0.0002700857142857143,
+ "loss": 0.1578,
+ "step": 350
+ },
+ {
+ "epoch": 2.0285714285714285,
+ "grad_norm": 1.5113487243652344,
+ "learning_rate": 0.00026965714285714286,
+ "loss": 0.2002,
+ "step": 355
+ },
+ {
+ "epoch": 2.057142857142857,
+ "grad_norm": 1.1123011112213135,
+ "learning_rate": 0.0002692285714285714,
+ "loss": 0.1946,
+ "step": 360
+ },
+ {
+ "epoch": 2.085714285714286,
+ "grad_norm": 0.9377036094665527,
+ "learning_rate": 0.0002688,
+ "loss": 0.1971,
+ "step": 365
+ },
+ {
+ "epoch": 2.1142857142857143,
+ "grad_norm": 0.6956892609596252,
+ "learning_rate": 0.00026837142857142856,
+ "loss": 0.1758,
+ "step": 370
+ },
+ {
+ "epoch": 2.142857142857143,
+ "grad_norm": 0.7510782480239868,
+ "learning_rate": 0.0002679428571428571,
+ "loss": 0.1674,
+ "step": 375
+ },
+ {
+ "epoch": 2.1714285714285713,
+ "grad_norm": 0.7009285092353821,
+ "learning_rate": 0.00026751428571428567,
+ "loss": 0.1945,
+ "step": 380
+ },
+ {
+ "epoch": 2.2,
+ "grad_norm": 0.9555609822273254,
+ "learning_rate": 0.00026708571428571426,
+ "loss": 0.1857,
+ "step": 385
+ },
+ {
+ "epoch": 2.2285714285714286,
+ "grad_norm": 2.133979082107544,
+ "learning_rate": 0.00026665714285714284,
+ "loss": 0.1636,
+ "step": 390
+ },
+ {
+ "epoch": 2.257142857142857,
+ "grad_norm": 0.7105309963226318,
+ "learning_rate": 0.0002662285714285714,
+ "loss": 0.2014,
+ "step": 395
+ },
+ {
+ "epoch": 2.2857142857142856,
+ "grad_norm": 0.7329701781272888,
+ "learning_rate": 0.00026579999999999996,
+ "loss": 0.1884,
+ "step": 400
+ },
+ {
+ "epoch": 2.314285714285714,
+ "grad_norm": 1.0426994562149048,
+ "learning_rate": 0.00026537142857142854,
+ "loss": 0.1558,
+ "step": 405
+ },
+ {
+ "epoch": 2.342857142857143,
+ "grad_norm": 0.9306122660636902,
+ "learning_rate": 0.0002649428571428571,
+ "loss": 0.1774,
+ "step": 410
+ },
+ {
+ "epoch": 2.3714285714285714,
+ "grad_norm": 0.6989394426345825,
+ "learning_rate": 0.00026451428571428565,
+ "loss": 0.1601,
+ "step": 415
+ },
+ {
+ "epoch": 2.4,
+ "grad_norm": 1.4383760690689087,
+ "learning_rate": 0.0002640857142857143,
+ "loss": 0.1564,
+ "step": 420
+ },
+ {
+ "epoch": 2.4285714285714284,
+ "grad_norm": 0.6448336839675903,
+ "learning_rate": 0.0002636571428571428,
+ "loss": 0.1827,
+ "step": 425
+ },
+ {
+ "epoch": 2.4571428571428573,
+ "grad_norm": 0.9535760879516602,
+ "learning_rate": 0.0002632285714285714,
+ "loss": 0.1713,
+ "step": 430
+ },
+ {
+ "epoch": 2.4857142857142858,
+ "grad_norm": 1.034945011138916,
+ "learning_rate": 0.0002628,
+ "loss": 0.1457,
+ "step": 435
+ },
+ {
+ "epoch": 2.5142857142857142,
+ "grad_norm": 1.3225128650665283,
+ "learning_rate": 0.0002623714285714285,
+ "loss": 0.1633,
+ "step": 440
+ },
+ {
+ "epoch": 2.5428571428571427,
+ "grad_norm": 0.8285059928894043,
+ "learning_rate": 0.0002619428571428571,
+ "loss": 0.2004,
+ "step": 445
+ },
+ {
+ "epoch": 2.571428571428571,
+ "grad_norm": 0.773176908493042,
+ "learning_rate": 0.0002615142857142857,
+ "loss": 0.1641,
+ "step": 450
+ },
+ {
+ "epoch": 2.6,
+ "grad_norm": 0.7964853048324585,
+ "learning_rate": 0.0002610857142857143,
+ "loss": 0.1608,
+ "step": 455
+ },
+ {
+ "epoch": 2.6285714285714286,
+ "grad_norm": 1.0967328548431396,
+ "learning_rate": 0.00026065714285714286,
+ "loss": 0.1697,
+ "step": 460
+ },
+ {
+ "epoch": 2.657142857142857,
+ "grad_norm": 0.6462066173553467,
+ "learning_rate": 0.0002602285714285714,
+ "loss": 0.1512,
+ "step": 465
+ },
+ {
+ "epoch": 2.685714285714286,
+ "grad_norm": 0.8765937089920044,
+ "learning_rate": 0.00025979999999999997,
+ "loss": 0.1826,
+ "step": 470
+ },
+ {
+ "epoch": 2.7142857142857144,
+ "grad_norm": 1.2524124383926392,
+ "learning_rate": 0.00025937142857142856,
+ "loss": 0.1731,
+ "step": 475
+ },
+ {
+ "epoch": 2.742857142857143,
+ "grad_norm": 2.2982606887817383,
+ "learning_rate": 0.0002589428571428571,
+ "loss": 0.1852,
+ "step": 480
+ },
+ {
+ "epoch": 2.7714285714285714,
+ "grad_norm": 0.9989053010940552,
+ "learning_rate": 0.0002585142857142857,
+ "loss": 0.1791,
+ "step": 485
+ },
+ {
+ "epoch": 2.8,
+ "grad_norm": 0.772343635559082,
+ "learning_rate": 0.00025808571428571426,
+ "loss": 0.1862,
+ "step": 490
+ },
+ {
+ "epoch": 2.8285714285714287,
+ "grad_norm": 1.2101136445999146,
+ "learning_rate": 0.00025765714285714284,
+ "loss": 0.1806,
+ "step": 495
+ },
+ {
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.8010189533233643,
+ "learning_rate": 0.0002572285714285714,
+ "loss": 0.1842,
+ "step": 500
+ },
+ {
+ "epoch": 2.8857142857142857,
+ "grad_norm": 1.3597544431686401,
+ "learning_rate": 0.00025679999999999995,
+ "loss": 0.1583,
+ "step": 505
+ },
+ {
+ "epoch": 2.914285714285714,
+ "grad_norm": 0.8790671825408936,
+ "learning_rate": 0.00025637142857142854,
+ "loss": 0.1565,
+ "step": 510
+ },
+ {
+ "epoch": 2.942857142857143,
+ "grad_norm": 1.1175066232681274,
+ "learning_rate": 0.0002559428571428571,
+ "loss": 0.1406,
+ "step": 515
+ },
+ {
+ "epoch": 2.9714285714285715,
+ "grad_norm": 2.8528785705566406,
+ "learning_rate": 0.0002555142857142857,
+ "loss": 0.1735,
+ "step": 520
+ },
+ {
+ "epoch": 3.0,
+ "grad_norm": 2.2073328495025635,
+ "learning_rate": 0.0002550857142857143,
+ "loss": 0.1816,
+ "step": 525
+ },
+ {
+ "epoch": 3.0285714285714285,
+ "grad_norm": 11.01322078704834,
+ "learning_rate": 0.0002546571428571428,
+ "loss": 0.1873,
+ "step": 530
+ },
+ {
+ "epoch": 3.057142857142857,
+ "grad_norm": 1.5822402238845825,
+ "learning_rate": 0.0002542285714285714,
+ "loss": 0.168,
+ "step": 535
+ },
+ {
+ "epoch": 3.085714285714286,
+ "grad_norm": 1.3086942434310913,
+ "learning_rate": 0.0002538,
+ "loss": 0.149,
+ "step": 540
+ },
+ {
+ "epoch": 3.1142857142857143,
+ "grad_norm": 6.303041458129883,
+ "learning_rate": 0.0002533714285714285,
+ "loss": 0.1651,
+ "step": 545
+ },
+ {
+ "epoch": 3.142857142857143,
+ "grad_norm": 14.48929500579834,
+ "learning_rate": 0.00025294285714285716,
+ "loss": 0.1687,
+ "step": 550
+ },
+ {
+ "epoch": 3.1714285714285713,
+ "grad_norm": 6.824525356292725,
+ "learning_rate": 0.0002525142857142857,
+ "loss": 0.1919,
+ "step": 555
+ },
+ {
+ "epoch": 3.2,
+ "grad_norm": 18.772563934326172,
+ "learning_rate": 0.00025208571428571427,
+ "loss": 0.2075,
+ "step": 560
+ },
+ {
+ "epoch": 3.2285714285714286,
+ "grad_norm": 0.7268752455711365,
+ "learning_rate": 0.00025165714285714286,
+ "loss": 0.174,
+ "step": 565
+ },
+ {
+ "epoch": 3.257142857142857,
+ "grad_norm": 1.1301453113555908,
+ "learning_rate": 0.0002512285714285714,
+ "loss": 0.1668,
+ "step": 570
+ },
+ {
+ "epoch": 3.2857142857142856,
+ "grad_norm": 2.846802234649658,
+ "learning_rate": 0.00025079999999999997,
+ "loss": 0.1645,
+ "step": 575
+ },
+ {
+ "epoch": 3.314285714285714,
+ "grad_norm": 1.417515754699707,
+ "learning_rate": 0.00025037142857142855,
+ "loss": 0.1719,
+ "step": 580
+ },
+ {
+ "epoch": 3.342857142857143,
+ "grad_norm": 4.137150764465332,
+ "learning_rate": 0.00024994285714285714,
+ "loss": 0.1739,
+ "step": 585
+ },
+ {
+ "epoch": 3.3714285714285714,
+ "grad_norm": 2.6067259311676025,
+ "learning_rate": 0.0002495142857142857,
+ "loss": 0.1489,
+ "step": 590
+ },
+ {
+ "epoch": 3.4,
+ "grad_norm": 2.601024627685547,
+ "learning_rate": 0.00024908571428571425,
+ "loss": 0.1618,
+ "step": 595
+ },
+ {
+ "epoch": 3.4285714285714284,
+ "grad_norm": 3.849017858505249,
+ "learning_rate": 0.00024865714285714284,
+ "loss": 0.1899,
+ "step": 600
+ },
+ {
+ "epoch": 3.4571428571428573,
+ "grad_norm": 4.673766136169434,
+ "learning_rate": 0.0002482285714285714,
+ "loss": 0.1761,
+ "step": 605
+ },
+ {
+ "epoch": 3.4857142857142858,
+ "grad_norm": 2.6057631969451904,
+ "learning_rate": 0.00024779999999999995,
+ "loss": 0.1743,
+ "step": 610
+ },
+ {
+ "epoch": 3.5142857142857142,
+ "grad_norm": 2.932652473449707,
+ "learning_rate": 0.0002473714285714286,
+ "loss": 0.1482,
+ "step": 615
+ },
+ {
+ "epoch": 3.5428571428571427,
+ "grad_norm": 0.8764939308166504,
+ "learning_rate": 0.0002469428571428571,
+ "loss": 0.1644,
+ "step": 620
+ },
+ {
+ "epoch": 3.571428571428571,
+ "grad_norm": 1.3203191757202148,
+ "learning_rate": 0.0002465142857142857,
+ "loss": 0.1654,
+ "step": 625
+ },
+ {
+ "epoch": 3.6,
+ "grad_norm": 0.7977635264396667,
+ "learning_rate": 0.0002460857142857143,
+ "loss": 0.1472,
+ "step": 630
+ },
+ {
+ "epoch": 3.6285714285714286,
+ "grad_norm": 1.4750248193740845,
+ "learning_rate": 0.0002456571428571428,
+ "loss": 0.1735,
+ "step": 635
+ },
+ {
+ "epoch": 3.657142857142857,
+ "grad_norm": 1.8164482116699219,
+ "learning_rate": 0.0002452285714285714,
+ "loss": 0.1593,
+ "step": 640
+ },
+ {
+ "epoch": 3.685714285714286,
+ "grad_norm": 1.4829603433609009,
+ "learning_rate": 0.0002448,
+ "loss": 0.1508,
+ "step": 645
+ },
+ {
+ "epoch": 3.7142857142857144,
+ "grad_norm": 0.8828144669532776,
+ "learning_rate": 0.00024437142857142857,
+ "loss": 0.1573,
+ "step": 650
+ },
+ {
+ "epoch": 3.742857142857143,
+ "grad_norm": 2.039384126663208,
+ "learning_rate": 0.00024394285714285713,
+ "loss": 0.1745,
+ "step": 655
+ },
+ {
+ "epoch": 3.7714285714285714,
+ "grad_norm": 0.9604200720787048,
+ "learning_rate": 0.00024351428571428569,
+ "loss": 0.17,
+ "step": 660
+ },
+ {
+ "epoch": 3.8,
+ "grad_norm": 0.7903971076011658,
+ "learning_rate": 0.00024308571428571427,
+ "loss": 0.1654,
+ "step": 665
+ },
+ {
+ "epoch": 3.8285714285714287,
+ "grad_norm": 0.6935649514198303,
+ "learning_rate": 0.00024265714285714283,
+ "loss": 0.1714,
+ "step": 670
+ },
+ {
+ "epoch": 3.857142857142857,
+ "grad_norm": 0.5832012295722961,
+ "learning_rate": 0.00024222857142857138,
+ "loss": 0.1636,
+ "step": 675
+ },
+ {
+ "epoch": 3.8857142857142857,
+ "grad_norm": 0.6303168535232544,
+ "learning_rate": 0.0002418,
+ "loss": 0.1604,
+ "step": 680
+ },
+ {
+ "epoch": 3.914285714285714,
+ "grad_norm": 0.7210885882377625,
+ "learning_rate": 0.00024137142857142855,
+ "loss": 0.1444,
+ "step": 685
+ },
+ {
+ "epoch": 3.942857142857143,
+ "grad_norm": 0.7690990567207336,
+ "learning_rate": 0.00024094285714285714,
+ "loss": 0.1631,
+ "step": 690
+ },
+ {
+ "epoch": 3.9714285714285715,
+ "grad_norm": 1.0142720937728882,
+ "learning_rate": 0.0002405142857142857,
+ "loss": 0.158,
+ "step": 695
+ },
+ {
+ "epoch": 4.0,
+ "grad_norm": 0.7970322966575623,
+ "learning_rate": 0.00024008571428571425,
+ "loss": 0.1803,
+ "step": 700
+ },
+ {
+ "epoch": 4.0285714285714285,
+ "grad_norm": 0.6795914769172668,
+ "learning_rate": 0.00023965714285714284,
+ "loss": 0.143,
+ "step": 705
+ },
+ {
+ "epoch": 4.057142857142857,
+ "grad_norm": 0.6832629442214966,
+ "learning_rate": 0.0002392285714285714,
+ "loss": 0.1457,
+ "step": 710
+ },
+ {
+ "epoch": 4.085714285714285,
+ "grad_norm": 3.8629798889160156,
+ "learning_rate": 0.0002388,
+ "loss": 0.1671,
+ "step": 715
+ },
+ {
+ "epoch": 4.114285714285714,
+ "grad_norm": 1.1167882680892944,
+ "learning_rate": 0.00023837142857142856,
+ "loss": 0.1544,
+ "step": 720
+ },
+ {
+ "epoch": 4.142857142857143,
+ "grad_norm": 0.9431412816047668,
+ "learning_rate": 0.00023794285714285712,
+ "loss": 0.1605,
+ "step": 725
+ },
+ {
+ "epoch": 4.171428571428572,
+ "grad_norm": 1.310948133468628,
+ "learning_rate": 0.0002375142857142857,
+ "loss": 0.1121,
+ "step": 730
+ },
+ {
+ "epoch": 4.2,
+ "grad_norm": 0.9830737709999084,
+ "learning_rate": 0.00023708571428571426,
+ "loss": 0.1742,
+ "step": 735
+ },
+ {
+ "epoch": 4.228571428571429,
+ "grad_norm": 0.6166555881500244,
+ "learning_rate": 0.00023665714285714282,
+ "loss": 0.1525,
+ "step": 740
+ },
+ {
+ "epoch": 4.257142857142857,
+ "grad_norm": 0.995579719543457,
+ "learning_rate": 0.00023622857142857143,
+ "loss": 0.1439,
+ "step": 745
+ },
+ {
+ "epoch": 4.285714285714286,
+ "grad_norm": 0.639796793460846,
+ "learning_rate": 0.00023579999999999999,
+ "loss": 0.1692,
+ "step": 750
+ },
+ {
+ "epoch": 4.314285714285714,
+ "grad_norm": 0.9438050389289856,
+ "learning_rate": 0.00023537142857142854,
+ "loss": 0.1785,
+ "step": 755
+ },
+ {
+ "epoch": 4.3428571428571425,
+ "grad_norm": 0.8960750102996826,
+ "learning_rate": 0.00023494285714285713,
+ "loss": 0.1557,
+ "step": 760
+ },
+ {
+ "epoch": 4.371428571428572,
+ "grad_norm": 0.6287499070167542,
+ "learning_rate": 0.00023451428571428568,
+ "loss": 0.1459,
+ "step": 765
+ },
+ {
+ "epoch": 4.4,
+ "grad_norm": 0.7638295888900757,
+ "learning_rate": 0.00023408571428571424,
+ "loss": 0.1341,
+ "step": 770
+ },
+ {
+ "epoch": 4.428571428571429,
+ "grad_norm": 0.655878484249115,
+ "learning_rate": 0.00023365714285714283,
+ "loss": 0.1358,
+ "step": 775
+ },
+ {
+ "epoch": 4.457142857142857,
+ "grad_norm": 0.5840997695922852,
+ "learning_rate": 0.0002332285714285714,
+ "loss": 0.1386,
+ "step": 780
+ },
+ {
+ "epoch": 4.485714285714286,
+ "grad_norm": 1.1082488298416138,
+ "learning_rate": 0.0002328,
+ "loss": 0.1827,
+ "step": 785
+ },
+ {
+ "epoch": 4.514285714285714,
+ "grad_norm": 0.8825240135192871,
+ "learning_rate": 0.00023237142857142855,
+ "loss": 0.1527,
+ "step": 790
+ },
+ {
+ "epoch": 4.542857142857143,
+ "grad_norm": 0.6752304434776306,
+ "learning_rate": 0.0002319428571428571,
+ "loss": 0.1392,
+ "step": 795
+ },
+ {
+ "epoch": 4.571428571428571,
+ "grad_norm": 1.1423301696777344,
+ "learning_rate": 0.0002315142857142857,
+ "loss": 0.1433,
+ "step": 800
+ },
+ {
+ "epoch": 4.6,
+ "grad_norm": 10.793691635131836,
+ "learning_rate": 0.00023108571428571425,
+ "loss": 0.1635,
+ "step": 805
+ },
+ {
+ "epoch": 4.628571428571428,
+ "grad_norm": 0.47564294934272766,
+ "learning_rate": 0.00023065714285714286,
+ "loss": 0.1199,
+ "step": 810
+ },
+ {
+ "epoch": 4.6571428571428575,
+ "grad_norm": 1.2492656707763672,
+ "learning_rate": 0.00023022857142857142,
+ "loss": 0.1488,
+ "step": 815
+ },
+ {
+ "epoch": 4.685714285714286,
+ "grad_norm": 0.6933501958847046,
+ "learning_rate": 0.00022979999999999997,
+ "loss": 0.1812,
+ "step": 820
+ },
+ {
+ "epoch": 4.714285714285714,
+ "grad_norm": 0.7901633977890015,
+ "learning_rate": 0.00022937142857142856,
+ "loss": 0.1415,
+ "step": 825
+ },
+ {
+ "epoch": 4.742857142857143,
+ "grad_norm": 0.7854829430580139,
+ "learning_rate": 0.00022894285714285712,
+ "loss": 0.1401,
+ "step": 830
+ },
+ {
+ "epoch": 4.771428571428571,
+ "grad_norm": 0.8716740608215332,
+ "learning_rate": 0.00022851428571428567,
+ "loss": 0.1982,
+ "step": 835
+ },
+ {
+ "epoch": 4.8,
+ "grad_norm": 0.7047899961471558,
+ "learning_rate": 0.00022808571428571426,
+ "loss": 0.1624,
+ "step": 840
+ },
+ {
+ "epoch": 4.828571428571428,
+ "grad_norm": 0.7134959697723389,
+ "learning_rate": 0.00022765714285714284,
+ "loss": 0.1375,
+ "step": 845
+ },
+ {
+ "epoch": 4.857142857142857,
+ "grad_norm": 1.0897325277328491,
+ "learning_rate": 0.00022722857142857143,
+ "loss": 0.1489,
+ "step": 850
+ },
+ {
+ "epoch": 4.885714285714286,
+ "grad_norm": 1.1065207719802856,
+ "learning_rate": 0.00022679999999999998,
+ "loss": 0.1495,
+ "step": 855
+ },
+ {
+ "epoch": 4.914285714285715,
+ "grad_norm": 0.7434757351875305,
+ "learning_rate": 0.00022637142857142854,
+ "loss": 0.1507,
+ "step": 860
+ },
+ {
+ "epoch": 4.942857142857143,
+ "grad_norm": 1.0045181512832642,
+ "learning_rate": 0.00022594285714285712,
+ "loss": 0.1527,
+ "step": 865
+ },
+ {
+ "epoch": 4.9714285714285715,
+ "grad_norm": 1.2025654315948486,
+ "learning_rate": 0.00022551428571428568,
+ "loss": 0.1523,
+ "step": 870
+ },
+ {
+ "epoch": 5.0,
+ "grad_norm": 0.7823342084884644,
+ "learning_rate": 0.0002250857142857143,
+ "loss": 0.1514,
+ "step": 875
+ },
+ {
+ "epoch": 5.0285714285714285,
+ "grad_norm": 0.8405362963676453,
+ "learning_rate": 0.00022465714285714285,
+ "loss": 0.1461,
+ "step": 880
+ },
+ {
+ "epoch": 5.057142857142857,
+ "grad_norm": 0.7527463436126709,
+ "learning_rate": 0.0002242285714285714,
+ "loss": 0.1206,
+ "step": 885
+ },
+ {
+ "epoch": 5.085714285714285,
+ "grad_norm": 0.8372548222541809,
+ "learning_rate": 0.0002238,
+ "loss": 0.1513,
+ "step": 890
+ },
+ {
+ "epoch": 5.114285714285714,
+ "grad_norm": 0.8755456209182739,
+ "learning_rate": 0.00022337142857142855,
+ "loss": 0.1498,
+ "step": 895
+ },
+ {
+ "epoch": 5.142857142857143,
+ "grad_norm": 0.7312084436416626,
+ "learning_rate": 0.0002229428571428571,
+ "loss": 0.154,
+ "step": 900
+ },
+ {
+ "epoch": 5.171428571428572,
+ "grad_norm": 0.6366221904754639,
+ "learning_rate": 0.0002225142857142857,
+ "loss": 0.1466,
+ "step": 905
+ },
+ {
+ "epoch": 5.2,
+ "grad_norm": 0.6406880617141724,
+ "learning_rate": 0.00022208571428571427,
+ "loss": 0.1254,
+ "step": 910
+ },
+ {
+ "epoch": 5.228571428571429,
+ "grad_norm": 2.4106833934783936,
+ "learning_rate": 0.00022165714285714283,
+ "loss": 0.1534,
+ "step": 915
+ },
+ {
+ "epoch": 5.257142857142857,
+ "grad_norm": 0.5635722279548645,
+ "learning_rate": 0.00022122857142857142,
+ "loss": 0.1461,
+ "step": 920
+ },
+ {
+ "epoch": 5.285714285714286,
+ "grad_norm": 0.787162184715271,
+ "learning_rate": 0.00022079999999999997,
+ "loss": 0.1424,
+ "step": 925
+ },
+ {
+ "epoch": 5.314285714285714,
+ "grad_norm": 0.6513975262641907,
+ "learning_rate": 0.00022037142857142853,
+ "loss": 0.1326,
+ "step": 930
+ },
+ {
+ "epoch": 5.3428571428571425,
+ "grad_norm": 0.6933534741401672,
+ "learning_rate": 0.00021994285714285711,
+ "loss": 0.1661,
+ "step": 935
+ },
+ {
+ "epoch": 5.371428571428572,
+ "grad_norm": 0.7263259887695312,
+ "learning_rate": 0.0002195142857142857,
+ "loss": 0.15,
+ "step": 940
+ },
+ {
+ "epoch": 5.4,
+ "grad_norm": 0.5537381768226624,
+ "learning_rate": 0.00021908571428571428,
+ "loss": 0.129,
+ "step": 945
+ },
+ {
+ "epoch": 5.428571428571429,
+ "grad_norm": 0.6014005541801453,
+ "learning_rate": 0.00021865714285714284,
+ "loss": 0.1321,
+ "step": 950
+ },
+ {
+ "epoch": 5.457142857142857,
+ "grad_norm": 0.6581441760063171,
+ "learning_rate": 0.0002182285714285714,
+ "loss": 0.1587,
+ "step": 955
+ },
+ {
+ "epoch": 5.485714285714286,
+ "grad_norm": 0.9326379895210266,
+ "learning_rate": 0.00021779999999999998,
+ "loss": 0.1654,
+ "step": 960
+ },
+ {
+ "epoch": 5.514285714285714,
+ "grad_norm": 0.9438592791557312,
+ "learning_rate": 0.00021737142857142854,
+ "loss": 0.1212,
+ "step": 965
+ },
+ {
+ "epoch": 5.542857142857143,
+ "grad_norm": 0.7699571251869202,
+ "learning_rate": 0.00021694285714285715,
+ "loss": 0.1464,
+ "step": 970
+ },
+ {
+ "epoch": 5.571428571428571,
+ "grad_norm": 0.8758366703987122,
+ "learning_rate": 0.0002165142857142857,
+ "loss": 0.1599,
+ "step": 975
+ },
+ {
+ "epoch": 5.6,
+ "grad_norm": 0.6101442575454712,
+ "learning_rate": 0.00021608571428571426,
+ "loss": 0.1589,
+ "step": 980
+ },
+ {
+ "epoch": 5.628571428571428,
+ "grad_norm": 0.7454060912132263,
+ "learning_rate": 0.00021565714285714285,
+ "loss": 0.1433,
+ "step": 985
+ },
+ {
+ "epoch": 5.6571428571428575,
+ "grad_norm": 0.6379484534263611,
+ "learning_rate": 0.0002152285714285714,
+ "loss": 0.1592,
+ "step": 990
+ },
+ {
+ "epoch": 5.685714285714286,
+ "grad_norm": 1.1601309776306152,
+ "learning_rate": 0.00021479999999999996,
+ "loss": 0.1647,
+ "step": 995
+ },
+ {
+ "epoch": 5.714285714285714,
+ "grad_norm": 0.5464673638343811,
+ "learning_rate": 0.00021437142857142855,
+ "loss": 0.1469,
+ "step": 1000
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 3500,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 20,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 0.0,
+ "train_batch_size": 200,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/glot-contrastive-final-lora/checkpoint-1000/training_args.bin b/glot-contrastive-final-lora/checkpoint-1000/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..504636d7db81a728b5309e791498748e16c3b8ec
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-1000/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02a87dc6b2c67ad3df98065b9e8fa21d9d93cd2cb361c532cb83c8a37bdc81a3
+size 5777
diff --git a/glot-contrastive-final-lora/checkpoint-1500/README.md b/glot-contrastive-final-lora/checkpoint-1500/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9e142ce83a38d62751eee08226da0acc7c10eae5
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-1500/README.md
@@ -0,0 +1,206 @@
+---
+base_model: ./glot-mlm-adapted
+library_name: peft
+tags:
+- base_model:adapter:./glot-mlm-adapted
+- lora
+- transformers
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/glot-contrastive-final-lora/checkpoint-1500/adapter_config.json b/glot-contrastive-final-lora/checkpoint-1500/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..846b0d8ef2ad7bc4b0b04decd5769b6250d7be73
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-1500/adapter_config.json
@@ -0,0 +1,37 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "./glot-mlm-adapted",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "query",
+ "value"
+ ],
+ "target_parameters": null,
+ "task_type": "FEATURE_EXTRACTION",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/glot-contrastive-final-lora/checkpoint-1500/adapter_model.safetensors b/glot-contrastive-final-lora/checkpoint-1500/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..34730efaa5567fa6056cd05fb71e8f8aa4bcdf15
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-1500/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6413fb7d01f4b21da6e461dea0648d8d88fd37d6bd7c099ca98b3253cf62a00
+size 2365824
diff --git a/glot-contrastive-final-lora/checkpoint-1500/optimizer.pt b/glot-contrastive-final-lora/checkpoint-1500/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..77d1cadcfde59acc8da9ebd6747c3ca5c6223db8
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-1500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36773257ecda9472e5ab320c80e2afdec9be64091b43e5bcbc53455be6b8149d
+size 4760395
diff --git a/glot-contrastive-final-lora/checkpoint-1500/rng_state.pth b/glot-contrastive-final-lora/checkpoint-1500/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..8e52ad573b8a63ae0f6bd42c23fdfa2580e5b0a6
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-1500/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1be729d0d5511ce10795029b99cb6f519c2f3eea267e5026e9426be89babe546
+size 14645
diff --git a/glot-contrastive-final-lora/checkpoint-1500/scheduler.pt b/glot-contrastive-final-lora/checkpoint-1500/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..740ce0a61b00dfd2cb65009a5ada2d4c9668e5e4
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-1500/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7719f3f0106e49068ce5d6f3e02c2bb61413e6107676c385f70427146af2266c
+size 1465
diff --git a/glot-contrastive-final-lora/checkpoint-1500/sentencepiece.bpe.model b/glot-contrastive-final-lora/checkpoint-1500/sentencepiece.bpe.model
new file mode 100644
index 0000000000000000000000000000000000000000..41409bcc76133d6e92fdd5909093d24e32662ba0
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-1500/sentencepiece.bpe.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a313a26470baedaede322622492f2a542aa41527ddc5d40de444e945ad3c613
+size 7658320
diff --git a/glot-contrastive-final-lora/checkpoint-1500/special_tokens_map.json b/glot-contrastive-final-lora/checkpoint-1500/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..68171d1ff68b731a33d119708476692c094a466b
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-1500/special_tokens_map.json
@@ -0,0 +1,15 @@
+{
+ "bos_token": "",
+ "cls_token": "",
+ "eos_token": "",
+ "mask_token": {
+ "content": "",
+ "lstrip": true,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "",
+ "sep_token": "",
+ "unk_token": ""
+}
diff --git a/glot-contrastive-final-lora/checkpoint-1500/tokenizer_config.json b/glot-contrastive-final-lora/checkpoint-1500/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..2d4a02cfefad7bc6d993905187616acfc547f17d
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-1500/tokenizer_config.json
@@ -0,0 +1,57 @@
+{
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "3": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "401144": {
+ "content": "",
+ "lstrip": true,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "",
+ "clean_up_tokenization_spaces": false,
+ "cls_token": "",
+ "eos_token": "",
+ "extra_special_tokens": {},
+ "mask_token": "",
+ "model_max_length": 512,
+ "pad_token": "",
+ "sep_token": "",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "XLMRobertaTokenizer",
+ "unk_token": "",
+ "use_fast": true
+}
diff --git a/glot-contrastive-final-lora/checkpoint-1500/trainer_state.json b/glot-contrastive-final-lora/checkpoint-1500/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..947e7d9e51194a2b33424d124fca5853c1fb8b52
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-1500/trainer_state.json
@@ -0,0 +1,2134 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.571428571428571,
+ "eval_steps": 5,
+ "global_step": 1500,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.02857142857142857,
+ "grad_norm": 0.1407003551721573,
+ "learning_rate": 0.00029965714285714283,
+ "loss": 0.9726,
+ "step": 5
+ },
+ {
+ "epoch": 0.05714285714285714,
+ "grad_norm": 0.26689061522483826,
+ "learning_rate": 0.0002992285714285714,
+ "loss": 0.9633,
+ "step": 10
+ },
+ {
+ "epoch": 0.08571428571428572,
+ "grad_norm": 0.8670485615730286,
+ "learning_rate": 0.0002988,
+ "loss": 0.9013,
+ "step": 15
+ },
+ {
+ "epoch": 0.11428571428571428,
+ "grad_norm": 0.9785467386245728,
+ "learning_rate": 0.00029837142857142853,
+ "loss": 0.6942,
+ "step": 20
+ },
+ {
+ "epoch": 0.14285714285714285,
+ "grad_norm": 1.3083932399749756,
+ "learning_rate": 0.0002979428571428571,
+ "loss": 0.4472,
+ "step": 25
+ },
+ {
+ "epoch": 0.17142857142857143,
+ "grad_norm": 1.6103293895721436,
+ "learning_rate": 0.0002975142857142857,
+ "loss": 0.3782,
+ "step": 30
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 2.6353416442871094,
+ "learning_rate": 0.0002970857142857143,
+ "loss": 0.3732,
+ "step": 35
+ },
+ {
+ "epoch": 0.22857142857142856,
+ "grad_norm": 0.9949072003364563,
+ "learning_rate": 0.0002966571428571428,
+ "loss": 0.3506,
+ "step": 40
+ },
+ {
+ "epoch": 0.2571428571428571,
+ "grad_norm": 1.280673861503601,
+ "learning_rate": 0.0002962285714285714,
+ "loss": 0.3346,
+ "step": 45
+ },
+ {
+ "epoch": 0.2857142857142857,
+ "grad_norm": 0.7681456208229065,
+ "learning_rate": 0.0002958,
+ "loss": 0.2832,
+ "step": 50
+ },
+ {
+ "epoch": 0.3142857142857143,
+ "grad_norm": 1.0000813007354736,
+ "learning_rate": 0.0002953714285714285,
+ "loss": 0.2603,
+ "step": 55
+ },
+ {
+ "epoch": 0.34285714285714286,
+ "grad_norm": 1.0222399234771729,
+ "learning_rate": 0.0002949428571428571,
+ "loss": 0.2507,
+ "step": 60
+ },
+ {
+ "epoch": 0.37142857142857144,
+ "grad_norm": 0.896902322769165,
+ "learning_rate": 0.0002945142857142857,
+ "loss": 0.2556,
+ "step": 65
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 0.9035541415214539,
+ "learning_rate": 0.00029408571428571426,
+ "loss": 0.2402,
+ "step": 70
+ },
+ {
+ "epoch": 0.42857142857142855,
+ "grad_norm": 1.4886469841003418,
+ "learning_rate": 0.00029365714285714285,
+ "loss": 0.2376,
+ "step": 75
+ },
+ {
+ "epoch": 0.45714285714285713,
+ "grad_norm": 0.8951187133789062,
+ "learning_rate": 0.0002932285714285714,
+ "loss": 0.2276,
+ "step": 80
+ },
+ {
+ "epoch": 0.4857142857142857,
+ "grad_norm": 0.7876377105712891,
+ "learning_rate": 0.00029279999999999996,
+ "loss": 0.2537,
+ "step": 85
+ },
+ {
+ "epoch": 0.5142857142857142,
+ "grad_norm": 1.0927226543426514,
+ "learning_rate": 0.00029237142857142855,
+ "loss": 0.2152,
+ "step": 90
+ },
+ {
+ "epoch": 0.5428571428571428,
+ "grad_norm": 1.4946355819702148,
+ "learning_rate": 0.00029194285714285713,
+ "loss": 0.2441,
+ "step": 95
+ },
+ {
+ "epoch": 0.5714285714285714,
+ "grad_norm": 0.7082991600036621,
+ "learning_rate": 0.0002915142857142857,
+ "loss": 0.2708,
+ "step": 100
+ },
+ {
+ "epoch": 0.6,
+ "grad_norm": 0.670010507106781,
+ "learning_rate": 0.00029108571428571424,
+ "loss": 0.2396,
+ "step": 105
+ },
+ {
+ "epoch": 0.6285714285714286,
+ "grad_norm": 0.9797312021255493,
+ "learning_rate": 0.00029065714285714283,
+ "loss": 0.2275,
+ "step": 110
+ },
+ {
+ "epoch": 0.6571428571428571,
+ "grad_norm": 1.5220463275909424,
+ "learning_rate": 0.0002902285714285714,
+ "loss": 0.2114,
+ "step": 115
+ },
+ {
+ "epoch": 0.6857142857142857,
+ "grad_norm": 1.3326867818832397,
+ "learning_rate": 0.00028979999999999994,
+ "loss": 0.241,
+ "step": 120
+ },
+ {
+ "epoch": 0.7142857142857143,
+ "grad_norm": 1.1195529699325562,
+ "learning_rate": 0.0002893714285714285,
+ "loss": 0.2389,
+ "step": 125
+ },
+ {
+ "epoch": 0.7428571428571429,
+ "grad_norm": 0.7551061511039734,
+ "learning_rate": 0.0002889428571428571,
+ "loss": 0.2162,
+ "step": 130
+ },
+ {
+ "epoch": 0.7714285714285715,
+ "grad_norm": 1.018908977508545,
+ "learning_rate": 0.0002885142857142857,
+ "loss": 0.1924,
+ "step": 135
+ },
+ {
+ "epoch": 0.8,
+ "grad_norm": 2.123642921447754,
+ "learning_rate": 0.0002880857142857143,
+ "loss": 0.2174,
+ "step": 140
+ },
+ {
+ "epoch": 0.8285714285714286,
+ "grad_norm": 0.7585068941116333,
+ "learning_rate": 0.0002876571428571428,
+ "loss": 0.2006,
+ "step": 145
+ },
+ {
+ "epoch": 0.8571428571428571,
+ "grad_norm": 1.64150869846344,
+ "learning_rate": 0.0002872285714285714,
+ "loss": 0.1905,
+ "step": 150
+ },
+ {
+ "epoch": 0.8857142857142857,
+ "grad_norm": 0.9126951694488525,
+ "learning_rate": 0.0002868,
+ "loss": 0.2312,
+ "step": 155
+ },
+ {
+ "epoch": 0.9142857142857143,
+ "grad_norm": 0.7278801202774048,
+ "learning_rate": 0.00028637142857142856,
+ "loss": 0.2077,
+ "step": 160
+ },
+ {
+ "epoch": 0.9428571428571428,
+ "grad_norm": 0.8931339383125305,
+ "learning_rate": 0.00028594285714285715,
+ "loss": 0.1951,
+ "step": 165
+ },
+ {
+ "epoch": 0.9714285714285714,
+ "grad_norm": 1.0831843614578247,
+ "learning_rate": 0.0002855142857142857,
+ "loss": 0.2103,
+ "step": 170
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 1.3750063180923462,
+ "learning_rate": 0.00028508571428571426,
+ "loss": 0.2396,
+ "step": 175
+ },
+ {
+ "epoch": 1.0285714285714285,
+ "grad_norm": 0.8338337540626526,
+ "learning_rate": 0.00028465714285714285,
+ "loss": 0.2404,
+ "step": 180
+ },
+ {
+ "epoch": 1.0571428571428572,
+ "grad_norm": 1.2879024744033813,
+ "learning_rate": 0.0002842285714285714,
+ "loss": 0.2117,
+ "step": 185
+ },
+ {
+ "epoch": 1.0857142857142856,
+ "grad_norm": 1.6751821041107178,
+ "learning_rate": 0.00028379999999999996,
+ "loss": 0.1796,
+ "step": 190
+ },
+ {
+ "epoch": 1.1142857142857143,
+ "grad_norm": 0.9864417910575867,
+ "learning_rate": 0.00028337142857142854,
+ "loss": 0.1993,
+ "step": 195
+ },
+ {
+ "epoch": 1.1428571428571428,
+ "grad_norm": 1.0174155235290527,
+ "learning_rate": 0.00028294285714285713,
+ "loss": 0.2068,
+ "step": 200
+ },
+ {
+ "epoch": 1.1714285714285715,
+ "grad_norm": 1.029832124710083,
+ "learning_rate": 0.0002825142857142857,
+ "loss": 0.2015,
+ "step": 205
+ },
+ {
+ "epoch": 1.2,
+ "grad_norm": 0.7745446562767029,
+ "learning_rate": 0.00028208571428571424,
+ "loss": 0.2129,
+ "step": 210
+ },
+ {
+ "epoch": 1.2285714285714286,
+ "grad_norm": 2.5578622817993164,
+ "learning_rate": 0.0002816571428571428,
+ "loss": 0.2224,
+ "step": 215
+ },
+ {
+ "epoch": 1.2571428571428571,
+ "grad_norm": 2.4185051918029785,
+ "learning_rate": 0.0002812285714285714,
+ "loss": 0.2276,
+ "step": 220
+ },
+ {
+ "epoch": 1.2857142857142856,
+ "grad_norm": 1.4176461696624756,
+ "learning_rate": 0.0002808,
+ "loss": 0.1781,
+ "step": 225
+ },
+ {
+ "epoch": 1.3142857142857143,
+ "grad_norm": 0.709326982498169,
+ "learning_rate": 0.0002803714285714286,
+ "loss": 0.2177,
+ "step": 230
+ },
+ {
+ "epoch": 1.342857142857143,
+ "grad_norm": 0.8170766830444336,
+ "learning_rate": 0.0002799428571428571,
+ "loss": 0.1769,
+ "step": 235
+ },
+ {
+ "epoch": 1.3714285714285714,
+ "grad_norm": 1.3850761651992798,
+ "learning_rate": 0.0002795142857142857,
+ "loss": 0.2262,
+ "step": 240
+ },
+ {
+ "epoch": 1.4,
+ "grad_norm": 1.0064373016357422,
+ "learning_rate": 0.0002790857142857143,
+ "loss": 0.196,
+ "step": 245
+ },
+ {
+ "epoch": 1.4285714285714286,
+ "grad_norm": 1.9635728597640991,
+ "learning_rate": 0.0002786571428571428,
+ "loss": 0.2029,
+ "step": 250
+ },
+ {
+ "epoch": 1.457142857142857,
+ "grad_norm": 16.20791244506836,
+ "learning_rate": 0.0002782285714285714,
+ "loss": 0.3925,
+ "step": 255
+ },
+ {
+ "epoch": 1.4857142857142858,
+ "grad_norm": 1.4363322257995605,
+ "learning_rate": 0.0002778,
+ "loss": 0.3684,
+ "step": 260
+ },
+ {
+ "epoch": 1.5142857142857142,
+ "grad_norm": 0.9379534721374512,
+ "learning_rate": 0.00027737142857142856,
+ "loss": 0.2265,
+ "step": 265
+ },
+ {
+ "epoch": 1.5428571428571427,
+ "grad_norm": 0.8453512787818909,
+ "learning_rate": 0.00027694285714285714,
+ "loss": 0.1976,
+ "step": 270
+ },
+ {
+ "epoch": 1.5714285714285714,
+ "grad_norm": 2.316664695739746,
+ "learning_rate": 0.0002765142857142857,
+ "loss": 0.23,
+ "step": 275
+ },
+ {
+ "epoch": 1.6,
+ "grad_norm": 1.0548444986343384,
+ "learning_rate": 0.00027608571428571426,
+ "loss": 0.1823,
+ "step": 280
+ },
+ {
+ "epoch": 1.6285714285714286,
+ "grad_norm": 3.7894928455352783,
+ "learning_rate": 0.00027565714285714284,
+ "loss": 0.1962,
+ "step": 285
+ },
+ {
+ "epoch": 1.657142857142857,
+ "grad_norm": 2.3081610202789307,
+ "learning_rate": 0.00027522857142857143,
+ "loss": 0.2087,
+ "step": 290
+ },
+ {
+ "epoch": 1.6857142857142857,
+ "grad_norm": 0.9311438202857971,
+ "learning_rate": 0.0002748,
+ "loss": 0.1597,
+ "step": 295
+ },
+ {
+ "epoch": 1.7142857142857144,
+ "grad_norm": 1.1881247758865356,
+ "learning_rate": 0.00027437142857142854,
+ "loss": 0.1764,
+ "step": 300
+ },
+ {
+ "epoch": 1.7428571428571429,
+ "grad_norm": 1.30265212059021,
+ "learning_rate": 0.0002739428571428571,
+ "loss": 0.1647,
+ "step": 305
+ },
+ {
+ "epoch": 1.7714285714285714,
+ "grad_norm": 0.6832175850868225,
+ "learning_rate": 0.0002735142857142857,
+ "loss": 0.1638,
+ "step": 310
+ },
+ {
+ "epoch": 1.8,
+ "grad_norm": 1.8740538358688354,
+ "learning_rate": 0.00027308571428571424,
+ "loss": 0.1803,
+ "step": 315
+ },
+ {
+ "epoch": 1.8285714285714287,
+ "grad_norm": 9.821504592895508,
+ "learning_rate": 0.0002726571428571428,
+ "loss": 0.226,
+ "step": 320
+ },
+ {
+ "epoch": 1.8571428571428572,
+ "grad_norm": 1.0889750719070435,
+ "learning_rate": 0.0002722285714285714,
+ "loss": 0.1822,
+ "step": 325
+ },
+ {
+ "epoch": 1.8857142857142857,
+ "grad_norm": 0.9660868048667908,
+ "learning_rate": 0.0002718,
+ "loss": 0.1842,
+ "step": 330
+ },
+ {
+ "epoch": 1.9142857142857141,
+ "grad_norm": 0.6329234838485718,
+ "learning_rate": 0.0002713714285714286,
+ "loss": 0.1488,
+ "step": 335
+ },
+ {
+ "epoch": 1.9428571428571428,
+ "grad_norm": 3.601266384124756,
+ "learning_rate": 0.0002709428571428571,
+ "loss": 0.1887,
+ "step": 340
+ },
+ {
+ "epoch": 1.9714285714285715,
+ "grad_norm": 1.1441439390182495,
+ "learning_rate": 0.0002705142857142857,
+ "loss": 0.184,
+ "step": 345
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.8586034774780273,
+ "learning_rate": 0.0002700857142857143,
+ "loss": 0.1578,
+ "step": 350
+ },
+ {
+ "epoch": 2.0285714285714285,
+ "grad_norm": 1.5113487243652344,
+ "learning_rate": 0.00026965714285714286,
+ "loss": 0.2002,
+ "step": 355
+ },
+ {
+ "epoch": 2.057142857142857,
+ "grad_norm": 1.1123011112213135,
+ "learning_rate": 0.0002692285714285714,
+ "loss": 0.1946,
+ "step": 360
+ },
+ {
+ "epoch": 2.085714285714286,
+ "grad_norm": 0.9377036094665527,
+ "learning_rate": 0.0002688,
+ "loss": 0.1971,
+ "step": 365
+ },
+ {
+ "epoch": 2.1142857142857143,
+ "grad_norm": 0.6956892609596252,
+ "learning_rate": 0.00026837142857142856,
+ "loss": 0.1758,
+ "step": 370
+ },
+ {
+ "epoch": 2.142857142857143,
+ "grad_norm": 0.7510782480239868,
+ "learning_rate": 0.0002679428571428571,
+ "loss": 0.1674,
+ "step": 375
+ },
+ {
+ "epoch": 2.1714285714285713,
+ "grad_norm": 0.7009285092353821,
+ "learning_rate": 0.00026751428571428567,
+ "loss": 0.1945,
+ "step": 380
+ },
+ {
+ "epoch": 2.2,
+ "grad_norm": 0.9555609822273254,
+ "learning_rate": 0.00026708571428571426,
+ "loss": 0.1857,
+ "step": 385
+ },
+ {
+ "epoch": 2.2285714285714286,
+ "grad_norm": 2.133979082107544,
+ "learning_rate": 0.00026665714285714284,
+ "loss": 0.1636,
+ "step": 390
+ },
+ {
+ "epoch": 2.257142857142857,
+ "grad_norm": 0.7105309963226318,
+ "learning_rate": 0.0002662285714285714,
+ "loss": 0.2014,
+ "step": 395
+ },
+ {
+ "epoch": 2.2857142857142856,
+ "grad_norm": 0.7329701781272888,
+ "learning_rate": 0.00026579999999999996,
+ "loss": 0.1884,
+ "step": 400
+ },
+ {
+ "epoch": 2.314285714285714,
+ "grad_norm": 1.0426994562149048,
+ "learning_rate": 0.00026537142857142854,
+ "loss": 0.1558,
+ "step": 405
+ },
+ {
+ "epoch": 2.342857142857143,
+ "grad_norm": 0.9306122660636902,
+ "learning_rate": 0.0002649428571428571,
+ "loss": 0.1774,
+ "step": 410
+ },
+ {
+ "epoch": 2.3714285714285714,
+ "grad_norm": 0.6989394426345825,
+ "learning_rate": 0.00026451428571428565,
+ "loss": 0.1601,
+ "step": 415
+ },
+ {
+ "epoch": 2.4,
+ "grad_norm": 1.4383760690689087,
+ "learning_rate": 0.0002640857142857143,
+ "loss": 0.1564,
+ "step": 420
+ },
+ {
+ "epoch": 2.4285714285714284,
+ "grad_norm": 0.6448336839675903,
+ "learning_rate": 0.0002636571428571428,
+ "loss": 0.1827,
+ "step": 425
+ },
+ {
+ "epoch": 2.4571428571428573,
+ "grad_norm": 0.9535760879516602,
+ "learning_rate": 0.0002632285714285714,
+ "loss": 0.1713,
+ "step": 430
+ },
+ {
+ "epoch": 2.4857142857142858,
+ "grad_norm": 1.034945011138916,
+ "learning_rate": 0.0002628,
+ "loss": 0.1457,
+ "step": 435
+ },
+ {
+ "epoch": 2.5142857142857142,
+ "grad_norm": 1.3225128650665283,
+ "learning_rate": 0.0002623714285714285,
+ "loss": 0.1633,
+ "step": 440
+ },
+ {
+ "epoch": 2.5428571428571427,
+ "grad_norm": 0.8285059928894043,
+ "learning_rate": 0.0002619428571428571,
+ "loss": 0.2004,
+ "step": 445
+ },
+ {
+ "epoch": 2.571428571428571,
+ "grad_norm": 0.773176908493042,
+ "learning_rate": 0.0002615142857142857,
+ "loss": 0.1641,
+ "step": 450
+ },
+ {
+ "epoch": 2.6,
+ "grad_norm": 0.7964853048324585,
+ "learning_rate": 0.0002610857142857143,
+ "loss": 0.1608,
+ "step": 455
+ },
+ {
+ "epoch": 2.6285714285714286,
+ "grad_norm": 1.0967328548431396,
+ "learning_rate": 0.00026065714285714286,
+ "loss": 0.1697,
+ "step": 460
+ },
+ {
+ "epoch": 2.657142857142857,
+ "grad_norm": 0.6462066173553467,
+ "learning_rate": 0.0002602285714285714,
+ "loss": 0.1512,
+ "step": 465
+ },
+ {
+ "epoch": 2.685714285714286,
+ "grad_norm": 0.8765937089920044,
+ "learning_rate": 0.00025979999999999997,
+ "loss": 0.1826,
+ "step": 470
+ },
+ {
+ "epoch": 2.7142857142857144,
+ "grad_norm": 1.2524124383926392,
+ "learning_rate": 0.00025937142857142856,
+ "loss": 0.1731,
+ "step": 475
+ },
+ {
+ "epoch": 2.742857142857143,
+ "grad_norm": 2.2982606887817383,
+ "learning_rate": 0.0002589428571428571,
+ "loss": 0.1852,
+ "step": 480
+ },
+ {
+ "epoch": 2.7714285714285714,
+ "grad_norm": 0.9989053010940552,
+ "learning_rate": 0.0002585142857142857,
+ "loss": 0.1791,
+ "step": 485
+ },
+ {
+ "epoch": 2.8,
+ "grad_norm": 0.772343635559082,
+ "learning_rate": 0.00025808571428571426,
+ "loss": 0.1862,
+ "step": 490
+ },
+ {
+ "epoch": 2.8285714285714287,
+ "grad_norm": 1.2101136445999146,
+ "learning_rate": 0.00025765714285714284,
+ "loss": 0.1806,
+ "step": 495
+ },
+ {
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.8010189533233643,
+ "learning_rate": 0.0002572285714285714,
+ "loss": 0.1842,
+ "step": 500
+ },
+ {
+ "epoch": 2.8857142857142857,
+ "grad_norm": 1.3597544431686401,
+ "learning_rate": 0.00025679999999999995,
+ "loss": 0.1583,
+ "step": 505
+ },
+ {
+ "epoch": 2.914285714285714,
+ "grad_norm": 0.8790671825408936,
+ "learning_rate": 0.00025637142857142854,
+ "loss": 0.1565,
+ "step": 510
+ },
+ {
+ "epoch": 2.942857142857143,
+ "grad_norm": 1.1175066232681274,
+ "learning_rate": 0.0002559428571428571,
+ "loss": 0.1406,
+ "step": 515
+ },
+ {
+ "epoch": 2.9714285714285715,
+ "grad_norm": 2.8528785705566406,
+ "learning_rate": 0.0002555142857142857,
+ "loss": 0.1735,
+ "step": 520
+ },
+ {
+ "epoch": 3.0,
+ "grad_norm": 2.2073328495025635,
+ "learning_rate": 0.0002550857142857143,
+ "loss": 0.1816,
+ "step": 525
+ },
+ {
+ "epoch": 3.0285714285714285,
+ "grad_norm": 11.01322078704834,
+ "learning_rate": 0.0002546571428571428,
+ "loss": 0.1873,
+ "step": 530
+ },
+ {
+ "epoch": 3.057142857142857,
+ "grad_norm": 1.5822402238845825,
+ "learning_rate": 0.0002542285714285714,
+ "loss": 0.168,
+ "step": 535
+ },
+ {
+ "epoch": 3.085714285714286,
+ "grad_norm": 1.3086942434310913,
+ "learning_rate": 0.0002538,
+ "loss": 0.149,
+ "step": 540
+ },
+ {
+ "epoch": 3.1142857142857143,
+ "grad_norm": 6.303041458129883,
+ "learning_rate": 0.0002533714285714285,
+ "loss": 0.1651,
+ "step": 545
+ },
+ {
+ "epoch": 3.142857142857143,
+ "grad_norm": 14.48929500579834,
+ "learning_rate": 0.00025294285714285716,
+ "loss": 0.1687,
+ "step": 550
+ },
+ {
+ "epoch": 3.1714285714285713,
+ "grad_norm": 6.824525356292725,
+ "learning_rate": 0.0002525142857142857,
+ "loss": 0.1919,
+ "step": 555
+ },
+ {
+ "epoch": 3.2,
+ "grad_norm": 18.772563934326172,
+ "learning_rate": 0.00025208571428571427,
+ "loss": 0.2075,
+ "step": 560
+ },
+ {
+ "epoch": 3.2285714285714286,
+ "grad_norm": 0.7268752455711365,
+ "learning_rate": 0.00025165714285714286,
+ "loss": 0.174,
+ "step": 565
+ },
+ {
+ "epoch": 3.257142857142857,
+ "grad_norm": 1.1301453113555908,
+ "learning_rate": 0.0002512285714285714,
+ "loss": 0.1668,
+ "step": 570
+ },
+ {
+ "epoch": 3.2857142857142856,
+ "grad_norm": 2.846802234649658,
+ "learning_rate": 0.00025079999999999997,
+ "loss": 0.1645,
+ "step": 575
+ },
+ {
+ "epoch": 3.314285714285714,
+ "grad_norm": 1.417515754699707,
+ "learning_rate": 0.00025037142857142855,
+ "loss": 0.1719,
+ "step": 580
+ },
+ {
+ "epoch": 3.342857142857143,
+ "grad_norm": 4.137150764465332,
+ "learning_rate": 0.00024994285714285714,
+ "loss": 0.1739,
+ "step": 585
+ },
+ {
+ "epoch": 3.3714285714285714,
+ "grad_norm": 2.6067259311676025,
+ "learning_rate": 0.0002495142857142857,
+ "loss": 0.1489,
+ "step": 590
+ },
+ {
+ "epoch": 3.4,
+ "grad_norm": 2.601024627685547,
+ "learning_rate": 0.00024908571428571425,
+ "loss": 0.1618,
+ "step": 595
+ },
+ {
+ "epoch": 3.4285714285714284,
+ "grad_norm": 3.849017858505249,
+ "learning_rate": 0.00024865714285714284,
+ "loss": 0.1899,
+ "step": 600
+ },
+ {
+ "epoch": 3.4571428571428573,
+ "grad_norm": 4.673766136169434,
+ "learning_rate": 0.0002482285714285714,
+ "loss": 0.1761,
+ "step": 605
+ },
+ {
+ "epoch": 3.4857142857142858,
+ "grad_norm": 2.6057631969451904,
+ "learning_rate": 0.00024779999999999995,
+ "loss": 0.1743,
+ "step": 610
+ },
+ {
+ "epoch": 3.5142857142857142,
+ "grad_norm": 2.932652473449707,
+ "learning_rate": 0.0002473714285714286,
+ "loss": 0.1482,
+ "step": 615
+ },
+ {
+ "epoch": 3.5428571428571427,
+ "grad_norm": 0.8764939308166504,
+ "learning_rate": 0.0002469428571428571,
+ "loss": 0.1644,
+ "step": 620
+ },
+ {
+ "epoch": 3.571428571428571,
+ "grad_norm": 1.3203191757202148,
+ "learning_rate": 0.0002465142857142857,
+ "loss": 0.1654,
+ "step": 625
+ },
+ {
+ "epoch": 3.6,
+ "grad_norm": 0.7977635264396667,
+ "learning_rate": 0.0002460857142857143,
+ "loss": 0.1472,
+ "step": 630
+ },
+ {
+ "epoch": 3.6285714285714286,
+ "grad_norm": 1.4750248193740845,
+ "learning_rate": 0.0002456571428571428,
+ "loss": 0.1735,
+ "step": 635
+ },
+ {
+ "epoch": 3.657142857142857,
+ "grad_norm": 1.8164482116699219,
+ "learning_rate": 0.0002452285714285714,
+ "loss": 0.1593,
+ "step": 640
+ },
+ {
+ "epoch": 3.685714285714286,
+ "grad_norm": 1.4829603433609009,
+ "learning_rate": 0.0002448,
+ "loss": 0.1508,
+ "step": 645
+ },
+ {
+ "epoch": 3.7142857142857144,
+ "grad_norm": 0.8828144669532776,
+ "learning_rate": 0.00024437142857142857,
+ "loss": 0.1573,
+ "step": 650
+ },
+ {
+ "epoch": 3.742857142857143,
+ "grad_norm": 2.039384126663208,
+ "learning_rate": 0.00024394285714285713,
+ "loss": 0.1745,
+ "step": 655
+ },
+ {
+ "epoch": 3.7714285714285714,
+ "grad_norm": 0.9604200720787048,
+ "learning_rate": 0.00024351428571428569,
+ "loss": 0.17,
+ "step": 660
+ },
+ {
+ "epoch": 3.8,
+ "grad_norm": 0.7903971076011658,
+ "learning_rate": 0.00024308571428571427,
+ "loss": 0.1654,
+ "step": 665
+ },
+ {
+ "epoch": 3.8285714285714287,
+ "grad_norm": 0.6935649514198303,
+ "learning_rate": 0.00024265714285714283,
+ "loss": 0.1714,
+ "step": 670
+ },
+ {
+ "epoch": 3.857142857142857,
+ "grad_norm": 0.5832012295722961,
+ "learning_rate": 0.00024222857142857138,
+ "loss": 0.1636,
+ "step": 675
+ },
+ {
+ "epoch": 3.8857142857142857,
+ "grad_norm": 0.6303168535232544,
+ "learning_rate": 0.0002418,
+ "loss": 0.1604,
+ "step": 680
+ },
+ {
+ "epoch": 3.914285714285714,
+ "grad_norm": 0.7210885882377625,
+ "learning_rate": 0.00024137142857142855,
+ "loss": 0.1444,
+ "step": 685
+ },
+ {
+ "epoch": 3.942857142857143,
+ "grad_norm": 0.7690990567207336,
+ "learning_rate": 0.00024094285714285714,
+ "loss": 0.1631,
+ "step": 690
+ },
+ {
+ "epoch": 3.9714285714285715,
+ "grad_norm": 1.0142720937728882,
+ "learning_rate": 0.0002405142857142857,
+ "loss": 0.158,
+ "step": 695
+ },
+ {
+ "epoch": 4.0,
+ "grad_norm": 0.7970322966575623,
+ "learning_rate": 0.00024008571428571425,
+ "loss": 0.1803,
+ "step": 700
+ },
+ {
+ "epoch": 4.0285714285714285,
+ "grad_norm": 0.6795914769172668,
+ "learning_rate": 0.00023965714285714284,
+ "loss": 0.143,
+ "step": 705
+ },
+ {
+ "epoch": 4.057142857142857,
+ "grad_norm": 0.6832629442214966,
+ "learning_rate": 0.0002392285714285714,
+ "loss": 0.1457,
+ "step": 710
+ },
+ {
+ "epoch": 4.085714285714285,
+ "grad_norm": 3.8629798889160156,
+ "learning_rate": 0.0002388,
+ "loss": 0.1671,
+ "step": 715
+ },
+ {
+ "epoch": 4.114285714285714,
+ "grad_norm": 1.1167882680892944,
+ "learning_rate": 0.00023837142857142856,
+ "loss": 0.1544,
+ "step": 720
+ },
+ {
+ "epoch": 4.142857142857143,
+ "grad_norm": 0.9431412816047668,
+ "learning_rate": 0.00023794285714285712,
+ "loss": 0.1605,
+ "step": 725
+ },
+ {
+ "epoch": 4.171428571428572,
+ "grad_norm": 1.310948133468628,
+ "learning_rate": 0.0002375142857142857,
+ "loss": 0.1121,
+ "step": 730
+ },
+ {
+ "epoch": 4.2,
+ "grad_norm": 0.9830737709999084,
+ "learning_rate": 0.00023708571428571426,
+ "loss": 0.1742,
+ "step": 735
+ },
+ {
+ "epoch": 4.228571428571429,
+ "grad_norm": 0.6166555881500244,
+ "learning_rate": 0.00023665714285714282,
+ "loss": 0.1525,
+ "step": 740
+ },
+ {
+ "epoch": 4.257142857142857,
+ "grad_norm": 0.995579719543457,
+ "learning_rate": 0.00023622857142857143,
+ "loss": 0.1439,
+ "step": 745
+ },
+ {
+ "epoch": 4.285714285714286,
+ "grad_norm": 0.639796793460846,
+ "learning_rate": 0.00023579999999999999,
+ "loss": 0.1692,
+ "step": 750
+ },
+ {
+ "epoch": 4.314285714285714,
+ "grad_norm": 0.9438050389289856,
+ "learning_rate": 0.00023537142857142854,
+ "loss": 0.1785,
+ "step": 755
+ },
+ {
+ "epoch": 4.3428571428571425,
+ "grad_norm": 0.8960750102996826,
+ "learning_rate": 0.00023494285714285713,
+ "loss": 0.1557,
+ "step": 760
+ },
+ {
+ "epoch": 4.371428571428572,
+ "grad_norm": 0.6287499070167542,
+ "learning_rate": 0.00023451428571428568,
+ "loss": 0.1459,
+ "step": 765
+ },
+ {
+ "epoch": 4.4,
+ "grad_norm": 0.7638295888900757,
+ "learning_rate": 0.00023408571428571424,
+ "loss": 0.1341,
+ "step": 770
+ },
+ {
+ "epoch": 4.428571428571429,
+ "grad_norm": 0.655878484249115,
+ "learning_rate": 0.00023365714285714283,
+ "loss": 0.1358,
+ "step": 775
+ },
+ {
+ "epoch": 4.457142857142857,
+ "grad_norm": 0.5840997695922852,
+ "learning_rate": 0.0002332285714285714,
+ "loss": 0.1386,
+ "step": 780
+ },
+ {
+ "epoch": 4.485714285714286,
+ "grad_norm": 1.1082488298416138,
+ "learning_rate": 0.0002328,
+ "loss": 0.1827,
+ "step": 785
+ },
+ {
+ "epoch": 4.514285714285714,
+ "grad_norm": 0.8825240135192871,
+ "learning_rate": 0.00023237142857142855,
+ "loss": 0.1527,
+ "step": 790
+ },
+ {
+ "epoch": 4.542857142857143,
+ "grad_norm": 0.6752304434776306,
+ "learning_rate": 0.0002319428571428571,
+ "loss": 0.1392,
+ "step": 795
+ },
+ {
+ "epoch": 4.571428571428571,
+ "grad_norm": 1.1423301696777344,
+ "learning_rate": 0.0002315142857142857,
+ "loss": 0.1433,
+ "step": 800
+ },
+ {
+ "epoch": 4.6,
+ "grad_norm": 10.793691635131836,
+ "learning_rate": 0.00023108571428571425,
+ "loss": 0.1635,
+ "step": 805
+ },
+ {
+ "epoch": 4.628571428571428,
+ "grad_norm": 0.47564294934272766,
+ "learning_rate": 0.00023065714285714286,
+ "loss": 0.1199,
+ "step": 810
+ },
+ {
+ "epoch": 4.6571428571428575,
+ "grad_norm": 1.2492656707763672,
+ "learning_rate": 0.00023022857142857142,
+ "loss": 0.1488,
+ "step": 815
+ },
+ {
+ "epoch": 4.685714285714286,
+ "grad_norm": 0.6933501958847046,
+ "learning_rate": 0.00022979999999999997,
+ "loss": 0.1812,
+ "step": 820
+ },
+ {
+ "epoch": 4.714285714285714,
+ "grad_norm": 0.7901633977890015,
+ "learning_rate": 0.00022937142857142856,
+ "loss": 0.1415,
+ "step": 825
+ },
+ {
+ "epoch": 4.742857142857143,
+ "grad_norm": 0.7854829430580139,
+ "learning_rate": 0.00022894285714285712,
+ "loss": 0.1401,
+ "step": 830
+ },
+ {
+ "epoch": 4.771428571428571,
+ "grad_norm": 0.8716740608215332,
+ "learning_rate": 0.00022851428571428567,
+ "loss": 0.1982,
+ "step": 835
+ },
+ {
+ "epoch": 4.8,
+ "grad_norm": 0.7047899961471558,
+ "learning_rate": 0.00022808571428571426,
+ "loss": 0.1624,
+ "step": 840
+ },
+ {
+ "epoch": 4.828571428571428,
+ "grad_norm": 0.7134959697723389,
+ "learning_rate": 0.00022765714285714284,
+ "loss": 0.1375,
+ "step": 845
+ },
+ {
+ "epoch": 4.857142857142857,
+ "grad_norm": 1.0897325277328491,
+ "learning_rate": 0.00022722857142857143,
+ "loss": 0.1489,
+ "step": 850
+ },
+ {
+ "epoch": 4.885714285714286,
+ "grad_norm": 1.1065207719802856,
+ "learning_rate": 0.00022679999999999998,
+ "loss": 0.1495,
+ "step": 855
+ },
+ {
+ "epoch": 4.914285714285715,
+ "grad_norm": 0.7434757351875305,
+ "learning_rate": 0.00022637142857142854,
+ "loss": 0.1507,
+ "step": 860
+ },
+ {
+ "epoch": 4.942857142857143,
+ "grad_norm": 1.0045181512832642,
+ "learning_rate": 0.00022594285714285712,
+ "loss": 0.1527,
+ "step": 865
+ },
+ {
+ "epoch": 4.9714285714285715,
+ "grad_norm": 1.2025654315948486,
+ "learning_rate": 0.00022551428571428568,
+ "loss": 0.1523,
+ "step": 870
+ },
+ {
+ "epoch": 5.0,
+ "grad_norm": 0.7823342084884644,
+ "learning_rate": 0.0002250857142857143,
+ "loss": 0.1514,
+ "step": 875
+ },
+ {
+ "epoch": 5.0285714285714285,
+ "grad_norm": 0.8405362963676453,
+ "learning_rate": 0.00022465714285714285,
+ "loss": 0.1461,
+ "step": 880
+ },
+ {
+ "epoch": 5.057142857142857,
+ "grad_norm": 0.7527463436126709,
+ "learning_rate": 0.0002242285714285714,
+ "loss": 0.1206,
+ "step": 885
+ },
+ {
+ "epoch": 5.085714285714285,
+ "grad_norm": 0.8372548222541809,
+ "learning_rate": 0.0002238,
+ "loss": 0.1513,
+ "step": 890
+ },
+ {
+ "epoch": 5.114285714285714,
+ "grad_norm": 0.8755456209182739,
+ "learning_rate": 0.00022337142857142855,
+ "loss": 0.1498,
+ "step": 895
+ },
+ {
+ "epoch": 5.142857142857143,
+ "grad_norm": 0.7312084436416626,
+ "learning_rate": 0.0002229428571428571,
+ "loss": 0.154,
+ "step": 900
+ },
+ {
+ "epoch": 5.171428571428572,
+ "grad_norm": 0.6366221904754639,
+ "learning_rate": 0.0002225142857142857,
+ "loss": 0.1466,
+ "step": 905
+ },
+ {
+ "epoch": 5.2,
+ "grad_norm": 0.6406880617141724,
+ "learning_rate": 0.00022208571428571427,
+ "loss": 0.1254,
+ "step": 910
+ },
+ {
+ "epoch": 5.228571428571429,
+ "grad_norm": 2.4106833934783936,
+ "learning_rate": 0.00022165714285714283,
+ "loss": 0.1534,
+ "step": 915
+ },
+ {
+ "epoch": 5.257142857142857,
+ "grad_norm": 0.5635722279548645,
+ "learning_rate": 0.00022122857142857142,
+ "loss": 0.1461,
+ "step": 920
+ },
+ {
+ "epoch": 5.285714285714286,
+ "grad_norm": 0.787162184715271,
+ "learning_rate": 0.00022079999999999997,
+ "loss": 0.1424,
+ "step": 925
+ },
+ {
+ "epoch": 5.314285714285714,
+ "grad_norm": 0.6513975262641907,
+ "learning_rate": 0.00022037142857142853,
+ "loss": 0.1326,
+ "step": 930
+ },
+ {
+ "epoch": 5.3428571428571425,
+ "grad_norm": 0.6933534741401672,
+ "learning_rate": 0.00021994285714285711,
+ "loss": 0.1661,
+ "step": 935
+ },
+ {
+ "epoch": 5.371428571428572,
+ "grad_norm": 0.7263259887695312,
+ "learning_rate": 0.0002195142857142857,
+ "loss": 0.15,
+ "step": 940
+ },
+ {
+ "epoch": 5.4,
+ "grad_norm": 0.5537381768226624,
+ "learning_rate": 0.00021908571428571428,
+ "loss": 0.129,
+ "step": 945
+ },
+ {
+ "epoch": 5.428571428571429,
+ "grad_norm": 0.6014005541801453,
+ "learning_rate": 0.00021865714285714284,
+ "loss": 0.1321,
+ "step": 950
+ },
+ {
+ "epoch": 5.457142857142857,
+ "grad_norm": 0.6581441760063171,
+ "learning_rate": 0.0002182285714285714,
+ "loss": 0.1587,
+ "step": 955
+ },
+ {
+ "epoch": 5.485714285714286,
+ "grad_norm": 0.9326379895210266,
+ "learning_rate": 0.00021779999999999998,
+ "loss": 0.1654,
+ "step": 960
+ },
+ {
+ "epoch": 5.514285714285714,
+ "grad_norm": 0.9438592791557312,
+ "learning_rate": 0.00021737142857142854,
+ "loss": 0.1212,
+ "step": 965
+ },
+ {
+ "epoch": 5.542857142857143,
+ "grad_norm": 0.7699571251869202,
+ "learning_rate": 0.00021694285714285715,
+ "loss": 0.1464,
+ "step": 970
+ },
+ {
+ "epoch": 5.571428571428571,
+ "grad_norm": 0.8758366703987122,
+ "learning_rate": 0.0002165142857142857,
+ "loss": 0.1599,
+ "step": 975
+ },
+ {
+ "epoch": 5.6,
+ "grad_norm": 0.6101442575454712,
+ "learning_rate": 0.00021608571428571426,
+ "loss": 0.1589,
+ "step": 980
+ },
+ {
+ "epoch": 5.628571428571428,
+ "grad_norm": 0.7454060912132263,
+ "learning_rate": 0.00021565714285714285,
+ "loss": 0.1433,
+ "step": 985
+ },
+ {
+ "epoch": 5.6571428571428575,
+ "grad_norm": 0.6379484534263611,
+ "learning_rate": 0.0002152285714285714,
+ "loss": 0.1592,
+ "step": 990
+ },
+ {
+ "epoch": 5.685714285714286,
+ "grad_norm": 1.1601309776306152,
+ "learning_rate": 0.00021479999999999996,
+ "loss": 0.1647,
+ "step": 995
+ },
+ {
+ "epoch": 5.714285714285714,
+ "grad_norm": 0.5464673638343811,
+ "learning_rate": 0.00021437142857142855,
+ "loss": 0.1469,
+ "step": 1000
+ },
+ {
+ "epoch": 5.742857142857143,
+ "grad_norm": 1.0279319286346436,
+ "learning_rate": 0.00021394285714285713,
+ "loss": 0.1203,
+ "step": 1005
+ },
+ {
+ "epoch": 5.771428571428571,
+ "grad_norm": 0.5503718256950378,
+ "learning_rate": 0.00021351428571428572,
+ "loss": 0.1409,
+ "step": 1010
+ },
+ {
+ "epoch": 5.8,
+ "grad_norm": 0.6123886108398438,
+ "learning_rate": 0.00021308571428571427,
+ "loss": 0.1427,
+ "step": 1015
+ },
+ {
+ "epoch": 5.828571428571428,
+ "grad_norm": 0.6560390591621399,
+ "learning_rate": 0.00021265714285714283,
+ "loss": 0.1415,
+ "step": 1020
+ },
+ {
+ "epoch": 5.857142857142857,
+ "grad_norm": 0.5576716661453247,
+ "learning_rate": 0.00021222857142857141,
+ "loss": 0.1408,
+ "step": 1025
+ },
+ {
+ "epoch": 5.885714285714286,
+ "grad_norm": 0.6419074535369873,
+ "learning_rate": 0.00021179999999999997,
+ "loss": 0.1385,
+ "step": 1030
+ },
+ {
+ "epoch": 5.914285714285715,
+ "grad_norm": 1.008925199508667,
+ "learning_rate": 0.00021137142857142858,
+ "loss": 0.1497,
+ "step": 1035
+ },
+ {
+ "epoch": 5.942857142857143,
+ "grad_norm": 0.6559906005859375,
+ "learning_rate": 0.00021094285714285714,
+ "loss": 0.1218,
+ "step": 1040
+ },
+ {
+ "epoch": 5.9714285714285715,
+ "grad_norm": 0.627164363861084,
+ "learning_rate": 0.0002105142857142857,
+ "loss": 0.1368,
+ "step": 1045
+ },
+ {
+ "epoch": 6.0,
+ "grad_norm": 0.5760972499847412,
+ "learning_rate": 0.00021008571428571428,
+ "loss": 0.1508,
+ "step": 1050
+ },
+ {
+ "epoch": 6.0285714285714285,
+ "grad_norm": 0.5754174590110779,
+ "learning_rate": 0.00020965714285714284,
+ "loss": 0.1181,
+ "step": 1055
+ },
+ {
+ "epoch": 6.057142857142857,
+ "grad_norm": 0.8736348748207092,
+ "learning_rate": 0.0002092285714285714,
+ "loss": 0.1252,
+ "step": 1060
+ },
+ {
+ "epoch": 6.085714285714285,
+ "grad_norm": 0.7166719436645508,
+ "learning_rate": 0.00020879999999999998,
+ "loss": 0.1481,
+ "step": 1065
+ },
+ {
+ "epoch": 6.114285714285714,
+ "grad_norm": 0.6494349241256714,
+ "learning_rate": 0.00020837142857142856,
+ "loss": 0.1478,
+ "step": 1070
+ },
+ {
+ "epoch": 6.142857142857143,
+ "grad_norm": 0.6681587100028992,
+ "learning_rate": 0.00020794285714285712,
+ "loss": 0.1488,
+ "step": 1075
+ },
+ {
+ "epoch": 6.171428571428572,
+ "grad_norm": 0.7123684883117676,
+ "learning_rate": 0.0002075142857142857,
+ "loss": 0.1378,
+ "step": 1080
+ },
+ {
+ "epoch": 6.2,
+ "grad_norm": 0.6146950721740723,
+ "learning_rate": 0.00020708571428571426,
+ "loss": 0.1306,
+ "step": 1085
+ },
+ {
+ "epoch": 6.228571428571429,
+ "grad_norm": 0.8402445912361145,
+ "learning_rate": 0.00020665714285714282,
+ "loss": 0.1063,
+ "step": 1090
+ },
+ {
+ "epoch": 6.257142857142857,
+ "grad_norm": 0.6567764282226562,
+ "learning_rate": 0.0002062285714285714,
+ "loss": 0.1195,
+ "step": 1095
+ },
+ {
+ "epoch": 6.285714285714286,
+ "grad_norm": 0.6006014943122864,
+ "learning_rate": 0.0002058,
+ "loss": 0.1542,
+ "step": 1100
+ },
+ {
+ "epoch": 6.314285714285714,
+ "grad_norm": 0.793100893497467,
+ "learning_rate": 0.00020537142857142857,
+ "loss": 0.1381,
+ "step": 1105
+ },
+ {
+ "epoch": 6.3428571428571425,
+ "grad_norm": 0.5923666954040527,
+ "learning_rate": 0.00020494285714285713,
+ "loss": 0.1386,
+ "step": 1110
+ },
+ {
+ "epoch": 6.371428571428572,
+ "grad_norm": 0.6692521572113037,
+ "learning_rate": 0.0002045142857142857,
+ "loss": 0.1223,
+ "step": 1115
+ },
+ {
+ "epoch": 6.4,
+ "grad_norm": 0.7216306328773499,
+ "learning_rate": 0.00020408571428571427,
+ "loss": 0.1367,
+ "step": 1120
+ },
+ {
+ "epoch": 6.428571428571429,
+ "grad_norm": 0.5640934109687805,
+ "learning_rate": 0.00020365714285714283,
+ "loss": 0.1554,
+ "step": 1125
+ },
+ {
+ "epoch": 6.457142857142857,
+ "grad_norm": 0.8154368996620178,
+ "learning_rate": 0.00020322857142857138,
+ "loss": 0.1674,
+ "step": 1130
+ },
+ {
+ "epoch": 6.485714285714286,
+ "grad_norm": 0.7185398936271667,
+ "learning_rate": 0.0002028,
+ "loss": 0.1375,
+ "step": 1135
+ },
+ {
+ "epoch": 6.514285714285714,
+ "grad_norm": 0.6805170774459839,
+ "learning_rate": 0.00020237142857142855,
+ "loss": 0.1306,
+ "step": 1140
+ },
+ {
+ "epoch": 6.542857142857143,
+ "grad_norm": 0.5996941924095154,
+ "learning_rate": 0.00020194285714285714,
+ "loss": 0.1433,
+ "step": 1145
+ },
+ {
+ "epoch": 6.571428571428571,
+ "grad_norm": 0.5258373022079468,
+ "learning_rate": 0.0002015142857142857,
+ "loss": 0.1285,
+ "step": 1150
+ },
+ {
+ "epoch": 6.6,
+ "grad_norm": 0.7771695256233215,
+ "learning_rate": 0.00020108571428571425,
+ "loss": 0.1493,
+ "step": 1155
+ },
+ {
+ "epoch": 6.628571428571428,
+ "grad_norm": 0.5920616388320923,
+ "learning_rate": 0.00020065714285714284,
+ "loss": 0.1479,
+ "step": 1160
+ },
+ {
+ "epoch": 6.6571428571428575,
+ "grad_norm": 0.7460982799530029,
+ "learning_rate": 0.00020022857142857142,
+ "loss": 0.1173,
+ "step": 1165
+ },
+ {
+ "epoch": 6.685714285714286,
+ "grad_norm": 1.1703822612762451,
+ "learning_rate": 0.0001998,
+ "loss": 0.1402,
+ "step": 1170
+ },
+ {
+ "epoch": 6.714285714285714,
+ "grad_norm": 0.7894724011421204,
+ "learning_rate": 0.00019937142857142856,
+ "loss": 0.1253,
+ "step": 1175
+ },
+ {
+ "epoch": 6.742857142857143,
+ "grad_norm": 0.7013376355171204,
+ "learning_rate": 0.00019894285714285712,
+ "loss": 0.1573,
+ "step": 1180
+ },
+ {
+ "epoch": 6.771428571428571,
+ "grad_norm": 0.6421737670898438,
+ "learning_rate": 0.0001985142857142857,
+ "loss": 0.1497,
+ "step": 1185
+ },
+ {
+ "epoch": 6.8,
+ "grad_norm": 1.204296350479126,
+ "learning_rate": 0.00019808571428571426,
+ "loss": 0.1634,
+ "step": 1190
+ },
+ {
+ "epoch": 6.828571428571428,
+ "grad_norm": 0.867765486240387,
+ "learning_rate": 0.00019765714285714282,
+ "loss": 0.1353,
+ "step": 1195
+ },
+ {
+ "epoch": 6.857142857142857,
+ "grad_norm": 0.7325594425201416,
+ "learning_rate": 0.00019722857142857143,
+ "loss": 0.118,
+ "step": 1200
+ },
+ {
+ "epoch": 6.885714285714286,
+ "grad_norm": 0.7029078006744385,
+ "learning_rate": 0.00019679999999999999,
+ "loss": 0.1425,
+ "step": 1205
+ },
+ {
+ "epoch": 6.914285714285715,
+ "grad_norm": 1.1572504043579102,
+ "learning_rate": 0.00019637142857142857,
+ "loss": 0.1337,
+ "step": 1210
+ },
+ {
+ "epoch": 6.942857142857143,
+ "grad_norm": 0.8022822141647339,
+ "learning_rate": 0.00019594285714285713,
+ "loss": 0.1684,
+ "step": 1215
+ },
+ {
+ "epoch": 6.9714285714285715,
+ "grad_norm": 0.6729874610900879,
+ "learning_rate": 0.00019551428571428568,
+ "loss": 0.1238,
+ "step": 1220
+ },
+ {
+ "epoch": 7.0,
+ "grad_norm": 0.5773627758026123,
+ "learning_rate": 0.00019508571428571427,
+ "loss": 0.138,
+ "step": 1225
+ },
+ {
+ "epoch": 7.0285714285714285,
+ "grad_norm": 0.7182291150093079,
+ "learning_rate": 0.00019465714285714285,
+ "loss": 0.1431,
+ "step": 1230
+ },
+ {
+ "epoch": 7.057142857142857,
+ "grad_norm": 1.7567912340164185,
+ "learning_rate": 0.0001942285714285714,
+ "loss": 0.1319,
+ "step": 1235
+ },
+ {
+ "epoch": 7.085714285714285,
+ "grad_norm": 0.6845232248306274,
+ "learning_rate": 0.0001938,
+ "loss": 0.1292,
+ "step": 1240
+ },
+ {
+ "epoch": 7.114285714285714,
+ "grad_norm": 0.6077771782875061,
+ "learning_rate": 0.00019337142857142855,
+ "loss": 0.1238,
+ "step": 1245
+ },
+ {
+ "epoch": 7.142857142857143,
+ "grad_norm": 0.6168347597122192,
+ "learning_rate": 0.0001929428571428571,
+ "loss": 0.1384,
+ "step": 1250
+ },
+ {
+ "epoch": 7.171428571428572,
+ "grad_norm": 0.7457576394081116,
+ "learning_rate": 0.0001925142857142857,
+ "loss": 0.1306,
+ "step": 1255
+ },
+ {
+ "epoch": 7.2,
+ "grad_norm": 0.5969316363334656,
+ "learning_rate": 0.00019208571428571425,
+ "loss": 0.1123,
+ "step": 1260
+ },
+ {
+ "epoch": 7.228571428571429,
+ "grad_norm": 0.6902753710746765,
+ "learning_rate": 0.00019165714285714286,
+ "loss": 0.1185,
+ "step": 1265
+ },
+ {
+ "epoch": 7.257142857142857,
+ "grad_norm": 0.6488338112831116,
+ "learning_rate": 0.00019122857142857142,
+ "loss": 0.1431,
+ "step": 1270
+ },
+ {
+ "epoch": 7.285714285714286,
+ "grad_norm": 0.6814819574356079,
+ "learning_rate": 0.00019079999999999998,
+ "loss": 0.1495,
+ "step": 1275
+ },
+ {
+ "epoch": 7.314285714285714,
+ "grad_norm": 0.7468088865280151,
+ "learning_rate": 0.00019037142857142856,
+ "loss": 0.1158,
+ "step": 1280
+ },
+ {
+ "epoch": 7.3428571428571425,
+ "grad_norm": 0.7417412400245667,
+ "learning_rate": 0.00018994285714285712,
+ "loss": 0.1311,
+ "step": 1285
+ },
+ {
+ "epoch": 7.371428571428572,
+ "grad_norm": 0.5480664372444153,
+ "learning_rate": 0.00018951428571428567,
+ "loss": 0.135,
+ "step": 1290
+ },
+ {
+ "epoch": 7.4,
+ "grad_norm": 0.725527822971344,
+ "learning_rate": 0.00018908571428571429,
+ "loss": 0.1217,
+ "step": 1295
+ },
+ {
+ "epoch": 7.428571428571429,
+ "grad_norm": 0.6566678285598755,
+ "learning_rate": 0.00018865714285714284,
+ "loss": 0.1417,
+ "step": 1300
+ },
+ {
+ "epoch": 7.457142857142857,
+ "grad_norm": 0.516952395439148,
+ "learning_rate": 0.00018822857142857143,
+ "loss": 0.1329,
+ "step": 1305
+ },
+ {
+ "epoch": 7.485714285714286,
+ "grad_norm": 1.9545241594314575,
+ "learning_rate": 0.00018779999999999998,
+ "loss": 0.1339,
+ "step": 1310
+ },
+ {
+ "epoch": 7.514285714285714,
+ "grad_norm": 0.8276839852333069,
+ "learning_rate": 0.00018737142857142854,
+ "loss": 0.1324,
+ "step": 1315
+ },
+ {
+ "epoch": 7.542857142857143,
+ "grad_norm": 0.6737099289894104,
+ "learning_rate": 0.00018694285714285713,
+ "loss": 0.1139,
+ "step": 1320
+ },
+ {
+ "epoch": 7.571428571428571,
+ "grad_norm": 0.6914472579956055,
+ "learning_rate": 0.00018651428571428568,
+ "loss": 0.1146,
+ "step": 1325
+ },
+ {
+ "epoch": 7.6,
+ "grad_norm": 0.6630033850669861,
+ "learning_rate": 0.0001860857142857143,
+ "loss": 0.1571,
+ "step": 1330
+ },
+ {
+ "epoch": 7.628571428571428,
+ "grad_norm": 0.820688784122467,
+ "learning_rate": 0.00018565714285714285,
+ "loss": 0.15,
+ "step": 1335
+ },
+ {
+ "epoch": 7.6571428571428575,
+ "grad_norm": 2.0491325855255127,
+ "learning_rate": 0.0001852285714285714,
+ "loss": 0.127,
+ "step": 1340
+ },
+ {
+ "epoch": 7.685714285714286,
+ "grad_norm": 0.9327268004417419,
+ "learning_rate": 0.0001848,
+ "loss": 0.1289,
+ "step": 1345
+ },
+ {
+ "epoch": 7.714285714285714,
+ "grad_norm": 1.3131701946258545,
+ "learning_rate": 0.00018437142857142855,
+ "loss": 0.1228,
+ "step": 1350
+ },
+ {
+ "epoch": 7.742857142857143,
+ "grad_norm": 2.955918312072754,
+ "learning_rate": 0.0001839428571428571,
+ "loss": 0.1082,
+ "step": 1355
+ },
+ {
+ "epoch": 7.771428571428571,
+ "grad_norm": 1.2165493965148926,
+ "learning_rate": 0.00018351428571428572,
+ "loss": 0.1688,
+ "step": 1360
+ },
+ {
+ "epoch": 7.8,
+ "grad_norm": 0.759324312210083,
+ "learning_rate": 0.00018308571428571428,
+ "loss": 0.1185,
+ "step": 1365
+ },
+ {
+ "epoch": 7.828571428571428,
+ "grad_norm": 0.7445591688156128,
+ "learning_rate": 0.00018265714285714286,
+ "loss": 0.1431,
+ "step": 1370
+ },
+ {
+ "epoch": 7.857142857142857,
+ "grad_norm": 0.679374098777771,
+ "learning_rate": 0.00018222857142857142,
+ "loss": 0.1451,
+ "step": 1375
+ },
+ {
+ "epoch": 7.885714285714286,
+ "grad_norm": 2.1234302520751953,
+ "learning_rate": 0.00018179999999999997,
+ "loss": 0.1265,
+ "step": 1380
+ },
+ {
+ "epoch": 7.914285714285715,
+ "grad_norm": 1.006521224975586,
+ "learning_rate": 0.00018137142857142856,
+ "loss": 0.1722,
+ "step": 1385
+ },
+ {
+ "epoch": 7.942857142857143,
+ "grad_norm": 0.7275253534317017,
+ "learning_rate": 0.00018094285714285712,
+ "loss": 0.1625,
+ "step": 1390
+ },
+ {
+ "epoch": 7.9714285714285715,
+ "grad_norm": 0.8612022995948792,
+ "learning_rate": 0.0001805142857142857,
+ "loss": 0.1345,
+ "step": 1395
+ },
+ {
+ "epoch": 8.0,
+ "grad_norm": 0.7276798486709595,
+ "learning_rate": 0.00018008571428571428,
+ "loss": 0.1236,
+ "step": 1400
+ },
+ {
+ "epoch": 8.028571428571428,
+ "grad_norm": 0.8731086850166321,
+ "learning_rate": 0.00017965714285714284,
+ "loss": 0.1604,
+ "step": 1405
+ },
+ {
+ "epoch": 8.057142857142857,
+ "grad_norm": 0.8950818777084351,
+ "learning_rate": 0.0001792285714285714,
+ "loss": 0.1531,
+ "step": 1410
+ },
+ {
+ "epoch": 8.085714285714285,
+ "grad_norm": 0.7399356365203857,
+ "learning_rate": 0.00017879999999999998,
+ "loss": 0.1508,
+ "step": 1415
+ },
+ {
+ "epoch": 8.114285714285714,
+ "grad_norm": 1.3727307319641113,
+ "learning_rate": 0.00017837142857142854,
+ "loss": 0.1487,
+ "step": 1420
+ },
+ {
+ "epoch": 8.142857142857142,
+ "grad_norm": 0.5938125848770142,
+ "learning_rate": 0.00017794285714285715,
+ "loss": 0.1303,
+ "step": 1425
+ },
+ {
+ "epoch": 8.17142857142857,
+ "grad_norm": 0.7043821811676025,
+ "learning_rate": 0.0001775142857142857,
+ "loss": 0.0948,
+ "step": 1430
+ },
+ {
+ "epoch": 8.2,
+ "grad_norm": 1.1062767505645752,
+ "learning_rate": 0.00017708571428571426,
+ "loss": 0.1412,
+ "step": 1435
+ },
+ {
+ "epoch": 8.228571428571428,
+ "grad_norm": 0.844832181930542,
+ "learning_rate": 0.00017665714285714285,
+ "loss": 0.113,
+ "step": 1440
+ },
+ {
+ "epoch": 8.257142857142856,
+ "grad_norm": 0.7564154863357544,
+ "learning_rate": 0.0001762285714285714,
+ "loss": 0.1319,
+ "step": 1445
+ },
+ {
+ "epoch": 8.285714285714286,
+ "grad_norm": 0.8843110203742981,
+ "learning_rate": 0.00017579999999999996,
+ "loss": 0.1206,
+ "step": 1450
+ },
+ {
+ "epoch": 8.314285714285715,
+ "grad_norm": 0.8175828456878662,
+ "learning_rate": 0.00017537142857142855,
+ "loss": 0.1327,
+ "step": 1455
+ },
+ {
+ "epoch": 8.342857142857143,
+ "grad_norm": 0.6443565487861633,
+ "learning_rate": 0.00017494285714285713,
+ "loss": 0.1239,
+ "step": 1460
+ },
+ {
+ "epoch": 8.371428571428572,
+ "grad_norm": 0.7237185835838318,
+ "learning_rate": 0.00017451428571428572,
+ "loss": 0.1639,
+ "step": 1465
+ },
+ {
+ "epoch": 8.4,
+ "grad_norm": 0.6118057370185852,
+ "learning_rate": 0.00017408571428571427,
+ "loss": 0.1363,
+ "step": 1470
+ },
+ {
+ "epoch": 8.428571428571429,
+ "grad_norm": 0.6754649877548218,
+ "learning_rate": 0.00017365714285714283,
+ "loss": 0.1187,
+ "step": 1475
+ },
+ {
+ "epoch": 8.457142857142857,
+ "grad_norm": 1.0067390203475952,
+ "learning_rate": 0.00017322857142857141,
+ "loss": 0.1401,
+ "step": 1480
+ },
+ {
+ "epoch": 8.485714285714286,
+ "grad_norm": 8.509544372558594,
+ "learning_rate": 0.00017279999999999997,
+ "loss": 0.1304,
+ "step": 1485
+ },
+ {
+ "epoch": 8.514285714285714,
+ "grad_norm": 4.2030205726623535,
+ "learning_rate": 0.00017237142857142858,
+ "loss": 0.121,
+ "step": 1490
+ },
+ {
+ "epoch": 8.542857142857143,
+ "grad_norm": 4.877438068389893,
+ "learning_rate": 0.00017194285714285714,
+ "loss": 0.1918,
+ "step": 1495
+ },
+ {
+ "epoch": 8.571428571428571,
+ "grad_norm": 6.4971232414245605,
+ "learning_rate": 0.0001715142857142857,
+ "loss": 0.2154,
+ "step": 1500
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 3500,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 20,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 0.0,
+ "train_batch_size": 200,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/glot-contrastive-final-lora/checkpoint-1500/training_args.bin b/glot-contrastive-final-lora/checkpoint-1500/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..504636d7db81a728b5309e791498748e16c3b8ec
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-1500/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02a87dc6b2c67ad3df98065b9e8fa21d9d93cd2cb361c532cb83c8a37bdc81a3
+size 5777
diff --git a/glot-contrastive-final-lora/checkpoint-2000/README.md b/glot-contrastive-final-lora/checkpoint-2000/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9e142ce83a38d62751eee08226da0acc7c10eae5
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-2000/README.md
@@ -0,0 +1,206 @@
+---
+base_model: ./glot-mlm-adapted
+library_name: peft
+tags:
+- base_model:adapter:./glot-mlm-adapted
+- lora
+- transformers
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/glot-contrastive-final-lora/checkpoint-2000/adapter_config.json b/glot-contrastive-final-lora/checkpoint-2000/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..846b0d8ef2ad7bc4b0b04decd5769b6250d7be73
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-2000/adapter_config.json
@@ -0,0 +1,37 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "./glot-mlm-adapted",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "query",
+ "value"
+ ],
+ "target_parameters": null,
+ "task_type": "FEATURE_EXTRACTION",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/glot-contrastive-final-lora/checkpoint-2000/adapter_model.safetensors b/glot-contrastive-final-lora/checkpoint-2000/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ce77fbf4791146bc293336e87f65f339e4c78599
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-2000/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:711e07c24e31501f072e595cc3a3ab71fd99dfdb7b91db165f6ee74a84d23cd0
+size 2365824
diff --git a/glot-contrastive-final-lora/checkpoint-2000/optimizer.pt b/glot-contrastive-final-lora/checkpoint-2000/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..715c563bfca0e12474e7e1ab3d806dfcf59b200f
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-2000/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b3498c655bb1206516340ad6a1a375f5542a3351919099d2fe49c4838bfe9533
+size 4760395
diff --git a/glot-contrastive-final-lora/checkpoint-2000/rng_state.pth b/glot-contrastive-final-lora/checkpoint-2000/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c7b203bcce35936bf21b078f9f5ec4070fea73ec
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-2000/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e03ebb8df928308a6424f992063f5301b7d41a4785e5763346c3448dc6be8b4b
+size 14645
diff --git a/glot-contrastive-final-lora/checkpoint-2000/scheduler.pt b/glot-contrastive-final-lora/checkpoint-2000/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0ebce6beffa1d7c57cbe076d7901f3b92c4904d4
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-2000/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ad5926187bcca6f27644c72ca9d33e1556220045488e2d905d0c7306c6d222dc
+size 1465
diff --git a/glot-contrastive-final-lora/checkpoint-2000/sentencepiece.bpe.model b/glot-contrastive-final-lora/checkpoint-2000/sentencepiece.bpe.model
new file mode 100644
index 0000000000000000000000000000000000000000..41409bcc76133d6e92fdd5909093d24e32662ba0
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-2000/sentencepiece.bpe.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a313a26470baedaede322622492f2a542aa41527ddc5d40de444e945ad3c613
+size 7658320
diff --git a/glot-contrastive-final-lora/checkpoint-2000/special_tokens_map.json b/glot-contrastive-final-lora/checkpoint-2000/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..68171d1ff68b731a33d119708476692c094a466b
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-2000/special_tokens_map.json
@@ -0,0 +1,15 @@
+{
+ "bos_token": "",
+ "cls_token": "",
+ "eos_token": "",
+ "mask_token": {
+ "content": "",
+ "lstrip": true,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "",
+ "sep_token": "",
+ "unk_token": ""
+}
diff --git a/glot-contrastive-final-lora/checkpoint-2000/tokenizer_config.json b/glot-contrastive-final-lora/checkpoint-2000/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..2d4a02cfefad7bc6d993905187616acfc547f17d
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-2000/tokenizer_config.json
@@ -0,0 +1,57 @@
+{
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "3": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "401144": {
+ "content": "",
+ "lstrip": true,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "",
+ "clean_up_tokenization_spaces": false,
+ "cls_token": "",
+ "eos_token": "",
+ "extra_special_tokens": {},
+ "mask_token": "",
+ "model_max_length": 512,
+ "pad_token": "",
+ "sep_token": "",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "XLMRobertaTokenizer",
+ "unk_token": "",
+ "use_fast": true
+}
diff --git a/glot-contrastive-final-lora/checkpoint-2000/trainer_state.json b/glot-contrastive-final-lora/checkpoint-2000/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..fdef36940feea5968fda1cff13bd0c6ccdf187e6
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-2000/trainer_state.json
@@ -0,0 +1,2834 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 11.428571428571429,
+ "eval_steps": 5,
+ "global_step": 2000,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.02857142857142857,
+ "grad_norm": 0.1407003551721573,
+ "learning_rate": 0.00029965714285714283,
+ "loss": 0.9726,
+ "step": 5
+ },
+ {
+ "epoch": 0.05714285714285714,
+ "grad_norm": 0.26689061522483826,
+ "learning_rate": 0.0002992285714285714,
+ "loss": 0.9633,
+ "step": 10
+ },
+ {
+ "epoch": 0.08571428571428572,
+ "grad_norm": 0.8670485615730286,
+ "learning_rate": 0.0002988,
+ "loss": 0.9013,
+ "step": 15
+ },
+ {
+ "epoch": 0.11428571428571428,
+ "grad_norm": 0.9785467386245728,
+ "learning_rate": 0.00029837142857142853,
+ "loss": 0.6942,
+ "step": 20
+ },
+ {
+ "epoch": 0.14285714285714285,
+ "grad_norm": 1.3083932399749756,
+ "learning_rate": 0.0002979428571428571,
+ "loss": 0.4472,
+ "step": 25
+ },
+ {
+ "epoch": 0.17142857142857143,
+ "grad_norm": 1.6103293895721436,
+ "learning_rate": 0.0002975142857142857,
+ "loss": 0.3782,
+ "step": 30
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 2.6353416442871094,
+ "learning_rate": 0.0002970857142857143,
+ "loss": 0.3732,
+ "step": 35
+ },
+ {
+ "epoch": 0.22857142857142856,
+ "grad_norm": 0.9949072003364563,
+ "learning_rate": 0.0002966571428571428,
+ "loss": 0.3506,
+ "step": 40
+ },
+ {
+ "epoch": 0.2571428571428571,
+ "grad_norm": 1.280673861503601,
+ "learning_rate": 0.0002962285714285714,
+ "loss": 0.3346,
+ "step": 45
+ },
+ {
+ "epoch": 0.2857142857142857,
+ "grad_norm": 0.7681456208229065,
+ "learning_rate": 0.0002958,
+ "loss": 0.2832,
+ "step": 50
+ },
+ {
+ "epoch": 0.3142857142857143,
+ "grad_norm": 1.0000813007354736,
+ "learning_rate": 0.0002953714285714285,
+ "loss": 0.2603,
+ "step": 55
+ },
+ {
+ "epoch": 0.34285714285714286,
+ "grad_norm": 1.0222399234771729,
+ "learning_rate": 0.0002949428571428571,
+ "loss": 0.2507,
+ "step": 60
+ },
+ {
+ "epoch": 0.37142857142857144,
+ "grad_norm": 0.896902322769165,
+ "learning_rate": 0.0002945142857142857,
+ "loss": 0.2556,
+ "step": 65
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 0.9035541415214539,
+ "learning_rate": 0.00029408571428571426,
+ "loss": 0.2402,
+ "step": 70
+ },
+ {
+ "epoch": 0.42857142857142855,
+ "grad_norm": 1.4886469841003418,
+ "learning_rate": 0.00029365714285714285,
+ "loss": 0.2376,
+ "step": 75
+ },
+ {
+ "epoch": 0.45714285714285713,
+ "grad_norm": 0.8951187133789062,
+ "learning_rate": 0.0002932285714285714,
+ "loss": 0.2276,
+ "step": 80
+ },
+ {
+ "epoch": 0.4857142857142857,
+ "grad_norm": 0.7876377105712891,
+ "learning_rate": 0.00029279999999999996,
+ "loss": 0.2537,
+ "step": 85
+ },
+ {
+ "epoch": 0.5142857142857142,
+ "grad_norm": 1.0927226543426514,
+ "learning_rate": 0.00029237142857142855,
+ "loss": 0.2152,
+ "step": 90
+ },
+ {
+ "epoch": 0.5428571428571428,
+ "grad_norm": 1.4946355819702148,
+ "learning_rate": 0.00029194285714285713,
+ "loss": 0.2441,
+ "step": 95
+ },
+ {
+ "epoch": 0.5714285714285714,
+ "grad_norm": 0.7082991600036621,
+ "learning_rate": 0.0002915142857142857,
+ "loss": 0.2708,
+ "step": 100
+ },
+ {
+ "epoch": 0.6,
+ "grad_norm": 0.670010507106781,
+ "learning_rate": 0.00029108571428571424,
+ "loss": 0.2396,
+ "step": 105
+ },
+ {
+ "epoch": 0.6285714285714286,
+ "grad_norm": 0.9797312021255493,
+ "learning_rate": 0.00029065714285714283,
+ "loss": 0.2275,
+ "step": 110
+ },
+ {
+ "epoch": 0.6571428571428571,
+ "grad_norm": 1.5220463275909424,
+ "learning_rate": 0.0002902285714285714,
+ "loss": 0.2114,
+ "step": 115
+ },
+ {
+ "epoch": 0.6857142857142857,
+ "grad_norm": 1.3326867818832397,
+ "learning_rate": 0.00028979999999999994,
+ "loss": 0.241,
+ "step": 120
+ },
+ {
+ "epoch": 0.7142857142857143,
+ "grad_norm": 1.1195529699325562,
+ "learning_rate": 0.0002893714285714285,
+ "loss": 0.2389,
+ "step": 125
+ },
+ {
+ "epoch": 0.7428571428571429,
+ "grad_norm": 0.7551061511039734,
+ "learning_rate": 0.0002889428571428571,
+ "loss": 0.2162,
+ "step": 130
+ },
+ {
+ "epoch": 0.7714285714285715,
+ "grad_norm": 1.018908977508545,
+ "learning_rate": 0.0002885142857142857,
+ "loss": 0.1924,
+ "step": 135
+ },
+ {
+ "epoch": 0.8,
+ "grad_norm": 2.123642921447754,
+ "learning_rate": 0.0002880857142857143,
+ "loss": 0.2174,
+ "step": 140
+ },
+ {
+ "epoch": 0.8285714285714286,
+ "grad_norm": 0.7585068941116333,
+ "learning_rate": 0.0002876571428571428,
+ "loss": 0.2006,
+ "step": 145
+ },
+ {
+ "epoch": 0.8571428571428571,
+ "grad_norm": 1.64150869846344,
+ "learning_rate": 0.0002872285714285714,
+ "loss": 0.1905,
+ "step": 150
+ },
+ {
+ "epoch": 0.8857142857142857,
+ "grad_norm": 0.9126951694488525,
+ "learning_rate": 0.0002868,
+ "loss": 0.2312,
+ "step": 155
+ },
+ {
+ "epoch": 0.9142857142857143,
+ "grad_norm": 0.7278801202774048,
+ "learning_rate": 0.00028637142857142856,
+ "loss": 0.2077,
+ "step": 160
+ },
+ {
+ "epoch": 0.9428571428571428,
+ "grad_norm": 0.8931339383125305,
+ "learning_rate": 0.00028594285714285715,
+ "loss": 0.1951,
+ "step": 165
+ },
+ {
+ "epoch": 0.9714285714285714,
+ "grad_norm": 1.0831843614578247,
+ "learning_rate": 0.0002855142857142857,
+ "loss": 0.2103,
+ "step": 170
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 1.3750063180923462,
+ "learning_rate": 0.00028508571428571426,
+ "loss": 0.2396,
+ "step": 175
+ },
+ {
+ "epoch": 1.0285714285714285,
+ "grad_norm": 0.8338337540626526,
+ "learning_rate": 0.00028465714285714285,
+ "loss": 0.2404,
+ "step": 180
+ },
+ {
+ "epoch": 1.0571428571428572,
+ "grad_norm": 1.2879024744033813,
+ "learning_rate": 0.0002842285714285714,
+ "loss": 0.2117,
+ "step": 185
+ },
+ {
+ "epoch": 1.0857142857142856,
+ "grad_norm": 1.6751821041107178,
+ "learning_rate": 0.00028379999999999996,
+ "loss": 0.1796,
+ "step": 190
+ },
+ {
+ "epoch": 1.1142857142857143,
+ "grad_norm": 0.9864417910575867,
+ "learning_rate": 0.00028337142857142854,
+ "loss": 0.1993,
+ "step": 195
+ },
+ {
+ "epoch": 1.1428571428571428,
+ "grad_norm": 1.0174155235290527,
+ "learning_rate": 0.00028294285714285713,
+ "loss": 0.2068,
+ "step": 200
+ },
+ {
+ "epoch": 1.1714285714285715,
+ "grad_norm": 1.029832124710083,
+ "learning_rate": 0.0002825142857142857,
+ "loss": 0.2015,
+ "step": 205
+ },
+ {
+ "epoch": 1.2,
+ "grad_norm": 0.7745446562767029,
+ "learning_rate": 0.00028208571428571424,
+ "loss": 0.2129,
+ "step": 210
+ },
+ {
+ "epoch": 1.2285714285714286,
+ "grad_norm": 2.5578622817993164,
+ "learning_rate": 0.0002816571428571428,
+ "loss": 0.2224,
+ "step": 215
+ },
+ {
+ "epoch": 1.2571428571428571,
+ "grad_norm": 2.4185051918029785,
+ "learning_rate": 0.0002812285714285714,
+ "loss": 0.2276,
+ "step": 220
+ },
+ {
+ "epoch": 1.2857142857142856,
+ "grad_norm": 1.4176461696624756,
+ "learning_rate": 0.0002808,
+ "loss": 0.1781,
+ "step": 225
+ },
+ {
+ "epoch": 1.3142857142857143,
+ "grad_norm": 0.709326982498169,
+ "learning_rate": 0.0002803714285714286,
+ "loss": 0.2177,
+ "step": 230
+ },
+ {
+ "epoch": 1.342857142857143,
+ "grad_norm": 0.8170766830444336,
+ "learning_rate": 0.0002799428571428571,
+ "loss": 0.1769,
+ "step": 235
+ },
+ {
+ "epoch": 1.3714285714285714,
+ "grad_norm": 1.3850761651992798,
+ "learning_rate": 0.0002795142857142857,
+ "loss": 0.2262,
+ "step": 240
+ },
+ {
+ "epoch": 1.4,
+ "grad_norm": 1.0064373016357422,
+ "learning_rate": 0.0002790857142857143,
+ "loss": 0.196,
+ "step": 245
+ },
+ {
+ "epoch": 1.4285714285714286,
+ "grad_norm": 1.9635728597640991,
+ "learning_rate": 0.0002786571428571428,
+ "loss": 0.2029,
+ "step": 250
+ },
+ {
+ "epoch": 1.457142857142857,
+ "grad_norm": 16.20791244506836,
+ "learning_rate": 0.0002782285714285714,
+ "loss": 0.3925,
+ "step": 255
+ },
+ {
+ "epoch": 1.4857142857142858,
+ "grad_norm": 1.4363322257995605,
+ "learning_rate": 0.0002778,
+ "loss": 0.3684,
+ "step": 260
+ },
+ {
+ "epoch": 1.5142857142857142,
+ "grad_norm": 0.9379534721374512,
+ "learning_rate": 0.00027737142857142856,
+ "loss": 0.2265,
+ "step": 265
+ },
+ {
+ "epoch": 1.5428571428571427,
+ "grad_norm": 0.8453512787818909,
+ "learning_rate": 0.00027694285714285714,
+ "loss": 0.1976,
+ "step": 270
+ },
+ {
+ "epoch": 1.5714285714285714,
+ "grad_norm": 2.316664695739746,
+ "learning_rate": 0.0002765142857142857,
+ "loss": 0.23,
+ "step": 275
+ },
+ {
+ "epoch": 1.6,
+ "grad_norm": 1.0548444986343384,
+ "learning_rate": 0.00027608571428571426,
+ "loss": 0.1823,
+ "step": 280
+ },
+ {
+ "epoch": 1.6285714285714286,
+ "grad_norm": 3.7894928455352783,
+ "learning_rate": 0.00027565714285714284,
+ "loss": 0.1962,
+ "step": 285
+ },
+ {
+ "epoch": 1.657142857142857,
+ "grad_norm": 2.3081610202789307,
+ "learning_rate": 0.00027522857142857143,
+ "loss": 0.2087,
+ "step": 290
+ },
+ {
+ "epoch": 1.6857142857142857,
+ "grad_norm": 0.9311438202857971,
+ "learning_rate": 0.0002748,
+ "loss": 0.1597,
+ "step": 295
+ },
+ {
+ "epoch": 1.7142857142857144,
+ "grad_norm": 1.1881247758865356,
+ "learning_rate": 0.00027437142857142854,
+ "loss": 0.1764,
+ "step": 300
+ },
+ {
+ "epoch": 1.7428571428571429,
+ "grad_norm": 1.30265212059021,
+ "learning_rate": 0.0002739428571428571,
+ "loss": 0.1647,
+ "step": 305
+ },
+ {
+ "epoch": 1.7714285714285714,
+ "grad_norm": 0.6832175850868225,
+ "learning_rate": 0.0002735142857142857,
+ "loss": 0.1638,
+ "step": 310
+ },
+ {
+ "epoch": 1.8,
+ "grad_norm": 1.8740538358688354,
+ "learning_rate": 0.00027308571428571424,
+ "loss": 0.1803,
+ "step": 315
+ },
+ {
+ "epoch": 1.8285714285714287,
+ "grad_norm": 9.821504592895508,
+ "learning_rate": 0.0002726571428571428,
+ "loss": 0.226,
+ "step": 320
+ },
+ {
+ "epoch": 1.8571428571428572,
+ "grad_norm": 1.0889750719070435,
+ "learning_rate": 0.0002722285714285714,
+ "loss": 0.1822,
+ "step": 325
+ },
+ {
+ "epoch": 1.8857142857142857,
+ "grad_norm": 0.9660868048667908,
+ "learning_rate": 0.0002718,
+ "loss": 0.1842,
+ "step": 330
+ },
+ {
+ "epoch": 1.9142857142857141,
+ "grad_norm": 0.6329234838485718,
+ "learning_rate": 0.0002713714285714286,
+ "loss": 0.1488,
+ "step": 335
+ },
+ {
+ "epoch": 1.9428571428571428,
+ "grad_norm": 3.601266384124756,
+ "learning_rate": 0.0002709428571428571,
+ "loss": 0.1887,
+ "step": 340
+ },
+ {
+ "epoch": 1.9714285714285715,
+ "grad_norm": 1.1441439390182495,
+ "learning_rate": 0.0002705142857142857,
+ "loss": 0.184,
+ "step": 345
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.8586034774780273,
+ "learning_rate": 0.0002700857142857143,
+ "loss": 0.1578,
+ "step": 350
+ },
+ {
+ "epoch": 2.0285714285714285,
+ "grad_norm": 1.5113487243652344,
+ "learning_rate": 0.00026965714285714286,
+ "loss": 0.2002,
+ "step": 355
+ },
+ {
+ "epoch": 2.057142857142857,
+ "grad_norm": 1.1123011112213135,
+ "learning_rate": 0.0002692285714285714,
+ "loss": 0.1946,
+ "step": 360
+ },
+ {
+ "epoch": 2.085714285714286,
+ "grad_norm": 0.9377036094665527,
+ "learning_rate": 0.0002688,
+ "loss": 0.1971,
+ "step": 365
+ },
+ {
+ "epoch": 2.1142857142857143,
+ "grad_norm": 0.6956892609596252,
+ "learning_rate": 0.00026837142857142856,
+ "loss": 0.1758,
+ "step": 370
+ },
+ {
+ "epoch": 2.142857142857143,
+ "grad_norm": 0.7510782480239868,
+ "learning_rate": 0.0002679428571428571,
+ "loss": 0.1674,
+ "step": 375
+ },
+ {
+ "epoch": 2.1714285714285713,
+ "grad_norm": 0.7009285092353821,
+ "learning_rate": 0.00026751428571428567,
+ "loss": 0.1945,
+ "step": 380
+ },
+ {
+ "epoch": 2.2,
+ "grad_norm": 0.9555609822273254,
+ "learning_rate": 0.00026708571428571426,
+ "loss": 0.1857,
+ "step": 385
+ },
+ {
+ "epoch": 2.2285714285714286,
+ "grad_norm": 2.133979082107544,
+ "learning_rate": 0.00026665714285714284,
+ "loss": 0.1636,
+ "step": 390
+ },
+ {
+ "epoch": 2.257142857142857,
+ "grad_norm": 0.7105309963226318,
+ "learning_rate": 0.0002662285714285714,
+ "loss": 0.2014,
+ "step": 395
+ },
+ {
+ "epoch": 2.2857142857142856,
+ "grad_norm": 0.7329701781272888,
+ "learning_rate": 0.00026579999999999996,
+ "loss": 0.1884,
+ "step": 400
+ },
+ {
+ "epoch": 2.314285714285714,
+ "grad_norm": 1.0426994562149048,
+ "learning_rate": 0.00026537142857142854,
+ "loss": 0.1558,
+ "step": 405
+ },
+ {
+ "epoch": 2.342857142857143,
+ "grad_norm": 0.9306122660636902,
+ "learning_rate": 0.0002649428571428571,
+ "loss": 0.1774,
+ "step": 410
+ },
+ {
+ "epoch": 2.3714285714285714,
+ "grad_norm": 0.6989394426345825,
+ "learning_rate": 0.00026451428571428565,
+ "loss": 0.1601,
+ "step": 415
+ },
+ {
+ "epoch": 2.4,
+ "grad_norm": 1.4383760690689087,
+ "learning_rate": 0.0002640857142857143,
+ "loss": 0.1564,
+ "step": 420
+ },
+ {
+ "epoch": 2.4285714285714284,
+ "grad_norm": 0.6448336839675903,
+ "learning_rate": 0.0002636571428571428,
+ "loss": 0.1827,
+ "step": 425
+ },
+ {
+ "epoch": 2.4571428571428573,
+ "grad_norm": 0.9535760879516602,
+ "learning_rate": 0.0002632285714285714,
+ "loss": 0.1713,
+ "step": 430
+ },
+ {
+ "epoch": 2.4857142857142858,
+ "grad_norm": 1.034945011138916,
+ "learning_rate": 0.0002628,
+ "loss": 0.1457,
+ "step": 435
+ },
+ {
+ "epoch": 2.5142857142857142,
+ "grad_norm": 1.3225128650665283,
+ "learning_rate": 0.0002623714285714285,
+ "loss": 0.1633,
+ "step": 440
+ },
+ {
+ "epoch": 2.5428571428571427,
+ "grad_norm": 0.8285059928894043,
+ "learning_rate": 0.0002619428571428571,
+ "loss": 0.2004,
+ "step": 445
+ },
+ {
+ "epoch": 2.571428571428571,
+ "grad_norm": 0.773176908493042,
+ "learning_rate": 0.0002615142857142857,
+ "loss": 0.1641,
+ "step": 450
+ },
+ {
+ "epoch": 2.6,
+ "grad_norm": 0.7964853048324585,
+ "learning_rate": 0.0002610857142857143,
+ "loss": 0.1608,
+ "step": 455
+ },
+ {
+ "epoch": 2.6285714285714286,
+ "grad_norm": 1.0967328548431396,
+ "learning_rate": 0.00026065714285714286,
+ "loss": 0.1697,
+ "step": 460
+ },
+ {
+ "epoch": 2.657142857142857,
+ "grad_norm": 0.6462066173553467,
+ "learning_rate": 0.0002602285714285714,
+ "loss": 0.1512,
+ "step": 465
+ },
+ {
+ "epoch": 2.685714285714286,
+ "grad_norm": 0.8765937089920044,
+ "learning_rate": 0.00025979999999999997,
+ "loss": 0.1826,
+ "step": 470
+ },
+ {
+ "epoch": 2.7142857142857144,
+ "grad_norm": 1.2524124383926392,
+ "learning_rate": 0.00025937142857142856,
+ "loss": 0.1731,
+ "step": 475
+ },
+ {
+ "epoch": 2.742857142857143,
+ "grad_norm": 2.2982606887817383,
+ "learning_rate": 0.0002589428571428571,
+ "loss": 0.1852,
+ "step": 480
+ },
+ {
+ "epoch": 2.7714285714285714,
+ "grad_norm": 0.9989053010940552,
+ "learning_rate": 0.0002585142857142857,
+ "loss": 0.1791,
+ "step": 485
+ },
+ {
+ "epoch": 2.8,
+ "grad_norm": 0.772343635559082,
+ "learning_rate": 0.00025808571428571426,
+ "loss": 0.1862,
+ "step": 490
+ },
+ {
+ "epoch": 2.8285714285714287,
+ "grad_norm": 1.2101136445999146,
+ "learning_rate": 0.00025765714285714284,
+ "loss": 0.1806,
+ "step": 495
+ },
+ {
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.8010189533233643,
+ "learning_rate": 0.0002572285714285714,
+ "loss": 0.1842,
+ "step": 500
+ },
+ {
+ "epoch": 2.8857142857142857,
+ "grad_norm": 1.3597544431686401,
+ "learning_rate": 0.00025679999999999995,
+ "loss": 0.1583,
+ "step": 505
+ },
+ {
+ "epoch": 2.914285714285714,
+ "grad_norm": 0.8790671825408936,
+ "learning_rate": 0.00025637142857142854,
+ "loss": 0.1565,
+ "step": 510
+ },
+ {
+ "epoch": 2.942857142857143,
+ "grad_norm": 1.1175066232681274,
+ "learning_rate": 0.0002559428571428571,
+ "loss": 0.1406,
+ "step": 515
+ },
+ {
+ "epoch": 2.9714285714285715,
+ "grad_norm": 2.8528785705566406,
+ "learning_rate": 0.0002555142857142857,
+ "loss": 0.1735,
+ "step": 520
+ },
+ {
+ "epoch": 3.0,
+ "grad_norm": 2.2073328495025635,
+ "learning_rate": 0.0002550857142857143,
+ "loss": 0.1816,
+ "step": 525
+ },
+ {
+ "epoch": 3.0285714285714285,
+ "grad_norm": 11.01322078704834,
+ "learning_rate": 0.0002546571428571428,
+ "loss": 0.1873,
+ "step": 530
+ },
+ {
+ "epoch": 3.057142857142857,
+ "grad_norm": 1.5822402238845825,
+ "learning_rate": 0.0002542285714285714,
+ "loss": 0.168,
+ "step": 535
+ },
+ {
+ "epoch": 3.085714285714286,
+ "grad_norm": 1.3086942434310913,
+ "learning_rate": 0.0002538,
+ "loss": 0.149,
+ "step": 540
+ },
+ {
+ "epoch": 3.1142857142857143,
+ "grad_norm": 6.303041458129883,
+ "learning_rate": 0.0002533714285714285,
+ "loss": 0.1651,
+ "step": 545
+ },
+ {
+ "epoch": 3.142857142857143,
+ "grad_norm": 14.48929500579834,
+ "learning_rate": 0.00025294285714285716,
+ "loss": 0.1687,
+ "step": 550
+ },
+ {
+ "epoch": 3.1714285714285713,
+ "grad_norm": 6.824525356292725,
+ "learning_rate": 0.0002525142857142857,
+ "loss": 0.1919,
+ "step": 555
+ },
+ {
+ "epoch": 3.2,
+ "grad_norm": 18.772563934326172,
+ "learning_rate": 0.00025208571428571427,
+ "loss": 0.2075,
+ "step": 560
+ },
+ {
+ "epoch": 3.2285714285714286,
+ "grad_norm": 0.7268752455711365,
+ "learning_rate": 0.00025165714285714286,
+ "loss": 0.174,
+ "step": 565
+ },
+ {
+ "epoch": 3.257142857142857,
+ "grad_norm": 1.1301453113555908,
+ "learning_rate": 0.0002512285714285714,
+ "loss": 0.1668,
+ "step": 570
+ },
+ {
+ "epoch": 3.2857142857142856,
+ "grad_norm": 2.846802234649658,
+ "learning_rate": 0.00025079999999999997,
+ "loss": 0.1645,
+ "step": 575
+ },
+ {
+ "epoch": 3.314285714285714,
+ "grad_norm": 1.417515754699707,
+ "learning_rate": 0.00025037142857142855,
+ "loss": 0.1719,
+ "step": 580
+ },
+ {
+ "epoch": 3.342857142857143,
+ "grad_norm": 4.137150764465332,
+ "learning_rate": 0.00024994285714285714,
+ "loss": 0.1739,
+ "step": 585
+ },
+ {
+ "epoch": 3.3714285714285714,
+ "grad_norm": 2.6067259311676025,
+ "learning_rate": 0.0002495142857142857,
+ "loss": 0.1489,
+ "step": 590
+ },
+ {
+ "epoch": 3.4,
+ "grad_norm": 2.601024627685547,
+ "learning_rate": 0.00024908571428571425,
+ "loss": 0.1618,
+ "step": 595
+ },
+ {
+ "epoch": 3.4285714285714284,
+ "grad_norm": 3.849017858505249,
+ "learning_rate": 0.00024865714285714284,
+ "loss": 0.1899,
+ "step": 600
+ },
+ {
+ "epoch": 3.4571428571428573,
+ "grad_norm": 4.673766136169434,
+ "learning_rate": 0.0002482285714285714,
+ "loss": 0.1761,
+ "step": 605
+ },
+ {
+ "epoch": 3.4857142857142858,
+ "grad_norm": 2.6057631969451904,
+ "learning_rate": 0.00024779999999999995,
+ "loss": 0.1743,
+ "step": 610
+ },
+ {
+ "epoch": 3.5142857142857142,
+ "grad_norm": 2.932652473449707,
+ "learning_rate": 0.0002473714285714286,
+ "loss": 0.1482,
+ "step": 615
+ },
+ {
+ "epoch": 3.5428571428571427,
+ "grad_norm": 0.8764939308166504,
+ "learning_rate": 0.0002469428571428571,
+ "loss": 0.1644,
+ "step": 620
+ },
+ {
+ "epoch": 3.571428571428571,
+ "grad_norm": 1.3203191757202148,
+ "learning_rate": 0.0002465142857142857,
+ "loss": 0.1654,
+ "step": 625
+ },
+ {
+ "epoch": 3.6,
+ "grad_norm": 0.7977635264396667,
+ "learning_rate": 0.0002460857142857143,
+ "loss": 0.1472,
+ "step": 630
+ },
+ {
+ "epoch": 3.6285714285714286,
+ "grad_norm": 1.4750248193740845,
+ "learning_rate": 0.0002456571428571428,
+ "loss": 0.1735,
+ "step": 635
+ },
+ {
+ "epoch": 3.657142857142857,
+ "grad_norm": 1.8164482116699219,
+ "learning_rate": 0.0002452285714285714,
+ "loss": 0.1593,
+ "step": 640
+ },
+ {
+ "epoch": 3.685714285714286,
+ "grad_norm": 1.4829603433609009,
+ "learning_rate": 0.0002448,
+ "loss": 0.1508,
+ "step": 645
+ },
+ {
+ "epoch": 3.7142857142857144,
+ "grad_norm": 0.8828144669532776,
+ "learning_rate": 0.00024437142857142857,
+ "loss": 0.1573,
+ "step": 650
+ },
+ {
+ "epoch": 3.742857142857143,
+ "grad_norm": 2.039384126663208,
+ "learning_rate": 0.00024394285714285713,
+ "loss": 0.1745,
+ "step": 655
+ },
+ {
+ "epoch": 3.7714285714285714,
+ "grad_norm": 0.9604200720787048,
+ "learning_rate": 0.00024351428571428569,
+ "loss": 0.17,
+ "step": 660
+ },
+ {
+ "epoch": 3.8,
+ "grad_norm": 0.7903971076011658,
+ "learning_rate": 0.00024308571428571427,
+ "loss": 0.1654,
+ "step": 665
+ },
+ {
+ "epoch": 3.8285714285714287,
+ "grad_norm": 0.6935649514198303,
+ "learning_rate": 0.00024265714285714283,
+ "loss": 0.1714,
+ "step": 670
+ },
+ {
+ "epoch": 3.857142857142857,
+ "grad_norm": 0.5832012295722961,
+ "learning_rate": 0.00024222857142857138,
+ "loss": 0.1636,
+ "step": 675
+ },
+ {
+ "epoch": 3.8857142857142857,
+ "grad_norm": 0.6303168535232544,
+ "learning_rate": 0.0002418,
+ "loss": 0.1604,
+ "step": 680
+ },
+ {
+ "epoch": 3.914285714285714,
+ "grad_norm": 0.7210885882377625,
+ "learning_rate": 0.00024137142857142855,
+ "loss": 0.1444,
+ "step": 685
+ },
+ {
+ "epoch": 3.942857142857143,
+ "grad_norm": 0.7690990567207336,
+ "learning_rate": 0.00024094285714285714,
+ "loss": 0.1631,
+ "step": 690
+ },
+ {
+ "epoch": 3.9714285714285715,
+ "grad_norm": 1.0142720937728882,
+ "learning_rate": 0.0002405142857142857,
+ "loss": 0.158,
+ "step": 695
+ },
+ {
+ "epoch": 4.0,
+ "grad_norm": 0.7970322966575623,
+ "learning_rate": 0.00024008571428571425,
+ "loss": 0.1803,
+ "step": 700
+ },
+ {
+ "epoch": 4.0285714285714285,
+ "grad_norm": 0.6795914769172668,
+ "learning_rate": 0.00023965714285714284,
+ "loss": 0.143,
+ "step": 705
+ },
+ {
+ "epoch": 4.057142857142857,
+ "grad_norm": 0.6832629442214966,
+ "learning_rate": 0.0002392285714285714,
+ "loss": 0.1457,
+ "step": 710
+ },
+ {
+ "epoch": 4.085714285714285,
+ "grad_norm": 3.8629798889160156,
+ "learning_rate": 0.0002388,
+ "loss": 0.1671,
+ "step": 715
+ },
+ {
+ "epoch": 4.114285714285714,
+ "grad_norm": 1.1167882680892944,
+ "learning_rate": 0.00023837142857142856,
+ "loss": 0.1544,
+ "step": 720
+ },
+ {
+ "epoch": 4.142857142857143,
+ "grad_norm": 0.9431412816047668,
+ "learning_rate": 0.00023794285714285712,
+ "loss": 0.1605,
+ "step": 725
+ },
+ {
+ "epoch": 4.171428571428572,
+ "grad_norm": 1.310948133468628,
+ "learning_rate": 0.0002375142857142857,
+ "loss": 0.1121,
+ "step": 730
+ },
+ {
+ "epoch": 4.2,
+ "grad_norm": 0.9830737709999084,
+ "learning_rate": 0.00023708571428571426,
+ "loss": 0.1742,
+ "step": 735
+ },
+ {
+ "epoch": 4.228571428571429,
+ "grad_norm": 0.6166555881500244,
+ "learning_rate": 0.00023665714285714282,
+ "loss": 0.1525,
+ "step": 740
+ },
+ {
+ "epoch": 4.257142857142857,
+ "grad_norm": 0.995579719543457,
+ "learning_rate": 0.00023622857142857143,
+ "loss": 0.1439,
+ "step": 745
+ },
+ {
+ "epoch": 4.285714285714286,
+ "grad_norm": 0.639796793460846,
+ "learning_rate": 0.00023579999999999999,
+ "loss": 0.1692,
+ "step": 750
+ },
+ {
+ "epoch": 4.314285714285714,
+ "grad_norm": 0.9438050389289856,
+ "learning_rate": 0.00023537142857142854,
+ "loss": 0.1785,
+ "step": 755
+ },
+ {
+ "epoch": 4.3428571428571425,
+ "grad_norm": 0.8960750102996826,
+ "learning_rate": 0.00023494285714285713,
+ "loss": 0.1557,
+ "step": 760
+ },
+ {
+ "epoch": 4.371428571428572,
+ "grad_norm": 0.6287499070167542,
+ "learning_rate": 0.00023451428571428568,
+ "loss": 0.1459,
+ "step": 765
+ },
+ {
+ "epoch": 4.4,
+ "grad_norm": 0.7638295888900757,
+ "learning_rate": 0.00023408571428571424,
+ "loss": 0.1341,
+ "step": 770
+ },
+ {
+ "epoch": 4.428571428571429,
+ "grad_norm": 0.655878484249115,
+ "learning_rate": 0.00023365714285714283,
+ "loss": 0.1358,
+ "step": 775
+ },
+ {
+ "epoch": 4.457142857142857,
+ "grad_norm": 0.5840997695922852,
+ "learning_rate": 0.0002332285714285714,
+ "loss": 0.1386,
+ "step": 780
+ },
+ {
+ "epoch": 4.485714285714286,
+ "grad_norm": 1.1082488298416138,
+ "learning_rate": 0.0002328,
+ "loss": 0.1827,
+ "step": 785
+ },
+ {
+ "epoch": 4.514285714285714,
+ "grad_norm": 0.8825240135192871,
+ "learning_rate": 0.00023237142857142855,
+ "loss": 0.1527,
+ "step": 790
+ },
+ {
+ "epoch": 4.542857142857143,
+ "grad_norm": 0.6752304434776306,
+ "learning_rate": 0.0002319428571428571,
+ "loss": 0.1392,
+ "step": 795
+ },
+ {
+ "epoch": 4.571428571428571,
+ "grad_norm": 1.1423301696777344,
+ "learning_rate": 0.0002315142857142857,
+ "loss": 0.1433,
+ "step": 800
+ },
+ {
+ "epoch": 4.6,
+ "grad_norm": 10.793691635131836,
+ "learning_rate": 0.00023108571428571425,
+ "loss": 0.1635,
+ "step": 805
+ },
+ {
+ "epoch": 4.628571428571428,
+ "grad_norm": 0.47564294934272766,
+ "learning_rate": 0.00023065714285714286,
+ "loss": 0.1199,
+ "step": 810
+ },
+ {
+ "epoch": 4.6571428571428575,
+ "grad_norm": 1.2492656707763672,
+ "learning_rate": 0.00023022857142857142,
+ "loss": 0.1488,
+ "step": 815
+ },
+ {
+ "epoch": 4.685714285714286,
+ "grad_norm": 0.6933501958847046,
+ "learning_rate": 0.00022979999999999997,
+ "loss": 0.1812,
+ "step": 820
+ },
+ {
+ "epoch": 4.714285714285714,
+ "grad_norm": 0.7901633977890015,
+ "learning_rate": 0.00022937142857142856,
+ "loss": 0.1415,
+ "step": 825
+ },
+ {
+ "epoch": 4.742857142857143,
+ "grad_norm": 0.7854829430580139,
+ "learning_rate": 0.00022894285714285712,
+ "loss": 0.1401,
+ "step": 830
+ },
+ {
+ "epoch": 4.771428571428571,
+ "grad_norm": 0.8716740608215332,
+ "learning_rate": 0.00022851428571428567,
+ "loss": 0.1982,
+ "step": 835
+ },
+ {
+ "epoch": 4.8,
+ "grad_norm": 0.7047899961471558,
+ "learning_rate": 0.00022808571428571426,
+ "loss": 0.1624,
+ "step": 840
+ },
+ {
+ "epoch": 4.828571428571428,
+ "grad_norm": 0.7134959697723389,
+ "learning_rate": 0.00022765714285714284,
+ "loss": 0.1375,
+ "step": 845
+ },
+ {
+ "epoch": 4.857142857142857,
+ "grad_norm": 1.0897325277328491,
+ "learning_rate": 0.00022722857142857143,
+ "loss": 0.1489,
+ "step": 850
+ },
+ {
+ "epoch": 4.885714285714286,
+ "grad_norm": 1.1065207719802856,
+ "learning_rate": 0.00022679999999999998,
+ "loss": 0.1495,
+ "step": 855
+ },
+ {
+ "epoch": 4.914285714285715,
+ "grad_norm": 0.7434757351875305,
+ "learning_rate": 0.00022637142857142854,
+ "loss": 0.1507,
+ "step": 860
+ },
+ {
+ "epoch": 4.942857142857143,
+ "grad_norm": 1.0045181512832642,
+ "learning_rate": 0.00022594285714285712,
+ "loss": 0.1527,
+ "step": 865
+ },
+ {
+ "epoch": 4.9714285714285715,
+ "grad_norm": 1.2025654315948486,
+ "learning_rate": 0.00022551428571428568,
+ "loss": 0.1523,
+ "step": 870
+ },
+ {
+ "epoch": 5.0,
+ "grad_norm": 0.7823342084884644,
+ "learning_rate": 0.0002250857142857143,
+ "loss": 0.1514,
+ "step": 875
+ },
+ {
+ "epoch": 5.0285714285714285,
+ "grad_norm": 0.8405362963676453,
+ "learning_rate": 0.00022465714285714285,
+ "loss": 0.1461,
+ "step": 880
+ },
+ {
+ "epoch": 5.057142857142857,
+ "grad_norm": 0.7527463436126709,
+ "learning_rate": 0.0002242285714285714,
+ "loss": 0.1206,
+ "step": 885
+ },
+ {
+ "epoch": 5.085714285714285,
+ "grad_norm": 0.8372548222541809,
+ "learning_rate": 0.0002238,
+ "loss": 0.1513,
+ "step": 890
+ },
+ {
+ "epoch": 5.114285714285714,
+ "grad_norm": 0.8755456209182739,
+ "learning_rate": 0.00022337142857142855,
+ "loss": 0.1498,
+ "step": 895
+ },
+ {
+ "epoch": 5.142857142857143,
+ "grad_norm": 0.7312084436416626,
+ "learning_rate": 0.0002229428571428571,
+ "loss": 0.154,
+ "step": 900
+ },
+ {
+ "epoch": 5.171428571428572,
+ "grad_norm": 0.6366221904754639,
+ "learning_rate": 0.0002225142857142857,
+ "loss": 0.1466,
+ "step": 905
+ },
+ {
+ "epoch": 5.2,
+ "grad_norm": 0.6406880617141724,
+ "learning_rate": 0.00022208571428571427,
+ "loss": 0.1254,
+ "step": 910
+ },
+ {
+ "epoch": 5.228571428571429,
+ "grad_norm": 2.4106833934783936,
+ "learning_rate": 0.00022165714285714283,
+ "loss": 0.1534,
+ "step": 915
+ },
+ {
+ "epoch": 5.257142857142857,
+ "grad_norm": 0.5635722279548645,
+ "learning_rate": 0.00022122857142857142,
+ "loss": 0.1461,
+ "step": 920
+ },
+ {
+ "epoch": 5.285714285714286,
+ "grad_norm": 0.787162184715271,
+ "learning_rate": 0.00022079999999999997,
+ "loss": 0.1424,
+ "step": 925
+ },
+ {
+ "epoch": 5.314285714285714,
+ "grad_norm": 0.6513975262641907,
+ "learning_rate": 0.00022037142857142853,
+ "loss": 0.1326,
+ "step": 930
+ },
+ {
+ "epoch": 5.3428571428571425,
+ "grad_norm": 0.6933534741401672,
+ "learning_rate": 0.00021994285714285711,
+ "loss": 0.1661,
+ "step": 935
+ },
+ {
+ "epoch": 5.371428571428572,
+ "grad_norm": 0.7263259887695312,
+ "learning_rate": 0.0002195142857142857,
+ "loss": 0.15,
+ "step": 940
+ },
+ {
+ "epoch": 5.4,
+ "grad_norm": 0.5537381768226624,
+ "learning_rate": 0.00021908571428571428,
+ "loss": 0.129,
+ "step": 945
+ },
+ {
+ "epoch": 5.428571428571429,
+ "grad_norm": 0.6014005541801453,
+ "learning_rate": 0.00021865714285714284,
+ "loss": 0.1321,
+ "step": 950
+ },
+ {
+ "epoch": 5.457142857142857,
+ "grad_norm": 0.6581441760063171,
+ "learning_rate": 0.0002182285714285714,
+ "loss": 0.1587,
+ "step": 955
+ },
+ {
+ "epoch": 5.485714285714286,
+ "grad_norm": 0.9326379895210266,
+ "learning_rate": 0.00021779999999999998,
+ "loss": 0.1654,
+ "step": 960
+ },
+ {
+ "epoch": 5.514285714285714,
+ "grad_norm": 0.9438592791557312,
+ "learning_rate": 0.00021737142857142854,
+ "loss": 0.1212,
+ "step": 965
+ },
+ {
+ "epoch": 5.542857142857143,
+ "grad_norm": 0.7699571251869202,
+ "learning_rate": 0.00021694285714285715,
+ "loss": 0.1464,
+ "step": 970
+ },
+ {
+ "epoch": 5.571428571428571,
+ "grad_norm": 0.8758366703987122,
+ "learning_rate": 0.0002165142857142857,
+ "loss": 0.1599,
+ "step": 975
+ },
+ {
+ "epoch": 5.6,
+ "grad_norm": 0.6101442575454712,
+ "learning_rate": 0.00021608571428571426,
+ "loss": 0.1589,
+ "step": 980
+ },
+ {
+ "epoch": 5.628571428571428,
+ "grad_norm": 0.7454060912132263,
+ "learning_rate": 0.00021565714285714285,
+ "loss": 0.1433,
+ "step": 985
+ },
+ {
+ "epoch": 5.6571428571428575,
+ "grad_norm": 0.6379484534263611,
+ "learning_rate": 0.0002152285714285714,
+ "loss": 0.1592,
+ "step": 990
+ },
+ {
+ "epoch": 5.685714285714286,
+ "grad_norm": 1.1601309776306152,
+ "learning_rate": 0.00021479999999999996,
+ "loss": 0.1647,
+ "step": 995
+ },
+ {
+ "epoch": 5.714285714285714,
+ "grad_norm": 0.5464673638343811,
+ "learning_rate": 0.00021437142857142855,
+ "loss": 0.1469,
+ "step": 1000
+ },
+ {
+ "epoch": 5.742857142857143,
+ "grad_norm": 1.0279319286346436,
+ "learning_rate": 0.00021394285714285713,
+ "loss": 0.1203,
+ "step": 1005
+ },
+ {
+ "epoch": 5.771428571428571,
+ "grad_norm": 0.5503718256950378,
+ "learning_rate": 0.00021351428571428572,
+ "loss": 0.1409,
+ "step": 1010
+ },
+ {
+ "epoch": 5.8,
+ "grad_norm": 0.6123886108398438,
+ "learning_rate": 0.00021308571428571427,
+ "loss": 0.1427,
+ "step": 1015
+ },
+ {
+ "epoch": 5.828571428571428,
+ "grad_norm": 0.6560390591621399,
+ "learning_rate": 0.00021265714285714283,
+ "loss": 0.1415,
+ "step": 1020
+ },
+ {
+ "epoch": 5.857142857142857,
+ "grad_norm": 0.5576716661453247,
+ "learning_rate": 0.00021222857142857141,
+ "loss": 0.1408,
+ "step": 1025
+ },
+ {
+ "epoch": 5.885714285714286,
+ "grad_norm": 0.6419074535369873,
+ "learning_rate": 0.00021179999999999997,
+ "loss": 0.1385,
+ "step": 1030
+ },
+ {
+ "epoch": 5.914285714285715,
+ "grad_norm": 1.008925199508667,
+ "learning_rate": 0.00021137142857142858,
+ "loss": 0.1497,
+ "step": 1035
+ },
+ {
+ "epoch": 5.942857142857143,
+ "grad_norm": 0.6559906005859375,
+ "learning_rate": 0.00021094285714285714,
+ "loss": 0.1218,
+ "step": 1040
+ },
+ {
+ "epoch": 5.9714285714285715,
+ "grad_norm": 0.627164363861084,
+ "learning_rate": 0.0002105142857142857,
+ "loss": 0.1368,
+ "step": 1045
+ },
+ {
+ "epoch": 6.0,
+ "grad_norm": 0.5760972499847412,
+ "learning_rate": 0.00021008571428571428,
+ "loss": 0.1508,
+ "step": 1050
+ },
+ {
+ "epoch": 6.0285714285714285,
+ "grad_norm": 0.5754174590110779,
+ "learning_rate": 0.00020965714285714284,
+ "loss": 0.1181,
+ "step": 1055
+ },
+ {
+ "epoch": 6.057142857142857,
+ "grad_norm": 0.8736348748207092,
+ "learning_rate": 0.0002092285714285714,
+ "loss": 0.1252,
+ "step": 1060
+ },
+ {
+ "epoch": 6.085714285714285,
+ "grad_norm": 0.7166719436645508,
+ "learning_rate": 0.00020879999999999998,
+ "loss": 0.1481,
+ "step": 1065
+ },
+ {
+ "epoch": 6.114285714285714,
+ "grad_norm": 0.6494349241256714,
+ "learning_rate": 0.00020837142857142856,
+ "loss": 0.1478,
+ "step": 1070
+ },
+ {
+ "epoch": 6.142857142857143,
+ "grad_norm": 0.6681587100028992,
+ "learning_rate": 0.00020794285714285712,
+ "loss": 0.1488,
+ "step": 1075
+ },
+ {
+ "epoch": 6.171428571428572,
+ "grad_norm": 0.7123684883117676,
+ "learning_rate": 0.0002075142857142857,
+ "loss": 0.1378,
+ "step": 1080
+ },
+ {
+ "epoch": 6.2,
+ "grad_norm": 0.6146950721740723,
+ "learning_rate": 0.00020708571428571426,
+ "loss": 0.1306,
+ "step": 1085
+ },
+ {
+ "epoch": 6.228571428571429,
+ "grad_norm": 0.8402445912361145,
+ "learning_rate": 0.00020665714285714282,
+ "loss": 0.1063,
+ "step": 1090
+ },
+ {
+ "epoch": 6.257142857142857,
+ "grad_norm": 0.6567764282226562,
+ "learning_rate": 0.0002062285714285714,
+ "loss": 0.1195,
+ "step": 1095
+ },
+ {
+ "epoch": 6.285714285714286,
+ "grad_norm": 0.6006014943122864,
+ "learning_rate": 0.0002058,
+ "loss": 0.1542,
+ "step": 1100
+ },
+ {
+ "epoch": 6.314285714285714,
+ "grad_norm": 0.793100893497467,
+ "learning_rate": 0.00020537142857142857,
+ "loss": 0.1381,
+ "step": 1105
+ },
+ {
+ "epoch": 6.3428571428571425,
+ "grad_norm": 0.5923666954040527,
+ "learning_rate": 0.00020494285714285713,
+ "loss": 0.1386,
+ "step": 1110
+ },
+ {
+ "epoch": 6.371428571428572,
+ "grad_norm": 0.6692521572113037,
+ "learning_rate": 0.0002045142857142857,
+ "loss": 0.1223,
+ "step": 1115
+ },
+ {
+ "epoch": 6.4,
+ "grad_norm": 0.7216306328773499,
+ "learning_rate": 0.00020408571428571427,
+ "loss": 0.1367,
+ "step": 1120
+ },
+ {
+ "epoch": 6.428571428571429,
+ "grad_norm": 0.5640934109687805,
+ "learning_rate": 0.00020365714285714283,
+ "loss": 0.1554,
+ "step": 1125
+ },
+ {
+ "epoch": 6.457142857142857,
+ "grad_norm": 0.8154368996620178,
+ "learning_rate": 0.00020322857142857138,
+ "loss": 0.1674,
+ "step": 1130
+ },
+ {
+ "epoch": 6.485714285714286,
+ "grad_norm": 0.7185398936271667,
+ "learning_rate": 0.0002028,
+ "loss": 0.1375,
+ "step": 1135
+ },
+ {
+ "epoch": 6.514285714285714,
+ "grad_norm": 0.6805170774459839,
+ "learning_rate": 0.00020237142857142855,
+ "loss": 0.1306,
+ "step": 1140
+ },
+ {
+ "epoch": 6.542857142857143,
+ "grad_norm": 0.5996941924095154,
+ "learning_rate": 0.00020194285714285714,
+ "loss": 0.1433,
+ "step": 1145
+ },
+ {
+ "epoch": 6.571428571428571,
+ "grad_norm": 0.5258373022079468,
+ "learning_rate": 0.0002015142857142857,
+ "loss": 0.1285,
+ "step": 1150
+ },
+ {
+ "epoch": 6.6,
+ "grad_norm": 0.7771695256233215,
+ "learning_rate": 0.00020108571428571425,
+ "loss": 0.1493,
+ "step": 1155
+ },
+ {
+ "epoch": 6.628571428571428,
+ "grad_norm": 0.5920616388320923,
+ "learning_rate": 0.00020065714285714284,
+ "loss": 0.1479,
+ "step": 1160
+ },
+ {
+ "epoch": 6.6571428571428575,
+ "grad_norm": 0.7460982799530029,
+ "learning_rate": 0.00020022857142857142,
+ "loss": 0.1173,
+ "step": 1165
+ },
+ {
+ "epoch": 6.685714285714286,
+ "grad_norm": 1.1703822612762451,
+ "learning_rate": 0.0001998,
+ "loss": 0.1402,
+ "step": 1170
+ },
+ {
+ "epoch": 6.714285714285714,
+ "grad_norm": 0.7894724011421204,
+ "learning_rate": 0.00019937142857142856,
+ "loss": 0.1253,
+ "step": 1175
+ },
+ {
+ "epoch": 6.742857142857143,
+ "grad_norm": 0.7013376355171204,
+ "learning_rate": 0.00019894285714285712,
+ "loss": 0.1573,
+ "step": 1180
+ },
+ {
+ "epoch": 6.771428571428571,
+ "grad_norm": 0.6421737670898438,
+ "learning_rate": 0.0001985142857142857,
+ "loss": 0.1497,
+ "step": 1185
+ },
+ {
+ "epoch": 6.8,
+ "grad_norm": 1.204296350479126,
+ "learning_rate": 0.00019808571428571426,
+ "loss": 0.1634,
+ "step": 1190
+ },
+ {
+ "epoch": 6.828571428571428,
+ "grad_norm": 0.867765486240387,
+ "learning_rate": 0.00019765714285714282,
+ "loss": 0.1353,
+ "step": 1195
+ },
+ {
+ "epoch": 6.857142857142857,
+ "grad_norm": 0.7325594425201416,
+ "learning_rate": 0.00019722857142857143,
+ "loss": 0.118,
+ "step": 1200
+ },
+ {
+ "epoch": 6.885714285714286,
+ "grad_norm": 0.7029078006744385,
+ "learning_rate": 0.00019679999999999999,
+ "loss": 0.1425,
+ "step": 1205
+ },
+ {
+ "epoch": 6.914285714285715,
+ "grad_norm": 1.1572504043579102,
+ "learning_rate": 0.00019637142857142857,
+ "loss": 0.1337,
+ "step": 1210
+ },
+ {
+ "epoch": 6.942857142857143,
+ "grad_norm": 0.8022822141647339,
+ "learning_rate": 0.00019594285714285713,
+ "loss": 0.1684,
+ "step": 1215
+ },
+ {
+ "epoch": 6.9714285714285715,
+ "grad_norm": 0.6729874610900879,
+ "learning_rate": 0.00019551428571428568,
+ "loss": 0.1238,
+ "step": 1220
+ },
+ {
+ "epoch": 7.0,
+ "grad_norm": 0.5773627758026123,
+ "learning_rate": 0.00019508571428571427,
+ "loss": 0.138,
+ "step": 1225
+ },
+ {
+ "epoch": 7.0285714285714285,
+ "grad_norm": 0.7182291150093079,
+ "learning_rate": 0.00019465714285714285,
+ "loss": 0.1431,
+ "step": 1230
+ },
+ {
+ "epoch": 7.057142857142857,
+ "grad_norm": 1.7567912340164185,
+ "learning_rate": 0.0001942285714285714,
+ "loss": 0.1319,
+ "step": 1235
+ },
+ {
+ "epoch": 7.085714285714285,
+ "grad_norm": 0.6845232248306274,
+ "learning_rate": 0.0001938,
+ "loss": 0.1292,
+ "step": 1240
+ },
+ {
+ "epoch": 7.114285714285714,
+ "grad_norm": 0.6077771782875061,
+ "learning_rate": 0.00019337142857142855,
+ "loss": 0.1238,
+ "step": 1245
+ },
+ {
+ "epoch": 7.142857142857143,
+ "grad_norm": 0.6168347597122192,
+ "learning_rate": 0.0001929428571428571,
+ "loss": 0.1384,
+ "step": 1250
+ },
+ {
+ "epoch": 7.171428571428572,
+ "grad_norm": 0.7457576394081116,
+ "learning_rate": 0.0001925142857142857,
+ "loss": 0.1306,
+ "step": 1255
+ },
+ {
+ "epoch": 7.2,
+ "grad_norm": 0.5969316363334656,
+ "learning_rate": 0.00019208571428571425,
+ "loss": 0.1123,
+ "step": 1260
+ },
+ {
+ "epoch": 7.228571428571429,
+ "grad_norm": 0.6902753710746765,
+ "learning_rate": 0.00019165714285714286,
+ "loss": 0.1185,
+ "step": 1265
+ },
+ {
+ "epoch": 7.257142857142857,
+ "grad_norm": 0.6488338112831116,
+ "learning_rate": 0.00019122857142857142,
+ "loss": 0.1431,
+ "step": 1270
+ },
+ {
+ "epoch": 7.285714285714286,
+ "grad_norm": 0.6814819574356079,
+ "learning_rate": 0.00019079999999999998,
+ "loss": 0.1495,
+ "step": 1275
+ },
+ {
+ "epoch": 7.314285714285714,
+ "grad_norm": 0.7468088865280151,
+ "learning_rate": 0.00019037142857142856,
+ "loss": 0.1158,
+ "step": 1280
+ },
+ {
+ "epoch": 7.3428571428571425,
+ "grad_norm": 0.7417412400245667,
+ "learning_rate": 0.00018994285714285712,
+ "loss": 0.1311,
+ "step": 1285
+ },
+ {
+ "epoch": 7.371428571428572,
+ "grad_norm": 0.5480664372444153,
+ "learning_rate": 0.00018951428571428567,
+ "loss": 0.135,
+ "step": 1290
+ },
+ {
+ "epoch": 7.4,
+ "grad_norm": 0.725527822971344,
+ "learning_rate": 0.00018908571428571429,
+ "loss": 0.1217,
+ "step": 1295
+ },
+ {
+ "epoch": 7.428571428571429,
+ "grad_norm": 0.6566678285598755,
+ "learning_rate": 0.00018865714285714284,
+ "loss": 0.1417,
+ "step": 1300
+ },
+ {
+ "epoch": 7.457142857142857,
+ "grad_norm": 0.516952395439148,
+ "learning_rate": 0.00018822857142857143,
+ "loss": 0.1329,
+ "step": 1305
+ },
+ {
+ "epoch": 7.485714285714286,
+ "grad_norm": 1.9545241594314575,
+ "learning_rate": 0.00018779999999999998,
+ "loss": 0.1339,
+ "step": 1310
+ },
+ {
+ "epoch": 7.514285714285714,
+ "grad_norm": 0.8276839852333069,
+ "learning_rate": 0.00018737142857142854,
+ "loss": 0.1324,
+ "step": 1315
+ },
+ {
+ "epoch": 7.542857142857143,
+ "grad_norm": 0.6737099289894104,
+ "learning_rate": 0.00018694285714285713,
+ "loss": 0.1139,
+ "step": 1320
+ },
+ {
+ "epoch": 7.571428571428571,
+ "grad_norm": 0.6914472579956055,
+ "learning_rate": 0.00018651428571428568,
+ "loss": 0.1146,
+ "step": 1325
+ },
+ {
+ "epoch": 7.6,
+ "grad_norm": 0.6630033850669861,
+ "learning_rate": 0.0001860857142857143,
+ "loss": 0.1571,
+ "step": 1330
+ },
+ {
+ "epoch": 7.628571428571428,
+ "grad_norm": 0.820688784122467,
+ "learning_rate": 0.00018565714285714285,
+ "loss": 0.15,
+ "step": 1335
+ },
+ {
+ "epoch": 7.6571428571428575,
+ "grad_norm": 2.0491325855255127,
+ "learning_rate": 0.0001852285714285714,
+ "loss": 0.127,
+ "step": 1340
+ },
+ {
+ "epoch": 7.685714285714286,
+ "grad_norm": 0.9327268004417419,
+ "learning_rate": 0.0001848,
+ "loss": 0.1289,
+ "step": 1345
+ },
+ {
+ "epoch": 7.714285714285714,
+ "grad_norm": 1.3131701946258545,
+ "learning_rate": 0.00018437142857142855,
+ "loss": 0.1228,
+ "step": 1350
+ },
+ {
+ "epoch": 7.742857142857143,
+ "grad_norm": 2.955918312072754,
+ "learning_rate": 0.0001839428571428571,
+ "loss": 0.1082,
+ "step": 1355
+ },
+ {
+ "epoch": 7.771428571428571,
+ "grad_norm": 1.2165493965148926,
+ "learning_rate": 0.00018351428571428572,
+ "loss": 0.1688,
+ "step": 1360
+ },
+ {
+ "epoch": 7.8,
+ "grad_norm": 0.759324312210083,
+ "learning_rate": 0.00018308571428571428,
+ "loss": 0.1185,
+ "step": 1365
+ },
+ {
+ "epoch": 7.828571428571428,
+ "grad_norm": 0.7445591688156128,
+ "learning_rate": 0.00018265714285714286,
+ "loss": 0.1431,
+ "step": 1370
+ },
+ {
+ "epoch": 7.857142857142857,
+ "grad_norm": 0.679374098777771,
+ "learning_rate": 0.00018222857142857142,
+ "loss": 0.1451,
+ "step": 1375
+ },
+ {
+ "epoch": 7.885714285714286,
+ "grad_norm": 2.1234302520751953,
+ "learning_rate": 0.00018179999999999997,
+ "loss": 0.1265,
+ "step": 1380
+ },
+ {
+ "epoch": 7.914285714285715,
+ "grad_norm": 1.006521224975586,
+ "learning_rate": 0.00018137142857142856,
+ "loss": 0.1722,
+ "step": 1385
+ },
+ {
+ "epoch": 7.942857142857143,
+ "grad_norm": 0.7275253534317017,
+ "learning_rate": 0.00018094285714285712,
+ "loss": 0.1625,
+ "step": 1390
+ },
+ {
+ "epoch": 7.9714285714285715,
+ "grad_norm": 0.8612022995948792,
+ "learning_rate": 0.0001805142857142857,
+ "loss": 0.1345,
+ "step": 1395
+ },
+ {
+ "epoch": 8.0,
+ "grad_norm": 0.7276798486709595,
+ "learning_rate": 0.00018008571428571428,
+ "loss": 0.1236,
+ "step": 1400
+ },
+ {
+ "epoch": 8.028571428571428,
+ "grad_norm": 0.8731086850166321,
+ "learning_rate": 0.00017965714285714284,
+ "loss": 0.1604,
+ "step": 1405
+ },
+ {
+ "epoch": 8.057142857142857,
+ "grad_norm": 0.8950818777084351,
+ "learning_rate": 0.0001792285714285714,
+ "loss": 0.1531,
+ "step": 1410
+ },
+ {
+ "epoch": 8.085714285714285,
+ "grad_norm": 0.7399356365203857,
+ "learning_rate": 0.00017879999999999998,
+ "loss": 0.1508,
+ "step": 1415
+ },
+ {
+ "epoch": 8.114285714285714,
+ "grad_norm": 1.3727307319641113,
+ "learning_rate": 0.00017837142857142854,
+ "loss": 0.1487,
+ "step": 1420
+ },
+ {
+ "epoch": 8.142857142857142,
+ "grad_norm": 0.5938125848770142,
+ "learning_rate": 0.00017794285714285715,
+ "loss": 0.1303,
+ "step": 1425
+ },
+ {
+ "epoch": 8.17142857142857,
+ "grad_norm": 0.7043821811676025,
+ "learning_rate": 0.0001775142857142857,
+ "loss": 0.0948,
+ "step": 1430
+ },
+ {
+ "epoch": 8.2,
+ "grad_norm": 1.1062767505645752,
+ "learning_rate": 0.00017708571428571426,
+ "loss": 0.1412,
+ "step": 1435
+ },
+ {
+ "epoch": 8.228571428571428,
+ "grad_norm": 0.844832181930542,
+ "learning_rate": 0.00017665714285714285,
+ "loss": 0.113,
+ "step": 1440
+ },
+ {
+ "epoch": 8.257142857142856,
+ "grad_norm": 0.7564154863357544,
+ "learning_rate": 0.0001762285714285714,
+ "loss": 0.1319,
+ "step": 1445
+ },
+ {
+ "epoch": 8.285714285714286,
+ "grad_norm": 0.8843110203742981,
+ "learning_rate": 0.00017579999999999996,
+ "loss": 0.1206,
+ "step": 1450
+ },
+ {
+ "epoch": 8.314285714285715,
+ "grad_norm": 0.8175828456878662,
+ "learning_rate": 0.00017537142857142855,
+ "loss": 0.1327,
+ "step": 1455
+ },
+ {
+ "epoch": 8.342857142857143,
+ "grad_norm": 0.6443565487861633,
+ "learning_rate": 0.00017494285714285713,
+ "loss": 0.1239,
+ "step": 1460
+ },
+ {
+ "epoch": 8.371428571428572,
+ "grad_norm": 0.7237185835838318,
+ "learning_rate": 0.00017451428571428572,
+ "loss": 0.1639,
+ "step": 1465
+ },
+ {
+ "epoch": 8.4,
+ "grad_norm": 0.6118057370185852,
+ "learning_rate": 0.00017408571428571427,
+ "loss": 0.1363,
+ "step": 1470
+ },
+ {
+ "epoch": 8.428571428571429,
+ "grad_norm": 0.6754649877548218,
+ "learning_rate": 0.00017365714285714283,
+ "loss": 0.1187,
+ "step": 1475
+ },
+ {
+ "epoch": 8.457142857142857,
+ "grad_norm": 1.0067390203475952,
+ "learning_rate": 0.00017322857142857141,
+ "loss": 0.1401,
+ "step": 1480
+ },
+ {
+ "epoch": 8.485714285714286,
+ "grad_norm": 8.509544372558594,
+ "learning_rate": 0.00017279999999999997,
+ "loss": 0.1304,
+ "step": 1485
+ },
+ {
+ "epoch": 8.514285714285714,
+ "grad_norm": 4.2030205726623535,
+ "learning_rate": 0.00017237142857142858,
+ "loss": 0.121,
+ "step": 1490
+ },
+ {
+ "epoch": 8.542857142857143,
+ "grad_norm": 4.877438068389893,
+ "learning_rate": 0.00017194285714285714,
+ "loss": 0.1918,
+ "step": 1495
+ },
+ {
+ "epoch": 8.571428571428571,
+ "grad_norm": 6.4971232414245605,
+ "learning_rate": 0.0001715142857142857,
+ "loss": 0.2154,
+ "step": 1500
+ },
+ {
+ "epoch": 8.6,
+ "grad_norm": 4.365469932556152,
+ "learning_rate": 0.00017108571428571428,
+ "loss": 0.2272,
+ "step": 1505
+ },
+ {
+ "epoch": 8.628571428571428,
+ "grad_norm": 2.551957845687866,
+ "learning_rate": 0.00017065714285714284,
+ "loss": 0.2163,
+ "step": 1510
+ },
+ {
+ "epoch": 8.657142857142857,
+ "grad_norm": 5.326391220092773,
+ "learning_rate": 0.0001702285714285714,
+ "loss": 0.1612,
+ "step": 1515
+ },
+ {
+ "epoch": 8.685714285714285,
+ "grad_norm": 1.3528404235839844,
+ "learning_rate": 0.00016979999999999998,
+ "loss": 0.1636,
+ "step": 1520
+ },
+ {
+ "epoch": 8.714285714285714,
+ "grad_norm": 1.4466065168380737,
+ "learning_rate": 0.00016937142857142856,
+ "loss": 0.1295,
+ "step": 1525
+ },
+ {
+ "epoch": 8.742857142857144,
+ "grad_norm": 0.6576040387153625,
+ "learning_rate": 0.00016894285714285715,
+ "loss": 0.1318,
+ "step": 1530
+ },
+ {
+ "epoch": 8.771428571428572,
+ "grad_norm": 1.286942958831787,
+ "learning_rate": 0.0001685142857142857,
+ "loss": 0.1443,
+ "step": 1535
+ },
+ {
+ "epoch": 8.8,
+ "grad_norm": 9.474458694458008,
+ "learning_rate": 0.00016808571428571426,
+ "loss": 0.1313,
+ "step": 1540
+ },
+ {
+ "epoch": 8.82857142857143,
+ "grad_norm": 2.6731069087982178,
+ "learning_rate": 0.00016765714285714285,
+ "loss": 0.1485,
+ "step": 1545
+ },
+ {
+ "epoch": 8.857142857142858,
+ "grad_norm": 1.313723087310791,
+ "learning_rate": 0.0001672285714285714,
+ "loss": 0.1346,
+ "step": 1550
+ },
+ {
+ "epoch": 8.885714285714286,
+ "grad_norm": 1.7115576267242432,
+ "learning_rate": 0.0001668,
+ "loss": 0.1471,
+ "step": 1555
+ },
+ {
+ "epoch": 8.914285714285715,
+ "grad_norm": 1.2599923610687256,
+ "learning_rate": 0.00016637142857142857,
+ "loss": 0.1433,
+ "step": 1560
+ },
+ {
+ "epoch": 8.942857142857143,
+ "grad_norm": 0.9659029245376587,
+ "learning_rate": 0.00016594285714285713,
+ "loss": 0.1256,
+ "step": 1565
+ },
+ {
+ "epoch": 8.971428571428572,
+ "grad_norm": 1.1282744407653809,
+ "learning_rate": 0.0001655142857142857,
+ "loss": 0.1373,
+ "step": 1570
+ },
+ {
+ "epoch": 9.0,
+ "grad_norm": 3.20717453956604,
+ "learning_rate": 0.00016508571428571427,
+ "loss": 0.1355,
+ "step": 1575
+ },
+ {
+ "epoch": 9.028571428571428,
+ "grad_norm": 0.8310821056365967,
+ "learning_rate": 0.00016465714285714283,
+ "loss": 0.1268,
+ "step": 1580
+ },
+ {
+ "epoch": 9.057142857142857,
+ "grad_norm": 1.5337790250778198,
+ "learning_rate": 0.00016422857142857139,
+ "loss": 0.1267,
+ "step": 1585
+ },
+ {
+ "epoch": 9.085714285714285,
+ "grad_norm": 2.6406068801879883,
+ "learning_rate": 0.0001638,
+ "loss": 0.1363,
+ "step": 1590
+ },
+ {
+ "epoch": 9.114285714285714,
+ "grad_norm": 0.7705873847007751,
+ "learning_rate": 0.00016337142857142855,
+ "loss": 0.1291,
+ "step": 1595
+ },
+ {
+ "epoch": 9.142857142857142,
+ "grad_norm": 0.7092650532722473,
+ "learning_rate": 0.00016294285714285714,
+ "loss": 0.1435,
+ "step": 1600
+ },
+ {
+ "epoch": 9.17142857142857,
+ "grad_norm": 1.098961591720581,
+ "learning_rate": 0.0001625142857142857,
+ "loss": 0.1471,
+ "step": 1605
+ },
+ {
+ "epoch": 9.2,
+ "grad_norm": 0.6994885206222534,
+ "learning_rate": 0.00016208571428571425,
+ "loss": 0.1345,
+ "step": 1610
+ },
+ {
+ "epoch": 9.228571428571428,
+ "grad_norm": 0.9613476991653442,
+ "learning_rate": 0.00016165714285714284,
+ "loss": 0.1399,
+ "step": 1615
+ },
+ {
+ "epoch": 9.257142857142856,
+ "grad_norm": 0.675588846206665,
+ "learning_rate": 0.00016122857142857142,
+ "loss": 0.1319,
+ "step": 1620
+ },
+ {
+ "epoch": 9.285714285714286,
+ "grad_norm": 0.7519372701644897,
+ "learning_rate": 0.0001608,
+ "loss": 0.137,
+ "step": 1625
+ },
+ {
+ "epoch": 9.314285714285715,
+ "grad_norm": 1.135025978088379,
+ "learning_rate": 0.00016037142857142856,
+ "loss": 0.1322,
+ "step": 1630
+ },
+ {
+ "epoch": 9.342857142857143,
+ "grad_norm": 0.7462936639785767,
+ "learning_rate": 0.00015994285714285712,
+ "loss": 0.1215,
+ "step": 1635
+ },
+ {
+ "epoch": 9.371428571428572,
+ "grad_norm": 0.9042088985443115,
+ "learning_rate": 0.0001595142857142857,
+ "loss": 0.1191,
+ "step": 1640
+ },
+ {
+ "epoch": 9.4,
+ "grad_norm": 0.567828893661499,
+ "learning_rate": 0.00015908571428571426,
+ "loss": 0.1189,
+ "step": 1645
+ },
+ {
+ "epoch": 9.428571428571429,
+ "grad_norm": 0.981585681438446,
+ "learning_rate": 0.00015865714285714282,
+ "loss": 0.128,
+ "step": 1650
+ },
+ {
+ "epoch": 9.457142857142857,
+ "grad_norm": 1.24985933303833,
+ "learning_rate": 0.00015822857142857143,
+ "loss": 0.1315,
+ "step": 1655
+ },
+ {
+ "epoch": 9.485714285714286,
+ "grad_norm": 0.6517993211746216,
+ "learning_rate": 0.0001578,
+ "loss": 0.1076,
+ "step": 1660
+ },
+ {
+ "epoch": 9.514285714285714,
+ "grad_norm": 1.166628122329712,
+ "learning_rate": 0.00015737142857142857,
+ "loss": 0.1345,
+ "step": 1665
+ },
+ {
+ "epoch": 9.542857142857143,
+ "grad_norm": 0.9763592481613159,
+ "learning_rate": 0.00015694285714285713,
+ "loss": 0.1449,
+ "step": 1670
+ },
+ {
+ "epoch": 9.571428571428571,
+ "grad_norm": 0.7829060554504395,
+ "learning_rate": 0.00015651428571428569,
+ "loss": 0.1117,
+ "step": 1675
+ },
+ {
+ "epoch": 9.6,
+ "grad_norm": 0.6693719029426575,
+ "learning_rate": 0.00015608571428571427,
+ "loss": 0.1129,
+ "step": 1680
+ },
+ {
+ "epoch": 9.628571428571428,
+ "grad_norm": 1.2122846841812134,
+ "learning_rate": 0.00015565714285714285,
+ "loss": 0.1125,
+ "step": 1685
+ },
+ {
+ "epoch": 9.657142857142857,
+ "grad_norm": 1.0689371824264526,
+ "learning_rate": 0.0001552285714285714,
+ "loss": 0.1478,
+ "step": 1690
+ },
+ {
+ "epoch": 9.685714285714285,
+ "grad_norm": 1.8511656522750854,
+ "learning_rate": 0.0001548,
+ "loss": 0.1431,
+ "step": 1695
+ },
+ {
+ "epoch": 9.714285714285714,
+ "grad_norm": 0.6706506609916687,
+ "learning_rate": 0.00015437142857142855,
+ "loss": 0.1262,
+ "step": 1700
+ },
+ {
+ "epoch": 9.742857142857144,
+ "grad_norm": 1.0798784494400024,
+ "learning_rate": 0.00015394285714285714,
+ "loss": 0.1275,
+ "step": 1705
+ },
+ {
+ "epoch": 9.771428571428572,
+ "grad_norm": 0.7915983200073242,
+ "learning_rate": 0.0001535142857142857,
+ "loss": 0.1316,
+ "step": 1710
+ },
+ {
+ "epoch": 9.8,
+ "grad_norm": 1.8630567789077759,
+ "learning_rate": 0.00015308571428571425,
+ "loss": 0.1258,
+ "step": 1715
+ },
+ {
+ "epoch": 9.82857142857143,
+ "grad_norm": 0.7807756662368774,
+ "learning_rate": 0.00015265714285714286,
+ "loss": 0.1079,
+ "step": 1720
+ },
+ {
+ "epoch": 9.857142857142858,
+ "grad_norm": 1.4698439836502075,
+ "learning_rate": 0.00015222857142857142,
+ "loss": 0.1357,
+ "step": 1725
+ },
+ {
+ "epoch": 9.885714285714286,
+ "grad_norm": 1.2121926546096802,
+ "learning_rate": 0.00015179999999999998,
+ "loss": 0.1322,
+ "step": 1730
+ },
+ {
+ "epoch": 9.914285714285715,
+ "grad_norm": 0.6348568201065063,
+ "learning_rate": 0.00015137142857142856,
+ "loss": 0.0893,
+ "step": 1735
+ },
+ {
+ "epoch": 9.942857142857143,
+ "grad_norm": 0.6694422364234924,
+ "learning_rate": 0.00015094285714285712,
+ "loss": 0.1189,
+ "step": 1740
+ },
+ {
+ "epoch": 9.971428571428572,
+ "grad_norm": 0.569332480430603,
+ "learning_rate": 0.00015051428571428567,
+ "loss": 0.1349,
+ "step": 1745
+ },
+ {
+ "epoch": 10.0,
+ "grad_norm": 0.934073269367218,
+ "learning_rate": 0.00015008571428571429,
+ "loss": 0.1237,
+ "step": 1750
+ },
+ {
+ "epoch": 10.028571428571428,
+ "grad_norm": 0.7191672325134277,
+ "learning_rate": 0.00014965714285714284,
+ "loss": 0.1308,
+ "step": 1755
+ },
+ {
+ "epoch": 10.057142857142857,
+ "grad_norm": 0.7006493806838989,
+ "learning_rate": 0.00014922857142857143,
+ "loss": 0.104,
+ "step": 1760
+ },
+ {
+ "epoch": 10.085714285714285,
+ "grad_norm": 0.9030678272247314,
+ "learning_rate": 0.00014879999999999998,
+ "loss": 0.1308,
+ "step": 1765
+ },
+ {
+ "epoch": 10.114285714285714,
+ "grad_norm": 0.7007766366004944,
+ "learning_rate": 0.00014837142857142854,
+ "loss": 0.1044,
+ "step": 1770
+ },
+ {
+ "epoch": 10.142857142857142,
+ "grad_norm": 0.4832770824432373,
+ "learning_rate": 0.00014794285714285713,
+ "loss": 0.1119,
+ "step": 1775
+ },
+ {
+ "epoch": 10.17142857142857,
+ "grad_norm": 0.7819458842277527,
+ "learning_rate": 0.0001475142857142857,
+ "loss": 0.1087,
+ "step": 1780
+ },
+ {
+ "epoch": 10.2,
+ "grad_norm": 1.0223525762557983,
+ "learning_rate": 0.00014708571428571427,
+ "loss": 0.1314,
+ "step": 1785
+ },
+ {
+ "epoch": 10.228571428571428,
+ "grad_norm": 0.6224566698074341,
+ "learning_rate": 0.00014665714285714285,
+ "loss": 0.1159,
+ "step": 1790
+ },
+ {
+ "epoch": 10.257142857142856,
+ "grad_norm": 0.45800235867500305,
+ "learning_rate": 0.0001462285714285714,
+ "loss": 0.0942,
+ "step": 1795
+ },
+ {
+ "epoch": 10.285714285714286,
+ "grad_norm": 0.6258400082588196,
+ "learning_rate": 0.0001458,
+ "loss": 0.1079,
+ "step": 1800
+ },
+ {
+ "epoch": 10.314285714285715,
+ "grad_norm": 1.1812794208526611,
+ "learning_rate": 0.00014537142857142858,
+ "loss": 0.1378,
+ "step": 1805
+ },
+ {
+ "epoch": 10.342857142857143,
+ "grad_norm": 0.8541269898414612,
+ "learning_rate": 0.00014494285714285713,
+ "loss": 0.1274,
+ "step": 1810
+ },
+ {
+ "epoch": 10.371428571428572,
+ "grad_norm": 0.7131860256195068,
+ "learning_rate": 0.0001445142857142857,
+ "loss": 0.1247,
+ "step": 1815
+ },
+ {
+ "epoch": 10.4,
+ "grad_norm": 0.6109820008277893,
+ "learning_rate": 0.00014408571428571428,
+ "loss": 0.1246,
+ "step": 1820
+ },
+ {
+ "epoch": 10.428571428571429,
+ "grad_norm": 0.5621510744094849,
+ "learning_rate": 0.00014365714285714286,
+ "loss": 0.1039,
+ "step": 1825
+ },
+ {
+ "epoch": 10.457142857142857,
+ "grad_norm": 1.022777795791626,
+ "learning_rate": 0.00014322857142857142,
+ "loss": 0.1206,
+ "step": 1830
+ },
+ {
+ "epoch": 10.485714285714286,
+ "grad_norm": 0.9120668768882751,
+ "learning_rate": 0.00014279999999999997,
+ "loss": 0.1289,
+ "step": 1835
+ },
+ {
+ "epoch": 10.514285714285714,
+ "grad_norm": 1.1882030963897705,
+ "learning_rate": 0.00014237142857142856,
+ "loss": 0.1194,
+ "step": 1840
+ },
+ {
+ "epoch": 10.542857142857143,
+ "grad_norm": 0.6078401207923889,
+ "learning_rate": 0.00014194285714285714,
+ "loss": 0.1339,
+ "step": 1845
+ },
+ {
+ "epoch": 10.571428571428571,
+ "grad_norm": 0.7380999326705933,
+ "learning_rate": 0.0001415142857142857,
+ "loss": 0.1318,
+ "step": 1850
+ },
+ {
+ "epoch": 10.6,
+ "grad_norm": 0.5884959101676941,
+ "learning_rate": 0.00014108571428571428,
+ "loss": 0.1249,
+ "step": 1855
+ },
+ {
+ "epoch": 10.628571428571428,
+ "grad_norm": 1.0121936798095703,
+ "learning_rate": 0.00014065714285714284,
+ "loss": 0.1137,
+ "step": 1860
+ },
+ {
+ "epoch": 10.657142857142857,
+ "grad_norm": 0.6444916129112244,
+ "learning_rate": 0.00014022857142857143,
+ "loss": 0.1213,
+ "step": 1865
+ },
+ {
+ "epoch": 10.685714285714285,
+ "grad_norm": 0.7931004762649536,
+ "learning_rate": 0.00013979999999999998,
+ "loss": 0.1318,
+ "step": 1870
+ },
+ {
+ "epoch": 10.714285714285714,
+ "grad_norm": 0.5596404075622559,
+ "learning_rate": 0.00013937142857142857,
+ "loss": 0.1075,
+ "step": 1875
+ },
+ {
+ "epoch": 10.742857142857144,
+ "grad_norm": 0.6586474180221558,
+ "learning_rate": 0.00013894285714285712,
+ "loss": 0.13,
+ "step": 1880
+ },
+ {
+ "epoch": 10.771428571428572,
+ "grad_norm": 1.0195013284683228,
+ "learning_rate": 0.00013851428571428568,
+ "loss": 0.1373,
+ "step": 1885
+ },
+ {
+ "epoch": 10.8,
+ "grad_norm": 0.9233512878417969,
+ "learning_rate": 0.00013808571428571427,
+ "loss": 0.1168,
+ "step": 1890
+ },
+ {
+ "epoch": 10.82857142857143,
+ "grad_norm": 0.7154092788696289,
+ "learning_rate": 0.00013765714285714285,
+ "loss": 0.1081,
+ "step": 1895
+ },
+ {
+ "epoch": 10.857142857142858,
+ "grad_norm": 1.4588117599487305,
+ "learning_rate": 0.0001372285714285714,
+ "loss": 0.1061,
+ "step": 1900
+ },
+ {
+ "epoch": 10.885714285714286,
+ "grad_norm": 0.6087035536766052,
+ "learning_rate": 0.0001368,
+ "loss": 0.1157,
+ "step": 1905
+ },
+ {
+ "epoch": 10.914285714285715,
+ "grad_norm": 0.7371247410774231,
+ "learning_rate": 0.00013637142857142855,
+ "loss": 0.1339,
+ "step": 1910
+ },
+ {
+ "epoch": 10.942857142857143,
+ "grad_norm": 0.8253212571144104,
+ "learning_rate": 0.00013594285714285713,
+ "loss": 0.1198,
+ "step": 1915
+ },
+ {
+ "epoch": 10.971428571428572,
+ "grad_norm": 0.6889544129371643,
+ "learning_rate": 0.00013551428571428572,
+ "loss": 0.1131,
+ "step": 1920
+ },
+ {
+ "epoch": 11.0,
+ "grad_norm": 0.6408224105834961,
+ "learning_rate": 0.00013508571428571427,
+ "loss": 0.122,
+ "step": 1925
+ },
+ {
+ "epoch": 11.028571428571428,
+ "grad_norm": 0.6771185398101807,
+ "learning_rate": 0.00013465714285714283,
+ "loss": 0.1492,
+ "step": 1930
+ },
+ {
+ "epoch": 11.057142857142857,
+ "grad_norm": 0.8706450462341309,
+ "learning_rate": 0.00013422857142857142,
+ "loss": 0.1294,
+ "step": 1935
+ },
+ {
+ "epoch": 11.085714285714285,
+ "grad_norm": 1.730648398399353,
+ "learning_rate": 0.0001338,
+ "loss": 0.1004,
+ "step": 1940
+ },
+ {
+ "epoch": 11.114285714285714,
+ "grad_norm": 0.6985113620758057,
+ "learning_rate": 0.00013337142857142856,
+ "loss": 0.0995,
+ "step": 1945
+ },
+ {
+ "epoch": 11.142857142857142,
+ "grad_norm": 0.8901951313018799,
+ "learning_rate": 0.00013294285714285711,
+ "loss": 0.1179,
+ "step": 1950
+ },
+ {
+ "epoch": 11.17142857142857,
+ "grad_norm": 0.7232164144515991,
+ "learning_rate": 0.0001325142857142857,
+ "loss": 0.1397,
+ "step": 1955
+ },
+ {
+ "epoch": 11.2,
+ "grad_norm": 0.6447544693946838,
+ "learning_rate": 0.00013208571428571428,
+ "loss": 0.1366,
+ "step": 1960
+ },
+ {
+ "epoch": 11.228571428571428,
+ "grad_norm": 0.7964944243431091,
+ "learning_rate": 0.00013165714285714284,
+ "loss": 0.1121,
+ "step": 1965
+ },
+ {
+ "epoch": 11.257142857142856,
+ "grad_norm": 0.9012628793716431,
+ "learning_rate": 0.00013122857142857142,
+ "loss": 0.1131,
+ "step": 1970
+ },
+ {
+ "epoch": 11.285714285714286,
+ "grad_norm": 0.9295369982719421,
+ "learning_rate": 0.00013079999999999998,
+ "loss": 0.1232,
+ "step": 1975
+ },
+ {
+ "epoch": 11.314285714285715,
+ "grad_norm": 0.6237708926200867,
+ "learning_rate": 0.00013037142857142857,
+ "loss": 0.1066,
+ "step": 1980
+ },
+ {
+ "epoch": 11.342857142857143,
+ "grad_norm": 0.5250967741012573,
+ "learning_rate": 0.00012994285714285715,
+ "loss": 0.118,
+ "step": 1985
+ },
+ {
+ "epoch": 11.371428571428572,
+ "grad_norm": 1.0013964176177979,
+ "learning_rate": 0.0001295142857142857,
+ "loss": 0.1125,
+ "step": 1990
+ },
+ {
+ "epoch": 11.4,
+ "grad_norm": 0.6721311807632446,
+ "learning_rate": 0.00012908571428571426,
+ "loss": 0.1196,
+ "step": 1995
+ },
+ {
+ "epoch": 11.428571428571429,
+ "grad_norm": 0.6966421008110046,
+ "learning_rate": 0.00012865714285714285,
+ "loss": 0.1172,
+ "step": 2000
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 3500,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 20,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 0.0,
+ "train_batch_size": 200,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/glot-contrastive-final-lora/checkpoint-2000/training_args.bin b/glot-contrastive-final-lora/checkpoint-2000/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..504636d7db81a728b5309e791498748e16c3b8ec
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-2000/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02a87dc6b2c67ad3df98065b9e8fa21d9d93cd2cb361c532cb83c8a37bdc81a3
+size 5777
diff --git a/glot-contrastive-final-lora/checkpoint-2500/README.md b/glot-contrastive-final-lora/checkpoint-2500/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9e142ce83a38d62751eee08226da0acc7c10eae5
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-2500/README.md
@@ -0,0 +1,206 @@
+---
+base_model: ./glot-mlm-adapted
+library_name: peft
+tags:
+- base_model:adapter:./glot-mlm-adapted
+- lora
+- transformers
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/glot-contrastive-final-lora/checkpoint-2500/adapter_config.json b/glot-contrastive-final-lora/checkpoint-2500/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..846b0d8ef2ad7bc4b0b04decd5769b6250d7be73
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-2500/adapter_config.json
@@ -0,0 +1,37 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "./glot-mlm-adapted",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "query",
+ "value"
+ ],
+ "target_parameters": null,
+ "task_type": "FEATURE_EXTRACTION",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/glot-contrastive-final-lora/checkpoint-2500/adapter_model.safetensors b/glot-contrastive-final-lora/checkpoint-2500/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..964a26ebd94b76a139f6016a3b577cdf72a05f0d
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-2500/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:940dc572880c580cd969ac155623363743c9f3ef94854aba54b224023c4a2ee1
+size 2365824
diff --git a/glot-contrastive-final-lora/checkpoint-2500/optimizer.pt b/glot-contrastive-final-lora/checkpoint-2500/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5788a9996e7f73c42f9b09fc4be20cc399796580
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-2500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d2c540c91f6c54cf3701175d6db55034ccae2b3b587a04b9476ce989d4fa18b
+size 4760395
diff --git a/glot-contrastive-final-lora/checkpoint-2500/rng_state.pth b/glot-contrastive-final-lora/checkpoint-2500/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..33b6eedc9a83e25b359069f1d4502c4ee4ec4163
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-2500/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82bf023104ba6bb70dbc679f41d50ee904b14245b597026bbb288d43524d6797
+size 14645
diff --git a/glot-contrastive-final-lora/checkpoint-2500/scheduler.pt b/glot-contrastive-final-lora/checkpoint-2500/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c9272fd4135ab463ff7ad109f92c09aff73a7ae4
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-2500/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:11ef9936017bed12cabfddfce2a90fd82a625d038e573173ab445ab44ee6c357
+size 1465
diff --git a/glot-contrastive-final-lora/checkpoint-2500/sentencepiece.bpe.model b/glot-contrastive-final-lora/checkpoint-2500/sentencepiece.bpe.model
new file mode 100644
index 0000000000000000000000000000000000000000..41409bcc76133d6e92fdd5909093d24e32662ba0
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-2500/sentencepiece.bpe.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a313a26470baedaede322622492f2a542aa41527ddc5d40de444e945ad3c613
+size 7658320
diff --git a/glot-contrastive-final-lora/checkpoint-2500/special_tokens_map.json b/glot-contrastive-final-lora/checkpoint-2500/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..68171d1ff68b731a33d119708476692c094a466b
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-2500/special_tokens_map.json
@@ -0,0 +1,15 @@
+{
+ "bos_token": "",
+ "cls_token": "",
+ "eos_token": "",
+ "mask_token": {
+ "content": "",
+ "lstrip": true,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "",
+ "sep_token": "",
+ "unk_token": ""
+}
diff --git a/glot-contrastive-final-lora/checkpoint-2500/tokenizer_config.json b/glot-contrastive-final-lora/checkpoint-2500/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..2d4a02cfefad7bc6d993905187616acfc547f17d
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-2500/tokenizer_config.json
@@ -0,0 +1,57 @@
+{
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "3": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "401144": {
+ "content": "",
+ "lstrip": true,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "",
+ "clean_up_tokenization_spaces": false,
+ "cls_token": "",
+ "eos_token": "",
+ "extra_special_tokens": {},
+ "mask_token": "",
+ "model_max_length": 512,
+ "pad_token": "",
+ "sep_token": "",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "XLMRobertaTokenizer",
+ "unk_token": "",
+ "use_fast": true
+}
diff --git a/glot-contrastive-final-lora/checkpoint-2500/trainer_state.json b/glot-contrastive-final-lora/checkpoint-2500/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..67999d8d7c11daa2dc63ca5ef8eb1010c1ffc191
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-2500/trainer_state.json
@@ -0,0 +1,3534 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 14.285714285714286,
+ "eval_steps": 5,
+ "global_step": 2500,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.02857142857142857,
+ "grad_norm": 0.1407003551721573,
+ "learning_rate": 0.00029965714285714283,
+ "loss": 0.9726,
+ "step": 5
+ },
+ {
+ "epoch": 0.05714285714285714,
+ "grad_norm": 0.26689061522483826,
+ "learning_rate": 0.0002992285714285714,
+ "loss": 0.9633,
+ "step": 10
+ },
+ {
+ "epoch": 0.08571428571428572,
+ "grad_norm": 0.8670485615730286,
+ "learning_rate": 0.0002988,
+ "loss": 0.9013,
+ "step": 15
+ },
+ {
+ "epoch": 0.11428571428571428,
+ "grad_norm": 0.9785467386245728,
+ "learning_rate": 0.00029837142857142853,
+ "loss": 0.6942,
+ "step": 20
+ },
+ {
+ "epoch": 0.14285714285714285,
+ "grad_norm": 1.3083932399749756,
+ "learning_rate": 0.0002979428571428571,
+ "loss": 0.4472,
+ "step": 25
+ },
+ {
+ "epoch": 0.17142857142857143,
+ "grad_norm": 1.6103293895721436,
+ "learning_rate": 0.0002975142857142857,
+ "loss": 0.3782,
+ "step": 30
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 2.6353416442871094,
+ "learning_rate": 0.0002970857142857143,
+ "loss": 0.3732,
+ "step": 35
+ },
+ {
+ "epoch": 0.22857142857142856,
+ "grad_norm": 0.9949072003364563,
+ "learning_rate": 0.0002966571428571428,
+ "loss": 0.3506,
+ "step": 40
+ },
+ {
+ "epoch": 0.2571428571428571,
+ "grad_norm": 1.280673861503601,
+ "learning_rate": 0.0002962285714285714,
+ "loss": 0.3346,
+ "step": 45
+ },
+ {
+ "epoch": 0.2857142857142857,
+ "grad_norm": 0.7681456208229065,
+ "learning_rate": 0.0002958,
+ "loss": 0.2832,
+ "step": 50
+ },
+ {
+ "epoch": 0.3142857142857143,
+ "grad_norm": 1.0000813007354736,
+ "learning_rate": 0.0002953714285714285,
+ "loss": 0.2603,
+ "step": 55
+ },
+ {
+ "epoch": 0.34285714285714286,
+ "grad_norm": 1.0222399234771729,
+ "learning_rate": 0.0002949428571428571,
+ "loss": 0.2507,
+ "step": 60
+ },
+ {
+ "epoch": 0.37142857142857144,
+ "grad_norm": 0.896902322769165,
+ "learning_rate": 0.0002945142857142857,
+ "loss": 0.2556,
+ "step": 65
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 0.9035541415214539,
+ "learning_rate": 0.00029408571428571426,
+ "loss": 0.2402,
+ "step": 70
+ },
+ {
+ "epoch": 0.42857142857142855,
+ "grad_norm": 1.4886469841003418,
+ "learning_rate": 0.00029365714285714285,
+ "loss": 0.2376,
+ "step": 75
+ },
+ {
+ "epoch": 0.45714285714285713,
+ "grad_norm": 0.8951187133789062,
+ "learning_rate": 0.0002932285714285714,
+ "loss": 0.2276,
+ "step": 80
+ },
+ {
+ "epoch": 0.4857142857142857,
+ "grad_norm": 0.7876377105712891,
+ "learning_rate": 0.00029279999999999996,
+ "loss": 0.2537,
+ "step": 85
+ },
+ {
+ "epoch": 0.5142857142857142,
+ "grad_norm": 1.0927226543426514,
+ "learning_rate": 0.00029237142857142855,
+ "loss": 0.2152,
+ "step": 90
+ },
+ {
+ "epoch": 0.5428571428571428,
+ "grad_norm": 1.4946355819702148,
+ "learning_rate": 0.00029194285714285713,
+ "loss": 0.2441,
+ "step": 95
+ },
+ {
+ "epoch": 0.5714285714285714,
+ "grad_norm": 0.7082991600036621,
+ "learning_rate": 0.0002915142857142857,
+ "loss": 0.2708,
+ "step": 100
+ },
+ {
+ "epoch": 0.6,
+ "grad_norm": 0.670010507106781,
+ "learning_rate": 0.00029108571428571424,
+ "loss": 0.2396,
+ "step": 105
+ },
+ {
+ "epoch": 0.6285714285714286,
+ "grad_norm": 0.9797312021255493,
+ "learning_rate": 0.00029065714285714283,
+ "loss": 0.2275,
+ "step": 110
+ },
+ {
+ "epoch": 0.6571428571428571,
+ "grad_norm": 1.5220463275909424,
+ "learning_rate": 0.0002902285714285714,
+ "loss": 0.2114,
+ "step": 115
+ },
+ {
+ "epoch": 0.6857142857142857,
+ "grad_norm": 1.3326867818832397,
+ "learning_rate": 0.00028979999999999994,
+ "loss": 0.241,
+ "step": 120
+ },
+ {
+ "epoch": 0.7142857142857143,
+ "grad_norm": 1.1195529699325562,
+ "learning_rate": 0.0002893714285714285,
+ "loss": 0.2389,
+ "step": 125
+ },
+ {
+ "epoch": 0.7428571428571429,
+ "grad_norm": 0.7551061511039734,
+ "learning_rate": 0.0002889428571428571,
+ "loss": 0.2162,
+ "step": 130
+ },
+ {
+ "epoch": 0.7714285714285715,
+ "grad_norm": 1.018908977508545,
+ "learning_rate": 0.0002885142857142857,
+ "loss": 0.1924,
+ "step": 135
+ },
+ {
+ "epoch": 0.8,
+ "grad_norm": 2.123642921447754,
+ "learning_rate": 0.0002880857142857143,
+ "loss": 0.2174,
+ "step": 140
+ },
+ {
+ "epoch": 0.8285714285714286,
+ "grad_norm": 0.7585068941116333,
+ "learning_rate": 0.0002876571428571428,
+ "loss": 0.2006,
+ "step": 145
+ },
+ {
+ "epoch": 0.8571428571428571,
+ "grad_norm": 1.64150869846344,
+ "learning_rate": 0.0002872285714285714,
+ "loss": 0.1905,
+ "step": 150
+ },
+ {
+ "epoch": 0.8857142857142857,
+ "grad_norm": 0.9126951694488525,
+ "learning_rate": 0.0002868,
+ "loss": 0.2312,
+ "step": 155
+ },
+ {
+ "epoch": 0.9142857142857143,
+ "grad_norm": 0.7278801202774048,
+ "learning_rate": 0.00028637142857142856,
+ "loss": 0.2077,
+ "step": 160
+ },
+ {
+ "epoch": 0.9428571428571428,
+ "grad_norm": 0.8931339383125305,
+ "learning_rate": 0.00028594285714285715,
+ "loss": 0.1951,
+ "step": 165
+ },
+ {
+ "epoch": 0.9714285714285714,
+ "grad_norm": 1.0831843614578247,
+ "learning_rate": 0.0002855142857142857,
+ "loss": 0.2103,
+ "step": 170
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 1.3750063180923462,
+ "learning_rate": 0.00028508571428571426,
+ "loss": 0.2396,
+ "step": 175
+ },
+ {
+ "epoch": 1.0285714285714285,
+ "grad_norm": 0.8338337540626526,
+ "learning_rate": 0.00028465714285714285,
+ "loss": 0.2404,
+ "step": 180
+ },
+ {
+ "epoch": 1.0571428571428572,
+ "grad_norm": 1.2879024744033813,
+ "learning_rate": 0.0002842285714285714,
+ "loss": 0.2117,
+ "step": 185
+ },
+ {
+ "epoch": 1.0857142857142856,
+ "grad_norm": 1.6751821041107178,
+ "learning_rate": 0.00028379999999999996,
+ "loss": 0.1796,
+ "step": 190
+ },
+ {
+ "epoch": 1.1142857142857143,
+ "grad_norm": 0.9864417910575867,
+ "learning_rate": 0.00028337142857142854,
+ "loss": 0.1993,
+ "step": 195
+ },
+ {
+ "epoch": 1.1428571428571428,
+ "grad_norm": 1.0174155235290527,
+ "learning_rate": 0.00028294285714285713,
+ "loss": 0.2068,
+ "step": 200
+ },
+ {
+ "epoch": 1.1714285714285715,
+ "grad_norm": 1.029832124710083,
+ "learning_rate": 0.0002825142857142857,
+ "loss": 0.2015,
+ "step": 205
+ },
+ {
+ "epoch": 1.2,
+ "grad_norm": 0.7745446562767029,
+ "learning_rate": 0.00028208571428571424,
+ "loss": 0.2129,
+ "step": 210
+ },
+ {
+ "epoch": 1.2285714285714286,
+ "grad_norm": 2.5578622817993164,
+ "learning_rate": 0.0002816571428571428,
+ "loss": 0.2224,
+ "step": 215
+ },
+ {
+ "epoch": 1.2571428571428571,
+ "grad_norm": 2.4185051918029785,
+ "learning_rate": 0.0002812285714285714,
+ "loss": 0.2276,
+ "step": 220
+ },
+ {
+ "epoch": 1.2857142857142856,
+ "grad_norm": 1.4176461696624756,
+ "learning_rate": 0.0002808,
+ "loss": 0.1781,
+ "step": 225
+ },
+ {
+ "epoch": 1.3142857142857143,
+ "grad_norm": 0.709326982498169,
+ "learning_rate": 0.0002803714285714286,
+ "loss": 0.2177,
+ "step": 230
+ },
+ {
+ "epoch": 1.342857142857143,
+ "grad_norm": 0.8170766830444336,
+ "learning_rate": 0.0002799428571428571,
+ "loss": 0.1769,
+ "step": 235
+ },
+ {
+ "epoch": 1.3714285714285714,
+ "grad_norm": 1.3850761651992798,
+ "learning_rate": 0.0002795142857142857,
+ "loss": 0.2262,
+ "step": 240
+ },
+ {
+ "epoch": 1.4,
+ "grad_norm": 1.0064373016357422,
+ "learning_rate": 0.0002790857142857143,
+ "loss": 0.196,
+ "step": 245
+ },
+ {
+ "epoch": 1.4285714285714286,
+ "grad_norm": 1.9635728597640991,
+ "learning_rate": 0.0002786571428571428,
+ "loss": 0.2029,
+ "step": 250
+ },
+ {
+ "epoch": 1.457142857142857,
+ "grad_norm": 16.20791244506836,
+ "learning_rate": 0.0002782285714285714,
+ "loss": 0.3925,
+ "step": 255
+ },
+ {
+ "epoch": 1.4857142857142858,
+ "grad_norm": 1.4363322257995605,
+ "learning_rate": 0.0002778,
+ "loss": 0.3684,
+ "step": 260
+ },
+ {
+ "epoch": 1.5142857142857142,
+ "grad_norm": 0.9379534721374512,
+ "learning_rate": 0.00027737142857142856,
+ "loss": 0.2265,
+ "step": 265
+ },
+ {
+ "epoch": 1.5428571428571427,
+ "grad_norm": 0.8453512787818909,
+ "learning_rate": 0.00027694285714285714,
+ "loss": 0.1976,
+ "step": 270
+ },
+ {
+ "epoch": 1.5714285714285714,
+ "grad_norm": 2.316664695739746,
+ "learning_rate": 0.0002765142857142857,
+ "loss": 0.23,
+ "step": 275
+ },
+ {
+ "epoch": 1.6,
+ "grad_norm": 1.0548444986343384,
+ "learning_rate": 0.00027608571428571426,
+ "loss": 0.1823,
+ "step": 280
+ },
+ {
+ "epoch": 1.6285714285714286,
+ "grad_norm": 3.7894928455352783,
+ "learning_rate": 0.00027565714285714284,
+ "loss": 0.1962,
+ "step": 285
+ },
+ {
+ "epoch": 1.657142857142857,
+ "grad_norm": 2.3081610202789307,
+ "learning_rate": 0.00027522857142857143,
+ "loss": 0.2087,
+ "step": 290
+ },
+ {
+ "epoch": 1.6857142857142857,
+ "grad_norm": 0.9311438202857971,
+ "learning_rate": 0.0002748,
+ "loss": 0.1597,
+ "step": 295
+ },
+ {
+ "epoch": 1.7142857142857144,
+ "grad_norm": 1.1881247758865356,
+ "learning_rate": 0.00027437142857142854,
+ "loss": 0.1764,
+ "step": 300
+ },
+ {
+ "epoch": 1.7428571428571429,
+ "grad_norm": 1.30265212059021,
+ "learning_rate": 0.0002739428571428571,
+ "loss": 0.1647,
+ "step": 305
+ },
+ {
+ "epoch": 1.7714285714285714,
+ "grad_norm": 0.6832175850868225,
+ "learning_rate": 0.0002735142857142857,
+ "loss": 0.1638,
+ "step": 310
+ },
+ {
+ "epoch": 1.8,
+ "grad_norm": 1.8740538358688354,
+ "learning_rate": 0.00027308571428571424,
+ "loss": 0.1803,
+ "step": 315
+ },
+ {
+ "epoch": 1.8285714285714287,
+ "grad_norm": 9.821504592895508,
+ "learning_rate": 0.0002726571428571428,
+ "loss": 0.226,
+ "step": 320
+ },
+ {
+ "epoch": 1.8571428571428572,
+ "grad_norm": 1.0889750719070435,
+ "learning_rate": 0.0002722285714285714,
+ "loss": 0.1822,
+ "step": 325
+ },
+ {
+ "epoch": 1.8857142857142857,
+ "grad_norm": 0.9660868048667908,
+ "learning_rate": 0.0002718,
+ "loss": 0.1842,
+ "step": 330
+ },
+ {
+ "epoch": 1.9142857142857141,
+ "grad_norm": 0.6329234838485718,
+ "learning_rate": 0.0002713714285714286,
+ "loss": 0.1488,
+ "step": 335
+ },
+ {
+ "epoch": 1.9428571428571428,
+ "grad_norm": 3.601266384124756,
+ "learning_rate": 0.0002709428571428571,
+ "loss": 0.1887,
+ "step": 340
+ },
+ {
+ "epoch": 1.9714285714285715,
+ "grad_norm": 1.1441439390182495,
+ "learning_rate": 0.0002705142857142857,
+ "loss": 0.184,
+ "step": 345
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.8586034774780273,
+ "learning_rate": 0.0002700857142857143,
+ "loss": 0.1578,
+ "step": 350
+ },
+ {
+ "epoch": 2.0285714285714285,
+ "grad_norm": 1.5113487243652344,
+ "learning_rate": 0.00026965714285714286,
+ "loss": 0.2002,
+ "step": 355
+ },
+ {
+ "epoch": 2.057142857142857,
+ "grad_norm": 1.1123011112213135,
+ "learning_rate": 0.0002692285714285714,
+ "loss": 0.1946,
+ "step": 360
+ },
+ {
+ "epoch": 2.085714285714286,
+ "grad_norm": 0.9377036094665527,
+ "learning_rate": 0.0002688,
+ "loss": 0.1971,
+ "step": 365
+ },
+ {
+ "epoch": 2.1142857142857143,
+ "grad_norm": 0.6956892609596252,
+ "learning_rate": 0.00026837142857142856,
+ "loss": 0.1758,
+ "step": 370
+ },
+ {
+ "epoch": 2.142857142857143,
+ "grad_norm": 0.7510782480239868,
+ "learning_rate": 0.0002679428571428571,
+ "loss": 0.1674,
+ "step": 375
+ },
+ {
+ "epoch": 2.1714285714285713,
+ "grad_norm": 0.7009285092353821,
+ "learning_rate": 0.00026751428571428567,
+ "loss": 0.1945,
+ "step": 380
+ },
+ {
+ "epoch": 2.2,
+ "grad_norm": 0.9555609822273254,
+ "learning_rate": 0.00026708571428571426,
+ "loss": 0.1857,
+ "step": 385
+ },
+ {
+ "epoch": 2.2285714285714286,
+ "grad_norm": 2.133979082107544,
+ "learning_rate": 0.00026665714285714284,
+ "loss": 0.1636,
+ "step": 390
+ },
+ {
+ "epoch": 2.257142857142857,
+ "grad_norm": 0.7105309963226318,
+ "learning_rate": 0.0002662285714285714,
+ "loss": 0.2014,
+ "step": 395
+ },
+ {
+ "epoch": 2.2857142857142856,
+ "grad_norm": 0.7329701781272888,
+ "learning_rate": 0.00026579999999999996,
+ "loss": 0.1884,
+ "step": 400
+ },
+ {
+ "epoch": 2.314285714285714,
+ "grad_norm": 1.0426994562149048,
+ "learning_rate": 0.00026537142857142854,
+ "loss": 0.1558,
+ "step": 405
+ },
+ {
+ "epoch": 2.342857142857143,
+ "grad_norm": 0.9306122660636902,
+ "learning_rate": 0.0002649428571428571,
+ "loss": 0.1774,
+ "step": 410
+ },
+ {
+ "epoch": 2.3714285714285714,
+ "grad_norm": 0.6989394426345825,
+ "learning_rate": 0.00026451428571428565,
+ "loss": 0.1601,
+ "step": 415
+ },
+ {
+ "epoch": 2.4,
+ "grad_norm": 1.4383760690689087,
+ "learning_rate": 0.0002640857142857143,
+ "loss": 0.1564,
+ "step": 420
+ },
+ {
+ "epoch": 2.4285714285714284,
+ "grad_norm": 0.6448336839675903,
+ "learning_rate": 0.0002636571428571428,
+ "loss": 0.1827,
+ "step": 425
+ },
+ {
+ "epoch": 2.4571428571428573,
+ "grad_norm": 0.9535760879516602,
+ "learning_rate": 0.0002632285714285714,
+ "loss": 0.1713,
+ "step": 430
+ },
+ {
+ "epoch": 2.4857142857142858,
+ "grad_norm": 1.034945011138916,
+ "learning_rate": 0.0002628,
+ "loss": 0.1457,
+ "step": 435
+ },
+ {
+ "epoch": 2.5142857142857142,
+ "grad_norm": 1.3225128650665283,
+ "learning_rate": 0.0002623714285714285,
+ "loss": 0.1633,
+ "step": 440
+ },
+ {
+ "epoch": 2.5428571428571427,
+ "grad_norm": 0.8285059928894043,
+ "learning_rate": 0.0002619428571428571,
+ "loss": 0.2004,
+ "step": 445
+ },
+ {
+ "epoch": 2.571428571428571,
+ "grad_norm": 0.773176908493042,
+ "learning_rate": 0.0002615142857142857,
+ "loss": 0.1641,
+ "step": 450
+ },
+ {
+ "epoch": 2.6,
+ "grad_norm": 0.7964853048324585,
+ "learning_rate": 0.0002610857142857143,
+ "loss": 0.1608,
+ "step": 455
+ },
+ {
+ "epoch": 2.6285714285714286,
+ "grad_norm": 1.0967328548431396,
+ "learning_rate": 0.00026065714285714286,
+ "loss": 0.1697,
+ "step": 460
+ },
+ {
+ "epoch": 2.657142857142857,
+ "grad_norm": 0.6462066173553467,
+ "learning_rate": 0.0002602285714285714,
+ "loss": 0.1512,
+ "step": 465
+ },
+ {
+ "epoch": 2.685714285714286,
+ "grad_norm": 0.8765937089920044,
+ "learning_rate": 0.00025979999999999997,
+ "loss": 0.1826,
+ "step": 470
+ },
+ {
+ "epoch": 2.7142857142857144,
+ "grad_norm": 1.2524124383926392,
+ "learning_rate": 0.00025937142857142856,
+ "loss": 0.1731,
+ "step": 475
+ },
+ {
+ "epoch": 2.742857142857143,
+ "grad_norm": 2.2982606887817383,
+ "learning_rate": 0.0002589428571428571,
+ "loss": 0.1852,
+ "step": 480
+ },
+ {
+ "epoch": 2.7714285714285714,
+ "grad_norm": 0.9989053010940552,
+ "learning_rate": 0.0002585142857142857,
+ "loss": 0.1791,
+ "step": 485
+ },
+ {
+ "epoch": 2.8,
+ "grad_norm": 0.772343635559082,
+ "learning_rate": 0.00025808571428571426,
+ "loss": 0.1862,
+ "step": 490
+ },
+ {
+ "epoch": 2.8285714285714287,
+ "grad_norm": 1.2101136445999146,
+ "learning_rate": 0.00025765714285714284,
+ "loss": 0.1806,
+ "step": 495
+ },
+ {
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.8010189533233643,
+ "learning_rate": 0.0002572285714285714,
+ "loss": 0.1842,
+ "step": 500
+ },
+ {
+ "epoch": 2.8857142857142857,
+ "grad_norm": 1.3597544431686401,
+ "learning_rate": 0.00025679999999999995,
+ "loss": 0.1583,
+ "step": 505
+ },
+ {
+ "epoch": 2.914285714285714,
+ "grad_norm": 0.8790671825408936,
+ "learning_rate": 0.00025637142857142854,
+ "loss": 0.1565,
+ "step": 510
+ },
+ {
+ "epoch": 2.942857142857143,
+ "grad_norm": 1.1175066232681274,
+ "learning_rate": 0.0002559428571428571,
+ "loss": 0.1406,
+ "step": 515
+ },
+ {
+ "epoch": 2.9714285714285715,
+ "grad_norm": 2.8528785705566406,
+ "learning_rate": 0.0002555142857142857,
+ "loss": 0.1735,
+ "step": 520
+ },
+ {
+ "epoch": 3.0,
+ "grad_norm": 2.2073328495025635,
+ "learning_rate": 0.0002550857142857143,
+ "loss": 0.1816,
+ "step": 525
+ },
+ {
+ "epoch": 3.0285714285714285,
+ "grad_norm": 11.01322078704834,
+ "learning_rate": 0.0002546571428571428,
+ "loss": 0.1873,
+ "step": 530
+ },
+ {
+ "epoch": 3.057142857142857,
+ "grad_norm": 1.5822402238845825,
+ "learning_rate": 0.0002542285714285714,
+ "loss": 0.168,
+ "step": 535
+ },
+ {
+ "epoch": 3.085714285714286,
+ "grad_norm": 1.3086942434310913,
+ "learning_rate": 0.0002538,
+ "loss": 0.149,
+ "step": 540
+ },
+ {
+ "epoch": 3.1142857142857143,
+ "grad_norm": 6.303041458129883,
+ "learning_rate": 0.0002533714285714285,
+ "loss": 0.1651,
+ "step": 545
+ },
+ {
+ "epoch": 3.142857142857143,
+ "grad_norm": 14.48929500579834,
+ "learning_rate": 0.00025294285714285716,
+ "loss": 0.1687,
+ "step": 550
+ },
+ {
+ "epoch": 3.1714285714285713,
+ "grad_norm": 6.824525356292725,
+ "learning_rate": 0.0002525142857142857,
+ "loss": 0.1919,
+ "step": 555
+ },
+ {
+ "epoch": 3.2,
+ "grad_norm": 18.772563934326172,
+ "learning_rate": 0.00025208571428571427,
+ "loss": 0.2075,
+ "step": 560
+ },
+ {
+ "epoch": 3.2285714285714286,
+ "grad_norm": 0.7268752455711365,
+ "learning_rate": 0.00025165714285714286,
+ "loss": 0.174,
+ "step": 565
+ },
+ {
+ "epoch": 3.257142857142857,
+ "grad_norm": 1.1301453113555908,
+ "learning_rate": 0.0002512285714285714,
+ "loss": 0.1668,
+ "step": 570
+ },
+ {
+ "epoch": 3.2857142857142856,
+ "grad_norm": 2.846802234649658,
+ "learning_rate": 0.00025079999999999997,
+ "loss": 0.1645,
+ "step": 575
+ },
+ {
+ "epoch": 3.314285714285714,
+ "grad_norm": 1.417515754699707,
+ "learning_rate": 0.00025037142857142855,
+ "loss": 0.1719,
+ "step": 580
+ },
+ {
+ "epoch": 3.342857142857143,
+ "grad_norm": 4.137150764465332,
+ "learning_rate": 0.00024994285714285714,
+ "loss": 0.1739,
+ "step": 585
+ },
+ {
+ "epoch": 3.3714285714285714,
+ "grad_norm": 2.6067259311676025,
+ "learning_rate": 0.0002495142857142857,
+ "loss": 0.1489,
+ "step": 590
+ },
+ {
+ "epoch": 3.4,
+ "grad_norm": 2.601024627685547,
+ "learning_rate": 0.00024908571428571425,
+ "loss": 0.1618,
+ "step": 595
+ },
+ {
+ "epoch": 3.4285714285714284,
+ "grad_norm": 3.849017858505249,
+ "learning_rate": 0.00024865714285714284,
+ "loss": 0.1899,
+ "step": 600
+ },
+ {
+ "epoch": 3.4571428571428573,
+ "grad_norm": 4.673766136169434,
+ "learning_rate": 0.0002482285714285714,
+ "loss": 0.1761,
+ "step": 605
+ },
+ {
+ "epoch": 3.4857142857142858,
+ "grad_norm": 2.6057631969451904,
+ "learning_rate": 0.00024779999999999995,
+ "loss": 0.1743,
+ "step": 610
+ },
+ {
+ "epoch": 3.5142857142857142,
+ "grad_norm": 2.932652473449707,
+ "learning_rate": 0.0002473714285714286,
+ "loss": 0.1482,
+ "step": 615
+ },
+ {
+ "epoch": 3.5428571428571427,
+ "grad_norm": 0.8764939308166504,
+ "learning_rate": 0.0002469428571428571,
+ "loss": 0.1644,
+ "step": 620
+ },
+ {
+ "epoch": 3.571428571428571,
+ "grad_norm": 1.3203191757202148,
+ "learning_rate": 0.0002465142857142857,
+ "loss": 0.1654,
+ "step": 625
+ },
+ {
+ "epoch": 3.6,
+ "grad_norm": 0.7977635264396667,
+ "learning_rate": 0.0002460857142857143,
+ "loss": 0.1472,
+ "step": 630
+ },
+ {
+ "epoch": 3.6285714285714286,
+ "grad_norm": 1.4750248193740845,
+ "learning_rate": 0.0002456571428571428,
+ "loss": 0.1735,
+ "step": 635
+ },
+ {
+ "epoch": 3.657142857142857,
+ "grad_norm": 1.8164482116699219,
+ "learning_rate": 0.0002452285714285714,
+ "loss": 0.1593,
+ "step": 640
+ },
+ {
+ "epoch": 3.685714285714286,
+ "grad_norm": 1.4829603433609009,
+ "learning_rate": 0.0002448,
+ "loss": 0.1508,
+ "step": 645
+ },
+ {
+ "epoch": 3.7142857142857144,
+ "grad_norm": 0.8828144669532776,
+ "learning_rate": 0.00024437142857142857,
+ "loss": 0.1573,
+ "step": 650
+ },
+ {
+ "epoch": 3.742857142857143,
+ "grad_norm": 2.039384126663208,
+ "learning_rate": 0.00024394285714285713,
+ "loss": 0.1745,
+ "step": 655
+ },
+ {
+ "epoch": 3.7714285714285714,
+ "grad_norm": 0.9604200720787048,
+ "learning_rate": 0.00024351428571428569,
+ "loss": 0.17,
+ "step": 660
+ },
+ {
+ "epoch": 3.8,
+ "grad_norm": 0.7903971076011658,
+ "learning_rate": 0.00024308571428571427,
+ "loss": 0.1654,
+ "step": 665
+ },
+ {
+ "epoch": 3.8285714285714287,
+ "grad_norm": 0.6935649514198303,
+ "learning_rate": 0.00024265714285714283,
+ "loss": 0.1714,
+ "step": 670
+ },
+ {
+ "epoch": 3.857142857142857,
+ "grad_norm": 0.5832012295722961,
+ "learning_rate": 0.00024222857142857138,
+ "loss": 0.1636,
+ "step": 675
+ },
+ {
+ "epoch": 3.8857142857142857,
+ "grad_norm": 0.6303168535232544,
+ "learning_rate": 0.0002418,
+ "loss": 0.1604,
+ "step": 680
+ },
+ {
+ "epoch": 3.914285714285714,
+ "grad_norm": 0.7210885882377625,
+ "learning_rate": 0.00024137142857142855,
+ "loss": 0.1444,
+ "step": 685
+ },
+ {
+ "epoch": 3.942857142857143,
+ "grad_norm": 0.7690990567207336,
+ "learning_rate": 0.00024094285714285714,
+ "loss": 0.1631,
+ "step": 690
+ },
+ {
+ "epoch": 3.9714285714285715,
+ "grad_norm": 1.0142720937728882,
+ "learning_rate": 0.0002405142857142857,
+ "loss": 0.158,
+ "step": 695
+ },
+ {
+ "epoch": 4.0,
+ "grad_norm": 0.7970322966575623,
+ "learning_rate": 0.00024008571428571425,
+ "loss": 0.1803,
+ "step": 700
+ },
+ {
+ "epoch": 4.0285714285714285,
+ "grad_norm": 0.6795914769172668,
+ "learning_rate": 0.00023965714285714284,
+ "loss": 0.143,
+ "step": 705
+ },
+ {
+ "epoch": 4.057142857142857,
+ "grad_norm": 0.6832629442214966,
+ "learning_rate": 0.0002392285714285714,
+ "loss": 0.1457,
+ "step": 710
+ },
+ {
+ "epoch": 4.085714285714285,
+ "grad_norm": 3.8629798889160156,
+ "learning_rate": 0.0002388,
+ "loss": 0.1671,
+ "step": 715
+ },
+ {
+ "epoch": 4.114285714285714,
+ "grad_norm": 1.1167882680892944,
+ "learning_rate": 0.00023837142857142856,
+ "loss": 0.1544,
+ "step": 720
+ },
+ {
+ "epoch": 4.142857142857143,
+ "grad_norm": 0.9431412816047668,
+ "learning_rate": 0.00023794285714285712,
+ "loss": 0.1605,
+ "step": 725
+ },
+ {
+ "epoch": 4.171428571428572,
+ "grad_norm": 1.310948133468628,
+ "learning_rate": 0.0002375142857142857,
+ "loss": 0.1121,
+ "step": 730
+ },
+ {
+ "epoch": 4.2,
+ "grad_norm": 0.9830737709999084,
+ "learning_rate": 0.00023708571428571426,
+ "loss": 0.1742,
+ "step": 735
+ },
+ {
+ "epoch": 4.228571428571429,
+ "grad_norm": 0.6166555881500244,
+ "learning_rate": 0.00023665714285714282,
+ "loss": 0.1525,
+ "step": 740
+ },
+ {
+ "epoch": 4.257142857142857,
+ "grad_norm": 0.995579719543457,
+ "learning_rate": 0.00023622857142857143,
+ "loss": 0.1439,
+ "step": 745
+ },
+ {
+ "epoch": 4.285714285714286,
+ "grad_norm": 0.639796793460846,
+ "learning_rate": 0.00023579999999999999,
+ "loss": 0.1692,
+ "step": 750
+ },
+ {
+ "epoch": 4.314285714285714,
+ "grad_norm": 0.9438050389289856,
+ "learning_rate": 0.00023537142857142854,
+ "loss": 0.1785,
+ "step": 755
+ },
+ {
+ "epoch": 4.3428571428571425,
+ "grad_norm": 0.8960750102996826,
+ "learning_rate": 0.00023494285714285713,
+ "loss": 0.1557,
+ "step": 760
+ },
+ {
+ "epoch": 4.371428571428572,
+ "grad_norm": 0.6287499070167542,
+ "learning_rate": 0.00023451428571428568,
+ "loss": 0.1459,
+ "step": 765
+ },
+ {
+ "epoch": 4.4,
+ "grad_norm": 0.7638295888900757,
+ "learning_rate": 0.00023408571428571424,
+ "loss": 0.1341,
+ "step": 770
+ },
+ {
+ "epoch": 4.428571428571429,
+ "grad_norm": 0.655878484249115,
+ "learning_rate": 0.00023365714285714283,
+ "loss": 0.1358,
+ "step": 775
+ },
+ {
+ "epoch": 4.457142857142857,
+ "grad_norm": 0.5840997695922852,
+ "learning_rate": 0.0002332285714285714,
+ "loss": 0.1386,
+ "step": 780
+ },
+ {
+ "epoch": 4.485714285714286,
+ "grad_norm": 1.1082488298416138,
+ "learning_rate": 0.0002328,
+ "loss": 0.1827,
+ "step": 785
+ },
+ {
+ "epoch": 4.514285714285714,
+ "grad_norm": 0.8825240135192871,
+ "learning_rate": 0.00023237142857142855,
+ "loss": 0.1527,
+ "step": 790
+ },
+ {
+ "epoch": 4.542857142857143,
+ "grad_norm": 0.6752304434776306,
+ "learning_rate": 0.0002319428571428571,
+ "loss": 0.1392,
+ "step": 795
+ },
+ {
+ "epoch": 4.571428571428571,
+ "grad_norm": 1.1423301696777344,
+ "learning_rate": 0.0002315142857142857,
+ "loss": 0.1433,
+ "step": 800
+ },
+ {
+ "epoch": 4.6,
+ "grad_norm": 10.793691635131836,
+ "learning_rate": 0.00023108571428571425,
+ "loss": 0.1635,
+ "step": 805
+ },
+ {
+ "epoch": 4.628571428571428,
+ "grad_norm": 0.47564294934272766,
+ "learning_rate": 0.00023065714285714286,
+ "loss": 0.1199,
+ "step": 810
+ },
+ {
+ "epoch": 4.6571428571428575,
+ "grad_norm": 1.2492656707763672,
+ "learning_rate": 0.00023022857142857142,
+ "loss": 0.1488,
+ "step": 815
+ },
+ {
+ "epoch": 4.685714285714286,
+ "grad_norm": 0.6933501958847046,
+ "learning_rate": 0.00022979999999999997,
+ "loss": 0.1812,
+ "step": 820
+ },
+ {
+ "epoch": 4.714285714285714,
+ "grad_norm": 0.7901633977890015,
+ "learning_rate": 0.00022937142857142856,
+ "loss": 0.1415,
+ "step": 825
+ },
+ {
+ "epoch": 4.742857142857143,
+ "grad_norm": 0.7854829430580139,
+ "learning_rate": 0.00022894285714285712,
+ "loss": 0.1401,
+ "step": 830
+ },
+ {
+ "epoch": 4.771428571428571,
+ "grad_norm": 0.8716740608215332,
+ "learning_rate": 0.00022851428571428567,
+ "loss": 0.1982,
+ "step": 835
+ },
+ {
+ "epoch": 4.8,
+ "grad_norm": 0.7047899961471558,
+ "learning_rate": 0.00022808571428571426,
+ "loss": 0.1624,
+ "step": 840
+ },
+ {
+ "epoch": 4.828571428571428,
+ "grad_norm": 0.7134959697723389,
+ "learning_rate": 0.00022765714285714284,
+ "loss": 0.1375,
+ "step": 845
+ },
+ {
+ "epoch": 4.857142857142857,
+ "grad_norm": 1.0897325277328491,
+ "learning_rate": 0.00022722857142857143,
+ "loss": 0.1489,
+ "step": 850
+ },
+ {
+ "epoch": 4.885714285714286,
+ "grad_norm": 1.1065207719802856,
+ "learning_rate": 0.00022679999999999998,
+ "loss": 0.1495,
+ "step": 855
+ },
+ {
+ "epoch": 4.914285714285715,
+ "grad_norm": 0.7434757351875305,
+ "learning_rate": 0.00022637142857142854,
+ "loss": 0.1507,
+ "step": 860
+ },
+ {
+ "epoch": 4.942857142857143,
+ "grad_norm": 1.0045181512832642,
+ "learning_rate": 0.00022594285714285712,
+ "loss": 0.1527,
+ "step": 865
+ },
+ {
+ "epoch": 4.9714285714285715,
+ "grad_norm": 1.2025654315948486,
+ "learning_rate": 0.00022551428571428568,
+ "loss": 0.1523,
+ "step": 870
+ },
+ {
+ "epoch": 5.0,
+ "grad_norm": 0.7823342084884644,
+ "learning_rate": 0.0002250857142857143,
+ "loss": 0.1514,
+ "step": 875
+ },
+ {
+ "epoch": 5.0285714285714285,
+ "grad_norm": 0.8405362963676453,
+ "learning_rate": 0.00022465714285714285,
+ "loss": 0.1461,
+ "step": 880
+ },
+ {
+ "epoch": 5.057142857142857,
+ "grad_norm": 0.7527463436126709,
+ "learning_rate": 0.0002242285714285714,
+ "loss": 0.1206,
+ "step": 885
+ },
+ {
+ "epoch": 5.085714285714285,
+ "grad_norm": 0.8372548222541809,
+ "learning_rate": 0.0002238,
+ "loss": 0.1513,
+ "step": 890
+ },
+ {
+ "epoch": 5.114285714285714,
+ "grad_norm": 0.8755456209182739,
+ "learning_rate": 0.00022337142857142855,
+ "loss": 0.1498,
+ "step": 895
+ },
+ {
+ "epoch": 5.142857142857143,
+ "grad_norm": 0.7312084436416626,
+ "learning_rate": 0.0002229428571428571,
+ "loss": 0.154,
+ "step": 900
+ },
+ {
+ "epoch": 5.171428571428572,
+ "grad_norm": 0.6366221904754639,
+ "learning_rate": 0.0002225142857142857,
+ "loss": 0.1466,
+ "step": 905
+ },
+ {
+ "epoch": 5.2,
+ "grad_norm": 0.6406880617141724,
+ "learning_rate": 0.00022208571428571427,
+ "loss": 0.1254,
+ "step": 910
+ },
+ {
+ "epoch": 5.228571428571429,
+ "grad_norm": 2.4106833934783936,
+ "learning_rate": 0.00022165714285714283,
+ "loss": 0.1534,
+ "step": 915
+ },
+ {
+ "epoch": 5.257142857142857,
+ "grad_norm": 0.5635722279548645,
+ "learning_rate": 0.00022122857142857142,
+ "loss": 0.1461,
+ "step": 920
+ },
+ {
+ "epoch": 5.285714285714286,
+ "grad_norm": 0.787162184715271,
+ "learning_rate": 0.00022079999999999997,
+ "loss": 0.1424,
+ "step": 925
+ },
+ {
+ "epoch": 5.314285714285714,
+ "grad_norm": 0.6513975262641907,
+ "learning_rate": 0.00022037142857142853,
+ "loss": 0.1326,
+ "step": 930
+ },
+ {
+ "epoch": 5.3428571428571425,
+ "grad_norm": 0.6933534741401672,
+ "learning_rate": 0.00021994285714285711,
+ "loss": 0.1661,
+ "step": 935
+ },
+ {
+ "epoch": 5.371428571428572,
+ "grad_norm": 0.7263259887695312,
+ "learning_rate": 0.0002195142857142857,
+ "loss": 0.15,
+ "step": 940
+ },
+ {
+ "epoch": 5.4,
+ "grad_norm": 0.5537381768226624,
+ "learning_rate": 0.00021908571428571428,
+ "loss": 0.129,
+ "step": 945
+ },
+ {
+ "epoch": 5.428571428571429,
+ "grad_norm": 0.6014005541801453,
+ "learning_rate": 0.00021865714285714284,
+ "loss": 0.1321,
+ "step": 950
+ },
+ {
+ "epoch": 5.457142857142857,
+ "grad_norm": 0.6581441760063171,
+ "learning_rate": 0.0002182285714285714,
+ "loss": 0.1587,
+ "step": 955
+ },
+ {
+ "epoch": 5.485714285714286,
+ "grad_norm": 0.9326379895210266,
+ "learning_rate": 0.00021779999999999998,
+ "loss": 0.1654,
+ "step": 960
+ },
+ {
+ "epoch": 5.514285714285714,
+ "grad_norm": 0.9438592791557312,
+ "learning_rate": 0.00021737142857142854,
+ "loss": 0.1212,
+ "step": 965
+ },
+ {
+ "epoch": 5.542857142857143,
+ "grad_norm": 0.7699571251869202,
+ "learning_rate": 0.00021694285714285715,
+ "loss": 0.1464,
+ "step": 970
+ },
+ {
+ "epoch": 5.571428571428571,
+ "grad_norm": 0.8758366703987122,
+ "learning_rate": 0.0002165142857142857,
+ "loss": 0.1599,
+ "step": 975
+ },
+ {
+ "epoch": 5.6,
+ "grad_norm": 0.6101442575454712,
+ "learning_rate": 0.00021608571428571426,
+ "loss": 0.1589,
+ "step": 980
+ },
+ {
+ "epoch": 5.628571428571428,
+ "grad_norm": 0.7454060912132263,
+ "learning_rate": 0.00021565714285714285,
+ "loss": 0.1433,
+ "step": 985
+ },
+ {
+ "epoch": 5.6571428571428575,
+ "grad_norm": 0.6379484534263611,
+ "learning_rate": 0.0002152285714285714,
+ "loss": 0.1592,
+ "step": 990
+ },
+ {
+ "epoch": 5.685714285714286,
+ "grad_norm": 1.1601309776306152,
+ "learning_rate": 0.00021479999999999996,
+ "loss": 0.1647,
+ "step": 995
+ },
+ {
+ "epoch": 5.714285714285714,
+ "grad_norm": 0.5464673638343811,
+ "learning_rate": 0.00021437142857142855,
+ "loss": 0.1469,
+ "step": 1000
+ },
+ {
+ "epoch": 5.742857142857143,
+ "grad_norm": 1.0279319286346436,
+ "learning_rate": 0.00021394285714285713,
+ "loss": 0.1203,
+ "step": 1005
+ },
+ {
+ "epoch": 5.771428571428571,
+ "grad_norm": 0.5503718256950378,
+ "learning_rate": 0.00021351428571428572,
+ "loss": 0.1409,
+ "step": 1010
+ },
+ {
+ "epoch": 5.8,
+ "grad_norm": 0.6123886108398438,
+ "learning_rate": 0.00021308571428571427,
+ "loss": 0.1427,
+ "step": 1015
+ },
+ {
+ "epoch": 5.828571428571428,
+ "grad_norm": 0.6560390591621399,
+ "learning_rate": 0.00021265714285714283,
+ "loss": 0.1415,
+ "step": 1020
+ },
+ {
+ "epoch": 5.857142857142857,
+ "grad_norm": 0.5576716661453247,
+ "learning_rate": 0.00021222857142857141,
+ "loss": 0.1408,
+ "step": 1025
+ },
+ {
+ "epoch": 5.885714285714286,
+ "grad_norm": 0.6419074535369873,
+ "learning_rate": 0.00021179999999999997,
+ "loss": 0.1385,
+ "step": 1030
+ },
+ {
+ "epoch": 5.914285714285715,
+ "grad_norm": 1.008925199508667,
+ "learning_rate": 0.00021137142857142858,
+ "loss": 0.1497,
+ "step": 1035
+ },
+ {
+ "epoch": 5.942857142857143,
+ "grad_norm": 0.6559906005859375,
+ "learning_rate": 0.00021094285714285714,
+ "loss": 0.1218,
+ "step": 1040
+ },
+ {
+ "epoch": 5.9714285714285715,
+ "grad_norm": 0.627164363861084,
+ "learning_rate": 0.0002105142857142857,
+ "loss": 0.1368,
+ "step": 1045
+ },
+ {
+ "epoch": 6.0,
+ "grad_norm": 0.5760972499847412,
+ "learning_rate": 0.00021008571428571428,
+ "loss": 0.1508,
+ "step": 1050
+ },
+ {
+ "epoch": 6.0285714285714285,
+ "grad_norm": 0.5754174590110779,
+ "learning_rate": 0.00020965714285714284,
+ "loss": 0.1181,
+ "step": 1055
+ },
+ {
+ "epoch": 6.057142857142857,
+ "grad_norm": 0.8736348748207092,
+ "learning_rate": 0.0002092285714285714,
+ "loss": 0.1252,
+ "step": 1060
+ },
+ {
+ "epoch": 6.085714285714285,
+ "grad_norm": 0.7166719436645508,
+ "learning_rate": 0.00020879999999999998,
+ "loss": 0.1481,
+ "step": 1065
+ },
+ {
+ "epoch": 6.114285714285714,
+ "grad_norm": 0.6494349241256714,
+ "learning_rate": 0.00020837142857142856,
+ "loss": 0.1478,
+ "step": 1070
+ },
+ {
+ "epoch": 6.142857142857143,
+ "grad_norm": 0.6681587100028992,
+ "learning_rate": 0.00020794285714285712,
+ "loss": 0.1488,
+ "step": 1075
+ },
+ {
+ "epoch": 6.171428571428572,
+ "grad_norm": 0.7123684883117676,
+ "learning_rate": 0.0002075142857142857,
+ "loss": 0.1378,
+ "step": 1080
+ },
+ {
+ "epoch": 6.2,
+ "grad_norm": 0.6146950721740723,
+ "learning_rate": 0.00020708571428571426,
+ "loss": 0.1306,
+ "step": 1085
+ },
+ {
+ "epoch": 6.228571428571429,
+ "grad_norm": 0.8402445912361145,
+ "learning_rate": 0.00020665714285714282,
+ "loss": 0.1063,
+ "step": 1090
+ },
+ {
+ "epoch": 6.257142857142857,
+ "grad_norm": 0.6567764282226562,
+ "learning_rate": 0.0002062285714285714,
+ "loss": 0.1195,
+ "step": 1095
+ },
+ {
+ "epoch": 6.285714285714286,
+ "grad_norm": 0.6006014943122864,
+ "learning_rate": 0.0002058,
+ "loss": 0.1542,
+ "step": 1100
+ },
+ {
+ "epoch": 6.314285714285714,
+ "grad_norm": 0.793100893497467,
+ "learning_rate": 0.00020537142857142857,
+ "loss": 0.1381,
+ "step": 1105
+ },
+ {
+ "epoch": 6.3428571428571425,
+ "grad_norm": 0.5923666954040527,
+ "learning_rate": 0.00020494285714285713,
+ "loss": 0.1386,
+ "step": 1110
+ },
+ {
+ "epoch": 6.371428571428572,
+ "grad_norm": 0.6692521572113037,
+ "learning_rate": 0.0002045142857142857,
+ "loss": 0.1223,
+ "step": 1115
+ },
+ {
+ "epoch": 6.4,
+ "grad_norm": 0.7216306328773499,
+ "learning_rate": 0.00020408571428571427,
+ "loss": 0.1367,
+ "step": 1120
+ },
+ {
+ "epoch": 6.428571428571429,
+ "grad_norm": 0.5640934109687805,
+ "learning_rate": 0.00020365714285714283,
+ "loss": 0.1554,
+ "step": 1125
+ },
+ {
+ "epoch": 6.457142857142857,
+ "grad_norm": 0.8154368996620178,
+ "learning_rate": 0.00020322857142857138,
+ "loss": 0.1674,
+ "step": 1130
+ },
+ {
+ "epoch": 6.485714285714286,
+ "grad_norm": 0.7185398936271667,
+ "learning_rate": 0.0002028,
+ "loss": 0.1375,
+ "step": 1135
+ },
+ {
+ "epoch": 6.514285714285714,
+ "grad_norm": 0.6805170774459839,
+ "learning_rate": 0.00020237142857142855,
+ "loss": 0.1306,
+ "step": 1140
+ },
+ {
+ "epoch": 6.542857142857143,
+ "grad_norm": 0.5996941924095154,
+ "learning_rate": 0.00020194285714285714,
+ "loss": 0.1433,
+ "step": 1145
+ },
+ {
+ "epoch": 6.571428571428571,
+ "grad_norm": 0.5258373022079468,
+ "learning_rate": 0.0002015142857142857,
+ "loss": 0.1285,
+ "step": 1150
+ },
+ {
+ "epoch": 6.6,
+ "grad_norm": 0.7771695256233215,
+ "learning_rate": 0.00020108571428571425,
+ "loss": 0.1493,
+ "step": 1155
+ },
+ {
+ "epoch": 6.628571428571428,
+ "grad_norm": 0.5920616388320923,
+ "learning_rate": 0.00020065714285714284,
+ "loss": 0.1479,
+ "step": 1160
+ },
+ {
+ "epoch": 6.6571428571428575,
+ "grad_norm": 0.7460982799530029,
+ "learning_rate": 0.00020022857142857142,
+ "loss": 0.1173,
+ "step": 1165
+ },
+ {
+ "epoch": 6.685714285714286,
+ "grad_norm": 1.1703822612762451,
+ "learning_rate": 0.0001998,
+ "loss": 0.1402,
+ "step": 1170
+ },
+ {
+ "epoch": 6.714285714285714,
+ "grad_norm": 0.7894724011421204,
+ "learning_rate": 0.00019937142857142856,
+ "loss": 0.1253,
+ "step": 1175
+ },
+ {
+ "epoch": 6.742857142857143,
+ "grad_norm": 0.7013376355171204,
+ "learning_rate": 0.00019894285714285712,
+ "loss": 0.1573,
+ "step": 1180
+ },
+ {
+ "epoch": 6.771428571428571,
+ "grad_norm": 0.6421737670898438,
+ "learning_rate": 0.0001985142857142857,
+ "loss": 0.1497,
+ "step": 1185
+ },
+ {
+ "epoch": 6.8,
+ "grad_norm": 1.204296350479126,
+ "learning_rate": 0.00019808571428571426,
+ "loss": 0.1634,
+ "step": 1190
+ },
+ {
+ "epoch": 6.828571428571428,
+ "grad_norm": 0.867765486240387,
+ "learning_rate": 0.00019765714285714282,
+ "loss": 0.1353,
+ "step": 1195
+ },
+ {
+ "epoch": 6.857142857142857,
+ "grad_norm": 0.7325594425201416,
+ "learning_rate": 0.00019722857142857143,
+ "loss": 0.118,
+ "step": 1200
+ },
+ {
+ "epoch": 6.885714285714286,
+ "grad_norm": 0.7029078006744385,
+ "learning_rate": 0.00019679999999999999,
+ "loss": 0.1425,
+ "step": 1205
+ },
+ {
+ "epoch": 6.914285714285715,
+ "grad_norm": 1.1572504043579102,
+ "learning_rate": 0.00019637142857142857,
+ "loss": 0.1337,
+ "step": 1210
+ },
+ {
+ "epoch": 6.942857142857143,
+ "grad_norm": 0.8022822141647339,
+ "learning_rate": 0.00019594285714285713,
+ "loss": 0.1684,
+ "step": 1215
+ },
+ {
+ "epoch": 6.9714285714285715,
+ "grad_norm": 0.6729874610900879,
+ "learning_rate": 0.00019551428571428568,
+ "loss": 0.1238,
+ "step": 1220
+ },
+ {
+ "epoch": 7.0,
+ "grad_norm": 0.5773627758026123,
+ "learning_rate": 0.00019508571428571427,
+ "loss": 0.138,
+ "step": 1225
+ },
+ {
+ "epoch": 7.0285714285714285,
+ "grad_norm": 0.7182291150093079,
+ "learning_rate": 0.00019465714285714285,
+ "loss": 0.1431,
+ "step": 1230
+ },
+ {
+ "epoch": 7.057142857142857,
+ "grad_norm": 1.7567912340164185,
+ "learning_rate": 0.0001942285714285714,
+ "loss": 0.1319,
+ "step": 1235
+ },
+ {
+ "epoch": 7.085714285714285,
+ "grad_norm": 0.6845232248306274,
+ "learning_rate": 0.0001938,
+ "loss": 0.1292,
+ "step": 1240
+ },
+ {
+ "epoch": 7.114285714285714,
+ "grad_norm": 0.6077771782875061,
+ "learning_rate": 0.00019337142857142855,
+ "loss": 0.1238,
+ "step": 1245
+ },
+ {
+ "epoch": 7.142857142857143,
+ "grad_norm": 0.6168347597122192,
+ "learning_rate": 0.0001929428571428571,
+ "loss": 0.1384,
+ "step": 1250
+ },
+ {
+ "epoch": 7.171428571428572,
+ "grad_norm": 0.7457576394081116,
+ "learning_rate": 0.0001925142857142857,
+ "loss": 0.1306,
+ "step": 1255
+ },
+ {
+ "epoch": 7.2,
+ "grad_norm": 0.5969316363334656,
+ "learning_rate": 0.00019208571428571425,
+ "loss": 0.1123,
+ "step": 1260
+ },
+ {
+ "epoch": 7.228571428571429,
+ "grad_norm": 0.6902753710746765,
+ "learning_rate": 0.00019165714285714286,
+ "loss": 0.1185,
+ "step": 1265
+ },
+ {
+ "epoch": 7.257142857142857,
+ "grad_norm": 0.6488338112831116,
+ "learning_rate": 0.00019122857142857142,
+ "loss": 0.1431,
+ "step": 1270
+ },
+ {
+ "epoch": 7.285714285714286,
+ "grad_norm": 0.6814819574356079,
+ "learning_rate": 0.00019079999999999998,
+ "loss": 0.1495,
+ "step": 1275
+ },
+ {
+ "epoch": 7.314285714285714,
+ "grad_norm": 0.7468088865280151,
+ "learning_rate": 0.00019037142857142856,
+ "loss": 0.1158,
+ "step": 1280
+ },
+ {
+ "epoch": 7.3428571428571425,
+ "grad_norm": 0.7417412400245667,
+ "learning_rate": 0.00018994285714285712,
+ "loss": 0.1311,
+ "step": 1285
+ },
+ {
+ "epoch": 7.371428571428572,
+ "grad_norm": 0.5480664372444153,
+ "learning_rate": 0.00018951428571428567,
+ "loss": 0.135,
+ "step": 1290
+ },
+ {
+ "epoch": 7.4,
+ "grad_norm": 0.725527822971344,
+ "learning_rate": 0.00018908571428571429,
+ "loss": 0.1217,
+ "step": 1295
+ },
+ {
+ "epoch": 7.428571428571429,
+ "grad_norm": 0.6566678285598755,
+ "learning_rate": 0.00018865714285714284,
+ "loss": 0.1417,
+ "step": 1300
+ },
+ {
+ "epoch": 7.457142857142857,
+ "grad_norm": 0.516952395439148,
+ "learning_rate": 0.00018822857142857143,
+ "loss": 0.1329,
+ "step": 1305
+ },
+ {
+ "epoch": 7.485714285714286,
+ "grad_norm": 1.9545241594314575,
+ "learning_rate": 0.00018779999999999998,
+ "loss": 0.1339,
+ "step": 1310
+ },
+ {
+ "epoch": 7.514285714285714,
+ "grad_norm": 0.8276839852333069,
+ "learning_rate": 0.00018737142857142854,
+ "loss": 0.1324,
+ "step": 1315
+ },
+ {
+ "epoch": 7.542857142857143,
+ "grad_norm": 0.6737099289894104,
+ "learning_rate": 0.00018694285714285713,
+ "loss": 0.1139,
+ "step": 1320
+ },
+ {
+ "epoch": 7.571428571428571,
+ "grad_norm": 0.6914472579956055,
+ "learning_rate": 0.00018651428571428568,
+ "loss": 0.1146,
+ "step": 1325
+ },
+ {
+ "epoch": 7.6,
+ "grad_norm": 0.6630033850669861,
+ "learning_rate": 0.0001860857142857143,
+ "loss": 0.1571,
+ "step": 1330
+ },
+ {
+ "epoch": 7.628571428571428,
+ "grad_norm": 0.820688784122467,
+ "learning_rate": 0.00018565714285714285,
+ "loss": 0.15,
+ "step": 1335
+ },
+ {
+ "epoch": 7.6571428571428575,
+ "grad_norm": 2.0491325855255127,
+ "learning_rate": 0.0001852285714285714,
+ "loss": 0.127,
+ "step": 1340
+ },
+ {
+ "epoch": 7.685714285714286,
+ "grad_norm": 0.9327268004417419,
+ "learning_rate": 0.0001848,
+ "loss": 0.1289,
+ "step": 1345
+ },
+ {
+ "epoch": 7.714285714285714,
+ "grad_norm": 1.3131701946258545,
+ "learning_rate": 0.00018437142857142855,
+ "loss": 0.1228,
+ "step": 1350
+ },
+ {
+ "epoch": 7.742857142857143,
+ "grad_norm": 2.955918312072754,
+ "learning_rate": 0.0001839428571428571,
+ "loss": 0.1082,
+ "step": 1355
+ },
+ {
+ "epoch": 7.771428571428571,
+ "grad_norm": 1.2165493965148926,
+ "learning_rate": 0.00018351428571428572,
+ "loss": 0.1688,
+ "step": 1360
+ },
+ {
+ "epoch": 7.8,
+ "grad_norm": 0.759324312210083,
+ "learning_rate": 0.00018308571428571428,
+ "loss": 0.1185,
+ "step": 1365
+ },
+ {
+ "epoch": 7.828571428571428,
+ "grad_norm": 0.7445591688156128,
+ "learning_rate": 0.00018265714285714286,
+ "loss": 0.1431,
+ "step": 1370
+ },
+ {
+ "epoch": 7.857142857142857,
+ "grad_norm": 0.679374098777771,
+ "learning_rate": 0.00018222857142857142,
+ "loss": 0.1451,
+ "step": 1375
+ },
+ {
+ "epoch": 7.885714285714286,
+ "grad_norm": 2.1234302520751953,
+ "learning_rate": 0.00018179999999999997,
+ "loss": 0.1265,
+ "step": 1380
+ },
+ {
+ "epoch": 7.914285714285715,
+ "grad_norm": 1.006521224975586,
+ "learning_rate": 0.00018137142857142856,
+ "loss": 0.1722,
+ "step": 1385
+ },
+ {
+ "epoch": 7.942857142857143,
+ "grad_norm": 0.7275253534317017,
+ "learning_rate": 0.00018094285714285712,
+ "loss": 0.1625,
+ "step": 1390
+ },
+ {
+ "epoch": 7.9714285714285715,
+ "grad_norm": 0.8612022995948792,
+ "learning_rate": 0.0001805142857142857,
+ "loss": 0.1345,
+ "step": 1395
+ },
+ {
+ "epoch": 8.0,
+ "grad_norm": 0.7276798486709595,
+ "learning_rate": 0.00018008571428571428,
+ "loss": 0.1236,
+ "step": 1400
+ },
+ {
+ "epoch": 8.028571428571428,
+ "grad_norm": 0.8731086850166321,
+ "learning_rate": 0.00017965714285714284,
+ "loss": 0.1604,
+ "step": 1405
+ },
+ {
+ "epoch": 8.057142857142857,
+ "grad_norm": 0.8950818777084351,
+ "learning_rate": 0.0001792285714285714,
+ "loss": 0.1531,
+ "step": 1410
+ },
+ {
+ "epoch": 8.085714285714285,
+ "grad_norm": 0.7399356365203857,
+ "learning_rate": 0.00017879999999999998,
+ "loss": 0.1508,
+ "step": 1415
+ },
+ {
+ "epoch": 8.114285714285714,
+ "grad_norm": 1.3727307319641113,
+ "learning_rate": 0.00017837142857142854,
+ "loss": 0.1487,
+ "step": 1420
+ },
+ {
+ "epoch": 8.142857142857142,
+ "grad_norm": 0.5938125848770142,
+ "learning_rate": 0.00017794285714285715,
+ "loss": 0.1303,
+ "step": 1425
+ },
+ {
+ "epoch": 8.17142857142857,
+ "grad_norm": 0.7043821811676025,
+ "learning_rate": 0.0001775142857142857,
+ "loss": 0.0948,
+ "step": 1430
+ },
+ {
+ "epoch": 8.2,
+ "grad_norm": 1.1062767505645752,
+ "learning_rate": 0.00017708571428571426,
+ "loss": 0.1412,
+ "step": 1435
+ },
+ {
+ "epoch": 8.228571428571428,
+ "grad_norm": 0.844832181930542,
+ "learning_rate": 0.00017665714285714285,
+ "loss": 0.113,
+ "step": 1440
+ },
+ {
+ "epoch": 8.257142857142856,
+ "grad_norm": 0.7564154863357544,
+ "learning_rate": 0.0001762285714285714,
+ "loss": 0.1319,
+ "step": 1445
+ },
+ {
+ "epoch": 8.285714285714286,
+ "grad_norm": 0.8843110203742981,
+ "learning_rate": 0.00017579999999999996,
+ "loss": 0.1206,
+ "step": 1450
+ },
+ {
+ "epoch": 8.314285714285715,
+ "grad_norm": 0.8175828456878662,
+ "learning_rate": 0.00017537142857142855,
+ "loss": 0.1327,
+ "step": 1455
+ },
+ {
+ "epoch": 8.342857142857143,
+ "grad_norm": 0.6443565487861633,
+ "learning_rate": 0.00017494285714285713,
+ "loss": 0.1239,
+ "step": 1460
+ },
+ {
+ "epoch": 8.371428571428572,
+ "grad_norm": 0.7237185835838318,
+ "learning_rate": 0.00017451428571428572,
+ "loss": 0.1639,
+ "step": 1465
+ },
+ {
+ "epoch": 8.4,
+ "grad_norm": 0.6118057370185852,
+ "learning_rate": 0.00017408571428571427,
+ "loss": 0.1363,
+ "step": 1470
+ },
+ {
+ "epoch": 8.428571428571429,
+ "grad_norm": 0.6754649877548218,
+ "learning_rate": 0.00017365714285714283,
+ "loss": 0.1187,
+ "step": 1475
+ },
+ {
+ "epoch": 8.457142857142857,
+ "grad_norm": 1.0067390203475952,
+ "learning_rate": 0.00017322857142857141,
+ "loss": 0.1401,
+ "step": 1480
+ },
+ {
+ "epoch": 8.485714285714286,
+ "grad_norm": 8.509544372558594,
+ "learning_rate": 0.00017279999999999997,
+ "loss": 0.1304,
+ "step": 1485
+ },
+ {
+ "epoch": 8.514285714285714,
+ "grad_norm": 4.2030205726623535,
+ "learning_rate": 0.00017237142857142858,
+ "loss": 0.121,
+ "step": 1490
+ },
+ {
+ "epoch": 8.542857142857143,
+ "grad_norm": 4.877438068389893,
+ "learning_rate": 0.00017194285714285714,
+ "loss": 0.1918,
+ "step": 1495
+ },
+ {
+ "epoch": 8.571428571428571,
+ "grad_norm": 6.4971232414245605,
+ "learning_rate": 0.0001715142857142857,
+ "loss": 0.2154,
+ "step": 1500
+ },
+ {
+ "epoch": 8.6,
+ "grad_norm": 4.365469932556152,
+ "learning_rate": 0.00017108571428571428,
+ "loss": 0.2272,
+ "step": 1505
+ },
+ {
+ "epoch": 8.628571428571428,
+ "grad_norm": 2.551957845687866,
+ "learning_rate": 0.00017065714285714284,
+ "loss": 0.2163,
+ "step": 1510
+ },
+ {
+ "epoch": 8.657142857142857,
+ "grad_norm": 5.326391220092773,
+ "learning_rate": 0.0001702285714285714,
+ "loss": 0.1612,
+ "step": 1515
+ },
+ {
+ "epoch": 8.685714285714285,
+ "grad_norm": 1.3528404235839844,
+ "learning_rate": 0.00016979999999999998,
+ "loss": 0.1636,
+ "step": 1520
+ },
+ {
+ "epoch": 8.714285714285714,
+ "grad_norm": 1.4466065168380737,
+ "learning_rate": 0.00016937142857142856,
+ "loss": 0.1295,
+ "step": 1525
+ },
+ {
+ "epoch": 8.742857142857144,
+ "grad_norm": 0.6576040387153625,
+ "learning_rate": 0.00016894285714285715,
+ "loss": 0.1318,
+ "step": 1530
+ },
+ {
+ "epoch": 8.771428571428572,
+ "grad_norm": 1.286942958831787,
+ "learning_rate": 0.0001685142857142857,
+ "loss": 0.1443,
+ "step": 1535
+ },
+ {
+ "epoch": 8.8,
+ "grad_norm": 9.474458694458008,
+ "learning_rate": 0.00016808571428571426,
+ "loss": 0.1313,
+ "step": 1540
+ },
+ {
+ "epoch": 8.82857142857143,
+ "grad_norm": 2.6731069087982178,
+ "learning_rate": 0.00016765714285714285,
+ "loss": 0.1485,
+ "step": 1545
+ },
+ {
+ "epoch": 8.857142857142858,
+ "grad_norm": 1.313723087310791,
+ "learning_rate": 0.0001672285714285714,
+ "loss": 0.1346,
+ "step": 1550
+ },
+ {
+ "epoch": 8.885714285714286,
+ "grad_norm": 1.7115576267242432,
+ "learning_rate": 0.0001668,
+ "loss": 0.1471,
+ "step": 1555
+ },
+ {
+ "epoch": 8.914285714285715,
+ "grad_norm": 1.2599923610687256,
+ "learning_rate": 0.00016637142857142857,
+ "loss": 0.1433,
+ "step": 1560
+ },
+ {
+ "epoch": 8.942857142857143,
+ "grad_norm": 0.9659029245376587,
+ "learning_rate": 0.00016594285714285713,
+ "loss": 0.1256,
+ "step": 1565
+ },
+ {
+ "epoch": 8.971428571428572,
+ "grad_norm": 1.1282744407653809,
+ "learning_rate": 0.0001655142857142857,
+ "loss": 0.1373,
+ "step": 1570
+ },
+ {
+ "epoch": 9.0,
+ "grad_norm": 3.20717453956604,
+ "learning_rate": 0.00016508571428571427,
+ "loss": 0.1355,
+ "step": 1575
+ },
+ {
+ "epoch": 9.028571428571428,
+ "grad_norm": 0.8310821056365967,
+ "learning_rate": 0.00016465714285714283,
+ "loss": 0.1268,
+ "step": 1580
+ },
+ {
+ "epoch": 9.057142857142857,
+ "grad_norm": 1.5337790250778198,
+ "learning_rate": 0.00016422857142857139,
+ "loss": 0.1267,
+ "step": 1585
+ },
+ {
+ "epoch": 9.085714285714285,
+ "grad_norm": 2.6406068801879883,
+ "learning_rate": 0.0001638,
+ "loss": 0.1363,
+ "step": 1590
+ },
+ {
+ "epoch": 9.114285714285714,
+ "grad_norm": 0.7705873847007751,
+ "learning_rate": 0.00016337142857142855,
+ "loss": 0.1291,
+ "step": 1595
+ },
+ {
+ "epoch": 9.142857142857142,
+ "grad_norm": 0.7092650532722473,
+ "learning_rate": 0.00016294285714285714,
+ "loss": 0.1435,
+ "step": 1600
+ },
+ {
+ "epoch": 9.17142857142857,
+ "grad_norm": 1.098961591720581,
+ "learning_rate": 0.0001625142857142857,
+ "loss": 0.1471,
+ "step": 1605
+ },
+ {
+ "epoch": 9.2,
+ "grad_norm": 0.6994885206222534,
+ "learning_rate": 0.00016208571428571425,
+ "loss": 0.1345,
+ "step": 1610
+ },
+ {
+ "epoch": 9.228571428571428,
+ "grad_norm": 0.9613476991653442,
+ "learning_rate": 0.00016165714285714284,
+ "loss": 0.1399,
+ "step": 1615
+ },
+ {
+ "epoch": 9.257142857142856,
+ "grad_norm": 0.675588846206665,
+ "learning_rate": 0.00016122857142857142,
+ "loss": 0.1319,
+ "step": 1620
+ },
+ {
+ "epoch": 9.285714285714286,
+ "grad_norm": 0.7519372701644897,
+ "learning_rate": 0.0001608,
+ "loss": 0.137,
+ "step": 1625
+ },
+ {
+ "epoch": 9.314285714285715,
+ "grad_norm": 1.135025978088379,
+ "learning_rate": 0.00016037142857142856,
+ "loss": 0.1322,
+ "step": 1630
+ },
+ {
+ "epoch": 9.342857142857143,
+ "grad_norm": 0.7462936639785767,
+ "learning_rate": 0.00015994285714285712,
+ "loss": 0.1215,
+ "step": 1635
+ },
+ {
+ "epoch": 9.371428571428572,
+ "grad_norm": 0.9042088985443115,
+ "learning_rate": 0.0001595142857142857,
+ "loss": 0.1191,
+ "step": 1640
+ },
+ {
+ "epoch": 9.4,
+ "grad_norm": 0.567828893661499,
+ "learning_rate": 0.00015908571428571426,
+ "loss": 0.1189,
+ "step": 1645
+ },
+ {
+ "epoch": 9.428571428571429,
+ "grad_norm": 0.981585681438446,
+ "learning_rate": 0.00015865714285714282,
+ "loss": 0.128,
+ "step": 1650
+ },
+ {
+ "epoch": 9.457142857142857,
+ "grad_norm": 1.24985933303833,
+ "learning_rate": 0.00015822857142857143,
+ "loss": 0.1315,
+ "step": 1655
+ },
+ {
+ "epoch": 9.485714285714286,
+ "grad_norm": 0.6517993211746216,
+ "learning_rate": 0.0001578,
+ "loss": 0.1076,
+ "step": 1660
+ },
+ {
+ "epoch": 9.514285714285714,
+ "grad_norm": 1.166628122329712,
+ "learning_rate": 0.00015737142857142857,
+ "loss": 0.1345,
+ "step": 1665
+ },
+ {
+ "epoch": 9.542857142857143,
+ "grad_norm": 0.9763592481613159,
+ "learning_rate": 0.00015694285714285713,
+ "loss": 0.1449,
+ "step": 1670
+ },
+ {
+ "epoch": 9.571428571428571,
+ "grad_norm": 0.7829060554504395,
+ "learning_rate": 0.00015651428571428569,
+ "loss": 0.1117,
+ "step": 1675
+ },
+ {
+ "epoch": 9.6,
+ "grad_norm": 0.6693719029426575,
+ "learning_rate": 0.00015608571428571427,
+ "loss": 0.1129,
+ "step": 1680
+ },
+ {
+ "epoch": 9.628571428571428,
+ "grad_norm": 1.2122846841812134,
+ "learning_rate": 0.00015565714285714285,
+ "loss": 0.1125,
+ "step": 1685
+ },
+ {
+ "epoch": 9.657142857142857,
+ "grad_norm": 1.0689371824264526,
+ "learning_rate": 0.0001552285714285714,
+ "loss": 0.1478,
+ "step": 1690
+ },
+ {
+ "epoch": 9.685714285714285,
+ "grad_norm": 1.8511656522750854,
+ "learning_rate": 0.0001548,
+ "loss": 0.1431,
+ "step": 1695
+ },
+ {
+ "epoch": 9.714285714285714,
+ "grad_norm": 0.6706506609916687,
+ "learning_rate": 0.00015437142857142855,
+ "loss": 0.1262,
+ "step": 1700
+ },
+ {
+ "epoch": 9.742857142857144,
+ "grad_norm": 1.0798784494400024,
+ "learning_rate": 0.00015394285714285714,
+ "loss": 0.1275,
+ "step": 1705
+ },
+ {
+ "epoch": 9.771428571428572,
+ "grad_norm": 0.7915983200073242,
+ "learning_rate": 0.0001535142857142857,
+ "loss": 0.1316,
+ "step": 1710
+ },
+ {
+ "epoch": 9.8,
+ "grad_norm": 1.8630567789077759,
+ "learning_rate": 0.00015308571428571425,
+ "loss": 0.1258,
+ "step": 1715
+ },
+ {
+ "epoch": 9.82857142857143,
+ "grad_norm": 0.7807756662368774,
+ "learning_rate": 0.00015265714285714286,
+ "loss": 0.1079,
+ "step": 1720
+ },
+ {
+ "epoch": 9.857142857142858,
+ "grad_norm": 1.4698439836502075,
+ "learning_rate": 0.00015222857142857142,
+ "loss": 0.1357,
+ "step": 1725
+ },
+ {
+ "epoch": 9.885714285714286,
+ "grad_norm": 1.2121926546096802,
+ "learning_rate": 0.00015179999999999998,
+ "loss": 0.1322,
+ "step": 1730
+ },
+ {
+ "epoch": 9.914285714285715,
+ "grad_norm": 0.6348568201065063,
+ "learning_rate": 0.00015137142857142856,
+ "loss": 0.0893,
+ "step": 1735
+ },
+ {
+ "epoch": 9.942857142857143,
+ "grad_norm": 0.6694422364234924,
+ "learning_rate": 0.00015094285714285712,
+ "loss": 0.1189,
+ "step": 1740
+ },
+ {
+ "epoch": 9.971428571428572,
+ "grad_norm": 0.569332480430603,
+ "learning_rate": 0.00015051428571428567,
+ "loss": 0.1349,
+ "step": 1745
+ },
+ {
+ "epoch": 10.0,
+ "grad_norm": 0.934073269367218,
+ "learning_rate": 0.00015008571428571429,
+ "loss": 0.1237,
+ "step": 1750
+ },
+ {
+ "epoch": 10.028571428571428,
+ "grad_norm": 0.7191672325134277,
+ "learning_rate": 0.00014965714285714284,
+ "loss": 0.1308,
+ "step": 1755
+ },
+ {
+ "epoch": 10.057142857142857,
+ "grad_norm": 0.7006493806838989,
+ "learning_rate": 0.00014922857142857143,
+ "loss": 0.104,
+ "step": 1760
+ },
+ {
+ "epoch": 10.085714285714285,
+ "grad_norm": 0.9030678272247314,
+ "learning_rate": 0.00014879999999999998,
+ "loss": 0.1308,
+ "step": 1765
+ },
+ {
+ "epoch": 10.114285714285714,
+ "grad_norm": 0.7007766366004944,
+ "learning_rate": 0.00014837142857142854,
+ "loss": 0.1044,
+ "step": 1770
+ },
+ {
+ "epoch": 10.142857142857142,
+ "grad_norm": 0.4832770824432373,
+ "learning_rate": 0.00014794285714285713,
+ "loss": 0.1119,
+ "step": 1775
+ },
+ {
+ "epoch": 10.17142857142857,
+ "grad_norm": 0.7819458842277527,
+ "learning_rate": 0.0001475142857142857,
+ "loss": 0.1087,
+ "step": 1780
+ },
+ {
+ "epoch": 10.2,
+ "grad_norm": 1.0223525762557983,
+ "learning_rate": 0.00014708571428571427,
+ "loss": 0.1314,
+ "step": 1785
+ },
+ {
+ "epoch": 10.228571428571428,
+ "grad_norm": 0.6224566698074341,
+ "learning_rate": 0.00014665714285714285,
+ "loss": 0.1159,
+ "step": 1790
+ },
+ {
+ "epoch": 10.257142857142856,
+ "grad_norm": 0.45800235867500305,
+ "learning_rate": 0.0001462285714285714,
+ "loss": 0.0942,
+ "step": 1795
+ },
+ {
+ "epoch": 10.285714285714286,
+ "grad_norm": 0.6258400082588196,
+ "learning_rate": 0.0001458,
+ "loss": 0.1079,
+ "step": 1800
+ },
+ {
+ "epoch": 10.314285714285715,
+ "grad_norm": 1.1812794208526611,
+ "learning_rate": 0.00014537142857142858,
+ "loss": 0.1378,
+ "step": 1805
+ },
+ {
+ "epoch": 10.342857142857143,
+ "grad_norm": 0.8541269898414612,
+ "learning_rate": 0.00014494285714285713,
+ "loss": 0.1274,
+ "step": 1810
+ },
+ {
+ "epoch": 10.371428571428572,
+ "grad_norm": 0.7131860256195068,
+ "learning_rate": 0.0001445142857142857,
+ "loss": 0.1247,
+ "step": 1815
+ },
+ {
+ "epoch": 10.4,
+ "grad_norm": 0.6109820008277893,
+ "learning_rate": 0.00014408571428571428,
+ "loss": 0.1246,
+ "step": 1820
+ },
+ {
+ "epoch": 10.428571428571429,
+ "grad_norm": 0.5621510744094849,
+ "learning_rate": 0.00014365714285714286,
+ "loss": 0.1039,
+ "step": 1825
+ },
+ {
+ "epoch": 10.457142857142857,
+ "grad_norm": 1.022777795791626,
+ "learning_rate": 0.00014322857142857142,
+ "loss": 0.1206,
+ "step": 1830
+ },
+ {
+ "epoch": 10.485714285714286,
+ "grad_norm": 0.9120668768882751,
+ "learning_rate": 0.00014279999999999997,
+ "loss": 0.1289,
+ "step": 1835
+ },
+ {
+ "epoch": 10.514285714285714,
+ "grad_norm": 1.1882030963897705,
+ "learning_rate": 0.00014237142857142856,
+ "loss": 0.1194,
+ "step": 1840
+ },
+ {
+ "epoch": 10.542857142857143,
+ "grad_norm": 0.6078401207923889,
+ "learning_rate": 0.00014194285714285714,
+ "loss": 0.1339,
+ "step": 1845
+ },
+ {
+ "epoch": 10.571428571428571,
+ "grad_norm": 0.7380999326705933,
+ "learning_rate": 0.0001415142857142857,
+ "loss": 0.1318,
+ "step": 1850
+ },
+ {
+ "epoch": 10.6,
+ "grad_norm": 0.5884959101676941,
+ "learning_rate": 0.00014108571428571428,
+ "loss": 0.1249,
+ "step": 1855
+ },
+ {
+ "epoch": 10.628571428571428,
+ "grad_norm": 1.0121936798095703,
+ "learning_rate": 0.00014065714285714284,
+ "loss": 0.1137,
+ "step": 1860
+ },
+ {
+ "epoch": 10.657142857142857,
+ "grad_norm": 0.6444916129112244,
+ "learning_rate": 0.00014022857142857143,
+ "loss": 0.1213,
+ "step": 1865
+ },
+ {
+ "epoch": 10.685714285714285,
+ "grad_norm": 0.7931004762649536,
+ "learning_rate": 0.00013979999999999998,
+ "loss": 0.1318,
+ "step": 1870
+ },
+ {
+ "epoch": 10.714285714285714,
+ "grad_norm": 0.5596404075622559,
+ "learning_rate": 0.00013937142857142857,
+ "loss": 0.1075,
+ "step": 1875
+ },
+ {
+ "epoch": 10.742857142857144,
+ "grad_norm": 0.6586474180221558,
+ "learning_rate": 0.00013894285714285712,
+ "loss": 0.13,
+ "step": 1880
+ },
+ {
+ "epoch": 10.771428571428572,
+ "grad_norm": 1.0195013284683228,
+ "learning_rate": 0.00013851428571428568,
+ "loss": 0.1373,
+ "step": 1885
+ },
+ {
+ "epoch": 10.8,
+ "grad_norm": 0.9233512878417969,
+ "learning_rate": 0.00013808571428571427,
+ "loss": 0.1168,
+ "step": 1890
+ },
+ {
+ "epoch": 10.82857142857143,
+ "grad_norm": 0.7154092788696289,
+ "learning_rate": 0.00013765714285714285,
+ "loss": 0.1081,
+ "step": 1895
+ },
+ {
+ "epoch": 10.857142857142858,
+ "grad_norm": 1.4588117599487305,
+ "learning_rate": 0.0001372285714285714,
+ "loss": 0.1061,
+ "step": 1900
+ },
+ {
+ "epoch": 10.885714285714286,
+ "grad_norm": 0.6087035536766052,
+ "learning_rate": 0.0001368,
+ "loss": 0.1157,
+ "step": 1905
+ },
+ {
+ "epoch": 10.914285714285715,
+ "grad_norm": 0.7371247410774231,
+ "learning_rate": 0.00013637142857142855,
+ "loss": 0.1339,
+ "step": 1910
+ },
+ {
+ "epoch": 10.942857142857143,
+ "grad_norm": 0.8253212571144104,
+ "learning_rate": 0.00013594285714285713,
+ "loss": 0.1198,
+ "step": 1915
+ },
+ {
+ "epoch": 10.971428571428572,
+ "grad_norm": 0.6889544129371643,
+ "learning_rate": 0.00013551428571428572,
+ "loss": 0.1131,
+ "step": 1920
+ },
+ {
+ "epoch": 11.0,
+ "grad_norm": 0.6408224105834961,
+ "learning_rate": 0.00013508571428571427,
+ "loss": 0.122,
+ "step": 1925
+ },
+ {
+ "epoch": 11.028571428571428,
+ "grad_norm": 0.6771185398101807,
+ "learning_rate": 0.00013465714285714283,
+ "loss": 0.1492,
+ "step": 1930
+ },
+ {
+ "epoch": 11.057142857142857,
+ "grad_norm": 0.8706450462341309,
+ "learning_rate": 0.00013422857142857142,
+ "loss": 0.1294,
+ "step": 1935
+ },
+ {
+ "epoch": 11.085714285714285,
+ "grad_norm": 1.730648398399353,
+ "learning_rate": 0.0001338,
+ "loss": 0.1004,
+ "step": 1940
+ },
+ {
+ "epoch": 11.114285714285714,
+ "grad_norm": 0.6985113620758057,
+ "learning_rate": 0.00013337142857142856,
+ "loss": 0.0995,
+ "step": 1945
+ },
+ {
+ "epoch": 11.142857142857142,
+ "grad_norm": 0.8901951313018799,
+ "learning_rate": 0.00013294285714285711,
+ "loss": 0.1179,
+ "step": 1950
+ },
+ {
+ "epoch": 11.17142857142857,
+ "grad_norm": 0.7232164144515991,
+ "learning_rate": 0.0001325142857142857,
+ "loss": 0.1397,
+ "step": 1955
+ },
+ {
+ "epoch": 11.2,
+ "grad_norm": 0.6447544693946838,
+ "learning_rate": 0.00013208571428571428,
+ "loss": 0.1366,
+ "step": 1960
+ },
+ {
+ "epoch": 11.228571428571428,
+ "grad_norm": 0.7964944243431091,
+ "learning_rate": 0.00013165714285714284,
+ "loss": 0.1121,
+ "step": 1965
+ },
+ {
+ "epoch": 11.257142857142856,
+ "grad_norm": 0.9012628793716431,
+ "learning_rate": 0.00013122857142857142,
+ "loss": 0.1131,
+ "step": 1970
+ },
+ {
+ "epoch": 11.285714285714286,
+ "grad_norm": 0.9295369982719421,
+ "learning_rate": 0.00013079999999999998,
+ "loss": 0.1232,
+ "step": 1975
+ },
+ {
+ "epoch": 11.314285714285715,
+ "grad_norm": 0.6237708926200867,
+ "learning_rate": 0.00013037142857142857,
+ "loss": 0.1066,
+ "step": 1980
+ },
+ {
+ "epoch": 11.342857142857143,
+ "grad_norm": 0.5250967741012573,
+ "learning_rate": 0.00012994285714285715,
+ "loss": 0.118,
+ "step": 1985
+ },
+ {
+ "epoch": 11.371428571428572,
+ "grad_norm": 1.0013964176177979,
+ "learning_rate": 0.0001295142857142857,
+ "loss": 0.1125,
+ "step": 1990
+ },
+ {
+ "epoch": 11.4,
+ "grad_norm": 0.6721311807632446,
+ "learning_rate": 0.00012908571428571426,
+ "loss": 0.1196,
+ "step": 1995
+ },
+ {
+ "epoch": 11.428571428571429,
+ "grad_norm": 0.6966421008110046,
+ "learning_rate": 0.00012865714285714285,
+ "loss": 0.1172,
+ "step": 2000
+ },
+ {
+ "epoch": 11.457142857142857,
+ "grad_norm": 0.8811460733413696,
+ "learning_rate": 0.00012822857142857143,
+ "loss": 0.135,
+ "step": 2005
+ },
+ {
+ "epoch": 11.485714285714286,
+ "grad_norm": 0.8829531073570251,
+ "learning_rate": 0.0001278,
+ "loss": 0.1288,
+ "step": 2010
+ },
+ {
+ "epoch": 11.514285714285714,
+ "grad_norm": 0.7530654668807983,
+ "learning_rate": 0.00012737142857142855,
+ "loss": 0.1073,
+ "step": 2015
+ },
+ {
+ "epoch": 11.542857142857143,
+ "grad_norm": 0.513940691947937,
+ "learning_rate": 0.00012694285714285713,
+ "loss": 0.121,
+ "step": 2020
+ },
+ {
+ "epoch": 11.571428571428571,
+ "grad_norm": 0.8574968576431274,
+ "learning_rate": 0.0001265142857142857,
+ "loss": 0.1103,
+ "step": 2025
+ },
+ {
+ "epoch": 11.6,
+ "grad_norm": 0.7482439875602722,
+ "learning_rate": 0.00012608571428571427,
+ "loss": 0.1027,
+ "step": 2030
+ },
+ {
+ "epoch": 11.628571428571428,
+ "grad_norm": 0.8367976546287537,
+ "learning_rate": 0.00012565714285714286,
+ "loss": 0.1181,
+ "step": 2035
+ },
+ {
+ "epoch": 11.657142857142857,
+ "grad_norm": 2.048128366470337,
+ "learning_rate": 0.0001252285714285714,
+ "loss": 0.1122,
+ "step": 2040
+ },
+ {
+ "epoch": 11.685714285714285,
+ "grad_norm": 0.7426862716674805,
+ "learning_rate": 0.00012479999999999997,
+ "loss": 0.1169,
+ "step": 2045
+ },
+ {
+ "epoch": 11.714285714285714,
+ "grad_norm": 3.093841791152954,
+ "learning_rate": 0.00012437142857142855,
+ "loss": 0.1164,
+ "step": 2050
+ },
+ {
+ "epoch": 11.742857142857144,
+ "grad_norm": 0.8172643184661865,
+ "learning_rate": 0.00012394285714285714,
+ "loss": 0.1354,
+ "step": 2055
+ },
+ {
+ "epoch": 11.771428571428572,
+ "grad_norm": 1.9950591325759888,
+ "learning_rate": 0.0001235142857142857,
+ "loss": 0.1037,
+ "step": 2060
+ },
+ {
+ "epoch": 11.8,
+ "grad_norm": 0.5929077863693237,
+ "learning_rate": 0.00012308571428571428,
+ "loss": 0.1194,
+ "step": 2065
+ },
+ {
+ "epoch": 11.82857142857143,
+ "grad_norm": 1.293624997138977,
+ "learning_rate": 0.00012265714285714284,
+ "loss": 0.12,
+ "step": 2070
+ },
+ {
+ "epoch": 11.857142857142858,
+ "grad_norm": 1.0515168905258179,
+ "learning_rate": 0.00012222857142857142,
+ "loss": 0.1049,
+ "step": 2075
+ },
+ {
+ "epoch": 11.885714285714286,
+ "grad_norm": 1.2874428033828735,
+ "learning_rate": 0.00012179999999999999,
+ "loss": 0.115,
+ "step": 2080
+ },
+ {
+ "epoch": 11.914285714285715,
+ "grad_norm": 0.7317278385162354,
+ "learning_rate": 0.00012137142857142856,
+ "loss": 0.1184,
+ "step": 2085
+ },
+ {
+ "epoch": 11.942857142857143,
+ "grad_norm": 1.3407148122787476,
+ "learning_rate": 0.00012094285714285713,
+ "loss": 0.132,
+ "step": 2090
+ },
+ {
+ "epoch": 11.971428571428572,
+ "grad_norm": 2.656409502029419,
+ "learning_rate": 0.00012051428571428569,
+ "loss": 0.1359,
+ "step": 2095
+ },
+ {
+ "epoch": 12.0,
+ "grad_norm": 0.7189064025878906,
+ "learning_rate": 0.00012008571428571428,
+ "loss": 0.1217,
+ "step": 2100
+ },
+ {
+ "epoch": 12.028571428571428,
+ "grad_norm": 0.7510334849357605,
+ "learning_rate": 0.00011965714285714285,
+ "loss": 0.109,
+ "step": 2105
+ },
+ {
+ "epoch": 12.057142857142857,
+ "grad_norm": 0.7235113382339478,
+ "learning_rate": 0.00011922857142857142,
+ "loss": 0.1114,
+ "step": 2110
+ },
+ {
+ "epoch": 12.085714285714285,
+ "grad_norm": 1.7435882091522217,
+ "learning_rate": 0.0001188,
+ "loss": 0.1357,
+ "step": 2115
+ },
+ {
+ "epoch": 12.114285714285714,
+ "grad_norm": 1.170392632484436,
+ "learning_rate": 0.00011837142857142856,
+ "loss": 0.1255,
+ "step": 2120
+ },
+ {
+ "epoch": 12.142857142857142,
+ "grad_norm": 0.6476783752441406,
+ "learning_rate": 0.00011794285714285713,
+ "loss": 0.1108,
+ "step": 2125
+ },
+ {
+ "epoch": 12.17142857142857,
+ "grad_norm": 0.8599929213523865,
+ "learning_rate": 0.00011751428571428571,
+ "loss": 0.0997,
+ "step": 2130
+ },
+ {
+ "epoch": 12.2,
+ "grad_norm": 0.8918687105178833,
+ "learning_rate": 0.00011708571428571428,
+ "loss": 0.1149,
+ "step": 2135
+ },
+ {
+ "epoch": 12.228571428571428,
+ "grad_norm": 1.609435796737671,
+ "learning_rate": 0.00011665714285714284,
+ "loss": 0.1136,
+ "step": 2140
+ },
+ {
+ "epoch": 12.257142857142856,
+ "grad_norm": 0.6206801533699036,
+ "learning_rate": 0.00011622857142857143,
+ "loss": 0.1135,
+ "step": 2145
+ },
+ {
+ "epoch": 12.285714285714286,
+ "grad_norm": 0.8769077658653259,
+ "learning_rate": 0.0001158,
+ "loss": 0.1344,
+ "step": 2150
+ },
+ {
+ "epoch": 12.314285714285715,
+ "grad_norm": 0.6279401183128357,
+ "learning_rate": 0.00011537142857142855,
+ "loss": 0.1049,
+ "step": 2155
+ },
+ {
+ "epoch": 12.342857142857143,
+ "grad_norm": 1.1110137701034546,
+ "learning_rate": 0.00011494285714285712,
+ "loss": 0.1146,
+ "step": 2160
+ },
+ {
+ "epoch": 12.371428571428572,
+ "grad_norm": 0.7911233901977539,
+ "learning_rate": 0.00011451428571428571,
+ "loss": 0.1257,
+ "step": 2165
+ },
+ {
+ "epoch": 12.4,
+ "grad_norm": 0.9691207408905029,
+ "learning_rate": 0.00011408571428571428,
+ "loss": 0.1226,
+ "step": 2170
+ },
+ {
+ "epoch": 12.428571428571429,
+ "grad_norm": 0.6168835759162903,
+ "learning_rate": 0.00011365714285714284,
+ "loss": 0.1271,
+ "step": 2175
+ },
+ {
+ "epoch": 12.457142857142857,
+ "grad_norm": 0.6143497228622437,
+ "learning_rate": 0.00011322857142857142,
+ "loss": 0.111,
+ "step": 2180
+ },
+ {
+ "epoch": 12.485714285714286,
+ "grad_norm": 1.5673450231552124,
+ "learning_rate": 0.00011279999999999999,
+ "loss": 0.1186,
+ "step": 2185
+ },
+ {
+ "epoch": 12.514285714285714,
+ "grad_norm": 1.298756718635559,
+ "learning_rate": 0.00011237142857142856,
+ "loss": 0.1024,
+ "step": 2190
+ },
+ {
+ "epoch": 12.542857142857143,
+ "grad_norm": 0.9484918117523193,
+ "learning_rate": 0.00011194285714285715,
+ "loss": 0.1171,
+ "step": 2195
+ },
+ {
+ "epoch": 12.571428571428571,
+ "grad_norm": 0.725705623626709,
+ "learning_rate": 0.0001115142857142857,
+ "loss": 0.1216,
+ "step": 2200
+ },
+ {
+ "epoch": 12.6,
+ "grad_norm": 1.1394798755645752,
+ "learning_rate": 0.00011108571428571427,
+ "loss": 0.1132,
+ "step": 2205
+ },
+ {
+ "epoch": 12.628571428571428,
+ "grad_norm": 0.9548712968826294,
+ "learning_rate": 0.00011065714285714286,
+ "loss": 0.1209,
+ "step": 2210
+ },
+ {
+ "epoch": 12.657142857142857,
+ "grad_norm": 0.6173953413963318,
+ "learning_rate": 0.00011022857142857143,
+ "loss": 0.1049,
+ "step": 2215
+ },
+ {
+ "epoch": 12.685714285714285,
+ "grad_norm": 0.8227205872535706,
+ "learning_rate": 0.00010979999999999999,
+ "loss": 0.1045,
+ "step": 2220
+ },
+ {
+ "epoch": 12.714285714285714,
+ "grad_norm": 0.7252780795097351,
+ "learning_rate": 0.00010937142857142856,
+ "loss": 0.1146,
+ "step": 2225
+ },
+ {
+ "epoch": 12.742857142857144,
+ "grad_norm": 0.9374399781227112,
+ "learning_rate": 0.00010894285714285714,
+ "loss": 0.1478,
+ "step": 2230
+ },
+ {
+ "epoch": 12.771428571428572,
+ "grad_norm": 5.1985368728637695,
+ "learning_rate": 0.0001085142857142857,
+ "loss": 0.1059,
+ "step": 2235
+ },
+ {
+ "epoch": 12.8,
+ "grad_norm": 0.9629620909690857,
+ "learning_rate": 0.00010808571428571427,
+ "loss": 0.124,
+ "step": 2240
+ },
+ {
+ "epoch": 12.82857142857143,
+ "grad_norm": 0.7022290229797363,
+ "learning_rate": 0.00010765714285714285,
+ "loss": 0.1309,
+ "step": 2245
+ },
+ {
+ "epoch": 12.857142857142858,
+ "grad_norm": 0.574188232421875,
+ "learning_rate": 0.00010722857142857142,
+ "loss": 0.086,
+ "step": 2250
+ },
+ {
+ "epoch": 12.885714285714286,
+ "grad_norm": 0.9712439179420471,
+ "learning_rate": 0.00010679999999999998,
+ "loss": 0.1152,
+ "step": 2255
+ },
+ {
+ "epoch": 12.914285714285715,
+ "grad_norm": 0.6562150120735168,
+ "learning_rate": 0.00010637142857142856,
+ "loss": 0.1343,
+ "step": 2260
+ },
+ {
+ "epoch": 12.942857142857143,
+ "grad_norm": 0.6936819553375244,
+ "learning_rate": 0.00010594285714285714,
+ "loss": 0.1009,
+ "step": 2265
+ },
+ {
+ "epoch": 12.971428571428572,
+ "grad_norm": 0.8664882779121399,
+ "learning_rate": 0.0001055142857142857,
+ "loss": 0.1164,
+ "step": 2270
+ },
+ {
+ "epoch": 13.0,
+ "grad_norm": 0.9224509000778198,
+ "learning_rate": 0.00010508571428571429,
+ "loss": 0.1347,
+ "step": 2275
+ },
+ {
+ "epoch": 13.028571428571428,
+ "grad_norm": 0.6596968770027161,
+ "learning_rate": 0.00010465714285714285,
+ "loss": 0.1041,
+ "step": 2280
+ },
+ {
+ "epoch": 13.057142857142857,
+ "grad_norm": 0.6456631422042847,
+ "learning_rate": 0.00010422857142857142,
+ "loss": 0.1142,
+ "step": 2285
+ },
+ {
+ "epoch": 13.085714285714285,
+ "grad_norm": 0.9466612339019775,
+ "learning_rate": 0.00010379999999999999,
+ "loss": 0.1191,
+ "step": 2290
+ },
+ {
+ "epoch": 13.114285714285714,
+ "grad_norm": 0.9036727547645569,
+ "learning_rate": 0.00010337142857142856,
+ "loss": 0.121,
+ "step": 2295
+ },
+ {
+ "epoch": 13.142857142857142,
+ "grad_norm": 1.08086359500885,
+ "learning_rate": 0.00010294285714285713,
+ "loss": 0.1313,
+ "step": 2300
+ },
+ {
+ "epoch": 13.17142857142857,
+ "grad_norm": 0.703241765499115,
+ "learning_rate": 0.0001025142857142857,
+ "loss": 0.1151,
+ "step": 2305
+ },
+ {
+ "epoch": 13.2,
+ "grad_norm": 0.7901896238327026,
+ "learning_rate": 0.00010208571428571429,
+ "loss": 0.1275,
+ "step": 2310
+ },
+ {
+ "epoch": 13.228571428571428,
+ "grad_norm": 0.703542947769165,
+ "learning_rate": 0.00010165714285714284,
+ "loss": 0.1,
+ "step": 2315
+ },
+ {
+ "epoch": 13.257142857142856,
+ "grad_norm": 0.6657671928405762,
+ "learning_rate": 0.00010122857142857141,
+ "loss": 0.1141,
+ "step": 2320
+ },
+ {
+ "epoch": 13.285714285714286,
+ "grad_norm": 0.7593729496002197,
+ "learning_rate": 0.0001008,
+ "loss": 0.1099,
+ "step": 2325
+ },
+ {
+ "epoch": 13.314285714285715,
+ "grad_norm": 0.6681057810783386,
+ "learning_rate": 0.00010037142857142857,
+ "loss": 0.112,
+ "step": 2330
+ },
+ {
+ "epoch": 13.342857142857143,
+ "grad_norm": 0.7155857682228088,
+ "learning_rate": 9.994285714285712e-05,
+ "loss": 0.0989,
+ "step": 2335
+ },
+ {
+ "epoch": 13.371428571428572,
+ "grad_norm": 0.9484553337097168,
+ "learning_rate": 9.951428571428571e-05,
+ "loss": 0.0902,
+ "step": 2340
+ },
+ {
+ "epoch": 13.4,
+ "grad_norm": 0.9317265152931213,
+ "learning_rate": 9.908571428571428e-05,
+ "loss": 0.1432,
+ "step": 2345
+ },
+ {
+ "epoch": 13.428571428571429,
+ "grad_norm": 1.039158821105957,
+ "learning_rate": 9.865714285714285e-05,
+ "loss": 0.114,
+ "step": 2350
+ },
+ {
+ "epoch": 13.457142857142857,
+ "grad_norm": 0.8524000644683838,
+ "learning_rate": 9.822857142857141e-05,
+ "loss": 0.1144,
+ "step": 2355
+ },
+ {
+ "epoch": 13.485714285714286,
+ "grad_norm": 0.6337461471557617,
+ "learning_rate": 9.779999999999999e-05,
+ "loss": 0.1073,
+ "step": 2360
+ },
+ {
+ "epoch": 13.514285714285714,
+ "grad_norm": 0.9097298383712769,
+ "learning_rate": 9.737142857142856e-05,
+ "loss": 0.1031,
+ "step": 2365
+ },
+ {
+ "epoch": 13.542857142857143,
+ "grad_norm": 1.2013412714004517,
+ "learning_rate": 9.694285714285713e-05,
+ "loss": 0.1174,
+ "step": 2370
+ },
+ {
+ "epoch": 13.571428571428571,
+ "grad_norm": 0.7055214643478394,
+ "learning_rate": 9.65142857142857e-05,
+ "loss": 0.1175,
+ "step": 2375
+ },
+ {
+ "epoch": 13.6,
+ "grad_norm": 0.807955265045166,
+ "learning_rate": 9.608571428571427e-05,
+ "loss": 0.1286,
+ "step": 2380
+ },
+ {
+ "epoch": 13.628571428571428,
+ "grad_norm": 0.6661797761917114,
+ "learning_rate": 9.565714285714285e-05,
+ "loss": 0.1091,
+ "step": 2385
+ },
+ {
+ "epoch": 13.657142857142857,
+ "grad_norm": 1.119604468345642,
+ "learning_rate": 9.522857142857143e-05,
+ "loss": 0.1393,
+ "step": 2390
+ },
+ {
+ "epoch": 13.685714285714285,
+ "grad_norm": 0.5365435481071472,
+ "learning_rate": 9.479999999999999e-05,
+ "loss": 0.1075,
+ "step": 2395
+ },
+ {
+ "epoch": 13.714285714285714,
+ "grad_norm": 0.9443924427032471,
+ "learning_rate": 9.437142857142856e-05,
+ "loss": 0.0977,
+ "step": 2400
+ },
+ {
+ "epoch": 13.742857142857144,
+ "grad_norm": 0.6075264811515808,
+ "learning_rate": 9.394285714285714e-05,
+ "loss": 0.1329,
+ "step": 2405
+ },
+ {
+ "epoch": 13.771428571428572,
+ "grad_norm": 1.019352912902832,
+ "learning_rate": 9.351428571428571e-05,
+ "loss": 0.1083,
+ "step": 2410
+ },
+ {
+ "epoch": 13.8,
+ "grad_norm": 0.7234058380126953,
+ "learning_rate": 9.308571428571427e-05,
+ "loss": 0.1118,
+ "step": 2415
+ },
+ {
+ "epoch": 13.82857142857143,
+ "grad_norm": 0.6786122918128967,
+ "learning_rate": 9.265714285714284e-05,
+ "loss": 0.1208,
+ "step": 2420
+ },
+ {
+ "epoch": 13.857142857142858,
+ "grad_norm": 0.5820732116699219,
+ "learning_rate": 9.222857142857142e-05,
+ "loss": 0.1022,
+ "step": 2425
+ },
+ {
+ "epoch": 13.885714285714286,
+ "grad_norm": 0.8007987141609192,
+ "learning_rate": 9.18e-05,
+ "loss": 0.1293,
+ "step": 2430
+ },
+ {
+ "epoch": 13.914285714285715,
+ "grad_norm": 0.6813766956329346,
+ "learning_rate": 9.137142857142855e-05,
+ "loss": 0.1284,
+ "step": 2435
+ },
+ {
+ "epoch": 13.942857142857143,
+ "grad_norm": 0.6460041403770447,
+ "learning_rate": 9.094285714285714e-05,
+ "loss": 0.1073,
+ "step": 2440
+ },
+ {
+ "epoch": 13.971428571428572,
+ "grad_norm": 0.5939205288887024,
+ "learning_rate": 9.051428571428571e-05,
+ "loss": 0.1185,
+ "step": 2445
+ },
+ {
+ "epoch": 14.0,
+ "grad_norm": 0.8150635361671448,
+ "learning_rate": 9.008571428571428e-05,
+ "loss": 0.1039,
+ "step": 2450
+ },
+ {
+ "epoch": 14.028571428571428,
+ "grad_norm": 1.3691389560699463,
+ "learning_rate": 8.965714285714285e-05,
+ "loss": 0.1112,
+ "step": 2455
+ },
+ {
+ "epoch": 14.057142857142857,
+ "grad_norm": 0.9042718410491943,
+ "learning_rate": 8.922857142857142e-05,
+ "loss": 0.112,
+ "step": 2460
+ },
+ {
+ "epoch": 14.085714285714285,
+ "grad_norm": 0.7222105860710144,
+ "learning_rate": 8.879999999999999e-05,
+ "loss": 0.1221,
+ "step": 2465
+ },
+ {
+ "epoch": 14.114285714285714,
+ "grad_norm": 0.595588207244873,
+ "learning_rate": 8.837142857142857e-05,
+ "loss": 0.1058,
+ "step": 2470
+ },
+ {
+ "epoch": 14.142857142857142,
+ "grad_norm": 0.5262706279754639,
+ "learning_rate": 8.794285714285713e-05,
+ "loss": 0.1071,
+ "step": 2475
+ },
+ {
+ "epoch": 14.17142857142857,
+ "grad_norm": 0.6511022448539734,
+ "learning_rate": 8.75142857142857e-05,
+ "loss": 0.0917,
+ "step": 2480
+ },
+ {
+ "epoch": 14.2,
+ "grad_norm": 0.5737650394439697,
+ "learning_rate": 8.708571428571427e-05,
+ "loss": 0.0988,
+ "step": 2485
+ },
+ {
+ "epoch": 14.228571428571428,
+ "grad_norm": 0.7679132223129272,
+ "learning_rate": 8.665714285714286e-05,
+ "loss": 0.1185,
+ "step": 2490
+ },
+ {
+ "epoch": 14.257142857142856,
+ "grad_norm": 0.641198456287384,
+ "learning_rate": 8.622857142857141e-05,
+ "loss": 0.0894,
+ "step": 2495
+ },
+ {
+ "epoch": 14.285714285714286,
+ "grad_norm": 0.7215464115142822,
+ "learning_rate": 8.579999999999998e-05,
+ "loss": 0.0935,
+ "step": 2500
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 3500,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 20,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 0.0,
+ "train_batch_size": 200,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/glot-contrastive-final-lora/checkpoint-2500/training_args.bin b/glot-contrastive-final-lora/checkpoint-2500/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..504636d7db81a728b5309e791498748e16c3b8ec
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-2500/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02a87dc6b2c67ad3df98065b9e8fa21d9d93cd2cb361c532cb83c8a37bdc81a3
+size 5777
diff --git a/glot-contrastive-final-lora/checkpoint-3000/README.md b/glot-contrastive-final-lora/checkpoint-3000/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9e142ce83a38d62751eee08226da0acc7c10eae5
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-3000/README.md
@@ -0,0 +1,206 @@
+---
+base_model: ./glot-mlm-adapted
+library_name: peft
+tags:
+- base_model:adapter:./glot-mlm-adapted
+- lora
+- transformers
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/glot-contrastive-final-lora/checkpoint-3000/adapter_config.json b/glot-contrastive-final-lora/checkpoint-3000/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..846b0d8ef2ad7bc4b0b04decd5769b6250d7be73
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-3000/adapter_config.json
@@ -0,0 +1,37 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "./glot-mlm-adapted",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "query",
+ "value"
+ ],
+ "target_parameters": null,
+ "task_type": "FEATURE_EXTRACTION",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/glot-contrastive-final-lora/checkpoint-3000/adapter_model.safetensors b/glot-contrastive-final-lora/checkpoint-3000/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..52df9731df7fb38e231addbaba67c93c7ac2b266
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-3000/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f8cbb012358d427813b69c11a43d2279370f570cd9c119787e1f92c372b0761a
+size 2365824
diff --git a/glot-contrastive-final-lora/checkpoint-3000/optimizer.pt b/glot-contrastive-final-lora/checkpoint-3000/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d606fc7d83ef97243d5320cccf780b1da4b091e2
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-3000/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b176518aa7e320498ebc9cb02498e947ee4917f2a36df17791e539e71f009f6
+size 4760395
diff --git a/glot-contrastive-final-lora/checkpoint-3000/rng_state.pth b/glot-contrastive-final-lora/checkpoint-3000/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..bbf137dfb2e3f06ac978673b34d3a0010f4d8691
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-3000/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b715e8f56451136ff979d4ad11ab96c9bdf53a90baa37faf2d19ec4b1b33a518
+size 14645
diff --git a/glot-contrastive-final-lora/checkpoint-3000/scheduler.pt b/glot-contrastive-final-lora/checkpoint-3000/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0dbe8bbe41639bd78f16d495261daf80fc931b7e
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-3000/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dfc2601af904a968b88642c5ada06b99cc5fd89af9787905603b8974532a00cc
+size 1465
diff --git a/glot-contrastive-final-lora/checkpoint-3000/sentencepiece.bpe.model b/glot-contrastive-final-lora/checkpoint-3000/sentencepiece.bpe.model
new file mode 100644
index 0000000000000000000000000000000000000000..41409bcc76133d6e92fdd5909093d24e32662ba0
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-3000/sentencepiece.bpe.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a313a26470baedaede322622492f2a542aa41527ddc5d40de444e945ad3c613
+size 7658320
diff --git a/glot-contrastive-final-lora/checkpoint-3000/special_tokens_map.json b/glot-contrastive-final-lora/checkpoint-3000/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..68171d1ff68b731a33d119708476692c094a466b
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-3000/special_tokens_map.json
@@ -0,0 +1,15 @@
+{
+ "bos_token": "",
+ "cls_token": "",
+ "eos_token": "",
+ "mask_token": {
+ "content": "",
+ "lstrip": true,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "",
+ "sep_token": "",
+ "unk_token": ""
+}
diff --git a/glot-contrastive-final-lora/checkpoint-3000/tokenizer_config.json b/glot-contrastive-final-lora/checkpoint-3000/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..2d4a02cfefad7bc6d993905187616acfc547f17d
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-3000/tokenizer_config.json
@@ -0,0 +1,57 @@
+{
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "3": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "401144": {
+ "content": "",
+ "lstrip": true,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "",
+ "clean_up_tokenization_spaces": false,
+ "cls_token": "",
+ "eos_token": "",
+ "extra_special_tokens": {},
+ "mask_token": "",
+ "model_max_length": 512,
+ "pad_token": "",
+ "sep_token": "",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "XLMRobertaTokenizer",
+ "unk_token": "",
+ "use_fast": true
+}
diff --git a/glot-contrastive-final-lora/checkpoint-3000/trainer_state.json b/glot-contrastive-final-lora/checkpoint-3000/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..90997ef2de60e79a78c3d3a847b63fab071b0f3f
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-3000/trainer_state.json
@@ -0,0 +1,4234 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 17.142857142857142,
+ "eval_steps": 5,
+ "global_step": 3000,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.02857142857142857,
+ "grad_norm": 0.1407003551721573,
+ "learning_rate": 0.00029965714285714283,
+ "loss": 0.9726,
+ "step": 5
+ },
+ {
+ "epoch": 0.05714285714285714,
+ "grad_norm": 0.26689061522483826,
+ "learning_rate": 0.0002992285714285714,
+ "loss": 0.9633,
+ "step": 10
+ },
+ {
+ "epoch": 0.08571428571428572,
+ "grad_norm": 0.8670485615730286,
+ "learning_rate": 0.0002988,
+ "loss": 0.9013,
+ "step": 15
+ },
+ {
+ "epoch": 0.11428571428571428,
+ "grad_norm": 0.9785467386245728,
+ "learning_rate": 0.00029837142857142853,
+ "loss": 0.6942,
+ "step": 20
+ },
+ {
+ "epoch": 0.14285714285714285,
+ "grad_norm": 1.3083932399749756,
+ "learning_rate": 0.0002979428571428571,
+ "loss": 0.4472,
+ "step": 25
+ },
+ {
+ "epoch": 0.17142857142857143,
+ "grad_norm": 1.6103293895721436,
+ "learning_rate": 0.0002975142857142857,
+ "loss": 0.3782,
+ "step": 30
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 2.6353416442871094,
+ "learning_rate": 0.0002970857142857143,
+ "loss": 0.3732,
+ "step": 35
+ },
+ {
+ "epoch": 0.22857142857142856,
+ "grad_norm": 0.9949072003364563,
+ "learning_rate": 0.0002966571428571428,
+ "loss": 0.3506,
+ "step": 40
+ },
+ {
+ "epoch": 0.2571428571428571,
+ "grad_norm": 1.280673861503601,
+ "learning_rate": 0.0002962285714285714,
+ "loss": 0.3346,
+ "step": 45
+ },
+ {
+ "epoch": 0.2857142857142857,
+ "grad_norm": 0.7681456208229065,
+ "learning_rate": 0.0002958,
+ "loss": 0.2832,
+ "step": 50
+ },
+ {
+ "epoch": 0.3142857142857143,
+ "grad_norm": 1.0000813007354736,
+ "learning_rate": 0.0002953714285714285,
+ "loss": 0.2603,
+ "step": 55
+ },
+ {
+ "epoch": 0.34285714285714286,
+ "grad_norm": 1.0222399234771729,
+ "learning_rate": 0.0002949428571428571,
+ "loss": 0.2507,
+ "step": 60
+ },
+ {
+ "epoch": 0.37142857142857144,
+ "grad_norm": 0.896902322769165,
+ "learning_rate": 0.0002945142857142857,
+ "loss": 0.2556,
+ "step": 65
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 0.9035541415214539,
+ "learning_rate": 0.00029408571428571426,
+ "loss": 0.2402,
+ "step": 70
+ },
+ {
+ "epoch": 0.42857142857142855,
+ "grad_norm": 1.4886469841003418,
+ "learning_rate": 0.00029365714285714285,
+ "loss": 0.2376,
+ "step": 75
+ },
+ {
+ "epoch": 0.45714285714285713,
+ "grad_norm": 0.8951187133789062,
+ "learning_rate": 0.0002932285714285714,
+ "loss": 0.2276,
+ "step": 80
+ },
+ {
+ "epoch": 0.4857142857142857,
+ "grad_norm": 0.7876377105712891,
+ "learning_rate": 0.00029279999999999996,
+ "loss": 0.2537,
+ "step": 85
+ },
+ {
+ "epoch": 0.5142857142857142,
+ "grad_norm": 1.0927226543426514,
+ "learning_rate": 0.00029237142857142855,
+ "loss": 0.2152,
+ "step": 90
+ },
+ {
+ "epoch": 0.5428571428571428,
+ "grad_norm": 1.4946355819702148,
+ "learning_rate": 0.00029194285714285713,
+ "loss": 0.2441,
+ "step": 95
+ },
+ {
+ "epoch": 0.5714285714285714,
+ "grad_norm": 0.7082991600036621,
+ "learning_rate": 0.0002915142857142857,
+ "loss": 0.2708,
+ "step": 100
+ },
+ {
+ "epoch": 0.6,
+ "grad_norm": 0.670010507106781,
+ "learning_rate": 0.00029108571428571424,
+ "loss": 0.2396,
+ "step": 105
+ },
+ {
+ "epoch": 0.6285714285714286,
+ "grad_norm": 0.9797312021255493,
+ "learning_rate": 0.00029065714285714283,
+ "loss": 0.2275,
+ "step": 110
+ },
+ {
+ "epoch": 0.6571428571428571,
+ "grad_norm": 1.5220463275909424,
+ "learning_rate": 0.0002902285714285714,
+ "loss": 0.2114,
+ "step": 115
+ },
+ {
+ "epoch": 0.6857142857142857,
+ "grad_norm": 1.3326867818832397,
+ "learning_rate": 0.00028979999999999994,
+ "loss": 0.241,
+ "step": 120
+ },
+ {
+ "epoch": 0.7142857142857143,
+ "grad_norm": 1.1195529699325562,
+ "learning_rate": 0.0002893714285714285,
+ "loss": 0.2389,
+ "step": 125
+ },
+ {
+ "epoch": 0.7428571428571429,
+ "grad_norm": 0.7551061511039734,
+ "learning_rate": 0.0002889428571428571,
+ "loss": 0.2162,
+ "step": 130
+ },
+ {
+ "epoch": 0.7714285714285715,
+ "grad_norm": 1.018908977508545,
+ "learning_rate": 0.0002885142857142857,
+ "loss": 0.1924,
+ "step": 135
+ },
+ {
+ "epoch": 0.8,
+ "grad_norm": 2.123642921447754,
+ "learning_rate": 0.0002880857142857143,
+ "loss": 0.2174,
+ "step": 140
+ },
+ {
+ "epoch": 0.8285714285714286,
+ "grad_norm": 0.7585068941116333,
+ "learning_rate": 0.0002876571428571428,
+ "loss": 0.2006,
+ "step": 145
+ },
+ {
+ "epoch": 0.8571428571428571,
+ "grad_norm": 1.64150869846344,
+ "learning_rate": 0.0002872285714285714,
+ "loss": 0.1905,
+ "step": 150
+ },
+ {
+ "epoch": 0.8857142857142857,
+ "grad_norm": 0.9126951694488525,
+ "learning_rate": 0.0002868,
+ "loss": 0.2312,
+ "step": 155
+ },
+ {
+ "epoch": 0.9142857142857143,
+ "grad_norm": 0.7278801202774048,
+ "learning_rate": 0.00028637142857142856,
+ "loss": 0.2077,
+ "step": 160
+ },
+ {
+ "epoch": 0.9428571428571428,
+ "grad_norm": 0.8931339383125305,
+ "learning_rate": 0.00028594285714285715,
+ "loss": 0.1951,
+ "step": 165
+ },
+ {
+ "epoch": 0.9714285714285714,
+ "grad_norm": 1.0831843614578247,
+ "learning_rate": 0.0002855142857142857,
+ "loss": 0.2103,
+ "step": 170
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 1.3750063180923462,
+ "learning_rate": 0.00028508571428571426,
+ "loss": 0.2396,
+ "step": 175
+ },
+ {
+ "epoch": 1.0285714285714285,
+ "grad_norm": 0.8338337540626526,
+ "learning_rate": 0.00028465714285714285,
+ "loss": 0.2404,
+ "step": 180
+ },
+ {
+ "epoch": 1.0571428571428572,
+ "grad_norm": 1.2879024744033813,
+ "learning_rate": 0.0002842285714285714,
+ "loss": 0.2117,
+ "step": 185
+ },
+ {
+ "epoch": 1.0857142857142856,
+ "grad_norm": 1.6751821041107178,
+ "learning_rate": 0.00028379999999999996,
+ "loss": 0.1796,
+ "step": 190
+ },
+ {
+ "epoch": 1.1142857142857143,
+ "grad_norm": 0.9864417910575867,
+ "learning_rate": 0.00028337142857142854,
+ "loss": 0.1993,
+ "step": 195
+ },
+ {
+ "epoch": 1.1428571428571428,
+ "grad_norm": 1.0174155235290527,
+ "learning_rate": 0.00028294285714285713,
+ "loss": 0.2068,
+ "step": 200
+ },
+ {
+ "epoch": 1.1714285714285715,
+ "grad_norm": 1.029832124710083,
+ "learning_rate": 0.0002825142857142857,
+ "loss": 0.2015,
+ "step": 205
+ },
+ {
+ "epoch": 1.2,
+ "grad_norm": 0.7745446562767029,
+ "learning_rate": 0.00028208571428571424,
+ "loss": 0.2129,
+ "step": 210
+ },
+ {
+ "epoch": 1.2285714285714286,
+ "grad_norm": 2.5578622817993164,
+ "learning_rate": 0.0002816571428571428,
+ "loss": 0.2224,
+ "step": 215
+ },
+ {
+ "epoch": 1.2571428571428571,
+ "grad_norm": 2.4185051918029785,
+ "learning_rate": 0.0002812285714285714,
+ "loss": 0.2276,
+ "step": 220
+ },
+ {
+ "epoch": 1.2857142857142856,
+ "grad_norm": 1.4176461696624756,
+ "learning_rate": 0.0002808,
+ "loss": 0.1781,
+ "step": 225
+ },
+ {
+ "epoch": 1.3142857142857143,
+ "grad_norm": 0.709326982498169,
+ "learning_rate": 0.0002803714285714286,
+ "loss": 0.2177,
+ "step": 230
+ },
+ {
+ "epoch": 1.342857142857143,
+ "grad_norm": 0.8170766830444336,
+ "learning_rate": 0.0002799428571428571,
+ "loss": 0.1769,
+ "step": 235
+ },
+ {
+ "epoch": 1.3714285714285714,
+ "grad_norm": 1.3850761651992798,
+ "learning_rate": 0.0002795142857142857,
+ "loss": 0.2262,
+ "step": 240
+ },
+ {
+ "epoch": 1.4,
+ "grad_norm": 1.0064373016357422,
+ "learning_rate": 0.0002790857142857143,
+ "loss": 0.196,
+ "step": 245
+ },
+ {
+ "epoch": 1.4285714285714286,
+ "grad_norm": 1.9635728597640991,
+ "learning_rate": 0.0002786571428571428,
+ "loss": 0.2029,
+ "step": 250
+ },
+ {
+ "epoch": 1.457142857142857,
+ "grad_norm": 16.20791244506836,
+ "learning_rate": 0.0002782285714285714,
+ "loss": 0.3925,
+ "step": 255
+ },
+ {
+ "epoch": 1.4857142857142858,
+ "grad_norm": 1.4363322257995605,
+ "learning_rate": 0.0002778,
+ "loss": 0.3684,
+ "step": 260
+ },
+ {
+ "epoch": 1.5142857142857142,
+ "grad_norm": 0.9379534721374512,
+ "learning_rate": 0.00027737142857142856,
+ "loss": 0.2265,
+ "step": 265
+ },
+ {
+ "epoch": 1.5428571428571427,
+ "grad_norm": 0.8453512787818909,
+ "learning_rate": 0.00027694285714285714,
+ "loss": 0.1976,
+ "step": 270
+ },
+ {
+ "epoch": 1.5714285714285714,
+ "grad_norm": 2.316664695739746,
+ "learning_rate": 0.0002765142857142857,
+ "loss": 0.23,
+ "step": 275
+ },
+ {
+ "epoch": 1.6,
+ "grad_norm": 1.0548444986343384,
+ "learning_rate": 0.00027608571428571426,
+ "loss": 0.1823,
+ "step": 280
+ },
+ {
+ "epoch": 1.6285714285714286,
+ "grad_norm": 3.7894928455352783,
+ "learning_rate": 0.00027565714285714284,
+ "loss": 0.1962,
+ "step": 285
+ },
+ {
+ "epoch": 1.657142857142857,
+ "grad_norm": 2.3081610202789307,
+ "learning_rate": 0.00027522857142857143,
+ "loss": 0.2087,
+ "step": 290
+ },
+ {
+ "epoch": 1.6857142857142857,
+ "grad_norm": 0.9311438202857971,
+ "learning_rate": 0.0002748,
+ "loss": 0.1597,
+ "step": 295
+ },
+ {
+ "epoch": 1.7142857142857144,
+ "grad_norm": 1.1881247758865356,
+ "learning_rate": 0.00027437142857142854,
+ "loss": 0.1764,
+ "step": 300
+ },
+ {
+ "epoch": 1.7428571428571429,
+ "grad_norm": 1.30265212059021,
+ "learning_rate": 0.0002739428571428571,
+ "loss": 0.1647,
+ "step": 305
+ },
+ {
+ "epoch": 1.7714285714285714,
+ "grad_norm": 0.6832175850868225,
+ "learning_rate": 0.0002735142857142857,
+ "loss": 0.1638,
+ "step": 310
+ },
+ {
+ "epoch": 1.8,
+ "grad_norm": 1.8740538358688354,
+ "learning_rate": 0.00027308571428571424,
+ "loss": 0.1803,
+ "step": 315
+ },
+ {
+ "epoch": 1.8285714285714287,
+ "grad_norm": 9.821504592895508,
+ "learning_rate": 0.0002726571428571428,
+ "loss": 0.226,
+ "step": 320
+ },
+ {
+ "epoch": 1.8571428571428572,
+ "grad_norm": 1.0889750719070435,
+ "learning_rate": 0.0002722285714285714,
+ "loss": 0.1822,
+ "step": 325
+ },
+ {
+ "epoch": 1.8857142857142857,
+ "grad_norm": 0.9660868048667908,
+ "learning_rate": 0.0002718,
+ "loss": 0.1842,
+ "step": 330
+ },
+ {
+ "epoch": 1.9142857142857141,
+ "grad_norm": 0.6329234838485718,
+ "learning_rate": 0.0002713714285714286,
+ "loss": 0.1488,
+ "step": 335
+ },
+ {
+ "epoch": 1.9428571428571428,
+ "grad_norm": 3.601266384124756,
+ "learning_rate": 0.0002709428571428571,
+ "loss": 0.1887,
+ "step": 340
+ },
+ {
+ "epoch": 1.9714285714285715,
+ "grad_norm": 1.1441439390182495,
+ "learning_rate": 0.0002705142857142857,
+ "loss": 0.184,
+ "step": 345
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.8586034774780273,
+ "learning_rate": 0.0002700857142857143,
+ "loss": 0.1578,
+ "step": 350
+ },
+ {
+ "epoch": 2.0285714285714285,
+ "grad_norm": 1.5113487243652344,
+ "learning_rate": 0.00026965714285714286,
+ "loss": 0.2002,
+ "step": 355
+ },
+ {
+ "epoch": 2.057142857142857,
+ "grad_norm": 1.1123011112213135,
+ "learning_rate": 0.0002692285714285714,
+ "loss": 0.1946,
+ "step": 360
+ },
+ {
+ "epoch": 2.085714285714286,
+ "grad_norm": 0.9377036094665527,
+ "learning_rate": 0.0002688,
+ "loss": 0.1971,
+ "step": 365
+ },
+ {
+ "epoch": 2.1142857142857143,
+ "grad_norm": 0.6956892609596252,
+ "learning_rate": 0.00026837142857142856,
+ "loss": 0.1758,
+ "step": 370
+ },
+ {
+ "epoch": 2.142857142857143,
+ "grad_norm": 0.7510782480239868,
+ "learning_rate": 0.0002679428571428571,
+ "loss": 0.1674,
+ "step": 375
+ },
+ {
+ "epoch": 2.1714285714285713,
+ "grad_norm": 0.7009285092353821,
+ "learning_rate": 0.00026751428571428567,
+ "loss": 0.1945,
+ "step": 380
+ },
+ {
+ "epoch": 2.2,
+ "grad_norm": 0.9555609822273254,
+ "learning_rate": 0.00026708571428571426,
+ "loss": 0.1857,
+ "step": 385
+ },
+ {
+ "epoch": 2.2285714285714286,
+ "grad_norm": 2.133979082107544,
+ "learning_rate": 0.00026665714285714284,
+ "loss": 0.1636,
+ "step": 390
+ },
+ {
+ "epoch": 2.257142857142857,
+ "grad_norm": 0.7105309963226318,
+ "learning_rate": 0.0002662285714285714,
+ "loss": 0.2014,
+ "step": 395
+ },
+ {
+ "epoch": 2.2857142857142856,
+ "grad_norm": 0.7329701781272888,
+ "learning_rate": 0.00026579999999999996,
+ "loss": 0.1884,
+ "step": 400
+ },
+ {
+ "epoch": 2.314285714285714,
+ "grad_norm": 1.0426994562149048,
+ "learning_rate": 0.00026537142857142854,
+ "loss": 0.1558,
+ "step": 405
+ },
+ {
+ "epoch": 2.342857142857143,
+ "grad_norm": 0.9306122660636902,
+ "learning_rate": 0.0002649428571428571,
+ "loss": 0.1774,
+ "step": 410
+ },
+ {
+ "epoch": 2.3714285714285714,
+ "grad_norm": 0.6989394426345825,
+ "learning_rate": 0.00026451428571428565,
+ "loss": 0.1601,
+ "step": 415
+ },
+ {
+ "epoch": 2.4,
+ "grad_norm": 1.4383760690689087,
+ "learning_rate": 0.0002640857142857143,
+ "loss": 0.1564,
+ "step": 420
+ },
+ {
+ "epoch": 2.4285714285714284,
+ "grad_norm": 0.6448336839675903,
+ "learning_rate": 0.0002636571428571428,
+ "loss": 0.1827,
+ "step": 425
+ },
+ {
+ "epoch": 2.4571428571428573,
+ "grad_norm": 0.9535760879516602,
+ "learning_rate": 0.0002632285714285714,
+ "loss": 0.1713,
+ "step": 430
+ },
+ {
+ "epoch": 2.4857142857142858,
+ "grad_norm": 1.034945011138916,
+ "learning_rate": 0.0002628,
+ "loss": 0.1457,
+ "step": 435
+ },
+ {
+ "epoch": 2.5142857142857142,
+ "grad_norm": 1.3225128650665283,
+ "learning_rate": 0.0002623714285714285,
+ "loss": 0.1633,
+ "step": 440
+ },
+ {
+ "epoch": 2.5428571428571427,
+ "grad_norm": 0.8285059928894043,
+ "learning_rate": 0.0002619428571428571,
+ "loss": 0.2004,
+ "step": 445
+ },
+ {
+ "epoch": 2.571428571428571,
+ "grad_norm": 0.773176908493042,
+ "learning_rate": 0.0002615142857142857,
+ "loss": 0.1641,
+ "step": 450
+ },
+ {
+ "epoch": 2.6,
+ "grad_norm": 0.7964853048324585,
+ "learning_rate": 0.0002610857142857143,
+ "loss": 0.1608,
+ "step": 455
+ },
+ {
+ "epoch": 2.6285714285714286,
+ "grad_norm": 1.0967328548431396,
+ "learning_rate": 0.00026065714285714286,
+ "loss": 0.1697,
+ "step": 460
+ },
+ {
+ "epoch": 2.657142857142857,
+ "grad_norm": 0.6462066173553467,
+ "learning_rate": 0.0002602285714285714,
+ "loss": 0.1512,
+ "step": 465
+ },
+ {
+ "epoch": 2.685714285714286,
+ "grad_norm": 0.8765937089920044,
+ "learning_rate": 0.00025979999999999997,
+ "loss": 0.1826,
+ "step": 470
+ },
+ {
+ "epoch": 2.7142857142857144,
+ "grad_norm": 1.2524124383926392,
+ "learning_rate": 0.00025937142857142856,
+ "loss": 0.1731,
+ "step": 475
+ },
+ {
+ "epoch": 2.742857142857143,
+ "grad_norm": 2.2982606887817383,
+ "learning_rate": 0.0002589428571428571,
+ "loss": 0.1852,
+ "step": 480
+ },
+ {
+ "epoch": 2.7714285714285714,
+ "grad_norm": 0.9989053010940552,
+ "learning_rate": 0.0002585142857142857,
+ "loss": 0.1791,
+ "step": 485
+ },
+ {
+ "epoch": 2.8,
+ "grad_norm": 0.772343635559082,
+ "learning_rate": 0.00025808571428571426,
+ "loss": 0.1862,
+ "step": 490
+ },
+ {
+ "epoch": 2.8285714285714287,
+ "grad_norm": 1.2101136445999146,
+ "learning_rate": 0.00025765714285714284,
+ "loss": 0.1806,
+ "step": 495
+ },
+ {
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.8010189533233643,
+ "learning_rate": 0.0002572285714285714,
+ "loss": 0.1842,
+ "step": 500
+ },
+ {
+ "epoch": 2.8857142857142857,
+ "grad_norm": 1.3597544431686401,
+ "learning_rate": 0.00025679999999999995,
+ "loss": 0.1583,
+ "step": 505
+ },
+ {
+ "epoch": 2.914285714285714,
+ "grad_norm": 0.8790671825408936,
+ "learning_rate": 0.00025637142857142854,
+ "loss": 0.1565,
+ "step": 510
+ },
+ {
+ "epoch": 2.942857142857143,
+ "grad_norm": 1.1175066232681274,
+ "learning_rate": 0.0002559428571428571,
+ "loss": 0.1406,
+ "step": 515
+ },
+ {
+ "epoch": 2.9714285714285715,
+ "grad_norm": 2.8528785705566406,
+ "learning_rate": 0.0002555142857142857,
+ "loss": 0.1735,
+ "step": 520
+ },
+ {
+ "epoch": 3.0,
+ "grad_norm": 2.2073328495025635,
+ "learning_rate": 0.0002550857142857143,
+ "loss": 0.1816,
+ "step": 525
+ },
+ {
+ "epoch": 3.0285714285714285,
+ "grad_norm": 11.01322078704834,
+ "learning_rate": 0.0002546571428571428,
+ "loss": 0.1873,
+ "step": 530
+ },
+ {
+ "epoch": 3.057142857142857,
+ "grad_norm": 1.5822402238845825,
+ "learning_rate": 0.0002542285714285714,
+ "loss": 0.168,
+ "step": 535
+ },
+ {
+ "epoch": 3.085714285714286,
+ "grad_norm": 1.3086942434310913,
+ "learning_rate": 0.0002538,
+ "loss": 0.149,
+ "step": 540
+ },
+ {
+ "epoch": 3.1142857142857143,
+ "grad_norm": 6.303041458129883,
+ "learning_rate": 0.0002533714285714285,
+ "loss": 0.1651,
+ "step": 545
+ },
+ {
+ "epoch": 3.142857142857143,
+ "grad_norm": 14.48929500579834,
+ "learning_rate": 0.00025294285714285716,
+ "loss": 0.1687,
+ "step": 550
+ },
+ {
+ "epoch": 3.1714285714285713,
+ "grad_norm": 6.824525356292725,
+ "learning_rate": 0.0002525142857142857,
+ "loss": 0.1919,
+ "step": 555
+ },
+ {
+ "epoch": 3.2,
+ "grad_norm": 18.772563934326172,
+ "learning_rate": 0.00025208571428571427,
+ "loss": 0.2075,
+ "step": 560
+ },
+ {
+ "epoch": 3.2285714285714286,
+ "grad_norm": 0.7268752455711365,
+ "learning_rate": 0.00025165714285714286,
+ "loss": 0.174,
+ "step": 565
+ },
+ {
+ "epoch": 3.257142857142857,
+ "grad_norm": 1.1301453113555908,
+ "learning_rate": 0.0002512285714285714,
+ "loss": 0.1668,
+ "step": 570
+ },
+ {
+ "epoch": 3.2857142857142856,
+ "grad_norm": 2.846802234649658,
+ "learning_rate": 0.00025079999999999997,
+ "loss": 0.1645,
+ "step": 575
+ },
+ {
+ "epoch": 3.314285714285714,
+ "grad_norm": 1.417515754699707,
+ "learning_rate": 0.00025037142857142855,
+ "loss": 0.1719,
+ "step": 580
+ },
+ {
+ "epoch": 3.342857142857143,
+ "grad_norm": 4.137150764465332,
+ "learning_rate": 0.00024994285714285714,
+ "loss": 0.1739,
+ "step": 585
+ },
+ {
+ "epoch": 3.3714285714285714,
+ "grad_norm": 2.6067259311676025,
+ "learning_rate": 0.0002495142857142857,
+ "loss": 0.1489,
+ "step": 590
+ },
+ {
+ "epoch": 3.4,
+ "grad_norm": 2.601024627685547,
+ "learning_rate": 0.00024908571428571425,
+ "loss": 0.1618,
+ "step": 595
+ },
+ {
+ "epoch": 3.4285714285714284,
+ "grad_norm": 3.849017858505249,
+ "learning_rate": 0.00024865714285714284,
+ "loss": 0.1899,
+ "step": 600
+ },
+ {
+ "epoch": 3.4571428571428573,
+ "grad_norm": 4.673766136169434,
+ "learning_rate": 0.0002482285714285714,
+ "loss": 0.1761,
+ "step": 605
+ },
+ {
+ "epoch": 3.4857142857142858,
+ "grad_norm": 2.6057631969451904,
+ "learning_rate": 0.00024779999999999995,
+ "loss": 0.1743,
+ "step": 610
+ },
+ {
+ "epoch": 3.5142857142857142,
+ "grad_norm": 2.932652473449707,
+ "learning_rate": 0.0002473714285714286,
+ "loss": 0.1482,
+ "step": 615
+ },
+ {
+ "epoch": 3.5428571428571427,
+ "grad_norm": 0.8764939308166504,
+ "learning_rate": 0.0002469428571428571,
+ "loss": 0.1644,
+ "step": 620
+ },
+ {
+ "epoch": 3.571428571428571,
+ "grad_norm": 1.3203191757202148,
+ "learning_rate": 0.0002465142857142857,
+ "loss": 0.1654,
+ "step": 625
+ },
+ {
+ "epoch": 3.6,
+ "grad_norm": 0.7977635264396667,
+ "learning_rate": 0.0002460857142857143,
+ "loss": 0.1472,
+ "step": 630
+ },
+ {
+ "epoch": 3.6285714285714286,
+ "grad_norm": 1.4750248193740845,
+ "learning_rate": 0.0002456571428571428,
+ "loss": 0.1735,
+ "step": 635
+ },
+ {
+ "epoch": 3.657142857142857,
+ "grad_norm": 1.8164482116699219,
+ "learning_rate": 0.0002452285714285714,
+ "loss": 0.1593,
+ "step": 640
+ },
+ {
+ "epoch": 3.685714285714286,
+ "grad_norm": 1.4829603433609009,
+ "learning_rate": 0.0002448,
+ "loss": 0.1508,
+ "step": 645
+ },
+ {
+ "epoch": 3.7142857142857144,
+ "grad_norm": 0.8828144669532776,
+ "learning_rate": 0.00024437142857142857,
+ "loss": 0.1573,
+ "step": 650
+ },
+ {
+ "epoch": 3.742857142857143,
+ "grad_norm": 2.039384126663208,
+ "learning_rate": 0.00024394285714285713,
+ "loss": 0.1745,
+ "step": 655
+ },
+ {
+ "epoch": 3.7714285714285714,
+ "grad_norm": 0.9604200720787048,
+ "learning_rate": 0.00024351428571428569,
+ "loss": 0.17,
+ "step": 660
+ },
+ {
+ "epoch": 3.8,
+ "grad_norm": 0.7903971076011658,
+ "learning_rate": 0.00024308571428571427,
+ "loss": 0.1654,
+ "step": 665
+ },
+ {
+ "epoch": 3.8285714285714287,
+ "grad_norm": 0.6935649514198303,
+ "learning_rate": 0.00024265714285714283,
+ "loss": 0.1714,
+ "step": 670
+ },
+ {
+ "epoch": 3.857142857142857,
+ "grad_norm": 0.5832012295722961,
+ "learning_rate": 0.00024222857142857138,
+ "loss": 0.1636,
+ "step": 675
+ },
+ {
+ "epoch": 3.8857142857142857,
+ "grad_norm": 0.6303168535232544,
+ "learning_rate": 0.0002418,
+ "loss": 0.1604,
+ "step": 680
+ },
+ {
+ "epoch": 3.914285714285714,
+ "grad_norm": 0.7210885882377625,
+ "learning_rate": 0.00024137142857142855,
+ "loss": 0.1444,
+ "step": 685
+ },
+ {
+ "epoch": 3.942857142857143,
+ "grad_norm": 0.7690990567207336,
+ "learning_rate": 0.00024094285714285714,
+ "loss": 0.1631,
+ "step": 690
+ },
+ {
+ "epoch": 3.9714285714285715,
+ "grad_norm": 1.0142720937728882,
+ "learning_rate": 0.0002405142857142857,
+ "loss": 0.158,
+ "step": 695
+ },
+ {
+ "epoch": 4.0,
+ "grad_norm": 0.7970322966575623,
+ "learning_rate": 0.00024008571428571425,
+ "loss": 0.1803,
+ "step": 700
+ },
+ {
+ "epoch": 4.0285714285714285,
+ "grad_norm": 0.6795914769172668,
+ "learning_rate": 0.00023965714285714284,
+ "loss": 0.143,
+ "step": 705
+ },
+ {
+ "epoch": 4.057142857142857,
+ "grad_norm": 0.6832629442214966,
+ "learning_rate": 0.0002392285714285714,
+ "loss": 0.1457,
+ "step": 710
+ },
+ {
+ "epoch": 4.085714285714285,
+ "grad_norm": 3.8629798889160156,
+ "learning_rate": 0.0002388,
+ "loss": 0.1671,
+ "step": 715
+ },
+ {
+ "epoch": 4.114285714285714,
+ "grad_norm": 1.1167882680892944,
+ "learning_rate": 0.00023837142857142856,
+ "loss": 0.1544,
+ "step": 720
+ },
+ {
+ "epoch": 4.142857142857143,
+ "grad_norm": 0.9431412816047668,
+ "learning_rate": 0.00023794285714285712,
+ "loss": 0.1605,
+ "step": 725
+ },
+ {
+ "epoch": 4.171428571428572,
+ "grad_norm": 1.310948133468628,
+ "learning_rate": 0.0002375142857142857,
+ "loss": 0.1121,
+ "step": 730
+ },
+ {
+ "epoch": 4.2,
+ "grad_norm": 0.9830737709999084,
+ "learning_rate": 0.00023708571428571426,
+ "loss": 0.1742,
+ "step": 735
+ },
+ {
+ "epoch": 4.228571428571429,
+ "grad_norm": 0.6166555881500244,
+ "learning_rate": 0.00023665714285714282,
+ "loss": 0.1525,
+ "step": 740
+ },
+ {
+ "epoch": 4.257142857142857,
+ "grad_norm": 0.995579719543457,
+ "learning_rate": 0.00023622857142857143,
+ "loss": 0.1439,
+ "step": 745
+ },
+ {
+ "epoch": 4.285714285714286,
+ "grad_norm": 0.639796793460846,
+ "learning_rate": 0.00023579999999999999,
+ "loss": 0.1692,
+ "step": 750
+ },
+ {
+ "epoch": 4.314285714285714,
+ "grad_norm": 0.9438050389289856,
+ "learning_rate": 0.00023537142857142854,
+ "loss": 0.1785,
+ "step": 755
+ },
+ {
+ "epoch": 4.3428571428571425,
+ "grad_norm": 0.8960750102996826,
+ "learning_rate": 0.00023494285714285713,
+ "loss": 0.1557,
+ "step": 760
+ },
+ {
+ "epoch": 4.371428571428572,
+ "grad_norm": 0.6287499070167542,
+ "learning_rate": 0.00023451428571428568,
+ "loss": 0.1459,
+ "step": 765
+ },
+ {
+ "epoch": 4.4,
+ "grad_norm": 0.7638295888900757,
+ "learning_rate": 0.00023408571428571424,
+ "loss": 0.1341,
+ "step": 770
+ },
+ {
+ "epoch": 4.428571428571429,
+ "grad_norm": 0.655878484249115,
+ "learning_rate": 0.00023365714285714283,
+ "loss": 0.1358,
+ "step": 775
+ },
+ {
+ "epoch": 4.457142857142857,
+ "grad_norm": 0.5840997695922852,
+ "learning_rate": 0.0002332285714285714,
+ "loss": 0.1386,
+ "step": 780
+ },
+ {
+ "epoch": 4.485714285714286,
+ "grad_norm": 1.1082488298416138,
+ "learning_rate": 0.0002328,
+ "loss": 0.1827,
+ "step": 785
+ },
+ {
+ "epoch": 4.514285714285714,
+ "grad_norm": 0.8825240135192871,
+ "learning_rate": 0.00023237142857142855,
+ "loss": 0.1527,
+ "step": 790
+ },
+ {
+ "epoch": 4.542857142857143,
+ "grad_norm": 0.6752304434776306,
+ "learning_rate": 0.0002319428571428571,
+ "loss": 0.1392,
+ "step": 795
+ },
+ {
+ "epoch": 4.571428571428571,
+ "grad_norm": 1.1423301696777344,
+ "learning_rate": 0.0002315142857142857,
+ "loss": 0.1433,
+ "step": 800
+ },
+ {
+ "epoch": 4.6,
+ "grad_norm": 10.793691635131836,
+ "learning_rate": 0.00023108571428571425,
+ "loss": 0.1635,
+ "step": 805
+ },
+ {
+ "epoch": 4.628571428571428,
+ "grad_norm": 0.47564294934272766,
+ "learning_rate": 0.00023065714285714286,
+ "loss": 0.1199,
+ "step": 810
+ },
+ {
+ "epoch": 4.6571428571428575,
+ "grad_norm": 1.2492656707763672,
+ "learning_rate": 0.00023022857142857142,
+ "loss": 0.1488,
+ "step": 815
+ },
+ {
+ "epoch": 4.685714285714286,
+ "grad_norm": 0.6933501958847046,
+ "learning_rate": 0.00022979999999999997,
+ "loss": 0.1812,
+ "step": 820
+ },
+ {
+ "epoch": 4.714285714285714,
+ "grad_norm": 0.7901633977890015,
+ "learning_rate": 0.00022937142857142856,
+ "loss": 0.1415,
+ "step": 825
+ },
+ {
+ "epoch": 4.742857142857143,
+ "grad_norm": 0.7854829430580139,
+ "learning_rate": 0.00022894285714285712,
+ "loss": 0.1401,
+ "step": 830
+ },
+ {
+ "epoch": 4.771428571428571,
+ "grad_norm": 0.8716740608215332,
+ "learning_rate": 0.00022851428571428567,
+ "loss": 0.1982,
+ "step": 835
+ },
+ {
+ "epoch": 4.8,
+ "grad_norm": 0.7047899961471558,
+ "learning_rate": 0.00022808571428571426,
+ "loss": 0.1624,
+ "step": 840
+ },
+ {
+ "epoch": 4.828571428571428,
+ "grad_norm": 0.7134959697723389,
+ "learning_rate": 0.00022765714285714284,
+ "loss": 0.1375,
+ "step": 845
+ },
+ {
+ "epoch": 4.857142857142857,
+ "grad_norm": 1.0897325277328491,
+ "learning_rate": 0.00022722857142857143,
+ "loss": 0.1489,
+ "step": 850
+ },
+ {
+ "epoch": 4.885714285714286,
+ "grad_norm": 1.1065207719802856,
+ "learning_rate": 0.00022679999999999998,
+ "loss": 0.1495,
+ "step": 855
+ },
+ {
+ "epoch": 4.914285714285715,
+ "grad_norm": 0.7434757351875305,
+ "learning_rate": 0.00022637142857142854,
+ "loss": 0.1507,
+ "step": 860
+ },
+ {
+ "epoch": 4.942857142857143,
+ "grad_norm": 1.0045181512832642,
+ "learning_rate": 0.00022594285714285712,
+ "loss": 0.1527,
+ "step": 865
+ },
+ {
+ "epoch": 4.9714285714285715,
+ "grad_norm": 1.2025654315948486,
+ "learning_rate": 0.00022551428571428568,
+ "loss": 0.1523,
+ "step": 870
+ },
+ {
+ "epoch": 5.0,
+ "grad_norm": 0.7823342084884644,
+ "learning_rate": 0.0002250857142857143,
+ "loss": 0.1514,
+ "step": 875
+ },
+ {
+ "epoch": 5.0285714285714285,
+ "grad_norm": 0.8405362963676453,
+ "learning_rate": 0.00022465714285714285,
+ "loss": 0.1461,
+ "step": 880
+ },
+ {
+ "epoch": 5.057142857142857,
+ "grad_norm": 0.7527463436126709,
+ "learning_rate": 0.0002242285714285714,
+ "loss": 0.1206,
+ "step": 885
+ },
+ {
+ "epoch": 5.085714285714285,
+ "grad_norm": 0.8372548222541809,
+ "learning_rate": 0.0002238,
+ "loss": 0.1513,
+ "step": 890
+ },
+ {
+ "epoch": 5.114285714285714,
+ "grad_norm": 0.8755456209182739,
+ "learning_rate": 0.00022337142857142855,
+ "loss": 0.1498,
+ "step": 895
+ },
+ {
+ "epoch": 5.142857142857143,
+ "grad_norm": 0.7312084436416626,
+ "learning_rate": 0.0002229428571428571,
+ "loss": 0.154,
+ "step": 900
+ },
+ {
+ "epoch": 5.171428571428572,
+ "grad_norm": 0.6366221904754639,
+ "learning_rate": 0.0002225142857142857,
+ "loss": 0.1466,
+ "step": 905
+ },
+ {
+ "epoch": 5.2,
+ "grad_norm": 0.6406880617141724,
+ "learning_rate": 0.00022208571428571427,
+ "loss": 0.1254,
+ "step": 910
+ },
+ {
+ "epoch": 5.228571428571429,
+ "grad_norm": 2.4106833934783936,
+ "learning_rate": 0.00022165714285714283,
+ "loss": 0.1534,
+ "step": 915
+ },
+ {
+ "epoch": 5.257142857142857,
+ "grad_norm": 0.5635722279548645,
+ "learning_rate": 0.00022122857142857142,
+ "loss": 0.1461,
+ "step": 920
+ },
+ {
+ "epoch": 5.285714285714286,
+ "grad_norm": 0.787162184715271,
+ "learning_rate": 0.00022079999999999997,
+ "loss": 0.1424,
+ "step": 925
+ },
+ {
+ "epoch": 5.314285714285714,
+ "grad_norm": 0.6513975262641907,
+ "learning_rate": 0.00022037142857142853,
+ "loss": 0.1326,
+ "step": 930
+ },
+ {
+ "epoch": 5.3428571428571425,
+ "grad_norm": 0.6933534741401672,
+ "learning_rate": 0.00021994285714285711,
+ "loss": 0.1661,
+ "step": 935
+ },
+ {
+ "epoch": 5.371428571428572,
+ "grad_norm": 0.7263259887695312,
+ "learning_rate": 0.0002195142857142857,
+ "loss": 0.15,
+ "step": 940
+ },
+ {
+ "epoch": 5.4,
+ "grad_norm": 0.5537381768226624,
+ "learning_rate": 0.00021908571428571428,
+ "loss": 0.129,
+ "step": 945
+ },
+ {
+ "epoch": 5.428571428571429,
+ "grad_norm": 0.6014005541801453,
+ "learning_rate": 0.00021865714285714284,
+ "loss": 0.1321,
+ "step": 950
+ },
+ {
+ "epoch": 5.457142857142857,
+ "grad_norm": 0.6581441760063171,
+ "learning_rate": 0.0002182285714285714,
+ "loss": 0.1587,
+ "step": 955
+ },
+ {
+ "epoch": 5.485714285714286,
+ "grad_norm": 0.9326379895210266,
+ "learning_rate": 0.00021779999999999998,
+ "loss": 0.1654,
+ "step": 960
+ },
+ {
+ "epoch": 5.514285714285714,
+ "grad_norm": 0.9438592791557312,
+ "learning_rate": 0.00021737142857142854,
+ "loss": 0.1212,
+ "step": 965
+ },
+ {
+ "epoch": 5.542857142857143,
+ "grad_norm": 0.7699571251869202,
+ "learning_rate": 0.00021694285714285715,
+ "loss": 0.1464,
+ "step": 970
+ },
+ {
+ "epoch": 5.571428571428571,
+ "grad_norm": 0.8758366703987122,
+ "learning_rate": 0.0002165142857142857,
+ "loss": 0.1599,
+ "step": 975
+ },
+ {
+ "epoch": 5.6,
+ "grad_norm": 0.6101442575454712,
+ "learning_rate": 0.00021608571428571426,
+ "loss": 0.1589,
+ "step": 980
+ },
+ {
+ "epoch": 5.628571428571428,
+ "grad_norm": 0.7454060912132263,
+ "learning_rate": 0.00021565714285714285,
+ "loss": 0.1433,
+ "step": 985
+ },
+ {
+ "epoch": 5.6571428571428575,
+ "grad_norm": 0.6379484534263611,
+ "learning_rate": 0.0002152285714285714,
+ "loss": 0.1592,
+ "step": 990
+ },
+ {
+ "epoch": 5.685714285714286,
+ "grad_norm": 1.1601309776306152,
+ "learning_rate": 0.00021479999999999996,
+ "loss": 0.1647,
+ "step": 995
+ },
+ {
+ "epoch": 5.714285714285714,
+ "grad_norm": 0.5464673638343811,
+ "learning_rate": 0.00021437142857142855,
+ "loss": 0.1469,
+ "step": 1000
+ },
+ {
+ "epoch": 5.742857142857143,
+ "grad_norm": 1.0279319286346436,
+ "learning_rate": 0.00021394285714285713,
+ "loss": 0.1203,
+ "step": 1005
+ },
+ {
+ "epoch": 5.771428571428571,
+ "grad_norm": 0.5503718256950378,
+ "learning_rate": 0.00021351428571428572,
+ "loss": 0.1409,
+ "step": 1010
+ },
+ {
+ "epoch": 5.8,
+ "grad_norm": 0.6123886108398438,
+ "learning_rate": 0.00021308571428571427,
+ "loss": 0.1427,
+ "step": 1015
+ },
+ {
+ "epoch": 5.828571428571428,
+ "grad_norm": 0.6560390591621399,
+ "learning_rate": 0.00021265714285714283,
+ "loss": 0.1415,
+ "step": 1020
+ },
+ {
+ "epoch": 5.857142857142857,
+ "grad_norm": 0.5576716661453247,
+ "learning_rate": 0.00021222857142857141,
+ "loss": 0.1408,
+ "step": 1025
+ },
+ {
+ "epoch": 5.885714285714286,
+ "grad_norm": 0.6419074535369873,
+ "learning_rate": 0.00021179999999999997,
+ "loss": 0.1385,
+ "step": 1030
+ },
+ {
+ "epoch": 5.914285714285715,
+ "grad_norm": 1.008925199508667,
+ "learning_rate": 0.00021137142857142858,
+ "loss": 0.1497,
+ "step": 1035
+ },
+ {
+ "epoch": 5.942857142857143,
+ "grad_norm": 0.6559906005859375,
+ "learning_rate": 0.00021094285714285714,
+ "loss": 0.1218,
+ "step": 1040
+ },
+ {
+ "epoch": 5.9714285714285715,
+ "grad_norm": 0.627164363861084,
+ "learning_rate": 0.0002105142857142857,
+ "loss": 0.1368,
+ "step": 1045
+ },
+ {
+ "epoch": 6.0,
+ "grad_norm": 0.5760972499847412,
+ "learning_rate": 0.00021008571428571428,
+ "loss": 0.1508,
+ "step": 1050
+ },
+ {
+ "epoch": 6.0285714285714285,
+ "grad_norm": 0.5754174590110779,
+ "learning_rate": 0.00020965714285714284,
+ "loss": 0.1181,
+ "step": 1055
+ },
+ {
+ "epoch": 6.057142857142857,
+ "grad_norm": 0.8736348748207092,
+ "learning_rate": 0.0002092285714285714,
+ "loss": 0.1252,
+ "step": 1060
+ },
+ {
+ "epoch": 6.085714285714285,
+ "grad_norm": 0.7166719436645508,
+ "learning_rate": 0.00020879999999999998,
+ "loss": 0.1481,
+ "step": 1065
+ },
+ {
+ "epoch": 6.114285714285714,
+ "grad_norm": 0.6494349241256714,
+ "learning_rate": 0.00020837142857142856,
+ "loss": 0.1478,
+ "step": 1070
+ },
+ {
+ "epoch": 6.142857142857143,
+ "grad_norm": 0.6681587100028992,
+ "learning_rate": 0.00020794285714285712,
+ "loss": 0.1488,
+ "step": 1075
+ },
+ {
+ "epoch": 6.171428571428572,
+ "grad_norm": 0.7123684883117676,
+ "learning_rate": 0.0002075142857142857,
+ "loss": 0.1378,
+ "step": 1080
+ },
+ {
+ "epoch": 6.2,
+ "grad_norm": 0.6146950721740723,
+ "learning_rate": 0.00020708571428571426,
+ "loss": 0.1306,
+ "step": 1085
+ },
+ {
+ "epoch": 6.228571428571429,
+ "grad_norm": 0.8402445912361145,
+ "learning_rate": 0.00020665714285714282,
+ "loss": 0.1063,
+ "step": 1090
+ },
+ {
+ "epoch": 6.257142857142857,
+ "grad_norm": 0.6567764282226562,
+ "learning_rate": 0.0002062285714285714,
+ "loss": 0.1195,
+ "step": 1095
+ },
+ {
+ "epoch": 6.285714285714286,
+ "grad_norm": 0.6006014943122864,
+ "learning_rate": 0.0002058,
+ "loss": 0.1542,
+ "step": 1100
+ },
+ {
+ "epoch": 6.314285714285714,
+ "grad_norm": 0.793100893497467,
+ "learning_rate": 0.00020537142857142857,
+ "loss": 0.1381,
+ "step": 1105
+ },
+ {
+ "epoch": 6.3428571428571425,
+ "grad_norm": 0.5923666954040527,
+ "learning_rate": 0.00020494285714285713,
+ "loss": 0.1386,
+ "step": 1110
+ },
+ {
+ "epoch": 6.371428571428572,
+ "grad_norm": 0.6692521572113037,
+ "learning_rate": 0.0002045142857142857,
+ "loss": 0.1223,
+ "step": 1115
+ },
+ {
+ "epoch": 6.4,
+ "grad_norm": 0.7216306328773499,
+ "learning_rate": 0.00020408571428571427,
+ "loss": 0.1367,
+ "step": 1120
+ },
+ {
+ "epoch": 6.428571428571429,
+ "grad_norm": 0.5640934109687805,
+ "learning_rate": 0.00020365714285714283,
+ "loss": 0.1554,
+ "step": 1125
+ },
+ {
+ "epoch": 6.457142857142857,
+ "grad_norm": 0.8154368996620178,
+ "learning_rate": 0.00020322857142857138,
+ "loss": 0.1674,
+ "step": 1130
+ },
+ {
+ "epoch": 6.485714285714286,
+ "grad_norm": 0.7185398936271667,
+ "learning_rate": 0.0002028,
+ "loss": 0.1375,
+ "step": 1135
+ },
+ {
+ "epoch": 6.514285714285714,
+ "grad_norm": 0.6805170774459839,
+ "learning_rate": 0.00020237142857142855,
+ "loss": 0.1306,
+ "step": 1140
+ },
+ {
+ "epoch": 6.542857142857143,
+ "grad_norm": 0.5996941924095154,
+ "learning_rate": 0.00020194285714285714,
+ "loss": 0.1433,
+ "step": 1145
+ },
+ {
+ "epoch": 6.571428571428571,
+ "grad_norm": 0.5258373022079468,
+ "learning_rate": 0.0002015142857142857,
+ "loss": 0.1285,
+ "step": 1150
+ },
+ {
+ "epoch": 6.6,
+ "grad_norm": 0.7771695256233215,
+ "learning_rate": 0.00020108571428571425,
+ "loss": 0.1493,
+ "step": 1155
+ },
+ {
+ "epoch": 6.628571428571428,
+ "grad_norm": 0.5920616388320923,
+ "learning_rate": 0.00020065714285714284,
+ "loss": 0.1479,
+ "step": 1160
+ },
+ {
+ "epoch": 6.6571428571428575,
+ "grad_norm": 0.7460982799530029,
+ "learning_rate": 0.00020022857142857142,
+ "loss": 0.1173,
+ "step": 1165
+ },
+ {
+ "epoch": 6.685714285714286,
+ "grad_norm": 1.1703822612762451,
+ "learning_rate": 0.0001998,
+ "loss": 0.1402,
+ "step": 1170
+ },
+ {
+ "epoch": 6.714285714285714,
+ "grad_norm": 0.7894724011421204,
+ "learning_rate": 0.00019937142857142856,
+ "loss": 0.1253,
+ "step": 1175
+ },
+ {
+ "epoch": 6.742857142857143,
+ "grad_norm": 0.7013376355171204,
+ "learning_rate": 0.00019894285714285712,
+ "loss": 0.1573,
+ "step": 1180
+ },
+ {
+ "epoch": 6.771428571428571,
+ "grad_norm": 0.6421737670898438,
+ "learning_rate": 0.0001985142857142857,
+ "loss": 0.1497,
+ "step": 1185
+ },
+ {
+ "epoch": 6.8,
+ "grad_norm": 1.204296350479126,
+ "learning_rate": 0.00019808571428571426,
+ "loss": 0.1634,
+ "step": 1190
+ },
+ {
+ "epoch": 6.828571428571428,
+ "grad_norm": 0.867765486240387,
+ "learning_rate": 0.00019765714285714282,
+ "loss": 0.1353,
+ "step": 1195
+ },
+ {
+ "epoch": 6.857142857142857,
+ "grad_norm": 0.7325594425201416,
+ "learning_rate": 0.00019722857142857143,
+ "loss": 0.118,
+ "step": 1200
+ },
+ {
+ "epoch": 6.885714285714286,
+ "grad_norm": 0.7029078006744385,
+ "learning_rate": 0.00019679999999999999,
+ "loss": 0.1425,
+ "step": 1205
+ },
+ {
+ "epoch": 6.914285714285715,
+ "grad_norm": 1.1572504043579102,
+ "learning_rate": 0.00019637142857142857,
+ "loss": 0.1337,
+ "step": 1210
+ },
+ {
+ "epoch": 6.942857142857143,
+ "grad_norm": 0.8022822141647339,
+ "learning_rate": 0.00019594285714285713,
+ "loss": 0.1684,
+ "step": 1215
+ },
+ {
+ "epoch": 6.9714285714285715,
+ "grad_norm": 0.6729874610900879,
+ "learning_rate": 0.00019551428571428568,
+ "loss": 0.1238,
+ "step": 1220
+ },
+ {
+ "epoch": 7.0,
+ "grad_norm": 0.5773627758026123,
+ "learning_rate": 0.00019508571428571427,
+ "loss": 0.138,
+ "step": 1225
+ },
+ {
+ "epoch": 7.0285714285714285,
+ "grad_norm": 0.7182291150093079,
+ "learning_rate": 0.00019465714285714285,
+ "loss": 0.1431,
+ "step": 1230
+ },
+ {
+ "epoch": 7.057142857142857,
+ "grad_norm": 1.7567912340164185,
+ "learning_rate": 0.0001942285714285714,
+ "loss": 0.1319,
+ "step": 1235
+ },
+ {
+ "epoch": 7.085714285714285,
+ "grad_norm": 0.6845232248306274,
+ "learning_rate": 0.0001938,
+ "loss": 0.1292,
+ "step": 1240
+ },
+ {
+ "epoch": 7.114285714285714,
+ "grad_norm": 0.6077771782875061,
+ "learning_rate": 0.00019337142857142855,
+ "loss": 0.1238,
+ "step": 1245
+ },
+ {
+ "epoch": 7.142857142857143,
+ "grad_norm": 0.6168347597122192,
+ "learning_rate": 0.0001929428571428571,
+ "loss": 0.1384,
+ "step": 1250
+ },
+ {
+ "epoch": 7.171428571428572,
+ "grad_norm": 0.7457576394081116,
+ "learning_rate": 0.0001925142857142857,
+ "loss": 0.1306,
+ "step": 1255
+ },
+ {
+ "epoch": 7.2,
+ "grad_norm": 0.5969316363334656,
+ "learning_rate": 0.00019208571428571425,
+ "loss": 0.1123,
+ "step": 1260
+ },
+ {
+ "epoch": 7.228571428571429,
+ "grad_norm": 0.6902753710746765,
+ "learning_rate": 0.00019165714285714286,
+ "loss": 0.1185,
+ "step": 1265
+ },
+ {
+ "epoch": 7.257142857142857,
+ "grad_norm": 0.6488338112831116,
+ "learning_rate": 0.00019122857142857142,
+ "loss": 0.1431,
+ "step": 1270
+ },
+ {
+ "epoch": 7.285714285714286,
+ "grad_norm": 0.6814819574356079,
+ "learning_rate": 0.00019079999999999998,
+ "loss": 0.1495,
+ "step": 1275
+ },
+ {
+ "epoch": 7.314285714285714,
+ "grad_norm": 0.7468088865280151,
+ "learning_rate": 0.00019037142857142856,
+ "loss": 0.1158,
+ "step": 1280
+ },
+ {
+ "epoch": 7.3428571428571425,
+ "grad_norm": 0.7417412400245667,
+ "learning_rate": 0.00018994285714285712,
+ "loss": 0.1311,
+ "step": 1285
+ },
+ {
+ "epoch": 7.371428571428572,
+ "grad_norm": 0.5480664372444153,
+ "learning_rate": 0.00018951428571428567,
+ "loss": 0.135,
+ "step": 1290
+ },
+ {
+ "epoch": 7.4,
+ "grad_norm": 0.725527822971344,
+ "learning_rate": 0.00018908571428571429,
+ "loss": 0.1217,
+ "step": 1295
+ },
+ {
+ "epoch": 7.428571428571429,
+ "grad_norm": 0.6566678285598755,
+ "learning_rate": 0.00018865714285714284,
+ "loss": 0.1417,
+ "step": 1300
+ },
+ {
+ "epoch": 7.457142857142857,
+ "grad_norm": 0.516952395439148,
+ "learning_rate": 0.00018822857142857143,
+ "loss": 0.1329,
+ "step": 1305
+ },
+ {
+ "epoch": 7.485714285714286,
+ "grad_norm": 1.9545241594314575,
+ "learning_rate": 0.00018779999999999998,
+ "loss": 0.1339,
+ "step": 1310
+ },
+ {
+ "epoch": 7.514285714285714,
+ "grad_norm": 0.8276839852333069,
+ "learning_rate": 0.00018737142857142854,
+ "loss": 0.1324,
+ "step": 1315
+ },
+ {
+ "epoch": 7.542857142857143,
+ "grad_norm": 0.6737099289894104,
+ "learning_rate": 0.00018694285714285713,
+ "loss": 0.1139,
+ "step": 1320
+ },
+ {
+ "epoch": 7.571428571428571,
+ "grad_norm": 0.6914472579956055,
+ "learning_rate": 0.00018651428571428568,
+ "loss": 0.1146,
+ "step": 1325
+ },
+ {
+ "epoch": 7.6,
+ "grad_norm": 0.6630033850669861,
+ "learning_rate": 0.0001860857142857143,
+ "loss": 0.1571,
+ "step": 1330
+ },
+ {
+ "epoch": 7.628571428571428,
+ "grad_norm": 0.820688784122467,
+ "learning_rate": 0.00018565714285714285,
+ "loss": 0.15,
+ "step": 1335
+ },
+ {
+ "epoch": 7.6571428571428575,
+ "grad_norm": 2.0491325855255127,
+ "learning_rate": 0.0001852285714285714,
+ "loss": 0.127,
+ "step": 1340
+ },
+ {
+ "epoch": 7.685714285714286,
+ "grad_norm": 0.9327268004417419,
+ "learning_rate": 0.0001848,
+ "loss": 0.1289,
+ "step": 1345
+ },
+ {
+ "epoch": 7.714285714285714,
+ "grad_norm": 1.3131701946258545,
+ "learning_rate": 0.00018437142857142855,
+ "loss": 0.1228,
+ "step": 1350
+ },
+ {
+ "epoch": 7.742857142857143,
+ "grad_norm": 2.955918312072754,
+ "learning_rate": 0.0001839428571428571,
+ "loss": 0.1082,
+ "step": 1355
+ },
+ {
+ "epoch": 7.771428571428571,
+ "grad_norm": 1.2165493965148926,
+ "learning_rate": 0.00018351428571428572,
+ "loss": 0.1688,
+ "step": 1360
+ },
+ {
+ "epoch": 7.8,
+ "grad_norm": 0.759324312210083,
+ "learning_rate": 0.00018308571428571428,
+ "loss": 0.1185,
+ "step": 1365
+ },
+ {
+ "epoch": 7.828571428571428,
+ "grad_norm": 0.7445591688156128,
+ "learning_rate": 0.00018265714285714286,
+ "loss": 0.1431,
+ "step": 1370
+ },
+ {
+ "epoch": 7.857142857142857,
+ "grad_norm": 0.679374098777771,
+ "learning_rate": 0.00018222857142857142,
+ "loss": 0.1451,
+ "step": 1375
+ },
+ {
+ "epoch": 7.885714285714286,
+ "grad_norm": 2.1234302520751953,
+ "learning_rate": 0.00018179999999999997,
+ "loss": 0.1265,
+ "step": 1380
+ },
+ {
+ "epoch": 7.914285714285715,
+ "grad_norm": 1.006521224975586,
+ "learning_rate": 0.00018137142857142856,
+ "loss": 0.1722,
+ "step": 1385
+ },
+ {
+ "epoch": 7.942857142857143,
+ "grad_norm": 0.7275253534317017,
+ "learning_rate": 0.00018094285714285712,
+ "loss": 0.1625,
+ "step": 1390
+ },
+ {
+ "epoch": 7.9714285714285715,
+ "grad_norm": 0.8612022995948792,
+ "learning_rate": 0.0001805142857142857,
+ "loss": 0.1345,
+ "step": 1395
+ },
+ {
+ "epoch": 8.0,
+ "grad_norm": 0.7276798486709595,
+ "learning_rate": 0.00018008571428571428,
+ "loss": 0.1236,
+ "step": 1400
+ },
+ {
+ "epoch": 8.028571428571428,
+ "grad_norm": 0.8731086850166321,
+ "learning_rate": 0.00017965714285714284,
+ "loss": 0.1604,
+ "step": 1405
+ },
+ {
+ "epoch": 8.057142857142857,
+ "grad_norm": 0.8950818777084351,
+ "learning_rate": 0.0001792285714285714,
+ "loss": 0.1531,
+ "step": 1410
+ },
+ {
+ "epoch": 8.085714285714285,
+ "grad_norm": 0.7399356365203857,
+ "learning_rate": 0.00017879999999999998,
+ "loss": 0.1508,
+ "step": 1415
+ },
+ {
+ "epoch": 8.114285714285714,
+ "grad_norm": 1.3727307319641113,
+ "learning_rate": 0.00017837142857142854,
+ "loss": 0.1487,
+ "step": 1420
+ },
+ {
+ "epoch": 8.142857142857142,
+ "grad_norm": 0.5938125848770142,
+ "learning_rate": 0.00017794285714285715,
+ "loss": 0.1303,
+ "step": 1425
+ },
+ {
+ "epoch": 8.17142857142857,
+ "grad_norm": 0.7043821811676025,
+ "learning_rate": 0.0001775142857142857,
+ "loss": 0.0948,
+ "step": 1430
+ },
+ {
+ "epoch": 8.2,
+ "grad_norm": 1.1062767505645752,
+ "learning_rate": 0.00017708571428571426,
+ "loss": 0.1412,
+ "step": 1435
+ },
+ {
+ "epoch": 8.228571428571428,
+ "grad_norm": 0.844832181930542,
+ "learning_rate": 0.00017665714285714285,
+ "loss": 0.113,
+ "step": 1440
+ },
+ {
+ "epoch": 8.257142857142856,
+ "grad_norm": 0.7564154863357544,
+ "learning_rate": 0.0001762285714285714,
+ "loss": 0.1319,
+ "step": 1445
+ },
+ {
+ "epoch": 8.285714285714286,
+ "grad_norm": 0.8843110203742981,
+ "learning_rate": 0.00017579999999999996,
+ "loss": 0.1206,
+ "step": 1450
+ },
+ {
+ "epoch": 8.314285714285715,
+ "grad_norm": 0.8175828456878662,
+ "learning_rate": 0.00017537142857142855,
+ "loss": 0.1327,
+ "step": 1455
+ },
+ {
+ "epoch": 8.342857142857143,
+ "grad_norm": 0.6443565487861633,
+ "learning_rate": 0.00017494285714285713,
+ "loss": 0.1239,
+ "step": 1460
+ },
+ {
+ "epoch": 8.371428571428572,
+ "grad_norm": 0.7237185835838318,
+ "learning_rate": 0.00017451428571428572,
+ "loss": 0.1639,
+ "step": 1465
+ },
+ {
+ "epoch": 8.4,
+ "grad_norm": 0.6118057370185852,
+ "learning_rate": 0.00017408571428571427,
+ "loss": 0.1363,
+ "step": 1470
+ },
+ {
+ "epoch": 8.428571428571429,
+ "grad_norm": 0.6754649877548218,
+ "learning_rate": 0.00017365714285714283,
+ "loss": 0.1187,
+ "step": 1475
+ },
+ {
+ "epoch": 8.457142857142857,
+ "grad_norm": 1.0067390203475952,
+ "learning_rate": 0.00017322857142857141,
+ "loss": 0.1401,
+ "step": 1480
+ },
+ {
+ "epoch": 8.485714285714286,
+ "grad_norm": 8.509544372558594,
+ "learning_rate": 0.00017279999999999997,
+ "loss": 0.1304,
+ "step": 1485
+ },
+ {
+ "epoch": 8.514285714285714,
+ "grad_norm": 4.2030205726623535,
+ "learning_rate": 0.00017237142857142858,
+ "loss": 0.121,
+ "step": 1490
+ },
+ {
+ "epoch": 8.542857142857143,
+ "grad_norm": 4.877438068389893,
+ "learning_rate": 0.00017194285714285714,
+ "loss": 0.1918,
+ "step": 1495
+ },
+ {
+ "epoch": 8.571428571428571,
+ "grad_norm": 6.4971232414245605,
+ "learning_rate": 0.0001715142857142857,
+ "loss": 0.2154,
+ "step": 1500
+ },
+ {
+ "epoch": 8.6,
+ "grad_norm": 4.365469932556152,
+ "learning_rate": 0.00017108571428571428,
+ "loss": 0.2272,
+ "step": 1505
+ },
+ {
+ "epoch": 8.628571428571428,
+ "grad_norm": 2.551957845687866,
+ "learning_rate": 0.00017065714285714284,
+ "loss": 0.2163,
+ "step": 1510
+ },
+ {
+ "epoch": 8.657142857142857,
+ "grad_norm": 5.326391220092773,
+ "learning_rate": 0.0001702285714285714,
+ "loss": 0.1612,
+ "step": 1515
+ },
+ {
+ "epoch": 8.685714285714285,
+ "grad_norm": 1.3528404235839844,
+ "learning_rate": 0.00016979999999999998,
+ "loss": 0.1636,
+ "step": 1520
+ },
+ {
+ "epoch": 8.714285714285714,
+ "grad_norm": 1.4466065168380737,
+ "learning_rate": 0.00016937142857142856,
+ "loss": 0.1295,
+ "step": 1525
+ },
+ {
+ "epoch": 8.742857142857144,
+ "grad_norm": 0.6576040387153625,
+ "learning_rate": 0.00016894285714285715,
+ "loss": 0.1318,
+ "step": 1530
+ },
+ {
+ "epoch": 8.771428571428572,
+ "grad_norm": 1.286942958831787,
+ "learning_rate": 0.0001685142857142857,
+ "loss": 0.1443,
+ "step": 1535
+ },
+ {
+ "epoch": 8.8,
+ "grad_norm": 9.474458694458008,
+ "learning_rate": 0.00016808571428571426,
+ "loss": 0.1313,
+ "step": 1540
+ },
+ {
+ "epoch": 8.82857142857143,
+ "grad_norm": 2.6731069087982178,
+ "learning_rate": 0.00016765714285714285,
+ "loss": 0.1485,
+ "step": 1545
+ },
+ {
+ "epoch": 8.857142857142858,
+ "grad_norm": 1.313723087310791,
+ "learning_rate": 0.0001672285714285714,
+ "loss": 0.1346,
+ "step": 1550
+ },
+ {
+ "epoch": 8.885714285714286,
+ "grad_norm": 1.7115576267242432,
+ "learning_rate": 0.0001668,
+ "loss": 0.1471,
+ "step": 1555
+ },
+ {
+ "epoch": 8.914285714285715,
+ "grad_norm": 1.2599923610687256,
+ "learning_rate": 0.00016637142857142857,
+ "loss": 0.1433,
+ "step": 1560
+ },
+ {
+ "epoch": 8.942857142857143,
+ "grad_norm": 0.9659029245376587,
+ "learning_rate": 0.00016594285714285713,
+ "loss": 0.1256,
+ "step": 1565
+ },
+ {
+ "epoch": 8.971428571428572,
+ "grad_norm": 1.1282744407653809,
+ "learning_rate": 0.0001655142857142857,
+ "loss": 0.1373,
+ "step": 1570
+ },
+ {
+ "epoch": 9.0,
+ "grad_norm": 3.20717453956604,
+ "learning_rate": 0.00016508571428571427,
+ "loss": 0.1355,
+ "step": 1575
+ },
+ {
+ "epoch": 9.028571428571428,
+ "grad_norm": 0.8310821056365967,
+ "learning_rate": 0.00016465714285714283,
+ "loss": 0.1268,
+ "step": 1580
+ },
+ {
+ "epoch": 9.057142857142857,
+ "grad_norm": 1.5337790250778198,
+ "learning_rate": 0.00016422857142857139,
+ "loss": 0.1267,
+ "step": 1585
+ },
+ {
+ "epoch": 9.085714285714285,
+ "grad_norm": 2.6406068801879883,
+ "learning_rate": 0.0001638,
+ "loss": 0.1363,
+ "step": 1590
+ },
+ {
+ "epoch": 9.114285714285714,
+ "grad_norm": 0.7705873847007751,
+ "learning_rate": 0.00016337142857142855,
+ "loss": 0.1291,
+ "step": 1595
+ },
+ {
+ "epoch": 9.142857142857142,
+ "grad_norm": 0.7092650532722473,
+ "learning_rate": 0.00016294285714285714,
+ "loss": 0.1435,
+ "step": 1600
+ },
+ {
+ "epoch": 9.17142857142857,
+ "grad_norm": 1.098961591720581,
+ "learning_rate": 0.0001625142857142857,
+ "loss": 0.1471,
+ "step": 1605
+ },
+ {
+ "epoch": 9.2,
+ "grad_norm": 0.6994885206222534,
+ "learning_rate": 0.00016208571428571425,
+ "loss": 0.1345,
+ "step": 1610
+ },
+ {
+ "epoch": 9.228571428571428,
+ "grad_norm": 0.9613476991653442,
+ "learning_rate": 0.00016165714285714284,
+ "loss": 0.1399,
+ "step": 1615
+ },
+ {
+ "epoch": 9.257142857142856,
+ "grad_norm": 0.675588846206665,
+ "learning_rate": 0.00016122857142857142,
+ "loss": 0.1319,
+ "step": 1620
+ },
+ {
+ "epoch": 9.285714285714286,
+ "grad_norm": 0.7519372701644897,
+ "learning_rate": 0.0001608,
+ "loss": 0.137,
+ "step": 1625
+ },
+ {
+ "epoch": 9.314285714285715,
+ "grad_norm": 1.135025978088379,
+ "learning_rate": 0.00016037142857142856,
+ "loss": 0.1322,
+ "step": 1630
+ },
+ {
+ "epoch": 9.342857142857143,
+ "grad_norm": 0.7462936639785767,
+ "learning_rate": 0.00015994285714285712,
+ "loss": 0.1215,
+ "step": 1635
+ },
+ {
+ "epoch": 9.371428571428572,
+ "grad_norm": 0.9042088985443115,
+ "learning_rate": 0.0001595142857142857,
+ "loss": 0.1191,
+ "step": 1640
+ },
+ {
+ "epoch": 9.4,
+ "grad_norm": 0.567828893661499,
+ "learning_rate": 0.00015908571428571426,
+ "loss": 0.1189,
+ "step": 1645
+ },
+ {
+ "epoch": 9.428571428571429,
+ "grad_norm": 0.981585681438446,
+ "learning_rate": 0.00015865714285714282,
+ "loss": 0.128,
+ "step": 1650
+ },
+ {
+ "epoch": 9.457142857142857,
+ "grad_norm": 1.24985933303833,
+ "learning_rate": 0.00015822857142857143,
+ "loss": 0.1315,
+ "step": 1655
+ },
+ {
+ "epoch": 9.485714285714286,
+ "grad_norm": 0.6517993211746216,
+ "learning_rate": 0.0001578,
+ "loss": 0.1076,
+ "step": 1660
+ },
+ {
+ "epoch": 9.514285714285714,
+ "grad_norm": 1.166628122329712,
+ "learning_rate": 0.00015737142857142857,
+ "loss": 0.1345,
+ "step": 1665
+ },
+ {
+ "epoch": 9.542857142857143,
+ "grad_norm": 0.9763592481613159,
+ "learning_rate": 0.00015694285714285713,
+ "loss": 0.1449,
+ "step": 1670
+ },
+ {
+ "epoch": 9.571428571428571,
+ "grad_norm": 0.7829060554504395,
+ "learning_rate": 0.00015651428571428569,
+ "loss": 0.1117,
+ "step": 1675
+ },
+ {
+ "epoch": 9.6,
+ "grad_norm": 0.6693719029426575,
+ "learning_rate": 0.00015608571428571427,
+ "loss": 0.1129,
+ "step": 1680
+ },
+ {
+ "epoch": 9.628571428571428,
+ "grad_norm": 1.2122846841812134,
+ "learning_rate": 0.00015565714285714285,
+ "loss": 0.1125,
+ "step": 1685
+ },
+ {
+ "epoch": 9.657142857142857,
+ "grad_norm": 1.0689371824264526,
+ "learning_rate": 0.0001552285714285714,
+ "loss": 0.1478,
+ "step": 1690
+ },
+ {
+ "epoch": 9.685714285714285,
+ "grad_norm": 1.8511656522750854,
+ "learning_rate": 0.0001548,
+ "loss": 0.1431,
+ "step": 1695
+ },
+ {
+ "epoch": 9.714285714285714,
+ "grad_norm": 0.6706506609916687,
+ "learning_rate": 0.00015437142857142855,
+ "loss": 0.1262,
+ "step": 1700
+ },
+ {
+ "epoch": 9.742857142857144,
+ "grad_norm": 1.0798784494400024,
+ "learning_rate": 0.00015394285714285714,
+ "loss": 0.1275,
+ "step": 1705
+ },
+ {
+ "epoch": 9.771428571428572,
+ "grad_norm": 0.7915983200073242,
+ "learning_rate": 0.0001535142857142857,
+ "loss": 0.1316,
+ "step": 1710
+ },
+ {
+ "epoch": 9.8,
+ "grad_norm": 1.8630567789077759,
+ "learning_rate": 0.00015308571428571425,
+ "loss": 0.1258,
+ "step": 1715
+ },
+ {
+ "epoch": 9.82857142857143,
+ "grad_norm": 0.7807756662368774,
+ "learning_rate": 0.00015265714285714286,
+ "loss": 0.1079,
+ "step": 1720
+ },
+ {
+ "epoch": 9.857142857142858,
+ "grad_norm": 1.4698439836502075,
+ "learning_rate": 0.00015222857142857142,
+ "loss": 0.1357,
+ "step": 1725
+ },
+ {
+ "epoch": 9.885714285714286,
+ "grad_norm": 1.2121926546096802,
+ "learning_rate": 0.00015179999999999998,
+ "loss": 0.1322,
+ "step": 1730
+ },
+ {
+ "epoch": 9.914285714285715,
+ "grad_norm": 0.6348568201065063,
+ "learning_rate": 0.00015137142857142856,
+ "loss": 0.0893,
+ "step": 1735
+ },
+ {
+ "epoch": 9.942857142857143,
+ "grad_norm": 0.6694422364234924,
+ "learning_rate": 0.00015094285714285712,
+ "loss": 0.1189,
+ "step": 1740
+ },
+ {
+ "epoch": 9.971428571428572,
+ "grad_norm": 0.569332480430603,
+ "learning_rate": 0.00015051428571428567,
+ "loss": 0.1349,
+ "step": 1745
+ },
+ {
+ "epoch": 10.0,
+ "grad_norm": 0.934073269367218,
+ "learning_rate": 0.00015008571428571429,
+ "loss": 0.1237,
+ "step": 1750
+ },
+ {
+ "epoch": 10.028571428571428,
+ "grad_norm": 0.7191672325134277,
+ "learning_rate": 0.00014965714285714284,
+ "loss": 0.1308,
+ "step": 1755
+ },
+ {
+ "epoch": 10.057142857142857,
+ "grad_norm": 0.7006493806838989,
+ "learning_rate": 0.00014922857142857143,
+ "loss": 0.104,
+ "step": 1760
+ },
+ {
+ "epoch": 10.085714285714285,
+ "grad_norm": 0.9030678272247314,
+ "learning_rate": 0.00014879999999999998,
+ "loss": 0.1308,
+ "step": 1765
+ },
+ {
+ "epoch": 10.114285714285714,
+ "grad_norm": 0.7007766366004944,
+ "learning_rate": 0.00014837142857142854,
+ "loss": 0.1044,
+ "step": 1770
+ },
+ {
+ "epoch": 10.142857142857142,
+ "grad_norm": 0.4832770824432373,
+ "learning_rate": 0.00014794285714285713,
+ "loss": 0.1119,
+ "step": 1775
+ },
+ {
+ "epoch": 10.17142857142857,
+ "grad_norm": 0.7819458842277527,
+ "learning_rate": 0.0001475142857142857,
+ "loss": 0.1087,
+ "step": 1780
+ },
+ {
+ "epoch": 10.2,
+ "grad_norm": 1.0223525762557983,
+ "learning_rate": 0.00014708571428571427,
+ "loss": 0.1314,
+ "step": 1785
+ },
+ {
+ "epoch": 10.228571428571428,
+ "grad_norm": 0.6224566698074341,
+ "learning_rate": 0.00014665714285714285,
+ "loss": 0.1159,
+ "step": 1790
+ },
+ {
+ "epoch": 10.257142857142856,
+ "grad_norm": 0.45800235867500305,
+ "learning_rate": 0.0001462285714285714,
+ "loss": 0.0942,
+ "step": 1795
+ },
+ {
+ "epoch": 10.285714285714286,
+ "grad_norm": 0.6258400082588196,
+ "learning_rate": 0.0001458,
+ "loss": 0.1079,
+ "step": 1800
+ },
+ {
+ "epoch": 10.314285714285715,
+ "grad_norm": 1.1812794208526611,
+ "learning_rate": 0.00014537142857142858,
+ "loss": 0.1378,
+ "step": 1805
+ },
+ {
+ "epoch": 10.342857142857143,
+ "grad_norm": 0.8541269898414612,
+ "learning_rate": 0.00014494285714285713,
+ "loss": 0.1274,
+ "step": 1810
+ },
+ {
+ "epoch": 10.371428571428572,
+ "grad_norm": 0.7131860256195068,
+ "learning_rate": 0.0001445142857142857,
+ "loss": 0.1247,
+ "step": 1815
+ },
+ {
+ "epoch": 10.4,
+ "grad_norm": 0.6109820008277893,
+ "learning_rate": 0.00014408571428571428,
+ "loss": 0.1246,
+ "step": 1820
+ },
+ {
+ "epoch": 10.428571428571429,
+ "grad_norm": 0.5621510744094849,
+ "learning_rate": 0.00014365714285714286,
+ "loss": 0.1039,
+ "step": 1825
+ },
+ {
+ "epoch": 10.457142857142857,
+ "grad_norm": 1.022777795791626,
+ "learning_rate": 0.00014322857142857142,
+ "loss": 0.1206,
+ "step": 1830
+ },
+ {
+ "epoch": 10.485714285714286,
+ "grad_norm": 0.9120668768882751,
+ "learning_rate": 0.00014279999999999997,
+ "loss": 0.1289,
+ "step": 1835
+ },
+ {
+ "epoch": 10.514285714285714,
+ "grad_norm": 1.1882030963897705,
+ "learning_rate": 0.00014237142857142856,
+ "loss": 0.1194,
+ "step": 1840
+ },
+ {
+ "epoch": 10.542857142857143,
+ "grad_norm": 0.6078401207923889,
+ "learning_rate": 0.00014194285714285714,
+ "loss": 0.1339,
+ "step": 1845
+ },
+ {
+ "epoch": 10.571428571428571,
+ "grad_norm": 0.7380999326705933,
+ "learning_rate": 0.0001415142857142857,
+ "loss": 0.1318,
+ "step": 1850
+ },
+ {
+ "epoch": 10.6,
+ "grad_norm": 0.5884959101676941,
+ "learning_rate": 0.00014108571428571428,
+ "loss": 0.1249,
+ "step": 1855
+ },
+ {
+ "epoch": 10.628571428571428,
+ "grad_norm": 1.0121936798095703,
+ "learning_rate": 0.00014065714285714284,
+ "loss": 0.1137,
+ "step": 1860
+ },
+ {
+ "epoch": 10.657142857142857,
+ "grad_norm": 0.6444916129112244,
+ "learning_rate": 0.00014022857142857143,
+ "loss": 0.1213,
+ "step": 1865
+ },
+ {
+ "epoch": 10.685714285714285,
+ "grad_norm": 0.7931004762649536,
+ "learning_rate": 0.00013979999999999998,
+ "loss": 0.1318,
+ "step": 1870
+ },
+ {
+ "epoch": 10.714285714285714,
+ "grad_norm": 0.5596404075622559,
+ "learning_rate": 0.00013937142857142857,
+ "loss": 0.1075,
+ "step": 1875
+ },
+ {
+ "epoch": 10.742857142857144,
+ "grad_norm": 0.6586474180221558,
+ "learning_rate": 0.00013894285714285712,
+ "loss": 0.13,
+ "step": 1880
+ },
+ {
+ "epoch": 10.771428571428572,
+ "grad_norm": 1.0195013284683228,
+ "learning_rate": 0.00013851428571428568,
+ "loss": 0.1373,
+ "step": 1885
+ },
+ {
+ "epoch": 10.8,
+ "grad_norm": 0.9233512878417969,
+ "learning_rate": 0.00013808571428571427,
+ "loss": 0.1168,
+ "step": 1890
+ },
+ {
+ "epoch": 10.82857142857143,
+ "grad_norm": 0.7154092788696289,
+ "learning_rate": 0.00013765714285714285,
+ "loss": 0.1081,
+ "step": 1895
+ },
+ {
+ "epoch": 10.857142857142858,
+ "grad_norm": 1.4588117599487305,
+ "learning_rate": 0.0001372285714285714,
+ "loss": 0.1061,
+ "step": 1900
+ },
+ {
+ "epoch": 10.885714285714286,
+ "grad_norm": 0.6087035536766052,
+ "learning_rate": 0.0001368,
+ "loss": 0.1157,
+ "step": 1905
+ },
+ {
+ "epoch": 10.914285714285715,
+ "grad_norm": 0.7371247410774231,
+ "learning_rate": 0.00013637142857142855,
+ "loss": 0.1339,
+ "step": 1910
+ },
+ {
+ "epoch": 10.942857142857143,
+ "grad_norm": 0.8253212571144104,
+ "learning_rate": 0.00013594285714285713,
+ "loss": 0.1198,
+ "step": 1915
+ },
+ {
+ "epoch": 10.971428571428572,
+ "grad_norm": 0.6889544129371643,
+ "learning_rate": 0.00013551428571428572,
+ "loss": 0.1131,
+ "step": 1920
+ },
+ {
+ "epoch": 11.0,
+ "grad_norm": 0.6408224105834961,
+ "learning_rate": 0.00013508571428571427,
+ "loss": 0.122,
+ "step": 1925
+ },
+ {
+ "epoch": 11.028571428571428,
+ "grad_norm": 0.6771185398101807,
+ "learning_rate": 0.00013465714285714283,
+ "loss": 0.1492,
+ "step": 1930
+ },
+ {
+ "epoch": 11.057142857142857,
+ "grad_norm": 0.8706450462341309,
+ "learning_rate": 0.00013422857142857142,
+ "loss": 0.1294,
+ "step": 1935
+ },
+ {
+ "epoch": 11.085714285714285,
+ "grad_norm": 1.730648398399353,
+ "learning_rate": 0.0001338,
+ "loss": 0.1004,
+ "step": 1940
+ },
+ {
+ "epoch": 11.114285714285714,
+ "grad_norm": 0.6985113620758057,
+ "learning_rate": 0.00013337142857142856,
+ "loss": 0.0995,
+ "step": 1945
+ },
+ {
+ "epoch": 11.142857142857142,
+ "grad_norm": 0.8901951313018799,
+ "learning_rate": 0.00013294285714285711,
+ "loss": 0.1179,
+ "step": 1950
+ },
+ {
+ "epoch": 11.17142857142857,
+ "grad_norm": 0.7232164144515991,
+ "learning_rate": 0.0001325142857142857,
+ "loss": 0.1397,
+ "step": 1955
+ },
+ {
+ "epoch": 11.2,
+ "grad_norm": 0.6447544693946838,
+ "learning_rate": 0.00013208571428571428,
+ "loss": 0.1366,
+ "step": 1960
+ },
+ {
+ "epoch": 11.228571428571428,
+ "grad_norm": 0.7964944243431091,
+ "learning_rate": 0.00013165714285714284,
+ "loss": 0.1121,
+ "step": 1965
+ },
+ {
+ "epoch": 11.257142857142856,
+ "grad_norm": 0.9012628793716431,
+ "learning_rate": 0.00013122857142857142,
+ "loss": 0.1131,
+ "step": 1970
+ },
+ {
+ "epoch": 11.285714285714286,
+ "grad_norm": 0.9295369982719421,
+ "learning_rate": 0.00013079999999999998,
+ "loss": 0.1232,
+ "step": 1975
+ },
+ {
+ "epoch": 11.314285714285715,
+ "grad_norm": 0.6237708926200867,
+ "learning_rate": 0.00013037142857142857,
+ "loss": 0.1066,
+ "step": 1980
+ },
+ {
+ "epoch": 11.342857142857143,
+ "grad_norm": 0.5250967741012573,
+ "learning_rate": 0.00012994285714285715,
+ "loss": 0.118,
+ "step": 1985
+ },
+ {
+ "epoch": 11.371428571428572,
+ "grad_norm": 1.0013964176177979,
+ "learning_rate": 0.0001295142857142857,
+ "loss": 0.1125,
+ "step": 1990
+ },
+ {
+ "epoch": 11.4,
+ "grad_norm": 0.6721311807632446,
+ "learning_rate": 0.00012908571428571426,
+ "loss": 0.1196,
+ "step": 1995
+ },
+ {
+ "epoch": 11.428571428571429,
+ "grad_norm": 0.6966421008110046,
+ "learning_rate": 0.00012865714285714285,
+ "loss": 0.1172,
+ "step": 2000
+ },
+ {
+ "epoch": 11.457142857142857,
+ "grad_norm": 0.8811460733413696,
+ "learning_rate": 0.00012822857142857143,
+ "loss": 0.135,
+ "step": 2005
+ },
+ {
+ "epoch": 11.485714285714286,
+ "grad_norm": 0.8829531073570251,
+ "learning_rate": 0.0001278,
+ "loss": 0.1288,
+ "step": 2010
+ },
+ {
+ "epoch": 11.514285714285714,
+ "grad_norm": 0.7530654668807983,
+ "learning_rate": 0.00012737142857142855,
+ "loss": 0.1073,
+ "step": 2015
+ },
+ {
+ "epoch": 11.542857142857143,
+ "grad_norm": 0.513940691947937,
+ "learning_rate": 0.00012694285714285713,
+ "loss": 0.121,
+ "step": 2020
+ },
+ {
+ "epoch": 11.571428571428571,
+ "grad_norm": 0.8574968576431274,
+ "learning_rate": 0.0001265142857142857,
+ "loss": 0.1103,
+ "step": 2025
+ },
+ {
+ "epoch": 11.6,
+ "grad_norm": 0.7482439875602722,
+ "learning_rate": 0.00012608571428571427,
+ "loss": 0.1027,
+ "step": 2030
+ },
+ {
+ "epoch": 11.628571428571428,
+ "grad_norm": 0.8367976546287537,
+ "learning_rate": 0.00012565714285714286,
+ "loss": 0.1181,
+ "step": 2035
+ },
+ {
+ "epoch": 11.657142857142857,
+ "grad_norm": 2.048128366470337,
+ "learning_rate": 0.0001252285714285714,
+ "loss": 0.1122,
+ "step": 2040
+ },
+ {
+ "epoch": 11.685714285714285,
+ "grad_norm": 0.7426862716674805,
+ "learning_rate": 0.00012479999999999997,
+ "loss": 0.1169,
+ "step": 2045
+ },
+ {
+ "epoch": 11.714285714285714,
+ "grad_norm": 3.093841791152954,
+ "learning_rate": 0.00012437142857142855,
+ "loss": 0.1164,
+ "step": 2050
+ },
+ {
+ "epoch": 11.742857142857144,
+ "grad_norm": 0.8172643184661865,
+ "learning_rate": 0.00012394285714285714,
+ "loss": 0.1354,
+ "step": 2055
+ },
+ {
+ "epoch": 11.771428571428572,
+ "grad_norm": 1.9950591325759888,
+ "learning_rate": 0.0001235142857142857,
+ "loss": 0.1037,
+ "step": 2060
+ },
+ {
+ "epoch": 11.8,
+ "grad_norm": 0.5929077863693237,
+ "learning_rate": 0.00012308571428571428,
+ "loss": 0.1194,
+ "step": 2065
+ },
+ {
+ "epoch": 11.82857142857143,
+ "grad_norm": 1.293624997138977,
+ "learning_rate": 0.00012265714285714284,
+ "loss": 0.12,
+ "step": 2070
+ },
+ {
+ "epoch": 11.857142857142858,
+ "grad_norm": 1.0515168905258179,
+ "learning_rate": 0.00012222857142857142,
+ "loss": 0.1049,
+ "step": 2075
+ },
+ {
+ "epoch": 11.885714285714286,
+ "grad_norm": 1.2874428033828735,
+ "learning_rate": 0.00012179999999999999,
+ "loss": 0.115,
+ "step": 2080
+ },
+ {
+ "epoch": 11.914285714285715,
+ "grad_norm": 0.7317278385162354,
+ "learning_rate": 0.00012137142857142856,
+ "loss": 0.1184,
+ "step": 2085
+ },
+ {
+ "epoch": 11.942857142857143,
+ "grad_norm": 1.3407148122787476,
+ "learning_rate": 0.00012094285714285713,
+ "loss": 0.132,
+ "step": 2090
+ },
+ {
+ "epoch": 11.971428571428572,
+ "grad_norm": 2.656409502029419,
+ "learning_rate": 0.00012051428571428569,
+ "loss": 0.1359,
+ "step": 2095
+ },
+ {
+ "epoch": 12.0,
+ "grad_norm": 0.7189064025878906,
+ "learning_rate": 0.00012008571428571428,
+ "loss": 0.1217,
+ "step": 2100
+ },
+ {
+ "epoch": 12.028571428571428,
+ "grad_norm": 0.7510334849357605,
+ "learning_rate": 0.00011965714285714285,
+ "loss": 0.109,
+ "step": 2105
+ },
+ {
+ "epoch": 12.057142857142857,
+ "grad_norm": 0.7235113382339478,
+ "learning_rate": 0.00011922857142857142,
+ "loss": 0.1114,
+ "step": 2110
+ },
+ {
+ "epoch": 12.085714285714285,
+ "grad_norm": 1.7435882091522217,
+ "learning_rate": 0.0001188,
+ "loss": 0.1357,
+ "step": 2115
+ },
+ {
+ "epoch": 12.114285714285714,
+ "grad_norm": 1.170392632484436,
+ "learning_rate": 0.00011837142857142856,
+ "loss": 0.1255,
+ "step": 2120
+ },
+ {
+ "epoch": 12.142857142857142,
+ "grad_norm": 0.6476783752441406,
+ "learning_rate": 0.00011794285714285713,
+ "loss": 0.1108,
+ "step": 2125
+ },
+ {
+ "epoch": 12.17142857142857,
+ "grad_norm": 0.8599929213523865,
+ "learning_rate": 0.00011751428571428571,
+ "loss": 0.0997,
+ "step": 2130
+ },
+ {
+ "epoch": 12.2,
+ "grad_norm": 0.8918687105178833,
+ "learning_rate": 0.00011708571428571428,
+ "loss": 0.1149,
+ "step": 2135
+ },
+ {
+ "epoch": 12.228571428571428,
+ "grad_norm": 1.609435796737671,
+ "learning_rate": 0.00011665714285714284,
+ "loss": 0.1136,
+ "step": 2140
+ },
+ {
+ "epoch": 12.257142857142856,
+ "grad_norm": 0.6206801533699036,
+ "learning_rate": 0.00011622857142857143,
+ "loss": 0.1135,
+ "step": 2145
+ },
+ {
+ "epoch": 12.285714285714286,
+ "grad_norm": 0.8769077658653259,
+ "learning_rate": 0.0001158,
+ "loss": 0.1344,
+ "step": 2150
+ },
+ {
+ "epoch": 12.314285714285715,
+ "grad_norm": 0.6279401183128357,
+ "learning_rate": 0.00011537142857142855,
+ "loss": 0.1049,
+ "step": 2155
+ },
+ {
+ "epoch": 12.342857142857143,
+ "grad_norm": 1.1110137701034546,
+ "learning_rate": 0.00011494285714285712,
+ "loss": 0.1146,
+ "step": 2160
+ },
+ {
+ "epoch": 12.371428571428572,
+ "grad_norm": 0.7911233901977539,
+ "learning_rate": 0.00011451428571428571,
+ "loss": 0.1257,
+ "step": 2165
+ },
+ {
+ "epoch": 12.4,
+ "grad_norm": 0.9691207408905029,
+ "learning_rate": 0.00011408571428571428,
+ "loss": 0.1226,
+ "step": 2170
+ },
+ {
+ "epoch": 12.428571428571429,
+ "grad_norm": 0.6168835759162903,
+ "learning_rate": 0.00011365714285714284,
+ "loss": 0.1271,
+ "step": 2175
+ },
+ {
+ "epoch": 12.457142857142857,
+ "grad_norm": 0.6143497228622437,
+ "learning_rate": 0.00011322857142857142,
+ "loss": 0.111,
+ "step": 2180
+ },
+ {
+ "epoch": 12.485714285714286,
+ "grad_norm": 1.5673450231552124,
+ "learning_rate": 0.00011279999999999999,
+ "loss": 0.1186,
+ "step": 2185
+ },
+ {
+ "epoch": 12.514285714285714,
+ "grad_norm": 1.298756718635559,
+ "learning_rate": 0.00011237142857142856,
+ "loss": 0.1024,
+ "step": 2190
+ },
+ {
+ "epoch": 12.542857142857143,
+ "grad_norm": 0.9484918117523193,
+ "learning_rate": 0.00011194285714285715,
+ "loss": 0.1171,
+ "step": 2195
+ },
+ {
+ "epoch": 12.571428571428571,
+ "grad_norm": 0.725705623626709,
+ "learning_rate": 0.0001115142857142857,
+ "loss": 0.1216,
+ "step": 2200
+ },
+ {
+ "epoch": 12.6,
+ "grad_norm": 1.1394798755645752,
+ "learning_rate": 0.00011108571428571427,
+ "loss": 0.1132,
+ "step": 2205
+ },
+ {
+ "epoch": 12.628571428571428,
+ "grad_norm": 0.9548712968826294,
+ "learning_rate": 0.00011065714285714286,
+ "loss": 0.1209,
+ "step": 2210
+ },
+ {
+ "epoch": 12.657142857142857,
+ "grad_norm": 0.6173953413963318,
+ "learning_rate": 0.00011022857142857143,
+ "loss": 0.1049,
+ "step": 2215
+ },
+ {
+ "epoch": 12.685714285714285,
+ "grad_norm": 0.8227205872535706,
+ "learning_rate": 0.00010979999999999999,
+ "loss": 0.1045,
+ "step": 2220
+ },
+ {
+ "epoch": 12.714285714285714,
+ "grad_norm": 0.7252780795097351,
+ "learning_rate": 0.00010937142857142856,
+ "loss": 0.1146,
+ "step": 2225
+ },
+ {
+ "epoch": 12.742857142857144,
+ "grad_norm": 0.9374399781227112,
+ "learning_rate": 0.00010894285714285714,
+ "loss": 0.1478,
+ "step": 2230
+ },
+ {
+ "epoch": 12.771428571428572,
+ "grad_norm": 5.1985368728637695,
+ "learning_rate": 0.0001085142857142857,
+ "loss": 0.1059,
+ "step": 2235
+ },
+ {
+ "epoch": 12.8,
+ "grad_norm": 0.9629620909690857,
+ "learning_rate": 0.00010808571428571427,
+ "loss": 0.124,
+ "step": 2240
+ },
+ {
+ "epoch": 12.82857142857143,
+ "grad_norm": 0.7022290229797363,
+ "learning_rate": 0.00010765714285714285,
+ "loss": 0.1309,
+ "step": 2245
+ },
+ {
+ "epoch": 12.857142857142858,
+ "grad_norm": 0.574188232421875,
+ "learning_rate": 0.00010722857142857142,
+ "loss": 0.086,
+ "step": 2250
+ },
+ {
+ "epoch": 12.885714285714286,
+ "grad_norm": 0.9712439179420471,
+ "learning_rate": 0.00010679999999999998,
+ "loss": 0.1152,
+ "step": 2255
+ },
+ {
+ "epoch": 12.914285714285715,
+ "grad_norm": 0.6562150120735168,
+ "learning_rate": 0.00010637142857142856,
+ "loss": 0.1343,
+ "step": 2260
+ },
+ {
+ "epoch": 12.942857142857143,
+ "grad_norm": 0.6936819553375244,
+ "learning_rate": 0.00010594285714285714,
+ "loss": 0.1009,
+ "step": 2265
+ },
+ {
+ "epoch": 12.971428571428572,
+ "grad_norm": 0.8664882779121399,
+ "learning_rate": 0.0001055142857142857,
+ "loss": 0.1164,
+ "step": 2270
+ },
+ {
+ "epoch": 13.0,
+ "grad_norm": 0.9224509000778198,
+ "learning_rate": 0.00010508571428571429,
+ "loss": 0.1347,
+ "step": 2275
+ },
+ {
+ "epoch": 13.028571428571428,
+ "grad_norm": 0.6596968770027161,
+ "learning_rate": 0.00010465714285714285,
+ "loss": 0.1041,
+ "step": 2280
+ },
+ {
+ "epoch": 13.057142857142857,
+ "grad_norm": 0.6456631422042847,
+ "learning_rate": 0.00010422857142857142,
+ "loss": 0.1142,
+ "step": 2285
+ },
+ {
+ "epoch": 13.085714285714285,
+ "grad_norm": 0.9466612339019775,
+ "learning_rate": 0.00010379999999999999,
+ "loss": 0.1191,
+ "step": 2290
+ },
+ {
+ "epoch": 13.114285714285714,
+ "grad_norm": 0.9036727547645569,
+ "learning_rate": 0.00010337142857142856,
+ "loss": 0.121,
+ "step": 2295
+ },
+ {
+ "epoch": 13.142857142857142,
+ "grad_norm": 1.08086359500885,
+ "learning_rate": 0.00010294285714285713,
+ "loss": 0.1313,
+ "step": 2300
+ },
+ {
+ "epoch": 13.17142857142857,
+ "grad_norm": 0.703241765499115,
+ "learning_rate": 0.0001025142857142857,
+ "loss": 0.1151,
+ "step": 2305
+ },
+ {
+ "epoch": 13.2,
+ "grad_norm": 0.7901896238327026,
+ "learning_rate": 0.00010208571428571429,
+ "loss": 0.1275,
+ "step": 2310
+ },
+ {
+ "epoch": 13.228571428571428,
+ "grad_norm": 0.703542947769165,
+ "learning_rate": 0.00010165714285714284,
+ "loss": 0.1,
+ "step": 2315
+ },
+ {
+ "epoch": 13.257142857142856,
+ "grad_norm": 0.6657671928405762,
+ "learning_rate": 0.00010122857142857141,
+ "loss": 0.1141,
+ "step": 2320
+ },
+ {
+ "epoch": 13.285714285714286,
+ "grad_norm": 0.7593729496002197,
+ "learning_rate": 0.0001008,
+ "loss": 0.1099,
+ "step": 2325
+ },
+ {
+ "epoch": 13.314285714285715,
+ "grad_norm": 0.6681057810783386,
+ "learning_rate": 0.00010037142857142857,
+ "loss": 0.112,
+ "step": 2330
+ },
+ {
+ "epoch": 13.342857142857143,
+ "grad_norm": 0.7155857682228088,
+ "learning_rate": 9.994285714285712e-05,
+ "loss": 0.0989,
+ "step": 2335
+ },
+ {
+ "epoch": 13.371428571428572,
+ "grad_norm": 0.9484553337097168,
+ "learning_rate": 9.951428571428571e-05,
+ "loss": 0.0902,
+ "step": 2340
+ },
+ {
+ "epoch": 13.4,
+ "grad_norm": 0.9317265152931213,
+ "learning_rate": 9.908571428571428e-05,
+ "loss": 0.1432,
+ "step": 2345
+ },
+ {
+ "epoch": 13.428571428571429,
+ "grad_norm": 1.039158821105957,
+ "learning_rate": 9.865714285714285e-05,
+ "loss": 0.114,
+ "step": 2350
+ },
+ {
+ "epoch": 13.457142857142857,
+ "grad_norm": 0.8524000644683838,
+ "learning_rate": 9.822857142857141e-05,
+ "loss": 0.1144,
+ "step": 2355
+ },
+ {
+ "epoch": 13.485714285714286,
+ "grad_norm": 0.6337461471557617,
+ "learning_rate": 9.779999999999999e-05,
+ "loss": 0.1073,
+ "step": 2360
+ },
+ {
+ "epoch": 13.514285714285714,
+ "grad_norm": 0.9097298383712769,
+ "learning_rate": 9.737142857142856e-05,
+ "loss": 0.1031,
+ "step": 2365
+ },
+ {
+ "epoch": 13.542857142857143,
+ "grad_norm": 1.2013412714004517,
+ "learning_rate": 9.694285714285713e-05,
+ "loss": 0.1174,
+ "step": 2370
+ },
+ {
+ "epoch": 13.571428571428571,
+ "grad_norm": 0.7055214643478394,
+ "learning_rate": 9.65142857142857e-05,
+ "loss": 0.1175,
+ "step": 2375
+ },
+ {
+ "epoch": 13.6,
+ "grad_norm": 0.807955265045166,
+ "learning_rate": 9.608571428571427e-05,
+ "loss": 0.1286,
+ "step": 2380
+ },
+ {
+ "epoch": 13.628571428571428,
+ "grad_norm": 0.6661797761917114,
+ "learning_rate": 9.565714285714285e-05,
+ "loss": 0.1091,
+ "step": 2385
+ },
+ {
+ "epoch": 13.657142857142857,
+ "grad_norm": 1.119604468345642,
+ "learning_rate": 9.522857142857143e-05,
+ "loss": 0.1393,
+ "step": 2390
+ },
+ {
+ "epoch": 13.685714285714285,
+ "grad_norm": 0.5365435481071472,
+ "learning_rate": 9.479999999999999e-05,
+ "loss": 0.1075,
+ "step": 2395
+ },
+ {
+ "epoch": 13.714285714285714,
+ "grad_norm": 0.9443924427032471,
+ "learning_rate": 9.437142857142856e-05,
+ "loss": 0.0977,
+ "step": 2400
+ },
+ {
+ "epoch": 13.742857142857144,
+ "grad_norm": 0.6075264811515808,
+ "learning_rate": 9.394285714285714e-05,
+ "loss": 0.1329,
+ "step": 2405
+ },
+ {
+ "epoch": 13.771428571428572,
+ "grad_norm": 1.019352912902832,
+ "learning_rate": 9.351428571428571e-05,
+ "loss": 0.1083,
+ "step": 2410
+ },
+ {
+ "epoch": 13.8,
+ "grad_norm": 0.7234058380126953,
+ "learning_rate": 9.308571428571427e-05,
+ "loss": 0.1118,
+ "step": 2415
+ },
+ {
+ "epoch": 13.82857142857143,
+ "grad_norm": 0.6786122918128967,
+ "learning_rate": 9.265714285714284e-05,
+ "loss": 0.1208,
+ "step": 2420
+ },
+ {
+ "epoch": 13.857142857142858,
+ "grad_norm": 0.5820732116699219,
+ "learning_rate": 9.222857142857142e-05,
+ "loss": 0.1022,
+ "step": 2425
+ },
+ {
+ "epoch": 13.885714285714286,
+ "grad_norm": 0.8007987141609192,
+ "learning_rate": 9.18e-05,
+ "loss": 0.1293,
+ "step": 2430
+ },
+ {
+ "epoch": 13.914285714285715,
+ "grad_norm": 0.6813766956329346,
+ "learning_rate": 9.137142857142855e-05,
+ "loss": 0.1284,
+ "step": 2435
+ },
+ {
+ "epoch": 13.942857142857143,
+ "grad_norm": 0.6460041403770447,
+ "learning_rate": 9.094285714285714e-05,
+ "loss": 0.1073,
+ "step": 2440
+ },
+ {
+ "epoch": 13.971428571428572,
+ "grad_norm": 0.5939205288887024,
+ "learning_rate": 9.051428571428571e-05,
+ "loss": 0.1185,
+ "step": 2445
+ },
+ {
+ "epoch": 14.0,
+ "grad_norm": 0.8150635361671448,
+ "learning_rate": 9.008571428571428e-05,
+ "loss": 0.1039,
+ "step": 2450
+ },
+ {
+ "epoch": 14.028571428571428,
+ "grad_norm": 1.3691389560699463,
+ "learning_rate": 8.965714285714285e-05,
+ "loss": 0.1112,
+ "step": 2455
+ },
+ {
+ "epoch": 14.057142857142857,
+ "grad_norm": 0.9042718410491943,
+ "learning_rate": 8.922857142857142e-05,
+ "loss": 0.112,
+ "step": 2460
+ },
+ {
+ "epoch": 14.085714285714285,
+ "grad_norm": 0.7222105860710144,
+ "learning_rate": 8.879999999999999e-05,
+ "loss": 0.1221,
+ "step": 2465
+ },
+ {
+ "epoch": 14.114285714285714,
+ "grad_norm": 0.595588207244873,
+ "learning_rate": 8.837142857142857e-05,
+ "loss": 0.1058,
+ "step": 2470
+ },
+ {
+ "epoch": 14.142857142857142,
+ "grad_norm": 0.5262706279754639,
+ "learning_rate": 8.794285714285713e-05,
+ "loss": 0.1071,
+ "step": 2475
+ },
+ {
+ "epoch": 14.17142857142857,
+ "grad_norm": 0.6511022448539734,
+ "learning_rate": 8.75142857142857e-05,
+ "loss": 0.0917,
+ "step": 2480
+ },
+ {
+ "epoch": 14.2,
+ "grad_norm": 0.5737650394439697,
+ "learning_rate": 8.708571428571427e-05,
+ "loss": 0.0988,
+ "step": 2485
+ },
+ {
+ "epoch": 14.228571428571428,
+ "grad_norm": 0.7679132223129272,
+ "learning_rate": 8.665714285714286e-05,
+ "loss": 0.1185,
+ "step": 2490
+ },
+ {
+ "epoch": 14.257142857142856,
+ "grad_norm": 0.641198456287384,
+ "learning_rate": 8.622857142857141e-05,
+ "loss": 0.0894,
+ "step": 2495
+ },
+ {
+ "epoch": 14.285714285714286,
+ "grad_norm": 0.7215464115142822,
+ "learning_rate": 8.579999999999998e-05,
+ "loss": 0.0935,
+ "step": 2500
+ },
+ {
+ "epoch": 14.314285714285715,
+ "grad_norm": 1.0740891695022583,
+ "learning_rate": 8.537142857142857e-05,
+ "loss": 0.1156,
+ "step": 2505
+ },
+ {
+ "epoch": 14.342857142857143,
+ "grad_norm": 0.6668990254402161,
+ "learning_rate": 8.494285714285714e-05,
+ "loss": 0.1006,
+ "step": 2510
+ },
+ {
+ "epoch": 14.371428571428572,
+ "grad_norm": 0.6674673557281494,
+ "learning_rate": 8.45142857142857e-05,
+ "loss": 0.1045,
+ "step": 2515
+ },
+ {
+ "epoch": 14.4,
+ "grad_norm": 0.6198854446411133,
+ "learning_rate": 8.408571428571428e-05,
+ "loss": 0.0997,
+ "step": 2520
+ },
+ {
+ "epoch": 14.428571428571429,
+ "grad_norm": 0.7187360525131226,
+ "learning_rate": 8.365714285714285e-05,
+ "loss": 0.1277,
+ "step": 2525
+ },
+ {
+ "epoch": 14.457142857142857,
+ "grad_norm": 0.583990216255188,
+ "learning_rate": 8.322857142857142e-05,
+ "loss": 0.1182,
+ "step": 2530
+ },
+ {
+ "epoch": 14.485714285714286,
+ "grad_norm": 1.1340539455413818,
+ "learning_rate": 8.28e-05,
+ "loss": 0.106,
+ "step": 2535
+ },
+ {
+ "epoch": 14.514285714285714,
+ "grad_norm": 0.6411644816398621,
+ "learning_rate": 8.237142857142856e-05,
+ "loss": 0.0963,
+ "step": 2540
+ },
+ {
+ "epoch": 14.542857142857143,
+ "grad_norm": 0.7092474102973938,
+ "learning_rate": 8.194285714285713e-05,
+ "loss": 0.1061,
+ "step": 2545
+ },
+ {
+ "epoch": 14.571428571428571,
+ "grad_norm": 0.6887038946151733,
+ "learning_rate": 8.151428571428572e-05,
+ "loss": 0.1224,
+ "step": 2550
+ },
+ {
+ "epoch": 14.6,
+ "grad_norm": 0.8119840621948242,
+ "learning_rate": 8.108571428571428e-05,
+ "loss": 0.1023,
+ "step": 2555
+ },
+ {
+ "epoch": 14.628571428571428,
+ "grad_norm": 0.6380637288093567,
+ "learning_rate": 8.065714285714285e-05,
+ "loss": 0.0893,
+ "step": 2560
+ },
+ {
+ "epoch": 14.657142857142857,
+ "grad_norm": 0.7857063412666321,
+ "learning_rate": 8.022857142857142e-05,
+ "loss": 0.1227,
+ "step": 2565
+ },
+ {
+ "epoch": 14.685714285714285,
+ "grad_norm": 0.6368046998977661,
+ "learning_rate": 7.98e-05,
+ "loss": 0.1074,
+ "step": 2570
+ },
+ {
+ "epoch": 14.714285714285714,
+ "grad_norm": 0.7269926071166992,
+ "learning_rate": 7.937142857142856e-05,
+ "loss": 0.1166,
+ "step": 2575
+ },
+ {
+ "epoch": 14.742857142857144,
+ "grad_norm": 0.6903791427612305,
+ "learning_rate": 7.894285714285713e-05,
+ "loss": 0.1274,
+ "step": 2580
+ },
+ {
+ "epoch": 14.771428571428572,
+ "grad_norm": 0.8257679343223572,
+ "learning_rate": 7.851428571428571e-05,
+ "loss": 0.1274,
+ "step": 2585
+ },
+ {
+ "epoch": 14.8,
+ "grad_norm": 1.0489627122879028,
+ "learning_rate": 7.808571428571428e-05,
+ "loss": 0.1091,
+ "step": 2590
+ },
+ {
+ "epoch": 14.82857142857143,
+ "grad_norm": 0.6699196696281433,
+ "learning_rate": 7.765714285714284e-05,
+ "loss": 0.1244,
+ "step": 2595
+ },
+ {
+ "epoch": 14.857142857142858,
+ "grad_norm": 0.61530601978302,
+ "learning_rate": 7.722857142857143e-05,
+ "loss": 0.1122,
+ "step": 2600
+ },
+ {
+ "epoch": 14.885714285714286,
+ "grad_norm": 0.5789124369621277,
+ "learning_rate": 7.68e-05,
+ "loss": 0.1272,
+ "step": 2605
+ },
+ {
+ "epoch": 14.914285714285715,
+ "grad_norm": 2.1323459148406982,
+ "learning_rate": 7.637142857142857e-05,
+ "loss": 0.1034,
+ "step": 2610
+ },
+ {
+ "epoch": 14.942857142857143,
+ "grad_norm": 1.2433545589447021,
+ "learning_rate": 7.594285714285714e-05,
+ "loss": 0.1052,
+ "step": 2615
+ },
+ {
+ "epoch": 14.971428571428572,
+ "grad_norm": 0.868093729019165,
+ "learning_rate": 7.551428571428571e-05,
+ "loss": 0.111,
+ "step": 2620
+ },
+ {
+ "epoch": 15.0,
+ "grad_norm": 0.6479918360710144,
+ "learning_rate": 7.508571428571428e-05,
+ "loss": 0.1067,
+ "step": 2625
+ },
+ {
+ "epoch": 15.028571428571428,
+ "grad_norm": 0.8062720894813538,
+ "learning_rate": 7.465714285714285e-05,
+ "loss": 0.1113,
+ "step": 2630
+ },
+ {
+ "epoch": 15.057142857142857,
+ "grad_norm": 0.7333181500434875,
+ "learning_rate": 7.422857142857142e-05,
+ "loss": 0.0985,
+ "step": 2635
+ },
+ {
+ "epoch": 15.085714285714285,
+ "grad_norm": 0.550039529800415,
+ "learning_rate": 7.379999999999999e-05,
+ "loss": 0.1077,
+ "step": 2640
+ },
+ {
+ "epoch": 15.114285714285714,
+ "grad_norm": 0.9256687164306641,
+ "learning_rate": 7.337142857142856e-05,
+ "loss": 0.0875,
+ "step": 2645
+ },
+ {
+ "epoch": 15.142857142857142,
+ "grad_norm": 0.6421870589256287,
+ "learning_rate": 7.294285714285713e-05,
+ "loss": 0.1069,
+ "step": 2650
+ },
+ {
+ "epoch": 15.17142857142857,
+ "grad_norm": 0.6614648699760437,
+ "learning_rate": 7.25142857142857e-05,
+ "loss": 0.1249,
+ "step": 2655
+ },
+ {
+ "epoch": 15.2,
+ "grad_norm": 0.8273601531982422,
+ "learning_rate": 7.208571428571429e-05,
+ "loss": 0.1135,
+ "step": 2660
+ },
+ {
+ "epoch": 15.228571428571428,
+ "grad_norm": 0.6795836687088013,
+ "learning_rate": 7.165714285714284e-05,
+ "loss": 0.1081,
+ "step": 2665
+ },
+ {
+ "epoch": 15.257142857142856,
+ "grad_norm": 0.7508160471916199,
+ "learning_rate": 7.122857142857143e-05,
+ "loss": 0.0869,
+ "step": 2670
+ },
+ {
+ "epoch": 15.285714285714286,
+ "grad_norm": 0.7219347357749939,
+ "learning_rate": 7.079999999999999e-05,
+ "loss": 0.1115,
+ "step": 2675
+ },
+ {
+ "epoch": 15.314285714285715,
+ "grad_norm": 0.5592671036720276,
+ "learning_rate": 7.037142857142857e-05,
+ "loss": 0.1116,
+ "step": 2680
+ },
+ {
+ "epoch": 15.342857142857143,
+ "grad_norm": 0.8736717104911804,
+ "learning_rate": 6.994285714285714e-05,
+ "loss": 0.0784,
+ "step": 2685
+ },
+ {
+ "epoch": 15.371428571428572,
+ "grad_norm": 0.6056572198867798,
+ "learning_rate": 6.951428571428571e-05,
+ "loss": 0.1105,
+ "step": 2690
+ },
+ {
+ "epoch": 15.4,
+ "grad_norm": 0.671410322189331,
+ "learning_rate": 6.908571428571428e-05,
+ "loss": 0.1219,
+ "step": 2695
+ },
+ {
+ "epoch": 15.428571428571429,
+ "grad_norm": 0.7952276468276978,
+ "learning_rate": 6.865714285714285e-05,
+ "loss": 0.0865,
+ "step": 2700
+ },
+ {
+ "epoch": 15.457142857142857,
+ "grad_norm": 0.8185123205184937,
+ "learning_rate": 6.822857142857142e-05,
+ "loss": 0.1095,
+ "step": 2705
+ },
+ {
+ "epoch": 15.485714285714286,
+ "grad_norm": 0.6969497203826904,
+ "learning_rate": 6.78e-05,
+ "loss": 0.0928,
+ "step": 2710
+ },
+ {
+ "epoch": 15.514285714285714,
+ "grad_norm": 0.7323058843612671,
+ "learning_rate": 6.737142857142857e-05,
+ "loss": 0.099,
+ "step": 2715
+ },
+ {
+ "epoch": 15.542857142857143,
+ "grad_norm": 0.6498017311096191,
+ "learning_rate": 6.694285714285714e-05,
+ "loss": 0.0871,
+ "step": 2720
+ },
+ {
+ "epoch": 15.571428571428571,
+ "grad_norm": 2.0899710655212402,
+ "learning_rate": 6.65142857142857e-05,
+ "loss": 0.1306,
+ "step": 2725
+ },
+ {
+ "epoch": 15.6,
+ "grad_norm": 1.0896337032318115,
+ "learning_rate": 6.608571428571428e-05,
+ "loss": 0.1085,
+ "step": 2730
+ },
+ {
+ "epoch": 15.628571428571428,
+ "grad_norm": 0.6709671020507812,
+ "learning_rate": 6.565714285714285e-05,
+ "loss": 0.0977,
+ "step": 2735
+ },
+ {
+ "epoch": 15.657142857142857,
+ "grad_norm": 0.6750431060791016,
+ "learning_rate": 6.522857142857142e-05,
+ "loss": 0.1154,
+ "step": 2740
+ },
+ {
+ "epoch": 15.685714285714285,
+ "grad_norm": 1.2888147830963135,
+ "learning_rate": 6.479999999999999e-05,
+ "loss": 0.0918,
+ "step": 2745
+ },
+ {
+ "epoch": 15.714285714285714,
+ "grad_norm": 0.9803931713104248,
+ "learning_rate": 6.437142857142857e-05,
+ "loss": 0.112,
+ "step": 2750
+ },
+ {
+ "epoch": 15.742857142857144,
+ "grad_norm": 0.8548974394798279,
+ "learning_rate": 6.394285714285713e-05,
+ "loss": 0.0974,
+ "step": 2755
+ },
+ {
+ "epoch": 15.771428571428572,
+ "grad_norm": 0.7924854159355164,
+ "learning_rate": 6.351428571428572e-05,
+ "loss": 0.1344,
+ "step": 2760
+ },
+ {
+ "epoch": 15.8,
+ "grad_norm": 0.9245836138725281,
+ "learning_rate": 6.308571428571429e-05,
+ "loss": 0.1182,
+ "step": 2765
+ },
+ {
+ "epoch": 15.82857142857143,
+ "grad_norm": 0.6067193150520325,
+ "learning_rate": 6.265714285714286e-05,
+ "loss": 0.0959,
+ "step": 2770
+ },
+ {
+ "epoch": 15.857142857142858,
+ "grad_norm": 0.5575870275497437,
+ "learning_rate": 6.222857142857143e-05,
+ "loss": 0.1208,
+ "step": 2775
+ },
+ {
+ "epoch": 15.885714285714286,
+ "grad_norm": 0.8608399629592896,
+ "learning_rate": 6.18e-05,
+ "loss": 0.0937,
+ "step": 2780
+ },
+ {
+ "epoch": 15.914285714285715,
+ "grad_norm": 0.6910924911499023,
+ "learning_rate": 6.137142857142857e-05,
+ "loss": 0.1175,
+ "step": 2785
+ },
+ {
+ "epoch": 15.942857142857143,
+ "grad_norm": 0.7266614437103271,
+ "learning_rate": 6.094285714285714e-05,
+ "loss": 0.1023,
+ "step": 2790
+ },
+ {
+ "epoch": 15.971428571428572,
+ "grad_norm": 0.7580139636993408,
+ "learning_rate": 6.051428571428571e-05,
+ "loss": 0.1103,
+ "step": 2795
+ },
+ {
+ "epoch": 16.0,
+ "grad_norm": 0.9288797974586487,
+ "learning_rate": 6.008571428571428e-05,
+ "loss": 0.0892,
+ "step": 2800
+ },
+ {
+ "epoch": 16.02857142857143,
+ "grad_norm": 1.4218194484710693,
+ "learning_rate": 5.9657142857142845e-05,
+ "loss": 0.104,
+ "step": 2805
+ },
+ {
+ "epoch": 16.057142857142857,
+ "grad_norm": 0.7665567994117737,
+ "learning_rate": 5.922857142857142e-05,
+ "loss": 0.1084,
+ "step": 2810
+ },
+ {
+ "epoch": 16.085714285714285,
+ "grad_norm": 0.8587457537651062,
+ "learning_rate": 5.88e-05,
+ "loss": 0.1041,
+ "step": 2815
+ },
+ {
+ "epoch": 16.114285714285714,
+ "grad_norm": 0.5792443752288818,
+ "learning_rate": 5.837142857142856e-05,
+ "loss": 0.1086,
+ "step": 2820
+ },
+ {
+ "epoch": 16.142857142857142,
+ "grad_norm": 0.6510186195373535,
+ "learning_rate": 5.794285714285714e-05,
+ "loss": 0.0919,
+ "step": 2825
+ },
+ {
+ "epoch": 16.17142857142857,
+ "grad_norm": 1.170145869255066,
+ "learning_rate": 5.751428571428571e-05,
+ "loss": 0.1083,
+ "step": 2830
+ },
+ {
+ "epoch": 16.2,
+ "grad_norm": 1.0514795780181885,
+ "learning_rate": 5.708571428571428e-05,
+ "loss": 0.1223,
+ "step": 2835
+ },
+ {
+ "epoch": 16.228571428571428,
+ "grad_norm": 0.7993499040603638,
+ "learning_rate": 5.665714285714285e-05,
+ "loss": 0.1101,
+ "step": 2840
+ },
+ {
+ "epoch": 16.257142857142856,
+ "grad_norm": 0.6342432498931885,
+ "learning_rate": 5.622857142857142e-05,
+ "loss": 0.1243,
+ "step": 2845
+ },
+ {
+ "epoch": 16.285714285714285,
+ "grad_norm": 1.2524505853652954,
+ "learning_rate": 5.5799999999999994e-05,
+ "loss": 0.1251,
+ "step": 2850
+ },
+ {
+ "epoch": 16.314285714285713,
+ "grad_norm": 1.0769789218902588,
+ "learning_rate": 5.537142857142857e-05,
+ "loss": 0.1074,
+ "step": 2855
+ },
+ {
+ "epoch": 16.34285714285714,
+ "grad_norm": 1.8232245445251465,
+ "learning_rate": 5.4942857142857136e-05,
+ "loss": 0.0929,
+ "step": 2860
+ },
+ {
+ "epoch": 16.37142857142857,
+ "grad_norm": 0.814189612865448,
+ "learning_rate": 5.451428571428571e-05,
+ "loss": 0.0998,
+ "step": 2865
+ },
+ {
+ "epoch": 16.4,
+ "grad_norm": 0.9731772541999817,
+ "learning_rate": 5.4085714285714284e-05,
+ "loss": 0.0849,
+ "step": 2870
+ },
+ {
+ "epoch": 16.428571428571427,
+ "grad_norm": 0.778213381767273,
+ "learning_rate": 5.3657142857142855e-05,
+ "loss": 0.0907,
+ "step": 2875
+ },
+ {
+ "epoch": 16.457142857142856,
+ "grad_norm": 0.9219964146614075,
+ "learning_rate": 5.3228571428571425e-05,
+ "loss": 0.0855,
+ "step": 2880
+ },
+ {
+ "epoch": 16.485714285714284,
+ "grad_norm": 0.7354393005371094,
+ "learning_rate": 5.279999999999999e-05,
+ "loss": 0.1296,
+ "step": 2885
+ },
+ {
+ "epoch": 16.514285714285712,
+ "grad_norm": 0.6051219701766968,
+ "learning_rate": 5.2371428571428567e-05,
+ "loss": 0.1086,
+ "step": 2890
+ },
+ {
+ "epoch": 16.542857142857144,
+ "grad_norm": 0.8592603206634521,
+ "learning_rate": 5.1942857142857144e-05,
+ "loss": 0.1017,
+ "step": 2895
+ },
+ {
+ "epoch": 16.571428571428573,
+ "grad_norm": 0.5748846530914307,
+ "learning_rate": 5.151428571428571e-05,
+ "loss": 0.0775,
+ "step": 2900
+ },
+ {
+ "epoch": 16.6,
+ "grad_norm": 0.6640213131904602,
+ "learning_rate": 5.1085714285714285e-05,
+ "loss": 0.1059,
+ "step": 2905
+ },
+ {
+ "epoch": 16.62857142857143,
+ "grad_norm": 0.9514361023902893,
+ "learning_rate": 5.065714285714285e-05,
+ "loss": 0.0832,
+ "step": 2910
+ },
+ {
+ "epoch": 16.65714285714286,
+ "grad_norm": 1.1062079668045044,
+ "learning_rate": 5.022857142857143e-05,
+ "loss": 0.0817,
+ "step": 2915
+ },
+ {
+ "epoch": 16.685714285714287,
+ "grad_norm": 0.6824453473091125,
+ "learning_rate": 4.98e-05,
+ "loss": 0.1064,
+ "step": 2920
+ },
+ {
+ "epoch": 16.714285714285715,
+ "grad_norm": 0.643827497959137,
+ "learning_rate": 4.937142857142856e-05,
+ "loss": 0.1196,
+ "step": 2925
+ },
+ {
+ "epoch": 16.742857142857144,
+ "grad_norm": 0.7824274897575378,
+ "learning_rate": 4.894285714285714e-05,
+ "loss": 0.0945,
+ "step": 2930
+ },
+ {
+ "epoch": 16.771428571428572,
+ "grad_norm": 0.7110689878463745,
+ "learning_rate": 4.8514285714285716e-05,
+ "loss": 0.1124,
+ "step": 2935
+ },
+ {
+ "epoch": 16.8,
+ "grad_norm": 0.9542856812477112,
+ "learning_rate": 4.808571428571428e-05,
+ "loss": 0.1036,
+ "step": 2940
+ },
+ {
+ "epoch": 16.82857142857143,
+ "grad_norm": 0.6353528499603271,
+ "learning_rate": 4.765714285714286e-05,
+ "loss": 0.0977,
+ "step": 2945
+ },
+ {
+ "epoch": 16.857142857142858,
+ "grad_norm": 0.843910813331604,
+ "learning_rate": 4.722857142857142e-05,
+ "loss": 0.1164,
+ "step": 2950
+ },
+ {
+ "epoch": 16.885714285714286,
+ "grad_norm": 0.9607085585594177,
+ "learning_rate": 4.68e-05,
+ "loss": 0.1111,
+ "step": 2955
+ },
+ {
+ "epoch": 16.914285714285715,
+ "grad_norm": 0.7393201589584351,
+ "learning_rate": 4.637142857142857e-05,
+ "loss": 0.106,
+ "step": 2960
+ },
+ {
+ "epoch": 16.942857142857143,
+ "grad_norm": 0.5248494148254395,
+ "learning_rate": 4.5942857142857134e-05,
+ "loss": 0.1017,
+ "step": 2965
+ },
+ {
+ "epoch": 16.97142857142857,
+ "grad_norm": 0.8800868988037109,
+ "learning_rate": 4.551428571428571e-05,
+ "loss": 0.0872,
+ "step": 2970
+ },
+ {
+ "epoch": 17.0,
+ "grad_norm": 0.8447640538215637,
+ "learning_rate": 4.5085714285714275e-05,
+ "loss": 0.1293,
+ "step": 2975
+ },
+ {
+ "epoch": 17.02857142857143,
+ "grad_norm": 0.5356553792953491,
+ "learning_rate": 4.465714285714285e-05,
+ "loss": 0.0984,
+ "step": 2980
+ },
+ {
+ "epoch": 17.057142857142857,
+ "grad_norm": 0.7713034152984619,
+ "learning_rate": 4.422857142857143e-05,
+ "loss": 0.0858,
+ "step": 2985
+ },
+ {
+ "epoch": 17.085714285714285,
+ "grad_norm": 0.9854580760002136,
+ "learning_rate": 4.3799999999999994e-05,
+ "loss": 0.1237,
+ "step": 2990
+ },
+ {
+ "epoch": 17.114285714285714,
+ "grad_norm": 0.7012975811958313,
+ "learning_rate": 4.337142857142857e-05,
+ "loss": 0.1233,
+ "step": 2995
+ },
+ {
+ "epoch": 17.142857142857142,
+ "grad_norm": 0.5461836457252502,
+ "learning_rate": 4.294285714285714e-05,
+ "loss": 0.0978,
+ "step": 3000
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 3500,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 20,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 0.0,
+ "train_batch_size": 200,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/glot-contrastive-final-lora/checkpoint-3000/training_args.bin b/glot-contrastive-final-lora/checkpoint-3000/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..504636d7db81a728b5309e791498748e16c3b8ec
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-3000/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02a87dc6b2c67ad3df98065b9e8fa21d9d93cd2cb361c532cb83c8a37bdc81a3
+size 5777
diff --git a/glot-contrastive-final-lora/checkpoint-3500/README.md b/glot-contrastive-final-lora/checkpoint-3500/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9e142ce83a38d62751eee08226da0acc7c10eae5
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-3500/README.md
@@ -0,0 +1,206 @@
+---
+base_model: ./glot-mlm-adapted
+library_name: peft
+tags:
+- base_model:adapter:./glot-mlm-adapted
+- lora
+- transformers
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/glot-contrastive-final-lora/checkpoint-3500/adapter_config.json b/glot-contrastive-final-lora/checkpoint-3500/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..846b0d8ef2ad7bc4b0b04decd5769b6250d7be73
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-3500/adapter_config.json
@@ -0,0 +1,37 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "./glot-mlm-adapted",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "query",
+ "value"
+ ],
+ "target_parameters": null,
+ "task_type": "FEATURE_EXTRACTION",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/glot-contrastive-final-lora/checkpoint-3500/adapter_model.safetensors b/glot-contrastive-final-lora/checkpoint-3500/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..dba4d5dd074dc3d6c4bc4d4f36793beac178e2c3
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-3500/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ba05d9cb007251d29a6f02fdd92f56fa1beb8f9e0676686472daf07c4e9f478
+size 2365824
diff --git a/glot-contrastive-final-lora/checkpoint-3500/optimizer.pt b/glot-contrastive-final-lora/checkpoint-3500/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..20f9723049ba94933d0ebf54f53f34b8edb32d68
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-3500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec69941aff3e022b84f7642bf65d1f256ba3d34a59cc1d3185bfaed806e27b82
+size 4760395
diff --git a/glot-contrastive-final-lora/checkpoint-3500/rng_state.pth b/glot-contrastive-final-lora/checkpoint-3500/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f2f30d236c6c0667ccc3a756b378faccab328a42
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-3500/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:119b741182d96487ccd4b17518349b97dcc4a6ddb4f50860c285ce876df3e7b3
+size 14645
diff --git a/glot-contrastive-final-lora/checkpoint-3500/scheduler.pt b/glot-contrastive-final-lora/checkpoint-3500/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..baae0d262b4de605c55755daad74e8104d12dea5
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-3500/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65c44133b6126dd443eb3d89fa8880514c2d750a567a84b1bc49dd491e9083bb
+size 1465
diff --git a/glot-contrastive-final-lora/checkpoint-3500/sentencepiece.bpe.model b/glot-contrastive-final-lora/checkpoint-3500/sentencepiece.bpe.model
new file mode 100644
index 0000000000000000000000000000000000000000..41409bcc76133d6e92fdd5909093d24e32662ba0
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-3500/sentencepiece.bpe.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a313a26470baedaede322622492f2a542aa41527ddc5d40de444e945ad3c613
+size 7658320
diff --git a/glot-contrastive-final-lora/checkpoint-3500/special_tokens_map.json b/glot-contrastive-final-lora/checkpoint-3500/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..68171d1ff68b731a33d119708476692c094a466b
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-3500/special_tokens_map.json
@@ -0,0 +1,15 @@
+{
+ "bos_token": "<s>",
+ "cls_token": "<s>",
+ "eos_token": "</s>",
+ "mask_token": {
+ "content": "<mask>",
+ "lstrip": true,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<pad>",
+ "sep_token": "</s>",
+ "unk_token": "<unk>"
+}
diff --git a/glot-contrastive-final-lora/checkpoint-3500/tokenizer_config.json b/glot-contrastive-final-lora/checkpoint-3500/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..2d4a02cfefad7bc6d993905187616acfc547f17d
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-3500/tokenizer_config.json
@@ -0,0 +1,57 @@
+{
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "3": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "401144": {
+ "content": "<mask>",
+ "lstrip": true,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "cls_token": "<s>",
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "mask_token": "<mask>",
+ "model_max_length": 512,
+ "pad_token": "<pad>",
+ "sep_token": "</s>",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "XLMRobertaTokenizer",
+ "unk_token": "<unk>",
+ "use_fast": true
+}
diff --git a/glot-contrastive-final-lora/checkpoint-3500/trainer_state.json b/glot-contrastive-final-lora/checkpoint-3500/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..675d571d5f9814e4e6181db1dee227e7e2d62781
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-3500/trainer_state.json
@@ -0,0 +1,4934 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 20.0,
+ "eval_steps": 5,
+ "global_step": 3500,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.02857142857142857,
+ "grad_norm": 0.1407003551721573,
+ "learning_rate": 0.00029965714285714283,
+ "loss": 0.9726,
+ "step": 5
+ },
+ {
+ "epoch": 0.05714285714285714,
+ "grad_norm": 0.26689061522483826,
+ "learning_rate": 0.0002992285714285714,
+ "loss": 0.9633,
+ "step": 10
+ },
+ {
+ "epoch": 0.08571428571428572,
+ "grad_norm": 0.8670485615730286,
+ "learning_rate": 0.0002988,
+ "loss": 0.9013,
+ "step": 15
+ },
+ {
+ "epoch": 0.11428571428571428,
+ "grad_norm": 0.9785467386245728,
+ "learning_rate": 0.00029837142857142853,
+ "loss": 0.6942,
+ "step": 20
+ },
+ {
+ "epoch": 0.14285714285714285,
+ "grad_norm": 1.3083932399749756,
+ "learning_rate": 0.0002979428571428571,
+ "loss": 0.4472,
+ "step": 25
+ },
+ {
+ "epoch": 0.17142857142857143,
+ "grad_norm": 1.6103293895721436,
+ "learning_rate": 0.0002975142857142857,
+ "loss": 0.3782,
+ "step": 30
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 2.6353416442871094,
+ "learning_rate": 0.0002970857142857143,
+ "loss": 0.3732,
+ "step": 35
+ },
+ {
+ "epoch": 0.22857142857142856,
+ "grad_norm": 0.9949072003364563,
+ "learning_rate": 0.0002966571428571428,
+ "loss": 0.3506,
+ "step": 40
+ },
+ {
+ "epoch": 0.2571428571428571,
+ "grad_norm": 1.280673861503601,
+ "learning_rate": 0.0002962285714285714,
+ "loss": 0.3346,
+ "step": 45
+ },
+ {
+ "epoch": 0.2857142857142857,
+ "grad_norm": 0.7681456208229065,
+ "learning_rate": 0.0002958,
+ "loss": 0.2832,
+ "step": 50
+ },
+ {
+ "epoch": 0.3142857142857143,
+ "grad_norm": 1.0000813007354736,
+ "learning_rate": 0.0002953714285714285,
+ "loss": 0.2603,
+ "step": 55
+ },
+ {
+ "epoch": 0.34285714285714286,
+ "grad_norm": 1.0222399234771729,
+ "learning_rate": 0.0002949428571428571,
+ "loss": 0.2507,
+ "step": 60
+ },
+ {
+ "epoch": 0.37142857142857144,
+ "grad_norm": 0.896902322769165,
+ "learning_rate": 0.0002945142857142857,
+ "loss": 0.2556,
+ "step": 65
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 0.9035541415214539,
+ "learning_rate": 0.00029408571428571426,
+ "loss": 0.2402,
+ "step": 70
+ },
+ {
+ "epoch": 0.42857142857142855,
+ "grad_norm": 1.4886469841003418,
+ "learning_rate": 0.00029365714285714285,
+ "loss": 0.2376,
+ "step": 75
+ },
+ {
+ "epoch": 0.45714285714285713,
+ "grad_norm": 0.8951187133789062,
+ "learning_rate": 0.0002932285714285714,
+ "loss": 0.2276,
+ "step": 80
+ },
+ {
+ "epoch": 0.4857142857142857,
+ "grad_norm": 0.7876377105712891,
+ "learning_rate": 0.00029279999999999996,
+ "loss": 0.2537,
+ "step": 85
+ },
+ {
+ "epoch": 0.5142857142857142,
+ "grad_norm": 1.0927226543426514,
+ "learning_rate": 0.00029237142857142855,
+ "loss": 0.2152,
+ "step": 90
+ },
+ {
+ "epoch": 0.5428571428571428,
+ "grad_norm": 1.4946355819702148,
+ "learning_rate": 0.00029194285714285713,
+ "loss": 0.2441,
+ "step": 95
+ },
+ {
+ "epoch": 0.5714285714285714,
+ "grad_norm": 0.7082991600036621,
+ "learning_rate": 0.0002915142857142857,
+ "loss": 0.2708,
+ "step": 100
+ },
+ {
+ "epoch": 0.6,
+ "grad_norm": 0.670010507106781,
+ "learning_rate": 0.00029108571428571424,
+ "loss": 0.2396,
+ "step": 105
+ },
+ {
+ "epoch": 0.6285714285714286,
+ "grad_norm": 0.9797312021255493,
+ "learning_rate": 0.00029065714285714283,
+ "loss": 0.2275,
+ "step": 110
+ },
+ {
+ "epoch": 0.6571428571428571,
+ "grad_norm": 1.5220463275909424,
+ "learning_rate": 0.0002902285714285714,
+ "loss": 0.2114,
+ "step": 115
+ },
+ {
+ "epoch": 0.6857142857142857,
+ "grad_norm": 1.3326867818832397,
+ "learning_rate": 0.00028979999999999994,
+ "loss": 0.241,
+ "step": 120
+ },
+ {
+ "epoch": 0.7142857142857143,
+ "grad_norm": 1.1195529699325562,
+ "learning_rate": 0.0002893714285714285,
+ "loss": 0.2389,
+ "step": 125
+ },
+ {
+ "epoch": 0.7428571428571429,
+ "grad_norm": 0.7551061511039734,
+ "learning_rate": 0.0002889428571428571,
+ "loss": 0.2162,
+ "step": 130
+ },
+ {
+ "epoch": 0.7714285714285715,
+ "grad_norm": 1.018908977508545,
+ "learning_rate": 0.0002885142857142857,
+ "loss": 0.1924,
+ "step": 135
+ },
+ {
+ "epoch": 0.8,
+ "grad_norm": 2.123642921447754,
+ "learning_rate": 0.0002880857142857143,
+ "loss": 0.2174,
+ "step": 140
+ },
+ {
+ "epoch": 0.8285714285714286,
+ "grad_norm": 0.7585068941116333,
+ "learning_rate": 0.0002876571428571428,
+ "loss": 0.2006,
+ "step": 145
+ },
+ {
+ "epoch": 0.8571428571428571,
+ "grad_norm": 1.64150869846344,
+ "learning_rate": 0.0002872285714285714,
+ "loss": 0.1905,
+ "step": 150
+ },
+ {
+ "epoch": 0.8857142857142857,
+ "grad_norm": 0.9126951694488525,
+ "learning_rate": 0.0002868,
+ "loss": 0.2312,
+ "step": 155
+ },
+ {
+ "epoch": 0.9142857142857143,
+ "grad_norm": 0.7278801202774048,
+ "learning_rate": 0.00028637142857142856,
+ "loss": 0.2077,
+ "step": 160
+ },
+ {
+ "epoch": 0.9428571428571428,
+ "grad_norm": 0.8931339383125305,
+ "learning_rate": 0.00028594285714285715,
+ "loss": 0.1951,
+ "step": 165
+ },
+ {
+ "epoch": 0.9714285714285714,
+ "grad_norm": 1.0831843614578247,
+ "learning_rate": 0.0002855142857142857,
+ "loss": 0.2103,
+ "step": 170
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 1.3750063180923462,
+ "learning_rate": 0.00028508571428571426,
+ "loss": 0.2396,
+ "step": 175
+ },
+ {
+ "epoch": 1.0285714285714285,
+ "grad_norm": 0.8338337540626526,
+ "learning_rate": 0.00028465714285714285,
+ "loss": 0.2404,
+ "step": 180
+ },
+ {
+ "epoch": 1.0571428571428572,
+ "grad_norm": 1.2879024744033813,
+ "learning_rate": 0.0002842285714285714,
+ "loss": 0.2117,
+ "step": 185
+ },
+ {
+ "epoch": 1.0857142857142856,
+ "grad_norm": 1.6751821041107178,
+ "learning_rate": 0.00028379999999999996,
+ "loss": 0.1796,
+ "step": 190
+ },
+ {
+ "epoch": 1.1142857142857143,
+ "grad_norm": 0.9864417910575867,
+ "learning_rate": 0.00028337142857142854,
+ "loss": 0.1993,
+ "step": 195
+ },
+ {
+ "epoch": 1.1428571428571428,
+ "grad_norm": 1.0174155235290527,
+ "learning_rate": 0.00028294285714285713,
+ "loss": 0.2068,
+ "step": 200
+ },
+ {
+ "epoch": 1.1714285714285715,
+ "grad_norm": 1.029832124710083,
+ "learning_rate": 0.0002825142857142857,
+ "loss": 0.2015,
+ "step": 205
+ },
+ {
+ "epoch": 1.2,
+ "grad_norm": 0.7745446562767029,
+ "learning_rate": 0.00028208571428571424,
+ "loss": 0.2129,
+ "step": 210
+ },
+ {
+ "epoch": 1.2285714285714286,
+ "grad_norm": 2.5578622817993164,
+ "learning_rate": 0.0002816571428571428,
+ "loss": 0.2224,
+ "step": 215
+ },
+ {
+ "epoch": 1.2571428571428571,
+ "grad_norm": 2.4185051918029785,
+ "learning_rate": 0.0002812285714285714,
+ "loss": 0.2276,
+ "step": 220
+ },
+ {
+ "epoch": 1.2857142857142856,
+ "grad_norm": 1.4176461696624756,
+ "learning_rate": 0.0002808,
+ "loss": 0.1781,
+ "step": 225
+ },
+ {
+ "epoch": 1.3142857142857143,
+ "grad_norm": 0.709326982498169,
+ "learning_rate": 0.0002803714285714286,
+ "loss": 0.2177,
+ "step": 230
+ },
+ {
+ "epoch": 1.342857142857143,
+ "grad_norm": 0.8170766830444336,
+ "learning_rate": 0.0002799428571428571,
+ "loss": 0.1769,
+ "step": 235
+ },
+ {
+ "epoch": 1.3714285714285714,
+ "grad_norm": 1.3850761651992798,
+ "learning_rate": 0.0002795142857142857,
+ "loss": 0.2262,
+ "step": 240
+ },
+ {
+ "epoch": 1.4,
+ "grad_norm": 1.0064373016357422,
+ "learning_rate": 0.0002790857142857143,
+ "loss": 0.196,
+ "step": 245
+ },
+ {
+ "epoch": 1.4285714285714286,
+ "grad_norm": 1.9635728597640991,
+ "learning_rate": 0.0002786571428571428,
+ "loss": 0.2029,
+ "step": 250
+ },
+ {
+ "epoch": 1.457142857142857,
+ "grad_norm": 16.20791244506836,
+ "learning_rate": 0.0002782285714285714,
+ "loss": 0.3925,
+ "step": 255
+ },
+ {
+ "epoch": 1.4857142857142858,
+ "grad_norm": 1.4363322257995605,
+ "learning_rate": 0.0002778,
+ "loss": 0.3684,
+ "step": 260
+ },
+ {
+ "epoch": 1.5142857142857142,
+ "grad_norm": 0.9379534721374512,
+ "learning_rate": 0.00027737142857142856,
+ "loss": 0.2265,
+ "step": 265
+ },
+ {
+ "epoch": 1.5428571428571427,
+ "grad_norm": 0.8453512787818909,
+ "learning_rate": 0.00027694285714285714,
+ "loss": 0.1976,
+ "step": 270
+ },
+ {
+ "epoch": 1.5714285714285714,
+ "grad_norm": 2.316664695739746,
+ "learning_rate": 0.0002765142857142857,
+ "loss": 0.23,
+ "step": 275
+ },
+ {
+ "epoch": 1.6,
+ "grad_norm": 1.0548444986343384,
+ "learning_rate": 0.00027608571428571426,
+ "loss": 0.1823,
+ "step": 280
+ },
+ {
+ "epoch": 1.6285714285714286,
+ "grad_norm": 3.7894928455352783,
+ "learning_rate": 0.00027565714285714284,
+ "loss": 0.1962,
+ "step": 285
+ },
+ {
+ "epoch": 1.657142857142857,
+ "grad_norm": 2.3081610202789307,
+ "learning_rate": 0.00027522857142857143,
+ "loss": 0.2087,
+ "step": 290
+ },
+ {
+ "epoch": 1.6857142857142857,
+ "grad_norm": 0.9311438202857971,
+ "learning_rate": 0.0002748,
+ "loss": 0.1597,
+ "step": 295
+ },
+ {
+ "epoch": 1.7142857142857144,
+ "grad_norm": 1.1881247758865356,
+ "learning_rate": 0.00027437142857142854,
+ "loss": 0.1764,
+ "step": 300
+ },
+ {
+ "epoch": 1.7428571428571429,
+ "grad_norm": 1.30265212059021,
+ "learning_rate": 0.0002739428571428571,
+ "loss": 0.1647,
+ "step": 305
+ },
+ {
+ "epoch": 1.7714285714285714,
+ "grad_norm": 0.6832175850868225,
+ "learning_rate": 0.0002735142857142857,
+ "loss": 0.1638,
+ "step": 310
+ },
+ {
+ "epoch": 1.8,
+ "grad_norm": 1.8740538358688354,
+ "learning_rate": 0.00027308571428571424,
+ "loss": 0.1803,
+ "step": 315
+ },
+ {
+ "epoch": 1.8285714285714287,
+ "grad_norm": 9.821504592895508,
+ "learning_rate": 0.0002726571428571428,
+ "loss": 0.226,
+ "step": 320
+ },
+ {
+ "epoch": 1.8571428571428572,
+ "grad_norm": 1.0889750719070435,
+ "learning_rate": 0.0002722285714285714,
+ "loss": 0.1822,
+ "step": 325
+ },
+ {
+ "epoch": 1.8857142857142857,
+ "grad_norm": 0.9660868048667908,
+ "learning_rate": 0.0002718,
+ "loss": 0.1842,
+ "step": 330
+ },
+ {
+ "epoch": 1.9142857142857141,
+ "grad_norm": 0.6329234838485718,
+ "learning_rate": 0.0002713714285714286,
+ "loss": 0.1488,
+ "step": 335
+ },
+ {
+ "epoch": 1.9428571428571428,
+ "grad_norm": 3.601266384124756,
+ "learning_rate": 0.0002709428571428571,
+ "loss": 0.1887,
+ "step": 340
+ },
+ {
+ "epoch": 1.9714285714285715,
+ "grad_norm": 1.1441439390182495,
+ "learning_rate": 0.0002705142857142857,
+ "loss": 0.184,
+ "step": 345
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.8586034774780273,
+ "learning_rate": 0.0002700857142857143,
+ "loss": 0.1578,
+ "step": 350
+ },
+ {
+ "epoch": 2.0285714285714285,
+ "grad_norm": 1.5113487243652344,
+ "learning_rate": 0.00026965714285714286,
+ "loss": 0.2002,
+ "step": 355
+ },
+ {
+ "epoch": 2.057142857142857,
+ "grad_norm": 1.1123011112213135,
+ "learning_rate": 0.0002692285714285714,
+ "loss": 0.1946,
+ "step": 360
+ },
+ {
+ "epoch": 2.085714285714286,
+ "grad_norm": 0.9377036094665527,
+ "learning_rate": 0.0002688,
+ "loss": 0.1971,
+ "step": 365
+ },
+ {
+ "epoch": 2.1142857142857143,
+ "grad_norm": 0.6956892609596252,
+ "learning_rate": 0.00026837142857142856,
+ "loss": 0.1758,
+ "step": 370
+ },
+ {
+ "epoch": 2.142857142857143,
+ "grad_norm": 0.7510782480239868,
+ "learning_rate": 0.0002679428571428571,
+ "loss": 0.1674,
+ "step": 375
+ },
+ {
+ "epoch": 2.1714285714285713,
+ "grad_norm": 0.7009285092353821,
+ "learning_rate": 0.00026751428571428567,
+ "loss": 0.1945,
+ "step": 380
+ },
+ {
+ "epoch": 2.2,
+ "grad_norm": 0.9555609822273254,
+ "learning_rate": 0.00026708571428571426,
+ "loss": 0.1857,
+ "step": 385
+ },
+ {
+ "epoch": 2.2285714285714286,
+ "grad_norm": 2.133979082107544,
+ "learning_rate": 0.00026665714285714284,
+ "loss": 0.1636,
+ "step": 390
+ },
+ {
+ "epoch": 2.257142857142857,
+ "grad_norm": 0.7105309963226318,
+ "learning_rate": 0.0002662285714285714,
+ "loss": 0.2014,
+ "step": 395
+ },
+ {
+ "epoch": 2.2857142857142856,
+ "grad_norm": 0.7329701781272888,
+ "learning_rate": 0.00026579999999999996,
+ "loss": 0.1884,
+ "step": 400
+ },
+ {
+ "epoch": 2.314285714285714,
+ "grad_norm": 1.0426994562149048,
+ "learning_rate": 0.00026537142857142854,
+ "loss": 0.1558,
+ "step": 405
+ },
+ {
+ "epoch": 2.342857142857143,
+ "grad_norm": 0.9306122660636902,
+ "learning_rate": 0.0002649428571428571,
+ "loss": 0.1774,
+ "step": 410
+ },
+ {
+ "epoch": 2.3714285714285714,
+ "grad_norm": 0.6989394426345825,
+ "learning_rate": 0.00026451428571428565,
+ "loss": 0.1601,
+ "step": 415
+ },
+ {
+ "epoch": 2.4,
+ "grad_norm": 1.4383760690689087,
+ "learning_rate": 0.0002640857142857143,
+ "loss": 0.1564,
+ "step": 420
+ },
+ {
+ "epoch": 2.4285714285714284,
+ "grad_norm": 0.6448336839675903,
+ "learning_rate": 0.0002636571428571428,
+ "loss": 0.1827,
+ "step": 425
+ },
+ {
+ "epoch": 2.4571428571428573,
+ "grad_norm": 0.9535760879516602,
+ "learning_rate": 0.0002632285714285714,
+ "loss": 0.1713,
+ "step": 430
+ },
+ {
+ "epoch": 2.4857142857142858,
+ "grad_norm": 1.034945011138916,
+ "learning_rate": 0.0002628,
+ "loss": 0.1457,
+ "step": 435
+ },
+ {
+ "epoch": 2.5142857142857142,
+ "grad_norm": 1.3225128650665283,
+ "learning_rate": 0.0002623714285714285,
+ "loss": 0.1633,
+ "step": 440
+ },
+ {
+ "epoch": 2.5428571428571427,
+ "grad_norm": 0.8285059928894043,
+ "learning_rate": 0.0002619428571428571,
+ "loss": 0.2004,
+ "step": 445
+ },
+ {
+ "epoch": 2.571428571428571,
+ "grad_norm": 0.773176908493042,
+ "learning_rate": 0.0002615142857142857,
+ "loss": 0.1641,
+ "step": 450
+ },
+ {
+ "epoch": 2.6,
+ "grad_norm": 0.7964853048324585,
+ "learning_rate": 0.0002610857142857143,
+ "loss": 0.1608,
+ "step": 455
+ },
+ {
+ "epoch": 2.6285714285714286,
+ "grad_norm": 1.0967328548431396,
+ "learning_rate": 0.00026065714285714286,
+ "loss": 0.1697,
+ "step": 460
+ },
+ {
+ "epoch": 2.657142857142857,
+ "grad_norm": 0.6462066173553467,
+ "learning_rate": 0.0002602285714285714,
+ "loss": 0.1512,
+ "step": 465
+ },
+ {
+ "epoch": 2.685714285714286,
+ "grad_norm": 0.8765937089920044,
+ "learning_rate": 0.00025979999999999997,
+ "loss": 0.1826,
+ "step": 470
+ },
+ {
+ "epoch": 2.7142857142857144,
+ "grad_norm": 1.2524124383926392,
+ "learning_rate": 0.00025937142857142856,
+ "loss": 0.1731,
+ "step": 475
+ },
+ {
+ "epoch": 2.742857142857143,
+ "grad_norm": 2.2982606887817383,
+ "learning_rate": 0.0002589428571428571,
+ "loss": 0.1852,
+ "step": 480
+ },
+ {
+ "epoch": 2.7714285714285714,
+ "grad_norm": 0.9989053010940552,
+ "learning_rate": 0.0002585142857142857,
+ "loss": 0.1791,
+ "step": 485
+ },
+ {
+ "epoch": 2.8,
+ "grad_norm": 0.772343635559082,
+ "learning_rate": 0.00025808571428571426,
+ "loss": 0.1862,
+ "step": 490
+ },
+ {
+ "epoch": 2.8285714285714287,
+ "grad_norm": 1.2101136445999146,
+ "learning_rate": 0.00025765714285714284,
+ "loss": 0.1806,
+ "step": 495
+ },
+ {
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.8010189533233643,
+ "learning_rate": 0.0002572285714285714,
+ "loss": 0.1842,
+ "step": 500
+ },
+ {
+ "epoch": 2.8857142857142857,
+ "grad_norm": 1.3597544431686401,
+ "learning_rate": 0.00025679999999999995,
+ "loss": 0.1583,
+ "step": 505
+ },
+ {
+ "epoch": 2.914285714285714,
+ "grad_norm": 0.8790671825408936,
+ "learning_rate": 0.00025637142857142854,
+ "loss": 0.1565,
+ "step": 510
+ },
+ {
+ "epoch": 2.942857142857143,
+ "grad_norm": 1.1175066232681274,
+ "learning_rate": 0.0002559428571428571,
+ "loss": 0.1406,
+ "step": 515
+ },
+ {
+ "epoch": 2.9714285714285715,
+ "grad_norm": 2.8528785705566406,
+ "learning_rate": 0.0002555142857142857,
+ "loss": 0.1735,
+ "step": 520
+ },
+ {
+ "epoch": 3.0,
+ "grad_norm": 2.2073328495025635,
+ "learning_rate": 0.0002550857142857143,
+ "loss": 0.1816,
+ "step": 525
+ },
+ {
+ "epoch": 3.0285714285714285,
+ "grad_norm": 11.01322078704834,
+ "learning_rate": 0.0002546571428571428,
+ "loss": 0.1873,
+ "step": 530
+ },
+ {
+ "epoch": 3.057142857142857,
+ "grad_norm": 1.5822402238845825,
+ "learning_rate": 0.0002542285714285714,
+ "loss": 0.168,
+ "step": 535
+ },
+ {
+ "epoch": 3.085714285714286,
+ "grad_norm": 1.3086942434310913,
+ "learning_rate": 0.0002538,
+ "loss": 0.149,
+ "step": 540
+ },
+ {
+ "epoch": 3.1142857142857143,
+ "grad_norm": 6.303041458129883,
+ "learning_rate": 0.0002533714285714285,
+ "loss": 0.1651,
+ "step": 545
+ },
+ {
+ "epoch": 3.142857142857143,
+ "grad_norm": 14.48929500579834,
+ "learning_rate": 0.00025294285714285716,
+ "loss": 0.1687,
+ "step": 550
+ },
+ {
+ "epoch": 3.1714285714285713,
+ "grad_norm": 6.824525356292725,
+ "learning_rate": 0.0002525142857142857,
+ "loss": 0.1919,
+ "step": 555
+ },
+ {
+ "epoch": 3.2,
+ "grad_norm": 18.772563934326172,
+ "learning_rate": 0.00025208571428571427,
+ "loss": 0.2075,
+ "step": 560
+ },
+ {
+ "epoch": 3.2285714285714286,
+ "grad_norm": 0.7268752455711365,
+ "learning_rate": 0.00025165714285714286,
+ "loss": 0.174,
+ "step": 565
+ },
+ {
+ "epoch": 3.257142857142857,
+ "grad_norm": 1.1301453113555908,
+ "learning_rate": 0.0002512285714285714,
+ "loss": 0.1668,
+ "step": 570
+ },
+ {
+ "epoch": 3.2857142857142856,
+ "grad_norm": 2.846802234649658,
+ "learning_rate": 0.00025079999999999997,
+ "loss": 0.1645,
+ "step": 575
+ },
+ {
+ "epoch": 3.314285714285714,
+ "grad_norm": 1.417515754699707,
+ "learning_rate": 0.00025037142857142855,
+ "loss": 0.1719,
+ "step": 580
+ },
+ {
+ "epoch": 3.342857142857143,
+ "grad_norm": 4.137150764465332,
+ "learning_rate": 0.00024994285714285714,
+ "loss": 0.1739,
+ "step": 585
+ },
+ {
+ "epoch": 3.3714285714285714,
+ "grad_norm": 2.6067259311676025,
+ "learning_rate": 0.0002495142857142857,
+ "loss": 0.1489,
+ "step": 590
+ },
+ {
+ "epoch": 3.4,
+ "grad_norm": 2.601024627685547,
+ "learning_rate": 0.00024908571428571425,
+ "loss": 0.1618,
+ "step": 595
+ },
+ {
+ "epoch": 3.4285714285714284,
+ "grad_norm": 3.849017858505249,
+ "learning_rate": 0.00024865714285714284,
+ "loss": 0.1899,
+ "step": 600
+ },
+ {
+ "epoch": 3.4571428571428573,
+ "grad_norm": 4.673766136169434,
+ "learning_rate": 0.0002482285714285714,
+ "loss": 0.1761,
+ "step": 605
+ },
+ {
+ "epoch": 3.4857142857142858,
+ "grad_norm": 2.6057631969451904,
+ "learning_rate": 0.00024779999999999995,
+ "loss": 0.1743,
+ "step": 610
+ },
+ {
+ "epoch": 3.5142857142857142,
+ "grad_norm": 2.932652473449707,
+ "learning_rate": 0.0002473714285714286,
+ "loss": 0.1482,
+ "step": 615
+ },
+ {
+ "epoch": 3.5428571428571427,
+ "grad_norm": 0.8764939308166504,
+ "learning_rate": 0.0002469428571428571,
+ "loss": 0.1644,
+ "step": 620
+ },
+ {
+ "epoch": 3.571428571428571,
+ "grad_norm": 1.3203191757202148,
+ "learning_rate": 0.0002465142857142857,
+ "loss": 0.1654,
+ "step": 625
+ },
+ {
+ "epoch": 3.6,
+ "grad_norm": 0.7977635264396667,
+ "learning_rate": 0.0002460857142857143,
+ "loss": 0.1472,
+ "step": 630
+ },
+ {
+ "epoch": 3.6285714285714286,
+ "grad_norm": 1.4750248193740845,
+ "learning_rate": 0.0002456571428571428,
+ "loss": 0.1735,
+ "step": 635
+ },
+ {
+ "epoch": 3.657142857142857,
+ "grad_norm": 1.8164482116699219,
+ "learning_rate": 0.0002452285714285714,
+ "loss": 0.1593,
+ "step": 640
+ },
+ {
+ "epoch": 3.685714285714286,
+ "grad_norm": 1.4829603433609009,
+ "learning_rate": 0.0002448,
+ "loss": 0.1508,
+ "step": 645
+ },
+ {
+ "epoch": 3.7142857142857144,
+ "grad_norm": 0.8828144669532776,
+ "learning_rate": 0.00024437142857142857,
+ "loss": 0.1573,
+ "step": 650
+ },
+ {
+ "epoch": 3.742857142857143,
+ "grad_norm": 2.039384126663208,
+ "learning_rate": 0.00024394285714285713,
+ "loss": 0.1745,
+ "step": 655
+ },
+ {
+ "epoch": 3.7714285714285714,
+ "grad_norm": 0.9604200720787048,
+ "learning_rate": 0.00024351428571428569,
+ "loss": 0.17,
+ "step": 660
+ },
+ {
+ "epoch": 3.8,
+ "grad_norm": 0.7903971076011658,
+ "learning_rate": 0.00024308571428571427,
+ "loss": 0.1654,
+ "step": 665
+ },
+ {
+ "epoch": 3.8285714285714287,
+ "grad_norm": 0.6935649514198303,
+ "learning_rate": 0.00024265714285714283,
+ "loss": 0.1714,
+ "step": 670
+ },
+ {
+ "epoch": 3.857142857142857,
+ "grad_norm": 0.5832012295722961,
+ "learning_rate": 0.00024222857142857138,
+ "loss": 0.1636,
+ "step": 675
+ },
+ {
+ "epoch": 3.8857142857142857,
+ "grad_norm": 0.6303168535232544,
+ "learning_rate": 0.0002418,
+ "loss": 0.1604,
+ "step": 680
+ },
+ {
+ "epoch": 3.914285714285714,
+ "grad_norm": 0.7210885882377625,
+ "learning_rate": 0.00024137142857142855,
+ "loss": 0.1444,
+ "step": 685
+ },
+ {
+ "epoch": 3.942857142857143,
+ "grad_norm": 0.7690990567207336,
+ "learning_rate": 0.00024094285714285714,
+ "loss": 0.1631,
+ "step": 690
+ },
+ {
+ "epoch": 3.9714285714285715,
+ "grad_norm": 1.0142720937728882,
+ "learning_rate": 0.0002405142857142857,
+ "loss": 0.158,
+ "step": 695
+ },
+ {
+ "epoch": 4.0,
+ "grad_norm": 0.7970322966575623,
+ "learning_rate": 0.00024008571428571425,
+ "loss": 0.1803,
+ "step": 700
+ },
+ {
+ "epoch": 4.0285714285714285,
+ "grad_norm": 0.6795914769172668,
+ "learning_rate": 0.00023965714285714284,
+ "loss": 0.143,
+ "step": 705
+ },
+ {
+ "epoch": 4.057142857142857,
+ "grad_norm": 0.6832629442214966,
+ "learning_rate": 0.0002392285714285714,
+ "loss": 0.1457,
+ "step": 710
+ },
+ {
+ "epoch": 4.085714285714285,
+ "grad_norm": 3.8629798889160156,
+ "learning_rate": 0.0002388,
+ "loss": 0.1671,
+ "step": 715
+ },
+ {
+ "epoch": 4.114285714285714,
+ "grad_norm": 1.1167882680892944,
+ "learning_rate": 0.00023837142857142856,
+ "loss": 0.1544,
+ "step": 720
+ },
+ {
+ "epoch": 4.142857142857143,
+ "grad_norm": 0.9431412816047668,
+ "learning_rate": 0.00023794285714285712,
+ "loss": 0.1605,
+ "step": 725
+ },
+ {
+ "epoch": 4.171428571428572,
+ "grad_norm": 1.310948133468628,
+ "learning_rate": 0.0002375142857142857,
+ "loss": 0.1121,
+ "step": 730
+ },
+ {
+ "epoch": 4.2,
+ "grad_norm": 0.9830737709999084,
+ "learning_rate": 0.00023708571428571426,
+ "loss": 0.1742,
+ "step": 735
+ },
+ {
+ "epoch": 4.228571428571429,
+ "grad_norm": 0.6166555881500244,
+ "learning_rate": 0.00023665714285714282,
+ "loss": 0.1525,
+ "step": 740
+ },
+ {
+ "epoch": 4.257142857142857,
+ "grad_norm": 0.995579719543457,
+ "learning_rate": 0.00023622857142857143,
+ "loss": 0.1439,
+ "step": 745
+ },
+ {
+ "epoch": 4.285714285714286,
+ "grad_norm": 0.639796793460846,
+ "learning_rate": 0.00023579999999999999,
+ "loss": 0.1692,
+ "step": 750
+ },
+ {
+ "epoch": 4.314285714285714,
+ "grad_norm": 0.9438050389289856,
+ "learning_rate": 0.00023537142857142854,
+ "loss": 0.1785,
+ "step": 755
+ },
+ {
+ "epoch": 4.3428571428571425,
+ "grad_norm": 0.8960750102996826,
+ "learning_rate": 0.00023494285714285713,
+ "loss": 0.1557,
+ "step": 760
+ },
+ {
+ "epoch": 4.371428571428572,
+ "grad_norm": 0.6287499070167542,
+ "learning_rate": 0.00023451428571428568,
+ "loss": 0.1459,
+ "step": 765
+ },
+ {
+ "epoch": 4.4,
+ "grad_norm": 0.7638295888900757,
+ "learning_rate": 0.00023408571428571424,
+ "loss": 0.1341,
+ "step": 770
+ },
+ {
+ "epoch": 4.428571428571429,
+ "grad_norm": 0.655878484249115,
+ "learning_rate": 0.00023365714285714283,
+ "loss": 0.1358,
+ "step": 775
+ },
+ {
+ "epoch": 4.457142857142857,
+ "grad_norm": 0.5840997695922852,
+ "learning_rate": 0.0002332285714285714,
+ "loss": 0.1386,
+ "step": 780
+ },
+ {
+ "epoch": 4.485714285714286,
+ "grad_norm": 1.1082488298416138,
+ "learning_rate": 0.0002328,
+ "loss": 0.1827,
+ "step": 785
+ },
+ {
+ "epoch": 4.514285714285714,
+ "grad_norm": 0.8825240135192871,
+ "learning_rate": 0.00023237142857142855,
+ "loss": 0.1527,
+ "step": 790
+ },
+ {
+ "epoch": 4.542857142857143,
+ "grad_norm": 0.6752304434776306,
+ "learning_rate": 0.0002319428571428571,
+ "loss": 0.1392,
+ "step": 795
+ },
+ {
+ "epoch": 4.571428571428571,
+ "grad_norm": 1.1423301696777344,
+ "learning_rate": 0.0002315142857142857,
+ "loss": 0.1433,
+ "step": 800
+ },
+ {
+ "epoch": 4.6,
+ "grad_norm": 10.793691635131836,
+ "learning_rate": 0.00023108571428571425,
+ "loss": 0.1635,
+ "step": 805
+ },
+ {
+ "epoch": 4.628571428571428,
+ "grad_norm": 0.47564294934272766,
+ "learning_rate": 0.00023065714285714286,
+ "loss": 0.1199,
+ "step": 810
+ },
+ {
+ "epoch": 4.6571428571428575,
+ "grad_norm": 1.2492656707763672,
+ "learning_rate": 0.00023022857142857142,
+ "loss": 0.1488,
+ "step": 815
+ },
+ {
+ "epoch": 4.685714285714286,
+ "grad_norm": 0.6933501958847046,
+ "learning_rate": 0.00022979999999999997,
+ "loss": 0.1812,
+ "step": 820
+ },
+ {
+ "epoch": 4.714285714285714,
+ "grad_norm": 0.7901633977890015,
+ "learning_rate": 0.00022937142857142856,
+ "loss": 0.1415,
+ "step": 825
+ },
+ {
+ "epoch": 4.742857142857143,
+ "grad_norm": 0.7854829430580139,
+ "learning_rate": 0.00022894285714285712,
+ "loss": 0.1401,
+ "step": 830
+ },
+ {
+ "epoch": 4.771428571428571,
+ "grad_norm": 0.8716740608215332,
+ "learning_rate": 0.00022851428571428567,
+ "loss": 0.1982,
+ "step": 835
+ },
+ {
+ "epoch": 4.8,
+ "grad_norm": 0.7047899961471558,
+ "learning_rate": 0.00022808571428571426,
+ "loss": 0.1624,
+ "step": 840
+ },
+ {
+ "epoch": 4.828571428571428,
+ "grad_norm": 0.7134959697723389,
+ "learning_rate": 0.00022765714285714284,
+ "loss": 0.1375,
+ "step": 845
+ },
+ {
+ "epoch": 4.857142857142857,
+ "grad_norm": 1.0897325277328491,
+ "learning_rate": 0.00022722857142857143,
+ "loss": 0.1489,
+ "step": 850
+ },
+ {
+ "epoch": 4.885714285714286,
+ "grad_norm": 1.1065207719802856,
+ "learning_rate": 0.00022679999999999998,
+ "loss": 0.1495,
+ "step": 855
+ },
+ {
+ "epoch": 4.914285714285715,
+ "grad_norm": 0.7434757351875305,
+ "learning_rate": 0.00022637142857142854,
+ "loss": 0.1507,
+ "step": 860
+ },
+ {
+ "epoch": 4.942857142857143,
+ "grad_norm": 1.0045181512832642,
+ "learning_rate": 0.00022594285714285712,
+ "loss": 0.1527,
+ "step": 865
+ },
+ {
+ "epoch": 4.9714285714285715,
+ "grad_norm": 1.2025654315948486,
+ "learning_rate": 0.00022551428571428568,
+ "loss": 0.1523,
+ "step": 870
+ },
+ {
+ "epoch": 5.0,
+ "grad_norm": 0.7823342084884644,
+ "learning_rate": 0.0002250857142857143,
+ "loss": 0.1514,
+ "step": 875
+ },
+ {
+ "epoch": 5.0285714285714285,
+ "grad_norm": 0.8405362963676453,
+ "learning_rate": 0.00022465714285714285,
+ "loss": 0.1461,
+ "step": 880
+ },
+ {
+ "epoch": 5.057142857142857,
+ "grad_norm": 0.7527463436126709,
+ "learning_rate": 0.0002242285714285714,
+ "loss": 0.1206,
+ "step": 885
+ },
+ {
+ "epoch": 5.085714285714285,
+ "grad_norm": 0.8372548222541809,
+ "learning_rate": 0.0002238,
+ "loss": 0.1513,
+ "step": 890
+ },
+ {
+ "epoch": 5.114285714285714,
+ "grad_norm": 0.8755456209182739,
+ "learning_rate": 0.00022337142857142855,
+ "loss": 0.1498,
+ "step": 895
+ },
+ {
+ "epoch": 5.142857142857143,
+ "grad_norm": 0.7312084436416626,
+ "learning_rate": 0.0002229428571428571,
+ "loss": 0.154,
+ "step": 900
+ },
+ {
+ "epoch": 5.171428571428572,
+ "grad_norm": 0.6366221904754639,
+ "learning_rate": 0.0002225142857142857,
+ "loss": 0.1466,
+ "step": 905
+ },
+ {
+ "epoch": 5.2,
+ "grad_norm": 0.6406880617141724,
+ "learning_rate": 0.00022208571428571427,
+ "loss": 0.1254,
+ "step": 910
+ },
+ {
+ "epoch": 5.228571428571429,
+ "grad_norm": 2.4106833934783936,
+ "learning_rate": 0.00022165714285714283,
+ "loss": 0.1534,
+ "step": 915
+ },
+ {
+ "epoch": 5.257142857142857,
+ "grad_norm": 0.5635722279548645,
+ "learning_rate": 0.00022122857142857142,
+ "loss": 0.1461,
+ "step": 920
+ },
+ {
+ "epoch": 5.285714285714286,
+ "grad_norm": 0.787162184715271,
+ "learning_rate": 0.00022079999999999997,
+ "loss": 0.1424,
+ "step": 925
+ },
+ {
+ "epoch": 5.314285714285714,
+ "grad_norm": 0.6513975262641907,
+ "learning_rate": 0.00022037142857142853,
+ "loss": 0.1326,
+ "step": 930
+ },
+ {
+ "epoch": 5.3428571428571425,
+ "grad_norm": 0.6933534741401672,
+ "learning_rate": 0.00021994285714285711,
+ "loss": 0.1661,
+ "step": 935
+ },
+ {
+ "epoch": 5.371428571428572,
+ "grad_norm": 0.7263259887695312,
+ "learning_rate": 0.0002195142857142857,
+ "loss": 0.15,
+ "step": 940
+ },
+ {
+ "epoch": 5.4,
+ "grad_norm": 0.5537381768226624,
+ "learning_rate": 0.00021908571428571428,
+ "loss": 0.129,
+ "step": 945
+ },
+ {
+ "epoch": 5.428571428571429,
+ "grad_norm": 0.6014005541801453,
+ "learning_rate": 0.00021865714285714284,
+ "loss": 0.1321,
+ "step": 950
+ },
+ {
+ "epoch": 5.457142857142857,
+ "grad_norm": 0.6581441760063171,
+ "learning_rate": 0.0002182285714285714,
+ "loss": 0.1587,
+ "step": 955
+ },
+ {
+ "epoch": 5.485714285714286,
+ "grad_norm": 0.9326379895210266,
+ "learning_rate": 0.00021779999999999998,
+ "loss": 0.1654,
+ "step": 960
+ },
+ {
+ "epoch": 5.514285714285714,
+ "grad_norm": 0.9438592791557312,
+ "learning_rate": 0.00021737142857142854,
+ "loss": 0.1212,
+ "step": 965
+ },
+ {
+ "epoch": 5.542857142857143,
+ "grad_norm": 0.7699571251869202,
+ "learning_rate": 0.00021694285714285715,
+ "loss": 0.1464,
+ "step": 970
+ },
+ {
+ "epoch": 5.571428571428571,
+ "grad_norm": 0.8758366703987122,
+ "learning_rate": 0.0002165142857142857,
+ "loss": 0.1599,
+ "step": 975
+ },
+ {
+ "epoch": 5.6,
+ "grad_norm": 0.6101442575454712,
+ "learning_rate": 0.00021608571428571426,
+ "loss": 0.1589,
+ "step": 980
+ },
+ {
+ "epoch": 5.628571428571428,
+ "grad_norm": 0.7454060912132263,
+ "learning_rate": 0.00021565714285714285,
+ "loss": 0.1433,
+ "step": 985
+ },
+ {
+ "epoch": 5.6571428571428575,
+ "grad_norm": 0.6379484534263611,
+ "learning_rate": 0.0002152285714285714,
+ "loss": 0.1592,
+ "step": 990
+ },
+ {
+ "epoch": 5.685714285714286,
+ "grad_norm": 1.1601309776306152,
+ "learning_rate": 0.00021479999999999996,
+ "loss": 0.1647,
+ "step": 995
+ },
+ {
+ "epoch": 5.714285714285714,
+ "grad_norm": 0.5464673638343811,
+ "learning_rate": 0.00021437142857142855,
+ "loss": 0.1469,
+ "step": 1000
+ },
+ {
+ "epoch": 5.742857142857143,
+ "grad_norm": 1.0279319286346436,
+ "learning_rate": 0.00021394285714285713,
+ "loss": 0.1203,
+ "step": 1005
+ },
+ {
+ "epoch": 5.771428571428571,
+ "grad_norm": 0.5503718256950378,
+ "learning_rate": 0.00021351428571428572,
+ "loss": 0.1409,
+ "step": 1010
+ },
+ {
+ "epoch": 5.8,
+ "grad_norm": 0.6123886108398438,
+ "learning_rate": 0.00021308571428571427,
+ "loss": 0.1427,
+ "step": 1015
+ },
+ {
+ "epoch": 5.828571428571428,
+ "grad_norm": 0.6560390591621399,
+ "learning_rate": 0.00021265714285714283,
+ "loss": 0.1415,
+ "step": 1020
+ },
+ {
+ "epoch": 5.857142857142857,
+ "grad_norm": 0.5576716661453247,
+ "learning_rate": 0.00021222857142857141,
+ "loss": 0.1408,
+ "step": 1025
+ },
+ {
+ "epoch": 5.885714285714286,
+ "grad_norm": 0.6419074535369873,
+ "learning_rate": 0.00021179999999999997,
+ "loss": 0.1385,
+ "step": 1030
+ },
+ {
+ "epoch": 5.914285714285715,
+ "grad_norm": 1.008925199508667,
+ "learning_rate": 0.00021137142857142858,
+ "loss": 0.1497,
+ "step": 1035
+ },
+ {
+ "epoch": 5.942857142857143,
+ "grad_norm": 0.6559906005859375,
+ "learning_rate": 0.00021094285714285714,
+ "loss": 0.1218,
+ "step": 1040
+ },
+ {
+ "epoch": 5.9714285714285715,
+ "grad_norm": 0.627164363861084,
+ "learning_rate": 0.0002105142857142857,
+ "loss": 0.1368,
+ "step": 1045
+ },
+ {
+ "epoch": 6.0,
+ "grad_norm": 0.5760972499847412,
+ "learning_rate": 0.00021008571428571428,
+ "loss": 0.1508,
+ "step": 1050
+ },
+ {
+ "epoch": 6.0285714285714285,
+ "grad_norm": 0.5754174590110779,
+ "learning_rate": 0.00020965714285714284,
+ "loss": 0.1181,
+ "step": 1055
+ },
+ {
+ "epoch": 6.057142857142857,
+ "grad_norm": 0.8736348748207092,
+ "learning_rate": 0.0002092285714285714,
+ "loss": 0.1252,
+ "step": 1060
+ },
+ {
+ "epoch": 6.085714285714285,
+ "grad_norm": 0.7166719436645508,
+ "learning_rate": 0.00020879999999999998,
+ "loss": 0.1481,
+ "step": 1065
+ },
+ {
+ "epoch": 6.114285714285714,
+ "grad_norm": 0.6494349241256714,
+ "learning_rate": 0.00020837142857142856,
+ "loss": 0.1478,
+ "step": 1070
+ },
+ {
+ "epoch": 6.142857142857143,
+ "grad_norm": 0.6681587100028992,
+ "learning_rate": 0.00020794285714285712,
+ "loss": 0.1488,
+ "step": 1075
+ },
+ {
+ "epoch": 6.171428571428572,
+ "grad_norm": 0.7123684883117676,
+ "learning_rate": 0.0002075142857142857,
+ "loss": 0.1378,
+ "step": 1080
+ },
+ {
+ "epoch": 6.2,
+ "grad_norm": 0.6146950721740723,
+ "learning_rate": 0.00020708571428571426,
+ "loss": 0.1306,
+ "step": 1085
+ },
+ {
+ "epoch": 6.228571428571429,
+ "grad_norm": 0.8402445912361145,
+ "learning_rate": 0.00020665714285714282,
+ "loss": 0.1063,
+ "step": 1090
+ },
+ {
+ "epoch": 6.257142857142857,
+ "grad_norm": 0.6567764282226562,
+ "learning_rate": 0.0002062285714285714,
+ "loss": 0.1195,
+ "step": 1095
+ },
+ {
+ "epoch": 6.285714285714286,
+ "grad_norm": 0.6006014943122864,
+ "learning_rate": 0.0002058,
+ "loss": 0.1542,
+ "step": 1100
+ },
+ {
+ "epoch": 6.314285714285714,
+ "grad_norm": 0.793100893497467,
+ "learning_rate": 0.00020537142857142857,
+ "loss": 0.1381,
+ "step": 1105
+ },
+ {
+ "epoch": 6.3428571428571425,
+ "grad_norm": 0.5923666954040527,
+ "learning_rate": 0.00020494285714285713,
+ "loss": 0.1386,
+ "step": 1110
+ },
+ {
+ "epoch": 6.371428571428572,
+ "grad_norm": 0.6692521572113037,
+ "learning_rate": 0.0002045142857142857,
+ "loss": 0.1223,
+ "step": 1115
+ },
+ {
+ "epoch": 6.4,
+ "grad_norm": 0.7216306328773499,
+ "learning_rate": 0.00020408571428571427,
+ "loss": 0.1367,
+ "step": 1120
+ },
+ {
+ "epoch": 6.428571428571429,
+ "grad_norm": 0.5640934109687805,
+ "learning_rate": 0.00020365714285714283,
+ "loss": 0.1554,
+ "step": 1125
+ },
+ {
+ "epoch": 6.457142857142857,
+ "grad_norm": 0.8154368996620178,
+ "learning_rate": 0.00020322857142857138,
+ "loss": 0.1674,
+ "step": 1130
+ },
+ {
+ "epoch": 6.485714285714286,
+ "grad_norm": 0.7185398936271667,
+ "learning_rate": 0.0002028,
+ "loss": 0.1375,
+ "step": 1135
+ },
+ {
+ "epoch": 6.514285714285714,
+ "grad_norm": 0.6805170774459839,
+ "learning_rate": 0.00020237142857142855,
+ "loss": 0.1306,
+ "step": 1140
+ },
+ {
+ "epoch": 6.542857142857143,
+ "grad_norm": 0.5996941924095154,
+ "learning_rate": 0.00020194285714285714,
+ "loss": 0.1433,
+ "step": 1145
+ },
+ {
+ "epoch": 6.571428571428571,
+ "grad_norm": 0.5258373022079468,
+ "learning_rate": 0.0002015142857142857,
+ "loss": 0.1285,
+ "step": 1150
+ },
+ {
+ "epoch": 6.6,
+ "grad_norm": 0.7771695256233215,
+ "learning_rate": 0.00020108571428571425,
+ "loss": 0.1493,
+ "step": 1155
+ },
+ {
+ "epoch": 6.628571428571428,
+ "grad_norm": 0.5920616388320923,
+ "learning_rate": 0.00020065714285714284,
+ "loss": 0.1479,
+ "step": 1160
+ },
+ {
+ "epoch": 6.6571428571428575,
+ "grad_norm": 0.7460982799530029,
+ "learning_rate": 0.00020022857142857142,
+ "loss": 0.1173,
+ "step": 1165
+ },
+ {
+ "epoch": 6.685714285714286,
+ "grad_norm": 1.1703822612762451,
+ "learning_rate": 0.0001998,
+ "loss": 0.1402,
+ "step": 1170
+ },
+ {
+ "epoch": 6.714285714285714,
+ "grad_norm": 0.7894724011421204,
+ "learning_rate": 0.00019937142857142856,
+ "loss": 0.1253,
+ "step": 1175
+ },
+ {
+ "epoch": 6.742857142857143,
+ "grad_norm": 0.7013376355171204,
+ "learning_rate": 0.00019894285714285712,
+ "loss": 0.1573,
+ "step": 1180
+ },
+ {
+ "epoch": 6.771428571428571,
+ "grad_norm": 0.6421737670898438,
+ "learning_rate": 0.0001985142857142857,
+ "loss": 0.1497,
+ "step": 1185
+ },
+ {
+ "epoch": 6.8,
+ "grad_norm": 1.204296350479126,
+ "learning_rate": 0.00019808571428571426,
+ "loss": 0.1634,
+ "step": 1190
+ },
+ {
+ "epoch": 6.828571428571428,
+ "grad_norm": 0.867765486240387,
+ "learning_rate": 0.00019765714285714282,
+ "loss": 0.1353,
+ "step": 1195
+ },
+ {
+ "epoch": 6.857142857142857,
+ "grad_norm": 0.7325594425201416,
+ "learning_rate": 0.00019722857142857143,
+ "loss": 0.118,
+ "step": 1200
+ },
+ {
+ "epoch": 6.885714285714286,
+ "grad_norm": 0.7029078006744385,
+ "learning_rate": 0.00019679999999999999,
+ "loss": 0.1425,
+ "step": 1205
+ },
+ {
+ "epoch": 6.914285714285715,
+ "grad_norm": 1.1572504043579102,
+ "learning_rate": 0.00019637142857142857,
+ "loss": 0.1337,
+ "step": 1210
+ },
+ {
+ "epoch": 6.942857142857143,
+ "grad_norm": 0.8022822141647339,
+ "learning_rate": 0.00019594285714285713,
+ "loss": 0.1684,
+ "step": 1215
+ },
+ {
+ "epoch": 6.9714285714285715,
+ "grad_norm": 0.6729874610900879,
+ "learning_rate": 0.00019551428571428568,
+ "loss": 0.1238,
+ "step": 1220
+ },
+ {
+ "epoch": 7.0,
+ "grad_norm": 0.5773627758026123,
+ "learning_rate": 0.00019508571428571427,
+ "loss": 0.138,
+ "step": 1225
+ },
+ {
+ "epoch": 7.0285714285714285,
+ "grad_norm": 0.7182291150093079,
+ "learning_rate": 0.00019465714285714285,
+ "loss": 0.1431,
+ "step": 1230
+ },
+ {
+ "epoch": 7.057142857142857,
+ "grad_norm": 1.7567912340164185,
+ "learning_rate": 0.0001942285714285714,
+ "loss": 0.1319,
+ "step": 1235
+ },
+ {
+ "epoch": 7.085714285714285,
+ "grad_norm": 0.6845232248306274,
+ "learning_rate": 0.0001938,
+ "loss": 0.1292,
+ "step": 1240
+ },
+ {
+ "epoch": 7.114285714285714,
+ "grad_norm": 0.6077771782875061,
+ "learning_rate": 0.00019337142857142855,
+ "loss": 0.1238,
+ "step": 1245
+ },
+ {
+ "epoch": 7.142857142857143,
+ "grad_norm": 0.6168347597122192,
+ "learning_rate": 0.0001929428571428571,
+ "loss": 0.1384,
+ "step": 1250
+ },
+ {
+ "epoch": 7.171428571428572,
+ "grad_norm": 0.7457576394081116,
+ "learning_rate": 0.0001925142857142857,
+ "loss": 0.1306,
+ "step": 1255
+ },
+ {
+ "epoch": 7.2,
+ "grad_norm": 0.5969316363334656,
+ "learning_rate": 0.00019208571428571425,
+ "loss": 0.1123,
+ "step": 1260
+ },
+ {
+ "epoch": 7.228571428571429,
+ "grad_norm": 0.6902753710746765,
+ "learning_rate": 0.00019165714285714286,
+ "loss": 0.1185,
+ "step": 1265
+ },
+ {
+ "epoch": 7.257142857142857,
+ "grad_norm": 0.6488338112831116,
+ "learning_rate": 0.00019122857142857142,
+ "loss": 0.1431,
+ "step": 1270
+ },
+ {
+ "epoch": 7.285714285714286,
+ "grad_norm": 0.6814819574356079,
+ "learning_rate": 0.00019079999999999998,
+ "loss": 0.1495,
+ "step": 1275
+ },
+ {
+ "epoch": 7.314285714285714,
+ "grad_norm": 0.7468088865280151,
+ "learning_rate": 0.00019037142857142856,
+ "loss": 0.1158,
+ "step": 1280
+ },
+ {
+ "epoch": 7.3428571428571425,
+ "grad_norm": 0.7417412400245667,
+ "learning_rate": 0.00018994285714285712,
+ "loss": 0.1311,
+ "step": 1285
+ },
+ {
+ "epoch": 7.371428571428572,
+ "grad_norm": 0.5480664372444153,
+ "learning_rate": 0.00018951428571428567,
+ "loss": 0.135,
+ "step": 1290
+ },
+ {
+ "epoch": 7.4,
+ "grad_norm": 0.725527822971344,
+ "learning_rate": 0.00018908571428571429,
+ "loss": 0.1217,
+ "step": 1295
+ },
+ {
+ "epoch": 7.428571428571429,
+ "grad_norm": 0.6566678285598755,
+ "learning_rate": 0.00018865714285714284,
+ "loss": 0.1417,
+ "step": 1300
+ },
+ {
+ "epoch": 7.457142857142857,
+ "grad_norm": 0.516952395439148,
+ "learning_rate": 0.00018822857142857143,
+ "loss": 0.1329,
+ "step": 1305
+ },
+ {
+ "epoch": 7.485714285714286,
+ "grad_norm": 1.9545241594314575,
+ "learning_rate": 0.00018779999999999998,
+ "loss": 0.1339,
+ "step": 1310
+ },
+ {
+ "epoch": 7.514285714285714,
+ "grad_norm": 0.8276839852333069,
+ "learning_rate": 0.00018737142857142854,
+ "loss": 0.1324,
+ "step": 1315
+ },
+ {
+ "epoch": 7.542857142857143,
+ "grad_norm": 0.6737099289894104,
+ "learning_rate": 0.00018694285714285713,
+ "loss": 0.1139,
+ "step": 1320
+ },
+ {
+ "epoch": 7.571428571428571,
+ "grad_norm": 0.6914472579956055,
+ "learning_rate": 0.00018651428571428568,
+ "loss": 0.1146,
+ "step": 1325
+ },
+ {
+ "epoch": 7.6,
+ "grad_norm": 0.6630033850669861,
+ "learning_rate": 0.0001860857142857143,
+ "loss": 0.1571,
+ "step": 1330
+ },
+ {
+ "epoch": 7.628571428571428,
+ "grad_norm": 0.820688784122467,
+ "learning_rate": 0.00018565714285714285,
+ "loss": 0.15,
+ "step": 1335
+ },
+ {
+ "epoch": 7.6571428571428575,
+ "grad_norm": 2.0491325855255127,
+ "learning_rate": 0.0001852285714285714,
+ "loss": 0.127,
+ "step": 1340
+ },
+ {
+ "epoch": 7.685714285714286,
+ "grad_norm": 0.9327268004417419,
+ "learning_rate": 0.0001848,
+ "loss": 0.1289,
+ "step": 1345
+ },
+ {
+ "epoch": 7.714285714285714,
+ "grad_norm": 1.3131701946258545,
+ "learning_rate": 0.00018437142857142855,
+ "loss": 0.1228,
+ "step": 1350
+ },
+ {
+ "epoch": 7.742857142857143,
+ "grad_norm": 2.955918312072754,
+ "learning_rate": 0.0001839428571428571,
+ "loss": 0.1082,
+ "step": 1355
+ },
+ {
+ "epoch": 7.771428571428571,
+ "grad_norm": 1.2165493965148926,
+ "learning_rate": 0.00018351428571428572,
+ "loss": 0.1688,
+ "step": 1360
+ },
+ {
+ "epoch": 7.8,
+ "grad_norm": 0.759324312210083,
+ "learning_rate": 0.00018308571428571428,
+ "loss": 0.1185,
+ "step": 1365
+ },
+ {
+ "epoch": 7.828571428571428,
+ "grad_norm": 0.7445591688156128,
+ "learning_rate": 0.00018265714285714286,
+ "loss": 0.1431,
+ "step": 1370
+ },
+ {
+ "epoch": 7.857142857142857,
+ "grad_norm": 0.679374098777771,
+ "learning_rate": 0.00018222857142857142,
+ "loss": 0.1451,
+ "step": 1375
+ },
+ {
+ "epoch": 7.885714285714286,
+ "grad_norm": 2.1234302520751953,
+ "learning_rate": 0.00018179999999999997,
+ "loss": 0.1265,
+ "step": 1380
+ },
+ {
+ "epoch": 7.914285714285715,
+ "grad_norm": 1.006521224975586,
+ "learning_rate": 0.00018137142857142856,
+ "loss": 0.1722,
+ "step": 1385
+ },
+ {
+ "epoch": 7.942857142857143,
+ "grad_norm": 0.7275253534317017,
+ "learning_rate": 0.00018094285714285712,
+ "loss": 0.1625,
+ "step": 1390
+ },
+ {
+ "epoch": 7.9714285714285715,
+ "grad_norm": 0.8612022995948792,
+ "learning_rate": 0.0001805142857142857,
+ "loss": 0.1345,
+ "step": 1395
+ },
+ {
+ "epoch": 8.0,
+ "grad_norm": 0.7276798486709595,
+ "learning_rate": 0.00018008571428571428,
+ "loss": 0.1236,
+ "step": 1400
+ },
+ {
+ "epoch": 8.028571428571428,
+ "grad_norm": 0.8731086850166321,
+ "learning_rate": 0.00017965714285714284,
+ "loss": 0.1604,
+ "step": 1405
+ },
+ {
+ "epoch": 8.057142857142857,
+ "grad_norm": 0.8950818777084351,
+ "learning_rate": 0.0001792285714285714,
+ "loss": 0.1531,
+ "step": 1410
+ },
+ {
+ "epoch": 8.085714285714285,
+ "grad_norm": 0.7399356365203857,
+ "learning_rate": 0.00017879999999999998,
+ "loss": 0.1508,
+ "step": 1415
+ },
+ {
+ "epoch": 8.114285714285714,
+ "grad_norm": 1.3727307319641113,
+ "learning_rate": 0.00017837142857142854,
+ "loss": 0.1487,
+ "step": 1420
+ },
+ {
+ "epoch": 8.142857142857142,
+ "grad_norm": 0.5938125848770142,
+ "learning_rate": 0.00017794285714285715,
+ "loss": 0.1303,
+ "step": 1425
+ },
+ {
+ "epoch": 8.17142857142857,
+ "grad_norm": 0.7043821811676025,
+ "learning_rate": 0.0001775142857142857,
+ "loss": 0.0948,
+ "step": 1430
+ },
+ {
+ "epoch": 8.2,
+ "grad_norm": 1.1062767505645752,
+ "learning_rate": 0.00017708571428571426,
+ "loss": 0.1412,
+ "step": 1435
+ },
+ {
+ "epoch": 8.228571428571428,
+ "grad_norm": 0.844832181930542,
+ "learning_rate": 0.00017665714285714285,
+ "loss": 0.113,
+ "step": 1440
+ },
+ {
+ "epoch": 8.257142857142856,
+ "grad_norm": 0.7564154863357544,
+ "learning_rate": 0.0001762285714285714,
+ "loss": 0.1319,
+ "step": 1445
+ },
+ {
+ "epoch": 8.285714285714286,
+ "grad_norm": 0.8843110203742981,
+ "learning_rate": 0.00017579999999999996,
+ "loss": 0.1206,
+ "step": 1450
+ },
+ {
+ "epoch": 8.314285714285715,
+ "grad_norm": 0.8175828456878662,
+ "learning_rate": 0.00017537142857142855,
+ "loss": 0.1327,
+ "step": 1455
+ },
+ {
+ "epoch": 8.342857142857143,
+ "grad_norm": 0.6443565487861633,
+ "learning_rate": 0.00017494285714285713,
+ "loss": 0.1239,
+ "step": 1460
+ },
+ {
+ "epoch": 8.371428571428572,
+ "grad_norm": 0.7237185835838318,
+ "learning_rate": 0.00017451428571428572,
+ "loss": 0.1639,
+ "step": 1465
+ },
+ {
+ "epoch": 8.4,
+ "grad_norm": 0.6118057370185852,
+ "learning_rate": 0.00017408571428571427,
+ "loss": 0.1363,
+ "step": 1470
+ },
+ {
+ "epoch": 8.428571428571429,
+ "grad_norm": 0.6754649877548218,
+ "learning_rate": 0.00017365714285714283,
+ "loss": 0.1187,
+ "step": 1475
+ },
+ {
+ "epoch": 8.457142857142857,
+ "grad_norm": 1.0067390203475952,
+ "learning_rate": 0.00017322857142857141,
+ "loss": 0.1401,
+ "step": 1480
+ },
+ {
+ "epoch": 8.485714285714286,
+ "grad_norm": 8.509544372558594,
+ "learning_rate": 0.00017279999999999997,
+ "loss": 0.1304,
+ "step": 1485
+ },
+ {
+ "epoch": 8.514285714285714,
+ "grad_norm": 4.2030205726623535,
+ "learning_rate": 0.00017237142857142858,
+ "loss": 0.121,
+ "step": 1490
+ },
+ {
+ "epoch": 8.542857142857143,
+ "grad_norm": 4.877438068389893,
+ "learning_rate": 0.00017194285714285714,
+ "loss": 0.1918,
+ "step": 1495
+ },
+ {
+ "epoch": 8.571428571428571,
+ "grad_norm": 6.4971232414245605,
+ "learning_rate": 0.0001715142857142857,
+ "loss": 0.2154,
+ "step": 1500
+ },
+ {
+ "epoch": 8.6,
+ "grad_norm": 4.365469932556152,
+ "learning_rate": 0.00017108571428571428,
+ "loss": 0.2272,
+ "step": 1505
+ },
+ {
+ "epoch": 8.628571428571428,
+ "grad_norm": 2.551957845687866,
+ "learning_rate": 0.00017065714285714284,
+ "loss": 0.2163,
+ "step": 1510
+ },
+ {
+ "epoch": 8.657142857142857,
+ "grad_norm": 5.326391220092773,
+ "learning_rate": 0.0001702285714285714,
+ "loss": 0.1612,
+ "step": 1515
+ },
+ {
+ "epoch": 8.685714285714285,
+ "grad_norm": 1.3528404235839844,
+ "learning_rate": 0.00016979999999999998,
+ "loss": 0.1636,
+ "step": 1520
+ },
+ {
+ "epoch": 8.714285714285714,
+ "grad_norm": 1.4466065168380737,
+ "learning_rate": 0.00016937142857142856,
+ "loss": 0.1295,
+ "step": 1525
+ },
+ {
+ "epoch": 8.742857142857144,
+ "grad_norm": 0.6576040387153625,
+ "learning_rate": 0.00016894285714285715,
+ "loss": 0.1318,
+ "step": 1530
+ },
+ {
+ "epoch": 8.771428571428572,
+ "grad_norm": 1.286942958831787,
+ "learning_rate": 0.0001685142857142857,
+ "loss": 0.1443,
+ "step": 1535
+ },
+ {
+ "epoch": 8.8,
+ "grad_norm": 9.474458694458008,
+ "learning_rate": 0.00016808571428571426,
+ "loss": 0.1313,
+ "step": 1540
+ },
+ {
+ "epoch": 8.82857142857143,
+ "grad_norm": 2.6731069087982178,
+ "learning_rate": 0.00016765714285714285,
+ "loss": 0.1485,
+ "step": 1545
+ },
+ {
+ "epoch": 8.857142857142858,
+ "grad_norm": 1.313723087310791,
+ "learning_rate": 0.0001672285714285714,
+ "loss": 0.1346,
+ "step": 1550
+ },
+ {
+ "epoch": 8.885714285714286,
+ "grad_norm": 1.7115576267242432,
+ "learning_rate": 0.0001668,
+ "loss": 0.1471,
+ "step": 1555
+ },
+ {
+ "epoch": 8.914285714285715,
+ "grad_norm": 1.2599923610687256,
+ "learning_rate": 0.00016637142857142857,
+ "loss": 0.1433,
+ "step": 1560
+ },
+ {
+ "epoch": 8.942857142857143,
+ "grad_norm": 0.9659029245376587,
+ "learning_rate": 0.00016594285714285713,
+ "loss": 0.1256,
+ "step": 1565
+ },
+ {
+ "epoch": 8.971428571428572,
+ "grad_norm": 1.1282744407653809,
+ "learning_rate": 0.0001655142857142857,
+ "loss": 0.1373,
+ "step": 1570
+ },
+ {
+ "epoch": 9.0,
+ "grad_norm": 3.20717453956604,
+ "learning_rate": 0.00016508571428571427,
+ "loss": 0.1355,
+ "step": 1575
+ },
+ {
+ "epoch": 9.028571428571428,
+ "grad_norm": 0.8310821056365967,
+ "learning_rate": 0.00016465714285714283,
+ "loss": 0.1268,
+ "step": 1580
+ },
+ {
+ "epoch": 9.057142857142857,
+ "grad_norm": 1.5337790250778198,
+ "learning_rate": 0.00016422857142857139,
+ "loss": 0.1267,
+ "step": 1585
+ },
+ {
+ "epoch": 9.085714285714285,
+ "grad_norm": 2.6406068801879883,
+ "learning_rate": 0.0001638,
+ "loss": 0.1363,
+ "step": 1590
+ },
+ {
+ "epoch": 9.114285714285714,
+ "grad_norm": 0.7705873847007751,
+ "learning_rate": 0.00016337142857142855,
+ "loss": 0.1291,
+ "step": 1595
+ },
+ {
+ "epoch": 9.142857142857142,
+ "grad_norm": 0.7092650532722473,
+ "learning_rate": 0.00016294285714285714,
+ "loss": 0.1435,
+ "step": 1600
+ },
+ {
+ "epoch": 9.17142857142857,
+ "grad_norm": 1.098961591720581,
+ "learning_rate": 0.0001625142857142857,
+ "loss": 0.1471,
+ "step": 1605
+ },
+ {
+ "epoch": 9.2,
+ "grad_norm": 0.6994885206222534,
+ "learning_rate": 0.00016208571428571425,
+ "loss": 0.1345,
+ "step": 1610
+ },
+ {
+ "epoch": 9.228571428571428,
+ "grad_norm": 0.9613476991653442,
+ "learning_rate": 0.00016165714285714284,
+ "loss": 0.1399,
+ "step": 1615
+ },
+ {
+ "epoch": 9.257142857142856,
+ "grad_norm": 0.675588846206665,
+ "learning_rate": 0.00016122857142857142,
+ "loss": 0.1319,
+ "step": 1620
+ },
+ {
+ "epoch": 9.285714285714286,
+ "grad_norm": 0.7519372701644897,
+ "learning_rate": 0.0001608,
+ "loss": 0.137,
+ "step": 1625
+ },
+ {
+ "epoch": 9.314285714285715,
+ "grad_norm": 1.135025978088379,
+ "learning_rate": 0.00016037142857142856,
+ "loss": 0.1322,
+ "step": 1630
+ },
+ {
+ "epoch": 9.342857142857143,
+ "grad_norm": 0.7462936639785767,
+ "learning_rate": 0.00015994285714285712,
+ "loss": 0.1215,
+ "step": 1635
+ },
+ {
+ "epoch": 9.371428571428572,
+ "grad_norm": 0.9042088985443115,
+ "learning_rate": 0.0001595142857142857,
+ "loss": 0.1191,
+ "step": 1640
+ },
+ {
+ "epoch": 9.4,
+ "grad_norm": 0.567828893661499,
+ "learning_rate": 0.00015908571428571426,
+ "loss": 0.1189,
+ "step": 1645
+ },
+ {
+ "epoch": 9.428571428571429,
+ "grad_norm": 0.981585681438446,
+ "learning_rate": 0.00015865714285714282,
+ "loss": 0.128,
+ "step": 1650
+ },
+ {
+ "epoch": 9.457142857142857,
+ "grad_norm": 1.24985933303833,
+ "learning_rate": 0.00015822857142857143,
+ "loss": 0.1315,
+ "step": 1655
+ },
+ {
+ "epoch": 9.485714285714286,
+ "grad_norm": 0.6517993211746216,
+ "learning_rate": 0.0001578,
+ "loss": 0.1076,
+ "step": 1660
+ },
+ {
+ "epoch": 9.514285714285714,
+ "grad_norm": 1.166628122329712,
+ "learning_rate": 0.00015737142857142857,
+ "loss": 0.1345,
+ "step": 1665
+ },
+ {
+ "epoch": 9.542857142857143,
+ "grad_norm": 0.9763592481613159,
+ "learning_rate": 0.00015694285714285713,
+ "loss": 0.1449,
+ "step": 1670
+ },
+ {
+ "epoch": 9.571428571428571,
+ "grad_norm": 0.7829060554504395,
+ "learning_rate": 0.00015651428571428569,
+ "loss": 0.1117,
+ "step": 1675
+ },
+ {
+ "epoch": 9.6,
+ "grad_norm": 0.6693719029426575,
+ "learning_rate": 0.00015608571428571427,
+ "loss": 0.1129,
+ "step": 1680
+ },
+ {
+ "epoch": 9.628571428571428,
+ "grad_norm": 1.2122846841812134,
+ "learning_rate": 0.00015565714285714285,
+ "loss": 0.1125,
+ "step": 1685
+ },
+ {
+ "epoch": 9.657142857142857,
+ "grad_norm": 1.0689371824264526,
+ "learning_rate": 0.0001552285714285714,
+ "loss": 0.1478,
+ "step": 1690
+ },
+ {
+ "epoch": 9.685714285714285,
+ "grad_norm": 1.8511656522750854,
+ "learning_rate": 0.0001548,
+ "loss": 0.1431,
+ "step": 1695
+ },
+ {
+ "epoch": 9.714285714285714,
+ "grad_norm": 0.6706506609916687,
+ "learning_rate": 0.00015437142857142855,
+ "loss": 0.1262,
+ "step": 1700
+ },
+ {
+ "epoch": 9.742857142857144,
+ "grad_norm": 1.0798784494400024,
+ "learning_rate": 0.00015394285714285714,
+ "loss": 0.1275,
+ "step": 1705
+ },
+ {
+ "epoch": 9.771428571428572,
+ "grad_norm": 0.7915983200073242,
+ "learning_rate": 0.0001535142857142857,
+ "loss": 0.1316,
+ "step": 1710
+ },
+ {
+ "epoch": 9.8,
+ "grad_norm": 1.8630567789077759,
+ "learning_rate": 0.00015308571428571425,
+ "loss": 0.1258,
+ "step": 1715
+ },
+ {
+ "epoch": 9.82857142857143,
+ "grad_norm": 0.7807756662368774,
+ "learning_rate": 0.00015265714285714286,
+ "loss": 0.1079,
+ "step": 1720
+ },
+ {
+ "epoch": 9.857142857142858,
+ "grad_norm": 1.4698439836502075,
+ "learning_rate": 0.00015222857142857142,
+ "loss": 0.1357,
+ "step": 1725
+ },
+ {
+ "epoch": 9.885714285714286,
+ "grad_norm": 1.2121926546096802,
+ "learning_rate": 0.00015179999999999998,
+ "loss": 0.1322,
+ "step": 1730
+ },
+ {
+ "epoch": 9.914285714285715,
+ "grad_norm": 0.6348568201065063,
+ "learning_rate": 0.00015137142857142856,
+ "loss": 0.0893,
+ "step": 1735
+ },
+ {
+ "epoch": 9.942857142857143,
+ "grad_norm": 0.6694422364234924,
+ "learning_rate": 0.00015094285714285712,
+ "loss": 0.1189,
+ "step": 1740
+ },
+ {
+ "epoch": 9.971428571428572,
+ "grad_norm": 0.569332480430603,
+ "learning_rate": 0.00015051428571428567,
+ "loss": 0.1349,
+ "step": 1745
+ },
+ {
+ "epoch": 10.0,
+ "grad_norm": 0.934073269367218,
+ "learning_rate": 0.00015008571428571429,
+ "loss": 0.1237,
+ "step": 1750
+ },
+ {
+ "epoch": 10.028571428571428,
+ "grad_norm": 0.7191672325134277,
+ "learning_rate": 0.00014965714285714284,
+ "loss": 0.1308,
+ "step": 1755
+ },
+ {
+ "epoch": 10.057142857142857,
+ "grad_norm": 0.7006493806838989,
+ "learning_rate": 0.00014922857142857143,
+ "loss": 0.104,
+ "step": 1760
+ },
+ {
+ "epoch": 10.085714285714285,
+ "grad_norm": 0.9030678272247314,
+ "learning_rate": 0.00014879999999999998,
+ "loss": 0.1308,
+ "step": 1765
+ },
+ {
+ "epoch": 10.114285714285714,
+ "grad_norm": 0.7007766366004944,
+ "learning_rate": 0.00014837142857142854,
+ "loss": 0.1044,
+ "step": 1770
+ },
+ {
+ "epoch": 10.142857142857142,
+ "grad_norm": 0.4832770824432373,
+ "learning_rate": 0.00014794285714285713,
+ "loss": 0.1119,
+ "step": 1775
+ },
+ {
+ "epoch": 10.17142857142857,
+ "grad_norm": 0.7819458842277527,
+ "learning_rate": 0.0001475142857142857,
+ "loss": 0.1087,
+ "step": 1780
+ },
+ {
+ "epoch": 10.2,
+ "grad_norm": 1.0223525762557983,
+ "learning_rate": 0.00014708571428571427,
+ "loss": 0.1314,
+ "step": 1785
+ },
+ {
+ "epoch": 10.228571428571428,
+ "grad_norm": 0.6224566698074341,
+ "learning_rate": 0.00014665714285714285,
+ "loss": 0.1159,
+ "step": 1790
+ },
+ {
+ "epoch": 10.257142857142856,
+ "grad_norm": 0.45800235867500305,
+ "learning_rate": 0.0001462285714285714,
+ "loss": 0.0942,
+ "step": 1795
+ },
+ {
+ "epoch": 10.285714285714286,
+ "grad_norm": 0.6258400082588196,
+ "learning_rate": 0.0001458,
+ "loss": 0.1079,
+ "step": 1800
+ },
+ {
+ "epoch": 10.314285714285715,
+ "grad_norm": 1.1812794208526611,
+ "learning_rate": 0.00014537142857142858,
+ "loss": 0.1378,
+ "step": 1805
+ },
+ {
+ "epoch": 10.342857142857143,
+ "grad_norm": 0.8541269898414612,
+ "learning_rate": 0.00014494285714285713,
+ "loss": 0.1274,
+ "step": 1810
+ },
+ {
+ "epoch": 10.371428571428572,
+ "grad_norm": 0.7131860256195068,
+ "learning_rate": 0.0001445142857142857,
+ "loss": 0.1247,
+ "step": 1815
+ },
+ {
+ "epoch": 10.4,
+ "grad_norm": 0.6109820008277893,
+ "learning_rate": 0.00014408571428571428,
+ "loss": 0.1246,
+ "step": 1820
+ },
+ {
+ "epoch": 10.428571428571429,
+ "grad_norm": 0.5621510744094849,
+ "learning_rate": 0.00014365714285714286,
+ "loss": 0.1039,
+ "step": 1825
+ },
+ {
+ "epoch": 10.457142857142857,
+ "grad_norm": 1.022777795791626,
+ "learning_rate": 0.00014322857142857142,
+ "loss": 0.1206,
+ "step": 1830
+ },
+ {
+ "epoch": 10.485714285714286,
+ "grad_norm": 0.9120668768882751,
+ "learning_rate": 0.00014279999999999997,
+ "loss": 0.1289,
+ "step": 1835
+ },
+ {
+ "epoch": 10.514285714285714,
+ "grad_norm": 1.1882030963897705,
+ "learning_rate": 0.00014237142857142856,
+ "loss": 0.1194,
+ "step": 1840
+ },
+ {
+ "epoch": 10.542857142857143,
+ "grad_norm": 0.6078401207923889,
+ "learning_rate": 0.00014194285714285714,
+ "loss": 0.1339,
+ "step": 1845
+ },
+ {
+ "epoch": 10.571428571428571,
+ "grad_norm": 0.7380999326705933,
+ "learning_rate": 0.0001415142857142857,
+ "loss": 0.1318,
+ "step": 1850
+ },
+ {
+ "epoch": 10.6,
+ "grad_norm": 0.5884959101676941,
+ "learning_rate": 0.00014108571428571428,
+ "loss": 0.1249,
+ "step": 1855
+ },
+ {
+ "epoch": 10.628571428571428,
+ "grad_norm": 1.0121936798095703,
+ "learning_rate": 0.00014065714285714284,
+ "loss": 0.1137,
+ "step": 1860
+ },
+ {
+ "epoch": 10.657142857142857,
+ "grad_norm": 0.6444916129112244,
+ "learning_rate": 0.00014022857142857143,
+ "loss": 0.1213,
+ "step": 1865
+ },
+ {
+ "epoch": 10.685714285714285,
+ "grad_norm": 0.7931004762649536,
+ "learning_rate": 0.00013979999999999998,
+ "loss": 0.1318,
+ "step": 1870
+ },
+ {
+ "epoch": 10.714285714285714,
+ "grad_norm": 0.5596404075622559,
+ "learning_rate": 0.00013937142857142857,
+ "loss": 0.1075,
+ "step": 1875
+ },
+ {
+ "epoch": 10.742857142857144,
+ "grad_norm": 0.6586474180221558,
+ "learning_rate": 0.00013894285714285712,
+ "loss": 0.13,
+ "step": 1880
+ },
+ {
+ "epoch": 10.771428571428572,
+ "grad_norm": 1.0195013284683228,
+ "learning_rate": 0.00013851428571428568,
+ "loss": 0.1373,
+ "step": 1885
+ },
+ {
+ "epoch": 10.8,
+ "grad_norm": 0.9233512878417969,
+ "learning_rate": 0.00013808571428571427,
+ "loss": 0.1168,
+ "step": 1890
+ },
+ {
+ "epoch": 10.82857142857143,
+ "grad_norm": 0.7154092788696289,
+ "learning_rate": 0.00013765714285714285,
+ "loss": 0.1081,
+ "step": 1895
+ },
+ {
+ "epoch": 10.857142857142858,
+ "grad_norm": 1.4588117599487305,
+ "learning_rate": 0.0001372285714285714,
+ "loss": 0.1061,
+ "step": 1900
+ },
+ {
+ "epoch": 10.885714285714286,
+ "grad_norm": 0.6087035536766052,
+ "learning_rate": 0.0001368,
+ "loss": 0.1157,
+ "step": 1905
+ },
+ {
+ "epoch": 10.914285714285715,
+ "grad_norm": 0.7371247410774231,
+ "learning_rate": 0.00013637142857142855,
+ "loss": 0.1339,
+ "step": 1910
+ },
+ {
+ "epoch": 10.942857142857143,
+ "grad_norm": 0.8253212571144104,
+ "learning_rate": 0.00013594285714285713,
+ "loss": 0.1198,
+ "step": 1915
+ },
+ {
+ "epoch": 10.971428571428572,
+ "grad_norm": 0.6889544129371643,
+ "learning_rate": 0.00013551428571428572,
+ "loss": 0.1131,
+ "step": 1920
+ },
+ {
+ "epoch": 11.0,
+ "grad_norm": 0.6408224105834961,
+ "learning_rate": 0.00013508571428571427,
+ "loss": 0.122,
+ "step": 1925
+ },
+ {
+ "epoch": 11.028571428571428,
+ "grad_norm": 0.6771185398101807,
+ "learning_rate": 0.00013465714285714283,
+ "loss": 0.1492,
+ "step": 1930
+ },
+ {
+ "epoch": 11.057142857142857,
+ "grad_norm": 0.8706450462341309,
+ "learning_rate": 0.00013422857142857142,
+ "loss": 0.1294,
+ "step": 1935
+ },
+ {
+ "epoch": 11.085714285714285,
+ "grad_norm": 1.730648398399353,
+ "learning_rate": 0.0001338,
+ "loss": 0.1004,
+ "step": 1940
+ },
+ {
+ "epoch": 11.114285714285714,
+ "grad_norm": 0.6985113620758057,
+ "learning_rate": 0.00013337142857142856,
+ "loss": 0.0995,
+ "step": 1945
+ },
+ {
+ "epoch": 11.142857142857142,
+ "grad_norm": 0.8901951313018799,
+ "learning_rate": 0.00013294285714285711,
+ "loss": 0.1179,
+ "step": 1950
+ },
+ {
+ "epoch": 11.17142857142857,
+ "grad_norm": 0.7232164144515991,
+ "learning_rate": 0.0001325142857142857,
+ "loss": 0.1397,
+ "step": 1955
+ },
+ {
+ "epoch": 11.2,
+ "grad_norm": 0.6447544693946838,
+ "learning_rate": 0.00013208571428571428,
+ "loss": 0.1366,
+ "step": 1960
+ },
+ {
+ "epoch": 11.228571428571428,
+ "grad_norm": 0.7964944243431091,
+ "learning_rate": 0.00013165714285714284,
+ "loss": 0.1121,
+ "step": 1965
+ },
+ {
+ "epoch": 11.257142857142856,
+ "grad_norm": 0.9012628793716431,
+ "learning_rate": 0.00013122857142857142,
+ "loss": 0.1131,
+ "step": 1970
+ },
+ {
+ "epoch": 11.285714285714286,
+ "grad_norm": 0.9295369982719421,
+ "learning_rate": 0.00013079999999999998,
+ "loss": 0.1232,
+ "step": 1975
+ },
+ {
+ "epoch": 11.314285714285715,
+ "grad_norm": 0.6237708926200867,
+ "learning_rate": 0.00013037142857142857,
+ "loss": 0.1066,
+ "step": 1980
+ },
+ {
+ "epoch": 11.342857142857143,
+ "grad_norm": 0.5250967741012573,
+ "learning_rate": 0.00012994285714285715,
+ "loss": 0.118,
+ "step": 1985
+ },
+ {
+ "epoch": 11.371428571428572,
+ "grad_norm": 1.0013964176177979,
+ "learning_rate": 0.0001295142857142857,
+ "loss": 0.1125,
+ "step": 1990
+ },
+ {
+ "epoch": 11.4,
+ "grad_norm": 0.6721311807632446,
+ "learning_rate": 0.00012908571428571426,
+ "loss": 0.1196,
+ "step": 1995
+ },
+ {
+ "epoch": 11.428571428571429,
+ "grad_norm": 0.6966421008110046,
+ "learning_rate": 0.00012865714285714285,
+ "loss": 0.1172,
+ "step": 2000
+ },
+ {
+ "epoch": 11.457142857142857,
+ "grad_norm": 0.8811460733413696,
+ "learning_rate": 0.00012822857142857143,
+ "loss": 0.135,
+ "step": 2005
+ },
+ {
+ "epoch": 11.485714285714286,
+ "grad_norm": 0.8829531073570251,
+ "learning_rate": 0.0001278,
+ "loss": 0.1288,
+ "step": 2010
+ },
+ {
+ "epoch": 11.514285714285714,
+ "grad_norm": 0.7530654668807983,
+ "learning_rate": 0.00012737142857142855,
+ "loss": 0.1073,
+ "step": 2015
+ },
+ {
+ "epoch": 11.542857142857143,
+ "grad_norm": 0.513940691947937,
+ "learning_rate": 0.00012694285714285713,
+ "loss": 0.121,
+ "step": 2020
+ },
+ {
+ "epoch": 11.571428571428571,
+ "grad_norm": 0.8574968576431274,
+ "learning_rate": 0.0001265142857142857,
+ "loss": 0.1103,
+ "step": 2025
+ },
+ {
+ "epoch": 11.6,
+ "grad_norm": 0.7482439875602722,
+ "learning_rate": 0.00012608571428571427,
+ "loss": 0.1027,
+ "step": 2030
+ },
+ {
+ "epoch": 11.628571428571428,
+ "grad_norm": 0.8367976546287537,
+ "learning_rate": 0.00012565714285714286,
+ "loss": 0.1181,
+ "step": 2035
+ },
+ {
+ "epoch": 11.657142857142857,
+ "grad_norm": 2.048128366470337,
+ "learning_rate": 0.0001252285714285714,
+ "loss": 0.1122,
+ "step": 2040
+ },
+ {
+ "epoch": 11.685714285714285,
+ "grad_norm": 0.7426862716674805,
+ "learning_rate": 0.00012479999999999997,
+ "loss": 0.1169,
+ "step": 2045
+ },
+ {
+ "epoch": 11.714285714285714,
+ "grad_norm": 3.093841791152954,
+ "learning_rate": 0.00012437142857142855,
+ "loss": 0.1164,
+ "step": 2050
+ },
+ {
+ "epoch": 11.742857142857144,
+ "grad_norm": 0.8172643184661865,
+ "learning_rate": 0.00012394285714285714,
+ "loss": 0.1354,
+ "step": 2055
+ },
+ {
+ "epoch": 11.771428571428572,
+ "grad_norm": 1.9950591325759888,
+ "learning_rate": 0.0001235142857142857,
+ "loss": 0.1037,
+ "step": 2060
+ },
+ {
+ "epoch": 11.8,
+ "grad_norm": 0.5929077863693237,
+ "learning_rate": 0.00012308571428571428,
+ "loss": 0.1194,
+ "step": 2065
+ },
+ {
+ "epoch": 11.82857142857143,
+ "grad_norm": 1.293624997138977,
+ "learning_rate": 0.00012265714285714284,
+ "loss": 0.12,
+ "step": 2070
+ },
+ {
+ "epoch": 11.857142857142858,
+ "grad_norm": 1.0515168905258179,
+ "learning_rate": 0.00012222857142857142,
+ "loss": 0.1049,
+ "step": 2075
+ },
+ {
+ "epoch": 11.885714285714286,
+ "grad_norm": 1.2874428033828735,
+ "learning_rate": 0.00012179999999999999,
+ "loss": 0.115,
+ "step": 2080
+ },
+ {
+ "epoch": 11.914285714285715,
+ "grad_norm": 0.7317278385162354,
+ "learning_rate": 0.00012137142857142856,
+ "loss": 0.1184,
+ "step": 2085
+ },
+ {
+ "epoch": 11.942857142857143,
+ "grad_norm": 1.3407148122787476,
+ "learning_rate": 0.00012094285714285713,
+ "loss": 0.132,
+ "step": 2090
+ },
+ {
+ "epoch": 11.971428571428572,
+ "grad_norm": 2.656409502029419,
+ "learning_rate": 0.00012051428571428569,
+ "loss": 0.1359,
+ "step": 2095
+ },
+ {
+ "epoch": 12.0,
+ "grad_norm": 0.7189064025878906,
+ "learning_rate": 0.00012008571428571428,
+ "loss": 0.1217,
+ "step": 2100
+ },
+ {
+ "epoch": 12.028571428571428,
+ "grad_norm": 0.7510334849357605,
+ "learning_rate": 0.00011965714285714285,
+ "loss": 0.109,
+ "step": 2105
+ },
+ {
+ "epoch": 12.057142857142857,
+ "grad_norm": 0.7235113382339478,
+ "learning_rate": 0.00011922857142857142,
+ "loss": 0.1114,
+ "step": 2110
+ },
+ {
+ "epoch": 12.085714285714285,
+ "grad_norm": 1.7435882091522217,
+ "learning_rate": 0.0001188,
+ "loss": 0.1357,
+ "step": 2115
+ },
+ {
+ "epoch": 12.114285714285714,
+ "grad_norm": 1.170392632484436,
+ "learning_rate": 0.00011837142857142856,
+ "loss": 0.1255,
+ "step": 2120
+ },
+ {
+ "epoch": 12.142857142857142,
+ "grad_norm": 0.6476783752441406,
+ "learning_rate": 0.00011794285714285713,
+ "loss": 0.1108,
+ "step": 2125
+ },
+ {
+ "epoch": 12.17142857142857,
+ "grad_norm": 0.8599929213523865,
+ "learning_rate": 0.00011751428571428571,
+ "loss": 0.0997,
+ "step": 2130
+ },
+ {
+ "epoch": 12.2,
+ "grad_norm": 0.8918687105178833,
+ "learning_rate": 0.00011708571428571428,
+ "loss": 0.1149,
+ "step": 2135
+ },
+ {
+ "epoch": 12.228571428571428,
+ "grad_norm": 1.609435796737671,
+ "learning_rate": 0.00011665714285714284,
+ "loss": 0.1136,
+ "step": 2140
+ },
+ {
+ "epoch": 12.257142857142856,
+ "grad_norm": 0.6206801533699036,
+ "learning_rate": 0.00011622857142857143,
+ "loss": 0.1135,
+ "step": 2145
+ },
+ {
+ "epoch": 12.285714285714286,
+ "grad_norm": 0.8769077658653259,
+ "learning_rate": 0.0001158,
+ "loss": 0.1344,
+ "step": 2150
+ },
+ {
+ "epoch": 12.314285714285715,
+ "grad_norm": 0.6279401183128357,
+ "learning_rate": 0.00011537142857142855,
+ "loss": 0.1049,
+ "step": 2155
+ },
+ {
+ "epoch": 12.342857142857143,
+ "grad_norm": 1.1110137701034546,
+ "learning_rate": 0.00011494285714285712,
+ "loss": 0.1146,
+ "step": 2160
+ },
+ {
+ "epoch": 12.371428571428572,
+ "grad_norm": 0.7911233901977539,
+ "learning_rate": 0.00011451428571428571,
+ "loss": 0.1257,
+ "step": 2165
+ },
+ {
+ "epoch": 12.4,
+ "grad_norm": 0.9691207408905029,
+ "learning_rate": 0.00011408571428571428,
+ "loss": 0.1226,
+ "step": 2170
+ },
+ {
+ "epoch": 12.428571428571429,
+ "grad_norm": 0.6168835759162903,
+ "learning_rate": 0.00011365714285714284,
+ "loss": 0.1271,
+ "step": 2175
+ },
+ {
+ "epoch": 12.457142857142857,
+ "grad_norm": 0.6143497228622437,
+ "learning_rate": 0.00011322857142857142,
+ "loss": 0.111,
+ "step": 2180
+ },
+ {
+ "epoch": 12.485714285714286,
+ "grad_norm": 1.5673450231552124,
+ "learning_rate": 0.00011279999999999999,
+ "loss": 0.1186,
+ "step": 2185
+ },
+ {
+ "epoch": 12.514285714285714,
+ "grad_norm": 1.298756718635559,
+ "learning_rate": 0.00011237142857142856,
+ "loss": 0.1024,
+ "step": 2190
+ },
+ {
+ "epoch": 12.542857142857143,
+ "grad_norm": 0.9484918117523193,
+ "learning_rate": 0.00011194285714285715,
+ "loss": 0.1171,
+ "step": 2195
+ },
+ {
+ "epoch": 12.571428571428571,
+ "grad_norm": 0.725705623626709,
+ "learning_rate": 0.0001115142857142857,
+ "loss": 0.1216,
+ "step": 2200
+ },
+ {
+ "epoch": 12.6,
+ "grad_norm": 1.1394798755645752,
+ "learning_rate": 0.00011108571428571427,
+ "loss": 0.1132,
+ "step": 2205
+ },
+ {
+ "epoch": 12.628571428571428,
+ "grad_norm": 0.9548712968826294,
+ "learning_rate": 0.00011065714285714286,
+ "loss": 0.1209,
+ "step": 2210
+ },
+ {
+ "epoch": 12.657142857142857,
+ "grad_norm": 0.6173953413963318,
+ "learning_rate": 0.00011022857142857143,
+ "loss": 0.1049,
+ "step": 2215
+ },
+ {
+ "epoch": 12.685714285714285,
+ "grad_norm": 0.8227205872535706,
+ "learning_rate": 0.00010979999999999999,
+ "loss": 0.1045,
+ "step": 2220
+ },
+ {
+ "epoch": 12.714285714285714,
+ "grad_norm": 0.7252780795097351,
+ "learning_rate": 0.00010937142857142856,
+ "loss": 0.1146,
+ "step": 2225
+ },
+ {
+ "epoch": 12.742857142857144,
+ "grad_norm": 0.9374399781227112,
+ "learning_rate": 0.00010894285714285714,
+ "loss": 0.1478,
+ "step": 2230
+ },
+ {
+ "epoch": 12.771428571428572,
+ "grad_norm": 5.1985368728637695,
+ "learning_rate": 0.0001085142857142857,
+ "loss": 0.1059,
+ "step": 2235
+ },
+ {
+ "epoch": 12.8,
+ "grad_norm": 0.9629620909690857,
+ "learning_rate": 0.00010808571428571427,
+ "loss": 0.124,
+ "step": 2240
+ },
+ {
+ "epoch": 12.82857142857143,
+ "grad_norm": 0.7022290229797363,
+ "learning_rate": 0.00010765714285714285,
+ "loss": 0.1309,
+ "step": 2245
+ },
+ {
+ "epoch": 12.857142857142858,
+ "grad_norm": 0.574188232421875,
+ "learning_rate": 0.00010722857142857142,
+ "loss": 0.086,
+ "step": 2250
+ },
+ {
+ "epoch": 12.885714285714286,
+ "grad_norm": 0.9712439179420471,
+ "learning_rate": 0.00010679999999999998,
+ "loss": 0.1152,
+ "step": 2255
+ },
+ {
+ "epoch": 12.914285714285715,
+ "grad_norm": 0.6562150120735168,
+ "learning_rate": 0.00010637142857142856,
+ "loss": 0.1343,
+ "step": 2260
+ },
+ {
+ "epoch": 12.942857142857143,
+ "grad_norm": 0.6936819553375244,
+ "learning_rate": 0.00010594285714285714,
+ "loss": 0.1009,
+ "step": 2265
+ },
+ {
+ "epoch": 12.971428571428572,
+ "grad_norm": 0.8664882779121399,
+ "learning_rate": 0.0001055142857142857,
+ "loss": 0.1164,
+ "step": 2270
+ },
+ {
+ "epoch": 13.0,
+ "grad_norm": 0.9224509000778198,
+ "learning_rate": 0.00010508571428571429,
+ "loss": 0.1347,
+ "step": 2275
+ },
+ {
+ "epoch": 13.028571428571428,
+ "grad_norm": 0.6596968770027161,
+ "learning_rate": 0.00010465714285714285,
+ "loss": 0.1041,
+ "step": 2280
+ },
+ {
+ "epoch": 13.057142857142857,
+ "grad_norm": 0.6456631422042847,
+ "learning_rate": 0.00010422857142857142,
+ "loss": 0.1142,
+ "step": 2285
+ },
+ {
+ "epoch": 13.085714285714285,
+ "grad_norm": 0.9466612339019775,
+ "learning_rate": 0.00010379999999999999,
+ "loss": 0.1191,
+ "step": 2290
+ },
+ {
+ "epoch": 13.114285714285714,
+ "grad_norm": 0.9036727547645569,
+ "learning_rate": 0.00010337142857142856,
+ "loss": 0.121,
+ "step": 2295
+ },
+ {
+ "epoch": 13.142857142857142,
+ "grad_norm": 1.08086359500885,
+ "learning_rate": 0.00010294285714285713,
+ "loss": 0.1313,
+ "step": 2300
+ },
+ {
+ "epoch": 13.17142857142857,
+ "grad_norm": 0.703241765499115,
+ "learning_rate": 0.0001025142857142857,
+ "loss": 0.1151,
+ "step": 2305
+ },
+ {
+ "epoch": 13.2,
+ "grad_norm": 0.7901896238327026,
+ "learning_rate": 0.00010208571428571429,
+ "loss": 0.1275,
+ "step": 2310
+ },
+ {
+ "epoch": 13.228571428571428,
+ "grad_norm": 0.703542947769165,
+ "learning_rate": 0.00010165714285714284,
+ "loss": 0.1,
+ "step": 2315
+ },
+ {
+ "epoch": 13.257142857142856,
+ "grad_norm": 0.6657671928405762,
+ "learning_rate": 0.00010122857142857141,
+ "loss": 0.1141,
+ "step": 2320
+ },
+ {
+ "epoch": 13.285714285714286,
+ "grad_norm": 0.7593729496002197,
+ "learning_rate": 0.0001008,
+ "loss": 0.1099,
+ "step": 2325
+ },
+ {
+ "epoch": 13.314285714285715,
+ "grad_norm": 0.6681057810783386,
+ "learning_rate": 0.00010037142857142857,
+ "loss": 0.112,
+ "step": 2330
+ },
+ {
+ "epoch": 13.342857142857143,
+ "grad_norm": 0.7155857682228088,
+ "learning_rate": 9.994285714285712e-05,
+ "loss": 0.0989,
+ "step": 2335
+ },
+ {
+ "epoch": 13.371428571428572,
+ "grad_norm": 0.9484553337097168,
+ "learning_rate": 9.951428571428571e-05,
+ "loss": 0.0902,
+ "step": 2340
+ },
+ {
+ "epoch": 13.4,
+ "grad_norm": 0.9317265152931213,
+ "learning_rate": 9.908571428571428e-05,
+ "loss": 0.1432,
+ "step": 2345
+ },
+ {
+ "epoch": 13.428571428571429,
+ "grad_norm": 1.039158821105957,
+ "learning_rate": 9.865714285714285e-05,
+ "loss": 0.114,
+ "step": 2350
+ },
+ {
+ "epoch": 13.457142857142857,
+ "grad_norm": 0.8524000644683838,
+ "learning_rate": 9.822857142857141e-05,
+ "loss": 0.1144,
+ "step": 2355
+ },
+ {
+ "epoch": 13.485714285714286,
+ "grad_norm": 0.6337461471557617,
+ "learning_rate": 9.779999999999999e-05,
+ "loss": 0.1073,
+ "step": 2360
+ },
+ {
+ "epoch": 13.514285714285714,
+ "grad_norm": 0.9097298383712769,
+ "learning_rate": 9.737142857142856e-05,
+ "loss": 0.1031,
+ "step": 2365
+ },
+ {
+ "epoch": 13.542857142857143,
+ "grad_norm": 1.2013412714004517,
+ "learning_rate": 9.694285714285713e-05,
+ "loss": 0.1174,
+ "step": 2370
+ },
+ {
+ "epoch": 13.571428571428571,
+ "grad_norm": 0.7055214643478394,
+ "learning_rate": 9.65142857142857e-05,
+ "loss": 0.1175,
+ "step": 2375
+ },
+ {
+ "epoch": 13.6,
+ "grad_norm": 0.807955265045166,
+ "learning_rate": 9.608571428571427e-05,
+ "loss": 0.1286,
+ "step": 2380
+ },
+ {
+ "epoch": 13.628571428571428,
+ "grad_norm": 0.6661797761917114,
+ "learning_rate": 9.565714285714285e-05,
+ "loss": 0.1091,
+ "step": 2385
+ },
+ {
+ "epoch": 13.657142857142857,
+ "grad_norm": 1.119604468345642,
+ "learning_rate": 9.522857142857143e-05,
+ "loss": 0.1393,
+ "step": 2390
+ },
+ {
+ "epoch": 13.685714285714285,
+ "grad_norm": 0.5365435481071472,
+ "learning_rate": 9.479999999999999e-05,
+ "loss": 0.1075,
+ "step": 2395
+ },
+ {
+ "epoch": 13.714285714285714,
+ "grad_norm": 0.9443924427032471,
+ "learning_rate": 9.437142857142856e-05,
+ "loss": 0.0977,
+ "step": 2400
+ },
+ {
+ "epoch": 13.742857142857144,
+ "grad_norm": 0.6075264811515808,
+ "learning_rate": 9.394285714285714e-05,
+ "loss": 0.1329,
+ "step": 2405
+ },
+ {
+ "epoch": 13.771428571428572,
+ "grad_norm": 1.019352912902832,
+ "learning_rate": 9.351428571428571e-05,
+ "loss": 0.1083,
+ "step": 2410
+ },
+ {
+ "epoch": 13.8,
+ "grad_norm": 0.7234058380126953,
+ "learning_rate": 9.308571428571427e-05,
+ "loss": 0.1118,
+ "step": 2415
+ },
+ {
+ "epoch": 13.82857142857143,
+ "grad_norm": 0.6786122918128967,
+ "learning_rate": 9.265714285714284e-05,
+ "loss": 0.1208,
+ "step": 2420
+ },
+ {
+ "epoch": 13.857142857142858,
+ "grad_norm": 0.5820732116699219,
+ "learning_rate": 9.222857142857142e-05,
+ "loss": 0.1022,
+ "step": 2425
+ },
+ {
+ "epoch": 13.885714285714286,
+ "grad_norm": 0.8007987141609192,
+ "learning_rate": 9.18e-05,
+ "loss": 0.1293,
+ "step": 2430
+ },
+ {
+ "epoch": 13.914285714285715,
+ "grad_norm": 0.6813766956329346,
+ "learning_rate": 9.137142857142855e-05,
+ "loss": 0.1284,
+ "step": 2435
+ },
+ {
+ "epoch": 13.942857142857143,
+ "grad_norm": 0.6460041403770447,
+ "learning_rate": 9.094285714285714e-05,
+ "loss": 0.1073,
+ "step": 2440
+ },
+ {
+ "epoch": 13.971428571428572,
+ "grad_norm": 0.5939205288887024,
+ "learning_rate": 9.051428571428571e-05,
+ "loss": 0.1185,
+ "step": 2445
+ },
+ {
+ "epoch": 14.0,
+ "grad_norm": 0.8150635361671448,
+ "learning_rate": 9.008571428571428e-05,
+ "loss": 0.1039,
+ "step": 2450
+ },
+ {
+ "epoch": 14.028571428571428,
+ "grad_norm": 1.3691389560699463,
+ "learning_rate": 8.965714285714285e-05,
+ "loss": 0.1112,
+ "step": 2455
+ },
+ {
+ "epoch": 14.057142857142857,
+ "grad_norm": 0.9042718410491943,
+ "learning_rate": 8.922857142857142e-05,
+ "loss": 0.112,
+ "step": 2460
+ },
+ {
+ "epoch": 14.085714285714285,
+ "grad_norm": 0.7222105860710144,
+ "learning_rate": 8.879999999999999e-05,
+ "loss": 0.1221,
+ "step": 2465
+ },
+ {
+ "epoch": 14.114285714285714,
+ "grad_norm": 0.595588207244873,
+ "learning_rate": 8.837142857142857e-05,
+ "loss": 0.1058,
+ "step": 2470
+ },
+ {
+ "epoch": 14.142857142857142,
+ "grad_norm": 0.5262706279754639,
+ "learning_rate": 8.794285714285713e-05,
+ "loss": 0.1071,
+ "step": 2475
+ },
+ {
+ "epoch": 14.17142857142857,
+ "grad_norm": 0.6511022448539734,
+ "learning_rate": 8.75142857142857e-05,
+ "loss": 0.0917,
+ "step": 2480
+ },
+ {
+ "epoch": 14.2,
+ "grad_norm": 0.5737650394439697,
+ "learning_rate": 8.708571428571427e-05,
+ "loss": 0.0988,
+ "step": 2485
+ },
+ {
+ "epoch": 14.228571428571428,
+ "grad_norm": 0.7679132223129272,
+ "learning_rate": 8.665714285714286e-05,
+ "loss": 0.1185,
+ "step": 2490
+ },
+ {
+ "epoch": 14.257142857142856,
+ "grad_norm": 0.641198456287384,
+ "learning_rate": 8.622857142857141e-05,
+ "loss": 0.0894,
+ "step": 2495
+ },
+ {
+ "epoch": 14.285714285714286,
+ "grad_norm": 0.7215464115142822,
+ "learning_rate": 8.579999999999998e-05,
+ "loss": 0.0935,
+ "step": 2500
+ },
+ {
+ "epoch": 14.314285714285715,
+ "grad_norm": 1.0740891695022583,
+ "learning_rate": 8.537142857142857e-05,
+ "loss": 0.1156,
+ "step": 2505
+ },
+ {
+ "epoch": 14.342857142857143,
+ "grad_norm": 0.6668990254402161,
+ "learning_rate": 8.494285714285714e-05,
+ "loss": 0.1006,
+ "step": 2510
+ },
+ {
+ "epoch": 14.371428571428572,
+ "grad_norm": 0.6674673557281494,
+ "learning_rate": 8.45142857142857e-05,
+ "loss": 0.1045,
+ "step": 2515
+ },
+ {
+ "epoch": 14.4,
+ "grad_norm": 0.6198854446411133,
+ "learning_rate": 8.408571428571428e-05,
+ "loss": 0.0997,
+ "step": 2520
+ },
+ {
+ "epoch": 14.428571428571429,
+ "grad_norm": 0.7187360525131226,
+ "learning_rate": 8.365714285714285e-05,
+ "loss": 0.1277,
+ "step": 2525
+ },
+ {
+ "epoch": 14.457142857142857,
+ "grad_norm": 0.583990216255188,
+ "learning_rate": 8.322857142857142e-05,
+ "loss": 0.1182,
+ "step": 2530
+ },
+ {
+ "epoch": 14.485714285714286,
+ "grad_norm": 1.1340539455413818,
+ "learning_rate": 8.28e-05,
+ "loss": 0.106,
+ "step": 2535
+ },
+ {
+ "epoch": 14.514285714285714,
+ "grad_norm": 0.6411644816398621,
+ "learning_rate": 8.237142857142856e-05,
+ "loss": 0.0963,
+ "step": 2540
+ },
+ {
+ "epoch": 14.542857142857143,
+ "grad_norm": 0.7092474102973938,
+ "learning_rate": 8.194285714285713e-05,
+ "loss": 0.1061,
+ "step": 2545
+ },
+ {
+ "epoch": 14.571428571428571,
+ "grad_norm": 0.6887038946151733,
+ "learning_rate": 8.151428571428572e-05,
+ "loss": 0.1224,
+ "step": 2550
+ },
+ {
+ "epoch": 14.6,
+ "grad_norm": 0.8119840621948242,
+ "learning_rate": 8.108571428571428e-05,
+ "loss": 0.1023,
+ "step": 2555
+ },
+ {
+ "epoch": 14.628571428571428,
+ "grad_norm": 0.6380637288093567,
+ "learning_rate": 8.065714285714285e-05,
+ "loss": 0.0893,
+ "step": 2560
+ },
+ {
+ "epoch": 14.657142857142857,
+ "grad_norm": 0.7857063412666321,
+ "learning_rate": 8.022857142857142e-05,
+ "loss": 0.1227,
+ "step": 2565
+ },
+ {
+ "epoch": 14.685714285714285,
+ "grad_norm": 0.6368046998977661,
+ "learning_rate": 7.98e-05,
+ "loss": 0.1074,
+ "step": 2570
+ },
+ {
+ "epoch": 14.714285714285714,
+ "grad_norm": 0.7269926071166992,
+ "learning_rate": 7.937142857142856e-05,
+ "loss": 0.1166,
+ "step": 2575
+ },
+ {
+ "epoch": 14.742857142857144,
+ "grad_norm": 0.6903791427612305,
+ "learning_rate": 7.894285714285713e-05,
+ "loss": 0.1274,
+ "step": 2580
+ },
+ {
+ "epoch": 14.771428571428572,
+ "grad_norm": 0.8257679343223572,
+ "learning_rate": 7.851428571428571e-05,
+ "loss": 0.1274,
+ "step": 2585
+ },
+ {
+ "epoch": 14.8,
+ "grad_norm": 1.0489627122879028,
+ "learning_rate": 7.808571428571428e-05,
+ "loss": 0.1091,
+ "step": 2590
+ },
+ {
+ "epoch": 14.82857142857143,
+ "grad_norm": 0.6699196696281433,
+ "learning_rate": 7.765714285714284e-05,
+ "loss": 0.1244,
+ "step": 2595
+ },
+ {
+ "epoch": 14.857142857142858,
+ "grad_norm": 0.61530601978302,
+ "learning_rate": 7.722857142857143e-05,
+ "loss": 0.1122,
+ "step": 2600
+ },
+ {
+ "epoch": 14.885714285714286,
+ "grad_norm": 0.5789124369621277,
+ "learning_rate": 7.68e-05,
+ "loss": 0.1272,
+ "step": 2605
+ },
+ {
+ "epoch": 14.914285714285715,
+ "grad_norm": 2.1323459148406982,
+ "learning_rate": 7.637142857142857e-05,
+ "loss": 0.1034,
+ "step": 2610
+ },
+ {
+ "epoch": 14.942857142857143,
+ "grad_norm": 1.2433545589447021,
+ "learning_rate": 7.594285714285714e-05,
+ "loss": 0.1052,
+ "step": 2615
+ },
+ {
+ "epoch": 14.971428571428572,
+ "grad_norm": 0.868093729019165,
+ "learning_rate": 7.551428571428571e-05,
+ "loss": 0.111,
+ "step": 2620
+ },
+ {
+ "epoch": 15.0,
+ "grad_norm": 0.6479918360710144,
+ "learning_rate": 7.508571428571428e-05,
+ "loss": 0.1067,
+ "step": 2625
+ },
+ {
+ "epoch": 15.028571428571428,
+ "grad_norm": 0.8062720894813538,
+ "learning_rate": 7.465714285714285e-05,
+ "loss": 0.1113,
+ "step": 2630
+ },
+ {
+ "epoch": 15.057142857142857,
+ "grad_norm": 0.7333181500434875,
+ "learning_rate": 7.422857142857142e-05,
+ "loss": 0.0985,
+ "step": 2635
+ },
+ {
+ "epoch": 15.085714285714285,
+ "grad_norm": 0.550039529800415,
+ "learning_rate": 7.379999999999999e-05,
+ "loss": 0.1077,
+ "step": 2640
+ },
+ {
+ "epoch": 15.114285714285714,
+ "grad_norm": 0.9256687164306641,
+ "learning_rate": 7.337142857142856e-05,
+ "loss": 0.0875,
+ "step": 2645
+ },
+ {
+ "epoch": 15.142857142857142,
+ "grad_norm": 0.6421870589256287,
+ "learning_rate": 7.294285714285713e-05,
+ "loss": 0.1069,
+ "step": 2650
+ },
+ {
+ "epoch": 15.17142857142857,
+ "grad_norm": 0.6614648699760437,
+ "learning_rate": 7.25142857142857e-05,
+ "loss": 0.1249,
+ "step": 2655
+ },
+ {
+ "epoch": 15.2,
+ "grad_norm": 0.8273601531982422,
+ "learning_rate": 7.208571428571429e-05,
+ "loss": 0.1135,
+ "step": 2660
+ },
+ {
+ "epoch": 15.228571428571428,
+ "grad_norm": 0.6795836687088013,
+ "learning_rate": 7.165714285714284e-05,
+ "loss": 0.1081,
+ "step": 2665
+ },
+ {
+ "epoch": 15.257142857142856,
+ "grad_norm": 0.7508160471916199,
+ "learning_rate": 7.122857142857143e-05,
+ "loss": 0.0869,
+ "step": 2670
+ },
+ {
+ "epoch": 15.285714285714286,
+ "grad_norm": 0.7219347357749939,
+ "learning_rate": 7.079999999999999e-05,
+ "loss": 0.1115,
+ "step": 2675
+ },
+ {
+ "epoch": 15.314285714285715,
+ "grad_norm": 0.5592671036720276,
+ "learning_rate": 7.037142857142857e-05,
+ "loss": 0.1116,
+ "step": 2680
+ },
+ {
+ "epoch": 15.342857142857143,
+ "grad_norm": 0.8736717104911804,
+ "learning_rate": 6.994285714285714e-05,
+ "loss": 0.0784,
+ "step": 2685
+ },
+ {
+ "epoch": 15.371428571428572,
+ "grad_norm": 0.6056572198867798,
+ "learning_rate": 6.951428571428571e-05,
+ "loss": 0.1105,
+ "step": 2690
+ },
+ {
+ "epoch": 15.4,
+ "grad_norm": 0.671410322189331,
+ "learning_rate": 6.908571428571428e-05,
+ "loss": 0.1219,
+ "step": 2695
+ },
+ {
+ "epoch": 15.428571428571429,
+ "grad_norm": 0.7952276468276978,
+ "learning_rate": 6.865714285714285e-05,
+ "loss": 0.0865,
+ "step": 2700
+ },
+ {
+ "epoch": 15.457142857142857,
+ "grad_norm": 0.8185123205184937,
+ "learning_rate": 6.822857142857142e-05,
+ "loss": 0.1095,
+ "step": 2705
+ },
+ {
+ "epoch": 15.485714285714286,
+ "grad_norm": 0.6969497203826904,
+ "learning_rate": 6.78e-05,
+ "loss": 0.0928,
+ "step": 2710
+ },
+ {
+ "epoch": 15.514285714285714,
+ "grad_norm": 0.7323058843612671,
+ "learning_rate": 6.737142857142857e-05,
+ "loss": 0.099,
+ "step": 2715
+ },
+ {
+ "epoch": 15.542857142857143,
+ "grad_norm": 0.6498017311096191,
+ "learning_rate": 6.694285714285714e-05,
+ "loss": 0.0871,
+ "step": 2720
+ },
+ {
+ "epoch": 15.571428571428571,
+ "grad_norm": 2.0899710655212402,
+ "learning_rate": 6.65142857142857e-05,
+ "loss": 0.1306,
+ "step": 2725
+ },
+ {
+ "epoch": 15.6,
+ "grad_norm": 1.0896337032318115,
+ "learning_rate": 6.608571428571428e-05,
+ "loss": 0.1085,
+ "step": 2730
+ },
+ {
+ "epoch": 15.628571428571428,
+ "grad_norm": 0.6709671020507812,
+ "learning_rate": 6.565714285714285e-05,
+ "loss": 0.0977,
+ "step": 2735
+ },
+ {
+ "epoch": 15.657142857142857,
+ "grad_norm": 0.6750431060791016,
+ "learning_rate": 6.522857142857142e-05,
+ "loss": 0.1154,
+ "step": 2740
+ },
+ {
+ "epoch": 15.685714285714285,
+ "grad_norm": 1.2888147830963135,
+ "learning_rate": 6.479999999999999e-05,
+ "loss": 0.0918,
+ "step": 2745
+ },
+ {
+ "epoch": 15.714285714285714,
+ "grad_norm": 0.9803931713104248,
+ "learning_rate": 6.437142857142857e-05,
+ "loss": 0.112,
+ "step": 2750
+ },
+ {
+ "epoch": 15.742857142857144,
+ "grad_norm": 0.8548974394798279,
+ "learning_rate": 6.394285714285713e-05,
+ "loss": 0.0974,
+ "step": 2755
+ },
+ {
+ "epoch": 15.771428571428572,
+ "grad_norm": 0.7924854159355164,
+ "learning_rate": 6.351428571428572e-05,
+ "loss": 0.1344,
+ "step": 2760
+ },
+ {
+ "epoch": 15.8,
+ "grad_norm": 0.9245836138725281,
+ "learning_rate": 6.308571428571429e-05,
+ "loss": 0.1182,
+ "step": 2765
+ },
+ {
+ "epoch": 15.82857142857143,
+ "grad_norm": 0.6067193150520325,
+ "learning_rate": 6.265714285714286e-05,
+ "loss": 0.0959,
+ "step": 2770
+ },
+ {
+ "epoch": 15.857142857142858,
+ "grad_norm": 0.5575870275497437,
+ "learning_rate": 6.222857142857143e-05,
+ "loss": 0.1208,
+ "step": 2775
+ },
+ {
+ "epoch": 15.885714285714286,
+ "grad_norm": 0.8608399629592896,
+ "learning_rate": 6.18e-05,
+ "loss": 0.0937,
+ "step": 2780
+ },
+ {
+ "epoch": 15.914285714285715,
+ "grad_norm": 0.6910924911499023,
+ "learning_rate": 6.137142857142857e-05,
+ "loss": 0.1175,
+ "step": 2785
+ },
+ {
+ "epoch": 15.942857142857143,
+ "grad_norm": 0.7266614437103271,
+ "learning_rate": 6.094285714285714e-05,
+ "loss": 0.1023,
+ "step": 2790
+ },
+ {
+ "epoch": 15.971428571428572,
+ "grad_norm": 0.7580139636993408,
+ "learning_rate": 6.051428571428571e-05,
+ "loss": 0.1103,
+ "step": 2795
+ },
+ {
+ "epoch": 16.0,
+ "grad_norm": 0.9288797974586487,
+ "learning_rate": 6.008571428571428e-05,
+ "loss": 0.0892,
+ "step": 2800
+ },
+ {
+ "epoch": 16.02857142857143,
+ "grad_norm": 1.4218194484710693,
+ "learning_rate": 5.9657142857142845e-05,
+ "loss": 0.104,
+ "step": 2805
+ },
+ {
+ "epoch": 16.057142857142857,
+ "grad_norm": 0.7665567994117737,
+ "learning_rate": 5.922857142857142e-05,
+ "loss": 0.1084,
+ "step": 2810
+ },
+ {
+ "epoch": 16.085714285714285,
+ "grad_norm": 0.8587457537651062,
+ "learning_rate": 5.88e-05,
+ "loss": 0.1041,
+ "step": 2815
+ },
+ {
+ "epoch": 16.114285714285714,
+ "grad_norm": 0.5792443752288818,
+ "learning_rate": 5.837142857142856e-05,
+ "loss": 0.1086,
+ "step": 2820
+ },
+ {
+ "epoch": 16.142857142857142,
+ "grad_norm": 0.6510186195373535,
+ "learning_rate": 5.794285714285714e-05,
+ "loss": 0.0919,
+ "step": 2825
+ },
+ {
+ "epoch": 16.17142857142857,
+ "grad_norm": 1.170145869255066,
+ "learning_rate": 5.751428571428571e-05,
+ "loss": 0.1083,
+ "step": 2830
+ },
+ {
+ "epoch": 16.2,
+ "grad_norm": 1.0514795780181885,
+ "learning_rate": 5.708571428571428e-05,
+ "loss": 0.1223,
+ "step": 2835
+ },
+ {
+ "epoch": 16.228571428571428,
+ "grad_norm": 0.7993499040603638,
+ "learning_rate": 5.665714285714285e-05,
+ "loss": 0.1101,
+ "step": 2840
+ },
+ {
+ "epoch": 16.257142857142856,
+ "grad_norm": 0.6342432498931885,
+ "learning_rate": 5.622857142857142e-05,
+ "loss": 0.1243,
+ "step": 2845
+ },
+ {
+ "epoch": 16.285714285714285,
+ "grad_norm": 1.2524505853652954,
+ "learning_rate": 5.5799999999999994e-05,
+ "loss": 0.1251,
+ "step": 2850
+ },
+ {
+ "epoch": 16.314285714285713,
+ "grad_norm": 1.0769789218902588,
+ "learning_rate": 5.537142857142857e-05,
+ "loss": 0.1074,
+ "step": 2855
+ },
+ {
+ "epoch": 16.34285714285714,
+ "grad_norm": 1.8232245445251465,
+ "learning_rate": 5.4942857142857136e-05,
+ "loss": 0.0929,
+ "step": 2860
+ },
+ {
+ "epoch": 16.37142857142857,
+ "grad_norm": 0.814189612865448,
+ "learning_rate": 5.451428571428571e-05,
+ "loss": 0.0998,
+ "step": 2865
+ },
+ {
+ "epoch": 16.4,
+ "grad_norm": 0.9731772541999817,
+ "learning_rate": 5.4085714285714284e-05,
+ "loss": 0.0849,
+ "step": 2870
+ },
+ {
+ "epoch": 16.428571428571427,
+ "grad_norm": 0.778213381767273,
+ "learning_rate": 5.3657142857142855e-05,
+ "loss": 0.0907,
+ "step": 2875
+ },
+ {
+ "epoch": 16.457142857142856,
+ "grad_norm": 0.9219964146614075,
+ "learning_rate": 5.3228571428571425e-05,
+ "loss": 0.0855,
+ "step": 2880
+ },
+ {
+ "epoch": 16.485714285714284,
+ "grad_norm": 0.7354393005371094,
+ "learning_rate": 5.279999999999999e-05,
+ "loss": 0.1296,
+ "step": 2885
+ },
+ {
+ "epoch": 16.514285714285712,
+ "grad_norm": 0.6051219701766968,
+ "learning_rate": 5.2371428571428567e-05,
+ "loss": 0.1086,
+ "step": 2890
+ },
+ {
+ "epoch": 16.542857142857144,
+ "grad_norm": 0.8592603206634521,
+ "learning_rate": 5.1942857142857144e-05,
+ "loss": 0.1017,
+ "step": 2895
+ },
+ {
+ "epoch": 16.571428571428573,
+ "grad_norm": 0.5748846530914307,
+ "learning_rate": 5.151428571428571e-05,
+ "loss": 0.0775,
+ "step": 2900
+ },
+ {
+ "epoch": 16.6,
+ "grad_norm": 0.6640213131904602,
+ "learning_rate": 5.1085714285714285e-05,
+ "loss": 0.1059,
+ "step": 2905
+ },
+ {
+ "epoch": 16.62857142857143,
+ "grad_norm": 0.9514361023902893,
+ "learning_rate": 5.065714285714285e-05,
+ "loss": 0.0832,
+ "step": 2910
+ },
+ {
+ "epoch": 16.65714285714286,
+ "grad_norm": 1.1062079668045044,
+ "learning_rate": 5.022857142857143e-05,
+ "loss": 0.0817,
+ "step": 2915
+ },
+ {
+ "epoch": 16.685714285714287,
+ "grad_norm": 0.6824453473091125,
+ "learning_rate": 4.98e-05,
+ "loss": 0.1064,
+ "step": 2920
+ },
+ {
+ "epoch": 16.714285714285715,
+ "grad_norm": 0.643827497959137,
+ "learning_rate": 4.937142857142856e-05,
+ "loss": 0.1196,
+ "step": 2925
+ },
+ {
+ "epoch": 16.742857142857144,
+ "grad_norm": 0.7824274897575378,
+ "learning_rate": 4.894285714285714e-05,
+ "loss": 0.0945,
+ "step": 2930
+ },
+ {
+ "epoch": 16.771428571428572,
+ "grad_norm": 0.7110689878463745,
+ "learning_rate": 4.8514285714285716e-05,
+ "loss": 0.1124,
+ "step": 2935
+ },
+ {
+ "epoch": 16.8,
+ "grad_norm": 0.9542856812477112,
+ "learning_rate": 4.808571428571428e-05,
+ "loss": 0.1036,
+ "step": 2940
+ },
+ {
+ "epoch": 16.82857142857143,
+ "grad_norm": 0.6353528499603271,
+ "learning_rate": 4.765714285714286e-05,
+ "loss": 0.0977,
+ "step": 2945
+ },
+ {
+ "epoch": 16.857142857142858,
+ "grad_norm": 0.843910813331604,
+ "learning_rate": 4.722857142857142e-05,
+ "loss": 0.1164,
+ "step": 2950
+ },
+ {
+ "epoch": 16.885714285714286,
+ "grad_norm": 0.9607085585594177,
+ "learning_rate": 4.68e-05,
+ "loss": 0.1111,
+ "step": 2955
+ },
+ {
+ "epoch": 16.914285714285715,
+ "grad_norm": 0.7393201589584351,
+ "learning_rate": 4.637142857142857e-05,
+ "loss": 0.106,
+ "step": 2960
+ },
+ {
+ "epoch": 16.942857142857143,
+ "grad_norm": 0.5248494148254395,
+ "learning_rate": 4.5942857142857134e-05,
+ "loss": 0.1017,
+ "step": 2965
+ },
+ {
+ "epoch": 16.97142857142857,
+ "grad_norm": 0.8800868988037109,
+ "learning_rate": 4.551428571428571e-05,
+ "loss": 0.0872,
+ "step": 2970
+ },
+ {
+ "epoch": 17.0,
+ "grad_norm": 0.8447640538215637,
+ "learning_rate": 4.5085714285714275e-05,
+ "loss": 0.1293,
+ "step": 2975
+ },
+ {
+ "epoch": 17.02857142857143,
+ "grad_norm": 0.5356553792953491,
+ "learning_rate": 4.465714285714285e-05,
+ "loss": 0.0984,
+ "step": 2980
+ },
+ {
+ "epoch": 17.057142857142857,
+ "grad_norm": 0.7713034152984619,
+ "learning_rate": 4.422857142857143e-05,
+ "loss": 0.0858,
+ "step": 2985
+ },
+ {
+ "epoch": 17.085714285714285,
+ "grad_norm": 0.9854580760002136,
+ "learning_rate": 4.3799999999999994e-05,
+ "loss": 0.1237,
+ "step": 2990
+ },
+ {
+ "epoch": 17.114285714285714,
+ "grad_norm": 0.7012975811958313,
+ "learning_rate": 4.337142857142857e-05,
+ "loss": 0.1233,
+ "step": 2995
+ },
+ {
+ "epoch": 17.142857142857142,
+ "grad_norm": 0.5461836457252502,
+ "learning_rate": 4.294285714285714e-05,
+ "loss": 0.0978,
+ "step": 3000
+ },
+ {
+ "epoch": 17.17142857142857,
+ "grad_norm": 0.8236174583435059,
+ "learning_rate": 4.2514285714285706e-05,
+ "loss": 0.1041,
+ "step": 3005
+ },
+ {
+ "epoch": 17.2,
+ "grad_norm": 1.040204644203186,
+ "learning_rate": 4.2085714285714284e-05,
+ "loss": 0.0974,
+ "step": 3010
+ },
+ {
+ "epoch": 17.228571428571428,
+ "grad_norm": 0.6091800928115845,
+ "learning_rate": 4.165714285714285e-05,
+ "loss": 0.1098,
+ "step": 3015
+ },
+ {
+ "epoch": 17.257142857142856,
+ "grad_norm": 0.7302913069725037,
+ "learning_rate": 4.1228571428571425e-05,
+ "loss": 0.1162,
+ "step": 3020
+ },
+ {
+ "epoch": 17.285714285714285,
+ "grad_norm": 0.7015142440795898,
+ "learning_rate": 4.08e-05,
+ "loss": 0.1059,
+ "step": 3025
+ },
+ {
+ "epoch": 17.314285714285713,
+ "grad_norm": 0.8828005790710449,
+ "learning_rate": 4.0371428571428566e-05,
+ "loss": 0.1181,
+ "step": 3030
+ },
+ {
+ "epoch": 17.34285714285714,
+ "grad_norm": 0.7601356506347656,
+ "learning_rate": 3.994285714285714e-05,
+ "loss": 0.1034,
+ "step": 3035
+ },
+ {
+ "epoch": 17.37142857142857,
+ "grad_norm": 0.8163303136825562,
+ "learning_rate": 3.951428571428571e-05,
+ "loss": 0.1221,
+ "step": 3040
+ },
+ {
+ "epoch": 17.4,
+ "grad_norm": 0.6229556202888489,
+ "learning_rate": 3.908571428571428e-05,
+ "loss": 0.0807,
+ "step": 3045
+ },
+ {
+ "epoch": 17.428571428571427,
+ "grad_norm": 0.7089337706565857,
+ "learning_rate": 3.8657142857142856e-05,
+ "loss": 0.1059,
+ "step": 3050
+ },
+ {
+ "epoch": 17.457142857142856,
+ "grad_norm": 0.7208603024482727,
+ "learning_rate": 3.822857142857142e-05,
+ "loss": 0.1091,
+ "step": 3055
+ },
+ {
+ "epoch": 17.485714285714284,
+ "grad_norm": 0.6921990513801575,
+ "learning_rate": 3.78e-05,
+ "loss": 0.0945,
+ "step": 3060
+ },
+ {
+ "epoch": 17.514285714285712,
+ "grad_norm": 0.6333826184272766,
+ "learning_rate": 3.737142857142857e-05,
+ "loss": 0.0872,
+ "step": 3065
+ },
+ {
+ "epoch": 17.542857142857144,
+ "grad_norm": 0.634069561958313,
+ "learning_rate": 3.694285714285714e-05,
+ "loss": 0.103,
+ "step": 3070
+ },
+ {
+ "epoch": 17.571428571428573,
+ "grad_norm": 0.951956033706665,
+ "learning_rate": 3.651428571428571e-05,
+ "loss": 0.0803,
+ "step": 3075
+ },
+ {
+ "epoch": 17.6,
+ "grad_norm": 0.6465116143226624,
+ "learning_rate": 3.608571428571428e-05,
+ "loss": 0.1072,
+ "step": 3080
+ },
+ {
+ "epoch": 17.62857142857143,
+ "grad_norm": 0.9632124304771423,
+ "learning_rate": 3.565714285714285e-05,
+ "loss": 0.1052,
+ "step": 3085
+ },
+ {
+ "epoch": 17.65714285714286,
+ "grad_norm": 1.454498529434204,
+ "learning_rate": 3.522857142857142e-05,
+ "loss": 0.1019,
+ "step": 3090
+ },
+ {
+ "epoch": 17.685714285714287,
+ "grad_norm": 0.9264261722564697,
+ "learning_rate": 3.48e-05,
+ "loss": 0.0877,
+ "step": 3095
+ },
+ {
+ "epoch": 17.714285714285715,
+ "grad_norm": 0.913129985332489,
+ "learning_rate": 3.437142857142857e-05,
+ "loss": 0.1072,
+ "step": 3100
+ },
+ {
+ "epoch": 17.742857142857144,
+ "grad_norm": 0.6301171779632568,
+ "learning_rate": 3.394285714285714e-05,
+ "loss": 0.1095,
+ "step": 3105
+ },
+ {
+ "epoch": 17.771428571428572,
+ "grad_norm": 0.7603205442428589,
+ "learning_rate": 3.351428571428571e-05,
+ "loss": 0.1042,
+ "step": 3110
+ },
+ {
+ "epoch": 17.8,
+ "grad_norm": 0.5240493416786194,
+ "learning_rate": 3.308571428571428e-05,
+ "loss": 0.0901,
+ "step": 3115
+ },
+ {
+ "epoch": 17.82857142857143,
+ "grad_norm": 0.709513783454895,
+ "learning_rate": 3.265714285714285e-05,
+ "loss": 0.0899,
+ "step": 3120
+ },
+ {
+ "epoch": 17.857142857142858,
+ "grad_norm": 0.8211326003074646,
+ "learning_rate": 3.222857142857142e-05,
+ "loss": 0.106,
+ "step": 3125
+ },
+ {
+ "epoch": 17.885714285714286,
+ "grad_norm": 1.5929844379425049,
+ "learning_rate": 3.1799999999999994e-05,
+ "loss": 0.1282,
+ "step": 3130
+ },
+ {
+ "epoch": 17.914285714285715,
+ "grad_norm": 0.7880852818489075,
+ "learning_rate": 3.1371428571428564e-05,
+ "loss": 0.1076,
+ "step": 3135
+ },
+ {
+ "epoch": 17.942857142857143,
+ "grad_norm": 0.8155010342597961,
+ "learning_rate": 3.094285714285714e-05,
+ "loss": 0.1044,
+ "step": 3140
+ },
+ {
+ "epoch": 17.97142857142857,
+ "grad_norm": 0.7286632657051086,
+ "learning_rate": 3.0514285714285713e-05,
+ "loss": 0.1083,
+ "step": 3145
+ },
+ {
+ "epoch": 18.0,
+ "grad_norm": 0.618222177028656,
+ "learning_rate": 3.0085714285714283e-05,
+ "loss": 0.1051,
+ "step": 3150
+ },
+ {
+ "epoch": 18.02857142857143,
+ "grad_norm": 0.6627287864685059,
+ "learning_rate": 2.9657142857142854e-05,
+ "loss": 0.0861,
+ "step": 3155
+ },
+ {
+ "epoch": 18.057142857142857,
+ "grad_norm": 0.7387683987617493,
+ "learning_rate": 2.9228571428571428e-05,
+ "loss": 0.1008,
+ "step": 3160
+ },
+ {
+ "epoch": 18.085714285714285,
+ "grad_norm": 1.671528935432434,
+ "learning_rate": 2.88e-05,
+ "loss": 0.1032,
+ "step": 3165
+ },
+ {
+ "epoch": 18.114285714285714,
+ "grad_norm": 1.9250961542129517,
+ "learning_rate": 2.837142857142857e-05,
+ "loss": 0.1142,
+ "step": 3170
+ },
+ {
+ "epoch": 18.142857142857142,
+ "grad_norm": 0.7627262473106384,
+ "learning_rate": 2.794285714285714e-05,
+ "loss": 0.1047,
+ "step": 3175
+ },
+ {
+ "epoch": 18.17142857142857,
+ "grad_norm": 0.8871546983718872,
+ "learning_rate": 2.7514285714285714e-05,
+ "loss": 0.0964,
+ "step": 3180
+ },
+ {
+ "epoch": 18.2,
+ "grad_norm": 1.5572978258132935,
+ "learning_rate": 2.7085714285714285e-05,
+ "loss": 0.0898,
+ "step": 3185
+ },
+ {
+ "epoch": 18.228571428571428,
+ "grad_norm": 0.5867496132850647,
+ "learning_rate": 2.6657142857142856e-05,
+ "loss": 0.104,
+ "step": 3190
+ },
+ {
+ "epoch": 18.257142857142856,
+ "grad_norm": 1.2859222888946533,
+ "learning_rate": 2.6228571428571426e-05,
+ "loss": 0.1016,
+ "step": 3195
+ },
+ {
+ "epoch": 18.285714285714285,
+ "grad_norm": 0.9340577721595764,
+ "learning_rate": 2.5799999999999997e-05,
+ "loss": 0.1128,
+ "step": 3200
+ },
+ {
+ "epoch": 18.314285714285713,
+ "grad_norm": 0.7944216132164001,
+ "learning_rate": 2.537142857142857e-05,
+ "loss": 0.086,
+ "step": 3205
+ },
+ {
+ "epoch": 18.34285714285714,
+ "grad_norm": 0.8302488327026367,
+ "learning_rate": 2.4942857142857142e-05,
+ "loss": 0.1167,
+ "step": 3210
+ },
+ {
+ "epoch": 18.37142857142857,
+ "grad_norm": 0.7752293348312378,
+ "learning_rate": 2.4514285714285712e-05,
+ "loss": 0.0926,
+ "step": 3215
+ },
+ {
+ "epoch": 18.4,
+ "grad_norm": 0.7102646827697754,
+ "learning_rate": 2.4085714285714283e-05,
+ "loss": 0.1076,
+ "step": 3220
+ },
+ {
+ "epoch": 18.428571428571427,
+ "grad_norm": 0.8423459529876709,
+ "learning_rate": 2.3657142857142857e-05,
+ "loss": 0.0922,
+ "step": 3225
+ },
+ {
+ "epoch": 18.457142857142856,
+ "grad_norm": 0.798784613609314,
+ "learning_rate": 2.3228571428571428e-05,
+ "loss": 0.1124,
+ "step": 3230
+ },
+ {
+ "epoch": 18.485714285714284,
+ "grad_norm": 0.8125432133674622,
+ "learning_rate": 2.28e-05,
+ "loss": 0.1113,
+ "step": 3235
+ },
+ {
+ "epoch": 18.514285714285712,
+ "grad_norm": 0.7036454081535339,
+ "learning_rate": 2.237142857142857e-05,
+ "loss": 0.1068,
+ "step": 3240
+ },
+ {
+ "epoch": 18.542857142857144,
+ "grad_norm": 1.1506881713867188,
+ "learning_rate": 2.1942857142857143e-05,
+ "loss": 0.0998,
+ "step": 3245
+ },
+ {
+ "epoch": 18.571428571428573,
+ "grad_norm": 0.6385633945465088,
+ "learning_rate": 2.1514285714285714e-05,
+ "loss": 0.1205,
+ "step": 3250
+ },
+ {
+ "epoch": 18.6,
+ "grad_norm": 0.6759969592094421,
+ "learning_rate": 2.1085714285714285e-05,
+ "loss": 0.1109,
+ "step": 3255
+ },
+ {
+ "epoch": 18.62857142857143,
+ "grad_norm": 0.49440646171569824,
+ "learning_rate": 2.0657142857142855e-05,
+ "loss": 0.0963,
+ "step": 3260
+ },
+ {
+ "epoch": 18.65714285714286,
+ "grad_norm": 0.5873342156410217,
+ "learning_rate": 2.022857142857143e-05,
+ "loss": 0.0918,
+ "step": 3265
+ },
+ {
+ "epoch": 18.685714285714287,
+ "grad_norm": 0.6430002450942993,
+ "learning_rate": 1.98e-05,
+ "loss": 0.0884,
+ "step": 3270
+ },
+ {
+ "epoch": 18.714285714285715,
+ "grad_norm": 0.6482405066490173,
+ "learning_rate": 1.937142857142857e-05,
+ "loss": 0.0986,
+ "step": 3275
+ },
+ {
+ "epoch": 18.742857142857144,
+ "grad_norm": 5.264317035675049,
+ "learning_rate": 1.894285714285714e-05,
+ "loss": 0.1177,
+ "step": 3280
+ },
+ {
+ "epoch": 18.771428571428572,
+ "grad_norm": 0.7969473600387573,
+ "learning_rate": 1.8514285714285712e-05,
+ "loss": 0.1058,
+ "step": 3285
+ },
+ {
+ "epoch": 18.8,
+ "grad_norm": 0.8115813732147217,
+ "learning_rate": 1.8085714285714283e-05,
+ "loss": 0.1013,
+ "step": 3290
+ },
+ {
+ "epoch": 18.82857142857143,
+ "grad_norm": 0.7753077745437622,
+ "learning_rate": 1.7657142857142857e-05,
+ "loss": 0.1146,
+ "step": 3295
+ },
+ {
+ "epoch": 18.857142857142858,
+ "grad_norm": 0.7666271328926086,
+ "learning_rate": 1.7228571428571428e-05,
+ "loss": 0.0982,
+ "step": 3300
+ },
+ {
+ "epoch": 18.885714285714286,
+ "grad_norm": 0.803175151348114,
+ "learning_rate": 1.68e-05,
+ "loss": 0.0948,
+ "step": 3305
+ },
+ {
+ "epoch": 18.914285714285715,
+ "grad_norm": 0.6946846842765808,
+ "learning_rate": 1.637142857142857e-05,
+ "loss": 0.1233,
+ "step": 3310
+ },
+ {
+ "epoch": 18.942857142857143,
+ "grad_norm": 0.6007334589958191,
+ "learning_rate": 1.5942857142857143e-05,
+ "loss": 0.0998,
+ "step": 3315
+ },
+ {
+ "epoch": 18.97142857142857,
+ "grad_norm": 1.0508538484573364,
+ "learning_rate": 1.5514285714285714e-05,
+ "loss": 0.1141,
+ "step": 3320
+ },
+ {
+ "epoch": 19.0,
+ "grad_norm": 1.0431159734725952,
+ "learning_rate": 1.5085714285714285e-05,
+ "loss": 0.0844,
+ "step": 3325
+ },
+ {
+ "epoch": 19.02857142857143,
+ "grad_norm": 0.8848056793212891,
+ "learning_rate": 1.4657142857142855e-05,
+ "loss": 0.1158,
+ "step": 3330
+ },
+ {
+ "epoch": 19.057142857142857,
+ "grad_norm": 0.6595423817634583,
+ "learning_rate": 1.4228571428571428e-05,
+ "loss": 0.1045,
+ "step": 3335
+ },
+ {
+ "epoch": 19.085714285714285,
+ "grad_norm": 0.7581779360771179,
+ "learning_rate": 1.3799999999999998e-05,
+ "loss": 0.1042,
+ "step": 3340
+ },
+ {
+ "epoch": 19.114285714285714,
+ "grad_norm": 0.7570774555206299,
+ "learning_rate": 1.337142857142857e-05,
+ "loss": 0.0884,
+ "step": 3345
+ },
+ {
+ "epoch": 19.142857142857142,
+ "grad_norm": 0.8883652687072754,
+ "learning_rate": 1.2942857142857141e-05,
+ "loss": 0.0921,
+ "step": 3350
+ },
+ {
+ "epoch": 19.17142857142857,
+ "grad_norm": 0.5633381605148315,
+ "learning_rate": 1.2514285714285714e-05,
+ "loss": 0.0875,
+ "step": 3355
+ },
+ {
+ "epoch": 19.2,
+ "grad_norm": 0.7671384215354919,
+ "learning_rate": 1.2085714285714284e-05,
+ "loss": 0.0828,
+ "step": 3360
+ },
+ {
+ "epoch": 19.228571428571428,
+ "grad_norm": 0.7777629494667053,
+ "learning_rate": 1.1657142857142855e-05,
+ "loss": 0.083,
+ "step": 3365
+ },
+ {
+ "epoch": 19.257142857142856,
+ "grad_norm": 0.602395236492157,
+ "learning_rate": 1.1228571428571428e-05,
+ "loss": 0.1072,
+ "step": 3370
+ },
+ {
+ "epoch": 19.285714285714285,
+ "grad_norm": 0.8774672746658325,
+ "learning_rate": 1.0799999999999998e-05,
+ "loss": 0.1093,
+ "step": 3375
+ },
+ {
+ "epoch": 19.314285714285713,
+ "grad_norm": 0.6401615142822266,
+ "learning_rate": 1.037142857142857e-05,
+ "loss": 0.1166,
+ "step": 3380
+ },
+ {
+ "epoch": 19.34285714285714,
+ "grad_norm": 0.6146759390830994,
+ "learning_rate": 9.942857142857141e-06,
+ "loss": 0.0972,
+ "step": 3385
+ },
+ {
+ "epoch": 19.37142857142857,
+ "grad_norm": 0.7564222812652588,
+ "learning_rate": 9.514285714285714e-06,
+ "loss": 0.1029,
+ "step": 3390
+ },
+ {
+ "epoch": 19.4,
+ "grad_norm": 0.8068543076515198,
+ "learning_rate": 9.085714285714286e-06,
+ "loss": 0.0889,
+ "step": 3395
+ },
+ {
+ "epoch": 19.428571428571427,
+ "grad_norm": 0.8872269988059998,
+ "learning_rate": 8.657142857142855e-06,
+ "loss": 0.0918,
+ "step": 3400
+ },
+ {
+ "epoch": 19.457142857142856,
+ "grad_norm": 0.7162922620773315,
+ "learning_rate": 8.228571428571427e-06,
+ "loss": 0.1072,
+ "step": 3405
+ },
+ {
+ "epoch": 19.485714285714284,
+ "grad_norm": 0.7198708057403564,
+ "learning_rate": 7.799999999999998e-06,
+ "loss": 0.097,
+ "step": 3410
+ },
+ {
+ "epoch": 19.514285714285712,
+ "grad_norm": 0.9534723162651062,
+ "learning_rate": 7.3714285714285706e-06,
+ "loss": 0.1075,
+ "step": 3415
+ },
+ {
+ "epoch": 19.542857142857144,
+ "grad_norm": 0.9135831594467163,
+ "learning_rate": 6.942857142857142e-06,
+ "loss": 0.0791,
+ "step": 3420
+ },
+ {
+ "epoch": 19.571428571428573,
+ "grad_norm": 0.8475795984268188,
+ "learning_rate": 6.514285714285714e-06,
+ "loss": 0.1158,
+ "step": 3425
+ },
+ {
+ "epoch": 19.6,
+ "grad_norm": 0.7961953282356262,
+ "learning_rate": 6.085714285714285e-06,
+ "loss": 0.1049,
+ "step": 3430
+ },
+ {
+ "epoch": 19.62857142857143,
+ "grad_norm": 0.8671003580093384,
+ "learning_rate": 5.657142857142857e-06,
+ "loss": 0.1218,
+ "step": 3435
+ },
+ {
+ "epoch": 19.65714285714286,
+ "grad_norm": 1.1290555000305176,
+ "learning_rate": 5.228571428571428e-06,
+ "loss": 0.0798,
+ "step": 3440
+ },
+ {
+ "epoch": 19.685714285714287,
+ "grad_norm": 0.9520301222801208,
+ "learning_rate": 4.8e-06,
+ "loss": 0.1026,
+ "step": 3445
+ },
+ {
+ "epoch": 19.714285714285715,
+ "grad_norm": 1.114631175994873,
+ "learning_rate": 4.371428571428571e-06,
+ "loss": 0.1242,
+ "step": 3450
+ },
+ {
+ "epoch": 19.742857142857144,
+ "grad_norm": 0.8614441752433777,
+ "learning_rate": 3.942857142857143e-06,
+ "loss": 0.1103,
+ "step": 3455
+ },
+ {
+ "epoch": 19.771428571428572,
+ "grad_norm": 3.417344570159912,
+ "learning_rate": 3.5142857142857136e-06,
+ "loss": 0.1088,
+ "step": 3460
+ },
+ {
+ "epoch": 19.8,
+ "grad_norm": 0.5386614203453064,
+ "learning_rate": 3.085714285714285e-06,
+ "loss": 0.0923,
+ "step": 3465
+ },
+ {
+ "epoch": 19.82857142857143,
+ "grad_norm": 0.6228803396224976,
+ "learning_rate": 2.6571428571428566e-06,
+ "loss": 0.0872,
+ "step": 3470
+ },
+ {
+ "epoch": 19.857142857142858,
+ "grad_norm": 0.9215123653411865,
+ "learning_rate": 2.228571428571428e-06,
+ "loss": 0.0927,
+ "step": 3475
+ },
+ {
+ "epoch": 19.885714285714286,
+ "grad_norm": 0.5741862058639526,
+ "learning_rate": 1.8e-06,
+ "loss": 0.1066,
+ "step": 3480
+ },
+ {
+ "epoch": 19.914285714285715,
+ "grad_norm": 0.6639522910118103,
+ "learning_rate": 1.3714285714285715e-06,
+ "loss": 0.0966,
+ "step": 3485
+ },
+ {
+ "epoch": 19.942857142857143,
+ "grad_norm": 0.5956189632415771,
+ "learning_rate": 9.428571428571428e-07,
+ "loss": 0.1126,
+ "step": 3490
+ },
+ {
+ "epoch": 19.97142857142857,
+ "grad_norm": 0.7502082586288452,
+ "learning_rate": 5.142857142857142e-07,
+ "loss": 0.0887,
+ "step": 3495
+ },
+ {
+ "epoch": 20.0,
+ "grad_norm": 0.7295340895652771,
+ "learning_rate": 8.571428571428572e-08,
+ "loss": 0.0894,
+ "step": 3500
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 3500,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 20,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 0.0,
+ "train_batch_size": 200,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/glot-contrastive-final-lora/checkpoint-3500/training_args.bin b/glot-contrastive-final-lora/checkpoint-3500/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..504636d7db81a728b5309e791498748e16c3b8ec
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-3500/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02a87dc6b2c67ad3df98065b9e8fa21d9d93cd2cb361c532cb83c8a37bdc81a3
+size 5777
diff --git a/glot-contrastive-final-lora/checkpoint-500/README.md b/glot-contrastive-final-lora/checkpoint-500/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9e142ce83a38d62751eee08226da0acc7c10eae5
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-500/README.md
@@ -0,0 +1,206 @@
+---
+base_model: ./glot-mlm-adapted
+library_name: peft
+tags:
+- base_model:adapter:./glot-mlm-adapted
+- lora
+- transformers
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/glot-contrastive-final-lora/checkpoint-500/adapter_config.json b/glot-contrastive-final-lora/checkpoint-500/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..846b0d8ef2ad7bc4b0b04decd5769b6250d7be73
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-500/adapter_config.json
@@ -0,0 +1,37 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "./glot-mlm-adapted",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "query",
+ "value"
+ ],
+ "target_parameters": null,
+ "task_type": "FEATURE_EXTRACTION",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/glot-contrastive-final-lora/checkpoint-500/adapter_model.safetensors b/glot-contrastive-final-lora/checkpoint-500/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7ec8776f38c68f5ed6c88c8787cd9329a82e969f
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-500/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1b44b5f55073f2521757e46e9180a390c2cdea6590d7bef8f961ffef9fd06fb
+size 2365824
diff --git a/glot-contrastive-final-lora/checkpoint-500/optimizer.pt b/glot-contrastive-final-lora/checkpoint-500/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4c8e6208f6ff74597b0ac2dec615305caa0dc94a
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:524eed2618966787cdc3c8b7a304e201c8b5fd9dc5932284f208f7ce24f96dec
+size 4760395
diff --git a/glot-contrastive-final-lora/checkpoint-500/rng_state.pth b/glot-contrastive-final-lora/checkpoint-500/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..3cedd628c79e483da81d5902f59b1f462f277654
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-500/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:91667f27232c2f24307f0b8d5980c62b6cf48987f494164a9a220e1f7de29d1c
+size 14645
diff --git a/glot-contrastive-final-lora/checkpoint-500/scheduler.pt b/glot-contrastive-final-lora/checkpoint-500/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..8cb684291513b6a7c362908f1d14d27539fec384
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-500/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a8327fad933e6a9c750f39d471d04a7cfe660f5e99f616254567588fe1a243c3
+size 1465
diff --git a/glot-contrastive-final-lora/checkpoint-500/sentencepiece.bpe.model b/glot-contrastive-final-lora/checkpoint-500/sentencepiece.bpe.model
new file mode 100644
index 0000000000000000000000000000000000000000..41409bcc76133d6e92fdd5909093d24e32662ba0
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-500/sentencepiece.bpe.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a313a26470baedaede322622492f2a542aa41527ddc5d40de444e945ad3c613
+size 7658320
diff --git a/glot-contrastive-final-lora/checkpoint-500/special_tokens_map.json b/glot-contrastive-final-lora/checkpoint-500/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..68171d1ff68b731a33d119708476692c094a466b
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-500/special_tokens_map.json
@@ -0,0 +1,15 @@
+{
+ "bos_token": "",
+ "cls_token": "",
+ "eos_token": "",
+ "mask_token": {
+ "content": "",
+ "lstrip": true,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "",
+ "sep_token": "",
+ "unk_token": ""
+}
diff --git a/glot-contrastive-final-lora/checkpoint-500/tokenizer_config.json b/glot-contrastive-final-lora/checkpoint-500/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..2d4a02cfefad7bc6d993905187616acfc547f17d
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-500/tokenizer_config.json
@@ -0,0 +1,57 @@
+{
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "3": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "401144": {
+ "content": "",
+ "lstrip": true,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "",
+ "clean_up_tokenization_spaces": false,
+ "cls_token": "",
+ "eos_token": "",
+ "extra_special_tokens": {},
+ "mask_token": "",
+ "model_max_length": 512,
+ "pad_token": "",
+ "sep_token": "",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "XLMRobertaTokenizer",
+ "unk_token": "",
+ "use_fast": true
+}
diff --git a/glot-contrastive-final-lora/checkpoint-500/trainer_state.json b/glot-contrastive-final-lora/checkpoint-500/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1aa33e916da6c284236ad1e1bd75fc05ee038d30
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-500/trainer_state.json
@@ -0,0 +1,734 @@
+{
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.857142857142857,
+ "eval_steps": 5,
+ "global_step": 500,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.02857142857142857,
+ "grad_norm": 0.1407003551721573,
+ "learning_rate": 0.00029965714285714283,
+ "loss": 0.9726,
+ "step": 5
+ },
+ {
+ "epoch": 0.05714285714285714,
+ "grad_norm": 0.26689061522483826,
+ "learning_rate": 0.0002992285714285714,
+ "loss": 0.9633,
+ "step": 10
+ },
+ {
+ "epoch": 0.08571428571428572,
+ "grad_norm": 0.8670485615730286,
+ "learning_rate": 0.0002988,
+ "loss": 0.9013,
+ "step": 15
+ },
+ {
+ "epoch": 0.11428571428571428,
+ "grad_norm": 0.9785467386245728,
+ "learning_rate": 0.00029837142857142853,
+ "loss": 0.6942,
+ "step": 20
+ },
+ {
+ "epoch": 0.14285714285714285,
+ "grad_norm": 1.3083932399749756,
+ "learning_rate": 0.0002979428571428571,
+ "loss": 0.4472,
+ "step": 25
+ },
+ {
+ "epoch": 0.17142857142857143,
+ "grad_norm": 1.6103293895721436,
+ "learning_rate": 0.0002975142857142857,
+ "loss": 0.3782,
+ "step": 30
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 2.6353416442871094,
+ "learning_rate": 0.0002970857142857143,
+ "loss": 0.3732,
+ "step": 35
+ },
+ {
+ "epoch": 0.22857142857142856,
+ "grad_norm": 0.9949072003364563,
+ "learning_rate": 0.0002966571428571428,
+ "loss": 0.3506,
+ "step": 40
+ },
+ {
+ "epoch": 0.2571428571428571,
+ "grad_norm": 1.280673861503601,
+ "learning_rate": 0.0002962285714285714,
+ "loss": 0.3346,
+ "step": 45
+ },
+ {
+ "epoch": 0.2857142857142857,
+ "grad_norm": 0.7681456208229065,
+ "learning_rate": 0.0002958,
+ "loss": 0.2832,
+ "step": 50
+ },
+ {
+ "epoch": 0.3142857142857143,
+ "grad_norm": 1.0000813007354736,
+ "learning_rate": 0.0002953714285714285,
+ "loss": 0.2603,
+ "step": 55
+ },
+ {
+ "epoch": 0.34285714285714286,
+ "grad_norm": 1.0222399234771729,
+ "learning_rate": 0.0002949428571428571,
+ "loss": 0.2507,
+ "step": 60
+ },
+ {
+ "epoch": 0.37142857142857144,
+ "grad_norm": 0.896902322769165,
+ "learning_rate": 0.0002945142857142857,
+ "loss": 0.2556,
+ "step": 65
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 0.9035541415214539,
+ "learning_rate": 0.00029408571428571426,
+ "loss": 0.2402,
+ "step": 70
+ },
+ {
+ "epoch": 0.42857142857142855,
+ "grad_norm": 1.4886469841003418,
+ "learning_rate": 0.00029365714285714285,
+ "loss": 0.2376,
+ "step": 75
+ },
+ {
+ "epoch": 0.45714285714285713,
+ "grad_norm": 0.8951187133789062,
+ "learning_rate": 0.0002932285714285714,
+ "loss": 0.2276,
+ "step": 80
+ },
+ {
+ "epoch": 0.4857142857142857,
+ "grad_norm": 0.7876377105712891,
+ "learning_rate": 0.00029279999999999996,
+ "loss": 0.2537,
+ "step": 85
+ },
+ {
+ "epoch": 0.5142857142857142,
+ "grad_norm": 1.0927226543426514,
+ "learning_rate": 0.00029237142857142855,
+ "loss": 0.2152,
+ "step": 90
+ },
+ {
+ "epoch": 0.5428571428571428,
+ "grad_norm": 1.4946355819702148,
+ "learning_rate": 0.00029194285714285713,
+ "loss": 0.2441,
+ "step": 95
+ },
+ {
+ "epoch": 0.5714285714285714,
+ "grad_norm": 0.7082991600036621,
+ "learning_rate": 0.0002915142857142857,
+ "loss": 0.2708,
+ "step": 100
+ },
+ {
+ "epoch": 0.6,
+ "grad_norm": 0.670010507106781,
+ "learning_rate": 0.00029108571428571424,
+ "loss": 0.2396,
+ "step": 105
+ },
+ {
+ "epoch": 0.6285714285714286,
+ "grad_norm": 0.9797312021255493,
+ "learning_rate": 0.00029065714285714283,
+ "loss": 0.2275,
+ "step": 110
+ },
+ {
+ "epoch": 0.6571428571428571,
+ "grad_norm": 1.5220463275909424,
+ "learning_rate": 0.0002902285714285714,
+ "loss": 0.2114,
+ "step": 115
+ },
+ {
+ "epoch": 0.6857142857142857,
+ "grad_norm": 1.3326867818832397,
+ "learning_rate": 0.00028979999999999994,
+ "loss": 0.241,
+ "step": 120
+ },
+ {
+ "epoch": 0.7142857142857143,
+ "grad_norm": 1.1195529699325562,
+ "learning_rate": 0.0002893714285714285,
+ "loss": 0.2389,
+ "step": 125
+ },
+ {
+ "epoch": 0.7428571428571429,
+ "grad_norm": 0.7551061511039734,
+ "learning_rate": 0.0002889428571428571,
+ "loss": 0.2162,
+ "step": 130
+ },
+ {
+ "epoch": 0.7714285714285715,
+ "grad_norm": 1.018908977508545,
+ "learning_rate": 0.0002885142857142857,
+ "loss": 0.1924,
+ "step": 135
+ },
+ {
+ "epoch": 0.8,
+ "grad_norm": 2.123642921447754,
+ "learning_rate": 0.0002880857142857143,
+ "loss": 0.2174,
+ "step": 140
+ },
+ {
+ "epoch": 0.8285714285714286,
+ "grad_norm": 0.7585068941116333,
+ "learning_rate": 0.0002876571428571428,
+ "loss": 0.2006,
+ "step": 145
+ },
+ {
+ "epoch": 0.8571428571428571,
+ "grad_norm": 1.64150869846344,
+ "learning_rate": 0.0002872285714285714,
+ "loss": 0.1905,
+ "step": 150
+ },
+ {
+ "epoch": 0.8857142857142857,
+ "grad_norm": 0.9126951694488525,
+ "learning_rate": 0.0002868,
+ "loss": 0.2312,
+ "step": 155
+ },
+ {
+ "epoch": 0.9142857142857143,
+ "grad_norm": 0.7278801202774048,
+ "learning_rate": 0.00028637142857142856,
+ "loss": 0.2077,
+ "step": 160
+ },
+ {
+ "epoch": 0.9428571428571428,
+ "grad_norm": 0.8931339383125305,
+ "learning_rate": 0.00028594285714285715,
+ "loss": 0.1951,
+ "step": 165
+ },
+ {
+ "epoch": 0.9714285714285714,
+ "grad_norm": 1.0831843614578247,
+ "learning_rate": 0.0002855142857142857,
+ "loss": 0.2103,
+ "step": 170
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 1.3750063180923462,
+ "learning_rate": 0.00028508571428571426,
+ "loss": 0.2396,
+ "step": 175
+ },
+ {
+ "epoch": 1.0285714285714285,
+ "grad_norm": 0.8338337540626526,
+ "learning_rate": 0.00028465714285714285,
+ "loss": 0.2404,
+ "step": 180
+ },
+ {
+ "epoch": 1.0571428571428572,
+ "grad_norm": 1.2879024744033813,
+ "learning_rate": 0.0002842285714285714,
+ "loss": 0.2117,
+ "step": 185
+ },
+ {
+ "epoch": 1.0857142857142856,
+ "grad_norm": 1.6751821041107178,
+ "learning_rate": 0.00028379999999999996,
+ "loss": 0.1796,
+ "step": 190
+ },
+ {
+ "epoch": 1.1142857142857143,
+ "grad_norm": 0.9864417910575867,
+ "learning_rate": 0.00028337142857142854,
+ "loss": 0.1993,
+ "step": 195
+ },
+ {
+ "epoch": 1.1428571428571428,
+ "grad_norm": 1.0174155235290527,
+ "learning_rate": 0.00028294285714285713,
+ "loss": 0.2068,
+ "step": 200
+ },
+ {
+ "epoch": 1.1714285714285715,
+ "grad_norm": 1.029832124710083,
+ "learning_rate": 0.0002825142857142857,
+ "loss": 0.2015,
+ "step": 205
+ },
+ {
+ "epoch": 1.2,
+ "grad_norm": 0.7745446562767029,
+ "learning_rate": 0.00028208571428571424,
+ "loss": 0.2129,
+ "step": 210
+ },
+ {
+ "epoch": 1.2285714285714286,
+ "grad_norm": 2.5578622817993164,
+ "learning_rate": 0.0002816571428571428,
+ "loss": 0.2224,
+ "step": 215
+ },
+ {
+ "epoch": 1.2571428571428571,
+ "grad_norm": 2.4185051918029785,
+ "learning_rate": 0.0002812285714285714,
+ "loss": 0.2276,
+ "step": 220
+ },
+ {
+ "epoch": 1.2857142857142856,
+ "grad_norm": 1.4176461696624756,
+ "learning_rate": 0.0002808,
+ "loss": 0.1781,
+ "step": 225
+ },
+ {
+ "epoch": 1.3142857142857143,
+ "grad_norm": 0.709326982498169,
+ "learning_rate": 0.0002803714285714286,
+ "loss": 0.2177,
+ "step": 230
+ },
+ {
+ "epoch": 1.342857142857143,
+ "grad_norm": 0.8170766830444336,
+ "learning_rate": 0.0002799428571428571,
+ "loss": 0.1769,
+ "step": 235
+ },
+ {
+ "epoch": 1.3714285714285714,
+ "grad_norm": 1.3850761651992798,
+ "learning_rate": 0.0002795142857142857,
+ "loss": 0.2262,
+ "step": 240
+ },
+ {
+ "epoch": 1.4,
+ "grad_norm": 1.0064373016357422,
+ "learning_rate": 0.0002790857142857143,
+ "loss": 0.196,
+ "step": 245
+ },
+ {
+ "epoch": 1.4285714285714286,
+ "grad_norm": 1.9635728597640991,
+ "learning_rate": 0.0002786571428571428,
+ "loss": 0.2029,
+ "step": 250
+ },
+ {
+ "epoch": 1.457142857142857,
+ "grad_norm": 16.20791244506836,
+ "learning_rate": 0.0002782285714285714,
+ "loss": 0.3925,
+ "step": 255
+ },
+ {
+ "epoch": 1.4857142857142858,
+ "grad_norm": 1.4363322257995605,
+ "learning_rate": 0.0002778,
+ "loss": 0.3684,
+ "step": 260
+ },
+ {
+ "epoch": 1.5142857142857142,
+ "grad_norm": 0.9379534721374512,
+ "learning_rate": 0.00027737142857142856,
+ "loss": 0.2265,
+ "step": 265
+ },
+ {
+ "epoch": 1.5428571428571427,
+ "grad_norm": 0.8453512787818909,
+ "learning_rate": 0.00027694285714285714,
+ "loss": 0.1976,
+ "step": 270
+ },
+ {
+ "epoch": 1.5714285714285714,
+ "grad_norm": 2.316664695739746,
+ "learning_rate": 0.0002765142857142857,
+ "loss": 0.23,
+ "step": 275
+ },
+ {
+ "epoch": 1.6,
+ "grad_norm": 1.0548444986343384,
+ "learning_rate": 0.00027608571428571426,
+ "loss": 0.1823,
+ "step": 280
+ },
+ {
+ "epoch": 1.6285714285714286,
+ "grad_norm": 3.7894928455352783,
+ "learning_rate": 0.00027565714285714284,
+ "loss": 0.1962,
+ "step": 285
+ },
+ {
+ "epoch": 1.657142857142857,
+ "grad_norm": 2.3081610202789307,
+ "learning_rate": 0.00027522857142857143,
+ "loss": 0.2087,
+ "step": 290
+ },
+ {
+ "epoch": 1.6857142857142857,
+ "grad_norm": 0.9311438202857971,
+ "learning_rate": 0.0002748,
+ "loss": 0.1597,
+ "step": 295
+ },
+ {
+ "epoch": 1.7142857142857144,
+ "grad_norm": 1.1881247758865356,
+ "learning_rate": 0.00027437142857142854,
+ "loss": 0.1764,
+ "step": 300
+ },
+ {
+ "epoch": 1.7428571428571429,
+ "grad_norm": 1.30265212059021,
+ "learning_rate": 0.0002739428571428571,
+ "loss": 0.1647,
+ "step": 305
+ },
+ {
+ "epoch": 1.7714285714285714,
+ "grad_norm": 0.6832175850868225,
+ "learning_rate": 0.0002735142857142857,
+ "loss": 0.1638,
+ "step": 310
+ },
+ {
+ "epoch": 1.8,
+ "grad_norm": 1.8740538358688354,
+ "learning_rate": 0.00027308571428571424,
+ "loss": 0.1803,
+ "step": 315
+ },
+ {
+ "epoch": 1.8285714285714287,
+ "grad_norm": 9.821504592895508,
+ "learning_rate": 0.0002726571428571428,
+ "loss": 0.226,
+ "step": 320
+ },
+ {
+ "epoch": 1.8571428571428572,
+ "grad_norm": 1.0889750719070435,
+ "learning_rate": 0.0002722285714285714,
+ "loss": 0.1822,
+ "step": 325
+ },
+ {
+ "epoch": 1.8857142857142857,
+ "grad_norm": 0.9660868048667908,
+ "learning_rate": 0.0002718,
+ "loss": 0.1842,
+ "step": 330
+ },
+ {
+ "epoch": 1.9142857142857141,
+ "grad_norm": 0.6329234838485718,
+ "learning_rate": 0.0002713714285714286,
+ "loss": 0.1488,
+ "step": 335
+ },
+ {
+ "epoch": 1.9428571428571428,
+ "grad_norm": 3.601266384124756,
+ "learning_rate": 0.0002709428571428571,
+ "loss": 0.1887,
+ "step": 340
+ },
+ {
+ "epoch": 1.9714285714285715,
+ "grad_norm": 1.1441439390182495,
+ "learning_rate": 0.0002705142857142857,
+ "loss": 0.184,
+ "step": 345
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.8586034774780273,
+ "learning_rate": 0.0002700857142857143,
+ "loss": 0.1578,
+ "step": 350
+ },
+ {
+ "epoch": 2.0285714285714285,
+ "grad_norm": 1.5113487243652344,
+ "learning_rate": 0.00026965714285714286,
+ "loss": 0.2002,
+ "step": 355
+ },
+ {
+ "epoch": 2.057142857142857,
+ "grad_norm": 1.1123011112213135,
+ "learning_rate": 0.0002692285714285714,
+ "loss": 0.1946,
+ "step": 360
+ },
+ {
+ "epoch": 2.085714285714286,
+ "grad_norm": 0.9377036094665527,
+ "learning_rate": 0.0002688,
+ "loss": 0.1971,
+ "step": 365
+ },
+ {
+ "epoch": 2.1142857142857143,
+ "grad_norm": 0.6956892609596252,
+ "learning_rate": 0.00026837142857142856,
+ "loss": 0.1758,
+ "step": 370
+ },
+ {
+ "epoch": 2.142857142857143,
+ "grad_norm": 0.7510782480239868,
+ "learning_rate": 0.0002679428571428571,
+ "loss": 0.1674,
+ "step": 375
+ },
+ {
+ "epoch": 2.1714285714285713,
+ "grad_norm": 0.7009285092353821,
+ "learning_rate": 0.00026751428571428567,
+ "loss": 0.1945,
+ "step": 380
+ },
+ {
+ "epoch": 2.2,
+ "grad_norm": 0.9555609822273254,
+ "learning_rate": 0.00026708571428571426,
+ "loss": 0.1857,
+ "step": 385
+ },
+ {
+ "epoch": 2.2285714285714286,
+ "grad_norm": 2.133979082107544,
+ "learning_rate": 0.00026665714285714284,
+ "loss": 0.1636,
+ "step": 390
+ },
+ {
+ "epoch": 2.257142857142857,
+ "grad_norm": 0.7105309963226318,
+ "learning_rate": 0.0002662285714285714,
+ "loss": 0.2014,
+ "step": 395
+ },
+ {
+ "epoch": 2.2857142857142856,
+ "grad_norm": 0.7329701781272888,
+ "learning_rate": 0.00026579999999999996,
+ "loss": 0.1884,
+ "step": 400
+ },
+ {
+ "epoch": 2.314285714285714,
+ "grad_norm": 1.0426994562149048,
+ "learning_rate": 0.00026537142857142854,
+ "loss": 0.1558,
+ "step": 405
+ },
+ {
+ "epoch": 2.342857142857143,
+ "grad_norm": 0.9306122660636902,
+ "learning_rate": 0.0002649428571428571,
+ "loss": 0.1774,
+ "step": 410
+ },
+ {
+ "epoch": 2.3714285714285714,
+ "grad_norm": 0.6989394426345825,
+ "learning_rate": 0.00026451428571428565,
+ "loss": 0.1601,
+ "step": 415
+ },
+ {
+ "epoch": 2.4,
+ "grad_norm": 1.4383760690689087,
+ "learning_rate": 0.0002640857142857143,
+ "loss": 0.1564,
+ "step": 420
+ },
+ {
+ "epoch": 2.4285714285714284,
+ "grad_norm": 0.6448336839675903,
+ "learning_rate": 0.0002636571428571428,
+ "loss": 0.1827,
+ "step": 425
+ },
+ {
+ "epoch": 2.4571428571428573,
+ "grad_norm": 0.9535760879516602,
+ "learning_rate": 0.0002632285714285714,
+ "loss": 0.1713,
+ "step": 430
+ },
+ {
+ "epoch": 2.4857142857142858,
+ "grad_norm": 1.034945011138916,
+ "learning_rate": 0.0002628,
+ "loss": 0.1457,
+ "step": 435
+ },
+ {
+ "epoch": 2.5142857142857142,
+ "grad_norm": 1.3225128650665283,
+ "learning_rate": 0.0002623714285714285,
+ "loss": 0.1633,
+ "step": 440
+ },
+ {
+ "epoch": 2.5428571428571427,
+ "grad_norm": 0.8285059928894043,
+ "learning_rate": 0.0002619428571428571,
+ "loss": 0.2004,
+ "step": 445
+ },
+ {
+ "epoch": 2.571428571428571,
+ "grad_norm": 0.773176908493042,
+ "learning_rate": 0.0002615142857142857,
+ "loss": 0.1641,
+ "step": 450
+ },
+ {
+ "epoch": 2.6,
+ "grad_norm": 0.7964853048324585,
+ "learning_rate": 0.0002610857142857143,
+ "loss": 0.1608,
+ "step": 455
+ },
+ {
+ "epoch": 2.6285714285714286,
+ "grad_norm": 1.0967328548431396,
+ "learning_rate": 0.00026065714285714286,
+ "loss": 0.1697,
+ "step": 460
+ },
+ {
+ "epoch": 2.657142857142857,
+ "grad_norm": 0.6462066173553467,
+ "learning_rate": 0.0002602285714285714,
+ "loss": 0.1512,
+ "step": 465
+ },
+ {
+ "epoch": 2.685714285714286,
+ "grad_norm": 0.8765937089920044,
+ "learning_rate": 0.00025979999999999997,
+ "loss": 0.1826,
+ "step": 470
+ },
+ {
+ "epoch": 2.7142857142857144,
+ "grad_norm": 1.2524124383926392,
+ "learning_rate": 0.00025937142857142856,
+ "loss": 0.1731,
+ "step": 475
+ },
+ {
+ "epoch": 2.742857142857143,
+ "grad_norm": 2.2982606887817383,
+ "learning_rate": 0.0002589428571428571,
+ "loss": 0.1852,
+ "step": 480
+ },
+ {
+ "epoch": 2.7714285714285714,
+ "grad_norm": 0.9989053010940552,
+ "learning_rate": 0.0002585142857142857,
+ "loss": 0.1791,
+ "step": 485
+ },
+ {
+ "epoch": 2.8,
+ "grad_norm": 0.772343635559082,
+ "learning_rate": 0.00025808571428571426,
+ "loss": 0.1862,
+ "step": 490
+ },
+ {
+ "epoch": 2.8285714285714287,
+ "grad_norm": 1.2101136445999146,
+ "learning_rate": 0.00025765714285714284,
+ "loss": 0.1806,
+ "step": 495
+ },
+ {
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.8010189533233643,
+ "learning_rate": 0.0002572285714285714,
+ "loss": 0.1842,
+ "step": 500
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 3500,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 20,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 0.0,
+ "train_batch_size": 200,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/glot-contrastive-final-lora/checkpoint-500/training_args.bin b/glot-contrastive-final-lora/checkpoint-500/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..504636d7db81a728b5309e791498748e16c3b8ec
--- /dev/null
+++ b/glot-contrastive-final-lora/checkpoint-500/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02a87dc6b2c67ad3df98065b9e8fa21d9d93cd2cb361c532cb83c8a37bdc81a3
+size 5777
diff --git a/glot-contrastive-final-lora/sentencepiece.bpe.model b/glot-contrastive-final-lora/sentencepiece.bpe.model
new file mode 100644
index 0000000000000000000000000000000000000000..41409bcc76133d6e92fdd5909093d24e32662ba0
--- /dev/null
+++ b/glot-contrastive-final-lora/sentencepiece.bpe.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a313a26470baedaede322622492f2a542aa41527ddc5d40de444e945ad3c613
+size 7658320
diff --git a/glot-contrastive-final-lora/special_tokens_map.json b/glot-contrastive-final-lora/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..68171d1ff68b731a33d119708476692c094a466b
--- /dev/null
+++ b/glot-contrastive-final-lora/special_tokens_map.json
@@ -0,0 +1,15 @@
+{
+ "bos_token": "",
+ "cls_token": "",
+ "eos_token": "",
+ "mask_token": {
+ "content": "",
+ "lstrip": true,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "",
+ "sep_token": "",
+ "unk_token": ""
+}
diff --git a/glot-contrastive-final-lora/tokenizer_config.json b/glot-contrastive-final-lora/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..2d4a02cfefad7bc6d993905187616acfc547f17d
--- /dev/null
+++ b/glot-contrastive-final-lora/tokenizer_config.json
@@ -0,0 +1,57 @@
+{
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "3": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "401144": {
+ "content": "",
+ "lstrip": true,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "",
+ "clean_up_tokenization_spaces": false,
+ "cls_token": "",
+ "eos_token": "",
+ "extra_special_tokens": {},
+ "mask_token": "",
+ "model_max_length": 512,
+ "pad_token": "",
+ "sep_token": "",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "XLMRobertaTokenizer",
+ "unk_token": "",
+ "use_fast": true
+}
diff --git a/glot-contrastive-final-lora/training_args.bin b/glot-contrastive-final-lora/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..504636d7db81a728b5309e791498748e16c3b8ec
--- /dev/null
+++ b/glot-contrastive-final-lora/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02a87dc6b2c67ad3df98065b9e8fa21d9d93cd2cb361c532cb83c8a37bdc81a3
+size 5777