diff --git a/glot-contrastive-final-lora/README.md b/glot-contrastive-final-lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9e142ce83a38d62751eee08226da0acc7c10eae5 --- /dev/null +++ b/glot-contrastive-final-lora/README.md @@ -0,0 +1,206 @@ +--- +base_model: ./glot-mlm-adapted +library_name: peft +tags: +- base_model:adapter:./glot-mlm-adapted +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/glot-contrastive-final-lora/adapter_config.json b/glot-contrastive-final-lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..846b0d8ef2ad7bc4b0b04decd5769b6250d7be73 --- /dev/null +++ b/glot-contrastive-final-lora/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./glot-mlm-adapted", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query", + "value" + ], + "target_parameters": null, + "task_type": "FEATURE_EXTRACTION", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/glot-contrastive-final-lora/adapter_model.safetensors b/glot-contrastive-final-lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dba4d5dd074dc3d6c4bc4d4f36793beac178e2c3 --- /dev/null +++ b/glot-contrastive-final-lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ba05d9cb007251d29a6f02fdd92f56fa1beb8f9e0676686472daf07c4e9f478 +size 2365824 diff --git a/glot-contrastive-final-lora/checkpoint-1000/README.md b/glot-contrastive-final-lora/checkpoint-1000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9e142ce83a38d62751eee08226da0acc7c10eae5 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-1000/README.md @@ -0,0 +1,206 @@ +--- +base_model: ./glot-mlm-adapted +library_name: peft +tags: +- base_model:adapter:./glot-mlm-adapted +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/glot-contrastive-final-lora/checkpoint-1000/adapter_config.json b/glot-contrastive-final-lora/checkpoint-1000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..846b0d8ef2ad7bc4b0b04decd5769b6250d7be73 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-1000/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./glot-mlm-adapted", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query", + "value" + ], + "target_parameters": null, + "task_type": "FEATURE_EXTRACTION", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/glot-contrastive-final-lora/checkpoint-1000/adapter_model.safetensors b/glot-contrastive-final-lora/checkpoint-1000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8c6f622d6f49815caea6fb659ebe020c89f378ea --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-1000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a256288c756ea19f50dedef87a2b9786971da4a587298a49f74d6f7686b0572 +size 2365824 diff --git a/glot-contrastive-final-lora/checkpoint-1000/optimizer.pt b/glot-contrastive-final-lora/checkpoint-1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1bc165023ccee4f4afe4d93551ef9e2f43dc826e --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:192e794a1d2f26b8d41b4ac6d8d1d65d67318fcf2e777ca7722bceabd58f6fb6 +size 4760395 diff --git a/glot-contrastive-final-lora/checkpoint-1000/rng_state.pth b/glot-contrastive-final-lora/checkpoint-1000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3fc18f310321470d8a0ba51339a5c8840edcb27d --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-1000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbe2c991b46d5f8c63a3e3c3773a3bf7d45c1bcb99de1418411217d641560e12 +size 14645 diff --git a/glot-contrastive-final-lora/checkpoint-1000/scheduler.pt b/glot-contrastive-final-lora/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..39d88f920629c8004eb7888895c9b25772a0f6f1 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce27a0a34a8d4759fb8e422039bad599131740613f03bb839ba3688bec3369a7 +size 1465 diff --git a/glot-contrastive-final-lora/checkpoint-1000/sentencepiece.bpe.model b/glot-contrastive-final-lora/checkpoint-1000/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..41409bcc76133d6e92fdd5909093d24e32662ba0 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-1000/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a313a26470baedaede322622492f2a542aa41527ddc5d40de444e945ad3c613 +size 7658320 diff --git a/glot-contrastive-final-lora/checkpoint-1000/special_tokens_map.json b/glot-contrastive-final-lora/checkpoint-1000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..68171d1ff68b731a33d119708476692c094a466b --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-1000/special_tokens_map.json @@ -0,0 +1,15 @@ +{ + "bos_token": "", + "cls_token": "", + "eos_token": "", + "mask_token": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "sep_token": "", + "unk_token": "" +} diff --git a/glot-contrastive-final-lora/checkpoint-1000/tokenizer_config.json b/glot-contrastive-final-lora/checkpoint-1000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2d4a02cfefad7bc6d993905187616acfc547f17d --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-1000/tokenizer_config.json @@ -0,0 +1,57 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "401144": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "cls_token": "", + "eos_token": "", + "extra_special_tokens": {}, + "mask_token": "", + "model_max_length": 512, + "pad_token": "", + "sep_token": "", + "sp_model_kwargs": {}, + "tokenizer_class": "XLMRobertaTokenizer", + "unk_token": "", + "use_fast": true +} diff --git a/glot-contrastive-final-lora/checkpoint-1000/trainer_state.json b/glot-contrastive-final-lora/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4a8ba71f054efc1fce20afc03cc54fb58a0a853e --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-1000/trainer_state.json @@ -0,0 +1,1434 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.714285714285714, + "eval_steps": 5, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02857142857142857, + "grad_norm": 0.1407003551721573, + "learning_rate": 0.00029965714285714283, + "loss": 0.9726, + "step": 5 + }, + { + "epoch": 0.05714285714285714, + "grad_norm": 0.26689061522483826, + "learning_rate": 0.0002992285714285714, + "loss": 0.9633, + "step": 10 + }, + { + "epoch": 0.08571428571428572, + "grad_norm": 0.8670485615730286, + "learning_rate": 0.0002988, + "loss": 0.9013, + "step": 15 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 0.9785467386245728, + "learning_rate": 0.00029837142857142853, + "loss": 0.6942, + "step": 20 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 1.3083932399749756, + "learning_rate": 0.0002979428571428571, + "loss": 0.4472, + "step": 25 + }, + { + "epoch": 0.17142857142857143, + "grad_norm": 1.6103293895721436, + "learning_rate": 0.0002975142857142857, + "loss": 0.3782, + "step": 30 + }, + { + "epoch": 0.2, + "grad_norm": 2.6353416442871094, + "learning_rate": 0.0002970857142857143, + "loss": 0.3732, + "step": 35 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 0.9949072003364563, + "learning_rate": 0.0002966571428571428, + "loss": 0.3506, + "step": 40 + }, + { + "epoch": 0.2571428571428571, + "grad_norm": 1.280673861503601, + "learning_rate": 0.0002962285714285714, + "loss": 0.3346, + "step": 45 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.7681456208229065, + "learning_rate": 0.0002958, + "loss": 0.2832, + "step": 50 + }, + { + "epoch": 0.3142857142857143, + "grad_norm": 1.0000813007354736, + "learning_rate": 0.0002953714285714285, + "loss": 0.2603, + "step": 55 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 1.0222399234771729, + "learning_rate": 0.0002949428571428571, + "loss": 0.2507, + "step": 60 + }, + { + "epoch": 0.37142857142857144, + "grad_norm": 0.896902322769165, + "learning_rate": 0.0002945142857142857, + "loss": 0.2556, + "step": 65 + }, + { + "epoch": 0.4, + "grad_norm": 0.9035541415214539, + "learning_rate": 0.00029408571428571426, + "loss": 0.2402, + "step": 70 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 1.4886469841003418, + "learning_rate": 0.00029365714285714285, + "loss": 0.2376, + "step": 75 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.8951187133789062, + "learning_rate": 0.0002932285714285714, + "loss": 0.2276, + "step": 80 + }, + { + "epoch": 0.4857142857142857, + "grad_norm": 0.7876377105712891, + "learning_rate": 0.00029279999999999996, + "loss": 0.2537, + "step": 85 + }, + { + "epoch": 0.5142857142857142, + "grad_norm": 1.0927226543426514, + "learning_rate": 0.00029237142857142855, + "loss": 0.2152, + "step": 90 + }, + { + "epoch": 0.5428571428571428, + "grad_norm": 1.4946355819702148, + "learning_rate": 0.00029194285714285713, + "loss": 0.2441, + "step": 95 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.7082991600036621, + "learning_rate": 0.0002915142857142857, + "loss": 0.2708, + "step": 100 + }, + { + "epoch": 0.6, + "grad_norm": 0.670010507106781, + "learning_rate": 0.00029108571428571424, + "loss": 0.2396, + "step": 105 + }, + { + "epoch": 0.6285714285714286, + "grad_norm": 0.9797312021255493, + "learning_rate": 0.00029065714285714283, + "loss": 0.2275, + "step": 110 + }, + { + "epoch": 0.6571428571428571, + "grad_norm": 1.5220463275909424, + "learning_rate": 0.0002902285714285714, + "loss": 0.2114, + "step": 115 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 1.3326867818832397, + "learning_rate": 0.00028979999999999994, + "loss": 0.241, + "step": 120 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 1.1195529699325562, + "learning_rate": 0.0002893714285714285, + "loss": 0.2389, + "step": 125 + }, + { + "epoch": 0.7428571428571429, + "grad_norm": 0.7551061511039734, + "learning_rate": 0.0002889428571428571, + "loss": 0.2162, + "step": 130 + }, + { + "epoch": 0.7714285714285715, + "grad_norm": 1.018908977508545, + "learning_rate": 0.0002885142857142857, + "loss": 0.1924, + "step": 135 + }, + { + "epoch": 0.8, + "grad_norm": 2.123642921447754, + "learning_rate": 0.0002880857142857143, + "loss": 0.2174, + "step": 140 + }, + { + "epoch": 0.8285714285714286, + "grad_norm": 0.7585068941116333, + "learning_rate": 0.0002876571428571428, + "loss": 0.2006, + "step": 145 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 1.64150869846344, + "learning_rate": 0.0002872285714285714, + "loss": 0.1905, + "step": 150 + }, + { + "epoch": 0.8857142857142857, + "grad_norm": 0.9126951694488525, + "learning_rate": 0.0002868, + "loss": 0.2312, + "step": 155 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 0.7278801202774048, + "learning_rate": 0.00028637142857142856, + "loss": 0.2077, + "step": 160 + }, + { + "epoch": 0.9428571428571428, + "grad_norm": 0.8931339383125305, + "learning_rate": 0.00028594285714285715, + "loss": 0.1951, + "step": 165 + }, + { + "epoch": 0.9714285714285714, + "grad_norm": 1.0831843614578247, + "learning_rate": 0.0002855142857142857, + "loss": 0.2103, + "step": 170 + }, + { + "epoch": 1.0, + "grad_norm": 1.3750063180923462, + "learning_rate": 0.00028508571428571426, + "loss": 0.2396, + "step": 175 + }, + { + "epoch": 1.0285714285714285, + "grad_norm": 0.8338337540626526, + "learning_rate": 0.00028465714285714285, + "loss": 0.2404, + "step": 180 + }, + { + "epoch": 1.0571428571428572, + "grad_norm": 1.2879024744033813, + "learning_rate": 0.0002842285714285714, + "loss": 0.2117, + "step": 185 + }, + { + "epoch": 1.0857142857142856, + "grad_norm": 1.6751821041107178, + "learning_rate": 0.00028379999999999996, + "loss": 0.1796, + "step": 190 + }, + { + "epoch": 1.1142857142857143, + "grad_norm": 0.9864417910575867, + "learning_rate": 0.00028337142857142854, + "loss": 0.1993, + "step": 195 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 1.0174155235290527, + "learning_rate": 0.00028294285714285713, + "loss": 0.2068, + "step": 200 + }, + { + "epoch": 1.1714285714285715, + "grad_norm": 1.029832124710083, + "learning_rate": 0.0002825142857142857, + "loss": 0.2015, + "step": 205 + }, + { + "epoch": 1.2, + "grad_norm": 0.7745446562767029, + "learning_rate": 0.00028208571428571424, + "loss": 0.2129, + "step": 210 + }, + { + "epoch": 1.2285714285714286, + "grad_norm": 2.5578622817993164, + "learning_rate": 0.0002816571428571428, + "loss": 0.2224, + "step": 215 + }, + { + "epoch": 1.2571428571428571, + "grad_norm": 2.4185051918029785, + "learning_rate": 0.0002812285714285714, + "loss": 0.2276, + "step": 220 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 1.4176461696624756, + "learning_rate": 0.0002808, + "loss": 0.1781, + "step": 225 + }, + { + "epoch": 1.3142857142857143, + "grad_norm": 0.709326982498169, + "learning_rate": 0.0002803714285714286, + "loss": 0.2177, + "step": 230 + }, + { + "epoch": 1.342857142857143, + "grad_norm": 0.8170766830444336, + "learning_rate": 0.0002799428571428571, + "loss": 0.1769, + "step": 235 + }, + { + "epoch": 1.3714285714285714, + "grad_norm": 1.3850761651992798, + "learning_rate": 0.0002795142857142857, + "loss": 0.2262, + "step": 240 + }, + { + "epoch": 1.4, + "grad_norm": 1.0064373016357422, + "learning_rate": 0.0002790857142857143, + "loss": 0.196, + "step": 245 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 1.9635728597640991, + "learning_rate": 0.0002786571428571428, + "loss": 0.2029, + "step": 250 + }, + { + "epoch": 1.457142857142857, + "grad_norm": 16.20791244506836, + "learning_rate": 0.0002782285714285714, + "loss": 0.3925, + "step": 255 + }, + { + "epoch": 1.4857142857142858, + "grad_norm": 1.4363322257995605, + "learning_rate": 0.0002778, + "loss": 0.3684, + "step": 260 + }, + { + "epoch": 1.5142857142857142, + "grad_norm": 0.9379534721374512, + "learning_rate": 0.00027737142857142856, + "loss": 0.2265, + "step": 265 + }, + { + "epoch": 1.5428571428571427, + "grad_norm": 0.8453512787818909, + "learning_rate": 0.00027694285714285714, + "loss": 0.1976, + "step": 270 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 2.316664695739746, + "learning_rate": 0.0002765142857142857, + "loss": 0.23, + "step": 275 + }, + { + "epoch": 1.6, + "grad_norm": 1.0548444986343384, + "learning_rate": 0.00027608571428571426, + "loss": 0.1823, + "step": 280 + }, + { + "epoch": 1.6285714285714286, + "grad_norm": 3.7894928455352783, + "learning_rate": 0.00027565714285714284, + "loss": 0.1962, + "step": 285 + }, + { + "epoch": 1.657142857142857, + "grad_norm": 2.3081610202789307, + "learning_rate": 0.00027522857142857143, + "loss": 0.2087, + "step": 290 + }, + { + "epoch": 1.6857142857142857, + "grad_norm": 0.9311438202857971, + "learning_rate": 0.0002748, + "loss": 0.1597, + "step": 295 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 1.1881247758865356, + "learning_rate": 0.00027437142857142854, + "loss": 0.1764, + "step": 300 + }, + { + "epoch": 1.7428571428571429, + "grad_norm": 1.30265212059021, + "learning_rate": 0.0002739428571428571, + "loss": 0.1647, + "step": 305 + }, + { + "epoch": 1.7714285714285714, + "grad_norm": 0.6832175850868225, + "learning_rate": 0.0002735142857142857, + "loss": 0.1638, + "step": 310 + }, + { + "epoch": 1.8, + "grad_norm": 1.8740538358688354, + "learning_rate": 0.00027308571428571424, + "loss": 0.1803, + "step": 315 + }, + { + "epoch": 1.8285714285714287, + "grad_norm": 9.821504592895508, + "learning_rate": 0.0002726571428571428, + "loss": 0.226, + "step": 320 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 1.0889750719070435, + "learning_rate": 0.0002722285714285714, + "loss": 0.1822, + "step": 325 + }, + { + "epoch": 1.8857142857142857, + "grad_norm": 0.9660868048667908, + "learning_rate": 0.0002718, + "loss": 0.1842, + "step": 330 + }, + { + "epoch": 1.9142857142857141, + "grad_norm": 0.6329234838485718, + "learning_rate": 0.0002713714285714286, + "loss": 0.1488, + "step": 335 + }, + { + "epoch": 1.9428571428571428, + "grad_norm": 3.601266384124756, + "learning_rate": 0.0002709428571428571, + "loss": 0.1887, + "step": 340 + }, + { + "epoch": 1.9714285714285715, + "grad_norm": 1.1441439390182495, + "learning_rate": 0.0002705142857142857, + "loss": 0.184, + "step": 345 + }, + { + "epoch": 2.0, + "grad_norm": 0.8586034774780273, + "learning_rate": 0.0002700857142857143, + "loss": 0.1578, + "step": 350 + }, + { + "epoch": 2.0285714285714285, + "grad_norm": 1.5113487243652344, + "learning_rate": 0.00026965714285714286, + "loss": 0.2002, + "step": 355 + }, + { + "epoch": 2.057142857142857, + "grad_norm": 1.1123011112213135, + "learning_rate": 0.0002692285714285714, + "loss": 0.1946, + "step": 360 + }, + { + "epoch": 2.085714285714286, + "grad_norm": 0.9377036094665527, + "learning_rate": 0.0002688, + "loss": 0.1971, + "step": 365 + }, + { + "epoch": 2.1142857142857143, + "grad_norm": 0.6956892609596252, + "learning_rate": 0.00026837142857142856, + "loss": 0.1758, + "step": 370 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.7510782480239868, + "learning_rate": 0.0002679428571428571, + "loss": 0.1674, + "step": 375 + }, + { + "epoch": 2.1714285714285713, + "grad_norm": 0.7009285092353821, + "learning_rate": 0.00026751428571428567, + "loss": 0.1945, + "step": 380 + }, + { + "epoch": 2.2, + "grad_norm": 0.9555609822273254, + "learning_rate": 0.00026708571428571426, + "loss": 0.1857, + "step": 385 + }, + { + "epoch": 2.2285714285714286, + "grad_norm": 2.133979082107544, + "learning_rate": 0.00026665714285714284, + "loss": 0.1636, + "step": 390 + }, + { + "epoch": 2.257142857142857, + "grad_norm": 0.7105309963226318, + "learning_rate": 0.0002662285714285714, + "loss": 0.2014, + "step": 395 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.7329701781272888, + "learning_rate": 0.00026579999999999996, + "loss": 0.1884, + "step": 400 + }, + { + "epoch": 2.314285714285714, + "grad_norm": 1.0426994562149048, + "learning_rate": 0.00026537142857142854, + "loss": 0.1558, + "step": 405 + }, + { + "epoch": 2.342857142857143, + "grad_norm": 0.9306122660636902, + "learning_rate": 0.0002649428571428571, + "loss": 0.1774, + "step": 410 + }, + { + "epoch": 2.3714285714285714, + "grad_norm": 0.6989394426345825, + "learning_rate": 0.00026451428571428565, + "loss": 0.1601, + "step": 415 + }, + { + "epoch": 2.4, + "grad_norm": 1.4383760690689087, + "learning_rate": 0.0002640857142857143, + "loss": 0.1564, + "step": 420 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 0.6448336839675903, + "learning_rate": 0.0002636571428571428, + "loss": 0.1827, + "step": 425 + }, + { + "epoch": 2.4571428571428573, + "grad_norm": 0.9535760879516602, + "learning_rate": 0.0002632285714285714, + "loss": 0.1713, + "step": 430 + }, + { + "epoch": 2.4857142857142858, + "grad_norm": 1.034945011138916, + "learning_rate": 0.0002628, + "loss": 0.1457, + "step": 435 + }, + { + "epoch": 2.5142857142857142, + "grad_norm": 1.3225128650665283, + "learning_rate": 0.0002623714285714285, + "loss": 0.1633, + "step": 440 + }, + { + "epoch": 2.5428571428571427, + "grad_norm": 0.8285059928894043, + "learning_rate": 0.0002619428571428571, + "loss": 0.2004, + "step": 445 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.773176908493042, + "learning_rate": 0.0002615142857142857, + "loss": 0.1641, + "step": 450 + }, + { + "epoch": 2.6, + "grad_norm": 0.7964853048324585, + "learning_rate": 0.0002610857142857143, + "loss": 0.1608, + "step": 455 + }, + { + "epoch": 2.6285714285714286, + "grad_norm": 1.0967328548431396, + "learning_rate": 0.00026065714285714286, + "loss": 0.1697, + "step": 460 + }, + { + "epoch": 2.657142857142857, + "grad_norm": 0.6462066173553467, + "learning_rate": 0.0002602285714285714, + "loss": 0.1512, + "step": 465 + }, + { + "epoch": 2.685714285714286, + "grad_norm": 0.8765937089920044, + "learning_rate": 0.00025979999999999997, + "loss": 0.1826, + "step": 470 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 1.2524124383926392, + "learning_rate": 0.00025937142857142856, + "loss": 0.1731, + "step": 475 + }, + { + "epoch": 2.742857142857143, + "grad_norm": 2.2982606887817383, + "learning_rate": 0.0002589428571428571, + "loss": 0.1852, + "step": 480 + }, + { + "epoch": 2.7714285714285714, + "grad_norm": 0.9989053010940552, + "learning_rate": 0.0002585142857142857, + "loss": 0.1791, + "step": 485 + }, + { + "epoch": 2.8, + "grad_norm": 0.772343635559082, + "learning_rate": 0.00025808571428571426, + "loss": 0.1862, + "step": 490 + }, + { + "epoch": 2.8285714285714287, + "grad_norm": 1.2101136445999146, + "learning_rate": 0.00025765714285714284, + "loss": 0.1806, + "step": 495 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.8010189533233643, + "learning_rate": 0.0002572285714285714, + "loss": 0.1842, + "step": 500 + }, + { + "epoch": 2.8857142857142857, + "grad_norm": 1.3597544431686401, + "learning_rate": 0.00025679999999999995, + "loss": 0.1583, + "step": 505 + }, + { + "epoch": 2.914285714285714, + "grad_norm": 0.8790671825408936, + "learning_rate": 0.00025637142857142854, + "loss": 0.1565, + "step": 510 + }, + { + "epoch": 2.942857142857143, + "grad_norm": 1.1175066232681274, + "learning_rate": 0.0002559428571428571, + "loss": 0.1406, + "step": 515 + }, + { + "epoch": 2.9714285714285715, + "grad_norm": 2.8528785705566406, + "learning_rate": 0.0002555142857142857, + "loss": 0.1735, + "step": 520 + }, + { + "epoch": 3.0, + "grad_norm": 2.2073328495025635, + "learning_rate": 0.0002550857142857143, + "loss": 0.1816, + "step": 525 + }, + { + "epoch": 3.0285714285714285, + "grad_norm": 11.01322078704834, + "learning_rate": 0.0002546571428571428, + "loss": 0.1873, + "step": 530 + }, + { + "epoch": 3.057142857142857, + "grad_norm": 1.5822402238845825, + "learning_rate": 0.0002542285714285714, + "loss": 0.168, + "step": 535 + }, + { + "epoch": 3.085714285714286, + "grad_norm": 1.3086942434310913, + "learning_rate": 0.0002538, + "loss": 0.149, + "step": 540 + }, + { + "epoch": 3.1142857142857143, + "grad_norm": 6.303041458129883, + "learning_rate": 0.0002533714285714285, + "loss": 0.1651, + "step": 545 + }, + { + "epoch": 3.142857142857143, + "grad_norm": 14.48929500579834, + "learning_rate": 0.00025294285714285716, + "loss": 0.1687, + "step": 550 + }, + { + "epoch": 3.1714285714285713, + "grad_norm": 6.824525356292725, + "learning_rate": 0.0002525142857142857, + "loss": 0.1919, + "step": 555 + }, + { + "epoch": 3.2, + "grad_norm": 18.772563934326172, + "learning_rate": 0.00025208571428571427, + "loss": 0.2075, + "step": 560 + }, + { + "epoch": 3.2285714285714286, + "grad_norm": 0.7268752455711365, + "learning_rate": 0.00025165714285714286, + "loss": 0.174, + "step": 565 + }, + { + "epoch": 3.257142857142857, + "grad_norm": 1.1301453113555908, + "learning_rate": 0.0002512285714285714, + "loss": 0.1668, + "step": 570 + }, + { + "epoch": 3.2857142857142856, + "grad_norm": 2.846802234649658, + "learning_rate": 0.00025079999999999997, + "loss": 0.1645, + "step": 575 + }, + { + "epoch": 3.314285714285714, + "grad_norm": 1.417515754699707, + "learning_rate": 0.00025037142857142855, + "loss": 0.1719, + "step": 580 + }, + { + "epoch": 3.342857142857143, + "grad_norm": 4.137150764465332, + "learning_rate": 0.00024994285714285714, + "loss": 0.1739, + "step": 585 + }, + { + "epoch": 3.3714285714285714, + "grad_norm": 2.6067259311676025, + "learning_rate": 0.0002495142857142857, + "loss": 0.1489, + "step": 590 + }, + { + "epoch": 3.4, + "grad_norm": 2.601024627685547, + "learning_rate": 0.00024908571428571425, + "loss": 0.1618, + "step": 595 + }, + { + "epoch": 3.4285714285714284, + "grad_norm": 3.849017858505249, + "learning_rate": 0.00024865714285714284, + "loss": 0.1899, + "step": 600 + }, + { + "epoch": 3.4571428571428573, + "grad_norm": 4.673766136169434, + "learning_rate": 0.0002482285714285714, + "loss": 0.1761, + "step": 605 + }, + { + "epoch": 3.4857142857142858, + "grad_norm": 2.6057631969451904, + "learning_rate": 0.00024779999999999995, + "loss": 0.1743, + "step": 610 + }, + { + "epoch": 3.5142857142857142, + "grad_norm": 2.932652473449707, + "learning_rate": 0.0002473714285714286, + "loss": 0.1482, + "step": 615 + }, + { + "epoch": 3.5428571428571427, + "grad_norm": 0.8764939308166504, + "learning_rate": 0.0002469428571428571, + "loss": 0.1644, + "step": 620 + }, + { + "epoch": 3.571428571428571, + "grad_norm": 1.3203191757202148, + "learning_rate": 0.0002465142857142857, + "loss": 0.1654, + "step": 625 + }, + { + "epoch": 3.6, + "grad_norm": 0.7977635264396667, + "learning_rate": 0.0002460857142857143, + "loss": 0.1472, + "step": 630 + }, + { + "epoch": 3.6285714285714286, + "grad_norm": 1.4750248193740845, + "learning_rate": 0.0002456571428571428, + "loss": 0.1735, + "step": 635 + }, + { + "epoch": 3.657142857142857, + "grad_norm": 1.8164482116699219, + "learning_rate": 0.0002452285714285714, + "loss": 0.1593, + "step": 640 + }, + { + "epoch": 3.685714285714286, + "grad_norm": 1.4829603433609009, + "learning_rate": 0.0002448, + "loss": 0.1508, + "step": 645 + }, + { + "epoch": 3.7142857142857144, + "grad_norm": 0.8828144669532776, + "learning_rate": 0.00024437142857142857, + "loss": 0.1573, + "step": 650 + }, + { + "epoch": 3.742857142857143, + "grad_norm": 2.039384126663208, + "learning_rate": 0.00024394285714285713, + "loss": 0.1745, + "step": 655 + }, + { + "epoch": 3.7714285714285714, + "grad_norm": 0.9604200720787048, + "learning_rate": 0.00024351428571428569, + "loss": 0.17, + "step": 660 + }, + { + "epoch": 3.8, + "grad_norm": 0.7903971076011658, + "learning_rate": 0.00024308571428571427, + "loss": 0.1654, + "step": 665 + }, + { + "epoch": 3.8285714285714287, + "grad_norm": 0.6935649514198303, + "learning_rate": 0.00024265714285714283, + "loss": 0.1714, + "step": 670 + }, + { + "epoch": 3.857142857142857, + "grad_norm": 0.5832012295722961, + "learning_rate": 0.00024222857142857138, + "loss": 0.1636, + "step": 675 + }, + { + "epoch": 3.8857142857142857, + "grad_norm": 0.6303168535232544, + "learning_rate": 0.0002418, + "loss": 0.1604, + "step": 680 + }, + { + "epoch": 3.914285714285714, + "grad_norm": 0.7210885882377625, + "learning_rate": 0.00024137142857142855, + "loss": 0.1444, + "step": 685 + }, + { + "epoch": 3.942857142857143, + "grad_norm": 0.7690990567207336, + "learning_rate": 0.00024094285714285714, + "loss": 0.1631, + "step": 690 + }, + { + "epoch": 3.9714285714285715, + "grad_norm": 1.0142720937728882, + "learning_rate": 0.0002405142857142857, + "loss": 0.158, + "step": 695 + }, + { + "epoch": 4.0, + "grad_norm": 0.7970322966575623, + "learning_rate": 0.00024008571428571425, + "loss": 0.1803, + "step": 700 + }, + { + "epoch": 4.0285714285714285, + "grad_norm": 0.6795914769172668, + "learning_rate": 0.00023965714285714284, + "loss": 0.143, + "step": 705 + }, + { + "epoch": 4.057142857142857, + "grad_norm": 0.6832629442214966, + "learning_rate": 0.0002392285714285714, + "loss": 0.1457, + "step": 710 + }, + { + "epoch": 4.085714285714285, + "grad_norm": 3.8629798889160156, + "learning_rate": 0.0002388, + "loss": 0.1671, + "step": 715 + }, + { + "epoch": 4.114285714285714, + "grad_norm": 1.1167882680892944, + "learning_rate": 0.00023837142857142856, + "loss": 0.1544, + "step": 720 + }, + { + "epoch": 4.142857142857143, + "grad_norm": 0.9431412816047668, + "learning_rate": 0.00023794285714285712, + "loss": 0.1605, + "step": 725 + }, + { + "epoch": 4.171428571428572, + "grad_norm": 1.310948133468628, + "learning_rate": 0.0002375142857142857, + "loss": 0.1121, + "step": 730 + }, + { + "epoch": 4.2, + "grad_norm": 0.9830737709999084, + "learning_rate": 0.00023708571428571426, + "loss": 0.1742, + "step": 735 + }, + { + "epoch": 4.228571428571429, + "grad_norm": 0.6166555881500244, + "learning_rate": 0.00023665714285714282, + "loss": 0.1525, + "step": 740 + }, + { + "epoch": 4.257142857142857, + "grad_norm": 0.995579719543457, + "learning_rate": 0.00023622857142857143, + "loss": 0.1439, + "step": 745 + }, + { + "epoch": 4.285714285714286, + "grad_norm": 0.639796793460846, + "learning_rate": 0.00023579999999999999, + "loss": 0.1692, + "step": 750 + }, + { + "epoch": 4.314285714285714, + "grad_norm": 0.9438050389289856, + "learning_rate": 0.00023537142857142854, + "loss": 0.1785, + "step": 755 + }, + { + "epoch": 4.3428571428571425, + "grad_norm": 0.8960750102996826, + "learning_rate": 0.00023494285714285713, + "loss": 0.1557, + "step": 760 + }, + { + "epoch": 4.371428571428572, + "grad_norm": 0.6287499070167542, + "learning_rate": 0.00023451428571428568, + "loss": 0.1459, + "step": 765 + }, + { + "epoch": 4.4, + "grad_norm": 0.7638295888900757, + "learning_rate": 0.00023408571428571424, + "loss": 0.1341, + "step": 770 + }, + { + "epoch": 4.428571428571429, + "grad_norm": 0.655878484249115, + "learning_rate": 0.00023365714285714283, + "loss": 0.1358, + "step": 775 + }, + { + "epoch": 4.457142857142857, + "grad_norm": 0.5840997695922852, + "learning_rate": 0.0002332285714285714, + "loss": 0.1386, + "step": 780 + }, + { + "epoch": 4.485714285714286, + "grad_norm": 1.1082488298416138, + "learning_rate": 0.0002328, + "loss": 0.1827, + "step": 785 + }, + { + "epoch": 4.514285714285714, + "grad_norm": 0.8825240135192871, + "learning_rate": 0.00023237142857142855, + "loss": 0.1527, + "step": 790 + }, + { + "epoch": 4.542857142857143, + "grad_norm": 0.6752304434776306, + "learning_rate": 0.0002319428571428571, + "loss": 0.1392, + "step": 795 + }, + { + "epoch": 4.571428571428571, + "grad_norm": 1.1423301696777344, + "learning_rate": 0.0002315142857142857, + "loss": 0.1433, + "step": 800 + }, + { + "epoch": 4.6, + "grad_norm": 10.793691635131836, + "learning_rate": 0.00023108571428571425, + "loss": 0.1635, + "step": 805 + }, + { + "epoch": 4.628571428571428, + "grad_norm": 0.47564294934272766, + "learning_rate": 0.00023065714285714286, + "loss": 0.1199, + "step": 810 + }, + { + "epoch": 4.6571428571428575, + "grad_norm": 1.2492656707763672, + "learning_rate": 0.00023022857142857142, + "loss": 0.1488, + "step": 815 + }, + { + "epoch": 4.685714285714286, + "grad_norm": 0.6933501958847046, + "learning_rate": 0.00022979999999999997, + "loss": 0.1812, + "step": 820 + }, + { + "epoch": 4.714285714285714, + "grad_norm": 0.7901633977890015, + "learning_rate": 0.00022937142857142856, + "loss": 0.1415, + "step": 825 + }, + { + "epoch": 4.742857142857143, + "grad_norm": 0.7854829430580139, + "learning_rate": 0.00022894285714285712, + "loss": 0.1401, + "step": 830 + }, + { + "epoch": 4.771428571428571, + "grad_norm": 0.8716740608215332, + "learning_rate": 0.00022851428571428567, + "loss": 0.1982, + "step": 835 + }, + { + "epoch": 4.8, + "grad_norm": 0.7047899961471558, + "learning_rate": 0.00022808571428571426, + "loss": 0.1624, + "step": 840 + }, + { + "epoch": 4.828571428571428, + "grad_norm": 0.7134959697723389, + "learning_rate": 0.00022765714285714284, + "loss": 0.1375, + "step": 845 + }, + { + "epoch": 4.857142857142857, + "grad_norm": 1.0897325277328491, + "learning_rate": 0.00022722857142857143, + "loss": 0.1489, + "step": 850 + }, + { + "epoch": 4.885714285714286, + "grad_norm": 1.1065207719802856, + "learning_rate": 0.00022679999999999998, + "loss": 0.1495, + "step": 855 + }, + { + "epoch": 4.914285714285715, + "grad_norm": 0.7434757351875305, + "learning_rate": 0.00022637142857142854, + "loss": 0.1507, + "step": 860 + }, + { + "epoch": 4.942857142857143, + "grad_norm": 1.0045181512832642, + "learning_rate": 0.00022594285714285712, + "loss": 0.1527, + "step": 865 + }, + { + "epoch": 4.9714285714285715, + "grad_norm": 1.2025654315948486, + "learning_rate": 0.00022551428571428568, + "loss": 0.1523, + "step": 870 + }, + { + "epoch": 5.0, + "grad_norm": 0.7823342084884644, + "learning_rate": 0.0002250857142857143, + "loss": 0.1514, + "step": 875 + }, + { + "epoch": 5.0285714285714285, + "grad_norm": 0.8405362963676453, + "learning_rate": 0.00022465714285714285, + "loss": 0.1461, + "step": 880 + }, + { + "epoch": 5.057142857142857, + "grad_norm": 0.7527463436126709, + "learning_rate": 0.0002242285714285714, + "loss": 0.1206, + "step": 885 + }, + { + "epoch": 5.085714285714285, + "grad_norm": 0.8372548222541809, + "learning_rate": 0.0002238, + "loss": 0.1513, + "step": 890 + }, + { + "epoch": 5.114285714285714, + "grad_norm": 0.8755456209182739, + "learning_rate": 0.00022337142857142855, + "loss": 0.1498, + "step": 895 + }, + { + "epoch": 5.142857142857143, + "grad_norm": 0.7312084436416626, + "learning_rate": 0.0002229428571428571, + "loss": 0.154, + "step": 900 + }, + { + "epoch": 5.171428571428572, + "grad_norm": 0.6366221904754639, + "learning_rate": 0.0002225142857142857, + "loss": 0.1466, + "step": 905 + }, + { + "epoch": 5.2, + "grad_norm": 0.6406880617141724, + "learning_rate": 0.00022208571428571427, + "loss": 0.1254, + "step": 910 + }, + { + "epoch": 5.228571428571429, + "grad_norm": 2.4106833934783936, + "learning_rate": 0.00022165714285714283, + "loss": 0.1534, + "step": 915 + }, + { + "epoch": 5.257142857142857, + "grad_norm": 0.5635722279548645, + "learning_rate": 0.00022122857142857142, + "loss": 0.1461, + "step": 920 + }, + { + "epoch": 5.285714285714286, + "grad_norm": 0.787162184715271, + "learning_rate": 0.00022079999999999997, + "loss": 0.1424, + "step": 925 + }, + { + "epoch": 5.314285714285714, + "grad_norm": 0.6513975262641907, + "learning_rate": 0.00022037142857142853, + "loss": 0.1326, + "step": 930 + }, + { + "epoch": 5.3428571428571425, + "grad_norm": 0.6933534741401672, + "learning_rate": 0.00021994285714285711, + "loss": 0.1661, + "step": 935 + }, + { + "epoch": 5.371428571428572, + "grad_norm": 0.7263259887695312, + "learning_rate": 0.0002195142857142857, + "loss": 0.15, + "step": 940 + }, + { + "epoch": 5.4, + "grad_norm": 0.5537381768226624, + "learning_rate": 0.00021908571428571428, + "loss": 0.129, + "step": 945 + }, + { + "epoch": 5.428571428571429, + "grad_norm": 0.6014005541801453, + "learning_rate": 0.00021865714285714284, + "loss": 0.1321, + "step": 950 + }, + { + "epoch": 5.457142857142857, + "grad_norm": 0.6581441760063171, + "learning_rate": 0.0002182285714285714, + "loss": 0.1587, + "step": 955 + }, + { + "epoch": 5.485714285714286, + "grad_norm": 0.9326379895210266, + "learning_rate": 0.00021779999999999998, + "loss": 0.1654, + "step": 960 + }, + { + "epoch": 5.514285714285714, + "grad_norm": 0.9438592791557312, + "learning_rate": 0.00021737142857142854, + "loss": 0.1212, + "step": 965 + }, + { + "epoch": 5.542857142857143, + "grad_norm": 0.7699571251869202, + "learning_rate": 0.00021694285714285715, + "loss": 0.1464, + "step": 970 + }, + { + "epoch": 5.571428571428571, + "grad_norm": 0.8758366703987122, + "learning_rate": 0.0002165142857142857, + "loss": 0.1599, + "step": 975 + }, + { + "epoch": 5.6, + "grad_norm": 0.6101442575454712, + "learning_rate": 0.00021608571428571426, + "loss": 0.1589, + "step": 980 + }, + { + "epoch": 5.628571428571428, + "grad_norm": 0.7454060912132263, + "learning_rate": 0.00021565714285714285, + "loss": 0.1433, + "step": 985 + }, + { + "epoch": 5.6571428571428575, + "grad_norm": 0.6379484534263611, + "learning_rate": 0.0002152285714285714, + "loss": 0.1592, + "step": 990 + }, + { + "epoch": 5.685714285714286, + "grad_norm": 1.1601309776306152, + "learning_rate": 0.00021479999999999996, + "loss": 0.1647, + "step": 995 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 0.5464673638343811, + "learning_rate": 0.00021437142857142855, + "loss": 0.1469, + "step": 1000 + } + ], + "logging_steps": 5, + "max_steps": 3500, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 200, + "trial_name": null, + "trial_params": null +} diff --git a/glot-contrastive-final-lora/checkpoint-1000/training_args.bin b/glot-contrastive-final-lora/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..504636d7db81a728b5309e791498748e16c3b8ec --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02a87dc6b2c67ad3df98065b9e8fa21d9d93cd2cb361c532cb83c8a37bdc81a3 +size 5777 diff --git a/glot-contrastive-final-lora/checkpoint-1500/README.md b/glot-contrastive-final-lora/checkpoint-1500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9e142ce83a38d62751eee08226da0acc7c10eae5 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-1500/README.md @@ -0,0 +1,206 @@ +--- +base_model: ./glot-mlm-adapted +library_name: peft +tags: +- base_model:adapter:./glot-mlm-adapted +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/glot-contrastive-final-lora/checkpoint-1500/adapter_config.json b/glot-contrastive-final-lora/checkpoint-1500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..846b0d8ef2ad7bc4b0b04decd5769b6250d7be73 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-1500/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./glot-mlm-adapted", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query", + "value" + ], + "target_parameters": null, + "task_type": "FEATURE_EXTRACTION", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/glot-contrastive-final-lora/checkpoint-1500/adapter_model.safetensors b/glot-contrastive-final-lora/checkpoint-1500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..34730efaa5567fa6056cd05fb71e8f8aa4bcdf15 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-1500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6413fb7d01f4b21da6e461dea0648d8d88fd37d6bd7c099ca98b3253cf62a00 +size 2365824 diff --git a/glot-contrastive-final-lora/checkpoint-1500/optimizer.pt b/glot-contrastive-final-lora/checkpoint-1500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..77d1cadcfde59acc8da9ebd6747c3ca5c6223db8 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-1500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36773257ecda9472e5ab320c80e2afdec9be64091b43e5bcbc53455be6b8149d +size 4760395 diff --git a/glot-contrastive-final-lora/checkpoint-1500/rng_state.pth b/glot-contrastive-final-lora/checkpoint-1500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..8e52ad573b8a63ae0f6bd42c23fdfa2580e5b0a6 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-1500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1be729d0d5511ce10795029b99cb6f519c2f3eea267e5026e9426be89babe546 +size 14645 diff --git a/glot-contrastive-final-lora/checkpoint-1500/scheduler.pt b/glot-contrastive-final-lora/checkpoint-1500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..740ce0a61b00dfd2cb65009a5ada2d4c9668e5e4 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-1500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7719f3f0106e49068ce5d6f3e02c2bb61413e6107676c385f70427146af2266c +size 1465 diff --git a/glot-contrastive-final-lora/checkpoint-1500/sentencepiece.bpe.model b/glot-contrastive-final-lora/checkpoint-1500/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..41409bcc76133d6e92fdd5909093d24e32662ba0 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-1500/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a313a26470baedaede322622492f2a542aa41527ddc5d40de444e945ad3c613 +size 7658320 diff --git a/glot-contrastive-final-lora/checkpoint-1500/special_tokens_map.json b/glot-contrastive-final-lora/checkpoint-1500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..68171d1ff68b731a33d119708476692c094a466b --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-1500/special_tokens_map.json @@ -0,0 +1,15 @@ +{ + "bos_token": "", + "cls_token": "", + "eos_token": "", + "mask_token": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "sep_token": "", + "unk_token": "" +} diff --git a/glot-contrastive-final-lora/checkpoint-1500/tokenizer_config.json b/glot-contrastive-final-lora/checkpoint-1500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2d4a02cfefad7bc6d993905187616acfc547f17d --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-1500/tokenizer_config.json @@ -0,0 +1,57 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "401144": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "cls_token": "", + "eos_token": "", + "extra_special_tokens": {}, + "mask_token": "", + "model_max_length": 512, + "pad_token": "", + "sep_token": "", + "sp_model_kwargs": {}, + "tokenizer_class": "XLMRobertaTokenizer", + "unk_token": "", + "use_fast": true +} diff --git a/glot-contrastive-final-lora/checkpoint-1500/trainer_state.json b/glot-contrastive-final-lora/checkpoint-1500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..947e7d9e51194a2b33424d124fca5853c1fb8b52 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-1500/trainer_state.json @@ -0,0 +1,2134 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.571428571428571, + "eval_steps": 5, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02857142857142857, + "grad_norm": 0.1407003551721573, + "learning_rate": 0.00029965714285714283, + "loss": 0.9726, + "step": 5 + }, + { + "epoch": 0.05714285714285714, + "grad_norm": 0.26689061522483826, + "learning_rate": 0.0002992285714285714, + "loss": 0.9633, + "step": 10 + }, + { + "epoch": 0.08571428571428572, + "grad_norm": 0.8670485615730286, + "learning_rate": 0.0002988, + "loss": 0.9013, + "step": 15 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 0.9785467386245728, + "learning_rate": 0.00029837142857142853, + "loss": 0.6942, + "step": 20 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 1.3083932399749756, + "learning_rate": 0.0002979428571428571, + "loss": 0.4472, + "step": 25 + }, + { + "epoch": 0.17142857142857143, + "grad_norm": 1.6103293895721436, + "learning_rate": 0.0002975142857142857, + "loss": 0.3782, + "step": 30 + }, + { + "epoch": 0.2, + "grad_norm": 2.6353416442871094, + "learning_rate": 0.0002970857142857143, + "loss": 0.3732, + "step": 35 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 0.9949072003364563, + "learning_rate": 0.0002966571428571428, + "loss": 0.3506, + "step": 40 + }, + { + "epoch": 0.2571428571428571, + "grad_norm": 1.280673861503601, + "learning_rate": 0.0002962285714285714, + "loss": 0.3346, + "step": 45 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.7681456208229065, + "learning_rate": 0.0002958, + "loss": 0.2832, + "step": 50 + }, + { + "epoch": 0.3142857142857143, + "grad_norm": 1.0000813007354736, + "learning_rate": 0.0002953714285714285, + "loss": 0.2603, + "step": 55 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 1.0222399234771729, + "learning_rate": 0.0002949428571428571, + "loss": 0.2507, + "step": 60 + }, + { + "epoch": 0.37142857142857144, + "grad_norm": 0.896902322769165, + "learning_rate": 0.0002945142857142857, + "loss": 0.2556, + "step": 65 + }, + { + "epoch": 0.4, + "grad_norm": 0.9035541415214539, + "learning_rate": 0.00029408571428571426, + "loss": 0.2402, + "step": 70 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 1.4886469841003418, + "learning_rate": 0.00029365714285714285, + "loss": 0.2376, + "step": 75 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.8951187133789062, + "learning_rate": 0.0002932285714285714, + "loss": 0.2276, + "step": 80 + }, + { + "epoch": 0.4857142857142857, + "grad_norm": 0.7876377105712891, + "learning_rate": 0.00029279999999999996, + "loss": 0.2537, + "step": 85 + }, + { + "epoch": 0.5142857142857142, + "grad_norm": 1.0927226543426514, + "learning_rate": 0.00029237142857142855, + "loss": 0.2152, + "step": 90 + }, + { + "epoch": 0.5428571428571428, + "grad_norm": 1.4946355819702148, + "learning_rate": 0.00029194285714285713, + "loss": 0.2441, + "step": 95 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.7082991600036621, + "learning_rate": 0.0002915142857142857, + "loss": 0.2708, + "step": 100 + }, + { + "epoch": 0.6, + "grad_norm": 0.670010507106781, + "learning_rate": 0.00029108571428571424, + "loss": 0.2396, + "step": 105 + }, + { + "epoch": 0.6285714285714286, + "grad_norm": 0.9797312021255493, + "learning_rate": 0.00029065714285714283, + "loss": 0.2275, + "step": 110 + }, + { + "epoch": 0.6571428571428571, + "grad_norm": 1.5220463275909424, + "learning_rate": 0.0002902285714285714, + "loss": 0.2114, + "step": 115 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 1.3326867818832397, + "learning_rate": 0.00028979999999999994, + "loss": 0.241, + "step": 120 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 1.1195529699325562, + "learning_rate": 0.0002893714285714285, + "loss": 0.2389, + "step": 125 + }, + { + "epoch": 0.7428571428571429, + "grad_norm": 0.7551061511039734, + "learning_rate": 0.0002889428571428571, + "loss": 0.2162, + "step": 130 + }, + { + "epoch": 0.7714285714285715, + "grad_norm": 1.018908977508545, + "learning_rate": 0.0002885142857142857, + "loss": 0.1924, + "step": 135 + }, + { + "epoch": 0.8, + "grad_norm": 2.123642921447754, + "learning_rate": 0.0002880857142857143, + "loss": 0.2174, + "step": 140 + }, + { + "epoch": 0.8285714285714286, + "grad_norm": 0.7585068941116333, + "learning_rate": 0.0002876571428571428, + "loss": 0.2006, + "step": 145 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 1.64150869846344, + "learning_rate": 0.0002872285714285714, + "loss": 0.1905, + "step": 150 + }, + { + "epoch": 0.8857142857142857, + "grad_norm": 0.9126951694488525, + "learning_rate": 0.0002868, + "loss": 0.2312, + "step": 155 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 0.7278801202774048, + "learning_rate": 0.00028637142857142856, + "loss": 0.2077, + "step": 160 + }, + { + "epoch": 0.9428571428571428, + "grad_norm": 0.8931339383125305, + "learning_rate": 0.00028594285714285715, + "loss": 0.1951, + "step": 165 + }, + { + "epoch": 0.9714285714285714, + "grad_norm": 1.0831843614578247, + "learning_rate": 0.0002855142857142857, + "loss": 0.2103, + "step": 170 + }, + { + "epoch": 1.0, + "grad_norm": 1.3750063180923462, + "learning_rate": 0.00028508571428571426, + "loss": 0.2396, + "step": 175 + }, + { + "epoch": 1.0285714285714285, + "grad_norm": 0.8338337540626526, + "learning_rate": 0.00028465714285714285, + "loss": 0.2404, + "step": 180 + }, + { + "epoch": 1.0571428571428572, + "grad_norm": 1.2879024744033813, + "learning_rate": 0.0002842285714285714, + "loss": 0.2117, + "step": 185 + }, + { + "epoch": 1.0857142857142856, + "grad_norm": 1.6751821041107178, + "learning_rate": 0.00028379999999999996, + "loss": 0.1796, + "step": 190 + }, + { + "epoch": 1.1142857142857143, + "grad_norm": 0.9864417910575867, + "learning_rate": 0.00028337142857142854, + "loss": 0.1993, + "step": 195 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 1.0174155235290527, + "learning_rate": 0.00028294285714285713, + "loss": 0.2068, + "step": 200 + }, + { + "epoch": 1.1714285714285715, + "grad_norm": 1.029832124710083, + "learning_rate": 0.0002825142857142857, + "loss": 0.2015, + "step": 205 + }, + { + "epoch": 1.2, + "grad_norm": 0.7745446562767029, + "learning_rate": 0.00028208571428571424, + "loss": 0.2129, + "step": 210 + }, + { + "epoch": 1.2285714285714286, + "grad_norm": 2.5578622817993164, + "learning_rate": 0.0002816571428571428, + "loss": 0.2224, + "step": 215 + }, + { + "epoch": 1.2571428571428571, + "grad_norm": 2.4185051918029785, + "learning_rate": 0.0002812285714285714, + "loss": 0.2276, + "step": 220 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 1.4176461696624756, + "learning_rate": 0.0002808, + "loss": 0.1781, + "step": 225 + }, + { + "epoch": 1.3142857142857143, + "grad_norm": 0.709326982498169, + "learning_rate": 0.0002803714285714286, + "loss": 0.2177, + "step": 230 + }, + { + "epoch": 1.342857142857143, + "grad_norm": 0.8170766830444336, + "learning_rate": 0.0002799428571428571, + "loss": 0.1769, + "step": 235 + }, + { + "epoch": 1.3714285714285714, + "grad_norm": 1.3850761651992798, + "learning_rate": 0.0002795142857142857, + "loss": 0.2262, + "step": 240 + }, + { + "epoch": 1.4, + "grad_norm": 1.0064373016357422, + "learning_rate": 0.0002790857142857143, + "loss": 0.196, + "step": 245 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 1.9635728597640991, + "learning_rate": 0.0002786571428571428, + "loss": 0.2029, + "step": 250 + }, + { + "epoch": 1.457142857142857, + "grad_norm": 16.20791244506836, + "learning_rate": 0.0002782285714285714, + "loss": 0.3925, + "step": 255 + }, + { + "epoch": 1.4857142857142858, + "grad_norm": 1.4363322257995605, + "learning_rate": 0.0002778, + "loss": 0.3684, + "step": 260 + }, + { + "epoch": 1.5142857142857142, + "grad_norm": 0.9379534721374512, + "learning_rate": 0.00027737142857142856, + "loss": 0.2265, + "step": 265 + }, + { + "epoch": 1.5428571428571427, + "grad_norm": 0.8453512787818909, + "learning_rate": 0.00027694285714285714, + "loss": 0.1976, + "step": 270 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 2.316664695739746, + "learning_rate": 0.0002765142857142857, + "loss": 0.23, + "step": 275 + }, + { + "epoch": 1.6, + "grad_norm": 1.0548444986343384, + "learning_rate": 0.00027608571428571426, + "loss": 0.1823, + "step": 280 + }, + { + "epoch": 1.6285714285714286, + "grad_norm": 3.7894928455352783, + "learning_rate": 0.00027565714285714284, + "loss": 0.1962, + "step": 285 + }, + { + "epoch": 1.657142857142857, + "grad_norm": 2.3081610202789307, + "learning_rate": 0.00027522857142857143, + "loss": 0.2087, + "step": 290 + }, + { + "epoch": 1.6857142857142857, + "grad_norm": 0.9311438202857971, + "learning_rate": 0.0002748, + "loss": 0.1597, + "step": 295 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 1.1881247758865356, + "learning_rate": 0.00027437142857142854, + "loss": 0.1764, + "step": 300 + }, + { + "epoch": 1.7428571428571429, + "grad_norm": 1.30265212059021, + "learning_rate": 0.0002739428571428571, + "loss": 0.1647, + "step": 305 + }, + { + "epoch": 1.7714285714285714, + "grad_norm": 0.6832175850868225, + "learning_rate": 0.0002735142857142857, + "loss": 0.1638, + "step": 310 + }, + { + "epoch": 1.8, + "grad_norm": 1.8740538358688354, + "learning_rate": 0.00027308571428571424, + "loss": 0.1803, + "step": 315 + }, + { + "epoch": 1.8285714285714287, + "grad_norm": 9.821504592895508, + "learning_rate": 0.0002726571428571428, + "loss": 0.226, + "step": 320 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 1.0889750719070435, + "learning_rate": 0.0002722285714285714, + "loss": 0.1822, + "step": 325 + }, + { + "epoch": 1.8857142857142857, + "grad_norm": 0.9660868048667908, + "learning_rate": 0.0002718, + "loss": 0.1842, + "step": 330 + }, + { + "epoch": 1.9142857142857141, + "grad_norm": 0.6329234838485718, + "learning_rate": 0.0002713714285714286, + "loss": 0.1488, + "step": 335 + }, + { + "epoch": 1.9428571428571428, + "grad_norm": 3.601266384124756, + "learning_rate": 0.0002709428571428571, + "loss": 0.1887, + "step": 340 + }, + { + "epoch": 1.9714285714285715, + "grad_norm": 1.1441439390182495, + "learning_rate": 0.0002705142857142857, + "loss": 0.184, + "step": 345 + }, + { + "epoch": 2.0, + "grad_norm": 0.8586034774780273, + "learning_rate": 0.0002700857142857143, + "loss": 0.1578, + "step": 350 + }, + { + "epoch": 2.0285714285714285, + "grad_norm": 1.5113487243652344, + "learning_rate": 0.00026965714285714286, + "loss": 0.2002, + "step": 355 + }, + { + "epoch": 2.057142857142857, + "grad_norm": 1.1123011112213135, + "learning_rate": 0.0002692285714285714, + "loss": 0.1946, + "step": 360 + }, + { + "epoch": 2.085714285714286, + "grad_norm": 0.9377036094665527, + "learning_rate": 0.0002688, + "loss": 0.1971, + "step": 365 + }, + { + "epoch": 2.1142857142857143, + "grad_norm": 0.6956892609596252, + "learning_rate": 0.00026837142857142856, + "loss": 0.1758, + "step": 370 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.7510782480239868, + "learning_rate": 0.0002679428571428571, + "loss": 0.1674, + "step": 375 + }, + { + "epoch": 2.1714285714285713, + "grad_norm": 0.7009285092353821, + "learning_rate": 0.00026751428571428567, + "loss": 0.1945, + "step": 380 + }, + { + "epoch": 2.2, + "grad_norm": 0.9555609822273254, + "learning_rate": 0.00026708571428571426, + "loss": 0.1857, + "step": 385 + }, + { + "epoch": 2.2285714285714286, + "grad_norm": 2.133979082107544, + "learning_rate": 0.00026665714285714284, + "loss": 0.1636, + "step": 390 + }, + { + "epoch": 2.257142857142857, + "grad_norm": 0.7105309963226318, + "learning_rate": 0.0002662285714285714, + "loss": 0.2014, + "step": 395 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.7329701781272888, + "learning_rate": 0.00026579999999999996, + "loss": 0.1884, + "step": 400 + }, + { + "epoch": 2.314285714285714, + "grad_norm": 1.0426994562149048, + "learning_rate": 0.00026537142857142854, + "loss": 0.1558, + "step": 405 + }, + { + "epoch": 2.342857142857143, + "grad_norm": 0.9306122660636902, + "learning_rate": 0.0002649428571428571, + "loss": 0.1774, + "step": 410 + }, + { + "epoch": 2.3714285714285714, + "grad_norm": 0.6989394426345825, + "learning_rate": 0.00026451428571428565, + "loss": 0.1601, + "step": 415 + }, + { + "epoch": 2.4, + "grad_norm": 1.4383760690689087, + "learning_rate": 0.0002640857142857143, + "loss": 0.1564, + "step": 420 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 0.6448336839675903, + "learning_rate": 0.0002636571428571428, + "loss": 0.1827, + "step": 425 + }, + { + "epoch": 2.4571428571428573, + "grad_norm": 0.9535760879516602, + "learning_rate": 0.0002632285714285714, + "loss": 0.1713, + "step": 430 + }, + { + "epoch": 2.4857142857142858, + "grad_norm": 1.034945011138916, + "learning_rate": 0.0002628, + "loss": 0.1457, + "step": 435 + }, + { + "epoch": 2.5142857142857142, + "grad_norm": 1.3225128650665283, + "learning_rate": 0.0002623714285714285, + "loss": 0.1633, + "step": 440 + }, + { + "epoch": 2.5428571428571427, + "grad_norm": 0.8285059928894043, + "learning_rate": 0.0002619428571428571, + "loss": 0.2004, + "step": 445 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.773176908493042, + "learning_rate": 0.0002615142857142857, + "loss": 0.1641, + "step": 450 + }, + { + "epoch": 2.6, + "grad_norm": 0.7964853048324585, + "learning_rate": 0.0002610857142857143, + "loss": 0.1608, + "step": 455 + }, + { + "epoch": 2.6285714285714286, + "grad_norm": 1.0967328548431396, + "learning_rate": 0.00026065714285714286, + "loss": 0.1697, + "step": 460 + }, + { + "epoch": 2.657142857142857, + "grad_norm": 0.6462066173553467, + "learning_rate": 0.0002602285714285714, + "loss": 0.1512, + "step": 465 + }, + { + "epoch": 2.685714285714286, + "grad_norm": 0.8765937089920044, + "learning_rate": 0.00025979999999999997, + "loss": 0.1826, + "step": 470 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 1.2524124383926392, + "learning_rate": 0.00025937142857142856, + "loss": 0.1731, + "step": 475 + }, + { + "epoch": 2.742857142857143, + "grad_norm": 2.2982606887817383, + "learning_rate": 0.0002589428571428571, + "loss": 0.1852, + "step": 480 + }, + { + "epoch": 2.7714285714285714, + "grad_norm": 0.9989053010940552, + "learning_rate": 0.0002585142857142857, + "loss": 0.1791, + "step": 485 + }, + { + "epoch": 2.8, + "grad_norm": 0.772343635559082, + "learning_rate": 0.00025808571428571426, + "loss": 0.1862, + "step": 490 + }, + { + "epoch": 2.8285714285714287, + "grad_norm": 1.2101136445999146, + "learning_rate": 0.00025765714285714284, + "loss": 0.1806, + "step": 495 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.8010189533233643, + "learning_rate": 0.0002572285714285714, + "loss": 0.1842, + "step": 500 + }, + { + "epoch": 2.8857142857142857, + "grad_norm": 1.3597544431686401, + "learning_rate": 0.00025679999999999995, + "loss": 0.1583, + "step": 505 + }, + { + "epoch": 2.914285714285714, + "grad_norm": 0.8790671825408936, + "learning_rate": 0.00025637142857142854, + "loss": 0.1565, + "step": 510 + }, + { + "epoch": 2.942857142857143, + "grad_norm": 1.1175066232681274, + "learning_rate": 0.0002559428571428571, + "loss": 0.1406, + "step": 515 + }, + { + "epoch": 2.9714285714285715, + "grad_norm": 2.8528785705566406, + "learning_rate": 0.0002555142857142857, + "loss": 0.1735, + "step": 520 + }, + { + "epoch": 3.0, + "grad_norm": 2.2073328495025635, + "learning_rate": 0.0002550857142857143, + "loss": 0.1816, + "step": 525 + }, + { + "epoch": 3.0285714285714285, + "grad_norm": 11.01322078704834, + "learning_rate": 0.0002546571428571428, + "loss": 0.1873, + "step": 530 + }, + { + "epoch": 3.057142857142857, + "grad_norm": 1.5822402238845825, + "learning_rate": 0.0002542285714285714, + "loss": 0.168, + "step": 535 + }, + { + "epoch": 3.085714285714286, + "grad_norm": 1.3086942434310913, + "learning_rate": 0.0002538, + "loss": 0.149, + "step": 540 + }, + { + "epoch": 3.1142857142857143, + "grad_norm": 6.303041458129883, + "learning_rate": 0.0002533714285714285, + "loss": 0.1651, + "step": 545 + }, + { + "epoch": 3.142857142857143, + "grad_norm": 14.48929500579834, + "learning_rate": 0.00025294285714285716, + "loss": 0.1687, + "step": 550 + }, + { + "epoch": 3.1714285714285713, + "grad_norm": 6.824525356292725, + "learning_rate": 0.0002525142857142857, + "loss": 0.1919, + "step": 555 + }, + { + "epoch": 3.2, + "grad_norm": 18.772563934326172, + "learning_rate": 0.00025208571428571427, + "loss": 0.2075, + "step": 560 + }, + { + "epoch": 3.2285714285714286, + "grad_norm": 0.7268752455711365, + "learning_rate": 0.00025165714285714286, + "loss": 0.174, + "step": 565 + }, + { + "epoch": 3.257142857142857, + "grad_norm": 1.1301453113555908, + "learning_rate": 0.0002512285714285714, + "loss": 0.1668, + "step": 570 + }, + { + "epoch": 3.2857142857142856, + "grad_norm": 2.846802234649658, + "learning_rate": 0.00025079999999999997, + "loss": 0.1645, + "step": 575 + }, + { + "epoch": 3.314285714285714, + "grad_norm": 1.417515754699707, + "learning_rate": 0.00025037142857142855, + "loss": 0.1719, + "step": 580 + }, + { + "epoch": 3.342857142857143, + "grad_norm": 4.137150764465332, + "learning_rate": 0.00024994285714285714, + "loss": 0.1739, + "step": 585 + }, + { + "epoch": 3.3714285714285714, + "grad_norm": 2.6067259311676025, + "learning_rate": 0.0002495142857142857, + "loss": 0.1489, + "step": 590 + }, + { + "epoch": 3.4, + "grad_norm": 2.601024627685547, + "learning_rate": 0.00024908571428571425, + "loss": 0.1618, + "step": 595 + }, + { + "epoch": 3.4285714285714284, + "grad_norm": 3.849017858505249, + "learning_rate": 0.00024865714285714284, + "loss": 0.1899, + "step": 600 + }, + { + "epoch": 3.4571428571428573, + "grad_norm": 4.673766136169434, + "learning_rate": 0.0002482285714285714, + "loss": 0.1761, + "step": 605 + }, + { + "epoch": 3.4857142857142858, + "grad_norm": 2.6057631969451904, + "learning_rate": 0.00024779999999999995, + "loss": 0.1743, + "step": 610 + }, + { + "epoch": 3.5142857142857142, + "grad_norm": 2.932652473449707, + "learning_rate": 0.0002473714285714286, + "loss": 0.1482, + "step": 615 + }, + { + "epoch": 3.5428571428571427, + "grad_norm": 0.8764939308166504, + "learning_rate": 0.0002469428571428571, + "loss": 0.1644, + "step": 620 + }, + { + "epoch": 3.571428571428571, + "grad_norm": 1.3203191757202148, + "learning_rate": 0.0002465142857142857, + "loss": 0.1654, + "step": 625 + }, + { + "epoch": 3.6, + "grad_norm": 0.7977635264396667, + "learning_rate": 0.0002460857142857143, + "loss": 0.1472, + "step": 630 + }, + { + "epoch": 3.6285714285714286, + "grad_norm": 1.4750248193740845, + "learning_rate": 0.0002456571428571428, + "loss": 0.1735, + "step": 635 + }, + { + "epoch": 3.657142857142857, + "grad_norm": 1.8164482116699219, + "learning_rate": 0.0002452285714285714, + "loss": 0.1593, + "step": 640 + }, + { + "epoch": 3.685714285714286, + "grad_norm": 1.4829603433609009, + "learning_rate": 0.0002448, + "loss": 0.1508, + "step": 645 + }, + { + "epoch": 3.7142857142857144, + "grad_norm": 0.8828144669532776, + "learning_rate": 0.00024437142857142857, + "loss": 0.1573, + "step": 650 + }, + { + "epoch": 3.742857142857143, + "grad_norm": 2.039384126663208, + "learning_rate": 0.00024394285714285713, + "loss": 0.1745, + "step": 655 + }, + { + "epoch": 3.7714285714285714, + "grad_norm": 0.9604200720787048, + "learning_rate": 0.00024351428571428569, + "loss": 0.17, + "step": 660 + }, + { + "epoch": 3.8, + "grad_norm": 0.7903971076011658, + "learning_rate": 0.00024308571428571427, + "loss": 0.1654, + "step": 665 + }, + { + "epoch": 3.8285714285714287, + "grad_norm": 0.6935649514198303, + "learning_rate": 0.00024265714285714283, + "loss": 0.1714, + "step": 670 + }, + { + "epoch": 3.857142857142857, + "grad_norm": 0.5832012295722961, + "learning_rate": 0.00024222857142857138, + "loss": 0.1636, + "step": 675 + }, + { + "epoch": 3.8857142857142857, + "grad_norm": 0.6303168535232544, + "learning_rate": 0.0002418, + "loss": 0.1604, + "step": 680 + }, + { + "epoch": 3.914285714285714, + "grad_norm": 0.7210885882377625, + "learning_rate": 0.00024137142857142855, + "loss": 0.1444, + "step": 685 + }, + { + "epoch": 3.942857142857143, + "grad_norm": 0.7690990567207336, + "learning_rate": 0.00024094285714285714, + "loss": 0.1631, + "step": 690 + }, + { + "epoch": 3.9714285714285715, + "grad_norm": 1.0142720937728882, + "learning_rate": 0.0002405142857142857, + "loss": 0.158, + "step": 695 + }, + { + "epoch": 4.0, + "grad_norm": 0.7970322966575623, + "learning_rate": 0.00024008571428571425, + "loss": 0.1803, + "step": 700 + }, + { + "epoch": 4.0285714285714285, + "grad_norm": 0.6795914769172668, + "learning_rate": 0.00023965714285714284, + "loss": 0.143, + "step": 705 + }, + { + "epoch": 4.057142857142857, + "grad_norm": 0.6832629442214966, + "learning_rate": 0.0002392285714285714, + "loss": 0.1457, + "step": 710 + }, + { + "epoch": 4.085714285714285, + "grad_norm": 3.8629798889160156, + "learning_rate": 0.0002388, + "loss": 0.1671, + "step": 715 + }, + { + "epoch": 4.114285714285714, + "grad_norm": 1.1167882680892944, + "learning_rate": 0.00023837142857142856, + "loss": 0.1544, + "step": 720 + }, + { + "epoch": 4.142857142857143, + "grad_norm": 0.9431412816047668, + "learning_rate": 0.00023794285714285712, + "loss": 0.1605, + "step": 725 + }, + { + "epoch": 4.171428571428572, + "grad_norm": 1.310948133468628, + "learning_rate": 0.0002375142857142857, + "loss": 0.1121, + "step": 730 + }, + { + "epoch": 4.2, + "grad_norm": 0.9830737709999084, + "learning_rate": 0.00023708571428571426, + "loss": 0.1742, + "step": 735 + }, + { + "epoch": 4.228571428571429, + "grad_norm": 0.6166555881500244, + "learning_rate": 0.00023665714285714282, + "loss": 0.1525, + "step": 740 + }, + { + "epoch": 4.257142857142857, + "grad_norm": 0.995579719543457, + "learning_rate": 0.00023622857142857143, + "loss": 0.1439, + "step": 745 + }, + { + "epoch": 4.285714285714286, + "grad_norm": 0.639796793460846, + "learning_rate": 0.00023579999999999999, + "loss": 0.1692, + "step": 750 + }, + { + "epoch": 4.314285714285714, + "grad_norm": 0.9438050389289856, + "learning_rate": 0.00023537142857142854, + "loss": 0.1785, + "step": 755 + }, + { + "epoch": 4.3428571428571425, + "grad_norm": 0.8960750102996826, + "learning_rate": 0.00023494285714285713, + "loss": 0.1557, + "step": 760 + }, + { + "epoch": 4.371428571428572, + "grad_norm": 0.6287499070167542, + "learning_rate": 0.00023451428571428568, + "loss": 0.1459, + "step": 765 + }, + { + "epoch": 4.4, + "grad_norm": 0.7638295888900757, + "learning_rate": 0.00023408571428571424, + "loss": 0.1341, + "step": 770 + }, + { + "epoch": 4.428571428571429, + "grad_norm": 0.655878484249115, + "learning_rate": 0.00023365714285714283, + "loss": 0.1358, + "step": 775 + }, + { + "epoch": 4.457142857142857, + "grad_norm": 0.5840997695922852, + "learning_rate": 0.0002332285714285714, + "loss": 0.1386, + "step": 780 + }, + { + "epoch": 4.485714285714286, + "grad_norm": 1.1082488298416138, + "learning_rate": 0.0002328, + "loss": 0.1827, + "step": 785 + }, + { + "epoch": 4.514285714285714, + "grad_norm": 0.8825240135192871, + "learning_rate": 0.00023237142857142855, + "loss": 0.1527, + "step": 790 + }, + { + "epoch": 4.542857142857143, + "grad_norm": 0.6752304434776306, + "learning_rate": 0.0002319428571428571, + "loss": 0.1392, + "step": 795 + }, + { + "epoch": 4.571428571428571, + "grad_norm": 1.1423301696777344, + "learning_rate": 0.0002315142857142857, + "loss": 0.1433, + "step": 800 + }, + { + "epoch": 4.6, + "grad_norm": 10.793691635131836, + "learning_rate": 0.00023108571428571425, + "loss": 0.1635, + "step": 805 + }, + { + "epoch": 4.628571428571428, + "grad_norm": 0.47564294934272766, + "learning_rate": 0.00023065714285714286, + "loss": 0.1199, + "step": 810 + }, + { + "epoch": 4.6571428571428575, + "grad_norm": 1.2492656707763672, + "learning_rate": 0.00023022857142857142, + "loss": 0.1488, + "step": 815 + }, + { + "epoch": 4.685714285714286, + "grad_norm": 0.6933501958847046, + "learning_rate": 0.00022979999999999997, + "loss": 0.1812, + "step": 820 + }, + { + "epoch": 4.714285714285714, + "grad_norm": 0.7901633977890015, + "learning_rate": 0.00022937142857142856, + "loss": 0.1415, + "step": 825 + }, + { + "epoch": 4.742857142857143, + "grad_norm": 0.7854829430580139, + "learning_rate": 0.00022894285714285712, + "loss": 0.1401, + "step": 830 + }, + { + "epoch": 4.771428571428571, + "grad_norm": 0.8716740608215332, + "learning_rate": 0.00022851428571428567, + "loss": 0.1982, + "step": 835 + }, + { + "epoch": 4.8, + "grad_norm": 0.7047899961471558, + "learning_rate": 0.00022808571428571426, + "loss": 0.1624, + "step": 840 + }, + { + "epoch": 4.828571428571428, + "grad_norm": 0.7134959697723389, + "learning_rate": 0.00022765714285714284, + "loss": 0.1375, + "step": 845 + }, + { + "epoch": 4.857142857142857, + "grad_norm": 1.0897325277328491, + "learning_rate": 0.00022722857142857143, + "loss": 0.1489, + "step": 850 + }, + { + "epoch": 4.885714285714286, + "grad_norm": 1.1065207719802856, + "learning_rate": 0.00022679999999999998, + "loss": 0.1495, + "step": 855 + }, + { + "epoch": 4.914285714285715, + "grad_norm": 0.7434757351875305, + "learning_rate": 0.00022637142857142854, + "loss": 0.1507, + "step": 860 + }, + { + "epoch": 4.942857142857143, + "grad_norm": 1.0045181512832642, + "learning_rate": 0.00022594285714285712, + "loss": 0.1527, + "step": 865 + }, + { + "epoch": 4.9714285714285715, + "grad_norm": 1.2025654315948486, + "learning_rate": 0.00022551428571428568, + "loss": 0.1523, + "step": 870 + }, + { + "epoch": 5.0, + "grad_norm": 0.7823342084884644, + "learning_rate": 0.0002250857142857143, + "loss": 0.1514, + "step": 875 + }, + { + "epoch": 5.0285714285714285, + "grad_norm": 0.8405362963676453, + "learning_rate": 0.00022465714285714285, + "loss": 0.1461, + "step": 880 + }, + { + "epoch": 5.057142857142857, + "grad_norm": 0.7527463436126709, + "learning_rate": 0.0002242285714285714, + "loss": 0.1206, + "step": 885 + }, + { + "epoch": 5.085714285714285, + "grad_norm": 0.8372548222541809, + "learning_rate": 0.0002238, + "loss": 0.1513, + "step": 890 + }, + { + "epoch": 5.114285714285714, + "grad_norm": 0.8755456209182739, + "learning_rate": 0.00022337142857142855, + "loss": 0.1498, + "step": 895 + }, + { + "epoch": 5.142857142857143, + "grad_norm": 0.7312084436416626, + "learning_rate": 0.0002229428571428571, + "loss": 0.154, + "step": 900 + }, + { + "epoch": 5.171428571428572, + "grad_norm": 0.6366221904754639, + "learning_rate": 0.0002225142857142857, + "loss": 0.1466, + "step": 905 + }, + { + "epoch": 5.2, + "grad_norm": 0.6406880617141724, + "learning_rate": 0.00022208571428571427, + "loss": 0.1254, + "step": 910 + }, + { + "epoch": 5.228571428571429, + "grad_norm": 2.4106833934783936, + "learning_rate": 0.00022165714285714283, + "loss": 0.1534, + "step": 915 + }, + { + "epoch": 5.257142857142857, + "grad_norm": 0.5635722279548645, + "learning_rate": 0.00022122857142857142, + "loss": 0.1461, + "step": 920 + }, + { + "epoch": 5.285714285714286, + "grad_norm": 0.787162184715271, + "learning_rate": 0.00022079999999999997, + "loss": 0.1424, + "step": 925 + }, + { + "epoch": 5.314285714285714, + "grad_norm": 0.6513975262641907, + "learning_rate": 0.00022037142857142853, + "loss": 0.1326, + "step": 930 + }, + { + "epoch": 5.3428571428571425, + "grad_norm": 0.6933534741401672, + "learning_rate": 0.00021994285714285711, + "loss": 0.1661, + "step": 935 + }, + { + "epoch": 5.371428571428572, + "grad_norm": 0.7263259887695312, + "learning_rate": 0.0002195142857142857, + "loss": 0.15, + "step": 940 + }, + { + "epoch": 5.4, + "grad_norm": 0.5537381768226624, + "learning_rate": 0.00021908571428571428, + "loss": 0.129, + "step": 945 + }, + { + "epoch": 5.428571428571429, + "grad_norm": 0.6014005541801453, + "learning_rate": 0.00021865714285714284, + "loss": 0.1321, + "step": 950 + }, + { + "epoch": 5.457142857142857, + "grad_norm": 0.6581441760063171, + "learning_rate": 0.0002182285714285714, + "loss": 0.1587, + "step": 955 + }, + { + "epoch": 5.485714285714286, + "grad_norm": 0.9326379895210266, + "learning_rate": 0.00021779999999999998, + "loss": 0.1654, + "step": 960 + }, + { + "epoch": 5.514285714285714, + "grad_norm": 0.9438592791557312, + "learning_rate": 0.00021737142857142854, + "loss": 0.1212, + "step": 965 + }, + { + "epoch": 5.542857142857143, + "grad_norm": 0.7699571251869202, + "learning_rate": 0.00021694285714285715, + "loss": 0.1464, + "step": 970 + }, + { + "epoch": 5.571428571428571, + "grad_norm": 0.8758366703987122, + "learning_rate": 0.0002165142857142857, + "loss": 0.1599, + "step": 975 + }, + { + "epoch": 5.6, + "grad_norm": 0.6101442575454712, + "learning_rate": 0.00021608571428571426, + "loss": 0.1589, + "step": 980 + }, + { + "epoch": 5.628571428571428, + "grad_norm": 0.7454060912132263, + "learning_rate": 0.00021565714285714285, + "loss": 0.1433, + "step": 985 + }, + { + "epoch": 5.6571428571428575, + "grad_norm": 0.6379484534263611, + "learning_rate": 0.0002152285714285714, + "loss": 0.1592, + "step": 990 + }, + { + "epoch": 5.685714285714286, + "grad_norm": 1.1601309776306152, + "learning_rate": 0.00021479999999999996, + "loss": 0.1647, + "step": 995 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 0.5464673638343811, + "learning_rate": 0.00021437142857142855, + "loss": 0.1469, + "step": 1000 + }, + { + "epoch": 5.742857142857143, + "grad_norm": 1.0279319286346436, + "learning_rate": 0.00021394285714285713, + "loss": 0.1203, + "step": 1005 + }, + { + "epoch": 5.771428571428571, + "grad_norm": 0.5503718256950378, + "learning_rate": 0.00021351428571428572, + "loss": 0.1409, + "step": 1010 + }, + { + "epoch": 5.8, + "grad_norm": 0.6123886108398438, + "learning_rate": 0.00021308571428571427, + "loss": 0.1427, + "step": 1015 + }, + { + "epoch": 5.828571428571428, + "grad_norm": 0.6560390591621399, + "learning_rate": 0.00021265714285714283, + "loss": 0.1415, + "step": 1020 + }, + { + "epoch": 5.857142857142857, + "grad_norm": 0.5576716661453247, + "learning_rate": 0.00021222857142857141, + "loss": 0.1408, + "step": 1025 + }, + { + "epoch": 5.885714285714286, + "grad_norm": 0.6419074535369873, + "learning_rate": 0.00021179999999999997, + "loss": 0.1385, + "step": 1030 + }, + { + "epoch": 5.914285714285715, + "grad_norm": 1.008925199508667, + "learning_rate": 0.00021137142857142858, + "loss": 0.1497, + "step": 1035 + }, + { + "epoch": 5.942857142857143, + "grad_norm": 0.6559906005859375, + "learning_rate": 0.00021094285714285714, + "loss": 0.1218, + "step": 1040 + }, + { + "epoch": 5.9714285714285715, + "grad_norm": 0.627164363861084, + "learning_rate": 0.0002105142857142857, + "loss": 0.1368, + "step": 1045 + }, + { + "epoch": 6.0, + "grad_norm": 0.5760972499847412, + "learning_rate": 0.00021008571428571428, + "loss": 0.1508, + "step": 1050 + }, + { + "epoch": 6.0285714285714285, + "grad_norm": 0.5754174590110779, + "learning_rate": 0.00020965714285714284, + "loss": 0.1181, + "step": 1055 + }, + { + "epoch": 6.057142857142857, + "grad_norm": 0.8736348748207092, + "learning_rate": 0.0002092285714285714, + "loss": 0.1252, + "step": 1060 + }, + { + "epoch": 6.085714285714285, + "grad_norm": 0.7166719436645508, + "learning_rate": 0.00020879999999999998, + "loss": 0.1481, + "step": 1065 + }, + { + "epoch": 6.114285714285714, + "grad_norm": 0.6494349241256714, + "learning_rate": 0.00020837142857142856, + "loss": 0.1478, + "step": 1070 + }, + { + "epoch": 6.142857142857143, + "grad_norm": 0.6681587100028992, + "learning_rate": 0.00020794285714285712, + "loss": 0.1488, + "step": 1075 + }, + { + "epoch": 6.171428571428572, + "grad_norm": 0.7123684883117676, + "learning_rate": 0.0002075142857142857, + "loss": 0.1378, + "step": 1080 + }, + { + "epoch": 6.2, + "grad_norm": 0.6146950721740723, + "learning_rate": 0.00020708571428571426, + "loss": 0.1306, + "step": 1085 + }, + { + "epoch": 6.228571428571429, + "grad_norm": 0.8402445912361145, + "learning_rate": 0.00020665714285714282, + "loss": 0.1063, + "step": 1090 + }, + { + "epoch": 6.257142857142857, + "grad_norm": 0.6567764282226562, + "learning_rate": 0.0002062285714285714, + "loss": 0.1195, + "step": 1095 + }, + { + "epoch": 6.285714285714286, + "grad_norm": 0.6006014943122864, + "learning_rate": 0.0002058, + "loss": 0.1542, + "step": 1100 + }, + { + "epoch": 6.314285714285714, + "grad_norm": 0.793100893497467, + "learning_rate": 0.00020537142857142857, + "loss": 0.1381, + "step": 1105 + }, + { + "epoch": 6.3428571428571425, + "grad_norm": 0.5923666954040527, + "learning_rate": 0.00020494285714285713, + "loss": 0.1386, + "step": 1110 + }, + { + "epoch": 6.371428571428572, + "grad_norm": 0.6692521572113037, + "learning_rate": 0.0002045142857142857, + "loss": 0.1223, + "step": 1115 + }, + { + "epoch": 6.4, + "grad_norm": 0.7216306328773499, + "learning_rate": 0.00020408571428571427, + "loss": 0.1367, + "step": 1120 + }, + { + "epoch": 6.428571428571429, + "grad_norm": 0.5640934109687805, + "learning_rate": 0.00020365714285714283, + "loss": 0.1554, + "step": 1125 + }, + { + "epoch": 6.457142857142857, + "grad_norm": 0.8154368996620178, + "learning_rate": 0.00020322857142857138, + "loss": 0.1674, + "step": 1130 + }, + { + "epoch": 6.485714285714286, + "grad_norm": 0.7185398936271667, + "learning_rate": 0.0002028, + "loss": 0.1375, + "step": 1135 + }, + { + "epoch": 6.514285714285714, + "grad_norm": 0.6805170774459839, + "learning_rate": 0.00020237142857142855, + "loss": 0.1306, + "step": 1140 + }, + { + "epoch": 6.542857142857143, + "grad_norm": 0.5996941924095154, + "learning_rate": 0.00020194285714285714, + "loss": 0.1433, + "step": 1145 + }, + { + "epoch": 6.571428571428571, + "grad_norm": 0.5258373022079468, + "learning_rate": 0.0002015142857142857, + "loss": 0.1285, + "step": 1150 + }, + { + "epoch": 6.6, + "grad_norm": 0.7771695256233215, + "learning_rate": 0.00020108571428571425, + "loss": 0.1493, + "step": 1155 + }, + { + "epoch": 6.628571428571428, + "grad_norm": 0.5920616388320923, + "learning_rate": 0.00020065714285714284, + "loss": 0.1479, + "step": 1160 + }, + { + "epoch": 6.6571428571428575, + "grad_norm": 0.7460982799530029, + "learning_rate": 0.00020022857142857142, + "loss": 0.1173, + "step": 1165 + }, + { + "epoch": 6.685714285714286, + "grad_norm": 1.1703822612762451, + "learning_rate": 0.0001998, + "loss": 0.1402, + "step": 1170 + }, + { + "epoch": 6.714285714285714, + "grad_norm": 0.7894724011421204, + "learning_rate": 0.00019937142857142856, + "loss": 0.1253, + "step": 1175 + }, + { + "epoch": 6.742857142857143, + "grad_norm": 0.7013376355171204, + "learning_rate": 0.00019894285714285712, + "loss": 0.1573, + "step": 1180 + }, + { + "epoch": 6.771428571428571, + "grad_norm": 0.6421737670898438, + "learning_rate": 0.0001985142857142857, + "loss": 0.1497, + "step": 1185 + }, + { + "epoch": 6.8, + "grad_norm": 1.204296350479126, + "learning_rate": 0.00019808571428571426, + "loss": 0.1634, + "step": 1190 + }, + { + "epoch": 6.828571428571428, + "grad_norm": 0.867765486240387, + "learning_rate": 0.00019765714285714282, + "loss": 0.1353, + "step": 1195 + }, + { + "epoch": 6.857142857142857, + "grad_norm": 0.7325594425201416, + "learning_rate": 0.00019722857142857143, + "loss": 0.118, + "step": 1200 + }, + { + "epoch": 6.885714285714286, + "grad_norm": 0.7029078006744385, + "learning_rate": 0.00019679999999999999, + "loss": 0.1425, + "step": 1205 + }, + { + "epoch": 6.914285714285715, + "grad_norm": 1.1572504043579102, + "learning_rate": 0.00019637142857142857, + "loss": 0.1337, + "step": 1210 + }, + { + "epoch": 6.942857142857143, + "grad_norm": 0.8022822141647339, + "learning_rate": 0.00019594285714285713, + "loss": 0.1684, + "step": 1215 + }, + { + "epoch": 6.9714285714285715, + "grad_norm": 0.6729874610900879, + "learning_rate": 0.00019551428571428568, + "loss": 0.1238, + "step": 1220 + }, + { + "epoch": 7.0, + "grad_norm": 0.5773627758026123, + "learning_rate": 0.00019508571428571427, + "loss": 0.138, + "step": 1225 + }, + { + "epoch": 7.0285714285714285, + "grad_norm": 0.7182291150093079, + "learning_rate": 0.00019465714285714285, + "loss": 0.1431, + "step": 1230 + }, + { + "epoch": 7.057142857142857, + "grad_norm": 1.7567912340164185, + "learning_rate": 0.0001942285714285714, + "loss": 0.1319, + "step": 1235 + }, + { + "epoch": 7.085714285714285, + "grad_norm": 0.6845232248306274, + "learning_rate": 0.0001938, + "loss": 0.1292, + "step": 1240 + }, + { + "epoch": 7.114285714285714, + "grad_norm": 0.6077771782875061, + "learning_rate": 0.00019337142857142855, + "loss": 0.1238, + "step": 1245 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 0.6168347597122192, + "learning_rate": 0.0001929428571428571, + "loss": 0.1384, + "step": 1250 + }, + { + "epoch": 7.171428571428572, + "grad_norm": 0.7457576394081116, + "learning_rate": 0.0001925142857142857, + "loss": 0.1306, + "step": 1255 + }, + { + "epoch": 7.2, + "grad_norm": 0.5969316363334656, + "learning_rate": 0.00019208571428571425, + "loss": 0.1123, + "step": 1260 + }, + { + "epoch": 7.228571428571429, + "grad_norm": 0.6902753710746765, + "learning_rate": 0.00019165714285714286, + "loss": 0.1185, + "step": 1265 + }, + { + "epoch": 7.257142857142857, + "grad_norm": 0.6488338112831116, + "learning_rate": 0.00019122857142857142, + "loss": 0.1431, + "step": 1270 + }, + { + "epoch": 7.285714285714286, + "grad_norm": 0.6814819574356079, + "learning_rate": 0.00019079999999999998, + "loss": 0.1495, + "step": 1275 + }, + { + "epoch": 7.314285714285714, + "grad_norm": 0.7468088865280151, + "learning_rate": 0.00019037142857142856, + "loss": 0.1158, + "step": 1280 + }, + { + "epoch": 7.3428571428571425, + "grad_norm": 0.7417412400245667, + "learning_rate": 0.00018994285714285712, + "loss": 0.1311, + "step": 1285 + }, + { + "epoch": 7.371428571428572, + "grad_norm": 0.5480664372444153, + "learning_rate": 0.00018951428571428567, + "loss": 0.135, + "step": 1290 + }, + { + "epoch": 7.4, + "grad_norm": 0.725527822971344, + "learning_rate": 0.00018908571428571429, + "loss": 0.1217, + "step": 1295 + }, + { + "epoch": 7.428571428571429, + "grad_norm": 0.6566678285598755, + "learning_rate": 0.00018865714285714284, + "loss": 0.1417, + "step": 1300 + }, + { + "epoch": 7.457142857142857, + "grad_norm": 0.516952395439148, + "learning_rate": 0.00018822857142857143, + "loss": 0.1329, + "step": 1305 + }, + { + "epoch": 7.485714285714286, + "grad_norm": 1.9545241594314575, + "learning_rate": 0.00018779999999999998, + "loss": 0.1339, + "step": 1310 + }, + { + "epoch": 7.514285714285714, + "grad_norm": 0.8276839852333069, + "learning_rate": 0.00018737142857142854, + "loss": 0.1324, + "step": 1315 + }, + { + "epoch": 7.542857142857143, + "grad_norm": 0.6737099289894104, + "learning_rate": 0.00018694285714285713, + "loss": 0.1139, + "step": 1320 + }, + { + "epoch": 7.571428571428571, + "grad_norm": 0.6914472579956055, + "learning_rate": 0.00018651428571428568, + "loss": 0.1146, + "step": 1325 + }, + { + "epoch": 7.6, + "grad_norm": 0.6630033850669861, + "learning_rate": 0.0001860857142857143, + "loss": 0.1571, + "step": 1330 + }, + { + "epoch": 7.628571428571428, + "grad_norm": 0.820688784122467, + "learning_rate": 0.00018565714285714285, + "loss": 0.15, + "step": 1335 + }, + { + "epoch": 7.6571428571428575, + "grad_norm": 2.0491325855255127, + "learning_rate": 0.0001852285714285714, + "loss": 0.127, + "step": 1340 + }, + { + "epoch": 7.685714285714286, + "grad_norm": 0.9327268004417419, + "learning_rate": 0.0001848, + "loss": 0.1289, + "step": 1345 + }, + { + "epoch": 7.714285714285714, + "grad_norm": 1.3131701946258545, + "learning_rate": 0.00018437142857142855, + "loss": 0.1228, + "step": 1350 + }, + { + "epoch": 7.742857142857143, + "grad_norm": 2.955918312072754, + "learning_rate": 0.0001839428571428571, + "loss": 0.1082, + "step": 1355 + }, + { + "epoch": 7.771428571428571, + "grad_norm": 1.2165493965148926, + "learning_rate": 0.00018351428571428572, + "loss": 0.1688, + "step": 1360 + }, + { + "epoch": 7.8, + "grad_norm": 0.759324312210083, + "learning_rate": 0.00018308571428571428, + "loss": 0.1185, + "step": 1365 + }, + { + "epoch": 7.828571428571428, + "grad_norm": 0.7445591688156128, + "learning_rate": 0.00018265714285714286, + "loss": 0.1431, + "step": 1370 + }, + { + "epoch": 7.857142857142857, + "grad_norm": 0.679374098777771, + "learning_rate": 0.00018222857142857142, + "loss": 0.1451, + "step": 1375 + }, + { + "epoch": 7.885714285714286, + "grad_norm": 2.1234302520751953, + "learning_rate": 0.00018179999999999997, + "loss": 0.1265, + "step": 1380 + }, + { + "epoch": 7.914285714285715, + "grad_norm": 1.006521224975586, + "learning_rate": 0.00018137142857142856, + "loss": 0.1722, + "step": 1385 + }, + { + "epoch": 7.942857142857143, + "grad_norm": 0.7275253534317017, + "learning_rate": 0.00018094285714285712, + "loss": 0.1625, + "step": 1390 + }, + { + "epoch": 7.9714285714285715, + "grad_norm": 0.8612022995948792, + "learning_rate": 0.0001805142857142857, + "loss": 0.1345, + "step": 1395 + }, + { + "epoch": 8.0, + "grad_norm": 0.7276798486709595, + "learning_rate": 0.00018008571428571428, + "loss": 0.1236, + "step": 1400 + }, + { + "epoch": 8.028571428571428, + "grad_norm": 0.8731086850166321, + "learning_rate": 0.00017965714285714284, + "loss": 0.1604, + "step": 1405 + }, + { + "epoch": 8.057142857142857, + "grad_norm": 0.8950818777084351, + "learning_rate": 0.0001792285714285714, + "loss": 0.1531, + "step": 1410 + }, + { + "epoch": 8.085714285714285, + "grad_norm": 0.7399356365203857, + "learning_rate": 0.00017879999999999998, + "loss": 0.1508, + "step": 1415 + }, + { + "epoch": 8.114285714285714, + "grad_norm": 1.3727307319641113, + "learning_rate": 0.00017837142857142854, + "loss": 0.1487, + "step": 1420 + }, + { + "epoch": 8.142857142857142, + "grad_norm": 0.5938125848770142, + "learning_rate": 0.00017794285714285715, + "loss": 0.1303, + "step": 1425 + }, + { + "epoch": 8.17142857142857, + "grad_norm": 0.7043821811676025, + "learning_rate": 0.0001775142857142857, + "loss": 0.0948, + "step": 1430 + }, + { + "epoch": 8.2, + "grad_norm": 1.1062767505645752, + "learning_rate": 0.00017708571428571426, + "loss": 0.1412, + "step": 1435 + }, + { + "epoch": 8.228571428571428, + "grad_norm": 0.844832181930542, + "learning_rate": 0.00017665714285714285, + "loss": 0.113, + "step": 1440 + }, + { + "epoch": 8.257142857142856, + "grad_norm": 0.7564154863357544, + "learning_rate": 0.0001762285714285714, + "loss": 0.1319, + "step": 1445 + }, + { + "epoch": 8.285714285714286, + "grad_norm": 0.8843110203742981, + "learning_rate": 0.00017579999999999996, + "loss": 0.1206, + "step": 1450 + }, + { + "epoch": 8.314285714285715, + "grad_norm": 0.8175828456878662, + "learning_rate": 0.00017537142857142855, + "loss": 0.1327, + "step": 1455 + }, + { + "epoch": 8.342857142857143, + "grad_norm": 0.6443565487861633, + "learning_rate": 0.00017494285714285713, + "loss": 0.1239, + "step": 1460 + }, + { + "epoch": 8.371428571428572, + "grad_norm": 0.7237185835838318, + "learning_rate": 0.00017451428571428572, + "loss": 0.1639, + "step": 1465 + }, + { + "epoch": 8.4, + "grad_norm": 0.6118057370185852, + "learning_rate": 0.00017408571428571427, + "loss": 0.1363, + "step": 1470 + }, + { + "epoch": 8.428571428571429, + "grad_norm": 0.6754649877548218, + "learning_rate": 0.00017365714285714283, + "loss": 0.1187, + "step": 1475 + }, + { + "epoch": 8.457142857142857, + "grad_norm": 1.0067390203475952, + "learning_rate": 0.00017322857142857141, + "loss": 0.1401, + "step": 1480 + }, + { + "epoch": 8.485714285714286, + "grad_norm": 8.509544372558594, + "learning_rate": 0.00017279999999999997, + "loss": 0.1304, + "step": 1485 + }, + { + "epoch": 8.514285714285714, + "grad_norm": 4.2030205726623535, + "learning_rate": 0.00017237142857142858, + "loss": 0.121, + "step": 1490 + }, + { + "epoch": 8.542857142857143, + "grad_norm": 4.877438068389893, + "learning_rate": 0.00017194285714285714, + "loss": 0.1918, + "step": 1495 + }, + { + "epoch": 8.571428571428571, + "grad_norm": 6.4971232414245605, + "learning_rate": 0.0001715142857142857, + "loss": 0.2154, + "step": 1500 + } + ], + "logging_steps": 5, + "max_steps": 3500, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 200, + "trial_name": null, + "trial_params": null +} diff --git a/glot-contrastive-final-lora/checkpoint-1500/training_args.bin b/glot-contrastive-final-lora/checkpoint-1500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..504636d7db81a728b5309e791498748e16c3b8ec --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-1500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02a87dc6b2c67ad3df98065b9e8fa21d9d93cd2cb361c532cb83c8a37bdc81a3 +size 5777 diff --git a/glot-contrastive-final-lora/checkpoint-2000/README.md b/glot-contrastive-final-lora/checkpoint-2000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9e142ce83a38d62751eee08226da0acc7c10eae5 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-2000/README.md @@ -0,0 +1,206 @@ +--- +base_model: ./glot-mlm-adapted +library_name: peft +tags: +- base_model:adapter:./glot-mlm-adapted +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/glot-contrastive-final-lora/checkpoint-2000/adapter_config.json b/glot-contrastive-final-lora/checkpoint-2000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..846b0d8ef2ad7bc4b0b04decd5769b6250d7be73 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-2000/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./glot-mlm-adapted", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query", + "value" + ], + "target_parameters": null, + "task_type": "FEATURE_EXTRACTION", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/glot-contrastive-final-lora/checkpoint-2000/adapter_model.safetensors b/glot-contrastive-final-lora/checkpoint-2000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ce77fbf4791146bc293336e87f65f339e4c78599 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-2000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:711e07c24e31501f072e595cc3a3ab71fd99dfdb7b91db165f6ee74a84d23cd0 +size 2365824 diff --git a/glot-contrastive-final-lora/checkpoint-2000/optimizer.pt b/glot-contrastive-final-lora/checkpoint-2000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..715c563bfca0e12474e7e1ab3d806dfcf59b200f --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-2000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3498c655bb1206516340ad6a1a375f5542a3351919099d2fe49c4838bfe9533 +size 4760395 diff --git a/glot-contrastive-final-lora/checkpoint-2000/rng_state.pth b/glot-contrastive-final-lora/checkpoint-2000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..c7b203bcce35936bf21b078f9f5ec4070fea73ec --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-2000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e03ebb8df928308a6424f992063f5301b7d41a4785e5763346c3448dc6be8b4b +size 14645 diff --git a/glot-contrastive-final-lora/checkpoint-2000/scheduler.pt b/glot-contrastive-final-lora/checkpoint-2000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ebce6beffa1d7c57cbe076d7901f3b92c4904d4 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-2000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad5926187bcca6f27644c72ca9d33e1556220045488e2d905d0c7306c6d222dc +size 1465 diff --git a/glot-contrastive-final-lora/checkpoint-2000/sentencepiece.bpe.model b/glot-contrastive-final-lora/checkpoint-2000/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..41409bcc76133d6e92fdd5909093d24e32662ba0 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-2000/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a313a26470baedaede322622492f2a542aa41527ddc5d40de444e945ad3c613 +size 7658320 diff --git a/glot-contrastive-final-lora/checkpoint-2000/special_tokens_map.json b/glot-contrastive-final-lora/checkpoint-2000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..68171d1ff68b731a33d119708476692c094a466b --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-2000/special_tokens_map.json @@ -0,0 +1,15 @@ +{ + "bos_token": "", + "cls_token": "", + "eos_token": "", + "mask_token": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "sep_token": "", + "unk_token": "" +} diff --git a/glot-contrastive-final-lora/checkpoint-2000/tokenizer_config.json b/glot-contrastive-final-lora/checkpoint-2000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2d4a02cfefad7bc6d993905187616acfc547f17d --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-2000/tokenizer_config.json @@ -0,0 +1,57 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "401144": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "cls_token": "", + "eos_token": "", + "extra_special_tokens": {}, + "mask_token": "", + "model_max_length": 512, + "pad_token": "", + "sep_token": "", + "sp_model_kwargs": {}, + "tokenizer_class": "XLMRobertaTokenizer", + "unk_token": "", + "use_fast": true +} diff --git a/glot-contrastive-final-lora/checkpoint-2000/trainer_state.json b/glot-contrastive-final-lora/checkpoint-2000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fdef36940feea5968fda1cff13bd0c6ccdf187e6 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-2000/trainer_state.json @@ -0,0 +1,2834 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 11.428571428571429, + "eval_steps": 5, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02857142857142857, + "grad_norm": 0.1407003551721573, + "learning_rate": 0.00029965714285714283, + "loss": 0.9726, + "step": 5 + }, + { + "epoch": 0.05714285714285714, + "grad_norm": 0.26689061522483826, + "learning_rate": 0.0002992285714285714, + "loss": 0.9633, + "step": 10 + }, + { + "epoch": 0.08571428571428572, + "grad_norm": 0.8670485615730286, + "learning_rate": 0.0002988, + "loss": 0.9013, + "step": 15 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 0.9785467386245728, + "learning_rate": 0.00029837142857142853, + "loss": 0.6942, + "step": 20 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 1.3083932399749756, + "learning_rate": 0.0002979428571428571, + "loss": 0.4472, + "step": 25 + }, + { + "epoch": 0.17142857142857143, + "grad_norm": 1.6103293895721436, + "learning_rate": 0.0002975142857142857, + "loss": 0.3782, + "step": 30 + }, + { + "epoch": 0.2, + "grad_norm": 2.6353416442871094, + "learning_rate": 0.0002970857142857143, + "loss": 0.3732, + "step": 35 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 0.9949072003364563, + "learning_rate": 0.0002966571428571428, + "loss": 0.3506, + "step": 40 + }, + { + "epoch": 0.2571428571428571, + "grad_norm": 1.280673861503601, + "learning_rate": 0.0002962285714285714, + "loss": 0.3346, + "step": 45 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.7681456208229065, + "learning_rate": 0.0002958, + "loss": 0.2832, + "step": 50 + }, + { + "epoch": 0.3142857142857143, + "grad_norm": 1.0000813007354736, + "learning_rate": 0.0002953714285714285, + "loss": 0.2603, + "step": 55 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 1.0222399234771729, + "learning_rate": 0.0002949428571428571, + "loss": 0.2507, + "step": 60 + }, + { + "epoch": 0.37142857142857144, + "grad_norm": 0.896902322769165, + "learning_rate": 0.0002945142857142857, + "loss": 0.2556, + "step": 65 + }, + { + "epoch": 0.4, + "grad_norm": 0.9035541415214539, + "learning_rate": 0.00029408571428571426, + "loss": 0.2402, + "step": 70 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 1.4886469841003418, + "learning_rate": 0.00029365714285714285, + "loss": 0.2376, + "step": 75 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.8951187133789062, + "learning_rate": 0.0002932285714285714, + "loss": 0.2276, + "step": 80 + }, + { + "epoch": 0.4857142857142857, + "grad_norm": 0.7876377105712891, + "learning_rate": 0.00029279999999999996, + "loss": 0.2537, + "step": 85 + }, + { + "epoch": 0.5142857142857142, + "grad_norm": 1.0927226543426514, + "learning_rate": 0.00029237142857142855, + "loss": 0.2152, + "step": 90 + }, + { + "epoch": 0.5428571428571428, + "grad_norm": 1.4946355819702148, + "learning_rate": 0.00029194285714285713, + "loss": 0.2441, + "step": 95 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.7082991600036621, + "learning_rate": 0.0002915142857142857, + "loss": 0.2708, + "step": 100 + }, + { + "epoch": 0.6, + "grad_norm": 0.670010507106781, + "learning_rate": 0.00029108571428571424, + "loss": 0.2396, + "step": 105 + }, + { + "epoch": 0.6285714285714286, + "grad_norm": 0.9797312021255493, + "learning_rate": 0.00029065714285714283, + "loss": 0.2275, + "step": 110 + }, + { + "epoch": 0.6571428571428571, + "grad_norm": 1.5220463275909424, + "learning_rate": 0.0002902285714285714, + "loss": 0.2114, + "step": 115 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 1.3326867818832397, + "learning_rate": 0.00028979999999999994, + "loss": 0.241, + "step": 120 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 1.1195529699325562, + "learning_rate": 0.0002893714285714285, + "loss": 0.2389, + "step": 125 + }, + { + "epoch": 0.7428571428571429, + "grad_norm": 0.7551061511039734, + "learning_rate": 0.0002889428571428571, + "loss": 0.2162, + "step": 130 + }, + { + "epoch": 0.7714285714285715, + "grad_norm": 1.018908977508545, + "learning_rate": 0.0002885142857142857, + "loss": 0.1924, + "step": 135 + }, + { + "epoch": 0.8, + "grad_norm": 2.123642921447754, + "learning_rate": 0.0002880857142857143, + "loss": 0.2174, + "step": 140 + }, + { + "epoch": 0.8285714285714286, + "grad_norm": 0.7585068941116333, + "learning_rate": 0.0002876571428571428, + "loss": 0.2006, + "step": 145 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 1.64150869846344, + "learning_rate": 0.0002872285714285714, + "loss": 0.1905, + "step": 150 + }, + { + "epoch": 0.8857142857142857, + "grad_norm": 0.9126951694488525, + "learning_rate": 0.0002868, + "loss": 0.2312, + "step": 155 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 0.7278801202774048, + "learning_rate": 0.00028637142857142856, + "loss": 0.2077, + "step": 160 + }, + { + "epoch": 0.9428571428571428, + "grad_norm": 0.8931339383125305, + "learning_rate": 0.00028594285714285715, + "loss": 0.1951, + "step": 165 + }, + { + "epoch": 0.9714285714285714, + "grad_norm": 1.0831843614578247, + "learning_rate": 0.0002855142857142857, + "loss": 0.2103, + "step": 170 + }, + { + "epoch": 1.0, + "grad_norm": 1.3750063180923462, + "learning_rate": 0.00028508571428571426, + "loss": 0.2396, + "step": 175 + }, + { + "epoch": 1.0285714285714285, + "grad_norm": 0.8338337540626526, + "learning_rate": 0.00028465714285714285, + "loss": 0.2404, + "step": 180 + }, + { + "epoch": 1.0571428571428572, + "grad_norm": 1.2879024744033813, + "learning_rate": 0.0002842285714285714, + "loss": 0.2117, + "step": 185 + }, + { + "epoch": 1.0857142857142856, + "grad_norm": 1.6751821041107178, + "learning_rate": 0.00028379999999999996, + "loss": 0.1796, + "step": 190 + }, + { + "epoch": 1.1142857142857143, + "grad_norm": 0.9864417910575867, + "learning_rate": 0.00028337142857142854, + "loss": 0.1993, + "step": 195 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 1.0174155235290527, + "learning_rate": 0.00028294285714285713, + "loss": 0.2068, + "step": 200 + }, + { + "epoch": 1.1714285714285715, + "grad_norm": 1.029832124710083, + "learning_rate": 0.0002825142857142857, + "loss": 0.2015, + "step": 205 + }, + { + "epoch": 1.2, + "grad_norm": 0.7745446562767029, + "learning_rate": 0.00028208571428571424, + "loss": 0.2129, + "step": 210 + }, + { + "epoch": 1.2285714285714286, + "grad_norm": 2.5578622817993164, + "learning_rate": 0.0002816571428571428, + "loss": 0.2224, + "step": 215 + }, + { + "epoch": 1.2571428571428571, + "grad_norm": 2.4185051918029785, + "learning_rate": 0.0002812285714285714, + "loss": 0.2276, + "step": 220 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 1.4176461696624756, + "learning_rate": 0.0002808, + "loss": 0.1781, + "step": 225 + }, + { + "epoch": 1.3142857142857143, + "grad_norm": 0.709326982498169, + "learning_rate": 0.0002803714285714286, + "loss": 0.2177, + "step": 230 + }, + { + "epoch": 1.342857142857143, + "grad_norm": 0.8170766830444336, + "learning_rate": 0.0002799428571428571, + "loss": 0.1769, + "step": 235 + }, + { + "epoch": 1.3714285714285714, + "grad_norm": 1.3850761651992798, + "learning_rate": 0.0002795142857142857, + "loss": 0.2262, + "step": 240 + }, + { + "epoch": 1.4, + "grad_norm": 1.0064373016357422, + "learning_rate": 0.0002790857142857143, + "loss": 0.196, + "step": 245 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 1.9635728597640991, + "learning_rate": 0.0002786571428571428, + "loss": 0.2029, + "step": 250 + }, + { + "epoch": 1.457142857142857, + "grad_norm": 16.20791244506836, + "learning_rate": 0.0002782285714285714, + "loss": 0.3925, + "step": 255 + }, + { + "epoch": 1.4857142857142858, + "grad_norm": 1.4363322257995605, + "learning_rate": 0.0002778, + "loss": 0.3684, + "step": 260 + }, + { + "epoch": 1.5142857142857142, + "grad_norm": 0.9379534721374512, + "learning_rate": 0.00027737142857142856, + "loss": 0.2265, + "step": 265 + }, + { + "epoch": 1.5428571428571427, + "grad_norm": 0.8453512787818909, + "learning_rate": 0.00027694285714285714, + "loss": 0.1976, + "step": 270 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 2.316664695739746, + "learning_rate": 0.0002765142857142857, + "loss": 0.23, + "step": 275 + }, + { + "epoch": 1.6, + "grad_norm": 1.0548444986343384, + "learning_rate": 0.00027608571428571426, + "loss": 0.1823, + "step": 280 + }, + { + "epoch": 1.6285714285714286, + "grad_norm": 3.7894928455352783, + "learning_rate": 0.00027565714285714284, + "loss": 0.1962, + "step": 285 + }, + { + "epoch": 1.657142857142857, + "grad_norm": 2.3081610202789307, + "learning_rate": 0.00027522857142857143, + "loss": 0.2087, + "step": 290 + }, + { + "epoch": 1.6857142857142857, + "grad_norm": 0.9311438202857971, + "learning_rate": 0.0002748, + "loss": 0.1597, + "step": 295 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 1.1881247758865356, + "learning_rate": 0.00027437142857142854, + "loss": 0.1764, + "step": 300 + }, + { + "epoch": 1.7428571428571429, + "grad_norm": 1.30265212059021, + "learning_rate": 0.0002739428571428571, + "loss": 0.1647, + "step": 305 + }, + { + "epoch": 1.7714285714285714, + "grad_norm": 0.6832175850868225, + "learning_rate": 0.0002735142857142857, + "loss": 0.1638, + "step": 310 + }, + { + "epoch": 1.8, + "grad_norm": 1.8740538358688354, + "learning_rate": 0.00027308571428571424, + "loss": 0.1803, + "step": 315 + }, + { + "epoch": 1.8285714285714287, + "grad_norm": 9.821504592895508, + "learning_rate": 0.0002726571428571428, + "loss": 0.226, + "step": 320 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 1.0889750719070435, + "learning_rate": 0.0002722285714285714, + "loss": 0.1822, + "step": 325 + }, + { + "epoch": 1.8857142857142857, + "grad_norm": 0.9660868048667908, + "learning_rate": 0.0002718, + "loss": 0.1842, + "step": 330 + }, + { + "epoch": 1.9142857142857141, + "grad_norm": 0.6329234838485718, + "learning_rate": 0.0002713714285714286, + "loss": 0.1488, + "step": 335 + }, + { + "epoch": 1.9428571428571428, + "grad_norm": 3.601266384124756, + "learning_rate": 0.0002709428571428571, + "loss": 0.1887, + "step": 340 + }, + { + "epoch": 1.9714285714285715, + "grad_norm": 1.1441439390182495, + "learning_rate": 0.0002705142857142857, + "loss": 0.184, + "step": 345 + }, + { + "epoch": 2.0, + "grad_norm": 0.8586034774780273, + "learning_rate": 0.0002700857142857143, + "loss": 0.1578, + "step": 350 + }, + { + "epoch": 2.0285714285714285, + "grad_norm": 1.5113487243652344, + "learning_rate": 0.00026965714285714286, + "loss": 0.2002, + "step": 355 + }, + { + "epoch": 2.057142857142857, + "grad_norm": 1.1123011112213135, + "learning_rate": 0.0002692285714285714, + "loss": 0.1946, + "step": 360 + }, + { + "epoch": 2.085714285714286, + "grad_norm": 0.9377036094665527, + "learning_rate": 0.0002688, + "loss": 0.1971, + "step": 365 + }, + { + "epoch": 2.1142857142857143, + "grad_norm": 0.6956892609596252, + "learning_rate": 0.00026837142857142856, + "loss": 0.1758, + "step": 370 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.7510782480239868, + "learning_rate": 0.0002679428571428571, + "loss": 0.1674, + "step": 375 + }, + { + "epoch": 2.1714285714285713, + "grad_norm": 0.7009285092353821, + "learning_rate": 0.00026751428571428567, + "loss": 0.1945, + "step": 380 + }, + { + "epoch": 2.2, + "grad_norm": 0.9555609822273254, + "learning_rate": 0.00026708571428571426, + "loss": 0.1857, + "step": 385 + }, + { + "epoch": 2.2285714285714286, + "grad_norm": 2.133979082107544, + "learning_rate": 0.00026665714285714284, + "loss": 0.1636, + "step": 390 + }, + { + "epoch": 2.257142857142857, + "grad_norm": 0.7105309963226318, + "learning_rate": 0.0002662285714285714, + "loss": 0.2014, + "step": 395 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.7329701781272888, + "learning_rate": 0.00026579999999999996, + "loss": 0.1884, + "step": 400 + }, + { + "epoch": 2.314285714285714, + "grad_norm": 1.0426994562149048, + "learning_rate": 0.00026537142857142854, + "loss": 0.1558, + "step": 405 + }, + { + "epoch": 2.342857142857143, + "grad_norm": 0.9306122660636902, + "learning_rate": 0.0002649428571428571, + "loss": 0.1774, + "step": 410 + }, + { + "epoch": 2.3714285714285714, + "grad_norm": 0.6989394426345825, + "learning_rate": 0.00026451428571428565, + "loss": 0.1601, + "step": 415 + }, + { + "epoch": 2.4, + "grad_norm": 1.4383760690689087, + "learning_rate": 0.0002640857142857143, + "loss": 0.1564, + "step": 420 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 0.6448336839675903, + "learning_rate": 0.0002636571428571428, + "loss": 0.1827, + "step": 425 + }, + { + "epoch": 2.4571428571428573, + "grad_norm": 0.9535760879516602, + "learning_rate": 0.0002632285714285714, + "loss": 0.1713, + "step": 430 + }, + { + "epoch": 2.4857142857142858, + "grad_norm": 1.034945011138916, + "learning_rate": 0.0002628, + "loss": 0.1457, + "step": 435 + }, + { + "epoch": 2.5142857142857142, + "grad_norm": 1.3225128650665283, + "learning_rate": 0.0002623714285714285, + "loss": 0.1633, + "step": 440 + }, + { + "epoch": 2.5428571428571427, + "grad_norm": 0.8285059928894043, + "learning_rate": 0.0002619428571428571, + "loss": 0.2004, + "step": 445 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.773176908493042, + "learning_rate": 0.0002615142857142857, + "loss": 0.1641, + "step": 450 + }, + { + "epoch": 2.6, + "grad_norm": 0.7964853048324585, + "learning_rate": 0.0002610857142857143, + "loss": 0.1608, + "step": 455 + }, + { + "epoch": 2.6285714285714286, + "grad_norm": 1.0967328548431396, + "learning_rate": 0.00026065714285714286, + "loss": 0.1697, + "step": 460 + }, + { + "epoch": 2.657142857142857, + "grad_norm": 0.6462066173553467, + "learning_rate": 0.0002602285714285714, + "loss": 0.1512, + "step": 465 + }, + { + "epoch": 2.685714285714286, + "grad_norm": 0.8765937089920044, + "learning_rate": 0.00025979999999999997, + "loss": 0.1826, + "step": 470 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 1.2524124383926392, + "learning_rate": 0.00025937142857142856, + "loss": 0.1731, + "step": 475 + }, + { + "epoch": 2.742857142857143, + "grad_norm": 2.2982606887817383, + "learning_rate": 0.0002589428571428571, + "loss": 0.1852, + "step": 480 + }, + { + "epoch": 2.7714285714285714, + "grad_norm": 0.9989053010940552, + "learning_rate": 0.0002585142857142857, + "loss": 0.1791, + "step": 485 + }, + { + "epoch": 2.8, + "grad_norm": 0.772343635559082, + "learning_rate": 0.00025808571428571426, + "loss": 0.1862, + "step": 490 + }, + { + "epoch": 2.8285714285714287, + "grad_norm": 1.2101136445999146, + "learning_rate": 0.00025765714285714284, + "loss": 0.1806, + "step": 495 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.8010189533233643, + "learning_rate": 0.0002572285714285714, + "loss": 0.1842, + "step": 500 + }, + { + "epoch": 2.8857142857142857, + "grad_norm": 1.3597544431686401, + "learning_rate": 0.00025679999999999995, + "loss": 0.1583, + "step": 505 + }, + { + "epoch": 2.914285714285714, + "grad_norm": 0.8790671825408936, + "learning_rate": 0.00025637142857142854, + "loss": 0.1565, + "step": 510 + }, + { + "epoch": 2.942857142857143, + "grad_norm": 1.1175066232681274, + "learning_rate": 0.0002559428571428571, + "loss": 0.1406, + "step": 515 + }, + { + "epoch": 2.9714285714285715, + "grad_norm": 2.8528785705566406, + "learning_rate": 0.0002555142857142857, + "loss": 0.1735, + "step": 520 + }, + { + "epoch": 3.0, + "grad_norm": 2.2073328495025635, + "learning_rate": 0.0002550857142857143, + "loss": 0.1816, + "step": 525 + }, + { + "epoch": 3.0285714285714285, + "grad_norm": 11.01322078704834, + "learning_rate": 0.0002546571428571428, + "loss": 0.1873, + "step": 530 + }, + { + "epoch": 3.057142857142857, + "grad_norm": 1.5822402238845825, + "learning_rate": 0.0002542285714285714, + "loss": 0.168, + "step": 535 + }, + { + "epoch": 3.085714285714286, + "grad_norm": 1.3086942434310913, + "learning_rate": 0.0002538, + "loss": 0.149, + "step": 540 + }, + { + "epoch": 3.1142857142857143, + "grad_norm": 6.303041458129883, + "learning_rate": 0.0002533714285714285, + "loss": 0.1651, + "step": 545 + }, + { + "epoch": 3.142857142857143, + "grad_norm": 14.48929500579834, + "learning_rate": 0.00025294285714285716, + "loss": 0.1687, + "step": 550 + }, + { + "epoch": 3.1714285714285713, + "grad_norm": 6.824525356292725, + "learning_rate": 0.0002525142857142857, + "loss": 0.1919, + "step": 555 + }, + { + "epoch": 3.2, + "grad_norm": 18.772563934326172, + "learning_rate": 0.00025208571428571427, + "loss": 0.2075, + "step": 560 + }, + { + "epoch": 3.2285714285714286, + "grad_norm": 0.7268752455711365, + "learning_rate": 0.00025165714285714286, + "loss": 0.174, + "step": 565 + }, + { + "epoch": 3.257142857142857, + "grad_norm": 1.1301453113555908, + "learning_rate": 0.0002512285714285714, + "loss": 0.1668, + "step": 570 + }, + { + "epoch": 3.2857142857142856, + "grad_norm": 2.846802234649658, + "learning_rate": 0.00025079999999999997, + "loss": 0.1645, + "step": 575 + }, + { + "epoch": 3.314285714285714, + "grad_norm": 1.417515754699707, + "learning_rate": 0.00025037142857142855, + "loss": 0.1719, + "step": 580 + }, + { + "epoch": 3.342857142857143, + "grad_norm": 4.137150764465332, + "learning_rate": 0.00024994285714285714, + "loss": 0.1739, + "step": 585 + }, + { + "epoch": 3.3714285714285714, + "grad_norm": 2.6067259311676025, + "learning_rate": 0.0002495142857142857, + "loss": 0.1489, + "step": 590 + }, + { + "epoch": 3.4, + "grad_norm": 2.601024627685547, + "learning_rate": 0.00024908571428571425, + "loss": 0.1618, + "step": 595 + }, + { + "epoch": 3.4285714285714284, + "grad_norm": 3.849017858505249, + "learning_rate": 0.00024865714285714284, + "loss": 0.1899, + "step": 600 + }, + { + "epoch": 3.4571428571428573, + "grad_norm": 4.673766136169434, + "learning_rate": 0.0002482285714285714, + "loss": 0.1761, + "step": 605 + }, + { + "epoch": 3.4857142857142858, + "grad_norm": 2.6057631969451904, + "learning_rate": 0.00024779999999999995, + "loss": 0.1743, + "step": 610 + }, + { + "epoch": 3.5142857142857142, + "grad_norm": 2.932652473449707, + "learning_rate": 0.0002473714285714286, + "loss": 0.1482, + "step": 615 + }, + { + "epoch": 3.5428571428571427, + "grad_norm": 0.8764939308166504, + "learning_rate": 0.0002469428571428571, + "loss": 0.1644, + "step": 620 + }, + { + "epoch": 3.571428571428571, + "grad_norm": 1.3203191757202148, + "learning_rate": 0.0002465142857142857, + "loss": 0.1654, + "step": 625 + }, + { + "epoch": 3.6, + "grad_norm": 0.7977635264396667, + "learning_rate": 0.0002460857142857143, + "loss": 0.1472, + "step": 630 + }, + { + "epoch": 3.6285714285714286, + "grad_norm": 1.4750248193740845, + "learning_rate": 0.0002456571428571428, + "loss": 0.1735, + "step": 635 + }, + { + "epoch": 3.657142857142857, + "grad_norm": 1.8164482116699219, + "learning_rate": 0.0002452285714285714, + "loss": 0.1593, + "step": 640 + }, + { + "epoch": 3.685714285714286, + "grad_norm": 1.4829603433609009, + "learning_rate": 0.0002448, + "loss": 0.1508, + "step": 645 + }, + { + "epoch": 3.7142857142857144, + "grad_norm": 0.8828144669532776, + "learning_rate": 0.00024437142857142857, + "loss": 0.1573, + "step": 650 + }, + { + "epoch": 3.742857142857143, + "grad_norm": 2.039384126663208, + "learning_rate": 0.00024394285714285713, + "loss": 0.1745, + "step": 655 + }, + { + "epoch": 3.7714285714285714, + "grad_norm": 0.9604200720787048, + "learning_rate": 0.00024351428571428569, + "loss": 0.17, + "step": 660 + }, + { + "epoch": 3.8, + "grad_norm": 0.7903971076011658, + "learning_rate": 0.00024308571428571427, + "loss": 0.1654, + "step": 665 + }, + { + "epoch": 3.8285714285714287, + "grad_norm": 0.6935649514198303, + "learning_rate": 0.00024265714285714283, + "loss": 0.1714, + "step": 670 + }, + { + "epoch": 3.857142857142857, + "grad_norm": 0.5832012295722961, + "learning_rate": 0.00024222857142857138, + "loss": 0.1636, + "step": 675 + }, + { + "epoch": 3.8857142857142857, + "grad_norm": 0.6303168535232544, + "learning_rate": 0.0002418, + "loss": 0.1604, + "step": 680 + }, + { + "epoch": 3.914285714285714, + "grad_norm": 0.7210885882377625, + "learning_rate": 0.00024137142857142855, + "loss": 0.1444, + "step": 685 + }, + { + "epoch": 3.942857142857143, + "grad_norm": 0.7690990567207336, + "learning_rate": 0.00024094285714285714, + "loss": 0.1631, + "step": 690 + }, + { + "epoch": 3.9714285714285715, + "grad_norm": 1.0142720937728882, + "learning_rate": 0.0002405142857142857, + "loss": 0.158, + "step": 695 + }, + { + "epoch": 4.0, + "grad_norm": 0.7970322966575623, + "learning_rate": 0.00024008571428571425, + "loss": 0.1803, + "step": 700 + }, + { + "epoch": 4.0285714285714285, + "grad_norm": 0.6795914769172668, + "learning_rate": 0.00023965714285714284, + "loss": 0.143, + "step": 705 + }, + { + "epoch": 4.057142857142857, + "grad_norm": 0.6832629442214966, + "learning_rate": 0.0002392285714285714, + "loss": 0.1457, + "step": 710 + }, + { + "epoch": 4.085714285714285, + "grad_norm": 3.8629798889160156, + "learning_rate": 0.0002388, + "loss": 0.1671, + "step": 715 + }, + { + "epoch": 4.114285714285714, + "grad_norm": 1.1167882680892944, + "learning_rate": 0.00023837142857142856, + "loss": 0.1544, + "step": 720 + }, + { + "epoch": 4.142857142857143, + "grad_norm": 0.9431412816047668, + "learning_rate": 0.00023794285714285712, + "loss": 0.1605, + "step": 725 + }, + { + "epoch": 4.171428571428572, + "grad_norm": 1.310948133468628, + "learning_rate": 0.0002375142857142857, + "loss": 0.1121, + "step": 730 + }, + { + "epoch": 4.2, + "grad_norm": 0.9830737709999084, + "learning_rate": 0.00023708571428571426, + "loss": 0.1742, + "step": 735 + }, + { + "epoch": 4.228571428571429, + "grad_norm": 0.6166555881500244, + "learning_rate": 0.00023665714285714282, + "loss": 0.1525, + "step": 740 + }, + { + "epoch": 4.257142857142857, + "grad_norm": 0.995579719543457, + "learning_rate": 0.00023622857142857143, + "loss": 0.1439, + "step": 745 + }, + { + "epoch": 4.285714285714286, + "grad_norm": 0.639796793460846, + "learning_rate": 0.00023579999999999999, + "loss": 0.1692, + "step": 750 + }, + { + "epoch": 4.314285714285714, + "grad_norm": 0.9438050389289856, + "learning_rate": 0.00023537142857142854, + "loss": 0.1785, + "step": 755 + }, + { + "epoch": 4.3428571428571425, + "grad_norm": 0.8960750102996826, + "learning_rate": 0.00023494285714285713, + "loss": 0.1557, + "step": 760 + }, + { + "epoch": 4.371428571428572, + "grad_norm": 0.6287499070167542, + "learning_rate": 0.00023451428571428568, + "loss": 0.1459, + "step": 765 + }, + { + "epoch": 4.4, + "grad_norm": 0.7638295888900757, + "learning_rate": 0.00023408571428571424, + "loss": 0.1341, + "step": 770 + }, + { + "epoch": 4.428571428571429, + "grad_norm": 0.655878484249115, + "learning_rate": 0.00023365714285714283, + "loss": 0.1358, + "step": 775 + }, + { + "epoch": 4.457142857142857, + "grad_norm": 0.5840997695922852, + "learning_rate": 0.0002332285714285714, + "loss": 0.1386, + "step": 780 + }, + { + "epoch": 4.485714285714286, + "grad_norm": 1.1082488298416138, + "learning_rate": 0.0002328, + "loss": 0.1827, + "step": 785 + }, + { + "epoch": 4.514285714285714, + "grad_norm": 0.8825240135192871, + "learning_rate": 0.00023237142857142855, + "loss": 0.1527, + "step": 790 + }, + { + "epoch": 4.542857142857143, + "grad_norm": 0.6752304434776306, + "learning_rate": 0.0002319428571428571, + "loss": 0.1392, + "step": 795 + }, + { + "epoch": 4.571428571428571, + "grad_norm": 1.1423301696777344, + "learning_rate": 0.0002315142857142857, + "loss": 0.1433, + "step": 800 + }, + { + "epoch": 4.6, + "grad_norm": 10.793691635131836, + "learning_rate": 0.00023108571428571425, + "loss": 0.1635, + "step": 805 + }, + { + "epoch": 4.628571428571428, + "grad_norm": 0.47564294934272766, + "learning_rate": 0.00023065714285714286, + "loss": 0.1199, + "step": 810 + }, + { + "epoch": 4.6571428571428575, + "grad_norm": 1.2492656707763672, + "learning_rate": 0.00023022857142857142, + "loss": 0.1488, + "step": 815 + }, + { + "epoch": 4.685714285714286, + "grad_norm": 0.6933501958847046, + "learning_rate": 0.00022979999999999997, + "loss": 0.1812, + "step": 820 + }, + { + "epoch": 4.714285714285714, + "grad_norm": 0.7901633977890015, + "learning_rate": 0.00022937142857142856, + "loss": 0.1415, + "step": 825 + }, + { + "epoch": 4.742857142857143, + "grad_norm": 0.7854829430580139, + "learning_rate": 0.00022894285714285712, + "loss": 0.1401, + "step": 830 + }, + { + "epoch": 4.771428571428571, + "grad_norm": 0.8716740608215332, + "learning_rate": 0.00022851428571428567, + "loss": 0.1982, + "step": 835 + }, + { + "epoch": 4.8, + "grad_norm": 0.7047899961471558, + "learning_rate": 0.00022808571428571426, + "loss": 0.1624, + "step": 840 + }, + { + "epoch": 4.828571428571428, + "grad_norm": 0.7134959697723389, + "learning_rate": 0.00022765714285714284, + "loss": 0.1375, + "step": 845 + }, + { + "epoch": 4.857142857142857, + "grad_norm": 1.0897325277328491, + "learning_rate": 0.00022722857142857143, + "loss": 0.1489, + "step": 850 + }, + { + "epoch": 4.885714285714286, + "grad_norm": 1.1065207719802856, + "learning_rate": 0.00022679999999999998, + "loss": 0.1495, + "step": 855 + }, + { + "epoch": 4.914285714285715, + "grad_norm": 0.7434757351875305, + "learning_rate": 0.00022637142857142854, + "loss": 0.1507, + "step": 860 + }, + { + "epoch": 4.942857142857143, + "grad_norm": 1.0045181512832642, + "learning_rate": 0.00022594285714285712, + "loss": 0.1527, + "step": 865 + }, + { + "epoch": 4.9714285714285715, + "grad_norm": 1.2025654315948486, + "learning_rate": 0.00022551428571428568, + "loss": 0.1523, + "step": 870 + }, + { + "epoch": 5.0, + "grad_norm": 0.7823342084884644, + "learning_rate": 0.0002250857142857143, + "loss": 0.1514, + "step": 875 + }, + { + "epoch": 5.0285714285714285, + "grad_norm": 0.8405362963676453, + "learning_rate": 0.00022465714285714285, + "loss": 0.1461, + "step": 880 + }, + { + "epoch": 5.057142857142857, + "grad_norm": 0.7527463436126709, + "learning_rate": 0.0002242285714285714, + "loss": 0.1206, + "step": 885 + }, + { + "epoch": 5.085714285714285, + "grad_norm": 0.8372548222541809, + "learning_rate": 0.0002238, + "loss": 0.1513, + "step": 890 + }, + { + "epoch": 5.114285714285714, + "grad_norm": 0.8755456209182739, + "learning_rate": 0.00022337142857142855, + "loss": 0.1498, + "step": 895 + }, + { + "epoch": 5.142857142857143, + "grad_norm": 0.7312084436416626, + "learning_rate": 0.0002229428571428571, + "loss": 0.154, + "step": 900 + }, + { + "epoch": 5.171428571428572, + "grad_norm": 0.6366221904754639, + "learning_rate": 0.0002225142857142857, + "loss": 0.1466, + "step": 905 + }, + { + "epoch": 5.2, + "grad_norm": 0.6406880617141724, + "learning_rate": 0.00022208571428571427, + "loss": 0.1254, + "step": 910 + }, + { + "epoch": 5.228571428571429, + "grad_norm": 2.4106833934783936, + "learning_rate": 0.00022165714285714283, + "loss": 0.1534, + "step": 915 + }, + { + "epoch": 5.257142857142857, + "grad_norm": 0.5635722279548645, + "learning_rate": 0.00022122857142857142, + "loss": 0.1461, + "step": 920 + }, + { + "epoch": 5.285714285714286, + "grad_norm": 0.787162184715271, + "learning_rate": 0.00022079999999999997, + "loss": 0.1424, + "step": 925 + }, + { + "epoch": 5.314285714285714, + "grad_norm": 0.6513975262641907, + "learning_rate": 0.00022037142857142853, + "loss": 0.1326, + "step": 930 + }, + { + "epoch": 5.3428571428571425, + "grad_norm": 0.6933534741401672, + "learning_rate": 0.00021994285714285711, + "loss": 0.1661, + "step": 935 + }, + { + "epoch": 5.371428571428572, + "grad_norm": 0.7263259887695312, + "learning_rate": 0.0002195142857142857, + "loss": 0.15, + "step": 940 + }, + { + "epoch": 5.4, + "grad_norm": 0.5537381768226624, + "learning_rate": 0.00021908571428571428, + "loss": 0.129, + "step": 945 + }, + { + "epoch": 5.428571428571429, + "grad_norm": 0.6014005541801453, + "learning_rate": 0.00021865714285714284, + "loss": 0.1321, + "step": 950 + }, + { + "epoch": 5.457142857142857, + "grad_norm": 0.6581441760063171, + "learning_rate": 0.0002182285714285714, + "loss": 0.1587, + "step": 955 + }, + { + "epoch": 5.485714285714286, + "grad_norm": 0.9326379895210266, + "learning_rate": 0.00021779999999999998, + "loss": 0.1654, + "step": 960 + }, + { + "epoch": 5.514285714285714, + "grad_norm": 0.9438592791557312, + "learning_rate": 0.00021737142857142854, + "loss": 0.1212, + "step": 965 + }, + { + "epoch": 5.542857142857143, + "grad_norm": 0.7699571251869202, + "learning_rate": 0.00021694285714285715, + "loss": 0.1464, + "step": 970 + }, + { + "epoch": 5.571428571428571, + "grad_norm": 0.8758366703987122, + "learning_rate": 0.0002165142857142857, + "loss": 0.1599, + "step": 975 + }, + { + "epoch": 5.6, + "grad_norm": 0.6101442575454712, + "learning_rate": 0.00021608571428571426, + "loss": 0.1589, + "step": 980 + }, + { + "epoch": 5.628571428571428, + "grad_norm": 0.7454060912132263, + "learning_rate": 0.00021565714285714285, + "loss": 0.1433, + "step": 985 + }, + { + "epoch": 5.6571428571428575, + "grad_norm": 0.6379484534263611, + "learning_rate": 0.0002152285714285714, + "loss": 0.1592, + "step": 990 + }, + { + "epoch": 5.685714285714286, + "grad_norm": 1.1601309776306152, + "learning_rate": 0.00021479999999999996, + "loss": 0.1647, + "step": 995 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 0.5464673638343811, + "learning_rate": 0.00021437142857142855, + "loss": 0.1469, + "step": 1000 + }, + { + "epoch": 5.742857142857143, + "grad_norm": 1.0279319286346436, + "learning_rate": 0.00021394285714285713, + "loss": 0.1203, + "step": 1005 + }, + { + "epoch": 5.771428571428571, + "grad_norm": 0.5503718256950378, + "learning_rate": 0.00021351428571428572, + "loss": 0.1409, + "step": 1010 + }, + { + "epoch": 5.8, + "grad_norm": 0.6123886108398438, + "learning_rate": 0.00021308571428571427, + "loss": 0.1427, + "step": 1015 + }, + { + "epoch": 5.828571428571428, + "grad_norm": 0.6560390591621399, + "learning_rate": 0.00021265714285714283, + "loss": 0.1415, + "step": 1020 + }, + { + "epoch": 5.857142857142857, + "grad_norm": 0.5576716661453247, + "learning_rate": 0.00021222857142857141, + "loss": 0.1408, + "step": 1025 + }, + { + "epoch": 5.885714285714286, + "grad_norm": 0.6419074535369873, + "learning_rate": 0.00021179999999999997, + "loss": 0.1385, + "step": 1030 + }, + { + "epoch": 5.914285714285715, + "grad_norm": 1.008925199508667, + "learning_rate": 0.00021137142857142858, + "loss": 0.1497, + "step": 1035 + }, + { + "epoch": 5.942857142857143, + "grad_norm": 0.6559906005859375, + "learning_rate": 0.00021094285714285714, + "loss": 0.1218, + "step": 1040 + }, + { + "epoch": 5.9714285714285715, + "grad_norm": 0.627164363861084, + "learning_rate": 0.0002105142857142857, + "loss": 0.1368, + "step": 1045 + }, + { + "epoch": 6.0, + "grad_norm": 0.5760972499847412, + "learning_rate": 0.00021008571428571428, + "loss": 0.1508, + "step": 1050 + }, + { + "epoch": 6.0285714285714285, + "grad_norm": 0.5754174590110779, + "learning_rate": 0.00020965714285714284, + "loss": 0.1181, + "step": 1055 + }, + { + "epoch": 6.057142857142857, + "grad_norm": 0.8736348748207092, + "learning_rate": 0.0002092285714285714, + "loss": 0.1252, + "step": 1060 + }, + { + "epoch": 6.085714285714285, + "grad_norm": 0.7166719436645508, + "learning_rate": 0.00020879999999999998, + "loss": 0.1481, + "step": 1065 + }, + { + "epoch": 6.114285714285714, + "grad_norm": 0.6494349241256714, + "learning_rate": 0.00020837142857142856, + "loss": 0.1478, + "step": 1070 + }, + { + "epoch": 6.142857142857143, + "grad_norm": 0.6681587100028992, + "learning_rate": 0.00020794285714285712, + "loss": 0.1488, + "step": 1075 + }, + { + "epoch": 6.171428571428572, + "grad_norm": 0.7123684883117676, + "learning_rate": 0.0002075142857142857, + "loss": 0.1378, + "step": 1080 + }, + { + "epoch": 6.2, + "grad_norm": 0.6146950721740723, + "learning_rate": 0.00020708571428571426, + "loss": 0.1306, + "step": 1085 + }, + { + "epoch": 6.228571428571429, + "grad_norm": 0.8402445912361145, + "learning_rate": 0.00020665714285714282, + "loss": 0.1063, + "step": 1090 + }, + { + "epoch": 6.257142857142857, + "grad_norm": 0.6567764282226562, + "learning_rate": 0.0002062285714285714, + "loss": 0.1195, + "step": 1095 + }, + { + "epoch": 6.285714285714286, + "grad_norm": 0.6006014943122864, + "learning_rate": 0.0002058, + "loss": 0.1542, + "step": 1100 + }, + { + "epoch": 6.314285714285714, + "grad_norm": 0.793100893497467, + "learning_rate": 0.00020537142857142857, + "loss": 0.1381, + "step": 1105 + }, + { + "epoch": 6.3428571428571425, + "grad_norm": 0.5923666954040527, + "learning_rate": 0.00020494285714285713, + "loss": 0.1386, + "step": 1110 + }, + { + "epoch": 6.371428571428572, + "grad_norm": 0.6692521572113037, + "learning_rate": 0.0002045142857142857, + "loss": 0.1223, + "step": 1115 + }, + { + "epoch": 6.4, + "grad_norm": 0.7216306328773499, + "learning_rate": 0.00020408571428571427, + "loss": 0.1367, + "step": 1120 + }, + { + "epoch": 6.428571428571429, + "grad_norm": 0.5640934109687805, + "learning_rate": 0.00020365714285714283, + "loss": 0.1554, + "step": 1125 + }, + { + "epoch": 6.457142857142857, + "grad_norm": 0.8154368996620178, + "learning_rate": 0.00020322857142857138, + "loss": 0.1674, + "step": 1130 + }, + { + "epoch": 6.485714285714286, + "grad_norm": 0.7185398936271667, + "learning_rate": 0.0002028, + "loss": 0.1375, + "step": 1135 + }, + { + "epoch": 6.514285714285714, + "grad_norm": 0.6805170774459839, + "learning_rate": 0.00020237142857142855, + "loss": 0.1306, + "step": 1140 + }, + { + "epoch": 6.542857142857143, + "grad_norm": 0.5996941924095154, + "learning_rate": 0.00020194285714285714, + "loss": 0.1433, + "step": 1145 + }, + { + "epoch": 6.571428571428571, + "grad_norm": 0.5258373022079468, + "learning_rate": 0.0002015142857142857, + "loss": 0.1285, + "step": 1150 + }, + { + "epoch": 6.6, + "grad_norm": 0.7771695256233215, + "learning_rate": 0.00020108571428571425, + "loss": 0.1493, + "step": 1155 + }, + { + "epoch": 6.628571428571428, + "grad_norm": 0.5920616388320923, + "learning_rate": 0.00020065714285714284, + "loss": 0.1479, + "step": 1160 + }, + { + "epoch": 6.6571428571428575, + "grad_norm": 0.7460982799530029, + "learning_rate": 0.00020022857142857142, + "loss": 0.1173, + "step": 1165 + }, + { + "epoch": 6.685714285714286, + "grad_norm": 1.1703822612762451, + "learning_rate": 0.0001998, + "loss": 0.1402, + "step": 1170 + }, + { + "epoch": 6.714285714285714, + "grad_norm": 0.7894724011421204, + "learning_rate": 0.00019937142857142856, + "loss": 0.1253, + "step": 1175 + }, + { + "epoch": 6.742857142857143, + "grad_norm": 0.7013376355171204, + "learning_rate": 0.00019894285714285712, + "loss": 0.1573, + "step": 1180 + }, + { + "epoch": 6.771428571428571, + "grad_norm": 0.6421737670898438, + "learning_rate": 0.0001985142857142857, + "loss": 0.1497, + "step": 1185 + }, + { + "epoch": 6.8, + "grad_norm": 1.204296350479126, + "learning_rate": 0.00019808571428571426, + "loss": 0.1634, + "step": 1190 + }, + { + "epoch": 6.828571428571428, + "grad_norm": 0.867765486240387, + "learning_rate": 0.00019765714285714282, + "loss": 0.1353, + "step": 1195 + }, + { + "epoch": 6.857142857142857, + "grad_norm": 0.7325594425201416, + "learning_rate": 0.00019722857142857143, + "loss": 0.118, + "step": 1200 + }, + { + "epoch": 6.885714285714286, + "grad_norm": 0.7029078006744385, + "learning_rate": 0.00019679999999999999, + "loss": 0.1425, + "step": 1205 + }, + { + "epoch": 6.914285714285715, + "grad_norm": 1.1572504043579102, + "learning_rate": 0.00019637142857142857, + "loss": 0.1337, + "step": 1210 + }, + { + "epoch": 6.942857142857143, + "grad_norm": 0.8022822141647339, + "learning_rate": 0.00019594285714285713, + "loss": 0.1684, + "step": 1215 + }, + { + "epoch": 6.9714285714285715, + "grad_norm": 0.6729874610900879, + "learning_rate": 0.00019551428571428568, + "loss": 0.1238, + "step": 1220 + }, + { + "epoch": 7.0, + "grad_norm": 0.5773627758026123, + "learning_rate": 0.00019508571428571427, + "loss": 0.138, + "step": 1225 + }, + { + "epoch": 7.0285714285714285, + "grad_norm": 0.7182291150093079, + "learning_rate": 0.00019465714285714285, + "loss": 0.1431, + "step": 1230 + }, + { + "epoch": 7.057142857142857, + "grad_norm": 1.7567912340164185, + "learning_rate": 0.0001942285714285714, + "loss": 0.1319, + "step": 1235 + }, + { + "epoch": 7.085714285714285, + "grad_norm": 0.6845232248306274, + "learning_rate": 0.0001938, + "loss": 0.1292, + "step": 1240 + }, + { + "epoch": 7.114285714285714, + "grad_norm": 0.6077771782875061, + "learning_rate": 0.00019337142857142855, + "loss": 0.1238, + "step": 1245 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 0.6168347597122192, + "learning_rate": 0.0001929428571428571, + "loss": 0.1384, + "step": 1250 + }, + { + "epoch": 7.171428571428572, + "grad_norm": 0.7457576394081116, + "learning_rate": 0.0001925142857142857, + "loss": 0.1306, + "step": 1255 + }, + { + "epoch": 7.2, + "grad_norm": 0.5969316363334656, + "learning_rate": 0.00019208571428571425, + "loss": 0.1123, + "step": 1260 + }, + { + "epoch": 7.228571428571429, + "grad_norm": 0.6902753710746765, + "learning_rate": 0.00019165714285714286, + "loss": 0.1185, + "step": 1265 + }, + { + "epoch": 7.257142857142857, + "grad_norm": 0.6488338112831116, + "learning_rate": 0.00019122857142857142, + "loss": 0.1431, + "step": 1270 + }, + { + "epoch": 7.285714285714286, + "grad_norm": 0.6814819574356079, + "learning_rate": 0.00019079999999999998, + "loss": 0.1495, + "step": 1275 + }, + { + "epoch": 7.314285714285714, + "grad_norm": 0.7468088865280151, + "learning_rate": 0.00019037142857142856, + "loss": 0.1158, + "step": 1280 + }, + { + "epoch": 7.3428571428571425, + "grad_norm": 0.7417412400245667, + "learning_rate": 0.00018994285714285712, + "loss": 0.1311, + "step": 1285 + }, + { + "epoch": 7.371428571428572, + "grad_norm": 0.5480664372444153, + "learning_rate": 0.00018951428571428567, + "loss": 0.135, + "step": 1290 + }, + { + "epoch": 7.4, + "grad_norm": 0.725527822971344, + "learning_rate": 0.00018908571428571429, + "loss": 0.1217, + "step": 1295 + }, + { + "epoch": 7.428571428571429, + "grad_norm": 0.6566678285598755, + "learning_rate": 0.00018865714285714284, + "loss": 0.1417, + "step": 1300 + }, + { + "epoch": 7.457142857142857, + "grad_norm": 0.516952395439148, + "learning_rate": 0.00018822857142857143, + "loss": 0.1329, + "step": 1305 + }, + { + "epoch": 7.485714285714286, + "grad_norm": 1.9545241594314575, + "learning_rate": 0.00018779999999999998, + "loss": 0.1339, + "step": 1310 + }, + { + "epoch": 7.514285714285714, + "grad_norm": 0.8276839852333069, + "learning_rate": 0.00018737142857142854, + "loss": 0.1324, + "step": 1315 + }, + { + "epoch": 7.542857142857143, + "grad_norm": 0.6737099289894104, + "learning_rate": 0.00018694285714285713, + "loss": 0.1139, + "step": 1320 + }, + { + "epoch": 7.571428571428571, + "grad_norm": 0.6914472579956055, + "learning_rate": 0.00018651428571428568, + "loss": 0.1146, + "step": 1325 + }, + { + "epoch": 7.6, + "grad_norm": 0.6630033850669861, + "learning_rate": 0.0001860857142857143, + "loss": 0.1571, + "step": 1330 + }, + { + "epoch": 7.628571428571428, + "grad_norm": 0.820688784122467, + "learning_rate": 0.00018565714285714285, + "loss": 0.15, + "step": 1335 + }, + { + "epoch": 7.6571428571428575, + "grad_norm": 2.0491325855255127, + "learning_rate": 0.0001852285714285714, + "loss": 0.127, + "step": 1340 + }, + { + "epoch": 7.685714285714286, + "grad_norm": 0.9327268004417419, + "learning_rate": 0.0001848, + "loss": 0.1289, + "step": 1345 + }, + { + "epoch": 7.714285714285714, + "grad_norm": 1.3131701946258545, + "learning_rate": 0.00018437142857142855, + "loss": 0.1228, + "step": 1350 + }, + { + "epoch": 7.742857142857143, + "grad_norm": 2.955918312072754, + "learning_rate": 0.0001839428571428571, + "loss": 0.1082, + "step": 1355 + }, + { + "epoch": 7.771428571428571, + "grad_norm": 1.2165493965148926, + "learning_rate": 0.00018351428571428572, + "loss": 0.1688, + "step": 1360 + }, + { + "epoch": 7.8, + "grad_norm": 0.759324312210083, + "learning_rate": 0.00018308571428571428, + "loss": 0.1185, + "step": 1365 + }, + { + "epoch": 7.828571428571428, + "grad_norm": 0.7445591688156128, + "learning_rate": 0.00018265714285714286, + "loss": 0.1431, + "step": 1370 + }, + { + "epoch": 7.857142857142857, + "grad_norm": 0.679374098777771, + "learning_rate": 0.00018222857142857142, + "loss": 0.1451, + "step": 1375 + }, + { + "epoch": 7.885714285714286, + "grad_norm": 2.1234302520751953, + "learning_rate": 0.00018179999999999997, + "loss": 0.1265, + "step": 1380 + }, + { + "epoch": 7.914285714285715, + "grad_norm": 1.006521224975586, + "learning_rate": 0.00018137142857142856, + "loss": 0.1722, + "step": 1385 + }, + { + "epoch": 7.942857142857143, + "grad_norm": 0.7275253534317017, + "learning_rate": 0.00018094285714285712, + "loss": 0.1625, + "step": 1390 + }, + { + "epoch": 7.9714285714285715, + "grad_norm": 0.8612022995948792, + "learning_rate": 0.0001805142857142857, + "loss": 0.1345, + "step": 1395 + }, + { + "epoch": 8.0, + "grad_norm": 0.7276798486709595, + "learning_rate": 0.00018008571428571428, + "loss": 0.1236, + "step": 1400 + }, + { + "epoch": 8.028571428571428, + "grad_norm": 0.8731086850166321, + "learning_rate": 0.00017965714285714284, + "loss": 0.1604, + "step": 1405 + }, + { + "epoch": 8.057142857142857, + "grad_norm": 0.8950818777084351, + "learning_rate": 0.0001792285714285714, + "loss": 0.1531, + "step": 1410 + }, + { + "epoch": 8.085714285714285, + "grad_norm": 0.7399356365203857, + "learning_rate": 0.00017879999999999998, + "loss": 0.1508, + "step": 1415 + }, + { + "epoch": 8.114285714285714, + "grad_norm": 1.3727307319641113, + "learning_rate": 0.00017837142857142854, + "loss": 0.1487, + "step": 1420 + }, + { + "epoch": 8.142857142857142, + "grad_norm": 0.5938125848770142, + "learning_rate": 0.00017794285714285715, + "loss": 0.1303, + "step": 1425 + }, + { + "epoch": 8.17142857142857, + "grad_norm": 0.7043821811676025, + "learning_rate": 0.0001775142857142857, + "loss": 0.0948, + "step": 1430 + }, + { + "epoch": 8.2, + "grad_norm": 1.1062767505645752, + "learning_rate": 0.00017708571428571426, + "loss": 0.1412, + "step": 1435 + }, + { + "epoch": 8.228571428571428, + "grad_norm": 0.844832181930542, + "learning_rate": 0.00017665714285714285, + "loss": 0.113, + "step": 1440 + }, + { + "epoch": 8.257142857142856, + "grad_norm": 0.7564154863357544, + "learning_rate": 0.0001762285714285714, + "loss": 0.1319, + "step": 1445 + }, + { + "epoch": 8.285714285714286, + "grad_norm": 0.8843110203742981, + "learning_rate": 0.00017579999999999996, + "loss": 0.1206, + "step": 1450 + }, + { + "epoch": 8.314285714285715, + "grad_norm": 0.8175828456878662, + "learning_rate": 0.00017537142857142855, + "loss": 0.1327, + "step": 1455 + }, + { + "epoch": 8.342857142857143, + "grad_norm": 0.6443565487861633, + "learning_rate": 0.00017494285714285713, + "loss": 0.1239, + "step": 1460 + }, + { + "epoch": 8.371428571428572, + "grad_norm": 0.7237185835838318, + "learning_rate": 0.00017451428571428572, + "loss": 0.1639, + "step": 1465 + }, + { + "epoch": 8.4, + "grad_norm": 0.6118057370185852, + "learning_rate": 0.00017408571428571427, + "loss": 0.1363, + "step": 1470 + }, + { + "epoch": 8.428571428571429, + "grad_norm": 0.6754649877548218, + "learning_rate": 0.00017365714285714283, + "loss": 0.1187, + "step": 1475 + }, + { + "epoch": 8.457142857142857, + "grad_norm": 1.0067390203475952, + "learning_rate": 0.00017322857142857141, + "loss": 0.1401, + "step": 1480 + }, + { + "epoch": 8.485714285714286, + "grad_norm": 8.509544372558594, + "learning_rate": 0.00017279999999999997, + "loss": 0.1304, + "step": 1485 + }, + { + "epoch": 8.514285714285714, + "grad_norm": 4.2030205726623535, + "learning_rate": 0.00017237142857142858, + "loss": 0.121, + "step": 1490 + }, + { + "epoch": 8.542857142857143, + "grad_norm": 4.877438068389893, + "learning_rate": 0.00017194285714285714, + "loss": 0.1918, + "step": 1495 + }, + { + "epoch": 8.571428571428571, + "grad_norm": 6.4971232414245605, + "learning_rate": 0.0001715142857142857, + "loss": 0.2154, + "step": 1500 + }, + { + "epoch": 8.6, + "grad_norm": 4.365469932556152, + "learning_rate": 0.00017108571428571428, + "loss": 0.2272, + "step": 1505 + }, + { + "epoch": 8.628571428571428, + "grad_norm": 2.551957845687866, + "learning_rate": 0.00017065714285714284, + "loss": 0.2163, + "step": 1510 + }, + { + "epoch": 8.657142857142857, + "grad_norm": 5.326391220092773, + "learning_rate": 0.0001702285714285714, + "loss": 0.1612, + "step": 1515 + }, + { + "epoch": 8.685714285714285, + "grad_norm": 1.3528404235839844, + "learning_rate": 0.00016979999999999998, + "loss": 0.1636, + "step": 1520 + }, + { + "epoch": 8.714285714285714, + "grad_norm": 1.4466065168380737, + "learning_rate": 0.00016937142857142856, + "loss": 0.1295, + "step": 1525 + }, + { + "epoch": 8.742857142857144, + "grad_norm": 0.6576040387153625, + "learning_rate": 0.00016894285714285715, + "loss": 0.1318, + "step": 1530 + }, + { + "epoch": 8.771428571428572, + "grad_norm": 1.286942958831787, + "learning_rate": 0.0001685142857142857, + "loss": 0.1443, + "step": 1535 + }, + { + "epoch": 8.8, + "grad_norm": 9.474458694458008, + "learning_rate": 0.00016808571428571426, + "loss": 0.1313, + "step": 1540 + }, + { + "epoch": 8.82857142857143, + "grad_norm": 2.6731069087982178, + "learning_rate": 0.00016765714285714285, + "loss": 0.1485, + "step": 1545 + }, + { + "epoch": 8.857142857142858, + "grad_norm": 1.313723087310791, + "learning_rate": 0.0001672285714285714, + "loss": 0.1346, + "step": 1550 + }, + { + "epoch": 8.885714285714286, + "grad_norm": 1.7115576267242432, + "learning_rate": 0.0001668, + "loss": 0.1471, + "step": 1555 + }, + { + "epoch": 8.914285714285715, + "grad_norm": 1.2599923610687256, + "learning_rate": 0.00016637142857142857, + "loss": 0.1433, + "step": 1560 + }, + { + "epoch": 8.942857142857143, + "grad_norm": 0.9659029245376587, + "learning_rate": 0.00016594285714285713, + "loss": 0.1256, + "step": 1565 + }, + { + "epoch": 8.971428571428572, + "grad_norm": 1.1282744407653809, + "learning_rate": 0.0001655142857142857, + "loss": 0.1373, + "step": 1570 + }, + { + "epoch": 9.0, + "grad_norm": 3.20717453956604, + "learning_rate": 0.00016508571428571427, + "loss": 0.1355, + "step": 1575 + }, + { + "epoch": 9.028571428571428, + "grad_norm": 0.8310821056365967, + "learning_rate": 0.00016465714285714283, + "loss": 0.1268, + "step": 1580 + }, + { + "epoch": 9.057142857142857, + "grad_norm": 1.5337790250778198, + "learning_rate": 0.00016422857142857139, + "loss": 0.1267, + "step": 1585 + }, + { + "epoch": 9.085714285714285, + "grad_norm": 2.6406068801879883, + "learning_rate": 0.0001638, + "loss": 0.1363, + "step": 1590 + }, + { + "epoch": 9.114285714285714, + "grad_norm": 0.7705873847007751, + "learning_rate": 0.00016337142857142855, + "loss": 0.1291, + "step": 1595 + }, + { + "epoch": 9.142857142857142, + "grad_norm": 0.7092650532722473, + "learning_rate": 0.00016294285714285714, + "loss": 0.1435, + "step": 1600 + }, + { + "epoch": 9.17142857142857, + "grad_norm": 1.098961591720581, + "learning_rate": 0.0001625142857142857, + "loss": 0.1471, + "step": 1605 + }, + { + "epoch": 9.2, + "grad_norm": 0.6994885206222534, + "learning_rate": 0.00016208571428571425, + "loss": 0.1345, + "step": 1610 + }, + { + "epoch": 9.228571428571428, + "grad_norm": 0.9613476991653442, + "learning_rate": 0.00016165714285714284, + "loss": 0.1399, + "step": 1615 + }, + { + "epoch": 9.257142857142856, + "grad_norm": 0.675588846206665, + "learning_rate": 0.00016122857142857142, + "loss": 0.1319, + "step": 1620 + }, + { + "epoch": 9.285714285714286, + "grad_norm": 0.7519372701644897, + "learning_rate": 0.0001608, + "loss": 0.137, + "step": 1625 + }, + { + "epoch": 9.314285714285715, + "grad_norm": 1.135025978088379, + "learning_rate": 0.00016037142857142856, + "loss": 0.1322, + "step": 1630 + }, + { + "epoch": 9.342857142857143, + "grad_norm": 0.7462936639785767, + "learning_rate": 0.00015994285714285712, + "loss": 0.1215, + "step": 1635 + }, + { + "epoch": 9.371428571428572, + "grad_norm": 0.9042088985443115, + "learning_rate": 0.0001595142857142857, + "loss": 0.1191, + "step": 1640 + }, + { + "epoch": 9.4, + "grad_norm": 0.567828893661499, + "learning_rate": 0.00015908571428571426, + "loss": 0.1189, + "step": 1645 + }, + { + "epoch": 9.428571428571429, + "grad_norm": 0.981585681438446, + "learning_rate": 0.00015865714285714282, + "loss": 0.128, + "step": 1650 + }, + { + "epoch": 9.457142857142857, + "grad_norm": 1.24985933303833, + "learning_rate": 0.00015822857142857143, + "loss": 0.1315, + "step": 1655 + }, + { + "epoch": 9.485714285714286, + "grad_norm": 0.6517993211746216, + "learning_rate": 0.0001578, + "loss": 0.1076, + "step": 1660 + }, + { + "epoch": 9.514285714285714, + "grad_norm": 1.166628122329712, + "learning_rate": 0.00015737142857142857, + "loss": 0.1345, + "step": 1665 + }, + { + "epoch": 9.542857142857143, + "grad_norm": 0.9763592481613159, + "learning_rate": 0.00015694285714285713, + "loss": 0.1449, + "step": 1670 + }, + { + "epoch": 9.571428571428571, + "grad_norm": 0.7829060554504395, + "learning_rate": 0.00015651428571428569, + "loss": 0.1117, + "step": 1675 + }, + { + "epoch": 9.6, + "grad_norm": 0.6693719029426575, + "learning_rate": 0.00015608571428571427, + "loss": 0.1129, + "step": 1680 + }, + { + "epoch": 9.628571428571428, + "grad_norm": 1.2122846841812134, + "learning_rate": 0.00015565714285714285, + "loss": 0.1125, + "step": 1685 + }, + { + "epoch": 9.657142857142857, + "grad_norm": 1.0689371824264526, + "learning_rate": 0.0001552285714285714, + "loss": 0.1478, + "step": 1690 + }, + { + "epoch": 9.685714285714285, + "grad_norm": 1.8511656522750854, + "learning_rate": 0.0001548, + "loss": 0.1431, + "step": 1695 + }, + { + "epoch": 9.714285714285714, + "grad_norm": 0.6706506609916687, + "learning_rate": 0.00015437142857142855, + "loss": 0.1262, + "step": 1700 + }, + { + "epoch": 9.742857142857144, + "grad_norm": 1.0798784494400024, + "learning_rate": 0.00015394285714285714, + "loss": 0.1275, + "step": 1705 + }, + { + "epoch": 9.771428571428572, + "grad_norm": 0.7915983200073242, + "learning_rate": 0.0001535142857142857, + "loss": 0.1316, + "step": 1710 + }, + { + "epoch": 9.8, + "grad_norm": 1.8630567789077759, + "learning_rate": 0.00015308571428571425, + "loss": 0.1258, + "step": 1715 + }, + { + "epoch": 9.82857142857143, + "grad_norm": 0.7807756662368774, + "learning_rate": 0.00015265714285714286, + "loss": 0.1079, + "step": 1720 + }, + { + "epoch": 9.857142857142858, + "grad_norm": 1.4698439836502075, + "learning_rate": 0.00015222857142857142, + "loss": 0.1357, + "step": 1725 + }, + { + "epoch": 9.885714285714286, + "grad_norm": 1.2121926546096802, + "learning_rate": 0.00015179999999999998, + "loss": 0.1322, + "step": 1730 + }, + { + "epoch": 9.914285714285715, + "grad_norm": 0.6348568201065063, + "learning_rate": 0.00015137142857142856, + "loss": 0.0893, + "step": 1735 + }, + { + "epoch": 9.942857142857143, + "grad_norm": 0.6694422364234924, + "learning_rate": 0.00015094285714285712, + "loss": 0.1189, + "step": 1740 + }, + { + "epoch": 9.971428571428572, + "grad_norm": 0.569332480430603, + "learning_rate": 0.00015051428571428567, + "loss": 0.1349, + "step": 1745 + }, + { + "epoch": 10.0, + "grad_norm": 0.934073269367218, + "learning_rate": 0.00015008571428571429, + "loss": 0.1237, + "step": 1750 + }, + { + "epoch": 10.028571428571428, + "grad_norm": 0.7191672325134277, + "learning_rate": 0.00014965714285714284, + "loss": 0.1308, + "step": 1755 + }, + { + "epoch": 10.057142857142857, + "grad_norm": 0.7006493806838989, + "learning_rate": 0.00014922857142857143, + "loss": 0.104, + "step": 1760 + }, + { + "epoch": 10.085714285714285, + "grad_norm": 0.9030678272247314, + "learning_rate": 0.00014879999999999998, + "loss": 0.1308, + "step": 1765 + }, + { + "epoch": 10.114285714285714, + "grad_norm": 0.7007766366004944, + "learning_rate": 0.00014837142857142854, + "loss": 0.1044, + "step": 1770 + }, + { + "epoch": 10.142857142857142, + "grad_norm": 0.4832770824432373, + "learning_rate": 0.00014794285714285713, + "loss": 0.1119, + "step": 1775 + }, + { + "epoch": 10.17142857142857, + "grad_norm": 0.7819458842277527, + "learning_rate": 0.0001475142857142857, + "loss": 0.1087, + "step": 1780 + }, + { + "epoch": 10.2, + "grad_norm": 1.0223525762557983, + "learning_rate": 0.00014708571428571427, + "loss": 0.1314, + "step": 1785 + }, + { + "epoch": 10.228571428571428, + "grad_norm": 0.6224566698074341, + "learning_rate": 0.00014665714285714285, + "loss": 0.1159, + "step": 1790 + }, + { + "epoch": 10.257142857142856, + "grad_norm": 0.45800235867500305, + "learning_rate": 0.0001462285714285714, + "loss": 0.0942, + "step": 1795 + }, + { + "epoch": 10.285714285714286, + "grad_norm": 0.6258400082588196, + "learning_rate": 0.0001458, + "loss": 0.1079, + "step": 1800 + }, + { + "epoch": 10.314285714285715, + "grad_norm": 1.1812794208526611, + "learning_rate": 0.00014537142857142858, + "loss": 0.1378, + "step": 1805 + }, + { + "epoch": 10.342857142857143, + "grad_norm": 0.8541269898414612, + "learning_rate": 0.00014494285714285713, + "loss": 0.1274, + "step": 1810 + }, + { + "epoch": 10.371428571428572, + "grad_norm": 0.7131860256195068, + "learning_rate": 0.0001445142857142857, + "loss": 0.1247, + "step": 1815 + }, + { + "epoch": 10.4, + "grad_norm": 0.6109820008277893, + "learning_rate": 0.00014408571428571428, + "loss": 0.1246, + "step": 1820 + }, + { + "epoch": 10.428571428571429, + "grad_norm": 0.5621510744094849, + "learning_rate": 0.00014365714285714286, + "loss": 0.1039, + "step": 1825 + }, + { + "epoch": 10.457142857142857, + "grad_norm": 1.022777795791626, + "learning_rate": 0.00014322857142857142, + "loss": 0.1206, + "step": 1830 + }, + { + "epoch": 10.485714285714286, + "grad_norm": 0.9120668768882751, + "learning_rate": 0.00014279999999999997, + "loss": 0.1289, + "step": 1835 + }, + { + "epoch": 10.514285714285714, + "grad_norm": 1.1882030963897705, + "learning_rate": 0.00014237142857142856, + "loss": 0.1194, + "step": 1840 + }, + { + "epoch": 10.542857142857143, + "grad_norm": 0.6078401207923889, + "learning_rate": 0.00014194285714285714, + "loss": 0.1339, + "step": 1845 + }, + { + "epoch": 10.571428571428571, + "grad_norm": 0.7380999326705933, + "learning_rate": 0.0001415142857142857, + "loss": 0.1318, + "step": 1850 + }, + { + "epoch": 10.6, + "grad_norm": 0.5884959101676941, + "learning_rate": 0.00014108571428571428, + "loss": 0.1249, + "step": 1855 + }, + { + "epoch": 10.628571428571428, + "grad_norm": 1.0121936798095703, + "learning_rate": 0.00014065714285714284, + "loss": 0.1137, + "step": 1860 + }, + { + "epoch": 10.657142857142857, + "grad_norm": 0.6444916129112244, + "learning_rate": 0.00014022857142857143, + "loss": 0.1213, + "step": 1865 + }, + { + "epoch": 10.685714285714285, + "grad_norm": 0.7931004762649536, + "learning_rate": 0.00013979999999999998, + "loss": 0.1318, + "step": 1870 + }, + { + "epoch": 10.714285714285714, + "grad_norm": 0.5596404075622559, + "learning_rate": 0.00013937142857142857, + "loss": 0.1075, + "step": 1875 + }, + { + "epoch": 10.742857142857144, + "grad_norm": 0.6586474180221558, + "learning_rate": 0.00013894285714285712, + "loss": 0.13, + "step": 1880 + }, + { + "epoch": 10.771428571428572, + "grad_norm": 1.0195013284683228, + "learning_rate": 0.00013851428571428568, + "loss": 0.1373, + "step": 1885 + }, + { + "epoch": 10.8, + "grad_norm": 0.9233512878417969, + "learning_rate": 0.00013808571428571427, + "loss": 0.1168, + "step": 1890 + }, + { + "epoch": 10.82857142857143, + "grad_norm": 0.7154092788696289, + "learning_rate": 0.00013765714285714285, + "loss": 0.1081, + "step": 1895 + }, + { + "epoch": 10.857142857142858, + "grad_norm": 1.4588117599487305, + "learning_rate": 0.0001372285714285714, + "loss": 0.1061, + "step": 1900 + }, + { + "epoch": 10.885714285714286, + "grad_norm": 0.6087035536766052, + "learning_rate": 0.0001368, + "loss": 0.1157, + "step": 1905 + }, + { + "epoch": 10.914285714285715, + "grad_norm": 0.7371247410774231, + "learning_rate": 0.00013637142857142855, + "loss": 0.1339, + "step": 1910 + }, + { + "epoch": 10.942857142857143, + "grad_norm": 0.8253212571144104, + "learning_rate": 0.00013594285714285713, + "loss": 0.1198, + "step": 1915 + }, + { + "epoch": 10.971428571428572, + "grad_norm": 0.6889544129371643, + "learning_rate": 0.00013551428571428572, + "loss": 0.1131, + "step": 1920 + }, + { + "epoch": 11.0, + "grad_norm": 0.6408224105834961, + "learning_rate": 0.00013508571428571427, + "loss": 0.122, + "step": 1925 + }, + { + "epoch": 11.028571428571428, + "grad_norm": 0.6771185398101807, + "learning_rate": 0.00013465714285714283, + "loss": 0.1492, + "step": 1930 + }, + { + "epoch": 11.057142857142857, + "grad_norm": 0.8706450462341309, + "learning_rate": 0.00013422857142857142, + "loss": 0.1294, + "step": 1935 + }, + { + "epoch": 11.085714285714285, + "grad_norm": 1.730648398399353, + "learning_rate": 0.0001338, + "loss": 0.1004, + "step": 1940 + }, + { + "epoch": 11.114285714285714, + "grad_norm": 0.6985113620758057, + "learning_rate": 0.00013337142857142856, + "loss": 0.0995, + "step": 1945 + }, + { + "epoch": 11.142857142857142, + "grad_norm": 0.8901951313018799, + "learning_rate": 0.00013294285714285711, + "loss": 0.1179, + "step": 1950 + }, + { + "epoch": 11.17142857142857, + "grad_norm": 0.7232164144515991, + "learning_rate": 0.0001325142857142857, + "loss": 0.1397, + "step": 1955 + }, + { + "epoch": 11.2, + "grad_norm": 0.6447544693946838, + "learning_rate": 0.00013208571428571428, + "loss": 0.1366, + "step": 1960 + }, + { + "epoch": 11.228571428571428, + "grad_norm": 0.7964944243431091, + "learning_rate": 0.00013165714285714284, + "loss": 0.1121, + "step": 1965 + }, + { + "epoch": 11.257142857142856, + "grad_norm": 0.9012628793716431, + "learning_rate": 0.00013122857142857142, + "loss": 0.1131, + "step": 1970 + }, + { + "epoch": 11.285714285714286, + "grad_norm": 0.9295369982719421, + "learning_rate": 0.00013079999999999998, + "loss": 0.1232, + "step": 1975 + }, + { + "epoch": 11.314285714285715, + "grad_norm": 0.6237708926200867, + "learning_rate": 0.00013037142857142857, + "loss": 0.1066, + "step": 1980 + }, + { + "epoch": 11.342857142857143, + "grad_norm": 0.5250967741012573, + "learning_rate": 0.00012994285714285715, + "loss": 0.118, + "step": 1985 + }, + { + "epoch": 11.371428571428572, + "grad_norm": 1.0013964176177979, + "learning_rate": 0.0001295142857142857, + "loss": 0.1125, + "step": 1990 + }, + { + "epoch": 11.4, + "grad_norm": 0.6721311807632446, + "learning_rate": 0.00012908571428571426, + "loss": 0.1196, + "step": 1995 + }, + { + "epoch": 11.428571428571429, + "grad_norm": 0.6966421008110046, + "learning_rate": 0.00012865714285714285, + "loss": 0.1172, + "step": 2000 + } + ], + "logging_steps": 5, + "max_steps": 3500, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 200, + "trial_name": null, + "trial_params": null +} diff --git a/glot-contrastive-final-lora/checkpoint-2000/training_args.bin b/glot-contrastive-final-lora/checkpoint-2000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..504636d7db81a728b5309e791498748e16c3b8ec --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-2000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02a87dc6b2c67ad3df98065b9e8fa21d9d93cd2cb361c532cb83c8a37bdc81a3 +size 5777 diff --git a/glot-contrastive-final-lora/checkpoint-2500/README.md b/glot-contrastive-final-lora/checkpoint-2500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9e142ce83a38d62751eee08226da0acc7c10eae5 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-2500/README.md @@ -0,0 +1,206 @@ +--- +base_model: ./glot-mlm-adapted +library_name: peft +tags: +- base_model:adapter:./glot-mlm-adapted +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/glot-contrastive-final-lora/checkpoint-2500/adapter_config.json b/glot-contrastive-final-lora/checkpoint-2500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..846b0d8ef2ad7bc4b0b04decd5769b6250d7be73 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-2500/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./glot-mlm-adapted", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query", + "value" + ], + "target_parameters": null, + "task_type": "FEATURE_EXTRACTION", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/glot-contrastive-final-lora/checkpoint-2500/adapter_model.safetensors b/glot-contrastive-final-lora/checkpoint-2500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..964a26ebd94b76a139f6016a3b577cdf72a05f0d --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-2500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:940dc572880c580cd969ac155623363743c9f3ef94854aba54b224023c4a2ee1 +size 2365824 diff --git a/glot-contrastive-final-lora/checkpoint-2500/optimizer.pt b/glot-contrastive-final-lora/checkpoint-2500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5788a9996e7f73c42f9b09fc4be20cc399796580 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-2500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d2c540c91f6c54cf3701175d6db55034ccae2b3b587a04b9476ce989d4fa18b +size 4760395 diff --git a/glot-contrastive-final-lora/checkpoint-2500/rng_state.pth b/glot-contrastive-final-lora/checkpoint-2500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..33b6eedc9a83e25b359069f1d4502c4ee4ec4163 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-2500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82bf023104ba6bb70dbc679f41d50ee904b14245b597026bbb288d43524d6797 +size 14645 diff --git a/glot-contrastive-final-lora/checkpoint-2500/scheduler.pt b/glot-contrastive-final-lora/checkpoint-2500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9272fd4135ab463ff7ad109f92c09aff73a7ae4 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-2500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11ef9936017bed12cabfddfce2a90fd82a625d038e573173ab445ab44ee6c357 +size 1465 diff --git a/glot-contrastive-final-lora/checkpoint-2500/sentencepiece.bpe.model b/glot-contrastive-final-lora/checkpoint-2500/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..41409bcc76133d6e92fdd5909093d24e32662ba0 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-2500/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a313a26470baedaede322622492f2a542aa41527ddc5d40de444e945ad3c613 +size 7658320 diff --git a/glot-contrastive-final-lora/checkpoint-2500/special_tokens_map.json b/glot-contrastive-final-lora/checkpoint-2500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..68171d1ff68b731a33d119708476692c094a466b --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-2500/special_tokens_map.json @@ -0,0 +1,15 @@ +{ + "bos_token": "", + "cls_token": "", + "eos_token": "", + "mask_token": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "sep_token": "", + "unk_token": "" +} diff --git a/glot-contrastive-final-lora/checkpoint-2500/tokenizer_config.json b/glot-contrastive-final-lora/checkpoint-2500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2d4a02cfefad7bc6d993905187616acfc547f17d --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-2500/tokenizer_config.json @@ -0,0 +1,57 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "401144": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "cls_token": "", + "eos_token": "", + "extra_special_tokens": {}, + "mask_token": "", + "model_max_length": 512, + "pad_token": "", + "sep_token": "", + "sp_model_kwargs": {}, + "tokenizer_class": "XLMRobertaTokenizer", + "unk_token": "", + "use_fast": true +} diff --git a/glot-contrastive-final-lora/checkpoint-2500/trainer_state.json b/glot-contrastive-final-lora/checkpoint-2500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..67999d8d7c11daa2dc63ca5ef8eb1010c1ffc191 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-2500/trainer_state.json @@ -0,0 +1,3534 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 14.285714285714286, + "eval_steps": 5, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02857142857142857, + "grad_norm": 0.1407003551721573, + "learning_rate": 0.00029965714285714283, + "loss": 0.9726, + "step": 5 + }, + { + "epoch": 0.05714285714285714, + "grad_norm": 0.26689061522483826, + "learning_rate": 0.0002992285714285714, + "loss": 0.9633, + "step": 10 + }, + { + "epoch": 0.08571428571428572, + "grad_norm": 0.8670485615730286, + "learning_rate": 0.0002988, + "loss": 0.9013, + "step": 15 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 0.9785467386245728, + "learning_rate": 0.00029837142857142853, + "loss": 0.6942, + "step": 20 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 1.3083932399749756, + "learning_rate": 0.0002979428571428571, + "loss": 0.4472, + "step": 25 + }, + { + "epoch": 0.17142857142857143, + "grad_norm": 1.6103293895721436, + "learning_rate": 0.0002975142857142857, + "loss": 0.3782, + "step": 30 + }, + { + "epoch": 0.2, + "grad_norm": 2.6353416442871094, + "learning_rate": 0.0002970857142857143, + "loss": 0.3732, + "step": 35 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 0.9949072003364563, + "learning_rate": 0.0002966571428571428, + "loss": 0.3506, + "step": 40 + }, + { + "epoch": 0.2571428571428571, + "grad_norm": 1.280673861503601, + "learning_rate": 0.0002962285714285714, + "loss": 0.3346, + "step": 45 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.7681456208229065, + "learning_rate": 0.0002958, + "loss": 0.2832, + "step": 50 + }, + { + "epoch": 0.3142857142857143, + "grad_norm": 1.0000813007354736, + "learning_rate": 0.0002953714285714285, + "loss": 0.2603, + "step": 55 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 1.0222399234771729, + "learning_rate": 0.0002949428571428571, + "loss": 0.2507, + "step": 60 + }, + { + "epoch": 0.37142857142857144, + "grad_norm": 0.896902322769165, + "learning_rate": 0.0002945142857142857, + "loss": 0.2556, + "step": 65 + }, + { + "epoch": 0.4, + "grad_norm": 0.9035541415214539, + "learning_rate": 0.00029408571428571426, + "loss": 0.2402, + "step": 70 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 1.4886469841003418, + "learning_rate": 0.00029365714285714285, + "loss": 0.2376, + "step": 75 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.8951187133789062, + "learning_rate": 0.0002932285714285714, + "loss": 0.2276, + "step": 80 + }, + { + "epoch": 0.4857142857142857, + "grad_norm": 0.7876377105712891, + "learning_rate": 0.00029279999999999996, + "loss": 0.2537, + "step": 85 + }, + { + "epoch": 0.5142857142857142, + "grad_norm": 1.0927226543426514, + "learning_rate": 0.00029237142857142855, + "loss": 0.2152, + "step": 90 + }, + { + "epoch": 0.5428571428571428, + "grad_norm": 1.4946355819702148, + "learning_rate": 0.00029194285714285713, + "loss": 0.2441, + "step": 95 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.7082991600036621, + "learning_rate": 0.0002915142857142857, + "loss": 0.2708, + "step": 100 + }, + { + "epoch": 0.6, + "grad_norm": 0.670010507106781, + "learning_rate": 0.00029108571428571424, + "loss": 0.2396, + "step": 105 + }, + { + "epoch": 0.6285714285714286, + "grad_norm": 0.9797312021255493, + "learning_rate": 0.00029065714285714283, + "loss": 0.2275, + "step": 110 + }, + { + "epoch": 0.6571428571428571, + "grad_norm": 1.5220463275909424, + "learning_rate": 0.0002902285714285714, + "loss": 0.2114, + "step": 115 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 1.3326867818832397, + "learning_rate": 0.00028979999999999994, + "loss": 0.241, + "step": 120 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 1.1195529699325562, + "learning_rate": 0.0002893714285714285, + "loss": 0.2389, + "step": 125 + }, + { + "epoch": 0.7428571428571429, + "grad_norm": 0.7551061511039734, + "learning_rate": 0.0002889428571428571, + "loss": 0.2162, + "step": 130 + }, + { + "epoch": 0.7714285714285715, + "grad_norm": 1.018908977508545, + "learning_rate": 0.0002885142857142857, + "loss": 0.1924, + "step": 135 + }, + { + "epoch": 0.8, + "grad_norm": 2.123642921447754, + "learning_rate": 0.0002880857142857143, + "loss": 0.2174, + "step": 140 + }, + { + "epoch": 0.8285714285714286, + "grad_norm": 0.7585068941116333, + "learning_rate": 0.0002876571428571428, + "loss": 0.2006, + "step": 145 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 1.64150869846344, + "learning_rate": 0.0002872285714285714, + "loss": 0.1905, + "step": 150 + }, + { + "epoch": 0.8857142857142857, + "grad_norm": 0.9126951694488525, + "learning_rate": 0.0002868, + "loss": 0.2312, + "step": 155 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 0.7278801202774048, + "learning_rate": 0.00028637142857142856, + "loss": 0.2077, + "step": 160 + }, + { + "epoch": 0.9428571428571428, + "grad_norm": 0.8931339383125305, + "learning_rate": 0.00028594285714285715, + "loss": 0.1951, + "step": 165 + }, + { + "epoch": 0.9714285714285714, + "grad_norm": 1.0831843614578247, + "learning_rate": 0.0002855142857142857, + "loss": 0.2103, + "step": 170 + }, + { + "epoch": 1.0, + "grad_norm": 1.3750063180923462, + "learning_rate": 0.00028508571428571426, + "loss": 0.2396, + "step": 175 + }, + { + "epoch": 1.0285714285714285, + "grad_norm": 0.8338337540626526, + "learning_rate": 0.00028465714285714285, + "loss": 0.2404, + "step": 180 + }, + { + "epoch": 1.0571428571428572, + "grad_norm": 1.2879024744033813, + "learning_rate": 0.0002842285714285714, + "loss": 0.2117, + "step": 185 + }, + { + "epoch": 1.0857142857142856, + "grad_norm": 1.6751821041107178, + "learning_rate": 0.00028379999999999996, + "loss": 0.1796, + "step": 190 + }, + { + "epoch": 1.1142857142857143, + "grad_norm": 0.9864417910575867, + "learning_rate": 0.00028337142857142854, + "loss": 0.1993, + "step": 195 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 1.0174155235290527, + "learning_rate": 0.00028294285714285713, + "loss": 0.2068, + "step": 200 + }, + { + "epoch": 1.1714285714285715, + "grad_norm": 1.029832124710083, + "learning_rate": 0.0002825142857142857, + "loss": 0.2015, + "step": 205 + }, + { + "epoch": 1.2, + "grad_norm": 0.7745446562767029, + "learning_rate": 0.00028208571428571424, + "loss": 0.2129, + "step": 210 + }, + { + "epoch": 1.2285714285714286, + "grad_norm": 2.5578622817993164, + "learning_rate": 0.0002816571428571428, + "loss": 0.2224, + "step": 215 + }, + { + "epoch": 1.2571428571428571, + "grad_norm": 2.4185051918029785, + "learning_rate": 0.0002812285714285714, + "loss": 0.2276, + "step": 220 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 1.4176461696624756, + "learning_rate": 0.0002808, + "loss": 0.1781, + "step": 225 + }, + { + "epoch": 1.3142857142857143, + "grad_norm": 0.709326982498169, + "learning_rate": 0.0002803714285714286, + "loss": 0.2177, + "step": 230 + }, + { + "epoch": 1.342857142857143, + "grad_norm": 0.8170766830444336, + "learning_rate": 0.0002799428571428571, + "loss": 0.1769, + "step": 235 + }, + { + "epoch": 1.3714285714285714, + "grad_norm": 1.3850761651992798, + "learning_rate": 0.0002795142857142857, + "loss": 0.2262, + "step": 240 + }, + { + "epoch": 1.4, + "grad_norm": 1.0064373016357422, + "learning_rate": 0.0002790857142857143, + "loss": 0.196, + "step": 245 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 1.9635728597640991, + "learning_rate": 0.0002786571428571428, + "loss": 0.2029, + "step": 250 + }, + { + "epoch": 1.457142857142857, + "grad_norm": 16.20791244506836, + "learning_rate": 0.0002782285714285714, + "loss": 0.3925, + "step": 255 + }, + { + "epoch": 1.4857142857142858, + "grad_norm": 1.4363322257995605, + "learning_rate": 0.0002778, + "loss": 0.3684, + "step": 260 + }, + { + "epoch": 1.5142857142857142, + "grad_norm": 0.9379534721374512, + "learning_rate": 0.00027737142857142856, + "loss": 0.2265, + "step": 265 + }, + { + "epoch": 1.5428571428571427, + "grad_norm": 0.8453512787818909, + "learning_rate": 0.00027694285714285714, + "loss": 0.1976, + "step": 270 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 2.316664695739746, + "learning_rate": 0.0002765142857142857, + "loss": 0.23, + "step": 275 + }, + { + "epoch": 1.6, + "grad_norm": 1.0548444986343384, + "learning_rate": 0.00027608571428571426, + "loss": 0.1823, + "step": 280 + }, + { + "epoch": 1.6285714285714286, + "grad_norm": 3.7894928455352783, + "learning_rate": 0.00027565714285714284, + "loss": 0.1962, + "step": 285 + }, + { + "epoch": 1.657142857142857, + "grad_norm": 2.3081610202789307, + "learning_rate": 0.00027522857142857143, + "loss": 0.2087, + "step": 290 + }, + { + "epoch": 1.6857142857142857, + "grad_norm": 0.9311438202857971, + "learning_rate": 0.0002748, + "loss": 0.1597, + "step": 295 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 1.1881247758865356, + "learning_rate": 0.00027437142857142854, + "loss": 0.1764, + "step": 300 + }, + { + "epoch": 1.7428571428571429, + "grad_norm": 1.30265212059021, + "learning_rate": 0.0002739428571428571, + "loss": 0.1647, + "step": 305 + }, + { + "epoch": 1.7714285714285714, + "grad_norm": 0.6832175850868225, + "learning_rate": 0.0002735142857142857, + "loss": 0.1638, + "step": 310 + }, + { + "epoch": 1.8, + "grad_norm": 1.8740538358688354, + "learning_rate": 0.00027308571428571424, + "loss": 0.1803, + "step": 315 + }, + { + "epoch": 1.8285714285714287, + "grad_norm": 9.821504592895508, + "learning_rate": 0.0002726571428571428, + "loss": 0.226, + "step": 320 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 1.0889750719070435, + "learning_rate": 0.0002722285714285714, + "loss": 0.1822, + "step": 325 + }, + { + "epoch": 1.8857142857142857, + "grad_norm": 0.9660868048667908, + "learning_rate": 0.0002718, + "loss": 0.1842, + "step": 330 + }, + { + "epoch": 1.9142857142857141, + "grad_norm": 0.6329234838485718, + "learning_rate": 0.0002713714285714286, + "loss": 0.1488, + "step": 335 + }, + { + "epoch": 1.9428571428571428, + "grad_norm": 3.601266384124756, + "learning_rate": 0.0002709428571428571, + "loss": 0.1887, + "step": 340 + }, + { + "epoch": 1.9714285714285715, + "grad_norm": 1.1441439390182495, + "learning_rate": 0.0002705142857142857, + "loss": 0.184, + "step": 345 + }, + { + "epoch": 2.0, + "grad_norm": 0.8586034774780273, + "learning_rate": 0.0002700857142857143, + "loss": 0.1578, + "step": 350 + }, + { + "epoch": 2.0285714285714285, + "grad_norm": 1.5113487243652344, + "learning_rate": 0.00026965714285714286, + "loss": 0.2002, + "step": 355 + }, + { + "epoch": 2.057142857142857, + "grad_norm": 1.1123011112213135, + "learning_rate": 0.0002692285714285714, + "loss": 0.1946, + "step": 360 + }, + { + "epoch": 2.085714285714286, + "grad_norm": 0.9377036094665527, + "learning_rate": 0.0002688, + "loss": 0.1971, + "step": 365 + }, + { + "epoch": 2.1142857142857143, + "grad_norm": 0.6956892609596252, + "learning_rate": 0.00026837142857142856, + "loss": 0.1758, + "step": 370 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.7510782480239868, + "learning_rate": 0.0002679428571428571, + "loss": 0.1674, + "step": 375 + }, + { + "epoch": 2.1714285714285713, + "grad_norm": 0.7009285092353821, + "learning_rate": 0.00026751428571428567, + "loss": 0.1945, + "step": 380 + }, + { + "epoch": 2.2, + "grad_norm": 0.9555609822273254, + "learning_rate": 0.00026708571428571426, + "loss": 0.1857, + "step": 385 + }, + { + "epoch": 2.2285714285714286, + "grad_norm": 2.133979082107544, + "learning_rate": 0.00026665714285714284, + "loss": 0.1636, + "step": 390 + }, + { + "epoch": 2.257142857142857, + "grad_norm": 0.7105309963226318, + "learning_rate": 0.0002662285714285714, + "loss": 0.2014, + "step": 395 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.7329701781272888, + "learning_rate": 0.00026579999999999996, + "loss": 0.1884, + "step": 400 + }, + { + "epoch": 2.314285714285714, + "grad_norm": 1.0426994562149048, + "learning_rate": 0.00026537142857142854, + "loss": 0.1558, + "step": 405 + }, + { + "epoch": 2.342857142857143, + "grad_norm": 0.9306122660636902, + "learning_rate": 0.0002649428571428571, + "loss": 0.1774, + "step": 410 + }, + { + "epoch": 2.3714285714285714, + "grad_norm": 0.6989394426345825, + "learning_rate": 0.00026451428571428565, + "loss": 0.1601, + "step": 415 + }, + { + "epoch": 2.4, + "grad_norm": 1.4383760690689087, + "learning_rate": 0.0002640857142857143, + "loss": 0.1564, + "step": 420 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 0.6448336839675903, + "learning_rate": 0.0002636571428571428, + "loss": 0.1827, + "step": 425 + }, + { + "epoch": 2.4571428571428573, + "grad_norm": 0.9535760879516602, + "learning_rate": 0.0002632285714285714, + "loss": 0.1713, + "step": 430 + }, + { + "epoch": 2.4857142857142858, + "grad_norm": 1.034945011138916, + "learning_rate": 0.0002628, + "loss": 0.1457, + "step": 435 + }, + { + "epoch": 2.5142857142857142, + "grad_norm": 1.3225128650665283, + "learning_rate": 0.0002623714285714285, + "loss": 0.1633, + "step": 440 + }, + { + "epoch": 2.5428571428571427, + "grad_norm": 0.8285059928894043, + "learning_rate": 0.0002619428571428571, + "loss": 0.2004, + "step": 445 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.773176908493042, + "learning_rate": 0.0002615142857142857, + "loss": 0.1641, + "step": 450 + }, + { + "epoch": 2.6, + "grad_norm": 0.7964853048324585, + "learning_rate": 0.0002610857142857143, + "loss": 0.1608, + "step": 455 + }, + { + "epoch": 2.6285714285714286, + "grad_norm": 1.0967328548431396, + "learning_rate": 0.00026065714285714286, + "loss": 0.1697, + "step": 460 + }, + { + "epoch": 2.657142857142857, + "grad_norm": 0.6462066173553467, + "learning_rate": 0.0002602285714285714, + "loss": 0.1512, + "step": 465 + }, + { + "epoch": 2.685714285714286, + "grad_norm": 0.8765937089920044, + "learning_rate": 0.00025979999999999997, + "loss": 0.1826, + "step": 470 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 1.2524124383926392, + "learning_rate": 0.00025937142857142856, + "loss": 0.1731, + "step": 475 + }, + { + "epoch": 2.742857142857143, + "grad_norm": 2.2982606887817383, + "learning_rate": 0.0002589428571428571, + "loss": 0.1852, + "step": 480 + }, + { + "epoch": 2.7714285714285714, + "grad_norm": 0.9989053010940552, + "learning_rate": 0.0002585142857142857, + "loss": 0.1791, + "step": 485 + }, + { + "epoch": 2.8, + "grad_norm": 0.772343635559082, + "learning_rate": 0.00025808571428571426, + "loss": 0.1862, + "step": 490 + }, + { + "epoch": 2.8285714285714287, + "grad_norm": 1.2101136445999146, + "learning_rate": 0.00025765714285714284, + "loss": 0.1806, + "step": 495 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.8010189533233643, + "learning_rate": 0.0002572285714285714, + "loss": 0.1842, + "step": 500 + }, + { + "epoch": 2.8857142857142857, + "grad_norm": 1.3597544431686401, + "learning_rate": 0.00025679999999999995, + "loss": 0.1583, + "step": 505 + }, + { + "epoch": 2.914285714285714, + "grad_norm": 0.8790671825408936, + "learning_rate": 0.00025637142857142854, + "loss": 0.1565, + "step": 510 + }, + { + "epoch": 2.942857142857143, + "grad_norm": 1.1175066232681274, + "learning_rate": 0.0002559428571428571, + "loss": 0.1406, + "step": 515 + }, + { + "epoch": 2.9714285714285715, + "grad_norm": 2.8528785705566406, + "learning_rate": 0.0002555142857142857, + "loss": 0.1735, + "step": 520 + }, + { + "epoch": 3.0, + "grad_norm": 2.2073328495025635, + "learning_rate": 0.0002550857142857143, + "loss": 0.1816, + "step": 525 + }, + { + "epoch": 3.0285714285714285, + "grad_norm": 11.01322078704834, + "learning_rate": 0.0002546571428571428, + "loss": 0.1873, + "step": 530 + }, + { + "epoch": 3.057142857142857, + "grad_norm": 1.5822402238845825, + "learning_rate": 0.0002542285714285714, + "loss": 0.168, + "step": 535 + }, + { + "epoch": 3.085714285714286, + "grad_norm": 1.3086942434310913, + "learning_rate": 0.0002538, + "loss": 0.149, + "step": 540 + }, + { + "epoch": 3.1142857142857143, + "grad_norm": 6.303041458129883, + "learning_rate": 0.0002533714285714285, + "loss": 0.1651, + "step": 545 + }, + { + "epoch": 3.142857142857143, + "grad_norm": 14.48929500579834, + "learning_rate": 0.00025294285714285716, + "loss": 0.1687, + "step": 550 + }, + { + "epoch": 3.1714285714285713, + "grad_norm": 6.824525356292725, + "learning_rate": 0.0002525142857142857, + "loss": 0.1919, + "step": 555 + }, + { + "epoch": 3.2, + "grad_norm": 18.772563934326172, + "learning_rate": 0.00025208571428571427, + "loss": 0.2075, + "step": 560 + }, + { + "epoch": 3.2285714285714286, + "grad_norm": 0.7268752455711365, + "learning_rate": 0.00025165714285714286, + "loss": 0.174, + "step": 565 + }, + { + "epoch": 3.257142857142857, + "grad_norm": 1.1301453113555908, + "learning_rate": 0.0002512285714285714, + "loss": 0.1668, + "step": 570 + }, + { + "epoch": 3.2857142857142856, + "grad_norm": 2.846802234649658, + "learning_rate": 0.00025079999999999997, + "loss": 0.1645, + "step": 575 + }, + { + "epoch": 3.314285714285714, + "grad_norm": 1.417515754699707, + "learning_rate": 0.00025037142857142855, + "loss": 0.1719, + "step": 580 + }, + { + "epoch": 3.342857142857143, + "grad_norm": 4.137150764465332, + "learning_rate": 0.00024994285714285714, + "loss": 0.1739, + "step": 585 + }, + { + "epoch": 3.3714285714285714, + "grad_norm": 2.6067259311676025, + "learning_rate": 0.0002495142857142857, + "loss": 0.1489, + "step": 590 + }, + { + "epoch": 3.4, + "grad_norm": 2.601024627685547, + "learning_rate": 0.00024908571428571425, + "loss": 0.1618, + "step": 595 + }, + { + "epoch": 3.4285714285714284, + "grad_norm": 3.849017858505249, + "learning_rate": 0.00024865714285714284, + "loss": 0.1899, + "step": 600 + }, + { + "epoch": 3.4571428571428573, + "grad_norm": 4.673766136169434, + "learning_rate": 0.0002482285714285714, + "loss": 0.1761, + "step": 605 + }, + { + "epoch": 3.4857142857142858, + "grad_norm": 2.6057631969451904, + "learning_rate": 0.00024779999999999995, + "loss": 0.1743, + "step": 610 + }, + { + "epoch": 3.5142857142857142, + "grad_norm": 2.932652473449707, + "learning_rate": 0.0002473714285714286, + "loss": 0.1482, + "step": 615 + }, + { + "epoch": 3.5428571428571427, + "grad_norm": 0.8764939308166504, + "learning_rate": 0.0002469428571428571, + "loss": 0.1644, + "step": 620 + }, + { + "epoch": 3.571428571428571, + "grad_norm": 1.3203191757202148, + "learning_rate": 0.0002465142857142857, + "loss": 0.1654, + "step": 625 + }, + { + "epoch": 3.6, + "grad_norm": 0.7977635264396667, + "learning_rate": 0.0002460857142857143, + "loss": 0.1472, + "step": 630 + }, + { + "epoch": 3.6285714285714286, + "grad_norm": 1.4750248193740845, + "learning_rate": 0.0002456571428571428, + "loss": 0.1735, + "step": 635 + }, + { + "epoch": 3.657142857142857, + "grad_norm": 1.8164482116699219, + "learning_rate": 0.0002452285714285714, + "loss": 0.1593, + "step": 640 + }, + { + "epoch": 3.685714285714286, + "grad_norm": 1.4829603433609009, + "learning_rate": 0.0002448, + "loss": 0.1508, + "step": 645 + }, + { + "epoch": 3.7142857142857144, + "grad_norm": 0.8828144669532776, + "learning_rate": 0.00024437142857142857, + "loss": 0.1573, + "step": 650 + }, + { + "epoch": 3.742857142857143, + "grad_norm": 2.039384126663208, + "learning_rate": 0.00024394285714285713, + "loss": 0.1745, + "step": 655 + }, + { + "epoch": 3.7714285714285714, + "grad_norm": 0.9604200720787048, + "learning_rate": 0.00024351428571428569, + "loss": 0.17, + "step": 660 + }, + { + "epoch": 3.8, + "grad_norm": 0.7903971076011658, + "learning_rate": 0.00024308571428571427, + "loss": 0.1654, + "step": 665 + }, + { + "epoch": 3.8285714285714287, + "grad_norm": 0.6935649514198303, + "learning_rate": 0.00024265714285714283, + "loss": 0.1714, + "step": 670 + }, + { + "epoch": 3.857142857142857, + "grad_norm": 0.5832012295722961, + "learning_rate": 0.00024222857142857138, + "loss": 0.1636, + "step": 675 + }, + { + "epoch": 3.8857142857142857, + "grad_norm": 0.6303168535232544, + "learning_rate": 0.0002418, + "loss": 0.1604, + "step": 680 + }, + { + "epoch": 3.914285714285714, + "grad_norm": 0.7210885882377625, + "learning_rate": 0.00024137142857142855, + "loss": 0.1444, + "step": 685 + }, + { + "epoch": 3.942857142857143, + "grad_norm": 0.7690990567207336, + "learning_rate": 0.00024094285714285714, + "loss": 0.1631, + "step": 690 + }, + { + "epoch": 3.9714285714285715, + "grad_norm": 1.0142720937728882, + "learning_rate": 0.0002405142857142857, + "loss": 0.158, + "step": 695 + }, + { + "epoch": 4.0, + "grad_norm": 0.7970322966575623, + "learning_rate": 0.00024008571428571425, + "loss": 0.1803, + "step": 700 + }, + { + "epoch": 4.0285714285714285, + "grad_norm": 0.6795914769172668, + "learning_rate": 0.00023965714285714284, + "loss": 0.143, + "step": 705 + }, + { + "epoch": 4.057142857142857, + "grad_norm": 0.6832629442214966, + "learning_rate": 0.0002392285714285714, + "loss": 0.1457, + "step": 710 + }, + { + "epoch": 4.085714285714285, + "grad_norm": 3.8629798889160156, + "learning_rate": 0.0002388, + "loss": 0.1671, + "step": 715 + }, + { + "epoch": 4.114285714285714, + "grad_norm": 1.1167882680892944, + "learning_rate": 0.00023837142857142856, + "loss": 0.1544, + "step": 720 + }, + { + "epoch": 4.142857142857143, + "grad_norm": 0.9431412816047668, + "learning_rate": 0.00023794285714285712, + "loss": 0.1605, + "step": 725 + }, + { + "epoch": 4.171428571428572, + "grad_norm": 1.310948133468628, + "learning_rate": 0.0002375142857142857, + "loss": 0.1121, + "step": 730 + }, + { + "epoch": 4.2, + "grad_norm": 0.9830737709999084, + "learning_rate": 0.00023708571428571426, + "loss": 0.1742, + "step": 735 + }, + { + "epoch": 4.228571428571429, + "grad_norm": 0.6166555881500244, + "learning_rate": 0.00023665714285714282, + "loss": 0.1525, + "step": 740 + }, + { + "epoch": 4.257142857142857, + "grad_norm": 0.995579719543457, + "learning_rate": 0.00023622857142857143, + "loss": 0.1439, + "step": 745 + }, + { + "epoch": 4.285714285714286, + "grad_norm": 0.639796793460846, + "learning_rate": 0.00023579999999999999, + "loss": 0.1692, + "step": 750 + }, + { + "epoch": 4.314285714285714, + "grad_norm": 0.9438050389289856, + "learning_rate": 0.00023537142857142854, + "loss": 0.1785, + "step": 755 + }, + { + "epoch": 4.3428571428571425, + "grad_norm": 0.8960750102996826, + "learning_rate": 0.00023494285714285713, + "loss": 0.1557, + "step": 760 + }, + { + "epoch": 4.371428571428572, + "grad_norm": 0.6287499070167542, + "learning_rate": 0.00023451428571428568, + "loss": 0.1459, + "step": 765 + }, + { + "epoch": 4.4, + "grad_norm": 0.7638295888900757, + "learning_rate": 0.00023408571428571424, + "loss": 0.1341, + "step": 770 + }, + { + "epoch": 4.428571428571429, + "grad_norm": 0.655878484249115, + "learning_rate": 0.00023365714285714283, + "loss": 0.1358, + "step": 775 + }, + { + "epoch": 4.457142857142857, + "grad_norm": 0.5840997695922852, + "learning_rate": 0.0002332285714285714, + "loss": 0.1386, + "step": 780 + }, + { + "epoch": 4.485714285714286, + "grad_norm": 1.1082488298416138, + "learning_rate": 0.0002328, + "loss": 0.1827, + "step": 785 + }, + { + "epoch": 4.514285714285714, + "grad_norm": 0.8825240135192871, + "learning_rate": 0.00023237142857142855, + "loss": 0.1527, + "step": 790 + }, + { + "epoch": 4.542857142857143, + "grad_norm": 0.6752304434776306, + "learning_rate": 0.0002319428571428571, + "loss": 0.1392, + "step": 795 + }, + { + "epoch": 4.571428571428571, + "grad_norm": 1.1423301696777344, + "learning_rate": 0.0002315142857142857, + "loss": 0.1433, + "step": 800 + }, + { + "epoch": 4.6, + "grad_norm": 10.793691635131836, + "learning_rate": 0.00023108571428571425, + "loss": 0.1635, + "step": 805 + }, + { + "epoch": 4.628571428571428, + "grad_norm": 0.47564294934272766, + "learning_rate": 0.00023065714285714286, + "loss": 0.1199, + "step": 810 + }, + { + "epoch": 4.6571428571428575, + "grad_norm": 1.2492656707763672, + "learning_rate": 0.00023022857142857142, + "loss": 0.1488, + "step": 815 + }, + { + "epoch": 4.685714285714286, + "grad_norm": 0.6933501958847046, + "learning_rate": 0.00022979999999999997, + "loss": 0.1812, + "step": 820 + }, + { + "epoch": 4.714285714285714, + "grad_norm": 0.7901633977890015, + "learning_rate": 0.00022937142857142856, + "loss": 0.1415, + "step": 825 + }, + { + "epoch": 4.742857142857143, + "grad_norm": 0.7854829430580139, + "learning_rate": 0.00022894285714285712, + "loss": 0.1401, + "step": 830 + }, + { + "epoch": 4.771428571428571, + "grad_norm": 0.8716740608215332, + "learning_rate": 0.00022851428571428567, + "loss": 0.1982, + "step": 835 + }, + { + "epoch": 4.8, + "grad_norm": 0.7047899961471558, + "learning_rate": 0.00022808571428571426, + "loss": 0.1624, + "step": 840 + }, + { + "epoch": 4.828571428571428, + "grad_norm": 0.7134959697723389, + "learning_rate": 0.00022765714285714284, + "loss": 0.1375, + "step": 845 + }, + { + "epoch": 4.857142857142857, + "grad_norm": 1.0897325277328491, + "learning_rate": 0.00022722857142857143, + "loss": 0.1489, + "step": 850 + }, + { + "epoch": 4.885714285714286, + "grad_norm": 1.1065207719802856, + "learning_rate": 0.00022679999999999998, + "loss": 0.1495, + "step": 855 + }, + { + "epoch": 4.914285714285715, + "grad_norm": 0.7434757351875305, + "learning_rate": 0.00022637142857142854, + "loss": 0.1507, + "step": 860 + }, + { + "epoch": 4.942857142857143, + "grad_norm": 1.0045181512832642, + "learning_rate": 0.00022594285714285712, + "loss": 0.1527, + "step": 865 + }, + { + "epoch": 4.9714285714285715, + "grad_norm": 1.2025654315948486, + "learning_rate": 0.00022551428571428568, + "loss": 0.1523, + "step": 870 + }, + { + "epoch": 5.0, + "grad_norm": 0.7823342084884644, + "learning_rate": 0.0002250857142857143, + "loss": 0.1514, + "step": 875 + }, + { + "epoch": 5.0285714285714285, + "grad_norm": 0.8405362963676453, + "learning_rate": 0.00022465714285714285, + "loss": 0.1461, + "step": 880 + }, + { + "epoch": 5.057142857142857, + "grad_norm": 0.7527463436126709, + "learning_rate": 0.0002242285714285714, + "loss": 0.1206, + "step": 885 + }, + { + "epoch": 5.085714285714285, + "grad_norm": 0.8372548222541809, + "learning_rate": 0.0002238, + "loss": 0.1513, + "step": 890 + }, + { + "epoch": 5.114285714285714, + "grad_norm": 0.8755456209182739, + "learning_rate": 0.00022337142857142855, + "loss": 0.1498, + "step": 895 + }, + { + "epoch": 5.142857142857143, + "grad_norm": 0.7312084436416626, + "learning_rate": 0.0002229428571428571, + "loss": 0.154, + "step": 900 + }, + { + "epoch": 5.171428571428572, + "grad_norm": 0.6366221904754639, + "learning_rate": 0.0002225142857142857, + "loss": 0.1466, + "step": 905 + }, + { + "epoch": 5.2, + "grad_norm": 0.6406880617141724, + "learning_rate": 0.00022208571428571427, + "loss": 0.1254, + "step": 910 + }, + { + "epoch": 5.228571428571429, + "grad_norm": 2.4106833934783936, + "learning_rate": 0.00022165714285714283, + "loss": 0.1534, + "step": 915 + }, + { + "epoch": 5.257142857142857, + "grad_norm": 0.5635722279548645, + "learning_rate": 0.00022122857142857142, + "loss": 0.1461, + "step": 920 + }, + { + "epoch": 5.285714285714286, + "grad_norm": 0.787162184715271, + "learning_rate": 0.00022079999999999997, + "loss": 0.1424, + "step": 925 + }, + { + "epoch": 5.314285714285714, + "grad_norm": 0.6513975262641907, + "learning_rate": 0.00022037142857142853, + "loss": 0.1326, + "step": 930 + }, + { + "epoch": 5.3428571428571425, + "grad_norm": 0.6933534741401672, + "learning_rate": 0.00021994285714285711, + "loss": 0.1661, + "step": 935 + }, + { + "epoch": 5.371428571428572, + "grad_norm": 0.7263259887695312, + "learning_rate": 0.0002195142857142857, + "loss": 0.15, + "step": 940 + }, + { + "epoch": 5.4, + "grad_norm": 0.5537381768226624, + "learning_rate": 0.00021908571428571428, + "loss": 0.129, + "step": 945 + }, + { + "epoch": 5.428571428571429, + "grad_norm": 0.6014005541801453, + "learning_rate": 0.00021865714285714284, + "loss": 0.1321, + "step": 950 + }, + { + "epoch": 5.457142857142857, + "grad_norm": 0.6581441760063171, + "learning_rate": 0.0002182285714285714, + "loss": 0.1587, + "step": 955 + }, + { + "epoch": 5.485714285714286, + "grad_norm": 0.9326379895210266, + "learning_rate": 0.00021779999999999998, + "loss": 0.1654, + "step": 960 + }, + { + "epoch": 5.514285714285714, + "grad_norm": 0.9438592791557312, + "learning_rate": 0.00021737142857142854, + "loss": 0.1212, + "step": 965 + }, + { + "epoch": 5.542857142857143, + "grad_norm": 0.7699571251869202, + "learning_rate": 0.00021694285714285715, + "loss": 0.1464, + "step": 970 + }, + { + "epoch": 5.571428571428571, + "grad_norm": 0.8758366703987122, + "learning_rate": 0.0002165142857142857, + "loss": 0.1599, + "step": 975 + }, + { + "epoch": 5.6, + "grad_norm": 0.6101442575454712, + "learning_rate": 0.00021608571428571426, + "loss": 0.1589, + "step": 980 + }, + { + "epoch": 5.628571428571428, + "grad_norm": 0.7454060912132263, + "learning_rate": 0.00021565714285714285, + "loss": 0.1433, + "step": 985 + }, + { + "epoch": 5.6571428571428575, + "grad_norm": 0.6379484534263611, + "learning_rate": 0.0002152285714285714, + "loss": 0.1592, + "step": 990 + }, + { + "epoch": 5.685714285714286, + "grad_norm": 1.1601309776306152, + "learning_rate": 0.00021479999999999996, + "loss": 0.1647, + "step": 995 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 0.5464673638343811, + "learning_rate": 0.00021437142857142855, + "loss": 0.1469, + "step": 1000 + }, + { + "epoch": 5.742857142857143, + "grad_norm": 1.0279319286346436, + "learning_rate": 0.00021394285714285713, + "loss": 0.1203, + "step": 1005 + }, + { + "epoch": 5.771428571428571, + "grad_norm": 0.5503718256950378, + "learning_rate": 0.00021351428571428572, + "loss": 0.1409, + "step": 1010 + }, + { + "epoch": 5.8, + "grad_norm": 0.6123886108398438, + "learning_rate": 0.00021308571428571427, + "loss": 0.1427, + "step": 1015 + }, + { + "epoch": 5.828571428571428, + "grad_norm": 0.6560390591621399, + "learning_rate": 0.00021265714285714283, + "loss": 0.1415, + "step": 1020 + }, + { + "epoch": 5.857142857142857, + "grad_norm": 0.5576716661453247, + "learning_rate": 0.00021222857142857141, + "loss": 0.1408, + "step": 1025 + }, + { + "epoch": 5.885714285714286, + "grad_norm": 0.6419074535369873, + "learning_rate": 0.00021179999999999997, + "loss": 0.1385, + "step": 1030 + }, + { + "epoch": 5.914285714285715, + "grad_norm": 1.008925199508667, + "learning_rate": 0.00021137142857142858, + "loss": 0.1497, + "step": 1035 + }, + { + "epoch": 5.942857142857143, + "grad_norm": 0.6559906005859375, + "learning_rate": 0.00021094285714285714, + "loss": 0.1218, + "step": 1040 + }, + { + "epoch": 5.9714285714285715, + "grad_norm": 0.627164363861084, + "learning_rate": 0.0002105142857142857, + "loss": 0.1368, + "step": 1045 + }, + { + "epoch": 6.0, + "grad_norm": 0.5760972499847412, + "learning_rate": 0.00021008571428571428, + "loss": 0.1508, + "step": 1050 + }, + { + "epoch": 6.0285714285714285, + "grad_norm": 0.5754174590110779, + "learning_rate": 0.00020965714285714284, + "loss": 0.1181, + "step": 1055 + }, + { + "epoch": 6.057142857142857, + "grad_norm": 0.8736348748207092, + "learning_rate": 0.0002092285714285714, + "loss": 0.1252, + "step": 1060 + }, + { + "epoch": 6.085714285714285, + "grad_norm": 0.7166719436645508, + "learning_rate": 0.00020879999999999998, + "loss": 0.1481, + "step": 1065 + }, + { + "epoch": 6.114285714285714, + "grad_norm": 0.6494349241256714, + "learning_rate": 0.00020837142857142856, + "loss": 0.1478, + "step": 1070 + }, + { + "epoch": 6.142857142857143, + "grad_norm": 0.6681587100028992, + "learning_rate": 0.00020794285714285712, + "loss": 0.1488, + "step": 1075 + }, + { + "epoch": 6.171428571428572, + "grad_norm": 0.7123684883117676, + "learning_rate": 0.0002075142857142857, + "loss": 0.1378, + "step": 1080 + }, + { + "epoch": 6.2, + "grad_norm": 0.6146950721740723, + "learning_rate": 0.00020708571428571426, + "loss": 0.1306, + "step": 1085 + }, + { + "epoch": 6.228571428571429, + "grad_norm": 0.8402445912361145, + "learning_rate": 0.00020665714285714282, + "loss": 0.1063, + "step": 1090 + }, + { + "epoch": 6.257142857142857, + "grad_norm": 0.6567764282226562, + "learning_rate": 0.0002062285714285714, + "loss": 0.1195, + "step": 1095 + }, + { + "epoch": 6.285714285714286, + "grad_norm": 0.6006014943122864, + "learning_rate": 0.0002058, + "loss": 0.1542, + "step": 1100 + }, + { + "epoch": 6.314285714285714, + "grad_norm": 0.793100893497467, + "learning_rate": 0.00020537142857142857, + "loss": 0.1381, + "step": 1105 + }, + { + "epoch": 6.3428571428571425, + "grad_norm": 0.5923666954040527, + "learning_rate": 0.00020494285714285713, + "loss": 0.1386, + "step": 1110 + }, + { + "epoch": 6.371428571428572, + "grad_norm": 0.6692521572113037, + "learning_rate": 0.0002045142857142857, + "loss": 0.1223, + "step": 1115 + }, + { + "epoch": 6.4, + "grad_norm": 0.7216306328773499, + "learning_rate": 0.00020408571428571427, + "loss": 0.1367, + "step": 1120 + }, + { + "epoch": 6.428571428571429, + "grad_norm": 0.5640934109687805, + "learning_rate": 0.00020365714285714283, + "loss": 0.1554, + "step": 1125 + }, + { + "epoch": 6.457142857142857, + "grad_norm": 0.8154368996620178, + "learning_rate": 0.00020322857142857138, + "loss": 0.1674, + "step": 1130 + }, + { + "epoch": 6.485714285714286, + "grad_norm": 0.7185398936271667, + "learning_rate": 0.0002028, + "loss": 0.1375, + "step": 1135 + }, + { + "epoch": 6.514285714285714, + "grad_norm": 0.6805170774459839, + "learning_rate": 0.00020237142857142855, + "loss": 0.1306, + "step": 1140 + }, + { + "epoch": 6.542857142857143, + "grad_norm": 0.5996941924095154, + "learning_rate": 0.00020194285714285714, + "loss": 0.1433, + "step": 1145 + }, + { + "epoch": 6.571428571428571, + "grad_norm": 0.5258373022079468, + "learning_rate": 0.0002015142857142857, + "loss": 0.1285, + "step": 1150 + }, + { + "epoch": 6.6, + "grad_norm": 0.7771695256233215, + "learning_rate": 0.00020108571428571425, + "loss": 0.1493, + "step": 1155 + }, + { + "epoch": 6.628571428571428, + "grad_norm": 0.5920616388320923, + "learning_rate": 0.00020065714285714284, + "loss": 0.1479, + "step": 1160 + }, + { + "epoch": 6.6571428571428575, + "grad_norm": 0.7460982799530029, + "learning_rate": 0.00020022857142857142, + "loss": 0.1173, + "step": 1165 + }, + { + "epoch": 6.685714285714286, + "grad_norm": 1.1703822612762451, + "learning_rate": 0.0001998, + "loss": 0.1402, + "step": 1170 + }, + { + "epoch": 6.714285714285714, + "grad_norm": 0.7894724011421204, + "learning_rate": 0.00019937142857142856, + "loss": 0.1253, + "step": 1175 + }, + { + "epoch": 6.742857142857143, + "grad_norm": 0.7013376355171204, + "learning_rate": 0.00019894285714285712, + "loss": 0.1573, + "step": 1180 + }, + { + "epoch": 6.771428571428571, + "grad_norm": 0.6421737670898438, + "learning_rate": 0.0001985142857142857, + "loss": 0.1497, + "step": 1185 + }, + { + "epoch": 6.8, + "grad_norm": 1.204296350479126, + "learning_rate": 0.00019808571428571426, + "loss": 0.1634, + "step": 1190 + }, + { + "epoch": 6.828571428571428, + "grad_norm": 0.867765486240387, + "learning_rate": 0.00019765714285714282, + "loss": 0.1353, + "step": 1195 + }, + { + "epoch": 6.857142857142857, + "grad_norm": 0.7325594425201416, + "learning_rate": 0.00019722857142857143, + "loss": 0.118, + "step": 1200 + }, + { + "epoch": 6.885714285714286, + "grad_norm": 0.7029078006744385, + "learning_rate": 0.00019679999999999999, + "loss": 0.1425, + "step": 1205 + }, + { + "epoch": 6.914285714285715, + "grad_norm": 1.1572504043579102, + "learning_rate": 0.00019637142857142857, + "loss": 0.1337, + "step": 1210 + }, + { + "epoch": 6.942857142857143, + "grad_norm": 0.8022822141647339, + "learning_rate": 0.00019594285714285713, + "loss": 0.1684, + "step": 1215 + }, + { + "epoch": 6.9714285714285715, + "grad_norm": 0.6729874610900879, + "learning_rate": 0.00019551428571428568, + "loss": 0.1238, + "step": 1220 + }, + { + "epoch": 7.0, + "grad_norm": 0.5773627758026123, + "learning_rate": 0.00019508571428571427, + "loss": 0.138, + "step": 1225 + }, + { + "epoch": 7.0285714285714285, + "grad_norm": 0.7182291150093079, + "learning_rate": 0.00019465714285714285, + "loss": 0.1431, + "step": 1230 + }, + { + "epoch": 7.057142857142857, + "grad_norm": 1.7567912340164185, + "learning_rate": 0.0001942285714285714, + "loss": 0.1319, + "step": 1235 + }, + { + "epoch": 7.085714285714285, + "grad_norm": 0.6845232248306274, + "learning_rate": 0.0001938, + "loss": 0.1292, + "step": 1240 + }, + { + "epoch": 7.114285714285714, + "grad_norm": 0.6077771782875061, + "learning_rate": 0.00019337142857142855, + "loss": 0.1238, + "step": 1245 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 0.6168347597122192, + "learning_rate": 0.0001929428571428571, + "loss": 0.1384, + "step": 1250 + }, + { + "epoch": 7.171428571428572, + "grad_norm": 0.7457576394081116, + "learning_rate": 0.0001925142857142857, + "loss": 0.1306, + "step": 1255 + }, + { + "epoch": 7.2, + "grad_norm": 0.5969316363334656, + "learning_rate": 0.00019208571428571425, + "loss": 0.1123, + "step": 1260 + }, + { + "epoch": 7.228571428571429, + "grad_norm": 0.6902753710746765, + "learning_rate": 0.00019165714285714286, + "loss": 0.1185, + "step": 1265 + }, + { + "epoch": 7.257142857142857, + "grad_norm": 0.6488338112831116, + "learning_rate": 0.00019122857142857142, + "loss": 0.1431, + "step": 1270 + }, + { + "epoch": 7.285714285714286, + "grad_norm": 0.6814819574356079, + "learning_rate": 0.00019079999999999998, + "loss": 0.1495, + "step": 1275 + }, + { + "epoch": 7.314285714285714, + "grad_norm": 0.7468088865280151, + "learning_rate": 0.00019037142857142856, + "loss": 0.1158, + "step": 1280 + }, + { + "epoch": 7.3428571428571425, + "grad_norm": 0.7417412400245667, + "learning_rate": 0.00018994285714285712, + "loss": 0.1311, + "step": 1285 + }, + { + "epoch": 7.371428571428572, + "grad_norm": 0.5480664372444153, + "learning_rate": 0.00018951428571428567, + "loss": 0.135, + "step": 1290 + }, + { + "epoch": 7.4, + "grad_norm": 0.725527822971344, + "learning_rate": 0.00018908571428571429, + "loss": 0.1217, + "step": 1295 + }, + { + "epoch": 7.428571428571429, + "grad_norm": 0.6566678285598755, + "learning_rate": 0.00018865714285714284, + "loss": 0.1417, + "step": 1300 + }, + { + "epoch": 7.457142857142857, + "grad_norm": 0.516952395439148, + "learning_rate": 0.00018822857142857143, + "loss": 0.1329, + "step": 1305 + }, + { + "epoch": 7.485714285714286, + "grad_norm": 1.9545241594314575, + "learning_rate": 0.00018779999999999998, + "loss": 0.1339, + "step": 1310 + }, + { + "epoch": 7.514285714285714, + "grad_norm": 0.8276839852333069, + "learning_rate": 0.00018737142857142854, + "loss": 0.1324, + "step": 1315 + }, + { + "epoch": 7.542857142857143, + "grad_norm": 0.6737099289894104, + "learning_rate": 0.00018694285714285713, + "loss": 0.1139, + "step": 1320 + }, + { + "epoch": 7.571428571428571, + "grad_norm": 0.6914472579956055, + "learning_rate": 0.00018651428571428568, + "loss": 0.1146, + "step": 1325 + }, + { + "epoch": 7.6, + "grad_norm": 0.6630033850669861, + "learning_rate": 0.0001860857142857143, + "loss": 0.1571, + "step": 1330 + }, + { + "epoch": 7.628571428571428, + "grad_norm": 0.820688784122467, + "learning_rate": 0.00018565714285714285, + "loss": 0.15, + "step": 1335 + }, + { + "epoch": 7.6571428571428575, + "grad_norm": 2.0491325855255127, + "learning_rate": 0.0001852285714285714, + "loss": 0.127, + "step": 1340 + }, + { + "epoch": 7.685714285714286, + "grad_norm": 0.9327268004417419, + "learning_rate": 0.0001848, + "loss": 0.1289, + "step": 1345 + }, + { + "epoch": 7.714285714285714, + "grad_norm": 1.3131701946258545, + "learning_rate": 0.00018437142857142855, + "loss": 0.1228, + "step": 1350 + }, + { + "epoch": 7.742857142857143, + "grad_norm": 2.955918312072754, + "learning_rate": 0.0001839428571428571, + "loss": 0.1082, + "step": 1355 + }, + { + "epoch": 7.771428571428571, + "grad_norm": 1.2165493965148926, + "learning_rate": 0.00018351428571428572, + "loss": 0.1688, + "step": 1360 + }, + { + "epoch": 7.8, + "grad_norm": 0.759324312210083, + "learning_rate": 0.00018308571428571428, + "loss": 0.1185, + "step": 1365 + }, + { + "epoch": 7.828571428571428, + "grad_norm": 0.7445591688156128, + "learning_rate": 0.00018265714285714286, + "loss": 0.1431, + "step": 1370 + }, + { + "epoch": 7.857142857142857, + "grad_norm": 0.679374098777771, + "learning_rate": 0.00018222857142857142, + "loss": 0.1451, + "step": 1375 + }, + { + "epoch": 7.885714285714286, + "grad_norm": 2.1234302520751953, + "learning_rate": 0.00018179999999999997, + "loss": 0.1265, + "step": 1380 + }, + { + "epoch": 7.914285714285715, + "grad_norm": 1.006521224975586, + "learning_rate": 0.00018137142857142856, + "loss": 0.1722, + "step": 1385 + }, + { + "epoch": 7.942857142857143, + "grad_norm": 0.7275253534317017, + "learning_rate": 0.00018094285714285712, + "loss": 0.1625, + "step": 1390 + }, + { + "epoch": 7.9714285714285715, + "grad_norm": 0.8612022995948792, + "learning_rate": 0.0001805142857142857, + "loss": 0.1345, + "step": 1395 + }, + { + "epoch": 8.0, + "grad_norm": 0.7276798486709595, + "learning_rate": 0.00018008571428571428, + "loss": 0.1236, + "step": 1400 + }, + { + "epoch": 8.028571428571428, + "grad_norm": 0.8731086850166321, + "learning_rate": 0.00017965714285714284, + "loss": 0.1604, + "step": 1405 + }, + { + "epoch": 8.057142857142857, + "grad_norm": 0.8950818777084351, + "learning_rate": 0.0001792285714285714, + "loss": 0.1531, + "step": 1410 + }, + { + "epoch": 8.085714285714285, + "grad_norm": 0.7399356365203857, + "learning_rate": 0.00017879999999999998, + "loss": 0.1508, + "step": 1415 + }, + { + "epoch": 8.114285714285714, + "grad_norm": 1.3727307319641113, + "learning_rate": 0.00017837142857142854, + "loss": 0.1487, + "step": 1420 + }, + { + "epoch": 8.142857142857142, + "grad_norm": 0.5938125848770142, + "learning_rate": 0.00017794285714285715, + "loss": 0.1303, + "step": 1425 + }, + { + "epoch": 8.17142857142857, + "grad_norm": 0.7043821811676025, + "learning_rate": 0.0001775142857142857, + "loss": 0.0948, + "step": 1430 + }, + { + "epoch": 8.2, + "grad_norm": 1.1062767505645752, + "learning_rate": 0.00017708571428571426, + "loss": 0.1412, + "step": 1435 + }, + { + "epoch": 8.228571428571428, + "grad_norm": 0.844832181930542, + "learning_rate": 0.00017665714285714285, + "loss": 0.113, + "step": 1440 + }, + { + "epoch": 8.257142857142856, + "grad_norm": 0.7564154863357544, + "learning_rate": 0.0001762285714285714, + "loss": 0.1319, + "step": 1445 + }, + { + "epoch": 8.285714285714286, + "grad_norm": 0.8843110203742981, + "learning_rate": 0.00017579999999999996, + "loss": 0.1206, + "step": 1450 + }, + { + "epoch": 8.314285714285715, + "grad_norm": 0.8175828456878662, + "learning_rate": 0.00017537142857142855, + "loss": 0.1327, + "step": 1455 + }, + { + "epoch": 8.342857142857143, + "grad_norm": 0.6443565487861633, + "learning_rate": 0.00017494285714285713, + "loss": 0.1239, + "step": 1460 + }, + { + "epoch": 8.371428571428572, + "grad_norm": 0.7237185835838318, + "learning_rate": 0.00017451428571428572, + "loss": 0.1639, + "step": 1465 + }, + { + "epoch": 8.4, + "grad_norm": 0.6118057370185852, + "learning_rate": 0.00017408571428571427, + "loss": 0.1363, + "step": 1470 + }, + { + "epoch": 8.428571428571429, + "grad_norm": 0.6754649877548218, + "learning_rate": 0.00017365714285714283, + "loss": 0.1187, + "step": 1475 + }, + { + "epoch": 8.457142857142857, + "grad_norm": 1.0067390203475952, + "learning_rate": 0.00017322857142857141, + "loss": 0.1401, + "step": 1480 + }, + { + "epoch": 8.485714285714286, + "grad_norm": 8.509544372558594, + "learning_rate": 0.00017279999999999997, + "loss": 0.1304, + "step": 1485 + }, + { + "epoch": 8.514285714285714, + "grad_norm": 4.2030205726623535, + "learning_rate": 0.00017237142857142858, + "loss": 0.121, + "step": 1490 + }, + { + "epoch": 8.542857142857143, + "grad_norm": 4.877438068389893, + "learning_rate": 0.00017194285714285714, + "loss": 0.1918, + "step": 1495 + }, + { + "epoch": 8.571428571428571, + "grad_norm": 6.4971232414245605, + "learning_rate": 0.0001715142857142857, + "loss": 0.2154, + "step": 1500 + }, + { + "epoch": 8.6, + "grad_norm": 4.365469932556152, + "learning_rate": 0.00017108571428571428, + "loss": 0.2272, + "step": 1505 + }, + { + "epoch": 8.628571428571428, + "grad_norm": 2.551957845687866, + "learning_rate": 0.00017065714285714284, + "loss": 0.2163, + "step": 1510 + }, + { + "epoch": 8.657142857142857, + "grad_norm": 5.326391220092773, + "learning_rate": 0.0001702285714285714, + "loss": 0.1612, + "step": 1515 + }, + { + "epoch": 8.685714285714285, + "grad_norm": 1.3528404235839844, + "learning_rate": 0.00016979999999999998, + "loss": 0.1636, + "step": 1520 + }, + { + "epoch": 8.714285714285714, + "grad_norm": 1.4466065168380737, + "learning_rate": 0.00016937142857142856, + "loss": 0.1295, + "step": 1525 + }, + { + "epoch": 8.742857142857144, + "grad_norm": 0.6576040387153625, + "learning_rate": 0.00016894285714285715, + "loss": 0.1318, + "step": 1530 + }, + { + "epoch": 8.771428571428572, + "grad_norm": 1.286942958831787, + "learning_rate": 0.0001685142857142857, + "loss": 0.1443, + "step": 1535 + }, + { + "epoch": 8.8, + "grad_norm": 9.474458694458008, + "learning_rate": 0.00016808571428571426, + "loss": 0.1313, + "step": 1540 + }, + { + "epoch": 8.82857142857143, + "grad_norm": 2.6731069087982178, + "learning_rate": 0.00016765714285714285, + "loss": 0.1485, + "step": 1545 + }, + { + "epoch": 8.857142857142858, + "grad_norm": 1.313723087310791, + "learning_rate": 0.0001672285714285714, + "loss": 0.1346, + "step": 1550 + }, + { + "epoch": 8.885714285714286, + "grad_norm": 1.7115576267242432, + "learning_rate": 0.0001668, + "loss": 0.1471, + "step": 1555 + }, + { + "epoch": 8.914285714285715, + "grad_norm": 1.2599923610687256, + "learning_rate": 0.00016637142857142857, + "loss": 0.1433, + "step": 1560 + }, + { + "epoch": 8.942857142857143, + "grad_norm": 0.9659029245376587, + "learning_rate": 0.00016594285714285713, + "loss": 0.1256, + "step": 1565 + }, + { + "epoch": 8.971428571428572, + "grad_norm": 1.1282744407653809, + "learning_rate": 0.0001655142857142857, + "loss": 0.1373, + "step": 1570 + }, + { + "epoch": 9.0, + "grad_norm": 3.20717453956604, + "learning_rate": 0.00016508571428571427, + "loss": 0.1355, + "step": 1575 + }, + { + "epoch": 9.028571428571428, + "grad_norm": 0.8310821056365967, + "learning_rate": 0.00016465714285714283, + "loss": 0.1268, + "step": 1580 + }, + { + "epoch": 9.057142857142857, + "grad_norm": 1.5337790250778198, + "learning_rate": 0.00016422857142857139, + "loss": 0.1267, + "step": 1585 + }, + { + "epoch": 9.085714285714285, + "grad_norm": 2.6406068801879883, + "learning_rate": 0.0001638, + "loss": 0.1363, + "step": 1590 + }, + { + "epoch": 9.114285714285714, + "grad_norm": 0.7705873847007751, + "learning_rate": 0.00016337142857142855, + "loss": 0.1291, + "step": 1595 + }, + { + "epoch": 9.142857142857142, + "grad_norm": 0.7092650532722473, + "learning_rate": 0.00016294285714285714, + "loss": 0.1435, + "step": 1600 + }, + { + "epoch": 9.17142857142857, + "grad_norm": 1.098961591720581, + "learning_rate": 0.0001625142857142857, + "loss": 0.1471, + "step": 1605 + }, + { + "epoch": 9.2, + "grad_norm": 0.6994885206222534, + "learning_rate": 0.00016208571428571425, + "loss": 0.1345, + "step": 1610 + }, + { + "epoch": 9.228571428571428, + "grad_norm": 0.9613476991653442, + "learning_rate": 0.00016165714285714284, + "loss": 0.1399, + "step": 1615 + }, + { + "epoch": 9.257142857142856, + "grad_norm": 0.675588846206665, + "learning_rate": 0.00016122857142857142, + "loss": 0.1319, + "step": 1620 + }, + { + "epoch": 9.285714285714286, + "grad_norm": 0.7519372701644897, + "learning_rate": 0.0001608, + "loss": 0.137, + "step": 1625 + }, + { + "epoch": 9.314285714285715, + "grad_norm": 1.135025978088379, + "learning_rate": 0.00016037142857142856, + "loss": 0.1322, + "step": 1630 + }, + { + "epoch": 9.342857142857143, + "grad_norm": 0.7462936639785767, + "learning_rate": 0.00015994285714285712, + "loss": 0.1215, + "step": 1635 + }, + { + "epoch": 9.371428571428572, + "grad_norm": 0.9042088985443115, + "learning_rate": 0.0001595142857142857, + "loss": 0.1191, + "step": 1640 + }, + { + "epoch": 9.4, + "grad_norm": 0.567828893661499, + "learning_rate": 0.00015908571428571426, + "loss": 0.1189, + "step": 1645 + }, + { + "epoch": 9.428571428571429, + "grad_norm": 0.981585681438446, + "learning_rate": 0.00015865714285714282, + "loss": 0.128, + "step": 1650 + }, + { + "epoch": 9.457142857142857, + "grad_norm": 1.24985933303833, + "learning_rate": 0.00015822857142857143, + "loss": 0.1315, + "step": 1655 + }, + { + "epoch": 9.485714285714286, + "grad_norm": 0.6517993211746216, + "learning_rate": 0.0001578, + "loss": 0.1076, + "step": 1660 + }, + { + "epoch": 9.514285714285714, + "grad_norm": 1.166628122329712, + "learning_rate": 0.00015737142857142857, + "loss": 0.1345, + "step": 1665 + }, + { + "epoch": 9.542857142857143, + "grad_norm": 0.9763592481613159, + "learning_rate": 0.00015694285714285713, + "loss": 0.1449, + "step": 1670 + }, + { + "epoch": 9.571428571428571, + "grad_norm": 0.7829060554504395, + "learning_rate": 0.00015651428571428569, + "loss": 0.1117, + "step": 1675 + }, + { + "epoch": 9.6, + "grad_norm": 0.6693719029426575, + "learning_rate": 0.00015608571428571427, + "loss": 0.1129, + "step": 1680 + }, + { + "epoch": 9.628571428571428, + "grad_norm": 1.2122846841812134, + "learning_rate": 0.00015565714285714285, + "loss": 0.1125, + "step": 1685 + }, + { + "epoch": 9.657142857142857, + "grad_norm": 1.0689371824264526, + "learning_rate": 0.0001552285714285714, + "loss": 0.1478, + "step": 1690 + }, + { + "epoch": 9.685714285714285, + "grad_norm": 1.8511656522750854, + "learning_rate": 0.0001548, + "loss": 0.1431, + "step": 1695 + }, + { + "epoch": 9.714285714285714, + "grad_norm": 0.6706506609916687, + "learning_rate": 0.00015437142857142855, + "loss": 0.1262, + "step": 1700 + }, + { + "epoch": 9.742857142857144, + "grad_norm": 1.0798784494400024, + "learning_rate": 0.00015394285714285714, + "loss": 0.1275, + "step": 1705 + }, + { + "epoch": 9.771428571428572, + "grad_norm": 0.7915983200073242, + "learning_rate": 0.0001535142857142857, + "loss": 0.1316, + "step": 1710 + }, + { + "epoch": 9.8, + "grad_norm": 1.8630567789077759, + "learning_rate": 0.00015308571428571425, + "loss": 0.1258, + "step": 1715 + }, + { + "epoch": 9.82857142857143, + "grad_norm": 0.7807756662368774, + "learning_rate": 0.00015265714285714286, + "loss": 0.1079, + "step": 1720 + }, + { + "epoch": 9.857142857142858, + "grad_norm": 1.4698439836502075, + "learning_rate": 0.00015222857142857142, + "loss": 0.1357, + "step": 1725 + }, + { + "epoch": 9.885714285714286, + "grad_norm": 1.2121926546096802, + "learning_rate": 0.00015179999999999998, + "loss": 0.1322, + "step": 1730 + }, + { + "epoch": 9.914285714285715, + "grad_norm": 0.6348568201065063, + "learning_rate": 0.00015137142857142856, + "loss": 0.0893, + "step": 1735 + }, + { + "epoch": 9.942857142857143, + "grad_norm": 0.6694422364234924, + "learning_rate": 0.00015094285714285712, + "loss": 0.1189, + "step": 1740 + }, + { + "epoch": 9.971428571428572, + "grad_norm": 0.569332480430603, + "learning_rate": 0.00015051428571428567, + "loss": 0.1349, + "step": 1745 + }, + { + "epoch": 10.0, + "grad_norm": 0.934073269367218, + "learning_rate": 0.00015008571428571429, + "loss": 0.1237, + "step": 1750 + }, + { + "epoch": 10.028571428571428, + "grad_norm": 0.7191672325134277, + "learning_rate": 0.00014965714285714284, + "loss": 0.1308, + "step": 1755 + }, + { + "epoch": 10.057142857142857, + "grad_norm": 0.7006493806838989, + "learning_rate": 0.00014922857142857143, + "loss": 0.104, + "step": 1760 + }, + { + "epoch": 10.085714285714285, + "grad_norm": 0.9030678272247314, + "learning_rate": 0.00014879999999999998, + "loss": 0.1308, + "step": 1765 + }, + { + "epoch": 10.114285714285714, + "grad_norm": 0.7007766366004944, + "learning_rate": 0.00014837142857142854, + "loss": 0.1044, + "step": 1770 + }, + { + "epoch": 10.142857142857142, + "grad_norm": 0.4832770824432373, + "learning_rate": 0.00014794285714285713, + "loss": 0.1119, + "step": 1775 + }, + { + "epoch": 10.17142857142857, + "grad_norm": 0.7819458842277527, + "learning_rate": 0.0001475142857142857, + "loss": 0.1087, + "step": 1780 + }, + { + "epoch": 10.2, + "grad_norm": 1.0223525762557983, + "learning_rate": 0.00014708571428571427, + "loss": 0.1314, + "step": 1785 + }, + { + "epoch": 10.228571428571428, + "grad_norm": 0.6224566698074341, + "learning_rate": 0.00014665714285714285, + "loss": 0.1159, + "step": 1790 + }, + { + "epoch": 10.257142857142856, + "grad_norm": 0.45800235867500305, + "learning_rate": 0.0001462285714285714, + "loss": 0.0942, + "step": 1795 + }, + { + "epoch": 10.285714285714286, + "grad_norm": 0.6258400082588196, + "learning_rate": 0.0001458, + "loss": 0.1079, + "step": 1800 + }, + { + "epoch": 10.314285714285715, + "grad_norm": 1.1812794208526611, + "learning_rate": 0.00014537142857142858, + "loss": 0.1378, + "step": 1805 + }, + { + "epoch": 10.342857142857143, + "grad_norm": 0.8541269898414612, + "learning_rate": 0.00014494285714285713, + "loss": 0.1274, + "step": 1810 + }, + { + "epoch": 10.371428571428572, + "grad_norm": 0.7131860256195068, + "learning_rate": 0.0001445142857142857, + "loss": 0.1247, + "step": 1815 + }, + { + "epoch": 10.4, + "grad_norm": 0.6109820008277893, + "learning_rate": 0.00014408571428571428, + "loss": 0.1246, + "step": 1820 + }, + { + "epoch": 10.428571428571429, + "grad_norm": 0.5621510744094849, + "learning_rate": 0.00014365714285714286, + "loss": 0.1039, + "step": 1825 + }, + { + "epoch": 10.457142857142857, + "grad_norm": 1.022777795791626, + "learning_rate": 0.00014322857142857142, + "loss": 0.1206, + "step": 1830 + }, + { + "epoch": 10.485714285714286, + "grad_norm": 0.9120668768882751, + "learning_rate": 0.00014279999999999997, + "loss": 0.1289, + "step": 1835 + }, + { + "epoch": 10.514285714285714, + "grad_norm": 1.1882030963897705, + "learning_rate": 0.00014237142857142856, + "loss": 0.1194, + "step": 1840 + }, + { + "epoch": 10.542857142857143, + "grad_norm": 0.6078401207923889, + "learning_rate": 0.00014194285714285714, + "loss": 0.1339, + "step": 1845 + }, + { + "epoch": 10.571428571428571, + "grad_norm": 0.7380999326705933, + "learning_rate": 0.0001415142857142857, + "loss": 0.1318, + "step": 1850 + }, + { + "epoch": 10.6, + "grad_norm": 0.5884959101676941, + "learning_rate": 0.00014108571428571428, + "loss": 0.1249, + "step": 1855 + }, + { + "epoch": 10.628571428571428, + "grad_norm": 1.0121936798095703, + "learning_rate": 0.00014065714285714284, + "loss": 0.1137, + "step": 1860 + }, + { + "epoch": 10.657142857142857, + "grad_norm": 0.6444916129112244, + "learning_rate": 0.00014022857142857143, + "loss": 0.1213, + "step": 1865 + }, + { + "epoch": 10.685714285714285, + "grad_norm": 0.7931004762649536, + "learning_rate": 0.00013979999999999998, + "loss": 0.1318, + "step": 1870 + }, + { + "epoch": 10.714285714285714, + "grad_norm": 0.5596404075622559, + "learning_rate": 0.00013937142857142857, + "loss": 0.1075, + "step": 1875 + }, + { + "epoch": 10.742857142857144, + "grad_norm": 0.6586474180221558, + "learning_rate": 0.00013894285714285712, + "loss": 0.13, + "step": 1880 + }, + { + "epoch": 10.771428571428572, + "grad_norm": 1.0195013284683228, + "learning_rate": 0.00013851428571428568, + "loss": 0.1373, + "step": 1885 + }, + { + "epoch": 10.8, + "grad_norm": 0.9233512878417969, + "learning_rate": 0.00013808571428571427, + "loss": 0.1168, + "step": 1890 + }, + { + "epoch": 10.82857142857143, + "grad_norm": 0.7154092788696289, + "learning_rate": 0.00013765714285714285, + "loss": 0.1081, + "step": 1895 + }, + { + "epoch": 10.857142857142858, + "grad_norm": 1.4588117599487305, + "learning_rate": 0.0001372285714285714, + "loss": 0.1061, + "step": 1900 + }, + { + "epoch": 10.885714285714286, + "grad_norm": 0.6087035536766052, + "learning_rate": 0.0001368, + "loss": 0.1157, + "step": 1905 + }, + { + "epoch": 10.914285714285715, + "grad_norm": 0.7371247410774231, + "learning_rate": 0.00013637142857142855, + "loss": 0.1339, + "step": 1910 + }, + { + "epoch": 10.942857142857143, + "grad_norm": 0.8253212571144104, + "learning_rate": 0.00013594285714285713, + "loss": 0.1198, + "step": 1915 + }, + { + "epoch": 10.971428571428572, + "grad_norm": 0.6889544129371643, + "learning_rate": 0.00013551428571428572, + "loss": 0.1131, + "step": 1920 + }, + { + "epoch": 11.0, + "grad_norm": 0.6408224105834961, + "learning_rate": 0.00013508571428571427, + "loss": 0.122, + "step": 1925 + }, + { + "epoch": 11.028571428571428, + "grad_norm": 0.6771185398101807, + "learning_rate": 0.00013465714285714283, + "loss": 0.1492, + "step": 1930 + }, + { + "epoch": 11.057142857142857, + "grad_norm": 0.8706450462341309, + "learning_rate": 0.00013422857142857142, + "loss": 0.1294, + "step": 1935 + }, + { + "epoch": 11.085714285714285, + "grad_norm": 1.730648398399353, + "learning_rate": 0.0001338, + "loss": 0.1004, + "step": 1940 + }, + { + "epoch": 11.114285714285714, + "grad_norm": 0.6985113620758057, + "learning_rate": 0.00013337142857142856, + "loss": 0.0995, + "step": 1945 + }, + { + "epoch": 11.142857142857142, + "grad_norm": 0.8901951313018799, + "learning_rate": 0.00013294285714285711, + "loss": 0.1179, + "step": 1950 + }, + { + "epoch": 11.17142857142857, + "grad_norm": 0.7232164144515991, + "learning_rate": 0.0001325142857142857, + "loss": 0.1397, + "step": 1955 + }, + { + "epoch": 11.2, + "grad_norm": 0.6447544693946838, + "learning_rate": 0.00013208571428571428, + "loss": 0.1366, + "step": 1960 + }, + { + "epoch": 11.228571428571428, + "grad_norm": 0.7964944243431091, + "learning_rate": 0.00013165714285714284, + "loss": 0.1121, + "step": 1965 + }, + { + "epoch": 11.257142857142856, + "grad_norm": 0.9012628793716431, + "learning_rate": 0.00013122857142857142, + "loss": 0.1131, + "step": 1970 + }, + { + "epoch": 11.285714285714286, + "grad_norm": 0.9295369982719421, + "learning_rate": 0.00013079999999999998, + "loss": 0.1232, + "step": 1975 + }, + { + "epoch": 11.314285714285715, + "grad_norm": 0.6237708926200867, + "learning_rate": 0.00013037142857142857, + "loss": 0.1066, + "step": 1980 + }, + { + "epoch": 11.342857142857143, + "grad_norm": 0.5250967741012573, + "learning_rate": 0.00012994285714285715, + "loss": 0.118, + "step": 1985 + }, + { + "epoch": 11.371428571428572, + "grad_norm": 1.0013964176177979, + "learning_rate": 0.0001295142857142857, + "loss": 0.1125, + "step": 1990 + }, + { + "epoch": 11.4, + "grad_norm": 0.6721311807632446, + "learning_rate": 0.00012908571428571426, + "loss": 0.1196, + "step": 1995 + }, + { + "epoch": 11.428571428571429, + "grad_norm": 0.6966421008110046, + "learning_rate": 0.00012865714285714285, + "loss": 0.1172, + "step": 2000 + }, + { + "epoch": 11.457142857142857, + "grad_norm": 0.8811460733413696, + "learning_rate": 0.00012822857142857143, + "loss": 0.135, + "step": 2005 + }, + { + "epoch": 11.485714285714286, + "grad_norm": 0.8829531073570251, + "learning_rate": 0.0001278, + "loss": 0.1288, + "step": 2010 + }, + { + "epoch": 11.514285714285714, + "grad_norm": 0.7530654668807983, + "learning_rate": 0.00012737142857142855, + "loss": 0.1073, + "step": 2015 + }, + { + "epoch": 11.542857142857143, + "grad_norm": 0.513940691947937, + "learning_rate": 0.00012694285714285713, + "loss": 0.121, + "step": 2020 + }, + { + "epoch": 11.571428571428571, + "grad_norm": 0.8574968576431274, + "learning_rate": 0.0001265142857142857, + "loss": 0.1103, + "step": 2025 + }, + { + "epoch": 11.6, + "grad_norm": 0.7482439875602722, + "learning_rate": 0.00012608571428571427, + "loss": 0.1027, + "step": 2030 + }, + { + "epoch": 11.628571428571428, + "grad_norm": 0.8367976546287537, + "learning_rate": 0.00012565714285714286, + "loss": 0.1181, + "step": 2035 + }, + { + "epoch": 11.657142857142857, + "grad_norm": 2.048128366470337, + "learning_rate": 0.0001252285714285714, + "loss": 0.1122, + "step": 2040 + }, + { + "epoch": 11.685714285714285, + "grad_norm": 0.7426862716674805, + "learning_rate": 0.00012479999999999997, + "loss": 0.1169, + "step": 2045 + }, + { + "epoch": 11.714285714285714, + "grad_norm": 3.093841791152954, + "learning_rate": 0.00012437142857142855, + "loss": 0.1164, + "step": 2050 + }, + { + "epoch": 11.742857142857144, + "grad_norm": 0.8172643184661865, + "learning_rate": 0.00012394285714285714, + "loss": 0.1354, + "step": 2055 + }, + { + "epoch": 11.771428571428572, + "grad_norm": 1.9950591325759888, + "learning_rate": 0.0001235142857142857, + "loss": 0.1037, + "step": 2060 + }, + { + "epoch": 11.8, + "grad_norm": 0.5929077863693237, + "learning_rate": 0.00012308571428571428, + "loss": 0.1194, + "step": 2065 + }, + { + "epoch": 11.82857142857143, + "grad_norm": 1.293624997138977, + "learning_rate": 0.00012265714285714284, + "loss": 0.12, + "step": 2070 + }, + { + "epoch": 11.857142857142858, + "grad_norm": 1.0515168905258179, + "learning_rate": 0.00012222857142857142, + "loss": 0.1049, + "step": 2075 + }, + { + "epoch": 11.885714285714286, + "grad_norm": 1.2874428033828735, + "learning_rate": 0.00012179999999999999, + "loss": 0.115, + "step": 2080 + }, + { + "epoch": 11.914285714285715, + "grad_norm": 0.7317278385162354, + "learning_rate": 0.00012137142857142856, + "loss": 0.1184, + "step": 2085 + }, + { + "epoch": 11.942857142857143, + "grad_norm": 1.3407148122787476, + "learning_rate": 0.00012094285714285713, + "loss": 0.132, + "step": 2090 + }, + { + "epoch": 11.971428571428572, + "grad_norm": 2.656409502029419, + "learning_rate": 0.00012051428571428569, + "loss": 0.1359, + "step": 2095 + }, + { + "epoch": 12.0, + "grad_norm": 0.7189064025878906, + "learning_rate": 0.00012008571428571428, + "loss": 0.1217, + "step": 2100 + }, + { + "epoch": 12.028571428571428, + "grad_norm": 0.7510334849357605, + "learning_rate": 0.00011965714285714285, + "loss": 0.109, + "step": 2105 + }, + { + "epoch": 12.057142857142857, + "grad_norm": 0.7235113382339478, + "learning_rate": 0.00011922857142857142, + "loss": 0.1114, + "step": 2110 + }, + { + "epoch": 12.085714285714285, + "grad_norm": 1.7435882091522217, + "learning_rate": 0.0001188, + "loss": 0.1357, + "step": 2115 + }, + { + "epoch": 12.114285714285714, + "grad_norm": 1.170392632484436, + "learning_rate": 0.00011837142857142856, + "loss": 0.1255, + "step": 2120 + }, + { + "epoch": 12.142857142857142, + "grad_norm": 0.6476783752441406, + "learning_rate": 0.00011794285714285713, + "loss": 0.1108, + "step": 2125 + }, + { + "epoch": 12.17142857142857, + "grad_norm": 0.8599929213523865, + "learning_rate": 0.00011751428571428571, + "loss": 0.0997, + "step": 2130 + }, + { + "epoch": 12.2, + "grad_norm": 0.8918687105178833, + "learning_rate": 0.00011708571428571428, + "loss": 0.1149, + "step": 2135 + }, + { + "epoch": 12.228571428571428, + "grad_norm": 1.609435796737671, + "learning_rate": 0.00011665714285714284, + "loss": 0.1136, + "step": 2140 + }, + { + "epoch": 12.257142857142856, + "grad_norm": 0.6206801533699036, + "learning_rate": 0.00011622857142857143, + "loss": 0.1135, + "step": 2145 + }, + { + "epoch": 12.285714285714286, + "grad_norm": 0.8769077658653259, + "learning_rate": 0.0001158, + "loss": 0.1344, + "step": 2150 + }, + { + "epoch": 12.314285714285715, + "grad_norm": 0.6279401183128357, + "learning_rate": 0.00011537142857142855, + "loss": 0.1049, + "step": 2155 + }, + { + "epoch": 12.342857142857143, + "grad_norm": 1.1110137701034546, + "learning_rate": 0.00011494285714285712, + "loss": 0.1146, + "step": 2160 + }, + { + "epoch": 12.371428571428572, + "grad_norm": 0.7911233901977539, + "learning_rate": 0.00011451428571428571, + "loss": 0.1257, + "step": 2165 + }, + { + "epoch": 12.4, + "grad_norm": 0.9691207408905029, + "learning_rate": 0.00011408571428571428, + "loss": 0.1226, + "step": 2170 + }, + { + "epoch": 12.428571428571429, + "grad_norm": 0.6168835759162903, + "learning_rate": 0.00011365714285714284, + "loss": 0.1271, + "step": 2175 + }, + { + "epoch": 12.457142857142857, + "grad_norm": 0.6143497228622437, + "learning_rate": 0.00011322857142857142, + "loss": 0.111, + "step": 2180 + }, + { + "epoch": 12.485714285714286, + "grad_norm": 1.5673450231552124, + "learning_rate": 0.00011279999999999999, + "loss": 0.1186, + "step": 2185 + }, + { + "epoch": 12.514285714285714, + "grad_norm": 1.298756718635559, + "learning_rate": 0.00011237142857142856, + "loss": 0.1024, + "step": 2190 + }, + { + "epoch": 12.542857142857143, + "grad_norm": 0.9484918117523193, + "learning_rate": 0.00011194285714285715, + "loss": 0.1171, + "step": 2195 + }, + { + "epoch": 12.571428571428571, + "grad_norm": 0.725705623626709, + "learning_rate": 0.0001115142857142857, + "loss": 0.1216, + "step": 2200 + }, + { + "epoch": 12.6, + "grad_norm": 1.1394798755645752, + "learning_rate": 0.00011108571428571427, + "loss": 0.1132, + "step": 2205 + }, + { + "epoch": 12.628571428571428, + "grad_norm": 0.9548712968826294, + "learning_rate": 0.00011065714285714286, + "loss": 0.1209, + "step": 2210 + }, + { + "epoch": 12.657142857142857, + "grad_norm": 0.6173953413963318, + "learning_rate": 0.00011022857142857143, + "loss": 0.1049, + "step": 2215 + }, + { + "epoch": 12.685714285714285, + "grad_norm": 0.8227205872535706, + "learning_rate": 0.00010979999999999999, + "loss": 0.1045, + "step": 2220 + }, + { + "epoch": 12.714285714285714, + "grad_norm": 0.7252780795097351, + "learning_rate": 0.00010937142857142856, + "loss": 0.1146, + "step": 2225 + }, + { + "epoch": 12.742857142857144, + "grad_norm": 0.9374399781227112, + "learning_rate": 0.00010894285714285714, + "loss": 0.1478, + "step": 2230 + }, + { + "epoch": 12.771428571428572, + "grad_norm": 5.1985368728637695, + "learning_rate": 0.0001085142857142857, + "loss": 0.1059, + "step": 2235 + }, + { + "epoch": 12.8, + "grad_norm": 0.9629620909690857, + "learning_rate": 0.00010808571428571427, + "loss": 0.124, + "step": 2240 + }, + { + "epoch": 12.82857142857143, + "grad_norm": 0.7022290229797363, + "learning_rate": 0.00010765714285714285, + "loss": 0.1309, + "step": 2245 + }, + { + "epoch": 12.857142857142858, + "grad_norm": 0.574188232421875, + "learning_rate": 0.00010722857142857142, + "loss": 0.086, + "step": 2250 + }, + { + "epoch": 12.885714285714286, + "grad_norm": 0.9712439179420471, + "learning_rate": 0.00010679999999999998, + "loss": 0.1152, + "step": 2255 + }, + { + "epoch": 12.914285714285715, + "grad_norm": 0.6562150120735168, + "learning_rate": 0.00010637142857142856, + "loss": 0.1343, + "step": 2260 + }, + { + "epoch": 12.942857142857143, + "grad_norm": 0.6936819553375244, + "learning_rate": 0.00010594285714285714, + "loss": 0.1009, + "step": 2265 + }, + { + "epoch": 12.971428571428572, + "grad_norm": 0.8664882779121399, + "learning_rate": 0.0001055142857142857, + "loss": 0.1164, + "step": 2270 + }, + { + "epoch": 13.0, + "grad_norm": 0.9224509000778198, + "learning_rate": 0.00010508571428571429, + "loss": 0.1347, + "step": 2275 + }, + { + "epoch": 13.028571428571428, + "grad_norm": 0.6596968770027161, + "learning_rate": 0.00010465714285714285, + "loss": 0.1041, + "step": 2280 + }, + { + "epoch": 13.057142857142857, + "grad_norm": 0.6456631422042847, + "learning_rate": 0.00010422857142857142, + "loss": 0.1142, + "step": 2285 + }, + { + "epoch": 13.085714285714285, + "grad_norm": 0.9466612339019775, + "learning_rate": 0.00010379999999999999, + "loss": 0.1191, + "step": 2290 + }, + { + "epoch": 13.114285714285714, + "grad_norm": 0.9036727547645569, + "learning_rate": 0.00010337142857142856, + "loss": 0.121, + "step": 2295 + }, + { + "epoch": 13.142857142857142, + "grad_norm": 1.08086359500885, + "learning_rate": 0.00010294285714285713, + "loss": 0.1313, + "step": 2300 + }, + { + "epoch": 13.17142857142857, + "grad_norm": 0.703241765499115, + "learning_rate": 0.0001025142857142857, + "loss": 0.1151, + "step": 2305 + }, + { + "epoch": 13.2, + "grad_norm": 0.7901896238327026, + "learning_rate": 0.00010208571428571429, + "loss": 0.1275, + "step": 2310 + }, + { + "epoch": 13.228571428571428, + "grad_norm": 0.703542947769165, + "learning_rate": 0.00010165714285714284, + "loss": 0.1, + "step": 2315 + }, + { + "epoch": 13.257142857142856, + "grad_norm": 0.6657671928405762, + "learning_rate": 0.00010122857142857141, + "loss": 0.1141, + "step": 2320 + }, + { + "epoch": 13.285714285714286, + "grad_norm": 0.7593729496002197, + "learning_rate": 0.0001008, + "loss": 0.1099, + "step": 2325 + }, + { + "epoch": 13.314285714285715, + "grad_norm": 0.6681057810783386, + "learning_rate": 0.00010037142857142857, + "loss": 0.112, + "step": 2330 + }, + { + "epoch": 13.342857142857143, + "grad_norm": 0.7155857682228088, + "learning_rate": 9.994285714285712e-05, + "loss": 0.0989, + "step": 2335 + }, + { + "epoch": 13.371428571428572, + "grad_norm": 0.9484553337097168, + "learning_rate": 9.951428571428571e-05, + "loss": 0.0902, + "step": 2340 + }, + { + "epoch": 13.4, + "grad_norm": 0.9317265152931213, + "learning_rate": 9.908571428571428e-05, + "loss": 0.1432, + "step": 2345 + }, + { + "epoch": 13.428571428571429, + "grad_norm": 1.039158821105957, + "learning_rate": 9.865714285714285e-05, + "loss": 0.114, + "step": 2350 + }, + { + "epoch": 13.457142857142857, + "grad_norm": 0.8524000644683838, + "learning_rate": 9.822857142857141e-05, + "loss": 0.1144, + "step": 2355 + }, + { + "epoch": 13.485714285714286, + "grad_norm": 0.6337461471557617, + "learning_rate": 9.779999999999999e-05, + "loss": 0.1073, + "step": 2360 + }, + { + "epoch": 13.514285714285714, + "grad_norm": 0.9097298383712769, + "learning_rate": 9.737142857142856e-05, + "loss": 0.1031, + "step": 2365 + }, + { + "epoch": 13.542857142857143, + "grad_norm": 1.2013412714004517, + "learning_rate": 9.694285714285713e-05, + "loss": 0.1174, + "step": 2370 + }, + { + "epoch": 13.571428571428571, + "grad_norm": 0.7055214643478394, + "learning_rate": 9.65142857142857e-05, + "loss": 0.1175, + "step": 2375 + }, + { + "epoch": 13.6, + "grad_norm": 0.807955265045166, + "learning_rate": 9.608571428571427e-05, + "loss": 0.1286, + "step": 2380 + }, + { + "epoch": 13.628571428571428, + "grad_norm": 0.6661797761917114, + "learning_rate": 9.565714285714285e-05, + "loss": 0.1091, + "step": 2385 + }, + { + "epoch": 13.657142857142857, + "grad_norm": 1.119604468345642, + "learning_rate": 9.522857142857143e-05, + "loss": 0.1393, + "step": 2390 + }, + { + "epoch": 13.685714285714285, + "grad_norm": 0.5365435481071472, + "learning_rate": 9.479999999999999e-05, + "loss": 0.1075, + "step": 2395 + }, + { + "epoch": 13.714285714285714, + "grad_norm": 0.9443924427032471, + "learning_rate": 9.437142857142856e-05, + "loss": 0.0977, + "step": 2400 + }, + { + "epoch": 13.742857142857144, + "grad_norm": 0.6075264811515808, + "learning_rate": 9.394285714285714e-05, + "loss": 0.1329, + "step": 2405 + }, + { + "epoch": 13.771428571428572, + "grad_norm": 1.019352912902832, + "learning_rate": 9.351428571428571e-05, + "loss": 0.1083, + "step": 2410 + }, + { + "epoch": 13.8, + "grad_norm": 0.7234058380126953, + "learning_rate": 9.308571428571427e-05, + "loss": 0.1118, + "step": 2415 + }, + { + "epoch": 13.82857142857143, + "grad_norm": 0.6786122918128967, + "learning_rate": 9.265714285714284e-05, + "loss": 0.1208, + "step": 2420 + }, + { + "epoch": 13.857142857142858, + "grad_norm": 0.5820732116699219, + "learning_rate": 9.222857142857142e-05, + "loss": 0.1022, + "step": 2425 + }, + { + "epoch": 13.885714285714286, + "grad_norm": 0.8007987141609192, + "learning_rate": 9.18e-05, + "loss": 0.1293, + "step": 2430 + }, + { + "epoch": 13.914285714285715, + "grad_norm": 0.6813766956329346, + "learning_rate": 9.137142857142855e-05, + "loss": 0.1284, + "step": 2435 + }, + { + "epoch": 13.942857142857143, + "grad_norm": 0.6460041403770447, + "learning_rate": 9.094285714285714e-05, + "loss": 0.1073, + "step": 2440 + }, + { + "epoch": 13.971428571428572, + "grad_norm": 0.5939205288887024, + "learning_rate": 9.051428571428571e-05, + "loss": 0.1185, + "step": 2445 + }, + { + "epoch": 14.0, + "grad_norm": 0.8150635361671448, + "learning_rate": 9.008571428571428e-05, + "loss": 0.1039, + "step": 2450 + }, + { + "epoch": 14.028571428571428, + "grad_norm": 1.3691389560699463, + "learning_rate": 8.965714285714285e-05, + "loss": 0.1112, + "step": 2455 + }, + { + "epoch": 14.057142857142857, + "grad_norm": 0.9042718410491943, + "learning_rate": 8.922857142857142e-05, + "loss": 0.112, + "step": 2460 + }, + { + "epoch": 14.085714285714285, + "grad_norm": 0.7222105860710144, + "learning_rate": 8.879999999999999e-05, + "loss": 0.1221, + "step": 2465 + }, + { + "epoch": 14.114285714285714, + "grad_norm": 0.595588207244873, + "learning_rate": 8.837142857142857e-05, + "loss": 0.1058, + "step": 2470 + }, + { + "epoch": 14.142857142857142, + "grad_norm": 0.5262706279754639, + "learning_rate": 8.794285714285713e-05, + "loss": 0.1071, + "step": 2475 + }, + { + "epoch": 14.17142857142857, + "grad_norm": 0.6511022448539734, + "learning_rate": 8.75142857142857e-05, + "loss": 0.0917, + "step": 2480 + }, + { + "epoch": 14.2, + "grad_norm": 0.5737650394439697, + "learning_rate": 8.708571428571427e-05, + "loss": 0.0988, + "step": 2485 + }, + { + "epoch": 14.228571428571428, + "grad_norm": 0.7679132223129272, + "learning_rate": 8.665714285714286e-05, + "loss": 0.1185, + "step": 2490 + }, + { + "epoch": 14.257142857142856, + "grad_norm": 0.641198456287384, + "learning_rate": 8.622857142857141e-05, + "loss": 0.0894, + "step": 2495 + }, + { + "epoch": 14.285714285714286, + "grad_norm": 0.7215464115142822, + "learning_rate": 8.579999999999998e-05, + "loss": 0.0935, + "step": 2500 + } + ], + "logging_steps": 5, + "max_steps": 3500, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 200, + "trial_name": null, + "trial_params": null +} diff --git a/glot-contrastive-final-lora/checkpoint-2500/training_args.bin b/glot-contrastive-final-lora/checkpoint-2500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..504636d7db81a728b5309e791498748e16c3b8ec --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-2500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02a87dc6b2c67ad3df98065b9e8fa21d9d93cd2cb361c532cb83c8a37bdc81a3 +size 5777 diff --git a/glot-contrastive-final-lora/checkpoint-3000/README.md b/glot-contrastive-final-lora/checkpoint-3000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9e142ce83a38d62751eee08226da0acc7c10eae5 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-3000/README.md @@ -0,0 +1,206 @@ +--- +base_model: ./glot-mlm-adapted +library_name: peft +tags: +- base_model:adapter:./glot-mlm-adapted +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/glot-contrastive-final-lora/checkpoint-3000/adapter_config.json b/glot-contrastive-final-lora/checkpoint-3000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..846b0d8ef2ad7bc4b0b04decd5769b6250d7be73 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-3000/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./glot-mlm-adapted", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query", + "value" + ], + "target_parameters": null, + "task_type": "FEATURE_EXTRACTION", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/glot-contrastive-final-lora/checkpoint-3000/adapter_model.safetensors b/glot-contrastive-final-lora/checkpoint-3000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..52df9731df7fb38e231addbaba67c93c7ac2b266 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-3000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8cbb012358d427813b69c11a43d2279370f570cd9c119787e1f92c372b0761a +size 2365824 diff --git a/glot-contrastive-final-lora/checkpoint-3000/optimizer.pt b/glot-contrastive-final-lora/checkpoint-3000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d606fc7d83ef97243d5320cccf780b1da4b091e2 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-3000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b176518aa7e320498ebc9cb02498e947ee4917f2a36df17791e539e71f009f6 +size 4760395 diff --git a/glot-contrastive-final-lora/checkpoint-3000/rng_state.pth b/glot-contrastive-final-lora/checkpoint-3000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..bbf137dfb2e3f06ac978673b34d3a0010f4d8691 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-3000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b715e8f56451136ff979d4ad11ab96c9bdf53a90baa37faf2d19ec4b1b33a518 +size 14645 diff --git a/glot-contrastive-final-lora/checkpoint-3000/scheduler.pt b/glot-contrastive-final-lora/checkpoint-3000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0dbe8bbe41639bd78f16d495261daf80fc931b7e --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-3000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfc2601af904a968b88642c5ada06b99cc5fd89af9787905603b8974532a00cc +size 1465 diff --git a/glot-contrastive-final-lora/checkpoint-3000/sentencepiece.bpe.model b/glot-contrastive-final-lora/checkpoint-3000/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..41409bcc76133d6e92fdd5909093d24e32662ba0 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-3000/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a313a26470baedaede322622492f2a542aa41527ddc5d40de444e945ad3c613 +size 7658320 diff --git a/glot-contrastive-final-lora/checkpoint-3000/special_tokens_map.json b/glot-contrastive-final-lora/checkpoint-3000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..68171d1ff68b731a33d119708476692c094a466b --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-3000/special_tokens_map.json @@ -0,0 +1,15 @@ +{ + "bos_token": "", + "cls_token": "", + "eos_token": "", + "mask_token": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "sep_token": "", + "unk_token": "" +} diff --git a/glot-contrastive-final-lora/checkpoint-3000/tokenizer_config.json b/glot-contrastive-final-lora/checkpoint-3000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2d4a02cfefad7bc6d993905187616acfc547f17d --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-3000/tokenizer_config.json @@ -0,0 +1,57 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "401144": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "cls_token": "", + "eos_token": "", + "extra_special_tokens": {}, + "mask_token": "", + "model_max_length": 512, + "pad_token": "", + "sep_token": "", + "sp_model_kwargs": {}, + "tokenizer_class": "XLMRobertaTokenizer", + "unk_token": "", + "use_fast": true +} diff --git a/glot-contrastive-final-lora/checkpoint-3000/trainer_state.json b/glot-contrastive-final-lora/checkpoint-3000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..90997ef2de60e79a78c3d3a847b63fab071b0f3f --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-3000/trainer_state.json @@ -0,0 +1,4234 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 17.142857142857142, + "eval_steps": 5, + "global_step": 3000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02857142857142857, + "grad_norm": 0.1407003551721573, + "learning_rate": 0.00029965714285714283, + "loss": 0.9726, + "step": 5 + }, + { + "epoch": 0.05714285714285714, + "grad_norm": 0.26689061522483826, + "learning_rate": 0.0002992285714285714, + "loss": 0.9633, + "step": 10 + }, + { + "epoch": 0.08571428571428572, + "grad_norm": 0.8670485615730286, + "learning_rate": 0.0002988, + "loss": 0.9013, + "step": 15 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 0.9785467386245728, + "learning_rate": 0.00029837142857142853, + "loss": 0.6942, + "step": 20 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 1.3083932399749756, + "learning_rate": 0.0002979428571428571, + "loss": 0.4472, + "step": 25 + }, + { + "epoch": 0.17142857142857143, + "grad_norm": 1.6103293895721436, + "learning_rate": 0.0002975142857142857, + "loss": 0.3782, + "step": 30 + }, + { + "epoch": 0.2, + "grad_norm": 2.6353416442871094, + "learning_rate": 0.0002970857142857143, + "loss": 0.3732, + "step": 35 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 0.9949072003364563, + "learning_rate": 0.0002966571428571428, + "loss": 0.3506, + "step": 40 + }, + { + "epoch": 0.2571428571428571, + "grad_norm": 1.280673861503601, + "learning_rate": 0.0002962285714285714, + "loss": 0.3346, + "step": 45 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.7681456208229065, + "learning_rate": 0.0002958, + "loss": 0.2832, + "step": 50 + }, + { + "epoch": 0.3142857142857143, + "grad_norm": 1.0000813007354736, + "learning_rate": 0.0002953714285714285, + "loss": 0.2603, + "step": 55 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 1.0222399234771729, + "learning_rate": 0.0002949428571428571, + "loss": 0.2507, + "step": 60 + }, + { + "epoch": 0.37142857142857144, + "grad_norm": 0.896902322769165, + "learning_rate": 0.0002945142857142857, + "loss": 0.2556, + "step": 65 + }, + { + "epoch": 0.4, + "grad_norm": 0.9035541415214539, + "learning_rate": 0.00029408571428571426, + "loss": 0.2402, + "step": 70 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 1.4886469841003418, + "learning_rate": 0.00029365714285714285, + "loss": 0.2376, + "step": 75 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.8951187133789062, + "learning_rate": 0.0002932285714285714, + "loss": 0.2276, + "step": 80 + }, + { + "epoch": 0.4857142857142857, + "grad_norm": 0.7876377105712891, + "learning_rate": 0.00029279999999999996, + "loss": 0.2537, + "step": 85 + }, + { + "epoch": 0.5142857142857142, + "grad_norm": 1.0927226543426514, + "learning_rate": 0.00029237142857142855, + "loss": 0.2152, + "step": 90 + }, + { + "epoch": 0.5428571428571428, + "grad_norm": 1.4946355819702148, + "learning_rate": 0.00029194285714285713, + "loss": 0.2441, + "step": 95 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.7082991600036621, + "learning_rate": 0.0002915142857142857, + "loss": 0.2708, + "step": 100 + }, + { + "epoch": 0.6, + "grad_norm": 0.670010507106781, + "learning_rate": 0.00029108571428571424, + "loss": 0.2396, + "step": 105 + }, + { + "epoch": 0.6285714285714286, + "grad_norm": 0.9797312021255493, + "learning_rate": 0.00029065714285714283, + "loss": 0.2275, + "step": 110 + }, + { + "epoch": 0.6571428571428571, + "grad_norm": 1.5220463275909424, + "learning_rate": 0.0002902285714285714, + "loss": 0.2114, + "step": 115 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 1.3326867818832397, + "learning_rate": 0.00028979999999999994, + "loss": 0.241, + "step": 120 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 1.1195529699325562, + "learning_rate": 0.0002893714285714285, + "loss": 0.2389, + "step": 125 + }, + { + "epoch": 0.7428571428571429, + "grad_norm": 0.7551061511039734, + "learning_rate": 0.0002889428571428571, + "loss": 0.2162, + "step": 130 + }, + { + "epoch": 0.7714285714285715, + "grad_norm": 1.018908977508545, + "learning_rate": 0.0002885142857142857, + "loss": 0.1924, + "step": 135 + }, + { + "epoch": 0.8, + "grad_norm": 2.123642921447754, + "learning_rate": 0.0002880857142857143, + "loss": 0.2174, + "step": 140 + }, + { + "epoch": 0.8285714285714286, + "grad_norm": 0.7585068941116333, + "learning_rate": 0.0002876571428571428, + "loss": 0.2006, + "step": 145 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 1.64150869846344, + "learning_rate": 0.0002872285714285714, + "loss": 0.1905, + "step": 150 + }, + { + "epoch": 0.8857142857142857, + "grad_norm": 0.9126951694488525, + "learning_rate": 0.0002868, + "loss": 0.2312, + "step": 155 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 0.7278801202774048, + "learning_rate": 0.00028637142857142856, + "loss": 0.2077, + "step": 160 + }, + { + "epoch": 0.9428571428571428, + "grad_norm": 0.8931339383125305, + "learning_rate": 0.00028594285714285715, + "loss": 0.1951, + "step": 165 + }, + { + "epoch": 0.9714285714285714, + "grad_norm": 1.0831843614578247, + "learning_rate": 0.0002855142857142857, + "loss": 0.2103, + "step": 170 + }, + { + "epoch": 1.0, + "grad_norm": 1.3750063180923462, + "learning_rate": 0.00028508571428571426, + "loss": 0.2396, + "step": 175 + }, + { + "epoch": 1.0285714285714285, + "grad_norm": 0.8338337540626526, + "learning_rate": 0.00028465714285714285, + "loss": 0.2404, + "step": 180 + }, + { + "epoch": 1.0571428571428572, + "grad_norm": 1.2879024744033813, + "learning_rate": 0.0002842285714285714, + "loss": 0.2117, + "step": 185 + }, + { + "epoch": 1.0857142857142856, + "grad_norm": 1.6751821041107178, + "learning_rate": 0.00028379999999999996, + "loss": 0.1796, + "step": 190 + }, + { + "epoch": 1.1142857142857143, + "grad_norm": 0.9864417910575867, + "learning_rate": 0.00028337142857142854, + "loss": 0.1993, + "step": 195 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 1.0174155235290527, + "learning_rate": 0.00028294285714285713, + "loss": 0.2068, + "step": 200 + }, + { + "epoch": 1.1714285714285715, + "grad_norm": 1.029832124710083, + "learning_rate": 0.0002825142857142857, + "loss": 0.2015, + "step": 205 + }, + { + "epoch": 1.2, + "grad_norm": 0.7745446562767029, + "learning_rate": 0.00028208571428571424, + "loss": 0.2129, + "step": 210 + }, + { + "epoch": 1.2285714285714286, + "grad_norm": 2.5578622817993164, + "learning_rate": 0.0002816571428571428, + "loss": 0.2224, + "step": 215 + }, + { + "epoch": 1.2571428571428571, + "grad_norm": 2.4185051918029785, + "learning_rate": 0.0002812285714285714, + "loss": 0.2276, + "step": 220 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 1.4176461696624756, + "learning_rate": 0.0002808, + "loss": 0.1781, + "step": 225 + }, + { + "epoch": 1.3142857142857143, + "grad_norm": 0.709326982498169, + "learning_rate": 0.0002803714285714286, + "loss": 0.2177, + "step": 230 + }, + { + "epoch": 1.342857142857143, + "grad_norm": 0.8170766830444336, + "learning_rate": 0.0002799428571428571, + "loss": 0.1769, + "step": 235 + }, + { + "epoch": 1.3714285714285714, + "grad_norm": 1.3850761651992798, + "learning_rate": 0.0002795142857142857, + "loss": 0.2262, + "step": 240 + }, + { + "epoch": 1.4, + "grad_norm": 1.0064373016357422, + "learning_rate": 0.0002790857142857143, + "loss": 0.196, + "step": 245 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 1.9635728597640991, + "learning_rate": 0.0002786571428571428, + "loss": 0.2029, + "step": 250 + }, + { + "epoch": 1.457142857142857, + "grad_norm": 16.20791244506836, + "learning_rate": 0.0002782285714285714, + "loss": 0.3925, + "step": 255 + }, + { + "epoch": 1.4857142857142858, + "grad_norm": 1.4363322257995605, + "learning_rate": 0.0002778, + "loss": 0.3684, + "step": 260 + }, + { + "epoch": 1.5142857142857142, + "grad_norm": 0.9379534721374512, + "learning_rate": 0.00027737142857142856, + "loss": 0.2265, + "step": 265 + }, + { + "epoch": 1.5428571428571427, + "grad_norm": 0.8453512787818909, + "learning_rate": 0.00027694285714285714, + "loss": 0.1976, + "step": 270 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 2.316664695739746, + "learning_rate": 0.0002765142857142857, + "loss": 0.23, + "step": 275 + }, + { + "epoch": 1.6, + "grad_norm": 1.0548444986343384, + "learning_rate": 0.00027608571428571426, + "loss": 0.1823, + "step": 280 + }, + { + "epoch": 1.6285714285714286, + "grad_norm": 3.7894928455352783, + "learning_rate": 0.00027565714285714284, + "loss": 0.1962, + "step": 285 + }, + { + "epoch": 1.657142857142857, + "grad_norm": 2.3081610202789307, + "learning_rate": 0.00027522857142857143, + "loss": 0.2087, + "step": 290 + }, + { + "epoch": 1.6857142857142857, + "grad_norm": 0.9311438202857971, + "learning_rate": 0.0002748, + "loss": 0.1597, + "step": 295 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 1.1881247758865356, + "learning_rate": 0.00027437142857142854, + "loss": 0.1764, + "step": 300 + }, + { + "epoch": 1.7428571428571429, + "grad_norm": 1.30265212059021, + "learning_rate": 0.0002739428571428571, + "loss": 0.1647, + "step": 305 + }, + { + "epoch": 1.7714285714285714, + "grad_norm": 0.6832175850868225, + "learning_rate": 0.0002735142857142857, + "loss": 0.1638, + "step": 310 + }, + { + "epoch": 1.8, + "grad_norm": 1.8740538358688354, + "learning_rate": 0.00027308571428571424, + "loss": 0.1803, + "step": 315 + }, + { + "epoch": 1.8285714285714287, + "grad_norm": 9.821504592895508, + "learning_rate": 0.0002726571428571428, + "loss": 0.226, + "step": 320 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 1.0889750719070435, + "learning_rate": 0.0002722285714285714, + "loss": 0.1822, + "step": 325 + }, + { + "epoch": 1.8857142857142857, + "grad_norm": 0.9660868048667908, + "learning_rate": 0.0002718, + "loss": 0.1842, + "step": 330 + }, + { + "epoch": 1.9142857142857141, + "grad_norm": 0.6329234838485718, + "learning_rate": 0.0002713714285714286, + "loss": 0.1488, + "step": 335 + }, + { + "epoch": 1.9428571428571428, + "grad_norm": 3.601266384124756, + "learning_rate": 0.0002709428571428571, + "loss": 0.1887, + "step": 340 + }, + { + "epoch": 1.9714285714285715, + "grad_norm": 1.1441439390182495, + "learning_rate": 0.0002705142857142857, + "loss": 0.184, + "step": 345 + }, + { + "epoch": 2.0, + "grad_norm": 0.8586034774780273, + "learning_rate": 0.0002700857142857143, + "loss": 0.1578, + "step": 350 + }, + { + "epoch": 2.0285714285714285, + "grad_norm": 1.5113487243652344, + "learning_rate": 0.00026965714285714286, + "loss": 0.2002, + "step": 355 + }, + { + "epoch": 2.057142857142857, + "grad_norm": 1.1123011112213135, + "learning_rate": 0.0002692285714285714, + "loss": 0.1946, + "step": 360 + }, + { + "epoch": 2.085714285714286, + "grad_norm": 0.9377036094665527, + "learning_rate": 0.0002688, + "loss": 0.1971, + "step": 365 + }, + { + "epoch": 2.1142857142857143, + "grad_norm": 0.6956892609596252, + "learning_rate": 0.00026837142857142856, + "loss": 0.1758, + "step": 370 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.7510782480239868, + "learning_rate": 0.0002679428571428571, + "loss": 0.1674, + "step": 375 + }, + { + "epoch": 2.1714285714285713, + "grad_norm": 0.7009285092353821, + "learning_rate": 0.00026751428571428567, + "loss": 0.1945, + "step": 380 + }, + { + "epoch": 2.2, + "grad_norm": 0.9555609822273254, + "learning_rate": 0.00026708571428571426, + "loss": 0.1857, + "step": 385 + }, + { + "epoch": 2.2285714285714286, + "grad_norm": 2.133979082107544, + "learning_rate": 0.00026665714285714284, + "loss": 0.1636, + "step": 390 + }, + { + "epoch": 2.257142857142857, + "grad_norm": 0.7105309963226318, + "learning_rate": 0.0002662285714285714, + "loss": 0.2014, + "step": 395 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.7329701781272888, + "learning_rate": 0.00026579999999999996, + "loss": 0.1884, + "step": 400 + }, + { + "epoch": 2.314285714285714, + "grad_norm": 1.0426994562149048, + "learning_rate": 0.00026537142857142854, + "loss": 0.1558, + "step": 405 + }, + { + "epoch": 2.342857142857143, + "grad_norm": 0.9306122660636902, + "learning_rate": 0.0002649428571428571, + "loss": 0.1774, + "step": 410 + }, + { + "epoch": 2.3714285714285714, + "grad_norm": 0.6989394426345825, + "learning_rate": 0.00026451428571428565, + "loss": 0.1601, + "step": 415 + }, + { + "epoch": 2.4, + "grad_norm": 1.4383760690689087, + "learning_rate": 0.0002640857142857143, + "loss": 0.1564, + "step": 420 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 0.6448336839675903, + "learning_rate": 0.0002636571428571428, + "loss": 0.1827, + "step": 425 + }, + { + "epoch": 2.4571428571428573, + "grad_norm": 0.9535760879516602, + "learning_rate": 0.0002632285714285714, + "loss": 0.1713, + "step": 430 + }, + { + "epoch": 2.4857142857142858, + "grad_norm": 1.034945011138916, + "learning_rate": 0.0002628, + "loss": 0.1457, + "step": 435 + }, + { + "epoch": 2.5142857142857142, + "grad_norm": 1.3225128650665283, + "learning_rate": 0.0002623714285714285, + "loss": 0.1633, + "step": 440 + }, + { + "epoch": 2.5428571428571427, + "grad_norm": 0.8285059928894043, + "learning_rate": 0.0002619428571428571, + "loss": 0.2004, + "step": 445 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.773176908493042, + "learning_rate": 0.0002615142857142857, + "loss": 0.1641, + "step": 450 + }, + { + "epoch": 2.6, + "grad_norm": 0.7964853048324585, + "learning_rate": 0.0002610857142857143, + "loss": 0.1608, + "step": 455 + }, + { + "epoch": 2.6285714285714286, + "grad_norm": 1.0967328548431396, + "learning_rate": 0.00026065714285714286, + "loss": 0.1697, + "step": 460 + }, + { + "epoch": 2.657142857142857, + "grad_norm": 0.6462066173553467, + "learning_rate": 0.0002602285714285714, + "loss": 0.1512, + "step": 465 + }, + { + "epoch": 2.685714285714286, + "grad_norm": 0.8765937089920044, + "learning_rate": 0.00025979999999999997, + "loss": 0.1826, + "step": 470 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 1.2524124383926392, + "learning_rate": 0.00025937142857142856, + "loss": 0.1731, + "step": 475 + }, + { + "epoch": 2.742857142857143, + "grad_norm": 2.2982606887817383, + "learning_rate": 0.0002589428571428571, + "loss": 0.1852, + "step": 480 + }, + { + "epoch": 2.7714285714285714, + "grad_norm": 0.9989053010940552, + "learning_rate": 0.0002585142857142857, + "loss": 0.1791, + "step": 485 + }, + { + "epoch": 2.8, + "grad_norm": 0.772343635559082, + "learning_rate": 0.00025808571428571426, + "loss": 0.1862, + "step": 490 + }, + { + "epoch": 2.8285714285714287, + "grad_norm": 1.2101136445999146, + "learning_rate": 0.00025765714285714284, + "loss": 0.1806, + "step": 495 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.8010189533233643, + "learning_rate": 0.0002572285714285714, + "loss": 0.1842, + "step": 500 + }, + { + "epoch": 2.8857142857142857, + "grad_norm": 1.3597544431686401, + "learning_rate": 0.00025679999999999995, + "loss": 0.1583, + "step": 505 + }, + { + "epoch": 2.914285714285714, + "grad_norm": 0.8790671825408936, + "learning_rate": 0.00025637142857142854, + "loss": 0.1565, + "step": 510 + }, + { + "epoch": 2.942857142857143, + "grad_norm": 1.1175066232681274, + "learning_rate": 0.0002559428571428571, + "loss": 0.1406, + "step": 515 + }, + { + "epoch": 2.9714285714285715, + "grad_norm": 2.8528785705566406, + "learning_rate": 0.0002555142857142857, + "loss": 0.1735, + "step": 520 + }, + { + "epoch": 3.0, + "grad_norm": 2.2073328495025635, + "learning_rate": 0.0002550857142857143, + "loss": 0.1816, + "step": 525 + }, + { + "epoch": 3.0285714285714285, + "grad_norm": 11.01322078704834, + "learning_rate": 0.0002546571428571428, + "loss": 0.1873, + "step": 530 + }, + { + "epoch": 3.057142857142857, + "grad_norm": 1.5822402238845825, + "learning_rate": 0.0002542285714285714, + "loss": 0.168, + "step": 535 + }, + { + "epoch": 3.085714285714286, + "grad_norm": 1.3086942434310913, + "learning_rate": 0.0002538, + "loss": 0.149, + "step": 540 + }, + { + "epoch": 3.1142857142857143, + "grad_norm": 6.303041458129883, + "learning_rate": 0.0002533714285714285, + "loss": 0.1651, + "step": 545 + }, + { + "epoch": 3.142857142857143, + "grad_norm": 14.48929500579834, + "learning_rate": 0.00025294285714285716, + "loss": 0.1687, + "step": 550 + }, + { + "epoch": 3.1714285714285713, + "grad_norm": 6.824525356292725, + "learning_rate": 0.0002525142857142857, + "loss": 0.1919, + "step": 555 + }, + { + "epoch": 3.2, + "grad_norm": 18.772563934326172, + "learning_rate": 0.00025208571428571427, + "loss": 0.2075, + "step": 560 + }, + { + "epoch": 3.2285714285714286, + "grad_norm": 0.7268752455711365, + "learning_rate": 0.00025165714285714286, + "loss": 0.174, + "step": 565 + }, + { + "epoch": 3.257142857142857, + "grad_norm": 1.1301453113555908, + "learning_rate": 0.0002512285714285714, + "loss": 0.1668, + "step": 570 + }, + { + "epoch": 3.2857142857142856, + "grad_norm": 2.846802234649658, + "learning_rate": 0.00025079999999999997, + "loss": 0.1645, + "step": 575 + }, + { + "epoch": 3.314285714285714, + "grad_norm": 1.417515754699707, + "learning_rate": 0.00025037142857142855, + "loss": 0.1719, + "step": 580 + }, + { + "epoch": 3.342857142857143, + "grad_norm": 4.137150764465332, + "learning_rate": 0.00024994285714285714, + "loss": 0.1739, + "step": 585 + }, + { + "epoch": 3.3714285714285714, + "grad_norm": 2.6067259311676025, + "learning_rate": 0.0002495142857142857, + "loss": 0.1489, + "step": 590 + }, + { + "epoch": 3.4, + "grad_norm": 2.601024627685547, + "learning_rate": 0.00024908571428571425, + "loss": 0.1618, + "step": 595 + }, + { + "epoch": 3.4285714285714284, + "grad_norm": 3.849017858505249, + "learning_rate": 0.00024865714285714284, + "loss": 0.1899, + "step": 600 + }, + { + "epoch": 3.4571428571428573, + "grad_norm": 4.673766136169434, + "learning_rate": 0.0002482285714285714, + "loss": 0.1761, + "step": 605 + }, + { + "epoch": 3.4857142857142858, + "grad_norm": 2.6057631969451904, + "learning_rate": 0.00024779999999999995, + "loss": 0.1743, + "step": 610 + }, + { + "epoch": 3.5142857142857142, + "grad_norm": 2.932652473449707, + "learning_rate": 0.0002473714285714286, + "loss": 0.1482, + "step": 615 + }, + { + "epoch": 3.5428571428571427, + "grad_norm": 0.8764939308166504, + "learning_rate": 0.0002469428571428571, + "loss": 0.1644, + "step": 620 + }, + { + "epoch": 3.571428571428571, + "grad_norm": 1.3203191757202148, + "learning_rate": 0.0002465142857142857, + "loss": 0.1654, + "step": 625 + }, + { + "epoch": 3.6, + "grad_norm": 0.7977635264396667, + "learning_rate": 0.0002460857142857143, + "loss": 0.1472, + "step": 630 + }, + { + "epoch": 3.6285714285714286, + "grad_norm": 1.4750248193740845, + "learning_rate": 0.0002456571428571428, + "loss": 0.1735, + "step": 635 + }, + { + "epoch": 3.657142857142857, + "grad_norm": 1.8164482116699219, + "learning_rate": 0.0002452285714285714, + "loss": 0.1593, + "step": 640 + }, + { + "epoch": 3.685714285714286, + "grad_norm": 1.4829603433609009, + "learning_rate": 0.0002448, + "loss": 0.1508, + "step": 645 + }, + { + "epoch": 3.7142857142857144, + "grad_norm": 0.8828144669532776, + "learning_rate": 0.00024437142857142857, + "loss": 0.1573, + "step": 650 + }, + { + "epoch": 3.742857142857143, + "grad_norm": 2.039384126663208, + "learning_rate": 0.00024394285714285713, + "loss": 0.1745, + "step": 655 + }, + { + "epoch": 3.7714285714285714, + "grad_norm": 0.9604200720787048, + "learning_rate": 0.00024351428571428569, + "loss": 0.17, + "step": 660 + }, + { + "epoch": 3.8, + "grad_norm": 0.7903971076011658, + "learning_rate": 0.00024308571428571427, + "loss": 0.1654, + "step": 665 + }, + { + "epoch": 3.8285714285714287, + "grad_norm": 0.6935649514198303, + "learning_rate": 0.00024265714285714283, + "loss": 0.1714, + "step": 670 + }, + { + "epoch": 3.857142857142857, + "grad_norm": 0.5832012295722961, + "learning_rate": 0.00024222857142857138, + "loss": 0.1636, + "step": 675 + }, + { + "epoch": 3.8857142857142857, + "grad_norm": 0.6303168535232544, + "learning_rate": 0.0002418, + "loss": 0.1604, + "step": 680 + }, + { + "epoch": 3.914285714285714, + "grad_norm": 0.7210885882377625, + "learning_rate": 0.00024137142857142855, + "loss": 0.1444, + "step": 685 + }, + { + "epoch": 3.942857142857143, + "grad_norm": 0.7690990567207336, + "learning_rate": 0.00024094285714285714, + "loss": 0.1631, + "step": 690 + }, + { + "epoch": 3.9714285714285715, + "grad_norm": 1.0142720937728882, + "learning_rate": 0.0002405142857142857, + "loss": 0.158, + "step": 695 + }, + { + "epoch": 4.0, + "grad_norm": 0.7970322966575623, + "learning_rate": 0.00024008571428571425, + "loss": 0.1803, + "step": 700 + }, + { + "epoch": 4.0285714285714285, + "grad_norm": 0.6795914769172668, + "learning_rate": 0.00023965714285714284, + "loss": 0.143, + "step": 705 + }, + { + "epoch": 4.057142857142857, + "grad_norm": 0.6832629442214966, + "learning_rate": 0.0002392285714285714, + "loss": 0.1457, + "step": 710 + }, + { + "epoch": 4.085714285714285, + "grad_norm": 3.8629798889160156, + "learning_rate": 0.0002388, + "loss": 0.1671, + "step": 715 + }, + { + "epoch": 4.114285714285714, + "grad_norm": 1.1167882680892944, + "learning_rate": 0.00023837142857142856, + "loss": 0.1544, + "step": 720 + }, + { + "epoch": 4.142857142857143, + "grad_norm": 0.9431412816047668, + "learning_rate": 0.00023794285714285712, + "loss": 0.1605, + "step": 725 + }, + { + "epoch": 4.171428571428572, + "grad_norm": 1.310948133468628, + "learning_rate": 0.0002375142857142857, + "loss": 0.1121, + "step": 730 + }, + { + "epoch": 4.2, + "grad_norm": 0.9830737709999084, + "learning_rate": 0.00023708571428571426, + "loss": 0.1742, + "step": 735 + }, + { + "epoch": 4.228571428571429, + "grad_norm": 0.6166555881500244, + "learning_rate": 0.00023665714285714282, + "loss": 0.1525, + "step": 740 + }, + { + "epoch": 4.257142857142857, + "grad_norm": 0.995579719543457, + "learning_rate": 0.00023622857142857143, + "loss": 0.1439, + "step": 745 + }, + { + "epoch": 4.285714285714286, + "grad_norm": 0.639796793460846, + "learning_rate": 0.00023579999999999999, + "loss": 0.1692, + "step": 750 + }, + { + "epoch": 4.314285714285714, + "grad_norm": 0.9438050389289856, + "learning_rate": 0.00023537142857142854, + "loss": 0.1785, + "step": 755 + }, + { + "epoch": 4.3428571428571425, + "grad_norm": 0.8960750102996826, + "learning_rate": 0.00023494285714285713, + "loss": 0.1557, + "step": 760 + }, + { + "epoch": 4.371428571428572, + "grad_norm": 0.6287499070167542, + "learning_rate": 0.00023451428571428568, + "loss": 0.1459, + "step": 765 + }, + { + "epoch": 4.4, + "grad_norm": 0.7638295888900757, + "learning_rate": 0.00023408571428571424, + "loss": 0.1341, + "step": 770 + }, + { + "epoch": 4.428571428571429, + "grad_norm": 0.655878484249115, + "learning_rate": 0.00023365714285714283, + "loss": 0.1358, + "step": 775 + }, + { + "epoch": 4.457142857142857, + "grad_norm": 0.5840997695922852, + "learning_rate": 0.0002332285714285714, + "loss": 0.1386, + "step": 780 + }, + { + "epoch": 4.485714285714286, + "grad_norm": 1.1082488298416138, + "learning_rate": 0.0002328, + "loss": 0.1827, + "step": 785 + }, + { + "epoch": 4.514285714285714, + "grad_norm": 0.8825240135192871, + "learning_rate": 0.00023237142857142855, + "loss": 0.1527, + "step": 790 + }, + { + "epoch": 4.542857142857143, + "grad_norm": 0.6752304434776306, + "learning_rate": 0.0002319428571428571, + "loss": 0.1392, + "step": 795 + }, + { + "epoch": 4.571428571428571, + "grad_norm": 1.1423301696777344, + "learning_rate": 0.0002315142857142857, + "loss": 0.1433, + "step": 800 + }, + { + "epoch": 4.6, + "grad_norm": 10.793691635131836, + "learning_rate": 0.00023108571428571425, + "loss": 0.1635, + "step": 805 + }, + { + "epoch": 4.628571428571428, + "grad_norm": 0.47564294934272766, + "learning_rate": 0.00023065714285714286, + "loss": 0.1199, + "step": 810 + }, + { + "epoch": 4.6571428571428575, + "grad_norm": 1.2492656707763672, + "learning_rate": 0.00023022857142857142, + "loss": 0.1488, + "step": 815 + }, + { + "epoch": 4.685714285714286, + "grad_norm": 0.6933501958847046, + "learning_rate": 0.00022979999999999997, + "loss": 0.1812, + "step": 820 + }, + { + "epoch": 4.714285714285714, + "grad_norm": 0.7901633977890015, + "learning_rate": 0.00022937142857142856, + "loss": 0.1415, + "step": 825 + }, + { + "epoch": 4.742857142857143, + "grad_norm": 0.7854829430580139, + "learning_rate": 0.00022894285714285712, + "loss": 0.1401, + "step": 830 + }, + { + "epoch": 4.771428571428571, + "grad_norm": 0.8716740608215332, + "learning_rate": 0.00022851428571428567, + "loss": 0.1982, + "step": 835 + }, + { + "epoch": 4.8, + "grad_norm": 0.7047899961471558, + "learning_rate": 0.00022808571428571426, + "loss": 0.1624, + "step": 840 + }, + { + "epoch": 4.828571428571428, + "grad_norm": 0.7134959697723389, + "learning_rate": 0.00022765714285714284, + "loss": 0.1375, + "step": 845 + }, + { + "epoch": 4.857142857142857, + "grad_norm": 1.0897325277328491, + "learning_rate": 0.00022722857142857143, + "loss": 0.1489, + "step": 850 + }, + { + "epoch": 4.885714285714286, + "grad_norm": 1.1065207719802856, + "learning_rate": 0.00022679999999999998, + "loss": 0.1495, + "step": 855 + }, + { + "epoch": 4.914285714285715, + "grad_norm": 0.7434757351875305, + "learning_rate": 0.00022637142857142854, + "loss": 0.1507, + "step": 860 + }, + { + "epoch": 4.942857142857143, + "grad_norm": 1.0045181512832642, + "learning_rate": 0.00022594285714285712, + "loss": 0.1527, + "step": 865 + }, + { + "epoch": 4.9714285714285715, + "grad_norm": 1.2025654315948486, + "learning_rate": 0.00022551428571428568, + "loss": 0.1523, + "step": 870 + }, + { + "epoch": 5.0, + "grad_norm": 0.7823342084884644, + "learning_rate": 0.0002250857142857143, + "loss": 0.1514, + "step": 875 + }, + { + "epoch": 5.0285714285714285, + "grad_norm": 0.8405362963676453, + "learning_rate": 0.00022465714285714285, + "loss": 0.1461, + "step": 880 + }, + { + "epoch": 5.057142857142857, + "grad_norm": 0.7527463436126709, + "learning_rate": 0.0002242285714285714, + "loss": 0.1206, + "step": 885 + }, + { + "epoch": 5.085714285714285, + "grad_norm": 0.8372548222541809, + "learning_rate": 0.0002238, + "loss": 0.1513, + "step": 890 + }, + { + "epoch": 5.114285714285714, + "grad_norm": 0.8755456209182739, + "learning_rate": 0.00022337142857142855, + "loss": 0.1498, + "step": 895 + }, + { + "epoch": 5.142857142857143, + "grad_norm": 0.7312084436416626, + "learning_rate": 0.0002229428571428571, + "loss": 0.154, + "step": 900 + }, + { + "epoch": 5.171428571428572, + "grad_norm": 0.6366221904754639, + "learning_rate": 0.0002225142857142857, + "loss": 0.1466, + "step": 905 + }, + { + "epoch": 5.2, + "grad_norm": 0.6406880617141724, + "learning_rate": 0.00022208571428571427, + "loss": 0.1254, + "step": 910 + }, + { + "epoch": 5.228571428571429, + "grad_norm": 2.4106833934783936, + "learning_rate": 0.00022165714285714283, + "loss": 0.1534, + "step": 915 + }, + { + "epoch": 5.257142857142857, + "grad_norm": 0.5635722279548645, + "learning_rate": 0.00022122857142857142, + "loss": 0.1461, + "step": 920 + }, + { + "epoch": 5.285714285714286, + "grad_norm": 0.787162184715271, + "learning_rate": 0.00022079999999999997, + "loss": 0.1424, + "step": 925 + }, + { + "epoch": 5.314285714285714, + "grad_norm": 0.6513975262641907, + "learning_rate": 0.00022037142857142853, + "loss": 0.1326, + "step": 930 + }, + { + "epoch": 5.3428571428571425, + "grad_norm": 0.6933534741401672, + "learning_rate": 0.00021994285714285711, + "loss": 0.1661, + "step": 935 + }, + { + "epoch": 5.371428571428572, + "grad_norm": 0.7263259887695312, + "learning_rate": 0.0002195142857142857, + "loss": 0.15, + "step": 940 + }, + { + "epoch": 5.4, + "grad_norm": 0.5537381768226624, + "learning_rate": 0.00021908571428571428, + "loss": 0.129, + "step": 945 + }, + { + "epoch": 5.428571428571429, + "grad_norm": 0.6014005541801453, + "learning_rate": 0.00021865714285714284, + "loss": 0.1321, + "step": 950 + }, + { + "epoch": 5.457142857142857, + "grad_norm": 0.6581441760063171, + "learning_rate": 0.0002182285714285714, + "loss": 0.1587, + "step": 955 + }, + { + "epoch": 5.485714285714286, + "grad_norm": 0.9326379895210266, + "learning_rate": 0.00021779999999999998, + "loss": 0.1654, + "step": 960 + }, + { + "epoch": 5.514285714285714, + "grad_norm": 0.9438592791557312, + "learning_rate": 0.00021737142857142854, + "loss": 0.1212, + "step": 965 + }, + { + "epoch": 5.542857142857143, + "grad_norm": 0.7699571251869202, + "learning_rate": 0.00021694285714285715, + "loss": 0.1464, + "step": 970 + }, + { + "epoch": 5.571428571428571, + "grad_norm": 0.8758366703987122, + "learning_rate": 0.0002165142857142857, + "loss": 0.1599, + "step": 975 + }, + { + "epoch": 5.6, + "grad_norm": 0.6101442575454712, + "learning_rate": 0.00021608571428571426, + "loss": 0.1589, + "step": 980 + }, + { + "epoch": 5.628571428571428, + "grad_norm": 0.7454060912132263, + "learning_rate": 0.00021565714285714285, + "loss": 0.1433, + "step": 985 + }, + { + "epoch": 5.6571428571428575, + "grad_norm": 0.6379484534263611, + "learning_rate": 0.0002152285714285714, + "loss": 0.1592, + "step": 990 + }, + { + "epoch": 5.685714285714286, + "grad_norm": 1.1601309776306152, + "learning_rate": 0.00021479999999999996, + "loss": 0.1647, + "step": 995 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 0.5464673638343811, + "learning_rate": 0.00021437142857142855, + "loss": 0.1469, + "step": 1000 + }, + { + "epoch": 5.742857142857143, + "grad_norm": 1.0279319286346436, + "learning_rate": 0.00021394285714285713, + "loss": 0.1203, + "step": 1005 + }, + { + "epoch": 5.771428571428571, + "grad_norm": 0.5503718256950378, + "learning_rate": 0.00021351428571428572, + "loss": 0.1409, + "step": 1010 + }, + { + "epoch": 5.8, + "grad_norm": 0.6123886108398438, + "learning_rate": 0.00021308571428571427, + "loss": 0.1427, + "step": 1015 + }, + { + "epoch": 5.828571428571428, + "grad_norm": 0.6560390591621399, + "learning_rate": 0.00021265714285714283, + "loss": 0.1415, + "step": 1020 + }, + { + "epoch": 5.857142857142857, + "grad_norm": 0.5576716661453247, + "learning_rate": 0.00021222857142857141, + "loss": 0.1408, + "step": 1025 + }, + { + "epoch": 5.885714285714286, + "grad_norm": 0.6419074535369873, + "learning_rate": 0.00021179999999999997, + "loss": 0.1385, + "step": 1030 + }, + { + "epoch": 5.914285714285715, + "grad_norm": 1.008925199508667, + "learning_rate": 0.00021137142857142858, + "loss": 0.1497, + "step": 1035 + }, + { + "epoch": 5.942857142857143, + "grad_norm": 0.6559906005859375, + "learning_rate": 0.00021094285714285714, + "loss": 0.1218, + "step": 1040 + }, + { + "epoch": 5.9714285714285715, + "grad_norm": 0.627164363861084, + "learning_rate": 0.0002105142857142857, + "loss": 0.1368, + "step": 1045 + }, + { + "epoch": 6.0, + "grad_norm": 0.5760972499847412, + "learning_rate": 0.00021008571428571428, + "loss": 0.1508, + "step": 1050 + }, + { + "epoch": 6.0285714285714285, + "grad_norm": 0.5754174590110779, + "learning_rate": 0.00020965714285714284, + "loss": 0.1181, + "step": 1055 + }, + { + "epoch": 6.057142857142857, + "grad_norm": 0.8736348748207092, + "learning_rate": 0.0002092285714285714, + "loss": 0.1252, + "step": 1060 + }, + { + "epoch": 6.085714285714285, + "grad_norm": 0.7166719436645508, + "learning_rate": 0.00020879999999999998, + "loss": 0.1481, + "step": 1065 + }, + { + "epoch": 6.114285714285714, + "grad_norm": 0.6494349241256714, + "learning_rate": 0.00020837142857142856, + "loss": 0.1478, + "step": 1070 + }, + { + "epoch": 6.142857142857143, + "grad_norm": 0.6681587100028992, + "learning_rate": 0.00020794285714285712, + "loss": 0.1488, + "step": 1075 + }, + { + "epoch": 6.171428571428572, + "grad_norm": 0.7123684883117676, + "learning_rate": 0.0002075142857142857, + "loss": 0.1378, + "step": 1080 + }, + { + "epoch": 6.2, + "grad_norm": 0.6146950721740723, + "learning_rate": 0.00020708571428571426, + "loss": 0.1306, + "step": 1085 + }, + { + "epoch": 6.228571428571429, + "grad_norm": 0.8402445912361145, + "learning_rate": 0.00020665714285714282, + "loss": 0.1063, + "step": 1090 + }, + { + "epoch": 6.257142857142857, + "grad_norm": 0.6567764282226562, + "learning_rate": 0.0002062285714285714, + "loss": 0.1195, + "step": 1095 + }, + { + "epoch": 6.285714285714286, + "grad_norm": 0.6006014943122864, + "learning_rate": 0.0002058, + "loss": 0.1542, + "step": 1100 + }, + { + "epoch": 6.314285714285714, + "grad_norm": 0.793100893497467, + "learning_rate": 0.00020537142857142857, + "loss": 0.1381, + "step": 1105 + }, + { + "epoch": 6.3428571428571425, + "grad_norm": 0.5923666954040527, + "learning_rate": 0.00020494285714285713, + "loss": 0.1386, + "step": 1110 + }, + { + "epoch": 6.371428571428572, + "grad_norm": 0.6692521572113037, + "learning_rate": 0.0002045142857142857, + "loss": 0.1223, + "step": 1115 + }, + { + "epoch": 6.4, + "grad_norm": 0.7216306328773499, + "learning_rate": 0.00020408571428571427, + "loss": 0.1367, + "step": 1120 + }, + { + "epoch": 6.428571428571429, + "grad_norm": 0.5640934109687805, + "learning_rate": 0.00020365714285714283, + "loss": 0.1554, + "step": 1125 + }, + { + "epoch": 6.457142857142857, + "grad_norm": 0.8154368996620178, + "learning_rate": 0.00020322857142857138, + "loss": 0.1674, + "step": 1130 + }, + { + "epoch": 6.485714285714286, + "grad_norm": 0.7185398936271667, + "learning_rate": 0.0002028, + "loss": 0.1375, + "step": 1135 + }, + { + "epoch": 6.514285714285714, + "grad_norm": 0.6805170774459839, + "learning_rate": 0.00020237142857142855, + "loss": 0.1306, + "step": 1140 + }, + { + "epoch": 6.542857142857143, + "grad_norm": 0.5996941924095154, + "learning_rate": 0.00020194285714285714, + "loss": 0.1433, + "step": 1145 + }, + { + "epoch": 6.571428571428571, + "grad_norm": 0.5258373022079468, + "learning_rate": 0.0002015142857142857, + "loss": 0.1285, + "step": 1150 + }, + { + "epoch": 6.6, + "grad_norm": 0.7771695256233215, + "learning_rate": 0.00020108571428571425, + "loss": 0.1493, + "step": 1155 + }, + { + "epoch": 6.628571428571428, + "grad_norm": 0.5920616388320923, + "learning_rate": 0.00020065714285714284, + "loss": 0.1479, + "step": 1160 + }, + { + "epoch": 6.6571428571428575, + "grad_norm": 0.7460982799530029, + "learning_rate": 0.00020022857142857142, + "loss": 0.1173, + "step": 1165 + }, + { + "epoch": 6.685714285714286, + "grad_norm": 1.1703822612762451, + "learning_rate": 0.0001998, + "loss": 0.1402, + "step": 1170 + }, + { + "epoch": 6.714285714285714, + "grad_norm": 0.7894724011421204, + "learning_rate": 0.00019937142857142856, + "loss": 0.1253, + "step": 1175 + }, + { + "epoch": 6.742857142857143, + "grad_norm": 0.7013376355171204, + "learning_rate": 0.00019894285714285712, + "loss": 0.1573, + "step": 1180 + }, + { + "epoch": 6.771428571428571, + "grad_norm": 0.6421737670898438, + "learning_rate": 0.0001985142857142857, + "loss": 0.1497, + "step": 1185 + }, + { + "epoch": 6.8, + "grad_norm": 1.204296350479126, + "learning_rate": 0.00019808571428571426, + "loss": 0.1634, + "step": 1190 + }, + { + "epoch": 6.828571428571428, + "grad_norm": 0.867765486240387, + "learning_rate": 0.00019765714285714282, + "loss": 0.1353, + "step": 1195 + }, + { + "epoch": 6.857142857142857, + "grad_norm": 0.7325594425201416, + "learning_rate": 0.00019722857142857143, + "loss": 0.118, + "step": 1200 + }, + { + "epoch": 6.885714285714286, + "grad_norm": 0.7029078006744385, + "learning_rate": 0.00019679999999999999, + "loss": 0.1425, + "step": 1205 + }, + { + "epoch": 6.914285714285715, + "grad_norm": 1.1572504043579102, + "learning_rate": 0.00019637142857142857, + "loss": 0.1337, + "step": 1210 + }, + { + "epoch": 6.942857142857143, + "grad_norm": 0.8022822141647339, + "learning_rate": 0.00019594285714285713, + "loss": 0.1684, + "step": 1215 + }, + { + "epoch": 6.9714285714285715, + "grad_norm": 0.6729874610900879, + "learning_rate": 0.00019551428571428568, + "loss": 0.1238, + "step": 1220 + }, + { + "epoch": 7.0, + "grad_norm": 0.5773627758026123, + "learning_rate": 0.00019508571428571427, + "loss": 0.138, + "step": 1225 + }, + { + "epoch": 7.0285714285714285, + "grad_norm": 0.7182291150093079, + "learning_rate": 0.00019465714285714285, + "loss": 0.1431, + "step": 1230 + }, + { + "epoch": 7.057142857142857, + "grad_norm": 1.7567912340164185, + "learning_rate": 0.0001942285714285714, + "loss": 0.1319, + "step": 1235 + }, + { + "epoch": 7.085714285714285, + "grad_norm": 0.6845232248306274, + "learning_rate": 0.0001938, + "loss": 0.1292, + "step": 1240 + }, + { + "epoch": 7.114285714285714, + "grad_norm": 0.6077771782875061, + "learning_rate": 0.00019337142857142855, + "loss": 0.1238, + "step": 1245 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 0.6168347597122192, + "learning_rate": 0.0001929428571428571, + "loss": 0.1384, + "step": 1250 + }, + { + "epoch": 7.171428571428572, + "grad_norm": 0.7457576394081116, + "learning_rate": 0.0001925142857142857, + "loss": 0.1306, + "step": 1255 + }, + { + "epoch": 7.2, + "grad_norm": 0.5969316363334656, + "learning_rate": 0.00019208571428571425, + "loss": 0.1123, + "step": 1260 + }, + { + "epoch": 7.228571428571429, + "grad_norm": 0.6902753710746765, + "learning_rate": 0.00019165714285714286, + "loss": 0.1185, + "step": 1265 + }, + { + "epoch": 7.257142857142857, + "grad_norm": 0.6488338112831116, + "learning_rate": 0.00019122857142857142, + "loss": 0.1431, + "step": 1270 + }, + { + "epoch": 7.285714285714286, + "grad_norm": 0.6814819574356079, + "learning_rate": 0.00019079999999999998, + "loss": 0.1495, + "step": 1275 + }, + { + "epoch": 7.314285714285714, + "grad_norm": 0.7468088865280151, + "learning_rate": 0.00019037142857142856, + "loss": 0.1158, + "step": 1280 + }, + { + "epoch": 7.3428571428571425, + "grad_norm": 0.7417412400245667, + "learning_rate": 0.00018994285714285712, + "loss": 0.1311, + "step": 1285 + }, + { + "epoch": 7.371428571428572, + "grad_norm": 0.5480664372444153, + "learning_rate": 0.00018951428571428567, + "loss": 0.135, + "step": 1290 + }, + { + "epoch": 7.4, + "grad_norm": 0.725527822971344, + "learning_rate": 0.00018908571428571429, + "loss": 0.1217, + "step": 1295 + }, + { + "epoch": 7.428571428571429, + "grad_norm": 0.6566678285598755, + "learning_rate": 0.00018865714285714284, + "loss": 0.1417, + "step": 1300 + }, + { + "epoch": 7.457142857142857, + "grad_norm": 0.516952395439148, + "learning_rate": 0.00018822857142857143, + "loss": 0.1329, + "step": 1305 + }, + { + "epoch": 7.485714285714286, + "grad_norm": 1.9545241594314575, + "learning_rate": 0.00018779999999999998, + "loss": 0.1339, + "step": 1310 + }, + { + "epoch": 7.514285714285714, + "grad_norm": 0.8276839852333069, + "learning_rate": 0.00018737142857142854, + "loss": 0.1324, + "step": 1315 + }, + { + "epoch": 7.542857142857143, + "grad_norm": 0.6737099289894104, + "learning_rate": 0.00018694285714285713, + "loss": 0.1139, + "step": 1320 + }, + { + "epoch": 7.571428571428571, + "grad_norm": 0.6914472579956055, + "learning_rate": 0.00018651428571428568, + "loss": 0.1146, + "step": 1325 + }, + { + "epoch": 7.6, + "grad_norm": 0.6630033850669861, + "learning_rate": 0.0001860857142857143, + "loss": 0.1571, + "step": 1330 + }, + { + "epoch": 7.628571428571428, + "grad_norm": 0.820688784122467, + "learning_rate": 0.00018565714285714285, + "loss": 0.15, + "step": 1335 + }, + { + "epoch": 7.6571428571428575, + "grad_norm": 2.0491325855255127, + "learning_rate": 0.0001852285714285714, + "loss": 0.127, + "step": 1340 + }, + { + "epoch": 7.685714285714286, + "grad_norm": 0.9327268004417419, + "learning_rate": 0.0001848, + "loss": 0.1289, + "step": 1345 + }, + { + "epoch": 7.714285714285714, + "grad_norm": 1.3131701946258545, + "learning_rate": 0.00018437142857142855, + "loss": 0.1228, + "step": 1350 + }, + { + "epoch": 7.742857142857143, + "grad_norm": 2.955918312072754, + "learning_rate": 0.0001839428571428571, + "loss": 0.1082, + "step": 1355 + }, + { + "epoch": 7.771428571428571, + "grad_norm": 1.2165493965148926, + "learning_rate": 0.00018351428571428572, + "loss": 0.1688, + "step": 1360 + }, + { + "epoch": 7.8, + "grad_norm": 0.759324312210083, + "learning_rate": 0.00018308571428571428, + "loss": 0.1185, + "step": 1365 + }, + { + "epoch": 7.828571428571428, + "grad_norm": 0.7445591688156128, + "learning_rate": 0.00018265714285714286, + "loss": 0.1431, + "step": 1370 + }, + { + "epoch": 7.857142857142857, + "grad_norm": 0.679374098777771, + "learning_rate": 0.00018222857142857142, + "loss": 0.1451, + "step": 1375 + }, + { + "epoch": 7.885714285714286, + "grad_norm": 2.1234302520751953, + "learning_rate": 0.00018179999999999997, + "loss": 0.1265, + "step": 1380 + }, + { + "epoch": 7.914285714285715, + "grad_norm": 1.006521224975586, + "learning_rate": 0.00018137142857142856, + "loss": 0.1722, + "step": 1385 + }, + { + "epoch": 7.942857142857143, + "grad_norm": 0.7275253534317017, + "learning_rate": 0.00018094285714285712, + "loss": 0.1625, + "step": 1390 + }, + { + "epoch": 7.9714285714285715, + "grad_norm": 0.8612022995948792, + "learning_rate": 0.0001805142857142857, + "loss": 0.1345, + "step": 1395 + }, + { + "epoch": 8.0, + "grad_norm": 0.7276798486709595, + "learning_rate": 0.00018008571428571428, + "loss": 0.1236, + "step": 1400 + }, + { + "epoch": 8.028571428571428, + "grad_norm": 0.8731086850166321, + "learning_rate": 0.00017965714285714284, + "loss": 0.1604, + "step": 1405 + }, + { + "epoch": 8.057142857142857, + "grad_norm": 0.8950818777084351, + "learning_rate": 0.0001792285714285714, + "loss": 0.1531, + "step": 1410 + }, + { + "epoch": 8.085714285714285, + "grad_norm": 0.7399356365203857, + "learning_rate": 0.00017879999999999998, + "loss": 0.1508, + "step": 1415 + }, + { + "epoch": 8.114285714285714, + "grad_norm": 1.3727307319641113, + "learning_rate": 0.00017837142857142854, + "loss": 0.1487, + "step": 1420 + }, + { + "epoch": 8.142857142857142, + "grad_norm": 0.5938125848770142, + "learning_rate": 0.00017794285714285715, + "loss": 0.1303, + "step": 1425 + }, + { + "epoch": 8.17142857142857, + "grad_norm": 0.7043821811676025, + "learning_rate": 0.0001775142857142857, + "loss": 0.0948, + "step": 1430 + }, + { + "epoch": 8.2, + "grad_norm": 1.1062767505645752, + "learning_rate": 0.00017708571428571426, + "loss": 0.1412, + "step": 1435 + }, + { + "epoch": 8.228571428571428, + "grad_norm": 0.844832181930542, + "learning_rate": 0.00017665714285714285, + "loss": 0.113, + "step": 1440 + }, + { + "epoch": 8.257142857142856, + "grad_norm": 0.7564154863357544, + "learning_rate": 0.0001762285714285714, + "loss": 0.1319, + "step": 1445 + }, + { + "epoch": 8.285714285714286, + "grad_norm": 0.8843110203742981, + "learning_rate": 0.00017579999999999996, + "loss": 0.1206, + "step": 1450 + }, + { + "epoch": 8.314285714285715, + "grad_norm": 0.8175828456878662, + "learning_rate": 0.00017537142857142855, + "loss": 0.1327, + "step": 1455 + }, + { + "epoch": 8.342857142857143, + "grad_norm": 0.6443565487861633, + "learning_rate": 0.00017494285714285713, + "loss": 0.1239, + "step": 1460 + }, + { + "epoch": 8.371428571428572, + "grad_norm": 0.7237185835838318, + "learning_rate": 0.00017451428571428572, + "loss": 0.1639, + "step": 1465 + }, + { + "epoch": 8.4, + "grad_norm": 0.6118057370185852, + "learning_rate": 0.00017408571428571427, + "loss": 0.1363, + "step": 1470 + }, + { + "epoch": 8.428571428571429, + "grad_norm": 0.6754649877548218, + "learning_rate": 0.00017365714285714283, + "loss": 0.1187, + "step": 1475 + }, + { + "epoch": 8.457142857142857, + "grad_norm": 1.0067390203475952, + "learning_rate": 0.00017322857142857141, + "loss": 0.1401, + "step": 1480 + }, + { + "epoch": 8.485714285714286, + "grad_norm": 8.509544372558594, + "learning_rate": 0.00017279999999999997, + "loss": 0.1304, + "step": 1485 + }, + { + "epoch": 8.514285714285714, + "grad_norm": 4.2030205726623535, + "learning_rate": 0.00017237142857142858, + "loss": 0.121, + "step": 1490 + }, + { + "epoch": 8.542857142857143, + "grad_norm": 4.877438068389893, + "learning_rate": 0.00017194285714285714, + "loss": 0.1918, + "step": 1495 + }, + { + "epoch": 8.571428571428571, + "grad_norm": 6.4971232414245605, + "learning_rate": 0.0001715142857142857, + "loss": 0.2154, + "step": 1500 + }, + { + "epoch": 8.6, + "grad_norm": 4.365469932556152, + "learning_rate": 0.00017108571428571428, + "loss": 0.2272, + "step": 1505 + }, + { + "epoch": 8.628571428571428, + "grad_norm": 2.551957845687866, + "learning_rate": 0.00017065714285714284, + "loss": 0.2163, + "step": 1510 + }, + { + "epoch": 8.657142857142857, + "grad_norm": 5.326391220092773, + "learning_rate": 0.0001702285714285714, + "loss": 0.1612, + "step": 1515 + }, + { + "epoch": 8.685714285714285, + "grad_norm": 1.3528404235839844, + "learning_rate": 0.00016979999999999998, + "loss": 0.1636, + "step": 1520 + }, + { + "epoch": 8.714285714285714, + "grad_norm": 1.4466065168380737, + "learning_rate": 0.00016937142857142856, + "loss": 0.1295, + "step": 1525 + }, + { + "epoch": 8.742857142857144, + "grad_norm": 0.6576040387153625, + "learning_rate": 0.00016894285714285715, + "loss": 0.1318, + "step": 1530 + }, + { + "epoch": 8.771428571428572, + "grad_norm": 1.286942958831787, + "learning_rate": 0.0001685142857142857, + "loss": 0.1443, + "step": 1535 + }, + { + "epoch": 8.8, + "grad_norm": 9.474458694458008, + "learning_rate": 0.00016808571428571426, + "loss": 0.1313, + "step": 1540 + }, + { + "epoch": 8.82857142857143, + "grad_norm": 2.6731069087982178, + "learning_rate": 0.00016765714285714285, + "loss": 0.1485, + "step": 1545 + }, + { + "epoch": 8.857142857142858, + "grad_norm": 1.313723087310791, + "learning_rate": 0.0001672285714285714, + "loss": 0.1346, + "step": 1550 + }, + { + "epoch": 8.885714285714286, + "grad_norm": 1.7115576267242432, + "learning_rate": 0.0001668, + "loss": 0.1471, + "step": 1555 + }, + { + "epoch": 8.914285714285715, + "grad_norm": 1.2599923610687256, + "learning_rate": 0.00016637142857142857, + "loss": 0.1433, + "step": 1560 + }, + { + "epoch": 8.942857142857143, + "grad_norm": 0.9659029245376587, + "learning_rate": 0.00016594285714285713, + "loss": 0.1256, + "step": 1565 + }, + { + "epoch": 8.971428571428572, + "grad_norm": 1.1282744407653809, + "learning_rate": 0.0001655142857142857, + "loss": 0.1373, + "step": 1570 + }, + { + "epoch": 9.0, + "grad_norm": 3.20717453956604, + "learning_rate": 0.00016508571428571427, + "loss": 0.1355, + "step": 1575 + }, + { + "epoch": 9.028571428571428, + "grad_norm": 0.8310821056365967, + "learning_rate": 0.00016465714285714283, + "loss": 0.1268, + "step": 1580 + }, + { + "epoch": 9.057142857142857, + "grad_norm": 1.5337790250778198, + "learning_rate": 0.00016422857142857139, + "loss": 0.1267, + "step": 1585 + }, + { + "epoch": 9.085714285714285, + "grad_norm": 2.6406068801879883, + "learning_rate": 0.0001638, + "loss": 0.1363, + "step": 1590 + }, + { + "epoch": 9.114285714285714, + "grad_norm": 0.7705873847007751, + "learning_rate": 0.00016337142857142855, + "loss": 0.1291, + "step": 1595 + }, + { + "epoch": 9.142857142857142, + "grad_norm": 0.7092650532722473, + "learning_rate": 0.00016294285714285714, + "loss": 0.1435, + "step": 1600 + }, + { + "epoch": 9.17142857142857, + "grad_norm": 1.098961591720581, + "learning_rate": 0.0001625142857142857, + "loss": 0.1471, + "step": 1605 + }, + { + "epoch": 9.2, + "grad_norm": 0.6994885206222534, + "learning_rate": 0.00016208571428571425, + "loss": 0.1345, + "step": 1610 + }, + { + "epoch": 9.228571428571428, + "grad_norm": 0.9613476991653442, + "learning_rate": 0.00016165714285714284, + "loss": 0.1399, + "step": 1615 + }, + { + "epoch": 9.257142857142856, + "grad_norm": 0.675588846206665, + "learning_rate": 0.00016122857142857142, + "loss": 0.1319, + "step": 1620 + }, + { + "epoch": 9.285714285714286, + "grad_norm": 0.7519372701644897, + "learning_rate": 0.0001608, + "loss": 0.137, + "step": 1625 + }, + { + "epoch": 9.314285714285715, + "grad_norm": 1.135025978088379, + "learning_rate": 0.00016037142857142856, + "loss": 0.1322, + "step": 1630 + }, + { + "epoch": 9.342857142857143, + "grad_norm": 0.7462936639785767, + "learning_rate": 0.00015994285714285712, + "loss": 0.1215, + "step": 1635 + }, + { + "epoch": 9.371428571428572, + "grad_norm": 0.9042088985443115, + "learning_rate": 0.0001595142857142857, + "loss": 0.1191, + "step": 1640 + }, + { + "epoch": 9.4, + "grad_norm": 0.567828893661499, + "learning_rate": 0.00015908571428571426, + "loss": 0.1189, + "step": 1645 + }, + { + "epoch": 9.428571428571429, + "grad_norm": 0.981585681438446, + "learning_rate": 0.00015865714285714282, + "loss": 0.128, + "step": 1650 + }, + { + "epoch": 9.457142857142857, + "grad_norm": 1.24985933303833, + "learning_rate": 0.00015822857142857143, + "loss": 0.1315, + "step": 1655 + }, + { + "epoch": 9.485714285714286, + "grad_norm": 0.6517993211746216, + "learning_rate": 0.0001578, + "loss": 0.1076, + "step": 1660 + }, + { + "epoch": 9.514285714285714, + "grad_norm": 1.166628122329712, + "learning_rate": 0.00015737142857142857, + "loss": 0.1345, + "step": 1665 + }, + { + "epoch": 9.542857142857143, + "grad_norm": 0.9763592481613159, + "learning_rate": 0.00015694285714285713, + "loss": 0.1449, + "step": 1670 + }, + { + "epoch": 9.571428571428571, + "grad_norm": 0.7829060554504395, + "learning_rate": 0.00015651428571428569, + "loss": 0.1117, + "step": 1675 + }, + { + "epoch": 9.6, + "grad_norm": 0.6693719029426575, + "learning_rate": 0.00015608571428571427, + "loss": 0.1129, + "step": 1680 + }, + { + "epoch": 9.628571428571428, + "grad_norm": 1.2122846841812134, + "learning_rate": 0.00015565714285714285, + "loss": 0.1125, + "step": 1685 + }, + { + "epoch": 9.657142857142857, + "grad_norm": 1.0689371824264526, + "learning_rate": 0.0001552285714285714, + "loss": 0.1478, + "step": 1690 + }, + { + "epoch": 9.685714285714285, + "grad_norm": 1.8511656522750854, + "learning_rate": 0.0001548, + "loss": 0.1431, + "step": 1695 + }, + { + "epoch": 9.714285714285714, + "grad_norm": 0.6706506609916687, + "learning_rate": 0.00015437142857142855, + "loss": 0.1262, + "step": 1700 + }, + { + "epoch": 9.742857142857144, + "grad_norm": 1.0798784494400024, + "learning_rate": 0.00015394285714285714, + "loss": 0.1275, + "step": 1705 + }, + { + "epoch": 9.771428571428572, + "grad_norm": 0.7915983200073242, + "learning_rate": 0.0001535142857142857, + "loss": 0.1316, + "step": 1710 + }, + { + "epoch": 9.8, + "grad_norm": 1.8630567789077759, + "learning_rate": 0.00015308571428571425, + "loss": 0.1258, + "step": 1715 + }, + { + "epoch": 9.82857142857143, + "grad_norm": 0.7807756662368774, + "learning_rate": 0.00015265714285714286, + "loss": 0.1079, + "step": 1720 + }, + { + "epoch": 9.857142857142858, + "grad_norm": 1.4698439836502075, + "learning_rate": 0.00015222857142857142, + "loss": 0.1357, + "step": 1725 + }, + { + "epoch": 9.885714285714286, + "grad_norm": 1.2121926546096802, + "learning_rate": 0.00015179999999999998, + "loss": 0.1322, + "step": 1730 + }, + { + "epoch": 9.914285714285715, + "grad_norm": 0.6348568201065063, + "learning_rate": 0.00015137142857142856, + "loss": 0.0893, + "step": 1735 + }, + { + "epoch": 9.942857142857143, + "grad_norm": 0.6694422364234924, + "learning_rate": 0.00015094285714285712, + "loss": 0.1189, + "step": 1740 + }, + { + "epoch": 9.971428571428572, + "grad_norm": 0.569332480430603, + "learning_rate": 0.00015051428571428567, + "loss": 0.1349, + "step": 1745 + }, + { + "epoch": 10.0, + "grad_norm": 0.934073269367218, + "learning_rate": 0.00015008571428571429, + "loss": 0.1237, + "step": 1750 + }, + { + "epoch": 10.028571428571428, + "grad_norm": 0.7191672325134277, + "learning_rate": 0.00014965714285714284, + "loss": 0.1308, + "step": 1755 + }, + { + "epoch": 10.057142857142857, + "grad_norm": 0.7006493806838989, + "learning_rate": 0.00014922857142857143, + "loss": 0.104, + "step": 1760 + }, + { + "epoch": 10.085714285714285, + "grad_norm": 0.9030678272247314, + "learning_rate": 0.00014879999999999998, + "loss": 0.1308, + "step": 1765 + }, + { + "epoch": 10.114285714285714, + "grad_norm": 0.7007766366004944, + "learning_rate": 0.00014837142857142854, + "loss": 0.1044, + "step": 1770 + }, + { + "epoch": 10.142857142857142, + "grad_norm": 0.4832770824432373, + "learning_rate": 0.00014794285714285713, + "loss": 0.1119, + "step": 1775 + }, + { + "epoch": 10.17142857142857, + "grad_norm": 0.7819458842277527, + "learning_rate": 0.0001475142857142857, + "loss": 0.1087, + "step": 1780 + }, + { + "epoch": 10.2, + "grad_norm": 1.0223525762557983, + "learning_rate": 0.00014708571428571427, + "loss": 0.1314, + "step": 1785 + }, + { + "epoch": 10.228571428571428, + "grad_norm": 0.6224566698074341, + "learning_rate": 0.00014665714285714285, + "loss": 0.1159, + "step": 1790 + }, + { + "epoch": 10.257142857142856, + "grad_norm": 0.45800235867500305, + "learning_rate": 0.0001462285714285714, + "loss": 0.0942, + "step": 1795 + }, + { + "epoch": 10.285714285714286, + "grad_norm": 0.6258400082588196, + "learning_rate": 0.0001458, + "loss": 0.1079, + "step": 1800 + }, + { + "epoch": 10.314285714285715, + "grad_norm": 1.1812794208526611, + "learning_rate": 0.00014537142857142858, + "loss": 0.1378, + "step": 1805 + }, + { + "epoch": 10.342857142857143, + "grad_norm": 0.8541269898414612, + "learning_rate": 0.00014494285714285713, + "loss": 0.1274, + "step": 1810 + }, + { + "epoch": 10.371428571428572, + "grad_norm": 0.7131860256195068, + "learning_rate": 0.0001445142857142857, + "loss": 0.1247, + "step": 1815 + }, + { + "epoch": 10.4, + "grad_norm": 0.6109820008277893, + "learning_rate": 0.00014408571428571428, + "loss": 0.1246, + "step": 1820 + }, + { + "epoch": 10.428571428571429, + "grad_norm": 0.5621510744094849, + "learning_rate": 0.00014365714285714286, + "loss": 0.1039, + "step": 1825 + }, + { + "epoch": 10.457142857142857, + "grad_norm": 1.022777795791626, + "learning_rate": 0.00014322857142857142, + "loss": 0.1206, + "step": 1830 + }, + { + "epoch": 10.485714285714286, + "grad_norm": 0.9120668768882751, + "learning_rate": 0.00014279999999999997, + "loss": 0.1289, + "step": 1835 + }, + { + "epoch": 10.514285714285714, + "grad_norm": 1.1882030963897705, + "learning_rate": 0.00014237142857142856, + "loss": 0.1194, + "step": 1840 + }, + { + "epoch": 10.542857142857143, + "grad_norm": 0.6078401207923889, + "learning_rate": 0.00014194285714285714, + "loss": 0.1339, + "step": 1845 + }, + { + "epoch": 10.571428571428571, + "grad_norm": 0.7380999326705933, + "learning_rate": 0.0001415142857142857, + "loss": 0.1318, + "step": 1850 + }, + { + "epoch": 10.6, + "grad_norm": 0.5884959101676941, + "learning_rate": 0.00014108571428571428, + "loss": 0.1249, + "step": 1855 + }, + { + "epoch": 10.628571428571428, + "grad_norm": 1.0121936798095703, + "learning_rate": 0.00014065714285714284, + "loss": 0.1137, + "step": 1860 + }, + { + "epoch": 10.657142857142857, + "grad_norm": 0.6444916129112244, + "learning_rate": 0.00014022857142857143, + "loss": 0.1213, + "step": 1865 + }, + { + "epoch": 10.685714285714285, + "grad_norm": 0.7931004762649536, + "learning_rate": 0.00013979999999999998, + "loss": 0.1318, + "step": 1870 + }, + { + "epoch": 10.714285714285714, + "grad_norm": 0.5596404075622559, + "learning_rate": 0.00013937142857142857, + "loss": 0.1075, + "step": 1875 + }, + { + "epoch": 10.742857142857144, + "grad_norm": 0.6586474180221558, + "learning_rate": 0.00013894285714285712, + "loss": 0.13, + "step": 1880 + }, + { + "epoch": 10.771428571428572, + "grad_norm": 1.0195013284683228, + "learning_rate": 0.00013851428571428568, + "loss": 0.1373, + "step": 1885 + }, + { + "epoch": 10.8, + "grad_norm": 0.9233512878417969, + "learning_rate": 0.00013808571428571427, + "loss": 0.1168, + "step": 1890 + }, + { + "epoch": 10.82857142857143, + "grad_norm": 0.7154092788696289, + "learning_rate": 0.00013765714285714285, + "loss": 0.1081, + "step": 1895 + }, + { + "epoch": 10.857142857142858, + "grad_norm": 1.4588117599487305, + "learning_rate": 0.0001372285714285714, + "loss": 0.1061, + "step": 1900 + }, + { + "epoch": 10.885714285714286, + "grad_norm": 0.6087035536766052, + "learning_rate": 0.0001368, + "loss": 0.1157, + "step": 1905 + }, + { + "epoch": 10.914285714285715, + "grad_norm": 0.7371247410774231, + "learning_rate": 0.00013637142857142855, + "loss": 0.1339, + "step": 1910 + }, + { + "epoch": 10.942857142857143, + "grad_norm": 0.8253212571144104, + "learning_rate": 0.00013594285714285713, + "loss": 0.1198, + "step": 1915 + }, + { + "epoch": 10.971428571428572, + "grad_norm": 0.6889544129371643, + "learning_rate": 0.00013551428571428572, + "loss": 0.1131, + "step": 1920 + }, + { + "epoch": 11.0, + "grad_norm": 0.6408224105834961, + "learning_rate": 0.00013508571428571427, + "loss": 0.122, + "step": 1925 + }, + { + "epoch": 11.028571428571428, + "grad_norm": 0.6771185398101807, + "learning_rate": 0.00013465714285714283, + "loss": 0.1492, + "step": 1930 + }, + { + "epoch": 11.057142857142857, + "grad_norm": 0.8706450462341309, + "learning_rate": 0.00013422857142857142, + "loss": 0.1294, + "step": 1935 + }, + { + "epoch": 11.085714285714285, + "grad_norm": 1.730648398399353, + "learning_rate": 0.0001338, + "loss": 0.1004, + "step": 1940 + }, + { + "epoch": 11.114285714285714, + "grad_norm": 0.6985113620758057, + "learning_rate": 0.00013337142857142856, + "loss": 0.0995, + "step": 1945 + }, + { + "epoch": 11.142857142857142, + "grad_norm": 0.8901951313018799, + "learning_rate": 0.00013294285714285711, + "loss": 0.1179, + "step": 1950 + }, + { + "epoch": 11.17142857142857, + "grad_norm": 0.7232164144515991, + "learning_rate": 0.0001325142857142857, + "loss": 0.1397, + "step": 1955 + }, + { + "epoch": 11.2, + "grad_norm": 0.6447544693946838, + "learning_rate": 0.00013208571428571428, + "loss": 0.1366, + "step": 1960 + }, + { + "epoch": 11.228571428571428, + "grad_norm": 0.7964944243431091, + "learning_rate": 0.00013165714285714284, + "loss": 0.1121, + "step": 1965 + }, + { + "epoch": 11.257142857142856, + "grad_norm": 0.9012628793716431, + "learning_rate": 0.00013122857142857142, + "loss": 0.1131, + "step": 1970 + }, + { + "epoch": 11.285714285714286, + "grad_norm": 0.9295369982719421, + "learning_rate": 0.00013079999999999998, + "loss": 0.1232, + "step": 1975 + }, + { + "epoch": 11.314285714285715, + "grad_norm": 0.6237708926200867, + "learning_rate": 0.00013037142857142857, + "loss": 0.1066, + "step": 1980 + }, + { + "epoch": 11.342857142857143, + "grad_norm": 0.5250967741012573, + "learning_rate": 0.00012994285714285715, + "loss": 0.118, + "step": 1985 + }, + { + "epoch": 11.371428571428572, + "grad_norm": 1.0013964176177979, + "learning_rate": 0.0001295142857142857, + "loss": 0.1125, + "step": 1990 + }, + { + "epoch": 11.4, + "grad_norm": 0.6721311807632446, + "learning_rate": 0.00012908571428571426, + "loss": 0.1196, + "step": 1995 + }, + { + "epoch": 11.428571428571429, + "grad_norm": 0.6966421008110046, + "learning_rate": 0.00012865714285714285, + "loss": 0.1172, + "step": 2000 + }, + { + "epoch": 11.457142857142857, + "grad_norm": 0.8811460733413696, + "learning_rate": 0.00012822857142857143, + "loss": 0.135, + "step": 2005 + }, + { + "epoch": 11.485714285714286, + "grad_norm": 0.8829531073570251, + "learning_rate": 0.0001278, + "loss": 0.1288, + "step": 2010 + }, + { + "epoch": 11.514285714285714, + "grad_norm": 0.7530654668807983, + "learning_rate": 0.00012737142857142855, + "loss": 0.1073, + "step": 2015 + }, + { + "epoch": 11.542857142857143, + "grad_norm": 0.513940691947937, + "learning_rate": 0.00012694285714285713, + "loss": 0.121, + "step": 2020 + }, + { + "epoch": 11.571428571428571, + "grad_norm": 0.8574968576431274, + "learning_rate": 0.0001265142857142857, + "loss": 0.1103, + "step": 2025 + }, + { + "epoch": 11.6, + "grad_norm": 0.7482439875602722, + "learning_rate": 0.00012608571428571427, + "loss": 0.1027, + "step": 2030 + }, + { + "epoch": 11.628571428571428, + "grad_norm": 0.8367976546287537, + "learning_rate": 0.00012565714285714286, + "loss": 0.1181, + "step": 2035 + }, + { + "epoch": 11.657142857142857, + "grad_norm": 2.048128366470337, + "learning_rate": 0.0001252285714285714, + "loss": 0.1122, + "step": 2040 + }, + { + "epoch": 11.685714285714285, + "grad_norm": 0.7426862716674805, + "learning_rate": 0.00012479999999999997, + "loss": 0.1169, + "step": 2045 + }, + { + "epoch": 11.714285714285714, + "grad_norm": 3.093841791152954, + "learning_rate": 0.00012437142857142855, + "loss": 0.1164, + "step": 2050 + }, + { + "epoch": 11.742857142857144, + "grad_norm": 0.8172643184661865, + "learning_rate": 0.00012394285714285714, + "loss": 0.1354, + "step": 2055 + }, + { + "epoch": 11.771428571428572, + "grad_norm": 1.9950591325759888, + "learning_rate": 0.0001235142857142857, + "loss": 0.1037, + "step": 2060 + }, + { + "epoch": 11.8, + "grad_norm": 0.5929077863693237, + "learning_rate": 0.00012308571428571428, + "loss": 0.1194, + "step": 2065 + }, + { + "epoch": 11.82857142857143, + "grad_norm": 1.293624997138977, + "learning_rate": 0.00012265714285714284, + "loss": 0.12, + "step": 2070 + }, + { + "epoch": 11.857142857142858, + "grad_norm": 1.0515168905258179, + "learning_rate": 0.00012222857142857142, + "loss": 0.1049, + "step": 2075 + }, + { + "epoch": 11.885714285714286, + "grad_norm": 1.2874428033828735, + "learning_rate": 0.00012179999999999999, + "loss": 0.115, + "step": 2080 + }, + { + "epoch": 11.914285714285715, + "grad_norm": 0.7317278385162354, + "learning_rate": 0.00012137142857142856, + "loss": 0.1184, + "step": 2085 + }, + { + "epoch": 11.942857142857143, + "grad_norm": 1.3407148122787476, + "learning_rate": 0.00012094285714285713, + "loss": 0.132, + "step": 2090 + }, + { + "epoch": 11.971428571428572, + "grad_norm": 2.656409502029419, + "learning_rate": 0.00012051428571428569, + "loss": 0.1359, + "step": 2095 + }, + { + "epoch": 12.0, + "grad_norm": 0.7189064025878906, + "learning_rate": 0.00012008571428571428, + "loss": 0.1217, + "step": 2100 + }, + { + "epoch": 12.028571428571428, + "grad_norm": 0.7510334849357605, + "learning_rate": 0.00011965714285714285, + "loss": 0.109, + "step": 2105 + }, + { + "epoch": 12.057142857142857, + "grad_norm": 0.7235113382339478, + "learning_rate": 0.00011922857142857142, + "loss": 0.1114, + "step": 2110 + }, + { + "epoch": 12.085714285714285, + "grad_norm": 1.7435882091522217, + "learning_rate": 0.0001188, + "loss": 0.1357, + "step": 2115 + }, + { + "epoch": 12.114285714285714, + "grad_norm": 1.170392632484436, + "learning_rate": 0.00011837142857142856, + "loss": 0.1255, + "step": 2120 + }, + { + "epoch": 12.142857142857142, + "grad_norm": 0.6476783752441406, + "learning_rate": 0.00011794285714285713, + "loss": 0.1108, + "step": 2125 + }, + { + "epoch": 12.17142857142857, + "grad_norm": 0.8599929213523865, + "learning_rate": 0.00011751428571428571, + "loss": 0.0997, + "step": 2130 + }, + { + "epoch": 12.2, + "grad_norm": 0.8918687105178833, + "learning_rate": 0.00011708571428571428, + "loss": 0.1149, + "step": 2135 + }, + { + "epoch": 12.228571428571428, + "grad_norm": 1.609435796737671, + "learning_rate": 0.00011665714285714284, + "loss": 0.1136, + "step": 2140 + }, + { + "epoch": 12.257142857142856, + "grad_norm": 0.6206801533699036, + "learning_rate": 0.00011622857142857143, + "loss": 0.1135, + "step": 2145 + }, + { + "epoch": 12.285714285714286, + "grad_norm": 0.8769077658653259, + "learning_rate": 0.0001158, + "loss": 0.1344, + "step": 2150 + }, + { + "epoch": 12.314285714285715, + "grad_norm": 0.6279401183128357, + "learning_rate": 0.00011537142857142855, + "loss": 0.1049, + "step": 2155 + }, + { + "epoch": 12.342857142857143, + "grad_norm": 1.1110137701034546, + "learning_rate": 0.00011494285714285712, + "loss": 0.1146, + "step": 2160 + }, + { + "epoch": 12.371428571428572, + "grad_norm": 0.7911233901977539, + "learning_rate": 0.00011451428571428571, + "loss": 0.1257, + "step": 2165 + }, + { + "epoch": 12.4, + "grad_norm": 0.9691207408905029, + "learning_rate": 0.00011408571428571428, + "loss": 0.1226, + "step": 2170 + }, + { + "epoch": 12.428571428571429, + "grad_norm": 0.6168835759162903, + "learning_rate": 0.00011365714285714284, + "loss": 0.1271, + "step": 2175 + }, + { + "epoch": 12.457142857142857, + "grad_norm": 0.6143497228622437, + "learning_rate": 0.00011322857142857142, + "loss": 0.111, + "step": 2180 + }, + { + "epoch": 12.485714285714286, + "grad_norm": 1.5673450231552124, + "learning_rate": 0.00011279999999999999, + "loss": 0.1186, + "step": 2185 + }, + { + "epoch": 12.514285714285714, + "grad_norm": 1.298756718635559, + "learning_rate": 0.00011237142857142856, + "loss": 0.1024, + "step": 2190 + }, + { + "epoch": 12.542857142857143, + "grad_norm": 0.9484918117523193, + "learning_rate": 0.00011194285714285715, + "loss": 0.1171, + "step": 2195 + }, + { + "epoch": 12.571428571428571, + "grad_norm": 0.725705623626709, + "learning_rate": 0.0001115142857142857, + "loss": 0.1216, + "step": 2200 + }, + { + "epoch": 12.6, + "grad_norm": 1.1394798755645752, + "learning_rate": 0.00011108571428571427, + "loss": 0.1132, + "step": 2205 + }, + { + "epoch": 12.628571428571428, + "grad_norm": 0.9548712968826294, + "learning_rate": 0.00011065714285714286, + "loss": 0.1209, + "step": 2210 + }, + { + "epoch": 12.657142857142857, + "grad_norm": 0.6173953413963318, + "learning_rate": 0.00011022857142857143, + "loss": 0.1049, + "step": 2215 + }, + { + "epoch": 12.685714285714285, + "grad_norm": 0.8227205872535706, + "learning_rate": 0.00010979999999999999, + "loss": 0.1045, + "step": 2220 + }, + { + "epoch": 12.714285714285714, + "grad_norm": 0.7252780795097351, + "learning_rate": 0.00010937142857142856, + "loss": 0.1146, + "step": 2225 + }, + { + "epoch": 12.742857142857144, + "grad_norm": 0.9374399781227112, + "learning_rate": 0.00010894285714285714, + "loss": 0.1478, + "step": 2230 + }, + { + "epoch": 12.771428571428572, + "grad_norm": 5.1985368728637695, + "learning_rate": 0.0001085142857142857, + "loss": 0.1059, + "step": 2235 + }, + { + "epoch": 12.8, + "grad_norm": 0.9629620909690857, + "learning_rate": 0.00010808571428571427, + "loss": 0.124, + "step": 2240 + }, + { + "epoch": 12.82857142857143, + "grad_norm": 0.7022290229797363, + "learning_rate": 0.00010765714285714285, + "loss": 0.1309, + "step": 2245 + }, + { + "epoch": 12.857142857142858, + "grad_norm": 0.574188232421875, + "learning_rate": 0.00010722857142857142, + "loss": 0.086, + "step": 2250 + }, + { + "epoch": 12.885714285714286, + "grad_norm": 0.9712439179420471, + "learning_rate": 0.00010679999999999998, + "loss": 0.1152, + "step": 2255 + }, + { + "epoch": 12.914285714285715, + "grad_norm": 0.6562150120735168, + "learning_rate": 0.00010637142857142856, + "loss": 0.1343, + "step": 2260 + }, + { + "epoch": 12.942857142857143, + "grad_norm": 0.6936819553375244, + "learning_rate": 0.00010594285714285714, + "loss": 0.1009, + "step": 2265 + }, + { + "epoch": 12.971428571428572, + "grad_norm": 0.8664882779121399, + "learning_rate": 0.0001055142857142857, + "loss": 0.1164, + "step": 2270 + }, + { + "epoch": 13.0, + "grad_norm": 0.9224509000778198, + "learning_rate": 0.00010508571428571429, + "loss": 0.1347, + "step": 2275 + }, + { + "epoch": 13.028571428571428, + "grad_norm": 0.6596968770027161, + "learning_rate": 0.00010465714285714285, + "loss": 0.1041, + "step": 2280 + }, + { + "epoch": 13.057142857142857, + "grad_norm": 0.6456631422042847, + "learning_rate": 0.00010422857142857142, + "loss": 0.1142, + "step": 2285 + }, + { + "epoch": 13.085714285714285, + "grad_norm": 0.9466612339019775, + "learning_rate": 0.00010379999999999999, + "loss": 0.1191, + "step": 2290 + }, + { + "epoch": 13.114285714285714, + "grad_norm": 0.9036727547645569, + "learning_rate": 0.00010337142857142856, + "loss": 0.121, + "step": 2295 + }, + { + "epoch": 13.142857142857142, + "grad_norm": 1.08086359500885, + "learning_rate": 0.00010294285714285713, + "loss": 0.1313, + "step": 2300 + }, + { + "epoch": 13.17142857142857, + "grad_norm": 0.703241765499115, + "learning_rate": 0.0001025142857142857, + "loss": 0.1151, + "step": 2305 + }, + { + "epoch": 13.2, + "grad_norm": 0.7901896238327026, + "learning_rate": 0.00010208571428571429, + "loss": 0.1275, + "step": 2310 + }, + { + "epoch": 13.228571428571428, + "grad_norm": 0.703542947769165, + "learning_rate": 0.00010165714285714284, + "loss": 0.1, + "step": 2315 + }, + { + "epoch": 13.257142857142856, + "grad_norm": 0.6657671928405762, + "learning_rate": 0.00010122857142857141, + "loss": 0.1141, + "step": 2320 + }, + { + "epoch": 13.285714285714286, + "grad_norm": 0.7593729496002197, + "learning_rate": 0.0001008, + "loss": 0.1099, + "step": 2325 + }, + { + "epoch": 13.314285714285715, + "grad_norm": 0.6681057810783386, + "learning_rate": 0.00010037142857142857, + "loss": 0.112, + "step": 2330 + }, + { + "epoch": 13.342857142857143, + "grad_norm": 0.7155857682228088, + "learning_rate": 9.994285714285712e-05, + "loss": 0.0989, + "step": 2335 + }, + { + "epoch": 13.371428571428572, + "grad_norm": 0.9484553337097168, + "learning_rate": 9.951428571428571e-05, + "loss": 0.0902, + "step": 2340 + }, + { + "epoch": 13.4, + "grad_norm": 0.9317265152931213, + "learning_rate": 9.908571428571428e-05, + "loss": 0.1432, + "step": 2345 + }, + { + "epoch": 13.428571428571429, + "grad_norm": 1.039158821105957, + "learning_rate": 9.865714285714285e-05, + "loss": 0.114, + "step": 2350 + }, + { + "epoch": 13.457142857142857, + "grad_norm": 0.8524000644683838, + "learning_rate": 9.822857142857141e-05, + "loss": 0.1144, + "step": 2355 + }, + { + "epoch": 13.485714285714286, + "grad_norm": 0.6337461471557617, + "learning_rate": 9.779999999999999e-05, + "loss": 0.1073, + "step": 2360 + }, + { + "epoch": 13.514285714285714, + "grad_norm": 0.9097298383712769, + "learning_rate": 9.737142857142856e-05, + "loss": 0.1031, + "step": 2365 + }, + { + "epoch": 13.542857142857143, + "grad_norm": 1.2013412714004517, + "learning_rate": 9.694285714285713e-05, + "loss": 0.1174, + "step": 2370 + }, + { + "epoch": 13.571428571428571, + "grad_norm": 0.7055214643478394, + "learning_rate": 9.65142857142857e-05, + "loss": 0.1175, + "step": 2375 + }, + { + "epoch": 13.6, + "grad_norm": 0.807955265045166, + "learning_rate": 9.608571428571427e-05, + "loss": 0.1286, + "step": 2380 + }, + { + "epoch": 13.628571428571428, + "grad_norm": 0.6661797761917114, + "learning_rate": 9.565714285714285e-05, + "loss": 0.1091, + "step": 2385 + }, + { + "epoch": 13.657142857142857, + "grad_norm": 1.119604468345642, + "learning_rate": 9.522857142857143e-05, + "loss": 0.1393, + "step": 2390 + }, + { + "epoch": 13.685714285714285, + "grad_norm": 0.5365435481071472, + "learning_rate": 9.479999999999999e-05, + "loss": 0.1075, + "step": 2395 + }, + { + "epoch": 13.714285714285714, + "grad_norm": 0.9443924427032471, + "learning_rate": 9.437142857142856e-05, + "loss": 0.0977, + "step": 2400 + }, + { + "epoch": 13.742857142857144, + "grad_norm": 0.6075264811515808, + "learning_rate": 9.394285714285714e-05, + "loss": 0.1329, + "step": 2405 + }, + { + "epoch": 13.771428571428572, + "grad_norm": 1.019352912902832, + "learning_rate": 9.351428571428571e-05, + "loss": 0.1083, + "step": 2410 + }, + { + "epoch": 13.8, + "grad_norm": 0.7234058380126953, + "learning_rate": 9.308571428571427e-05, + "loss": 0.1118, + "step": 2415 + }, + { + "epoch": 13.82857142857143, + "grad_norm": 0.6786122918128967, + "learning_rate": 9.265714285714284e-05, + "loss": 0.1208, + "step": 2420 + }, + { + "epoch": 13.857142857142858, + "grad_norm": 0.5820732116699219, + "learning_rate": 9.222857142857142e-05, + "loss": 0.1022, + "step": 2425 + }, + { + "epoch": 13.885714285714286, + "grad_norm": 0.8007987141609192, + "learning_rate": 9.18e-05, + "loss": 0.1293, + "step": 2430 + }, + { + "epoch": 13.914285714285715, + "grad_norm": 0.6813766956329346, + "learning_rate": 9.137142857142855e-05, + "loss": 0.1284, + "step": 2435 + }, + { + "epoch": 13.942857142857143, + "grad_norm": 0.6460041403770447, + "learning_rate": 9.094285714285714e-05, + "loss": 0.1073, + "step": 2440 + }, + { + "epoch": 13.971428571428572, + "grad_norm": 0.5939205288887024, + "learning_rate": 9.051428571428571e-05, + "loss": 0.1185, + "step": 2445 + }, + { + "epoch": 14.0, + "grad_norm": 0.8150635361671448, + "learning_rate": 9.008571428571428e-05, + "loss": 0.1039, + "step": 2450 + }, + { + "epoch": 14.028571428571428, + "grad_norm": 1.3691389560699463, + "learning_rate": 8.965714285714285e-05, + "loss": 0.1112, + "step": 2455 + }, + { + "epoch": 14.057142857142857, + "grad_norm": 0.9042718410491943, + "learning_rate": 8.922857142857142e-05, + "loss": 0.112, + "step": 2460 + }, + { + "epoch": 14.085714285714285, + "grad_norm": 0.7222105860710144, + "learning_rate": 8.879999999999999e-05, + "loss": 0.1221, + "step": 2465 + }, + { + "epoch": 14.114285714285714, + "grad_norm": 0.595588207244873, + "learning_rate": 8.837142857142857e-05, + "loss": 0.1058, + "step": 2470 + }, + { + "epoch": 14.142857142857142, + "grad_norm": 0.5262706279754639, + "learning_rate": 8.794285714285713e-05, + "loss": 0.1071, + "step": 2475 + }, + { + "epoch": 14.17142857142857, + "grad_norm": 0.6511022448539734, + "learning_rate": 8.75142857142857e-05, + "loss": 0.0917, + "step": 2480 + }, + { + "epoch": 14.2, + "grad_norm": 0.5737650394439697, + "learning_rate": 8.708571428571427e-05, + "loss": 0.0988, + "step": 2485 + }, + { + "epoch": 14.228571428571428, + "grad_norm": 0.7679132223129272, + "learning_rate": 8.665714285714286e-05, + "loss": 0.1185, + "step": 2490 + }, + { + "epoch": 14.257142857142856, + "grad_norm": 0.641198456287384, + "learning_rate": 8.622857142857141e-05, + "loss": 0.0894, + "step": 2495 + }, + { + "epoch": 14.285714285714286, + "grad_norm": 0.7215464115142822, + "learning_rate": 8.579999999999998e-05, + "loss": 0.0935, + "step": 2500 + }, + { + "epoch": 14.314285714285715, + "grad_norm": 1.0740891695022583, + "learning_rate": 8.537142857142857e-05, + "loss": 0.1156, + "step": 2505 + }, + { + "epoch": 14.342857142857143, + "grad_norm": 0.6668990254402161, + "learning_rate": 8.494285714285714e-05, + "loss": 0.1006, + "step": 2510 + }, + { + "epoch": 14.371428571428572, + "grad_norm": 0.6674673557281494, + "learning_rate": 8.45142857142857e-05, + "loss": 0.1045, + "step": 2515 + }, + { + "epoch": 14.4, + "grad_norm": 0.6198854446411133, + "learning_rate": 8.408571428571428e-05, + "loss": 0.0997, + "step": 2520 + }, + { + "epoch": 14.428571428571429, + "grad_norm": 0.7187360525131226, + "learning_rate": 8.365714285714285e-05, + "loss": 0.1277, + "step": 2525 + }, + { + "epoch": 14.457142857142857, + "grad_norm": 0.583990216255188, + "learning_rate": 8.322857142857142e-05, + "loss": 0.1182, + "step": 2530 + }, + { + "epoch": 14.485714285714286, + "grad_norm": 1.1340539455413818, + "learning_rate": 8.28e-05, + "loss": 0.106, + "step": 2535 + }, + { + "epoch": 14.514285714285714, + "grad_norm": 0.6411644816398621, + "learning_rate": 8.237142857142856e-05, + "loss": 0.0963, + "step": 2540 + }, + { + "epoch": 14.542857142857143, + "grad_norm": 0.7092474102973938, + "learning_rate": 8.194285714285713e-05, + "loss": 0.1061, + "step": 2545 + }, + { + "epoch": 14.571428571428571, + "grad_norm": 0.6887038946151733, + "learning_rate": 8.151428571428572e-05, + "loss": 0.1224, + "step": 2550 + }, + { + "epoch": 14.6, + "grad_norm": 0.8119840621948242, + "learning_rate": 8.108571428571428e-05, + "loss": 0.1023, + "step": 2555 + }, + { + "epoch": 14.628571428571428, + "grad_norm": 0.6380637288093567, + "learning_rate": 8.065714285714285e-05, + "loss": 0.0893, + "step": 2560 + }, + { + "epoch": 14.657142857142857, + "grad_norm": 0.7857063412666321, + "learning_rate": 8.022857142857142e-05, + "loss": 0.1227, + "step": 2565 + }, + { + "epoch": 14.685714285714285, + "grad_norm": 0.6368046998977661, + "learning_rate": 7.98e-05, + "loss": 0.1074, + "step": 2570 + }, + { + "epoch": 14.714285714285714, + "grad_norm": 0.7269926071166992, + "learning_rate": 7.937142857142856e-05, + "loss": 0.1166, + "step": 2575 + }, + { + "epoch": 14.742857142857144, + "grad_norm": 0.6903791427612305, + "learning_rate": 7.894285714285713e-05, + "loss": 0.1274, + "step": 2580 + }, + { + "epoch": 14.771428571428572, + "grad_norm": 0.8257679343223572, + "learning_rate": 7.851428571428571e-05, + "loss": 0.1274, + "step": 2585 + }, + { + "epoch": 14.8, + "grad_norm": 1.0489627122879028, + "learning_rate": 7.808571428571428e-05, + "loss": 0.1091, + "step": 2590 + }, + { + "epoch": 14.82857142857143, + "grad_norm": 0.6699196696281433, + "learning_rate": 7.765714285714284e-05, + "loss": 0.1244, + "step": 2595 + }, + { + "epoch": 14.857142857142858, + "grad_norm": 0.61530601978302, + "learning_rate": 7.722857142857143e-05, + "loss": 0.1122, + "step": 2600 + }, + { + "epoch": 14.885714285714286, + "grad_norm": 0.5789124369621277, + "learning_rate": 7.68e-05, + "loss": 0.1272, + "step": 2605 + }, + { + "epoch": 14.914285714285715, + "grad_norm": 2.1323459148406982, + "learning_rate": 7.637142857142857e-05, + "loss": 0.1034, + "step": 2610 + }, + { + "epoch": 14.942857142857143, + "grad_norm": 1.2433545589447021, + "learning_rate": 7.594285714285714e-05, + "loss": 0.1052, + "step": 2615 + }, + { + "epoch": 14.971428571428572, + "grad_norm": 0.868093729019165, + "learning_rate": 7.551428571428571e-05, + "loss": 0.111, + "step": 2620 + }, + { + "epoch": 15.0, + "grad_norm": 0.6479918360710144, + "learning_rate": 7.508571428571428e-05, + "loss": 0.1067, + "step": 2625 + }, + { + "epoch": 15.028571428571428, + "grad_norm": 0.8062720894813538, + "learning_rate": 7.465714285714285e-05, + "loss": 0.1113, + "step": 2630 + }, + { + "epoch": 15.057142857142857, + "grad_norm": 0.7333181500434875, + "learning_rate": 7.422857142857142e-05, + "loss": 0.0985, + "step": 2635 + }, + { + "epoch": 15.085714285714285, + "grad_norm": 0.550039529800415, + "learning_rate": 7.379999999999999e-05, + "loss": 0.1077, + "step": 2640 + }, + { + "epoch": 15.114285714285714, + "grad_norm": 0.9256687164306641, + "learning_rate": 7.337142857142856e-05, + "loss": 0.0875, + "step": 2645 + }, + { + "epoch": 15.142857142857142, + "grad_norm": 0.6421870589256287, + "learning_rate": 7.294285714285713e-05, + "loss": 0.1069, + "step": 2650 + }, + { + "epoch": 15.17142857142857, + "grad_norm": 0.6614648699760437, + "learning_rate": 7.25142857142857e-05, + "loss": 0.1249, + "step": 2655 + }, + { + "epoch": 15.2, + "grad_norm": 0.8273601531982422, + "learning_rate": 7.208571428571429e-05, + "loss": 0.1135, + "step": 2660 + }, + { + "epoch": 15.228571428571428, + "grad_norm": 0.6795836687088013, + "learning_rate": 7.165714285714284e-05, + "loss": 0.1081, + "step": 2665 + }, + { + "epoch": 15.257142857142856, + "grad_norm": 0.7508160471916199, + "learning_rate": 7.122857142857143e-05, + "loss": 0.0869, + "step": 2670 + }, + { + "epoch": 15.285714285714286, + "grad_norm": 0.7219347357749939, + "learning_rate": 7.079999999999999e-05, + "loss": 0.1115, + "step": 2675 + }, + { + "epoch": 15.314285714285715, + "grad_norm": 0.5592671036720276, + "learning_rate": 7.037142857142857e-05, + "loss": 0.1116, + "step": 2680 + }, + { + "epoch": 15.342857142857143, + "grad_norm": 0.8736717104911804, + "learning_rate": 6.994285714285714e-05, + "loss": 0.0784, + "step": 2685 + }, + { + "epoch": 15.371428571428572, + "grad_norm": 0.6056572198867798, + "learning_rate": 6.951428571428571e-05, + "loss": 0.1105, + "step": 2690 + }, + { + "epoch": 15.4, + "grad_norm": 0.671410322189331, + "learning_rate": 6.908571428571428e-05, + "loss": 0.1219, + "step": 2695 + }, + { + "epoch": 15.428571428571429, + "grad_norm": 0.7952276468276978, + "learning_rate": 6.865714285714285e-05, + "loss": 0.0865, + "step": 2700 + }, + { + "epoch": 15.457142857142857, + "grad_norm": 0.8185123205184937, + "learning_rate": 6.822857142857142e-05, + "loss": 0.1095, + "step": 2705 + }, + { + "epoch": 15.485714285714286, + "grad_norm": 0.6969497203826904, + "learning_rate": 6.78e-05, + "loss": 0.0928, + "step": 2710 + }, + { + "epoch": 15.514285714285714, + "grad_norm": 0.7323058843612671, + "learning_rate": 6.737142857142857e-05, + "loss": 0.099, + "step": 2715 + }, + { + "epoch": 15.542857142857143, + "grad_norm": 0.6498017311096191, + "learning_rate": 6.694285714285714e-05, + "loss": 0.0871, + "step": 2720 + }, + { + "epoch": 15.571428571428571, + "grad_norm": 2.0899710655212402, + "learning_rate": 6.65142857142857e-05, + "loss": 0.1306, + "step": 2725 + }, + { + "epoch": 15.6, + "grad_norm": 1.0896337032318115, + "learning_rate": 6.608571428571428e-05, + "loss": 0.1085, + "step": 2730 + }, + { + "epoch": 15.628571428571428, + "grad_norm": 0.6709671020507812, + "learning_rate": 6.565714285714285e-05, + "loss": 0.0977, + "step": 2735 + }, + { + "epoch": 15.657142857142857, + "grad_norm": 0.6750431060791016, + "learning_rate": 6.522857142857142e-05, + "loss": 0.1154, + "step": 2740 + }, + { + "epoch": 15.685714285714285, + "grad_norm": 1.2888147830963135, + "learning_rate": 6.479999999999999e-05, + "loss": 0.0918, + "step": 2745 + }, + { + "epoch": 15.714285714285714, + "grad_norm": 0.9803931713104248, + "learning_rate": 6.437142857142857e-05, + "loss": 0.112, + "step": 2750 + }, + { + "epoch": 15.742857142857144, + "grad_norm": 0.8548974394798279, + "learning_rate": 6.394285714285713e-05, + "loss": 0.0974, + "step": 2755 + }, + { + "epoch": 15.771428571428572, + "grad_norm": 0.7924854159355164, + "learning_rate": 6.351428571428572e-05, + "loss": 0.1344, + "step": 2760 + }, + { + "epoch": 15.8, + "grad_norm": 0.9245836138725281, + "learning_rate": 6.308571428571429e-05, + "loss": 0.1182, + "step": 2765 + }, + { + "epoch": 15.82857142857143, + "grad_norm": 0.6067193150520325, + "learning_rate": 6.265714285714286e-05, + "loss": 0.0959, + "step": 2770 + }, + { + "epoch": 15.857142857142858, + "grad_norm": 0.5575870275497437, + "learning_rate": 6.222857142857143e-05, + "loss": 0.1208, + "step": 2775 + }, + { + "epoch": 15.885714285714286, + "grad_norm": 0.8608399629592896, + "learning_rate": 6.18e-05, + "loss": 0.0937, + "step": 2780 + }, + { + "epoch": 15.914285714285715, + "grad_norm": 0.6910924911499023, + "learning_rate": 6.137142857142857e-05, + "loss": 0.1175, + "step": 2785 + }, + { + "epoch": 15.942857142857143, + "grad_norm": 0.7266614437103271, + "learning_rate": 6.094285714285714e-05, + "loss": 0.1023, + "step": 2790 + }, + { + "epoch": 15.971428571428572, + "grad_norm": 0.7580139636993408, + "learning_rate": 6.051428571428571e-05, + "loss": 0.1103, + "step": 2795 + }, + { + "epoch": 16.0, + "grad_norm": 0.9288797974586487, + "learning_rate": 6.008571428571428e-05, + "loss": 0.0892, + "step": 2800 + }, + { + "epoch": 16.02857142857143, + "grad_norm": 1.4218194484710693, + "learning_rate": 5.9657142857142845e-05, + "loss": 0.104, + "step": 2805 + }, + { + "epoch": 16.057142857142857, + "grad_norm": 0.7665567994117737, + "learning_rate": 5.922857142857142e-05, + "loss": 0.1084, + "step": 2810 + }, + { + "epoch": 16.085714285714285, + "grad_norm": 0.8587457537651062, + "learning_rate": 5.88e-05, + "loss": 0.1041, + "step": 2815 + }, + { + "epoch": 16.114285714285714, + "grad_norm": 0.5792443752288818, + "learning_rate": 5.837142857142856e-05, + "loss": 0.1086, + "step": 2820 + }, + { + "epoch": 16.142857142857142, + "grad_norm": 0.6510186195373535, + "learning_rate": 5.794285714285714e-05, + "loss": 0.0919, + "step": 2825 + }, + { + "epoch": 16.17142857142857, + "grad_norm": 1.170145869255066, + "learning_rate": 5.751428571428571e-05, + "loss": 0.1083, + "step": 2830 + }, + { + "epoch": 16.2, + "grad_norm": 1.0514795780181885, + "learning_rate": 5.708571428571428e-05, + "loss": 0.1223, + "step": 2835 + }, + { + "epoch": 16.228571428571428, + "grad_norm": 0.7993499040603638, + "learning_rate": 5.665714285714285e-05, + "loss": 0.1101, + "step": 2840 + }, + { + "epoch": 16.257142857142856, + "grad_norm": 0.6342432498931885, + "learning_rate": 5.622857142857142e-05, + "loss": 0.1243, + "step": 2845 + }, + { + "epoch": 16.285714285714285, + "grad_norm": 1.2524505853652954, + "learning_rate": 5.5799999999999994e-05, + "loss": 0.1251, + "step": 2850 + }, + { + "epoch": 16.314285714285713, + "grad_norm": 1.0769789218902588, + "learning_rate": 5.537142857142857e-05, + "loss": 0.1074, + "step": 2855 + }, + { + "epoch": 16.34285714285714, + "grad_norm": 1.8232245445251465, + "learning_rate": 5.4942857142857136e-05, + "loss": 0.0929, + "step": 2860 + }, + { + "epoch": 16.37142857142857, + "grad_norm": 0.814189612865448, + "learning_rate": 5.451428571428571e-05, + "loss": 0.0998, + "step": 2865 + }, + { + "epoch": 16.4, + "grad_norm": 0.9731772541999817, + "learning_rate": 5.4085714285714284e-05, + "loss": 0.0849, + "step": 2870 + }, + { + "epoch": 16.428571428571427, + "grad_norm": 0.778213381767273, + "learning_rate": 5.3657142857142855e-05, + "loss": 0.0907, + "step": 2875 + }, + { + "epoch": 16.457142857142856, + "grad_norm": 0.9219964146614075, + "learning_rate": 5.3228571428571425e-05, + "loss": 0.0855, + "step": 2880 + }, + { + "epoch": 16.485714285714284, + "grad_norm": 0.7354393005371094, + "learning_rate": 5.279999999999999e-05, + "loss": 0.1296, + "step": 2885 + }, + { + "epoch": 16.514285714285712, + "grad_norm": 0.6051219701766968, + "learning_rate": 5.2371428571428567e-05, + "loss": 0.1086, + "step": 2890 + }, + { + "epoch": 16.542857142857144, + "grad_norm": 0.8592603206634521, + "learning_rate": 5.1942857142857144e-05, + "loss": 0.1017, + "step": 2895 + }, + { + "epoch": 16.571428571428573, + "grad_norm": 0.5748846530914307, + "learning_rate": 5.151428571428571e-05, + "loss": 0.0775, + "step": 2900 + }, + { + "epoch": 16.6, + "grad_norm": 0.6640213131904602, + "learning_rate": 5.1085714285714285e-05, + "loss": 0.1059, + "step": 2905 + }, + { + "epoch": 16.62857142857143, + "grad_norm": 0.9514361023902893, + "learning_rate": 5.065714285714285e-05, + "loss": 0.0832, + "step": 2910 + }, + { + "epoch": 16.65714285714286, + "grad_norm": 1.1062079668045044, + "learning_rate": 5.022857142857143e-05, + "loss": 0.0817, + "step": 2915 + }, + { + "epoch": 16.685714285714287, + "grad_norm": 0.6824453473091125, + "learning_rate": 4.98e-05, + "loss": 0.1064, + "step": 2920 + }, + { + "epoch": 16.714285714285715, + "grad_norm": 0.643827497959137, + "learning_rate": 4.937142857142856e-05, + "loss": 0.1196, + "step": 2925 + }, + { + "epoch": 16.742857142857144, + "grad_norm": 0.7824274897575378, + "learning_rate": 4.894285714285714e-05, + "loss": 0.0945, + "step": 2930 + }, + { + "epoch": 16.771428571428572, + "grad_norm": 0.7110689878463745, + "learning_rate": 4.8514285714285716e-05, + "loss": 0.1124, + "step": 2935 + }, + { + "epoch": 16.8, + "grad_norm": 0.9542856812477112, + "learning_rate": 4.808571428571428e-05, + "loss": 0.1036, + "step": 2940 + }, + { + "epoch": 16.82857142857143, + "grad_norm": 0.6353528499603271, + "learning_rate": 4.765714285714286e-05, + "loss": 0.0977, + "step": 2945 + }, + { + "epoch": 16.857142857142858, + "grad_norm": 0.843910813331604, + "learning_rate": 4.722857142857142e-05, + "loss": 0.1164, + "step": 2950 + }, + { + "epoch": 16.885714285714286, + "grad_norm": 0.9607085585594177, + "learning_rate": 4.68e-05, + "loss": 0.1111, + "step": 2955 + }, + { + "epoch": 16.914285714285715, + "grad_norm": 0.7393201589584351, + "learning_rate": 4.637142857142857e-05, + "loss": 0.106, + "step": 2960 + }, + { + "epoch": 16.942857142857143, + "grad_norm": 0.5248494148254395, + "learning_rate": 4.5942857142857134e-05, + "loss": 0.1017, + "step": 2965 + }, + { + "epoch": 16.97142857142857, + "grad_norm": 0.8800868988037109, + "learning_rate": 4.551428571428571e-05, + "loss": 0.0872, + "step": 2970 + }, + { + "epoch": 17.0, + "grad_norm": 0.8447640538215637, + "learning_rate": 4.5085714285714275e-05, + "loss": 0.1293, + "step": 2975 + }, + { + "epoch": 17.02857142857143, + "grad_norm": 0.5356553792953491, + "learning_rate": 4.465714285714285e-05, + "loss": 0.0984, + "step": 2980 + }, + { + "epoch": 17.057142857142857, + "grad_norm": 0.7713034152984619, + "learning_rate": 4.422857142857143e-05, + "loss": 0.0858, + "step": 2985 + }, + { + "epoch": 17.085714285714285, + "grad_norm": 0.9854580760002136, + "learning_rate": 4.3799999999999994e-05, + "loss": 0.1237, + "step": 2990 + }, + { + "epoch": 17.114285714285714, + "grad_norm": 0.7012975811958313, + "learning_rate": 4.337142857142857e-05, + "loss": 0.1233, + "step": 2995 + }, + { + "epoch": 17.142857142857142, + "grad_norm": 0.5461836457252502, + "learning_rate": 4.294285714285714e-05, + "loss": 0.0978, + "step": 3000 + } + ], + "logging_steps": 5, + "max_steps": 3500, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 200, + "trial_name": null, + "trial_params": null +} diff --git a/glot-contrastive-final-lora/checkpoint-3000/training_args.bin b/glot-contrastive-final-lora/checkpoint-3000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..504636d7db81a728b5309e791498748e16c3b8ec --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-3000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02a87dc6b2c67ad3df98065b9e8fa21d9d93cd2cb361c532cb83c8a37bdc81a3 +size 5777 diff --git a/glot-contrastive-final-lora/checkpoint-3500/README.md b/glot-contrastive-final-lora/checkpoint-3500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9e142ce83a38d62751eee08226da0acc7c10eae5 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-3500/README.md @@ -0,0 +1,206 @@ +--- +base_model: ./glot-mlm-adapted +library_name: peft +tags: +- base_model:adapter:./glot-mlm-adapted +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/glot-contrastive-final-lora/checkpoint-3500/adapter_config.json b/glot-contrastive-final-lora/checkpoint-3500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..846b0d8ef2ad7bc4b0b04decd5769b6250d7be73 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-3500/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./glot-mlm-adapted", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query", + "value" + ], + "target_parameters": null, + "task_type": "FEATURE_EXTRACTION", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/glot-contrastive-final-lora/checkpoint-3500/adapter_model.safetensors b/glot-contrastive-final-lora/checkpoint-3500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dba4d5dd074dc3d6c4bc4d4f36793beac178e2c3 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-3500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ba05d9cb007251d29a6f02fdd92f56fa1beb8f9e0676686472daf07c4e9f478 +size 2365824 diff --git a/glot-contrastive-final-lora/checkpoint-3500/optimizer.pt b/glot-contrastive-final-lora/checkpoint-3500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..20f9723049ba94933d0ebf54f53f34b8edb32d68 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-3500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec69941aff3e022b84f7642bf65d1f256ba3d34a59cc1d3185bfaed806e27b82 +size 4760395 diff --git a/glot-contrastive-final-lora/checkpoint-3500/rng_state.pth b/glot-contrastive-final-lora/checkpoint-3500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f2f30d236c6c0667ccc3a756b378faccab328a42 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-3500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:119b741182d96487ccd4b17518349b97dcc4a6ddb4f50860c285ce876df3e7b3 +size 14645 diff --git a/glot-contrastive-final-lora/checkpoint-3500/scheduler.pt b/glot-contrastive-final-lora/checkpoint-3500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..baae0d262b4de605c55755daad74e8104d12dea5 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-3500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65c44133b6126dd443eb3d89fa8880514c2d750a567a84b1bc49dd491e9083bb +size 1465 diff --git a/glot-contrastive-final-lora/checkpoint-3500/sentencepiece.bpe.model b/glot-contrastive-final-lora/checkpoint-3500/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..41409bcc76133d6e92fdd5909093d24e32662ba0 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-3500/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a313a26470baedaede322622492f2a542aa41527ddc5d40de444e945ad3c613 +size 7658320 diff --git a/glot-contrastive-final-lora/checkpoint-3500/special_tokens_map.json b/glot-contrastive-final-lora/checkpoint-3500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..68171d1ff68b731a33d119708476692c094a466b --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-3500/special_tokens_map.json @@ -0,0 +1,15 @@ +{ + "bos_token": "", + "cls_token": "", + "eos_token": "", + "mask_token": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "sep_token": "", + "unk_token": "" +} diff --git a/glot-contrastive-final-lora/checkpoint-3500/tokenizer_config.json b/glot-contrastive-final-lora/checkpoint-3500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2d4a02cfefad7bc6d993905187616acfc547f17d --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-3500/tokenizer_config.json @@ -0,0 +1,57 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "401144": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "cls_token": "", + "eos_token": "", + "extra_special_tokens": {}, + "mask_token": "", + "model_max_length": 512, + "pad_token": "", + "sep_token": "", + "sp_model_kwargs": {}, + "tokenizer_class": "XLMRobertaTokenizer", + "unk_token": "", + "use_fast": true +} diff --git a/glot-contrastive-final-lora/checkpoint-3500/trainer_state.json b/glot-contrastive-final-lora/checkpoint-3500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..675d571d5f9814e4e6181db1dee227e7e2d62781 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-3500/trainer_state.json @@ -0,0 +1,4934 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 20.0, + "eval_steps": 5, + "global_step": 3500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02857142857142857, + "grad_norm": 0.1407003551721573, + "learning_rate": 0.00029965714285714283, + "loss": 0.9726, + "step": 5 + }, + { + "epoch": 0.05714285714285714, + "grad_norm": 0.26689061522483826, + "learning_rate": 0.0002992285714285714, + "loss": 0.9633, + "step": 10 + }, + { + "epoch": 0.08571428571428572, + "grad_norm": 0.8670485615730286, + "learning_rate": 0.0002988, + "loss": 0.9013, + "step": 15 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 0.9785467386245728, + "learning_rate": 0.00029837142857142853, + "loss": 0.6942, + "step": 20 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 1.3083932399749756, + "learning_rate": 0.0002979428571428571, + "loss": 0.4472, + "step": 25 + }, + { + "epoch": 0.17142857142857143, + "grad_norm": 1.6103293895721436, + "learning_rate": 0.0002975142857142857, + "loss": 0.3782, + "step": 30 + }, + { + "epoch": 0.2, + "grad_norm": 2.6353416442871094, + "learning_rate": 0.0002970857142857143, + "loss": 0.3732, + "step": 35 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 0.9949072003364563, + "learning_rate": 0.0002966571428571428, + "loss": 0.3506, + "step": 40 + }, + { + "epoch": 0.2571428571428571, + "grad_norm": 1.280673861503601, + "learning_rate": 0.0002962285714285714, + "loss": 0.3346, + "step": 45 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.7681456208229065, + "learning_rate": 0.0002958, + "loss": 0.2832, + "step": 50 + }, + { + "epoch": 0.3142857142857143, + "grad_norm": 1.0000813007354736, + "learning_rate": 0.0002953714285714285, + "loss": 0.2603, + "step": 55 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 1.0222399234771729, + "learning_rate": 0.0002949428571428571, + "loss": 0.2507, + "step": 60 + }, + { + "epoch": 0.37142857142857144, + "grad_norm": 0.896902322769165, + "learning_rate": 0.0002945142857142857, + "loss": 0.2556, + "step": 65 + }, + { + "epoch": 0.4, + "grad_norm": 0.9035541415214539, + "learning_rate": 0.00029408571428571426, + "loss": 0.2402, + "step": 70 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 1.4886469841003418, + "learning_rate": 0.00029365714285714285, + "loss": 0.2376, + "step": 75 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.8951187133789062, + "learning_rate": 0.0002932285714285714, + "loss": 0.2276, + "step": 80 + }, + { + "epoch": 0.4857142857142857, + "grad_norm": 0.7876377105712891, + "learning_rate": 0.00029279999999999996, + "loss": 0.2537, + "step": 85 + }, + { + "epoch": 0.5142857142857142, + "grad_norm": 1.0927226543426514, + "learning_rate": 0.00029237142857142855, + "loss": 0.2152, + "step": 90 + }, + { + "epoch": 0.5428571428571428, + "grad_norm": 1.4946355819702148, + "learning_rate": 0.00029194285714285713, + "loss": 0.2441, + "step": 95 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.7082991600036621, + "learning_rate": 0.0002915142857142857, + "loss": 0.2708, + "step": 100 + }, + { + "epoch": 0.6, + "grad_norm": 0.670010507106781, + "learning_rate": 0.00029108571428571424, + "loss": 0.2396, + "step": 105 + }, + { + "epoch": 0.6285714285714286, + "grad_norm": 0.9797312021255493, + "learning_rate": 0.00029065714285714283, + "loss": 0.2275, + "step": 110 + }, + { + "epoch": 0.6571428571428571, + "grad_norm": 1.5220463275909424, + "learning_rate": 0.0002902285714285714, + "loss": 0.2114, + "step": 115 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 1.3326867818832397, + "learning_rate": 0.00028979999999999994, + "loss": 0.241, + "step": 120 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 1.1195529699325562, + "learning_rate": 0.0002893714285714285, + "loss": 0.2389, + "step": 125 + }, + { + "epoch": 0.7428571428571429, + "grad_norm": 0.7551061511039734, + "learning_rate": 0.0002889428571428571, + "loss": 0.2162, + "step": 130 + }, + { + "epoch": 0.7714285714285715, + "grad_norm": 1.018908977508545, + "learning_rate": 0.0002885142857142857, + "loss": 0.1924, + "step": 135 + }, + { + "epoch": 0.8, + "grad_norm": 2.123642921447754, + "learning_rate": 0.0002880857142857143, + "loss": 0.2174, + "step": 140 + }, + { + "epoch": 0.8285714285714286, + "grad_norm": 0.7585068941116333, + "learning_rate": 0.0002876571428571428, + "loss": 0.2006, + "step": 145 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 1.64150869846344, + "learning_rate": 0.0002872285714285714, + "loss": 0.1905, + "step": 150 + }, + { + "epoch": 0.8857142857142857, + "grad_norm": 0.9126951694488525, + "learning_rate": 0.0002868, + "loss": 0.2312, + "step": 155 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 0.7278801202774048, + "learning_rate": 0.00028637142857142856, + "loss": 0.2077, + "step": 160 + }, + { + "epoch": 0.9428571428571428, + "grad_norm": 0.8931339383125305, + "learning_rate": 0.00028594285714285715, + "loss": 0.1951, + "step": 165 + }, + { + "epoch": 0.9714285714285714, + "grad_norm": 1.0831843614578247, + "learning_rate": 0.0002855142857142857, + "loss": 0.2103, + "step": 170 + }, + { + "epoch": 1.0, + "grad_norm": 1.3750063180923462, + "learning_rate": 0.00028508571428571426, + "loss": 0.2396, + "step": 175 + }, + { + "epoch": 1.0285714285714285, + "grad_norm": 0.8338337540626526, + "learning_rate": 0.00028465714285714285, + "loss": 0.2404, + "step": 180 + }, + { + "epoch": 1.0571428571428572, + "grad_norm": 1.2879024744033813, + "learning_rate": 0.0002842285714285714, + "loss": 0.2117, + "step": 185 + }, + { + "epoch": 1.0857142857142856, + "grad_norm": 1.6751821041107178, + "learning_rate": 0.00028379999999999996, + "loss": 0.1796, + "step": 190 + }, + { + "epoch": 1.1142857142857143, + "grad_norm": 0.9864417910575867, + "learning_rate": 0.00028337142857142854, + "loss": 0.1993, + "step": 195 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 1.0174155235290527, + "learning_rate": 0.00028294285714285713, + "loss": 0.2068, + "step": 200 + }, + { + "epoch": 1.1714285714285715, + "grad_norm": 1.029832124710083, + "learning_rate": 0.0002825142857142857, + "loss": 0.2015, + "step": 205 + }, + { + "epoch": 1.2, + "grad_norm": 0.7745446562767029, + "learning_rate": 0.00028208571428571424, + "loss": 0.2129, + "step": 210 + }, + { + "epoch": 1.2285714285714286, + "grad_norm": 2.5578622817993164, + "learning_rate": 0.0002816571428571428, + "loss": 0.2224, + "step": 215 + }, + { + "epoch": 1.2571428571428571, + "grad_norm": 2.4185051918029785, + "learning_rate": 0.0002812285714285714, + "loss": 0.2276, + "step": 220 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 1.4176461696624756, + "learning_rate": 0.0002808, + "loss": 0.1781, + "step": 225 + }, + { + "epoch": 1.3142857142857143, + "grad_norm": 0.709326982498169, + "learning_rate": 0.0002803714285714286, + "loss": 0.2177, + "step": 230 + }, + { + "epoch": 1.342857142857143, + "grad_norm": 0.8170766830444336, + "learning_rate": 0.0002799428571428571, + "loss": 0.1769, + "step": 235 + }, + { + "epoch": 1.3714285714285714, + "grad_norm": 1.3850761651992798, + "learning_rate": 0.0002795142857142857, + "loss": 0.2262, + "step": 240 + }, + { + "epoch": 1.4, + "grad_norm": 1.0064373016357422, + "learning_rate": 0.0002790857142857143, + "loss": 0.196, + "step": 245 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 1.9635728597640991, + "learning_rate": 0.0002786571428571428, + "loss": 0.2029, + "step": 250 + }, + { + "epoch": 1.457142857142857, + "grad_norm": 16.20791244506836, + "learning_rate": 0.0002782285714285714, + "loss": 0.3925, + "step": 255 + }, + { + "epoch": 1.4857142857142858, + "grad_norm": 1.4363322257995605, + "learning_rate": 0.0002778, + "loss": 0.3684, + "step": 260 + }, + { + "epoch": 1.5142857142857142, + "grad_norm": 0.9379534721374512, + "learning_rate": 0.00027737142857142856, + "loss": 0.2265, + "step": 265 + }, + { + "epoch": 1.5428571428571427, + "grad_norm": 0.8453512787818909, + "learning_rate": 0.00027694285714285714, + "loss": 0.1976, + "step": 270 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 2.316664695739746, + "learning_rate": 0.0002765142857142857, + "loss": 0.23, + "step": 275 + }, + { + "epoch": 1.6, + "grad_norm": 1.0548444986343384, + "learning_rate": 0.00027608571428571426, + "loss": 0.1823, + "step": 280 + }, + { + "epoch": 1.6285714285714286, + "grad_norm": 3.7894928455352783, + "learning_rate": 0.00027565714285714284, + "loss": 0.1962, + "step": 285 + }, + { + "epoch": 1.657142857142857, + "grad_norm": 2.3081610202789307, + "learning_rate": 0.00027522857142857143, + "loss": 0.2087, + "step": 290 + }, + { + "epoch": 1.6857142857142857, + "grad_norm": 0.9311438202857971, + "learning_rate": 0.0002748, + "loss": 0.1597, + "step": 295 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 1.1881247758865356, + "learning_rate": 0.00027437142857142854, + "loss": 0.1764, + "step": 300 + }, + { + "epoch": 1.7428571428571429, + "grad_norm": 1.30265212059021, + "learning_rate": 0.0002739428571428571, + "loss": 0.1647, + "step": 305 + }, + { + "epoch": 1.7714285714285714, + "grad_norm": 0.6832175850868225, + "learning_rate": 0.0002735142857142857, + "loss": 0.1638, + "step": 310 + }, + { + "epoch": 1.8, + "grad_norm": 1.8740538358688354, + "learning_rate": 0.00027308571428571424, + "loss": 0.1803, + "step": 315 + }, + { + "epoch": 1.8285714285714287, + "grad_norm": 9.821504592895508, + "learning_rate": 0.0002726571428571428, + "loss": 0.226, + "step": 320 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 1.0889750719070435, + "learning_rate": 0.0002722285714285714, + "loss": 0.1822, + "step": 325 + }, + { + "epoch": 1.8857142857142857, + "grad_norm": 0.9660868048667908, + "learning_rate": 0.0002718, + "loss": 0.1842, + "step": 330 + }, + { + "epoch": 1.9142857142857141, + "grad_norm": 0.6329234838485718, + "learning_rate": 0.0002713714285714286, + "loss": 0.1488, + "step": 335 + }, + { + "epoch": 1.9428571428571428, + "grad_norm": 3.601266384124756, + "learning_rate": 0.0002709428571428571, + "loss": 0.1887, + "step": 340 + }, + { + "epoch": 1.9714285714285715, + "grad_norm": 1.1441439390182495, + "learning_rate": 0.0002705142857142857, + "loss": 0.184, + "step": 345 + }, + { + "epoch": 2.0, + "grad_norm": 0.8586034774780273, + "learning_rate": 0.0002700857142857143, + "loss": 0.1578, + "step": 350 + }, + { + "epoch": 2.0285714285714285, + "grad_norm": 1.5113487243652344, + "learning_rate": 0.00026965714285714286, + "loss": 0.2002, + "step": 355 + }, + { + "epoch": 2.057142857142857, + "grad_norm": 1.1123011112213135, + "learning_rate": 0.0002692285714285714, + "loss": 0.1946, + "step": 360 + }, + { + "epoch": 2.085714285714286, + "grad_norm": 0.9377036094665527, + "learning_rate": 0.0002688, + "loss": 0.1971, + "step": 365 + }, + { + "epoch": 2.1142857142857143, + "grad_norm": 0.6956892609596252, + "learning_rate": 0.00026837142857142856, + "loss": 0.1758, + "step": 370 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.7510782480239868, + "learning_rate": 0.0002679428571428571, + "loss": 0.1674, + "step": 375 + }, + { + "epoch": 2.1714285714285713, + "grad_norm": 0.7009285092353821, + "learning_rate": 0.00026751428571428567, + "loss": 0.1945, + "step": 380 + }, + { + "epoch": 2.2, + "grad_norm": 0.9555609822273254, + "learning_rate": 0.00026708571428571426, + "loss": 0.1857, + "step": 385 + }, + { + "epoch": 2.2285714285714286, + "grad_norm": 2.133979082107544, + "learning_rate": 0.00026665714285714284, + "loss": 0.1636, + "step": 390 + }, + { + "epoch": 2.257142857142857, + "grad_norm": 0.7105309963226318, + "learning_rate": 0.0002662285714285714, + "loss": 0.2014, + "step": 395 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.7329701781272888, + "learning_rate": 0.00026579999999999996, + "loss": 0.1884, + "step": 400 + }, + { + "epoch": 2.314285714285714, + "grad_norm": 1.0426994562149048, + "learning_rate": 0.00026537142857142854, + "loss": 0.1558, + "step": 405 + }, + { + "epoch": 2.342857142857143, + "grad_norm": 0.9306122660636902, + "learning_rate": 0.0002649428571428571, + "loss": 0.1774, + "step": 410 + }, + { + "epoch": 2.3714285714285714, + "grad_norm": 0.6989394426345825, + "learning_rate": 0.00026451428571428565, + "loss": 0.1601, + "step": 415 + }, + { + "epoch": 2.4, + "grad_norm": 1.4383760690689087, + "learning_rate": 0.0002640857142857143, + "loss": 0.1564, + "step": 420 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 0.6448336839675903, + "learning_rate": 0.0002636571428571428, + "loss": 0.1827, + "step": 425 + }, + { + "epoch": 2.4571428571428573, + "grad_norm": 0.9535760879516602, + "learning_rate": 0.0002632285714285714, + "loss": 0.1713, + "step": 430 + }, + { + "epoch": 2.4857142857142858, + "grad_norm": 1.034945011138916, + "learning_rate": 0.0002628, + "loss": 0.1457, + "step": 435 + }, + { + "epoch": 2.5142857142857142, + "grad_norm": 1.3225128650665283, + "learning_rate": 0.0002623714285714285, + "loss": 0.1633, + "step": 440 + }, + { + "epoch": 2.5428571428571427, + "grad_norm": 0.8285059928894043, + "learning_rate": 0.0002619428571428571, + "loss": 0.2004, + "step": 445 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.773176908493042, + "learning_rate": 0.0002615142857142857, + "loss": 0.1641, + "step": 450 + }, + { + "epoch": 2.6, + "grad_norm": 0.7964853048324585, + "learning_rate": 0.0002610857142857143, + "loss": 0.1608, + "step": 455 + }, + { + "epoch": 2.6285714285714286, + "grad_norm": 1.0967328548431396, + "learning_rate": 0.00026065714285714286, + "loss": 0.1697, + "step": 460 + }, + { + "epoch": 2.657142857142857, + "grad_norm": 0.6462066173553467, + "learning_rate": 0.0002602285714285714, + "loss": 0.1512, + "step": 465 + }, + { + "epoch": 2.685714285714286, + "grad_norm": 0.8765937089920044, + "learning_rate": 0.00025979999999999997, + "loss": 0.1826, + "step": 470 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 1.2524124383926392, + "learning_rate": 0.00025937142857142856, + "loss": 0.1731, + "step": 475 + }, + { + "epoch": 2.742857142857143, + "grad_norm": 2.2982606887817383, + "learning_rate": 0.0002589428571428571, + "loss": 0.1852, + "step": 480 + }, + { + "epoch": 2.7714285714285714, + "grad_norm": 0.9989053010940552, + "learning_rate": 0.0002585142857142857, + "loss": 0.1791, + "step": 485 + }, + { + "epoch": 2.8, + "grad_norm": 0.772343635559082, + "learning_rate": 0.00025808571428571426, + "loss": 0.1862, + "step": 490 + }, + { + "epoch": 2.8285714285714287, + "grad_norm": 1.2101136445999146, + "learning_rate": 0.00025765714285714284, + "loss": 0.1806, + "step": 495 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.8010189533233643, + "learning_rate": 0.0002572285714285714, + "loss": 0.1842, + "step": 500 + }, + { + "epoch": 2.8857142857142857, + "grad_norm": 1.3597544431686401, + "learning_rate": 0.00025679999999999995, + "loss": 0.1583, + "step": 505 + }, + { + "epoch": 2.914285714285714, + "grad_norm": 0.8790671825408936, + "learning_rate": 0.00025637142857142854, + "loss": 0.1565, + "step": 510 + }, + { + "epoch": 2.942857142857143, + "grad_norm": 1.1175066232681274, + "learning_rate": 0.0002559428571428571, + "loss": 0.1406, + "step": 515 + }, + { + "epoch": 2.9714285714285715, + "grad_norm": 2.8528785705566406, + "learning_rate": 0.0002555142857142857, + "loss": 0.1735, + "step": 520 + }, + { + "epoch": 3.0, + "grad_norm": 2.2073328495025635, + "learning_rate": 0.0002550857142857143, + "loss": 0.1816, + "step": 525 + }, + { + "epoch": 3.0285714285714285, + "grad_norm": 11.01322078704834, + "learning_rate": 0.0002546571428571428, + "loss": 0.1873, + "step": 530 + }, + { + "epoch": 3.057142857142857, + "grad_norm": 1.5822402238845825, + "learning_rate": 0.0002542285714285714, + "loss": 0.168, + "step": 535 + }, + { + "epoch": 3.085714285714286, + "grad_norm": 1.3086942434310913, + "learning_rate": 0.0002538, + "loss": 0.149, + "step": 540 + }, + { + "epoch": 3.1142857142857143, + "grad_norm": 6.303041458129883, + "learning_rate": 0.0002533714285714285, + "loss": 0.1651, + "step": 545 + }, + { + "epoch": 3.142857142857143, + "grad_norm": 14.48929500579834, + "learning_rate": 0.00025294285714285716, + "loss": 0.1687, + "step": 550 + }, + { + "epoch": 3.1714285714285713, + "grad_norm": 6.824525356292725, + "learning_rate": 0.0002525142857142857, + "loss": 0.1919, + "step": 555 + }, + { + "epoch": 3.2, + "grad_norm": 18.772563934326172, + "learning_rate": 0.00025208571428571427, + "loss": 0.2075, + "step": 560 + }, + { + "epoch": 3.2285714285714286, + "grad_norm": 0.7268752455711365, + "learning_rate": 0.00025165714285714286, + "loss": 0.174, + "step": 565 + }, + { + "epoch": 3.257142857142857, + "grad_norm": 1.1301453113555908, + "learning_rate": 0.0002512285714285714, + "loss": 0.1668, + "step": 570 + }, + { + "epoch": 3.2857142857142856, + "grad_norm": 2.846802234649658, + "learning_rate": 0.00025079999999999997, + "loss": 0.1645, + "step": 575 + }, + { + "epoch": 3.314285714285714, + "grad_norm": 1.417515754699707, + "learning_rate": 0.00025037142857142855, + "loss": 0.1719, + "step": 580 + }, + { + "epoch": 3.342857142857143, + "grad_norm": 4.137150764465332, + "learning_rate": 0.00024994285714285714, + "loss": 0.1739, + "step": 585 + }, + { + "epoch": 3.3714285714285714, + "grad_norm": 2.6067259311676025, + "learning_rate": 0.0002495142857142857, + "loss": 0.1489, + "step": 590 + }, + { + "epoch": 3.4, + "grad_norm": 2.601024627685547, + "learning_rate": 0.00024908571428571425, + "loss": 0.1618, + "step": 595 + }, + { + "epoch": 3.4285714285714284, + "grad_norm": 3.849017858505249, + "learning_rate": 0.00024865714285714284, + "loss": 0.1899, + "step": 600 + }, + { + "epoch": 3.4571428571428573, + "grad_norm": 4.673766136169434, + "learning_rate": 0.0002482285714285714, + "loss": 0.1761, + "step": 605 + }, + { + "epoch": 3.4857142857142858, + "grad_norm": 2.6057631969451904, + "learning_rate": 0.00024779999999999995, + "loss": 0.1743, + "step": 610 + }, + { + "epoch": 3.5142857142857142, + "grad_norm": 2.932652473449707, + "learning_rate": 0.0002473714285714286, + "loss": 0.1482, + "step": 615 + }, + { + "epoch": 3.5428571428571427, + "grad_norm": 0.8764939308166504, + "learning_rate": 0.0002469428571428571, + "loss": 0.1644, + "step": 620 + }, + { + "epoch": 3.571428571428571, + "grad_norm": 1.3203191757202148, + "learning_rate": 0.0002465142857142857, + "loss": 0.1654, + "step": 625 + }, + { + "epoch": 3.6, + "grad_norm": 0.7977635264396667, + "learning_rate": 0.0002460857142857143, + "loss": 0.1472, + "step": 630 + }, + { + "epoch": 3.6285714285714286, + "grad_norm": 1.4750248193740845, + "learning_rate": 0.0002456571428571428, + "loss": 0.1735, + "step": 635 + }, + { + "epoch": 3.657142857142857, + "grad_norm": 1.8164482116699219, + "learning_rate": 0.0002452285714285714, + "loss": 0.1593, + "step": 640 + }, + { + "epoch": 3.685714285714286, + "grad_norm": 1.4829603433609009, + "learning_rate": 0.0002448, + "loss": 0.1508, + "step": 645 + }, + { + "epoch": 3.7142857142857144, + "grad_norm": 0.8828144669532776, + "learning_rate": 0.00024437142857142857, + "loss": 0.1573, + "step": 650 + }, + { + "epoch": 3.742857142857143, + "grad_norm": 2.039384126663208, + "learning_rate": 0.00024394285714285713, + "loss": 0.1745, + "step": 655 + }, + { + "epoch": 3.7714285714285714, + "grad_norm": 0.9604200720787048, + "learning_rate": 0.00024351428571428569, + "loss": 0.17, + "step": 660 + }, + { + "epoch": 3.8, + "grad_norm": 0.7903971076011658, + "learning_rate": 0.00024308571428571427, + "loss": 0.1654, + "step": 665 + }, + { + "epoch": 3.8285714285714287, + "grad_norm": 0.6935649514198303, + "learning_rate": 0.00024265714285714283, + "loss": 0.1714, + "step": 670 + }, + { + "epoch": 3.857142857142857, + "grad_norm": 0.5832012295722961, + "learning_rate": 0.00024222857142857138, + "loss": 0.1636, + "step": 675 + }, + { + "epoch": 3.8857142857142857, + "grad_norm": 0.6303168535232544, + "learning_rate": 0.0002418, + "loss": 0.1604, + "step": 680 + }, + { + "epoch": 3.914285714285714, + "grad_norm": 0.7210885882377625, + "learning_rate": 0.00024137142857142855, + "loss": 0.1444, + "step": 685 + }, + { + "epoch": 3.942857142857143, + "grad_norm": 0.7690990567207336, + "learning_rate": 0.00024094285714285714, + "loss": 0.1631, + "step": 690 + }, + { + "epoch": 3.9714285714285715, + "grad_norm": 1.0142720937728882, + "learning_rate": 0.0002405142857142857, + "loss": 0.158, + "step": 695 + }, + { + "epoch": 4.0, + "grad_norm": 0.7970322966575623, + "learning_rate": 0.00024008571428571425, + "loss": 0.1803, + "step": 700 + }, + { + "epoch": 4.0285714285714285, + "grad_norm": 0.6795914769172668, + "learning_rate": 0.00023965714285714284, + "loss": 0.143, + "step": 705 + }, + { + "epoch": 4.057142857142857, + "grad_norm": 0.6832629442214966, + "learning_rate": 0.0002392285714285714, + "loss": 0.1457, + "step": 710 + }, + { + "epoch": 4.085714285714285, + "grad_norm": 3.8629798889160156, + "learning_rate": 0.0002388, + "loss": 0.1671, + "step": 715 + }, + { + "epoch": 4.114285714285714, + "grad_norm": 1.1167882680892944, + "learning_rate": 0.00023837142857142856, + "loss": 0.1544, + "step": 720 + }, + { + "epoch": 4.142857142857143, + "grad_norm": 0.9431412816047668, + "learning_rate": 0.00023794285714285712, + "loss": 0.1605, + "step": 725 + }, + { + "epoch": 4.171428571428572, + "grad_norm": 1.310948133468628, + "learning_rate": 0.0002375142857142857, + "loss": 0.1121, + "step": 730 + }, + { + "epoch": 4.2, + "grad_norm": 0.9830737709999084, + "learning_rate": 0.00023708571428571426, + "loss": 0.1742, + "step": 735 + }, + { + "epoch": 4.228571428571429, + "grad_norm": 0.6166555881500244, + "learning_rate": 0.00023665714285714282, + "loss": 0.1525, + "step": 740 + }, + { + "epoch": 4.257142857142857, + "grad_norm": 0.995579719543457, + "learning_rate": 0.00023622857142857143, + "loss": 0.1439, + "step": 745 + }, + { + "epoch": 4.285714285714286, + "grad_norm": 0.639796793460846, + "learning_rate": 0.00023579999999999999, + "loss": 0.1692, + "step": 750 + }, + { + "epoch": 4.314285714285714, + "grad_norm": 0.9438050389289856, + "learning_rate": 0.00023537142857142854, + "loss": 0.1785, + "step": 755 + }, + { + "epoch": 4.3428571428571425, + "grad_norm": 0.8960750102996826, + "learning_rate": 0.00023494285714285713, + "loss": 0.1557, + "step": 760 + }, + { + "epoch": 4.371428571428572, + "grad_norm": 0.6287499070167542, + "learning_rate": 0.00023451428571428568, + "loss": 0.1459, + "step": 765 + }, + { + "epoch": 4.4, + "grad_norm": 0.7638295888900757, + "learning_rate": 0.00023408571428571424, + "loss": 0.1341, + "step": 770 + }, + { + "epoch": 4.428571428571429, + "grad_norm": 0.655878484249115, + "learning_rate": 0.00023365714285714283, + "loss": 0.1358, + "step": 775 + }, + { + "epoch": 4.457142857142857, + "grad_norm": 0.5840997695922852, + "learning_rate": 0.0002332285714285714, + "loss": 0.1386, + "step": 780 + }, + { + "epoch": 4.485714285714286, + "grad_norm": 1.1082488298416138, + "learning_rate": 0.0002328, + "loss": 0.1827, + "step": 785 + }, + { + "epoch": 4.514285714285714, + "grad_norm": 0.8825240135192871, + "learning_rate": 0.00023237142857142855, + "loss": 0.1527, + "step": 790 + }, + { + "epoch": 4.542857142857143, + "grad_norm": 0.6752304434776306, + "learning_rate": 0.0002319428571428571, + "loss": 0.1392, + "step": 795 + }, + { + "epoch": 4.571428571428571, + "grad_norm": 1.1423301696777344, + "learning_rate": 0.0002315142857142857, + "loss": 0.1433, + "step": 800 + }, + { + "epoch": 4.6, + "grad_norm": 10.793691635131836, + "learning_rate": 0.00023108571428571425, + "loss": 0.1635, + "step": 805 + }, + { + "epoch": 4.628571428571428, + "grad_norm": 0.47564294934272766, + "learning_rate": 0.00023065714285714286, + "loss": 0.1199, + "step": 810 + }, + { + "epoch": 4.6571428571428575, + "grad_norm": 1.2492656707763672, + "learning_rate": 0.00023022857142857142, + "loss": 0.1488, + "step": 815 + }, + { + "epoch": 4.685714285714286, + "grad_norm": 0.6933501958847046, + "learning_rate": 0.00022979999999999997, + "loss": 0.1812, + "step": 820 + }, + { + "epoch": 4.714285714285714, + "grad_norm": 0.7901633977890015, + "learning_rate": 0.00022937142857142856, + "loss": 0.1415, + "step": 825 + }, + { + "epoch": 4.742857142857143, + "grad_norm": 0.7854829430580139, + "learning_rate": 0.00022894285714285712, + "loss": 0.1401, + "step": 830 + }, + { + "epoch": 4.771428571428571, + "grad_norm": 0.8716740608215332, + "learning_rate": 0.00022851428571428567, + "loss": 0.1982, + "step": 835 + }, + { + "epoch": 4.8, + "grad_norm": 0.7047899961471558, + "learning_rate": 0.00022808571428571426, + "loss": 0.1624, + "step": 840 + }, + { + "epoch": 4.828571428571428, + "grad_norm": 0.7134959697723389, + "learning_rate": 0.00022765714285714284, + "loss": 0.1375, + "step": 845 + }, + { + "epoch": 4.857142857142857, + "grad_norm": 1.0897325277328491, + "learning_rate": 0.00022722857142857143, + "loss": 0.1489, + "step": 850 + }, + { + "epoch": 4.885714285714286, + "grad_norm": 1.1065207719802856, + "learning_rate": 0.00022679999999999998, + "loss": 0.1495, + "step": 855 + }, + { + "epoch": 4.914285714285715, + "grad_norm": 0.7434757351875305, + "learning_rate": 0.00022637142857142854, + "loss": 0.1507, + "step": 860 + }, + { + "epoch": 4.942857142857143, + "grad_norm": 1.0045181512832642, + "learning_rate": 0.00022594285714285712, + "loss": 0.1527, + "step": 865 + }, + { + "epoch": 4.9714285714285715, + "grad_norm": 1.2025654315948486, + "learning_rate": 0.00022551428571428568, + "loss": 0.1523, + "step": 870 + }, + { + "epoch": 5.0, + "grad_norm": 0.7823342084884644, + "learning_rate": 0.0002250857142857143, + "loss": 0.1514, + "step": 875 + }, + { + "epoch": 5.0285714285714285, + "grad_norm": 0.8405362963676453, + "learning_rate": 0.00022465714285714285, + "loss": 0.1461, + "step": 880 + }, + { + "epoch": 5.057142857142857, + "grad_norm": 0.7527463436126709, + "learning_rate": 0.0002242285714285714, + "loss": 0.1206, + "step": 885 + }, + { + "epoch": 5.085714285714285, + "grad_norm": 0.8372548222541809, + "learning_rate": 0.0002238, + "loss": 0.1513, + "step": 890 + }, + { + "epoch": 5.114285714285714, + "grad_norm": 0.8755456209182739, + "learning_rate": 0.00022337142857142855, + "loss": 0.1498, + "step": 895 + }, + { + "epoch": 5.142857142857143, + "grad_norm": 0.7312084436416626, + "learning_rate": 0.0002229428571428571, + "loss": 0.154, + "step": 900 + }, + { + "epoch": 5.171428571428572, + "grad_norm": 0.6366221904754639, + "learning_rate": 0.0002225142857142857, + "loss": 0.1466, + "step": 905 + }, + { + "epoch": 5.2, + "grad_norm": 0.6406880617141724, + "learning_rate": 0.00022208571428571427, + "loss": 0.1254, + "step": 910 + }, + { + "epoch": 5.228571428571429, + "grad_norm": 2.4106833934783936, + "learning_rate": 0.00022165714285714283, + "loss": 0.1534, + "step": 915 + }, + { + "epoch": 5.257142857142857, + "grad_norm": 0.5635722279548645, + "learning_rate": 0.00022122857142857142, + "loss": 0.1461, + "step": 920 + }, + { + "epoch": 5.285714285714286, + "grad_norm": 0.787162184715271, + "learning_rate": 0.00022079999999999997, + "loss": 0.1424, + "step": 925 + }, + { + "epoch": 5.314285714285714, + "grad_norm": 0.6513975262641907, + "learning_rate": 0.00022037142857142853, + "loss": 0.1326, + "step": 930 + }, + { + "epoch": 5.3428571428571425, + "grad_norm": 0.6933534741401672, + "learning_rate": 0.00021994285714285711, + "loss": 0.1661, + "step": 935 + }, + { + "epoch": 5.371428571428572, + "grad_norm": 0.7263259887695312, + "learning_rate": 0.0002195142857142857, + "loss": 0.15, + "step": 940 + }, + { + "epoch": 5.4, + "grad_norm": 0.5537381768226624, + "learning_rate": 0.00021908571428571428, + "loss": 0.129, + "step": 945 + }, + { + "epoch": 5.428571428571429, + "grad_norm": 0.6014005541801453, + "learning_rate": 0.00021865714285714284, + "loss": 0.1321, + "step": 950 + }, + { + "epoch": 5.457142857142857, + "grad_norm": 0.6581441760063171, + "learning_rate": 0.0002182285714285714, + "loss": 0.1587, + "step": 955 + }, + { + "epoch": 5.485714285714286, + "grad_norm": 0.9326379895210266, + "learning_rate": 0.00021779999999999998, + "loss": 0.1654, + "step": 960 + }, + { + "epoch": 5.514285714285714, + "grad_norm": 0.9438592791557312, + "learning_rate": 0.00021737142857142854, + "loss": 0.1212, + "step": 965 + }, + { + "epoch": 5.542857142857143, + "grad_norm": 0.7699571251869202, + "learning_rate": 0.00021694285714285715, + "loss": 0.1464, + "step": 970 + }, + { + "epoch": 5.571428571428571, + "grad_norm": 0.8758366703987122, + "learning_rate": 0.0002165142857142857, + "loss": 0.1599, + "step": 975 + }, + { + "epoch": 5.6, + "grad_norm": 0.6101442575454712, + "learning_rate": 0.00021608571428571426, + "loss": 0.1589, + "step": 980 + }, + { + "epoch": 5.628571428571428, + "grad_norm": 0.7454060912132263, + "learning_rate": 0.00021565714285714285, + "loss": 0.1433, + "step": 985 + }, + { + "epoch": 5.6571428571428575, + "grad_norm": 0.6379484534263611, + "learning_rate": 0.0002152285714285714, + "loss": 0.1592, + "step": 990 + }, + { + "epoch": 5.685714285714286, + "grad_norm": 1.1601309776306152, + "learning_rate": 0.00021479999999999996, + "loss": 0.1647, + "step": 995 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 0.5464673638343811, + "learning_rate": 0.00021437142857142855, + "loss": 0.1469, + "step": 1000 + }, + { + "epoch": 5.742857142857143, + "grad_norm": 1.0279319286346436, + "learning_rate": 0.00021394285714285713, + "loss": 0.1203, + "step": 1005 + }, + { + "epoch": 5.771428571428571, + "grad_norm": 0.5503718256950378, + "learning_rate": 0.00021351428571428572, + "loss": 0.1409, + "step": 1010 + }, + { + "epoch": 5.8, + "grad_norm": 0.6123886108398438, + "learning_rate": 0.00021308571428571427, + "loss": 0.1427, + "step": 1015 + }, + { + "epoch": 5.828571428571428, + "grad_norm": 0.6560390591621399, + "learning_rate": 0.00021265714285714283, + "loss": 0.1415, + "step": 1020 + }, + { + "epoch": 5.857142857142857, + "grad_norm": 0.5576716661453247, + "learning_rate": 0.00021222857142857141, + "loss": 0.1408, + "step": 1025 + }, + { + "epoch": 5.885714285714286, + "grad_norm": 0.6419074535369873, + "learning_rate": 0.00021179999999999997, + "loss": 0.1385, + "step": 1030 + }, + { + "epoch": 5.914285714285715, + "grad_norm": 1.008925199508667, + "learning_rate": 0.00021137142857142858, + "loss": 0.1497, + "step": 1035 + }, + { + "epoch": 5.942857142857143, + "grad_norm": 0.6559906005859375, + "learning_rate": 0.00021094285714285714, + "loss": 0.1218, + "step": 1040 + }, + { + "epoch": 5.9714285714285715, + "grad_norm": 0.627164363861084, + "learning_rate": 0.0002105142857142857, + "loss": 0.1368, + "step": 1045 + }, + { + "epoch": 6.0, + "grad_norm": 0.5760972499847412, + "learning_rate": 0.00021008571428571428, + "loss": 0.1508, + "step": 1050 + }, + { + "epoch": 6.0285714285714285, + "grad_norm": 0.5754174590110779, + "learning_rate": 0.00020965714285714284, + "loss": 0.1181, + "step": 1055 + }, + { + "epoch": 6.057142857142857, + "grad_norm": 0.8736348748207092, + "learning_rate": 0.0002092285714285714, + "loss": 0.1252, + "step": 1060 + }, + { + "epoch": 6.085714285714285, + "grad_norm": 0.7166719436645508, + "learning_rate": 0.00020879999999999998, + "loss": 0.1481, + "step": 1065 + }, + { + "epoch": 6.114285714285714, + "grad_norm": 0.6494349241256714, + "learning_rate": 0.00020837142857142856, + "loss": 0.1478, + "step": 1070 + }, + { + "epoch": 6.142857142857143, + "grad_norm": 0.6681587100028992, + "learning_rate": 0.00020794285714285712, + "loss": 0.1488, + "step": 1075 + }, + { + "epoch": 6.171428571428572, + "grad_norm": 0.7123684883117676, + "learning_rate": 0.0002075142857142857, + "loss": 0.1378, + "step": 1080 + }, + { + "epoch": 6.2, + "grad_norm": 0.6146950721740723, + "learning_rate": 0.00020708571428571426, + "loss": 0.1306, + "step": 1085 + }, + { + "epoch": 6.228571428571429, + "grad_norm": 0.8402445912361145, + "learning_rate": 0.00020665714285714282, + "loss": 0.1063, + "step": 1090 + }, + { + "epoch": 6.257142857142857, + "grad_norm": 0.6567764282226562, + "learning_rate": 0.0002062285714285714, + "loss": 0.1195, + "step": 1095 + }, + { + "epoch": 6.285714285714286, + "grad_norm": 0.6006014943122864, + "learning_rate": 0.0002058, + "loss": 0.1542, + "step": 1100 + }, + { + "epoch": 6.314285714285714, + "grad_norm": 0.793100893497467, + "learning_rate": 0.00020537142857142857, + "loss": 0.1381, + "step": 1105 + }, + { + "epoch": 6.3428571428571425, + "grad_norm": 0.5923666954040527, + "learning_rate": 0.00020494285714285713, + "loss": 0.1386, + "step": 1110 + }, + { + "epoch": 6.371428571428572, + "grad_norm": 0.6692521572113037, + "learning_rate": 0.0002045142857142857, + "loss": 0.1223, + "step": 1115 + }, + { + "epoch": 6.4, + "grad_norm": 0.7216306328773499, + "learning_rate": 0.00020408571428571427, + "loss": 0.1367, + "step": 1120 + }, + { + "epoch": 6.428571428571429, + "grad_norm": 0.5640934109687805, + "learning_rate": 0.00020365714285714283, + "loss": 0.1554, + "step": 1125 + }, + { + "epoch": 6.457142857142857, + "grad_norm": 0.8154368996620178, + "learning_rate": 0.00020322857142857138, + "loss": 0.1674, + "step": 1130 + }, + { + "epoch": 6.485714285714286, + "grad_norm": 0.7185398936271667, + "learning_rate": 0.0002028, + "loss": 0.1375, + "step": 1135 + }, + { + "epoch": 6.514285714285714, + "grad_norm": 0.6805170774459839, + "learning_rate": 0.00020237142857142855, + "loss": 0.1306, + "step": 1140 + }, + { + "epoch": 6.542857142857143, + "grad_norm": 0.5996941924095154, + "learning_rate": 0.00020194285714285714, + "loss": 0.1433, + "step": 1145 + }, + { + "epoch": 6.571428571428571, + "grad_norm": 0.5258373022079468, + "learning_rate": 0.0002015142857142857, + "loss": 0.1285, + "step": 1150 + }, + { + "epoch": 6.6, + "grad_norm": 0.7771695256233215, + "learning_rate": 0.00020108571428571425, + "loss": 0.1493, + "step": 1155 + }, + { + "epoch": 6.628571428571428, + "grad_norm": 0.5920616388320923, + "learning_rate": 0.00020065714285714284, + "loss": 0.1479, + "step": 1160 + }, + { + "epoch": 6.6571428571428575, + "grad_norm": 0.7460982799530029, + "learning_rate": 0.00020022857142857142, + "loss": 0.1173, + "step": 1165 + }, + { + "epoch": 6.685714285714286, + "grad_norm": 1.1703822612762451, + "learning_rate": 0.0001998, + "loss": 0.1402, + "step": 1170 + }, + { + "epoch": 6.714285714285714, + "grad_norm": 0.7894724011421204, + "learning_rate": 0.00019937142857142856, + "loss": 0.1253, + "step": 1175 + }, + { + "epoch": 6.742857142857143, + "grad_norm": 0.7013376355171204, + "learning_rate": 0.00019894285714285712, + "loss": 0.1573, + "step": 1180 + }, + { + "epoch": 6.771428571428571, + "grad_norm": 0.6421737670898438, + "learning_rate": 0.0001985142857142857, + "loss": 0.1497, + "step": 1185 + }, + { + "epoch": 6.8, + "grad_norm": 1.204296350479126, + "learning_rate": 0.00019808571428571426, + "loss": 0.1634, + "step": 1190 + }, + { + "epoch": 6.828571428571428, + "grad_norm": 0.867765486240387, + "learning_rate": 0.00019765714285714282, + "loss": 0.1353, + "step": 1195 + }, + { + "epoch": 6.857142857142857, + "grad_norm": 0.7325594425201416, + "learning_rate": 0.00019722857142857143, + "loss": 0.118, + "step": 1200 + }, + { + "epoch": 6.885714285714286, + "grad_norm": 0.7029078006744385, + "learning_rate": 0.00019679999999999999, + "loss": 0.1425, + "step": 1205 + }, + { + "epoch": 6.914285714285715, + "grad_norm": 1.1572504043579102, + "learning_rate": 0.00019637142857142857, + "loss": 0.1337, + "step": 1210 + }, + { + "epoch": 6.942857142857143, + "grad_norm": 0.8022822141647339, + "learning_rate": 0.00019594285714285713, + "loss": 0.1684, + "step": 1215 + }, + { + "epoch": 6.9714285714285715, + "grad_norm": 0.6729874610900879, + "learning_rate": 0.00019551428571428568, + "loss": 0.1238, + "step": 1220 + }, + { + "epoch": 7.0, + "grad_norm": 0.5773627758026123, + "learning_rate": 0.00019508571428571427, + "loss": 0.138, + "step": 1225 + }, + { + "epoch": 7.0285714285714285, + "grad_norm": 0.7182291150093079, + "learning_rate": 0.00019465714285714285, + "loss": 0.1431, + "step": 1230 + }, + { + "epoch": 7.057142857142857, + "grad_norm": 1.7567912340164185, + "learning_rate": 0.0001942285714285714, + "loss": 0.1319, + "step": 1235 + }, + { + "epoch": 7.085714285714285, + "grad_norm": 0.6845232248306274, + "learning_rate": 0.0001938, + "loss": 0.1292, + "step": 1240 + }, + { + "epoch": 7.114285714285714, + "grad_norm": 0.6077771782875061, + "learning_rate": 0.00019337142857142855, + "loss": 0.1238, + "step": 1245 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 0.6168347597122192, + "learning_rate": 0.0001929428571428571, + "loss": 0.1384, + "step": 1250 + }, + { + "epoch": 7.171428571428572, + "grad_norm": 0.7457576394081116, + "learning_rate": 0.0001925142857142857, + "loss": 0.1306, + "step": 1255 + }, + { + "epoch": 7.2, + "grad_norm": 0.5969316363334656, + "learning_rate": 0.00019208571428571425, + "loss": 0.1123, + "step": 1260 + }, + { + "epoch": 7.228571428571429, + "grad_norm": 0.6902753710746765, + "learning_rate": 0.00019165714285714286, + "loss": 0.1185, + "step": 1265 + }, + { + "epoch": 7.257142857142857, + "grad_norm": 0.6488338112831116, + "learning_rate": 0.00019122857142857142, + "loss": 0.1431, + "step": 1270 + }, + { + "epoch": 7.285714285714286, + "grad_norm": 0.6814819574356079, + "learning_rate": 0.00019079999999999998, + "loss": 0.1495, + "step": 1275 + }, + { + "epoch": 7.314285714285714, + "grad_norm": 0.7468088865280151, + "learning_rate": 0.00019037142857142856, + "loss": 0.1158, + "step": 1280 + }, + { + "epoch": 7.3428571428571425, + "grad_norm": 0.7417412400245667, + "learning_rate": 0.00018994285714285712, + "loss": 0.1311, + "step": 1285 + }, + { + "epoch": 7.371428571428572, + "grad_norm": 0.5480664372444153, + "learning_rate": 0.00018951428571428567, + "loss": 0.135, + "step": 1290 + }, + { + "epoch": 7.4, + "grad_norm": 0.725527822971344, + "learning_rate": 0.00018908571428571429, + "loss": 0.1217, + "step": 1295 + }, + { + "epoch": 7.428571428571429, + "grad_norm": 0.6566678285598755, + "learning_rate": 0.00018865714285714284, + "loss": 0.1417, + "step": 1300 + }, + { + "epoch": 7.457142857142857, + "grad_norm": 0.516952395439148, + "learning_rate": 0.00018822857142857143, + "loss": 0.1329, + "step": 1305 + }, + { + "epoch": 7.485714285714286, + "grad_norm": 1.9545241594314575, + "learning_rate": 0.00018779999999999998, + "loss": 0.1339, + "step": 1310 + }, + { + "epoch": 7.514285714285714, + "grad_norm": 0.8276839852333069, + "learning_rate": 0.00018737142857142854, + "loss": 0.1324, + "step": 1315 + }, + { + "epoch": 7.542857142857143, + "grad_norm": 0.6737099289894104, + "learning_rate": 0.00018694285714285713, + "loss": 0.1139, + "step": 1320 + }, + { + "epoch": 7.571428571428571, + "grad_norm": 0.6914472579956055, + "learning_rate": 0.00018651428571428568, + "loss": 0.1146, + "step": 1325 + }, + { + "epoch": 7.6, + "grad_norm": 0.6630033850669861, + "learning_rate": 0.0001860857142857143, + "loss": 0.1571, + "step": 1330 + }, + { + "epoch": 7.628571428571428, + "grad_norm": 0.820688784122467, + "learning_rate": 0.00018565714285714285, + "loss": 0.15, + "step": 1335 + }, + { + "epoch": 7.6571428571428575, + "grad_norm": 2.0491325855255127, + "learning_rate": 0.0001852285714285714, + "loss": 0.127, + "step": 1340 + }, + { + "epoch": 7.685714285714286, + "grad_norm": 0.9327268004417419, + "learning_rate": 0.0001848, + "loss": 0.1289, + "step": 1345 + }, + { + "epoch": 7.714285714285714, + "grad_norm": 1.3131701946258545, + "learning_rate": 0.00018437142857142855, + "loss": 0.1228, + "step": 1350 + }, + { + "epoch": 7.742857142857143, + "grad_norm": 2.955918312072754, + "learning_rate": 0.0001839428571428571, + "loss": 0.1082, + "step": 1355 + }, + { + "epoch": 7.771428571428571, + "grad_norm": 1.2165493965148926, + "learning_rate": 0.00018351428571428572, + "loss": 0.1688, + "step": 1360 + }, + { + "epoch": 7.8, + "grad_norm": 0.759324312210083, + "learning_rate": 0.00018308571428571428, + "loss": 0.1185, + "step": 1365 + }, + { + "epoch": 7.828571428571428, + "grad_norm": 0.7445591688156128, + "learning_rate": 0.00018265714285714286, + "loss": 0.1431, + "step": 1370 + }, + { + "epoch": 7.857142857142857, + "grad_norm": 0.679374098777771, + "learning_rate": 0.00018222857142857142, + "loss": 0.1451, + "step": 1375 + }, + { + "epoch": 7.885714285714286, + "grad_norm": 2.1234302520751953, + "learning_rate": 0.00018179999999999997, + "loss": 0.1265, + "step": 1380 + }, + { + "epoch": 7.914285714285715, + "grad_norm": 1.006521224975586, + "learning_rate": 0.00018137142857142856, + "loss": 0.1722, + "step": 1385 + }, + { + "epoch": 7.942857142857143, + "grad_norm": 0.7275253534317017, + "learning_rate": 0.00018094285714285712, + "loss": 0.1625, + "step": 1390 + }, + { + "epoch": 7.9714285714285715, + "grad_norm": 0.8612022995948792, + "learning_rate": 0.0001805142857142857, + "loss": 0.1345, + "step": 1395 + }, + { + "epoch": 8.0, + "grad_norm": 0.7276798486709595, + "learning_rate": 0.00018008571428571428, + "loss": 0.1236, + "step": 1400 + }, + { + "epoch": 8.028571428571428, + "grad_norm": 0.8731086850166321, + "learning_rate": 0.00017965714285714284, + "loss": 0.1604, + "step": 1405 + }, + { + "epoch": 8.057142857142857, + "grad_norm": 0.8950818777084351, + "learning_rate": 0.0001792285714285714, + "loss": 0.1531, + "step": 1410 + }, + { + "epoch": 8.085714285714285, + "grad_norm": 0.7399356365203857, + "learning_rate": 0.00017879999999999998, + "loss": 0.1508, + "step": 1415 + }, + { + "epoch": 8.114285714285714, + "grad_norm": 1.3727307319641113, + "learning_rate": 0.00017837142857142854, + "loss": 0.1487, + "step": 1420 + }, + { + "epoch": 8.142857142857142, + "grad_norm": 0.5938125848770142, + "learning_rate": 0.00017794285714285715, + "loss": 0.1303, + "step": 1425 + }, + { + "epoch": 8.17142857142857, + "grad_norm": 0.7043821811676025, + "learning_rate": 0.0001775142857142857, + "loss": 0.0948, + "step": 1430 + }, + { + "epoch": 8.2, + "grad_norm": 1.1062767505645752, + "learning_rate": 0.00017708571428571426, + "loss": 0.1412, + "step": 1435 + }, + { + "epoch": 8.228571428571428, + "grad_norm": 0.844832181930542, + "learning_rate": 0.00017665714285714285, + "loss": 0.113, + "step": 1440 + }, + { + "epoch": 8.257142857142856, + "grad_norm": 0.7564154863357544, + "learning_rate": 0.0001762285714285714, + "loss": 0.1319, + "step": 1445 + }, + { + "epoch": 8.285714285714286, + "grad_norm": 0.8843110203742981, + "learning_rate": 0.00017579999999999996, + "loss": 0.1206, + "step": 1450 + }, + { + "epoch": 8.314285714285715, + "grad_norm": 0.8175828456878662, + "learning_rate": 0.00017537142857142855, + "loss": 0.1327, + "step": 1455 + }, + { + "epoch": 8.342857142857143, + "grad_norm": 0.6443565487861633, + "learning_rate": 0.00017494285714285713, + "loss": 0.1239, + "step": 1460 + }, + { + "epoch": 8.371428571428572, + "grad_norm": 0.7237185835838318, + "learning_rate": 0.00017451428571428572, + "loss": 0.1639, + "step": 1465 + }, + { + "epoch": 8.4, + "grad_norm": 0.6118057370185852, + "learning_rate": 0.00017408571428571427, + "loss": 0.1363, + "step": 1470 + }, + { + "epoch": 8.428571428571429, + "grad_norm": 0.6754649877548218, + "learning_rate": 0.00017365714285714283, + "loss": 0.1187, + "step": 1475 + }, + { + "epoch": 8.457142857142857, + "grad_norm": 1.0067390203475952, + "learning_rate": 0.00017322857142857141, + "loss": 0.1401, + "step": 1480 + }, + { + "epoch": 8.485714285714286, + "grad_norm": 8.509544372558594, + "learning_rate": 0.00017279999999999997, + "loss": 0.1304, + "step": 1485 + }, + { + "epoch": 8.514285714285714, + "grad_norm": 4.2030205726623535, + "learning_rate": 0.00017237142857142858, + "loss": 0.121, + "step": 1490 + }, + { + "epoch": 8.542857142857143, + "grad_norm": 4.877438068389893, + "learning_rate": 0.00017194285714285714, + "loss": 0.1918, + "step": 1495 + }, + { + "epoch": 8.571428571428571, + "grad_norm": 6.4971232414245605, + "learning_rate": 0.0001715142857142857, + "loss": 0.2154, + "step": 1500 + }, + { + "epoch": 8.6, + "grad_norm": 4.365469932556152, + "learning_rate": 0.00017108571428571428, + "loss": 0.2272, + "step": 1505 + }, + { + "epoch": 8.628571428571428, + "grad_norm": 2.551957845687866, + "learning_rate": 0.00017065714285714284, + "loss": 0.2163, + "step": 1510 + }, + { + "epoch": 8.657142857142857, + "grad_norm": 5.326391220092773, + "learning_rate": 0.0001702285714285714, + "loss": 0.1612, + "step": 1515 + }, + { + "epoch": 8.685714285714285, + "grad_norm": 1.3528404235839844, + "learning_rate": 0.00016979999999999998, + "loss": 0.1636, + "step": 1520 + }, + { + "epoch": 8.714285714285714, + "grad_norm": 1.4466065168380737, + "learning_rate": 0.00016937142857142856, + "loss": 0.1295, + "step": 1525 + }, + { + "epoch": 8.742857142857144, + "grad_norm": 0.6576040387153625, + "learning_rate": 0.00016894285714285715, + "loss": 0.1318, + "step": 1530 + }, + { + "epoch": 8.771428571428572, + "grad_norm": 1.286942958831787, + "learning_rate": 0.0001685142857142857, + "loss": 0.1443, + "step": 1535 + }, + { + "epoch": 8.8, + "grad_norm": 9.474458694458008, + "learning_rate": 0.00016808571428571426, + "loss": 0.1313, + "step": 1540 + }, + { + "epoch": 8.82857142857143, + "grad_norm": 2.6731069087982178, + "learning_rate": 0.00016765714285714285, + "loss": 0.1485, + "step": 1545 + }, + { + "epoch": 8.857142857142858, + "grad_norm": 1.313723087310791, + "learning_rate": 0.0001672285714285714, + "loss": 0.1346, + "step": 1550 + }, + { + "epoch": 8.885714285714286, + "grad_norm": 1.7115576267242432, + "learning_rate": 0.0001668, + "loss": 0.1471, + "step": 1555 + }, + { + "epoch": 8.914285714285715, + "grad_norm": 1.2599923610687256, + "learning_rate": 0.00016637142857142857, + "loss": 0.1433, + "step": 1560 + }, + { + "epoch": 8.942857142857143, + "grad_norm": 0.9659029245376587, + "learning_rate": 0.00016594285714285713, + "loss": 0.1256, + "step": 1565 + }, + { + "epoch": 8.971428571428572, + "grad_norm": 1.1282744407653809, + "learning_rate": 0.0001655142857142857, + "loss": 0.1373, + "step": 1570 + }, + { + "epoch": 9.0, + "grad_norm": 3.20717453956604, + "learning_rate": 0.00016508571428571427, + "loss": 0.1355, + "step": 1575 + }, + { + "epoch": 9.028571428571428, + "grad_norm": 0.8310821056365967, + "learning_rate": 0.00016465714285714283, + "loss": 0.1268, + "step": 1580 + }, + { + "epoch": 9.057142857142857, + "grad_norm": 1.5337790250778198, + "learning_rate": 0.00016422857142857139, + "loss": 0.1267, + "step": 1585 + }, + { + "epoch": 9.085714285714285, + "grad_norm": 2.6406068801879883, + "learning_rate": 0.0001638, + "loss": 0.1363, + "step": 1590 + }, + { + "epoch": 9.114285714285714, + "grad_norm": 0.7705873847007751, + "learning_rate": 0.00016337142857142855, + "loss": 0.1291, + "step": 1595 + }, + { + "epoch": 9.142857142857142, + "grad_norm": 0.7092650532722473, + "learning_rate": 0.00016294285714285714, + "loss": 0.1435, + "step": 1600 + }, + { + "epoch": 9.17142857142857, + "grad_norm": 1.098961591720581, + "learning_rate": 0.0001625142857142857, + "loss": 0.1471, + "step": 1605 + }, + { + "epoch": 9.2, + "grad_norm": 0.6994885206222534, + "learning_rate": 0.00016208571428571425, + "loss": 0.1345, + "step": 1610 + }, + { + "epoch": 9.228571428571428, + "grad_norm": 0.9613476991653442, + "learning_rate": 0.00016165714285714284, + "loss": 0.1399, + "step": 1615 + }, + { + "epoch": 9.257142857142856, + "grad_norm": 0.675588846206665, + "learning_rate": 0.00016122857142857142, + "loss": 0.1319, + "step": 1620 + }, + { + "epoch": 9.285714285714286, + "grad_norm": 0.7519372701644897, + "learning_rate": 0.0001608, + "loss": 0.137, + "step": 1625 + }, + { + "epoch": 9.314285714285715, + "grad_norm": 1.135025978088379, + "learning_rate": 0.00016037142857142856, + "loss": 0.1322, + "step": 1630 + }, + { + "epoch": 9.342857142857143, + "grad_norm": 0.7462936639785767, + "learning_rate": 0.00015994285714285712, + "loss": 0.1215, + "step": 1635 + }, + { + "epoch": 9.371428571428572, + "grad_norm": 0.9042088985443115, + "learning_rate": 0.0001595142857142857, + "loss": 0.1191, + "step": 1640 + }, + { + "epoch": 9.4, + "grad_norm": 0.567828893661499, + "learning_rate": 0.00015908571428571426, + "loss": 0.1189, + "step": 1645 + }, + { + "epoch": 9.428571428571429, + "grad_norm": 0.981585681438446, + "learning_rate": 0.00015865714285714282, + "loss": 0.128, + "step": 1650 + }, + { + "epoch": 9.457142857142857, + "grad_norm": 1.24985933303833, + "learning_rate": 0.00015822857142857143, + "loss": 0.1315, + "step": 1655 + }, + { + "epoch": 9.485714285714286, + "grad_norm": 0.6517993211746216, + "learning_rate": 0.0001578, + "loss": 0.1076, + "step": 1660 + }, + { + "epoch": 9.514285714285714, + "grad_norm": 1.166628122329712, + "learning_rate": 0.00015737142857142857, + "loss": 0.1345, + "step": 1665 + }, + { + "epoch": 9.542857142857143, + "grad_norm": 0.9763592481613159, + "learning_rate": 0.00015694285714285713, + "loss": 0.1449, + "step": 1670 + }, + { + "epoch": 9.571428571428571, + "grad_norm": 0.7829060554504395, + "learning_rate": 0.00015651428571428569, + "loss": 0.1117, + "step": 1675 + }, + { + "epoch": 9.6, + "grad_norm": 0.6693719029426575, + "learning_rate": 0.00015608571428571427, + "loss": 0.1129, + "step": 1680 + }, + { + "epoch": 9.628571428571428, + "grad_norm": 1.2122846841812134, + "learning_rate": 0.00015565714285714285, + "loss": 0.1125, + "step": 1685 + }, + { + "epoch": 9.657142857142857, + "grad_norm": 1.0689371824264526, + "learning_rate": 0.0001552285714285714, + "loss": 0.1478, + "step": 1690 + }, + { + "epoch": 9.685714285714285, + "grad_norm": 1.8511656522750854, + "learning_rate": 0.0001548, + "loss": 0.1431, + "step": 1695 + }, + { + "epoch": 9.714285714285714, + "grad_norm": 0.6706506609916687, + "learning_rate": 0.00015437142857142855, + "loss": 0.1262, + "step": 1700 + }, + { + "epoch": 9.742857142857144, + "grad_norm": 1.0798784494400024, + "learning_rate": 0.00015394285714285714, + "loss": 0.1275, + "step": 1705 + }, + { + "epoch": 9.771428571428572, + "grad_norm": 0.7915983200073242, + "learning_rate": 0.0001535142857142857, + "loss": 0.1316, + "step": 1710 + }, + { + "epoch": 9.8, + "grad_norm": 1.8630567789077759, + "learning_rate": 0.00015308571428571425, + "loss": 0.1258, + "step": 1715 + }, + { + "epoch": 9.82857142857143, + "grad_norm": 0.7807756662368774, + "learning_rate": 0.00015265714285714286, + "loss": 0.1079, + "step": 1720 + }, + { + "epoch": 9.857142857142858, + "grad_norm": 1.4698439836502075, + "learning_rate": 0.00015222857142857142, + "loss": 0.1357, + "step": 1725 + }, + { + "epoch": 9.885714285714286, + "grad_norm": 1.2121926546096802, + "learning_rate": 0.00015179999999999998, + "loss": 0.1322, + "step": 1730 + }, + { + "epoch": 9.914285714285715, + "grad_norm": 0.6348568201065063, + "learning_rate": 0.00015137142857142856, + "loss": 0.0893, + "step": 1735 + }, + { + "epoch": 9.942857142857143, + "grad_norm": 0.6694422364234924, + "learning_rate": 0.00015094285714285712, + "loss": 0.1189, + "step": 1740 + }, + { + "epoch": 9.971428571428572, + "grad_norm": 0.569332480430603, + "learning_rate": 0.00015051428571428567, + "loss": 0.1349, + "step": 1745 + }, + { + "epoch": 10.0, + "grad_norm": 0.934073269367218, + "learning_rate": 0.00015008571428571429, + "loss": 0.1237, + "step": 1750 + }, + { + "epoch": 10.028571428571428, + "grad_norm": 0.7191672325134277, + "learning_rate": 0.00014965714285714284, + "loss": 0.1308, + "step": 1755 + }, + { + "epoch": 10.057142857142857, + "grad_norm": 0.7006493806838989, + "learning_rate": 0.00014922857142857143, + "loss": 0.104, + "step": 1760 + }, + { + "epoch": 10.085714285714285, + "grad_norm": 0.9030678272247314, + "learning_rate": 0.00014879999999999998, + "loss": 0.1308, + "step": 1765 + }, + { + "epoch": 10.114285714285714, + "grad_norm": 0.7007766366004944, + "learning_rate": 0.00014837142857142854, + "loss": 0.1044, + "step": 1770 + }, + { + "epoch": 10.142857142857142, + "grad_norm": 0.4832770824432373, + "learning_rate": 0.00014794285714285713, + "loss": 0.1119, + "step": 1775 + }, + { + "epoch": 10.17142857142857, + "grad_norm": 0.7819458842277527, + "learning_rate": 0.0001475142857142857, + "loss": 0.1087, + "step": 1780 + }, + { + "epoch": 10.2, + "grad_norm": 1.0223525762557983, + "learning_rate": 0.00014708571428571427, + "loss": 0.1314, + "step": 1785 + }, + { + "epoch": 10.228571428571428, + "grad_norm": 0.6224566698074341, + "learning_rate": 0.00014665714285714285, + "loss": 0.1159, + "step": 1790 + }, + { + "epoch": 10.257142857142856, + "grad_norm": 0.45800235867500305, + "learning_rate": 0.0001462285714285714, + "loss": 0.0942, + "step": 1795 + }, + { + "epoch": 10.285714285714286, + "grad_norm": 0.6258400082588196, + "learning_rate": 0.0001458, + "loss": 0.1079, + "step": 1800 + }, + { + "epoch": 10.314285714285715, + "grad_norm": 1.1812794208526611, + "learning_rate": 0.00014537142857142858, + "loss": 0.1378, + "step": 1805 + }, + { + "epoch": 10.342857142857143, + "grad_norm": 0.8541269898414612, + "learning_rate": 0.00014494285714285713, + "loss": 0.1274, + "step": 1810 + }, + { + "epoch": 10.371428571428572, + "grad_norm": 0.7131860256195068, + "learning_rate": 0.0001445142857142857, + "loss": 0.1247, + "step": 1815 + }, + { + "epoch": 10.4, + "grad_norm": 0.6109820008277893, + "learning_rate": 0.00014408571428571428, + "loss": 0.1246, + "step": 1820 + }, + { + "epoch": 10.428571428571429, + "grad_norm": 0.5621510744094849, + "learning_rate": 0.00014365714285714286, + "loss": 0.1039, + "step": 1825 + }, + { + "epoch": 10.457142857142857, + "grad_norm": 1.022777795791626, + "learning_rate": 0.00014322857142857142, + "loss": 0.1206, + "step": 1830 + }, + { + "epoch": 10.485714285714286, + "grad_norm": 0.9120668768882751, + "learning_rate": 0.00014279999999999997, + "loss": 0.1289, + "step": 1835 + }, + { + "epoch": 10.514285714285714, + "grad_norm": 1.1882030963897705, + "learning_rate": 0.00014237142857142856, + "loss": 0.1194, + "step": 1840 + }, + { + "epoch": 10.542857142857143, + "grad_norm": 0.6078401207923889, + "learning_rate": 0.00014194285714285714, + "loss": 0.1339, + "step": 1845 + }, + { + "epoch": 10.571428571428571, + "grad_norm": 0.7380999326705933, + "learning_rate": 0.0001415142857142857, + "loss": 0.1318, + "step": 1850 + }, + { + "epoch": 10.6, + "grad_norm": 0.5884959101676941, + "learning_rate": 0.00014108571428571428, + "loss": 0.1249, + "step": 1855 + }, + { + "epoch": 10.628571428571428, + "grad_norm": 1.0121936798095703, + "learning_rate": 0.00014065714285714284, + "loss": 0.1137, + "step": 1860 + }, + { + "epoch": 10.657142857142857, + "grad_norm": 0.6444916129112244, + "learning_rate": 0.00014022857142857143, + "loss": 0.1213, + "step": 1865 + }, + { + "epoch": 10.685714285714285, + "grad_norm": 0.7931004762649536, + "learning_rate": 0.00013979999999999998, + "loss": 0.1318, + "step": 1870 + }, + { + "epoch": 10.714285714285714, + "grad_norm": 0.5596404075622559, + "learning_rate": 0.00013937142857142857, + "loss": 0.1075, + "step": 1875 + }, + { + "epoch": 10.742857142857144, + "grad_norm": 0.6586474180221558, + "learning_rate": 0.00013894285714285712, + "loss": 0.13, + "step": 1880 + }, + { + "epoch": 10.771428571428572, + "grad_norm": 1.0195013284683228, + "learning_rate": 0.00013851428571428568, + "loss": 0.1373, + "step": 1885 + }, + { + "epoch": 10.8, + "grad_norm": 0.9233512878417969, + "learning_rate": 0.00013808571428571427, + "loss": 0.1168, + "step": 1890 + }, + { + "epoch": 10.82857142857143, + "grad_norm": 0.7154092788696289, + "learning_rate": 0.00013765714285714285, + "loss": 0.1081, + "step": 1895 + }, + { + "epoch": 10.857142857142858, + "grad_norm": 1.4588117599487305, + "learning_rate": 0.0001372285714285714, + "loss": 0.1061, + "step": 1900 + }, + { + "epoch": 10.885714285714286, + "grad_norm": 0.6087035536766052, + "learning_rate": 0.0001368, + "loss": 0.1157, + "step": 1905 + }, + { + "epoch": 10.914285714285715, + "grad_norm": 0.7371247410774231, + "learning_rate": 0.00013637142857142855, + "loss": 0.1339, + "step": 1910 + }, + { + "epoch": 10.942857142857143, + "grad_norm": 0.8253212571144104, + "learning_rate": 0.00013594285714285713, + "loss": 0.1198, + "step": 1915 + }, + { + "epoch": 10.971428571428572, + "grad_norm": 0.6889544129371643, + "learning_rate": 0.00013551428571428572, + "loss": 0.1131, + "step": 1920 + }, + { + "epoch": 11.0, + "grad_norm": 0.6408224105834961, + "learning_rate": 0.00013508571428571427, + "loss": 0.122, + "step": 1925 + }, + { + "epoch": 11.028571428571428, + "grad_norm": 0.6771185398101807, + "learning_rate": 0.00013465714285714283, + "loss": 0.1492, + "step": 1930 + }, + { + "epoch": 11.057142857142857, + "grad_norm": 0.8706450462341309, + "learning_rate": 0.00013422857142857142, + "loss": 0.1294, + "step": 1935 + }, + { + "epoch": 11.085714285714285, + "grad_norm": 1.730648398399353, + "learning_rate": 0.0001338, + "loss": 0.1004, + "step": 1940 + }, + { + "epoch": 11.114285714285714, + "grad_norm": 0.6985113620758057, + "learning_rate": 0.00013337142857142856, + "loss": 0.0995, + "step": 1945 + }, + { + "epoch": 11.142857142857142, + "grad_norm": 0.8901951313018799, + "learning_rate": 0.00013294285714285711, + "loss": 0.1179, + "step": 1950 + }, + { + "epoch": 11.17142857142857, + "grad_norm": 0.7232164144515991, + "learning_rate": 0.0001325142857142857, + "loss": 0.1397, + "step": 1955 + }, + { + "epoch": 11.2, + "grad_norm": 0.6447544693946838, + "learning_rate": 0.00013208571428571428, + "loss": 0.1366, + "step": 1960 + }, + { + "epoch": 11.228571428571428, + "grad_norm": 0.7964944243431091, + "learning_rate": 0.00013165714285714284, + "loss": 0.1121, + "step": 1965 + }, + { + "epoch": 11.257142857142856, + "grad_norm": 0.9012628793716431, + "learning_rate": 0.00013122857142857142, + "loss": 0.1131, + "step": 1970 + }, + { + "epoch": 11.285714285714286, + "grad_norm": 0.9295369982719421, + "learning_rate": 0.00013079999999999998, + "loss": 0.1232, + "step": 1975 + }, + { + "epoch": 11.314285714285715, + "grad_norm": 0.6237708926200867, + "learning_rate": 0.00013037142857142857, + "loss": 0.1066, + "step": 1980 + }, + { + "epoch": 11.342857142857143, + "grad_norm": 0.5250967741012573, + "learning_rate": 0.00012994285714285715, + "loss": 0.118, + "step": 1985 + }, + { + "epoch": 11.371428571428572, + "grad_norm": 1.0013964176177979, + "learning_rate": 0.0001295142857142857, + "loss": 0.1125, + "step": 1990 + }, + { + "epoch": 11.4, + "grad_norm": 0.6721311807632446, + "learning_rate": 0.00012908571428571426, + "loss": 0.1196, + "step": 1995 + }, + { + "epoch": 11.428571428571429, + "grad_norm": 0.6966421008110046, + "learning_rate": 0.00012865714285714285, + "loss": 0.1172, + "step": 2000 + }, + { + "epoch": 11.457142857142857, + "grad_norm": 0.8811460733413696, + "learning_rate": 0.00012822857142857143, + "loss": 0.135, + "step": 2005 + }, + { + "epoch": 11.485714285714286, + "grad_norm": 0.8829531073570251, + "learning_rate": 0.0001278, + "loss": 0.1288, + "step": 2010 + }, + { + "epoch": 11.514285714285714, + "grad_norm": 0.7530654668807983, + "learning_rate": 0.00012737142857142855, + "loss": 0.1073, + "step": 2015 + }, + { + "epoch": 11.542857142857143, + "grad_norm": 0.513940691947937, + "learning_rate": 0.00012694285714285713, + "loss": 0.121, + "step": 2020 + }, + { + "epoch": 11.571428571428571, + "grad_norm": 0.8574968576431274, + "learning_rate": 0.0001265142857142857, + "loss": 0.1103, + "step": 2025 + }, + { + "epoch": 11.6, + "grad_norm": 0.7482439875602722, + "learning_rate": 0.00012608571428571427, + "loss": 0.1027, + "step": 2030 + }, + { + "epoch": 11.628571428571428, + "grad_norm": 0.8367976546287537, + "learning_rate": 0.00012565714285714286, + "loss": 0.1181, + "step": 2035 + }, + { + "epoch": 11.657142857142857, + "grad_norm": 2.048128366470337, + "learning_rate": 0.0001252285714285714, + "loss": 0.1122, + "step": 2040 + }, + { + "epoch": 11.685714285714285, + "grad_norm": 0.7426862716674805, + "learning_rate": 0.00012479999999999997, + "loss": 0.1169, + "step": 2045 + }, + { + "epoch": 11.714285714285714, + "grad_norm": 3.093841791152954, + "learning_rate": 0.00012437142857142855, + "loss": 0.1164, + "step": 2050 + }, + { + "epoch": 11.742857142857144, + "grad_norm": 0.8172643184661865, + "learning_rate": 0.00012394285714285714, + "loss": 0.1354, + "step": 2055 + }, + { + "epoch": 11.771428571428572, + "grad_norm": 1.9950591325759888, + "learning_rate": 0.0001235142857142857, + "loss": 0.1037, + "step": 2060 + }, + { + "epoch": 11.8, + "grad_norm": 0.5929077863693237, + "learning_rate": 0.00012308571428571428, + "loss": 0.1194, + "step": 2065 + }, + { + "epoch": 11.82857142857143, + "grad_norm": 1.293624997138977, + "learning_rate": 0.00012265714285714284, + "loss": 0.12, + "step": 2070 + }, + { + "epoch": 11.857142857142858, + "grad_norm": 1.0515168905258179, + "learning_rate": 0.00012222857142857142, + "loss": 0.1049, + "step": 2075 + }, + { + "epoch": 11.885714285714286, + "grad_norm": 1.2874428033828735, + "learning_rate": 0.00012179999999999999, + "loss": 0.115, + "step": 2080 + }, + { + "epoch": 11.914285714285715, + "grad_norm": 0.7317278385162354, + "learning_rate": 0.00012137142857142856, + "loss": 0.1184, + "step": 2085 + }, + { + "epoch": 11.942857142857143, + "grad_norm": 1.3407148122787476, + "learning_rate": 0.00012094285714285713, + "loss": 0.132, + "step": 2090 + }, + { + "epoch": 11.971428571428572, + "grad_norm": 2.656409502029419, + "learning_rate": 0.00012051428571428569, + "loss": 0.1359, + "step": 2095 + }, + { + "epoch": 12.0, + "grad_norm": 0.7189064025878906, + "learning_rate": 0.00012008571428571428, + "loss": 0.1217, + "step": 2100 + }, + { + "epoch": 12.028571428571428, + "grad_norm": 0.7510334849357605, + "learning_rate": 0.00011965714285714285, + "loss": 0.109, + "step": 2105 + }, + { + "epoch": 12.057142857142857, + "grad_norm": 0.7235113382339478, + "learning_rate": 0.00011922857142857142, + "loss": 0.1114, + "step": 2110 + }, + { + "epoch": 12.085714285714285, + "grad_norm": 1.7435882091522217, + "learning_rate": 0.0001188, + "loss": 0.1357, + "step": 2115 + }, + { + "epoch": 12.114285714285714, + "grad_norm": 1.170392632484436, + "learning_rate": 0.00011837142857142856, + "loss": 0.1255, + "step": 2120 + }, + { + "epoch": 12.142857142857142, + "grad_norm": 0.6476783752441406, + "learning_rate": 0.00011794285714285713, + "loss": 0.1108, + "step": 2125 + }, + { + "epoch": 12.17142857142857, + "grad_norm": 0.8599929213523865, + "learning_rate": 0.00011751428571428571, + "loss": 0.0997, + "step": 2130 + }, + { + "epoch": 12.2, + "grad_norm": 0.8918687105178833, + "learning_rate": 0.00011708571428571428, + "loss": 0.1149, + "step": 2135 + }, + { + "epoch": 12.228571428571428, + "grad_norm": 1.609435796737671, + "learning_rate": 0.00011665714285714284, + "loss": 0.1136, + "step": 2140 + }, + { + "epoch": 12.257142857142856, + "grad_norm": 0.6206801533699036, + "learning_rate": 0.00011622857142857143, + "loss": 0.1135, + "step": 2145 + }, + { + "epoch": 12.285714285714286, + "grad_norm": 0.8769077658653259, + "learning_rate": 0.0001158, + "loss": 0.1344, + "step": 2150 + }, + { + "epoch": 12.314285714285715, + "grad_norm": 0.6279401183128357, + "learning_rate": 0.00011537142857142855, + "loss": 0.1049, + "step": 2155 + }, + { + "epoch": 12.342857142857143, + "grad_norm": 1.1110137701034546, + "learning_rate": 0.00011494285714285712, + "loss": 0.1146, + "step": 2160 + }, + { + "epoch": 12.371428571428572, + "grad_norm": 0.7911233901977539, + "learning_rate": 0.00011451428571428571, + "loss": 0.1257, + "step": 2165 + }, + { + "epoch": 12.4, + "grad_norm": 0.9691207408905029, + "learning_rate": 0.00011408571428571428, + "loss": 0.1226, + "step": 2170 + }, + { + "epoch": 12.428571428571429, + "grad_norm": 0.6168835759162903, + "learning_rate": 0.00011365714285714284, + "loss": 0.1271, + "step": 2175 + }, + { + "epoch": 12.457142857142857, + "grad_norm": 0.6143497228622437, + "learning_rate": 0.00011322857142857142, + "loss": 0.111, + "step": 2180 + }, + { + "epoch": 12.485714285714286, + "grad_norm": 1.5673450231552124, + "learning_rate": 0.00011279999999999999, + "loss": 0.1186, + "step": 2185 + }, + { + "epoch": 12.514285714285714, + "grad_norm": 1.298756718635559, + "learning_rate": 0.00011237142857142856, + "loss": 0.1024, + "step": 2190 + }, + { + "epoch": 12.542857142857143, + "grad_norm": 0.9484918117523193, + "learning_rate": 0.00011194285714285715, + "loss": 0.1171, + "step": 2195 + }, + { + "epoch": 12.571428571428571, + "grad_norm": 0.725705623626709, + "learning_rate": 0.0001115142857142857, + "loss": 0.1216, + "step": 2200 + }, + { + "epoch": 12.6, + "grad_norm": 1.1394798755645752, + "learning_rate": 0.00011108571428571427, + "loss": 0.1132, + "step": 2205 + }, + { + "epoch": 12.628571428571428, + "grad_norm": 0.9548712968826294, + "learning_rate": 0.00011065714285714286, + "loss": 0.1209, + "step": 2210 + }, + { + "epoch": 12.657142857142857, + "grad_norm": 0.6173953413963318, + "learning_rate": 0.00011022857142857143, + "loss": 0.1049, + "step": 2215 + }, + { + "epoch": 12.685714285714285, + "grad_norm": 0.8227205872535706, + "learning_rate": 0.00010979999999999999, + "loss": 0.1045, + "step": 2220 + }, + { + "epoch": 12.714285714285714, + "grad_norm": 0.7252780795097351, + "learning_rate": 0.00010937142857142856, + "loss": 0.1146, + "step": 2225 + }, + { + "epoch": 12.742857142857144, + "grad_norm": 0.9374399781227112, + "learning_rate": 0.00010894285714285714, + "loss": 0.1478, + "step": 2230 + }, + { + "epoch": 12.771428571428572, + "grad_norm": 5.1985368728637695, + "learning_rate": 0.0001085142857142857, + "loss": 0.1059, + "step": 2235 + }, + { + "epoch": 12.8, + "grad_norm": 0.9629620909690857, + "learning_rate": 0.00010808571428571427, + "loss": 0.124, + "step": 2240 + }, + { + "epoch": 12.82857142857143, + "grad_norm": 0.7022290229797363, + "learning_rate": 0.00010765714285714285, + "loss": 0.1309, + "step": 2245 + }, + { + "epoch": 12.857142857142858, + "grad_norm": 0.574188232421875, + "learning_rate": 0.00010722857142857142, + "loss": 0.086, + "step": 2250 + }, + { + "epoch": 12.885714285714286, + "grad_norm": 0.9712439179420471, + "learning_rate": 0.00010679999999999998, + "loss": 0.1152, + "step": 2255 + }, + { + "epoch": 12.914285714285715, + "grad_norm": 0.6562150120735168, + "learning_rate": 0.00010637142857142856, + "loss": 0.1343, + "step": 2260 + }, + { + "epoch": 12.942857142857143, + "grad_norm": 0.6936819553375244, + "learning_rate": 0.00010594285714285714, + "loss": 0.1009, + "step": 2265 + }, + { + "epoch": 12.971428571428572, + "grad_norm": 0.8664882779121399, + "learning_rate": 0.0001055142857142857, + "loss": 0.1164, + "step": 2270 + }, + { + "epoch": 13.0, + "grad_norm": 0.9224509000778198, + "learning_rate": 0.00010508571428571429, + "loss": 0.1347, + "step": 2275 + }, + { + "epoch": 13.028571428571428, + "grad_norm": 0.6596968770027161, + "learning_rate": 0.00010465714285714285, + "loss": 0.1041, + "step": 2280 + }, + { + "epoch": 13.057142857142857, + "grad_norm": 0.6456631422042847, + "learning_rate": 0.00010422857142857142, + "loss": 0.1142, + "step": 2285 + }, + { + "epoch": 13.085714285714285, + "grad_norm": 0.9466612339019775, + "learning_rate": 0.00010379999999999999, + "loss": 0.1191, + "step": 2290 + }, + { + "epoch": 13.114285714285714, + "grad_norm": 0.9036727547645569, + "learning_rate": 0.00010337142857142856, + "loss": 0.121, + "step": 2295 + }, + { + "epoch": 13.142857142857142, + "grad_norm": 1.08086359500885, + "learning_rate": 0.00010294285714285713, + "loss": 0.1313, + "step": 2300 + }, + { + "epoch": 13.17142857142857, + "grad_norm": 0.703241765499115, + "learning_rate": 0.0001025142857142857, + "loss": 0.1151, + "step": 2305 + }, + { + "epoch": 13.2, + "grad_norm": 0.7901896238327026, + "learning_rate": 0.00010208571428571429, + "loss": 0.1275, + "step": 2310 + }, + { + "epoch": 13.228571428571428, + "grad_norm": 0.703542947769165, + "learning_rate": 0.00010165714285714284, + "loss": 0.1, + "step": 2315 + }, + { + "epoch": 13.257142857142856, + "grad_norm": 0.6657671928405762, + "learning_rate": 0.00010122857142857141, + "loss": 0.1141, + "step": 2320 + }, + { + "epoch": 13.285714285714286, + "grad_norm": 0.7593729496002197, + "learning_rate": 0.0001008, + "loss": 0.1099, + "step": 2325 + }, + { + "epoch": 13.314285714285715, + "grad_norm": 0.6681057810783386, + "learning_rate": 0.00010037142857142857, + "loss": 0.112, + "step": 2330 + }, + { + "epoch": 13.342857142857143, + "grad_norm": 0.7155857682228088, + "learning_rate": 9.994285714285712e-05, + "loss": 0.0989, + "step": 2335 + }, + { + "epoch": 13.371428571428572, + "grad_norm": 0.9484553337097168, + "learning_rate": 9.951428571428571e-05, + "loss": 0.0902, + "step": 2340 + }, + { + "epoch": 13.4, + "grad_norm": 0.9317265152931213, + "learning_rate": 9.908571428571428e-05, + "loss": 0.1432, + "step": 2345 + }, + { + "epoch": 13.428571428571429, + "grad_norm": 1.039158821105957, + "learning_rate": 9.865714285714285e-05, + "loss": 0.114, + "step": 2350 + }, + { + "epoch": 13.457142857142857, + "grad_norm": 0.8524000644683838, + "learning_rate": 9.822857142857141e-05, + "loss": 0.1144, + "step": 2355 + }, + { + "epoch": 13.485714285714286, + "grad_norm": 0.6337461471557617, + "learning_rate": 9.779999999999999e-05, + "loss": 0.1073, + "step": 2360 + }, + { + "epoch": 13.514285714285714, + "grad_norm": 0.9097298383712769, + "learning_rate": 9.737142857142856e-05, + "loss": 0.1031, + "step": 2365 + }, + { + "epoch": 13.542857142857143, + "grad_norm": 1.2013412714004517, + "learning_rate": 9.694285714285713e-05, + "loss": 0.1174, + "step": 2370 + }, + { + "epoch": 13.571428571428571, + "grad_norm": 0.7055214643478394, + "learning_rate": 9.65142857142857e-05, + "loss": 0.1175, + "step": 2375 + }, + { + "epoch": 13.6, + "grad_norm": 0.807955265045166, + "learning_rate": 9.608571428571427e-05, + "loss": 0.1286, + "step": 2380 + }, + { + "epoch": 13.628571428571428, + "grad_norm": 0.6661797761917114, + "learning_rate": 9.565714285714285e-05, + "loss": 0.1091, + "step": 2385 + }, + { + "epoch": 13.657142857142857, + "grad_norm": 1.119604468345642, + "learning_rate": 9.522857142857143e-05, + "loss": 0.1393, + "step": 2390 + }, + { + "epoch": 13.685714285714285, + "grad_norm": 0.5365435481071472, + "learning_rate": 9.479999999999999e-05, + "loss": 0.1075, + "step": 2395 + }, + { + "epoch": 13.714285714285714, + "grad_norm": 0.9443924427032471, + "learning_rate": 9.437142857142856e-05, + "loss": 0.0977, + "step": 2400 + }, + { + "epoch": 13.742857142857144, + "grad_norm": 0.6075264811515808, + "learning_rate": 9.394285714285714e-05, + "loss": 0.1329, + "step": 2405 + }, + { + "epoch": 13.771428571428572, + "grad_norm": 1.019352912902832, + "learning_rate": 9.351428571428571e-05, + "loss": 0.1083, + "step": 2410 + }, + { + "epoch": 13.8, + "grad_norm": 0.7234058380126953, + "learning_rate": 9.308571428571427e-05, + "loss": 0.1118, + "step": 2415 + }, + { + "epoch": 13.82857142857143, + "grad_norm": 0.6786122918128967, + "learning_rate": 9.265714285714284e-05, + "loss": 0.1208, + "step": 2420 + }, + { + "epoch": 13.857142857142858, + "grad_norm": 0.5820732116699219, + "learning_rate": 9.222857142857142e-05, + "loss": 0.1022, + "step": 2425 + }, + { + "epoch": 13.885714285714286, + "grad_norm": 0.8007987141609192, + "learning_rate": 9.18e-05, + "loss": 0.1293, + "step": 2430 + }, + { + "epoch": 13.914285714285715, + "grad_norm": 0.6813766956329346, + "learning_rate": 9.137142857142855e-05, + "loss": 0.1284, + "step": 2435 + }, + { + "epoch": 13.942857142857143, + "grad_norm": 0.6460041403770447, + "learning_rate": 9.094285714285714e-05, + "loss": 0.1073, + "step": 2440 + }, + { + "epoch": 13.971428571428572, + "grad_norm": 0.5939205288887024, + "learning_rate": 9.051428571428571e-05, + "loss": 0.1185, + "step": 2445 + }, + { + "epoch": 14.0, + "grad_norm": 0.8150635361671448, + "learning_rate": 9.008571428571428e-05, + "loss": 0.1039, + "step": 2450 + }, + { + "epoch": 14.028571428571428, + "grad_norm": 1.3691389560699463, + "learning_rate": 8.965714285714285e-05, + "loss": 0.1112, + "step": 2455 + }, + { + "epoch": 14.057142857142857, + "grad_norm": 0.9042718410491943, + "learning_rate": 8.922857142857142e-05, + "loss": 0.112, + "step": 2460 + }, + { + "epoch": 14.085714285714285, + "grad_norm": 0.7222105860710144, + "learning_rate": 8.879999999999999e-05, + "loss": 0.1221, + "step": 2465 + }, + { + "epoch": 14.114285714285714, + "grad_norm": 0.595588207244873, + "learning_rate": 8.837142857142857e-05, + "loss": 0.1058, + "step": 2470 + }, + { + "epoch": 14.142857142857142, + "grad_norm": 0.5262706279754639, + "learning_rate": 8.794285714285713e-05, + "loss": 0.1071, + "step": 2475 + }, + { + "epoch": 14.17142857142857, + "grad_norm": 0.6511022448539734, + "learning_rate": 8.75142857142857e-05, + "loss": 0.0917, + "step": 2480 + }, + { + "epoch": 14.2, + "grad_norm": 0.5737650394439697, + "learning_rate": 8.708571428571427e-05, + "loss": 0.0988, + "step": 2485 + }, + { + "epoch": 14.228571428571428, + "grad_norm": 0.7679132223129272, + "learning_rate": 8.665714285714286e-05, + "loss": 0.1185, + "step": 2490 + }, + { + "epoch": 14.257142857142856, + "grad_norm": 0.641198456287384, + "learning_rate": 8.622857142857141e-05, + "loss": 0.0894, + "step": 2495 + }, + { + "epoch": 14.285714285714286, + "grad_norm": 0.7215464115142822, + "learning_rate": 8.579999999999998e-05, + "loss": 0.0935, + "step": 2500 + }, + { + "epoch": 14.314285714285715, + "grad_norm": 1.0740891695022583, + "learning_rate": 8.537142857142857e-05, + "loss": 0.1156, + "step": 2505 + }, + { + "epoch": 14.342857142857143, + "grad_norm": 0.6668990254402161, + "learning_rate": 8.494285714285714e-05, + "loss": 0.1006, + "step": 2510 + }, + { + "epoch": 14.371428571428572, + "grad_norm": 0.6674673557281494, + "learning_rate": 8.45142857142857e-05, + "loss": 0.1045, + "step": 2515 + }, + { + "epoch": 14.4, + "grad_norm": 0.6198854446411133, + "learning_rate": 8.408571428571428e-05, + "loss": 0.0997, + "step": 2520 + }, + { + "epoch": 14.428571428571429, + "grad_norm": 0.7187360525131226, + "learning_rate": 8.365714285714285e-05, + "loss": 0.1277, + "step": 2525 + }, + { + "epoch": 14.457142857142857, + "grad_norm": 0.583990216255188, + "learning_rate": 8.322857142857142e-05, + "loss": 0.1182, + "step": 2530 + }, + { + "epoch": 14.485714285714286, + "grad_norm": 1.1340539455413818, + "learning_rate": 8.28e-05, + "loss": 0.106, + "step": 2535 + }, + { + "epoch": 14.514285714285714, + "grad_norm": 0.6411644816398621, + "learning_rate": 8.237142857142856e-05, + "loss": 0.0963, + "step": 2540 + }, + { + "epoch": 14.542857142857143, + "grad_norm": 0.7092474102973938, + "learning_rate": 8.194285714285713e-05, + "loss": 0.1061, + "step": 2545 + }, + { + "epoch": 14.571428571428571, + "grad_norm": 0.6887038946151733, + "learning_rate": 8.151428571428572e-05, + "loss": 0.1224, + "step": 2550 + }, + { + "epoch": 14.6, + "grad_norm": 0.8119840621948242, + "learning_rate": 8.108571428571428e-05, + "loss": 0.1023, + "step": 2555 + }, + { + "epoch": 14.628571428571428, + "grad_norm": 0.6380637288093567, + "learning_rate": 8.065714285714285e-05, + "loss": 0.0893, + "step": 2560 + }, + { + "epoch": 14.657142857142857, + "grad_norm": 0.7857063412666321, + "learning_rate": 8.022857142857142e-05, + "loss": 0.1227, + "step": 2565 + }, + { + "epoch": 14.685714285714285, + "grad_norm": 0.6368046998977661, + "learning_rate": 7.98e-05, + "loss": 0.1074, + "step": 2570 + }, + { + "epoch": 14.714285714285714, + "grad_norm": 0.7269926071166992, + "learning_rate": 7.937142857142856e-05, + "loss": 0.1166, + "step": 2575 + }, + { + "epoch": 14.742857142857144, + "grad_norm": 0.6903791427612305, + "learning_rate": 7.894285714285713e-05, + "loss": 0.1274, + "step": 2580 + }, + { + "epoch": 14.771428571428572, + "grad_norm": 0.8257679343223572, + "learning_rate": 7.851428571428571e-05, + "loss": 0.1274, + "step": 2585 + }, + { + "epoch": 14.8, + "grad_norm": 1.0489627122879028, + "learning_rate": 7.808571428571428e-05, + "loss": 0.1091, + "step": 2590 + }, + { + "epoch": 14.82857142857143, + "grad_norm": 0.6699196696281433, + "learning_rate": 7.765714285714284e-05, + "loss": 0.1244, + "step": 2595 + }, + { + "epoch": 14.857142857142858, + "grad_norm": 0.61530601978302, + "learning_rate": 7.722857142857143e-05, + "loss": 0.1122, + "step": 2600 + }, + { + "epoch": 14.885714285714286, + "grad_norm": 0.5789124369621277, + "learning_rate": 7.68e-05, + "loss": 0.1272, + "step": 2605 + }, + { + "epoch": 14.914285714285715, + "grad_norm": 2.1323459148406982, + "learning_rate": 7.637142857142857e-05, + "loss": 0.1034, + "step": 2610 + }, + { + "epoch": 14.942857142857143, + "grad_norm": 1.2433545589447021, + "learning_rate": 7.594285714285714e-05, + "loss": 0.1052, + "step": 2615 + }, + { + "epoch": 14.971428571428572, + "grad_norm": 0.868093729019165, + "learning_rate": 7.551428571428571e-05, + "loss": 0.111, + "step": 2620 + }, + { + "epoch": 15.0, + "grad_norm": 0.6479918360710144, + "learning_rate": 7.508571428571428e-05, + "loss": 0.1067, + "step": 2625 + }, + { + "epoch": 15.028571428571428, + "grad_norm": 0.8062720894813538, + "learning_rate": 7.465714285714285e-05, + "loss": 0.1113, + "step": 2630 + }, + { + "epoch": 15.057142857142857, + "grad_norm": 0.7333181500434875, + "learning_rate": 7.422857142857142e-05, + "loss": 0.0985, + "step": 2635 + }, + { + "epoch": 15.085714285714285, + "grad_norm": 0.550039529800415, + "learning_rate": 7.379999999999999e-05, + "loss": 0.1077, + "step": 2640 + }, + { + "epoch": 15.114285714285714, + "grad_norm": 0.9256687164306641, + "learning_rate": 7.337142857142856e-05, + "loss": 0.0875, + "step": 2645 + }, + { + "epoch": 15.142857142857142, + "grad_norm": 0.6421870589256287, + "learning_rate": 7.294285714285713e-05, + "loss": 0.1069, + "step": 2650 + }, + { + "epoch": 15.17142857142857, + "grad_norm": 0.6614648699760437, + "learning_rate": 7.25142857142857e-05, + "loss": 0.1249, + "step": 2655 + }, + { + "epoch": 15.2, + "grad_norm": 0.8273601531982422, + "learning_rate": 7.208571428571429e-05, + "loss": 0.1135, + "step": 2660 + }, + { + "epoch": 15.228571428571428, + "grad_norm": 0.6795836687088013, + "learning_rate": 7.165714285714284e-05, + "loss": 0.1081, + "step": 2665 + }, + { + "epoch": 15.257142857142856, + "grad_norm": 0.7508160471916199, + "learning_rate": 7.122857142857143e-05, + "loss": 0.0869, + "step": 2670 + }, + { + "epoch": 15.285714285714286, + "grad_norm": 0.7219347357749939, + "learning_rate": 7.079999999999999e-05, + "loss": 0.1115, + "step": 2675 + }, + { + "epoch": 15.314285714285715, + "grad_norm": 0.5592671036720276, + "learning_rate": 7.037142857142857e-05, + "loss": 0.1116, + "step": 2680 + }, + { + "epoch": 15.342857142857143, + "grad_norm": 0.8736717104911804, + "learning_rate": 6.994285714285714e-05, + "loss": 0.0784, + "step": 2685 + }, + { + "epoch": 15.371428571428572, + "grad_norm": 0.6056572198867798, + "learning_rate": 6.951428571428571e-05, + "loss": 0.1105, + "step": 2690 + }, + { + "epoch": 15.4, + "grad_norm": 0.671410322189331, + "learning_rate": 6.908571428571428e-05, + "loss": 0.1219, + "step": 2695 + }, + { + "epoch": 15.428571428571429, + "grad_norm": 0.7952276468276978, + "learning_rate": 6.865714285714285e-05, + "loss": 0.0865, + "step": 2700 + }, + { + "epoch": 15.457142857142857, + "grad_norm": 0.8185123205184937, + "learning_rate": 6.822857142857142e-05, + "loss": 0.1095, + "step": 2705 + }, + { + "epoch": 15.485714285714286, + "grad_norm": 0.6969497203826904, + "learning_rate": 6.78e-05, + "loss": 0.0928, + "step": 2710 + }, + { + "epoch": 15.514285714285714, + "grad_norm": 0.7323058843612671, + "learning_rate": 6.737142857142857e-05, + "loss": 0.099, + "step": 2715 + }, + { + "epoch": 15.542857142857143, + "grad_norm": 0.6498017311096191, + "learning_rate": 6.694285714285714e-05, + "loss": 0.0871, + "step": 2720 + }, + { + "epoch": 15.571428571428571, + "grad_norm": 2.0899710655212402, + "learning_rate": 6.65142857142857e-05, + "loss": 0.1306, + "step": 2725 + }, + { + "epoch": 15.6, + "grad_norm": 1.0896337032318115, + "learning_rate": 6.608571428571428e-05, + "loss": 0.1085, + "step": 2730 + }, + { + "epoch": 15.628571428571428, + "grad_norm": 0.6709671020507812, + "learning_rate": 6.565714285714285e-05, + "loss": 0.0977, + "step": 2735 + }, + { + "epoch": 15.657142857142857, + "grad_norm": 0.6750431060791016, + "learning_rate": 6.522857142857142e-05, + "loss": 0.1154, + "step": 2740 + }, + { + "epoch": 15.685714285714285, + "grad_norm": 1.2888147830963135, + "learning_rate": 6.479999999999999e-05, + "loss": 0.0918, + "step": 2745 + }, + { + "epoch": 15.714285714285714, + "grad_norm": 0.9803931713104248, + "learning_rate": 6.437142857142857e-05, + "loss": 0.112, + "step": 2750 + }, + { + "epoch": 15.742857142857144, + "grad_norm": 0.8548974394798279, + "learning_rate": 6.394285714285713e-05, + "loss": 0.0974, + "step": 2755 + }, + { + "epoch": 15.771428571428572, + "grad_norm": 0.7924854159355164, + "learning_rate": 6.351428571428572e-05, + "loss": 0.1344, + "step": 2760 + }, + { + "epoch": 15.8, + "grad_norm": 0.9245836138725281, + "learning_rate": 6.308571428571429e-05, + "loss": 0.1182, + "step": 2765 + }, + { + "epoch": 15.82857142857143, + "grad_norm": 0.6067193150520325, + "learning_rate": 6.265714285714286e-05, + "loss": 0.0959, + "step": 2770 + }, + { + "epoch": 15.857142857142858, + "grad_norm": 0.5575870275497437, + "learning_rate": 6.222857142857143e-05, + "loss": 0.1208, + "step": 2775 + }, + { + "epoch": 15.885714285714286, + "grad_norm": 0.8608399629592896, + "learning_rate": 6.18e-05, + "loss": 0.0937, + "step": 2780 + }, + { + "epoch": 15.914285714285715, + "grad_norm": 0.6910924911499023, + "learning_rate": 6.137142857142857e-05, + "loss": 0.1175, + "step": 2785 + }, + { + "epoch": 15.942857142857143, + "grad_norm": 0.7266614437103271, + "learning_rate": 6.094285714285714e-05, + "loss": 0.1023, + "step": 2790 + }, + { + "epoch": 15.971428571428572, + "grad_norm": 0.7580139636993408, + "learning_rate": 6.051428571428571e-05, + "loss": 0.1103, + "step": 2795 + }, + { + "epoch": 16.0, + "grad_norm": 0.9288797974586487, + "learning_rate": 6.008571428571428e-05, + "loss": 0.0892, + "step": 2800 + }, + { + "epoch": 16.02857142857143, + "grad_norm": 1.4218194484710693, + "learning_rate": 5.9657142857142845e-05, + "loss": 0.104, + "step": 2805 + }, + { + "epoch": 16.057142857142857, + "grad_norm": 0.7665567994117737, + "learning_rate": 5.922857142857142e-05, + "loss": 0.1084, + "step": 2810 + }, + { + "epoch": 16.085714285714285, + "grad_norm": 0.8587457537651062, + "learning_rate": 5.88e-05, + "loss": 0.1041, + "step": 2815 + }, + { + "epoch": 16.114285714285714, + "grad_norm": 0.5792443752288818, + "learning_rate": 5.837142857142856e-05, + "loss": 0.1086, + "step": 2820 + }, + { + "epoch": 16.142857142857142, + "grad_norm": 0.6510186195373535, + "learning_rate": 5.794285714285714e-05, + "loss": 0.0919, + "step": 2825 + }, + { + "epoch": 16.17142857142857, + "grad_norm": 1.170145869255066, + "learning_rate": 5.751428571428571e-05, + "loss": 0.1083, + "step": 2830 + }, + { + "epoch": 16.2, + "grad_norm": 1.0514795780181885, + "learning_rate": 5.708571428571428e-05, + "loss": 0.1223, + "step": 2835 + }, + { + "epoch": 16.228571428571428, + "grad_norm": 0.7993499040603638, + "learning_rate": 5.665714285714285e-05, + "loss": 0.1101, + "step": 2840 + }, + { + "epoch": 16.257142857142856, + "grad_norm": 0.6342432498931885, + "learning_rate": 5.622857142857142e-05, + "loss": 0.1243, + "step": 2845 + }, + { + "epoch": 16.285714285714285, + "grad_norm": 1.2524505853652954, + "learning_rate": 5.5799999999999994e-05, + "loss": 0.1251, + "step": 2850 + }, + { + "epoch": 16.314285714285713, + "grad_norm": 1.0769789218902588, + "learning_rate": 5.537142857142857e-05, + "loss": 0.1074, + "step": 2855 + }, + { + "epoch": 16.34285714285714, + "grad_norm": 1.8232245445251465, + "learning_rate": 5.4942857142857136e-05, + "loss": 0.0929, + "step": 2860 + }, + { + "epoch": 16.37142857142857, + "grad_norm": 0.814189612865448, + "learning_rate": 5.451428571428571e-05, + "loss": 0.0998, + "step": 2865 + }, + { + "epoch": 16.4, + "grad_norm": 0.9731772541999817, + "learning_rate": 5.4085714285714284e-05, + "loss": 0.0849, + "step": 2870 + }, + { + "epoch": 16.428571428571427, + "grad_norm": 0.778213381767273, + "learning_rate": 5.3657142857142855e-05, + "loss": 0.0907, + "step": 2875 + }, + { + "epoch": 16.457142857142856, + "grad_norm": 0.9219964146614075, + "learning_rate": 5.3228571428571425e-05, + "loss": 0.0855, + "step": 2880 + }, + { + "epoch": 16.485714285714284, + "grad_norm": 0.7354393005371094, + "learning_rate": 5.279999999999999e-05, + "loss": 0.1296, + "step": 2885 + }, + { + "epoch": 16.514285714285712, + "grad_norm": 0.6051219701766968, + "learning_rate": 5.2371428571428567e-05, + "loss": 0.1086, + "step": 2890 + }, + { + "epoch": 16.542857142857144, + "grad_norm": 0.8592603206634521, + "learning_rate": 5.1942857142857144e-05, + "loss": 0.1017, + "step": 2895 + }, + { + "epoch": 16.571428571428573, + "grad_norm": 0.5748846530914307, + "learning_rate": 5.151428571428571e-05, + "loss": 0.0775, + "step": 2900 + }, + { + "epoch": 16.6, + "grad_norm": 0.6640213131904602, + "learning_rate": 5.1085714285714285e-05, + "loss": 0.1059, + "step": 2905 + }, + { + "epoch": 16.62857142857143, + "grad_norm": 0.9514361023902893, + "learning_rate": 5.065714285714285e-05, + "loss": 0.0832, + "step": 2910 + }, + { + "epoch": 16.65714285714286, + "grad_norm": 1.1062079668045044, + "learning_rate": 5.022857142857143e-05, + "loss": 0.0817, + "step": 2915 + }, + { + "epoch": 16.685714285714287, + "grad_norm": 0.6824453473091125, + "learning_rate": 4.98e-05, + "loss": 0.1064, + "step": 2920 + }, + { + "epoch": 16.714285714285715, + "grad_norm": 0.643827497959137, + "learning_rate": 4.937142857142856e-05, + "loss": 0.1196, + "step": 2925 + }, + { + "epoch": 16.742857142857144, + "grad_norm": 0.7824274897575378, + "learning_rate": 4.894285714285714e-05, + "loss": 0.0945, + "step": 2930 + }, + { + "epoch": 16.771428571428572, + "grad_norm": 0.7110689878463745, + "learning_rate": 4.8514285714285716e-05, + "loss": 0.1124, + "step": 2935 + }, + { + "epoch": 16.8, + "grad_norm": 0.9542856812477112, + "learning_rate": 4.808571428571428e-05, + "loss": 0.1036, + "step": 2940 + }, + { + "epoch": 16.82857142857143, + "grad_norm": 0.6353528499603271, + "learning_rate": 4.765714285714286e-05, + "loss": 0.0977, + "step": 2945 + }, + { + "epoch": 16.857142857142858, + "grad_norm": 0.843910813331604, + "learning_rate": 4.722857142857142e-05, + "loss": 0.1164, + "step": 2950 + }, + { + "epoch": 16.885714285714286, + "grad_norm": 0.9607085585594177, + "learning_rate": 4.68e-05, + "loss": 0.1111, + "step": 2955 + }, + { + "epoch": 16.914285714285715, + "grad_norm": 0.7393201589584351, + "learning_rate": 4.637142857142857e-05, + "loss": 0.106, + "step": 2960 + }, + { + "epoch": 16.942857142857143, + "grad_norm": 0.5248494148254395, + "learning_rate": 4.5942857142857134e-05, + "loss": 0.1017, + "step": 2965 + }, + { + "epoch": 16.97142857142857, + "grad_norm": 0.8800868988037109, + "learning_rate": 4.551428571428571e-05, + "loss": 0.0872, + "step": 2970 + }, + { + "epoch": 17.0, + "grad_norm": 0.8447640538215637, + "learning_rate": 4.5085714285714275e-05, + "loss": 0.1293, + "step": 2975 + }, + { + "epoch": 17.02857142857143, + "grad_norm": 0.5356553792953491, + "learning_rate": 4.465714285714285e-05, + "loss": 0.0984, + "step": 2980 + }, + { + "epoch": 17.057142857142857, + "grad_norm": 0.7713034152984619, + "learning_rate": 4.422857142857143e-05, + "loss": 0.0858, + "step": 2985 + }, + { + "epoch": 17.085714285714285, + "grad_norm": 0.9854580760002136, + "learning_rate": 4.3799999999999994e-05, + "loss": 0.1237, + "step": 2990 + }, + { + "epoch": 17.114285714285714, + "grad_norm": 0.7012975811958313, + "learning_rate": 4.337142857142857e-05, + "loss": 0.1233, + "step": 2995 + }, + { + "epoch": 17.142857142857142, + "grad_norm": 0.5461836457252502, + "learning_rate": 4.294285714285714e-05, + "loss": 0.0978, + "step": 3000 + }, + { + "epoch": 17.17142857142857, + "grad_norm": 0.8236174583435059, + "learning_rate": 4.2514285714285706e-05, + "loss": 0.1041, + "step": 3005 + }, + { + "epoch": 17.2, + "grad_norm": 1.040204644203186, + "learning_rate": 4.2085714285714284e-05, + "loss": 0.0974, + "step": 3010 + }, + { + "epoch": 17.228571428571428, + "grad_norm": 0.6091800928115845, + "learning_rate": 4.165714285714285e-05, + "loss": 0.1098, + "step": 3015 + }, + { + "epoch": 17.257142857142856, + "grad_norm": 0.7302913069725037, + "learning_rate": 4.1228571428571425e-05, + "loss": 0.1162, + "step": 3020 + }, + { + "epoch": 17.285714285714285, + "grad_norm": 0.7015142440795898, + "learning_rate": 4.08e-05, + "loss": 0.1059, + "step": 3025 + }, + { + "epoch": 17.314285714285713, + "grad_norm": 0.8828005790710449, + "learning_rate": 4.0371428571428566e-05, + "loss": 0.1181, + "step": 3030 + }, + { + "epoch": 17.34285714285714, + "grad_norm": 0.7601356506347656, + "learning_rate": 3.994285714285714e-05, + "loss": 0.1034, + "step": 3035 + }, + { + "epoch": 17.37142857142857, + "grad_norm": 0.8163303136825562, + "learning_rate": 3.951428571428571e-05, + "loss": 0.1221, + "step": 3040 + }, + { + "epoch": 17.4, + "grad_norm": 0.6229556202888489, + "learning_rate": 3.908571428571428e-05, + "loss": 0.0807, + "step": 3045 + }, + { + "epoch": 17.428571428571427, + "grad_norm": 0.7089337706565857, + "learning_rate": 3.8657142857142856e-05, + "loss": 0.1059, + "step": 3050 + }, + { + "epoch": 17.457142857142856, + "grad_norm": 0.7208603024482727, + "learning_rate": 3.822857142857142e-05, + "loss": 0.1091, + "step": 3055 + }, + { + "epoch": 17.485714285714284, + "grad_norm": 0.6921990513801575, + "learning_rate": 3.78e-05, + "loss": 0.0945, + "step": 3060 + }, + { + "epoch": 17.514285714285712, + "grad_norm": 0.6333826184272766, + "learning_rate": 3.737142857142857e-05, + "loss": 0.0872, + "step": 3065 + }, + { + "epoch": 17.542857142857144, + "grad_norm": 0.634069561958313, + "learning_rate": 3.694285714285714e-05, + "loss": 0.103, + "step": 3070 + }, + { + "epoch": 17.571428571428573, + "grad_norm": 0.951956033706665, + "learning_rate": 3.651428571428571e-05, + "loss": 0.0803, + "step": 3075 + }, + { + "epoch": 17.6, + "grad_norm": 0.6465116143226624, + "learning_rate": 3.608571428571428e-05, + "loss": 0.1072, + "step": 3080 + }, + { + "epoch": 17.62857142857143, + "grad_norm": 0.9632124304771423, + "learning_rate": 3.565714285714285e-05, + "loss": 0.1052, + "step": 3085 + }, + { + "epoch": 17.65714285714286, + "grad_norm": 1.454498529434204, + "learning_rate": 3.522857142857142e-05, + "loss": 0.1019, + "step": 3090 + }, + { + "epoch": 17.685714285714287, + "grad_norm": 0.9264261722564697, + "learning_rate": 3.48e-05, + "loss": 0.0877, + "step": 3095 + }, + { + "epoch": 17.714285714285715, + "grad_norm": 0.913129985332489, + "learning_rate": 3.437142857142857e-05, + "loss": 0.1072, + "step": 3100 + }, + { + "epoch": 17.742857142857144, + "grad_norm": 0.6301171779632568, + "learning_rate": 3.394285714285714e-05, + "loss": 0.1095, + "step": 3105 + }, + { + "epoch": 17.771428571428572, + "grad_norm": 0.7603205442428589, + "learning_rate": 3.351428571428571e-05, + "loss": 0.1042, + "step": 3110 + }, + { + "epoch": 17.8, + "grad_norm": 0.5240493416786194, + "learning_rate": 3.308571428571428e-05, + "loss": 0.0901, + "step": 3115 + }, + { + "epoch": 17.82857142857143, + "grad_norm": 0.709513783454895, + "learning_rate": 3.265714285714285e-05, + "loss": 0.0899, + "step": 3120 + }, + { + "epoch": 17.857142857142858, + "grad_norm": 0.8211326003074646, + "learning_rate": 3.222857142857142e-05, + "loss": 0.106, + "step": 3125 + }, + { + "epoch": 17.885714285714286, + "grad_norm": 1.5929844379425049, + "learning_rate": 3.1799999999999994e-05, + "loss": 0.1282, + "step": 3130 + }, + { + "epoch": 17.914285714285715, + "grad_norm": 0.7880852818489075, + "learning_rate": 3.1371428571428564e-05, + "loss": 0.1076, + "step": 3135 + }, + { + "epoch": 17.942857142857143, + "grad_norm": 0.8155010342597961, + "learning_rate": 3.094285714285714e-05, + "loss": 0.1044, + "step": 3140 + }, + { + "epoch": 17.97142857142857, + "grad_norm": 0.7286632657051086, + "learning_rate": 3.0514285714285713e-05, + "loss": 0.1083, + "step": 3145 + }, + { + "epoch": 18.0, + "grad_norm": 0.618222177028656, + "learning_rate": 3.0085714285714283e-05, + "loss": 0.1051, + "step": 3150 + }, + { + "epoch": 18.02857142857143, + "grad_norm": 0.6627287864685059, + "learning_rate": 2.9657142857142854e-05, + "loss": 0.0861, + "step": 3155 + }, + { + "epoch": 18.057142857142857, + "grad_norm": 0.7387683987617493, + "learning_rate": 2.9228571428571428e-05, + "loss": 0.1008, + "step": 3160 + }, + { + "epoch": 18.085714285714285, + "grad_norm": 1.671528935432434, + "learning_rate": 2.88e-05, + "loss": 0.1032, + "step": 3165 + }, + { + "epoch": 18.114285714285714, + "grad_norm": 1.9250961542129517, + "learning_rate": 2.837142857142857e-05, + "loss": 0.1142, + "step": 3170 + }, + { + "epoch": 18.142857142857142, + "grad_norm": 0.7627262473106384, + "learning_rate": 2.794285714285714e-05, + "loss": 0.1047, + "step": 3175 + }, + { + "epoch": 18.17142857142857, + "grad_norm": 0.8871546983718872, + "learning_rate": 2.7514285714285714e-05, + "loss": 0.0964, + "step": 3180 + }, + { + "epoch": 18.2, + "grad_norm": 1.5572978258132935, + "learning_rate": 2.7085714285714285e-05, + "loss": 0.0898, + "step": 3185 + }, + { + "epoch": 18.228571428571428, + "grad_norm": 0.5867496132850647, + "learning_rate": 2.6657142857142856e-05, + "loss": 0.104, + "step": 3190 + }, + { + "epoch": 18.257142857142856, + "grad_norm": 1.2859222888946533, + "learning_rate": 2.6228571428571426e-05, + "loss": 0.1016, + "step": 3195 + }, + { + "epoch": 18.285714285714285, + "grad_norm": 0.9340577721595764, + "learning_rate": 2.5799999999999997e-05, + "loss": 0.1128, + "step": 3200 + }, + { + "epoch": 18.314285714285713, + "grad_norm": 0.7944216132164001, + "learning_rate": 2.537142857142857e-05, + "loss": 0.086, + "step": 3205 + }, + { + "epoch": 18.34285714285714, + "grad_norm": 0.8302488327026367, + "learning_rate": 2.4942857142857142e-05, + "loss": 0.1167, + "step": 3210 + }, + { + "epoch": 18.37142857142857, + "grad_norm": 0.7752293348312378, + "learning_rate": 2.4514285714285712e-05, + "loss": 0.0926, + "step": 3215 + }, + { + "epoch": 18.4, + "grad_norm": 0.7102646827697754, + "learning_rate": 2.4085714285714283e-05, + "loss": 0.1076, + "step": 3220 + }, + { + "epoch": 18.428571428571427, + "grad_norm": 0.8423459529876709, + "learning_rate": 2.3657142857142857e-05, + "loss": 0.0922, + "step": 3225 + }, + { + "epoch": 18.457142857142856, + "grad_norm": 0.798784613609314, + "learning_rate": 2.3228571428571428e-05, + "loss": 0.1124, + "step": 3230 + }, + { + "epoch": 18.485714285714284, + "grad_norm": 0.8125432133674622, + "learning_rate": 2.28e-05, + "loss": 0.1113, + "step": 3235 + }, + { + "epoch": 18.514285714285712, + "grad_norm": 0.7036454081535339, + "learning_rate": 2.237142857142857e-05, + "loss": 0.1068, + "step": 3240 + }, + { + "epoch": 18.542857142857144, + "grad_norm": 1.1506881713867188, + "learning_rate": 2.1942857142857143e-05, + "loss": 0.0998, + "step": 3245 + }, + { + "epoch": 18.571428571428573, + "grad_norm": 0.6385633945465088, + "learning_rate": 2.1514285714285714e-05, + "loss": 0.1205, + "step": 3250 + }, + { + "epoch": 18.6, + "grad_norm": 0.6759969592094421, + "learning_rate": 2.1085714285714285e-05, + "loss": 0.1109, + "step": 3255 + }, + { + "epoch": 18.62857142857143, + "grad_norm": 0.49440646171569824, + "learning_rate": 2.0657142857142855e-05, + "loss": 0.0963, + "step": 3260 + }, + { + "epoch": 18.65714285714286, + "grad_norm": 0.5873342156410217, + "learning_rate": 2.022857142857143e-05, + "loss": 0.0918, + "step": 3265 + }, + { + "epoch": 18.685714285714287, + "grad_norm": 0.6430002450942993, + "learning_rate": 1.98e-05, + "loss": 0.0884, + "step": 3270 + }, + { + "epoch": 18.714285714285715, + "grad_norm": 0.6482405066490173, + "learning_rate": 1.937142857142857e-05, + "loss": 0.0986, + "step": 3275 + }, + { + "epoch": 18.742857142857144, + "grad_norm": 5.264317035675049, + "learning_rate": 1.894285714285714e-05, + "loss": 0.1177, + "step": 3280 + }, + { + "epoch": 18.771428571428572, + "grad_norm": 0.7969473600387573, + "learning_rate": 1.8514285714285712e-05, + "loss": 0.1058, + "step": 3285 + }, + { + "epoch": 18.8, + "grad_norm": 0.8115813732147217, + "learning_rate": 1.8085714285714283e-05, + "loss": 0.1013, + "step": 3290 + }, + { + "epoch": 18.82857142857143, + "grad_norm": 0.7753077745437622, + "learning_rate": 1.7657142857142857e-05, + "loss": 0.1146, + "step": 3295 + }, + { + "epoch": 18.857142857142858, + "grad_norm": 0.7666271328926086, + "learning_rate": 1.7228571428571428e-05, + "loss": 0.0982, + "step": 3300 + }, + { + "epoch": 18.885714285714286, + "grad_norm": 0.803175151348114, + "learning_rate": 1.68e-05, + "loss": 0.0948, + "step": 3305 + }, + { + "epoch": 18.914285714285715, + "grad_norm": 0.6946846842765808, + "learning_rate": 1.637142857142857e-05, + "loss": 0.1233, + "step": 3310 + }, + { + "epoch": 18.942857142857143, + "grad_norm": 0.6007334589958191, + "learning_rate": 1.5942857142857143e-05, + "loss": 0.0998, + "step": 3315 + }, + { + "epoch": 18.97142857142857, + "grad_norm": 1.0508538484573364, + "learning_rate": 1.5514285714285714e-05, + "loss": 0.1141, + "step": 3320 + }, + { + "epoch": 19.0, + "grad_norm": 1.0431159734725952, + "learning_rate": 1.5085714285714285e-05, + "loss": 0.0844, + "step": 3325 + }, + { + "epoch": 19.02857142857143, + "grad_norm": 0.8848056793212891, + "learning_rate": 1.4657142857142855e-05, + "loss": 0.1158, + "step": 3330 + }, + { + "epoch": 19.057142857142857, + "grad_norm": 0.6595423817634583, + "learning_rate": 1.4228571428571428e-05, + "loss": 0.1045, + "step": 3335 + }, + { + "epoch": 19.085714285714285, + "grad_norm": 0.7581779360771179, + "learning_rate": 1.3799999999999998e-05, + "loss": 0.1042, + "step": 3340 + }, + { + "epoch": 19.114285714285714, + "grad_norm": 0.7570774555206299, + "learning_rate": 1.337142857142857e-05, + "loss": 0.0884, + "step": 3345 + }, + { + "epoch": 19.142857142857142, + "grad_norm": 0.8883652687072754, + "learning_rate": 1.2942857142857141e-05, + "loss": 0.0921, + "step": 3350 + }, + { + "epoch": 19.17142857142857, + "grad_norm": 0.5633381605148315, + "learning_rate": 1.2514285714285714e-05, + "loss": 0.0875, + "step": 3355 + }, + { + "epoch": 19.2, + "grad_norm": 0.7671384215354919, + "learning_rate": 1.2085714285714284e-05, + "loss": 0.0828, + "step": 3360 + }, + { + "epoch": 19.228571428571428, + "grad_norm": 0.7777629494667053, + "learning_rate": 1.1657142857142855e-05, + "loss": 0.083, + "step": 3365 + }, + { + "epoch": 19.257142857142856, + "grad_norm": 0.602395236492157, + "learning_rate": 1.1228571428571428e-05, + "loss": 0.1072, + "step": 3370 + }, + { + "epoch": 19.285714285714285, + "grad_norm": 0.8774672746658325, + "learning_rate": 1.0799999999999998e-05, + "loss": 0.1093, + "step": 3375 + }, + { + "epoch": 19.314285714285713, + "grad_norm": 0.6401615142822266, + "learning_rate": 1.037142857142857e-05, + "loss": 0.1166, + "step": 3380 + }, + { + "epoch": 19.34285714285714, + "grad_norm": 0.6146759390830994, + "learning_rate": 9.942857142857141e-06, + "loss": 0.0972, + "step": 3385 + }, + { + "epoch": 19.37142857142857, + "grad_norm": 0.7564222812652588, + "learning_rate": 9.514285714285714e-06, + "loss": 0.1029, + "step": 3390 + }, + { + "epoch": 19.4, + "grad_norm": 0.8068543076515198, + "learning_rate": 9.085714285714286e-06, + "loss": 0.0889, + "step": 3395 + }, + { + "epoch": 19.428571428571427, + "grad_norm": 0.8872269988059998, + "learning_rate": 8.657142857142855e-06, + "loss": 0.0918, + "step": 3400 + }, + { + "epoch": 19.457142857142856, + "grad_norm": 0.7162922620773315, + "learning_rate": 8.228571428571427e-06, + "loss": 0.1072, + "step": 3405 + }, + { + "epoch": 19.485714285714284, + "grad_norm": 0.7198708057403564, + "learning_rate": 7.799999999999998e-06, + "loss": 0.097, + "step": 3410 + }, + { + "epoch": 19.514285714285712, + "grad_norm": 0.9534723162651062, + "learning_rate": 7.3714285714285706e-06, + "loss": 0.1075, + "step": 3415 + }, + { + "epoch": 19.542857142857144, + "grad_norm": 0.9135831594467163, + "learning_rate": 6.942857142857142e-06, + "loss": 0.0791, + "step": 3420 + }, + { + "epoch": 19.571428571428573, + "grad_norm": 0.8475795984268188, + "learning_rate": 6.514285714285714e-06, + "loss": 0.1158, + "step": 3425 + }, + { + "epoch": 19.6, + "grad_norm": 0.7961953282356262, + "learning_rate": 6.085714285714285e-06, + "loss": 0.1049, + "step": 3430 + }, + { + "epoch": 19.62857142857143, + "grad_norm": 0.8671003580093384, + "learning_rate": 5.657142857142857e-06, + "loss": 0.1218, + "step": 3435 + }, + { + "epoch": 19.65714285714286, + "grad_norm": 1.1290555000305176, + "learning_rate": 5.228571428571428e-06, + "loss": 0.0798, + "step": 3440 + }, + { + "epoch": 19.685714285714287, + "grad_norm": 0.9520301222801208, + "learning_rate": 4.8e-06, + "loss": 0.1026, + "step": 3445 + }, + { + "epoch": 19.714285714285715, + "grad_norm": 1.114631175994873, + "learning_rate": 4.371428571428571e-06, + "loss": 0.1242, + "step": 3450 + }, + { + "epoch": 19.742857142857144, + "grad_norm": 0.8614441752433777, + "learning_rate": 3.942857142857143e-06, + "loss": 0.1103, + "step": 3455 + }, + { + "epoch": 19.771428571428572, + "grad_norm": 3.417344570159912, + "learning_rate": 3.5142857142857136e-06, + "loss": 0.1088, + "step": 3460 + }, + { + "epoch": 19.8, + "grad_norm": 0.5386614203453064, + "learning_rate": 3.085714285714285e-06, + "loss": 0.0923, + "step": 3465 + }, + { + "epoch": 19.82857142857143, + "grad_norm": 0.6228803396224976, + "learning_rate": 2.6571428571428566e-06, + "loss": 0.0872, + "step": 3470 + }, + { + "epoch": 19.857142857142858, + "grad_norm": 0.9215123653411865, + "learning_rate": 2.228571428571428e-06, + "loss": 0.0927, + "step": 3475 + }, + { + "epoch": 19.885714285714286, + "grad_norm": 0.5741862058639526, + "learning_rate": 1.8e-06, + "loss": 0.1066, + "step": 3480 + }, + { + "epoch": 19.914285714285715, + "grad_norm": 0.6639522910118103, + "learning_rate": 1.3714285714285715e-06, + "loss": 0.0966, + "step": 3485 + }, + { + "epoch": 19.942857142857143, + "grad_norm": 0.5956189632415771, + "learning_rate": 9.428571428571428e-07, + "loss": 0.1126, + "step": 3490 + }, + { + "epoch": 19.97142857142857, + "grad_norm": 0.7502082586288452, + "learning_rate": 5.142857142857142e-07, + "loss": 0.0887, + "step": 3495 + }, + { + "epoch": 20.0, + "grad_norm": 0.7295340895652771, + "learning_rate": 8.571428571428572e-08, + "loss": 0.0894, + "step": 3500 + } + ], + "logging_steps": 5, + "max_steps": 3500, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 200, + "trial_name": null, + "trial_params": null +} diff --git a/glot-contrastive-final-lora/checkpoint-3500/training_args.bin b/glot-contrastive-final-lora/checkpoint-3500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..504636d7db81a728b5309e791498748e16c3b8ec --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-3500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02a87dc6b2c67ad3df98065b9e8fa21d9d93cd2cb361c532cb83c8a37bdc81a3 +size 5777 diff --git a/glot-contrastive-final-lora/checkpoint-500/README.md b/glot-contrastive-final-lora/checkpoint-500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9e142ce83a38d62751eee08226da0acc7c10eae5 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-500/README.md @@ -0,0 +1,206 @@ +--- +base_model: ./glot-mlm-adapted +library_name: peft +tags: +- base_model:adapter:./glot-mlm-adapted +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/glot-contrastive-final-lora/checkpoint-500/adapter_config.json b/glot-contrastive-final-lora/checkpoint-500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..846b0d8ef2ad7bc4b0b04decd5769b6250d7be73 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-500/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./glot-mlm-adapted", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query", + "value" + ], + "target_parameters": null, + "task_type": "FEATURE_EXTRACTION", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/glot-contrastive-final-lora/checkpoint-500/adapter_model.safetensors b/glot-contrastive-final-lora/checkpoint-500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7ec8776f38c68f5ed6c88c8787cd9329a82e969f --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1b44b5f55073f2521757e46e9180a390c2cdea6590d7bef8f961ffef9fd06fb +size 2365824 diff --git a/glot-contrastive-final-lora/checkpoint-500/optimizer.pt b/glot-contrastive-final-lora/checkpoint-500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c8e6208f6ff74597b0ac2dec615305caa0dc94a --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:524eed2618966787cdc3c8b7a304e201c8b5fd9dc5932284f208f7ce24f96dec +size 4760395 diff --git a/glot-contrastive-final-lora/checkpoint-500/rng_state.pth b/glot-contrastive-final-lora/checkpoint-500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3cedd628c79e483da81d5902f59b1f462f277654 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91667f27232c2f24307f0b8d5980c62b6cf48987f494164a9a220e1f7de29d1c +size 14645 diff --git a/glot-contrastive-final-lora/checkpoint-500/scheduler.pt b/glot-contrastive-final-lora/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8cb684291513b6a7c362908f1d14d27539fec384 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8327fad933e6a9c750f39d471d04a7cfe660f5e99f616254567588fe1a243c3 +size 1465 diff --git a/glot-contrastive-final-lora/checkpoint-500/sentencepiece.bpe.model b/glot-contrastive-final-lora/checkpoint-500/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..41409bcc76133d6e92fdd5909093d24e32662ba0 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-500/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a313a26470baedaede322622492f2a542aa41527ddc5d40de444e945ad3c613 +size 7658320 diff --git a/glot-contrastive-final-lora/checkpoint-500/special_tokens_map.json b/glot-contrastive-final-lora/checkpoint-500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..68171d1ff68b731a33d119708476692c094a466b --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-500/special_tokens_map.json @@ -0,0 +1,15 @@ +{ + "bos_token": "", + "cls_token": "", + "eos_token": "", + "mask_token": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "sep_token": "", + "unk_token": "" +} diff --git a/glot-contrastive-final-lora/checkpoint-500/tokenizer_config.json b/glot-contrastive-final-lora/checkpoint-500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2d4a02cfefad7bc6d993905187616acfc547f17d --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-500/tokenizer_config.json @@ -0,0 +1,57 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "401144": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "cls_token": "", + "eos_token": "", + "extra_special_tokens": {}, + "mask_token": "", + "model_max_length": 512, + "pad_token": "", + "sep_token": "", + "sp_model_kwargs": {}, + "tokenizer_class": "XLMRobertaTokenizer", + "unk_token": "", + "use_fast": true +} diff --git a/glot-contrastive-final-lora/checkpoint-500/trainer_state.json b/glot-contrastive-final-lora/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa33e916da6c284236ad1e1bd75fc05ee038d30 --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-500/trainer_state.json @@ -0,0 +1,734 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.857142857142857, + "eval_steps": 5, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02857142857142857, + "grad_norm": 0.1407003551721573, + "learning_rate": 0.00029965714285714283, + "loss": 0.9726, + "step": 5 + }, + { + "epoch": 0.05714285714285714, + "grad_norm": 0.26689061522483826, + "learning_rate": 0.0002992285714285714, + "loss": 0.9633, + "step": 10 + }, + { + "epoch": 0.08571428571428572, + "grad_norm": 0.8670485615730286, + "learning_rate": 0.0002988, + "loss": 0.9013, + "step": 15 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 0.9785467386245728, + "learning_rate": 0.00029837142857142853, + "loss": 0.6942, + "step": 20 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 1.3083932399749756, + "learning_rate": 0.0002979428571428571, + "loss": 0.4472, + "step": 25 + }, + { + "epoch": 0.17142857142857143, + "grad_norm": 1.6103293895721436, + "learning_rate": 0.0002975142857142857, + "loss": 0.3782, + "step": 30 + }, + { + "epoch": 0.2, + "grad_norm": 2.6353416442871094, + "learning_rate": 0.0002970857142857143, + "loss": 0.3732, + "step": 35 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 0.9949072003364563, + "learning_rate": 0.0002966571428571428, + "loss": 0.3506, + "step": 40 + }, + { + "epoch": 0.2571428571428571, + "grad_norm": 1.280673861503601, + "learning_rate": 0.0002962285714285714, + "loss": 0.3346, + "step": 45 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.7681456208229065, + "learning_rate": 0.0002958, + "loss": 0.2832, + "step": 50 + }, + { + "epoch": 0.3142857142857143, + "grad_norm": 1.0000813007354736, + "learning_rate": 0.0002953714285714285, + "loss": 0.2603, + "step": 55 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 1.0222399234771729, + "learning_rate": 0.0002949428571428571, + "loss": 0.2507, + "step": 60 + }, + { + "epoch": 0.37142857142857144, + "grad_norm": 0.896902322769165, + "learning_rate": 0.0002945142857142857, + "loss": 0.2556, + "step": 65 + }, + { + "epoch": 0.4, + "grad_norm": 0.9035541415214539, + "learning_rate": 0.00029408571428571426, + "loss": 0.2402, + "step": 70 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 1.4886469841003418, + "learning_rate": 0.00029365714285714285, + "loss": 0.2376, + "step": 75 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.8951187133789062, + "learning_rate": 0.0002932285714285714, + "loss": 0.2276, + "step": 80 + }, + { + "epoch": 0.4857142857142857, + "grad_norm": 0.7876377105712891, + "learning_rate": 0.00029279999999999996, + "loss": 0.2537, + "step": 85 + }, + { + "epoch": 0.5142857142857142, + "grad_norm": 1.0927226543426514, + "learning_rate": 0.00029237142857142855, + "loss": 0.2152, + "step": 90 + }, + { + "epoch": 0.5428571428571428, + "grad_norm": 1.4946355819702148, + "learning_rate": 0.00029194285714285713, + "loss": 0.2441, + "step": 95 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.7082991600036621, + "learning_rate": 0.0002915142857142857, + "loss": 0.2708, + "step": 100 + }, + { + "epoch": 0.6, + "grad_norm": 0.670010507106781, + "learning_rate": 0.00029108571428571424, + "loss": 0.2396, + "step": 105 + }, + { + "epoch": 0.6285714285714286, + "grad_norm": 0.9797312021255493, + "learning_rate": 0.00029065714285714283, + "loss": 0.2275, + "step": 110 + }, + { + "epoch": 0.6571428571428571, + "grad_norm": 1.5220463275909424, + "learning_rate": 0.0002902285714285714, + "loss": 0.2114, + "step": 115 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 1.3326867818832397, + "learning_rate": 0.00028979999999999994, + "loss": 0.241, + "step": 120 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 1.1195529699325562, + "learning_rate": 0.0002893714285714285, + "loss": 0.2389, + "step": 125 + }, + { + "epoch": 0.7428571428571429, + "grad_norm": 0.7551061511039734, + "learning_rate": 0.0002889428571428571, + "loss": 0.2162, + "step": 130 + }, + { + "epoch": 0.7714285714285715, + "grad_norm": 1.018908977508545, + "learning_rate": 0.0002885142857142857, + "loss": 0.1924, + "step": 135 + }, + { + "epoch": 0.8, + "grad_norm": 2.123642921447754, + "learning_rate": 0.0002880857142857143, + "loss": 0.2174, + "step": 140 + }, + { + "epoch": 0.8285714285714286, + "grad_norm": 0.7585068941116333, + "learning_rate": 0.0002876571428571428, + "loss": 0.2006, + "step": 145 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 1.64150869846344, + "learning_rate": 0.0002872285714285714, + "loss": 0.1905, + "step": 150 + }, + { + "epoch": 0.8857142857142857, + "grad_norm": 0.9126951694488525, + "learning_rate": 0.0002868, + "loss": 0.2312, + "step": 155 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 0.7278801202774048, + "learning_rate": 0.00028637142857142856, + "loss": 0.2077, + "step": 160 + }, + { + "epoch": 0.9428571428571428, + "grad_norm": 0.8931339383125305, + "learning_rate": 0.00028594285714285715, + "loss": 0.1951, + "step": 165 + }, + { + "epoch": 0.9714285714285714, + "grad_norm": 1.0831843614578247, + "learning_rate": 0.0002855142857142857, + "loss": 0.2103, + "step": 170 + }, + { + "epoch": 1.0, + "grad_norm": 1.3750063180923462, + "learning_rate": 0.00028508571428571426, + "loss": 0.2396, + "step": 175 + }, + { + "epoch": 1.0285714285714285, + "grad_norm": 0.8338337540626526, + "learning_rate": 0.00028465714285714285, + "loss": 0.2404, + "step": 180 + }, + { + "epoch": 1.0571428571428572, + "grad_norm": 1.2879024744033813, + "learning_rate": 0.0002842285714285714, + "loss": 0.2117, + "step": 185 + }, + { + "epoch": 1.0857142857142856, + "grad_norm": 1.6751821041107178, + "learning_rate": 0.00028379999999999996, + "loss": 0.1796, + "step": 190 + }, + { + "epoch": 1.1142857142857143, + "grad_norm": 0.9864417910575867, + "learning_rate": 0.00028337142857142854, + "loss": 0.1993, + "step": 195 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 1.0174155235290527, + "learning_rate": 0.00028294285714285713, + "loss": 0.2068, + "step": 200 + }, + { + "epoch": 1.1714285714285715, + "grad_norm": 1.029832124710083, + "learning_rate": 0.0002825142857142857, + "loss": 0.2015, + "step": 205 + }, + { + "epoch": 1.2, + "grad_norm": 0.7745446562767029, + "learning_rate": 0.00028208571428571424, + "loss": 0.2129, + "step": 210 + }, + { + "epoch": 1.2285714285714286, + "grad_norm": 2.5578622817993164, + "learning_rate": 0.0002816571428571428, + "loss": 0.2224, + "step": 215 + }, + { + "epoch": 1.2571428571428571, + "grad_norm": 2.4185051918029785, + "learning_rate": 0.0002812285714285714, + "loss": 0.2276, + "step": 220 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 1.4176461696624756, + "learning_rate": 0.0002808, + "loss": 0.1781, + "step": 225 + }, + { + "epoch": 1.3142857142857143, + "grad_norm": 0.709326982498169, + "learning_rate": 0.0002803714285714286, + "loss": 0.2177, + "step": 230 + }, + { + "epoch": 1.342857142857143, + "grad_norm": 0.8170766830444336, + "learning_rate": 0.0002799428571428571, + "loss": 0.1769, + "step": 235 + }, + { + "epoch": 1.3714285714285714, + "grad_norm": 1.3850761651992798, + "learning_rate": 0.0002795142857142857, + "loss": 0.2262, + "step": 240 + }, + { + "epoch": 1.4, + "grad_norm": 1.0064373016357422, + "learning_rate": 0.0002790857142857143, + "loss": 0.196, + "step": 245 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 1.9635728597640991, + "learning_rate": 0.0002786571428571428, + "loss": 0.2029, + "step": 250 + }, + { + "epoch": 1.457142857142857, + "grad_norm": 16.20791244506836, + "learning_rate": 0.0002782285714285714, + "loss": 0.3925, + "step": 255 + }, + { + "epoch": 1.4857142857142858, + "grad_norm": 1.4363322257995605, + "learning_rate": 0.0002778, + "loss": 0.3684, + "step": 260 + }, + { + "epoch": 1.5142857142857142, + "grad_norm": 0.9379534721374512, + "learning_rate": 0.00027737142857142856, + "loss": 0.2265, + "step": 265 + }, + { + "epoch": 1.5428571428571427, + "grad_norm": 0.8453512787818909, + "learning_rate": 0.00027694285714285714, + "loss": 0.1976, + "step": 270 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 2.316664695739746, + "learning_rate": 0.0002765142857142857, + "loss": 0.23, + "step": 275 + }, + { + "epoch": 1.6, + "grad_norm": 1.0548444986343384, + "learning_rate": 0.00027608571428571426, + "loss": 0.1823, + "step": 280 + }, + { + "epoch": 1.6285714285714286, + "grad_norm": 3.7894928455352783, + "learning_rate": 0.00027565714285714284, + "loss": 0.1962, + "step": 285 + }, + { + "epoch": 1.657142857142857, + "grad_norm": 2.3081610202789307, + "learning_rate": 0.00027522857142857143, + "loss": 0.2087, + "step": 290 + }, + { + "epoch": 1.6857142857142857, + "grad_norm": 0.9311438202857971, + "learning_rate": 0.0002748, + "loss": 0.1597, + "step": 295 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 1.1881247758865356, + "learning_rate": 0.00027437142857142854, + "loss": 0.1764, + "step": 300 + }, + { + "epoch": 1.7428571428571429, + "grad_norm": 1.30265212059021, + "learning_rate": 0.0002739428571428571, + "loss": 0.1647, + "step": 305 + }, + { + "epoch": 1.7714285714285714, + "grad_norm": 0.6832175850868225, + "learning_rate": 0.0002735142857142857, + "loss": 0.1638, + "step": 310 + }, + { + "epoch": 1.8, + "grad_norm": 1.8740538358688354, + "learning_rate": 0.00027308571428571424, + "loss": 0.1803, + "step": 315 + }, + { + "epoch": 1.8285714285714287, + "grad_norm": 9.821504592895508, + "learning_rate": 0.0002726571428571428, + "loss": 0.226, + "step": 320 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 1.0889750719070435, + "learning_rate": 0.0002722285714285714, + "loss": 0.1822, + "step": 325 + }, + { + "epoch": 1.8857142857142857, + "grad_norm": 0.9660868048667908, + "learning_rate": 0.0002718, + "loss": 0.1842, + "step": 330 + }, + { + "epoch": 1.9142857142857141, + "grad_norm": 0.6329234838485718, + "learning_rate": 0.0002713714285714286, + "loss": 0.1488, + "step": 335 + }, + { + "epoch": 1.9428571428571428, + "grad_norm": 3.601266384124756, + "learning_rate": 0.0002709428571428571, + "loss": 0.1887, + "step": 340 + }, + { + "epoch": 1.9714285714285715, + "grad_norm": 1.1441439390182495, + "learning_rate": 0.0002705142857142857, + "loss": 0.184, + "step": 345 + }, + { + "epoch": 2.0, + "grad_norm": 0.8586034774780273, + "learning_rate": 0.0002700857142857143, + "loss": 0.1578, + "step": 350 + }, + { + "epoch": 2.0285714285714285, + "grad_norm": 1.5113487243652344, + "learning_rate": 0.00026965714285714286, + "loss": 0.2002, + "step": 355 + }, + { + "epoch": 2.057142857142857, + "grad_norm": 1.1123011112213135, + "learning_rate": 0.0002692285714285714, + "loss": 0.1946, + "step": 360 + }, + { + "epoch": 2.085714285714286, + "grad_norm": 0.9377036094665527, + "learning_rate": 0.0002688, + "loss": 0.1971, + "step": 365 + }, + { + "epoch": 2.1142857142857143, + "grad_norm": 0.6956892609596252, + "learning_rate": 0.00026837142857142856, + "loss": 0.1758, + "step": 370 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.7510782480239868, + "learning_rate": 0.0002679428571428571, + "loss": 0.1674, + "step": 375 + }, + { + "epoch": 2.1714285714285713, + "grad_norm": 0.7009285092353821, + "learning_rate": 0.00026751428571428567, + "loss": 0.1945, + "step": 380 + }, + { + "epoch": 2.2, + "grad_norm": 0.9555609822273254, + "learning_rate": 0.00026708571428571426, + "loss": 0.1857, + "step": 385 + }, + { + "epoch": 2.2285714285714286, + "grad_norm": 2.133979082107544, + "learning_rate": 0.00026665714285714284, + "loss": 0.1636, + "step": 390 + }, + { + "epoch": 2.257142857142857, + "grad_norm": 0.7105309963226318, + "learning_rate": 0.0002662285714285714, + "loss": 0.2014, + "step": 395 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.7329701781272888, + "learning_rate": 0.00026579999999999996, + "loss": 0.1884, + "step": 400 + }, + { + "epoch": 2.314285714285714, + "grad_norm": 1.0426994562149048, + "learning_rate": 0.00026537142857142854, + "loss": 0.1558, + "step": 405 + }, + { + "epoch": 2.342857142857143, + "grad_norm": 0.9306122660636902, + "learning_rate": 0.0002649428571428571, + "loss": 0.1774, + "step": 410 + }, + { + "epoch": 2.3714285714285714, + "grad_norm": 0.6989394426345825, + "learning_rate": 0.00026451428571428565, + "loss": 0.1601, + "step": 415 + }, + { + "epoch": 2.4, + "grad_norm": 1.4383760690689087, + "learning_rate": 0.0002640857142857143, + "loss": 0.1564, + "step": 420 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 0.6448336839675903, + "learning_rate": 0.0002636571428571428, + "loss": 0.1827, + "step": 425 + }, + { + "epoch": 2.4571428571428573, + "grad_norm": 0.9535760879516602, + "learning_rate": 0.0002632285714285714, + "loss": 0.1713, + "step": 430 + }, + { + "epoch": 2.4857142857142858, + "grad_norm": 1.034945011138916, + "learning_rate": 0.0002628, + "loss": 0.1457, + "step": 435 + }, + { + "epoch": 2.5142857142857142, + "grad_norm": 1.3225128650665283, + "learning_rate": 0.0002623714285714285, + "loss": 0.1633, + "step": 440 + }, + { + "epoch": 2.5428571428571427, + "grad_norm": 0.8285059928894043, + "learning_rate": 0.0002619428571428571, + "loss": 0.2004, + "step": 445 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.773176908493042, + "learning_rate": 0.0002615142857142857, + "loss": 0.1641, + "step": 450 + }, + { + "epoch": 2.6, + "grad_norm": 0.7964853048324585, + "learning_rate": 0.0002610857142857143, + "loss": 0.1608, + "step": 455 + }, + { + "epoch": 2.6285714285714286, + "grad_norm": 1.0967328548431396, + "learning_rate": 0.00026065714285714286, + "loss": 0.1697, + "step": 460 + }, + { + "epoch": 2.657142857142857, + "grad_norm": 0.6462066173553467, + "learning_rate": 0.0002602285714285714, + "loss": 0.1512, + "step": 465 + }, + { + "epoch": 2.685714285714286, + "grad_norm": 0.8765937089920044, + "learning_rate": 0.00025979999999999997, + "loss": 0.1826, + "step": 470 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 1.2524124383926392, + "learning_rate": 0.00025937142857142856, + "loss": 0.1731, + "step": 475 + }, + { + "epoch": 2.742857142857143, + "grad_norm": 2.2982606887817383, + "learning_rate": 0.0002589428571428571, + "loss": 0.1852, + "step": 480 + }, + { + "epoch": 2.7714285714285714, + "grad_norm": 0.9989053010940552, + "learning_rate": 0.0002585142857142857, + "loss": 0.1791, + "step": 485 + }, + { + "epoch": 2.8, + "grad_norm": 0.772343635559082, + "learning_rate": 0.00025808571428571426, + "loss": 0.1862, + "step": 490 + }, + { + "epoch": 2.8285714285714287, + "grad_norm": 1.2101136445999146, + "learning_rate": 0.00025765714285714284, + "loss": 0.1806, + "step": 495 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.8010189533233643, + "learning_rate": 0.0002572285714285714, + "loss": 0.1842, + "step": 500 + } + ], + "logging_steps": 5, + "max_steps": 3500, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 200, + "trial_name": null, + "trial_params": null +} diff --git a/glot-contrastive-final-lora/checkpoint-500/training_args.bin b/glot-contrastive-final-lora/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..504636d7db81a728b5309e791498748e16c3b8ec --- /dev/null +++ b/glot-contrastive-final-lora/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02a87dc6b2c67ad3df98065b9e8fa21d9d93cd2cb361c532cb83c8a37bdc81a3 +size 5777 diff --git a/glot-contrastive-final-lora/sentencepiece.bpe.model b/glot-contrastive-final-lora/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..41409bcc76133d6e92fdd5909093d24e32662ba0 --- /dev/null +++ b/glot-contrastive-final-lora/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a313a26470baedaede322622492f2a542aa41527ddc5d40de444e945ad3c613 +size 7658320 diff --git a/glot-contrastive-final-lora/special_tokens_map.json b/glot-contrastive-final-lora/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..68171d1ff68b731a33d119708476692c094a466b --- /dev/null +++ b/glot-contrastive-final-lora/special_tokens_map.json @@ -0,0 +1,15 @@ +{ + "bos_token": "", + "cls_token": "", + "eos_token": "", + "mask_token": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "sep_token": "", + "unk_token": "" +} diff --git a/glot-contrastive-final-lora/tokenizer_config.json b/glot-contrastive-final-lora/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2d4a02cfefad7bc6d993905187616acfc547f17d --- /dev/null +++ b/glot-contrastive-final-lora/tokenizer_config.json @@ -0,0 +1,57 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "401144": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "cls_token": "", + "eos_token": "", + "extra_special_tokens": {}, + "mask_token": "", + "model_max_length": 512, + "pad_token": "", + "sep_token": "", + "sp_model_kwargs": {}, + "tokenizer_class": "XLMRobertaTokenizer", + "unk_token": "", + "use_fast": true +} diff --git a/glot-contrastive-final-lora/training_args.bin b/glot-contrastive-final-lora/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..504636d7db81a728b5309e791498748e16c3b8ec --- /dev/null +++ b/glot-contrastive-final-lora/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02a87dc6b2c67ad3df98065b9e8fa21d9d93cd2cb361c532cb83c8a37bdc81a3 +size 5777