diff --git a/checkpoint-4000/README.md b/checkpoint-4000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c7b89968043c4a4cf38dcac1f9bc557c35da3883 --- /dev/null +++ b/checkpoint-4000/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/ibru/.cache/huggingface/hub/models--nvidia--GR00T-N1-2B/snapshots/32e1fd2507f7739fad443e6b449c8188e0e02fcb +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-4000/adapter_config.json b/checkpoint-4000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8f45e5c825b3b34b334d049ddf8e68e52a500cc6 --- /dev/null +++ b/checkpoint-4000/adapter_config.json @@ -0,0 +1,36 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/ibru/.cache/huggingface/hub/models--nvidia--GR00T-N1-2B/snapshots/32e1fd2507f7739fad443e6b449c8188e0e02fcb", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "to_k", + "to_q", + "v_proj", + "to_v" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-4000/adapter_model.safetensors b/checkpoint-4000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0de93a061edf7f2ef9ecdb128f2258e2b486c576 --- /dev/null +++ b/checkpoint-4000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf9900938297f10ca5bbfd3aea42b01ed8d414c257d79e23cf99c5abd7400bb4 +size 123328576 diff --git a/checkpoint-4000/experiment_cfg/metadata.json b/checkpoint-4000/experiment_cfg/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..40302046074c7e429ab3933ad6b163f9735902de --- /dev/null +++ b/checkpoint-4000/experiment_cfg/metadata.json @@ -0,0 +1,275 @@ +{ + "new_embodiment": { + "statistics": { + "state": { + "single_arm": { + "max": [ + 47.021484375, + 135.263671875, + 178.505859375, + 78.3984375, + 56.77734375 + ], + "min": [ + -25.576171875, + 46.93359375, + 89.736328125, + -30.41015625, + -77.607421875 + ], + "mean": [ + 7.780572414398193, + 121.54933166503906, + 145.44825744628906, + 26.051393508911133, + -12.748016357421875 + ], + "std": [ + 11.060831069946289, + 21.937597274780273, + 17.16187286376953, + 19.231945037841797, + 14.66512680053711 + ], + "q01": [ + -17.578125, + 58.0078125, + 97.998046875, + -13.447265625, + -39.9005859375 + ], + "q99": [ + 36.650390625, + 134.47265625, + 178.41796875, + 66.65009765625, + 40.166015625 + ] + }, + "gripper": { + "max": [ + 52.22222137451172 + ], + "min": [ + -3.846153974533081 + ], + "mean": [ + 10.933439254760742 + ], + "std": [ + 15.509913444519043 + ], + "q01": [ + -3.846153974533081 + ], + "q99": [ + 51.02564239501953 + ] + }, + "mobile_base": { + "max": [ + 75.42072296142578, + 276.7638244628906, + 93.75 + ], + "min": [ + -170.01620483398438, + -274.5497131347656, + -93.75 + ], + "mean": [ + -0.31241804361343384, + 58.99717712402344, + 2.4293017387390137 + ], + "std": [ + 10.56183910369873, + 119.39802551269531, + 22.590484619140625 + ], + "q01": [ + -33.65809627532959, + -265.6932678222656, + -72.849609375 + ], + "q99": [ + 30.679615020751953, + 270.1214904785156, + 90.234375 + ] + } + }, + "action": { + "single_arm": { + "max": [ + 37.96875, + 135.087890625, + 179.384765625, + 78.837890625, + 57.392578125 + ], + "min": [ + -26.279296875, + 47.373046875, + 89.912109375, + -31.640625, + -77.16796875 + ], + "mean": [ + 8.038639068603516, + 122.76031494140625, + 145.15855407714844, + 26.28432846069336, + -13.195321083068848 + ], + "std": [ + 11.36032772064209, + 21.925451278686523, + 17.071842193603516, + 19.503877639770508, + 14.882487297058105 + ], + "q01": [ + -18.10546875, + 58.623046875, + 98.26171875, + -14.326171875, + -40.078125 + ], + "q99": [ + 37.44140625, + 135.087890625, + 179.296875, + 67.1484375, + 40.869140625 + ] + }, + "gripper": { + "max": [ + 52.646484375 + ], + "min": [ + -10.72265625 + ], + "mean": [ + 4.366570949554443 + ], + "std": [ + 18.90865707397461 + ], + "q01": [ + -10.546875 + ], + "q99": [ + 51.767578125 + ] + }, + "mobile_base": { + "max": [ + 230.0971221923828, + 265.6932678222656, + 90.0 + ], + "min": [ + -230.0971221923828, + -265.6932678222656, + -90.0 + ], + "mean": [ + -0.36507830023765564, + 60.13115310668945, + 2.5394127368927 + ], + "std": [ + 15.02155590057373, + 129.06507873535156, + 27.82071304321289 + ], + "q01": [ + -0.02556634694337845, + -265.6932678222656, + -90.0 + ], + "q99": [ + 0.02556634694337845, + 265.6932678222656, + 90.0 + ] + } + } + }, + "modalities": { + "video": { + "wrist": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + }, + "front": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + } + }, + "state": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + }, + "mobile_base": { + "absolute": true, + "rotation_type": null, + "shape": [ + 3 + ], + "continuous": true + } + }, + "action": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + }, + "mobile_base": { + "absolute": true, + "rotation_type": null, + "shape": [ + 3 + ], + "continuous": true + } + } + }, + "embodiment_tag": "new_embodiment" + } +} \ No newline at end of file diff --git a/checkpoint-4000/optimizer.pt b/checkpoint-4000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e94d4a79268232ba173bd561e08a45b966a441c --- /dev/null +++ b/checkpoint-4000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8eeb6962209af3ac5e5780010e2e1e0f341c4fb0db66c4b4d9e43149a9fa7c5c +size 246824634 diff --git a/checkpoint-4000/rng_state.pth b/checkpoint-4000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..984f2765c38c6ff454f047e0fdf6eeb0c67f2995 --- /dev/null +++ b/checkpoint-4000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff41b94b6c01cfd2a73909eff11b53105955ebf408577b11fc3fb83ad491ea1f +size 14244 diff --git a/checkpoint-4000/scheduler.pt b/checkpoint-4000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc86c19600caaefd6e16d883ff4efc6b32839712 --- /dev/null +++ b/checkpoint-4000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:217d233fe11f3792edfe1d9ed6553ae6ac505268a32c9fdc6ca353c28974200b +size 1064 diff --git a/checkpoint-4000/trainer_state.json b/checkpoint-4000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..718fbf0cade2a1a134b08f9133b21dade52edb23 --- /dev/null +++ b/checkpoint-4000/trainer_state.json @@ -0,0 +1,2833 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 14.925373134328359, + "eval_steps": 500, + "global_step": 4000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03731343283582089, + "grad_norm": 0.8186072111129761, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.3847, + "step": 10 + }, + { + "epoch": 0.07462686567164178, + "grad_norm": 0.5007426142692566, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.4283, + "step": 20 + }, + { + "epoch": 0.11194029850746269, + "grad_norm": 0.49460887908935547, + "learning_rate": 3e-06, + "loss": 1.4868, + "step": 30 + }, + { + "epoch": 0.14925373134328357, + "grad_norm": 0.5032920837402344, + "learning_rate": 4.000000000000001e-06, + "loss": 1.4491, + "step": 40 + }, + { + "epoch": 0.1865671641791045, + "grad_norm": 0.5688469409942627, + "learning_rate": 5e-06, + "loss": 1.3703, + "step": 50 + }, + { + "epoch": 0.22388059701492538, + "grad_norm": 0.5052517652511597, + "learning_rate": 6e-06, + "loss": 1.419, + "step": 60 + }, + { + "epoch": 0.26119402985074625, + "grad_norm": 0.6315643787384033, + "learning_rate": 7.000000000000001e-06, + "loss": 1.3058, + "step": 70 + }, + { + "epoch": 0.29850746268656714, + "grad_norm": 0.6060447692871094, + "learning_rate": 8.000000000000001e-06, + "loss": 1.2908, + "step": 80 + }, + { + "epoch": 0.3358208955223881, + "grad_norm": 0.5513179302215576, + "learning_rate": 9e-06, + "loss": 1.2311, + "step": 90 + }, + { + "epoch": 0.373134328358209, + "grad_norm": 0.8467404246330261, + "learning_rate": 1e-05, + "loss": 1.2043, + "step": 100 + }, + { + "epoch": 0.41044776119402987, + "grad_norm": 0.8141824007034302, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.0707, + "step": 110 + }, + { + "epoch": 0.44776119402985076, + "grad_norm": 0.7932347059249878, + "learning_rate": 1.2e-05, + "loss": 0.9377, + "step": 120 + }, + { + "epoch": 0.48507462686567165, + "grad_norm": 0.684220552444458, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.714, + "step": 130 + }, + { + "epoch": 0.5223880597014925, + "grad_norm": 0.5886895060539246, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.6479, + "step": 140 + }, + { + "epoch": 0.5597014925373134, + "grad_norm": 0.4764939248561859, + "learning_rate": 1.5e-05, + "loss": 0.5463, + "step": 150 + }, + { + "epoch": 0.5970149253731343, + "grad_norm": 0.4621008038520813, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.4641, + "step": 160 + }, + { + "epoch": 0.6343283582089553, + "grad_norm": 0.46492910385131836, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.4159, + "step": 170 + }, + { + "epoch": 0.6716417910447762, + "grad_norm": 0.5017415881156921, + "learning_rate": 1.8e-05, + "loss": 0.4094, + "step": 180 + }, + { + "epoch": 0.7089552238805971, + "grad_norm": 0.34392210841178894, + "learning_rate": 1.9e-05, + "loss": 0.3478, + "step": 190 + }, + { + "epoch": 0.746268656716418, + "grad_norm": 0.3240516483783722, + "learning_rate": 2e-05, + "loss": 0.3821, + "step": 200 + }, + { + "epoch": 0.7835820895522388, + "grad_norm": 0.26301339268684387, + "learning_rate": 2.1e-05, + "loss": 0.3606, + "step": 210 + }, + { + "epoch": 0.8208955223880597, + "grad_norm": 0.34712520241737366, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.3421, + "step": 220 + }, + { + "epoch": 0.8582089552238806, + "grad_norm": 0.3248469829559326, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.3389, + "step": 230 + }, + { + "epoch": 0.8955223880597015, + "grad_norm": 0.298149436712265, + "learning_rate": 2.4e-05, + "loss": 0.3145, + "step": 240 + }, + { + "epoch": 0.9328358208955224, + "grad_norm": 0.2757190763950348, + "learning_rate": 2.5e-05, + "loss": 0.3065, + "step": 250 + }, + { + "epoch": 0.9701492537313433, + "grad_norm": 0.30510950088500977, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.2971, + "step": 260 + }, + { + "epoch": 1.007462686567164, + "grad_norm": 0.37349891662597656, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.3273, + "step": 270 + }, + { + "epoch": 1.044776119402985, + "grad_norm": 0.3667634129524231, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.308, + "step": 280 + }, + { + "epoch": 1.0820895522388059, + "grad_norm": 0.3463355004787445, + "learning_rate": 2.9e-05, + "loss": 0.3109, + "step": 290 + }, + { + "epoch": 1.1194029850746268, + "grad_norm": 0.3888525366783142, + "learning_rate": 3e-05, + "loss": 0.2644, + "step": 300 + }, + { + "epoch": 1.1567164179104479, + "grad_norm": 0.3749147951602936, + "learning_rate": 3.1e-05, + "loss": 0.2858, + "step": 310 + }, + { + "epoch": 1.1940298507462686, + "grad_norm": 0.3270276188850403, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.2573, + "step": 320 + }, + { + "epoch": 1.2313432835820897, + "grad_norm": 0.3658592998981476, + "learning_rate": 3.3e-05, + "loss": 0.2613, + "step": 330 + }, + { + "epoch": 1.2686567164179103, + "grad_norm": 0.3526328206062317, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.2328, + "step": 340 + }, + { + "epoch": 1.3059701492537314, + "grad_norm": 0.4528139531612396, + "learning_rate": 3.5e-05, + "loss": 0.2429, + "step": 350 + }, + { + "epoch": 1.3432835820895521, + "grad_norm": 0.5426791310310364, + "learning_rate": 3.6e-05, + "loss": 0.2209, + "step": 360 + }, + { + "epoch": 1.3805970149253732, + "grad_norm": 0.41844552755355835, + "learning_rate": 3.7e-05, + "loss": 0.2319, + "step": 370 + }, + { + "epoch": 1.417910447761194, + "grad_norm": 0.4749431908130646, + "learning_rate": 3.8e-05, + "loss": 0.2233, + "step": 380 + }, + { + "epoch": 1.455223880597015, + "grad_norm": 0.7010189890861511, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.2181, + "step": 390 + }, + { + "epoch": 1.4925373134328357, + "grad_norm": 0.5747635960578918, + "learning_rate": 4e-05, + "loss": 0.213, + "step": 400 + }, + { + "epoch": 1.5298507462686568, + "grad_norm": 0.3661474287509918, + "learning_rate": 4.1e-05, + "loss": 0.2171, + "step": 410 + }, + { + "epoch": 1.5671641791044775, + "grad_norm": 0.467835396528244, + "learning_rate": 4.2e-05, + "loss": 0.1985, + "step": 420 + }, + { + "epoch": 1.6044776119402986, + "grad_norm": 0.5470123291015625, + "learning_rate": 4.3e-05, + "loss": 0.2176, + "step": 430 + }, + { + "epoch": 1.6417910447761193, + "grad_norm": 0.5761199593544006, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.2007, + "step": 440 + }, + { + "epoch": 1.6791044776119404, + "grad_norm": 0.48257485032081604, + "learning_rate": 4.5e-05, + "loss": 0.2043, + "step": 450 + }, + { + "epoch": 1.716417910447761, + "grad_norm": 0.48353052139282227, + "learning_rate": 4.600000000000001e-05, + "loss": 0.1872, + "step": 460 + }, + { + "epoch": 1.7537313432835822, + "grad_norm": 0.4388391375541687, + "learning_rate": 4.7e-05, + "loss": 0.206, + "step": 470 + }, + { + "epoch": 1.7910447761194028, + "grad_norm": 0.47332626581192017, + "learning_rate": 4.8e-05, + "loss": 0.1876, + "step": 480 + }, + { + "epoch": 1.828358208955224, + "grad_norm": 0.8053535223007202, + "learning_rate": 4.9e-05, + "loss": 0.1839, + "step": 490 + }, + { + "epoch": 1.8656716417910446, + "grad_norm": 0.413979709148407, + "learning_rate": 5e-05, + "loss": 0.1732, + "step": 500 + }, + { + "epoch": 1.9029850746268657, + "grad_norm": 0.36910712718963623, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.1827, + "step": 510 + }, + { + "epoch": 1.9402985074626866, + "grad_norm": 0.8458298444747925, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.1727, + "step": 520 + }, + { + "epoch": 1.9776119402985075, + "grad_norm": 0.5452115535736084, + "learning_rate": 5.300000000000001e-05, + "loss": 0.1818, + "step": 530 + }, + { + "epoch": 2.014925373134328, + "grad_norm": 0.4518108069896698, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.177, + "step": 540 + }, + { + "epoch": 2.0522388059701493, + "grad_norm": 0.66865074634552, + "learning_rate": 5.500000000000001e-05, + "loss": 0.1726, + "step": 550 + }, + { + "epoch": 2.08955223880597, + "grad_norm": 0.6536034345626831, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.1541, + "step": 560 + }, + { + "epoch": 2.126865671641791, + "grad_norm": 0.5571377277374268, + "learning_rate": 5.6999999999999996e-05, + "loss": 0.1671, + "step": 570 + }, + { + "epoch": 2.1641791044776117, + "grad_norm": 0.5385546684265137, + "learning_rate": 5.8e-05, + "loss": 0.1582, + "step": 580 + }, + { + "epoch": 2.201492537313433, + "grad_norm": 0.577961266040802, + "learning_rate": 5.9e-05, + "loss": 0.1528, + "step": 590 + }, + { + "epoch": 2.2388059701492535, + "grad_norm": 0.5082416534423828, + "learning_rate": 6e-05, + "loss": 0.1638, + "step": 600 + }, + { + "epoch": 2.2761194029850746, + "grad_norm": 0.5490861535072327, + "learning_rate": 6.1e-05, + "loss": 0.166, + "step": 610 + }, + { + "epoch": 2.3134328358208958, + "grad_norm": 0.492366760969162, + "learning_rate": 6.2e-05, + "loss": 0.1481, + "step": 620 + }, + { + "epoch": 2.3507462686567164, + "grad_norm": 0.3702855110168457, + "learning_rate": 6.3e-05, + "loss": 0.1514, + "step": 630 + }, + { + "epoch": 2.388059701492537, + "grad_norm": 0.664667010307312, + "learning_rate": 6.400000000000001e-05, + "loss": 0.1441, + "step": 640 + }, + { + "epoch": 2.425373134328358, + "grad_norm": 0.33382174372673035, + "learning_rate": 6.500000000000001e-05, + "loss": 0.1573, + "step": 650 + }, + { + "epoch": 2.4626865671641793, + "grad_norm": 0.4848814010620117, + "learning_rate": 6.6e-05, + "loss": 0.1457, + "step": 660 + }, + { + "epoch": 2.5, + "grad_norm": 0.3649997413158417, + "learning_rate": 6.7e-05, + "loss": 0.1467, + "step": 670 + }, + { + "epoch": 2.5373134328358207, + "grad_norm": 0.6385223865509033, + "learning_rate": 6.800000000000001e-05, + "loss": 0.145, + "step": 680 + }, + { + "epoch": 2.574626865671642, + "grad_norm": 0.4580625891685486, + "learning_rate": 6.9e-05, + "loss": 0.1352, + "step": 690 + }, + { + "epoch": 2.611940298507463, + "grad_norm": 0.5141746401786804, + "learning_rate": 7e-05, + "loss": 0.1444, + "step": 700 + }, + { + "epoch": 2.6492537313432836, + "grad_norm": 0.40220722556114197, + "learning_rate": 7.1e-05, + "loss": 0.1493, + "step": 710 + }, + { + "epoch": 2.6865671641791042, + "grad_norm": 0.5510571002960205, + "learning_rate": 7.2e-05, + "loss": 0.1387, + "step": 720 + }, + { + "epoch": 2.7238805970149254, + "grad_norm": 0.43814659118652344, + "learning_rate": 7.3e-05, + "loss": 0.1374, + "step": 730 + }, + { + "epoch": 2.7611940298507465, + "grad_norm": 0.4118008613586426, + "learning_rate": 7.4e-05, + "loss": 0.1297, + "step": 740 + }, + { + "epoch": 2.798507462686567, + "grad_norm": 0.5626503229141235, + "learning_rate": 7.500000000000001e-05, + "loss": 0.1299, + "step": 750 + }, + { + "epoch": 2.835820895522388, + "grad_norm": 0.4066360592842102, + "learning_rate": 7.6e-05, + "loss": 0.1102, + "step": 760 + }, + { + "epoch": 2.873134328358209, + "grad_norm": 0.47184985876083374, + "learning_rate": 7.7e-05, + "loss": 0.1219, + "step": 770 + }, + { + "epoch": 2.91044776119403, + "grad_norm": 0.6611475348472595, + "learning_rate": 7.800000000000001e-05, + "loss": 0.1267, + "step": 780 + }, + { + "epoch": 2.9477611940298507, + "grad_norm": 0.3570108413696289, + "learning_rate": 7.900000000000001e-05, + "loss": 0.1191, + "step": 790 + }, + { + "epoch": 2.9850746268656714, + "grad_norm": 0.4581681489944458, + "learning_rate": 8e-05, + "loss": 0.1209, + "step": 800 + }, + { + "epoch": 3.0223880597014925, + "grad_norm": 0.4643435776233673, + "learning_rate": 8.1e-05, + "loss": 0.129, + "step": 810 + }, + { + "epoch": 3.0597014925373136, + "grad_norm": 0.5595763921737671, + "learning_rate": 8.2e-05, + "loss": 0.1158, + "step": 820 + }, + { + "epoch": 3.0970149253731343, + "grad_norm": 0.48848605155944824, + "learning_rate": 8.3e-05, + "loss": 0.1188, + "step": 830 + }, + { + "epoch": 3.1343283582089554, + "grad_norm": 0.4496570825576782, + "learning_rate": 8.4e-05, + "loss": 0.114, + "step": 840 + }, + { + "epoch": 3.171641791044776, + "grad_norm": 0.31364986300468445, + "learning_rate": 8.5e-05, + "loss": 0.1196, + "step": 850 + }, + { + "epoch": 3.208955223880597, + "grad_norm": 0.3395878076553345, + "learning_rate": 8.6e-05, + "loss": 0.1124, + "step": 860 + }, + { + "epoch": 3.246268656716418, + "grad_norm": 0.4917413592338562, + "learning_rate": 8.7e-05, + "loss": 0.1074, + "step": 870 + }, + { + "epoch": 3.283582089552239, + "grad_norm": 0.44114553928375244, + "learning_rate": 8.800000000000001e-05, + "loss": 0.1095, + "step": 880 + }, + { + "epoch": 3.3208955223880596, + "grad_norm": 0.3323831558227539, + "learning_rate": 8.900000000000001e-05, + "loss": 0.106, + "step": 890 + }, + { + "epoch": 3.3582089552238807, + "grad_norm": 0.4495660066604614, + "learning_rate": 9e-05, + "loss": 0.1222, + "step": 900 + }, + { + "epoch": 3.3955223880597014, + "grad_norm": 0.40784788131713867, + "learning_rate": 9.1e-05, + "loss": 0.1048, + "step": 910 + }, + { + "epoch": 3.4328358208955225, + "grad_norm": 0.4643700420856476, + "learning_rate": 9.200000000000001e-05, + "loss": 0.1097, + "step": 920 + }, + { + "epoch": 3.470149253731343, + "grad_norm": 0.472494512796402, + "learning_rate": 9.300000000000001e-05, + "loss": 0.1041, + "step": 930 + }, + { + "epoch": 3.5074626865671643, + "grad_norm": 0.6110897660255432, + "learning_rate": 9.4e-05, + "loss": 0.0959, + "step": 940 + }, + { + "epoch": 3.544776119402985, + "grad_norm": 0.5313069820404053, + "learning_rate": 9.5e-05, + "loss": 0.113, + "step": 950 + }, + { + "epoch": 3.582089552238806, + "grad_norm": 0.4223133623600006, + "learning_rate": 9.6e-05, + "loss": 0.099, + "step": 960 + }, + { + "epoch": 3.6194029850746268, + "grad_norm": 0.5464731454849243, + "learning_rate": 9.7e-05, + "loss": 0.1008, + "step": 970 + }, + { + "epoch": 3.656716417910448, + "grad_norm": 0.3538314402103424, + "learning_rate": 9.8e-05, + "loss": 0.1049, + "step": 980 + }, + { + "epoch": 3.6940298507462686, + "grad_norm": 0.7460148334503174, + "learning_rate": 9.900000000000001e-05, + "loss": 0.1088, + "step": 990 + }, + { + "epoch": 3.7313432835820897, + "grad_norm": 0.3210597038269043, + "learning_rate": 0.0001, + "loss": 0.1041, + "step": 1000 + }, + { + "epoch": 3.7686567164179103, + "grad_norm": 0.4450497627258301, + "learning_rate": 9.999993165095463e-05, + "loss": 0.0985, + "step": 1010 + }, + { + "epoch": 3.8059701492537314, + "grad_norm": 0.4348960816860199, + "learning_rate": 9.999972660400536e-05, + "loss": 0.1015, + "step": 1020 + }, + { + "epoch": 3.843283582089552, + "grad_norm": 0.462782621383667, + "learning_rate": 9.999938485971279e-05, + "loss": 0.1068, + "step": 1030 + }, + { + "epoch": 3.8805970149253732, + "grad_norm": 0.3801368474960327, + "learning_rate": 9.999890641901125e-05, + "loss": 0.1117, + "step": 1040 + }, + { + "epoch": 3.917910447761194, + "grad_norm": 0.45135366916656494, + "learning_rate": 9.999829128320874e-05, + "loss": 0.0917, + "step": 1050 + }, + { + "epoch": 3.955223880597015, + "grad_norm": 0.41138389706611633, + "learning_rate": 9.999753945398704e-05, + "loss": 0.1049, + "step": 1060 + }, + { + "epoch": 3.9925373134328357, + "grad_norm": 0.4976252317428589, + "learning_rate": 9.999665093340165e-05, + "loss": 0.1029, + "step": 1070 + }, + { + "epoch": 4.029850746268656, + "grad_norm": 0.46372008323669434, + "learning_rate": 9.99956257238817e-05, + "loss": 0.1012, + "step": 1080 + }, + { + "epoch": 4.067164179104478, + "grad_norm": 0.546938955783844, + "learning_rate": 9.999446382823013e-05, + "loss": 0.0829, + "step": 1090 + }, + { + "epoch": 4.104477611940299, + "grad_norm": 0.40513405203819275, + "learning_rate": 9.999316524962345e-05, + "loss": 0.0933, + "step": 1100 + }, + { + "epoch": 4.141791044776119, + "grad_norm": 0.4198484420776367, + "learning_rate": 9.999172999161198e-05, + "loss": 0.0895, + "step": 1110 + }, + { + "epoch": 4.17910447761194, + "grad_norm": 0.3965628743171692, + "learning_rate": 9.999015805811965e-05, + "loss": 0.0917, + "step": 1120 + }, + { + "epoch": 4.2164179104477615, + "grad_norm": 0.3095884621143341, + "learning_rate": 9.998844945344405e-05, + "loss": 0.0953, + "step": 1130 + }, + { + "epoch": 4.253731343283582, + "grad_norm": 0.7962276339530945, + "learning_rate": 9.998660418225645e-05, + "loss": 0.0979, + "step": 1140 + }, + { + "epoch": 4.291044776119403, + "grad_norm": 0.42066490650177, + "learning_rate": 9.998462224960175e-05, + "loss": 0.099, + "step": 1150 + }, + { + "epoch": 4.3283582089552235, + "grad_norm": 0.3894193470478058, + "learning_rate": 9.998250366089848e-05, + "loss": 0.0887, + "step": 1160 + }, + { + "epoch": 4.365671641791045, + "grad_norm": 0.28998032212257385, + "learning_rate": 9.998024842193876e-05, + "loss": 0.0943, + "step": 1170 + }, + { + "epoch": 4.402985074626866, + "grad_norm": 0.3919823467731476, + "learning_rate": 9.997785653888835e-05, + "loss": 0.0916, + "step": 1180 + }, + { + "epoch": 4.440298507462686, + "grad_norm": 0.3708650469779968, + "learning_rate": 9.997532801828658e-05, + "loss": 0.0858, + "step": 1190 + }, + { + "epoch": 4.477611940298507, + "grad_norm": 0.2935069799423218, + "learning_rate": 9.997266286704631e-05, + "loss": 0.0992, + "step": 1200 + }, + { + "epoch": 4.514925373134329, + "grad_norm": 0.4675377607345581, + "learning_rate": 9.996986109245395e-05, + "loss": 0.0854, + "step": 1210 + }, + { + "epoch": 4.552238805970149, + "grad_norm": 0.31374865770339966, + "learning_rate": 9.996692270216947e-05, + "loss": 0.0788, + "step": 1220 + }, + { + "epoch": 4.58955223880597, + "grad_norm": 0.419249951839447, + "learning_rate": 9.996384770422629e-05, + "loss": 0.0873, + "step": 1230 + }, + { + "epoch": 4.6268656716417915, + "grad_norm": 0.26002731919288635, + "learning_rate": 9.996063610703137e-05, + "loss": 0.0845, + "step": 1240 + }, + { + "epoch": 4.664179104477612, + "grad_norm": 0.29573896527290344, + "learning_rate": 9.995728791936504e-05, + "loss": 0.091, + "step": 1250 + }, + { + "epoch": 4.701492537313433, + "grad_norm": 0.33090147376060486, + "learning_rate": 9.995380315038119e-05, + "loss": 0.0827, + "step": 1260 + }, + { + "epoch": 4.7388059701492535, + "grad_norm": 0.24417485296726227, + "learning_rate": 9.9950181809607e-05, + "loss": 0.0859, + "step": 1270 + }, + { + "epoch": 4.776119402985074, + "grad_norm": 0.48290401697158813, + "learning_rate": 9.994642390694308e-05, + "loss": 0.0889, + "step": 1280 + }, + { + "epoch": 4.813432835820896, + "grad_norm": 0.4479697048664093, + "learning_rate": 9.99425294526634e-05, + "loss": 0.097, + "step": 1290 + }, + { + "epoch": 4.850746268656716, + "grad_norm": 0.3560147285461426, + "learning_rate": 9.993849845741524e-05, + "loss": 0.0904, + "step": 1300 + }, + { + "epoch": 4.888059701492537, + "grad_norm": 0.6645416617393494, + "learning_rate": 9.99343309322192e-05, + "loss": 0.0922, + "step": 1310 + }, + { + "epoch": 4.925373134328359, + "grad_norm": 0.29696759581565857, + "learning_rate": 9.993002688846913e-05, + "loss": 0.093, + "step": 1320 + }, + { + "epoch": 4.962686567164179, + "grad_norm": 0.47146692872047424, + "learning_rate": 9.992558633793212e-05, + "loss": 0.085, + "step": 1330 + }, + { + "epoch": 5.0, + "grad_norm": 0.3430916368961334, + "learning_rate": 9.992100929274846e-05, + "loss": 0.0805, + "step": 1340 + }, + { + "epoch": 5.037313432835821, + "grad_norm": 0.3205055892467499, + "learning_rate": 9.991629576543163e-05, + "loss": 0.0766, + "step": 1350 + }, + { + "epoch": 5.074626865671641, + "grad_norm": 0.3664805293083191, + "learning_rate": 9.991144576886823e-05, + "loss": 0.0766, + "step": 1360 + }, + { + "epoch": 5.111940298507463, + "grad_norm": 0.3753412663936615, + "learning_rate": 9.990645931631796e-05, + "loss": 0.0688, + "step": 1370 + }, + { + "epoch": 5.149253731343284, + "grad_norm": 0.31633055210113525, + "learning_rate": 9.990133642141359e-05, + "loss": 0.0796, + "step": 1380 + }, + { + "epoch": 5.186567164179104, + "grad_norm": 0.3355732262134552, + "learning_rate": 9.989607709816091e-05, + "loss": 0.0716, + "step": 1390 + }, + { + "epoch": 5.223880597014926, + "grad_norm": 0.24850831925868988, + "learning_rate": 9.989068136093873e-05, + "loss": 0.0778, + "step": 1400 + }, + { + "epoch": 5.2611940298507465, + "grad_norm": 0.29537102580070496, + "learning_rate": 9.988514922449879e-05, + "loss": 0.0759, + "step": 1410 + }, + { + "epoch": 5.298507462686567, + "grad_norm": 0.3430945873260498, + "learning_rate": 9.987948070396571e-05, + "loss": 0.0774, + "step": 1420 + }, + { + "epoch": 5.335820895522388, + "grad_norm": 0.5220637917518616, + "learning_rate": 9.987367581483705e-05, + "loss": 0.0836, + "step": 1430 + }, + { + "epoch": 5.373134328358209, + "grad_norm": 0.28184008598327637, + "learning_rate": 9.986773457298311e-05, + "loss": 0.0752, + "step": 1440 + }, + { + "epoch": 5.41044776119403, + "grad_norm": 0.36261311173439026, + "learning_rate": 9.986165699464705e-05, + "loss": 0.075, + "step": 1450 + }, + { + "epoch": 5.447761194029851, + "grad_norm": 0.5107380151748657, + "learning_rate": 9.985544309644475e-05, + "loss": 0.0814, + "step": 1460 + }, + { + "epoch": 5.485074626865671, + "grad_norm": 0.2446671426296234, + "learning_rate": 9.984909289536473e-05, + "loss": 0.0704, + "step": 1470 + }, + { + "epoch": 5.522388059701493, + "grad_norm": 0.30449381470680237, + "learning_rate": 9.984260640876821e-05, + "loss": 0.0794, + "step": 1480 + }, + { + "epoch": 5.559701492537314, + "grad_norm": 0.25645050406455994, + "learning_rate": 9.983598365438902e-05, + "loss": 0.0709, + "step": 1490 + }, + { + "epoch": 5.597014925373134, + "grad_norm": 0.23825006186962128, + "learning_rate": 9.98292246503335e-05, + "loss": 0.0828, + "step": 1500 + }, + { + "epoch": 5.634328358208955, + "grad_norm": 0.3259269893169403, + "learning_rate": 9.98223294150805e-05, + "loss": 0.0824, + "step": 1510 + }, + { + "epoch": 5.6716417910447765, + "grad_norm": 0.24058914184570312, + "learning_rate": 9.981529796748134e-05, + "loss": 0.073, + "step": 1520 + }, + { + "epoch": 5.708955223880597, + "grad_norm": 0.34457242488861084, + "learning_rate": 9.980813032675974e-05, + "loss": 0.0845, + "step": 1530 + }, + { + "epoch": 5.746268656716418, + "grad_norm": 0.32940393686294556, + "learning_rate": 9.980082651251175e-05, + "loss": 0.0832, + "step": 1540 + }, + { + "epoch": 5.7835820895522385, + "grad_norm": 0.5683007836341858, + "learning_rate": 9.979338654470569e-05, + "loss": 0.0836, + "step": 1550 + }, + { + "epoch": 5.82089552238806, + "grad_norm": 0.31041061878204346, + "learning_rate": 9.97858104436822e-05, + "loss": 0.07, + "step": 1560 + }, + { + "epoch": 5.858208955223881, + "grad_norm": 0.37858131527900696, + "learning_rate": 9.977809823015401e-05, + "loss": 0.0738, + "step": 1570 + }, + { + "epoch": 5.895522388059701, + "grad_norm": 0.2743091583251953, + "learning_rate": 9.977024992520602e-05, + "loss": 0.0761, + "step": 1580 + }, + { + "epoch": 5.932835820895522, + "grad_norm": 0.29117098450660706, + "learning_rate": 9.976226555029522e-05, + "loss": 0.0777, + "step": 1590 + }, + { + "epoch": 5.970149253731344, + "grad_norm": 0.31398633122444153, + "learning_rate": 9.975414512725057e-05, + "loss": 0.0664, + "step": 1600 + }, + { + "epoch": 6.007462686567164, + "grad_norm": 0.2684272527694702, + "learning_rate": 9.974588867827301e-05, + "loss": 0.0686, + "step": 1610 + }, + { + "epoch": 6.044776119402985, + "grad_norm": 0.3945397436618805, + "learning_rate": 9.973749622593534e-05, + "loss": 0.0614, + "step": 1620 + }, + { + "epoch": 6.082089552238806, + "grad_norm": 0.2747954726219177, + "learning_rate": 9.972896779318219e-05, + "loss": 0.0681, + "step": 1630 + }, + { + "epoch": 6.119402985074627, + "grad_norm": 0.43257200717926025, + "learning_rate": 9.972030340333001e-05, + "loss": 0.0725, + "step": 1640 + }, + { + "epoch": 6.156716417910448, + "grad_norm": 0.3559250831604004, + "learning_rate": 9.97115030800669e-05, + "loss": 0.0804, + "step": 1650 + }, + { + "epoch": 6.1940298507462686, + "grad_norm": 0.3079264760017395, + "learning_rate": 9.970256684745258e-05, + "loss": 0.0649, + "step": 1660 + }, + { + "epoch": 6.231343283582089, + "grad_norm": 0.32298946380615234, + "learning_rate": 9.969349472991838e-05, + "loss": 0.0668, + "step": 1670 + }, + { + "epoch": 6.268656716417911, + "grad_norm": 0.2826225459575653, + "learning_rate": 9.968428675226714e-05, + "loss": 0.0734, + "step": 1680 + }, + { + "epoch": 6.3059701492537314, + "grad_norm": 0.39002349972724915, + "learning_rate": 9.967494293967312e-05, + "loss": 0.0728, + "step": 1690 + }, + { + "epoch": 6.343283582089552, + "grad_norm": 0.403890997171402, + "learning_rate": 9.966546331768191e-05, + "loss": 0.067, + "step": 1700 + }, + { + "epoch": 6.380597014925373, + "grad_norm": 0.3755359351634979, + "learning_rate": 9.965584791221048e-05, + "loss": 0.0755, + "step": 1710 + }, + { + "epoch": 6.417910447761194, + "grad_norm": 0.26346635818481445, + "learning_rate": 9.964609674954696e-05, + "loss": 0.0728, + "step": 1720 + }, + { + "epoch": 6.455223880597015, + "grad_norm": 0.45292145013809204, + "learning_rate": 9.963620985635065e-05, + "loss": 0.0731, + "step": 1730 + }, + { + "epoch": 6.492537313432836, + "grad_norm": 0.3568434715270996, + "learning_rate": 9.962618725965196e-05, + "loss": 0.0761, + "step": 1740 + }, + { + "epoch": 6.529850746268656, + "grad_norm": 0.2551257014274597, + "learning_rate": 9.961602898685226e-05, + "loss": 0.0694, + "step": 1750 + }, + { + "epoch": 6.567164179104478, + "grad_norm": 0.6106354594230652, + "learning_rate": 9.96057350657239e-05, + "loss": 0.0827, + "step": 1760 + }, + { + "epoch": 6.604477611940299, + "grad_norm": 0.3226093053817749, + "learning_rate": 9.959530552441005e-05, + "loss": 0.0716, + "step": 1770 + }, + { + "epoch": 6.641791044776119, + "grad_norm": 0.4297254979610443, + "learning_rate": 9.95847403914247e-05, + "loss": 0.0748, + "step": 1780 + }, + { + "epoch": 6.67910447761194, + "grad_norm": 0.26469680666923523, + "learning_rate": 9.95740396956525e-05, + "loss": 0.074, + "step": 1790 + }, + { + "epoch": 6.7164179104477615, + "grad_norm": 0.22717897593975067, + "learning_rate": 9.956320346634876e-05, + "loss": 0.0739, + "step": 1800 + }, + { + "epoch": 6.753731343283582, + "grad_norm": 0.4513498544692993, + "learning_rate": 9.955223173313931e-05, + "loss": 0.0664, + "step": 1810 + }, + { + "epoch": 6.791044776119403, + "grad_norm": 0.31683439016342163, + "learning_rate": 9.954112452602045e-05, + "loss": 0.069, + "step": 1820 + }, + { + "epoch": 6.8283582089552235, + "grad_norm": 0.3350532650947571, + "learning_rate": 9.952988187535886e-05, + "loss": 0.0699, + "step": 1830 + }, + { + "epoch": 6.865671641791045, + "grad_norm": 0.29829463362693787, + "learning_rate": 9.95185038118915e-05, + "loss": 0.0663, + "step": 1840 + }, + { + "epoch": 6.902985074626866, + "grad_norm": 0.31650781631469727, + "learning_rate": 9.950699036672559e-05, + "loss": 0.0668, + "step": 1850 + }, + { + "epoch": 6.940298507462686, + "grad_norm": 0.360944926738739, + "learning_rate": 9.949534157133844e-05, + "loss": 0.0696, + "step": 1860 + }, + { + "epoch": 6.977611940298507, + "grad_norm": 0.31337013840675354, + "learning_rate": 9.948355745757741e-05, + "loss": 0.073, + "step": 1870 + }, + { + "epoch": 7.014925373134329, + "grad_norm": 0.4675919711589813, + "learning_rate": 9.94716380576598e-05, + "loss": 0.0688, + "step": 1880 + }, + { + "epoch": 7.052238805970149, + "grad_norm": 0.3031919002532959, + "learning_rate": 9.945958340417283e-05, + "loss": 0.0596, + "step": 1890 + }, + { + "epoch": 7.08955223880597, + "grad_norm": 0.24858474731445312, + "learning_rate": 9.944739353007344e-05, + "loss": 0.0717, + "step": 1900 + }, + { + "epoch": 7.126865671641791, + "grad_norm": 0.20959483087062836, + "learning_rate": 9.943506846868826e-05, + "loss": 0.0694, + "step": 1910 + }, + { + "epoch": 7.164179104477612, + "grad_norm": 0.35621434450149536, + "learning_rate": 9.942260825371358e-05, + "loss": 0.063, + "step": 1920 + }, + { + "epoch": 7.201492537313433, + "grad_norm": 0.3462587594985962, + "learning_rate": 9.941001291921512e-05, + "loss": 0.068, + "step": 1930 + }, + { + "epoch": 7.2388059701492535, + "grad_norm": 0.38649681210517883, + "learning_rate": 9.939728249962807e-05, + "loss": 0.0638, + "step": 1940 + }, + { + "epoch": 7.276119402985074, + "grad_norm": 0.29564595222473145, + "learning_rate": 9.938441702975689e-05, + "loss": 0.0626, + "step": 1950 + }, + { + "epoch": 7.313432835820896, + "grad_norm": 0.339857816696167, + "learning_rate": 9.937141654477528e-05, + "loss": 0.0535, + "step": 1960 + }, + { + "epoch": 7.350746268656716, + "grad_norm": 0.2591215670108795, + "learning_rate": 9.93582810802261e-05, + "loss": 0.0645, + "step": 1970 + }, + { + "epoch": 7.388059701492537, + "grad_norm": 0.30237796902656555, + "learning_rate": 9.934501067202117e-05, + "loss": 0.0675, + "step": 1980 + }, + { + "epoch": 7.425373134328359, + "grad_norm": 0.28394174575805664, + "learning_rate": 9.93316053564413e-05, + "loss": 0.0643, + "step": 1990 + }, + { + "epoch": 7.462686567164179, + "grad_norm": 0.3124663233757019, + "learning_rate": 9.931806517013612e-05, + "loss": 0.059, + "step": 2000 + }, + { + "epoch": 7.5, + "grad_norm": 0.36073037981987, + "learning_rate": 9.930439015012396e-05, + "loss": 0.0606, + "step": 2010 + }, + { + "epoch": 7.537313432835821, + "grad_norm": 0.4091481864452362, + "learning_rate": 9.929058033379181e-05, + "loss": 0.0603, + "step": 2020 + }, + { + "epoch": 7.574626865671641, + "grad_norm": 0.44718074798583984, + "learning_rate": 9.927663575889521e-05, + "loss": 0.0741, + "step": 2030 + }, + { + "epoch": 7.611940298507463, + "grad_norm": 0.3819601833820343, + "learning_rate": 9.926255646355804e-05, + "loss": 0.0707, + "step": 2040 + }, + { + "epoch": 7.649253731343284, + "grad_norm": 0.23336420953273773, + "learning_rate": 9.92483424862726e-05, + "loss": 0.0676, + "step": 2050 + }, + { + "epoch": 7.686567164179104, + "grad_norm": 0.24415315687656403, + "learning_rate": 9.923399386589933e-05, + "loss": 0.0594, + "step": 2060 + }, + { + "epoch": 7.723880597014926, + "grad_norm": 0.3735473155975342, + "learning_rate": 9.921951064166684e-05, + "loss": 0.062, + "step": 2070 + }, + { + "epoch": 7.7611940298507465, + "grad_norm": 0.31629472970962524, + "learning_rate": 9.92048928531717e-05, + "loss": 0.0606, + "step": 2080 + }, + { + "epoch": 7.798507462686567, + "grad_norm": 0.37902557849884033, + "learning_rate": 9.919014054037836e-05, + "loss": 0.0584, + "step": 2090 + }, + { + "epoch": 7.835820895522388, + "grad_norm": 0.3486720323562622, + "learning_rate": 9.917525374361912e-05, + "loss": 0.056, + "step": 2100 + }, + { + "epoch": 7.8731343283582085, + "grad_norm": 0.3731362521648407, + "learning_rate": 9.91602325035939e-05, + "loss": 0.0601, + "step": 2110 + }, + { + "epoch": 7.91044776119403, + "grad_norm": 0.3560399115085602, + "learning_rate": 9.914507686137019e-05, + "loss": 0.06, + "step": 2120 + }, + { + "epoch": 7.947761194029851, + "grad_norm": 0.30075564980506897, + "learning_rate": 9.912978685838294e-05, + "loss": 0.0657, + "step": 2130 + }, + { + "epoch": 7.985074626865671, + "grad_norm": 0.2984028458595276, + "learning_rate": 9.911436253643445e-05, + "loss": 0.0587, + "step": 2140 + }, + { + "epoch": 8.022388059701493, + "grad_norm": 0.1980169117450714, + "learning_rate": 9.90988039376942e-05, + "loss": 0.0718, + "step": 2150 + }, + { + "epoch": 8.059701492537313, + "grad_norm": 0.31339579820632935, + "learning_rate": 9.90831111046988e-05, + "loss": 0.0557, + "step": 2160 + }, + { + "epoch": 8.097014925373134, + "grad_norm": 0.1968696266412735, + "learning_rate": 9.90672840803519e-05, + "loss": 0.0571, + "step": 2170 + }, + { + "epoch": 8.134328358208956, + "grad_norm": 0.23931682109832764, + "learning_rate": 9.905132290792394e-05, + "loss": 0.0566, + "step": 2180 + }, + { + "epoch": 8.171641791044776, + "grad_norm": 0.21741189062595367, + "learning_rate": 9.903522763105218e-05, + "loss": 0.0575, + "step": 2190 + }, + { + "epoch": 8.208955223880597, + "grad_norm": 0.22874368727207184, + "learning_rate": 9.901899829374047e-05, + "loss": 0.0565, + "step": 2200 + }, + { + "epoch": 8.246268656716419, + "grad_norm": 0.3441888093948364, + "learning_rate": 9.900263494035921e-05, + "loss": 0.0565, + "step": 2210 + }, + { + "epoch": 8.283582089552239, + "grad_norm": 0.2539830803871155, + "learning_rate": 9.89861376156452e-05, + "loss": 0.0538, + "step": 2220 + }, + { + "epoch": 8.32089552238806, + "grad_norm": 0.2235102653503418, + "learning_rate": 9.896950636470147e-05, + "loss": 0.0609, + "step": 2230 + }, + { + "epoch": 8.35820895522388, + "grad_norm": 0.1941322684288025, + "learning_rate": 9.895274123299723e-05, + "loss": 0.0562, + "step": 2240 + }, + { + "epoch": 8.395522388059701, + "grad_norm": 0.2691369950771332, + "learning_rate": 9.893584226636772e-05, + "loss": 0.0608, + "step": 2250 + }, + { + "epoch": 8.432835820895523, + "grad_norm": 0.24730461835861206, + "learning_rate": 9.891880951101407e-05, + "loss": 0.0582, + "step": 2260 + }, + { + "epoch": 8.470149253731343, + "grad_norm": 0.34785839915275574, + "learning_rate": 9.890164301350318e-05, + "loss": 0.0506, + "step": 2270 + }, + { + "epoch": 8.507462686567164, + "grad_norm": 0.3625825345516205, + "learning_rate": 9.888434282076758e-05, + "loss": 0.0614, + "step": 2280 + }, + { + "epoch": 8.544776119402986, + "grad_norm": 0.25210148096084595, + "learning_rate": 9.886690898010535e-05, + "loss": 0.0611, + "step": 2290 + }, + { + "epoch": 8.582089552238806, + "grad_norm": 0.27312466502189636, + "learning_rate": 9.884934153917997e-05, + "loss": 0.0537, + "step": 2300 + }, + { + "epoch": 8.619402985074627, + "grad_norm": 0.314647912979126, + "learning_rate": 9.883164054602012e-05, + "loss": 0.0602, + "step": 2310 + }, + { + "epoch": 8.656716417910447, + "grad_norm": 0.21531912684440613, + "learning_rate": 9.881380604901964e-05, + "loss": 0.0552, + "step": 2320 + }, + { + "epoch": 8.694029850746269, + "grad_norm": 0.23920664191246033, + "learning_rate": 9.879583809693738e-05, + "loss": 0.0613, + "step": 2330 + }, + { + "epoch": 8.73134328358209, + "grad_norm": 0.21864956617355347, + "learning_rate": 9.877773673889701e-05, + "loss": 0.0649, + "step": 2340 + }, + { + "epoch": 8.76865671641791, + "grad_norm": 0.27523377537727356, + "learning_rate": 9.8759502024387e-05, + "loss": 0.0606, + "step": 2350 + }, + { + "epoch": 8.805970149253731, + "grad_norm": 0.24805469810962677, + "learning_rate": 9.87411340032603e-05, + "loss": 0.0549, + "step": 2360 + }, + { + "epoch": 8.843283582089553, + "grad_norm": 0.23070092499256134, + "learning_rate": 9.872263272573443e-05, + "loss": 0.0562, + "step": 2370 + }, + { + "epoch": 8.880597014925373, + "grad_norm": 0.20833946764469147, + "learning_rate": 9.870399824239117e-05, + "loss": 0.05, + "step": 2380 + }, + { + "epoch": 8.917910447761194, + "grad_norm": 0.34507372975349426, + "learning_rate": 9.868523060417646e-05, + "loss": 0.0613, + "step": 2390 + }, + { + "epoch": 8.955223880597014, + "grad_norm": 0.32865110039711, + "learning_rate": 9.86663298624003e-05, + "loss": 0.0621, + "step": 2400 + }, + { + "epoch": 8.992537313432836, + "grad_norm": 0.21305270493030548, + "learning_rate": 9.864729606873663e-05, + "loss": 0.0572, + "step": 2410 + }, + { + "epoch": 9.029850746268657, + "grad_norm": 0.28193730115890503, + "learning_rate": 9.862812927522309e-05, + "loss": 0.0555, + "step": 2420 + }, + { + "epoch": 9.067164179104477, + "grad_norm": 0.3953789472579956, + "learning_rate": 9.860882953426099e-05, + "loss": 0.0536, + "step": 2430 + }, + { + "epoch": 9.104477611940299, + "grad_norm": 0.23013322055339813, + "learning_rate": 9.858939689861506e-05, + "loss": 0.0572, + "step": 2440 + }, + { + "epoch": 9.14179104477612, + "grad_norm": 0.2906680107116699, + "learning_rate": 9.856983142141339e-05, + "loss": 0.0592, + "step": 2450 + }, + { + "epoch": 9.17910447761194, + "grad_norm": 0.23490828275680542, + "learning_rate": 9.855013315614725e-05, + "loss": 0.0583, + "step": 2460 + }, + { + "epoch": 9.216417910447761, + "grad_norm": 0.22825880348682404, + "learning_rate": 9.853030215667093e-05, + "loss": 0.059, + "step": 2470 + }, + { + "epoch": 9.253731343283581, + "grad_norm": 0.25871285796165466, + "learning_rate": 9.851033847720166e-05, + "loss": 0.0555, + "step": 2480 + }, + { + "epoch": 9.291044776119403, + "grad_norm": 0.27220776677131653, + "learning_rate": 9.849024217231935e-05, + "loss": 0.0542, + "step": 2490 + }, + { + "epoch": 9.328358208955224, + "grad_norm": 0.26534005999565125, + "learning_rate": 9.847001329696653e-05, + "loss": 0.0526, + "step": 2500 + }, + { + "epoch": 9.365671641791044, + "grad_norm": 0.33486032485961914, + "learning_rate": 9.844965190644817e-05, + "loss": 0.0563, + "step": 2510 + }, + { + "epoch": 9.402985074626866, + "grad_norm": 0.2949483394622803, + "learning_rate": 9.842915805643155e-05, + "loss": 0.0556, + "step": 2520 + }, + { + "epoch": 9.440298507462687, + "grad_norm": 0.24123981595039368, + "learning_rate": 9.840853180294608e-05, + "loss": 0.05, + "step": 2530 + }, + { + "epoch": 9.477611940298507, + "grad_norm": 0.22536049783229828, + "learning_rate": 9.838777320238312e-05, + "loss": 0.0522, + "step": 2540 + }, + { + "epoch": 9.514925373134329, + "grad_norm": 0.23206663131713867, + "learning_rate": 9.836688231149592e-05, + "loss": 0.0591, + "step": 2550 + }, + { + "epoch": 9.552238805970148, + "grad_norm": 0.28573134541511536, + "learning_rate": 9.834585918739936e-05, + "loss": 0.0568, + "step": 2560 + }, + { + "epoch": 9.58955223880597, + "grad_norm": 0.2628820538520813, + "learning_rate": 9.832470388756987e-05, + "loss": 0.0571, + "step": 2570 + }, + { + "epoch": 9.626865671641792, + "grad_norm": 0.2880440652370453, + "learning_rate": 9.830341646984521e-05, + "loss": 0.0559, + "step": 2580 + }, + { + "epoch": 9.664179104477611, + "grad_norm": 0.1786259263753891, + "learning_rate": 9.82819969924244e-05, + "loss": 0.058, + "step": 2590 + }, + { + "epoch": 9.701492537313433, + "grad_norm": 0.3501608073711395, + "learning_rate": 9.826044551386744e-05, + "loss": 0.0523, + "step": 2600 + }, + { + "epoch": 9.738805970149254, + "grad_norm": 0.24757252633571625, + "learning_rate": 9.823876209309527e-05, + "loss": 0.0587, + "step": 2610 + }, + { + "epoch": 9.776119402985074, + "grad_norm": 0.2556290626525879, + "learning_rate": 9.821694678938953e-05, + "loss": 0.0555, + "step": 2620 + }, + { + "epoch": 9.813432835820896, + "grad_norm": 0.2561217248439789, + "learning_rate": 9.819499966239243e-05, + "loss": 0.052, + "step": 2630 + }, + { + "epoch": 9.850746268656717, + "grad_norm": 0.2776634097099304, + "learning_rate": 9.817292077210659e-05, + "loss": 0.0498, + "step": 2640 + }, + { + "epoch": 9.888059701492537, + "grad_norm": 0.20668549835681915, + "learning_rate": 9.815071017889482e-05, + "loss": 0.0517, + "step": 2650 + }, + { + "epoch": 9.925373134328359, + "grad_norm": 0.3100263178348541, + "learning_rate": 9.812836794348004e-05, + "loss": 0.0633, + "step": 2660 + }, + { + "epoch": 9.962686567164178, + "grad_norm": 0.2780782878398895, + "learning_rate": 9.81058941269451e-05, + "loss": 0.0581, + "step": 2670 + }, + { + "epoch": 10.0, + "grad_norm": 0.28903728723526, + "learning_rate": 9.808328879073251e-05, + "loss": 0.0538, + "step": 2680 + }, + { + "epoch": 10.037313432835822, + "grad_norm": 0.22727562487125397, + "learning_rate": 9.806055199664446e-05, + "loss": 0.0491, + "step": 2690 + }, + { + "epoch": 10.074626865671641, + "grad_norm": 0.267918199300766, + "learning_rate": 9.803768380684242e-05, + "loss": 0.0562, + "step": 2700 + }, + { + "epoch": 10.111940298507463, + "grad_norm": 0.2988606095314026, + "learning_rate": 9.801468428384716e-05, + "loss": 0.0566, + "step": 2710 + }, + { + "epoch": 10.149253731343283, + "grad_norm": 0.2710281312465668, + "learning_rate": 9.799155349053851e-05, + "loss": 0.0541, + "step": 2720 + }, + { + "epoch": 10.186567164179104, + "grad_norm": 0.15320520102977753, + "learning_rate": 9.796829149015517e-05, + "loss": 0.0548, + "step": 2730 + }, + { + "epoch": 10.223880597014926, + "grad_norm": 0.2653089463710785, + "learning_rate": 9.794489834629455e-05, + "loss": 0.0599, + "step": 2740 + }, + { + "epoch": 10.261194029850746, + "grad_norm": 0.19223959743976593, + "learning_rate": 9.792137412291265e-05, + "loss": 0.0494, + "step": 2750 + }, + { + "epoch": 10.298507462686567, + "grad_norm": 0.20455987751483917, + "learning_rate": 9.789771888432375e-05, + "loss": 0.0538, + "step": 2760 + }, + { + "epoch": 10.335820895522389, + "grad_norm": 0.24908749759197235, + "learning_rate": 9.787393269520039e-05, + "loss": 0.0481, + "step": 2770 + }, + { + "epoch": 10.373134328358208, + "grad_norm": 0.3131813406944275, + "learning_rate": 9.785001562057309e-05, + "loss": 0.0526, + "step": 2780 + }, + { + "epoch": 10.41044776119403, + "grad_norm": 0.24828971922397614, + "learning_rate": 9.782596772583026e-05, + "loss": 0.0489, + "step": 2790 + }, + { + "epoch": 10.447761194029852, + "grad_norm": 0.21727119386196136, + "learning_rate": 9.780178907671789e-05, + "loss": 0.0532, + "step": 2800 + }, + { + "epoch": 10.485074626865671, + "grad_norm": 0.20279547572135925, + "learning_rate": 9.777747973933948e-05, + "loss": 0.0565, + "step": 2810 + }, + { + "epoch": 10.522388059701493, + "grad_norm": 0.17726702988147736, + "learning_rate": 9.775303978015585e-05, + "loss": 0.0437, + "step": 2820 + }, + { + "epoch": 10.559701492537313, + "grad_norm": 0.18961119651794434, + "learning_rate": 9.772846926598491e-05, + "loss": 0.0584, + "step": 2830 + }, + { + "epoch": 10.597014925373134, + "grad_norm": 0.2498980015516281, + "learning_rate": 9.77037682640015e-05, + "loss": 0.0496, + "step": 2840 + }, + { + "epoch": 10.634328358208956, + "grad_norm": 0.16978798806667328, + "learning_rate": 9.767893684173721e-05, + "loss": 0.0469, + "step": 2850 + }, + { + "epoch": 10.671641791044776, + "grad_norm": 0.16128584742546082, + "learning_rate": 9.765397506708023e-05, + "loss": 0.0533, + "step": 2860 + }, + { + "epoch": 10.708955223880597, + "grad_norm": 0.20463155210018158, + "learning_rate": 9.762888300827507e-05, + "loss": 0.0464, + "step": 2870 + }, + { + "epoch": 10.746268656716419, + "grad_norm": 0.30601629614830017, + "learning_rate": 9.760366073392246e-05, + "loss": 0.0489, + "step": 2880 + }, + { + "epoch": 10.783582089552239, + "grad_norm": 0.2730671763420105, + "learning_rate": 9.757830831297914e-05, + "loss": 0.0495, + "step": 2890 + }, + { + "epoch": 10.82089552238806, + "grad_norm": 0.251432865858078, + "learning_rate": 9.755282581475769e-05, + "loss": 0.0549, + "step": 2900 + }, + { + "epoch": 10.85820895522388, + "grad_norm": 0.26670166850090027, + "learning_rate": 9.752721330892624e-05, + "loss": 0.061, + "step": 2910 + }, + { + "epoch": 10.895522388059701, + "grad_norm": 0.2965967655181885, + "learning_rate": 9.750147086550844e-05, + "loss": 0.0473, + "step": 2920 + }, + { + "epoch": 10.932835820895523, + "grad_norm": 0.683840274810791, + "learning_rate": 9.747559855488313e-05, + "loss": 0.0509, + "step": 2930 + }, + { + "epoch": 10.970149253731343, + "grad_norm": 0.25740495324134827, + "learning_rate": 9.744959644778422e-05, + "loss": 0.0515, + "step": 2940 + }, + { + "epoch": 11.007462686567164, + "grad_norm": 0.2880542278289795, + "learning_rate": 9.742346461530048e-05, + "loss": 0.0482, + "step": 2950 + }, + { + "epoch": 11.044776119402986, + "grad_norm": 0.45032551884651184, + "learning_rate": 9.739720312887535e-05, + "loss": 0.0557, + "step": 2960 + }, + { + "epoch": 11.082089552238806, + "grad_norm": 0.2829900085926056, + "learning_rate": 9.73708120603067e-05, + "loss": 0.052, + "step": 2970 + }, + { + "epoch": 11.119402985074627, + "grad_norm": 0.309597373008728, + "learning_rate": 9.734429148174675e-05, + "loss": 0.0541, + "step": 2980 + }, + { + "epoch": 11.156716417910447, + "grad_norm": 0.2433389127254486, + "learning_rate": 9.731764146570173e-05, + "loss": 0.0482, + "step": 2990 + }, + { + "epoch": 11.194029850746269, + "grad_norm": 0.24458132684230804, + "learning_rate": 9.729086208503174e-05, + "loss": 0.0505, + "step": 3000 + }, + { + "epoch": 11.23134328358209, + "grad_norm": 0.2305087298154831, + "learning_rate": 9.726395341295062e-05, + "loss": 0.0504, + "step": 3010 + }, + { + "epoch": 11.26865671641791, + "grad_norm": 0.18110457062721252, + "learning_rate": 9.723691552302562e-05, + "loss": 0.0575, + "step": 3020 + }, + { + "epoch": 11.305970149253731, + "grad_norm": 0.20407621562480927, + "learning_rate": 9.720974848917735e-05, + "loss": 0.0494, + "step": 3030 + }, + { + "epoch": 11.343283582089553, + "grad_norm": 0.25924697518348694, + "learning_rate": 9.718245238567939e-05, + "loss": 0.0472, + "step": 3040 + }, + { + "epoch": 11.380597014925373, + "grad_norm": 0.23041822016239166, + "learning_rate": 9.715502728715826e-05, + "loss": 0.0481, + "step": 3050 + }, + { + "epoch": 11.417910447761194, + "grad_norm": 0.25381171703338623, + "learning_rate": 9.712747326859315e-05, + "loss": 0.0543, + "step": 3060 + }, + { + "epoch": 11.455223880597014, + "grad_norm": 0.18027640879154205, + "learning_rate": 9.709979040531569e-05, + "loss": 0.055, + "step": 3070 + }, + { + "epoch": 11.492537313432836, + "grad_norm": 0.2954868674278259, + "learning_rate": 9.707197877300974e-05, + "loss": 0.0473, + "step": 3080 + }, + { + "epoch": 11.529850746268657, + "grad_norm": 0.25323861837387085, + "learning_rate": 9.704403844771128e-05, + "loss": 0.0509, + "step": 3090 + }, + { + "epoch": 11.567164179104477, + "grad_norm": 0.36910176277160645, + "learning_rate": 9.701596950580806e-05, + "loss": 0.0504, + "step": 3100 + }, + { + "epoch": 11.604477611940299, + "grad_norm": 0.34199246764183044, + "learning_rate": 9.698777202403953e-05, + "loss": 0.0526, + "step": 3110 + }, + { + "epoch": 11.64179104477612, + "grad_norm": 0.2146557718515396, + "learning_rate": 9.695944607949649e-05, + "loss": 0.0579, + "step": 3120 + }, + { + "epoch": 11.67910447761194, + "grad_norm": 0.20559175312519073, + "learning_rate": 9.693099174962103e-05, + "loss": 0.0514, + "step": 3130 + }, + { + "epoch": 11.716417910447761, + "grad_norm": 0.2689419090747833, + "learning_rate": 9.690240911220618e-05, + "loss": 0.0534, + "step": 3140 + }, + { + "epoch": 11.753731343283581, + "grad_norm": 0.34870603680610657, + "learning_rate": 9.687369824539577e-05, + "loss": 0.0485, + "step": 3150 + }, + { + "epoch": 11.791044776119403, + "grad_norm": 0.15433363616466522, + "learning_rate": 9.684485922768422e-05, + "loss": 0.0418, + "step": 3160 + }, + { + "epoch": 11.828358208955224, + "grad_norm": 0.26874423027038574, + "learning_rate": 9.681589213791633e-05, + "loss": 0.0537, + "step": 3170 + }, + { + "epoch": 11.865671641791044, + "grad_norm": 0.3361654281616211, + "learning_rate": 9.6786797055287e-05, + "loss": 0.0474, + "step": 3180 + }, + { + "epoch": 11.902985074626866, + "grad_norm": 0.17938771843910217, + "learning_rate": 9.675757405934103e-05, + "loss": 0.0443, + "step": 3190 + }, + { + "epoch": 11.940298507462687, + "grad_norm": 0.31368622183799744, + "learning_rate": 9.672822322997305e-05, + "loss": 0.0594, + "step": 3200 + }, + { + "epoch": 11.977611940298507, + "grad_norm": 0.16268151998519897, + "learning_rate": 9.669874464742705e-05, + "loss": 0.0487, + "step": 3210 + }, + { + "epoch": 12.014925373134329, + "grad_norm": 0.23879969120025635, + "learning_rate": 9.66691383922964e-05, + "loss": 0.0484, + "step": 3220 + }, + { + "epoch": 12.052238805970148, + "grad_norm": 0.2321789413690567, + "learning_rate": 9.663940454552342e-05, + "loss": 0.051, + "step": 3230 + }, + { + "epoch": 12.08955223880597, + "grad_norm": 0.22873088717460632, + "learning_rate": 9.660954318839933e-05, + "loss": 0.0406, + "step": 3240 + }, + { + "epoch": 12.126865671641792, + "grad_norm": 0.3767557740211487, + "learning_rate": 9.657955440256395e-05, + "loss": 0.0432, + "step": 3250 + }, + { + "epoch": 12.164179104477611, + "grad_norm": 0.21569453179836273, + "learning_rate": 9.654943827000548e-05, + "loss": 0.0528, + "step": 3260 + }, + { + "epoch": 12.201492537313433, + "grad_norm": 0.23698291182518005, + "learning_rate": 9.651919487306025e-05, + "loss": 0.0457, + "step": 3270 + }, + { + "epoch": 12.238805970149254, + "grad_norm": 0.21086478233337402, + "learning_rate": 9.648882429441257e-05, + "loss": 0.0508, + "step": 3280 + }, + { + "epoch": 12.276119402985074, + "grad_norm": 0.19763463735580444, + "learning_rate": 9.645832661709444e-05, + "loss": 0.0497, + "step": 3290 + }, + { + "epoch": 12.313432835820896, + "grad_norm": 0.18413852155208588, + "learning_rate": 9.642770192448536e-05, + "loss": 0.0441, + "step": 3300 + }, + { + "epoch": 12.350746268656717, + "grad_norm": 0.13946911692619324, + "learning_rate": 9.639695030031204e-05, + "loss": 0.0453, + "step": 3310 + }, + { + "epoch": 12.388059701492537, + "grad_norm": 0.21613670885562897, + "learning_rate": 9.636607182864827e-05, + "loss": 0.0511, + "step": 3320 + }, + { + "epoch": 12.425373134328359, + "grad_norm": 0.24953646957874298, + "learning_rate": 9.63350665939146e-05, + "loss": 0.0451, + "step": 3330 + }, + { + "epoch": 12.462686567164178, + "grad_norm": 0.2993795871734619, + "learning_rate": 9.630393468087818e-05, + "loss": 0.0469, + "step": 3340 + }, + { + "epoch": 12.5, + "grad_norm": 0.2261819839477539, + "learning_rate": 9.627267617465243e-05, + "loss": 0.0484, + "step": 3350 + }, + { + "epoch": 12.537313432835822, + "grad_norm": 0.23026186227798462, + "learning_rate": 9.624129116069694e-05, + "loss": 0.0452, + "step": 3360 + }, + { + "epoch": 12.574626865671641, + "grad_norm": 0.27859947085380554, + "learning_rate": 9.620977972481716e-05, + "loss": 0.0593, + "step": 3370 + }, + { + "epoch": 12.611940298507463, + "grad_norm": 0.23060785233974457, + "learning_rate": 9.617814195316411e-05, + "loss": 0.05, + "step": 3380 + }, + { + "epoch": 12.649253731343283, + "grad_norm": 0.20185025036334991, + "learning_rate": 9.614637793223425e-05, + "loss": 0.0573, + "step": 3390 + }, + { + "epoch": 12.686567164179104, + "grad_norm": 0.3584498167037964, + "learning_rate": 9.611448774886924e-05, + "loss": 0.052, + "step": 3400 + }, + { + "epoch": 12.723880597014926, + "grad_norm": 0.19336827099323273, + "learning_rate": 9.60824714902556e-05, + "loss": 0.0535, + "step": 3410 + }, + { + "epoch": 12.761194029850746, + "grad_norm": 0.22223635017871857, + "learning_rate": 9.605032924392457e-05, + "loss": 0.05, + "step": 3420 + }, + { + "epoch": 12.798507462686567, + "grad_norm": 0.17108851671218872, + "learning_rate": 9.601806109775179e-05, + "loss": 0.0475, + "step": 3430 + }, + { + "epoch": 12.835820895522389, + "grad_norm": 0.3861902952194214, + "learning_rate": 9.598566713995718e-05, + "loss": 0.0439, + "step": 3440 + }, + { + "epoch": 12.873134328358208, + "grad_norm": 0.18927253782749176, + "learning_rate": 9.595314745910456e-05, + "loss": 0.052, + "step": 3450 + }, + { + "epoch": 12.91044776119403, + "grad_norm": 0.21963383257389069, + "learning_rate": 9.59205021441015e-05, + "loss": 0.0504, + "step": 3460 + }, + { + "epoch": 12.947761194029852, + "grad_norm": 0.18016670644283295, + "learning_rate": 9.588773128419906e-05, + "loss": 0.0467, + "step": 3470 + }, + { + "epoch": 12.985074626865671, + "grad_norm": 0.1776365041732788, + "learning_rate": 9.58548349689915e-05, + "loss": 0.0414, + "step": 3480 + }, + { + "epoch": 13.022388059701493, + "grad_norm": 0.2616482973098755, + "learning_rate": 9.582181328841611e-05, + "loss": 0.0442, + "step": 3490 + }, + { + "epoch": 13.059701492537313, + "grad_norm": 0.20341171324253082, + "learning_rate": 9.578866633275288e-05, + "loss": 0.0533, + "step": 3500 + }, + { + "epoch": 13.097014925373134, + "grad_norm": 0.2223699688911438, + "learning_rate": 9.575539419262434e-05, + "loss": 0.0458, + "step": 3510 + }, + { + "epoch": 13.134328358208956, + "grad_norm": 0.22557464241981506, + "learning_rate": 9.572199695899522e-05, + "loss": 0.0445, + "step": 3520 + }, + { + "epoch": 13.171641791044776, + "grad_norm": 0.25104308128356934, + "learning_rate": 9.568847472317232e-05, + "loss": 0.0435, + "step": 3530 + }, + { + "epoch": 13.208955223880597, + "grad_norm": 0.18720711767673492, + "learning_rate": 9.565482757680415e-05, + "loss": 0.0453, + "step": 3540 + }, + { + "epoch": 13.246268656716419, + "grad_norm": 0.16838951408863068, + "learning_rate": 9.562105561188069e-05, + "loss": 0.0505, + "step": 3550 + }, + { + "epoch": 13.283582089552239, + "grad_norm": 0.31681734323501587, + "learning_rate": 9.558715892073323e-05, + "loss": 0.0494, + "step": 3560 + }, + { + "epoch": 13.32089552238806, + "grad_norm": 0.2390700727701187, + "learning_rate": 9.555313759603402e-05, + "loss": 0.0538, + "step": 3570 + }, + { + "epoch": 13.35820895522388, + "grad_norm": 0.20680709183216095, + "learning_rate": 9.551899173079607e-05, + "loss": 0.0519, + "step": 3580 + }, + { + "epoch": 13.395522388059701, + "grad_norm": 0.2758580148220062, + "learning_rate": 9.548472141837286e-05, + "loss": 0.0512, + "step": 3590 + }, + { + "epoch": 13.432835820895523, + "grad_norm": 0.3653097450733185, + "learning_rate": 9.545032675245813e-05, + "loss": 0.0496, + "step": 3600 + }, + { + "epoch": 13.470149253731343, + "grad_norm": 0.23886866867542267, + "learning_rate": 9.541580782708557e-05, + "loss": 0.0455, + "step": 3610 + }, + { + "epoch": 13.507462686567164, + "grad_norm": 0.3280908465385437, + "learning_rate": 9.538116473662861e-05, + "loss": 0.0489, + "step": 3620 + }, + { + "epoch": 13.544776119402986, + "grad_norm": 0.20268180966377258, + "learning_rate": 9.534639757580013e-05, + "loss": 0.0484, + "step": 3630 + }, + { + "epoch": 13.582089552238806, + "grad_norm": 0.2582015097141266, + "learning_rate": 9.531150643965223e-05, + "loss": 0.0487, + "step": 3640 + }, + { + "epoch": 13.619402985074627, + "grad_norm": 0.18157973885536194, + "learning_rate": 9.527649142357596e-05, + "loss": 0.0496, + "step": 3650 + }, + { + "epoch": 13.656716417910447, + "grad_norm": 0.22841542959213257, + "learning_rate": 9.524135262330098e-05, + "loss": 0.0467, + "step": 3660 + }, + { + "epoch": 13.694029850746269, + "grad_norm": 0.2519935369491577, + "learning_rate": 9.520609013489547e-05, + "loss": 0.0487, + "step": 3670 + }, + { + "epoch": 13.73134328358209, + "grad_norm": 0.24680495262145996, + "learning_rate": 9.517070405476575e-05, + "loss": 0.0457, + "step": 3680 + }, + { + "epoch": 13.76865671641791, + "grad_norm": 0.26362067461013794, + "learning_rate": 9.513519447965595e-05, + "loss": 0.0495, + "step": 3690 + }, + { + "epoch": 13.805970149253731, + "grad_norm": 0.3240712583065033, + "learning_rate": 9.509956150664796e-05, + "loss": 0.0496, + "step": 3700 + }, + { + "epoch": 13.843283582089553, + "grad_norm": 0.21009013056755066, + "learning_rate": 9.50638052331609e-05, + "loss": 0.0457, + "step": 3710 + }, + { + "epoch": 13.880597014925373, + "grad_norm": 0.1669154316186905, + "learning_rate": 9.502792575695112e-05, + "loss": 0.0496, + "step": 3720 + }, + { + "epoch": 13.917910447761194, + "grad_norm": 0.22347605228424072, + "learning_rate": 9.499192317611167e-05, + "loss": 0.0426, + "step": 3730 + }, + { + "epoch": 13.955223880597014, + "grad_norm": 0.15208907425403595, + "learning_rate": 9.49557975890723e-05, + "loss": 0.0447, + "step": 3740 + }, + { + "epoch": 13.992537313432836, + "grad_norm": 0.3206101059913635, + "learning_rate": 9.491954909459895e-05, + "loss": 0.0471, + "step": 3750 + }, + { + "epoch": 14.029850746268657, + "grad_norm": 0.15873713791370392, + "learning_rate": 9.488317779179361e-05, + "loss": 0.0401, + "step": 3760 + }, + { + "epoch": 14.067164179104477, + "grad_norm": 0.19690357148647308, + "learning_rate": 9.484668378009408e-05, + "loss": 0.0491, + "step": 3770 + }, + { + "epoch": 14.104477611940299, + "grad_norm": 0.3211113214492798, + "learning_rate": 9.481006715927351e-05, + "loss": 0.049, + "step": 3780 + }, + { + "epoch": 14.14179104477612, + "grad_norm": 0.27657604217529297, + "learning_rate": 9.477332802944044e-05, + "loss": 0.0396, + "step": 3790 + }, + { + "epoch": 14.17910447761194, + "grad_norm": 0.20194031298160553, + "learning_rate": 9.473646649103818e-05, + "loss": 0.0442, + "step": 3800 + }, + { + "epoch": 14.216417910447761, + "grad_norm": 0.20344595611095428, + "learning_rate": 9.46994826448448e-05, + "loss": 0.0427, + "step": 3810 + }, + { + "epoch": 14.253731343283581, + "grad_norm": 0.2067718505859375, + "learning_rate": 9.46623765919727e-05, + "loss": 0.0501, + "step": 3820 + }, + { + "epoch": 14.291044776119403, + "grad_norm": 0.29719170928001404, + "learning_rate": 9.462514843386845e-05, + "loss": 0.0519, + "step": 3830 + }, + { + "epoch": 14.328358208955224, + "grad_norm": 0.2347182184457779, + "learning_rate": 9.458779827231237e-05, + "loss": 0.0413, + "step": 3840 + }, + { + "epoch": 14.365671641791044, + "grad_norm": 0.1558852344751358, + "learning_rate": 9.45503262094184e-05, + "loss": 0.0442, + "step": 3850 + }, + { + "epoch": 14.402985074626866, + "grad_norm": 0.23085005581378937, + "learning_rate": 9.451273234763371e-05, + "loss": 0.047, + "step": 3860 + }, + { + "epoch": 14.440298507462687, + "grad_norm": 0.1515151560306549, + "learning_rate": 9.447501678973852e-05, + "loss": 0.0481, + "step": 3870 + }, + { + "epoch": 14.477611940298507, + "grad_norm": 0.1916729211807251, + "learning_rate": 9.443717963884569e-05, + "loss": 0.0474, + "step": 3880 + }, + { + "epoch": 14.514925373134329, + "grad_norm": 0.2536492943763733, + "learning_rate": 9.439922099840054e-05, + "loss": 0.0382, + "step": 3890 + }, + { + "epoch": 14.552238805970148, + "grad_norm": 0.1672086864709854, + "learning_rate": 9.43611409721806e-05, + "loss": 0.0497, + "step": 3900 + }, + { + "epoch": 14.58955223880597, + "grad_norm": 0.3644237518310547, + "learning_rate": 9.432293966429514e-05, + "loss": 0.0444, + "step": 3910 + }, + { + "epoch": 14.626865671641792, + "grad_norm": 0.20307251811027527, + "learning_rate": 9.428461717918511e-05, + "loss": 0.0452, + "step": 3920 + }, + { + "epoch": 14.664179104477611, + "grad_norm": 0.20441733300685883, + "learning_rate": 9.424617362162271e-05, + "loss": 0.0454, + "step": 3930 + }, + { + "epoch": 14.701492537313433, + "grad_norm": 0.26315611600875854, + "learning_rate": 9.420760909671118e-05, + "loss": 0.0486, + "step": 3940 + }, + { + "epoch": 14.738805970149254, + "grad_norm": 0.1983092874288559, + "learning_rate": 9.416892370988444e-05, + "loss": 0.0483, + "step": 3950 + }, + { + "epoch": 14.776119402985074, + "grad_norm": 0.18301443755626678, + "learning_rate": 9.413011756690685e-05, + "loss": 0.0456, + "step": 3960 + }, + { + "epoch": 14.813432835820896, + "grad_norm": 0.2433597594499588, + "learning_rate": 9.409119077387294e-05, + "loss": 0.0463, + "step": 3970 + }, + { + "epoch": 14.850746268656717, + "grad_norm": 0.27949392795562744, + "learning_rate": 9.405214343720707e-05, + "loss": 0.0412, + "step": 3980 + }, + { + "epoch": 14.888059701492537, + "grad_norm": 0.22806599736213684, + "learning_rate": 9.401297566366318e-05, + "loss": 0.0448, + "step": 3990 + }, + { + "epoch": 14.925373134328359, + "grad_norm": 0.25421562790870667, + "learning_rate": 9.397368756032445e-05, + "loss": 0.0426, + "step": 4000 + } + ], + "logging_steps": 10, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 75, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.0070179379263744e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4500/README.md b/checkpoint-4500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c7b89968043c4a4cf38dcac1f9bc557c35da3883 --- /dev/null +++ b/checkpoint-4500/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/ibru/.cache/huggingface/hub/models--nvidia--GR00T-N1-2B/snapshots/32e1fd2507f7739fad443e6b449c8188e0e02fcb +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-4500/adapter_config.json b/checkpoint-4500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8f45e5c825b3b34b334d049ddf8e68e52a500cc6 --- /dev/null +++ b/checkpoint-4500/adapter_config.json @@ -0,0 +1,36 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/ibru/.cache/huggingface/hub/models--nvidia--GR00T-N1-2B/snapshots/32e1fd2507f7739fad443e6b449c8188e0e02fcb", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "to_k", + "to_q", + "v_proj", + "to_v" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-4500/adapter_model.safetensors b/checkpoint-4500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6456dc50a21580e55b7a448d2bbefa25af7ed015 --- /dev/null +++ b/checkpoint-4500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67195aed8359e845aa6e736c609a2360ef327f1e8d7a1e7603ea5bd0aed0f09c +size 123328576 diff --git a/checkpoint-4500/experiment_cfg/metadata.json b/checkpoint-4500/experiment_cfg/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..40302046074c7e429ab3933ad6b163f9735902de --- /dev/null +++ b/checkpoint-4500/experiment_cfg/metadata.json @@ -0,0 +1,275 @@ +{ + "new_embodiment": { + "statistics": { + "state": { + "single_arm": { + "max": [ + 47.021484375, + 135.263671875, + 178.505859375, + 78.3984375, + 56.77734375 + ], + "min": [ + -25.576171875, + 46.93359375, + 89.736328125, + -30.41015625, + -77.607421875 + ], + "mean": [ + 7.780572414398193, + 121.54933166503906, + 145.44825744628906, + 26.051393508911133, + -12.748016357421875 + ], + "std": [ + 11.060831069946289, + 21.937597274780273, + 17.16187286376953, + 19.231945037841797, + 14.66512680053711 + ], + "q01": [ + -17.578125, + 58.0078125, + 97.998046875, + -13.447265625, + -39.9005859375 + ], + "q99": [ + 36.650390625, + 134.47265625, + 178.41796875, + 66.65009765625, + 40.166015625 + ] + }, + "gripper": { + "max": [ + 52.22222137451172 + ], + "min": [ + -3.846153974533081 + ], + "mean": [ + 10.933439254760742 + ], + "std": [ + 15.509913444519043 + ], + "q01": [ + -3.846153974533081 + ], + "q99": [ + 51.02564239501953 + ] + }, + "mobile_base": { + "max": [ + 75.42072296142578, + 276.7638244628906, + 93.75 + ], + "min": [ + -170.01620483398438, + -274.5497131347656, + -93.75 + ], + "mean": [ + -0.31241804361343384, + 58.99717712402344, + 2.4293017387390137 + ], + "std": [ + 10.56183910369873, + 119.39802551269531, + 22.590484619140625 + ], + "q01": [ + -33.65809627532959, + -265.6932678222656, + -72.849609375 + ], + "q99": [ + 30.679615020751953, + 270.1214904785156, + 90.234375 + ] + } + }, + "action": { + "single_arm": { + "max": [ + 37.96875, + 135.087890625, + 179.384765625, + 78.837890625, + 57.392578125 + ], + "min": [ + -26.279296875, + 47.373046875, + 89.912109375, + -31.640625, + -77.16796875 + ], + "mean": [ + 8.038639068603516, + 122.76031494140625, + 145.15855407714844, + 26.28432846069336, + -13.195321083068848 + ], + "std": [ + 11.36032772064209, + 21.925451278686523, + 17.071842193603516, + 19.503877639770508, + 14.882487297058105 + ], + "q01": [ + -18.10546875, + 58.623046875, + 98.26171875, + -14.326171875, + -40.078125 + ], + "q99": [ + 37.44140625, + 135.087890625, + 179.296875, + 67.1484375, + 40.869140625 + ] + }, + "gripper": { + "max": [ + 52.646484375 + ], + "min": [ + -10.72265625 + ], + "mean": [ + 4.366570949554443 + ], + "std": [ + 18.90865707397461 + ], + "q01": [ + -10.546875 + ], + "q99": [ + 51.767578125 + ] + }, + "mobile_base": { + "max": [ + 230.0971221923828, + 265.6932678222656, + 90.0 + ], + "min": [ + -230.0971221923828, + -265.6932678222656, + -90.0 + ], + "mean": [ + -0.36507830023765564, + 60.13115310668945, + 2.5394127368927 + ], + "std": [ + 15.02155590057373, + 129.06507873535156, + 27.82071304321289 + ], + "q01": [ + -0.02556634694337845, + -265.6932678222656, + -90.0 + ], + "q99": [ + 0.02556634694337845, + 265.6932678222656, + 90.0 + ] + } + } + }, + "modalities": { + "video": { + "wrist": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + }, + "front": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + } + }, + "state": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + }, + "mobile_base": { + "absolute": true, + "rotation_type": null, + "shape": [ + 3 + ], + "continuous": true + } + }, + "action": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + }, + "mobile_base": { + "absolute": true, + "rotation_type": null, + "shape": [ + 3 + ], + "continuous": true + } + } + }, + "embodiment_tag": "new_embodiment" + } +} \ No newline at end of file diff --git a/checkpoint-4500/optimizer.pt b/checkpoint-4500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2a6533690a80e5cfb3bf04174e87b1b45fa1f3e0 --- /dev/null +++ b/checkpoint-4500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7d3a2a860bfa94c86db53c686d2fcbaaf7931b0c992b1d60b8b794ee205fe85 +size 246824634 diff --git a/checkpoint-4500/rng_state.pth b/checkpoint-4500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f988fe207ce9147ce861ce8f43b570cc356de166 --- /dev/null +++ b/checkpoint-4500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a9c3f7e9d7afb450f369a250eea2d964c6dd9b992e90e251ae8ac092e097545 +size 14244 diff --git a/checkpoint-4500/scheduler.pt b/checkpoint-4500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e3662fca649d7331179dc55901b920c1a5c27cf6 --- /dev/null +++ b/checkpoint-4500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26ded847c9c7706bb864044bf340d57043e09157b2acfb3ad0c66b2c6ae892c2 +size 1064 diff --git a/checkpoint-4500/trainer_state.json b/checkpoint-4500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..82ae40880c0363e4982d48f61bd8def57442128e --- /dev/null +++ b/checkpoint-4500/trainer_state.json @@ -0,0 +1,3183 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 16.791044776119403, + "eval_steps": 500, + "global_step": 4500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03731343283582089, + "grad_norm": 0.8186072111129761, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.3847, + "step": 10 + }, + { + "epoch": 0.07462686567164178, + "grad_norm": 0.5007426142692566, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.4283, + "step": 20 + }, + { + "epoch": 0.11194029850746269, + "grad_norm": 0.49460887908935547, + "learning_rate": 3e-06, + "loss": 1.4868, + "step": 30 + }, + { + "epoch": 0.14925373134328357, + "grad_norm": 0.5032920837402344, + "learning_rate": 4.000000000000001e-06, + "loss": 1.4491, + "step": 40 + }, + { + "epoch": 0.1865671641791045, + "grad_norm": 0.5688469409942627, + "learning_rate": 5e-06, + "loss": 1.3703, + "step": 50 + }, + { + "epoch": 0.22388059701492538, + "grad_norm": 0.5052517652511597, + "learning_rate": 6e-06, + "loss": 1.419, + "step": 60 + }, + { + "epoch": 0.26119402985074625, + "grad_norm": 0.6315643787384033, + "learning_rate": 7.000000000000001e-06, + "loss": 1.3058, + "step": 70 + }, + { + "epoch": 0.29850746268656714, + "grad_norm": 0.6060447692871094, + "learning_rate": 8.000000000000001e-06, + "loss": 1.2908, + "step": 80 + }, + { + "epoch": 0.3358208955223881, + "grad_norm": 0.5513179302215576, + "learning_rate": 9e-06, + "loss": 1.2311, + "step": 90 + }, + { + "epoch": 0.373134328358209, + "grad_norm": 0.8467404246330261, + "learning_rate": 1e-05, + "loss": 1.2043, + "step": 100 + }, + { + "epoch": 0.41044776119402987, + "grad_norm": 0.8141824007034302, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.0707, + "step": 110 + }, + { + "epoch": 0.44776119402985076, + "grad_norm": 0.7932347059249878, + "learning_rate": 1.2e-05, + "loss": 0.9377, + "step": 120 + }, + { + "epoch": 0.48507462686567165, + "grad_norm": 0.684220552444458, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.714, + "step": 130 + }, + { + "epoch": 0.5223880597014925, + "grad_norm": 0.5886895060539246, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.6479, + "step": 140 + }, + { + "epoch": 0.5597014925373134, + "grad_norm": 0.4764939248561859, + "learning_rate": 1.5e-05, + "loss": 0.5463, + "step": 150 + }, + { + "epoch": 0.5970149253731343, + "grad_norm": 0.4621008038520813, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.4641, + "step": 160 + }, + { + "epoch": 0.6343283582089553, + "grad_norm": 0.46492910385131836, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.4159, + "step": 170 + }, + { + "epoch": 0.6716417910447762, + "grad_norm": 0.5017415881156921, + "learning_rate": 1.8e-05, + "loss": 0.4094, + "step": 180 + }, + { + "epoch": 0.7089552238805971, + "grad_norm": 0.34392210841178894, + "learning_rate": 1.9e-05, + "loss": 0.3478, + "step": 190 + }, + { + "epoch": 0.746268656716418, + "grad_norm": 0.3240516483783722, + "learning_rate": 2e-05, + "loss": 0.3821, + "step": 200 + }, + { + "epoch": 0.7835820895522388, + "grad_norm": 0.26301339268684387, + "learning_rate": 2.1e-05, + "loss": 0.3606, + "step": 210 + }, + { + "epoch": 0.8208955223880597, + "grad_norm": 0.34712520241737366, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.3421, + "step": 220 + }, + { + "epoch": 0.8582089552238806, + "grad_norm": 0.3248469829559326, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.3389, + "step": 230 + }, + { + "epoch": 0.8955223880597015, + "grad_norm": 0.298149436712265, + "learning_rate": 2.4e-05, + "loss": 0.3145, + "step": 240 + }, + { + "epoch": 0.9328358208955224, + "grad_norm": 0.2757190763950348, + "learning_rate": 2.5e-05, + "loss": 0.3065, + "step": 250 + }, + { + "epoch": 0.9701492537313433, + "grad_norm": 0.30510950088500977, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.2971, + "step": 260 + }, + { + "epoch": 1.007462686567164, + "grad_norm": 0.37349891662597656, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.3273, + "step": 270 + }, + { + "epoch": 1.044776119402985, + "grad_norm": 0.3667634129524231, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.308, + "step": 280 + }, + { + "epoch": 1.0820895522388059, + "grad_norm": 0.3463355004787445, + "learning_rate": 2.9e-05, + "loss": 0.3109, + "step": 290 + }, + { + "epoch": 1.1194029850746268, + "grad_norm": 0.3888525366783142, + "learning_rate": 3e-05, + "loss": 0.2644, + "step": 300 + }, + { + "epoch": 1.1567164179104479, + "grad_norm": 0.3749147951602936, + "learning_rate": 3.1e-05, + "loss": 0.2858, + "step": 310 + }, + { + "epoch": 1.1940298507462686, + "grad_norm": 0.3270276188850403, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.2573, + "step": 320 + }, + { + "epoch": 1.2313432835820897, + "grad_norm": 0.3658592998981476, + "learning_rate": 3.3e-05, + "loss": 0.2613, + "step": 330 + }, + { + "epoch": 1.2686567164179103, + "grad_norm": 0.3526328206062317, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.2328, + "step": 340 + }, + { + "epoch": 1.3059701492537314, + "grad_norm": 0.4528139531612396, + "learning_rate": 3.5e-05, + "loss": 0.2429, + "step": 350 + }, + { + "epoch": 1.3432835820895521, + "grad_norm": 0.5426791310310364, + "learning_rate": 3.6e-05, + "loss": 0.2209, + "step": 360 + }, + { + "epoch": 1.3805970149253732, + "grad_norm": 0.41844552755355835, + "learning_rate": 3.7e-05, + "loss": 0.2319, + "step": 370 + }, + { + "epoch": 1.417910447761194, + "grad_norm": 0.4749431908130646, + "learning_rate": 3.8e-05, + "loss": 0.2233, + "step": 380 + }, + { + "epoch": 1.455223880597015, + "grad_norm": 0.7010189890861511, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.2181, + "step": 390 + }, + { + "epoch": 1.4925373134328357, + "grad_norm": 0.5747635960578918, + "learning_rate": 4e-05, + "loss": 0.213, + "step": 400 + }, + { + "epoch": 1.5298507462686568, + "grad_norm": 0.3661474287509918, + "learning_rate": 4.1e-05, + "loss": 0.2171, + "step": 410 + }, + { + "epoch": 1.5671641791044775, + "grad_norm": 0.467835396528244, + "learning_rate": 4.2e-05, + "loss": 0.1985, + "step": 420 + }, + { + "epoch": 1.6044776119402986, + "grad_norm": 0.5470123291015625, + "learning_rate": 4.3e-05, + "loss": 0.2176, + "step": 430 + }, + { + "epoch": 1.6417910447761193, + "grad_norm": 0.5761199593544006, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.2007, + "step": 440 + }, + { + "epoch": 1.6791044776119404, + "grad_norm": 0.48257485032081604, + "learning_rate": 4.5e-05, + "loss": 0.2043, + "step": 450 + }, + { + "epoch": 1.716417910447761, + "grad_norm": 0.48353052139282227, + "learning_rate": 4.600000000000001e-05, + "loss": 0.1872, + "step": 460 + }, + { + "epoch": 1.7537313432835822, + "grad_norm": 0.4388391375541687, + "learning_rate": 4.7e-05, + "loss": 0.206, + "step": 470 + }, + { + "epoch": 1.7910447761194028, + "grad_norm": 0.47332626581192017, + "learning_rate": 4.8e-05, + "loss": 0.1876, + "step": 480 + }, + { + "epoch": 1.828358208955224, + "grad_norm": 0.8053535223007202, + "learning_rate": 4.9e-05, + "loss": 0.1839, + "step": 490 + }, + { + "epoch": 1.8656716417910446, + "grad_norm": 0.413979709148407, + "learning_rate": 5e-05, + "loss": 0.1732, + "step": 500 + }, + { + "epoch": 1.9029850746268657, + "grad_norm": 0.36910712718963623, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.1827, + "step": 510 + }, + { + "epoch": 1.9402985074626866, + "grad_norm": 0.8458298444747925, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.1727, + "step": 520 + }, + { + "epoch": 1.9776119402985075, + "grad_norm": 0.5452115535736084, + "learning_rate": 5.300000000000001e-05, + "loss": 0.1818, + "step": 530 + }, + { + "epoch": 2.014925373134328, + "grad_norm": 0.4518108069896698, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.177, + "step": 540 + }, + { + "epoch": 2.0522388059701493, + "grad_norm": 0.66865074634552, + "learning_rate": 5.500000000000001e-05, + "loss": 0.1726, + "step": 550 + }, + { + "epoch": 2.08955223880597, + "grad_norm": 0.6536034345626831, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.1541, + "step": 560 + }, + { + "epoch": 2.126865671641791, + "grad_norm": 0.5571377277374268, + "learning_rate": 5.6999999999999996e-05, + "loss": 0.1671, + "step": 570 + }, + { + "epoch": 2.1641791044776117, + "grad_norm": 0.5385546684265137, + "learning_rate": 5.8e-05, + "loss": 0.1582, + "step": 580 + }, + { + "epoch": 2.201492537313433, + "grad_norm": 0.577961266040802, + "learning_rate": 5.9e-05, + "loss": 0.1528, + "step": 590 + }, + { + "epoch": 2.2388059701492535, + "grad_norm": 0.5082416534423828, + "learning_rate": 6e-05, + "loss": 0.1638, + "step": 600 + }, + { + "epoch": 2.2761194029850746, + "grad_norm": 0.5490861535072327, + "learning_rate": 6.1e-05, + "loss": 0.166, + "step": 610 + }, + { + "epoch": 2.3134328358208958, + "grad_norm": 0.492366760969162, + "learning_rate": 6.2e-05, + "loss": 0.1481, + "step": 620 + }, + { + "epoch": 2.3507462686567164, + "grad_norm": 0.3702855110168457, + "learning_rate": 6.3e-05, + "loss": 0.1514, + "step": 630 + }, + { + "epoch": 2.388059701492537, + "grad_norm": 0.664667010307312, + "learning_rate": 6.400000000000001e-05, + "loss": 0.1441, + "step": 640 + }, + { + "epoch": 2.425373134328358, + "grad_norm": 0.33382174372673035, + "learning_rate": 6.500000000000001e-05, + "loss": 0.1573, + "step": 650 + }, + { + "epoch": 2.4626865671641793, + "grad_norm": 0.4848814010620117, + "learning_rate": 6.6e-05, + "loss": 0.1457, + "step": 660 + }, + { + "epoch": 2.5, + "grad_norm": 0.3649997413158417, + "learning_rate": 6.7e-05, + "loss": 0.1467, + "step": 670 + }, + { + "epoch": 2.5373134328358207, + "grad_norm": 0.6385223865509033, + "learning_rate": 6.800000000000001e-05, + "loss": 0.145, + "step": 680 + }, + { + "epoch": 2.574626865671642, + "grad_norm": 0.4580625891685486, + "learning_rate": 6.9e-05, + "loss": 0.1352, + "step": 690 + }, + { + "epoch": 2.611940298507463, + "grad_norm": 0.5141746401786804, + "learning_rate": 7e-05, + "loss": 0.1444, + "step": 700 + }, + { + "epoch": 2.6492537313432836, + "grad_norm": 0.40220722556114197, + "learning_rate": 7.1e-05, + "loss": 0.1493, + "step": 710 + }, + { + "epoch": 2.6865671641791042, + "grad_norm": 0.5510571002960205, + "learning_rate": 7.2e-05, + "loss": 0.1387, + "step": 720 + }, + { + "epoch": 2.7238805970149254, + "grad_norm": 0.43814659118652344, + "learning_rate": 7.3e-05, + "loss": 0.1374, + "step": 730 + }, + { + "epoch": 2.7611940298507465, + "grad_norm": 0.4118008613586426, + "learning_rate": 7.4e-05, + "loss": 0.1297, + "step": 740 + }, + { + "epoch": 2.798507462686567, + "grad_norm": 0.5626503229141235, + "learning_rate": 7.500000000000001e-05, + "loss": 0.1299, + "step": 750 + }, + { + "epoch": 2.835820895522388, + "grad_norm": 0.4066360592842102, + "learning_rate": 7.6e-05, + "loss": 0.1102, + "step": 760 + }, + { + "epoch": 2.873134328358209, + "grad_norm": 0.47184985876083374, + "learning_rate": 7.7e-05, + "loss": 0.1219, + "step": 770 + }, + { + "epoch": 2.91044776119403, + "grad_norm": 0.6611475348472595, + "learning_rate": 7.800000000000001e-05, + "loss": 0.1267, + "step": 780 + }, + { + "epoch": 2.9477611940298507, + "grad_norm": 0.3570108413696289, + "learning_rate": 7.900000000000001e-05, + "loss": 0.1191, + "step": 790 + }, + { + "epoch": 2.9850746268656714, + "grad_norm": 0.4581681489944458, + "learning_rate": 8e-05, + "loss": 0.1209, + "step": 800 + }, + { + "epoch": 3.0223880597014925, + "grad_norm": 0.4643435776233673, + "learning_rate": 8.1e-05, + "loss": 0.129, + "step": 810 + }, + { + "epoch": 3.0597014925373136, + "grad_norm": 0.5595763921737671, + "learning_rate": 8.2e-05, + "loss": 0.1158, + "step": 820 + }, + { + "epoch": 3.0970149253731343, + "grad_norm": 0.48848605155944824, + "learning_rate": 8.3e-05, + "loss": 0.1188, + "step": 830 + }, + { + "epoch": 3.1343283582089554, + "grad_norm": 0.4496570825576782, + "learning_rate": 8.4e-05, + "loss": 0.114, + "step": 840 + }, + { + "epoch": 3.171641791044776, + "grad_norm": 0.31364986300468445, + "learning_rate": 8.5e-05, + "loss": 0.1196, + "step": 850 + }, + { + "epoch": 3.208955223880597, + "grad_norm": 0.3395878076553345, + "learning_rate": 8.6e-05, + "loss": 0.1124, + "step": 860 + }, + { + "epoch": 3.246268656716418, + "grad_norm": 0.4917413592338562, + "learning_rate": 8.7e-05, + "loss": 0.1074, + "step": 870 + }, + { + "epoch": 3.283582089552239, + "grad_norm": 0.44114553928375244, + "learning_rate": 8.800000000000001e-05, + "loss": 0.1095, + "step": 880 + }, + { + "epoch": 3.3208955223880596, + "grad_norm": 0.3323831558227539, + "learning_rate": 8.900000000000001e-05, + "loss": 0.106, + "step": 890 + }, + { + "epoch": 3.3582089552238807, + "grad_norm": 0.4495660066604614, + "learning_rate": 9e-05, + "loss": 0.1222, + "step": 900 + }, + { + "epoch": 3.3955223880597014, + "grad_norm": 0.40784788131713867, + "learning_rate": 9.1e-05, + "loss": 0.1048, + "step": 910 + }, + { + "epoch": 3.4328358208955225, + "grad_norm": 0.4643700420856476, + "learning_rate": 9.200000000000001e-05, + "loss": 0.1097, + "step": 920 + }, + { + "epoch": 3.470149253731343, + "grad_norm": 0.472494512796402, + "learning_rate": 9.300000000000001e-05, + "loss": 0.1041, + "step": 930 + }, + { + "epoch": 3.5074626865671643, + "grad_norm": 0.6110897660255432, + "learning_rate": 9.4e-05, + "loss": 0.0959, + "step": 940 + }, + { + "epoch": 3.544776119402985, + "grad_norm": 0.5313069820404053, + "learning_rate": 9.5e-05, + "loss": 0.113, + "step": 950 + }, + { + "epoch": 3.582089552238806, + "grad_norm": 0.4223133623600006, + "learning_rate": 9.6e-05, + "loss": 0.099, + "step": 960 + }, + { + "epoch": 3.6194029850746268, + "grad_norm": 0.5464731454849243, + "learning_rate": 9.7e-05, + "loss": 0.1008, + "step": 970 + }, + { + "epoch": 3.656716417910448, + "grad_norm": 0.3538314402103424, + "learning_rate": 9.8e-05, + "loss": 0.1049, + "step": 980 + }, + { + "epoch": 3.6940298507462686, + "grad_norm": 0.7460148334503174, + "learning_rate": 9.900000000000001e-05, + "loss": 0.1088, + "step": 990 + }, + { + "epoch": 3.7313432835820897, + "grad_norm": 0.3210597038269043, + "learning_rate": 0.0001, + "loss": 0.1041, + "step": 1000 + }, + { + "epoch": 3.7686567164179103, + "grad_norm": 0.4450497627258301, + "learning_rate": 9.999993165095463e-05, + "loss": 0.0985, + "step": 1010 + }, + { + "epoch": 3.8059701492537314, + "grad_norm": 0.4348960816860199, + "learning_rate": 9.999972660400536e-05, + "loss": 0.1015, + "step": 1020 + }, + { + "epoch": 3.843283582089552, + "grad_norm": 0.462782621383667, + "learning_rate": 9.999938485971279e-05, + "loss": 0.1068, + "step": 1030 + }, + { + "epoch": 3.8805970149253732, + "grad_norm": 0.3801368474960327, + "learning_rate": 9.999890641901125e-05, + "loss": 0.1117, + "step": 1040 + }, + { + "epoch": 3.917910447761194, + "grad_norm": 0.45135366916656494, + "learning_rate": 9.999829128320874e-05, + "loss": 0.0917, + "step": 1050 + }, + { + "epoch": 3.955223880597015, + "grad_norm": 0.41138389706611633, + "learning_rate": 9.999753945398704e-05, + "loss": 0.1049, + "step": 1060 + }, + { + "epoch": 3.9925373134328357, + "grad_norm": 0.4976252317428589, + "learning_rate": 9.999665093340165e-05, + "loss": 0.1029, + "step": 1070 + }, + { + "epoch": 4.029850746268656, + "grad_norm": 0.46372008323669434, + "learning_rate": 9.99956257238817e-05, + "loss": 0.1012, + "step": 1080 + }, + { + "epoch": 4.067164179104478, + "grad_norm": 0.546938955783844, + "learning_rate": 9.999446382823013e-05, + "loss": 0.0829, + "step": 1090 + }, + { + "epoch": 4.104477611940299, + "grad_norm": 0.40513405203819275, + "learning_rate": 9.999316524962345e-05, + "loss": 0.0933, + "step": 1100 + }, + { + "epoch": 4.141791044776119, + "grad_norm": 0.4198484420776367, + "learning_rate": 9.999172999161198e-05, + "loss": 0.0895, + "step": 1110 + }, + { + "epoch": 4.17910447761194, + "grad_norm": 0.3965628743171692, + "learning_rate": 9.999015805811965e-05, + "loss": 0.0917, + "step": 1120 + }, + { + "epoch": 4.2164179104477615, + "grad_norm": 0.3095884621143341, + "learning_rate": 9.998844945344405e-05, + "loss": 0.0953, + "step": 1130 + }, + { + "epoch": 4.253731343283582, + "grad_norm": 0.7962276339530945, + "learning_rate": 9.998660418225645e-05, + "loss": 0.0979, + "step": 1140 + }, + { + "epoch": 4.291044776119403, + "grad_norm": 0.42066490650177, + "learning_rate": 9.998462224960175e-05, + "loss": 0.099, + "step": 1150 + }, + { + "epoch": 4.3283582089552235, + "grad_norm": 0.3894193470478058, + "learning_rate": 9.998250366089848e-05, + "loss": 0.0887, + "step": 1160 + }, + { + "epoch": 4.365671641791045, + "grad_norm": 0.28998032212257385, + "learning_rate": 9.998024842193876e-05, + "loss": 0.0943, + "step": 1170 + }, + { + "epoch": 4.402985074626866, + "grad_norm": 0.3919823467731476, + "learning_rate": 9.997785653888835e-05, + "loss": 0.0916, + "step": 1180 + }, + { + "epoch": 4.440298507462686, + "grad_norm": 0.3708650469779968, + "learning_rate": 9.997532801828658e-05, + "loss": 0.0858, + "step": 1190 + }, + { + "epoch": 4.477611940298507, + "grad_norm": 0.2935069799423218, + "learning_rate": 9.997266286704631e-05, + "loss": 0.0992, + "step": 1200 + }, + { + "epoch": 4.514925373134329, + "grad_norm": 0.4675377607345581, + "learning_rate": 9.996986109245395e-05, + "loss": 0.0854, + "step": 1210 + }, + { + "epoch": 4.552238805970149, + "grad_norm": 0.31374865770339966, + "learning_rate": 9.996692270216947e-05, + "loss": 0.0788, + "step": 1220 + }, + { + "epoch": 4.58955223880597, + "grad_norm": 0.419249951839447, + "learning_rate": 9.996384770422629e-05, + "loss": 0.0873, + "step": 1230 + }, + { + "epoch": 4.6268656716417915, + "grad_norm": 0.26002731919288635, + "learning_rate": 9.996063610703137e-05, + "loss": 0.0845, + "step": 1240 + }, + { + "epoch": 4.664179104477612, + "grad_norm": 0.29573896527290344, + "learning_rate": 9.995728791936504e-05, + "loss": 0.091, + "step": 1250 + }, + { + "epoch": 4.701492537313433, + "grad_norm": 0.33090147376060486, + "learning_rate": 9.995380315038119e-05, + "loss": 0.0827, + "step": 1260 + }, + { + "epoch": 4.7388059701492535, + "grad_norm": 0.24417485296726227, + "learning_rate": 9.9950181809607e-05, + "loss": 0.0859, + "step": 1270 + }, + { + "epoch": 4.776119402985074, + "grad_norm": 0.48290401697158813, + "learning_rate": 9.994642390694308e-05, + "loss": 0.0889, + "step": 1280 + }, + { + "epoch": 4.813432835820896, + "grad_norm": 0.4479697048664093, + "learning_rate": 9.99425294526634e-05, + "loss": 0.097, + "step": 1290 + }, + { + "epoch": 4.850746268656716, + "grad_norm": 0.3560147285461426, + "learning_rate": 9.993849845741524e-05, + "loss": 0.0904, + "step": 1300 + }, + { + "epoch": 4.888059701492537, + "grad_norm": 0.6645416617393494, + "learning_rate": 9.99343309322192e-05, + "loss": 0.0922, + "step": 1310 + }, + { + "epoch": 4.925373134328359, + "grad_norm": 0.29696759581565857, + "learning_rate": 9.993002688846913e-05, + "loss": 0.093, + "step": 1320 + }, + { + "epoch": 4.962686567164179, + "grad_norm": 0.47146692872047424, + "learning_rate": 9.992558633793212e-05, + "loss": 0.085, + "step": 1330 + }, + { + "epoch": 5.0, + "grad_norm": 0.3430916368961334, + "learning_rate": 9.992100929274846e-05, + "loss": 0.0805, + "step": 1340 + }, + { + "epoch": 5.037313432835821, + "grad_norm": 0.3205055892467499, + "learning_rate": 9.991629576543163e-05, + "loss": 0.0766, + "step": 1350 + }, + { + "epoch": 5.074626865671641, + "grad_norm": 0.3664805293083191, + "learning_rate": 9.991144576886823e-05, + "loss": 0.0766, + "step": 1360 + }, + { + "epoch": 5.111940298507463, + "grad_norm": 0.3753412663936615, + "learning_rate": 9.990645931631796e-05, + "loss": 0.0688, + "step": 1370 + }, + { + "epoch": 5.149253731343284, + "grad_norm": 0.31633055210113525, + "learning_rate": 9.990133642141359e-05, + "loss": 0.0796, + "step": 1380 + }, + { + "epoch": 5.186567164179104, + "grad_norm": 0.3355732262134552, + "learning_rate": 9.989607709816091e-05, + "loss": 0.0716, + "step": 1390 + }, + { + "epoch": 5.223880597014926, + "grad_norm": 0.24850831925868988, + "learning_rate": 9.989068136093873e-05, + "loss": 0.0778, + "step": 1400 + }, + { + "epoch": 5.2611940298507465, + "grad_norm": 0.29537102580070496, + "learning_rate": 9.988514922449879e-05, + "loss": 0.0759, + "step": 1410 + }, + { + "epoch": 5.298507462686567, + "grad_norm": 0.3430945873260498, + "learning_rate": 9.987948070396571e-05, + "loss": 0.0774, + "step": 1420 + }, + { + "epoch": 5.335820895522388, + "grad_norm": 0.5220637917518616, + "learning_rate": 9.987367581483705e-05, + "loss": 0.0836, + "step": 1430 + }, + { + "epoch": 5.373134328358209, + "grad_norm": 0.28184008598327637, + "learning_rate": 9.986773457298311e-05, + "loss": 0.0752, + "step": 1440 + }, + { + "epoch": 5.41044776119403, + "grad_norm": 0.36261311173439026, + "learning_rate": 9.986165699464705e-05, + "loss": 0.075, + "step": 1450 + }, + { + "epoch": 5.447761194029851, + "grad_norm": 0.5107380151748657, + "learning_rate": 9.985544309644475e-05, + "loss": 0.0814, + "step": 1460 + }, + { + "epoch": 5.485074626865671, + "grad_norm": 0.2446671426296234, + "learning_rate": 9.984909289536473e-05, + "loss": 0.0704, + "step": 1470 + }, + { + "epoch": 5.522388059701493, + "grad_norm": 0.30449381470680237, + "learning_rate": 9.984260640876821e-05, + "loss": 0.0794, + "step": 1480 + }, + { + "epoch": 5.559701492537314, + "grad_norm": 0.25645050406455994, + "learning_rate": 9.983598365438902e-05, + "loss": 0.0709, + "step": 1490 + }, + { + "epoch": 5.597014925373134, + "grad_norm": 0.23825006186962128, + "learning_rate": 9.98292246503335e-05, + "loss": 0.0828, + "step": 1500 + }, + { + "epoch": 5.634328358208955, + "grad_norm": 0.3259269893169403, + "learning_rate": 9.98223294150805e-05, + "loss": 0.0824, + "step": 1510 + }, + { + "epoch": 5.6716417910447765, + "grad_norm": 0.24058914184570312, + "learning_rate": 9.981529796748134e-05, + "loss": 0.073, + "step": 1520 + }, + { + "epoch": 5.708955223880597, + "grad_norm": 0.34457242488861084, + "learning_rate": 9.980813032675974e-05, + "loss": 0.0845, + "step": 1530 + }, + { + "epoch": 5.746268656716418, + "grad_norm": 0.32940393686294556, + "learning_rate": 9.980082651251175e-05, + "loss": 0.0832, + "step": 1540 + }, + { + "epoch": 5.7835820895522385, + "grad_norm": 0.5683007836341858, + "learning_rate": 9.979338654470569e-05, + "loss": 0.0836, + "step": 1550 + }, + { + "epoch": 5.82089552238806, + "grad_norm": 0.31041061878204346, + "learning_rate": 9.97858104436822e-05, + "loss": 0.07, + "step": 1560 + }, + { + "epoch": 5.858208955223881, + "grad_norm": 0.37858131527900696, + "learning_rate": 9.977809823015401e-05, + "loss": 0.0738, + "step": 1570 + }, + { + "epoch": 5.895522388059701, + "grad_norm": 0.2743091583251953, + "learning_rate": 9.977024992520602e-05, + "loss": 0.0761, + "step": 1580 + }, + { + "epoch": 5.932835820895522, + "grad_norm": 0.29117098450660706, + "learning_rate": 9.976226555029522e-05, + "loss": 0.0777, + "step": 1590 + }, + { + "epoch": 5.970149253731344, + "grad_norm": 0.31398633122444153, + "learning_rate": 9.975414512725057e-05, + "loss": 0.0664, + "step": 1600 + }, + { + "epoch": 6.007462686567164, + "grad_norm": 0.2684272527694702, + "learning_rate": 9.974588867827301e-05, + "loss": 0.0686, + "step": 1610 + }, + { + "epoch": 6.044776119402985, + "grad_norm": 0.3945397436618805, + "learning_rate": 9.973749622593534e-05, + "loss": 0.0614, + "step": 1620 + }, + { + "epoch": 6.082089552238806, + "grad_norm": 0.2747954726219177, + "learning_rate": 9.972896779318219e-05, + "loss": 0.0681, + "step": 1630 + }, + { + "epoch": 6.119402985074627, + "grad_norm": 0.43257200717926025, + "learning_rate": 9.972030340333001e-05, + "loss": 0.0725, + "step": 1640 + }, + { + "epoch": 6.156716417910448, + "grad_norm": 0.3559250831604004, + "learning_rate": 9.97115030800669e-05, + "loss": 0.0804, + "step": 1650 + }, + { + "epoch": 6.1940298507462686, + "grad_norm": 0.3079264760017395, + "learning_rate": 9.970256684745258e-05, + "loss": 0.0649, + "step": 1660 + }, + { + "epoch": 6.231343283582089, + "grad_norm": 0.32298946380615234, + "learning_rate": 9.969349472991838e-05, + "loss": 0.0668, + "step": 1670 + }, + { + "epoch": 6.268656716417911, + "grad_norm": 0.2826225459575653, + "learning_rate": 9.968428675226714e-05, + "loss": 0.0734, + "step": 1680 + }, + { + "epoch": 6.3059701492537314, + "grad_norm": 0.39002349972724915, + "learning_rate": 9.967494293967312e-05, + "loss": 0.0728, + "step": 1690 + }, + { + "epoch": 6.343283582089552, + "grad_norm": 0.403890997171402, + "learning_rate": 9.966546331768191e-05, + "loss": 0.067, + "step": 1700 + }, + { + "epoch": 6.380597014925373, + "grad_norm": 0.3755359351634979, + "learning_rate": 9.965584791221048e-05, + "loss": 0.0755, + "step": 1710 + }, + { + "epoch": 6.417910447761194, + "grad_norm": 0.26346635818481445, + "learning_rate": 9.964609674954696e-05, + "loss": 0.0728, + "step": 1720 + }, + { + "epoch": 6.455223880597015, + "grad_norm": 0.45292145013809204, + "learning_rate": 9.963620985635065e-05, + "loss": 0.0731, + "step": 1730 + }, + { + "epoch": 6.492537313432836, + "grad_norm": 0.3568434715270996, + "learning_rate": 9.962618725965196e-05, + "loss": 0.0761, + "step": 1740 + }, + { + "epoch": 6.529850746268656, + "grad_norm": 0.2551257014274597, + "learning_rate": 9.961602898685226e-05, + "loss": 0.0694, + "step": 1750 + }, + { + "epoch": 6.567164179104478, + "grad_norm": 0.6106354594230652, + "learning_rate": 9.96057350657239e-05, + "loss": 0.0827, + "step": 1760 + }, + { + "epoch": 6.604477611940299, + "grad_norm": 0.3226093053817749, + "learning_rate": 9.959530552441005e-05, + "loss": 0.0716, + "step": 1770 + }, + { + "epoch": 6.641791044776119, + "grad_norm": 0.4297254979610443, + "learning_rate": 9.95847403914247e-05, + "loss": 0.0748, + "step": 1780 + }, + { + "epoch": 6.67910447761194, + "grad_norm": 0.26469680666923523, + "learning_rate": 9.95740396956525e-05, + "loss": 0.074, + "step": 1790 + }, + { + "epoch": 6.7164179104477615, + "grad_norm": 0.22717897593975067, + "learning_rate": 9.956320346634876e-05, + "loss": 0.0739, + "step": 1800 + }, + { + "epoch": 6.753731343283582, + "grad_norm": 0.4513498544692993, + "learning_rate": 9.955223173313931e-05, + "loss": 0.0664, + "step": 1810 + }, + { + "epoch": 6.791044776119403, + "grad_norm": 0.31683439016342163, + "learning_rate": 9.954112452602045e-05, + "loss": 0.069, + "step": 1820 + }, + { + "epoch": 6.8283582089552235, + "grad_norm": 0.3350532650947571, + "learning_rate": 9.952988187535886e-05, + "loss": 0.0699, + "step": 1830 + }, + { + "epoch": 6.865671641791045, + "grad_norm": 0.29829463362693787, + "learning_rate": 9.95185038118915e-05, + "loss": 0.0663, + "step": 1840 + }, + { + "epoch": 6.902985074626866, + "grad_norm": 0.31650781631469727, + "learning_rate": 9.950699036672559e-05, + "loss": 0.0668, + "step": 1850 + }, + { + "epoch": 6.940298507462686, + "grad_norm": 0.360944926738739, + "learning_rate": 9.949534157133844e-05, + "loss": 0.0696, + "step": 1860 + }, + { + "epoch": 6.977611940298507, + "grad_norm": 0.31337013840675354, + "learning_rate": 9.948355745757741e-05, + "loss": 0.073, + "step": 1870 + }, + { + "epoch": 7.014925373134329, + "grad_norm": 0.4675919711589813, + "learning_rate": 9.94716380576598e-05, + "loss": 0.0688, + "step": 1880 + }, + { + "epoch": 7.052238805970149, + "grad_norm": 0.3031919002532959, + "learning_rate": 9.945958340417283e-05, + "loss": 0.0596, + "step": 1890 + }, + { + "epoch": 7.08955223880597, + "grad_norm": 0.24858474731445312, + "learning_rate": 9.944739353007344e-05, + "loss": 0.0717, + "step": 1900 + }, + { + "epoch": 7.126865671641791, + "grad_norm": 0.20959483087062836, + "learning_rate": 9.943506846868826e-05, + "loss": 0.0694, + "step": 1910 + }, + { + "epoch": 7.164179104477612, + "grad_norm": 0.35621434450149536, + "learning_rate": 9.942260825371358e-05, + "loss": 0.063, + "step": 1920 + }, + { + "epoch": 7.201492537313433, + "grad_norm": 0.3462587594985962, + "learning_rate": 9.941001291921512e-05, + "loss": 0.068, + "step": 1930 + }, + { + "epoch": 7.2388059701492535, + "grad_norm": 0.38649681210517883, + "learning_rate": 9.939728249962807e-05, + "loss": 0.0638, + "step": 1940 + }, + { + "epoch": 7.276119402985074, + "grad_norm": 0.29564595222473145, + "learning_rate": 9.938441702975689e-05, + "loss": 0.0626, + "step": 1950 + }, + { + "epoch": 7.313432835820896, + "grad_norm": 0.339857816696167, + "learning_rate": 9.937141654477528e-05, + "loss": 0.0535, + "step": 1960 + }, + { + "epoch": 7.350746268656716, + "grad_norm": 0.2591215670108795, + "learning_rate": 9.93582810802261e-05, + "loss": 0.0645, + "step": 1970 + }, + { + "epoch": 7.388059701492537, + "grad_norm": 0.30237796902656555, + "learning_rate": 9.934501067202117e-05, + "loss": 0.0675, + "step": 1980 + }, + { + "epoch": 7.425373134328359, + "grad_norm": 0.28394174575805664, + "learning_rate": 9.93316053564413e-05, + "loss": 0.0643, + "step": 1990 + }, + { + "epoch": 7.462686567164179, + "grad_norm": 0.3124663233757019, + "learning_rate": 9.931806517013612e-05, + "loss": 0.059, + "step": 2000 + }, + { + "epoch": 7.5, + "grad_norm": 0.36073037981987, + "learning_rate": 9.930439015012396e-05, + "loss": 0.0606, + "step": 2010 + }, + { + "epoch": 7.537313432835821, + "grad_norm": 0.4091481864452362, + "learning_rate": 9.929058033379181e-05, + "loss": 0.0603, + "step": 2020 + }, + { + "epoch": 7.574626865671641, + "grad_norm": 0.44718074798583984, + "learning_rate": 9.927663575889521e-05, + "loss": 0.0741, + "step": 2030 + }, + { + "epoch": 7.611940298507463, + "grad_norm": 0.3819601833820343, + "learning_rate": 9.926255646355804e-05, + "loss": 0.0707, + "step": 2040 + }, + { + "epoch": 7.649253731343284, + "grad_norm": 0.23336420953273773, + "learning_rate": 9.92483424862726e-05, + "loss": 0.0676, + "step": 2050 + }, + { + "epoch": 7.686567164179104, + "grad_norm": 0.24415315687656403, + "learning_rate": 9.923399386589933e-05, + "loss": 0.0594, + "step": 2060 + }, + { + "epoch": 7.723880597014926, + "grad_norm": 0.3735473155975342, + "learning_rate": 9.921951064166684e-05, + "loss": 0.062, + "step": 2070 + }, + { + "epoch": 7.7611940298507465, + "grad_norm": 0.31629472970962524, + "learning_rate": 9.92048928531717e-05, + "loss": 0.0606, + "step": 2080 + }, + { + "epoch": 7.798507462686567, + "grad_norm": 0.37902557849884033, + "learning_rate": 9.919014054037836e-05, + "loss": 0.0584, + "step": 2090 + }, + { + "epoch": 7.835820895522388, + "grad_norm": 0.3486720323562622, + "learning_rate": 9.917525374361912e-05, + "loss": 0.056, + "step": 2100 + }, + { + "epoch": 7.8731343283582085, + "grad_norm": 0.3731362521648407, + "learning_rate": 9.91602325035939e-05, + "loss": 0.0601, + "step": 2110 + }, + { + "epoch": 7.91044776119403, + "grad_norm": 0.3560399115085602, + "learning_rate": 9.914507686137019e-05, + "loss": 0.06, + "step": 2120 + }, + { + "epoch": 7.947761194029851, + "grad_norm": 0.30075564980506897, + "learning_rate": 9.912978685838294e-05, + "loss": 0.0657, + "step": 2130 + }, + { + "epoch": 7.985074626865671, + "grad_norm": 0.2984028458595276, + "learning_rate": 9.911436253643445e-05, + "loss": 0.0587, + "step": 2140 + }, + { + "epoch": 8.022388059701493, + "grad_norm": 0.1980169117450714, + "learning_rate": 9.90988039376942e-05, + "loss": 0.0718, + "step": 2150 + }, + { + "epoch": 8.059701492537313, + "grad_norm": 0.31339579820632935, + "learning_rate": 9.90831111046988e-05, + "loss": 0.0557, + "step": 2160 + }, + { + "epoch": 8.097014925373134, + "grad_norm": 0.1968696266412735, + "learning_rate": 9.90672840803519e-05, + "loss": 0.0571, + "step": 2170 + }, + { + "epoch": 8.134328358208956, + "grad_norm": 0.23931682109832764, + "learning_rate": 9.905132290792394e-05, + "loss": 0.0566, + "step": 2180 + }, + { + "epoch": 8.171641791044776, + "grad_norm": 0.21741189062595367, + "learning_rate": 9.903522763105218e-05, + "loss": 0.0575, + "step": 2190 + }, + { + "epoch": 8.208955223880597, + "grad_norm": 0.22874368727207184, + "learning_rate": 9.901899829374047e-05, + "loss": 0.0565, + "step": 2200 + }, + { + "epoch": 8.246268656716419, + "grad_norm": 0.3441888093948364, + "learning_rate": 9.900263494035921e-05, + "loss": 0.0565, + "step": 2210 + }, + { + "epoch": 8.283582089552239, + "grad_norm": 0.2539830803871155, + "learning_rate": 9.89861376156452e-05, + "loss": 0.0538, + "step": 2220 + }, + { + "epoch": 8.32089552238806, + "grad_norm": 0.2235102653503418, + "learning_rate": 9.896950636470147e-05, + "loss": 0.0609, + "step": 2230 + }, + { + "epoch": 8.35820895522388, + "grad_norm": 0.1941322684288025, + "learning_rate": 9.895274123299723e-05, + "loss": 0.0562, + "step": 2240 + }, + { + "epoch": 8.395522388059701, + "grad_norm": 0.2691369950771332, + "learning_rate": 9.893584226636772e-05, + "loss": 0.0608, + "step": 2250 + }, + { + "epoch": 8.432835820895523, + "grad_norm": 0.24730461835861206, + "learning_rate": 9.891880951101407e-05, + "loss": 0.0582, + "step": 2260 + }, + { + "epoch": 8.470149253731343, + "grad_norm": 0.34785839915275574, + "learning_rate": 9.890164301350318e-05, + "loss": 0.0506, + "step": 2270 + }, + { + "epoch": 8.507462686567164, + "grad_norm": 0.3625825345516205, + "learning_rate": 9.888434282076758e-05, + "loss": 0.0614, + "step": 2280 + }, + { + "epoch": 8.544776119402986, + "grad_norm": 0.25210148096084595, + "learning_rate": 9.886690898010535e-05, + "loss": 0.0611, + "step": 2290 + }, + { + "epoch": 8.582089552238806, + "grad_norm": 0.27312466502189636, + "learning_rate": 9.884934153917997e-05, + "loss": 0.0537, + "step": 2300 + }, + { + "epoch": 8.619402985074627, + "grad_norm": 0.314647912979126, + "learning_rate": 9.883164054602012e-05, + "loss": 0.0602, + "step": 2310 + }, + { + "epoch": 8.656716417910447, + "grad_norm": 0.21531912684440613, + "learning_rate": 9.881380604901964e-05, + "loss": 0.0552, + "step": 2320 + }, + { + "epoch": 8.694029850746269, + "grad_norm": 0.23920664191246033, + "learning_rate": 9.879583809693738e-05, + "loss": 0.0613, + "step": 2330 + }, + { + "epoch": 8.73134328358209, + "grad_norm": 0.21864956617355347, + "learning_rate": 9.877773673889701e-05, + "loss": 0.0649, + "step": 2340 + }, + { + "epoch": 8.76865671641791, + "grad_norm": 0.27523377537727356, + "learning_rate": 9.8759502024387e-05, + "loss": 0.0606, + "step": 2350 + }, + { + "epoch": 8.805970149253731, + "grad_norm": 0.24805469810962677, + "learning_rate": 9.87411340032603e-05, + "loss": 0.0549, + "step": 2360 + }, + { + "epoch": 8.843283582089553, + "grad_norm": 0.23070092499256134, + "learning_rate": 9.872263272573443e-05, + "loss": 0.0562, + "step": 2370 + }, + { + "epoch": 8.880597014925373, + "grad_norm": 0.20833946764469147, + "learning_rate": 9.870399824239117e-05, + "loss": 0.05, + "step": 2380 + }, + { + "epoch": 8.917910447761194, + "grad_norm": 0.34507372975349426, + "learning_rate": 9.868523060417646e-05, + "loss": 0.0613, + "step": 2390 + }, + { + "epoch": 8.955223880597014, + "grad_norm": 0.32865110039711, + "learning_rate": 9.86663298624003e-05, + "loss": 0.0621, + "step": 2400 + }, + { + "epoch": 8.992537313432836, + "grad_norm": 0.21305270493030548, + "learning_rate": 9.864729606873663e-05, + "loss": 0.0572, + "step": 2410 + }, + { + "epoch": 9.029850746268657, + "grad_norm": 0.28193730115890503, + "learning_rate": 9.862812927522309e-05, + "loss": 0.0555, + "step": 2420 + }, + { + "epoch": 9.067164179104477, + "grad_norm": 0.3953789472579956, + "learning_rate": 9.860882953426099e-05, + "loss": 0.0536, + "step": 2430 + }, + { + "epoch": 9.104477611940299, + "grad_norm": 0.23013322055339813, + "learning_rate": 9.858939689861506e-05, + "loss": 0.0572, + "step": 2440 + }, + { + "epoch": 9.14179104477612, + "grad_norm": 0.2906680107116699, + "learning_rate": 9.856983142141339e-05, + "loss": 0.0592, + "step": 2450 + }, + { + "epoch": 9.17910447761194, + "grad_norm": 0.23490828275680542, + "learning_rate": 9.855013315614725e-05, + "loss": 0.0583, + "step": 2460 + }, + { + "epoch": 9.216417910447761, + "grad_norm": 0.22825880348682404, + "learning_rate": 9.853030215667093e-05, + "loss": 0.059, + "step": 2470 + }, + { + "epoch": 9.253731343283581, + "grad_norm": 0.25871285796165466, + "learning_rate": 9.851033847720166e-05, + "loss": 0.0555, + "step": 2480 + }, + { + "epoch": 9.291044776119403, + "grad_norm": 0.27220776677131653, + "learning_rate": 9.849024217231935e-05, + "loss": 0.0542, + "step": 2490 + }, + { + "epoch": 9.328358208955224, + "grad_norm": 0.26534005999565125, + "learning_rate": 9.847001329696653e-05, + "loss": 0.0526, + "step": 2500 + }, + { + "epoch": 9.365671641791044, + "grad_norm": 0.33486032485961914, + "learning_rate": 9.844965190644817e-05, + "loss": 0.0563, + "step": 2510 + }, + { + "epoch": 9.402985074626866, + "grad_norm": 0.2949483394622803, + "learning_rate": 9.842915805643155e-05, + "loss": 0.0556, + "step": 2520 + }, + { + "epoch": 9.440298507462687, + "grad_norm": 0.24123981595039368, + "learning_rate": 9.840853180294608e-05, + "loss": 0.05, + "step": 2530 + }, + { + "epoch": 9.477611940298507, + "grad_norm": 0.22536049783229828, + "learning_rate": 9.838777320238312e-05, + "loss": 0.0522, + "step": 2540 + }, + { + "epoch": 9.514925373134329, + "grad_norm": 0.23206663131713867, + "learning_rate": 9.836688231149592e-05, + "loss": 0.0591, + "step": 2550 + }, + { + "epoch": 9.552238805970148, + "grad_norm": 0.28573134541511536, + "learning_rate": 9.834585918739936e-05, + "loss": 0.0568, + "step": 2560 + }, + { + "epoch": 9.58955223880597, + "grad_norm": 0.2628820538520813, + "learning_rate": 9.832470388756987e-05, + "loss": 0.0571, + "step": 2570 + }, + { + "epoch": 9.626865671641792, + "grad_norm": 0.2880440652370453, + "learning_rate": 9.830341646984521e-05, + "loss": 0.0559, + "step": 2580 + }, + { + "epoch": 9.664179104477611, + "grad_norm": 0.1786259263753891, + "learning_rate": 9.82819969924244e-05, + "loss": 0.058, + "step": 2590 + }, + { + "epoch": 9.701492537313433, + "grad_norm": 0.3501608073711395, + "learning_rate": 9.826044551386744e-05, + "loss": 0.0523, + "step": 2600 + }, + { + "epoch": 9.738805970149254, + "grad_norm": 0.24757252633571625, + "learning_rate": 9.823876209309527e-05, + "loss": 0.0587, + "step": 2610 + }, + { + "epoch": 9.776119402985074, + "grad_norm": 0.2556290626525879, + "learning_rate": 9.821694678938953e-05, + "loss": 0.0555, + "step": 2620 + }, + { + "epoch": 9.813432835820896, + "grad_norm": 0.2561217248439789, + "learning_rate": 9.819499966239243e-05, + "loss": 0.052, + "step": 2630 + }, + { + "epoch": 9.850746268656717, + "grad_norm": 0.2776634097099304, + "learning_rate": 9.817292077210659e-05, + "loss": 0.0498, + "step": 2640 + }, + { + "epoch": 9.888059701492537, + "grad_norm": 0.20668549835681915, + "learning_rate": 9.815071017889482e-05, + "loss": 0.0517, + "step": 2650 + }, + { + "epoch": 9.925373134328359, + "grad_norm": 0.3100263178348541, + "learning_rate": 9.812836794348004e-05, + "loss": 0.0633, + "step": 2660 + }, + { + "epoch": 9.962686567164178, + "grad_norm": 0.2780782878398895, + "learning_rate": 9.81058941269451e-05, + "loss": 0.0581, + "step": 2670 + }, + { + "epoch": 10.0, + "grad_norm": 0.28903728723526, + "learning_rate": 9.808328879073251e-05, + "loss": 0.0538, + "step": 2680 + }, + { + "epoch": 10.037313432835822, + "grad_norm": 0.22727562487125397, + "learning_rate": 9.806055199664446e-05, + "loss": 0.0491, + "step": 2690 + }, + { + "epoch": 10.074626865671641, + "grad_norm": 0.267918199300766, + "learning_rate": 9.803768380684242e-05, + "loss": 0.0562, + "step": 2700 + }, + { + "epoch": 10.111940298507463, + "grad_norm": 0.2988606095314026, + "learning_rate": 9.801468428384716e-05, + "loss": 0.0566, + "step": 2710 + }, + { + "epoch": 10.149253731343283, + "grad_norm": 0.2710281312465668, + "learning_rate": 9.799155349053851e-05, + "loss": 0.0541, + "step": 2720 + }, + { + "epoch": 10.186567164179104, + "grad_norm": 0.15320520102977753, + "learning_rate": 9.796829149015517e-05, + "loss": 0.0548, + "step": 2730 + }, + { + "epoch": 10.223880597014926, + "grad_norm": 0.2653089463710785, + "learning_rate": 9.794489834629455e-05, + "loss": 0.0599, + "step": 2740 + }, + { + "epoch": 10.261194029850746, + "grad_norm": 0.19223959743976593, + "learning_rate": 9.792137412291265e-05, + "loss": 0.0494, + "step": 2750 + }, + { + "epoch": 10.298507462686567, + "grad_norm": 0.20455987751483917, + "learning_rate": 9.789771888432375e-05, + "loss": 0.0538, + "step": 2760 + }, + { + "epoch": 10.335820895522389, + "grad_norm": 0.24908749759197235, + "learning_rate": 9.787393269520039e-05, + "loss": 0.0481, + "step": 2770 + }, + { + "epoch": 10.373134328358208, + "grad_norm": 0.3131813406944275, + "learning_rate": 9.785001562057309e-05, + "loss": 0.0526, + "step": 2780 + }, + { + "epoch": 10.41044776119403, + "grad_norm": 0.24828971922397614, + "learning_rate": 9.782596772583026e-05, + "loss": 0.0489, + "step": 2790 + }, + { + "epoch": 10.447761194029852, + "grad_norm": 0.21727119386196136, + "learning_rate": 9.780178907671789e-05, + "loss": 0.0532, + "step": 2800 + }, + { + "epoch": 10.485074626865671, + "grad_norm": 0.20279547572135925, + "learning_rate": 9.777747973933948e-05, + "loss": 0.0565, + "step": 2810 + }, + { + "epoch": 10.522388059701493, + "grad_norm": 0.17726702988147736, + "learning_rate": 9.775303978015585e-05, + "loss": 0.0437, + "step": 2820 + }, + { + "epoch": 10.559701492537313, + "grad_norm": 0.18961119651794434, + "learning_rate": 9.772846926598491e-05, + "loss": 0.0584, + "step": 2830 + }, + { + "epoch": 10.597014925373134, + "grad_norm": 0.2498980015516281, + "learning_rate": 9.77037682640015e-05, + "loss": 0.0496, + "step": 2840 + }, + { + "epoch": 10.634328358208956, + "grad_norm": 0.16978798806667328, + "learning_rate": 9.767893684173721e-05, + "loss": 0.0469, + "step": 2850 + }, + { + "epoch": 10.671641791044776, + "grad_norm": 0.16128584742546082, + "learning_rate": 9.765397506708023e-05, + "loss": 0.0533, + "step": 2860 + }, + { + "epoch": 10.708955223880597, + "grad_norm": 0.20463155210018158, + "learning_rate": 9.762888300827507e-05, + "loss": 0.0464, + "step": 2870 + }, + { + "epoch": 10.746268656716419, + "grad_norm": 0.30601629614830017, + "learning_rate": 9.760366073392246e-05, + "loss": 0.0489, + "step": 2880 + }, + { + "epoch": 10.783582089552239, + "grad_norm": 0.2730671763420105, + "learning_rate": 9.757830831297914e-05, + "loss": 0.0495, + "step": 2890 + }, + { + "epoch": 10.82089552238806, + "grad_norm": 0.251432865858078, + "learning_rate": 9.755282581475769e-05, + "loss": 0.0549, + "step": 2900 + }, + { + "epoch": 10.85820895522388, + "grad_norm": 0.26670166850090027, + "learning_rate": 9.752721330892624e-05, + "loss": 0.061, + "step": 2910 + }, + { + "epoch": 10.895522388059701, + "grad_norm": 0.2965967655181885, + "learning_rate": 9.750147086550844e-05, + "loss": 0.0473, + "step": 2920 + }, + { + "epoch": 10.932835820895523, + "grad_norm": 0.683840274810791, + "learning_rate": 9.747559855488313e-05, + "loss": 0.0509, + "step": 2930 + }, + { + "epoch": 10.970149253731343, + "grad_norm": 0.25740495324134827, + "learning_rate": 9.744959644778422e-05, + "loss": 0.0515, + "step": 2940 + }, + { + "epoch": 11.007462686567164, + "grad_norm": 0.2880542278289795, + "learning_rate": 9.742346461530048e-05, + "loss": 0.0482, + "step": 2950 + }, + { + "epoch": 11.044776119402986, + "grad_norm": 0.45032551884651184, + "learning_rate": 9.739720312887535e-05, + "loss": 0.0557, + "step": 2960 + }, + { + "epoch": 11.082089552238806, + "grad_norm": 0.2829900085926056, + "learning_rate": 9.73708120603067e-05, + "loss": 0.052, + "step": 2970 + }, + { + "epoch": 11.119402985074627, + "grad_norm": 0.309597373008728, + "learning_rate": 9.734429148174675e-05, + "loss": 0.0541, + "step": 2980 + }, + { + "epoch": 11.156716417910447, + "grad_norm": 0.2433389127254486, + "learning_rate": 9.731764146570173e-05, + "loss": 0.0482, + "step": 2990 + }, + { + "epoch": 11.194029850746269, + "grad_norm": 0.24458132684230804, + "learning_rate": 9.729086208503174e-05, + "loss": 0.0505, + "step": 3000 + }, + { + "epoch": 11.23134328358209, + "grad_norm": 0.2305087298154831, + "learning_rate": 9.726395341295062e-05, + "loss": 0.0504, + "step": 3010 + }, + { + "epoch": 11.26865671641791, + "grad_norm": 0.18110457062721252, + "learning_rate": 9.723691552302562e-05, + "loss": 0.0575, + "step": 3020 + }, + { + "epoch": 11.305970149253731, + "grad_norm": 0.20407621562480927, + "learning_rate": 9.720974848917735e-05, + "loss": 0.0494, + "step": 3030 + }, + { + "epoch": 11.343283582089553, + "grad_norm": 0.25924697518348694, + "learning_rate": 9.718245238567939e-05, + "loss": 0.0472, + "step": 3040 + }, + { + "epoch": 11.380597014925373, + "grad_norm": 0.23041822016239166, + "learning_rate": 9.715502728715826e-05, + "loss": 0.0481, + "step": 3050 + }, + { + "epoch": 11.417910447761194, + "grad_norm": 0.25381171703338623, + "learning_rate": 9.712747326859315e-05, + "loss": 0.0543, + "step": 3060 + }, + { + "epoch": 11.455223880597014, + "grad_norm": 0.18027640879154205, + "learning_rate": 9.709979040531569e-05, + "loss": 0.055, + "step": 3070 + }, + { + "epoch": 11.492537313432836, + "grad_norm": 0.2954868674278259, + "learning_rate": 9.707197877300974e-05, + "loss": 0.0473, + "step": 3080 + }, + { + "epoch": 11.529850746268657, + "grad_norm": 0.25323861837387085, + "learning_rate": 9.704403844771128e-05, + "loss": 0.0509, + "step": 3090 + }, + { + "epoch": 11.567164179104477, + "grad_norm": 0.36910176277160645, + "learning_rate": 9.701596950580806e-05, + "loss": 0.0504, + "step": 3100 + }, + { + "epoch": 11.604477611940299, + "grad_norm": 0.34199246764183044, + "learning_rate": 9.698777202403953e-05, + "loss": 0.0526, + "step": 3110 + }, + { + "epoch": 11.64179104477612, + "grad_norm": 0.2146557718515396, + "learning_rate": 9.695944607949649e-05, + "loss": 0.0579, + "step": 3120 + }, + { + "epoch": 11.67910447761194, + "grad_norm": 0.20559175312519073, + "learning_rate": 9.693099174962103e-05, + "loss": 0.0514, + "step": 3130 + }, + { + "epoch": 11.716417910447761, + "grad_norm": 0.2689419090747833, + "learning_rate": 9.690240911220618e-05, + "loss": 0.0534, + "step": 3140 + }, + { + "epoch": 11.753731343283581, + "grad_norm": 0.34870603680610657, + "learning_rate": 9.687369824539577e-05, + "loss": 0.0485, + "step": 3150 + }, + { + "epoch": 11.791044776119403, + "grad_norm": 0.15433363616466522, + "learning_rate": 9.684485922768422e-05, + "loss": 0.0418, + "step": 3160 + }, + { + "epoch": 11.828358208955224, + "grad_norm": 0.26874423027038574, + "learning_rate": 9.681589213791633e-05, + "loss": 0.0537, + "step": 3170 + }, + { + "epoch": 11.865671641791044, + "grad_norm": 0.3361654281616211, + "learning_rate": 9.6786797055287e-05, + "loss": 0.0474, + "step": 3180 + }, + { + "epoch": 11.902985074626866, + "grad_norm": 0.17938771843910217, + "learning_rate": 9.675757405934103e-05, + "loss": 0.0443, + "step": 3190 + }, + { + "epoch": 11.940298507462687, + "grad_norm": 0.31368622183799744, + "learning_rate": 9.672822322997305e-05, + "loss": 0.0594, + "step": 3200 + }, + { + "epoch": 11.977611940298507, + "grad_norm": 0.16268151998519897, + "learning_rate": 9.669874464742705e-05, + "loss": 0.0487, + "step": 3210 + }, + { + "epoch": 12.014925373134329, + "grad_norm": 0.23879969120025635, + "learning_rate": 9.66691383922964e-05, + "loss": 0.0484, + "step": 3220 + }, + { + "epoch": 12.052238805970148, + "grad_norm": 0.2321789413690567, + "learning_rate": 9.663940454552342e-05, + "loss": 0.051, + "step": 3230 + }, + { + "epoch": 12.08955223880597, + "grad_norm": 0.22873088717460632, + "learning_rate": 9.660954318839933e-05, + "loss": 0.0406, + "step": 3240 + }, + { + "epoch": 12.126865671641792, + "grad_norm": 0.3767557740211487, + "learning_rate": 9.657955440256395e-05, + "loss": 0.0432, + "step": 3250 + }, + { + "epoch": 12.164179104477611, + "grad_norm": 0.21569453179836273, + "learning_rate": 9.654943827000548e-05, + "loss": 0.0528, + "step": 3260 + }, + { + "epoch": 12.201492537313433, + "grad_norm": 0.23698291182518005, + "learning_rate": 9.651919487306025e-05, + "loss": 0.0457, + "step": 3270 + }, + { + "epoch": 12.238805970149254, + "grad_norm": 0.21086478233337402, + "learning_rate": 9.648882429441257e-05, + "loss": 0.0508, + "step": 3280 + }, + { + "epoch": 12.276119402985074, + "grad_norm": 0.19763463735580444, + "learning_rate": 9.645832661709444e-05, + "loss": 0.0497, + "step": 3290 + }, + { + "epoch": 12.313432835820896, + "grad_norm": 0.18413852155208588, + "learning_rate": 9.642770192448536e-05, + "loss": 0.0441, + "step": 3300 + }, + { + "epoch": 12.350746268656717, + "grad_norm": 0.13946911692619324, + "learning_rate": 9.639695030031204e-05, + "loss": 0.0453, + "step": 3310 + }, + { + "epoch": 12.388059701492537, + "grad_norm": 0.21613670885562897, + "learning_rate": 9.636607182864827e-05, + "loss": 0.0511, + "step": 3320 + }, + { + "epoch": 12.425373134328359, + "grad_norm": 0.24953646957874298, + "learning_rate": 9.63350665939146e-05, + "loss": 0.0451, + "step": 3330 + }, + { + "epoch": 12.462686567164178, + "grad_norm": 0.2993795871734619, + "learning_rate": 9.630393468087818e-05, + "loss": 0.0469, + "step": 3340 + }, + { + "epoch": 12.5, + "grad_norm": 0.2261819839477539, + "learning_rate": 9.627267617465243e-05, + "loss": 0.0484, + "step": 3350 + }, + { + "epoch": 12.537313432835822, + "grad_norm": 0.23026186227798462, + "learning_rate": 9.624129116069694e-05, + "loss": 0.0452, + "step": 3360 + }, + { + "epoch": 12.574626865671641, + "grad_norm": 0.27859947085380554, + "learning_rate": 9.620977972481716e-05, + "loss": 0.0593, + "step": 3370 + }, + { + "epoch": 12.611940298507463, + "grad_norm": 0.23060785233974457, + "learning_rate": 9.617814195316411e-05, + "loss": 0.05, + "step": 3380 + }, + { + "epoch": 12.649253731343283, + "grad_norm": 0.20185025036334991, + "learning_rate": 9.614637793223425e-05, + "loss": 0.0573, + "step": 3390 + }, + { + "epoch": 12.686567164179104, + "grad_norm": 0.3584498167037964, + "learning_rate": 9.611448774886924e-05, + "loss": 0.052, + "step": 3400 + }, + { + "epoch": 12.723880597014926, + "grad_norm": 0.19336827099323273, + "learning_rate": 9.60824714902556e-05, + "loss": 0.0535, + "step": 3410 + }, + { + "epoch": 12.761194029850746, + "grad_norm": 0.22223635017871857, + "learning_rate": 9.605032924392457e-05, + "loss": 0.05, + "step": 3420 + }, + { + "epoch": 12.798507462686567, + "grad_norm": 0.17108851671218872, + "learning_rate": 9.601806109775179e-05, + "loss": 0.0475, + "step": 3430 + }, + { + "epoch": 12.835820895522389, + "grad_norm": 0.3861902952194214, + "learning_rate": 9.598566713995718e-05, + "loss": 0.0439, + "step": 3440 + }, + { + "epoch": 12.873134328358208, + "grad_norm": 0.18927253782749176, + "learning_rate": 9.595314745910456e-05, + "loss": 0.052, + "step": 3450 + }, + { + "epoch": 12.91044776119403, + "grad_norm": 0.21963383257389069, + "learning_rate": 9.59205021441015e-05, + "loss": 0.0504, + "step": 3460 + }, + { + "epoch": 12.947761194029852, + "grad_norm": 0.18016670644283295, + "learning_rate": 9.588773128419906e-05, + "loss": 0.0467, + "step": 3470 + }, + { + "epoch": 12.985074626865671, + "grad_norm": 0.1776365041732788, + "learning_rate": 9.58548349689915e-05, + "loss": 0.0414, + "step": 3480 + }, + { + "epoch": 13.022388059701493, + "grad_norm": 0.2616482973098755, + "learning_rate": 9.582181328841611e-05, + "loss": 0.0442, + "step": 3490 + }, + { + "epoch": 13.059701492537313, + "grad_norm": 0.20341171324253082, + "learning_rate": 9.578866633275288e-05, + "loss": 0.0533, + "step": 3500 + }, + { + "epoch": 13.097014925373134, + "grad_norm": 0.2223699688911438, + "learning_rate": 9.575539419262434e-05, + "loss": 0.0458, + "step": 3510 + }, + { + "epoch": 13.134328358208956, + "grad_norm": 0.22557464241981506, + "learning_rate": 9.572199695899522e-05, + "loss": 0.0445, + "step": 3520 + }, + { + "epoch": 13.171641791044776, + "grad_norm": 0.25104308128356934, + "learning_rate": 9.568847472317232e-05, + "loss": 0.0435, + "step": 3530 + }, + { + "epoch": 13.208955223880597, + "grad_norm": 0.18720711767673492, + "learning_rate": 9.565482757680415e-05, + "loss": 0.0453, + "step": 3540 + }, + { + "epoch": 13.246268656716419, + "grad_norm": 0.16838951408863068, + "learning_rate": 9.562105561188069e-05, + "loss": 0.0505, + "step": 3550 + }, + { + "epoch": 13.283582089552239, + "grad_norm": 0.31681734323501587, + "learning_rate": 9.558715892073323e-05, + "loss": 0.0494, + "step": 3560 + }, + { + "epoch": 13.32089552238806, + "grad_norm": 0.2390700727701187, + "learning_rate": 9.555313759603402e-05, + "loss": 0.0538, + "step": 3570 + }, + { + "epoch": 13.35820895522388, + "grad_norm": 0.20680709183216095, + "learning_rate": 9.551899173079607e-05, + "loss": 0.0519, + "step": 3580 + }, + { + "epoch": 13.395522388059701, + "grad_norm": 0.2758580148220062, + "learning_rate": 9.548472141837286e-05, + "loss": 0.0512, + "step": 3590 + }, + { + "epoch": 13.432835820895523, + "grad_norm": 0.3653097450733185, + "learning_rate": 9.545032675245813e-05, + "loss": 0.0496, + "step": 3600 + }, + { + "epoch": 13.470149253731343, + "grad_norm": 0.23886866867542267, + "learning_rate": 9.541580782708557e-05, + "loss": 0.0455, + "step": 3610 + }, + { + "epoch": 13.507462686567164, + "grad_norm": 0.3280908465385437, + "learning_rate": 9.538116473662861e-05, + "loss": 0.0489, + "step": 3620 + }, + { + "epoch": 13.544776119402986, + "grad_norm": 0.20268180966377258, + "learning_rate": 9.534639757580013e-05, + "loss": 0.0484, + "step": 3630 + }, + { + "epoch": 13.582089552238806, + "grad_norm": 0.2582015097141266, + "learning_rate": 9.531150643965223e-05, + "loss": 0.0487, + "step": 3640 + }, + { + "epoch": 13.619402985074627, + "grad_norm": 0.18157973885536194, + "learning_rate": 9.527649142357596e-05, + "loss": 0.0496, + "step": 3650 + }, + { + "epoch": 13.656716417910447, + "grad_norm": 0.22841542959213257, + "learning_rate": 9.524135262330098e-05, + "loss": 0.0467, + "step": 3660 + }, + { + "epoch": 13.694029850746269, + "grad_norm": 0.2519935369491577, + "learning_rate": 9.520609013489547e-05, + "loss": 0.0487, + "step": 3670 + }, + { + "epoch": 13.73134328358209, + "grad_norm": 0.24680495262145996, + "learning_rate": 9.517070405476575e-05, + "loss": 0.0457, + "step": 3680 + }, + { + "epoch": 13.76865671641791, + "grad_norm": 0.26362067461013794, + "learning_rate": 9.513519447965595e-05, + "loss": 0.0495, + "step": 3690 + }, + { + "epoch": 13.805970149253731, + "grad_norm": 0.3240712583065033, + "learning_rate": 9.509956150664796e-05, + "loss": 0.0496, + "step": 3700 + }, + { + "epoch": 13.843283582089553, + "grad_norm": 0.21009013056755066, + "learning_rate": 9.50638052331609e-05, + "loss": 0.0457, + "step": 3710 + }, + { + "epoch": 13.880597014925373, + "grad_norm": 0.1669154316186905, + "learning_rate": 9.502792575695112e-05, + "loss": 0.0496, + "step": 3720 + }, + { + "epoch": 13.917910447761194, + "grad_norm": 0.22347605228424072, + "learning_rate": 9.499192317611167e-05, + "loss": 0.0426, + "step": 3730 + }, + { + "epoch": 13.955223880597014, + "grad_norm": 0.15208907425403595, + "learning_rate": 9.49557975890723e-05, + "loss": 0.0447, + "step": 3740 + }, + { + "epoch": 13.992537313432836, + "grad_norm": 0.3206101059913635, + "learning_rate": 9.491954909459895e-05, + "loss": 0.0471, + "step": 3750 + }, + { + "epoch": 14.029850746268657, + "grad_norm": 0.15873713791370392, + "learning_rate": 9.488317779179361e-05, + "loss": 0.0401, + "step": 3760 + }, + { + "epoch": 14.067164179104477, + "grad_norm": 0.19690357148647308, + "learning_rate": 9.484668378009408e-05, + "loss": 0.0491, + "step": 3770 + }, + { + "epoch": 14.104477611940299, + "grad_norm": 0.3211113214492798, + "learning_rate": 9.481006715927351e-05, + "loss": 0.049, + "step": 3780 + }, + { + "epoch": 14.14179104477612, + "grad_norm": 0.27657604217529297, + "learning_rate": 9.477332802944044e-05, + "loss": 0.0396, + "step": 3790 + }, + { + "epoch": 14.17910447761194, + "grad_norm": 0.20194031298160553, + "learning_rate": 9.473646649103818e-05, + "loss": 0.0442, + "step": 3800 + }, + { + "epoch": 14.216417910447761, + "grad_norm": 0.20344595611095428, + "learning_rate": 9.46994826448448e-05, + "loss": 0.0427, + "step": 3810 + }, + { + "epoch": 14.253731343283581, + "grad_norm": 0.2067718505859375, + "learning_rate": 9.46623765919727e-05, + "loss": 0.0501, + "step": 3820 + }, + { + "epoch": 14.291044776119403, + "grad_norm": 0.29719170928001404, + "learning_rate": 9.462514843386845e-05, + "loss": 0.0519, + "step": 3830 + }, + { + "epoch": 14.328358208955224, + "grad_norm": 0.2347182184457779, + "learning_rate": 9.458779827231237e-05, + "loss": 0.0413, + "step": 3840 + }, + { + "epoch": 14.365671641791044, + "grad_norm": 0.1558852344751358, + "learning_rate": 9.45503262094184e-05, + "loss": 0.0442, + "step": 3850 + }, + { + "epoch": 14.402985074626866, + "grad_norm": 0.23085005581378937, + "learning_rate": 9.451273234763371e-05, + "loss": 0.047, + "step": 3860 + }, + { + "epoch": 14.440298507462687, + "grad_norm": 0.1515151560306549, + "learning_rate": 9.447501678973852e-05, + "loss": 0.0481, + "step": 3870 + }, + { + "epoch": 14.477611940298507, + "grad_norm": 0.1916729211807251, + "learning_rate": 9.443717963884569e-05, + "loss": 0.0474, + "step": 3880 + }, + { + "epoch": 14.514925373134329, + "grad_norm": 0.2536492943763733, + "learning_rate": 9.439922099840054e-05, + "loss": 0.0382, + "step": 3890 + }, + { + "epoch": 14.552238805970148, + "grad_norm": 0.1672086864709854, + "learning_rate": 9.43611409721806e-05, + "loss": 0.0497, + "step": 3900 + }, + { + "epoch": 14.58955223880597, + "grad_norm": 0.3644237518310547, + "learning_rate": 9.432293966429514e-05, + "loss": 0.0444, + "step": 3910 + }, + { + "epoch": 14.626865671641792, + "grad_norm": 0.20307251811027527, + "learning_rate": 9.428461717918511e-05, + "loss": 0.0452, + "step": 3920 + }, + { + "epoch": 14.664179104477611, + "grad_norm": 0.20441733300685883, + "learning_rate": 9.424617362162271e-05, + "loss": 0.0454, + "step": 3930 + }, + { + "epoch": 14.701492537313433, + "grad_norm": 0.26315611600875854, + "learning_rate": 9.420760909671118e-05, + "loss": 0.0486, + "step": 3940 + }, + { + "epoch": 14.738805970149254, + "grad_norm": 0.1983092874288559, + "learning_rate": 9.416892370988444e-05, + "loss": 0.0483, + "step": 3950 + }, + { + "epoch": 14.776119402985074, + "grad_norm": 0.18301443755626678, + "learning_rate": 9.413011756690685e-05, + "loss": 0.0456, + "step": 3960 + }, + { + "epoch": 14.813432835820896, + "grad_norm": 0.2433597594499588, + "learning_rate": 9.409119077387294e-05, + "loss": 0.0463, + "step": 3970 + }, + { + "epoch": 14.850746268656717, + "grad_norm": 0.27949392795562744, + "learning_rate": 9.405214343720707e-05, + "loss": 0.0412, + "step": 3980 + }, + { + "epoch": 14.888059701492537, + "grad_norm": 0.22806599736213684, + "learning_rate": 9.401297566366318e-05, + "loss": 0.0448, + "step": 3990 + }, + { + "epoch": 14.925373134328359, + "grad_norm": 0.25421562790870667, + "learning_rate": 9.397368756032445e-05, + "loss": 0.0426, + "step": 4000 + }, + { + "epoch": 14.962686567164178, + "grad_norm": 0.2436474859714508, + "learning_rate": 9.393427923460308e-05, + "loss": 0.0474, + "step": 4010 + }, + { + "epoch": 15.0, + "grad_norm": 0.3756405711174011, + "learning_rate": 9.389475079423988e-05, + "loss": 0.0438, + "step": 4020 + }, + { + "epoch": 15.037313432835822, + "grad_norm": 0.25687697529792786, + "learning_rate": 9.385510234730415e-05, + "loss": 0.0435, + "step": 4030 + }, + { + "epoch": 15.074626865671641, + "grad_norm": 0.17263716459274292, + "learning_rate": 9.381533400219318e-05, + "loss": 0.0455, + "step": 4040 + }, + { + "epoch": 15.111940298507463, + "grad_norm": 0.2471216470003128, + "learning_rate": 9.377544586763215e-05, + "loss": 0.0429, + "step": 4050 + }, + { + "epoch": 15.149253731343283, + "grad_norm": 0.20195460319519043, + "learning_rate": 9.373543805267368e-05, + "loss": 0.0432, + "step": 4060 + }, + { + "epoch": 15.186567164179104, + "grad_norm": 0.1709851622581482, + "learning_rate": 9.369531066669758e-05, + "loss": 0.0477, + "step": 4070 + }, + { + "epoch": 15.223880597014926, + "grad_norm": 0.23063932359218597, + "learning_rate": 9.365506381941066e-05, + "loss": 0.0379, + "step": 4080 + }, + { + "epoch": 15.261194029850746, + "grad_norm": 0.3265426754951477, + "learning_rate": 9.36146976208462e-05, + "loss": 0.0435, + "step": 4090 + }, + { + "epoch": 15.298507462686567, + "grad_norm": 0.26373934745788574, + "learning_rate": 9.357421218136386e-05, + "loss": 0.047, + "step": 4100 + }, + { + "epoch": 15.335820895522389, + "grad_norm": 0.16861388087272644, + "learning_rate": 9.353360761164931e-05, + "loss": 0.0448, + "step": 4110 + }, + { + "epoch": 15.373134328358208, + "grad_norm": 0.303790807723999, + "learning_rate": 9.349288402271388e-05, + "loss": 0.0396, + "step": 4120 + }, + { + "epoch": 15.41044776119403, + "grad_norm": 0.1940719038248062, + "learning_rate": 9.345204152589428e-05, + "loss": 0.0474, + "step": 4130 + }, + { + "epoch": 15.447761194029852, + "grad_norm": 0.34091615676879883, + "learning_rate": 9.341108023285238e-05, + "loss": 0.0424, + "step": 4140 + }, + { + "epoch": 15.485074626865671, + "grad_norm": 0.27036693692207336, + "learning_rate": 9.337000025557476e-05, + "loss": 0.0482, + "step": 4150 + }, + { + "epoch": 15.522388059701493, + "grad_norm": 0.16908007860183716, + "learning_rate": 9.332880170637252e-05, + "loss": 0.0381, + "step": 4160 + }, + { + "epoch": 15.559701492537313, + "grad_norm": 0.23332923650741577, + "learning_rate": 9.328748469788093e-05, + "loss": 0.0427, + "step": 4170 + }, + { + "epoch": 15.597014925373134, + "grad_norm": 0.16899706423282623, + "learning_rate": 9.32460493430591e-05, + "loss": 0.0439, + "step": 4180 + }, + { + "epoch": 15.634328358208956, + "grad_norm": 0.12869524955749512, + "learning_rate": 9.320449575518972e-05, + "loss": 0.0481, + "step": 4190 + }, + { + "epoch": 15.671641791044776, + "grad_norm": 0.21159130334854126, + "learning_rate": 9.316282404787871e-05, + "loss": 0.0446, + "step": 4200 + }, + { + "epoch": 15.708955223880597, + "grad_norm": 0.1849961131811142, + "learning_rate": 9.31210343350549e-05, + "loss": 0.041, + "step": 4210 + }, + { + "epoch": 15.746268656716419, + "grad_norm": 0.16107840836048126, + "learning_rate": 9.30791267309698e-05, + "loss": 0.0429, + "step": 4220 + }, + { + "epoch": 15.783582089552239, + "grad_norm": 0.14206446707248688, + "learning_rate": 9.30371013501972e-05, + "loss": 0.0409, + "step": 4230 + }, + { + "epoch": 15.82089552238806, + "grad_norm": 0.2168441116809845, + "learning_rate": 9.299495830763286e-05, + "loss": 0.0413, + "step": 4240 + }, + { + "epoch": 15.85820895522388, + "grad_norm": 0.21431951224803925, + "learning_rate": 9.295269771849427e-05, + "loss": 0.0472, + "step": 4250 + }, + { + "epoch": 15.895522388059701, + "grad_norm": 0.16851255297660828, + "learning_rate": 9.291031969832026e-05, + "loss": 0.0508, + "step": 4260 + }, + { + "epoch": 15.932835820895523, + "grad_norm": 0.18404732644557953, + "learning_rate": 9.286782436297073e-05, + "loss": 0.0402, + "step": 4270 + }, + { + "epoch": 15.970149253731343, + "grad_norm": 0.21722930669784546, + "learning_rate": 9.282521182862629e-05, + "loss": 0.0397, + "step": 4280 + }, + { + "epoch": 16.007462686567163, + "grad_norm": 0.2523709833621979, + "learning_rate": 9.278248221178798e-05, + "loss": 0.0427, + "step": 4290 + }, + { + "epoch": 16.044776119402986, + "grad_norm": 0.17736563086509705, + "learning_rate": 9.273963562927695e-05, + "loss": 0.0458, + "step": 4300 + }, + { + "epoch": 16.082089552238806, + "grad_norm": 0.20613858103752136, + "learning_rate": 9.269667219823412e-05, + "loss": 0.0387, + "step": 4310 + }, + { + "epoch": 16.119402985074625, + "grad_norm": 0.16557513177394867, + "learning_rate": 9.265359203611987e-05, + "loss": 0.0411, + "step": 4320 + }, + { + "epoch": 16.15671641791045, + "grad_norm": 0.28119519352912903, + "learning_rate": 9.261039526071374e-05, + "loss": 0.0468, + "step": 4330 + }, + { + "epoch": 16.19402985074627, + "grad_norm": 0.21538576483726501, + "learning_rate": 9.256708199011401e-05, + "loss": 0.0368, + "step": 4340 + }, + { + "epoch": 16.23134328358209, + "grad_norm": 0.19657357037067413, + "learning_rate": 9.252365234273755e-05, + "loss": 0.038, + "step": 4350 + }, + { + "epoch": 16.26865671641791, + "grad_norm": 0.19258421659469604, + "learning_rate": 9.248010643731935e-05, + "loss": 0.0414, + "step": 4360 + }, + { + "epoch": 16.30597014925373, + "grad_norm": 0.28801625967025757, + "learning_rate": 9.243644439291223e-05, + "loss": 0.0387, + "step": 4370 + }, + { + "epoch": 16.34328358208955, + "grad_norm": 0.16581468284130096, + "learning_rate": 9.239266632888659e-05, + "loss": 0.0383, + "step": 4380 + }, + { + "epoch": 16.380597014925375, + "grad_norm": 0.34664949774742126, + "learning_rate": 9.234877236492997e-05, + "loss": 0.0453, + "step": 4390 + }, + { + "epoch": 16.417910447761194, + "grad_norm": 0.1439947783946991, + "learning_rate": 9.230476262104677e-05, + "loss": 0.0466, + "step": 4400 + }, + { + "epoch": 16.455223880597014, + "grad_norm": 0.15509940683841705, + "learning_rate": 9.226063721755799e-05, + "loss": 0.0488, + "step": 4410 + }, + { + "epoch": 16.492537313432837, + "grad_norm": 0.18005985021591187, + "learning_rate": 9.221639627510076e-05, + "loss": 0.0407, + "step": 4420 + }, + { + "epoch": 16.529850746268657, + "grad_norm": 0.16012470424175262, + "learning_rate": 9.217203991462815e-05, + "loss": 0.0394, + "step": 4430 + }, + { + "epoch": 16.567164179104477, + "grad_norm": 0.2978847920894623, + "learning_rate": 9.212756825740873e-05, + "loss": 0.0451, + "step": 4440 + }, + { + "epoch": 16.604477611940297, + "grad_norm": 0.2236834019422531, + "learning_rate": 9.208298142502636e-05, + "loss": 0.0487, + "step": 4450 + }, + { + "epoch": 16.64179104477612, + "grad_norm": 0.2686060667037964, + "learning_rate": 9.20382795393797e-05, + "loss": 0.0403, + "step": 4460 + }, + { + "epoch": 16.67910447761194, + "grad_norm": 0.33534038066864014, + "learning_rate": 9.199346272268199e-05, + "loss": 0.0385, + "step": 4470 + }, + { + "epoch": 16.71641791044776, + "grad_norm": 0.19250528514385223, + "learning_rate": 9.194853109746074e-05, + "loss": 0.0441, + "step": 4480 + }, + { + "epoch": 16.753731343283583, + "grad_norm": 0.19218407571315765, + "learning_rate": 9.190348478655724e-05, + "loss": 0.0474, + "step": 4490 + }, + { + "epoch": 16.791044776119403, + "grad_norm": 0.21163488924503326, + "learning_rate": 9.185832391312644e-05, + "loss": 0.0411, + "step": 4500 + } + ], + "logging_steps": 10, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 75, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.3828481543643136e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-5000/README.md b/checkpoint-5000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c7b89968043c4a4cf38dcac1f9bc557c35da3883 --- /dev/null +++ b/checkpoint-5000/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/ibru/.cache/huggingface/hub/models--nvidia--GR00T-N1-2B/snapshots/32e1fd2507f7739fad443e6b449c8188e0e02fcb +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-5000/adapter_config.json b/checkpoint-5000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8f45e5c825b3b34b334d049ddf8e68e52a500cc6 --- /dev/null +++ b/checkpoint-5000/adapter_config.json @@ -0,0 +1,36 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/ibru/.cache/huggingface/hub/models--nvidia--GR00T-N1-2B/snapshots/32e1fd2507f7739fad443e6b449c8188e0e02fcb", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "to_k", + "to_q", + "v_proj", + "to_v" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-5000/adapter_model.safetensors b/checkpoint-5000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6980be1af88247a8c856f782f38b3f699cd5f2aa --- /dev/null +++ b/checkpoint-5000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:193ed2eff2f1efc1b45d0876f750a171ac697da5510b5d45510990c327aa971e +size 123328576 diff --git a/checkpoint-5000/experiment_cfg/metadata.json b/checkpoint-5000/experiment_cfg/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..40302046074c7e429ab3933ad6b163f9735902de --- /dev/null +++ b/checkpoint-5000/experiment_cfg/metadata.json @@ -0,0 +1,275 @@ +{ + "new_embodiment": { + "statistics": { + "state": { + "single_arm": { + "max": [ + 47.021484375, + 135.263671875, + 178.505859375, + 78.3984375, + 56.77734375 + ], + "min": [ + -25.576171875, + 46.93359375, + 89.736328125, + -30.41015625, + -77.607421875 + ], + "mean": [ + 7.780572414398193, + 121.54933166503906, + 145.44825744628906, + 26.051393508911133, + -12.748016357421875 + ], + "std": [ + 11.060831069946289, + 21.937597274780273, + 17.16187286376953, + 19.231945037841797, + 14.66512680053711 + ], + "q01": [ + -17.578125, + 58.0078125, + 97.998046875, + -13.447265625, + -39.9005859375 + ], + "q99": [ + 36.650390625, + 134.47265625, + 178.41796875, + 66.65009765625, + 40.166015625 + ] + }, + "gripper": { + "max": [ + 52.22222137451172 + ], + "min": [ + -3.846153974533081 + ], + "mean": [ + 10.933439254760742 + ], + "std": [ + 15.509913444519043 + ], + "q01": [ + -3.846153974533081 + ], + "q99": [ + 51.02564239501953 + ] + }, + "mobile_base": { + "max": [ + 75.42072296142578, + 276.7638244628906, + 93.75 + ], + "min": [ + -170.01620483398438, + -274.5497131347656, + -93.75 + ], + "mean": [ + -0.31241804361343384, + 58.99717712402344, + 2.4293017387390137 + ], + "std": [ + 10.56183910369873, + 119.39802551269531, + 22.590484619140625 + ], + "q01": [ + -33.65809627532959, + -265.6932678222656, + -72.849609375 + ], + "q99": [ + 30.679615020751953, + 270.1214904785156, + 90.234375 + ] + } + }, + "action": { + "single_arm": { + "max": [ + 37.96875, + 135.087890625, + 179.384765625, + 78.837890625, + 57.392578125 + ], + "min": [ + -26.279296875, + 47.373046875, + 89.912109375, + -31.640625, + -77.16796875 + ], + "mean": [ + 8.038639068603516, + 122.76031494140625, + 145.15855407714844, + 26.28432846069336, + -13.195321083068848 + ], + "std": [ + 11.36032772064209, + 21.925451278686523, + 17.071842193603516, + 19.503877639770508, + 14.882487297058105 + ], + "q01": [ + -18.10546875, + 58.623046875, + 98.26171875, + -14.326171875, + -40.078125 + ], + "q99": [ + 37.44140625, + 135.087890625, + 179.296875, + 67.1484375, + 40.869140625 + ] + }, + "gripper": { + "max": [ + 52.646484375 + ], + "min": [ + -10.72265625 + ], + "mean": [ + 4.366570949554443 + ], + "std": [ + 18.90865707397461 + ], + "q01": [ + -10.546875 + ], + "q99": [ + 51.767578125 + ] + }, + "mobile_base": { + "max": [ + 230.0971221923828, + 265.6932678222656, + 90.0 + ], + "min": [ + -230.0971221923828, + -265.6932678222656, + -90.0 + ], + "mean": [ + -0.36507830023765564, + 60.13115310668945, + 2.5394127368927 + ], + "std": [ + 15.02155590057373, + 129.06507873535156, + 27.82071304321289 + ], + "q01": [ + -0.02556634694337845, + -265.6932678222656, + -90.0 + ], + "q99": [ + 0.02556634694337845, + 265.6932678222656, + 90.0 + ] + } + } + }, + "modalities": { + "video": { + "wrist": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + }, + "front": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + } + }, + "state": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + }, + "mobile_base": { + "absolute": true, + "rotation_type": null, + "shape": [ + 3 + ], + "continuous": true + } + }, + "action": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + }, + "mobile_base": { + "absolute": true, + "rotation_type": null, + "shape": [ + 3 + ], + "continuous": true + } + } + }, + "embodiment_tag": "new_embodiment" + } +} \ No newline at end of file diff --git a/checkpoint-5000/optimizer.pt b/checkpoint-5000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff0d1cf29c87f1d853a3872fa9e0445f5b0fa186 --- /dev/null +++ b/checkpoint-5000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afe44114548aaec257567950fa1c0a2981b6c4d75cd69e6f7a58ecb03c5ab446 +size 246824634 diff --git a/checkpoint-5000/rng_state.pth b/checkpoint-5000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..7e08261796bd8c95ffdeb971fcf6b55820163ad6 --- /dev/null +++ b/checkpoint-5000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e423bc827d5231f98e5bb1fe4ad825145c336e59988047618a72a0e4330b669e +size 14244 diff --git a/checkpoint-5000/scheduler.pt b/checkpoint-5000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..627f9d59edb9d02c2562649f89298f7e6642ad1a --- /dev/null +++ b/checkpoint-5000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:686df702ebab6ee6c983b8676d5ee13ddeab5e30f320c5136697dbf37d11f69e +size 1064 diff --git a/checkpoint-5000/trainer_state.json b/checkpoint-5000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4aceba7ce96868822fe71ebd2a42c3ea73d3ea95 --- /dev/null +++ b/checkpoint-5000/trainer_state.json @@ -0,0 +1,3533 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 18.65671641791045, + "eval_steps": 500, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03731343283582089, + "grad_norm": 0.8186072111129761, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.3847, + "step": 10 + }, + { + "epoch": 0.07462686567164178, + "grad_norm": 0.5007426142692566, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.4283, + "step": 20 + }, + { + "epoch": 0.11194029850746269, + "grad_norm": 0.49460887908935547, + "learning_rate": 3e-06, + "loss": 1.4868, + "step": 30 + }, + { + "epoch": 0.14925373134328357, + "grad_norm": 0.5032920837402344, + "learning_rate": 4.000000000000001e-06, + "loss": 1.4491, + "step": 40 + }, + { + "epoch": 0.1865671641791045, + "grad_norm": 0.5688469409942627, + "learning_rate": 5e-06, + "loss": 1.3703, + "step": 50 + }, + { + "epoch": 0.22388059701492538, + "grad_norm": 0.5052517652511597, + "learning_rate": 6e-06, + "loss": 1.419, + "step": 60 + }, + { + "epoch": 0.26119402985074625, + "grad_norm": 0.6315643787384033, + "learning_rate": 7.000000000000001e-06, + "loss": 1.3058, + "step": 70 + }, + { + "epoch": 0.29850746268656714, + "grad_norm": 0.6060447692871094, + "learning_rate": 8.000000000000001e-06, + "loss": 1.2908, + "step": 80 + }, + { + "epoch": 0.3358208955223881, + "grad_norm": 0.5513179302215576, + "learning_rate": 9e-06, + "loss": 1.2311, + "step": 90 + }, + { + "epoch": 0.373134328358209, + "grad_norm": 0.8467404246330261, + "learning_rate": 1e-05, + "loss": 1.2043, + "step": 100 + }, + { + "epoch": 0.41044776119402987, + "grad_norm": 0.8141824007034302, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.0707, + "step": 110 + }, + { + "epoch": 0.44776119402985076, + "grad_norm": 0.7932347059249878, + "learning_rate": 1.2e-05, + "loss": 0.9377, + "step": 120 + }, + { + "epoch": 0.48507462686567165, + "grad_norm": 0.684220552444458, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.714, + "step": 130 + }, + { + "epoch": 0.5223880597014925, + "grad_norm": 0.5886895060539246, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.6479, + "step": 140 + }, + { + "epoch": 0.5597014925373134, + "grad_norm": 0.4764939248561859, + "learning_rate": 1.5e-05, + "loss": 0.5463, + "step": 150 + }, + { + "epoch": 0.5970149253731343, + "grad_norm": 0.4621008038520813, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.4641, + "step": 160 + }, + { + "epoch": 0.6343283582089553, + "grad_norm": 0.46492910385131836, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.4159, + "step": 170 + }, + { + "epoch": 0.6716417910447762, + "grad_norm": 0.5017415881156921, + "learning_rate": 1.8e-05, + "loss": 0.4094, + "step": 180 + }, + { + "epoch": 0.7089552238805971, + "grad_norm": 0.34392210841178894, + "learning_rate": 1.9e-05, + "loss": 0.3478, + "step": 190 + }, + { + "epoch": 0.746268656716418, + "grad_norm": 0.3240516483783722, + "learning_rate": 2e-05, + "loss": 0.3821, + "step": 200 + }, + { + "epoch": 0.7835820895522388, + "grad_norm": 0.26301339268684387, + "learning_rate": 2.1e-05, + "loss": 0.3606, + "step": 210 + }, + { + "epoch": 0.8208955223880597, + "grad_norm": 0.34712520241737366, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.3421, + "step": 220 + }, + { + "epoch": 0.8582089552238806, + "grad_norm": 0.3248469829559326, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.3389, + "step": 230 + }, + { + "epoch": 0.8955223880597015, + "grad_norm": 0.298149436712265, + "learning_rate": 2.4e-05, + "loss": 0.3145, + "step": 240 + }, + { + "epoch": 0.9328358208955224, + "grad_norm": 0.2757190763950348, + "learning_rate": 2.5e-05, + "loss": 0.3065, + "step": 250 + }, + { + "epoch": 0.9701492537313433, + "grad_norm": 0.30510950088500977, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.2971, + "step": 260 + }, + { + "epoch": 1.007462686567164, + "grad_norm": 0.37349891662597656, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.3273, + "step": 270 + }, + { + "epoch": 1.044776119402985, + "grad_norm": 0.3667634129524231, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.308, + "step": 280 + }, + { + "epoch": 1.0820895522388059, + "grad_norm": 0.3463355004787445, + "learning_rate": 2.9e-05, + "loss": 0.3109, + "step": 290 + }, + { + "epoch": 1.1194029850746268, + "grad_norm": 0.3888525366783142, + "learning_rate": 3e-05, + "loss": 0.2644, + "step": 300 + }, + { + "epoch": 1.1567164179104479, + "grad_norm": 0.3749147951602936, + "learning_rate": 3.1e-05, + "loss": 0.2858, + "step": 310 + }, + { + "epoch": 1.1940298507462686, + "grad_norm": 0.3270276188850403, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.2573, + "step": 320 + }, + { + "epoch": 1.2313432835820897, + "grad_norm": 0.3658592998981476, + "learning_rate": 3.3e-05, + "loss": 0.2613, + "step": 330 + }, + { + "epoch": 1.2686567164179103, + "grad_norm": 0.3526328206062317, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.2328, + "step": 340 + }, + { + "epoch": 1.3059701492537314, + "grad_norm": 0.4528139531612396, + "learning_rate": 3.5e-05, + "loss": 0.2429, + "step": 350 + }, + { + "epoch": 1.3432835820895521, + "grad_norm": 0.5426791310310364, + "learning_rate": 3.6e-05, + "loss": 0.2209, + "step": 360 + }, + { + "epoch": 1.3805970149253732, + "grad_norm": 0.41844552755355835, + "learning_rate": 3.7e-05, + "loss": 0.2319, + "step": 370 + }, + { + "epoch": 1.417910447761194, + "grad_norm": 0.4749431908130646, + "learning_rate": 3.8e-05, + "loss": 0.2233, + "step": 380 + }, + { + "epoch": 1.455223880597015, + "grad_norm": 0.7010189890861511, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.2181, + "step": 390 + }, + { + "epoch": 1.4925373134328357, + "grad_norm": 0.5747635960578918, + "learning_rate": 4e-05, + "loss": 0.213, + "step": 400 + }, + { + "epoch": 1.5298507462686568, + "grad_norm": 0.3661474287509918, + "learning_rate": 4.1e-05, + "loss": 0.2171, + "step": 410 + }, + { + "epoch": 1.5671641791044775, + "grad_norm": 0.467835396528244, + "learning_rate": 4.2e-05, + "loss": 0.1985, + "step": 420 + }, + { + "epoch": 1.6044776119402986, + "grad_norm": 0.5470123291015625, + "learning_rate": 4.3e-05, + "loss": 0.2176, + "step": 430 + }, + { + "epoch": 1.6417910447761193, + "grad_norm": 0.5761199593544006, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.2007, + "step": 440 + }, + { + "epoch": 1.6791044776119404, + "grad_norm": 0.48257485032081604, + "learning_rate": 4.5e-05, + "loss": 0.2043, + "step": 450 + }, + { + "epoch": 1.716417910447761, + "grad_norm": 0.48353052139282227, + "learning_rate": 4.600000000000001e-05, + "loss": 0.1872, + "step": 460 + }, + { + "epoch": 1.7537313432835822, + "grad_norm": 0.4388391375541687, + "learning_rate": 4.7e-05, + "loss": 0.206, + "step": 470 + }, + { + "epoch": 1.7910447761194028, + "grad_norm": 0.47332626581192017, + "learning_rate": 4.8e-05, + "loss": 0.1876, + "step": 480 + }, + { + "epoch": 1.828358208955224, + "grad_norm": 0.8053535223007202, + "learning_rate": 4.9e-05, + "loss": 0.1839, + "step": 490 + }, + { + "epoch": 1.8656716417910446, + "grad_norm": 0.413979709148407, + "learning_rate": 5e-05, + "loss": 0.1732, + "step": 500 + }, + { + "epoch": 1.9029850746268657, + "grad_norm": 0.36910712718963623, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.1827, + "step": 510 + }, + { + "epoch": 1.9402985074626866, + "grad_norm": 0.8458298444747925, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.1727, + "step": 520 + }, + { + "epoch": 1.9776119402985075, + "grad_norm": 0.5452115535736084, + "learning_rate": 5.300000000000001e-05, + "loss": 0.1818, + "step": 530 + }, + { + "epoch": 2.014925373134328, + "grad_norm": 0.4518108069896698, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.177, + "step": 540 + }, + { + "epoch": 2.0522388059701493, + "grad_norm": 0.66865074634552, + "learning_rate": 5.500000000000001e-05, + "loss": 0.1726, + "step": 550 + }, + { + "epoch": 2.08955223880597, + "grad_norm": 0.6536034345626831, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.1541, + "step": 560 + }, + { + "epoch": 2.126865671641791, + "grad_norm": 0.5571377277374268, + "learning_rate": 5.6999999999999996e-05, + "loss": 0.1671, + "step": 570 + }, + { + "epoch": 2.1641791044776117, + "grad_norm": 0.5385546684265137, + "learning_rate": 5.8e-05, + "loss": 0.1582, + "step": 580 + }, + { + "epoch": 2.201492537313433, + "grad_norm": 0.577961266040802, + "learning_rate": 5.9e-05, + "loss": 0.1528, + "step": 590 + }, + { + "epoch": 2.2388059701492535, + "grad_norm": 0.5082416534423828, + "learning_rate": 6e-05, + "loss": 0.1638, + "step": 600 + }, + { + "epoch": 2.2761194029850746, + "grad_norm": 0.5490861535072327, + "learning_rate": 6.1e-05, + "loss": 0.166, + "step": 610 + }, + { + "epoch": 2.3134328358208958, + "grad_norm": 0.492366760969162, + "learning_rate": 6.2e-05, + "loss": 0.1481, + "step": 620 + }, + { + "epoch": 2.3507462686567164, + "grad_norm": 0.3702855110168457, + "learning_rate": 6.3e-05, + "loss": 0.1514, + "step": 630 + }, + { + "epoch": 2.388059701492537, + "grad_norm": 0.664667010307312, + "learning_rate": 6.400000000000001e-05, + "loss": 0.1441, + "step": 640 + }, + { + "epoch": 2.425373134328358, + "grad_norm": 0.33382174372673035, + "learning_rate": 6.500000000000001e-05, + "loss": 0.1573, + "step": 650 + }, + { + "epoch": 2.4626865671641793, + "grad_norm": 0.4848814010620117, + "learning_rate": 6.6e-05, + "loss": 0.1457, + "step": 660 + }, + { + "epoch": 2.5, + "grad_norm": 0.3649997413158417, + "learning_rate": 6.7e-05, + "loss": 0.1467, + "step": 670 + }, + { + "epoch": 2.5373134328358207, + "grad_norm": 0.6385223865509033, + "learning_rate": 6.800000000000001e-05, + "loss": 0.145, + "step": 680 + }, + { + "epoch": 2.574626865671642, + "grad_norm": 0.4580625891685486, + "learning_rate": 6.9e-05, + "loss": 0.1352, + "step": 690 + }, + { + "epoch": 2.611940298507463, + "grad_norm": 0.5141746401786804, + "learning_rate": 7e-05, + "loss": 0.1444, + "step": 700 + }, + { + "epoch": 2.6492537313432836, + "grad_norm": 0.40220722556114197, + "learning_rate": 7.1e-05, + "loss": 0.1493, + "step": 710 + }, + { + "epoch": 2.6865671641791042, + "grad_norm": 0.5510571002960205, + "learning_rate": 7.2e-05, + "loss": 0.1387, + "step": 720 + }, + { + "epoch": 2.7238805970149254, + "grad_norm": 0.43814659118652344, + "learning_rate": 7.3e-05, + "loss": 0.1374, + "step": 730 + }, + { + "epoch": 2.7611940298507465, + "grad_norm": 0.4118008613586426, + "learning_rate": 7.4e-05, + "loss": 0.1297, + "step": 740 + }, + { + "epoch": 2.798507462686567, + "grad_norm": 0.5626503229141235, + "learning_rate": 7.500000000000001e-05, + "loss": 0.1299, + "step": 750 + }, + { + "epoch": 2.835820895522388, + "grad_norm": 0.4066360592842102, + "learning_rate": 7.6e-05, + "loss": 0.1102, + "step": 760 + }, + { + "epoch": 2.873134328358209, + "grad_norm": 0.47184985876083374, + "learning_rate": 7.7e-05, + "loss": 0.1219, + "step": 770 + }, + { + "epoch": 2.91044776119403, + "grad_norm": 0.6611475348472595, + "learning_rate": 7.800000000000001e-05, + "loss": 0.1267, + "step": 780 + }, + { + "epoch": 2.9477611940298507, + "grad_norm": 0.3570108413696289, + "learning_rate": 7.900000000000001e-05, + "loss": 0.1191, + "step": 790 + }, + { + "epoch": 2.9850746268656714, + "grad_norm": 0.4581681489944458, + "learning_rate": 8e-05, + "loss": 0.1209, + "step": 800 + }, + { + "epoch": 3.0223880597014925, + "grad_norm": 0.4643435776233673, + "learning_rate": 8.1e-05, + "loss": 0.129, + "step": 810 + }, + { + "epoch": 3.0597014925373136, + "grad_norm": 0.5595763921737671, + "learning_rate": 8.2e-05, + "loss": 0.1158, + "step": 820 + }, + { + "epoch": 3.0970149253731343, + "grad_norm": 0.48848605155944824, + "learning_rate": 8.3e-05, + "loss": 0.1188, + "step": 830 + }, + { + "epoch": 3.1343283582089554, + "grad_norm": 0.4496570825576782, + "learning_rate": 8.4e-05, + "loss": 0.114, + "step": 840 + }, + { + "epoch": 3.171641791044776, + "grad_norm": 0.31364986300468445, + "learning_rate": 8.5e-05, + "loss": 0.1196, + "step": 850 + }, + { + "epoch": 3.208955223880597, + "grad_norm": 0.3395878076553345, + "learning_rate": 8.6e-05, + "loss": 0.1124, + "step": 860 + }, + { + "epoch": 3.246268656716418, + "grad_norm": 0.4917413592338562, + "learning_rate": 8.7e-05, + "loss": 0.1074, + "step": 870 + }, + { + "epoch": 3.283582089552239, + "grad_norm": 0.44114553928375244, + "learning_rate": 8.800000000000001e-05, + "loss": 0.1095, + "step": 880 + }, + { + "epoch": 3.3208955223880596, + "grad_norm": 0.3323831558227539, + "learning_rate": 8.900000000000001e-05, + "loss": 0.106, + "step": 890 + }, + { + "epoch": 3.3582089552238807, + "grad_norm": 0.4495660066604614, + "learning_rate": 9e-05, + "loss": 0.1222, + "step": 900 + }, + { + "epoch": 3.3955223880597014, + "grad_norm": 0.40784788131713867, + "learning_rate": 9.1e-05, + "loss": 0.1048, + "step": 910 + }, + { + "epoch": 3.4328358208955225, + "grad_norm": 0.4643700420856476, + "learning_rate": 9.200000000000001e-05, + "loss": 0.1097, + "step": 920 + }, + { + "epoch": 3.470149253731343, + "grad_norm": 0.472494512796402, + "learning_rate": 9.300000000000001e-05, + "loss": 0.1041, + "step": 930 + }, + { + "epoch": 3.5074626865671643, + "grad_norm": 0.6110897660255432, + "learning_rate": 9.4e-05, + "loss": 0.0959, + "step": 940 + }, + { + "epoch": 3.544776119402985, + "grad_norm": 0.5313069820404053, + "learning_rate": 9.5e-05, + "loss": 0.113, + "step": 950 + }, + { + "epoch": 3.582089552238806, + "grad_norm": 0.4223133623600006, + "learning_rate": 9.6e-05, + "loss": 0.099, + "step": 960 + }, + { + "epoch": 3.6194029850746268, + "grad_norm": 0.5464731454849243, + "learning_rate": 9.7e-05, + "loss": 0.1008, + "step": 970 + }, + { + "epoch": 3.656716417910448, + "grad_norm": 0.3538314402103424, + "learning_rate": 9.8e-05, + "loss": 0.1049, + "step": 980 + }, + { + "epoch": 3.6940298507462686, + "grad_norm": 0.7460148334503174, + "learning_rate": 9.900000000000001e-05, + "loss": 0.1088, + "step": 990 + }, + { + "epoch": 3.7313432835820897, + "grad_norm": 0.3210597038269043, + "learning_rate": 0.0001, + "loss": 0.1041, + "step": 1000 + }, + { + "epoch": 3.7686567164179103, + "grad_norm": 0.4450497627258301, + "learning_rate": 9.999993165095463e-05, + "loss": 0.0985, + "step": 1010 + }, + { + "epoch": 3.8059701492537314, + "grad_norm": 0.4348960816860199, + "learning_rate": 9.999972660400536e-05, + "loss": 0.1015, + "step": 1020 + }, + { + "epoch": 3.843283582089552, + "grad_norm": 0.462782621383667, + "learning_rate": 9.999938485971279e-05, + "loss": 0.1068, + "step": 1030 + }, + { + "epoch": 3.8805970149253732, + "grad_norm": 0.3801368474960327, + "learning_rate": 9.999890641901125e-05, + "loss": 0.1117, + "step": 1040 + }, + { + "epoch": 3.917910447761194, + "grad_norm": 0.45135366916656494, + "learning_rate": 9.999829128320874e-05, + "loss": 0.0917, + "step": 1050 + }, + { + "epoch": 3.955223880597015, + "grad_norm": 0.41138389706611633, + "learning_rate": 9.999753945398704e-05, + "loss": 0.1049, + "step": 1060 + }, + { + "epoch": 3.9925373134328357, + "grad_norm": 0.4976252317428589, + "learning_rate": 9.999665093340165e-05, + "loss": 0.1029, + "step": 1070 + }, + { + "epoch": 4.029850746268656, + "grad_norm": 0.46372008323669434, + "learning_rate": 9.99956257238817e-05, + "loss": 0.1012, + "step": 1080 + }, + { + "epoch": 4.067164179104478, + "grad_norm": 0.546938955783844, + "learning_rate": 9.999446382823013e-05, + "loss": 0.0829, + "step": 1090 + }, + { + "epoch": 4.104477611940299, + "grad_norm": 0.40513405203819275, + "learning_rate": 9.999316524962345e-05, + "loss": 0.0933, + "step": 1100 + }, + { + "epoch": 4.141791044776119, + "grad_norm": 0.4198484420776367, + "learning_rate": 9.999172999161198e-05, + "loss": 0.0895, + "step": 1110 + }, + { + "epoch": 4.17910447761194, + "grad_norm": 0.3965628743171692, + "learning_rate": 9.999015805811965e-05, + "loss": 0.0917, + "step": 1120 + }, + { + "epoch": 4.2164179104477615, + "grad_norm": 0.3095884621143341, + "learning_rate": 9.998844945344405e-05, + "loss": 0.0953, + "step": 1130 + }, + { + "epoch": 4.253731343283582, + "grad_norm": 0.7962276339530945, + "learning_rate": 9.998660418225645e-05, + "loss": 0.0979, + "step": 1140 + }, + { + "epoch": 4.291044776119403, + "grad_norm": 0.42066490650177, + "learning_rate": 9.998462224960175e-05, + "loss": 0.099, + "step": 1150 + }, + { + "epoch": 4.3283582089552235, + "grad_norm": 0.3894193470478058, + "learning_rate": 9.998250366089848e-05, + "loss": 0.0887, + "step": 1160 + }, + { + "epoch": 4.365671641791045, + "grad_norm": 0.28998032212257385, + "learning_rate": 9.998024842193876e-05, + "loss": 0.0943, + "step": 1170 + }, + { + "epoch": 4.402985074626866, + "grad_norm": 0.3919823467731476, + "learning_rate": 9.997785653888835e-05, + "loss": 0.0916, + "step": 1180 + }, + { + "epoch": 4.440298507462686, + "grad_norm": 0.3708650469779968, + "learning_rate": 9.997532801828658e-05, + "loss": 0.0858, + "step": 1190 + }, + { + "epoch": 4.477611940298507, + "grad_norm": 0.2935069799423218, + "learning_rate": 9.997266286704631e-05, + "loss": 0.0992, + "step": 1200 + }, + { + "epoch": 4.514925373134329, + "grad_norm": 0.4675377607345581, + "learning_rate": 9.996986109245395e-05, + "loss": 0.0854, + "step": 1210 + }, + { + "epoch": 4.552238805970149, + "grad_norm": 0.31374865770339966, + "learning_rate": 9.996692270216947e-05, + "loss": 0.0788, + "step": 1220 + }, + { + "epoch": 4.58955223880597, + "grad_norm": 0.419249951839447, + "learning_rate": 9.996384770422629e-05, + "loss": 0.0873, + "step": 1230 + }, + { + "epoch": 4.6268656716417915, + "grad_norm": 0.26002731919288635, + "learning_rate": 9.996063610703137e-05, + "loss": 0.0845, + "step": 1240 + }, + { + "epoch": 4.664179104477612, + "grad_norm": 0.29573896527290344, + "learning_rate": 9.995728791936504e-05, + "loss": 0.091, + "step": 1250 + }, + { + "epoch": 4.701492537313433, + "grad_norm": 0.33090147376060486, + "learning_rate": 9.995380315038119e-05, + "loss": 0.0827, + "step": 1260 + }, + { + "epoch": 4.7388059701492535, + "grad_norm": 0.24417485296726227, + "learning_rate": 9.9950181809607e-05, + "loss": 0.0859, + "step": 1270 + }, + { + "epoch": 4.776119402985074, + "grad_norm": 0.48290401697158813, + "learning_rate": 9.994642390694308e-05, + "loss": 0.0889, + "step": 1280 + }, + { + "epoch": 4.813432835820896, + "grad_norm": 0.4479697048664093, + "learning_rate": 9.99425294526634e-05, + "loss": 0.097, + "step": 1290 + }, + { + "epoch": 4.850746268656716, + "grad_norm": 0.3560147285461426, + "learning_rate": 9.993849845741524e-05, + "loss": 0.0904, + "step": 1300 + }, + { + "epoch": 4.888059701492537, + "grad_norm": 0.6645416617393494, + "learning_rate": 9.99343309322192e-05, + "loss": 0.0922, + "step": 1310 + }, + { + "epoch": 4.925373134328359, + "grad_norm": 0.29696759581565857, + "learning_rate": 9.993002688846913e-05, + "loss": 0.093, + "step": 1320 + }, + { + "epoch": 4.962686567164179, + "grad_norm": 0.47146692872047424, + "learning_rate": 9.992558633793212e-05, + "loss": 0.085, + "step": 1330 + }, + { + "epoch": 5.0, + "grad_norm": 0.3430916368961334, + "learning_rate": 9.992100929274846e-05, + "loss": 0.0805, + "step": 1340 + }, + { + "epoch": 5.037313432835821, + "grad_norm": 0.3205055892467499, + "learning_rate": 9.991629576543163e-05, + "loss": 0.0766, + "step": 1350 + }, + { + "epoch": 5.074626865671641, + "grad_norm": 0.3664805293083191, + "learning_rate": 9.991144576886823e-05, + "loss": 0.0766, + "step": 1360 + }, + { + "epoch": 5.111940298507463, + "grad_norm": 0.3753412663936615, + "learning_rate": 9.990645931631796e-05, + "loss": 0.0688, + "step": 1370 + }, + { + "epoch": 5.149253731343284, + "grad_norm": 0.31633055210113525, + "learning_rate": 9.990133642141359e-05, + "loss": 0.0796, + "step": 1380 + }, + { + "epoch": 5.186567164179104, + "grad_norm": 0.3355732262134552, + "learning_rate": 9.989607709816091e-05, + "loss": 0.0716, + "step": 1390 + }, + { + "epoch": 5.223880597014926, + "grad_norm": 0.24850831925868988, + "learning_rate": 9.989068136093873e-05, + "loss": 0.0778, + "step": 1400 + }, + { + "epoch": 5.2611940298507465, + "grad_norm": 0.29537102580070496, + "learning_rate": 9.988514922449879e-05, + "loss": 0.0759, + "step": 1410 + }, + { + "epoch": 5.298507462686567, + "grad_norm": 0.3430945873260498, + "learning_rate": 9.987948070396571e-05, + "loss": 0.0774, + "step": 1420 + }, + { + "epoch": 5.335820895522388, + "grad_norm": 0.5220637917518616, + "learning_rate": 9.987367581483705e-05, + "loss": 0.0836, + "step": 1430 + }, + { + "epoch": 5.373134328358209, + "grad_norm": 0.28184008598327637, + "learning_rate": 9.986773457298311e-05, + "loss": 0.0752, + "step": 1440 + }, + { + "epoch": 5.41044776119403, + "grad_norm": 0.36261311173439026, + "learning_rate": 9.986165699464705e-05, + "loss": 0.075, + "step": 1450 + }, + { + "epoch": 5.447761194029851, + "grad_norm": 0.5107380151748657, + "learning_rate": 9.985544309644475e-05, + "loss": 0.0814, + "step": 1460 + }, + { + "epoch": 5.485074626865671, + "grad_norm": 0.2446671426296234, + "learning_rate": 9.984909289536473e-05, + "loss": 0.0704, + "step": 1470 + }, + { + "epoch": 5.522388059701493, + "grad_norm": 0.30449381470680237, + "learning_rate": 9.984260640876821e-05, + "loss": 0.0794, + "step": 1480 + }, + { + "epoch": 5.559701492537314, + "grad_norm": 0.25645050406455994, + "learning_rate": 9.983598365438902e-05, + "loss": 0.0709, + "step": 1490 + }, + { + "epoch": 5.597014925373134, + "grad_norm": 0.23825006186962128, + "learning_rate": 9.98292246503335e-05, + "loss": 0.0828, + "step": 1500 + }, + { + "epoch": 5.634328358208955, + "grad_norm": 0.3259269893169403, + "learning_rate": 9.98223294150805e-05, + "loss": 0.0824, + "step": 1510 + }, + { + "epoch": 5.6716417910447765, + "grad_norm": 0.24058914184570312, + "learning_rate": 9.981529796748134e-05, + "loss": 0.073, + "step": 1520 + }, + { + "epoch": 5.708955223880597, + "grad_norm": 0.34457242488861084, + "learning_rate": 9.980813032675974e-05, + "loss": 0.0845, + "step": 1530 + }, + { + "epoch": 5.746268656716418, + "grad_norm": 0.32940393686294556, + "learning_rate": 9.980082651251175e-05, + "loss": 0.0832, + "step": 1540 + }, + { + "epoch": 5.7835820895522385, + "grad_norm": 0.5683007836341858, + "learning_rate": 9.979338654470569e-05, + "loss": 0.0836, + "step": 1550 + }, + { + "epoch": 5.82089552238806, + "grad_norm": 0.31041061878204346, + "learning_rate": 9.97858104436822e-05, + "loss": 0.07, + "step": 1560 + }, + { + "epoch": 5.858208955223881, + "grad_norm": 0.37858131527900696, + "learning_rate": 9.977809823015401e-05, + "loss": 0.0738, + "step": 1570 + }, + { + "epoch": 5.895522388059701, + "grad_norm": 0.2743091583251953, + "learning_rate": 9.977024992520602e-05, + "loss": 0.0761, + "step": 1580 + }, + { + "epoch": 5.932835820895522, + "grad_norm": 0.29117098450660706, + "learning_rate": 9.976226555029522e-05, + "loss": 0.0777, + "step": 1590 + }, + { + "epoch": 5.970149253731344, + "grad_norm": 0.31398633122444153, + "learning_rate": 9.975414512725057e-05, + "loss": 0.0664, + "step": 1600 + }, + { + "epoch": 6.007462686567164, + "grad_norm": 0.2684272527694702, + "learning_rate": 9.974588867827301e-05, + "loss": 0.0686, + "step": 1610 + }, + { + "epoch": 6.044776119402985, + "grad_norm": 0.3945397436618805, + "learning_rate": 9.973749622593534e-05, + "loss": 0.0614, + "step": 1620 + }, + { + "epoch": 6.082089552238806, + "grad_norm": 0.2747954726219177, + "learning_rate": 9.972896779318219e-05, + "loss": 0.0681, + "step": 1630 + }, + { + "epoch": 6.119402985074627, + "grad_norm": 0.43257200717926025, + "learning_rate": 9.972030340333001e-05, + "loss": 0.0725, + "step": 1640 + }, + { + "epoch": 6.156716417910448, + "grad_norm": 0.3559250831604004, + "learning_rate": 9.97115030800669e-05, + "loss": 0.0804, + "step": 1650 + }, + { + "epoch": 6.1940298507462686, + "grad_norm": 0.3079264760017395, + "learning_rate": 9.970256684745258e-05, + "loss": 0.0649, + "step": 1660 + }, + { + "epoch": 6.231343283582089, + "grad_norm": 0.32298946380615234, + "learning_rate": 9.969349472991838e-05, + "loss": 0.0668, + "step": 1670 + }, + { + "epoch": 6.268656716417911, + "grad_norm": 0.2826225459575653, + "learning_rate": 9.968428675226714e-05, + "loss": 0.0734, + "step": 1680 + }, + { + "epoch": 6.3059701492537314, + "grad_norm": 0.39002349972724915, + "learning_rate": 9.967494293967312e-05, + "loss": 0.0728, + "step": 1690 + }, + { + "epoch": 6.343283582089552, + "grad_norm": 0.403890997171402, + "learning_rate": 9.966546331768191e-05, + "loss": 0.067, + "step": 1700 + }, + { + "epoch": 6.380597014925373, + "grad_norm": 0.3755359351634979, + "learning_rate": 9.965584791221048e-05, + "loss": 0.0755, + "step": 1710 + }, + { + "epoch": 6.417910447761194, + "grad_norm": 0.26346635818481445, + "learning_rate": 9.964609674954696e-05, + "loss": 0.0728, + "step": 1720 + }, + { + "epoch": 6.455223880597015, + "grad_norm": 0.45292145013809204, + "learning_rate": 9.963620985635065e-05, + "loss": 0.0731, + "step": 1730 + }, + { + "epoch": 6.492537313432836, + "grad_norm": 0.3568434715270996, + "learning_rate": 9.962618725965196e-05, + "loss": 0.0761, + "step": 1740 + }, + { + "epoch": 6.529850746268656, + "grad_norm": 0.2551257014274597, + "learning_rate": 9.961602898685226e-05, + "loss": 0.0694, + "step": 1750 + }, + { + "epoch": 6.567164179104478, + "grad_norm": 0.6106354594230652, + "learning_rate": 9.96057350657239e-05, + "loss": 0.0827, + "step": 1760 + }, + { + "epoch": 6.604477611940299, + "grad_norm": 0.3226093053817749, + "learning_rate": 9.959530552441005e-05, + "loss": 0.0716, + "step": 1770 + }, + { + "epoch": 6.641791044776119, + "grad_norm": 0.4297254979610443, + "learning_rate": 9.95847403914247e-05, + "loss": 0.0748, + "step": 1780 + }, + { + "epoch": 6.67910447761194, + "grad_norm": 0.26469680666923523, + "learning_rate": 9.95740396956525e-05, + "loss": 0.074, + "step": 1790 + }, + { + "epoch": 6.7164179104477615, + "grad_norm": 0.22717897593975067, + "learning_rate": 9.956320346634876e-05, + "loss": 0.0739, + "step": 1800 + }, + { + "epoch": 6.753731343283582, + "grad_norm": 0.4513498544692993, + "learning_rate": 9.955223173313931e-05, + "loss": 0.0664, + "step": 1810 + }, + { + "epoch": 6.791044776119403, + "grad_norm": 0.31683439016342163, + "learning_rate": 9.954112452602045e-05, + "loss": 0.069, + "step": 1820 + }, + { + "epoch": 6.8283582089552235, + "grad_norm": 0.3350532650947571, + "learning_rate": 9.952988187535886e-05, + "loss": 0.0699, + "step": 1830 + }, + { + "epoch": 6.865671641791045, + "grad_norm": 0.29829463362693787, + "learning_rate": 9.95185038118915e-05, + "loss": 0.0663, + "step": 1840 + }, + { + "epoch": 6.902985074626866, + "grad_norm": 0.31650781631469727, + "learning_rate": 9.950699036672559e-05, + "loss": 0.0668, + "step": 1850 + }, + { + "epoch": 6.940298507462686, + "grad_norm": 0.360944926738739, + "learning_rate": 9.949534157133844e-05, + "loss": 0.0696, + "step": 1860 + }, + { + "epoch": 6.977611940298507, + "grad_norm": 0.31337013840675354, + "learning_rate": 9.948355745757741e-05, + "loss": 0.073, + "step": 1870 + }, + { + "epoch": 7.014925373134329, + "grad_norm": 0.4675919711589813, + "learning_rate": 9.94716380576598e-05, + "loss": 0.0688, + "step": 1880 + }, + { + "epoch": 7.052238805970149, + "grad_norm": 0.3031919002532959, + "learning_rate": 9.945958340417283e-05, + "loss": 0.0596, + "step": 1890 + }, + { + "epoch": 7.08955223880597, + "grad_norm": 0.24858474731445312, + "learning_rate": 9.944739353007344e-05, + "loss": 0.0717, + "step": 1900 + }, + { + "epoch": 7.126865671641791, + "grad_norm": 0.20959483087062836, + "learning_rate": 9.943506846868826e-05, + "loss": 0.0694, + "step": 1910 + }, + { + "epoch": 7.164179104477612, + "grad_norm": 0.35621434450149536, + "learning_rate": 9.942260825371358e-05, + "loss": 0.063, + "step": 1920 + }, + { + "epoch": 7.201492537313433, + "grad_norm": 0.3462587594985962, + "learning_rate": 9.941001291921512e-05, + "loss": 0.068, + "step": 1930 + }, + { + "epoch": 7.2388059701492535, + "grad_norm": 0.38649681210517883, + "learning_rate": 9.939728249962807e-05, + "loss": 0.0638, + "step": 1940 + }, + { + "epoch": 7.276119402985074, + "grad_norm": 0.29564595222473145, + "learning_rate": 9.938441702975689e-05, + "loss": 0.0626, + "step": 1950 + }, + { + "epoch": 7.313432835820896, + "grad_norm": 0.339857816696167, + "learning_rate": 9.937141654477528e-05, + "loss": 0.0535, + "step": 1960 + }, + { + "epoch": 7.350746268656716, + "grad_norm": 0.2591215670108795, + "learning_rate": 9.93582810802261e-05, + "loss": 0.0645, + "step": 1970 + }, + { + "epoch": 7.388059701492537, + "grad_norm": 0.30237796902656555, + "learning_rate": 9.934501067202117e-05, + "loss": 0.0675, + "step": 1980 + }, + { + "epoch": 7.425373134328359, + "grad_norm": 0.28394174575805664, + "learning_rate": 9.93316053564413e-05, + "loss": 0.0643, + "step": 1990 + }, + { + "epoch": 7.462686567164179, + "grad_norm": 0.3124663233757019, + "learning_rate": 9.931806517013612e-05, + "loss": 0.059, + "step": 2000 + }, + { + "epoch": 7.5, + "grad_norm": 0.36073037981987, + "learning_rate": 9.930439015012396e-05, + "loss": 0.0606, + "step": 2010 + }, + { + "epoch": 7.537313432835821, + "grad_norm": 0.4091481864452362, + "learning_rate": 9.929058033379181e-05, + "loss": 0.0603, + "step": 2020 + }, + { + "epoch": 7.574626865671641, + "grad_norm": 0.44718074798583984, + "learning_rate": 9.927663575889521e-05, + "loss": 0.0741, + "step": 2030 + }, + { + "epoch": 7.611940298507463, + "grad_norm": 0.3819601833820343, + "learning_rate": 9.926255646355804e-05, + "loss": 0.0707, + "step": 2040 + }, + { + "epoch": 7.649253731343284, + "grad_norm": 0.23336420953273773, + "learning_rate": 9.92483424862726e-05, + "loss": 0.0676, + "step": 2050 + }, + { + "epoch": 7.686567164179104, + "grad_norm": 0.24415315687656403, + "learning_rate": 9.923399386589933e-05, + "loss": 0.0594, + "step": 2060 + }, + { + "epoch": 7.723880597014926, + "grad_norm": 0.3735473155975342, + "learning_rate": 9.921951064166684e-05, + "loss": 0.062, + "step": 2070 + }, + { + "epoch": 7.7611940298507465, + "grad_norm": 0.31629472970962524, + "learning_rate": 9.92048928531717e-05, + "loss": 0.0606, + "step": 2080 + }, + { + "epoch": 7.798507462686567, + "grad_norm": 0.37902557849884033, + "learning_rate": 9.919014054037836e-05, + "loss": 0.0584, + "step": 2090 + }, + { + "epoch": 7.835820895522388, + "grad_norm": 0.3486720323562622, + "learning_rate": 9.917525374361912e-05, + "loss": 0.056, + "step": 2100 + }, + { + "epoch": 7.8731343283582085, + "grad_norm": 0.3731362521648407, + "learning_rate": 9.91602325035939e-05, + "loss": 0.0601, + "step": 2110 + }, + { + "epoch": 7.91044776119403, + "grad_norm": 0.3560399115085602, + "learning_rate": 9.914507686137019e-05, + "loss": 0.06, + "step": 2120 + }, + { + "epoch": 7.947761194029851, + "grad_norm": 0.30075564980506897, + "learning_rate": 9.912978685838294e-05, + "loss": 0.0657, + "step": 2130 + }, + { + "epoch": 7.985074626865671, + "grad_norm": 0.2984028458595276, + "learning_rate": 9.911436253643445e-05, + "loss": 0.0587, + "step": 2140 + }, + { + "epoch": 8.022388059701493, + "grad_norm": 0.1980169117450714, + "learning_rate": 9.90988039376942e-05, + "loss": 0.0718, + "step": 2150 + }, + { + "epoch": 8.059701492537313, + "grad_norm": 0.31339579820632935, + "learning_rate": 9.90831111046988e-05, + "loss": 0.0557, + "step": 2160 + }, + { + "epoch": 8.097014925373134, + "grad_norm": 0.1968696266412735, + "learning_rate": 9.90672840803519e-05, + "loss": 0.0571, + "step": 2170 + }, + { + "epoch": 8.134328358208956, + "grad_norm": 0.23931682109832764, + "learning_rate": 9.905132290792394e-05, + "loss": 0.0566, + "step": 2180 + }, + { + "epoch": 8.171641791044776, + "grad_norm": 0.21741189062595367, + "learning_rate": 9.903522763105218e-05, + "loss": 0.0575, + "step": 2190 + }, + { + "epoch": 8.208955223880597, + "grad_norm": 0.22874368727207184, + "learning_rate": 9.901899829374047e-05, + "loss": 0.0565, + "step": 2200 + }, + { + "epoch": 8.246268656716419, + "grad_norm": 0.3441888093948364, + "learning_rate": 9.900263494035921e-05, + "loss": 0.0565, + "step": 2210 + }, + { + "epoch": 8.283582089552239, + "grad_norm": 0.2539830803871155, + "learning_rate": 9.89861376156452e-05, + "loss": 0.0538, + "step": 2220 + }, + { + "epoch": 8.32089552238806, + "grad_norm": 0.2235102653503418, + "learning_rate": 9.896950636470147e-05, + "loss": 0.0609, + "step": 2230 + }, + { + "epoch": 8.35820895522388, + "grad_norm": 0.1941322684288025, + "learning_rate": 9.895274123299723e-05, + "loss": 0.0562, + "step": 2240 + }, + { + "epoch": 8.395522388059701, + "grad_norm": 0.2691369950771332, + "learning_rate": 9.893584226636772e-05, + "loss": 0.0608, + "step": 2250 + }, + { + "epoch": 8.432835820895523, + "grad_norm": 0.24730461835861206, + "learning_rate": 9.891880951101407e-05, + "loss": 0.0582, + "step": 2260 + }, + { + "epoch": 8.470149253731343, + "grad_norm": 0.34785839915275574, + "learning_rate": 9.890164301350318e-05, + "loss": 0.0506, + "step": 2270 + }, + { + "epoch": 8.507462686567164, + "grad_norm": 0.3625825345516205, + "learning_rate": 9.888434282076758e-05, + "loss": 0.0614, + "step": 2280 + }, + { + "epoch": 8.544776119402986, + "grad_norm": 0.25210148096084595, + "learning_rate": 9.886690898010535e-05, + "loss": 0.0611, + "step": 2290 + }, + { + "epoch": 8.582089552238806, + "grad_norm": 0.27312466502189636, + "learning_rate": 9.884934153917997e-05, + "loss": 0.0537, + "step": 2300 + }, + { + "epoch": 8.619402985074627, + "grad_norm": 0.314647912979126, + "learning_rate": 9.883164054602012e-05, + "loss": 0.0602, + "step": 2310 + }, + { + "epoch": 8.656716417910447, + "grad_norm": 0.21531912684440613, + "learning_rate": 9.881380604901964e-05, + "loss": 0.0552, + "step": 2320 + }, + { + "epoch": 8.694029850746269, + "grad_norm": 0.23920664191246033, + "learning_rate": 9.879583809693738e-05, + "loss": 0.0613, + "step": 2330 + }, + { + "epoch": 8.73134328358209, + "grad_norm": 0.21864956617355347, + "learning_rate": 9.877773673889701e-05, + "loss": 0.0649, + "step": 2340 + }, + { + "epoch": 8.76865671641791, + "grad_norm": 0.27523377537727356, + "learning_rate": 9.8759502024387e-05, + "loss": 0.0606, + "step": 2350 + }, + { + "epoch": 8.805970149253731, + "grad_norm": 0.24805469810962677, + "learning_rate": 9.87411340032603e-05, + "loss": 0.0549, + "step": 2360 + }, + { + "epoch": 8.843283582089553, + "grad_norm": 0.23070092499256134, + "learning_rate": 9.872263272573443e-05, + "loss": 0.0562, + "step": 2370 + }, + { + "epoch": 8.880597014925373, + "grad_norm": 0.20833946764469147, + "learning_rate": 9.870399824239117e-05, + "loss": 0.05, + "step": 2380 + }, + { + "epoch": 8.917910447761194, + "grad_norm": 0.34507372975349426, + "learning_rate": 9.868523060417646e-05, + "loss": 0.0613, + "step": 2390 + }, + { + "epoch": 8.955223880597014, + "grad_norm": 0.32865110039711, + "learning_rate": 9.86663298624003e-05, + "loss": 0.0621, + "step": 2400 + }, + { + "epoch": 8.992537313432836, + "grad_norm": 0.21305270493030548, + "learning_rate": 9.864729606873663e-05, + "loss": 0.0572, + "step": 2410 + }, + { + "epoch": 9.029850746268657, + "grad_norm": 0.28193730115890503, + "learning_rate": 9.862812927522309e-05, + "loss": 0.0555, + "step": 2420 + }, + { + "epoch": 9.067164179104477, + "grad_norm": 0.3953789472579956, + "learning_rate": 9.860882953426099e-05, + "loss": 0.0536, + "step": 2430 + }, + { + "epoch": 9.104477611940299, + "grad_norm": 0.23013322055339813, + "learning_rate": 9.858939689861506e-05, + "loss": 0.0572, + "step": 2440 + }, + { + "epoch": 9.14179104477612, + "grad_norm": 0.2906680107116699, + "learning_rate": 9.856983142141339e-05, + "loss": 0.0592, + "step": 2450 + }, + { + "epoch": 9.17910447761194, + "grad_norm": 0.23490828275680542, + "learning_rate": 9.855013315614725e-05, + "loss": 0.0583, + "step": 2460 + }, + { + "epoch": 9.216417910447761, + "grad_norm": 0.22825880348682404, + "learning_rate": 9.853030215667093e-05, + "loss": 0.059, + "step": 2470 + }, + { + "epoch": 9.253731343283581, + "grad_norm": 0.25871285796165466, + "learning_rate": 9.851033847720166e-05, + "loss": 0.0555, + "step": 2480 + }, + { + "epoch": 9.291044776119403, + "grad_norm": 0.27220776677131653, + "learning_rate": 9.849024217231935e-05, + "loss": 0.0542, + "step": 2490 + }, + { + "epoch": 9.328358208955224, + "grad_norm": 0.26534005999565125, + "learning_rate": 9.847001329696653e-05, + "loss": 0.0526, + "step": 2500 + }, + { + "epoch": 9.365671641791044, + "grad_norm": 0.33486032485961914, + "learning_rate": 9.844965190644817e-05, + "loss": 0.0563, + "step": 2510 + }, + { + "epoch": 9.402985074626866, + "grad_norm": 0.2949483394622803, + "learning_rate": 9.842915805643155e-05, + "loss": 0.0556, + "step": 2520 + }, + { + "epoch": 9.440298507462687, + "grad_norm": 0.24123981595039368, + "learning_rate": 9.840853180294608e-05, + "loss": 0.05, + "step": 2530 + }, + { + "epoch": 9.477611940298507, + "grad_norm": 0.22536049783229828, + "learning_rate": 9.838777320238312e-05, + "loss": 0.0522, + "step": 2540 + }, + { + "epoch": 9.514925373134329, + "grad_norm": 0.23206663131713867, + "learning_rate": 9.836688231149592e-05, + "loss": 0.0591, + "step": 2550 + }, + { + "epoch": 9.552238805970148, + "grad_norm": 0.28573134541511536, + "learning_rate": 9.834585918739936e-05, + "loss": 0.0568, + "step": 2560 + }, + { + "epoch": 9.58955223880597, + "grad_norm": 0.2628820538520813, + "learning_rate": 9.832470388756987e-05, + "loss": 0.0571, + "step": 2570 + }, + { + "epoch": 9.626865671641792, + "grad_norm": 0.2880440652370453, + "learning_rate": 9.830341646984521e-05, + "loss": 0.0559, + "step": 2580 + }, + { + "epoch": 9.664179104477611, + "grad_norm": 0.1786259263753891, + "learning_rate": 9.82819969924244e-05, + "loss": 0.058, + "step": 2590 + }, + { + "epoch": 9.701492537313433, + "grad_norm": 0.3501608073711395, + "learning_rate": 9.826044551386744e-05, + "loss": 0.0523, + "step": 2600 + }, + { + "epoch": 9.738805970149254, + "grad_norm": 0.24757252633571625, + "learning_rate": 9.823876209309527e-05, + "loss": 0.0587, + "step": 2610 + }, + { + "epoch": 9.776119402985074, + "grad_norm": 0.2556290626525879, + "learning_rate": 9.821694678938953e-05, + "loss": 0.0555, + "step": 2620 + }, + { + "epoch": 9.813432835820896, + "grad_norm": 0.2561217248439789, + "learning_rate": 9.819499966239243e-05, + "loss": 0.052, + "step": 2630 + }, + { + "epoch": 9.850746268656717, + "grad_norm": 0.2776634097099304, + "learning_rate": 9.817292077210659e-05, + "loss": 0.0498, + "step": 2640 + }, + { + "epoch": 9.888059701492537, + "grad_norm": 0.20668549835681915, + "learning_rate": 9.815071017889482e-05, + "loss": 0.0517, + "step": 2650 + }, + { + "epoch": 9.925373134328359, + "grad_norm": 0.3100263178348541, + "learning_rate": 9.812836794348004e-05, + "loss": 0.0633, + "step": 2660 + }, + { + "epoch": 9.962686567164178, + "grad_norm": 0.2780782878398895, + "learning_rate": 9.81058941269451e-05, + "loss": 0.0581, + "step": 2670 + }, + { + "epoch": 10.0, + "grad_norm": 0.28903728723526, + "learning_rate": 9.808328879073251e-05, + "loss": 0.0538, + "step": 2680 + }, + { + "epoch": 10.037313432835822, + "grad_norm": 0.22727562487125397, + "learning_rate": 9.806055199664446e-05, + "loss": 0.0491, + "step": 2690 + }, + { + "epoch": 10.074626865671641, + "grad_norm": 0.267918199300766, + "learning_rate": 9.803768380684242e-05, + "loss": 0.0562, + "step": 2700 + }, + { + "epoch": 10.111940298507463, + "grad_norm": 0.2988606095314026, + "learning_rate": 9.801468428384716e-05, + "loss": 0.0566, + "step": 2710 + }, + { + "epoch": 10.149253731343283, + "grad_norm": 0.2710281312465668, + "learning_rate": 9.799155349053851e-05, + "loss": 0.0541, + "step": 2720 + }, + { + "epoch": 10.186567164179104, + "grad_norm": 0.15320520102977753, + "learning_rate": 9.796829149015517e-05, + "loss": 0.0548, + "step": 2730 + }, + { + "epoch": 10.223880597014926, + "grad_norm": 0.2653089463710785, + "learning_rate": 9.794489834629455e-05, + "loss": 0.0599, + "step": 2740 + }, + { + "epoch": 10.261194029850746, + "grad_norm": 0.19223959743976593, + "learning_rate": 9.792137412291265e-05, + "loss": 0.0494, + "step": 2750 + }, + { + "epoch": 10.298507462686567, + "grad_norm": 0.20455987751483917, + "learning_rate": 9.789771888432375e-05, + "loss": 0.0538, + "step": 2760 + }, + { + "epoch": 10.335820895522389, + "grad_norm": 0.24908749759197235, + "learning_rate": 9.787393269520039e-05, + "loss": 0.0481, + "step": 2770 + }, + { + "epoch": 10.373134328358208, + "grad_norm": 0.3131813406944275, + "learning_rate": 9.785001562057309e-05, + "loss": 0.0526, + "step": 2780 + }, + { + "epoch": 10.41044776119403, + "grad_norm": 0.24828971922397614, + "learning_rate": 9.782596772583026e-05, + "loss": 0.0489, + "step": 2790 + }, + { + "epoch": 10.447761194029852, + "grad_norm": 0.21727119386196136, + "learning_rate": 9.780178907671789e-05, + "loss": 0.0532, + "step": 2800 + }, + { + "epoch": 10.485074626865671, + "grad_norm": 0.20279547572135925, + "learning_rate": 9.777747973933948e-05, + "loss": 0.0565, + "step": 2810 + }, + { + "epoch": 10.522388059701493, + "grad_norm": 0.17726702988147736, + "learning_rate": 9.775303978015585e-05, + "loss": 0.0437, + "step": 2820 + }, + { + "epoch": 10.559701492537313, + "grad_norm": 0.18961119651794434, + "learning_rate": 9.772846926598491e-05, + "loss": 0.0584, + "step": 2830 + }, + { + "epoch": 10.597014925373134, + "grad_norm": 0.2498980015516281, + "learning_rate": 9.77037682640015e-05, + "loss": 0.0496, + "step": 2840 + }, + { + "epoch": 10.634328358208956, + "grad_norm": 0.16978798806667328, + "learning_rate": 9.767893684173721e-05, + "loss": 0.0469, + "step": 2850 + }, + { + "epoch": 10.671641791044776, + "grad_norm": 0.16128584742546082, + "learning_rate": 9.765397506708023e-05, + "loss": 0.0533, + "step": 2860 + }, + { + "epoch": 10.708955223880597, + "grad_norm": 0.20463155210018158, + "learning_rate": 9.762888300827507e-05, + "loss": 0.0464, + "step": 2870 + }, + { + "epoch": 10.746268656716419, + "grad_norm": 0.30601629614830017, + "learning_rate": 9.760366073392246e-05, + "loss": 0.0489, + "step": 2880 + }, + { + "epoch": 10.783582089552239, + "grad_norm": 0.2730671763420105, + "learning_rate": 9.757830831297914e-05, + "loss": 0.0495, + "step": 2890 + }, + { + "epoch": 10.82089552238806, + "grad_norm": 0.251432865858078, + "learning_rate": 9.755282581475769e-05, + "loss": 0.0549, + "step": 2900 + }, + { + "epoch": 10.85820895522388, + "grad_norm": 0.26670166850090027, + "learning_rate": 9.752721330892624e-05, + "loss": 0.061, + "step": 2910 + }, + { + "epoch": 10.895522388059701, + "grad_norm": 0.2965967655181885, + "learning_rate": 9.750147086550844e-05, + "loss": 0.0473, + "step": 2920 + }, + { + "epoch": 10.932835820895523, + "grad_norm": 0.683840274810791, + "learning_rate": 9.747559855488313e-05, + "loss": 0.0509, + "step": 2930 + }, + { + "epoch": 10.970149253731343, + "grad_norm": 0.25740495324134827, + "learning_rate": 9.744959644778422e-05, + "loss": 0.0515, + "step": 2940 + }, + { + "epoch": 11.007462686567164, + "grad_norm": 0.2880542278289795, + "learning_rate": 9.742346461530048e-05, + "loss": 0.0482, + "step": 2950 + }, + { + "epoch": 11.044776119402986, + "grad_norm": 0.45032551884651184, + "learning_rate": 9.739720312887535e-05, + "loss": 0.0557, + "step": 2960 + }, + { + "epoch": 11.082089552238806, + "grad_norm": 0.2829900085926056, + "learning_rate": 9.73708120603067e-05, + "loss": 0.052, + "step": 2970 + }, + { + "epoch": 11.119402985074627, + "grad_norm": 0.309597373008728, + "learning_rate": 9.734429148174675e-05, + "loss": 0.0541, + "step": 2980 + }, + { + "epoch": 11.156716417910447, + "grad_norm": 0.2433389127254486, + "learning_rate": 9.731764146570173e-05, + "loss": 0.0482, + "step": 2990 + }, + { + "epoch": 11.194029850746269, + "grad_norm": 0.24458132684230804, + "learning_rate": 9.729086208503174e-05, + "loss": 0.0505, + "step": 3000 + }, + { + "epoch": 11.23134328358209, + "grad_norm": 0.2305087298154831, + "learning_rate": 9.726395341295062e-05, + "loss": 0.0504, + "step": 3010 + }, + { + "epoch": 11.26865671641791, + "grad_norm": 0.18110457062721252, + "learning_rate": 9.723691552302562e-05, + "loss": 0.0575, + "step": 3020 + }, + { + "epoch": 11.305970149253731, + "grad_norm": 0.20407621562480927, + "learning_rate": 9.720974848917735e-05, + "loss": 0.0494, + "step": 3030 + }, + { + "epoch": 11.343283582089553, + "grad_norm": 0.25924697518348694, + "learning_rate": 9.718245238567939e-05, + "loss": 0.0472, + "step": 3040 + }, + { + "epoch": 11.380597014925373, + "grad_norm": 0.23041822016239166, + "learning_rate": 9.715502728715826e-05, + "loss": 0.0481, + "step": 3050 + }, + { + "epoch": 11.417910447761194, + "grad_norm": 0.25381171703338623, + "learning_rate": 9.712747326859315e-05, + "loss": 0.0543, + "step": 3060 + }, + { + "epoch": 11.455223880597014, + "grad_norm": 0.18027640879154205, + "learning_rate": 9.709979040531569e-05, + "loss": 0.055, + "step": 3070 + }, + { + "epoch": 11.492537313432836, + "grad_norm": 0.2954868674278259, + "learning_rate": 9.707197877300974e-05, + "loss": 0.0473, + "step": 3080 + }, + { + "epoch": 11.529850746268657, + "grad_norm": 0.25323861837387085, + "learning_rate": 9.704403844771128e-05, + "loss": 0.0509, + "step": 3090 + }, + { + "epoch": 11.567164179104477, + "grad_norm": 0.36910176277160645, + "learning_rate": 9.701596950580806e-05, + "loss": 0.0504, + "step": 3100 + }, + { + "epoch": 11.604477611940299, + "grad_norm": 0.34199246764183044, + "learning_rate": 9.698777202403953e-05, + "loss": 0.0526, + "step": 3110 + }, + { + "epoch": 11.64179104477612, + "grad_norm": 0.2146557718515396, + "learning_rate": 9.695944607949649e-05, + "loss": 0.0579, + "step": 3120 + }, + { + "epoch": 11.67910447761194, + "grad_norm": 0.20559175312519073, + "learning_rate": 9.693099174962103e-05, + "loss": 0.0514, + "step": 3130 + }, + { + "epoch": 11.716417910447761, + "grad_norm": 0.2689419090747833, + "learning_rate": 9.690240911220618e-05, + "loss": 0.0534, + "step": 3140 + }, + { + "epoch": 11.753731343283581, + "grad_norm": 0.34870603680610657, + "learning_rate": 9.687369824539577e-05, + "loss": 0.0485, + "step": 3150 + }, + { + "epoch": 11.791044776119403, + "grad_norm": 0.15433363616466522, + "learning_rate": 9.684485922768422e-05, + "loss": 0.0418, + "step": 3160 + }, + { + "epoch": 11.828358208955224, + "grad_norm": 0.26874423027038574, + "learning_rate": 9.681589213791633e-05, + "loss": 0.0537, + "step": 3170 + }, + { + "epoch": 11.865671641791044, + "grad_norm": 0.3361654281616211, + "learning_rate": 9.6786797055287e-05, + "loss": 0.0474, + "step": 3180 + }, + { + "epoch": 11.902985074626866, + "grad_norm": 0.17938771843910217, + "learning_rate": 9.675757405934103e-05, + "loss": 0.0443, + "step": 3190 + }, + { + "epoch": 11.940298507462687, + "grad_norm": 0.31368622183799744, + "learning_rate": 9.672822322997305e-05, + "loss": 0.0594, + "step": 3200 + }, + { + "epoch": 11.977611940298507, + "grad_norm": 0.16268151998519897, + "learning_rate": 9.669874464742705e-05, + "loss": 0.0487, + "step": 3210 + }, + { + "epoch": 12.014925373134329, + "grad_norm": 0.23879969120025635, + "learning_rate": 9.66691383922964e-05, + "loss": 0.0484, + "step": 3220 + }, + { + "epoch": 12.052238805970148, + "grad_norm": 0.2321789413690567, + "learning_rate": 9.663940454552342e-05, + "loss": 0.051, + "step": 3230 + }, + { + "epoch": 12.08955223880597, + "grad_norm": 0.22873088717460632, + "learning_rate": 9.660954318839933e-05, + "loss": 0.0406, + "step": 3240 + }, + { + "epoch": 12.126865671641792, + "grad_norm": 0.3767557740211487, + "learning_rate": 9.657955440256395e-05, + "loss": 0.0432, + "step": 3250 + }, + { + "epoch": 12.164179104477611, + "grad_norm": 0.21569453179836273, + "learning_rate": 9.654943827000548e-05, + "loss": 0.0528, + "step": 3260 + }, + { + "epoch": 12.201492537313433, + "grad_norm": 0.23698291182518005, + "learning_rate": 9.651919487306025e-05, + "loss": 0.0457, + "step": 3270 + }, + { + "epoch": 12.238805970149254, + "grad_norm": 0.21086478233337402, + "learning_rate": 9.648882429441257e-05, + "loss": 0.0508, + "step": 3280 + }, + { + "epoch": 12.276119402985074, + "grad_norm": 0.19763463735580444, + "learning_rate": 9.645832661709444e-05, + "loss": 0.0497, + "step": 3290 + }, + { + "epoch": 12.313432835820896, + "grad_norm": 0.18413852155208588, + "learning_rate": 9.642770192448536e-05, + "loss": 0.0441, + "step": 3300 + }, + { + "epoch": 12.350746268656717, + "grad_norm": 0.13946911692619324, + "learning_rate": 9.639695030031204e-05, + "loss": 0.0453, + "step": 3310 + }, + { + "epoch": 12.388059701492537, + "grad_norm": 0.21613670885562897, + "learning_rate": 9.636607182864827e-05, + "loss": 0.0511, + "step": 3320 + }, + { + "epoch": 12.425373134328359, + "grad_norm": 0.24953646957874298, + "learning_rate": 9.63350665939146e-05, + "loss": 0.0451, + "step": 3330 + }, + { + "epoch": 12.462686567164178, + "grad_norm": 0.2993795871734619, + "learning_rate": 9.630393468087818e-05, + "loss": 0.0469, + "step": 3340 + }, + { + "epoch": 12.5, + "grad_norm": 0.2261819839477539, + "learning_rate": 9.627267617465243e-05, + "loss": 0.0484, + "step": 3350 + }, + { + "epoch": 12.537313432835822, + "grad_norm": 0.23026186227798462, + "learning_rate": 9.624129116069694e-05, + "loss": 0.0452, + "step": 3360 + }, + { + "epoch": 12.574626865671641, + "grad_norm": 0.27859947085380554, + "learning_rate": 9.620977972481716e-05, + "loss": 0.0593, + "step": 3370 + }, + { + "epoch": 12.611940298507463, + "grad_norm": 0.23060785233974457, + "learning_rate": 9.617814195316411e-05, + "loss": 0.05, + "step": 3380 + }, + { + "epoch": 12.649253731343283, + "grad_norm": 0.20185025036334991, + "learning_rate": 9.614637793223425e-05, + "loss": 0.0573, + "step": 3390 + }, + { + "epoch": 12.686567164179104, + "grad_norm": 0.3584498167037964, + "learning_rate": 9.611448774886924e-05, + "loss": 0.052, + "step": 3400 + }, + { + "epoch": 12.723880597014926, + "grad_norm": 0.19336827099323273, + "learning_rate": 9.60824714902556e-05, + "loss": 0.0535, + "step": 3410 + }, + { + "epoch": 12.761194029850746, + "grad_norm": 0.22223635017871857, + "learning_rate": 9.605032924392457e-05, + "loss": 0.05, + "step": 3420 + }, + { + "epoch": 12.798507462686567, + "grad_norm": 0.17108851671218872, + "learning_rate": 9.601806109775179e-05, + "loss": 0.0475, + "step": 3430 + }, + { + "epoch": 12.835820895522389, + "grad_norm": 0.3861902952194214, + "learning_rate": 9.598566713995718e-05, + "loss": 0.0439, + "step": 3440 + }, + { + "epoch": 12.873134328358208, + "grad_norm": 0.18927253782749176, + "learning_rate": 9.595314745910456e-05, + "loss": 0.052, + "step": 3450 + }, + { + "epoch": 12.91044776119403, + "grad_norm": 0.21963383257389069, + "learning_rate": 9.59205021441015e-05, + "loss": 0.0504, + "step": 3460 + }, + { + "epoch": 12.947761194029852, + "grad_norm": 0.18016670644283295, + "learning_rate": 9.588773128419906e-05, + "loss": 0.0467, + "step": 3470 + }, + { + "epoch": 12.985074626865671, + "grad_norm": 0.1776365041732788, + "learning_rate": 9.58548349689915e-05, + "loss": 0.0414, + "step": 3480 + }, + { + "epoch": 13.022388059701493, + "grad_norm": 0.2616482973098755, + "learning_rate": 9.582181328841611e-05, + "loss": 0.0442, + "step": 3490 + }, + { + "epoch": 13.059701492537313, + "grad_norm": 0.20341171324253082, + "learning_rate": 9.578866633275288e-05, + "loss": 0.0533, + "step": 3500 + }, + { + "epoch": 13.097014925373134, + "grad_norm": 0.2223699688911438, + "learning_rate": 9.575539419262434e-05, + "loss": 0.0458, + "step": 3510 + }, + { + "epoch": 13.134328358208956, + "grad_norm": 0.22557464241981506, + "learning_rate": 9.572199695899522e-05, + "loss": 0.0445, + "step": 3520 + }, + { + "epoch": 13.171641791044776, + "grad_norm": 0.25104308128356934, + "learning_rate": 9.568847472317232e-05, + "loss": 0.0435, + "step": 3530 + }, + { + "epoch": 13.208955223880597, + "grad_norm": 0.18720711767673492, + "learning_rate": 9.565482757680415e-05, + "loss": 0.0453, + "step": 3540 + }, + { + "epoch": 13.246268656716419, + "grad_norm": 0.16838951408863068, + "learning_rate": 9.562105561188069e-05, + "loss": 0.0505, + "step": 3550 + }, + { + "epoch": 13.283582089552239, + "grad_norm": 0.31681734323501587, + "learning_rate": 9.558715892073323e-05, + "loss": 0.0494, + "step": 3560 + }, + { + "epoch": 13.32089552238806, + "grad_norm": 0.2390700727701187, + "learning_rate": 9.555313759603402e-05, + "loss": 0.0538, + "step": 3570 + }, + { + "epoch": 13.35820895522388, + "grad_norm": 0.20680709183216095, + "learning_rate": 9.551899173079607e-05, + "loss": 0.0519, + "step": 3580 + }, + { + "epoch": 13.395522388059701, + "grad_norm": 0.2758580148220062, + "learning_rate": 9.548472141837286e-05, + "loss": 0.0512, + "step": 3590 + }, + { + "epoch": 13.432835820895523, + "grad_norm": 0.3653097450733185, + "learning_rate": 9.545032675245813e-05, + "loss": 0.0496, + "step": 3600 + }, + { + "epoch": 13.470149253731343, + "grad_norm": 0.23886866867542267, + "learning_rate": 9.541580782708557e-05, + "loss": 0.0455, + "step": 3610 + }, + { + "epoch": 13.507462686567164, + "grad_norm": 0.3280908465385437, + "learning_rate": 9.538116473662861e-05, + "loss": 0.0489, + "step": 3620 + }, + { + "epoch": 13.544776119402986, + "grad_norm": 0.20268180966377258, + "learning_rate": 9.534639757580013e-05, + "loss": 0.0484, + "step": 3630 + }, + { + "epoch": 13.582089552238806, + "grad_norm": 0.2582015097141266, + "learning_rate": 9.531150643965223e-05, + "loss": 0.0487, + "step": 3640 + }, + { + "epoch": 13.619402985074627, + "grad_norm": 0.18157973885536194, + "learning_rate": 9.527649142357596e-05, + "loss": 0.0496, + "step": 3650 + }, + { + "epoch": 13.656716417910447, + "grad_norm": 0.22841542959213257, + "learning_rate": 9.524135262330098e-05, + "loss": 0.0467, + "step": 3660 + }, + { + "epoch": 13.694029850746269, + "grad_norm": 0.2519935369491577, + "learning_rate": 9.520609013489547e-05, + "loss": 0.0487, + "step": 3670 + }, + { + "epoch": 13.73134328358209, + "grad_norm": 0.24680495262145996, + "learning_rate": 9.517070405476575e-05, + "loss": 0.0457, + "step": 3680 + }, + { + "epoch": 13.76865671641791, + "grad_norm": 0.26362067461013794, + "learning_rate": 9.513519447965595e-05, + "loss": 0.0495, + "step": 3690 + }, + { + "epoch": 13.805970149253731, + "grad_norm": 0.3240712583065033, + "learning_rate": 9.509956150664796e-05, + "loss": 0.0496, + "step": 3700 + }, + { + "epoch": 13.843283582089553, + "grad_norm": 0.21009013056755066, + "learning_rate": 9.50638052331609e-05, + "loss": 0.0457, + "step": 3710 + }, + { + "epoch": 13.880597014925373, + "grad_norm": 0.1669154316186905, + "learning_rate": 9.502792575695112e-05, + "loss": 0.0496, + "step": 3720 + }, + { + "epoch": 13.917910447761194, + "grad_norm": 0.22347605228424072, + "learning_rate": 9.499192317611167e-05, + "loss": 0.0426, + "step": 3730 + }, + { + "epoch": 13.955223880597014, + "grad_norm": 0.15208907425403595, + "learning_rate": 9.49557975890723e-05, + "loss": 0.0447, + "step": 3740 + }, + { + "epoch": 13.992537313432836, + "grad_norm": 0.3206101059913635, + "learning_rate": 9.491954909459895e-05, + "loss": 0.0471, + "step": 3750 + }, + { + "epoch": 14.029850746268657, + "grad_norm": 0.15873713791370392, + "learning_rate": 9.488317779179361e-05, + "loss": 0.0401, + "step": 3760 + }, + { + "epoch": 14.067164179104477, + "grad_norm": 0.19690357148647308, + "learning_rate": 9.484668378009408e-05, + "loss": 0.0491, + "step": 3770 + }, + { + "epoch": 14.104477611940299, + "grad_norm": 0.3211113214492798, + "learning_rate": 9.481006715927351e-05, + "loss": 0.049, + "step": 3780 + }, + { + "epoch": 14.14179104477612, + "grad_norm": 0.27657604217529297, + "learning_rate": 9.477332802944044e-05, + "loss": 0.0396, + "step": 3790 + }, + { + "epoch": 14.17910447761194, + "grad_norm": 0.20194031298160553, + "learning_rate": 9.473646649103818e-05, + "loss": 0.0442, + "step": 3800 + }, + { + "epoch": 14.216417910447761, + "grad_norm": 0.20344595611095428, + "learning_rate": 9.46994826448448e-05, + "loss": 0.0427, + "step": 3810 + }, + { + "epoch": 14.253731343283581, + "grad_norm": 0.2067718505859375, + "learning_rate": 9.46623765919727e-05, + "loss": 0.0501, + "step": 3820 + }, + { + "epoch": 14.291044776119403, + "grad_norm": 0.29719170928001404, + "learning_rate": 9.462514843386845e-05, + "loss": 0.0519, + "step": 3830 + }, + { + "epoch": 14.328358208955224, + "grad_norm": 0.2347182184457779, + "learning_rate": 9.458779827231237e-05, + "loss": 0.0413, + "step": 3840 + }, + { + "epoch": 14.365671641791044, + "grad_norm": 0.1558852344751358, + "learning_rate": 9.45503262094184e-05, + "loss": 0.0442, + "step": 3850 + }, + { + "epoch": 14.402985074626866, + "grad_norm": 0.23085005581378937, + "learning_rate": 9.451273234763371e-05, + "loss": 0.047, + "step": 3860 + }, + { + "epoch": 14.440298507462687, + "grad_norm": 0.1515151560306549, + "learning_rate": 9.447501678973852e-05, + "loss": 0.0481, + "step": 3870 + }, + { + "epoch": 14.477611940298507, + "grad_norm": 0.1916729211807251, + "learning_rate": 9.443717963884569e-05, + "loss": 0.0474, + "step": 3880 + }, + { + "epoch": 14.514925373134329, + "grad_norm": 0.2536492943763733, + "learning_rate": 9.439922099840054e-05, + "loss": 0.0382, + "step": 3890 + }, + { + "epoch": 14.552238805970148, + "grad_norm": 0.1672086864709854, + "learning_rate": 9.43611409721806e-05, + "loss": 0.0497, + "step": 3900 + }, + { + "epoch": 14.58955223880597, + "grad_norm": 0.3644237518310547, + "learning_rate": 9.432293966429514e-05, + "loss": 0.0444, + "step": 3910 + }, + { + "epoch": 14.626865671641792, + "grad_norm": 0.20307251811027527, + "learning_rate": 9.428461717918511e-05, + "loss": 0.0452, + "step": 3920 + }, + { + "epoch": 14.664179104477611, + "grad_norm": 0.20441733300685883, + "learning_rate": 9.424617362162271e-05, + "loss": 0.0454, + "step": 3930 + }, + { + "epoch": 14.701492537313433, + "grad_norm": 0.26315611600875854, + "learning_rate": 9.420760909671118e-05, + "loss": 0.0486, + "step": 3940 + }, + { + "epoch": 14.738805970149254, + "grad_norm": 0.1983092874288559, + "learning_rate": 9.416892370988444e-05, + "loss": 0.0483, + "step": 3950 + }, + { + "epoch": 14.776119402985074, + "grad_norm": 0.18301443755626678, + "learning_rate": 9.413011756690685e-05, + "loss": 0.0456, + "step": 3960 + }, + { + "epoch": 14.813432835820896, + "grad_norm": 0.2433597594499588, + "learning_rate": 9.409119077387294e-05, + "loss": 0.0463, + "step": 3970 + }, + { + "epoch": 14.850746268656717, + "grad_norm": 0.27949392795562744, + "learning_rate": 9.405214343720707e-05, + "loss": 0.0412, + "step": 3980 + }, + { + "epoch": 14.888059701492537, + "grad_norm": 0.22806599736213684, + "learning_rate": 9.401297566366318e-05, + "loss": 0.0448, + "step": 3990 + }, + { + "epoch": 14.925373134328359, + "grad_norm": 0.25421562790870667, + "learning_rate": 9.397368756032445e-05, + "loss": 0.0426, + "step": 4000 + }, + { + "epoch": 14.962686567164178, + "grad_norm": 0.2436474859714508, + "learning_rate": 9.393427923460308e-05, + "loss": 0.0474, + "step": 4010 + }, + { + "epoch": 15.0, + "grad_norm": 0.3756405711174011, + "learning_rate": 9.389475079423988e-05, + "loss": 0.0438, + "step": 4020 + }, + { + "epoch": 15.037313432835822, + "grad_norm": 0.25687697529792786, + "learning_rate": 9.385510234730415e-05, + "loss": 0.0435, + "step": 4030 + }, + { + "epoch": 15.074626865671641, + "grad_norm": 0.17263716459274292, + "learning_rate": 9.381533400219318e-05, + "loss": 0.0455, + "step": 4040 + }, + { + "epoch": 15.111940298507463, + "grad_norm": 0.2471216470003128, + "learning_rate": 9.377544586763215e-05, + "loss": 0.0429, + "step": 4050 + }, + { + "epoch": 15.149253731343283, + "grad_norm": 0.20195460319519043, + "learning_rate": 9.373543805267368e-05, + "loss": 0.0432, + "step": 4060 + }, + { + "epoch": 15.186567164179104, + "grad_norm": 0.1709851622581482, + "learning_rate": 9.369531066669758e-05, + "loss": 0.0477, + "step": 4070 + }, + { + "epoch": 15.223880597014926, + "grad_norm": 0.23063932359218597, + "learning_rate": 9.365506381941066e-05, + "loss": 0.0379, + "step": 4080 + }, + { + "epoch": 15.261194029850746, + "grad_norm": 0.3265426754951477, + "learning_rate": 9.36146976208462e-05, + "loss": 0.0435, + "step": 4090 + }, + { + "epoch": 15.298507462686567, + "grad_norm": 0.26373934745788574, + "learning_rate": 9.357421218136386e-05, + "loss": 0.047, + "step": 4100 + }, + { + "epoch": 15.335820895522389, + "grad_norm": 0.16861388087272644, + "learning_rate": 9.353360761164931e-05, + "loss": 0.0448, + "step": 4110 + }, + { + "epoch": 15.373134328358208, + "grad_norm": 0.303790807723999, + "learning_rate": 9.349288402271388e-05, + "loss": 0.0396, + "step": 4120 + }, + { + "epoch": 15.41044776119403, + "grad_norm": 0.1940719038248062, + "learning_rate": 9.345204152589428e-05, + "loss": 0.0474, + "step": 4130 + }, + { + "epoch": 15.447761194029852, + "grad_norm": 0.34091615676879883, + "learning_rate": 9.341108023285238e-05, + "loss": 0.0424, + "step": 4140 + }, + { + "epoch": 15.485074626865671, + "grad_norm": 0.27036693692207336, + "learning_rate": 9.337000025557476e-05, + "loss": 0.0482, + "step": 4150 + }, + { + "epoch": 15.522388059701493, + "grad_norm": 0.16908007860183716, + "learning_rate": 9.332880170637252e-05, + "loss": 0.0381, + "step": 4160 + }, + { + "epoch": 15.559701492537313, + "grad_norm": 0.23332923650741577, + "learning_rate": 9.328748469788093e-05, + "loss": 0.0427, + "step": 4170 + }, + { + "epoch": 15.597014925373134, + "grad_norm": 0.16899706423282623, + "learning_rate": 9.32460493430591e-05, + "loss": 0.0439, + "step": 4180 + }, + { + "epoch": 15.634328358208956, + "grad_norm": 0.12869524955749512, + "learning_rate": 9.320449575518972e-05, + "loss": 0.0481, + "step": 4190 + }, + { + "epoch": 15.671641791044776, + "grad_norm": 0.21159130334854126, + "learning_rate": 9.316282404787871e-05, + "loss": 0.0446, + "step": 4200 + }, + { + "epoch": 15.708955223880597, + "grad_norm": 0.1849961131811142, + "learning_rate": 9.31210343350549e-05, + "loss": 0.041, + "step": 4210 + }, + { + "epoch": 15.746268656716419, + "grad_norm": 0.16107840836048126, + "learning_rate": 9.30791267309698e-05, + "loss": 0.0429, + "step": 4220 + }, + { + "epoch": 15.783582089552239, + "grad_norm": 0.14206446707248688, + "learning_rate": 9.30371013501972e-05, + "loss": 0.0409, + "step": 4230 + }, + { + "epoch": 15.82089552238806, + "grad_norm": 0.2168441116809845, + "learning_rate": 9.299495830763286e-05, + "loss": 0.0413, + "step": 4240 + }, + { + "epoch": 15.85820895522388, + "grad_norm": 0.21431951224803925, + "learning_rate": 9.295269771849427e-05, + "loss": 0.0472, + "step": 4250 + }, + { + "epoch": 15.895522388059701, + "grad_norm": 0.16851255297660828, + "learning_rate": 9.291031969832026e-05, + "loss": 0.0508, + "step": 4260 + }, + { + "epoch": 15.932835820895523, + "grad_norm": 0.18404732644557953, + "learning_rate": 9.286782436297073e-05, + "loss": 0.0402, + "step": 4270 + }, + { + "epoch": 15.970149253731343, + "grad_norm": 0.21722930669784546, + "learning_rate": 9.282521182862629e-05, + "loss": 0.0397, + "step": 4280 + }, + { + "epoch": 16.007462686567163, + "grad_norm": 0.2523709833621979, + "learning_rate": 9.278248221178798e-05, + "loss": 0.0427, + "step": 4290 + }, + { + "epoch": 16.044776119402986, + "grad_norm": 0.17736563086509705, + "learning_rate": 9.273963562927695e-05, + "loss": 0.0458, + "step": 4300 + }, + { + "epoch": 16.082089552238806, + "grad_norm": 0.20613858103752136, + "learning_rate": 9.269667219823412e-05, + "loss": 0.0387, + "step": 4310 + }, + { + "epoch": 16.119402985074625, + "grad_norm": 0.16557513177394867, + "learning_rate": 9.265359203611987e-05, + "loss": 0.0411, + "step": 4320 + }, + { + "epoch": 16.15671641791045, + "grad_norm": 0.28119519352912903, + "learning_rate": 9.261039526071374e-05, + "loss": 0.0468, + "step": 4330 + }, + { + "epoch": 16.19402985074627, + "grad_norm": 0.21538576483726501, + "learning_rate": 9.256708199011401e-05, + "loss": 0.0368, + "step": 4340 + }, + { + "epoch": 16.23134328358209, + "grad_norm": 0.19657357037067413, + "learning_rate": 9.252365234273755e-05, + "loss": 0.038, + "step": 4350 + }, + { + "epoch": 16.26865671641791, + "grad_norm": 0.19258421659469604, + "learning_rate": 9.248010643731935e-05, + "loss": 0.0414, + "step": 4360 + }, + { + "epoch": 16.30597014925373, + "grad_norm": 0.28801625967025757, + "learning_rate": 9.243644439291223e-05, + "loss": 0.0387, + "step": 4370 + }, + { + "epoch": 16.34328358208955, + "grad_norm": 0.16581468284130096, + "learning_rate": 9.239266632888659e-05, + "loss": 0.0383, + "step": 4380 + }, + { + "epoch": 16.380597014925375, + "grad_norm": 0.34664949774742126, + "learning_rate": 9.234877236492997e-05, + "loss": 0.0453, + "step": 4390 + }, + { + "epoch": 16.417910447761194, + "grad_norm": 0.1439947783946991, + "learning_rate": 9.230476262104677e-05, + "loss": 0.0466, + "step": 4400 + }, + { + "epoch": 16.455223880597014, + "grad_norm": 0.15509940683841705, + "learning_rate": 9.226063721755799e-05, + "loss": 0.0488, + "step": 4410 + }, + { + "epoch": 16.492537313432837, + "grad_norm": 0.18005985021591187, + "learning_rate": 9.221639627510076e-05, + "loss": 0.0407, + "step": 4420 + }, + { + "epoch": 16.529850746268657, + "grad_norm": 0.16012470424175262, + "learning_rate": 9.217203991462815e-05, + "loss": 0.0394, + "step": 4430 + }, + { + "epoch": 16.567164179104477, + "grad_norm": 0.2978847920894623, + "learning_rate": 9.212756825740873e-05, + "loss": 0.0451, + "step": 4440 + }, + { + "epoch": 16.604477611940297, + "grad_norm": 0.2236834019422531, + "learning_rate": 9.208298142502636e-05, + "loss": 0.0487, + "step": 4450 + }, + { + "epoch": 16.64179104477612, + "grad_norm": 0.2686060667037964, + "learning_rate": 9.20382795393797e-05, + "loss": 0.0403, + "step": 4460 + }, + { + "epoch": 16.67910447761194, + "grad_norm": 0.33534038066864014, + "learning_rate": 9.199346272268199e-05, + "loss": 0.0385, + "step": 4470 + }, + { + "epoch": 16.71641791044776, + "grad_norm": 0.19250528514385223, + "learning_rate": 9.194853109746074e-05, + "loss": 0.0441, + "step": 4480 + }, + { + "epoch": 16.753731343283583, + "grad_norm": 0.19218407571315765, + "learning_rate": 9.190348478655724e-05, + "loss": 0.0474, + "step": 4490 + }, + { + "epoch": 16.791044776119403, + "grad_norm": 0.21163488924503326, + "learning_rate": 9.185832391312644e-05, + "loss": 0.0411, + "step": 4500 + }, + { + "epoch": 16.828358208955223, + "grad_norm": 0.1758819818496704, + "learning_rate": 9.18130486006364e-05, + "loss": 0.0462, + "step": 4510 + }, + { + "epoch": 16.865671641791046, + "grad_norm": 0.18571069836616516, + "learning_rate": 9.176765897286813e-05, + "loss": 0.0425, + "step": 4520 + }, + { + "epoch": 16.902985074626866, + "grad_norm": 0.20819155871868134, + "learning_rate": 9.17221551539151e-05, + "loss": 0.0428, + "step": 4530 + }, + { + "epoch": 16.940298507462686, + "grad_norm": 0.30357328057289124, + "learning_rate": 9.167653726818305e-05, + "loss": 0.0414, + "step": 4540 + }, + { + "epoch": 16.97761194029851, + "grad_norm": 0.20977462828159332, + "learning_rate": 9.163080544038952e-05, + "loss": 0.0447, + "step": 4550 + }, + { + "epoch": 17.01492537313433, + "grad_norm": 0.2535971701145172, + "learning_rate": 9.158495979556358e-05, + "loss": 0.0384, + "step": 4560 + }, + { + "epoch": 17.05223880597015, + "grad_norm": 0.2789897620677948, + "learning_rate": 9.153900045904549e-05, + "loss": 0.042, + "step": 4570 + }, + { + "epoch": 17.08955223880597, + "grad_norm": 0.18474848568439484, + "learning_rate": 9.14929275564863e-05, + "loss": 0.0398, + "step": 4580 + }, + { + "epoch": 17.12686567164179, + "grad_norm": 0.12615208327770233, + "learning_rate": 9.144674121384757e-05, + "loss": 0.0466, + "step": 4590 + }, + { + "epoch": 17.16417910447761, + "grad_norm": 0.17756640911102295, + "learning_rate": 9.140044155740101e-05, + "loss": 0.035, + "step": 4600 + }, + { + "epoch": 17.20149253731343, + "grad_norm": 0.24410821497440338, + "learning_rate": 9.135402871372808e-05, + "loss": 0.0459, + "step": 4610 + }, + { + "epoch": 17.238805970149254, + "grad_norm": 0.21573011577129364, + "learning_rate": 9.130750280971978e-05, + "loss": 0.0385, + "step": 4620 + }, + { + "epoch": 17.276119402985074, + "grad_norm": 0.13879653811454773, + "learning_rate": 9.126086397257612e-05, + "loss": 0.0391, + "step": 4630 + }, + { + "epoch": 17.313432835820894, + "grad_norm": 0.17508305609226227, + "learning_rate": 9.121411232980588e-05, + "loss": 0.038, + "step": 4640 + }, + { + "epoch": 17.350746268656717, + "grad_norm": 0.2536008358001709, + "learning_rate": 9.116724800922629e-05, + "loss": 0.0418, + "step": 4650 + }, + { + "epoch": 17.388059701492537, + "grad_norm": 0.1942976713180542, + "learning_rate": 9.112027113896262e-05, + "loss": 0.052, + "step": 4660 + }, + { + "epoch": 17.425373134328357, + "grad_norm": 0.16561119258403778, + "learning_rate": 9.107318184744781e-05, + "loss": 0.0451, + "step": 4670 + }, + { + "epoch": 17.46268656716418, + "grad_norm": 0.22971832752227783, + "learning_rate": 9.102598026342222e-05, + "loss": 0.0407, + "step": 4680 + }, + { + "epoch": 17.5, + "grad_norm": 0.1306753158569336, + "learning_rate": 9.097866651593317e-05, + "loss": 0.042, + "step": 4690 + }, + { + "epoch": 17.53731343283582, + "grad_norm": 0.21278400719165802, + "learning_rate": 9.093124073433463e-05, + "loss": 0.0458, + "step": 4700 + }, + { + "epoch": 17.574626865671643, + "grad_norm": 0.22757171094417572, + "learning_rate": 9.088370304828685e-05, + "loss": 0.0364, + "step": 4710 + }, + { + "epoch": 17.611940298507463, + "grad_norm": 0.216596320271492, + "learning_rate": 9.083605358775612e-05, + "loss": 0.0434, + "step": 4720 + }, + { + "epoch": 17.649253731343283, + "grad_norm": 0.13022471964359283, + "learning_rate": 9.078829248301417e-05, + "loss": 0.0415, + "step": 4730 + }, + { + "epoch": 17.686567164179106, + "grad_norm": 0.2280716598033905, + "learning_rate": 9.074041986463808e-05, + "loss": 0.0385, + "step": 4740 + }, + { + "epoch": 17.723880597014926, + "grad_norm": 0.14666135609149933, + "learning_rate": 9.069243586350975e-05, + "loss": 0.0347, + "step": 4750 + }, + { + "epoch": 17.761194029850746, + "grad_norm": 0.1631281077861786, + "learning_rate": 9.064434061081562e-05, + "loss": 0.0407, + "step": 4760 + }, + { + "epoch": 17.798507462686565, + "grad_norm": 0.18697327375411987, + "learning_rate": 9.059613423804623e-05, + "loss": 0.0425, + "step": 4770 + }, + { + "epoch": 17.83582089552239, + "grad_norm": 0.12955111265182495, + "learning_rate": 9.0547816876996e-05, + "loss": 0.0417, + "step": 4780 + }, + { + "epoch": 17.87313432835821, + "grad_norm": 0.15547148883342743, + "learning_rate": 9.049938865976275e-05, + "loss": 0.0409, + "step": 4790 + }, + { + "epoch": 17.91044776119403, + "grad_norm": 0.1900598704814911, + "learning_rate": 9.045084971874738e-05, + "loss": 0.0369, + "step": 4800 + }, + { + "epoch": 17.94776119402985, + "grad_norm": 0.1846715807914734, + "learning_rate": 9.040220018665347e-05, + "loss": 0.0415, + "step": 4810 + }, + { + "epoch": 17.98507462686567, + "grad_norm": 0.1829937845468521, + "learning_rate": 9.035344019648702e-05, + "loss": 0.0407, + "step": 4820 + }, + { + "epoch": 18.02238805970149, + "grad_norm": 0.25900354981422424, + "learning_rate": 9.030456988155596e-05, + "loss": 0.0398, + "step": 4830 + }, + { + "epoch": 18.059701492537314, + "grad_norm": 0.21235992014408112, + "learning_rate": 9.025558937546988e-05, + "loss": 0.0477, + "step": 4840 + }, + { + "epoch": 18.097014925373134, + "grad_norm": 0.18785078823566437, + "learning_rate": 9.020649881213958e-05, + "loss": 0.039, + "step": 4850 + }, + { + "epoch": 18.134328358208954, + "grad_norm": 0.1951548010110855, + "learning_rate": 9.015729832577681e-05, + "loss": 0.0357, + "step": 4860 + }, + { + "epoch": 18.171641791044777, + "grad_norm": 0.1280934363603592, + "learning_rate": 9.010798805089384e-05, + "loss": 0.0425, + "step": 4870 + }, + { + "epoch": 18.208955223880597, + "grad_norm": 0.1693423092365265, + "learning_rate": 9.005856812230304e-05, + "loss": 0.0447, + "step": 4880 + }, + { + "epoch": 18.246268656716417, + "grad_norm": 0.23712658882141113, + "learning_rate": 9.000903867511666e-05, + "loss": 0.042, + "step": 4890 + }, + { + "epoch": 18.28358208955224, + "grad_norm": 0.26489710807800293, + "learning_rate": 8.995939984474624e-05, + "loss": 0.0457, + "step": 4900 + }, + { + "epoch": 18.32089552238806, + "grad_norm": 0.20792756974697113, + "learning_rate": 8.990965176690252e-05, + "loss": 0.0422, + "step": 4910 + }, + { + "epoch": 18.35820895522388, + "grad_norm": 0.18526089191436768, + "learning_rate": 8.98597945775948e-05, + "loss": 0.0366, + "step": 4920 + }, + { + "epoch": 18.395522388059703, + "grad_norm": 0.2214607298374176, + "learning_rate": 8.980982841313074e-05, + "loss": 0.0405, + "step": 4930 + }, + { + "epoch": 18.432835820895523, + "grad_norm": 0.1896953135728836, + "learning_rate": 8.975975341011596e-05, + "loss": 0.0391, + "step": 4940 + }, + { + "epoch": 18.470149253731343, + "grad_norm": 0.1430232971906662, + "learning_rate": 8.970956970545355e-05, + "loss": 0.0403, + "step": 4950 + }, + { + "epoch": 18.507462686567163, + "grad_norm": 0.1991272121667862, + "learning_rate": 8.965927743634391e-05, + "loss": 0.0429, + "step": 4960 + }, + { + "epoch": 18.544776119402986, + "grad_norm": 0.2361849844455719, + "learning_rate": 8.96088767402841e-05, + "loss": 0.0416, + "step": 4970 + }, + { + "epoch": 18.582089552238806, + "grad_norm": 0.25857019424438477, + "learning_rate": 8.955836775506776e-05, + "loss": 0.0461, + "step": 4980 + }, + { + "epoch": 18.619402985074625, + "grad_norm": 0.12873682379722595, + "learning_rate": 8.950775061878453e-05, + "loss": 0.035, + "step": 4990 + }, + { + "epoch": 18.65671641791045, + "grad_norm": 0.19786769151687622, + "learning_rate": 8.945702546981969e-05, + "loss": 0.0399, + "step": 5000 + } + ], + "logging_steps": 10, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 75, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.758678370802253e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-5500/README.md b/checkpoint-5500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c7b89968043c4a4cf38dcac1f9bc557c35da3883 --- /dev/null +++ b/checkpoint-5500/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/ibru/.cache/huggingface/hub/models--nvidia--GR00T-N1-2B/snapshots/32e1fd2507f7739fad443e6b449c8188e0e02fcb +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-5500/adapter_config.json b/checkpoint-5500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8f45e5c825b3b34b334d049ddf8e68e52a500cc6 --- /dev/null +++ b/checkpoint-5500/adapter_config.json @@ -0,0 +1,36 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/ibru/.cache/huggingface/hub/models--nvidia--GR00T-N1-2B/snapshots/32e1fd2507f7739fad443e6b449c8188e0e02fcb", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "to_k", + "to_q", + "v_proj", + "to_v" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-5500/adapter_model.safetensors b/checkpoint-5500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6ecbd566a6d173de7cd1301cc18df7a2a9fffa32 --- /dev/null +++ b/checkpoint-5500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af956da5fc247034f8acc05baa83b16179bfe24b3f0ae5b6ffa3a7aea4db48b5 +size 123328576 diff --git a/checkpoint-5500/experiment_cfg/metadata.json b/checkpoint-5500/experiment_cfg/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..40302046074c7e429ab3933ad6b163f9735902de --- /dev/null +++ b/checkpoint-5500/experiment_cfg/metadata.json @@ -0,0 +1,275 @@ +{ + "new_embodiment": { + "statistics": { + "state": { + "single_arm": { + "max": [ + 47.021484375, + 135.263671875, + 178.505859375, + 78.3984375, + 56.77734375 + ], + "min": [ + -25.576171875, + 46.93359375, + 89.736328125, + -30.41015625, + -77.607421875 + ], + "mean": [ + 7.780572414398193, + 121.54933166503906, + 145.44825744628906, + 26.051393508911133, + -12.748016357421875 + ], + "std": [ + 11.060831069946289, + 21.937597274780273, + 17.16187286376953, + 19.231945037841797, + 14.66512680053711 + ], + "q01": [ + -17.578125, + 58.0078125, + 97.998046875, + -13.447265625, + -39.9005859375 + ], + "q99": [ + 36.650390625, + 134.47265625, + 178.41796875, + 66.65009765625, + 40.166015625 + ] + }, + "gripper": { + "max": [ + 52.22222137451172 + ], + "min": [ + -3.846153974533081 + ], + "mean": [ + 10.933439254760742 + ], + "std": [ + 15.509913444519043 + ], + "q01": [ + -3.846153974533081 + ], + "q99": [ + 51.02564239501953 + ] + }, + "mobile_base": { + "max": [ + 75.42072296142578, + 276.7638244628906, + 93.75 + ], + "min": [ + -170.01620483398438, + -274.5497131347656, + -93.75 + ], + "mean": [ + -0.31241804361343384, + 58.99717712402344, + 2.4293017387390137 + ], + "std": [ + 10.56183910369873, + 119.39802551269531, + 22.590484619140625 + ], + "q01": [ + -33.65809627532959, + -265.6932678222656, + -72.849609375 + ], + "q99": [ + 30.679615020751953, + 270.1214904785156, + 90.234375 + ] + } + }, + "action": { + "single_arm": { + "max": [ + 37.96875, + 135.087890625, + 179.384765625, + 78.837890625, + 57.392578125 + ], + "min": [ + -26.279296875, + 47.373046875, + 89.912109375, + -31.640625, + -77.16796875 + ], + "mean": [ + 8.038639068603516, + 122.76031494140625, + 145.15855407714844, + 26.28432846069336, + -13.195321083068848 + ], + "std": [ + 11.36032772064209, + 21.925451278686523, + 17.071842193603516, + 19.503877639770508, + 14.882487297058105 + ], + "q01": [ + -18.10546875, + 58.623046875, + 98.26171875, + -14.326171875, + -40.078125 + ], + "q99": [ + 37.44140625, + 135.087890625, + 179.296875, + 67.1484375, + 40.869140625 + ] + }, + "gripper": { + "max": [ + 52.646484375 + ], + "min": [ + -10.72265625 + ], + "mean": [ + 4.366570949554443 + ], + "std": [ + 18.90865707397461 + ], + "q01": [ + -10.546875 + ], + "q99": [ + 51.767578125 + ] + }, + "mobile_base": { + "max": [ + 230.0971221923828, + 265.6932678222656, + 90.0 + ], + "min": [ + -230.0971221923828, + -265.6932678222656, + -90.0 + ], + "mean": [ + -0.36507830023765564, + 60.13115310668945, + 2.5394127368927 + ], + "std": [ + 15.02155590057373, + 129.06507873535156, + 27.82071304321289 + ], + "q01": [ + -0.02556634694337845, + -265.6932678222656, + -90.0 + ], + "q99": [ + 0.02556634694337845, + 265.6932678222656, + 90.0 + ] + } + } + }, + "modalities": { + "video": { + "wrist": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + }, + "front": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + } + }, + "state": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + }, + "mobile_base": { + "absolute": true, + "rotation_type": null, + "shape": [ + 3 + ], + "continuous": true + } + }, + "action": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + }, + "mobile_base": { + "absolute": true, + "rotation_type": null, + "shape": [ + 3 + ], + "continuous": true + } + } + }, + "embodiment_tag": "new_embodiment" + } +} \ No newline at end of file diff --git a/checkpoint-5500/optimizer.pt b/checkpoint-5500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..36b6b657d333d8bff9efbe8dc2c0e1ae8345c90e --- /dev/null +++ b/checkpoint-5500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:824f4483f101baf3361f5c6ba11df31164f64392f31713e19eaaebc052b7ca63 +size 246824634 diff --git a/checkpoint-5500/rng_state.pth b/checkpoint-5500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..7bef2d0e31ace1ea6a706d6394d750bcefd4535f --- /dev/null +++ b/checkpoint-5500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69d4aa547bd6e670e9fd947ee26f6c4619dee2f8046c1b446067ae49dc3a9c0d +size 14244 diff --git a/checkpoint-5500/scheduler.pt b/checkpoint-5500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9239278542ebe582980c4469b926cd0ca58b0625 --- /dev/null +++ b/checkpoint-5500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1c3fd1b661e3333c011abb51039c1da9b026667d8841fc5e277a18e72de1fd6 +size 1064 diff --git a/checkpoint-5500/trainer_state.json b/checkpoint-5500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ae454b3c784d19df17b1078a46dc4628478df502 --- /dev/null +++ b/checkpoint-5500/trainer_state.json @@ -0,0 +1,3883 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 20.52238805970149, + "eval_steps": 500, + "global_step": 5500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03731343283582089, + "grad_norm": 0.8186072111129761, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.3847, + "step": 10 + }, + { + "epoch": 0.07462686567164178, + "grad_norm": 0.5007426142692566, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.4283, + "step": 20 + }, + { + "epoch": 0.11194029850746269, + "grad_norm": 0.49460887908935547, + "learning_rate": 3e-06, + "loss": 1.4868, + "step": 30 + }, + { + "epoch": 0.14925373134328357, + "grad_norm": 0.5032920837402344, + "learning_rate": 4.000000000000001e-06, + "loss": 1.4491, + "step": 40 + }, + { + "epoch": 0.1865671641791045, + "grad_norm": 0.5688469409942627, + "learning_rate": 5e-06, + "loss": 1.3703, + "step": 50 + }, + { + "epoch": 0.22388059701492538, + "grad_norm": 0.5052517652511597, + "learning_rate": 6e-06, + "loss": 1.419, + "step": 60 + }, + { + "epoch": 0.26119402985074625, + "grad_norm": 0.6315643787384033, + "learning_rate": 7.000000000000001e-06, + "loss": 1.3058, + "step": 70 + }, + { + "epoch": 0.29850746268656714, + "grad_norm": 0.6060447692871094, + "learning_rate": 8.000000000000001e-06, + "loss": 1.2908, + "step": 80 + }, + { + "epoch": 0.3358208955223881, + "grad_norm": 0.5513179302215576, + "learning_rate": 9e-06, + "loss": 1.2311, + "step": 90 + }, + { + "epoch": 0.373134328358209, + "grad_norm": 0.8467404246330261, + "learning_rate": 1e-05, + "loss": 1.2043, + "step": 100 + }, + { + "epoch": 0.41044776119402987, + "grad_norm": 0.8141824007034302, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.0707, + "step": 110 + }, + { + "epoch": 0.44776119402985076, + "grad_norm": 0.7932347059249878, + "learning_rate": 1.2e-05, + "loss": 0.9377, + "step": 120 + }, + { + "epoch": 0.48507462686567165, + "grad_norm": 0.684220552444458, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.714, + "step": 130 + }, + { + "epoch": 0.5223880597014925, + "grad_norm": 0.5886895060539246, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.6479, + "step": 140 + }, + { + "epoch": 0.5597014925373134, + "grad_norm": 0.4764939248561859, + "learning_rate": 1.5e-05, + "loss": 0.5463, + "step": 150 + }, + { + "epoch": 0.5970149253731343, + "grad_norm": 0.4621008038520813, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.4641, + "step": 160 + }, + { + "epoch": 0.6343283582089553, + "grad_norm": 0.46492910385131836, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.4159, + "step": 170 + }, + { + "epoch": 0.6716417910447762, + "grad_norm": 0.5017415881156921, + "learning_rate": 1.8e-05, + "loss": 0.4094, + "step": 180 + }, + { + "epoch": 0.7089552238805971, + "grad_norm": 0.34392210841178894, + "learning_rate": 1.9e-05, + "loss": 0.3478, + "step": 190 + }, + { + "epoch": 0.746268656716418, + "grad_norm": 0.3240516483783722, + "learning_rate": 2e-05, + "loss": 0.3821, + "step": 200 + }, + { + "epoch": 0.7835820895522388, + "grad_norm": 0.26301339268684387, + "learning_rate": 2.1e-05, + "loss": 0.3606, + "step": 210 + }, + { + "epoch": 0.8208955223880597, + "grad_norm": 0.34712520241737366, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.3421, + "step": 220 + }, + { + "epoch": 0.8582089552238806, + "grad_norm": 0.3248469829559326, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.3389, + "step": 230 + }, + { + "epoch": 0.8955223880597015, + "grad_norm": 0.298149436712265, + "learning_rate": 2.4e-05, + "loss": 0.3145, + "step": 240 + }, + { + "epoch": 0.9328358208955224, + "grad_norm": 0.2757190763950348, + "learning_rate": 2.5e-05, + "loss": 0.3065, + "step": 250 + }, + { + "epoch": 0.9701492537313433, + "grad_norm": 0.30510950088500977, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.2971, + "step": 260 + }, + { + "epoch": 1.007462686567164, + "grad_norm": 0.37349891662597656, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.3273, + "step": 270 + }, + { + "epoch": 1.044776119402985, + "grad_norm": 0.3667634129524231, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.308, + "step": 280 + }, + { + "epoch": 1.0820895522388059, + "grad_norm": 0.3463355004787445, + "learning_rate": 2.9e-05, + "loss": 0.3109, + "step": 290 + }, + { + "epoch": 1.1194029850746268, + "grad_norm": 0.3888525366783142, + "learning_rate": 3e-05, + "loss": 0.2644, + "step": 300 + }, + { + "epoch": 1.1567164179104479, + "grad_norm": 0.3749147951602936, + "learning_rate": 3.1e-05, + "loss": 0.2858, + "step": 310 + }, + { + "epoch": 1.1940298507462686, + "grad_norm": 0.3270276188850403, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.2573, + "step": 320 + }, + { + "epoch": 1.2313432835820897, + "grad_norm": 0.3658592998981476, + "learning_rate": 3.3e-05, + "loss": 0.2613, + "step": 330 + }, + { + "epoch": 1.2686567164179103, + "grad_norm": 0.3526328206062317, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.2328, + "step": 340 + }, + { + "epoch": 1.3059701492537314, + "grad_norm": 0.4528139531612396, + "learning_rate": 3.5e-05, + "loss": 0.2429, + "step": 350 + }, + { + "epoch": 1.3432835820895521, + "grad_norm": 0.5426791310310364, + "learning_rate": 3.6e-05, + "loss": 0.2209, + "step": 360 + }, + { + "epoch": 1.3805970149253732, + "grad_norm": 0.41844552755355835, + "learning_rate": 3.7e-05, + "loss": 0.2319, + "step": 370 + }, + { + "epoch": 1.417910447761194, + "grad_norm": 0.4749431908130646, + "learning_rate": 3.8e-05, + "loss": 0.2233, + "step": 380 + }, + { + "epoch": 1.455223880597015, + "grad_norm": 0.7010189890861511, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.2181, + "step": 390 + }, + { + "epoch": 1.4925373134328357, + "grad_norm": 0.5747635960578918, + "learning_rate": 4e-05, + "loss": 0.213, + "step": 400 + }, + { + "epoch": 1.5298507462686568, + "grad_norm": 0.3661474287509918, + "learning_rate": 4.1e-05, + "loss": 0.2171, + "step": 410 + }, + { + "epoch": 1.5671641791044775, + "grad_norm": 0.467835396528244, + "learning_rate": 4.2e-05, + "loss": 0.1985, + "step": 420 + }, + { + "epoch": 1.6044776119402986, + "grad_norm": 0.5470123291015625, + "learning_rate": 4.3e-05, + "loss": 0.2176, + "step": 430 + }, + { + "epoch": 1.6417910447761193, + "grad_norm": 0.5761199593544006, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.2007, + "step": 440 + }, + { + "epoch": 1.6791044776119404, + "grad_norm": 0.48257485032081604, + "learning_rate": 4.5e-05, + "loss": 0.2043, + "step": 450 + }, + { + "epoch": 1.716417910447761, + "grad_norm": 0.48353052139282227, + "learning_rate": 4.600000000000001e-05, + "loss": 0.1872, + "step": 460 + }, + { + "epoch": 1.7537313432835822, + "grad_norm": 0.4388391375541687, + "learning_rate": 4.7e-05, + "loss": 0.206, + "step": 470 + }, + { + "epoch": 1.7910447761194028, + "grad_norm": 0.47332626581192017, + "learning_rate": 4.8e-05, + "loss": 0.1876, + "step": 480 + }, + { + "epoch": 1.828358208955224, + "grad_norm": 0.8053535223007202, + "learning_rate": 4.9e-05, + "loss": 0.1839, + "step": 490 + }, + { + "epoch": 1.8656716417910446, + "grad_norm": 0.413979709148407, + "learning_rate": 5e-05, + "loss": 0.1732, + "step": 500 + }, + { + "epoch": 1.9029850746268657, + "grad_norm": 0.36910712718963623, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.1827, + "step": 510 + }, + { + "epoch": 1.9402985074626866, + "grad_norm": 0.8458298444747925, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.1727, + "step": 520 + }, + { + "epoch": 1.9776119402985075, + "grad_norm": 0.5452115535736084, + "learning_rate": 5.300000000000001e-05, + "loss": 0.1818, + "step": 530 + }, + { + "epoch": 2.014925373134328, + "grad_norm": 0.4518108069896698, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.177, + "step": 540 + }, + { + "epoch": 2.0522388059701493, + "grad_norm": 0.66865074634552, + "learning_rate": 5.500000000000001e-05, + "loss": 0.1726, + "step": 550 + }, + { + "epoch": 2.08955223880597, + "grad_norm": 0.6536034345626831, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.1541, + "step": 560 + }, + { + "epoch": 2.126865671641791, + "grad_norm": 0.5571377277374268, + "learning_rate": 5.6999999999999996e-05, + "loss": 0.1671, + "step": 570 + }, + { + "epoch": 2.1641791044776117, + "grad_norm": 0.5385546684265137, + "learning_rate": 5.8e-05, + "loss": 0.1582, + "step": 580 + }, + { + "epoch": 2.201492537313433, + "grad_norm": 0.577961266040802, + "learning_rate": 5.9e-05, + "loss": 0.1528, + "step": 590 + }, + { + "epoch": 2.2388059701492535, + "grad_norm": 0.5082416534423828, + "learning_rate": 6e-05, + "loss": 0.1638, + "step": 600 + }, + { + "epoch": 2.2761194029850746, + "grad_norm": 0.5490861535072327, + "learning_rate": 6.1e-05, + "loss": 0.166, + "step": 610 + }, + { + "epoch": 2.3134328358208958, + "grad_norm": 0.492366760969162, + "learning_rate": 6.2e-05, + "loss": 0.1481, + "step": 620 + }, + { + "epoch": 2.3507462686567164, + "grad_norm": 0.3702855110168457, + "learning_rate": 6.3e-05, + "loss": 0.1514, + "step": 630 + }, + { + "epoch": 2.388059701492537, + "grad_norm": 0.664667010307312, + "learning_rate": 6.400000000000001e-05, + "loss": 0.1441, + "step": 640 + }, + { + "epoch": 2.425373134328358, + "grad_norm": 0.33382174372673035, + "learning_rate": 6.500000000000001e-05, + "loss": 0.1573, + "step": 650 + }, + { + "epoch": 2.4626865671641793, + "grad_norm": 0.4848814010620117, + "learning_rate": 6.6e-05, + "loss": 0.1457, + "step": 660 + }, + { + "epoch": 2.5, + "grad_norm": 0.3649997413158417, + "learning_rate": 6.7e-05, + "loss": 0.1467, + "step": 670 + }, + { + "epoch": 2.5373134328358207, + "grad_norm": 0.6385223865509033, + "learning_rate": 6.800000000000001e-05, + "loss": 0.145, + "step": 680 + }, + { + "epoch": 2.574626865671642, + "grad_norm": 0.4580625891685486, + "learning_rate": 6.9e-05, + "loss": 0.1352, + "step": 690 + }, + { + "epoch": 2.611940298507463, + "grad_norm": 0.5141746401786804, + "learning_rate": 7e-05, + "loss": 0.1444, + "step": 700 + }, + { + "epoch": 2.6492537313432836, + "grad_norm": 0.40220722556114197, + "learning_rate": 7.1e-05, + "loss": 0.1493, + "step": 710 + }, + { + "epoch": 2.6865671641791042, + "grad_norm": 0.5510571002960205, + "learning_rate": 7.2e-05, + "loss": 0.1387, + "step": 720 + }, + { + "epoch": 2.7238805970149254, + "grad_norm": 0.43814659118652344, + "learning_rate": 7.3e-05, + "loss": 0.1374, + "step": 730 + }, + { + "epoch": 2.7611940298507465, + "grad_norm": 0.4118008613586426, + "learning_rate": 7.4e-05, + "loss": 0.1297, + "step": 740 + }, + { + "epoch": 2.798507462686567, + "grad_norm": 0.5626503229141235, + "learning_rate": 7.500000000000001e-05, + "loss": 0.1299, + "step": 750 + }, + { + "epoch": 2.835820895522388, + "grad_norm": 0.4066360592842102, + "learning_rate": 7.6e-05, + "loss": 0.1102, + "step": 760 + }, + { + "epoch": 2.873134328358209, + "grad_norm": 0.47184985876083374, + "learning_rate": 7.7e-05, + "loss": 0.1219, + "step": 770 + }, + { + "epoch": 2.91044776119403, + "grad_norm": 0.6611475348472595, + "learning_rate": 7.800000000000001e-05, + "loss": 0.1267, + "step": 780 + }, + { + "epoch": 2.9477611940298507, + "grad_norm": 0.3570108413696289, + "learning_rate": 7.900000000000001e-05, + "loss": 0.1191, + "step": 790 + }, + { + "epoch": 2.9850746268656714, + "grad_norm": 0.4581681489944458, + "learning_rate": 8e-05, + "loss": 0.1209, + "step": 800 + }, + { + "epoch": 3.0223880597014925, + "grad_norm": 0.4643435776233673, + "learning_rate": 8.1e-05, + "loss": 0.129, + "step": 810 + }, + { + "epoch": 3.0597014925373136, + "grad_norm": 0.5595763921737671, + "learning_rate": 8.2e-05, + "loss": 0.1158, + "step": 820 + }, + { + "epoch": 3.0970149253731343, + "grad_norm": 0.48848605155944824, + "learning_rate": 8.3e-05, + "loss": 0.1188, + "step": 830 + }, + { + "epoch": 3.1343283582089554, + "grad_norm": 0.4496570825576782, + "learning_rate": 8.4e-05, + "loss": 0.114, + "step": 840 + }, + { + "epoch": 3.171641791044776, + "grad_norm": 0.31364986300468445, + "learning_rate": 8.5e-05, + "loss": 0.1196, + "step": 850 + }, + { + "epoch": 3.208955223880597, + "grad_norm": 0.3395878076553345, + "learning_rate": 8.6e-05, + "loss": 0.1124, + "step": 860 + }, + { + "epoch": 3.246268656716418, + "grad_norm": 0.4917413592338562, + "learning_rate": 8.7e-05, + "loss": 0.1074, + "step": 870 + }, + { + "epoch": 3.283582089552239, + "grad_norm": 0.44114553928375244, + "learning_rate": 8.800000000000001e-05, + "loss": 0.1095, + "step": 880 + }, + { + "epoch": 3.3208955223880596, + "grad_norm": 0.3323831558227539, + "learning_rate": 8.900000000000001e-05, + "loss": 0.106, + "step": 890 + }, + { + "epoch": 3.3582089552238807, + "grad_norm": 0.4495660066604614, + "learning_rate": 9e-05, + "loss": 0.1222, + "step": 900 + }, + { + "epoch": 3.3955223880597014, + "grad_norm": 0.40784788131713867, + "learning_rate": 9.1e-05, + "loss": 0.1048, + "step": 910 + }, + { + "epoch": 3.4328358208955225, + "grad_norm": 0.4643700420856476, + "learning_rate": 9.200000000000001e-05, + "loss": 0.1097, + "step": 920 + }, + { + "epoch": 3.470149253731343, + "grad_norm": 0.472494512796402, + "learning_rate": 9.300000000000001e-05, + "loss": 0.1041, + "step": 930 + }, + { + "epoch": 3.5074626865671643, + "grad_norm": 0.6110897660255432, + "learning_rate": 9.4e-05, + "loss": 0.0959, + "step": 940 + }, + { + "epoch": 3.544776119402985, + "grad_norm": 0.5313069820404053, + "learning_rate": 9.5e-05, + "loss": 0.113, + "step": 950 + }, + { + "epoch": 3.582089552238806, + "grad_norm": 0.4223133623600006, + "learning_rate": 9.6e-05, + "loss": 0.099, + "step": 960 + }, + { + "epoch": 3.6194029850746268, + "grad_norm": 0.5464731454849243, + "learning_rate": 9.7e-05, + "loss": 0.1008, + "step": 970 + }, + { + "epoch": 3.656716417910448, + "grad_norm": 0.3538314402103424, + "learning_rate": 9.8e-05, + "loss": 0.1049, + "step": 980 + }, + { + "epoch": 3.6940298507462686, + "grad_norm": 0.7460148334503174, + "learning_rate": 9.900000000000001e-05, + "loss": 0.1088, + "step": 990 + }, + { + "epoch": 3.7313432835820897, + "grad_norm": 0.3210597038269043, + "learning_rate": 0.0001, + "loss": 0.1041, + "step": 1000 + }, + { + "epoch": 3.7686567164179103, + "grad_norm": 0.4450497627258301, + "learning_rate": 9.999993165095463e-05, + "loss": 0.0985, + "step": 1010 + }, + { + "epoch": 3.8059701492537314, + "grad_norm": 0.4348960816860199, + "learning_rate": 9.999972660400536e-05, + "loss": 0.1015, + "step": 1020 + }, + { + "epoch": 3.843283582089552, + "grad_norm": 0.462782621383667, + "learning_rate": 9.999938485971279e-05, + "loss": 0.1068, + "step": 1030 + }, + { + "epoch": 3.8805970149253732, + "grad_norm": 0.3801368474960327, + "learning_rate": 9.999890641901125e-05, + "loss": 0.1117, + "step": 1040 + }, + { + "epoch": 3.917910447761194, + "grad_norm": 0.45135366916656494, + "learning_rate": 9.999829128320874e-05, + "loss": 0.0917, + "step": 1050 + }, + { + "epoch": 3.955223880597015, + "grad_norm": 0.41138389706611633, + "learning_rate": 9.999753945398704e-05, + "loss": 0.1049, + "step": 1060 + }, + { + "epoch": 3.9925373134328357, + "grad_norm": 0.4976252317428589, + "learning_rate": 9.999665093340165e-05, + "loss": 0.1029, + "step": 1070 + }, + { + "epoch": 4.029850746268656, + "grad_norm": 0.46372008323669434, + "learning_rate": 9.99956257238817e-05, + "loss": 0.1012, + "step": 1080 + }, + { + "epoch": 4.067164179104478, + "grad_norm": 0.546938955783844, + "learning_rate": 9.999446382823013e-05, + "loss": 0.0829, + "step": 1090 + }, + { + "epoch": 4.104477611940299, + "grad_norm": 0.40513405203819275, + "learning_rate": 9.999316524962345e-05, + "loss": 0.0933, + "step": 1100 + }, + { + "epoch": 4.141791044776119, + "grad_norm": 0.4198484420776367, + "learning_rate": 9.999172999161198e-05, + "loss": 0.0895, + "step": 1110 + }, + { + "epoch": 4.17910447761194, + "grad_norm": 0.3965628743171692, + "learning_rate": 9.999015805811965e-05, + "loss": 0.0917, + "step": 1120 + }, + { + "epoch": 4.2164179104477615, + "grad_norm": 0.3095884621143341, + "learning_rate": 9.998844945344405e-05, + "loss": 0.0953, + "step": 1130 + }, + { + "epoch": 4.253731343283582, + "grad_norm": 0.7962276339530945, + "learning_rate": 9.998660418225645e-05, + "loss": 0.0979, + "step": 1140 + }, + { + "epoch": 4.291044776119403, + "grad_norm": 0.42066490650177, + "learning_rate": 9.998462224960175e-05, + "loss": 0.099, + "step": 1150 + }, + { + "epoch": 4.3283582089552235, + "grad_norm": 0.3894193470478058, + "learning_rate": 9.998250366089848e-05, + "loss": 0.0887, + "step": 1160 + }, + { + "epoch": 4.365671641791045, + "grad_norm": 0.28998032212257385, + "learning_rate": 9.998024842193876e-05, + "loss": 0.0943, + "step": 1170 + }, + { + "epoch": 4.402985074626866, + "grad_norm": 0.3919823467731476, + "learning_rate": 9.997785653888835e-05, + "loss": 0.0916, + "step": 1180 + }, + { + "epoch": 4.440298507462686, + "grad_norm": 0.3708650469779968, + "learning_rate": 9.997532801828658e-05, + "loss": 0.0858, + "step": 1190 + }, + { + "epoch": 4.477611940298507, + "grad_norm": 0.2935069799423218, + "learning_rate": 9.997266286704631e-05, + "loss": 0.0992, + "step": 1200 + }, + { + "epoch": 4.514925373134329, + "grad_norm": 0.4675377607345581, + "learning_rate": 9.996986109245395e-05, + "loss": 0.0854, + "step": 1210 + }, + { + "epoch": 4.552238805970149, + "grad_norm": 0.31374865770339966, + "learning_rate": 9.996692270216947e-05, + "loss": 0.0788, + "step": 1220 + }, + { + "epoch": 4.58955223880597, + "grad_norm": 0.419249951839447, + "learning_rate": 9.996384770422629e-05, + "loss": 0.0873, + "step": 1230 + }, + { + "epoch": 4.6268656716417915, + "grad_norm": 0.26002731919288635, + "learning_rate": 9.996063610703137e-05, + "loss": 0.0845, + "step": 1240 + }, + { + "epoch": 4.664179104477612, + "grad_norm": 0.29573896527290344, + "learning_rate": 9.995728791936504e-05, + "loss": 0.091, + "step": 1250 + }, + { + "epoch": 4.701492537313433, + "grad_norm": 0.33090147376060486, + "learning_rate": 9.995380315038119e-05, + "loss": 0.0827, + "step": 1260 + }, + { + "epoch": 4.7388059701492535, + "grad_norm": 0.24417485296726227, + "learning_rate": 9.9950181809607e-05, + "loss": 0.0859, + "step": 1270 + }, + { + "epoch": 4.776119402985074, + "grad_norm": 0.48290401697158813, + "learning_rate": 9.994642390694308e-05, + "loss": 0.0889, + "step": 1280 + }, + { + "epoch": 4.813432835820896, + "grad_norm": 0.4479697048664093, + "learning_rate": 9.99425294526634e-05, + "loss": 0.097, + "step": 1290 + }, + { + "epoch": 4.850746268656716, + "grad_norm": 0.3560147285461426, + "learning_rate": 9.993849845741524e-05, + "loss": 0.0904, + "step": 1300 + }, + { + "epoch": 4.888059701492537, + "grad_norm": 0.6645416617393494, + "learning_rate": 9.99343309322192e-05, + "loss": 0.0922, + "step": 1310 + }, + { + "epoch": 4.925373134328359, + "grad_norm": 0.29696759581565857, + "learning_rate": 9.993002688846913e-05, + "loss": 0.093, + "step": 1320 + }, + { + "epoch": 4.962686567164179, + "grad_norm": 0.47146692872047424, + "learning_rate": 9.992558633793212e-05, + "loss": 0.085, + "step": 1330 + }, + { + "epoch": 5.0, + "grad_norm": 0.3430916368961334, + "learning_rate": 9.992100929274846e-05, + "loss": 0.0805, + "step": 1340 + }, + { + "epoch": 5.037313432835821, + "grad_norm": 0.3205055892467499, + "learning_rate": 9.991629576543163e-05, + "loss": 0.0766, + "step": 1350 + }, + { + "epoch": 5.074626865671641, + "grad_norm": 0.3664805293083191, + "learning_rate": 9.991144576886823e-05, + "loss": 0.0766, + "step": 1360 + }, + { + "epoch": 5.111940298507463, + "grad_norm": 0.3753412663936615, + "learning_rate": 9.990645931631796e-05, + "loss": 0.0688, + "step": 1370 + }, + { + "epoch": 5.149253731343284, + "grad_norm": 0.31633055210113525, + "learning_rate": 9.990133642141359e-05, + "loss": 0.0796, + "step": 1380 + }, + { + "epoch": 5.186567164179104, + "grad_norm": 0.3355732262134552, + "learning_rate": 9.989607709816091e-05, + "loss": 0.0716, + "step": 1390 + }, + { + "epoch": 5.223880597014926, + "grad_norm": 0.24850831925868988, + "learning_rate": 9.989068136093873e-05, + "loss": 0.0778, + "step": 1400 + }, + { + "epoch": 5.2611940298507465, + "grad_norm": 0.29537102580070496, + "learning_rate": 9.988514922449879e-05, + "loss": 0.0759, + "step": 1410 + }, + { + "epoch": 5.298507462686567, + "grad_norm": 0.3430945873260498, + "learning_rate": 9.987948070396571e-05, + "loss": 0.0774, + "step": 1420 + }, + { + "epoch": 5.335820895522388, + "grad_norm": 0.5220637917518616, + "learning_rate": 9.987367581483705e-05, + "loss": 0.0836, + "step": 1430 + }, + { + "epoch": 5.373134328358209, + "grad_norm": 0.28184008598327637, + "learning_rate": 9.986773457298311e-05, + "loss": 0.0752, + "step": 1440 + }, + { + "epoch": 5.41044776119403, + "grad_norm": 0.36261311173439026, + "learning_rate": 9.986165699464705e-05, + "loss": 0.075, + "step": 1450 + }, + { + "epoch": 5.447761194029851, + "grad_norm": 0.5107380151748657, + "learning_rate": 9.985544309644475e-05, + "loss": 0.0814, + "step": 1460 + }, + { + "epoch": 5.485074626865671, + "grad_norm": 0.2446671426296234, + "learning_rate": 9.984909289536473e-05, + "loss": 0.0704, + "step": 1470 + }, + { + "epoch": 5.522388059701493, + "grad_norm": 0.30449381470680237, + "learning_rate": 9.984260640876821e-05, + "loss": 0.0794, + "step": 1480 + }, + { + "epoch": 5.559701492537314, + "grad_norm": 0.25645050406455994, + "learning_rate": 9.983598365438902e-05, + "loss": 0.0709, + "step": 1490 + }, + { + "epoch": 5.597014925373134, + "grad_norm": 0.23825006186962128, + "learning_rate": 9.98292246503335e-05, + "loss": 0.0828, + "step": 1500 + }, + { + "epoch": 5.634328358208955, + "grad_norm": 0.3259269893169403, + "learning_rate": 9.98223294150805e-05, + "loss": 0.0824, + "step": 1510 + }, + { + "epoch": 5.6716417910447765, + "grad_norm": 0.24058914184570312, + "learning_rate": 9.981529796748134e-05, + "loss": 0.073, + "step": 1520 + }, + { + "epoch": 5.708955223880597, + "grad_norm": 0.34457242488861084, + "learning_rate": 9.980813032675974e-05, + "loss": 0.0845, + "step": 1530 + }, + { + "epoch": 5.746268656716418, + "grad_norm": 0.32940393686294556, + "learning_rate": 9.980082651251175e-05, + "loss": 0.0832, + "step": 1540 + }, + { + "epoch": 5.7835820895522385, + "grad_norm": 0.5683007836341858, + "learning_rate": 9.979338654470569e-05, + "loss": 0.0836, + "step": 1550 + }, + { + "epoch": 5.82089552238806, + "grad_norm": 0.31041061878204346, + "learning_rate": 9.97858104436822e-05, + "loss": 0.07, + "step": 1560 + }, + { + "epoch": 5.858208955223881, + "grad_norm": 0.37858131527900696, + "learning_rate": 9.977809823015401e-05, + "loss": 0.0738, + "step": 1570 + }, + { + "epoch": 5.895522388059701, + "grad_norm": 0.2743091583251953, + "learning_rate": 9.977024992520602e-05, + "loss": 0.0761, + "step": 1580 + }, + { + "epoch": 5.932835820895522, + "grad_norm": 0.29117098450660706, + "learning_rate": 9.976226555029522e-05, + "loss": 0.0777, + "step": 1590 + }, + { + "epoch": 5.970149253731344, + "grad_norm": 0.31398633122444153, + "learning_rate": 9.975414512725057e-05, + "loss": 0.0664, + "step": 1600 + }, + { + "epoch": 6.007462686567164, + "grad_norm": 0.2684272527694702, + "learning_rate": 9.974588867827301e-05, + "loss": 0.0686, + "step": 1610 + }, + { + "epoch": 6.044776119402985, + "grad_norm": 0.3945397436618805, + "learning_rate": 9.973749622593534e-05, + "loss": 0.0614, + "step": 1620 + }, + { + "epoch": 6.082089552238806, + "grad_norm": 0.2747954726219177, + "learning_rate": 9.972896779318219e-05, + "loss": 0.0681, + "step": 1630 + }, + { + "epoch": 6.119402985074627, + "grad_norm": 0.43257200717926025, + "learning_rate": 9.972030340333001e-05, + "loss": 0.0725, + "step": 1640 + }, + { + "epoch": 6.156716417910448, + "grad_norm": 0.3559250831604004, + "learning_rate": 9.97115030800669e-05, + "loss": 0.0804, + "step": 1650 + }, + { + "epoch": 6.1940298507462686, + "grad_norm": 0.3079264760017395, + "learning_rate": 9.970256684745258e-05, + "loss": 0.0649, + "step": 1660 + }, + { + "epoch": 6.231343283582089, + "grad_norm": 0.32298946380615234, + "learning_rate": 9.969349472991838e-05, + "loss": 0.0668, + "step": 1670 + }, + { + "epoch": 6.268656716417911, + "grad_norm": 0.2826225459575653, + "learning_rate": 9.968428675226714e-05, + "loss": 0.0734, + "step": 1680 + }, + { + "epoch": 6.3059701492537314, + "grad_norm": 0.39002349972724915, + "learning_rate": 9.967494293967312e-05, + "loss": 0.0728, + "step": 1690 + }, + { + "epoch": 6.343283582089552, + "grad_norm": 0.403890997171402, + "learning_rate": 9.966546331768191e-05, + "loss": 0.067, + "step": 1700 + }, + { + "epoch": 6.380597014925373, + "grad_norm": 0.3755359351634979, + "learning_rate": 9.965584791221048e-05, + "loss": 0.0755, + "step": 1710 + }, + { + "epoch": 6.417910447761194, + "grad_norm": 0.26346635818481445, + "learning_rate": 9.964609674954696e-05, + "loss": 0.0728, + "step": 1720 + }, + { + "epoch": 6.455223880597015, + "grad_norm": 0.45292145013809204, + "learning_rate": 9.963620985635065e-05, + "loss": 0.0731, + "step": 1730 + }, + { + "epoch": 6.492537313432836, + "grad_norm": 0.3568434715270996, + "learning_rate": 9.962618725965196e-05, + "loss": 0.0761, + "step": 1740 + }, + { + "epoch": 6.529850746268656, + "grad_norm": 0.2551257014274597, + "learning_rate": 9.961602898685226e-05, + "loss": 0.0694, + "step": 1750 + }, + { + "epoch": 6.567164179104478, + "grad_norm": 0.6106354594230652, + "learning_rate": 9.96057350657239e-05, + "loss": 0.0827, + "step": 1760 + }, + { + "epoch": 6.604477611940299, + "grad_norm": 0.3226093053817749, + "learning_rate": 9.959530552441005e-05, + "loss": 0.0716, + "step": 1770 + }, + { + "epoch": 6.641791044776119, + "grad_norm": 0.4297254979610443, + "learning_rate": 9.95847403914247e-05, + "loss": 0.0748, + "step": 1780 + }, + { + "epoch": 6.67910447761194, + "grad_norm": 0.26469680666923523, + "learning_rate": 9.95740396956525e-05, + "loss": 0.074, + "step": 1790 + }, + { + "epoch": 6.7164179104477615, + "grad_norm": 0.22717897593975067, + "learning_rate": 9.956320346634876e-05, + "loss": 0.0739, + "step": 1800 + }, + { + "epoch": 6.753731343283582, + "grad_norm": 0.4513498544692993, + "learning_rate": 9.955223173313931e-05, + "loss": 0.0664, + "step": 1810 + }, + { + "epoch": 6.791044776119403, + "grad_norm": 0.31683439016342163, + "learning_rate": 9.954112452602045e-05, + "loss": 0.069, + "step": 1820 + }, + { + "epoch": 6.8283582089552235, + "grad_norm": 0.3350532650947571, + "learning_rate": 9.952988187535886e-05, + "loss": 0.0699, + "step": 1830 + }, + { + "epoch": 6.865671641791045, + "grad_norm": 0.29829463362693787, + "learning_rate": 9.95185038118915e-05, + "loss": 0.0663, + "step": 1840 + }, + { + "epoch": 6.902985074626866, + "grad_norm": 0.31650781631469727, + "learning_rate": 9.950699036672559e-05, + "loss": 0.0668, + "step": 1850 + }, + { + "epoch": 6.940298507462686, + "grad_norm": 0.360944926738739, + "learning_rate": 9.949534157133844e-05, + "loss": 0.0696, + "step": 1860 + }, + { + "epoch": 6.977611940298507, + "grad_norm": 0.31337013840675354, + "learning_rate": 9.948355745757741e-05, + "loss": 0.073, + "step": 1870 + }, + { + "epoch": 7.014925373134329, + "grad_norm": 0.4675919711589813, + "learning_rate": 9.94716380576598e-05, + "loss": 0.0688, + "step": 1880 + }, + { + "epoch": 7.052238805970149, + "grad_norm": 0.3031919002532959, + "learning_rate": 9.945958340417283e-05, + "loss": 0.0596, + "step": 1890 + }, + { + "epoch": 7.08955223880597, + "grad_norm": 0.24858474731445312, + "learning_rate": 9.944739353007344e-05, + "loss": 0.0717, + "step": 1900 + }, + { + "epoch": 7.126865671641791, + "grad_norm": 0.20959483087062836, + "learning_rate": 9.943506846868826e-05, + "loss": 0.0694, + "step": 1910 + }, + { + "epoch": 7.164179104477612, + "grad_norm": 0.35621434450149536, + "learning_rate": 9.942260825371358e-05, + "loss": 0.063, + "step": 1920 + }, + { + "epoch": 7.201492537313433, + "grad_norm": 0.3462587594985962, + "learning_rate": 9.941001291921512e-05, + "loss": 0.068, + "step": 1930 + }, + { + "epoch": 7.2388059701492535, + "grad_norm": 0.38649681210517883, + "learning_rate": 9.939728249962807e-05, + "loss": 0.0638, + "step": 1940 + }, + { + "epoch": 7.276119402985074, + "grad_norm": 0.29564595222473145, + "learning_rate": 9.938441702975689e-05, + "loss": 0.0626, + "step": 1950 + }, + { + "epoch": 7.313432835820896, + "grad_norm": 0.339857816696167, + "learning_rate": 9.937141654477528e-05, + "loss": 0.0535, + "step": 1960 + }, + { + "epoch": 7.350746268656716, + "grad_norm": 0.2591215670108795, + "learning_rate": 9.93582810802261e-05, + "loss": 0.0645, + "step": 1970 + }, + { + "epoch": 7.388059701492537, + "grad_norm": 0.30237796902656555, + "learning_rate": 9.934501067202117e-05, + "loss": 0.0675, + "step": 1980 + }, + { + "epoch": 7.425373134328359, + "grad_norm": 0.28394174575805664, + "learning_rate": 9.93316053564413e-05, + "loss": 0.0643, + "step": 1990 + }, + { + "epoch": 7.462686567164179, + "grad_norm": 0.3124663233757019, + "learning_rate": 9.931806517013612e-05, + "loss": 0.059, + "step": 2000 + }, + { + "epoch": 7.5, + "grad_norm": 0.36073037981987, + "learning_rate": 9.930439015012396e-05, + "loss": 0.0606, + "step": 2010 + }, + { + "epoch": 7.537313432835821, + "grad_norm": 0.4091481864452362, + "learning_rate": 9.929058033379181e-05, + "loss": 0.0603, + "step": 2020 + }, + { + "epoch": 7.574626865671641, + "grad_norm": 0.44718074798583984, + "learning_rate": 9.927663575889521e-05, + "loss": 0.0741, + "step": 2030 + }, + { + "epoch": 7.611940298507463, + "grad_norm": 0.3819601833820343, + "learning_rate": 9.926255646355804e-05, + "loss": 0.0707, + "step": 2040 + }, + { + "epoch": 7.649253731343284, + "grad_norm": 0.23336420953273773, + "learning_rate": 9.92483424862726e-05, + "loss": 0.0676, + "step": 2050 + }, + { + "epoch": 7.686567164179104, + "grad_norm": 0.24415315687656403, + "learning_rate": 9.923399386589933e-05, + "loss": 0.0594, + "step": 2060 + }, + { + "epoch": 7.723880597014926, + "grad_norm": 0.3735473155975342, + "learning_rate": 9.921951064166684e-05, + "loss": 0.062, + "step": 2070 + }, + { + "epoch": 7.7611940298507465, + "grad_norm": 0.31629472970962524, + "learning_rate": 9.92048928531717e-05, + "loss": 0.0606, + "step": 2080 + }, + { + "epoch": 7.798507462686567, + "grad_norm": 0.37902557849884033, + "learning_rate": 9.919014054037836e-05, + "loss": 0.0584, + "step": 2090 + }, + { + "epoch": 7.835820895522388, + "grad_norm": 0.3486720323562622, + "learning_rate": 9.917525374361912e-05, + "loss": 0.056, + "step": 2100 + }, + { + "epoch": 7.8731343283582085, + "grad_norm": 0.3731362521648407, + "learning_rate": 9.91602325035939e-05, + "loss": 0.0601, + "step": 2110 + }, + { + "epoch": 7.91044776119403, + "grad_norm": 0.3560399115085602, + "learning_rate": 9.914507686137019e-05, + "loss": 0.06, + "step": 2120 + }, + { + "epoch": 7.947761194029851, + "grad_norm": 0.30075564980506897, + "learning_rate": 9.912978685838294e-05, + "loss": 0.0657, + "step": 2130 + }, + { + "epoch": 7.985074626865671, + "grad_norm": 0.2984028458595276, + "learning_rate": 9.911436253643445e-05, + "loss": 0.0587, + "step": 2140 + }, + { + "epoch": 8.022388059701493, + "grad_norm": 0.1980169117450714, + "learning_rate": 9.90988039376942e-05, + "loss": 0.0718, + "step": 2150 + }, + { + "epoch": 8.059701492537313, + "grad_norm": 0.31339579820632935, + "learning_rate": 9.90831111046988e-05, + "loss": 0.0557, + "step": 2160 + }, + { + "epoch": 8.097014925373134, + "grad_norm": 0.1968696266412735, + "learning_rate": 9.90672840803519e-05, + "loss": 0.0571, + "step": 2170 + }, + { + "epoch": 8.134328358208956, + "grad_norm": 0.23931682109832764, + "learning_rate": 9.905132290792394e-05, + "loss": 0.0566, + "step": 2180 + }, + { + "epoch": 8.171641791044776, + "grad_norm": 0.21741189062595367, + "learning_rate": 9.903522763105218e-05, + "loss": 0.0575, + "step": 2190 + }, + { + "epoch": 8.208955223880597, + "grad_norm": 0.22874368727207184, + "learning_rate": 9.901899829374047e-05, + "loss": 0.0565, + "step": 2200 + }, + { + "epoch": 8.246268656716419, + "grad_norm": 0.3441888093948364, + "learning_rate": 9.900263494035921e-05, + "loss": 0.0565, + "step": 2210 + }, + { + "epoch": 8.283582089552239, + "grad_norm": 0.2539830803871155, + "learning_rate": 9.89861376156452e-05, + "loss": 0.0538, + "step": 2220 + }, + { + "epoch": 8.32089552238806, + "grad_norm": 0.2235102653503418, + "learning_rate": 9.896950636470147e-05, + "loss": 0.0609, + "step": 2230 + }, + { + "epoch": 8.35820895522388, + "grad_norm": 0.1941322684288025, + "learning_rate": 9.895274123299723e-05, + "loss": 0.0562, + "step": 2240 + }, + { + "epoch": 8.395522388059701, + "grad_norm": 0.2691369950771332, + "learning_rate": 9.893584226636772e-05, + "loss": 0.0608, + "step": 2250 + }, + { + "epoch": 8.432835820895523, + "grad_norm": 0.24730461835861206, + "learning_rate": 9.891880951101407e-05, + "loss": 0.0582, + "step": 2260 + }, + { + "epoch": 8.470149253731343, + "grad_norm": 0.34785839915275574, + "learning_rate": 9.890164301350318e-05, + "loss": 0.0506, + "step": 2270 + }, + { + "epoch": 8.507462686567164, + "grad_norm": 0.3625825345516205, + "learning_rate": 9.888434282076758e-05, + "loss": 0.0614, + "step": 2280 + }, + { + "epoch": 8.544776119402986, + "grad_norm": 0.25210148096084595, + "learning_rate": 9.886690898010535e-05, + "loss": 0.0611, + "step": 2290 + }, + { + "epoch": 8.582089552238806, + "grad_norm": 0.27312466502189636, + "learning_rate": 9.884934153917997e-05, + "loss": 0.0537, + "step": 2300 + }, + { + "epoch": 8.619402985074627, + "grad_norm": 0.314647912979126, + "learning_rate": 9.883164054602012e-05, + "loss": 0.0602, + "step": 2310 + }, + { + "epoch": 8.656716417910447, + "grad_norm": 0.21531912684440613, + "learning_rate": 9.881380604901964e-05, + "loss": 0.0552, + "step": 2320 + }, + { + "epoch": 8.694029850746269, + "grad_norm": 0.23920664191246033, + "learning_rate": 9.879583809693738e-05, + "loss": 0.0613, + "step": 2330 + }, + { + "epoch": 8.73134328358209, + "grad_norm": 0.21864956617355347, + "learning_rate": 9.877773673889701e-05, + "loss": 0.0649, + "step": 2340 + }, + { + "epoch": 8.76865671641791, + "grad_norm": 0.27523377537727356, + "learning_rate": 9.8759502024387e-05, + "loss": 0.0606, + "step": 2350 + }, + { + "epoch": 8.805970149253731, + "grad_norm": 0.24805469810962677, + "learning_rate": 9.87411340032603e-05, + "loss": 0.0549, + "step": 2360 + }, + { + "epoch": 8.843283582089553, + "grad_norm": 0.23070092499256134, + "learning_rate": 9.872263272573443e-05, + "loss": 0.0562, + "step": 2370 + }, + { + "epoch": 8.880597014925373, + "grad_norm": 0.20833946764469147, + "learning_rate": 9.870399824239117e-05, + "loss": 0.05, + "step": 2380 + }, + { + "epoch": 8.917910447761194, + "grad_norm": 0.34507372975349426, + "learning_rate": 9.868523060417646e-05, + "loss": 0.0613, + "step": 2390 + }, + { + "epoch": 8.955223880597014, + "grad_norm": 0.32865110039711, + "learning_rate": 9.86663298624003e-05, + "loss": 0.0621, + "step": 2400 + }, + { + "epoch": 8.992537313432836, + "grad_norm": 0.21305270493030548, + "learning_rate": 9.864729606873663e-05, + "loss": 0.0572, + "step": 2410 + }, + { + "epoch": 9.029850746268657, + "grad_norm": 0.28193730115890503, + "learning_rate": 9.862812927522309e-05, + "loss": 0.0555, + "step": 2420 + }, + { + "epoch": 9.067164179104477, + "grad_norm": 0.3953789472579956, + "learning_rate": 9.860882953426099e-05, + "loss": 0.0536, + "step": 2430 + }, + { + "epoch": 9.104477611940299, + "grad_norm": 0.23013322055339813, + "learning_rate": 9.858939689861506e-05, + "loss": 0.0572, + "step": 2440 + }, + { + "epoch": 9.14179104477612, + "grad_norm": 0.2906680107116699, + "learning_rate": 9.856983142141339e-05, + "loss": 0.0592, + "step": 2450 + }, + { + "epoch": 9.17910447761194, + "grad_norm": 0.23490828275680542, + "learning_rate": 9.855013315614725e-05, + "loss": 0.0583, + "step": 2460 + }, + { + "epoch": 9.216417910447761, + "grad_norm": 0.22825880348682404, + "learning_rate": 9.853030215667093e-05, + "loss": 0.059, + "step": 2470 + }, + { + "epoch": 9.253731343283581, + "grad_norm": 0.25871285796165466, + "learning_rate": 9.851033847720166e-05, + "loss": 0.0555, + "step": 2480 + }, + { + "epoch": 9.291044776119403, + "grad_norm": 0.27220776677131653, + "learning_rate": 9.849024217231935e-05, + "loss": 0.0542, + "step": 2490 + }, + { + "epoch": 9.328358208955224, + "grad_norm": 0.26534005999565125, + "learning_rate": 9.847001329696653e-05, + "loss": 0.0526, + "step": 2500 + }, + { + "epoch": 9.365671641791044, + "grad_norm": 0.33486032485961914, + "learning_rate": 9.844965190644817e-05, + "loss": 0.0563, + "step": 2510 + }, + { + "epoch": 9.402985074626866, + "grad_norm": 0.2949483394622803, + "learning_rate": 9.842915805643155e-05, + "loss": 0.0556, + "step": 2520 + }, + { + "epoch": 9.440298507462687, + "grad_norm": 0.24123981595039368, + "learning_rate": 9.840853180294608e-05, + "loss": 0.05, + "step": 2530 + }, + { + "epoch": 9.477611940298507, + "grad_norm": 0.22536049783229828, + "learning_rate": 9.838777320238312e-05, + "loss": 0.0522, + "step": 2540 + }, + { + "epoch": 9.514925373134329, + "grad_norm": 0.23206663131713867, + "learning_rate": 9.836688231149592e-05, + "loss": 0.0591, + "step": 2550 + }, + { + "epoch": 9.552238805970148, + "grad_norm": 0.28573134541511536, + "learning_rate": 9.834585918739936e-05, + "loss": 0.0568, + "step": 2560 + }, + { + "epoch": 9.58955223880597, + "grad_norm": 0.2628820538520813, + "learning_rate": 9.832470388756987e-05, + "loss": 0.0571, + "step": 2570 + }, + { + "epoch": 9.626865671641792, + "grad_norm": 0.2880440652370453, + "learning_rate": 9.830341646984521e-05, + "loss": 0.0559, + "step": 2580 + }, + { + "epoch": 9.664179104477611, + "grad_norm": 0.1786259263753891, + "learning_rate": 9.82819969924244e-05, + "loss": 0.058, + "step": 2590 + }, + { + "epoch": 9.701492537313433, + "grad_norm": 0.3501608073711395, + "learning_rate": 9.826044551386744e-05, + "loss": 0.0523, + "step": 2600 + }, + { + "epoch": 9.738805970149254, + "grad_norm": 0.24757252633571625, + "learning_rate": 9.823876209309527e-05, + "loss": 0.0587, + "step": 2610 + }, + { + "epoch": 9.776119402985074, + "grad_norm": 0.2556290626525879, + "learning_rate": 9.821694678938953e-05, + "loss": 0.0555, + "step": 2620 + }, + { + "epoch": 9.813432835820896, + "grad_norm": 0.2561217248439789, + "learning_rate": 9.819499966239243e-05, + "loss": 0.052, + "step": 2630 + }, + { + "epoch": 9.850746268656717, + "grad_norm": 0.2776634097099304, + "learning_rate": 9.817292077210659e-05, + "loss": 0.0498, + "step": 2640 + }, + { + "epoch": 9.888059701492537, + "grad_norm": 0.20668549835681915, + "learning_rate": 9.815071017889482e-05, + "loss": 0.0517, + "step": 2650 + }, + { + "epoch": 9.925373134328359, + "grad_norm": 0.3100263178348541, + "learning_rate": 9.812836794348004e-05, + "loss": 0.0633, + "step": 2660 + }, + { + "epoch": 9.962686567164178, + "grad_norm": 0.2780782878398895, + "learning_rate": 9.81058941269451e-05, + "loss": 0.0581, + "step": 2670 + }, + { + "epoch": 10.0, + "grad_norm": 0.28903728723526, + "learning_rate": 9.808328879073251e-05, + "loss": 0.0538, + "step": 2680 + }, + { + "epoch": 10.037313432835822, + "grad_norm": 0.22727562487125397, + "learning_rate": 9.806055199664446e-05, + "loss": 0.0491, + "step": 2690 + }, + { + "epoch": 10.074626865671641, + "grad_norm": 0.267918199300766, + "learning_rate": 9.803768380684242e-05, + "loss": 0.0562, + "step": 2700 + }, + { + "epoch": 10.111940298507463, + "grad_norm": 0.2988606095314026, + "learning_rate": 9.801468428384716e-05, + "loss": 0.0566, + "step": 2710 + }, + { + "epoch": 10.149253731343283, + "grad_norm": 0.2710281312465668, + "learning_rate": 9.799155349053851e-05, + "loss": 0.0541, + "step": 2720 + }, + { + "epoch": 10.186567164179104, + "grad_norm": 0.15320520102977753, + "learning_rate": 9.796829149015517e-05, + "loss": 0.0548, + "step": 2730 + }, + { + "epoch": 10.223880597014926, + "grad_norm": 0.2653089463710785, + "learning_rate": 9.794489834629455e-05, + "loss": 0.0599, + "step": 2740 + }, + { + "epoch": 10.261194029850746, + "grad_norm": 0.19223959743976593, + "learning_rate": 9.792137412291265e-05, + "loss": 0.0494, + "step": 2750 + }, + { + "epoch": 10.298507462686567, + "grad_norm": 0.20455987751483917, + "learning_rate": 9.789771888432375e-05, + "loss": 0.0538, + "step": 2760 + }, + { + "epoch": 10.335820895522389, + "grad_norm": 0.24908749759197235, + "learning_rate": 9.787393269520039e-05, + "loss": 0.0481, + "step": 2770 + }, + { + "epoch": 10.373134328358208, + "grad_norm": 0.3131813406944275, + "learning_rate": 9.785001562057309e-05, + "loss": 0.0526, + "step": 2780 + }, + { + "epoch": 10.41044776119403, + "grad_norm": 0.24828971922397614, + "learning_rate": 9.782596772583026e-05, + "loss": 0.0489, + "step": 2790 + }, + { + "epoch": 10.447761194029852, + "grad_norm": 0.21727119386196136, + "learning_rate": 9.780178907671789e-05, + "loss": 0.0532, + "step": 2800 + }, + { + "epoch": 10.485074626865671, + "grad_norm": 0.20279547572135925, + "learning_rate": 9.777747973933948e-05, + "loss": 0.0565, + "step": 2810 + }, + { + "epoch": 10.522388059701493, + "grad_norm": 0.17726702988147736, + "learning_rate": 9.775303978015585e-05, + "loss": 0.0437, + "step": 2820 + }, + { + "epoch": 10.559701492537313, + "grad_norm": 0.18961119651794434, + "learning_rate": 9.772846926598491e-05, + "loss": 0.0584, + "step": 2830 + }, + { + "epoch": 10.597014925373134, + "grad_norm": 0.2498980015516281, + "learning_rate": 9.77037682640015e-05, + "loss": 0.0496, + "step": 2840 + }, + { + "epoch": 10.634328358208956, + "grad_norm": 0.16978798806667328, + "learning_rate": 9.767893684173721e-05, + "loss": 0.0469, + "step": 2850 + }, + { + "epoch": 10.671641791044776, + "grad_norm": 0.16128584742546082, + "learning_rate": 9.765397506708023e-05, + "loss": 0.0533, + "step": 2860 + }, + { + "epoch": 10.708955223880597, + "grad_norm": 0.20463155210018158, + "learning_rate": 9.762888300827507e-05, + "loss": 0.0464, + "step": 2870 + }, + { + "epoch": 10.746268656716419, + "grad_norm": 0.30601629614830017, + "learning_rate": 9.760366073392246e-05, + "loss": 0.0489, + "step": 2880 + }, + { + "epoch": 10.783582089552239, + "grad_norm": 0.2730671763420105, + "learning_rate": 9.757830831297914e-05, + "loss": 0.0495, + "step": 2890 + }, + { + "epoch": 10.82089552238806, + "grad_norm": 0.251432865858078, + "learning_rate": 9.755282581475769e-05, + "loss": 0.0549, + "step": 2900 + }, + { + "epoch": 10.85820895522388, + "grad_norm": 0.26670166850090027, + "learning_rate": 9.752721330892624e-05, + "loss": 0.061, + "step": 2910 + }, + { + "epoch": 10.895522388059701, + "grad_norm": 0.2965967655181885, + "learning_rate": 9.750147086550844e-05, + "loss": 0.0473, + "step": 2920 + }, + { + "epoch": 10.932835820895523, + "grad_norm": 0.683840274810791, + "learning_rate": 9.747559855488313e-05, + "loss": 0.0509, + "step": 2930 + }, + { + "epoch": 10.970149253731343, + "grad_norm": 0.25740495324134827, + "learning_rate": 9.744959644778422e-05, + "loss": 0.0515, + "step": 2940 + }, + { + "epoch": 11.007462686567164, + "grad_norm": 0.2880542278289795, + "learning_rate": 9.742346461530048e-05, + "loss": 0.0482, + "step": 2950 + }, + { + "epoch": 11.044776119402986, + "grad_norm": 0.45032551884651184, + "learning_rate": 9.739720312887535e-05, + "loss": 0.0557, + "step": 2960 + }, + { + "epoch": 11.082089552238806, + "grad_norm": 0.2829900085926056, + "learning_rate": 9.73708120603067e-05, + "loss": 0.052, + "step": 2970 + }, + { + "epoch": 11.119402985074627, + "grad_norm": 0.309597373008728, + "learning_rate": 9.734429148174675e-05, + "loss": 0.0541, + "step": 2980 + }, + { + "epoch": 11.156716417910447, + "grad_norm": 0.2433389127254486, + "learning_rate": 9.731764146570173e-05, + "loss": 0.0482, + "step": 2990 + }, + { + "epoch": 11.194029850746269, + "grad_norm": 0.24458132684230804, + "learning_rate": 9.729086208503174e-05, + "loss": 0.0505, + "step": 3000 + }, + { + "epoch": 11.23134328358209, + "grad_norm": 0.2305087298154831, + "learning_rate": 9.726395341295062e-05, + "loss": 0.0504, + "step": 3010 + }, + { + "epoch": 11.26865671641791, + "grad_norm": 0.18110457062721252, + "learning_rate": 9.723691552302562e-05, + "loss": 0.0575, + "step": 3020 + }, + { + "epoch": 11.305970149253731, + "grad_norm": 0.20407621562480927, + "learning_rate": 9.720974848917735e-05, + "loss": 0.0494, + "step": 3030 + }, + { + "epoch": 11.343283582089553, + "grad_norm": 0.25924697518348694, + "learning_rate": 9.718245238567939e-05, + "loss": 0.0472, + "step": 3040 + }, + { + "epoch": 11.380597014925373, + "grad_norm": 0.23041822016239166, + "learning_rate": 9.715502728715826e-05, + "loss": 0.0481, + "step": 3050 + }, + { + "epoch": 11.417910447761194, + "grad_norm": 0.25381171703338623, + "learning_rate": 9.712747326859315e-05, + "loss": 0.0543, + "step": 3060 + }, + { + "epoch": 11.455223880597014, + "grad_norm": 0.18027640879154205, + "learning_rate": 9.709979040531569e-05, + "loss": 0.055, + "step": 3070 + }, + { + "epoch": 11.492537313432836, + "grad_norm": 0.2954868674278259, + "learning_rate": 9.707197877300974e-05, + "loss": 0.0473, + "step": 3080 + }, + { + "epoch": 11.529850746268657, + "grad_norm": 0.25323861837387085, + "learning_rate": 9.704403844771128e-05, + "loss": 0.0509, + "step": 3090 + }, + { + "epoch": 11.567164179104477, + "grad_norm": 0.36910176277160645, + "learning_rate": 9.701596950580806e-05, + "loss": 0.0504, + "step": 3100 + }, + { + "epoch": 11.604477611940299, + "grad_norm": 0.34199246764183044, + "learning_rate": 9.698777202403953e-05, + "loss": 0.0526, + "step": 3110 + }, + { + "epoch": 11.64179104477612, + "grad_norm": 0.2146557718515396, + "learning_rate": 9.695944607949649e-05, + "loss": 0.0579, + "step": 3120 + }, + { + "epoch": 11.67910447761194, + "grad_norm": 0.20559175312519073, + "learning_rate": 9.693099174962103e-05, + "loss": 0.0514, + "step": 3130 + }, + { + "epoch": 11.716417910447761, + "grad_norm": 0.2689419090747833, + "learning_rate": 9.690240911220618e-05, + "loss": 0.0534, + "step": 3140 + }, + { + "epoch": 11.753731343283581, + "grad_norm": 0.34870603680610657, + "learning_rate": 9.687369824539577e-05, + "loss": 0.0485, + "step": 3150 + }, + { + "epoch": 11.791044776119403, + "grad_norm": 0.15433363616466522, + "learning_rate": 9.684485922768422e-05, + "loss": 0.0418, + "step": 3160 + }, + { + "epoch": 11.828358208955224, + "grad_norm": 0.26874423027038574, + "learning_rate": 9.681589213791633e-05, + "loss": 0.0537, + "step": 3170 + }, + { + "epoch": 11.865671641791044, + "grad_norm": 0.3361654281616211, + "learning_rate": 9.6786797055287e-05, + "loss": 0.0474, + "step": 3180 + }, + { + "epoch": 11.902985074626866, + "grad_norm": 0.17938771843910217, + "learning_rate": 9.675757405934103e-05, + "loss": 0.0443, + "step": 3190 + }, + { + "epoch": 11.940298507462687, + "grad_norm": 0.31368622183799744, + "learning_rate": 9.672822322997305e-05, + "loss": 0.0594, + "step": 3200 + }, + { + "epoch": 11.977611940298507, + "grad_norm": 0.16268151998519897, + "learning_rate": 9.669874464742705e-05, + "loss": 0.0487, + "step": 3210 + }, + { + "epoch": 12.014925373134329, + "grad_norm": 0.23879969120025635, + "learning_rate": 9.66691383922964e-05, + "loss": 0.0484, + "step": 3220 + }, + { + "epoch": 12.052238805970148, + "grad_norm": 0.2321789413690567, + "learning_rate": 9.663940454552342e-05, + "loss": 0.051, + "step": 3230 + }, + { + "epoch": 12.08955223880597, + "grad_norm": 0.22873088717460632, + "learning_rate": 9.660954318839933e-05, + "loss": 0.0406, + "step": 3240 + }, + { + "epoch": 12.126865671641792, + "grad_norm": 0.3767557740211487, + "learning_rate": 9.657955440256395e-05, + "loss": 0.0432, + "step": 3250 + }, + { + "epoch": 12.164179104477611, + "grad_norm": 0.21569453179836273, + "learning_rate": 9.654943827000548e-05, + "loss": 0.0528, + "step": 3260 + }, + { + "epoch": 12.201492537313433, + "grad_norm": 0.23698291182518005, + "learning_rate": 9.651919487306025e-05, + "loss": 0.0457, + "step": 3270 + }, + { + "epoch": 12.238805970149254, + "grad_norm": 0.21086478233337402, + "learning_rate": 9.648882429441257e-05, + "loss": 0.0508, + "step": 3280 + }, + { + "epoch": 12.276119402985074, + "grad_norm": 0.19763463735580444, + "learning_rate": 9.645832661709444e-05, + "loss": 0.0497, + "step": 3290 + }, + { + "epoch": 12.313432835820896, + "grad_norm": 0.18413852155208588, + "learning_rate": 9.642770192448536e-05, + "loss": 0.0441, + "step": 3300 + }, + { + "epoch": 12.350746268656717, + "grad_norm": 0.13946911692619324, + "learning_rate": 9.639695030031204e-05, + "loss": 0.0453, + "step": 3310 + }, + { + "epoch": 12.388059701492537, + "grad_norm": 0.21613670885562897, + "learning_rate": 9.636607182864827e-05, + "loss": 0.0511, + "step": 3320 + }, + { + "epoch": 12.425373134328359, + "grad_norm": 0.24953646957874298, + "learning_rate": 9.63350665939146e-05, + "loss": 0.0451, + "step": 3330 + }, + { + "epoch": 12.462686567164178, + "grad_norm": 0.2993795871734619, + "learning_rate": 9.630393468087818e-05, + "loss": 0.0469, + "step": 3340 + }, + { + "epoch": 12.5, + "grad_norm": 0.2261819839477539, + "learning_rate": 9.627267617465243e-05, + "loss": 0.0484, + "step": 3350 + }, + { + "epoch": 12.537313432835822, + "grad_norm": 0.23026186227798462, + "learning_rate": 9.624129116069694e-05, + "loss": 0.0452, + "step": 3360 + }, + { + "epoch": 12.574626865671641, + "grad_norm": 0.27859947085380554, + "learning_rate": 9.620977972481716e-05, + "loss": 0.0593, + "step": 3370 + }, + { + "epoch": 12.611940298507463, + "grad_norm": 0.23060785233974457, + "learning_rate": 9.617814195316411e-05, + "loss": 0.05, + "step": 3380 + }, + { + "epoch": 12.649253731343283, + "grad_norm": 0.20185025036334991, + "learning_rate": 9.614637793223425e-05, + "loss": 0.0573, + "step": 3390 + }, + { + "epoch": 12.686567164179104, + "grad_norm": 0.3584498167037964, + "learning_rate": 9.611448774886924e-05, + "loss": 0.052, + "step": 3400 + }, + { + "epoch": 12.723880597014926, + "grad_norm": 0.19336827099323273, + "learning_rate": 9.60824714902556e-05, + "loss": 0.0535, + "step": 3410 + }, + { + "epoch": 12.761194029850746, + "grad_norm": 0.22223635017871857, + "learning_rate": 9.605032924392457e-05, + "loss": 0.05, + "step": 3420 + }, + { + "epoch": 12.798507462686567, + "grad_norm": 0.17108851671218872, + "learning_rate": 9.601806109775179e-05, + "loss": 0.0475, + "step": 3430 + }, + { + "epoch": 12.835820895522389, + "grad_norm": 0.3861902952194214, + "learning_rate": 9.598566713995718e-05, + "loss": 0.0439, + "step": 3440 + }, + { + "epoch": 12.873134328358208, + "grad_norm": 0.18927253782749176, + "learning_rate": 9.595314745910456e-05, + "loss": 0.052, + "step": 3450 + }, + { + "epoch": 12.91044776119403, + "grad_norm": 0.21963383257389069, + "learning_rate": 9.59205021441015e-05, + "loss": 0.0504, + "step": 3460 + }, + { + "epoch": 12.947761194029852, + "grad_norm": 0.18016670644283295, + "learning_rate": 9.588773128419906e-05, + "loss": 0.0467, + "step": 3470 + }, + { + "epoch": 12.985074626865671, + "grad_norm": 0.1776365041732788, + "learning_rate": 9.58548349689915e-05, + "loss": 0.0414, + "step": 3480 + }, + { + "epoch": 13.022388059701493, + "grad_norm": 0.2616482973098755, + "learning_rate": 9.582181328841611e-05, + "loss": 0.0442, + "step": 3490 + }, + { + "epoch": 13.059701492537313, + "grad_norm": 0.20341171324253082, + "learning_rate": 9.578866633275288e-05, + "loss": 0.0533, + "step": 3500 + }, + { + "epoch": 13.097014925373134, + "grad_norm": 0.2223699688911438, + "learning_rate": 9.575539419262434e-05, + "loss": 0.0458, + "step": 3510 + }, + { + "epoch": 13.134328358208956, + "grad_norm": 0.22557464241981506, + "learning_rate": 9.572199695899522e-05, + "loss": 0.0445, + "step": 3520 + }, + { + "epoch": 13.171641791044776, + "grad_norm": 0.25104308128356934, + "learning_rate": 9.568847472317232e-05, + "loss": 0.0435, + "step": 3530 + }, + { + "epoch": 13.208955223880597, + "grad_norm": 0.18720711767673492, + "learning_rate": 9.565482757680415e-05, + "loss": 0.0453, + "step": 3540 + }, + { + "epoch": 13.246268656716419, + "grad_norm": 0.16838951408863068, + "learning_rate": 9.562105561188069e-05, + "loss": 0.0505, + "step": 3550 + }, + { + "epoch": 13.283582089552239, + "grad_norm": 0.31681734323501587, + "learning_rate": 9.558715892073323e-05, + "loss": 0.0494, + "step": 3560 + }, + { + "epoch": 13.32089552238806, + "grad_norm": 0.2390700727701187, + "learning_rate": 9.555313759603402e-05, + "loss": 0.0538, + "step": 3570 + }, + { + "epoch": 13.35820895522388, + "grad_norm": 0.20680709183216095, + "learning_rate": 9.551899173079607e-05, + "loss": 0.0519, + "step": 3580 + }, + { + "epoch": 13.395522388059701, + "grad_norm": 0.2758580148220062, + "learning_rate": 9.548472141837286e-05, + "loss": 0.0512, + "step": 3590 + }, + { + "epoch": 13.432835820895523, + "grad_norm": 0.3653097450733185, + "learning_rate": 9.545032675245813e-05, + "loss": 0.0496, + "step": 3600 + }, + { + "epoch": 13.470149253731343, + "grad_norm": 0.23886866867542267, + "learning_rate": 9.541580782708557e-05, + "loss": 0.0455, + "step": 3610 + }, + { + "epoch": 13.507462686567164, + "grad_norm": 0.3280908465385437, + "learning_rate": 9.538116473662861e-05, + "loss": 0.0489, + "step": 3620 + }, + { + "epoch": 13.544776119402986, + "grad_norm": 0.20268180966377258, + "learning_rate": 9.534639757580013e-05, + "loss": 0.0484, + "step": 3630 + }, + { + "epoch": 13.582089552238806, + "grad_norm": 0.2582015097141266, + "learning_rate": 9.531150643965223e-05, + "loss": 0.0487, + "step": 3640 + }, + { + "epoch": 13.619402985074627, + "grad_norm": 0.18157973885536194, + "learning_rate": 9.527649142357596e-05, + "loss": 0.0496, + "step": 3650 + }, + { + "epoch": 13.656716417910447, + "grad_norm": 0.22841542959213257, + "learning_rate": 9.524135262330098e-05, + "loss": 0.0467, + "step": 3660 + }, + { + "epoch": 13.694029850746269, + "grad_norm": 0.2519935369491577, + "learning_rate": 9.520609013489547e-05, + "loss": 0.0487, + "step": 3670 + }, + { + "epoch": 13.73134328358209, + "grad_norm": 0.24680495262145996, + "learning_rate": 9.517070405476575e-05, + "loss": 0.0457, + "step": 3680 + }, + { + "epoch": 13.76865671641791, + "grad_norm": 0.26362067461013794, + "learning_rate": 9.513519447965595e-05, + "loss": 0.0495, + "step": 3690 + }, + { + "epoch": 13.805970149253731, + "grad_norm": 0.3240712583065033, + "learning_rate": 9.509956150664796e-05, + "loss": 0.0496, + "step": 3700 + }, + { + "epoch": 13.843283582089553, + "grad_norm": 0.21009013056755066, + "learning_rate": 9.50638052331609e-05, + "loss": 0.0457, + "step": 3710 + }, + { + "epoch": 13.880597014925373, + "grad_norm": 0.1669154316186905, + "learning_rate": 9.502792575695112e-05, + "loss": 0.0496, + "step": 3720 + }, + { + "epoch": 13.917910447761194, + "grad_norm": 0.22347605228424072, + "learning_rate": 9.499192317611167e-05, + "loss": 0.0426, + "step": 3730 + }, + { + "epoch": 13.955223880597014, + "grad_norm": 0.15208907425403595, + "learning_rate": 9.49557975890723e-05, + "loss": 0.0447, + "step": 3740 + }, + { + "epoch": 13.992537313432836, + "grad_norm": 0.3206101059913635, + "learning_rate": 9.491954909459895e-05, + "loss": 0.0471, + "step": 3750 + }, + { + "epoch": 14.029850746268657, + "grad_norm": 0.15873713791370392, + "learning_rate": 9.488317779179361e-05, + "loss": 0.0401, + "step": 3760 + }, + { + "epoch": 14.067164179104477, + "grad_norm": 0.19690357148647308, + "learning_rate": 9.484668378009408e-05, + "loss": 0.0491, + "step": 3770 + }, + { + "epoch": 14.104477611940299, + "grad_norm": 0.3211113214492798, + "learning_rate": 9.481006715927351e-05, + "loss": 0.049, + "step": 3780 + }, + { + "epoch": 14.14179104477612, + "grad_norm": 0.27657604217529297, + "learning_rate": 9.477332802944044e-05, + "loss": 0.0396, + "step": 3790 + }, + { + "epoch": 14.17910447761194, + "grad_norm": 0.20194031298160553, + "learning_rate": 9.473646649103818e-05, + "loss": 0.0442, + "step": 3800 + }, + { + "epoch": 14.216417910447761, + "grad_norm": 0.20344595611095428, + "learning_rate": 9.46994826448448e-05, + "loss": 0.0427, + "step": 3810 + }, + { + "epoch": 14.253731343283581, + "grad_norm": 0.2067718505859375, + "learning_rate": 9.46623765919727e-05, + "loss": 0.0501, + "step": 3820 + }, + { + "epoch": 14.291044776119403, + "grad_norm": 0.29719170928001404, + "learning_rate": 9.462514843386845e-05, + "loss": 0.0519, + "step": 3830 + }, + { + "epoch": 14.328358208955224, + "grad_norm": 0.2347182184457779, + "learning_rate": 9.458779827231237e-05, + "loss": 0.0413, + "step": 3840 + }, + { + "epoch": 14.365671641791044, + "grad_norm": 0.1558852344751358, + "learning_rate": 9.45503262094184e-05, + "loss": 0.0442, + "step": 3850 + }, + { + "epoch": 14.402985074626866, + "grad_norm": 0.23085005581378937, + "learning_rate": 9.451273234763371e-05, + "loss": 0.047, + "step": 3860 + }, + { + "epoch": 14.440298507462687, + "grad_norm": 0.1515151560306549, + "learning_rate": 9.447501678973852e-05, + "loss": 0.0481, + "step": 3870 + }, + { + "epoch": 14.477611940298507, + "grad_norm": 0.1916729211807251, + "learning_rate": 9.443717963884569e-05, + "loss": 0.0474, + "step": 3880 + }, + { + "epoch": 14.514925373134329, + "grad_norm": 0.2536492943763733, + "learning_rate": 9.439922099840054e-05, + "loss": 0.0382, + "step": 3890 + }, + { + "epoch": 14.552238805970148, + "grad_norm": 0.1672086864709854, + "learning_rate": 9.43611409721806e-05, + "loss": 0.0497, + "step": 3900 + }, + { + "epoch": 14.58955223880597, + "grad_norm": 0.3644237518310547, + "learning_rate": 9.432293966429514e-05, + "loss": 0.0444, + "step": 3910 + }, + { + "epoch": 14.626865671641792, + "grad_norm": 0.20307251811027527, + "learning_rate": 9.428461717918511e-05, + "loss": 0.0452, + "step": 3920 + }, + { + "epoch": 14.664179104477611, + "grad_norm": 0.20441733300685883, + "learning_rate": 9.424617362162271e-05, + "loss": 0.0454, + "step": 3930 + }, + { + "epoch": 14.701492537313433, + "grad_norm": 0.26315611600875854, + "learning_rate": 9.420760909671118e-05, + "loss": 0.0486, + "step": 3940 + }, + { + "epoch": 14.738805970149254, + "grad_norm": 0.1983092874288559, + "learning_rate": 9.416892370988444e-05, + "loss": 0.0483, + "step": 3950 + }, + { + "epoch": 14.776119402985074, + "grad_norm": 0.18301443755626678, + "learning_rate": 9.413011756690685e-05, + "loss": 0.0456, + "step": 3960 + }, + { + "epoch": 14.813432835820896, + "grad_norm": 0.2433597594499588, + "learning_rate": 9.409119077387294e-05, + "loss": 0.0463, + "step": 3970 + }, + { + "epoch": 14.850746268656717, + "grad_norm": 0.27949392795562744, + "learning_rate": 9.405214343720707e-05, + "loss": 0.0412, + "step": 3980 + }, + { + "epoch": 14.888059701492537, + "grad_norm": 0.22806599736213684, + "learning_rate": 9.401297566366318e-05, + "loss": 0.0448, + "step": 3990 + }, + { + "epoch": 14.925373134328359, + "grad_norm": 0.25421562790870667, + "learning_rate": 9.397368756032445e-05, + "loss": 0.0426, + "step": 4000 + }, + { + "epoch": 14.962686567164178, + "grad_norm": 0.2436474859714508, + "learning_rate": 9.393427923460308e-05, + "loss": 0.0474, + "step": 4010 + }, + { + "epoch": 15.0, + "grad_norm": 0.3756405711174011, + "learning_rate": 9.389475079423988e-05, + "loss": 0.0438, + "step": 4020 + }, + { + "epoch": 15.037313432835822, + "grad_norm": 0.25687697529792786, + "learning_rate": 9.385510234730415e-05, + "loss": 0.0435, + "step": 4030 + }, + { + "epoch": 15.074626865671641, + "grad_norm": 0.17263716459274292, + "learning_rate": 9.381533400219318e-05, + "loss": 0.0455, + "step": 4040 + }, + { + "epoch": 15.111940298507463, + "grad_norm": 0.2471216470003128, + "learning_rate": 9.377544586763215e-05, + "loss": 0.0429, + "step": 4050 + }, + { + "epoch": 15.149253731343283, + "grad_norm": 0.20195460319519043, + "learning_rate": 9.373543805267368e-05, + "loss": 0.0432, + "step": 4060 + }, + { + "epoch": 15.186567164179104, + "grad_norm": 0.1709851622581482, + "learning_rate": 9.369531066669758e-05, + "loss": 0.0477, + "step": 4070 + }, + { + "epoch": 15.223880597014926, + "grad_norm": 0.23063932359218597, + "learning_rate": 9.365506381941066e-05, + "loss": 0.0379, + "step": 4080 + }, + { + "epoch": 15.261194029850746, + "grad_norm": 0.3265426754951477, + "learning_rate": 9.36146976208462e-05, + "loss": 0.0435, + "step": 4090 + }, + { + "epoch": 15.298507462686567, + "grad_norm": 0.26373934745788574, + "learning_rate": 9.357421218136386e-05, + "loss": 0.047, + "step": 4100 + }, + { + "epoch": 15.335820895522389, + "grad_norm": 0.16861388087272644, + "learning_rate": 9.353360761164931e-05, + "loss": 0.0448, + "step": 4110 + }, + { + "epoch": 15.373134328358208, + "grad_norm": 0.303790807723999, + "learning_rate": 9.349288402271388e-05, + "loss": 0.0396, + "step": 4120 + }, + { + "epoch": 15.41044776119403, + "grad_norm": 0.1940719038248062, + "learning_rate": 9.345204152589428e-05, + "loss": 0.0474, + "step": 4130 + }, + { + "epoch": 15.447761194029852, + "grad_norm": 0.34091615676879883, + "learning_rate": 9.341108023285238e-05, + "loss": 0.0424, + "step": 4140 + }, + { + "epoch": 15.485074626865671, + "grad_norm": 0.27036693692207336, + "learning_rate": 9.337000025557476e-05, + "loss": 0.0482, + "step": 4150 + }, + { + "epoch": 15.522388059701493, + "grad_norm": 0.16908007860183716, + "learning_rate": 9.332880170637252e-05, + "loss": 0.0381, + "step": 4160 + }, + { + "epoch": 15.559701492537313, + "grad_norm": 0.23332923650741577, + "learning_rate": 9.328748469788093e-05, + "loss": 0.0427, + "step": 4170 + }, + { + "epoch": 15.597014925373134, + "grad_norm": 0.16899706423282623, + "learning_rate": 9.32460493430591e-05, + "loss": 0.0439, + "step": 4180 + }, + { + "epoch": 15.634328358208956, + "grad_norm": 0.12869524955749512, + "learning_rate": 9.320449575518972e-05, + "loss": 0.0481, + "step": 4190 + }, + { + "epoch": 15.671641791044776, + "grad_norm": 0.21159130334854126, + "learning_rate": 9.316282404787871e-05, + "loss": 0.0446, + "step": 4200 + }, + { + "epoch": 15.708955223880597, + "grad_norm": 0.1849961131811142, + "learning_rate": 9.31210343350549e-05, + "loss": 0.041, + "step": 4210 + }, + { + "epoch": 15.746268656716419, + "grad_norm": 0.16107840836048126, + "learning_rate": 9.30791267309698e-05, + "loss": 0.0429, + "step": 4220 + }, + { + "epoch": 15.783582089552239, + "grad_norm": 0.14206446707248688, + "learning_rate": 9.30371013501972e-05, + "loss": 0.0409, + "step": 4230 + }, + { + "epoch": 15.82089552238806, + "grad_norm": 0.2168441116809845, + "learning_rate": 9.299495830763286e-05, + "loss": 0.0413, + "step": 4240 + }, + { + "epoch": 15.85820895522388, + "grad_norm": 0.21431951224803925, + "learning_rate": 9.295269771849427e-05, + "loss": 0.0472, + "step": 4250 + }, + { + "epoch": 15.895522388059701, + "grad_norm": 0.16851255297660828, + "learning_rate": 9.291031969832026e-05, + "loss": 0.0508, + "step": 4260 + }, + { + "epoch": 15.932835820895523, + "grad_norm": 0.18404732644557953, + "learning_rate": 9.286782436297073e-05, + "loss": 0.0402, + "step": 4270 + }, + { + "epoch": 15.970149253731343, + "grad_norm": 0.21722930669784546, + "learning_rate": 9.282521182862629e-05, + "loss": 0.0397, + "step": 4280 + }, + { + "epoch": 16.007462686567163, + "grad_norm": 0.2523709833621979, + "learning_rate": 9.278248221178798e-05, + "loss": 0.0427, + "step": 4290 + }, + { + "epoch": 16.044776119402986, + "grad_norm": 0.17736563086509705, + "learning_rate": 9.273963562927695e-05, + "loss": 0.0458, + "step": 4300 + }, + { + "epoch": 16.082089552238806, + "grad_norm": 0.20613858103752136, + "learning_rate": 9.269667219823412e-05, + "loss": 0.0387, + "step": 4310 + }, + { + "epoch": 16.119402985074625, + "grad_norm": 0.16557513177394867, + "learning_rate": 9.265359203611987e-05, + "loss": 0.0411, + "step": 4320 + }, + { + "epoch": 16.15671641791045, + "grad_norm": 0.28119519352912903, + "learning_rate": 9.261039526071374e-05, + "loss": 0.0468, + "step": 4330 + }, + { + "epoch": 16.19402985074627, + "grad_norm": 0.21538576483726501, + "learning_rate": 9.256708199011401e-05, + "loss": 0.0368, + "step": 4340 + }, + { + "epoch": 16.23134328358209, + "grad_norm": 0.19657357037067413, + "learning_rate": 9.252365234273755e-05, + "loss": 0.038, + "step": 4350 + }, + { + "epoch": 16.26865671641791, + "grad_norm": 0.19258421659469604, + "learning_rate": 9.248010643731935e-05, + "loss": 0.0414, + "step": 4360 + }, + { + "epoch": 16.30597014925373, + "grad_norm": 0.28801625967025757, + "learning_rate": 9.243644439291223e-05, + "loss": 0.0387, + "step": 4370 + }, + { + "epoch": 16.34328358208955, + "grad_norm": 0.16581468284130096, + "learning_rate": 9.239266632888659e-05, + "loss": 0.0383, + "step": 4380 + }, + { + "epoch": 16.380597014925375, + "grad_norm": 0.34664949774742126, + "learning_rate": 9.234877236492997e-05, + "loss": 0.0453, + "step": 4390 + }, + { + "epoch": 16.417910447761194, + "grad_norm": 0.1439947783946991, + "learning_rate": 9.230476262104677e-05, + "loss": 0.0466, + "step": 4400 + }, + { + "epoch": 16.455223880597014, + "grad_norm": 0.15509940683841705, + "learning_rate": 9.226063721755799e-05, + "loss": 0.0488, + "step": 4410 + }, + { + "epoch": 16.492537313432837, + "grad_norm": 0.18005985021591187, + "learning_rate": 9.221639627510076e-05, + "loss": 0.0407, + "step": 4420 + }, + { + "epoch": 16.529850746268657, + "grad_norm": 0.16012470424175262, + "learning_rate": 9.217203991462815e-05, + "loss": 0.0394, + "step": 4430 + }, + { + "epoch": 16.567164179104477, + "grad_norm": 0.2978847920894623, + "learning_rate": 9.212756825740873e-05, + "loss": 0.0451, + "step": 4440 + }, + { + "epoch": 16.604477611940297, + "grad_norm": 0.2236834019422531, + "learning_rate": 9.208298142502636e-05, + "loss": 0.0487, + "step": 4450 + }, + { + "epoch": 16.64179104477612, + "grad_norm": 0.2686060667037964, + "learning_rate": 9.20382795393797e-05, + "loss": 0.0403, + "step": 4460 + }, + { + "epoch": 16.67910447761194, + "grad_norm": 0.33534038066864014, + "learning_rate": 9.199346272268199e-05, + "loss": 0.0385, + "step": 4470 + }, + { + "epoch": 16.71641791044776, + "grad_norm": 0.19250528514385223, + "learning_rate": 9.194853109746074e-05, + "loss": 0.0441, + "step": 4480 + }, + { + "epoch": 16.753731343283583, + "grad_norm": 0.19218407571315765, + "learning_rate": 9.190348478655724e-05, + "loss": 0.0474, + "step": 4490 + }, + { + "epoch": 16.791044776119403, + "grad_norm": 0.21163488924503326, + "learning_rate": 9.185832391312644e-05, + "loss": 0.0411, + "step": 4500 + }, + { + "epoch": 16.828358208955223, + "grad_norm": 0.1758819818496704, + "learning_rate": 9.18130486006364e-05, + "loss": 0.0462, + "step": 4510 + }, + { + "epoch": 16.865671641791046, + "grad_norm": 0.18571069836616516, + "learning_rate": 9.176765897286813e-05, + "loss": 0.0425, + "step": 4520 + }, + { + "epoch": 16.902985074626866, + "grad_norm": 0.20819155871868134, + "learning_rate": 9.17221551539151e-05, + "loss": 0.0428, + "step": 4530 + }, + { + "epoch": 16.940298507462686, + "grad_norm": 0.30357328057289124, + "learning_rate": 9.167653726818305e-05, + "loss": 0.0414, + "step": 4540 + }, + { + "epoch": 16.97761194029851, + "grad_norm": 0.20977462828159332, + "learning_rate": 9.163080544038952e-05, + "loss": 0.0447, + "step": 4550 + }, + { + "epoch": 17.01492537313433, + "grad_norm": 0.2535971701145172, + "learning_rate": 9.158495979556358e-05, + "loss": 0.0384, + "step": 4560 + }, + { + "epoch": 17.05223880597015, + "grad_norm": 0.2789897620677948, + "learning_rate": 9.153900045904549e-05, + "loss": 0.042, + "step": 4570 + }, + { + "epoch": 17.08955223880597, + "grad_norm": 0.18474848568439484, + "learning_rate": 9.14929275564863e-05, + "loss": 0.0398, + "step": 4580 + }, + { + "epoch": 17.12686567164179, + "grad_norm": 0.12615208327770233, + "learning_rate": 9.144674121384757e-05, + "loss": 0.0466, + "step": 4590 + }, + { + "epoch": 17.16417910447761, + "grad_norm": 0.17756640911102295, + "learning_rate": 9.140044155740101e-05, + "loss": 0.035, + "step": 4600 + }, + { + "epoch": 17.20149253731343, + "grad_norm": 0.24410821497440338, + "learning_rate": 9.135402871372808e-05, + "loss": 0.0459, + "step": 4610 + }, + { + "epoch": 17.238805970149254, + "grad_norm": 0.21573011577129364, + "learning_rate": 9.130750280971978e-05, + "loss": 0.0385, + "step": 4620 + }, + { + "epoch": 17.276119402985074, + "grad_norm": 0.13879653811454773, + "learning_rate": 9.126086397257612e-05, + "loss": 0.0391, + "step": 4630 + }, + { + "epoch": 17.313432835820894, + "grad_norm": 0.17508305609226227, + "learning_rate": 9.121411232980588e-05, + "loss": 0.038, + "step": 4640 + }, + { + "epoch": 17.350746268656717, + "grad_norm": 0.2536008358001709, + "learning_rate": 9.116724800922629e-05, + "loss": 0.0418, + "step": 4650 + }, + { + "epoch": 17.388059701492537, + "grad_norm": 0.1942976713180542, + "learning_rate": 9.112027113896262e-05, + "loss": 0.052, + "step": 4660 + }, + { + "epoch": 17.425373134328357, + "grad_norm": 0.16561119258403778, + "learning_rate": 9.107318184744781e-05, + "loss": 0.0451, + "step": 4670 + }, + { + "epoch": 17.46268656716418, + "grad_norm": 0.22971832752227783, + "learning_rate": 9.102598026342222e-05, + "loss": 0.0407, + "step": 4680 + }, + { + "epoch": 17.5, + "grad_norm": 0.1306753158569336, + "learning_rate": 9.097866651593317e-05, + "loss": 0.042, + "step": 4690 + }, + { + "epoch": 17.53731343283582, + "grad_norm": 0.21278400719165802, + "learning_rate": 9.093124073433463e-05, + "loss": 0.0458, + "step": 4700 + }, + { + "epoch": 17.574626865671643, + "grad_norm": 0.22757171094417572, + "learning_rate": 9.088370304828685e-05, + "loss": 0.0364, + "step": 4710 + }, + { + "epoch": 17.611940298507463, + "grad_norm": 0.216596320271492, + "learning_rate": 9.083605358775612e-05, + "loss": 0.0434, + "step": 4720 + }, + { + "epoch": 17.649253731343283, + "grad_norm": 0.13022471964359283, + "learning_rate": 9.078829248301417e-05, + "loss": 0.0415, + "step": 4730 + }, + { + "epoch": 17.686567164179106, + "grad_norm": 0.2280716598033905, + "learning_rate": 9.074041986463808e-05, + "loss": 0.0385, + "step": 4740 + }, + { + "epoch": 17.723880597014926, + "grad_norm": 0.14666135609149933, + "learning_rate": 9.069243586350975e-05, + "loss": 0.0347, + "step": 4750 + }, + { + "epoch": 17.761194029850746, + "grad_norm": 0.1631281077861786, + "learning_rate": 9.064434061081562e-05, + "loss": 0.0407, + "step": 4760 + }, + { + "epoch": 17.798507462686565, + "grad_norm": 0.18697327375411987, + "learning_rate": 9.059613423804623e-05, + "loss": 0.0425, + "step": 4770 + }, + { + "epoch": 17.83582089552239, + "grad_norm": 0.12955111265182495, + "learning_rate": 9.0547816876996e-05, + "loss": 0.0417, + "step": 4780 + }, + { + "epoch": 17.87313432835821, + "grad_norm": 0.15547148883342743, + "learning_rate": 9.049938865976275e-05, + "loss": 0.0409, + "step": 4790 + }, + { + "epoch": 17.91044776119403, + "grad_norm": 0.1900598704814911, + "learning_rate": 9.045084971874738e-05, + "loss": 0.0369, + "step": 4800 + }, + { + "epoch": 17.94776119402985, + "grad_norm": 0.1846715807914734, + "learning_rate": 9.040220018665347e-05, + "loss": 0.0415, + "step": 4810 + }, + { + "epoch": 17.98507462686567, + "grad_norm": 0.1829937845468521, + "learning_rate": 9.035344019648702e-05, + "loss": 0.0407, + "step": 4820 + }, + { + "epoch": 18.02238805970149, + "grad_norm": 0.25900354981422424, + "learning_rate": 9.030456988155596e-05, + "loss": 0.0398, + "step": 4830 + }, + { + "epoch": 18.059701492537314, + "grad_norm": 0.21235992014408112, + "learning_rate": 9.025558937546988e-05, + "loss": 0.0477, + "step": 4840 + }, + { + "epoch": 18.097014925373134, + "grad_norm": 0.18785078823566437, + "learning_rate": 9.020649881213958e-05, + "loss": 0.039, + "step": 4850 + }, + { + "epoch": 18.134328358208954, + "grad_norm": 0.1951548010110855, + "learning_rate": 9.015729832577681e-05, + "loss": 0.0357, + "step": 4860 + }, + { + "epoch": 18.171641791044777, + "grad_norm": 0.1280934363603592, + "learning_rate": 9.010798805089384e-05, + "loss": 0.0425, + "step": 4870 + }, + { + "epoch": 18.208955223880597, + "grad_norm": 0.1693423092365265, + "learning_rate": 9.005856812230304e-05, + "loss": 0.0447, + "step": 4880 + }, + { + "epoch": 18.246268656716417, + "grad_norm": 0.23712658882141113, + "learning_rate": 9.000903867511666e-05, + "loss": 0.042, + "step": 4890 + }, + { + "epoch": 18.28358208955224, + "grad_norm": 0.26489710807800293, + "learning_rate": 8.995939984474624e-05, + "loss": 0.0457, + "step": 4900 + }, + { + "epoch": 18.32089552238806, + "grad_norm": 0.20792756974697113, + "learning_rate": 8.990965176690252e-05, + "loss": 0.0422, + "step": 4910 + }, + { + "epoch": 18.35820895522388, + "grad_norm": 0.18526089191436768, + "learning_rate": 8.98597945775948e-05, + "loss": 0.0366, + "step": 4920 + }, + { + "epoch": 18.395522388059703, + "grad_norm": 0.2214607298374176, + "learning_rate": 8.980982841313074e-05, + "loss": 0.0405, + "step": 4930 + }, + { + "epoch": 18.432835820895523, + "grad_norm": 0.1896953135728836, + "learning_rate": 8.975975341011596e-05, + "loss": 0.0391, + "step": 4940 + }, + { + "epoch": 18.470149253731343, + "grad_norm": 0.1430232971906662, + "learning_rate": 8.970956970545355e-05, + "loss": 0.0403, + "step": 4950 + }, + { + "epoch": 18.507462686567163, + "grad_norm": 0.1991272121667862, + "learning_rate": 8.965927743634391e-05, + "loss": 0.0429, + "step": 4960 + }, + { + "epoch": 18.544776119402986, + "grad_norm": 0.2361849844455719, + "learning_rate": 8.96088767402841e-05, + "loss": 0.0416, + "step": 4970 + }, + { + "epoch": 18.582089552238806, + "grad_norm": 0.25857019424438477, + "learning_rate": 8.955836775506776e-05, + "loss": 0.0461, + "step": 4980 + }, + { + "epoch": 18.619402985074625, + "grad_norm": 0.12873682379722595, + "learning_rate": 8.950775061878453e-05, + "loss": 0.035, + "step": 4990 + }, + { + "epoch": 18.65671641791045, + "grad_norm": 0.19786769151687622, + "learning_rate": 8.945702546981969e-05, + "loss": 0.0399, + "step": 5000 + }, + { + "epoch": 18.69402985074627, + "grad_norm": 0.2562239170074463, + "learning_rate": 8.940619244685388e-05, + "loss": 0.0372, + "step": 5010 + }, + { + "epoch": 18.73134328358209, + "grad_norm": 0.14586858451366425, + "learning_rate": 8.935525168886262e-05, + "loss": 0.0427, + "step": 5020 + }, + { + "epoch": 18.76865671641791, + "grad_norm": 0.20062318444252014, + "learning_rate": 8.930420333511606e-05, + "loss": 0.0403, + "step": 5030 + }, + { + "epoch": 18.80597014925373, + "grad_norm": 0.22698874771595, + "learning_rate": 8.92530475251784e-05, + "loss": 0.036, + "step": 5040 + }, + { + "epoch": 18.84328358208955, + "grad_norm": 0.2103697657585144, + "learning_rate": 8.920178439890765e-05, + "loss": 0.0431, + "step": 5050 + }, + { + "epoch": 18.880597014925375, + "grad_norm": 0.16042308509349823, + "learning_rate": 8.91504140964553e-05, + "loss": 0.0388, + "step": 5060 + }, + { + "epoch": 18.917910447761194, + "grad_norm": 0.16874109208583832, + "learning_rate": 8.909893675826574e-05, + "loss": 0.0388, + "step": 5070 + }, + { + "epoch": 18.955223880597014, + "grad_norm": 0.15569192171096802, + "learning_rate": 8.90473525250761e-05, + "loss": 0.0353, + "step": 5080 + }, + { + "epoch": 18.992537313432837, + "grad_norm": 0.16723507642745972, + "learning_rate": 8.899566153791566e-05, + "loss": 0.0443, + "step": 5090 + }, + { + "epoch": 19.029850746268657, + "grad_norm": 0.23284228146076202, + "learning_rate": 8.894386393810563e-05, + "loss": 0.05, + "step": 5100 + }, + { + "epoch": 19.067164179104477, + "grad_norm": 0.1621718853712082, + "learning_rate": 8.889195986725865e-05, + "loss": 0.0369, + "step": 5110 + }, + { + "epoch": 19.104477611940297, + "grad_norm": 0.17522747814655304, + "learning_rate": 8.883994946727849e-05, + "loss": 0.0475, + "step": 5120 + }, + { + "epoch": 19.14179104477612, + "grad_norm": 0.16110533475875854, + "learning_rate": 8.878783288035957e-05, + "loss": 0.0383, + "step": 5130 + }, + { + "epoch": 19.17910447761194, + "grad_norm": 0.2574177086353302, + "learning_rate": 8.873561024898668e-05, + "loss": 0.0383, + "step": 5140 + }, + { + "epoch": 19.21641791044776, + "grad_norm": 0.14560100436210632, + "learning_rate": 8.868328171593448e-05, + "loss": 0.037, + "step": 5150 + }, + { + "epoch": 19.253731343283583, + "grad_norm": 0.14456631243228912, + "learning_rate": 8.863084742426719e-05, + "loss": 0.0423, + "step": 5160 + }, + { + "epoch": 19.291044776119403, + "grad_norm": 0.1403595507144928, + "learning_rate": 8.857830751733815e-05, + "loss": 0.0327, + "step": 5170 + }, + { + "epoch": 19.328358208955223, + "grad_norm": 0.18462564051151276, + "learning_rate": 8.852566213878947e-05, + "loss": 0.037, + "step": 5180 + }, + { + "epoch": 19.365671641791046, + "grad_norm": 0.20725117623806, + "learning_rate": 8.84729114325516e-05, + "loss": 0.0376, + "step": 5190 + }, + { + "epoch": 19.402985074626866, + "grad_norm": 0.17023132741451263, + "learning_rate": 8.842005554284296e-05, + "loss": 0.0467, + "step": 5200 + }, + { + "epoch": 19.440298507462686, + "grad_norm": 0.31033241748809814, + "learning_rate": 8.836709461416952e-05, + "loss": 0.0425, + "step": 5210 + }, + { + "epoch": 19.47761194029851, + "grad_norm": 0.14057482779026031, + "learning_rate": 8.831402879132446e-05, + "loss": 0.0432, + "step": 5220 + }, + { + "epoch": 19.51492537313433, + "grad_norm": 0.23247437179088593, + "learning_rate": 8.82608582193877e-05, + "loss": 0.0396, + "step": 5230 + }, + { + "epoch": 19.55223880597015, + "grad_norm": 0.1305907964706421, + "learning_rate": 8.820758304372557e-05, + "loss": 0.0389, + "step": 5240 + }, + { + "epoch": 19.58955223880597, + "grad_norm": 0.17093417048454285, + "learning_rate": 8.815420340999033e-05, + "loss": 0.0347, + "step": 5250 + }, + { + "epoch": 19.62686567164179, + "grad_norm": 0.24105240404605865, + "learning_rate": 8.810071946411989e-05, + "loss": 0.0392, + "step": 5260 + }, + { + "epoch": 19.66417910447761, + "grad_norm": 0.2234315127134323, + "learning_rate": 8.804713135233731e-05, + "loss": 0.0403, + "step": 5270 + }, + { + "epoch": 19.701492537313435, + "grad_norm": 0.16947844624519348, + "learning_rate": 8.799343922115044e-05, + "loss": 0.0368, + "step": 5280 + }, + { + "epoch": 19.738805970149254, + "grad_norm": 0.26133742928504944, + "learning_rate": 8.79396432173515e-05, + "loss": 0.041, + "step": 5290 + }, + { + "epoch": 19.776119402985074, + "grad_norm": 0.2099352777004242, + "learning_rate": 8.788574348801675e-05, + "loss": 0.0363, + "step": 5300 + }, + { + "epoch": 19.813432835820894, + "grad_norm": 0.1662513017654419, + "learning_rate": 8.783174018050594e-05, + "loss": 0.0409, + "step": 5310 + }, + { + "epoch": 19.850746268656717, + "grad_norm": 0.18933714926242828, + "learning_rate": 8.77776334424621e-05, + "loss": 0.0348, + "step": 5320 + }, + { + "epoch": 19.888059701492537, + "grad_norm": 0.21673552691936493, + "learning_rate": 8.772342342181095e-05, + "loss": 0.037, + "step": 5330 + }, + { + "epoch": 19.925373134328357, + "grad_norm": 0.13009892404079437, + "learning_rate": 8.766911026676064e-05, + "loss": 0.0386, + "step": 5340 + }, + { + "epoch": 19.96268656716418, + "grad_norm": 0.1655230075120926, + "learning_rate": 8.761469412580125e-05, + "loss": 0.0404, + "step": 5350 + }, + { + "epoch": 20.0, + "grad_norm": 0.2821272611618042, + "learning_rate": 8.756017514770443e-05, + "loss": 0.0441, + "step": 5360 + }, + { + "epoch": 20.03731343283582, + "grad_norm": 0.1302652508020401, + "learning_rate": 8.750555348152298e-05, + "loss": 0.0389, + "step": 5370 + }, + { + "epoch": 20.074626865671643, + "grad_norm": 0.13331563770771027, + "learning_rate": 8.745082927659047e-05, + "loss": 0.0393, + "step": 5380 + }, + { + "epoch": 20.111940298507463, + "grad_norm": 0.244130939245224, + "learning_rate": 8.739600268252078e-05, + "loss": 0.0372, + "step": 5390 + }, + { + "epoch": 20.149253731343283, + "grad_norm": 0.20429308712482452, + "learning_rate": 8.73410738492077e-05, + "loss": 0.0387, + "step": 5400 + }, + { + "epoch": 20.186567164179106, + "grad_norm": 0.2954719364643097, + "learning_rate": 8.728604292682459e-05, + "loss": 0.0404, + "step": 5410 + }, + { + "epoch": 20.223880597014926, + "grad_norm": 0.20438429713249207, + "learning_rate": 8.723091006582389e-05, + "loss": 0.0359, + "step": 5420 + }, + { + "epoch": 20.261194029850746, + "grad_norm": 0.17289331555366516, + "learning_rate": 8.717567541693673e-05, + "loss": 0.0357, + "step": 5430 + }, + { + "epoch": 20.298507462686565, + "grad_norm": 0.24367138743400574, + "learning_rate": 8.71203391311725e-05, + "loss": 0.0392, + "step": 5440 + }, + { + "epoch": 20.33582089552239, + "grad_norm": 0.21900270879268646, + "learning_rate": 8.706490135981855e-05, + "loss": 0.0419, + "step": 5450 + }, + { + "epoch": 20.37313432835821, + "grad_norm": 0.1526443362236023, + "learning_rate": 8.700936225443959e-05, + "loss": 0.0333, + "step": 5460 + }, + { + "epoch": 20.41044776119403, + "grad_norm": 0.24582353234291077, + "learning_rate": 8.695372196687743e-05, + "loss": 0.0417, + "step": 5470 + }, + { + "epoch": 20.44776119402985, + "grad_norm": 0.21462485194206238, + "learning_rate": 8.689798064925049e-05, + "loss": 0.0347, + "step": 5480 + }, + { + "epoch": 20.48507462686567, + "grad_norm": 0.17611616849899292, + "learning_rate": 8.684213845395339e-05, + "loss": 0.0395, + "step": 5490 + }, + { + "epoch": 20.52238805970149, + "grad_norm": 0.19724012911319733, + "learning_rate": 8.678619553365659e-05, + "loss": 0.0332, + "step": 5500 + } + ], + "logging_steps": 10, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 75, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.134508587240192e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-6000/README.md b/checkpoint-6000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c7b89968043c4a4cf38dcac1f9bc557c35da3883 --- /dev/null +++ b/checkpoint-6000/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/ibru/.cache/huggingface/hub/models--nvidia--GR00T-N1-2B/snapshots/32e1fd2507f7739fad443e6b449c8188e0e02fcb +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-6000/adapter_config.json b/checkpoint-6000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8f45e5c825b3b34b334d049ddf8e68e52a500cc6 --- /dev/null +++ b/checkpoint-6000/adapter_config.json @@ -0,0 +1,36 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/ibru/.cache/huggingface/hub/models--nvidia--GR00T-N1-2B/snapshots/32e1fd2507f7739fad443e6b449c8188e0e02fcb", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "to_k", + "to_q", + "v_proj", + "to_v" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-6000/adapter_model.safetensors b/checkpoint-6000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e4c072fff9b254f686d36709fd703ba22d28fc65 --- /dev/null +++ b/checkpoint-6000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe7e637252ccbc2072f81752b8b8609968b5b00d64efc6b714296170cc67cc63 +size 123328576 diff --git a/checkpoint-6000/experiment_cfg/metadata.json b/checkpoint-6000/experiment_cfg/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..40302046074c7e429ab3933ad6b163f9735902de --- /dev/null +++ b/checkpoint-6000/experiment_cfg/metadata.json @@ -0,0 +1,275 @@ +{ + "new_embodiment": { + "statistics": { + "state": { + "single_arm": { + "max": [ + 47.021484375, + 135.263671875, + 178.505859375, + 78.3984375, + 56.77734375 + ], + "min": [ + -25.576171875, + 46.93359375, + 89.736328125, + -30.41015625, + -77.607421875 + ], + "mean": [ + 7.780572414398193, + 121.54933166503906, + 145.44825744628906, + 26.051393508911133, + -12.748016357421875 + ], + "std": [ + 11.060831069946289, + 21.937597274780273, + 17.16187286376953, + 19.231945037841797, + 14.66512680053711 + ], + "q01": [ + -17.578125, + 58.0078125, + 97.998046875, + -13.447265625, + -39.9005859375 + ], + "q99": [ + 36.650390625, + 134.47265625, + 178.41796875, + 66.65009765625, + 40.166015625 + ] + }, + "gripper": { + "max": [ + 52.22222137451172 + ], + "min": [ + -3.846153974533081 + ], + "mean": [ + 10.933439254760742 + ], + "std": [ + 15.509913444519043 + ], + "q01": [ + -3.846153974533081 + ], + "q99": [ + 51.02564239501953 + ] + }, + "mobile_base": { + "max": [ + 75.42072296142578, + 276.7638244628906, + 93.75 + ], + "min": [ + -170.01620483398438, + -274.5497131347656, + -93.75 + ], + "mean": [ + -0.31241804361343384, + 58.99717712402344, + 2.4293017387390137 + ], + "std": [ + 10.56183910369873, + 119.39802551269531, + 22.590484619140625 + ], + "q01": [ + -33.65809627532959, + -265.6932678222656, + -72.849609375 + ], + "q99": [ + 30.679615020751953, + 270.1214904785156, + 90.234375 + ] + } + }, + "action": { + "single_arm": { + "max": [ + 37.96875, + 135.087890625, + 179.384765625, + 78.837890625, + 57.392578125 + ], + "min": [ + -26.279296875, + 47.373046875, + 89.912109375, + -31.640625, + -77.16796875 + ], + "mean": [ + 8.038639068603516, + 122.76031494140625, + 145.15855407714844, + 26.28432846069336, + -13.195321083068848 + ], + "std": [ + 11.36032772064209, + 21.925451278686523, + 17.071842193603516, + 19.503877639770508, + 14.882487297058105 + ], + "q01": [ + -18.10546875, + 58.623046875, + 98.26171875, + -14.326171875, + -40.078125 + ], + "q99": [ + 37.44140625, + 135.087890625, + 179.296875, + 67.1484375, + 40.869140625 + ] + }, + "gripper": { + "max": [ + 52.646484375 + ], + "min": [ + -10.72265625 + ], + "mean": [ + 4.366570949554443 + ], + "std": [ + 18.90865707397461 + ], + "q01": [ + -10.546875 + ], + "q99": [ + 51.767578125 + ] + }, + "mobile_base": { + "max": [ + 230.0971221923828, + 265.6932678222656, + 90.0 + ], + "min": [ + -230.0971221923828, + -265.6932678222656, + -90.0 + ], + "mean": [ + -0.36507830023765564, + 60.13115310668945, + 2.5394127368927 + ], + "std": [ + 15.02155590057373, + 129.06507873535156, + 27.82071304321289 + ], + "q01": [ + -0.02556634694337845, + -265.6932678222656, + -90.0 + ], + "q99": [ + 0.02556634694337845, + 265.6932678222656, + 90.0 + ] + } + } + }, + "modalities": { + "video": { + "wrist": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + }, + "front": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + } + }, + "state": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + }, + "mobile_base": { + "absolute": true, + "rotation_type": null, + "shape": [ + 3 + ], + "continuous": true + } + }, + "action": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + }, + "mobile_base": { + "absolute": true, + "rotation_type": null, + "shape": [ + 3 + ], + "continuous": true + } + } + }, + "embodiment_tag": "new_embodiment" + } +} \ No newline at end of file diff --git a/checkpoint-6000/optimizer.pt b/checkpoint-6000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..17148f79896b02af320e05f5c76f06b821221e89 --- /dev/null +++ b/checkpoint-6000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ba63daa5ec509f37eb8ad97d3a5972c35bea7a8dd0db7d466efd0d3f329b3c8 +size 246824634 diff --git a/checkpoint-6000/rng_state.pth b/checkpoint-6000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..81db5454ec03125821a10d65ff4f753e454c77a1 --- /dev/null +++ b/checkpoint-6000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a72b98cbbed0760e18ed8782e6594f386d25ba2964d1c4523fa752e78decb29 +size 14244 diff --git a/checkpoint-6000/scheduler.pt b/checkpoint-6000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..02b3c89f05c3b1acdcb7adbbaf3b9d490d60fd37 --- /dev/null +++ b/checkpoint-6000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9495ce8c64325a54dd12b90b11af20d3a01df15d5fab3ae0abdaff10fb2ae7f +size 1064 diff --git a/checkpoint-6000/trainer_state.json b/checkpoint-6000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4f1002f4e408abac915a25e7ada8887476ab37b0 --- /dev/null +++ b/checkpoint-6000/trainer_state.json @@ -0,0 +1,4233 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 22.388059701492537, + "eval_steps": 500, + "global_step": 6000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03731343283582089, + "grad_norm": 0.8186072111129761, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.3847, + "step": 10 + }, + { + "epoch": 0.07462686567164178, + "grad_norm": 0.5007426142692566, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.4283, + "step": 20 + }, + { + "epoch": 0.11194029850746269, + "grad_norm": 0.49460887908935547, + "learning_rate": 3e-06, + "loss": 1.4868, + "step": 30 + }, + { + "epoch": 0.14925373134328357, + "grad_norm": 0.5032920837402344, + "learning_rate": 4.000000000000001e-06, + "loss": 1.4491, + "step": 40 + }, + { + "epoch": 0.1865671641791045, + "grad_norm": 0.5688469409942627, + "learning_rate": 5e-06, + "loss": 1.3703, + "step": 50 + }, + { + "epoch": 0.22388059701492538, + "grad_norm": 0.5052517652511597, + "learning_rate": 6e-06, + "loss": 1.419, + "step": 60 + }, + { + "epoch": 0.26119402985074625, + "grad_norm": 0.6315643787384033, + "learning_rate": 7.000000000000001e-06, + "loss": 1.3058, + "step": 70 + }, + { + "epoch": 0.29850746268656714, + "grad_norm": 0.6060447692871094, + "learning_rate": 8.000000000000001e-06, + "loss": 1.2908, + "step": 80 + }, + { + "epoch": 0.3358208955223881, + "grad_norm": 0.5513179302215576, + "learning_rate": 9e-06, + "loss": 1.2311, + "step": 90 + }, + { + "epoch": 0.373134328358209, + "grad_norm": 0.8467404246330261, + "learning_rate": 1e-05, + "loss": 1.2043, + "step": 100 + }, + { + "epoch": 0.41044776119402987, + "grad_norm": 0.8141824007034302, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.0707, + "step": 110 + }, + { + "epoch": 0.44776119402985076, + "grad_norm": 0.7932347059249878, + "learning_rate": 1.2e-05, + "loss": 0.9377, + "step": 120 + }, + { + "epoch": 0.48507462686567165, + "grad_norm": 0.684220552444458, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.714, + "step": 130 + }, + { + "epoch": 0.5223880597014925, + "grad_norm": 0.5886895060539246, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.6479, + "step": 140 + }, + { + "epoch": 0.5597014925373134, + "grad_norm": 0.4764939248561859, + "learning_rate": 1.5e-05, + "loss": 0.5463, + "step": 150 + }, + { + "epoch": 0.5970149253731343, + "grad_norm": 0.4621008038520813, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.4641, + "step": 160 + }, + { + "epoch": 0.6343283582089553, + "grad_norm": 0.46492910385131836, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.4159, + "step": 170 + }, + { + "epoch": 0.6716417910447762, + "grad_norm": 0.5017415881156921, + "learning_rate": 1.8e-05, + "loss": 0.4094, + "step": 180 + }, + { + "epoch": 0.7089552238805971, + "grad_norm": 0.34392210841178894, + "learning_rate": 1.9e-05, + "loss": 0.3478, + "step": 190 + }, + { + "epoch": 0.746268656716418, + "grad_norm": 0.3240516483783722, + "learning_rate": 2e-05, + "loss": 0.3821, + "step": 200 + }, + { + "epoch": 0.7835820895522388, + "grad_norm": 0.26301339268684387, + "learning_rate": 2.1e-05, + "loss": 0.3606, + "step": 210 + }, + { + "epoch": 0.8208955223880597, + "grad_norm": 0.34712520241737366, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.3421, + "step": 220 + }, + { + "epoch": 0.8582089552238806, + "grad_norm": 0.3248469829559326, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.3389, + "step": 230 + }, + { + "epoch": 0.8955223880597015, + "grad_norm": 0.298149436712265, + "learning_rate": 2.4e-05, + "loss": 0.3145, + "step": 240 + }, + { + "epoch": 0.9328358208955224, + "grad_norm": 0.2757190763950348, + "learning_rate": 2.5e-05, + "loss": 0.3065, + "step": 250 + }, + { + "epoch": 0.9701492537313433, + "grad_norm": 0.30510950088500977, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.2971, + "step": 260 + }, + { + "epoch": 1.007462686567164, + "grad_norm": 0.37349891662597656, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.3273, + "step": 270 + }, + { + "epoch": 1.044776119402985, + "grad_norm": 0.3667634129524231, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.308, + "step": 280 + }, + { + "epoch": 1.0820895522388059, + "grad_norm": 0.3463355004787445, + "learning_rate": 2.9e-05, + "loss": 0.3109, + "step": 290 + }, + { + "epoch": 1.1194029850746268, + "grad_norm": 0.3888525366783142, + "learning_rate": 3e-05, + "loss": 0.2644, + "step": 300 + }, + { + "epoch": 1.1567164179104479, + "grad_norm": 0.3749147951602936, + "learning_rate": 3.1e-05, + "loss": 0.2858, + "step": 310 + }, + { + "epoch": 1.1940298507462686, + "grad_norm": 0.3270276188850403, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.2573, + "step": 320 + }, + { + "epoch": 1.2313432835820897, + "grad_norm": 0.3658592998981476, + "learning_rate": 3.3e-05, + "loss": 0.2613, + "step": 330 + }, + { + "epoch": 1.2686567164179103, + "grad_norm": 0.3526328206062317, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.2328, + "step": 340 + }, + { + "epoch": 1.3059701492537314, + "grad_norm": 0.4528139531612396, + "learning_rate": 3.5e-05, + "loss": 0.2429, + "step": 350 + }, + { + "epoch": 1.3432835820895521, + "grad_norm": 0.5426791310310364, + "learning_rate": 3.6e-05, + "loss": 0.2209, + "step": 360 + }, + { + "epoch": 1.3805970149253732, + "grad_norm": 0.41844552755355835, + "learning_rate": 3.7e-05, + "loss": 0.2319, + "step": 370 + }, + { + "epoch": 1.417910447761194, + "grad_norm": 0.4749431908130646, + "learning_rate": 3.8e-05, + "loss": 0.2233, + "step": 380 + }, + { + "epoch": 1.455223880597015, + "grad_norm": 0.7010189890861511, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.2181, + "step": 390 + }, + { + "epoch": 1.4925373134328357, + "grad_norm": 0.5747635960578918, + "learning_rate": 4e-05, + "loss": 0.213, + "step": 400 + }, + { + "epoch": 1.5298507462686568, + "grad_norm": 0.3661474287509918, + "learning_rate": 4.1e-05, + "loss": 0.2171, + "step": 410 + }, + { + "epoch": 1.5671641791044775, + "grad_norm": 0.467835396528244, + "learning_rate": 4.2e-05, + "loss": 0.1985, + "step": 420 + }, + { + "epoch": 1.6044776119402986, + "grad_norm": 0.5470123291015625, + "learning_rate": 4.3e-05, + "loss": 0.2176, + "step": 430 + }, + { + "epoch": 1.6417910447761193, + "grad_norm": 0.5761199593544006, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.2007, + "step": 440 + }, + { + "epoch": 1.6791044776119404, + "grad_norm": 0.48257485032081604, + "learning_rate": 4.5e-05, + "loss": 0.2043, + "step": 450 + }, + { + "epoch": 1.716417910447761, + "grad_norm": 0.48353052139282227, + "learning_rate": 4.600000000000001e-05, + "loss": 0.1872, + "step": 460 + }, + { + "epoch": 1.7537313432835822, + "grad_norm": 0.4388391375541687, + "learning_rate": 4.7e-05, + "loss": 0.206, + "step": 470 + }, + { + "epoch": 1.7910447761194028, + "grad_norm": 0.47332626581192017, + "learning_rate": 4.8e-05, + "loss": 0.1876, + "step": 480 + }, + { + "epoch": 1.828358208955224, + "grad_norm": 0.8053535223007202, + "learning_rate": 4.9e-05, + "loss": 0.1839, + "step": 490 + }, + { + "epoch": 1.8656716417910446, + "grad_norm": 0.413979709148407, + "learning_rate": 5e-05, + "loss": 0.1732, + "step": 500 + }, + { + "epoch": 1.9029850746268657, + "grad_norm": 0.36910712718963623, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.1827, + "step": 510 + }, + { + "epoch": 1.9402985074626866, + "grad_norm": 0.8458298444747925, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.1727, + "step": 520 + }, + { + "epoch": 1.9776119402985075, + "grad_norm": 0.5452115535736084, + "learning_rate": 5.300000000000001e-05, + "loss": 0.1818, + "step": 530 + }, + { + "epoch": 2.014925373134328, + "grad_norm": 0.4518108069896698, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.177, + "step": 540 + }, + { + "epoch": 2.0522388059701493, + "grad_norm": 0.66865074634552, + "learning_rate": 5.500000000000001e-05, + "loss": 0.1726, + "step": 550 + }, + { + "epoch": 2.08955223880597, + "grad_norm": 0.6536034345626831, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.1541, + "step": 560 + }, + { + "epoch": 2.126865671641791, + "grad_norm": 0.5571377277374268, + "learning_rate": 5.6999999999999996e-05, + "loss": 0.1671, + "step": 570 + }, + { + "epoch": 2.1641791044776117, + "grad_norm": 0.5385546684265137, + "learning_rate": 5.8e-05, + "loss": 0.1582, + "step": 580 + }, + { + "epoch": 2.201492537313433, + "grad_norm": 0.577961266040802, + "learning_rate": 5.9e-05, + "loss": 0.1528, + "step": 590 + }, + { + "epoch": 2.2388059701492535, + "grad_norm": 0.5082416534423828, + "learning_rate": 6e-05, + "loss": 0.1638, + "step": 600 + }, + { + "epoch": 2.2761194029850746, + "grad_norm": 0.5490861535072327, + "learning_rate": 6.1e-05, + "loss": 0.166, + "step": 610 + }, + { + "epoch": 2.3134328358208958, + "grad_norm": 0.492366760969162, + "learning_rate": 6.2e-05, + "loss": 0.1481, + "step": 620 + }, + { + "epoch": 2.3507462686567164, + "grad_norm": 0.3702855110168457, + "learning_rate": 6.3e-05, + "loss": 0.1514, + "step": 630 + }, + { + "epoch": 2.388059701492537, + "grad_norm": 0.664667010307312, + "learning_rate": 6.400000000000001e-05, + "loss": 0.1441, + "step": 640 + }, + { + "epoch": 2.425373134328358, + "grad_norm": 0.33382174372673035, + "learning_rate": 6.500000000000001e-05, + "loss": 0.1573, + "step": 650 + }, + { + "epoch": 2.4626865671641793, + "grad_norm": 0.4848814010620117, + "learning_rate": 6.6e-05, + "loss": 0.1457, + "step": 660 + }, + { + "epoch": 2.5, + "grad_norm": 0.3649997413158417, + "learning_rate": 6.7e-05, + "loss": 0.1467, + "step": 670 + }, + { + "epoch": 2.5373134328358207, + "grad_norm": 0.6385223865509033, + "learning_rate": 6.800000000000001e-05, + "loss": 0.145, + "step": 680 + }, + { + "epoch": 2.574626865671642, + "grad_norm": 0.4580625891685486, + "learning_rate": 6.9e-05, + "loss": 0.1352, + "step": 690 + }, + { + "epoch": 2.611940298507463, + "grad_norm": 0.5141746401786804, + "learning_rate": 7e-05, + "loss": 0.1444, + "step": 700 + }, + { + "epoch": 2.6492537313432836, + "grad_norm": 0.40220722556114197, + "learning_rate": 7.1e-05, + "loss": 0.1493, + "step": 710 + }, + { + "epoch": 2.6865671641791042, + "grad_norm": 0.5510571002960205, + "learning_rate": 7.2e-05, + "loss": 0.1387, + "step": 720 + }, + { + "epoch": 2.7238805970149254, + "grad_norm": 0.43814659118652344, + "learning_rate": 7.3e-05, + "loss": 0.1374, + "step": 730 + }, + { + "epoch": 2.7611940298507465, + "grad_norm": 0.4118008613586426, + "learning_rate": 7.4e-05, + "loss": 0.1297, + "step": 740 + }, + { + "epoch": 2.798507462686567, + "grad_norm": 0.5626503229141235, + "learning_rate": 7.500000000000001e-05, + "loss": 0.1299, + "step": 750 + }, + { + "epoch": 2.835820895522388, + "grad_norm": 0.4066360592842102, + "learning_rate": 7.6e-05, + "loss": 0.1102, + "step": 760 + }, + { + "epoch": 2.873134328358209, + "grad_norm": 0.47184985876083374, + "learning_rate": 7.7e-05, + "loss": 0.1219, + "step": 770 + }, + { + "epoch": 2.91044776119403, + "grad_norm": 0.6611475348472595, + "learning_rate": 7.800000000000001e-05, + "loss": 0.1267, + "step": 780 + }, + { + "epoch": 2.9477611940298507, + "grad_norm": 0.3570108413696289, + "learning_rate": 7.900000000000001e-05, + "loss": 0.1191, + "step": 790 + }, + { + "epoch": 2.9850746268656714, + "grad_norm": 0.4581681489944458, + "learning_rate": 8e-05, + "loss": 0.1209, + "step": 800 + }, + { + "epoch": 3.0223880597014925, + "grad_norm": 0.4643435776233673, + "learning_rate": 8.1e-05, + "loss": 0.129, + "step": 810 + }, + { + "epoch": 3.0597014925373136, + "grad_norm": 0.5595763921737671, + "learning_rate": 8.2e-05, + "loss": 0.1158, + "step": 820 + }, + { + "epoch": 3.0970149253731343, + "grad_norm": 0.48848605155944824, + "learning_rate": 8.3e-05, + "loss": 0.1188, + "step": 830 + }, + { + "epoch": 3.1343283582089554, + "grad_norm": 0.4496570825576782, + "learning_rate": 8.4e-05, + "loss": 0.114, + "step": 840 + }, + { + "epoch": 3.171641791044776, + "grad_norm": 0.31364986300468445, + "learning_rate": 8.5e-05, + "loss": 0.1196, + "step": 850 + }, + { + "epoch": 3.208955223880597, + "grad_norm": 0.3395878076553345, + "learning_rate": 8.6e-05, + "loss": 0.1124, + "step": 860 + }, + { + "epoch": 3.246268656716418, + "grad_norm": 0.4917413592338562, + "learning_rate": 8.7e-05, + "loss": 0.1074, + "step": 870 + }, + { + "epoch": 3.283582089552239, + "grad_norm": 0.44114553928375244, + "learning_rate": 8.800000000000001e-05, + "loss": 0.1095, + "step": 880 + }, + { + "epoch": 3.3208955223880596, + "grad_norm": 0.3323831558227539, + "learning_rate": 8.900000000000001e-05, + "loss": 0.106, + "step": 890 + }, + { + "epoch": 3.3582089552238807, + "grad_norm": 0.4495660066604614, + "learning_rate": 9e-05, + "loss": 0.1222, + "step": 900 + }, + { + "epoch": 3.3955223880597014, + "grad_norm": 0.40784788131713867, + "learning_rate": 9.1e-05, + "loss": 0.1048, + "step": 910 + }, + { + "epoch": 3.4328358208955225, + "grad_norm": 0.4643700420856476, + "learning_rate": 9.200000000000001e-05, + "loss": 0.1097, + "step": 920 + }, + { + "epoch": 3.470149253731343, + "grad_norm": 0.472494512796402, + "learning_rate": 9.300000000000001e-05, + "loss": 0.1041, + "step": 930 + }, + { + "epoch": 3.5074626865671643, + "grad_norm": 0.6110897660255432, + "learning_rate": 9.4e-05, + "loss": 0.0959, + "step": 940 + }, + { + "epoch": 3.544776119402985, + "grad_norm": 0.5313069820404053, + "learning_rate": 9.5e-05, + "loss": 0.113, + "step": 950 + }, + { + "epoch": 3.582089552238806, + "grad_norm": 0.4223133623600006, + "learning_rate": 9.6e-05, + "loss": 0.099, + "step": 960 + }, + { + "epoch": 3.6194029850746268, + "grad_norm": 0.5464731454849243, + "learning_rate": 9.7e-05, + "loss": 0.1008, + "step": 970 + }, + { + "epoch": 3.656716417910448, + "grad_norm": 0.3538314402103424, + "learning_rate": 9.8e-05, + "loss": 0.1049, + "step": 980 + }, + { + "epoch": 3.6940298507462686, + "grad_norm": 0.7460148334503174, + "learning_rate": 9.900000000000001e-05, + "loss": 0.1088, + "step": 990 + }, + { + "epoch": 3.7313432835820897, + "grad_norm": 0.3210597038269043, + "learning_rate": 0.0001, + "loss": 0.1041, + "step": 1000 + }, + { + "epoch": 3.7686567164179103, + "grad_norm": 0.4450497627258301, + "learning_rate": 9.999993165095463e-05, + "loss": 0.0985, + "step": 1010 + }, + { + "epoch": 3.8059701492537314, + "grad_norm": 0.4348960816860199, + "learning_rate": 9.999972660400536e-05, + "loss": 0.1015, + "step": 1020 + }, + { + "epoch": 3.843283582089552, + "grad_norm": 0.462782621383667, + "learning_rate": 9.999938485971279e-05, + "loss": 0.1068, + "step": 1030 + }, + { + "epoch": 3.8805970149253732, + "grad_norm": 0.3801368474960327, + "learning_rate": 9.999890641901125e-05, + "loss": 0.1117, + "step": 1040 + }, + { + "epoch": 3.917910447761194, + "grad_norm": 0.45135366916656494, + "learning_rate": 9.999829128320874e-05, + "loss": 0.0917, + "step": 1050 + }, + { + "epoch": 3.955223880597015, + "grad_norm": 0.41138389706611633, + "learning_rate": 9.999753945398704e-05, + "loss": 0.1049, + "step": 1060 + }, + { + "epoch": 3.9925373134328357, + "grad_norm": 0.4976252317428589, + "learning_rate": 9.999665093340165e-05, + "loss": 0.1029, + "step": 1070 + }, + { + "epoch": 4.029850746268656, + "grad_norm": 0.46372008323669434, + "learning_rate": 9.99956257238817e-05, + "loss": 0.1012, + "step": 1080 + }, + { + "epoch": 4.067164179104478, + "grad_norm": 0.546938955783844, + "learning_rate": 9.999446382823013e-05, + "loss": 0.0829, + "step": 1090 + }, + { + "epoch": 4.104477611940299, + "grad_norm": 0.40513405203819275, + "learning_rate": 9.999316524962345e-05, + "loss": 0.0933, + "step": 1100 + }, + { + "epoch": 4.141791044776119, + "grad_norm": 0.4198484420776367, + "learning_rate": 9.999172999161198e-05, + "loss": 0.0895, + "step": 1110 + }, + { + "epoch": 4.17910447761194, + "grad_norm": 0.3965628743171692, + "learning_rate": 9.999015805811965e-05, + "loss": 0.0917, + "step": 1120 + }, + { + "epoch": 4.2164179104477615, + "grad_norm": 0.3095884621143341, + "learning_rate": 9.998844945344405e-05, + "loss": 0.0953, + "step": 1130 + }, + { + "epoch": 4.253731343283582, + "grad_norm": 0.7962276339530945, + "learning_rate": 9.998660418225645e-05, + "loss": 0.0979, + "step": 1140 + }, + { + "epoch": 4.291044776119403, + "grad_norm": 0.42066490650177, + "learning_rate": 9.998462224960175e-05, + "loss": 0.099, + "step": 1150 + }, + { + "epoch": 4.3283582089552235, + "grad_norm": 0.3894193470478058, + "learning_rate": 9.998250366089848e-05, + "loss": 0.0887, + "step": 1160 + }, + { + "epoch": 4.365671641791045, + "grad_norm": 0.28998032212257385, + "learning_rate": 9.998024842193876e-05, + "loss": 0.0943, + "step": 1170 + }, + { + "epoch": 4.402985074626866, + "grad_norm": 0.3919823467731476, + "learning_rate": 9.997785653888835e-05, + "loss": 0.0916, + "step": 1180 + }, + { + "epoch": 4.440298507462686, + "grad_norm": 0.3708650469779968, + "learning_rate": 9.997532801828658e-05, + "loss": 0.0858, + "step": 1190 + }, + { + "epoch": 4.477611940298507, + "grad_norm": 0.2935069799423218, + "learning_rate": 9.997266286704631e-05, + "loss": 0.0992, + "step": 1200 + }, + { + "epoch": 4.514925373134329, + "grad_norm": 0.4675377607345581, + "learning_rate": 9.996986109245395e-05, + "loss": 0.0854, + "step": 1210 + }, + { + "epoch": 4.552238805970149, + "grad_norm": 0.31374865770339966, + "learning_rate": 9.996692270216947e-05, + "loss": 0.0788, + "step": 1220 + }, + { + "epoch": 4.58955223880597, + "grad_norm": 0.419249951839447, + "learning_rate": 9.996384770422629e-05, + "loss": 0.0873, + "step": 1230 + }, + { + "epoch": 4.6268656716417915, + "grad_norm": 0.26002731919288635, + "learning_rate": 9.996063610703137e-05, + "loss": 0.0845, + "step": 1240 + }, + { + "epoch": 4.664179104477612, + "grad_norm": 0.29573896527290344, + "learning_rate": 9.995728791936504e-05, + "loss": 0.091, + "step": 1250 + }, + { + "epoch": 4.701492537313433, + "grad_norm": 0.33090147376060486, + "learning_rate": 9.995380315038119e-05, + "loss": 0.0827, + "step": 1260 + }, + { + "epoch": 4.7388059701492535, + "grad_norm": 0.24417485296726227, + "learning_rate": 9.9950181809607e-05, + "loss": 0.0859, + "step": 1270 + }, + { + "epoch": 4.776119402985074, + "grad_norm": 0.48290401697158813, + "learning_rate": 9.994642390694308e-05, + "loss": 0.0889, + "step": 1280 + }, + { + "epoch": 4.813432835820896, + "grad_norm": 0.4479697048664093, + "learning_rate": 9.99425294526634e-05, + "loss": 0.097, + "step": 1290 + }, + { + "epoch": 4.850746268656716, + "grad_norm": 0.3560147285461426, + "learning_rate": 9.993849845741524e-05, + "loss": 0.0904, + "step": 1300 + }, + { + "epoch": 4.888059701492537, + "grad_norm": 0.6645416617393494, + "learning_rate": 9.99343309322192e-05, + "loss": 0.0922, + "step": 1310 + }, + { + "epoch": 4.925373134328359, + "grad_norm": 0.29696759581565857, + "learning_rate": 9.993002688846913e-05, + "loss": 0.093, + "step": 1320 + }, + { + "epoch": 4.962686567164179, + "grad_norm": 0.47146692872047424, + "learning_rate": 9.992558633793212e-05, + "loss": 0.085, + "step": 1330 + }, + { + "epoch": 5.0, + "grad_norm": 0.3430916368961334, + "learning_rate": 9.992100929274846e-05, + "loss": 0.0805, + "step": 1340 + }, + { + "epoch": 5.037313432835821, + "grad_norm": 0.3205055892467499, + "learning_rate": 9.991629576543163e-05, + "loss": 0.0766, + "step": 1350 + }, + { + "epoch": 5.074626865671641, + "grad_norm": 0.3664805293083191, + "learning_rate": 9.991144576886823e-05, + "loss": 0.0766, + "step": 1360 + }, + { + "epoch": 5.111940298507463, + "grad_norm": 0.3753412663936615, + "learning_rate": 9.990645931631796e-05, + "loss": 0.0688, + "step": 1370 + }, + { + "epoch": 5.149253731343284, + "grad_norm": 0.31633055210113525, + "learning_rate": 9.990133642141359e-05, + "loss": 0.0796, + "step": 1380 + }, + { + "epoch": 5.186567164179104, + "grad_norm": 0.3355732262134552, + "learning_rate": 9.989607709816091e-05, + "loss": 0.0716, + "step": 1390 + }, + { + "epoch": 5.223880597014926, + "grad_norm": 0.24850831925868988, + "learning_rate": 9.989068136093873e-05, + "loss": 0.0778, + "step": 1400 + }, + { + "epoch": 5.2611940298507465, + "grad_norm": 0.29537102580070496, + "learning_rate": 9.988514922449879e-05, + "loss": 0.0759, + "step": 1410 + }, + { + "epoch": 5.298507462686567, + "grad_norm": 0.3430945873260498, + "learning_rate": 9.987948070396571e-05, + "loss": 0.0774, + "step": 1420 + }, + { + "epoch": 5.335820895522388, + "grad_norm": 0.5220637917518616, + "learning_rate": 9.987367581483705e-05, + "loss": 0.0836, + "step": 1430 + }, + { + "epoch": 5.373134328358209, + "grad_norm": 0.28184008598327637, + "learning_rate": 9.986773457298311e-05, + "loss": 0.0752, + "step": 1440 + }, + { + "epoch": 5.41044776119403, + "grad_norm": 0.36261311173439026, + "learning_rate": 9.986165699464705e-05, + "loss": 0.075, + "step": 1450 + }, + { + "epoch": 5.447761194029851, + "grad_norm": 0.5107380151748657, + "learning_rate": 9.985544309644475e-05, + "loss": 0.0814, + "step": 1460 + }, + { + "epoch": 5.485074626865671, + "grad_norm": 0.2446671426296234, + "learning_rate": 9.984909289536473e-05, + "loss": 0.0704, + "step": 1470 + }, + { + "epoch": 5.522388059701493, + "grad_norm": 0.30449381470680237, + "learning_rate": 9.984260640876821e-05, + "loss": 0.0794, + "step": 1480 + }, + { + "epoch": 5.559701492537314, + "grad_norm": 0.25645050406455994, + "learning_rate": 9.983598365438902e-05, + "loss": 0.0709, + "step": 1490 + }, + { + "epoch": 5.597014925373134, + "grad_norm": 0.23825006186962128, + "learning_rate": 9.98292246503335e-05, + "loss": 0.0828, + "step": 1500 + }, + { + "epoch": 5.634328358208955, + "grad_norm": 0.3259269893169403, + "learning_rate": 9.98223294150805e-05, + "loss": 0.0824, + "step": 1510 + }, + { + "epoch": 5.6716417910447765, + "grad_norm": 0.24058914184570312, + "learning_rate": 9.981529796748134e-05, + "loss": 0.073, + "step": 1520 + }, + { + "epoch": 5.708955223880597, + "grad_norm": 0.34457242488861084, + "learning_rate": 9.980813032675974e-05, + "loss": 0.0845, + "step": 1530 + }, + { + "epoch": 5.746268656716418, + "grad_norm": 0.32940393686294556, + "learning_rate": 9.980082651251175e-05, + "loss": 0.0832, + "step": 1540 + }, + { + "epoch": 5.7835820895522385, + "grad_norm": 0.5683007836341858, + "learning_rate": 9.979338654470569e-05, + "loss": 0.0836, + "step": 1550 + }, + { + "epoch": 5.82089552238806, + "grad_norm": 0.31041061878204346, + "learning_rate": 9.97858104436822e-05, + "loss": 0.07, + "step": 1560 + }, + { + "epoch": 5.858208955223881, + "grad_norm": 0.37858131527900696, + "learning_rate": 9.977809823015401e-05, + "loss": 0.0738, + "step": 1570 + }, + { + "epoch": 5.895522388059701, + "grad_norm": 0.2743091583251953, + "learning_rate": 9.977024992520602e-05, + "loss": 0.0761, + "step": 1580 + }, + { + "epoch": 5.932835820895522, + "grad_norm": 0.29117098450660706, + "learning_rate": 9.976226555029522e-05, + "loss": 0.0777, + "step": 1590 + }, + { + "epoch": 5.970149253731344, + "grad_norm": 0.31398633122444153, + "learning_rate": 9.975414512725057e-05, + "loss": 0.0664, + "step": 1600 + }, + { + "epoch": 6.007462686567164, + "grad_norm": 0.2684272527694702, + "learning_rate": 9.974588867827301e-05, + "loss": 0.0686, + "step": 1610 + }, + { + "epoch": 6.044776119402985, + "grad_norm": 0.3945397436618805, + "learning_rate": 9.973749622593534e-05, + "loss": 0.0614, + "step": 1620 + }, + { + "epoch": 6.082089552238806, + "grad_norm": 0.2747954726219177, + "learning_rate": 9.972896779318219e-05, + "loss": 0.0681, + "step": 1630 + }, + { + "epoch": 6.119402985074627, + "grad_norm": 0.43257200717926025, + "learning_rate": 9.972030340333001e-05, + "loss": 0.0725, + "step": 1640 + }, + { + "epoch": 6.156716417910448, + "grad_norm": 0.3559250831604004, + "learning_rate": 9.97115030800669e-05, + "loss": 0.0804, + "step": 1650 + }, + { + "epoch": 6.1940298507462686, + "grad_norm": 0.3079264760017395, + "learning_rate": 9.970256684745258e-05, + "loss": 0.0649, + "step": 1660 + }, + { + "epoch": 6.231343283582089, + "grad_norm": 0.32298946380615234, + "learning_rate": 9.969349472991838e-05, + "loss": 0.0668, + "step": 1670 + }, + { + "epoch": 6.268656716417911, + "grad_norm": 0.2826225459575653, + "learning_rate": 9.968428675226714e-05, + "loss": 0.0734, + "step": 1680 + }, + { + "epoch": 6.3059701492537314, + "grad_norm": 0.39002349972724915, + "learning_rate": 9.967494293967312e-05, + "loss": 0.0728, + "step": 1690 + }, + { + "epoch": 6.343283582089552, + "grad_norm": 0.403890997171402, + "learning_rate": 9.966546331768191e-05, + "loss": 0.067, + "step": 1700 + }, + { + "epoch": 6.380597014925373, + "grad_norm": 0.3755359351634979, + "learning_rate": 9.965584791221048e-05, + "loss": 0.0755, + "step": 1710 + }, + { + "epoch": 6.417910447761194, + "grad_norm": 0.26346635818481445, + "learning_rate": 9.964609674954696e-05, + "loss": 0.0728, + "step": 1720 + }, + { + "epoch": 6.455223880597015, + "grad_norm": 0.45292145013809204, + "learning_rate": 9.963620985635065e-05, + "loss": 0.0731, + "step": 1730 + }, + { + "epoch": 6.492537313432836, + "grad_norm": 0.3568434715270996, + "learning_rate": 9.962618725965196e-05, + "loss": 0.0761, + "step": 1740 + }, + { + "epoch": 6.529850746268656, + "grad_norm": 0.2551257014274597, + "learning_rate": 9.961602898685226e-05, + "loss": 0.0694, + "step": 1750 + }, + { + "epoch": 6.567164179104478, + "grad_norm": 0.6106354594230652, + "learning_rate": 9.96057350657239e-05, + "loss": 0.0827, + "step": 1760 + }, + { + "epoch": 6.604477611940299, + "grad_norm": 0.3226093053817749, + "learning_rate": 9.959530552441005e-05, + "loss": 0.0716, + "step": 1770 + }, + { + "epoch": 6.641791044776119, + "grad_norm": 0.4297254979610443, + "learning_rate": 9.95847403914247e-05, + "loss": 0.0748, + "step": 1780 + }, + { + "epoch": 6.67910447761194, + "grad_norm": 0.26469680666923523, + "learning_rate": 9.95740396956525e-05, + "loss": 0.074, + "step": 1790 + }, + { + "epoch": 6.7164179104477615, + "grad_norm": 0.22717897593975067, + "learning_rate": 9.956320346634876e-05, + "loss": 0.0739, + "step": 1800 + }, + { + "epoch": 6.753731343283582, + "grad_norm": 0.4513498544692993, + "learning_rate": 9.955223173313931e-05, + "loss": 0.0664, + "step": 1810 + }, + { + "epoch": 6.791044776119403, + "grad_norm": 0.31683439016342163, + "learning_rate": 9.954112452602045e-05, + "loss": 0.069, + "step": 1820 + }, + { + "epoch": 6.8283582089552235, + "grad_norm": 0.3350532650947571, + "learning_rate": 9.952988187535886e-05, + "loss": 0.0699, + "step": 1830 + }, + { + "epoch": 6.865671641791045, + "grad_norm": 0.29829463362693787, + "learning_rate": 9.95185038118915e-05, + "loss": 0.0663, + "step": 1840 + }, + { + "epoch": 6.902985074626866, + "grad_norm": 0.31650781631469727, + "learning_rate": 9.950699036672559e-05, + "loss": 0.0668, + "step": 1850 + }, + { + "epoch": 6.940298507462686, + "grad_norm": 0.360944926738739, + "learning_rate": 9.949534157133844e-05, + "loss": 0.0696, + "step": 1860 + }, + { + "epoch": 6.977611940298507, + "grad_norm": 0.31337013840675354, + "learning_rate": 9.948355745757741e-05, + "loss": 0.073, + "step": 1870 + }, + { + "epoch": 7.014925373134329, + "grad_norm": 0.4675919711589813, + "learning_rate": 9.94716380576598e-05, + "loss": 0.0688, + "step": 1880 + }, + { + "epoch": 7.052238805970149, + "grad_norm": 0.3031919002532959, + "learning_rate": 9.945958340417283e-05, + "loss": 0.0596, + "step": 1890 + }, + { + "epoch": 7.08955223880597, + "grad_norm": 0.24858474731445312, + "learning_rate": 9.944739353007344e-05, + "loss": 0.0717, + "step": 1900 + }, + { + "epoch": 7.126865671641791, + "grad_norm": 0.20959483087062836, + "learning_rate": 9.943506846868826e-05, + "loss": 0.0694, + "step": 1910 + }, + { + "epoch": 7.164179104477612, + "grad_norm": 0.35621434450149536, + "learning_rate": 9.942260825371358e-05, + "loss": 0.063, + "step": 1920 + }, + { + "epoch": 7.201492537313433, + "grad_norm": 0.3462587594985962, + "learning_rate": 9.941001291921512e-05, + "loss": 0.068, + "step": 1930 + }, + { + "epoch": 7.2388059701492535, + "grad_norm": 0.38649681210517883, + "learning_rate": 9.939728249962807e-05, + "loss": 0.0638, + "step": 1940 + }, + { + "epoch": 7.276119402985074, + "grad_norm": 0.29564595222473145, + "learning_rate": 9.938441702975689e-05, + "loss": 0.0626, + "step": 1950 + }, + { + "epoch": 7.313432835820896, + "grad_norm": 0.339857816696167, + "learning_rate": 9.937141654477528e-05, + "loss": 0.0535, + "step": 1960 + }, + { + "epoch": 7.350746268656716, + "grad_norm": 0.2591215670108795, + "learning_rate": 9.93582810802261e-05, + "loss": 0.0645, + "step": 1970 + }, + { + "epoch": 7.388059701492537, + "grad_norm": 0.30237796902656555, + "learning_rate": 9.934501067202117e-05, + "loss": 0.0675, + "step": 1980 + }, + { + "epoch": 7.425373134328359, + "grad_norm": 0.28394174575805664, + "learning_rate": 9.93316053564413e-05, + "loss": 0.0643, + "step": 1990 + }, + { + "epoch": 7.462686567164179, + "grad_norm": 0.3124663233757019, + "learning_rate": 9.931806517013612e-05, + "loss": 0.059, + "step": 2000 + }, + { + "epoch": 7.5, + "grad_norm": 0.36073037981987, + "learning_rate": 9.930439015012396e-05, + "loss": 0.0606, + "step": 2010 + }, + { + "epoch": 7.537313432835821, + "grad_norm": 0.4091481864452362, + "learning_rate": 9.929058033379181e-05, + "loss": 0.0603, + "step": 2020 + }, + { + "epoch": 7.574626865671641, + "grad_norm": 0.44718074798583984, + "learning_rate": 9.927663575889521e-05, + "loss": 0.0741, + "step": 2030 + }, + { + "epoch": 7.611940298507463, + "grad_norm": 0.3819601833820343, + "learning_rate": 9.926255646355804e-05, + "loss": 0.0707, + "step": 2040 + }, + { + "epoch": 7.649253731343284, + "grad_norm": 0.23336420953273773, + "learning_rate": 9.92483424862726e-05, + "loss": 0.0676, + "step": 2050 + }, + { + "epoch": 7.686567164179104, + "grad_norm": 0.24415315687656403, + "learning_rate": 9.923399386589933e-05, + "loss": 0.0594, + "step": 2060 + }, + { + "epoch": 7.723880597014926, + "grad_norm": 0.3735473155975342, + "learning_rate": 9.921951064166684e-05, + "loss": 0.062, + "step": 2070 + }, + { + "epoch": 7.7611940298507465, + "grad_norm": 0.31629472970962524, + "learning_rate": 9.92048928531717e-05, + "loss": 0.0606, + "step": 2080 + }, + { + "epoch": 7.798507462686567, + "grad_norm": 0.37902557849884033, + "learning_rate": 9.919014054037836e-05, + "loss": 0.0584, + "step": 2090 + }, + { + "epoch": 7.835820895522388, + "grad_norm": 0.3486720323562622, + "learning_rate": 9.917525374361912e-05, + "loss": 0.056, + "step": 2100 + }, + { + "epoch": 7.8731343283582085, + "grad_norm": 0.3731362521648407, + "learning_rate": 9.91602325035939e-05, + "loss": 0.0601, + "step": 2110 + }, + { + "epoch": 7.91044776119403, + "grad_norm": 0.3560399115085602, + "learning_rate": 9.914507686137019e-05, + "loss": 0.06, + "step": 2120 + }, + { + "epoch": 7.947761194029851, + "grad_norm": 0.30075564980506897, + "learning_rate": 9.912978685838294e-05, + "loss": 0.0657, + "step": 2130 + }, + { + "epoch": 7.985074626865671, + "grad_norm": 0.2984028458595276, + "learning_rate": 9.911436253643445e-05, + "loss": 0.0587, + "step": 2140 + }, + { + "epoch": 8.022388059701493, + "grad_norm": 0.1980169117450714, + "learning_rate": 9.90988039376942e-05, + "loss": 0.0718, + "step": 2150 + }, + { + "epoch": 8.059701492537313, + "grad_norm": 0.31339579820632935, + "learning_rate": 9.90831111046988e-05, + "loss": 0.0557, + "step": 2160 + }, + { + "epoch": 8.097014925373134, + "grad_norm": 0.1968696266412735, + "learning_rate": 9.90672840803519e-05, + "loss": 0.0571, + "step": 2170 + }, + { + "epoch": 8.134328358208956, + "grad_norm": 0.23931682109832764, + "learning_rate": 9.905132290792394e-05, + "loss": 0.0566, + "step": 2180 + }, + { + "epoch": 8.171641791044776, + "grad_norm": 0.21741189062595367, + "learning_rate": 9.903522763105218e-05, + "loss": 0.0575, + "step": 2190 + }, + { + "epoch": 8.208955223880597, + "grad_norm": 0.22874368727207184, + "learning_rate": 9.901899829374047e-05, + "loss": 0.0565, + "step": 2200 + }, + { + "epoch": 8.246268656716419, + "grad_norm": 0.3441888093948364, + "learning_rate": 9.900263494035921e-05, + "loss": 0.0565, + "step": 2210 + }, + { + "epoch": 8.283582089552239, + "grad_norm": 0.2539830803871155, + "learning_rate": 9.89861376156452e-05, + "loss": 0.0538, + "step": 2220 + }, + { + "epoch": 8.32089552238806, + "grad_norm": 0.2235102653503418, + "learning_rate": 9.896950636470147e-05, + "loss": 0.0609, + "step": 2230 + }, + { + "epoch": 8.35820895522388, + "grad_norm": 0.1941322684288025, + "learning_rate": 9.895274123299723e-05, + "loss": 0.0562, + "step": 2240 + }, + { + "epoch": 8.395522388059701, + "grad_norm": 0.2691369950771332, + "learning_rate": 9.893584226636772e-05, + "loss": 0.0608, + "step": 2250 + }, + { + "epoch": 8.432835820895523, + "grad_norm": 0.24730461835861206, + "learning_rate": 9.891880951101407e-05, + "loss": 0.0582, + "step": 2260 + }, + { + "epoch": 8.470149253731343, + "grad_norm": 0.34785839915275574, + "learning_rate": 9.890164301350318e-05, + "loss": 0.0506, + "step": 2270 + }, + { + "epoch": 8.507462686567164, + "grad_norm": 0.3625825345516205, + "learning_rate": 9.888434282076758e-05, + "loss": 0.0614, + "step": 2280 + }, + { + "epoch": 8.544776119402986, + "grad_norm": 0.25210148096084595, + "learning_rate": 9.886690898010535e-05, + "loss": 0.0611, + "step": 2290 + }, + { + "epoch": 8.582089552238806, + "grad_norm": 0.27312466502189636, + "learning_rate": 9.884934153917997e-05, + "loss": 0.0537, + "step": 2300 + }, + { + "epoch": 8.619402985074627, + "grad_norm": 0.314647912979126, + "learning_rate": 9.883164054602012e-05, + "loss": 0.0602, + "step": 2310 + }, + { + "epoch": 8.656716417910447, + "grad_norm": 0.21531912684440613, + "learning_rate": 9.881380604901964e-05, + "loss": 0.0552, + "step": 2320 + }, + { + "epoch": 8.694029850746269, + "grad_norm": 0.23920664191246033, + "learning_rate": 9.879583809693738e-05, + "loss": 0.0613, + "step": 2330 + }, + { + "epoch": 8.73134328358209, + "grad_norm": 0.21864956617355347, + "learning_rate": 9.877773673889701e-05, + "loss": 0.0649, + "step": 2340 + }, + { + "epoch": 8.76865671641791, + "grad_norm": 0.27523377537727356, + "learning_rate": 9.8759502024387e-05, + "loss": 0.0606, + "step": 2350 + }, + { + "epoch": 8.805970149253731, + "grad_norm": 0.24805469810962677, + "learning_rate": 9.87411340032603e-05, + "loss": 0.0549, + "step": 2360 + }, + { + "epoch": 8.843283582089553, + "grad_norm": 0.23070092499256134, + "learning_rate": 9.872263272573443e-05, + "loss": 0.0562, + "step": 2370 + }, + { + "epoch": 8.880597014925373, + "grad_norm": 0.20833946764469147, + "learning_rate": 9.870399824239117e-05, + "loss": 0.05, + "step": 2380 + }, + { + "epoch": 8.917910447761194, + "grad_norm": 0.34507372975349426, + "learning_rate": 9.868523060417646e-05, + "loss": 0.0613, + "step": 2390 + }, + { + "epoch": 8.955223880597014, + "grad_norm": 0.32865110039711, + "learning_rate": 9.86663298624003e-05, + "loss": 0.0621, + "step": 2400 + }, + { + "epoch": 8.992537313432836, + "grad_norm": 0.21305270493030548, + "learning_rate": 9.864729606873663e-05, + "loss": 0.0572, + "step": 2410 + }, + { + "epoch": 9.029850746268657, + "grad_norm": 0.28193730115890503, + "learning_rate": 9.862812927522309e-05, + "loss": 0.0555, + "step": 2420 + }, + { + "epoch": 9.067164179104477, + "grad_norm": 0.3953789472579956, + "learning_rate": 9.860882953426099e-05, + "loss": 0.0536, + "step": 2430 + }, + { + "epoch": 9.104477611940299, + "grad_norm": 0.23013322055339813, + "learning_rate": 9.858939689861506e-05, + "loss": 0.0572, + "step": 2440 + }, + { + "epoch": 9.14179104477612, + "grad_norm": 0.2906680107116699, + "learning_rate": 9.856983142141339e-05, + "loss": 0.0592, + "step": 2450 + }, + { + "epoch": 9.17910447761194, + "grad_norm": 0.23490828275680542, + "learning_rate": 9.855013315614725e-05, + "loss": 0.0583, + "step": 2460 + }, + { + "epoch": 9.216417910447761, + "grad_norm": 0.22825880348682404, + "learning_rate": 9.853030215667093e-05, + "loss": 0.059, + "step": 2470 + }, + { + "epoch": 9.253731343283581, + "grad_norm": 0.25871285796165466, + "learning_rate": 9.851033847720166e-05, + "loss": 0.0555, + "step": 2480 + }, + { + "epoch": 9.291044776119403, + "grad_norm": 0.27220776677131653, + "learning_rate": 9.849024217231935e-05, + "loss": 0.0542, + "step": 2490 + }, + { + "epoch": 9.328358208955224, + "grad_norm": 0.26534005999565125, + "learning_rate": 9.847001329696653e-05, + "loss": 0.0526, + "step": 2500 + }, + { + "epoch": 9.365671641791044, + "grad_norm": 0.33486032485961914, + "learning_rate": 9.844965190644817e-05, + "loss": 0.0563, + "step": 2510 + }, + { + "epoch": 9.402985074626866, + "grad_norm": 0.2949483394622803, + "learning_rate": 9.842915805643155e-05, + "loss": 0.0556, + "step": 2520 + }, + { + "epoch": 9.440298507462687, + "grad_norm": 0.24123981595039368, + "learning_rate": 9.840853180294608e-05, + "loss": 0.05, + "step": 2530 + }, + { + "epoch": 9.477611940298507, + "grad_norm": 0.22536049783229828, + "learning_rate": 9.838777320238312e-05, + "loss": 0.0522, + "step": 2540 + }, + { + "epoch": 9.514925373134329, + "grad_norm": 0.23206663131713867, + "learning_rate": 9.836688231149592e-05, + "loss": 0.0591, + "step": 2550 + }, + { + "epoch": 9.552238805970148, + "grad_norm": 0.28573134541511536, + "learning_rate": 9.834585918739936e-05, + "loss": 0.0568, + "step": 2560 + }, + { + "epoch": 9.58955223880597, + "grad_norm": 0.2628820538520813, + "learning_rate": 9.832470388756987e-05, + "loss": 0.0571, + "step": 2570 + }, + { + "epoch": 9.626865671641792, + "grad_norm": 0.2880440652370453, + "learning_rate": 9.830341646984521e-05, + "loss": 0.0559, + "step": 2580 + }, + { + "epoch": 9.664179104477611, + "grad_norm": 0.1786259263753891, + "learning_rate": 9.82819969924244e-05, + "loss": 0.058, + "step": 2590 + }, + { + "epoch": 9.701492537313433, + "grad_norm": 0.3501608073711395, + "learning_rate": 9.826044551386744e-05, + "loss": 0.0523, + "step": 2600 + }, + { + "epoch": 9.738805970149254, + "grad_norm": 0.24757252633571625, + "learning_rate": 9.823876209309527e-05, + "loss": 0.0587, + "step": 2610 + }, + { + "epoch": 9.776119402985074, + "grad_norm": 0.2556290626525879, + "learning_rate": 9.821694678938953e-05, + "loss": 0.0555, + "step": 2620 + }, + { + "epoch": 9.813432835820896, + "grad_norm": 0.2561217248439789, + "learning_rate": 9.819499966239243e-05, + "loss": 0.052, + "step": 2630 + }, + { + "epoch": 9.850746268656717, + "grad_norm": 0.2776634097099304, + "learning_rate": 9.817292077210659e-05, + "loss": 0.0498, + "step": 2640 + }, + { + "epoch": 9.888059701492537, + "grad_norm": 0.20668549835681915, + "learning_rate": 9.815071017889482e-05, + "loss": 0.0517, + "step": 2650 + }, + { + "epoch": 9.925373134328359, + "grad_norm": 0.3100263178348541, + "learning_rate": 9.812836794348004e-05, + "loss": 0.0633, + "step": 2660 + }, + { + "epoch": 9.962686567164178, + "grad_norm": 0.2780782878398895, + "learning_rate": 9.81058941269451e-05, + "loss": 0.0581, + "step": 2670 + }, + { + "epoch": 10.0, + "grad_norm": 0.28903728723526, + "learning_rate": 9.808328879073251e-05, + "loss": 0.0538, + "step": 2680 + }, + { + "epoch": 10.037313432835822, + "grad_norm": 0.22727562487125397, + "learning_rate": 9.806055199664446e-05, + "loss": 0.0491, + "step": 2690 + }, + { + "epoch": 10.074626865671641, + "grad_norm": 0.267918199300766, + "learning_rate": 9.803768380684242e-05, + "loss": 0.0562, + "step": 2700 + }, + { + "epoch": 10.111940298507463, + "grad_norm": 0.2988606095314026, + "learning_rate": 9.801468428384716e-05, + "loss": 0.0566, + "step": 2710 + }, + { + "epoch": 10.149253731343283, + "grad_norm": 0.2710281312465668, + "learning_rate": 9.799155349053851e-05, + "loss": 0.0541, + "step": 2720 + }, + { + "epoch": 10.186567164179104, + "grad_norm": 0.15320520102977753, + "learning_rate": 9.796829149015517e-05, + "loss": 0.0548, + "step": 2730 + }, + { + "epoch": 10.223880597014926, + "grad_norm": 0.2653089463710785, + "learning_rate": 9.794489834629455e-05, + "loss": 0.0599, + "step": 2740 + }, + { + "epoch": 10.261194029850746, + "grad_norm": 0.19223959743976593, + "learning_rate": 9.792137412291265e-05, + "loss": 0.0494, + "step": 2750 + }, + { + "epoch": 10.298507462686567, + "grad_norm": 0.20455987751483917, + "learning_rate": 9.789771888432375e-05, + "loss": 0.0538, + "step": 2760 + }, + { + "epoch": 10.335820895522389, + "grad_norm": 0.24908749759197235, + "learning_rate": 9.787393269520039e-05, + "loss": 0.0481, + "step": 2770 + }, + { + "epoch": 10.373134328358208, + "grad_norm": 0.3131813406944275, + "learning_rate": 9.785001562057309e-05, + "loss": 0.0526, + "step": 2780 + }, + { + "epoch": 10.41044776119403, + "grad_norm": 0.24828971922397614, + "learning_rate": 9.782596772583026e-05, + "loss": 0.0489, + "step": 2790 + }, + { + "epoch": 10.447761194029852, + "grad_norm": 0.21727119386196136, + "learning_rate": 9.780178907671789e-05, + "loss": 0.0532, + "step": 2800 + }, + { + "epoch": 10.485074626865671, + "grad_norm": 0.20279547572135925, + "learning_rate": 9.777747973933948e-05, + "loss": 0.0565, + "step": 2810 + }, + { + "epoch": 10.522388059701493, + "grad_norm": 0.17726702988147736, + "learning_rate": 9.775303978015585e-05, + "loss": 0.0437, + "step": 2820 + }, + { + "epoch": 10.559701492537313, + "grad_norm": 0.18961119651794434, + "learning_rate": 9.772846926598491e-05, + "loss": 0.0584, + "step": 2830 + }, + { + "epoch": 10.597014925373134, + "grad_norm": 0.2498980015516281, + "learning_rate": 9.77037682640015e-05, + "loss": 0.0496, + "step": 2840 + }, + { + "epoch": 10.634328358208956, + "grad_norm": 0.16978798806667328, + "learning_rate": 9.767893684173721e-05, + "loss": 0.0469, + "step": 2850 + }, + { + "epoch": 10.671641791044776, + "grad_norm": 0.16128584742546082, + "learning_rate": 9.765397506708023e-05, + "loss": 0.0533, + "step": 2860 + }, + { + "epoch": 10.708955223880597, + "grad_norm": 0.20463155210018158, + "learning_rate": 9.762888300827507e-05, + "loss": 0.0464, + "step": 2870 + }, + { + "epoch": 10.746268656716419, + "grad_norm": 0.30601629614830017, + "learning_rate": 9.760366073392246e-05, + "loss": 0.0489, + "step": 2880 + }, + { + "epoch": 10.783582089552239, + "grad_norm": 0.2730671763420105, + "learning_rate": 9.757830831297914e-05, + "loss": 0.0495, + "step": 2890 + }, + { + "epoch": 10.82089552238806, + "grad_norm": 0.251432865858078, + "learning_rate": 9.755282581475769e-05, + "loss": 0.0549, + "step": 2900 + }, + { + "epoch": 10.85820895522388, + "grad_norm": 0.26670166850090027, + "learning_rate": 9.752721330892624e-05, + "loss": 0.061, + "step": 2910 + }, + { + "epoch": 10.895522388059701, + "grad_norm": 0.2965967655181885, + "learning_rate": 9.750147086550844e-05, + "loss": 0.0473, + "step": 2920 + }, + { + "epoch": 10.932835820895523, + "grad_norm": 0.683840274810791, + "learning_rate": 9.747559855488313e-05, + "loss": 0.0509, + "step": 2930 + }, + { + "epoch": 10.970149253731343, + "grad_norm": 0.25740495324134827, + "learning_rate": 9.744959644778422e-05, + "loss": 0.0515, + "step": 2940 + }, + { + "epoch": 11.007462686567164, + "grad_norm": 0.2880542278289795, + "learning_rate": 9.742346461530048e-05, + "loss": 0.0482, + "step": 2950 + }, + { + "epoch": 11.044776119402986, + "grad_norm": 0.45032551884651184, + "learning_rate": 9.739720312887535e-05, + "loss": 0.0557, + "step": 2960 + }, + { + "epoch": 11.082089552238806, + "grad_norm": 0.2829900085926056, + "learning_rate": 9.73708120603067e-05, + "loss": 0.052, + "step": 2970 + }, + { + "epoch": 11.119402985074627, + "grad_norm": 0.309597373008728, + "learning_rate": 9.734429148174675e-05, + "loss": 0.0541, + "step": 2980 + }, + { + "epoch": 11.156716417910447, + "grad_norm": 0.2433389127254486, + "learning_rate": 9.731764146570173e-05, + "loss": 0.0482, + "step": 2990 + }, + { + "epoch": 11.194029850746269, + "grad_norm": 0.24458132684230804, + "learning_rate": 9.729086208503174e-05, + "loss": 0.0505, + "step": 3000 + }, + { + "epoch": 11.23134328358209, + "grad_norm": 0.2305087298154831, + "learning_rate": 9.726395341295062e-05, + "loss": 0.0504, + "step": 3010 + }, + { + "epoch": 11.26865671641791, + "grad_norm": 0.18110457062721252, + "learning_rate": 9.723691552302562e-05, + "loss": 0.0575, + "step": 3020 + }, + { + "epoch": 11.305970149253731, + "grad_norm": 0.20407621562480927, + "learning_rate": 9.720974848917735e-05, + "loss": 0.0494, + "step": 3030 + }, + { + "epoch": 11.343283582089553, + "grad_norm": 0.25924697518348694, + "learning_rate": 9.718245238567939e-05, + "loss": 0.0472, + "step": 3040 + }, + { + "epoch": 11.380597014925373, + "grad_norm": 0.23041822016239166, + "learning_rate": 9.715502728715826e-05, + "loss": 0.0481, + "step": 3050 + }, + { + "epoch": 11.417910447761194, + "grad_norm": 0.25381171703338623, + "learning_rate": 9.712747326859315e-05, + "loss": 0.0543, + "step": 3060 + }, + { + "epoch": 11.455223880597014, + "grad_norm": 0.18027640879154205, + "learning_rate": 9.709979040531569e-05, + "loss": 0.055, + "step": 3070 + }, + { + "epoch": 11.492537313432836, + "grad_norm": 0.2954868674278259, + "learning_rate": 9.707197877300974e-05, + "loss": 0.0473, + "step": 3080 + }, + { + "epoch": 11.529850746268657, + "grad_norm": 0.25323861837387085, + "learning_rate": 9.704403844771128e-05, + "loss": 0.0509, + "step": 3090 + }, + { + "epoch": 11.567164179104477, + "grad_norm": 0.36910176277160645, + "learning_rate": 9.701596950580806e-05, + "loss": 0.0504, + "step": 3100 + }, + { + "epoch": 11.604477611940299, + "grad_norm": 0.34199246764183044, + "learning_rate": 9.698777202403953e-05, + "loss": 0.0526, + "step": 3110 + }, + { + "epoch": 11.64179104477612, + "grad_norm": 0.2146557718515396, + "learning_rate": 9.695944607949649e-05, + "loss": 0.0579, + "step": 3120 + }, + { + "epoch": 11.67910447761194, + "grad_norm": 0.20559175312519073, + "learning_rate": 9.693099174962103e-05, + "loss": 0.0514, + "step": 3130 + }, + { + "epoch": 11.716417910447761, + "grad_norm": 0.2689419090747833, + "learning_rate": 9.690240911220618e-05, + "loss": 0.0534, + "step": 3140 + }, + { + "epoch": 11.753731343283581, + "grad_norm": 0.34870603680610657, + "learning_rate": 9.687369824539577e-05, + "loss": 0.0485, + "step": 3150 + }, + { + "epoch": 11.791044776119403, + "grad_norm": 0.15433363616466522, + "learning_rate": 9.684485922768422e-05, + "loss": 0.0418, + "step": 3160 + }, + { + "epoch": 11.828358208955224, + "grad_norm": 0.26874423027038574, + "learning_rate": 9.681589213791633e-05, + "loss": 0.0537, + "step": 3170 + }, + { + "epoch": 11.865671641791044, + "grad_norm": 0.3361654281616211, + "learning_rate": 9.6786797055287e-05, + "loss": 0.0474, + "step": 3180 + }, + { + "epoch": 11.902985074626866, + "grad_norm": 0.17938771843910217, + "learning_rate": 9.675757405934103e-05, + "loss": 0.0443, + "step": 3190 + }, + { + "epoch": 11.940298507462687, + "grad_norm": 0.31368622183799744, + "learning_rate": 9.672822322997305e-05, + "loss": 0.0594, + "step": 3200 + }, + { + "epoch": 11.977611940298507, + "grad_norm": 0.16268151998519897, + "learning_rate": 9.669874464742705e-05, + "loss": 0.0487, + "step": 3210 + }, + { + "epoch": 12.014925373134329, + "grad_norm": 0.23879969120025635, + "learning_rate": 9.66691383922964e-05, + "loss": 0.0484, + "step": 3220 + }, + { + "epoch": 12.052238805970148, + "grad_norm": 0.2321789413690567, + "learning_rate": 9.663940454552342e-05, + "loss": 0.051, + "step": 3230 + }, + { + "epoch": 12.08955223880597, + "grad_norm": 0.22873088717460632, + "learning_rate": 9.660954318839933e-05, + "loss": 0.0406, + "step": 3240 + }, + { + "epoch": 12.126865671641792, + "grad_norm": 0.3767557740211487, + "learning_rate": 9.657955440256395e-05, + "loss": 0.0432, + "step": 3250 + }, + { + "epoch": 12.164179104477611, + "grad_norm": 0.21569453179836273, + "learning_rate": 9.654943827000548e-05, + "loss": 0.0528, + "step": 3260 + }, + { + "epoch": 12.201492537313433, + "grad_norm": 0.23698291182518005, + "learning_rate": 9.651919487306025e-05, + "loss": 0.0457, + "step": 3270 + }, + { + "epoch": 12.238805970149254, + "grad_norm": 0.21086478233337402, + "learning_rate": 9.648882429441257e-05, + "loss": 0.0508, + "step": 3280 + }, + { + "epoch": 12.276119402985074, + "grad_norm": 0.19763463735580444, + "learning_rate": 9.645832661709444e-05, + "loss": 0.0497, + "step": 3290 + }, + { + "epoch": 12.313432835820896, + "grad_norm": 0.18413852155208588, + "learning_rate": 9.642770192448536e-05, + "loss": 0.0441, + "step": 3300 + }, + { + "epoch": 12.350746268656717, + "grad_norm": 0.13946911692619324, + "learning_rate": 9.639695030031204e-05, + "loss": 0.0453, + "step": 3310 + }, + { + "epoch": 12.388059701492537, + "grad_norm": 0.21613670885562897, + "learning_rate": 9.636607182864827e-05, + "loss": 0.0511, + "step": 3320 + }, + { + "epoch": 12.425373134328359, + "grad_norm": 0.24953646957874298, + "learning_rate": 9.63350665939146e-05, + "loss": 0.0451, + "step": 3330 + }, + { + "epoch": 12.462686567164178, + "grad_norm": 0.2993795871734619, + "learning_rate": 9.630393468087818e-05, + "loss": 0.0469, + "step": 3340 + }, + { + "epoch": 12.5, + "grad_norm": 0.2261819839477539, + "learning_rate": 9.627267617465243e-05, + "loss": 0.0484, + "step": 3350 + }, + { + "epoch": 12.537313432835822, + "grad_norm": 0.23026186227798462, + "learning_rate": 9.624129116069694e-05, + "loss": 0.0452, + "step": 3360 + }, + { + "epoch": 12.574626865671641, + "grad_norm": 0.27859947085380554, + "learning_rate": 9.620977972481716e-05, + "loss": 0.0593, + "step": 3370 + }, + { + "epoch": 12.611940298507463, + "grad_norm": 0.23060785233974457, + "learning_rate": 9.617814195316411e-05, + "loss": 0.05, + "step": 3380 + }, + { + "epoch": 12.649253731343283, + "grad_norm": 0.20185025036334991, + "learning_rate": 9.614637793223425e-05, + "loss": 0.0573, + "step": 3390 + }, + { + "epoch": 12.686567164179104, + "grad_norm": 0.3584498167037964, + "learning_rate": 9.611448774886924e-05, + "loss": 0.052, + "step": 3400 + }, + { + "epoch": 12.723880597014926, + "grad_norm": 0.19336827099323273, + "learning_rate": 9.60824714902556e-05, + "loss": 0.0535, + "step": 3410 + }, + { + "epoch": 12.761194029850746, + "grad_norm": 0.22223635017871857, + "learning_rate": 9.605032924392457e-05, + "loss": 0.05, + "step": 3420 + }, + { + "epoch": 12.798507462686567, + "grad_norm": 0.17108851671218872, + "learning_rate": 9.601806109775179e-05, + "loss": 0.0475, + "step": 3430 + }, + { + "epoch": 12.835820895522389, + "grad_norm": 0.3861902952194214, + "learning_rate": 9.598566713995718e-05, + "loss": 0.0439, + "step": 3440 + }, + { + "epoch": 12.873134328358208, + "grad_norm": 0.18927253782749176, + "learning_rate": 9.595314745910456e-05, + "loss": 0.052, + "step": 3450 + }, + { + "epoch": 12.91044776119403, + "grad_norm": 0.21963383257389069, + "learning_rate": 9.59205021441015e-05, + "loss": 0.0504, + "step": 3460 + }, + { + "epoch": 12.947761194029852, + "grad_norm": 0.18016670644283295, + "learning_rate": 9.588773128419906e-05, + "loss": 0.0467, + "step": 3470 + }, + { + "epoch": 12.985074626865671, + "grad_norm": 0.1776365041732788, + "learning_rate": 9.58548349689915e-05, + "loss": 0.0414, + "step": 3480 + }, + { + "epoch": 13.022388059701493, + "grad_norm": 0.2616482973098755, + "learning_rate": 9.582181328841611e-05, + "loss": 0.0442, + "step": 3490 + }, + { + "epoch": 13.059701492537313, + "grad_norm": 0.20341171324253082, + "learning_rate": 9.578866633275288e-05, + "loss": 0.0533, + "step": 3500 + }, + { + "epoch": 13.097014925373134, + "grad_norm": 0.2223699688911438, + "learning_rate": 9.575539419262434e-05, + "loss": 0.0458, + "step": 3510 + }, + { + "epoch": 13.134328358208956, + "grad_norm": 0.22557464241981506, + "learning_rate": 9.572199695899522e-05, + "loss": 0.0445, + "step": 3520 + }, + { + "epoch": 13.171641791044776, + "grad_norm": 0.25104308128356934, + "learning_rate": 9.568847472317232e-05, + "loss": 0.0435, + "step": 3530 + }, + { + "epoch": 13.208955223880597, + "grad_norm": 0.18720711767673492, + "learning_rate": 9.565482757680415e-05, + "loss": 0.0453, + "step": 3540 + }, + { + "epoch": 13.246268656716419, + "grad_norm": 0.16838951408863068, + "learning_rate": 9.562105561188069e-05, + "loss": 0.0505, + "step": 3550 + }, + { + "epoch": 13.283582089552239, + "grad_norm": 0.31681734323501587, + "learning_rate": 9.558715892073323e-05, + "loss": 0.0494, + "step": 3560 + }, + { + "epoch": 13.32089552238806, + "grad_norm": 0.2390700727701187, + "learning_rate": 9.555313759603402e-05, + "loss": 0.0538, + "step": 3570 + }, + { + "epoch": 13.35820895522388, + "grad_norm": 0.20680709183216095, + "learning_rate": 9.551899173079607e-05, + "loss": 0.0519, + "step": 3580 + }, + { + "epoch": 13.395522388059701, + "grad_norm": 0.2758580148220062, + "learning_rate": 9.548472141837286e-05, + "loss": 0.0512, + "step": 3590 + }, + { + "epoch": 13.432835820895523, + "grad_norm": 0.3653097450733185, + "learning_rate": 9.545032675245813e-05, + "loss": 0.0496, + "step": 3600 + }, + { + "epoch": 13.470149253731343, + "grad_norm": 0.23886866867542267, + "learning_rate": 9.541580782708557e-05, + "loss": 0.0455, + "step": 3610 + }, + { + "epoch": 13.507462686567164, + "grad_norm": 0.3280908465385437, + "learning_rate": 9.538116473662861e-05, + "loss": 0.0489, + "step": 3620 + }, + { + "epoch": 13.544776119402986, + "grad_norm": 0.20268180966377258, + "learning_rate": 9.534639757580013e-05, + "loss": 0.0484, + "step": 3630 + }, + { + "epoch": 13.582089552238806, + "grad_norm": 0.2582015097141266, + "learning_rate": 9.531150643965223e-05, + "loss": 0.0487, + "step": 3640 + }, + { + "epoch": 13.619402985074627, + "grad_norm": 0.18157973885536194, + "learning_rate": 9.527649142357596e-05, + "loss": 0.0496, + "step": 3650 + }, + { + "epoch": 13.656716417910447, + "grad_norm": 0.22841542959213257, + "learning_rate": 9.524135262330098e-05, + "loss": 0.0467, + "step": 3660 + }, + { + "epoch": 13.694029850746269, + "grad_norm": 0.2519935369491577, + "learning_rate": 9.520609013489547e-05, + "loss": 0.0487, + "step": 3670 + }, + { + "epoch": 13.73134328358209, + "grad_norm": 0.24680495262145996, + "learning_rate": 9.517070405476575e-05, + "loss": 0.0457, + "step": 3680 + }, + { + "epoch": 13.76865671641791, + "grad_norm": 0.26362067461013794, + "learning_rate": 9.513519447965595e-05, + "loss": 0.0495, + "step": 3690 + }, + { + "epoch": 13.805970149253731, + "grad_norm": 0.3240712583065033, + "learning_rate": 9.509956150664796e-05, + "loss": 0.0496, + "step": 3700 + }, + { + "epoch": 13.843283582089553, + "grad_norm": 0.21009013056755066, + "learning_rate": 9.50638052331609e-05, + "loss": 0.0457, + "step": 3710 + }, + { + "epoch": 13.880597014925373, + "grad_norm": 0.1669154316186905, + "learning_rate": 9.502792575695112e-05, + "loss": 0.0496, + "step": 3720 + }, + { + "epoch": 13.917910447761194, + "grad_norm": 0.22347605228424072, + "learning_rate": 9.499192317611167e-05, + "loss": 0.0426, + "step": 3730 + }, + { + "epoch": 13.955223880597014, + "grad_norm": 0.15208907425403595, + "learning_rate": 9.49557975890723e-05, + "loss": 0.0447, + "step": 3740 + }, + { + "epoch": 13.992537313432836, + "grad_norm": 0.3206101059913635, + "learning_rate": 9.491954909459895e-05, + "loss": 0.0471, + "step": 3750 + }, + { + "epoch": 14.029850746268657, + "grad_norm": 0.15873713791370392, + "learning_rate": 9.488317779179361e-05, + "loss": 0.0401, + "step": 3760 + }, + { + "epoch": 14.067164179104477, + "grad_norm": 0.19690357148647308, + "learning_rate": 9.484668378009408e-05, + "loss": 0.0491, + "step": 3770 + }, + { + "epoch": 14.104477611940299, + "grad_norm": 0.3211113214492798, + "learning_rate": 9.481006715927351e-05, + "loss": 0.049, + "step": 3780 + }, + { + "epoch": 14.14179104477612, + "grad_norm": 0.27657604217529297, + "learning_rate": 9.477332802944044e-05, + "loss": 0.0396, + "step": 3790 + }, + { + "epoch": 14.17910447761194, + "grad_norm": 0.20194031298160553, + "learning_rate": 9.473646649103818e-05, + "loss": 0.0442, + "step": 3800 + }, + { + "epoch": 14.216417910447761, + "grad_norm": 0.20344595611095428, + "learning_rate": 9.46994826448448e-05, + "loss": 0.0427, + "step": 3810 + }, + { + "epoch": 14.253731343283581, + "grad_norm": 0.2067718505859375, + "learning_rate": 9.46623765919727e-05, + "loss": 0.0501, + "step": 3820 + }, + { + "epoch": 14.291044776119403, + "grad_norm": 0.29719170928001404, + "learning_rate": 9.462514843386845e-05, + "loss": 0.0519, + "step": 3830 + }, + { + "epoch": 14.328358208955224, + "grad_norm": 0.2347182184457779, + "learning_rate": 9.458779827231237e-05, + "loss": 0.0413, + "step": 3840 + }, + { + "epoch": 14.365671641791044, + "grad_norm": 0.1558852344751358, + "learning_rate": 9.45503262094184e-05, + "loss": 0.0442, + "step": 3850 + }, + { + "epoch": 14.402985074626866, + "grad_norm": 0.23085005581378937, + "learning_rate": 9.451273234763371e-05, + "loss": 0.047, + "step": 3860 + }, + { + "epoch": 14.440298507462687, + "grad_norm": 0.1515151560306549, + "learning_rate": 9.447501678973852e-05, + "loss": 0.0481, + "step": 3870 + }, + { + "epoch": 14.477611940298507, + "grad_norm": 0.1916729211807251, + "learning_rate": 9.443717963884569e-05, + "loss": 0.0474, + "step": 3880 + }, + { + "epoch": 14.514925373134329, + "grad_norm": 0.2536492943763733, + "learning_rate": 9.439922099840054e-05, + "loss": 0.0382, + "step": 3890 + }, + { + "epoch": 14.552238805970148, + "grad_norm": 0.1672086864709854, + "learning_rate": 9.43611409721806e-05, + "loss": 0.0497, + "step": 3900 + }, + { + "epoch": 14.58955223880597, + "grad_norm": 0.3644237518310547, + "learning_rate": 9.432293966429514e-05, + "loss": 0.0444, + "step": 3910 + }, + { + "epoch": 14.626865671641792, + "grad_norm": 0.20307251811027527, + "learning_rate": 9.428461717918511e-05, + "loss": 0.0452, + "step": 3920 + }, + { + "epoch": 14.664179104477611, + "grad_norm": 0.20441733300685883, + "learning_rate": 9.424617362162271e-05, + "loss": 0.0454, + "step": 3930 + }, + { + "epoch": 14.701492537313433, + "grad_norm": 0.26315611600875854, + "learning_rate": 9.420760909671118e-05, + "loss": 0.0486, + "step": 3940 + }, + { + "epoch": 14.738805970149254, + "grad_norm": 0.1983092874288559, + "learning_rate": 9.416892370988444e-05, + "loss": 0.0483, + "step": 3950 + }, + { + "epoch": 14.776119402985074, + "grad_norm": 0.18301443755626678, + "learning_rate": 9.413011756690685e-05, + "loss": 0.0456, + "step": 3960 + }, + { + "epoch": 14.813432835820896, + "grad_norm": 0.2433597594499588, + "learning_rate": 9.409119077387294e-05, + "loss": 0.0463, + "step": 3970 + }, + { + "epoch": 14.850746268656717, + "grad_norm": 0.27949392795562744, + "learning_rate": 9.405214343720707e-05, + "loss": 0.0412, + "step": 3980 + }, + { + "epoch": 14.888059701492537, + "grad_norm": 0.22806599736213684, + "learning_rate": 9.401297566366318e-05, + "loss": 0.0448, + "step": 3990 + }, + { + "epoch": 14.925373134328359, + "grad_norm": 0.25421562790870667, + "learning_rate": 9.397368756032445e-05, + "loss": 0.0426, + "step": 4000 + }, + { + "epoch": 14.962686567164178, + "grad_norm": 0.2436474859714508, + "learning_rate": 9.393427923460308e-05, + "loss": 0.0474, + "step": 4010 + }, + { + "epoch": 15.0, + "grad_norm": 0.3756405711174011, + "learning_rate": 9.389475079423988e-05, + "loss": 0.0438, + "step": 4020 + }, + { + "epoch": 15.037313432835822, + "grad_norm": 0.25687697529792786, + "learning_rate": 9.385510234730415e-05, + "loss": 0.0435, + "step": 4030 + }, + { + "epoch": 15.074626865671641, + "grad_norm": 0.17263716459274292, + "learning_rate": 9.381533400219318e-05, + "loss": 0.0455, + "step": 4040 + }, + { + "epoch": 15.111940298507463, + "grad_norm": 0.2471216470003128, + "learning_rate": 9.377544586763215e-05, + "loss": 0.0429, + "step": 4050 + }, + { + "epoch": 15.149253731343283, + "grad_norm": 0.20195460319519043, + "learning_rate": 9.373543805267368e-05, + "loss": 0.0432, + "step": 4060 + }, + { + "epoch": 15.186567164179104, + "grad_norm": 0.1709851622581482, + "learning_rate": 9.369531066669758e-05, + "loss": 0.0477, + "step": 4070 + }, + { + "epoch": 15.223880597014926, + "grad_norm": 0.23063932359218597, + "learning_rate": 9.365506381941066e-05, + "loss": 0.0379, + "step": 4080 + }, + { + "epoch": 15.261194029850746, + "grad_norm": 0.3265426754951477, + "learning_rate": 9.36146976208462e-05, + "loss": 0.0435, + "step": 4090 + }, + { + "epoch": 15.298507462686567, + "grad_norm": 0.26373934745788574, + "learning_rate": 9.357421218136386e-05, + "loss": 0.047, + "step": 4100 + }, + { + "epoch": 15.335820895522389, + "grad_norm": 0.16861388087272644, + "learning_rate": 9.353360761164931e-05, + "loss": 0.0448, + "step": 4110 + }, + { + "epoch": 15.373134328358208, + "grad_norm": 0.303790807723999, + "learning_rate": 9.349288402271388e-05, + "loss": 0.0396, + "step": 4120 + }, + { + "epoch": 15.41044776119403, + "grad_norm": 0.1940719038248062, + "learning_rate": 9.345204152589428e-05, + "loss": 0.0474, + "step": 4130 + }, + { + "epoch": 15.447761194029852, + "grad_norm": 0.34091615676879883, + "learning_rate": 9.341108023285238e-05, + "loss": 0.0424, + "step": 4140 + }, + { + "epoch": 15.485074626865671, + "grad_norm": 0.27036693692207336, + "learning_rate": 9.337000025557476e-05, + "loss": 0.0482, + "step": 4150 + }, + { + "epoch": 15.522388059701493, + "grad_norm": 0.16908007860183716, + "learning_rate": 9.332880170637252e-05, + "loss": 0.0381, + "step": 4160 + }, + { + "epoch": 15.559701492537313, + "grad_norm": 0.23332923650741577, + "learning_rate": 9.328748469788093e-05, + "loss": 0.0427, + "step": 4170 + }, + { + "epoch": 15.597014925373134, + "grad_norm": 0.16899706423282623, + "learning_rate": 9.32460493430591e-05, + "loss": 0.0439, + "step": 4180 + }, + { + "epoch": 15.634328358208956, + "grad_norm": 0.12869524955749512, + "learning_rate": 9.320449575518972e-05, + "loss": 0.0481, + "step": 4190 + }, + { + "epoch": 15.671641791044776, + "grad_norm": 0.21159130334854126, + "learning_rate": 9.316282404787871e-05, + "loss": 0.0446, + "step": 4200 + }, + { + "epoch": 15.708955223880597, + "grad_norm": 0.1849961131811142, + "learning_rate": 9.31210343350549e-05, + "loss": 0.041, + "step": 4210 + }, + { + "epoch": 15.746268656716419, + "grad_norm": 0.16107840836048126, + "learning_rate": 9.30791267309698e-05, + "loss": 0.0429, + "step": 4220 + }, + { + "epoch": 15.783582089552239, + "grad_norm": 0.14206446707248688, + "learning_rate": 9.30371013501972e-05, + "loss": 0.0409, + "step": 4230 + }, + { + "epoch": 15.82089552238806, + "grad_norm": 0.2168441116809845, + "learning_rate": 9.299495830763286e-05, + "loss": 0.0413, + "step": 4240 + }, + { + "epoch": 15.85820895522388, + "grad_norm": 0.21431951224803925, + "learning_rate": 9.295269771849427e-05, + "loss": 0.0472, + "step": 4250 + }, + { + "epoch": 15.895522388059701, + "grad_norm": 0.16851255297660828, + "learning_rate": 9.291031969832026e-05, + "loss": 0.0508, + "step": 4260 + }, + { + "epoch": 15.932835820895523, + "grad_norm": 0.18404732644557953, + "learning_rate": 9.286782436297073e-05, + "loss": 0.0402, + "step": 4270 + }, + { + "epoch": 15.970149253731343, + "grad_norm": 0.21722930669784546, + "learning_rate": 9.282521182862629e-05, + "loss": 0.0397, + "step": 4280 + }, + { + "epoch": 16.007462686567163, + "grad_norm": 0.2523709833621979, + "learning_rate": 9.278248221178798e-05, + "loss": 0.0427, + "step": 4290 + }, + { + "epoch": 16.044776119402986, + "grad_norm": 0.17736563086509705, + "learning_rate": 9.273963562927695e-05, + "loss": 0.0458, + "step": 4300 + }, + { + "epoch": 16.082089552238806, + "grad_norm": 0.20613858103752136, + "learning_rate": 9.269667219823412e-05, + "loss": 0.0387, + "step": 4310 + }, + { + "epoch": 16.119402985074625, + "grad_norm": 0.16557513177394867, + "learning_rate": 9.265359203611987e-05, + "loss": 0.0411, + "step": 4320 + }, + { + "epoch": 16.15671641791045, + "grad_norm": 0.28119519352912903, + "learning_rate": 9.261039526071374e-05, + "loss": 0.0468, + "step": 4330 + }, + { + "epoch": 16.19402985074627, + "grad_norm": 0.21538576483726501, + "learning_rate": 9.256708199011401e-05, + "loss": 0.0368, + "step": 4340 + }, + { + "epoch": 16.23134328358209, + "grad_norm": 0.19657357037067413, + "learning_rate": 9.252365234273755e-05, + "loss": 0.038, + "step": 4350 + }, + { + "epoch": 16.26865671641791, + "grad_norm": 0.19258421659469604, + "learning_rate": 9.248010643731935e-05, + "loss": 0.0414, + "step": 4360 + }, + { + "epoch": 16.30597014925373, + "grad_norm": 0.28801625967025757, + "learning_rate": 9.243644439291223e-05, + "loss": 0.0387, + "step": 4370 + }, + { + "epoch": 16.34328358208955, + "grad_norm": 0.16581468284130096, + "learning_rate": 9.239266632888659e-05, + "loss": 0.0383, + "step": 4380 + }, + { + "epoch": 16.380597014925375, + "grad_norm": 0.34664949774742126, + "learning_rate": 9.234877236492997e-05, + "loss": 0.0453, + "step": 4390 + }, + { + "epoch": 16.417910447761194, + "grad_norm": 0.1439947783946991, + "learning_rate": 9.230476262104677e-05, + "loss": 0.0466, + "step": 4400 + }, + { + "epoch": 16.455223880597014, + "grad_norm": 0.15509940683841705, + "learning_rate": 9.226063721755799e-05, + "loss": 0.0488, + "step": 4410 + }, + { + "epoch": 16.492537313432837, + "grad_norm": 0.18005985021591187, + "learning_rate": 9.221639627510076e-05, + "loss": 0.0407, + "step": 4420 + }, + { + "epoch": 16.529850746268657, + "grad_norm": 0.16012470424175262, + "learning_rate": 9.217203991462815e-05, + "loss": 0.0394, + "step": 4430 + }, + { + "epoch": 16.567164179104477, + "grad_norm": 0.2978847920894623, + "learning_rate": 9.212756825740873e-05, + "loss": 0.0451, + "step": 4440 + }, + { + "epoch": 16.604477611940297, + "grad_norm": 0.2236834019422531, + "learning_rate": 9.208298142502636e-05, + "loss": 0.0487, + "step": 4450 + }, + { + "epoch": 16.64179104477612, + "grad_norm": 0.2686060667037964, + "learning_rate": 9.20382795393797e-05, + "loss": 0.0403, + "step": 4460 + }, + { + "epoch": 16.67910447761194, + "grad_norm": 0.33534038066864014, + "learning_rate": 9.199346272268199e-05, + "loss": 0.0385, + "step": 4470 + }, + { + "epoch": 16.71641791044776, + "grad_norm": 0.19250528514385223, + "learning_rate": 9.194853109746074e-05, + "loss": 0.0441, + "step": 4480 + }, + { + "epoch": 16.753731343283583, + "grad_norm": 0.19218407571315765, + "learning_rate": 9.190348478655724e-05, + "loss": 0.0474, + "step": 4490 + }, + { + "epoch": 16.791044776119403, + "grad_norm": 0.21163488924503326, + "learning_rate": 9.185832391312644e-05, + "loss": 0.0411, + "step": 4500 + }, + { + "epoch": 16.828358208955223, + "grad_norm": 0.1758819818496704, + "learning_rate": 9.18130486006364e-05, + "loss": 0.0462, + "step": 4510 + }, + { + "epoch": 16.865671641791046, + "grad_norm": 0.18571069836616516, + "learning_rate": 9.176765897286813e-05, + "loss": 0.0425, + "step": 4520 + }, + { + "epoch": 16.902985074626866, + "grad_norm": 0.20819155871868134, + "learning_rate": 9.17221551539151e-05, + "loss": 0.0428, + "step": 4530 + }, + { + "epoch": 16.940298507462686, + "grad_norm": 0.30357328057289124, + "learning_rate": 9.167653726818305e-05, + "loss": 0.0414, + "step": 4540 + }, + { + "epoch": 16.97761194029851, + "grad_norm": 0.20977462828159332, + "learning_rate": 9.163080544038952e-05, + "loss": 0.0447, + "step": 4550 + }, + { + "epoch": 17.01492537313433, + "grad_norm": 0.2535971701145172, + "learning_rate": 9.158495979556358e-05, + "loss": 0.0384, + "step": 4560 + }, + { + "epoch": 17.05223880597015, + "grad_norm": 0.2789897620677948, + "learning_rate": 9.153900045904549e-05, + "loss": 0.042, + "step": 4570 + }, + { + "epoch": 17.08955223880597, + "grad_norm": 0.18474848568439484, + "learning_rate": 9.14929275564863e-05, + "loss": 0.0398, + "step": 4580 + }, + { + "epoch": 17.12686567164179, + "grad_norm": 0.12615208327770233, + "learning_rate": 9.144674121384757e-05, + "loss": 0.0466, + "step": 4590 + }, + { + "epoch": 17.16417910447761, + "grad_norm": 0.17756640911102295, + "learning_rate": 9.140044155740101e-05, + "loss": 0.035, + "step": 4600 + }, + { + "epoch": 17.20149253731343, + "grad_norm": 0.24410821497440338, + "learning_rate": 9.135402871372808e-05, + "loss": 0.0459, + "step": 4610 + }, + { + "epoch": 17.238805970149254, + "grad_norm": 0.21573011577129364, + "learning_rate": 9.130750280971978e-05, + "loss": 0.0385, + "step": 4620 + }, + { + "epoch": 17.276119402985074, + "grad_norm": 0.13879653811454773, + "learning_rate": 9.126086397257612e-05, + "loss": 0.0391, + "step": 4630 + }, + { + "epoch": 17.313432835820894, + "grad_norm": 0.17508305609226227, + "learning_rate": 9.121411232980588e-05, + "loss": 0.038, + "step": 4640 + }, + { + "epoch": 17.350746268656717, + "grad_norm": 0.2536008358001709, + "learning_rate": 9.116724800922629e-05, + "loss": 0.0418, + "step": 4650 + }, + { + "epoch": 17.388059701492537, + "grad_norm": 0.1942976713180542, + "learning_rate": 9.112027113896262e-05, + "loss": 0.052, + "step": 4660 + }, + { + "epoch": 17.425373134328357, + "grad_norm": 0.16561119258403778, + "learning_rate": 9.107318184744781e-05, + "loss": 0.0451, + "step": 4670 + }, + { + "epoch": 17.46268656716418, + "grad_norm": 0.22971832752227783, + "learning_rate": 9.102598026342222e-05, + "loss": 0.0407, + "step": 4680 + }, + { + "epoch": 17.5, + "grad_norm": 0.1306753158569336, + "learning_rate": 9.097866651593317e-05, + "loss": 0.042, + "step": 4690 + }, + { + "epoch": 17.53731343283582, + "grad_norm": 0.21278400719165802, + "learning_rate": 9.093124073433463e-05, + "loss": 0.0458, + "step": 4700 + }, + { + "epoch": 17.574626865671643, + "grad_norm": 0.22757171094417572, + "learning_rate": 9.088370304828685e-05, + "loss": 0.0364, + "step": 4710 + }, + { + "epoch": 17.611940298507463, + "grad_norm": 0.216596320271492, + "learning_rate": 9.083605358775612e-05, + "loss": 0.0434, + "step": 4720 + }, + { + "epoch": 17.649253731343283, + "grad_norm": 0.13022471964359283, + "learning_rate": 9.078829248301417e-05, + "loss": 0.0415, + "step": 4730 + }, + { + "epoch": 17.686567164179106, + "grad_norm": 0.2280716598033905, + "learning_rate": 9.074041986463808e-05, + "loss": 0.0385, + "step": 4740 + }, + { + "epoch": 17.723880597014926, + "grad_norm": 0.14666135609149933, + "learning_rate": 9.069243586350975e-05, + "loss": 0.0347, + "step": 4750 + }, + { + "epoch": 17.761194029850746, + "grad_norm": 0.1631281077861786, + "learning_rate": 9.064434061081562e-05, + "loss": 0.0407, + "step": 4760 + }, + { + "epoch": 17.798507462686565, + "grad_norm": 0.18697327375411987, + "learning_rate": 9.059613423804623e-05, + "loss": 0.0425, + "step": 4770 + }, + { + "epoch": 17.83582089552239, + "grad_norm": 0.12955111265182495, + "learning_rate": 9.0547816876996e-05, + "loss": 0.0417, + "step": 4780 + }, + { + "epoch": 17.87313432835821, + "grad_norm": 0.15547148883342743, + "learning_rate": 9.049938865976275e-05, + "loss": 0.0409, + "step": 4790 + }, + { + "epoch": 17.91044776119403, + "grad_norm": 0.1900598704814911, + "learning_rate": 9.045084971874738e-05, + "loss": 0.0369, + "step": 4800 + }, + { + "epoch": 17.94776119402985, + "grad_norm": 0.1846715807914734, + "learning_rate": 9.040220018665347e-05, + "loss": 0.0415, + "step": 4810 + }, + { + "epoch": 17.98507462686567, + "grad_norm": 0.1829937845468521, + "learning_rate": 9.035344019648702e-05, + "loss": 0.0407, + "step": 4820 + }, + { + "epoch": 18.02238805970149, + "grad_norm": 0.25900354981422424, + "learning_rate": 9.030456988155596e-05, + "loss": 0.0398, + "step": 4830 + }, + { + "epoch": 18.059701492537314, + "grad_norm": 0.21235992014408112, + "learning_rate": 9.025558937546988e-05, + "loss": 0.0477, + "step": 4840 + }, + { + "epoch": 18.097014925373134, + "grad_norm": 0.18785078823566437, + "learning_rate": 9.020649881213958e-05, + "loss": 0.039, + "step": 4850 + }, + { + "epoch": 18.134328358208954, + "grad_norm": 0.1951548010110855, + "learning_rate": 9.015729832577681e-05, + "loss": 0.0357, + "step": 4860 + }, + { + "epoch": 18.171641791044777, + "grad_norm": 0.1280934363603592, + "learning_rate": 9.010798805089384e-05, + "loss": 0.0425, + "step": 4870 + }, + { + "epoch": 18.208955223880597, + "grad_norm": 0.1693423092365265, + "learning_rate": 9.005856812230304e-05, + "loss": 0.0447, + "step": 4880 + }, + { + "epoch": 18.246268656716417, + "grad_norm": 0.23712658882141113, + "learning_rate": 9.000903867511666e-05, + "loss": 0.042, + "step": 4890 + }, + { + "epoch": 18.28358208955224, + "grad_norm": 0.26489710807800293, + "learning_rate": 8.995939984474624e-05, + "loss": 0.0457, + "step": 4900 + }, + { + "epoch": 18.32089552238806, + "grad_norm": 0.20792756974697113, + "learning_rate": 8.990965176690252e-05, + "loss": 0.0422, + "step": 4910 + }, + { + "epoch": 18.35820895522388, + "grad_norm": 0.18526089191436768, + "learning_rate": 8.98597945775948e-05, + "loss": 0.0366, + "step": 4920 + }, + { + "epoch": 18.395522388059703, + "grad_norm": 0.2214607298374176, + "learning_rate": 8.980982841313074e-05, + "loss": 0.0405, + "step": 4930 + }, + { + "epoch": 18.432835820895523, + "grad_norm": 0.1896953135728836, + "learning_rate": 8.975975341011596e-05, + "loss": 0.0391, + "step": 4940 + }, + { + "epoch": 18.470149253731343, + "grad_norm": 0.1430232971906662, + "learning_rate": 8.970956970545355e-05, + "loss": 0.0403, + "step": 4950 + }, + { + "epoch": 18.507462686567163, + "grad_norm": 0.1991272121667862, + "learning_rate": 8.965927743634391e-05, + "loss": 0.0429, + "step": 4960 + }, + { + "epoch": 18.544776119402986, + "grad_norm": 0.2361849844455719, + "learning_rate": 8.96088767402841e-05, + "loss": 0.0416, + "step": 4970 + }, + { + "epoch": 18.582089552238806, + "grad_norm": 0.25857019424438477, + "learning_rate": 8.955836775506776e-05, + "loss": 0.0461, + "step": 4980 + }, + { + "epoch": 18.619402985074625, + "grad_norm": 0.12873682379722595, + "learning_rate": 8.950775061878453e-05, + "loss": 0.035, + "step": 4990 + }, + { + "epoch": 18.65671641791045, + "grad_norm": 0.19786769151687622, + "learning_rate": 8.945702546981969e-05, + "loss": 0.0399, + "step": 5000 + }, + { + "epoch": 18.69402985074627, + "grad_norm": 0.2562239170074463, + "learning_rate": 8.940619244685388e-05, + "loss": 0.0372, + "step": 5010 + }, + { + "epoch": 18.73134328358209, + "grad_norm": 0.14586858451366425, + "learning_rate": 8.935525168886262e-05, + "loss": 0.0427, + "step": 5020 + }, + { + "epoch": 18.76865671641791, + "grad_norm": 0.20062318444252014, + "learning_rate": 8.930420333511606e-05, + "loss": 0.0403, + "step": 5030 + }, + { + "epoch": 18.80597014925373, + "grad_norm": 0.22698874771595, + "learning_rate": 8.92530475251784e-05, + "loss": 0.036, + "step": 5040 + }, + { + "epoch": 18.84328358208955, + "grad_norm": 0.2103697657585144, + "learning_rate": 8.920178439890765e-05, + "loss": 0.0431, + "step": 5050 + }, + { + "epoch": 18.880597014925375, + "grad_norm": 0.16042308509349823, + "learning_rate": 8.91504140964553e-05, + "loss": 0.0388, + "step": 5060 + }, + { + "epoch": 18.917910447761194, + "grad_norm": 0.16874109208583832, + "learning_rate": 8.909893675826574e-05, + "loss": 0.0388, + "step": 5070 + }, + { + "epoch": 18.955223880597014, + "grad_norm": 0.15569192171096802, + "learning_rate": 8.90473525250761e-05, + "loss": 0.0353, + "step": 5080 + }, + { + "epoch": 18.992537313432837, + "grad_norm": 0.16723507642745972, + "learning_rate": 8.899566153791566e-05, + "loss": 0.0443, + "step": 5090 + }, + { + "epoch": 19.029850746268657, + "grad_norm": 0.23284228146076202, + "learning_rate": 8.894386393810563e-05, + "loss": 0.05, + "step": 5100 + }, + { + "epoch": 19.067164179104477, + "grad_norm": 0.1621718853712082, + "learning_rate": 8.889195986725865e-05, + "loss": 0.0369, + "step": 5110 + }, + { + "epoch": 19.104477611940297, + "grad_norm": 0.17522747814655304, + "learning_rate": 8.883994946727849e-05, + "loss": 0.0475, + "step": 5120 + }, + { + "epoch": 19.14179104477612, + "grad_norm": 0.16110533475875854, + "learning_rate": 8.878783288035957e-05, + "loss": 0.0383, + "step": 5130 + }, + { + "epoch": 19.17910447761194, + "grad_norm": 0.2574177086353302, + "learning_rate": 8.873561024898668e-05, + "loss": 0.0383, + "step": 5140 + }, + { + "epoch": 19.21641791044776, + "grad_norm": 0.14560100436210632, + "learning_rate": 8.868328171593448e-05, + "loss": 0.037, + "step": 5150 + }, + { + "epoch": 19.253731343283583, + "grad_norm": 0.14456631243228912, + "learning_rate": 8.863084742426719e-05, + "loss": 0.0423, + "step": 5160 + }, + { + "epoch": 19.291044776119403, + "grad_norm": 0.1403595507144928, + "learning_rate": 8.857830751733815e-05, + "loss": 0.0327, + "step": 5170 + }, + { + "epoch": 19.328358208955223, + "grad_norm": 0.18462564051151276, + "learning_rate": 8.852566213878947e-05, + "loss": 0.037, + "step": 5180 + }, + { + "epoch": 19.365671641791046, + "grad_norm": 0.20725117623806, + "learning_rate": 8.84729114325516e-05, + "loss": 0.0376, + "step": 5190 + }, + { + "epoch": 19.402985074626866, + "grad_norm": 0.17023132741451263, + "learning_rate": 8.842005554284296e-05, + "loss": 0.0467, + "step": 5200 + }, + { + "epoch": 19.440298507462686, + "grad_norm": 0.31033241748809814, + "learning_rate": 8.836709461416952e-05, + "loss": 0.0425, + "step": 5210 + }, + { + "epoch": 19.47761194029851, + "grad_norm": 0.14057482779026031, + "learning_rate": 8.831402879132446e-05, + "loss": 0.0432, + "step": 5220 + }, + { + "epoch": 19.51492537313433, + "grad_norm": 0.23247437179088593, + "learning_rate": 8.82608582193877e-05, + "loss": 0.0396, + "step": 5230 + }, + { + "epoch": 19.55223880597015, + "grad_norm": 0.1305907964706421, + "learning_rate": 8.820758304372557e-05, + "loss": 0.0389, + "step": 5240 + }, + { + "epoch": 19.58955223880597, + "grad_norm": 0.17093417048454285, + "learning_rate": 8.815420340999033e-05, + "loss": 0.0347, + "step": 5250 + }, + { + "epoch": 19.62686567164179, + "grad_norm": 0.24105240404605865, + "learning_rate": 8.810071946411989e-05, + "loss": 0.0392, + "step": 5260 + }, + { + "epoch": 19.66417910447761, + "grad_norm": 0.2234315127134323, + "learning_rate": 8.804713135233731e-05, + "loss": 0.0403, + "step": 5270 + }, + { + "epoch": 19.701492537313435, + "grad_norm": 0.16947844624519348, + "learning_rate": 8.799343922115044e-05, + "loss": 0.0368, + "step": 5280 + }, + { + "epoch": 19.738805970149254, + "grad_norm": 0.26133742928504944, + "learning_rate": 8.79396432173515e-05, + "loss": 0.041, + "step": 5290 + }, + { + "epoch": 19.776119402985074, + "grad_norm": 0.2099352777004242, + "learning_rate": 8.788574348801675e-05, + "loss": 0.0363, + "step": 5300 + }, + { + "epoch": 19.813432835820894, + "grad_norm": 0.1662513017654419, + "learning_rate": 8.783174018050594e-05, + "loss": 0.0409, + "step": 5310 + }, + { + "epoch": 19.850746268656717, + "grad_norm": 0.18933714926242828, + "learning_rate": 8.77776334424621e-05, + "loss": 0.0348, + "step": 5320 + }, + { + "epoch": 19.888059701492537, + "grad_norm": 0.21673552691936493, + "learning_rate": 8.772342342181095e-05, + "loss": 0.037, + "step": 5330 + }, + { + "epoch": 19.925373134328357, + "grad_norm": 0.13009892404079437, + "learning_rate": 8.766911026676064e-05, + "loss": 0.0386, + "step": 5340 + }, + { + "epoch": 19.96268656716418, + "grad_norm": 0.1655230075120926, + "learning_rate": 8.761469412580125e-05, + "loss": 0.0404, + "step": 5350 + }, + { + "epoch": 20.0, + "grad_norm": 0.2821272611618042, + "learning_rate": 8.756017514770443e-05, + "loss": 0.0441, + "step": 5360 + }, + { + "epoch": 20.03731343283582, + "grad_norm": 0.1302652508020401, + "learning_rate": 8.750555348152298e-05, + "loss": 0.0389, + "step": 5370 + }, + { + "epoch": 20.074626865671643, + "grad_norm": 0.13331563770771027, + "learning_rate": 8.745082927659047e-05, + "loss": 0.0393, + "step": 5380 + }, + { + "epoch": 20.111940298507463, + "grad_norm": 0.244130939245224, + "learning_rate": 8.739600268252078e-05, + "loss": 0.0372, + "step": 5390 + }, + { + "epoch": 20.149253731343283, + "grad_norm": 0.20429308712482452, + "learning_rate": 8.73410738492077e-05, + "loss": 0.0387, + "step": 5400 + }, + { + "epoch": 20.186567164179106, + "grad_norm": 0.2954719364643097, + "learning_rate": 8.728604292682459e-05, + "loss": 0.0404, + "step": 5410 + }, + { + "epoch": 20.223880597014926, + "grad_norm": 0.20438429713249207, + "learning_rate": 8.723091006582389e-05, + "loss": 0.0359, + "step": 5420 + }, + { + "epoch": 20.261194029850746, + "grad_norm": 0.17289331555366516, + "learning_rate": 8.717567541693673e-05, + "loss": 0.0357, + "step": 5430 + }, + { + "epoch": 20.298507462686565, + "grad_norm": 0.24367138743400574, + "learning_rate": 8.71203391311725e-05, + "loss": 0.0392, + "step": 5440 + }, + { + "epoch": 20.33582089552239, + "grad_norm": 0.21900270879268646, + "learning_rate": 8.706490135981855e-05, + "loss": 0.0419, + "step": 5450 + }, + { + "epoch": 20.37313432835821, + "grad_norm": 0.1526443362236023, + "learning_rate": 8.700936225443959e-05, + "loss": 0.0333, + "step": 5460 + }, + { + "epoch": 20.41044776119403, + "grad_norm": 0.24582353234291077, + "learning_rate": 8.695372196687743e-05, + "loss": 0.0417, + "step": 5470 + }, + { + "epoch": 20.44776119402985, + "grad_norm": 0.21462485194206238, + "learning_rate": 8.689798064925049e-05, + "loss": 0.0347, + "step": 5480 + }, + { + "epoch": 20.48507462686567, + "grad_norm": 0.17611616849899292, + "learning_rate": 8.684213845395339e-05, + "loss": 0.0395, + "step": 5490 + }, + { + "epoch": 20.52238805970149, + "grad_norm": 0.19724012911319733, + "learning_rate": 8.678619553365659e-05, + "loss": 0.0332, + "step": 5500 + }, + { + "epoch": 20.559701492537314, + "grad_norm": 0.2080456167459488, + "learning_rate": 8.673015204130586e-05, + "loss": 0.0361, + "step": 5510 + }, + { + "epoch": 20.597014925373134, + "grad_norm": 0.21469220519065857, + "learning_rate": 8.6674008130122e-05, + "loss": 0.039, + "step": 5520 + }, + { + "epoch": 20.634328358208954, + "grad_norm": 0.242497980594635, + "learning_rate": 8.661776395360029e-05, + "loss": 0.0397, + "step": 5530 + }, + { + "epoch": 20.671641791044777, + "grad_norm": 0.20539864897727966, + "learning_rate": 8.656141966551019e-05, + "loss": 0.0392, + "step": 5540 + }, + { + "epoch": 20.708955223880597, + "grad_norm": 0.21964021027088165, + "learning_rate": 8.650497541989482e-05, + "loss": 0.035, + "step": 5550 + }, + { + "epoch": 20.746268656716417, + "grad_norm": 0.15793637931346893, + "learning_rate": 8.644843137107059e-05, + "loss": 0.0363, + "step": 5560 + }, + { + "epoch": 20.78358208955224, + "grad_norm": 0.1731041818857193, + "learning_rate": 8.639178767362676e-05, + "loss": 0.0371, + "step": 5570 + }, + { + "epoch": 20.82089552238806, + "grad_norm": 0.15019342303276062, + "learning_rate": 8.633504448242505e-05, + "loss": 0.0335, + "step": 5580 + }, + { + "epoch": 20.85820895522388, + "grad_norm": 0.1397496908903122, + "learning_rate": 8.627820195259918e-05, + "loss": 0.0391, + "step": 5590 + }, + { + "epoch": 20.895522388059703, + "grad_norm": 0.141131192445755, + "learning_rate": 8.622126023955446e-05, + "loss": 0.041, + "step": 5600 + }, + { + "epoch": 20.932835820895523, + "grad_norm": 0.20025403797626495, + "learning_rate": 8.616421949896734e-05, + "loss": 0.0412, + "step": 5610 + }, + { + "epoch": 20.970149253731343, + "grad_norm": 0.2251378893852234, + "learning_rate": 8.610707988678503e-05, + "loss": 0.037, + "step": 5620 + }, + { + "epoch": 21.007462686567163, + "grad_norm": 0.1341109722852707, + "learning_rate": 8.604984155922506e-05, + "loss": 0.0371, + "step": 5630 + }, + { + "epoch": 21.044776119402986, + "grad_norm": 0.28053462505340576, + "learning_rate": 8.599250467277483e-05, + "loss": 0.0366, + "step": 5640 + }, + { + "epoch": 21.082089552238806, + "grad_norm": 0.10567930340766907, + "learning_rate": 8.59350693841912e-05, + "loss": 0.0394, + "step": 5650 + }, + { + "epoch": 21.119402985074625, + "grad_norm": 0.17919886112213135, + "learning_rate": 8.587753585050004e-05, + "loss": 0.0357, + "step": 5660 + }, + { + "epoch": 21.15671641791045, + "grad_norm": 0.3223204016685486, + "learning_rate": 8.581990422899585e-05, + "loss": 0.0369, + "step": 5670 + }, + { + "epoch": 21.19402985074627, + "grad_norm": 0.20072297751903534, + "learning_rate": 8.576217467724128e-05, + "loss": 0.0389, + "step": 5680 + }, + { + "epoch": 21.23134328358209, + "grad_norm": 0.1556226760149002, + "learning_rate": 8.570434735306671e-05, + "loss": 0.035, + "step": 5690 + }, + { + "epoch": 21.26865671641791, + "grad_norm": 0.20265886187553406, + "learning_rate": 8.564642241456986e-05, + "loss": 0.0418, + "step": 5700 + }, + { + "epoch": 21.30597014925373, + "grad_norm": 0.15518955886363983, + "learning_rate": 8.558840002011528e-05, + "loss": 0.0331, + "step": 5710 + }, + { + "epoch": 21.34328358208955, + "grad_norm": 0.1822584569454193, + "learning_rate": 8.553028032833397e-05, + "loss": 0.0421, + "step": 5720 + }, + { + "epoch": 21.380597014925375, + "grad_norm": 0.14216330647468567, + "learning_rate": 8.547206349812298e-05, + "loss": 0.0413, + "step": 5730 + }, + { + "epoch": 21.417910447761194, + "grad_norm": 0.24156329035758972, + "learning_rate": 8.541374968864487e-05, + "loss": 0.0404, + "step": 5740 + }, + { + "epoch": 21.455223880597014, + "grad_norm": 0.2753167748451233, + "learning_rate": 8.535533905932738e-05, + "loss": 0.0369, + "step": 5750 + }, + { + "epoch": 21.492537313432837, + "grad_norm": 0.17052626609802246, + "learning_rate": 8.529683176986295e-05, + "loss": 0.0328, + "step": 5760 + }, + { + "epoch": 21.529850746268657, + "grad_norm": 0.11597824096679688, + "learning_rate": 8.523822798020827e-05, + "loss": 0.041, + "step": 5770 + }, + { + "epoch": 21.567164179104477, + "grad_norm": 0.14363346993923187, + "learning_rate": 8.517952785058385e-05, + "loss": 0.0393, + "step": 5780 + }, + { + "epoch": 21.604477611940297, + "grad_norm": 0.19373776018619537, + "learning_rate": 8.512073154147362e-05, + "loss": 0.0372, + "step": 5790 + }, + { + "epoch": 21.64179104477612, + "grad_norm": 0.20276981592178345, + "learning_rate": 8.506183921362443e-05, + "loss": 0.0389, + "step": 5800 + }, + { + "epoch": 21.67910447761194, + "grad_norm": 0.19267870485782623, + "learning_rate": 8.500285102804568e-05, + "loss": 0.0371, + "step": 5810 + }, + { + "epoch": 21.71641791044776, + "grad_norm": 0.2701839208602905, + "learning_rate": 8.494376714600878e-05, + "loss": 0.0333, + "step": 5820 + }, + { + "epoch": 21.753731343283583, + "grad_norm": 0.20612668991088867, + "learning_rate": 8.488458772904684e-05, + "loss": 0.0358, + "step": 5830 + }, + { + "epoch": 21.791044776119403, + "grad_norm": 0.18102902173995972, + "learning_rate": 8.482531293895412e-05, + "loss": 0.0376, + "step": 5840 + }, + { + "epoch": 21.828358208955223, + "grad_norm": 0.23202018439769745, + "learning_rate": 8.476594293778561e-05, + "loss": 0.0418, + "step": 5850 + }, + { + "epoch": 21.865671641791046, + "grad_norm": 0.09540139883756638, + "learning_rate": 8.470647788785665e-05, + "loss": 0.041, + "step": 5860 + }, + { + "epoch": 21.902985074626866, + "grad_norm": 0.23362809419631958, + "learning_rate": 8.46469179517424e-05, + "loss": 0.0402, + "step": 5870 + }, + { + "epoch": 21.940298507462686, + "grad_norm": 0.20929335057735443, + "learning_rate": 8.458726329227747e-05, + "loss": 0.0385, + "step": 5880 + }, + { + "epoch": 21.97761194029851, + "grad_norm": 0.18403425812721252, + "learning_rate": 8.452751407255541e-05, + "loss": 0.0399, + "step": 5890 + }, + { + "epoch": 22.01492537313433, + "grad_norm": 0.2034774273633957, + "learning_rate": 8.44676704559283e-05, + "loss": 0.0361, + "step": 5900 + }, + { + "epoch": 22.05223880597015, + "grad_norm": 0.14981597661972046, + "learning_rate": 8.44077326060063e-05, + "loss": 0.0393, + "step": 5910 + }, + { + "epoch": 22.08955223880597, + "grad_norm": 0.20903146266937256, + "learning_rate": 8.434770068665723e-05, + "loss": 0.0406, + "step": 5920 + }, + { + "epoch": 22.12686567164179, + "grad_norm": 0.12090307474136353, + "learning_rate": 8.428757486200603e-05, + "loss": 0.0349, + "step": 5930 + }, + { + "epoch": 22.16417910447761, + "grad_norm": 0.14085660874843597, + "learning_rate": 8.422735529643444e-05, + "loss": 0.0344, + "step": 5940 + }, + { + "epoch": 22.20149253731343, + "grad_norm": 0.30808404088020325, + "learning_rate": 8.416704215458043e-05, + "loss": 0.0298, + "step": 5950 + }, + { + "epoch": 22.238805970149254, + "grad_norm": 0.17409317195415497, + "learning_rate": 8.410663560133784e-05, + "loss": 0.035, + "step": 5960 + }, + { + "epoch": 22.276119402985074, + "grad_norm": 0.18731828033924103, + "learning_rate": 8.404613580185585e-05, + "loss": 0.0322, + "step": 5970 + }, + { + "epoch": 22.313432835820894, + "grad_norm": 0.16483667492866516, + "learning_rate": 8.398554292153866e-05, + "loss": 0.033, + "step": 5980 + }, + { + "epoch": 22.350746268656717, + "grad_norm": 0.195018008351326, + "learning_rate": 8.392485712604483e-05, + "loss": 0.0344, + "step": 5990 + }, + { + "epoch": 22.388059701492537, + "grad_norm": 0.18210549652576447, + "learning_rate": 8.386407858128706e-05, + "loss": 0.0387, + "step": 6000 + } + ], + "logging_steps": 10, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 75, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.510338803678131e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-6500/README.md b/checkpoint-6500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c7b89968043c4a4cf38dcac1f9bc557c35da3883 --- /dev/null +++ b/checkpoint-6500/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/ibru/.cache/huggingface/hub/models--nvidia--GR00T-N1-2B/snapshots/32e1fd2507f7739fad443e6b449c8188e0e02fcb +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-6500/adapter_config.json b/checkpoint-6500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8f45e5c825b3b34b334d049ddf8e68e52a500cc6 --- /dev/null +++ b/checkpoint-6500/adapter_config.json @@ -0,0 +1,36 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/ibru/.cache/huggingface/hub/models--nvidia--GR00T-N1-2B/snapshots/32e1fd2507f7739fad443e6b449c8188e0e02fcb", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "to_k", + "to_q", + "v_proj", + "to_v" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-6500/adapter_model.safetensors b/checkpoint-6500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..df0feaabf13596a4a4fa4a9921fe19a879d29585 --- /dev/null +++ b/checkpoint-6500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef0e30937cf6ac0448dd499578f580b4575192c99686e3ba6b4ea14a17946583 +size 123328576 diff --git a/checkpoint-6500/experiment_cfg/metadata.json b/checkpoint-6500/experiment_cfg/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..40302046074c7e429ab3933ad6b163f9735902de --- /dev/null +++ b/checkpoint-6500/experiment_cfg/metadata.json @@ -0,0 +1,275 @@ +{ + "new_embodiment": { + "statistics": { + "state": { + "single_arm": { + "max": [ + 47.021484375, + 135.263671875, + 178.505859375, + 78.3984375, + 56.77734375 + ], + "min": [ + -25.576171875, + 46.93359375, + 89.736328125, + -30.41015625, + -77.607421875 + ], + "mean": [ + 7.780572414398193, + 121.54933166503906, + 145.44825744628906, + 26.051393508911133, + -12.748016357421875 + ], + "std": [ + 11.060831069946289, + 21.937597274780273, + 17.16187286376953, + 19.231945037841797, + 14.66512680053711 + ], + "q01": [ + -17.578125, + 58.0078125, + 97.998046875, + -13.447265625, + -39.9005859375 + ], + "q99": [ + 36.650390625, + 134.47265625, + 178.41796875, + 66.65009765625, + 40.166015625 + ] + }, + "gripper": { + "max": [ + 52.22222137451172 + ], + "min": [ + -3.846153974533081 + ], + "mean": [ + 10.933439254760742 + ], + "std": [ + 15.509913444519043 + ], + "q01": [ + -3.846153974533081 + ], + "q99": [ + 51.02564239501953 + ] + }, + "mobile_base": { + "max": [ + 75.42072296142578, + 276.7638244628906, + 93.75 + ], + "min": [ + -170.01620483398438, + -274.5497131347656, + -93.75 + ], + "mean": [ + -0.31241804361343384, + 58.99717712402344, + 2.4293017387390137 + ], + "std": [ + 10.56183910369873, + 119.39802551269531, + 22.590484619140625 + ], + "q01": [ + -33.65809627532959, + -265.6932678222656, + -72.849609375 + ], + "q99": [ + 30.679615020751953, + 270.1214904785156, + 90.234375 + ] + } + }, + "action": { + "single_arm": { + "max": [ + 37.96875, + 135.087890625, + 179.384765625, + 78.837890625, + 57.392578125 + ], + "min": [ + -26.279296875, + 47.373046875, + 89.912109375, + -31.640625, + -77.16796875 + ], + "mean": [ + 8.038639068603516, + 122.76031494140625, + 145.15855407714844, + 26.28432846069336, + -13.195321083068848 + ], + "std": [ + 11.36032772064209, + 21.925451278686523, + 17.071842193603516, + 19.503877639770508, + 14.882487297058105 + ], + "q01": [ + -18.10546875, + 58.623046875, + 98.26171875, + -14.326171875, + -40.078125 + ], + "q99": [ + 37.44140625, + 135.087890625, + 179.296875, + 67.1484375, + 40.869140625 + ] + }, + "gripper": { + "max": [ + 52.646484375 + ], + "min": [ + -10.72265625 + ], + "mean": [ + 4.366570949554443 + ], + "std": [ + 18.90865707397461 + ], + "q01": [ + -10.546875 + ], + "q99": [ + 51.767578125 + ] + }, + "mobile_base": { + "max": [ + 230.0971221923828, + 265.6932678222656, + 90.0 + ], + "min": [ + -230.0971221923828, + -265.6932678222656, + -90.0 + ], + "mean": [ + -0.36507830023765564, + 60.13115310668945, + 2.5394127368927 + ], + "std": [ + 15.02155590057373, + 129.06507873535156, + 27.82071304321289 + ], + "q01": [ + -0.02556634694337845, + -265.6932678222656, + -90.0 + ], + "q99": [ + 0.02556634694337845, + 265.6932678222656, + 90.0 + ] + } + } + }, + "modalities": { + "video": { + "wrist": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + }, + "front": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + } + }, + "state": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + }, + "mobile_base": { + "absolute": true, + "rotation_type": null, + "shape": [ + 3 + ], + "continuous": true + } + }, + "action": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + }, + "mobile_base": { + "absolute": true, + "rotation_type": null, + "shape": [ + 3 + ], + "continuous": true + } + } + }, + "embodiment_tag": "new_embodiment" + } +} \ No newline at end of file diff --git a/checkpoint-6500/optimizer.pt b/checkpoint-6500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..61251be672e982afa6f8f4eab43036b373dcee69 --- /dev/null +++ b/checkpoint-6500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b48f8520ef9abf5e83329a356c9f345d1daabe3f4564eab738688ad9403d423f +size 246824634 diff --git a/checkpoint-6500/rng_state.pth b/checkpoint-6500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b176b874f681e9efbaa47478c577e51aad6913a5 --- /dev/null +++ b/checkpoint-6500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac183675168f49ce92cfa88a64cc68cf198aafa3214b49550c76b3c23dcb507e +size 14244 diff --git a/checkpoint-6500/scheduler.pt b/checkpoint-6500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f9b80282678a3628c601632c42a0519ef3d9996 --- /dev/null +++ b/checkpoint-6500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29ee88f0f3f63ecfc8da4336b17f02a8ebc3c4ed4823c0ead962ba0f4237fc5e +size 1064 diff --git a/checkpoint-6500/trainer_state.json b/checkpoint-6500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..293c31d1ea30d799f7bdd99f175ab7a2befec0da --- /dev/null +++ b/checkpoint-6500/trainer_state.json @@ -0,0 +1,4583 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 24.253731343283583, + "eval_steps": 500, + "global_step": 6500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03731343283582089, + "grad_norm": 0.8186072111129761, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.3847, + "step": 10 + }, + { + "epoch": 0.07462686567164178, + "grad_norm": 0.5007426142692566, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.4283, + "step": 20 + }, + { + "epoch": 0.11194029850746269, + "grad_norm": 0.49460887908935547, + "learning_rate": 3e-06, + "loss": 1.4868, + "step": 30 + }, + { + "epoch": 0.14925373134328357, + "grad_norm": 0.5032920837402344, + "learning_rate": 4.000000000000001e-06, + "loss": 1.4491, + "step": 40 + }, + { + "epoch": 0.1865671641791045, + "grad_norm": 0.5688469409942627, + "learning_rate": 5e-06, + "loss": 1.3703, + "step": 50 + }, + { + "epoch": 0.22388059701492538, + "grad_norm": 0.5052517652511597, + "learning_rate": 6e-06, + "loss": 1.419, + "step": 60 + }, + { + "epoch": 0.26119402985074625, + "grad_norm": 0.6315643787384033, + "learning_rate": 7.000000000000001e-06, + "loss": 1.3058, + "step": 70 + }, + { + "epoch": 0.29850746268656714, + "grad_norm": 0.6060447692871094, + "learning_rate": 8.000000000000001e-06, + "loss": 1.2908, + "step": 80 + }, + { + "epoch": 0.3358208955223881, + "grad_norm": 0.5513179302215576, + "learning_rate": 9e-06, + "loss": 1.2311, + "step": 90 + }, + { + "epoch": 0.373134328358209, + "grad_norm": 0.8467404246330261, + "learning_rate": 1e-05, + "loss": 1.2043, + "step": 100 + }, + { + "epoch": 0.41044776119402987, + "grad_norm": 0.8141824007034302, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.0707, + "step": 110 + }, + { + "epoch": 0.44776119402985076, + "grad_norm": 0.7932347059249878, + "learning_rate": 1.2e-05, + "loss": 0.9377, + "step": 120 + }, + { + "epoch": 0.48507462686567165, + "grad_norm": 0.684220552444458, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.714, + "step": 130 + }, + { + "epoch": 0.5223880597014925, + "grad_norm": 0.5886895060539246, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.6479, + "step": 140 + }, + { + "epoch": 0.5597014925373134, + "grad_norm": 0.4764939248561859, + "learning_rate": 1.5e-05, + "loss": 0.5463, + "step": 150 + }, + { + "epoch": 0.5970149253731343, + "grad_norm": 0.4621008038520813, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.4641, + "step": 160 + }, + { + "epoch": 0.6343283582089553, + "grad_norm": 0.46492910385131836, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.4159, + "step": 170 + }, + { + "epoch": 0.6716417910447762, + "grad_norm": 0.5017415881156921, + "learning_rate": 1.8e-05, + "loss": 0.4094, + "step": 180 + }, + { + "epoch": 0.7089552238805971, + "grad_norm": 0.34392210841178894, + "learning_rate": 1.9e-05, + "loss": 0.3478, + "step": 190 + }, + { + "epoch": 0.746268656716418, + "grad_norm": 0.3240516483783722, + "learning_rate": 2e-05, + "loss": 0.3821, + "step": 200 + }, + { + "epoch": 0.7835820895522388, + "grad_norm": 0.26301339268684387, + "learning_rate": 2.1e-05, + "loss": 0.3606, + "step": 210 + }, + { + "epoch": 0.8208955223880597, + "grad_norm": 0.34712520241737366, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.3421, + "step": 220 + }, + { + "epoch": 0.8582089552238806, + "grad_norm": 0.3248469829559326, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.3389, + "step": 230 + }, + { + "epoch": 0.8955223880597015, + "grad_norm": 0.298149436712265, + "learning_rate": 2.4e-05, + "loss": 0.3145, + "step": 240 + }, + { + "epoch": 0.9328358208955224, + "grad_norm": 0.2757190763950348, + "learning_rate": 2.5e-05, + "loss": 0.3065, + "step": 250 + }, + { + "epoch": 0.9701492537313433, + "grad_norm": 0.30510950088500977, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.2971, + "step": 260 + }, + { + "epoch": 1.007462686567164, + "grad_norm": 0.37349891662597656, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.3273, + "step": 270 + }, + { + "epoch": 1.044776119402985, + "grad_norm": 0.3667634129524231, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.308, + "step": 280 + }, + { + "epoch": 1.0820895522388059, + "grad_norm": 0.3463355004787445, + "learning_rate": 2.9e-05, + "loss": 0.3109, + "step": 290 + }, + { + "epoch": 1.1194029850746268, + "grad_norm": 0.3888525366783142, + "learning_rate": 3e-05, + "loss": 0.2644, + "step": 300 + }, + { + "epoch": 1.1567164179104479, + "grad_norm": 0.3749147951602936, + "learning_rate": 3.1e-05, + "loss": 0.2858, + "step": 310 + }, + { + "epoch": 1.1940298507462686, + "grad_norm": 0.3270276188850403, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.2573, + "step": 320 + }, + { + "epoch": 1.2313432835820897, + "grad_norm": 0.3658592998981476, + "learning_rate": 3.3e-05, + "loss": 0.2613, + "step": 330 + }, + { + "epoch": 1.2686567164179103, + "grad_norm": 0.3526328206062317, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.2328, + "step": 340 + }, + { + "epoch": 1.3059701492537314, + "grad_norm": 0.4528139531612396, + "learning_rate": 3.5e-05, + "loss": 0.2429, + "step": 350 + }, + { + "epoch": 1.3432835820895521, + "grad_norm": 0.5426791310310364, + "learning_rate": 3.6e-05, + "loss": 0.2209, + "step": 360 + }, + { + "epoch": 1.3805970149253732, + "grad_norm": 0.41844552755355835, + "learning_rate": 3.7e-05, + "loss": 0.2319, + "step": 370 + }, + { + "epoch": 1.417910447761194, + "grad_norm": 0.4749431908130646, + "learning_rate": 3.8e-05, + "loss": 0.2233, + "step": 380 + }, + { + "epoch": 1.455223880597015, + "grad_norm": 0.7010189890861511, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.2181, + "step": 390 + }, + { + "epoch": 1.4925373134328357, + "grad_norm": 0.5747635960578918, + "learning_rate": 4e-05, + "loss": 0.213, + "step": 400 + }, + { + "epoch": 1.5298507462686568, + "grad_norm": 0.3661474287509918, + "learning_rate": 4.1e-05, + "loss": 0.2171, + "step": 410 + }, + { + "epoch": 1.5671641791044775, + "grad_norm": 0.467835396528244, + "learning_rate": 4.2e-05, + "loss": 0.1985, + "step": 420 + }, + { + "epoch": 1.6044776119402986, + "grad_norm": 0.5470123291015625, + "learning_rate": 4.3e-05, + "loss": 0.2176, + "step": 430 + }, + { + "epoch": 1.6417910447761193, + "grad_norm": 0.5761199593544006, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.2007, + "step": 440 + }, + { + "epoch": 1.6791044776119404, + "grad_norm": 0.48257485032081604, + "learning_rate": 4.5e-05, + "loss": 0.2043, + "step": 450 + }, + { + "epoch": 1.716417910447761, + "grad_norm": 0.48353052139282227, + "learning_rate": 4.600000000000001e-05, + "loss": 0.1872, + "step": 460 + }, + { + "epoch": 1.7537313432835822, + "grad_norm": 0.4388391375541687, + "learning_rate": 4.7e-05, + "loss": 0.206, + "step": 470 + }, + { + "epoch": 1.7910447761194028, + "grad_norm": 0.47332626581192017, + "learning_rate": 4.8e-05, + "loss": 0.1876, + "step": 480 + }, + { + "epoch": 1.828358208955224, + "grad_norm": 0.8053535223007202, + "learning_rate": 4.9e-05, + "loss": 0.1839, + "step": 490 + }, + { + "epoch": 1.8656716417910446, + "grad_norm": 0.413979709148407, + "learning_rate": 5e-05, + "loss": 0.1732, + "step": 500 + }, + { + "epoch": 1.9029850746268657, + "grad_norm": 0.36910712718963623, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.1827, + "step": 510 + }, + { + "epoch": 1.9402985074626866, + "grad_norm": 0.8458298444747925, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.1727, + "step": 520 + }, + { + "epoch": 1.9776119402985075, + "grad_norm": 0.5452115535736084, + "learning_rate": 5.300000000000001e-05, + "loss": 0.1818, + "step": 530 + }, + { + "epoch": 2.014925373134328, + "grad_norm": 0.4518108069896698, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.177, + "step": 540 + }, + { + "epoch": 2.0522388059701493, + "grad_norm": 0.66865074634552, + "learning_rate": 5.500000000000001e-05, + "loss": 0.1726, + "step": 550 + }, + { + "epoch": 2.08955223880597, + "grad_norm": 0.6536034345626831, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.1541, + "step": 560 + }, + { + "epoch": 2.126865671641791, + "grad_norm": 0.5571377277374268, + "learning_rate": 5.6999999999999996e-05, + "loss": 0.1671, + "step": 570 + }, + { + "epoch": 2.1641791044776117, + "grad_norm": 0.5385546684265137, + "learning_rate": 5.8e-05, + "loss": 0.1582, + "step": 580 + }, + { + "epoch": 2.201492537313433, + "grad_norm": 0.577961266040802, + "learning_rate": 5.9e-05, + "loss": 0.1528, + "step": 590 + }, + { + "epoch": 2.2388059701492535, + "grad_norm": 0.5082416534423828, + "learning_rate": 6e-05, + "loss": 0.1638, + "step": 600 + }, + { + "epoch": 2.2761194029850746, + "grad_norm": 0.5490861535072327, + "learning_rate": 6.1e-05, + "loss": 0.166, + "step": 610 + }, + { + "epoch": 2.3134328358208958, + "grad_norm": 0.492366760969162, + "learning_rate": 6.2e-05, + "loss": 0.1481, + "step": 620 + }, + { + "epoch": 2.3507462686567164, + "grad_norm": 0.3702855110168457, + "learning_rate": 6.3e-05, + "loss": 0.1514, + "step": 630 + }, + { + "epoch": 2.388059701492537, + "grad_norm": 0.664667010307312, + "learning_rate": 6.400000000000001e-05, + "loss": 0.1441, + "step": 640 + }, + { + "epoch": 2.425373134328358, + "grad_norm": 0.33382174372673035, + "learning_rate": 6.500000000000001e-05, + "loss": 0.1573, + "step": 650 + }, + { + "epoch": 2.4626865671641793, + "grad_norm": 0.4848814010620117, + "learning_rate": 6.6e-05, + "loss": 0.1457, + "step": 660 + }, + { + "epoch": 2.5, + "grad_norm": 0.3649997413158417, + "learning_rate": 6.7e-05, + "loss": 0.1467, + "step": 670 + }, + { + "epoch": 2.5373134328358207, + "grad_norm": 0.6385223865509033, + "learning_rate": 6.800000000000001e-05, + "loss": 0.145, + "step": 680 + }, + { + "epoch": 2.574626865671642, + "grad_norm": 0.4580625891685486, + "learning_rate": 6.9e-05, + "loss": 0.1352, + "step": 690 + }, + { + "epoch": 2.611940298507463, + "grad_norm": 0.5141746401786804, + "learning_rate": 7e-05, + "loss": 0.1444, + "step": 700 + }, + { + "epoch": 2.6492537313432836, + "grad_norm": 0.40220722556114197, + "learning_rate": 7.1e-05, + "loss": 0.1493, + "step": 710 + }, + { + "epoch": 2.6865671641791042, + "grad_norm": 0.5510571002960205, + "learning_rate": 7.2e-05, + "loss": 0.1387, + "step": 720 + }, + { + "epoch": 2.7238805970149254, + "grad_norm": 0.43814659118652344, + "learning_rate": 7.3e-05, + "loss": 0.1374, + "step": 730 + }, + { + "epoch": 2.7611940298507465, + "grad_norm": 0.4118008613586426, + "learning_rate": 7.4e-05, + "loss": 0.1297, + "step": 740 + }, + { + "epoch": 2.798507462686567, + "grad_norm": 0.5626503229141235, + "learning_rate": 7.500000000000001e-05, + "loss": 0.1299, + "step": 750 + }, + { + "epoch": 2.835820895522388, + "grad_norm": 0.4066360592842102, + "learning_rate": 7.6e-05, + "loss": 0.1102, + "step": 760 + }, + { + "epoch": 2.873134328358209, + "grad_norm": 0.47184985876083374, + "learning_rate": 7.7e-05, + "loss": 0.1219, + "step": 770 + }, + { + "epoch": 2.91044776119403, + "grad_norm": 0.6611475348472595, + "learning_rate": 7.800000000000001e-05, + "loss": 0.1267, + "step": 780 + }, + { + "epoch": 2.9477611940298507, + "grad_norm": 0.3570108413696289, + "learning_rate": 7.900000000000001e-05, + "loss": 0.1191, + "step": 790 + }, + { + "epoch": 2.9850746268656714, + "grad_norm": 0.4581681489944458, + "learning_rate": 8e-05, + "loss": 0.1209, + "step": 800 + }, + { + "epoch": 3.0223880597014925, + "grad_norm": 0.4643435776233673, + "learning_rate": 8.1e-05, + "loss": 0.129, + "step": 810 + }, + { + "epoch": 3.0597014925373136, + "grad_norm": 0.5595763921737671, + "learning_rate": 8.2e-05, + "loss": 0.1158, + "step": 820 + }, + { + "epoch": 3.0970149253731343, + "grad_norm": 0.48848605155944824, + "learning_rate": 8.3e-05, + "loss": 0.1188, + "step": 830 + }, + { + "epoch": 3.1343283582089554, + "grad_norm": 0.4496570825576782, + "learning_rate": 8.4e-05, + "loss": 0.114, + "step": 840 + }, + { + "epoch": 3.171641791044776, + "grad_norm": 0.31364986300468445, + "learning_rate": 8.5e-05, + "loss": 0.1196, + "step": 850 + }, + { + "epoch": 3.208955223880597, + "grad_norm": 0.3395878076553345, + "learning_rate": 8.6e-05, + "loss": 0.1124, + "step": 860 + }, + { + "epoch": 3.246268656716418, + "grad_norm": 0.4917413592338562, + "learning_rate": 8.7e-05, + "loss": 0.1074, + "step": 870 + }, + { + "epoch": 3.283582089552239, + "grad_norm": 0.44114553928375244, + "learning_rate": 8.800000000000001e-05, + "loss": 0.1095, + "step": 880 + }, + { + "epoch": 3.3208955223880596, + "grad_norm": 0.3323831558227539, + "learning_rate": 8.900000000000001e-05, + "loss": 0.106, + "step": 890 + }, + { + "epoch": 3.3582089552238807, + "grad_norm": 0.4495660066604614, + "learning_rate": 9e-05, + "loss": 0.1222, + "step": 900 + }, + { + "epoch": 3.3955223880597014, + "grad_norm": 0.40784788131713867, + "learning_rate": 9.1e-05, + "loss": 0.1048, + "step": 910 + }, + { + "epoch": 3.4328358208955225, + "grad_norm": 0.4643700420856476, + "learning_rate": 9.200000000000001e-05, + "loss": 0.1097, + "step": 920 + }, + { + "epoch": 3.470149253731343, + "grad_norm": 0.472494512796402, + "learning_rate": 9.300000000000001e-05, + "loss": 0.1041, + "step": 930 + }, + { + "epoch": 3.5074626865671643, + "grad_norm": 0.6110897660255432, + "learning_rate": 9.4e-05, + "loss": 0.0959, + "step": 940 + }, + { + "epoch": 3.544776119402985, + "grad_norm": 0.5313069820404053, + "learning_rate": 9.5e-05, + "loss": 0.113, + "step": 950 + }, + { + "epoch": 3.582089552238806, + "grad_norm": 0.4223133623600006, + "learning_rate": 9.6e-05, + "loss": 0.099, + "step": 960 + }, + { + "epoch": 3.6194029850746268, + "grad_norm": 0.5464731454849243, + "learning_rate": 9.7e-05, + "loss": 0.1008, + "step": 970 + }, + { + "epoch": 3.656716417910448, + "grad_norm": 0.3538314402103424, + "learning_rate": 9.8e-05, + "loss": 0.1049, + "step": 980 + }, + { + "epoch": 3.6940298507462686, + "grad_norm": 0.7460148334503174, + "learning_rate": 9.900000000000001e-05, + "loss": 0.1088, + "step": 990 + }, + { + "epoch": 3.7313432835820897, + "grad_norm": 0.3210597038269043, + "learning_rate": 0.0001, + "loss": 0.1041, + "step": 1000 + }, + { + "epoch": 3.7686567164179103, + "grad_norm": 0.4450497627258301, + "learning_rate": 9.999993165095463e-05, + "loss": 0.0985, + "step": 1010 + }, + { + "epoch": 3.8059701492537314, + "grad_norm": 0.4348960816860199, + "learning_rate": 9.999972660400536e-05, + "loss": 0.1015, + "step": 1020 + }, + { + "epoch": 3.843283582089552, + "grad_norm": 0.462782621383667, + "learning_rate": 9.999938485971279e-05, + "loss": 0.1068, + "step": 1030 + }, + { + "epoch": 3.8805970149253732, + "grad_norm": 0.3801368474960327, + "learning_rate": 9.999890641901125e-05, + "loss": 0.1117, + "step": 1040 + }, + { + "epoch": 3.917910447761194, + "grad_norm": 0.45135366916656494, + "learning_rate": 9.999829128320874e-05, + "loss": 0.0917, + "step": 1050 + }, + { + "epoch": 3.955223880597015, + "grad_norm": 0.41138389706611633, + "learning_rate": 9.999753945398704e-05, + "loss": 0.1049, + "step": 1060 + }, + { + "epoch": 3.9925373134328357, + "grad_norm": 0.4976252317428589, + "learning_rate": 9.999665093340165e-05, + "loss": 0.1029, + "step": 1070 + }, + { + "epoch": 4.029850746268656, + "grad_norm": 0.46372008323669434, + "learning_rate": 9.99956257238817e-05, + "loss": 0.1012, + "step": 1080 + }, + { + "epoch": 4.067164179104478, + "grad_norm": 0.546938955783844, + "learning_rate": 9.999446382823013e-05, + "loss": 0.0829, + "step": 1090 + }, + { + "epoch": 4.104477611940299, + "grad_norm": 0.40513405203819275, + "learning_rate": 9.999316524962345e-05, + "loss": 0.0933, + "step": 1100 + }, + { + "epoch": 4.141791044776119, + "grad_norm": 0.4198484420776367, + "learning_rate": 9.999172999161198e-05, + "loss": 0.0895, + "step": 1110 + }, + { + "epoch": 4.17910447761194, + "grad_norm": 0.3965628743171692, + "learning_rate": 9.999015805811965e-05, + "loss": 0.0917, + "step": 1120 + }, + { + "epoch": 4.2164179104477615, + "grad_norm": 0.3095884621143341, + "learning_rate": 9.998844945344405e-05, + "loss": 0.0953, + "step": 1130 + }, + { + "epoch": 4.253731343283582, + "grad_norm": 0.7962276339530945, + "learning_rate": 9.998660418225645e-05, + "loss": 0.0979, + "step": 1140 + }, + { + "epoch": 4.291044776119403, + "grad_norm": 0.42066490650177, + "learning_rate": 9.998462224960175e-05, + "loss": 0.099, + "step": 1150 + }, + { + "epoch": 4.3283582089552235, + "grad_norm": 0.3894193470478058, + "learning_rate": 9.998250366089848e-05, + "loss": 0.0887, + "step": 1160 + }, + { + "epoch": 4.365671641791045, + "grad_norm": 0.28998032212257385, + "learning_rate": 9.998024842193876e-05, + "loss": 0.0943, + "step": 1170 + }, + { + "epoch": 4.402985074626866, + "grad_norm": 0.3919823467731476, + "learning_rate": 9.997785653888835e-05, + "loss": 0.0916, + "step": 1180 + }, + { + "epoch": 4.440298507462686, + "grad_norm": 0.3708650469779968, + "learning_rate": 9.997532801828658e-05, + "loss": 0.0858, + "step": 1190 + }, + { + "epoch": 4.477611940298507, + "grad_norm": 0.2935069799423218, + "learning_rate": 9.997266286704631e-05, + "loss": 0.0992, + "step": 1200 + }, + { + "epoch": 4.514925373134329, + "grad_norm": 0.4675377607345581, + "learning_rate": 9.996986109245395e-05, + "loss": 0.0854, + "step": 1210 + }, + { + "epoch": 4.552238805970149, + "grad_norm": 0.31374865770339966, + "learning_rate": 9.996692270216947e-05, + "loss": 0.0788, + "step": 1220 + }, + { + "epoch": 4.58955223880597, + "grad_norm": 0.419249951839447, + "learning_rate": 9.996384770422629e-05, + "loss": 0.0873, + "step": 1230 + }, + { + "epoch": 4.6268656716417915, + "grad_norm": 0.26002731919288635, + "learning_rate": 9.996063610703137e-05, + "loss": 0.0845, + "step": 1240 + }, + { + "epoch": 4.664179104477612, + "grad_norm": 0.29573896527290344, + "learning_rate": 9.995728791936504e-05, + "loss": 0.091, + "step": 1250 + }, + { + "epoch": 4.701492537313433, + "grad_norm": 0.33090147376060486, + "learning_rate": 9.995380315038119e-05, + "loss": 0.0827, + "step": 1260 + }, + { + "epoch": 4.7388059701492535, + "grad_norm": 0.24417485296726227, + "learning_rate": 9.9950181809607e-05, + "loss": 0.0859, + "step": 1270 + }, + { + "epoch": 4.776119402985074, + "grad_norm": 0.48290401697158813, + "learning_rate": 9.994642390694308e-05, + "loss": 0.0889, + "step": 1280 + }, + { + "epoch": 4.813432835820896, + "grad_norm": 0.4479697048664093, + "learning_rate": 9.99425294526634e-05, + "loss": 0.097, + "step": 1290 + }, + { + "epoch": 4.850746268656716, + "grad_norm": 0.3560147285461426, + "learning_rate": 9.993849845741524e-05, + "loss": 0.0904, + "step": 1300 + }, + { + "epoch": 4.888059701492537, + "grad_norm": 0.6645416617393494, + "learning_rate": 9.99343309322192e-05, + "loss": 0.0922, + "step": 1310 + }, + { + "epoch": 4.925373134328359, + "grad_norm": 0.29696759581565857, + "learning_rate": 9.993002688846913e-05, + "loss": 0.093, + "step": 1320 + }, + { + "epoch": 4.962686567164179, + "grad_norm": 0.47146692872047424, + "learning_rate": 9.992558633793212e-05, + "loss": 0.085, + "step": 1330 + }, + { + "epoch": 5.0, + "grad_norm": 0.3430916368961334, + "learning_rate": 9.992100929274846e-05, + "loss": 0.0805, + "step": 1340 + }, + { + "epoch": 5.037313432835821, + "grad_norm": 0.3205055892467499, + "learning_rate": 9.991629576543163e-05, + "loss": 0.0766, + "step": 1350 + }, + { + "epoch": 5.074626865671641, + "grad_norm": 0.3664805293083191, + "learning_rate": 9.991144576886823e-05, + "loss": 0.0766, + "step": 1360 + }, + { + "epoch": 5.111940298507463, + "grad_norm": 0.3753412663936615, + "learning_rate": 9.990645931631796e-05, + "loss": 0.0688, + "step": 1370 + }, + { + "epoch": 5.149253731343284, + "grad_norm": 0.31633055210113525, + "learning_rate": 9.990133642141359e-05, + "loss": 0.0796, + "step": 1380 + }, + { + "epoch": 5.186567164179104, + "grad_norm": 0.3355732262134552, + "learning_rate": 9.989607709816091e-05, + "loss": 0.0716, + "step": 1390 + }, + { + "epoch": 5.223880597014926, + "grad_norm": 0.24850831925868988, + "learning_rate": 9.989068136093873e-05, + "loss": 0.0778, + "step": 1400 + }, + { + "epoch": 5.2611940298507465, + "grad_norm": 0.29537102580070496, + "learning_rate": 9.988514922449879e-05, + "loss": 0.0759, + "step": 1410 + }, + { + "epoch": 5.298507462686567, + "grad_norm": 0.3430945873260498, + "learning_rate": 9.987948070396571e-05, + "loss": 0.0774, + "step": 1420 + }, + { + "epoch": 5.335820895522388, + "grad_norm": 0.5220637917518616, + "learning_rate": 9.987367581483705e-05, + "loss": 0.0836, + "step": 1430 + }, + { + "epoch": 5.373134328358209, + "grad_norm": 0.28184008598327637, + "learning_rate": 9.986773457298311e-05, + "loss": 0.0752, + "step": 1440 + }, + { + "epoch": 5.41044776119403, + "grad_norm": 0.36261311173439026, + "learning_rate": 9.986165699464705e-05, + "loss": 0.075, + "step": 1450 + }, + { + "epoch": 5.447761194029851, + "grad_norm": 0.5107380151748657, + "learning_rate": 9.985544309644475e-05, + "loss": 0.0814, + "step": 1460 + }, + { + "epoch": 5.485074626865671, + "grad_norm": 0.2446671426296234, + "learning_rate": 9.984909289536473e-05, + "loss": 0.0704, + "step": 1470 + }, + { + "epoch": 5.522388059701493, + "grad_norm": 0.30449381470680237, + "learning_rate": 9.984260640876821e-05, + "loss": 0.0794, + "step": 1480 + }, + { + "epoch": 5.559701492537314, + "grad_norm": 0.25645050406455994, + "learning_rate": 9.983598365438902e-05, + "loss": 0.0709, + "step": 1490 + }, + { + "epoch": 5.597014925373134, + "grad_norm": 0.23825006186962128, + "learning_rate": 9.98292246503335e-05, + "loss": 0.0828, + "step": 1500 + }, + { + "epoch": 5.634328358208955, + "grad_norm": 0.3259269893169403, + "learning_rate": 9.98223294150805e-05, + "loss": 0.0824, + "step": 1510 + }, + { + "epoch": 5.6716417910447765, + "grad_norm": 0.24058914184570312, + "learning_rate": 9.981529796748134e-05, + "loss": 0.073, + "step": 1520 + }, + { + "epoch": 5.708955223880597, + "grad_norm": 0.34457242488861084, + "learning_rate": 9.980813032675974e-05, + "loss": 0.0845, + "step": 1530 + }, + { + "epoch": 5.746268656716418, + "grad_norm": 0.32940393686294556, + "learning_rate": 9.980082651251175e-05, + "loss": 0.0832, + "step": 1540 + }, + { + "epoch": 5.7835820895522385, + "grad_norm": 0.5683007836341858, + "learning_rate": 9.979338654470569e-05, + "loss": 0.0836, + "step": 1550 + }, + { + "epoch": 5.82089552238806, + "grad_norm": 0.31041061878204346, + "learning_rate": 9.97858104436822e-05, + "loss": 0.07, + "step": 1560 + }, + { + "epoch": 5.858208955223881, + "grad_norm": 0.37858131527900696, + "learning_rate": 9.977809823015401e-05, + "loss": 0.0738, + "step": 1570 + }, + { + "epoch": 5.895522388059701, + "grad_norm": 0.2743091583251953, + "learning_rate": 9.977024992520602e-05, + "loss": 0.0761, + "step": 1580 + }, + { + "epoch": 5.932835820895522, + "grad_norm": 0.29117098450660706, + "learning_rate": 9.976226555029522e-05, + "loss": 0.0777, + "step": 1590 + }, + { + "epoch": 5.970149253731344, + "grad_norm": 0.31398633122444153, + "learning_rate": 9.975414512725057e-05, + "loss": 0.0664, + "step": 1600 + }, + { + "epoch": 6.007462686567164, + "grad_norm": 0.2684272527694702, + "learning_rate": 9.974588867827301e-05, + "loss": 0.0686, + "step": 1610 + }, + { + "epoch": 6.044776119402985, + "grad_norm": 0.3945397436618805, + "learning_rate": 9.973749622593534e-05, + "loss": 0.0614, + "step": 1620 + }, + { + "epoch": 6.082089552238806, + "grad_norm": 0.2747954726219177, + "learning_rate": 9.972896779318219e-05, + "loss": 0.0681, + "step": 1630 + }, + { + "epoch": 6.119402985074627, + "grad_norm": 0.43257200717926025, + "learning_rate": 9.972030340333001e-05, + "loss": 0.0725, + "step": 1640 + }, + { + "epoch": 6.156716417910448, + "grad_norm": 0.3559250831604004, + "learning_rate": 9.97115030800669e-05, + "loss": 0.0804, + "step": 1650 + }, + { + "epoch": 6.1940298507462686, + "grad_norm": 0.3079264760017395, + "learning_rate": 9.970256684745258e-05, + "loss": 0.0649, + "step": 1660 + }, + { + "epoch": 6.231343283582089, + "grad_norm": 0.32298946380615234, + "learning_rate": 9.969349472991838e-05, + "loss": 0.0668, + "step": 1670 + }, + { + "epoch": 6.268656716417911, + "grad_norm": 0.2826225459575653, + "learning_rate": 9.968428675226714e-05, + "loss": 0.0734, + "step": 1680 + }, + { + "epoch": 6.3059701492537314, + "grad_norm": 0.39002349972724915, + "learning_rate": 9.967494293967312e-05, + "loss": 0.0728, + "step": 1690 + }, + { + "epoch": 6.343283582089552, + "grad_norm": 0.403890997171402, + "learning_rate": 9.966546331768191e-05, + "loss": 0.067, + "step": 1700 + }, + { + "epoch": 6.380597014925373, + "grad_norm": 0.3755359351634979, + "learning_rate": 9.965584791221048e-05, + "loss": 0.0755, + "step": 1710 + }, + { + "epoch": 6.417910447761194, + "grad_norm": 0.26346635818481445, + "learning_rate": 9.964609674954696e-05, + "loss": 0.0728, + "step": 1720 + }, + { + "epoch": 6.455223880597015, + "grad_norm": 0.45292145013809204, + "learning_rate": 9.963620985635065e-05, + "loss": 0.0731, + "step": 1730 + }, + { + "epoch": 6.492537313432836, + "grad_norm": 0.3568434715270996, + "learning_rate": 9.962618725965196e-05, + "loss": 0.0761, + "step": 1740 + }, + { + "epoch": 6.529850746268656, + "grad_norm": 0.2551257014274597, + "learning_rate": 9.961602898685226e-05, + "loss": 0.0694, + "step": 1750 + }, + { + "epoch": 6.567164179104478, + "grad_norm": 0.6106354594230652, + "learning_rate": 9.96057350657239e-05, + "loss": 0.0827, + "step": 1760 + }, + { + "epoch": 6.604477611940299, + "grad_norm": 0.3226093053817749, + "learning_rate": 9.959530552441005e-05, + "loss": 0.0716, + "step": 1770 + }, + { + "epoch": 6.641791044776119, + "grad_norm": 0.4297254979610443, + "learning_rate": 9.95847403914247e-05, + "loss": 0.0748, + "step": 1780 + }, + { + "epoch": 6.67910447761194, + "grad_norm": 0.26469680666923523, + "learning_rate": 9.95740396956525e-05, + "loss": 0.074, + "step": 1790 + }, + { + "epoch": 6.7164179104477615, + "grad_norm": 0.22717897593975067, + "learning_rate": 9.956320346634876e-05, + "loss": 0.0739, + "step": 1800 + }, + { + "epoch": 6.753731343283582, + "grad_norm": 0.4513498544692993, + "learning_rate": 9.955223173313931e-05, + "loss": 0.0664, + "step": 1810 + }, + { + "epoch": 6.791044776119403, + "grad_norm": 0.31683439016342163, + "learning_rate": 9.954112452602045e-05, + "loss": 0.069, + "step": 1820 + }, + { + "epoch": 6.8283582089552235, + "grad_norm": 0.3350532650947571, + "learning_rate": 9.952988187535886e-05, + "loss": 0.0699, + "step": 1830 + }, + { + "epoch": 6.865671641791045, + "grad_norm": 0.29829463362693787, + "learning_rate": 9.95185038118915e-05, + "loss": 0.0663, + "step": 1840 + }, + { + "epoch": 6.902985074626866, + "grad_norm": 0.31650781631469727, + "learning_rate": 9.950699036672559e-05, + "loss": 0.0668, + "step": 1850 + }, + { + "epoch": 6.940298507462686, + "grad_norm": 0.360944926738739, + "learning_rate": 9.949534157133844e-05, + "loss": 0.0696, + "step": 1860 + }, + { + "epoch": 6.977611940298507, + "grad_norm": 0.31337013840675354, + "learning_rate": 9.948355745757741e-05, + "loss": 0.073, + "step": 1870 + }, + { + "epoch": 7.014925373134329, + "grad_norm": 0.4675919711589813, + "learning_rate": 9.94716380576598e-05, + "loss": 0.0688, + "step": 1880 + }, + { + "epoch": 7.052238805970149, + "grad_norm": 0.3031919002532959, + "learning_rate": 9.945958340417283e-05, + "loss": 0.0596, + "step": 1890 + }, + { + "epoch": 7.08955223880597, + "grad_norm": 0.24858474731445312, + "learning_rate": 9.944739353007344e-05, + "loss": 0.0717, + "step": 1900 + }, + { + "epoch": 7.126865671641791, + "grad_norm": 0.20959483087062836, + "learning_rate": 9.943506846868826e-05, + "loss": 0.0694, + "step": 1910 + }, + { + "epoch": 7.164179104477612, + "grad_norm": 0.35621434450149536, + "learning_rate": 9.942260825371358e-05, + "loss": 0.063, + "step": 1920 + }, + { + "epoch": 7.201492537313433, + "grad_norm": 0.3462587594985962, + "learning_rate": 9.941001291921512e-05, + "loss": 0.068, + "step": 1930 + }, + { + "epoch": 7.2388059701492535, + "grad_norm": 0.38649681210517883, + "learning_rate": 9.939728249962807e-05, + "loss": 0.0638, + "step": 1940 + }, + { + "epoch": 7.276119402985074, + "grad_norm": 0.29564595222473145, + "learning_rate": 9.938441702975689e-05, + "loss": 0.0626, + "step": 1950 + }, + { + "epoch": 7.313432835820896, + "grad_norm": 0.339857816696167, + "learning_rate": 9.937141654477528e-05, + "loss": 0.0535, + "step": 1960 + }, + { + "epoch": 7.350746268656716, + "grad_norm": 0.2591215670108795, + "learning_rate": 9.93582810802261e-05, + "loss": 0.0645, + "step": 1970 + }, + { + "epoch": 7.388059701492537, + "grad_norm": 0.30237796902656555, + "learning_rate": 9.934501067202117e-05, + "loss": 0.0675, + "step": 1980 + }, + { + "epoch": 7.425373134328359, + "grad_norm": 0.28394174575805664, + "learning_rate": 9.93316053564413e-05, + "loss": 0.0643, + "step": 1990 + }, + { + "epoch": 7.462686567164179, + "grad_norm": 0.3124663233757019, + "learning_rate": 9.931806517013612e-05, + "loss": 0.059, + "step": 2000 + }, + { + "epoch": 7.5, + "grad_norm": 0.36073037981987, + "learning_rate": 9.930439015012396e-05, + "loss": 0.0606, + "step": 2010 + }, + { + "epoch": 7.537313432835821, + "grad_norm": 0.4091481864452362, + "learning_rate": 9.929058033379181e-05, + "loss": 0.0603, + "step": 2020 + }, + { + "epoch": 7.574626865671641, + "grad_norm": 0.44718074798583984, + "learning_rate": 9.927663575889521e-05, + "loss": 0.0741, + "step": 2030 + }, + { + "epoch": 7.611940298507463, + "grad_norm": 0.3819601833820343, + "learning_rate": 9.926255646355804e-05, + "loss": 0.0707, + "step": 2040 + }, + { + "epoch": 7.649253731343284, + "grad_norm": 0.23336420953273773, + "learning_rate": 9.92483424862726e-05, + "loss": 0.0676, + "step": 2050 + }, + { + "epoch": 7.686567164179104, + "grad_norm": 0.24415315687656403, + "learning_rate": 9.923399386589933e-05, + "loss": 0.0594, + "step": 2060 + }, + { + "epoch": 7.723880597014926, + "grad_norm": 0.3735473155975342, + "learning_rate": 9.921951064166684e-05, + "loss": 0.062, + "step": 2070 + }, + { + "epoch": 7.7611940298507465, + "grad_norm": 0.31629472970962524, + "learning_rate": 9.92048928531717e-05, + "loss": 0.0606, + "step": 2080 + }, + { + "epoch": 7.798507462686567, + "grad_norm": 0.37902557849884033, + "learning_rate": 9.919014054037836e-05, + "loss": 0.0584, + "step": 2090 + }, + { + "epoch": 7.835820895522388, + "grad_norm": 0.3486720323562622, + "learning_rate": 9.917525374361912e-05, + "loss": 0.056, + "step": 2100 + }, + { + "epoch": 7.8731343283582085, + "grad_norm": 0.3731362521648407, + "learning_rate": 9.91602325035939e-05, + "loss": 0.0601, + "step": 2110 + }, + { + "epoch": 7.91044776119403, + "grad_norm": 0.3560399115085602, + "learning_rate": 9.914507686137019e-05, + "loss": 0.06, + "step": 2120 + }, + { + "epoch": 7.947761194029851, + "grad_norm": 0.30075564980506897, + "learning_rate": 9.912978685838294e-05, + "loss": 0.0657, + "step": 2130 + }, + { + "epoch": 7.985074626865671, + "grad_norm": 0.2984028458595276, + "learning_rate": 9.911436253643445e-05, + "loss": 0.0587, + "step": 2140 + }, + { + "epoch": 8.022388059701493, + "grad_norm": 0.1980169117450714, + "learning_rate": 9.90988039376942e-05, + "loss": 0.0718, + "step": 2150 + }, + { + "epoch": 8.059701492537313, + "grad_norm": 0.31339579820632935, + "learning_rate": 9.90831111046988e-05, + "loss": 0.0557, + "step": 2160 + }, + { + "epoch": 8.097014925373134, + "grad_norm": 0.1968696266412735, + "learning_rate": 9.90672840803519e-05, + "loss": 0.0571, + "step": 2170 + }, + { + "epoch": 8.134328358208956, + "grad_norm": 0.23931682109832764, + "learning_rate": 9.905132290792394e-05, + "loss": 0.0566, + "step": 2180 + }, + { + "epoch": 8.171641791044776, + "grad_norm": 0.21741189062595367, + "learning_rate": 9.903522763105218e-05, + "loss": 0.0575, + "step": 2190 + }, + { + "epoch": 8.208955223880597, + "grad_norm": 0.22874368727207184, + "learning_rate": 9.901899829374047e-05, + "loss": 0.0565, + "step": 2200 + }, + { + "epoch": 8.246268656716419, + "grad_norm": 0.3441888093948364, + "learning_rate": 9.900263494035921e-05, + "loss": 0.0565, + "step": 2210 + }, + { + "epoch": 8.283582089552239, + "grad_norm": 0.2539830803871155, + "learning_rate": 9.89861376156452e-05, + "loss": 0.0538, + "step": 2220 + }, + { + "epoch": 8.32089552238806, + "grad_norm": 0.2235102653503418, + "learning_rate": 9.896950636470147e-05, + "loss": 0.0609, + "step": 2230 + }, + { + "epoch": 8.35820895522388, + "grad_norm": 0.1941322684288025, + "learning_rate": 9.895274123299723e-05, + "loss": 0.0562, + "step": 2240 + }, + { + "epoch": 8.395522388059701, + "grad_norm": 0.2691369950771332, + "learning_rate": 9.893584226636772e-05, + "loss": 0.0608, + "step": 2250 + }, + { + "epoch": 8.432835820895523, + "grad_norm": 0.24730461835861206, + "learning_rate": 9.891880951101407e-05, + "loss": 0.0582, + "step": 2260 + }, + { + "epoch": 8.470149253731343, + "grad_norm": 0.34785839915275574, + "learning_rate": 9.890164301350318e-05, + "loss": 0.0506, + "step": 2270 + }, + { + "epoch": 8.507462686567164, + "grad_norm": 0.3625825345516205, + "learning_rate": 9.888434282076758e-05, + "loss": 0.0614, + "step": 2280 + }, + { + "epoch": 8.544776119402986, + "grad_norm": 0.25210148096084595, + "learning_rate": 9.886690898010535e-05, + "loss": 0.0611, + "step": 2290 + }, + { + "epoch": 8.582089552238806, + "grad_norm": 0.27312466502189636, + "learning_rate": 9.884934153917997e-05, + "loss": 0.0537, + "step": 2300 + }, + { + "epoch": 8.619402985074627, + "grad_norm": 0.314647912979126, + "learning_rate": 9.883164054602012e-05, + "loss": 0.0602, + "step": 2310 + }, + { + "epoch": 8.656716417910447, + "grad_norm": 0.21531912684440613, + "learning_rate": 9.881380604901964e-05, + "loss": 0.0552, + "step": 2320 + }, + { + "epoch": 8.694029850746269, + "grad_norm": 0.23920664191246033, + "learning_rate": 9.879583809693738e-05, + "loss": 0.0613, + "step": 2330 + }, + { + "epoch": 8.73134328358209, + "grad_norm": 0.21864956617355347, + "learning_rate": 9.877773673889701e-05, + "loss": 0.0649, + "step": 2340 + }, + { + "epoch": 8.76865671641791, + "grad_norm": 0.27523377537727356, + "learning_rate": 9.8759502024387e-05, + "loss": 0.0606, + "step": 2350 + }, + { + "epoch": 8.805970149253731, + "grad_norm": 0.24805469810962677, + "learning_rate": 9.87411340032603e-05, + "loss": 0.0549, + "step": 2360 + }, + { + "epoch": 8.843283582089553, + "grad_norm": 0.23070092499256134, + "learning_rate": 9.872263272573443e-05, + "loss": 0.0562, + "step": 2370 + }, + { + "epoch": 8.880597014925373, + "grad_norm": 0.20833946764469147, + "learning_rate": 9.870399824239117e-05, + "loss": 0.05, + "step": 2380 + }, + { + "epoch": 8.917910447761194, + "grad_norm": 0.34507372975349426, + "learning_rate": 9.868523060417646e-05, + "loss": 0.0613, + "step": 2390 + }, + { + "epoch": 8.955223880597014, + "grad_norm": 0.32865110039711, + "learning_rate": 9.86663298624003e-05, + "loss": 0.0621, + "step": 2400 + }, + { + "epoch": 8.992537313432836, + "grad_norm": 0.21305270493030548, + "learning_rate": 9.864729606873663e-05, + "loss": 0.0572, + "step": 2410 + }, + { + "epoch": 9.029850746268657, + "grad_norm": 0.28193730115890503, + "learning_rate": 9.862812927522309e-05, + "loss": 0.0555, + "step": 2420 + }, + { + "epoch": 9.067164179104477, + "grad_norm": 0.3953789472579956, + "learning_rate": 9.860882953426099e-05, + "loss": 0.0536, + "step": 2430 + }, + { + "epoch": 9.104477611940299, + "grad_norm": 0.23013322055339813, + "learning_rate": 9.858939689861506e-05, + "loss": 0.0572, + "step": 2440 + }, + { + "epoch": 9.14179104477612, + "grad_norm": 0.2906680107116699, + "learning_rate": 9.856983142141339e-05, + "loss": 0.0592, + "step": 2450 + }, + { + "epoch": 9.17910447761194, + "grad_norm": 0.23490828275680542, + "learning_rate": 9.855013315614725e-05, + "loss": 0.0583, + "step": 2460 + }, + { + "epoch": 9.216417910447761, + "grad_norm": 0.22825880348682404, + "learning_rate": 9.853030215667093e-05, + "loss": 0.059, + "step": 2470 + }, + { + "epoch": 9.253731343283581, + "grad_norm": 0.25871285796165466, + "learning_rate": 9.851033847720166e-05, + "loss": 0.0555, + "step": 2480 + }, + { + "epoch": 9.291044776119403, + "grad_norm": 0.27220776677131653, + "learning_rate": 9.849024217231935e-05, + "loss": 0.0542, + "step": 2490 + }, + { + "epoch": 9.328358208955224, + "grad_norm": 0.26534005999565125, + "learning_rate": 9.847001329696653e-05, + "loss": 0.0526, + "step": 2500 + }, + { + "epoch": 9.365671641791044, + "grad_norm": 0.33486032485961914, + "learning_rate": 9.844965190644817e-05, + "loss": 0.0563, + "step": 2510 + }, + { + "epoch": 9.402985074626866, + "grad_norm": 0.2949483394622803, + "learning_rate": 9.842915805643155e-05, + "loss": 0.0556, + "step": 2520 + }, + { + "epoch": 9.440298507462687, + "grad_norm": 0.24123981595039368, + "learning_rate": 9.840853180294608e-05, + "loss": 0.05, + "step": 2530 + }, + { + "epoch": 9.477611940298507, + "grad_norm": 0.22536049783229828, + "learning_rate": 9.838777320238312e-05, + "loss": 0.0522, + "step": 2540 + }, + { + "epoch": 9.514925373134329, + "grad_norm": 0.23206663131713867, + "learning_rate": 9.836688231149592e-05, + "loss": 0.0591, + "step": 2550 + }, + { + "epoch": 9.552238805970148, + "grad_norm": 0.28573134541511536, + "learning_rate": 9.834585918739936e-05, + "loss": 0.0568, + "step": 2560 + }, + { + "epoch": 9.58955223880597, + "grad_norm": 0.2628820538520813, + "learning_rate": 9.832470388756987e-05, + "loss": 0.0571, + "step": 2570 + }, + { + "epoch": 9.626865671641792, + "grad_norm": 0.2880440652370453, + "learning_rate": 9.830341646984521e-05, + "loss": 0.0559, + "step": 2580 + }, + { + "epoch": 9.664179104477611, + "grad_norm": 0.1786259263753891, + "learning_rate": 9.82819969924244e-05, + "loss": 0.058, + "step": 2590 + }, + { + "epoch": 9.701492537313433, + "grad_norm": 0.3501608073711395, + "learning_rate": 9.826044551386744e-05, + "loss": 0.0523, + "step": 2600 + }, + { + "epoch": 9.738805970149254, + "grad_norm": 0.24757252633571625, + "learning_rate": 9.823876209309527e-05, + "loss": 0.0587, + "step": 2610 + }, + { + "epoch": 9.776119402985074, + "grad_norm": 0.2556290626525879, + "learning_rate": 9.821694678938953e-05, + "loss": 0.0555, + "step": 2620 + }, + { + "epoch": 9.813432835820896, + "grad_norm": 0.2561217248439789, + "learning_rate": 9.819499966239243e-05, + "loss": 0.052, + "step": 2630 + }, + { + "epoch": 9.850746268656717, + "grad_norm": 0.2776634097099304, + "learning_rate": 9.817292077210659e-05, + "loss": 0.0498, + "step": 2640 + }, + { + "epoch": 9.888059701492537, + "grad_norm": 0.20668549835681915, + "learning_rate": 9.815071017889482e-05, + "loss": 0.0517, + "step": 2650 + }, + { + "epoch": 9.925373134328359, + "grad_norm": 0.3100263178348541, + "learning_rate": 9.812836794348004e-05, + "loss": 0.0633, + "step": 2660 + }, + { + "epoch": 9.962686567164178, + "grad_norm": 0.2780782878398895, + "learning_rate": 9.81058941269451e-05, + "loss": 0.0581, + "step": 2670 + }, + { + "epoch": 10.0, + "grad_norm": 0.28903728723526, + "learning_rate": 9.808328879073251e-05, + "loss": 0.0538, + "step": 2680 + }, + { + "epoch": 10.037313432835822, + "grad_norm": 0.22727562487125397, + "learning_rate": 9.806055199664446e-05, + "loss": 0.0491, + "step": 2690 + }, + { + "epoch": 10.074626865671641, + "grad_norm": 0.267918199300766, + "learning_rate": 9.803768380684242e-05, + "loss": 0.0562, + "step": 2700 + }, + { + "epoch": 10.111940298507463, + "grad_norm": 0.2988606095314026, + "learning_rate": 9.801468428384716e-05, + "loss": 0.0566, + "step": 2710 + }, + { + "epoch": 10.149253731343283, + "grad_norm": 0.2710281312465668, + "learning_rate": 9.799155349053851e-05, + "loss": 0.0541, + "step": 2720 + }, + { + "epoch": 10.186567164179104, + "grad_norm": 0.15320520102977753, + "learning_rate": 9.796829149015517e-05, + "loss": 0.0548, + "step": 2730 + }, + { + "epoch": 10.223880597014926, + "grad_norm": 0.2653089463710785, + "learning_rate": 9.794489834629455e-05, + "loss": 0.0599, + "step": 2740 + }, + { + "epoch": 10.261194029850746, + "grad_norm": 0.19223959743976593, + "learning_rate": 9.792137412291265e-05, + "loss": 0.0494, + "step": 2750 + }, + { + "epoch": 10.298507462686567, + "grad_norm": 0.20455987751483917, + "learning_rate": 9.789771888432375e-05, + "loss": 0.0538, + "step": 2760 + }, + { + "epoch": 10.335820895522389, + "grad_norm": 0.24908749759197235, + "learning_rate": 9.787393269520039e-05, + "loss": 0.0481, + "step": 2770 + }, + { + "epoch": 10.373134328358208, + "grad_norm": 0.3131813406944275, + "learning_rate": 9.785001562057309e-05, + "loss": 0.0526, + "step": 2780 + }, + { + "epoch": 10.41044776119403, + "grad_norm": 0.24828971922397614, + "learning_rate": 9.782596772583026e-05, + "loss": 0.0489, + "step": 2790 + }, + { + "epoch": 10.447761194029852, + "grad_norm": 0.21727119386196136, + "learning_rate": 9.780178907671789e-05, + "loss": 0.0532, + "step": 2800 + }, + { + "epoch": 10.485074626865671, + "grad_norm": 0.20279547572135925, + "learning_rate": 9.777747973933948e-05, + "loss": 0.0565, + "step": 2810 + }, + { + "epoch": 10.522388059701493, + "grad_norm": 0.17726702988147736, + "learning_rate": 9.775303978015585e-05, + "loss": 0.0437, + "step": 2820 + }, + { + "epoch": 10.559701492537313, + "grad_norm": 0.18961119651794434, + "learning_rate": 9.772846926598491e-05, + "loss": 0.0584, + "step": 2830 + }, + { + "epoch": 10.597014925373134, + "grad_norm": 0.2498980015516281, + "learning_rate": 9.77037682640015e-05, + "loss": 0.0496, + "step": 2840 + }, + { + "epoch": 10.634328358208956, + "grad_norm": 0.16978798806667328, + "learning_rate": 9.767893684173721e-05, + "loss": 0.0469, + "step": 2850 + }, + { + "epoch": 10.671641791044776, + "grad_norm": 0.16128584742546082, + "learning_rate": 9.765397506708023e-05, + "loss": 0.0533, + "step": 2860 + }, + { + "epoch": 10.708955223880597, + "grad_norm": 0.20463155210018158, + "learning_rate": 9.762888300827507e-05, + "loss": 0.0464, + "step": 2870 + }, + { + "epoch": 10.746268656716419, + "grad_norm": 0.30601629614830017, + "learning_rate": 9.760366073392246e-05, + "loss": 0.0489, + "step": 2880 + }, + { + "epoch": 10.783582089552239, + "grad_norm": 0.2730671763420105, + "learning_rate": 9.757830831297914e-05, + "loss": 0.0495, + "step": 2890 + }, + { + "epoch": 10.82089552238806, + "grad_norm": 0.251432865858078, + "learning_rate": 9.755282581475769e-05, + "loss": 0.0549, + "step": 2900 + }, + { + "epoch": 10.85820895522388, + "grad_norm": 0.26670166850090027, + "learning_rate": 9.752721330892624e-05, + "loss": 0.061, + "step": 2910 + }, + { + "epoch": 10.895522388059701, + "grad_norm": 0.2965967655181885, + "learning_rate": 9.750147086550844e-05, + "loss": 0.0473, + "step": 2920 + }, + { + "epoch": 10.932835820895523, + "grad_norm": 0.683840274810791, + "learning_rate": 9.747559855488313e-05, + "loss": 0.0509, + "step": 2930 + }, + { + "epoch": 10.970149253731343, + "grad_norm": 0.25740495324134827, + "learning_rate": 9.744959644778422e-05, + "loss": 0.0515, + "step": 2940 + }, + { + "epoch": 11.007462686567164, + "grad_norm": 0.2880542278289795, + "learning_rate": 9.742346461530048e-05, + "loss": 0.0482, + "step": 2950 + }, + { + "epoch": 11.044776119402986, + "grad_norm": 0.45032551884651184, + "learning_rate": 9.739720312887535e-05, + "loss": 0.0557, + "step": 2960 + }, + { + "epoch": 11.082089552238806, + "grad_norm": 0.2829900085926056, + "learning_rate": 9.73708120603067e-05, + "loss": 0.052, + "step": 2970 + }, + { + "epoch": 11.119402985074627, + "grad_norm": 0.309597373008728, + "learning_rate": 9.734429148174675e-05, + "loss": 0.0541, + "step": 2980 + }, + { + "epoch": 11.156716417910447, + "grad_norm": 0.2433389127254486, + "learning_rate": 9.731764146570173e-05, + "loss": 0.0482, + "step": 2990 + }, + { + "epoch": 11.194029850746269, + "grad_norm": 0.24458132684230804, + "learning_rate": 9.729086208503174e-05, + "loss": 0.0505, + "step": 3000 + }, + { + "epoch": 11.23134328358209, + "grad_norm": 0.2305087298154831, + "learning_rate": 9.726395341295062e-05, + "loss": 0.0504, + "step": 3010 + }, + { + "epoch": 11.26865671641791, + "grad_norm": 0.18110457062721252, + "learning_rate": 9.723691552302562e-05, + "loss": 0.0575, + "step": 3020 + }, + { + "epoch": 11.305970149253731, + "grad_norm": 0.20407621562480927, + "learning_rate": 9.720974848917735e-05, + "loss": 0.0494, + "step": 3030 + }, + { + "epoch": 11.343283582089553, + "grad_norm": 0.25924697518348694, + "learning_rate": 9.718245238567939e-05, + "loss": 0.0472, + "step": 3040 + }, + { + "epoch": 11.380597014925373, + "grad_norm": 0.23041822016239166, + "learning_rate": 9.715502728715826e-05, + "loss": 0.0481, + "step": 3050 + }, + { + "epoch": 11.417910447761194, + "grad_norm": 0.25381171703338623, + "learning_rate": 9.712747326859315e-05, + "loss": 0.0543, + "step": 3060 + }, + { + "epoch": 11.455223880597014, + "grad_norm": 0.18027640879154205, + "learning_rate": 9.709979040531569e-05, + "loss": 0.055, + "step": 3070 + }, + { + "epoch": 11.492537313432836, + "grad_norm": 0.2954868674278259, + "learning_rate": 9.707197877300974e-05, + "loss": 0.0473, + "step": 3080 + }, + { + "epoch": 11.529850746268657, + "grad_norm": 0.25323861837387085, + "learning_rate": 9.704403844771128e-05, + "loss": 0.0509, + "step": 3090 + }, + { + "epoch": 11.567164179104477, + "grad_norm": 0.36910176277160645, + "learning_rate": 9.701596950580806e-05, + "loss": 0.0504, + "step": 3100 + }, + { + "epoch": 11.604477611940299, + "grad_norm": 0.34199246764183044, + "learning_rate": 9.698777202403953e-05, + "loss": 0.0526, + "step": 3110 + }, + { + "epoch": 11.64179104477612, + "grad_norm": 0.2146557718515396, + "learning_rate": 9.695944607949649e-05, + "loss": 0.0579, + "step": 3120 + }, + { + "epoch": 11.67910447761194, + "grad_norm": 0.20559175312519073, + "learning_rate": 9.693099174962103e-05, + "loss": 0.0514, + "step": 3130 + }, + { + "epoch": 11.716417910447761, + "grad_norm": 0.2689419090747833, + "learning_rate": 9.690240911220618e-05, + "loss": 0.0534, + "step": 3140 + }, + { + "epoch": 11.753731343283581, + "grad_norm": 0.34870603680610657, + "learning_rate": 9.687369824539577e-05, + "loss": 0.0485, + "step": 3150 + }, + { + "epoch": 11.791044776119403, + "grad_norm": 0.15433363616466522, + "learning_rate": 9.684485922768422e-05, + "loss": 0.0418, + "step": 3160 + }, + { + "epoch": 11.828358208955224, + "grad_norm": 0.26874423027038574, + "learning_rate": 9.681589213791633e-05, + "loss": 0.0537, + "step": 3170 + }, + { + "epoch": 11.865671641791044, + "grad_norm": 0.3361654281616211, + "learning_rate": 9.6786797055287e-05, + "loss": 0.0474, + "step": 3180 + }, + { + "epoch": 11.902985074626866, + "grad_norm": 0.17938771843910217, + "learning_rate": 9.675757405934103e-05, + "loss": 0.0443, + "step": 3190 + }, + { + "epoch": 11.940298507462687, + "grad_norm": 0.31368622183799744, + "learning_rate": 9.672822322997305e-05, + "loss": 0.0594, + "step": 3200 + }, + { + "epoch": 11.977611940298507, + "grad_norm": 0.16268151998519897, + "learning_rate": 9.669874464742705e-05, + "loss": 0.0487, + "step": 3210 + }, + { + "epoch": 12.014925373134329, + "grad_norm": 0.23879969120025635, + "learning_rate": 9.66691383922964e-05, + "loss": 0.0484, + "step": 3220 + }, + { + "epoch": 12.052238805970148, + "grad_norm": 0.2321789413690567, + "learning_rate": 9.663940454552342e-05, + "loss": 0.051, + "step": 3230 + }, + { + "epoch": 12.08955223880597, + "grad_norm": 0.22873088717460632, + "learning_rate": 9.660954318839933e-05, + "loss": 0.0406, + "step": 3240 + }, + { + "epoch": 12.126865671641792, + "grad_norm": 0.3767557740211487, + "learning_rate": 9.657955440256395e-05, + "loss": 0.0432, + "step": 3250 + }, + { + "epoch": 12.164179104477611, + "grad_norm": 0.21569453179836273, + "learning_rate": 9.654943827000548e-05, + "loss": 0.0528, + "step": 3260 + }, + { + "epoch": 12.201492537313433, + "grad_norm": 0.23698291182518005, + "learning_rate": 9.651919487306025e-05, + "loss": 0.0457, + "step": 3270 + }, + { + "epoch": 12.238805970149254, + "grad_norm": 0.21086478233337402, + "learning_rate": 9.648882429441257e-05, + "loss": 0.0508, + "step": 3280 + }, + { + "epoch": 12.276119402985074, + "grad_norm": 0.19763463735580444, + "learning_rate": 9.645832661709444e-05, + "loss": 0.0497, + "step": 3290 + }, + { + "epoch": 12.313432835820896, + "grad_norm": 0.18413852155208588, + "learning_rate": 9.642770192448536e-05, + "loss": 0.0441, + "step": 3300 + }, + { + "epoch": 12.350746268656717, + "grad_norm": 0.13946911692619324, + "learning_rate": 9.639695030031204e-05, + "loss": 0.0453, + "step": 3310 + }, + { + "epoch": 12.388059701492537, + "grad_norm": 0.21613670885562897, + "learning_rate": 9.636607182864827e-05, + "loss": 0.0511, + "step": 3320 + }, + { + "epoch": 12.425373134328359, + "grad_norm": 0.24953646957874298, + "learning_rate": 9.63350665939146e-05, + "loss": 0.0451, + "step": 3330 + }, + { + "epoch": 12.462686567164178, + "grad_norm": 0.2993795871734619, + "learning_rate": 9.630393468087818e-05, + "loss": 0.0469, + "step": 3340 + }, + { + "epoch": 12.5, + "grad_norm": 0.2261819839477539, + "learning_rate": 9.627267617465243e-05, + "loss": 0.0484, + "step": 3350 + }, + { + "epoch": 12.537313432835822, + "grad_norm": 0.23026186227798462, + "learning_rate": 9.624129116069694e-05, + "loss": 0.0452, + "step": 3360 + }, + { + "epoch": 12.574626865671641, + "grad_norm": 0.27859947085380554, + "learning_rate": 9.620977972481716e-05, + "loss": 0.0593, + "step": 3370 + }, + { + "epoch": 12.611940298507463, + "grad_norm": 0.23060785233974457, + "learning_rate": 9.617814195316411e-05, + "loss": 0.05, + "step": 3380 + }, + { + "epoch": 12.649253731343283, + "grad_norm": 0.20185025036334991, + "learning_rate": 9.614637793223425e-05, + "loss": 0.0573, + "step": 3390 + }, + { + "epoch": 12.686567164179104, + "grad_norm": 0.3584498167037964, + "learning_rate": 9.611448774886924e-05, + "loss": 0.052, + "step": 3400 + }, + { + "epoch": 12.723880597014926, + "grad_norm": 0.19336827099323273, + "learning_rate": 9.60824714902556e-05, + "loss": 0.0535, + "step": 3410 + }, + { + "epoch": 12.761194029850746, + "grad_norm": 0.22223635017871857, + "learning_rate": 9.605032924392457e-05, + "loss": 0.05, + "step": 3420 + }, + { + "epoch": 12.798507462686567, + "grad_norm": 0.17108851671218872, + "learning_rate": 9.601806109775179e-05, + "loss": 0.0475, + "step": 3430 + }, + { + "epoch": 12.835820895522389, + "grad_norm": 0.3861902952194214, + "learning_rate": 9.598566713995718e-05, + "loss": 0.0439, + "step": 3440 + }, + { + "epoch": 12.873134328358208, + "grad_norm": 0.18927253782749176, + "learning_rate": 9.595314745910456e-05, + "loss": 0.052, + "step": 3450 + }, + { + "epoch": 12.91044776119403, + "grad_norm": 0.21963383257389069, + "learning_rate": 9.59205021441015e-05, + "loss": 0.0504, + "step": 3460 + }, + { + "epoch": 12.947761194029852, + "grad_norm": 0.18016670644283295, + "learning_rate": 9.588773128419906e-05, + "loss": 0.0467, + "step": 3470 + }, + { + "epoch": 12.985074626865671, + "grad_norm": 0.1776365041732788, + "learning_rate": 9.58548349689915e-05, + "loss": 0.0414, + "step": 3480 + }, + { + "epoch": 13.022388059701493, + "grad_norm": 0.2616482973098755, + "learning_rate": 9.582181328841611e-05, + "loss": 0.0442, + "step": 3490 + }, + { + "epoch": 13.059701492537313, + "grad_norm": 0.20341171324253082, + "learning_rate": 9.578866633275288e-05, + "loss": 0.0533, + "step": 3500 + }, + { + "epoch": 13.097014925373134, + "grad_norm": 0.2223699688911438, + "learning_rate": 9.575539419262434e-05, + "loss": 0.0458, + "step": 3510 + }, + { + "epoch": 13.134328358208956, + "grad_norm": 0.22557464241981506, + "learning_rate": 9.572199695899522e-05, + "loss": 0.0445, + "step": 3520 + }, + { + "epoch": 13.171641791044776, + "grad_norm": 0.25104308128356934, + "learning_rate": 9.568847472317232e-05, + "loss": 0.0435, + "step": 3530 + }, + { + "epoch": 13.208955223880597, + "grad_norm": 0.18720711767673492, + "learning_rate": 9.565482757680415e-05, + "loss": 0.0453, + "step": 3540 + }, + { + "epoch": 13.246268656716419, + "grad_norm": 0.16838951408863068, + "learning_rate": 9.562105561188069e-05, + "loss": 0.0505, + "step": 3550 + }, + { + "epoch": 13.283582089552239, + "grad_norm": 0.31681734323501587, + "learning_rate": 9.558715892073323e-05, + "loss": 0.0494, + "step": 3560 + }, + { + "epoch": 13.32089552238806, + "grad_norm": 0.2390700727701187, + "learning_rate": 9.555313759603402e-05, + "loss": 0.0538, + "step": 3570 + }, + { + "epoch": 13.35820895522388, + "grad_norm": 0.20680709183216095, + "learning_rate": 9.551899173079607e-05, + "loss": 0.0519, + "step": 3580 + }, + { + "epoch": 13.395522388059701, + "grad_norm": 0.2758580148220062, + "learning_rate": 9.548472141837286e-05, + "loss": 0.0512, + "step": 3590 + }, + { + "epoch": 13.432835820895523, + "grad_norm": 0.3653097450733185, + "learning_rate": 9.545032675245813e-05, + "loss": 0.0496, + "step": 3600 + }, + { + "epoch": 13.470149253731343, + "grad_norm": 0.23886866867542267, + "learning_rate": 9.541580782708557e-05, + "loss": 0.0455, + "step": 3610 + }, + { + "epoch": 13.507462686567164, + "grad_norm": 0.3280908465385437, + "learning_rate": 9.538116473662861e-05, + "loss": 0.0489, + "step": 3620 + }, + { + "epoch": 13.544776119402986, + "grad_norm": 0.20268180966377258, + "learning_rate": 9.534639757580013e-05, + "loss": 0.0484, + "step": 3630 + }, + { + "epoch": 13.582089552238806, + "grad_norm": 0.2582015097141266, + "learning_rate": 9.531150643965223e-05, + "loss": 0.0487, + "step": 3640 + }, + { + "epoch": 13.619402985074627, + "grad_norm": 0.18157973885536194, + "learning_rate": 9.527649142357596e-05, + "loss": 0.0496, + "step": 3650 + }, + { + "epoch": 13.656716417910447, + "grad_norm": 0.22841542959213257, + "learning_rate": 9.524135262330098e-05, + "loss": 0.0467, + "step": 3660 + }, + { + "epoch": 13.694029850746269, + "grad_norm": 0.2519935369491577, + "learning_rate": 9.520609013489547e-05, + "loss": 0.0487, + "step": 3670 + }, + { + "epoch": 13.73134328358209, + "grad_norm": 0.24680495262145996, + "learning_rate": 9.517070405476575e-05, + "loss": 0.0457, + "step": 3680 + }, + { + "epoch": 13.76865671641791, + "grad_norm": 0.26362067461013794, + "learning_rate": 9.513519447965595e-05, + "loss": 0.0495, + "step": 3690 + }, + { + "epoch": 13.805970149253731, + "grad_norm": 0.3240712583065033, + "learning_rate": 9.509956150664796e-05, + "loss": 0.0496, + "step": 3700 + }, + { + "epoch": 13.843283582089553, + "grad_norm": 0.21009013056755066, + "learning_rate": 9.50638052331609e-05, + "loss": 0.0457, + "step": 3710 + }, + { + "epoch": 13.880597014925373, + "grad_norm": 0.1669154316186905, + "learning_rate": 9.502792575695112e-05, + "loss": 0.0496, + "step": 3720 + }, + { + "epoch": 13.917910447761194, + "grad_norm": 0.22347605228424072, + "learning_rate": 9.499192317611167e-05, + "loss": 0.0426, + "step": 3730 + }, + { + "epoch": 13.955223880597014, + "grad_norm": 0.15208907425403595, + "learning_rate": 9.49557975890723e-05, + "loss": 0.0447, + "step": 3740 + }, + { + "epoch": 13.992537313432836, + "grad_norm": 0.3206101059913635, + "learning_rate": 9.491954909459895e-05, + "loss": 0.0471, + "step": 3750 + }, + { + "epoch": 14.029850746268657, + "grad_norm": 0.15873713791370392, + "learning_rate": 9.488317779179361e-05, + "loss": 0.0401, + "step": 3760 + }, + { + "epoch": 14.067164179104477, + "grad_norm": 0.19690357148647308, + "learning_rate": 9.484668378009408e-05, + "loss": 0.0491, + "step": 3770 + }, + { + "epoch": 14.104477611940299, + "grad_norm": 0.3211113214492798, + "learning_rate": 9.481006715927351e-05, + "loss": 0.049, + "step": 3780 + }, + { + "epoch": 14.14179104477612, + "grad_norm": 0.27657604217529297, + "learning_rate": 9.477332802944044e-05, + "loss": 0.0396, + "step": 3790 + }, + { + "epoch": 14.17910447761194, + "grad_norm": 0.20194031298160553, + "learning_rate": 9.473646649103818e-05, + "loss": 0.0442, + "step": 3800 + }, + { + "epoch": 14.216417910447761, + "grad_norm": 0.20344595611095428, + "learning_rate": 9.46994826448448e-05, + "loss": 0.0427, + "step": 3810 + }, + { + "epoch": 14.253731343283581, + "grad_norm": 0.2067718505859375, + "learning_rate": 9.46623765919727e-05, + "loss": 0.0501, + "step": 3820 + }, + { + "epoch": 14.291044776119403, + "grad_norm": 0.29719170928001404, + "learning_rate": 9.462514843386845e-05, + "loss": 0.0519, + "step": 3830 + }, + { + "epoch": 14.328358208955224, + "grad_norm": 0.2347182184457779, + "learning_rate": 9.458779827231237e-05, + "loss": 0.0413, + "step": 3840 + }, + { + "epoch": 14.365671641791044, + "grad_norm": 0.1558852344751358, + "learning_rate": 9.45503262094184e-05, + "loss": 0.0442, + "step": 3850 + }, + { + "epoch": 14.402985074626866, + "grad_norm": 0.23085005581378937, + "learning_rate": 9.451273234763371e-05, + "loss": 0.047, + "step": 3860 + }, + { + "epoch": 14.440298507462687, + "grad_norm": 0.1515151560306549, + "learning_rate": 9.447501678973852e-05, + "loss": 0.0481, + "step": 3870 + }, + { + "epoch": 14.477611940298507, + "grad_norm": 0.1916729211807251, + "learning_rate": 9.443717963884569e-05, + "loss": 0.0474, + "step": 3880 + }, + { + "epoch": 14.514925373134329, + "grad_norm": 0.2536492943763733, + "learning_rate": 9.439922099840054e-05, + "loss": 0.0382, + "step": 3890 + }, + { + "epoch": 14.552238805970148, + "grad_norm": 0.1672086864709854, + "learning_rate": 9.43611409721806e-05, + "loss": 0.0497, + "step": 3900 + }, + { + "epoch": 14.58955223880597, + "grad_norm": 0.3644237518310547, + "learning_rate": 9.432293966429514e-05, + "loss": 0.0444, + "step": 3910 + }, + { + "epoch": 14.626865671641792, + "grad_norm": 0.20307251811027527, + "learning_rate": 9.428461717918511e-05, + "loss": 0.0452, + "step": 3920 + }, + { + "epoch": 14.664179104477611, + "grad_norm": 0.20441733300685883, + "learning_rate": 9.424617362162271e-05, + "loss": 0.0454, + "step": 3930 + }, + { + "epoch": 14.701492537313433, + "grad_norm": 0.26315611600875854, + "learning_rate": 9.420760909671118e-05, + "loss": 0.0486, + "step": 3940 + }, + { + "epoch": 14.738805970149254, + "grad_norm": 0.1983092874288559, + "learning_rate": 9.416892370988444e-05, + "loss": 0.0483, + "step": 3950 + }, + { + "epoch": 14.776119402985074, + "grad_norm": 0.18301443755626678, + "learning_rate": 9.413011756690685e-05, + "loss": 0.0456, + "step": 3960 + }, + { + "epoch": 14.813432835820896, + "grad_norm": 0.2433597594499588, + "learning_rate": 9.409119077387294e-05, + "loss": 0.0463, + "step": 3970 + }, + { + "epoch": 14.850746268656717, + "grad_norm": 0.27949392795562744, + "learning_rate": 9.405214343720707e-05, + "loss": 0.0412, + "step": 3980 + }, + { + "epoch": 14.888059701492537, + "grad_norm": 0.22806599736213684, + "learning_rate": 9.401297566366318e-05, + "loss": 0.0448, + "step": 3990 + }, + { + "epoch": 14.925373134328359, + "grad_norm": 0.25421562790870667, + "learning_rate": 9.397368756032445e-05, + "loss": 0.0426, + "step": 4000 + }, + { + "epoch": 14.962686567164178, + "grad_norm": 0.2436474859714508, + "learning_rate": 9.393427923460308e-05, + "loss": 0.0474, + "step": 4010 + }, + { + "epoch": 15.0, + "grad_norm": 0.3756405711174011, + "learning_rate": 9.389475079423988e-05, + "loss": 0.0438, + "step": 4020 + }, + { + "epoch": 15.037313432835822, + "grad_norm": 0.25687697529792786, + "learning_rate": 9.385510234730415e-05, + "loss": 0.0435, + "step": 4030 + }, + { + "epoch": 15.074626865671641, + "grad_norm": 0.17263716459274292, + "learning_rate": 9.381533400219318e-05, + "loss": 0.0455, + "step": 4040 + }, + { + "epoch": 15.111940298507463, + "grad_norm": 0.2471216470003128, + "learning_rate": 9.377544586763215e-05, + "loss": 0.0429, + "step": 4050 + }, + { + "epoch": 15.149253731343283, + "grad_norm": 0.20195460319519043, + "learning_rate": 9.373543805267368e-05, + "loss": 0.0432, + "step": 4060 + }, + { + "epoch": 15.186567164179104, + "grad_norm": 0.1709851622581482, + "learning_rate": 9.369531066669758e-05, + "loss": 0.0477, + "step": 4070 + }, + { + "epoch": 15.223880597014926, + "grad_norm": 0.23063932359218597, + "learning_rate": 9.365506381941066e-05, + "loss": 0.0379, + "step": 4080 + }, + { + "epoch": 15.261194029850746, + "grad_norm": 0.3265426754951477, + "learning_rate": 9.36146976208462e-05, + "loss": 0.0435, + "step": 4090 + }, + { + "epoch": 15.298507462686567, + "grad_norm": 0.26373934745788574, + "learning_rate": 9.357421218136386e-05, + "loss": 0.047, + "step": 4100 + }, + { + "epoch": 15.335820895522389, + "grad_norm": 0.16861388087272644, + "learning_rate": 9.353360761164931e-05, + "loss": 0.0448, + "step": 4110 + }, + { + "epoch": 15.373134328358208, + "grad_norm": 0.303790807723999, + "learning_rate": 9.349288402271388e-05, + "loss": 0.0396, + "step": 4120 + }, + { + "epoch": 15.41044776119403, + "grad_norm": 0.1940719038248062, + "learning_rate": 9.345204152589428e-05, + "loss": 0.0474, + "step": 4130 + }, + { + "epoch": 15.447761194029852, + "grad_norm": 0.34091615676879883, + "learning_rate": 9.341108023285238e-05, + "loss": 0.0424, + "step": 4140 + }, + { + "epoch": 15.485074626865671, + "grad_norm": 0.27036693692207336, + "learning_rate": 9.337000025557476e-05, + "loss": 0.0482, + "step": 4150 + }, + { + "epoch": 15.522388059701493, + "grad_norm": 0.16908007860183716, + "learning_rate": 9.332880170637252e-05, + "loss": 0.0381, + "step": 4160 + }, + { + "epoch": 15.559701492537313, + "grad_norm": 0.23332923650741577, + "learning_rate": 9.328748469788093e-05, + "loss": 0.0427, + "step": 4170 + }, + { + "epoch": 15.597014925373134, + "grad_norm": 0.16899706423282623, + "learning_rate": 9.32460493430591e-05, + "loss": 0.0439, + "step": 4180 + }, + { + "epoch": 15.634328358208956, + "grad_norm": 0.12869524955749512, + "learning_rate": 9.320449575518972e-05, + "loss": 0.0481, + "step": 4190 + }, + { + "epoch": 15.671641791044776, + "grad_norm": 0.21159130334854126, + "learning_rate": 9.316282404787871e-05, + "loss": 0.0446, + "step": 4200 + }, + { + "epoch": 15.708955223880597, + "grad_norm": 0.1849961131811142, + "learning_rate": 9.31210343350549e-05, + "loss": 0.041, + "step": 4210 + }, + { + "epoch": 15.746268656716419, + "grad_norm": 0.16107840836048126, + "learning_rate": 9.30791267309698e-05, + "loss": 0.0429, + "step": 4220 + }, + { + "epoch": 15.783582089552239, + "grad_norm": 0.14206446707248688, + "learning_rate": 9.30371013501972e-05, + "loss": 0.0409, + "step": 4230 + }, + { + "epoch": 15.82089552238806, + "grad_norm": 0.2168441116809845, + "learning_rate": 9.299495830763286e-05, + "loss": 0.0413, + "step": 4240 + }, + { + "epoch": 15.85820895522388, + "grad_norm": 0.21431951224803925, + "learning_rate": 9.295269771849427e-05, + "loss": 0.0472, + "step": 4250 + }, + { + "epoch": 15.895522388059701, + "grad_norm": 0.16851255297660828, + "learning_rate": 9.291031969832026e-05, + "loss": 0.0508, + "step": 4260 + }, + { + "epoch": 15.932835820895523, + "grad_norm": 0.18404732644557953, + "learning_rate": 9.286782436297073e-05, + "loss": 0.0402, + "step": 4270 + }, + { + "epoch": 15.970149253731343, + "grad_norm": 0.21722930669784546, + "learning_rate": 9.282521182862629e-05, + "loss": 0.0397, + "step": 4280 + }, + { + "epoch": 16.007462686567163, + "grad_norm": 0.2523709833621979, + "learning_rate": 9.278248221178798e-05, + "loss": 0.0427, + "step": 4290 + }, + { + "epoch": 16.044776119402986, + "grad_norm": 0.17736563086509705, + "learning_rate": 9.273963562927695e-05, + "loss": 0.0458, + "step": 4300 + }, + { + "epoch": 16.082089552238806, + "grad_norm": 0.20613858103752136, + "learning_rate": 9.269667219823412e-05, + "loss": 0.0387, + "step": 4310 + }, + { + "epoch": 16.119402985074625, + "grad_norm": 0.16557513177394867, + "learning_rate": 9.265359203611987e-05, + "loss": 0.0411, + "step": 4320 + }, + { + "epoch": 16.15671641791045, + "grad_norm": 0.28119519352912903, + "learning_rate": 9.261039526071374e-05, + "loss": 0.0468, + "step": 4330 + }, + { + "epoch": 16.19402985074627, + "grad_norm": 0.21538576483726501, + "learning_rate": 9.256708199011401e-05, + "loss": 0.0368, + "step": 4340 + }, + { + "epoch": 16.23134328358209, + "grad_norm": 0.19657357037067413, + "learning_rate": 9.252365234273755e-05, + "loss": 0.038, + "step": 4350 + }, + { + "epoch": 16.26865671641791, + "grad_norm": 0.19258421659469604, + "learning_rate": 9.248010643731935e-05, + "loss": 0.0414, + "step": 4360 + }, + { + "epoch": 16.30597014925373, + "grad_norm": 0.28801625967025757, + "learning_rate": 9.243644439291223e-05, + "loss": 0.0387, + "step": 4370 + }, + { + "epoch": 16.34328358208955, + "grad_norm": 0.16581468284130096, + "learning_rate": 9.239266632888659e-05, + "loss": 0.0383, + "step": 4380 + }, + { + "epoch": 16.380597014925375, + "grad_norm": 0.34664949774742126, + "learning_rate": 9.234877236492997e-05, + "loss": 0.0453, + "step": 4390 + }, + { + "epoch": 16.417910447761194, + "grad_norm": 0.1439947783946991, + "learning_rate": 9.230476262104677e-05, + "loss": 0.0466, + "step": 4400 + }, + { + "epoch": 16.455223880597014, + "grad_norm": 0.15509940683841705, + "learning_rate": 9.226063721755799e-05, + "loss": 0.0488, + "step": 4410 + }, + { + "epoch": 16.492537313432837, + "grad_norm": 0.18005985021591187, + "learning_rate": 9.221639627510076e-05, + "loss": 0.0407, + "step": 4420 + }, + { + "epoch": 16.529850746268657, + "grad_norm": 0.16012470424175262, + "learning_rate": 9.217203991462815e-05, + "loss": 0.0394, + "step": 4430 + }, + { + "epoch": 16.567164179104477, + "grad_norm": 0.2978847920894623, + "learning_rate": 9.212756825740873e-05, + "loss": 0.0451, + "step": 4440 + }, + { + "epoch": 16.604477611940297, + "grad_norm": 0.2236834019422531, + "learning_rate": 9.208298142502636e-05, + "loss": 0.0487, + "step": 4450 + }, + { + "epoch": 16.64179104477612, + "grad_norm": 0.2686060667037964, + "learning_rate": 9.20382795393797e-05, + "loss": 0.0403, + "step": 4460 + }, + { + "epoch": 16.67910447761194, + "grad_norm": 0.33534038066864014, + "learning_rate": 9.199346272268199e-05, + "loss": 0.0385, + "step": 4470 + }, + { + "epoch": 16.71641791044776, + "grad_norm": 0.19250528514385223, + "learning_rate": 9.194853109746074e-05, + "loss": 0.0441, + "step": 4480 + }, + { + "epoch": 16.753731343283583, + "grad_norm": 0.19218407571315765, + "learning_rate": 9.190348478655724e-05, + "loss": 0.0474, + "step": 4490 + }, + { + "epoch": 16.791044776119403, + "grad_norm": 0.21163488924503326, + "learning_rate": 9.185832391312644e-05, + "loss": 0.0411, + "step": 4500 + }, + { + "epoch": 16.828358208955223, + "grad_norm": 0.1758819818496704, + "learning_rate": 9.18130486006364e-05, + "loss": 0.0462, + "step": 4510 + }, + { + "epoch": 16.865671641791046, + "grad_norm": 0.18571069836616516, + "learning_rate": 9.176765897286813e-05, + "loss": 0.0425, + "step": 4520 + }, + { + "epoch": 16.902985074626866, + "grad_norm": 0.20819155871868134, + "learning_rate": 9.17221551539151e-05, + "loss": 0.0428, + "step": 4530 + }, + { + "epoch": 16.940298507462686, + "grad_norm": 0.30357328057289124, + "learning_rate": 9.167653726818305e-05, + "loss": 0.0414, + "step": 4540 + }, + { + "epoch": 16.97761194029851, + "grad_norm": 0.20977462828159332, + "learning_rate": 9.163080544038952e-05, + "loss": 0.0447, + "step": 4550 + }, + { + "epoch": 17.01492537313433, + "grad_norm": 0.2535971701145172, + "learning_rate": 9.158495979556358e-05, + "loss": 0.0384, + "step": 4560 + }, + { + "epoch": 17.05223880597015, + "grad_norm": 0.2789897620677948, + "learning_rate": 9.153900045904549e-05, + "loss": 0.042, + "step": 4570 + }, + { + "epoch": 17.08955223880597, + "grad_norm": 0.18474848568439484, + "learning_rate": 9.14929275564863e-05, + "loss": 0.0398, + "step": 4580 + }, + { + "epoch": 17.12686567164179, + "grad_norm": 0.12615208327770233, + "learning_rate": 9.144674121384757e-05, + "loss": 0.0466, + "step": 4590 + }, + { + "epoch": 17.16417910447761, + "grad_norm": 0.17756640911102295, + "learning_rate": 9.140044155740101e-05, + "loss": 0.035, + "step": 4600 + }, + { + "epoch": 17.20149253731343, + "grad_norm": 0.24410821497440338, + "learning_rate": 9.135402871372808e-05, + "loss": 0.0459, + "step": 4610 + }, + { + "epoch": 17.238805970149254, + "grad_norm": 0.21573011577129364, + "learning_rate": 9.130750280971978e-05, + "loss": 0.0385, + "step": 4620 + }, + { + "epoch": 17.276119402985074, + "grad_norm": 0.13879653811454773, + "learning_rate": 9.126086397257612e-05, + "loss": 0.0391, + "step": 4630 + }, + { + "epoch": 17.313432835820894, + "grad_norm": 0.17508305609226227, + "learning_rate": 9.121411232980588e-05, + "loss": 0.038, + "step": 4640 + }, + { + "epoch": 17.350746268656717, + "grad_norm": 0.2536008358001709, + "learning_rate": 9.116724800922629e-05, + "loss": 0.0418, + "step": 4650 + }, + { + "epoch": 17.388059701492537, + "grad_norm": 0.1942976713180542, + "learning_rate": 9.112027113896262e-05, + "loss": 0.052, + "step": 4660 + }, + { + "epoch": 17.425373134328357, + "grad_norm": 0.16561119258403778, + "learning_rate": 9.107318184744781e-05, + "loss": 0.0451, + "step": 4670 + }, + { + "epoch": 17.46268656716418, + "grad_norm": 0.22971832752227783, + "learning_rate": 9.102598026342222e-05, + "loss": 0.0407, + "step": 4680 + }, + { + "epoch": 17.5, + "grad_norm": 0.1306753158569336, + "learning_rate": 9.097866651593317e-05, + "loss": 0.042, + "step": 4690 + }, + { + "epoch": 17.53731343283582, + "grad_norm": 0.21278400719165802, + "learning_rate": 9.093124073433463e-05, + "loss": 0.0458, + "step": 4700 + }, + { + "epoch": 17.574626865671643, + "grad_norm": 0.22757171094417572, + "learning_rate": 9.088370304828685e-05, + "loss": 0.0364, + "step": 4710 + }, + { + "epoch": 17.611940298507463, + "grad_norm": 0.216596320271492, + "learning_rate": 9.083605358775612e-05, + "loss": 0.0434, + "step": 4720 + }, + { + "epoch": 17.649253731343283, + "grad_norm": 0.13022471964359283, + "learning_rate": 9.078829248301417e-05, + "loss": 0.0415, + "step": 4730 + }, + { + "epoch": 17.686567164179106, + "grad_norm": 0.2280716598033905, + "learning_rate": 9.074041986463808e-05, + "loss": 0.0385, + "step": 4740 + }, + { + "epoch": 17.723880597014926, + "grad_norm": 0.14666135609149933, + "learning_rate": 9.069243586350975e-05, + "loss": 0.0347, + "step": 4750 + }, + { + "epoch": 17.761194029850746, + "grad_norm": 0.1631281077861786, + "learning_rate": 9.064434061081562e-05, + "loss": 0.0407, + "step": 4760 + }, + { + "epoch": 17.798507462686565, + "grad_norm": 0.18697327375411987, + "learning_rate": 9.059613423804623e-05, + "loss": 0.0425, + "step": 4770 + }, + { + "epoch": 17.83582089552239, + "grad_norm": 0.12955111265182495, + "learning_rate": 9.0547816876996e-05, + "loss": 0.0417, + "step": 4780 + }, + { + "epoch": 17.87313432835821, + "grad_norm": 0.15547148883342743, + "learning_rate": 9.049938865976275e-05, + "loss": 0.0409, + "step": 4790 + }, + { + "epoch": 17.91044776119403, + "grad_norm": 0.1900598704814911, + "learning_rate": 9.045084971874738e-05, + "loss": 0.0369, + "step": 4800 + }, + { + "epoch": 17.94776119402985, + "grad_norm": 0.1846715807914734, + "learning_rate": 9.040220018665347e-05, + "loss": 0.0415, + "step": 4810 + }, + { + "epoch": 17.98507462686567, + "grad_norm": 0.1829937845468521, + "learning_rate": 9.035344019648702e-05, + "loss": 0.0407, + "step": 4820 + }, + { + "epoch": 18.02238805970149, + "grad_norm": 0.25900354981422424, + "learning_rate": 9.030456988155596e-05, + "loss": 0.0398, + "step": 4830 + }, + { + "epoch": 18.059701492537314, + "grad_norm": 0.21235992014408112, + "learning_rate": 9.025558937546988e-05, + "loss": 0.0477, + "step": 4840 + }, + { + "epoch": 18.097014925373134, + "grad_norm": 0.18785078823566437, + "learning_rate": 9.020649881213958e-05, + "loss": 0.039, + "step": 4850 + }, + { + "epoch": 18.134328358208954, + "grad_norm": 0.1951548010110855, + "learning_rate": 9.015729832577681e-05, + "loss": 0.0357, + "step": 4860 + }, + { + "epoch": 18.171641791044777, + "grad_norm": 0.1280934363603592, + "learning_rate": 9.010798805089384e-05, + "loss": 0.0425, + "step": 4870 + }, + { + "epoch": 18.208955223880597, + "grad_norm": 0.1693423092365265, + "learning_rate": 9.005856812230304e-05, + "loss": 0.0447, + "step": 4880 + }, + { + "epoch": 18.246268656716417, + "grad_norm": 0.23712658882141113, + "learning_rate": 9.000903867511666e-05, + "loss": 0.042, + "step": 4890 + }, + { + "epoch": 18.28358208955224, + "grad_norm": 0.26489710807800293, + "learning_rate": 8.995939984474624e-05, + "loss": 0.0457, + "step": 4900 + }, + { + "epoch": 18.32089552238806, + "grad_norm": 0.20792756974697113, + "learning_rate": 8.990965176690252e-05, + "loss": 0.0422, + "step": 4910 + }, + { + "epoch": 18.35820895522388, + "grad_norm": 0.18526089191436768, + "learning_rate": 8.98597945775948e-05, + "loss": 0.0366, + "step": 4920 + }, + { + "epoch": 18.395522388059703, + "grad_norm": 0.2214607298374176, + "learning_rate": 8.980982841313074e-05, + "loss": 0.0405, + "step": 4930 + }, + { + "epoch": 18.432835820895523, + "grad_norm": 0.1896953135728836, + "learning_rate": 8.975975341011596e-05, + "loss": 0.0391, + "step": 4940 + }, + { + "epoch": 18.470149253731343, + "grad_norm": 0.1430232971906662, + "learning_rate": 8.970956970545355e-05, + "loss": 0.0403, + "step": 4950 + }, + { + "epoch": 18.507462686567163, + "grad_norm": 0.1991272121667862, + "learning_rate": 8.965927743634391e-05, + "loss": 0.0429, + "step": 4960 + }, + { + "epoch": 18.544776119402986, + "grad_norm": 0.2361849844455719, + "learning_rate": 8.96088767402841e-05, + "loss": 0.0416, + "step": 4970 + }, + { + "epoch": 18.582089552238806, + "grad_norm": 0.25857019424438477, + "learning_rate": 8.955836775506776e-05, + "loss": 0.0461, + "step": 4980 + }, + { + "epoch": 18.619402985074625, + "grad_norm": 0.12873682379722595, + "learning_rate": 8.950775061878453e-05, + "loss": 0.035, + "step": 4990 + }, + { + "epoch": 18.65671641791045, + "grad_norm": 0.19786769151687622, + "learning_rate": 8.945702546981969e-05, + "loss": 0.0399, + "step": 5000 + }, + { + "epoch": 18.69402985074627, + "grad_norm": 0.2562239170074463, + "learning_rate": 8.940619244685388e-05, + "loss": 0.0372, + "step": 5010 + }, + { + "epoch": 18.73134328358209, + "grad_norm": 0.14586858451366425, + "learning_rate": 8.935525168886262e-05, + "loss": 0.0427, + "step": 5020 + }, + { + "epoch": 18.76865671641791, + "grad_norm": 0.20062318444252014, + "learning_rate": 8.930420333511606e-05, + "loss": 0.0403, + "step": 5030 + }, + { + "epoch": 18.80597014925373, + "grad_norm": 0.22698874771595, + "learning_rate": 8.92530475251784e-05, + "loss": 0.036, + "step": 5040 + }, + { + "epoch": 18.84328358208955, + "grad_norm": 0.2103697657585144, + "learning_rate": 8.920178439890765e-05, + "loss": 0.0431, + "step": 5050 + }, + { + "epoch": 18.880597014925375, + "grad_norm": 0.16042308509349823, + "learning_rate": 8.91504140964553e-05, + "loss": 0.0388, + "step": 5060 + }, + { + "epoch": 18.917910447761194, + "grad_norm": 0.16874109208583832, + "learning_rate": 8.909893675826574e-05, + "loss": 0.0388, + "step": 5070 + }, + { + "epoch": 18.955223880597014, + "grad_norm": 0.15569192171096802, + "learning_rate": 8.90473525250761e-05, + "loss": 0.0353, + "step": 5080 + }, + { + "epoch": 18.992537313432837, + "grad_norm": 0.16723507642745972, + "learning_rate": 8.899566153791566e-05, + "loss": 0.0443, + "step": 5090 + }, + { + "epoch": 19.029850746268657, + "grad_norm": 0.23284228146076202, + "learning_rate": 8.894386393810563e-05, + "loss": 0.05, + "step": 5100 + }, + { + "epoch": 19.067164179104477, + "grad_norm": 0.1621718853712082, + "learning_rate": 8.889195986725865e-05, + "loss": 0.0369, + "step": 5110 + }, + { + "epoch": 19.104477611940297, + "grad_norm": 0.17522747814655304, + "learning_rate": 8.883994946727849e-05, + "loss": 0.0475, + "step": 5120 + }, + { + "epoch": 19.14179104477612, + "grad_norm": 0.16110533475875854, + "learning_rate": 8.878783288035957e-05, + "loss": 0.0383, + "step": 5130 + }, + { + "epoch": 19.17910447761194, + "grad_norm": 0.2574177086353302, + "learning_rate": 8.873561024898668e-05, + "loss": 0.0383, + "step": 5140 + }, + { + "epoch": 19.21641791044776, + "grad_norm": 0.14560100436210632, + "learning_rate": 8.868328171593448e-05, + "loss": 0.037, + "step": 5150 + }, + { + "epoch": 19.253731343283583, + "grad_norm": 0.14456631243228912, + "learning_rate": 8.863084742426719e-05, + "loss": 0.0423, + "step": 5160 + }, + { + "epoch": 19.291044776119403, + "grad_norm": 0.1403595507144928, + "learning_rate": 8.857830751733815e-05, + "loss": 0.0327, + "step": 5170 + }, + { + "epoch": 19.328358208955223, + "grad_norm": 0.18462564051151276, + "learning_rate": 8.852566213878947e-05, + "loss": 0.037, + "step": 5180 + }, + { + "epoch": 19.365671641791046, + "grad_norm": 0.20725117623806, + "learning_rate": 8.84729114325516e-05, + "loss": 0.0376, + "step": 5190 + }, + { + "epoch": 19.402985074626866, + "grad_norm": 0.17023132741451263, + "learning_rate": 8.842005554284296e-05, + "loss": 0.0467, + "step": 5200 + }, + { + "epoch": 19.440298507462686, + "grad_norm": 0.31033241748809814, + "learning_rate": 8.836709461416952e-05, + "loss": 0.0425, + "step": 5210 + }, + { + "epoch": 19.47761194029851, + "grad_norm": 0.14057482779026031, + "learning_rate": 8.831402879132446e-05, + "loss": 0.0432, + "step": 5220 + }, + { + "epoch": 19.51492537313433, + "grad_norm": 0.23247437179088593, + "learning_rate": 8.82608582193877e-05, + "loss": 0.0396, + "step": 5230 + }, + { + "epoch": 19.55223880597015, + "grad_norm": 0.1305907964706421, + "learning_rate": 8.820758304372557e-05, + "loss": 0.0389, + "step": 5240 + }, + { + "epoch": 19.58955223880597, + "grad_norm": 0.17093417048454285, + "learning_rate": 8.815420340999033e-05, + "loss": 0.0347, + "step": 5250 + }, + { + "epoch": 19.62686567164179, + "grad_norm": 0.24105240404605865, + "learning_rate": 8.810071946411989e-05, + "loss": 0.0392, + "step": 5260 + }, + { + "epoch": 19.66417910447761, + "grad_norm": 0.2234315127134323, + "learning_rate": 8.804713135233731e-05, + "loss": 0.0403, + "step": 5270 + }, + { + "epoch": 19.701492537313435, + "grad_norm": 0.16947844624519348, + "learning_rate": 8.799343922115044e-05, + "loss": 0.0368, + "step": 5280 + }, + { + "epoch": 19.738805970149254, + "grad_norm": 0.26133742928504944, + "learning_rate": 8.79396432173515e-05, + "loss": 0.041, + "step": 5290 + }, + { + "epoch": 19.776119402985074, + "grad_norm": 0.2099352777004242, + "learning_rate": 8.788574348801675e-05, + "loss": 0.0363, + "step": 5300 + }, + { + "epoch": 19.813432835820894, + "grad_norm": 0.1662513017654419, + "learning_rate": 8.783174018050594e-05, + "loss": 0.0409, + "step": 5310 + }, + { + "epoch": 19.850746268656717, + "grad_norm": 0.18933714926242828, + "learning_rate": 8.77776334424621e-05, + "loss": 0.0348, + "step": 5320 + }, + { + "epoch": 19.888059701492537, + "grad_norm": 0.21673552691936493, + "learning_rate": 8.772342342181095e-05, + "loss": 0.037, + "step": 5330 + }, + { + "epoch": 19.925373134328357, + "grad_norm": 0.13009892404079437, + "learning_rate": 8.766911026676064e-05, + "loss": 0.0386, + "step": 5340 + }, + { + "epoch": 19.96268656716418, + "grad_norm": 0.1655230075120926, + "learning_rate": 8.761469412580125e-05, + "loss": 0.0404, + "step": 5350 + }, + { + "epoch": 20.0, + "grad_norm": 0.2821272611618042, + "learning_rate": 8.756017514770443e-05, + "loss": 0.0441, + "step": 5360 + }, + { + "epoch": 20.03731343283582, + "grad_norm": 0.1302652508020401, + "learning_rate": 8.750555348152298e-05, + "loss": 0.0389, + "step": 5370 + }, + { + "epoch": 20.074626865671643, + "grad_norm": 0.13331563770771027, + "learning_rate": 8.745082927659047e-05, + "loss": 0.0393, + "step": 5380 + }, + { + "epoch": 20.111940298507463, + "grad_norm": 0.244130939245224, + "learning_rate": 8.739600268252078e-05, + "loss": 0.0372, + "step": 5390 + }, + { + "epoch": 20.149253731343283, + "grad_norm": 0.20429308712482452, + "learning_rate": 8.73410738492077e-05, + "loss": 0.0387, + "step": 5400 + }, + { + "epoch": 20.186567164179106, + "grad_norm": 0.2954719364643097, + "learning_rate": 8.728604292682459e-05, + "loss": 0.0404, + "step": 5410 + }, + { + "epoch": 20.223880597014926, + "grad_norm": 0.20438429713249207, + "learning_rate": 8.723091006582389e-05, + "loss": 0.0359, + "step": 5420 + }, + { + "epoch": 20.261194029850746, + "grad_norm": 0.17289331555366516, + "learning_rate": 8.717567541693673e-05, + "loss": 0.0357, + "step": 5430 + }, + { + "epoch": 20.298507462686565, + "grad_norm": 0.24367138743400574, + "learning_rate": 8.71203391311725e-05, + "loss": 0.0392, + "step": 5440 + }, + { + "epoch": 20.33582089552239, + "grad_norm": 0.21900270879268646, + "learning_rate": 8.706490135981855e-05, + "loss": 0.0419, + "step": 5450 + }, + { + "epoch": 20.37313432835821, + "grad_norm": 0.1526443362236023, + "learning_rate": 8.700936225443959e-05, + "loss": 0.0333, + "step": 5460 + }, + { + "epoch": 20.41044776119403, + "grad_norm": 0.24582353234291077, + "learning_rate": 8.695372196687743e-05, + "loss": 0.0417, + "step": 5470 + }, + { + "epoch": 20.44776119402985, + "grad_norm": 0.21462485194206238, + "learning_rate": 8.689798064925049e-05, + "loss": 0.0347, + "step": 5480 + }, + { + "epoch": 20.48507462686567, + "grad_norm": 0.17611616849899292, + "learning_rate": 8.684213845395339e-05, + "loss": 0.0395, + "step": 5490 + }, + { + "epoch": 20.52238805970149, + "grad_norm": 0.19724012911319733, + "learning_rate": 8.678619553365659e-05, + "loss": 0.0332, + "step": 5500 + }, + { + "epoch": 20.559701492537314, + "grad_norm": 0.2080456167459488, + "learning_rate": 8.673015204130586e-05, + "loss": 0.0361, + "step": 5510 + }, + { + "epoch": 20.597014925373134, + "grad_norm": 0.21469220519065857, + "learning_rate": 8.6674008130122e-05, + "loss": 0.039, + "step": 5520 + }, + { + "epoch": 20.634328358208954, + "grad_norm": 0.242497980594635, + "learning_rate": 8.661776395360029e-05, + "loss": 0.0397, + "step": 5530 + }, + { + "epoch": 20.671641791044777, + "grad_norm": 0.20539864897727966, + "learning_rate": 8.656141966551019e-05, + "loss": 0.0392, + "step": 5540 + }, + { + "epoch": 20.708955223880597, + "grad_norm": 0.21964021027088165, + "learning_rate": 8.650497541989482e-05, + "loss": 0.035, + "step": 5550 + }, + { + "epoch": 20.746268656716417, + "grad_norm": 0.15793637931346893, + "learning_rate": 8.644843137107059e-05, + "loss": 0.0363, + "step": 5560 + }, + { + "epoch": 20.78358208955224, + "grad_norm": 0.1731041818857193, + "learning_rate": 8.639178767362676e-05, + "loss": 0.0371, + "step": 5570 + }, + { + "epoch": 20.82089552238806, + "grad_norm": 0.15019342303276062, + "learning_rate": 8.633504448242505e-05, + "loss": 0.0335, + "step": 5580 + }, + { + "epoch": 20.85820895522388, + "grad_norm": 0.1397496908903122, + "learning_rate": 8.627820195259918e-05, + "loss": 0.0391, + "step": 5590 + }, + { + "epoch": 20.895522388059703, + "grad_norm": 0.141131192445755, + "learning_rate": 8.622126023955446e-05, + "loss": 0.041, + "step": 5600 + }, + { + "epoch": 20.932835820895523, + "grad_norm": 0.20025403797626495, + "learning_rate": 8.616421949896734e-05, + "loss": 0.0412, + "step": 5610 + }, + { + "epoch": 20.970149253731343, + "grad_norm": 0.2251378893852234, + "learning_rate": 8.610707988678503e-05, + "loss": 0.037, + "step": 5620 + }, + { + "epoch": 21.007462686567163, + "grad_norm": 0.1341109722852707, + "learning_rate": 8.604984155922506e-05, + "loss": 0.0371, + "step": 5630 + }, + { + "epoch": 21.044776119402986, + "grad_norm": 0.28053462505340576, + "learning_rate": 8.599250467277483e-05, + "loss": 0.0366, + "step": 5640 + }, + { + "epoch": 21.082089552238806, + "grad_norm": 0.10567930340766907, + "learning_rate": 8.59350693841912e-05, + "loss": 0.0394, + "step": 5650 + }, + { + "epoch": 21.119402985074625, + "grad_norm": 0.17919886112213135, + "learning_rate": 8.587753585050004e-05, + "loss": 0.0357, + "step": 5660 + }, + { + "epoch": 21.15671641791045, + "grad_norm": 0.3223204016685486, + "learning_rate": 8.581990422899585e-05, + "loss": 0.0369, + "step": 5670 + }, + { + "epoch": 21.19402985074627, + "grad_norm": 0.20072297751903534, + "learning_rate": 8.576217467724128e-05, + "loss": 0.0389, + "step": 5680 + }, + { + "epoch": 21.23134328358209, + "grad_norm": 0.1556226760149002, + "learning_rate": 8.570434735306671e-05, + "loss": 0.035, + "step": 5690 + }, + { + "epoch": 21.26865671641791, + "grad_norm": 0.20265886187553406, + "learning_rate": 8.564642241456986e-05, + "loss": 0.0418, + "step": 5700 + }, + { + "epoch": 21.30597014925373, + "grad_norm": 0.15518955886363983, + "learning_rate": 8.558840002011528e-05, + "loss": 0.0331, + "step": 5710 + }, + { + "epoch": 21.34328358208955, + "grad_norm": 0.1822584569454193, + "learning_rate": 8.553028032833397e-05, + "loss": 0.0421, + "step": 5720 + }, + { + "epoch": 21.380597014925375, + "grad_norm": 0.14216330647468567, + "learning_rate": 8.547206349812298e-05, + "loss": 0.0413, + "step": 5730 + }, + { + "epoch": 21.417910447761194, + "grad_norm": 0.24156329035758972, + "learning_rate": 8.541374968864487e-05, + "loss": 0.0404, + "step": 5740 + }, + { + "epoch": 21.455223880597014, + "grad_norm": 0.2753167748451233, + "learning_rate": 8.535533905932738e-05, + "loss": 0.0369, + "step": 5750 + }, + { + "epoch": 21.492537313432837, + "grad_norm": 0.17052626609802246, + "learning_rate": 8.529683176986295e-05, + "loss": 0.0328, + "step": 5760 + }, + { + "epoch": 21.529850746268657, + "grad_norm": 0.11597824096679688, + "learning_rate": 8.523822798020827e-05, + "loss": 0.041, + "step": 5770 + }, + { + "epoch": 21.567164179104477, + "grad_norm": 0.14363346993923187, + "learning_rate": 8.517952785058385e-05, + "loss": 0.0393, + "step": 5780 + }, + { + "epoch": 21.604477611940297, + "grad_norm": 0.19373776018619537, + "learning_rate": 8.512073154147362e-05, + "loss": 0.0372, + "step": 5790 + }, + { + "epoch": 21.64179104477612, + "grad_norm": 0.20276981592178345, + "learning_rate": 8.506183921362443e-05, + "loss": 0.0389, + "step": 5800 + }, + { + "epoch": 21.67910447761194, + "grad_norm": 0.19267870485782623, + "learning_rate": 8.500285102804568e-05, + "loss": 0.0371, + "step": 5810 + }, + { + "epoch": 21.71641791044776, + "grad_norm": 0.2701839208602905, + "learning_rate": 8.494376714600878e-05, + "loss": 0.0333, + "step": 5820 + }, + { + "epoch": 21.753731343283583, + "grad_norm": 0.20612668991088867, + "learning_rate": 8.488458772904684e-05, + "loss": 0.0358, + "step": 5830 + }, + { + "epoch": 21.791044776119403, + "grad_norm": 0.18102902173995972, + "learning_rate": 8.482531293895412e-05, + "loss": 0.0376, + "step": 5840 + }, + { + "epoch": 21.828358208955223, + "grad_norm": 0.23202018439769745, + "learning_rate": 8.476594293778561e-05, + "loss": 0.0418, + "step": 5850 + }, + { + "epoch": 21.865671641791046, + "grad_norm": 0.09540139883756638, + "learning_rate": 8.470647788785665e-05, + "loss": 0.041, + "step": 5860 + }, + { + "epoch": 21.902985074626866, + "grad_norm": 0.23362809419631958, + "learning_rate": 8.46469179517424e-05, + "loss": 0.0402, + "step": 5870 + }, + { + "epoch": 21.940298507462686, + "grad_norm": 0.20929335057735443, + "learning_rate": 8.458726329227747e-05, + "loss": 0.0385, + "step": 5880 + }, + { + "epoch": 21.97761194029851, + "grad_norm": 0.18403425812721252, + "learning_rate": 8.452751407255541e-05, + "loss": 0.0399, + "step": 5890 + }, + { + "epoch": 22.01492537313433, + "grad_norm": 0.2034774273633957, + "learning_rate": 8.44676704559283e-05, + "loss": 0.0361, + "step": 5900 + }, + { + "epoch": 22.05223880597015, + "grad_norm": 0.14981597661972046, + "learning_rate": 8.44077326060063e-05, + "loss": 0.0393, + "step": 5910 + }, + { + "epoch": 22.08955223880597, + "grad_norm": 0.20903146266937256, + "learning_rate": 8.434770068665723e-05, + "loss": 0.0406, + "step": 5920 + }, + { + "epoch": 22.12686567164179, + "grad_norm": 0.12090307474136353, + "learning_rate": 8.428757486200603e-05, + "loss": 0.0349, + "step": 5930 + }, + { + "epoch": 22.16417910447761, + "grad_norm": 0.14085660874843597, + "learning_rate": 8.422735529643444e-05, + "loss": 0.0344, + "step": 5940 + }, + { + "epoch": 22.20149253731343, + "grad_norm": 0.30808404088020325, + "learning_rate": 8.416704215458043e-05, + "loss": 0.0298, + "step": 5950 + }, + { + "epoch": 22.238805970149254, + "grad_norm": 0.17409317195415497, + "learning_rate": 8.410663560133784e-05, + "loss": 0.035, + "step": 5960 + }, + { + "epoch": 22.276119402985074, + "grad_norm": 0.18731828033924103, + "learning_rate": 8.404613580185585e-05, + "loss": 0.0322, + "step": 5970 + }, + { + "epoch": 22.313432835820894, + "grad_norm": 0.16483667492866516, + "learning_rate": 8.398554292153866e-05, + "loss": 0.033, + "step": 5980 + }, + { + "epoch": 22.350746268656717, + "grad_norm": 0.195018008351326, + "learning_rate": 8.392485712604483e-05, + "loss": 0.0344, + "step": 5990 + }, + { + "epoch": 22.388059701492537, + "grad_norm": 0.18210549652576447, + "learning_rate": 8.386407858128706e-05, + "loss": 0.0387, + "step": 6000 + }, + { + "epoch": 22.425373134328357, + "grad_norm": 0.18658341467380524, + "learning_rate": 8.380320745343153e-05, + "loss": 0.0359, + "step": 6010 + }, + { + "epoch": 22.46268656716418, + "grad_norm": 0.260953426361084, + "learning_rate": 8.37422439088976e-05, + "loss": 0.0291, + "step": 6020 + }, + { + "epoch": 22.5, + "grad_norm": 0.2177930772304535, + "learning_rate": 8.368118811435726e-05, + "loss": 0.0384, + "step": 6030 + }, + { + "epoch": 22.53731343283582, + "grad_norm": 0.1596938520669937, + "learning_rate": 8.362004023673474e-05, + "loss": 0.0372, + "step": 6040 + }, + { + "epoch": 22.574626865671643, + "grad_norm": 0.21605637669563293, + "learning_rate": 8.355880044320598e-05, + "loss": 0.0304, + "step": 6050 + }, + { + "epoch": 22.611940298507463, + "grad_norm": 0.13812203705310822, + "learning_rate": 8.349746890119826e-05, + "loss": 0.0295, + "step": 6060 + }, + { + "epoch": 22.649253731343283, + "grad_norm": 0.22850565612316132, + "learning_rate": 8.343604577838964e-05, + "loss": 0.0385, + "step": 6070 + }, + { + "epoch": 22.686567164179106, + "grad_norm": 0.22924698889255524, + "learning_rate": 8.337453124270863e-05, + "loss": 0.0438, + "step": 6080 + }, + { + "epoch": 22.723880597014926, + "grad_norm": 0.1455918848514557, + "learning_rate": 8.331292546233362e-05, + "loss": 0.0358, + "step": 6090 + }, + { + "epoch": 22.761194029850746, + "grad_norm": 0.1839921921491623, + "learning_rate": 8.32512286056924e-05, + "loss": 0.0349, + "step": 6100 + }, + { + "epoch": 22.798507462686565, + "grad_norm": 0.24356882274150848, + "learning_rate": 8.318944084146192e-05, + "loss": 0.0335, + "step": 6110 + }, + { + "epoch": 22.83582089552239, + "grad_norm": 0.2336840182542801, + "learning_rate": 8.31275623385675e-05, + "loss": 0.0339, + "step": 6120 + }, + { + "epoch": 22.87313432835821, + "grad_norm": 0.17839699983596802, + "learning_rate": 8.306559326618259e-05, + "loss": 0.0365, + "step": 6130 + }, + { + "epoch": 22.91044776119403, + "grad_norm": 0.18088172376155853, + "learning_rate": 8.300353379372834e-05, + "loss": 0.0331, + "step": 6140 + }, + { + "epoch": 22.94776119402985, + "grad_norm": 0.1771453320980072, + "learning_rate": 8.29413840908729e-05, + "loss": 0.0321, + "step": 6150 + }, + { + "epoch": 22.98507462686567, + "grad_norm": 0.1374535709619522, + "learning_rate": 8.287914432753123e-05, + "loss": 0.0328, + "step": 6160 + }, + { + "epoch": 23.02238805970149, + "grad_norm": 0.17898012697696686, + "learning_rate": 8.281681467386446e-05, + "loss": 0.0408, + "step": 6170 + }, + { + "epoch": 23.059701492537314, + "grad_norm": 0.21729676425457, + "learning_rate": 8.275439530027948e-05, + "loss": 0.0354, + "step": 6180 + }, + { + "epoch": 23.097014925373134, + "grad_norm": 0.2473490685224533, + "learning_rate": 8.269188637742846e-05, + "loss": 0.0361, + "step": 6190 + }, + { + "epoch": 23.134328358208954, + "grad_norm": 0.15661069750785828, + "learning_rate": 8.262928807620843e-05, + "loss": 0.036, + "step": 6200 + }, + { + "epoch": 23.171641791044777, + "grad_norm": 0.12378236651420593, + "learning_rate": 8.256660056776076e-05, + "loss": 0.0308, + "step": 6210 + }, + { + "epoch": 23.208955223880597, + "grad_norm": 0.1373433768749237, + "learning_rate": 8.250382402347065e-05, + "loss": 0.0344, + "step": 6220 + }, + { + "epoch": 23.246268656716417, + "grad_norm": 0.14814983308315277, + "learning_rate": 8.244095861496686e-05, + "loss": 0.0368, + "step": 6230 + }, + { + "epoch": 23.28358208955224, + "grad_norm": 0.15903662145137787, + "learning_rate": 8.237800451412095e-05, + "loss": 0.033, + "step": 6240 + }, + { + "epoch": 23.32089552238806, + "grad_norm": 0.1676921397447586, + "learning_rate": 8.231496189304704e-05, + "loss": 0.0361, + "step": 6250 + }, + { + "epoch": 23.35820895522388, + "grad_norm": 0.2496129870414734, + "learning_rate": 8.225183092410128e-05, + "loss": 0.037, + "step": 6260 + }, + { + "epoch": 23.395522388059703, + "grad_norm": 0.1830875128507614, + "learning_rate": 8.218861177988129e-05, + "loss": 0.0377, + "step": 6270 + }, + { + "epoch": 23.432835820895523, + "grad_norm": 0.18538393080234528, + "learning_rate": 8.212530463322583e-05, + "loss": 0.0343, + "step": 6280 + }, + { + "epoch": 23.470149253731343, + "grad_norm": 0.23813718557357788, + "learning_rate": 8.206190965721419e-05, + "loss": 0.0336, + "step": 6290 + }, + { + "epoch": 23.507462686567163, + "grad_norm": 0.14053800702095032, + "learning_rate": 8.199842702516583e-05, + "loss": 0.0334, + "step": 6300 + }, + { + "epoch": 23.544776119402986, + "grad_norm": 0.19115787744522095, + "learning_rate": 8.193485691063985e-05, + "loss": 0.0338, + "step": 6310 + }, + { + "epoch": 23.582089552238806, + "grad_norm": 0.1176459789276123, + "learning_rate": 8.18711994874345e-05, + "loss": 0.0324, + "step": 6320 + }, + { + "epoch": 23.619402985074625, + "grad_norm": 0.13881400227546692, + "learning_rate": 8.180745492958674e-05, + "loss": 0.0375, + "step": 6330 + }, + { + "epoch": 23.65671641791045, + "grad_norm": 0.12102743983268738, + "learning_rate": 8.174362341137177e-05, + "loss": 0.0338, + "step": 6340 + }, + { + "epoch": 23.69402985074627, + "grad_norm": 0.16610436141490936, + "learning_rate": 8.167970510730253e-05, + "loss": 0.0296, + "step": 6350 + }, + { + "epoch": 23.73134328358209, + "grad_norm": 0.12234822660684586, + "learning_rate": 8.161570019212921e-05, + "loss": 0.029, + "step": 6360 + }, + { + "epoch": 23.76865671641791, + "grad_norm": 0.17056342959403992, + "learning_rate": 8.155160884083881e-05, + "loss": 0.0381, + "step": 6370 + }, + { + "epoch": 23.80597014925373, + "grad_norm": 0.1477614790201187, + "learning_rate": 8.148743122865463e-05, + "loss": 0.0315, + "step": 6380 + }, + { + "epoch": 23.84328358208955, + "grad_norm": 0.38320279121398926, + "learning_rate": 8.14231675310358e-05, + "loss": 0.0366, + "step": 6390 + }, + { + "epoch": 23.880597014925375, + "grad_norm": 0.1497313380241394, + "learning_rate": 8.135881792367686e-05, + "loss": 0.0325, + "step": 6400 + }, + { + "epoch": 23.917910447761194, + "grad_norm": 0.1574944257736206, + "learning_rate": 8.129438258250712e-05, + "loss": 0.0372, + "step": 6410 + }, + { + "epoch": 23.955223880597014, + "grad_norm": 0.17678116261959076, + "learning_rate": 8.12298616836904e-05, + "loss": 0.034, + "step": 6420 + }, + { + "epoch": 23.992537313432837, + "grad_norm": 0.13617518544197083, + "learning_rate": 8.116525540362434e-05, + "loss": 0.032, + "step": 6430 + }, + { + "epoch": 24.029850746268657, + "grad_norm": 0.1610628217458725, + "learning_rate": 8.110056391894005e-05, + "loss": 0.0295, + "step": 6440 + }, + { + "epoch": 24.067164179104477, + "grad_norm": 0.24379907548427582, + "learning_rate": 8.103578740650156e-05, + "loss": 0.0318, + "step": 6450 + }, + { + "epoch": 24.104477611940297, + "grad_norm": 0.15908868610858917, + "learning_rate": 8.097092604340542e-05, + "loss": 0.0285, + "step": 6460 + }, + { + "epoch": 24.14179104477612, + "grad_norm": 0.17211472988128662, + "learning_rate": 8.090598000698009e-05, + "loss": 0.0345, + "step": 6470 + }, + { + "epoch": 24.17910447761194, + "grad_norm": 0.10870133340358734, + "learning_rate": 8.084094947478556e-05, + "loss": 0.0349, + "step": 6480 + }, + { + "epoch": 24.21641791044776, + "grad_norm": 0.1614072173833847, + "learning_rate": 8.077583462461283e-05, + "loss": 0.0305, + "step": 6490 + }, + { + "epoch": 24.253731343283583, + "grad_norm": 0.1449541449546814, + "learning_rate": 8.07106356344834e-05, + "loss": 0.0326, + "step": 6500 + } + ], + "logging_steps": 10, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 75, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.8861690201160704e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-7000/README.md b/checkpoint-7000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c7b89968043c4a4cf38dcac1f9bc557c35da3883 --- /dev/null +++ b/checkpoint-7000/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/ibru/.cache/huggingface/hub/models--nvidia--GR00T-N1-2B/snapshots/32e1fd2507f7739fad443e6b449c8188e0e02fcb +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-7000/adapter_config.json b/checkpoint-7000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8f45e5c825b3b34b334d049ddf8e68e52a500cc6 --- /dev/null +++ b/checkpoint-7000/adapter_config.json @@ -0,0 +1,36 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/ibru/.cache/huggingface/hub/models--nvidia--GR00T-N1-2B/snapshots/32e1fd2507f7739fad443e6b449c8188e0e02fcb", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "to_k", + "to_q", + "v_proj", + "to_v" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-7000/adapter_model.safetensors b/checkpoint-7000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a9931bf448aafd231c6bfa8c22d1323df5ee5d76 --- /dev/null +++ b/checkpoint-7000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0c37651e874df6e132130379078083cebd5a019b9c1e0dd7d126dcd5b1fc58e +size 123328576 diff --git a/checkpoint-7000/experiment_cfg/metadata.json b/checkpoint-7000/experiment_cfg/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..40302046074c7e429ab3933ad6b163f9735902de --- /dev/null +++ b/checkpoint-7000/experiment_cfg/metadata.json @@ -0,0 +1,275 @@ +{ + "new_embodiment": { + "statistics": { + "state": { + "single_arm": { + "max": [ + 47.021484375, + 135.263671875, + 178.505859375, + 78.3984375, + 56.77734375 + ], + "min": [ + -25.576171875, + 46.93359375, + 89.736328125, + -30.41015625, + -77.607421875 + ], + "mean": [ + 7.780572414398193, + 121.54933166503906, + 145.44825744628906, + 26.051393508911133, + -12.748016357421875 + ], + "std": [ + 11.060831069946289, + 21.937597274780273, + 17.16187286376953, + 19.231945037841797, + 14.66512680053711 + ], + "q01": [ + -17.578125, + 58.0078125, + 97.998046875, + -13.447265625, + -39.9005859375 + ], + "q99": [ + 36.650390625, + 134.47265625, + 178.41796875, + 66.65009765625, + 40.166015625 + ] + }, + "gripper": { + "max": [ + 52.22222137451172 + ], + "min": [ + -3.846153974533081 + ], + "mean": [ + 10.933439254760742 + ], + "std": [ + 15.509913444519043 + ], + "q01": [ + -3.846153974533081 + ], + "q99": [ + 51.02564239501953 + ] + }, + "mobile_base": { + "max": [ + 75.42072296142578, + 276.7638244628906, + 93.75 + ], + "min": [ + -170.01620483398438, + -274.5497131347656, + -93.75 + ], + "mean": [ + -0.31241804361343384, + 58.99717712402344, + 2.4293017387390137 + ], + "std": [ + 10.56183910369873, + 119.39802551269531, + 22.590484619140625 + ], + "q01": [ + -33.65809627532959, + -265.6932678222656, + -72.849609375 + ], + "q99": [ + 30.679615020751953, + 270.1214904785156, + 90.234375 + ] + } + }, + "action": { + "single_arm": { + "max": [ + 37.96875, + 135.087890625, + 179.384765625, + 78.837890625, + 57.392578125 + ], + "min": [ + -26.279296875, + 47.373046875, + 89.912109375, + -31.640625, + -77.16796875 + ], + "mean": [ + 8.038639068603516, + 122.76031494140625, + 145.15855407714844, + 26.28432846069336, + -13.195321083068848 + ], + "std": [ + 11.36032772064209, + 21.925451278686523, + 17.071842193603516, + 19.503877639770508, + 14.882487297058105 + ], + "q01": [ + -18.10546875, + 58.623046875, + 98.26171875, + -14.326171875, + -40.078125 + ], + "q99": [ + 37.44140625, + 135.087890625, + 179.296875, + 67.1484375, + 40.869140625 + ] + }, + "gripper": { + "max": [ + 52.646484375 + ], + "min": [ + -10.72265625 + ], + "mean": [ + 4.366570949554443 + ], + "std": [ + 18.90865707397461 + ], + "q01": [ + -10.546875 + ], + "q99": [ + 51.767578125 + ] + }, + "mobile_base": { + "max": [ + 230.0971221923828, + 265.6932678222656, + 90.0 + ], + "min": [ + -230.0971221923828, + -265.6932678222656, + -90.0 + ], + "mean": [ + -0.36507830023765564, + 60.13115310668945, + 2.5394127368927 + ], + "std": [ + 15.02155590057373, + 129.06507873535156, + 27.82071304321289 + ], + "q01": [ + -0.02556634694337845, + -265.6932678222656, + -90.0 + ], + "q99": [ + 0.02556634694337845, + 265.6932678222656, + 90.0 + ] + } + } + }, + "modalities": { + "video": { + "wrist": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + }, + "front": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + } + }, + "state": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + }, + "mobile_base": { + "absolute": true, + "rotation_type": null, + "shape": [ + 3 + ], + "continuous": true + } + }, + "action": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + }, + "mobile_base": { + "absolute": true, + "rotation_type": null, + "shape": [ + 3 + ], + "continuous": true + } + } + }, + "embodiment_tag": "new_embodiment" + } +} \ No newline at end of file diff --git a/checkpoint-7000/optimizer.pt b/checkpoint-7000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0491f8722bfdb269d0683c83c53e0610373d2cc --- /dev/null +++ b/checkpoint-7000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1670a79d6ec87ef34796ee1a51ce88f6cfb39ccf5f6f1c7a766e1aa501466b5d +size 246824634 diff --git a/checkpoint-7000/rng_state.pth b/checkpoint-7000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..814aa2bd44c06a560c3da27389d22fe8eebad7b0 --- /dev/null +++ b/checkpoint-7000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f2b87b18f2501953c4fdaa3bce4008b6859745fcd22bd46b16f3a89b3ce9a24 +size 14244 diff --git a/checkpoint-7000/scheduler.pt b/checkpoint-7000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4be81b71fb9a0d3d6a169c4c6306bc41f56c18af --- /dev/null +++ b/checkpoint-7000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f4a62865e4efefe590b1e7efb8b83e3201ac29d0f7970441bd0ea45267d0f10 +size 1064 diff --git a/checkpoint-7000/trainer_state.json b/checkpoint-7000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e098cef5f7c93cf84e01ddc8ce2262071afd9ab9 --- /dev/null +++ b/checkpoint-7000/trainer_state.json @@ -0,0 +1,4933 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 26.119402985074625, + "eval_steps": 500, + "global_step": 7000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03731343283582089, + "grad_norm": 0.8186072111129761, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.3847, + "step": 10 + }, + { + "epoch": 0.07462686567164178, + "grad_norm": 0.5007426142692566, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.4283, + "step": 20 + }, + { + "epoch": 0.11194029850746269, + "grad_norm": 0.49460887908935547, + "learning_rate": 3e-06, + "loss": 1.4868, + "step": 30 + }, + { + "epoch": 0.14925373134328357, + "grad_norm": 0.5032920837402344, + "learning_rate": 4.000000000000001e-06, + "loss": 1.4491, + "step": 40 + }, + { + "epoch": 0.1865671641791045, + "grad_norm": 0.5688469409942627, + "learning_rate": 5e-06, + "loss": 1.3703, + "step": 50 + }, + { + "epoch": 0.22388059701492538, + "grad_norm": 0.5052517652511597, + "learning_rate": 6e-06, + "loss": 1.419, + "step": 60 + }, + { + "epoch": 0.26119402985074625, + "grad_norm": 0.6315643787384033, + "learning_rate": 7.000000000000001e-06, + "loss": 1.3058, + "step": 70 + }, + { + "epoch": 0.29850746268656714, + "grad_norm": 0.6060447692871094, + "learning_rate": 8.000000000000001e-06, + "loss": 1.2908, + "step": 80 + }, + { + "epoch": 0.3358208955223881, + "grad_norm": 0.5513179302215576, + "learning_rate": 9e-06, + "loss": 1.2311, + "step": 90 + }, + { + "epoch": 0.373134328358209, + "grad_norm": 0.8467404246330261, + "learning_rate": 1e-05, + "loss": 1.2043, + "step": 100 + }, + { + "epoch": 0.41044776119402987, + "grad_norm": 0.8141824007034302, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.0707, + "step": 110 + }, + { + "epoch": 0.44776119402985076, + "grad_norm": 0.7932347059249878, + "learning_rate": 1.2e-05, + "loss": 0.9377, + "step": 120 + }, + { + "epoch": 0.48507462686567165, + "grad_norm": 0.684220552444458, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.714, + "step": 130 + }, + { + "epoch": 0.5223880597014925, + "grad_norm": 0.5886895060539246, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.6479, + "step": 140 + }, + { + "epoch": 0.5597014925373134, + "grad_norm": 0.4764939248561859, + "learning_rate": 1.5e-05, + "loss": 0.5463, + "step": 150 + }, + { + "epoch": 0.5970149253731343, + "grad_norm": 0.4621008038520813, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.4641, + "step": 160 + }, + { + "epoch": 0.6343283582089553, + "grad_norm": 0.46492910385131836, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.4159, + "step": 170 + }, + { + "epoch": 0.6716417910447762, + "grad_norm": 0.5017415881156921, + "learning_rate": 1.8e-05, + "loss": 0.4094, + "step": 180 + }, + { + "epoch": 0.7089552238805971, + "grad_norm": 0.34392210841178894, + "learning_rate": 1.9e-05, + "loss": 0.3478, + "step": 190 + }, + { + "epoch": 0.746268656716418, + "grad_norm": 0.3240516483783722, + "learning_rate": 2e-05, + "loss": 0.3821, + "step": 200 + }, + { + "epoch": 0.7835820895522388, + "grad_norm": 0.26301339268684387, + "learning_rate": 2.1e-05, + "loss": 0.3606, + "step": 210 + }, + { + "epoch": 0.8208955223880597, + "grad_norm": 0.34712520241737366, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.3421, + "step": 220 + }, + { + "epoch": 0.8582089552238806, + "grad_norm": 0.3248469829559326, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.3389, + "step": 230 + }, + { + "epoch": 0.8955223880597015, + "grad_norm": 0.298149436712265, + "learning_rate": 2.4e-05, + "loss": 0.3145, + "step": 240 + }, + { + "epoch": 0.9328358208955224, + "grad_norm": 0.2757190763950348, + "learning_rate": 2.5e-05, + "loss": 0.3065, + "step": 250 + }, + { + "epoch": 0.9701492537313433, + "grad_norm": 0.30510950088500977, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.2971, + "step": 260 + }, + { + "epoch": 1.007462686567164, + "grad_norm": 0.37349891662597656, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.3273, + "step": 270 + }, + { + "epoch": 1.044776119402985, + "grad_norm": 0.3667634129524231, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.308, + "step": 280 + }, + { + "epoch": 1.0820895522388059, + "grad_norm": 0.3463355004787445, + "learning_rate": 2.9e-05, + "loss": 0.3109, + "step": 290 + }, + { + "epoch": 1.1194029850746268, + "grad_norm": 0.3888525366783142, + "learning_rate": 3e-05, + "loss": 0.2644, + "step": 300 + }, + { + "epoch": 1.1567164179104479, + "grad_norm": 0.3749147951602936, + "learning_rate": 3.1e-05, + "loss": 0.2858, + "step": 310 + }, + { + "epoch": 1.1940298507462686, + "grad_norm": 0.3270276188850403, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.2573, + "step": 320 + }, + { + "epoch": 1.2313432835820897, + "grad_norm": 0.3658592998981476, + "learning_rate": 3.3e-05, + "loss": 0.2613, + "step": 330 + }, + { + "epoch": 1.2686567164179103, + "grad_norm": 0.3526328206062317, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.2328, + "step": 340 + }, + { + "epoch": 1.3059701492537314, + "grad_norm": 0.4528139531612396, + "learning_rate": 3.5e-05, + "loss": 0.2429, + "step": 350 + }, + { + "epoch": 1.3432835820895521, + "grad_norm": 0.5426791310310364, + "learning_rate": 3.6e-05, + "loss": 0.2209, + "step": 360 + }, + { + "epoch": 1.3805970149253732, + "grad_norm": 0.41844552755355835, + "learning_rate": 3.7e-05, + "loss": 0.2319, + "step": 370 + }, + { + "epoch": 1.417910447761194, + "grad_norm": 0.4749431908130646, + "learning_rate": 3.8e-05, + "loss": 0.2233, + "step": 380 + }, + { + "epoch": 1.455223880597015, + "grad_norm": 0.7010189890861511, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.2181, + "step": 390 + }, + { + "epoch": 1.4925373134328357, + "grad_norm": 0.5747635960578918, + "learning_rate": 4e-05, + "loss": 0.213, + "step": 400 + }, + { + "epoch": 1.5298507462686568, + "grad_norm": 0.3661474287509918, + "learning_rate": 4.1e-05, + "loss": 0.2171, + "step": 410 + }, + { + "epoch": 1.5671641791044775, + "grad_norm": 0.467835396528244, + "learning_rate": 4.2e-05, + "loss": 0.1985, + "step": 420 + }, + { + "epoch": 1.6044776119402986, + "grad_norm": 0.5470123291015625, + "learning_rate": 4.3e-05, + "loss": 0.2176, + "step": 430 + }, + { + "epoch": 1.6417910447761193, + "grad_norm": 0.5761199593544006, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.2007, + "step": 440 + }, + { + "epoch": 1.6791044776119404, + "grad_norm": 0.48257485032081604, + "learning_rate": 4.5e-05, + "loss": 0.2043, + "step": 450 + }, + { + "epoch": 1.716417910447761, + "grad_norm": 0.48353052139282227, + "learning_rate": 4.600000000000001e-05, + "loss": 0.1872, + "step": 460 + }, + { + "epoch": 1.7537313432835822, + "grad_norm": 0.4388391375541687, + "learning_rate": 4.7e-05, + "loss": 0.206, + "step": 470 + }, + { + "epoch": 1.7910447761194028, + "grad_norm": 0.47332626581192017, + "learning_rate": 4.8e-05, + "loss": 0.1876, + "step": 480 + }, + { + "epoch": 1.828358208955224, + "grad_norm": 0.8053535223007202, + "learning_rate": 4.9e-05, + "loss": 0.1839, + "step": 490 + }, + { + "epoch": 1.8656716417910446, + "grad_norm": 0.413979709148407, + "learning_rate": 5e-05, + "loss": 0.1732, + "step": 500 + }, + { + "epoch": 1.9029850746268657, + "grad_norm": 0.36910712718963623, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.1827, + "step": 510 + }, + { + "epoch": 1.9402985074626866, + "grad_norm": 0.8458298444747925, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.1727, + "step": 520 + }, + { + "epoch": 1.9776119402985075, + "grad_norm": 0.5452115535736084, + "learning_rate": 5.300000000000001e-05, + "loss": 0.1818, + "step": 530 + }, + { + "epoch": 2.014925373134328, + "grad_norm": 0.4518108069896698, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.177, + "step": 540 + }, + { + "epoch": 2.0522388059701493, + "grad_norm": 0.66865074634552, + "learning_rate": 5.500000000000001e-05, + "loss": 0.1726, + "step": 550 + }, + { + "epoch": 2.08955223880597, + "grad_norm": 0.6536034345626831, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.1541, + "step": 560 + }, + { + "epoch": 2.126865671641791, + "grad_norm": 0.5571377277374268, + "learning_rate": 5.6999999999999996e-05, + "loss": 0.1671, + "step": 570 + }, + { + "epoch": 2.1641791044776117, + "grad_norm": 0.5385546684265137, + "learning_rate": 5.8e-05, + "loss": 0.1582, + "step": 580 + }, + { + "epoch": 2.201492537313433, + "grad_norm": 0.577961266040802, + "learning_rate": 5.9e-05, + "loss": 0.1528, + "step": 590 + }, + { + "epoch": 2.2388059701492535, + "grad_norm": 0.5082416534423828, + "learning_rate": 6e-05, + "loss": 0.1638, + "step": 600 + }, + { + "epoch": 2.2761194029850746, + "grad_norm": 0.5490861535072327, + "learning_rate": 6.1e-05, + "loss": 0.166, + "step": 610 + }, + { + "epoch": 2.3134328358208958, + "grad_norm": 0.492366760969162, + "learning_rate": 6.2e-05, + "loss": 0.1481, + "step": 620 + }, + { + "epoch": 2.3507462686567164, + "grad_norm": 0.3702855110168457, + "learning_rate": 6.3e-05, + "loss": 0.1514, + "step": 630 + }, + { + "epoch": 2.388059701492537, + "grad_norm": 0.664667010307312, + "learning_rate": 6.400000000000001e-05, + "loss": 0.1441, + "step": 640 + }, + { + "epoch": 2.425373134328358, + "grad_norm": 0.33382174372673035, + "learning_rate": 6.500000000000001e-05, + "loss": 0.1573, + "step": 650 + }, + { + "epoch": 2.4626865671641793, + "grad_norm": 0.4848814010620117, + "learning_rate": 6.6e-05, + "loss": 0.1457, + "step": 660 + }, + { + "epoch": 2.5, + "grad_norm": 0.3649997413158417, + "learning_rate": 6.7e-05, + "loss": 0.1467, + "step": 670 + }, + { + "epoch": 2.5373134328358207, + "grad_norm": 0.6385223865509033, + "learning_rate": 6.800000000000001e-05, + "loss": 0.145, + "step": 680 + }, + { + "epoch": 2.574626865671642, + "grad_norm": 0.4580625891685486, + "learning_rate": 6.9e-05, + "loss": 0.1352, + "step": 690 + }, + { + "epoch": 2.611940298507463, + "grad_norm": 0.5141746401786804, + "learning_rate": 7e-05, + "loss": 0.1444, + "step": 700 + }, + { + "epoch": 2.6492537313432836, + "grad_norm": 0.40220722556114197, + "learning_rate": 7.1e-05, + "loss": 0.1493, + "step": 710 + }, + { + "epoch": 2.6865671641791042, + "grad_norm": 0.5510571002960205, + "learning_rate": 7.2e-05, + "loss": 0.1387, + "step": 720 + }, + { + "epoch": 2.7238805970149254, + "grad_norm": 0.43814659118652344, + "learning_rate": 7.3e-05, + "loss": 0.1374, + "step": 730 + }, + { + "epoch": 2.7611940298507465, + "grad_norm": 0.4118008613586426, + "learning_rate": 7.4e-05, + "loss": 0.1297, + "step": 740 + }, + { + "epoch": 2.798507462686567, + "grad_norm": 0.5626503229141235, + "learning_rate": 7.500000000000001e-05, + "loss": 0.1299, + "step": 750 + }, + { + "epoch": 2.835820895522388, + "grad_norm": 0.4066360592842102, + "learning_rate": 7.6e-05, + "loss": 0.1102, + "step": 760 + }, + { + "epoch": 2.873134328358209, + "grad_norm": 0.47184985876083374, + "learning_rate": 7.7e-05, + "loss": 0.1219, + "step": 770 + }, + { + "epoch": 2.91044776119403, + "grad_norm": 0.6611475348472595, + "learning_rate": 7.800000000000001e-05, + "loss": 0.1267, + "step": 780 + }, + { + "epoch": 2.9477611940298507, + "grad_norm": 0.3570108413696289, + "learning_rate": 7.900000000000001e-05, + "loss": 0.1191, + "step": 790 + }, + { + "epoch": 2.9850746268656714, + "grad_norm": 0.4581681489944458, + "learning_rate": 8e-05, + "loss": 0.1209, + "step": 800 + }, + { + "epoch": 3.0223880597014925, + "grad_norm": 0.4643435776233673, + "learning_rate": 8.1e-05, + "loss": 0.129, + "step": 810 + }, + { + "epoch": 3.0597014925373136, + "grad_norm": 0.5595763921737671, + "learning_rate": 8.2e-05, + "loss": 0.1158, + "step": 820 + }, + { + "epoch": 3.0970149253731343, + "grad_norm": 0.48848605155944824, + "learning_rate": 8.3e-05, + "loss": 0.1188, + "step": 830 + }, + { + "epoch": 3.1343283582089554, + "grad_norm": 0.4496570825576782, + "learning_rate": 8.4e-05, + "loss": 0.114, + "step": 840 + }, + { + "epoch": 3.171641791044776, + "grad_norm": 0.31364986300468445, + "learning_rate": 8.5e-05, + "loss": 0.1196, + "step": 850 + }, + { + "epoch": 3.208955223880597, + "grad_norm": 0.3395878076553345, + "learning_rate": 8.6e-05, + "loss": 0.1124, + "step": 860 + }, + { + "epoch": 3.246268656716418, + "grad_norm": 0.4917413592338562, + "learning_rate": 8.7e-05, + "loss": 0.1074, + "step": 870 + }, + { + "epoch": 3.283582089552239, + "grad_norm": 0.44114553928375244, + "learning_rate": 8.800000000000001e-05, + "loss": 0.1095, + "step": 880 + }, + { + "epoch": 3.3208955223880596, + "grad_norm": 0.3323831558227539, + "learning_rate": 8.900000000000001e-05, + "loss": 0.106, + "step": 890 + }, + { + "epoch": 3.3582089552238807, + "grad_norm": 0.4495660066604614, + "learning_rate": 9e-05, + "loss": 0.1222, + "step": 900 + }, + { + "epoch": 3.3955223880597014, + "grad_norm": 0.40784788131713867, + "learning_rate": 9.1e-05, + "loss": 0.1048, + "step": 910 + }, + { + "epoch": 3.4328358208955225, + "grad_norm": 0.4643700420856476, + "learning_rate": 9.200000000000001e-05, + "loss": 0.1097, + "step": 920 + }, + { + "epoch": 3.470149253731343, + "grad_norm": 0.472494512796402, + "learning_rate": 9.300000000000001e-05, + "loss": 0.1041, + "step": 930 + }, + { + "epoch": 3.5074626865671643, + "grad_norm": 0.6110897660255432, + "learning_rate": 9.4e-05, + "loss": 0.0959, + "step": 940 + }, + { + "epoch": 3.544776119402985, + "grad_norm": 0.5313069820404053, + "learning_rate": 9.5e-05, + "loss": 0.113, + "step": 950 + }, + { + "epoch": 3.582089552238806, + "grad_norm": 0.4223133623600006, + "learning_rate": 9.6e-05, + "loss": 0.099, + "step": 960 + }, + { + "epoch": 3.6194029850746268, + "grad_norm": 0.5464731454849243, + "learning_rate": 9.7e-05, + "loss": 0.1008, + "step": 970 + }, + { + "epoch": 3.656716417910448, + "grad_norm": 0.3538314402103424, + "learning_rate": 9.8e-05, + "loss": 0.1049, + "step": 980 + }, + { + "epoch": 3.6940298507462686, + "grad_norm": 0.7460148334503174, + "learning_rate": 9.900000000000001e-05, + "loss": 0.1088, + "step": 990 + }, + { + "epoch": 3.7313432835820897, + "grad_norm": 0.3210597038269043, + "learning_rate": 0.0001, + "loss": 0.1041, + "step": 1000 + }, + { + "epoch": 3.7686567164179103, + "grad_norm": 0.4450497627258301, + "learning_rate": 9.999993165095463e-05, + "loss": 0.0985, + "step": 1010 + }, + { + "epoch": 3.8059701492537314, + "grad_norm": 0.4348960816860199, + "learning_rate": 9.999972660400536e-05, + "loss": 0.1015, + "step": 1020 + }, + { + "epoch": 3.843283582089552, + "grad_norm": 0.462782621383667, + "learning_rate": 9.999938485971279e-05, + "loss": 0.1068, + "step": 1030 + }, + { + "epoch": 3.8805970149253732, + "grad_norm": 0.3801368474960327, + "learning_rate": 9.999890641901125e-05, + "loss": 0.1117, + "step": 1040 + }, + { + "epoch": 3.917910447761194, + "grad_norm": 0.45135366916656494, + "learning_rate": 9.999829128320874e-05, + "loss": 0.0917, + "step": 1050 + }, + { + "epoch": 3.955223880597015, + "grad_norm": 0.41138389706611633, + "learning_rate": 9.999753945398704e-05, + "loss": 0.1049, + "step": 1060 + }, + { + "epoch": 3.9925373134328357, + "grad_norm": 0.4976252317428589, + "learning_rate": 9.999665093340165e-05, + "loss": 0.1029, + "step": 1070 + }, + { + "epoch": 4.029850746268656, + "grad_norm": 0.46372008323669434, + "learning_rate": 9.99956257238817e-05, + "loss": 0.1012, + "step": 1080 + }, + { + "epoch": 4.067164179104478, + "grad_norm": 0.546938955783844, + "learning_rate": 9.999446382823013e-05, + "loss": 0.0829, + "step": 1090 + }, + { + "epoch": 4.104477611940299, + "grad_norm": 0.40513405203819275, + "learning_rate": 9.999316524962345e-05, + "loss": 0.0933, + "step": 1100 + }, + { + "epoch": 4.141791044776119, + "grad_norm": 0.4198484420776367, + "learning_rate": 9.999172999161198e-05, + "loss": 0.0895, + "step": 1110 + }, + { + "epoch": 4.17910447761194, + "grad_norm": 0.3965628743171692, + "learning_rate": 9.999015805811965e-05, + "loss": 0.0917, + "step": 1120 + }, + { + "epoch": 4.2164179104477615, + "grad_norm": 0.3095884621143341, + "learning_rate": 9.998844945344405e-05, + "loss": 0.0953, + "step": 1130 + }, + { + "epoch": 4.253731343283582, + "grad_norm": 0.7962276339530945, + "learning_rate": 9.998660418225645e-05, + "loss": 0.0979, + "step": 1140 + }, + { + "epoch": 4.291044776119403, + "grad_norm": 0.42066490650177, + "learning_rate": 9.998462224960175e-05, + "loss": 0.099, + "step": 1150 + }, + { + "epoch": 4.3283582089552235, + "grad_norm": 0.3894193470478058, + "learning_rate": 9.998250366089848e-05, + "loss": 0.0887, + "step": 1160 + }, + { + "epoch": 4.365671641791045, + "grad_norm": 0.28998032212257385, + "learning_rate": 9.998024842193876e-05, + "loss": 0.0943, + "step": 1170 + }, + { + "epoch": 4.402985074626866, + "grad_norm": 0.3919823467731476, + "learning_rate": 9.997785653888835e-05, + "loss": 0.0916, + "step": 1180 + }, + { + "epoch": 4.440298507462686, + "grad_norm": 0.3708650469779968, + "learning_rate": 9.997532801828658e-05, + "loss": 0.0858, + "step": 1190 + }, + { + "epoch": 4.477611940298507, + "grad_norm": 0.2935069799423218, + "learning_rate": 9.997266286704631e-05, + "loss": 0.0992, + "step": 1200 + }, + { + "epoch": 4.514925373134329, + "grad_norm": 0.4675377607345581, + "learning_rate": 9.996986109245395e-05, + "loss": 0.0854, + "step": 1210 + }, + { + "epoch": 4.552238805970149, + "grad_norm": 0.31374865770339966, + "learning_rate": 9.996692270216947e-05, + "loss": 0.0788, + "step": 1220 + }, + { + "epoch": 4.58955223880597, + "grad_norm": 0.419249951839447, + "learning_rate": 9.996384770422629e-05, + "loss": 0.0873, + "step": 1230 + }, + { + "epoch": 4.6268656716417915, + "grad_norm": 0.26002731919288635, + "learning_rate": 9.996063610703137e-05, + "loss": 0.0845, + "step": 1240 + }, + { + "epoch": 4.664179104477612, + "grad_norm": 0.29573896527290344, + "learning_rate": 9.995728791936504e-05, + "loss": 0.091, + "step": 1250 + }, + { + "epoch": 4.701492537313433, + "grad_norm": 0.33090147376060486, + "learning_rate": 9.995380315038119e-05, + "loss": 0.0827, + "step": 1260 + }, + { + "epoch": 4.7388059701492535, + "grad_norm": 0.24417485296726227, + "learning_rate": 9.9950181809607e-05, + "loss": 0.0859, + "step": 1270 + }, + { + "epoch": 4.776119402985074, + "grad_norm": 0.48290401697158813, + "learning_rate": 9.994642390694308e-05, + "loss": 0.0889, + "step": 1280 + }, + { + "epoch": 4.813432835820896, + "grad_norm": 0.4479697048664093, + "learning_rate": 9.99425294526634e-05, + "loss": 0.097, + "step": 1290 + }, + { + "epoch": 4.850746268656716, + "grad_norm": 0.3560147285461426, + "learning_rate": 9.993849845741524e-05, + "loss": 0.0904, + "step": 1300 + }, + { + "epoch": 4.888059701492537, + "grad_norm": 0.6645416617393494, + "learning_rate": 9.99343309322192e-05, + "loss": 0.0922, + "step": 1310 + }, + { + "epoch": 4.925373134328359, + "grad_norm": 0.29696759581565857, + "learning_rate": 9.993002688846913e-05, + "loss": 0.093, + "step": 1320 + }, + { + "epoch": 4.962686567164179, + "grad_norm": 0.47146692872047424, + "learning_rate": 9.992558633793212e-05, + "loss": 0.085, + "step": 1330 + }, + { + "epoch": 5.0, + "grad_norm": 0.3430916368961334, + "learning_rate": 9.992100929274846e-05, + "loss": 0.0805, + "step": 1340 + }, + { + "epoch": 5.037313432835821, + "grad_norm": 0.3205055892467499, + "learning_rate": 9.991629576543163e-05, + "loss": 0.0766, + "step": 1350 + }, + { + "epoch": 5.074626865671641, + "grad_norm": 0.3664805293083191, + "learning_rate": 9.991144576886823e-05, + "loss": 0.0766, + "step": 1360 + }, + { + "epoch": 5.111940298507463, + "grad_norm": 0.3753412663936615, + "learning_rate": 9.990645931631796e-05, + "loss": 0.0688, + "step": 1370 + }, + { + "epoch": 5.149253731343284, + "grad_norm": 0.31633055210113525, + "learning_rate": 9.990133642141359e-05, + "loss": 0.0796, + "step": 1380 + }, + { + "epoch": 5.186567164179104, + "grad_norm": 0.3355732262134552, + "learning_rate": 9.989607709816091e-05, + "loss": 0.0716, + "step": 1390 + }, + { + "epoch": 5.223880597014926, + "grad_norm": 0.24850831925868988, + "learning_rate": 9.989068136093873e-05, + "loss": 0.0778, + "step": 1400 + }, + { + "epoch": 5.2611940298507465, + "grad_norm": 0.29537102580070496, + "learning_rate": 9.988514922449879e-05, + "loss": 0.0759, + "step": 1410 + }, + { + "epoch": 5.298507462686567, + "grad_norm": 0.3430945873260498, + "learning_rate": 9.987948070396571e-05, + "loss": 0.0774, + "step": 1420 + }, + { + "epoch": 5.335820895522388, + "grad_norm": 0.5220637917518616, + "learning_rate": 9.987367581483705e-05, + "loss": 0.0836, + "step": 1430 + }, + { + "epoch": 5.373134328358209, + "grad_norm": 0.28184008598327637, + "learning_rate": 9.986773457298311e-05, + "loss": 0.0752, + "step": 1440 + }, + { + "epoch": 5.41044776119403, + "grad_norm": 0.36261311173439026, + "learning_rate": 9.986165699464705e-05, + "loss": 0.075, + "step": 1450 + }, + { + "epoch": 5.447761194029851, + "grad_norm": 0.5107380151748657, + "learning_rate": 9.985544309644475e-05, + "loss": 0.0814, + "step": 1460 + }, + { + "epoch": 5.485074626865671, + "grad_norm": 0.2446671426296234, + "learning_rate": 9.984909289536473e-05, + "loss": 0.0704, + "step": 1470 + }, + { + "epoch": 5.522388059701493, + "grad_norm": 0.30449381470680237, + "learning_rate": 9.984260640876821e-05, + "loss": 0.0794, + "step": 1480 + }, + { + "epoch": 5.559701492537314, + "grad_norm": 0.25645050406455994, + "learning_rate": 9.983598365438902e-05, + "loss": 0.0709, + "step": 1490 + }, + { + "epoch": 5.597014925373134, + "grad_norm": 0.23825006186962128, + "learning_rate": 9.98292246503335e-05, + "loss": 0.0828, + "step": 1500 + }, + { + "epoch": 5.634328358208955, + "grad_norm": 0.3259269893169403, + "learning_rate": 9.98223294150805e-05, + "loss": 0.0824, + "step": 1510 + }, + { + "epoch": 5.6716417910447765, + "grad_norm": 0.24058914184570312, + "learning_rate": 9.981529796748134e-05, + "loss": 0.073, + "step": 1520 + }, + { + "epoch": 5.708955223880597, + "grad_norm": 0.34457242488861084, + "learning_rate": 9.980813032675974e-05, + "loss": 0.0845, + "step": 1530 + }, + { + "epoch": 5.746268656716418, + "grad_norm": 0.32940393686294556, + "learning_rate": 9.980082651251175e-05, + "loss": 0.0832, + "step": 1540 + }, + { + "epoch": 5.7835820895522385, + "grad_norm": 0.5683007836341858, + "learning_rate": 9.979338654470569e-05, + "loss": 0.0836, + "step": 1550 + }, + { + "epoch": 5.82089552238806, + "grad_norm": 0.31041061878204346, + "learning_rate": 9.97858104436822e-05, + "loss": 0.07, + "step": 1560 + }, + { + "epoch": 5.858208955223881, + "grad_norm": 0.37858131527900696, + "learning_rate": 9.977809823015401e-05, + "loss": 0.0738, + "step": 1570 + }, + { + "epoch": 5.895522388059701, + "grad_norm": 0.2743091583251953, + "learning_rate": 9.977024992520602e-05, + "loss": 0.0761, + "step": 1580 + }, + { + "epoch": 5.932835820895522, + "grad_norm": 0.29117098450660706, + "learning_rate": 9.976226555029522e-05, + "loss": 0.0777, + "step": 1590 + }, + { + "epoch": 5.970149253731344, + "grad_norm": 0.31398633122444153, + "learning_rate": 9.975414512725057e-05, + "loss": 0.0664, + "step": 1600 + }, + { + "epoch": 6.007462686567164, + "grad_norm": 0.2684272527694702, + "learning_rate": 9.974588867827301e-05, + "loss": 0.0686, + "step": 1610 + }, + { + "epoch": 6.044776119402985, + "grad_norm": 0.3945397436618805, + "learning_rate": 9.973749622593534e-05, + "loss": 0.0614, + "step": 1620 + }, + { + "epoch": 6.082089552238806, + "grad_norm": 0.2747954726219177, + "learning_rate": 9.972896779318219e-05, + "loss": 0.0681, + "step": 1630 + }, + { + "epoch": 6.119402985074627, + "grad_norm": 0.43257200717926025, + "learning_rate": 9.972030340333001e-05, + "loss": 0.0725, + "step": 1640 + }, + { + "epoch": 6.156716417910448, + "grad_norm": 0.3559250831604004, + "learning_rate": 9.97115030800669e-05, + "loss": 0.0804, + "step": 1650 + }, + { + "epoch": 6.1940298507462686, + "grad_norm": 0.3079264760017395, + "learning_rate": 9.970256684745258e-05, + "loss": 0.0649, + "step": 1660 + }, + { + "epoch": 6.231343283582089, + "grad_norm": 0.32298946380615234, + "learning_rate": 9.969349472991838e-05, + "loss": 0.0668, + "step": 1670 + }, + { + "epoch": 6.268656716417911, + "grad_norm": 0.2826225459575653, + "learning_rate": 9.968428675226714e-05, + "loss": 0.0734, + "step": 1680 + }, + { + "epoch": 6.3059701492537314, + "grad_norm": 0.39002349972724915, + "learning_rate": 9.967494293967312e-05, + "loss": 0.0728, + "step": 1690 + }, + { + "epoch": 6.343283582089552, + "grad_norm": 0.403890997171402, + "learning_rate": 9.966546331768191e-05, + "loss": 0.067, + "step": 1700 + }, + { + "epoch": 6.380597014925373, + "grad_norm": 0.3755359351634979, + "learning_rate": 9.965584791221048e-05, + "loss": 0.0755, + "step": 1710 + }, + { + "epoch": 6.417910447761194, + "grad_norm": 0.26346635818481445, + "learning_rate": 9.964609674954696e-05, + "loss": 0.0728, + "step": 1720 + }, + { + "epoch": 6.455223880597015, + "grad_norm": 0.45292145013809204, + "learning_rate": 9.963620985635065e-05, + "loss": 0.0731, + "step": 1730 + }, + { + "epoch": 6.492537313432836, + "grad_norm": 0.3568434715270996, + "learning_rate": 9.962618725965196e-05, + "loss": 0.0761, + "step": 1740 + }, + { + "epoch": 6.529850746268656, + "grad_norm": 0.2551257014274597, + "learning_rate": 9.961602898685226e-05, + "loss": 0.0694, + "step": 1750 + }, + { + "epoch": 6.567164179104478, + "grad_norm": 0.6106354594230652, + "learning_rate": 9.96057350657239e-05, + "loss": 0.0827, + "step": 1760 + }, + { + "epoch": 6.604477611940299, + "grad_norm": 0.3226093053817749, + "learning_rate": 9.959530552441005e-05, + "loss": 0.0716, + "step": 1770 + }, + { + "epoch": 6.641791044776119, + "grad_norm": 0.4297254979610443, + "learning_rate": 9.95847403914247e-05, + "loss": 0.0748, + "step": 1780 + }, + { + "epoch": 6.67910447761194, + "grad_norm": 0.26469680666923523, + "learning_rate": 9.95740396956525e-05, + "loss": 0.074, + "step": 1790 + }, + { + "epoch": 6.7164179104477615, + "grad_norm": 0.22717897593975067, + "learning_rate": 9.956320346634876e-05, + "loss": 0.0739, + "step": 1800 + }, + { + "epoch": 6.753731343283582, + "grad_norm": 0.4513498544692993, + "learning_rate": 9.955223173313931e-05, + "loss": 0.0664, + "step": 1810 + }, + { + "epoch": 6.791044776119403, + "grad_norm": 0.31683439016342163, + "learning_rate": 9.954112452602045e-05, + "loss": 0.069, + "step": 1820 + }, + { + "epoch": 6.8283582089552235, + "grad_norm": 0.3350532650947571, + "learning_rate": 9.952988187535886e-05, + "loss": 0.0699, + "step": 1830 + }, + { + "epoch": 6.865671641791045, + "grad_norm": 0.29829463362693787, + "learning_rate": 9.95185038118915e-05, + "loss": 0.0663, + "step": 1840 + }, + { + "epoch": 6.902985074626866, + "grad_norm": 0.31650781631469727, + "learning_rate": 9.950699036672559e-05, + "loss": 0.0668, + "step": 1850 + }, + { + "epoch": 6.940298507462686, + "grad_norm": 0.360944926738739, + "learning_rate": 9.949534157133844e-05, + "loss": 0.0696, + "step": 1860 + }, + { + "epoch": 6.977611940298507, + "grad_norm": 0.31337013840675354, + "learning_rate": 9.948355745757741e-05, + "loss": 0.073, + "step": 1870 + }, + { + "epoch": 7.014925373134329, + "grad_norm": 0.4675919711589813, + "learning_rate": 9.94716380576598e-05, + "loss": 0.0688, + "step": 1880 + }, + { + "epoch": 7.052238805970149, + "grad_norm": 0.3031919002532959, + "learning_rate": 9.945958340417283e-05, + "loss": 0.0596, + "step": 1890 + }, + { + "epoch": 7.08955223880597, + "grad_norm": 0.24858474731445312, + "learning_rate": 9.944739353007344e-05, + "loss": 0.0717, + "step": 1900 + }, + { + "epoch": 7.126865671641791, + "grad_norm": 0.20959483087062836, + "learning_rate": 9.943506846868826e-05, + "loss": 0.0694, + "step": 1910 + }, + { + "epoch": 7.164179104477612, + "grad_norm": 0.35621434450149536, + "learning_rate": 9.942260825371358e-05, + "loss": 0.063, + "step": 1920 + }, + { + "epoch": 7.201492537313433, + "grad_norm": 0.3462587594985962, + "learning_rate": 9.941001291921512e-05, + "loss": 0.068, + "step": 1930 + }, + { + "epoch": 7.2388059701492535, + "grad_norm": 0.38649681210517883, + "learning_rate": 9.939728249962807e-05, + "loss": 0.0638, + "step": 1940 + }, + { + "epoch": 7.276119402985074, + "grad_norm": 0.29564595222473145, + "learning_rate": 9.938441702975689e-05, + "loss": 0.0626, + "step": 1950 + }, + { + "epoch": 7.313432835820896, + "grad_norm": 0.339857816696167, + "learning_rate": 9.937141654477528e-05, + "loss": 0.0535, + "step": 1960 + }, + { + "epoch": 7.350746268656716, + "grad_norm": 0.2591215670108795, + "learning_rate": 9.93582810802261e-05, + "loss": 0.0645, + "step": 1970 + }, + { + "epoch": 7.388059701492537, + "grad_norm": 0.30237796902656555, + "learning_rate": 9.934501067202117e-05, + "loss": 0.0675, + "step": 1980 + }, + { + "epoch": 7.425373134328359, + "grad_norm": 0.28394174575805664, + "learning_rate": 9.93316053564413e-05, + "loss": 0.0643, + "step": 1990 + }, + { + "epoch": 7.462686567164179, + "grad_norm": 0.3124663233757019, + "learning_rate": 9.931806517013612e-05, + "loss": 0.059, + "step": 2000 + }, + { + "epoch": 7.5, + "grad_norm": 0.36073037981987, + "learning_rate": 9.930439015012396e-05, + "loss": 0.0606, + "step": 2010 + }, + { + "epoch": 7.537313432835821, + "grad_norm": 0.4091481864452362, + "learning_rate": 9.929058033379181e-05, + "loss": 0.0603, + "step": 2020 + }, + { + "epoch": 7.574626865671641, + "grad_norm": 0.44718074798583984, + "learning_rate": 9.927663575889521e-05, + "loss": 0.0741, + "step": 2030 + }, + { + "epoch": 7.611940298507463, + "grad_norm": 0.3819601833820343, + "learning_rate": 9.926255646355804e-05, + "loss": 0.0707, + "step": 2040 + }, + { + "epoch": 7.649253731343284, + "grad_norm": 0.23336420953273773, + "learning_rate": 9.92483424862726e-05, + "loss": 0.0676, + "step": 2050 + }, + { + "epoch": 7.686567164179104, + "grad_norm": 0.24415315687656403, + "learning_rate": 9.923399386589933e-05, + "loss": 0.0594, + "step": 2060 + }, + { + "epoch": 7.723880597014926, + "grad_norm": 0.3735473155975342, + "learning_rate": 9.921951064166684e-05, + "loss": 0.062, + "step": 2070 + }, + { + "epoch": 7.7611940298507465, + "grad_norm": 0.31629472970962524, + "learning_rate": 9.92048928531717e-05, + "loss": 0.0606, + "step": 2080 + }, + { + "epoch": 7.798507462686567, + "grad_norm": 0.37902557849884033, + "learning_rate": 9.919014054037836e-05, + "loss": 0.0584, + "step": 2090 + }, + { + "epoch": 7.835820895522388, + "grad_norm": 0.3486720323562622, + "learning_rate": 9.917525374361912e-05, + "loss": 0.056, + "step": 2100 + }, + { + "epoch": 7.8731343283582085, + "grad_norm": 0.3731362521648407, + "learning_rate": 9.91602325035939e-05, + "loss": 0.0601, + "step": 2110 + }, + { + "epoch": 7.91044776119403, + "grad_norm": 0.3560399115085602, + "learning_rate": 9.914507686137019e-05, + "loss": 0.06, + "step": 2120 + }, + { + "epoch": 7.947761194029851, + "grad_norm": 0.30075564980506897, + "learning_rate": 9.912978685838294e-05, + "loss": 0.0657, + "step": 2130 + }, + { + "epoch": 7.985074626865671, + "grad_norm": 0.2984028458595276, + "learning_rate": 9.911436253643445e-05, + "loss": 0.0587, + "step": 2140 + }, + { + "epoch": 8.022388059701493, + "grad_norm": 0.1980169117450714, + "learning_rate": 9.90988039376942e-05, + "loss": 0.0718, + "step": 2150 + }, + { + "epoch": 8.059701492537313, + "grad_norm": 0.31339579820632935, + "learning_rate": 9.90831111046988e-05, + "loss": 0.0557, + "step": 2160 + }, + { + "epoch": 8.097014925373134, + "grad_norm": 0.1968696266412735, + "learning_rate": 9.90672840803519e-05, + "loss": 0.0571, + "step": 2170 + }, + { + "epoch": 8.134328358208956, + "grad_norm": 0.23931682109832764, + "learning_rate": 9.905132290792394e-05, + "loss": 0.0566, + "step": 2180 + }, + { + "epoch": 8.171641791044776, + "grad_norm": 0.21741189062595367, + "learning_rate": 9.903522763105218e-05, + "loss": 0.0575, + "step": 2190 + }, + { + "epoch": 8.208955223880597, + "grad_norm": 0.22874368727207184, + "learning_rate": 9.901899829374047e-05, + "loss": 0.0565, + "step": 2200 + }, + { + "epoch": 8.246268656716419, + "grad_norm": 0.3441888093948364, + "learning_rate": 9.900263494035921e-05, + "loss": 0.0565, + "step": 2210 + }, + { + "epoch": 8.283582089552239, + "grad_norm": 0.2539830803871155, + "learning_rate": 9.89861376156452e-05, + "loss": 0.0538, + "step": 2220 + }, + { + "epoch": 8.32089552238806, + "grad_norm": 0.2235102653503418, + "learning_rate": 9.896950636470147e-05, + "loss": 0.0609, + "step": 2230 + }, + { + "epoch": 8.35820895522388, + "grad_norm": 0.1941322684288025, + "learning_rate": 9.895274123299723e-05, + "loss": 0.0562, + "step": 2240 + }, + { + "epoch": 8.395522388059701, + "grad_norm": 0.2691369950771332, + "learning_rate": 9.893584226636772e-05, + "loss": 0.0608, + "step": 2250 + }, + { + "epoch": 8.432835820895523, + "grad_norm": 0.24730461835861206, + "learning_rate": 9.891880951101407e-05, + "loss": 0.0582, + "step": 2260 + }, + { + "epoch": 8.470149253731343, + "grad_norm": 0.34785839915275574, + "learning_rate": 9.890164301350318e-05, + "loss": 0.0506, + "step": 2270 + }, + { + "epoch": 8.507462686567164, + "grad_norm": 0.3625825345516205, + "learning_rate": 9.888434282076758e-05, + "loss": 0.0614, + "step": 2280 + }, + { + "epoch": 8.544776119402986, + "grad_norm": 0.25210148096084595, + "learning_rate": 9.886690898010535e-05, + "loss": 0.0611, + "step": 2290 + }, + { + "epoch": 8.582089552238806, + "grad_norm": 0.27312466502189636, + "learning_rate": 9.884934153917997e-05, + "loss": 0.0537, + "step": 2300 + }, + { + "epoch": 8.619402985074627, + "grad_norm": 0.314647912979126, + "learning_rate": 9.883164054602012e-05, + "loss": 0.0602, + "step": 2310 + }, + { + "epoch": 8.656716417910447, + "grad_norm": 0.21531912684440613, + "learning_rate": 9.881380604901964e-05, + "loss": 0.0552, + "step": 2320 + }, + { + "epoch": 8.694029850746269, + "grad_norm": 0.23920664191246033, + "learning_rate": 9.879583809693738e-05, + "loss": 0.0613, + "step": 2330 + }, + { + "epoch": 8.73134328358209, + "grad_norm": 0.21864956617355347, + "learning_rate": 9.877773673889701e-05, + "loss": 0.0649, + "step": 2340 + }, + { + "epoch": 8.76865671641791, + "grad_norm": 0.27523377537727356, + "learning_rate": 9.8759502024387e-05, + "loss": 0.0606, + "step": 2350 + }, + { + "epoch": 8.805970149253731, + "grad_norm": 0.24805469810962677, + "learning_rate": 9.87411340032603e-05, + "loss": 0.0549, + "step": 2360 + }, + { + "epoch": 8.843283582089553, + "grad_norm": 0.23070092499256134, + "learning_rate": 9.872263272573443e-05, + "loss": 0.0562, + "step": 2370 + }, + { + "epoch": 8.880597014925373, + "grad_norm": 0.20833946764469147, + "learning_rate": 9.870399824239117e-05, + "loss": 0.05, + "step": 2380 + }, + { + "epoch": 8.917910447761194, + "grad_norm": 0.34507372975349426, + "learning_rate": 9.868523060417646e-05, + "loss": 0.0613, + "step": 2390 + }, + { + "epoch": 8.955223880597014, + "grad_norm": 0.32865110039711, + "learning_rate": 9.86663298624003e-05, + "loss": 0.0621, + "step": 2400 + }, + { + "epoch": 8.992537313432836, + "grad_norm": 0.21305270493030548, + "learning_rate": 9.864729606873663e-05, + "loss": 0.0572, + "step": 2410 + }, + { + "epoch": 9.029850746268657, + "grad_norm": 0.28193730115890503, + "learning_rate": 9.862812927522309e-05, + "loss": 0.0555, + "step": 2420 + }, + { + "epoch": 9.067164179104477, + "grad_norm": 0.3953789472579956, + "learning_rate": 9.860882953426099e-05, + "loss": 0.0536, + "step": 2430 + }, + { + "epoch": 9.104477611940299, + "grad_norm": 0.23013322055339813, + "learning_rate": 9.858939689861506e-05, + "loss": 0.0572, + "step": 2440 + }, + { + "epoch": 9.14179104477612, + "grad_norm": 0.2906680107116699, + "learning_rate": 9.856983142141339e-05, + "loss": 0.0592, + "step": 2450 + }, + { + "epoch": 9.17910447761194, + "grad_norm": 0.23490828275680542, + "learning_rate": 9.855013315614725e-05, + "loss": 0.0583, + "step": 2460 + }, + { + "epoch": 9.216417910447761, + "grad_norm": 0.22825880348682404, + "learning_rate": 9.853030215667093e-05, + "loss": 0.059, + "step": 2470 + }, + { + "epoch": 9.253731343283581, + "grad_norm": 0.25871285796165466, + "learning_rate": 9.851033847720166e-05, + "loss": 0.0555, + "step": 2480 + }, + { + "epoch": 9.291044776119403, + "grad_norm": 0.27220776677131653, + "learning_rate": 9.849024217231935e-05, + "loss": 0.0542, + "step": 2490 + }, + { + "epoch": 9.328358208955224, + "grad_norm": 0.26534005999565125, + "learning_rate": 9.847001329696653e-05, + "loss": 0.0526, + "step": 2500 + }, + { + "epoch": 9.365671641791044, + "grad_norm": 0.33486032485961914, + "learning_rate": 9.844965190644817e-05, + "loss": 0.0563, + "step": 2510 + }, + { + "epoch": 9.402985074626866, + "grad_norm": 0.2949483394622803, + "learning_rate": 9.842915805643155e-05, + "loss": 0.0556, + "step": 2520 + }, + { + "epoch": 9.440298507462687, + "grad_norm": 0.24123981595039368, + "learning_rate": 9.840853180294608e-05, + "loss": 0.05, + "step": 2530 + }, + { + "epoch": 9.477611940298507, + "grad_norm": 0.22536049783229828, + "learning_rate": 9.838777320238312e-05, + "loss": 0.0522, + "step": 2540 + }, + { + "epoch": 9.514925373134329, + "grad_norm": 0.23206663131713867, + "learning_rate": 9.836688231149592e-05, + "loss": 0.0591, + "step": 2550 + }, + { + "epoch": 9.552238805970148, + "grad_norm": 0.28573134541511536, + "learning_rate": 9.834585918739936e-05, + "loss": 0.0568, + "step": 2560 + }, + { + "epoch": 9.58955223880597, + "grad_norm": 0.2628820538520813, + "learning_rate": 9.832470388756987e-05, + "loss": 0.0571, + "step": 2570 + }, + { + "epoch": 9.626865671641792, + "grad_norm": 0.2880440652370453, + "learning_rate": 9.830341646984521e-05, + "loss": 0.0559, + "step": 2580 + }, + { + "epoch": 9.664179104477611, + "grad_norm": 0.1786259263753891, + "learning_rate": 9.82819969924244e-05, + "loss": 0.058, + "step": 2590 + }, + { + "epoch": 9.701492537313433, + "grad_norm": 0.3501608073711395, + "learning_rate": 9.826044551386744e-05, + "loss": 0.0523, + "step": 2600 + }, + { + "epoch": 9.738805970149254, + "grad_norm": 0.24757252633571625, + "learning_rate": 9.823876209309527e-05, + "loss": 0.0587, + "step": 2610 + }, + { + "epoch": 9.776119402985074, + "grad_norm": 0.2556290626525879, + "learning_rate": 9.821694678938953e-05, + "loss": 0.0555, + "step": 2620 + }, + { + "epoch": 9.813432835820896, + "grad_norm": 0.2561217248439789, + "learning_rate": 9.819499966239243e-05, + "loss": 0.052, + "step": 2630 + }, + { + "epoch": 9.850746268656717, + "grad_norm": 0.2776634097099304, + "learning_rate": 9.817292077210659e-05, + "loss": 0.0498, + "step": 2640 + }, + { + "epoch": 9.888059701492537, + "grad_norm": 0.20668549835681915, + "learning_rate": 9.815071017889482e-05, + "loss": 0.0517, + "step": 2650 + }, + { + "epoch": 9.925373134328359, + "grad_norm": 0.3100263178348541, + "learning_rate": 9.812836794348004e-05, + "loss": 0.0633, + "step": 2660 + }, + { + "epoch": 9.962686567164178, + "grad_norm": 0.2780782878398895, + "learning_rate": 9.81058941269451e-05, + "loss": 0.0581, + "step": 2670 + }, + { + "epoch": 10.0, + "grad_norm": 0.28903728723526, + "learning_rate": 9.808328879073251e-05, + "loss": 0.0538, + "step": 2680 + }, + { + "epoch": 10.037313432835822, + "grad_norm": 0.22727562487125397, + "learning_rate": 9.806055199664446e-05, + "loss": 0.0491, + "step": 2690 + }, + { + "epoch": 10.074626865671641, + "grad_norm": 0.267918199300766, + "learning_rate": 9.803768380684242e-05, + "loss": 0.0562, + "step": 2700 + }, + { + "epoch": 10.111940298507463, + "grad_norm": 0.2988606095314026, + "learning_rate": 9.801468428384716e-05, + "loss": 0.0566, + "step": 2710 + }, + { + "epoch": 10.149253731343283, + "grad_norm": 0.2710281312465668, + "learning_rate": 9.799155349053851e-05, + "loss": 0.0541, + "step": 2720 + }, + { + "epoch": 10.186567164179104, + "grad_norm": 0.15320520102977753, + "learning_rate": 9.796829149015517e-05, + "loss": 0.0548, + "step": 2730 + }, + { + "epoch": 10.223880597014926, + "grad_norm": 0.2653089463710785, + "learning_rate": 9.794489834629455e-05, + "loss": 0.0599, + "step": 2740 + }, + { + "epoch": 10.261194029850746, + "grad_norm": 0.19223959743976593, + "learning_rate": 9.792137412291265e-05, + "loss": 0.0494, + "step": 2750 + }, + { + "epoch": 10.298507462686567, + "grad_norm": 0.20455987751483917, + "learning_rate": 9.789771888432375e-05, + "loss": 0.0538, + "step": 2760 + }, + { + "epoch": 10.335820895522389, + "grad_norm": 0.24908749759197235, + "learning_rate": 9.787393269520039e-05, + "loss": 0.0481, + "step": 2770 + }, + { + "epoch": 10.373134328358208, + "grad_norm": 0.3131813406944275, + "learning_rate": 9.785001562057309e-05, + "loss": 0.0526, + "step": 2780 + }, + { + "epoch": 10.41044776119403, + "grad_norm": 0.24828971922397614, + "learning_rate": 9.782596772583026e-05, + "loss": 0.0489, + "step": 2790 + }, + { + "epoch": 10.447761194029852, + "grad_norm": 0.21727119386196136, + "learning_rate": 9.780178907671789e-05, + "loss": 0.0532, + "step": 2800 + }, + { + "epoch": 10.485074626865671, + "grad_norm": 0.20279547572135925, + "learning_rate": 9.777747973933948e-05, + "loss": 0.0565, + "step": 2810 + }, + { + "epoch": 10.522388059701493, + "grad_norm": 0.17726702988147736, + "learning_rate": 9.775303978015585e-05, + "loss": 0.0437, + "step": 2820 + }, + { + "epoch": 10.559701492537313, + "grad_norm": 0.18961119651794434, + "learning_rate": 9.772846926598491e-05, + "loss": 0.0584, + "step": 2830 + }, + { + "epoch": 10.597014925373134, + "grad_norm": 0.2498980015516281, + "learning_rate": 9.77037682640015e-05, + "loss": 0.0496, + "step": 2840 + }, + { + "epoch": 10.634328358208956, + "grad_norm": 0.16978798806667328, + "learning_rate": 9.767893684173721e-05, + "loss": 0.0469, + "step": 2850 + }, + { + "epoch": 10.671641791044776, + "grad_norm": 0.16128584742546082, + "learning_rate": 9.765397506708023e-05, + "loss": 0.0533, + "step": 2860 + }, + { + "epoch": 10.708955223880597, + "grad_norm": 0.20463155210018158, + "learning_rate": 9.762888300827507e-05, + "loss": 0.0464, + "step": 2870 + }, + { + "epoch": 10.746268656716419, + "grad_norm": 0.30601629614830017, + "learning_rate": 9.760366073392246e-05, + "loss": 0.0489, + "step": 2880 + }, + { + "epoch": 10.783582089552239, + "grad_norm": 0.2730671763420105, + "learning_rate": 9.757830831297914e-05, + "loss": 0.0495, + "step": 2890 + }, + { + "epoch": 10.82089552238806, + "grad_norm": 0.251432865858078, + "learning_rate": 9.755282581475769e-05, + "loss": 0.0549, + "step": 2900 + }, + { + "epoch": 10.85820895522388, + "grad_norm": 0.26670166850090027, + "learning_rate": 9.752721330892624e-05, + "loss": 0.061, + "step": 2910 + }, + { + "epoch": 10.895522388059701, + "grad_norm": 0.2965967655181885, + "learning_rate": 9.750147086550844e-05, + "loss": 0.0473, + "step": 2920 + }, + { + "epoch": 10.932835820895523, + "grad_norm": 0.683840274810791, + "learning_rate": 9.747559855488313e-05, + "loss": 0.0509, + "step": 2930 + }, + { + "epoch": 10.970149253731343, + "grad_norm": 0.25740495324134827, + "learning_rate": 9.744959644778422e-05, + "loss": 0.0515, + "step": 2940 + }, + { + "epoch": 11.007462686567164, + "grad_norm": 0.2880542278289795, + "learning_rate": 9.742346461530048e-05, + "loss": 0.0482, + "step": 2950 + }, + { + "epoch": 11.044776119402986, + "grad_norm": 0.45032551884651184, + "learning_rate": 9.739720312887535e-05, + "loss": 0.0557, + "step": 2960 + }, + { + "epoch": 11.082089552238806, + "grad_norm": 0.2829900085926056, + "learning_rate": 9.73708120603067e-05, + "loss": 0.052, + "step": 2970 + }, + { + "epoch": 11.119402985074627, + "grad_norm": 0.309597373008728, + "learning_rate": 9.734429148174675e-05, + "loss": 0.0541, + "step": 2980 + }, + { + "epoch": 11.156716417910447, + "grad_norm": 0.2433389127254486, + "learning_rate": 9.731764146570173e-05, + "loss": 0.0482, + "step": 2990 + }, + { + "epoch": 11.194029850746269, + "grad_norm": 0.24458132684230804, + "learning_rate": 9.729086208503174e-05, + "loss": 0.0505, + "step": 3000 + }, + { + "epoch": 11.23134328358209, + "grad_norm": 0.2305087298154831, + "learning_rate": 9.726395341295062e-05, + "loss": 0.0504, + "step": 3010 + }, + { + "epoch": 11.26865671641791, + "grad_norm": 0.18110457062721252, + "learning_rate": 9.723691552302562e-05, + "loss": 0.0575, + "step": 3020 + }, + { + "epoch": 11.305970149253731, + "grad_norm": 0.20407621562480927, + "learning_rate": 9.720974848917735e-05, + "loss": 0.0494, + "step": 3030 + }, + { + "epoch": 11.343283582089553, + "grad_norm": 0.25924697518348694, + "learning_rate": 9.718245238567939e-05, + "loss": 0.0472, + "step": 3040 + }, + { + "epoch": 11.380597014925373, + "grad_norm": 0.23041822016239166, + "learning_rate": 9.715502728715826e-05, + "loss": 0.0481, + "step": 3050 + }, + { + "epoch": 11.417910447761194, + "grad_norm": 0.25381171703338623, + "learning_rate": 9.712747326859315e-05, + "loss": 0.0543, + "step": 3060 + }, + { + "epoch": 11.455223880597014, + "grad_norm": 0.18027640879154205, + "learning_rate": 9.709979040531569e-05, + "loss": 0.055, + "step": 3070 + }, + { + "epoch": 11.492537313432836, + "grad_norm": 0.2954868674278259, + "learning_rate": 9.707197877300974e-05, + "loss": 0.0473, + "step": 3080 + }, + { + "epoch": 11.529850746268657, + "grad_norm": 0.25323861837387085, + "learning_rate": 9.704403844771128e-05, + "loss": 0.0509, + "step": 3090 + }, + { + "epoch": 11.567164179104477, + "grad_norm": 0.36910176277160645, + "learning_rate": 9.701596950580806e-05, + "loss": 0.0504, + "step": 3100 + }, + { + "epoch": 11.604477611940299, + "grad_norm": 0.34199246764183044, + "learning_rate": 9.698777202403953e-05, + "loss": 0.0526, + "step": 3110 + }, + { + "epoch": 11.64179104477612, + "grad_norm": 0.2146557718515396, + "learning_rate": 9.695944607949649e-05, + "loss": 0.0579, + "step": 3120 + }, + { + "epoch": 11.67910447761194, + "grad_norm": 0.20559175312519073, + "learning_rate": 9.693099174962103e-05, + "loss": 0.0514, + "step": 3130 + }, + { + "epoch": 11.716417910447761, + "grad_norm": 0.2689419090747833, + "learning_rate": 9.690240911220618e-05, + "loss": 0.0534, + "step": 3140 + }, + { + "epoch": 11.753731343283581, + "grad_norm": 0.34870603680610657, + "learning_rate": 9.687369824539577e-05, + "loss": 0.0485, + "step": 3150 + }, + { + "epoch": 11.791044776119403, + "grad_norm": 0.15433363616466522, + "learning_rate": 9.684485922768422e-05, + "loss": 0.0418, + "step": 3160 + }, + { + "epoch": 11.828358208955224, + "grad_norm": 0.26874423027038574, + "learning_rate": 9.681589213791633e-05, + "loss": 0.0537, + "step": 3170 + }, + { + "epoch": 11.865671641791044, + "grad_norm": 0.3361654281616211, + "learning_rate": 9.6786797055287e-05, + "loss": 0.0474, + "step": 3180 + }, + { + "epoch": 11.902985074626866, + "grad_norm": 0.17938771843910217, + "learning_rate": 9.675757405934103e-05, + "loss": 0.0443, + "step": 3190 + }, + { + "epoch": 11.940298507462687, + "grad_norm": 0.31368622183799744, + "learning_rate": 9.672822322997305e-05, + "loss": 0.0594, + "step": 3200 + }, + { + "epoch": 11.977611940298507, + "grad_norm": 0.16268151998519897, + "learning_rate": 9.669874464742705e-05, + "loss": 0.0487, + "step": 3210 + }, + { + "epoch": 12.014925373134329, + "grad_norm": 0.23879969120025635, + "learning_rate": 9.66691383922964e-05, + "loss": 0.0484, + "step": 3220 + }, + { + "epoch": 12.052238805970148, + "grad_norm": 0.2321789413690567, + "learning_rate": 9.663940454552342e-05, + "loss": 0.051, + "step": 3230 + }, + { + "epoch": 12.08955223880597, + "grad_norm": 0.22873088717460632, + "learning_rate": 9.660954318839933e-05, + "loss": 0.0406, + "step": 3240 + }, + { + "epoch": 12.126865671641792, + "grad_norm": 0.3767557740211487, + "learning_rate": 9.657955440256395e-05, + "loss": 0.0432, + "step": 3250 + }, + { + "epoch": 12.164179104477611, + "grad_norm": 0.21569453179836273, + "learning_rate": 9.654943827000548e-05, + "loss": 0.0528, + "step": 3260 + }, + { + "epoch": 12.201492537313433, + "grad_norm": 0.23698291182518005, + "learning_rate": 9.651919487306025e-05, + "loss": 0.0457, + "step": 3270 + }, + { + "epoch": 12.238805970149254, + "grad_norm": 0.21086478233337402, + "learning_rate": 9.648882429441257e-05, + "loss": 0.0508, + "step": 3280 + }, + { + "epoch": 12.276119402985074, + "grad_norm": 0.19763463735580444, + "learning_rate": 9.645832661709444e-05, + "loss": 0.0497, + "step": 3290 + }, + { + "epoch": 12.313432835820896, + "grad_norm": 0.18413852155208588, + "learning_rate": 9.642770192448536e-05, + "loss": 0.0441, + "step": 3300 + }, + { + "epoch": 12.350746268656717, + "grad_norm": 0.13946911692619324, + "learning_rate": 9.639695030031204e-05, + "loss": 0.0453, + "step": 3310 + }, + { + "epoch": 12.388059701492537, + "grad_norm": 0.21613670885562897, + "learning_rate": 9.636607182864827e-05, + "loss": 0.0511, + "step": 3320 + }, + { + "epoch": 12.425373134328359, + "grad_norm": 0.24953646957874298, + "learning_rate": 9.63350665939146e-05, + "loss": 0.0451, + "step": 3330 + }, + { + "epoch": 12.462686567164178, + "grad_norm": 0.2993795871734619, + "learning_rate": 9.630393468087818e-05, + "loss": 0.0469, + "step": 3340 + }, + { + "epoch": 12.5, + "grad_norm": 0.2261819839477539, + "learning_rate": 9.627267617465243e-05, + "loss": 0.0484, + "step": 3350 + }, + { + "epoch": 12.537313432835822, + "grad_norm": 0.23026186227798462, + "learning_rate": 9.624129116069694e-05, + "loss": 0.0452, + "step": 3360 + }, + { + "epoch": 12.574626865671641, + "grad_norm": 0.27859947085380554, + "learning_rate": 9.620977972481716e-05, + "loss": 0.0593, + "step": 3370 + }, + { + "epoch": 12.611940298507463, + "grad_norm": 0.23060785233974457, + "learning_rate": 9.617814195316411e-05, + "loss": 0.05, + "step": 3380 + }, + { + "epoch": 12.649253731343283, + "grad_norm": 0.20185025036334991, + "learning_rate": 9.614637793223425e-05, + "loss": 0.0573, + "step": 3390 + }, + { + "epoch": 12.686567164179104, + "grad_norm": 0.3584498167037964, + "learning_rate": 9.611448774886924e-05, + "loss": 0.052, + "step": 3400 + }, + { + "epoch": 12.723880597014926, + "grad_norm": 0.19336827099323273, + "learning_rate": 9.60824714902556e-05, + "loss": 0.0535, + "step": 3410 + }, + { + "epoch": 12.761194029850746, + "grad_norm": 0.22223635017871857, + "learning_rate": 9.605032924392457e-05, + "loss": 0.05, + "step": 3420 + }, + { + "epoch": 12.798507462686567, + "grad_norm": 0.17108851671218872, + "learning_rate": 9.601806109775179e-05, + "loss": 0.0475, + "step": 3430 + }, + { + "epoch": 12.835820895522389, + "grad_norm": 0.3861902952194214, + "learning_rate": 9.598566713995718e-05, + "loss": 0.0439, + "step": 3440 + }, + { + "epoch": 12.873134328358208, + "grad_norm": 0.18927253782749176, + "learning_rate": 9.595314745910456e-05, + "loss": 0.052, + "step": 3450 + }, + { + "epoch": 12.91044776119403, + "grad_norm": 0.21963383257389069, + "learning_rate": 9.59205021441015e-05, + "loss": 0.0504, + "step": 3460 + }, + { + "epoch": 12.947761194029852, + "grad_norm": 0.18016670644283295, + "learning_rate": 9.588773128419906e-05, + "loss": 0.0467, + "step": 3470 + }, + { + "epoch": 12.985074626865671, + "grad_norm": 0.1776365041732788, + "learning_rate": 9.58548349689915e-05, + "loss": 0.0414, + "step": 3480 + }, + { + "epoch": 13.022388059701493, + "grad_norm": 0.2616482973098755, + "learning_rate": 9.582181328841611e-05, + "loss": 0.0442, + "step": 3490 + }, + { + "epoch": 13.059701492537313, + "grad_norm": 0.20341171324253082, + "learning_rate": 9.578866633275288e-05, + "loss": 0.0533, + "step": 3500 + }, + { + "epoch": 13.097014925373134, + "grad_norm": 0.2223699688911438, + "learning_rate": 9.575539419262434e-05, + "loss": 0.0458, + "step": 3510 + }, + { + "epoch": 13.134328358208956, + "grad_norm": 0.22557464241981506, + "learning_rate": 9.572199695899522e-05, + "loss": 0.0445, + "step": 3520 + }, + { + "epoch": 13.171641791044776, + "grad_norm": 0.25104308128356934, + "learning_rate": 9.568847472317232e-05, + "loss": 0.0435, + "step": 3530 + }, + { + "epoch": 13.208955223880597, + "grad_norm": 0.18720711767673492, + "learning_rate": 9.565482757680415e-05, + "loss": 0.0453, + "step": 3540 + }, + { + "epoch": 13.246268656716419, + "grad_norm": 0.16838951408863068, + "learning_rate": 9.562105561188069e-05, + "loss": 0.0505, + "step": 3550 + }, + { + "epoch": 13.283582089552239, + "grad_norm": 0.31681734323501587, + "learning_rate": 9.558715892073323e-05, + "loss": 0.0494, + "step": 3560 + }, + { + "epoch": 13.32089552238806, + "grad_norm": 0.2390700727701187, + "learning_rate": 9.555313759603402e-05, + "loss": 0.0538, + "step": 3570 + }, + { + "epoch": 13.35820895522388, + "grad_norm": 0.20680709183216095, + "learning_rate": 9.551899173079607e-05, + "loss": 0.0519, + "step": 3580 + }, + { + "epoch": 13.395522388059701, + "grad_norm": 0.2758580148220062, + "learning_rate": 9.548472141837286e-05, + "loss": 0.0512, + "step": 3590 + }, + { + "epoch": 13.432835820895523, + "grad_norm": 0.3653097450733185, + "learning_rate": 9.545032675245813e-05, + "loss": 0.0496, + "step": 3600 + }, + { + "epoch": 13.470149253731343, + "grad_norm": 0.23886866867542267, + "learning_rate": 9.541580782708557e-05, + "loss": 0.0455, + "step": 3610 + }, + { + "epoch": 13.507462686567164, + "grad_norm": 0.3280908465385437, + "learning_rate": 9.538116473662861e-05, + "loss": 0.0489, + "step": 3620 + }, + { + "epoch": 13.544776119402986, + "grad_norm": 0.20268180966377258, + "learning_rate": 9.534639757580013e-05, + "loss": 0.0484, + "step": 3630 + }, + { + "epoch": 13.582089552238806, + "grad_norm": 0.2582015097141266, + "learning_rate": 9.531150643965223e-05, + "loss": 0.0487, + "step": 3640 + }, + { + "epoch": 13.619402985074627, + "grad_norm": 0.18157973885536194, + "learning_rate": 9.527649142357596e-05, + "loss": 0.0496, + "step": 3650 + }, + { + "epoch": 13.656716417910447, + "grad_norm": 0.22841542959213257, + "learning_rate": 9.524135262330098e-05, + "loss": 0.0467, + "step": 3660 + }, + { + "epoch": 13.694029850746269, + "grad_norm": 0.2519935369491577, + "learning_rate": 9.520609013489547e-05, + "loss": 0.0487, + "step": 3670 + }, + { + "epoch": 13.73134328358209, + "grad_norm": 0.24680495262145996, + "learning_rate": 9.517070405476575e-05, + "loss": 0.0457, + "step": 3680 + }, + { + "epoch": 13.76865671641791, + "grad_norm": 0.26362067461013794, + "learning_rate": 9.513519447965595e-05, + "loss": 0.0495, + "step": 3690 + }, + { + "epoch": 13.805970149253731, + "grad_norm": 0.3240712583065033, + "learning_rate": 9.509956150664796e-05, + "loss": 0.0496, + "step": 3700 + }, + { + "epoch": 13.843283582089553, + "grad_norm": 0.21009013056755066, + "learning_rate": 9.50638052331609e-05, + "loss": 0.0457, + "step": 3710 + }, + { + "epoch": 13.880597014925373, + "grad_norm": 0.1669154316186905, + "learning_rate": 9.502792575695112e-05, + "loss": 0.0496, + "step": 3720 + }, + { + "epoch": 13.917910447761194, + "grad_norm": 0.22347605228424072, + "learning_rate": 9.499192317611167e-05, + "loss": 0.0426, + "step": 3730 + }, + { + "epoch": 13.955223880597014, + "grad_norm": 0.15208907425403595, + "learning_rate": 9.49557975890723e-05, + "loss": 0.0447, + "step": 3740 + }, + { + "epoch": 13.992537313432836, + "grad_norm": 0.3206101059913635, + "learning_rate": 9.491954909459895e-05, + "loss": 0.0471, + "step": 3750 + }, + { + "epoch": 14.029850746268657, + "grad_norm": 0.15873713791370392, + "learning_rate": 9.488317779179361e-05, + "loss": 0.0401, + "step": 3760 + }, + { + "epoch": 14.067164179104477, + "grad_norm": 0.19690357148647308, + "learning_rate": 9.484668378009408e-05, + "loss": 0.0491, + "step": 3770 + }, + { + "epoch": 14.104477611940299, + "grad_norm": 0.3211113214492798, + "learning_rate": 9.481006715927351e-05, + "loss": 0.049, + "step": 3780 + }, + { + "epoch": 14.14179104477612, + "grad_norm": 0.27657604217529297, + "learning_rate": 9.477332802944044e-05, + "loss": 0.0396, + "step": 3790 + }, + { + "epoch": 14.17910447761194, + "grad_norm": 0.20194031298160553, + "learning_rate": 9.473646649103818e-05, + "loss": 0.0442, + "step": 3800 + }, + { + "epoch": 14.216417910447761, + "grad_norm": 0.20344595611095428, + "learning_rate": 9.46994826448448e-05, + "loss": 0.0427, + "step": 3810 + }, + { + "epoch": 14.253731343283581, + "grad_norm": 0.2067718505859375, + "learning_rate": 9.46623765919727e-05, + "loss": 0.0501, + "step": 3820 + }, + { + "epoch": 14.291044776119403, + "grad_norm": 0.29719170928001404, + "learning_rate": 9.462514843386845e-05, + "loss": 0.0519, + "step": 3830 + }, + { + "epoch": 14.328358208955224, + "grad_norm": 0.2347182184457779, + "learning_rate": 9.458779827231237e-05, + "loss": 0.0413, + "step": 3840 + }, + { + "epoch": 14.365671641791044, + "grad_norm": 0.1558852344751358, + "learning_rate": 9.45503262094184e-05, + "loss": 0.0442, + "step": 3850 + }, + { + "epoch": 14.402985074626866, + "grad_norm": 0.23085005581378937, + "learning_rate": 9.451273234763371e-05, + "loss": 0.047, + "step": 3860 + }, + { + "epoch": 14.440298507462687, + "grad_norm": 0.1515151560306549, + "learning_rate": 9.447501678973852e-05, + "loss": 0.0481, + "step": 3870 + }, + { + "epoch": 14.477611940298507, + "grad_norm": 0.1916729211807251, + "learning_rate": 9.443717963884569e-05, + "loss": 0.0474, + "step": 3880 + }, + { + "epoch": 14.514925373134329, + "grad_norm": 0.2536492943763733, + "learning_rate": 9.439922099840054e-05, + "loss": 0.0382, + "step": 3890 + }, + { + "epoch": 14.552238805970148, + "grad_norm": 0.1672086864709854, + "learning_rate": 9.43611409721806e-05, + "loss": 0.0497, + "step": 3900 + }, + { + "epoch": 14.58955223880597, + "grad_norm": 0.3644237518310547, + "learning_rate": 9.432293966429514e-05, + "loss": 0.0444, + "step": 3910 + }, + { + "epoch": 14.626865671641792, + "grad_norm": 0.20307251811027527, + "learning_rate": 9.428461717918511e-05, + "loss": 0.0452, + "step": 3920 + }, + { + "epoch": 14.664179104477611, + "grad_norm": 0.20441733300685883, + "learning_rate": 9.424617362162271e-05, + "loss": 0.0454, + "step": 3930 + }, + { + "epoch": 14.701492537313433, + "grad_norm": 0.26315611600875854, + "learning_rate": 9.420760909671118e-05, + "loss": 0.0486, + "step": 3940 + }, + { + "epoch": 14.738805970149254, + "grad_norm": 0.1983092874288559, + "learning_rate": 9.416892370988444e-05, + "loss": 0.0483, + "step": 3950 + }, + { + "epoch": 14.776119402985074, + "grad_norm": 0.18301443755626678, + "learning_rate": 9.413011756690685e-05, + "loss": 0.0456, + "step": 3960 + }, + { + "epoch": 14.813432835820896, + "grad_norm": 0.2433597594499588, + "learning_rate": 9.409119077387294e-05, + "loss": 0.0463, + "step": 3970 + }, + { + "epoch": 14.850746268656717, + "grad_norm": 0.27949392795562744, + "learning_rate": 9.405214343720707e-05, + "loss": 0.0412, + "step": 3980 + }, + { + "epoch": 14.888059701492537, + "grad_norm": 0.22806599736213684, + "learning_rate": 9.401297566366318e-05, + "loss": 0.0448, + "step": 3990 + }, + { + "epoch": 14.925373134328359, + "grad_norm": 0.25421562790870667, + "learning_rate": 9.397368756032445e-05, + "loss": 0.0426, + "step": 4000 + }, + { + "epoch": 14.962686567164178, + "grad_norm": 0.2436474859714508, + "learning_rate": 9.393427923460308e-05, + "loss": 0.0474, + "step": 4010 + }, + { + "epoch": 15.0, + "grad_norm": 0.3756405711174011, + "learning_rate": 9.389475079423988e-05, + "loss": 0.0438, + "step": 4020 + }, + { + "epoch": 15.037313432835822, + "grad_norm": 0.25687697529792786, + "learning_rate": 9.385510234730415e-05, + "loss": 0.0435, + "step": 4030 + }, + { + "epoch": 15.074626865671641, + "grad_norm": 0.17263716459274292, + "learning_rate": 9.381533400219318e-05, + "loss": 0.0455, + "step": 4040 + }, + { + "epoch": 15.111940298507463, + "grad_norm": 0.2471216470003128, + "learning_rate": 9.377544586763215e-05, + "loss": 0.0429, + "step": 4050 + }, + { + "epoch": 15.149253731343283, + "grad_norm": 0.20195460319519043, + "learning_rate": 9.373543805267368e-05, + "loss": 0.0432, + "step": 4060 + }, + { + "epoch": 15.186567164179104, + "grad_norm": 0.1709851622581482, + "learning_rate": 9.369531066669758e-05, + "loss": 0.0477, + "step": 4070 + }, + { + "epoch": 15.223880597014926, + "grad_norm": 0.23063932359218597, + "learning_rate": 9.365506381941066e-05, + "loss": 0.0379, + "step": 4080 + }, + { + "epoch": 15.261194029850746, + "grad_norm": 0.3265426754951477, + "learning_rate": 9.36146976208462e-05, + "loss": 0.0435, + "step": 4090 + }, + { + "epoch": 15.298507462686567, + "grad_norm": 0.26373934745788574, + "learning_rate": 9.357421218136386e-05, + "loss": 0.047, + "step": 4100 + }, + { + "epoch": 15.335820895522389, + "grad_norm": 0.16861388087272644, + "learning_rate": 9.353360761164931e-05, + "loss": 0.0448, + "step": 4110 + }, + { + "epoch": 15.373134328358208, + "grad_norm": 0.303790807723999, + "learning_rate": 9.349288402271388e-05, + "loss": 0.0396, + "step": 4120 + }, + { + "epoch": 15.41044776119403, + "grad_norm": 0.1940719038248062, + "learning_rate": 9.345204152589428e-05, + "loss": 0.0474, + "step": 4130 + }, + { + "epoch": 15.447761194029852, + "grad_norm": 0.34091615676879883, + "learning_rate": 9.341108023285238e-05, + "loss": 0.0424, + "step": 4140 + }, + { + "epoch": 15.485074626865671, + "grad_norm": 0.27036693692207336, + "learning_rate": 9.337000025557476e-05, + "loss": 0.0482, + "step": 4150 + }, + { + "epoch": 15.522388059701493, + "grad_norm": 0.16908007860183716, + "learning_rate": 9.332880170637252e-05, + "loss": 0.0381, + "step": 4160 + }, + { + "epoch": 15.559701492537313, + "grad_norm": 0.23332923650741577, + "learning_rate": 9.328748469788093e-05, + "loss": 0.0427, + "step": 4170 + }, + { + "epoch": 15.597014925373134, + "grad_norm": 0.16899706423282623, + "learning_rate": 9.32460493430591e-05, + "loss": 0.0439, + "step": 4180 + }, + { + "epoch": 15.634328358208956, + "grad_norm": 0.12869524955749512, + "learning_rate": 9.320449575518972e-05, + "loss": 0.0481, + "step": 4190 + }, + { + "epoch": 15.671641791044776, + "grad_norm": 0.21159130334854126, + "learning_rate": 9.316282404787871e-05, + "loss": 0.0446, + "step": 4200 + }, + { + "epoch": 15.708955223880597, + "grad_norm": 0.1849961131811142, + "learning_rate": 9.31210343350549e-05, + "loss": 0.041, + "step": 4210 + }, + { + "epoch": 15.746268656716419, + "grad_norm": 0.16107840836048126, + "learning_rate": 9.30791267309698e-05, + "loss": 0.0429, + "step": 4220 + }, + { + "epoch": 15.783582089552239, + "grad_norm": 0.14206446707248688, + "learning_rate": 9.30371013501972e-05, + "loss": 0.0409, + "step": 4230 + }, + { + "epoch": 15.82089552238806, + "grad_norm": 0.2168441116809845, + "learning_rate": 9.299495830763286e-05, + "loss": 0.0413, + "step": 4240 + }, + { + "epoch": 15.85820895522388, + "grad_norm": 0.21431951224803925, + "learning_rate": 9.295269771849427e-05, + "loss": 0.0472, + "step": 4250 + }, + { + "epoch": 15.895522388059701, + "grad_norm": 0.16851255297660828, + "learning_rate": 9.291031969832026e-05, + "loss": 0.0508, + "step": 4260 + }, + { + "epoch": 15.932835820895523, + "grad_norm": 0.18404732644557953, + "learning_rate": 9.286782436297073e-05, + "loss": 0.0402, + "step": 4270 + }, + { + "epoch": 15.970149253731343, + "grad_norm": 0.21722930669784546, + "learning_rate": 9.282521182862629e-05, + "loss": 0.0397, + "step": 4280 + }, + { + "epoch": 16.007462686567163, + "grad_norm": 0.2523709833621979, + "learning_rate": 9.278248221178798e-05, + "loss": 0.0427, + "step": 4290 + }, + { + "epoch": 16.044776119402986, + "grad_norm": 0.17736563086509705, + "learning_rate": 9.273963562927695e-05, + "loss": 0.0458, + "step": 4300 + }, + { + "epoch": 16.082089552238806, + "grad_norm": 0.20613858103752136, + "learning_rate": 9.269667219823412e-05, + "loss": 0.0387, + "step": 4310 + }, + { + "epoch": 16.119402985074625, + "grad_norm": 0.16557513177394867, + "learning_rate": 9.265359203611987e-05, + "loss": 0.0411, + "step": 4320 + }, + { + "epoch": 16.15671641791045, + "grad_norm": 0.28119519352912903, + "learning_rate": 9.261039526071374e-05, + "loss": 0.0468, + "step": 4330 + }, + { + "epoch": 16.19402985074627, + "grad_norm": 0.21538576483726501, + "learning_rate": 9.256708199011401e-05, + "loss": 0.0368, + "step": 4340 + }, + { + "epoch": 16.23134328358209, + "grad_norm": 0.19657357037067413, + "learning_rate": 9.252365234273755e-05, + "loss": 0.038, + "step": 4350 + }, + { + "epoch": 16.26865671641791, + "grad_norm": 0.19258421659469604, + "learning_rate": 9.248010643731935e-05, + "loss": 0.0414, + "step": 4360 + }, + { + "epoch": 16.30597014925373, + "grad_norm": 0.28801625967025757, + "learning_rate": 9.243644439291223e-05, + "loss": 0.0387, + "step": 4370 + }, + { + "epoch": 16.34328358208955, + "grad_norm": 0.16581468284130096, + "learning_rate": 9.239266632888659e-05, + "loss": 0.0383, + "step": 4380 + }, + { + "epoch": 16.380597014925375, + "grad_norm": 0.34664949774742126, + "learning_rate": 9.234877236492997e-05, + "loss": 0.0453, + "step": 4390 + }, + { + "epoch": 16.417910447761194, + "grad_norm": 0.1439947783946991, + "learning_rate": 9.230476262104677e-05, + "loss": 0.0466, + "step": 4400 + }, + { + "epoch": 16.455223880597014, + "grad_norm": 0.15509940683841705, + "learning_rate": 9.226063721755799e-05, + "loss": 0.0488, + "step": 4410 + }, + { + "epoch": 16.492537313432837, + "grad_norm": 0.18005985021591187, + "learning_rate": 9.221639627510076e-05, + "loss": 0.0407, + "step": 4420 + }, + { + "epoch": 16.529850746268657, + "grad_norm": 0.16012470424175262, + "learning_rate": 9.217203991462815e-05, + "loss": 0.0394, + "step": 4430 + }, + { + "epoch": 16.567164179104477, + "grad_norm": 0.2978847920894623, + "learning_rate": 9.212756825740873e-05, + "loss": 0.0451, + "step": 4440 + }, + { + "epoch": 16.604477611940297, + "grad_norm": 0.2236834019422531, + "learning_rate": 9.208298142502636e-05, + "loss": 0.0487, + "step": 4450 + }, + { + "epoch": 16.64179104477612, + "grad_norm": 0.2686060667037964, + "learning_rate": 9.20382795393797e-05, + "loss": 0.0403, + "step": 4460 + }, + { + "epoch": 16.67910447761194, + "grad_norm": 0.33534038066864014, + "learning_rate": 9.199346272268199e-05, + "loss": 0.0385, + "step": 4470 + }, + { + "epoch": 16.71641791044776, + "grad_norm": 0.19250528514385223, + "learning_rate": 9.194853109746074e-05, + "loss": 0.0441, + "step": 4480 + }, + { + "epoch": 16.753731343283583, + "grad_norm": 0.19218407571315765, + "learning_rate": 9.190348478655724e-05, + "loss": 0.0474, + "step": 4490 + }, + { + "epoch": 16.791044776119403, + "grad_norm": 0.21163488924503326, + "learning_rate": 9.185832391312644e-05, + "loss": 0.0411, + "step": 4500 + }, + { + "epoch": 16.828358208955223, + "grad_norm": 0.1758819818496704, + "learning_rate": 9.18130486006364e-05, + "loss": 0.0462, + "step": 4510 + }, + { + "epoch": 16.865671641791046, + "grad_norm": 0.18571069836616516, + "learning_rate": 9.176765897286813e-05, + "loss": 0.0425, + "step": 4520 + }, + { + "epoch": 16.902985074626866, + "grad_norm": 0.20819155871868134, + "learning_rate": 9.17221551539151e-05, + "loss": 0.0428, + "step": 4530 + }, + { + "epoch": 16.940298507462686, + "grad_norm": 0.30357328057289124, + "learning_rate": 9.167653726818305e-05, + "loss": 0.0414, + "step": 4540 + }, + { + "epoch": 16.97761194029851, + "grad_norm": 0.20977462828159332, + "learning_rate": 9.163080544038952e-05, + "loss": 0.0447, + "step": 4550 + }, + { + "epoch": 17.01492537313433, + "grad_norm": 0.2535971701145172, + "learning_rate": 9.158495979556358e-05, + "loss": 0.0384, + "step": 4560 + }, + { + "epoch": 17.05223880597015, + "grad_norm": 0.2789897620677948, + "learning_rate": 9.153900045904549e-05, + "loss": 0.042, + "step": 4570 + }, + { + "epoch": 17.08955223880597, + "grad_norm": 0.18474848568439484, + "learning_rate": 9.14929275564863e-05, + "loss": 0.0398, + "step": 4580 + }, + { + "epoch": 17.12686567164179, + "grad_norm": 0.12615208327770233, + "learning_rate": 9.144674121384757e-05, + "loss": 0.0466, + "step": 4590 + }, + { + "epoch": 17.16417910447761, + "grad_norm": 0.17756640911102295, + "learning_rate": 9.140044155740101e-05, + "loss": 0.035, + "step": 4600 + }, + { + "epoch": 17.20149253731343, + "grad_norm": 0.24410821497440338, + "learning_rate": 9.135402871372808e-05, + "loss": 0.0459, + "step": 4610 + }, + { + "epoch": 17.238805970149254, + "grad_norm": 0.21573011577129364, + "learning_rate": 9.130750280971978e-05, + "loss": 0.0385, + "step": 4620 + }, + { + "epoch": 17.276119402985074, + "grad_norm": 0.13879653811454773, + "learning_rate": 9.126086397257612e-05, + "loss": 0.0391, + "step": 4630 + }, + { + "epoch": 17.313432835820894, + "grad_norm": 0.17508305609226227, + "learning_rate": 9.121411232980588e-05, + "loss": 0.038, + "step": 4640 + }, + { + "epoch": 17.350746268656717, + "grad_norm": 0.2536008358001709, + "learning_rate": 9.116724800922629e-05, + "loss": 0.0418, + "step": 4650 + }, + { + "epoch": 17.388059701492537, + "grad_norm": 0.1942976713180542, + "learning_rate": 9.112027113896262e-05, + "loss": 0.052, + "step": 4660 + }, + { + "epoch": 17.425373134328357, + "grad_norm": 0.16561119258403778, + "learning_rate": 9.107318184744781e-05, + "loss": 0.0451, + "step": 4670 + }, + { + "epoch": 17.46268656716418, + "grad_norm": 0.22971832752227783, + "learning_rate": 9.102598026342222e-05, + "loss": 0.0407, + "step": 4680 + }, + { + "epoch": 17.5, + "grad_norm": 0.1306753158569336, + "learning_rate": 9.097866651593317e-05, + "loss": 0.042, + "step": 4690 + }, + { + "epoch": 17.53731343283582, + "grad_norm": 0.21278400719165802, + "learning_rate": 9.093124073433463e-05, + "loss": 0.0458, + "step": 4700 + }, + { + "epoch": 17.574626865671643, + "grad_norm": 0.22757171094417572, + "learning_rate": 9.088370304828685e-05, + "loss": 0.0364, + "step": 4710 + }, + { + "epoch": 17.611940298507463, + "grad_norm": 0.216596320271492, + "learning_rate": 9.083605358775612e-05, + "loss": 0.0434, + "step": 4720 + }, + { + "epoch": 17.649253731343283, + "grad_norm": 0.13022471964359283, + "learning_rate": 9.078829248301417e-05, + "loss": 0.0415, + "step": 4730 + }, + { + "epoch": 17.686567164179106, + "grad_norm": 0.2280716598033905, + "learning_rate": 9.074041986463808e-05, + "loss": 0.0385, + "step": 4740 + }, + { + "epoch": 17.723880597014926, + "grad_norm": 0.14666135609149933, + "learning_rate": 9.069243586350975e-05, + "loss": 0.0347, + "step": 4750 + }, + { + "epoch": 17.761194029850746, + "grad_norm": 0.1631281077861786, + "learning_rate": 9.064434061081562e-05, + "loss": 0.0407, + "step": 4760 + }, + { + "epoch": 17.798507462686565, + "grad_norm": 0.18697327375411987, + "learning_rate": 9.059613423804623e-05, + "loss": 0.0425, + "step": 4770 + }, + { + "epoch": 17.83582089552239, + "grad_norm": 0.12955111265182495, + "learning_rate": 9.0547816876996e-05, + "loss": 0.0417, + "step": 4780 + }, + { + "epoch": 17.87313432835821, + "grad_norm": 0.15547148883342743, + "learning_rate": 9.049938865976275e-05, + "loss": 0.0409, + "step": 4790 + }, + { + "epoch": 17.91044776119403, + "grad_norm": 0.1900598704814911, + "learning_rate": 9.045084971874738e-05, + "loss": 0.0369, + "step": 4800 + }, + { + "epoch": 17.94776119402985, + "grad_norm": 0.1846715807914734, + "learning_rate": 9.040220018665347e-05, + "loss": 0.0415, + "step": 4810 + }, + { + "epoch": 17.98507462686567, + "grad_norm": 0.1829937845468521, + "learning_rate": 9.035344019648702e-05, + "loss": 0.0407, + "step": 4820 + }, + { + "epoch": 18.02238805970149, + "grad_norm": 0.25900354981422424, + "learning_rate": 9.030456988155596e-05, + "loss": 0.0398, + "step": 4830 + }, + { + "epoch": 18.059701492537314, + "grad_norm": 0.21235992014408112, + "learning_rate": 9.025558937546988e-05, + "loss": 0.0477, + "step": 4840 + }, + { + "epoch": 18.097014925373134, + "grad_norm": 0.18785078823566437, + "learning_rate": 9.020649881213958e-05, + "loss": 0.039, + "step": 4850 + }, + { + "epoch": 18.134328358208954, + "grad_norm": 0.1951548010110855, + "learning_rate": 9.015729832577681e-05, + "loss": 0.0357, + "step": 4860 + }, + { + "epoch": 18.171641791044777, + "grad_norm": 0.1280934363603592, + "learning_rate": 9.010798805089384e-05, + "loss": 0.0425, + "step": 4870 + }, + { + "epoch": 18.208955223880597, + "grad_norm": 0.1693423092365265, + "learning_rate": 9.005856812230304e-05, + "loss": 0.0447, + "step": 4880 + }, + { + "epoch": 18.246268656716417, + "grad_norm": 0.23712658882141113, + "learning_rate": 9.000903867511666e-05, + "loss": 0.042, + "step": 4890 + }, + { + "epoch": 18.28358208955224, + "grad_norm": 0.26489710807800293, + "learning_rate": 8.995939984474624e-05, + "loss": 0.0457, + "step": 4900 + }, + { + "epoch": 18.32089552238806, + "grad_norm": 0.20792756974697113, + "learning_rate": 8.990965176690252e-05, + "loss": 0.0422, + "step": 4910 + }, + { + "epoch": 18.35820895522388, + "grad_norm": 0.18526089191436768, + "learning_rate": 8.98597945775948e-05, + "loss": 0.0366, + "step": 4920 + }, + { + "epoch": 18.395522388059703, + "grad_norm": 0.2214607298374176, + "learning_rate": 8.980982841313074e-05, + "loss": 0.0405, + "step": 4930 + }, + { + "epoch": 18.432835820895523, + "grad_norm": 0.1896953135728836, + "learning_rate": 8.975975341011596e-05, + "loss": 0.0391, + "step": 4940 + }, + { + "epoch": 18.470149253731343, + "grad_norm": 0.1430232971906662, + "learning_rate": 8.970956970545355e-05, + "loss": 0.0403, + "step": 4950 + }, + { + "epoch": 18.507462686567163, + "grad_norm": 0.1991272121667862, + "learning_rate": 8.965927743634391e-05, + "loss": 0.0429, + "step": 4960 + }, + { + "epoch": 18.544776119402986, + "grad_norm": 0.2361849844455719, + "learning_rate": 8.96088767402841e-05, + "loss": 0.0416, + "step": 4970 + }, + { + "epoch": 18.582089552238806, + "grad_norm": 0.25857019424438477, + "learning_rate": 8.955836775506776e-05, + "loss": 0.0461, + "step": 4980 + }, + { + "epoch": 18.619402985074625, + "grad_norm": 0.12873682379722595, + "learning_rate": 8.950775061878453e-05, + "loss": 0.035, + "step": 4990 + }, + { + "epoch": 18.65671641791045, + "grad_norm": 0.19786769151687622, + "learning_rate": 8.945702546981969e-05, + "loss": 0.0399, + "step": 5000 + }, + { + "epoch": 18.69402985074627, + "grad_norm": 0.2562239170074463, + "learning_rate": 8.940619244685388e-05, + "loss": 0.0372, + "step": 5010 + }, + { + "epoch": 18.73134328358209, + "grad_norm": 0.14586858451366425, + "learning_rate": 8.935525168886262e-05, + "loss": 0.0427, + "step": 5020 + }, + { + "epoch": 18.76865671641791, + "grad_norm": 0.20062318444252014, + "learning_rate": 8.930420333511606e-05, + "loss": 0.0403, + "step": 5030 + }, + { + "epoch": 18.80597014925373, + "grad_norm": 0.22698874771595, + "learning_rate": 8.92530475251784e-05, + "loss": 0.036, + "step": 5040 + }, + { + "epoch": 18.84328358208955, + "grad_norm": 0.2103697657585144, + "learning_rate": 8.920178439890765e-05, + "loss": 0.0431, + "step": 5050 + }, + { + "epoch": 18.880597014925375, + "grad_norm": 0.16042308509349823, + "learning_rate": 8.91504140964553e-05, + "loss": 0.0388, + "step": 5060 + }, + { + "epoch": 18.917910447761194, + "grad_norm": 0.16874109208583832, + "learning_rate": 8.909893675826574e-05, + "loss": 0.0388, + "step": 5070 + }, + { + "epoch": 18.955223880597014, + "grad_norm": 0.15569192171096802, + "learning_rate": 8.90473525250761e-05, + "loss": 0.0353, + "step": 5080 + }, + { + "epoch": 18.992537313432837, + "grad_norm": 0.16723507642745972, + "learning_rate": 8.899566153791566e-05, + "loss": 0.0443, + "step": 5090 + }, + { + "epoch": 19.029850746268657, + "grad_norm": 0.23284228146076202, + "learning_rate": 8.894386393810563e-05, + "loss": 0.05, + "step": 5100 + }, + { + "epoch": 19.067164179104477, + "grad_norm": 0.1621718853712082, + "learning_rate": 8.889195986725865e-05, + "loss": 0.0369, + "step": 5110 + }, + { + "epoch": 19.104477611940297, + "grad_norm": 0.17522747814655304, + "learning_rate": 8.883994946727849e-05, + "loss": 0.0475, + "step": 5120 + }, + { + "epoch": 19.14179104477612, + "grad_norm": 0.16110533475875854, + "learning_rate": 8.878783288035957e-05, + "loss": 0.0383, + "step": 5130 + }, + { + "epoch": 19.17910447761194, + "grad_norm": 0.2574177086353302, + "learning_rate": 8.873561024898668e-05, + "loss": 0.0383, + "step": 5140 + }, + { + "epoch": 19.21641791044776, + "grad_norm": 0.14560100436210632, + "learning_rate": 8.868328171593448e-05, + "loss": 0.037, + "step": 5150 + }, + { + "epoch": 19.253731343283583, + "grad_norm": 0.14456631243228912, + "learning_rate": 8.863084742426719e-05, + "loss": 0.0423, + "step": 5160 + }, + { + "epoch": 19.291044776119403, + "grad_norm": 0.1403595507144928, + "learning_rate": 8.857830751733815e-05, + "loss": 0.0327, + "step": 5170 + }, + { + "epoch": 19.328358208955223, + "grad_norm": 0.18462564051151276, + "learning_rate": 8.852566213878947e-05, + "loss": 0.037, + "step": 5180 + }, + { + "epoch": 19.365671641791046, + "grad_norm": 0.20725117623806, + "learning_rate": 8.84729114325516e-05, + "loss": 0.0376, + "step": 5190 + }, + { + "epoch": 19.402985074626866, + "grad_norm": 0.17023132741451263, + "learning_rate": 8.842005554284296e-05, + "loss": 0.0467, + "step": 5200 + }, + { + "epoch": 19.440298507462686, + "grad_norm": 0.31033241748809814, + "learning_rate": 8.836709461416952e-05, + "loss": 0.0425, + "step": 5210 + }, + { + "epoch": 19.47761194029851, + "grad_norm": 0.14057482779026031, + "learning_rate": 8.831402879132446e-05, + "loss": 0.0432, + "step": 5220 + }, + { + "epoch": 19.51492537313433, + "grad_norm": 0.23247437179088593, + "learning_rate": 8.82608582193877e-05, + "loss": 0.0396, + "step": 5230 + }, + { + "epoch": 19.55223880597015, + "grad_norm": 0.1305907964706421, + "learning_rate": 8.820758304372557e-05, + "loss": 0.0389, + "step": 5240 + }, + { + "epoch": 19.58955223880597, + "grad_norm": 0.17093417048454285, + "learning_rate": 8.815420340999033e-05, + "loss": 0.0347, + "step": 5250 + }, + { + "epoch": 19.62686567164179, + "grad_norm": 0.24105240404605865, + "learning_rate": 8.810071946411989e-05, + "loss": 0.0392, + "step": 5260 + }, + { + "epoch": 19.66417910447761, + "grad_norm": 0.2234315127134323, + "learning_rate": 8.804713135233731e-05, + "loss": 0.0403, + "step": 5270 + }, + { + "epoch": 19.701492537313435, + "grad_norm": 0.16947844624519348, + "learning_rate": 8.799343922115044e-05, + "loss": 0.0368, + "step": 5280 + }, + { + "epoch": 19.738805970149254, + "grad_norm": 0.26133742928504944, + "learning_rate": 8.79396432173515e-05, + "loss": 0.041, + "step": 5290 + }, + { + "epoch": 19.776119402985074, + "grad_norm": 0.2099352777004242, + "learning_rate": 8.788574348801675e-05, + "loss": 0.0363, + "step": 5300 + }, + { + "epoch": 19.813432835820894, + "grad_norm": 0.1662513017654419, + "learning_rate": 8.783174018050594e-05, + "loss": 0.0409, + "step": 5310 + }, + { + "epoch": 19.850746268656717, + "grad_norm": 0.18933714926242828, + "learning_rate": 8.77776334424621e-05, + "loss": 0.0348, + "step": 5320 + }, + { + "epoch": 19.888059701492537, + "grad_norm": 0.21673552691936493, + "learning_rate": 8.772342342181095e-05, + "loss": 0.037, + "step": 5330 + }, + { + "epoch": 19.925373134328357, + "grad_norm": 0.13009892404079437, + "learning_rate": 8.766911026676064e-05, + "loss": 0.0386, + "step": 5340 + }, + { + "epoch": 19.96268656716418, + "grad_norm": 0.1655230075120926, + "learning_rate": 8.761469412580125e-05, + "loss": 0.0404, + "step": 5350 + }, + { + "epoch": 20.0, + "grad_norm": 0.2821272611618042, + "learning_rate": 8.756017514770443e-05, + "loss": 0.0441, + "step": 5360 + }, + { + "epoch": 20.03731343283582, + "grad_norm": 0.1302652508020401, + "learning_rate": 8.750555348152298e-05, + "loss": 0.0389, + "step": 5370 + }, + { + "epoch": 20.074626865671643, + "grad_norm": 0.13331563770771027, + "learning_rate": 8.745082927659047e-05, + "loss": 0.0393, + "step": 5380 + }, + { + "epoch": 20.111940298507463, + "grad_norm": 0.244130939245224, + "learning_rate": 8.739600268252078e-05, + "loss": 0.0372, + "step": 5390 + }, + { + "epoch": 20.149253731343283, + "grad_norm": 0.20429308712482452, + "learning_rate": 8.73410738492077e-05, + "loss": 0.0387, + "step": 5400 + }, + { + "epoch": 20.186567164179106, + "grad_norm": 0.2954719364643097, + "learning_rate": 8.728604292682459e-05, + "loss": 0.0404, + "step": 5410 + }, + { + "epoch": 20.223880597014926, + "grad_norm": 0.20438429713249207, + "learning_rate": 8.723091006582389e-05, + "loss": 0.0359, + "step": 5420 + }, + { + "epoch": 20.261194029850746, + "grad_norm": 0.17289331555366516, + "learning_rate": 8.717567541693673e-05, + "loss": 0.0357, + "step": 5430 + }, + { + "epoch": 20.298507462686565, + "grad_norm": 0.24367138743400574, + "learning_rate": 8.71203391311725e-05, + "loss": 0.0392, + "step": 5440 + }, + { + "epoch": 20.33582089552239, + "grad_norm": 0.21900270879268646, + "learning_rate": 8.706490135981855e-05, + "loss": 0.0419, + "step": 5450 + }, + { + "epoch": 20.37313432835821, + "grad_norm": 0.1526443362236023, + "learning_rate": 8.700936225443959e-05, + "loss": 0.0333, + "step": 5460 + }, + { + "epoch": 20.41044776119403, + "grad_norm": 0.24582353234291077, + "learning_rate": 8.695372196687743e-05, + "loss": 0.0417, + "step": 5470 + }, + { + "epoch": 20.44776119402985, + "grad_norm": 0.21462485194206238, + "learning_rate": 8.689798064925049e-05, + "loss": 0.0347, + "step": 5480 + }, + { + "epoch": 20.48507462686567, + "grad_norm": 0.17611616849899292, + "learning_rate": 8.684213845395339e-05, + "loss": 0.0395, + "step": 5490 + }, + { + "epoch": 20.52238805970149, + "grad_norm": 0.19724012911319733, + "learning_rate": 8.678619553365659e-05, + "loss": 0.0332, + "step": 5500 + }, + { + "epoch": 20.559701492537314, + "grad_norm": 0.2080456167459488, + "learning_rate": 8.673015204130586e-05, + "loss": 0.0361, + "step": 5510 + }, + { + "epoch": 20.597014925373134, + "grad_norm": 0.21469220519065857, + "learning_rate": 8.6674008130122e-05, + "loss": 0.039, + "step": 5520 + }, + { + "epoch": 20.634328358208954, + "grad_norm": 0.242497980594635, + "learning_rate": 8.661776395360029e-05, + "loss": 0.0397, + "step": 5530 + }, + { + "epoch": 20.671641791044777, + "grad_norm": 0.20539864897727966, + "learning_rate": 8.656141966551019e-05, + "loss": 0.0392, + "step": 5540 + }, + { + "epoch": 20.708955223880597, + "grad_norm": 0.21964021027088165, + "learning_rate": 8.650497541989482e-05, + "loss": 0.035, + "step": 5550 + }, + { + "epoch": 20.746268656716417, + "grad_norm": 0.15793637931346893, + "learning_rate": 8.644843137107059e-05, + "loss": 0.0363, + "step": 5560 + }, + { + "epoch": 20.78358208955224, + "grad_norm": 0.1731041818857193, + "learning_rate": 8.639178767362676e-05, + "loss": 0.0371, + "step": 5570 + }, + { + "epoch": 20.82089552238806, + "grad_norm": 0.15019342303276062, + "learning_rate": 8.633504448242505e-05, + "loss": 0.0335, + "step": 5580 + }, + { + "epoch": 20.85820895522388, + "grad_norm": 0.1397496908903122, + "learning_rate": 8.627820195259918e-05, + "loss": 0.0391, + "step": 5590 + }, + { + "epoch": 20.895522388059703, + "grad_norm": 0.141131192445755, + "learning_rate": 8.622126023955446e-05, + "loss": 0.041, + "step": 5600 + }, + { + "epoch": 20.932835820895523, + "grad_norm": 0.20025403797626495, + "learning_rate": 8.616421949896734e-05, + "loss": 0.0412, + "step": 5610 + }, + { + "epoch": 20.970149253731343, + "grad_norm": 0.2251378893852234, + "learning_rate": 8.610707988678503e-05, + "loss": 0.037, + "step": 5620 + }, + { + "epoch": 21.007462686567163, + "grad_norm": 0.1341109722852707, + "learning_rate": 8.604984155922506e-05, + "loss": 0.0371, + "step": 5630 + }, + { + "epoch": 21.044776119402986, + "grad_norm": 0.28053462505340576, + "learning_rate": 8.599250467277483e-05, + "loss": 0.0366, + "step": 5640 + }, + { + "epoch": 21.082089552238806, + "grad_norm": 0.10567930340766907, + "learning_rate": 8.59350693841912e-05, + "loss": 0.0394, + "step": 5650 + }, + { + "epoch": 21.119402985074625, + "grad_norm": 0.17919886112213135, + "learning_rate": 8.587753585050004e-05, + "loss": 0.0357, + "step": 5660 + }, + { + "epoch": 21.15671641791045, + "grad_norm": 0.3223204016685486, + "learning_rate": 8.581990422899585e-05, + "loss": 0.0369, + "step": 5670 + }, + { + "epoch": 21.19402985074627, + "grad_norm": 0.20072297751903534, + "learning_rate": 8.576217467724128e-05, + "loss": 0.0389, + "step": 5680 + }, + { + "epoch": 21.23134328358209, + "grad_norm": 0.1556226760149002, + "learning_rate": 8.570434735306671e-05, + "loss": 0.035, + "step": 5690 + }, + { + "epoch": 21.26865671641791, + "grad_norm": 0.20265886187553406, + "learning_rate": 8.564642241456986e-05, + "loss": 0.0418, + "step": 5700 + }, + { + "epoch": 21.30597014925373, + "grad_norm": 0.15518955886363983, + "learning_rate": 8.558840002011528e-05, + "loss": 0.0331, + "step": 5710 + }, + { + "epoch": 21.34328358208955, + "grad_norm": 0.1822584569454193, + "learning_rate": 8.553028032833397e-05, + "loss": 0.0421, + "step": 5720 + }, + { + "epoch": 21.380597014925375, + "grad_norm": 0.14216330647468567, + "learning_rate": 8.547206349812298e-05, + "loss": 0.0413, + "step": 5730 + }, + { + "epoch": 21.417910447761194, + "grad_norm": 0.24156329035758972, + "learning_rate": 8.541374968864487e-05, + "loss": 0.0404, + "step": 5740 + }, + { + "epoch": 21.455223880597014, + "grad_norm": 0.2753167748451233, + "learning_rate": 8.535533905932738e-05, + "loss": 0.0369, + "step": 5750 + }, + { + "epoch": 21.492537313432837, + "grad_norm": 0.17052626609802246, + "learning_rate": 8.529683176986295e-05, + "loss": 0.0328, + "step": 5760 + }, + { + "epoch": 21.529850746268657, + "grad_norm": 0.11597824096679688, + "learning_rate": 8.523822798020827e-05, + "loss": 0.041, + "step": 5770 + }, + { + "epoch": 21.567164179104477, + "grad_norm": 0.14363346993923187, + "learning_rate": 8.517952785058385e-05, + "loss": 0.0393, + "step": 5780 + }, + { + "epoch": 21.604477611940297, + "grad_norm": 0.19373776018619537, + "learning_rate": 8.512073154147362e-05, + "loss": 0.0372, + "step": 5790 + }, + { + "epoch": 21.64179104477612, + "grad_norm": 0.20276981592178345, + "learning_rate": 8.506183921362443e-05, + "loss": 0.0389, + "step": 5800 + }, + { + "epoch": 21.67910447761194, + "grad_norm": 0.19267870485782623, + "learning_rate": 8.500285102804568e-05, + "loss": 0.0371, + "step": 5810 + }, + { + "epoch": 21.71641791044776, + "grad_norm": 0.2701839208602905, + "learning_rate": 8.494376714600878e-05, + "loss": 0.0333, + "step": 5820 + }, + { + "epoch": 21.753731343283583, + "grad_norm": 0.20612668991088867, + "learning_rate": 8.488458772904684e-05, + "loss": 0.0358, + "step": 5830 + }, + { + "epoch": 21.791044776119403, + "grad_norm": 0.18102902173995972, + "learning_rate": 8.482531293895412e-05, + "loss": 0.0376, + "step": 5840 + }, + { + "epoch": 21.828358208955223, + "grad_norm": 0.23202018439769745, + "learning_rate": 8.476594293778561e-05, + "loss": 0.0418, + "step": 5850 + }, + { + "epoch": 21.865671641791046, + "grad_norm": 0.09540139883756638, + "learning_rate": 8.470647788785665e-05, + "loss": 0.041, + "step": 5860 + }, + { + "epoch": 21.902985074626866, + "grad_norm": 0.23362809419631958, + "learning_rate": 8.46469179517424e-05, + "loss": 0.0402, + "step": 5870 + }, + { + "epoch": 21.940298507462686, + "grad_norm": 0.20929335057735443, + "learning_rate": 8.458726329227747e-05, + "loss": 0.0385, + "step": 5880 + }, + { + "epoch": 21.97761194029851, + "grad_norm": 0.18403425812721252, + "learning_rate": 8.452751407255541e-05, + "loss": 0.0399, + "step": 5890 + }, + { + "epoch": 22.01492537313433, + "grad_norm": 0.2034774273633957, + "learning_rate": 8.44676704559283e-05, + "loss": 0.0361, + "step": 5900 + }, + { + "epoch": 22.05223880597015, + "grad_norm": 0.14981597661972046, + "learning_rate": 8.44077326060063e-05, + "loss": 0.0393, + "step": 5910 + }, + { + "epoch": 22.08955223880597, + "grad_norm": 0.20903146266937256, + "learning_rate": 8.434770068665723e-05, + "loss": 0.0406, + "step": 5920 + }, + { + "epoch": 22.12686567164179, + "grad_norm": 0.12090307474136353, + "learning_rate": 8.428757486200603e-05, + "loss": 0.0349, + "step": 5930 + }, + { + "epoch": 22.16417910447761, + "grad_norm": 0.14085660874843597, + "learning_rate": 8.422735529643444e-05, + "loss": 0.0344, + "step": 5940 + }, + { + "epoch": 22.20149253731343, + "grad_norm": 0.30808404088020325, + "learning_rate": 8.416704215458043e-05, + "loss": 0.0298, + "step": 5950 + }, + { + "epoch": 22.238805970149254, + "grad_norm": 0.17409317195415497, + "learning_rate": 8.410663560133784e-05, + "loss": 0.035, + "step": 5960 + }, + { + "epoch": 22.276119402985074, + "grad_norm": 0.18731828033924103, + "learning_rate": 8.404613580185585e-05, + "loss": 0.0322, + "step": 5970 + }, + { + "epoch": 22.313432835820894, + "grad_norm": 0.16483667492866516, + "learning_rate": 8.398554292153866e-05, + "loss": 0.033, + "step": 5980 + }, + { + "epoch": 22.350746268656717, + "grad_norm": 0.195018008351326, + "learning_rate": 8.392485712604483e-05, + "loss": 0.0344, + "step": 5990 + }, + { + "epoch": 22.388059701492537, + "grad_norm": 0.18210549652576447, + "learning_rate": 8.386407858128706e-05, + "loss": 0.0387, + "step": 6000 + }, + { + "epoch": 22.425373134328357, + "grad_norm": 0.18658341467380524, + "learning_rate": 8.380320745343153e-05, + "loss": 0.0359, + "step": 6010 + }, + { + "epoch": 22.46268656716418, + "grad_norm": 0.260953426361084, + "learning_rate": 8.37422439088976e-05, + "loss": 0.0291, + "step": 6020 + }, + { + "epoch": 22.5, + "grad_norm": 0.2177930772304535, + "learning_rate": 8.368118811435726e-05, + "loss": 0.0384, + "step": 6030 + }, + { + "epoch": 22.53731343283582, + "grad_norm": 0.1596938520669937, + "learning_rate": 8.362004023673474e-05, + "loss": 0.0372, + "step": 6040 + }, + { + "epoch": 22.574626865671643, + "grad_norm": 0.21605637669563293, + "learning_rate": 8.355880044320598e-05, + "loss": 0.0304, + "step": 6050 + }, + { + "epoch": 22.611940298507463, + "grad_norm": 0.13812203705310822, + "learning_rate": 8.349746890119826e-05, + "loss": 0.0295, + "step": 6060 + }, + { + "epoch": 22.649253731343283, + "grad_norm": 0.22850565612316132, + "learning_rate": 8.343604577838964e-05, + "loss": 0.0385, + "step": 6070 + }, + { + "epoch": 22.686567164179106, + "grad_norm": 0.22924698889255524, + "learning_rate": 8.337453124270863e-05, + "loss": 0.0438, + "step": 6080 + }, + { + "epoch": 22.723880597014926, + "grad_norm": 0.1455918848514557, + "learning_rate": 8.331292546233362e-05, + "loss": 0.0358, + "step": 6090 + }, + { + "epoch": 22.761194029850746, + "grad_norm": 0.1839921921491623, + "learning_rate": 8.32512286056924e-05, + "loss": 0.0349, + "step": 6100 + }, + { + "epoch": 22.798507462686565, + "grad_norm": 0.24356882274150848, + "learning_rate": 8.318944084146192e-05, + "loss": 0.0335, + "step": 6110 + }, + { + "epoch": 22.83582089552239, + "grad_norm": 0.2336840182542801, + "learning_rate": 8.31275623385675e-05, + "loss": 0.0339, + "step": 6120 + }, + { + "epoch": 22.87313432835821, + "grad_norm": 0.17839699983596802, + "learning_rate": 8.306559326618259e-05, + "loss": 0.0365, + "step": 6130 + }, + { + "epoch": 22.91044776119403, + "grad_norm": 0.18088172376155853, + "learning_rate": 8.300353379372834e-05, + "loss": 0.0331, + "step": 6140 + }, + { + "epoch": 22.94776119402985, + "grad_norm": 0.1771453320980072, + "learning_rate": 8.29413840908729e-05, + "loss": 0.0321, + "step": 6150 + }, + { + "epoch": 22.98507462686567, + "grad_norm": 0.1374535709619522, + "learning_rate": 8.287914432753123e-05, + "loss": 0.0328, + "step": 6160 + }, + { + "epoch": 23.02238805970149, + "grad_norm": 0.17898012697696686, + "learning_rate": 8.281681467386446e-05, + "loss": 0.0408, + "step": 6170 + }, + { + "epoch": 23.059701492537314, + "grad_norm": 0.21729676425457, + "learning_rate": 8.275439530027948e-05, + "loss": 0.0354, + "step": 6180 + }, + { + "epoch": 23.097014925373134, + "grad_norm": 0.2473490685224533, + "learning_rate": 8.269188637742846e-05, + "loss": 0.0361, + "step": 6190 + }, + { + "epoch": 23.134328358208954, + "grad_norm": 0.15661069750785828, + "learning_rate": 8.262928807620843e-05, + "loss": 0.036, + "step": 6200 + }, + { + "epoch": 23.171641791044777, + "grad_norm": 0.12378236651420593, + "learning_rate": 8.256660056776076e-05, + "loss": 0.0308, + "step": 6210 + }, + { + "epoch": 23.208955223880597, + "grad_norm": 0.1373433768749237, + "learning_rate": 8.250382402347065e-05, + "loss": 0.0344, + "step": 6220 + }, + { + "epoch": 23.246268656716417, + "grad_norm": 0.14814983308315277, + "learning_rate": 8.244095861496686e-05, + "loss": 0.0368, + "step": 6230 + }, + { + "epoch": 23.28358208955224, + "grad_norm": 0.15903662145137787, + "learning_rate": 8.237800451412095e-05, + "loss": 0.033, + "step": 6240 + }, + { + "epoch": 23.32089552238806, + "grad_norm": 0.1676921397447586, + "learning_rate": 8.231496189304704e-05, + "loss": 0.0361, + "step": 6250 + }, + { + "epoch": 23.35820895522388, + "grad_norm": 0.2496129870414734, + "learning_rate": 8.225183092410128e-05, + "loss": 0.037, + "step": 6260 + }, + { + "epoch": 23.395522388059703, + "grad_norm": 0.1830875128507614, + "learning_rate": 8.218861177988129e-05, + "loss": 0.0377, + "step": 6270 + }, + { + "epoch": 23.432835820895523, + "grad_norm": 0.18538393080234528, + "learning_rate": 8.212530463322583e-05, + "loss": 0.0343, + "step": 6280 + }, + { + "epoch": 23.470149253731343, + "grad_norm": 0.23813718557357788, + "learning_rate": 8.206190965721419e-05, + "loss": 0.0336, + "step": 6290 + }, + { + "epoch": 23.507462686567163, + "grad_norm": 0.14053800702095032, + "learning_rate": 8.199842702516583e-05, + "loss": 0.0334, + "step": 6300 + }, + { + "epoch": 23.544776119402986, + "grad_norm": 0.19115787744522095, + "learning_rate": 8.193485691063985e-05, + "loss": 0.0338, + "step": 6310 + }, + { + "epoch": 23.582089552238806, + "grad_norm": 0.1176459789276123, + "learning_rate": 8.18711994874345e-05, + "loss": 0.0324, + "step": 6320 + }, + { + "epoch": 23.619402985074625, + "grad_norm": 0.13881400227546692, + "learning_rate": 8.180745492958674e-05, + "loss": 0.0375, + "step": 6330 + }, + { + "epoch": 23.65671641791045, + "grad_norm": 0.12102743983268738, + "learning_rate": 8.174362341137177e-05, + "loss": 0.0338, + "step": 6340 + }, + { + "epoch": 23.69402985074627, + "grad_norm": 0.16610436141490936, + "learning_rate": 8.167970510730253e-05, + "loss": 0.0296, + "step": 6350 + }, + { + "epoch": 23.73134328358209, + "grad_norm": 0.12234822660684586, + "learning_rate": 8.161570019212921e-05, + "loss": 0.029, + "step": 6360 + }, + { + "epoch": 23.76865671641791, + "grad_norm": 0.17056342959403992, + "learning_rate": 8.155160884083881e-05, + "loss": 0.0381, + "step": 6370 + }, + { + "epoch": 23.80597014925373, + "grad_norm": 0.1477614790201187, + "learning_rate": 8.148743122865463e-05, + "loss": 0.0315, + "step": 6380 + }, + { + "epoch": 23.84328358208955, + "grad_norm": 0.38320279121398926, + "learning_rate": 8.14231675310358e-05, + "loss": 0.0366, + "step": 6390 + }, + { + "epoch": 23.880597014925375, + "grad_norm": 0.1497313380241394, + "learning_rate": 8.135881792367686e-05, + "loss": 0.0325, + "step": 6400 + }, + { + "epoch": 23.917910447761194, + "grad_norm": 0.1574944257736206, + "learning_rate": 8.129438258250712e-05, + "loss": 0.0372, + "step": 6410 + }, + { + "epoch": 23.955223880597014, + "grad_norm": 0.17678116261959076, + "learning_rate": 8.12298616836904e-05, + "loss": 0.034, + "step": 6420 + }, + { + "epoch": 23.992537313432837, + "grad_norm": 0.13617518544197083, + "learning_rate": 8.116525540362434e-05, + "loss": 0.032, + "step": 6430 + }, + { + "epoch": 24.029850746268657, + "grad_norm": 0.1610628217458725, + "learning_rate": 8.110056391894005e-05, + "loss": 0.0295, + "step": 6440 + }, + { + "epoch": 24.067164179104477, + "grad_norm": 0.24379907548427582, + "learning_rate": 8.103578740650156e-05, + "loss": 0.0318, + "step": 6450 + }, + { + "epoch": 24.104477611940297, + "grad_norm": 0.15908868610858917, + "learning_rate": 8.097092604340542e-05, + "loss": 0.0285, + "step": 6460 + }, + { + "epoch": 24.14179104477612, + "grad_norm": 0.17211472988128662, + "learning_rate": 8.090598000698009e-05, + "loss": 0.0345, + "step": 6470 + }, + { + "epoch": 24.17910447761194, + "grad_norm": 0.10870133340358734, + "learning_rate": 8.084094947478556e-05, + "loss": 0.0349, + "step": 6480 + }, + { + "epoch": 24.21641791044776, + "grad_norm": 0.1614072173833847, + "learning_rate": 8.077583462461283e-05, + "loss": 0.0305, + "step": 6490 + }, + { + "epoch": 24.253731343283583, + "grad_norm": 0.1449541449546814, + "learning_rate": 8.07106356344834e-05, + "loss": 0.0326, + "step": 6500 + }, + { + "epoch": 24.291044776119403, + "grad_norm": 0.15968690812587738, + "learning_rate": 8.064535268264883e-05, + "loss": 0.0379, + "step": 6510 + }, + { + "epoch": 24.328358208955223, + "grad_norm": 0.2027505785226822, + "learning_rate": 8.057998594759022e-05, + "loss": 0.0368, + "step": 6520 + }, + { + "epoch": 24.365671641791046, + "grad_norm": 0.18664468824863434, + "learning_rate": 8.051453560801772e-05, + "loss": 0.041, + "step": 6530 + }, + { + "epoch": 24.402985074626866, + "grad_norm": 0.2137981504201889, + "learning_rate": 8.044900184287007e-05, + "loss": 0.036, + "step": 6540 + }, + { + "epoch": 24.440298507462686, + "grad_norm": 0.1381145715713501, + "learning_rate": 8.038338483131407e-05, + "loss": 0.0342, + "step": 6550 + }, + { + "epoch": 24.47761194029851, + "grad_norm": 0.2125469446182251, + "learning_rate": 8.031768475274413e-05, + "loss": 0.0363, + "step": 6560 + }, + { + "epoch": 24.51492537313433, + "grad_norm": 0.1482478678226471, + "learning_rate": 8.025190178678175e-05, + "loss": 0.0359, + "step": 6570 + }, + { + "epoch": 24.55223880597015, + "grad_norm": 0.17988649010658264, + "learning_rate": 8.018603611327504e-05, + "loss": 0.0388, + "step": 6580 + }, + { + "epoch": 24.58955223880597, + "grad_norm": 0.1568310409784317, + "learning_rate": 8.012008791229826e-05, + "loss": 0.0357, + "step": 6590 + }, + { + "epoch": 24.62686567164179, + "grad_norm": 0.17348839342594147, + "learning_rate": 8.005405736415126e-05, + "loss": 0.0348, + "step": 6600 + }, + { + "epoch": 24.66417910447761, + "grad_norm": 0.18807284533977509, + "learning_rate": 7.998794464935904e-05, + "loss": 0.0371, + "step": 6610 + }, + { + "epoch": 24.701492537313435, + "grad_norm": 0.12133855372667313, + "learning_rate": 7.992174994867123e-05, + "loss": 0.0376, + "step": 6620 + }, + { + "epoch": 24.738805970149254, + "grad_norm": 0.2808085083961487, + "learning_rate": 7.985547344306161e-05, + "loss": 0.0346, + "step": 6630 + }, + { + "epoch": 24.776119402985074, + "grad_norm": 0.13642264902591705, + "learning_rate": 7.978911531372765e-05, + "loss": 0.0365, + "step": 6640 + }, + { + "epoch": 24.813432835820894, + "grad_norm": 0.19014127552509308, + "learning_rate": 7.972267574208991e-05, + "loss": 0.0344, + "step": 6650 + }, + { + "epoch": 24.850746268656717, + "grad_norm": 0.16038668155670166, + "learning_rate": 7.965615490979163e-05, + "loss": 0.0339, + "step": 6660 + }, + { + "epoch": 24.888059701492537, + "grad_norm": 0.17937994003295898, + "learning_rate": 7.958955299869825e-05, + "loss": 0.0294, + "step": 6670 + }, + { + "epoch": 24.925373134328357, + "grad_norm": 0.19632326066493988, + "learning_rate": 7.952287019089685e-05, + "loss": 0.0365, + "step": 6680 + }, + { + "epoch": 24.96268656716418, + "grad_norm": 0.14519083499908447, + "learning_rate": 7.945610666869568e-05, + "loss": 0.0307, + "step": 6690 + }, + { + "epoch": 25.0, + "grad_norm": 0.17961327731609344, + "learning_rate": 7.938926261462366e-05, + "loss": 0.0348, + "step": 6700 + }, + { + "epoch": 25.03731343283582, + "grad_norm": 0.1272597312927246, + "learning_rate": 7.932233821142987e-05, + "loss": 0.0296, + "step": 6710 + }, + { + "epoch": 25.074626865671643, + "grad_norm": 0.21824714541435242, + "learning_rate": 7.925533364208309e-05, + "loss": 0.0333, + "step": 6720 + }, + { + "epoch": 25.111940298507463, + "grad_norm": 0.12162213027477264, + "learning_rate": 7.918824908977123e-05, + "loss": 0.0314, + "step": 6730 + }, + { + "epoch": 25.149253731343283, + "grad_norm": 0.17663027346134186, + "learning_rate": 7.912108473790092e-05, + "loss": 0.0395, + "step": 6740 + }, + { + "epoch": 25.186567164179106, + "grad_norm": 0.1496419459581375, + "learning_rate": 7.905384077009693e-05, + "loss": 0.0377, + "step": 6750 + }, + { + "epoch": 25.223880597014926, + "grad_norm": 0.15282033383846283, + "learning_rate": 7.898651737020166e-05, + "loss": 0.0308, + "step": 6760 + }, + { + "epoch": 25.261194029850746, + "grad_norm": 0.1586643010377884, + "learning_rate": 7.891911472227478e-05, + "loss": 0.031, + "step": 6770 + }, + { + "epoch": 25.298507462686565, + "grad_norm": 0.15809810161590576, + "learning_rate": 7.88516330105925e-05, + "loss": 0.0304, + "step": 6780 + }, + { + "epoch": 25.33582089552239, + "grad_norm": 0.19378156960010529, + "learning_rate": 7.878407241964729e-05, + "loss": 0.0336, + "step": 6790 + }, + { + "epoch": 25.37313432835821, + "grad_norm": 0.17140574753284454, + "learning_rate": 7.871643313414718e-05, + "loss": 0.0301, + "step": 6800 + }, + { + "epoch": 25.41044776119403, + "grad_norm": 0.1791999191045761, + "learning_rate": 7.864871533901544e-05, + "loss": 0.0381, + "step": 6810 + }, + { + "epoch": 25.44776119402985, + "grad_norm": 0.13285186886787415, + "learning_rate": 7.858091921938988e-05, + "loss": 0.0301, + "step": 6820 + }, + { + "epoch": 25.48507462686567, + "grad_norm": 0.1444288194179535, + "learning_rate": 7.851304496062254e-05, + "loss": 0.0257, + "step": 6830 + }, + { + "epoch": 25.52238805970149, + "grad_norm": 0.17137834429740906, + "learning_rate": 7.844509274827907e-05, + "loss": 0.0272, + "step": 6840 + }, + { + "epoch": 25.559701492537314, + "grad_norm": 0.17752587795257568, + "learning_rate": 7.837706276813819e-05, + "loss": 0.0343, + "step": 6850 + }, + { + "epoch": 25.597014925373134, + "grad_norm": 0.1934349089860916, + "learning_rate": 7.830895520619128e-05, + "loss": 0.0289, + "step": 6860 + }, + { + "epoch": 25.634328358208954, + "grad_norm": 0.21587027609348297, + "learning_rate": 7.824077024864179e-05, + "loss": 0.0349, + "step": 6870 + }, + { + "epoch": 25.671641791044777, + "grad_norm": 0.15302105247974396, + "learning_rate": 7.817250808190483e-05, + "loss": 0.0346, + "step": 6880 + }, + { + "epoch": 25.708955223880597, + "grad_norm": 0.15441982448101044, + "learning_rate": 7.810416889260653e-05, + "loss": 0.0403, + "step": 6890 + }, + { + "epoch": 25.746268656716417, + "grad_norm": 0.11743316054344177, + "learning_rate": 7.803575286758364e-05, + "loss": 0.0329, + "step": 6900 + }, + { + "epoch": 25.78358208955224, + "grad_norm": 0.1417740434408188, + "learning_rate": 7.796726019388295e-05, + "loss": 0.0346, + "step": 6910 + }, + { + "epoch": 25.82089552238806, + "grad_norm": 0.14196589589118958, + "learning_rate": 7.789869105876083e-05, + "loss": 0.0333, + "step": 6920 + }, + { + "epoch": 25.85820895522388, + "grad_norm": 0.17111234366893768, + "learning_rate": 7.783004564968263e-05, + "loss": 0.0403, + "step": 6930 + }, + { + "epoch": 25.895522388059703, + "grad_norm": 0.159880131483078, + "learning_rate": 7.776132415432234e-05, + "loss": 0.0281, + "step": 6940 + }, + { + "epoch": 25.932835820895523, + "grad_norm": 0.1706574410200119, + "learning_rate": 7.769252676056187e-05, + "loss": 0.0327, + "step": 6950 + }, + { + "epoch": 25.970149253731343, + "grad_norm": 0.20553110539913177, + "learning_rate": 7.762365365649067e-05, + "loss": 0.0345, + "step": 6960 + }, + { + "epoch": 26.007462686567163, + "grad_norm": 0.17439968883991241, + "learning_rate": 7.755470503040516e-05, + "loss": 0.0338, + "step": 6970 + }, + { + "epoch": 26.044776119402986, + "grad_norm": 0.1736845076084137, + "learning_rate": 7.748568107080832e-05, + "loss": 0.0309, + "step": 6980 + }, + { + "epoch": 26.082089552238806, + "grad_norm": 0.15822389721870422, + "learning_rate": 7.741658196640892e-05, + "loss": 0.0338, + "step": 6990 + }, + { + "epoch": 26.119402985074625, + "grad_norm": 0.24268855154514313, + "learning_rate": 7.734740790612136e-05, + "loss": 0.0356, + "step": 7000 + } + ], + "logging_steps": 10, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 75, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.2619992365540096e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-7500/README.md b/checkpoint-7500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c7b89968043c4a4cf38dcac1f9bc557c35da3883 --- /dev/null +++ b/checkpoint-7500/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/ibru/.cache/huggingface/hub/models--nvidia--GR00T-N1-2B/snapshots/32e1fd2507f7739fad443e6b449c8188e0e02fcb +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-7500/adapter_config.json b/checkpoint-7500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8f45e5c825b3b34b334d049ddf8e68e52a500cc6 --- /dev/null +++ b/checkpoint-7500/adapter_config.json @@ -0,0 +1,36 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/ibru/.cache/huggingface/hub/models--nvidia--GR00T-N1-2B/snapshots/32e1fd2507f7739fad443e6b449c8188e0e02fcb", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "to_k", + "to_q", + "v_proj", + "to_v" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-7500/adapter_model.safetensors b/checkpoint-7500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0db4212d70ce6339925102bd70c10f0b7a6dbd07 --- /dev/null +++ b/checkpoint-7500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55c8f98436d616b4327663678179776a89992c8603528338a991607c7334dd5c +size 123328576 diff --git a/checkpoint-7500/experiment_cfg/metadata.json b/checkpoint-7500/experiment_cfg/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..40302046074c7e429ab3933ad6b163f9735902de --- /dev/null +++ b/checkpoint-7500/experiment_cfg/metadata.json @@ -0,0 +1,275 @@ +{ + "new_embodiment": { + "statistics": { + "state": { + "single_arm": { + "max": [ + 47.021484375, + 135.263671875, + 178.505859375, + 78.3984375, + 56.77734375 + ], + "min": [ + -25.576171875, + 46.93359375, + 89.736328125, + -30.41015625, + -77.607421875 + ], + "mean": [ + 7.780572414398193, + 121.54933166503906, + 145.44825744628906, + 26.051393508911133, + -12.748016357421875 + ], + "std": [ + 11.060831069946289, + 21.937597274780273, + 17.16187286376953, + 19.231945037841797, + 14.66512680053711 + ], + "q01": [ + -17.578125, + 58.0078125, + 97.998046875, + -13.447265625, + -39.9005859375 + ], + "q99": [ + 36.650390625, + 134.47265625, + 178.41796875, + 66.65009765625, + 40.166015625 + ] + }, + "gripper": { + "max": [ + 52.22222137451172 + ], + "min": [ + -3.846153974533081 + ], + "mean": [ + 10.933439254760742 + ], + "std": [ + 15.509913444519043 + ], + "q01": [ + -3.846153974533081 + ], + "q99": [ + 51.02564239501953 + ] + }, + "mobile_base": { + "max": [ + 75.42072296142578, + 276.7638244628906, + 93.75 + ], + "min": [ + -170.01620483398438, + -274.5497131347656, + -93.75 + ], + "mean": [ + -0.31241804361343384, + 58.99717712402344, + 2.4293017387390137 + ], + "std": [ + 10.56183910369873, + 119.39802551269531, + 22.590484619140625 + ], + "q01": [ + -33.65809627532959, + -265.6932678222656, + -72.849609375 + ], + "q99": [ + 30.679615020751953, + 270.1214904785156, + 90.234375 + ] + } + }, + "action": { + "single_arm": { + "max": [ + 37.96875, + 135.087890625, + 179.384765625, + 78.837890625, + 57.392578125 + ], + "min": [ + -26.279296875, + 47.373046875, + 89.912109375, + -31.640625, + -77.16796875 + ], + "mean": [ + 8.038639068603516, + 122.76031494140625, + 145.15855407714844, + 26.28432846069336, + -13.195321083068848 + ], + "std": [ + 11.36032772064209, + 21.925451278686523, + 17.071842193603516, + 19.503877639770508, + 14.882487297058105 + ], + "q01": [ + -18.10546875, + 58.623046875, + 98.26171875, + -14.326171875, + -40.078125 + ], + "q99": [ + 37.44140625, + 135.087890625, + 179.296875, + 67.1484375, + 40.869140625 + ] + }, + "gripper": { + "max": [ + 52.646484375 + ], + "min": [ + -10.72265625 + ], + "mean": [ + 4.366570949554443 + ], + "std": [ + 18.90865707397461 + ], + "q01": [ + -10.546875 + ], + "q99": [ + 51.767578125 + ] + }, + "mobile_base": { + "max": [ + 230.0971221923828, + 265.6932678222656, + 90.0 + ], + "min": [ + -230.0971221923828, + -265.6932678222656, + -90.0 + ], + "mean": [ + -0.36507830023765564, + 60.13115310668945, + 2.5394127368927 + ], + "std": [ + 15.02155590057373, + 129.06507873535156, + 27.82071304321289 + ], + "q01": [ + -0.02556634694337845, + -265.6932678222656, + -90.0 + ], + "q99": [ + 0.02556634694337845, + 265.6932678222656, + 90.0 + ] + } + } + }, + "modalities": { + "video": { + "wrist": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + }, + "front": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + } + }, + "state": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + }, + "mobile_base": { + "absolute": true, + "rotation_type": null, + "shape": [ + 3 + ], + "continuous": true + } + }, + "action": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + }, + "mobile_base": { + "absolute": true, + "rotation_type": null, + "shape": [ + 3 + ], + "continuous": true + } + } + }, + "embodiment_tag": "new_embodiment" + } +} \ No newline at end of file diff --git a/checkpoint-7500/nohup.out b/checkpoint-7500/nohup.out new file mode 100644 index 0000000000000000000000000000000000000000..8d669a6fe8d9e4d83c2d1ee8b8b7da9017f941ce --- /dev/null +++ b/checkpoint-7500/nohup.out @@ -0,0 +1 @@ +python: can't open file '/home/ibru/Isaac-GR00T/outputs/bobo_groot/checkpoint-7500/scripts/gr00t_finetune.py': [Errno 2] No such file or directory diff --git a/checkpoint-7500/optimizer.pt b/checkpoint-7500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..cdcdfa39d8b29cfc6d7b0e6f722adf6cb5e99763 --- /dev/null +++ b/checkpoint-7500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ab2b263971007d60e6712fe6c7152526f7b59993871f694890721925e23bc14 +size 246824634 diff --git a/checkpoint-7500/rng_state.pth b/checkpoint-7500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..85d9ca82db72b207195dc73cf5c8a5d7323bb69f --- /dev/null +++ b/checkpoint-7500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0708df4313fcbbc99dda238cd973f2ec6c64e779d58155a2f9763d8156b443a +size 14244 diff --git a/checkpoint-7500/scheduler.pt b/checkpoint-7500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9e39679b16c506382c978c22e6485b0b4bd87fd --- /dev/null +++ b/checkpoint-7500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8dab4c50c0e9fd86bc4ccbd787e4a839aa2d9e9e754344b4b154dcfe3901329 +size 1064 diff --git a/checkpoint-7500/trainer_state.json b/checkpoint-7500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3bc4ba3a0dba70bc8b5902e81b3633c486f47bfb --- /dev/null +++ b/checkpoint-7500/trainer_state.json @@ -0,0 +1,5283 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 27.98507462686567, + "eval_steps": 500, + "global_step": 7500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03731343283582089, + "grad_norm": 0.8186072111129761, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.3847, + "step": 10 + }, + { + "epoch": 0.07462686567164178, + "grad_norm": 0.5007426142692566, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.4283, + "step": 20 + }, + { + "epoch": 0.11194029850746269, + "grad_norm": 0.49460887908935547, + "learning_rate": 3e-06, + "loss": 1.4868, + "step": 30 + }, + { + "epoch": 0.14925373134328357, + "grad_norm": 0.5032920837402344, + "learning_rate": 4.000000000000001e-06, + "loss": 1.4491, + "step": 40 + }, + { + "epoch": 0.1865671641791045, + "grad_norm": 0.5688469409942627, + "learning_rate": 5e-06, + "loss": 1.3703, + "step": 50 + }, + { + "epoch": 0.22388059701492538, + "grad_norm": 0.5052517652511597, + "learning_rate": 6e-06, + "loss": 1.419, + "step": 60 + }, + { + "epoch": 0.26119402985074625, + "grad_norm": 0.6315643787384033, + "learning_rate": 7.000000000000001e-06, + "loss": 1.3058, + "step": 70 + }, + { + "epoch": 0.29850746268656714, + "grad_norm": 0.6060447692871094, + "learning_rate": 8.000000000000001e-06, + "loss": 1.2908, + "step": 80 + }, + { + "epoch": 0.3358208955223881, + "grad_norm": 0.5513179302215576, + "learning_rate": 9e-06, + "loss": 1.2311, + "step": 90 + }, + { + "epoch": 0.373134328358209, + "grad_norm": 0.8467404246330261, + "learning_rate": 1e-05, + "loss": 1.2043, + "step": 100 + }, + { + "epoch": 0.41044776119402987, + "grad_norm": 0.8141824007034302, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.0707, + "step": 110 + }, + { + "epoch": 0.44776119402985076, + "grad_norm": 0.7932347059249878, + "learning_rate": 1.2e-05, + "loss": 0.9377, + "step": 120 + }, + { + "epoch": 0.48507462686567165, + "grad_norm": 0.684220552444458, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.714, + "step": 130 + }, + { + "epoch": 0.5223880597014925, + "grad_norm": 0.5886895060539246, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.6479, + "step": 140 + }, + { + "epoch": 0.5597014925373134, + "grad_norm": 0.4764939248561859, + "learning_rate": 1.5e-05, + "loss": 0.5463, + "step": 150 + }, + { + "epoch": 0.5970149253731343, + "grad_norm": 0.4621008038520813, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.4641, + "step": 160 + }, + { + "epoch": 0.6343283582089553, + "grad_norm": 0.46492910385131836, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.4159, + "step": 170 + }, + { + "epoch": 0.6716417910447762, + "grad_norm": 0.5017415881156921, + "learning_rate": 1.8e-05, + "loss": 0.4094, + "step": 180 + }, + { + "epoch": 0.7089552238805971, + "grad_norm": 0.34392210841178894, + "learning_rate": 1.9e-05, + "loss": 0.3478, + "step": 190 + }, + { + "epoch": 0.746268656716418, + "grad_norm": 0.3240516483783722, + "learning_rate": 2e-05, + "loss": 0.3821, + "step": 200 + }, + { + "epoch": 0.7835820895522388, + "grad_norm": 0.26301339268684387, + "learning_rate": 2.1e-05, + "loss": 0.3606, + "step": 210 + }, + { + "epoch": 0.8208955223880597, + "grad_norm": 0.34712520241737366, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.3421, + "step": 220 + }, + { + "epoch": 0.8582089552238806, + "grad_norm": 0.3248469829559326, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.3389, + "step": 230 + }, + { + "epoch": 0.8955223880597015, + "grad_norm": 0.298149436712265, + "learning_rate": 2.4e-05, + "loss": 0.3145, + "step": 240 + }, + { + "epoch": 0.9328358208955224, + "grad_norm": 0.2757190763950348, + "learning_rate": 2.5e-05, + "loss": 0.3065, + "step": 250 + }, + { + "epoch": 0.9701492537313433, + "grad_norm": 0.30510950088500977, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.2971, + "step": 260 + }, + { + "epoch": 1.007462686567164, + "grad_norm": 0.37349891662597656, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.3273, + "step": 270 + }, + { + "epoch": 1.044776119402985, + "grad_norm": 0.3667634129524231, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.308, + "step": 280 + }, + { + "epoch": 1.0820895522388059, + "grad_norm": 0.3463355004787445, + "learning_rate": 2.9e-05, + "loss": 0.3109, + "step": 290 + }, + { + "epoch": 1.1194029850746268, + "grad_norm": 0.3888525366783142, + "learning_rate": 3e-05, + "loss": 0.2644, + "step": 300 + }, + { + "epoch": 1.1567164179104479, + "grad_norm": 0.3749147951602936, + "learning_rate": 3.1e-05, + "loss": 0.2858, + "step": 310 + }, + { + "epoch": 1.1940298507462686, + "grad_norm": 0.3270276188850403, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.2573, + "step": 320 + }, + { + "epoch": 1.2313432835820897, + "grad_norm": 0.3658592998981476, + "learning_rate": 3.3e-05, + "loss": 0.2613, + "step": 330 + }, + { + "epoch": 1.2686567164179103, + "grad_norm": 0.3526328206062317, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.2328, + "step": 340 + }, + { + "epoch": 1.3059701492537314, + "grad_norm": 0.4528139531612396, + "learning_rate": 3.5e-05, + "loss": 0.2429, + "step": 350 + }, + { + "epoch": 1.3432835820895521, + "grad_norm": 0.5426791310310364, + "learning_rate": 3.6e-05, + "loss": 0.2209, + "step": 360 + }, + { + "epoch": 1.3805970149253732, + "grad_norm": 0.41844552755355835, + "learning_rate": 3.7e-05, + "loss": 0.2319, + "step": 370 + }, + { + "epoch": 1.417910447761194, + "grad_norm": 0.4749431908130646, + "learning_rate": 3.8e-05, + "loss": 0.2233, + "step": 380 + }, + { + "epoch": 1.455223880597015, + "grad_norm": 0.7010189890861511, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.2181, + "step": 390 + }, + { + "epoch": 1.4925373134328357, + "grad_norm": 0.5747635960578918, + "learning_rate": 4e-05, + "loss": 0.213, + "step": 400 + }, + { + "epoch": 1.5298507462686568, + "grad_norm": 0.3661474287509918, + "learning_rate": 4.1e-05, + "loss": 0.2171, + "step": 410 + }, + { + "epoch": 1.5671641791044775, + "grad_norm": 0.467835396528244, + "learning_rate": 4.2e-05, + "loss": 0.1985, + "step": 420 + }, + { + "epoch": 1.6044776119402986, + "grad_norm": 0.5470123291015625, + "learning_rate": 4.3e-05, + "loss": 0.2176, + "step": 430 + }, + { + "epoch": 1.6417910447761193, + "grad_norm": 0.5761199593544006, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.2007, + "step": 440 + }, + { + "epoch": 1.6791044776119404, + "grad_norm": 0.48257485032081604, + "learning_rate": 4.5e-05, + "loss": 0.2043, + "step": 450 + }, + { + "epoch": 1.716417910447761, + "grad_norm": 0.48353052139282227, + "learning_rate": 4.600000000000001e-05, + "loss": 0.1872, + "step": 460 + }, + { + "epoch": 1.7537313432835822, + "grad_norm": 0.4388391375541687, + "learning_rate": 4.7e-05, + "loss": 0.206, + "step": 470 + }, + { + "epoch": 1.7910447761194028, + "grad_norm": 0.47332626581192017, + "learning_rate": 4.8e-05, + "loss": 0.1876, + "step": 480 + }, + { + "epoch": 1.828358208955224, + "grad_norm": 0.8053535223007202, + "learning_rate": 4.9e-05, + "loss": 0.1839, + "step": 490 + }, + { + "epoch": 1.8656716417910446, + "grad_norm": 0.413979709148407, + "learning_rate": 5e-05, + "loss": 0.1732, + "step": 500 + }, + { + "epoch": 1.9029850746268657, + "grad_norm": 0.36910712718963623, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.1827, + "step": 510 + }, + { + "epoch": 1.9402985074626866, + "grad_norm": 0.8458298444747925, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.1727, + "step": 520 + }, + { + "epoch": 1.9776119402985075, + "grad_norm": 0.5452115535736084, + "learning_rate": 5.300000000000001e-05, + "loss": 0.1818, + "step": 530 + }, + { + "epoch": 2.014925373134328, + "grad_norm": 0.4518108069896698, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.177, + "step": 540 + }, + { + "epoch": 2.0522388059701493, + "grad_norm": 0.66865074634552, + "learning_rate": 5.500000000000001e-05, + "loss": 0.1726, + "step": 550 + }, + { + "epoch": 2.08955223880597, + "grad_norm": 0.6536034345626831, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.1541, + "step": 560 + }, + { + "epoch": 2.126865671641791, + "grad_norm": 0.5571377277374268, + "learning_rate": 5.6999999999999996e-05, + "loss": 0.1671, + "step": 570 + }, + { + "epoch": 2.1641791044776117, + "grad_norm": 0.5385546684265137, + "learning_rate": 5.8e-05, + "loss": 0.1582, + "step": 580 + }, + { + "epoch": 2.201492537313433, + "grad_norm": 0.577961266040802, + "learning_rate": 5.9e-05, + "loss": 0.1528, + "step": 590 + }, + { + "epoch": 2.2388059701492535, + "grad_norm": 0.5082416534423828, + "learning_rate": 6e-05, + "loss": 0.1638, + "step": 600 + }, + { + "epoch": 2.2761194029850746, + "grad_norm": 0.5490861535072327, + "learning_rate": 6.1e-05, + "loss": 0.166, + "step": 610 + }, + { + "epoch": 2.3134328358208958, + "grad_norm": 0.492366760969162, + "learning_rate": 6.2e-05, + "loss": 0.1481, + "step": 620 + }, + { + "epoch": 2.3507462686567164, + "grad_norm": 0.3702855110168457, + "learning_rate": 6.3e-05, + "loss": 0.1514, + "step": 630 + }, + { + "epoch": 2.388059701492537, + "grad_norm": 0.664667010307312, + "learning_rate": 6.400000000000001e-05, + "loss": 0.1441, + "step": 640 + }, + { + "epoch": 2.425373134328358, + "grad_norm": 0.33382174372673035, + "learning_rate": 6.500000000000001e-05, + "loss": 0.1573, + "step": 650 + }, + { + "epoch": 2.4626865671641793, + "grad_norm": 0.4848814010620117, + "learning_rate": 6.6e-05, + "loss": 0.1457, + "step": 660 + }, + { + "epoch": 2.5, + "grad_norm": 0.3649997413158417, + "learning_rate": 6.7e-05, + "loss": 0.1467, + "step": 670 + }, + { + "epoch": 2.5373134328358207, + "grad_norm": 0.6385223865509033, + "learning_rate": 6.800000000000001e-05, + "loss": 0.145, + "step": 680 + }, + { + "epoch": 2.574626865671642, + "grad_norm": 0.4580625891685486, + "learning_rate": 6.9e-05, + "loss": 0.1352, + "step": 690 + }, + { + "epoch": 2.611940298507463, + "grad_norm": 0.5141746401786804, + "learning_rate": 7e-05, + "loss": 0.1444, + "step": 700 + }, + { + "epoch": 2.6492537313432836, + "grad_norm": 0.40220722556114197, + "learning_rate": 7.1e-05, + "loss": 0.1493, + "step": 710 + }, + { + "epoch": 2.6865671641791042, + "grad_norm": 0.5510571002960205, + "learning_rate": 7.2e-05, + "loss": 0.1387, + "step": 720 + }, + { + "epoch": 2.7238805970149254, + "grad_norm": 0.43814659118652344, + "learning_rate": 7.3e-05, + "loss": 0.1374, + "step": 730 + }, + { + "epoch": 2.7611940298507465, + "grad_norm": 0.4118008613586426, + "learning_rate": 7.4e-05, + "loss": 0.1297, + "step": 740 + }, + { + "epoch": 2.798507462686567, + "grad_norm": 0.5626503229141235, + "learning_rate": 7.500000000000001e-05, + "loss": 0.1299, + "step": 750 + }, + { + "epoch": 2.835820895522388, + "grad_norm": 0.4066360592842102, + "learning_rate": 7.6e-05, + "loss": 0.1102, + "step": 760 + }, + { + "epoch": 2.873134328358209, + "grad_norm": 0.47184985876083374, + "learning_rate": 7.7e-05, + "loss": 0.1219, + "step": 770 + }, + { + "epoch": 2.91044776119403, + "grad_norm": 0.6611475348472595, + "learning_rate": 7.800000000000001e-05, + "loss": 0.1267, + "step": 780 + }, + { + "epoch": 2.9477611940298507, + "grad_norm": 0.3570108413696289, + "learning_rate": 7.900000000000001e-05, + "loss": 0.1191, + "step": 790 + }, + { + "epoch": 2.9850746268656714, + "grad_norm": 0.4581681489944458, + "learning_rate": 8e-05, + "loss": 0.1209, + "step": 800 + }, + { + "epoch": 3.0223880597014925, + "grad_norm": 0.4643435776233673, + "learning_rate": 8.1e-05, + "loss": 0.129, + "step": 810 + }, + { + "epoch": 3.0597014925373136, + "grad_norm": 0.5595763921737671, + "learning_rate": 8.2e-05, + "loss": 0.1158, + "step": 820 + }, + { + "epoch": 3.0970149253731343, + "grad_norm": 0.48848605155944824, + "learning_rate": 8.3e-05, + "loss": 0.1188, + "step": 830 + }, + { + "epoch": 3.1343283582089554, + "grad_norm": 0.4496570825576782, + "learning_rate": 8.4e-05, + "loss": 0.114, + "step": 840 + }, + { + "epoch": 3.171641791044776, + "grad_norm": 0.31364986300468445, + "learning_rate": 8.5e-05, + "loss": 0.1196, + "step": 850 + }, + { + "epoch": 3.208955223880597, + "grad_norm": 0.3395878076553345, + "learning_rate": 8.6e-05, + "loss": 0.1124, + "step": 860 + }, + { + "epoch": 3.246268656716418, + "grad_norm": 0.4917413592338562, + "learning_rate": 8.7e-05, + "loss": 0.1074, + "step": 870 + }, + { + "epoch": 3.283582089552239, + "grad_norm": 0.44114553928375244, + "learning_rate": 8.800000000000001e-05, + "loss": 0.1095, + "step": 880 + }, + { + "epoch": 3.3208955223880596, + "grad_norm": 0.3323831558227539, + "learning_rate": 8.900000000000001e-05, + "loss": 0.106, + "step": 890 + }, + { + "epoch": 3.3582089552238807, + "grad_norm": 0.4495660066604614, + "learning_rate": 9e-05, + "loss": 0.1222, + "step": 900 + }, + { + "epoch": 3.3955223880597014, + "grad_norm": 0.40784788131713867, + "learning_rate": 9.1e-05, + "loss": 0.1048, + "step": 910 + }, + { + "epoch": 3.4328358208955225, + "grad_norm": 0.4643700420856476, + "learning_rate": 9.200000000000001e-05, + "loss": 0.1097, + "step": 920 + }, + { + "epoch": 3.470149253731343, + "grad_norm": 0.472494512796402, + "learning_rate": 9.300000000000001e-05, + "loss": 0.1041, + "step": 930 + }, + { + "epoch": 3.5074626865671643, + "grad_norm": 0.6110897660255432, + "learning_rate": 9.4e-05, + "loss": 0.0959, + "step": 940 + }, + { + "epoch": 3.544776119402985, + "grad_norm": 0.5313069820404053, + "learning_rate": 9.5e-05, + "loss": 0.113, + "step": 950 + }, + { + "epoch": 3.582089552238806, + "grad_norm": 0.4223133623600006, + "learning_rate": 9.6e-05, + "loss": 0.099, + "step": 960 + }, + { + "epoch": 3.6194029850746268, + "grad_norm": 0.5464731454849243, + "learning_rate": 9.7e-05, + "loss": 0.1008, + "step": 970 + }, + { + "epoch": 3.656716417910448, + "grad_norm": 0.3538314402103424, + "learning_rate": 9.8e-05, + "loss": 0.1049, + "step": 980 + }, + { + "epoch": 3.6940298507462686, + "grad_norm": 0.7460148334503174, + "learning_rate": 9.900000000000001e-05, + "loss": 0.1088, + "step": 990 + }, + { + "epoch": 3.7313432835820897, + "grad_norm": 0.3210597038269043, + "learning_rate": 0.0001, + "loss": 0.1041, + "step": 1000 + }, + { + "epoch": 3.7686567164179103, + "grad_norm": 0.4450497627258301, + "learning_rate": 9.999993165095463e-05, + "loss": 0.0985, + "step": 1010 + }, + { + "epoch": 3.8059701492537314, + "grad_norm": 0.4348960816860199, + "learning_rate": 9.999972660400536e-05, + "loss": 0.1015, + "step": 1020 + }, + { + "epoch": 3.843283582089552, + "grad_norm": 0.462782621383667, + "learning_rate": 9.999938485971279e-05, + "loss": 0.1068, + "step": 1030 + }, + { + "epoch": 3.8805970149253732, + "grad_norm": 0.3801368474960327, + "learning_rate": 9.999890641901125e-05, + "loss": 0.1117, + "step": 1040 + }, + { + "epoch": 3.917910447761194, + "grad_norm": 0.45135366916656494, + "learning_rate": 9.999829128320874e-05, + "loss": 0.0917, + "step": 1050 + }, + { + "epoch": 3.955223880597015, + "grad_norm": 0.41138389706611633, + "learning_rate": 9.999753945398704e-05, + "loss": 0.1049, + "step": 1060 + }, + { + "epoch": 3.9925373134328357, + "grad_norm": 0.4976252317428589, + "learning_rate": 9.999665093340165e-05, + "loss": 0.1029, + "step": 1070 + }, + { + "epoch": 4.029850746268656, + "grad_norm": 0.46372008323669434, + "learning_rate": 9.99956257238817e-05, + "loss": 0.1012, + "step": 1080 + }, + { + "epoch": 4.067164179104478, + "grad_norm": 0.546938955783844, + "learning_rate": 9.999446382823013e-05, + "loss": 0.0829, + "step": 1090 + }, + { + "epoch": 4.104477611940299, + "grad_norm": 0.40513405203819275, + "learning_rate": 9.999316524962345e-05, + "loss": 0.0933, + "step": 1100 + }, + { + "epoch": 4.141791044776119, + "grad_norm": 0.4198484420776367, + "learning_rate": 9.999172999161198e-05, + "loss": 0.0895, + "step": 1110 + }, + { + "epoch": 4.17910447761194, + "grad_norm": 0.3965628743171692, + "learning_rate": 9.999015805811965e-05, + "loss": 0.0917, + "step": 1120 + }, + { + "epoch": 4.2164179104477615, + "grad_norm": 0.3095884621143341, + "learning_rate": 9.998844945344405e-05, + "loss": 0.0953, + "step": 1130 + }, + { + "epoch": 4.253731343283582, + "grad_norm": 0.7962276339530945, + "learning_rate": 9.998660418225645e-05, + "loss": 0.0979, + "step": 1140 + }, + { + "epoch": 4.291044776119403, + "grad_norm": 0.42066490650177, + "learning_rate": 9.998462224960175e-05, + "loss": 0.099, + "step": 1150 + }, + { + "epoch": 4.3283582089552235, + "grad_norm": 0.3894193470478058, + "learning_rate": 9.998250366089848e-05, + "loss": 0.0887, + "step": 1160 + }, + { + "epoch": 4.365671641791045, + "grad_norm": 0.28998032212257385, + "learning_rate": 9.998024842193876e-05, + "loss": 0.0943, + "step": 1170 + }, + { + "epoch": 4.402985074626866, + "grad_norm": 0.3919823467731476, + "learning_rate": 9.997785653888835e-05, + "loss": 0.0916, + "step": 1180 + }, + { + "epoch": 4.440298507462686, + "grad_norm": 0.3708650469779968, + "learning_rate": 9.997532801828658e-05, + "loss": 0.0858, + "step": 1190 + }, + { + "epoch": 4.477611940298507, + "grad_norm": 0.2935069799423218, + "learning_rate": 9.997266286704631e-05, + "loss": 0.0992, + "step": 1200 + }, + { + "epoch": 4.514925373134329, + "grad_norm": 0.4675377607345581, + "learning_rate": 9.996986109245395e-05, + "loss": 0.0854, + "step": 1210 + }, + { + "epoch": 4.552238805970149, + "grad_norm": 0.31374865770339966, + "learning_rate": 9.996692270216947e-05, + "loss": 0.0788, + "step": 1220 + }, + { + "epoch": 4.58955223880597, + "grad_norm": 0.419249951839447, + "learning_rate": 9.996384770422629e-05, + "loss": 0.0873, + "step": 1230 + }, + { + "epoch": 4.6268656716417915, + "grad_norm": 0.26002731919288635, + "learning_rate": 9.996063610703137e-05, + "loss": 0.0845, + "step": 1240 + }, + { + "epoch": 4.664179104477612, + "grad_norm": 0.29573896527290344, + "learning_rate": 9.995728791936504e-05, + "loss": 0.091, + "step": 1250 + }, + { + "epoch": 4.701492537313433, + "grad_norm": 0.33090147376060486, + "learning_rate": 9.995380315038119e-05, + "loss": 0.0827, + "step": 1260 + }, + { + "epoch": 4.7388059701492535, + "grad_norm": 0.24417485296726227, + "learning_rate": 9.9950181809607e-05, + "loss": 0.0859, + "step": 1270 + }, + { + "epoch": 4.776119402985074, + "grad_norm": 0.48290401697158813, + "learning_rate": 9.994642390694308e-05, + "loss": 0.0889, + "step": 1280 + }, + { + "epoch": 4.813432835820896, + "grad_norm": 0.4479697048664093, + "learning_rate": 9.99425294526634e-05, + "loss": 0.097, + "step": 1290 + }, + { + "epoch": 4.850746268656716, + "grad_norm": 0.3560147285461426, + "learning_rate": 9.993849845741524e-05, + "loss": 0.0904, + "step": 1300 + }, + { + "epoch": 4.888059701492537, + "grad_norm": 0.6645416617393494, + "learning_rate": 9.99343309322192e-05, + "loss": 0.0922, + "step": 1310 + }, + { + "epoch": 4.925373134328359, + "grad_norm": 0.29696759581565857, + "learning_rate": 9.993002688846913e-05, + "loss": 0.093, + "step": 1320 + }, + { + "epoch": 4.962686567164179, + "grad_norm": 0.47146692872047424, + "learning_rate": 9.992558633793212e-05, + "loss": 0.085, + "step": 1330 + }, + { + "epoch": 5.0, + "grad_norm": 0.3430916368961334, + "learning_rate": 9.992100929274846e-05, + "loss": 0.0805, + "step": 1340 + }, + { + "epoch": 5.037313432835821, + "grad_norm": 0.3205055892467499, + "learning_rate": 9.991629576543163e-05, + "loss": 0.0766, + "step": 1350 + }, + { + "epoch": 5.074626865671641, + "grad_norm": 0.3664805293083191, + "learning_rate": 9.991144576886823e-05, + "loss": 0.0766, + "step": 1360 + }, + { + "epoch": 5.111940298507463, + "grad_norm": 0.3753412663936615, + "learning_rate": 9.990645931631796e-05, + "loss": 0.0688, + "step": 1370 + }, + { + "epoch": 5.149253731343284, + "grad_norm": 0.31633055210113525, + "learning_rate": 9.990133642141359e-05, + "loss": 0.0796, + "step": 1380 + }, + { + "epoch": 5.186567164179104, + "grad_norm": 0.3355732262134552, + "learning_rate": 9.989607709816091e-05, + "loss": 0.0716, + "step": 1390 + }, + { + "epoch": 5.223880597014926, + "grad_norm": 0.24850831925868988, + "learning_rate": 9.989068136093873e-05, + "loss": 0.0778, + "step": 1400 + }, + { + "epoch": 5.2611940298507465, + "grad_norm": 0.29537102580070496, + "learning_rate": 9.988514922449879e-05, + "loss": 0.0759, + "step": 1410 + }, + { + "epoch": 5.298507462686567, + "grad_norm": 0.3430945873260498, + "learning_rate": 9.987948070396571e-05, + "loss": 0.0774, + "step": 1420 + }, + { + "epoch": 5.335820895522388, + "grad_norm": 0.5220637917518616, + "learning_rate": 9.987367581483705e-05, + "loss": 0.0836, + "step": 1430 + }, + { + "epoch": 5.373134328358209, + "grad_norm": 0.28184008598327637, + "learning_rate": 9.986773457298311e-05, + "loss": 0.0752, + "step": 1440 + }, + { + "epoch": 5.41044776119403, + "grad_norm": 0.36261311173439026, + "learning_rate": 9.986165699464705e-05, + "loss": 0.075, + "step": 1450 + }, + { + "epoch": 5.447761194029851, + "grad_norm": 0.5107380151748657, + "learning_rate": 9.985544309644475e-05, + "loss": 0.0814, + "step": 1460 + }, + { + "epoch": 5.485074626865671, + "grad_norm": 0.2446671426296234, + "learning_rate": 9.984909289536473e-05, + "loss": 0.0704, + "step": 1470 + }, + { + "epoch": 5.522388059701493, + "grad_norm": 0.30449381470680237, + "learning_rate": 9.984260640876821e-05, + "loss": 0.0794, + "step": 1480 + }, + { + "epoch": 5.559701492537314, + "grad_norm": 0.25645050406455994, + "learning_rate": 9.983598365438902e-05, + "loss": 0.0709, + "step": 1490 + }, + { + "epoch": 5.597014925373134, + "grad_norm": 0.23825006186962128, + "learning_rate": 9.98292246503335e-05, + "loss": 0.0828, + "step": 1500 + }, + { + "epoch": 5.634328358208955, + "grad_norm": 0.3259269893169403, + "learning_rate": 9.98223294150805e-05, + "loss": 0.0824, + "step": 1510 + }, + { + "epoch": 5.6716417910447765, + "grad_norm": 0.24058914184570312, + "learning_rate": 9.981529796748134e-05, + "loss": 0.073, + "step": 1520 + }, + { + "epoch": 5.708955223880597, + "grad_norm": 0.34457242488861084, + "learning_rate": 9.980813032675974e-05, + "loss": 0.0845, + "step": 1530 + }, + { + "epoch": 5.746268656716418, + "grad_norm": 0.32940393686294556, + "learning_rate": 9.980082651251175e-05, + "loss": 0.0832, + "step": 1540 + }, + { + "epoch": 5.7835820895522385, + "grad_norm": 0.5683007836341858, + "learning_rate": 9.979338654470569e-05, + "loss": 0.0836, + "step": 1550 + }, + { + "epoch": 5.82089552238806, + "grad_norm": 0.31041061878204346, + "learning_rate": 9.97858104436822e-05, + "loss": 0.07, + "step": 1560 + }, + { + "epoch": 5.858208955223881, + "grad_norm": 0.37858131527900696, + "learning_rate": 9.977809823015401e-05, + "loss": 0.0738, + "step": 1570 + }, + { + "epoch": 5.895522388059701, + "grad_norm": 0.2743091583251953, + "learning_rate": 9.977024992520602e-05, + "loss": 0.0761, + "step": 1580 + }, + { + "epoch": 5.932835820895522, + "grad_norm": 0.29117098450660706, + "learning_rate": 9.976226555029522e-05, + "loss": 0.0777, + "step": 1590 + }, + { + "epoch": 5.970149253731344, + "grad_norm": 0.31398633122444153, + "learning_rate": 9.975414512725057e-05, + "loss": 0.0664, + "step": 1600 + }, + { + "epoch": 6.007462686567164, + "grad_norm": 0.2684272527694702, + "learning_rate": 9.974588867827301e-05, + "loss": 0.0686, + "step": 1610 + }, + { + "epoch": 6.044776119402985, + "grad_norm": 0.3945397436618805, + "learning_rate": 9.973749622593534e-05, + "loss": 0.0614, + "step": 1620 + }, + { + "epoch": 6.082089552238806, + "grad_norm": 0.2747954726219177, + "learning_rate": 9.972896779318219e-05, + "loss": 0.0681, + "step": 1630 + }, + { + "epoch": 6.119402985074627, + "grad_norm": 0.43257200717926025, + "learning_rate": 9.972030340333001e-05, + "loss": 0.0725, + "step": 1640 + }, + { + "epoch": 6.156716417910448, + "grad_norm": 0.3559250831604004, + "learning_rate": 9.97115030800669e-05, + "loss": 0.0804, + "step": 1650 + }, + { + "epoch": 6.1940298507462686, + "grad_norm": 0.3079264760017395, + "learning_rate": 9.970256684745258e-05, + "loss": 0.0649, + "step": 1660 + }, + { + "epoch": 6.231343283582089, + "grad_norm": 0.32298946380615234, + "learning_rate": 9.969349472991838e-05, + "loss": 0.0668, + "step": 1670 + }, + { + "epoch": 6.268656716417911, + "grad_norm": 0.2826225459575653, + "learning_rate": 9.968428675226714e-05, + "loss": 0.0734, + "step": 1680 + }, + { + "epoch": 6.3059701492537314, + "grad_norm": 0.39002349972724915, + "learning_rate": 9.967494293967312e-05, + "loss": 0.0728, + "step": 1690 + }, + { + "epoch": 6.343283582089552, + "grad_norm": 0.403890997171402, + "learning_rate": 9.966546331768191e-05, + "loss": 0.067, + "step": 1700 + }, + { + "epoch": 6.380597014925373, + "grad_norm": 0.3755359351634979, + "learning_rate": 9.965584791221048e-05, + "loss": 0.0755, + "step": 1710 + }, + { + "epoch": 6.417910447761194, + "grad_norm": 0.26346635818481445, + "learning_rate": 9.964609674954696e-05, + "loss": 0.0728, + "step": 1720 + }, + { + "epoch": 6.455223880597015, + "grad_norm": 0.45292145013809204, + "learning_rate": 9.963620985635065e-05, + "loss": 0.0731, + "step": 1730 + }, + { + "epoch": 6.492537313432836, + "grad_norm": 0.3568434715270996, + "learning_rate": 9.962618725965196e-05, + "loss": 0.0761, + "step": 1740 + }, + { + "epoch": 6.529850746268656, + "grad_norm": 0.2551257014274597, + "learning_rate": 9.961602898685226e-05, + "loss": 0.0694, + "step": 1750 + }, + { + "epoch": 6.567164179104478, + "grad_norm": 0.6106354594230652, + "learning_rate": 9.96057350657239e-05, + "loss": 0.0827, + "step": 1760 + }, + { + "epoch": 6.604477611940299, + "grad_norm": 0.3226093053817749, + "learning_rate": 9.959530552441005e-05, + "loss": 0.0716, + "step": 1770 + }, + { + "epoch": 6.641791044776119, + "grad_norm": 0.4297254979610443, + "learning_rate": 9.95847403914247e-05, + "loss": 0.0748, + "step": 1780 + }, + { + "epoch": 6.67910447761194, + "grad_norm": 0.26469680666923523, + "learning_rate": 9.95740396956525e-05, + "loss": 0.074, + "step": 1790 + }, + { + "epoch": 6.7164179104477615, + "grad_norm": 0.22717897593975067, + "learning_rate": 9.956320346634876e-05, + "loss": 0.0739, + "step": 1800 + }, + { + "epoch": 6.753731343283582, + "grad_norm": 0.4513498544692993, + "learning_rate": 9.955223173313931e-05, + "loss": 0.0664, + "step": 1810 + }, + { + "epoch": 6.791044776119403, + "grad_norm": 0.31683439016342163, + "learning_rate": 9.954112452602045e-05, + "loss": 0.069, + "step": 1820 + }, + { + "epoch": 6.8283582089552235, + "grad_norm": 0.3350532650947571, + "learning_rate": 9.952988187535886e-05, + "loss": 0.0699, + "step": 1830 + }, + { + "epoch": 6.865671641791045, + "grad_norm": 0.29829463362693787, + "learning_rate": 9.95185038118915e-05, + "loss": 0.0663, + "step": 1840 + }, + { + "epoch": 6.902985074626866, + "grad_norm": 0.31650781631469727, + "learning_rate": 9.950699036672559e-05, + "loss": 0.0668, + "step": 1850 + }, + { + "epoch": 6.940298507462686, + "grad_norm": 0.360944926738739, + "learning_rate": 9.949534157133844e-05, + "loss": 0.0696, + "step": 1860 + }, + { + "epoch": 6.977611940298507, + "grad_norm": 0.31337013840675354, + "learning_rate": 9.948355745757741e-05, + "loss": 0.073, + "step": 1870 + }, + { + "epoch": 7.014925373134329, + "grad_norm": 0.4675919711589813, + "learning_rate": 9.94716380576598e-05, + "loss": 0.0688, + "step": 1880 + }, + { + "epoch": 7.052238805970149, + "grad_norm": 0.3031919002532959, + "learning_rate": 9.945958340417283e-05, + "loss": 0.0596, + "step": 1890 + }, + { + "epoch": 7.08955223880597, + "grad_norm": 0.24858474731445312, + "learning_rate": 9.944739353007344e-05, + "loss": 0.0717, + "step": 1900 + }, + { + "epoch": 7.126865671641791, + "grad_norm": 0.20959483087062836, + "learning_rate": 9.943506846868826e-05, + "loss": 0.0694, + "step": 1910 + }, + { + "epoch": 7.164179104477612, + "grad_norm": 0.35621434450149536, + "learning_rate": 9.942260825371358e-05, + "loss": 0.063, + "step": 1920 + }, + { + "epoch": 7.201492537313433, + "grad_norm": 0.3462587594985962, + "learning_rate": 9.941001291921512e-05, + "loss": 0.068, + "step": 1930 + }, + { + "epoch": 7.2388059701492535, + "grad_norm": 0.38649681210517883, + "learning_rate": 9.939728249962807e-05, + "loss": 0.0638, + "step": 1940 + }, + { + "epoch": 7.276119402985074, + "grad_norm": 0.29564595222473145, + "learning_rate": 9.938441702975689e-05, + "loss": 0.0626, + "step": 1950 + }, + { + "epoch": 7.313432835820896, + "grad_norm": 0.339857816696167, + "learning_rate": 9.937141654477528e-05, + "loss": 0.0535, + "step": 1960 + }, + { + "epoch": 7.350746268656716, + "grad_norm": 0.2591215670108795, + "learning_rate": 9.93582810802261e-05, + "loss": 0.0645, + "step": 1970 + }, + { + "epoch": 7.388059701492537, + "grad_norm": 0.30237796902656555, + "learning_rate": 9.934501067202117e-05, + "loss": 0.0675, + "step": 1980 + }, + { + "epoch": 7.425373134328359, + "grad_norm": 0.28394174575805664, + "learning_rate": 9.93316053564413e-05, + "loss": 0.0643, + "step": 1990 + }, + { + "epoch": 7.462686567164179, + "grad_norm": 0.3124663233757019, + "learning_rate": 9.931806517013612e-05, + "loss": 0.059, + "step": 2000 + }, + { + "epoch": 7.5, + "grad_norm": 0.36073037981987, + "learning_rate": 9.930439015012396e-05, + "loss": 0.0606, + "step": 2010 + }, + { + "epoch": 7.537313432835821, + "grad_norm": 0.4091481864452362, + "learning_rate": 9.929058033379181e-05, + "loss": 0.0603, + "step": 2020 + }, + { + "epoch": 7.574626865671641, + "grad_norm": 0.44718074798583984, + "learning_rate": 9.927663575889521e-05, + "loss": 0.0741, + "step": 2030 + }, + { + "epoch": 7.611940298507463, + "grad_norm": 0.3819601833820343, + "learning_rate": 9.926255646355804e-05, + "loss": 0.0707, + "step": 2040 + }, + { + "epoch": 7.649253731343284, + "grad_norm": 0.23336420953273773, + "learning_rate": 9.92483424862726e-05, + "loss": 0.0676, + "step": 2050 + }, + { + "epoch": 7.686567164179104, + "grad_norm": 0.24415315687656403, + "learning_rate": 9.923399386589933e-05, + "loss": 0.0594, + "step": 2060 + }, + { + "epoch": 7.723880597014926, + "grad_norm": 0.3735473155975342, + "learning_rate": 9.921951064166684e-05, + "loss": 0.062, + "step": 2070 + }, + { + "epoch": 7.7611940298507465, + "grad_norm": 0.31629472970962524, + "learning_rate": 9.92048928531717e-05, + "loss": 0.0606, + "step": 2080 + }, + { + "epoch": 7.798507462686567, + "grad_norm": 0.37902557849884033, + "learning_rate": 9.919014054037836e-05, + "loss": 0.0584, + "step": 2090 + }, + { + "epoch": 7.835820895522388, + "grad_norm": 0.3486720323562622, + "learning_rate": 9.917525374361912e-05, + "loss": 0.056, + "step": 2100 + }, + { + "epoch": 7.8731343283582085, + "grad_norm": 0.3731362521648407, + "learning_rate": 9.91602325035939e-05, + "loss": 0.0601, + "step": 2110 + }, + { + "epoch": 7.91044776119403, + "grad_norm": 0.3560399115085602, + "learning_rate": 9.914507686137019e-05, + "loss": 0.06, + "step": 2120 + }, + { + "epoch": 7.947761194029851, + "grad_norm": 0.30075564980506897, + "learning_rate": 9.912978685838294e-05, + "loss": 0.0657, + "step": 2130 + }, + { + "epoch": 7.985074626865671, + "grad_norm": 0.2984028458595276, + "learning_rate": 9.911436253643445e-05, + "loss": 0.0587, + "step": 2140 + }, + { + "epoch": 8.022388059701493, + "grad_norm": 0.1980169117450714, + "learning_rate": 9.90988039376942e-05, + "loss": 0.0718, + "step": 2150 + }, + { + "epoch": 8.059701492537313, + "grad_norm": 0.31339579820632935, + "learning_rate": 9.90831111046988e-05, + "loss": 0.0557, + "step": 2160 + }, + { + "epoch": 8.097014925373134, + "grad_norm": 0.1968696266412735, + "learning_rate": 9.90672840803519e-05, + "loss": 0.0571, + "step": 2170 + }, + { + "epoch": 8.134328358208956, + "grad_norm": 0.23931682109832764, + "learning_rate": 9.905132290792394e-05, + "loss": 0.0566, + "step": 2180 + }, + { + "epoch": 8.171641791044776, + "grad_norm": 0.21741189062595367, + "learning_rate": 9.903522763105218e-05, + "loss": 0.0575, + "step": 2190 + }, + { + "epoch": 8.208955223880597, + "grad_norm": 0.22874368727207184, + "learning_rate": 9.901899829374047e-05, + "loss": 0.0565, + "step": 2200 + }, + { + "epoch": 8.246268656716419, + "grad_norm": 0.3441888093948364, + "learning_rate": 9.900263494035921e-05, + "loss": 0.0565, + "step": 2210 + }, + { + "epoch": 8.283582089552239, + "grad_norm": 0.2539830803871155, + "learning_rate": 9.89861376156452e-05, + "loss": 0.0538, + "step": 2220 + }, + { + "epoch": 8.32089552238806, + "grad_norm": 0.2235102653503418, + "learning_rate": 9.896950636470147e-05, + "loss": 0.0609, + "step": 2230 + }, + { + "epoch": 8.35820895522388, + "grad_norm": 0.1941322684288025, + "learning_rate": 9.895274123299723e-05, + "loss": 0.0562, + "step": 2240 + }, + { + "epoch": 8.395522388059701, + "grad_norm": 0.2691369950771332, + "learning_rate": 9.893584226636772e-05, + "loss": 0.0608, + "step": 2250 + }, + { + "epoch": 8.432835820895523, + "grad_norm": 0.24730461835861206, + "learning_rate": 9.891880951101407e-05, + "loss": 0.0582, + "step": 2260 + }, + { + "epoch": 8.470149253731343, + "grad_norm": 0.34785839915275574, + "learning_rate": 9.890164301350318e-05, + "loss": 0.0506, + "step": 2270 + }, + { + "epoch": 8.507462686567164, + "grad_norm": 0.3625825345516205, + "learning_rate": 9.888434282076758e-05, + "loss": 0.0614, + "step": 2280 + }, + { + "epoch": 8.544776119402986, + "grad_norm": 0.25210148096084595, + "learning_rate": 9.886690898010535e-05, + "loss": 0.0611, + "step": 2290 + }, + { + "epoch": 8.582089552238806, + "grad_norm": 0.27312466502189636, + "learning_rate": 9.884934153917997e-05, + "loss": 0.0537, + "step": 2300 + }, + { + "epoch": 8.619402985074627, + "grad_norm": 0.314647912979126, + "learning_rate": 9.883164054602012e-05, + "loss": 0.0602, + "step": 2310 + }, + { + "epoch": 8.656716417910447, + "grad_norm": 0.21531912684440613, + "learning_rate": 9.881380604901964e-05, + "loss": 0.0552, + "step": 2320 + }, + { + "epoch": 8.694029850746269, + "grad_norm": 0.23920664191246033, + "learning_rate": 9.879583809693738e-05, + "loss": 0.0613, + "step": 2330 + }, + { + "epoch": 8.73134328358209, + "grad_norm": 0.21864956617355347, + "learning_rate": 9.877773673889701e-05, + "loss": 0.0649, + "step": 2340 + }, + { + "epoch": 8.76865671641791, + "grad_norm": 0.27523377537727356, + "learning_rate": 9.8759502024387e-05, + "loss": 0.0606, + "step": 2350 + }, + { + "epoch": 8.805970149253731, + "grad_norm": 0.24805469810962677, + "learning_rate": 9.87411340032603e-05, + "loss": 0.0549, + "step": 2360 + }, + { + "epoch": 8.843283582089553, + "grad_norm": 0.23070092499256134, + "learning_rate": 9.872263272573443e-05, + "loss": 0.0562, + "step": 2370 + }, + { + "epoch": 8.880597014925373, + "grad_norm": 0.20833946764469147, + "learning_rate": 9.870399824239117e-05, + "loss": 0.05, + "step": 2380 + }, + { + "epoch": 8.917910447761194, + "grad_norm": 0.34507372975349426, + "learning_rate": 9.868523060417646e-05, + "loss": 0.0613, + "step": 2390 + }, + { + "epoch": 8.955223880597014, + "grad_norm": 0.32865110039711, + "learning_rate": 9.86663298624003e-05, + "loss": 0.0621, + "step": 2400 + }, + { + "epoch": 8.992537313432836, + "grad_norm": 0.21305270493030548, + "learning_rate": 9.864729606873663e-05, + "loss": 0.0572, + "step": 2410 + }, + { + "epoch": 9.029850746268657, + "grad_norm": 0.28193730115890503, + "learning_rate": 9.862812927522309e-05, + "loss": 0.0555, + "step": 2420 + }, + { + "epoch": 9.067164179104477, + "grad_norm": 0.3953789472579956, + "learning_rate": 9.860882953426099e-05, + "loss": 0.0536, + "step": 2430 + }, + { + "epoch": 9.104477611940299, + "grad_norm": 0.23013322055339813, + "learning_rate": 9.858939689861506e-05, + "loss": 0.0572, + "step": 2440 + }, + { + "epoch": 9.14179104477612, + "grad_norm": 0.2906680107116699, + "learning_rate": 9.856983142141339e-05, + "loss": 0.0592, + "step": 2450 + }, + { + "epoch": 9.17910447761194, + "grad_norm": 0.23490828275680542, + "learning_rate": 9.855013315614725e-05, + "loss": 0.0583, + "step": 2460 + }, + { + "epoch": 9.216417910447761, + "grad_norm": 0.22825880348682404, + "learning_rate": 9.853030215667093e-05, + "loss": 0.059, + "step": 2470 + }, + { + "epoch": 9.253731343283581, + "grad_norm": 0.25871285796165466, + "learning_rate": 9.851033847720166e-05, + "loss": 0.0555, + "step": 2480 + }, + { + "epoch": 9.291044776119403, + "grad_norm": 0.27220776677131653, + "learning_rate": 9.849024217231935e-05, + "loss": 0.0542, + "step": 2490 + }, + { + "epoch": 9.328358208955224, + "grad_norm": 0.26534005999565125, + "learning_rate": 9.847001329696653e-05, + "loss": 0.0526, + "step": 2500 + }, + { + "epoch": 9.365671641791044, + "grad_norm": 0.33486032485961914, + "learning_rate": 9.844965190644817e-05, + "loss": 0.0563, + "step": 2510 + }, + { + "epoch": 9.402985074626866, + "grad_norm": 0.2949483394622803, + "learning_rate": 9.842915805643155e-05, + "loss": 0.0556, + "step": 2520 + }, + { + "epoch": 9.440298507462687, + "grad_norm": 0.24123981595039368, + "learning_rate": 9.840853180294608e-05, + "loss": 0.05, + "step": 2530 + }, + { + "epoch": 9.477611940298507, + "grad_norm": 0.22536049783229828, + "learning_rate": 9.838777320238312e-05, + "loss": 0.0522, + "step": 2540 + }, + { + "epoch": 9.514925373134329, + "grad_norm": 0.23206663131713867, + "learning_rate": 9.836688231149592e-05, + "loss": 0.0591, + "step": 2550 + }, + { + "epoch": 9.552238805970148, + "grad_norm": 0.28573134541511536, + "learning_rate": 9.834585918739936e-05, + "loss": 0.0568, + "step": 2560 + }, + { + "epoch": 9.58955223880597, + "grad_norm": 0.2628820538520813, + "learning_rate": 9.832470388756987e-05, + "loss": 0.0571, + "step": 2570 + }, + { + "epoch": 9.626865671641792, + "grad_norm": 0.2880440652370453, + "learning_rate": 9.830341646984521e-05, + "loss": 0.0559, + "step": 2580 + }, + { + "epoch": 9.664179104477611, + "grad_norm": 0.1786259263753891, + "learning_rate": 9.82819969924244e-05, + "loss": 0.058, + "step": 2590 + }, + { + "epoch": 9.701492537313433, + "grad_norm": 0.3501608073711395, + "learning_rate": 9.826044551386744e-05, + "loss": 0.0523, + "step": 2600 + }, + { + "epoch": 9.738805970149254, + "grad_norm": 0.24757252633571625, + "learning_rate": 9.823876209309527e-05, + "loss": 0.0587, + "step": 2610 + }, + { + "epoch": 9.776119402985074, + "grad_norm": 0.2556290626525879, + "learning_rate": 9.821694678938953e-05, + "loss": 0.0555, + "step": 2620 + }, + { + "epoch": 9.813432835820896, + "grad_norm": 0.2561217248439789, + "learning_rate": 9.819499966239243e-05, + "loss": 0.052, + "step": 2630 + }, + { + "epoch": 9.850746268656717, + "grad_norm": 0.2776634097099304, + "learning_rate": 9.817292077210659e-05, + "loss": 0.0498, + "step": 2640 + }, + { + "epoch": 9.888059701492537, + "grad_norm": 0.20668549835681915, + "learning_rate": 9.815071017889482e-05, + "loss": 0.0517, + "step": 2650 + }, + { + "epoch": 9.925373134328359, + "grad_norm": 0.3100263178348541, + "learning_rate": 9.812836794348004e-05, + "loss": 0.0633, + "step": 2660 + }, + { + "epoch": 9.962686567164178, + "grad_norm": 0.2780782878398895, + "learning_rate": 9.81058941269451e-05, + "loss": 0.0581, + "step": 2670 + }, + { + "epoch": 10.0, + "grad_norm": 0.28903728723526, + "learning_rate": 9.808328879073251e-05, + "loss": 0.0538, + "step": 2680 + }, + { + "epoch": 10.037313432835822, + "grad_norm": 0.22727562487125397, + "learning_rate": 9.806055199664446e-05, + "loss": 0.0491, + "step": 2690 + }, + { + "epoch": 10.074626865671641, + "grad_norm": 0.267918199300766, + "learning_rate": 9.803768380684242e-05, + "loss": 0.0562, + "step": 2700 + }, + { + "epoch": 10.111940298507463, + "grad_norm": 0.2988606095314026, + "learning_rate": 9.801468428384716e-05, + "loss": 0.0566, + "step": 2710 + }, + { + "epoch": 10.149253731343283, + "grad_norm": 0.2710281312465668, + "learning_rate": 9.799155349053851e-05, + "loss": 0.0541, + "step": 2720 + }, + { + "epoch": 10.186567164179104, + "grad_norm": 0.15320520102977753, + "learning_rate": 9.796829149015517e-05, + "loss": 0.0548, + "step": 2730 + }, + { + "epoch": 10.223880597014926, + "grad_norm": 0.2653089463710785, + "learning_rate": 9.794489834629455e-05, + "loss": 0.0599, + "step": 2740 + }, + { + "epoch": 10.261194029850746, + "grad_norm": 0.19223959743976593, + "learning_rate": 9.792137412291265e-05, + "loss": 0.0494, + "step": 2750 + }, + { + "epoch": 10.298507462686567, + "grad_norm": 0.20455987751483917, + "learning_rate": 9.789771888432375e-05, + "loss": 0.0538, + "step": 2760 + }, + { + "epoch": 10.335820895522389, + "grad_norm": 0.24908749759197235, + "learning_rate": 9.787393269520039e-05, + "loss": 0.0481, + "step": 2770 + }, + { + "epoch": 10.373134328358208, + "grad_norm": 0.3131813406944275, + "learning_rate": 9.785001562057309e-05, + "loss": 0.0526, + "step": 2780 + }, + { + "epoch": 10.41044776119403, + "grad_norm": 0.24828971922397614, + "learning_rate": 9.782596772583026e-05, + "loss": 0.0489, + "step": 2790 + }, + { + "epoch": 10.447761194029852, + "grad_norm": 0.21727119386196136, + "learning_rate": 9.780178907671789e-05, + "loss": 0.0532, + "step": 2800 + }, + { + "epoch": 10.485074626865671, + "grad_norm": 0.20279547572135925, + "learning_rate": 9.777747973933948e-05, + "loss": 0.0565, + "step": 2810 + }, + { + "epoch": 10.522388059701493, + "grad_norm": 0.17726702988147736, + "learning_rate": 9.775303978015585e-05, + "loss": 0.0437, + "step": 2820 + }, + { + "epoch": 10.559701492537313, + "grad_norm": 0.18961119651794434, + "learning_rate": 9.772846926598491e-05, + "loss": 0.0584, + "step": 2830 + }, + { + "epoch": 10.597014925373134, + "grad_norm": 0.2498980015516281, + "learning_rate": 9.77037682640015e-05, + "loss": 0.0496, + "step": 2840 + }, + { + "epoch": 10.634328358208956, + "grad_norm": 0.16978798806667328, + "learning_rate": 9.767893684173721e-05, + "loss": 0.0469, + "step": 2850 + }, + { + "epoch": 10.671641791044776, + "grad_norm": 0.16128584742546082, + "learning_rate": 9.765397506708023e-05, + "loss": 0.0533, + "step": 2860 + }, + { + "epoch": 10.708955223880597, + "grad_norm": 0.20463155210018158, + "learning_rate": 9.762888300827507e-05, + "loss": 0.0464, + "step": 2870 + }, + { + "epoch": 10.746268656716419, + "grad_norm": 0.30601629614830017, + "learning_rate": 9.760366073392246e-05, + "loss": 0.0489, + "step": 2880 + }, + { + "epoch": 10.783582089552239, + "grad_norm": 0.2730671763420105, + "learning_rate": 9.757830831297914e-05, + "loss": 0.0495, + "step": 2890 + }, + { + "epoch": 10.82089552238806, + "grad_norm": 0.251432865858078, + "learning_rate": 9.755282581475769e-05, + "loss": 0.0549, + "step": 2900 + }, + { + "epoch": 10.85820895522388, + "grad_norm": 0.26670166850090027, + "learning_rate": 9.752721330892624e-05, + "loss": 0.061, + "step": 2910 + }, + { + "epoch": 10.895522388059701, + "grad_norm": 0.2965967655181885, + "learning_rate": 9.750147086550844e-05, + "loss": 0.0473, + "step": 2920 + }, + { + "epoch": 10.932835820895523, + "grad_norm": 0.683840274810791, + "learning_rate": 9.747559855488313e-05, + "loss": 0.0509, + "step": 2930 + }, + { + "epoch": 10.970149253731343, + "grad_norm": 0.25740495324134827, + "learning_rate": 9.744959644778422e-05, + "loss": 0.0515, + "step": 2940 + }, + { + "epoch": 11.007462686567164, + "grad_norm": 0.2880542278289795, + "learning_rate": 9.742346461530048e-05, + "loss": 0.0482, + "step": 2950 + }, + { + "epoch": 11.044776119402986, + "grad_norm": 0.45032551884651184, + "learning_rate": 9.739720312887535e-05, + "loss": 0.0557, + "step": 2960 + }, + { + "epoch": 11.082089552238806, + "grad_norm": 0.2829900085926056, + "learning_rate": 9.73708120603067e-05, + "loss": 0.052, + "step": 2970 + }, + { + "epoch": 11.119402985074627, + "grad_norm": 0.309597373008728, + "learning_rate": 9.734429148174675e-05, + "loss": 0.0541, + "step": 2980 + }, + { + "epoch": 11.156716417910447, + "grad_norm": 0.2433389127254486, + "learning_rate": 9.731764146570173e-05, + "loss": 0.0482, + "step": 2990 + }, + { + "epoch": 11.194029850746269, + "grad_norm": 0.24458132684230804, + "learning_rate": 9.729086208503174e-05, + "loss": 0.0505, + "step": 3000 + }, + { + "epoch": 11.23134328358209, + "grad_norm": 0.2305087298154831, + "learning_rate": 9.726395341295062e-05, + "loss": 0.0504, + "step": 3010 + }, + { + "epoch": 11.26865671641791, + "grad_norm": 0.18110457062721252, + "learning_rate": 9.723691552302562e-05, + "loss": 0.0575, + "step": 3020 + }, + { + "epoch": 11.305970149253731, + "grad_norm": 0.20407621562480927, + "learning_rate": 9.720974848917735e-05, + "loss": 0.0494, + "step": 3030 + }, + { + "epoch": 11.343283582089553, + "grad_norm": 0.25924697518348694, + "learning_rate": 9.718245238567939e-05, + "loss": 0.0472, + "step": 3040 + }, + { + "epoch": 11.380597014925373, + "grad_norm": 0.23041822016239166, + "learning_rate": 9.715502728715826e-05, + "loss": 0.0481, + "step": 3050 + }, + { + "epoch": 11.417910447761194, + "grad_norm": 0.25381171703338623, + "learning_rate": 9.712747326859315e-05, + "loss": 0.0543, + "step": 3060 + }, + { + "epoch": 11.455223880597014, + "grad_norm": 0.18027640879154205, + "learning_rate": 9.709979040531569e-05, + "loss": 0.055, + "step": 3070 + }, + { + "epoch": 11.492537313432836, + "grad_norm": 0.2954868674278259, + "learning_rate": 9.707197877300974e-05, + "loss": 0.0473, + "step": 3080 + }, + { + "epoch": 11.529850746268657, + "grad_norm": 0.25323861837387085, + "learning_rate": 9.704403844771128e-05, + "loss": 0.0509, + "step": 3090 + }, + { + "epoch": 11.567164179104477, + "grad_norm": 0.36910176277160645, + "learning_rate": 9.701596950580806e-05, + "loss": 0.0504, + "step": 3100 + }, + { + "epoch": 11.604477611940299, + "grad_norm": 0.34199246764183044, + "learning_rate": 9.698777202403953e-05, + "loss": 0.0526, + "step": 3110 + }, + { + "epoch": 11.64179104477612, + "grad_norm": 0.2146557718515396, + "learning_rate": 9.695944607949649e-05, + "loss": 0.0579, + "step": 3120 + }, + { + "epoch": 11.67910447761194, + "grad_norm": 0.20559175312519073, + "learning_rate": 9.693099174962103e-05, + "loss": 0.0514, + "step": 3130 + }, + { + "epoch": 11.716417910447761, + "grad_norm": 0.2689419090747833, + "learning_rate": 9.690240911220618e-05, + "loss": 0.0534, + "step": 3140 + }, + { + "epoch": 11.753731343283581, + "grad_norm": 0.34870603680610657, + "learning_rate": 9.687369824539577e-05, + "loss": 0.0485, + "step": 3150 + }, + { + "epoch": 11.791044776119403, + "grad_norm": 0.15433363616466522, + "learning_rate": 9.684485922768422e-05, + "loss": 0.0418, + "step": 3160 + }, + { + "epoch": 11.828358208955224, + "grad_norm": 0.26874423027038574, + "learning_rate": 9.681589213791633e-05, + "loss": 0.0537, + "step": 3170 + }, + { + "epoch": 11.865671641791044, + "grad_norm": 0.3361654281616211, + "learning_rate": 9.6786797055287e-05, + "loss": 0.0474, + "step": 3180 + }, + { + "epoch": 11.902985074626866, + "grad_norm": 0.17938771843910217, + "learning_rate": 9.675757405934103e-05, + "loss": 0.0443, + "step": 3190 + }, + { + "epoch": 11.940298507462687, + "grad_norm": 0.31368622183799744, + "learning_rate": 9.672822322997305e-05, + "loss": 0.0594, + "step": 3200 + }, + { + "epoch": 11.977611940298507, + "grad_norm": 0.16268151998519897, + "learning_rate": 9.669874464742705e-05, + "loss": 0.0487, + "step": 3210 + }, + { + "epoch": 12.014925373134329, + "grad_norm": 0.23879969120025635, + "learning_rate": 9.66691383922964e-05, + "loss": 0.0484, + "step": 3220 + }, + { + "epoch": 12.052238805970148, + "grad_norm": 0.2321789413690567, + "learning_rate": 9.663940454552342e-05, + "loss": 0.051, + "step": 3230 + }, + { + "epoch": 12.08955223880597, + "grad_norm": 0.22873088717460632, + "learning_rate": 9.660954318839933e-05, + "loss": 0.0406, + "step": 3240 + }, + { + "epoch": 12.126865671641792, + "grad_norm": 0.3767557740211487, + "learning_rate": 9.657955440256395e-05, + "loss": 0.0432, + "step": 3250 + }, + { + "epoch": 12.164179104477611, + "grad_norm": 0.21569453179836273, + "learning_rate": 9.654943827000548e-05, + "loss": 0.0528, + "step": 3260 + }, + { + "epoch": 12.201492537313433, + "grad_norm": 0.23698291182518005, + "learning_rate": 9.651919487306025e-05, + "loss": 0.0457, + "step": 3270 + }, + { + "epoch": 12.238805970149254, + "grad_norm": 0.21086478233337402, + "learning_rate": 9.648882429441257e-05, + "loss": 0.0508, + "step": 3280 + }, + { + "epoch": 12.276119402985074, + "grad_norm": 0.19763463735580444, + "learning_rate": 9.645832661709444e-05, + "loss": 0.0497, + "step": 3290 + }, + { + "epoch": 12.313432835820896, + "grad_norm": 0.18413852155208588, + "learning_rate": 9.642770192448536e-05, + "loss": 0.0441, + "step": 3300 + }, + { + "epoch": 12.350746268656717, + "grad_norm": 0.13946911692619324, + "learning_rate": 9.639695030031204e-05, + "loss": 0.0453, + "step": 3310 + }, + { + "epoch": 12.388059701492537, + "grad_norm": 0.21613670885562897, + "learning_rate": 9.636607182864827e-05, + "loss": 0.0511, + "step": 3320 + }, + { + "epoch": 12.425373134328359, + "grad_norm": 0.24953646957874298, + "learning_rate": 9.63350665939146e-05, + "loss": 0.0451, + "step": 3330 + }, + { + "epoch": 12.462686567164178, + "grad_norm": 0.2993795871734619, + "learning_rate": 9.630393468087818e-05, + "loss": 0.0469, + "step": 3340 + }, + { + "epoch": 12.5, + "grad_norm": 0.2261819839477539, + "learning_rate": 9.627267617465243e-05, + "loss": 0.0484, + "step": 3350 + }, + { + "epoch": 12.537313432835822, + "grad_norm": 0.23026186227798462, + "learning_rate": 9.624129116069694e-05, + "loss": 0.0452, + "step": 3360 + }, + { + "epoch": 12.574626865671641, + "grad_norm": 0.27859947085380554, + "learning_rate": 9.620977972481716e-05, + "loss": 0.0593, + "step": 3370 + }, + { + "epoch": 12.611940298507463, + "grad_norm": 0.23060785233974457, + "learning_rate": 9.617814195316411e-05, + "loss": 0.05, + "step": 3380 + }, + { + "epoch": 12.649253731343283, + "grad_norm": 0.20185025036334991, + "learning_rate": 9.614637793223425e-05, + "loss": 0.0573, + "step": 3390 + }, + { + "epoch": 12.686567164179104, + "grad_norm": 0.3584498167037964, + "learning_rate": 9.611448774886924e-05, + "loss": 0.052, + "step": 3400 + }, + { + "epoch": 12.723880597014926, + "grad_norm": 0.19336827099323273, + "learning_rate": 9.60824714902556e-05, + "loss": 0.0535, + "step": 3410 + }, + { + "epoch": 12.761194029850746, + "grad_norm": 0.22223635017871857, + "learning_rate": 9.605032924392457e-05, + "loss": 0.05, + "step": 3420 + }, + { + "epoch": 12.798507462686567, + "grad_norm": 0.17108851671218872, + "learning_rate": 9.601806109775179e-05, + "loss": 0.0475, + "step": 3430 + }, + { + "epoch": 12.835820895522389, + "grad_norm": 0.3861902952194214, + "learning_rate": 9.598566713995718e-05, + "loss": 0.0439, + "step": 3440 + }, + { + "epoch": 12.873134328358208, + "grad_norm": 0.18927253782749176, + "learning_rate": 9.595314745910456e-05, + "loss": 0.052, + "step": 3450 + }, + { + "epoch": 12.91044776119403, + "grad_norm": 0.21963383257389069, + "learning_rate": 9.59205021441015e-05, + "loss": 0.0504, + "step": 3460 + }, + { + "epoch": 12.947761194029852, + "grad_norm": 0.18016670644283295, + "learning_rate": 9.588773128419906e-05, + "loss": 0.0467, + "step": 3470 + }, + { + "epoch": 12.985074626865671, + "grad_norm": 0.1776365041732788, + "learning_rate": 9.58548349689915e-05, + "loss": 0.0414, + "step": 3480 + }, + { + "epoch": 13.022388059701493, + "grad_norm": 0.2616482973098755, + "learning_rate": 9.582181328841611e-05, + "loss": 0.0442, + "step": 3490 + }, + { + "epoch": 13.059701492537313, + "grad_norm": 0.20341171324253082, + "learning_rate": 9.578866633275288e-05, + "loss": 0.0533, + "step": 3500 + }, + { + "epoch": 13.097014925373134, + "grad_norm": 0.2223699688911438, + "learning_rate": 9.575539419262434e-05, + "loss": 0.0458, + "step": 3510 + }, + { + "epoch": 13.134328358208956, + "grad_norm": 0.22557464241981506, + "learning_rate": 9.572199695899522e-05, + "loss": 0.0445, + "step": 3520 + }, + { + "epoch": 13.171641791044776, + "grad_norm": 0.25104308128356934, + "learning_rate": 9.568847472317232e-05, + "loss": 0.0435, + "step": 3530 + }, + { + "epoch": 13.208955223880597, + "grad_norm": 0.18720711767673492, + "learning_rate": 9.565482757680415e-05, + "loss": 0.0453, + "step": 3540 + }, + { + "epoch": 13.246268656716419, + "grad_norm": 0.16838951408863068, + "learning_rate": 9.562105561188069e-05, + "loss": 0.0505, + "step": 3550 + }, + { + "epoch": 13.283582089552239, + "grad_norm": 0.31681734323501587, + "learning_rate": 9.558715892073323e-05, + "loss": 0.0494, + "step": 3560 + }, + { + "epoch": 13.32089552238806, + "grad_norm": 0.2390700727701187, + "learning_rate": 9.555313759603402e-05, + "loss": 0.0538, + "step": 3570 + }, + { + "epoch": 13.35820895522388, + "grad_norm": 0.20680709183216095, + "learning_rate": 9.551899173079607e-05, + "loss": 0.0519, + "step": 3580 + }, + { + "epoch": 13.395522388059701, + "grad_norm": 0.2758580148220062, + "learning_rate": 9.548472141837286e-05, + "loss": 0.0512, + "step": 3590 + }, + { + "epoch": 13.432835820895523, + "grad_norm": 0.3653097450733185, + "learning_rate": 9.545032675245813e-05, + "loss": 0.0496, + "step": 3600 + }, + { + "epoch": 13.470149253731343, + "grad_norm": 0.23886866867542267, + "learning_rate": 9.541580782708557e-05, + "loss": 0.0455, + "step": 3610 + }, + { + "epoch": 13.507462686567164, + "grad_norm": 0.3280908465385437, + "learning_rate": 9.538116473662861e-05, + "loss": 0.0489, + "step": 3620 + }, + { + "epoch": 13.544776119402986, + "grad_norm": 0.20268180966377258, + "learning_rate": 9.534639757580013e-05, + "loss": 0.0484, + "step": 3630 + }, + { + "epoch": 13.582089552238806, + "grad_norm": 0.2582015097141266, + "learning_rate": 9.531150643965223e-05, + "loss": 0.0487, + "step": 3640 + }, + { + "epoch": 13.619402985074627, + "grad_norm": 0.18157973885536194, + "learning_rate": 9.527649142357596e-05, + "loss": 0.0496, + "step": 3650 + }, + { + "epoch": 13.656716417910447, + "grad_norm": 0.22841542959213257, + "learning_rate": 9.524135262330098e-05, + "loss": 0.0467, + "step": 3660 + }, + { + "epoch": 13.694029850746269, + "grad_norm": 0.2519935369491577, + "learning_rate": 9.520609013489547e-05, + "loss": 0.0487, + "step": 3670 + }, + { + "epoch": 13.73134328358209, + "grad_norm": 0.24680495262145996, + "learning_rate": 9.517070405476575e-05, + "loss": 0.0457, + "step": 3680 + }, + { + "epoch": 13.76865671641791, + "grad_norm": 0.26362067461013794, + "learning_rate": 9.513519447965595e-05, + "loss": 0.0495, + "step": 3690 + }, + { + "epoch": 13.805970149253731, + "grad_norm": 0.3240712583065033, + "learning_rate": 9.509956150664796e-05, + "loss": 0.0496, + "step": 3700 + }, + { + "epoch": 13.843283582089553, + "grad_norm": 0.21009013056755066, + "learning_rate": 9.50638052331609e-05, + "loss": 0.0457, + "step": 3710 + }, + { + "epoch": 13.880597014925373, + "grad_norm": 0.1669154316186905, + "learning_rate": 9.502792575695112e-05, + "loss": 0.0496, + "step": 3720 + }, + { + "epoch": 13.917910447761194, + "grad_norm": 0.22347605228424072, + "learning_rate": 9.499192317611167e-05, + "loss": 0.0426, + "step": 3730 + }, + { + "epoch": 13.955223880597014, + "grad_norm": 0.15208907425403595, + "learning_rate": 9.49557975890723e-05, + "loss": 0.0447, + "step": 3740 + }, + { + "epoch": 13.992537313432836, + "grad_norm": 0.3206101059913635, + "learning_rate": 9.491954909459895e-05, + "loss": 0.0471, + "step": 3750 + }, + { + "epoch": 14.029850746268657, + "grad_norm": 0.15873713791370392, + "learning_rate": 9.488317779179361e-05, + "loss": 0.0401, + "step": 3760 + }, + { + "epoch": 14.067164179104477, + "grad_norm": 0.19690357148647308, + "learning_rate": 9.484668378009408e-05, + "loss": 0.0491, + "step": 3770 + }, + { + "epoch": 14.104477611940299, + "grad_norm": 0.3211113214492798, + "learning_rate": 9.481006715927351e-05, + "loss": 0.049, + "step": 3780 + }, + { + "epoch": 14.14179104477612, + "grad_norm": 0.27657604217529297, + "learning_rate": 9.477332802944044e-05, + "loss": 0.0396, + "step": 3790 + }, + { + "epoch": 14.17910447761194, + "grad_norm": 0.20194031298160553, + "learning_rate": 9.473646649103818e-05, + "loss": 0.0442, + "step": 3800 + }, + { + "epoch": 14.216417910447761, + "grad_norm": 0.20344595611095428, + "learning_rate": 9.46994826448448e-05, + "loss": 0.0427, + "step": 3810 + }, + { + "epoch": 14.253731343283581, + "grad_norm": 0.2067718505859375, + "learning_rate": 9.46623765919727e-05, + "loss": 0.0501, + "step": 3820 + }, + { + "epoch": 14.291044776119403, + "grad_norm": 0.29719170928001404, + "learning_rate": 9.462514843386845e-05, + "loss": 0.0519, + "step": 3830 + }, + { + "epoch": 14.328358208955224, + "grad_norm": 0.2347182184457779, + "learning_rate": 9.458779827231237e-05, + "loss": 0.0413, + "step": 3840 + }, + { + "epoch": 14.365671641791044, + "grad_norm": 0.1558852344751358, + "learning_rate": 9.45503262094184e-05, + "loss": 0.0442, + "step": 3850 + }, + { + "epoch": 14.402985074626866, + "grad_norm": 0.23085005581378937, + "learning_rate": 9.451273234763371e-05, + "loss": 0.047, + "step": 3860 + }, + { + "epoch": 14.440298507462687, + "grad_norm": 0.1515151560306549, + "learning_rate": 9.447501678973852e-05, + "loss": 0.0481, + "step": 3870 + }, + { + "epoch": 14.477611940298507, + "grad_norm": 0.1916729211807251, + "learning_rate": 9.443717963884569e-05, + "loss": 0.0474, + "step": 3880 + }, + { + "epoch": 14.514925373134329, + "grad_norm": 0.2536492943763733, + "learning_rate": 9.439922099840054e-05, + "loss": 0.0382, + "step": 3890 + }, + { + "epoch": 14.552238805970148, + "grad_norm": 0.1672086864709854, + "learning_rate": 9.43611409721806e-05, + "loss": 0.0497, + "step": 3900 + }, + { + "epoch": 14.58955223880597, + "grad_norm": 0.3644237518310547, + "learning_rate": 9.432293966429514e-05, + "loss": 0.0444, + "step": 3910 + }, + { + "epoch": 14.626865671641792, + "grad_norm": 0.20307251811027527, + "learning_rate": 9.428461717918511e-05, + "loss": 0.0452, + "step": 3920 + }, + { + "epoch": 14.664179104477611, + "grad_norm": 0.20441733300685883, + "learning_rate": 9.424617362162271e-05, + "loss": 0.0454, + "step": 3930 + }, + { + "epoch": 14.701492537313433, + "grad_norm": 0.26315611600875854, + "learning_rate": 9.420760909671118e-05, + "loss": 0.0486, + "step": 3940 + }, + { + "epoch": 14.738805970149254, + "grad_norm": 0.1983092874288559, + "learning_rate": 9.416892370988444e-05, + "loss": 0.0483, + "step": 3950 + }, + { + "epoch": 14.776119402985074, + "grad_norm": 0.18301443755626678, + "learning_rate": 9.413011756690685e-05, + "loss": 0.0456, + "step": 3960 + }, + { + "epoch": 14.813432835820896, + "grad_norm": 0.2433597594499588, + "learning_rate": 9.409119077387294e-05, + "loss": 0.0463, + "step": 3970 + }, + { + "epoch": 14.850746268656717, + "grad_norm": 0.27949392795562744, + "learning_rate": 9.405214343720707e-05, + "loss": 0.0412, + "step": 3980 + }, + { + "epoch": 14.888059701492537, + "grad_norm": 0.22806599736213684, + "learning_rate": 9.401297566366318e-05, + "loss": 0.0448, + "step": 3990 + }, + { + "epoch": 14.925373134328359, + "grad_norm": 0.25421562790870667, + "learning_rate": 9.397368756032445e-05, + "loss": 0.0426, + "step": 4000 + }, + { + "epoch": 14.962686567164178, + "grad_norm": 0.2436474859714508, + "learning_rate": 9.393427923460308e-05, + "loss": 0.0474, + "step": 4010 + }, + { + "epoch": 15.0, + "grad_norm": 0.3756405711174011, + "learning_rate": 9.389475079423988e-05, + "loss": 0.0438, + "step": 4020 + }, + { + "epoch": 15.037313432835822, + "grad_norm": 0.25687697529792786, + "learning_rate": 9.385510234730415e-05, + "loss": 0.0435, + "step": 4030 + }, + { + "epoch": 15.074626865671641, + "grad_norm": 0.17263716459274292, + "learning_rate": 9.381533400219318e-05, + "loss": 0.0455, + "step": 4040 + }, + { + "epoch": 15.111940298507463, + "grad_norm": 0.2471216470003128, + "learning_rate": 9.377544586763215e-05, + "loss": 0.0429, + "step": 4050 + }, + { + "epoch": 15.149253731343283, + "grad_norm": 0.20195460319519043, + "learning_rate": 9.373543805267368e-05, + "loss": 0.0432, + "step": 4060 + }, + { + "epoch": 15.186567164179104, + "grad_norm": 0.1709851622581482, + "learning_rate": 9.369531066669758e-05, + "loss": 0.0477, + "step": 4070 + }, + { + "epoch": 15.223880597014926, + "grad_norm": 0.23063932359218597, + "learning_rate": 9.365506381941066e-05, + "loss": 0.0379, + "step": 4080 + }, + { + "epoch": 15.261194029850746, + "grad_norm": 0.3265426754951477, + "learning_rate": 9.36146976208462e-05, + "loss": 0.0435, + "step": 4090 + }, + { + "epoch": 15.298507462686567, + "grad_norm": 0.26373934745788574, + "learning_rate": 9.357421218136386e-05, + "loss": 0.047, + "step": 4100 + }, + { + "epoch": 15.335820895522389, + "grad_norm": 0.16861388087272644, + "learning_rate": 9.353360761164931e-05, + "loss": 0.0448, + "step": 4110 + }, + { + "epoch": 15.373134328358208, + "grad_norm": 0.303790807723999, + "learning_rate": 9.349288402271388e-05, + "loss": 0.0396, + "step": 4120 + }, + { + "epoch": 15.41044776119403, + "grad_norm": 0.1940719038248062, + "learning_rate": 9.345204152589428e-05, + "loss": 0.0474, + "step": 4130 + }, + { + "epoch": 15.447761194029852, + "grad_norm": 0.34091615676879883, + "learning_rate": 9.341108023285238e-05, + "loss": 0.0424, + "step": 4140 + }, + { + "epoch": 15.485074626865671, + "grad_norm": 0.27036693692207336, + "learning_rate": 9.337000025557476e-05, + "loss": 0.0482, + "step": 4150 + }, + { + "epoch": 15.522388059701493, + "grad_norm": 0.16908007860183716, + "learning_rate": 9.332880170637252e-05, + "loss": 0.0381, + "step": 4160 + }, + { + "epoch": 15.559701492537313, + "grad_norm": 0.23332923650741577, + "learning_rate": 9.328748469788093e-05, + "loss": 0.0427, + "step": 4170 + }, + { + "epoch": 15.597014925373134, + "grad_norm": 0.16899706423282623, + "learning_rate": 9.32460493430591e-05, + "loss": 0.0439, + "step": 4180 + }, + { + "epoch": 15.634328358208956, + "grad_norm": 0.12869524955749512, + "learning_rate": 9.320449575518972e-05, + "loss": 0.0481, + "step": 4190 + }, + { + "epoch": 15.671641791044776, + "grad_norm": 0.21159130334854126, + "learning_rate": 9.316282404787871e-05, + "loss": 0.0446, + "step": 4200 + }, + { + "epoch": 15.708955223880597, + "grad_norm": 0.1849961131811142, + "learning_rate": 9.31210343350549e-05, + "loss": 0.041, + "step": 4210 + }, + { + "epoch": 15.746268656716419, + "grad_norm": 0.16107840836048126, + "learning_rate": 9.30791267309698e-05, + "loss": 0.0429, + "step": 4220 + }, + { + "epoch": 15.783582089552239, + "grad_norm": 0.14206446707248688, + "learning_rate": 9.30371013501972e-05, + "loss": 0.0409, + "step": 4230 + }, + { + "epoch": 15.82089552238806, + "grad_norm": 0.2168441116809845, + "learning_rate": 9.299495830763286e-05, + "loss": 0.0413, + "step": 4240 + }, + { + "epoch": 15.85820895522388, + "grad_norm": 0.21431951224803925, + "learning_rate": 9.295269771849427e-05, + "loss": 0.0472, + "step": 4250 + }, + { + "epoch": 15.895522388059701, + "grad_norm": 0.16851255297660828, + "learning_rate": 9.291031969832026e-05, + "loss": 0.0508, + "step": 4260 + }, + { + "epoch": 15.932835820895523, + "grad_norm": 0.18404732644557953, + "learning_rate": 9.286782436297073e-05, + "loss": 0.0402, + "step": 4270 + }, + { + "epoch": 15.970149253731343, + "grad_norm": 0.21722930669784546, + "learning_rate": 9.282521182862629e-05, + "loss": 0.0397, + "step": 4280 + }, + { + "epoch": 16.007462686567163, + "grad_norm": 0.2523709833621979, + "learning_rate": 9.278248221178798e-05, + "loss": 0.0427, + "step": 4290 + }, + { + "epoch": 16.044776119402986, + "grad_norm": 0.17736563086509705, + "learning_rate": 9.273963562927695e-05, + "loss": 0.0458, + "step": 4300 + }, + { + "epoch": 16.082089552238806, + "grad_norm": 0.20613858103752136, + "learning_rate": 9.269667219823412e-05, + "loss": 0.0387, + "step": 4310 + }, + { + "epoch": 16.119402985074625, + "grad_norm": 0.16557513177394867, + "learning_rate": 9.265359203611987e-05, + "loss": 0.0411, + "step": 4320 + }, + { + "epoch": 16.15671641791045, + "grad_norm": 0.28119519352912903, + "learning_rate": 9.261039526071374e-05, + "loss": 0.0468, + "step": 4330 + }, + { + "epoch": 16.19402985074627, + "grad_norm": 0.21538576483726501, + "learning_rate": 9.256708199011401e-05, + "loss": 0.0368, + "step": 4340 + }, + { + "epoch": 16.23134328358209, + "grad_norm": 0.19657357037067413, + "learning_rate": 9.252365234273755e-05, + "loss": 0.038, + "step": 4350 + }, + { + "epoch": 16.26865671641791, + "grad_norm": 0.19258421659469604, + "learning_rate": 9.248010643731935e-05, + "loss": 0.0414, + "step": 4360 + }, + { + "epoch": 16.30597014925373, + "grad_norm": 0.28801625967025757, + "learning_rate": 9.243644439291223e-05, + "loss": 0.0387, + "step": 4370 + }, + { + "epoch": 16.34328358208955, + "grad_norm": 0.16581468284130096, + "learning_rate": 9.239266632888659e-05, + "loss": 0.0383, + "step": 4380 + }, + { + "epoch": 16.380597014925375, + "grad_norm": 0.34664949774742126, + "learning_rate": 9.234877236492997e-05, + "loss": 0.0453, + "step": 4390 + }, + { + "epoch": 16.417910447761194, + "grad_norm": 0.1439947783946991, + "learning_rate": 9.230476262104677e-05, + "loss": 0.0466, + "step": 4400 + }, + { + "epoch": 16.455223880597014, + "grad_norm": 0.15509940683841705, + "learning_rate": 9.226063721755799e-05, + "loss": 0.0488, + "step": 4410 + }, + { + "epoch": 16.492537313432837, + "grad_norm": 0.18005985021591187, + "learning_rate": 9.221639627510076e-05, + "loss": 0.0407, + "step": 4420 + }, + { + "epoch": 16.529850746268657, + "grad_norm": 0.16012470424175262, + "learning_rate": 9.217203991462815e-05, + "loss": 0.0394, + "step": 4430 + }, + { + "epoch": 16.567164179104477, + "grad_norm": 0.2978847920894623, + "learning_rate": 9.212756825740873e-05, + "loss": 0.0451, + "step": 4440 + }, + { + "epoch": 16.604477611940297, + "grad_norm": 0.2236834019422531, + "learning_rate": 9.208298142502636e-05, + "loss": 0.0487, + "step": 4450 + }, + { + "epoch": 16.64179104477612, + "grad_norm": 0.2686060667037964, + "learning_rate": 9.20382795393797e-05, + "loss": 0.0403, + "step": 4460 + }, + { + "epoch": 16.67910447761194, + "grad_norm": 0.33534038066864014, + "learning_rate": 9.199346272268199e-05, + "loss": 0.0385, + "step": 4470 + }, + { + "epoch": 16.71641791044776, + "grad_norm": 0.19250528514385223, + "learning_rate": 9.194853109746074e-05, + "loss": 0.0441, + "step": 4480 + }, + { + "epoch": 16.753731343283583, + "grad_norm": 0.19218407571315765, + "learning_rate": 9.190348478655724e-05, + "loss": 0.0474, + "step": 4490 + }, + { + "epoch": 16.791044776119403, + "grad_norm": 0.21163488924503326, + "learning_rate": 9.185832391312644e-05, + "loss": 0.0411, + "step": 4500 + }, + { + "epoch": 16.828358208955223, + "grad_norm": 0.1758819818496704, + "learning_rate": 9.18130486006364e-05, + "loss": 0.0462, + "step": 4510 + }, + { + "epoch": 16.865671641791046, + "grad_norm": 0.18571069836616516, + "learning_rate": 9.176765897286813e-05, + "loss": 0.0425, + "step": 4520 + }, + { + "epoch": 16.902985074626866, + "grad_norm": 0.20819155871868134, + "learning_rate": 9.17221551539151e-05, + "loss": 0.0428, + "step": 4530 + }, + { + "epoch": 16.940298507462686, + "grad_norm": 0.30357328057289124, + "learning_rate": 9.167653726818305e-05, + "loss": 0.0414, + "step": 4540 + }, + { + "epoch": 16.97761194029851, + "grad_norm": 0.20977462828159332, + "learning_rate": 9.163080544038952e-05, + "loss": 0.0447, + "step": 4550 + }, + { + "epoch": 17.01492537313433, + "grad_norm": 0.2535971701145172, + "learning_rate": 9.158495979556358e-05, + "loss": 0.0384, + "step": 4560 + }, + { + "epoch": 17.05223880597015, + "grad_norm": 0.2789897620677948, + "learning_rate": 9.153900045904549e-05, + "loss": 0.042, + "step": 4570 + }, + { + "epoch": 17.08955223880597, + "grad_norm": 0.18474848568439484, + "learning_rate": 9.14929275564863e-05, + "loss": 0.0398, + "step": 4580 + }, + { + "epoch": 17.12686567164179, + "grad_norm": 0.12615208327770233, + "learning_rate": 9.144674121384757e-05, + "loss": 0.0466, + "step": 4590 + }, + { + "epoch": 17.16417910447761, + "grad_norm": 0.17756640911102295, + "learning_rate": 9.140044155740101e-05, + "loss": 0.035, + "step": 4600 + }, + { + "epoch": 17.20149253731343, + "grad_norm": 0.24410821497440338, + "learning_rate": 9.135402871372808e-05, + "loss": 0.0459, + "step": 4610 + }, + { + "epoch": 17.238805970149254, + "grad_norm": 0.21573011577129364, + "learning_rate": 9.130750280971978e-05, + "loss": 0.0385, + "step": 4620 + }, + { + "epoch": 17.276119402985074, + "grad_norm": 0.13879653811454773, + "learning_rate": 9.126086397257612e-05, + "loss": 0.0391, + "step": 4630 + }, + { + "epoch": 17.313432835820894, + "grad_norm": 0.17508305609226227, + "learning_rate": 9.121411232980588e-05, + "loss": 0.038, + "step": 4640 + }, + { + "epoch": 17.350746268656717, + "grad_norm": 0.2536008358001709, + "learning_rate": 9.116724800922629e-05, + "loss": 0.0418, + "step": 4650 + }, + { + "epoch": 17.388059701492537, + "grad_norm": 0.1942976713180542, + "learning_rate": 9.112027113896262e-05, + "loss": 0.052, + "step": 4660 + }, + { + "epoch": 17.425373134328357, + "grad_norm": 0.16561119258403778, + "learning_rate": 9.107318184744781e-05, + "loss": 0.0451, + "step": 4670 + }, + { + "epoch": 17.46268656716418, + "grad_norm": 0.22971832752227783, + "learning_rate": 9.102598026342222e-05, + "loss": 0.0407, + "step": 4680 + }, + { + "epoch": 17.5, + "grad_norm": 0.1306753158569336, + "learning_rate": 9.097866651593317e-05, + "loss": 0.042, + "step": 4690 + }, + { + "epoch": 17.53731343283582, + "grad_norm": 0.21278400719165802, + "learning_rate": 9.093124073433463e-05, + "loss": 0.0458, + "step": 4700 + }, + { + "epoch": 17.574626865671643, + "grad_norm": 0.22757171094417572, + "learning_rate": 9.088370304828685e-05, + "loss": 0.0364, + "step": 4710 + }, + { + "epoch": 17.611940298507463, + "grad_norm": 0.216596320271492, + "learning_rate": 9.083605358775612e-05, + "loss": 0.0434, + "step": 4720 + }, + { + "epoch": 17.649253731343283, + "grad_norm": 0.13022471964359283, + "learning_rate": 9.078829248301417e-05, + "loss": 0.0415, + "step": 4730 + }, + { + "epoch": 17.686567164179106, + "grad_norm": 0.2280716598033905, + "learning_rate": 9.074041986463808e-05, + "loss": 0.0385, + "step": 4740 + }, + { + "epoch": 17.723880597014926, + "grad_norm": 0.14666135609149933, + "learning_rate": 9.069243586350975e-05, + "loss": 0.0347, + "step": 4750 + }, + { + "epoch": 17.761194029850746, + "grad_norm": 0.1631281077861786, + "learning_rate": 9.064434061081562e-05, + "loss": 0.0407, + "step": 4760 + }, + { + "epoch": 17.798507462686565, + "grad_norm": 0.18697327375411987, + "learning_rate": 9.059613423804623e-05, + "loss": 0.0425, + "step": 4770 + }, + { + "epoch": 17.83582089552239, + "grad_norm": 0.12955111265182495, + "learning_rate": 9.0547816876996e-05, + "loss": 0.0417, + "step": 4780 + }, + { + "epoch": 17.87313432835821, + "grad_norm": 0.15547148883342743, + "learning_rate": 9.049938865976275e-05, + "loss": 0.0409, + "step": 4790 + }, + { + "epoch": 17.91044776119403, + "grad_norm": 0.1900598704814911, + "learning_rate": 9.045084971874738e-05, + "loss": 0.0369, + "step": 4800 + }, + { + "epoch": 17.94776119402985, + "grad_norm": 0.1846715807914734, + "learning_rate": 9.040220018665347e-05, + "loss": 0.0415, + "step": 4810 + }, + { + "epoch": 17.98507462686567, + "grad_norm": 0.1829937845468521, + "learning_rate": 9.035344019648702e-05, + "loss": 0.0407, + "step": 4820 + }, + { + "epoch": 18.02238805970149, + "grad_norm": 0.25900354981422424, + "learning_rate": 9.030456988155596e-05, + "loss": 0.0398, + "step": 4830 + }, + { + "epoch": 18.059701492537314, + "grad_norm": 0.21235992014408112, + "learning_rate": 9.025558937546988e-05, + "loss": 0.0477, + "step": 4840 + }, + { + "epoch": 18.097014925373134, + "grad_norm": 0.18785078823566437, + "learning_rate": 9.020649881213958e-05, + "loss": 0.039, + "step": 4850 + }, + { + "epoch": 18.134328358208954, + "grad_norm": 0.1951548010110855, + "learning_rate": 9.015729832577681e-05, + "loss": 0.0357, + "step": 4860 + }, + { + "epoch": 18.171641791044777, + "grad_norm": 0.1280934363603592, + "learning_rate": 9.010798805089384e-05, + "loss": 0.0425, + "step": 4870 + }, + { + "epoch": 18.208955223880597, + "grad_norm": 0.1693423092365265, + "learning_rate": 9.005856812230304e-05, + "loss": 0.0447, + "step": 4880 + }, + { + "epoch": 18.246268656716417, + "grad_norm": 0.23712658882141113, + "learning_rate": 9.000903867511666e-05, + "loss": 0.042, + "step": 4890 + }, + { + "epoch": 18.28358208955224, + "grad_norm": 0.26489710807800293, + "learning_rate": 8.995939984474624e-05, + "loss": 0.0457, + "step": 4900 + }, + { + "epoch": 18.32089552238806, + "grad_norm": 0.20792756974697113, + "learning_rate": 8.990965176690252e-05, + "loss": 0.0422, + "step": 4910 + }, + { + "epoch": 18.35820895522388, + "grad_norm": 0.18526089191436768, + "learning_rate": 8.98597945775948e-05, + "loss": 0.0366, + "step": 4920 + }, + { + "epoch": 18.395522388059703, + "grad_norm": 0.2214607298374176, + "learning_rate": 8.980982841313074e-05, + "loss": 0.0405, + "step": 4930 + }, + { + "epoch": 18.432835820895523, + "grad_norm": 0.1896953135728836, + "learning_rate": 8.975975341011596e-05, + "loss": 0.0391, + "step": 4940 + }, + { + "epoch": 18.470149253731343, + "grad_norm": 0.1430232971906662, + "learning_rate": 8.970956970545355e-05, + "loss": 0.0403, + "step": 4950 + }, + { + "epoch": 18.507462686567163, + "grad_norm": 0.1991272121667862, + "learning_rate": 8.965927743634391e-05, + "loss": 0.0429, + "step": 4960 + }, + { + "epoch": 18.544776119402986, + "grad_norm": 0.2361849844455719, + "learning_rate": 8.96088767402841e-05, + "loss": 0.0416, + "step": 4970 + }, + { + "epoch": 18.582089552238806, + "grad_norm": 0.25857019424438477, + "learning_rate": 8.955836775506776e-05, + "loss": 0.0461, + "step": 4980 + }, + { + "epoch": 18.619402985074625, + "grad_norm": 0.12873682379722595, + "learning_rate": 8.950775061878453e-05, + "loss": 0.035, + "step": 4990 + }, + { + "epoch": 18.65671641791045, + "grad_norm": 0.19786769151687622, + "learning_rate": 8.945702546981969e-05, + "loss": 0.0399, + "step": 5000 + }, + { + "epoch": 18.69402985074627, + "grad_norm": 0.2562239170074463, + "learning_rate": 8.940619244685388e-05, + "loss": 0.0372, + "step": 5010 + }, + { + "epoch": 18.73134328358209, + "grad_norm": 0.14586858451366425, + "learning_rate": 8.935525168886262e-05, + "loss": 0.0427, + "step": 5020 + }, + { + "epoch": 18.76865671641791, + "grad_norm": 0.20062318444252014, + "learning_rate": 8.930420333511606e-05, + "loss": 0.0403, + "step": 5030 + }, + { + "epoch": 18.80597014925373, + "grad_norm": 0.22698874771595, + "learning_rate": 8.92530475251784e-05, + "loss": 0.036, + "step": 5040 + }, + { + "epoch": 18.84328358208955, + "grad_norm": 0.2103697657585144, + "learning_rate": 8.920178439890765e-05, + "loss": 0.0431, + "step": 5050 + }, + { + "epoch": 18.880597014925375, + "grad_norm": 0.16042308509349823, + "learning_rate": 8.91504140964553e-05, + "loss": 0.0388, + "step": 5060 + }, + { + "epoch": 18.917910447761194, + "grad_norm": 0.16874109208583832, + "learning_rate": 8.909893675826574e-05, + "loss": 0.0388, + "step": 5070 + }, + { + "epoch": 18.955223880597014, + "grad_norm": 0.15569192171096802, + "learning_rate": 8.90473525250761e-05, + "loss": 0.0353, + "step": 5080 + }, + { + "epoch": 18.992537313432837, + "grad_norm": 0.16723507642745972, + "learning_rate": 8.899566153791566e-05, + "loss": 0.0443, + "step": 5090 + }, + { + "epoch": 19.029850746268657, + "grad_norm": 0.23284228146076202, + "learning_rate": 8.894386393810563e-05, + "loss": 0.05, + "step": 5100 + }, + { + "epoch": 19.067164179104477, + "grad_norm": 0.1621718853712082, + "learning_rate": 8.889195986725865e-05, + "loss": 0.0369, + "step": 5110 + }, + { + "epoch": 19.104477611940297, + "grad_norm": 0.17522747814655304, + "learning_rate": 8.883994946727849e-05, + "loss": 0.0475, + "step": 5120 + }, + { + "epoch": 19.14179104477612, + "grad_norm": 0.16110533475875854, + "learning_rate": 8.878783288035957e-05, + "loss": 0.0383, + "step": 5130 + }, + { + "epoch": 19.17910447761194, + "grad_norm": 0.2574177086353302, + "learning_rate": 8.873561024898668e-05, + "loss": 0.0383, + "step": 5140 + }, + { + "epoch": 19.21641791044776, + "grad_norm": 0.14560100436210632, + "learning_rate": 8.868328171593448e-05, + "loss": 0.037, + "step": 5150 + }, + { + "epoch": 19.253731343283583, + "grad_norm": 0.14456631243228912, + "learning_rate": 8.863084742426719e-05, + "loss": 0.0423, + "step": 5160 + }, + { + "epoch": 19.291044776119403, + "grad_norm": 0.1403595507144928, + "learning_rate": 8.857830751733815e-05, + "loss": 0.0327, + "step": 5170 + }, + { + "epoch": 19.328358208955223, + "grad_norm": 0.18462564051151276, + "learning_rate": 8.852566213878947e-05, + "loss": 0.037, + "step": 5180 + }, + { + "epoch": 19.365671641791046, + "grad_norm": 0.20725117623806, + "learning_rate": 8.84729114325516e-05, + "loss": 0.0376, + "step": 5190 + }, + { + "epoch": 19.402985074626866, + "grad_norm": 0.17023132741451263, + "learning_rate": 8.842005554284296e-05, + "loss": 0.0467, + "step": 5200 + }, + { + "epoch": 19.440298507462686, + "grad_norm": 0.31033241748809814, + "learning_rate": 8.836709461416952e-05, + "loss": 0.0425, + "step": 5210 + }, + { + "epoch": 19.47761194029851, + "grad_norm": 0.14057482779026031, + "learning_rate": 8.831402879132446e-05, + "loss": 0.0432, + "step": 5220 + }, + { + "epoch": 19.51492537313433, + "grad_norm": 0.23247437179088593, + "learning_rate": 8.82608582193877e-05, + "loss": 0.0396, + "step": 5230 + }, + { + "epoch": 19.55223880597015, + "grad_norm": 0.1305907964706421, + "learning_rate": 8.820758304372557e-05, + "loss": 0.0389, + "step": 5240 + }, + { + "epoch": 19.58955223880597, + "grad_norm": 0.17093417048454285, + "learning_rate": 8.815420340999033e-05, + "loss": 0.0347, + "step": 5250 + }, + { + "epoch": 19.62686567164179, + "grad_norm": 0.24105240404605865, + "learning_rate": 8.810071946411989e-05, + "loss": 0.0392, + "step": 5260 + }, + { + "epoch": 19.66417910447761, + "grad_norm": 0.2234315127134323, + "learning_rate": 8.804713135233731e-05, + "loss": 0.0403, + "step": 5270 + }, + { + "epoch": 19.701492537313435, + "grad_norm": 0.16947844624519348, + "learning_rate": 8.799343922115044e-05, + "loss": 0.0368, + "step": 5280 + }, + { + "epoch": 19.738805970149254, + "grad_norm": 0.26133742928504944, + "learning_rate": 8.79396432173515e-05, + "loss": 0.041, + "step": 5290 + }, + { + "epoch": 19.776119402985074, + "grad_norm": 0.2099352777004242, + "learning_rate": 8.788574348801675e-05, + "loss": 0.0363, + "step": 5300 + }, + { + "epoch": 19.813432835820894, + "grad_norm": 0.1662513017654419, + "learning_rate": 8.783174018050594e-05, + "loss": 0.0409, + "step": 5310 + }, + { + "epoch": 19.850746268656717, + "grad_norm": 0.18933714926242828, + "learning_rate": 8.77776334424621e-05, + "loss": 0.0348, + "step": 5320 + }, + { + "epoch": 19.888059701492537, + "grad_norm": 0.21673552691936493, + "learning_rate": 8.772342342181095e-05, + "loss": 0.037, + "step": 5330 + }, + { + "epoch": 19.925373134328357, + "grad_norm": 0.13009892404079437, + "learning_rate": 8.766911026676064e-05, + "loss": 0.0386, + "step": 5340 + }, + { + "epoch": 19.96268656716418, + "grad_norm": 0.1655230075120926, + "learning_rate": 8.761469412580125e-05, + "loss": 0.0404, + "step": 5350 + }, + { + "epoch": 20.0, + "grad_norm": 0.2821272611618042, + "learning_rate": 8.756017514770443e-05, + "loss": 0.0441, + "step": 5360 + }, + { + "epoch": 20.03731343283582, + "grad_norm": 0.1302652508020401, + "learning_rate": 8.750555348152298e-05, + "loss": 0.0389, + "step": 5370 + }, + { + "epoch": 20.074626865671643, + "grad_norm": 0.13331563770771027, + "learning_rate": 8.745082927659047e-05, + "loss": 0.0393, + "step": 5380 + }, + { + "epoch": 20.111940298507463, + "grad_norm": 0.244130939245224, + "learning_rate": 8.739600268252078e-05, + "loss": 0.0372, + "step": 5390 + }, + { + "epoch": 20.149253731343283, + "grad_norm": 0.20429308712482452, + "learning_rate": 8.73410738492077e-05, + "loss": 0.0387, + "step": 5400 + }, + { + "epoch": 20.186567164179106, + "grad_norm": 0.2954719364643097, + "learning_rate": 8.728604292682459e-05, + "loss": 0.0404, + "step": 5410 + }, + { + "epoch": 20.223880597014926, + "grad_norm": 0.20438429713249207, + "learning_rate": 8.723091006582389e-05, + "loss": 0.0359, + "step": 5420 + }, + { + "epoch": 20.261194029850746, + "grad_norm": 0.17289331555366516, + "learning_rate": 8.717567541693673e-05, + "loss": 0.0357, + "step": 5430 + }, + { + "epoch": 20.298507462686565, + "grad_norm": 0.24367138743400574, + "learning_rate": 8.71203391311725e-05, + "loss": 0.0392, + "step": 5440 + }, + { + "epoch": 20.33582089552239, + "grad_norm": 0.21900270879268646, + "learning_rate": 8.706490135981855e-05, + "loss": 0.0419, + "step": 5450 + }, + { + "epoch": 20.37313432835821, + "grad_norm": 0.1526443362236023, + "learning_rate": 8.700936225443959e-05, + "loss": 0.0333, + "step": 5460 + }, + { + "epoch": 20.41044776119403, + "grad_norm": 0.24582353234291077, + "learning_rate": 8.695372196687743e-05, + "loss": 0.0417, + "step": 5470 + }, + { + "epoch": 20.44776119402985, + "grad_norm": 0.21462485194206238, + "learning_rate": 8.689798064925049e-05, + "loss": 0.0347, + "step": 5480 + }, + { + "epoch": 20.48507462686567, + "grad_norm": 0.17611616849899292, + "learning_rate": 8.684213845395339e-05, + "loss": 0.0395, + "step": 5490 + }, + { + "epoch": 20.52238805970149, + "grad_norm": 0.19724012911319733, + "learning_rate": 8.678619553365659e-05, + "loss": 0.0332, + "step": 5500 + }, + { + "epoch": 20.559701492537314, + "grad_norm": 0.2080456167459488, + "learning_rate": 8.673015204130586e-05, + "loss": 0.0361, + "step": 5510 + }, + { + "epoch": 20.597014925373134, + "grad_norm": 0.21469220519065857, + "learning_rate": 8.6674008130122e-05, + "loss": 0.039, + "step": 5520 + }, + { + "epoch": 20.634328358208954, + "grad_norm": 0.242497980594635, + "learning_rate": 8.661776395360029e-05, + "loss": 0.0397, + "step": 5530 + }, + { + "epoch": 20.671641791044777, + "grad_norm": 0.20539864897727966, + "learning_rate": 8.656141966551019e-05, + "loss": 0.0392, + "step": 5540 + }, + { + "epoch": 20.708955223880597, + "grad_norm": 0.21964021027088165, + "learning_rate": 8.650497541989482e-05, + "loss": 0.035, + "step": 5550 + }, + { + "epoch": 20.746268656716417, + "grad_norm": 0.15793637931346893, + "learning_rate": 8.644843137107059e-05, + "loss": 0.0363, + "step": 5560 + }, + { + "epoch": 20.78358208955224, + "grad_norm": 0.1731041818857193, + "learning_rate": 8.639178767362676e-05, + "loss": 0.0371, + "step": 5570 + }, + { + "epoch": 20.82089552238806, + "grad_norm": 0.15019342303276062, + "learning_rate": 8.633504448242505e-05, + "loss": 0.0335, + "step": 5580 + }, + { + "epoch": 20.85820895522388, + "grad_norm": 0.1397496908903122, + "learning_rate": 8.627820195259918e-05, + "loss": 0.0391, + "step": 5590 + }, + { + "epoch": 20.895522388059703, + "grad_norm": 0.141131192445755, + "learning_rate": 8.622126023955446e-05, + "loss": 0.041, + "step": 5600 + }, + { + "epoch": 20.932835820895523, + "grad_norm": 0.20025403797626495, + "learning_rate": 8.616421949896734e-05, + "loss": 0.0412, + "step": 5610 + }, + { + "epoch": 20.970149253731343, + "grad_norm": 0.2251378893852234, + "learning_rate": 8.610707988678503e-05, + "loss": 0.037, + "step": 5620 + }, + { + "epoch": 21.007462686567163, + "grad_norm": 0.1341109722852707, + "learning_rate": 8.604984155922506e-05, + "loss": 0.0371, + "step": 5630 + }, + { + "epoch": 21.044776119402986, + "grad_norm": 0.28053462505340576, + "learning_rate": 8.599250467277483e-05, + "loss": 0.0366, + "step": 5640 + }, + { + "epoch": 21.082089552238806, + "grad_norm": 0.10567930340766907, + "learning_rate": 8.59350693841912e-05, + "loss": 0.0394, + "step": 5650 + }, + { + "epoch": 21.119402985074625, + "grad_norm": 0.17919886112213135, + "learning_rate": 8.587753585050004e-05, + "loss": 0.0357, + "step": 5660 + }, + { + "epoch": 21.15671641791045, + "grad_norm": 0.3223204016685486, + "learning_rate": 8.581990422899585e-05, + "loss": 0.0369, + "step": 5670 + }, + { + "epoch": 21.19402985074627, + "grad_norm": 0.20072297751903534, + "learning_rate": 8.576217467724128e-05, + "loss": 0.0389, + "step": 5680 + }, + { + "epoch": 21.23134328358209, + "grad_norm": 0.1556226760149002, + "learning_rate": 8.570434735306671e-05, + "loss": 0.035, + "step": 5690 + }, + { + "epoch": 21.26865671641791, + "grad_norm": 0.20265886187553406, + "learning_rate": 8.564642241456986e-05, + "loss": 0.0418, + "step": 5700 + }, + { + "epoch": 21.30597014925373, + "grad_norm": 0.15518955886363983, + "learning_rate": 8.558840002011528e-05, + "loss": 0.0331, + "step": 5710 + }, + { + "epoch": 21.34328358208955, + "grad_norm": 0.1822584569454193, + "learning_rate": 8.553028032833397e-05, + "loss": 0.0421, + "step": 5720 + }, + { + "epoch": 21.380597014925375, + "grad_norm": 0.14216330647468567, + "learning_rate": 8.547206349812298e-05, + "loss": 0.0413, + "step": 5730 + }, + { + "epoch": 21.417910447761194, + "grad_norm": 0.24156329035758972, + "learning_rate": 8.541374968864487e-05, + "loss": 0.0404, + "step": 5740 + }, + { + "epoch": 21.455223880597014, + "grad_norm": 0.2753167748451233, + "learning_rate": 8.535533905932738e-05, + "loss": 0.0369, + "step": 5750 + }, + { + "epoch": 21.492537313432837, + "grad_norm": 0.17052626609802246, + "learning_rate": 8.529683176986295e-05, + "loss": 0.0328, + "step": 5760 + }, + { + "epoch": 21.529850746268657, + "grad_norm": 0.11597824096679688, + "learning_rate": 8.523822798020827e-05, + "loss": 0.041, + "step": 5770 + }, + { + "epoch": 21.567164179104477, + "grad_norm": 0.14363346993923187, + "learning_rate": 8.517952785058385e-05, + "loss": 0.0393, + "step": 5780 + }, + { + "epoch": 21.604477611940297, + "grad_norm": 0.19373776018619537, + "learning_rate": 8.512073154147362e-05, + "loss": 0.0372, + "step": 5790 + }, + { + "epoch": 21.64179104477612, + "grad_norm": 0.20276981592178345, + "learning_rate": 8.506183921362443e-05, + "loss": 0.0389, + "step": 5800 + }, + { + "epoch": 21.67910447761194, + "grad_norm": 0.19267870485782623, + "learning_rate": 8.500285102804568e-05, + "loss": 0.0371, + "step": 5810 + }, + { + "epoch": 21.71641791044776, + "grad_norm": 0.2701839208602905, + "learning_rate": 8.494376714600878e-05, + "loss": 0.0333, + "step": 5820 + }, + { + "epoch": 21.753731343283583, + "grad_norm": 0.20612668991088867, + "learning_rate": 8.488458772904684e-05, + "loss": 0.0358, + "step": 5830 + }, + { + "epoch": 21.791044776119403, + "grad_norm": 0.18102902173995972, + "learning_rate": 8.482531293895412e-05, + "loss": 0.0376, + "step": 5840 + }, + { + "epoch": 21.828358208955223, + "grad_norm": 0.23202018439769745, + "learning_rate": 8.476594293778561e-05, + "loss": 0.0418, + "step": 5850 + }, + { + "epoch": 21.865671641791046, + "grad_norm": 0.09540139883756638, + "learning_rate": 8.470647788785665e-05, + "loss": 0.041, + "step": 5860 + }, + { + "epoch": 21.902985074626866, + "grad_norm": 0.23362809419631958, + "learning_rate": 8.46469179517424e-05, + "loss": 0.0402, + "step": 5870 + }, + { + "epoch": 21.940298507462686, + "grad_norm": 0.20929335057735443, + "learning_rate": 8.458726329227747e-05, + "loss": 0.0385, + "step": 5880 + }, + { + "epoch": 21.97761194029851, + "grad_norm": 0.18403425812721252, + "learning_rate": 8.452751407255541e-05, + "loss": 0.0399, + "step": 5890 + }, + { + "epoch": 22.01492537313433, + "grad_norm": 0.2034774273633957, + "learning_rate": 8.44676704559283e-05, + "loss": 0.0361, + "step": 5900 + }, + { + "epoch": 22.05223880597015, + "grad_norm": 0.14981597661972046, + "learning_rate": 8.44077326060063e-05, + "loss": 0.0393, + "step": 5910 + }, + { + "epoch": 22.08955223880597, + "grad_norm": 0.20903146266937256, + "learning_rate": 8.434770068665723e-05, + "loss": 0.0406, + "step": 5920 + }, + { + "epoch": 22.12686567164179, + "grad_norm": 0.12090307474136353, + "learning_rate": 8.428757486200603e-05, + "loss": 0.0349, + "step": 5930 + }, + { + "epoch": 22.16417910447761, + "grad_norm": 0.14085660874843597, + "learning_rate": 8.422735529643444e-05, + "loss": 0.0344, + "step": 5940 + }, + { + "epoch": 22.20149253731343, + "grad_norm": 0.30808404088020325, + "learning_rate": 8.416704215458043e-05, + "loss": 0.0298, + "step": 5950 + }, + { + "epoch": 22.238805970149254, + "grad_norm": 0.17409317195415497, + "learning_rate": 8.410663560133784e-05, + "loss": 0.035, + "step": 5960 + }, + { + "epoch": 22.276119402985074, + "grad_norm": 0.18731828033924103, + "learning_rate": 8.404613580185585e-05, + "loss": 0.0322, + "step": 5970 + }, + { + "epoch": 22.313432835820894, + "grad_norm": 0.16483667492866516, + "learning_rate": 8.398554292153866e-05, + "loss": 0.033, + "step": 5980 + }, + { + "epoch": 22.350746268656717, + "grad_norm": 0.195018008351326, + "learning_rate": 8.392485712604483e-05, + "loss": 0.0344, + "step": 5990 + }, + { + "epoch": 22.388059701492537, + "grad_norm": 0.18210549652576447, + "learning_rate": 8.386407858128706e-05, + "loss": 0.0387, + "step": 6000 + }, + { + "epoch": 22.425373134328357, + "grad_norm": 0.18658341467380524, + "learning_rate": 8.380320745343153e-05, + "loss": 0.0359, + "step": 6010 + }, + { + "epoch": 22.46268656716418, + "grad_norm": 0.260953426361084, + "learning_rate": 8.37422439088976e-05, + "loss": 0.0291, + "step": 6020 + }, + { + "epoch": 22.5, + "grad_norm": 0.2177930772304535, + "learning_rate": 8.368118811435726e-05, + "loss": 0.0384, + "step": 6030 + }, + { + "epoch": 22.53731343283582, + "grad_norm": 0.1596938520669937, + "learning_rate": 8.362004023673474e-05, + "loss": 0.0372, + "step": 6040 + }, + { + "epoch": 22.574626865671643, + "grad_norm": 0.21605637669563293, + "learning_rate": 8.355880044320598e-05, + "loss": 0.0304, + "step": 6050 + }, + { + "epoch": 22.611940298507463, + "grad_norm": 0.13812203705310822, + "learning_rate": 8.349746890119826e-05, + "loss": 0.0295, + "step": 6060 + }, + { + "epoch": 22.649253731343283, + "grad_norm": 0.22850565612316132, + "learning_rate": 8.343604577838964e-05, + "loss": 0.0385, + "step": 6070 + }, + { + "epoch": 22.686567164179106, + "grad_norm": 0.22924698889255524, + "learning_rate": 8.337453124270863e-05, + "loss": 0.0438, + "step": 6080 + }, + { + "epoch": 22.723880597014926, + "grad_norm": 0.1455918848514557, + "learning_rate": 8.331292546233362e-05, + "loss": 0.0358, + "step": 6090 + }, + { + "epoch": 22.761194029850746, + "grad_norm": 0.1839921921491623, + "learning_rate": 8.32512286056924e-05, + "loss": 0.0349, + "step": 6100 + }, + { + "epoch": 22.798507462686565, + "grad_norm": 0.24356882274150848, + "learning_rate": 8.318944084146192e-05, + "loss": 0.0335, + "step": 6110 + }, + { + "epoch": 22.83582089552239, + "grad_norm": 0.2336840182542801, + "learning_rate": 8.31275623385675e-05, + "loss": 0.0339, + "step": 6120 + }, + { + "epoch": 22.87313432835821, + "grad_norm": 0.17839699983596802, + "learning_rate": 8.306559326618259e-05, + "loss": 0.0365, + "step": 6130 + }, + { + "epoch": 22.91044776119403, + "grad_norm": 0.18088172376155853, + "learning_rate": 8.300353379372834e-05, + "loss": 0.0331, + "step": 6140 + }, + { + "epoch": 22.94776119402985, + "grad_norm": 0.1771453320980072, + "learning_rate": 8.29413840908729e-05, + "loss": 0.0321, + "step": 6150 + }, + { + "epoch": 22.98507462686567, + "grad_norm": 0.1374535709619522, + "learning_rate": 8.287914432753123e-05, + "loss": 0.0328, + "step": 6160 + }, + { + "epoch": 23.02238805970149, + "grad_norm": 0.17898012697696686, + "learning_rate": 8.281681467386446e-05, + "loss": 0.0408, + "step": 6170 + }, + { + "epoch": 23.059701492537314, + "grad_norm": 0.21729676425457, + "learning_rate": 8.275439530027948e-05, + "loss": 0.0354, + "step": 6180 + }, + { + "epoch": 23.097014925373134, + "grad_norm": 0.2473490685224533, + "learning_rate": 8.269188637742846e-05, + "loss": 0.0361, + "step": 6190 + }, + { + "epoch": 23.134328358208954, + "grad_norm": 0.15661069750785828, + "learning_rate": 8.262928807620843e-05, + "loss": 0.036, + "step": 6200 + }, + { + "epoch": 23.171641791044777, + "grad_norm": 0.12378236651420593, + "learning_rate": 8.256660056776076e-05, + "loss": 0.0308, + "step": 6210 + }, + { + "epoch": 23.208955223880597, + "grad_norm": 0.1373433768749237, + "learning_rate": 8.250382402347065e-05, + "loss": 0.0344, + "step": 6220 + }, + { + "epoch": 23.246268656716417, + "grad_norm": 0.14814983308315277, + "learning_rate": 8.244095861496686e-05, + "loss": 0.0368, + "step": 6230 + }, + { + "epoch": 23.28358208955224, + "grad_norm": 0.15903662145137787, + "learning_rate": 8.237800451412095e-05, + "loss": 0.033, + "step": 6240 + }, + { + "epoch": 23.32089552238806, + "grad_norm": 0.1676921397447586, + "learning_rate": 8.231496189304704e-05, + "loss": 0.0361, + "step": 6250 + }, + { + "epoch": 23.35820895522388, + "grad_norm": 0.2496129870414734, + "learning_rate": 8.225183092410128e-05, + "loss": 0.037, + "step": 6260 + }, + { + "epoch": 23.395522388059703, + "grad_norm": 0.1830875128507614, + "learning_rate": 8.218861177988129e-05, + "loss": 0.0377, + "step": 6270 + }, + { + "epoch": 23.432835820895523, + "grad_norm": 0.18538393080234528, + "learning_rate": 8.212530463322583e-05, + "loss": 0.0343, + "step": 6280 + }, + { + "epoch": 23.470149253731343, + "grad_norm": 0.23813718557357788, + "learning_rate": 8.206190965721419e-05, + "loss": 0.0336, + "step": 6290 + }, + { + "epoch": 23.507462686567163, + "grad_norm": 0.14053800702095032, + "learning_rate": 8.199842702516583e-05, + "loss": 0.0334, + "step": 6300 + }, + { + "epoch": 23.544776119402986, + "grad_norm": 0.19115787744522095, + "learning_rate": 8.193485691063985e-05, + "loss": 0.0338, + "step": 6310 + }, + { + "epoch": 23.582089552238806, + "grad_norm": 0.1176459789276123, + "learning_rate": 8.18711994874345e-05, + "loss": 0.0324, + "step": 6320 + }, + { + "epoch": 23.619402985074625, + "grad_norm": 0.13881400227546692, + "learning_rate": 8.180745492958674e-05, + "loss": 0.0375, + "step": 6330 + }, + { + "epoch": 23.65671641791045, + "grad_norm": 0.12102743983268738, + "learning_rate": 8.174362341137177e-05, + "loss": 0.0338, + "step": 6340 + }, + { + "epoch": 23.69402985074627, + "grad_norm": 0.16610436141490936, + "learning_rate": 8.167970510730253e-05, + "loss": 0.0296, + "step": 6350 + }, + { + "epoch": 23.73134328358209, + "grad_norm": 0.12234822660684586, + "learning_rate": 8.161570019212921e-05, + "loss": 0.029, + "step": 6360 + }, + { + "epoch": 23.76865671641791, + "grad_norm": 0.17056342959403992, + "learning_rate": 8.155160884083881e-05, + "loss": 0.0381, + "step": 6370 + }, + { + "epoch": 23.80597014925373, + "grad_norm": 0.1477614790201187, + "learning_rate": 8.148743122865463e-05, + "loss": 0.0315, + "step": 6380 + }, + { + "epoch": 23.84328358208955, + "grad_norm": 0.38320279121398926, + "learning_rate": 8.14231675310358e-05, + "loss": 0.0366, + "step": 6390 + }, + { + "epoch": 23.880597014925375, + "grad_norm": 0.1497313380241394, + "learning_rate": 8.135881792367686e-05, + "loss": 0.0325, + "step": 6400 + }, + { + "epoch": 23.917910447761194, + "grad_norm": 0.1574944257736206, + "learning_rate": 8.129438258250712e-05, + "loss": 0.0372, + "step": 6410 + }, + { + "epoch": 23.955223880597014, + "grad_norm": 0.17678116261959076, + "learning_rate": 8.12298616836904e-05, + "loss": 0.034, + "step": 6420 + }, + { + "epoch": 23.992537313432837, + "grad_norm": 0.13617518544197083, + "learning_rate": 8.116525540362434e-05, + "loss": 0.032, + "step": 6430 + }, + { + "epoch": 24.029850746268657, + "grad_norm": 0.1610628217458725, + "learning_rate": 8.110056391894005e-05, + "loss": 0.0295, + "step": 6440 + }, + { + "epoch": 24.067164179104477, + "grad_norm": 0.24379907548427582, + "learning_rate": 8.103578740650156e-05, + "loss": 0.0318, + "step": 6450 + }, + { + "epoch": 24.104477611940297, + "grad_norm": 0.15908868610858917, + "learning_rate": 8.097092604340542e-05, + "loss": 0.0285, + "step": 6460 + }, + { + "epoch": 24.14179104477612, + "grad_norm": 0.17211472988128662, + "learning_rate": 8.090598000698009e-05, + "loss": 0.0345, + "step": 6470 + }, + { + "epoch": 24.17910447761194, + "grad_norm": 0.10870133340358734, + "learning_rate": 8.084094947478556e-05, + "loss": 0.0349, + "step": 6480 + }, + { + "epoch": 24.21641791044776, + "grad_norm": 0.1614072173833847, + "learning_rate": 8.077583462461283e-05, + "loss": 0.0305, + "step": 6490 + }, + { + "epoch": 24.253731343283583, + "grad_norm": 0.1449541449546814, + "learning_rate": 8.07106356344834e-05, + "loss": 0.0326, + "step": 6500 + }, + { + "epoch": 24.291044776119403, + "grad_norm": 0.15968690812587738, + "learning_rate": 8.064535268264883e-05, + "loss": 0.0379, + "step": 6510 + }, + { + "epoch": 24.328358208955223, + "grad_norm": 0.2027505785226822, + "learning_rate": 8.057998594759022e-05, + "loss": 0.0368, + "step": 6520 + }, + { + "epoch": 24.365671641791046, + "grad_norm": 0.18664468824863434, + "learning_rate": 8.051453560801772e-05, + "loss": 0.041, + "step": 6530 + }, + { + "epoch": 24.402985074626866, + "grad_norm": 0.2137981504201889, + "learning_rate": 8.044900184287007e-05, + "loss": 0.036, + "step": 6540 + }, + { + "epoch": 24.440298507462686, + "grad_norm": 0.1381145715713501, + "learning_rate": 8.038338483131407e-05, + "loss": 0.0342, + "step": 6550 + }, + { + "epoch": 24.47761194029851, + "grad_norm": 0.2125469446182251, + "learning_rate": 8.031768475274413e-05, + "loss": 0.0363, + "step": 6560 + }, + { + "epoch": 24.51492537313433, + "grad_norm": 0.1482478678226471, + "learning_rate": 8.025190178678175e-05, + "loss": 0.0359, + "step": 6570 + }, + { + "epoch": 24.55223880597015, + "grad_norm": 0.17988649010658264, + "learning_rate": 8.018603611327504e-05, + "loss": 0.0388, + "step": 6580 + }, + { + "epoch": 24.58955223880597, + "grad_norm": 0.1568310409784317, + "learning_rate": 8.012008791229826e-05, + "loss": 0.0357, + "step": 6590 + }, + { + "epoch": 24.62686567164179, + "grad_norm": 0.17348839342594147, + "learning_rate": 8.005405736415126e-05, + "loss": 0.0348, + "step": 6600 + }, + { + "epoch": 24.66417910447761, + "grad_norm": 0.18807284533977509, + "learning_rate": 7.998794464935904e-05, + "loss": 0.0371, + "step": 6610 + }, + { + "epoch": 24.701492537313435, + "grad_norm": 0.12133855372667313, + "learning_rate": 7.992174994867123e-05, + "loss": 0.0376, + "step": 6620 + }, + { + "epoch": 24.738805970149254, + "grad_norm": 0.2808085083961487, + "learning_rate": 7.985547344306161e-05, + "loss": 0.0346, + "step": 6630 + }, + { + "epoch": 24.776119402985074, + "grad_norm": 0.13642264902591705, + "learning_rate": 7.978911531372765e-05, + "loss": 0.0365, + "step": 6640 + }, + { + "epoch": 24.813432835820894, + "grad_norm": 0.19014127552509308, + "learning_rate": 7.972267574208991e-05, + "loss": 0.0344, + "step": 6650 + }, + { + "epoch": 24.850746268656717, + "grad_norm": 0.16038668155670166, + "learning_rate": 7.965615490979163e-05, + "loss": 0.0339, + "step": 6660 + }, + { + "epoch": 24.888059701492537, + "grad_norm": 0.17937994003295898, + "learning_rate": 7.958955299869825e-05, + "loss": 0.0294, + "step": 6670 + }, + { + "epoch": 24.925373134328357, + "grad_norm": 0.19632326066493988, + "learning_rate": 7.952287019089685e-05, + "loss": 0.0365, + "step": 6680 + }, + { + "epoch": 24.96268656716418, + "grad_norm": 0.14519083499908447, + "learning_rate": 7.945610666869568e-05, + "loss": 0.0307, + "step": 6690 + }, + { + "epoch": 25.0, + "grad_norm": 0.17961327731609344, + "learning_rate": 7.938926261462366e-05, + "loss": 0.0348, + "step": 6700 + }, + { + "epoch": 25.03731343283582, + "grad_norm": 0.1272597312927246, + "learning_rate": 7.932233821142987e-05, + "loss": 0.0296, + "step": 6710 + }, + { + "epoch": 25.074626865671643, + "grad_norm": 0.21824714541435242, + "learning_rate": 7.925533364208309e-05, + "loss": 0.0333, + "step": 6720 + }, + { + "epoch": 25.111940298507463, + "grad_norm": 0.12162213027477264, + "learning_rate": 7.918824908977123e-05, + "loss": 0.0314, + "step": 6730 + }, + { + "epoch": 25.149253731343283, + "grad_norm": 0.17663027346134186, + "learning_rate": 7.912108473790092e-05, + "loss": 0.0395, + "step": 6740 + }, + { + "epoch": 25.186567164179106, + "grad_norm": 0.1496419459581375, + "learning_rate": 7.905384077009693e-05, + "loss": 0.0377, + "step": 6750 + }, + { + "epoch": 25.223880597014926, + "grad_norm": 0.15282033383846283, + "learning_rate": 7.898651737020166e-05, + "loss": 0.0308, + "step": 6760 + }, + { + "epoch": 25.261194029850746, + "grad_norm": 0.1586643010377884, + "learning_rate": 7.891911472227478e-05, + "loss": 0.031, + "step": 6770 + }, + { + "epoch": 25.298507462686565, + "grad_norm": 0.15809810161590576, + "learning_rate": 7.88516330105925e-05, + "loss": 0.0304, + "step": 6780 + }, + { + "epoch": 25.33582089552239, + "grad_norm": 0.19378156960010529, + "learning_rate": 7.878407241964729e-05, + "loss": 0.0336, + "step": 6790 + }, + { + "epoch": 25.37313432835821, + "grad_norm": 0.17140574753284454, + "learning_rate": 7.871643313414718e-05, + "loss": 0.0301, + "step": 6800 + }, + { + "epoch": 25.41044776119403, + "grad_norm": 0.1791999191045761, + "learning_rate": 7.864871533901544e-05, + "loss": 0.0381, + "step": 6810 + }, + { + "epoch": 25.44776119402985, + "grad_norm": 0.13285186886787415, + "learning_rate": 7.858091921938988e-05, + "loss": 0.0301, + "step": 6820 + }, + { + "epoch": 25.48507462686567, + "grad_norm": 0.1444288194179535, + "learning_rate": 7.851304496062254e-05, + "loss": 0.0257, + "step": 6830 + }, + { + "epoch": 25.52238805970149, + "grad_norm": 0.17137834429740906, + "learning_rate": 7.844509274827907e-05, + "loss": 0.0272, + "step": 6840 + }, + { + "epoch": 25.559701492537314, + "grad_norm": 0.17752587795257568, + "learning_rate": 7.837706276813819e-05, + "loss": 0.0343, + "step": 6850 + }, + { + "epoch": 25.597014925373134, + "grad_norm": 0.1934349089860916, + "learning_rate": 7.830895520619128e-05, + "loss": 0.0289, + "step": 6860 + }, + { + "epoch": 25.634328358208954, + "grad_norm": 0.21587027609348297, + "learning_rate": 7.824077024864179e-05, + "loss": 0.0349, + "step": 6870 + }, + { + "epoch": 25.671641791044777, + "grad_norm": 0.15302105247974396, + "learning_rate": 7.817250808190483e-05, + "loss": 0.0346, + "step": 6880 + }, + { + "epoch": 25.708955223880597, + "grad_norm": 0.15441982448101044, + "learning_rate": 7.810416889260653e-05, + "loss": 0.0403, + "step": 6890 + }, + { + "epoch": 25.746268656716417, + "grad_norm": 0.11743316054344177, + "learning_rate": 7.803575286758364e-05, + "loss": 0.0329, + "step": 6900 + }, + { + "epoch": 25.78358208955224, + "grad_norm": 0.1417740434408188, + "learning_rate": 7.796726019388295e-05, + "loss": 0.0346, + "step": 6910 + }, + { + "epoch": 25.82089552238806, + "grad_norm": 0.14196589589118958, + "learning_rate": 7.789869105876083e-05, + "loss": 0.0333, + "step": 6920 + }, + { + "epoch": 25.85820895522388, + "grad_norm": 0.17111234366893768, + "learning_rate": 7.783004564968263e-05, + "loss": 0.0403, + "step": 6930 + }, + { + "epoch": 25.895522388059703, + "grad_norm": 0.159880131483078, + "learning_rate": 7.776132415432234e-05, + "loss": 0.0281, + "step": 6940 + }, + { + "epoch": 25.932835820895523, + "grad_norm": 0.1706574410200119, + "learning_rate": 7.769252676056187e-05, + "loss": 0.0327, + "step": 6950 + }, + { + "epoch": 25.970149253731343, + "grad_norm": 0.20553110539913177, + "learning_rate": 7.762365365649067e-05, + "loss": 0.0345, + "step": 6960 + }, + { + "epoch": 26.007462686567163, + "grad_norm": 0.17439968883991241, + "learning_rate": 7.755470503040516e-05, + "loss": 0.0338, + "step": 6970 + }, + { + "epoch": 26.044776119402986, + "grad_norm": 0.1736845076084137, + "learning_rate": 7.748568107080832e-05, + "loss": 0.0309, + "step": 6980 + }, + { + "epoch": 26.082089552238806, + "grad_norm": 0.15822389721870422, + "learning_rate": 7.741658196640892e-05, + "loss": 0.0338, + "step": 6990 + }, + { + "epoch": 26.119402985074625, + "grad_norm": 0.24268855154514313, + "learning_rate": 7.734740790612136e-05, + "loss": 0.0356, + "step": 7000 + }, + { + "epoch": 26.15671641791045, + "grad_norm": 0.18587811291217804, + "learning_rate": 7.727815907906481e-05, + "loss": 0.0302, + "step": 7010 + }, + { + "epoch": 26.19402985074627, + "grad_norm": 0.18292690813541412, + "learning_rate": 7.720883567456298e-05, + "loss": 0.0337, + "step": 7020 + }, + { + "epoch": 26.23134328358209, + "grad_norm": 0.1578633338212967, + "learning_rate": 7.713943788214337e-05, + "loss": 0.0303, + "step": 7030 + }, + { + "epoch": 26.26865671641791, + "grad_norm": 0.21999752521514893, + "learning_rate": 7.70699658915369e-05, + "loss": 0.0289, + "step": 7040 + }, + { + "epoch": 26.30597014925373, + "grad_norm": 0.2454708218574524, + "learning_rate": 7.700041989267736e-05, + "loss": 0.0403, + "step": 7050 + }, + { + "epoch": 26.34328358208955, + "grad_norm": 0.1282198578119278, + "learning_rate": 7.693080007570084e-05, + "loss": 0.0378, + "step": 7060 + }, + { + "epoch": 26.380597014925375, + "grad_norm": 0.14491668343544006, + "learning_rate": 7.686110663094525e-05, + "loss": 0.0364, + "step": 7070 + }, + { + "epoch": 26.417910447761194, + "grad_norm": 0.2097531259059906, + "learning_rate": 7.679133974894983e-05, + "loss": 0.0306, + "step": 7080 + }, + { + "epoch": 26.455223880597014, + "grad_norm": 0.15113137662410736, + "learning_rate": 7.672149962045457e-05, + "loss": 0.0352, + "step": 7090 + }, + { + "epoch": 26.492537313432837, + "grad_norm": 0.1912260502576828, + "learning_rate": 7.66515864363997e-05, + "loss": 0.0279, + "step": 7100 + }, + { + "epoch": 26.529850746268657, + "grad_norm": 0.17578865587711334, + "learning_rate": 7.658160038792518e-05, + "loss": 0.0343, + "step": 7110 + }, + { + "epoch": 26.567164179104477, + "grad_norm": 0.13778840005397797, + "learning_rate": 7.651154166637025e-05, + "loss": 0.0286, + "step": 7120 + }, + { + "epoch": 26.604477611940297, + "grad_norm": 0.13315582275390625, + "learning_rate": 7.644141046327271e-05, + "loss": 0.0316, + "step": 7130 + }, + { + "epoch": 26.64179104477612, + "grad_norm": 0.2831936180591583, + "learning_rate": 7.637120697036866e-05, + "loss": 0.0341, + "step": 7140 + }, + { + "epoch": 26.67910447761194, + "grad_norm": 0.1701228767633438, + "learning_rate": 7.630093137959171e-05, + "loss": 0.039, + "step": 7150 + }, + { + "epoch": 26.71641791044776, + "grad_norm": 0.1315871775150299, + "learning_rate": 7.623058388307269e-05, + "loss": 0.0323, + "step": 7160 + }, + { + "epoch": 26.753731343283583, + "grad_norm": 0.12028495967388153, + "learning_rate": 7.616016467313891e-05, + "loss": 0.0343, + "step": 7170 + }, + { + "epoch": 26.791044776119403, + "grad_norm": 0.14795522391796112, + "learning_rate": 7.608967394231387e-05, + "loss": 0.0359, + "step": 7180 + }, + { + "epoch": 26.828358208955223, + "grad_norm": 0.08910278230905533, + "learning_rate": 7.60191118833165e-05, + "loss": 0.0381, + "step": 7190 + }, + { + "epoch": 26.865671641791046, + "grad_norm": 0.12936276197433472, + "learning_rate": 7.594847868906076e-05, + "loss": 0.0344, + "step": 7200 + }, + { + "epoch": 26.902985074626866, + "grad_norm": 0.1755450963973999, + "learning_rate": 7.587777455265515e-05, + "loss": 0.0298, + "step": 7210 + }, + { + "epoch": 26.940298507462686, + "grad_norm": 0.15424427390098572, + "learning_rate": 7.580699966740201e-05, + "loss": 0.03, + "step": 7220 + }, + { + "epoch": 26.97761194029851, + "grad_norm": 0.18190084397792816, + "learning_rate": 7.573615422679726e-05, + "loss": 0.036, + "step": 7230 + }, + { + "epoch": 27.01492537313433, + "grad_norm": 0.12391633540391922, + "learning_rate": 7.566523842452958e-05, + "loss": 0.0302, + "step": 7240 + }, + { + "epoch": 27.05223880597015, + "grad_norm": 0.09776072204113007, + "learning_rate": 7.559425245448006e-05, + "loss": 0.0301, + "step": 7250 + }, + { + "epoch": 27.08955223880597, + "grad_norm": 0.13018116354942322, + "learning_rate": 7.552319651072164e-05, + "loss": 0.0263, + "step": 7260 + }, + { + "epoch": 27.12686567164179, + "grad_norm": 0.11622302234172821, + "learning_rate": 7.545207078751857e-05, + "loss": 0.0307, + "step": 7270 + }, + { + "epoch": 27.16417910447761, + "grad_norm": 0.14957569539546967, + "learning_rate": 7.538087547932585e-05, + "loss": 0.0298, + "step": 7280 + }, + { + "epoch": 27.20149253731343, + "grad_norm": 0.18458008766174316, + "learning_rate": 7.530961078078873e-05, + "loss": 0.0297, + "step": 7290 + }, + { + "epoch": 27.238805970149254, + "grad_norm": 0.16861510276794434, + "learning_rate": 7.52382768867422e-05, + "loss": 0.0353, + "step": 7300 + }, + { + "epoch": 27.276119402985074, + "grad_norm": 0.16894373297691345, + "learning_rate": 7.516687399221037e-05, + "loss": 0.0342, + "step": 7310 + }, + { + "epoch": 27.313432835820894, + "grad_norm": 0.16104437410831451, + "learning_rate": 7.509540229240601e-05, + "loss": 0.035, + "step": 7320 + }, + { + "epoch": 27.350746268656717, + "grad_norm": 0.24792194366455078, + "learning_rate": 7.50238619827301e-05, + "loss": 0.033, + "step": 7330 + }, + { + "epoch": 27.388059701492537, + "grad_norm": 0.15868139266967773, + "learning_rate": 7.495225325877103e-05, + "loss": 0.0372, + "step": 7340 + }, + { + "epoch": 27.425373134328357, + "grad_norm": 0.1612396538257599, + "learning_rate": 7.488057631630437e-05, + "loss": 0.0324, + "step": 7350 + }, + { + "epoch": 27.46268656716418, + "grad_norm": 0.1115829274058342, + "learning_rate": 7.480883135129211e-05, + "loss": 0.0283, + "step": 7360 + }, + { + "epoch": 27.5, + "grad_norm": 0.14605456590652466, + "learning_rate": 7.473701855988227e-05, + "loss": 0.0299, + "step": 7370 + }, + { + "epoch": 27.53731343283582, + "grad_norm": 0.18136604130268097, + "learning_rate": 7.466513813840825e-05, + "loss": 0.0298, + "step": 7380 + }, + { + "epoch": 27.574626865671643, + "grad_norm": 0.16361157596111298, + "learning_rate": 7.45931902833884e-05, + "loss": 0.0338, + "step": 7390 + }, + { + "epoch": 27.611940298507463, + "grad_norm": 0.18856105208396912, + "learning_rate": 7.452117519152542e-05, + "loss": 0.0384, + "step": 7400 + }, + { + "epoch": 27.649253731343283, + "grad_norm": 0.15344281494617462, + "learning_rate": 7.444909305970578e-05, + "loss": 0.0295, + "step": 7410 + }, + { + "epoch": 27.686567164179106, + "grad_norm": 0.17113055288791656, + "learning_rate": 7.437694408499933e-05, + "loss": 0.0311, + "step": 7420 + }, + { + "epoch": 27.723880597014926, + "grad_norm": 0.1809830665588379, + "learning_rate": 7.430472846465856e-05, + "loss": 0.0343, + "step": 7430 + }, + { + "epoch": 27.761194029850746, + "grad_norm": 0.1445932686328888, + "learning_rate": 7.423244639611826e-05, + "loss": 0.0338, + "step": 7440 + }, + { + "epoch": 27.798507462686565, + "grad_norm": 0.1392519325017929, + "learning_rate": 7.416009807699482e-05, + "loss": 0.0329, + "step": 7450 + }, + { + "epoch": 27.83582089552239, + "grad_norm": 0.1715395152568817, + "learning_rate": 7.408768370508576e-05, + "loss": 0.0308, + "step": 7460 + }, + { + "epoch": 27.87313432835821, + "grad_norm": 0.25407588481903076, + "learning_rate": 7.401520347836926e-05, + "loss": 0.0367, + "step": 7470 + }, + { + "epoch": 27.91044776119403, + "grad_norm": 0.19018910825252533, + "learning_rate": 7.394265759500348e-05, + "loss": 0.0296, + "step": 7480 + }, + { + "epoch": 27.94776119402985, + "grad_norm": 0.19958099722862244, + "learning_rate": 7.387004625332608e-05, + "loss": 0.0363, + "step": 7490 + }, + { + "epoch": 27.98507462686567, + "grad_norm": 0.16039235889911652, + "learning_rate": 7.379736965185368e-05, + "loss": 0.0323, + "step": 7500 + } + ], + "logging_steps": 10, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 75, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.638017556203379e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/experiment_cfg/metadata.json b/experiment_cfg/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..40302046074c7e429ab3933ad6b163f9735902de --- /dev/null +++ b/experiment_cfg/metadata.json @@ -0,0 +1,275 @@ +{ + "new_embodiment": { + "statistics": { + "state": { + "single_arm": { + "max": [ + 47.021484375, + 135.263671875, + 178.505859375, + 78.3984375, + 56.77734375 + ], + "min": [ + -25.576171875, + 46.93359375, + 89.736328125, + -30.41015625, + -77.607421875 + ], + "mean": [ + 7.780572414398193, + 121.54933166503906, + 145.44825744628906, + 26.051393508911133, + -12.748016357421875 + ], + "std": [ + 11.060831069946289, + 21.937597274780273, + 17.16187286376953, + 19.231945037841797, + 14.66512680053711 + ], + "q01": [ + -17.578125, + 58.0078125, + 97.998046875, + -13.447265625, + -39.9005859375 + ], + "q99": [ + 36.650390625, + 134.47265625, + 178.41796875, + 66.65009765625, + 40.166015625 + ] + }, + "gripper": { + "max": [ + 52.22222137451172 + ], + "min": [ + -3.846153974533081 + ], + "mean": [ + 10.933439254760742 + ], + "std": [ + 15.509913444519043 + ], + "q01": [ + -3.846153974533081 + ], + "q99": [ + 51.02564239501953 + ] + }, + "mobile_base": { + "max": [ + 75.42072296142578, + 276.7638244628906, + 93.75 + ], + "min": [ + -170.01620483398438, + -274.5497131347656, + -93.75 + ], + "mean": [ + -0.31241804361343384, + 58.99717712402344, + 2.4293017387390137 + ], + "std": [ + 10.56183910369873, + 119.39802551269531, + 22.590484619140625 + ], + "q01": [ + -33.65809627532959, + -265.6932678222656, + -72.849609375 + ], + "q99": [ + 30.679615020751953, + 270.1214904785156, + 90.234375 + ] + } + }, + "action": { + "single_arm": { + "max": [ + 37.96875, + 135.087890625, + 179.384765625, + 78.837890625, + 57.392578125 + ], + "min": [ + -26.279296875, + 47.373046875, + 89.912109375, + -31.640625, + -77.16796875 + ], + "mean": [ + 8.038639068603516, + 122.76031494140625, + 145.15855407714844, + 26.28432846069336, + -13.195321083068848 + ], + "std": [ + 11.36032772064209, + 21.925451278686523, + 17.071842193603516, + 19.503877639770508, + 14.882487297058105 + ], + "q01": [ + -18.10546875, + 58.623046875, + 98.26171875, + -14.326171875, + -40.078125 + ], + "q99": [ + 37.44140625, + 135.087890625, + 179.296875, + 67.1484375, + 40.869140625 + ] + }, + "gripper": { + "max": [ + 52.646484375 + ], + "min": [ + -10.72265625 + ], + "mean": [ + 4.366570949554443 + ], + "std": [ + 18.90865707397461 + ], + "q01": [ + -10.546875 + ], + "q99": [ + 51.767578125 + ] + }, + "mobile_base": { + "max": [ + 230.0971221923828, + 265.6932678222656, + 90.0 + ], + "min": [ + -230.0971221923828, + -265.6932678222656, + -90.0 + ], + "mean": [ + -0.36507830023765564, + 60.13115310668945, + 2.5394127368927 + ], + "std": [ + 15.02155590057373, + 129.06507873535156, + 27.82071304321289 + ], + "q01": [ + -0.02556634694337845, + -265.6932678222656, + -90.0 + ], + "q99": [ + 0.02556634694337845, + 265.6932678222656, + 90.0 + ] + } + } + }, + "modalities": { + "video": { + "wrist": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + }, + "front": { + "resolution": [ + 640, + 480 + ], + "channels": 3, + "fps": 30.0 + } + }, + "state": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + }, + "mobile_base": { + "absolute": true, + "rotation_type": null, + "shape": [ + 3 + ], + "continuous": true + } + }, + "action": { + "single_arm": { + "absolute": true, + "rotation_type": null, + "shape": [ + 5 + ], + "continuous": true + }, + "gripper": { + "absolute": true, + "rotation_type": null, + "shape": [ + 1 + ], + "continuous": true + }, + "mobile_base": { + "absolute": true, + "rotation_type": null, + "shape": [ + 3 + ], + "continuous": true + } + } + }, + "embodiment_tag": "new_embodiment" + } +} \ No newline at end of file