diff --git a/shared/auto_workers/twiny-stack-L03/worker-18ce0552.json b/shared/auto_workers/twiny-stack-L03/worker-18ce0552.json deleted file mode 100644 index 421a1d6bf8daef3b1a2fa8ecef02ecc362d649bc..0000000000000000000000000000000000000000 --- a/shared/auto_workers/twiny-stack-L03/worker-18ce0552.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "worker-18ce0552", "stage": 3, "model_name": "twiny-stack-L03", "updated_at": 1776816432} diff --git a/shared/auto_workers/twiny-stack-L03/worker-2962f1fd.json b/shared/auto_workers/twiny-stack-L03/worker-2962f1fd.json deleted file mode 100644 index e06f3cf88ea3ef1b4c426ef0f4546a1bac9f17bd..0000000000000000000000000000000000000000 --- a/shared/auto_workers/twiny-stack-L03/worker-2962f1fd.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "worker-2962f1fd", "stage": 3, "model_name": "twiny-stack-L03", "updated_at": 1776771850} diff --git a/shared/auto_workers/twiny-stack-L03/worker-2e7552ae.json b/shared/auto_workers/twiny-stack-L03/worker-2e7552ae.json deleted file mode 100644 index e83738f13bf1e915287c1ada60f6233a52441891..0000000000000000000000000000000000000000 --- a/shared/auto_workers/twiny-stack-L03/worker-2e7552ae.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "worker-2e7552ae", "stage": 3, "model_name": "twiny-stack-L03", "updated_at": 1776816467} diff --git a/shared/auto_workers/twiny-stack-L03/worker-32adc266.json b/shared/auto_workers/twiny-stack-L03/worker-32adc266.json deleted file mode 100644 index 40fc0f19cd6ed4732eb56eb789bf4ddbad5f3a1d..0000000000000000000000000000000000000000 --- a/shared/auto_workers/twiny-stack-L03/worker-32adc266.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "worker-32adc266", "stage": 3, "model_name": "twiny-stack-L03", "updated_at": 1776813411} diff --git a/shared/auto_workers/twiny-stack-L03/worker-3e045c6f.json b/shared/auto_workers/twiny-stack-L03/worker-3e045c6f.json deleted file mode 100644 index 70619bd42c27bb35c42c641002a6eb6a790acb9b..0000000000000000000000000000000000000000 --- a/shared/auto_workers/twiny-stack-L03/worker-3e045c6f.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "worker-3e045c6f", "stage": 3, "model_name": "twiny-stack-L03", "updated_at": 1776813416} diff --git a/shared/auto_workers/twiny-stack-L03/worker-4a9c72b2.json b/shared/auto_workers/twiny-stack-L03/worker-4a9c72b2.json deleted file mode 100644 index de313d6c9f61638fc2145703c5247f639514e1bf..0000000000000000000000000000000000000000 --- a/shared/auto_workers/twiny-stack-L03/worker-4a9c72b2.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "worker-4a9c72b2", "stage": 3, "model_name": "twiny-stack-L03", "updated_at": 1776813417} diff --git a/shared/auto_workers/twiny-stack-L03/worker-56fe9716.json b/shared/auto_workers/twiny-stack-L03/worker-56fe9716.json deleted file mode 100644 index c9771cf89cbbbea368aa69056c3cfe124c29aac7..0000000000000000000000000000000000000000 --- a/shared/auto_workers/twiny-stack-L03/worker-56fe9716.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "worker-56fe9716", "stage": 3, "model_name": "twiny-stack-L03", "updated_at": 1776816457} diff --git a/shared/auto_workers/twiny-stack-L03/worker-93c0e681.json b/shared/auto_workers/twiny-stack-L03/worker-93c0e681.json deleted file mode 100644 index 3fcfad97fb72746edc054e94e0174cffb99bde23..0000000000000000000000000000000000000000 --- a/shared/auto_workers/twiny-stack-L03/worker-93c0e681.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "worker-93c0e681", "stage": 3, "model_name": "twiny-stack-L03", "updated_at": 1776813412} diff --git a/shared/auto_workers/twiny-stack-L03/worker-9e926922.json b/shared/auto_workers/twiny-stack-L03/worker-9e926922.json deleted file mode 100644 index 8de01f9126987e8ca1306dc84d4d765b883a9561..0000000000000000000000000000000000000000 --- a/shared/auto_workers/twiny-stack-L03/worker-9e926922.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "worker-9e926922", "stage": 3, "model_name": "twiny-stack-L03", "updated_at": 1776816462} diff --git a/shared/auto_workers/twiny-stack-L03/worker-a41b4bdc.json b/shared/auto_workers/twiny-stack-L03/worker-a41b4bdc.json deleted file mode 100644 index 0f76e0fafcecc69f6a46839b5f248dba7e0e0fb3..0000000000000000000000000000000000000000 --- a/shared/auto_workers/twiny-stack-L03/worker-a41b4bdc.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "worker-a41b4bdc", "stage": 3, "model_name": "twiny-stack-L03", "updated_at": 1776816459} diff --git a/shared/auto_workers/twiny-stack-L03/worker-bc63c10f.json b/shared/auto_workers/twiny-stack-L03/worker-bc63c10f.json deleted file mode 100644 index d3ffe1ca3eef7f981e4389429d02f01bdac5cc15..0000000000000000000000000000000000000000 --- a/shared/auto_workers/twiny-stack-L03/worker-bc63c10f.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "worker-bc63c10f", "stage": 3, "model_name": "twiny-stack-L03", "updated_at": 1776816464} diff --git a/shared/auto_workers/twiny-stack-L03/worker-c7779c04.json b/shared/auto_workers/twiny-stack-L03/worker-c7779c04.json deleted file mode 100644 index 0c433f3f2ea3aa44eccd805ef1cb4545601ceec8..0000000000000000000000000000000000000000 --- a/shared/auto_workers/twiny-stack-L03/worker-c7779c04.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "worker-c7779c04", "stage": 3, "model_name": "twiny-stack-L03", "updated_at": 1776771852} diff --git a/shared/auto_workers/twiny-stack-L03/worker-ca8986e8.json b/shared/auto_workers/twiny-stack-L03/worker-ca8986e8.json deleted file mode 100644 index 0bd90ab09bfb4a71be51e231fd00e29f93bc126c..0000000000000000000000000000000000000000 --- a/shared/auto_workers/twiny-stack-L03/worker-ca8986e8.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "worker-ca8986e8", "stage": 3, "model_name": "twiny-stack-L03", "updated_at": 1776816432} diff --git a/shared/auto_workers/twiny-stack-L03/worker-cba01791.json b/shared/auto_workers/twiny-stack-L03/worker-cba01791.json deleted file mode 100644 index 6aa5ddcc92eef1750168236bd42619e11a943b5e..0000000000000000000000000000000000000000 --- a/shared/auto_workers/twiny-stack-L03/worker-cba01791.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "worker-cba01791", "stage": 3, "model_name": "twiny-stack-L03", "updated_at": 1776813412} diff --git a/shared/slot_plan/twiny-stack-L03/stage3.json b/shared/slot_plan/twiny-stack-L03/stage3.json deleted file mode 100644 index cad2f2cb0535511971cfa9044f19460492e99d03..0000000000000000000000000000000000000000 --- a/shared/slot_plan/twiny-stack-L03/stage3.json +++ /dev/null @@ -1 +0,0 @@ -{"model_name": "twiny-stack-L03", "stage": 3, "slot_total": 20, "worker_id": "worker-18ce0552", "updated_at": 1776816794, "slot_workers": ["worker-c7779c04", "worker-2962f1fd", "worker-4a9c72b2", "worker-3e045c6f", "worker-93c0e681", "worker-cba01791", "worker-32adc266", "worker-2e7552ae", "worker-bc63c10f", "worker-9e926922", "worker-a41b4bdc", "worker-56fe9716", "worker-18ce0552", "worker-ca8986e8"]} diff --git a/slots/0/latest.json b/slots/0/latest.json deleted file mode 100644 index b43aff69bc36a19565712698ddfc4f239c0cbc84..0000000000000000000000000000000000000000 --- a/slots/0/latest.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "slot:0", "checkpoint": "slots/0/checkpoint-9484", "step": 9484, "updated_at": 1776780683} diff --git a/slots/1/latest.json b/slots/1/latest.json deleted file mode 100644 index 1985acb0b2eb63a6706c360542f2dd7664805915..0000000000000000000000000000000000000000 --- a/slots/1/latest.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "slot:1", "checkpoint": "slots/1/checkpoint-9903", "step": 9903, "updated_at": 1776815559} diff --git a/slots/10/checkpoint-10075/config.json b/slots/10/checkpoint-10075/config.json deleted file mode 100644 index 9e5d8b7224eff16a790758ae86dd97c89afeab74..0000000000000000000000000000000000000000 --- a/slots/10/checkpoint-10075/config.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "architectures": [ - "TwinyForCausalLM" - ], - "attention_dropout": 0.0, - "dtype": "float32", - "hidden_dropout": 0.0, - "hidden_size": 768, - "initializer_range": 0.02, - "intermediate_size": 3072, - "max_position_embeddings": 128, - "model_type": "twiny", - "neftune_alpha": 0.0, - "num_attention_heads": 12, - "num_hidden_layers": 3, - "num_key_value_heads": 3, - "qk_norm": true, - "rezero_init": 1.0, - "rms_norm_eps": 1e-06, - "rope_theta": 10000.0, - "transformers_version": "5.0.0", - "use_cache": false, - "vocab_size": 32000 -} diff --git a/slots/10/checkpoint-10075/model.safetensors b/slots/10/checkpoint-10075/model.safetensors deleted file mode 100644 index e59b65821a645cf3e64e7952eacf6bda50d731e4..0000000000000000000000000000000000000000 --- a/slots/10/checkpoint-10075/model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a2ca48b75a7002146d5ef60bc204ba77a99d24d42ffe9f0c5cbaafcfc4682879 -size 306388092 diff --git a/slots/10/checkpoint-10075/optimizer.pt b/slots/10/checkpoint-10075/optimizer.pt deleted file mode 100644 index e6e5f4791dfddc7b38dd83f2ee87e42acfdc96f3..0000000000000000000000000000000000000000 --- a/slots/10/checkpoint-10075/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1d1c60a18f57ed02384f52ee715de607f83af4d48116e43c4e045833a4700d37 -size 302484555 diff --git a/slots/10/checkpoint-10075/rng_state.pth b/slots/10/checkpoint-10075/rng_state.pth deleted file mode 100644 index 1feba1a6538e93b94696d3773853dbc8947b0cad..0000000000000000000000000000000000000000 --- a/slots/10/checkpoint-10075/rng_state.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 -size 14645 diff --git a/slots/10/checkpoint-10075/scaler.pt b/slots/10/checkpoint-10075/scaler.pt deleted file mode 100644 index bc67d6c815c5bebc898f2a6ad1c368c134527aee..0000000000000000000000000000000000000000 --- a/slots/10/checkpoint-10075/scaler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3ee5f7f13c8c1d241dae1ff1f44e0e8453841f58efabc810c63b4dd5709316db -size 1383 diff --git a/slots/10/checkpoint-10075/scheduler.pt b/slots/10/checkpoint-10075/scheduler.pt deleted file mode 100644 index 410c10306598c85ec38d12c31f33bf58809cf756..0000000000000000000000000000000000000000 --- a/slots/10/checkpoint-10075/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:67a18908c2d144f110b11a9c3fc2646e1414a2d3c62599cf96cbeb463b495cdd -size 1465 diff --git a/slots/10/checkpoint-10075/trainer_state.json b/slots/10/checkpoint-10075/trainer_state.json deleted file mode 100644 index 9f2bccde0cd60239eb6795d271b9f82c571592ce..0000000000000000000000000000000000000000 --- a/slots/10/checkpoint-10075/trainer_state.json +++ /dev/null @@ -1,3562 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 2.3567929577258835, - "eval_steps": 500, - "global_step": 10075, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0001336931908386741, - "grad_norm": Infinity, - "learning_rate": 5e-05, - "loss": 129.80032348632812, - "step": 1 - }, - { - "epoch": 0.002673863816773482, - "grad_norm": 63.64365768432617, - "learning_rate": 4.999995392022967e-05, - "loss": 63.88374408922697, - "step": 20 - }, - { - "epoch": 0.005347727633546964, - "grad_norm": 24.627853393554688, - "learning_rate": 4.999976672145381e-05, - "loss": 12.65963363647461, - "step": 40 - }, - { - "epoch": 0.008021591450320446, - "grad_norm": 14.29983901977539, - "learning_rate": 4.999943552476422e-05, - "loss": 5.90204963684082, - "step": 60 - }, - { - "epoch": 0.010695455267093928, - "grad_norm": 15.690323829650879, - "learning_rate": 4.999896033206858e-05, - "loss": 3.9918922424316405, - "step": 80 - }, - { - "epoch": 0.01336931908386741, - "grad_norm": 31.583160400390625, - "learning_rate": 4.999834114610398e-05, - "loss": 2.9675426483154297, - "step": 100 - }, - { - "epoch": 0.01604318290064089, - "grad_norm": 13.034649848937988, - "learning_rate": 4.999757797043691e-05, - "loss": 2.725296401977539, - "step": 120 - }, - { - "epoch": 0.018717046717414372, - "grad_norm": 8.362203598022461, - "learning_rate": 4.999667080946324e-05, - "loss": 2.2478992462158205, - "step": 140 - }, - { - "epoch": 0.021390910534187856, - "grad_norm": 8.726786613464355, - "learning_rate": 4.999561966840821e-05, - "loss": 1.8447845458984375, - "step": 160 - }, - { - "epoch": 0.024064774350961337, - "grad_norm": 10.092752456665039, - "learning_rate": 4.9994424553326335e-05, - "loss": 1.5611843109130858, - "step": 180 - }, - { - "epoch": 0.02673863816773482, - "grad_norm": 9.090085983276367, - "learning_rate": 4.999308547110146e-05, - "loss": 1.520334815979004, - "step": 200 - }, - { - "epoch": 0.029412501984508302, - "grad_norm": 9.668124198913574, - "learning_rate": 4.999160242944665e-05, - "loss": 1.2818055152893066, - "step": 220 - }, - { - "epoch": 0.03208636580128178, - "grad_norm": 9.182533264160156, - "learning_rate": 4.998997543690418e-05, - "loss": 1.0428407669067383, - "step": 240 - }, - { - "epoch": 0.03476022961805526, - "grad_norm": 5.745838165283203, - "learning_rate": 4.998820450284549e-05, - "loss": 1.2343652725219727, - "step": 260 - }, - { - "epoch": 0.037434093434828744, - "grad_norm": 8.651643753051758, - "learning_rate": 4.99862896374711e-05, - "loss": 0.8859601020812988, - "step": 280 - }, - { - "epoch": 0.04010795725160223, - "grad_norm": 10.765266418457031, - "learning_rate": 4.998423085181056e-05, - "loss": 0.989600658416748, - "step": 300 - }, - { - "epoch": 0.04278182106837571, - "grad_norm": 6.092499256134033, - "learning_rate": 4.998202815772245e-05, - "loss": 0.7189463615417481, - "step": 320 - }, - { - "epoch": 0.04545568488514919, - "grad_norm": 6.352876663208008, - "learning_rate": 4.9979681567894195e-05, - "loss": 0.7489545345306396, - "step": 340 - }, - { - "epoch": 0.048129548701922674, - "grad_norm": 4.620656490325928, - "learning_rate": 4.997719109584209e-05, - "loss": 0.7381401538848877, - "step": 360 - }, - { - "epoch": 0.050803412518696155, - "grad_norm": 7.796917915344238, - "learning_rate": 4.997455675591119e-05, - "loss": 0.5687405109405518, - "step": 380 - }, - { - "epoch": 0.05347727633546964, - "grad_norm": 2.837172508239746, - "learning_rate": 4.9971778563275204e-05, - "loss": 0.5686865329742432, - "step": 400 - }, - { - "epoch": 0.05615114015224312, - "grad_norm": 3.3103690147399902, - "learning_rate": 4.9968856533936436e-05, - "loss": 0.625730562210083, - "step": 420 - }, - { - "epoch": 0.058825003969016604, - "grad_norm": 3.5682132244110107, - "learning_rate": 4.99657906847257e-05, - "loss": 0.6125466346740722, - "step": 440 - }, - { - "epoch": 0.061498867785790085, - "grad_norm": 5.63640832901001, - "learning_rate": 4.996258103330218e-05, - "loss": 0.6182214260101319, - "step": 460 - }, - { - "epoch": 0.06417273160256357, - "grad_norm": 4.698945999145508, - "learning_rate": 4.995922759815339e-05, - "loss": 0.43828091621398924, - "step": 480 - }, - { - "epoch": 0.06684659541933705, - "grad_norm": 2.1976189613342285, - "learning_rate": 4.995573039859501e-05, - "loss": 0.4459230899810791, - "step": 500 - }, - { - "epoch": 0.06952045923611053, - "grad_norm": 3.8809523582458496, - "learning_rate": 4.995208945477081e-05, - "loss": 0.3821882963180542, - "step": 520 - }, - { - "epoch": 0.07219432305288401, - "grad_norm": 3.75144100189209, - "learning_rate": 4.994830478765251e-05, - "loss": 0.5800807476043701, - "step": 540 - }, - { - "epoch": 0.07486818686965749, - "grad_norm": 3.0038585662841797, - "learning_rate": 4.9944376419039684e-05, - "loss": 0.3928264617919922, - "step": 560 - }, - { - "epoch": 0.07754205068643098, - "grad_norm": 3.614591598510742, - "learning_rate": 4.994030437155961e-05, - "loss": 0.48637890815734863, - "step": 580 - }, - { - "epoch": 0.08021591450320446, - "grad_norm": 4.143443584442139, - "learning_rate": 4.993608866866718e-05, - "loss": 0.3650153160095215, - "step": 600 - }, - { - "epoch": 0.08288977831997794, - "grad_norm": 6.692712783813477, - "learning_rate": 4.993172933464471e-05, - "loss": 0.3677916288375854, - "step": 620 - }, - { - "epoch": 0.08556364213675142, - "grad_norm": 8.383441925048828, - "learning_rate": 4.9927226394601815e-05, - "loss": 0.3399480104446411, - "step": 640 - }, - { - "epoch": 0.0882375059535249, - "grad_norm": 5.566338062286377, - "learning_rate": 4.992257987447532e-05, - "loss": 0.28104052543640134, - "step": 660 - }, - { - "epoch": 0.09091136977029839, - "grad_norm": 3.1196420192718506, - "learning_rate": 4.991778980102904e-05, - "loss": 0.351950478553772, - "step": 680 - }, - { - "epoch": 0.09358523358707187, - "grad_norm": 3.47979736328125, - "learning_rate": 4.9912856201853644e-05, - "loss": 0.27501535415649414, - "step": 700 - }, - { - "epoch": 0.09625909740384535, - "grad_norm": 5.446717262268066, - "learning_rate": 4.990777910536653e-05, - "loss": 0.2651593923568726, - "step": 720 - }, - { - "epoch": 0.09893296122061883, - "grad_norm": 7.6145339012146, - "learning_rate": 4.990255854081161e-05, - "loss": 0.35140380859375, - "step": 740 - }, - { - "epoch": 0.10160682503739231, - "grad_norm": 8.445616722106934, - "learning_rate": 4.989719453825918e-05, - "loss": 0.2961219072341919, - "step": 760 - }, - { - "epoch": 0.10428068885416579, - "grad_norm": 6.339537620544434, - "learning_rate": 4.9891687128605744e-05, - "loss": 0.24962289333343507, - "step": 780 - }, - { - "epoch": 0.10695455267093928, - "grad_norm": 3.3369436264038086, - "learning_rate": 4.988603634357383e-05, - "loss": 0.2124847412109375, - "step": 800 - }, - { - "epoch": 0.10962841648771277, - "grad_norm": 2.2909045219421387, - "learning_rate": 4.988024221571177e-05, - "loss": 0.24679112434387207, - "step": 820 - }, - { - "epoch": 0.11230228030448625, - "grad_norm": 3.1149911880493164, - "learning_rate": 4.9874304778393574e-05, - "loss": 0.22161397933959961, - "step": 840 - }, - { - "epoch": 0.11497614412125973, - "grad_norm": 14.802160263061523, - "learning_rate": 4.9868224065818706e-05, - "loss": 0.2623537302017212, - "step": 860 - }, - { - "epoch": 0.11765000793803321, - "grad_norm": 5.586325168609619, - "learning_rate": 4.98620001130119e-05, - "loss": 0.3560942649841309, - "step": 880 - }, - { - "epoch": 0.12032387175480669, - "grad_norm": 3.390017032623291, - "learning_rate": 4.9855632955822916e-05, - "loss": 0.16934787034988402, - "step": 900 - }, - { - "epoch": 0.12299773557158017, - "grad_norm": 6.070940971374512, - "learning_rate": 4.984912263092641e-05, - "loss": 0.2131197214126587, - "step": 920 - }, - { - "epoch": 0.12567159938835365, - "grad_norm": 1.4912281036376953, - "learning_rate": 4.984246917582166e-05, - "loss": 0.25128653049468996, - "step": 940 - }, - { - "epoch": 0.12834546320512713, - "grad_norm": 7.000472545623779, - "learning_rate": 4.9835672628832366e-05, - "loss": 0.2653592586517334, - "step": 960 - }, - { - "epoch": 0.1310193270219006, - "grad_norm": 5.427223205566406, - "learning_rate": 4.9828733029106434e-05, - "loss": 0.1653295636177063, - "step": 980 - }, - { - "epoch": 0.1336931908386741, - "grad_norm": 1.9502102136611938, - "learning_rate": 4.982165041661575e-05, - "loss": 0.2250870943069458, - "step": 1000 - }, - { - "epoch": 0.13636705465544757, - "grad_norm": 0.6216259598731995, - "learning_rate": 4.981442483215595e-05, - "loss": 0.18943849802017212, - "step": 1020 - }, - { - "epoch": 0.13904091847222105, - "grad_norm": 2.3363687992095947, - "learning_rate": 4.98070563173462e-05, - "loss": 0.1673592209815979, - "step": 1040 - }, - { - "epoch": 0.14171478228899453, - "grad_norm": 1.040717601776123, - "learning_rate": 4.979954491462892e-05, - "loss": 0.2113173007965088, - "step": 1060 - }, - { - "epoch": 0.14438864610576801, - "grad_norm": 2.735522747039795, - "learning_rate": 4.979189066726955e-05, - "loss": 0.17504971027374266, - "step": 1080 - }, - { - "epoch": 0.1470625099225415, - "grad_norm": 4.701151371002197, - "learning_rate": 4.978409361935636e-05, - "loss": 0.15881222486495972, - "step": 1100 - }, - { - "epoch": 0.14973637373931498, - "grad_norm": 2.735919237136841, - "learning_rate": 4.9776153815800075e-05, - "loss": 0.14044179916381835, - "step": 1120 - }, - { - "epoch": 0.15241023755608848, - "grad_norm": 3.5479538440704346, - "learning_rate": 4.976807130233375e-05, - "loss": 0.18565714359283447, - "step": 1140 - }, - { - "epoch": 0.15508410137286197, - "grad_norm": 3.2167458534240723, - "learning_rate": 4.975984612551243e-05, - "loss": 0.13236271142959594, - "step": 1160 - }, - { - "epoch": 0.15775796518963545, - "grad_norm": 1.0206760168075562, - "learning_rate": 4.975147833271288e-05, - "loss": 0.19124728441238403, - "step": 1180 - }, - { - "epoch": 0.16043182900640893, - "grad_norm": 4.194457530975342, - "learning_rate": 4.9742967972133335e-05, - "loss": 0.144741427898407, - "step": 1200 - }, - { - "epoch": 0.1631056928231824, - "grad_norm": 3.0225746631622314, - "learning_rate": 4.973431509279323e-05, - "loss": 0.1374324679374695, - "step": 1220 - }, - { - "epoch": 0.1657795566399559, - "grad_norm": 4.243523120880127, - "learning_rate": 4.972551974453287e-05, - "loss": 0.13663809299468993, - "step": 1240 - }, - { - "epoch": 0.16845342045672937, - "grad_norm": 2.4990086555480957, - "learning_rate": 4.971658197801322e-05, - "loss": 0.16817957162857056, - "step": 1260 - }, - { - "epoch": 0.17112728427350285, - "grad_norm": 4.983982563018799, - "learning_rate": 4.9707501844715554e-05, - "loss": 0.13795313835144044, - "step": 1280 - }, - { - "epoch": 0.17380114809027633, - "grad_norm": 3.6780316829681396, - "learning_rate": 4.969827939694115e-05, - "loss": 0.1637880802154541, - "step": 1300 - }, - { - "epoch": 0.1764750119070498, - "grad_norm": 0.7950732707977295, - "learning_rate": 4.968891468781105e-05, - "loss": 0.10979138612747193, - "step": 1320 - }, - { - "epoch": 0.1791488757238233, - "grad_norm": 1.2414121627807617, - "learning_rate": 4.967940777126569e-05, - "loss": 0.13692171573638917, - "step": 1340 - }, - { - "epoch": 0.18182273954059677, - "grad_norm": 2.1383633613586426, - "learning_rate": 4.9669758702064636e-05, - "loss": 0.07821698188781738, - "step": 1360 - }, - { - "epoch": 0.18449660335737025, - "grad_norm": 5.061275959014893, - "learning_rate": 4.965996753578623e-05, - "loss": 0.19053516387939454, - "step": 1380 - }, - { - "epoch": 0.18717046717414373, - "grad_norm": 6.151792049407959, - "learning_rate": 4.9650034328827305e-05, - "loss": 0.11360721588134766, - "step": 1400 - }, - { - "epoch": 0.18984433099091721, - "grad_norm": 1.0604305267333984, - "learning_rate": 4.963995913840284e-05, - "loss": 0.13138024806976317, - "step": 1420 - }, - { - "epoch": 0.1925181948076907, - "grad_norm": 1.7159489393234253, - "learning_rate": 4.9629742022545623e-05, - "loss": 0.08657677173614502, - "step": 1440 - }, - { - "epoch": 0.19519205862446418, - "grad_norm": 2.4207754135131836, - "learning_rate": 4.961938304010595e-05, - "loss": 0.10309149026870727, - "step": 1460 - }, - { - "epoch": 0.19786592244123766, - "grad_norm": 1.532060146331787, - "learning_rate": 4.9608882250751245e-05, - "loss": 0.13628544807434081, - "step": 1480 - }, - { - "epoch": 0.20053978625801114, - "grad_norm": 6.409943580627441, - "learning_rate": 4.959823971496574e-05, - "loss": 0.10584845542907714, - "step": 1500 - }, - { - "epoch": 0.20321365007478462, - "grad_norm": 2.452012538909912, - "learning_rate": 4.9587455494050136e-05, - "loss": 0.06506187915802002, - "step": 1520 - }, - { - "epoch": 0.2058875138915581, - "grad_norm": 5.3016533851623535, - "learning_rate": 4.9576529650121214e-05, - "loss": 0.11848526000976563, - "step": 1540 - }, - { - "epoch": 0.20856137770833158, - "grad_norm": 4.341775894165039, - "learning_rate": 4.956546224611152e-05, - "loss": 0.11318533420562744, - "step": 1560 - }, - { - "epoch": 0.21123524152510506, - "grad_norm": 1.9056169986724854, - "learning_rate": 4.9554253345768965e-05, - "loss": 0.12768398523330687, - "step": 1580 - }, - { - "epoch": 0.21390910534187857, - "grad_norm": 1.8939746618270874, - "learning_rate": 4.9542903013656486e-05, - "loss": 0.10782338380813598, - "step": 1600 - }, - { - "epoch": 0.21658296915865205, - "grad_norm": 8.53671932220459, - "learning_rate": 4.9531411315151654e-05, - "loss": 0.1733921766281128, - "step": 1620 - }, - { - "epoch": 0.21925683297542553, - "grad_norm": 2.0152978897094727, - "learning_rate": 4.951977831644632e-05, - "loss": 0.11197054386138916, - "step": 1640 - }, - { - "epoch": 0.221930696792199, - "grad_norm": 3.8422367572784424, - "learning_rate": 4.95080040845462e-05, - "loss": 0.11441781520843505, - "step": 1660 - }, - { - "epoch": 0.2246045606089725, - "grad_norm": 1.819858193397522, - "learning_rate": 4.949608868727053e-05, - "loss": 0.11403474807739258, - "step": 1680 - }, - { - "epoch": 0.22727842442574597, - "grad_norm": 7.45100212097168, - "learning_rate": 4.948403219325163e-05, - "loss": 0.13117753267288207, - "step": 1700 - }, - { - "epoch": 0.22995228824251945, - "grad_norm": 0.6526040434837341, - "learning_rate": 4.947183467193456e-05, - "loss": 0.07524924874305725, - "step": 1720 - }, - { - "epoch": 0.23262615205929293, - "grad_norm": 3.814746856689453, - "learning_rate": 4.945949619357668e-05, - "loss": 0.07659345269203185, - "step": 1740 - }, - { - "epoch": 0.23530001587606642, - "grad_norm": 2.373124122619629, - "learning_rate": 4.944701682924726e-05, - "loss": 0.1147496223449707, - "step": 1760 - }, - { - "epoch": 0.2379738796928399, - "grad_norm": 0.11161285638809204, - "learning_rate": 4.943439665082707e-05, - "loss": 0.07256829738616943, - "step": 1780 - }, - { - "epoch": 0.24064774350961338, - "grad_norm": 0.45990192890167236, - "learning_rate": 4.942163573100794e-05, - "loss": 0.07726740837097168, - "step": 1800 - }, - { - "epoch": 0.24332160732638686, - "grad_norm": 4.2301926612854, - "learning_rate": 4.940873414329242e-05, - "loss": 0.09349535703659058, - "step": 1820 - }, - { - "epoch": 0.24599547114316034, - "grad_norm": 2.442178726196289, - "learning_rate": 4.939569196199325e-05, - "loss": 0.12413722276687622, - "step": 1840 - }, - { - "epoch": 0.24866933495993382, - "grad_norm": 2.523683786392212, - "learning_rate": 4.938250926223302e-05, - "loss": 0.08566288352012634, - "step": 1860 - }, - { - "epoch": 0.2513431987767073, - "grad_norm": 3.511075258255005, - "learning_rate": 4.936918611994368e-05, - "loss": 0.08007702231407166, - "step": 1880 - }, - { - "epoch": 0.2540170625934808, - "grad_norm": 6.254627704620361, - "learning_rate": 4.935572261186614e-05, - "loss": 0.10983954668045044, - "step": 1900 - }, - { - "epoch": 0.25669092641025426, - "grad_norm": 1.5211899280548096, - "learning_rate": 4.934211881554981e-05, - "loss": 0.09120344519615173, - "step": 1920 - }, - { - "epoch": 0.25936479022702774, - "grad_norm": 2.5893588066101074, - "learning_rate": 4.932837480935214e-05, - "loss": 0.08754412531852722, - "step": 1940 - }, - { - "epoch": 0.2620386540438012, - "grad_norm": 6.878556251525879, - "learning_rate": 4.931449067243821e-05, - "loss": 0.08636274933815002, - "step": 1960 - }, - { - "epoch": 0.2647125178605747, - "grad_norm": 2.9078798294067383, - "learning_rate": 4.9300466484780226e-05, - "loss": 0.09582929015159607, - "step": 1980 - }, - { - "epoch": 0.2673863816773482, - "grad_norm": 3.391852855682373, - "learning_rate": 4.92863023271571e-05, - "loss": 0.0850919783115387, - "step": 2000 - }, - { - "epoch": 0.27006024549412166, - "grad_norm": 5.522103309631348, - "learning_rate": 4.927199828115395e-05, - "loss": 0.050999772548675534, - "step": 2020 - }, - { - "epoch": 0.27273410931089515, - "grad_norm": 0.90350741147995, - "learning_rate": 4.925755442916167e-05, - "loss": 0.10100446939468384, - "step": 2040 - }, - { - "epoch": 0.2754079731276686, - "grad_norm": 1.602030634880066, - "learning_rate": 4.924297085437641e-05, - "loss": 0.0468633770942688, - "step": 2060 - }, - { - "epoch": 0.2780818369444421, - "grad_norm": 1.5823460817337036, - "learning_rate": 4.922824764079913e-05, - "loss": 0.06786358952522278, - "step": 2080 - }, - { - "epoch": 0.2807557007612156, - "grad_norm": 1.6624343395233154, - "learning_rate": 4.92133848732351e-05, - "loss": 0.05772828459739685, - "step": 2100 - }, - { - "epoch": 0.28342956457798907, - "grad_norm": 0.947078287601471, - "learning_rate": 4.9198382637293424e-05, - "loss": 0.08012173175811768, - "step": 2120 - }, - { - "epoch": 0.28610342839476255, - "grad_norm": 0.2919924259185791, - "learning_rate": 4.918324101938653e-05, - "loss": 0.1208539366722107, - "step": 2140 - }, - { - "epoch": 0.28877729221153603, - "grad_norm": 9.258247375488281, - "learning_rate": 4.916796010672969e-05, - "loss": 0.10037034749984741, - "step": 2160 - }, - { - "epoch": 0.2914511560283095, - "grad_norm": 4.0920491218566895, - "learning_rate": 4.915253998734051e-05, - "loss": 0.061488878726959226, - "step": 2180 - }, - { - "epoch": 0.294125019845083, - "grad_norm": 6.1126627922058105, - "learning_rate": 4.913698075003841e-05, - "loss": 0.0862967312335968, - "step": 2200 - }, - { - "epoch": 0.29679888366185647, - "grad_norm": 2.585484743118286, - "learning_rate": 4.912128248444414e-05, - "loss": 0.05393874645233154, - "step": 2220 - }, - { - "epoch": 0.29947274747862995, - "grad_norm": 6.944481372833252, - "learning_rate": 4.9105445280979256e-05, - "loss": 0.08570566773414612, - "step": 2240 - }, - { - "epoch": 0.30214661129540343, - "grad_norm": 1.3824089765548706, - "learning_rate": 4.908946923086556e-05, - "loss": 0.09689127206802368, - "step": 2260 - }, - { - "epoch": 0.30482047511217697, - "grad_norm": 3.4861342906951904, - "learning_rate": 4.907335442612464e-05, - "loss": 0.12550976276397705, - "step": 2280 - }, - { - "epoch": 0.30749433892895045, - "grad_norm": 3.668980121612549, - "learning_rate": 4.905710095957728e-05, - "loss": 0.09089353680610657, - "step": 2300 - }, - { - "epoch": 0.31016820274572393, - "grad_norm": 1.093095064163208, - "learning_rate": 4.904070892484298e-05, - "loss": 0.03925192356109619, - "step": 2320 - }, - { - "epoch": 0.3128420665624974, - "grad_norm": 0.8169485926628113, - "learning_rate": 4.9024178416339364e-05, - "loss": 0.0979581356048584, - "step": 2340 - }, - { - "epoch": 0.3155159303792709, - "grad_norm": 1.892451286315918, - "learning_rate": 4.900750952928166e-05, - "loss": 0.05913209915161133, - "step": 2360 - }, - { - "epoch": 0.3181897941960444, - "grad_norm": 0.24644255638122559, - "learning_rate": 4.8990702359682184e-05, - "loss": 0.06815173625946044, - "step": 2380 - }, - { - "epoch": 0.32086365801281785, - "grad_norm": 2.1861305236816406, - "learning_rate": 4.897375700434972e-05, - "loss": 0.04142785966396332, - "step": 2400 - }, - { - "epoch": 0.32353752182959133, - "grad_norm": 2.6643004417419434, - "learning_rate": 4.8956673560889013e-05, - "loss": 0.05177200436592102, - "step": 2420 - }, - { - "epoch": 0.3262113856463648, - "grad_norm": 2.588113784790039, - "learning_rate": 4.8939452127700195e-05, - "loss": 0.05783546566963196, - "step": 2440 - }, - { - "epoch": 0.3288852494631383, - "grad_norm": 2.419644594192505, - "learning_rate": 4.8922092803978203e-05, - "loss": 0.08906854391098022, - "step": 2460 - }, - { - "epoch": 0.3315591132799118, - "grad_norm": 0.16949939727783203, - "learning_rate": 4.890459568971223e-05, - "loss": 0.10305211544036866, - "step": 2480 - }, - { - "epoch": 0.33423297709668526, - "grad_norm": 0.10032984614372253, - "learning_rate": 4.8886960885685126e-05, - "loss": 0.06348527669906616, - "step": 2500 - }, - { - "epoch": 0.33690684091345874, - "grad_norm": 3.3658738136291504, - "learning_rate": 4.8869188493472854e-05, - "loss": 0.06826075911521912, - "step": 2520 - }, - { - "epoch": 0.3395807047302322, - "grad_norm": 0.8656186461448669, - "learning_rate": 4.885127861544386e-05, - "loss": 0.05929765701293945, - "step": 2540 - }, - { - "epoch": 0.3422545685470057, - "grad_norm": 0.1492065042257309, - "learning_rate": 4.8833231354758496e-05, - "loss": 0.09429731965065002, - "step": 2560 - }, - { - "epoch": 0.3449284323637792, - "grad_norm": 0.6010928153991699, - "learning_rate": 4.881504681536846e-05, - "loss": 0.06262240409851075, - "step": 2580 - }, - { - "epoch": 0.34760229618055266, - "grad_norm": 1.6506450176239014, - "learning_rate": 4.879672510201616e-05, - "loss": 0.061688083410263064, - "step": 2600 - }, - { - "epoch": 0.35027615999732614, - "grad_norm": 0.2703142464160919, - "learning_rate": 4.877826632023412e-05, - "loss": 0.06175137162208557, - "step": 2620 - }, - { - "epoch": 0.3529500238140996, - "grad_norm": 3.1056365966796875, - "learning_rate": 4.875967057634437e-05, - "loss": 0.07828506827354431, - "step": 2640 - }, - { - "epoch": 0.3556238876308731, - "grad_norm": 0.28790283203125, - "learning_rate": 4.874093797745784e-05, - "loss": 0.11355981826782227, - "step": 2660 - }, - { - "epoch": 0.3582977514476466, - "grad_norm": 2.3372068405151367, - "learning_rate": 4.8722068631473746e-05, - "loss": 0.048267871141433716, - "step": 2680 - }, - { - "epoch": 0.36097161526442006, - "grad_norm": 0.12767371535301208, - "learning_rate": 4.8703062647078976e-05, - "loss": 0.04319801032543182, - "step": 2700 - }, - { - "epoch": 0.36364547908119355, - "grad_norm": 0.5145738124847412, - "learning_rate": 4.868392013374741e-05, - "loss": 0.0773090660572052, - "step": 2720 - }, - { - "epoch": 0.366319342897967, - "grad_norm": 0.8518500328063965, - "learning_rate": 4.866464120173937e-05, - "loss": 0.05149460434913635, - "step": 2740 - }, - { - "epoch": 0.3689932067147405, - "grad_norm": 3.6726584434509277, - "learning_rate": 4.8645225962100924e-05, - "loss": 0.06896821856498718, - "step": 2760 - }, - { - "epoch": 0.371667070531514, - "grad_norm": 1.5626497268676758, - "learning_rate": 4.862567452666329e-05, - "loss": 0.047730174660682675, - "step": 2780 - }, - { - "epoch": 0.37434093434828747, - "grad_norm": 6.562028884887695, - "learning_rate": 4.8605987008042144e-05, - "loss": 0.07060698866844177, - "step": 2800 - }, - { - "epoch": 0.37701479816506095, - "grad_norm": 0.7631726861000061, - "learning_rate": 4.8586163519637005e-05, - "loss": 0.04944324493408203, - "step": 2820 - }, - { - "epoch": 0.37968866198183443, - "grad_norm": 1.6982293128967285, - "learning_rate": 4.8566204175630595e-05, - "loss": 0.03000348210334778, - "step": 2840 - }, - { - "epoch": 0.3823625257986079, - "grad_norm": 0.6487429141998291, - "learning_rate": 4.854610909098812e-05, - "loss": 0.06691416501998901, - "step": 2860 - }, - { - "epoch": 0.3850363896153814, - "grad_norm": 0.7648892402648926, - "learning_rate": 4.852587838145668e-05, - "loss": 0.05529783964157105, - "step": 2880 - }, - { - "epoch": 0.38771025343215487, - "grad_norm": 0.11601298302412033, - "learning_rate": 4.850551216356457e-05, - "loss": 0.07780832052230835, - "step": 2900 - }, - { - "epoch": 0.39038411724892835, - "grad_norm": 0.9443137645721436, - "learning_rate": 4.8485010554620594e-05, - "loss": 0.08007023930549621, - "step": 2920 - }, - { - "epoch": 0.39305798106570183, - "grad_norm": 0.8828252553939819, - "learning_rate": 4.846437367271341e-05, - "loss": 0.03541453182697296, - "step": 2940 - }, - { - "epoch": 0.3957318448824753, - "grad_norm": 0.21668888628482819, - "learning_rate": 4.844360163671083e-05, - "loss": 0.08354364633560181, - "step": 2960 - }, - { - "epoch": 0.3984057086992488, - "grad_norm": 0.6840483546257019, - "learning_rate": 4.8422694566259194e-05, - "loss": 0.045807772874832155, - "step": 2980 - }, - { - "epoch": 0.4010795725160223, - "grad_norm": 1.2754698991775513, - "learning_rate": 4.8401652581782584e-05, - "loss": 0.053487342596054074, - "step": 3000 - }, - { - "epoch": 0.40375343633279576, - "grad_norm": 0.19012756645679474, - "learning_rate": 4.838047580448222e-05, - "loss": 0.05881953239440918, - "step": 3020 - }, - { - "epoch": 0.40642730014956924, - "grad_norm": 2.1057698726654053, - "learning_rate": 4.835916435633569e-05, - "loss": 0.031065690517425536, - "step": 3040 - }, - { - "epoch": 0.4091011639663427, - "grad_norm": 4.188559055328369, - "learning_rate": 4.833771836009633e-05, - "loss": 0.07205432653427124, - "step": 3060 - }, - { - "epoch": 0.4117750277831162, - "grad_norm": 6.975829124450684, - "learning_rate": 4.831613793929242e-05, - "loss": 0.04953635036945343, - "step": 3080 - }, - { - "epoch": 0.4144488915998897, - "grad_norm": 4.725269317626953, - "learning_rate": 4.8294423218226546e-05, - "loss": 0.05965519547462463, - "step": 3100 - }, - { - "epoch": 0.41712275541666316, - "grad_norm": 1.7124755382537842, - "learning_rate": 4.827257432197486e-05, - "loss": 0.039625433087348935, - "step": 3120 - }, - { - "epoch": 0.41979661923343664, - "grad_norm": 2.6687324047088623, - "learning_rate": 4.825059137638636e-05, - "loss": 0.05020809769630432, - "step": 3140 - }, - { - "epoch": 0.4224704830502101, - "grad_norm": 1.111640214920044, - "learning_rate": 4.822847450808215e-05, - "loss": 0.04404452443122864, - "step": 3160 - }, - { - "epoch": 0.42514434686698366, - "grad_norm": 0.2128070890903473, - "learning_rate": 4.8206223844454744e-05, - "loss": 0.08283355236053466, - "step": 3180 - }, - { - "epoch": 0.42781821068375714, - "grad_norm": 0.10757248103618622, - "learning_rate": 4.818383951366729e-05, - "loss": 0.08568671345710754, - "step": 3200 - }, - { - "epoch": 0.4304920745005306, - "grad_norm": 0.08344592899084091, - "learning_rate": 4.816132164465289e-05, - "loss": 0.0426956832408905, - "step": 3220 - }, - { - "epoch": 0.4331659383173041, - "grad_norm": 0.5657751560211182, - "learning_rate": 4.813867036711378e-05, - "loss": 0.04971776902675629, - "step": 3240 - }, - { - "epoch": 0.4358398021340776, - "grad_norm": 2.1529288291931152, - "learning_rate": 4.8115885811520654e-05, - "loss": 0.025386181473731995, - "step": 3260 - }, - { - "epoch": 0.43851366595085106, - "grad_norm": 4.228519916534424, - "learning_rate": 4.809296810911188e-05, - "loss": 0.06401395201683044, - "step": 3280 - }, - { - "epoch": 0.44118752976762454, - "grad_norm": 6.770420551300049, - "learning_rate": 4.806991739189274e-05, - "loss": 0.16425553560256959, - "step": 3300 - }, - { - "epoch": 0.443861393584398, - "grad_norm": 0.5303187370300293, - "learning_rate": 4.804673379263467e-05, - "loss": 0.045900467038154605, - "step": 3320 - }, - { - "epoch": 0.4465352574011715, - "grad_norm": 0.221473827958107, - "learning_rate": 4.802341744487453e-05, - "loss": 0.07529735565185547, - "step": 3340 - }, - { - "epoch": 0.449209121217945, - "grad_norm": 3.48736834526062, - "learning_rate": 4.799996848291378e-05, - "loss": 0.062433135509490964, - "step": 3360 - }, - { - "epoch": 0.45188298503471847, - "grad_norm": 2.650038242340088, - "learning_rate": 4.797638704181774e-05, - "loss": 0.03762982189655304, - "step": 3380 - }, - { - "epoch": 0.45455684885149195, - "grad_norm": 3.159665584564209, - "learning_rate": 4.795267325741483e-05, - "loss": 0.04745924174785614, - "step": 3400 - }, - { - "epoch": 0.4572307126682654, - "grad_norm": 0.8763885498046875, - "learning_rate": 4.7928827266295715e-05, - "loss": 0.07380253076553345, - "step": 3420 - }, - { - "epoch": 0.4599045764850389, - "grad_norm": 0.1779366433620453, - "learning_rate": 4.790484920581262e-05, - "loss": 0.045916372537612916, - "step": 3440 - }, - { - "epoch": 0.4625784403018124, - "grad_norm": 1.1228729486465454, - "learning_rate": 4.7880739214078454e-05, - "loss": 0.04461723566055298, - "step": 3460 - }, - { - "epoch": 0.46525230411858587, - "grad_norm": 0.1629919707775116, - "learning_rate": 4.785649742996605e-05, - "loss": 0.017159442603588104, - "step": 3480 - }, - { - "epoch": 0.46792616793535935, - "grad_norm": 3.583951473236084, - "learning_rate": 4.783212399310737e-05, - "loss": 0.047145146131515506, - "step": 3500 - }, - { - "epoch": 0.47060003175213283, - "grad_norm": 0.9766237139701843, - "learning_rate": 4.780761904389267e-05, - "loss": 0.050229442119598386, - "step": 3520 - }, - { - "epoch": 0.4732738955689063, - "grad_norm": 0.05617872253060341, - "learning_rate": 4.778298272346976e-05, - "loss": 0.024862812459468843, - "step": 3540 - }, - { - "epoch": 0.4759477593856798, - "grad_norm": 1.3586453199386597, - "learning_rate": 4.775821517374308e-05, - "loss": 0.02117772251367569, - "step": 3560 - }, - { - "epoch": 0.4786216232024533, - "grad_norm": 1.2116742134094238, - "learning_rate": 4.7733316537373006e-05, - "loss": 0.03060794174671173, - "step": 3580 - }, - { - "epoch": 0.48129548701922675, - "grad_norm": 0.39403238892555237, - "learning_rate": 4.770828695777493e-05, - "loss": 0.05482668280601501, - "step": 3600 - }, - { - "epoch": 0.48396935083600023, - "grad_norm": 0.9248486161231995, - "learning_rate": 4.7683126579118495e-05, - "loss": 0.03612814247608185, - "step": 3620 - }, - { - "epoch": 0.4866432146527737, - "grad_norm": 0.1624649167060852, - "learning_rate": 4.7657835546326736e-05, - "loss": 0.04334873259067536, - "step": 3640 - }, - { - "epoch": 0.4893170784695472, - "grad_norm": 0.5321119427680969, - "learning_rate": 4.763241400507524e-05, - "loss": 0.0461233913898468, - "step": 3660 - }, - { - "epoch": 0.4919909422863207, - "grad_norm": 0.34861093759536743, - "learning_rate": 4.760686210179133e-05, - "loss": 0.024829554557800292, - "step": 3680 - }, - { - "epoch": 0.49466480610309416, - "grad_norm": 1.2561241388320923, - "learning_rate": 4.758117998365322e-05, - "loss": 0.03157005608081818, - "step": 3700 - }, - { - "epoch": 0.49733866991986764, - "grad_norm": 0.8691341280937195, - "learning_rate": 4.7555367798589146e-05, - "loss": 0.04310203492641449, - "step": 3720 - }, - { - "epoch": 0.5000125337366411, - "grad_norm": 0.3134572505950928, - "learning_rate": 4.752942569527653e-05, - "loss": 0.03796039223670959, - "step": 3740 - }, - { - "epoch": 0.5026863975534146, - "grad_norm": 2.3359289169311523, - "learning_rate": 4.75033538231411e-05, - "loss": 0.055599170923233035, - "step": 3760 - }, - { - "epoch": 0.5053602613701881, - "grad_norm": 7.426175594329834, - "learning_rate": 4.747715233235608e-05, - "loss": 0.054436272382736205, - "step": 3780 - }, - { - "epoch": 0.5080341251869616, - "grad_norm": 0.5940203070640564, - "learning_rate": 4.745082137384128e-05, - "loss": 0.03682814538478851, - "step": 3800 - }, - { - "epoch": 0.510707989003735, - "grad_norm": 0.22821389138698578, - "learning_rate": 4.7424361099262225e-05, - "loss": 0.051123309135437014, - "step": 3820 - }, - { - "epoch": 0.5133818528205085, - "grad_norm": 8.20633602142334, - "learning_rate": 4.739777166102932e-05, - "loss": 0.0704378604888916, - "step": 3840 - }, - { - "epoch": 0.516055716637282, - "grad_norm": 3.023848533630371, - "learning_rate": 4.737105321229694e-05, - "loss": 0.03368058800697327, - "step": 3860 - }, - { - "epoch": 0.5187295804540555, - "grad_norm": 0.07666649669408798, - "learning_rate": 4.7344205906962555e-05, - "loss": 0.03665303289890289, - "step": 3880 - }, - { - "epoch": 0.521403444270829, - "grad_norm": 0.7571629881858826, - "learning_rate": 4.731722989966585e-05, - "loss": 0.058415502309799194, - "step": 3900 - }, - { - "epoch": 0.5240773080876024, - "grad_norm": 3.2599120140075684, - "learning_rate": 4.7290125345787816e-05, - "loss": 0.07323018908500671, - "step": 3920 - }, - { - "epoch": 0.5267511719043759, - "grad_norm": 0.28930988907814026, - "learning_rate": 4.7262892401449886e-05, - "loss": 0.054371267557144165, - "step": 3940 - }, - { - "epoch": 0.5294250357211494, - "grad_norm": 2.2296454906463623, - "learning_rate": 4.7235531223513004e-05, - "loss": 0.040819621086120604, - "step": 3960 - }, - { - "epoch": 0.5320988995379229, - "grad_norm": 0.11608211696147919, - "learning_rate": 4.720804196957675e-05, - "loss": 0.05215579271316528, - "step": 3980 - }, - { - "epoch": 0.5347727633546964, - "grad_norm": 1.1587547063827515, - "learning_rate": 4.7180424797978415e-05, - "loss": 0.026277875900268553, - "step": 4000 - }, - { - "epoch": 0.5374466271714698, - "grad_norm": 0.06253435462713242, - "learning_rate": 4.7152679867792074e-05, - "loss": 0.02574407756328583, - "step": 4020 - }, - { - "epoch": 0.5401204909882433, - "grad_norm": 1.3441458940505981, - "learning_rate": 4.71248073388277e-05, - "loss": 0.05538107752799988, - "step": 4040 - }, - { - "epoch": 0.5427943548050168, - "grad_norm": 0.48076340556144714, - "learning_rate": 4.7096807371630236e-05, - "loss": 0.047986540198326114, - "step": 4060 - }, - { - "epoch": 0.5454682186217903, - "grad_norm": 0.5924936532974243, - "learning_rate": 4.706868012747867e-05, - "loss": 0.05463914275169372, - "step": 4080 - }, - { - "epoch": 0.7673995566395854, - "grad_norm": 0.05143728107213974, - "learning_rate": 4.431151627307268e-05, - "loss": 0.00959376593430837, - "step": 4100 - }, - { - "epoch": 0.771142969110998, - "grad_norm": 1.2308074235916138, - "learning_rate": 4.425806509248848e-05, - "loss": 0.002745623141527176, - "step": 4120 - }, - { - "epoch": 0.7748863815824106, - "grad_norm": 2.080223798751831, - "learning_rate": 4.420439652052499e-05, - "loss": 0.012390998750925064, - "step": 4140 - }, - { - "epoch": 0.7786297940538233, - "grad_norm": 0.049312230199575424, - "learning_rate": 4.415051116301072e-05, - "loss": 0.004607534408569336, - "step": 4160 - }, - { - "epoch": 0.7823732065252359, - "grad_norm": 0.07747476547956467, - "learning_rate": 4.409640962822132e-05, - "loss": 0.034441503882408145, - "step": 4180 - }, - { - "epoch": 0.7861166189966485, - "grad_norm": 0.021327875554561615, - "learning_rate": 4.404209252687275e-05, - "loss": 0.009768449515104295, - "step": 4200 - }, - { - "epoch": 0.789860031468061, - "grad_norm": 2.406580924987793, - "learning_rate": 4.398756047211431e-05, - "loss": 0.005304037779569626, - "step": 4220 - }, - { - "epoch": 0.7936034439394737, - "grad_norm": 0.027869906276464462, - "learning_rate": 4.39328140795218e-05, - "loss": 0.00896073654294014, - "step": 4240 - }, - { - "epoch": 0.7973468564108863, - "grad_norm": 0.09702044725418091, - "learning_rate": 4.387785396709052e-05, - "loss": 0.0117533378303051, - "step": 4260 - }, - { - "epoch": 0.801090268882299, - "grad_norm": 0.529065728187561, - "learning_rate": 4.382268075522831e-05, - "loss": 0.0037526611238718035, - "step": 4280 - }, - { - "epoch": 0.8048336813537116, - "grad_norm": 0.015109462663531303, - "learning_rate": 4.3767295066748564e-05, - "loss": 0.0025708725675940513, - "step": 4300 - }, - { - "epoch": 0.8085770938251241, - "grad_norm": 0.7257627248764038, - "learning_rate": 4.371169752686316e-05, - "loss": 0.006234285607933998, - "step": 4320 - }, - { - "epoch": 0.8123205062965367, - "grad_norm": 0.016853008419275284, - "learning_rate": 4.3655888763175436e-05, - "loss": 0.0023587727919220924, - "step": 4340 - }, - { - "epoch": 0.8160639187679494, - "grad_norm": 0.017816167324781418, - "learning_rate": 4.3599869405673085e-05, - "loss": 0.0012389549054205417, - "step": 4360 - }, - { - "epoch": 0.819807331239362, - "grad_norm": 0.014672616496682167, - "learning_rate": 4.354364008672106e-05, - "loss": 0.002244691364467144, - "step": 4380 - }, - { - "epoch": 0.8235507437107746, - "grad_norm": 0.044869400560855865, - "learning_rate": 4.3487201441054435e-05, - "loss": 0.007713723182678223, - "step": 4400 - }, - { - "epoch": 0.8272941561821872, - "grad_norm": 0.06367291510105133, - "learning_rate": 4.343055410577122e-05, - "loss": 0.005743256583809852, - "step": 4420 - }, - { - "epoch": 0.8310375686535998, - "grad_norm": 0.1354215145111084, - "learning_rate": 4.3373698720325176e-05, - "loss": 0.009635470807552338, - "step": 4440 - }, - { - "epoch": 0.8347809811250124, - "grad_norm": 0.9089844822883606, - "learning_rate": 4.331663592651862e-05, - "loss": 0.01007603257894516, - "step": 4460 - }, - { - "epoch": 0.838524393596425, - "grad_norm": 0.025831619277596474, - "learning_rate": 4.3259366368495167e-05, - "loss": 0.006179215386509895, - "step": 4480 - }, - { - "epoch": 0.8422678060678377, - "grad_norm": 0.016653764992952347, - "learning_rate": 4.320189069273243e-05, - "loss": 0.0025156451389193534, - "step": 4500 - }, - { - "epoch": 0.8460112185392502, - "grad_norm": 0.27361780405044556, - "learning_rate": 4.3144209548034766e-05, - "loss": 0.002235286869108677, - "step": 4520 - }, - { - "epoch": 0.8497546310106628, - "grad_norm": 2.6958701610565186, - "learning_rate": 4.3086323585525915e-05, - "loss": 0.03571180701255798, - "step": 4540 - }, - { - "epoch": 0.8534980434820755, - "grad_norm": 0.1260778158903122, - "learning_rate": 4.3028233458641696e-05, - "loss": 0.0036518506705760954, - "step": 4560 - }, - { - "epoch": 0.8572414559534881, - "grad_norm": 0.2445528209209442, - "learning_rate": 4.2969939823122586e-05, - "loss": 0.024949796497821808, - "step": 4580 - }, - { - "epoch": 0.8609848684249007, - "grad_norm": 0.1674242913722992, - "learning_rate": 4.291144333700633e-05, - "loss": 0.002089798077940941, - "step": 4600 - }, - { - "epoch": 0.8647282808963134, - "grad_norm": 0.05161884427070618, - "learning_rate": 4.2852744660620515e-05, - "loss": 0.007847145944833756, - "step": 4620 - }, - { - "epoch": 0.8684716933677259, - "grad_norm": 0.019796324893832207, - "learning_rate": 4.279384445657514e-05, - "loss": 0.0023555334657430647, - "step": 4640 - }, - { - "epoch": 0.8722151058391385, - "grad_norm": 0.0647754967212677, - "learning_rate": 4.2734743389755096e-05, - "loss": 0.009586349129676819, - "step": 4660 - }, - { - "epoch": 0.8759585183105512, - "grad_norm": 0.015243460424244404, - "learning_rate": 4.267544212731268e-05, - "loss": 0.017788709700107576, - "step": 4680 - }, - { - "epoch": 0.8797019307819638, - "grad_norm": 0.05756703019142151, - "learning_rate": 4.261594133866007e-05, - "loss": 0.014256520569324494, - "step": 4700 - }, - { - "epoch": 0.8834453432533764, - "grad_norm": 0.2002931535243988, - "learning_rate": 4.255624169546175e-05, - "loss": 0.0014025470241904258, - "step": 4720 - }, - { - "epoch": 0.887188755724789, - "grad_norm": 0.04325389489531517, - "learning_rate": 4.249634387162696e-05, - "loss": 0.010552891343832017, - "step": 4740 - }, - { - "epoch": 0.8909321681962016, - "grad_norm": 0.8975178599357605, - "learning_rate": 4.243624854330206e-05, - "loss": 0.0032475266605615618, - "step": 4760 - }, - { - "epoch": 0.8946755806676142, - "grad_norm": 0.01541830413043499, - "learning_rate": 4.237595638886288e-05, - "loss": 0.003157203644514084, - "step": 4780 - }, - { - "epoch": 0.8984189931390268, - "grad_norm": 1.673305869102478, - "learning_rate": 4.231546808890713e-05, - "loss": 0.0028239911422133445, - "step": 4800 - }, - { - "epoch": 0.9021624056104395, - "grad_norm": 0.021689629182219505, - "learning_rate": 4.225478432624665e-05, - "loss": 0.0026885712519288062, - "step": 4820 - }, - { - "epoch": 0.905905818081852, - "grad_norm": 0.019590798765420914, - "learning_rate": 4.219390578589973e-05, - "loss": 0.00780024379491806, - "step": 4840 - }, - { - "epoch": 0.9096492305532646, - "grad_norm": 0.024581020697951317, - "learning_rate": 4.213283315508337e-05, - "loss": 0.006697511672973633, - "step": 4860 - }, - { - "epoch": 0.9133926430246773, - "grad_norm": 0.20615583658218384, - "learning_rate": 4.207156712320555e-05, - "loss": 0.007314208894968033, - "step": 4880 - }, - { - "epoch": 0.9171360554960899, - "grad_norm": 0.015673745423555374, - "learning_rate": 4.20101083818574e-05, - "loss": 0.004841562733054161, - "step": 4900 - }, - { - "epoch": 0.9208794679675025, - "grad_norm": 0.008306623436510563, - "learning_rate": 4.194845762480544e-05, - "loss": 0.0010150263085961341, - "step": 4920 - }, - { - "epoch": 0.9246228804389152, - "grad_norm": 0.051861703395843506, - "learning_rate": 4.188661554798369e-05, - "loss": 0.011043114960193634, - "step": 4940 - }, - { - "epoch": 0.9283662929103277, - "grad_norm": 1.7019767761230469, - "learning_rate": 4.1824582849485884e-05, - "loss": 0.004985674470663071, - "step": 4960 - }, - { - "epoch": 0.9321097053817403, - "grad_norm": 0.021240154281258583, - "learning_rate": 4.176236022955755e-05, - "loss": 0.04885836541652679, - "step": 4980 - }, - { - "epoch": 0.935853117853153, - "grad_norm": 0.016504865139722824, - "learning_rate": 4.16999483905881e-05, - "loss": 0.0027378931641578673, - "step": 5000 - }, - { - "epoch": 0.9395965303245656, - "grad_norm": 0.014015628024935722, - "learning_rate": 4.163734803710294e-05, - "loss": 0.012781022489070893, - "step": 5020 - }, - { - "epoch": 0.9433399427959782, - "grad_norm": 0.013812500052154064, - "learning_rate": 4.157455987575545e-05, - "loss": 0.007508871704339981, - "step": 5040 - }, - { - "epoch": 0.9470833552673907, - "grad_norm": 0.01622290164232254, - "learning_rate": 4.1511584615319075e-05, - "loss": 0.0014614147134125234, - "step": 5060 - }, - { - "epoch": 0.9508267677388034, - "grad_norm": 0.01259149145334959, - "learning_rate": 4.144842296667929e-05, - "loss": 0.006202424317598343, - "step": 5080 - }, - { - "epoch": 0.954570180210216, - "grad_norm": 0.012383027002215385, - "learning_rate": 4.138507564282558e-05, - "loss": 0.006122353300452232, - "step": 5100 - }, - { - "epoch": 0.9583135926816286, - "grad_norm": 0.006499920971691608, - "learning_rate": 4.1321543358843385e-05, - "loss": 0.0008865024894475937, - "step": 5120 - }, - { - "epoch": 0.9620570051530413, - "grad_norm": 0.00830752868205309, - "learning_rate": 4.125782683190606e-05, - "loss": 0.0008420860394835472, - "step": 5140 - }, - { - "epoch": 0.9658004176244538, - "grad_norm": 0.01525857299566269, - "learning_rate": 4.119392678126673e-05, - "loss": 0.00587364137172699, - "step": 5160 - }, - { - "epoch": 0.9695438300958664, - "grad_norm": 0.01072095800191164, - "learning_rate": 4.11298439282502e-05, - "loss": 0.00853007659316063, - "step": 5180 - }, - { - "epoch": 0.973287242567279, - "grad_norm": 0.030316641554236412, - "learning_rate": 4.106557899624482e-05, - "loss": 0.0058747071772813795, - "step": 5200 - }, - { - "epoch": 0.9770306550386917, - "grad_norm": 0.0391647033393383, - "learning_rate": 4.1001132710694304e-05, - "loss": 0.0034765828400850295, - "step": 5220 - }, - { - "epoch": 0.9807740675101043, - "grad_norm": 0.04938298836350441, - "learning_rate": 4.093650579908953e-05, - "loss": 0.007594724744558334, - "step": 5240 - }, - { - "epoch": 0.984517479981517, - "grad_norm": 0.005873252172023058, - "learning_rate": 4.087169899096037e-05, - "loss": 0.013347607851028443, - "step": 5260 - }, - { - "epoch": 0.9882608924529295, - "grad_norm": 1.2757259607315063, - "learning_rate": 4.080671301786741e-05, - "loss": 0.004837355017662049, - "step": 5280 - }, - { - "epoch": 0.9920043049243421, - "grad_norm": 0.00920735765248537, - "learning_rate": 4.0741548613393675e-05, - "loss": 0.007415445148944854, - "step": 5300 - }, - { - "epoch": 0.9957477173957547, - "grad_norm": 0.5702093839645386, - "learning_rate": 4.067620651313647e-05, - "loss": 0.00406576506793499, - "step": 5320 - }, - { - "epoch": 0.9994911298671674, - "grad_norm": 1.8361051082611084, - "learning_rate": 4.0610687454698906e-05, - "loss": 0.00997612327337265, - "step": 5340 - }, - { - "epoch": 1.0031819006007008, - "grad_norm": 3.335326910018921, - "learning_rate": 4.0544992177681685e-05, - "loss": 0.008442799001932145, - "step": 5360 - }, - { - "epoch": 1.0069253130721134, - "grad_norm": 0.03184954449534416, - "learning_rate": 4.047912142367473e-05, - "loss": 0.008095134049654007, - "step": 5380 - }, - { - "epoch": 1.010668725543526, - "grad_norm": 0.029989074915647507, - "learning_rate": 4.04130759362488e-05, - "loss": 0.0012585990130901336, - "step": 5400 - }, - { - "epoch": 1.0144121380149385, - "grad_norm": 0.08727464079856873, - "learning_rate": 4.034685646094711e-05, - "loss": 0.012588074803352356, - "step": 5420 - }, - { - "epoch": 1.018155550486351, - "grad_norm": 0.018498806282877922, - "learning_rate": 4.028046374527689e-05, - "loss": 0.001854238100349903, - "step": 5440 - }, - { - "epoch": 1.0218989629577637, - "grad_norm": 0.013779236935079098, - "learning_rate": 4.021389853870095e-05, - "loss": 0.0008004569448530674, - "step": 5460 - }, - { - "epoch": 1.0256423754291764, - "grad_norm": 0.028235070407390594, - "learning_rate": 4.0147161592629306e-05, - "loss": 0.002274145185947418, - "step": 5480 - }, - { - "epoch": 1.029385787900589, - "grad_norm": 0.023030120879411697, - "learning_rate": 4.008025366041055e-05, - "loss": 0.008717305958271027, - "step": 5500 - }, - { - "epoch": 1.0331292003720016, - "grad_norm": 0.018347155302762985, - "learning_rate": 4.001317549732345e-05, - "loss": 0.00244256854057312, - "step": 5520 - }, - { - "epoch": 1.0368726128434143, - "grad_norm": 0.03449391946196556, - "learning_rate": 3.99459278605684e-05, - "loss": 0.0039924226701259615, - "step": 5540 - }, - { - "epoch": 1.0406160253148269, - "grad_norm": 0.030406463891267776, - "learning_rate": 3.9878511509258866e-05, - "loss": 0.0021008485928177834, - "step": 5560 - }, - { - "epoch": 1.0443594377862395, - "grad_norm": 0.01783100888133049, - "learning_rate": 3.9810927204412803e-05, - "loss": 0.0006656501442193985, - "step": 5580 - }, - { - "epoch": 1.0481028502576522, - "grad_norm": 0.05360455811023712, - "learning_rate": 3.974317570894413e-05, - "loss": 0.005278818309307098, - "step": 5600 - }, - { - "epoch": 1.0518462627290646, - "grad_norm": 0.008699169382452965, - "learning_rate": 3.9675257787654e-05, - "loss": 0.005309444293379784, - "step": 5620 - }, - { - "epoch": 1.0555896752004772, - "grad_norm": 0.036641959100961685, - "learning_rate": 3.960717420722227e-05, - "loss": 0.0034692320972681046, - "step": 5640 - }, - { - "epoch": 1.0593330876718898, - "grad_norm": 0.012212110683321953, - "learning_rate": 3.953892573619883e-05, - "loss": 0.005343861132860184, - "step": 5660 - }, - { - "epoch": 1.0630765001433025, - "grad_norm": 0.011296284385025501, - "learning_rate": 3.947051314499489e-05, - "loss": 0.0038058970123529432, - "step": 5680 - }, - { - "epoch": 1.066819912614715, - "grad_norm": 0.05954049900174141, - "learning_rate": 3.94019372058743e-05, - "loss": 0.008142991364002228, - "step": 5700 - }, - { - "epoch": 1.0705633250861277, - "grad_norm": 0.03478416055440903, - "learning_rate": 3.933319869294483e-05, - "loss": 0.0075227849185466765, - "step": 5720 - }, - { - "epoch": 1.0743067375575404, - "grad_norm": 0.014586996287107468, - "learning_rate": 3.9264298382149455e-05, - "loss": 0.0036750122904777526, - "step": 5740 - }, - { - "epoch": 1.078050150028953, - "grad_norm": 0.025754544883966446, - "learning_rate": 3.919523705125757e-05, - "loss": 0.004151013493537903, - "step": 5760 - }, - { - "epoch": 1.0817935625003656, - "grad_norm": 0.03239905461668968, - "learning_rate": 3.9126015479856205e-05, - "loss": 0.00861695185303688, - "step": 5780 - }, - { - "epoch": 1.0855369749717783, - "grad_norm": 0.03506994619965553, - "learning_rate": 3.9056634449341256e-05, - "loss": 0.003123755753040314, - "step": 5800 - }, - { - "epoch": 1.089280387443191, - "grad_norm": 0.0286911278963089, - "learning_rate": 3.898709474290864e-05, - "loss": 0.002537376619875431, - "step": 5820 - }, - { - "epoch": 1.0930237999146033, - "grad_norm": 0.03490692004561424, - "learning_rate": 3.8917397145545454e-05, - "loss": 0.0010227372869849205, - "step": 5840 - }, - { - "epoch": 1.096767212386016, - "grad_norm": 0.013748899102210999, - "learning_rate": 3.884754244402113e-05, - "loss": 0.011847371608018875, - "step": 5860 - }, - { - "epoch": 1.1005106248574286, - "grad_norm": 0.035458195954561234, - "learning_rate": 3.877753142687852e-05, - "loss": 0.009741749614477158, - "step": 5880 - }, - { - "epoch": 1.1042540373288412, - "grad_norm": 0.012493673712015152, - "learning_rate": 3.8707364884425064e-05, - "loss": 0.006607493013143539, - "step": 5900 - }, - { - "epoch": 1.1079974498002538, - "grad_norm": 0.018607834354043007, - "learning_rate": 3.863704360872378e-05, - "loss": 0.0016217166557908058, - "step": 5920 - }, - { - "epoch": 1.1117408622716665, - "grad_norm": 0.0283930953592062, - "learning_rate": 3.8566568393584366e-05, - "loss": 0.002083975449204445, - "step": 5940 - }, - { - "epoch": 1.115484274743079, - "grad_norm": 0.05229801684617996, - "learning_rate": 3.8495940034554283e-05, - "loss": 0.0014217685908079146, - "step": 5960 - }, - { - "epoch": 1.1192276872144917, - "grad_norm": 0.008808930404484272, - "learning_rate": 3.8425159328909684e-05, - "loss": 0.0022570645436644555, - "step": 5980 - }, - { - "epoch": 1.1229710996859044, - "grad_norm": 0.020502232015132904, - "learning_rate": 3.835422707564648e-05, - "loss": 0.003745942190289497, - "step": 6000 - }, - { - "epoch": 1.126714512157317, - "grad_norm": 0.032347094267606735, - "learning_rate": 3.82831440754713e-05, - "loss": 0.003347185626626015, - "step": 6020 - }, - { - "epoch": 1.1304579246287294, - "grad_norm": 0.020310478284955025, - "learning_rate": 3.821191113079246e-05, - "loss": 0.006166417896747589, - "step": 6040 - }, - { - "epoch": 1.134201337100142, - "grad_norm": 0.06390372663736343, - "learning_rate": 3.8140529045710876e-05, - "loss": 0.0013674044981598853, - "step": 6060 - }, - { - "epoch": 1.1379447495715547, - "grad_norm": 1.1938918828964233, - "learning_rate": 3.806899862601105e-05, - "loss": 0.010550644248723984, - "step": 6080 - }, - { - "epoch": 1.1416881620429673, - "grad_norm": 0.035355549305677414, - "learning_rate": 3.799732067915189e-05, - "loss": 0.0069750770926475525, - "step": 6100 - }, - { - "epoch": 1.14543157451438, - "grad_norm": 0.009921093471348286, - "learning_rate": 3.792549601425767e-05, - "loss": 0.0027949588373303415, - "step": 6120 - }, - { - "epoch": 1.1491749869857926, - "grad_norm": 0.06172063946723938, - "learning_rate": 3.785352544210884e-05, - "loss": 0.0009372101165354251, - "step": 6140 - }, - { - "epoch": 1.1529183994572052, - "grad_norm": 0.008572470396757126, - "learning_rate": 3.778140977513294e-05, - "loss": 0.0029502738267183303, - "step": 6160 - }, - { - "epoch": 1.1566618119286178, - "grad_norm": 0.4211727976799011, - "learning_rate": 3.770914982739534e-05, - "loss": 0.014692296087741853, - "step": 6180 - }, - { - "epoch": 1.1604052244000305, - "grad_norm": 0.02292146533727646, - "learning_rate": 3.7636746414590126e-05, - "loss": 0.0020170681178569793, - "step": 6200 - }, - { - "epoch": 1.164148636871443, - "grad_norm": 0.11247449368238449, - "learning_rate": 3.756420035403086e-05, - "loss": 0.006851900368928909, - "step": 6220 - }, - { - "epoch": 1.1678920493428557, - "grad_norm": 0.020755017176270485, - "learning_rate": 3.749151246464137e-05, - "loss": 0.0021739909425377846, - "step": 6240 - }, - { - "epoch": 1.1716354618142684, - "grad_norm": 0.017202025279402733, - "learning_rate": 3.741868356694647e-05, - "loss": 0.002353278361260891, - "step": 6260 - }, - { - "epoch": 1.1753788742856808, - "grad_norm": 0.014947429299354553, - "learning_rate": 3.734571448306274e-05, - "loss": 0.0010860362090170383, - "step": 6280 - }, - { - "epoch": 1.1791222867570934, - "grad_norm": 1.5391262769699097, - "learning_rate": 3.727260603668922e-05, - "loss": 0.01233254000544548, - "step": 6300 - }, - { - "epoch": 1.182865699228506, - "grad_norm": 0.4759792387485504, - "learning_rate": 3.7199359053098133e-05, - "loss": 0.0028501398861408233, - "step": 6320 - }, - { - "epoch": 1.1866091116999187, - "grad_norm": 0.01719040609896183, - "learning_rate": 3.7125974359125536e-05, - "loss": 0.00934450700879097, - "step": 6340 - }, - { - "epoch": 1.1903525241713313, - "grad_norm": 2.4766688346862793, - "learning_rate": 3.7052452783162015e-05, - "loss": 0.018582724034786224, - "step": 6360 - }, - { - "epoch": 1.194095936642744, - "grad_norm": 0.11404932290315628, - "learning_rate": 3.6978795155143326e-05, - "loss": 0.01815672367811203, - "step": 6380 - }, - { - "epoch": 1.1978393491141566, - "grad_norm": 0.021365633234381676, - "learning_rate": 3.690500230654103e-05, - "loss": 0.004123781993985176, - "step": 6400 - }, - { - "epoch": 1.2015827615855692, - "grad_norm": 0.022478772327303886, - "learning_rate": 3.68310750703531e-05, - "loss": 0.0038731731474399567, - "step": 6420 - }, - { - "epoch": 1.2053261740569818, - "grad_norm": 0.15531578660011292, - "learning_rate": 3.67570142810945e-05, - "loss": 0.002076444961130619, - "step": 6440 - }, - { - "epoch": 1.2090695865283942, - "grad_norm": 0.012458150275051594, - "learning_rate": 3.668282077478783e-05, - "loss": 0.0027592860162258146, - "step": 6460 - }, - { - "epoch": 1.2128129989998069, - "grad_norm": 0.01572798565030098, - "learning_rate": 3.66084953889538e-05, - "loss": 0.002740098722279072, - "step": 6480 - }, - { - "epoch": 1.2165564114712195, - "grad_norm": 0.13682503998279572, - "learning_rate": 3.6534038962601835e-05, - "loss": 0.000705425813794136, - "step": 6500 - }, - { - "epoch": 1.2202998239426321, - "grad_norm": 0.030630914494395256, - "learning_rate": 3.64594523362206e-05, - "loss": 0.012480729073286057, - "step": 6520 - }, - { - "epoch": 1.2240432364140448, - "grad_norm": 0.024804554879665375, - "learning_rate": 3.638473635176848e-05, - "loss": 0.0007834361866116523, - "step": 6540 - }, - { - "epoch": 1.2277866488854574, - "grad_norm": 0.011334752663969994, - "learning_rate": 3.630989185266411e-05, - "loss": 0.022086825966835023, - "step": 6560 - }, - { - "epoch": 1.23153006135687, - "grad_norm": 0.020346902310848236, - "learning_rate": 3.623491968377684e-05, - "loss": 0.018024472892284392, - "step": 6580 - }, - { - "epoch": 1.2352734738282827, - "grad_norm": 0.015177210792899132, - "learning_rate": 3.615982069141719e-05, - "loss": 0.005251453071832657, - "step": 6600 - }, - { - "epoch": 1.2390168862996953, - "grad_norm": 0.013680647127330303, - "learning_rate": 3.608459572332733e-05, - "loss": 0.006734563410282135, - "step": 6620 - }, - { - "epoch": 1.242760298771108, - "grad_norm": 0.17980872094631195, - "learning_rate": 3.600924562867144e-05, - "loss": 0.003970410302281379, - "step": 6640 - }, - { - "epoch": 1.2465037112425206, - "grad_norm": 0.015203841030597687, - "learning_rate": 3.593377125802622e-05, - "loss": 0.0032148901373147964, - "step": 6660 - }, - { - "epoch": 1.2502471237139332, - "grad_norm": 0.017300931736826897, - "learning_rate": 3.585817346337119e-05, - "loss": 0.00467667318880558, - "step": 6680 - }, - { - "epoch": 1.2539905361853458, - "grad_norm": 0.028181765228509903, - "learning_rate": 3.5782453098079175e-05, - "loss": 0.0015515764243900776, - "step": 6700 - }, - { - "epoch": 1.2577339486567582, - "grad_norm": 0.01730780117213726, - "learning_rate": 3.570661101690657e-05, - "loss": 0.007991334050893783, - "step": 6720 - }, - { - "epoch": 1.2614773611281709, - "grad_norm": 0.014216347597539425, - "learning_rate": 3.5630648075983763e-05, - "loss": 0.002533360943198204, - "step": 6740 - }, - { - "epoch": 1.2652207735995835, - "grad_norm": 0.1556195169687271, - "learning_rate": 3.555456513280544e-05, - "loss": 0.0032653655856847764, - "step": 6760 - }, - { - "epoch": 1.2689641860709961, - "grad_norm": 0.023955868557095528, - "learning_rate": 3.5478363046220915e-05, - "loss": 0.00850408971309662, - "step": 6780 - }, - { - "epoch": 1.2727075985424088, - "grad_norm": 0.17874136567115784, - "learning_rate": 3.5402042676424424e-05, - "loss": 0.0032720811665058135, - "step": 6800 - }, - { - "epoch": 1.2764510110138214, - "grad_norm": 0.0899379625916481, - "learning_rate": 3.5325604884945434e-05, - "loss": 0.003243798017501831, - "step": 6820 - }, - { - "epoch": 1.280194423485234, - "grad_norm": 0.413362056016922, - "learning_rate": 3.5249050534638906e-05, - "loss": 0.0036127623170614243, - "step": 6840 - }, - { - "epoch": 1.2839378359566467, - "grad_norm": 0.02790931612253189, - "learning_rate": 3.517238048967554e-05, - "loss": 0.008225285261869431, - "step": 6860 - }, - { - "epoch": 1.287681248428059, - "grad_norm": 0.6761110424995422, - "learning_rate": 3.5095595615532056e-05, - "loss": 0.00199942234903574, - "step": 6880 - }, - { - "epoch": 1.2914246608994717, - "grad_norm": 4.593618869781494, - "learning_rate": 3.5018696778981385e-05, - "loss": 0.007301987707614898, - "step": 6900 - }, - { - "epoch": 1.2951680733708844, - "grad_norm": 0.09392693638801575, - "learning_rate": 3.494168484808293e-05, - "loss": 0.009008315950632095, - "step": 6920 - }, - { - "epoch": 1.298911485842297, - "grad_norm": 0.008239852264523506, - "learning_rate": 3.48645606921727e-05, - "loss": 0.012661360204219818, - "step": 6940 - }, - { - "epoch": 1.3026548983137096, - "grad_norm": 0.05141177773475647, - "learning_rate": 3.4787325181853576e-05, - "loss": 0.0007553372532129287, - "step": 6960 - }, - { - "epoch": 1.3063983107851223, - "grad_norm": 0.024333903566002846, - "learning_rate": 3.470997918898541e-05, - "loss": 0.0016128463670611382, - "step": 6980 - }, - { - "epoch": 1.3101417232565349, - "grad_norm": 0.0337531715631485, - "learning_rate": 3.4632523586675254e-05, - "loss": 0.003253454715013504, - "step": 7000 - }, - { - "epoch": 1.3138851357279475, - "grad_norm": 0.05121550336480141, - "learning_rate": 3.4554959249267436e-05, - "loss": 0.0026307271793484686, - "step": 7020 - }, - { - "epoch": 1.3176285481993602, - "grad_norm": 0.025997543707489967, - "learning_rate": 3.447728705233374e-05, - "loss": 0.0012719514779746532, - "step": 7040 - }, - { - "epoch": 1.3213719606707728, - "grad_norm": 0.009486268274486065, - "learning_rate": 3.4399507872663494e-05, - "loss": 0.002009082958102226, - "step": 7060 - }, - { - "epoch": 1.3251153731421854, - "grad_norm": 0.016816232353448868, - "learning_rate": 3.432162258825369e-05, - "loss": 0.0005956823006272316, - "step": 7080 - }, - { - "epoch": 1.328858785613598, - "grad_norm": 0.004733961541205645, - "learning_rate": 3.424363207829906e-05, - "loss": 0.003636709600687027, - "step": 7100 - }, - { - "epoch": 1.3326021980850107, - "grad_norm": 3.666203498840332, - "learning_rate": 3.4165537223182155e-05, - "loss": 0.010488419234752655, - "step": 7120 - }, - { - "epoch": 1.336345610556423, - "grad_norm": 0.021471882238984108, - "learning_rate": 3.408733890446341e-05, - "loss": 0.0009709249250590801, - "step": 7140 - }, - { - "epoch": 1.3400890230278357, - "grad_norm": 0.007639541756361723, - "learning_rate": 3.40090380048712e-05, - "loss": 0.0030905861407518388, - "step": 7160 - }, - { - "epoch": 1.3438324354992484, - "grad_norm": 0.16878941655158997, - "learning_rate": 3.393063540829186e-05, - "loss": 0.0036965351551771163, - "step": 7180 - }, - { - "epoch": 1.347575847970661, - "grad_norm": 0.07014094293117523, - "learning_rate": 3.385213199975971e-05, - "loss": 0.0005677144508808851, - "step": 7200 - }, - { - "epoch": 1.3513192604420736, - "grad_norm": 0.008626374416053295, - "learning_rate": 3.377352866544706e-05, - "loss": 0.0005447934381663799, - "step": 7220 - }, - { - "epoch": 1.3550626729134863, - "grad_norm": 0.013825134374201298, - "learning_rate": 3.3694826292654246e-05, - "loss": 0.004854041337966919, - "step": 7240 - }, - { - "epoch": 1.3588060853848989, - "grad_norm": 0.025015883147716522, - "learning_rate": 3.361602576979956e-05, - "loss": 0.004542553424835205, - "step": 7260 - }, - { - "epoch": 1.3625494978563115, - "grad_norm": 0.009614030830562115, - "learning_rate": 3.353712798640923e-05, - "loss": 0.0008775785565376282, - "step": 7280 - }, - { - "epoch": 1.366292910327724, - "grad_norm": 3.8835268020629883, - "learning_rate": 3.345813383310744e-05, - "loss": 0.0063879616558551785, - "step": 7300 - }, - { - "epoch": 1.3700363227991366, - "grad_norm": 0.005518193822354078, - "learning_rate": 3.337904420160618e-05, - "loss": 0.0010956574231386184, - "step": 7320 - }, - { - "epoch": 1.3737797352705492, - "grad_norm": 0.005018322728574276, - "learning_rate": 3.329985998469526e-05, - "loss": 0.0012317843735218047, - "step": 7340 - }, - { - "epoch": 0.6887872232777639, - "grad_norm": 0.3108454942703247, - "learning_rate": 3.322058207623218e-05, - "loss": 0.010070423781871795, - "step": 7360 - }, - { - "epoch": 0.6906589276888447, - "grad_norm": 0.3556046783924103, - "learning_rate": 3.314121137113209e-05, - "loss": 0.0278738796710968, - "step": 7380 - }, - { - "epoch": 0.6925306320999256, - "grad_norm": 4.041794300079346, - "learning_rate": 3.306174876535762e-05, - "loss": 0.025335192680358887, - "step": 7400 - }, - { - "epoch": 0.6944023365110065, - "grad_norm": 0.04647493362426758, - "learning_rate": 3.2982195155908845e-05, - "loss": 0.05056847333908081, - "step": 7420 - }, - { - "epoch": 0.6962740409220873, - "grad_norm": 0.6827419400215149, - "learning_rate": 3.290653575270209e-05, - "loss": 0.036053261160850524, - "step": 7440 - }, - { - "epoch": 0.6981457453331683, - "grad_norm": 0.256136029958725, - "learning_rate": 3.2826807269966064e-05, - "loss": 0.020640365779399872, - "step": 7460 - }, - { - "epoch": 0.7000174497442492, - "grad_norm": 0.2054845094680786, - "learning_rate": 3.274699043565268e-05, - "loss": 0.03456352353096008, - "step": 7480 - }, - { - "epoch": 0.70188915415533, - "grad_norm": 0.2027648538351059, - "learning_rate": 3.266708615076064e-05, - "loss": 0.00846734493970871, - "step": 7500 - }, - { - "epoch": 0.7037608585664109, - "grad_norm": 1.6423311233520508, - "learning_rate": 3.258709531727582e-05, - "loss": 0.054978948831558225, - "step": 7520 - }, - { - "epoch": 0.7056325629774918, - "grad_norm": 1.775089144706726, - "learning_rate": 3.2507018838161085e-05, - "loss": 0.03238933086395264, - "step": 7540 - }, - { - "epoch": 0.7075042673885726, - "grad_norm": 0.06917860358953476, - "learning_rate": 3.242685761734609e-05, - "loss": 0.016849520802497863, - "step": 7560 - }, - { - "epoch": 0.7093759717996535, - "grad_norm": 0.051443129777908325, - "learning_rate": 3.2346612559717094e-05, - "loss": 0.048251998424530027, - "step": 7580 - }, - { - "epoch": 0.7112476762107344, - "grad_norm": 0.06533925980329514, - "learning_rate": 3.226628457110672e-05, - "loss": 0.03696450293064117, - "step": 7600 - }, - { - "epoch": 0.7131193806218153, - "grad_norm": 0.45661595463752747, - "learning_rate": 3.218587455828377e-05, - "loss": 0.05503013730049133, - "step": 7620 - }, - { - "epoch": 0.7149910850328962, - "grad_norm": 2.0205914974212646, - "learning_rate": 3.210538342894291e-05, - "loss": 0.033562681078910826, - "step": 7640 - }, - { - "epoch": 0.7168627894439771, - "grad_norm": 2.4842448234558105, - "learning_rate": 3.202481209169455e-05, - "loss": 0.019278638064861298, - "step": 7660 - }, - { - "epoch": 0.7187344938550579, - "grad_norm": 0.10550081729888916, - "learning_rate": 3.1944161456054436e-05, - "loss": 0.01638232171535492, - "step": 7680 - }, - { - "epoch": 0.7206061982661388, - "grad_norm": 1.606436014175415, - "learning_rate": 3.1863432432433506e-05, - "loss": 0.020552067458629607, - "step": 7700 - }, - { - "epoch": 0.7224779026772197, - "grad_norm": 0.2617719769477844, - "learning_rate": 3.178262593212757e-05, - "loss": 0.02315783053636551, - "step": 7720 - }, - { - "epoch": 0.7243496070883005, - "grad_norm": 0.9734074473381042, - "learning_rate": 3.1701742867307e-05, - "loss": 0.01938771307468414, - "step": 7740 - }, - { - "epoch": 0.7262213114993814, - "grad_norm": 0.5882985591888428, - "learning_rate": 3.162078415100647e-05, - "loss": 0.011305707693099975, - "step": 7760 - }, - { - "epoch": 0.7280930159104624, - "grad_norm": 0.04298723489046097, - "learning_rate": 3.15397506971146e-05, - "loss": 0.04238930344581604, - "step": 7780 - }, - { - "epoch": 0.7299647203215432, - "grad_norm": 6.2729315757751465, - "learning_rate": 3.145864342036372e-05, - "loss": 0.030225831270217895, - "step": 7800 - }, - { - "epoch": 0.7318364247326241, - "grad_norm": 0.026423340663313866, - "learning_rate": 3.1377463236319476e-05, - "loss": 0.012169972807168961, - "step": 7820 - }, - { - "epoch": 0.733708129143705, - "grad_norm": 0.0296376533806324, - "learning_rate": 3.1296211061370495e-05, - "loss": 0.015344823896884918, - "step": 7840 - }, - { - "epoch": 0.7355798335547858, - "grad_norm": 0.029524821788072586, - "learning_rate": 3.1214887812718094e-05, - "loss": 0.028345003724098206, - "step": 7860 - }, - { - "epoch": 0.7374515379658667, - "grad_norm": 0.06847794353961945, - "learning_rate": 3.113349440836588e-05, - "loss": 0.020069575309753417, - "step": 7880 - }, - { - "epoch": 0.7393232423769476, - "grad_norm": 0.024868430569767952, - "learning_rate": 3.1052031767109376e-05, - "loss": 0.014262473583221436, - "step": 7900 - }, - { - "epoch": 0.7411949467880286, - "grad_norm": 0.24450063705444336, - "learning_rate": 3.097050080852573e-05, - "loss": 0.04350808262825012, - "step": 7920 - }, - { - "epoch": 0.7430666511991094, - "grad_norm": 0.06978324800729752, - "learning_rate": 3.088890245296322e-05, - "loss": 0.015559709072113037, - "step": 7940 - }, - { - "epoch": 0.7449383556101903, - "grad_norm": 0.12675604224205017, - "learning_rate": 3.0807237621530964e-05, - "loss": 0.013867451250553131, - "step": 7960 - }, - { - "epoch": 0.7468100600212711, - "grad_norm": 0.2605513334274292, - "learning_rate": 3.072550723608846e-05, - "loss": 0.012869009375572204, - "step": 7980 - }, - { - "epoch": 0.748681764432352, - "grad_norm": 3.325530529022217, - "learning_rate": 3.064371221923521e-05, - "loss": 0.03036353886127472, - "step": 8000 - }, - { - "epoch": 0.7505534688434329, - "grad_norm": 0.22703051567077637, - "learning_rate": 3.0561853494300294e-05, - "loss": 0.009017374366521835, - "step": 8020 - }, - { - "epoch": 0.7524251732545137, - "grad_norm": 6.404862880706787, - "learning_rate": 3.047993198533195e-05, - "loss": 0.020604299008846284, - "step": 8040 - }, - { - "epoch": 0.7542968776655946, - "grad_norm": 0.06491954624652863, - "learning_rate": 3.039794861708714e-05, - "loss": 0.014963623881340028, - "step": 8060 - }, - { - "epoch": 0.7561685820766756, - "grad_norm": 0.4990088641643524, - "learning_rate": 3.0315904315021128e-05, - "loss": 0.02046530395746231, - "step": 8080 - }, - { - "epoch": 0.7580402864877565, - "grad_norm": 0.3174229562282562, - "learning_rate": 3.023380000527699e-05, - "loss": 0.013621781766414643, - "step": 8100 - }, - { - "epoch": 0.7599119908988373, - "grad_norm": 0.07161428034305573, - "learning_rate": 3.0151636614675218e-05, - "loss": 0.008043503761291504, - "step": 8120 - }, - { - "epoch": 0.7617836953099182, - "grad_norm": 0.6772736310958862, - "learning_rate": 3.0069415070703217e-05, - "loss": 0.03563189804553986, - "step": 8140 - }, - { - "epoch": 0.763655399720999, - "grad_norm": 0.07689516246318817, - "learning_rate": 2.998713630150485e-05, - "loss": 0.008622632920742035, - "step": 8160 - }, - { - "epoch": 0.7655271041320799, - "grad_norm": 0.014181110076606274, - "learning_rate": 2.990480123586994e-05, - "loss": 0.012368627637624741, - "step": 8180 - }, - { - "epoch": 0.7673988085431608, - "grad_norm": 4.4751715660095215, - "learning_rate": 2.9822410803223822e-05, - "loss": 0.02100955694913864, - "step": 8200 - }, - { - "epoch": 0.7692705129542416, - "grad_norm": 0.12694527208805084, - "learning_rate": 2.9739965933616825e-05, - "loss": 0.018182000517845152, - "step": 8220 - }, - { - "epoch": 0.7711422173653226, - "grad_norm": 0.13789872825145721, - "learning_rate": 2.9657467557713792e-05, - "loss": 0.008949784934520722, - "step": 8240 - }, - { - "epoch": 0.7730139217764035, - "grad_norm": 0.04048463702201843, - "learning_rate": 2.957491660678354e-05, - "loss": 0.03582434058189392, - "step": 8260 - }, - { - "epoch": 0.7748856261874844, - "grad_norm": 0.7825964689254761, - "learning_rate": 2.9492314012688378e-05, - "loss": 0.012679101526737213, - "step": 8280 - }, - { - "epoch": 0.7767573305985652, - "grad_norm": 0.14350314438343048, - "learning_rate": 2.9409660707873597e-05, - "loss": 0.010909486562013626, - "step": 8300 - }, - { - "epoch": 0.7786290350096461, - "grad_norm": 0.17676737904548645, - "learning_rate": 2.932695762535691e-05, - "loss": 0.01464642733335495, - "step": 8320 - }, - { - "epoch": 0.780500739420727, - "grad_norm": 0.5979751348495483, - "learning_rate": 2.9244205698717943e-05, - "loss": 0.028799059987068176, - "step": 8340 - }, - { - "epoch": 0.7823724438318078, - "grad_norm": 0.08448052406311035, - "learning_rate": 2.9161405862087676e-05, - "loss": 0.014056096971035003, - "step": 8360 - }, - { - "epoch": 0.7842441482428888, - "grad_norm": 0.5616207122802734, - "learning_rate": 2.9078559050137955e-05, - "loss": 0.008744364231824875, - "step": 8380 - }, - { - "epoch": 0.7861158526539697, - "grad_norm": 0.7264829277992249, - "learning_rate": 2.8995666198070836e-05, - "loss": 0.014575870335102081, - "step": 8400 - }, - { - "epoch": 0.7879875570650505, - "grad_norm": 1.444239616394043, - "learning_rate": 2.891272824160815e-05, - "loss": 0.01230706349015236, - "step": 8420 - }, - { - "epoch": 0.7898592614761314, - "grad_norm": 0.02643579989671707, - "learning_rate": 2.882974611698084e-05, - "loss": 0.01713460832834244, - "step": 8440 - }, - { - "epoch": 0.7917309658872123, - "grad_norm": 0.19893163442611694, - "learning_rate": 2.8746720760918457e-05, - "loss": 0.009562552720308305, - "step": 8460 - }, - { - "epoch": 0.7936026702982931, - "grad_norm": 1.8813897371292114, - "learning_rate": 2.866365311063855e-05, - "loss": 0.01966284364461899, - "step": 8480 - }, - { - "epoch": 0.795474374709374, - "grad_norm": 0.1820579618215561, - "learning_rate": 2.8580544103836114e-05, - "loss": 0.023943188786506652, - "step": 8500 - }, - { - "epoch": 0.7973460791204549, - "grad_norm": 1.3913259506225586, - "learning_rate": 2.849739467867298e-05, - "loss": 0.02233349084854126, - "step": 8520 - }, - { - "epoch": 0.7992177835315358, - "grad_norm": 0.28450486063957214, - "learning_rate": 2.8414205773767223e-05, - "loss": 0.016230446100234986, - "step": 8540 - }, - { - "epoch": 0.8010894879426167, - "grad_norm": 0.46086356043815613, - "learning_rate": 2.83309783281826e-05, - "loss": 0.013964855670928955, - "step": 8560 - }, - { - "epoch": 0.8029611923536976, - "grad_norm": 1.1401137113571167, - "learning_rate": 2.8247713281417924e-05, - "loss": 0.01552264392375946, - "step": 8580 - }, - { - "epoch": 0.8048328967647784, - "grad_norm": 0.02414649911224842, - "learning_rate": 2.8164411573396444e-05, - "loss": 0.00505053773522377, - "step": 8600 - }, - { - "epoch": 0.8067046011758593, - "grad_norm": 0.029010778293013573, - "learning_rate": 2.8081074144455276e-05, - "loss": 0.008068422973155975, - "step": 8620 - }, - { - "epoch": 0.8085763055869402, - "grad_norm": 0.024924319237470627, - "learning_rate": 2.7997701935334747e-05, - "loss": 0.021529987454414368, - "step": 8640 - }, - { - "epoch": 0.810448009998021, - "grad_norm": 0.3544171154499054, - "learning_rate": 2.791429588716782e-05, - "loss": 0.008264218270778657, - "step": 8660 - }, - { - "epoch": 0.8123197144091019, - "grad_norm": 0.011211074888706207, - "learning_rate": 2.7830856941469407e-05, - "loss": 0.013752134144306183, - "step": 8680 - }, - { - "epoch": 0.8141914188201829, - "grad_norm": 0.30479249358177185, - "learning_rate": 2.7747386040125807e-05, - "loss": 0.01313515156507492, - "step": 8700 - }, - { - "epoch": 0.8160631232312637, - "grad_norm": 3.1079516410827637, - "learning_rate": 2.766388412538404e-05, - "loss": 0.013471932709217071, - "step": 8720 - }, - { - "epoch": 0.8179348276423446, - "grad_norm": 0.011288405396044254, - "learning_rate": 2.758035213984121e-05, - "loss": 0.011207062005996703, - "step": 8740 - }, - { - "epoch": 0.8198065320534255, - "grad_norm": 0.011481484398245811, - "learning_rate": 2.749679102643387e-05, - "loss": 0.018254657089710236, - "step": 8760 - }, - { - "epoch": 0.8216782364645063, - "grad_norm": 0.037564992904663086, - "learning_rate": 2.7413201728427372e-05, - "loss": 0.024057184159755707, - "step": 8780 - }, - { - "epoch": 0.8235499408755872, - "grad_norm": 0.03808968514204025, - "learning_rate": 2.7329585189405253e-05, - "loss": 0.006051592528820038, - "step": 8800 - }, - { - "epoch": 0.8254216452866681, - "grad_norm": 0.07610247284173965, - "learning_rate": 2.724594235325852e-05, - "loss": 0.025592076778411865, - "step": 8820 - }, - { - "epoch": 0.827293349697749, - "grad_norm": 0.019049810245633125, - "learning_rate": 2.716227416417505e-05, - "loss": 0.0037486787885427477, - "step": 8840 - }, - { - "epoch": 0.8291650541088299, - "grad_norm": 0.6380273699760437, - "learning_rate": 2.7078581566628897e-05, - "loss": 0.015487492084503174, - "step": 8860 - }, - { - "epoch": 0.8310367585199108, - "grad_norm": 0.05775881186127663, - "learning_rate": 2.699486550536968e-05, - "loss": 0.03133237063884735, - "step": 8880 - }, - { - "epoch": 0.8329084629309916, - "grad_norm": 0.047411222010850906, - "learning_rate": 2.6911126925411845e-05, - "loss": 0.00861177071928978, - "step": 8900 - }, - { - "epoch": 0.8347801673420725, - "grad_norm": 0.23981286585330963, - "learning_rate": 2.682736677202406e-05, - "loss": 0.01839599907398224, - "step": 8920 - }, - { - "epoch": 0.8366518717531534, - "grad_norm": 0.36887305974960327, - "learning_rate": 2.6743585990718505e-05, - "loss": 0.01008533239364624, - "step": 8940 - }, - { - "epoch": 0.8385235761642342, - "grad_norm": 0.8994531035423279, - "learning_rate": 2.6659785527240233e-05, - "loss": 0.027107802033424378, - "step": 8960 - }, - { - "epoch": 0.8403952805753151, - "grad_norm": 0.12780402600765228, - "learning_rate": 2.6575966327556458e-05, - "loss": 0.03549482524394989, - "step": 8980 - }, - { - "epoch": 0.8422669849863961, - "grad_norm": 0.3294568359851837, - "learning_rate": 2.649212933784591e-05, - "loss": 0.02797776460647583, - "step": 9000 - }, - { - "epoch": 0.8441386893974769, - "grad_norm": 0.019461506977677345, - "learning_rate": 2.640827550448812e-05, - "loss": 0.010047334432601928, - "step": 9020 - }, - { - "epoch": 0.8460103938085578, - "grad_norm": 0.056546472012996674, - "learning_rate": 2.6324405774052784e-05, - "loss": 0.02831721007823944, - "step": 9040 - }, - { - "epoch": 0.8478820982196387, - "grad_norm": 0.017190299928188324, - "learning_rate": 2.6240521093289022e-05, - "loss": 0.019623257219791412, - "step": 9060 - }, - { - "epoch": 0.8497538026307195, - "grad_norm": 0.04793965816497803, - "learning_rate": 2.6156622409114728e-05, - "loss": 0.011966148018836975, - "step": 9080 - }, - { - "epoch": 0.8516255070418004, - "grad_norm": 0.006742037367075682, - "learning_rate": 2.607271066860587e-05, - "loss": 0.013694784045219422, - "step": 9100 - }, - { - "epoch": 0.8534972114528813, - "grad_norm": 0.03113027848303318, - "learning_rate": 2.5988786818985812e-05, - "loss": 0.05338943004608154, - "step": 9120 - }, - { - "epoch": 0.8553689158639621, - "grad_norm": 0.6589255928993225, - "learning_rate": 2.5904851807614588e-05, - "loss": 0.01305432766675949, - "step": 9140 - }, - { - "epoch": 0.8572406202750431, - "grad_norm": 0.3030281960964203, - "learning_rate": 2.582090658197825e-05, - "loss": 0.03663805425167084, - "step": 9160 - }, - { - "epoch": 0.859112324686124, - "grad_norm": 0.37101081013679504, - "learning_rate": 2.573695208967814e-05, - "loss": 0.016968609392642976, - "step": 9180 - }, - { - "epoch": 0.8609840290972048, - "grad_norm": 0.7480998039245605, - "learning_rate": 2.5652989278420197e-05, - "loss": 0.021240857243537904, - "step": 9200 - }, - { - "epoch": 0.8628557335082857, - "grad_norm": 0.017131274566054344, - "learning_rate": 2.5569019096004304e-05, - "loss": 0.004783949628472328, - "step": 9220 - }, - { - "epoch": 0.8647274379193666, - "grad_norm": 1.1544040441513062, - "learning_rate": 2.5485042490313504e-05, - "loss": 0.02356208860874176, - "step": 9240 - }, - { - "epoch": 0.8665991423304474, - "grad_norm": 0.13512635231018066, - "learning_rate": 2.540106040930338e-05, - "loss": 0.009329542517662048, - "step": 9260 - }, - { - "epoch": 0.8684708467415283, - "grad_norm": 0.018427839502692223, - "learning_rate": 2.5317073800991304e-05, - "loss": 0.007472375035285949, - "step": 9280 - }, - { - "epoch": 0.8703425511526093, - "grad_norm": 0.02722800336778164, - "learning_rate": 2.5233083613445778e-05, - "loss": 0.020304642617702484, - "step": 9300 - }, - { - "epoch": 0.8722142555636901, - "grad_norm": 0.051702745258808136, - "learning_rate": 2.5149090794775675e-05, - "loss": 0.02955295443534851, - "step": 9320 - }, - { - "epoch": 0.874085959974771, - "grad_norm": 0.1535400152206421, - "learning_rate": 2.5065096293119604e-05, - "loss": 0.030047640204429626, - "step": 9340 - }, - { - "epoch": 0.8759576643858519, - "grad_norm": 0.383573979139328, - "learning_rate": 2.498110105663513e-05, - "loss": 0.011377302557229995, - "step": 9360 - }, - { - "epoch": 0.8778293687969327, - "grad_norm": 0.23541487753391266, - "learning_rate": 2.489710603348817e-05, - "loss": 0.02304387390613556, - "step": 9380 - }, - { - "epoch": 0.8797010732080136, - "grad_norm": 0.029004938900470734, - "learning_rate": 2.4813112171842162e-05, - "loss": 0.020582889020442963, - "step": 9400 - }, - { - "epoch": 0.8815727776190945, - "grad_norm": 0.06564116477966309, - "learning_rate": 2.4729120419847498e-05, - "loss": 0.014207787811756134, - "step": 9420 - }, - { - "epoch": 0.8834444820301753, - "grad_norm": 0.01633615791797638, - "learning_rate": 2.464513172563072e-05, - "loss": 0.01756283938884735, - "step": 9440 - }, - { - "epoch": 0.8853161864412563, - "grad_norm": 0.01287770178169012, - "learning_rate": 2.456114703728386e-05, - "loss": 0.003737853467464447, - "step": 9460 - }, - { - "epoch": 0.8871878908523372, - "grad_norm": 0.05004064738750458, - "learning_rate": 2.448136615728485e-05, - "loss": 0.0324675589799881, - "step": 9480 - }, - { - "epoch": 0.889059595263418, - "grad_norm": 1.20869779586792, - "learning_rate": 2.4397392007153162e-05, - "loss": 0.007156150788068772, - "step": 9500 - }, - { - "epoch": 0.8909312996744989, - "grad_norm": 1.1070218086242676, - "learning_rate": 2.43134246594589e-05, - "loss": 0.009275762736797333, - "step": 9520 - }, - { - "epoch": 0.8928030040855798, - "grad_norm": 0.878593385219574, - "learning_rate": 2.4229465062053136e-05, - "loss": 0.018170186877250673, - "step": 9540 - }, - { - "epoch": 2.2363022211823886, - "grad_norm": 0.008350222371518612, - "learning_rate": 1.4461640332194936e-05, - "loss": 0.0004116921336390078, - "step": 9560 - }, - { - "epoch": 2.24098147308699, - "grad_norm": 0.007046359125524759, - "learning_rate": 1.4366537531356394e-05, - "loss": 0.002331317216157913, - "step": 9580 - }, - { - "epoch": 2.245660724991592, - "grad_norm": 0.017349600791931152, - "learning_rate": 1.4271622228435674e-05, - "loss": 0.002835669182240963, - "step": 9600 - }, - { - "epoch": 2.2503399768961936, - "grad_norm": 0.002660916419699788, - "learning_rate": 1.4176896097057135e-05, - "loss": 0.0046301551163196565, - "step": 9620 - }, - { - "epoch": 2.2550192288007955, - "grad_norm": 0.0053788539953529835, - "learning_rate": 1.4082360807509482e-05, - "loss": 0.0025884199887514113, - "step": 9640 - }, - { - "epoch": 2.2596984807053975, - "grad_norm": 0.008494540117681026, - "learning_rate": 1.3988018026716371e-05, - "loss": 0.004505794495344162, - "step": 9660 - }, - { - "epoch": 2.264377732609999, - "grad_norm": 0.0039060732815414667, - "learning_rate": 1.3893869418206949e-05, - "loss": 0.006675707548856736, - "step": 9680 - }, - { - "epoch": 2.269056984514601, - "grad_norm": 0.009458056651055813, - "learning_rate": 1.3799916642086585e-05, - "loss": 0.0005204954650253057, - "step": 9700 - }, - { - "epoch": 2.2737362364192024, - "grad_norm": 0.003356020897626877, - "learning_rate": 1.3706161355007579e-05, - "loss": 0.0003609266597777605, - "step": 9720 - }, - { - "epoch": 2.2784154883238044, - "grad_norm": 0.5002136826515198, - "learning_rate": 1.3612605210139912e-05, - "loss": 0.004545193165540695, - "step": 9740 - }, - { - "epoch": 2.283094740228406, - "grad_norm": 0.005455829668790102, - "learning_rate": 1.3519249857142147e-05, - "loss": 0.0071854725480079654, - "step": 9760 - }, - { - "epoch": 2.287773992133008, - "grad_norm": 0.0034075307194143534, - "learning_rate": 1.3426096942132305e-05, - "loss": 0.0009716464206576348, - "step": 9780 - }, - { - "epoch": 2.2924532440376093, - "grad_norm": 0.027845729142427444, - "learning_rate": 1.3333148107658883e-05, - "loss": 0.002121881954371929, - "step": 9800 - }, - { - "epoch": 2.2971324959422112, - "grad_norm": 0.09180603921413422, - "learning_rate": 1.3240404992671823e-05, - "loss": 0.0004960776772350073, - "step": 9820 - }, - { - "epoch": 2.3018117478468128, - "grad_norm": 0.002785895951092243, - "learning_rate": 1.3147869232493698e-05, - "loss": 0.0003569391556084156, - "step": 9840 - }, - { - "epoch": 2.3064909997514147, - "grad_norm": 0.0052960398606956005, - "learning_rate": 1.305554245879079e-05, - "loss": 0.006173171475529671, - "step": 9860 - }, - { - "epoch": 2.3111702516560166, - "grad_norm": 0.011150095611810684, - "learning_rate": 1.296342629954439e-05, - "loss": 0.0003610009793192148, - "step": 9880 - }, - { - "epoch": 2.315849503560618, - "grad_norm": 0.005413333419710398, - "learning_rate": 1.2871522379022038e-05, - "loss": 0.00034034508280456066, - "step": 9900 - }, - { - "epoch": 2.32052875546522, - "grad_norm": 0.005048416554927826, - "learning_rate": 1.2779832317748933e-05, - "loss": 0.0004146155435591936, - "step": 9920 - }, - { - "epoch": 2.3252080073698216, - "grad_norm": 0.0036182717885822058, - "learning_rate": 1.2688357732479303e-05, - "loss": 0.0004476548172533512, - "step": 9940 - }, - { - "epoch": 2.3298872592744235, - "grad_norm": 0.003401585388928652, - "learning_rate": 1.2597100236167963e-05, - "loss": 0.010583753883838653, - "step": 9960 - }, - { - "epoch": 2.334566511179025, - "grad_norm": 0.007459722459316254, - "learning_rate": 1.2506061437941804e-05, - "loss": 0.0008749734610319137, - "step": 9980 - }, - { - "epoch": 2.339245763083627, - "grad_norm": 0.0029236485715955496, - "learning_rate": 1.241524294307147e-05, - "loss": 0.01898992955684662, - "step": 10000 - }, - { - "epoch": 2.343925014988229, - "grad_norm": 0.012698110193014145, - "learning_rate": 1.232464635294302e-05, - "loss": 0.00044077420607209207, - "step": 10020 - }, - { - "epoch": 2.3486042668928304, - "grad_norm": 0.004794010892510414, - "learning_rate": 1.2234273265029742e-05, - "loss": 0.0007495431229472161, - "step": 10040 - }, - { - "epoch": 2.3532835187974324, - "grad_norm": 0.0034349120687693357, - "learning_rate": 1.2144125272863905e-05, - "loss": 0.003268691524863243, - "step": 10060 - } - ], - "logging_steps": 20, - "max_steps": 14963, - "num_input_tokens_seen": 0, - "num_train_epochs": 4, - "save_steps": 1000000000, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 3.863966946718694e+16, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/slots/10/checkpoint-10075/training_args.bin b/slots/10/checkpoint-10075/training_args.bin deleted file mode 100644 index cba6bf44229020a6cf5d76cffc747dea705142ea..0000000000000000000000000000000000000000 --- a/slots/10/checkpoint-10075/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:66430bba102a8f3dc245713cd6268a99c212c508aacce1d8b9768464f5df26ec -size 5201 diff --git a/slots/10/latest.json b/slots/10/latest.json deleted file mode 100644 index 58c8693234f08ae04ea7c830deb060671aedc6b2..0000000000000000000000000000000000000000 --- a/slots/10/latest.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "slot:10", "checkpoint": "slots/10/checkpoint-10075", "step": 10075, "updated_at": 1776816260} diff --git a/slots/11/checkpoint-10018/config.json b/slots/11/checkpoint-10018/config.json deleted file mode 100644 index 9e5d8b7224eff16a790758ae86dd97c89afeab74..0000000000000000000000000000000000000000 --- a/slots/11/checkpoint-10018/config.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "architectures": [ - "TwinyForCausalLM" - ], - "attention_dropout": 0.0, - "dtype": "float32", - "hidden_dropout": 0.0, - "hidden_size": 768, - "initializer_range": 0.02, - "intermediate_size": 3072, - "max_position_embeddings": 128, - "model_type": "twiny", - "neftune_alpha": 0.0, - "num_attention_heads": 12, - "num_hidden_layers": 3, - "num_key_value_heads": 3, - "qk_norm": true, - "rezero_init": 1.0, - "rms_norm_eps": 1e-06, - "rope_theta": 10000.0, - "transformers_version": "5.0.0", - "use_cache": false, - "vocab_size": 32000 -} diff --git a/slots/11/checkpoint-10018/model.safetensors b/slots/11/checkpoint-10018/model.safetensors deleted file mode 100644 index f93d2b8c9a5ebd1790b1f42034696c2a3925a86c..0000000000000000000000000000000000000000 --- a/slots/11/checkpoint-10018/model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bc385e0ba2997d889c222255cc32afd608ff9a7becbc9b801a4ddf466b56965a -size 306388092 diff --git a/slots/11/checkpoint-10018/optimizer.pt b/slots/11/checkpoint-10018/optimizer.pt deleted file mode 100644 index 7824734de3dddf7d80b8575f4dd9841b8494671c..0000000000000000000000000000000000000000 --- a/slots/11/checkpoint-10018/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0cb5b5c3221bbb44ed59fc9d517d754c22206f83f715c4b3f5045db7b5dadace -size 302484555 diff --git a/slots/11/checkpoint-10018/rng_state.pth b/slots/11/checkpoint-10018/rng_state.pth deleted file mode 100644 index 1feba1a6538e93b94696d3773853dbc8947b0cad..0000000000000000000000000000000000000000 --- a/slots/11/checkpoint-10018/rng_state.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 -size 14645 diff --git a/slots/11/checkpoint-10018/scaler.pt b/slots/11/checkpoint-10018/scaler.pt deleted file mode 100644 index 95172fd5e950b27ba594eb4e175ebf4e2d80b927..0000000000000000000000000000000000000000 --- a/slots/11/checkpoint-10018/scaler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:93218919383530af8961c39d5c4b5f375fe2736cffa9fc9e0924598bb8b36350 -size 1383 diff --git a/slots/11/checkpoint-10018/scheduler.pt b/slots/11/checkpoint-10018/scheduler.pt deleted file mode 100644 index 5dacc271d1b0499ee2354e976dabf1a85f56345a..0000000000000000000000000000000000000000 --- a/slots/11/checkpoint-10018/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e311f2d14f1c14c28988e6e35b49c2006cef0f3031ae333e9760372424fc79da -size 1465 diff --git a/slots/11/checkpoint-10018/trainer_state.json b/slots/11/checkpoint-10018/trainer_state.json deleted file mode 100644 index e131ffe5d92723d3cd2c7782f7bd09bdd424d2e7..0000000000000000000000000000000000000000 --- a/slots/11/checkpoint-10018/trainer_state.json +++ /dev/null @@ -1,3541 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 2.3434579268426425, - "eval_steps": 500, - "global_step": 10018, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0001336931908386741, - "grad_norm": Infinity, - "learning_rate": 5e-05, - "loss": 129.80032348632812, - "step": 1 - }, - { - "epoch": 0.002673863816773482, - "grad_norm": 63.64365768432617, - "learning_rate": 4.999995392022967e-05, - "loss": 63.88374408922697, - "step": 20 - }, - { - "epoch": 0.005347727633546964, - "grad_norm": 24.627853393554688, - "learning_rate": 4.999976672145381e-05, - "loss": 12.65963363647461, - "step": 40 - }, - { - "epoch": 0.008021591450320446, - "grad_norm": 14.29983901977539, - "learning_rate": 4.999943552476422e-05, - "loss": 5.90204963684082, - "step": 60 - }, - { - "epoch": 0.010695455267093928, - "grad_norm": 15.690323829650879, - "learning_rate": 4.999896033206858e-05, - "loss": 3.9918922424316405, - "step": 80 - }, - { - "epoch": 0.01336931908386741, - "grad_norm": 31.583160400390625, - "learning_rate": 4.999834114610398e-05, - "loss": 2.9675426483154297, - "step": 100 - }, - { - "epoch": 0.01604318290064089, - "grad_norm": 13.034649848937988, - "learning_rate": 4.999757797043691e-05, - "loss": 2.725296401977539, - "step": 120 - }, - { - "epoch": 0.018717046717414372, - "grad_norm": 8.362203598022461, - "learning_rate": 4.999667080946324e-05, - "loss": 2.2478992462158205, - "step": 140 - }, - { - "epoch": 0.021390910534187856, - "grad_norm": 8.726786613464355, - "learning_rate": 4.999561966840821e-05, - "loss": 1.8447845458984375, - "step": 160 - }, - { - "epoch": 0.024064774350961337, - "grad_norm": 10.092752456665039, - "learning_rate": 4.9994424553326335e-05, - "loss": 1.5611843109130858, - "step": 180 - }, - { - "epoch": 0.02673863816773482, - "grad_norm": 9.090085983276367, - "learning_rate": 4.999308547110146e-05, - "loss": 1.520334815979004, - "step": 200 - }, - { - "epoch": 0.029412501984508302, - "grad_norm": 9.668124198913574, - "learning_rate": 4.999160242944665e-05, - "loss": 1.2818055152893066, - "step": 220 - }, - { - "epoch": 0.03208636580128178, - "grad_norm": 9.182533264160156, - "learning_rate": 4.998997543690418e-05, - "loss": 1.0428407669067383, - "step": 240 - }, - { - "epoch": 0.03476022961805526, - "grad_norm": 5.745838165283203, - "learning_rate": 4.998820450284549e-05, - "loss": 1.2343652725219727, - "step": 260 - }, - { - "epoch": 0.037434093434828744, - "grad_norm": 8.651643753051758, - "learning_rate": 4.99862896374711e-05, - "loss": 0.8859601020812988, - "step": 280 - }, - { - "epoch": 0.04010795725160223, - "grad_norm": 10.765266418457031, - "learning_rate": 4.998423085181056e-05, - "loss": 0.989600658416748, - "step": 300 - }, - { - "epoch": 0.04278182106837571, - "grad_norm": 6.092499256134033, - "learning_rate": 4.998202815772245e-05, - "loss": 0.7189463615417481, - "step": 320 - }, - { - "epoch": 0.04545568488514919, - "grad_norm": 6.352876663208008, - "learning_rate": 4.9979681567894195e-05, - "loss": 0.7489545345306396, - "step": 340 - }, - { - "epoch": 0.048129548701922674, - "grad_norm": 4.620656490325928, - "learning_rate": 4.997719109584209e-05, - "loss": 0.7381401538848877, - "step": 360 - }, - { - "epoch": 0.050803412518696155, - "grad_norm": 7.796917915344238, - "learning_rate": 4.997455675591119e-05, - "loss": 0.5687405109405518, - "step": 380 - }, - { - "epoch": 0.05347727633546964, - "grad_norm": 2.837172508239746, - "learning_rate": 4.9971778563275204e-05, - "loss": 0.5686865329742432, - "step": 400 - }, - { - "epoch": 0.05615114015224312, - "grad_norm": 3.3103690147399902, - "learning_rate": 4.9968856533936436e-05, - "loss": 0.625730562210083, - "step": 420 - }, - { - "epoch": 0.058825003969016604, - "grad_norm": 3.5682132244110107, - "learning_rate": 4.99657906847257e-05, - "loss": 0.6125466346740722, - "step": 440 - }, - { - "epoch": 0.061498867785790085, - "grad_norm": 5.63640832901001, - "learning_rate": 4.996258103330218e-05, - "loss": 0.6182214260101319, - "step": 460 - }, - { - "epoch": 0.06417273160256357, - "grad_norm": 4.698945999145508, - "learning_rate": 4.995922759815339e-05, - "loss": 0.43828091621398924, - "step": 480 - }, - { - "epoch": 0.06684659541933705, - "grad_norm": 2.1976189613342285, - "learning_rate": 4.995573039859501e-05, - "loss": 0.4459230899810791, - "step": 500 - }, - { - "epoch": 0.06952045923611053, - "grad_norm": 3.8809523582458496, - "learning_rate": 4.995208945477081e-05, - "loss": 0.3821882963180542, - "step": 520 - }, - { - "epoch": 0.07219432305288401, - "grad_norm": 3.75144100189209, - "learning_rate": 4.994830478765251e-05, - "loss": 0.5800807476043701, - "step": 540 - }, - { - "epoch": 0.07486818686965749, - "grad_norm": 3.0038585662841797, - "learning_rate": 4.9944376419039684e-05, - "loss": 0.3928264617919922, - "step": 560 - }, - { - "epoch": 0.07754205068643098, - "grad_norm": 3.614591598510742, - "learning_rate": 4.994030437155961e-05, - "loss": 0.48637890815734863, - "step": 580 - }, - { - "epoch": 0.08021591450320446, - "grad_norm": 4.143443584442139, - "learning_rate": 4.993608866866718e-05, - "loss": 0.3650153160095215, - "step": 600 - }, - { - "epoch": 0.08288977831997794, - "grad_norm": 6.692712783813477, - "learning_rate": 4.993172933464471e-05, - "loss": 0.3677916288375854, - "step": 620 - }, - { - "epoch": 0.08556364213675142, - "grad_norm": 8.383441925048828, - "learning_rate": 4.9927226394601815e-05, - "loss": 0.3399480104446411, - "step": 640 - }, - { - "epoch": 0.0882375059535249, - "grad_norm": 5.566338062286377, - "learning_rate": 4.992257987447532e-05, - "loss": 0.28104052543640134, - "step": 660 - }, - { - "epoch": 0.09091136977029839, - "grad_norm": 3.1196420192718506, - "learning_rate": 4.991778980102904e-05, - "loss": 0.351950478553772, - "step": 680 - }, - { - "epoch": 0.09358523358707187, - "grad_norm": 3.47979736328125, - "learning_rate": 4.9912856201853644e-05, - "loss": 0.27501535415649414, - "step": 700 - }, - { - "epoch": 0.09625909740384535, - "grad_norm": 5.446717262268066, - "learning_rate": 4.990777910536653e-05, - "loss": 0.2651593923568726, - "step": 720 - }, - { - "epoch": 0.09893296122061883, - "grad_norm": 7.6145339012146, - "learning_rate": 4.990255854081161e-05, - "loss": 0.35140380859375, - "step": 740 - }, - { - "epoch": 0.10160682503739231, - "grad_norm": 8.445616722106934, - "learning_rate": 4.989719453825918e-05, - "loss": 0.2961219072341919, - "step": 760 - }, - { - "epoch": 0.10428068885416579, - "grad_norm": 6.339537620544434, - "learning_rate": 4.9891687128605744e-05, - "loss": 0.24962289333343507, - "step": 780 - }, - { - "epoch": 0.10695455267093928, - "grad_norm": 3.3369436264038086, - "learning_rate": 4.988603634357383e-05, - "loss": 0.2124847412109375, - "step": 800 - }, - { - "epoch": 0.10962841648771277, - "grad_norm": 2.2909045219421387, - "learning_rate": 4.988024221571177e-05, - "loss": 0.24679112434387207, - "step": 820 - }, - { - "epoch": 0.11230228030448625, - "grad_norm": 3.1149911880493164, - "learning_rate": 4.9874304778393574e-05, - "loss": 0.22161397933959961, - "step": 840 - }, - { - "epoch": 0.11497614412125973, - "grad_norm": 14.802160263061523, - "learning_rate": 4.9868224065818706e-05, - "loss": 0.2623537302017212, - "step": 860 - }, - { - "epoch": 0.11765000793803321, - "grad_norm": 5.586325168609619, - "learning_rate": 4.98620001130119e-05, - "loss": 0.3560942649841309, - "step": 880 - }, - { - "epoch": 0.12032387175480669, - "grad_norm": 3.390017032623291, - "learning_rate": 4.9855632955822916e-05, - "loss": 0.16934787034988402, - "step": 900 - }, - { - "epoch": 0.12299773557158017, - "grad_norm": 6.070940971374512, - "learning_rate": 4.984912263092641e-05, - "loss": 0.2131197214126587, - "step": 920 - }, - { - "epoch": 0.12567159938835365, - "grad_norm": 1.4912281036376953, - "learning_rate": 4.984246917582166e-05, - "loss": 0.25128653049468996, - "step": 940 - }, - { - "epoch": 0.12834546320512713, - "grad_norm": 7.000472545623779, - "learning_rate": 4.9835672628832366e-05, - "loss": 0.2653592586517334, - "step": 960 - }, - { - "epoch": 0.1310193270219006, - "grad_norm": 5.427223205566406, - "learning_rate": 4.9828733029106434e-05, - "loss": 0.1653295636177063, - "step": 980 - }, - { - "epoch": 0.1336931908386741, - "grad_norm": 1.9502102136611938, - "learning_rate": 4.982165041661575e-05, - "loss": 0.2250870943069458, - "step": 1000 - }, - { - "epoch": 0.13636705465544757, - "grad_norm": 0.6216259598731995, - "learning_rate": 4.981442483215595e-05, - "loss": 0.18943849802017212, - "step": 1020 - }, - { - "epoch": 0.13904091847222105, - "grad_norm": 2.3363687992095947, - "learning_rate": 4.98070563173462e-05, - "loss": 0.1673592209815979, - "step": 1040 - }, - { - "epoch": 0.14171478228899453, - "grad_norm": 1.040717601776123, - "learning_rate": 4.979954491462892e-05, - "loss": 0.2113173007965088, - "step": 1060 - }, - { - "epoch": 0.14438864610576801, - "grad_norm": 2.735522747039795, - "learning_rate": 4.979189066726955e-05, - "loss": 0.17504971027374266, - "step": 1080 - }, - { - "epoch": 0.1470625099225415, - "grad_norm": 4.701151371002197, - "learning_rate": 4.978409361935636e-05, - "loss": 0.15881222486495972, - "step": 1100 - }, - { - "epoch": 0.14973637373931498, - "grad_norm": 2.735919237136841, - "learning_rate": 4.9776153815800075e-05, - "loss": 0.14044179916381835, - "step": 1120 - }, - { - "epoch": 0.15241023755608848, - "grad_norm": 3.5479538440704346, - "learning_rate": 4.976807130233375e-05, - "loss": 0.18565714359283447, - "step": 1140 - }, - { - "epoch": 0.15508410137286197, - "grad_norm": 3.2167458534240723, - "learning_rate": 4.975984612551243e-05, - "loss": 0.13236271142959594, - "step": 1160 - }, - { - "epoch": 0.15775796518963545, - "grad_norm": 1.0206760168075562, - "learning_rate": 4.975147833271288e-05, - "loss": 0.19124728441238403, - "step": 1180 - }, - { - "epoch": 0.16043182900640893, - "grad_norm": 4.194457530975342, - "learning_rate": 4.9742967972133335e-05, - "loss": 0.144741427898407, - "step": 1200 - }, - { - "epoch": 0.1631056928231824, - "grad_norm": 3.0225746631622314, - "learning_rate": 4.973431509279323e-05, - "loss": 0.1374324679374695, - "step": 1220 - }, - { - "epoch": 0.1657795566399559, - "grad_norm": 4.243523120880127, - "learning_rate": 4.972551974453287e-05, - "loss": 0.13663809299468993, - "step": 1240 - }, - { - "epoch": 0.16845342045672937, - "grad_norm": 2.4990086555480957, - "learning_rate": 4.971658197801322e-05, - "loss": 0.16817957162857056, - "step": 1260 - }, - { - "epoch": 0.17112728427350285, - "grad_norm": 4.983982563018799, - "learning_rate": 4.9707501844715554e-05, - "loss": 0.13795313835144044, - "step": 1280 - }, - { - "epoch": 0.17380114809027633, - "grad_norm": 3.6780316829681396, - "learning_rate": 4.969827939694115e-05, - "loss": 0.1637880802154541, - "step": 1300 - }, - { - "epoch": 0.1764750119070498, - "grad_norm": 0.7950732707977295, - "learning_rate": 4.968891468781105e-05, - "loss": 0.10979138612747193, - "step": 1320 - }, - { - "epoch": 0.1791488757238233, - "grad_norm": 1.2414121627807617, - "learning_rate": 4.967940777126569e-05, - "loss": 0.13692171573638917, - "step": 1340 - }, - { - "epoch": 0.18182273954059677, - "grad_norm": 2.1383633613586426, - "learning_rate": 4.9669758702064636e-05, - "loss": 0.07821698188781738, - "step": 1360 - }, - { - "epoch": 0.18449660335737025, - "grad_norm": 5.061275959014893, - "learning_rate": 4.965996753578623e-05, - "loss": 0.19053516387939454, - "step": 1380 - }, - { - "epoch": 0.18717046717414373, - "grad_norm": 6.151792049407959, - "learning_rate": 4.9650034328827305e-05, - "loss": 0.11360721588134766, - "step": 1400 - }, - { - "epoch": 0.18984433099091721, - "grad_norm": 1.0604305267333984, - "learning_rate": 4.963995913840284e-05, - "loss": 0.13138024806976317, - "step": 1420 - }, - { - "epoch": 0.1925181948076907, - "grad_norm": 1.7159489393234253, - "learning_rate": 4.9629742022545623e-05, - "loss": 0.08657677173614502, - "step": 1440 - }, - { - "epoch": 0.19519205862446418, - "grad_norm": 2.4207754135131836, - "learning_rate": 4.961938304010595e-05, - "loss": 0.10309149026870727, - "step": 1460 - }, - { - "epoch": 0.19786592244123766, - "grad_norm": 1.532060146331787, - "learning_rate": 4.9608882250751245e-05, - "loss": 0.13628544807434081, - "step": 1480 - }, - { - "epoch": 0.20053978625801114, - "grad_norm": 6.409943580627441, - "learning_rate": 4.959823971496574e-05, - "loss": 0.10584845542907714, - "step": 1500 - }, - { - "epoch": 0.20321365007478462, - "grad_norm": 2.452012538909912, - "learning_rate": 4.9587455494050136e-05, - "loss": 0.06506187915802002, - "step": 1520 - }, - { - "epoch": 0.2058875138915581, - "grad_norm": 5.3016533851623535, - "learning_rate": 4.9576529650121214e-05, - "loss": 0.11848526000976563, - "step": 1540 - }, - { - "epoch": 0.20856137770833158, - "grad_norm": 4.341775894165039, - "learning_rate": 4.956546224611152e-05, - "loss": 0.11318533420562744, - "step": 1560 - }, - { - "epoch": 0.21123524152510506, - "grad_norm": 1.9056169986724854, - "learning_rate": 4.9554253345768965e-05, - "loss": 0.12768398523330687, - "step": 1580 - }, - { - "epoch": 0.21390910534187857, - "grad_norm": 1.8939746618270874, - "learning_rate": 4.9542903013656486e-05, - "loss": 0.10782338380813598, - "step": 1600 - }, - { - "epoch": 0.21658296915865205, - "grad_norm": 8.53671932220459, - "learning_rate": 4.9531411315151654e-05, - "loss": 0.1733921766281128, - "step": 1620 - }, - { - "epoch": 0.21925683297542553, - "grad_norm": 2.0152978897094727, - "learning_rate": 4.951977831644632e-05, - "loss": 0.11197054386138916, - "step": 1640 - }, - { - "epoch": 0.221930696792199, - "grad_norm": 3.8422367572784424, - "learning_rate": 4.95080040845462e-05, - "loss": 0.11441781520843505, - "step": 1660 - }, - { - "epoch": 0.2246045606089725, - "grad_norm": 1.819858193397522, - "learning_rate": 4.949608868727053e-05, - "loss": 0.11403474807739258, - "step": 1680 - }, - { - "epoch": 0.22727842442574597, - "grad_norm": 7.45100212097168, - "learning_rate": 4.948403219325163e-05, - "loss": 0.13117753267288207, - "step": 1700 - }, - { - "epoch": 0.22995228824251945, - "grad_norm": 0.6526040434837341, - "learning_rate": 4.947183467193456e-05, - "loss": 0.07524924874305725, - "step": 1720 - }, - { - "epoch": 0.23262615205929293, - "grad_norm": 3.814746856689453, - "learning_rate": 4.945949619357668e-05, - "loss": 0.07659345269203185, - "step": 1740 - }, - { - "epoch": 0.23530001587606642, - "grad_norm": 2.373124122619629, - "learning_rate": 4.944701682924726e-05, - "loss": 0.1147496223449707, - "step": 1760 - }, - { - "epoch": 0.2379738796928399, - "grad_norm": 0.11161285638809204, - "learning_rate": 4.943439665082707e-05, - "loss": 0.07256829738616943, - "step": 1780 - }, - { - "epoch": 0.24064774350961338, - "grad_norm": 0.45990192890167236, - "learning_rate": 4.942163573100794e-05, - "loss": 0.07726740837097168, - "step": 1800 - }, - { - "epoch": 0.24332160732638686, - "grad_norm": 4.2301926612854, - "learning_rate": 4.940873414329242e-05, - "loss": 0.09349535703659058, - "step": 1820 - }, - { - "epoch": 0.24599547114316034, - "grad_norm": 2.442178726196289, - "learning_rate": 4.939569196199325e-05, - "loss": 0.12413722276687622, - "step": 1840 - }, - { - "epoch": 0.24866933495993382, - "grad_norm": 2.523683786392212, - "learning_rate": 4.938250926223302e-05, - "loss": 0.08566288352012634, - "step": 1860 - }, - { - "epoch": 0.2513431987767073, - "grad_norm": 3.511075258255005, - "learning_rate": 4.936918611994368e-05, - "loss": 0.08007702231407166, - "step": 1880 - }, - { - "epoch": 0.2540170625934808, - "grad_norm": 6.254627704620361, - "learning_rate": 4.935572261186614e-05, - "loss": 0.10983954668045044, - "step": 1900 - }, - { - "epoch": 0.25669092641025426, - "grad_norm": 1.5211899280548096, - "learning_rate": 4.934211881554981e-05, - "loss": 0.09120344519615173, - "step": 1920 - }, - { - "epoch": 0.25936479022702774, - "grad_norm": 2.5893588066101074, - "learning_rate": 4.932837480935214e-05, - "loss": 0.08754412531852722, - "step": 1940 - }, - { - "epoch": 0.2620386540438012, - "grad_norm": 6.878556251525879, - "learning_rate": 4.931449067243821e-05, - "loss": 0.08636274933815002, - "step": 1960 - }, - { - "epoch": 0.2647125178605747, - "grad_norm": 2.9078798294067383, - "learning_rate": 4.9300466484780226e-05, - "loss": 0.09582929015159607, - "step": 1980 - }, - { - "epoch": 0.2673863816773482, - "grad_norm": 3.391852855682373, - "learning_rate": 4.92863023271571e-05, - "loss": 0.0850919783115387, - "step": 2000 - }, - { - "epoch": 0.27006024549412166, - "grad_norm": 5.522103309631348, - "learning_rate": 4.927199828115395e-05, - "loss": 0.050999772548675534, - "step": 2020 - }, - { - "epoch": 0.27273410931089515, - "grad_norm": 0.90350741147995, - "learning_rate": 4.925755442916167e-05, - "loss": 0.10100446939468384, - "step": 2040 - }, - { - "epoch": 0.2754079731276686, - "grad_norm": 1.602030634880066, - "learning_rate": 4.924297085437641e-05, - "loss": 0.0468633770942688, - "step": 2060 - }, - { - "epoch": 0.2780818369444421, - "grad_norm": 1.5823460817337036, - "learning_rate": 4.922824764079913e-05, - "loss": 0.06786358952522278, - "step": 2080 - }, - { - "epoch": 0.2807557007612156, - "grad_norm": 1.6624343395233154, - "learning_rate": 4.92133848732351e-05, - "loss": 0.05772828459739685, - "step": 2100 - }, - { - "epoch": 0.28342956457798907, - "grad_norm": 0.947078287601471, - "learning_rate": 4.9198382637293424e-05, - "loss": 0.08012173175811768, - "step": 2120 - }, - { - "epoch": 0.28610342839476255, - "grad_norm": 0.2919924259185791, - "learning_rate": 4.918324101938653e-05, - "loss": 0.1208539366722107, - "step": 2140 - }, - { - "epoch": 0.28877729221153603, - "grad_norm": 9.258247375488281, - "learning_rate": 4.916796010672969e-05, - "loss": 0.10037034749984741, - "step": 2160 - }, - { - "epoch": 0.2914511560283095, - "grad_norm": 4.0920491218566895, - "learning_rate": 4.915253998734051e-05, - "loss": 0.061488878726959226, - "step": 2180 - }, - { - "epoch": 0.294125019845083, - "grad_norm": 6.1126627922058105, - "learning_rate": 4.913698075003841e-05, - "loss": 0.0862967312335968, - "step": 2200 - }, - { - "epoch": 0.29679888366185647, - "grad_norm": 2.585484743118286, - "learning_rate": 4.912128248444414e-05, - "loss": 0.05393874645233154, - "step": 2220 - }, - { - "epoch": 0.29947274747862995, - "grad_norm": 6.944481372833252, - "learning_rate": 4.9105445280979256e-05, - "loss": 0.08570566773414612, - "step": 2240 - }, - { - "epoch": 0.30214661129540343, - "grad_norm": 1.3824089765548706, - "learning_rate": 4.908946923086556e-05, - "loss": 0.09689127206802368, - "step": 2260 - }, - { - "epoch": 0.30482047511217697, - "grad_norm": 3.4861342906951904, - "learning_rate": 4.907335442612464e-05, - "loss": 0.12550976276397705, - "step": 2280 - }, - { - "epoch": 0.30749433892895045, - "grad_norm": 3.668980121612549, - "learning_rate": 4.905710095957728e-05, - "loss": 0.09089353680610657, - "step": 2300 - }, - { - "epoch": 0.31016820274572393, - "grad_norm": 1.093095064163208, - "learning_rate": 4.904070892484298e-05, - "loss": 0.03925192356109619, - "step": 2320 - }, - { - "epoch": 0.3128420665624974, - "grad_norm": 0.8169485926628113, - "learning_rate": 4.9024178416339364e-05, - "loss": 0.0979581356048584, - "step": 2340 - }, - { - "epoch": 0.3155159303792709, - "grad_norm": 1.892451286315918, - "learning_rate": 4.900750952928166e-05, - "loss": 0.05913209915161133, - "step": 2360 - }, - { - "epoch": 0.3181897941960444, - "grad_norm": 0.24644255638122559, - "learning_rate": 4.8990702359682184e-05, - "loss": 0.06815173625946044, - "step": 2380 - }, - { - "epoch": 0.32086365801281785, - "grad_norm": 2.1861305236816406, - "learning_rate": 4.897375700434972e-05, - "loss": 0.04142785966396332, - "step": 2400 - }, - { - "epoch": 0.32353752182959133, - "grad_norm": 2.6643004417419434, - "learning_rate": 4.8956673560889013e-05, - "loss": 0.05177200436592102, - "step": 2420 - }, - { - "epoch": 0.3262113856463648, - "grad_norm": 2.588113784790039, - "learning_rate": 4.8939452127700195e-05, - "loss": 0.05783546566963196, - "step": 2440 - }, - { - "epoch": 0.3288852494631383, - "grad_norm": 2.419644594192505, - "learning_rate": 4.8922092803978203e-05, - "loss": 0.08906854391098022, - "step": 2460 - }, - { - "epoch": 0.3315591132799118, - "grad_norm": 0.16949939727783203, - "learning_rate": 4.890459568971223e-05, - "loss": 0.10305211544036866, - "step": 2480 - }, - { - "epoch": 0.33423297709668526, - "grad_norm": 0.10032984614372253, - "learning_rate": 4.8886960885685126e-05, - "loss": 0.06348527669906616, - "step": 2500 - }, - { - "epoch": 0.33690684091345874, - "grad_norm": 3.3658738136291504, - "learning_rate": 4.8869188493472854e-05, - "loss": 0.06826075911521912, - "step": 2520 - }, - { - "epoch": 0.3395807047302322, - "grad_norm": 0.8656186461448669, - "learning_rate": 4.885127861544386e-05, - "loss": 0.05929765701293945, - "step": 2540 - }, - { - "epoch": 0.3422545685470057, - "grad_norm": 0.1492065042257309, - "learning_rate": 4.8833231354758496e-05, - "loss": 0.09429731965065002, - "step": 2560 - }, - { - "epoch": 0.3449284323637792, - "grad_norm": 0.6010928153991699, - "learning_rate": 4.881504681536846e-05, - "loss": 0.06262240409851075, - "step": 2580 - }, - { - "epoch": 0.34760229618055266, - "grad_norm": 1.6506450176239014, - "learning_rate": 4.879672510201616e-05, - "loss": 0.061688083410263064, - "step": 2600 - }, - { - "epoch": 0.35027615999732614, - "grad_norm": 0.2703142464160919, - "learning_rate": 4.877826632023412e-05, - "loss": 0.06175137162208557, - "step": 2620 - }, - { - "epoch": 0.3529500238140996, - "grad_norm": 3.1056365966796875, - "learning_rate": 4.875967057634437e-05, - "loss": 0.07828506827354431, - "step": 2640 - }, - { - "epoch": 0.3556238876308731, - "grad_norm": 0.28790283203125, - "learning_rate": 4.874093797745784e-05, - "loss": 0.11355981826782227, - "step": 2660 - }, - { - "epoch": 0.3582977514476466, - "grad_norm": 2.3372068405151367, - "learning_rate": 4.8722068631473746e-05, - "loss": 0.048267871141433716, - "step": 2680 - }, - { - "epoch": 0.36097161526442006, - "grad_norm": 0.12767371535301208, - "learning_rate": 4.8703062647078976e-05, - "loss": 0.04319801032543182, - "step": 2700 - }, - { - "epoch": 0.36364547908119355, - "grad_norm": 0.5145738124847412, - "learning_rate": 4.868392013374741e-05, - "loss": 0.0773090660572052, - "step": 2720 - }, - { - "epoch": 0.366319342897967, - "grad_norm": 0.8518500328063965, - "learning_rate": 4.866464120173937e-05, - "loss": 0.05149460434913635, - "step": 2740 - }, - { - "epoch": 0.3689932067147405, - "grad_norm": 3.6726584434509277, - "learning_rate": 4.8645225962100924e-05, - "loss": 0.06896821856498718, - "step": 2760 - }, - { - "epoch": 0.371667070531514, - "grad_norm": 1.5626497268676758, - "learning_rate": 4.862567452666329e-05, - "loss": 0.047730174660682675, - "step": 2780 - }, - { - "epoch": 0.37434093434828747, - "grad_norm": 6.562028884887695, - "learning_rate": 4.8605987008042144e-05, - "loss": 0.07060698866844177, - "step": 2800 - }, - { - "epoch": 0.37701479816506095, - "grad_norm": 0.7631726861000061, - "learning_rate": 4.8586163519637005e-05, - "loss": 0.04944324493408203, - "step": 2820 - }, - { - "epoch": 0.37968866198183443, - "grad_norm": 1.6982293128967285, - "learning_rate": 4.8566204175630595e-05, - "loss": 0.03000348210334778, - "step": 2840 - }, - { - "epoch": 0.3823625257986079, - "grad_norm": 0.6487429141998291, - "learning_rate": 4.854610909098812e-05, - "loss": 0.06691416501998901, - "step": 2860 - }, - { - "epoch": 0.3850363896153814, - "grad_norm": 0.7648892402648926, - "learning_rate": 4.852587838145668e-05, - "loss": 0.05529783964157105, - "step": 2880 - }, - { - "epoch": 0.38771025343215487, - "grad_norm": 0.11601298302412033, - "learning_rate": 4.850551216356457e-05, - "loss": 0.07780832052230835, - "step": 2900 - }, - { - "epoch": 0.39038411724892835, - "grad_norm": 0.9443137645721436, - "learning_rate": 4.8485010554620594e-05, - "loss": 0.08007023930549621, - "step": 2920 - }, - { - "epoch": 0.39305798106570183, - "grad_norm": 0.8828252553939819, - "learning_rate": 4.846437367271341e-05, - "loss": 0.03541453182697296, - "step": 2940 - }, - { - "epoch": 0.3957318448824753, - "grad_norm": 0.21668888628482819, - "learning_rate": 4.844360163671083e-05, - "loss": 0.08354364633560181, - "step": 2960 - }, - { - "epoch": 0.3984057086992488, - "grad_norm": 0.6840483546257019, - "learning_rate": 4.8422694566259194e-05, - "loss": 0.045807772874832155, - "step": 2980 - }, - { - "epoch": 0.4010795725160223, - "grad_norm": 1.2754698991775513, - "learning_rate": 4.8401652581782584e-05, - "loss": 0.053487342596054074, - "step": 3000 - }, - { - "epoch": 0.40375343633279576, - "grad_norm": 0.19012756645679474, - "learning_rate": 4.838047580448222e-05, - "loss": 0.05881953239440918, - "step": 3020 - }, - { - "epoch": 0.40642730014956924, - "grad_norm": 2.1057698726654053, - "learning_rate": 4.835916435633569e-05, - "loss": 0.031065690517425536, - "step": 3040 - }, - { - "epoch": 0.4091011639663427, - "grad_norm": 4.188559055328369, - "learning_rate": 4.833771836009633e-05, - "loss": 0.07205432653427124, - "step": 3060 - }, - { - "epoch": 0.4117750277831162, - "grad_norm": 6.975829124450684, - "learning_rate": 4.831613793929242e-05, - "loss": 0.04953635036945343, - "step": 3080 - }, - { - "epoch": 0.4144488915998897, - "grad_norm": 4.725269317626953, - "learning_rate": 4.8294423218226546e-05, - "loss": 0.05965519547462463, - "step": 3100 - }, - { - "epoch": 0.41712275541666316, - "grad_norm": 1.7124755382537842, - "learning_rate": 4.827257432197486e-05, - "loss": 0.039625433087348935, - "step": 3120 - }, - { - "epoch": 0.41979661923343664, - "grad_norm": 2.6687324047088623, - "learning_rate": 4.825059137638636e-05, - "loss": 0.05020809769630432, - "step": 3140 - }, - { - "epoch": 0.4224704830502101, - "grad_norm": 1.111640214920044, - "learning_rate": 4.822847450808215e-05, - "loss": 0.04404452443122864, - "step": 3160 - }, - { - "epoch": 0.42514434686698366, - "grad_norm": 0.2128070890903473, - "learning_rate": 4.8206223844454744e-05, - "loss": 0.08283355236053466, - "step": 3180 - }, - { - "epoch": 0.42781821068375714, - "grad_norm": 0.10757248103618622, - "learning_rate": 4.818383951366729e-05, - "loss": 0.08568671345710754, - "step": 3200 - }, - { - "epoch": 0.4304920745005306, - "grad_norm": 0.08344592899084091, - "learning_rate": 4.816132164465289e-05, - "loss": 0.0426956832408905, - "step": 3220 - }, - { - "epoch": 0.4331659383173041, - "grad_norm": 0.5657751560211182, - "learning_rate": 4.813867036711378e-05, - "loss": 0.04971776902675629, - "step": 3240 - }, - { - "epoch": 0.4358398021340776, - "grad_norm": 2.1529288291931152, - "learning_rate": 4.8115885811520654e-05, - "loss": 0.025386181473731995, - "step": 3260 - }, - { - "epoch": 0.43851366595085106, - "grad_norm": 4.228519916534424, - "learning_rate": 4.809296810911188e-05, - "loss": 0.06401395201683044, - "step": 3280 - }, - { - "epoch": 0.44118752976762454, - "grad_norm": 6.770420551300049, - "learning_rate": 4.806991739189274e-05, - "loss": 0.16425553560256959, - "step": 3300 - }, - { - "epoch": 0.443861393584398, - "grad_norm": 0.5303187370300293, - "learning_rate": 4.804673379263467e-05, - "loss": 0.045900467038154605, - "step": 3320 - }, - { - "epoch": 0.4465352574011715, - "grad_norm": 0.221473827958107, - "learning_rate": 4.802341744487453e-05, - "loss": 0.07529735565185547, - "step": 3340 - }, - { - "epoch": 0.449209121217945, - "grad_norm": 3.48736834526062, - "learning_rate": 4.799996848291378e-05, - "loss": 0.062433135509490964, - "step": 3360 - }, - { - "epoch": 0.45188298503471847, - "grad_norm": 2.650038242340088, - "learning_rate": 4.797638704181774e-05, - "loss": 0.03762982189655304, - "step": 3380 - }, - { - "epoch": 0.45455684885149195, - "grad_norm": 3.159665584564209, - "learning_rate": 4.795267325741483e-05, - "loss": 0.04745924174785614, - "step": 3400 - }, - { - "epoch": 0.4572307126682654, - "grad_norm": 0.8763885498046875, - "learning_rate": 4.7928827266295715e-05, - "loss": 0.07380253076553345, - "step": 3420 - }, - { - "epoch": 0.4599045764850389, - "grad_norm": 0.1779366433620453, - "learning_rate": 4.790484920581262e-05, - "loss": 0.045916372537612916, - "step": 3440 - }, - { - "epoch": 0.4625784403018124, - "grad_norm": 1.1228729486465454, - "learning_rate": 4.7880739214078454e-05, - "loss": 0.04461723566055298, - "step": 3460 - }, - { - "epoch": 0.46525230411858587, - "grad_norm": 0.1629919707775116, - "learning_rate": 4.785649742996605e-05, - "loss": 0.017159442603588104, - "step": 3480 - }, - { - "epoch": 0.46792616793535935, - "grad_norm": 3.583951473236084, - "learning_rate": 4.783212399310737e-05, - "loss": 0.047145146131515506, - "step": 3500 - }, - { - "epoch": 0.47060003175213283, - "grad_norm": 0.9766237139701843, - "learning_rate": 4.780761904389267e-05, - "loss": 0.050229442119598386, - "step": 3520 - }, - { - "epoch": 0.4732738955689063, - "grad_norm": 0.05617872253060341, - "learning_rate": 4.778298272346976e-05, - "loss": 0.024862812459468843, - "step": 3540 - }, - { - "epoch": 0.4759477593856798, - "grad_norm": 1.3586453199386597, - "learning_rate": 4.775821517374308e-05, - "loss": 0.02117772251367569, - "step": 3560 - }, - { - "epoch": 0.4786216232024533, - "grad_norm": 1.2116742134094238, - "learning_rate": 4.7733316537373006e-05, - "loss": 0.03060794174671173, - "step": 3580 - }, - { - "epoch": 0.48129548701922675, - "grad_norm": 0.39403238892555237, - "learning_rate": 4.770828695777493e-05, - "loss": 0.05482668280601501, - "step": 3600 - }, - { - "epoch": 0.48396935083600023, - "grad_norm": 0.9248486161231995, - "learning_rate": 4.7683126579118495e-05, - "loss": 0.03612814247608185, - "step": 3620 - }, - { - "epoch": 0.4866432146527737, - "grad_norm": 0.1624649167060852, - "learning_rate": 4.7657835546326736e-05, - "loss": 0.04334873259067536, - "step": 3640 - }, - { - "epoch": 0.4893170784695472, - "grad_norm": 0.5321119427680969, - "learning_rate": 4.763241400507524e-05, - "loss": 0.0461233913898468, - "step": 3660 - }, - { - "epoch": 0.4919909422863207, - "grad_norm": 0.34861093759536743, - "learning_rate": 4.760686210179133e-05, - "loss": 0.024829554557800292, - "step": 3680 - }, - { - "epoch": 0.49466480610309416, - "grad_norm": 1.2561241388320923, - "learning_rate": 4.758117998365322e-05, - "loss": 0.03157005608081818, - "step": 3700 - }, - { - "epoch": 0.49733866991986764, - "grad_norm": 0.8691341280937195, - "learning_rate": 4.7555367798589146e-05, - "loss": 0.04310203492641449, - "step": 3720 - }, - { - "epoch": 0.5000125337366411, - "grad_norm": 0.3134572505950928, - "learning_rate": 4.752942569527653e-05, - "loss": 0.03796039223670959, - "step": 3740 - }, - { - "epoch": 0.5026863975534146, - "grad_norm": 2.3359289169311523, - "learning_rate": 4.75033538231411e-05, - "loss": 0.055599170923233035, - "step": 3760 - }, - { - "epoch": 0.5053602613701881, - "grad_norm": 7.426175594329834, - "learning_rate": 4.747715233235608e-05, - "loss": 0.054436272382736205, - "step": 3780 - }, - { - "epoch": 0.5080341251869616, - "grad_norm": 0.5940203070640564, - "learning_rate": 4.745082137384128e-05, - "loss": 0.03682814538478851, - "step": 3800 - }, - { - "epoch": 0.510707989003735, - "grad_norm": 0.22821389138698578, - "learning_rate": 4.7424361099262225e-05, - "loss": 0.051123309135437014, - "step": 3820 - }, - { - "epoch": 0.5133818528205085, - "grad_norm": 8.20633602142334, - "learning_rate": 4.739777166102932e-05, - "loss": 0.0704378604888916, - "step": 3840 - }, - { - "epoch": 0.516055716637282, - "grad_norm": 3.023848533630371, - "learning_rate": 4.737105321229694e-05, - "loss": 0.03368058800697327, - "step": 3860 - }, - { - "epoch": 0.5187295804540555, - "grad_norm": 0.07666649669408798, - "learning_rate": 4.7344205906962555e-05, - "loss": 0.03665303289890289, - "step": 3880 - }, - { - "epoch": 0.521403444270829, - "grad_norm": 0.7571629881858826, - "learning_rate": 4.731722989966585e-05, - "loss": 0.058415502309799194, - "step": 3900 - }, - { - "epoch": 0.5240773080876024, - "grad_norm": 3.2599120140075684, - "learning_rate": 4.7290125345787816e-05, - "loss": 0.07323018908500671, - "step": 3920 - }, - { - "epoch": 0.5267511719043759, - "grad_norm": 0.28930988907814026, - "learning_rate": 4.7262892401449886e-05, - "loss": 0.054371267557144165, - "step": 3940 - }, - { - "epoch": 0.5294250357211494, - "grad_norm": 2.2296454906463623, - "learning_rate": 4.7235531223513004e-05, - "loss": 0.040819621086120604, - "step": 3960 - }, - { - "epoch": 0.5320988995379229, - "grad_norm": 0.11608211696147919, - "learning_rate": 4.720804196957675e-05, - "loss": 0.05215579271316528, - "step": 3980 - }, - { - "epoch": 0.5347727633546964, - "grad_norm": 1.1587547063827515, - "learning_rate": 4.7180424797978415e-05, - "loss": 0.026277875900268553, - "step": 4000 - }, - { - "epoch": 0.5374466271714698, - "grad_norm": 0.06253435462713242, - "learning_rate": 4.7152679867792074e-05, - "loss": 0.02574407756328583, - "step": 4020 - }, - { - "epoch": 0.5401204909882433, - "grad_norm": 1.3441458940505981, - "learning_rate": 4.71248073388277e-05, - "loss": 0.05538107752799988, - "step": 4040 - }, - { - "epoch": 0.5427943548050168, - "grad_norm": 0.48076340556144714, - "learning_rate": 4.7096807371630236e-05, - "loss": 0.047986540198326114, - "step": 4060 - }, - { - "epoch": 0.5454682186217903, - "grad_norm": 0.5924936532974243, - "learning_rate": 4.706868012747867e-05, - "loss": 0.05463914275169372, - "step": 4080 - }, - { - "epoch": 0.7673995566395854, - "grad_norm": 0.05143728107213974, - "learning_rate": 4.431151627307268e-05, - "loss": 0.00959376593430837, - "step": 4100 - }, - { - "epoch": 0.771142969110998, - "grad_norm": 1.2308074235916138, - "learning_rate": 4.425806509248848e-05, - "loss": 0.002745623141527176, - "step": 4120 - }, - { - "epoch": 0.7748863815824106, - "grad_norm": 2.080223798751831, - "learning_rate": 4.420439652052499e-05, - "loss": 0.012390998750925064, - "step": 4140 - }, - { - "epoch": 0.7786297940538233, - "grad_norm": 0.049312230199575424, - "learning_rate": 4.415051116301072e-05, - "loss": 0.004607534408569336, - "step": 4160 - }, - { - "epoch": 0.7823732065252359, - "grad_norm": 0.07747476547956467, - "learning_rate": 4.409640962822132e-05, - "loss": 0.034441503882408145, - "step": 4180 - }, - { - "epoch": 0.7861166189966485, - "grad_norm": 0.021327875554561615, - "learning_rate": 4.404209252687275e-05, - "loss": 0.009768449515104295, - "step": 4200 - }, - { - "epoch": 0.789860031468061, - "grad_norm": 2.406580924987793, - "learning_rate": 4.398756047211431e-05, - "loss": 0.005304037779569626, - "step": 4220 - }, - { - "epoch": 0.7936034439394737, - "grad_norm": 0.027869906276464462, - "learning_rate": 4.39328140795218e-05, - "loss": 0.00896073654294014, - "step": 4240 - }, - { - "epoch": 0.7973468564108863, - "grad_norm": 0.09702044725418091, - "learning_rate": 4.387785396709052e-05, - "loss": 0.0117533378303051, - "step": 4260 - }, - { - "epoch": 0.801090268882299, - "grad_norm": 0.529065728187561, - "learning_rate": 4.382268075522831e-05, - "loss": 0.0037526611238718035, - "step": 4280 - }, - { - "epoch": 0.8048336813537116, - "grad_norm": 0.015109462663531303, - "learning_rate": 4.3767295066748564e-05, - "loss": 0.0025708725675940513, - "step": 4300 - }, - { - "epoch": 0.8085770938251241, - "grad_norm": 0.7257627248764038, - "learning_rate": 4.371169752686316e-05, - "loss": 0.006234285607933998, - "step": 4320 - }, - { - "epoch": 0.8123205062965367, - "grad_norm": 0.016853008419275284, - "learning_rate": 4.3655888763175436e-05, - "loss": 0.0023587727919220924, - "step": 4340 - }, - { - "epoch": 0.8160639187679494, - "grad_norm": 0.017816167324781418, - "learning_rate": 4.3599869405673085e-05, - "loss": 0.0012389549054205417, - "step": 4360 - }, - { - "epoch": 0.819807331239362, - "grad_norm": 0.014672616496682167, - "learning_rate": 4.354364008672106e-05, - "loss": 0.002244691364467144, - "step": 4380 - }, - { - "epoch": 0.8235507437107746, - "grad_norm": 0.044869400560855865, - "learning_rate": 4.3487201441054435e-05, - "loss": 0.007713723182678223, - "step": 4400 - }, - { - "epoch": 0.8272941561821872, - "grad_norm": 0.06367291510105133, - "learning_rate": 4.343055410577122e-05, - "loss": 0.005743256583809852, - "step": 4420 - }, - { - "epoch": 0.8310375686535998, - "grad_norm": 0.1354215145111084, - "learning_rate": 4.3373698720325176e-05, - "loss": 0.009635470807552338, - "step": 4440 - }, - { - "epoch": 0.8347809811250124, - "grad_norm": 0.9089844822883606, - "learning_rate": 4.331663592651862e-05, - "loss": 0.01007603257894516, - "step": 4460 - }, - { - "epoch": 0.838524393596425, - "grad_norm": 0.025831619277596474, - "learning_rate": 4.3259366368495167e-05, - "loss": 0.006179215386509895, - "step": 4480 - }, - { - "epoch": 0.8422678060678377, - "grad_norm": 0.016653764992952347, - "learning_rate": 4.320189069273243e-05, - "loss": 0.0025156451389193534, - "step": 4500 - }, - { - "epoch": 0.8460112185392502, - "grad_norm": 0.27361780405044556, - "learning_rate": 4.3144209548034766e-05, - "loss": 0.002235286869108677, - "step": 4520 - }, - { - "epoch": 0.8497546310106628, - "grad_norm": 2.6958701610565186, - "learning_rate": 4.3086323585525915e-05, - "loss": 0.03571180701255798, - "step": 4540 - }, - { - "epoch": 0.8534980434820755, - "grad_norm": 0.1260778158903122, - "learning_rate": 4.3028233458641696e-05, - "loss": 0.0036518506705760954, - "step": 4560 - }, - { - "epoch": 0.8572414559534881, - "grad_norm": 0.2445528209209442, - "learning_rate": 4.2969939823122586e-05, - "loss": 0.024949796497821808, - "step": 4580 - }, - { - "epoch": 0.8609848684249007, - "grad_norm": 0.1674242913722992, - "learning_rate": 4.291144333700633e-05, - "loss": 0.002089798077940941, - "step": 4600 - }, - { - "epoch": 0.8647282808963134, - "grad_norm": 0.05161884427070618, - "learning_rate": 4.2852744660620515e-05, - "loss": 0.007847145944833756, - "step": 4620 - }, - { - "epoch": 0.8684716933677259, - "grad_norm": 0.019796324893832207, - "learning_rate": 4.279384445657514e-05, - "loss": 0.0023555334657430647, - "step": 4640 - }, - { - "epoch": 0.8722151058391385, - "grad_norm": 0.0647754967212677, - "learning_rate": 4.2734743389755096e-05, - "loss": 0.009586349129676819, - "step": 4660 - }, - { - "epoch": 0.8759585183105512, - "grad_norm": 0.015243460424244404, - "learning_rate": 4.267544212731268e-05, - "loss": 0.017788709700107576, - "step": 4680 - }, - { - "epoch": 0.8797019307819638, - "grad_norm": 0.05756703019142151, - "learning_rate": 4.261594133866007e-05, - "loss": 0.014256520569324494, - "step": 4700 - }, - { - "epoch": 0.8834453432533764, - "grad_norm": 0.2002931535243988, - "learning_rate": 4.255624169546175e-05, - "loss": 0.0014025470241904258, - "step": 4720 - }, - { - "epoch": 0.887188755724789, - "grad_norm": 0.04325389489531517, - "learning_rate": 4.249634387162696e-05, - "loss": 0.010552891343832017, - "step": 4740 - }, - { - "epoch": 0.8909321681962016, - "grad_norm": 0.8975178599357605, - "learning_rate": 4.243624854330206e-05, - "loss": 0.0032475266605615618, - "step": 4760 - }, - { - "epoch": 0.8946755806676142, - "grad_norm": 0.01541830413043499, - "learning_rate": 4.237595638886288e-05, - "loss": 0.003157203644514084, - "step": 4780 - }, - { - "epoch": 0.8984189931390268, - "grad_norm": 1.673305869102478, - "learning_rate": 4.231546808890713e-05, - "loss": 0.0028239911422133445, - "step": 4800 - }, - { - "epoch": 0.9021624056104395, - "grad_norm": 0.021689629182219505, - "learning_rate": 4.225478432624665e-05, - "loss": 0.0026885712519288062, - "step": 4820 - }, - { - "epoch": 0.905905818081852, - "grad_norm": 0.019590798765420914, - "learning_rate": 4.219390578589973e-05, - "loss": 0.00780024379491806, - "step": 4840 - }, - { - "epoch": 0.9096492305532646, - "grad_norm": 0.024581020697951317, - "learning_rate": 4.213283315508337e-05, - "loss": 0.006697511672973633, - "step": 4860 - }, - { - "epoch": 0.9133926430246773, - "grad_norm": 0.20615583658218384, - "learning_rate": 4.207156712320555e-05, - "loss": 0.007314208894968033, - "step": 4880 - }, - { - "epoch": 0.9171360554960899, - "grad_norm": 0.015673745423555374, - "learning_rate": 4.20101083818574e-05, - "loss": 0.004841562733054161, - "step": 4900 - }, - { - "epoch": 0.9208794679675025, - "grad_norm": 0.008306623436510563, - "learning_rate": 4.194845762480544e-05, - "loss": 0.0010150263085961341, - "step": 4920 - }, - { - "epoch": 0.9246228804389152, - "grad_norm": 0.051861703395843506, - "learning_rate": 4.188661554798369e-05, - "loss": 0.011043114960193634, - "step": 4940 - }, - { - "epoch": 0.9283662929103277, - "grad_norm": 1.7019767761230469, - "learning_rate": 4.1824582849485884e-05, - "loss": 0.004985674470663071, - "step": 4960 - }, - { - "epoch": 0.9321097053817403, - "grad_norm": 0.021240154281258583, - "learning_rate": 4.176236022955755e-05, - "loss": 0.04885836541652679, - "step": 4980 - }, - { - "epoch": 0.935853117853153, - "grad_norm": 0.016504865139722824, - "learning_rate": 4.16999483905881e-05, - "loss": 0.0027378931641578673, - "step": 5000 - }, - { - "epoch": 0.9395965303245656, - "grad_norm": 0.014015628024935722, - "learning_rate": 4.163734803710294e-05, - "loss": 0.012781022489070893, - "step": 5020 - }, - { - "epoch": 0.9433399427959782, - "grad_norm": 0.013812500052154064, - "learning_rate": 4.157455987575545e-05, - "loss": 0.007508871704339981, - "step": 5040 - }, - { - "epoch": 0.9470833552673907, - "grad_norm": 0.01622290164232254, - "learning_rate": 4.1511584615319075e-05, - "loss": 0.0014614147134125234, - "step": 5060 - }, - { - "epoch": 0.9508267677388034, - "grad_norm": 0.01259149145334959, - "learning_rate": 4.144842296667929e-05, - "loss": 0.006202424317598343, - "step": 5080 - }, - { - "epoch": 0.954570180210216, - "grad_norm": 0.012383027002215385, - "learning_rate": 4.138507564282558e-05, - "loss": 0.006122353300452232, - "step": 5100 - }, - { - "epoch": 0.9583135926816286, - "grad_norm": 0.006499920971691608, - "learning_rate": 4.1321543358843385e-05, - "loss": 0.0008865024894475937, - "step": 5120 - }, - { - "epoch": 0.9620570051530413, - "grad_norm": 0.00830752868205309, - "learning_rate": 4.125782683190606e-05, - "loss": 0.0008420860394835472, - "step": 5140 - }, - { - "epoch": 0.9658004176244538, - "grad_norm": 0.01525857299566269, - "learning_rate": 4.119392678126673e-05, - "loss": 0.00587364137172699, - "step": 5160 - }, - { - "epoch": 0.9695438300958664, - "grad_norm": 0.01072095800191164, - "learning_rate": 4.11298439282502e-05, - "loss": 0.00853007659316063, - "step": 5180 - }, - { - "epoch": 0.973287242567279, - "grad_norm": 0.030316641554236412, - "learning_rate": 4.106557899624482e-05, - "loss": 0.0058747071772813795, - "step": 5200 - }, - { - "epoch": 0.9770306550386917, - "grad_norm": 0.0391647033393383, - "learning_rate": 4.1001132710694304e-05, - "loss": 0.0034765828400850295, - "step": 5220 - }, - { - "epoch": 0.9807740675101043, - "grad_norm": 0.04938298836350441, - "learning_rate": 4.093650579908953e-05, - "loss": 0.007594724744558334, - "step": 5240 - }, - { - "epoch": 0.984517479981517, - "grad_norm": 0.005873252172023058, - "learning_rate": 4.087169899096037e-05, - "loss": 0.013347607851028443, - "step": 5260 - }, - { - "epoch": 0.9882608924529295, - "grad_norm": 1.2757259607315063, - "learning_rate": 4.080671301786741e-05, - "loss": 0.004837355017662049, - "step": 5280 - }, - { - "epoch": 0.9920043049243421, - "grad_norm": 0.00920735765248537, - "learning_rate": 4.0741548613393675e-05, - "loss": 0.007415445148944854, - "step": 5300 - }, - { - "epoch": 0.9957477173957547, - "grad_norm": 0.5702093839645386, - "learning_rate": 4.067620651313647e-05, - "loss": 0.00406576506793499, - "step": 5320 - }, - { - "epoch": 0.9994911298671674, - "grad_norm": 1.8361051082611084, - "learning_rate": 4.0610687454698906e-05, - "loss": 0.00997612327337265, - "step": 5340 - }, - { - "epoch": 1.0031819006007008, - "grad_norm": 3.335326910018921, - "learning_rate": 4.0544992177681685e-05, - "loss": 0.008442799001932145, - "step": 5360 - }, - { - "epoch": 1.0069253130721134, - "grad_norm": 0.03184954449534416, - "learning_rate": 4.047912142367473e-05, - "loss": 0.008095134049654007, - "step": 5380 - }, - { - "epoch": 1.010668725543526, - "grad_norm": 0.029989074915647507, - "learning_rate": 4.04130759362488e-05, - "loss": 0.0012585990130901336, - "step": 5400 - }, - { - "epoch": 1.0144121380149385, - "grad_norm": 0.08727464079856873, - "learning_rate": 4.034685646094711e-05, - "loss": 0.012588074803352356, - "step": 5420 - }, - { - "epoch": 1.018155550486351, - "grad_norm": 0.018498806282877922, - "learning_rate": 4.028046374527689e-05, - "loss": 0.001854238100349903, - "step": 5440 - }, - { - "epoch": 1.0218989629577637, - "grad_norm": 0.013779236935079098, - "learning_rate": 4.021389853870095e-05, - "loss": 0.0008004569448530674, - "step": 5460 - }, - { - "epoch": 1.0256423754291764, - "grad_norm": 0.028235070407390594, - "learning_rate": 4.0147161592629306e-05, - "loss": 0.002274145185947418, - "step": 5480 - }, - { - "epoch": 1.029385787900589, - "grad_norm": 0.023030120879411697, - "learning_rate": 4.008025366041055e-05, - "loss": 0.008717305958271027, - "step": 5500 - }, - { - "epoch": 1.0331292003720016, - "grad_norm": 0.018347155302762985, - "learning_rate": 4.001317549732345e-05, - "loss": 0.00244256854057312, - "step": 5520 - }, - { - "epoch": 1.0368726128434143, - "grad_norm": 0.03449391946196556, - "learning_rate": 3.99459278605684e-05, - "loss": 0.0039924226701259615, - "step": 5540 - }, - { - "epoch": 1.0406160253148269, - "grad_norm": 0.030406463891267776, - "learning_rate": 3.9878511509258866e-05, - "loss": 0.0021008485928177834, - "step": 5560 - }, - { - "epoch": 1.0443594377862395, - "grad_norm": 0.01783100888133049, - "learning_rate": 3.9810927204412803e-05, - "loss": 0.0006656501442193985, - "step": 5580 - }, - { - "epoch": 1.0481028502576522, - "grad_norm": 0.05360455811023712, - "learning_rate": 3.974317570894413e-05, - "loss": 0.005278818309307098, - "step": 5600 - }, - { - "epoch": 1.0518462627290646, - "grad_norm": 0.008699169382452965, - "learning_rate": 3.9675257787654e-05, - "loss": 0.005309444293379784, - "step": 5620 - }, - { - "epoch": 1.0555896752004772, - "grad_norm": 0.036641959100961685, - "learning_rate": 3.960717420722227e-05, - "loss": 0.0034692320972681046, - "step": 5640 - }, - { - "epoch": 1.0593330876718898, - "grad_norm": 0.012212110683321953, - "learning_rate": 3.953892573619883e-05, - "loss": 0.005343861132860184, - "step": 5660 - }, - { - "epoch": 1.0630765001433025, - "grad_norm": 0.011296284385025501, - "learning_rate": 3.947051314499489e-05, - "loss": 0.0038058970123529432, - "step": 5680 - }, - { - "epoch": 1.066819912614715, - "grad_norm": 0.05954049900174141, - "learning_rate": 3.94019372058743e-05, - "loss": 0.008142991364002228, - "step": 5700 - }, - { - "epoch": 1.0705633250861277, - "grad_norm": 0.03478416055440903, - "learning_rate": 3.933319869294483e-05, - "loss": 0.0075227849185466765, - "step": 5720 - }, - { - "epoch": 1.0743067375575404, - "grad_norm": 0.014586996287107468, - "learning_rate": 3.9264298382149455e-05, - "loss": 0.0036750122904777526, - "step": 5740 - }, - { - "epoch": 1.078050150028953, - "grad_norm": 0.025754544883966446, - "learning_rate": 3.919523705125757e-05, - "loss": 0.004151013493537903, - "step": 5760 - }, - { - "epoch": 1.0817935625003656, - "grad_norm": 0.03239905461668968, - "learning_rate": 3.9126015479856205e-05, - "loss": 0.00861695185303688, - "step": 5780 - }, - { - "epoch": 1.0855369749717783, - "grad_norm": 0.03506994619965553, - "learning_rate": 3.9056634449341256e-05, - "loss": 0.003123755753040314, - "step": 5800 - }, - { - "epoch": 1.089280387443191, - "grad_norm": 0.0286911278963089, - "learning_rate": 3.898709474290864e-05, - "loss": 0.002537376619875431, - "step": 5820 - }, - { - "epoch": 1.0930237999146033, - "grad_norm": 0.03490692004561424, - "learning_rate": 3.8917397145545454e-05, - "loss": 0.0010227372869849205, - "step": 5840 - }, - { - "epoch": 1.096767212386016, - "grad_norm": 0.013748899102210999, - "learning_rate": 3.884754244402113e-05, - "loss": 0.011847371608018875, - "step": 5860 - }, - { - "epoch": 1.1005106248574286, - "grad_norm": 0.035458195954561234, - "learning_rate": 3.877753142687852e-05, - "loss": 0.009741749614477158, - "step": 5880 - }, - { - "epoch": 1.1042540373288412, - "grad_norm": 0.012493673712015152, - "learning_rate": 3.8707364884425064e-05, - "loss": 0.006607493013143539, - "step": 5900 - }, - { - "epoch": 1.1079974498002538, - "grad_norm": 0.018607834354043007, - "learning_rate": 3.863704360872378e-05, - "loss": 0.0016217166557908058, - "step": 5920 - }, - { - "epoch": 1.1117408622716665, - "grad_norm": 0.0283930953592062, - "learning_rate": 3.8566568393584366e-05, - "loss": 0.002083975449204445, - "step": 5940 - }, - { - "epoch": 1.115484274743079, - "grad_norm": 0.05229801684617996, - "learning_rate": 3.8495940034554283e-05, - "loss": 0.0014217685908079146, - "step": 5960 - }, - { - "epoch": 1.1192276872144917, - "grad_norm": 0.008808930404484272, - "learning_rate": 3.8425159328909684e-05, - "loss": 0.0022570645436644555, - "step": 5980 - }, - { - "epoch": 1.1229710996859044, - "grad_norm": 0.020502232015132904, - "learning_rate": 3.835422707564648e-05, - "loss": 0.003745942190289497, - "step": 6000 - }, - { - "epoch": 1.126714512157317, - "grad_norm": 0.032347094267606735, - "learning_rate": 3.82831440754713e-05, - "loss": 0.003347185626626015, - "step": 6020 - }, - { - "epoch": 1.1304579246287294, - "grad_norm": 0.020310478284955025, - "learning_rate": 3.821191113079246e-05, - "loss": 0.006166417896747589, - "step": 6040 - }, - { - "epoch": 1.134201337100142, - "grad_norm": 0.06390372663736343, - "learning_rate": 3.8140529045710876e-05, - "loss": 0.0013674044981598853, - "step": 6060 - }, - { - "epoch": 1.1379447495715547, - "grad_norm": 1.1938918828964233, - "learning_rate": 3.806899862601105e-05, - "loss": 0.010550644248723984, - "step": 6080 - }, - { - "epoch": 1.1416881620429673, - "grad_norm": 0.035355549305677414, - "learning_rate": 3.799732067915189e-05, - "loss": 0.0069750770926475525, - "step": 6100 - }, - { - "epoch": 1.14543157451438, - "grad_norm": 0.009921093471348286, - "learning_rate": 3.792549601425767e-05, - "loss": 0.0027949588373303415, - "step": 6120 - }, - { - "epoch": 1.1491749869857926, - "grad_norm": 0.06172063946723938, - "learning_rate": 3.785352544210884e-05, - "loss": 0.0009372101165354251, - "step": 6140 - }, - { - "epoch": 1.1529183994572052, - "grad_norm": 0.008572470396757126, - "learning_rate": 3.778140977513294e-05, - "loss": 0.0029502738267183303, - "step": 6160 - }, - { - "epoch": 1.1566618119286178, - "grad_norm": 0.4211727976799011, - "learning_rate": 3.770914982739534e-05, - "loss": 0.014692296087741853, - "step": 6180 - }, - { - "epoch": 1.1604052244000305, - "grad_norm": 0.02292146533727646, - "learning_rate": 3.7636746414590126e-05, - "loss": 0.0020170681178569793, - "step": 6200 - }, - { - "epoch": 1.164148636871443, - "grad_norm": 0.11247449368238449, - "learning_rate": 3.756420035403086e-05, - "loss": 0.006851900368928909, - "step": 6220 - }, - { - "epoch": 1.1678920493428557, - "grad_norm": 0.020755017176270485, - "learning_rate": 3.749151246464137e-05, - "loss": 0.0021739909425377846, - "step": 6240 - }, - { - "epoch": 1.1716354618142684, - "grad_norm": 0.017202025279402733, - "learning_rate": 3.741868356694647e-05, - "loss": 0.002353278361260891, - "step": 6260 - }, - { - "epoch": 1.1753788742856808, - "grad_norm": 0.014947429299354553, - "learning_rate": 3.734571448306274e-05, - "loss": 0.0010860362090170383, - "step": 6280 - }, - { - "epoch": 1.1791222867570934, - "grad_norm": 1.5391262769699097, - "learning_rate": 3.727260603668922e-05, - "loss": 0.01233254000544548, - "step": 6300 - }, - { - "epoch": 1.182865699228506, - "grad_norm": 0.4759792387485504, - "learning_rate": 3.7199359053098133e-05, - "loss": 0.0028501398861408233, - "step": 6320 - }, - { - "epoch": 1.1866091116999187, - "grad_norm": 0.01719040609896183, - "learning_rate": 3.7125974359125536e-05, - "loss": 0.00934450700879097, - "step": 6340 - }, - { - "epoch": 1.1903525241713313, - "grad_norm": 2.4766688346862793, - "learning_rate": 3.7052452783162015e-05, - "loss": 0.018582724034786224, - "step": 6360 - }, - { - "epoch": 1.194095936642744, - "grad_norm": 0.11404932290315628, - "learning_rate": 3.6978795155143326e-05, - "loss": 0.01815672367811203, - "step": 6380 - }, - { - "epoch": 1.1978393491141566, - "grad_norm": 0.021365633234381676, - "learning_rate": 3.690500230654103e-05, - "loss": 0.004123781993985176, - "step": 6400 - }, - { - "epoch": 1.2015827615855692, - "grad_norm": 0.022478772327303886, - "learning_rate": 3.68310750703531e-05, - "loss": 0.0038731731474399567, - "step": 6420 - }, - { - "epoch": 1.2053261740569818, - "grad_norm": 0.15531578660011292, - "learning_rate": 3.67570142810945e-05, - "loss": 0.002076444961130619, - "step": 6440 - }, - { - "epoch": 1.2090695865283942, - "grad_norm": 0.012458150275051594, - "learning_rate": 3.668282077478783e-05, - "loss": 0.0027592860162258146, - "step": 6460 - }, - { - "epoch": 1.2128129989998069, - "grad_norm": 0.01572798565030098, - "learning_rate": 3.66084953889538e-05, - "loss": 0.002740098722279072, - "step": 6480 - }, - { - "epoch": 1.2165564114712195, - "grad_norm": 0.13682503998279572, - "learning_rate": 3.6534038962601835e-05, - "loss": 0.000705425813794136, - "step": 6500 - }, - { - "epoch": 1.2202998239426321, - "grad_norm": 0.030630914494395256, - "learning_rate": 3.64594523362206e-05, - "loss": 0.012480729073286057, - "step": 6520 - }, - { - "epoch": 1.2240432364140448, - "grad_norm": 0.024804554879665375, - "learning_rate": 3.638473635176848e-05, - "loss": 0.0007834361866116523, - "step": 6540 - }, - { - "epoch": 1.2277866488854574, - "grad_norm": 0.011334752663969994, - "learning_rate": 3.630989185266411e-05, - "loss": 0.022086825966835023, - "step": 6560 - }, - { - "epoch": 1.23153006135687, - "grad_norm": 0.020346902310848236, - "learning_rate": 3.623491968377684e-05, - "loss": 0.018024472892284392, - "step": 6580 - }, - { - "epoch": 1.2352734738282827, - "grad_norm": 0.015177210792899132, - "learning_rate": 3.615982069141719e-05, - "loss": 0.005251453071832657, - "step": 6600 - }, - { - "epoch": 1.2390168862996953, - "grad_norm": 0.013680647127330303, - "learning_rate": 3.608459572332733e-05, - "loss": 0.006734563410282135, - "step": 6620 - }, - { - "epoch": 1.242760298771108, - "grad_norm": 0.17980872094631195, - "learning_rate": 3.600924562867144e-05, - "loss": 0.003970410302281379, - "step": 6640 - }, - { - "epoch": 1.2465037112425206, - "grad_norm": 0.015203841030597687, - "learning_rate": 3.593377125802622e-05, - "loss": 0.0032148901373147964, - "step": 6660 - }, - { - "epoch": 1.2502471237139332, - "grad_norm": 0.017300931736826897, - "learning_rate": 3.585817346337119e-05, - "loss": 0.00467667318880558, - "step": 6680 - }, - { - "epoch": 1.2539905361853458, - "grad_norm": 0.028181765228509903, - "learning_rate": 3.5782453098079175e-05, - "loss": 0.0015515764243900776, - "step": 6700 - }, - { - "epoch": 1.2577339486567582, - "grad_norm": 0.01730780117213726, - "learning_rate": 3.570661101690657e-05, - "loss": 0.007991334050893783, - "step": 6720 - }, - { - "epoch": 1.2614773611281709, - "grad_norm": 0.014216347597539425, - "learning_rate": 3.5630648075983763e-05, - "loss": 0.002533360943198204, - "step": 6740 - }, - { - "epoch": 1.2652207735995835, - "grad_norm": 0.1556195169687271, - "learning_rate": 3.555456513280544e-05, - "loss": 0.0032653655856847764, - "step": 6760 - }, - { - "epoch": 1.2689641860709961, - "grad_norm": 0.023955868557095528, - "learning_rate": 3.5478363046220915e-05, - "loss": 0.00850408971309662, - "step": 6780 - }, - { - "epoch": 1.2727075985424088, - "grad_norm": 0.17874136567115784, - "learning_rate": 3.5402042676424424e-05, - "loss": 0.0032720811665058135, - "step": 6800 - }, - { - "epoch": 1.2764510110138214, - "grad_norm": 0.0899379625916481, - "learning_rate": 3.5325604884945434e-05, - "loss": 0.003243798017501831, - "step": 6820 - }, - { - "epoch": 1.280194423485234, - "grad_norm": 0.413362056016922, - "learning_rate": 3.5249050534638906e-05, - "loss": 0.0036127623170614243, - "step": 6840 - }, - { - "epoch": 1.2839378359566467, - "grad_norm": 0.02790931612253189, - "learning_rate": 3.517238048967554e-05, - "loss": 0.008225285261869431, - "step": 6860 - }, - { - "epoch": 1.287681248428059, - "grad_norm": 0.6761110424995422, - "learning_rate": 3.5095595615532056e-05, - "loss": 0.00199942234903574, - "step": 6880 - }, - { - "epoch": 1.2914246608994717, - "grad_norm": 4.593618869781494, - "learning_rate": 3.5018696778981385e-05, - "loss": 0.007301987707614898, - "step": 6900 - }, - { - "epoch": 1.2951680733708844, - "grad_norm": 0.09392693638801575, - "learning_rate": 3.494168484808293e-05, - "loss": 0.009008315950632095, - "step": 6920 - }, - { - "epoch": 1.298911485842297, - "grad_norm": 0.008239852264523506, - "learning_rate": 3.48645606921727e-05, - "loss": 0.012661360204219818, - "step": 6940 - }, - { - "epoch": 1.3026548983137096, - "grad_norm": 0.05141177773475647, - "learning_rate": 3.4787325181853576e-05, - "loss": 0.0007553372532129287, - "step": 6960 - }, - { - "epoch": 1.3063983107851223, - "grad_norm": 0.024333903566002846, - "learning_rate": 3.470997918898541e-05, - "loss": 0.0016128463670611382, - "step": 6980 - }, - { - "epoch": 1.3101417232565349, - "grad_norm": 0.0337531715631485, - "learning_rate": 3.4632523586675254e-05, - "loss": 0.003253454715013504, - "step": 7000 - }, - { - "epoch": 1.3138851357279475, - "grad_norm": 0.05121550336480141, - "learning_rate": 3.4554959249267436e-05, - "loss": 0.0026307271793484686, - "step": 7020 - }, - { - "epoch": 1.3176285481993602, - "grad_norm": 0.025997543707489967, - "learning_rate": 3.447728705233374e-05, - "loss": 0.0012719514779746532, - "step": 7040 - }, - { - "epoch": 1.3213719606707728, - "grad_norm": 0.009486268274486065, - "learning_rate": 3.4399507872663494e-05, - "loss": 0.002009082958102226, - "step": 7060 - }, - { - "epoch": 1.3251153731421854, - "grad_norm": 0.016816232353448868, - "learning_rate": 3.432162258825369e-05, - "loss": 0.0005956823006272316, - "step": 7080 - }, - { - "epoch": 1.328858785613598, - "grad_norm": 0.004733961541205645, - "learning_rate": 3.424363207829906e-05, - "loss": 0.003636709600687027, - "step": 7100 - }, - { - "epoch": 1.3326021980850107, - "grad_norm": 3.666203498840332, - "learning_rate": 3.4165537223182155e-05, - "loss": 0.010488419234752655, - "step": 7120 - }, - { - "epoch": 1.336345610556423, - "grad_norm": 0.021471882238984108, - "learning_rate": 3.408733890446341e-05, - "loss": 0.0009709249250590801, - "step": 7140 - }, - { - "epoch": 1.3400890230278357, - "grad_norm": 0.007639541756361723, - "learning_rate": 3.40090380048712e-05, - "loss": 0.0030905861407518388, - "step": 7160 - }, - { - "epoch": 1.3438324354992484, - "grad_norm": 0.16878941655158997, - "learning_rate": 3.393063540829186e-05, - "loss": 0.0036965351551771163, - "step": 7180 - }, - { - "epoch": 1.347575847970661, - "grad_norm": 0.07014094293117523, - "learning_rate": 3.385213199975971e-05, - "loss": 0.0005677144508808851, - "step": 7200 - }, - { - "epoch": 1.3513192604420736, - "grad_norm": 0.008626374416053295, - "learning_rate": 3.377352866544706e-05, - "loss": 0.0005447934381663799, - "step": 7220 - }, - { - "epoch": 1.3550626729134863, - "grad_norm": 0.013825134374201298, - "learning_rate": 3.3694826292654246e-05, - "loss": 0.004854041337966919, - "step": 7240 - }, - { - "epoch": 1.3588060853848989, - "grad_norm": 0.025015883147716522, - "learning_rate": 3.361602576979956e-05, - "loss": 0.004542553424835205, - "step": 7260 - }, - { - "epoch": 1.3625494978563115, - "grad_norm": 0.009614030830562115, - "learning_rate": 3.353712798640923e-05, - "loss": 0.0008775785565376282, - "step": 7280 - }, - { - "epoch": 1.366292910327724, - "grad_norm": 3.8835268020629883, - "learning_rate": 3.345813383310744e-05, - "loss": 0.0063879616558551785, - "step": 7300 - }, - { - "epoch": 1.3700363227991366, - "grad_norm": 0.005518193822354078, - "learning_rate": 3.337904420160618e-05, - "loss": 0.0010956574231386184, - "step": 7320 - }, - { - "epoch": 1.3737797352705492, - "grad_norm": 0.005018322728574276, - "learning_rate": 3.329985998469526e-05, - "loss": 0.0012317843735218047, - "step": 7340 - }, - { - "epoch": 0.6887872232777639, - "grad_norm": 0.3108454942703247, - "learning_rate": 3.322058207623218e-05, - "loss": 0.010070423781871795, - "step": 7360 - }, - { - "epoch": 0.6906589276888447, - "grad_norm": 0.3556046783924103, - "learning_rate": 3.314121137113209e-05, - "loss": 0.0278738796710968, - "step": 7380 - }, - { - "epoch": 0.6925306320999256, - "grad_norm": 4.041794300079346, - "learning_rate": 3.306174876535762e-05, - "loss": 0.025335192680358887, - "step": 7400 - }, - { - "epoch": 0.6944023365110065, - "grad_norm": 0.04647493362426758, - "learning_rate": 3.2982195155908845e-05, - "loss": 0.05056847333908081, - "step": 7420 - }, - { - "epoch": 0.6962740409220873, - "grad_norm": 0.6827419400215149, - "learning_rate": 3.290653575270209e-05, - "loss": 0.036053261160850524, - "step": 7440 - }, - { - "epoch": 0.6981457453331683, - "grad_norm": 0.256136029958725, - "learning_rate": 3.2826807269966064e-05, - "loss": 0.020640365779399872, - "step": 7460 - }, - { - "epoch": 0.7000174497442492, - "grad_norm": 0.2054845094680786, - "learning_rate": 3.274699043565268e-05, - "loss": 0.03456352353096008, - "step": 7480 - }, - { - "epoch": 0.70188915415533, - "grad_norm": 0.2027648538351059, - "learning_rate": 3.266708615076064e-05, - "loss": 0.00846734493970871, - "step": 7500 - }, - { - "epoch": 0.7037608585664109, - "grad_norm": 1.6423311233520508, - "learning_rate": 3.258709531727582e-05, - "loss": 0.054978948831558225, - "step": 7520 - }, - { - "epoch": 0.7056325629774918, - "grad_norm": 1.775089144706726, - "learning_rate": 3.2507018838161085e-05, - "loss": 0.03238933086395264, - "step": 7540 - }, - { - "epoch": 0.7075042673885726, - "grad_norm": 0.06917860358953476, - "learning_rate": 3.242685761734609e-05, - "loss": 0.016849520802497863, - "step": 7560 - }, - { - "epoch": 0.7093759717996535, - "grad_norm": 0.051443129777908325, - "learning_rate": 3.2346612559717094e-05, - "loss": 0.048251998424530027, - "step": 7580 - }, - { - "epoch": 0.7112476762107344, - "grad_norm": 0.06533925980329514, - "learning_rate": 3.226628457110672e-05, - "loss": 0.03696450293064117, - "step": 7600 - }, - { - "epoch": 0.7131193806218153, - "grad_norm": 0.45661595463752747, - "learning_rate": 3.218587455828377e-05, - "loss": 0.05503013730049133, - "step": 7620 - }, - { - "epoch": 0.7149910850328962, - "grad_norm": 2.0205914974212646, - "learning_rate": 3.210538342894291e-05, - "loss": 0.033562681078910826, - "step": 7640 - }, - { - "epoch": 0.7168627894439771, - "grad_norm": 2.4842448234558105, - "learning_rate": 3.202481209169455e-05, - "loss": 0.019278638064861298, - "step": 7660 - }, - { - "epoch": 0.7187344938550579, - "grad_norm": 0.10550081729888916, - "learning_rate": 3.1944161456054436e-05, - "loss": 0.01638232171535492, - "step": 7680 - }, - { - "epoch": 0.7206061982661388, - "grad_norm": 1.606436014175415, - "learning_rate": 3.1863432432433506e-05, - "loss": 0.020552067458629607, - "step": 7700 - }, - { - "epoch": 0.7224779026772197, - "grad_norm": 0.2617719769477844, - "learning_rate": 3.178262593212757e-05, - "loss": 0.02315783053636551, - "step": 7720 - }, - { - "epoch": 0.7243496070883005, - "grad_norm": 0.9734074473381042, - "learning_rate": 3.1701742867307e-05, - "loss": 0.01938771307468414, - "step": 7740 - }, - { - "epoch": 0.7262213114993814, - "grad_norm": 0.5882985591888428, - "learning_rate": 3.162078415100647e-05, - "loss": 0.011305707693099975, - "step": 7760 - }, - { - "epoch": 0.7280930159104624, - "grad_norm": 0.04298723489046097, - "learning_rate": 3.15397506971146e-05, - "loss": 0.04238930344581604, - "step": 7780 - }, - { - "epoch": 0.7299647203215432, - "grad_norm": 6.2729315757751465, - "learning_rate": 3.145864342036372e-05, - "loss": 0.030225831270217895, - "step": 7800 - }, - { - "epoch": 0.7318364247326241, - "grad_norm": 0.026423340663313866, - "learning_rate": 3.1377463236319476e-05, - "loss": 0.012169972807168961, - "step": 7820 - }, - { - "epoch": 0.733708129143705, - "grad_norm": 0.0296376533806324, - "learning_rate": 3.1296211061370495e-05, - "loss": 0.015344823896884918, - "step": 7840 - }, - { - "epoch": 0.7355798335547858, - "grad_norm": 0.029524821788072586, - "learning_rate": 3.1214887812718094e-05, - "loss": 0.028345003724098206, - "step": 7860 - }, - { - "epoch": 0.7374515379658667, - "grad_norm": 0.06847794353961945, - "learning_rate": 3.113349440836588e-05, - "loss": 0.020069575309753417, - "step": 7880 - }, - { - "epoch": 0.7393232423769476, - "grad_norm": 0.024868430569767952, - "learning_rate": 3.1052031767109376e-05, - "loss": 0.014262473583221436, - "step": 7900 - }, - { - "epoch": 0.7411949467880286, - "grad_norm": 0.24450063705444336, - "learning_rate": 3.097050080852573e-05, - "loss": 0.04350808262825012, - "step": 7920 - }, - { - "epoch": 0.7430666511991094, - "grad_norm": 0.06978324800729752, - "learning_rate": 3.088890245296322e-05, - "loss": 0.015559709072113037, - "step": 7940 - }, - { - "epoch": 0.7449383556101903, - "grad_norm": 0.12675604224205017, - "learning_rate": 3.0807237621530964e-05, - "loss": 0.013867451250553131, - "step": 7960 - }, - { - "epoch": 0.7468100600212711, - "grad_norm": 0.2605513334274292, - "learning_rate": 3.072550723608846e-05, - "loss": 0.012869009375572204, - "step": 7980 - }, - { - "epoch": 0.748681764432352, - "grad_norm": 3.325530529022217, - "learning_rate": 3.064371221923521e-05, - "loss": 0.03036353886127472, - "step": 8000 - }, - { - "epoch": 0.7505534688434329, - "grad_norm": 0.22703051567077637, - "learning_rate": 3.0561853494300294e-05, - "loss": 0.009017374366521835, - "step": 8020 - }, - { - "epoch": 0.7524251732545137, - "grad_norm": 6.404862880706787, - "learning_rate": 3.047993198533195e-05, - "loss": 0.020604299008846284, - "step": 8040 - }, - { - "epoch": 0.7542968776655946, - "grad_norm": 0.06491954624652863, - "learning_rate": 3.039794861708714e-05, - "loss": 0.014963623881340028, - "step": 8060 - }, - { - "epoch": 0.7561685820766756, - "grad_norm": 0.4990088641643524, - "learning_rate": 3.0315904315021128e-05, - "loss": 0.02046530395746231, - "step": 8080 - }, - { - "epoch": 0.7580402864877565, - "grad_norm": 0.3174229562282562, - "learning_rate": 3.023380000527699e-05, - "loss": 0.013621781766414643, - "step": 8100 - }, - { - "epoch": 0.7599119908988373, - "grad_norm": 0.07161428034305573, - "learning_rate": 3.0151636614675218e-05, - "loss": 0.008043503761291504, - "step": 8120 - }, - { - "epoch": 0.7617836953099182, - "grad_norm": 0.6772736310958862, - "learning_rate": 3.0069415070703217e-05, - "loss": 0.03563189804553986, - "step": 8140 - }, - { - "epoch": 0.763655399720999, - "grad_norm": 0.07689516246318817, - "learning_rate": 2.998713630150485e-05, - "loss": 0.008622632920742035, - "step": 8160 - }, - { - "epoch": 0.7655271041320799, - "grad_norm": 0.014181110076606274, - "learning_rate": 2.990480123586994e-05, - "loss": 0.012368627637624741, - "step": 8180 - }, - { - "epoch": 0.7673988085431608, - "grad_norm": 4.4751715660095215, - "learning_rate": 2.9822410803223822e-05, - "loss": 0.02100955694913864, - "step": 8200 - }, - { - "epoch": 0.7692705129542416, - "grad_norm": 0.12694527208805084, - "learning_rate": 2.9739965933616825e-05, - "loss": 0.018182000517845152, - "step": 8220 - }, - { - "epoch": 0.7711422173653226, - "grad_norm": 0.13789872825145721, - "learning_rate": 2.9657467557713792e-05, - "loss": 0.008949784934520722, - "step": 8240 - }, - { - "epoch": 0.7730139217764035, - "grad_norm": 0.04048463702201843, - "learning_rate": 2.957491660678354e-05, - "loss": 0.03582434058189392, - "step": 8260 - }, - { - "epoch": 0.7748856261874844, - "grad_norm": 0.7825964689254761, - "learning_rate": 2.9492314012688378e-05, - "loss": 0.012679101526737213, - "step": 8280 - }, - { - "epoch": 0.7767573305985652, - "grad_norm": 0.14350314438343048, - "learning_rate": 2.9409660707873597e-05, - "loss": 0.010909486562013626, - "step": 8300 - }, - { - "epoch": 0.7786290350096461, - "grad_norm": 0.17676737904548645, - "learning_rate": 2.932695762535691e-05, - "loss": 0.01464642733335495, - "step": 8320 - }, - { - "epoch": 0.780500739420727, - "grad_norm": 0.5979751348495483, - "learning_rate": 2.9244205698717943e-05, - "loss": 0.028799059987068176, - "step": 8340 - }, - { - "epoch": 0.7823724438318078, - "grad_norm": 0.08448052406311035, - "learning_rate": 2.9161405862087676e-05, - "loss": 0.014056096971035003, - "step": 8360 - }, - { - "epoch": 0.7842441482428888, - "grad_norm": 0.5616207122802734, - "learning_rate": 2.9078559050137955e-05, - "loss": 0.008744364231824875, - "step": 8380 - }, - { - "epoch": 0.7861158526539697, - "grad_norm": 0.7264829277992249, - "learning_rate": 2.8995666198070836e-05, - "loss": 0.014575870335102081, - "step": 8400 - }, - { - "epoch": 0.7879875570650505, - "grad_norm": 1.444239616394043, - "learning_rate": 2.891272824160815e-05, - "loss": 0.01230706349015236, - "step": 8420 - }, - { - "epoch": 0.7898592614761314, - "grad_norm": 0.02643579989671707, - "learning_rate": 2.882974611698084e-05, - "loss": 0.01713460832834244, - "step": 8440 - }, - { - "epoch": 0.7917309658872123, - "grad_norm": 0.19893163442611694, - "learning_rate": 2.8746720760918457e-05, - "loss": 0.009562552720308305, - "step": 8460 - }, - { - "epoch": 0.7936026702982931, - "grad_norm": 1.8813897371292114, - "learning_rate": 2.866365311063855e-05, - "loss": 0.01966284364461899, - "step": 8480 - }, - { - "epoch": 0.795474374709374, - "grad_norm": 0.1820579618215561, - "learning_rate": 2.8580544103836114e-05, - "loss": 0.023943188786506652, - "step": 8500 - }, - { - "epoch": 0.7973460791204549, - "grad_norm": 1.3913259506225586, - "learning_rate": 2.849739467867298e-05, - "loss": 0.02233349084854126, - "step": 8520 - }, - { - "epoch": 0.7992177835315358, - "grad_norm": 0.28450486063957214, - "learning_rate": 2.8414205773767223e-05, - "loss": 0.016230446100234986, - "step": 8540 - }, - { - "epoch": 0.8010894879426167, - "grad_norm": 0.46086356043815613, - "learning_rate": 2.83309783281826e-05, - "loss": 0.013964855670928955, - "step": 8560 - }, - { - "epoch": 0.8029611923536976, - "grad_norm": 1.1401137113571167, - "learning_rate": 2.8247713281417924e-05, - "loss": 0.01552264392375946, - "step": 8580 - }, - { - "epoch": 0.8048328967647784, - "grad_norm": 0.02414649911224842, - "learning_rate": 2.8164411573396444e-05, - "loss": 0.00505053773522377, - "step": 8600 - }, - { - "epoch": 0.8067046011758593, - "grad_norm": 0.029010778293013573, - "learning_rate": 2.8081074144455276e-05, - "loss": 0.008068422973155975, - "step": 8620 - }, - { - "epoch": 0.8085763055869402, - "grad_norm": 0.024924319237470627, - "learning_rate": 2.7997701935334747e-05, - "loss": 0.021529987454414368, - "step": 8640 - }, - { - "epoch": 0.810448009998021, - "grad_norm": 0.3544171154499054, - "learning_rate": 2.791429588716782e-05, - "loss": 0.008264218270778657, - "step": 8660 - }, - { - "epoch": 0.8123197144091019, - "grad_norm": 0.011211074888706207, - "learning_rate": 2.7830856941469407e-05, - "loss": 0.013752134144306183, - "step": 8680 - }, - { - "epoch": 0.8141914188201829, - "grad_norm": 0.30479249358177185, - "learning_rate": 2.7747386040125807e-05, - "loss": 0.01313515156507492, - "step": 8700 - }, - { - "epoch": 0.8160631232312637, - "grad_norm": 3.1079516410827637, - "learning_rate": 2.766388412538404e-05, - "loss": 0.013471932709217071, - "step": 8720 - }, - { - "epoch": 0.8179348276423446, - "grad_norm": 0.011288405396044254, - "learning_rate": 2.758035213984121e-05, - "loss": 0.011207062005996703, - "step": 8740 - }, - { - "epoch": 0.8198065320534255, - "grad_norm": 0.011481484398245811, - "learning_rate": 2.749679102643387e-05, - "loss": 0.018254657089710236, - "step": 8760 - }, - { - "epoch": 0.8216782364645063, - "grad_norm": 0.037564992904663086, - "learning_rate": 2.7413201728427372e-05, - "loss": 0.024057184159755707, - "step": 8780 - }, - { - "epoch": 0.8235499408755872, - "grad_norm": 0.03808968514204025, - "learning_rate": 2.7329585189405253e-05, - "loss": 0.006051592528820038, - "step": 8800 - }, - { - "epoch": 0.8254216452866681, - "grad_norm": 0.07610247284173965, - "learning_rate": 2.724594235325852e-05, - "loss": 0.025592076778411865, - "step": 8820 - }, - { - "epoch": 0.827293349697749, - "grad_norm": 0.019049810245633125, - "learning_rate": 2.716227416417505e-05, - "loss": 0.0037486787885427477, - "step": 8840 - }, - { - "epoch": 0.8291650541088299, - "grad_norm": 0.6380273699760437, - "learning_rate": 2.7078581566628897e-05, - "loss": 0.015487492084503174, - "step": 8860 - }, - { - "epoch": 0.8310367585199108, - "grad_norm": 0.05775881186127663, - "learning_rate": 2.699486550536968e-05, - "loss": 0.03133237063884735, - "step": 8880 - }, - { - "epoch": 0.8329084629309916, - "grad_norm": 0.047411222010850906, - "learning_rate": 2.6911126925411845e-05, - "loss": 0.00861177071928978, - "step": 8900 - }, - { - "epoch": 0.8347801673420725, - "grad_norm": 0.23981286585330963, - "learning_rate": 2.682736677202406e-05, - "loss": 0.01839599907398224, - "step": 8920 - }, - { - "epoch": 0.8366518717531534, - "grad_norm": 0.36887305974960327, - "learning_rate": 2.6743585990718505e-05, - "loss": 0.01008533239364624, - "step": 8940 - }, - { - "epoch": 0.8385235761642342, - "grad_norm": 0.8994531035423279, - "learning_rate": 2.6659785527240233e-05, - "loss": 0.027107802033424378, - "step": 8960 - }, - { - "epoch": 0.8403952805753151, - "grad_norm": 0.12780402600765228, - "learning_rate": 2.6575966327556458e-05, - "loss": 0.03549482524394989, - "step": 8980 - }, - { - "epoch": 0.8422669849863961, - "grad_norm": 0.3294568359851837, - "learning_rate": 2.649212933784591e-05, - "loss": 0.02797776460647583, - "step": 9000 - }, - { - "epoch": 0.8441386893974769, - "grad_norm": 0.019461506977677345, - "learning_rate": 2.640827550448812e-05, - "loss": 0.010047334432601928, - "step": 9020 - }, - { - "epoch": 0.8460103938085578, - "grad_norm": 0.056546472012996674, - "learning_rate": 2.6324405774052784e-05, - "loss": 0.02831721007823944, - "step": 9040 - }, - { - "epoch": 0.8478820982196387, - "grad_norm": 0.017190299928188324, - "learning_rate": 2.6240521093289022e-05, - "loss": 0.019623257219791412, - "step": 9060 - }, - { - "epoch": 0.8497538026307195, - "grad_norm": 0.04793965816497803, - "learning_rate": 2.6156622409114728e-05, - "loss": 0.011966148018836975, - "step": 9080 - }, - { - "epoch": 0.8516255070418004, - "grad_norm": 0.006742037367075682, - "learning_rate": 2.607271066860587e-05, - "loss": 0.013694784045219422, - "step": 9100 - }, - { - "epoch": 0.8534972114528813, - "grad_norm": 0.03113027848303318, - "learning_rate": 2.5988786818985812e-05, - "loss": 0.05338943004608154, - "step": 9120 - }, - { - "epoch": 0.8553689158639621, - "grad_norm": 0.6589255928993225, - "learning_rate": 2.5904851807614588e-05, - "loss": 0.01305432766675949, - "step": 9140 - }, - { - "epoch": 0.8572406202750431, - "grad_norm": 0.3030281960964203, - "learning_rate": 2.582090658197825e-05, - "loss": 0.03663805425167084, - "step": 9160 - }, - { - "epoch": 0.859112324686124, - "grad_norm": 0.37101081013679504, - "learning_rate": 2.573695208967814e-05, - "loss": 0.016968609392642976, - "step": 9180 - }, - { - "epoch": 0.8609840290972048, - "grad_norm": 0.7480998039245605, - "learning_rate": 2.5652989278420197e-05, - "loss": 0.021240857243537904, - "step": 9200 - }, - { - "epoch": 0.8628557335082857, - "grad_norm": 0.017131274566054344, - "learning_rate": 2.5569019096004304e-05, - "loss": 0.004783949628472328, - "step": 9220 - }, - { - "epoch": 0.8647274379193666, - "grad_norm": 1.1544040441513062, - "learning_rate": 2.5485042490313504e-05, - "loss": 0.02356208860874176, - "step": 9240 - }, - { - "epoch": 0.8665991423304474, - "grad_norm": 0.13512635231018066, - "learning_rate": 2.540106040930338e-05, - "loss": 0.009329542517662048, - "step": 9260 - }, - { - "epoch": 0.8684708467415283, - "grad_norm": 0.018427839502692223, - "learning_rate": 2.5317073800991304e-05, - "loss": 0.007472375035285949, - "step": 9280 - }, - { - "epoch": 0.8703425511526093, - "grad_norm": 0.02722800336778164, - "learning_rate": 2.5233083613445778e-05, - "loss": 0.020304642617702484, - "step": 9300 - }, - { - "epoch": 0.8722142555636901, - "grad_norm": 0.051702745258808136, - "learning_rate": 2.5149090794775675e-05, - "loss": 0.02955295443534851, - "step": 9320 - }, - { - "epoch": 0.874085959974771, - "grad_norm": 0.1535400152206421, - "learning_rate": 2.5065096293119604e-05, - "loss": 0.030047640204429626, - "step": 9340 - }, - { - "epoch": 0.8759576643858519, - "grad_norm": 0.383573979139328, - "learning_rate": 2.498110105663513e-05, - "loss": 0.011377302557229995, - "step": 9360 - }, - { - "epoch": 0.8778293687969327, - "grad_norm": 0.23541487753391266, - "learning_rate": 2.489710603348817e-05, - "loss": 0.02304387390613556, - "step": 9380 - }, - { - "epoch": 0.8797010732080136, - "grad_norm": 0.029004938900470734, - "learning_rate": 2.4813112171842162e-05, - "loss": 0.020582889020442963, - "step": 9400 - }, - { - "epoch": 0.8815727776190945, - "grad_norm": 0.06564116477966309, - "learning_rate": 2.4729120419847498e-05, - "loss": 0.014207787811756134, - "step": 9420 - }, - { - "epoch": 0.8834444820301753, - "grad_norm": 0.01633615791797638, - "learning_rate": 2.464513172563072e-05, - "loss": 0.01756283938884735, - "step": 9440 - }, - { - "epoch": 0.8853161864412563, - "grad_norm": 0.01287770178169012, - "learning_rate": 2.456114703728386e-05, - "loss": 0.003737853467464447, - "step": 9460 - }, - { - "epoch": 0.8871878908523372, - "grad_norm": 0.05004064738750458, - "learning_rate": 2.448136615728485e-05, - "loss": 0.0324675589799881, - "step": 9480 - }, - { - "epoch": 0.889059595263418, - "grad_norm": 1.20869779586792, - "learning_rate": 2.4397392007153162e-05, - "loss": 0.007156150788068772, - "step": 9500 - }, - { - "epoch": 0.8909312996744989, - "grad_norm": 1.1070218086242676, - "learning_rate": 2.43134246594589e-05, - "loss": 0.009275762736797333, - "step": 9520 - }, - { - "epoch": 0.8928030040855798, - "grad_norm": 0.878593385219574, - "learning_rate": 2.4229465062053136e-05, - "loss": 0.018170186877250673, - "step": 9540 - }, - { - "epoch": 2.236302797078385, - "grad_norm": 0.03912261128425598, - "learning_rate": 1.4461640332194936e-05, - "loss": 0.0013993863249197602, - "step": 9560 - }, - { - "epoch": 2.2409820603868678, - "grad_norm": 0.2482009381055832, - "learning_rate": 1.4366537531356394e-05, - "loss": 0.006357508152723313, - "step": 9580 - }, - { - "epoch": 2.2456613236953507, - "grad_norm": 0.011289082467556, - "learning_rate": 1.4271622228435674e-05, - "loss": 0.020982606709003447, - "step": 9600 - }, - { - "epoch": 2.2503405870038335, - "grad_norm": 0.022541223093867302, - "learning_rate": 1.4176896097057135e-05, - "loss": 0.00703481137752533, - "step": 9620 - }, - { - "epoch": 2.2550198503123164, - "grad_norm": 0.1335306018590927, - "learning_rate": 1.4082360807509482e-05, - "loss": 0.007030846178531646, - "step": 9640 - }, - { - "epoch": 2.2596991136207993, - "grad_norm": 0.20317842066287994, - "learning_rate": 1.3988018026716371e-05, - "loss": 0.006802820414304733, - "step": 9660 - }, - { - "epoch": 2.2643783769292822, - "grad_norm": 0.02236269973218441, - "learning_rate": 1.3893869418206949e-05, - "loss": 0.007227703183889389, - "step": 9680 - }, - { - "epoch": 2.269057640237765, - "grad_norm": 0.014223535545170307, - "learning_rate": 1.3799916642086585e-05, - "loss": 0.0067868843674659726, - "step": 9700 - }, - { - "epoch": 2.273736903546248, - "grad_norm": 0.02096005715429783, - "learning_rate": 1.3706161355007579e-05, - "loss": 0.014182762801647186, - "step": 9720 - }, - { - "epoch": 2.278416166854731, - "grad_norm": 0.022602779790759087, - "learning_rate": 1.3612605210139912e-05, - "loss": 0.007886608690023422, - "step": 9740 - }, - { - "epoch": 2.283095430163214, - "grad_norm": 0.5981806516647339, - "learning_rate": 1.3519249857142147e-05, - "loss": 0.002214055508375168, - "step": 9760 - }, - { - "epoch": 2.2877746934716967, - "grad_norm": 0.014210161752998829, - "learning_rate": 1.3426096942132305e-05, - "loss": 0.0067164845764637, - "step": 9780 - }, - { - "epoch": 2.2924539567801796, - "grad_norm": 0.1347479671239853, - "learning_rate": 1.3333148107658883e-05, - "loss": 0.009656199812889099, - "step": 9800 - }, - { - "epoch": 2.2971332200886625, - "grad_norm": 0.01877514459192753, - "learning_rate": 1.3240404992671823e-05, - "loss": 0.008465659618377686, - "step": 9820 - }, - { - "epoch": 2.3018124833971454, - "grad_norm": 0.046075042337179184, - "learning_rate": 1.3147869232493698e-05, - "loss": 0.010561748594045638, - "step": 9840 - }, - { - "epoch": 2.306491746705628, - "grad_norm": 0.01830120198428631, - "learning_rate": 1.305554245879079e-05, - "loss": 0.0038456227630376816, - "step": 9860 - }, - { - "epoch": 2.3111710100141107, - "grad_norm": 0.7911403179168701, - "learning_rate": 1.296342629954439e-05, - "loss": 0.009874989837408065, - "step": 9880 - }, - { - "epoch": 2.3158502733225936, - "grad_norm": 0.012944846414029598, - "learning_rate": 1.2871522379022038e-05, - "loss": 0.004237812012434006, - "step": 9900 - }, - { - "epoch": 2.3205295366310765, - "grad_norm": 0.018642093986272812, - "learning_rate": 1.2779832317748933e-05, - "loss": 0.013542568683624268, - "step": 9920 - }, - { - "epoch": 2.3252087999395594, - "grad_norm": 2.968116044998169, - "learning_rate": 1.2688357732479303e-05, - "loss": 0.013356439769268036, - "step": 9940 - }, - { - "epoch": 2.3298880632480423, - "grad_norm": 0.7037340402603149, - "learning_rate": 1.2597100236167963e-05, - "loss": 0.008408596366643905, - "step": 9960 - }, - { - "epoch": 2.334567326556525, - "grad_norm": 1.47929847240448, - "learning_rate": 1.2506061437941804e-05, - "loss": 0.012505564093589782, - "step": 9980 - }, - { - "epoch": 2.339246589865008, - "grad_norm": 1.5857082605361938, - "learning_rate": 1.241524294307147e-05, - "loss": 0.007822493463754654, - "step": 10000 - } - ], - "logging_steps": 20, - "max_steps": 14963, - "num_input_tokens_seen": 0, - "num_train_epochs": 4, - "save_steps": 1000000000, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 3.842105679776333e+16, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/slots/11/checkpoint-10018/training_args.bin b/slots/11/checkpoint-10018/training_args.bin deleted file mode 100644 index cba6bf44229020a6cf5d76cffc747dea705142ea..0000000000000000000000000000000000000000 --- a/slots/11/checkpoint-10018/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:66430bba102a8f3dc245713cd6268a99c212c508aacce1d8b9768464f5df26ec -size 5201 diff --git a/slots/11/latest.json b/slots/11/latest.json deleted file mode 100644 index fd0347df240197e26f49accdd05b0e995ddd4b23..0000000000000000000000000000000000000000 --- a/slots/11/latest.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "slot:11", "checkpoint": "slots/11/checkpoint-10018", "step": 10018, "updated_at": 1776816036} diff --git a/slots/12/latest.json b/slots/12/latest.json deleted file mode 100644 index 0cb2de688625ff94c682a53dc72a84ed5abd894d..0000000000000000000000000000000000000000 --- a/slots/12/latest.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "slot:12", "checkpoint": "slots/12/checkpoint-9896", "step": 9896, "updated_at": 1776815553} diff --git a/slots/13/latest.json b/slots/13/latest.json deleted file mode 100644 index bfdea27ae41dca2e6758e63dc060edbeafde4af9..0000000000000000000000000000000000000000 --- a/slots/13/latest.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "slot:13", "checkpoint": "slots/13/checkpoint-9966", "step": 9966, "updated_at": 1776815459} diff --git a/slots/14/latest.json b/slots/14/latest.json deleted file mode 100644 index 81361138ee3f1263aa8f20d779fdc9f15e156305..0000000000000000000000000000000000000000 --- a/slots/14/latest.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "slot:14", "checkpoint": "slots/14/checkpoint-8908", "step": 8908, "updated_at": 1776778471} diff --git a/slots/15/latest.json b/slots/15/latest.json deleted file mode 100644 index 958af608a371a85180777aebb6327c350d52b202..0000000000000000000000000000000000000000 --- a/slots/15/latest.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "slot:15", "checkpoint": "slots/15/checkpoint-9384", "step": 9384, "updated_at": 1776780468} diff --git a/slots/16/latest.json b/slots/16/latest.json deleted file mode 100644 index 83410e27473fd6dc3a953644be47ef3394d3bf7e..0000000000000000000000000000000000000000 --- a/slots/16/latest.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "slot:16", "checkpoint": "slots/16/checkpoint-9287", "step": 9287, "updated_at": 1776779956} diff --git a/slots/17/latest.json b/slots/17/latest.json deleted file mode 100644 index 688d909bafd361a1053b8400ebb5b92c484f6d53..0000000000000000000000000000000000000000 --- a/slots/17/latest.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "slot:17", "checkpoint": "slots/17/checkpoint-8838", "step": 8838, "updated_at": 1776778390} diff --git a/slots/18/latest.json b/slots/18/latest.json deleted file mode 100644 index e94b349aff5aedaa6db1fac884cb42f3c2ba26ff..0000000000000000000000000000000000000000 --- a/slots/18/latest.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "slot:18", "checkpoint": "slots/18/checkpoint-9331", "step": 9331, "updated_at": 1776780123} diff --git a/slots/19/checkpoint-10023/config.json b/slots/19/checkpoint-10023/config.json deleted file mode 100644 index 9e5d8b7224eff16a790758ae86dd97c89afeab74..0000000000000000000000000000000000000000 --- a/slots/19/checkpoint-10023/config.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "architectures": [ - "TwinyForCausalLM" - ], - "attention_dropout": 0.0, - "dtype": "float32", - "hidden_dropout": 0.0, - "hidden_size": 768, - "initializer_range": 0.02, - "intermediate_size": 3072, - "max_position_embeddings": 128, - "model_type": "twiny", - "neftune_alpha": 0.0, - "num_attention_heads": 12, - "num_hidden_layers": 3, - "num_key_value_heads": 3, - "qk_norm": true, - "rezero_init": 1.0, - "rms_norm_eps": 1e-06, - "rope_theta": 10000.0, - "transformers_version": "5.0.0", - "use_cache": false, - "vocab_size": 32000 -} diff --git a/slots/19/checkpoint-10023/model.safetensors b/slots/19/checkpoint-10023/model.safetensors deleted file mode 100644 index 177afae32f1dda5169c28bee338135e701ec36e4..0000000000000000000000000000000000000000 --- a/slots/19/checkpoint-10023/model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:819aea5f86a06a93c16af51942db633b244dd089a97562358af76aa64a22047f -size 306388092 diff --git a/slots/19/checkpoint-10023/optimizer.pt b/slots/19/checkpoint-10023/optimizer.pt deleted file mode 100644 index 516278aed304ce5f4783d50cf9e46e98bf0c2903..0000000000000000000000000000000000000000 --- a/slots/19/checkpoint-10023/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:99cf036a910feeb7978abafa1d3f67329deb2e33088213ef5c6ec4008413c0e5 -size 302484555 diff --git a/slots/19/checkpoint-10023/rng_state.pth b/slots/19/checkpoint-10023/rng_state.pth deleted file mode 100644 index 1feba1a6538e93b94696d3773853dbc8947b0cad..0000000000000000000000000000000000000000 --- a/slots/19/checkpoint-10023/rng_state.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 -size 14645 diff --git a/slots/19/checkpoint-10023/scaler.pt b/slots/19/checkpoint-10023/scaler.pt deleted file mode 100644 index e17916cb53a6cfb1c6b85f0febf75f0e95c842ca..0000000000000000000000000000000000000000 --- a/slots/19/checkpoint-10023/scaler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6422bc21ecb2b7504951d040f397a9c52919ccb608b4183918ca8112df61d802 -size 1383 diff --git a/slots/19/checkpoint-10023/scheduler.pt b/slots/19/checkpoint-10023/scheduler.pt deleted file mode 100644 index e81776a20746eb3f07191a3c408ac3389b2c5596..0000000000000000000000000000000000000000 --- a/slots/19/checkpoint-10023/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:85e89ecb4d905ed3a8dd10fec2190ee523da26980c6b2a97f0ec2d74d35647f2 -size 1465 diff --git a/slots/19/checkpoint-10023/trainer_state.json b/slots/19/checkpoint-10023/trainer_state.json deleted file mode 100644 index 03dfa7c1cf468e332c4c98123c12e9c0098d345c..0000000000000000000000000000000000000000 --- a/slots/19/checkpoint-10023/trainer_state.json +++ /dev/null @@ -1,3548 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 2.3446277426697635, - "eval_steps": 500, - "global_step": 10023, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0001336931908386741, - "grad_norm": Infinity, - "learning_rate": 5e-05, - "loss": 129.80032348632812, - "step": 1 - }, - { - "epoch": 0.002673863816773482, - "grad_norm": 63.64365768432617, - "learning_rate": 4.999995392022967e-05, - "loss": 63.88374408922697, - "step": 20 - }, - { - "epoch": 0.005347727633546964, - "grad_norm": 24.627853393554688, - "learning_rate": 4.999976672145381e-05, - "loss": 12.65963363647461, - "step": 40 - }, - { - "epoch": 0.008021591450320446, - "grad_norm": 14.29983901977539, - "learning_rate": 4.999943552476422e-05, - "loss": 5.90204963684082, - "step": 60 - }, - { - "epoch": 0.010695455267093928, - "grad_norm": 15.690323829650879, - "learning_rate": 4.999896033206858e-05, - "loss": 3.9918922424316405, - "step": 80 - }, - { - "epoch": 0.01336931908386741, - "grad_norm": 31.583160400390625, - "learning_rate": 4.999834114610398e-05, - "loss": 2.9675426483154297, - "step": 100 - }, - { - "epoch": 0.01604318290064089, - "grad_norm": 13.034649848937988, - "learning_rate": 4.999757797043691e-05, - "loss": 2.725296401977539, - "step": 120 - }, - { - "epoch": 0.018717046717414372, - "grad_norm": 8.362203598022461, - "learning_rate": 4.999667080946324e-05, - "loss": 2.2478992462158205, - "step": 140 - }, - { - "epoch": 0.021390910534187856, - "grad_norm": 8.726786613464355, - "learning_rate": 4.999561966840821e-05, - "loss": 1.8447845458984375, - "step": 160 - }, - { - "epoch": 0.024064774350961337, - "grad_norm": 10.092752456665039, - "learning_rate": 4.9994424553326335e-05, - "loss": 1.5611843109130858, - "step": 180 - }, - { - "epoch": 0.02673863816773482, - "grad_norm": 9.090085983276367, - "learning_rate": 4.999308547110146e-05, - "loss": 1.520334815979004, - "step": 200 - }, - { - "epoch": 0.029412501984508302, - "grad_norm": 9.668124198913574, - "learning_rate": 4.999160242944665e-05, - "loss": 1.2818055152893066, - "step": 220 - }, - { - "epoch": 0.03208636580128178, - "grad_norm": 9.182533264160156, - "learning_rate": 4.998997543690418e-05, - "loss": 1.0428407669067383, - "step": 240 - }, - { - "epoch": 0.03476022961805526, - "grad_norm": 5.745838165283203, - "learning_rate": 4.998820450284549e-05, - "loss": 1.2343652725219727, - "step": 260 - }, - { - "epoch": 0.037434093434828744, - "grad_norm": 8.651643753051758, - "learning_rate": 4.99862896374711e-05, - "loss": 0.8859601020812988, - "step": 280 - }, - { - "epoch": 0.04010795725160223, - "grad_norm": 10.765266418457031, - "learning_rate": 4.998423085181056e-05, - "loss": 0.989600658416748, - "step": 300 - }, - { - "epoch": 0.04278182106837571, - "grad_norm": 6.092499256134033, - "learning_rate": 4.998202815772245e-05, - "loss": 0.7189463615417481, - "step": 320 - }, - { - "epoch": 0.04545568488514919, - "grad_norm": 6.352876663208008, - "learning_rate": 4.9979681567894195e-05, - "loss": 0.7489545345306396, - "step": 340 - }, - { - "epoch": 0.048129548701922674, - "grad_norm": 4.620656490325928, - "learning_rate": 4.997719109584209e-05, - "loss": 0.7381401538848877, - "step": 360 - }, - { - "epoch": 0.050803412518696155, - "grad_norm": 7.796917915344238, - "learning_rate": 4.997455675591119e-05, - "loss": 0.5687405109405518, - "step": 380 - }, - { - "epoch": 0.05347727633546964, - "grad_norm": 2.837172508239746, - "learning_rate": 4.9971778563275204e-05, - "loss": 0.5686865329742432, - "step": 400 - }, - { - "epoch": 0.05615114015224312, - "grad_norm": 3.3103690147399902, - "learning_rate": 4.9968856533936436e-05, - "loss": 0.625730562210083, - "step": 420 - }, - { - "epoch": 0.058825003969016604, - "grad_norm": 3.5682132244110107, - "learning_rate": 4.99657906847257e-05, - "loss": 0.6125466346740722, - "step": 440 - }, - { - "epoch": 0.061498867785790085, - "grad_norm": 5.63640832901001, - "learning_rate": 4.996258103330218e-05, - "loss": 0.6182214260101319, - "step": 460 - }, - { - "epoch": 0.06417273160256357, - "grad_norm": 4.698945999145508, - "learning_rate": 4.995922759815339e-05, - "loss": 0.43828091621398924, - "step": 480 - }, - { - "epoch": 0.06684659541933705, - "grad_norm": 2.1976189613342285, - "learning_rate": 4.995573039859501e-05, - "loss": 0.4459230899810791, - "step": 500 - }, - { - "epoch": 0.06952045923611053, - "grad_norm": 3.8809523582458496, - "learning_rate": 4.995208945477081e-05, - "loss": 0.3821882963180542, - "step": 520 - }, - { - "epoch": 0.07219432305288401, - "grad_norm": 3.75144100189209, - "learning_rate": 4.994830478765251e-05, - "loss": 0.5800807476043701, - "step": 540 - }, - { - "epoch": 0.07486818686965749, - "grad_norm": 3.0038585662841797, - "learning_rate": 4.9944376419039684e-05, - "loss": 0.3928264617919922, - "step": 560 - }, - { - "epoch": 0.07754205068643098, - "grad_norm": 3.614591598510742, - "learning_rate": 4.994030437155961e-05, - "loss": 0.48637890815734863, - "step": 580 - }, - { - "epoch": 0.08021591450320446, - "grad_norm": 4.143443584442139, - "learning_rate": 4.993608866866718e-05, - "loss": 0.3650153160095215, - "step": 600 - }, - { - "epoch": 0.08288977831997794, - "grad_norm": 6.692712783813477, - "learning_rate": 4.993172933464471e-05, - "loss": 0.3677916288375854, - "step": 620 - }, - { - "epoch": 0.08556364213675142, - "grad_norm": 8.383441925048828, - "learning_rate": 4.9927226394601815e-05, - "loss": 0.3399480104446411, - "step": 640 - }, - { - "epoch": 0.0882375059535249, - "grad_norm": 5.566338062286377, - "learning_rate": 4.992257987447532e-05, - "loss": 0.28104052543640134, - "step": 660 - }, - { - "epoch": 0.09091136977029839, - "grad_norm": 3.1196420192718506, - "learning_rate": 4.991778980102904e-05, - "loss": 0.351950478553772, - "step": 680 - }, - { - "epoch": 0.09358523358707187, - "grad_norm": 3.47979736328125, - "learning_rate": 4.9912856201853644e-05, - "loss": 0.27501535415649414, - "step": 700 - }, - { - "epoch": 0.09625909740384535, - "grad_norm": 5.446717262268066, - "learning_rate": 4.990777910536653e-05, - "loss": 0.2651593923568726, - "step": 720 - }, - { - "epoch": 0.09893296122061883, - "grad_norm": 7.6145339012146, - "learning_rate": 4.990255854081161e-05, - "loss": 0.35140380859375, - "step": 740 - }, - { - "epoch": 0.10160682503739231, - "grad_norm": 8.445616722106934, - "learning_rate": 4.989719453825918e-05, - "loss": 0.2961219072341919, - "step": 760 - }, - { - "epoch": 0.10428068885416579, - "grad_norm": 6.339537620544434, - "learning_rate": 4.9891687128605744e-05, - "loss": 0.24962289333343507, - "step": 780 - }, - { - "epoch": 0.10695455267093928, - "grad_norm": 3.3369436264038086, - "learning_rate": 4.988603634357383e-05, - "loss": 0.2124847412109375, - "step": 800 - }, - { - "epoch": 0.10962841648771277, - "grad_norm": 2.2909045219421387, - "learning_rate": 4.988024221571177e-05, - "loss": 0.24679112434387207, - "step": 820 - }, - { - "epoch": 0.11230228030448625, - "grad_norm": 3.1149911880493164, - "learning_rate": 4.9874304778393574e-05, - "loss": 0.22161397933959961, - "step": 840 - }, - { - "epoch": 0.11497614412125973, - "grad_norm": 14.802160263061523, - "learning_rate": 4.9868224065818706e-05, - "loss": 0.2623537302017212, - "step": 860 - }, - { - "epoch": 0.11765000793803321, - "grad_norm": 5.586325168609619, - "learning_rate": 4.98620001130119e-05, - "loss": 0.3560942649841309, - "step": 880 - }, - { - "epoch": 0.12032387175480669, - "grad_norm": 3.390017032623291, - "learning_rate": 4.9855632955822916e-05, - "loss": 0.16934787034988402, - "step": 900 - }, - { - "epoch": 0.12299773557158017, - "grad_norm": 6.070940971374512, - "learning_rate": 4.984912263092641e-05, - "loss": 0.2131197214126587, - "step": 920 - }, - { - "epoch": 0.12567159938835365, - "grad_norm": 1.4912281036376953, - "learning_rate": 4.984246917582166e-05, - "loss": 0.25128653049468996, - "step": 940 - }, - { - "epoch": 0.12834546320512713, - "grad_norm": 7.000472545623779, - "learning_rate": 4.9835672628832366e-05, - "loss": 0.2653592586517334, - "step": 960 - }, - { - "epoch": 0.1310193270219006, - "grad_norm": 5.427223205566406, - "learning_rate": 4.9828733029106434e-05, - "loss": 0.1653295636177063, - "step": 980 - }, - { - "epoch": 0.1336931908386741, - "grad_norm": 1.9502102136611938, - "learning_rate": 4.982165041661575e-05, - "loss": 0.2250870943069458, - "step": 1000 - }, - { - "epoch": 0.13636705465544757, - "grad_norm": 0.6216259598731995, - "learning_rate": 4.981442483215595e-05, - "loss": 0.18943849802017212, - "step": 1020 - }, - { - "epoch": 0.13904091847222105, - "grad_norm": 2.3363687992095947, - "learning_rate": 4.98070563173462e-05, - "loss": 0.1673592209815979, - "step": 1040 - }, - { - "epoch": 0.14171478228899453, - "grad_norm": 1.040717601776123, - "learning_rate": 4.979954491462892e-05, - "loss": 0.2113173007965088, - "step": 1060 - }, - { - "epoch": 0.14438864610576801, - "grad_norm": 2.735522747039795, - "learning_rate": 4.979189066726955e-05, - "loss": 0.17504971027374266, - "step": 1080 - }, - { - "epoch": 0.1470625099225415, - "grad_norm": 4.701151371002197, - "learning_rate": 4.978409361935636e-05, - "loss": 0.15881222486495972, - "step": 1100 - }, - { - "epoch": 0.14973637373931498, - "grad_norm": 2.735919237136841, - "learning_rate": 4.9776153815800075e-05, - "loss": 0.14044179916381835, - "step": 1120 - }, - { - "epoch": 0.15241023755608848, - "grad_norm": 3.5479538440704346, - "learning_rate": 4.976807130233375e-05, - "loss": 0.18565714359283447, - "step": 1140 - }, - { - "epoch": 0.15508410137286197, - "grad_norm": 3.2167458534240723, - "learning_rate": 4.975984612551243e-05, - "loss": 0.13236271142959594, - "step": 1160 - }, - { - "epoch": 0.15775796518963545, - "grad_norm": 1.0206760168075562, - "learning_rate": 4.975147833271288e-05, - "loss": 0.19124728441238403, - "step": 1180 - }, - { - "epoch": 0.16043182900640893, - "grad_norm": 4.194457530975342, - "learning_rate": 4.9742967972133335e-05, - "loss": 0.144741427898407, - "step": 1200 - }, - { - "epoch": 0.1631056928231824, - "grad_norm": 3.0225746631622314, - "learning_rate": 4.973431509279323e-05, - "loss": 0.1374324679374695, - "step": 1220 - }, - { - "epoch": 0.1657795566399559, - "grad_norm": 4.243523120880127, - "learning_rate": 4.972551974453287e-05, - "loss": 0.13663809299468993, - "step": 1240 - }, - { - "epoch": 0.16845342045672937, - "grad_norm": 2.4990086555480957, - "learning_rate": 4.971658197801322e-05, - "loss": 0.16817957162857056, - "step": 1260 - }, - { - "epoch": 0.17112728427350285, - "grad_norm": 4.983982563018799, - "learning_rate": 4.9707501844715554e-05, - "loss": 0.13795313835144044, - "step": 1280 - }, - { - "epoch": 0.17380114809027633, - "grad_norm": 3.6780316829681396, - "learning_rate": 4.969827939694115e-05, - "loss": 0.1637880802154541, - "step": 1300 - }, - { - "epoch": 0.1764750119070498, - "grad_norm": 0.7950732707977295, - "learning_rate": 4.968891468781105e-05, - "loss": 0.10979138612747193, - "step": 1320 - }, - { - "epoch": 0.1791488757238233, - "grad_norm": 1.2414121627807617, - "learning_rate": 4.967940777126569e-05, - "loss": 0.13692171573638917, - "step": 1340 - }, - { - "epoch": 0.18182273954059677, - "grad_norm": 2.1383633613586426, - "learning_rate": 4.9669758702064636e-05, - "loss": 0.07821698188781738, - "step": 1360 - }, - { - "epoch": 0.18449660335737025, - "grad_norm": 5.061275959014893, - "learning_rate": 4.965996753578623e-05, - "loss": 0.19053516387939454, - "step": 1380 - }, - { - "epoch": 0.18717046717414373, - "grad_norm": 6.151792049407959, - "learning_rate": 4.9650034328827305e-05, - "loss": 0.11360721588134766, - "step": 1400 - }, - { - "epoch": 0.18984433099091721, - "grad_norm": 1.0604305267333984, - "learning_rate": 4.963995913840284e-05, - "loss": 0.13138024806976317, - "step": 1420 - }, - { - "epoch": 0.1925181948076907, - "grad_norm": 1.7159489393234253, - "learning_rate": 4.9629742022545623e-05, - "loss": 0.08657677173614502, - "step": 1440 - }, - { - "epoch": 0.19519205862446418, - "grad_norm": 2.4207754135131836, - "learning_rate": 4.961938304010595e-05, - "loss": 0.10309149026870727, - "step": 1460 - }, - { - "epoch": 0.19786592244123766, - "grad_norm": 1.532060146331787, - "learning_rate": 4.9608882250751245e-05, - "loss": 0.13628544807434081, - "step": 1480 - }, - { - "epoch": 0.20053978625801114, - "grad_norm": 6.409943580627441, - "learning_rate": 4.959823971496574e-05, - "loss": 0.10584845542907714, - "step": 1500 - }, - { - "epoch": 0.20321365007478462, - "grad_norm": 2.452012538909912, - "learning_rate": 4.9587455494050136e-05, - "loss": 0.06506187915802002, - "step": 1520 - }, - { - "epoch": 0.2058875138915581, - "grad_norm": 5.3016533851623535, - "learning_rate": 4.9576529650121214e-05, - "loss": 0.11848526000976563, - "step": 1540 - }, - { - "epoch": 0.20856137770833158, - "grad_norm": 4.341775894165039, - "learning_rate": 4.956546224611152e-05, - "loss": 0.11318533420562744, - "step": 1560 - }, - { - "epoch": 0.21123524152510506, - "grad_norm": 1.9056169986724854, - "learning_rate": 4.9554253345768965e-05, - "loss": 0.12768398523330687, - "step": 1580 - }, - { - "epoch": 0.21390910534187857, - "grad_norm": 1.8939746618270874, - "learning_rate": 4.9542903013656486e-05, - "loss": 0.10782338380813598, - "step": 1600 - }, - { - "epoch": 0.21658296915865205, - "grad_norm": 8.53671932220459, - "learning_rate": 4.9531411315151654e-05, - "loss": 0.1733921766281128, - "step": 1620 - }, - { - "epoch": 0.21925683297542553, - "grad_norm": 2.0152978897094727, - "learning_rate": 4.951977831644632e-05, - "loss": 0.11197054386138916, - "step": 1640 - }, - { - "epoch": 0.221930696792199, - "grad_norm": 3.8422367572784424, - "learning_rate": 4.95080040845462e-05, - "loss": 0.11441781520843505, - "step": 1660 - }, - { - "epoch": 0.2246045606089725, - "grad_norm": 1.819858193397522, - "learning_rate": 4.949608868727053e-05, - "loss": 0.11403474807739258, - "step": 1680 - }, - { - "epoch": 0.22727842442574597, - "grad_norm": 7.45100212097168, - "learning_rate": 4.948403219325163e-05, - "loss": 0.13117753267288207, - "step": 1700 - }, - { - "epoch": 0.22995228824251945, - "grad_norm": 0.6526040434837341, - "learning_rate": 4.947183467193456e-05, - "loss": 0.07524924874305725, - "step": 1720 - }, - { - "epoch": 0.23262615205929293, - "grad_norm": 3.814746856689453, - "learning_rate": 4.945949619357668e-05, - "loss": 0.07659345269203185, - "step": 1740 - }, - { - "epoch": 0.23530001587606642, - "grad_norm": 2.373124122619629, - "learning_rate": 4.944701682924726e-05, - "loss": 0.1147496223449707, - "step": 1760 - }, - { - "epoch": 0.2379738796928399, - "grad_norm": 0.11161285638809204, - "learning_rate": 4.943439665082707e-05, - "loss": 0.07256829738616943, - "step": 1780 - }, - { - "epoch": 0.24064774350961338, - "grad_norm": 0.45990192890167236, - "learning_rate": 4.942163573100794e-05, - "loss": 0.07726740837097168, - "step": 1800 - }, - { - "epoch": 0.24332160732638686, - "grad_norm": 4.2301926612854, - "learning_rate": 4.940873414329242e-05, - "loss": 0.09349535703659058, - "step": 1820 - }, - { - "epoch": 0.24599547114316034, - "grad_norm": 2.442178726196289, - "learning_rate": 4.939569196199325e-05, - "loss": 0.12413722276687622, - "step": 1840 - }, - { - "epoch": 0.24866933495993382, - "grad_norm": 2.523683786392212, - "learning_rate": 4.938250926223302e-05, - "loss": 0.08566288352012634, - "step": 1860 - }, - { - "epoch": 0.2513431987767073, - "grad_norm": 3.511075258255005, - "learning_rate": 4.936918611994368e-05, - "loss": 0.08007702231407166, - "step": 1880 - }, - { - "epoch": 0.2540170625934808, - "grad_norm": 6.254627704620361, - "learning_rate": 4.935572261186614e-05, - "loss": 0.10983954668045044, - "step": 1900 - }, - { - "epoch": 0.25669092641025426, - "grad_norm": 1.5211899280548096, - "learning_rate": 4.934211881554981e-05, - "loss": 0.09120344519615173, - "step": 1920 - }, - { - "epoch": 0.25936479022702774, - "grad_norm": 2.5893588066101074, - "learning_rate": 4.932837480935214e-05, - "loss": 0.08754412531852722, - "step": 1940 - }, - { - "epoch": 0.2620386540438012, - "grad_norm": 6.878556251525879, - "learning_rate": 4.931449067243821e-05, - "loss": 0.08636274933815002, - "step": 1960 - }, - { - "epoch": 0.2647125178605747, - "grad_norm": 2.9078798294067383, - "learning_rate": 4.9300466484780226e-05, - "loss": 0.09582929015159607, - "step": 1980 - }, - { - "epoch": 0.2673863816773482, - "grad_norm": 3.391852855682373, - "learning_rate": 4.92863023271571e-05, - "loss": 0.0850919783115387, - "step": 2000 - }, - { - "epoch": 0.27006024549412166, - "grad_norm": 5.522103309631348, - "learning_rate": 4.927199828115395e-05, - "loss": 0.050999772548675534, - "step": 2020 - }, - { - "epoch": 0.27273410931089515, - "grad_norm": 0.90350741147995, - "learning_rate": 4.925755442916167e-05, - "loss": 0.10100446939468384, - "step": 2040 - }, - { - "epoch": 0.2754079731276686, - "grad_norm": 1.602030634880066, - "learning_rate": 4.924297085437641e-05, - "loss": 0.0468633770942688, - "step": 2060 - }, - { - "epoch": 0.2780818369444421, - "grad_norm": 1.5823460817337036, - "learning_rate": 4.922824764079913e-05, - "loss": 0.06786358952522278, - "step": 2080 - }, - { - "epoch": 0.2807557007612156, - "grad_norm": 1.6624343395233154, - "learning_rate": 4.92133848732351e-05, - "loss": 0.05772828459739685, - "step": 2100 - }, - { - "epoch": 0.28342956457798907, - "grad_norm": 0.947078287601471, - "learning_rate": 4.9198382637293424e-05, - "loss": 0.08012173175811768, - "step": 2120 - }, - { - "epoch": 0.28610342839476255, - "grad_norm": 0.2919924259185791, - "learning_rate": 4.918324101938653e-05, - "loss": 0.1208539366722107, - "step": 2140 - }, - { - "epoch": 0.28877729221153603, - "grad_norm": 9.258247375488281, - "learning_rate": 4.916796010672969e-05, - "loss": 0.10037034749984741, - "step": 2160 - }, - { - "epoch": 0.2914511560283095, - "grad_norm": 4.0920491218566895, - "learning_rate": 4.915253998734051e-05, - "loss": 0.061488878726959226, - "step": 2180 - }, - { - "epoch": 0.294125019845083, - "grad_norm": 6.1126627922058105, - "learning_rate": 4.913698075003841e-05, - "loss": 0.0862967312335968, - "step": 2200 - }, - { - "epoch": 0.29679888366185647, - "grad_norm": 2.585484743118286, - "learning_rate": 4.912128248444414e-05, - "loss": 0.05393874645233154, - "step": 2220 - }, - { - "epoch": 0.29947274747862995, - "grad_norm": 6.944481372833252, - "learning_rate": 4.9105445280979256e-05, - "loss": 0.08570566773414612, - "step": 2240 - }, - { - "epoch": 0.30214661129540343, - "grad_norm": 1.3824089765548706, - "learning_rate": 4.908946923086556e-05, - "loss": 0.09689127206802368, - "step": 2260 - }, - { - "epoch": 0.30482047511217697, - "grad_norm": 3.4861342906951904, - "learning_rate": 4.907335442612464e-05, - "loss": 0.12550976276397705, - "step": 2280 - }, - { - "epoch": 0.30749433892895045, - "grad_norm": 3.668980121612549, - "learning_rate": 4.905710095957728e-05, - "loss": 0.09089353680610657, - "step": 2300 - }, - { - "epoch": 0.31016820274572393, - "grad_norm": 1.093095064163208, - "learning_rate": 4.904070892484298e-05, - "loss": 0.03925192356109619, - "step": 2320 - }, - { - "epoch": 0.3128420665624974, - "grad_norm": 0.8169485926628113, - "learning_rate": 4.9024178416339364e-05, - "loss": 0.0979581356048584, - "step": 2340 - }, - { - "epoch": 0.3155159303792709, - "grad_norm": 1.892451286315918, - "learning_rate": 4.900750952928166e-05, - "loss": 0.05913209915161133, - "step": 2360 - }, - { - "epoch": 0.3181897941960444, - "grad_norm": 0.24644255638122559, - "learning_rate": 4.8990702359682184e-05, - "loss": 0.06815173625946044, - "step": 2380 - }, - { - "epoch": 0.32086365801281785, - "grad_norm": 2.1861305236816406, - "learning_rate": 4.897375700434972e-05, - "loss": 0.04142785966396332, - "step": 2400 - }, - { - "epoch": 0.32353752182959133, - "grad_norm": 2.6643004417419434, - "learning_rate": 4.8956673560889013e-05, - "loss": 0.05177200436592102, - "step": 2420 - }, - { - "epoch": 0.3262113856463648, - "grad_norm": 2.588113784790039, - "learning_rate": 4.8939452127700195e-05, - "loss": 0.05783546566963196, - "step": 2440 - }, - { - "epoch": 0.3288852494631383, - "grad_norm": 2.419644594192505, - "learning_rate": 4.8922092803978203e-05, - "loss": 0.08906854391098022, - "step": 2460 - }, - { - "epoch": 0.3315591132799118, - "grad_norm": 0.16949939727783203, - "learning_rate": 4.890459568971223e-05, - "loss": 0.10305211544036866, - "step": 2480 - }, - { - "epoch": 0.33423297709668526, - "grad_norm": 0.10032984614372253, - "learning_rate": 4.8886960885685126e-05, - "loss": 0.06348527669906616, - "step": 2500 - }, - { - "epoch": 0.33690684091345874, - "grad_norm": 3.3658738136291504, - "learning_rate": 4.8869188493472854e-05, - "loss": 0.06826075911521912, - "step": 2520 - }, - { - "epoch": 0.3395807047302322, - "grad_norm": 0.8656186461448669, - "learning_rate": 4.885127861544386e-05, - "loss": 0.05929765701293945, - "step": 2540 - }, - { - "epoch": 0.3422545685470057, - "grad_norm": 0.1492065042257309, - "learning_rate": 4.8833231354758496e-05, - "loss": 0.09429731965065002, - "step": 2560 - }, - { - "epoch": 0.3449284323637792, - "grad_norm": 0.6010928153991699, - "learning_rate": 4.881504681536846e-05, - "loss": 0.06262240409851075, - "step": 2580 - }, - { - "epoch": 0.34760229618055266, - "grad_norm": 1.6506450176239014, - "learning_rate": 4.879672510201616e-05, - "loss": 0.061688083410263064, - "step": 2600 - }, - { - "epoch": 0.35027615999732614, - "grad_norm": 0.2703142464160919, - "learning_rate": 4.877826632023412e-05, - "loss": 0.06175137162208557, - "step": 2620 - }, - { - "epoch": 0.3529500238140996, - "grad_norm": 3.1056365966796875, - "learning_rate": 4.875967057634437e-05, - "loss": 0.07828506827354431, - "step": 2640 - }, - { - "epoch": 0.3556238876308731, - "grad_norm": 0.28790283203125, - "learning_rate": 4.874093797745784e-05, - "loss": 0.11355981826782227, - "step": 2660 - }, - { - "epoch": 0.3582977514476466, - "grad_norm": 2.3372068405151367, - "learning_rate": 4.8722068631473746e-05, - "loss": 0.048267871141433716, - "step": 2680 - }, - { - "epoch": 0.36097161526442006, - "grad_norm": 0.12767371535301208, - "learning_rate": 4.8703062647078976e-05, - "loss": 0.04319801032543182, - "step": 2700 - }, - { - "epoch": 0.36364547908119355, - "grad_norm": 0.5145738124847412, - "learning_rate": 4.868392013374741e-05, - "loss": 0.0773090660572052, - "step": 2720 - }, - { - "epoch": 0.366319342897967, - "grad_norm": 0.8518500328063965, - "learning_rate": 4.866464120173937e-05, - "loss": 0.05149460434913635, - "step": 2740 - }, - { - "epoch": 0.3689932067147405, - "grad_norm": 3.6726584434509277, - "learning_rate": 4.8645225962100924e-05, - "loss": 0.06896821856498718, - "step": 2760 - }, - { - "epoch": 0.371667070531514, - "grad_norm": 1.5626497268676758, - "learning_rate": 4.862567452666329e-05, - "loss": 0.047730174660682675, - "step": 2780 - }, - { - "epoch": 0.37434093434828747, - "grad_norm": 6.562028884887695, - "learning_rate": 4.8605987008042144e-05, - "loss": 0.07060698866844177, - "step": 2800 - }, - { - "epoch": 0.37701479816506095, - "grad_norm": 0.7631726861000061, - "learning_rate": 4.8586163519637005e-05, - "loss": 0.04944324493408203, - "step": 2820 - }, - { - "epoch": 0.37968866198183443, - "grad_norm": 1.6982293128967285, - "learning_rate": 4.8566204175630595e-05, - "loss": 0.03000348210334778, - "step": 2840 - }, - { - "epoch": 0.3823625257986079, - "grad_norm": 0.6487429141998291, - "learning_rate": 4.854610909098812e-05, - "loss": 0.06691416501998901, - "step": 2860 - }, - { - "epoch": 0.3850363896153814, - "grad_norm": 0.7648892402648926, - "learning_rate": 4.852587838145668e-05, - "loss": 0.05529783964157105, - "step": 2880 - }, - { - "epoch": 0.38771025343215487, - "grad_norm": 0.11601298302412033, - "learning_rate": 4.850551216356457e-05, - "loss": 0.07780832052230835, - "step": 2900 - }, - { - "epoch": 0.39038411724892835, - "grad_norm": 0.9443137645721436, - "learning_rate": 4.8485010554620594e-05, - "loss": 0.08007023930549621, - "step": 2920 - }, - { - "epoch": 0.39305798106570183, - "grad_norm": 0.8828252553939819, - "learning_rate": 4.846437367271341e-05, - "loss": 0.03541453182697296, - "step": 2940 - }, - { - "epoch": 0.3957318448824753, - "grad_norm": 0.21668888628482819, - "learning_rate": 4.844360163671083e-05, - "loss": 0.08354364633560181, - "step": 2960 - }, - { - "epoch": 0.3984057086992488, - "grad_norm": 0.6840483546257019, - "learning_rate": 4.8422694566259194e-05, - "loss": 0.045807772874832155, - "step": 2980 - }, - { - "epoch": 0.4010795725160223, - "grad_norm": 1.2754698991775513, - "learning_rate": 4.8401652581782584e-05, - "loss": 0.053487342596054074, - "step": 3000 - }, - { - "epoch": 0.40375343633279576, - "grad_norm": 0.19012756645679474, - "learning_rate": 4.838047580448222e-05, - "loss": 0.05881953239440918, - "step": 3020 - }, - { - "epoch": 0.40642730014956924, - "grad_norm": 2.1057698726654053, - "learning_rate": 4.835916435633569e-05, - "loss": 0.031065690517425536, - "step": 3040 - }, - { - "epoch": 0.4091011639663427, - "grad_norm": 4.188559055328369, - "learning_rate": 4.833771836009633e-05, - "loss": 0.07205432653427124, - "step": 3060 - }, - { - "epoch": 0.4117750277831162, - "grad_norm": 6.975829124450684, - "learning_rate": 4.831613793929242e-05, - "loss": 0.04953635036945343, - "step": 3080 - }, - { - "epoch": 0.4144488915998897, - "grad_norm": 4.725269317626953, - "learning_rate": 4.8294423218226546e-05, - "loss": 0.05965519547462463, - "step": 3100 - }, - { - "epoch": 0.41712275541666316, - "grad_norm": 1.7124755382537842, - "learning_rate": 4.827257432197486e-05, - "loss": 0.039625433087348935, - "step": 3120 - }, - { - "epoch": 0.41979661923343664, - "grad_norm": 2.6687324047088623, - "learning_rate": 4.825059137638636e-05, - "loss": 0.05020809769630432, - "step": 3140 - }, - { - "epoch": 0.4224704830502101, - "grad_norm": 1.111640214920044, - "learning_rate": 4.822847450808215e-05, - "loss": 0.04404452443122864, - "step": 3160 - }, - { - "epoch": 0.42514434686698366, - "grad_norm": 0.2128070890903473, - "learning_rate": 4.8206223844454744e-05, - "loss": 0.08283355236053466, - "step": 3180 - }, - { - "epoch": 0.42781821068375714, - "grad_norm": 0.10757248103618622, - "learning_rate": 4.818383951366729e-05, - "loss": 0.08568671345710754, - "step": 3200 - }, - { - "epoch": 0.4304920745005306, - "grad_norm": 0.08344592899084091, - "learning_rate": 4.816132164465289e-05, - "loss": 0.0426956832408905, - "step": 3220 - }, - { - "epoch": 0.4331659383173041, - "grad_norm": 0.5657751560211182, - "learning_rate": 4.813867036711378e-05, - "loss": 0.04971776902675629, - "step": 3240 - }, - { - "epoch": 0.4358398021340776, - "grad_norm": 2.1529288291931152, - "learning_rate": 4.8115885811520654e-05, - "loss": 0.025386181473731995, - "step": 3260 - }, - { - "epoch": 0.43851366595085106, - "grad_norm": 4.228519916534424, - "learning_rate": 4.809296810911188e-05, - "loss": 0.06401395201683044, - "step": 3280 - }, - { - "epoch": 0.44118752976762454, - "grad_norm": 6.770420551300049, - "learning_rate": 4.806991739189274e-05, - "loss": 0.16425553560256959, - "step": 3300 - }, - { - "epoch": 0.443861393584398, - "grad_norm": 0.5303187370300293, - "learning_rate": 4.804673379263467e-05, - "loss": 0.045900467038154605, - "step": 3320 - }, - { - "epoch": 0.4465352574011715, - "grad_norm": 0.221473827958107, - "learning_rate": 4.802341744487453e-05, - "loss": 0.07529735565185547, - "step": 3340 - }, - { - "epoch": 0.449209121217945, - "grad_norm": 3.48736834526062, - "learning_rate": 4.799996848291378e-05, - "loss": 0.062433135509490964, - "step": 3360 - }, - { - "epoch": 0.45188298503471847, - "grad_norm": 2.650038242340088, - "learning_rate": 4.797638704181774e-05, - "loss": 0.03762982189655304, - "step": 3380 - }, - { - "epoch": 0.45455684885149195, - "grad_norm": 3.159665584564209, - "learning_rate": 4.795267325741483e-05, - "loss": 0.04745924174785614, - "step": 3400 - }, - { - "epoch": 0.4572307126682654, - "grad_norm": 0.8763885498046875, - "learning_rate": 4.7928827266295715e-05, - "loss": 0.07380253076553345, - "step": 3420 - }, - { - "epoch": 0.4599045764850389, - "grad_norm": 0.1779366433620453, - "learning_rate": 4.790484920581262e-05, - "loss": 0.045916372537612916, - "step": 3440 - }, - { - "epoch": 0.4625784403018124, - "grad_norm": 1.1228729486465454, - "learning_rate": 4.7880739214078454e-05, - "loss": 0.04461723566055298, - "step": 3460 - }, - { - "epoch": 0.46525230411858587, - "grad_norm": 0.1629919707775116, - "learning_rate": 4.785649742996605e-05, - "loss": 0.017159442603588104, - "step": 3480 - }, - { - "epoch": 0.46792616793535935, - "grad_norm": 3.583951473236084, - "learning_rate": 4.783212399310737e-05, - "loss": 0.047145146131515506, - "step": 3500 - }, - { - "epoch": 0.47060003175213283, - "grad_norm": 0.9766237139701843, - "learning_rate": 4.780761904389267e-05, - "loss": 0.050229442119598386, - "step": 3520 - }, - { - "epoch": 0.4732738955689063, - "grad_norm": 0.05617872253060341, - "learning_rate": 4.778298272346976e-05, - "loss": 0.024862812459468843, - "step": 3540 - }, - { - "epoch": 0.4759477593856798, - "grad_norm": 1.3586453199386597, - "learning_rate": 4.775821517374308e-05, - "loss": 0.02117772251367569, - "step": 3560 - }, - { - "epoch": 0.4786216232024533, - "grad_norm": 1.2116742134094238, - "learning_rate": 4.7733316537373006e-05, - "loss": 0.03060794174671173, - "step": 3580 - }, - { - "epoch": 0.48129548701922675, - "grad_norm": 0.39403238892555237, - "learning_rate": 4.770828695777493e-05, - "loss": 0.05482668280601501, - "step": 3600 - }, - { - "epoch": 0.48396935083600023, - "grad_norm": 0.9248486161231995, - "learning_rate": 4.7683126579118495e-05, - "loss": 0.03612814247608185, - "step": 3620 - }, - { - "epoch": 0.4866432146527737, - "grad_norm": 0.1624649167060852, - "learning_rate": 4.7657835546326736e-05, - "loss": 0.04334873259067536, - "step": 3640 - }, - { - "epoch": 0.4893170784695472, - "grad_norm": 0.5321119427680969, - "learning_rate": 4.763241400507524e-05, - "loss": 0.0461233913898468, - "step": 3660 - }, - { - "epoch": 0.4919909422863207, - "grad_norm": 0.34861093759536743, - "learning_rate": 4.760686210179133e-05, - "loss": 0.024829554557800292, - "step": 3680 - }, - { - "epoch": 0.49466480610309416, - "grad_norm": 1.2561241388320923, - "learning_rate": 4.758117998365322e-05, - "loss": 0.03157005608081818, - "step": 3700 - }, - { - "epoch": 0.49733866991986764, - "grad_norm": 0.8691341280937195, - "learning_rate": 4.7555367798589146e-05, - "loss": 0.04310203492641449, - "step": 3720 - }, - { - "epoch": 0.5000125337366411, - "grad_norm": 0.3134572505950928, - "learning_rate": 4.752942569527653e-05, - "loss": 0.03796039223670959, - "step": 3740 - }, - { - "epoch": 0.5026863975534146, - "grad_norm": 2.3359289169311523, - "learning_rate": 4.75033538231411e-05, - "loss": 0.055599170923233035, - "step": 3760 - }, - { - "epoch": 0.5053602613701881, - "grad_norm": 7.426175594329834, - "learning_rate": 4.747715233235608e-05, - "loss": 0.054436272382736205, - "step": 3780 - }, - { - "epoch": 0.5080341251869616, - "grad_norm": 0.5940203070640564, - "learning_rate": 4.745082137384128e-05, - "loss": 0.03682814538478851, - "step": 3800 - }, - { - "epoch": 0.510707989003735, - "grad_norm": 0.22821389138698578, - "learning_rate": 4.7424361099262225e-05, - "loss": 0.051123309135437014, - "step": 3820 - }, - { - "epoch": 0.5133818528205085, - "grad_norm": 8.20633602142334, - "learning_rate": 4.739777166102932e-05, - "loss": 0.0704378604888916, - "step": 3840 - }, - { - "epoch": 0.516055716637282, - "grad_norm": 3.023848533630371, - "learning_rate": 4.737105321229694e-05, - "loss": 0.03368058800697327, - "step": 3860 - }, - { - "epoch": 0.5187295804540555, - "grad_norm": 0.07666649669408798, - "learning_rate": 4.7344205906962555e-05, - "loss": 0.03665303289890289, - "step": 3880 - }, - { - "epoch": 0.521403444270829, - "grad_norm": 0.7571629881858826, - "learning_rate": 4.731722989966585e-05, - "loss": 0.058415502309799194, - "step": 3900 - }, - { - "epoch": 0.5240773080876024, - "grad_norm": 3.2599120140075684, - "learning_rate": 4.7290125345787816e-05, - "loss": 0.07323018908500671, - "step": 3920 - }, - { - "epoch": 0.5267511719043759, - "grad_norm": 0.28930988907814026, - "learning_rate": 4.7262892401449886e-05, - "loss": 0.054371267557144165, - "step": 3940 - }, - { - "epoch": 0.5294250357211494, - "grad_norm": 2.2296454906463623, - "learning_rate": 4.7235531223513004e-05, - "loss": 0.040819621086120604, - "step": 3960 - }, - { - "epoch": 0.5320988995379229, - "grad_norm": 0.11608211696147919, - "learning_rate": 4.720804196957675e-05, - "loss": 0.05215579271316528, - "step": 3980 - }, - { - "epoch": 0.5347727633546964, - "grad_norm": 1.1587547063827515, - "learning_rate": 4.7180424797978415e-05, - "loss": 0.026277875900268553, - "step": 4000 - }, - { - "epoch": 0.5374466271714698, - "grad_norm": 0.06253435462713242, - "learning_rate": 4.7152679867792074e-05, - "loss": 0.02574407756328583, - "step": 4020 - }, - { - "epoch": 0.5401204909882433, - "grad_norm": 1.3441458940505981, - "learning_rate": 4.71248073388277e-05, - "loss": 0.05538107752799988, - "step": 4040 - }, - { - "epoch": 0.5427943548050168, - "grad_norm": 0.48076340556144714, - "learning_rate": 4.7096807371630236e-05, - "loss": 0.047986540198326114, - "step": 4060 - }, - { - "epoch": 0.5454682186217903, - "grad_norm": 0.5924936532974243, - "learning_rate": 4.706868012747867e-05, - "loss": 0.05463914275169372, - "step": 4080 - }, - { - "epoch": 0.7673995566395854, - "grad_norm": 0.05143728107213974, - "learning_rate": 4.431151627307268e-05, - "loss": 0.00959376593430837, - "step": 4100 - }, - { - "epoch": 0.771142969110998, - "grad_norm": 1.2308074235916138, - "learning_rate": 4.425806509248848e-05, - "loss": 0.002745623141527176, - "step": 4120 - }, - { - "epoch": 0.7748863815824106, - "grad_norm": 2.080223798751831, - "learning_rate": 4.420439652052499e-05, - "loss": 0.012390998750925064, - "step": 4140 - }, - { - "epoch": 0.7786297940538233, - "grad_norm": 0.049312230199575424, - "learning_rate": 4.415051116301072e-05, - "loss": 0.004607534408569336, - "step": 4160 - }, - { - "epoch": 0.7823732065252359, - "grad_norm": 0.07747476547956467, - "learning_rate": 4.409640962822132e-05, - "loss": 0.034441503882408145, - "step": 4180 - }, - { - "epoch": 0.7861166189966485, - "grad_norm": 0.021327875554561615, - "learning_rate": 4.404209252687275e-05, - "loss": 0.009768449515104295, - "step": 4200 - }, - { - "epoch": 0.789860031468061, - "grad_norm": 2.406580924987793, - "learning_rate": 4.398756047211431e-05, - "loss": 0.005304037779569626, - "step": 4220 - }, - { - "epoch": 0.7936034439394737, - "grad_norm": 0.027869906276464462, - "learning_rate": 4.39328140795218e-05, - "loss": 0.00896073654294014, - "step": 4240 - }, - { - "epoch": 0.7973468564108863, - "grad_norm": 0.09702044725418091, - "learning_rate": 4.387785396709052e-05, - "loss": 0.0117533378303051, - "step": 4260 - }, - { - "epoch": 0.801090268882299, - "grad_norm": 0.529065728187561, - "learning_rate": 4.382268075522831e-05, - "loss": 0.0037526611238718035, - "step": 4280 - }, - { - "epoch": 0.8048336813537116, - "grad_norm": 0.015109462663531303, - "learning_rate": 4.3767295066748564e-05, - "loss": 0.0025708725675940513, - "step": 4300 - }, - { - "epoch": 0.8085770938251241, - "grad_norm": 0.7257627248764038, - "learning_rate": 4.371169752686316e-05, - "loss": 0.006234285607933998, - "step": 4320 - }, - { - "epoch": 0.8123205062965367, - "grad_norm": 0.016853008419275284, - "learning_rate": 4.3655888763175436e-05, - "loss": 0.0023587727919220924, - "step": 4340 - }, - { - "epoch": 0.8160639187679494, - "grad_norm": 0.017816167324781418, - "learning_rate": 4.3599869405673085e-05, - "loss": 0.0012389549054205417, - "step": 4360 - }, - { - "epoch": 0.819807331239362, - "grad_norm": 0.014672616496682167, - "learning_rate": 4.354364008672106e-05, - "loss": 0.002244691364467144, - "step": 4380 - }, - { - "epoch": 0.8235507437107746, - "grad_norm": 0.044869400560855865, - "learning_rate": 4.3487201441054435e-05, - "loss": 0.007713723182678223, - "step": 4400 - }, - { - "epoch": 0.8272941561821872, - "grad_norm": 0.06367291510105133, - "learning_rate": 4.343055410577122e-05, - "loss": 0.005743256583809852, - "step": 4420 - }, - { - "epoch": 0.8310375686535998, - "grad_norm": 0.1354215145111084, - "learning_rate": 4.3373698720325176e-05, - "loss": 0.009635470807552338, - "step": 4440 - }, - { - "epoch": 0.8347809811250124, - "grad_norm": 0.9089844822883606, - "learning_rate": 4.331663592651862e-05, - "loss": 0.01007603257894516, - "step": 4460 - }, - { - "epoch": 0.838524393596425, - "grad_norm": 0.025831619277596474, - "learning_rate": 4.3259366368495167e-05, - "loss": 0.006179215386509895, - "step": 4480 - }, - { - "epoch": 0.8422678060678377, - "grad_norm": 0.016653764992952347, - "learning_rate": 4.320189069273243e-05, - "loss": 0.0025156451389193534, - "step": 4500 - }, - { - "epoch": 0.8460112185392502, - "grad_norm": 0.27361780405044556, - "learning_rate": 4.3144209548034766e-05, - "loss": 0.002235286869108677, - "step": 4520 - }, - { - "epoch": 0.8497546310106628, - "grad_norm": 2.6958701610565186, - "learning_rate": 4.3086323585525915e-05, - "loss": 0.03571180701255798, - "step": 4540 - }, - { - "epoch": 0.8534980434820755, - "grad_norm": 0.1260778158903122, - "learning_rate": 4.3028233458641696e-05, - "loss": 0.0036518506705760954, - "step": 4560 - }, - { - "epoch": 0.8572414559534881, - "grad_norm": 0.2445528209209442, - "learning_rate": 4.2969939823122586e-05, - "loss": 0.024949796497821808, - "step": 4580 - }, - { - "epoch": 0.8609848684249007, - "grad_norm": 0.1674242913722992, - "learning_rate": 4.291144333700633e-05, - "loss": 0.002089798077940941, - "step": 4600 - }, - { - "epoch": 0.8647282808963134, - "grad_norm": 0.05161884427070618, - "learning_rate": 4.2852744660620515e-05, - "loss": 0.007847145944833756, - "step": 4620 - }, - { - "epoch": 0.8684716933677259, - "grad_norm": 0.019796324893832207, - "learning_rate": 4.279384445657514e-05, - "loss": 0.0023555334657430647, - "step": 4640 - }, - { - "epoch": 0.8722151058391385, - "grad_norm": 0.0647754967212677, - "learning_rate": 4.2734743389755096e-05, - "loss": 0.009586349129676819, - "step": 4660 - }, - { - "epoch": 0.8759585183105512, - "grad_norm": 0.015243460424244404, - "learning_rate": 4.267544212731268e-05, - "loss": 0.017788709700107576, - "step": 4680 - }, - { - "epoch": 0.8797019307819638, - "grad_norm": 0.05756703019142151, - "learning_rate": 4.261594133866007e-05, - "loss": 0.014256520569324494, - "step": 4700 - }, - { - "epoch": 0.8834453432533764, - "grad_norm": 0.2002931535243988, - "learning_rate": 4.255624169546175e-05, - "loss": 0.0014025470241904258, - "step": 4720 - }, - { - "epoch": 0.887188755724789, - "grad_norm": 0.04325389489531517, - "learning_rate": 4.249634387162696e-05, - "loss": 0.010552891343832017, - "step": 4740 - }, - { - "epoch": 0.8909321681962016, - "grad_norm": 0.8975178599357605, - "learning_rate": 4.243624854330206e-05, - "loss": 0.0032475266605615618, - "step": 4760 - }, - { - "epoch": 0.8946755806676142, - "grad_norm": 0.01541830413043499, - "learning_rate": 4.237595638886288e-05, - "loss": 0.003157203644514084, - "step": 4780 - }, - { - "epoch": 0.8984189931390268, - "grad_norm": 1.673305869102478, - "learning_rate": 4.231546808890713e-05, - "loss": 0.0028239911422133445, - "step": 4800 - }, - { - "epoch": 0.9021624056104395, - "grad_norm": 0.021689629182219505, - "learning_rate": 4.225478432624665e-05, - "loss": 0.0026885712519288062, - "step": 4820 - }, - { - "epoch": 0.905905818081852, - "grad_norm": 0.019590798765420914, - "learning_rate": 4.219390578589973e-05, - "loss": 0.00780024379491806, - "step": 4840 - }, - { - "epoch": 0.9096492305532646, - "grad_norm": 0.024581020697951317, - "learning_rate": 4.213283315508337e-05, - "loss": 0.006697511672973633, - "step": 4860 - }, - { - "epoch": 0.9133926430246773, - "grad_norm": 0.20615583658218384, - "learning_rate": 4.207156712320555e-05, - "loss": 0.007314208894968033, - "step": 4880 - }, - { - "epoch": 0.9171360554960899, - "grad_norm": 0.015673745423555374, - "learning_rate": 4.20101083818574e-05, - "loss": 0.004841562733054161, - "step": 4900 - }, - { - "epoch": 0.9208794679675025, - "grad_norm": 0.008306623436510563, - "learning_rate": 4.194845762480544e-05, - "loss": 0.0010150263085961341, - "step": 4920 - }, - { - "epoch": 0.9246228804389152, - "grad_norm": 0.051861703395843506, - "learning_rate": 4.188661554798369e-05, - "loss": 0.011043114960193634, - "step": 4940 - }, - { - "epoch": 0.9283662929103277, - "grad_norm": 1.7019767761230469, - "learning_rate": 4.1824582849485884e-05, - "loss": 0.004985674470663071, - "step": 4960 - }, - { - "epoch": 0.9321097053817403, - "grad_norm": 0.021240154281258583, - "learning_rate": 4.176236022955755e-05, - "loss": 0.04885836541652679, - "step": 4980 - }, - { - "epoch": 0.935853117853153, - "grad_norm": 0.016504865139722824, - "learning_rate": 4.16999483905881e-05, - "loss": 0.0027378931641578673, - "step": 5000 - }, - { - "epoch": 0.9395965303245656, - "grad_norm": 0.014015628024935722, - "learning_rate": 4.163734803710294e-05, - "loss": 0.012781022489070893, - "step": 5020 - }, - { - "epoch": 0.9433399427959782, - "grad_norm": 0.013812500052154064, - "learning_rate": 4.157455987575545e-05, - "loss": 0.007508871704339981, - "step": 5040 - }, - { - "epoch": 0.9470833552673907, - "grad_norm": 0.01622290164232254, - "learning_rate": 4.1511584615319075e-05, - "loss": 0.0014614147134125234, - "step": 5060 - }, - { - "epoch": 0.9508267677388034, - "grad_norm": 0.01259149145334959, - "learning_rate": 4.144842296667929e-05, - "loss": 0.006202424317598343, - "step": 5080 - }, - { - "epoch": 0.954570180210216, - "grad_norm": 0.012383027002215385, - "learning_rate": 4.138507564282558e-05, - "loss": 0.006122353300452232, - "step": 5100 - }, - { - "epoch": 0.9583135926816286, - "grad_norm": 0.006499920971691608, - "learning_rate": 4.1321543358843385e-05, - "loss": 0.0008865024894475937, - "step": 5120 - }, - { - "epoch": 0.9620570051530413, - "grad_norm": 0.00830752868205309, - "learning_rate": 4.125782683190606e-05, - "loss": 0.0008420860394835472, - "step": 5140 - }, - { - "epoch": 0.9658004176244538, - "grad_norm": 0.01525857299566269, - "learning_rate": 4.119392678126673e-05, - "loss": 0.00587364137172699, - "step": 5160 - }, - { - "epoch": 0.9695438300958664, - "grad_norm": 0.01072095800191164, - "learning_rate": 4.11298439282502e-05, - "loss": 0.00853007659316063, - "step": 5180 - }, - { - "epoch": 0.973287242567279, - "grad_norm": 0.030316641554236412, - "learning_rate": 4.106557899624482e-05, - "loss": 0.0058747071772813795, - "step": 5200 - }, - { - "epoch": 0.9770306550386917, - "grad_norm": 0.0391647033393383, - "learning_rate": 4.1001132710694304e-05, - "loss": 0.0034765828400850295, - "step": 5220 - }, - { - "epoch": 0.9807740675101043, - "grad_norm": 0.04938298836350441, - "learning_rate": 4.093650579908953e-05, - "loss": 0.007594724744558334, - "step": 5240 - }, - { - "epoch": 0.984517479981517, - "grad_norm": 0.005873252172023058, - "learning_rate": 4.087169899096037e-05, - "loss": 0.013347607851028443, - "step": 5260 - }, - { - "epoch": 0.9882608924529295, - "grad_norm": 1.2757259607315063, - "learning_rate": 4.080671301786741e-05, - "loss": 0.004837355017662049, - "step": 5280 - }, - { - "epoch": 0.9920043049243421, - "grad_norm": 0.00920735765248537, - "learning_rate": 4.0741548613393675e-05, - "loss": 0.007415445148944854, - "step": 5300 - }, - { - "epoch": 0.9957477173957547, - "grad_norm": 0.5702093839645386, - "learning_rate": 4.067620651313647e-05, - "loss": 0.00406576506793499, - "step": 5320 - }, - { - "epoch": 0.9994911298671674, - "grad_norm": 1.8361051082611084, - "learning_rate": 4.0610687454698906e-05, - "loss": 0.00997612327337265, - "step": 5340 - }, - { - "epoch": 1.0031819006007008, - "grad_norm": 3.335326910018921, - "learning_rate": 4.0544992177681685e-05, - "loss": 0.008442799001932145, - "step": 5360 - }, - { - "epoch": 1.0069253130721134, - "grad_norm": 0.03184954449534416, - "learning_rate": 4.047912142367473e-05, - "loss": 0.008095134049654007, - "step": 5380 - }, - { - "epoch": 1.010668725543526, - "grad_norm": 0.029989074915647507, - "learning_rate": 4.04130759362488e-05, - "loss": 0.0012585990130901336, - "step": 5400 - }, - { - "epoch": 1.0144121380149385, - "grad_norm": 0.08727464079856873, - "learning_rate": 4.034685646094711e-05, - "loss": 0.012588074803352356, - "step": 5420 - }, - { - "epoch": 1.018155550486351, - "grad_norm": 0.018498806282877922, - "learning_rate": 4.028046374527689e-05, - "loss": 0.001854238100349903, - "step": 5440 - }, - { - "epoch": 1.0218989629577637, - "grad_norm": 0.013779236935079098, - "learning_rate": 4.021389853870095e-05, - "loss": 0.0008004569448530674, - "step": 5460 - }, - { - "epoch": 1.0256423754291764, - "grad_norm": 0.028235070407390594, - "learning_rate": 4.0147161592629306e-05, - "loss": 0.002274145185947418, - "step": 5480 - }, - { - "epoch": 1.029385787900589, - "grad_norm": 0.023030120879411697, - "learning_rate": 4.008025366041055e-05, - "loss": 0.008717305958271027, - "step": 5500 - }, - { - "epoch": 1.0331292003720016, - "grad_norm": 0.018347155302762985, - "learning_rate": 4.001317549732345e-05, - "loss": 0.00244256854057312, - "step": 5520 - }, - { - "epoch": 1.0368726128434143, - "grad_norm": 0.03449391946196556, - "learning_rate": 3.99459278605684e-05, - "loss": 0.0039924226701259615, - "step": 5540 - }, - { - "epoch": 1.0406160253148269, - "grad_norm": 0.030406463891267776, - "learning_rate": 3.9878511509258866e-05, - "loss": 0.0021008485928177834, - "step": 5560 - }, - { - "epoch": 1.0443594377862395, - "grad_norm": 0.01783100888133049, - "learning_rate": 3.9810927204412803e-05, - "loss": 0.0006656501442193985, - "step": 5580 - }, - { - "epoch": 1.0481028502576522, - "grad_norm": 0.05360455811023712, - "learning_rate": 3.974317570894413e-05, - "loss": 0.005278818309307098, - "step": 5600 - }, - { - "epoch": 1.0518462627290646, - "grad_norm": 0.008699169382452965, - "learning_rate": 3.9675257787654e-05, - "loss": 0.005309444293379784, - "step": 5620 - }, - { - "epoch": 1.0555896752004772, - "grad_norm": 0.036641959100961685, - "learning_rate": 3.960717420722227e-05, - "loss": 0.0034692320972681046, - "step": 5640 - }, - { - "epoch": 1.0593330876718898, - "grad_norm": 0.012212110683321953, - "learning_rate": 3.953892573619883e-05, - "loss": 0.005343861132860184, - "step": 5660 - }, - { - "epoch": 1.0630765001433025, - "grad_norm": 0.011296284385025501, - "learning_rate": 3.947051314499489e-05, - "loss": 0.0038058970123529432, - "step": 5680 - }, - { - "epoch": 1.066819912614715, - "grad_norm": 0.05954049900174141, - "learning_rate": 3.94019372058743e-05, - "loss": 0.008142991364002228, - "step": 5700 - }, - { - "epoch": 1.0705633250861277, - "grad_norm": 0.03478416055440903, - "learning_rate": 3.933319869294483e-05, - "loss": 0.0075227849185466765, - "step": 5720 - }, - { - "epoch": 1.0743067375575404, - "grad_norm": 0.014586996287107468, - "learning_rate": 3.9264298382149455e-05, - "loss": 0.0036750122904777526, - "step": 5740 - }, - { - "epoch": 1.078050150028953, - "grad_norm": 0.025754544883966446, - "learning_rate": 3.919523705125757e-05, - "loss": 0.004151013493537903, - "step": 5760 - }, - { - "epoch": 1.0817935625003656, - "grad_norm": 0.03239905461668968, - "learning_rate": 3.9126015479856205e-05, - "loss": 0.00861695185303688, - "step": 5780 - }, - { - "epoch": 1.0855369749717783, - "grad_norm": 0.03506994619965553, - "learning_rate": 3.9056634449341256e-05, - "loss": 0.003123755753040314, - "step": 5800 - }, - { - "epoch": 1.089280387443191, - "grad_norm": 0.0286911278963089, - "learning_rate": 3.898709474290864e-05, - "loss": 0.002537376619875431, - "step": 5820 - }, - { - "epoch": 1.0930237999146033, - "grad_norm": 0.03490692004561424, - "learning_rate": 3.8917397145545454e-05, - "loss": 0.0010227372869849205, - "step": 5840 - }, - { - "epoch": 1.096767212386016, - "grad_norm": 0.013748899102210999, - "learning_rate": 3.884754244402113e-05, - "loss": 0.011847371608018875, - "step": 5860 - }, - { - "epoch": 1.1005106248574286, - "grad_norm": 0.035458195954561234, - "learning_rate": 3.877753142687852e-05, - "loss": 0.009741749614477158, - "step": 5880 - }, - { - "epoch": 1.1042540373288412, - "grad_norm": 0.012493673712015152, - "learning_rate": 3.8707364884425064e-05, - "loss": 0.006607493013143539, - "step": 5900 - }, - { - "epoch": 1.1079974498002538, - "grad_norm": 0.018607834354043007, - "learning_rate": 3.863704360872378e-05, - "loss": 0.0016217166557908058, - "step": 5920 - }, - { - "epoch": 1.1117408622716665, - "grad_norm": 0.0283930953592062, - "learning_rate": 3.8566568393584366e-05, - "loss": 0.002083975449204445, - "step": 5940 - }, - { - "epoch": 1.115484274743079, - "grad_norm": 0.05229801684617996, - "learning_rate": 3.8495940034554283e-05, - "loss": 0.0014217685908079146, - "step": 5960 - }, - { - "epoch": 1.1192276872144917, - "grad_norm": 0.008808930404484272, - "learning_rate": 3.8425159328909684e-05, - "loss": 0.0022570645436644555, - "step": 5980 - }, - { - "epoch": 1.1229710996859044, - "grad_norm": 0.020502232015132904, - "learning_rate": 3.835422707564648e-05, - "loss": 0.003745942190289497, - "step": 6000 - }, - { - "epoch": 1.126714512157317, - "grad_norm": 0.032347094267606735, - "learning_rate": 3.82831440754713e-05, - "loss": 0.003347185626626015, - "step": 6020 - }, - { - "epoch": 1.1304579246287294, - "grad_norm": 0.020310478284955025, - "learning_rate": 3.821191113079246e-05, - "loss": 0.006166417896747589, - "step": 6040 - }, - { - "epoch": 1.134201337100142, - "grad_norm": 0.06390372663736343, - "learning_rate": 3.8140529045710876e-05, - "loss": 0.0013674044981598853, - "step": 6060 - }, - { - "epoch": 1.1379447495715547, - "grad_norm": 1.1938918828964233, - "learning_rate": 3.806899862601105e-05, - "loss": 0.010550644248723984, - "step": 6080 - }, - { - "epoch": 1.1416881620429673, - "grad_norm": 0.035355549305677414, - "learning_rate": 3.799732067915189e-05, - "loss": 0.0069750770926475525, - "step": 6100 - }, - { - "epoch": 1.14543157451438, - "grad_norm": 0.009921093471348286, - "learning_rate": 3.792549601425767e-05, - "loss": 0.0027949588373303415, - "step": 6120 - }, - { - "epoch": 1.1491749869857926, - "grad_norm": 0.06172063946723938, - "learning_rate": 3.785352544210884e-05, - "loss": 0.0009372101165354251, - "step": 6140 - }, - { - "epoch": 1.1529183994572052, - "grad_norm": 0.008572470396757126, - "learning_rate": 3.778140977513294e-05, - "loss": 0.0029502738267183303, - "step": 6160 - }, - { - "epoch": 1.1566618119286178, - "grad_norm": 0.4211727976799011, - "learning_rate": 3.770914982739534e-05, - "loss": 0.014692296087741853, - "step": 6180 - }, - { - "epoch": 1.1604052244000305, - "grad_norm": 0.02292146533727646, - "learning_rate": 3.7636746414590126e-05, - "loss": 0.0020170681178569793, - "step": 6200 - }, - { - "epoch": 1.164148636871443, - "grad_norm": 0.11247449368238449, - "learning_rate": 3.756420035403086e-05, - "loss": 0.006851900368928909, - "step": 6220 - }, - { - "epoch": 1.1678920493428557, - "grad_norm": 0.020755017176270485, - "learning_rate": 3.749151246464137e-05, - "loss": 0.0021739909425377846, - "step": 6240 - }, - { - "epoch": 1.1716354618142684, - "grad_norm": 0.017202025279402733, - "learning_rate": 3.741868356694647e-05, - "loss": 0.002353278361260891, - "step": 6260 - }, - { - "epoch": 1.1753788742856808, - "grad_norm": 0.014947429299354553, - "learning_rate": 3.734571448306274e-05, - "loss": 0.0010860362090170383, - "step": 6280 - }, - { - "epoch": 1.1791222867570934, - "grad_norm": 1.5391262769699097, - "learning_rate": 3.727260603668922e-05, - "loss": 0.01233254000544548, - "step": 6300 - }, - { - "epoch": 1.182865699228506, - "grad_norm": 0.4759792387485504, - "learning_rate": 3.7199359053098133e-05, - "loss": 0.0028501398861408233, - "step": 6320 - }, - { - "epoch": 1.1866091116999187, - "grad_norm": 0.01719040609896183, - "learning_rate": 3.7125974359125536e-05, - "loss": 0.00934450700879097, - "step": 6340 - }, - { - "epoch": 1.1903525241713313, - "grad_norm": 2.4766688346862793, - "learning_rate": 3.7052452783162015e-05, - "loss": 0.018582724034786224, - "step": 6360 - }, - { - "epoch": 1.194095936642744, - "grad_norm": 0.11404932290315628, - "learning_rate": 3.6978795155143326e-05, - "loss": 0.01815672367811203, - "step": 6380 - }, - { - "epoch": 1.1978393491141566, - "grad_norm": 0.021365633234381676, - "learning_rate": 3.690500230654103e-05, - "loss": 0.004123781993985176, - "step": 6400 - }, - { - "epoch": 1.2015827615855692, - "grad_norm": 0.022478772327303886, - "learning_rate": 3.68310750703531e-05, - "loss": 0.0038731731474399567, - "step": 6420 - }, - { - "epoch": 1.2053261740569818, - "grad_norm": 0.15531578660011292, - "learning_rate": 3.67570142810945e-05, - "loss": 0.002076444961130619, - "step": 6440 - }, - { - "epoch": 1.2090695865283942, - "grad_norm": 0.012458150275051594, - "learning_rate": 3.668282077478783e-05, - "loss": 0.0027592860162258146, - "step": 6460 - }, - { - "epoch": 1.2128129989998069, - "grad_norm": 0.01572798565030098, - "learning_rate": 3.66084953889538e-05, - "loss": 0.002740098722279072, - "step": 6480 - }, - { - "epoch": 1.2165564114712195, - "grad_norm": 0.13682503998279572, - "learning_rate": 3.6534038962601835e-05, - "loss": 0.000705425813794136, - "step": 6500 - }, - { - "epoch": 1.2202998239426321, - "grad_norm": 0.030630914494395256, - "learning_rate": 3.64594523362206e-05, - "loss": 0.012480729073286057, - "step": 6520 - }, - { - "epoch": 1.2240432364140448, - "grad_norm": 0.024804554879665375, - "learning_rate": 3.638473635176848e-05, - "loss": 0.0007834361866116523, - "step": 6540 - }, - { - "epoch": 1.2277866488854574, - "grad_norm": 0.011334752663969994, - "learning_rate": 3.630989185266411e-05, - "loss": 0.022086825966835023, - "step": 6560 - }, - { - "epoch": 1.23153006135687, - "grad_norm": 0.020346902310848236, - "learning_rate": 3.623491968377684e-05, - "loss": 0.018024472892284392, - "step": 6580 - }, - { - "epoch": 1.2352734738282827, - "grad_norm": 0.015177210792899132, - "learning_rate": 3.615982069141719e-05, - "loss": 0.005251453071832657, - "step": 6600 - }, - { - "epoch": 1.2390168862996953, - "grad_norm": 0.013680647127330303, - "learning_rate": 3.608459572332733e-05, - "loss": 0.006734563410282135, - "step": 6620 - }, - { - "epoch": 1.242760298771108, - "grad_norm": 0.17980872094631195, - "learning_rate": 3.600924562867144e-05, - "loss": 0.003970410302281379, - "step": 6640 - }, - { - "epoch": 1.2465037112425206, - "grad_norm": 0.015203841030597687, - "learning_rate": 3.593377125802622e-05, - "loss": 0.0032148901373147964, - "step": 6660 - }, - { - "epoch": 1.2502471237139332, - "grad_norm": 0.017300931736826897, - "learning_rate": 3.585817346337119e-05, - "loss": 0.00467667318880558, - "step": 6680 - }, - { - "epoch": 1.2539905361853458, - "grad_norm": 0.028181765228509903, - "learning_rate": 3.5782453098079175e-05, - "loss": 0.0015515764243900776, - "step": 6700 - }, - { - "epoch": 1.2577339486567582, - "grad_norm": 0.01730780117213726, - "learning_rate": 3.570661101690657e-05, - "loss": 0.007991334050893783, - "step": 6720 - }, - { - "epoch": 1.2614773611281709, - "grad_norm": 0.014216347597539425, - "learning_rate": 3.5630648075983763e-05, - "loss": 0.002533360943198204, - "step": 6740 - }, - { - "epoch": 1.2652207735995835, - "grad_norm": 0.1556195169687271, - "learning_rate": 3.555456513280544e-05, - "loss": 0.0032653655856847764, - "step": 6760 - }, - { - "epoch": 1.2689641860709961, - "grad_norm": 0.023955868557095528, - "learning_rate": 3.5478363046220915e-05, - "loss": 0.00850408971309662, - "step": 6780 - }, - { - "epoch": 1.2727075985424088, - "grad_norm": 0.17874136567115784, - "learning_rate": 3.5402042676424424e-05, - "loss": 0.0032720811665058135, - "step": 6800 - }, - { - "epoch": 1.2764510110138214, - "grad_norm": 0.0899379625916481, - "learning_rate": 3.5325604884945434e-05, - "loss": 0.003243798017501831, - "step": 6820 - }, - { - "epoch": 1.280194423485234, - "grad_norm": 0.413362056016922, - "learning_rate": 3.5249050534638906e-05, - "loss": 0.0036127623170614243, - "step": 6840 - }, - { - "epoch": 1.2839378359566467, - "grad_norm": 0.02790931612253189, - "learning_rate": 3.517238048967554e-05, - "loss": 0.008225285261869431, - "step": 6860 - }, - { - "epoch": 1.287681248428059, - "grad_norm": 0.6761110424995422, - "learning_rate": 3.5095595615532056e-05, - "loss": 0.00199942234903574, - "step": 6880 - }, - { - "epoch": 1.2914246608994717, - "grad_norm": 4.593618869781494, - "learning_rate": 3.5018696778981385e-05, - "loss": 0.007301987707614898, - "step": 6900 - }, - { - "epoch": 1.2951680733708844, - "grad_norm": 0.09392693638801575, - "learning_rate": 3.494168484808293e-05, - "loss": 0.009008315950632095, - "step": 6920 - }, - { - "epoch": 1.298911485842297, - "grad_norm": 0.008239852264523506, - "learning_rate": 3.48645606921727e-05, - "loss": 0.012661360204219818, - "step": 6940 - }, - { - "epoch": 1.3026548983137096, - "grad_norm": 0.05141177773475647, - "learning_rate": 3.4787325181853576e-05, - "loss": 0.0007553372532129287, - "step": 6960 - }, - { - "epoch": 1.3063983107851223, - "grad_norm": 0.024333903566002846, - "learning_rate": 3.470997918898541e-05, - "loss": 0.0016128463670611382, - "step": 6980 - }, - { - "epoch": 1.3101417232565349, - "grad_norm": 0.0337531715631485, - "learning_rate": 3.4632523586675254e-05, - "loss": 0.003253454715013504, - "step": 7000 - }, - { - "epoch": 1.3138851357279475, - "grad_norm": 0.05121550336480141, - "learning_rate": 3.4554959249267436e-05, - "loss": 0.0026307271793484686, - "step": 7020 - }, - { - "epoch": 1.3176285481993602, - "grad_norm": 0.025997543707489967, - "learning_rate": 3.447728705233374e-05, - "loss": 0.0012719514779746532, - "step": 7040 - }, - { - "epoch": 1.3213719606707728, - "grad_norm": 0.009486268274486065, - "learning_rate": 3.4399507872663494e-05, - "loss": 0.002009082958102226, - "step": 7060 - }, - { - "epoch": 1.3251153731421854, - "grad_norm": 0.016816232353448868, - "learning_rate": 3.432162258825369e-05, - "loss": 0.0005956823006272316, - "step": 7080 - }, - { - "epoch": 1.328858785613598, - "grad_norm": 0.004733961541205645, - "learning_rate": 3.424363207829906e-05, - "loss": 0.003636709600687027, - "step": 7100 - }, - { - "epoch": 1.3326021980850107, - "grad_norm": 3.666203498840332, - "learning_rate": 3.4165537223182155e-05, - "loss": 0.010488419234752655, - "step": 7120 - }, - { - "epoch": 1.336345610556423, - "grad_norm": 0.021471882238984108, - "learning_rate": 3.408733890446341e-05, - "loss": 0.0009709249250590801, - "step": 7140 - }, - { - "epoch": 1.3400890230278357, - "grad_norm": 0.007639541756361723, - "learning_rate": 3.40090380048712e-05, - "loss": 0.0030905861407518388, - "step": 7160 - }, - { - "epoch": 1.3438324354992484, - "grad_norm": 0.16878941655158997, - "learning_rate": 3.393063540829186e-05, - "loss": 0.0036965351551771163, - "step": 7180 - }, - { - "epoch": 1.347575847970661, - "grad_norm": 0.07014094293117523, - "learning_rate": 3.385213199975971e-05, - "loss": 0.0005677144508808851, - "step": 7200 - }, - { - "epoch": 1.3513192604420736, - "grad_norm": 0.008626374416053295, - "learning_rate": 3.377352866544706e-05, - "loss": 0.0005447934381663799, - "step": 7220 - }, - { - "epoch": 1.3550626729134863, - "grad_norm": 0.013825134374201298, - "learning_rate": 3.3694826292654246e-05, - "loss": 0.004854041337966919, - "step": 7240 - }, - { - "epoch": 1.3588060853848989, - "grad_norm": 0.025015883147716522, - "learning_rate": 3.361602576979956e-05, - "loss": 0.004542553424835205, - "step": 7260 - }, - { - "epoch": 1.3625494978563115, - "grad_norm": 0.009614030830562115, - "learning_rate": 3.353712798640923e-05, - "loss": 0.0008775785565376282, - "step": 7280 - }, - { - "epoch": 1.366292910327724, - "grad_norm": 3.8835268020629883, - "learning_rate": 3.345813383310744e-05, - "loss": 0.0063879616558551785, - "step": 7300 - }, - { - "epoch": 1.3700363227991366, - "grad_norm": 0.005518193822354078, - "learning_rate": 3.337904420160618e-05, - "loss": 0.0010956574231386184, - "step": 7320 - }, - { - "epoch": 1.3737797352705492, - "grad_norm": 0.005018322728574276, - "learning_rate": 3.329985998469526e-05, - "loss": 0.0012317843735218047, - "step": 7340 - }, - { - "epoch": 0.6887872232777639, - "grad_norm": 0.3108454942703247, - "learning_rate": 3.322058207623218e-05, - "loss": 0.010070423781871795, - "step": 7360 - }, - { - "epoch": 0.6906589276888447, - "grad_norm": 0.3556046783924103, - "learning_rate": 3.314121137113209e-05, - "loss": 0.0278738796710968, - "step": 7380 - }, - { - "epoch": 0.6925306320999256, - "grad_norm": 4.041794300079346, - "learning_rate": 3.306174876535762e-05, - "loss": 0.025335192680358887, - "step": 7400 - }, - { - "epoch": 0.6944023365110065, - "grad_norm": 0.04647493362426758, - "learning_rate": 3.2982195155908845e-05, - "loss": 0.05056847333908081, - "step": 7420 - }, - { - "epoch": 0.6962740409220873, - "grad_norm": 0.6827419400215149, - "learning_rate": 3.290653575270209e-05, - "loss": 0.036053261160850524, - "step": 7440 - }, - { - "epoch": 0.6981457453331683, - "grad_norm": 0.256136029958725, - "learning_rate": 3.2826807269966064e-05, - "loss": 0.020640365779399872, - "step": 7460 - }, - { - "epoch": 0.7000174497442492, - "grad_norm": 0.2054845094680786, - "learning_rate": 3.274699043565268e-05, - "loss": 0.03456352353096008, - "step": 7480 - }, - { - "epoch": 0.70188915415533, - "grad_norm": 0.2027648538351059, - "learning_rate": 3.266708615076064e-05, - "loss": 0.00846734493970871, - "step": 7500 - }, - { - "epoch": 0.7037608585664109, - "grad_norm": 1.6423311233520508, - "learning_rate": 3.258709531727582e-05, - "loss": 0.054978948831558225, - "step": 7520 - }, - { - "epoch": 0.7056325629774918, - "grad_norm": 1.775089144706726, - "learning_rate": 3.2507018838161085e-05, - "loss": 0.03238933086395264, - "step": 7540 - }, - { - "epoch": 0.7075042673885726, - "grad_norm": 0.06917860358953476, - "learning_rate": 3.242685761734609e-05, - "loss": 0.016849520802497863, - "step": 7560 - }, - { - "epoch": 0.7093759717996535, - "grad_norm": 0.051443129777908325, - "learning_rate": 3.2346612559717094e-05, - "loss": 0.048251998424530027, - "step": 7580 - }, - { - "epoch": 0.7112476762107344, - "grad_norm": 0.06533925980329514, - "learning_rate": 3.226628457110672e-05, - "loss": 0.03696450293064117, - "step": 7600 - }, - { - "epoch": 0.7131193806218153, - "grad_norm": 0.45661595463752747, - "learning_rate": 3.218587455828377e-05, - "loss": 0.05503013730049133, - "step": 7620 - }, - { - "epoch": 0.7149910850328962, - "grad_norm": 2.0205914974212646, - "learning_rate": 3.210538342894291e-05, - "loss": 0.033562681078910826, - "step": 7640 - }, - { - "epoch": 0.7168627894439771, - "grad_norm": 2.4842448234558105, - "learning_rate": 3.202481209169455e-05, - "loss": 0.019278638064861298, - "step": 7660 - }, - { - "epoch": 0.7187344938550579, - "grad_norm": 0.10550081729888916, - "learning_rate": 3.1944161456054436e-05, - "loss": 0.01638232171535492, - "step": 7680 - }, - { - "epoch": 0.7206061982661388, - "grad_norm": 1.606436014175415, - "learning_rate": 3.1863432432433506e-05, - "loss": 0.020552067458629607, - "step": 7700 - }, - { - "epoch": 0.7224779026772197, - "grad_norm": 0.2617719769477844, - "learning_rate": 3.178262593212757e-05, - "loss": 0.02315783053636551, - "step": 7720 - }, - { - "epoch": 0.7243496070883005, - "grad_norm": 0.9734074473381042, - "learning_rate": 3.1701742867307e-05, - "loss": 0.01938771307468414, - "step": 7740 - }, - { - "epoch": 0.7262213114993814, - "grad_norm": 0.5882985591888428, - "learning_rate": 3.162078415100647e-05, - "loss": 0.011305707693099975, - "step": 7760 - }, - { - "epoch": 0.7280930159104624, - "grad_norm": 0.04298723489046097, - "learning_rate": 3.15397506971146e-05, - "loss": 0.04238930344581604, - "step": 7780 - }, - { - "epoch": 0.7299647203215432, - "grad_norm": 6.2729315757751465, - "learning_rate": 3.145864342036372e-05, - "loss": 0.030225831270217895, - "step": 7800 - }, - { - "epoch": 0.7318364247326241, - "grad_norm": 0.026423340663313866, - "learning_rate": 3.1377463236319476e-05, - "loss": 0.012169972807168961, - "step": 7820 - }, - { - "epoch": 0.733708129143705, - "grad_norm": 0.0296376533806324, - "learning_rate": 3.1296211061370495e-05, - "loss": 0.015344823896884918, - "step": 7840 - }, - { - "epoch": 0.7355798335547858, - "grad_norm": 0.029524821788072586, - "learning_rate": 3.1214887812718094e-05, - "loss": 0.028345003724098206, - "step": 7860 - }, - { - "epoch": 0.7374515379658667, - "grad_norm": 0.06847794353961945, - "learning_rate": 3.113349440836588e-05, - "loss": 0.020069575309753417, - "step": 7880 - }, - { - "epoch": 0.7393232423769476, - "grad_norm": 0.024868430569767952, - "learning_rate": 3.1052031767109376e-05, - "loss": 0.014262473583221436, - "step": 7900 - }, - { - "epoch": 0.7411949467880286, - "grad_norm": 0.24450063705444336, - "learning_rate": 3.097050080852573e-05, - "loss": 0.04350808262825012, - "step": 7920 - }, - { - "epoch": 0.7430666511991094, - "grad_norm": 0.06978324800729752, - "learning_rate": 3.088890245296322e-05, - "loss": 0.015559709072113037, - "step": 7940 - }, - { - "epoch": 0.7449383556101903, - "grad_norm": 0.12675604224205017, - "learning_rate": 3.0807237621530964e-05, - "loss": 0.013867451250553131, - "step": 7960 - }, - { - "epoch": 0.7468100600212711, - "grad_norm": 0.2605513334274292, - "learning_rate": 3.072550723608846e-05, - "loss": 0.012869009375572204, - "step": 7980 - }, - { - "epoch": 0.748681764432352, - "grad_norm": 3.325530529022217, - "learning_rate": 3.064371221923521e-05, - "loss": 0.03036353886127472, - "step": 8000 - }, - { - "epoch": 0.7505534688434329, - "grad_norm": 0.22703051567077637, - "learning_rate": 3.0561853494300294e-05, - "loss": 0.009017374366521835, - "step": 8020 - }, - { - "epoch": 0.7524251732545137, - "grad_norm": 6.404862880706787, - "learning_rate": 3.047993198533195e-05, - "loss": 0.020604299008846284, - "step": 8040 - }, - { - "epoch": 0.7542968776655946, - "grad_norm": 0.06491954624652863, - "learning_rate": 3.039794861708714e-05, - "loss": 0.014963623881340028, - "step": 8060 - }, - { - "epoch": 0.7561685820766756, - "grad_norm": 0.4990088641643524, - "learning_rate": 3.0315904315021128e-05, - "loss": 0.02046530395746231, - "step": 8080 - }, - { - "epoch": 0.7580402864877565, - "grad_norm": 0.3174229562282562, - "learning_rate": 3.023380000527699e-05, - "loss": 0.013621781766414643, - "step": 8100 - }, - { - "epoch": 0.7599119908988373, - "grad_norm": 0.07161428034305573, - "learning_rate": 3.0151636614675218e-05, - "loss": 0.008043503761291504, - "step": 8120 - }, - { - "epoch": 0.7617836953099182, - "grad_norm": 0.6772736310958862, - "learning_rate": 3.0069415070703217e-05, - "loss": 0.03563189804553986, - "step": 8140 - }, - { - "epoch": 0.763655399720999, - "grad_norm": 0.07689516246318817, - "learning_rate": 2.998713630150485e-05, - "loss": 0.008622632920742035, - "step": 8160 - }, - { - "epoch": 0.7655271041320799, - "grad_norm": 0.014181110076606274, - "learning_rate": 2.990480123586994e-05, - "loss": 0.012368627637624741, - "step": 8180 - }, - { - "epoch": 0.7673988085431608, - "grad_norm": 4.4751715660095215, - "learning_rate": 2.9822410803223822e-05, - "loss": 0.02100955694913864, - "step": 8200 - }, - { - "epoch": 0.7692705129542416, - "grad_norm": 0.12694527208805084, - "learning_rate": 2.9739965933616825e-05, - "loss": 0.018182000517845152, - "step": 8220 - }, - { - "epoch": 0.7711422173653226, - "grad_norm": 0.13789872825145721, - "learning_rate": 2.9657467557713792e-05, - "loss": 0.008949784934520722, - "step": 8240 - }, - { - "epoch": 0.7730139217764035, - "grad_norm": 0.04048463702201843, - "learning_rate": 2.957491660678354e-05, - "loss": 0.03582434058189392, - "step": 8260 - }, - { - "epoch": 0.7748856261874844, - "grad_norm": 0.7825964689254761, - "learning_rate": 2.9492314012688378e-05, - "loss": 0.012679101526737213, - "step": 8280 - }, - { - "epoch": 0.7767573305985652, - "grad_norm": 0.14350314438343048, - "learning_rate": 2.9409660707873597e-05, - "loss": 0.010909486562013626, - "step": 8300 - }, - { - "epoch": 0.7786290350096461, - "grad_norm": 0.17676737904548645, - "learning_rate": 2.932695762535691e-05, - "loss": 0.01464642733335495, - "step": 8320 - }, - { - "epoch": 0.780500739420727, - "grad_norm": 0.5979751348495483, - "learning_rate": 2.9244205698717943e-05, - "loss": 0.028799059987068176, - "step": 8340 - }, - { - "epoch": 0.7823724438318078, - "grad_norm": 0.08448052406311035, - "learning_rate": 2.9161405862087676e-05, - "loss": 0.014056096971035003, - "step": 8360 - }, - { - "epoch": 0.7842441482428888, - "grad_norm": 0.5616207122802734, - "learning_rate": 2.9078559050137955e-05, - "loss": 0.008744364231824875, - "step": 8380 - }, - { - "epoch": 0.7861158526539697, - "grad_norm": 0.7264829277992249, - "learning_rate": 2.8995666198070836e-05, - "loss": 0.014575870335102081, - "step": 8400 - }, - { - "epoch": 0.7879875570650505, - "grad_norm": 1.444239616394043, - "learning_rate": 2.891272824160815e-05, - "loss": 0.01230706349015236, - "step": 8420 - }, - { - "epoch": 0.7898592614761314, - "grad_norm": 0.02643579989671707, - "learning_rate": 2.882974611698084e-05, - "loss": 0.01713460832834244, - "step": 8440 - }, - { - "epoch": 0.7917309658872123, - "grad_norm": 0.19893163442611694, - "learning_rate": 2.8746720760918457e-05, - "loss": 0.009562552720308305, - "step": 8460 - }, - { - "epoch": 0.7936026702982931, - "grad_norm": 1.8813897371292114, - "learning_rate": 2.866365311063855e-05, - "loss": 0.01966284364461899, - "step": 8480 - }, - { - "epoch": 0.795474374709374, - "grad_norm": 0.1820579618215561, - "learning_rate": 2.8580544103836114e-05, - "loss": 0.023943188786506652, - "step": 8500 - }, - { - "epoch": 0.7973460791204549, - "grad_norm": 1.3913259506225586, - "learning_rate": 2.849739467867298e-05, - "loss": 0.02233349084854126, - "step": 8520 - }, - { - "epoch": 0.7992177835315358, - "grad_norm": 0.28450486063957214, - "learning_rate": 2.8414205773767223e-05, - "loss": 0.016230446100234986, - "step": 8540 - }, - { - "epoch": 0.8010894879426167, - "grad_norm": 0.46086356043815613, - "learning_rate": 2.83309783281826e-05, - "loss": 0.013964855670928955, - "step": 8560 - }, - { - "epoch": 0.8029611923536976, - "grad_norm": 1.1401137113571167, - "learning_rate": 2.8247713281417924e-05, - "loss": 0.01552264392375946, - "step": 8580 - }, - { - "epoch": 0.8048328967647784, - "grad_norm": 0.02414649911224842, - "learning_rate": 2.8164411573396444e-05, - "loss": 0.00505053773522377, - "step": 8600 - }, - { - "epoch": 0.8067046011758593, - "grad_norm": 0.029010778293013573, - "learning_rate": 2.8081074144455276e-05, - "loss": 0.008068422973155975, - "step": 8620 - }, - { - "epoch": 0.8085763055869402, - "grad_norm": 0.024924319237470627, - "learning_rate": 2.7997701935334747e-05, - "loss": 0.021529987454414368, - "step": 8640 - }, - { - "epoch": 0.810448009998021, - "grad_norm": 0.3544171154499054, - "learning_rate": 2.791429588716782e-05, - "loss": 0.008264218270778657, - "step": 8660 - }, - { - "epoch": 0.8123197144091019, - "grad_norm": 0.011211074888706207, - "learning_rate": 2.7830856941469407e-05, - "loss": 0.013752134144306183, - "step": 8680 - }, - { - "epoch": 0.8141914188201829, - "grad_norm": 0.30479249358177185, - "learning_rate": 2.7747386040125807e-05, - "loss": 0.01313515156507492, - "step": 8700 - }, - { - "epoch": 0.8160631232312637, - "grad_norm": 3.1079516410827637, - "learning_rate": 2.766388412538404e-05, - "loss": 0.013471932709217071, - "step": 8720 - }, - { - "epoch": 0.8179348276423446, - "grad_norm": 0.011288405396044254, - "learning_rate": 2.758035213984121e-05, - "loss": 0.011207062005996703, - "step": 8740 - }, - { - "epoch": 0.8198065320534255, - "grad_norm": 0.011481484398245811, - "learning_rate": 2.749679102643387e-05, - "loss": 0.018254657089710236, - "step": 8760 - }, - { - "epoch": 0.8216782364645063, - "grad_norm": 0.037564992904663086, - "learning_rate": 2.7413201728427372e-05, - "loss": 0.024057184159755707, - "step": 8780 - }, - { - "epoch": 0.8235499408755872, - "grad_norm": 0.03808968514204025, - "learning_rate": 2.7329585189405253e-05, - "loss": 0.006051592528820038, - "step": 8800 - }, - { - "epoch": 0.8254216452866681, - "grad_norm": 0.07610247284173965, - "learning_rate": 2.724594235325852e-05, - "loss": 0.025592076778411865, - "step": 8820 - }, - { - "epoch": 0.827293349697749, - "grad_norm": 0.019049810245633125, - "learning_rate": 2.716227416417505e-05, - "loss": 0.0037486787885427477, - "step": 8840 - }, - { - "epoch": 0.8291650541088299, - "grad_norm": 0.6380273699760437, - "learning_rate": 2.7078581566628897e-05, - "loss": 0.015487492084503174, - "step": 8860 - }, - { - "epoch": 0.8310367585199108, - "grad_norm": 0.05775881186127663, - "learning_rate": 2.699486550536968e-05, - "loss": 0.03133237063884735, - "step": 8880 - }, - { - "epoch": 0.8329084629309916, - "grad_norm": 0.047411222010850906, - "learning_rate": 2.6911126925411845e-05, - "loss": 0.00861177071928978, - "step": 8900 - }, - { - "epoch": 0.8347801673420725, - "grad_norm": 0.23981286585330963, - "learning_rate": 2.682736677202406e-05, - "loss": 0.01839599907398224, - "step": 8920 - }, - { - "epoch": 0.8366518717531534, - "grad_norm": 0.36887305974960327, - "learning_rate": 2.6743585990718505e-05, - "loss": 0.01008533239364624, - "step": 8940 - }, - { - "epoch": 0.8385235761642342, - "grad_norm": 0.8994531035423279, - "learning_rate": 2.6659785527240233e-05, - "loss": 0.027107802033424378, - "step": 8960 - }, - { - "epoch": 0.8403952805753151, - "grad_norm": 0.12780402600765228, - "learning_rate": 2.6575966327556458e-05, - "loss": 0.03549482524394989, - "step": 8980 - }, - { - "epoch": 0.8422669849863961, - "grad_norm": 0.3294568359851837, - "learning_rate": 2.649212933784591e-05, - "loss": 0.02797776460647583, - "step": 9000 - }, - { - "epoch": 0.8441386893974769, - "grad_norm": 0.019461506977677345, - "learning_rate": 2.640827550448812e-05, - "loss": 0.010047334432601928, - "step": 9020 - }, - { - "epoch": 0.8460103938085578, - "grad_norm": 0.056546472012996674, - "learning_rate": 2.6324405774052784e-05, - "loss": 0.02831721007823944, - "step": 9040 - }, - { - "epoch": 0.8478820982196387, - "grad_norm": 0.017190299928188324, - "learning_rate": 2.6240521093289022e-05, - "loss": 0.019623257219791412, - "step": 9060 - }, - { - "epoch": 0.8497538026307195, - "grad_norm": 0.04793965816497803, - "learning_rate": 2.6156622409114728e-05, - "loss": 0.011966148018836975, - "step": 9080 - }, - { - "epoch": 0.8516255070418004, - "grad_norm": 0.006742037367075682, - "learning_rate": 2.607271066860587e-05, - "loss": 0.013694784045219422, - "step": 9100 - }, - { - "epoch": 0.8534972114528813, - "grad_norm": 0.03113027848303318, - "learning_rate": 2.5988786818985812e-05, - "loss": 0.05338943004608154, - "step": 9120 - }, - { - "epoch": 0.8553689158639621, - "grad_norm": 0.6589255928993225, - "learning_rate": 2.5904851807614588e-05, - "loss": 0.01305432766675949, - "step": 9140 - }, - { - "epoch": 0.8572406202750431, - "grad_norm": 0.3030281960964203, - "learning_rate": 2.582090658197825e-05, - "loss": 0.03663805425167084, - "step": 9160 - }, - { - "epoch": 0.859112324686124, - "grad_norm": 0.37101081013679504, - "learning_rate": 2.573695208967814e-05, - "loss": 0.016968609392642976, - "step": 9180 - }, - { - "epoch": 0.8609840290972048, - "grad_norm": 0.7480998039245605, - "learning_rate": 2.5652989278420197e-05, - "loss": 0.021240857243537904, - "step": 9200 - }, - { - "epoch": 0.8628557335082857, - "grad_norm": 0.017131274566054344, - "learning_rate": 2.5569019096004304e-05, - "loss": 0.004783949628472328, - "step": 9220 - }, - { - "epoch": 0.8647274379193666, - "grad_norm": 1.1544040441513062, - "learning_rate": 2.5485042490313504e-05, - "loss": 0.02356208860874176, - "step": 9240 - }, - { - "epoch": 0.8665991423304474, - "grad_norm": 0.13512635231018066, - "learning_rate": 2.540106040930338e-05, - "loss": 0.009329542517662048, - "step": 9260 - }, - { - "epoch": 0.8684708467415283, - "grad_norm": 0.018427839502692223, - "learning_rate": 2.5317073800991304e-05, - "loss": 0.007472375035285949, - "step": 9280 - }, - { - "epoch": 0.8703425511526093, - "grad_norm": 0.02722800336778164, - "learning_rate": 2.5233083613445778e-05, - "loss": 0.020304642617702484, - "step": 9300 - }, - { - "epoch": 0.8722142555636901, - "grad_norm": 0.051702745258808136, - "learning_rate": 2.5149090794775675e-05, - "loss": 0.02955295443534851, - "step": 9320 - }, - { - "epoch": 0.874085959974771, - "grad_norm": 0.1535400152206421, - "learning_rate": 2.5065096293119604e-05, - "loss": 0.030047640204429626, - "step": 9340 - }, - { - "epoch": 0.8759576643858519, - "grad_norm": 0.383573979139328, - "learning_rate": 2.498110105663513e-05, - "loss": 0.011377302557229995, - "step": 9360 - }, - { - "epoch": 0.8778293687969327, - "grad_norm": 0.23541487753391266, - "learning_rate": 2.489710603348817e-05, - "loss": 0.02304387390613556, - "step": 9380 - }, - { - "epoch": 0.8797010732080136, - "grad_norm": 0.029004938900470734, - "learning_rate": 2.4813112171842162e-05, - "loss": 0.020582889020442963, - "step": 9400 - }, - { - "epoch": 0.8815727776190945, - "grad_norm": 0.06564116477966309, - "learning_rate": 2.4729120419847498e-05, - "loss": 0.014207787811756134, - "step": 9420 - }, - { - "epoch": 0.8834444820301753, - "grad_norm": 0.01633615791797638, - "learning_rate": 2.464513172563072e-05, - "loss": 0.01756283938884735, - "step": 9440 - }, - { - "epoch": 0.8853161864412563, - "grad_norm": 0.01287770178169012, - "learning_rate": 2.456114703728386e-05, - "loss": 0.003737853467464447, - "step": 9460 - }, - { - "epoch": 0.8871878908523372, - "grad_norm": 0.05004064738750458, - "learning_rate": 2.448136615728485e-05, - "loss": 0.0324675589799881, - "step": 9480 - }, - { - "epoch": 0.889059595263418, - "grad_norm": 1.20869779586792, - "learning_rate": 2.4397392007153162e-05, - "loss": 0.007156150788068772, - "step": 9500 - }, - { - "epoch": 0.8909312996744989, - "grad_norm": 1.1070218086242676, - "learning_rate": 2.43134246594589e-05, - "loss": 0.009275762736797333, - "step": 9520 - }, - { - "epoch": 0.8928030040855798, - "grad_norm": 0.878593385219574, - "learning_rate": 2.4229465062053136e-05, - "loss": 0.018170186877250673, - "step": 9540 - }, - { - "epoch": 2.236302797078385, - "grad_norm": 0.043156858533620834, - "learning_rate": 1.4461640332194936e-05, - "loss": 0.0786406546831131, - "step": 9560 - }, - { - "epoch": 2.2409820603868678, - "grad_norm": 5.726193904876709, - "learning_rate": 1.4366537531356394e-05, - "loss": 0.15552978515625, - "step": 9580 - }, - { - "epoch": 2.2456613236953507, - "grad_norm": 0.18552646040916443, - "learning_rate": 1.4271622228435674e-05, - "loss": 0.14015564918518067, - "step": 9600 - }, - { - "epoch": 2.2503405870038335, - "grad_norm": 0.08301093429327011, - "learning_rate": 1.4176896097057135e-05, - "loss": 0.0925659716129303, - "step": 9620 - }, - { - "epoch": 2.2550198503123164, - "grad_norm": 0.026666201651096344, - "learning_rate": 1.4082360807509482e-05, - "loss": 0.10548268556594849, - "step": 9640 - }, - { - "epoch": 2.2596991136207993, - "grad_norm": 1.1046574115753174, - "learning_rate": 1.3988018026716371e-05, - "loss": 0.04000181555747986, - "step": 9660 - }, - { - "epoch": 2.2643783769292822, - "grad_norm": 1.3641736507415771, - "learning_rate": 1.3893869418206949e-05, - "loss": 0.08331295847892761, - "step": 9680 - }, - { - "epoch": 2.269057640237765, - "grad_norm": 0.6985445022583008, - "learning_rate": 1.3799916642086585e-05, - "loss": 0.09075057506561279, - "step": 9700 - }, - { - "epoch": 2.273736903546248, - "grad_norm": 0.0306988712400198, - "learning_rate": 1.3706161355007579e-05, - "loss": 0.13399437665939332, - "step": 9720 - }, - { - "epoch": 2.278416166854731, - "grad_norm": 0.2675832509994507, - "learning_rate": 1.3612605210139912e-05, - "loss": 0.1272665500640869, - "step": 9740 - }, - { - "epoch": 2.283095430163214, - "grad_norm": 10.038541793823242, - "learning_rate": 1.3519249857142147e-05, - "loss": 0.118126380443573, - "step": 9760 - }, - { - "epoch": 2.2877746934716967, - "grad_norm": 0.015401734039187431, - "learning_rate": 1.3426096942132305e-05, - "loss": 0.08764986395835876, - "step": 9780 - }, - { - "epoch": 2.2924539567801796, - "grad_norm": 0.030456993728876114, - "learning_rate": 1.3333148107658883e-05, - "loss": 0.11266089677810669, - "step": 9800 - }, - { - "epoch": 2.2971332200886625, - "grad_norm": 0.11291567236185074, - "learning_rate": 1.3240404992671823e-05, - "loss": 0.05411486625671387, - "step": 9820 - }, - { - "epoch": 2.3018124833971454, - "grad_norm": 5.381721496582031, - "learning_rate": 1.3147869232493698e-05, - "loss": 0.17104675769805908, - "step": 9840 - }, - { - "epoch": 2.306491746705628, - "grad_norm": 0.05387361720204353, - "learning_rate": 1.305554245879079e-05, - "loss": 0.03889726996421814, - "step": 9860 - }, - { - "epoch": 2.3111710100141107, - "grad_norm": 0.1429348886013031, - "learning_rate": 1.296342629954439e-05, - "loss": 0.11462880373001098, - "step": 9880 - }, - { - "epoch": 2.3158502733225936, - "grad_norm": 0.8317188024520874, - "learning_rate": 1.2871522379022038e-05, - "loss": 0.04742775857448578, - "step": 9900 - }, - { - "epoch": 2.3205295366310765, - "grad_norm": 0.12162027508020401, - "learning_rate": 1.2779832317748933e-05, - "loss": 0.10024887323379517, - "step": 9920 - }, - { - "epoch": 2.3252087999395594, - "grad_norm": 11.173067092895508, - "learning_rate": 1.2688357732479303e-05, - "loss": 0.10983879566192627, - "step": 9940 - }, - { - "epoch": 2.3298880632480423, - "grad_norm": 0.03047860600054264, - "learning_rate": 1.2597100236167963e-05, - "loss": 0.08037717342376709, - "step": 9960 - }, - { - "epoch": 2.334567326556525, - "grad_norm": 0.0170467309653759, - "learning_rate": 1.2506061437941804e-05, - "loss": 0.033829569816589355, - "step": 9980 - }, - { - "epoch": 2.339246589865008, - "grad_norm": 0.07754386961460114, - "learning_rate": 1.241524294307147e-05, - "loss": 0.04207524955272675, - "step": 10000 - }, - { - "epoch": 2.343925853173491, - "grad_norm": 0.08229111135005951, - "learning_rate": 1.232464635294302e-05, - "loss": 0.04404653012752533, - "step": 10020 - } - ], - "logging_steps": 20, - "max_steps": 14963, - "num_input_tokens_seen": 0, - "num_train_epochs": 4, - "save_steps": 1000000000, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 3.844023334771277e+16, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/slots/19/checkpoint-10023/training_args.bin b/slots/19/checkpoint-10023/training_args.bin deleted file mode 100644 index cba6bf44229020a6cf5d76cffc747dea705142ea..0000000000000000000000000000000000000000 --- a/slots/19/checkpoint-10023/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:66430bba102a8f3dc245713cd6268a99c212c508aacce1d8b9768464f5df26ec -size 5201 diff --git a/slots/19/latest.json b/slots/19/latest.json deleted file mode 100644 index 45f876a1bd8618c7c8e33bd95fc6f09ebd6c69bb..0000000000000000000000000000000000000000 --- a/slots/19/latest.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "slot:19", "checkpoint": "slots/19/checkpoint-10023", "step": 10023, "updated_at": 1776815852} diff --git a/slots/2/latest.json b/slots/2/latest.json deleted file mode 100644 index ef66444cbe1c6cba003284868cb329a787585f96..0000000000000000000000000000000000000000 --- a/slots/2/latest.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "slot:2", "checkpoint": "slots/2/checkpoint-9283", "step": 9283, "updated_at": 1776779921} diff --git a/slots/3/checkpoint-10159/config.json b/slots/3/checkpoint-10159/config.json deleted file mode 100644 index 9e5d8b7224eff16a790758ae86dd97c89afeab74..0000000000000000000000000000000000000000 --- a/slots/3/checkpoint-10159/config.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "architectures": [ - "TwinyForCausalLM" - ], - "attention_dropout": 0.0, - "dtype": "float32", - "hidden_dropout": 0.0, - "hidden_size": 768, - "initializer_range": 0.02, - "intermediate_size": 3072, - "max_position_embeddings": 128, - "model_type": "twiny", - "neftune_alpha": 0.0, - "num_attention_heads": 12, - "num_hidden_layers": 3, - "num_key_value_heads": 3, - "qk_norm": true, - "rezero_init": 1.0, - "rms_norm_eps": 1e-06, - "rope_theta": 10000.0, - "transformers_version": "5.0.0", - "use_cache": false, - "vocab_size": 32000 -} diff --git a/slots/3/checkpoint-10159/model.safetensors b/slots/3/checkpoint-10159/model.safetensors deleted file mode 100644 index 6dd4b3c22491f7069f5f23f45508be36acd7ced3..0000000000000000000000000000000000000000 --- a/slots/3/checkpoint-10159/model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f7e3fb8e0f817a8a84613cc770e1266f16e31efd7eb15117b14c7bf826f38ae8 -size 306388092 diff --git a/slots/3/checkpoint-10159/optimizer.pt b/slots/3/checkpoint-10159/optimizer.pt deleted file mode 100644 index 881a618119d3846f975aae91fd7954eadaa5ef2b..0000000000000000000000000000000000000000 --- a/slots/3/checkpoint-10159/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2dd01a3f1012d672720e75a79a3bbc62d16db6dc4a002a15e489a1dac10bfa99 -size 302484555 diff --git a/slots/3/checkpoint-10159/rng_state.pth b/slots/3/checkpoint-10159/rng_state.pth deleted file mode 100644 index 1feba1a6538e93b94696d3773853dbc8947b0cad..0000000000000000000000000000000000000000 --- a/slots/3/checkpoint-10159/rng_state.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 -size 14645 diff --git a/slots/3/checkpoint-10159/scaler.pt b/slots/3/checkpoint-10159/scaler.pt deleted file mode 100644 index 910ca20007edf47d345e38728f52d3f0d4b3b6b5..0000000000000000000000000000000000000000 --- a/slots/3/checkpoint-10159/scaler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cc6e892dbef0df0b3830e2c843d5652f4a2c85ad5cf966cd03b73832dc1add13 -size 1383 diff --git a/slots/3/checkpoint-10159/scheduler.pt b/slots/3/checkpoint-10159/scheduler.pt deleted file mode 100644 index f968aad92f23c07ddcd595a3ded4865daa47a271..0000000000000000000000000000000000000000 --- a/slots/3/checkpoint-10159/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f9de102974c5f914bc9bbae7d7957e5f5c278e9c972df8afc962805d49349cac -size 1465 diff --git a/slots/3/checkpoint-10159/trainer_state.json b/slots/3/checkpoint-10159/trainer_state.json deleted file mode 100644 index dce309bcfcf7d265b3ae09a4fb3adb22838826dd..0000000000000000000000000000000000000000 --- a/slots/3/checkpoint-10159/trainer_state.json +++ /dev/null @@ -1,3590 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 2.376446733167447, - "eval_steps": 500, - "global_step": 10159, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0001336931908386741, - "grad_norm": Infinity, - "learning_rate": 5e-05, - "loss": 129.80032348632812, - "step": 1 - }, - { - "epoch": 0.002673863816773482, - "grad_norm": 63.64365768432617, - "learning_rate": 4.999995392022967e-05, - "loss": 63.88374408922697, - "step": 20 - }, - { - "epoch": 0.005347727633546964, - "grad_norm": 24.627853393554688, - "learning_rate": 4.999976672145381e-05, - "loss": 12.65963363647461, - "step": 40 - }, - { - "epoch": 0.008021591450320446, - "grad_norm": 14.29983901977539, - "learning_rate": 4.999943552476422e-05, - "loss": 5.90204963684082, - "step": 60 - }, - { - "epoch": 0.010695455267093928, - "grad_norm": 15.690323829650879, - "learning_rate": 4.999896033206858e-05, - "loss": 3.9918922424316405, - "step": 80 - }, - { - "epoch": 0.01336931908386741, - "grad_norm": 31.583160400390625, - "learning_rate": 4.999834114610398e-05, - "loss": 2.9675426483154297, - "step": 100 - }, - { - "epoch": 0.01604318290064089, - "grad_norm": 13.034649848937988, - "learning_rate": 4.999757797043691e-05, - "loss": 2.725296401977539, - "step": 120 - }, - { - "epoch": 0.018717046717414372, - "grad_norm": 8.362203598022461, - "learning_rate": 4.999667080946324e-05, - "loss": 2.2478992462158205, - "step": 140 - }, - { - "epoch": 0.021390910534187856, - "grad_norm": 8.726786613464355, - "learning_rate": 4.999561966840821e-05, - "loss": 1.8447845458984375, - "step": 160 - }, - { - "epoch": 0.024064774350961337, - "grad_norm": 10.092752456665039, - "learning_rate": 4.9994424553326335e-05, - "loss": 1.5611843109130858, - "step": 180 - }, - { - "epoch": 0.02673863816773482, - "grad_norm": 9.090085983276367, - "learning_rate": 4.999308547110146e-05, - "loss": 1.520334815979004, - "step": 200 - }, - { - "epoch": 0.029412501984508302, - "grad_norm": 9.668124198913574, - "learning_rate": 4.999160242944665e-05, - "loss": 1.2818055152893066, - "step": 220 - }, - { - "epoch": 0.03208636580128178, - "grad_norm": 9.182533264160156, - "learning_rate": 4.998997543690418e-05, - "loss": 1.0428407669067383, - "step": 240 - }, - { - "epoch": 0.03476022961805526, - "grad_norm": 5.745838165283203, - "learning_rate": 4.998820450284549e-05, - "loss": 1.2343652725219727, - "step": 260 - }, - { - "epoch": 0.037434093434828744, - "grad_norm": 8.651643753051758, - "learning_rate": 4.99862896374711e-05, - "loss": 0.8859601020812988, - "step": 280 - }, - { - "epoch": 0.04010795725160223, - "grad_norm": 10.765266418457031, - "learning_rate": 4.998423085181056e-05, - "loss": 0.989600658416748, - "step": 300 - }, - { - "epoch": 0.04278182106837571, - "grad_norm": 6.092499256134033, - "learning_rate": 4.998202815772245e-05, - "loss": 0.7189463615417481, - "step": 320 - }, - { - "epoch": 0.04545568488514919, - "grad_norm": 6.352876663208008, - "learning_rate": 4.9979681567894195e-05, - "loss": 0.7489545345306396, - "step": 340 - }, - { - "epoch": 0.048129548701922674, - "grad_norm": 4.620656490325928, - "learning_rate": 4.997719109584209e-05, - "loss": 0.7381401538848877, - "step": 360 - }, - { - "epoch": 0.050803412518696155, - "grad_norm": 7.796917915344238, - "learning_rate": 4.997455675591119e-05, - "loss": 0.5687405109405518, - "step": 380 - }, - { - "epoch": 0.05347727633546964, - "grad_norm": 2.837172508239746, - "learning_rate": 4.9971778563275204e-05, - "loss": 0.5686865329742432, - "step": 400 - }, - { - "epoch": 0.05615114015224312, - "grad_norm": 3.3103690147399902, - "learning_rate": 4.9968856533936436e-05, - "loss": 0.625730562210083, - "step": 420 - }, - { - "epoch": 0.058825003969016604, - "grad_norm": 3.5682132244110107, - "learning_rate": 4.99657906847257e-05, - "loss": 0.6125466346740722, - "step": 440 - }, - { - "epoch": 0.061498867785790085, - "grad_norm": 5.63640832901001, - "learning_rate": 4.996258103330218e-05, - "loss": 0.6182214260101319, - "step": 460 - }, - { - "epoch": 0.06417273160256357, - "grad_norm": 4.698945999145508, - "learning_rate": 4.995922759815339e-05, - "loss": 0.43828091621398924, - "step": 480 - }, - { - "epoch": 0.06684659541933705, - "grad_norm": 2.1976189613342285, - "learning_rate": 4.995573039859501e-05, - "loss": 0.4459230899810791, - "step": 500 - }, - { - "epoch": 0.06952045923611053, - "grad_norm": 3.8809523582458496, - "learning_rate": 4.995208945477081e-05, - "loss": 0.3821882963180542, - "step": 520 - }, - { - "epoch": 0.07219432305288401, - "grad_norm": 3.75144100189209, - "learning_rate": 4.994830478765251e-05, - "loss": 0.5800807476043701, - "step": 540 - }, - { - "epoch": 0.07486818686965749, - "grad_norm": 3.0038585662841797, - "learning_rate": 4.9944376419039684e-05, - "loss": 0.3928264617919922, - "step": 560 - }, - { - "epoch": 0.07754205068643098, - "grad_norm": 3.614591598510742, - "learning_rate": 4.994030437155961e-05, - "loss": 0.48637890815734863, - "step": 580 - }, - { - "epoch": 0.08021591450320446, - "grad_norm": 4.143443584442139, - "learning_rate": 4.993608866866718e-05, - "loss": 0.3650153160095215, - "step": 600 - }, - { - "epoch": 0.08288977831997794, - "grad_norm": 6.692712783813477, - "learning_rate": 4.993172933464471e-05, - "loss": 0.3677916288375854, - "step": 620 - }, - { - "epoch": 0.08556364213675142, - "grad_norm": 8.383441925048828, - "learning_rate": 4.9927226394601815e-05, - "loss": 0.3399480104446411, - "step": 640 - }, - { - "epoch": 0.0882375059535249, - "grad_norm": 5.566338062286377, - "learning_rate": 4.992257987447532e-05, - "loss": 0.28104052543640134, - "step": 660 - }, - { - "epoch": 0.09091136977029839, - "grad_norm": 3.1196420192718506, - "learning_rate": 4.991778980102904e-05, - "loss": 0.351950478553772, - "step": 680 - }, - { - "epoch": 0.09358523358707187, - "grad_norm": 3.47979736328125, - "learning_rate": 4.9912856201853644e-05, - "loss": 0.27501535415649414, - "step": 700 - }, - { - "epoch": 0.09625909740384535, - "grad_norm": 5.446717262268066, - "learning_rate": 4.990777910536653e-05, - "loss": 0.2651593923568726, - "step": 720 - }, - { - "epoch": 0.09893296122061883, - "grad_norm": 7.6145339012146, - "learning_rate": 4.990255854081161e-05, - "loss": 0.35140380859375, - "step": 740 - }, - { - "epoch": 0.10160682503739231, - "grad_norm": 8.445616722106934, - "learning_rate": 4.989719453825918e-05, - "loss": 0.2961219072341919, - "step": 760 - }, - { - "epoch": 0.10428068885416579, - "grad_norm": 6.339537620544434, - "learning_rate": 4.9891687128605744e-05, - "loss": 0.24962289333343507, - "step": 780 - }, - { - "epoch": 0.10695455267093928, - "grad_norm": 3.3369436264038086, - "learning_rate": 4.988603634357383e-05, - "loss": 0.2124847412109375, - "step": 800 - }, - { - "epoch": 0.10962841648771277, - "grad_norm": 2.2909045219421387, - "learning_rate": 4.988024221571177e-05, - "loss": 0.24679112434387207, - "step": 820 - }, - { - "epoch": 0.11230228030448625, - "grad_norm": 3.1149911880493164, - "learning_rate": 4.9874304778393574e-05, - "loss": 0.22161397933959961, - "step": 840 - }, - { - "epoch": 0.11497614412125973, - "grad_norm": 14.802160263061523, - "learning_rate": 4.9868224065818706e-05, - "loss": 0.2623537302017212, - "step": 860 - }, - { - "epoch": 0.11765000793803321, - "grad_norm": 5.586325168609619, - "learning_rate": 4.98620001130119e-05, - "loss": 0.3560942649841309, - "step": 880 - }, - { - "epoch": 0.12032387175480669, - "grad_norm": 3.390017032623291, - "learning_rate": 4.9855632955822916e-05, - "loss": 0.16934787034988402, - "step": 900 - }, - { - "epoch": 0.12299773557158017, - "grad_norm": 6.070940971374512, - "learning_rate": 4.984912263092641e-05, - "loss": 0.2131197214126587, - "step": 920 - }, - { - "epoch": 0.12567159938835365, - "grad_norm": 1.4912281036376953, - "learning_rate": 4.984246917582166e-05, - "loss": 0.25128653049468996, - "step": 940 - }, - { - "epoch": 0.12834546320512713, - "grad_norm": 7.000472545623779, - "learning_rate": 4.9835672628832366e-05, - "loss": 0.2653592586517334, - "step": 960 - }, - { - "epoch": 0.1310193270219006, - "grad_norm": 5.427223205566406, - "learning_rate": 4.9828733029106434e-05, - "loss": 0.1653295636177063, - "step": 980 - }, - { - "epoch": 0.1336931908386741, - "grad_norm": 1.9502102136611938, - "learning_rate": 4.982165041661575e-05, - "loss": 0.2250870943069458, - "step": 1000 - }, - { - "epoch": 0.13636705465544757, - "grad_norm": 0.6216259598731995, - "learning_rate": 4.981442483215595e-05, - "loss": 0.18943849802017212, - "step": 1020 - }, - { - "epoch": 0.13904091847222105, - "grad_norm": 2.3363687992095947, - "learning_rate": 4.98070563173462e-05, - "loss": 0.1673592209815979, - "step": 1040 - }, - { - "epoch": 0.14171478228899453, - "grad_norm": 1.040717601776123, - "learning_rate": 4.979954491462892e-05, - "loss": 0.2113173007965088, - "step": 1060 - }, - { - "epoch": 0.14438864610576801, - "grad_norm": 2.735522747039795, - "learning_rate": 4.979189066726955e-05, - "loss": 0.17504971027374266, - "step": 1080 - }, - { - "epoch": 0.1470625099225415, - "grad_norm": 4.701151371002197, - "learning_rate": 4.978409361935636e-05, - "loss": 0.15881222486495972, - "step": 1100 - }, - { - "epoch": 0.14973637373931498, - "grad_norm": 2.735919237136841, - "learning_rate": 4.9776153815800075e-05, - "loss": 0.14044179916381835, - "step": 1120 - }, - { - "epoch": 0.15241023755608848, - "grad_norm": 3.5479538440704346, - "learning_rate": 4.976807130233375e-05, - "loss": 0.18565714359283447, - "step": 1140 - }, - { - "epoch": 0.15508410137286197, - "grad_norm": 3.2167458534240723, - "learning_rate": 4.975984612551243e-05, - "loss": 0.13236271142959594, - "step": 1160 - }, - { - "epoch": 0.15775796518963545, - "grad_norm": 1.0206760168075562, - "learning_rate": 4.975147833271288e-05, - "loss": 0.19124728441238403, - "step": 1180 - }, - { - "epoch": 0.16043182900640893, - "grad_norm": 4.194457530975342, - "learning_rate": 4.9742967972133335e-05, - "loss": 0.144741427898407, - "step": 1200 - }, - { - "epoch": 0.1631056928231824, - "grad_norm": 3.0225746631622314, - "learning_rate": 4.973431509279323e-05, - "loss": 0.1374324679374695, - "step": 1220 - }, - { - "epoch": 0.1657795566399559, - "grad_norm": 4.243523120880127, - "learning_rate": 4.972551974453287e-05, - "loss": 0.13663809299468993, - "step": 1240 - }, - { - "epoch": 0.16845342045672937, - "grad_norm": 2.4990086555480957, - "learning_rate": 4.971658197801322e-05, - "loss": 0.16817957162857056, - "step": 1260 - }, - { - "epoch": 0.17112728427350285, - "grad_norm": 4.983982563018799, - "learning_rate": 4.9707501844715554e-05, - "loss": 0.13795313835144044, - "step": 1280 - }, - { - "epoch": 0.17380114809027633, - "grad_norm": 3.6780316829681396, - "learning_rate": 4.969827939694115e-05, - "loss": 0.1637880802154541, - "step": 1300 - }, - { - "epoch": 0.1764750119070498, - "grad_norm": 0.7950732707977295, - "learning_rate": 4.968891468781105e-05, - "loss": 0.10979138612747193, - "step": 1320 - }, - { - "epoch": 0.1791488757238233, - "grad_norm": 1.2414121627807617, - "learning_rate": 4.967940777126569e-05, - "loss": 0.13692171573638917, - "step": 1340 - }, - { - "epoch": 0.18182273954059677, - "grad_norm": 2.1383633613586426, - "learning_rate": 4.9669758702064636e-05, - "loss": 0.07821698188781738, - "step": 1360 - }, - { - "epoch": 0.18449660335737025, - "grad_norm": 5.061275959014893, - "learning_rate": 4.965996753578623e-05, - "loss": 0.19053516387939454, - "step": 1380 - }, - { - "epoch": 0.18717046717414373, - "grad_norm": 6.151792049407959, - "learning_rate": 4.9650034328827305e-05, - "loss": 0.11360721588134766, - "step": 1400 - }, - { - "epoch": 0.18984433099091721, - "grad_norm": 1.0604305267333984, - "learning_rate": 4.963995913840284e-05, - "loss": 0.13138024806976317, - "step": 1420 - }, - { - "epoch": 0.1925181948076907, - "grad_norm": 1.7159489393234253, - "learning_rate": 4.9629742022545623e-05, - "loss": 0.08657677173614502, - "step": 1440 - }, - { - "epoch": 0.19519205862446418, - "grad_norm": 2.4207754135131836, - "learning_rate": 4.961938304010595e-05, - "loss": 0.10309149026870727, - "step": 1460 - }, - { - "epoch": 0.19786592244123766, - "grad_norm": 1.532060146331787, - "learning_rate": 4.9608882250751245e-05, - "loss": 0.13628544807434081, - "step": 1480 - }, - { - "epoch": 0.20053978625801114, - "grad_norm": 6.409943580627441, - "learning_rate": 4.959823971496574e-05, - "loss": 0.10584845542907714, - "step": 1500 - }, - { - "epoch": 0.20321365007478462, - "grad_norm": 2.452012538909912, - "learning_rate": 4.9587455494050136e-05, - "loss": 0.06506187915802002, - "step": 1520 - }, - { - "epoch": 0.2058875138915581, - "grad_norm": 5.3016533851623535, - "learning_rate": 4.9576529650121214e-05, - "loss": 0.11848526000976563, - "step": 1540 - }, - { - "epoch": 0.20856137770833158, - "grad_norm": 4.341775894165039, - "learning_rate": 4.956546224611152e-05, - "loss": 0.11318533420562744, - "step": 1560 - }, - { - "epoch": 0.21123524152510506, - "grad_norm": 1.9056169986724854, - "learning_rate": 4.9554253345768965e-05, - "loss": 0.12768398523330687, - "step": 1580 - }, - { - "epoch": 0.21390910534187857, - "grad_norm": 1.8939746618270874, - "learning_rate": 4.9542903013656486e-05, - "loss": 0.10782338380813598, - "step": 1600 - }, - { - "epoch": 0.21658296915865205, - "grad_norm": 8.53671932220459, - "learning_rate": 4.9531411315151654e-05, - "loss": 0.1733921766281128, - "step": 1620 - }, - { - "epoch": 0.21925683297542553, - "grad_norm": 2.0152978897094727, - "learning_rate": 4.951977831644632e-05, - "loss": 0.11197054386138916, - "step": 1640 - }, - { - "epoch": 0.221930696792199, - "grad_norm": 3.8422367572784424, - "learning_rate": 4.95080040845462e-05, - "loss": 0.11441781520843505, - "step": 1660 - }, - { - "epoch": 0.2246045606089725, - "grad_norm": 1.819858193397522, - "learning_rate": 4.949608868727053e-05, - "loss": 0.11403474807739258, - "step": 1680 - }, - { - "epoch": 0.22727842442574597, - "grad_norm": 7.45100212097168, - "learning_rate": 4.948403219325163e-05, - "loss": 0.13117753267288207, - "step": 1700 - }, - { - "epoch": 0.22995228824251945, - "grad_norm": 0.6526040434837341, - "learning_rate": 4.947183467193456e-05, - "loss": 0.07524924874305725, - "step": 1720 - }, - { - "epoch": 0.23262615205929293, - "grad_norm": 3.814746856689453, - "learning_rate": 4.945949619357668e-05, - "loss": 0.07659345269203185, - "step": 1740 - }, - { - "epoch": 0.23530001587606642, - "grad_norm": 2.373124122619629, - "learning_rate": 4.944701682924726e-05, - "loss": 0.1147496223449707, - "step": 1760 - }, - { - "epoch": 0.2379738796928399, - "grad_norm": 0.11161285638809204, - "learning_rate": 4.943439665082707e-05, - "loss": 0.07256829738616943, - "step": 1780 - }, - { - "epoch": 0.24064774350961338, - "grad_norm": 0.45990192890167236, - "learning_rate": 4.942163573100794e-05, - "loss": 0.07726740837097168, - "step": 1800 - }, - { - "epoch": 0.24332160732638686, - "grad_norm": 4.2301926612854, - "learning_rate": 4.940873414329242e-05, - "loss": 0.09349535703659058, - "step": 1820 - }, - { - "epoch": 0.24599547114316034, - "grad_norm": 2.442178726196289, - "learning_rate": 4.939569196199325e-05, - "loss": 0.12413722276687622, - "step": 1840 - }, - { - "epoch": 0.24866933495993382, - "grad_norm": 2.523683786392212, - "learning_rate": 4.938250926223302e-05, - "loss": 0.08566288352012634, - "step": 1860 - }, - { - "epoch": 0.2513431987767073, - "grad_norm": 3.511075258255005, - "learning_rate": 4.936918611994368e-05, - "loss": 0.08007702231407166, - "step": 1880 - }, - { - "epoch": 0.2540170625934808, - "grad_norm": 6.254627704620361, - "learning_rate": 4.935572261186614e-05, - "loss": 0.10983954668045044, - "step": 1900 - }, - { - "epoch": 0.25669092641025426, - "grad_norm": 1.5211899280548096, - "learning_rate": 4.934211881554981e-05, - "loss": 0.09120344519615173, - "step": 1920 - }, - { - "epoch": 0.25936479022702774, - "grad_norm": 2.5893588066101074, - "learning_rate": 4.932837480935214e-05, - "loss": 0.08754412531852722, - "step": 1940 - }, - { - "epoch": 0.2620386540438012, - "grad_norm": 6.878556251525879, - "learning_rate": 4.931449067243821e-05, - "loss": 0.08636274933815002, - "step": 1960 - }, - { - "epoch": 0.2647125178605747, - "grad_norm": 2.9078798294067383, - "learning_rate": 4.9300466484780226e-05, - "loss": 0.09582929015159607, - "step": 1980 - }, - { - "epoch": 0.2673863816773482, - "grad_norm": 3.391852855682373, - "learning_rate": 4.92863023271571e-05, - "loss": 0.0850919783115387, - "step": 2000 - }, - { - "epoch": 0.27006024549412166, - "grad_norm": 5.522103309631348, - "learning_rate": 4.927199828115395e-05, - "loss": 0.050999772548675534, - "step": 2020 - }, - { - "epoch": 0.27273410931089515, - "grad_norm": 0.90350741147995, - "learning_rate": 4.925755442916167e-05, - "loss": 0.10100446939468384, - "step": 2040 - }, - { - "epoch": 0.2754079731276686, - "grad_norm": 1.602030634880066, - "learning_rate": 4.924297085437641e-05, - "loss": 0.0468633770942688, - "step": 2060 - }, - { - "epoch": 0.2780818369444421, - "grad_norm": 1.5823460817337036, - "learning_rate": 4.922824764079913e-05, - "loss": 0.06786358952522278, - "step": 2080 - }, - { - "epoch": 0.2807557007612156, - "grad_norm": 1.6624343395233154, - "learning_rate": 4.92133848732351e-05, - "loss": 0.05772828459739685, - "step": 2100 - }, - { - "epoch": 0.28342956457798907, - "grad_norm": 0.947078287601471, - "learning_rate": 4.9198382637293424e-05, - "loss": 0.08012173175811768, - "step": 2120 - }, - { - "epoch": 0.28610342839476255, - "grad_norm": 0.2919924259185791, - "learning_rate": 4.918324101938653e-05, - "loss": 0.1208539366722107, - "step": 2140 - }, - { - "epoch": 0.28877729221153603, - "grad_norm": 9.258247375488281, - "learning_rate": 4.916796010672969e-05, - "loss": 0.10037034749984741, - "step": 2160 - }, - { - "epoch": 0.2914511560283095, - "grad_norm": 4.0920491218566895, - "learning_rate": 4.915253998734051e-05, - "loss": 0.061488878726959226, - "step": 2180 - }, - { - "epoch": 0.294125019845083, - "grad_norm": 6.1126627922058105, - "learning_rate": 4.913698075003841e-05, - "loss": 0.0862967312335968, - "step": 2200 - }, - { - "epoch": 0.29679888366185647, - "grad_norm": 2.585484743118286, - "learning_rate": 4.912128248444414e-05, - "loss": 0.05393874645233154, - "step": 2220 - }, - { - "epoch": 0.29947274747862995, - "grad_norm": 6.944481372833252, - "learning_rate": 4.9105445280979256e-05, - "loss": 0.08570566773414612, - "step": 2240 - }, - { - "epoch": 0.30214661129540343, - "grad_norm": 1.3824089765548706, - "learning_rate": 4.908946923086556e-05, - "loss": 0.09689127206802368, - "step": 2260 - }, - { - "epoch": 0.30482047511217697, - "grad_norm": 3.4861342906951904, - "learning_rate": 4.907335442612464e-05, - "loss": 0.12550976276397705, - "step": 2280 - }, - { - "epoch": 0.30749433892895045, - "grad_norm": 3.668980121612549, - "learning_rate": 4.905710095957728e-05, - "loss": 0.09089353680610657, - "step": 2300 - }, - { - "epoch": 0.31016820274572393, - "grad_norm": 1.093095064163208, - "learning_rate": 4.904070892484298e-05, - "loss": 0.03925192356109619, - "step": 2320 - }, - { - "epoch": 0.3128420665624974, - "grad_norm": 0.8169485926628113, - "learning_rate": 4.9024178416339364e-05, - "loss": 0.0979581356048584, - "step": 2340 - }, - { - "epoch": 0.3155159303792709, - "grad_norm": 1.892451286315918, - "learning_rate": 4.900750952928166e-05, - "loss": 0.05913209915161133, - "step": 2360 - }, - { - "epoch": 0.3181897941960444, - "grad_norm": 0.24644255638122559, - "learning_rate": 4.8990702359682184e-05, - "loss": 0.06815173625946044, - "step": 2380 - }, - { - "epoch": 0.32086365801281785, - "grad_norm": 2.1861305236816406, - "learning_rate": 4.897375700434972e-05, - "loss": 0.04142785966396332, - "step": 2400 - }, - { - "epoch": 0.32353752182959133, - "grad_norm": 2.6643004417419434, - "learning_rate": 4.8956673560889013e-05, - "loss": 0.05177200436592102, - "step": 2420 - }, - { - "epoch": 0.3262113856463648, - "grad_norm": 2.588113784790039, - "learning_rate": 4.8939452127700195e-05, - "loss": 0.05783546566963196, - "step": 2440 - }, - { - "epoch": 0.3288852494631383, - "grad_norm": 2.419644594192505, - "learning_rate": 4.8922092803978203e-05, - "loss": 0.08906854391098022, - "step": 2460 - }, - { - "epoch": 0.3315591132799118, - "grad_norm": 0.16949939727783203, - "learning_rate": 4.890459568971223e-05, - "loss": 0.10305211544036866, - "step": 2480 - }, - { - "epoch": 0.33423297709668526, - "grad_norm": 0.10032984614372253, - "learning_rate": 4.8886960885685126e-05, - "loss": 0.06348527669906616, - "step": 2500 - }, - { - "epoch": 0.33690684091345874, - "grad_norm": 3.3658738136291504, - "learning_rate": 4.8869188493472854e-05, - "loss": 0.06826075911521912, - "step": 2520 - }, - { - "epoch": 0.3395807047302322, - "grad_norm": 0.8656186461448669, - "learning_rate": 4.885127861544386e-05, - "loss": 0.05929765701293945, - "step": 2540 - }, - { - "epoch": 0.3422545685470057, - "grad_norm": 0.1492065042257309, - "learning_rate": 4.8833231354758496e-05, - "loss": 0.09429731965065002, - "step": 2560 - }, - { - "epoch": 0.3449284323637792, - "grad_norm": 0.6010928153991699, - "learning_rate": 4.881504681536846e-05, - "loss": 0.06262240409851075, - "step": 2580 - }, - { - "epoch": 0.34760229618055266, - "grad_norm": 1.6506450176239014, - "learning_rate": 4.879672510201616e-05, - "loss": 0.061688083410263064, - "step": 2600 - }, - { - "epoch": 0.35027615999732614, - "grad_norm": 0.2703142464160919, - "learning_rate": 4.877826632023412e-05, - "loss": 0.06175137162208557, - "step": 2620 - }, - { - "epoch": 0.3529500238140996, - "grad_norm": 3.1056365966796875, - "learning_rate": 4.875967057634437e-05, - "loss": 0.07828506827354431, - "step": 2640 - }, - { - "epoch": 0.3556238876308731, - "grad_norm": 0.28790283203125, - "learning_rate": 4.874093797745784e-05, - "loss": 0.11355981826782227, - "step": 2660 - }, - { - "epoch": 0.3582977514476466, - "grad_norm": 2.3372068405151367, - "learning_rate": 4.8722068631473746e-05, - "loss": 0.048267871141433716, - "step": 2680 - }, - { - "epoch": 0.36097161526442006, - "grad_norm": 0.12767371535301208, - "learning_rate": 4.8703062647078976e-05, - "loss": 0.04319801032543182, - "step": 2700 - }, - { - "epoch": 0.36364547908119355, - "grad_norm": 0.5145738124847412, - "learning_rate": 4.868392013374741e-05, - "loss": 0.0773090660572052, - "step": 2720 - }, - { - "epoch": 0.366319342897967, - "grad_norm": 0.8518500328063965, - "learning_rate": 4.866464120173937e-05, - "loss": 0.05149460434913635, - "step": 2740 - }, - { - "epoch": 0.3689932067147405, - "grad_norm": 3.6726584434509277, - "learning_rate": 4.8645225962100924e-05, - "loss": 0.06896821856498718, - "step": 2760 - }, - { - "epoch": 0.371667070531514, - "grad_norm": 1.5626497268676758, - "learning_rate": 4.862567452666329e-05, - "loss": 0.047730174660682675, - "step": 2780 - }, - { - "epoch": 0.37434093434828747, - "grad_norm": 6.562028884887695, - "learning_rate": 4.8605987008042144e-05, - "loss": 0.07060698866844177, - "step": 2800 - }, - { - "epoch": 0.37701479816506095, - "grad_norm": 0.7631726861000061, - "learning_rate": 4.8586163519637005e-05, - "loss": 0.04944324493408203, - "step": 2820 - }, - { - "epoch": 0.37968866198183443, - "grad_norm": 1.6982293128967285, - "learning_rate": 4.8566204175630595e-05, - "loss": 0.03000348210334778, - "step": 2840 - }, - { - "epoch": 0.3823625257986079, - "grad_norm": 0.6487429141998291, - "learning_rate": 4.854610909098812e-05, - "loss": 0.06691416501998901, - "step": 2860 - }, - { - "epoch": 0.3850363896153814, - "grad_norm": 0.7648892402648926, - "learning_rate": 4.852587838145668e-05, - "loss": 0.05529783964157105, - "step": 2880 - }, - { - "epoch": 0.38771025343215487, - "grad_norm": 0.11601298302412033, - "learning_rate": 4.850551216356457e-05, - "loss": 0.07780832052230835, - "step": 2900 - }, - { - "epoch": 0.39038411724892835, - "grad_norm": 0.9443137645721436, - "learning_rate": 4.8485010554620594e-05, - "loss": 0.08007023930549621, - "step": 2920 - }, - { - "epoch": 0.39305798106570183, - "grad_norm": 0.8828252553939819, - "learning_rate": 4.846437367271341e-05, - "loss": 0.03541453182697296, - "step": 2940 - }, - { - "epoch": 0.3957318448824753, - "grad_norm": 0.21668888628482819, - "learning_rate": 4.844360163671083e-05, - "loss": 0.08354364633560181, - "step": 2960 - }, - { - "epoch": 0.3984057086992488, - "grad_norm": 0.6840483546257019, - "learning_rate": 4.8422694566259194e-05, - "loss": 0.045807772874832155, - "step": 2980 - }, - { - "epoch": 0.4010795725160223, - "grad_norm": 1.2754698991775513, - "learning_rate": 4.8401652581782584e-05, - "loss": 0.053487342596054074, - "step": 3000 - }, - { - "epoch": 0.40375343633279576, - "grad_norm": 0.19012756645679474, - "learning_rate": 4.838047580448222e-05, - "loss": 0.05881953239440918, - "step": 3020 - }, - { - "epoch": 0.40642730014956924, - "grad_norm": 2.1057698726654053, - "learning_rate": 4.835916435633569e-05, - "loss": 0.031065690517425536, - "step": 3040 - }, - { - "epoch": 0.4091011639663427, - "grad_norm": 4.188559055328369, - "learning_rate": 4.833771836009633e-05, - "loss": 0.07205432653427124, - "step": 3060 - }, - { - "epoch": 0.4117750277831162, - "grad_norm": 6.975829124450684, - "learning_rate": 4.831613793929242e-05, - "loss": 0.04953635036945343, - "step": 3080 - }, - { - "epoch": 0.4144488915998897, - "grad_norm": 4.725269317626953, - "learning_rate": 4.8294423218226546e-05, - "loss": 0.05965519547462463, - "step": 3100 - }, - { - "epoch": 0.41712275541666316, - "grad_norm": 1.7124755382537842, - "learning_rate": 4.827257432197486e-05, - "loss": 0.039625433087348935, - "step": 3120 - }, - { - "epoch": 0.41979661923343664, - "grad_norm": 2.6687324047088623, - "learning_rate": 4.825059137638636e-05, - "loss": 0.05020809769630432, - "step": 3140 - }, - { - "epoch": 0.4224704830502101, - "grad_norm": 1.111640214920044, - "learning_rate": 4.822847450808215e-05, - "loss": 0.04404452443122864, - "step": 3160 - }, - { - "epoch": 0.42514434686698366, - "grad_norm": 0.2128070890903473, - "learning_rate": 4.8206223844454744e-05, - "loss": 0.08283355236053466, - "step": 3180 - }, - { - "epoch": 0.42781821068375714, - "grad_norm": 0.10757248103618622, - "learning_rate": 4.818383951366729e-05, - "loss": 0.08568671345710754, - "step": 3200 - }, - { - "epoch": 0.4304920745005306, - "grad_norm": 0.08344592899084091, - "learning_rate": 4.816132164465289e-05, - "loss": 0.0426956832408905, - "step": 3220 - }, - { - "epoch": 0.4331659383173041, - "grad_norm": 0.5657751560211182, - "learning_rate": 4.813867036711378e-05, - "loss": 0.04971776902675629, - "step": 3240 - }, - { - "epoch": 0.4358398021340776, - "grad_norm": 2.1529288291931152, - "learning_rate": 4.8115885811520654e-05, - "loss": 0.025386181473731995, - "step": 3260 - }, - { - "epoch": 0.43851366595085106, - "grad_norm": 4.228519916534424, - "learning_rate": 4.809296810911188e-05, - "loss": 0.06401395201683044, - "step": 3280 - }, - { - "epoch": 0.44118752976762454, - "grad_norm": 6.770420551300049, - "learning_rate": 4.806991739189274e-05, - "loss": 0.16425553560256959, - "step": 3300 - }, - { - "epoch": 0.443861393584398, - "grad_norm": 0.5303187370300293, - "learning_rate": 4.804673379263467e-05, - "loss": 0.045900467038154605, - "step": 3320 - }, - { - "epoch": 0.4465352574011715, - "grad_norm": 0.221473827958107, - "learning_rate": 4.802341744487453e-05, - "loss": 0.07529735565185547, - "step": 3340 - }, - { - "epoch": 0.449209121217945, - "grad_norm": 3.48736834526062, - "learning_rate": 4.799996848291378e-05, - "loss": 0.062433135509490964, - "step": 3360 - }, - { - "epoch": 0.45188298503471847, - "grad_norm": 2.650038242340088, - "learning_rate": 4.797638704181774e-05, - "loss": 0.03762982189655304, - "step": 3380 - }, - { - "epoch": 0.45455684885149195, - "grad_norm": 3.159665584564209, - "learning_rate": 4.795267325741483e-05, - "loss": 0.04745924174785614, - "step": 3400 - }, - { - "epoch": 0.4572307126682654, - "grad_norm": 0.8763885498046875, - "learning_rate": 4.7928827266295715e-05, - "loss": 0.07380253076553345, - "step": 3420 - }, - { - "epoch": 0.4599045764850389, - "grad_norm": 0.1779366433620453, - "learning_rate": 4.790484920581262e-05, - "loss": 0.045916372537612916, - "step": 3440 - }, - { - "epoch": 0.4625784403018124, - "grad_norm": 1.1228729486465454, - "learning_rate": 4.7880739214078454e-05, - "loss": 0.04461723566055298, - "step": 3460 - }, - { - "epoch": 0.46525230411858587, - "grad_norm": 0.1629919707775116, - "learning_rate": 4.785649742996605e-05, - "loss": 0.017159442603588104, - "step": 3480 - }, - { - "epoch": 0.46792616793535935, - "grad_norm": 3.583951473236084, - "learning_rate": 4.783212399310737e-05, - "loss": 0.047145146131515506, - "step": 3500 - }, - { - "epoch": 0.47060003175213283, - "grad_norm": 0.9766237139701843, - "learning_rate": 4.780761904389267e-05, - "loss": 0.050229442119598386, - "step": 3520 - }, - { - "epoch": 0.4732738955689063, - "grad_norm": 0.05617872253060341, - "learning_rate": 4.778298272346976e-05, - "loss": 0.024862812459468843, - "step": 3540 - }, - { - "epoch": 0.4759477593856798, - "grad_norm": 1.3586453199386597, - "learning_rate": 4.775821517374308e-05, - "loss": 0.02117772251367569, - "step": 3560 - }, - { - "epoch": 0.4786216232024533, - "grad_norm": 1.2116742134094238, - "learning_rate": 4.7733316537373006e-05, - "loss": 0.03060794174671173, - "step": 3580 - }, - { - "epoch": 0.48129548701922675, - "grad_norm": 0.39403238892555237, - "learning_rate": 4.770828695777493e-05, - "loss": 0.05482668280601501, - "step": 3600 - }, - { - "epoch": 0.48396935083600023, - "grad_norm": 0.9248486161231995, - "learning_rate": 4.7683126579118495e-05, - "loss": 0.03612814247608185, - "step": 3620 - }, - { - "epoch": 0.4866432146527737, - "grad_norm": 0.1624649167060852, - "learning_rate": 4.7657835546326736e-05, - "loss": 0.04334873259067536, - "step": 3640 - }, - { - "epoch": 0.4893170784695472, - "grad_norm": 0.5321119427680969, - "learning_rate": 4.763241400507524e-05, - "loss": 0.0461233913898468, - "step": 3660 - }, - { - "epoch": 0.4919909422863207, - "grad_norm": 0.34861093759536743, - "learning_rate": 4.760686210179133e-05, - "loss": 0.024829554557800292, - "step": 3680 - }, - { - "epoch": 0.49466480610309416, - "grad_norm": 1.2561241388320923, - "learning_rate": 4.758117998365322e-05, - "loss": 0.03157005608081818, - "step": 3700 - }, - { - "epoch": 0.49733866991986764, - "grad_norm": 0.8691341280937195, - "learning_rate": 4.7555367798589146e-05, - "loss": 0.04310203492641449, - "step": 3720 - }, - { - "epoch": 0.5000125337366411, - "grad_norm": 0.3134572505950928, - "learning_rate": 4.752942569527653e-05, - "loss": 0.03796039223670959, - "step": 3740 - }, - { - "epoch": 0.5026863975534146, - "grad_norm": 2.3359289169311523, - "learning_rate": 4.75033538231411e-05, - "loss": 0.055599170923233035, - "step": 3760 - }, - { - "epoch": 0.5053602613701881, - "grad_norm": 7.426175594329834, - "learning_rate": 4.747715233235608e-05, - "loss": 0.054436272382736205, - "step": 3780 - }, - { - "epoch": 0.5080341251869616, - "grad_norm": 0.5940203070640564, - "learning_rate": 4.745082137384128e-05, - "loss": 0.03682814538478851, - "step": 3800 - }, - { - "epoch": 0.510707989003735, - "grad_norm": 0.22821389138698578, - "learning_rate": 4.7424361099262225e-05, - "loss": 0.051123309135437014, - "step": 3820 - }, - { - "epoch": 0.5133818528205085, - "grad_norm": 8.20633602142334, - "learning_rate": 4.739777166102932e-05, - "loss": 0.0704378604888916, - "step": 3840 - }, - { - "epoch": 0.516055716637282, - "grad_norm": 3.023848533630371, - "learning_rate": 4.737105321229694e-05, - "loss": 0.03368058800697327, - "step": 3860 - }, - { - "epoch": 0.5187295804540555, - "grad_norm": 0.07666649669408798, - "learning_rate": 4.7344205906962555e-05, - "loss": 0.03665303289890289, - "step": 3880 - }, - { - "epoch": 0.521403444270829, - "grad_norm": 0.7571629881858826, - "learning_rate": 4.731722989966585e-05, - "loss": 0.058415502309799194, - "step": 3900 - }, - { - "epoch": 0.5240773080876024, - "grad_norm": 3.2599120140075684, - "learning_rate": 4.7290125345787816e-05, - "loss": 0.07323018908500671, - "step": 3920 - }, - { - "epoch": 0.5267511719043759, - "grad_norm": 0.28930988907814026, - "learning_rate": 4.7262892401449886e-05, - "loss": 0.054371267557144165, - "step": 3940 - }, - { - "epoch": 0.5294250357211494, - "grad_norm": 2.2296454906463623, - "learning_rate": 4.7235531223513004e-05, - "loss": 0.040819621086120604, - "step": 3960 - }, - { - "epoch": 0.5320988995379229, - "grad_norm": 0.11608211696147919, - "learning_rate": 4.720804196957675e-05, - "loss": 0.05215579271316528, - "step": 3980 - }, - { - "epoch": 0.5347727633546964, - "grad_norm": 1.1587547063827515, - "learning_rate": 4.7180424797978415e-05, - "loss": 0.026277875900268553, - "step": 4000 - }, - { - "epoch": 0.5374466271714698, - "grad_norm": 0.06253435462713242, - "learning_rate": 4.7152679867792074e-05, - "loss": 0.02574407756328583, - "step": 4020 - }, - { - "epoch": 0.5401204909882433, - "grad_norm": 1.3441458940505981, - "learning_rate": 4.71248073388277e-05, - "loss": 0.05538107752799988, - "step": 4040 - }, - { - "epoch": 0.5427943548050168, - "grad_norm": 0.48076340556144714, - "learning_rate": 4.7096807371630236e-05, - "loss": 0.047986540198326114, - "step": 4060 - }, - { - "epoch": 0.5454682186217903, - "grad_norm": 0.5924936532974243, - "learning_rate": 4.706868012747867e-05, - "loss": 0.05463914275169372, - "step": 4080 - }, - { - "epoch": 0.7673995566395854, - "grad_norm": 0.05143728107213974, - "learning_rate": 4.431151627307268e-05, - "loss": 0.00959376593430837, - "step": 4100 - }, - { - "epoch": 0.771142969110998, - "grad_norm": 1.2308074235916138, - "learning_rate": 4.425806509248848e-05, - "loss": 0.002745623141527176, - "step": 4120 - }, - { - "epoch": 0.7748863815824106, - "grad_norm": 2.080223798751831, - "learning_rate": 4.420439652052499e-05, - "loss": 0.012390998750925064, - "step": 4140 - }, - { - "epoch": 0.7786297940538233, - "grad_norm": 0.049312230199575424, - "learning_rate": 4.415051116301072e-05, - "loss": 0.004607534408569336, - "step": 4160 - }, - { - "epoch": 0.7823732065252359, - "grad_norm": 0.07747476547956467, - "learning_rate": 4.409640962822132e-05, - "loss": 0.034441503882408145, - "step": 4180 - }, - { - "epoch": 0.7861166189966485, - "grad_norm": 0.021327875554561615, - "learning_rate": 4.404209252687275e-05, - "loss": 0.009768449515104295, - "step": 4200 - }, - { - "epoch": 0.789860031468061, - "grad_norm": 2.406580924987793, - "learning_rate": 4.398756047211431e-05, - "loss": 0.005304037779569626, - "step": 4220 - }, - { - "epoch": 0.7936034439394737, - "grad_norm": 0.027869906276464462, - "learning_rate": 4.39328140795218e-05, - "loss": 0.00896073654294014, - "step": 4240 - }, - { - "epoch": 0.7973468564108863, - "grad_norm": 0.09702044725418091, - "learning_rate": 4.387785396709052e-05, - "loss": 0.0117533378303051, - "step": 4260 - }, - { - "epoch": 0.801090268882299, - "grad_norm": 0.529065728187561, - "learning_rate": 4.382268075522831e-05, - "loss": 0.0037526611238718035, - "step": 4280 - }, - { - "epoch": 0.8048336813537116, - "grad_norm": 0.015109462663531303, - "learning_rate": 4.3767295066748564e-05, - "loss": 0.0025708725675940513, - "step": 4300 - }, - { - "epoch": 0.8085770938251241, - "grad_norm": 0.7257627248764038, - "learning_rate": 4.371169752686316e-05, - "loss": 0.006234285607933998, - "step": 4320 - }, - { - "epoch": 0.8123205062965367, - "grad_norm": 0.016853008419275284, - "learning_rate": 4.3655888763175436e-05, - "loss": 0.0023587727919220924, - "step": 4340 - }, - { - "epoch": 0.8160639187679494, - "grad_norm": 0.017816167324781418, - "learning_rate": 4.3599869405673085e-05, - "loss": 0.0012389549054205417, - "step": 4360 - }, - { - "epoch": 0.819807331239362, - "grad_norm": 0.014672616496682167, - "learning_rate": 4.354364008672106e-05, - "loss": 0.002244691364467144, - "step": 4380 - }, - { - "epoch": 0.8235507437107746, - "grad_norm": 0.044869400560855865, - "learning_rate": 4.3487201441054435e-05, - "loss": 0.007713723182678223, - "step": 4400 - }, - { - "epoch": 0.8272941561821872, - "grad_norm": 0.06367291510105133, - "learning_rate": 4.343055410577122e-05, - "loss": 0.005743256583809852, - "step": 4420 - }, - { - "epoch": 0.8310375686535998, - "grad_norm": 0.1354215145111084, - "learning_rate": 4.3373698720325176e-05, - "loss": 0.009635470807552338, - "step": 4440 - }, - { - "epoch": 0.8347809811250124, - "grad_norm": 0.9089844822883606, - "learning_rate": 4.331663592651862e-05, - "loss": 0.01007603257894516, - "step": 4460 - }, - { - "epoch": 0.838524393596425, - "grad_norm": 0.025831619277596474, - "learning_rate": 4.3259366368495167e-05, - "loss": 0.006179215386509895, - "step": 4480 - }, - { - "epoch": 0.8422678060678377, - "grad_norm": 0.016653764992952347, - "learning_rate": 4.320189069273243e-05, - "loss": 0.0025156451389193534, - "step": 4500 - }, - { - "epoch": 0.8460112185392502, - "grad_norm": 0.27361780405044556, - "learning_rate": 4.3144209548034766e-05, - "loss": 0.002235286869108677, - "step": 4520 - }, - { - "epoch": 0.8497546310106628, - "grad_norm": 2.6958701610565186, - "learning_rate": 4.3086323585525915e-05, - "loss": 0.03571180701255798, - "step": 4540 - }, - { - "epoch": 0.8534980434820755, - "grad_norm": 0.1260778158903122, - "learning_rate": 4.3028233458641696e-05, - "loss": 0.0036518506705760954, - "step": 4560 - }, - { - "epoch": 0.8572414559534881, - "grad_norm": 0.2445528209209442, - "learning_rate": 4.2969939823122586e-05, - "loss": 0.024949796497821808, - "step": 4580 - }, - { - "epoch": 0.8609848684249007, - "grad_norm": 0.1674242913722992, - "learning_rate": 4.291144333700633e-05, - "loss": 0.002089798077940941, - "step": 4600 - }, - { - "epoch": 0.8647282808963134, - "grad_norm": 0.05161884427070618, - "learning_rate": 4.2852744660620515e-05, - "loss": 0.007847145944833756, - "step": 4620 - }, - { - "epoch": 0.8684716933677259, - "grad_norm": 0.019796324893832207, - "learning_rate": 4.279384445657514e-05, - "loss": 0.0023555334657430647, - "step": 4640 - }, - { - "epoch": 0.8722151058391385, - "grad_norm": 0.0647754967212677, - "learning_rate": 4.2734743389755096e-05, - "loss": 0.009586349129676819, - "step": 4660 - }, - { - "epoch": 0.8759585183105512, - "grad_norm": 0.015243460424244404, - "learning_rate": 4.267544212731268e-05, - "loss": 0.017788709700107576, - "step": 4680 - }, - { - "epoch": 0.8797019307819638, - "grad_norm": 0.05756703019142151, - "learning_rate": 4.261594133866007e-05, - "loss": 0.014256520569324494, - "step": 4700 - }, - { - "epoch": 0.8834453432533764, - "grad_norm": 0.2002931535243988, - "learning_rate": 4.255624169546175e-05, - "loss": 0.0014025470241904258, - "step": 4720 - }, - { - "epoch": 0.887188755724789, - "grad_norm": 0.04325389489531517, - "learning_rate": 4.249634387162696e-05, - "loss": 0.010552891343832017, - "step": 4740 - }, - { - "epoch": 0.8909321681962016, - "grad_norm": 0.8975178599357605, - "learning_rate": 4.243624854330206e-05, - "loss": 0.0032475266605615618, - "step": 4760 - }, - { - "epoch": 0.8946755806676142, - "grad_norm": 0.01541830413043499, - "learning_rate": 4.237595638886288e-05, - "loss": 0.003157203644514084, - "step": 4780 - }, - { - "epoch": 0.8984189931390268, - "grad_norm": 1.673305869102478, - "learning_rate": 4.231546808890713e-05, - "loss": 0.0028239911422133445, - "step": 4800 - }, - { - "epoch": 0.9021624056104395, - "grad_norm": 0.021689629182219505, - "learning_rate": 4.225478432624665e-05, - "loss": 0.0026885712519288062, - "step": 4820 - }, - { - "epoch": 0.905905818081852, - "grad_norm": 0.019590798765420914, - "learning_rate": 4.219390578589973e-05, - "loss": 0.00780024379491806, - "step": 4840 - }, - { - "epoch": 0.9096492305532646, - "grad_norm": 0.024581020697951317, - "learning_rate": 4.213283315508337e-05, - "loss": 0.006697511672973633, - "step": 4860 - }, - { - "epoch": 0.9133926430246773, - "grad_norm": 0.20615583658218384, - "learning_rate": 4.207156712320555e-05, - "loss": 0.007314208894968033, - "step": 4880 - }, - { - "epoch": 0.9171360554960899, - "grad_norm": 0.015673745423555374, - "learning_rate": 4.20101083818574e-05, - "loss": 0.004841562733054161, - "step": 4900 - }, - { - "epoch": 0.9208794679675025, - "grad_norm": 0.008306623436510563, - "learning_rate": 4.194845762480544e-05, - "loss": 0.0010150263085961341, - "step": 4920 - }, - { - "epoch": 0.9246228804389152, - "grad_norm": 0.051861703395843506, - "learning_rate": 4.188661554798369e-05, - "loss": 0.011043114960193634, - "step": 4940 - }, - { - "epoch": 0.9283662929103277, - "grad_norm": 1.7019767761230469, - "learning_rate": 4.1824582849485884e-05, - "loss": 0.004985674470663071, - "step": 4960 - }, - { - "epoch": 0.9321097053817403, - "grad_norm": 0.021240154281258583, - "learning_rate": 4.176236022955755e-05, - "loss": 0.04885836541652679, - "step": 4980 - }, - { - "epoch": 0.935853117853153, - "grad_norm": 0.016504865139722824, - "learning_rate": 4.16999483905881e-05, - "loss": 0.0027378931641578673, - "step": 5000 - }, - { - "epoch": 0.9395965303245656, - "grad_norm": 0.014015628024935722, - "learning_rate": 4.163734803710294e-05, - "loss": 0.012781022489070893, - "step": 5020 - }, - { - "epoch": 0.9433399427959782, - "grad_norm": 0.013812500052154064, - "learning_rate": 4.157455987575545e-05, - "loss": 0.007508871704339981, - "step": 5040 - }, - { - "epoch": 0.9470833552673907, - "grad_norm": 0.01622290164232254, - "learning_rate": 4.1511584615319075e-05, - "loss": 0.0014614147134125234, - "step": 5060 - }, - { - "epoch": 0.9508267677388034, - "grad_norm": 0.01259149145334959, - "learning_rate": 4.144842296667929e-05, - "loss": 0.006202424317598343, - "step": 5080 - }, - { - "epoch": 0.954570180210216, - "grad_norm": 0.012383027002215385, - "learning_rate": 4.138507564282558e-05, - "loss": 0.006122353300452232, - "step": 5100 - }, - { - "epoch": 0.9583135926816286, - "grad_norm": 0.006499920971691608, - "learning_rate": 4.1321543358843385e-05, - "loss": 0.0008865024894475937, - "step": 5120 - }, - { - "epoch": 0.9620570051530413, - "grad_norm": 0.00830752868205309, - "learning_rate": 4.125782683190606e-05, - "loss": 0.0008420860394835472, - "step": 5140 - }, - { - "epoch": 0.9658004176244538, - "grad_norm": 0.01525857299566269, - "learning_rate": 4.119392678126673e-05, - "loss": 0.00587364137172699, - "step": 5160 - }, - { - "epoch": 0.9695438300958664, - "grad_norm": 0.01072095800191164, - "learning_rate": 4.11298439282502e-05, - "loss": 0.00853007659316063, - "step": 5180 - }, - { - "epoch": 0.973287242567279, - "grad_norm": 0.030316641554236412, - "learning_rate": 4.106557899624482e-05, - "loss": 0.0058747071772813795, - "step": 5200 - }, - { - "epoch": 0.9770306550386917, - "grad_norm": 0.0391647033393383, - "learning_rate": 4.1001132710694304e-05, - "loss": 0.0034765828400850295, - "step": 5220 - }, - { - "epoch": 0.9807740675101043, - "grad_norm": 0.04938298836350441, - "learning_rate": 4.093650579908953e-05, - "loss": 0.007594724744558334, - "step": 5240 - }, - { - "epoch": 0.984517479981517, - "grad_norm": 0.005873252172023058, - "learning_rate": 4.087169899096037e-05, - "loss": 0.013347607851028443, - "step": 5260 - }, - { - "epoch": 0.9882608924529295, - "grad_norm": 1.2757259607315063, - "learning_rate": 4.080671301786741e-05, - "loss": 0.004837355017662049, - "step": 5280 - }, - { - "epoch": 0.9920043049243421, - "grad_norm": 0.00920735765248537, - "learning_rate": 4.0741548613393675e-05, - "loss": 0.007415445148944854, - "step": 5300 - }, - { - "epoch": 0.9957477173957547, - "grad_norm": 0.5702093839645386, - "learning_rate": 4.067620651313647e-05, - "loss": 0.00406576506793499, - "step": 5320 - }, - { - "epoch": 0.9994911298671674, - "grad_norm": 1.8361051082611084, - "learning_rate": 4.0610687454698906e-05, - "loss": 0.00997612327337265, - "step": 5340 - }, - { - "epoch": 1.0031819006007008, - "grad_norm": 3.335326910018921, - "learning_rate": 4.0544992177681685e-05, - "loss": 0.008442799001932145, - "step": 5360 - }, - { - "epoch": 1.0069253130721134, - "grad_norm": 0.03184954449534416, - "learning_rate": 4.047912142367473e-05, - "loss": 0.008095134049654007, - "step": 5380 - }, - { - "epoch": 1.010668725543526, - "grad_norm": 0.029989074915647507, - "learning_rate": 4.04130759362488e-05, - "loss": 0.0012585990130901336, - "step": 5400 - }, - { - "epoch": 1.0144121380149385, - "grad_norm": 0.08727464079856873, - "learning_rate": 4.034685646094711e-05, - "loss": 0.012588074803352356, - "step": 5420 - }, - { - "epoch": 1.018155550486351, - "grad_norm": 0.018498806282877922, - "learning_rate": 4.028046374527689e-05, - "loss": 0.001854238100349903, - "step": 5440 - }, - { - "epoch": 1.0218989629577637, - "grad_norm": 0.013779236935079098, - "learning_rate": 4.021389853870095e-05, - "loss": 0.0008004569448530674, - "step": 5460 - }, - { - "epoch": 1.0256423754291764, - "grad_norm": 0.028235070407390594, - "learning_rate": 4.0147161592629306e-05, - "loss": 0.002274145185947418, - "step": 5480 - }, - { - "epoch": 1.029385787900589, - "grad_norm": 0.023030120879411697, - "learning_rate": 4.008025366041055e-05, - "loss": 0.008717305958271027, - "step": 5500 - }, - { - "epoch": 1.0331292003720016, - "grad_norm": 0.018347155302762985, - "learning_rate": 4.001317549732345e-05, - "loss": 0.00244256854057312, - "step": 5520 - }, - { - "epoch": 1.0368726128434143, - "grad_norm": 0.03449391946196556, - "learning_rate": 3.99459278605684e-05, - "loss": 0.0039924226701259615, - "step": 5540 - }, - { - "epoch": 1.0406160253148269, - "grad_norm": 0.030406463891267776, - "learning_rate": 3.9878511509258866e-05, - "loss": 0.0021008485928177834, - "step": 5560 - }, - { - "epoch": 1.0443594377862395, - "grad_norm": 0.01783100888133049, - "learning_rate": 3.9810927204412803e-05, - "loss": 0.0006656501442193985, - "step": 5580 - }, - { - "epoch": 1.0481028502576522, - "grad_norm": 0.05360455811023712, - "learning_rate": 3.974317570894413e-05, - "loss": 0.005278818309307098, - "step": 5600 - }, - { - "epoch": 1.0518462627290646, - "grad_norm": 0.008699169382452965, - "learning_rate": 3.9675257787654e-05, - "loss": 0.005309444293379784, - "step": 5620 - }, - { - "epoch": 1.0555896752004772, - "grad_norm": 0.036641959100961685, - "learning_rate": 3.960717420722227e-05, - "loss": 0.0034692320972681046, - "step": 5640 - }, - { - "epoch": 1.0593330876718898, - "grad_norm": 0.012212110683321953, - "learning_rate": 3.953892573619883e-05, - "loss": 0.005343861132860184, - "step": 5660 - }, - { - "epoch": 1.0630765001433025, - "grad_norm": 0.011296284385025501, - "learning_rate": 3.947051314499489e-05, - "loss": 0.0038058970123529432, - "step": 5680 - }, - { - "epoch": 1.066819912614715, - "grad_norm": 0.05954049900174141, - "learning_rate": 3.94019372058743e-05, - "loss": 0.008142991364002228, - "step": 5700 - }, - { - "epoch": 1.0705633250861277, - "grad_norm": 0.03478416055440903, - "learning_rate": 3.933319869294483e-05, - "loss": 0.0075227849185466765, - "step": 5720 - }, - { - "epoch": 1.0743067375575404, - "grad_norm": 0.014586996287107468, - "learning_rate": 3.9264298382149455e-05, - "loss": 0.0036750122904777526, - "step": 5740 - }, - { - "epoch": 1.078050150028953, - "grad_norm": 0.025754544883966446, - "learning_rate": 3.919523705125757e-05, - "loss": 0.004151013493537903, - "step": 5760 - }, - { - "epoch": 1.0817935625003656, - "grad_norm": 0.03239905461668968, - "learning_rate": 3.9126015479856205e-05, - "loss": 0.00861695185303688, - "step": 5780 - }, - { - "epoch": 1.0855369749717783, - "grad_norm": 0.03506994619965553, - "learning_rate": 3.9056634449341256e-05, - "loss": 0.003123755753040314, - "step": 5800 - }, - { - "epoch": 1.089280387443191, - "grad_norm": 0.0286911278963089, - "learning_rate": 3.898709474290864e-05, - "loss": 0.002537376619875431, - "step": 5820 - }, - { - "epoch": 1.0930237999146033, - "grad_norm": 0.03490692004561424, - "learning_rate": 3.8917397145545454e-05, - "loss": 0.0010227372869849205, - "step": 5840 - }, - { - "epoch": 1.096767212386016, - "grad_norm": 0.013748899102210999, - "learning_rate": 3.884754244402113e-05, - "loss": 0.011847371608018875, - "step": 5860 - }, - { - "epoch": 1.1005106248574286, - "grad_norm": 0.035458195954561234, - "learning_rate": 3.877753142687852e-05, - "loss": 0.009741749614477158, - "step": 5880 - }, - { - "epoch": 1.1042540373288412, - "grad_norm": 0.012493673712015152, - "learning_rate": 3.8707364884425064e-05, - "loss": 0.006607493013143539, - "step": 5900 - }, - { - "epoch": 1.1079974498002538, - "grad_norm": 0.018607834354043007, - "learning_rate": 3.863704360872378e-05, - "loss": 0.0016217166557908058, - "step": 5920 - }, - { - "epoch": 1.1117408622716665, - "grad_norm": 0.0283930953592062, - "learning_rate": 3.8566568393584366e-05, - "loss": 0.002083975449204445, - "step": 5940 - }, - { - "epoch": 1.115484274743079, - "grad_norm": 0.05229801684617996, - "learning_rate": 3.8495940034554283e-05, - "loss": 0.0014217685908079146, - "step": 5960 - }, - { - "epoch": 1.1192276872144917, - "grad_norm": 0.008808930404484272, - "learning_rate": 3.8425159328909684e-05, - "loss": 0.0022570645436644555, - "step": 5980 - }, - { - "epoch": 1.1229710996859044, - "grad_norm": 0.020502232015132904, - "learning_rate": 3.835422707564648e-05, - "loss": 0.003745942190289497, - "step": 6000 - }, - { - "epoch": 1.126714512157317, - "grad_norm": 0.032347094267606735, - "learning_rate": 3.82831440754713e-05, - "loss": 0.003347185626626015, - "step": 6020 - }, - { - "epoch": 1.1304579246287294, - "grad_norm": 0.020310478284955025, - "learning_rate": 3.821191113079246e-05, - "loss": 0.006166417896747589, - "step": 6040 - }, - { - "epoch": 1.134201337100142, - "grad_norm": 0.06390372663736343, - "learning_rate": 3.8140529045710876e-05, - "loss": 0.0013674044981598853, - "step": 6060 - }, - { - "epoch": 1.1379447495715547, - "grad_norm": 1.1938918828964233, - "learning_rate": 3.806899862601105e-05, - "loss": 0.010550644248723984, - "step": 6080 - }, - { - "epoch": 1.1416881620429673, - "grad_norm": 0.035355549305677414, - "learning_rate": 3.799732067915189e-05, - "loss": 0.0069750770926475525, - "step": 6100 - }, - { - "epoch": 1.14543157451438, - "grad_norm": 0.009921093471348286, - "learning_rate": 3.792549601425767e-05, - "loss": 0.0027949588373303415, - "step": 6120 - }, - { - "epoch": 1.1491749869857926, - "grad_norm": 0.06172063946723938, - "learning_rate": 3.785352544210884e-05, - "loss": 0.0009372101165354251, - "step": 6140 - }, - { - "epoch": 1.1529183994572052, - "grad_norm": 0.008572470396757126, - "learning_rate": 3.778140977513294e-05, - "loss": 0.0029502738267183303, - "step": 6160 - }, - { - "epoch": 1.1566618119286178, - "grad_norm": 0.4211727976799011, - "learning_rate": 3.770914982739534e-05, - "loss": 0.014692296087741853, - "step": 6180 - }, - { - "epoch": 1.1604052244000305, - "grad_norm": 0.02292146533727646, - "learning_rate": 3.7636746414590126e-05, - "loss": 0.0020170681178569793, - "step": 6200 - }, - { - "epoch": 1.164148636871443, - "grad_norm": 0.11247449368238449, - "learning_rate": 3.756420035403086e-05, - "loss": 0.006851900368928909, - "step": 6220 - }, - { - "epoch": 1.1678920493428557, - "grad_norm": 0.020755017176270485, - "learning_rate": 3.749151246464137e-05, - "loss": 0.0021739909425377846, - "step": 6240 - }, - { - "epoch": 1.1716354618142684, - "grad_norm": 0.017202025279402733, - "learning_rate": 3.741868356694647e-05, - "loss": 0.002353278361260891, - "step": 6260 - }, - { - "epoch": 1.1753788742856808, - "grad_norm": 0.014947429299354553, - "learning_rate": 3.734571448306274e-05, - "loss": 0.0010860362090170383, - "step": 6280 - }, - { - "epoch": 1.1791222867570934, - "grad_norm": 1.5391262769699097, - "learning_rate": 3.727260603668922e-05, - "loss": 0.01233254000544548, - "step": 6300 - }, - { - "epoch": 1.182865699228506, - "grad_norm": 0.4759792387485504, - "learning_rate": 3.7199359053098133e-05, - "loss": 0.0028501398861408233, - "step": 6320 - }, - { - "epoch": 1.1866091116999187, - "grad_norm": 0.01719040609896183, - "learning_rate": 3.7125974359125536e-05, - "loss": 0.00934450700879097, - "step": 6340 - }, - { - "epoch": 1.1903525241713313, - "grad_norm": 2.4766688346862793, - "learning_rate": 3.7052452783162015e-05, - "loss": 0.018582724034786224, - "step": 6360 - }, - { - "epoch": 1.194095936642744, - "grad_norm": 0.11404932290315628, - "learning_rate": 3.6978795155143326e-05, - "loss": 0.01815672367811203, - "step": 6380 - }, - { - "epoch": 1.1978393491141566, - "grad_norm": 0.021365633234381676, - "learning_rate": 3.690500230654103e-05, - "loss": 0.004123781993985176, - "step": 6400 - }, - { - "epoch": 1.2015827615855692, - "grad_norm": 0.022478772327303886, - "learning_rate": 3.68310750703531e-05, - "loss": 0.0038731731474399567, - "step": 6420 - }, - { - "epoch": 1.2053261740569818, - "grad_norm": 0.15531578660011292, - "learning_rate": 3.67570142810945e-05, - "loss": 0.002076444961130619, - "step": 6440 - }, - { - "epoch": 1.2090695865283942, - "grad_norm": 0.012458150275051594, - "learning_rate": 3.668282077478783e-05, - "loss": 0.0027592860162258146, - "step": 6460 - }, - { - "epoch": 1.2128129989998069, - "grad_norm": 0.01572798565030098, - "learning_rate": 3.66084953889538e-05, - "loss": 0.002740098722279072, - "step": 6480 - }, - { - "epoch": 1.2165564114712195, - "grad_norm": 0.13682503998279572, - "learning_rate": 3.6534038962601835e-05, - "loss": 0.000705425813794136, - "step": 6500 - }, - { - "epoch": 1.2202998239426321, - "grad_norm": 0.030630914494395256, - "learning_rate": 3.64594523362206e-05, - "loss": 0.012480729073286057, - "step": 6520 - }, - { - "epoch": 1.2240432364140448, - "grad_norm": 0.024804554879665375, - "learning_rate": 3.638473635176848e-05, - "loss": 0.0007834361866116523, - "step": 6540 - }, - { - "epoch": 1.2277866488854574, - "grad_norm": 0.011334752663969994, - "learning_rate": 3.630989185266411e-05, - "loss": 0.022086825966835023, - "step": 6560 - }, - { - "epoch": 1.23153006135687, - "grad_norm": 0.020346902310848236, - "learning_rate": 3.623491968377684e-05, - "loss": 0.018024472892284392, - "step": 6580 - }, - { - "epoch": 1.2352734738282827, - "grad_norm": 0.015177210792899132, - "learning_rate": 3.615982069141719e-05, - "loss": 0.005251453071832657, - "step": 6600 - }, - { - "epoch": 1.2390168862996953, - "grad_norm": 0.013680647127330303, - "learning_rate": 3.608459572332733e-05, - "loss": 0.006734563410282135, - "step": 6620 - }, - { - "epoch": 1.242760298771108, - "grad_norm": 0.17980872094631195, - "learning_rate": 3.600924562867144e-05, - "loss": 0.003970410302281379, - "step": 6640 - }, - { - "epoch": 1.2465037112425206, - "grad_norm": 0.015203841030597687, - "learning_rate": 3.593377125802622e-05, - "loss": 0.0032148901373147964, - "step": 6660 - }, - { - "epoch": 1.2502471237139332, - "grad_norm": 0.017300931736826897, - "learning_rate": 3.585817346337119e-05, - "loss": 0.00467667318880558, - "step": 6680 - }, - { - "epoch": 1.2539905361853458, - "grad_norm": 0.028181765228509903, - "learning_rate": 3.5782453098079175e-05, - "loss": 0.0015515764243900776, - "step": 6700 - }, - { - "epoch": 1.2577339486567582, - "grad_norm": 0.01730780117213726, - "learning_rate": 3.570661101690657e-05, - "loss": 0.007991334050893783, - "step": 6720 - }, - { - "epoch": 1.2614773611281709, - "grad_norm": 0.014216347597539425, - "learning_rate": 3.5630648075983763e-05, - "loss": 0.002533360943198204, - "step": 6740 - }, - { - "epoch": 1.2652207735995835, - "grad_norm": 0.1556195169687271, - "learning_rate": 3.555456513280544e-05, - "loss": 0.0032653655856847764, - "step": 6760 - }, - { - "epoch": 1.2689641860709961, - "grad_norm": 0.023955868557095528, - "learning_rate": 3.5478363046220915e-05, - "loss": 0.00850408971309662, - "step": 6780 - }, - { - "epoch": 1.2727075985424088, - "grad_norm": 0.17874136567115784, - "learning_rate": 3.5402042676424424e-05, - "loss": 0.0032720811665058135, - "step": 6800 - }, - { - "epoch": 1.2764510110138214, - "grad_norm": 0.0899379625916481, - "learning_rate": 3.5325604884945434e-05, - "loss": 0.003243798017501831, - "step": 6820 - }, - { - "epoch": 1.280194423485234, - "grad_norm": 0.413362056016922, - "learning_rate": 3.5249050534638906e-05, - "loss": 0.0036127623170614243, - "step": 6840 - }, - { - "epoch": 1.2839378359566467, - "grad_norm": 0.02790931612253189, - "learning_rate": 3.517238048967554e-05, - "loss": 0.008225285261869431, - "step": 6860 - }, - { - "epoch": 1.287681248428059, - "grad_norm": 0.6761110424995422, - "learning_rate": 3.5095595615532056e-05, - "loss": 0.00199942234903574, - "step": 6880 - }, - { - "epoch": 1.2914246608994717, - "grad_norm": 4.593618869781494, - "learning_rate": 3.5018696778981385e-05, - "loss": 0.007301987707614898, - "step": 6900 - }, - { - "epoch": 1.2951680733708844, - "grad_norm": 0.09392693638801575, - "learning_rate": 3.494168484808293e-05, - "loss": 0.009008315950632095, - "step": 6920 - }, - { - "epoch": 1.298911485842297, - "grad_norm": 0.008239852264523506, - "learning_rate": 3.48645606921727e-05, - "loss": 0.012661360204219818, - "step": 6940 - }, - { - "epoch": 1.3026548983137096, - "grad_norm": 0.05141177773475647, - "learning_rate": 3.4787325181853576e-05, - "loss": 0.0007553372532129287, - "step": 6960 - }, - { - "epoch": 1.3063983107851223, - "grad_norm": 0.024333903566002846, - "learning_rate": 3.470997918898541e-05, - "loss": 0.0016128463670611382, - "step": 6980 - }, - { - "epoch": 1.3101417232565349, - "grad_norm": 0.0337531715631485, - "learning_rate": 3.4632523586675254e-05, - "loss": 0.003253454715013504, - "step": 7000 - }, - { - "epoch": 1.3138851357279475, - "grad_norm": 0.05121550336480141, - "learning_rate": 3.4554959249267436e-05, - "loss": 0.0026307271793484686, - "step": 7020 - }, - { - "epoch": 1.3176285481993602, - "grad_norm": 0.025997543707489967, - "learning_rate": 3.447728705233374e-05, - "loss": 0.0012719514779746532, - "step": 7040 - }, - { - "epoch": 1.3213719606707728, - "grad_norm": 0.009486268274486065, - "learning_rate": 3.4399507872663494e-05, - "loss": 0.002009082958102226, - "step": 7060 - }, - { - "epoch": 1.3251153731421854, - "grad_norm": 0.016816232353448868, - "learning_rate": 3.432162258825369e-05, - "loss": 0.0005956823006272316, - "step": 7080 - }, - { - "epoch": 1.328858785613598, - "grad_norm": 0.004733961541205645, - "learning_rate": 3.424363207829906e-05, - "loss": 0.003636709600687027, - "step": 7100 - }, - { - "epoch": 1.3326021980850107, - "grad_norm": 3.666203498840332, - "learning_rate": 3.4165537223182155e-05, - "loss": 0.010488419234752655, - "step": 7120 - }, - { - "epoch": 1.336345610556423, - "grad_norm": 0.021471882238984108, - "learning_rate": 3.408733890446341e-05, - "loss": 0.0009709249250590801, - "step": 7140 - }, - { - "epoch": 1.3400890230278357, - "grad_norm": 0.007639541756361723, - "learning_rate": 3.40090380048712e-05, - "loss": 0.0030905861407518388, - "step": 7160 - }, - { - "epoch": 1.3438324354992484, - "grad_norm": 0.16878941655158997, - "learning_rate": 3.393063540829186e-05, - "loss": 0.0036965351551771163, - "step": 7180 - }, - { - "epoch": 1.347575847970661, - "grad_norm": 0.07014094293117523, - "learning_rate": 3.385213199975971e-05, - "loss": 0.0005677144508808851, - "step": 7200 - }, - { - "epoch": 1.3513192604420736, - "grad_norm": 0.008626374416053295, - "learning_rate": 3.377352866544706e-05, - "loss": 0.0005447934381663799, - "step": 7220 - }, - { - "epoch": 1.3550626729134863, - "grad_norm": 0.013825134374201298, - "learning_rate": 3.3694826292654246e-05, - "loss": 0.004854041337966919, - "step": 7240 - }, - { - "epoch": 1.3588060853848989, - "grad_norm": 0.025015883147716522, - "learning_rate": 3.361602576979956e-05, - "loss": 0.004542553424835205, - "step": 7260 - }, - { - "epoch": 1.3625494978563115, - "grad_norm": 0.009614030830562115, - "learning_rate": 3.353712798640923e-05, - "loss": 0.0008775785565376282, - "step": 7280 - }, - { - "epoch": 1.366292910327724, - "grad_norm": 3.8835268020629883, - "learning_rate": 3.345813383310744e-05, - "loss": 0.0063879616558551785, - "step": 7300 - }, - { - "epoch": 1.3700363227991366, - "grad_norm": 0.005518193822354078, - "learning_rate": 3.337904420160618e-05, - "loss": 0.0010956574231386184, - "step": 7320 - }, - { - "epoch": 1.3737797352705492, - "grad_norm": 0.005018322728574276, - "learning_rate": 3.329985998469526e-05, - "loss": 0.0012317843735218047, - "step": 7340 - }, - { - "epoch": 0.6887872232777639, - "grad_norm": 0.3108454942703247, - "learning_rate": 3.322058207623218e-05, - "loss": 0.010070423781871795, - "step": 7360 - }, - { - "epoch": 0.6906589276888447, - "grad_norm": 0.3556046783924103, - "learning_rate": 3.314121137113209e-05, - "loss": 0.0278738796710968, - "step": 7380 - }, - { - "epoch": 0.6925306320999256, - "grad_norm": 4.041794300079346, - "learning_rate": 3.306174876535762e-05, - "loss": 0.025335192680358887, - "step": 7400 - }, - { - "epoch": 0.6944023365110065, - "grad_norm": 0.04647493362426758, - "learning_rate": 3.2982195155908845e-05, - "loss": 0.05056847333908081, - "step": 7420 - }, - { - "epoch": 0.6962740409220873, - "grad_norm": 0.6827419400215149, - "learning_rate": 3.290653575270209e-05, - "loss": 0.036053261160850524, - "step": 7440 - }, - { - "epoch": 0.6981457453331683, - "grad_norm": 0.256136029958725, - "learning_rate": 3.2826807269966064e-05, - "loss": 0.020640365779399872, - "step": 7460 - }, - { - "epoch": 0.7000174497442492, - "grad_norm": 0.2054845094680786, - "learning_rate": 3.274699043565268e-05, - "loss": 0.03456352353096008, - "step": 7480 - }, - { - "epoch": 0.70188915415533, - "grad_norm": 0.2027648538351059, - "learning_rate": 3.266708615076064e-05, - "loss": 0.00846734493970871, - "step": 7500 - }, - { - "epoch": 0.7037608585664109, - "grad_norm": 1.6423311233520508, - "learning_rate": 3.258709531727582e-05, - "loss": 0.054978948831558225, - "step": 7520 - }, - { - "epoch": 0.7056325629774918, - "grad_norm": 1.775089144706726, - "learning_rate": 3.2507018838161085e-05, - "loss": 0.03238933086395264, - "step": 7540 - }, - { - "epoch": 0.7075042673885726, - "grad_norm": 0.06917860358953476, - "learning_rate": 3.242685761734609e-05, - "loss": 0.016849520802497863, - "step": 7560 - }, - { - "epoch": 0.7093759717996535, - "grad_norm": 0.051443129777908325, - "learning_rate": 3.2346612559717094e-05, - "loss": 0.048251998424530027, - "step": 7580 - }, - { - "epoch": 0.7112476762107344, - "grad_norm": 0.06533925980329514, - "learning_rate": 3.226628457110672e-05, - "loss": 0.03696450293064117, - "step": 7600 - }, - { - "epoch": 0.7131193806218153, - "grad_norm": 0.45661595463752747, - "learning_rate": 3.218587455828377e-05, - "loss": 0.05503013730049133, - "step": 7620 - }, - { - "epoch": 0.7149910850328962, - "grad_norm": 2.0205914974212646, - "learning_rate": 3.210538342894291e-05, - "loss": 0.033562681078910826, - "step": 7640 - }, - { - "epoch": 0.7168627894439771, - "grad_norm": 2.4842448234558105, - "learning_rate": 3.202481209169455e-05, - "loss": 0.019278638064861298, - "step": 7660 - }, - { - "epoch": 0.7187344938550579, - "grad_norm": 0.10550081729888916, - "learning_rate": 3.1944161456054436e-05, - "loss": 0.01638232171535492, - "step": 7680 - }, - { - "epoch": 0.7206061982661388, - "grad_norm": 1.606436014175415, - "learning_rate": 3.1863432432433506e-05, - "loss": 0.020552067458629607, - "step": 7700 - }, - { - "epoch": 0.7224779026772197, - "grad_norm": 0.2617719769477844, - "learning_rate": 3.178262593212757e-05, - "loss": 0.02315783053636551, - "step": 7720 - }, - { - "epoch": 0.7243496070883005, - "grad_norm": 0.9734074473381042, - "learning_rate": 3.1701742867307e-05, - "loss": 0.01938771307468414, - "step": 7740 - }, - { - "epoch": 0.7262213114993814, - "grad_norm": 0.5882985591888428, - "learning_rate": 3.162078415100647e-05, - "loss": 0.011305707693099975, - "step": 7760 - }, - { - "epoch": 0.7280930159104624, - "grad_norm": 0.04298723489046097, - "learning_rate": 3.15397506971146e-05, - "loss": 0.04238930344581604, - "step": 7780 - }, - { - "epoch": 0.7299647203215432, - "grad_norm": 6.2729315757751465, - "learning_rate": 3.145864342036372e-05, - "loss": 0.030225831270217895, - "step": 7800 - }, - { - "epoch": 0.7318364247326241, - "grad_norm": 0.026423340663313866, - "learning_rate": 3.1377463236319476e-05, - "loss": 0.012169972807168961, - "step": 7820 - }, - { - "epoch": 0.733708129143705, - "grad_norm": 0.0296376533806324, - "learning_rate": 3.1296211061370495e-05, - "loss": 0.015344823896884918, - "step": 7840 - }, - { - "epoch": 0.7355798335547858, - "grad_norm": 0.029524821788072586, - "learning_rate": 3.1214887812718094e-05, - "loss": 0.028345003724098206, - "step": 7860 - }, - { - "epoch": 0.7374515379658667, - "grad_norm": 0.06847794353961945, - "learning_rate": 3.113349440836588e-05, - "loss": 0.020069575309753417, - "step": 7880 - }, - { - "epoch": 0.7393232423769476, - "grad_norm": 0.024868430569767952, - "learning_rate": 3.1052031767109376e-05, - "loss": 0.014262473583221436, - "step": 7900 - }, - { - "epoch": 0.7411949467880286, - "grad_norm": 0.24450063705444336, - "learning_rate": 3.097050080852573e-05, - "loss": 0.04350808262825012, - "step": 7920 - }, - { - "epoch": 0.7430666511991094, - "grad_norm": 0.06978324800729752, - "learning_rate": 3.088890245296322e-05, - "loss": 0.015559709072113037, - "step": 7940 - }, - { - "epoch": 0.7449383556101903, - "grad_norm": 0.12675604224205017, - "learning_rate": 3.0807237621530964e-05, - "loss": 0.013867451250553131, - "step": 7960 - }, - { - "epoch": 0.7468100600212711, - "grad_norm": 0.2605513334274292, - "learning_rate": 3.072550723608846e-05, - "loss": 0.012869009375572204, - "step": 7980 - }, - { - "epoch": 0.748681764432352, - "grad_norm": 3.325530529022217, - "learning_rate": 3.064371221923521e-05, - "loss": 0.03036353886127472, - "step": 8000 - }, - { - "epoch": 0.7505534688434329, - "grad_norm": 0.22703051567077637, - "learning_rate": 3.0561853494300294e-05, - "loss": 0.009017374366521835, - "step": 8020 - }, - { - "epoch": 0.7524251732545137, - "grad_norm": 6.404862880706787, - "learning_rate": 3.047993198533195e-05, - "loss": 0.020604299008846284, - "step": 8040 - }, - { - "epoch": 0.7542968776655946, - "grad_norm": 0.06491954624652863, - "learning_rate": 3.039794861708714e-05, - "loss": 0.014963623881340028, - "step": 8060 - }, - { - "epoch": 0.7561685820766756, - "grad_norm": 0.4990088641643524, - "learning_rate": 3.0315904315021128e-05, - "loss": 0.02046530395746231, - "step": 8080 - }, - { - "epoch": 0.7580402864877565, - "grad_norm": 0.3174229562282562, - "learning_rate": 3.023380000527699e-05, - "loss": 0.013621781766414643, - "step": 8100 - }, - { - "epoch": 0.7599119908988373, - "grad_norm": 0.07161428034305573, - "learning_rate": 3.0151636614675218e-05, - "loss": 0.008043503761291504, - "step": 8120 - }, - { - "epoch": 0.7617836953099182, - "grad_norm": 0.6772736310958862, - "learning_rate": 3.0069415070703217e-05, - "loss": 0.03563189804553986, - "step": 8140 - }, - { - "epoch": 0.763655399720999, - "grad_norm": 0.07689516246318817, - "learning_rate": 2.998713630150485e-05, - "loss": 0.008622632920742035, - "step": 8160 - }, - { - "epoch": 0.7655271041320799, - "grad_norm": 0.014181110076606274, - "learning_rate": 2.990480123586994e-05, - "loss": 0.012368627637624741, - "step": 8180 - }, - { - "epoch": 0.7673988085431608, - "grad_norm": 4.4751715660095215, - "learning_rate": 2.9822410803223822e-05, - "loss": 0.02100955694913864, - "step": 8200 - }, - { - "epoch": 0.7692705129542416, - "grad_norm": 0.12694527208805084, - "learning_rate": 2.9739965933616825e-05, - "loss": 0.018182000517845152, - "step": 8220 - }, - { - "epoch": 0.7711422173653226, - "grad_norm": 0.13789872825145721, - "learning_rate": 2.9657467557713792e-05, - "loss": 0.008949784934520722, - "step": 8240 - }, - { - "epoch": 0.7730139217764035, - "grad_norm": 0.04048463702201843, - "learning_rate": 2.957491660678354e-05, - "loss": 0.03582434058189392, - "step": 8260 - }, - { - "epoch": 0.7748856261874844, - "grad_norm": 0.7825964689254761, - "learning_rate": 2.9492314012688378e-05, - "loss": 0.012679101526737213, - "step": 8280 - }, - { - "epoch": 0.7767573305985652, - "grad_norm": 0.14350314438343048, - "learning_rate": 2.9409660707873597e-05, - "loss": 0.010909486562013626, - "step": 8300 - }, - { - "epoch": 0.7786290350096461, - "grad_norm": 0.17676737904548645, - "learning_rate": 2.932695762535691e-05, - "loss": 0.01464642733335495, - "step": 8320 - }, - { - "epoch": 0.780500739420727, - "grad_norm": 0.5979751348495483, - "learning_rate": 2.9244205698717943e-05, - "loss": 0.028799059987068176, - "step": 8340 - }, - { - "epoch": 0.7823724438318078, - "grad_norm": 0.08448052406311035, - "learning_rate": 2.9161405862087676e-05, - "loss": 0.014056096971035003, - "step": 8360 - }, - { - "epoch": 0.7842441482428888, - "grad_norm": 0.5616207122802734, - "learning_rate": 2.9078559050137955e-05, - "loss": 0.008744364231824875, - "step": 8380 - }, - { - "epoch": 0.7861158526539697, - "grad_norm": 0.7264829277992249, - "learning_rate": 2.8995666198070836e-05, - "loss": 0.014575870335102081, - "step": 8400 - }, - { - "epoch": 0.7879875570650505, - "grad_norm": 1.444239616394043, - "learning_rate": 2.891272824160815e-05, - "loss": 0.01230706349015236, - "step": 8420 - }, - { - "epoch": 0.7898592614761314, - "grad_norm": 0.02643579989671707, - "learning_rate": 2.882974611698084e-05, - "loss": 0.01713460832834244, - "step": 8440 - }, - { - "epoch": 0.7917309658872123, - "grad_norm": 0.19893163442611694, - "learning_rate": 2.8746720760918457e-05, - "loss": 0.009562552720308305, - "step": 8460 - }, - { - "epoch": 0.7936026702982931, - "grad_norm": 1.8813897371292114, - "learning_rate": 2.866365311063855e-05, - "loss": 0.01966284364461899, - "step": 8480 - }, - { - "epoch": 0.795474374709374, - "grad_norm": 0.1820579618215561, - "learning_rate": 2.8580544103836114e-05, - "loss": 0.023943188786506652, - "step": 8500 - }, - { - "epoch": 0.7973460791204549, - "grad_norm": 1.3913259506225586, - "learning_rate": 2.849739467867298e-05, - "loss": 0.02233349084854126, - "step": 8520 - }, - { - "epoch": 0.7992177835315358, - "grad_norm": 0.28450486063957214, - "learning_rate": 2.8414205773767223e-05, - "loss": 0.016230446100234986, - "step": 8540 - }, - { - "epoch": 0.8010894879426167, - "grad_norm": 0.46086356043815613, - "learning_rate": 2.83309783281826e-05, - "loss": 0.013964855670928955, - "step": 8560 - }, - { - "epoch": 0.8029611923536976, - "grad_norm": 1.1401137113571167, - "learning_rate": 2.8247713281417924e-05, - "loss": 0.01552264392375946, - "step": 8580 - }, - { - "epoch": 0.8048328967647784, - "grad_norm": 0.02414649911224842, - "learning_rate": 2.8164411573396444e-05, - "loss": 0.00505053773522377, - "step": 8600 - }, - { - "epoch": 0.8067046011758593, - "grad_norm": 0.029010778293013573, - "learning_rate": 2.8081074144455276e-05, - "loss": 0.008068422973155975, - "step": 8620 - }, - { - "epoch": 0.8085763055869402, - "grad_norm": 0.024924319237470627, - "learning_rate": 2.7997701935334747e-05, - "loss": 0.021529987454414368, - "step": 8640 - }, - { - "epoch": 0.810448009998021, - "grad_norm": 0.3544171154499054, - "learning_rate": 2.791429588716782e-05, - "loss": 0.008264218270778657, - "step": 8660 - }, - { - "epoch": 0.8123197144091019, - "grad_norm": 0.011211074888706207, - "learning_rate": 2.7830856941469407e-05, - "loss": 0.013752134144306183, - "step": 8680 - }, - { - "epoch": 0.8141914188201829, - "grad_norm": 0.30479249358177185, - "learning_rate": 2.7747386040125807e-05, - "loss": 0.01313515156507492, - "step": 8700 - }, - { - "epoch": 0.8160631232312637, - "grad_norm": 3.1079516410827637, - "learning_rate": 2.766388412538404e-05, - "loss": 0.013471932709217071, - "step": 8720 - }, - { - "epoch": 0.8179348276423446, - "grad_norm": 0.011288405396044254, - "learning_rate": 2.758035213984121e-05, - "loss": 0.011207062005996703, - "step": 8740 - }, - { - "epoch": 0.8198065320534255, - "grad_norm": 0.011481484398245811, - "learning_rate": 2.749679102643387e-05, - "loss": 0.018254657089710236, - "step": 8760 - }, - { - "epoch": 0.8216782364645063, - "grad_norm": 0.037564992904663086, - "learning_rate": 2.7413201728427372e-05, - "loss": 0.024057184159755707, - "step": 8780 - }, - { - "epoch": 0.8235499408755872, - "grad_norm": 0.03808968514204025, - "learning_rate": 2.7329585189405253e-05, - "loss": 0.006051592528820038, - "step": 8800 - }, - { - "epoch": 0.8254216452866681, - "grad_norm": 0.07610247284173965, - "learning_rate": 2.724594235325852e-05, - "loss": 0.025592076778411865, - "step": 8820 - }, - { - "epoch": 0.827293349697749, - "grad_norm": 0.019049810245633125, - "learning_rate": 2.716227416417505e-05, - "loss": 0.0037486787885427477, - "step": 8840 - }, - { - "epoch": 0.8291650541088299, - "grad_norm": 0.6380273699760437, - "learning_rate": 2.7078581566628897e-05, - "loss": 0.015487492084503174, - "step": 8860 - }, - { - "epoch": 0.8310367585199108, - "grad_norm": 0.05775881186127663, - "learning_rate": 2.699486550536968e-05, - "loss": 0.03133237063884735, - "step": 8880 - }, - { - "epoch": 0.8329084629309916, - "grad_norm": 0.047411222010850906, - "learning_rate": 2.6911126925411845e-05, - "loss": 0.00861177071928978, - "step": 8900 - }, - { - "epoch": 0.8347801673420725, - "grad_norm": 0.23981286585330963, - "learning_rate": 2.682736677202406e-05, - "loss": 0.01839599907398224, - "step": 8920 - }, - { - "epoch": 0.8366518717531534, - "grad_norm": 0.36887305974960327, - "learning_rate": 2.6743585990718505e-05, - "loss": 0.01008533239364624, - "step": 8940 - }, - { - "epoch": 0.8385235761642342, - "grad_norm": 0.8994531035423279, - "learning_rate": 2.6659785527240233e-05, - "loss": 0.027107802033424378, - "step": 8960 - }, - { - "epoch": 0.8403952805753151, - "grad_norm": 0.12780402600765228, - "learning_rate": 2.6575966327556458e-05, - "loss": 0.03549482524394989, - "step": 8980 - }, - { - "epoch": 0.8422669849863961, - "grad_norm": 0.3294568359851837, - "learning_rate": 2.649212933784591e-05, - "loss": 0.02797776460647583, - "step": 9000 - }, - { - "epoch": 0.8441386893974769, - "grad_norm": 0.019461506977677345, - "learning_rate": 2.640827550448812e-05, - "loss": 0.010047334432601928, - "step": 9020 - }, - { - "epoch": 0.8460103938085578, - "grad_norm": 0.056546472012996674, - "learning_rate": 2.6324405774052784e-05, - "loss": 0.02831721007823944, - "step": 9040 - }, - { - "epoch": 0.8478820982196387, - "grad_norm": 0.017190299928188324, - "learning_rate": 2.6240521093289022e-05, - "loss": 0.019623257219791412, - "step": 9060 - }, - { - "epoch": 0.8497538026307195, - "grad_norm": 0.04793965816497803, - "learning_rate": 2.6156622409114728e-05, - "loss": 0.011966148018836975, - "step": 9080 - }, - { - "epoch": 0.8516255070418004, - "grad_norm": 0.006742037367075682, - "learning_rate": 2.607271066860587e-05, - "loss": 0.013694784045219422, - "step": 9100 - }, - { - "epoch": 0.8534972114528813, - "grad_norm": 0.03113027848303318, - "learning_rate": 2.5988786818985812e-05, - "loss": 0.05338943004608154, - "step": 9120 - }, - { - "epoch": 0.8553689158639621, - "grad_norm": 0.6589255928993225, - "learning_rate": 2.5904851807614588e-05, - "loss": 0.01305432766675949, - "step": 9140 - }, - { - "epoch": 0.8572406202750431, - "grad_norm": 0.3030281960964203, - "learning_rate": 2.582090658197825e-05, - "loss": 0.03663805425167084, - "step": 9160 - }, - { - "epoch": 0.859112324686124, - "grad_norm": 0.37101081013679504, - "learning_rate": 2.573695208967814e-05, - "loss": 0.016968609392642976, - "step": 9180 - }, - { - "epoch": 0.8609840290972048, - "grad_norm": 0.7480998039245605, - "learning_rate": 2.5652989278420197e-05, - "loss": 0.021240857243537904, - "step": 9200 - }, - { - "epoch": 0.8628557335082857, - "grad_norm": 0.017131274566054344, - "learning_rate": 2.5569019096004304e-05, - "loss": 0.004783949628472328, - "step": 9220 - }, - { - "epoch": 0.8647274379193666, - "grad_norm": 1.1544040441513062, - "learning_rate": 2.5485042490313504e-05, - "loss": 0.02356208860874176, - "step": 9240 - }, - { - "epoch": 0.8665991423304474, - "grad_norm": 0.13512635231018066, - "learning_rate": 2.540106040930338e-05, - "loss": 0.009329542517662048, - "step": 9260 - }, - { - "epoch": 0.8684708467415283, - "grad_norm": 0.018427839502692223, - "learning_rate": 2.5317073800991304e-05, - "loss": 0.007472375035285949, - "step": 9280 - }, - { - "epoch": 0.8703425511526093, - "grad_norm": 0.02722800336778164, - "learning_rate": 2.5233083613445778e-05, - "loss": 0.020304642617702484, - "step": 9300 - }, - { - "epoch": 0.8722142555636901, - "grad_norm": 0.051702745258808136, - "learning_rate": 2.5149090794775675e-05, - "loss": 0.02955295443534851, - "step": 9320 - }, - { - "epoch": 0.874085959974771, - "grad_norm": 0.1535400152206421, - "learning_rate": 2.5065096293119604e-05, - "loss": 0.030047640204429626, - "step": 9340 - }, - { - "epoch": 0.8759576643858519, - "grad_norm": 0.383573979139328, - "learning_rate": 2.498110105663513e-05, - "loss": 0.011377302557229995, - "step": 9360 - }, - { - "epoch": 0.8778293687969327, - "grad_norm": 0.23541487753391266, - "learning_rate": 2.489710603348817e-05, - "loss": 0.02304387390613556, - "step": 9380 - }, - { - "epoch": 0.8797010732080136, - "grad_norm": 0.029004938900470734, - "learning_rate": 2.4813112171842162e-05, - "loss": 0.020582889020442963, - "step": 9400 - }, - { - "epoch": 0.8815727776190945, - "grad_norm": 0.06564116477966309, - "learning_rate": 2.4729120419847498e-05, - "loss": 0.014207787811756134, - "step": 9420 - }, - { - "epoch": 0.8834444820301753, - "grad_norm": 0.01633615791797638, - "learning_rate": 2.464513172563072e-05, - "loss": 0.01756283938884735, - "step": 9440 - }, - { - "epoch": 0.8853161864412563, - "grad_norm": 0.01287770178169012, - "learning_rate": 2.456114703728386e-05, - "loss": 0.003737853467464447, - "step": 9460 - }, - { - "epoch": 0.8871878908523372, - "grad_norm": 0.05004064738750458, - "learning_rate": 2.448136615728485e-05, - "loss": 0.0324675589799881, - "step": 9480 - }, - { - "epoch": 0.889059595263418, - "grad_norm": 1.20869779586792, - "learning_rate": 2.4397392007153162e-05, - "loss": 0.007156150788068772, - "step": 9500 - }, - { - "epoch": 0.8909312996744989, - "grad_norm": 1.1070218086242676, - "learning_rate": 2.43134246594589e-05, - "loss": 0.009275762736797333, - "step": 9520 - }, - { - "epoch": 0.8928030040855798, - "grad_norm": 0.878593385219574, - "learning_rate": 2.4229465062053136e-05, - "loss": 0.018170186877250673, - "step": 9540 - }, - { - "epoch": 2.236302797078385, - "grad_norm": 2.294339179992676, - "learning_rate": 1.4461640332194936e-05, - "loss": 0.07619959115982056, - "step": 9560 - }, - { - "epoch": 2.2409820603868678, - "grad_norm": 0.2697487473487854, - "learning_rate": 1.4366537531356394e-05, - "loss": 0.08616560101509094, - "step": 9580 - }, - { - "epoch": 2.2456613236953507, - "grad_norm": 1.5392569303512573, - "learning_rate": 1.4271622228435674e-05, - "loss": 0.052218639850616456, - "step": 9600 - }, - { - "epoch": 2.2503405870038335, - "grad_norm": 2.0239648818969727, - "learning_rate": 1.4176896097057135e-05, - "loss": 0.08808050155639649, - "step": 9620 - }, - { - "epoch": 2.2550198503123164, - "grad_norm": 1.629538655281067, - "learning_rate": 1.4082360807509482e-05, - "loss": 0.07276531457901, - "step": 9640 - }, - { - "epoch": 2.2596991136207993, - "grad_norm": 1.7065048217773438, - "learning_rate": 1.3988018026716371e-05, - "loss": 0.05087214708328247, - "step": 9660 - }, - { - "epoch": 2.2643783769292822, - "grad_norm": 0.10258202999830246, - "learning_rate": 1.3893869418206949e-05, - "loss": 0.05631760954856872, - "step": 9680 - }, - { - "epoch": 2.269057640237765, - "grad_norm": 0.08703255653381348, - "learning_rate": 1.3799916642086585e-05, - "loss": 0.05722883343696594, - "step": 9700 - }, - { - "epoch": 2.273736903546248, - "grad_norm": 0.6752107739448547, - "learning_rate": 1.3706161355007579e-05, - "loss": 0.07108172178268432, - "step": 9720 - }, - { - "epoch": 2.278416166854731, - "grad_norm": 1.734405279159546, - "learning_rate": 1.3612605210139912e-05, - "loss": 0.04115844368934631, - "step": 9740 - }, - { - "epoch": 2.283095430163214, - "grad_norm": 2.0433499813079834, - "learning_rate": 1.3519249857142147e-05, - "loss": 0.053622370958328246, - "step": 9760 - }, - { - "epoch": 2.2877746934716967, - "grad_norm": 1.466838002204895, - "learning_rate": 1.3426096942132305e-05, - "loss": 0.07005876302719116, - "step": 9780 - }, - { - "epoch": 2.2924539567801796, - "grad_norm": 1.3480894565582275, - "learning_rate": 1.3333148107658883e-05, - "loss": 0.0501272439956665, - "step": 9800 - }, - { - "epoch": 2.2971332200886625, - "grad_norm": 2.2553582191467285, - "learning_rate": 1.3240404992671823e-05, - "loss": 0.058852237462997434, - "step": 9820 - }, - { - "epoch": 2.3018124833971454, - "grad_norm": 0.1796468198299408, - "learning_rate": 1.3147869232493698e-05, - "loss": 0.05089703798294067, - "step": 9840 - }, - { - "epoch": 2.306491746705628, - "grad_norm": 3.135744571685791, - "learning_rate": 1.305554245879079e-05, - "loss": 0.04962855279445648, - "step": 9860 - }, - { - "epoch": 2.3111710100141107, - "grad_norm": 2.1585986614227295, - "learning_rate": 1.296342629954439e-05, - "loss": 0.07206055521965027, - "step": 9880 - }, - { - "epoch": 2.3158502733225936, - "grad_norm": 0.10592425614595413, - "learning_rate": 1.2871522379022038e-05, - "loss": 0.04145916402339935, - "step": 9900 - }, - { - "epoch": 2.3205295366310765, - "grad_norm": 0.5150194764137268, - "learning_rate": 1.2779832317748933e-05, - "loss": 0.05638412833213806, - "step": 9920 - }, - { - "epoch": 2.3252087999395594, - "grad_norm": 1.6214760541915894, - "learning_rate": 1.2688357732479303e-05, - "loss": 0.021433608233928682, - "step": 9940 - }, - { - "epoch": 2.3298880632480423, - "grad_norm": 0.027669433504343033, - "learning_rate": 1.2597100236167963e-05, - "loss": 0.036874741315841675, - "step": 9960 - }, - { - "epoch": 2.334567326556525, - "grad_norm": 1.560826301574707, - "learning_rate": 1.2506061437941804e-05, - "loss": 0.05353221893310547, - "step": 9980 - }, - { - "epoch": 2.339246589865008, - "grad_norm": 0.7329757809638977, - "learning_rate": 1.241524294307147e-05, - "loss": 0.042856207489967345, - "step": 10000 - }, - { - "epoch": 2.343925853173491, - "grad_norm": 0.01610807701945305, - "learning_rate": 1.232464635294302e-05, - "loss": 0.02504704296588898, - "step": 10020 - }, - { - "epoch": 2.348605116481974, - "grad_norm": 0.04221898317337036, - "learning_rate": 1.2234273265029742e-05, - "loss": 0.030704396963119506, - "step": 10040 - }, - { - "epoch": 2.353284379790457, - "grad_norm": 0.22617070376873016, - "learning_rate": 1.2144125272863905e-05, - "loss": 0.020115789771080018, - "step": 10060 - }, - { - "epoch": 2.3579636430989397, - "grad_norm": 0.7796891331672668, - "learning_rate": 1.2054203966008747e-05, - "loss": 0.02525162398815155, - "step": 10080 - }, - { - "epoch": 2.3626429064074226, - "grad_norm": 1.364593744277954, - "learning_rate": 1.1964510930030368e-05, - "loss": 0.015173476934432984, - "step": 10100 - }, - { - "epoch": 2.3673221697159055, - "grad_norm": 0.9444358944892883, - "learning_rate": 1.1875047746469847e-05, - "loss": 0.01972121149301529, - "step": 10120 - }, - { - "epoch": 2.3720014330243884, - "grad_norm": 2.1904690265655518, - "learning_rate": 1.1785815992815274e-05, - "loss": 0.033346959948539735, - "step": 10140 - } - ], - "logging_steps": 20, - "max_steps": 14963, - "num_input_tokens_seen": 0, - "num_train_epochs": 4, - "save_steps": 1000000000, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 3.896183550633754e+16, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/slots/3/checkpoint-10159/training_args.bin b/slots/3/checkpoint-10159/training_args.bin deleted file mode 100644 index cba6bf44229020a6cf5d76cffc747dea705142ea..0000000000000000000000000000000000000000 --- a/slots/3/checkpoint-10159/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:66430bba102a8f3dc245713cd6268a99c212c508aacce1d8b9768464f5df26ec -size 5201 diff --git a/slots/3/latest.json b/slots/3/latest.json deleted file mode 100644 index 0a24f435e57d862cc4d49e8f016177a61321e611..0000000000000000000000000000000000000000 --- a/slots/3/latest.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "slot:3", "checkpoint": "slots/3/checkpoint-10159", "step": 10159, "updated_at": 1776816190} diff --git a/slots/4/latest.json b/slots/4/latest.json deleted file mode 100644 index c2f834d0d53466590581e9f8ec7040d4c4f56da0..0000000000000000000000000000000000000000 --- a/slots/4/latest.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "slot:4", "checkpoint": "slots/4/checkpoint-9920", "step": 9920, "updated_at": 1776815427} diff --git a/slots/5/latest.json b/slots/5/latest.json deleted file mode 100644 index a9317622e4036a869012f332abc7e83bc75e4ca6..0000000000000000000000000000000000000000 --- a/slots/5/latest.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "slot:5", "checkpoint": "slots/5/checkpoint-9977", "step": 9977, "updated_at": 1776815798} diff --git a/slots/6/latest.json b/slots/6/latest.json deleted file mode 100644 index cac129a277ae42f9ad47642f3a1af5ae216357e1..0000000000000000000000000000000000000000 --- a/slots/6/latest.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "slot:6", "checkpoint": "slots/6/checkpoint-9256", "step": 9256, "updated_at": 1776779775} diff --git a/slots/7/checkpoint-10040/config.json b/slots/7/checkpoint-10040/config.json deleted file mode 100644 index 9e5d8b7224eff16a790758ae86dd97c89afeab74..0000000000000000000000000000000000000000 --- a/slots/7/checkpoint-10040/config.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "architectures": [ - "TwinyForCausalLM" - ], - "attention_dropout": 0.0, - "dtype": "float32", - "hidden_dropout": 0.0, - "hidden_size": 768, - "initializer_range": 0.02, - "intermediate_size": 3072, - "max_position_embeddings": 128, - "model_type": "twiny", - "neftune_alpha": 0.0, - "num_attention_heads": 12, - "num_hidden_layers": 3, - "num_key_value_heads": 3, - "qk_norm": true, - "rezero_init": 1.0, - "rms_norm_eps": 1e-06, - "rope_theta": 10000.0, - "transformers_version": "5.0.0", - "use_cache": false, - "vocab_size": 32000 -} diff --git a/slots/7/checkpoint-10040/model.safetensors b/slots/7/checkpoint-10040/model.safetensors deleted file mode 100644 index dc3f8f3acadd3c3fe74ab6a31e6515cccdcf9fda..0000000000000000000000000000000000000000 --- a/slots/7/checkpoint-10040/model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c88cf0d249c9a53bfa06545968e30d25688fd09bfbedd27a2a4ec3a7c20c9646 -size 306388092 diff --git a/slots/7/checkpoint-10040/optimizer.pt b/slots/7/checkpoint-10040/optimizer.pt deleted file mode 100644 index 8a31d69bf9b094ad0f40e08a5724fa926a1117a4..0000000000000000000000000000000000000000 --- a/slots/7/checkpoint-10040/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c4415ca7b53cfb59c6dcae2c5cd71e786e204637e38e3fa229a0bf0d227bc5f4 -size 302484555 diff --git a/slots/7/checkpoint-10040/rng_state.pth b/slots/7/checkpoint-10040/rng_state.pth deleted file mode 100644 index 1feba1a6538e93b94696d3773853dbc8947b0cad..0000000000000000000000000000000000000000 --- a/slots/7/checkpoint-10040/rng_state.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 -size 14645 diff --git a/slots/7/checkpoint-10040/scaler.pt b/slots/7/checkpoint-10040/scaler.pt deleted file mode 100644 index e6c8d50e9bf4409ee2370d14e364a328c55f88ba..0000000000000000000000000000000000000000 --- a/slots/7/checkpoint-10040/scaler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:43fe95fe06f29ddcc41179a69e1a2be1433c6672ab5e3a74fe327cacbaea9152 -size 1383 diff --git a/slots/7/checkpoint-10040/scheduler.pt b/slots/7/checkpoint-10040/scheduler.pt deleted file mode 100644 index 250e4084d36ef56292d4b82c694e3601759536ab..0000000000000000000000000000000000000000 --- a/slots/7/checkpoint-10040/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ccf1825e64cd0689b1ddff4bbd4f507dc8476ba8e849a66a9a7240fc6febe6d1 -size 1465 diff --git a/slots/7/checkpoint-10040/trainer_state.json b/slots/7/checkpoint-10040/trainer_state.json deleted file mode 100644 index cbe2e4c55a90b8750d421a8fa393443e0d90c612..0000000000000000000000000000000000000000 --- a/slots/7/checkpoint-10040/trainer_state.json +++ /dev/null @@ -1,3555 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 2.348605116481974, - "eval_steps": 500, - "global_step": 10040, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0001336931908386741, - "grad_norm": Infinity, - "learning_rate": 5e-05, - "loss": 129.80032348632812, - "step": 1 - }, - { - "epoch": 0.002673863816773482, - "grad_norm": 63.64365768432617, - "learning_rate": 4.999995392022967e-05, - "loss": 63.88374408922697, - "step": 20 - }, - { - "epoch": 0.005347727633546964, - "grad_norm": 24.627853393554688, - "learning_rate": 4.999976672145381e-05, - "loss": 12.65963363647461, - "step": 40 - }, - { - "epoch": 0.008021591450320446, - "grad_norm": 14.29983901977539, - "learning_rate": 4.999943552476422e-05, - "loss": 5.90204963684082, - "step": 60 - }, - { - "epoch": 0.010695455267093928, - "grad_norm": 15.690323829650879, - "learning_rate": 4.999896033206858e-05, - "loss": 3.9918922424316405, - "step": 80 - }, - { - "epoch": 0.01336931908386741, - "grad_norm": 31.583160400390625, - "learning_rate": 4.999834114610398e-05, - "loss": 2.9675426483154297, - "step": 100 - }, - { - "epoch": 0.01604318290064089, - "grad_norm": 13.034649848937988, - "learning_rate": 4.999757797043691e-05, - "loss": 2.725296401977539, - "step": 120 - }, - { - "epoch": 0.018717046717414372, - "grad_norm": 8.362203598022461, - "learning_rate": 4.999667080946324e-05, - "loss": 2.2478992462158205, - "step": 140 - }, - { - "epoch": 0.021390910534187856, - "grad_norm": 8.726786613464355, - "learning_rate": 4.999561966840821e-05, - "loss": 1.8447845458984375, - "step": 160 - }, - { - "epoch": 0.024064774350961337, - "grad_norm": 10.092752456665039, - "learning_rate": 4.9994424553326335e-05, - "loss": 1.5611843109130858, - "step": 180 - }, - { - "epoch": 0.02673863816773482, - "grad_norm": 9.090085983276367, - "learning_rate": 4.999308547110146e-05, - "loss": 1.520334815979004, - "step": 200 - }, - { - "epoch": 0.029412501984508302, - "grad_norm": 9.668124198913574, - "learning_rate": 4.999160242944665e-05, - "loss": 1.2818055152893066, - "step": 220 - }, - { - "epoch": 0.03208636580128178, - "grad_norm": 9.182533264160156, - "learning_rate": 4.998997543690418e-05, - "loss": 1.0428407669067383, - "step": 240 - }, - { - "epoch": 0.03476022961805526, - "grad_norm": 5.745838165283203, - "learning_rate": 4.998820450284549e-05, - "loss": 1.2343652725219727, - "step": 260 - }, - { - "epoch": 0.037434093434828744, - "grad_norm": 8.651643753051758, - "learning_rate": 4.99862896374711e-05, - "loss": 0.8859601020812988, - "step": 280 - }, - { - "epoch": 0.04010795725160223, - "grad_norm": 10.765266418457031, - "learning_rate": 4.998423085181056e-05, - "loss": 0.989600658416748, - "step": 300 - }, - { - "epoch": 0.04278182106837571, - "grad_norm": 6.092499256134033, - "learning_rate": 4.998202815772245e-05, - "loss": 0.7189463615417481, - "step": 320 - }, - { - "epoch": 0.04545568488514919, - "grad_norm": 6.352876663208008, - "learning_rate": 4.9979681567894195e-05, - "loss": 0.7489545345306396, - "step": 340 - }, - { - "epoch": 0.048129548701922674, - "grad_norm": 4.620656490325928, - "learning_rate": 4.997719109584209e-05, - "loss": 0.7381401538848877, - "step": 360 - }, - { - "epoch": 0.050803412518696155, - "grad_norm": 7.796917915344238, - "learning_rate": 4.997455675591119e-05, - "loss": 0.5687405109405518, - "step": 380 - }, - { - "epoch": 0.05347727633546964, - "grad_norm": 2.837172508239746, - "learning_rate": 4.9971778563275204e-05, - "loss": 0.5686865329742432, - "step": 400 - }, - { - "epoch": 0.05615114015224312, - "grad_norm": 3.3103690147399902, - "learning_rate": 4.9968856533936436e-05, - "loss": 0.625730562210083, - "step": 420 - }, - { - "epoch": 0.058825003969016604, - "grad_norm": 3.5682132244110107, - "learning_rate": 4.99657906847257e-05, - "loss": 0.6125466346740722, - "step": 440 - }, - { - "epoch": 0.061498867785790085, - "grad_norm": 5.63640832901001, - "learning_rate": 4.996258103330218e-05, - "loss": 0.6182214260101319, - "step": 460 - }, - { - "epoch": 0.06417273160256357, - "grad_norm": 4.698945999145508, - "learning_rate": 4.995922759815339e-05, - "loss": 0.43828091621398924, - "step": 480 - }, - { - "epoch": 0.06684659541933705, - "grad_norm": 2.1976189613342285, - "learning_rate": 4.995573039859501e-05, - "loss": 0.4459230899810791, - "step": 500 - }, - { - "epoch": 0.06952045923611053, - "grad_norm": 3.8809523582458496, - "learning_rate": 4.995208945477081e-05, - "loss": 0.3821882963180542, - "step": 520 - }, - { - "epoch": 0.07219432305288401, - "grad_norm": 3.75144100189209, - "learning_rate": 4.994830478765251e-05, - "loss": 0.5800807476043701, - "step": 540 - }, - { - "epoch": 0.07486818686965749, - "grad_norm": 3.0038585662841797, - "learning_rate": 4.9944376419039684e-05, - "loss": 0.3928264617919922, - "step": 560 - }, - { - "epoch": 0.07754205068643098, - "grad_norm": 3.614591598510742, - "learning_rate": 4.994030437155961e-05, - "loss": 0.48637890815734863, - "step": 580 - }, - { - "epoch": 0.08021591450320446, - "grad_norm": 4.143443584442139, - "learning_rate": 4.993608866866718e-05, - "loss": 0.3650153160095215, - "step": 600 - }, - { - "epoch": 0.08288977831997794, - "grad_norm": 6.692712783813477, - "learning_rate": 4.993172933464471e-05, - "loss": 0.3677916288375854, - "step": 620 - }, - { - "epoch": 0.08556364213675142, - "grad_norm": 8.383441925048828, - "learning_rate": 4.9927226394601815e-05, - "loss": 0.3399480104446411, - "step": 640 - }, - { - "epoch": 0.0882375059535249, - "grad_norm": 5.566338062286377, - "learning_rate": 4.992257987447532e-05, - "loss": 0.28104052543640134, - "step": 660 - }, - { - "epoch": 0.09091136977029839, - "grad_norm": 3.1196420192718506, - "learning_rate": 4.991778980102904e-05, - "loss": 0.351950478553772, - "step": 680 - }, - { - "epoch": 0.09358523358707187, - "grad_norm": 3.47979736328125, - "learning_rate": 4.9912856201853644e-05, - "loss": 0.27501535415649414, - "step": 700 - }, - { - "epoch": 0.09625909740384535, - "grad_norm": 5.446717262268066, - "learning_rate": 4.990777910536653e-05, - "loss": 0.2651593923568726, - "step": 720 - }, - { - "epoch": 0.09893296122061883, - "grad_norm": 7.6145339012146, - "learning_rate": 4.990255854081161e-05, - "loss": 0.35140380859375, - "step": 740 - }, - { - "epoch": 0.10160682503739231, - "grad_norm": 8.445616722106934, - "learning_rate": 4.989719453825918e-05, - "loss": 0.2961219072341919, - "step": 760 - }, - { - "epoch": 0.10428068885416579, - "grad_norm": 6.339537620544434, - "learning_rate": 4.9891687128605744e-05, - "loss": 0.24962289333343507, - "step": 780 - }, - { - "epoch": 0.10695455267093928, - "grad_norm": 3.3369436264038086, - "learning_rate": 4.988603634357383e-05, - "loss": 0.2124847412109375, - "step": 800 - }, - { - "epoch": 0.10962841648771277, - "grad_norm": 2.2909045219421387, - "learning_rate": 4.988024221571177e-05, - "loss": 0.24679112434387207, - "step": 820 - }, - { - "epoch": 0.11230228030448625, - "grad_norm": 3.1149911880493164, - "learning_rate": 4.9874304778393574e-05, - "loss": 0.22161397933959961, - "step": 840 - }, - { - "epoch": 0.11497614412125973, - "grad_norm": 14.802160263061523, - "learning_rate": 4.9868224065818706e-05, - "loss": 0.2623537302017212, - "step": 860 - }, - { - "epoch": 0.11765000793803321, - "grad_norm": 5.586325168609619, - "learning_rate": 4.98620001130119e-05, - "loss": 0.3560942649841309, - "step": 880 - }, - { - "epoch": 0.12032387175480669, - "grad_norm": 3.390017032623291, - "learning_rate": 4.9855632955822916e-05, - "loss": 0.16934787034988402, - "step": 900 - }, - { - "epoch": 0.12299773557158017, - "grad_norm": 6.070940971374512, - "learning_rate": 4.984912263092641e-05, - "loss": 0.2131197214126587, - "step": 920 - }, - { - "epoch": 0.12567159938835365, - "grad_norm": 1.4912281036376953, - "learning_rate": 4.984246917582166e-05, - "loss": 0.25128653049468996, - "step": 940 - }, - { - "epoch": 0.12834546320512713, - "grad_norm": 7.000472545623779, - "learning_rate": 4.9835672628832366e-05, - "loss": 0.2653592586517334, - "step": 960 - }, - { - "epoch": 0.1310193270219006, - "grad_norm": 5.427223205566406, - "learning_rate": 4.9828733029106434e-05, - "loss": 0.1653295636177063, - "step": 980 - }, - { - "epoch": 0.1336931908386741, - "grad_norm": 1.9502102136611938, - "learning_rate": 4.982165041661575e-05, - "loss": 0.2250870943069458, - "step": 1000 - }, - { - "epoch": 0.13636705465544757, - "grad_norm": 0.6216259598731995, - "learning_rate": 4.981442483215595e-05, - "loss": 0.18943849802017212, - "step": 1020 - }, - { - "epoch": 0.13904091847222105, - "grad_norm": 2.3363687992095947, - "learning_rate": 4.98070563173462e-05, - "loss": 0.1673592209815979, - "step": 1040 - }, - { - "epoch": 0.14171478228899453, - "grad_norm": 1.040717601776123, - "learning_rate": 4.979954491462892e-05, - "loss": 0.2113173007965088, - "step": 1060 - }, - { - "epoch": 0.14438864610576801, - "grad_norm": 2.735522747039795, - "learning_rate": 4.979189066726955e-05, - "loss": 0.17504971027374266, - "step": 1080 - }, - { - "epoch": 0.1470625099225415, - "grad_norm": 4.701151371002197, - "learning_rate": 4.978409361935636e-05, - "loss": 0.15881222486495972, - "step": 1100 - }, - { - "epoch": 0.14973637373931498, - "grad_norm": 2.735919237136841, - "learning_rate": 4.9776153815800075e-05, - "loss": 0.14044179916381835, - "step": 1120 - }, - { - "epoch": 0.15241023755608848, - "grad_norm": 3.5479538440704346, - "learning_rate": 4.976807130233375e-05, - "loss": 0.18565714359283447, - "step": 1140 - }, - { - "epoch": 0.15508410137286197, - "grad_norm": 3.2167458534240723, - "learning_rate": 4.975984612551243e-05, - "loss": 0.13236271142959594, - "step": 1160 - }, - { - "epoch": 0.15775796518963545, - "grad_norm": 1.0206760168075562, - "learning_rate": 4.975147833271288e-05, - "loss": 0.19124728441238403, - "step": 1180 - }, - { - "epoch": 0.16043182900640893, - "grad_norm": 4.194457530975342, - "learning_rate": 4.9742967972133335e-05, - "loss": 0.144741427898407, - "step": 1200 - }, - { - "epoch": 0.1631056928231824, - "grad_norm": 3.0225746631622314, - "learning_rate": 4.973431509279323e-05, - "loss": 0.1374324679374695, - "step": 1220 - }, - { - "epoch": 0.1657795566399559, - "grad_norm": 4.243523120880127, - "learning_rate": 4.972551974453287e-05, - "loss": 0.13663809299468993, - "step": 1240 - }, - { - "epoch": 0.16845342045672937, - "grad_norm": 2.4990086555480957, - "learning_rate": 4.971658197801322e-05, - "loss": 0.16817957162857056, - "step": 1260 - }, - { - "epoch": 0.17112728427350285, - "grad_norm": 4.983982563018799, - "learning_rate": 4.9707501844715554e-05, - "loss": 0.13795313835144044, - "step": 1280 - }, - { - "epoch": 0.17380114809027633, - "grad_norm": 3.6780316829681396, - "learning_rate": 4.969827939694115e-05, - "loss": 0.1637880802154541, - "step": 1300 - }, - { - "epoch": 0.1764750119070498, - "grad_norm": 0.7950732707977295, - "learning_rate": 4.968891468781105e-05, - "loss": 0.10979138612747193, - "step": 1320 - }, - { - "epoch": 0.1791488757238233, - "grad_norm": 1.2414121627807617, - "learning_rate": 4.967940777126569e-05, - "loss": 0.13692171573638917, - "step": 1340 - }, - { - "epoch": 0.18182273954059677, - "grad_norm": 2.1383633613586426, - "learning_rate": 4.9669758702064636e-05, - "loss": 0.07821698188781738, - "step": 1360 - }, - { - "epoch": 0.18449660335737025, - "grad_norm": 5.061275959014893, - "learning_rate": 4.965996753578623e-05, - "loss": 0.19053516387939454, - "step": 1380 - }, - { - "epoch": 0.18717046717414373, - "grad_norm": 6.151792049407959, - "learning_rate": 4.9650034328827305e-05, - "loss": 0.11360721588134766, - "step": 1400 - }, - { - "epoch": 0.18984433099091721, - "grad_norm": 1.0604305267333984, - "learning_rate": 4.963995913840284e-05, - "loss": 0.13138024806976317, - "step": 1420 - }, - { - "epoch": 0.1925181948076907, - "grad_norm": 1.7159489393234253, - "learning_rate": 4.9629742022545623e-05, - "loss": 0.08657677173614502, - "step": 1440 - }, - { - "epoch": 0.19519205862446418, - "grad_norm": 2.4207754135131836, - "learning_rate": 4.961938304010595e-05, - "loss": 0.10309149026870727, - "step": 1460 - }, - { - "epoch": 0.19786592244123766, - "grad_norm": 1.532060146331787, - "learning_rate": 4.9608882250751245e-05, - "loss": 0.13628544807434081, - "step": 1480 - }, - { - "epoch": 0.20053978625801114, - "grad_norm": 6.409943580627441, - "learning_rate": 4.959823971496574e-05, - "loss": 0.10584845542907714, - "step": 1500 - }, - { - "epoch": 0.20321365007478462, - "grad_norm": 2.452012538909912, - "learning_rate": 4.9587455494050136e-05, - "loss": 0.06506187915802002, - "step": 1520 - }, - { - "epoch": 0.2058875138915581, - "grad_norm": 5.3016533851623535, - "learning_rate": 4.9576529650121214e-05, - "loss": 0.11848526000976563, - "step": 1540 - }, - { - "epoch": 0.20856137770833158, - "grad_norm": 4.341775894165039, - "learning_rate": 4.956546224611152e-05, - "loss": 0.11318533420562744, - "step": 1560 - }, - { - "epoch": 0.21123524152510506, - "grad_norm": 1.9056169986724854, - "learning_rate": 4.9554253345768965e-05, - "loss": 0.12768398523330687, - "step": 1580 - }, - { - "epoch": 0.21390910534187857, - "grad_norm": 1.8939746618270874, - "learning_rate": 4.9542903013656486e-05, - "loss": 0.10782338380813598, - "step": 1600 - }, - { - "epoch": 0.21658296915865205, - "grad_norm": 8.53671932220459, - "learning_rate": 4.9531411315151654e-05, - "loss": 0.1733921766281128, - "step": 1620 - }, - { - "epoch": 0.21925683297542553, - "grad_norm": 2.0152978897094727, - "learning_rate": 4.951977831644632e-05, - "loss": 0.11197054386138916, - "step": 1640 - }, - { - "epoch": 0.221930696792199, - "grad_norm": 3.8422367572784424, - "learning_rate": 4.95080040845462e-05, - "loss": 0.11441781520843505, - "step": 1660 - }, - { - "epoch": 0.2246045606089725, - "grad_norm": 1.819858193397522, - "learning_rate": 4.949608868727053e-05, - "loss": 0.11403474807739258, - "step": 1680 - }, - { - "epoch": 0.22727842442574597, - "grad_norm": 7.45100212097168, - "learning_rate": 4.948403219325163e-05, - "loss": 0.13117753267288207, - "step": 1700 - }, - { - "epoch": 0.22995228824251945, - "grad_norm": 0.6526040434837341, - "learning_rate": 4.947183467193456e-05, - "loss": 0.07524924874305725, - "step": 1720 - }, - { - "epoch": 0.23262615205929293, - "grad_norm": 3.814746856689453, - "learning_rate": 4.945949619357668e-05, - "loss": 0.07659345269203185, - "step": 1740 - }, - { - "epoch": 0.23530001587606642, - "grad_norm": 2.373124122619629, - "learning_rate": 4.944701682924726e-05, - "loss": 0.1147496223449707, - "step": 1760 - }, - { - "epoch": 0.2379738796928399, - "grad_norm": 0.11161285638809204, - "learning_rate": 4.943439665082707e-05, - "loss": 0.07256829738616943, - "step": 1780 - }, - { - "epoch": 0.24064774350961338, - "grad_norm": 0.45990192890167236, - "learning_rate": 4.942163573100794e-05, - "loss": 0.07726740837097168, - "step": 1800 - }, - { - "epoch": 0.24332160732638686, - "grad_norm": 4.2301926612854, - "learning_rate": 4.940873414329242e-05, - "loss": 0.09349535703659058, - "step": 1820 - }, - { - "epoch": 0.24599547114316034, - "grad_norm": 2.442178726196289, - "learning_rate": 4.939569196199325e-05, - "loss": 0.12413722276687622, - "step": 1840 - }, - { - "epoch": 0.24866933495993382, - "grad_norm": 2.523683786392212, - "learning_rate": 4.938250926223302e-05, - "loss": 0.08566288352012634, - "step": 1860 - }, - { - "epoch": 0.2513431987767073, - "grad_norm": 3.511075258255005, - "learning_rate": 4.936918611994368e-05, - "loss": 0.08007702231407166, - "step": 1880 - }, - { - "epoch": 0.2540170625934808, - "grad_norm": 6.254627704620361, - "learning_rate": 4.935572261186614e-05, - "loss": 0.10983954668045044, - "step": 1900 - }, - { - "epoch": 0.25669092641025426, - "grad_norm": 1.5211899280548096, - "learning_rate": 4.934211881554981e-05, - "loss": 0.09120344519615173, - "step": 1920 - }, - { - "epoch": 0.25936479022702774, - "grad_norm": 2.5893588066101074, - "learning_rate": 4.932837480935214e-05, - "loss": 0.08754412531852722, - "step": 1940 - }, - { - "epoch": 0.2620386540438012, - "grad_norm": 6.878556251525879, - "learning_rate": 4.931449067243821e-05, - "loss": 0.08636274933815002, - "step": 1960 - }, - { - "epoch": 0.2647125178605747, - "grad_norm": 2.9078798294067383, - "learning_rate": 4.9300466484780226e-05, - "loss": 0.09582929015159607, - "step": 1980 - }, - { - "epoch": 0.2673863816773482, - "grad_norm": 3.391852855682373, - "learning_rate": 4.92863023271571e-05, - "loss": 0.0850919783115387, - "step": 2000 - }, - { - "epoch": 0.27006024549412166, - "grad_norm": 5.522103309631348, - "learning_rate": 4.927199828115395e-05, - "loss": 0.050999772548675534, - "step": 2020 - }, - { - "epoch": 0.27273410931089515, - "grad_norm": 0.90350741147995, - "learning_rate": 4.925755442916167e-05, - "loss": 0.10100446939468384, - "step": 2040 - }, - { - "epoch": 0.2754079731276686, - "grad_norm": 1.602030634880066, - "learning_rate": 4.924297085437641e-05, - "loss": 0.0468633770942688, - "step": 2060 - }, - { - "epoch": 0.2780818369444421, - "grad_norm": 1.5823460817337036, - "learning_rate": 4.922824764079913e-05, - "loss": 0.06786358952522278, - "step": 2080 - }, - { - "epoch": 0.2807557007612156, - "grad_norm": 1.6624343395233154, - "learning_rate": 4.92133848732351e-05, - "loss": 0.05772828459739685, - "step": 2100 - }, - { - "epoch": 0.28342956457798907, - "grad_norm": 0.947078287601471, - "learning_rate": 4.9198382637293424e-05, - "loss": 0.08012173175811768, - "step": 2120 - }, - { - "epoch": 0.28610342839476255, - "grad_norm": 0.2919924259185791, - "learning_rate": 4.918324101938653e-05, - "loss": 0.1208539366722107, - "step": 2140 - }, - { - "epoch": 0.28877729221153603, - "grad_norm": 9.258247375488281, - "learning_rate": 4.916796010672969e-05, - "loss": 0.10037034749984741, - "step": 2160 - }, - { - "epoch": 0.2914511560283095, - "grad_norm": 4.0920491218566895, - "learning_rate": 4.915253998734051e-05, - "loss": 0.061488878726959226, - "step": 2180 - }, - { - "epoch": 0.294125019845083, - "grad_norm": 6.1126627922058105, - "learning_rate": 4.913698075003841e-05, - "loss": 0.0862967312335968, - "step": 2200 - }, - { - "epoch": 0.29679888366185647, - "grad_norm": 2.585484743118286, - "learning_rate": 4.912128248444414e-05, - "loss": 0.05393874645233154, - "step": 2220 - }, - { - "epoch": 0.29947274747862995, - "grad_norm": 6.944481372833252, - "learning_rate": 4.9105445280979256e-05, - "loss": 0.08570566773414612, - "step": 2240 - }, - { - "epoch": 0.30214661129540343, - "grad_norm": 1.3824089765548706, - "learning_rate": 4.908946923086556e-05, - "loss": 0.09689127206802368, - "step": 2260 - }, - { - "epoch": 0.30482047511217697, - "grad_norm": 3.4861342906951904, - "learning_rate": 4.907335442612464e-05, - "loss": 0.12550976276397705, - "step": 2280 - }, - { - "epoch": 0.30749433892895045, - "grad_norm": 3.668980121612549, - "learning_rate": 4.905710095957728e-05, - "loss": 0.09089353680610657, - "step": 2300 - }, - { - "epoch": 0.31016820274572393, - "grad_norm": 1.093095064163208, - "learning_rate": 4.904070892484298e-05, - "loss": 0.03925192356109619, - "step": 2320 - }, - { - "epoch": 0.3128420665624974, - "grad_norm": 0.8169485926628113, - "learning_rate": 4.9024178416339364e-05, - "loss": 0.0979581356048584, - "step": 2340 - }, - { - "epoch": 0.3155159303792709, - "grad_norm": 1.892451286315918, - "learning_rate": 4.900750952928166e-05, - "loss": 0.05913209915161133, - "step": 2360 - }, - { - "epoch": 0.3181897941960444, - "grad_norm": 0.24644255638122559, - "learning_rate": 4.8990702359682184e-05, - "loss": 0.06815173625946044, - "step": 2380 - }, - { - "epoch": 0.32086365801281785, - "grad_norm": 2.1861305236816406, - "learning_rate": 4.897375700434972e-05, - "loss": 0.04142785966396332, - "step": 2400 - }, - { - "epoch": 0.32353752182959133, - "grad_norm": 2.6643004417419434, - "learning_rate": 4.8956673560889013e-05, - "loss": 0.05177200436592102, - "step": 2420 - }, - { - "epoch": 0.3262113856463648, - "grad_norm": 2.588113784790039, - "learning_rate": 4.8939452127700195e-05, - "loss": 0.05783546566963196, - "step": 2440 - }, - { - "epoch": 0.3288852494631383, - "grad_norm": 2.419644594192505, - "learning_rate": 4.8922092803978203e-05, - "loss": 0.08906854391098022, - "step": 2460 - }, - { - "epoch": 0.3315591132799118, - "grad_norm": 0.16949939727783203, - "learning_rate": 4.890459568971223e-05, - "loss": 0.10305211544036866, - "step": 2480 - }, - { - "epoch": 0.33423297709668526, - "grad_norm": 0.10032984614372253, - "learning_rate": 4.8886960885685126e-05, - "loss": 0.06348527669906616, - "step": 2500 - }, - { - "epoch": 0.33690684091345874, - "grad_norm": 3.3658738136291504, - "learning_rate": 4.8869188493472854e-05, - "loss": 0.06826075911521912, - "step": 2520 - }, - { - "epoch": 0.3395807047302322, - "grad_norm": 0.8656186461448669, - "learning_rate": 4.885127861544386e-05, - "loss": 0.05929765701293945, - "step": 2540 - }, - { - "epoch": 0.3422545685470057, - "grad_norm": 0.1492065042257309, - "learning_rate": 4.8833231354758496e-05, - "loss": 0.09429731965065002, - "step": 2560 - }, - { - "epoch": 0.3449284323637792, - "grad_norm": 0.6010928153991699, - "learning_rate": 4.881504681536846e-05, - "loss": 0.06262240409851075, - "step": 2580 - }, - { - "epoch": 0.34760229618055266, - "grad_norm": 1.6506450176239014, - "learning_rate": 4.879672510201616e-05, - "loss": 0.061688083410263064, - "step": 2600 - }, - { - "epoch": 0.35027615999732614, - "grad_norm": 0.2703142464160919, - "learning_rate": 4.877826632023412e-05, - "loss": 0.06175137162208557, - "step": 2620 - }, - { - "epoch": 0.3529500238140996, - "grad_norm": 3.1056365966796875, - "learning_rate": 4.875967057634437e-05, - "loss": 0.07828506827354431, - "step": 2640 - }, - { - "epoch": 0.3556238876308731, - "grad_norm": 0.28790283203125, - "learning_rate": 4.874093797745784e-05, - "loss": 0.11355981826782227, - "step": 2660 - }, - { - "epoch": 0.3582977514476466, - "grad_norm": 2.3372068405151367, - "learning_rate": 4.8722068631473746e-05, - "loss": 0.048267871141433716, - "step": 2680 - }, - { - "epoch": 0.36097161526442006, - "grad_norm": 0.12767371535301208, - "learning_rate": 4.8703062647078976e-05, - "loss": 0.04319801032543182, - "step": 2700 - }, - { - "epoch": 0.36364547908119355, - "grad_norm": 0.5145738124847412, - "learning_rate": 4.868392013374741e-05, - "loss": 0.0773090660572052, - "step": 2720 - }, - { - "epoch": 0.366319342897967, - "grad_norm": 0.8518500328063965, - "learning_rate": 4.866464120173937e-05, - "loss": 0.05149460434913635, - "step": 2740 - }, - { - "epoch": 0.3689932067147405, - "grad_norm": 3.6726584434509277, - "learning_rate": 4.8645225962100924e-05, - "loss": 0.06896821856498718, - "step": 2760 - }, - { - "epoch": 0.371667070531514, - "grad_norm": 1.5626497268676758, - "learning_rate": 4.862567452666329e-05, - "loss": 0.047730174660682675, - "step": 2780 - }, - { - "epoch": 0.37434093434828747, - "grad_norm": 6.562028884887695, - "learning_rate": 4.8605987008042144e-05, - "loss": 0.07060698866844177, - "step": 2800 - }, - { - "epoch": 0.37701479816506095, - "grad_norm": 0.7631726861000061, - "learning_rate": 4.8586163519637005e-05, - "loss": 0.04944324493408203, - "step": 2820 - }, - { - "epoch": 0.37968866198183443, - "grad_norm": 1.6982293128967285, - "learning_rate": 4.8566204175630595e-05, - "loss": 0.03000348210334778, - "step": 2840 - }, - { - "epoch": 0.3823625257986079, - "grad_norm": 0.6487429141998291, - "learning_rate": 4.854610909098812e-05, - "loss": 0.06691416501998901, - "step": 2860 - }, - { - "epoch": 0.3850363896153814, - "grad_norm": 0.7648892402648926, - "learning_rate": 4.852587838145668e-05, - "loss": 0.05529783964157105, - "step": 2880 - }, - { - "epoch": 0.38771025343215487, - "grad_norm": 0.11601298302412033, - "learning_rate": 4.850551216356457e-05, - "loss": 0.07780832052230835, - "step": 2900 - }, - { - "epoch": 0.39038411724892835, - "grad_norm": 0.9443137645721436, - "learning_rate": 4.8485010554620594e-05, - "loss": 0.08007023930549621, - "step": 2920 - }, - { - "epoch": 0.39305798106570183, - "grad_norm": 0.8828252553939819, - "learning_rate": 4.846437367271341e-05, - "loss": 0.03541453182697296, - "step": 2940 - }, - { - "epoch": 0.3957318448824753, - "grad_norm": 0.21668888628482819, - "learning_rate": 4.844360163671083e-05, - "loss": 0.08354364633560181, - "step": 2960 - }, - { - "epoch": 0.3984057086992488, - "grad_norm": 0.6840483546257019, - "learning_rate": 4.8422694566259194e-05, - "loss": 0.045807772874832155, - "step": 2980 - }, - { - "epoch": 0.4010795725160223, - "grad_norm": 1.2754698991775513, - "learning_rate": 4.8401652581782584e-05, - "loss": 0.053487342596054074, - "step": 3000 - }, - { - "epoch": 0.40375343633279576, - "grad_norm": 0.19012756645679474, - "learning_rate": 4.838047580448222e-05, - "loss": 0.05881953239440918, - "step": 3020 - }, - { - "epoch": 0.40642730014956924, - "grad_norm": 2.1057698726654053, - "learning_rate": 4.835916435633569e-05, - "loss": 0.031065690517425536, - "step": 3040 - }, - { - "epoch": 0.4091011639663427, - "grad_norm": 4.188559055328369, - "learning_rate": 4.833771836009633e-05, - "loss": 0.07205432653427124, - "step": 3060 - }, - { - "epoch": 0.4117750277831162, - "grad_norm": 6.975829124450684, - "learning_rate": 4.831613793929242e-05, - "loss": 0.04953635036945343, - "step": 3080 - }, - { - "epoch": 0.4144488915998897, - "grad_norm": 4.725269317626953, - "learning_rate": 4.8294423218226546e-05, - "loss": 0.05965519547462463, - "step": 3100 - }, - { - "epoch": 0.41712275541666316, - "grad_norm": 1.7124755382537842, - "learning_rate": 4.827257432197486e-05, - "loss": 0.039625433087348935, - "step": 3120 - }, - { - "epoch": 0.41979661923343664, - "grad_norm": 2.6687324047088623, - "learning_rate": 4.825059137638636e-05, - "loss": 0.05020809769630432, - "step": 3140 - }, - { - "epoch": 0.4224704830502101, - "grad_norm": 1.111640214920044, - "learning_rate": 4.822847450808215e-05, - "loss": 0.04404452443122864, - "step": 3160 - }, - { - "epoch": 0.42514434686698366, - "grad_norm": 0.2128070890903473, - "learning_rate": 4.8206223844454744e-05, - "loss": 0.08283355236053466, - "step": 3180 - }, - { - "epoch": 0.42781821068375714, - "grad_norm": 0.10757248103618622, - "learning_rate": 4.818383951366729e-05, - "loss": 0.08568671345710754, - "step": 3200 - }, - { - "epoch": 0.4304920745005306, - "grad_norm": 0.08344592899084091, - "learning_rate": 4.816132164465289e-05, - "loss": 0.0426956832408905, - "step": 3220 - }, - { - "epoch": 0.4331659383173041, - "grad_norm": 0.5657751560211182, - "learning_rate": 4.813867036711378e-05, - "loss": 0.04971776902675629, - "step": 3240 - }, - { - "epoch": 0.4358398021340776, - "grad_norm": 2.1529288291931152, - "learning_rate": 4.8115885811520654e-05, - "loss": 0.025386181473731995, - "step": 3260 - }, - { - "epoch": 0.43851366595085106, - "grad_norm": 4.228519916534424, - "learning_rate": 4.809296810911188e-05, - "loss": 0.06401395201683044, - "step": 3280 - }, - { - "epoch": 0.44118752976762454, - "grad_norm": 6.770420551300049, - "learning_rate": 4.806991739189274e-05, - "loss": 0.16425553560256959, - "step": 3300 - }, - { - "epoch": 0.443861393584398, - "grad_norm": 0.5303187370300293, - "learning_rate": 4.804673379263467e-05, - "loss": 0.045900467038154605, - "step": 3320 - }, - { - "epoch": 0.4465352574011715, - "grad_norm": 0.221473827958107, - "learning_rate": 4.802341744487453e-05, - "loss": 0.07529735565185547, - "step": 3340 - }, - { - "epoch": 0.449209121217945, - "grad_norm": 3.48736834526062, - "learning_rate": 4.799996848291378e-05, - "loss": 0.062433135509490964, - "step": 3360 - }, - { - "epoch": 0.45188298503471847, - "grad_norm": 2.650038242340088, - "learning_rate": 4.797638704181774e-05, - "loss": 0.03762982189655304, - "step": 3380 - }, - { - "epoch": 0.45455684885149195, - "grad_norm": 3.159665584564209, - "learning_rate": 4.795267325741483e-05, - "loss": 0.04745924174785614, - "step": 3400 - }, - { - "epoch": 0.4572307126682654, - "grad_norm": 0.8763885498046875, - "learning_rate": 4.7928827266295715e-05, - "loss": 0.07380253076553345, - "step": 3420 - }, - { - "epoch": 0.4599045764850389, - "grad_norm": 0.1779366433620453, - "learning_rate": 4.790484920581262e-05, - "loss": 0.045916372537612916, - "step": 3440 - }, - { - "epoch": 0.4625784403018124, - "grad_norm": 1.1228729486465454, - "learning_rate": 4.7880739214078454e-05, - "loss": 0.04461723566055298, - "step": 3460 - }, - { - "epoch": 0.46525230411858587, - "grad_norm": 0.1629919707775116, - "learning_rate": 4.785649742996605e-05, - "loss": 0.017159442603588104, - "step": 3480 - }, - { - "epoch": 0.46792616793535935, - "grad_norm": 3.583951473236084, - "learning_rate": 4.783212399310737e-05, - "loss": 0.047145146131515506, - "step": 3500 - }, - { - "epoch": 0.47060003175213283, - "grad_norm": 0.9766237139701843, - "learning_rate": 4.780761904389267e-05, - "loss": 0.050229442119598386, - "step": 3520 - }, - { - "epoch": 0.4732738955689063, - "grad_norm": 0.05617872253060341, - "learning_rate": 4.778298272346976e-05, - "loss": 0.024862812459468843, - "step": 3540 - }, - { - "epoch": 0.4759477593856798, - "grad_norm": 1.3586453199386597, - "learning_rate": 4.775821517374308e-05, - "loss": 0.02117772251367569, - "step": 3560 - }, - { - "epoch": 0.4786216232024533, - "grad_norm": 1.2116742134094238, - "learning_rate": 4.7733316537373006e-05, - "loss": 0.03060794174671173, - "step": 3580 - }, - { - "epoch": 0.48129548701922675, - "grad_norm": 0.39403238892555237, - "learning_rate": 4.770828695777493e-05, - "loss": 0.05482668280601501, - "step": 3600 - }, - { - "epoch": 0.48396935083600023, - "grad_norm": 0.9248486161231995, - "learning_rate": 4.7683126579118495e-05, - "loss": 0.03612814247608185, - "step": 3620 - }, - { - "epoch": 0.4866432146527737, - "grad_norm": 0.1624649167060852, - "learning_rate": 4.7657835546326736e-05, - "loss": 0.04334873259067536, - "step": 3640 - }, - { - "epoch": 0.4893170784695472, - "grad_norm": 0.5321119427680969, - "learning_rate": 4.763241400507524e-05, - "loss": 0.0461233913898468, - "step": 3660 - }, - { - "epoch": 0.4919909422863207, - "grad_norm": 0.34861093759536743, - "learning_rate": 4.760686210179133e-05, - "loss": 0.024829554557800292, - "step": 3680 - }, - { - "epoch": 0.49466480610309416, - "grad_norm": 1.2561241388320923, - "learning_rate": 4.758117998365322e-05, - "loss": 0.03157005608081818, - "step": 3700 - }, - { - "epoch": 0.49733866991986764, - "grad_norm": 0.8691341280937195, - "learning_rate": 4.7555367798589146e-05, - "loss": 0.04310203492641449, - "step": 3720 - }, - { - "epoch": 0.5000125337366411, - "grad_norm": 0.3134572505950928, - "learning_rate": 4.752942569527653e-05, - "loss": 0.03796039223670959, - "step": 3740 - }, - { - "epoch": 0.5026863975534146, - "grad_norm": 2.3359289169311523, - "learning_rate": 4.75033538231411e-05, - "loss": 0.055599170923233035, - "step": 3760 - }, - { - "epoch": 0.5053602613701881, - "grad_norm": 7.426175594329834, - "learning_rate": 4.747715233235608e-05, - "loss": 0.054436272382736205, - "step": 3780 - }, - { - "epoch": 0.5080341251869616, - "grad_norm": 0.5940203070640564, - "learning_rate": 4.745082137384128e-05, - "loss": 0.03682814538478851, - "step": 3800 - }, - { - "epoch": 0.510707989003735, - "grad_norm": 0.22821389138698578, - "learning_rate": 4.7424361099262225e-05, - "loss": 0.051123309135437014, - "step": 3820 - }, - { - "epoch": 0.5133818528205085, - "grad_norm": 8.20633602142334, - "learning_rate": 4.739777166102932e-05, - "loss": 0.0704378604888916, - "step": 3840 - }, - { - "epoch": 0.516055716637282, - "grad_norm": 3.023848533630371, - "learning_rate": 4.737105321229694e-05, - "loss": 0.03368058800697327, - "step": 3860 - }, - { - "epoch": 0.5187295804540555, - "grad_norm": 0.07666649669408798, - "learning_rate": 4.7344205906962555e-05, - "loss": 0.03665303289890289, - "step": 3880 - }, - { - "epoch": 0.521403444270829, - "grad_norm": 0.7571629881858826, - "learning_rate": 4.731722989966585e-05, - "loss": 0.058415502309799194, - "step": 3900 - }, - { - "epoch": 0.5240773080876024, - "grad_norm": 3.2599120140075684, - "learning_rate": 4.7290125345787816e-05, - "loss": 0.07323018908500671, - "step": 3920 - }, - { - "epoch": 0.5267511719043759, - "grad_norm": 0.28930988907814026, - "learning_rate": 4.7262892401449886e-05, - "loss": 0.054371267557144165, - "step": 3940 - }, - { - "epoch": 0.5294250357211494, - "grad_norm": 2.2296454906463623, - "learning_rate": 4.7235531223513004e-05, - "loss": 0.040819621086120604, - "step": 3960 - }, - { - "epoch": 0.5320988995379229, - "grad_norm": 0.11608211696147919, - "learning_rate": 4.720804196957675e-05, - "loss": 0.05215579271316528, - "step": 3980 - }, - { - "epoch": 0.5347727633546964, - "grad_norm": 1.1587547063827515, - "learning_rate": 4.7180424797978415e-05, - "loss": 0.026277875900268553, - "step": 4000 - }, - { - "epoch": 0.5374466271714698, - "grad_norm": 0.06253435462713242, - "learning_rate": 4.7152679867792074e-05, - "loss": 0.02574407756328583, - "step": 4020 - }, - { - "epoch": 0.5401204909882433, - "grad_norm": 1.3441458940505981, - "learning_rate": 4.71248073388277e-05, - "loss": 0.05538107752799988, - "step": 4040 - }, - { - "epoch": 0.5427943548050168, - "grad_norm": 0.48076340556144714, - "learning_rate": 4.7096807371630236e-05, - "loss": 0.047986540198326114, - "step": 4060 - }, - { - "epoch": 0.5454682186217903, - "grad_norm": 0.5924936532974243, - "learning_rate": 4.706868012747867e-05, - "loss": 0.05463914275169372, - "step": 4080 - }, - { - "epoch": 0.7673995566395854, - "grad_norm": 0.05143728107213974, - "learning_rate": 4.431151627307268e-05, - "loss": 0.00959376593430837, - "step": 4100 - }, - { - "epoch": 0.771142969110998, - "grad_norm": 1.2308074235916138, - "learning_rate": 4.425806509248848e-05, - "loss": 0.002745623141527176, - "step": 4120 - }, - { - "epoch": 0.7748863815824106, - "grad_norm": 2.080223798751831, - "learning_rate": 4.420439652052499e-05, - "loss": 0.012390998750925064, - "step": 4140 - }, - { - "epoch": 0.7786297940538233, - "grad_norm": 0.049312230199575424, - "learning_rate": 4.415051116301072e-05, - "loss": 0.004607534408569336, - "step": 4160 - }, - { - "epoch": 0.7823732065252359, - "grad_norm": 0.07747476547956467, - "learning_rate": 4.409640962822132e-05, - "loss": 0.034441503882408145, - "step": 4180 - }, - { - "epoch": 0.7861166189966485, - "grad_norm": 0.021327875554561615, - "learning_rate": 4.404209252687275e-05, - "loss": 0.009768449515104295, - "step": 4200 - }, - { - "epoch": 0.789860031468061, - "grad_norm": 2.406580924987793, - "learning_rate": 4.398756047211431e-05, - "loss": 0.005304037779569626, - "step": 4220 - }, - { - "epoch": 0.7936034439394737, - "grad_norm": 0.027869906276464462, - "learning_rate": 4.39328140795218e-05, - "loss": 0.00896073654294014, - "step": 4240 - }, - { - "epoch": 0.7973468564108863, - "grad_norm": 0.09702044725418091, - "learning_rate": 4.387785396709052e-05, - "loss": 0.0117533378303051, - "step": 4260 - }, - { - "epoch": 0.801090268882299, - "grad_norm": 0.529065728187561, - "learning_rate": 4.382268075522831e-05, - "loss": 0.0037526611238718035, - "step": 4280 - }, - { - "epoch": 0.8048336813537116, - "grad_norm": 0.015109462663531303, - "learning_rate": 4.3767295066748564e-05, - "loss": 0.0025708725675940513, - "step": 4300 - }, - { - "epoch": 0.8085770938251241, - "grad_norm": 0.7257627248764038, - "learning_rate": 4.371169752686316e-05, - "loss": 0.006234285607933998, - "step": 4320 - }, - { - "epoch": 0.8123205062965367, - "grad_norm": 0.016853008419275284, - "learning_rate": 4.3655888763175436e-05, - "loss": 0.0023587727919220924, - "step": 4340 - }, - { - "epoch": 0.8160639187679494, - "grad_norm": 0.017816167324781418, - "learning_rate": 4.3599869405673085e-05, - "loss": 0.0012389549054205417, - "step": 4360 - }, - { - "epoch": 0.819807331239362, - "grad_norm": 0.014672616496682167, - "learning_rate": 4.354364008672106e-05, - "loss": 0.002244691364467144, - "step": 4380 - }, - { - "epoch": 0.8235507437107746, - "grad_norm": 0.044869400560855865, - "learning_rate": 4.3487201441054435e-05, - "loss": 0.007713723182678223, - "step": 4400 - }, - { - "epoch": 0.8272941561821872, - "grad_norm": 0.06367291510105133, - "learning_rate": 4.343055410577122e-05, - "loss": 0.005743256583809852, - "step": 4420 - }, - { - "epoch": 0.8310375686535998, - "grad_norm": 0.1354215145111084, - "learning_rate": 4.3373698720325176e-05, - "loss": 0.009635470807552338, - "step": 4440 - }, - { - "epoch": 0.8347809811250124, - "grad_norm": 0.9089844822883606, - "learning_rate": 4.331663592651862e-05, - "loss": 0.01007603257894516, - "step": 4460 - }, - { - "epoch": 0.838524393596425, - "grad_norm": 0.025831619277596474, - "learning_rate": 4.3259366368495167e-05, - "loss": 0.006179215386509895, - "step": 4480 - }, - { - "epoch": 0.8422678060678377, - "grad_norm": 0.016653764992952347, - "learning_rate": 4.320189069273243e-05, - "loss": 0.0025156451389193534, - "step": 4500 - }, - { - "epoch": 0.8460112185392502, - "grad_norm": 0.27361780405044556, - "learning_rate": 4.3144209548034766e-05, - "loss": 0.002235286869108677, - "step": 4520 - }, - { - "epoch": 0.8497546310106628, - "grad_norm": 2.6958701610565186, - "learning_rate": 4.3086323585525915e-05, - "loss": 0.03571180701255798, - "step": 4540 - }, - { - "epoch": 0.8534980434820755, - "grad_norm": 0.1260778158903122, - "learning_rate": 4.3028233458641696e-05, - "loss": 0.0036518506705760954, - "step": 4560 - }, - { - "epoch": 0.8572414559534881, - "grad_norm": 0.2445528209209442, - "learning_rate": 4.2969939823122586e-05, - "loss": 0.024949796497821808, - "step": 4580 - }, - { - "epoch": 0.8609848684249007, - "grad_norm": 0.1674242913722992, - "learning_rate": 4.291144333700633e-05, - "loss": 0.002089798077940941, - "step": 4600 - }, - { - "epoch": 0.8647282808963134, - "grad_norm": 0.05161884427070618, - "learning_rate": 4.2852744660620515e-05, - "loss": 0.007847145944833756, - "step": 4620 - }, - { - "epoch": 0.8684716933677259, - "grad_norm": 0.019796324893832207, - "learning_rate": 4.279384445657514e-05, - "loss": 0.0023555334657430647, - "step": 4640 - }, - { - "epoch": 0.8722151058391385, - "grad_norm": 0.0647754967212677, - "learning_rate": 4.2734743389755096e-05, - "loss": 0.009586349129676819, - "step": 4660 - }, - { - "epoch": 0.8759585183105512, - "grad_norm": 0.015243460424244404, - "learning_rate": 4.267544212731268e-05, - "loss": 0.017788709700107576, - "step": 4680 - }, - { - "epoch": 0.8797019307819638, - "grad_norm": 0.05756703019142151, - "learning_rate": 4.261594133866007e-05, - "loss": 0.014256520569324494, - "step": 4700 - }, - { - "epoch": 0.8834453432533764, - "grad_norm": 0.2002931535243988, - "learning_rate": 4.255624169546175e-05, - "loss": 0.0014025470241904258, - "step": 4720 - }, - { - "epoch": 0.887188755724789, - "grad_norm": 0.04325389489531517, - "learning_rate": 4.249634387162696e-05, - "loss": 0.010552891343832017, - "step": 4740 - }, - { - "epoch": 0.8909321681962016, - "grad_norm": 0.8975178599357605, - "learning_rate": 4.243624854330206e-05, - "loss": 0.0032475266605615618, - "step": 4760 - }, - { - "epoch": 0.8946755806676142, - "grad_norm": 0.01541830413043499, - "learning_rate": 4.237595638886288e-05, - "loss": 0.003157203644514084, - "step": 4780 - }, - { - "epoch": 0.8984189931390268, - "grad_norm": 1.673305869102478, - "learning_rate": 4.231546808890713e-05, - "loss": 0.0028239911422133445, - "step": 4800 - }, - { - "epoch": 0.9021624056104395, - "grad_norm": 0.021689629182219505, - "learning_rate": 4.225478432624665e-05, - "loss": 0.0026885712519288062, - "step": 4820 - }, - { - "epoch": 0.905905818081852, - "grad_norm": 0.019590798765420914, - "learning_rate": 4.219390578589973e-05, - "loss": 0.00780024379491806, - "step": 4840 - }, - { - "epoch": 0.9096492305532646, - "grad_norm": 0.024581020697951317, - "learning_rate": 4.213283315508337e-05, - "loss": 0.006697511672973633, - "step": 4860 - }, - { - "epoch": 0.9133926430246773, - "grad_norm": 0.20615583658218384, - "learning_rate": 4.207156712320555e-05, - "loss": 0.007314208894968033, - "step": 4880 - }, - { - "epoch": 0.9171360554960899, - "grad_norm": 0.015673745423555374, - "learning_rate": 4.20101083818574e-05, - "loss": 0.004841562733054161, - "step": 4900 - }, - { - "epoch": 0.9208794679675025, - "grad_norm": 0.008306623436510563, - "learning_rate": 4.194845762480544e-05, - "loss": 0.0010150263085961341, - "step": 4920 - }, - { - "epoch": 0.9246228804389152, - "grad_norm": 0.051861703395843506, - "learning_rate": 4.188661554798369e-05, - "loss": 0.011043114960193634, - "step": 4940 - }, - { - "epoch": 0.9283662929103277, - "grad_norm": 1.7019767761230469, - "learning_rate": 4.1824582849485884e-05, - "loss": 0.004985674470663071, - "step": 4960 - }, - { - "epoch": 0.9321097053817403, - "grad_norm": 0.021240154281258583, - "learning_rate": 4.176236022955755e-05, - "loss": 0.04885836541652679, - "step": 4980 - }, - { - "epoch": 0.935853117853153, - "grad_norm": 0.016504865139722824, - "learning_rate": 4.16999483905881e-05, - "loss": 0.0027378931641578673, - "step": 5000 - }, - { - "epoch": 0.9395965303245656, - "grad_norm": 0.014015628024935722, - "learning_rate": 4.163734803710294e-05, - "loss": 0.012781022489070893, - "step": 5020 - }, - { - "epoch": 0.9433399427959782, - "grad_norm": 0.013812500052154064, - "learning_rate": 4.157455987575545e-05, - "loss": 0.007508871704339981, - "step": 5040 - }, - { - "epoch": 0.9470833552673907, - "grad_norm": 0.01622290164232254, - "learning_rate": 4.1511584615319075e-05, - "loss": 0.0014614147134125234, - "step": 5060 - }, - { - "epoch": 0.9508267677388034, - "grad_norm": 0.01259149145334959, - "learning_rate": 4.144842296667929e-05, - "loss": 0.006202424317598343, - "step": 5080 - }, - { - "epoch": 0.954570180210216, - "grad_norm": 0.012383027002215385, - "learning_rate": 4.138507564282558e-05, - "loss": 0.006122353300452232, - "step": 5100 - }, - { - "epoch": 0.9583135926816286, - "grad_norm": 0.006499920971691608, - "learning_rate": 4.1321543358843385e-05, - "loss": 0.0008865024894475937, - "step": 5120 - }, - { - "epoch": 0.9620570051530413, - "grad_norm": 0.00830752868205309, - "learning_rate": 4.125782683190606e-05, - "loss": 0.0008420860394835472, - "step": 5140 - }, - { - "epoch": 0.9658004176244538, - "grad_norm": 0.01525857299566269, - "learning_rate": 4.119392678126673e-05, - "loss": 0.00587364137172699, - "step": 5160 - }, - { - "epoch": 0.9695438300958664, - "grad_norm": 0.01072095800191164, - "learning_rate": 4.11298439282502e-05, - "loss": 0.00853007659316063, - "step": 5180 - }, - { - "epoch": 0.973287242567279, - "grad_norm": 0.030316641554236412, - "learning_rate": 4.106557899624482e-05, - "loss": 0.0058747071772813795, - "step": 5200 - }, - { - "epoch": 0.9770306550386917, - "grad_norm": 0.0391647033393383, - "learning_rate": 4.1001132710694304e-05, - "loss": 0.0034765828400850295, - "step": 5220 - }, - { - "epoch": 0.9807740675101043, - "grad_norm": 0.04938298836350441, - "learning_rate": 4.093650579908953e-05, - "loss": 0.007594724744558334, - "step": 5240 - }, - { - "epoch": 0.984517479981517, - "grad_norm": 0.005873252172023058, - "learning_rate": 4.087169899096037e-05, - "loss": 0.013347607851028443, - "step": 5260 - }, - { - "epoch": 0.9882608924529295, - "grad_norm": 1.2757259607315063, - "learning_rate": 4.080671301786741e-05, - "loss": 0.004837355017662049, - "step": 5280 - }, - { - "epoch": 0.9920043049243421, - "grad_norm": 0.00920735765248537, - "learning_rate": 4.0741548613393675e-05, - "loss": 0.007415445148944854, - "step": 5300 - }, - { - "epoch": 0.9957477173957547, - "grad_norm": 0.5702093839645386, - "learning_rate": 4.067620651313647e-05, - "loss": 0.00406576506793499, - "step": 5320 - }, - { - "epoch": 0.9994911298671674, - "grad_norm": 1.8361051082611084, - "learning_rate": 4.0610687454698906e-05, - "loss": 0.00997612327337265, - "step": 5340 - }, - { - "epoch": 1.0031819006007008, - "grad_norm": 3.335326910018921, - "learning_rate": 4.0544992177681685e-05, - "loss": 0.008442799001932145, - "step": 5360 - }, - { - "epoch": 1.0069253130721134, - "grad_norm": 0.03184954449534416, - "learning_rate": 4.047912142367473e-05, - "loss": 0.008095134049654007, - "step": 5380 - }, - { - "epoch": 1.010668725543526, - "grad_norm": 0.029989074915647507, - "learning_rate": 4.04130759362488e-05, - "loss": 0.0012585990130901336, - "step": 5400 - }, - { - "epoch": 1.0144121380149385, - "grad_norm": 0.08727464079856873, - "learning_rate": 4.034685646094711e-05, - "loss": 0.012588074803352356, - "step": 5420 - }, - { - "epoch": 1.018155550486351, - "grad_norm": 0.018498806282877922, - "learning_rate": 4.028046374527689e-05, - "loss": 0.001854238100349903, - "step": 5440 - }, - { - "epoch": 1.0218989629577637, - "grad_norm": 0.013779236935079098, - "learning_rate": 4.021389853870095e-05, - "loss": 0.0008004569448530674, - "step": 5460 - }, - { - "epoch": 1.0256423754291764, - "grad_norm": 0.028235070407390594, - "learning_rate": 4.0147161592629306e-05, - "loss": 0.002274145185947418, - "step": 5480 - }, - { - "epoch": 1.029385787900589, - "grad_norm": 0.023030120879411697, - "learning_rate": 4.008025366041055e-05, - "loss": 0.008717305958271027, - "step": 5500 - }, - { - "epoch": 1.0331292003720016, - "grad_norm": 0.018347155302762985, - "learning_rate": 4.001317549732345e-05, - "loss": 0.00244256854057312, - "step": 5520 - }, - { - "epoch": 1.0368726128434143, - "grad_norm": 0.03449391946196556, - "learning_rate": 3.99459278605684e-05, - "loss": 0.0039924226701259615, - "step": 5540 - }, - { - "epoch": 1.0406160253148269, - "grad_norm": 0.030406463891267776, - "learning_rate": 3.9878511509258866e-05, - "loss": 0.0021008485928177834, - "step": 5560 - }, - { - "epoch": 1.0443594377862395, - "grad_norm": 0.01783100888133049, - "learning_rate": 3.9810927204412803e-05, - "loss": 0.0006656501442193985, - "step": 5580 - }, - { - "epoch": 1.0481028502576522, - "grad_norm": 0.05360455811023712, - "learning_rate": 3.974317570894413e-05, - "loss": 0.005278818309307098, - "step": 5600 - }, - { - "epoch": 1.0518462627290646, - "grad_norm": 0.008699169382452965, - "learning_rate": 3.9675257787654e-05, - "loss": 0.005309444293379784, - "step": 5620 - }, - { - "epoch": 1.0555896752004772, - "grad_norm": 0.036641959100961685, - "learning_rate": 3.960717420722227e-05, - "loss": 0.0034692320972681046, - "step": 5640 - }, - { - "epoch": 1.0593330876718898, - "grad_norm": 0.012212110683321953, - "learning_rate": 3.953892573619883e-05, - "loss": 0.005343861132860184, - "step": 5660 - }, - { - "epoch": 1.0630765001433025, - "grad_norm": 0.011296284385025501, - "learning_rate": 3.947051314499489e-05, - "loss": 0.0038058970123529432, - "step": 5680 - }, - { - "epoch": 1.066819912614715, - "grad_norm": 0.05954049900174141, - "learning_rate": 3.94019372058743e-05, - "loss": 0.008142991364002228, - "step": 5700 - }, - { - "epoch": 1.0705633250861277, - "grad_norm": 0.03478416055440903, - "learning_rate": 3.933319869294483e-05, - "loss": 0.0075227849185466765, - "step": 5720 - }, - { - "epoch": 1.0743067375575404, - "grad_norm": 0.014586996287107468, - "learning_rate": 3.9264298382149455e-05, - "loss": 0.0036750122904777526, - "step": 5740 - }, - { - "epoch": 1.078050150028953, - "grad_norm": 0.025754544883966446, - "learning_rate": 3.919523705125757e-05, - "loss": 0.004151013493537903, - "step": 5760 - }, - { - "epoch": 1.0817935625003656, - "grad_norm": 0.03239905461668968, - "learning_rate": 3.9126015479856205e-05, - "loss": 0.00861695185303688, - "step": 5780 - }, - { - "epoch": 1.0855369749717783, - "grad_norm": 0.03506994619965553, - "learning_rate": 3.9056634449341256e-05, - "loss": 0.003123755753040314, - "step": 5800 - }, - { - "epoch": 1.089280387443191, - "grad_norm": 0.0286911278963089, - "learning_rate": 3.898709474290864e-05, - "loss": 0.002537376619875431, - "step": 5820 - }, - { - "epoch": 1.0930237999146033, - "grad_norm": 0.03490692004561424, - "learning_rate": 3.8917397145545454e-05, - "loss": 0.0010227372869849205, - "step": 5840 - }, - { - "epoch": 1.096767212386016, - "grad_norm": 0.013748899102210999, - "learning_rate": 3.884754244402113e-05, - "loss": 0.011847371608018875, - "step": 5860 - }, - { - "epoch": 1.1005106248574286, - "grad_norm": 0.035458195954561234, - "learning_rate": 3.877753142687852e-05, - "loss": 0.009741749614477158, - "step": 5880 - }, - { - "epoch": 1.1042540373288412, - "grad_norm": 0.012493673712015152, - "learning_rate": 3.8707364884425064e-05, - "loss": 0.006607493013143539, - "step": 5900 - }, - { - "epoch": 1.1079974498002538, - "grad_norm": 0.018607834354043007, - "learning_rate": 3.863704360872378e-05, - "loss": 0.0016217166557908058, - "step": 5920 - }, - { - "epoch": 1.1117408622716665, - "grad_norm": 0.0283930953592062, - "learning_rate": 3.8566568393584366e-05, - "loss": 0.002083975449204445, - "step": 5940 - }, - { - "epoch": 1.115484274743079, - "grad_norm": 0.05229801684617996, - "learning_rate": 3.8495940034554283e-05, - "loss": 0.0014217685908079146, - "step": 5960 - }, - { - "epoch": 1.1192276872144917, - "grad_norm": 0.008808930404484272, - "learning_rate": 3.8425159328909684e-05, - "loss": 0.0022570645436644555, - "step": 5980 - }, - { - "epoch": 1.1229710996859044, - "grad_norm": 0.020502232015132904, - "learning_rate": 3.835422707564648e-05, - "loss": 0.003745942190289497, - "step": 6000 - }, - { - "epoch": 1.126714512157317, - "grad_norm": 0.032347094267606735, - "learning_rate": 3.82831440754713e-05, - "loss": 0.003347185626626015, - "step": 6020 - }, - { - "epoch": 1.1304579246287294, - "grad_norm": 0.020310478284955025, - "learning_rate": 3.821191113079246e-05, - "loss": 0.006166417896747589, - "step": 6040 - }, - { - "epoch": 1.134201337100142, - "grad_norm": 0.06390372663736343, - "learning_rate": 3.8140529045710876e-05, - "loss": 0.0013674044981598853, - "step": 6060 - }, - { - "epoch": 1.1379447495715547, - "grad_norm": 1.1938918828964233, - "learning_rate": 3.806899862601105e-05, - "loss": 0.010550644248723984, - "step": 6080 - }, - { - "epoch": 1.1416881620429673, - "grad_norm": 0.035355549305677414, - "learning_rate": 3.799732067915189e-05, - "loss": 0.0069750770926475525, - "step": 6100 - }, - { - "epoch": 1.14543157451438, - "grad_norm": 0.009921093471348286, - "learning_rate": 3.792549601425767e-05, - "loss": 0.0027949588373303415, - "step": 6120 - }, - { - "epoch": 1.1491749869857926, - "grad_norm": 0.06172063946723938, - "learning_rate": 3.785352544210884e-05, - "loss": 0.0009372101165354251, - "step": 6140 - }, - { - "epoch": 1.1529183994572052, - "grad_norm": 0.008572470396757126, - "learning_rate": 3.778140977513294e-05, - "loss": 0.0029502738267183303, - "step": 6160 - }, - { - "epoch": 1.1566618119286178, - "grad_norm": 0.4211727976799011, - "learning_rate": 3.770914982739534e-05, - "loss": 0.014692296087741853, - "step": 6180 - }, - { - "epoch": 1.1604052244000305, - "grad_norm": 0.02292146533727646, - "learning_rate": 3.7636746414590126e-05, - "loss": 0.0020170681178569793, - "step": 6200 - }, - { - "epoch": 1.164148636871443, - "grad_norm": 0.11247449368238449, - "learning_rate": 3.756420035403086e-05, - "loss": 0.006851900368928909, - "step": 6220 - }, - { - "epoch": 1.1678920493428557, - "grad_norm": 0.020755017176270485, - "learning_rate": 3.749151246464137e-05, - "loss": 0.0021739909425377846, - "step": 6240 - }, - { - "epoch": 1.1716354618142684, - "grad_norm": 0.017202025279402733, - "learning_rate": 3.741868356694647e-05, - "loss": 0.002353278361260891, - "step": 6260 - }, - { - "epoch": 1.1753788742856808, - "grad_norm": 0.014947429299354553, - "learning_rate": 3.734571448306274e-05, - "loss": 0.0010860362090170383, - "step": 6280 - }, - { - "epoch": 1.1791222867570934, - "grad_norm": 1.5391262769699097, - "learning_rate": 3.727260603668922e-05, - "loss": 0.01233254000544548, - "step": 6300 - }, - { - "epoch": 1.182865699228506, - "grad_norm": 0.4759792387485504, - "learning_rate": 3.7199359053098133e-05, - "loss": 0.0028501398861408233, - "step": 6320 - }, - { - "epoch": 1.1866091116999187, - "grad_norm": 0.01719040609896183, - "learning_rate": 3.7125974359125536e-05, - "loss": 0.00934450700879097, - "step": 6340 - }, - { - "epoch": 1.1903525241713313, - "grad_norm": 2.4766688346862793, - "learning_rate": 3.7052452783162015e-05, - "loss": 0.018582724034786224, - "step": 6360 - }, - { - "epoch": 1.194095936642744, - "grad_norm": 0.11404932290315628, - "learning_rate": 3.6978795155143326e-05, - "loss": 0.01815672367811203, - "step": 6380 - }, - { - "epoch": 1.1978393491141566, - "grad_norm": 0.021365633234381676, - "learning_rate": 3.690500230654103e-05, - "loss": 0.004123781993985176, - "step": 6400 - }, - { - "epoch": 1.2015827615855692, - "grad_norm": 0.022478772327303886, - "learning_rate": 3.68310750703531e-05, - "loss": 0.0038731731474399567, - "step": 6420 - }, - { - "epoch": 1.2053261740569818, - "grad_norm": 0.15531578660011292, - "learning_rate": 3.67570142810945e-05, - "loss": 0.002076444961130619, - "step": 6440 - }, - { - "epoch": 1.2090695865283942, - "grad_norm": 0.012458150275051594, - "learning_rate": 3.668282077478783e-05, - "loss": 0.0027592860162258146, - "step": 6460 - }, - { - "epoch": 1.2128129989998069, - "grad_norm": 0.01572798565030098, - "learning_rate": 3.66084953889538e-05, - "loss": 0.002740098722279072, - "step": 6480 - }, - { - "epoch": 1.2165564114712195, - "grad_norm": 0.13682503998279572, - "learning_rate": 3.6534038962601835e-05, - "loss": 0.000705425813794136, - "step": 6500 - }, - { - "epoch": 1.2202998239426321, - "grad_norm": 0.030630914494395256, - "learning_rate": 3.64594523362206e-05, - "loss": 0.012480729073286057, - "step": 6520 - }, - { - "epoch": 1.2240432364140448, - "grad_norm": 0.024804554879665375, - "learning_rate": 3.638473635176848e-05, - "loss": 0.0007834361866116523, - "step": 6540 - }, - { - "epoch": 1.2277866488854574, - "grad_norm": 0.011334752663969994, - "learning_rate": 3.630989185266411e-05, - "loss": 0.022086825966835023, - "step": 6560 - }, - { - "epoch": 1.23153006135687, - "grad_norm": 0.020346902310848236, - "learning_rate": 3.623491968377684e-05, - "loss": 0.018024472892284392, - "step": 6580 - }, - { - "epoch": 1.2352734738282827, - "grad_norm": 0.015177210792899132, - "learning_rate": 3.615982069141719e-05, - "loss": 0.005251453071832657, - "step": 6600 - }, - { - "epoch": 1.2390168862996953, - "grad_norm": 0.013680647127330303, - "learning_rate": 3.608459572332733e-05, - "loss": 0.006734563410282135, - "step": 6620 - }, - { - "epoch": 1.242760298771108, - "grad_norm": 0.17980872094631195, - "learning_rate": 3.600924562867144e-05, - "loss": 0.003970410302281379, - "step": 6640 - }, - { - "epoch": 1.2465037112425206, - "grad_norm": 0.015203841030597687, - "learning_rate": 3.593377125802622e-05, - "loss": 0.0032148901373147964, - "step": 6660 - }, - { - "epoch": 1.2502471237139332, - "grad_norm": 0.017300931736826897, - "learning_rate": 3.585817346337119e-05, - "loss": 0.00467667318880558, - "step": 6680 - }, - { - "epoch": 1.2539905361853458, - "grad_norm": 0.028181765228509903, - "learning_rate": 3.5782453098079175e-05, - "loss": 0.0015515764243900776, - "step": 6700 - }, - { - "epoch": 1.2577339486567582, - "grad_norm": 0.01730780117213726, - "learning_rate": 3.570661101690657e-05, - "loss": 0.007991334050893783, - "step": 6720 - }, - { - "epoch": 1.2614773611281709, - "grad_norm": 0.014216347597539425, - "learning_rate": 3.5630648075983763e-05, - "loss": 0.002533360943198204, - "step": 6740 - }, - { - "epoch": 1.2652207735995835, - "grad_norm": 0.1556195169687271, - "learning_rate": 3.555456513280544e-05, - "loss": 0.0032653655856847764, - "step": 6760 - }, - { - "epoch": 1.2689641860709961, - "grad_norm": 0.023955868557095528, - "learning_rate": 3.5478363046220915e-05, - "loss": 0.00850408971309662, - "step": 6780 - }, - { - "epoch": 1.2727075985424088, - "grad_norm": 0.17874136567115784, - "learning_rate": 3.5402042676424424e-05, - "loss": 0.0032720811665058135, - "step": 6800 - }, - { - "epoch": 1.2764510110138214, - "grad_norm": 0.0899379625916481, - "learning_rate": 3.5325604884945434e-05, - "loss": 0.003243798017501831, - "step": 6820 - }, - { - "epoch": 1.280194423485234, - "grad_norm": 0.413362056016922, - "learning_rate": 3.5249050534638906e-05, - "loss": 0.0036127623170614243, - "step": 6840 - }, - { - "epoch": 1.2839378359566467, - "grad_norm": 0.02790931612253189, - "learning_rate": 3.517238048967554e-05, - "loss": 0.008225285261869431, - "step": 6860 - }, - { - "epoch": 1.287681248428059, - "grad_norm": 0.6761110424995422, - "learning_rate": 3.5095595615532056e-05, - "loss": 0.00199942234903574, - "step": 6880 - }, - { - "epoch": 1.2914246608994717, - "grad_norm": 4.593618869781494, - "learning_rate": 3.5018696778981385e-05, - "loss": 0.007301987707614898, - "step": 6900 - }, - { - "epoch": 1.2951680733708844, - "grad_norm": 0.09392693638801575, - "learning_rate": 3.494168484808293e-05, - "loss": 0.009008315950632095, - "step": 6920 - }, - { - "epoch": 1.298911485842297, - "grad_norm": 0.008239852264523506, - "learning_rate": 3.48645606921727e-05, - "loss": 0.012661360204219818, - "step": 6940 - }, - { - "epoch": 1.3026548983137096, - "grad_norm": 0.05141177773475647, - "learning_rate": 3.4787325181853576e-05, - "loss": 0.0007553372532129287, - "step": 6960 - }, - { - "epoch": 1.3063983107851223, - "grad_norm": 0.024333903566002846, - "learning_rate": 3.470997918898541e-05, - "loss": 0.0016128463670611382, - "step": 6980 - }, - { - "epoch": 1.3101417232565349, - "grad_norm": 0.0337531715631485, - "learning_rate": 3.4632523586675254e-05, - "loss": 0.003253454715013504, - "step": 7000 - }, - { - "epoch": 1.3138851357279475, - "grad_norm": 0.05121550336480141, - "learning_rate": 3.4554959249267436e-05, - "loss": 0.0026307271793484686, - "step": 7020 - }, - { - "epoch": 1.3176285481993602, - "grad_norm": 0.025997543707489967, - "learning_rate": 3.447728705233374e-05, - "loss": 0.0012719514779746532, - "step": 7040 - }, - { - "epoch": 1.3213719606707728, - "grad_norm": 0.009486268274486065, - "learning_rate": 3.4399507872663494e-05, - "loss": 0.002009082958102226, - "step": 7060 - }, - { - "epoch": 1.3251153731421854, - "grad_norm": 0.016816232353448868, - "learning_rate": 3.432162258825369e-05, - "loss": 0.0005956823006272316, - "step": 7080 - }, - { - "epoch": 1.328858785613598, - "grad_norm": 0.004733961541205645, - "learning_rate": 3.424363207829906e-05, - "loss": 0.003636709600687027, - "step": 7100 - }, - { - "epoch": 1.3326021980850107, - "grad_norm": 3.666203498840332, - "learning_rate": 3.4165537223182155e-05, - "loss": 0.010488419234752655, - "step": 7120 - }, - { - "epoch": 1.336345610556423, - "grad_norm": 0.021471882238984108, - "learning_rate": 3.408733890446341e-05, - "loss": 0.0009709249250590801, - "step": 7140 - }, - { - "epoch": 1.3400890230278357, - "grad_norm": 0.007639541756361723, - "learning_rate": 3.40090380048712e-05, - "loss": 0.0030905861407518388, - "step": 7160 - }, - { - "epoch": 1.3438324354992484, - "grad_norm": 0.16878941655158997, - "learning_rate": 3.393063540829186e-05, - "loss": 0.0036965351551771163, - "step": 7180 - }, - { - "epoch": 1.347575847970661, - "grad_norm": 0.07014094293117523, - "learning_rate": 3.385213199975971e-05, - "loss": 0.0005677144508808851, - "step": 7200 - }, - { - "epoch": 1.3513192604420736, - "grad_norm": 0.008626374416053295, - "learning_rate": 3.377352866544706e-05, - "loss": 0.0005447934381663799, - "step": 7220 - }, - { - "epoch": 1.3550626729134863, - "grad_norm": 0.013825134374201298, - "learning_rate": 3.3694826292654246e-05, - "loss": 0.004854041337966919, - "step": 7240 - }, - { - "epoch": 1.3588060853848989, - "grad_norm": 0.025015883147716522, - "learning_rate": 3.361602576979956e-05, - "loss": 0.004542553424835205, - "step": 7260 - }, - { - "epoch": 1.3625494978563115, - "grad_norm": 0.009614030830562115, - "learning_rate": 3.353712798640923e-05, - "loss": 0.0008775785565376282, - "step": 7280 - }, - { - "epoch": 1.366292910327724, - "grad_norm": 3.8835268020629883, - "learning_rate": 3.345813383310744e-05, - "loss": 0.0063879616558551785, - "step": 7300 - }, - { - "epoch": 1.3700363227991366, - "grad_norm": 0.005518193822354078, - "learning_rate": 3.337904420160618e-05, - "loss": 0.0010956574231386184, - "step": 7320 - }, - { - "epoch": 1.3737797352705492, - "grad_norm": 0.005018322728574276, - "learning_rate": 3.329985998469526e-05, - "loss": 0.0012317843735218047, - "step": 7340 - }, - { - "epoch": 0.6887872232777639, - "grad_norm": 0.3108454942703247, - "learning_rate": 3.322058207623218e-05, - "loss": 0.010070423781871795, - "step": 7360 - }, - { - "epoch": 0.6906589276888447, - "grad_norm": 0.3556046783924103, - "learning_rate": 3.314121137113209e-05, - "loss": 0.0278738796710968, - "step": 7380 - }, - { - "epoch": 0.6925306320999256, - "grad_norm": 4.041794300079346, - "learning_rate": 3.306174876535762e-05, - "loss": 0.025335192680358887, - "step": 7400 - }, - { - "epoch": 0.6944023365110065, - "grad_norm": 0.04647493362426758, - "learning_rate": 3.2982195155908845e-05, - "loss": 0.05056847333908081, - "step": 7420 - }, - { - "epoch": 0.6962740409220873, - "grad_norm": 0.6827419400215149, - "learning_rate": 3.290653575270209e-05, - "loss": 0.036053261160850524, - "step": 7440 - }, - { - "epoch": 0.6981457453331683, - "grad_norm": 0.256136029958725, - "learning_rate": 3.2826807269966064e-05, - "loss": 0.020640365779399872, - "step": 7460 - }, - { - "epoch": 0.7000174497442492, - "grad_norm": 0.2054845094680786, - "learning_rate": 3.274699043565268e-05, - "loss": 0.03456352353096008, - "step": 7480 - }, - { - "epoch": 0.70188915415533, - "grad_norm": 0.2027648538351059, - "learning_rate": 3.266708615076064e-05, - "loss": 0.00846734493970871, - "step": 7500 - }, - { - "epoch": 0.7037608585664109, - "grad_norm": 1.6423311233520508, - "learning_rate": 3.258709531727582e-05, - "loss": 0.054978948831558225, - "step": 7520 - }, - { - "epoch": 0.7056325629774918, - "grad_norm": 1.775089144706726, - "learning_rate": 3.2507018838161085e-05, - "loss": 0.03238933086395264, - "step": 7540 - }, - { - "epoch": 0.7075042673885726, - "grad_norm": 0.06917860358953476, - "learning_rate": 3.242685761734609e-05, - "loss": 0.016849520802497863, - "step": 7560 - }, - { - "epoch": 0.7093759717996535, - "grad_norm": 0.051443129777908325, - "learning_rate": 3.2346612559717094e-05, - "loss": 0.048251998424530027, - "step": 7580 - }, - { - "epoch": 0.7112476762107344, - "grad_norm": 0.06533925980329514, - "learning_rate": 3.226628457110672e-05, - "loss": 0.03696450293064117, - "step": 7600 - }, - { - "epoch": 0.7131193806218153, - "grad_norm": 0.45661595463752747, - "learning_rate": 3.218587455828377e-05, - "loss": 0.05503013730049133, - "step": 7620 - }, - { - "epoch": 0.7149910850328962, - "grad_norm": 2.0205914974212646, - "learning_rate": 3.210538342894291e-05, - "loss": 0.033562681078910826, - "step": 7640 - }, - { - "epoch": 0.7168627894439771, - "grad_norm": 2.4842448234558105, - "learning_rate": 3.202481209169455e-05, - "loss": 0.019278638064861298, - "step": 7660 - }, - { - "epoch": 0.7187344938550579, - "grad_norm": 0.10550081729888916, - "learning_rate": 3.1944161456054436e-05, - "loss": 0.01638232171535492, - "step": 7680 - }, - { - "epoch": 0.7206061982661388, - "grad_norm": 1.606436014175415, - "learning_rate": 3.1863432432433506e-05, - "loss": 0.020552067458629607, - "step": 7700 - }, - { - "epoch": 0.7224779026772197, - "grad_norm": 0.2617719769477844, - "learning_rate": 3.178262593212757e-05, - "loss": 0.02315783053636551, - "step": 7720 - }, - { - "epoch": 0.7243496070883005, - "grad_norm": 0.9734074473381042, - "learning_rate": 3.1701742867307e-05, - "loss": 0.01938771307468414, - "step": 7740 - }, - { - "epoch": 0.7262213114993814, - "grad_norm": 0.5882985591888428, - "learning_rate": 3.162078415100647e-05, - "loss": 0.011305707693099975, - "step": 7760 - }, - { - "epoch": 0.7280930159104624, - "grad_norm": 0.04298723489046097, - "learning_rate": 3.15397506971146e-05, - "loss": 0.04238930344581604, - "step": 7780 - }, - { - "epoch": 0.7299647203215432, - "grad_norm": 6.2729315757751465, - "learning_rate": 3.145864342036372e-05, - "loss": 0.030225831270217895, - "step": 7800 - }, - { - "epoch": 0.7318364247326241, - "grad_norm": 0.026423340663313866, - "learning_rate": 3.1377463236319476e-05, - "loss": 0.012169972807168961, - "step": 7820 - }, - { - "epoch": 0.733708129143705, - "grad_norm": 0.0296376533806324, - "learning_rate": 3.1296211061370495e-05, - "loss": 0.015344823896884918, - "step": 7840 - }, - { - "epoch": 0.7355798335547858, - "grad_norm": 0.029524821788072586, - "learning_rate": 3.1214887812718094e-05, - "loss": 0.028345003724098206, - "step": 7860 - }, - { - "epoch": 0.7374515379658667, - "grad_norm": 0.06847794353961945, - "learning_rate": 3.113349440836588e-05, - "loss": 0.020069575309753417, - "step": 7880 - }, - { - "epoch": 0.7393232423769476, - "grad_norm": 0.024868430569767952, - "learning_rate": 3.1052031767109376e-05, - "loss": 0.014262473583221436, - "step": 7900 - }, - { - "epoch": 0.7411949467880286, - "grad_norm": 0.24450063705444336, - "learning_rate": 3.097050080852573e-05, - "loss": 0.04350808262825012, - "step": 7920 - }, - { - "epoch": 0.7430666511991094, - "grad_norm": 0.06978324800729752, - "learning_rate": 3.088890245296322e-05, - "loss": 0.015559709072113037, - "step": 7940 - }, - { - "epoch": 0.7449383556101903, - "grad_norm": 0.12675604224205017, - "learning_rate": 3.0807237621530964e-05, - "loss": 0.013867451250553131, - "step": 7960 - }, - { - "epoch": 0.7468100600212711, - "grad_norm": 0.2605513334274292, - "learning_rate": 3.072550723608846e-05, - "loss": 0.012869009375572204, - "step": 7980 - }, - { - "epoch": 0.748681764432352, - "grad_norm": 3.325530529022217, - "learning_rate": 3.064371221923521e-05, - "loss": 0.03036353886127472, - "step": 8000 - }, - { - "epoch": 0.7505534688434329, - "grad_norm": 0.22703051567077637, - "learning_rate": 3.0561853494300294e-05, - "loss": 0.009017374366521835, - "step": 8020 - }, - { - "epoch": 0.7524251732545137, - "grad_norm": 6.404862880706787, - "learning_rate": 3.047993198533195e-05, - "loss": 0.020604299008846284, - "step": 8040 - }, - { - "epoch": 0.7542968776655946, - "grad_norm": 0.06491954624652863, - "learning_rate": 3.039794861708714e-05, - "loss": 0.014963623881340028, - "step": 8060 - }, - { - "epoch": 0.7561685820766756, - "grad_norm": 0.4990088641643524, - "learning_rate": 3.0315904315021128e-05, - "loss": 0.02046530395746231, - "step": 8080 - }, - { - "epoch": 0.7580402864877565, - "grad_norm": 0.3174229562282562, - "learning_rate": 3.023380000527699e-05, - "loss": 0.013621781766414643, - "step": 8100 - }, - { - "epoch": 0.7599119908988373, - "grad_norm": 0.07161428034305573, - "learning_rate": 3.0151636614675218e-05, - "loss": 0.008043503761291504, - "step": 8120 - }, - { - "epoch": 0.7617836953099182, - "grad_norm": 0.6772736310958862, - "learning_rate": 3.0069415070703217e-05, - "loss": 0.03563189804553986, - "step": 8140 - }, - { - "epoch": 0.763655399720999, - "grad_norm": 0.07689516246318817, - "learning_rate": 2.998713630150485e-05, - "loss": 0.008622632920742035, - "step": 8160 - }, - { - "epoch": 0.7655271041320799, - "grad_norm": 0.014181110076606274, - "learning_rate": 2.990480123586994e-05, - "loss": 0.012368627637624741, - "step": 8180 - }, - { - "epoch": 0.7673988085431608, - "grad_norm": 4.4751715660095215, - "learning_rate": 2.9822410803223822e-05, - "loss": 0.02100955694913864, - "step": 8200 - }, - { - "epoch": 0.7692705129542416, - "grad_norm": 0.12694527208805084, - "learning_rate": 2.9739965933616825e-05, - "loss": 0.018182000517845152, - "step": 8220 - }, - { - "epoch": 0.7711422173653226, - "grad_norm": 0.13789872825145721, - "learning_rate": 2.9657467557713792e-05, - "loss": 0.008949784934520722, - "step": 8240 - }, - { - "epoch": 0.7730139217764035, - "grad_norm": 0.04048463702201843, - "learning_rate": 2.957491660678354e-05, - "loss": 0.03582434058189392, - "step": 8260 - }, - { - "epoch": 0.7748856261874844, - "grad_norm": 0.7825964689254761, - "learning_rate": 2.9492314012688378e-05, - "loss": 0.012679101526737213, - "step": 8280 - }, - { - "epoch": 0.7767573305985652, - "grad_norm": 0.14350314438343048, - "learning_rate": 2.9409660707873597e-05, - "loss": 0.010909486562013626, - "step": 8300 - }, - { - "epoch": 0.7786290350096461, - "grad_norm": 0.17676737904548645, - "learning_rate": 2.932695762535691e-05, - "loss": 0.01464642733335495, - "step": 8320 - }, - { - "epoch": 0.780500739420727, - "grad_norm": 0.5979751348495483, - "learning_rate": 2.9244205698717943e-05, - "loss": 0.028799059987068176, - "step": 8340 - }, - { - "epoch": 0.7823724438318078, - "grad_norm": 0.08448052406311035, - "learning_rate": 2.9161405862087676e-05, - "loss": 0.014056096971035003, - "step": 8360 - }, - { - "epoch": 0.7842441482428888, - "grad_norm": 0.5616207122802734, - "learning_rate": 2.9078559050137955e-05, - "loss": 0.008744364231824875, - "step": 8380 - }, - { - "epoch": 0.7861158526539697, - "grad_norm": 0.7264829277992249, - "learning_rate": 2.8995666198070836e-05, - "loss": 0.014575870335102081, - "step": 8400 - }, - { - "epoch": 0.7879875570650505, - "grad_norm": 1.444239616394043, - "learning_rate": 2.891272824160815e-05, - "loss": 0.01230706349015236, - "step": 8420 - }, - { - "epoch": 0.7898592614761314, - "grad_norm": 0.02643579989671707, - "learning_rate": 2.882974611698084e-05, - "loss": 0.01713460832834244, - "step": 8440 - }, - { - "epoch": 0.7917309658872123, - "grad_norm": 0.19893163442611694, - "learning_rate": 2.8746720760918457e-05, - "loss": 0.009562552720308305, - "step": 8460 - }, - { - "epoch": 0.7936026702982931, - "grad_norm": 1.8813897371292114, - "learning_rate": 2.866365311063855e-05, - "loss": 0.01966284364461899, - "step": 8480 - }, - { - "epoch": 0.795474374709374, - "grad_norm": 0.1820579618215561, - "learning_rate": 2.8580544103836114e-05, - "loss": 0.023943188786506652, - "step": 8500 - }, - { - "epoch": 0.7973460791204549, - "grad_norm": 1.3913259506225586, - "learning_rate": 2.849739467867298e-05, - "loss": 0.02233349084854126, - "step": 8520 - }, - { - "epoch": 0.7992177835315358, - "grad_norm": 0.28450486063957214, - "learning_rate": 2.8414205773767223e-05, - "loss": 0.016230446100234986, - "step": 8540 - }, - { - "epoch": 0.8010894879426167, - "grad_norm": 0.46086356043815613, - "learning_rate": 2.83309783281826e-05, - "loss": 0.013964855670928955, - "step": 8560 - }, - { - "epoch": 0.8029611923536976, - "grad_norm": 1.1401137113571167, - "learning_rate": 2.8247713281417924e-05, - "loss": 0.01552264392375946, - "step": 8580 - }, - { - "epoch": 0.8048328967647784, - "grad_norm": 0.02414649911224842, - "learning_rate": 2.8164411573396444e-05, - "loss": 0.00505053773522377, - "step": 8600 - }, - { - "epoch": 0.8067046011758593, - "grad_norm": 0.029010778293013573, - "learning_rate": 2.8081074144455276e-05, - "loss": 0.008068422973155975, - "step": 8620 - }, - { - "epoch": 0.8085763055869402, - "grad_norm": 0.024924319237470627, - "learning_rate": 2.7997701935334747e-05, - "loss": 0.021529987454414368, - "step": 8640 - }, - { - "epoch": 0.810448009998021, - "grad_norm": 0.3544171154499054, - "learning_rate": 2.791429588716782e-05, - "loss": 0.008264218270778657, - "step": 8660 - }, - { - "epoch": 0.8123197144091019, - "grad_norm": 0.011211074888706207, - "learning_rate": 2.7830856941469407e-05, - "loss": 0.013752134144306183, - "step": 8680 - }, - { - "epoch": 0.8141914188201829, - "grad_norm": 0.30479249358177185, - "learning_rate": 2.7747386040125807e-05, - "loss": 0.01313515156507492, - "step": 8700 - }, - { - "epoch": 0.8160631232312637, - "grad_norm": 3.1079516410827637, - "learning_rate": 2.766388412538404e-05, - "loss": 0.013471932709217071, - "step": 8720 - }, - { - "epoch": 0.8179348276423446, - "grad_norm": 0.011288405396044254, - "learning_rate": 2.758035213984121e-05, - "loss": 0.011207062005996703, - "step": 8740 - }, - { - "epoch": 0.8198065320534255, - "grad_norm": 0.011481484398245811, - "learning_rate": 2.749679102643387e-05, - "loss": 0.018254657089710236, - "step": 8760 - }, - { - "epoch": 0.8216782364645063, - "grad_norm": 0.037564992904663086, - "learning_rate": 2.7413201728427372e-05, - "loss": 0.024057184159755707, - "step": 8780 - }, - { - "epoch": 0.8235499408755872, - "grad_norm": 0.03808968514204025, - "learning_rate": 2.7329585189405253e-05, - "loss": 0.006051592528820038, - "step": 8800 - }, - { - "epoch": 0.8254216452866681, - "grad_norm": 0.07610247284173965, - "learning_rate": 2.724594235325852e-05, - "loss": 0.025592076778411865, - "step": 8820 - }, - { - "epoch": 0.827293349697749, - "grad_norm": 0.019049810245633125, - "learning_rate": 2.716227416417505e-05, - "loss": 0.0037486787885427477, - "step": 8840 - }, - { - "epoch": 0.8291650541088299, - "grad_norm": 0.6380273699760437, - "learning_rate": 2.7078581566628897e-05, - "loss": 0.015487492084503174, - "step": 8860 - }, - { - "epoch": 0.8310367585199108, - "grad_norm": 0.05775881186127663, - "learning_rate": 2.699486550536968e-05, - "loss": 0.03133237063884735, - "step": 8880 - }, - { - "epoch": 0.8329084629309916, - "grad_norm": 0.047411222010850906, - "learning_rate": 2.6911126925411845e-05, - "loss": 0.00861177071928978, - "step": 8900 - }, - { - "epoch": 0.8347801673420725, - "grad_norm": 0.23981286585330963, - "learning_rate": 2.682736677202406e-05, - "loss": 0.01839599907398224, - "step": 8920 - }, - { - "epoch": 0.8366518717531534, - "grad_norm": 0.36887305974960327, - "learning_rate": 2.6743585990718505e-05, - "loss": 0.01008533239364624, - "step": 8940 - }, - { - "epoch": 0.8385235761642342, - "grad_norm": 0.8994531035423279, - "learning_rate": 2.6659785527240233e-05, - "loss": 0.027107802033424378, - "step": 8960 - }, - { - "epoch": 0.8403952805753151, - "grad_norm": 0.12780402600765228, - "learning_rate": 2.6575966327556458e-05, - "loss": 0.03549482524394989, - "step": 8980 - }, - { - "epoch": 0.8422669849863961, - "grad_norm": 0.3294568359851837, - "learning_rate": 2.649212933784591e-05, - "loss": 0.02797776460647583, - "step": 9000 - }, - { - "epoch": 0.8441386893974769, - "grad_norm": 0.019461506977677345, - "learning_rate": 2.640827550448812e-05, - "loss": 0.010047334432601928, - "step": 9020 - }, - { - "epoch": 0.8460103938085578, - "grad_norm": 0.056546472012996674, - "learning_rate": 2.6324405774052784e-05, - "loss": 0.02831721007823944, - "step": 9040 - }, - { - "epoch": 0.8478820982196387, - "grad_norm": 0.017190299928188324, - "learning_rate": 2.6240521093289022e-05, - "loss": 0.019623257219791412, - "step": 9060 - }, - { - "epoch": 0.8497538026307195, - "grad_norm": 0.04793965816497803, - "learning_rate": 2.6156622409114728e-05, - "loss": 0.011966148018836975, - "step": 9080 - }, - { - "epoch": 0.8516255070418004, - "grad_norm": 0.006742037367075682, - "learning_rate": 2.607271066860587e-05, - "loss": 0.013694784045219422, - "step": 9100 - }, - { - "epoch": 0.8534972114528813, - "grad_norm": 0.03113027848303318, - "learning_rate": 2.5988786818985812e-05, - "loss": 0.05338943004608154, - "step": 9120 - }, - { - "epoch": 0.8553689158639621, - "grad_norm": 0.6589255928993225, - "learning_rate": 2.5904851807614588e-05, - "loss": 0.01305432766675949, - "step": 9140 - }, - { - "epoch": 0.8572406202750431, - "grad_norm": 0.3030281960964203, - "learning_rate": 2.582090658197825e-05, - "loss": 0.03663805425167084, - "step": 9160 - }, - { - "epoch": 0.859112324686124, - "grad_norm": 0.37101081013679504, - "learning_rate": 2.573695208967814e-05, - "loss": 0.016968609392642976, - "step": 9180 - }, - { - "epoch": 0.8609840290972048, - "grad_norm": 0.7480998039245605, - "learning_rate": 2.5652989278420197e-05, - "loss": 0.021240857243537904, - "step": 9200 - }, - { - "epoch": 0.8628557335082857, - "grad_norm": 0.017131274566054344, - "learning_rate": 2.5569019096004304e-05, - "loss": 0.004783949628472328, - "step": 9220 - }, - { - "epoch": 0.8647274379193666, - "grad_norm": 1.1544040441513062, - "learning_rate": 2.5485042490313504e-05, - "loss": 0.02356208860874176, - "step": 9240 - }, - { - "epoch": 0.8665991423304474, - "grad_norm": 0.13512635231018066, - "learning_rate": 2.540106040930338e-05, - "loss": 0.009329542517662048, - "step": 9260 - }, - { - "epoch": 0.8684708467415283, - "grad_norm": 0.018427839502692223, - "learning_rate": 2.5317073800991304e-05, - "loss": 0.007472375035285949, - "step": 9280 - }, - { - "epoch": 0.8703425511526093, - "grad_norm": 0.02722800336778164, - "learning_rate": 2.5233083613445778e-05, - "loss": 0.020304642617702484, - "step": 9300 - }, - { - "epoch": 0.8722142555636901, - "grad_norm": 0.051702745258808136, - "learning_rate": 2.5149090794775675e-05, - "loss": 0.02955295443534851, - "step": 9320 - }, - { - "epoch": 0.874085959974771, - "grad_norm": 0.1535400152206421, - "learning_rate": 2.5065096293119604e-05, - "loss": 0.030047640204429626, - "step": 9340 - }, - { - "epoch": 0.8759576643858519, - "grad_norm": 0.383573979139328, - "learning_rate": 2.498110105663513e-05, - "loss": 0.011377302557229995, - "step": 9360 - }, - { - "epoch": 0.8778293687969327, - "grad_norm": 0.23541487753391266, - "learning_rate": 2.489710603348817e-05, - "loss": 0.02304387390613556, - "step": 9380 - }, - { - "epoch": 0.8797010732080136, - "grad_norm": 0.029004938900470734, - "learning_rate": 2.4813112171842162e-05, - "loss": 0.020582889020442963, - "step": 9400 - }, - { - "epoch": 0.8815727776190945, - "grad_norm": 0.06564116477966309, - "learning_rate": 2.4729120419847498e-05, - "loss": 0.014207787811756134, - "step": 9420 - }, - { - "epoch": 0.8834444820301753, - "grad_norm": 0.01633615791797638, - "learning_rate": 2.464513172563072e-05, - "loss": 0.01756283938884735, - "step": 9440 - }, - { - "epoch": 0.8853161864412563, - "grad_norm": 0.01287770178169012, - "learning_rate": 2.456114703728386e-05, - "loss": 0.003737853467464447, - "step": 9460 - }, - { - "epoch": 0.8871878908523372, - "grad_norm": 0.05004064738750458, - "learning_rate": 2.448136615728485e-05, - "loss": 0.0324675589799881, - "step": 9480 - }, - { - "epoch": 0.889059595263418, - "grad_norm": 1.20869779586792, - "learning_rate": 2.4397392007153162e-05, - "loss": 0.007156150788068772, - "step": 9500 - }, - { - "epoch": 0.8909312996744989, - "grad_norm": 1.1070218086242676, - "learning_rate": 2.43134246594589e-05, - "loss": 0.009275762736797333, - "step": 9520 - }, - { - "epoch": 0.8928030040855798, - "grad_norm": 0.878593385219574, - "learning_rate": 2.4229465062053136e-05, - "loss": 0.018170186877250673, - "step": 9540 - }, - { - "epoch": 2.236302797078385, - "grad_norm": 1.878964900970459, - "learning_rate": 1.4461640332194936e-05, - "loss": 0.06906706094741821, - "step": 9560 - }, - { - "epoch": 2.2409820603868678, - "grad_norm": 0.7613142728805542, - "learning_rate": 1.4366537531356394e-05, - "loss": 0.03104197680950165, - "step": 9580 - }, - { - "epoch": 2.2456613236953507, - "grad_norm": 2.729102849960327, - "learning_rate": 1.4271622228435674e-05, - "loss": 0.06935752034187317, - "step": 9600 - }, - { - "epoch": 2.2503405870038335, - "grad_norm": 0.7153878808021545, - "learning_rate": 1.4176896097057135e-05, - "loss": 0.020206278562545775, - "step": 9620 - }, - { - "epoch": 2.2550198503123164, - "grad_norm": 0.028817689046263695, - "learning_rate": 1.4082360807509482e-05, - "loss": 0.0680353283882141, - "step": 9640 - }, - { - "epoch": 2.2596991136207993, - "grad_norm": 0.3162664473056793, - "learning_rate": 1.3988018026716371e-05, - "loss": 0.025713080167770387, - "step": 9660 - }, - { - "epoch": 2.2643783769292822, - "grad_norm": 1.5790144205093384, - "learning_rate": 1.3893869418206949e-05, - "loss": 0.05048573017120361, - "step": 9680 - }, - { - "epoch": 2.269057640237765, - "grad_norm": 2.7002806663513184, - "learning_rate": 1.3799916642086585e-05, - "loss": 0.04649330377578735, - "step": 9700 - }, - { - "epoch": 2.273736903546248, - "grad_norm": 2.5395395755767822, - "learning_rate": 1.3706161355007579e-05, - "loss": 0.03647684454917908, - "step": 9720 - }, - { - "epoch": 2.278416166854731, - "grad_norm": 0.08456363528966904, - "learning_rate": 1.3612605210139912e-05, - "loss": 0.02184589356184006, - "step": 9740 - }, - { - "epoch": 2.283095430163214, - "grad_norm": 0.11882820725440979, - "learning_rate": 1.3519249857142147e-05, - "loss": 0.024382126331329346, - "step": 9760 - }, - { - "epoch": 2.2877746934716967, - "grad_norm": 2.03177547454834, - "learning_rate": 1.3426096942132305e-05, - "loss": 0.017325276136398317, - "step": 9780 - }, - { - "epoch": 2.2924539567801796, - "grad_norm": 2.278486490249634, - "learning_rate": 1.3333148107658883e-05, - "loss": 0.020570486783981323, - "step": 9800 - }, - { - "epoch": 2.2971332200886625, - "grad_norm": 0.08188050240278244, - "learning_rate": 1.3240404992671823e-05, - "loss": 0.04151483178138733, - "step": 9820 - }, - { - "epoch": 2.3018124833971454, - "grad_norm": 0.07718763500452042, - "learning_rate": 1.3147869232493698e-05, - "loss": 0.05485020279884338, - "step": 9840 - }, - { - "epoch": 2.306491746705628, - "grad_norm": 1.2894233465194702, - "learning_rate": 1.305554245879079e-05, - "loss": 0.029610657691955568, - "step": 9860 - }, - { - "epoch": 2.3111710100141107, - "grad_norm": 0.07070274651050568, - "learning_rate": 1.296342629954439e-05, - "loss": 0.016826416552066802, - "step": 9880 - }, - { - "epoch": 2.3158502733225936, - "grad_norm": 0.023227320984005928, - "learning_rate": 1.2871522379022038e-05, - "loss": 0.07382934093475342, - "step": 9900 - }, - { - "epoch": 2.3205295366310765, - "grad_norm": 0.773018479347229, - "learning_rate": 1.2779832317748933e-05, - "loss": 0.027895498275756835, - "step": 9920 - }, - { - "epoch": 2.3252087999395594, - "grad_norm": 0.04976315423846245, - "learning_rate": 1.2688357732479303e-05, - "loss": 0.022780410945415497, - "step": 9940 - }, - { - "epoch": 2.3298880632480423, - "grad_norm": 0.39492273330688477, - "learning_rate": 1.2597100236167963e-05, - "loss": 0.019541989266872405, - "step": 9960 - }, - { - "epoch": 2.334567326556525, - "grad_norm": 0.24637845158576965, - "learning_rate": 1.2506061437941804e-05, - "loss": 0.02746429443359375, - "step": 9980 - }, - { - "epoch": 2.339246589865008, - "grad_norm": 4.480206489562988, - "learning_rate": 1.241524294307147e-05, - "loss": 0.023342420160770417, - "step": 10000 - }, - { - "epoch": 2.343925853173491, - "grad_norm": 7.383241653442383, - "learning_rate": 1.232464635294302e-05, - "loss": 0.022446952760219574, - "step": 10020 - }, - { - "epoch": 2.348605116481974, - "grad_norm": 0.07559090107679367, - "learning_rate": 1.2234273265029742e-05, - "loss": 0.05684725046157837, - "step": 10040 - } - ], - "logging_steps": 20, - "max_steps": 14963, - "num_input_tokens_seen": 0, - "num_train_epochs": 4, - "save_steps": 1000000000, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 3.850543361754086e+16, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/slots/7/checkpoint-10040/training_args.bin b/slots/7/checkpoint-10040/training_args.bin deleted file mode 100644 index cba6bf44229020a6cf5d76cffc747dea705142ea..0000000000000000000000000000000000000000 --- a/slots/7/checkpoint-10040/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:66430bba102a8f3dc245713cd6268a99c212c508aacce1d8b9768464f5df26ec -size 5201 diff --git a/slots/7/latest.json b/slots/7/latest.json deleted file mode 100644 index fd4aa2ebdaf8c0bd8f924ee6d154ac3cd42711b3..0000000000000000000000000000000000000000 --- a/slots/7/latest.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "slot:7", "checkpoint": "slots/7/checkpoint-10040", "step": 10040, "updated_at": 1776816121} diff --git a/slots/8/checkpoint-10168/config.json b/slots/8/checkpoint-10168/config.json deleted file mode 100644 index 9e5d8b7224eff16a790758ae86dd97c89afeab74..0000000000000000000000000000000000000000 --- a/slots/8/checkpoint-10168/config.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "architectures": [ - "TwinyForCausalLM" - ], - "attention_dropout": 0.0, - "dtype": "float32", - "hidden_dropout": 0.0, - "hidden_size": 768, - "initializer_range": 0.02, - "intermediate_size": 3072, - "max_position_embeddings": 128, - "model_type": "twiny", - "neftune_alpha": 0.0, - "num_attention_heads": 12, - "num_hidden_layers": 3, - "num_key_value_heads": 3, - "qk_norm": true, - "rezero_init": 1.0, - "rms_norm_eps": 1e-06, - "rope_theta": 10000.0, - "transformers_version": "5.0.0", - "use_cache": false, - "vocab_size": 32000 -} diff --git a/slots/8/checkpoint-10168/model.safetensors b/slots/8/checkpoint-10168/model.safetensors deleted file mode 100644 index c336674210912a9e340e03e764af5e0c70937f6e..0000000000000000000000000000000000000000 --- a/slots/8/checkpoint-10168/model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b061c0b93c75dcf9ac84813dba214bd45f1fa5bbd98cd1ad72dfd8f5ad791aea -size 306388092 diff --git a/slots/8/checkpoint-10168/optimizer.pt b/slots/8/checkpoint-10168/optimizer.pt deleted file mode 100644 index d7cf59e24f5a73da0bca16c568e0816a7ec92978..0000000000000000000000000000000000000000 --- a/slots/8/checkpoint-10168/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:76a399b969a51720e3165474665a51bfff0a2919d76185c119dfc4b7bcae963c -size 302484555 diff --git a/slots/8/checkpoint-10168/rng_state.pth b/slots/8/checkpoint-10168/rng_state.pth deleted file mode 100644 index 1feba1a6538e93b94696d3773853dbc8947b0cad..0000000000000000000000000000000000000000 --- a/slots/8/checkpoint-10168/rng_state.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 -size 14645 diff --git a/slots/8/checkpoint-10168/scaler.pt b/slots/8/checkpoint-10168/scaler.pt deleted file mode 100644 index 663dbef12570b63778f31c6463cf24726674ec08..0000000000000000000000000000000000000000 --- a/slots/8/checkpoint-10168/scaler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d16c2033d88a862075fa9989ad7fb40b9a5e405e0d3c96932ea681e75146af8c -size 1383 diff --git a/slots/8/checkpoint-10168/scheduler.pt b/slots/8/checkpoint-10168/scheduler.pt deleted file mode 100644 index 1b7535d89559876a685eee3c78ff75b61eabf662..0000000000000000000000000000000000000000 --- a/slots/8/checkpoint-10168/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a648b5e0660bc921d1fed76002ad8340ab04868a2d607cc9ca15f3caf28bdb13 -size 1465 diff --git a/slots/8/checkpoint-10168/trainer_state.json b/slots/8/checkpoint-10168/trainer_state.json deleted file mode 100644 index e7ae05b9b8ccce8ccae387664dd35fb8780408bd..0000000000000000000000000000000000000000 --- a/slots/8/checkpoint-10168/trainer_state.json +++ /dev/null @@ -1,3597 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 2.3785524016562642, - "eval_steps": 500, - "global_step": 10168, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0001336931908386741, - "grad_norm": Infinity, - "learning_rate": 5e-05, - "loss": 129.80032348632812, - "step": 1 - }, - { - "epoch": 0.002673863816773482, - "grad_norm": 63.64365768432617, - "learning_rate": 4.999995392022967e-05, - "loss": 63.88374408922697, - "step": 20 - }, - { - "epoch": 0.005347727633546964, - "grad_norm": 24.627853393554688, - "learning_rate": 4.999976672145381e-05, - "loss": 12.65963363647461, - "step": 40 - }, - { - "epoch": 0.008021591450320446, - "grad_norm": 14.29983901977539, - "learning_rate": 4.999943552476422e-05, - "loss": 5.90204963684082, - "step": 60 - }, - { - "epoch": 0.010695455267093928, - "grad_norm": 15.690323829650879, - "learning_rate": 4.999896033206858e-05, - "loss": 3.9918922424316405, - "step": 80 - }, - { - "epoch": 0.01336931908386741, - "grad_norm": 31.583160400390625, - "learning_rate": 4.999834114610398e-05, - "loss": 2.9675426483154297, - "step": 100 - }, - { - "epoch": 0.01604318290064089, - "grad_norm": 13.034649848937988, - "learning_rate": 4.999757797043691e-05, - "loss": 2.725296401977539, - "step": 120 - }, - { - "epoch": 0.018717046717414372, - "grad_norm": 8.362203598022461, - "learning_rate": 4.999667080946324e-05, - "loss": 2.2478992462158205, - "step": 140 - }, - { - "epoch": 0.021390910534187856, - "grad_norm": 8.726786613464355, - "learning_rate": 4.999561966840821e-05, - "loss": 1.8447845458984375, - "step": 160 - }, - { - "epoch": 0.024064774350961337, - "grad_norm": 10.092752456665039, - "learning_rate": 4.9994424553326335e-05, - "loss": 1.5611843109130858, - "step": 180 - }, - { - "epoch": 0.02673863816773482, - "grad_norm": 9.090085983276367, - "learning_rate": 4.999308547110146e-05, - "loss": 1.520334815979004, - "step": 200 - }, - { - "epoch": 0.029412501984508302, - "grad_norm": 9.668124198913574, - "learning_rate": 4.999160242944665e-05, - "loss": 1.2818055152893066, - "step": 220 - }, - { - "epoch": 0.03208636580128178, - "grad_norm": 9.182533264160156, - "learning_rate": 4.998997543690418e-05, - "loss": 1.0428407669067383, - "step": 240 - }, - { - "epoch": 0.03476022961805526, - "grad_norm": 5.745838165283203, - "learning_rate": 4.998820450284549e-05, - "loss": 1.2343652725219727, - "step": 260 - }, - { - "epoch": 0.037434093434828744, - "grad_norm": 8.651643753051758, - "learning_rate": 4.99862896374711e-05, - "loss": 0.8859601020812988, - "step": 280 - }, - { - "epoch": 0.04010795725160223, - "grad_norm": 10.765266418457031, - "learning_rate": 4.998423085181056e-05, - "loss": 0.989600658416748, - "step": 300 - }, - { - "epoch": 0.04278182106837571, - "grad_norm": 6.092499256134033, - "learning_rate": 4.998202815772245e-05, - "loss": 0.7189463615417481, - "step": 320 - }, - { - "epoch": 0.04545568488514919, - "grad_norm": 6.352876663208008, - "learning_rate": 4.9979681567894195e-05, - "loss": 0.7489545345306396, - "step": 340 - }, - { - "epoch": 0.048129548701922674, - "grad_norm": 4.620656490325928, - "learning_rate": 4.997719109584209e-05, - "loss": 0.7381401538848877, - "step": 360 - }, - { - "epoch": 0.050803412518696155, - "grad_norm": 7.796917915344238, - "learning_rate": 4.997455675591119e-05, - "loss": 0.5687405109405518, - "step": 380 - }, - { - "epoch": 0.05347727633546964, - "grad_norm": 2.837172508239746, - "learning_rate": 4.9971778563275204e-05, - "loss": 0.5686865329742432, - "step": 400 - }, - { - "epoch": 0.05615114015224312, - "grad_norm": 3.3103690147399902, - "learning_rate": 4.9968856533936436e-05, - "loss": 0.625730562210083, - "step": 420 - }, - { - "epoch": 0.058825003969016604, - "grad_norm": 3.5682132244110107, - "learning_rate": 4.99657906847257e-05, - "loss": 0.6125466346740722, - "step": 440 - }, - { - "epoch": 0.061498867785790085, - "grad_norm": 5.63640832901001, - "learning_rate": 4.996258103330218e-05, - "loss": 0.6182214260101319, - "step": 460 - }, - { - "epoch": 0.06417273160256357, - "grad_norm": 4.698945999145508, - "learning_rate": 4.995922759815339e-05, - "loss": 0.43828091621398924, - "step": 480 - }, - { - "epoch": 0.06684659541933705, - "grad_norm": 2.1976189613342285, - "learning_rate": 4.995573039859501e-05, - "loss": 0.4459230899810791, - "step": 500 - }, - { - "epoch": 0.06952045923611053, - "grad_norm": 3.8809523582458496, - "learning_rate": 4.995208945477081e-05, - "loss": 0.3821882963180542, - "step": 520 - }, - { - "epoch": 0.07219432305288401, - "grad_norm": 3.75144100189209, - "learning_rate": 4.994830478765251e-05, - "loss": 0.5800807476043701, - "step": 540 - }, - { - "epoch": 0.07486818686965749, - "grad_norm": 3.0038585662841797, - "learning_rate": 4.9944376419039684e-05, - "loss": 0.3928264617919922, - "step": 560 - }, - { - "epoch": 0.07754205068643098, - "grad_norm": 3.614591598510742, - "learning_rate": 4.994030437155961e-05, - "loss": 0.48637890815734863, - "step": 580 - }, - { - "epoch": 0.08021591450320446, - "grad_norm": 4.143443584442139, - "learning_rate": 4.993608866866718e-05, - "loss": 0.3650153160095215, - "step": 600 - }, - { - "epoch": 0.08288977831997794, - "grad_norm": 6.692712783813477, - "learning_rate": 4.993172933464471e-05, - "loss": 0.3677916288375854, - "step": 620 - }, - { - "epoch": 0.08556364213675142, - "grad_norm": 8.383441925048828, - "learning_rate": 4.9927226394601815e-05, - "loss": 0.3399480104446411, - "step": 640 - }, - { - "epoch": 0.0882375059535249, - "grad_norm": 5.566338062286377, - "learning_rate": 4.992257987447532e-05, - "loss": 0.28104052543640134, - "step": 660 - }, - { - "epoch": 0.09091136977029839, - "grad_norm": 3.1196420192718506, - "learning_rate": 4.991778980102904e-05, - "loss": 0.351950478553772, - "step": 680 - }, - { - "epoch": 0.09358523358707187, - "grad_norm": 3.47979736328125, - "learning_rate": 4.9912856201853644e-05, - "loss": 0.27501535415649414, - "step": 700 - }, - { - "epoch": 0.09625909740384535, - "grad_norm": 5.446717262268066, - "learning_rate": 4.990777910536653e-05, - "loss": 0.2651593923568726, - "step": 720 - }, - { - "epoch": 0.09893296122061883, - "grad_norm": 7.6145339012146, - "learning_rate": 4.990255854081161e-05, - "loss": 0.35140380859375, - "step": 740 - }, - { - "epoch": 0.10160682503739231, - "grad_norm": 8.445616722106934, - "learning_rate": 4.989719453825918e-05, - "loss": 0.2961219072341919, - "step": 760 - }, - { - "epoch": 0.10428068885416579, - "grad_norm": 6.339537620544434, - "learning_rate": 4.9891687128605744e-05, - "loss": 0.24962289333343507, - "step": 780 - }, - { - "epoch": 0.10695455267093928, - "grad_norm": 3.3369436264038086, - "learning_rate": 4.988603634357383e-05, - "loss": 0.2124847412109375, - "step": 800 - }, - { - "epoch": 0.10962841648771277, - "grad_norm": 2.2909045219421387, - "learning_rate": 4.988024221571177e-05, - "loss": 0.24679112434387207, - "step": 820 - }, - { - "epoch": 0.11230228030448625, - "grad_norm": 3.1149911880493164, - "learning_rate": 4.9874304778393574e-05, - "loss": 0.22161397933959961, - "step": 840 - }, - { - "epoch": 0.11497614412125973, - "grad_norm": 14.802160263061523, - "learning_rate": 4.9868224065818706e-05, - "loss": 0.2623537302017212, - "step": 860 - }, - { - "epoch": 0.11765000793803321, - "grad_norm": 5.586325168609619, - "learning_rate": 4.98620001130119e-05, - "loss": 0.3560942649841309, - "step": 880 - }, - { - "epoch": 0.12032387175480669, - "grad_norm": 3.390017032623291, - "learning_rate": 4.9855632955822916e-05, - "loss": 0.16934787034988402, - "step": 900 - }, - { - "epoch": 0.12299773557158017, - "grad_norm": 6.070940971374512, - "learning_rate": 4.984912263092641e-05, - "loss": 0.2131197214126587, - "step": 920 - }, - { - "epoch": 0.12567159938835365, - "grad_norm": 1.4912281036376953, - "learning_rate": 4.984246917582166e-05, - "loss": 0.25128653049468996, - "step": 940 - }, - { - "epoch": 0.12834546320512713, - "grad_norm": 7.000472545623779, - "learning_rate": 4.9835672628832366e-05, - "loss": 0.2653592586517334, - "step": 960 - }, - { - "epoch": 0.1310193270219006, - "grad_norm": 5.427223205566406, - "learning_rate": 4.9828733029106434e-05, - "loss": 0.1653295636177063, - "step": 980 - }, - { - "epoch": 0.1336931908386741, - "grad_norm": 1.9502102136611938, - "learning_rate": 4.982165041661575e-05, - "loss": 0.2250870943069458, - "step": 1000 - }, - { - "epoch": 0.13636705465544757, - "grad_norm": 0.6216259598731995, - "learning_rate": 4.981442483215595e-05, - "loss": 0.18943849802017212, - "step": 1020 - }, - { - "epoch": 0.13904091847222105, - "grad_norm": 2.3363687992095947, - "learning_rate": 4.98070563173462e-05, - "loss": 0.1673592209815979, - "step": 1040 - }, - { - "epoch": 0.14171478228899453, - "grad_norm": 1.040717601776123, - "learning_rate": 4.979954491462892e-05, - "loss": 0.2113173007965088, - "step": 1060 - }, - { - "epoch": 0.14438864610576801, - "grad_norm": 2.735522747039795, - "learning_rate": 4.979189066726955e-05, - "loss": 0.17504971027374266, - "step": 1080 - }, - { - "epoch": 0.1470625099225415, - "grad_norm": 4.701151371002197, - "learning_rate": 4.978409361935636e-05, - "loss": 0.15881222486495972, - "step": 1100 - }, - { - "epoch": 0.14973637373931498, - "grad_norm": 2.735919237136841, - "learning_rate": 4.9776153815800075e-05, - "loss": 0.14044179916381835, - "step": 1120 - }, - { - "epoch": 0.15241023755608848, - "grad_norm": 3.5479538440704346, - "learning_rate": 4.976807130233375e-05, - "loss": 0.18565714359283447, - "step": 1140 - }, - { - "epoch": 0.15508410137286197, - "grad_norm": 3.2167458534240723, - "learning_rate": 4.975984612551243e-05, - "loss": 0.13236271142959594, - "step": 1160 - }, - { - "epoch": 0.15775796518963545, - "grad_norm": 1.0206760168075562, - "learning_rate": 4.975147833271288e-05, - "loss": 0.19124728441238403, - "step": 1180 - }, - { - "epoch": 0.16043182900640893, - "grad_norm": 4.194457530975342, - "learning_rate": 4.9742967972133335e-05, - "loss": 0.144741427898407, - "step": 1200 - }, - { - "epoch": 0.1631056928231824, - "grad_norm": 3.0225746631622314, - "learning_rate": 4.973431509279323e-05, - "loss": 0.1374324679374695, - "step": 1220 - }, - { - "epoch": 0.1657795566399559, - "grad_norm": 4.243523120880127, - "learning_rate": 4.972551974453287e-05, - "loss": 0.13663809299468993, - "step": 1240 - }, - { - "epoch": 0.16845342045672937, - "grad_norm": 2.4990086555480957, - "learning_rate": 4.971658197801322e-05, - "loss": 0.16817957162857056, - "step": 1260 - }, - { - "epoch": 0.17112728427350285, - "grad_norm": 4.983982563018799, - "learning_rate": 4.9707501844715554e-05, - "loss": 0.13795313835144044, - "step": 1280 - }, - { - "epoch": 0.17380114809027633, - "grad_norm": 3.6780316829681396, - "learning_rate": 4.969827939694115e-05, - "loss": 0.1637880802154541, - "step": 1300 - }, - { - "epoch": 0.1764750119070498, - "grad_norm": 0.7950732707977295, - "learning_rate": 4.968891468781105e-05, - "loss": 0.10979138612747193, - "step": 1320 - }, - { - "epoch": 0.1791488757238233, - "grad_norm": 1.2414121627807617, - "learning_rate": 4.967940777126569e-05, - "loss": 0.13692171573638917, - "step": 1340 - }, - { - "epoch": 0.18182273954059677, - "grad_norm": 2.1383633613586426, - "learning_rate": 4.9669758702064636e-05, - "loss": 0.07821698188781738, - "step": 1360 - }, - { - "epoch": 0.18449660335737025, - "grad_norm": 5.061275959014893, - "learning_rate": 4.965996753578623e-05, - "loss": 0.19053516387939454, - "step": 1380 - }, - { - "epoch": 0.18717046717414373, - "grad_norm": 6.151792049407959, - "learning_rate": 4.9650034328827305e-05, - "loss": 0.11360721588134766, - "step": 1400 - }, - { - "epoch": 0.18984433099091721, - "grad_norm": 1.0604305267333984, - "learning_rate": 4.963995913840284e-05, - "loss": 0.13138024806976317, - "step": 1420 - }, - { - "epoch": 0.1925181948076907, - "grad_norm": 1.7159489393234253, - "learning_rate": 4.9629742022545623e-05, - "loss": 0.08657677173614502, - "step": 1440 - }, - { - "epoch": 0.19519205862446418, - "grad_norm": 2.4207754135131836, - "learning_rate": 4.961938304010595e-05, - "loss": 0.10309149026870727, - "step": 1460 - }, - { - "epoch": 0.19786592244123766, - "grad_norm": 1.532060146331787, - "learning_rate": 4.9608882250751245e-05, - "loss": 0.13628544807434081, - "step": 1480 - }, - { - "epoch": 0.20053978625801114, - "grad_norm": 6.409943580627441, - "learning_rate": 4.959823971496574e-05, - "loss": 0.10584845542907714, - "step": 1500 - }, - { - "epoch": 0.20321365007478462, - "grad_norm": 2.452012538909912, - "learning_rate": 4.9587455494050136e-05, - "loss": 0.06506187915802002, - "step": 1520 - }, - { - "epoch": 0.2058875138915581, - "grad_norm": 5.3016533851623535, - "learning_rate": 4.9576529650121214e-05, - "loss": 0.11848526000976563, - "step": 1540 - }, - { - "epoch": 0.20856137770833158, - "grad_norm": 4.341775894165039, - "learning_rate": 4.956546224611152e-05, - "loss": 0.11318533420562744, - "step": 1560 - }, - { - "epoch": 0.21123524152510506, - "grad_norm": 1.9056169986724854, - "learning_rate": 4.9554253345768965e-05, - "loss": 0.12768398523330687, - "step": 1580 - }, - { - "epoch": 0.21390910534187857, - "grad_norm": 1.8939746618270874, - "learning_rate": 4.9542903013656486e-05, - "loss": 0.10782338380813598, - "step": 1600 - }, - { - "epoch": 0.21658296915865205, - "grad_norm": 8.53671932220459, - "learning_rate": 4.9531411315151654e-05, - "loss": 0.1733921766281128, - "step": 1620 - }, - { - "epoch": 0.21925683297542553, - "grad_norm": 2.0152978897094727, - "learning_rate": 4.951977831644632e-05, - "loss": 0.11197054386138916, - "step": 1640 - }, - { - "epoch": 0.221930696792199, - "grad_norm": 3.8422367572784424, - "learning_rate": 4.95080040845462e-05, - "loss": 0.11441781520843505, - "step": 1660 - }, - { - "epoch": 0.2246045606089725, - "grad_norm": 1.819858193397522, - "learning_rate": 4.949608868727053e-05, - "loss": 0.11403474807739258, - "step": 1680 - }, - { - "epoch": 0.22727842442574597, - "grad_norm": 7.45100212097168, - "learning_rate": 4.948403219325163e-05, - "loss": 0.13117753267288207, - "step": 1700 - }, - { - "epoch": 0.22995228824251945, - "grad_norm": 0.6526040434837341, - "learning_rate": 4.947183467193456e-05, - "loss": 0.07524924874305725, - "step": 1720 - }, - { - "epoch": 0.23262615205929293, - "grad_norm": 3.814746856689453, - "learning_rate": 4.945949619357668e-05, - "loss": 0.07659345269203185, - "step": 1740 - }, - { - "epoch": 0.23530001587606642, - "grad_norm": 2.373124122619629, - "learning_rate": 4.944701682924726e-05, - "loss": 0.1147496223449707, - "step": 1760 - }, - { - "epoch": 0.2379738796928399, - "grad_norm": 0.11161285638809204, - "learning_rate": 4.943439665082707e-05, - "loss": 0.07256829738616943, - "step": 1780 - }, - { - "epoch": 0.24064774350961338, - "grad_norm": 0.45990192890167236, - "learning_rate": 4.942163573100794e-05, - "loss": 0.07726740837097168, - "step": 1800 - }, - { - "epoch": 0.24332160732638686, - "grad_norm": 4.2301926612854, - "learning_rate": 4.940873414329242e-05, - "loss": 0.09349535703659058, - "step": 1820 - }, - { - "epoch": 0.24599547114316034, - "grad_norm": 2.442178726196289, - "learning_rate": 4.939569196199325e-05, - "loss": 0.12413722276687622, - "step": 1840 - }, - { - "epoch": 0.24866933495993382, - "grad_norm": 2.523683786392212, - "learning_rate": 4.938250926223302e-05, - "loss": 0.08566288352012634, - "step": 1860 - }, - { - "epoch": 0.2513431987767073, - "grad_norm": 3.511075258255005, - "learning_rate": 4.936918611994368e-05, - "loss": 0.08007702231407166, - "step": 1880 - }, - { - "epoch": 0.2540170625934808, - "grad_norm": 6.254627704620361, - "learning_rate": 4.935572261186614e-05, - "loss": 0.10983954668045044, - "step": 1900 - }, - { - "epoch": 0.25669092641025426, - "grad_norm": 1.5211899280548096, - "learning_rate": 4.934211881554981e-05, - "loss": 0.09120344519615173, - "step": 1920 - }, - { - "epoch": 0.25936479022702774, - "grad_norm": 2.5893588066101074, - "learning_rate": 4.932837480935214e-05, - "loss": 0.08754412531852722, - "step": 1940 - }, - { - "epoch": 0.2620386540438012, - "grad_norm": 6.878556251525879, - "learning_rate": 4.931449067243821e-05, - "loss": 0.08636274933815002, - "step": 1960 - }, - { - "epoch": 0.2647125178605747, - "grad_norm": 2.9078798294067383, - "learning_rate": 4.9300466484780226e-05, - "loss": 0.09582929015159607, - "step": 1980 - }, - { - "epoch": 0.2673863816773482, - "grad_norm": 3.391852855682373, - "learning_rate": 4.92863023271571e-05, - "loss": 0.0850919783115387, - "step": 2000 - }, - { - "epoch": 0.27006024549412166, - "grad_norm": 5.522103309631348, - "learning_rate": 4.927199828115395e-05, - "loss": 0.050999772548675534, - "step": 2020 - }, - { - "epoch": 0.27273410931089515, - "grad_norm": 0.90350741147995, - "learning_rate": 4.925755442916167e-05, - "loss": 0.10100446939468384, - "step": 2040 - }, - { - "epoch": 0.2754079731276686, - "grad_norm": 1.602030634880066, - "learning_rate": 4.924297085437641e-05, - "loss": 0.0468633770942688, - "step": 2060 - }, - { - "epoch": 0.2780818369444421, - "grad_norm": 1.5823460817337036, - "learning_rate": 4.922824764079913e-05, - "loss": 0.06786358952522278, - "step": 2080 - }, - { - "epoch": 0.2807557007612156, - "grad_norm": 1.6624343395233154, - "learning_rate": 4.92133848732351e-05, - "loss": 0.05772828459739685, - "step": 2100 - }, - { - "epoch": 0.28342956457798907, - "grad_norm": 0.947078287601471, - "learning_rate": 4.9198382637293424e-05, - "loss": 0.08012173175811768, - "step": 2120 - }, - { - "epoch": 0.28610342839476255, - "grad_norm": 0.2919924259185791, - "learning_rate": 4.918324101938653e-05, - "loss": 0.1208539366722107, - "step": 2140 - }, - { - "epoch": 0.28877729221153603, - "grad_norm": 9.258247375488281, - "learning_rate": 4.916796010672969e-05, - "loss": 0.10037034749984741, - "step": 2160 - }, - { - "epoch": 0.2914511560283095, - "grad_norm": 4.0920491218566895, - "learning_rate": 4.915253998734051e-05, - "loss": 0.061488878726959226, - "step": 2180 - }, - { - "epoch": 0.294125019845083, - "grad_norm": 6.1126627922058105, - "learning_rate": 4.913698075003841e-05, - "loss": 0.0862967312335968, - "step": 2200 - }, - { - "epoch": 0.29679888366185647, - "grad_norm": 2.585484743118286, - "learning_rate": 4.912128248444414e-05, - "loss": 0.05393874645233154, - "step": 2220 - }, - { - "epoch": 0.29947274747862995, - "grad_norm": 6.944481372833252, - "learning_rate": 4.9105445280979256e-05, - "loss": 0.08570566773414612, - "step": 2240 - }, - { - "epoch": 0.30214661129540343, - "grad_norm": 1.3824089765548706, - "learning_rate": 4.908946923086556e-05, - "loss": 0.09689127206802368, - "step": 2260 - }, - { - "epoch": 0.30482047511217697, - "grad_norm": 3.4861342906951904, - "learning_rate": 4.907335442612464e-05, - "loss": 0.12550976276397705, - "step": 2280 - }, - { - "epoch": 0.30749433892895045, - "grad_norm": 3.668980121612549, - "learning_rate": 4.905710095957728e-05, - "loss": 0.09089353680610657, - "step": 2300 - }, - { - "epoch": 0.31016820274572393, - "grad_norm": 1.093095064163208, - "learning_rate": 4.904070892484298e-05, - "loss": 0.03925192356109619, - "step": 2320 - }, - { - "epoch": 0.3128420665624974, - "grad_norm": 0.8169485926628113, - "learning_rate": 4.9024178416339364e-05, - "loss": 0.0979581356048584, - "step": 2340 - }, - { - "epoch": 0.3155159303792709, - "grad_norm": 1.892451286315918, - "learning_rate": 4.900750952928166e-05, - "loss": 0.05913209915161133, - "step": 2360 - }, - { - "epoch": 0.3181897941960444, - "grad_norm": 0.24644255638122559, - "learning_rate": 4.8990702359682184e-05, - "loss": 0.06815173625946044, - "step": 2380 - }, - { - "epoch": 0.32086365801281785, - "grad_norm": 2.1861305236816406, - "learning_rate": 4.897375700434972e-05, - "loss": 0.04142785966396332, - "step": 2400 - }, - { - "epoch": 0.32353752182959133, - "grad_norm": 2.6643004417419434, - "learning_rate": 4.8956673560889013e-05, - "loss": 0.05177200436592102, - "step": 2420 - }, - { - "epoch": 0.3262113856463648, - "grad_norm": 2.588113784790039, - "learning_rate": 4.8939452127700195e-05, - "loss": 0.05783546566963196, - "step": 2440 - }, - { - "epoch": 0.3288852494631383, - "grad_norm": 2.419644594192505, - "learning_rate": 4.8922092803978203e-05, - "loss": 0.08906854391098022, - "step": 2460 - }, - { - "epoch": 0.3315591132799118, - "grad_norm": 0.16949939727783203, - "learning_rate": 4.890459568971223e-05, - "loss": 0.10305211544036866, - "step": 2480 - }, - { - "epoch": 0.33423297709668526, - "grad_norm": 0.10032984614372253, - "learning_rate": 4.8886960885685126e-05, - "loss": 0.06348527669906616, - "step": 2500 - }, - { - "epoch": 0.33690684091345874, - "grad_norm": 3.3658738136291504, - "learning_rate": 4.8869188493472854e-05, - "loss": 0.06826075911521912, - "step": 2520 - }, - { - "epoch": 0.3395807047302322, - "grad_norm": 0.8656186461448669, - "learning_rate": 4.885127861544386e-05, - "loss": 0.05929765701293945, - "step": 2540 - }, - { - "epoch": 0.3422545685470057, - "grad_norm": 0.1492065042257309, - "learning_rate": 4.8833231354758496e-05, - "loss": 0.09429731965065002, - "step": 2560 - }, - { - "epoch": 0.3449284323637792, - "grad_norm": 0.6010928153991699, - "learning_rate": 4.881504681536846e-05, - "loss": 0.06262240409851075, - "step": 2580 - }, - { - "epoch": 0.34760229618055266, - "grad_norm": 1.6506450176239014, - "learning_rate": 4.879672510201616e-05, - "loss": 0.061688083410263064, - "step": 2600 - }, - { - "epoch": 0.35027615999732614, - "grad_norm": 0.2703142464160919, - "learning_rate": 4.877826632023412e-05, - "loss": 0.06175137162208557, - "step": 2620 - }, - { - "epoch": 0.3529500238140996, - "grad_norm": 3.1056365966796875, - "learning_rate": 4.875967057634437e-05, - "loss": 0.07828506827354431, - "step": 2640 - }, - { - "epoch": 0.3556238876308731, - "grad_norm": 0.28790283203125, - "learning_rate": 4.874093797745784e-05, - "loss": 0.11355981826782227, - "step": 2660 - }, - { - "epoch": 0.3582977514476466, - "grad_norm": 2.3372068405151367, - "learning_rate": 4.8722068631473746e-05, - "loss": 0.048267871141433716, - "step": 2680 - }, - { - "epoch": 0.36097161526442006, - "grad_norm": 0.12767371535301208, - "learning_rate": 4.8703062647078976e-05, - "loss": 0.04319801032543182, - "step": 2700 - }, - { - "epoch": 0.36364547908119355, - "grad_norm": 0.5145738124847412, - "learning_rate": 4.868392013374741e-05, - "loss": 0.0773090660572052, - "step": 2720 - }, - { - "epoch": 0.366319342897967, - "grad_norm": 0.8518500328063965, - "learning_rate": 4.866464120173937e-05, - "loss": 0.05149460434913635, - "step": 2740 - }, - { - "epoch": 0.3689932067147405, - "grad_norm": 3.6726584434509277, - "learning_rate": 4.8645225962100924e-05, - "loss": 0.06896821856498718, - "step": 2760 - }, - { - "epoch": 0.371667070531514, - "grad_norm": 1.5626497268676758, - "learning_rate": 4.862567452666329e-05, - "loss": 0.047730174660682675, - "step": 2780 - }, - { - "epoch": 0.37434093434828747, - "grad_norm": 6.562028884887695, - "learning_rate": 4.8605987008042144e-05, - "loss": 0.07060698866844177, - "step": 2800 - }, - { - "epoch": 0.37701479816506095, - "grad_norm": 0.7631726861000061, - "learning_rate": 4.8586163519637005e-05, - "loss": 0.04944324493408203, - "step": 2820 - }, - { - "epoch": 0.37968866198183443, - "grad_norm": 1.6982293128967285, - "learning_rate": 4.8566204175630595e-05, - "loss": 0.03000348210334778, - "step": 2840 - }, - { - "epoch": 0.3823625257986079, - "grad_norm": 0.6487429141998291, - "learning_rate": 4.854610909098812e-05, - "loss": 0.06691416501998901, - "step": 2860 - }, - { - "epoch": 0.3850363896153814, - "grad_norm": 0.7648892402648926, - "learning_rate": 4.852587838145668e-05, - "loss": 0.05529783964157105, - "step": 2880 - }, - { - "epoch": 0.38771025343215487, - "grad_norm": 0.11601298302412033, - "learning_rate": 4.850551216356457e-05, - "loss": 0.07780832052230835, - "step": 2900 - }, - { - "epoch": 0.39038411724892835, - "grad_norm": 0.9443137645721436, - "learning_rate": 4.8485010554620594e-05, - "loss": 0.08007023930549621, - "step": 2920 - }, - { - "epoch": 0.39305798106570183, - "grad_norm": 0.8828252553939819, - "learning_rate": 4.846437367271341e-05, - "loss": 0.03541453182697296, - "step": 2940 - }, - { - "epoch": 0.3957318448824753, - "grad_norm": 0.21668888628482819, - "learning_rate": 4.844360163671083e-05, - "loss": 0.08354364633560181, - "step": 2960 - }, - { - "epoch": 0.3984057086992488, - "grad_norm": 0.6840483546257019, - "learning_rate": 4.8422694566259194e-05, - "loss": 0.045807772874832155, - "step": 2980 - }, - { - "epoch": 0.4010795725160223, - "grad_norm": 1.2754698991775513, - "learning_rate": 4.8401652581782584e-05, - "loss": 0.053487342596054074, - "step": 3000 - }, - { - "epoch": 0.40375343633279576, - "grad_norm": 0.19012756645679474, - "learning_rate": 4.838047580448222e-05, - "loss": 0.05881953239440918, - "step": 3020 - }, - { - "epoch": 0.40642730014956924, - "grad_norm": 2.1057698726654053, - "learning_rate": 4.835916435633569e-05, - "loss": 0.031065690517425536, - "step": 3040 - }, - { - "epoch": 0.4091011639663427, - "grad_norm": 4.188559055328369, - "learning_rate": 4.833771836009633e-05, - "loss": 0.07205432653427124, - "step": 3060 - }, - { - "epoch": 0.4117750277831162, - "grad_norm": 6.975829124450684, - "learning_rate": 4.831613793929242e-05, - "loss": 0.04953635036945343, - "step": 3080 - }, - { - "epoch": 0.4144488915998897, - "grad_norm": 4.725269317626953, - "learning_rate": 4.8294423218226546e-05, - "loss": 0.05965519547462463, - "step": 3100 - }, - { - "epoch": 0.41712275541666316, - "grad_norm": 1.7124755382537842, - "learning_rate": 4.827257432197486e-05, - "loss": 0.039625433087348935, - "step": 3120 - }, - { - "epoch": 0.41979661923343664, - "grad_norm": 2.6687324047088623, - "learning_rate": 4.825059137638636e-05, - "loss": 0.05020809769630432, - "step": 3140 - }, - { - "epoch": 0.4224704830502101, - "grad_norm": 1.111640214920044, - "learning_rate": 4.822847450808215e-05, - "loss": 0.04404452443122864, - "step": 3160 - }, - { - "epoch": 0.42514434686698366, - "grad_norm": 0.2128070890903473, - "learning_rate": 4.8206223844454744e-05, - "loss": 0.08283355236053466, - "step": 3180 - }, - { - "epoch": 0.42781821068375714, - "grad_norm": 0.10757248103618622, - "learning_rate": 4.818383951366729e-05, - "loss": 0.08568671345710754, - "step": 3200 - }, - { - "epoch": 0.4304920745005306, - "grad_norm": 0.08344592899084091, - "learning_rate": 4.816132164465289e-05, - "loss": 0.0426956832408905, - "step": 3220 - }, - { - "epoch": 0.4331659383173041, - "grad_norm": 0.5657751560211182, - "learning_rate": 4.813867036711378e-05, - "loss": 0.04971776902675629, - "step": 3240 - }, - { - "epoch": 0.4358398021340776, - "grad_norm": 2.1529288291931152, - "learning_rate": 4.8115885811520654e-05, - "loss": 0.025386181473731995, - "step": 3260 - }, - { - "epoch": 0.43851366595085106, - "grad_norm": 4.228519916534424, - "learning_rate": 4.809296810911188e-05, - "loss": 0.06401395201683044, - "step": 3280 - }, - { - "epoch": 0.44118752976762454, - "grad_norm": 6.770420551300049, - "learning_rate": 4.806991739189274e-05, - "loss": 0.16425553560256959, - "step": 3300 - }, - { - "epoch": 0.443861393584398, - "grad_norm": 0.5303187370300293, - "learning_rate": 4.804673379263467e-05, - "loss": 0.045900467038154605, - "step": 3320 - }, - { - "epoch": 0.4465352574011715, - "grad_norm": 0.221473827958107, - "learning_rate": 4.802341744487453e-05, - "loss": 0.07529735565185547, - "step": 3340 - }, - { - "epoch": 0.449209121217945, - "grad_norm": 3.48736834526062, - "learning_rate": 4.799996848291378e-05, - "loss": 0.062433135509490964, - "step": 3360 - }, - { - "epoch": 0.45188298503471847, - "grad_norm": 2.650038242340088, - "learning_rate": 4.797638704181774e-05, - "loss": 0.03762982189655304, - "step": 3380 - }, - { - "epoch": 0.45455684885149195, - "grad_norm": 3.159665584564209, - "learning_rate": 4.795267325741483e-05, - "loss": 0.04745924174785614, - "step": 3400 - }, - { - "epoch": 0.4572307126682654, - "grad_norm": 0.8763885498046875, - "learning_rate": 4.7928827266295715e-05, - "loss": 0.07380253076553345, - "step": 3420 - }, - { - "epoch": 0.4599045764850389, - "grad_norm": 0.1779366433620453, - "learning_rate": 4.790484920581262e-05, - "loss": 0.045916372537612916, - "step": 3440 - }, - { - "epoch": 0.4625784403018124, - "grad_norm": 1.1228729486465454, - "learning_rate": 4.7880739214078454e-05, - "loss": 0.04461723566055298, - "step": 3460 - }, - { - "epoch": 0.46525230411858587, - "grad_norm": 0.1629919707775116, - "learning_rate": 4.785649742996605e-05, - "loss": 0.017159442603588104, - "step": 3480 - }, - { - "epoch": 0.46792616793535935, - "grad_norm": 3.583951473236084, - "learning_rate": 4.783212399310737e-05, - "loss": 0.047145146131515506, - "step": 3500 - }, - { - "epoch": 0.47060003175213283, - "grad_norm": 0.9766237139701843, - "learning_rate": 4.780761904389267e-05, - "loss": 0.050229442119598386, - "step": 3520 - }, - { - "epoch": 0.4732738955689063, - "grad_norm": 0.05617872253060341, - "learning_rate": 4.778298272346976e-05, - "loss": 0.024862812459468843, - "step": 3540 - }, - { - "epoch": 0.4759477593856798, - "grad_norm": 1.3586453199386597, - "learning_rate": 4.775821517374308e-05, - "loss": 0.02117772251367569, - "step": 3560 - }, - { - "epoch": 0.4786216232024533, - "grad_norm": 1.2116742134094238, - "learning_rate": 4.7733316537373006e-05, - "loss": 0.03060794174671173, - "step": 3580 - }, - { - "epoch": 0.48129548701922675, - "grad_norm": 0.39403238892555237, - "learning_rate": 4.770828695777493e-05, - "loss": 0.05482668280601501, - "step": 3600 - }, - { - "epoch": 0.48396935083600023, - "grad_norm": 0.9248486161231995, - "learning_rate": 4.7683126579118495e-05, - "loss": 0.03612814247608185, - "step": 3620 - }, - { - "epoch": 0.4866432146527737, - "grad_norm": 0.1624649167060852, - "learning_rate": 4.7657835546326736e-05, - "loss": 0.04334873259067536, - "step": 3640 - }, - { - "epoch": 0.4893170784695472, - "grad_norm": 0.5321119427680969, - "learning_rate": 4.763241400507524e-05, - "loss": 0.0461233913898468, - "step": 3660 - }, - { - "epoch": 0.4919909422863207, - "grad_norm": 0.34861093759536743, - "learning_rate": 4.760686210179133e-05, - "loss": 0.024829554557800292, - "step": 3680 - }, - { - "epoch": 0.49466480610309416, - "grad_norm": 1.2561241388320923, - "learning_rate": 4.758117998365322e-05, - "loss": 0.03157005608081818, - "step": 3700 - }, - { - "epoch": 0.49733866991986764, - "grad_norm": 0.8691341280937195, - "learning_rate": 4.7555367798589146e-05, - "loss": 0.04310203492641449, - "step": 3720 - }, - { - "epoch": 0.5000125337366411, - "grad_norm": 0.3134572505950928, - "learning_rate": 4.752942569527653e-05, - "loss": 0.03796039223670959, - "step": 3740 - }, - { - "epoch": 0.5026863975534146, - "grad_norm": 2.3359289169311523, - "learning_rate": 4.75033538231411e-05, - "loss": 0.055599170923233035, - "step": 3760 - }, - { - "epoch": 0.5053602613701881, - "grad_norm": 7.426175594329834, - "learning_rate": 4.747715233235608e-05, - "loss": 0.054436272382736205, - "step": 3780 - }, - { - "epoch": 0.5080341251869616, - "grad_norm": 0.5940203070640564, - "learning_rate": 4.745082137384128e-05, - "loss": 0.03682814538478851, - "step": 3800 - }, - { - "epoch": 0.510707989003735, - "grad_norm": 0.22821389138698578, - "learning_rate": 4.7424361099262225e-05, - "loss": 0.051123309135437014, - "step": 3820 - }, - { - "epoch": 0.5133818528205085, - "grad_norm": 8.20633602142334, - "learning_rate": 4.739777166102932e-05, - "loss": 0.0704378604888916, - "step": 3840 - }, - { - "epoch": 0.516055716637282, - "grad_norm": 3.023848533630371, - "learning_rate": 4.737105321229694e-05, - "loss": 0.03368058800697327, - "step": 3860 - }, - { - "epoch": 0.5187295804540555, - "grad_norm": 0.07666649669408798, - "learning_rate": 4.7344205906962555e-05, - "loss": 0.03665303289890289, - "step": 3880 - }, - { - "epoch": 0.521403444270829, - "grad_norm": 0.7571629881858826, - "learning_rate": 4.731722989966585e-05, - "loss": 0.058415502309799194, - "step": 3900 - }, - { - "epoch": 0.5240773080876024, - "grad_norm": 3.2599120140075684, - "learning_rate": 4.7290125345787816e-05, - "loss": 0.07323018908500671, - "step": 3920 - }, - { - "epoch": 0.5267511719043759, - "grad_norm": 0.28930988907814026, - "learning_rate": 4.7262892401449886e-05, - "loss": 0.054371267557144165, - "step": 3940 - }, - { - "epoch": 0.5294250357211494, - "grad_norm": 2.2296454906463623, - "learning_rate": 4.7235531223513004e-05, - "loss": 0.040819621086120604, - "step": 3960 - }, - { - "epoch": 0.5320988995379229, - "grad_norm": 0.11608211696147919, - "learning_rate": 4.720804196957675e-05, - "loss": 0.05215579271316528, - "step": 3980 - }, - { - "epoch": 0.5347727633546964, - "grad_norm": 1.1587547063827515, - "learning_rate": 4.7180424797978415e-05, - "loss": 0.026277875900268553, - "step": 4000 - }, - { - "epoch": 0.5374466271714698, - "grad_norm": 0.06253435462713242, - "learning_rate": 4.7152679867792074e-05, - "loss": 0.02574407756328583, - "step": 4020 - }, - { - "epoch": 0.5401204909882433, - "grad_norm": 1.3441458940505981, - "learning_rate": 4.71248073388277e-05, - "loss": 0.05538107752799988, - "step": 4040 - }, - { - "epoch": 0.5427943548050168, - "grad_norm": 0.48076340556144714, - "learning_rate": 4.7096807371630236e-05, - "loss": 0.047986540198326114, - "step": 4060 - }, - { - "epoch": 0.5454682186217903, - "grad_norm": 0.5924936532974243, - "learning_rate": 4.706868012747867e-05, - "loss": 0.05463914275169372, - "step": 4080 - }, - { - "epoch": 0.7673995566395854, - "grad_norm": 0.05143728107213974, - "learning_rate": 4.431151627307268e-05, - "loss": 0.00959376593430837, - "step": 4100 - }, - { - "epoch": 0.771142969110998, - "grad_norm": 1.2308074235916138, - "learning_rate": 4.425806509248848e-05, - "loss": 0.002745623141527176, - "step": 4120 - }, - { - "epoch": 0.7748863815824106, - "grad_norm": 2.080223798751831, - "learning_rate": 4.420439652052499e-05, - "loss": 0.012390998750925064, - "step": 4140 - }, - { - "epoch": 0.7786297940538233, - "grad_norm": 0.049312230199575424, - "learning_rate": 4.415051116301072e-05, - "loss": 0.004607534408569336, - "step": 4160 - }, - { - "epoch": 0.7823732065252359, - "grad_norm": 0.07747476547956467, - "learning_rate": 4.409640962822132e-05, - "loss": 0.034441503882408145, - "step": 4180 - }, - { - "epoch": 0.7861166189966485, - "grad_norm": 0.021327875554561615, - "learning_rate": 4.404209252687275e-05, - "loss": 0.009768449515104295, - "step": 4200 - }, - { - "epoch": 0.789860031468061, - "grad_norm": 2.406580924987793, - "learning_rate": 4.398756047211431e-05, - "loss": 0.005304037779569626, - "step": 4220 - }, - { - "epoch": 0.7936034439394737, - "grad_norm": 0.027869906276464462, - "learning_rate": 4.39328140795218e-05, - "loss": 0.00896073654294014, - "step": 4240 - }, - { - "epoch": 0.7973468564108863, - "grad_norm": 0.09702044725418091, - "learning_rate": 4.387785396709052e-05, - "loss": 0.0117533378303051, - "step": 4260 - }, - { - "epoch": 0.801090268882299, - "grad_norm": 0.529065728187561, - "learning_rate": 4.382268075522831e-05, - "loss": 0.0037526611238718035, - "step": 4280 - }, - { - "epoch": 0.8048336813537116, - "grad_norm": 0.015109462663531303, - "learning_rate": 4.3767295066748564e-05, - "loss": 0.0025708725675940513, - "step": 4300 - }, - { - "epoch": 0.8085770938251241, - "grad_norm": 0.7257627248764038, - "learning_rate": 4.371169752686316e-05, - "loss": 0.006234285607933998, - "step": 4320 - }, - { - "epoch": 0.8123205062965367, - "grad_norm": 0.016853008419275284, - "learning_rate": 4.3655888763175436e-05, - "loss": 0.0023587727919220924, - "step": 4340 - }, - { - "epoch": 0.8160639187679494, - "grad_norm": 0.017816167324781418, - "learning_rate": 4.3599869405673085e-05, - "loss": 0.0012389549054205417, - "step": 4360 - }, - { - "epoch": 0.819807331239362, - "grad_norm": 0.014672616496682167, - "learning_rate": 4.354364008672106e-05, - "loss": 0.002244691364467144, - "step": 4380 - }, - { - "epoch": 0.8235507437107746, - "grad_norm": 0.044869400560855865, - "learning_rate": 4.3487201441054435e-05, - "loss": 0.007713723182678223, - "step": 4400 - }, - { - "epoch": 0.8272941561821872, - "grad_norm": 0.06367291510105133, - "learning_rate": 4.343055410577122e-05, - "loss": 0.005743256583809852, - "step": 4420 - }, - { - "epoch": 0.8310375686535998, - "grad_norm": 0.1354215145111084, - "learning_rate": 4.3373698720325176e-05, - "loss": 0.009635470807552338, - "step": 4440 - }, - { - "epoch": 0.8347809811250124, - "grad_norm": 0.9089844822883606, - "learning_rate": 4.331663592651862e-05, - "loss": 0.01007603257894516, - "step": 4460 - }, - { - "epoch": 0.838524393596425, - "grad_norm": 0.025831619277596474, - "learning_rate": 4.3259366368495167e-05, - "loss": 0.006179215386509895, - "step": 4480 - }, - { - "epoch": 0.8422678060678377, - "grad_norm": 0.016653764992952347, - "learning_rate": 4.320189069273243e-05, - "loss": 0.0025156451389193534, - "step": 4500 - }, - { - "epoch": 0.8460112185392502, - "grad_norm": 0.27361780405044556, - "learning_rate": 4.3144209548034766e-05, - "loss": 0.002235286869108677, - "step": 4520 - }, - { - "epoch": 0.8497546310106628, - "grad_norm": 2.6958701610565186, - "learning_rate": 4.3086323585525915e-05, - "loss": 0.03571180701255798, - "step": 4540 - }, - { - "epoch": 0.8534980434820755, - "grad_norm": 0.1260778158903122, - "learning_rate": 4.3028233458641696e-05, - "loss": 0.0036518506705760954, - "step": 4560 - }, - { - "epoch": 0.8572414559534881, - "grad_norm": 0.2445528209209442, - "learning_rate": 4.2969939823122586e-05, - "loss": 0.024949796497821808, - "step": 4580 - }, - { - "epoch": 0.8609848684249007, - "grad_norm": 0.1674242913722992, - "learning_rate": 4.291144333700633e-05, - "loss": 0.002089798077940941, - "step": 4600 - }, - { - "epoch": 0.8647282808963134, - "grad_norm": 0.05161884427070618, - "learning_rate": 4.2852744660620515e-05, - "loss": 0.007847145944833756, - "step": 4620 - }, - { - "epoch": 0.8684716933677259, - "grad_norm": 0.019796324893832207, - "learning_rate": 4.279384445657514e-05, - "loss": 0.0023555334657430647, - "step": 4640 - }, - { - "epoch": 0.8722151058391385, - "grad_norm": 0.0647754967212677, - "learning_rate": 4.2734743389755096e-05, - "loss": 0.009586349129676819, - "step": 4660 - }, - { - "epoch": 0.8759585183105512, - "grad_norm": 0.015243460424244404, - "learning_rate": 4.267544212731268e-05, - "loss": 0.017788709700107576, - "step": 4680 - }, - { - "epoch": 0.8797019307819638, - "grad_norm": 0.05756703019142151, - "learning_rate": 4.261594133866007e-05, - "loss": 0.014256520569324494, - "step": 4700 - }, - { - "epoch": 0.8834453432533764, - "grad_norm": 0.2002931535243988, - "learning_rate": 4.255624169546175e-05, - "loss": 0.0014025470241904258, - "step": 4720 - }, - { - "epoch": 0.887188755724789, - "grad_norm": 0.04325389489531517, - "learning_rate": 4.249634387162696e-05, - "loss": 0.010552891343832017, - "step": 4740 - }, - { - "epoch": 0.8909321681962016, - "grad_norm": 0.8975178599357605, - "learning_rate": 4.243624854330206e-05, - "loss": 0.0032475266605615618, - "step": 4760 - }, - { - "epoch": 0.8946755806676142, - "grad_norm": 0.01541830413043499, - "learning_rate": 4.237595638886288e-05, - "loss": 0.003157203644514084, - "step": 4780 - }, - { - "epoch": 0.8984189931390268, - "grad_norm": 1.673305869102478, - "learning_rate": 4.231546808890713e-05, - "loss": 0.0028239911422133445, - "step": 4800 - }, - { - "epoch": 0.9021624056104395, - "grad_norm": 0.021689629182219505, - "learning_rate": 4.225478432624665e-05, - "loss": 0.0026885712519288062, - "step": 4820 - }, - { - "epoch": 0.905905818081852, - "grad_norm": 0.019590798765420914, - "learning_rate": 4.219390578589973e-05, - "loss": 0.00780024379491806, - "step": 4840 - }, - { - "epoch": 0.9096492305532646, - "grad_norm": 0.024581020697951317, - "learning_rate": 4.213283315508337e-05, - "loss": 0.006697511672973633, - "step": 4860 - }, - { - "epoch": 0.9133926430246773, - "grad_norm": 0.20615583658218384, - "learning_rate": 4.207156712320555e-05, - "loss": 0.007314208894968033, - "step": 4880 - }, - { - "epoch": 0.9171360554960899, - "grad_norm": 0.015673745423555374, - "learning_rate": 4.20101083818574e-05, - "loss": 0.004841562733054161, - "step": 4900 - }, - { - "epoch": 0.9208794679675025, - "grad_norm": 0.008306623436510563, - "learning_rate": 4.194845762480544e-05, - "loss": 0.0010150263085961341, - "step": 4920 - }, - { - "epoch": 0.9246228804389152, - "grad_norm": 0.051861703395843506, - "learning_rate": 4.188661554798369e-05, - "loss": 0.011043114960193634, - "step": 4940 - }, - { - "epoch": 0.9283662929103277, - "grad_norm": 1.7019767761230469, - "learning_rate": 4.1824582849485884e-05, - "loss": 0.004985674470663071, - "step": 4960 - }, - { - "epoch": 0.9321097053817403, - "grad_norm": 0.021240154281258583, - "learning_rate": 4.176236022955755e-05, - "loss": 0.04885836541652679, - "step": 4980 - }, - { - "epoch": 0.935853117853153, - "grad_norm": 0.016504865139722824, - "learning_rate": 4.16999483905881e-05, - "loss": 0.0027378931641578673, - "step": 5000 - }, - { - "epoch": 0.9395965303245656, - "grad_norm": 0.014015628024935722, - "learning_rate": 4.163734803710294e-05, - "loss": 0.012781022489070893, - "step": 5020 - }, - { - "epoch": 0.9433399427959782, - "grad_norm": 0.013812500052154064, - "learning_rate": 4.157455987575545e-05, - "loss": 0.007508871704339981, - "step": 5040 - }, - { - "epoch": 0.9470833552673907, - "grad_norm": 0.01622290164232254, - "learning_rate": 4.1511584615319075e-05, - "loss": 0.0014614147134125234, - "step": 5060 - }, - { - "epoch": 0.9508267677388034, - "grad_norm": 0.01259149145334959, - "learning_rate": 4.144842296667929e-05, - "loss": 0.006202424317598343, - "step": 5080 - }, - { - "epoch": 0.954570180210216, - "grad_norm": 0.012383027002215385, - "learning_rate": 4.138507564282558e-05, - "loss": 0.006122353300452232, - "step": 5100 - }, - { - "epoch": 0.9583135926816286, - "grad_norm": 0.006499920971691608, - "learning_rate": 4.1321543358843385e-05, - "loss": 0.0008865024894475937, - "step": 5120 - }, - { - "epoch": 0.9620570051530413, - "grad_norm": 0.00830752868205309, - "learning_rate": 4.125782683190606e-05, - "loss": 0.0008420860394835472, - "step": 5140 - }, - { - "epoch": 0.9658004176244538, - "grad_norm": 0.01525857299566269, - "learning_rate": 4.119392678126673e-05, - "loss": 0.00587364137172699, - "step": 5160 - }, - { - "epoch": 0.9695438300958664, - "grad_norm": 0.01072095800191164, - "learning_rate": 4.11298439282502e-05, - "loss": 0.00853007659316063, - "step": 5180 - }, - { - "epoch": 0.973287242567279, - "grad_norm": 0.030316641554236412, - "learning_rate": 4.106557899624482e-05, - "loss": 0.0058747071772813795, - "step": 5200 - }, - { - "epoch": 0.9770306550386917, - "grad_norm": 0.0391647033393383, - "learning_rate": 4.1001132710694304e-05, - "loss": 0.0034765828400850295, - "step": 5220 - }, - { - "epoch": 0.9807740675101043, - "grad_norm": 0.04938298836350441, - "learning_rate": 4.093650579908953e-05, - "loss": 0.007594724744558334, - "step": 5240 - }, - { - "epoch": 0.984517479981517, - "grad_norm": 0.005873252172023058, - "learning_rate": 4.087169899096037e-05, - "loss": 0.013347607851028443, - "step": 5260 - }, - { - "epoch": 0.9882608924529295, - "grad_norm": 1.2757259607315063, - "learning_rate": 4.080671301786741e-05, - "loss": 0.004837355017662049, - "step": 5280 - }, - { - "epoch": 0.9920043049243421, - "grad_norm": 0.00920735765248537, - "learning_rate": 4.0741548613393675e-05, - "loss": 0.007415445148944854, - "step": 5300 - }, - { - "epoch": 0.9957477173957547, - "grad_norm": 0.5702093839645386, - "learning_rate": 4.067620651313647e-05, - "loss": 0.00406576506793499, - "step": 5320 - }, - { - "epoch": 0.9994911298671674, - "grad_norm": 1.8361051082611084, - "learning_rate": 4.0610687454698906e-05, - "loss": 0.00997612327337265, - "step": 5340 - }, - { - "epoch": 1.0031819006007008, - "grad_norm": 3.335326910018921, - "learning_rate": 4.0544992177681685e-05, - "loss": 0.008442799001932145, - "step": 5360 - }, - { - "epoch": 1.0069253130721134, - "grad_norm": 0.03184954449534416, - "learning_rate": 4.047912142367473e-05, - "loss": 0.008095134049654007, - "step": 5380 - }, - { - "epoch": 1.010668725543526, - "grad_norm": 0.029989074915647507, - "learning_rate": 4.04130759362488e-05, - "loss": 0.0012585990130901336, - "step": 5400 - }, - { - "epoch": 1.0144121380149385, - "grad_norm": 0.08727464079856873, - "learning_rate": 4.034685646094711e-05, - "loss": 0.012588074803352356, - "step": 5420 - }, - { - "epoch": 1.018155550486351, - "grad_norm": 0.018498806282877922, - "learning_rate": 4.028046374527689e-05, - "loss": 0.001854238100349903, - "step": 5440 - }, - { - "epoch": 1.0218989629577637, - "grad_norm": 0.013779236935079098, - "learning_rate": 4.021389853870095e-05, - "loss": 0.0008004569448530674, - "step": 5460 - }, - { - "epoch": 1.0256423754291764, - "grad_norm": 0.028235070407390594, - "learning_rate": 4.0147161592629306e-05, - "loss": 0.002274145185947418, - "step": 5480 - }, - { - "epoch": 1.029385787900589, - "grad_norm": 0.023030120879411697, - "learning_rate": 4.008025366041055e-05, - "loss": 0.008717305958271027, - "step": 5500 - }, - { - "epoch": 1.0331292003720016, - "grad_norm": 0.018347155302762985, - "learning_rate": 4.001317549732345e-05, - "loss": 0.00244256854057312, - "step": 5520 - }, - { - "epoch": 1.0368726128434143, - "grad_norm": 0.03449391946196556, - "learning_rate": 3.99459278605684e-05, - "loss": 0.0039924226701259615, - "step": 5540 - }, - { - "epoch": 1.0406160253148269, - "grad_norm": 0.030406463891267776, - "learning_rate": 3.9878511509258866e-05, - "loss": 0.0021008485928177834, - "step": 5560 - }, - { - "epoch": 1.0443594377862395, - "grad_norm": 0.01783100888133049, - "learning_rate": 3.9810927204412803e-05, - "loss": 0.0006656501442193985, - "step": 5580 - }, - { - "epoch": 1.0481028502576522, - "grad_norm": 0.05360455811023712, - "learning_rate": 3.974317570894413e-05, - "loss": 0.005278818309307098, - "step": 5600 - }, - { - "epoch": 1.0518462627290646, - "grad_norm": 0.008699169382452965, - "learning_rate": 3.9675257787654e-05, - "loss": 0.005309444293379784, - "step": 5620 - }, - { - "epoch": 1.0555896752004772, - "grad_norm": 0.036641959100961685, - "learning_rate": 3.960717420722227e-05, - "loss": 0.0034692320972681046, - "step": 5640 - }, - { - "epoch": 1.0593330876718898, - "grad_norm": 0.012212110683321953, - "learning_rate": 3.953892573619883e-05, - "loss": 0.005343861132860184, - "step": 5660 - }, - { - "epoch": 1.0630765001433025, - "grad_norm": 0.011296284385025501, - "learning_rate": 3.947051314499489e-05, - "loss": 0.0038058970123529432, - "step": 5680 - }, - { - "epoch": 1.066819912614715, - "grad_norm": 0.05954049900174141, - "learning_rate": 3.94019372058743e-05, - "loss": 0.008142991364002228, - "step": 5700 - }, - { - "epoch": 1.0705633250861277, - "grad_norm": 0.03478416055440903, - "learning_rate": 3.933319869294483e-05, - "loss": 0.0075227849185466765, - "step": 5720 - }, - { - "epoch": 1.0743067375575404, - "grad_norm": 0.014586996287107468, - "learning_rate": 3.9264298382149455e-05, - "loss": 0.0036750122904777526, - "step": 5740 - }, - { - "epoch": 1.078050150028953, - "grad_norm": 0.025754544883966446, - "learning_rate": 3.919523705125757e-05, - "loss": 0.004151013493537903, - "step": 5760 - }, - { - "epoch": 1.0817935625003656, - "grad_norm": 0.03239905461668968, - "learning_rate": 3.9126015479856205e-05, - "loss": 0.00861695185303688, - "step": 5780 - }, - { - "epoch": 1.0855369749717783, - "grad_norm": 0.03506994619965553, - "learning_rate": 3.9056634449341256e-05, - "loss": 0.003123755753040314, - "step": 5800 - }, - { - "epoch": 1.089280387443191, - "grad_norm": 0.0286911278963089, - "learning_rate": 3.898709474290864e-05, - "loss": 0.002537376619875431, - "step": 5820 - }, - { - "epoch": 1.0930237999146033, - "grad_norm": 0.03490692004561424, - "learning_rate": 3.8917397145545454e-05, - "loss": 0.0010227372869849205, - "step": 5840 - }, - { - "epoch": 1.096767212386016, - "grad_norm": 0.013748899102210999, - "learning_rate": 3.884754244402113e-05, - "loss": 0.011847371608018875, - "step": 5860 - }, - { - "epoch": 1.1005106248574286, - "grad_norm": 0.035458195954561234, - "learning_rate": 3.877753142687852e-05, - "loss": 0.009741749614477158, - "step": 5880 - }, - { - "epoch": 1.1042540373288412, - "grad_norm": 0.012493673712015152, - "learning_rate": 3.8707364884425064e-05, - "loss": 0.006607493013143539, - "step": 5900 - }, - { - "epoch": 1.1079974498002538, - "grad_norm": 0.018607834354043007, - "learning_rate": 3.863704360872378e-05, - "loss": 0.0016217166557908058, - "step": 5920 - }, - { - "epoch": 1.1117408622716665, - "grad_norm": 0.0283930953592062, - "learning_rate": 3.8566568393584366e-05, - "loss": 0.002083975449204445, - "step": 5940 - }, - { - "epoch": 1.115484274743079, - "grad_norm": 0.05229801684617996, - "learning_rate": 3.8495940034554283e-05, - "loss": 0.0014217685908079146, - "step": 5960 - }, - { - "epoch": 1.1192276872144917, - "grad_norm": 0.008808930404484272, - "learning_rate": 3.8425159328909684e-05, - "loss": 0.0022570645436644555, - "step": 5980 - }, - { - "epoch": 1.1229710996859044, - "grad_norm": 0.020502232015132904, - "learning_rate": 3.835422707564648e-05, - "loss": 0.003745942190289497, - "step": 6000 - }, - { - "epoch": 1.126714512157317, - "grad_norm": 0.032347094267606735, - "learning_rate": 3.82831440754713e-05, - "loss": 0.003347185626626015, - "step": 6020 - }, - { - "epoch": 1.1304579246287294, - "grad_norm": 0.020310478284955025, - "learning_rate": 3.821191113079246e-05, - "loss": 0.006166417896747589, - "step": 6040 - }, - { - "epoch": 1.134201337100142, - "grad_norm": 0.06390372663736343, - "learning_rate": 3.8140529045710876e-05, - "loss": 0.0013674044981598853, - "step": 6060 - }, - { - "epoch": 1.1379447495715547, - "grad_norm": 1.1938918828964233, - "learning_rate": 3.806899862601105e-05, - "loss": 0.010550644248723984, - "step": 6080 - }, - { - "epoch": 1.1416881620429673, - "grad_norm": 0.035355549305677414, - "learning_rate": 3.799732067915189e-05, - "loss": 0.0069750770926475525, - "step": 6100 - }, - { - "epoch": 1.14543157451438, - "grad_norm": 0.009921093471348286, - "learning_rate": 3.792549601425767e-05, - "loss": 0.0027949588373303415, - "step": 6120 - }, - { - "epoch": 1.1491749869857926, - "grad_norm": 0.06172063946723938, - "learning_rate": 3.785352544210884e-05, - "loss": 0.0009372101165354251, - "step": 6140 - }, - { - "epoch": 1.1529183994572052, - "grad_norm": 0.008572470396757126, - "learning_rate": 3.778140977513294e-05, - "loss": 0.0029502738267183303, - "step": 6160 - }, - { - "epoch": 1.1566618119286178, - "grad_norm": 0.4211727976799011, - "learning_rate": 3.770914982739534e-05, - "loss": 0.014692296087741853, - "step": 6180 - }, - { - "epoch": 1.1604052244000305, - "grad_norm": 0.02292146533727646, - "learning_rate": 3.7636746414590126e-05, - "loss": 0.0020170681178569793, - "step": 6200 - }, - { - "epoch": 1.164148636871443, - "grad_norm": 0.11247449368238449, - "learning_rate": 3.756420035403086e-05, - "loss": 0.006851900368928909, - "step": 6220 - }, - { - "epoch": 1.1678920493428557, - "grad_norm": 0.020755017176270485, - "learning_rate": 3.749151246464137e-05, - "loss": 0.0021739909425377846, - "step": 6240 - }, - { - "epoch": 1.1716354618142684, - "grad_norm": 0.017202025279402733, - "learning_rate": 3.741868356694647e-05, - "loss": 0.002353278361260891, - "step": 6260 - }, - { - "epoch": 1.1753788742856808, - "grad_norm": 0.014947429299354553, - "learning_rate": 3.734571448306274e-05, - "loss": 0.0010860362090170383, - "step": 6280 - }, - { - "epoch": 1.1791222867570934, - "grad_norm": 1.5391262769699097, - "learning_rate": 3.727260603668922e-05, - "loss": 0.01233254000544548, - "step": 6300 - }, - { - "epoch": 1.182865699228506, - "grad_norm": 0.4759792387485504, - "learning_rate": 3.7199359053098133e-05, - "loss": 0.0028501398861408233, - "step": 6320 - }, - { - "epoch": 1.1866091116999187, - "grad_norm": 0.01719040609896183, - "learning_rate": 3.7125974359125536e-05, - "loss": 0.00934450700879097, - "step": 6340 - }, - { - "epoch": 1.1903525241713313, - "grad_norm": 2.4766688346862793, - "learning_rate": 3.7052452783162015e-05, - "loss": 0.018582724034786224, - "step": 6360 - }, - { - "epoch": 1.194095936642744, - "grad_norm": 0.11404932290315628, - "learning_rate": 3.6978795155143326e-05, - "loss": 0.01815672367811203, - "step": 6380 - }, - { - "epoch": 1.1978393491141566, - "grad_norm": 0.021365633234381676, - "learning_rate": 3.690500230654103e-05, - "loss": 0.004123781993985176, - "step": 6400 - }, - { - "epoch": 1.2015827615855692, - "grad_norm": 0.022478772327303886, - "learning_rate": 3.68310750703531e-05, - "loss": 0.0038731731474399567, - "step": 6420 - }, - { - "epoch": 1.2053261740569818, - "grad_norm": 0.15531578660011292, - "learning_rate": 3.67570142810945e-05, - "loss": 0.002076444961130619, - "step": 6440 - }, - { - "epoch": 1.2090695865283942, - "grad_norm": 0.012458150275051594, - "learning_rate": 3.668282077478783e-05, - "loss": 0.0027592860162258146, - "step": 6460 - }, - { - "epoch": 1.2128129989998069, - "grad_norm": 0.01572798565030098, - "learning_rate": 3.66084953889538e-05, - "loss": 0.002740098722279072, - "step": 6480 - }, - { - "epoch": 1.2165564114712195, - "grad_norm": 0.13682503998279572, - "learning_rate": 3.6534038962601835e-05, - "loss": 0.000705425813794136, - "step": 6500 - }, - { - "epoch": 1.2202998239426321, - "grad_norm": 0.030630914494395256, - "learning_rate": 3.64594523362206e-05, - "loss": 0.012480729073286057, - "step": 6520 - }, - { - "epoch": 1.2240432364140448, - "grad_norm": 0.024804554879665375, - "learning_rate": 3.638473635176848e-05, - "loss": 0.0007834361866116523, - "step": 6540 - }, - { - "epoch": 1.2277866488854574, - "grad_norm": 0.011334752663969994, - "learning_rate": 3.630989185266411e-05, - "loss": 0.022086825966835023, - "step": 6560 - }, - { - "epoch": 1.23153006135687, - "grad_norm": 0.020346902310848236, - "learning_rate": 3.623491968377684e-05, - "loss": 0.018024472892284392, - "step": 6580 - }, - { - "epoch": 1.2352734738282827, - "grad_norm": 0.015177210792899132, - "learning_rate": 3.615982069141719e-05, - "loss": 0.005251453071832657, - "step": 6600 - }, - { - "epoch": 1.2390168862996953, - "grad_norm": 0.013680647127330303, - "learning_rate": 3.608459572332733e-05, - "loss": 0.006734563410282135, - "step": 6620 - }, - { - "epoch": 1.242760298771108, - "grad_norm": 0.17980872094631195, - "learning_rate": 3.600924562867144e-05, - "loss": 0.003970410302281379, - "step": 6640 - }, - { - "epoch": 1.2465037112425206, - "grad_norm": 0.015203841030597687, - "learning_rate": 3.593377125802622e-05, - "loss": 0.0032148901373147964, - "step": 6660 - }, - { - "epoch": 1.2502471237139332, - "grad_norm": 0.017300931736826897, - "learning_rate": 3.585817346337119e-05, - "loss": 0.00467667318880558, - "step": 6680 - }, - { - "epoch": 1.2539905361853458, - "grad_norm": 0.028181765228509903, - "learning_rate": 3.5782453098079175e-05, - "loss": 0.0015515764243900776, - "step": 6700 - }, - { - "epoch": 1.2577339486567582, - "grad_norm": 0.01730780117213726, - "learning_rate": 3.570661101690657e-05, - "loss": 0.007991334050893783, - "step": 6720 - }, - { - "epoch": 1.2614773611281709, - "grad_norm": 0.014216347597539425, - "learning_rate": 3.5630648075983763e-05, - "loss": 0.002533360943198204, - "step": 6740 - }, - { - "epoch": 1.2652207735995835, - "grad_norm": 0.1556195169687271, - "learning_rate": 3.555456513280544e-05, - "loss": 0.0032653655856847764, - "step": 6760 - }, - { - "epoch": 1.2689641860709961, - "grad_norm": 0.023955868557095528, - "learning_rate": 3.5478363046220915e-05, - "loss": 0.00850408971309662, - "step": 6780 - }, - { - "epoch": 1.2727075985424088, - "grad_norm": 0.17874136567115784, - "learning_rate": 3.5402042676424424e-05, - "loss": 0.0032720811665058135, - "step": 6800 - }, - { - "epoch": 1.2764510110138214, - "grad_norm": 0.0899379625916481, - "learning_rate": 3.5325604884945434e-05, - "loss": 0.003243798017501831, - "step": 6820 - }, - { - "epoch": 1.280194423485234, - "grad_norm": 0.413362056016922, - "learning_rate": 3.5249050534638906e-05, - "loss": 0.0036127623170614243, - "step": 6840 - }, - { - "epoch": 1.2839378359566467, - "grad_norm": 0.02790931612253189, - "learning_rate": 3.517238048967554e-05, - "loss": 0.008225285261869431, - "step": 6860 - }, - { - "epoch": 1.287681248428059, - "grad_norm": 0.6761110424995422, - "learning_rate": 3.5095595615532056e-05, - "loss": 0.00199942234903574, - "step": 6880 - }, - { - "epoch": 1.2914246608994717, - "grad_norm": 4.593618869781494, - "learning_rate": 3.5018696778981385e-05, - "loss": 0.007301987707614898, - "step": 6900 - }, - { - "epoch": 1.2951680733708844, - "grad_norm": 0.09392693638801575, - "learning_rate": 3.494168484808293e-05, - "loss": 0.009008315950632095, - "step": 6920 - }, - { - "epoch": 1.298911485842297, - "grad_norm": 0.008239852264523506, - "learning_rate": 3.48645606921727e-05, - "loss": 0.012661360204219818, - "step": 6940 - }, - { - "epoch": 1.3026548983137096, - "grad_norm": 0.05141177773475647, - "learning_rate": 3.4787325181853576e-05, - "loss": 0.0007553372532129287, - "step": 6960 - }, - { - "epoch": 1.3063983107851223, - "grad_norm": 0.024333903566002846, - "learning_rate": 3.470997918898541e-05, - "loss": 0.0016128463670611382, - "step": 6980 - }, - { - "epoch": 1.3101417232565349, - "grad_norm": 0.0337531715631485, - "learning_rate": 3.4632523586675254e-05, - "loss": 0.003253454715013504, - "step": 7000 - }, - { - "epoch": 1.3138851357279475, - "grad_norm": 0.05121550336480141, - "learning_rate": 3.4554959249267436e-05, - "loss": 0.0026307271793484686, - "step": 7020 - }, - { - "epoch": 1.3176285481993602, - "grad_norm": 0.025997543707489967, - "learning_rate": 3.447728705233374e-05, - "loss": 0.0012719514779746532, - "step": 7040 - }, - { - "epoch": 1.3213719606707728, - "grad_norm": 0.009486268274486065, - "learning_rate": 3.4399507872663494e-05, - "loss": 0.002009082958102226, - "step": 7060 - }, - { - "epoch": 1.3251153731421854, - "grad_norm": 0.016816232353448868, - "learning_rate": 3.432162258825369e-05, - "loss": 0.0005956823006272316, - "step": 7080 - }, - { - "epoch": 1.328858785613598, - "grad_norm": 0.004733961541205645, - "learning_rate": 3.424363207829906e-05, - "loss": 0.003636709600687027, - "step": 7100 - }, - { - "epoch": 1.3326021980850107, - "grad_norm": 3.666203498840332, - "learning_rate": 3.4165537223182155e-05, - "loss": 0.010488419234752655, - "step": 7120 - }, - { - "epoch": 1.336345610556423, - "grad_norm": 0.021471882238984108, - "learning_rate": 3.408733890446341e-05, - "loss": 0.0009709249250590801, - "step": 7140 - }, - { - "epoch": 1.3400890230278357, - "grad_norm": 0.007639541756361723, - "learning_rate": 3.40090380048712e-05, - "loss": 0.0030905861407518388, - "step": 7160 - }, - { - "epoch": 1.3438324354992484, - "grad_norm": 0.16878941655158997, - "learning_rate": 3.393063540829186e-05, - "loss": 0.0036965351551771163, - "step": 7180 - }, - { - "epoch": 1.347575847970661, - "grad_norm": 0.07014094293117523, - "learning_rate": 3.385213199975971e-05, - "loss": 0.0005677144508808851, - "step": 7200 - }, - { - "epoch": 1.3513192604420736, - "grad_norm": 0.008626374416053295, - "learning_rate": 3.377352866544706e-05, - "loss": 0.0005447934381663799, - "step": 7220 - }, - { - "epoch": 1.3550626729134863, - "grad_norm": 0.013825134374201298, - "learning_rate": 3.3694826292654246e-05, - "loss": 0.004854041337966919, - "step": 7240 - }, - { - "epoch": 1.3588060853848989, - "grad_norm": 0.025015883147716522, - "learning_rate": 3.361602576979956e-05, - "loss": 0.004542553424835205, - "step": 7260 - }, - { - "epoch": 1.3625494978563115, - "grad_norm": 0.009614030830562115, - "learning_rate": 3.353712798640923e-05, - "loss": 0.0008775785565376282, - "step": 7280 - }, - { - "epoch": 1.366292910327724, - "grad_norm": 3.8835268020629883, - "learning_rate": 3.345813383310744e-05, - "loss": 0.0063879616558551785, - "step": 7300 - }, - { - "epoch": 1.3700363227991366, - "grad_norm": 0.005518193822354078, - "learning_rate": 3.337904420160618e-05, - "loss": 0.0010956574231386184, - "step": 7320 - }, - { - "epoch": 1.3737797352705492, - "grad_norm": 0.005018322728574276, - "learning_rate": 3.329985998469526e-05, - "loss": 0.0012317843735218047, - "step": 7340 - }, - { - "epoch": 0.6887872232777639, - "grad_norm": 0.3108454942703247, - "learning_rate": 3.322058207623218e-05, - "loss": 0.010070423781871795, - "step": 7360 - }, - { - "epoch": 0.6906589276888447, - "grad_norm": 0.3556046783924103, - "learning_rate": 3.314121137113209e-05, - "loss": 0.0278738796710968, - "step": 7380 - }, - { - "epoch": 0.6925306320999256, - "grad_norm": 4.041794300079346, - "learning_rate": 3.306174876535762e-05, - "loss": 0.025335192680358887, - "step": 7400 - }, - { - "epoch": 0.6944023365110065, - "grad_norm": 0.04647493362426758, - "learning_rate": 3.2982195155908845e-05, - "loss": 0.05056847333908081, - "step": 7420 - }, - { - "epoch": 0.6962740409220873, - "grad_norm": 0.6827419400215149, - "learning_rate": 3.290653575270209e-05, - "loss": 0.036053261160850524, - "step": 7440 - }, - { - "epoch": 0.6981457453331683, - "grad_norm": 0.256136029958725, - "learning_rate": 3.2826807269966064e-05, - "loss": 0.020640365779399872, - "step": 7460 - }, - { - "epoch": 0.7000174497442492, - "grad_norm": 0.2054845094680786, - "learning_rate": 3.274699043565268e-05, - "loss": 0.03456352353096008, - "step": 7480 - }, - { - "epoch": 0.70188915415533, - "grad_norm": 0.2027648538351059, - "learning_rate": 3.266708615076064e-05, - "loss": 0.00846734493970871, - "step": 7500 - }, - { - "epoch": 0.7037608585664109, - "grad_norm": 1.6423311233520508, - "learning_rate": 3.258709531727582e-05, - "loss": 0.054978948831558225, - "step": 7520 - }, - { - "epoch": 0.7056325629774918, - "grad_norm": 1.775089144706726, - "learning_rate": 3.2507018838161085e-05, - "loss": 0.03238933086395264, - "step": 7540 - }, - { - "epoch": 0.7075042673885726, - "grad_norm": 0.06917860358953476, - "learning_rate": 3.242685761734609e-05, - "loss": 0.016849520802497863, - "step": 7560 - }, - { - "epoch": 0.7093759717996535, - "grad_norm": 0.051443129777908325, - "learning_rate": 3.2346612559717094e-05, - "loss": 0.048251998424530027, - "step": 7580 - }, - { - "epoch": 0.7112476762107344, - "grad_norm": 0.06533925980329514, - "learning_rate": 3.226628457110672e-05, - "loss": 0.03696450293064117, - "step": 7600 - }, - { - "epoch": 0.7131193806218153, - "grad_norm": 0.45661595463752747, - "learning_rate": 3.218587455828377e-05, - "loss": 0.05503013730049133, - "step": 7620 - }, - { - "epoch": 0.7149910850328962, - "grad_norm": 2.0205914974212646, - "learning_rate": 3.210538342894291e-05, - "loss": 0.033562681078910826, - "step": 7640 - }, - { - "epoch": 0.7168627894439771, - "grad_norm": 2.4842448234558105, - "learning_rate": 3.202481209169455e-05, - "loss": 0.019278638064861298, - "step": 7660 - }, - { - "epoch": 0.7187344938550579, - "grad_norm": 0.10550081729888916, - "learning_rate": 3.1944161456054436e-05, - "loss": 0.01638232171535492, - "step": 7680 - }, - { - "epoch": 0.7206061982661388, - "grad_norm": 1.606436014175415, - "learning_rate": 3.1863432432433506e-05, - "loss": 0.020552067458629607, - "step": 7700 - }, - { - "epoch": 0.7224779026772197, - "grad_norm": 0.2617719769477844, - "learning_rate": 3.178262593212757e-05, - "loss": 0.02315783053636551, - "step": 7720 - }, - { - "epoch": 0.7243496070883005, - "grad_norm": 0.9734074473381042, - "learning_rate": 3.1701742867307e-05, - "loss": 0.01938771307468414, - "step": 7740 - }, - { - "epoch": 0.7262213114993814, - "grad_norm": 0.5882985591888428, - "learning_rate": 3.162078415100647e-05, - "loss": 0.011305707693099975, - "step": 7760 - }, - { - "epoch": 0.7280930159104624, - "grad_norm": 0.04298723489046097, - "learning_rate": 3.15397506971146e-05, - "loss": 0.04238930344581604, - "step": 7780 - }, - { - "epoch": 0.7299647203215432, - "grad_norm": 6.2729315757751465, - "learning_rate": 3.145864342036372e-05, - "loss": 0.030225831270217895, - "step": 7800 - }, - { - "epoch": 0.7318364247326241, - "grad_norm": 0.026423340663313866, - "learning_rate": 3.1377463236319476e-05, - "loss": 0.012169972807168961, - "step": 7820 - }, - { - "epoch": 0.733708129143705, - "grad_norm": 0.0296376533806324, - "learning_rate": 3.1296211061370495e-05, - "loss": 0.015344823896884918, - "step": 7840 - }, - { - "epoch": 0.7355798335547858, - "grad_norm": 0.029524821788072586, - "learning_rate": 3.1214887812718094e-05, - "loss": 0.028345003724098206, - "step": 7860 - }, - { - "epoch": 0.7374515379658667, - "grad_norm": 0.06847794353961945, - "learning_rate": 3.113349440836588e-05, - "loss": 0.020069575309753417, - "step": 7880 - }, - { - "epoch": 0.7393232423769476, - "grad_norm": 0.024868430569767952, - "learning_rate": 3.1052031767109376e-05, - "loss": 0.014262473583221436, - "step": 7900 - }, - { - "epoch": 0.7411949467880286, - "grad_norm": 0.24450063705444336, - "learning_rate": 3.097050080852573e-05, - "loss": 0.04350808262825012, - "step": 7920 - }, - { - "epoch": 0.7430666511991094, - "grad_norm": 0.06978324800729752, - "learning_rate": 3.088890245296322e-05, - "loss": 0.015559709072113037, - "step": 7940 - }, - { - "epoch": 0.7449383556101903, - "grad_norm": 0.12675604224205017, - "learning_rate": 3.0807237621530964e-05, - "loss": 0.013867451250553131, - "step": 7960 - }, - { - "epoch": 0.7468100600212711, - "grad_norm": 0.2605513334274292, - "learning_rate": 3.072550723608846e-05, - "loss": 0.012869009375572204, - "step": 7980 - }, - { - "epoch": 0.748681764432352, - "grad_norm": 3.325530529022217, - "learning_rate": 3.064371221923521e-05, - "loss": 0.03036353886127472, - "step": 8000 - }, - { - "epoch": 0.7505534688434329, - "grad_norm": 0.22703051567077637, - "learning_rate": 3.0561853494300294e-05, - "loss": 0.009017374366521835, - "step": 8020 - }, - { - "epoch": 0.7524251732545137, - "grad_norm": 6.404862880706787, - "learning_rate": 3.047993198533195e-05, - "loss": 0.020604299008846284, - "step": 8040 - }, - { - "epoch": 0.7542968776655946, - "grad_norm": 0.06491954624652863, - "learning_rate": 3.039794861708714e-05, - "loss": 0.014963623881340028, - "step": 8060 - }, - { - "epoch": 0.7561685820766756, - "grad_norm": 0.4990088641643524, - "learning_rate": 3.0315904315021128e-05, - "loss": 0.02046530395746231, - "step": 8080 - }, - { - "epoch": 0.7580402864877565, - "grad_norm": 0.3174229562282562, - "learning_rate": 3.023380000527699e-05, - "loss": 0.013621781766414643, - "step": 8100 - }, - { - "epoch": 0.7599119908988373, - "grad_norm": 0.07161428034305573, - "learning_rate": 3.0151636614675218e-05, - "loss": 0.008043503761291504, - "step": 8120 - }, - { - "epoch": 0.7617836953099182, - "grad_norm": 0.6772736310958862, - "learning_rate": 3.0069415070703217e-05, - "loss": 0.03563189804553986, - "step": 8140 - }, - { - "epoch": 0.763655399720999, - "grad_norm": 0.07689516246318817, - "learning_rate": 2.998713630150485e-05, - "loss": 0.008622632920742035, - "step": 8160 - }, - { - "epoch": 0.7655271041320799, - "grad_norm": 0.014181110076606274, - "learning_rate": 2.990480123586994e-05, - "loss": 0.012368627637624741, - "step": 8180 - }, - { - "epoch": 0.7673988085431608, - "grad_norm": 4.4751715660095215, - "learning_rate": 2.9822410803223822e-05, - "loss": 0.02100955694913864, - "step": 8200 - }, - { - "epoch": 0.7692705129542416, - "grad_norm": 0.12694527208805084, - "learning_rate": 2.9739965933616825e-05, - "loss": 0.018182000517845152, - "step": 8220 - }, - { - "epoch": 0.7711422173653226, - "grad_norm": 0.13789872825145721, - "learning_rate": 2.9657467557713792e-05, - "loss": 0.008949784934520722, - "step": 8240 - }, - { - "epoch": 0.7730139217764035, - "grad_norm": 0.04048463702201843, - "learning_rate": 2.957491660678354e-05, - "loss": 0.03582434058189392, - "step": 8260 - }, - { - "epoch": 0.7748856261874844, - "grad_norm": 0.7825964689254761, - "learning_rate": 2.9492314012688378e-05, - "loss": 0.012679101526737213, - "step": 8280 - }, - { - "epoch": 0.7767573305985652, - "grad_norm": 0.14350314438343048, - "learning_rate": 2.9409660707873597e-05, - "loss": 0.010909486562013626, - "step": 8300 - }, - { - "epoch": 0.7786290350096461, - "grad_norm": 0.17676737904548645, - "learning_rate": 2.932695762535691e-05, - "loss": 0.01464642733335495, - "step": 8320 - }, - { - "epoch": 0.780500739420727, - "grad_norm": 0.5979751348495483, - "learning_rate": 2.9244205698717943e-05, - "loss": 0.028799059987068176, - "step": 8340 - }, - { - "epoch": 0.7823724438318078, - "grad_norm": 0.08448052406311035, - "learning_rate": 2.9161405862087676e-05, - "loss": 0.014056096971035003, - "step": 8360 - }, - { - "epoch": 0.7842441482428888, - "grad_norm": 0.5616207122802734, - "learning_rate": 2.9078559050137955e-05, - "loss": 0.008744364231824875, - "step": 8380 - }, - { - "epoch": 0.7861158526539697, - "grad_norm": 0.7264829277992249, - "learning_rate": 2.8995666198070836e-05, - "loss": 0.014575870335102081, - "step": 8400 - }, - { - "epoch": 0.7879875570650505, - "grad_norm": 1.444239616394043, - "learning_rate": 2.891272824160815e-05, - "loss": 0.01230706349015236, - "step": 8420 - }, - { - "epoch": 0.7898592614761314, - "grad_norm": 0.02643579989671707, - "learning_rate": 2.882974611698084e-05, - "loss": 0.01713460832834244, - "step": 8440 - }, - { - "epoch": 0.7917309658872123, - "grad_norm": 0.19893163442611694, - "learning_rate": 2.8746720760918457e-05, - "loss": 0.009562552720308305, - "step": 8460 - }, - { - "epoch": 0.7936026702982931, - "grad_norm": 1.8813897371292114, - "learning_rate": 2.866365311063855e-05, - "loss": 0.01966284364461899, - "step": 8480 - }, - { - "epoch": 0.795474374709374, - "grad_norm": 0.1820579618215561, - "learning_rate": 2.8580544103836114e-05, - "loss": 0.023943188786506652, - "step": 8500 - }, - { - "epoch": 0.7973460791204549, - "grad_norm": 1.3913259506225586, - "learning_rate": 2.849739467867298e-05, - "loss": 0.02233349084854126, - "step": 8520 - }, - { - "epoch": 0.7992177835315358, - "grad_norm": 0.28450486063957214, - "learning_rate": 2.8414205773767223e-05, - "loss": 0.016230446100234986, - "step": 8540 - }, - { - "epoch": 0.8010894879426167, - "grad_norm": 0.46086356043815613, - "learning_rate": 2.83309783281826e-05, - "loss": 0.013964855670928955, - "step": 8560 - }, - { - "epoch": 0.8029611923536976, - "grad_norm": 1.1401137113571167, - "learning_rate": 2.8247713281417924e-05, - "loss": 0.01552264392375946, - "step": 8580 - }, - { - "epoch": 0.8048328967647784, - "grad_norm": 0.02414649911224842, - "learning_rate": 2.8164411573396444e-05, - "loss": 0.00505053773522377, - "step": 8600 - }, - { - "epoch": 0.8067046011758593, - "grad_norm": 0.029010778293013573, - "learning_rate": 2.8081074144455276e-05, - "loss": 0.008068422973155975, - "step": 8620 - }, - { - "epoch": 0.8085763055869402, - "grad_norm": 0.024924319237470627, - "learning_rate": 2.7997701935334747e-05, - "loss": 0.021529987454414368, - "step": 8640 - }, - { - "epoch": 0.810448009998021, - "grad_norm": 0.3544171154499054, - "learning_rate": 2.791429588716782e-05, - "loss": 0.008264218270778657, - "step": 8660 - }, - { - "epoch": 0.8123197144091019, - "grad_norm": 0.011211074888706207, - "learning_rate": 2.7830856941469407e-05, - "loss": 0.013752134144306183, - "step": 8680 - }, - { - "epoch": 0.8141914188201829, - "grad_norm": 0.30479249358177185, - "learning_rate": 2.7747386040125807e-05, - "loss": 0.01313515156507492, - "step": 8700 - }, - { - "epoch": 0.8160631232312637, - "grad_norm": 3.1079516410827637, - "learning_rate": 2.766388412538404e-05, - "loss": 0.013471932709217071, - "step": 8720 - }, - { - "epoch": 0.8179348276423446, - "grad_norm": 0.011288405396044254, - "learning_rate": 2.758035213984121e-05, - "loss": 0.011207062005996703, - "step": 8740 - }, - { - "epoch": 0.8198065320534255, - "grad_norm": 0.011481484398245811, - "learning_rate": 2.749679102643387e-05, - "loss": 0.018254657089710236, - "step": 8760 - }, - { - "epoch": 0.8216782364645063, - "grad_norm": 0.037564992904663086, - "learning_rate": 2.7413201728427372e-05, - "loss": 0.024057184159755707, - "step": 8780 - }, - { - "epoch": 0.8235499408755872, - "grad_norm": 0.03808968514204025, - "learning_rate": 2.7329585189405253e-05, - "loss": 0.006051592528820038, - "step": 8800 - }, - { - "epoch": 0.8254216452866681, - "grad_norm": 0.07610247284173965, - "learning_rate": 2.724594235325852e-05, - "loss": 0.025592076778411865, - "step": 8820 - }, - { - "epoch": 0.827293349697749, - "grad_norm": 0.019049810245633125, - "learning_rate": 2.716227416417505e-05, - "loss": 0.0037486787885427477, - "step": 8840 - }, - { - "epoch": 0.8291650541088299, - "grad_norm": 0.6380273699760437, - "learning_rate": 2.7078581566628897e-05, - "loss": 0.015487492084503174, - "step": 8860 - }, - { - "epoch": 0.8310367585199108, - "grad_norm": 0.05775881186127663, - "learning_rate": 2.699486550536968e-05, - "loss": 0.03133237063884735, - "step": 8880 - }, - { - "epoch": 0.8329084629309916, - "grad_norm": 0.047411222010850906, - "learning_rate": 2.6911126925411845e-05, - "loss": 0.00861177071928978, - "step": 8900 - }, - { - "epoch": 0.8347801673420725, - "grad_norm": 0.23981286585330963, - "learning_rate": 2.682736677202406e-05, - "loss": 0.01839599907398224, - "step": 8920 - }, - { - "epoch": 0.8366518717531534, - "grad_norm": 0.36887305974960327, - "learning_rate": 2.6743585990718505e-05, - "loss": 0.01008533239364624, - "step": 8940 - }, - { - "epoch": 0.8385235761642342, - "grad_norm": 0.8994531035423279, - "learning_rate": 2.6659785527240233e-05, - "loss": 0.027107802033424378, - "step": 8960 - }, - { - "epoch": 0.8403952805753151, - "grad_norm": 0.12780402600765228, - "learning_rate": 2.6575966327556458e-05, - "loss": 0.03549482524394989, - "step": 8980 - }, - { - "epoch": 0.8422669849863961, - "grad_norm": 0.3294568359851837, - "learning_rate": 2.649212933784591e-05, - "loss": 0.02797776460647583, - "step": 9000 - }, - { - "epoch": 0.8441386893974769, - "grad_norm": 0.019461506977677345, - "learning_rate": 2.640827550448812e-05, - "loss": 0.010047334432601928, - "step": 9020 - }, - { - "epoch": 0.8460103938085578, - "grad_norm": 0.056546472012996674, - "learning_rate": 2.6324405774052784e-05, - "loss": 0.02831721007823944, - "step": 9040 - }, - { - "epoch": 0.8478820982196387, - "grad_norm": 0.017190299928188324, - "learning_rate": 2.6240521093289022e-05, - "loss": 0.019623257219791412, - "step": 9060 - }, - { - "epoch": 0.8497538026307195, - "grad_norm": 0.04793965816497803, - "learning_rate": 2.6156622409114728e-05, - "loss": 0.011966148018836975, - "step": 9080 - }, - { - "epoch": 0.8516255070418004, - "grad_norm": 0.006742037367075682, - "learning_rate": 2.607271066860587e-05, - "loss": 0.013694784045219422, - "step": 9100 - }, - { - "epoch": 0.8534972114528813, - "grad_norm": 0.03113027848303318, - "learning_rate": 2.5988786818985812e-05, - "loss": 0.05338943004608154, - "step": 9120 - }, - { - "epoch": 0.8553689158639621, - "grad_norm": 0.6589255928993225, - "learning_rate": 2.5904851807614588e-05, - "loss": 0.01305432766675949, - "step": 9140 - }, - { - "epoch": 0.8572406202750431, - "grad_norm": 0.3030281960964203, - "learning_rate": 2.582090658197825e-05, - "loss": 0.03663805425167084, - "step": 9160 - }, - { - "epoch": 0.859112324686124, - "grad_norm": 0.37101081013679504, - "learning_rate": 2.573695208967814e-05, - "loss": 0.016968609392642976, - "step": 9180 - }, - { - "epoch": 0.8609840290972048, - "grad_norm": 0.7480998039245605, - "learning_rate": 2.5652989278420197e-05, - "loss": 0.021240857243537904, - "step": 9200 - }, - { - "epoch": 0.8628557335082857, - "grad_norm": 0.017131274566054344, - "learning_rate": 2.5569019096004304e-05, - "loss": 0.004783949628472328, - "step": 9220 - }, - { - "epoch": 0.8647274379193666, - "grad_norm": 1.1544040441513062, - "learning_rate": 2.5485042490313504e-05, - "loss": 0.02356208860874176, - "step": 9240 - }, - { - "epoch": 0.8665991423304474, - "grad_norm": 0.13512635231018066, - "learning_rate": 2.540106040930338e-05, - "loss": 0.009329542517662048, - "step": 9260 - }, - { - "epoch": 0.8684708467415283, - "grad_norm": 0.018427839502692223, - "learning_rate": 2.5317073800991304e-05, - "loss": 0.007472375035285949, - "step": 9280 - }, - { - "epoch": 0.8703425511526093, - "grad_norm": 0.02722800336778164, - "learning_rate": 2.5233083613445778e-05, - "loss": 0.020304642617702484, - "step": 9300 - }, - { - "epoch": 0.8722142555636901, - "grad_norm": 0.051702745258808136, - "learning_rate": 2.5149090794775675e-05, - "loss": 0.02955295443534851, - "step": 9320 - }, - { - "epoch": 0.874085959974771, - "grad_norm": 0.1535400152206421, - "learning_rate": 2.5065096293119604e-05, - "loss": 0.030047640204429626, - "step": 9340 - }, - { - "epoch": 0.8759576643858519, - "grad_norm": 0.383573979139328, - "learning_rate": 2.498110105663513e-05, - "loss": 0.011377302557229995, - "step": 9360 - }, - { - "epoch": 0.8778293687969327, - "grad_norm": 0.23541487753391266, - "learning_rate": 2.489710603348817e-05, - "loss": 0.02304387390613556, - "step": 9380 - }, - { - "epoch": 0.8797010732080136, - "grad_norm": 0.029004938900470734, - "learning_rate": 2.4813112171842162e-05, - "loss": 0.020582889020442963, - "step": 9400 - }, - { - "epoch": 0.8815727776190945, - "grad_norm": 0.06564116477966309, - "learning_rate": 2.4729120419847498e-05, - "loss": 0.014207787811756134, - "step": 9420 - }, - { - "epoch": 0.8834444820301753, - "grad_norm": 0.01633615791797638, - "learning_rate": 2.464513172563072e-05, - "loss": 0.01756283938884735, - "step": 9440 - }, - { - "epoch": 0.8853161864412563, - "grad_norm": 0.01287770178169012, - "learning_rate": 2.456114703728386e-05, - "loss": 0.003737853467464447, - "step": 9460 - }, - { - "epoch": 0.8871878908523372, - "grad_norm": 0.05004064738750458, - "learning_rate": 2.448136615728485e-05, - "loss": 0.0324675589799881, - "step": 9480 - }, - { - "epoch": 0.889059595263418, - "grad_norm": 1.20869779586792, - "learning_rate": 2.4397392007153162e-05, - "loss": 0.007156150788068772, - "step": 9500 - }, - { - "epoch": 0.8909312996744989, - "grad_norm": 1.1070218086242676, - "learning_rate": 2.43134246594589e-05, - "loss": 0.009275762736797333, - "step": 9520 - }, - { - "epoch": 0.8928030040855798, - "grad_norm": 0.878593385219574, - "learning_rate": 2.4229465062053136e-05, - "loss": 0.018170186877250673, - "step": 9540 - }, - { - "epoch": 2.236302797078385, - "grad_norm": 2.294339179992676, - "learning_rate": 1.4461640332194936e-05, - "loss": 0.07619959115982056, - "step": 9560 - }, - { - "epoch": 2.2409820603868678, - "grad_norm": 0.2697487473487854, - "learning_rate": 1.4366537531356394e-05, - "loss": 0.08616560101509094, - "step": 9580 - }, - { - "epoch": 2.2456613236953507, - "grad_norm": 1.5392569303512573, - "learning_rate": 1.4271622228435674e-05, - "loss": 0.052218639850616456, - "step": 9600 - }, - { - "epoch": 2.2503405870038335, - "grad_norm": 2.0239648818969727, - "learning_rate": 1.4176896097057135e-05, - "loss": 0.08808050155639649, - "step": 9620 - }, - { - "epoch": 2.2550198503123164, - "grad_norm": 1.629538655281067, - "learning_rate": 1.4082360807509482e-05, - "loss": 0.07276531457901, - "step": 9640 - }, - { - "epoch": 2.2596991136207993, - "grad_norm": 1.7065048217773438, - "learning_rate": 1.3988018026716371e-05, - "loss": 0.05087214708328247, - "step": 9660 - }, - { - "epoch": 2.2643783769292822, - "grad_norm": 0.10258202999830246, - "learning_rate": 1.3893869418206949e-05, - "loss": 0.05631760954856872, - "step": 9680 - }, - { - "epoch": 2.269057640237765, - "grad_norm": 0.08703255653381348, - "learning_rate": 1.3799916642086585e-05, - "loss": 0.05722883343696594, - "step": 9700 - }, - { - "epoch": 2.273736903546248, - "grad_norm": 0.6752107739448547, - "learning_rate": 1.3706161355007579e-05, - "loss": 0.07108172178268432, - "step": 9720 - }, - { - "epoch": 2.278416166854731, - "grad_norm": 1.734405279159546, - "learning_rate": 1.3612605210139912e-05, - "loss": 0.04115844368934631, - "step": 9740 - }, - { - "epoch": 2.283095430163214, - "grad_norm": 2.0433499813079834, - "learning_rate": 1.3519249857142147e-05, - "loss": 0.053622370958328246, - "step": 9760 - }, - { - "epoch": 2.2877746934716967, - "grad_norm": 1.466838002204895, - "learning_rate": 1.3426096942132305e-05, - "loss": 0.07005876302719116, - "step": 9780 - }, - { - "epoch": 2.2924539567801796, - "grad_norm": 1.3480894565582275, - "learning_rate": 1.3333148107658883e-05, - "loss": 0.0501272439956665, - "step": 9800 - }, - { - "epoch": 2.2971332200886625, - "grad_norm": 2.2553582191467285, - "learning_rate": 1.3240404992671823e-05, - "loss": 0.058852237462997434, - "step": 9820 - }, - { - "epoch": 2.3018124833971454, - "grad_norm": 0.1796468198299408, - "learning_rate": 1.3147869232493698e-05, - "loss": 0.05089703798294067, - "step": 9840 - }, - { - "epoch": 2.306491746705628, - "grad_norm": 3.135744571685791, - "learning_rate": 1.305554245879079e-05, - "loss": 0.04962855279445648, - "step": 9860 - }, - { - "epoch": 2.3111710100141107, - "grad_norm": 2.1585986614227295, - "learning_rate": 1.296342629954439e-05, - "loss": 0.07206055521965027, - "step": 9880 - }, - { - "epoch": 2.3158502733225936, - "grad_norm": 0.10592425614595413, - "learning_rate": 1.2871522379022038e-05, - "loss": 0.04145916402339935, - "step": 9900 - }, - { - "epoch": 2.3205295366310765, - "grad_norm": 0.5150194764137268, - "learning_rate": 1.2779832317748933e-05, - "loss": 0.05638412833213806, - "step": 9920 - }, - { - "epoch": 2.3252087999395594, - "grad_norm": 1.6214760541915894, - "learning_rate": 1.2688357732479303e-05, - "loss": 0.021433608233928682, - "step": 9940 - }, - { - "epoch": 2.3298880632480423, - "grad_norm": 0.027669433504343033, - "learning_rate": 1.2597100236167963e-05, - "loss": 0.036874741315841675, - "step": 9960 - }, - { - "epoch": 2.334567326556525, - "grad_norm": 1.560826301574707, - "learning_rate": 1.2506061437941804e-05, - "loss": 0.05353221893310547, - "step": 9980 - }, - { - "epoch": 2.339246589865008, - "grad_norm": 0.7329757809638977, - "learning_rate": 1.241524294307147e-05, - "loss": 0.042856207489967345, - "step": 10000 - }, - { - "epoch": 2.343925853173491, - "grad_norm": 0.01610807701945305, - "learning_rate": 1.232464635294302e-05, - "loss": 0.02504704296588898, - "step": 10020 - }, - { - "epoch": 2.348605116481974, - "grad_norm": 0.04221898317337036, - "learning_rate": 1.2234273265029742e-05, - "loss": 0.030704396963119506, - "step": 10040 - }, - { - "epoch": 2.353284379790457, - "grad_norm": 0.22617070376873016, - "learning_rate": 1.2144125272863905e-05, - "loss": 0.020115789771080018, - "step": 10060 - }, - { - "epoch": 2.3579636430989397, - "grad_norm": 0.7796891331672668, - "learning_rate": 1.2054203966008747e-05, - "loss": 0.02525162398815155, - "step": 10080 - }, - { - "epoch": 2.3626429064074226, - "grad_norm": 1.364593744277954, - "learning_rate": 1.1964510930030368e-05, - "loss": 0.015173476934432984, - "step": 10100 - }, - { - "epoch": 2.3673221697159055, - "grad_norm": 0.9444358944892883, - "learning_rate": 1.1875047746469847e-05, - "loss": 0.01972121149301529, - "step": 10120 - }, - { - "epoch": 2.3720014330243884, - "grad_norm": 2.1904690265655518, - "learning_rate": 1.1785815992815274e-05, - "loss": 0.033346959948539735, - "step": 10140 - }, - { - "epoch": 2.3766806963328713, - "grad_norm": 1.3154014348983765, - "learning_rate": 1.1696817242474012e-05, - "loss": 0.04375478029251099, - "step": 10160 - } - ], - "logging_steps": 20, - "max_steps": 14963, - "num_input_tokens_seen": 0, - "num_train_epochs": 4, - "save_steps": 1000000000, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 3.899635329624653e+16, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/slots/8/checkpoint-10168/training_args.bin b/slots/8/checkpoint-10168/training_args.bin deleted file mode 100644 index cba6bf44229020a6cf5d76cffc747dea705142ea..0000000000000000000000000000000000000000 --- a/slots/8/checkpoint-10168/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:66430bba102a8f3dc245713cd6268a99c212c508aacce1d8b9768464f5df26ec -size 5201 diff --git a/slots/8/latest.json b/slots/8/latest.json deleted file mode 100644 index 70eb5dc6a893b0bcc9e72dc7a314213162693f3c..0000000000000000000000000000000000000000 --- a/slots/8/latest.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "slot:8", "checkpoint": "slots/8/checkpoint-10168", "step": 10168, "updated_at": 1776816241} diff --git a/slots/9/latest.json b/slots/9/latest.json deleted file mode 100644 index 0d5c7bfbb7c2bf6a82595e8bb63b8759b496cc4e..0000000000000000000000000000000000000000 --- a/slots/9/latest.json +++ /dev/null @@ -1 +0,0 @@ -{"worker_id": "slot:9", "checkpoint": "slots/9/checkpoint-9208", "step": 9208, "updated_at": 1776779778}