{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9893190921228303,
"eval_steps": 500,
"global_step": 747,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004005340453938585,
"grad_norm": 12.274865344598787,
"learning_rate": 0.0,
"loss": 0.8646,
"step": 1
},
{
"epoch": 0.00801068090787717,
"grad_norm": 12.535290073417938,
"learning_rate": 1.3333333333333336e-07,
"loss": 0.8676,
"step": 2
},
{
"epoch": 0.012016021361815754,
"grad_norm": 13.07061199884287,
"learning_rate": 2.666666666666667e-07,
"loss": 0.8902,
"step": 3
},
{
"epoch": 0.01602136181575434,
"grad_norm": 12.267049803439043,
"learning_rate": 4.0000000000000003e-07,
"loss": 0.8587,
"step": 4
},
{
"epoch": 0.020026702269692925,
"grad_norm": 11.897699285802622,
"learning_rate": 5.333333333333335e-07,
"loss": 0.8435,
"step": 5
},
{
"epoch": 0.02403204272363151,
"grad_norm": 13.076081077633422,
"learning_rate": 6.666666666666667e-07,
"loss": 0.848,
"step": 6
},
{
"epoch": 0.028037383177570093,
"grad_norm": 12.354188617098337,
"learning_rate": 8.000000000000001e-07,
"loss": 0.8607,
"step": 7
},
{
"epoch": 0.03204272363150868,
"grad_norm": 11.607524627207871,
"learning_rate": 9.333333333333334e-07,
"loss": 0.8296,
"step": 8
},
{
"epoch": 0.036048064085447265,
"grad_norm": 11.787571525976068,
"learning_rate": 1.066666666666667e-06,
"loss": 0.8279,
"step": 9
},
{
"epoch": 0.04005340453938585,
"grad_norm": 10.854034800032643,
"learning_rate": 1.2000000000000002e-06,
"loss": 0.8255,
"step": 10
},
{
"epoch": 0.044058744993324434,
"grad_norm": 10.331302172366403,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.8452,
"step": 11
},
{
"epoch": 0.04806408544726302,
"grad_norm": 10.51455049392686,
"learning_rate": 1.4666666666666669e-06,
"loss": 0.8477,
"step": 12
},
{
"epoch": 0.0520694259012016,
"grad_norm": 6.804734157169537,
"learning_rate": 1.6000000000000001e-06,
"loss": 0.702,
"step": 13
},
{
"epoch": 0.056074766355140186,
"grad_norm": 6.630143554129149,
"learning_rate": 1.7333333333333336e-06,
"loss": 0.716,
"step": 14
},
{
"epoch": 0.06008010680907877,
"grad_norm": 5.650387053647136,
"learning_rate": 1.8666666666666669e-06,
"loss": 0.6831,
"step": 15
},
{
"epoch": 0.06408544726301736,
"grad_norm": 5.847220094115504,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.7188,
"step": 16
},
{
"epoch": 0.06809078771695594,
"grad_norm": 2.7400023117365913,
"learning_rate": 2.133333333333334e-06,
"loss": 0.6409,
"step": 17
},
{
"epoch": 0.07209612817089453,
"grad_norm": 2.2018255790223518,
"learning_rate": 2.266666666666667e-06,
"loss": 0.539,
"step": 18
},
{
"epoch": 0.07610146862483311,
"grad_norm": 2.392641868923764,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.6092,
"step": 19
},
{
"epoch": 0.0801068090787717,
"grad_norm": 2.1787060198095847,
"learning_rate": 2.5333333333333338e-06,
"loss": 0.6169,
"step": 20
},
{
"epoch": 0.08411214953271028,
"grad_norm": 1.8580036959151014,
"learning_rate": 2.666666666666667e-06,
"loss": 0.5581,
"step": 21
},
{
"epoch": 0.08811748998664887,
"grad_norm": 1.8125640693516234,
"learning_rate": 2.8000000000000003e-06,
"loss": 0.5566,
"step": 22
},
{
"epoch": 0.09212283044058744,
"grad_norm": 1.7280121022360342,
"learning_rate": 2.9333333333333338e-06,
"loss": 0.5672,
"step": 23
},
{
"epoch": 0.09612817089452604,
"grad_norm": 1.8991117514168228,
"learning_rate": 3.066666666666667e-06,
"loss": 0.5354,
"step": 24
},
{
"epoch": 0.10013351134846461,
"grad_norm": 1.6356492105183125,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.5275,
"step": 25
},
{
"epoch": 0.1041388518024032,
"grad_norm": 1.6267128477825465,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.5338,
"step": 26
},
{
"epoch": 0.1081441922563418,
"grad_norm": 1.404686847432176,
"learning_rate": 3.4666666666666672e-06,
"loss": 0.5197,
"step": 27
},
{
"epoch": 0.11214953271028037,
"grad_norm": 1.303663556401033,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.5308,
"step": 28
},
{
"epoch": 0.11615487316421896,
"grad_norm": 1.1046386263286005,
"learning_rate": 3.7333333333333337e-06,
"loss": 0.5012,
"step": 29
},
{
"epoch": 0.12016021361815754,
"grad_norm": 1.0758105382558327,
"learning_rate": 3.866666666666667e-06,
"loss": 0.4804,
"step": 30
},
{
"epoch": 0.12416555407209613,
"grad_norm": 0.889059378144954,
"learning_rate": 4.000000000000001e-06,
"loss": 0.4571,
"step": 31
},
{
"epoch": 0.12817089452603472,
"grad_norm": 0.9541992345873649,
"learning_rate": 4.133333333333333e-06,
"loss": 0.4188,
"step": 32
},
{
"epoch": 0.1321762349799733,
"grad_norm": 0.9939649352643045,
"learning_rate": 4.266666666666668e-06,
"loss": 0.4638,
"step": 33
},
{
"epoch": 0.13618157543391188,
"grad_norm": 1.0368182385408335,
"learning_rate": 4.4e-06,
"loss": 0.4594,
"step": 34
},
{
"epoch": 0.14018691588785046,
"grad_norm": 1.0252643282112182,
"learning_rate": 4.533333333333334e-06,
"loss": 0.4349,
"step": 35
},
{
"epoch": 0.14419225634178906,
"grad_norm": 0.8930734240919034,
"learning_rate": 4.666666666666667e-06,
"loss": 0.4105,
"step": 36
},
{
"epoch": 0.14819759679572764,
"grad_norm": 0.8638620093928763,
"learning_rate": 4.800000000000001e-06,
"loss": 0.444,
"step": 37
},
{
"epoch": 0.15220293724966621,
"grad_norm": 0.8266472764867793,
"learning_rate": 4.933333333333334e-06,
"loss": 0.4264,
"step": 38
},
{
"epoch": 0.15620827770360482,
"grad_norm": 0.7587973597324337,
"learning_rate": 5.0666666666666676e-06,
"loss": 0.4153,
"step": 39
},
{
"epoch": 0.1602136181575434,
"grad_norm": 0.7046790303627571,
"learning_rate": 5.2e-06,
"loss": 0.3968,
"step": 40
},
{
"epoch": 0.16421895861148197,
"grad_norm": 0.7828655737674856,
"learning_rate": 5.333333333333334e-06,
"loss": 0.425,
"step": 41
},
{
"epoch": 0.16822429906542055,
"grad_norm": 0.7970939916520573,
"learning_rate": 5.466666666666667e-06,
"loss": 0.4055,
"step": 42
},
{
"epoch": 0.17222963951935916,
"grad_norm": 0.6788619839771596,
"learning_rate": 5.600000000000001e-06,
"loss": 0.3942,
"step": 43
},
{
"epoch": 0.17623497997329773,
"grad_norm": 0.6213101486173681,
"learning_rate": 5.733333333333334e-06,
"loss": 0.4008,
"step": 44
},
{
"epoch": 0.1802403204272363,
"grad_norm": 0.5942610468735896,
"learning_rate": 5.8666666666666675e-06,
"loss": 0.3918,
"step": 45
},
{
"epoch": 0.1842456608811749,
"grad_norm": 0.661249969118244,
"learning_rate": 6e-06,
"loss": 0.3713,
"step": 46
},
{
"epoch": 0.1882510013351135,
"grad_norm": 0.6165605112645042,
"learning_rate": 6.133333333333334e-06,
"loss": 0.3695,
"step": 47
},
{
"epoch": 0.19225634178905207,
"grad_norm": 0.6418004850122087,
"learning_rate": 6.266666666666668e-06,
"loss": 0.3855,
"step": 48
},
{
"epoch": 0.19626168224299065,
"grad_norm": 0.671209019626683,
"learning_rate": 6.4000000000000006e-06,
"loss": 0.4228,
"step": 49
},
{
"epoch": 0.20026702269692923,
"grad_norm": 0.6303030288370243,
"learning_rate": 6.533333333333334e-06,
"loss": 0.3711,
"step": 50
},
{
"epoch": 0.20427236315086783,
"grad_norm": 0.6417652044922048,
"learning_rate": 6.666666666666667e-06,
"loss": 0.389,
"step": 51
},
{
"epoch": 0.2082777036048064,
"grad_norm": 0.572170970965431,
"learning_rate": 6.800000000000001e-06,
"loss": 0.3795,
"step": 52
},
{
"epoch": 0.21228304405874499,
"grad_norm": 0.5711694232400057,
"learning_rate": 6.9333333333333344e-06,
"loss": 0.3689,
"step": 53
},
{
"epoch": 0.2162883845126836,
"grad_norm": 0.5910040436075836,
"learning_rate": 7.066666666666667e-06,
"loss": 0.365,
"step": 54
},
{
"epoch": 0.22029372496662217,
"grad_norm": 0.6284207342849625,
"learning_rate": 7.2000000000000005e-06,
"loss": 0.4132,
"step": 55
},
{
"epoch": 0.22429906542056074,
"grad_norm": 0.5849289722490485,
"learning_rate": 7.333333333333333e-06,
"loss": 0.373,
"step": 56
},
{
"epoch": 0.22830440587449932,
"grad_norm": 0.6341921136746668,
"learning_rate": 7.4666666666666675e-06,
"loss": 0.3918,
"step": 57
},
{
"epoch": 0.23230974632843793,
"grad_norm": 0.5938896188604564,
"learning_rate": 7.600000000000001e-06,
"loss": 0.3663,
"step": 58
},
{
"epoch": 0.2363150867823765,
"grad_norm": 0.5821270563686713,
"learning_rate": 7.733333333333334e-06,
"loss": 0.3465,
"step": 59
},
{
"epoch": 0.24032042723631508,
"grad_norm": 0.5958193467288128,
"learning_rate": 7.866666666666667e-06,
"loss": 0.3619,
"step": 60
},
{
"epoch": 0.24432576769025366,
"grad_norm": 0.5778869298012563,
"learning_rate": 8.000000000000001e-06,
"loss": 0.3501,
"step": 61
},
{
"epoch": 0.24833110814419226,
"grad_norm": 0.5809265935247063,
"learning_rate": 8.133333333333334e-06,
"loss": 0.3593,
"step": 62
},
{
"epoch": 0.2523364485981308,
"grad_norm": 0.5301298964648872,
"learning_rate": 8.266666666666667e-06,
"loss": 0.3262,
"step": 63
},
{
"epoch": 0.25634178905206945,
"grad_norm": 0.6073029142771318,
"learning_rate": 8.400000000000001e-06,
"loss": 0.3458,
"step": 64
},
{
"epoch": 0.260347129506008,
"grad_norm": 0.5862505044336555,
"learning_rate": 8.533333333333335e-06,
"loss": 0.3444,
"step": 65
},
{
"epoch": 0.2643524699599466,
"grad_norm": 0.5545311112728927,
"learning_rate": 8.666666666666668e-06,
"loss": 0.379,
"step": 66
},
{
"epoch": 0.2683578104138852,
"grad_norm": 0.5979912535625811,
"learning_rate": 8.8e-06,
"loss": 0.3582,
"step": 67
},
{
"epoch": 0.27236315086782376,
"grad_norm": 0.5727967554965969,
"learning_rate": 8.933333333333333e-06,
"loss": 0.3428,
"step": 68
},
{
"epoch": 0.27636849132176233,
"grad_norm": 0.6017340630111007,
"learning_rate": 9.066666666666667e-06,
"loss": 0.3587,
"step": 69
},
{
"epoch": 0.2803738317757009,
"grad_norm": 0.5498581806098397,
"learning_rate": 9.200000000000002e-06,
"loss": 0.3567,
"step": 70
},
{
"epoch": 0.28437917222963954,
"grad_norm": 0.5526640416700183,
"learning_rate": 9.333333333333334e-06,
"loss": 0.3337,
"step": 71
},
{
"epoch": 0.2883845126835781,
"grad_norm": 0.5492315682122837,
"learning_rate": 9.466666666666667e-06,
"loss": 0.3486,
"step": 72
},
{
"epoch": 0.2923898531375167,
"grad_norm": 0.5494699596828775,
"learning_rate": 9.600000000000001e-06,
"loss": 0.3374,
"step": 73
},
{
"epoch": 0.2963951935914553,
"grad_norm": 0.630131268447689,
"learning_rate": 9.733333333333334e-06,
"loss": 0.3568,
"step": 74
},
{
"epoch": 0.30040053404539385,
"grad_norm": 0.6336383497338373,
"learning_rate": 9.866666666666668e-06,
"loss": 0.3616,
"step": 75
},
{
"epoch": 0.30440587449933243,
"grad_norm": 0.5624776217319135,
"learning_rate": 1e-05,
"loss": 0.3505,
"step": 76
},
{
"epoch": 0.308411214953271,
"grad_norm": 0.5899336003315098,
"learning_rate": 9.999945361292553e-06,
"loss": 0.3576,
"step": 77
},
{
"epoch": 0.31241655540720964,
"grad_norm": 0.6756783302903452,
"learning_rate": 9.999781446364366e-06,
"loss": 0.3519,
"step": 78
},
{
"epoch": 0.3164218958611482,
"grad_norm": 0.5644425121126243,
"learning_rate": 9.999508258797876e-06,
"loss": 0.3164,
"step": 79
},
{
"epoch": 0.3204272363150868,
"grad_norm": 0.587830154954018,
"learning_rate": 9.999125804563732e-06,
"loss": 0.3268,
"step": 80
},
{
"epoch": 0.32443257676902537,
"grad_norm": 0.5884963318825209,
"learning_rate": 9.998634092020659e-06,
"loss": 0.345,
"step": 81
},
{
"epoch": 0.32843791722296395,
"grad_norm": 0.6887604600916913,
"learning_rate": 9.998033131915266e-06,
"loss": 0.3803,
"step": 82
},
{
"epoch": 0.3324432576769025,
"grad_norm": 0.5384174093778301,
"learning_rate": 9.997322937381829e-06,
"loss": 0.3344,
"step": 83
},
{
"epoch": 0.3364485981308411,
"grad_norm": 0.5281542891218585,
"learning_rate": 9.996503523941994e-06,
"loss": 0.32,
"step": 84
},
{
"epoch": 0.3404539385847797,
"grad_norm": 0.5164890027100179,
"learning_rate": 9.995574909504434e-06,
"loss": 0.3204,
"step": 85
},
{
"epoch": 0.3444592790387183,
"grad_norm": 0.6252952243404047,
"learning_rate": 9.994537114364471e-06,
"loss": 0.3335,
"step": 86
},
{
"epoch": 0.3484646194926569,
"grad_norm": 0.5558865473599024,
"learning_rate": 9.993390161203615e-06,
"loss": 0.3311,
"step": 87
},
{
"epoch": 0.35246995994659547,
"grad_norm": 0.5761999832501623,
"learning_rate": 9.992134075089085e-06,
"loss": 0.3429,
"step": 88
},
{
"epoch": 0.35647530040053405,
"grad_norm": 0.5374104324302127,
"learning_rate": 9.990768883473243e-06,
"loss": 0.3302,
"step": 89
},
{
"epoch": 0.3604806408544726,
"grad_norm": 0.5310051502544871,
"learning_rate": 9.989294616193018e-06,
"loss": 0.345,
"step": 90
},
{
"epoch": 0.3644859813084112,
"grad_norm": 0.6006990772867254,
"learning_rate": 9.987711305469232e-06,
"loss": 0.3351,
"step": 91
},
{
"epoch": 0.3684913217623498,
"grad_norm": 0.559674099597398,
"learning_rate": 9.986018985905901e-06,
"loss": 0.3423,
"step": 92
},
{
"epoch": 0.3724966622162884,
"grad_norm": 0.5126707083736739,
"learning_rate": 9.984217694489493e-06,
"loss": 0.344,
"step": 93
},
{
"epoch": 0.376502002670227,
"grad_norm": 0.6357067005494667,
"learning_rate": 9.982307470588097e-06,
"loss": 0.3356,
"step": 94
},
{
"epoch": 0.38050734312416556,
"grad_norm": 0.5258363366242368,
"learning_rate": 9.98028835595058e-06,
"loss": 0.3405,
"step": 95
},
{
"epoch": 0.38451268357810414,
"grad_norm": 0.6022837706168479,
"learning_rate": 9.978160394705669e-06,
"loss": 0.3451,
"step": 96
},
{
"epoch": 0.3885180240320427,
"grad_norm": 0.5617885913949726,
"learning_rate": 9.975923633360985e-06,
"loss": 0.3141,
"step": 97
},
{
"epoch": 0.3925233644859813,
"grad_norm": 0.6322761944732146,
"learning_rate": 9.973578120802025e-06,
"loss": 0.3225,
"step": 98
},
{
"epoch": 0.3965287049399199,
"grad_norm": 0.5513939521450553,
"learning_rate": 9.971123908291103e-06,
"loss": 0.3269,
"step": 99
},
{
"epoch": 0.40053404539385845,
"grad_norm": 0.5692799860975164,
"learning_rate": 9.968561049466214e-06,
"loss": 0.337,
"step": 100
},
{
"epoch": 0.4045393858477971,
"grad_norm": 0.552465429973677,
"learning_rate": 9.965889600339877e-06,
"loss": 0.3256,
"step": 101
},
{
"epoch": 0.40854472630173566,
"grad_norm": 0.5542585997979107,
"learning_rate": 9.963109619297905e-06,
"loss": 0.3147,
"step": 102
},
{
"epoch": 0.41255006675567424,
"grad_norm": 0.5724996614177005,
"learning_rate": 9.960221167098124e-06,
"loss": 0.3034,
"step": 103
},
{
"epoch": 0.4165554072096128,
"grad_norm": 0.5546269037589538,
"learning_rate": 9.957224306869053e-06,
"loss": 0.3283,
"step": 104
},
{
"epoch": 0.4205607476635514,
"grad_norm": 0.5445864929651966,
"learning_rate": 9.95411910410852e-06,
"loss": 0.3161,
"step": 105
},
{
"epoch": 0.42456608811748997,
"grad_norm": 0.5679124498352474,
"learning_rate": 9.950905626682229e-06,
"loss": 0.3205,
"step": 106
},
{
"epoch": 0.42857142857142855,
"grad_norm": 0.5746860342884514,
"learning_rate": 9.947583944822284e-06,
"loss": 0.3087,
"step": 107
},
{
"epoch": 0.4325767690253672,
"grad_norm": 0.6293503344651058,
"learning_rate": 9.944154131125643e-06,
"loss": 0.3481,
"step": 108
},
{
"epoch": 0.43658210947930576,
"grad_norm": 0.5733682700314644,
"learning_rate": 9.940616260552545e-06,
"loss": 0.3292,
"step": 109
},
{
"epoch": 0.44058744993324434,
"grad_norm": 0.5593414000264296,
"learning_rate": 9.936970410424857e-06,
"loss": 0.3282,
"step": 110
},
{
"epoch": 0.4445927903871829,
"grad_norm": 0.5928528566284356,
"learning_rate": 9.933216660424396e-06,
"loss": 0.3305,
"step": 111
},
{
"epoch": 0.4485981308411215,
"grad_norm": 0.5958761955618564,
"learning_rate": 9.92935509259118e-06,
"loss": 0.3372,
"step": 112
},
{
"epoch": 0.45260347129506007,
"grad_norm": 0.5974132983016888,
"learning_rate": 9.92538579132164e-06,
"loss": 0.3258,
"step": 113
},
{
"epoch": 0.45660881174899864,
"grad_norm": 0.576878904820484,
"learning_rate": 9.921308843366773e-06,
"loss": 0.3223,
"step": 114
},
{
"epoch": 0.4606141522029373,
"grad_norm": 0.5705071294697854,
"learning_rate": 9.917124337830242e-06,
"loss": 0.3078,
"step": 115
},
{
"epoch": 0.46461949265687585,
"grad_norm": 0.630779206880613,
"learning_rate": 9.912832366166443e-06,
"loss": 0.3405,
"step": 116
},
{
"epoch": 0.46862483311081443,
"grad_norm": 0.612175246698219,
"learning_rate": 9.908433022178484e-06,
"loss": 0.3247,
"step": 117
},
{
"epoch": 0.472630173564753,
"grad_norm": 0.5888384503018512,
"learning_rate": 9.903926402016153e-06,
"loss": 0.3237,
"step": 118
},
{
"epoch": 0.4766355140186916,
"grad_norm": 0.584706307851654,
"learning_rate": 9.899312604173814e-06,
"loss": 0.3289,
"step": 119
},
{
"epoch": 0.48064085447263016,
"grad_norm": 0.5850586724735871,
"learning_rate": 9.894591729488243e-06,
"loss": 0.3103,
"step": 120
},
{
"epoch": 0.48464619492656874,
"grad_norm": 0.5676874241983956,
"learning_rate": 9.889763881136439e-06,
"loss": 0.3416,
"step": 121
},
{
"epoch": 0.4886515353805073,
"grad_norm": 0.5507142647058878,
"learning_rate": 9.884829164633359e-06,
"loss": 0.332,
"step": 122
},
{
"epoch": 0.49265687583444595,
"grad_norm": 0.5883422169944877,
"learning_rate": 9.879787687829616e-06,
"loss": 0.341,
"step": 123
},
{
"epoch": 0.49666221628838453,
"grad_norm": 0.6052974016126247,
"learning_rate": 9.874639560909118e-06,
"loss": 0.3145,
"step": 124
},
{
"epoch": 0.5006675567423231,
"grad_norm": 0.5623497695712161,
"learning_rate": 9.869384896386669e-06,
"loss": 0.324,
"step": 125
},
{
"epoch": 0.5046728971962616,
"grad_norm": 0.637548494608964,
"learning_rate": 9.864023809105497e-06,
"loss": 0.3512,
"step": 126
},
{
"epoch": 0.5086782376502003,
"grad_norm": 0.5948450995539613,
"learning_rate": 9.858556416234755e-06,
"loss": 0.3323,
"step": 127
},
{
"epoch": 0.5126835781041389,
"grad_norm": 0.6071663583544622,
"learning_rate": 9.852982837266955e-06,
"loss": 0.3106,
"step": 128
},
{
"epoch": 0.5166889185580774,
"grad_norm": 0.5702598727693834,
"learning_rate": 9.847303194015358e-06,
"loss": 0.2964,
"step": 129
},
{
"epoch": 0.520694259012016,
"grad_norm": 0.5427287917310376,
"learning_rate": 9.841517610611309e-06,
"loss": 0.3146,
"step": 130
},
{
"epoch": 0.5246995994659546,
"grad_norm": 0.5757835163942887,
"learning_rate": 9.835626213501526e-06,
"loss": 0.2962,
"step": 131
},
{
"epoch": 0.5287049399198932,
"grad_norm": 0.5698018296606896,
"learning_rate": 9.829629131445342e-06,
"loss": 0.3193,
"step": 132
},
{
"epoch": 0.5327102803738317,
"grad_norm": 0.5410707033561443,
"learning_rate": 9.82352649551188e-06,
"loss": 0.3106,
"step": 133
},
{
"epoch": 0.5367156208277704,
"grad_norm": 0.5502177437943316,
"learning_rate": 9.817318439077197e-06,
"loss": 0.3085,
"step": 134
},
{
"epoch": 0.540720961281709,
"grad_norm": 0.5582555787315889,
"learning_rate": 9.811005097821362e-06,
"loss": 0.3151,
"step": 135
},
{
"epoch": 0.5447263017356475,
"grad_norm": 0.530130894795682,
"learning_rate": 9.804586609725499e-06,
"loss": 0.3144,
"step": 136
},
{
"epoch": 0.5487316421895861,
"grad_norm": 0.5740015673727541,
"learning_rate": 9.798063115068766e-06,
"loss": 0.3306,
"step": 137
},
{
"epoch": 0.5527369826435247,
"grad_norm": 0.5191944650205522,
"learning_rate": 9.791434756425288e-06,
"loss": 0.3084,
"step": 138
},
{
"epoch": 0.5567423230974633,
"grad_norm": 0.5972892802378095,
"learning_rate": 9.784701678661045e-06,
"loss": 0.3163,
"step": 139
},
{
"epoch": 0.5607476635514018,
"grad_norm": 0.5504117854519875,
"learning_rate": 9.777864028930705e-06,
"loss": 0.3167,
"step": 140
},
{
"epoch": 0.5647530040053405,
"grad_norm": 0.537223650006093,
"learning_rate": 9.770921956674402e-06,
"loss": 0.3006,
"step": 141
},
{
"epoch": 0.5687583444592791,
"grad_norm": 0.5781957843358095,
"learning_rate": 9.763875613614482e-06,
"loss": 0.3123,
"step": 142
},
{
"epoch": 0.5727636849132176,
"grad_norm": 0.5957683062334633,
"learning_rate": 9.756725153752173e-06,
"loss": 0.3154,
"step": 143
},
{
"epoch": 0.5767690253671562,
"grad_norm": 0.5368008525982312,
"learning_rate": 9.749470733364231e-06,
"loss": 0.3108,
"step": 144
},
{
"epoch": 0.5807743658210948,
"grad_norm": 0.5390147399817238,
"learning_rate": 9.742112510999516e-06,
"loss": 0.3267,
"step": 145
},
{
"epoch": 0.5847797062750334,
"grad_norm": 0.538592138527249,
"learning_rate": 9.73465064747553e-06,
"loss": 0.3034,
"step": 146
},
{
"epoch": 0.5887850467289719,
"grad_norm": 0.5909802194217371,
"learning_rate": 9.727085305874906e-06,
"loss": 0.3273,
"step": 147
},
{
"epoch": 0.5927903871829105,
"grad_norm": 0.5625936142604115,
"learning_rate": 9.719416651541839e-06,
"loss": 0.3229,
"step": 148
},
{
"epoch": 0.5967957276368492,
"grad_norm": 0.5606718697695303,
"learning_rate": 9.711644852078472e-06,
"loss": 0.3107,
"step": 149
},
{
"epoch": 0.6008010680907877,
"grad_norm": 0.6078858789603624,
"learning_rate": 9.703770077341236e-06,
"loss": 0.3229,
"step": 150
},
{
"epoch": 0.6048064085447263,
"grad_norm": 0.5549984708342697,
"learning_rate": 9.69579249943714e-06,
"loss": 0.3129,
"step": 151
},
{
"epoch": 0.6088117489986649,
"grad_norm": 0.507347145487873,
"learning_rate": 9.687712292719997e-06,
"loss": 0.3002,
"step": 152
},
{
"epoch": 0.6128170894526035,
"grad_norm": 0.6833991975396279,
"learning_rate": 9.67952963378663e-06,
"loss": 0.3087,
"step": 153
},
{
"epoch": 0.616822429906542,
"grad_norm": 0.5915486951914966,
"learning_rate": 9.671244701472999e-06,
"loss": 0.3393,
"step": 154
},
{
"epoch": 0.6208277703604806,
"grad_norm": 0.5643268444514835,
"learning_rate": 9.662857676850306e-06,
"loss": 0.2944,
"step": 155
},
{
"epoch": 0.6248331108144193,
"grad_norm": 0.6409184914823735,
"learning_rate": 9.654368743221022e-06,
"loss": 0.3247,
"step": 156
},
{
"epoch": 0.6288384512683578,
"grad_norm": 0.625767486890822,
"learning_rate": 9.645778086114892e-06,
"loss": 0.3134,
"step": 157
},
{
"epoch": 0.6328437917222964,
"grad_norm": 0.5700497840235004,
"learning_rate": 9.637085893284875e-06,
"loss": 0.3023,
"step": 158
},
{
"epoch": 0.636849132176235,
"grad_norm": 0.5790034198291902,
"learning_rate": 9.628292354703046e-06,
"loss": 0.2933,
"step": 159
},
{
"epoch": 0.6408544726301736,
"grad_norm": 0.5713100840362291,
"learning_rate": 9.619397662556434e-06,
"loss": 0.3042,
"step": 160
},
{
"epoch": 0.6448598130841121,
"grad_norm": 0.5741995306695465,
"learning_rate": 9.610402011242837e-06,
"loss": 0.3196,
"step": 161
},
{
"epoch": 0.6488651535380507,
"grad_norm": 0.5674889795972151,
"learning_rate": 9.601305597366553e-06,
"loss": 0.3071,
"step": 162
},
{
"epoch": 0.6528704939919893,
"grad_norm": 0.5369636049566915,
"learning_rate": 9.592108619734107e-06,
"loss": 0.3247,
"step": 163
},
{
"epoch": 0.6568758344459279,
"grad_norm": 0.5443809471875736,
"learning_rate": 9.582811279349881e-06,
"loss": 0.3072,
"step": 164
},
{
"epoch": 0.6608811748998665,
"grad_norm": 0.5953124014685344,
"learning_rate": 9.573413779411745e-06,
"loss": 0.3085,
"step": 165
},
{
"epoch": 0.664886515353805,
"grad_norm": 0.5401734564217464,
"learning_rate": 9.563916325306595e-06,
"loss": 0.29,
"step": 166
},
{
"epoch": 0.6688918558077437,
"grad_norm": 0.5444349651712469,
"learning_rate": 9.55431912460588e-06,
"loss": 0.3054,
"step": 167
},
{
"epoch": 0.6728971962616822,
"grad_norm": 0.510267722435052,
"learning_rate": 9.544622387061055e-06,
"loss": 0.28,
"step": 168
},
{
"epoch": 0.6769025367156208,
"grad_norm": 0.5184264543864224,
"learning_rate": 9.534826324599002e-06,
"loss": 0.2955,
"step": 169
},
{
"epoch": 0.6809078771695594,
"grad_norm": 0.5637636626391551,
"learning_rate": 9.5249311513174e-06,
"loss": 0.2792,
"step": 170
},
{
"epoch": 0.684913217623498,
"grad_norm": 0.5428313722322577,
"learning_rate": 9.514937083480037e-06,
"loss": 0.2945,
"step": 171
},
{
"epoch": 0.6889185580774366,
"grad_norm": 0.5561412606219924,
"learning_rate": 9.504844339512096e-06,
"loss": 0.315,
"step": 172
},
{
"epoch": 0.6929238985313751,
"grad_norm": 0.5081631254602269,
"learning_rate": 9.494653139995368e-06,
"loss": 0.3066,
"step": 173
},
{
"epoch": 0.6969292389853138,
"grad_norm": 0.5856758014884262,
"learning_rate": 9.484363707663443e-06,
"loss": 0.2801,
"step": 174
},
{
"epoch": 0.7009345794392523,
"grad_norm": 0.5414939052665023,
"learning_rate": 9.473976267396831e-06,
"loss": 0.2894,
"step": 175
},
{
"epoch": 0.7049399198931909,
"grad_norm": 0.5188788468344311,
"learning_rate": 9.463491046218058e-06,
"loss": 0.2917,
"step": 176
},
{
"epoch": 0.7089452603471295,
"grad_norm": 0.6208165205427856,
"learning_rate": 9.452908273286699e-06,
"loss": 0.3124,
"step": 177
},
{
"epoch": 0.7129506008010681,
"grad_norm": 0.4892884392964166,
"learning_rate": 9.442228179894362e-06,
"loss": 0.2937,
"step": 178
},
{
"epoch": 0.7169559412550067,
"grad_norm": 0.5126422005865922,
"learning_rate": 9.431450999459653e-06,
"loss": 0.2902,
"step": 179
},
{
"epoch": 0.7209612817089452,
"grad_norm": 0.5578472838688182,
"learning_rate": 9.420576967523049e-06,
"loss": 0.2886,
"step": 180
},
{
"epoch": 0.7249666221628839,
"grad_norm": 0.5457635354712708,
"learning_rate": 9.409606321741776e-06,
"loss": 0.299,
"step": 181
},
{
"epoch": 0.7289719626168224,
"grad_norm": 0.5665035398169768,
"learning_rate": 9.398539301884592e-06,
"loss": 0.2975,
"step": 182
},
{
"epoch": 0.732977303070761,
"grad_norm": 0.5286590948141091,
"learning_rate": 9.387376149826564e-06,
"loss": 0.2767,
"step": 183
},
{
"epoch": 0.7369826435246996,
"grad_norm": 0.5743317468381327,
"learning_rate": 9.376117109543769e-06,
"loss": 0.2909,
"step": 184
},
{
"epoch": 0.7409879839786382,
"grad_norm": 0.6431953313269012,
"learning_rate": 9.364762427107971e-06,
"loss": 0.3004,
"step": 185
},
{
"epoch": 0.7449933244325768,
"grad_norm": 0.6117784289038739,
"learning_rate": 9.353312350681242e-06,
"loss": 0.3062,
"step": 186
},
{
"epoch": 0.7489986648865153,
"grad_norm": 0.5466166236913528,
"learning_rate": 9.341767130510529e-06,
"loss": 0.3047,
"step": 187
},
{
"epoch": 0.753004005340454,
"grad_norm": 0.5672388678846847,
"learning_rate": 9.330127018922195e-06,
"loss": 0.3099,
"step": 188
},
{
"epoch": 0.7570093457943925,
"grad_norm": 0.5854324070547063,
"learning_rate": 9.318392270316501e-06,
"loss": 0.3097,
"step": 189
},
{
"epoch": 0.7610146862483311,
"grad_norm": 0.5582358269319914,
"learning_rate": 9.306563141162046e-06,
"loss": 0.3061,
"step": 190
},
{
"epoch": 0.7650200267022697,
"grad_norm": 0.5807552655282949,
"learning_rate": 9.29463988999016e-06,
"loss": 0.3004,
"step": 191
},
{
"epoch": 0.7690253671562083,
"grad_norm": 0.5445709450895333,
"learning_rate": 9.282622777389258e-06,
"loss": 0.2864,
"step": 192
},
{
"epoch": 0.7730307076101469,
"grad_norm": 0.6479747482171502,
"learning_rate": 9.270512065999139e-06,
"loss": 0.2979,
"step": 193
},
{
"epoch": 0.7770360480640854,
"grad_norm": 0.5604311181657256,
"learning_rate": 9.258308020505247e-06,
"loss": 0.2997,
"step": 194
},
{
"epoch": 0.7810413885180241,
"grad_norm": 0.5829812198586952,
"learning_rate": 9.246010907632894e-06,
"loss": 0.3233,
"step": 195
},
{
"epoch": 0.7850467289719626,
"grad_norm": 0.580492515245855,
"learning_rate": 9.233620996141421e-06,
"loss": 0.299,
"step": 196
},
{
"epoch": 0.7890520694259012,
"grad_norm": 0.592467400441266,
"learning_rate": 9.221138556818327e-06,
"loss": 0.2967,
"step": 197
},
{
"epoch": 0.7930574098798397,
"grad_norm": 0.5997985716991547,
"learning_rate": 9.20856386247335e-06,
"loss": 0.3123,
"step": 198
},
{
"epoch": 0.7970627503337784,
"grad_norm": 0.5397407737667249,
"learning_rate": 9.195897187932513e-06,
"loss": 0.2953,
"step": 199
},
{
"epoch": 0.8010680907877169,
"grad_norm": 0.5362871237537865,
"learning_rate": 9.1831388100321e-06,
"loss": 0.283,
"step": 200
},
{
"epoch": 0.8050734312416555,
"grad_norm": 0.5737882959212091,
"learning_rate": 9.170289007612625e-06,
"loss": 0.2922,
"step": 201
},
{
"epoch": 0.8090787716955942,
"grad_norm": 0.5932417559868806,
"learning_rate": 9.157348061512728e-06,
"loss": 0.2955,
"step": 202
},
{
"epoch": 0.8130841121495327,
"grad_norm": 0.5072437104528961,
"learning_rate": 9.144316254563032e-06,
"loss": 0.2696,
"step": 203
},
{
"epoch": 0.8170894526034713,
"grad_norm": 0.557818245925382,
"learning_rate": 9.131193871579975e-06,
"loss": 0.2994,
"step": 204
},
{
"epoch": 0.8210947930574098,
"grad_norm": 0.5973919395531655,
"learning_rate": 9.117981199359575e-06,
"loss": 0.3008,
"step": 205
},
{
"epoch": 0.8251001335113485,
"grad_norm": 0.574306055152991,
"learning_rate": 9.104678526671162e-06,
"loss": 0.3086,
"step": 206
},
{
"epoch": 0.829105473965287,
"grad_norm": 0.6115940443338315,
"learning_rate": 9.091286144251077e-06,
"loss": 0.2893,
"step": 207
},
{
"epoch": 0.8331108144192256,
"grad_norm": 0.4939307262823331,
"learning_rate": 9.077804344796302e-06,
"loss": 0.2758,
"step": 208
},
{
"epoch": 0.8371161548731643,
"grad_norm": 0.5523854629181829,
"learning_rate": 9.064233422958078e-06,
"loss": 0.2761,
"step": 209
},
{
"epoch": 0.8411214953271028,
"grad_norm": 0.5641559704847691,
"learning_rate": 9.050573675335453e-06,
"loss": 0.2702,
"step": 210
},
{
"epoch": 0.8451268357810414,
"grad_norm": 0.538525400332704,
"learning_rate": 9.036825400468814e-06,
"loss": 0.2625,
"step": 211
},
{
"epoch": 0.8491321762349799,
"grad_norm": 0.5571857402527918,
"learning_rate": 9.022988898833342e-06,
"loss": 0.2812,
"step": 212
},
{
"epoch": 0.8531375166889186,
"grad_norm": 0.5875899405650873,
"learning_rate": 9.009064472832468e-06,
"loss": 0.3085,
"step": 213
},
{
"epoch": 0.8571428571428571,
"grad_norm": 0.5955430126526817,
"learning_rate": 8.995052426791247e-06,
"loss": 0.2921,
"step": 214
},
{
"epoch": 0.8611481975967957,
"grad_norm": 0.5826373250067038,
"learning_rate": 8.980953066949708e-06,
"loss": 0.2912,
"step": 215
},
{
"epoch": 0.8651535380507344,
"grad_norm": 0.5537935549036811,
"learning_rate": 8.966766701456177e-06,
"loss": 0.2809,
"step": 216
},
{
"epoch": 0.8691588785046729,
"grad_norm": 0.5838980864534432,
"learning_rate": 8.952493640360518e-06,
"loss": 0.2909,
"step": 217
},
{
"epoch": 0.8731642189586115,
"grad_norm": 0.5875405818886061,
"learning_rate": 8.938134195607378e-06,
"loss": 0.2952,
"step": 218
},
{
"epoch": 0.87716955941255,
"grad_norm": 0.6226962389915714,
"learning_rate": 8.923688681029356e-06,
"loss": 0.325,
"step": 219
},
{
"epoch": 0.8811748998664887,
"grad_norm": 0.5984817454130974,
"learning_rate": 8.90915741234015e-06,
"loss": 0.3124,
"step": 220
},
{
"epoch": 0.8851802403204272,
"grad_norm": 0.5487459145803628,
"learning_rate": 8.894540707127655e-06,
"loss": 0.2926,
"step": 221
},
{
"epoch": 0.8891855807743658,
"grad_norm": 0.5437039065539668,
"learning_rate": 8.879838884847025e-06,
"loss": 0.2769,
"step": 222
},
{
"epoch": 0.8931909212283045,
"grad_norm": 0.5361919135525014,
"learning_rate": 8.865052266813686e-06,
"loss": 0.2565,
"step": 223
},
{
"epoch": 0.897196261682243,
"grad_norm": 0.6022317598018883,
"learning_rate": 8.850181176196316e-06,
"loss": 0.2904,
"step": 224
},
{
"epoch": 0.9012016021361816,
"grad_norm": 0.5777383647207497,
"learning_rate": 8.835225938009781e-06,
"loss": 0.2942,
"step": 225
},
{
"epoch": 0.9052069425901201,
"grad_norm": 0.5517455644071223,
"learning_rate": 8.820186879108038e-06,
"loss": 0.2827,
"step": 226
},
{
"epoch": 0.9092122830440588,
"grad_norm": 0.5746060945697256,
"learning_rate": 8.80506432817698e-06,
"loss": 0.2901,
"step": 227
},
{
"epoch": 0.9132176234979973,
"grad_norm": 0.5678185790220811,
"learning_rate": 8.789858615727266e-06,
"loss": 0.277,
"step": 228
},
{
"epoch": 0.9172229639519359,
"grad_norm": 0.5818515518798564,
"learning_rate": 8.77457007408708e-06,
"loss": 0.2805,
"step": 229
},
{
"epoch": 0.9212283044058746,
"grad_norm": 0.5828788812029032,
"learning_rate": 8.759199037394888e-06,
"loss": 0.3054,
"step": 230
},
{
"epoch": 0.9252336448598131,
"grad_norm": 0.5578305381732657,
"learning_rate": 8.743745841592118e-06,
"loss": 0.279,
"step": 231
},
{
"epoch": 0.9292389853137517,
"grad_norm": 0.5710661098406483,
"learning_rate": 8.728210824415829e-06,
"loss": 0.2734,
"step": 232
},
{
"epoch": 0.9332443257676902,
"grad_norm": 0.5767939333864601,
"learning_rate": 8.712594325391324e-06,
"loss": 0.2699,
"step": 233
},
{
"epoch": 0.9372496662216289,
"grad_norm": 0.574270861342953,
"learning_rate": 8.69689668582473e-06,
"loss": 0.2766,
"step": 234
},
{
"epoch": 0.9412550066755674,
"grad_norm": 0.5757498082823792,
"learning_rate": 8.681118248795548e-06,
"loss": 0.2818,
"step": 235
},
{
"epoch": 0.945260347129506,
"grad_norm": 0.6077724733956369,
"learning_rate": 8.665259359149132e-06,
"loss": 0.2969,
"step": 236
},
{
"epoch": 0.9492656875834445,
"grad_norm": 0.549944516647202,
"learning_rate": 8.649320363489178e-06,
"loss": 0.2609,
"step": 237
},
{
"epoch": 0.9532710280373832,
"grad_norm": 0.5456975844935816,
"learning_rate": 8.633301610170136e-06,
"loss": 0.287,
"step": 238
},
{
"epoch": 0.9572763684913218,
"grad_norm": 0.5280058829694398,
"learning_rate": 8.617203449289593e-06,
"loss": 0.2644,
"step": 239
},
{
"epoch": 0.9612817089452603,
"grad_norm": 0.5920277658059444,
"learning_rate": 8.601026232680634e-06,
"loss": 0.291,
"step": 240
},
{
"epoch": 0.965287049399199,
"grad_norm": 0.6029720289705192,
"learning_rate": 8.584770313904138e-06,
"loss": 0.2883,
"step": 241
},
{
"epoch": 0.9692923898531375,
"grad_norm": 0.5726602829217651,
"learning_rate": 8.568436048241062e-06,
"loss": 0.265,
"step": 242
},
{
"epoch": 0.9732977303070761,
"grad_norm": 0.5825245425360942,
"learning_rate": 8.552023792684672e-06,
"loss": 0.2868,
"step": 243
},
{
"epoch": 0.9773030707610146,
"grad_norm": 0.6673081291871541,
"learning_rate": 8.535533905932739e-06,
"loss": 0.3054,
"step": 244
},
{
"epoch": 0.9813084112149533,
"grad_norm": 0.6031779392976561,
"learning_rate": 8.518966748379702e-06,
"loss": 0.2851,
"step": 245
},
{
"epoch": 0.9853137516688919,
"grad_norm": 0.6325037575700174,
"learning_rate": 8.502322682108792e-06,
"loss": 0.269,
"step": 246
},
{
"epoch": 0.9893190921228304,
"grad_norm": 0.630286272644598,
"learning_rate": 8.485602070884118e-06,
"loss": 0.2835,
"step": 247
},
{
"epoch": 0.9933244325767691,
"grad_norm": 0.5550792941097276,
"learning_rate": 8.46880528014271e-06,
"loss": 0.2716,
"step": 248
},
{
"epoch": 0.9973297730307076,
"grad_norm": 0.5955656163999381,
"learning_rate": 8.451932676986543e-06,
"loss": 0.2919,
"step": 249
},
{
"epoch": 1.0,
"grad_norm": 0.5955656163999381,
"learning_rate": 8.43498463017451e-06,
"loss": 0.2708,
"step": 250
},
{
"epoch": 1.0040053404539386,
"grad_norm": 0.762897141846903,
"learning_rate": 8.417961510114357e-06,
"loss": 0.2589,
"step": 251
},
{
"epoch": 1.0080106809078773,
"grad_norm": 0.5667256244239827,
"learning_rate": 8.400863688854598e-06,
"loss": 0.2347,
"step": 252
},
{
"epoch": 1.0120160213618157,
"grad_norm": 0.5182073558824543,
"learning_rate": 8.383691540076372e-06,
"loss": 0.2473,
"step": 253
},
{
"epoch": 1.0160213618157543,
"grad_norm": 0.5411427558297038,
"learning_rate": 8.366445439085286e-06,
"loss": 0.239,
"step": 254
},
{
"epoch": 1.020026702269693,
"grad_norm": 0.5411386326348857,
"learning_rate": 8.349125762803204e-06,
"loss": 0.255,
"step": 255
},
{
"epoch": 1.0240320427236316,
"grad_norm": 0.5768680583207191,
"learning_rate": 8.331732889760021e-06,
"loss": 0.2304,
"step": 256
},
{
"epoch": 1.02803738317757,
"grad_norm": 0.5663192339105902,
"learning_rate": 8.314267200085373e-06,
"loss": 0.2364,
"step": 257
},
{
"epoch": 1.0320427236315086,
"grad_norm": 0.5782681146396127,
"learning_rate": 8.296729075500345e-06,
"loss": 0.2554,
"step": 258
},
{
"epoch": 1.0360480640854473,
"grad_norm": 0.5924117511990626,
"learning_rate": 8.279118899309121e-06,
"loss": 0.2381,
"step": 259
},
{
"epoch": 1.0400534045393859,
"grad_norm": 0.5849704627036197,
"learning_rate": 8.261437056390607e-06,
"loss": 0.2592,
"step": 260
},
{
"epoch": 1.0440587449933245,
"grad_norm": 0.5462578431369289,
"learning_rate": 8.243683933190019e-06,
"loss": 0.2481,
"step": 261
},
{
"epoch": 1.048064085447263,
"grad_norm": 0.5687469203522241,
"learning_rate": 8.22585991771044e-06,
"loss": 0.2406,
"step": 262
},
{
"epoch": 1.0520694259012016,
"grad_norm": 0.5764520043363477,
"learning_rate": 8.207965399504334e-06,
"loss": 0.2435,
"step": 263
},
{
"epoch": 1.0560747663551402,
"grad_norm": 0.6130154606997985,
"learning_rate": 8.190000769665044e-06,
"loss": 0.2494,
"step": 264
},
{
"epoch": 1.0600801068090788,
"grad_norm": 0.5421995984684055,
"learning_rate": 8.171966420818227e-06,
"loss": 0.2435,
"step": 265
},
{
"epoch": 1.0640854472630175,
"grad_norm": 0.5828640036968468,
"learning_rate": 8.153862747113293e-06,
"loss": 0.2353,
"step": 266
},
{
"epoch": 1.0680907877169559,
"grad_norm": 0.5148839059504708,
"learning_rate": 8.135690144214767e-06,
"loss": 0.2318,
"step": 267
},
{
"epoch": 1.0720961281708945,
"grad_norm": 0.5486187246706559,
"learning_rate": 8.117449009293668e-06,
"loss": 0.2416,
"step": 268
},
{
"epoch": 1.0761014686248331,
"grad_norm": 0.564502169912709,
"learning_rate": 8.099139741018809e-06,
"loss": 0.2364,
"step": 269
},
{
"epoch": 1.0801068090787718,
"grad_norm": 0.6097314278041118,
"learning_rate": 8.08076273954809e-06,
"loss": 0.2598,
"step": 270
},
{
"epoch": 1.0841121495327102,
"grad_norm": 0.6059107746858474,
"learning_rate": 8.062318406519751e-06,
"loss": 0.2507,
"step": 271
},
{
"epoch": 1.0881174899866488,
"grad_norm": 0.6241821796107588,
"learning_rate": 8.043807145043604e-06,
"loss": 0.2479,
"step": 272
},
{
"epoch": 1.0921228304405874,
"grad_norm": 0.5883002088770041,
"learning_rate": 8.025229359692206e-06,
"loss": 0.2504,
"step": 273
},
{
"epoch": 1.096128170894526,
"grad_norm": 0.5888253144437603,
"learning_rate": 8.00658545649203e-06,
"loss": 0.2346,
"step": 274
},
{
"epoch": 1.1001335113484647,
"grad_norm": 0.5409284658955128,
"learning_rate": 7.987875842914583e-06,
"loss": 0.2357,
"step": 275
},
{
"epoch": 1.1041388518024031,
"grad_norm": 0.5648850017659398,
"learning_rate": 7.969100927867508e-06,
"loss": 0.2479,
"step": 276
},
{
"epoch": 1.1081441922563418,
"grad_norm": 0.6139375755294754,
"learning_rate": 7.950261121685642e-06,
"loss": 0.2452,
"step": 277
},
{
"epoch": 1.1121495327102804,
"grad_norm": 0.6246425570636841,
"learning_rate": 7.931356836122046e-06,
"loss": 0.2404,
"step": 278
},
{
"epoch": 1.116154873164219,
"grad_norm": 0.5298624506016548,
"learning_rate": 7.912388484339012e-06,
"loss": 0.2318,
"step": 279
},
{
"epoch": 1.1201602136181577,
"grad_norm": 0.5727259445791012,
"learning_rate": 7.89335648089903e-06,
"loss": 0.2444,
"step": 280
},
{
"epoch": 1.124165554072096,
"grad_norm": 0.568496396477039,
"learning_rate": 7.874261241755726e-06,
"loss": 0.2361,
"step": 281
},
{
"epoch": 1.1281708945260347,
"grad_norm": 0.5698858845026502,
"learning_rate": 7.855103184244777e-06,
"loss": 0.2475,
"step": 282
},
{
"epoch": 1.1321762349799733,
"grad_norm": 0.6033437616235542,
"learning_rate": 7.835882727074779e-06,
"loss": 0.2483,
"step": 283
},
{
"epoch": 1.136181575433912,
"grad_norm": 0.6137682287341324,
"learning_rate": 7.81660029031811e-06,
"loss": 0.2485,
"step": 284
},
{
"epoch": 1.1401869158878504,
"grad_norm": 0.5389251730544439,
"learning_rate": 7.797256295401738e-06,
"loss": 0.2287,
"step": 285
},
{
"epoch": 1.144192256341789,
"grad_norm": 0.582366745214894,
"learning_rate": 7.777851165098012e-06,
"loss": 0.247,
"step": 286
},
{
"epoch": 1.1481975967957276,
"grad_norm": 0.5617439076162762,
"learning_rate": 7.75838532351543e-06,
"loss": 0.223,
"step": 287
},
{
"epoch": 1.1522029372496663,
"grad_norm": 0.5457463297035726,
"learning_rate": 7.738859196089358e-06,
"loss": 0.2481,
"step": 288
},
{
"epoch": 1.156208277703605,
"grad_norm": 0.6026062150338968,
"learning_rate": 7.719273209572745e-06,
"loss": 0.2602,
"step": 289
},
{
"epoch": 1.1602136181575433,
"grad_norm": 0.595754963300469,
"learning_rate": 7.699627792026784e-06,
"loss": 0.2388,
"step": 290
},
{
"epoch": 1.164218958611482,
"grad_norm": 0.5245236864467587,
"learning_rate": 7.679923372811564e-06,
"loss": 0.2353,
"step": 291
},
{
"epoch": 1.1682242990654206,
"grad_norm": 0.6296844005130243,
"learning_rate": 7.660160382576683e-06,
"loss": 0.2342,
"step": 292
},
{
"epoch": 1.1722296395193592,
"grad_norm": 0.5981183888141479,
"learning_rate": 7.64033925325184e-06,
"loss": 0.2416,
"step": 293
},
{
"epoch": 1.1762349799732976,
"grad_norm": 0.584776287003421,
"learning_rate": 7.620460418037388e-06,
"loss": 0.228,
"step": 294
},
{
"epoch": 1.1802403204272363,
"grad_norm": 0.5906417247227626,
"learning_rate": 7.600524311394873e-06,
"loss": 0.2323,
"step": 295
},
{
"epoch": 1.1842456608811749,
"grad_norm": 0.5834730538207583,
"learning_rate": 7.580531369037534e-06,
"loss": 0.2428,
"step": 296
},
{
"epoch": 1.1882510013351135,
"grad_norm": 0.592770510365303,
"learning_rate": 7.5604820279207816e-06,
"loss": 0.2311,
"step": 297
},
{
"epoch": 1.1922563417890522,
"grad_norm": 0.5932974539859142,
"learning_rate": 7.540376726232648e-06,
"loss": 0.2456,
"step": 298
},
{
"epoch": 1.1962616822429906,
"grad_norm": 0.599679016601175,
"learning_rate": 7.520215903384215e-06,
"loss": 0.2319,
"step": 299
},
{
"epoch": 1.2002670226969292,
"grad_norm": 0.543968089573662,
"learning_rate": 7.500000000000001e-06,
"loss": 0.2451,
"step": 300
},
{
"epoch": 1.2042723631508678,
"grad_norm": 0.5983969022372734,
"learning_rate": 7.4797294579083405e-06,
"loss": 0.2491,
"step": 301
},
{
"epoch": 1.2082777036048065,
"grad_norm": 0.5538744153921799,
"learning_rate": 7.459404720131717e-06,
"loss": 0.233,
"step": 302
},
{
"epoch": 1.2122830440587449,
"grad_norm": 0.5900495952853351,
"learning_rate": 7.439026230877096e-06,
"loss": 0.2297,
"step": 303
},
{
"epoch": 1.2162883845126835,
"grad_norm": 0.5465667743441658,
"learning_rate": 7.4185944355261996e-06,
"loss": 0.2528,
"step": 304
},
{
"epoch": 1.2202937249666221,
"grad_norm": 0.6269598978844211,
"learning_rate": 7.398109780625784e-06,
"loss": 0.2501,
"step": 305
},
{
"epoch": 1.2242990654205608,
"grad_norm": 0.5806451562215877,
"learning_rate": 7.3775727138778776e-06,
"loss": 0.2391,
"step": 306
},
{
"epoch": 1.2283044058744994,
"grad_norm": 0.6320385790504774,
"learning_rate": 7.3569836841299905e-06,
"loss": 0.2464,
"step": 307
},
{
"epoch": 1.232309746328438,
"grad_norm": 0.5737559605551135,
"learning_rate": 7.336343141365311e-06,
"loss": 0.2441,
"step": 308
},
{
"epoch": 1.2363150867823764,
"grad_norm": 0.5952972084149591,
"learning_rate": 7.315651536692873e-06,
"loss": 0.2564,
"step": 309
},
{
"epoch": 1.240320427236315,
"grad_norm": 0.7146063936502873,
"learning_rate": 7.294909322337689e-06,
"loss": 0.2313,
"step": 310
},
{
"epoch": 1.2443257676902537,
"grad_norm": 0.595320191460265,
"learning_rate": 7.274116951630873e-06,
"loss": 0.2368,
"step": 311
},
{
"epoch": 1.2483311081441923,
"grad_norm": 0.5744358155705048,
"learning_rate": 7.253274878999728e-06,
"loss": 0.2282,
"step": 312
},
{
"epoch": 1.2523364485981308,
"grad_norm": 0.6298803013176558,
"learning_rate": 7.232383559957815e-06,
"loss": 0.2418,
"step": 313
},
{
"epoch": 1.2563417890520694,
"grad_norm": 0.6145313830867569,
"learning_rate": 7.211443451095007e-06,
"loss": 0.2365,
"step": 314
},
{
"epoch": 1.260347129506008,
"grad_norm": 0.6285532186169481,
"learning_rate": 7.190455010067494e-06,
"loss": 0.2347,
"step": 315
},
{
"epoch": 1.2643524699599467,
"grad_norm": 0.590793052150211,
"learning_rate": 7.169418695587791e-06,
"loss": 0.2303,
"step": 316
},
{
"epoch": 1.2683578104138853,
"grad_norm": 0.5713241346393119,
"learning_rate": 7.1483349674147125e-06,
"loss": 0.2242,
"step": 317
},
{
"epoch": 1.2723631508678237,
"grad_norm": 0.5490796477452554,
"learning_rate": 7.127204286343321e-06,
"loss": 0.2338,
"step": 318
},
{
"epoch": 1.2763684913217623,
"grad_norm": 0.6071154182227954,
"learning_rate": 7.106027114194856e-06,
"loss": 0.225,
"step": 319
},
{
"epoch": 1.280373831775701,
"grad_norm": 0.5963758796133684,
"learning_rate": 7.084803913806642e-06,
"loss": 0.2369,
"step": 320
},
{
"epoch": 1.2843791722296396,
"grad_norm": 0.6339033842544861,
"learning_rate": 7.063535149021974e-06,
"loss": 0.2441,
"step": 321
},
{
"epoch": 1.288384512683578,
"grad_norm": 0.6284003653179433,
"learning_rate": 7.042221284679982e-06,
"loss": 0.2402,
"step": 322
},
{
"epoch": 1.2923898531375166,
"grad_norm": 0.6593560684745596,
"learning_rate": 7.02086278660546e-06,
"loss": 0.2535,
"step": 323
},
{
"epoch": 1.2963951935914553,
"grad_norm": 0.6387070843334016,
"learning_rate": 6.999460121598704e-06,
"loss": 0.2297,
"step": 324
},
{
"epoch": 1.300400534045394,
"grad_norm": 0.5750425275519615,
"learning_rate": 6.978013757425295e-06,
"loss": 0.2355,
"step": 325
},
{
"epoch": 1.3044058744993325,
"grad_norm": 0.5586774593218413,
"learning_rate": 6.956524162805875e-06,
"loss": 0.2384,
"step": 326
},
{
"epoch": 1.308411214953271,
"grad_norm": 0.5596782830604753,
"learning_rate": 6.934991807405919e-06,
"loss": 0.2305,
"step": 327
},
{
"epoch": 1.3124165554072096,
"grad_norm": 0.5665505846964202,
"learning_rate": 6.913417161825449e-06,
"loss": 0.2239,
"step": 328
},
{
"epoch": 1.3164218958611482,
"grad_norm": 0.5958541676468069,
"learning_rate": 6.8918006975887685e-06,
"loss": 0.2441,
"step": 329
},
{
"epoch": 1.3204272363150868,
"grad_norm": 0.5843413853980698,
"learning_rate": 6.870142887134141e-06,
"loss": 0.2221,
"step": 330
},
{
"epoch": 1.3244325767690253,
"grad_norm": 0.5533441366477334,
"learning_rate": 6.848444203803476e-06,
"loss": 0.224,
"step": 331
},
{
"epoch": 1.328437917222964,
"grad_norm": 0.5944276005227449,
"learning_rate": 6.8267051218319766e-06,
"loss": 0.2333,
"step": 332
},
{
"epoch": 1.3324432576769025,
"grad_norm": 0.5816039732327815,
"learning_rate": 6.804926116337779e-06,
"loss": 0.2332,
"step": 333
},
{
"epoch": 1.3364485981308412,
"grad_norm": 0.5997442007990729,
"learning_rate": 6.783107663311566e-06,
"loss": 0.2288,
"step": 334
},
{
"epoch": 1.3404539385847798,
"grad_norm": 0.5758289065119726,
"learning_rate": 6.7612502396061685e-06,
"loss": 0.238,
"step": 335
},
{
"epoch": 1.3444592790387184,
"grad_norm": 0.578849426349599,
"learning_rate": 6.739354322926136e-06,
"loss": 0.2382,
"step": 336
},
{
"epoch": 1.3484646194926568,
"grad_norm": 0.571090431459051,
"learning_rate": 6.717420391817306e-06,
"loss": 0.2663,
"step": 337
},
{
"epoch": 1.3524699599465955,
"grad_norm": 0.6196058792645048,
"learning_rate": 6.6954489256563334e-06,
"loss": 0.2274,
"step": 338
},
{
"epoch": 1.356475300400534,
"grad_norm": 0.5922325724177396,
"learning_rate": 6.6734404046402256e-06,
"loss": 0.2199,
"step": 339
},
{
"epoch": 1.3604806408544725,
"grad_norm": 0.5523613961098914,
"learning_rate": 6.651395309775837e-06,
"loss": 0.2352,
"step": 340
},
{
"epoch": 1.3644859813084111,
"grad_norm": 0.6615232115067652,
"learning_rate": 6.629314122869363e-06,
"loss": 0.2259,
"step": 341
},
{
"epoch": 1.3684913217623498,
"grad_norm": 0.5749887582077661,
"learning_rate": 6.607197326515808e-06,
"loss": 0.2515,
"step": 342
},
{
"epoch": 1.3724966622162884,
"grad_norm": 0.6229806856360468,
"learning_rate": 6.585045404088442e-06,
"loss": 0.2446,
"step": 343
},
{
"epoch": 1.376502002670227,
"grad_norm": 0.6535943167246338,
"learning_rate": 6.562858839728224e-06,
"loss": 0.233,
"step": 344
},
{
"epoch": 1.3805073431241657,
"grad_norm": 0.564932235491322,
"learning_rate": 6.540638118333235e-06,
"loss": 0.2377,
"step": 345
},
{
"epoch": 1.384512683578104,
"grad_norm": 0.5864382063135621,
"learning_rate": 6.518383725548074e-06,
"loss": 0.2351,
"step": 346
},
{
"epoch": 1.3885180240320427,
"grad_norm": 0.5719846231002432,
"learning_rate": 6.4960961477532444e-06,
"loss": 0.2213,
"step": 347
},
{
"epoch": 1.3925233644859814,
"grad_norm": 0.5996971644613003,
"learning_rate": 6.473775872054522e-06,
"loss": 0.2315,
"step": 348
},
{
"epoch": 1.3965287049399198,
"grad_norm": 0.6164036127115975,
"learning_rate": 6.451423386272312e-06,
"loss": 0.233,
"step": 349
},
{
"epoch": 1.4005340453938584,
"grad_norm": 0.6375735882940162,
"learning_rate": 6.429039178930989e-06,
"loss": 0.2303,
"step": 350
},
{
"epoch": 1.404539385847797,
"grad_norm": 0.6098688935758428,
"learning_rate": 6.406623739248214e-06,
"loss": 0.2337,
"step": 351
},
{
"epoch": 1.4085447263017357,
"grad_norm": 0.5853193211453952,
"learning_rate": 6.384177557124247e-06,
"loss": 0.2317,
"step": 352
},
{
"epoch": 1.4125500667556743,
"grad_norm": 0.5660416243135848,
"learning_rate": 6.361701123131242e-06,
"loss": 0.2399,
"step": 353
},
{
"epoch": 1.416555407209613,
"grad_norm": 0.6522270889233022,
"learning_rate": 6.339194928502516e-06,
"loss": 0.2438,
"step": 354
},
{
"epoch": 1.4205607476635513,
"grad_norm": 0.6108617575895426,
"learning_rate": 6.3166594651218235e-06,
"loss": 0.2273,
"step": 355
},
{
"epoch": 1.42456608811749,
"grad_norm": 0.5025445202572053,
"learning_rate": 6.294095225512604e-06,
"loss": 0.2134,
"step": 356
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.5765280021465139,
"learning_rate": 6.271502702827209e-06,
"loss": 0.2249,
"step": 357
},
{
"epoch": 1.4325767690253672,
"grad_norm": 0.5862375523852222,
"learning_rate": 6.248882390836135e-06,
"loss": 0.2326,
"step": 358
},
{
"epoch": 1.4365821094793056,
"grad_norm": 0.6482330755264025,
"learning_rate": 6.226234783917224e-06,
"loss": 0.2264,
"step": 359
},
{
"epoch": 1.4405874499332443,
"grad_norm": 0.5899710814731541,
"learning_rate": 6.2035603770448664e-06,
"loss": 0.2261,
"step": 360
},
{
"epoch": 1.444592790387183,
"grad_norm": 0.5866295112067526,
"learning_rate": 6.180859665779173e-06,
"loss": 0.2523,
"step": 361
},
{
"epoch": 1.4485981308411215,
"grad_norm": 0.6837319847065889,
"learning_rate": 6.158133146255153e-06,
"loss": 0.2423,
"step": 362
},
{
"epoch": 1.4526034712950602,
"grad_norm": 0.6307432382267119,
"learning_rate": 6.135381315171867e-06,
"loss": 0.2425,
"step": 363
},
{
"epoch": 1.4566088117489986,
"grad_norm": 0.5774555129513689,
"learning_rate": 6.112604669781572e-06,
"loss": 0.244,
"step": 364
},
{
"epoch": 1.4606141522029372,
"grad_norm": 0.5910862302024886,
"learning_rate": 6.089803707878855e-06,
"loss": 0.2466,
"step": 365
},
{
"epoch": 1.4646194926568759,
"grad_norm": 0.5816261166399118,
"learning_rate": 6.066978927789751e-06,
"loss": 0.2166,
"step": 366
},
{
"epoch": 1.4686248331108145,
"grad_norm": 0.5583031010374657,
"learning_rate": 6.04413082836085e-06,
"loss": 0.2274,
"step": 367
},
{
"epoch": 1.472630173564753,
"grad_norm": 0.6020357565888714,
"learning_rate": 6.0212599089484026e-06,
"loss": 0.2423,
"step": 368
},
{
"epoch": 1.4766355140186915,
"grad_norm": 0.6086024579173414,
"learning_rate": 5.998366669407398e-06,
"loss": 0.2347,
"step": 369
},
{
"epoch": 1.4806408544726302,
"grad_norm": 0.6639055801702823,
"learning_rate": 5.975451610080643e-06,
"loss": 0.2257,
"step": 370
},
{
"epoch": 1.4846461949265688,
"grad_norm": 0.5765943328550973,
"learning_rate": 5.952515231787825e-06,
"loss": 0.2299,
"step": 371
},
{
"epoch": 1.4886515353805074,
"grad_norm": 0.6523998837222308,
"learning_rate": 5.929558035814574e-06,
"loss": 0.232,
"step": 372
},
{
"epoch": 1.492656875834446,
"grad_norm": 0.6525256233306673,
"learning_rate": 5.906580523901493e-06,
"loss": 0.2249,
"step": 373
},
{
"epoch": 1.4966622162883845,
"grad_norm": 0.6002500523113792,
"learning_rate": 5.883583198233202e-06,
"loss": 0.2296,
"step": 374
},
{
"epoch": 1.500667556742323,
"grad_norm": 0.5859819020795045,
"learning_rate": 5.86056656142736e-06,
"loss": 0.245,
"step": 375
},
{
"epoch": 1.5046728971962615,
"grad_norm": 0.6224565671534654,
"learning_rate": 5.837531116523683e-06,
"loss": 0.2144,
"step": 376
},
{
"epoch": 1.5086782376502001,
"grad_norm": 0.5645624857480281,
"learning_rate": 5.814477366972945e-06,
"loss": 0.2379,
"step": 377
},
{
"epoch": 1.5126835781041388,
"grad_norm": 0.6412786483722962,
"learning_rate": 5.791405816625974e-06,
"loss": 0.2307,
"step": 378
},
{
"epoch": 1.5166889185580774,
"grad_norm": 0.6304941089005965,
"learning_rate": 5.768316969722651e-06,
"loss": 0.2225,
"step": 379
},
{
"epoch": 1.520694259012016,
"grad_norm": 0.5988757758307962,
"learning_rate": 5.745211330880872e-06,
"loss": 0.2258,
"step": 380
},
{
"epoch": 1.5246995994659547,
"grad_norm": 0.5815105109581388,
"learning_rate": 5.722089405085537e-06,
"loss": 0.2414,
"step": 381
},
{
"epoch": 1.5287049399198933,
"grad_norm": 0.6249599659350047,
"learning_rate": 5.698951697677498e-06,
"loss": 0.2174,
"step": 382
},
{
"epoch": 1.5327102803738317,
"grad_norm": 0.6137726726584709,
"learning_rate": 5.6757987143425276e-06,
"loss": 0.2236,
"step": 383
},
{
"epoch": 1.5367156208277704,
"grad_norm": 0.5347431576806394,
"learning_rate": 5.65263096110026e-06,
"loss": 0.2269,
"step": 384
},
{
"epoch": 1.540720961281709,
"grad_norm": 0.5727634021907735,
"learning_rate": 5.629448944293128e-06,
"loss": 0.2005,
"step": 385
},
{
"epoch": 1.5447263017356474,
"grad_norm": 0.5965865418656213,
"learning_rate": 5.6062531705753075e-06,
"loss": 0.2424,
"step": 386
},
{
"epoch": 1.548731642189586,
"grad_norm": 0.6169279262529003,
"learning_rate": 5.583044146901638e-06,
"loss": 0.2232,
"step": 387
},
{
"epoch": 1.5527369826435247,
"grad_norm": 0.6333814417689827,
"learning_rate": 5.559822380516539e-06,
"loss": 0.2353,
"step": 388
},
{
"epoch": 1.5567423230974633,
"grad_norm": 0.5578859188906384,
"learning_rate": 5.536588378942933e-06,
"loss": 0.2279,
"step": 389
},
{
"epoch": 1.560747663551402,
"grad_norm": 0.5725028706674725,
"learning_rate": 5.513342649971143e-06,
"loss": 0.2166,
"step": 390
},
{
"epoch": 1.5647530040053406,
"grad_norm": 0.5718602933152237,
"learning_rate": 5.490085701647805e-06,
"loss": 0.2248,
"step": 391
},
{
"epoch": 1.5687583444592792,
"grad_norm": 0.5624960499027032,
"learning_rate": 5.466818042264754e-06,
"loss": 0.2218,
"step": 392
},
{
"epoch": 1.5727636849132176,
"grad_norm": 0.5783959440608054,
"learning_rate": 5.443540180347927e-06,
"loss": 0.2271,
"step": 393
},
{
"epoch": 1.5767690253671562,
"grad_norm": 0.6463973620860086,
"learning_rate": 5.420252624646238e-06,
"loss": 0.2297,
"step": 394
},
{
"epoch": 1.5807743658210947,
"grad_norm": 0.5796227960968064,
"learning_rate": 5.396955884120465e-06,
"loss": 0.2249,
"step": 395
},
{
"epoch": 1.5847797062750333,
"grad_norm": 0.6378570148256715,
"learning_rate": 5.373650467932122e-06,
"loss": 0.1968,
"step": 396
},
{
"epoch": 1.588785046728972,
"grad_norm": 0.537428645891956,
"learning_rate": 5.350336885432337e-06,
"loss": 0.2161,
"step": 397
},
{
"epoch": 1.5927903871829105,
"grad_norm": 0.6322802889879201,
"learning_rate": 5.327015646150716e-06,
"loss": 0.2342,
"step": 398
},
{
"epoch": 1.5967957276368492,
"grad_norm": 0.6773017918561144,
"learning_rate": 5.303687259784206e-06,
"loss": 0.2234,
"step": 399
},
{
"epoch": 1.6008010680907878,
"grad_norm": 0.5862814494934032,
"learning_rate": 5.2803522361859596e-06,
"loss": 0.2101,
"step": 400
},
{
"epoch": 1.6048064085447264,
"grad_norm": 0.5824962855214252,
"learning_rate": 5.257011085354187e-06,
"loss": 0.2432,
"step": 401
},
{
"epoch": 1.6088117489986649,
"grad_norm": 0.6031958185300317,
"learning_rate": 5.233664317421012e-06,
"loss": 0.232,
"step": 402
},
{
"epoch": 1.6128170894526035,
"grad_norm": 0.5862841084106785,
"learning_rate": 5.210312442641327e-06,
"loss": 0.2216,
"step": 403
},
{
"epoch": 1.616822429906542,
"grad_norm": 0.6566275421066564,
"learning_rate": 5.18695597138163e-06,
"loss": 0.2285,
"step": 404
},
{
"epoch": 1.6208277703604805,
"grad_norm": 0.6361225120156005,
"learning_rate": 5.1635954141088815e-06,
"loss": 0.2243,
"step": 405
},
{
"epoch": 1.6248331108144192,
"grad_norm": 0.6538742068020424,
"learning_rate": 5.140231281379345e-06,
"loss": 0.2301,
"step": 406
},
{
"epoch": 1.6288384512683578,
"grad_norm": 0.6553343367565488,
"learning_rate": 5.116864083827425e-06,
"loss": 0.2437,
"step": 407
},
{
"epoch": 1.6328437917222964,
"grad_norm": 0.5611348158619709,
"learning_rate": 5.093494332154511e-06,
"loss": 0.2146,
"step": 408
},
{
"epoch": 1.636849132176235,
"grad_norm": 0.549466753270084,
"learning_rate": 5.070122537117812e-06,
"loss": 0.2496,
"step": 409
},
{
"epoch": 1.6408544726301737,
"grad_norm": 0.636480400336947,
"learning_rate": 5.046749209519197e-06,
"loss": 0.2032,
"step": 410
},
{
"epoch": 1.644859813084112,
"grad_norm": 0.5984681247401569,
"learning_rate": 5.023374860194028e-06,
"loss": 0.204,
"step": 411
},
{
"epoch": 1.6488651535380507,
"grad_norm": 0.5956484635411822,
"learning_rate": 5e-06,
"loss": 0.2227,
"step": 412
},
{
"epoch": 1.6528704939919892,
"grad_norm": 0.5495043593104894,
"learning_rate": 4.976625139805974e-06,
"loss": 0.2201,
"step": 413
},
{
"epoch": 1.6568758344459278,
"grad_norm": 0.5546136194425144,
"learning_rate": 4.953250790480805e-06,
"loss": 0.2139,
"step": 414
},
{
"epoch": 1.6608811748998664,
"grad_norm": 0.5761321975039401,
"learning_rate": 4.92987746288219e-06,
"loss": 0.2164,
"step": 415
},
{
"epoch": 1.664886515353805,
"grad_norm": 0.5842428190534428,
"learning_rate": 4.90650566784549e-06,
"loss": 0.2367,
"step": 416
},
{
"epoch": 1.6688918558077437,
"grad_norm": 0.6113684961889835,
"learning_rate": 4.883135916172576e-06,
"loss": 0.2367,
"step": 417
},
{
"epoch": 1.6728971962616823,
"grad_norm": 0.6680898005899061,
"learning_rate": 4.859768718620656e-06,
"loss": 0.2132,
"step": 418
},
{
"epoch": 1.676902536715621,
"grad_norm": 0.5445125151437461,
"learning_rate": 4.83640458589112e-06,
"loss": 0.2138,
"step": 419
},
{
"epoch": 1.6809078771695594,
"grad_norm": 0.5559680738599577,
"learning_rate": 4.8130440286183725e-06,
"loss": 0.2267,
"step": 420
},
{
"epoch": 1.684913217623498,
"grad_norm": 0.6144551876120194,
"learning_rate": 4.789687557358676e-06,
"loss": 0.2182,
"step": 421
},
{
"epoch": 1.6889185580774366,
"grad_norm": 0.5897924033640597,
"learning_rate": 4.7663356825789894e-06,
"loss": 0.2122,
"step": 422
},
{
"epoch": 1.692923898531375,
"grad_norm": 0.5598416977353012,
"learning_rate": 4.742988914645814e-06,
"loss": 0.2216,
"step": 423
},
{
"epoch": 1.6969292389853137,
"grad_norm": 0.5601120553987341,
"learning_rate": 4.719647763814041e-06,
"loss": 0.2177,
"step": 424
},
{
"epoch": 1.7009345794392523,
"grad_norm": 0.5891529754303583,
"learning_rate": 4.696312740215794e-06,
"loss": 0.2005,
"step": 425
},
{
"epoch": 1.704939919893191,
"grad_norm": 0.5680866327716454,
"learning_rate": 4.672984353849285e-06,
"loss": 0.2326,
"step": 426
},
{
"epoch": 1.7089452603471296,
"grad_norm": 0.5657360719382496,
"learning_rate": 4.649663114567663e-06,
"loss": 0.2131,
"step": 427
},
{
"epoch": 1.7129506008010682,
"grad_norm": 0.5740029198869598,
"learning_rate": 4.626349532067879e-06,
"loss": 0.2138,
"step": 428
},
{
"epoch": 1.7169559412550068,
"grad_norm": 0.6388774843029362,
"learning_rate": 4.603044115879536e-06,
"loss": 0.2251,
"step": 429
},
{
"epoch": 1.7209612817089452,
"grad_norm": 0.6877647851263269,
"learning_rate": 4.579747375353763e-06,
"loss": 0.2212,
"step": 430
},
{
"epoch": 1.7249666221628839,
"grad_norm": 0.6414623377958295,
"learning_rate": 4.556459819652074e-06,
"loss": 0.2414,
"step": 431
},
{
"epoch": 1.7289719626168223,
"grad_norm": 0.6323839747789576,
"learning_rate": 4.533181957735247e-06,
"loss": 0.2339,
"step": 432
},
{
"epoch": 1.732977303070761,
"grad_norm": 0.639653221825577,
"learning_rate": 4.509914298352197e-06,
"loss": 0.2215,
"step": 433
},
{
"epoch": 1.7369826435246996,
"grad_norm": 0.6284764820090778,
"learning_rate": 4.486657350028859e-06,
"loss": 0.2119,
"step": 434
},
{
"epoch": 1.7409879839786382,
"grad_norm": 0.6067872660667347,
"learning_rate": 4.463411621057068e-06,
"loss": 0.2177,
"step": 435
},
{
"epoch": 1.7449933244325768,
"grad_norm": 0.6234859050465342,
"learning_rate": 4.4401776194834615e-06,
"loss": 0.207,
"step": 436
},
{
"epoch": 1.7489986648865155,
"grad_norm": 0.6562941478198795,
"learning_rate": 4.4169558530983635e-06,
"loss": 0.2043,
"step": 437
},
{
"epoch": 1.753004005340454,
"grad_norm": 0.5909532065510827,
"learning_rate": 4.393746829424693e-06,
"loss": 0.2124,
"step": 438
},
{
"epoch": 1.7570093457943925,
"grad_norm": 0.6266064150386942,
"learning_rate": 4.3705510557068746e-06,
"loss": 0.2156,
"step": 439
},
{
"epoch": 1.7610146862483311,
"grad_norm": 0.5569141516837799,
"learning_rate": 4.347369038899744e-06,
"loss": 0.1997,
"step": 440
},
{
"epoch": 1.7650200267022695,
"grad_norm": 0.6592779504723204,
"learning_rate": 4.324201285657474e-06,
"loss": 0.215,
"step": 441
},
{
"epoch": 1.7690253671562082,
"grad_norm": 0.6388494836347968,
"learning_rate": 4.3010483023225045e-06,
"loss": 0.2293,
"step": 442
},
{
"epoch": 1.7730307076101468,
"grad_norm": 0.691528320994546,
"learning_rate": 4.277910594914466e-06,
"loss": 0.1971,
"step": 443
},
{
"epoch": 1.7770360480640854,
"grad_norm": 0.6052938531570694,
"learning_rate": 4.254788669119127e-06,
"loss": 0.215,
"step": 444
},
{
"epoch": 1.781041388518024,
"grad_norm": 0.5915787822047736,
"learning_rate": 4.231683030277349e-06,
"loss": 0.22,
"step": 445
},
{
"epoch": 1.7850467289719627,
"grad_norm": 0.669779074015281,
"learning_rate": 4.208594183374026e-06,
"loss": 0.2139,
"step": 446
},
{
"epoch": 1.7890520694259013,
"grad_norm": 0.5637355181902335,
"learning_rate": 4.185522633027057e-06,
"loss": 0.2074,
"step": 447
},
{
"epoch": 1.7930574098798397,
"grad_norm": 0.6354293774014055,
"learning_rate": 4.162468883476319e-06,
"loss": 0.2029,
"step": 448
},
{
"epoch": 1.7970627503337784,
"grad_norm": 0.5585098291406209,
"learning_rate": 4.139433438572641e-06,
"loss": 0.2149,
"step": 449
},
{
"epoch": 1.8010680907877168,
"grad_norm": 0.5982104585865691,
"learning_rate": 4.116416801766801e-06,
"loss": 0.2272,
"step": 450
},
{
"epoch": 1.8050734312416554,
"grad_norm": 0.5958872678061956,
"learning_rate": 4.0934194760985095e-06,
"loss": 0.2033,
"step": 451
},
{
"epoch": 1.809078771695594,
"grad_norm": 0.6149274909600425,
"learning_rate": 4.070441964185428e-06,
"loss": 0.2047,
"step": 452
},
{
"epoch": 1.8130841121495327,
"grad_norm": 0.649920066288117,
"learning_rate": 4.047484768212175e-06,
"loss": 0.2127,
"step": 453
},
{
"epoch": 1.8170894526034713,
"grad_norm": 0.6231664353854522,
"learning_rate": 4.02454838991936e-06,
"loss": 0.214,
"step": 454
},
{
"epoch": 1.82109479305741,
"grad_norm": 0.6324772858655361,
"learning_rate": 4.001633330592604e-06,
"loss": 0.2279,
"step": 455
},
{
"epoch": 1.8251001335113486,
"grad_norm": 0.6387099548525226,
"learning_rate": 3.978740091051599e-06,
"loss": 0.231,
"step": 456
},
{
"epoch": 1.829105473965287,
"grad_norm": 0.5888035825906338,
"learning_rate": 3.955869171639151e-06,
"loss": 0.2043,
"step": 457
},
{
"epoch": 1.8331108144192256,
"grad_norm": 0.5900537278953912,
"learning_rate": 3.933021072210251e-06,
"loss": 0.2189,
"step": 458
},
{
"epoch": 1.8371161548731643,
"grad_norm": 0.6155751843123176,
"learning_rate": 3.910196292121147e-06,
"loss": 0.2172,
"step": 459
},
{
"epoch": 1.8411214953271027,
"grad_norm": 0.6185704225951081,
"learning_rate": 3.887395330218429e-06,
"loss": 0.2433,
"step": 460
},
{
"epoch": 1.8451268357810413,
"grad_norm": 0.5757793970809637,
"learning_rate": 3.864618684828135e-06,
"loss": 0.2086,
"step": 461
},
{
"epoch": 1.84913217623498,
"grad_norm": 0.5715122189230345,
"learning_rate": 3.84186685374485e-06,
"loss": 0.2174,
"step": 462
},
{
"epoch": 1.8531375166889186,
"grad_norm": 0.562934776821966,
"learning_rate": 3.81914033422083e-06,
"loss": 0.1969,
"step": 463
},
{
"epoch": 1.8571428571428572,
"grad_norm": 0.584422709822903,
"learning_rate": 3.7964396229551365e-06,
"loss": 0.2264,
"step": 464
},
{
"epoch": 1.8611481975967958,
"grad_norm": 0.5887703309285004,
"learning_rate": 3.7737652160827752e-06,
"loss": 0.2118,
"step": 465
},
{
"epoch": 1.8651535380507345,
"grad_norm": 0.5978955576130686,
"learning_rate": 3.751117609163865e-06,
"loss": 0.2162,
"step": 466
},
{
"epoch": 1.8691588785046729,
"grad_norm": 0.6119642861090717,
"learning_rate": 3.7284972971727907e-06,
"loss": 0.225,
"step": 467
},
{
"epoch": 1.8731642189586115,
"grad_norm": 0.6210379401883211,
"learning_rate": 3.705904774487396e-06,
"loss": 0.2056,
"step": 468
},
{
"epoch": 1.87716955941255,
"grad_norm": 0.6081726738147702,
"learning_rate": 3.683340534878176e-06,
"loss": 0.2046,
"step": 469
},
{
"epoch": 1.8811748998664886,
"grad_norm": 0.6467889875621615,
"learning_rate": 3.6608050714974854e-06,
"loss": 0.2503,
"step": 470
},
{
"epoch": 1.8851802403204272,
"grad_norm": 0.6021455582975629,
"learning_rate": 3.63829887686876e-06,
"loss": 0.2244,
"step": 471
},
{
"epoch": 1.8891855807743658,
"grad_norm": 0.634859275238841,
"learning_rate": 3.6158224428757538e-06,
"loss": 0.2208,
"step": 472
},
{
"epoch": 1.8931909212283045,
"grad_norm": 0.6211383819455257,
"learning_rate": 3.5933762607517875e-06,
"loss": 0.2201,
"step": 473
},
{
"epoch": 1.897196261682243,
"grad_norm": 0.5805284245558593,
"learning_rate": 3.5709608210690127e-06,
"loss": 0.2171,
"step": 474
},
{
"epoch": 1.9012016021361817,
"grad_norm": 0.5969539880951342,
"learning_rate": 3.5485766137276894e-06,
"loss": 0.1989,
"step": 475
},
{
"epoch": 1.9052069425901201,
"grad_norm": 0.5763877271910841,
"learning_rate": 3.526224127945479e-06,
"loss": 0.2051,
"step": 476
},
{
"epoch": 1.9092122830440588,
"grad_norm": 0.573639466833024,
"learning_rate": 3.5039038522467572e-06,
"loss": 0.2216,
"step": 477
},
{
"epoch": 1.9132176234979972,
"grad_norm": 0.6187640637863968,
"learning_rate": 3.4816162744519266e-06,
"loss": 0.2207,
"step": 478
},
{
"epoch": 1.9172229639519358,
"grad_norm": 0.6135054921454743,
"learning_rate": 3.459361881666766e-06,
"loss": 0.216,
"step": 479
},
{
"epoch": 1.9212283044058744,
"grad_norm": 0.5975760329649652,
"learning_rate": 3.4371411602717785e-06,
"loss": 0.1997,
"step": 480
},
{
"epoch": 1.925233644859813,
"grad_norm": 0.5864424343197752,
"learning_rate": 3.4149545959115604e-06,
"loss": 0.1997,
"step": 481
},
{
"epoch": 1.9292389853137517,
"grad_norm": 0.6206284942820284,
"learning_rate": 3.3928026734841935e-06,
"loss": 0.1958,
"step": 482
},
{
"epoch": 1.9332443257676903,
"grad_norm": 0.627723357086264,
"learning_rate": 3.3706858771306393e-06,
"loss": 0.2099,
"step": 483
},
{
"epoch": 1.937249666221629,
"grad_norm": 0.6391449489733926,
"learning_rate": 3.3486046902241663e-06,
"loss": 0.1946,
"step": 484
},
{
"epoch": 1.9412550066755674,
"grad_norm": 0.6232074962194952,
"learning_rate": 3.3265595953597774e-06,
"loss": 0.2167,
"step": 485
},
{
"epoch": 1.945260347129506,
"grad_norm": 0.7178387576084606,
"learning_rate": 3.3045510743436665e-06,
"loss": 0.2052,
"step": 486
},
{
"epoch": 1.9492656875834444,
"grad_norm": 0.66259069636869,
"learning_rate": 3.2825796081826943e-06,
"loss": 0.2103,
"step": 487
},
{
"epoch": 1.953271028037383,
"grad_norm": 0.6319630042493534,
"learning_rate": 3.2606456770738636e-06,
"loss": 0.1899,
"step": 488
},
{
"epoch": 1.9572763684913217,
"grad_norm": 0.6583941141635503,
"learning_rate": 3.2387497603938327e-06,
"loss": 0.2057,
"step": 489
},
{
"epoch": 1.9612817089452603,
"grad_norm": 0.6583371328399131,
"learning_rate": 3.216892336688435e-06,
"loss": 0.208,
"step": 490
},
{
"epoch": 1.965287049399199,
"grad_norm": 0.6626656334140933,
"learning_rate": 3.1950738836622226e-06,
"loss": 0.213,
"step": 491
},
{
"epoch": 1.9692923898531376,
"grad_norm": 0.57636570851929,
"learning_rate": 3.173294878168025e-06,
"loss": 0.2123,
"step": 492
},
{
"epoch": 1.9732977303070762,
"grad_norm": 0.628166413351637,
"learning_rate": 3.1515557961965254e-06,
"loss": 0.2082,
"step": 493
},
{
"epoch": 1.9773030707610146,
"grad_norm": 0.6161007175331755,
"learning_rate": 3.1298571128658593e-06,
"loss": 0.2071,
"step": 494
},
{
"epoch": 1.9813084112149533,
"grad_norm": 0.5578291746094638,
"learning_rate": 3.1081993024112328e-06,
"loss": 0.1953,
"step": 495
},
{
"epoch": 1.985313751668892,
"grad_norm": 0.5742324867913953,
"learning_rate": 3.0865828381745515e-06,
"loss": 0.2183,
"step": 496
},
{
"epoch": 1.9893190921228303,
"grad_norm": 0.5651971848567813,
"learning_rate": 3.0650081925940834e-06,
"loss": 0.1881,
"step": 497
},
{
"epoch": 1.993324432576769,
"grad_norm": 0.5805509174084333,
"learning_rate": 3.043475837194126e-06,
"loss": 0.2148,
"step": 498
},
{
"epoch": 1.9973297730307076,
"grad_norm": 0.753242305787025,
"learning_rate": 3.021986242574707e-06,
"loss": 0.2226,
"step": 499
},
{
"epoch": 2.0,
"grad_norm": 0.7484446599170808,
"learning_rate": 3.000539878401296e-06,
"loss": 0.2156,
"step": 500
},
{
"epoch": 2.0040053404539386,
"grad_norm": 0.7266923236604501,
"learning_rate": 2.9791372133945405e-06,
"loss": 0.1868,
"step": 501
},
{
"epoch": 2.0080106809078773,
"grad_norm": 0.6838990546286972,
"learning_rate": 2.95777871532002e-06,
"loss": 0.1693,
"step": 502
},
{
"epoch": 2.012016021361816,
"grad_norm": 0.6522038544426395,
"learning_rate": 2.936464850978027e-06,
"loss": 0.1648,
"step": 503
},
{
"epoch": 2.0160213618157545,
"grad_norm": 0.6932425119304833,
"learning_rate": 2.9151960861933616e-06,
"loss": 0.1781,
"step": 504
},
{
"epoch": 2.0200267022696927,
"grad_norm": 0.6187151564266777,
"learning_rate": 2.893972885805148e-06,
"loss": 0.1814,
"step": 505
},
{
"epoch": 2.0240320427236314,
"grad_norm": 0.5852591736999052,
"learning_rate": 2.8727957136566825e-06,
"loss": 0.1616,
"step": 506
},
{
"epoch": 2.02803738317757,
"grad_norm": 0.6216187678840704,
"learning_rate": 2.8516650325852883e-06,
"loss": 0.1776,
"step": 507
},
{
"epoch": 2.0320427236315086,
"grad_norm": 0.6805660651282351,
"learning_rate": 2.83058130441221e-06,
"loss": 0.1848,
"step": 508
},
{
"epoch": 2.0360480640854473,
"grad_norm": 0.8026841589589839,
"learning_rate": 2.809544989932508e-06,
"loss": 0.1905,
"step": 509
},
{
"epoch": 2.040053404539386,
"grad_norm": 0.661638883853578,
"learning_rate": 2.7885565489049948e-06,
"loss": 0.1568,
"step": 510
},
{
"epoch": 2.0440587449933245,
"grad_norm": 0.6472428396855733,
"learning_rate": 2.7676164400421864e-06,
"loss": 0.1767,
"step": 511
},
{
"epoch": 2.048064085447263,
"grad_norm": 0.6917047834821735,
"learning_rate": 2.746725121000273e-06,
"loss": 0.1871,
"step": 512
},
{
"epoch": 2.052069425901202,
"grad_norm": 0.6021301697708129,
"learning_rate": 2.725883048369128e-06,
"loss": 0.1695,
"step": 513
},
{
"epoch": 2.05607476635514,
"grad_norm": 0.6089561158393074,
"learning_rate": 2.705090677662311e-06,
"loss": 0.1743,
"step": 514
},
{
"epoch": 2.0600801068090786,
"grad_norm": 0.6433217615782736,
"learning_rate": 2.684348463307128e-06,
"loss": 0.1648,
"step": 515
},
{
"epoch": 2.0640854472630172,
"grad_norm": 0.6675245713273296,
"learning_rate": 2.66365685863469e-06,
"loss": 0.1695,
"step": 516
},
{
"epoch": 2.068090787716956,
"grad_norm": 0.5639272937571744,
"learning_rate": 2.6430163158700116e-06,
"loss": 0.1552,
"step": 517
},
{
"epoch": 2.0720961281708945,
"grad_norm": 0.6756211271225167,
"learning_rate": 2.6224272861221245e-06,
"loss": 0.1871,
"step": 518
},
{
"epoch": 2.076101468624833,
"grad_norm": 0.56951104964083,
"learning_rate": 2.601890219374217e-06,
"loss": 0.175,
"step": 519
},
{
"epoch": 2.0801068090787718,
"grad_norm": 0.5849605152037826,
"learning_rate": 2.5814055644738013e-06,
"loss": 0.1577,
"step": 520
},
{
"epoch": 2.0841121495327104,
"grad_norm": 0.6208978605303654,
"learning_rate": 2.5609737691229055e-06,
"loss": 0.1676,
"step": 521
},
{
"epoch": 2.088117489986649,
"grad_norm": 0.6300535483875338,
"learning_rate": 2.5405952798682844e-06,
"loss": 0.1767,
"step": 522
},
{
"epoch": 2.0921228304405872,
"grad_norm": 0.6515325771939616,
"learning_rate": 2.520270542091663e-06,
"loss": 0.1862,
"step": 523
},
{
"epoch": 2.096128170894526,
"grad_norm": 0.6573962419086741,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.1707,
"step": 524
},
{
"epoch": 2.1001335113484645,
"grad_norm": 0.6008492264289961,
"learning_rate": 2.4797840966157877e-06,
"loss": 0.1562,
"step": 525
},
{
"epoch": 2.104138851802403,
"grad_norm": 0.5554906783739959,
"learning_rate": 2.4596232737673544e-06,
"loss": 0.1753,
"step": 526
},
{
"epoch": 2.1081441922563418,
"grad_norm": 0.5883215878515664,
"learning_rate": 2.439517972079222e-06,
"loss": 0.1739,
"step": 527
},
{
"epoch": 2.1121495327102804,
"grad_norm": 0.6225552692825768,
"learning_rate": 2.4194686309624664e-06,
"loss": 0.1736,
"step": 528
},
{
"epoch": 2.116154873164219,
"grad_norm": 0.6538525824437825,
"learning_rate": 2.3994756886051267e-06,
"loss": 0.1793,
"step": 529
},
{
"epoch": 2.1201602136181577,
"grad_norm": 0.6426343380918219,
"learning_rate": 2.3795395819626116e-06,
"loss": 0.1636,
"step": 530
},
{
"epoch": 2.1241655540720963,
"grad_norm": 0.6984876016877216,
"learning_rate": 2.3596607467481602e-06,
"loss": 0.1818,
"step": 531
},
{
"epoch": 2.128170894526035,
"grad_norm": 0.5969581027123277,
"learning_rate": 2.339839617423318e-06,
"loss": 0.1761,
"step": 532
},
{
"epoch": 2.132176234979973,
"grad_norm": 0.5965591945629233,
"learning_rate": 2.320076627188438e-06,
"loss": 0.1699,
"step": 533
},
{
"epoch": 2.1361815754339117,
"grad_norm": 0.6646391923991977,
"learning_rate": 2.300372207973219e-06,
"loss": 0.1642,
"step": 534
},
{
"epoch": 2.1401869158878504,
"grad_norm": 0.5996069077675478,
"learning_rate": 2.280726790427258e-06,
"loss": 0.1721,
"step": 535
},
{
"epoch": 2.144192256341789,
"grad_norm": 0.6741481126361855,
"learning_rate": 2.261140803910644e-06,
"loss": 0.1727,
"step": 536
},
{
"epoch": 2.1481975967957276,
"grad_norm": 0.6323694946147757,
"learning_rate": 2.2416146764845733e-06,
"loss": 0.1702,
"step": 537
},
{
"epoch": 2.1522029372496663,
"grad_norm": 0.6862267704077283,
"learning_rate": 2.2221488349019903e-06,
"loss": 0.1729,
"step": 538
},
{
"epoch": 2.156208277703605,
"grad_norm": 0.6070081128579359,
"learning_rate": 2.202743704598263e-06,
"loss": 0.1593,
"step": 539
},
{
"epoch": 2.1602136181575435,
"grad_norm": 0.7278096682292641,
"learning_rate": 2.1833997096818897e-06,
"loss": 0.1836,
"step": 540
},
{
"epoch": 2.164218958611482,
"grad_norm": 0.632259449291054,
"learning_rate": 2.1641172729252206e-06,
"loss": 0.1711,
"step": 541
},
{
"epoch": 2.1682242990654204,
"grad_norm": 0.6117574742799209,
"learning_rate": 2.1448968157552243e-06,
"loss": 0.1632,
"step": 542
},
{
"epoch": 2.172229639519359,
"grad_norm": 0.631961523767331,
"learning_rate": 2.1257387582442746e-06,
"loss": 0.1694,
"step": 543
},
{
"epoch": 2.1762349799732976,
"grad_norm": 0.6156725197172384,
"learning_rate": 2.1066435191009717e-06,
"loss": 0.1643,
"step": 544
},
{
"epoch": 2.1802403204272363,
"grad_norm": 0.6322664734036844,
"learning_rate": 2.08761151566099e-06,
"loss": 0.1798,
"step": 545
},
{
"epoch": 2.184245660881175,
"grad_norm": 0.6464718958812395,
"learning_rate": 2.0686431638779564e-06,
"loss": 0.1731,
"step": 546
},
{
"epoch": 2.1882510013351135,
"grad_norm": 0.6399447883731574,
"learning_rate": 2.04973887831436e-06,
"loss": 0.1835,
"step": 547
},
{
"epoch": 2.192256341789052,
"grad_norm": 0.5720412483866,
"learning_rate": 2.030899072132493e-06,
"loss": 0.1574,
"step": 548
},
{
"epoch": 2.196261682242991,
"grad_norm": 0.5860714566791593,
"learning_rate": 2.0121241570854165e-06,
"loss": 0.182,
"step": 549
},
{
"epoch": 2.2002670226969294,
"grad_norm": 0.6083557073323276,
"learning_rate": 1.9934145435079705e-06,
"loss": 0.1661,
"step": 550
},
{
"epoch": 2.204272363150868,
"grad_norm": 0.6104569659276108,
"learning_rate": 1.9747706403077943e-06,
"loss": 0.1979,
"step": 551
},
{
"epoch": 2.2082777036048062,
"grad_norm": 0.62244894576839,
"learning_rate": 1.956192854956397e-06,
"loss": 0.1774,
"step": 552
},
{
"epoch": 2.212283044058745,
"grad_norm": 0.630950626155233,
"learning_rate": 1.9376815934802496e-06,
"loss": 0.1758,
"step": 553
},
{
"epoch": 2.2162883845126835,
"grad_norm": 0.6625191361259387,
"learning_rate": 1.9192372604519127e-06,
"loss": 0.1928,
"step": 554
},
{
"epoch": 2.220293724966622,
"grad_norm": 0.5820926209462741,
"learning_rate": 1.9008602589811931e-06,
"loss": 0.1565,
"step": 555
},
{
"epoch": 2.2242990654205608,
"grad_norm": 0.5567632552609313,
"learning_rate": 1.8825509907063328e-06,
"loss": 0.1756,
"step": 556
},
{
"epoch": 2.2283044058744994,
"grad_norm": 0.5980702955281321,
"learning_rate": 1.864309855785234e-06,
"loss": 0.1594,
"step": 557
},
{
"epoch": 2.232309746328438,
"grad_norm": 0.5820363892907623,
"learning_rate": 1.8461372528867095e-06,
"loss": 0.1768,
"step": 558
},
{
"epoch": 2.2363150867823767,
"grad_norm": 0.5967644083406489,
"learning_rate": 1.8280335791817733e-06,
"loss": 0.1689,
"step": 559
},
{
"epoch": 2.2403204272363153,
"grad_norm": 0.6250075972601391,
"learning_rate": 1.809999230334958e-06,
"loss": 0.1748,
"step": 560
},
{
"epoch": 2.2443257676902535,
"grad_norm": 0.606205688498456,
"learning_rate": 1.7920346004956673e-06,
"loss": 0.1834,
"step": 561
},
{
"epoch": 2.248331108144192,
"grad_norm": 0.5940182864906453,
"learning_rate": 1.7741400822895633e-06,
"loss": 0.1691,
"step": 562
},
{
"epoch": 2.2523364485981308,
"grad_norm": 0.5683156268730282,
"learning_rate": 1.7563160668099838e-06,
"loss": 0.1726,
"step": 563
},
{
"epoch": 2.2563417890520694,
"grad_norm": 0.6031458823381572,
"learning_rate": 1.7385629436093958e-06,
"loss": 0.1618,
"step": 564
},
{
"epoch": 2.260347129506008,
"grad_norm": 0.6310913458129014,
"learning_rate": 1.7208811006908798e-06,
"loss": 0.18,
"step": 565
},
{
"epoch": 2.2643524699599467,
"grad_norm": 0.6148520423770238,
"learning_rate": 1.7032709244996559e-06,
"loss": 0.1699,
"step": 566
},
{
"epoch": 2.2683578104138853,
"grad_norm": 0.6101595073175935,
"learning_rate": 1.6857327999146284e-06,
"loss": 0.1623,
"step": 567
},
{
"epoch": 2.272363150867824,
"grad_norm": 0.5775767049567889,
"learning_rate": 1.6682671102399806e-06,
"loss": 0.1611,
"step": 568
},
{
"epoch": 2.2763684913217626,
"grad_norm": 0.6215298766632447,
"learning_rate": 1.6508742371967962e-06,
"loss": 0.1708,
"step": 569
},
{
"epoch": 2.2803738317757007,
"grad_norm": 0.6064011848210037,
"learning_rate": 1.633554560914714e-06,
"loss": 0.1793,
"step": 570
},
{
"epoch": 2.2843791722296394,
"grad_norm": 0.5913150625405588,
"learning_rate": 1.6163084599236278e-06,
"loss": 0.1734,
"step": 571
},
{
"epoch": 2.288384512683578,
"grad_norm": 0.6290184807128071,
"learning_rate": 1.5991363111454023e-06,
"loss": 0.1643,
"step": 572
},
{
"epoch": 2.2923898531375166,
"grad_norm": 0.6150638457127714,
"learning_rate": 1.5820384898856433e-06,
"loss": 0.1662,
"step": 573
},
{
"epoch": 2.2963951935914553,
"grad_norm": 0.6496821212159293,
"learning_rate": 1.5650153698254916e-06,
"loss": 0.1854,
"step": 574
},
{
"epoch": 2.300400534045394,
"grad_norm": 0.6678529895051883,
"learning_rate": 1.5480673230134585e-06,
"loss": 0.1618,
"step": 575
},
{
"epoch": 2.3044058744993325,
"grad_norm": 0.6202814553309792,
"learning_rate": 1.5311947198572918e-06,
"loss": 0.1669,
"step": 576
},
{
"epoch": 2.308411214953271,
"grad_norm": 0.5462233451328915,
"learning_rate": 1.514397929115884e-06,
"loss": 0.1578,
"step": 577
},
{
"epoch": 2.31241655540721,
"grad_norm": 0.6123799648363422,
"learning_rate": 1.4976773178912085e-06,
"loss": 0.1678,
"step": 578
},
{
"epoch": 2.316421895861148,
"grad_norm": 0.6365385572090578,
"learning_rate": 1.481033251620299e-06,
"loss": 0.1686,
"step": 579
},
{
"epoch": 2.3204272363150866,
"grad_norm": 0.6063441561728166,
"learning_rate": 1.4644660940672628e-06,
"loss": 0.1772,
"step": 580
},
{
"epoch": 2.3244325767690253,
"grad_norm": 0.658171365772341,
"learning_rate": 1.4479762073153304e-06,
"loss": 0.1726,
"step": 581
},
{
"epoch": 2.328437917222964,
"grad_norm": 0.6269724587094964,
"learning_rate": 1.4315639517589398e-06,
"loss": 0.1662,
"step": 582
},
{
"epoch": 2.3324432576769025,
"grad_norm": 0.6133548152558246,
"learning_rate": 1.4152296860958641e-06,
"loss": 0.1702,
"step": 583
},
{
"epoch": 2.336448598130841,
"grad_norm": 0.66729626203352,
"learning_rate": 1.3989737673193682e-06,
"loss": 0.1926,
"step": 584
},
{
"epoch": 2.34045393858478,
"grad_norm": 0.6087596853512452,
"learning_rate": 1.382796550710408e-06,
"loss": 0.1886,
"step": 585
},
{
"epoch": 2.3444592790387184,
"grad_norm": 0.5915578437600094,
"learning_rate": 1.3666983898298659e-06,
"loss": 0.1541,
"step": 586
},
{
"epoch": 2.348464619492657,
"grad_norm": 0.6098604276701993,
"learning_rate": 1.3506796365108232e-06,
"loss": 0.1739,
"step": 587
},
{
"epoch": 2.3524699599465952,
"grad_norm": 0.6102908288565849,
"learning_rate": 1.3347406408508695e-06,
"loss": 0.1715,
"step": 588
},
{
"epoch": 2.356475300400534,
"grad_norm": 0.6133775478993069,
"learning_rate": 1.3188817512044544e-06,
"loss": 0.1646,
"step": 589
},
{
"epoch": 2.3604806408544725,
"grad_norm": 0.6293540508319713,
"learning_rate": 1.3031033141752702e-06,
"loss": 0.1711,
"step": 590
},
{
"epoch": 2.364485981308411,
"grad_norm": 0.7443933199698777,
"learning_rate": 1.2874056746086772e-06,
"loss": 0.152,
"step": 591
},
{
"epoch": 2.3684913217623498,
"grad_norm": 0.5731726505456445,
"learning_rate": 1.2717891755841722e-06,
"loss": 0.1509,
"step": 592
},
{
"epoch": 2.3724966622162884,
"grad_norm": 0.5987015572774188,
"learning_rate": 1.2562541584078835e-06,
"loss": 0.1664,
"step": 593
},
{
"epoch": 2.376502002670227,
"grad_norm": 0.6300995737372567,
"learning_rate": 1.2408009626051137e-06,
"loss": 0.1744,
"step": 594
},
{
"epoch": 2.3805073431241657,
"grad_norm": 0.6142455245728455,
"learning_rate": 1.225429925912921e-06,
"loss": 0.1563,
"step": 595
},
{
"epoch": 2.3845126835781043,
"grad_norm": 0.6234917981255259,
"learning_rate": 1.2101413842727345e-06,
"loss": 0.1648,
"step": 596
},
{
"epoch": 2.3885180240320425,
"grad_norm": 0.5825435971591906,
"learning_rate": 1.1949356718230188e-06,
"loss": 0.1602,
"step": 597
},
{
"epoch": 2.392523364485981,
"grad_norm": 0.6402288566179174,
"learning_rate": 1.1798131208919628e-06,
"loss": 0.1851,
"step": 598
},
{
"epoch": 2.3965287049399198,
"grad_norm": 0.6037042606359994,
"learning_rate": 1.1647740619902193e-06,
"loss": 0.1747,
"step": 599
},
{
"epoch": 2.4005340453938584,
"grad_norm": 0.5948400476478892,
"learning_rate": 1.1498188238036862e-06,
"loss": 0.1554,
"step": 600
},
{
"epoch": 2.404539385847797,
"grad_norm": 0.6966147020255966,
"learning_rate": 1.134947733186315e-06,
"loss": 0.1779,
"step": 601
},
{
"epoch": 2.4085447263017357,
"grad_norm": 0.6005191719492351,
"learning_rate": 1.1201611151529756e-06,
"loss": 0.1607,
"step": 602
},
{
"epoch": 2.4125500667556743,
"grad_norm": 0.5820830342726288,
"learning_rate": 1.105459292872345e-06,
"loss": 0.165,
"step": 603
},
{
"epoch": 2.416555407209613,
"grad_norm": 0.5655842631268676,
"learning_rate": 1.0908425876598512e-06,
"loss": 0.1528,
"step": 604
},
{
"epoch": 2.4205607476635516,
"grad_norm": 0.5964382967668805,
"learning_rate": 1.0763113189706453e-06,
"loss": 0.1694,
"step": 605
},
{
"epoch": 2.4245660881174897,
"grad_norm": 0.6456074354041683,
"learning_rate": 1.0618658043926233e-06,
"loss": 0.1747,
"step": 606
},
{
"epoch": 2.4285714285714284,
"grad_norm": 0.7700257765536643,
"learning_rate": 1.047506359639483e-06,
"loss": 0.1804,
"step": 607
},
{
"epoch": 2.432576769025367,
"grad_norm": 0.6024820562838693,
"learning_rate": 1.0332332985438248e-06,
"loss": 0.1704,
"step": 608
},
{
"epoch": 2.4365821094793056,
"grad_norm": 0.6345644986717863,
"learning_rate": 1.0190469330502928e-06,
"loss": 0.1782,
"step": 609
},
{
"epoch": 2.4405874499332443,
"grad_norm": 0.5806972189010675,
"learning_rate": 1.004947573208756e-06,
"loss": 0.1803,
"step": 610
},
{
"epoch": 2.444592790387183,
"grad_norm": 0.6158669549354134,
"learning_rate": 9.909355271675335e-07,
"loss": 0.1531,
"step": 611
},
{
"epoch": 2.4485981308411215,
"grad_norm": 0.6579929468353193,
"learning_rate": 9.770111011666582e-07,
"loss": 0.1821,
"step": 612
},
{
"epoch": 2.45260347129506,
"grad_norm": 0.5991038573483655,
"learning_rate": 9.631745995311881e-07,
"loss": 0.1658,
"step": 613
},
{
"epoch": 2.456608811748999,
"grad_norm": 0.5782202975221776,
"learning_rate": 9.494263246645474e-07,
"loss": 0.1525,
"step": 614
},
{
"epoch": 2.4606141522029374,
"grad_norm": 0.6234695823453394,
"learning_rate": 9.357665770419244e-07,
"loss": 0.1676,
"step": 615
},
{
"epoch": 2.464619492656876,
"grad_norm": 0.6614765092596231,
"learning_rate": 9.221956552036992e-07,
"loss": 0.1809,
"step": 616
},
{
"epoch": 2.4686248331108143,
"grad_norm": 0.5631502943362868,
"learning_rate": 9.08713855748925e-07,
"loss": 0.1511,
"step": 617
},
{
"epoch": 2.472630173564753,
"grad_norm": 0.6001768275330218,
"learning_rate": 8.953214733288384e-07,
"loss": 0.1685,
"step": 618
},
{
"epoch": 2.4766355140186915,
"grad_norm": 0.6128594108692941,
"learning_rate": 8.820188006404268e-07,
"loss": 0.167,
"step": 619
},
{
"epoch": 2.48064085447263,
"grad_norm": 0.6003517878466088,
"learning_rate": 8.688061284200266e-07,
"loss": 0.16,
"step": 620
},
{
"epoch": 2.484646194926569,
"grad_norm": 0.6125927993221344,
"learning_rate": 8.556837454369698e-07,
"loss": 0.1548,
"step": 621
},
{
"epoch": 2.4886515353805074,
"grad_norm": 0.620566464502616,
"learning_rate": 8.426519384872733e-07,
"loss": 0.1753,
"step": 622
},
{
"epoch": 2.492656875834446,
"grad_norm": 0.6137925843567528,
"learning_rate": 8.297109923873753e-07,
"loss": 0.1552,
"step": 623
},
{
"epoch": 2.4966622162883847,
"grad_norm": 0.5783321494225999,
"learning_rate": 8.168611899679013e-07,
"loss": 0.1643,
"step": 624
},
{
"epoch": 2.5006675567423233,
"grad_norm": 0.565759829851567,
"learning_rate": 8.041028120674894e-07,
"loss": 0.1568,
"step": 625
},
{
"epoch": 2.5046728971962615,
"grad_norm": 0.6438983607349034,
"learning_rate": 7.914361375266505e-07,
"loss": 0.167,
"step": 626
},
{
"epoch": 2.5086782376502,
"grad_norm": 0.7266085897712414,
"learning_rate": 7.788614431816743e-07,
"loss": 0.1775,
"step": 627
},
{
"epoch": 2.512683578104139,
"grad_norm": 0.5951499553068635,
"learning_rate": 7.663790038585794e-07,
"loss": 0.1567,
"step": 628
},
{
"epoch": 2.5166889185580774,
"grad_norm": 0.6024813088753026,
"learning_rate": 7.539890923671061e-07,
"loss": 0.1654,
"step": 629
},
{
"epoch": 2.520694259012016,
"grad_norm": 0.5611519260516766,
"learning_rate": 7.416919794947536e-07,
"loss": 0.1666,
"step": 630
},
{
"epoch": 2.5246995994659547,
"grad_norm": 0.6940972612783466,
"learning_rate": 7.294879340008632e-07,
"loss": 0.1745,
"step": 631
},
{
"epoch": 2.5287049399198933,
"grad_norm": 0.5590592473739715,
"learning_rate": 7.173772226107434e-07,
"loss": 0.1698,
"step": 632
},
{
"epoch": 2.5327102803738315,
"grad_norm": 0.591058116554628,
"learning_rate": 7.053601100098401e-07,
"loss": 0.1671,
"step": 633
},
{
"epoch": 2.5367156208277706,
"grad_norm": 0.6077726003073021,
"learning_rate": 6.934368588379553e-07,
"loss": 0.1847,
"step": 634
},
{
"epoch": 2.5407209612817088,
"grad_norm": 0.5900365979656913,
"learning_rate": 6.816077296835006e-07,
"loss": 0.1632,
"step": 635
},
{
"epoch": 2.5447263017356474,
"grad_norm": 0.6291526388062625,
"learning_rate": 6.698729810778065e-07,
"loss": 0.1669,
"step": 636
},
{
"epoch": 2.548731642189586,
"grad_norm": 0.6405676228401413,
"learning_rate": 6.582328694894729e-07,
"loss": 0.1678,
"step": 637
},
{
"epoch": 2.5527369826435247,
"grad_norm": 0.5916007034338923,
"learning_rate": 6.46687649318759e-07,
"loss": 0.1687,
"step": 638
},
{
"epoch": 2.5567423230974633,
"grad_norm": 0.6197597194779472,
"learning_rate": 6.352375728920285e-07,
"loss": 0.159,
"step": 639
},
{
"epoch": 2.560747663551402,
"grad_norm": 0.6078888459723736,
"learning_rate": 6.238828904562316e-07,
"loss": 0.1818,
"step": 640
},
{
"epoch": 2.5647530040053406,
"grad_norm": 0.5953442248473361,
"learning_rate": 6.126238501734372e-07,
"loss": 0.1747,
"step": 641
},
{
"epoch": 2.568758344459279,
"grad_norm": 0.6197098462200006,
"learning_rate": 6.014606981154086e-07,
"loss": 0.1705,
"step": 642
},
{
"epoch": 2.572763684913218,
"grad_norm": 0.6171410856963269,
"learning_rate": 5.903936782582253e-07,
"loss": 0.1771,
"step": 643
},
{
"epoch": 2.576769025367156,
"grad_norm": 0.5991169552059425,
"learning_rate": 5.794230324769518e-07,
"loss": 0.1669,
"step": 644
},
{
"epoch": 2.5807743658210947,
"grad_norm": 0.6103321073889593,
"learning_rate": 5.685490005403499e-07,
"loss": 0.1726,
"step": 645
},
{
"epoch": 2.5847797062750333,
"grad_norm": 0.6256121075513487,
"learning_rate": 5.577718201056392e-07,
"loss": 0.1557,
"step": 646
},
{
"epoch": 2.588785046728972,
"grad_norm": 0.6204867013864792,
"learning_rate": 5.470917267133041e-07,
"loss": 0.1603,
"step": 647
},
{
"epoch": 2.5927903871829105,
"grad_norm": 0.5859779013723672,
"learning_rate": 5.365089537819435e-07,
"loss": 0.1717,
"step": 648
},
{
"epoch": 2.596795727636849,
"grad_norm": 0.5950814615589916,
"learning_rate": 5.260237326031698e-07,
"loss": 0.1684,
"step": 649
},
{
"epoch": 2.600801068090788,
"grad_norm": 0.5788284827989015,
"learning_rate": 5.156362923365587e-07,
"loss": 0.1748,
"step": 650
},
{
"epoch": 2.6048064085447264,
"grad_norm": 0.6500848201090982,
"learning_rate": 5.053468600046324e-07,
"loss": 0.1551,
"step": 651
},
{
"epoch": 2.608811748998665,
"grad_norm": 0.579677458355888,
"learning_rate": 4.951556604879049e-07,
"loss": 0.1561,
"step": 652
},
{
"epoch": 2.6128170894526033,
"grad_norm": 0.5938327964966817,
"learning_rate": 4.850629165199627e-07,
"loss": 0.1748,
"step": 653
},
{
"epoch": 2.616822429906542,
"grad_norm": 0.5844525078342523,
"learning_rate": 4.7506884868259996e-07,
"loss": 0.1565,
"step": 654
},
{
"epoch": 2.6208277703604805,
"grad_norm": 0.5834653222280398,
"learning_rate": 4.651736754009972e-07,
"loss": 0.1631,
"step": 655
},
{
"epoch": 2.624833110814419,
"grad_norm": 0.5977062556990842,
"learning_rate": 4.5537761293894535e-07,
"loss": 0.1555,
"step": 656
},
{
"epoch": 2.628838451268358,
"grad_norm": 0.595817231192985,
"learning_rate": 4.456808753941205e-07,
"loss": 0.1881,
"step": 657
},
{
"epoch": 2.6328437917222964,
"grad_norm": 0.6038863958838983,
"learning_rate": 4.3608367469340553e-07,
"loss": 0.1611,
"step": 658
},
{
"epoch": 2.636849132176235,
"grad_norm": 0.5765506637261613,
"learning_rate": 4.265862205882559e-07,
"loss": 0.1669,
"step": 659
},
{
"epoch": 2.6408544726301737,
"grad_norm": 0.6081323911406266,
"learning_rate": 4.171887206501191e-07,
"loss": 0.166,
"step": 660
},
{
"epoch": 2.6448598130841123,
"grad_norm": 0.7942617005475607,
"learning_rate": 4.078913802658946e-07,
"loss": 0.1526,
"step": 661
},
{
"epoch": 2.6488651535380505,
"grad_norm": 0.5745245099031036,
"learning_rate": 3.9869440263344714e-07,
"loss": 0.1865,
"step": 662
},
{
"epoch": 2.652870493991989,
"grad_norm": 0.6362782671167186,
"learning_rate": 3.895979887571649e-07,
"loss": 0.1702,
"step": 663
},
{
"epoch": 2.656875834445928,
"grad_norm": 0.6659260281530986,
"learning_rate": 3.8060233744356634e-07,
"loss": 0.1588,
"step": 664
},
{
"epoch": 2.6608811748998664,
"grad_norm": 0.5821725085601599,
"learning_rate": 3.717076452969559e-07,
"loss": 0.1585,
"step": 665
},
{
"epoch": 2.664886515353805,
"grad_norm": 0.5721973191552109,
"learning_rate": 3.6291410671512597e-07,
"loss": 0.1546,
"step": 666
},
{
"epoch": 2.6688918558077437,
"grad_norm": 0.6108240231082995,
"learning_rate": 3.542219138851094e-07,
"loss": 0.165,
"step": 667
},
{
"epoch": 2.6728971962616823,
"grad_norm": 0.5890590495394973,
"learning_rate": 3.4563125677897936e-07,
"loss": 0.1697,
"step": 668
},
{
"epoch": 2.676902536715621,
"grad_norm": 0.5687849267564792,
"learning_rate": 3.371423231496951e-07,
"loss": 0.1737,
"step": 669
},
{
"epoch": 2.6809078771695596,
"grad_norm": 0.6029561874285849,
"learning_rate": 3.287552985270015e-07,
"loss": 0.1618,
"step": 670
},
{
"epoch": 2.6849132176234978,
"grad_norm": 0.6317046685851485,
"learning_rate": 3.204703662133724e-07,
"loss": 0.1761,
"step": 671
},
{
"epoch": 2.688918558077437,
"grad_norm": 0.6617100618302314,
"learning_rate": 3.122877072800046e-07,
"loss": 0.1656,
"step": 672
},
{
"epoch": 2.692923898531375,
"grad_norm": 0.5447463259022025,
"learning_rate": 3.0420750056286195e-07,
"loss": 0.1611,
"step": 673
},
{
"epoch": 2.6969292389853137,
"grad_norm": 0.6232513934357247,
"learning_rate": 2.962299226587639e-07,
"loss": 0.1613,
"step": 674
},
{
"epoch": 2.7009345794392523,
"grad_norm": 0.620106407560748,
"learning_rate": 2.8835514792152854e-07,
"loss": 0.1525,
"step": 675
},
{
"epoch": 2.704939919893191,
"grad_norm": 0.6010185411049328,
"learning_rate": 2.8058334845816214e-07,
"loss": 0.1491,
"step": 676
},
{
"epoch": 2.7089452603471296,
"grad_norm": 0.5685858072848753,
"learning_rate": 2.729146941250954e-07,
"loss": 0.1558,
"step": 677
},
{
"epoch": 2.712950600801068,
"grad_norm": 0.605493206929418,
"learning_rate": 2.653493525244721e-07,
"loss": 0.167,
"step": 678
},
{
"epoch": 2.716955941255007,
"grad_norm": 0.5867378767850222,
"learning_rate": 2.5788748900048676e-07,
"loss": 0.1622,
"step": 679
},
{
"epoch": 2.720961281708945,
"grad_norm": 0.6167298470042248,
"learning_rate": 2.5052926663577006e-07,
"loss": 0.161,
"step": 680
},
{
"epoch": 2.724966622162884,
"grad_norm": 0.6730247619234008,
"learning_rate": 2.4327484624782684e-07,
"loss": 0.159,
"step": 681
},
{
"epoch": 2.7289719626168223,
"grad_norm": 0.5614764326727409,
"learning_rate": 2.3612438638551837e-07,
"loss": 0.173,
"step": 682
},
{
"epoch": 2.732977303070761,
"grad_norm": 0.6302046681068347,
"learning_rate": 2.290780433255979e-07,
"loss": 0.1823,
"step": 683
},
{
"epoch": 2.7369826435246996,
"grad_norm": 0.6339293472571709,
"learning_rate": 2.2213597106929608e-07,
"loss": 0.1653,
"step": 684
},
{
"epoch": 2.740987983978638,
"grad_norm": 0.6169502312976588,
"learning_rate": 2.152983213389559e-07,
"loss": 0.1738,
"step": 685
},
{
"epoch": 2.744993324432577,
"grad_norm": 0.59777733584903,
"learning_rate": 2.085652435747132e-07,
"loss": 0.1728,
"step": 686
},
{
"epoch": 2.7489986648865155,
"grad_norm": 0.6587479139210242,
"learning_rate": 2.0193688493123588e-07,
"loss": 0.1748,
"step": 687
},
{
"epoch": 2.753004005340454,
"grad_norm": 0.5962185804964122,
"learning_rate": 1.9541339027450256e-07,
"loss": 0.1617,
"step": 688
},
{
"epoch": 2.7570093457943923,
"grad_norm": 0.7505303172182166,
"learning_rate": 1.889949021786397e-07,
"loss": 0.1722,
"step": 689
},
{
"epoch": 2.7610146862483314,
"grad_norm": 0.5964493036331348,
"learning_rate": 1.8268156092280498e-07,
"loss": 0.1654,
"step": 690
},
{
"epoch": 2.7650200267022695,
"grad_norm": 0.5762622321084703,
"learning_rate": 1.7647350448812105e-07,
"loss": 0.1583,
"step": 691
},
{
"epoch": 2.769025367156208,
"grad_norm": 0.6801945089414425,
"learning_rate": 1.7037086855465902e-07,
"loss": 0.1768,
"step": 692
},
{
"epoch": 2.773030707610147,
"grad_norm": 0.6911899110033116,
"learning_rate": 1.6437378649847458e-07,
"loss": 0.1732,
"step": 693
},
{
"epoch": 2.7770360480640854,
"grad_norm": 0.6400292646247479,
"learning_rate": 1.5848238938869332e-07,
"loss": 0.1713,
"step": 694
},
{
"epoch": 2.781041388518024,
"grad_norm": 0.6293782671500883,
"learning_rate": 1.5269680598464342e-07,
"loss": 0.1698,
"step": 695
},
{
"epoch": 2.7850467289719627,
"grad_norm": 0.6326255144077011,
"learning_rate": 1.4701716273304524e-07,
"loss": 0.1617,
"step": 696
},
{
"epoch": 2.7890520694259013,
"grad_norm": 0.5877129222871145,
"learning_rate": 1.4144358376524504e-07,
"loss": 0.1736,
"step": 697
},
{
"epoch": 2.7930574098798395,
"grad_norm": 0.5768063266465919,
"learning_rate": 1.3597619089450343e-07,
"loss": 0.1678,
"step": 698
},
{
"epoch": 2.7970627503337786,
"grad_norm": 0.6141881975935009,
"learning_rate": 1.3061510361333186e-07,
"loss": 0.174,
"step": 699
},
{
"epoch": 2.801068090787717,
"grad_norm": 0.6135094476904235,
"learning_rate": 1.253604390908819e-07,
"loss": 0.1643,
"step": 700
},
{
"epoch": 2.8050734312416554,
"grad_norm": 0.5963941238541673,
"learning_rate": 1.2021231217038522e-07,
"loss": 0.1719,
"step": 701
},
{
"epoch": 2.809078771695594,
"grad_norm": 0.6188679804552468,
"learning_rate": 1.1517083536664142e-07,
"loss": 0.1732,
"step": 702
},
{
"epoch": 2.8130841121495327,
"grad_norm": 0.6378524366699763,
"learning_rate": 1.10236118863562e-07,
"loss": 0.1657,
"step": 703
},
{
"epoch": 2.8170894526034713,
"grad_norm": 0.6624926543396782,
"learning_rate": 1.0540827051175817e-07,
"loss": 0.163,
"step": 704
},
{
"epoch": 2.82109479305741,
"grad_norm": 0.6241183789468542,
"learning_rate": 1.0068739582618781e-07,
"loss": 0.1738,
"step": 705
},
{
"epoch": 2.8251001335113486,
"grad_norm": 0.5929380446422712,
"learning_rate": 9.607359798384785e-08,
"loss": 0.1698,
"step": 706
},
{
"epoch": 2.8291054739652868,
"grad_norm": 0.6379777759896461,
"learning_rate": 9.15669778215178e-08,
"loss": 0.178,
"step": 707
},
{
"epoch": 2.833110814419226,
"grad_norm": 0.5934788386620728,
"learning_rate": 8.716763383355863e-08,
"loss": 0.1781,
"step": 708
},
{
"epoch": 2.837116154873164,
"grad_norm": 0.67169486389099,
"learning_rate": 8.287566216975795e-08,
"loss": 0.1468,
"step": 709
},
{
"epoch": 2.8411214953271027,
"grad_norm": 0.5945998400168926,
"learning_rate": 7.869115663322879e-08,
"loss": 0.1677,
"step": 710
},
{
"epoch": 2.8451268357810413,
"grad_norm": 0.6379381928228921,
"learning_rate": 7.461420867836078e-08,
"loss": 0.1596,
"step": 711
},
{
"epoch": 2.84913217623498,
"grad_norm": 0.5694853594582754,
"learning_rate": 7.064490740882057e-08,
"loss": 0.1564,
"step": 712
},
{
"epoch": 2.8531375166889186,
"grad_norm": 0.6431564447838445,
"learning_rate": 6.678333957560513e-08,
"loss": 0.1779,
"step": 713
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.603659494625915,
"learning_rate": 6.302958957514372e-08,
"loss": 0.1703,
"step": 714
},
{
"epoch": 2.861148197596796,
"grad_norm": 0.5731990501391916,
"learning_rate": 5.938373944745612e-08,
"loss": 0.1687,
"step": 715
},
{
"epoch": 2.8651535380507345,
"grad_norm": 0.6437255392307442,
"learning_rate": 5.584586887435739e-08,
"loss": 0.1727,
"step": 716
},
{
"epoch": 2.869158878504673,
"grad_norm": 0.5868868712177298,
"learning_rate": 5.241605517771753e-08,
"loss": 0.1621,
"step": 717
},
{
"epoch": 2.8731642189586113,
"grad_norm": 0.6333270306830352,
"learning_rate": 4.909437331777178e-08,
"loss": 0.1635,
"step": 718
},
{
"epoch": 2.87716955941255,
"grad_norm": 0.6290385820686785,
"learning_rate": 4.588089589148192e-08,
"loss": 0.1642,
"step": 719
},
{
"epoch": 2.8811748998664886,
"grad_norm": 0.6191579999515221,
"learning_rate": 4.2775693130948094e-08,
"loss": 0.1789,
"step": 720
},
{
"epoch": 2.885180240320427,
"grad_norm": 0.5476898057957497,
"learning_rate": 3.977883290187667e-08,
"loss": 0.1506,
"step": 721
},
{
"epoch": 2.889185580774366,
"grad_norm": 0.578059211687489,
"learning_rate": 3.689038070209594e-08,
"loss": 0.1592,
"step": 722
},
{
"epoch": 2.8931909212283045,
"grad_norm": 0.5958692191590165,
"learning_rate": 3.4110399660123306e-08,
"loss": 0.1733,
"step": 723
},
{
"epoch": 2.897196261682243,
"grad_norm": 0.5808682012621369,
"learning_rate": 3.143895053378698e-08,
"loss": 0.151,
"step": 724
},
{
"epoch": 2.9012016021361817,
"grad_norm": 0.5943310418284603,
"learning_rate": 2.8876091708898714e-08,
"loss": 0.1733,
"step": 725
},
{
"epoch": 2.9052069425901204,
"grad_norm": 0.5647474381435812,
"learning_rate": 2.642187919797479e-08,
"loss": 0.1621,
"step": 726
},
{
"epoch": 2.9092122830440585,
"grad_norm": 0.5948502812847656,
"learning_rate": 2.4076366639015914e-08,
"loss": 0.158,
"step": 727
},
{
"epoch": 2.913217623497997,
"grad_norm": 0.595088477171892,
"learning_rate": 2.1839605294330935e-08,
"loss": 0.1754,
"step": 728
},
{
"epoch": 2.917222963951936,
"grad_norm": 0.5612839333804125,
"learning_rate": 1.97116440494205e-08,
"loss": 0.1539,
"step": 729
},
{
"epoch": 2.9212283044058744,
"grad_norm": 0.6167465533535336,
"learning_rate": 1.769252941190458e-08,
"loss": 0.1708,
"step": 730
},
{
"epoch": 2.925233644859813,
"grad_norm": 0.6811415193373171,
"learning_rate": 1.5782305510508855e-08,
"loss": 0.1712,
"step": 731
},
{
"epoch": 2.9292389853137517,
"grad_norm": 0.6295676411628186,
"learning_rate": 1.3981014094099354e-08,
"loss": 0.1606,
"step": 732
},
{
"epoch": 2.9332443257676903,
"grad_norm": 0.5883747534045934,
"learning_rate": 1.2288694530769862e-08,
"loss": 0.1713,
"step": 733
},
{
"epoch": 2.937249666221629,
"grad_norm": 0.6467588103154122,
"learning_rate": 1.0705383806982606e-08,
"loss": 0.1882,
"step": 734
},
{
"epoch": 2.9412550066755676,
"grad_norm": 0.7011732558711389,
"learning_rate": 9.231116526757234e-09,
"loss": 0.1704,
"step": 735
},
{
"epoch": 2.945260347129506,
"grad_norm": 0.6116779172964563,
"learning_rate": 7.865924910916977e-09,
"loss": 0.1698,
"step": 736
},
{
"epoch": 2.9492656875834444,
"grad_norm": 0.6138331601067519,
"learning_rate": 6.609838796385326e-09,
"loss": 0.1621,
"step": 737
},
{
"epoch": 2.953271028037383,
"grad_norm": 0.5884709984985781,
"learning_rate": 5.4628856355293245e-09,
"loss": 0.1704,
"step": 738
},
{
"epoch": 2.9572763684913217,
"grad_norm": 0.6040205984929831,
"learning_rate": 4.4250904955656095e-09,
"loss": 0.1665,
"step": 739
},
{
"epoch": 2.9612817089452603,
"grad_norm": 0.5944128522570793,
"learning_rate": 3.496476058006959e-09,
"loss": 0.1696,
"step": 740
},
{
"epoch": 2.965287049399199,
"grad_norm": 0.5824311819553499,
"learning_rate": 2.6770626181715776e-09,
"loss": 0.1878,
"step": 741
},
{
"epoch": 2.9692923898531376,
"grad_norm": 0.69435894344096,
"learning_rate": 1.9668680847356735e-09,
"loss": 0.1689,
"step": 742
},
{
"epoch": 2.9732977303070762,
"grad_norm": 0.6630378694543846,
"learning_rate": 1.3659079793432173e-09,
"loss": 0.1518,
"step": 743
},
{
"epoch": 2.977303070761015,
"grad_norm": 0.5642162664981942,
"learning_rate": 8.741954362678773e-10,
"loss": 0.1691,
"step": 744
},
{
"epoch": 2.981308411214953,
"grad_norm": 0.5752610766888291,
"learning_rate": 4.91741202124918e-10,
"loss": 0.1637,
"step": 745
},
{
"epoch": 2.985313751668892,
"grad_norm": 0.6316838155198957,
"learning_rate": 2.1855363563638708e-10,
"loss": 0.188,
"step": 746
},
{
"epoch": 2.9893190921228303,
"grad_norm": 0.5868868040589227,
"learning_rate": 5.4638707447929315e-11,
"loss": 0.1496,
"step": 747
},
{
"epoch": 2.9893190921228303,
"step": 747,
"total_flos": 194126978285568.0,
"train_loss": 0.254960169514499,
"train_runtime": 5438.7593,
"train_samples_per_second": 13.217,
"train_steps_per_second": 0.137
}
],
"logging_steps": 1,
"max_steps": 747,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": -747,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 194126978285568.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}