{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.31899641577061,
"eval_steps": 500,
"global_step": 2600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04,
"grad_norm": 240.98789978027344,
"learning_rate": 9.978494623655915e-06,
"loss": 4.1336,
"step": 10
},
{
"epoch": 0.07,
"grad_norm": null,
"learning_rate": 9.953405017921148e-06,
"loss": 3.7709,
"step": 20
},
{
"epoch": 0.11,
"grad_norm": 414.7696533203125,
"learning_rate": 9.917562724014338e-06,
"loss": 3.5379,
"step": 30
},
{
"epoch": 0.14,
"grad_norm": 160.37644958496094,
"learning_rate": 9.881720430107527e-06,
"loss": 3.2512,
"step": 40
},
{
"epoch": 0.18,
"grad_norm": 1477.7020263671875,
"learning_rate": 9.845878136200718e-06,
"loss": 2.8186,
"step": 50
},
{
"epoch": 0.22,
"grad_norm": 444.38775634765625,
"learning_rate": 9.810035842293908e-06,
"loss": 2.5583,
"step": 60
},
{
"epoch": 0.25,
"grad_norm": 119.81842041015625,
"learning_rate": 9.774193548387097e-06,
"loss": 2.4916,
"step": 70
},
{
"epoch": 0.29,
"grad_norm": 175.25006103515625,
"learning_rate": 9.74193548387097e-06,
"loss": 2.4299,
"step": 80
},
{
"epoch": 0.32,
"grad_norm": 116.39012145996094,
"learning_rate": 9.706093189964158e-06,
"loss": 2.2983,
"step": 90
},
{
"epoch": 0.36,
"grad_norm": 313.2846984863281,
"learning_rate": 9.670250896057349e-06,
"loss": 2.233,
"step": 100
},
{
"epoch": 0.39,
"grad_norm": 228.02125549316406,
"learning_rate": 9.634408602150539e-06,
"loss": 2.0787,
"step": 110
},
{
"epoch": 0.43,
"grad_norm": 82.76283264160156,
"learning_rate": 9.598566308243728e-06,
"loss": 2.2126,
"step": 120
},
{
"epoch": 0.47,
"grad_norm": 94.6029052734375,
"learning_rate": 9.562724014336918e-06,
"loss": 2.0342,
"step": 130
},
{
"epoch": 0.5,
"grad_norm": 259.7580871582031,
"learning_rate": 9.526881720430107e-06,
"loss": 1.8917,
"step": 140
},
{
"epoch": 0.54,
"grad_norm": 453.753173828125,
"learning_rate": 9.491039426523298e-06,
"loss": 2.0701,
"step": 150
},
{
"epoch": 0.57,
"grad_norm": 131.5532989501953,
"learning_rate": 9.455197132616488e-06,
"loss": 1.8425,
"step": 160
},
{
"epoch": 0.61,
"grad_norm": 528.49560546875,
"learning_rate": 9.419354838709677e-06,
"loss": 1.7829,
"step": 170
},
{
"epoch": 0.65,
"grad_norm": 510.9673156738281,
"learning_rate": 9.38351254480287e-06,
"loss": 1.8383,
"step": 180
},
{
"epoch": 0.68,
"grad_norm": 140.07981872558594,
"learning_rate": 9.347670250896058e-06,
"loss": 2.0396,
"step": 190
},
{
"epoch": 0.72,
"grad_norm": 284.4012145996094,
"learning_rate": 9.311827956989249e-06,
"loss": 1.8723,
"step": 200
},
{
"epoch": 0.75,
"grad_norm": 105.90425109863281,
"learning_rate": 9.27598566308244e-06,
"loss": 1.8539,
"step": 210
},
{
"epoch": 0.79,
"grad_norm": 77.53498077392578,
"learning_rate": 9.240143369175628e-06,
"loss": 1.8487,
"step": 220
},
{
"epoch": 0.82,
"grad_norm": 203.25697326660156,
"learning_rate": 9.204301075268819e-06,
"loss": 1.7915,
"step": 230
},
{
"epoch": 0.86,
"grad_norm": 167.5474853515625,
"learning_rate": 9.168458781362007e-06,
"loss": 1.6751,
"step": 240
},
{
"epoch": 0.9,
"grad_norm": 141.84117126464844,
"learning_rate": 9.132616487455198e-06,
"loss": 1.7533,
"step": 250
},
{
"epoch": 0.93,
"grad_norm": 170.94772338867188,
"learning_rate": 9.096774193548388e-06,
"loss": 1.6008,
"step": 260
},
{
"epoch": 0.97,
"grad_norm": 155.2710418701172,
"learning_rate": 9.060931899641577e-06,
"loss": 1.7155,
"step": 270
},
{
"epoch": 1.0,
"grad_norm": 165.6481170654297,
"learning_rate": 9.025089605734768e-06,
"loss": 1.6922,
"step": 280
},
{
"epoch": 1.04,
"grad_norm": 108.59546661376953,
"learning_rate": 8.989247311827958e-06,
"loss": 1.737,
"step": 290
},
{
"epoch": 1.08,
"grad_norm": 119.43255615234375,
"learning_rate": 8.953405017921147e-06,
"loss": 1.5624,
"step": 300
},
{
"epoch": 1.11,
"grad_norm": 203.9807891845703,
"learning_rate": 8.917562724014338e-06,
"loss": 1.5555,
"step": 310
},
{
"epoch": 1.15,
"grad_norm": 196.70803833007812,
"learning_rate": 8.881720430107528e-06,
"loss": 1.4945,
"step": 320
},
{
"epoch": 1.18,
"grad_norm": 218.3573760986328,
"learning_rate": 8.845878136200717e-06,
"loss": 1.6784,
"step": 330
},
{
"epoch": 1.22,
"grad_norm": 218.7974395751953,
"learning_rate": 8.810035842293907e-06,
"loss": 1.5671,
"step": 340
},
{
"epoch": 1.25,
"grad_norm": 110.03742980957031,
"learning_rate": 8.774193548387098e-06,
"loss": 1.5509,
"step": 350
},
{
"epoch": 1.29,
"grad_norm": 536.1112060546875,
"learning_rate": 8.738351254480287e-06,
"loss": 1.5836,
"step": 360
},
{
"epoch": 1.33,
"grad_norm": 91.13652038574219,
"learning_rate": 8.702508960573477e-06,
"loss": 1.5235,
"step": 370
},
{
"epoch": 1.36,
"grad_norm": 111.45140075683594,
"learning_rate": 8.666666666666668e-06,
"loss": 1.5352,
"step": 380
},
{
"epoch": 1.4,
"grad_norm": 83.86921691894531,
"learning_rate": 8.630824372759857e-06,
"loss": 1.5994,
"step": 390
},
{
"epoch": 1.43,
"grad_norm": 64.33961486816406,
"learning_rate": 8.594982078853047e-06,
"loss": 1.5309,
"step": 400
},
{
"epoch": 1.47,
"grad_norm": 74.24256134033203,
"learning_rate": 8.559139784946238e-06,
"loss": 1.5243,
"step": 410
},
{
"epoch": 1.51,
"grad_norm": 88.8588638305664,
"learning_rate": 8.523297491039427e-06,
"loss": 1.4779,
"step": 420
},
{
"epoch": 1.54,
"grad_norm": 99.83770751953125,
"learning_rate": 8.487455197132617e-06,
"loss": 1.4671,
"step": 430
},
{
"epoch": 1.58,
"grad_norm": 219.4841766357422,
"learning_rate": 8.451612903225808e-06,
"loss": 1.4821,
"step": 440
},
{
"epoch": 1.61,
"grad_norm": 145.25010681152344,
"learning_rate": 8.415770609318998e-06,
"loss": 1.5258,
"step": 450
},
{
"epoch": 1.65,
"grad_norm": 109.16207885742188,
"learning_rate": 8.379928315412187e-06,
"loss": 1.4973,
"step": 460
},
{
"epoch": 1.68,
"grad_norm": 385.5920104980469,
"learning_rate": 8.344086021505376e-06,
"loss": 1.5602,
"step": 470
},
{
"epoch": 1.72,
"grad_norm": 78.8775634765625,
"learning_rate": 8.308243727598568e-06,
"loss": 1.4304,
"step": 480
},
{
"epoch": 1.76,
"grad_norm": 54.696563720703125,
"learning_rate": 8.272401433691757e-06,
"loss": 1.4581,
"step": 490
},
{
"epoch": 1.79,
"grad_norm": 145.9872283935547,
"learning_rate": 8.236559139784947e-06,
"loss": 1.4707,
"step": 500
},
{
"epoch": 1.83,
"grad_norm": 86.8324203491211,
"learning_rate": 8.200716845878138e-06,
"loss": 1.4593,
"step": 510
},
{
"epoch": 1.86,
"grad_norm": 328.5557556152344,
"learning_rate": 8.164874551971327e-06,
"loss": 1.4688,
"step": 520
},
{
"epoch": 1.9,
"grad_norm": 126.05339813232422,
"learning_rate": 8.129032258064517e-06,
"loss": 1.5903,
"step": 530
},
{
"epoch": 1.94,
"grad_norm": 216.65573120117188,
"learning_rate": 8.093189964157708e-06,
"loss": 1.6609,
"step": 540
},
{
"epoch": 1.97,
"grad_norm": 150.45306396484375,
"learning_rate": 8.057347670250897e-06,
"loss": 1.5142,
"step": 550
},
{
"epoch": 2.01,
"grad_norm": 84.9225845336914,
"learning_rate": 8.021505376344087e-06,
"loss": 1.5273,
"step": 560
},
{
"epoch": 2.04,
"grad_norm": 63.36981201171875,
"learning_rate": 7.985663082437278e-06,
"loss": 1.4743,
"step": 570
},
{
"epoch": 2.08,
"grad_norm": 70.1873779296875,
"learning_rate": 7.949820788530466e-06,
"loss": 1.5223,
"step": 580
},
{
"epoch": 2.11,
"grad_norm": 208.42889404296875,
"learning_rate": 7.913978494623657e-06,
"loss": 1.4587,
"step": 590
},
{
"epoch": 2.15,
"grad_norm": 59.578243255615234,
"learning_rate": 7.878136200716846e-06,
"loss": 1.3852,
"step": 600
},
{
"epoch": 2.19,
"grad_norm": 121.19114685058594,
"learning_rate": 7.842293906810036e-06,
"loss": 1.3795,
"step": 610
},
{
"epoch": 2.22,
"grad_norm": 81.8535385131836,
"learning_rate": 7.806451612903227e-06,
"loss": 1.3634,
"step": 620
},
{
"epoch": 2.26,
"grad_norm": 171.9092559814453,
"learning_rate": 7.770609318996416e-06,
"loss": 1.4038,
"step": 630
},
{
"epoch": 2.29,
"grad_norm": 189.8422393798828,
"learning_rate": 7.734767025089606e-06,
"loss": 1.359,
"step": 640
},
{
"epoch": 2.33,
"grad_norm": 191.68310546875,
"learning_rate": 7.698924731182797e-06,
"loss": 1.4473,
"step": 650
},
{
"epoch": 2.37,
"grad_norm": 60.011138916015625,
"learning_rate": 7.663082437275985e-06,
"loss": 1.3605,
"step": 660
},
{
"epoch": 2.4,
"grad_norm": 85.5229721069336,
"learning_rate": 7.627240143369177e-06,
"loss": 1.3775,
"step": 670
},
{
"epoch": 2.44,
"grad_norm": 72.5903549194336,
"learning_rate": 7.5913978494623665e-06,
"loss": 1.3748,
"step": 680
},
{
"epoch": 2.47,
"grad_norm": 246.1010284423828,
"learning_rate": 7.555555555555556e-06,
"loss": 1.3974,
"step": 690
},
{
"epoch": 2.51,
"grad_norm": 70.74227905273438,
"learning_rate": 7.519713261648746e-06,
"loss": 1.352,
"step": 700
},
{
"epoch": 2.54,
"grad_norm": 131.19888305664062,
"learning_rate": 7.483870967741936e-06,
"loss": 1.3262,
"step": 710
},
{
"epoch": 2.58,
"grad_norm": 178.96922302246094,
"learning_rate": 7.448028673835126e-06,
"loss": 1.3966,
"step": 720
},
{
"epoch": 2.62,
"grad_norm": 173.15139770507812,
"learning_rate": 7.412186379928316e-06,
"loss": 1.3522,
"step": 730
},
{
"epoch": 2.65,
"grad_norm": 139.38653564453125,
"learning_rate": 7.376344086021506e-06,
"loss": 1.3824,
"step": 740
},
{
"epoch": 2.69,
"grad_norm": 91.99787902832031,
"learning_rate": 7.340501792114696e-06,
"loss": 1.3789,
"step": 750
},
{
"epoch": 2.72,
"grad_norm": 113.1898193359375,
"learning_rate": 7.3046594982078856e-06,
"loss": 1.4338,
"step": 760
},
{
"epoch": 2.76,
"grad_norm": 92.90975952148438,
"learning_rate": 7.268817204301076e-06,
"loss": 1.3135,
"step": 770
},
{
"epoch": 2.8,
"grad_norm": 62.657371520996094,
"learning_rate": 7.232974910394266e-06,
"loss": 1.3329,
"step": 780
},
{
"epoch": 2.83,
"grad_norm": 44.4366340637207,
"learning_rate": 7.1971326164874554e-06,
"loss": 1.372,
"step": 790
},
{
"epoch": 2.87,
"grad_norm": 73.2843017578125,
"learning_rate": 7.161290322580646e-06,
"loss": 1.4003,
"step": 800
},
{
"epoch": 2.9,
"grad_norm": 33.83133316040039,
"learning_rate": 7.125448028673836e-06,
"loss": 1.3719,
"step": 810
},
{
"epoch": 2.94,
"grad_norm": 95.57901763916016,
"learning_rate": 7.089605734767025e-06,
"loss": 1.3175,
"step": 820
},
{
"epoch": 2.97,
"grad_norm": 90.033935546875,
"learning_rate": 7.053763440860215e-06,
"loss": 1.3899,
"step": 830
},
{
"epoch": 3.01,
"grad_norm": 73.42399597167969,
"learning_rate": 7.0179211469534055e-06,
"loss": 1.3479,
"step": 840
},
{
"epoch": 3.05,
"grad_norm": 90.77163696289062,
"learning_rate": 6.982078853046595e-06,
"loss": 1.3747,
"step": 850
},
{
"epoch": 3.08,
"grad_norm": 73.06351470947266,
"learning_rate": 6.946236559139785e-06,
"loss": 1.3437,
"step": 860
},
{
"epoch": 3.12,
"grad_norm": 177.39906311035156,
"learning_rate": 6.910394265232976e-06,
"loss": 1.3008,
"step": 870
},
{
"epoch": 3.15,
"grad_norm": 49.48398208618164,
"learning_rate": 6.874551971326166e-06,
"loss": 1.3858,
"step": 880
},
{
"epoch": 3.19,
"grad_norm": 60.6556396484375,
"learning_rate": 6.838709677419355e-06,
"loss": 1.3837,
"step": 890
},
{
"epoch": 3.23,
"grad_norm": 133.91407775878906,
"learning_rate": 6.802867383512546e-06,
"loss": 1.3502,
"step": 900
},
{
"epoch": 3.26,
"grad_norm": 100.1251449584961,
"learning_rate": 6.767025089605736e-06,
"loss": 1.3186,
"step": 910
},
{
"epoch": 3.3,
"grad_norm": 56.571807861328125,
"learning_rate": 6.731182795698925e-06,
"loss": 1.2846,
"step": 920
},
{
"epoch": 3.33,
"grad_norm": 37.74870681762695,
"learning_rate": 6.695340501792115e-06,
"loss": 1.3624,
"step": 930
},
{
"epoch": 3.37,
"grad_norm": 334.59722900390625,
"learning_rate": 6.659498207885306e-06,
"loss": 1.3377,
"step": 940
},
{
"epoch": 3.41,
"grad_norm": 67.167724609375,
"learning_rate": 6.623655913978495e-06,
"loss": 1.3796,
"step": 950
},
{
"epoch": 3.44,
"grad_norm": 46.475154876708984,
"learning_rate": 6.587813620071685e-06,
"loss": 1.3168,
"step": 960
},
{
"epoch": 3.48,
"grad_norm": 94.3563232421875,
"learning_rate": 6.5519713261648755e-06,
"loss": 1.2965,
"step": 970
},
{
"epoch": 3.51,
"grad_norm": 49.33637619018555,
"learning_rate": 6.516129032258065e-06,
"loss": 1.2851,
"step": 980
},
{
"epoch": 3.55,
"grad_norm": 78.11804962158203,
"learning_rate": 6.480286738351255e-06,
"loss": 1.4281,
"step": 990
},
{
"epoch": 3.58,
"grad_norm": 135.7293243408203,
"learning_rate": 6.444444444444445e-06,
"loss": 1.361,
"step": 1000
},
{
"epoch": 3.62,
"grad_norm": 449.07757568359375,
"learning_rate": 6.408602150537635e-06,
"loss": 1.3035,
"step": 1010
},
{
"epoch": 3.66,
"grad_norm": 50.195037841796875,
"learning_rate": 6.372759856630825e-06,
"loss": 1.3036,
"step": 1020
},
{
"epoch": 3.69,
"grad_norm": 51.2278938293457,
"learning_rate": 6.336917562724015e-06,
"loss": 1.3152,
"step": 1030
},
{
"epoch": 3.73,
"grad_norm": 115.5573501586914,
"learning_rate": 6.301075268817205e-06,
"loss": 1.2863,
"step": 1040
},
{
"epoch": 3.76,
"grad_norm": 48.20037078857422,
"learning_rate": 6.2652329749103945e-06,
"loss": 1.289,
"step": 1050
},
{
"epoch": 3.8,
"grad_norm": 82.9049301147461,
"learning_rate": 6.229390681003584e-06,
"loss": 1.3162,
"step": 1060
},
{
"epoch": 3.84,
"grad_norm": 49.85783386230469,
"learning_rate": 6.193548387096775e-06,
"loss": 1.439,
"step": 1070
},
{
"epoch": 3.87,
"grad_norm": 72.38436126708984,
"learning_rate": 6.157706093189964e-06,
"loss": 1.3553,
"step": 1080
},
{
"epoch": 3.91,
"grad_norm": 69.86207580566406,
"learning_rate": 6.121863799283154e-06,
"loss": 1.3005,
"step": 1090
},
{
"epoch": 3.94,
"grad_norm": 121.75460052490234,
"learning_rate": 6.086021505376345e-06,
"loss": 1.3552,
"step": 1100
},
{
"epoch": 3.98,
"grad_norm": 74.01500701904297,
"learning_rate": 6.050179211469534e-06,
"loss": 1.2324,
"step": 1110
},
{
"epoch": 4.01,
"grad_norm": 120.46449279785156,
"learning_rate": 6.014336917562724e-06,
"loss": 1.2653,
"step": 1120
},
{
"epoch": 4.05,
"grad_norm": 129.35137939453125,
"learning_rate": 5.978494623655915e-06,
"loss": 1.2437,
"step": 1130
},
{
"epoch": 4.09,
"grad_norm": 114.84891510009766,
"learning_rate": 5.942652329749104e-06,
"loss": 1.2631,
"step": 1140
},
{
"epoch": 4.12,
"grad_norm": 48.180904388427734,
"learning_rate": 5.906810035842294e-06,
"loss": 1.2735,
"step": 1150
},
{
"epoch": 4.16,
"grad_norm": 40.2717170715332,
"learning_rate": 5.8709677419354835e-06,
"loss": 1.2728,
"step": 1160
},
{
"epoch": 4.19,
"grad_norm": 71.15570068359375,
"learning_rate": 5.835125448028675e-06,
"loss": 1.3059,
"step": 1170
},
{
"epoch": 4.23,
"grad_norm": 62.20613479614258,
"learning_rate": 5.7992831541218645e-06,
"loss": 1.2574,
"step": 1180
},
{
"epoch": 4.27,
"grad_norm": 83.15790557861328,
"learning_rate": 5.763440860215054e-06,
"loss": 1.3026,
"step": 1190
},
{
"epoch": 4.3,
"grad_norm": 79.50647735595703,
"learning_rate": 5.727598566308245e-06,
"loss": 1.237,
"step": 1200
},
{
"epoch": 4.34,
"grad_norm": 69.64093780517578,
"learning_rate": 5.691756272401434e-06,
"loss": 1.2479,
"step": 1210
},
{
"epoch": 4.37,
"grad_norm": 40.61994171142578,
"learning_rate": 5.655913978494624e-06,
"loss": 1.2842,
"step": 1220
},
{
"epoch": 4.41,
"grad_norm": 53.86878967285156,
"learning_rate": 5.620071684587815e-06,
"loss": 1.267,
"step": 1230
},
{
"epoch": 4.44,
"grad_norm": 53.299129486083984,
"learning_rate": 5.584229390681004e-06,
"loss": 1.2256,
"step": 1240
},
{
"epoch": 4.48,
"grad_norm": 124.71385192871094,
"learning_rate": 5.548387096774194e-06,
"loss": 1.245,
"step": 1250
},
{
"epoch": 4.52,
"grad_norm": 48.88494873046875,
"learning_rate": 5.5125448028673844e-06,
"loss": 1.2428,
"step": 1260
},
{
"epoch": 4.55,
"grad_norm": 34.74821472167969,
"learning_rate": 5.476702508960574e-06,
"loss": 1.241,
"step": 1270
},
{
"epoch": 4.59,
"grad_norm": 92.53400421142578,
"learning_rate": 5.440860215053764e-06,
"loss": 1.2655,
"step": 1280
},
{
"epoch": 4.62,
"grad_norm": 79.42727661132812,
"learning_rate": 5.4050179211469535e-06,
"loss": 1.2246,
"step": 1290
},
{
"epoch": 4.66,
"grad_norm": 42.84294891357422,
"learning_rate": 5.369175627240144e-06,
"loss": 1.234,
"step": 1300
},
{
"epoch": 4.7,
"grad_norm": 43.57499313354492,
"learning_rate": 5.333333333333334e-06,
"loss": 1.2637,
"step": 1310
},
{
"epoch": 4.73,
"grad_norm": 141.61326599121094,
"learning_rate": 5.297491039426523e-06,
"loss": 1.2883,
"step": 1320
},
{
"epoch": 4.77,
"grad_norm": 71.41532897949219,
"learning_rate": 5.261648745519714e-06,
"loss": 1.2633,
"step": 1330
},
{
"epoch": 4.8,
"grad_norm": 56.57035827636719,
"learning_rate": 5.2258064516129035e-06,
"loss": 1.238,
"step": 1340
},
{
"epoch": 4.84,
"grad_norm": 38.39865493774414,
"learning_rate": 5.189964157706093e-06,
"loss": 1.2587,
"step": 1350
},
{
"epoch": 4.87,
"grad_norm": 45.26354217529297,
"learning_rate": 5.154121863799284e-06,
"loss": 1.2644,
"step": 1360
},
{
"epoch": 4.91,
"grad_norm": 62.85947799682617,
"learning_rate": 5.118279569892473e-06,
"loss": 1.3197,
"step": 1370
},
{
"epoch": 4.95,
"grad_norm": 171.12010192871094,
"learning_rate": 5.082437275985663e-06,
"loss": 1.2333,
"step": 1380
},
{
"epoch": 4.98,
"grad_norm": 38.02899932861328,
"learning_rate": 5.0465949820788544e-06,
"loss": 1.2987,
"step": 1390
},
{
"epoch": 5.02,
"grad_norm": 47.150367736816406,
"learning_rate": 5.010752688172043e-06,
"loss": 1.2651,
"step": 1400
},
{
"epoch": 5.05,
"grad_norm": 50.13650894165039,
"learning_rate": 4.974910394265233e-06,
"loss": 1.224,
"step": 1410
},
{
"epoch": 5.09,
"grad_norm": 56.77107238769531,
"learning_rate": 4.9390681003584234e-06,
"loss": 1.1944,
"step": 1420
},
{
"epoch": 5.13,
"grad_norm": 45.99001693725586,
"learning_rate": 4.903225806451613e-06,
"loss": 1.2481,
"step": 1430
},
{
"epoch": 5.16,
"grad_norm": 94.43379974365234,
"learning_rate": 4.867383512544804e-06,
"loss": 1.2333,
"step": 1440
},
{
"epoch": 5.2,
"grad_norm": 92.9940414428711,
"learning_rate": 4.831541218637993e-06,
"loss": 1.2503,
"step": 1450
},
{
"epoch": 5.23,
"grad_norm": 62.211280822753906,
"learning_rate": 4.795698924731183e-06,
"loss": 1.238,
"step": 1460
},
{
"epoch": 5.27,
"grad_norm": 56.011722564697266,
"learning_rate": 4.7598566308243735e-06,
"loss": 1.2228,
"step": 1470
},
{
"epoch": 5.3,
"grad_norm": 131.8081817626953,
"learning_rate": 4.724014336917563e-06,
"loss": 1.1854,
"step": 1480
},
{
"epoch": 5.34,
"grad_norm": 38.84933090209961,
"learning_rate": 4.688172043010753e-06,
"loss": 1.2052,
"step": 1490
},
{
"epoch": 5.38,
"grad_norm": 53.045658111572266,
"learning_rate": 4.652329749103943e-06,
"loss": 1.3113,
"step": 1500
},
{
"epoch": 5.41,
"grad_norm": 48.19401931762695,
"learning_rate": 4.616487455197133e-06,
"loss": 1.2192,
"step": 1510
},
{
"epoch": 5.45,
"grad_norm": 31.12712860107422,
"learning_rate": 4.580645161290323e-06,
"loss": 1.2003,
"step": 1520
},
{
"epoch": 5.48,
"grad_norm": 2096.69091796875,
"learning_rate": 4.544802867383513e-06,
"loss": 1.2451,
"step": 1530
},
{
"epoch": 5.52,
"grad_norm": 76.25735473632812,
"learning_rate": 4.508960573476703e-06,
"loss": 1.2152,
"step": 1540
},
{
"epoch": 5.56,
"grad_norm": 66.96879577636719,
"learning_rate": 4.473118279569893e-06,
"loss": 1.1985,
"step": 1550
},
{
"epoch": 5.59,
"grad_norm": 74.71087646484375,
"learning_rate": 4.437275985663082e-06,
"loss": 1.2033,
"step": 1560
},
{
"epoch": 5.63,
"grad_norm": 64.05677795410156,
"learning_rate": 4.401433691756273e-06,
"loss": 1.2461,
"step": 1570
},
{
"epoch": 5.66,
"grad_norm": 35.51209259033203,
"learning_rate": 4.365591397849463e-06,
"loss": 1.2277,
"step": 1580
},
{
"epoch": 5.7,
"grad_norm": 50.55402374267578,
"learning_rate": 4.329749103942653e-06,
"loss": 1.2312,
"step": 1590
},
{
"epoch": 5.73,
"grad_norm": 39.16292953491211,
"learning_rate": 4.293906810035843e-06,
"loss": 1.1947,
"step": 1600
},
{
"epoch": 5.77,
"grad_norm": 108.2676773071289,
"learning_rate": 4.258064516129032e-06,
"loss": 1.2399,
"step": 1610
},
{
"epoch": 5.81,
"grad_norm": 37.47825622558594,
"learning_rate": 4.222222222222223e-06,
"loss": 1.2542,
"step": 1620
},
{
"epoch": 5.84,
"grad_norm": 41.715301513671875,
"learning_rate": 4.1863799283154125e-06,
"loss": 1.1657,
"step": 1630
},
{
"epoch": 5.88,
"grad_norm": 70.7906723022461,
"learning_rate": 4.150537634408602e-06,
"loss": 1.18,
"step": 1640
},
{
"epoch": 5.91,
"grad_norm": 73.00814056396484,
"learning_rate": 4.114695340501793e-06,
"loss": 1.1753,
"step": 1650
},
{
"epoch": 5.95,
"grad_norm": 40.9326286315918,
"learning_rate": 4.078853046594982e-06,
"loss": 1.1954,
"step": 1660
},
{
"epoch": 5.99,
"grad_norm": 71.95938873291016,
"learning_rate": 4.043010752688172e-06,
"loss": 1.239,
"step": 1670
},
{
"epoch": 6.02,
"grad_norm": 35.70452880859375,
"learning_rate": 4.0071684587813626e-06,
"loss": 1.2607,
"step": 1680
},
{
"epoch": 6.06,
"grad_norm": 33.75741958618164,
"learning_rate": 3.971326164874552e-06,
"loss": 1.2169,
"step": 1690
},
{
"epoch": 6.09,
"grad_norm": 39.85159683227539,
"learning_rate": 3.935483870967742e-06,
"loss": 1.198,
"step": 1700
},
{
"epoch": 6.13,
"grad_norm": 51.76079177856445,
"learning_rate": 3.8996415770609324e-06,
"loss": 1.2028,
"step": 1710
},
{
"epoch": 6.16,
"grad_norm": 41.39125061035156,
"learning_rate": 3.863799283154122e-06,
"loss": 1.2141,
"step": 1720
},
{
"epoch": 6.2,
"grad_norm": 53.27019119262695,
"learning_rate": 3.827956989247313e-06,
"loss": 1.1815,
"step": 1730
},
{
"epoch": 6.24,
"grad_norm": 54.405303955078125,
"learning_rate": 3.792114695340502e-06,
"loss": 1.2037,
"step": 1740
},
{
"epoch": 6.27,
"grad_norm": 60.07841491699219,
"learning_rate": 3.756272401433692e-06,
"loss": 1.2098,
"step": 1750
},
{
"epoch": 6.31,
"grad_norm": 82.04875183105469,
"learning_rate": 3.720430107526882e-06,
"loss": 1.1644,
"step": 1760
},
{
"epoch": 6.34,
"grad_norm": 26.22403335571289,
"learning_rate": 3.6845878136200717e-06,
"loss": 1.1614,
"step": 1770
},
{
"epoch": 6.38,
"grad_norm": 48.65098190307617,
"learning_rate": 3.6487455197132623e-06,
"loss": 1.2599,
"step": 1780
},
{
"epoch": 6.42,
"grad_norm": 42.07708740234375,
"learning_rate": 3.6129032258064515e-06,
"loss": 1.0894,
"step": 1790
},
{
"epoch": 6.45,
"grad_norm": 36.60328674316406,
"learning_rate": 3.577060931899642e-06,
"loss": 1.1918,
"step": 1800
},
{
"epoch": 6.49,
"grad_norm": 61.540828704833984,
"learning_rate": 3.541218637992832e-06,
"loss": 1.1573,
"step": 1810
},
{
"epoch": 6.52,
"grad_norm": 55.06793975830078,
"learning_rate": 3.505376344086022e-06,
"loss": 1.2161,
"step": 1820
},
{
"epoch": 6.56,
"grad_norm": 52.780025482177734,
"learning_rate": 3.469534050179212e-06,
"loss": 1.2136,
"step": 1830
},
{
"epoch": 6.59,
"grad_norm": 36.805023193359375,
"learning_rate": 3.4336917562724016e-06,
"loss": 1.2059,
"step": 1840
},
{
"epoch": 6.63,
"grad_norm": 34.84747314453125,
"learning_rate": 3.3978494623655917e-06,
"loss": 1.1723,
"step": 1850
},
{
"epoch": 6.67,
"grad_norm": 29.03413200378418,
"learning_rate": 3.3620071684587818e-06,
"loss": 1.2304,
"step": 1860
},
{
"epoch": 6.7,
"grad_norm": 43.57373809814453,
"learning_rate": 3.3261648745519714e-06,
"loss": 1.2636,
"step": 1870
},
{
"epoch": 6.74,
"grad_norm": 40.338401794433594,
"learning_rate": 3.2903225806451615e-06,
"loss": 1.201,
"step": 1880
},
{
"epoch": 6.77,
"grad_norm": 49.39339828491211,
"learning_rate": 3.254480286738351e-06,
"loss": 1.1907,
"step": 1890
},
{
"epoch": 6.81,
"grad_norm": 44.37787628173828,
"learning_rate": 3.2186379928315413e-06,
"loss": 1.1608,
"step": 1900
},
{
"epoch": 6.85,
"grad_norm": 30.73851776123047,
"learning_rate": 3.182795698924732e-06,
"loss": 1.0991,
"step": 1910
},
{
"epoch": 6.88,
"grad_norm": 75.66316986083984,
"learning_rate": 3.146953405017921e-06,
"loss": 1.2072,
"step": 1920
},
{
"epoch": 6.92,
"grad_norm": 79.0891342163086,
"learning_rate": 3.1111111111111116e-06,
"loss": 1.1899,
"step": 1930
},
{
"epoch": 6.95,
"grad_norm": 44.473907470703125,
"learning_rate": 3.0752688172043017e-06,
"loss": 1.2603,
"step": 1940
},
{
"epoch": 6.99,
"grad_norm": 70.97260284423828,
"learning_rate": 3.0394265232974914e-06,
"loss": 1.1917,
"step": 1950
},
{
"epoch": 7.03,
"grad_norm": 113.41941833496094,
"learning_rate": 3.0035842293906814e-06,
"loss": 1.2268,
"step": 1960
},
{
"epoch": 7.06,
"grad_norm": 31.316911697387695,
"learning_rate": 2.967741935483871e-06,
"loss": 1.1049,
"step": 1970
},
{
"epoch": 7.1,
"grad_norm": 35.60115051269531,
"learning_rate": 2.9318996415770612e-06,
"loss": 1.1542,
"step": 1980
},
{
"epoch": 7.13,
"grad_norm": 57.42852783203125,
"learning_rate": 2.8960573476702513e-06,
"loss": 1.2427,
"step": 1990
},
{
"epoch": 7.17,
"grad_norm": 40.23640441894531,
"learning_rate": 2.860215053763441e-06,
"loss": 1.1448,
"step": 2000
},
{
"epoch": 7.2,
"grad_norm": 36.7147102355957,
"learning_rate": 2.824372759856631e-06,
"loss": 1.1392,
"step": 2010
},
{
"epoch": 7.24,
"grad_norm": 33.4290657043457,
"learning_rate": 2.7885304659498208e-06,
"loss": 1.1704,
"step": 2020
},
{
"epoch": 7.28,
"grad_norm": 56.32290267944336,
"learning_rate": 2.752688172043011e-06,
"loss": 1.1495,
"step": 2030
},
{
"epoch": 7.31,
"grad_norm": 43.89424133300781,
"learning_rate": 2.716845878136201e-06,
"loss": 1.1387,
"step": 2040
},
{
"epoch": 7.35,
"grad_norm": 39.107975006103516,
"learning_rate": 2.6810035842293906e-06,
"loss": 1.3479,
"step": 2050
},
{
"epoch": 7.38,
"grad_norm": 56.70566177368164,
"learning_rate": 2.645161290322581e-06,
"loss": 1.1315,
"step": 2060
},
{
"epoch": 7.42,
"grad_norm": 28.802082061767578,
"learning_rate": 2.6093189964157704e-06,
"loss": 1.1935,
"step": 2070
},
{
"epoch": 7.46,
"grad_norm": 48.39360809326172,
"learning_rate": 2.573476702508961e-06,
"loss": 1.2482,
"step": 2080
},
{
"epoch": 7.49,
"grad_norm": 32.54924774169922,
"learning_rate": 2.537634408602151e-06,
"loss": 1.2273,
"step": 2090
},
{
"epoch": 7.53,
"grad_norm": 39.42136764526367,
"learning_rate": 2.5017921146953407e-06,
"loss": 1.1796,
"step": 2100
},
{
"epoch": 7.56,
"grad_norm": 45.770755767822266,
"learning_rate": 2.4659498207885308e-06,
"loss": 1.1732,
"step": 2110
},
{
"epoch": 7.6,
"grad_norm": 38.276832580566406,
"learning_rate": 2.4301075268817204e-06,
"loss": 1.1732,
"step": 2120
},
{
"epoch": 7.63,
"grad_norm": 41.075775146484375,
"learning_rate": 2.3942652329749105e-06,
"loss": 1.1713,
"step": 2130
},
{
"epoch": 7.67,
"grad_norm": 65.88035583496094,
"learning_rate": 2.3584229390681006e-06,
"loss": 1.1167,
"step": 2140
},
{
"epoch": 7.71,
"grad_norm": 36.76555633544922,
"learning_rate": 2.3225806451612907e-06,
"loss": 1.1947,
"step": 2150
},
{
"epoch": 7.74,
"grad_norm": 60.711055755615234,
"learning_rate": 2.2867383512544804e-06,
"loss": 1.1164,
"step": 2160
},
{
"epoch": 7.78,
"grad_norm": 37.08338928222656,
"learning_rate": 2.2508960573476705e-06,
"loss": 1.1199,
"step": 2170
},
{
"epoch": 7.81,
"grad_norm": 41.63785171508789,
"learning_rate": 2.21505376344086e-06,
"loss": 1.2048,
"step": 2180
},
{
"epoch": 7.85,
"grad_norm": 62.35722351074219,
"learning_rate": 2.1792114695340507e-06,
"loss": 1.2011,
"step": 2190
},
{
"epoch": 7.89,
"grad_norm": 44.37682342529297,
"learning_rate": 2.1433691756272404e-06,
"loss": 1.1709,
"step": 2200
},
{
"epoch": 7.92,
"grad_norm": 40.216739654541016,
"learning_rate": 2.1075268817204305e-06,
"loss": 1.1959,
"step": 2210
},
{
"epoch": 7.96,
"grad_norm": 35.443302154541016,
"learning_rate": 2.07168458781362e-06,
"loss": 1.1123,
"step": 2220
},
{
"epoch": 7.99,
"grad_norm": 59.596954345703125,
"learning_rate": 2.0358422939068102e-06,
"loss": 1.1139,
"step": 2230
},
{
"epoch": 8.03,
"grad_norm": 34.79387283325195,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.1845,
"step": 2240
},
{
"epoch": 8.06,
"grad_norm": 50.571712493896484,
"learning_rate": 1.96415770609319e-06,
"loss": 1.1528,
"step": 2250
},
{
"epoch": 8.1,
"grad_norm": 33.95150375366211,
"learning_rate": 1.92831541218638e-06,
"loss": 1.1858,
"step": 2260
},
{
"epoch": 8.14,
"grad_norm": 64.71048736572266,
"learning_rate": 1.89247311827957e-06,
"loss": 1.1199,
"step": 2270
},
{
"epoch": 8.17,
"grad_norm": 76.19322967529297,
"learning_rate": 1.8566308243727599e-06,
"loss": 1.1457,
"step": 2280
},
{
"epoch": 8.21,
"grad_norm": 40.506675720214844,
"learning_rate": 1.82078853046595e-06,
"loss": 1.1886,
"step": 2290
},
{
"epoch": 8.24,
"grad_norm": 34.859432220458984,
"learning_rate": 1.7849462365591399e-06,
"loss": 1.1133,
"step": 2300
},
{
"epoch": 8.28,
"grad_norm": 39.036376953125,
"learning_rate": 1.74910394265233e-06,
"loss": 1.1845,
"step": 2310
},
{
"epoch": 8.32,
"grad_norm": 72.71634674072266,
"learning_rate": 1.7132616487455198e-06,
"loss": 1.1685,
"step": 2320
},
{
"epoch": 8.35,
"grad_norm": 29.658227920532227,
"learning_rate": 1.67741935483871e-06,
"loss": 1.1849,
"step": 2330
},
{
"epoch": 8.39,
"grad_norm": 40.88108825683594,
"learning_rate": 1.6415770609318998e-06,
"loss": 1.1473,
"step": 2340
},
{
"epoch": 8.42,
"grad_norm": 46.79905700683594,
"learning_rate": 1.6057347670250897e-06,
"loss": 1.1407,
"step": 2350
},
{
"epoch": 8.46,
"grad_norm": 40.85004806518555,
"learning_rate": 1.5698924731182796e-06,
"loss": 1.0891,
"step": 2360
},
{
"epoch": 8.49,
"grad_norm": 43.564849853515625,
"learning_rate": 1.5340501792114695e-06,
"loss": 1.1173,
"step": 2370
},
{
"epoch": 8.53,
"grad_norm": 39.988792419433594,
"learning_rate": 1.4982078853046598e-06,
"loss": 1.0948,
"step": 2380
},
{
"epoch": 8.57,
"grad_norm": 33.0150260925293,
"learning_rate": 1.4623655913978497e-06,
"loss": 1.1763,
"step": 2390
},
{
"epoch": 8.6,
"grad_norm": 36.02336120605469,
"learning_rate": 1.4265232974910395e-06,
"loss": 1.1621,
"step": 2400
},
{
"epoch": 8.64,
"grad_norm": 34.898765563964844,
"learning_rate": 1.3906810035842294e-06,
"loss": 1.2091,
"step": 2410
},
{
"epoch": 8.67,
"grad_norm": 34.42953109741211,
"learning_rate": 1.3548387096774195e-06,
"loss": 1.1255,
"step": 2420
},
{
"epoch": 8.71,
"grad_norm": 86.2882080078125,
"learning_rate": 1.3189964157706094e-06,
"loss": 1.1501,
"step": 2430
},
{
"epoch": 8.75,
"grad_norm": 43.11504364013672,
"learning_rate": 1.2831541218637993e-06,
"loss": 1.2159,
"step": 2440
},
{
"epoch": 8.78,
"grad_norm": 37.7353630065918,
"learning_rate": 1.2473118279569894e-06,
"loss": 1.1255,
"step": 2450
},
{
"epoch": 8.82,
"grad_norm": 33.8388671875,
"learning_rate": 1.2114695340501793e-06,
"loss": 1.1677,
"step": 2460
},
{
"epoch": 8.85,
"grad_norm": 129.2806396484375,
"learning_rate": 1.1756272401433692e-06,
"loss": 1.1739,
"step": 2470
},
{
"epoch": 8.89,
"grad_norm": 31.244264602661133,
"learning_rate": 1.1397849462365593e-06,
"loss": 1.1247,
"step": 2480
},
{
"epoch": 8.92,
"grad_norm": 30.236568450927734,
"learning_rate": 1.1039426523297491e-06,
"loss": 1.093,
"step": 2490
},
{
"epoch": 8.96,
"grad_norm": 42.34114456176758,
"learning_rate": 1.0681003584229392e-06,
"loss": 1.1688,
"step": 2500
},
{
"epoch": 9.0,
"grad_norm": 34.8842887878418,
"learning_rate": 1.0322580645161291e-06,
"loss": 1.1828,
"step": 2510
},
{
"epoch": 9.03,
"grad_norm": 34.398521423339844,
"learning_rate": 9.96415770609319e-07,
"loss": 1.1351,
"step": 2520
},
{
"epoch": 9.07,
"grad_norm": 42.38921356201172,
"learning_rate": 9.60573476702509e-07,
"loss": 1.1673,
"step": 2530
},
{
"epoch": 9.1,
"grad_norm": 62.44378662109375,
"learning_rate": 9.24731182795699e-07,
"loss": 1.1386,
"step": 2540
},
{
"epoch": 9.14,
"grad_norm": 30.45479965209961,
"learning_rate": 8.88888888888889e-07,
"loss": 1.1577,
"step": 2550
},
{
"epoch": 9.18,
"grad_norm": 37.57691192626953,
"learning_rate": 8.530465949820789e-07,
"loss": 1.1291,
"step": 2560
},
{
"epoch": 9.21,
"grad_norm": 54.45702362060547,
"learning_rate": 8.17204301075269e-07,
"loss": 1.1575,
"step": 2570
},
{
"epoch": 9.25,
"grad_norm": 57.186737060546875,
"learning_rate": 7.813620071684588e-07,
"loss": 1.132,
"step": 2580
},
{
"epoch": 9.28,
"grad_norm": 35.08234405517578,
"learning_rate": 7.455197132616488e-07,
"loss": 1.1816,
"step": 2590
},
{
"epoch": 9.32,
"grad_norm": 87.56676483154297,
"learning_rate": 7.096774193548388e-07,
"loss": 1.1528,
"step": 2600
}
],
"logging_steps": 10,
"max_steps": 2790,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 200,
"total_flos": 3.965868410199552e+19,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}