{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9998663994655979,
"eval_steps": 500,
"global_step": 1871,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005344021376085505,
"grad_norm": 486.19793701171875,
"learning_rate": 1.7543859649122806e-10,
"loss": 44.3932,
"step": 10
},
{
"epoch": 0.01068804275217101,
"grad_norm": 479.20001220703125,
"learning_rate": 3.5087719298245613e-10,
"loss": 45.7403,
"step": 20
},
{
"epoch": 0.01603206412825651,
"grad_norm": 404.8466796875,
"learning_rate": 5.263157894736842e-10,
"loss": 45.2088,
"step": 30
},
{
"epoch": 0.02137608550434202,
"grad_norm": 481.6076965332031,
"learning_rate": 7.017543859649123e-10,
"loss": 45.2267,
"step": 40
},
{
"epoch": 0.026720106880427523,
"grad_norm": 505.76458740234375,
"learning_rate": 8.771929824561403e-10,
"loss": 45.4064,
"step": 50
},
{
"epoch": 0.03206412825651302,
"grad_norm": 436.2538146972656,
"learning_rate": 9.99993251508253e-10,
"loss": 45.0983,
"step": 60
},
{
"epoch": 0.03740814963259853,
"grad_norm": 487.7237243652344,
"learning_rate": 9.998732833893071e-10,
"loss": 45.2743,
"step": 70
},
{
"epoch": 0.04275217100868404,
"grad_norm": 504.32977294921875,
"learning_rate": 9.996033902036725e-10,
"loss": 45.9555,
"step": 80
},
{
"epoch": 0.04809619238476954,
"grad_norm": 402.4021911621094,
"learning_rate": 9.991836528993718e-10,
"loss": 45.9827,
"step": 90
},
{
"epoch": 0.053440213760855046,
"grad_norm": 423.0853271484375,
"learning_rate": 9.986141973665967e-10,
"loss": 46.21,
"step": 100
},
{
"epoch": 0.058784235136940546,
"grad_norm": 533.7306518554688,
"learning_rate": 9.978951943999498e-10,
"loss": 41.8617,
"step": 110
},
{
"epoch": 0.06412825651302605,
"grad_norm": 483.0992736816406,
"learning_rate": 9.970268596472183e-10,
"loss": 46.5482,
"step": 120
},
{
"epoch": 0.06947227788911156,
"grad_norm": 457.9750061035156,
"learning_rate": 9.960094535446974e-10,
"loss": 45.5803,
"step": 130
},
{
"epoch": 0.07481629926519706,
"grad_norm": 464.4324951171875,
"learning_rate": 9.948432812390764e-10,
"loss": 44.9389,
"step": 140
},
{
"epoch": 0.08016032064128256,
"grad_norm": 435.9920349121094,
"learning_rate": 9.935286924959192e-10,
"loss": 47.8866,
"step": 150
},
{
"epoch": 0.08550434201736808,
"grad_norm": 456.80889892578125,
"learning_rate": 9.920660815947595e-10,
"loss": 45.0282,
"step": 160
},
{
"epoch": 0.09084836339345358,
"grad_norm": 497.8466491699219,
"learning_rate": 9.904558872108458e-10,
"loss": 46.1007,
"step": 170
},
{
"epoch": 0.09619238476953908,
"grad_norm": 405.5328369140625,
"learning_rate": 9.886985922835717e-10,
"loss": 44.4369,
"step": 180
},
{
"epoch": 0.10153640614562458,
"grad_norm": 467.2921142578125,
"learning_rate": 9.867947238716296e-10,
"loss": 48.2561,
"step": 190
},
{
"epoch": 0.10688042752171009,
"grad_norm": 450.1244812011719,
"learning_rate": 9.847448529949325e-10,
"loss": 43.8374,
"step": 200
},
{
"epoch": 0.11222444889779559,
"grad_norm": 495.2174072265625,
"learning_rate": 9.82549594463349e-10,
"loss": 45.4406,
"step": 210
},
{
"epoch": 0.11756847027388109,
"grad_norm": 410.7157897949219,
"learning_rate": 9.802096066923072e-10,
"loss": 45.8352,
"step": 220
},
{
"epoch": 0.1229124916499666,
"grad_norm": 394.4830017089844,
"learning_rate": 9.777255915053179e-10,
"loss": 46.1355,
"step": 230
},
{
"epoch": 0.1282565130260521,
"grad_norm": 375.6810607910156,
"learning_rate": 9.75098293923479e-10,
"loss": 44.0556,
"step": 240
},
{
"epoch": 0.13360053440213762,
"grad_norm": 552.1492309570312,
"learning_rate": 9.723285019420253e-10,
"loss": 48.5456,
"step": 250
},
{
"epoch": 0.13894455577822312,
"grad_norm": 386.6697998046875,
"learning_rate": 9.69417046293987e-10,
"loss": 47.2565,
"step": 260
},
{
"epoch": 0.14428857715430862,
"grad_norm": 385.3708190917969,
"learning_rate": 9.66364800201032e-10,
"loss": 47.0423,
"step": 270
},
{
"epoch": 0.14963259853039412,
"grad_norm": 407.2390441894531,
"learning_rate": 9.631726791115632e-10,
"loss": 45.1834,
"step": 280
},
{
"epoch": 0.15497661990647962,
"grad_norm": 424.76519775390625,
"learning_rate": 9.598416404261524e-10,
"loss": 45.0167,
"step": 290
},
{
"epoch": 0.16032064128256512,
"grad_norm": 472.8403015136719,
"learning_rate": 9.5637268321039e-10,
"loss": 46.5219,
"step": 300
},
{
"epoch": 0.16566466265865062,
"grad_norm": 428.8890686035156,
"learning_rate": 9.527668478952394e-10,
"loss": 47.501,
"step": 310
},
{
"epoch": 0.17100868403473615,
"grad_norm": 399.3135681152344,
"learning_rate": 9.490252159649852e-10,
"loss": 44.057,
"step": 320
},
{
"epoch": 0.17635270541082165,
"grad_norm": 385.1392517089844,
"learning_rate": 9.451489096328667e-10,
"loss": 43.8841,
"step": 330
},
{
"epoch": 0.18169672678690715,
"grad_norm": 416.7450866699219,
"learning_rate": 9.411390915044974e-10,
"loss": 44.5708,
"step": 340
},
{
"epoch": 0.18704074816299265,
"grad_norm": 374.25531005859375,
"learning_rate": 9.369969642291692e-10,
"loss": 46.3587,
"step": 350
},
{
"epoch": 0.19238476953907815,
"grad_norm": 451.29510498046875,
"learning_rate": 9.327237701391466e-10,
"loss": 46.0082,
"step": 360
},
{
"epoch": 0.19772879091516365,
"grad_norm": 481.7860412597656,
"learning_rate": 9.283207908770579e-10,
"loss": 49.3258,
"step": 370
},
{
"epoch": 0.20307281229124916,
"grad_norm": 493.49517822265625,
"learning_rate": 9.237893470114983e-10,
"loss": 46.3923,
"step": 380
},
{
"epoch": 0.20841683366733466,
"grad_norm": 451.55072021484375,
"learning_rate": 9.191307976409558e-10,
"loss": 46.2008,
"step": 390
},
{
"epoch": 0.21376085504342018,
"grad_norm": 474.09625244140625,
"learning_rate": 9.143465399861828e-10,
"loss": 44.9755,
"step": 400
},
{
"epoch": 0.21910487641950568,
"grad_norm": 454.8025207519531,
"learning_rate": 9.094380089711325e-10,
"loss": 45.1256,
"step": 410
},
{
"epoch": 0.22444889779559118,
"grad_norm": 480.7178955078125,
"learning_rate": 9.04406676792588e-10,
"loss": 48.9151,
"step": 420
},
{
"epoch": 0.22979291917167669,
"grad_norm": 361.1165771484375,
"learning_rate": 8.992540524786122e-10,
"loss": 45.1897,
"step": 430
},
{
"epoch": 0.23513694054776219,
"grad_norm": 455.4756164550781,
"learning_rate": 8.939816814359501e-10,
"loss": 46.2868,
"step": 440
},
{
"epoch": 0.24048096192384769,
"grad_norm": 518.1709594726562,
"learning_rate": 8.885911449865215e-10,
"loss": 48.0527,
"step": 450
},
{
"epoch": 0.2458249832999332,
"grad_norm": 470.18536376953125,
"learning_rate": 8.830840598931412e-10,
"loss": 46.6266,
"step": 460
},
{
"epoch": 0.2511690046760187,
"grad_norm": 456.1210632324219,
"learning_rate": 8.774620778746093e-10,
"loss": 45.275,
"step": 470
},
{
"epoch": 0.2565130260521042,
"grad_norm": 427.2693176269531,
"learning_rate": 8.71726885110318e-10,
"loss": 44.1736,
"step": 480
},
{
"epoch": 0.2618570474281897,
"grad_norm": 465.0010681152344,
"learning_rate": 8.658802017345217e-10,
"loss": 46.5734,
"step": 490
},
{
"epoch": 0.26720106880427524,
"grad_norm": 483.6059265136719,
"learning_rate": 8.599237813204241e-10,
"loss": 47.0762,
"step": 500
},
{
"epoch": 0.2725450901803607,
"grad_norm": 388.6180725097656,
"learning_rate": 8.538594103542357e-10,
"loss": 45.9568,
"step": 510
},
{
"epoch": 0.27788911155644624,
"grad_norm": 492.0127868652344,
"learning_rate": 8.476889076993602e-10,
"loss": 45.8206,
"step": 520
},
{
"epoch": 0.2832331329325317,
"grad_norm": 446.49700927734375,
"learning_rate": 8.414141240508689e-10,
"loss": 46.4758,
"step": 530
},
{
"epoch": 0.28857715430861725,
"grad_norm": 401.5068359375,
"learning_rate": 8.350369413804303e-10,
"loss": 45.8422,
"step": 540
},
{
"epoch": 0.2939211756847027,
"grad_norm": 443.8550109863281,
"learning_rate": 8.285592723718561e-10,
"loss": 46.1345,
"step": 550
},
{
"epoch": 0.29926519706078825,
"grad_norm": 385.59033203125,
"learning_rate": 8.219830598474381e-10,
"loss": 45.8269,
"step": 560
},
{
"epoch": 0.3046092184368738,
"grad_norm": 405.3898010253906,
"learning_rate": 8.153102761852451e-10,
"loss": 45.4571,
"step": 570
},
{
"epoch": 0.30995323981295925,
"grad_norm": 524.8499145507812,
"learning_rate": 8.085429227275549e-10,
"loss": 49.0534,
"step": 580
},
{
"epoch": 0.3152972611890448,
"grad_norm": 485.2023010253906,
"learning_rate": 8.016830291805995e-10,
"loss": 45.2131,
"step": 590
},
{
"epoch": 0.32064128256513025,
"grad_norm": 416.6390686035156,
"learning_rate": 7.947326530058027e-10,
"loss": 44.0664,
"step": 600
},
{
"epoch": 0.3259853039412158,
"grad_norm": 437.5408630371094,
"learning_rate": 7.876938788026944e-10,
"loss": 45.3301,
"step": 610
},
{
"epoch": 0.33132932531730125,
"grad_norm": 471.2472229003906,
"learning_rate": 7.805688176836843e-10,
"loss": 48.167,
"step": 620
},
{
"epoch": 0.3366733466933868,
"grad_norm": 468.9358215332031,
"learning_rate": 7.73359606640884e-10,
"loss": 46.2929,
"step": 630
},
{
"epoch": 0.3420173680694723,
"grad_norm": 523.02783203125,
"learning_rate": 7.660684079051672e-10,
"loss": 46.2754,
"step": 640
},
{
"epoch": 0.3473613894455578,
"grad_norm": 439.5931396484375,
"learning_rate": 7.586974082976608e-10,
"loss": 45.8867,
"step": 650
},
{
"epoch": 0.3527054108216433,
"grad_norm": 464.5501403808594,
"learning_rate": 7.512488185738588e-10,
"loss": 45.7995,
"step": 660
},
{
"epoch": 0.3580494321977288,
"grad_norm": 463.3254699707031,
"learning_rate": 7.437248727605602e-10,
"loss": 45.2951,
"step": 670
},
{
"epoch": 0.3633934535738143,
"grad_norm": 416.180908203125,
"learning_rate": 7.361278274858247e-10,
"loss": 46.9576,
"step": 680
},
{
"epoch": 0.3687374749498998,
"grad_norm": 504.0238037109375,
"learning_rate": 7.284599613021526e-10,
"loss": 47.678,
"step": 690
},
{
"epoch": 0.3740814963259853,
"grad_norm": 410.3598937988281,
"learning_rate": 7.207235740030858e-10,
"loss": 44.9078,
"step": 700
},
{
"epoch": 0.37942551770207084,
"grad_norm": 489.8995361328125,
"learning_rate": 7.1292098593344e-10,
"loss": 45.3449,
"step": 710
},
{
"epoch": 0.3847695390781563,
"grad_norm": 372.5830078125,
"learning_rate": 7.050545372933732e-10,
"loss": 45.1218,
"step": 720
},
{
"epoch": 0.39011356045424184,
"grad_norm": 502.1708068847656,
"learning_rate": 6.97126587436498e-10,
"loss": 47.2275,
"step": 730
},
{
"epoch": 0.3954575818303273,
"grad_norm": 431.7566833496094,
"learning_rate": 6.891395141622495e-10,
"loss": 45.798,
"step": 740
},
{
"epoch": 0.40080160320641284,
"grad_norm": 415.6434631347656,
"learning_rate": 6.810957130027218e-10,
"loss": 45.2911,
"step": 750
},
{
"epoch": 0.4061456245824983,
"grad_norm": 451.16290283203125,
"learning_rate": 6.729975965041849e-10,
"loss": 47.2858,
"step": 760
},
{
"epoch": 0.41148964595858384,
"grad_norm": 427.8857116699219,
"learning_rate": 6.64847593503499e-10,
"loss": 46.2518,
"step": 770
},
{
"epoch": 0.4168336673346693,
"grad_norm": 448.13836669921875,
"learning_rate": 6.566481483996427e-10,
"loss": 43.878,
"step": 780
},
{
"epoch": 0.42217768871075484,
"grad_norm": 495.7452392578125,
"learning_rate": 6.484017204205741e-10,
"loss": 47.3328,
"step": 790
},
{
"epoch": 0.42752171008684037,
"grad_norm": 448.1509704589844,
"learning_rate": 6.401107828856438e-10,
"loss": 45.6594,
"step": 800
},
{
"epoch": 0.43286573146292584,
"grad_norm": 454.22900390625,
"learning_rate": 6.31777822463782e-10,
"loss": 46.073,
"step": 810
},
{
"epoch": 0.43820975283901137,
"grad_norm": 494.20709228515625,
"learning_rate": 6.234053384276815e-10,
"loss": 44.3891,
"step": 820
},
{
"epoch": 0.44355377421509684,
"grad_norm": 397.5838928222656,
"learning_rate": 6.149958419042e-10,
"loss": 44.5643,
"step": 830
},
{
"epoch": 0.44889779559118237,
"grad_norm": 471.062255859375,
"learning_rate": 6.065518551212083e-10,
"loss": 46.9195,
"step": 840
},
{
"epoch": 0.45424181696726784,
"grad_norm": 486.7655334472656,
"learning_rate": 5.98075910651107e-10,
"loss": 47.3481,
"step": 850
},
{
"epoch": 0.45958583834335337,
"grad_norm": 515.932373046875,
"learning_rate": 5.895705506512437e-10,
"loss": 46.0562,
"step": 860
},
{
"epoch": 0.4649298597194389,
"grad_norm": 429.03814697265625,
"learning_rate": 5.810383261014514e-10,
"loss": 44.6224,
"step": 870
},
{
"epoch": 0.47027388109552437,
"grad_norm": 299.903564453125,
"learning_rate": 5.724817960389447e-10,
"loss": 44.7293,
"step": 880
},
{
"epoch": 0.4756179024716099,
"grad_norm": 477.40850830078125,
"learning_rate": 5.639035267907963e-10,
"loss": 45.3137,
"step": 890
},
{
"epoch": 0.48096192384769537,
"grad_norm": 468.72052001953125,
"learning_rate": 5.553060912042296e-10,
"loss": 44.8162,
"step": 900
},
{
"epoch": 0.4863059452237809,
"grad_norm": 454.3052062988281,
"learning_rate": 5.466920678749537e-10,
"loss": 44.9499,
"step": 910
},
{
"epoch": 0.4916499665998664,
"grad_norm": 424.9459228515625,
"learning_rate": 5.380640403737752e-10,
"loss": 47.8759,
"step": 920
},
{
"epoch": 0.4969939879759519,
"grad_norm": 380.2132873535156,
"learning_rate": 5.294245964717187e-10,
"loss": 44.8434,
"step": 930
},
{
"epoch": 0.5023380093520374,
"grad_norm": 516.621826171875,
"learning_rate": 5.207763273638852e-10,
"loss": 46.6005,
"step": 940
},
{
"epoch": 0.5076820307281229,
"grad_norm": 421.64404296875,
"learning_rate": 5.121218268922859e-10,
"loss": 45.6592,
"step": 950
},
{
"epoch": 0.5130260521042084,
"grad_norm": 458.570068359375,
"learning_rate": 5.03463690767881e-10,
"loss": 45.8901,
"step": 960
},
{
"epoch": 0.518370073480294,
"grad_norm": 443.55267333984375,
"learning_rate": 4.94804515792058e-10,
"loss": 44.6454,
"step": 970
},
{
"epoch": 0.5237140948563794,
"grad_norm": 427.3360595703125,
"learning_rate": 4.86146899077783e-10,
"loss": 45.2378,
"step": 980
},
{
"epoch": 0.5290581162324649,
"grad_norm": 299.7554016113281,
"learning_rate": 4.774934372706585e-10,
"loss": 44.3535,
"step": 990
},
{
"epoch": 0.5344021376085505,
"grad_norm": 445.1256103515625,
"learning_rate": 4.688467257701225e-10,
"loss": 45.9619,
"step": 1000
},
{
"epoch": 0.539746158984636,
"grad_norm": 467.1534423828125,
"learning_rate": 4.6020935795101856e-10,
"loss": 46.8164,
"step": 1010
},
{
"epoch": 0.5450901803607214,
"grad_norm": 433.607666015625,
"learning_rate": 4.5158392438577654e-10,
"loss": 44.5307,
"step": 1020
},
{
"epoch": 0.5504342017368069,
"grad_norm": 457.611328125,
"learning_rate": 4.429730120674315e-10,
"loss": 43.718,
"step": 1030
},
{
"epoch": 0.5557782231128925,
"grad_norm": 372.8671569824219,
"learning_rate": 4.343792036337167e-10,
"loss": 44.3206,
"step": 1040
},
{
"epoch": 0.561122244488978,
"grad_norm": 444.3086242675781,
"learning_rate": 4.258050765924633e-10,
"loss": 45.5667,
"step": 1050
},
{
"epoch": 0.5664662658650634,
"grad_norm": 338.9475402832031,
"learning_rate": 4.172532025485384e-10,
"loss": 42.6416,
"step": 1060
},
{
"epoch": 0.571810287241149,
"grad_norm": 448.279052734375,
"learning_rate": 4.0872614643255335e-10,
"loss": 45.6553,
"step": 1070
},
{
"epoch": 0.5771543086172345,
"grad_norm": 459.70159912109375,
"learning_rate": 4.002264657315738e-10,
"loss": 46.4637,
"step": 1080
},
{
"epoch": 0.58249832999332,
"grad_norm": 433.6397705078125,
"learning_rate": 3.9175670972206326e-10,
"loss": 43.3037,
"step": 1090
},
{
"epoch": 0.5878423513694054,
"grad_norm": 431.869873046875,
"learning_rate": 3.8331941870528737e-10,
"loss": 46.3079,
"step": 1100
},
{
"epoch": 0.593186372745491,
"grad_norm": 411.36077880859375,
"learning_rate": 3.7491712324541183e-10,
"loss": 46.909,
"step": 1110
},
{
"epoch": 0.5985303941215765,
"grad_norm": 400.7826843261719,
"learning_rate": 3.6655234341052023e-10,
"loss": 46.5449,
"step": 1120
},
{
"epoch": 0.603874415497662,
"grad_norm": 409.33355712890625,
"learning_rate": 3.5822758801677894e-10,
"loss": 47.9383,
"step": 1130
},
{
"epoch": 0.6092184368737475,
"grad_norm": 399.2182312011719,
"learning_rate": 3.4994535387597803e-10,
"loss": 42.633,
"step": 1140
},
{
"epoch": 0.614562458249833,
"grad_norm": 315.4638977050781,
"learning_rate": 3.417081250466723e-10,
"loss": 43.8757,
"step": 1150
},
{
"epoch": 0.6199064796259185,
"grad_norm": 389.83831787109375,
"learning_rate": 3.3351837208914703e-10,
"loss": 44.3336,
"step": 1160
},
{
"epoch": 0.625250501002004,
"grad_norm": 556.4785766601562,
"learning_rate": 3.253785513244322e-10,
"loss": 48.7932,
"step": 1170
},
{
"epoch": 0.6305945223780896,
"grad_norm": 457.6601867675781,
"learning_rate": 3.172911040975875e-10,
"loss": 45.7914,
"step": 1180
},
{
"epoch": 0.635938543754175,
"grad_norm": 497.7450256347656,
"learning_rate": 3.0925845604547985e-10,
"loss": 45.789,
"step": 1190
},
{
"epoch": 0.6412825651302605,
"grad_norm": 433.0904846191406,
"learning_rate": 3.012830163692706e-10,
"loss": 44.0252,
"step": 1200
},
{
"epoch": 0.6466265865063461,
"grad_norm": 417.3199157714844,
"learning_rate": 2.933671771118333e-10,
"loss": 45.2464,
"step": 1210
},
{
"epoch": 0.6519706078824316,
"grad_norm": 439.3309326171875,
"learning_rate": 2.8551331244031814e-10,
"loss": 43.0369,
"step": 1220
},
{
"epoch": 0.657314629258517,
"grad_norm": 416.8631286621094,
"learning_rate": 2.7772377793407634e-10,
"loss": 44.467,
"step": 1230
},
{
"epoch": 0.6626586506346025,
"grad_norm": 459.6900329589844,
"learning_rate": 2.7000090987816086e-10,
"loss": 45.8894,
"step": 1240
},
{
"epoch": 0.6680026720106881,
"grad_norm": 452.77166748046875,
"learning_rate": 2.623470245626131e-10,
"loss": 46.3879,
"step": 1250
},
{
"epoch": 0.6733466933867736,
"grad_norm": 373.71551513671875,
"learning_rate": 2.547644175877475e-10,
"loss": 44.8361,
"step": 1260
},
{
"epoch": 0.678690714762859,
"grad_norm": 493.52978515625,
"learning_rate": 2.472553631756397e-10,
"loss": 45.5009,
"step": 1270
},
{
"epoch": 0.6840347361389446,
"grad_norm": 495.15216064453125,
"learning_rate": 2.3982211348802956e-10,
"loss": 45.423,
"step": 1280
},
{
"epoch": 0.6893787575150301,
"grad_norm": 426.9014892578125,
"learning_rate": 2.324668979508382e-10,
"loss": 45.0799,
"step": 1290
},
{
"epoch": 0.6947227788911156,
"grad_norm": 475.074951171875,
"learning_rate": 2.251919225855041e-10,
"loss": 45.2446,
"step": 1300
},
{
"epoch": 0.700066800267201,
"grad_norm": 479.0338439941406,
"learning_rate": 2.1799936934734111e-10,
"loss": 44.113,
"step": 1310
},
{
"epoch": 0.7054108216432866,
"grad_norm": 350.0298156738281,
"learning_rate": 2.1089139547111202e-10,
"loss": 45.8131,
"step": 1320
},
{
"epoch": 0.7107548430193721,
"grad_norm": 459.32635498046875,
"learning_rate": 2.0387013282401746e-10,
"loss": 46.7643,
"step": 1330
},
{
"epoch": 0.7160988643954576,
"grad_norm": 455.7977294921875,
"learning_rate": 1.969376872662936e-10,
"loss": 45.0021,
"step": 1340
},
{
"epoch": 0.7214428857715431,
"grad_norm": 425.2674560546875,
"learning_rate": 1.9009613801960964e-10,
"loss": 45.1843,
"step": 1350
},
{
"epoch": 0.7267869071476286,
"grad_norm": 390.6509704589844,
"learning_rate": 1.8334753704345403e-10,
"loss": 44.7194,
"step": 1360
},
{
"epoch": 0.7321309285237141,
"grad_norm": 505.694580078125,
"learning_rate": 1.7669390841969942e-10,
"loss": 46.2759,
"step": 1370
},
{
"epoch": 0.7374749498997996,
"grad_norm": 369.56854248046875,
"learning_rate": 1.7013724774552676e-10,
"loss": 44.7077,
"step": 1380
},
{
"epoch": 0.7428189712758851,
"grad_norm": 429.40838623046875,
"learning_rate": 1.6367952153489342e-10,
"loss": 48.0047,
"step": 1390
},
{
"epoch": 0.7481629926519706,
"grad_norm": 479.33917236328125,
"learning_rate": 1.5732266662872497e-10,
"loss": 45.8104,
"step": 1400
},
{
"epoch": 0.7535070140280561,
"grad_norm": 447.9562072753906,
"learning_rate": 1.510685896140055e-10,
"loss": 46.4843,
"step": 1410
},
{
"epoch": 0.7588510354041417,
"grad_norm": 404.3109130859375,
"learning_rate": 1.4491916625194192e-10,
"loss": 44.9299,
"step": 1420
},
{
"epoch": 0.7641950567802271,
"grad_norm": 444.400634765625,
"learning_rate": 1.3887624091537504e-10,
"loss": 44.375,
"step": 1430
},
{
"epoch": 0.7695390781563126,
"grad_norm": 522.9306030273438,
"learning_rate": 1.329416260356035e-10,
"loss": 45.507,
"step": 1440
},
{
"epoch": 0.7748830995323981,
"grad_norm": 511.14727783203125,
"learning_rate": 1.271171015587877e-10,
"loss": 46.0719,
"step": 1450
},
{
"epoch": 0.7802271209084837,
"grad_norm": 459.71435546875,
"learning_rate": 1.2140441441209837e-10,
"loss": 44.2746,
"step": 1460
},
{
"epoch": 0.7855711422845691,
"grad_norm": 422.275390625,
"learning_rate": 1.158052779797671e-10,
"loss": 46.109,
"step": 1470
},
{
"epoch": 0.7909151636606546,
"grad_norm": 407.68096923828125,
"learning_rate": 1.1032137158919697e-10,
"loss": 44.9659,
"step": 1480
},
{
"epoch": 0.7962591850367402,
"grad_norm": 448.97314453125,
"learning_rate": 1.0495434000728927e-10,
"loss": 47.4394,
"step": 1490
},
{
"epoch": 0.8016032064128257,
"grad_norm": 485.30316162109375,
"learning_rate": 9.970579294713462e-11,
"loss": 46.3913,
"step": 1500
},
{
"epoch": 0.8069472277889111,
"grad_norm": 432.86883544921875,
"learning_rate": 9.457730458521747e-11,
"loss": 47.0394,
"step": 1510
},
{
"epoch": 0.8122912491649966,
"grad_norm": 531.20556640625,
"learning_rate": 8.95704130892801e-11,
"loss": 45.8065,
"step": 1520
},
{
"epoch": 0.8176352705410822,
"grad_norm": 460.9041748046875,
"learning_rate": 8.468662015698525e-11,
"loss": 44.684,
"step": 1530
},
{
"epoch": 0.8229792919171677,
"grad_norm": 520.4033813476562,
"learning_rate": 7.99273905655184e-11,
"loss": 46.8526,
"step": 1540
},
{
"epoch": 0.8283233132932531,
"grad_norm": 411.2076416015625,
"learning_rate": 7.52941517322624e-11,
"loss": 46.6088,
"step": 1550
},
{
"epoch": 0.8336673346693386,
"grad_norm": 419.9526062011719,
"learning_rate": 7.078829328667747e-11,
"loss": 46.7982,
"step": 1560
},
{
"epoch": 0.8390113560454242,
"grad_norm": 461.82977294921875,
"learning_rate": 6.641116665351543e-11,
"loss": 44.4069,
"step": 1570
},
{
"epoch": 0.8443553774215097,
"grad_norm": 462.8651123046875,
"learning_rate": 6.216408464749213e-11,
"loss": 46.1496,
"step": 1580
},
{
"epoch": 0.8496993987975952,
"grad_norm": 462.21368408203125,
"learning_rate": 5.804832107953923e-11,
"loss": 43.4678,
"step": 1590
},
{
"epoch": 0.8550434201736807,
"grad_norm": 480.9452209472656,
"learning_rate": 5.406511037475603e-11,
"loss": 46.4101,
"step": 1600
},
{
"epoch": 0.8603874415497662,
"grad_norm": 467.31610107421875,
"learning_rate": 5.021564720217248e-11,
"loss": 45.7185,
"step": 1610
},
{
"epoch": 0.8657314629258517,
"grad_norm": 411.0181579589844,
"learning_rate": 4.650108611643672e-11,
"loss": 43.6447,
"step": 1620
},
{
"epoch": 0.8710754843019372,
"grad_norm": 424.5155944824219,
"learning_rate": 4.292254121153422e-11,
"loss": 45.3636,
"step": 1630
},
{
"epoch": 0.8764195056780227,
"grad_norm": 410.9560241699219,
"learning_rate": 3.948108578664178e-11,
"loss": 46.3407,
"step": 1640
},
{
"epoch": 0.8817635270541082,
"grad_norm": 477.28240966796875,
"learning_rate": 3.617775202421675e-11,
"loss": 44.1375,
"step": 1650
},
{
"epoch": 0.8871075484301937,
"grad_norm": 461.23309326171875,
"learning_rate": 3.301353068041896e-11,
"loss": 43.7172,
"step": 1660
},
{
"epoch": 0.8924515698062793,
"grad_norm": 447.8024597167969,
"learning_rate": 2.998937078795672e-11,
"loss": 47.4198,
"step": 1670
},
{
"epoch": 0.8977955911823647,
"grad_norm": 511.4967956542969,
"learning_rate": 2.7106179371447437e-11,
"loss": 45.0943,
"step": 1680
},
{
"epoch": 0.9031396125584502,
"grad_norm": 478.0008850097656,
"learning_rate": 2.4364821175376806e-11,
"loss": 46.4821,
"step": 1690
},
{
"epoch": 0.9084836339345357,
"grad_norm": 509.50958251953125,
"learning_rate": 2.1766118404739633e-11,
"loss": 44.1657,
"step": 1700
},
{
"epoch": 0.9138276553106213,
"grad_norm": 423.26739501953125,
"learning_rate": 1.931085047843889e-11,
"loss": 47.6892,
"step": 1710
},
{
"epoch": 0.9191716766867067,
"grad_norm": 518.2147827148438,
"learning_rate": 1.6999753795517883e-11,
"loss": 46.303,
"step": 1720
},
{
"epoch": 0.9245156980627922,
"grad_norm": 386.4200134277344,
"learning_rate": 1.483352151429446e-11,
"loss": 43.9672,
"step": 1730
},
{
"epoch": 0.9298597194388778,
"grad_norm": 476.4378967285156,
"learning_rate": 1.2812803344465052e-11,
"loss": 45.2255,
"step": 1740
},
{
"epoch": 0.9352037408149633,
"grad_norm": 493.2698669433594,
"learning_rate": 1.0938205352239883e-11,
"loss": 45.0955,
"step": 1750
},
{
"epoch": 0.9405477621910487,
"grad_norm": 455.5923156738281,
"learning_rate": 9.210289778567305e-12,
"loss": 43.4817,
"step": 1760
},
{
"epoch": 0.9458917835671342,
"grad_norm": 452.3888854980469,
"learning_rate": 7.629574870503641e-12,
"loss": 43.956,
"step": 1770
},
{
"epoch": 0.9512358049432198,
"grad_norm": 369.5935363769531,
"learning_rate": 6.196534725777081e-12,
"loss": 45.6541,
"step": 1780
},
{
"epoch": 0.9565798263193053,
"grad_norm": 434.7474365234375,
"learning_rate": 4.911599150593193e-12,
"loss": 45.4734,
"step": 1790
},
{
"epoch": 0.9619238476953907,
"grad_norm": 515.116455078125,
"learning_rate": 3.7751535307252726e-12,
"loss": 44.1633,
"step": 1800
},
{
"epoch": 0.9672678690714763,
"grad_norm": 423.6890563964844,
"learning_rate": 2.7875387159265744e-12,
"loss": 45.9123,
"step": 1810
},
{
"epoch": 0.9726118904475618,
"grad_norm": 456.1297607421875,
"learning_rate": 1.949050917700923e-12,
"loss": 46.53,
"step": 1820
},
{
"epoch": 0.9779559118236473,
"grad_norm": 496.5564270019531,
"learning_rate": 1.259941620460947e-12,
"loss": 44.7135,
"step": 1830
},
{
"epoch": 0.9832999331997327,
"grad_norm": 408.35650634765625,
"learning_rate": 7.204175061013562e-13,
"loss": 42.9871,
"step": 1840
},
{
"epoch": 0.9886439545758183,
"grad_norm": 438.6337890625,
"learning_rate": 3.3064039200975115e-13,
"loss": 44.0024,
"step": 1850
},
{
"epoch": 0.9939879759519038,
"grad_norm": 453.5023193359375,
"learning_rate": 9.072718253316792e-14,
"loss": 47.6027,
"step": 1860
},
{
"epoch": 0.9993319973279893,
"grad_norm": 423.05963134765625,
"learning_rate": 7.498339156808421e-16,
"loss": 47.3557,
"step": 1870
},
{
"epoch": 0.9998663994655979,
"step": 1871,
"total_flos": 0.0,
"train_loss": 45.65529471906604,
"train_runtime": 8058.9213,
"train_samples_per_second": 7.43,
"train_steps_per_second": 0.232
}
],
"logging_steps": 10,
"max_steps": 1871,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}