{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9470512268618166,
"eval_steps": 300,
"global_step": 3300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0028698522026115655,
"grad_norm": 168.79563903808594,
"learning_rate": 1.739130434782609e-05,
"loss": 12.0169,
"step": 10
},
{
"epoch": 0.005739704405223131,
"grad_norm": 20.983991622924805,
"learning_rate": 4.63768115942029e-05,
"loss": 7.8235,
"step": 20
},
{
"epoch": 0.008609556607834697,
"grad_norm": 21.168655395507812,
"learning_rate": 7.536231884057971e-05,
"loss": 6.834,
"step": 30
},
{
"epoch": 0.011479408810446262,
"grad_norm": 21.57039451599121,
"learning_rate": 0.00010434782608695653,
"loss": 3.9023,
"step": 40
},
{
"epoch": 0.014349261013057828,
"grad_norm": 48.81906509399414,
"learning_rate": 0.00013333333333333334,
"loss": 2.9802,
"step": 50
},
{
"epoch": 0.017219113215669393,
"grad_norm": 7.396921157836914,
"learning_rate": 0.00016231884057971017,
"loss": 2.6257,
"step": 60
},
{
"epoch": 0.02008896541828096,
"grad_norm": 62.19234848022461,
"learning_rate": 0.00019130434782608697,
"loss": 2.3201,
"step": 70
},
{
"epoch": 0.022958817620892524,
"grad_norm": 8.402580261230469,
"learning_rate": 0.00019999792781461744,
"loss": 2.1749,
"step": 80
},
{
"epoch": 0.02582866982350409,
"grad_norm": 7.064925670623779,
"learning_rate": 0.0001999877785419313,
"loss": 1.8889,
"step": 90
},
{
"epoch": 0.028698522026115655,
"grad_norm": 7.678985118865967,
"learning_rate": 0.0001999691724338023,
"loss": 1.8161,
"step": 100
},
{
"epoch": 0.03156837422872722,
"grad_norm": 9.882554054260254,
"learning_rate": 0.0001999421110639107,
"loss": 1.9209,
"step": 110
},
{
"epoch": 0.034438226431338786,
"grad_norm": 8.960328102111816,
"learning_rate": 0.00019990659672107177,
"loss": 1.8535,
"step": 120
},
{
"epoch": 0.03730807863395035,
"grad_norm": 6.723909378051758,
"learning_rate": 0.00019986263240904216,
"loss": 1.7978,
"step": 130
},
{
"epoch": 0.04017793083656192,
"grad_norm": 14.159058570861816,
"learning_rate": 0.00019981022184626578,
"loss": 1.686,
"step": 140
},
{
"epoch": 0.04304778303917348,
"grad_norm": 12.402606010437012,
"learning_rate": 0.00019974936946555948,
"loss": 1.6932,
"step": 150
},
{
"epoch": 0.04591763524178505,
"grad_norm": 7.793806076049805,
"learning_rate": 0.000199680080413738,
"loss": 1.5665,
"step": 160
},
{
"epoch": 0.048787487444396614,
"grad_norm": 9.647517204284668,
"learning_rate": 0.0001996023605511786,
"loss": 1.5892,
"step": 170
},
{
"epoch": 0.05165733964700818,
"grad_norm": 7.5883564949035645,
"learning_rate": 0.00019951621645132556,
"loss": 1.5003,
"step": 180
},
{
"epoch": 0.054527191849619745,
"grad_norm": 9.5863676071167,
"learning_rate": 0.00019942165540013412,
"loss": 1.4324,
"step": 190
},
{
"epoch": 0.05739704405223131,
"grad_norm": 10.761382102966309,
"learning_rate": 0.00019931868539545416,
"loss": 1.2652,
"step": 200
},
{
"epoch": 0.060266896254842876,
"grad_norm": 23.32731056213379,
"learning_rate": 0.00019920731514635396,
"loss": 1.2868,
"step": 210
},
{
"epoch": 0.06313674845745444,
"grad_norm": 15.128023147583008,
"learning_rate": 0.00019908755407238343,
"loss": 1.2272,
"step": 220
},
{
"epoch": 0.066006600660066,
"grad_norm": 12.924105644226074,
"learning_rate": 0.00019895941230277744,
"loss": 1.307,
"step": 230
},
{
"epoch": 0.06887645286267757,
"grad_norm": 9.334559440612793,
"learning_rate": 0.00019882290067559915,
"loss": 1.1858,
"step": 240
},
{
"epoch": 0.07174630506528913,
"grad_norm": 12.918402671813965,
"learning_rate": 0.0001986780307368233,
"loss": 1.1668,
"step": 250
},
{
"epoch": 0.0746161572679007,
"grad_norm": 8.966814994812012,
"learning_rate": 0.00019852481473935974,
"loss": 1.04,
"step": 260
},
{
"epoch": 0.07748600947051226,
"grad_norm": 10.825933456420898,
"learning_rate": 0.000198363265642017,
"loss": 1.0674,
"step": 270
},
{
"epoch": 0.08035586167312384,
"grad_norm": 20.35280418395996,
"learning_rate": 0.00019819339710840626,
"loss": 1.1564,
"step": 280
},
{
"epoch": 0.0832257138757354,
"grad_norm": 24.500883102416992,
"learning_rate": 0.00019801522350578577,
"loss": 1.0751,
"step": 290
},
{
"epoch": 0.08609556607834697,
"grad_norm": 8.19206428527832,
"learning_rate": 0.00019782875990384568,
"loss": 1.0476,
"step": 300
},
{
"epoch": 0.08896541828095852,
"grad_norm": 8.840872764587402,
"learning_rate": 0.00019763402207343338,
"loss": 1.0478,
"step": 310
},
{
"epoch": 0.0918352704835701,
"grad_norm": 11.326393127441406,
"learning_rate": 0.00019743102648521967,
"loss": 1.0235,
"step": 320
},
{
"epoch": 0.09470512268618166,
"grad_norm": 15.35113525390625,
"learning_rate": 0.00019721979030830572,
"loss": 0.9794,
"step": 330
},
{
"epoch": 0.09757497488879323,
"grad_norm": 11.8535795211792,
"learning_rate": 0.0001970003314087709,
"loss": 1.0072,
"step": 340
},
{
"epoch": 0.10044482709140479,
"grad_norm": 24.779190063476562,
"learning_rate": 0.0001967726683481617,
"loss": 1.0056,
"step": 350
},
{
"epoch": 0.10331467929401636,
"grad_norm": 20.744426727294922,
"learning_rate": 0.00019653682038192188,
"loss": 1.0066,
"step": 360
},
{
"epoch": 0.10618453149662792,
"grad_norm": 21.19144630432129,
"learning_rate": 0.00019629280745776364,
"loss": 0.9673,
"step": 370
},
{
"epoch": 0.10905438369923949,
"grad_norm": 18.140127182006836,
"learning_rate": 0.0001960406502139808,
"loss": 0.9903,
"step": 380
},
{
"epoch": 0.11192423590185105,
"grad_norm": 19.997053146362305,
"learning_rate": 0.00019578036997770296,
"loss": 0.9715,
"step": 390
},
{
"epoch": 0.11479408810446262,
"grad_norm": 15.790470123291016,
"learning_rate": 0.0001955119887630919,
"loss": 0.9508,
"step": 400
},
{
"epoch": 0.11766394030707418,
"grad_norm": 18.330507278442383,
"learning_rate": 0.0001952355292694795,
"loss": 0.9867,
"step": 410
},
{
"epoch": 0.12053379250968575,
"grad_norm": 13.211642265319824,
"learning_rate": 0.0001949510148794478,
"loss": 1.0481,
"step": 420
},
{
"epoch": 0.12340364471229731,
"grad_norm": 9.442767143249512,
"learning_rate": 0.00019465846965685158,
"loss": 0.9686,
"step": 430
},
{
"epoch": 0.12627349691490888,
"grad_norm": 15.597809791564941,
"learning_rate": 0.00019435791834478293,
"loss": 1.0821,
"step": 440
},
{
"epoch": 0.12914334911752046,
"grad_norm": 13.517879486083984,
"learning_rate": 0.0001940493863634784,
"loss": 0.9397,
"step": 450
},
{
"epoch": 0.132013201320132,
"grad_norm": 13.031438827514648,
"learning_rate": 0.00019373289980816917,
"loss": 1.0009,
"step": 460
},
{
"epoch": 0.13488305352274357,
"grad_norm": 14.64666748046875,
"learning_rate": 0.00019340848544687386,
"loss": 0.9571,
"step": 470
},
{
"epoch": 0.13775290572535515,
"grad_norm": 10.706031799316406,
"learning_rate": 0.00019307617071813454,
"loss": 1.0283,
"step": 480
},
{
"epoch": 0.14062275792796672,
"grad_norm": 9.723997116088867,
"learning_rate": 0.00019273598372869603,
"loss": 0.9815,
"step": 490
},
{
"epoch": 0.14349261013057826,
"grad_norm": 9.667860984802246,
"learning_rate": 0.0001923879532511287,
"loss": 0.9424,
"step": 500
},
{
"epoch": 0.14636246233318984,
"grad_norm": 6.956273078918457,
"learning_rate": 0.00019203210872139476,
"loss": 0.9793,
"step": 510
},
{
"epoch": 0.1492323145358014,
"grad_norm": 15.395605087280273,
"learning_rate": 0.00019166848023635883,
"loss": 1.0637,
"step": 520
},
{
"epoch": 0.15210216673841298,
"grad_norm": 23.60310173034668,
"learning_rate": 0.0001912970985512422,
"loss": 0.9625,
"step": 530
},
{
"epoch": 0.15497201894102453,
"grad_norm": 20.658727645874023,
"learning_rate": 0.00019091799507702181,
"loss": 0.9393,
"step": 540
},
{
"epoch": 0.1578418711436361,
"grad_norm": 18.22756576538086,
"learning_rate": 0.0001905312018777733,
"loss": 0.9354,
"step": 550
},
{
"epoch": 0.16071172334624767,
"grad_norm": 11.863499641418457,
"learning_rate": 0.00019013675166795922,
"loss": 0.933,
"step": 560
},
{
"epoch": 0.16358157554885924,
"grad_norm": 11.65882682800293,
"learning_rate": 0.00018973467780966202,
"loss": 0.9119,
"step": 570
},
{
"epoch": 0.1664514277514708,
"grad_norm": 11.474069595336914,
"learning_rate": 0.00018932501430976242,
"loss": 0.9511,
"step": 580
},
{
"epoch": 0.16932127995408236,
"grad_norm": 8.225656509399414,
"learning_rate": 0.00018890779581706303,
"loss": 0.9474,
"step": 590
},
{
"epoch": 0.17219113215669393,
"grad_norm": 15.780831336975098,
"learning_rate": 0.00018848305761935797,
"loss": 0.9528,
"step": 600
},
{
"epoch": 0.1750609843593055,
"grad_norm": 9.415815353393555,
"learning_rate": 0.00018805083564044802,
"loss": 0.8619,
"step": 610
},
{
"epoch": 0.17793083656191705,
"grad_norm": 9.250490188598633,
"learning_rate": 0.0001876111664371025,
"loss": 0.9168,
"step": 620
},
{
"epoch": 0.18080068876452862,
"grad_norm": 15.730814933776855,
"learning_rate": 0.0001871640871959672,
"loss": 0.94,
"step": 630
},
{
"epoch": 0.1836705409671402,
"grad_norm": 9.073026657104492,
"learning_rate": 0.0001867096357304191,
"loss": 0.9471,
"step": 640
},
{
"epoch": 0.18654039316975177,
"grad_norm": 8.982126235961914,
"learning_rate": 0.00018624785047736842,
"loss": 0.9177,
"step": 650
},
{
"epoch": 0.1894102453723633,
"grad_norm": 10.682122230529785,
"learning_rate": 0.00018577877049400746,
"loss": 0.9402,
"step": 660
},
{
"epoch": 0.19228009757497488,
"grad_norm": 8.706944465637207,
"learning_rate": 0.0001853024354545073,
"loss": 0.8867,
"step": 670
},
{
"epoch": 0.19514994977758646,
"grad_norm": 5.8472371101379395,
"learning_rate": 0.00018481888564666208,
"loss": 0.9135,
"step": 680
},
{
"epoch": 0.19801980198019803,
"grad_norm": 5.432713508605957,
"learning_rate": 0.00018432816196848172,
"loss": 0.8525,
"step": 690
},
{
"epoch": 0.20088965418280957,
"grad_norm": 28.993038177490234,
"learning_rate": 0.00018383030592473266,
"loss": 0.8779,
"step": 700
},
{
"epoch": 0.20375950638542115,
"grad_norm": 5.313049793243408,
"learning_rate": 0.0001833253596234274,
"loss": 0.9551,
"step": 710
},
{
"epoch": 0.20662935858803272,
"grad_norm": 18.639175415039062,
"learning_rate": 0.00018281336577226327,
"loss": 0.8694,
"step": 720
},
{
"epoch": 0.2094992107906443,
"grad_norm": 15.578129768371582,
"learning_rate": 0.00018229436767501012,
"loss": 0.9017,
"step": 730
},
{
"epoch": 0.21236906299325584,
"grad_norm": 18.0419864654541,
"learning_rate": 0.0001817684092278477,
"loss": 0.8616,
"step": 740
},
{
"epoch": 0.2152389151958674,
"grad_norm": 8.34323787689209,
"learning_rate": 0.00018123553491565308,
"loss": 0.8902,
"step": 750
},
{
"epoch": 0.21810876739847898,
"grad_norm": 8.49802017211914,
"learning_rate": 0.00018069578980823816,
"loss": 0.8781,
"step": 760
},
{
"epoch": 0.22097861960109055,
"grad_norm": 6.250750541687012,
"learning_rate": 0.00018014921955653772,
"loss": 0.8405,
"step": 770
},
{
"epoch": 0.2238484718037021,
"grad_norm": 25.283082962036133,
"learning_rate": 0.00017959587038874822,
"loss": 0.93,
"step": 780
},
{
"epoch": 0.22671832400631367,
"grad_norm": 18.443071365356445,
"learning_rate": 0.00017903578910641814,
"loss": 0.9202,
"step": 790
},
{
"epoch": 0.22958817620892524,
"grad_norm": 18.457555770874023,
"learning_rate": 0.0001784690230804892,
"loss": 0.9446,
"step": 800
},
{
"epoch": 0.23245802841153682,
"grad_norm": 7.786270618438721,
"learning_rate": 0.00017789562024729012,
"loss": 0.899,
"step": 810
},
{
"epoch": 0.23532788061414836,
"grad_norm": 6.527904033660889,
"learning_rate": 0.00017731562910448202,
"loss": 0.8866,
"step": 820
},
{
"epoch": 0.23819773281675993,
"grad_norm": 8.394437789916992,
"learning_rate": 0.00017672909870695665,
"loss": 0.8749,
"step": 830
},
{
"epoch": 0.2410675850193715,
"grad_norm": 6.815917491912842,
"learning_rate": 0.00017613607866268742,
"loss": 0.8542,
"step": 840
},
{
"epoch": 0.24393743722198308,
"grad_norm": 16.42218780517578,
"learning_rate": 0.00017553661912853347,
"loss": 0.8658,
"step": 850
},
{
"epoch": 0.24680728942459462,
"grad_norm": 14.373140335083008,
"learning_rate": 0.00017493077080599768,
"loss": 0.8756,
"step": 860
},
{
"epoch": 0.2496771416272062,
"grad_norm": 17.368059158325195,
"learning_rate": 0.0001743185849369381,
"loss": 0.9572,
"step": 870
},
{
"epoch": 0.25254699382981777,
"grad_norm": 8.744333267211914,
"learning_rate": 0.0001737001132992344,
"loss": 0.8743,
"step": 880
},
{
"epoch": 0.2554168460324293,
"grad_norm": 9.240042686462402,
"learning_rate": 0.0001730754082024082,
"loss": 0.8666,
"step": 890
},
{
"epoch": 0.2582866982350409,
"grad_norm": 8.81686782836914,
"learning_rate": 0.00017244452248319896,
"loss": 0.8771,
"step": 900
},
{
"epoch": 0.26115655043765246,
"grad_norm": 46.30351638793945,
"learning_rate": 0.00017180750950109504,
"loss": 0.788,
"step": 910
},
{
"epoch": 0.264026402640264,
"grad_norm": 6.262620449066162,
"learning_rate": 0.0001711644231338208,
"loss": 0.916,
"step": 920
},
{
"epoch": 0.2668962548428756,
"grad_norm": 7.936816215515137,
"learning_rate": 0.00017051531777277952,
"loss": 0.8425,
"step": 930
},
{
"epoch": 0.26976610704548715,
"grad_norm": 10.233474731445312,
"learning_rate": 0.00016986024831845296,
"loss": 0.9159,
"step": 940
},
{
"epoch": 0.27263595924809875,
"grad_norm": 13.751338958740234,
"learning_rate": 0.00016919927017575832,
"loss": 0.8484,
"step": 950
},
{
"epoch": 0.2755058114507103,
"grad_norm": 18.70934295654297,
"learning_rate": 0.00016853243924936173,
"loss": 0.8387,
"step": 960
},
{
"epoch": 0.27837566365332184,
"grad_norm": 6.2156853675842285,
"learning_rate": 0.0001678598119389502,
"loss": 0.9127,
"step": 970
},
{
"epoch": 0.28124551585593344,
"grad_norm": 10.486414909362793,
"learning_rate": 0.00016718144513446127,
"loss": 0.861,
"step": 980
},
{
"epoch": 0.284115368058545,
"grad_norm": 7.782724380493164,
"learning_rate": 0.00016649739621127146,
"loss": 0.8739,
"step": 990
},
{
"epoch": 0.2869852202611565,
"grad_norm": 30.388168334960938,
"learning_rate": 0.00016580772302534337,
"loss": 0.9009,
"step": 1000
},
{
"epoch": 0.2898550724637681,
"grad_norm": 7.943617343902588,
"learning_rate": 0.0001651124839083324,
"loss": 0.8113,
"step": 1010
},
{
"epoch": 0.29272492466637967,
"grad_norm": 8.402076721191406,
"learning_rate": 0.00016441173766265315,
"loss": 0.8076,
"step": 1020
},
{
"epoch": 0.29559477686899127,
"grad_norm": 7.3927764892578125,
"learning_rate": 0.00016370554355650584,
"loss": 0.8263,
"step": 1030
},
{
"epoch": 0.2984646290716028,
"grad_norm": 8.749371528625488,
"learning_rate": 0.0001629939613188638,
"loss": 0.8673,
"step": 1040
},
{
"epoch": 0.30133448127421436,
"grad_norm": 4.924167156219482,
"learning_rate": 0.0001622770511344213,
"loss": 0.869,
"step": 1050
},
{
"epoch": 0.30420433347682596,
"grad_norm": 34.14529037475586,
"learning_rate": 0.00016155487363850342,
"loss": 0.9202,
"step": 1060
},
{
"epoch": 0.3070741856794375,
"grad_norm": 13.217582702636719,
"learning_rate": 0.00016082748991193757,
"loss": 0.8409,
"step": 1070
},
{
"epoch": 0.30994403788204905,
"grad_norm": 19.251298904418945,
"learning_rate": 0.00016009496147588735,
"loss": 0.8624,
"step": 1080
},
{
"epoch": 0.31281389008466065,
"grad_norm": 52.710453033447266,
"learning_rate": 0.00015935735028664908,
"loss": 0.8695,
"step": 1090
},
{
"epoch": 0.3156837422872722,
"grad_norm": 15.96419906616211,
"learning_rate": 0.00015861471873041184,
"loss": 0.8773,
"step": 1100
},
{
"epoch": 0.3185535944898838,
"grad_norm": 7.947400093078613,
"learning_rate": 0.0001578671296179806,
"loss": 0.8387,
"step": 1110
},
{
"epoch": 0.32142344669249534,
"grad_norm": 13.167436599731445,
"learning_rate": 0.00015711464617946402,
"loss": 0.8582,
"step": 1120
},
{
"epoch": 0.3242932988951069,
"grad_norm": 11.579595565795898,
"learning_rate": 0.00015635733205892653,
"loss": 0.8615,
"step": 1130
},
{
"epoch": 0.3271631510977185,
"grad_norm": 4.840546131134033,
"learning_rate": 0.00015559525130900523,
"loss": 0.822,
"step": 1140
},
{
"epoch": 0.33003300330033003,
"grad_norm": 8.159014701843262,
"learning_rate": 0.0001548284683854925,
"loss": 0.8512,
"step": 1150
},
{
"epoch": 0.3329028555029416,
"grad_norm": 33.13652038574219,
"learning_rate": 0.00015405704814188442,
"loss": 0.8686,
"step": 1160
},
{
"epoch": 0.3357727077055532,
"grad_norm": 5.398830890655518,
"learning_rate": 0.00015328105582389557,
"loss": 0.8685,
"step": 1170
},
{
"epoch": 0.3386425599081647,
"grad_norm": 23.8563289642334,
"learning_rate": 0.00015250055706394057,
"loss": 0.8617,
"step": 1180
},
{
"epoch": 0.3415124121107763,
"grad_norm": 5.886293411254883,
"learning_rate": 0.00015171561787558297,
"loss": 0.8559,
"step": 1190
},
{
"epoch": 0.34438226431338786,
"grad_norm": 7.887658596038818,
"learning_rate": 0.000150926304647952,
"loss": 0.8811,
"step": 1200
},
{
"epoch": 0.3472521165159994,
"grad_norm": 6.111181259155273,
"learning_rate": 0.00015013268414012742,
"loss": 0.8297,
"step": 1210
},
{
"epoch": 0.350121968718611,
"grad_norm": 6.417325496673584,
"learning_rate": 0.00014933482347549303,
"loss": 0.8296,
"step": 1220
},
{
"epoch": 0.35299182092122255,
"grad_norm": 48.331573486328125,
"learning_rate": 0.00014853279013605957,
"loss": 0.7966,
"step": 1230
},
{
"epoch": 0.3558616731238341,
"grad_norm": 8.638408660888672,
"learning_rate": 0.00014772665195675718,
"loss": 0.8522,
"step": 1240
},
{
"epoch": 0.3587315253264457,
"grad_norm": 6.308197498321533,
"learning_rate": 0.00014691647711969803,
"loss": 0.8228,
"step": 1250
},
{
"epoch": 0.36160137752905724,
"grad_norm": 6.23061990737915,
"learning_rate": 0.0001461023341484094,
"loss": 0.7915,
"step": 1260
},
{
"epoch": 0.36447122973166884,
"grad_norm": 6.377804756164551,
"learning_rate": 0.00014528429190203824,
"loss": 0.8486,
"step": 1270
},
{
"epoch": 0.3673410819342804,
"grad_norm": 6.146363258361816,
"learning_rate": 0.00014446241956952714,
"loss": 0.8927,
"step": 1280
},
{
"epoch": 0.37021093413689193,
"grad_norm": 3.900587320327759,
"learning_rate": 0.0001436367866637622,
"loss": 0.8167,
"step": 1290
},
{
"epoch": 0.37308078633950353,
"grad_norm": 8.58018684387207,
"learning_rate": 0.00014280746301569407,
"loss": 0.8128,
"step": 1300
},
{
"epoch": 0.3759506385421151,
"grad_norm": 5.754461288452148,
"learning_rate": 0.00014197451876843138,
"loss": 0.8441,
"step": 1310
},
{
"epoch": 0.3788204907447266,
"grad_norm": 7.290277004241943,
"learning_rate": 0.00014113802437130845,
"loss": 0.8555,
"step": 1320
},
{
"epoch": 0.3816903429473382,
"grad_norm": 43.14801788330078,
"learning_rate": 0.00014029805057392655,
"loss": 0.8299,
"step": 1330
},
{
"epoch": 0.38456019514994977,
"grad_norm": 5.909049034118652,
"learning_rate": 0.0001394546684201701,
"loss": 0.8448,
"step": 1340
},
{
"epoch": 0.38743004735256137,
"grad_norm": 4.810829162597656,
"learning_rate": 0.00013860794924219782,
"loss": 0.8592,
"step": 1350
},
{
"epoch": 0.3902998995551729,
"grad_norm": 6.602210998535156,
"learning_rate": 0.00013775796465440956,
"loss": 0.8351,
"step": 1360
},
{
"epoch": 0.39316975175778446,
"grad_norm": 7.952111721038818,
"learning_rate": 0.0001369047865473893,
"loss": 0.8243,
"step": 1370
},
{
"epoch": 0.39603960396039606,
"grad_norm": 8.271283149719238,
"learning_rate": 0.00013604848708182466,
"loss": 0.8239,
"step": 1380
},
{
"epoch": 0.3989094561630076,
"grad_norm": 12.694669723510742,
"learning_rate": 0.00013518913868240372,
"loss": 0.8381,
"step": 1390
},
{
"epoch": 0.40177930836561915,
"grad_norm": 22.169252395629883,
"learning_rate": 0.00013432681403168932,
"loss": 0.8227,
"step": 1400
},
{
"epoch": 0.40464916056823075,
"grad_norm": 127.96073913574219,
"learning_rate": 0.00013346158606397182,
"loss": 0.8376,
"step": 1410
},
{
"epoch": 0.4075190127708423,
"grad_norm": 12.16250991821289,
"learning_rate": 0.0001325935279591003,
"loss": 0.8253,
"step": 1420
},
{
"epoch": 0.4103888649734539,
"grad_norm": 11.346808433532715,
"learning_rate": 0.00013172271313629315,
"loss": 0.8554,
"step": 1430
},
{
"epoch": 0.41325871717606544,
"grad_norm": 18.371610641479492,
"learning_rate": 0.0001308492152479283,
"loss": 0.7743,
"step": 1440
},
{
"epoch": 0.416128569378677,
"grad_norm": 17.174100875854492,
"learning_rate": 0.00012997310817331392,
"loss": 0.8342,
"step": 1450
},
{
"epoch": 0.4189984215812886,
"grad_norm": 15.853143692016602,
"learning_rate": 0.00012909446601243972,
"loss": 0.8514,
"step": 1460
},
{
"epoch": 0.4218682737839001,
"grad_norm": 6.734909534454346,
"learning_rate": 0.00012821336307970965,
"loss": 0.7947,
"step": 1470
},
{
"epoch": 0.42473812598651167,
"grad_norm": 7.687751770019531,
"learning_rate": 0.00012732987389765658,
"loss": 0.8249,
"step": 1480
},
{
"epoch": 0.4276079781891233,
"grad_norm": 4.791903972625732,
"learning_rate": 0.00012644407319063918,
"loss": 0.7755,
"step": 1490
},
{
"epoch": 0.4304778303917348,
"grad_norm": 3.5958361625671387,
"learning_rate": 0.0001255560358785219,
"loss": 0.7828,
"step": 1500
},
{
"epoch": 0.4333476825943464,
"grad_norm": 5.9140400886535645,
"learning_rate": 0.00012466583707033832,
"loss": 0.8044,
"step": 1510
},
{
"epoch": 0.43621753479695796,
"grad_norm": 5.575759410858154,
"learning_rate": 0.00012377355205793854,
"loss": 0.7996,
"step": 1520
},
{
"epoch": 0.4390873869995695,
"grad_norm": 6.771875381469727,
"learning_rate": 0.00012287925630962107,
"loss": 0.8261,
"step": 1530
},
{
"epoch": 0.4419572392021811,
"grad_norm": 18.849271774291992,
"learning_rate": 0.00012198302546374978,
"loss": 0.8224,
"step": 1540
},
{
"epoch": 0.44482709140479265,
"grad_norm": 5.645337104797363,
"learning_rate": 0.00012108493532235666,
"loss": 0.8185,
"step": 1550
},
{
"epoch": 0.4476969436074042,
"grad_norm": 4.3476481437683105,
"learning_rate": 0.00012018506184473038,
"loss": 0.7985,
"step": 1560
},
{
"epoch": 0.4505667958100158,
"grad_norm": 8.391561508178711,
"learning_rate": 0.00011928348114099195,
"loss": 0.7965,
"step": 1570
},
{
"epoch": 0.45343664801262734,
"grad_norm": 11.707796096801758,
"learning_rate": 0.00011838026946565723,
"loss": 0.8174,
"step": 1580
},
{
"epoch": 0.45630650021523894,
"grad_norm": 9.046381950378418,
"learning_rate": 0.00011747550321118763,
"loss": 0.8,
"step": 1590
},
{
"epoch": 0.4591763524178505,
"grad_norm": 8.26490306854248,
"learning_rate": 0.00011656925890152877,
"loss": 0.8229,
"step": 1600
},
{
"epoch": 0.46204620462046203,
"grad_norm": 6.398012638092041,
"learning_rate": 0.00011566161318563821,
"loss": 0.8027,
"step": 1610
},
{
"epoch": 0.46491605682307363,
"grad_norm": 5.92479133605957,
"learning_rate": 0.0001147526428310027,
"loss": 0.8094,
"step": 1620
},
{
"epoch": 0.4677859090256852,
"grad_norm": 7.79962158203125,
"learning_rate": 0.00011384242471714512,
"loss": 0.8049,
"step": 1630
},
{
"epoch": 0.4706557612282967,
"grad_norm": 4.564454078674316,
"learning_rate": 0.00011293103582912221,
"loss": 0.8382,
"step": 1640
},
{
"epoch": 0.4735256134309083,
"grad_norm": 20.43712043762207,
"learning_rate": 0.00011201855325101332,
"loss": 0.829,
"step": 1650
},
{
"epoch": 0.47639546563351987,
"grad_norm": 5.778446674346924,
"learning_rate": 0.0001111050541594006,
"loss": 0.8333,
"step": 1660
},
{
"epoch": 0.47926531783613147,
"grad_norm": 5.030070781707764,
"learning_rate": 0.00011019061581684165,
"loss": 0.769,
"step": 1670
},
{
"epoch": 0.482135170038743,
"grad_norm": 5.967840671539307,
"learning_rate": 0.00010927531556533456,
"loss": 0.8041,
"step": 1680
},
{
"epoch": 0.48500502224135456,
"grad_norm": 4.707633972167969,
"learning_rate": 0.00010835923081977673,
"loss": 0.8105,
"step": 1690
},
{
"epoch": 0.48787487444396616,
"grad_norm": 6.354760646820068,
"learning_rate": 0.0001074424390614169,
"loss": 0.8031,
"step": 1700
},
{
"epoch": 0.4907447266465777,
"grad_norm": 6.2033915519714355,
"learning_rate": 0.00010652501783130208,
"loss": 0.7559,
"step": 1710
},
{
"epoch": 0.49361457884918924,
"grad_norm": 3.7331125736236572,
"learning_rate": 0.00010560704472371919,
"loss": 0.8233,
"step": 1720
},
{
"epoch": 0.49648443105180085,
"grad_norm": 9.511772155761719,
"learning_rate": 0.00010468859737963217,
"loss": 0.7945,
"step": 1730
},
{
"epoch": 0.4993542832544124,
"grad_norm": 12.07361125946045,
"learning_rate": 0.00010376975348011533,
"loss": 0.8368,
"step": 1740
},
{
"epoch": 0.5022241354570239,
"grad_norm": 4.957511901855469,
"learning_rate": 0.00010285059073978312,
"loss": 0.8241,
"step": 1750
},
{
"epoch": 0.5050939876596355,
"grad_norm": 4.124336242675781,
"learning_rate": 0.00010193118690021699,
"loss": 0.807,
"step": 1760
},
{
"epoch": 0.5079638398622471,
"grad_norm": 4.789161205291748,
"learning_rate": 0.00010101161972339046,
"loss": 0.8143,
"step": 1770
},
{
"epoch": 0.5108336920648586,
"grad_norm": 5.026962757110596,
"learning_rate": 0.00010009196698509173,
"loss": 0.7765,
"step": 1780
},
{
"epoch": 0.5137035442674702,
"grad_norm": 8.285078048706055,
"learning_rate": 9.91723064683458e-05,
"loss": 0.8053,
"step": 1790
},
{
"epoch": 0.5165733964700818,
"grad_norm": 4.77803897857666,
"learning_rate": 9.825271595683548e-05,
"loss": 0.8072,
"step": 1800
},
{
"epoch": 0.5194432486726933,
"grad_norm": 4.466314315795898,
"learning_rate": 9.73332732283226e-05,
"loss": 0.7936,
"step": 1810
},
{
"epoch": 0.5223131008753049,
"grad_norm": 6.21898078918457,
"learning_rate": 9.641405604806983e-05,
"loss": 0.8018,
"step": 1820
},
{
"epoch": 0.5251829530779165,
"grad_norm": 3.505802869796753,
"learning_rate": 9.549514216226311e-05,
"loss": 0.823,
"step": 1830
},
{
"epoch": 0.528052805280528,
"grad_norm": 4.254824161529541,
"learning_rate": 9.45766092914363e-05,
"loss": 0.824,
"step": 1840
},
{
"epoch": 0.5309226574831396,
"grad_norm": 10.659527778625488,
"learning_rate": 9.365853512389735e-05,
"loss": 0.8169,
"step": 1850
},
{
"epoch": 0.5337925096857512,
"grad_norm": 5.28292989730835,
"learning_rate": 9.274099730915778e-05,
"loss": 0.8076,
"step": 1860
},
{
"epoch": 0.5366623618883628,
"grad_norm": 5.907596588134766,
"learning_rate": 9.182407345136506e-05,
"loss": 0.7863,
"step": 1870
},
{
"epoch": 0.5395322140909743,
"grad_norm": 4.142882347106934,
"learning_rate": 9.090784110273896e-05,
"loss": 0.8133,
"step": 1880
},
{
"epoch": 0.5424020662935859,
"grad_norm": 4.616401195526123,
"learning_rate": 8.99923777570124e-05,
"loss": 0.7853,
"step": 1890
},
{
"epoch": 0.5452719184961975,
"grad_norm": 7.957604885101318,
"learning_rate": 8.907776084287693e-05,
"loss": 0.8275,
"step": 1900
},
{
"epoch": 0.548141770698809,
"grad_norm": 3.326878070831299,
"learning_rate": 8.816406771743412e-05,
"loss": 0.7724,
"step": 1910
},
{
"epoch": 0.5510116229014206,
"grad_norm": 4.447857856750488,
"learning_rate": 8.725137565965262e-05,
"loss": 0.8049,
"step": 1920
},
{
"epoch": 0.5538814751040322,
"grad_norm": 5.452672004699707,
"learning_rate": 8.633976186383217e-05,
"loss": 0.8034,
"step": 1930
},
{
"epoch": 0.5567513273066437,
"grad_norm": 5.054596900939941,
"learning_rate": 8.542930343307444e-05,
"loss": 0.7745,
"step": 1940
},
{
"epoch": 0.5596211795092553,
"grad_norm": 25.82883071899414,
"learning_rate": 8.452007737276191e-05,
"loss": 0.7756,
"step": 1950
},
{
"epoch": 0.5624910317118669,
"grad_norm": 4.046459197998047,
"learning_rate": 8.361216058404468e-05,
"loss": 0.7597,
"step": 1960
},
{
"epoch": 0.5653608839144784,
"grad_norm": 18.29205894470215,
"learning_rate": 8.270562985733652e-05,
"loss": 0.7863,
"step": 1970
},
{
"epoch": 0.56823073611709,
"grad_norm": 7.219738006591797,
"learning_rate": 8.180056186581976e-05,
"loss": 0.7651,
"step": 1980
},
{
"epoch": 0.5711005883197016,
"grad_norm": 4.146981716156006,
"learning_rate": 8.089703315896058e-05,
"loss": 0.7578,
"step": 1990
},
{
"epoch": 0.573970440522313,
"grad_norm": 4.7924675941467285,
"learning_rate": 7.999512015603438e-05,
"loss": 0.7974,
"step": 2000
},
{
"epoch": 0.5768402927249247,
"grad_norm": 5.102847576141357,
"learning_rate": 7.909489913966261e-05,
"loss": 0.805,
"step": 2010
},
{
"epoch": 0.5797101449275363,
"grad_norm": 5.353450298309326,
"learning_rate": 7.819644624936051e-05,
"loss": 0.7895,
"step": 2020
},
{
"epoch": 0.5825799971301477,
"grad_norm": 5.74714469909668,
"learning_rate": 7.72998374750977e-05,
"loss": 0.8029,
"step": 2030
},
{
"epoch": 0.5854498493327593,
"grad_norm": 4.67111873626709,
"learning_rate": 7.640514865087077e-05,
"loss": 0.7763,
"step": 2040
},
{
"epoch": 0.5883197015353709,
"grad_norm": 4.226963996887207,
"learning_rate": 7.551245544828944e-05,
"loss": 0.7935,
"step": 2050
},
{
"epoch": 0.5911895537379825,
"grad_norm": 6.067037105560303,
"learning_rate": 7.46218333701765e-05,
"loss": 0.7835,
"step": 2060
},
{
"epoch": 0.594059405940594,
"grad_norm": 6.7161736488342285,
"learning_rate": 7.373335774418158e-05,
"loss": 0.7793,
"step": 2070
},
{
"epoch": 0.5969292581432056,
"grad_norm": 4.633667945861816,
"learning_rate": 7.28471037164103e-05,
"loss": 0.793,
"step": 2080
},
{
"epoch": 0.5997991103458172,
"grad_norm": 5.508072376251221,
"learning_rate": 7.196314624506834e-05,
"loss": 0.7589,
"step": 2090
},
{
"epoch": 0.6026689625484287,
"grad_norm": 4.465757369995117,
"learning_rate": 7.108156009412176e-05,
"loss": 0.7569,
"step": 2100
},
{
"epoch": 0.6055388147510403,
"grad_norm": 3.5824501514434814,
"learning_rate": 7.02024198269733e-05,
"loss": 0.7963,
"step": 2110
},
{
"epoch": 0.6084086669536519,
"grad_norm": 8.07539176940918,
"learning_rate": 6.932579980015618e-05,
"loss": 0.8183,
"step": 2120
},
{
"epoch": 0.6112785191562634,
"grad_norm": 5.9698615074157715,
"learning_rate": 6.845177415704484e-05,
"loss": 0.749,
"step": 2130
},
{
"epoch": 0.614148371358875,
"grad_norm": 4.034762859344482,
"learning_rate": 6.758041682158431e-05,
"loss": 0.7853,
"step": 2140
},
{
"epoch": 0.6170182235614866,
"grad_norm": 8.13531494140625,
"learning_rate": 6.671180149203751e-05,
"loss": 0.7871,
"step": 2150
},
{
"epoch": 0.6198880757640981,
"grad_norm": 5.809640884399414,
"learning_rate": 6.584600163475222e-05,
"loss": 0.8037,
"step": 2160
},
{
"epoch": 0.6227579279667097,
"grad_norm": 5.849427223205566,
"learning_rate": 6.498309047794713e-05,
"loss": 0.8076,
"step": 2170
},
{
"epoch": 0.6256277801693213,
"grad_norm": 4.466967582702637,
"learning_rate": 6.412314100551854e-05,
"loss": 0.7863,
"step": 2180
},
{
"epoch": 0.6284976323719328,
"grad_norm": 4.934723377227783,
"learning_rate": 6.326622595086722e-05,
"loss": 0.7747,
"step": 2190
},
{
"epoch": 0.6313674845745444,
"grad_norm": 4.067635536193848,
"learning_rate": 6.241241779074705e-05,
"loss": 0.7804,
"step": 2200
},
{
"epoch": 0.634237336777156,
"grad_norm": 4.629720687866211,
"learning_rate": 6.156178873913468e-05,
"loss": 0.7672,
"step": 2210
},
{
"epoch": 0.6371071889797676,
"grad_norm": 3.9992971420288086,
"learning_rate": 6.071441074112194e-05,
"loss": 0.7856,
"step": 2220
},
{
"epoch": 0.6399770411823791,
"grad_norm": 6.1507062911987305,
"learning_rate": 5.9870355466830885e-05,
"loss": 0.752,
"step": 2230
},
{
"epoch": 0.6428468933849907,
"grad_norm": 4.305118083953857,
"learning_rate": 5.902969430535186e-05,
"loss": 0.7506,
"step": 2240
},
{
"epoch": 0.6457167455876023,
"grad_norm": 3.7307469844818115,
"learning_rate": 5.819249835870566e-05,
"loss": 0.7744,
"step": 2250
},
{
"epoch": 0.6485865977902138,
"grad_norm": 5.391602516174316,
"learning_rate": 5.7358838435829664e-05,
"loss": 0.8067,
"step": 2260
},
{
"epoch": 0.6514564499928254,
"grad_norm": 4.221368789672852,
"learning_rate": 5.6528785046589115e-05,
"loss": 0.8257,
"step": 2270
},
{
"epoch": 0.654326302195437,
"grad_norm": 5.274345397949219,
"learning_rate": 5.570240839581323e-05,
"loss": 0.7638,
"step": 2280
},
{
"epoch": 0.6571961543980485,
"grad_norm": 4.528804779052734,
"learning_rate": 5.487977837735756e-05,
"loss": 0.7805,
"step": 2290
},
{
"epoch": 0.6600660066006601,
"grad_norm": 4.387100696563721,
"learning_rate": 5.406096456819234e-05,
"loss": 0.7811,
"step": 2300
},
{
"epoch": 0.6629358588032717,
"grad_norm": 5.64663028717041,
"learning_rate": 5.324603622251797e-05,
"loss": 0.771,
"step": 2310
},
{
"epoch": 0.6658057110058831,
"grad_norm": 4.328652381896973,
"learning_rate": 5.243506226590722e-05,
"loss": 0.7711,
"step": 2320
},
{
"epoch": 0.6686755632084947,
"grad_norm": 4.763848781585693,
"learning_rate": 5.162811128947602e-05,
"loss": 0.7849,
"step": 2330
},
{
"epoch": 0.6715454154111064,
"grad_norm": 6.142160892486572,
"learning_rate": 5.082525154408173e-05,
"loss": 0.7587,
"step": 2340
},
{
"epoch": 0.6744152676137178,
"grad_norm": 6.3459553718566895,
"learning_rate": 5.002655093455086e-05,
"loss": 0.7762,
"step": 2350
},
{
"epoch": 0.6772851198163294,
"grad_norm": 5.520603656768799,
"learning_rate": 4.9232077013935606e-05,
"loss": 0.7854,
"step": 2360
},
{
"epoch": 0.680154972018941,
"grad_norm": 3.9489786624908447,
"learning_rate": 4.844189697780033e-05,
"loss": 0.7599,
"step": 2370
},
{
"epoch": 0.6830248242215526,
"grad_norm": 5.653624057769775,
"learning_rate": 4.765607765853828e-05,
"loss": 0.7875,
"step": 2380
},
{
"epoch": 0.6858946764241641,
"grad_norm": 4.3883957862854,
"learning_rate": 4.6874685519718945e-05,
"loss": 0.7825,
"step": 2390
},
{
"epoch": 0.6887645286267757,
"grad_norm": 3.743744134902954,
"learning_rate": 4.60977866504668e-05,
"loss": 0.7796,
"step": 2400
},
{
"epoch": 0.6916343808293873,
"grad_norm": 5.168239593505859,
"learning_rate": 4.5325446759871316e-05,
"loss": 0.7764,
"step": 2410
},
{
"epoch": 0.6945042330319988,
"grad_norm": 3.202075958251953,
"learning_rate": 4.455773117142965e-05,
"loss": 0.7483,
"step": 2420
},
{
"epoch": 0.6973740852346104,
"grad_norm": 4.126010417938232,
"learning_rate": 4.379470481752139e-05,
"loss": 0.7702,
"step": 2430
},
{
"epoch": 0.700243937437222,
"grad_norm": 5.2914509773254395,
"learning_rate": 4.303643223391698e-05,
"loss": 0.7663,
"step": 2440
},
{
"epoch": 0.7031137896398335,
"grad_norm": 5.010975360870361,
"learning_rate": 4.2282977554319034e-05,
"loss": 0.7911,
"step": 2450
},
{
"epoch": 0.7059836418424451,
"grad_norm": 3.504735231399536,
"learning_rate": 4.153440450493823e-05,
"loss": 0.7452,
"step": 2460
},
{
"epoch": 0.7088534940450567,
"grad_norm": 5.5859880447387695,
"learning_rate": 4.0790776399103294e-05,
"loss": 0.758,
"step": 2470
},
{
"epoch": 0.7117233462476682,
"grad_norm": 6.027501583099365,
"learning_rate": 4.0052156131906214e-05,
"loss": 0.7945,
"step": 2480
},
{
"epoch": 0.7145931984502798,
"grad_norm": 5.546058654785156,
"learning_rate": 3.93186061748824e-05,
"loss": 0.7676,
"step": 2490
},
{
"epoch": 0.7174630506528914,
"grad_norm": 4.879994869232178,
"learning_rate": 3.859018857072719e-05,
"loss": 0.7926,
"step": 2500
},
{
"epoch": 0.7203329028555029,
"grad_norm": 4.717655181884766,
"learning_rate": 3.786696492804812e-05,
"loss": 0.7451,
"step": 2510
},
{
"epoch": 0.7232027550581145,
"grad_norm": 6.432432174682617,
"learning_rate": 3.714899641615438e-05,
"loss": 0.7938,
"step": 2520
},
{
"epoch": 0.7260726072607261,
"grad_norm": 5.008986473083496,
"learning_rate": 3.6436343759882926e-05,
"loss": 0.765,
"step": 2530
},
{
"epoch": 0.7289424594633377,
"grad_norm": 7.00074577331543,
"learning_rate": 3.5729067234462785e-05,
"loss": 0.7794,
"step": 2540
},
{
"epoch": 0.7318123116659492,
"grad_norm": 6.525863170623779,
"learning_rate": 3.5027226660416736e-05,
"loss": 0.7979,
"step": 2550
},
{
"epoch": 0.7346821638685608,
"grad_norm": 5.4863786697387695,
"learning_rate": 3.433088139850193e-05,
"loss": 0.7625,
"step": 2560
},
{
"epoch": 0.7375520160711724,
"grad_norm": 3.975086212158203,
"learning_rate": 3.364009034468926e-05,
"loss": 0.7471,
"step": 2570
},
{
"epoch": 0.7404218682737839,
"grad_norm": 3.787874460220337,
"learning_rate": 3.2954911925181876e-05,
"loss": 0.7662,
"step": 2580
},
{
"epoch": 0.7432917204763955,
"grad_norm": 4.633001804351807,
"learning_rate": 3.2275404091473795e-05,
"loss": 0.774,
"step": 2590
},
{
"epoch": 0.7461615726790071,
"grad_norm": 4.832580089569092,
"learning_rate": 3.1601624315448166e-05,
"loss": 0.7749,
"step": 2600
},
{
"epoch": 0.7490314248816186,
"grad_norm": 4.763906955718994,
"learning_rate": 3.0933629584516665e-05,
"loss": 0.7438,
"step": 2610
},
{
"epoch": 0.7519012770842302,
"grad_norm": 4.065663814544678,
"learning_rate": 3.027147639679928e-05,
"loss": 0.7546,
"step": 2620
},
{
"epoch": 0.7547711292868418,
"grad_norm": 4.496669769287109,
"learning_rate": 2.961522075634604e-05,
"loss": 0.7878,
"step": 2630
},
{
"epoch": 0.7576409814894532,
"grad_norm": 3.8822827339172363,
"learning_rate": 2.896491816840008e-05,
"loss": 0.7884,
"step": 2640
},
{
"epoch": 0.7605108336920648,
"grad_norm": 4.25615119934082,
"learning_rate": 2.8320623634703147e-05,
"loss": 0.7418,
"step": 2650
},
{
"epoch": 0.7633806858946764,
"grad_norm": 4.472879886627197,
"learning_rate": 2.76823916488436e-05,
"loss": 0.7944,
"step": 2660
},
{
"epoch": 0.7662505380972879,
"grad_norm": 6.644125938415527,
"learning_rate": 2.705027619164754e-05,
"loss": 0.7525,
"step": 2670
},
{
"epoch": 0.7691203902998995,
"grad_norm": 3.8960325717926025,
"learning_rate": 2.6424330726612946e-05,
"loss": 0.748,
"step": 2680
},
{
"epoch": 0.7719902425025111,
"grad_norm": 3.907740354537964,
"learning_rate": 2.5804608195388057e-05,
"loss": 0.7686,
"step": 2690
},
{
"epoch": 0.7748600947051227,
"grad_norm": 4.432440757751465,
"learning_rate": 2.5191161013293396e-05,
"loss": 0.7671,
"step": 2700
},
{
"epoch": 0.7777299469077342,
"grad_norm": 4.681542873382568,
"learning_rate": 2.4584041064888798e-05,
"loss": 0.765,
"step": 2710
},
{
"epoch": 0.7805997991103458,
"grad_norm": 4.8185343742370605,
"learning_rate": 2.398329969958486e-05,
"loss": 0.772,
"step": 2720
},
{
"epoch": 0.7834696513129574,
"grad_norm": 4.85504150390625,
"learning_rate": 2.3388987727299982e-05,
"loss": 0.7655,
"step": 2730
},
{
"epoch": 0.7863395035155689,
"grad_norm": 4.443562030792236,
"learning_rate": 2.2801155414162934e-05,
"loss": 0.7885,
"step": 2740
},
{
"epoch": 0.7892093557181805,
"grad_norm": 4.084039211273193,
"learning_rate": 2.221985247826138e-05,
"loss": 0.7679,
"step": 2750
},
{
"epoch": 0.7920792079207921,
"grad_norm": 5.327516555786133,
"learning_rate": 2.164512808543686e-05,
"loss": 0.7704,
"step": 2760
},
{
"epoch": 0.7949490601234036,
"grad_norm": 5.7689313888549805,
"learning_rate": 2.1077030845126256e-05,
"loss": 0.7572,
"step": 2770
},
{
"epoch": 0.7978189123260152,
"grad_norm": 5.112376689910889,
"learning_rate": 2.0515608806250665e-05,
"loss": 0.7633,
"step": 2780
},
{
"epoch": 0.8006887645286268,
"grad_norm": 4.748579502105713,
"learning_rate": 1.996090945315128e-05,
"loss": 0.7757,
"step": 2790
},
{
"epoch": 0.8035586167312383,
"grad_norm": 4.38164758682251,
"learning_rate": 1.941297970157344e-05,
"loss": 0.7517,
"step": 2800
},
{
"epoch": 0.8064284689338499,
"grad_norm": 4.2106523513793945,
"learning_rate": 1.8871865894698336e-05,
"loss": 0.7783,
"step": 2810
},
{
"epoch": 0.8092983211364615,
"grad_norm": 6.83260440826416,
"learning_rate": 1.8337613799223586e-05,
"loss": 0.758,
"step": 2820
},
{
"epoch": 0.812168173339073,
"grad_norm": 4.018373012542725,
"learning_rate": 1.7810268601492164e-05,
"loss": 0.7464,
"step": 2830
},
{
"epoch": 0.8150380255416846,
"grad_norm": 5.183018207550049,
"learning_rate": 1.7289874903670677e-05,
"loss": 0.75,
"step": 2840
},
{
"epoch": 0.8179078777442962,
"grad_norm": 3.9134421348571777,
"learning_rate": 1.6776476719976974e-05,
"loss": 0.7991,
"step": 2850
},
{
"epoch": 0.8207777299469078,
"grad_norm": 5.056222915649414,
"learning_rate": 1.6270117472957534e-05,
"loss": 0.7419,
"step": 2860
},
{
"epoch": 0.8236475821495193,
"grad_norm": 4.9499311447143555,
"learning_rate": 1.5770839989814677e-05,
"loss": 0.7927,
"step": 2870
},
{
"epoch": 0.8265174343521309,
"grad_norm": 4.165496826171875,
"learning_rate": 1.527868649878451e-05,
"loss": 0.7502,
"step": 2880
},
{
"epoch": 0.8293872865547425,
"grad_norm": 5.458337306976318,
"learning_rate": 1.4793698625565122e-05,
"loss": 0.7699,
"step": 2890
},
{
"epoch": 0.832257138757354,
"grad_norm": 4.831928253173828,
"learning_rate": 1.4315917389796119e-05,
"loss": 0.7577,
"step": 2900
},
{
"epoch": 0.8351269909599656,
"grad_norm": 5.4457221031188965,
"learning_rate": 1.3845383201589057e-05,
"loss": 0.76,
"step": 2910
},
{
"epoch": 0.8379968431625772,
"grad_norm": 4.1194586753845215,
"learning_rate": 1.3382135858109735e-05,
"loss": 0.7865,
"step": 2920
},
{
"epoch": 0.8408666953651887,
"grad_norm": 4.45517110824585,
"learning_rate": 1.2926214540212155e-05,
"loss": 0.7414,
"step": 2930
},
{
"epoch": 0.8437365475678003,
"grad_norm": 4.03952169418335,
"learning_rate": 1.2477657809124631e-05,
"loss": 0.78,
"step": 2940
},
{
"epoch": 0.8466063997704119,
"grad_norm": 4.787744998931885,
"learning_rate": 1.2036503603188464e-05,
"loss": 0.7862,
"step": 2950
},
{
"epoch": 0.8494762519730233,
"grad_norm": 6.612007141113281,
"learning_rate": 1.1602789234648948e-05,
"loss": 0.7356,
"step": 2960
},
{
"epoch": 0.8523461041756349,
"grad_norm": 4.051847457885742,
"learning_rate": 1.1176551386499757e-05,
"loss": 0.7261,
"step": 2970
},
{
"epoch": 0.8552159563782465,
"grad_norm": 6.460504055023193,
"learning_rate": 1.0757826109380165e-05,
"loss": 0.7701,
"step": 2980
},
{
"epoch": 0.858085808580858,
"grad_norm": 7.030419826507568,
"learning_rate": 1.034664881852614e-05,
"loss": 0.7938,
"step": 2990
},
{
"epoch": 0.8609556607834696,
"grad_norm": 6.365281581878662,
"learning_rate": 9.943054290774756e-06,
"loss": 0.7574,
"step": 3000
},
{
"epoch": 0.8638255129860812,
"grad_norm": 5.900289535522461,
"learning_rate": 9.547076661622922e-06,
"loss": 0.7758,
"step": 3010
},
{
"epoch": 0.8666953651886928,
"grad_norm": 5.241759777069092,
"learning_rate": 9.15874942234024e-06,
"loss": 0.7805,
"step": 3020
},
{
"epoch": 0.8695652173913043,
"grad_norm": 4.609664440155029,
"learning_rate": 8.778105417136395e-06,
"loss": 0.7642,
"step": 3030
},
{
"epoch": 0.8724350695939159,
"grad_norm": 6.470444202423096,
"learning_rate": 8.405176840383122e-06,
"loss": 0.7928,
"step": 3040
},
{
"epoch": 0.8753049217965275,
"grad_norm": 3.531794786453247,
"learning_rate": 8.039995233891362e-06,
"loss": 0.7503,
"step": 3050
},
{
"epoch": 0.878174773999139,
"grad_norm": 5.537559986114502,
"learning_rate": 7.682591484243417e-06,
"loss": 0.7343,
"step": 3060
},
{
"epoch": 0.8810446262017506,
"grad_norm": 3.7967238426208496,
"learning_rate": 7.332995820180677e-06,
"loss": 0.7345,
"step": 3070
},
{
"epoch": 0.8839144784043622,
"grad_norm": 4.1268839836120605,
"learning_rate": 6.991237810046847e-06,
"loss": 0.7557,
"step": 3080
},
{
"epoch": 0.8867843306069737,
"grad_norm": 7.182312965393066,
"learning_rate": 6.6573463592871085e-06,
"loss": 0.7635,
"step": 3090
},
{
"epoch": 0.8896541828095853,
"grad_norm": 3.4768388271331787,
"learning_rate": 6.331349708003365e-06,
"loss": 0.7325,
"step": 3100
},
{
"epoch": 0.8925240350121969,
"grad_norm": 5.252262115478516,
"learning_rate": 6.013275428565712e-06,
"loss": 0.7513,
"step": 3110
},
{
"epoch": 0.8953938872148084,
"grad_norm": 4.213047027587891,
"learning_rate": 5.703150423280401e-06,
"loss": 0.7685,
"step": 3120
},
{
"epoch": 0.89826373941742,
"grad_norm": 4.207084655761719,
"learning_rate": 5.401000922114485e-06,
"loss": 0.7313,
"step": 3130
},
{
"epoch": 0.9011335916200316,
"grad_norm": 6.862100124359131,
"learning_rate": 5.10685248047732e-06,
"loss": 0.7626,
"step": 3140
},
{
"epoch": 0.9040034438226431,
"grad_norm": 3.541048049926758,
"learning_rate": 4.82072997705908e-06,
"loss": 0.7748,
"step": 3150
},
{
"epoch": 0.9068732960252547,
"grad_norm": 4.149963855743408,
"learning_rate": 4.542657611726664e-06,
"loss": 0.7651,
"step": 3160
},
{
"epoch": 0.9097431482278663,
"grad_norm": 6.455443859100342,
"learning_rate": 4.272658903476745e-06,
"loss": 0.7769,
"step": 3170
},
{
"epoch": 0.9126130004304779,
"grad_norm": 5.111416339874268,
"learning_rate": 4.010756688446726e-06,
"loss": 0.779,
"step": 3180
},
{
"epoch": 0.9154828526330894,
"grad_norm": 5.0384440422058105,
"learning_rate": 3.7569731179831537e-06,
"loss": 0.7353,
"step": 3190
},
{
"epoch": 0.918352704835701,
"grad_norm": 4.619420528411865,
"learning_rate": 3.5113296567682476e-06,
"loss": 0.7686,
"step": 3200
},
{
"epoch": 0.9212225570383126,
"grad_norm": 5.13969612121582,
"learning_rate": 3.2738470810044553e-06,
"loss": 0.7475,
"step": 3210
},
{
"epoch": 0.9240924092409241,
"grad_norm": 4.138948917388916,
"learning_rate": 3.0445454766572235e-06,
"loss": 0.743,
"step": 3220
},
{
"epoch": 0.9269622614435357,
"grad_norm": 3.4994235038757324,
"learning_rate": 2.8234442377561232e-06,
"loss": 0.7491,
"step": 3230
},
{
"epoch": 0.9298321136461473,
"grad_norm": 3.714160442352295,
"learning_rate": 2.6105620647545734e-06,
"loss": 0.7516,
"step": 3240
},
{
"epoch": 0.9327019658487588,
"grad_norm": 3.1646008491516113,
"learning_rate": 2.4059169629481403e-06,
"loss": 0.751,
"step": 3250
},
{
"epoch": 0.9355718180513704,
"grad_norm": 4.828333377838135,
"learning_rate": 2.209526240951665e-06,
"loss": 0.741,
"step": 3260
},
{
"epoch": 0.938441670253982,
"grad_norm": 3.3315179347991943,
"learning_rate": 2.021406509235402e-06,
"loss": 0.7554,
"step": 3270
},
{
"epoch": 0.9413115224565934,
"grad_norm": 6.141576766967773,
"learning_rate": 1.8415736787200433e-06,
"loss": 0.7465,
"step": 3280
},
{
"epoch": 0.944181374659205,
"grad_norm": 4.839749336242676,
"learning_rate": 1.6700429594310063e-06,
"loss": 0.761,
"step": 3290
},
{
"epoch": 0.9470512268618166,
"grad_norm": 4.683228969573975,
"learning_rate": 1.5068288592120283e-06,
"loss": 0.751,
"step": 3300
}
],
"logging_steps": 10,
"max_steps": 3485,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.35032131289088e+20,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}