SmolLM2-135M-Math / trainer_state.json
MihaiPopa-1's picture
Upload folder using huggingface_hub
0b261dd verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 2805,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010703773080010704,
"grad_norm": 2.640625,
"learning_rate": 1.993582887700535e-05,
"loss": 1.5584056854248047,
"step": 10
},
{
"epoch": 0.02140754616002141,
"grad_norm": 2.53125,
"learning_rate": 1.9864527629233515e-05,
"loss": 1.562470054626465,
"step": 20
},
{
"epoch": 0.03211131924003211,
"grad_norm": 2.46875,
"learning_rate": 1.9793226381461677e-05,
"loss": 1.5967525482177733,
"step": 30
},
{
"epoch": 0.04281509232004282,
"grad_norm": 2.546875,
"learning_rate": 1.972192513368984e-05,
"loss": 1.5231587409973144,
"step": 40
},
{
"epoch": 0.05351886540005352,
"grad_norm": 2.015625,
"learning_rate": 1.9650623885918005e-05,
"loss": 1.4803240776062012,
"step": 50
},
{
"epoch": 0.06422263848006422,
"grad_norm": 2.21875,
"learning_rate": 1.957932263814617e-05,
"loss": 1.5196543693542481,
"step": 60
},
{
"epoch": 0.07492641156007493,
"grad_norm": 2.21875,
"learning_rate": 1.9508021390374332e-05,
"loss": 1.4593828201293946,
"step": 70
},
{
"epoch": 0.08563018464008564,
"grad_norm": 2.28125,
"learning_rate": 1.9436720142602497e-05,
"loss": 1.4797739028930663,
"step": 80
},
{
"epoch": 0.09633395772009633,
"grad_norm": 2.078125,
"learning_rate": 1.9365418894830663e-05,
"loss": 1.5016543388366699,
"step": 90
},
{
"epoch": 0.10703773080010703,
"grad_norm": 2.6875,
"learning_rate": 1.9294117647058825e-05,
"loss": 1.4800514221191405,
"step": 100
},
{
"epoch": 0.11774150388011774,
"grad_norm": 1.75,
"learning_rate": 1.922281639928699e-05,
"loss": 1.445749568939209,
"step": 110
},
{
"epoch": 0.12844527696012845,
"grad_norm": 2.0625,
"learning_rate": 1.9151515151515152e-05,
"loss": 1.4519201278686524,
"step": 120
},
{
"epoch": 0.13914905004013914,
"grad_norm": 2.03125,
"learning_rate": 1.9080213903743317e-05,
"loss": 1.3476288795471192,
"step": 130
},
{
"epoch": 0.14985282312014986,
"grad_norm": 2.234375,
"learning_rate": 1.9008912655971482e-05,
"loss": 1.4554848670959473,
"step": 140
},
{
"epoch": 0.16055659620016055,
"grad_norm": 1.9375,
"learning_rate": 1.8937611408199644e-05,
"loss": 1.4435239791870118,
"step": 150
},
{
"epoch": 0.17126036928017127,
"grad_norm": 1.671875,
"learning_rate": 1.886631016042781e-05,
"loss": 1.3767672538757325,
"step": 160
},
{
"epoch": 0.18196414236018196,
"grad_norm": 2.09375,
"learning_rate": 1.8795008912655972e-05,
"loss": 1.4352334022521973,
"step": 170
},
{
"epoch": 0.19266791544019266,
"grad_norm": 2.0,
"learning_rate": 1.8723707664884137e-05,
"loss": 1.382822322845459,
"step": 180
},
{
"epoch": 0.20337168852020338,
"grad_norm": 1.953125,
"learning_rate": 1.8652406417112302e-05,
"loss": 1.429026985168457,
"step": 190
},
{
"epoch": 0.21407546160021407,
"grad_norm": 1.859375,
"learning_rate": 1.8581105169340464e-05,
"loss": 1.3564122200012207,
"step": 200
},
{
"epoch": 0.2247792346802248,
"grad_norm": 1.7109375,
"learning_rate": 1.850980392156863e-05,
"loss": 1.457004451751709,
"step": 210
},
{
"epoch": 0.23548300776023548,
"grad_norm": 1.9921875,
"learning_rate": 1.843850267379679e-05,
"loss": 1.3679749488830566,
"step": 220
},
{
"epoch": 0.2461867808402462,
"grad_norm": 2.0625,
"learning_rate": 1.8367201426024957e-05,
"loss": 1.4186459541320802,
"step": 230
},
{
"epoch": 0.2568905539202569,
"grad_norm": 1.8984375,
"learning_rate": 1.8295900178253122e-05,
"loss": 1.3645942687988282,
"step": 240
},
{
"epoch": 0.2675943270002676,
"grad_norm": 1.796875,
"learning_rate": 1.8224598930481284e-05,
"loss": 1.3659990310668946,
"step": 250
},
{
"epoch": 0.2782981000802783,
"grad_norm": 1.9375,
"learning_rate": 1.815329768270945e-05,
"loss": 1.3751505851745605,
"step": 260
},
{
"epoch": 0.289001873160289,
"grad_norm": 1.96875,
"learning_rate": 1.808199643493761e-05,
"loss": 1.394303798675537,
"step": 270
},
{
"epoch": 0.2997056462402997,
"grad_norm": 1.71875,
"learning_rate": 1.8010695187165777e-05,
"loss": 1.3266244888305665,
"step": 280
},
{
"epoch": 0.3104094193203104,
"grad_norm": 1.6328125,
"learning_rate": 1.7939393939393942e-05,
"loss": 1.3767006874084473,
"step": 290
},
{
"epoch": 0.3211131924003211,
"grad_norm": 1.8359375,
"learning_rate": 1.7868092691622104e-05,
"loss": 1.3508996963500977,
"step": 300
},
{
"epoch": 0.3318169654803318,
"grad_norm": 1.8984375,
"learning_rate": 1.779679144385027e-05,
"loss": 1.299268627166748,
"step": 310
},
{
"epoch": 0.34252073856034254,
"grad_norm": 1.6015625,
"learning_rate": 1.772549019607843e-05,
"loss": 1.335693073272705,
"step": 320
},
{
"epoch": 0.35322451164035323,
"grad_norm": 1.6171875,
"learning_rate": 1.7654188948306597e-05,
"loss": 1.3631214141845702,
"step": 330
},
{
"epoch": 0.3639282847203639,
"grad_norm": 1.7421875,
"learning_rate": 1.7582887700534762e-05,
"loss": 1.349259376525879,
"step": 340
},
{
"epoch": 0.3746320578003746,
"grad_norm": 1.8515625,
"learning_rate": 1.7511586452762924e-05,
"loss": 1.3234673500061036,
"step": 350
},
{
"epoch": 0.3853358308803853,
"grad_norm": 1.734375,
"learning_rate": 1.744028520499109e-05,
"loss": 1.34688138961792,
"step": 360
},
{
"epoch": 0.39603960396039606,
"grad_norm": 1.796875,
"learning_rate": 1.736898395721925e-05,
"loss": 1.310294246673584,
"step": 370
},
{
"epoch": 0.40674337704040675,
"grad_norm": 1.5703125,
"learning_rate": 1.7297682709447417e-05,
"loss": 1.3146047592163086,
"step": 380
},
{
"epoch": 0.41744715012041744,
"grad_norm": 2.078125,
"learning_rate": 1.7226381461675582e-05,
"loss": 1.3516902923583984,
"step": 390
},
{
"epoch": 0.42815092320042814,
"grad_norm": 1.6875,
"learning_rate": 1.7155080213903744e-05,
"loss": 1.3631095886230469,
"step": 400
},
{
"epoch": 0.4388546962804388,
"grad_norm": 1.578125,
"learning_rate": 1.708377896613191e-05,
"loss": 1.3395885467529296,
"step": 410
},
{
"epoch": 0.4495584693604496,
"grad_norm": 1.7890625,
"learning_rate": 1.701247771836007e-05,
"loss": 1.322316837310791,
"step": 420
},
{
"epoch": 0.46026224244046027,
"grad_norm": 2.03125,
"learning_rate": 1.6941176470588237e-05,
"loss": 1.3892762184143066,
"step": 430
},
{
"epoch": 0.47096601552047096,
"grad_norm": 1.765625,
"learning_rate": 1.6869875222816402e-05,
"loss": 1.3081950187683105,
"step": 440
},
{
"epoch": 0.48166978860048165,
"grad_norm": 1.75,
"learning_rate": 1.6798573975044564e-05,
"loss": 1.3405800819396974,
"step": 450
},
{
"epoch": 0.4923735616804924,
"grad_norm": 1.6171875,
"learning_rate": 1.672727272727273e-05,
"loss": 1.3331517219543456,
"step": 460
},
{
"epoch": 0.5030773347605031,
"grad_norm": 1.703125,
"learning_rate": 1.665597147950089e-05,
"loss": 1.3040351867675781,
"step": 470
},
{
"epoch": 0.5137811078405138,
"grad_norm": 1.7265625,
"learning_rate": 1.6584670231729056e-05,
"loss": 1.319422149658203,
"step": 480
},
{
"epoch": 0.5244848809205245,
"grad_norm": 1.8125,
"learning_rate": 1.6513368983957222e-05,
"loss": 1.3433240890502929,
"step": 490
},
{
"epoch": 0.5351886540005352,
"grad_norm": 1.7265625,
"learning_rate": 1.6442067736185384e-05,
"loss": 1.3346479415893555,
"step": 500
},
{
"epoch": 0.5458924270805459,
"grad_norm": 1.5625,
"learning_rate": 1.637076648841355e-05,
"loss": 1.3032867431640625,
"step": 510
},
{
"epoch": 0.5565962001605566,
"grad_norm": 1.78125,
"learning_rate": 1.629946524064171e-05,
"loss": 1.3006314277648925,
"step": 520
},
{
"epoch": 0.5672999732405672,
"grad_norm": 1.765625,
"learning_rate": 1.6228163992869876e-05,
"loss": 1.3416614532470703,
"step": 530
},
{
"epoch": 0.578003746320578,
"grad_norm": 2.015625,
"learning_rate": 1.615686274509804e-05,
"loss": 1.303782081604004,
"step": 540
},
{
"epoch": 0.5887075194005887,
"grad_norm": 1.578125,
"learning_rate": 1.6085561497326207e-05,
"loss": 1.2814931869506836,
"step": 550
},
{
"epoch": 0.5994112924805994,
"grad_norm": 1.53125,
"learning_rate": 1.601426024955437e-05,
"loss": 1.3404861450195313,
"step": 560
},
{
"epoch": 0.6101150655606101,
"grad_norm": 1.7734375,
"learning_rate": 1.594295900178253e-05,
"loss": 1.3594398498535156,
"step": 570
},
{
"epoch": 0.6208188386406208,
"grad_norm": 1.609375,
"learning_rate": 1.5871657754010696e-05,
"loss": 1.2768223762512207,
"step": 580
},
{
"epoch": 0.6315226117206315,
"grad_norm": 1.609375,
"learning_rate": 1.580035650623886e-05,
"loss": 1.3110815048217774,
"step": 590
},
{
"epoch": 0.6422263848006422,
"grad_norm": 1.6640625,
"learning_rate": 1.5729055258467027e-05,
"loss": 1.2639217376708984,
"step": 600
},
{
"epoch": 0.6529301578806529,
"grad_norm": 1.8203125,
"learning_rate": 1.565775401069519e-05,
"loss": 1.3356239318847656,
"step": 610
},
{
"epoch": 0.6636339309606636,
"grad_norm": 1.8828125,
"learning_rate": 1.558645276292335e-05,
"loss": 1.3733593940734863,
"step": 620
},
{
"epoch": 0.6743377040406744,
"grad_norm": 1.5703125,
"learning_rate": 1.5515151515151516e-05,
"loss": 1.2768065452575683,
"step": 630
},
{
"epoch": 0.6850414771206851,
"grad_norm": 1.7578125,
"learning_rate": 1.544385026737968e-05,
"loss": 1.345008659362793,
"step": 640
},
{
"epoch": 0.6957452502006958,
"grad_norm": 1.4921875,
"learning_rate": 1.5372549019607847e-05,
"loss": 1.2327005386352539,
"step": 650
},
{
"epoch": 0.7064490232807065,
"grad_norm": 1.765625,
"learning_rate": 1.530124777183601e-05,
"loss": 1.327579879760742,
"step": 660
},
{
"epoch": 0.7171527963607172,
"grad_norm": 1.6640625,
"learning_rate": 1.5229946524064172e-05,
"loss": 1.2693171501159668,
"step": 670
},
{
"epoch": 0.7278565694407279,
"grad_norm": 1.640625,
"learning_rate": 1.5158645276292336e-05,
"loss": 1.3229084014892578,
"step": 680
},
{
"epoch": 0.7385603425207385,
"grad_norm": 1.7265625,
"learning_rate": 1.5087344028520501e-05,
"loss": 1.3010024070739745,
"step": 690
},
{
"epoch": 0.7492641156007492,
"grad_norm": 1.59375,
"learning_rate": 1.5016042780748665e-05,
"loss": 1.304527473449707,
"step": 700
},
{
"epoch": 0.7599678886807599,
"grad_norm": 1.8671875,
"learning_rate": 1.4944741532976827e-05,
"loss": 1.2771072387695312,
"step": 710
},
{
"epoch": 0.7706716617607706,
"grad_norm": 1.671875,
"learning_rate": 1.4873440285204992e-05,
"loss": 1.285037899017334,
"step": 720
},
{
"epoch": 0.7813754348407814,
"grad_norm": 1.6171875,
"learning_rate": 1.4802139037433156e-05,
"loss": 1.2612761497497558,
"step": 730
},
{
"epoch": 0.7920792079207921,
"grad_norm": 1.8125,
"learning_rate": 1.4730837789661321e-05,
"loss": 1.3110386848449707,
"step": 740
},
{
"epoch": 0.8027829810008028,
"grad_norm": 1.671875,
"learning_rate": 1.4659536541889485e-05,
"loss": 1.3450962066650392,
"step": 750
},
{
"epoch": 0.8134867540808135,
"grad_norm": 1.796875,
"learning_rate": 1.4588235294117647e-05,
"loss": 1.294900608062744,
"step": 760
},
{
"epoch": 0.8241905271608242,
"grad_norm": 1.4765625,
"learning_rate": 1.4516934046345812e-05,
"loss": 1.3215585708618165,
"step": 770
},
{
"epoch": 0.8348943002408349,
"grad_norm": 1.5859375,
"learning_rate": 1.4445632798573976e-05,
"loss": 1.3044111251831054,
"step": 780
},
{
"epoch": 0.8455980733208456,
"grad_norm": 1.8671875,
"learning_rate": 1.4374331550802141e-05,
"loss": 1.3348912239074706,
"step": 790
},
{
"epoch": 0.8563018464008563,
"grad_norm": 1.65625,
"learning_rate": 1.4303030303030305e-05,
"loss": 1.3434508323669434,
"step": 800
},
{
"epoch": 0.867005619480867,
"grad_norm": 1.625,
"learning_rate": 1.4231729055258467e-05,
"loss": 1.291652297973633,
"step": 810
},
{
"epoch": 0.8777093925608777,
"grad_norm": 1.4921875,
"learning_rate": 1.4160427807486632e-05,
"loss": 1.3067720413208008,
"step": 820
},
{
"epoch": 0.8884131656408885,
"grad_norm": 1.9453125,
"learning_rate": 1.4089126559714796e-05,
"loss": 1.3196195602416991,
"step": 830
},
{
"epoch": 0.8991169387208992,
"grad_norm": 1.5390625,
"learning_rate": 1.4017825311942961e-05,
"loss": 1.3129652976989745,
"step": 840
},
{
"epoch": 0.9098207118009098,
"grad_norm": 1.5,
"learning_rate": 1.3946524064171123e-05,
"loss": 1.2702789306640625,
"step": 850
},
{
"epoch": 0.9205244848809205,
"grad_norm": 1.9453125,
"learning_rate": 1.3875222816399288e-05,
"loss": 1.29964599609375,
"step": 860
},
{
"epoch": 0.9312282579609312,
"grad_norm": 1.5703125,
"learning_rate": 1.3803921568627452e-05,
"loss": 1.301185131072998,
"step": 870
},
{
"epoch": 0.9419320310409419,
"grad_norm": 1.4140625,
"learning_rate": 1.3732620320855616e-05,
"loss": 1.2731993675231934,
"step": 880
},
{
"epoch": 0.9526358041209526,
"grad_norm": 1.421875,
"learning_rate": 1.3661319073083781e-05,
"loss": 1.2806821823120118,
"step": 890
},
{
"epoch": 0.9633395772009633,
"grad_norm": 1.84375,
"learning_rate": 1.3590017825311943e-05,
"loss": 1.2375809669494628,
"step": 900
},
{
"epoch": 0.974043350280974,
"grad_norm": 1.8046875,
"learning_rate": 1.3518716577540108e-05,
"loss": 1.2453808784484863,
"step": 910
},
{
"epoch": 0.9847471233609848,
"grad_norm": 2.03125,
"learning_rate": 1.3447415329768272e-05,
"loss": 1.3074142456054687,
"step": 920
},
{
"epoch": 0.9954508964409955,
"grad_norm": 1.484375,
"learning_rate": 1.3376114081996437e-05,
"loss": 1.2914584159851075,
"step": 930
},
{
"epoch": 1.0053518865400053,
"grad_norm": 2.0625,
"learning_rate": 1.33048128342246e-05,
"loss": 1.3543176651000977,
"step": 940
},
{
"epoch": 1.016055659620016,
"grad_norm": 1.765625,
"learning_rate": 1.3233511586452763e-05,
"loss": 1.3298683166503906,
"step": 950
},
{
"epoch": 1.0267594327000267,
"grad_norm": 1.59375,
"learning_rate": 1.3162210338680928e-05,
"loss": 1.3020204544067382,
"step": 960
},
{
"epoch": 1.0374632057800375,
"grad_norm": 1.6484375,
"learning_rate": 1.3090909090909092e-05,
"loss": 1.3046648025512695,
"step": 970
},
{
"epoch": 1.048166978860048,
"grad_norm": 1.65625,
"learning_rate": 1.3019607843137257e-05,
"loss": 1.2308432579040527,
"step": 980
},
{
"epoch": 1.0588707519400589,
"grad_norm": 1.65625,
"learning_rate": 1.294830659536542e-05,
"loss": 1.2811461448669434,
"step": 990
},
{
"epoch": 1.0695745250200697,
"grad_norm": 1.6640625,
"learning_rate": 1.2877005347593583e-05,
"loss": 1.3090335845947265,
"step": 1000
},
{
"epoch": 1.0802782981000802,
"grad_norm": 1.9921875,
"learning_rate": 1.2805704099821748e-05,
"loss": 1.2958572387695313,
"step": 1010
},
{
"epoch": 1.090982071180091,
"grad_norm": 1.6953125,
"learning_rate": 1.2734402852049912e-05,
"loss": 1.326209259033203,
"step": 1020
},
{
"epoch": 1.1016858442601016,
"grad_norm": 1.75,
"learning_rate": 1.2663101604278077e-05,
"loss": 1.2520675659179688,
"step": 1030
},
{
"epoch": 1.1123896173401124,
"grad_norm": 1.546875,
"learning_rate": 1.259180035650624e-05,
"loss": 1.3478898048400878,
"step": 1040
},
{
"epoch": 1.123093390420123,
"grad_norm": 1.4921875,
"learning_rate": 1.2520499108734403e-05,
"loss": 1.2806931495666505,
"step": 1050
},
{
"epoch": 1.1337971635001338,
"grad_norm": 1.75,
"learning_rate": 1.2449197860962568e-05,
"loss": 1.2603809356689453,
"step": 1060
},
{
"epoch": 1.1445009365801444,
"grad_norm": 1.484375,
"learning_rate": 1.2377896613190731e-05,
"loss": 1.2837313652038573,
"step": 1070
},
{
"epoch": 1.1552047096601552,
"grad_norm": 1.859375,
"learning_rate": 1.2306595365418897e-05,
"loss": 1.271355152130127,
"step": 1080
},
{
"epoch": 1.165908482740166,
"grad_norm": 1.6015625,
"learning_rate": 1.223529411764706e-05,
"loss": 1.2751256942749023,
"step": 1090
},
{
"epoch": 1.1766122558201766,
"grad_norm": 1.5390625,
"learning_rate": 1.2163992869875222e-05,
"loss": 1.2217981338500976,
"step": 1100
},
{
"epoch": 1.1873160289001874,
"grad_norm": 1.46875,
"learning_rate": 1.2092691622103388e-05,
"loss": 1.3460000038146973,
"step": 1110
},
{
"epoch": 1.198019801980198,
"grad_norm": 1.6328125,
"learning_rate": 1.2021390374331551e-05,
"loss": 1.3119497299194336,
"step": 1120
},
{
"epoch": 1.2087235750602088,
"grad_norm": 1.5,
"learning_rate": 1.1950089126559717e-05,
"loss": 1.326594066619873,
"step": 1130
},
{
"epoch": 1.2194273481402194,
"grad_norm": 1.8671875,
"learning_rate": 1.187878787878788e-05,
"loss": 1.313736343383789,
"step": 1140
},
{
"epoch": 1.2301311212202302,
"grad_norm": 1.625,
"learning_rate": 1.1807486631016042e-05,
"loss": 1.2580394744873047,
"step": 1150
},
{
"epoch": 1.2408348943002407,
"grad_norm": 1.6953125,
"learning_rate": 1.1736185383244208e-05,
"loss": 1.3472198486328124,
"step": 1160
},
{
"epoch": 1.2515386673802515,
"grad_norm": 1.6875,
"learning_rate": 1.1664884135472371e-05,
"loss": 1.3223270416259765,
"step": 1170
},
{
"epoch": 1.2622424404602621,
"grad_norm": 1.5390625,
"learning_rate": 1.1593582887700537e-05,
"loss": 1.3479475021362304,
"step": 1180
},
{
"epoch": 1.272946213540273,
"grad_norm": 1.46875,
"learning_rate": 1.15222816399287e-05,
"loss": 1.2691156387329101,
"step": 1190
},
{
"epoch": 1.2836499866202837,
"grad_norm": 1.578125,
"learning_rate": 1.1450980392156862e-05,
"loss": 1.3078096389770508,
"step": 1200
},
{
"epoch": 1.2943537597002943,
"grad_norm": 1.7265625,
"learning_rate": 1.1379679144385028e-05,
"loss": 1.2821264266967773,
"step": 1210
},
{
"epoch": 1.3050575327803051,
"grad_norm": 1.6015625,
"learning_rate": 1.1308377896613191e-05,
"loss": 1.2466256141662597,
"step": 1220
},
{
"epoch": 1.3157613058603157,
"grad_norm": 1.84375,
"learning_rate": 1.1237076648841357e-05,
"loss": 1.301154327392578,
"step": 1230
},
{
"epoch": 1.3264650789403265,
"grad_norm": 1.6171875,
"learning_rate": 1.116577540106952e-05,
"loss": 1.3058858871459962,
"step": 1240
},
{
"epoch": 1.337168852020337,
"grad_norm": 1.6875,
"learning_rate": 1.1094474153297684e-05,
"loss": 1.257982349395752,
"step": 1250
},
{
"epoch": 1.3478726251003479,
"grad_norm": 1.546875,
"learning_rate": 1.1023172905525847e-05,
"loss": 1.278379535675049,
"step": 1260
},
{
"epoch": 1.3585763981803587,
"grad_norm": 1.890625,
"learning_rate": 1.0951871657754011e-05,
"loss": 1.2998493194580079,
"step": 1270
},
{
"epoch": 1.3692801712603693,
"grad_norm": 1.5859375,
"learning_rate": 1.0880570409982176e-05,
"loss": 1.3042527198791505,
"step": 1280
},
{
"epoch": 1.3799839443403799,
"grad_norm": 1.6015625,
"learning_rate": 1.0809269162210338e-05,
"loss": 1.2903579711914062,
"step": 1290
},
{
"epoch": 1.3906877174203907,
"grad_norm": 1.5390625,
"learning_rate": 1.0737967914438504e-05,
"loss": 1.216090202331543,
"step": 1300
},
{
"epoch": 1.4013914905004015,
"grad_norm": 1.4296875,
"learning_rate": 1.0666666666666667e-05,
"loss": 1.2497664451599122,
"step": 1310
},
{
"epoch": 1.412095263580412,
"grad_norm": 1.609375,
"learning_rate": 1.0595365418894833e-05,
"loss": 1.2592049598693849,
"step": 1320
},
{
"epoch": 1.4227990366604228,
"grad_norm": 1.703125,
"learning_rate": 1.0524064171122996e-05,
"loss": 1.3062689781188965,
"step": 1330
},
{
"epoch": 1.4335028097404334,
"grad_norm": 1.515625,
"learning_rate": 1.0452762923351158e-05,
"loss": 1.2577032089233398,
"step": 1340
},
{
"epoch": 1.4442065828204442,
"grad_norm": 1.75,
"learning_rate": 1.0381461675579324e-05,
"loss": 1.2874650001525878,
"step": 1350
},
{
"epoch": 1.4549103559004548,
"grad_norm": 2.0625,
"learning_rate": 1.0310160427807487e-05,
"loss": 1.2887776374816895,
"step": 1360
},
{
"epoch": 1.4656141289804656,
"grad_norm": 1.59375,
"learning_rate": 1.0238859180035653e-05,
"loss": 1.2869946479797363,
"step": 1370
},
{
"epoch": 1.4763179020604764,
"grad_norm": 1.7421875,
"learning_rate": 1.0167557932263816e-05,
"loss": 1.3055774688720703,
"step": 1380
},
{
"epoch": 1.487021675140487,
"grad_norm": 1.7421875,
"learning_rate": 1.0096256684491978e-05,
"loss": 1.2925223350524901,
"step": 1390
},
{
"epoch": 1.4977254482204978,
"grad_norm": 1.5390625,
"learning_rate": 1.0024955436720143e-05,
"loss": 1.3624143600463867,
"step": 1400
},
{
"epoch": 1.5084292213005084,
"grad_norm": 1.7265625,
"learning_rate": 9.953654188948307e-06,
"loss": 1.3100957870483398,
"step": 1410
},
{
"epoch": 1.5191329943805192,
"grad_norm": 1.734375,
"learning_rate": 9.882352941176472e-06,
"loss": 1.2667318344116212,
"step": 1420
},
{
"epoch": 1.5298367674605298,
"grad_norm": 1.5,
"learning_rate": 9.811051693404634e-06,
"loss": 1.2964338302612304,
"step": 1430
},
{
"epoch": 1.5405405405405406,
"grad_norm": 1.703125,
"learning_rate": 9.7397504456328e-06,
"loss": 1.2451062202453613,
"step": 1440
},
{
"epoch": 1.5512443136205514,
"grad_norm": 1.6484375,
"learning_rate": 9.668449197860963e-06,
"loss": 1.2622719764709474,
"step": 1450
},
{
"epoch": 1.561948086700562,
"grad_norm": 1.8359375,
"learning_rate": 9.597147950089127e-06,
"loss": 1.2830778121948243,
"step": 1460
},
{
"epoch": 1.5726518597805725,
"grad_norm": 1.5390625,
"learning_rate": 9.525846702317292e-06,
"loss": 1.3212904930114746,
"step": 1470
},
{
"epoch": 1.5833556328605833,
"grad_norm": 1.515625,
"learning_rate": 9.454545454545456e-06,
"loss": 1.301555347442627,
"step": 1480
},
{
"epoch": 1.5940594059405941,
"grad_norm": 1.453125,
"learning_rate": 9.38324420677362e-06,
"loss": 1.2626118659973145,
"step": 1490
},
{
"epoch": 1.6047631790206047,
"grad_norm": 1.890625,
"learning_rate": 9.311942959001783e-06,
"loss": 1.2342555046081543,
"step": 1500
},
{
"epoch": 1.6154669521006153,
"grad_norm": 1.7421875,
"learning_rate": 9.240641711229947e-06,
"loss": 1.3167900085449218,
"step": 1510
},
{
"epoch": 1.6261707251806263,
"grad_norm": 2.0,
"learning_rate": 9.169340463458112e-06,
"loss": 1.296627902984619,
"step": 1520
},
{
"epoch": 1.636874498260637,
"grad_norm": 1.4453125,
"learning_rate": 9.098039215686276e-06,
"loss": 1.275075340270996,
"step": 1530
},
{
"epoch": 1.6475782713406475,
"grad_norm": 1.53125,
"learning_rate": 9.02673796791444e-06,
"loss": 1.2771642684936524,
"step": 1540
},
{
"epoch": 1.6582820444206583,
"grad_norm": 1.8046875,
"learning_rate": 8.955436720142603e-06,
"loss": 1.2907758712768556,
"step": 1550
},
{
"epoch": 1.668985817500669,
"grad_norm": 1.6953125,
"learning_rate": 8.884135472370767e-06,
"loss": 1.2778194427490235,
"step": 1560
},
{
"epoch": 1.6796895905806797,
"grad_norm": 1.5859375,
"learning_rate": 8.81283422459893e-06,
"loss": 1.2820199012756348,
"step": 1570
},
{
"epoch": 1.6903933636606903,
"grad_norm": 1.8984375,
"learning_rate": 8.741532976827096e-06,
"loss": 1.3197799682617188,
"step": 1580
},
{
"epoch": 1.701097136740701,
"grad_norm": 1.796875,
"learning_rate": 8.67023172905526e-06,
"loss": 1.2711196899414063,
"step": 1590
},
{
"epoch": 1.7118009098207119,
"grad_norm": 1.7265625,
"learning_rate": 8.598930481283423e-06,
"loss": 1.3094602584838868,
"step": 1600
},
{
"epoch": 1.7225046829007225,
"grad_norm": 1.65625,
"learning_rate": 8.527629233511587e-06,
"loss": 1.3037433624267578,
"step": 1610
},
{
"epoch": 1.7332084559807333,
"grad_norm": 1.6484375,
"learning_rate": 8.45632798573975e-06,
"loss": 1.2734570503234863,
"step": 1620
},
{
"epoch": 1.743912229060744,
"grad_norm": 1.5703125,
"learning_rate": 8.385026737967916e-06,
"loss": 1.2476407051086427,
"step": 1630
},
{
"epoch": 1.7546160021407546,
"grad_norm": 1.7265625,
"learning_rate": 8.31372549019608e-06,
"loss": 1.3427558898925782,
"step": 1640
},
{
"epoch": 1.7653197752207652,
"grad_norm": 1.65625,
"learning_rate": 8.242424242424243e-06,
"loss": 1.273496437072754,
"step": 1650
},
{
"epoch": 1.776023548300776,
"grad_norm": 1.71875,
"learning_rate": 8.171122994652407e-06,
"loss": 1.2626665115356446,
"step": 1660
},
{
"epoch": 1.7867273213807868,
"grad_norm": 1.8046875,
"learning_rate": 8.09982174688057e-06,
"loss": 1.2670047760009766,
"step": 1670
},
{
"epoch": 1.7974310944607974,
"grad_norm": 1.7265625,
"learning_rate": 8.028520499108736e-06,
"loss": 1.349191665649414,
"step": 1680
},
{
"epoch": 1.808134867540808,
"grad_norm": 1.578125,
"learning_rate": 7.9572192513369e-06,
"loss": 1.2989972114562989,
"step": 1690
},
{
"epoch": 1.8188386406208188,
"grad_norm": 1.6875,
"learning_rate": 7.885918003565063e-06,
"loss": 1.1850922584533692,
"step": 1700
},
{
"epoch": 1.8295424137008296,
"grad_norm": 1.953125,
"learning_rate": 7.814616755793228e-06,
"loss": 1.3360312461853028,
"step": 1710
},
{
"epoch": 1.8402461867808402,
"grad_norm": 1.6328125,
"learning_rate": 7.74331550802139e-06,
"loss": 1.2957257270812987,
"step": 1720
},
{
"epoch": 1.850949959860851,
"grad_norm": 1.6015625,
"learning_rate": 7.672014260249555e-06,
"loss": 1.2536530494689941,
"step": 1730
},
{
"epoch": 1.8616537329408618,
"grad_norm": 1.7109375,
"learning_rate": 7.60071301247772e-06,
"loss": 1.2660930633544922,
"step": 1740
},
{
"epoch": 1.8723575060208724,
"grad_norm": 1.7265625,
"learning_rate": 7.529411764705883e-06,
"loss": 1.3080876350402832,
"step": 1750
},
{
"epoch": 1.883061279100883,
"grad_norm": 1.640625,
"learning_rate": 7.458110516934047e-06,
"loss": 1.3132406234741212,
"step": 1760
},
{
"epoch": 1.8937650521808937,
"grad_norm": 1.8203125,
"learning_rate": 7.386809269162211e-06,
"loss": 1.31253080368042,
"step": 1770
},
{
"epoch": 1.9044688252609046,
"grad_norm": 1.4140625,
"learning_rate": 7.315508021390375e-06,
"loss": 1.3013240814208984,
"step": 1780
},
{
"epoch": 1.9151725983409151,
"grad_norm": 1.4765625,
"learning_rate": 7.244206773618538e-06,
"loss": 1.2744117736816407,
"step": 1790
},
{
"epoch": 1.9258763714209257,
"grad_norm": 1.4609375,
"learning_rate": 7.172905525846703e-06,
"loss": 1.3057758331298828,
"step": 1800
},
{
"epoch": 1.9365801445009367,
"grad_norm": 1.515625,
"learning_rate": 7.101604278074867e-06,
"loss": 1.224927043914795,
"step": 1810
},
{
"epoch": 1.9472839175809473,
"grad_norm": 1.7109375,
"learning_rate": 7.030303030303031e-06,
"loss": 1.3182221412658692,
"step": 1820
},
{
"epoch": 1.957987690660958,
"grad_norm": 1.5546875,
"learning_rate": 6.959001782531195e-06,
"loss": 1.2400826454162597,
"step": 1830
},
{
"epoch": 1.9686914637409687,
"grad_norm": 1.7421875,
"learning_rate": 6.887700534759358e-06,
"loss": 1.2463386535644532,
"step": 1840
},
{
"epoch": 1.9793952368209795,
"grad_norm": 1.46875,
"learning_rate": 6.8163992869875225e-06,
"loss": 1.3235528945922852,
"step": 1850
},
{
"epoch": 1.99009900990099,
"grad_norm": 1.8203125,
"learning_rate": 6.745098039215687e-06,
"loss": 1.2812946319580079,
"step": 1860
},
{
"epoch": 2.0,
"grad_norm": 4.1875,
"learning_rate": 6.673796791443851e-06,
"loss": 1.2953272819519044,
"step": 1870
},
{
"epoch": 2.0107037730800106,
"grad_norm": 1.5390625,
"learning_rate": 6.602495543672015e-06,
"loss": 1.207719612121582,
"step": 1880
},
{
"epoch": 2.0214075461600216,
"grad_norm": 1.671875,
"learning_rate": 6.531194295900179e-06,
"loss": 1.2520846366882323,
"step": 1890
},
{
"epoch": 2.032111319240032,
"grad_norm": 1.703125,
"learning_rate": 6.459893048128343e-06,
"loss": 1.2905988693237305,
"step": 1900
},
{
"epoch": 2.0428150923200428,
"grad_norm": 1.921875,
"learning_rate": 6.388591800356507e-06,
"loss": 1.3520148277282715,
"step": 1910
},
{
"epoch": 2.0535188654000534,
"grad_norm": 1.8515625,
"learning_rate": 6.3172905525846705e-06,
"loss": 1.2911107063293457,
"step": 1920
},
{
"epoch": 2.0642226384800644,
"grad_norm": 1.671875,
"learning_rate": 6.245989304812835e-06,
"loss": 1.2403117179870606,
"step": 1930
},
{
"epoch": 2.074926411560075,
"grad_norm": 1.96875,
"learning_rate": 6.174688057040999e-06,
"loss": 1.3558055877685546,
"step": 1940
},
{
"epoch": 2.0856301846400855,
"grad_norm": 1.6328125,
"learning_rate": 6.103386809269163e-06,
"loss": 1.3194045066833495,
"step": 1950
},
{
"epoch": 2.096333957720096,
"grad_norm": 2.140625,
"learning_rate": 6.032085561497326e-06,
"loss": 1.3321582794189453,
"step": 1960
},
{
"epoch": 2.107037730800107,
"grad_norm": 1.625,
"learning_rate": 5.96078431372549e-06,
"loss": 1.288839054107666,
"step": 1970
},
{
"epoch": 2.1177415038801177,
"grad_norm": 2.0,
"learning_rate": 5.889483065953655e-06,
"loss": 1.3260244369506835,
"step": 1980
},
{
"epoch": 2.1284452769601283,
"grad_norm": 1.5703125,
"learning_rate": 5.8181818181818185e-06,
"loss": 1.2721702575683593,
"step": 1990
},
{
"epoch": 2.1391490500401393,
"grad_norm": 1.6640625,
"learning_rate": 5.746880570409983e-06,
"loss": 1.2622364044189454,
"step": 2000
},
{
"epoch": 2.14985282312015,
"grad_norm": 1.609375,
"learning_rate": 5.675579322638146e-06,
"loss": 1.30474796295166,
"step": 2010
},
{
"epoch": 2.1605565962001605,
"grad_norm": 1.7890625,
"learning_rate": 5.60427807486631e-06,
"loss": 1.3109374046325684,
"step": 2020
},
{
"epoch": 2.171260369280171,
"grad_norm": 1.71875,
"learning_rate": 5.532976827094475e-06,
"loss": 1.3231799125671386,
"step": 2030
},
{
"epoch": 2.181964142360182,
"grad_norm": 1.796875,
"learning_rate": 5.4616755793226384e-06,
"loss": 1.2993489265441895,
"step": 2040
},
{
"epoch": 2.1926679154401927,
"grad_norm": 1.875,
"learning_rate": 5.390374331550803e-06,
"loss": 1.3044631958007813,
"step": 2050
},
{
"epoch": 2.2033716885202033,
"grad_norm": 1.4609375,
"learning_rate": 5.3190730837789666e-06,
"loss": 1.2702978134155274,
"step": 2060
},
{
"epoch": 2.2140754616002143,
"grad_norm": 1.5546875,
"learning_rate": 5.24777183600713e-06,
"loss": 1.287952709197998,
"step": 2070
},
{
"epoch": 2.224779234680225,
"grad_norm": 1.5546875,
"learning_rate": 5.176470588235295e-06,
"loss": 1.2974214553833008,
"step": 2080
},
{
"epoch": 2.2354830077602355,
"grad_norm": 1.703125,
"learning_rate": 5.105169340463458e-06,
"loss": 1.3148197174072265,
"step": 2090
},
{
"epoch": 2.246186780840246,
"grad_norm": 1.6328125,
"learning_rate": 5.033868092691623e-06,
"loss": 1.3466445922851562,
"step": 2100
},
{
"epoch": 2.256890553920257,
"grad_norm": 1.484375,
"learning_rate": 4.9625668449197864e-06,
"loss": 1.334506893157959,
"step": 2110
},
{
"epoch": 2.2675943270002676,
"grad_norm": 1.9765625,
"learning_rate": 4.891265597147951e-06,
"loss": 1.279165267944336,
"step": 2120
},
{
"epoch": 2.278298100080278,
"grad_norm": 1.890625,
"learning_rate": 4.8199643493761146e-06,
"loss": 1.2512639045715332,
"step": 2130
},
{
"epoch": 2.289001873160289,
"grad_norm": 1.59375,
"learning_rate": 4.748663101604278e-06,
"loss": 1.2572649002075196,
"step": 2140
},
{
"epoch": 2.2997056462403,
"grad_norm": 1.5625,
"learning_rate": 4.677361853832442e-06,
"loss": 1.2503036499023437,
"step": 2150
},
{
"epoch": 2.3104094193203104,
"grad_norm": 1.859375,
"learning_rate": 4.606060606060606e-06,
"loss": 1.2866994857788085,
"step": 2160
},
{
"epoch": 2.321113192400321,
"grad_norm": 1.6171875,
"learning_rate": 4.534759358288771e-06,
"loss": 1.2810638427734375,
"step": 2170
},
{
"epoch": 2.331816965480332,
"grad_norm": 1.71875,
"learning_rate": 4.4634581105169345e-06,
"loss": 1.2588828086853028,
"step": 2180
},
{
"epoch": 2.3425207385603426,
"grad_norm": 1.6875,
"learning_rate": 4.392156862745098e-06,
"loss": 1.2615557670593263,
"step": 2190
},
{
"epoch": 2.353224511640353,
"grad_norm": 1.921875,
"learning_rate": 4.320855614973263e-06,
"loss": 1.2974510192871094,
"step": 2200
},
{
"epoch": 2.3639282847203638,
"grad_norm": 1.78125,
"learning_rate": 4.249554367201426e-06,
"loss": 1.303697681427002,
"step": 2210
},
{
"epoch": 2.374632057800375,
"grad_norm": 1.765625,
"learning_rate": 4.178253119429591e-06,
"loss": 1.303341007232666,
"step": 2220
},
{
"epoch": 2.3853358308803854,
"grad_norm": 1.46875,
"learning_rate": 4.106951871657754e-06,
"loss": 1.306796932220459,
"step": 2230
},
{
"epoch": 2.396039603960396,
"grad_norm": 1.828125,
"learning_rate": 4.035650623885918e-06,
"loss": 1.3068408012390136,
"step": 2240
},
{
"epoch": 2.4067433770404065,
"grad_norm": 1.671875,
"learning_rate": 3.9643493761140825e-06,
"loss": 1.307657527923584,
"step": 2250
},
{
"epoch": 2.4174471501204176,
"grad_norm": 1.75,
"learning_rate": 3.893048128342246e-06,
"loss": 1.2932353973388673,
"step": 2260
},
{
"epoch": 2.428150923200428,
"grad_norm": 1.765625,
"learning_rate": 3.821746880570411e-06,
"loss": 1.2625031471252441,
"step": 2270
},
{
"epoch": 2.4388546962804387,
"grad_norm": 1.703125,
"learning_rate": 3.7504456327985743e-06,
"loss": 1.3354209899902343,
"step": 2280
},
{
"epoch": 2.4495584693604497,
"grad_norm": 1.3671875,
"learning_rate": 3.6791443850267383e-06,
"loss": 1.200312042236328,
"step": 2290
},
{
"epoch": 2.4602622424404603,
"grad_norm": 1.6484375,
"learning_rate": 3.6078431372549024e-06,
"loss": 1.2868337631225586,
"step": 2300
},
{
"epoch": 2.470966015520471,
"grad_norm": 1.78125,
"learning_rate": 3.536541889483066e-06,
"loss": 1.2731021881103515,
"step": 2310
},
{
"epoch": 2.4816697886004815,
"grad_norm": 2.265625,
"learning_rate": 3.46524064171123e-06,
"loss": 1.3145703315734862,
"step": 2320
},
{
"epoch": 2.4923735616804925,
"grad_norm": 1.671875,
"learning_rate": 3.3939393939393946e-06,
"loss": 1.305215549468994,
"step": 2330
},
{
"epoch": 2.503077334760503,
"grad_norm": 1.734375,
"learning_rate": 3.322638146167558e-06,
"loss": 1.3567096710205078,
"step": 2340
},
{
"epoch": 2.5137811078405137,
"grad_norm": 1.6015625,
"learning_rate": 3.2513368983957223e-06,
"loss": 1.3081507682800293,
"step": 2350
},
{
"epoch": 2.5244848809205243,
"grad_norm": 1.7109375,
"learning_rate": 3.180035650623886e-06,
"loss": 1.300461483001709,
"step": 2360
},
{
"epoch": 2.5351886540005353,
"grad_norm": 1.5859375,
"learning_rate": 3.10873440285205e-06,
"loss": 1.3006972312927245,
"step": 2370
},
{
"epoch": 2.545892427080546,
"grad_norm": 1.734375,
"learning_rate": 3.0374331550802145e-06,
"loss": 1.3157925605773926,
"step": 2380
},
{
"epoch": 2.5565962001605564,
"grad_norm": 1.5625,
"learning_rate": 2.966131907308378e-06,
"loss": 1.2608634948730468,
"step": 2390
},
{
"epoch": 2.5672999732405675,
"grad_norm": 1.765625,
"learning_rate": 2.894830659536542e-06,
"loss": 1.237275981903076,
"step": 2400
},
{
"epoch": 2.578003746320578,
"grad_norm": 1.5859375,
"learning_rate": 2.8235294117647062e-06,
"loss": 1.274481964111328,
"step": 2410
},
{
"epoch": 2.5887075194005886,
"grad_norm": 1.65625,
"learning_rate": 2.75222816399287e-06,
"loss": 1.3146997451782227,
"step": 2420
},
{
"epoch": 2.5994112924805997,
"grad_norm": 1.6640625,
"learning_rate": 2.680926916221034e-06,
"loss": 1.3125761032104493,
"step": 2430
},
{
"epoch": 2.6101150655606102,
"grad_norm": 1.6953125,
"learning_rate": 2.6096256684491984e-06,
"loss": 1.2919845581054688,
"step": 2440
},
{
"epoch": 2.620818838640621,
"grad_norm": 1.6484375,
"learning_rate": 2.538324420677362e-06,
"loss": 1.2364542961120606,
"step": 2450
},
{
"epoch": 2.6315226117206314,
"grad_norm": 1.609375,
"learning_rate": 2.467023172905526e-06,
"loss": 1.2624409675598145,
"step": 2460
},
{
"epoch": 2.642226384800642,
"grad_norm": 1.5859375,
"learning_rate": 2.3957219251336898e-06,
"loss": 1.3075796127319337,
"step": 2470
},
{
"epoch": 2.652930157880653,
"grad_norm": 2.109375,
"learning_rate": 2.3244206773618542e-06,
"loss": 1.2835824012756347,
"step": 2480
},
{
"epoch": 2.6636339309606636,
"grad_norm": 1.8359375,
"learning_rate": 2.253119429590018e-06,
"loss": 1.307276153564453,
"step": 2490
},
{
"epoch": 2.674337704040674,
"grad_norm": 1.7421875,
"learning_rate": 2.181818181818182e-06,
"loss": 1.2622486114501954,
"step": 2500
},
{
"epoch": 2.685041477120685,
"grad_norm": 1.6796875,
"learning_rate": 2.110516934046346e-06,
"loss": 1.2514682769775392,
"step": 2510
},
{
"epoch": 2.6957452502006958,
"grad_norm": 1.5625,
"learning_rate": 2.03921568627451e-06,
"loss": 1.2614849090576172,
"step": 2520
},
{
"epoch": 2.7064490232807064,
"grad_norm": 1.5390625,
"learning_rate": 1.9679144385026737e-06,
"loss": 1.3241849899291993,
"step": 2530
},
{
"epoch": 2.7171527963607174,
"grad_norm": 1.7890625,
"learning_rate": 1.896613190730838e-06,
"loss": 1.2841781616210937,
"step": 2540
},
{
"epoch": 2.727856569440728,
"grad_norm": 1.8515625,
"learning_rate": 1.8253119429590018e-06,
"loss": 1.3062438011169433,
"step": 2550
},
{
"epoch": 2.7385603425207385,
"grad_norm": 1.6328125,
"learning_rate": 1.7540106951871661e-06,
"loss": 1.3065251350402831,
"step": 2560
},
{
"epoch": 2.749264115600749,
"grad_norm": 1.65625,
"learning_rate": 1.68270944741533e-06,
"loss": 1.2980451583862305,
"step": 2570
},
{
"epoch": 2.7599678886807597,
"grad_norm": 1.671875,
"learning_rate": 1.6114081996434938e-06,
"loss": 1.2766281127929688,
"step": 2580
},
{
"epoch": 2.7706716617607707,
"grad_norm": 1.5859375,
"learning_rate": 1.5401069518716579e-06,
"loss": 1.3033970832824706,
"step": 2590
},
{
"epoch": 2.7813754348407813,
"grad_norm": 1.4921875,
"learning_rate": 1.468805704099822e-06,
"loss": 1.2335359573364257,
"step": 2600
},
{
"epoch": 2.792079207920792,
"grad_norm": 1.6484375,
"learning_rate": 1.3975044563279858e-06,
"loss": 1.3184511184692382,
"step": 2610
},
{
"epoch": 2.802782981000803,
"grad_norm": 1.609375,
"learning_rate": 1.3262032085561499e-06,
"loss": 1.1845362663269043,
"step": 2620
},
{
"epoch": 2.8134867540808135,
"grad_norm": 1.7890625,
"learning_rate": 1.2549019607843137e-06,
"loss": 1.2506700515747071,
"step": 2630
},
{
"epoch": 2.824190527160824,
"grad_norm": 1.7109375,
"learning_rate": 1.1836007130124778e-06,
"loss": 1.2360112190246582,
"step": 2640
},
{
"epoch": 2.834894300240835,
"grad_norm": 1.703125,
"learning_rate": 1.1122994652406418e-06,
"loss": 1.2875761032104491,
"step": 2650
},
{
"epoch": 2.8455980733208457,
"grad_norm": 1.578125,
"learning_rate": 1.0409982174688057e-06,
"loss": 1.2473506927490234,
"step": 2660
},
{
"epoch": 2.8563018464008563,
"grad_norm": 1.546875,
"learning_rate": 9.696969696969698e-07,
"loss": 1.3208060264587402,
"step": 2670
},
{
"epoch": 2.867005619480867,
"grad_norm": 1.671875,
"learning_rate": 8.983957219251338e-07,
"loss": 1.3371116638183593,
"step": 2680
},
{
"epoch": 2.8777093925608774,
"grad_norm": 1.484375,
"learning_rate": 8.270944741532977e-07,
"loss": 1.2605000495910645,
"step": 2690
},
{
"epoch": 2.8884131656408885,
"grad_norm": 1.6015625,
"learning_rate": 7.557932263814617e-07,
"loss": 1.267725658416748,
"step": 2700
},
{
"epoch": 2.899116938720899,
"grad_norm": 1.640625,
"learning_rate": 6.844919786096257e-07,
"loss": 1.27689208984375,
"step": 2710
},
{
"epoch": 2.9098207118009096,
"grad_norm": 1.578125,
"learning_rate": 6.131907308377896e-07,
"loss": 1.286923885345459,
"step": 2720
},
{
"epoch": 2.9205244848809206,
"grad_norm": 1.765625,
"learning_rate": 5.418894830659537e-07,
"loss": 1.330905055999756,
"step": 2730
},
{
"epoch": 2.9312282579609312,
"grad_norm": 1.71875,
"learning_rate": 4.7058823529411767e-07,
"loss": 1.2354840278625487,
"step": 2740
},
{
"epoch": 2.941932031040942,
"grad_norm": 1.4609375,
"learning_rate": 3.992869875222817e-07,
"loss": 1.2647834777832032,
"step": 2750
},
{
"epoch": 2.952635804120953,
"grad_norm": 1.6875,
"learning_rate": 3.2798573975044564e-07,
"loss": 1.2786317825317384,
"step": 2760
},
{
"epoch": 2.9633395772009634,
"grad_norm": 1.53125,
"learning_rate": 2.5668449197860965e-07,
"loss": 1.2594982147216798,
"step": 2770
},
{
"epoch": 2.974043350280974,
"grad_norm": 1.578125,
"learning_rate": 1.8538324420677363e-07,
"loss": 1.3203317642211914,
"step": 2780
},
{
"epoch": 2.984747123360985,
"grad_norm": 2.046875,
"learning_rate": 1.1408199643493762e-07,
"loss": 1.224764347076416,
"step": 2790
},
{
"epoch": 2.9954508964409956,
"grad_norm": 1.5078125,
"learning_rate": 4.2780748663101606e-08,
"loss": 1.2845193862915039,
"step": 2800
}
],
"logging_steps": 10,
"max_steps": 2805,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7314356060356608.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}