{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9997473684210526,
  "eval_steps": 500,
  "global_step": 1484,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0006736842105263158,
      "grad_norm": 3.6967623233795166,
      "learning_rate": 1.3422818791946309e-06,
      "loss": 2.4093,
      "step": 1
    },
    {
      "epoch": 0.006736842105263158,
      "grad_norm": 2.5490193367004395,
      "learning_rate": 1.3422818791946309e-05,
      "loss": 2.4939,
      "step": 10
    },
    {
      "epoch": 0.013473684210526317,
      "grad_norm": 0.18483224511146545,
      "learning_rate": 2.6845637583892618e-05,
      "loss": 1.1877,
      "step": 20
    },
    {
      "epoch": 0.020210526315789474,
      "grad_norm": 0.2031693309545517,
      "learning_rate": 4.026845637583892e-05,
      "loss": 0.8909,
      "step": 30
    },
    {
      "epoch": 0.026947368421052633,
      "grad_norm": 0.6876732707023621,
      "learning_rate": 5.3691275167785237e-05,
      "loss": 0.7581,
      "step": 40
    },
    {
      "epoch": 0.03368421052631579,
      "grad_norm": 0.09247241914272308,
      "learning_rate": 6.711409395973155e-05,
      "loss": 0.7594,
      "step": 50
    },
    {
      "epoch": 0.04042105263157895,
      "grad_norm": 0.1324968934059143,
      "learning_rate": 8.053691275167784e-05,
      "loss": 0.7405,
      "step": 60
    },
    {
      "epoch": 0.04715789473684211,
      "grad_norm": 0.05673883110284805,
      "learning_rate": 9.395973154362417e-05,
      "loss": 0.7065,
      "step": 70
    },
    {
      "epoch": 0.053894736842105266,
      "grad_norm": 0.04617280140519142,
      "learning_rate": 0.00010738255033557047,
      "loss": 0.6817,
      "step": 80
    },
    {
      "epoch": 0.06063157894736842,
      "grad_norm": 0.04381496459245682,
      "learning_rate": 0.0001208053691275168,
      "loss": 0.6789,
      "step": 90
    },
    {
      "epoch": 0.06736842105263158,
      "grad_norm": 0.07428538799285889,
      "learning_rate": 0.0001342281879194631,
      "loss": 0.6816,
      "step": 100
    },
    {
      "epoch": 0.07410526315789474,
      "grad_norm": 0.04249708354473114,
      "learning_rate": 0.00014765100671140942,
      "loss": 0.6997,
      "step": 110
    },
    {
      "epoch": 0.0808421052631579,
      "grad_norm": 0.05957937240600586,
      "learning_rate": 0.0001610738255033557,
      "loss": 0.6807,
      "step": 120
    },
    {
      "epoch": 0.08757894736842105,
      "grad_norm": 0.03975442424416542,
      "learning_rate": 0.000174496644295302,
      "loss": 0.6733,
      "step": 130
    },
    {
      "epoch": 0.09431578947368421,
      "grad_norm": 0.04079463332891464,
      "learning_rate": 0.00018791946308724833,
      "loss": 0.6556,
      "step": 140
    },
    {
      "epoch": 0.10105263157894737,
      "grad_norm": 0.04245497286319733,
      "learning_rate": 0.00019985018726591762,
      "loss": 0.6575,
      "step": 150
    },
    {
      "epoch": 0.10778947368421053,
      "grad_norm": 0.09695123136043549,
      "learning_rate": 0.00019835205992509364,
      "loss": 0.6916,
      "step": 160
    },
    {
      "epoch": 0.11452631578947368,
      "grad_norm": 0.03505201265215874,
      "learning_rate": 0.00019685393258426966,
      "loss": 0.6622,
      "step": 170
    },
    {
      "epoch": 0.12126315789473684,
      "grad_norm": 0.02820334956049919,
      "learning_rate": 0.0001953558052434457,
      "loss": 0.6497,
      "step": 180
    },
    {
      "epoch": 0.128,
      "grad_norm": 0.04135354235768318,
      "learning_rate": 0.00019385767790262173,
      "loss": 0.6671,
      "step": 190
    },
    {
      "epoch": 0.13473684210526315,
      "grad_norm": 0.031461067497730255,
      "learning_rate": 0.00019235955056179775,
      "loss": 0.657,
      "step": 200
    },
    {
      "epoch": 0.1414736842105263,
      "grad_norm": 0.04208710789680481,
      "learning_rate": 0.0001908614232209738,
      "loss": 0.6766,
      "step": 210
    },
    {
      "epoch": 0.1482105263157895,
      "grad_norm": 3.495147705078125,
      "learning_rate": 0.00018936329588014982,
      "loss": 3.9378,
      "step": 220
    },
    {
      "epoch": 0.15494736842105264,
      "grad_norm": 0.18893112242221832,
      "learning_rate": 0.00018786516853932586,
      "loss": 7.1374,
      "step": 230
    },
    {
      "epoch": 0.1616842105263158,
      "grad_norm": 0.0959916040301323,
      "learning_rate": 0.00018636704119850189,
      "loss": 5.8104,
      "step": 240
    },
    {
      "epoch": 0.16842105263157894,
      "grad_norm": 0.08286964148283005,
      "learning_rate": 0.0001848689138576779,
      "loss": 4.7292,
      "step": 250
    },
    {
      "epoch": 0.1751578947368421,
      "grad_norm": 0.04510454833507538,
      "learning_rate": 0.00018337078651685393,
      "loss": 4.9858,
      "step": 260
    },
    {
      "epoch": 0.18189473684210528,
      "grad_norm": 0.2256896197795868,
      "learning_rate": 0.00018187265917602997,
      "loss": 4.7463,
      "step": 270
    },
    {
      "epoch": 0.18863157894736843,
      "grad_norm": 0.06342379748821259,
      "learning_rate": 0.00018037453183520602,
      "loss": 4.517,
      "step": 280
    },
    {
      "epoch": 0.19536842105263158,
      "grad_norm": 0.07497289776802063,
      "learning_rate": 0.00017887640449438204,
      "loss": 4.4052,
      "step": 290
    },
    {
      "epoch": 0.20210526315789473,
      "grad_norm": 0.08952877670526505,
      "learning_rate": 0.00017737827715355806,
      "loss": 3.9614,
      "step": 300
    },
    {
      "epoch": 0.20884210526315788,
      "grad_norm": 0.044066932052373886,
      "learning_rate": 0.00017588014981273408,
      "loss": 4.5861,
      "step": 310
    },
    {
      "epoch": 0.21557894736842106,
      "grad_norm": 0.08251778781414032,
      "learning_rate": 0.0001743820224719101,
      "loss": 4.5163,
      "step": 320
    },
    {
      "epoch": 0.22231578947368422,
      "grad_norm": 0.04723803699016571,
      "learning_rate": 0.00017288389513108615,
      "loss": 4.1904,
      "step": 330
    },
    {
      "epoch": 0.22905263157894737,
      "grad_norm": 0.09082615375518799,
      "learning_rate": 0.0001713857677902622,
      "loss": 4.1982,
      "step": 340
    },
    {
      "epoch": 0.23578947368421052,
      "grad_norm": 0.04866361245512962,
      "learning_rate": 0.00016988764044943822,
      "loss": 3.8506,
      "step": 350
    },
    {
      "epoch": 0.24252631578947367,
      "grad_norm": 0.04515402019023895,
      "learning_rate": 0.00016838951310861424,
      "loss": 4.4254,
      "step": 360
    },
    {
      "epoch": 0.24926315789473685,
      "grad_norm": 0.14205284416675568,
      "learning_rate": 0.00016689138576779026,
      "loss": 4.4111,
      "step": 370
    },
    {
      "epoch": 0.256,
      "grad_norm": 0.16082021594047546,
      "learning_rate": 0.0001653932584269663,
      "loss": 4.1119,
      "step": 380
    },
    {
      "epoch": 0.26273684210526316,
      "grad_norm": 0.061411116272211075,
      "learning_rate": 0.00016389513108614235,
      "loss": 4.059,
      "step": 390
    },
    {
      "epoch": 0.2694736842105263,
      "grad_norm": 0.058379318565130234,
      "learning_rate": 0.00016239700374531837,
      "loss": 3.7307,
      "step": 400
    },
    {
      "epoch": 0.27621052631578946,
      "grad_norm": 0.048859789967536926,
      "learning_rate": 0.0001608988764044944,
      "loss": 4.3039,
      "step": 410
    },
    {
      "epoch": 0.2829473684210526,
      "grad_norm": 0.06003361940383911,
      "learning_rate": 0.0001594007490636704,
      "loss": 4.2032,
      "step": 420
    },
    {
      "epoch": 0.28968421052631577,
      "grad_norm": 0.10120591521263123,
      "learning_rate": 0.00015790262172284646,
      "loss": 3.9567,
      "step": 430
    },
    {
      "epoch": 0.296421052631579,
      "grad_norm": 0.21033401787281036,
      "learning_rate": 0.00015640449438202248,
      "loss": 3.9369,
      "step": 440
    },
    {
      "epoch": 0.3031578947368421,
      "grad_norm": 0.06378967314958572,
      "learning_rate": 0.00015490636704119852,
      "loss": 3.6318,
      "step": 450
    },
    {
      "epoch": 0.3098947368421053,
      "grad_norm": 0.042198359966278076,
      "learning_rate": 0.00015340823970037455,
      "loss": 4.1789,
      "step": 460
    },
    {
      "epoch": 0.31663157894736843,
      "grad_norm": 0.053648848086595535,
      "learning_rate": 0.00015191011235955057,
      "loss": 4.1562,
      "step": 470
    },
    {
      "epoch": 0.3233684210526316,
      "grad_norm": 0.0808805301785469,
      "learning_rate": 0.00015041198501872659,
      "loss": 3.8883,
      "step": 480
    },
    {
      "epoch": 0.33010526315789473,
      "grad_norm": 0.13895294070243835,
      "learning_rate": 0.00014891385767790263,
      "loss": 3.9055,
      "step": 490
    },
    {
      "epoch": 0.3368421052631579,
      "grad_norm": 0.11999215185642242,
      "learning_rate": 0.00014741573033707865,
      "loss": 3.6025,
      "step": 500
    },
    {
      "epoch": 0.34357894736842104,
      "grad_norm": 0.0969998687505722,
      "learning_rate": 0.0001459176029962547,
      "loss": 4.2401,
      "step": 510
    },
    {
      "epoch": 0.3503157894736842,
      "grad_norm": 0.2578948438167572,
      "learning_rate": 0.00014441947565543072,
      "loss": 4.1355,
      "step": 520
    },
    {
      "epoch": 0.35705263157894734,
      "grad_norm": 0.067634217441082,
      "learning_rate": 0.00014292134831460674,
      "loss": 3.8735,
      "step": 530
    },
    {
      "epoch": 0.36378947368421055,
      "grad_norm": 0.1961352676153183,
      "learning_rate": 0.0001414232209737828,
      "loss": 3.7641,
      "step": 540
    },
    {
      "epoch": 0.3705263157894737,
      "grad_norm": 0.07940343767404556,
      "learning_rate": 0.0001399250936329588,
      "loss": 3.5177,
      "step": 550
    },
    {
      "epoch": 0.37726315789473686,
      "grad_norm": 1.3029491901397705,
      "learning_rate": 0.00013842696629213483,
      "loss": 4.1854,
      "step": 560
    },
    {
      "epoch": 0.384,
      "grad_norm": 0.10544762760400772,
      "learning_rate": 0.00013692883895131088,
      "loss": 4.3064,
      "step": 570
    },
    {
      "epoch": 0.39073684210526316,
      "grad_norm": 0.150394469499588,
      "learning_rate": 0.0001354307116104869,
      "loss": 3.9517,
      "step": 580
    },
    {
      "epoch": 0.3974736842105263,
      "grad_norm": 0.06921563297510147,
      "learning_rate": 0.00013393258426966294,
      "loss": 3.8917,
      "step": 590
    },
    {
      "epoch": 0.40421052631578946,
      "grad_norm": 0.06402010470628738,
      "learning_rate": 0.00013243445692883896,
      "loss": 3.5635,
      "step": 600
    },
    {
      "epoch": 0.4109473684210526,
      "grad_norm": 0.08918313682079315,
      "learning_rate": 0.00013093632958801498,
      "loss": 4.1197,
      "step": 610
    },
    {
      "epoch": 0.41768421052631577,
      "grad_norm": 0.054397523403167725,
      "learning_rate": 0.000129438202247191,
      "loss": 4.0442,
      "step": 620
    },
    {
      "epoch": 0.4244210526315789,
      "grad_norm": 0.068702831864357,
      "learning_rate": 0.00012794007490636705,
      "loss": 3.7506,
      "step": 630
    },
    {
      "epoch": 0.43115789473684213,
      "grad_norm": 0.14575353264808655,
      "learning_rate": 0.0001264419475655431,
      "loss": 3.7359,
      "step": 640
    },
    {
      "epoch": 0.4378947368421053,
      "grad_norm": 0.1481335461139679,
      "learning_rate": 0.00012494382022471912,
      "loss": 3.3705,
      "step": 650
    },
    {
      "epoch": 0.44463157894736843,
      "grad_norm": 0.06438197940587997,
      "learning_rate": 0.00012344569288389514,
      "loss": 4.0248,
      "step": 660
    },
    {
      "epoch": 0.4513684210526316,
      "grad_norm": 0.38855019211769104,
      "learning_rate": 0.00012194756554307116,
      "loss": 4.0265,
      "step": 670
    },
    {
      "epoch": 0.45810526315789474,
      "grad_norm": 0.20793034136295319,
      "learning_rate": 0.00012044943820224719,
      "loss": 3.7305,
      "step": 680
    },
    {
      "epoch": 0.4648421052631579,
      "grad_norm": 0.11011853814125061,
      "learning_rate": 0.00011895131086142324,
      "loss": 3.6933,
      "step": 690
    },
    {
      "epoch": 0.47157894736842104,
      "grad_norm": 0.06795340031385422,
      "learning_rate": 0.00011745318352059926,
      "loss": 3.3734,
      "step": 700
    },
    {
      "epoch": 0.4783157894736842,
      "grad_norm": 0.07788679003715515,
      "learning_rate": 0.00011595505617977529,
      "loss": 3.9053,
      "step": 710
    },
    {
      "epoch": 0.48505263157894735,
      "grad_norm": 0.07339611649513245,
      "learning_rate": 0.00011445692883895131,
      "loss": 3.8685,
      "step": 720
    },
    {
      "epoch": 0.4917894736842105,
      "grad_norm": 0.16048288345336914,
      "learning_rate": 0.00011295880149812735,
      "loss": 3.5673,
      "step": 730
    },
    {
      "epoch": 0.4985263157894737,
      "grad_norm": 0.2596355974674225,
      "learning_rate": 0.00011146067415730337,
      "loss": 3.5684,
      "step": 740
    },
    {
      "epoch": 0.5052631578947369,
      "grad_norm": 0.10115884989500046,
      "learning_rate": 0.00010996254681647941,
      "loss": 3.2226,
      "step": 750
    },
    {
      "epoch": 0.512,
      "grad_norm": 0.13997367024421692,
      "learning_rate": 0.00010846441947565545,
      "loss": 3.8579,
      "step": 760
    },
    {
      "epoch": 0.5187368421052632,
      "grad_norm": 0.08359155058860779,
      "learning_rate": 0.00010696629213483147,
      "loss": 3.8313,
      "step": 770
    },
    {
      "epoch": 0.5254736842105263,
      "grad_norm": 0.2407791018486023,
      "learning_rate": 0.0001054681647940075,
      "loss": 3.5257,
      "step": 780
    },
    {
      "epoch": 0.5322105263157895,
      "grad_norm": 0.34615418314933777,
      "learning_rate": 0.00010397003745318352,
      "loss": 3.5113,
      "step": 790
    },
    {
      "epoch": 0.5389473684210526,
      "grad_norm": 0.06987264007329941,
      "learning_rate": 0.00010247191011235954,
      "loss": 3.1525,
      "step": 800
    },
    {
      "epoch": 0.5456842105263158,
      "grad_norm": 0.07933894544839859,
      "learning_rate": 0.00010097378277153558,
      "loss": 3.718,
      "step": 810
    },
    {
      "epoch": 0.5524210526315789,
      "grad_norm": 0.12424171715974808,
      "learning_rate": 9.947565543071161e-05,
      "loss": 3.6641,
      "step": 820
    },
    {
      "epoch": 0.5591578947368421,
      "grad_norm": 0.2515564262866974,
      "learning_rate": 9.797752808988764e-05,
      "loss": 3.4268,
      "step": 830
    },
    {
      "epoch": 0.5658947368421052,
      "grad_norm": 0.30851560831069946,
      "learning_rate": 9.647940074906368e-05,
      "loss": 3.3856,
      "step": 840
    },
    {
      "epoch": 0.5726315789473684,
      "grad_norm": 0.05149822682142258,
      "learning_rate": 9.49812734082397e-05,
      "loss": 3.1259,
      "step": 850
    },
    {
      "epoch": 0.5793684210526315,
      "grad_norm": 0.17960771918296814,
      "learning_rate": 9.348314606741574e-05,
      "loss": 3.6767,
      "step": 860
    },
    {
      "epoch": 0.5861052631578947,
      "grad_norm": 0.17523854970932007,
      "learning_rate": 9.198501872659176e-05,
      "loss": 3.5995,
      "step": 870
    },
    {
      "epoch": 0.592842105263158,
      "grad_norm": 0.3186163008213043,
      "learning_rate": 9.04868913857678e-05,
      "loss": 3.3966,
      "step": 880
    },
    {
      "epoch": 0.5995789473684211,
      "grad_norm": 0.21263690292835236,
      "learning_rate": 8.898876404494383e-05,
      "loss": 3.3526,
      "step": 890
    },
    {
      "epoch": 0.6063157894736843,
      "grad_norm": 0.10399254411458969,
      "learning_rate": 8.749063670411985e-05,
      "loss": 3.0519,
      "step": 900
    },
    {
      "epoch": 0.6130526315789474,
      "grad_norm": 0.13143524527549744,
      "learning_rate": 8.599250936329589e-05,
      "loss": 3.629,
      "step": 910
    },
    {
      "epoch": 0.6197894736842106,
      "grad_norm": 0.15374666452407837,
      "learning_rate": 8.449438202247192e-05,
      "loss": 3.6895,
      "step": 920
    },
    {
      "epoch": 0.6265263157894737,
      "grad_norm": 0.23757484555244446,
      "learning_rate": 8.299625468164794e-05,
      "loss": 3.3622,
      "step": 930
    },
    {
      "epoch": 0.6332631578947369,
      "grad_norm": 0.1661984622478485,
      "learning_rate": 8.149812734082397e-05,
      "loss": 3.3248,
      "step": 940
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.08603614568710327,
      "learning_rate": 8e-05,
      "loss": 3.0086,
      "step": 950
    },
    {
      "epoch": 0.6467368421052632,
      "grad_norm": 0.07694745808839798,
      "learning_rate": 7.850187265917604e-05,
      "loss": 3.5162,
      "step": 960
    },
    {
      "epoch": 0.6534736842105263,
      "grad_norm": 0.16395558416843414,
      "learning_rate": 7.700374531835206e-05,
      "loss": 3.4812,
      "step": 970
    },
    {
      "epoch": 0.6602105263157895,
      "grad_norm": 0.13817398250102997,
      "learning_rate": 7.55056179775281e-05,
      "loss": 3.2516,
      "step": 980
    },
    {
      "epoch": 0.6669473684210526,
      "grad_norm": 0.25807198882102966,
      "learning_rate": 7.400749063670413e-05,
      "loss": 3.2101,
      "step": 990
    },
    {
      "epoch": 0.6736842105263158,
      "grad_norm": 0.06848172843456268,
      "learning_rate": 7.250936329588015e-05,
      "loss": 2.93,
      "step": 1000
    },
    {
      "epoch": 0.6804210526315789,
      "grad_norm": 1.089575171470642,
      "learning_rate": 7.101123595505618e-05,
      "loss": 3.4925,
      "step": 1010
    },
    {
      "epoch": 0.6871578947368421,
      "grad_norm": 0.20126965641975403,
      "learning_rate": 6.951310861423222e-05,
      "loss": 3.4603,
      "step": 1020
    },
    {
      "epoch": 0.6938947368421052,
      "grad_norm": 0.21779027581214905,
      "learning_rate": 6.801498127340824e-05,
      "loss": 3.1723,
      "step": 1030
    },
    {
      "epoch": 0.7006315789473684,
      "grad_norm": 0.18239159882068634,
      "learning_rate": 6.651685393258428e-05,
      "loss": 3.1903,
      "step": 1040
    },
    {
      "epoch": 0.7073684210526315,
      "grad_norm": 0.06677573919296265,
      "learning_rate": 6.50187265917603e-05,
      "loss": 2.8445,
      "step": 1050
    },
    {
      "epoch": 0.7141052631578947,
      "grad_norm": 0.42619746923446655,
      "learning_rate": 6.352059925093634e-05,
      "loss": 3.4319,
      "step": 1060
    },
    {
      "epoch": 0.7208421052631578,
      "grad_norm": 0.12023507058620453,
      "learning_rate": 6.202247191011237e-05,
      "loss": 3.3826,
      "step": 1070
    },
    {
      "epoch": 0.7275789473684211,
      "grad_norm": 0.15099403262138367,
      "learning_rate": 6.052434456928839e-05,
      "loss": 3.1425,
      "step": 1080
    },
    {
      "epoch": 0.7343157894736843,
      "grad_norm": 0.3474717438220978,
      "learning_rate": 5.902621722846442e-05,
      "loss": 3.1279,
      "step": 1090
    },
    {
      "epoch": 0.7410526315789474,
      "grad_norm": 0.12225649505853653,
      "learning_rate": 5.752808988764046e-05,
      "loss": 2.9033,
      "step": 1100
    },
    {
      "epoch": 0.7477894736842106,
      "grad_norm": 0.19639068841934204,
      "learning_rate": 5.6029962546816485e-05,
      "loss": 3.3681,
      "step": 1110
    },
    {
      "epoch": 0.7545263157894737,
      "grad_norm": 0.10571427643299103,
      "learning_rate": 5.453183520599251e-05,
      "loss": 3.335,
      "step": 1120
    },
    {
      "epoch": 0.7612631578947369,
      "grad_norm": 0.5154901146888733,
      "learning_rate": 5.3033707865168545e-05,
      "loss": 3.0952,
      "step": 1130
    },
    {
      "epoch": 0.768,
      "grad_norm": 0.6122628450393677,
      "learning_rate": 5.153558052434457e-05,
      "loss": 3.1269,
      "step": 1140
    },
    {
      "epoch": 0.7747368421052632,
      "grad_norm": 0.19698569178581238,
      "learning_rate": 5.00374531835206e-05,
      "loss": 2.8233,
      "step": 1150
    },
    {
      "epoch": 0.7814736842105263,
      "grad_norm": 0.13018374145030975,
      "learning_rate": 4.853932584269663e-05,
      "loss": 3.3094,
      "step": 1160
    },
    {
      "epoch": 0.7882105263157895,
      "grad_norm": 0.09522128850221634,
      "learning_rate": 4.704119850187266e-05,
      "loss": 3.2765,
      "step": 1170
    },
    {
      "epoch": 0.7949473684210526,
      "grad_norm": 0.10098107159137726,
      "learning_rate": 4.554307116104869e-05,
      "loss": 3.0807,
      "step": 1180
    },
    {
      "epoch": 0.8016842105263158,
      "grad_norm": 0.18019132316112518,
      "learning_rate": 4.404494382022472e-05,
      "loss": 3.0332,
      "step": 1190
    },
    {
      "epoch": 0.8084210526315789,
      "grad_norm": 0.16289708018302917,
      "learning_rate": 4.2546816479400754e-05,
      "loss": 2.7374,
      "step": 1200
    },
    {
      "epoch": 0.8151578947368421,
      "grad_norm": 0.12666673958301544,
      "learning_rate": 4.104868913857678e-05,
      "loss": 3.2118,
      "step": 1210
    },
    {
      "epoch": 0.8218947368421052,
      "grad_norm": 0.16891352832317352,
      "learning_rate": 3.955056179775281e-05,
      "loss": 3.1902,
      "step": 1220
    },
    {
      "epoch": 0.8286315789473684,
      "grad_norm": 0.10958009213209152,
      "learning_rate": 3.805243445692884e-05,
      "loss": 2.9862,
      "step": 1230
    },
    {
      "epoch": 0.8353684210526315,
      "grad_norm": 0.10642745345830917,
      "learning_rate": 3.655430711610487e-05,
      "loss": 3.0052,
      "step": 1240
    },
    {
      "epoch": 0.8421052631578947,
      "grad_norm": 0.05656813085079193,
      "learning_rate": 3.50561797752809e-05,
      "loss": 2.723,
      "step": 1250
    },
    {
      "epoch": 0.8488421052631578,
      "grad_norm": 0.08322717994451523,
      "learning_rate": 3.355805243445693e-05,
      "loss": 3.234,
      "step": 1260
    },
    {
      "epoch": 0.8555789473684211,
      "grad_norm": 0.13246551156044006,
      "learning_rate": 3.2059925093632956e-05,
      "loss": 3.212,
      "step": 1270
    },
    {
      "epoch": 0.8623157894736843,
      "grad_norm": 0.10225304961204529,
      "learning_rate": 3.056179775280899e-05,
      "loss": 2.9484,
      "step": 1280
    },
    {
      "epoch": 0.8690526315789474,
      "grad_norm": 0.19440552592277527,
      "learning_rate": 2.9063670411985024e-05,
      "loss": 2.9266,
      "step": 1290
    },
    {
      "epoch": 0.8757894736842106,
      "grad_norm": 0.08913037180900574,
      "learning_rate": 2.7565543071161047e-05,
      "loss": 2.6801,
      "step": 1300
    },
    {
      "epoch": 0.8825263157894737,
      "grad_norm": 0.10815408080816269,
      "learning_rate": 2.606741573033708e-05,
      "loss": 3.1505,
      "step": 1310
    },
    {
      "epoch": 0.8892631578947369,
      "grad_norm": 0.14371147751808167,
      "learning_rate": 2.4569288389513108e-05,
      "loss": 3.1293,
      "step": 1320
    },
    {
      "epoch": 0.896,
      "grad_norm": 0.1680973470211029,
      "learning_rate": 2.3071161048689138e-05,
      "loss": 2.8961,
      "step": 1330
    },
    {
      "epoch": 0.9027368421052632,
      "grad_norm": 0.19012019038200378,
      "learning_rate": 2.157303370786517e-05,
      "loss": 2.9096,
      "step": 1340
    },
    {
      "epoch": 0.9094736842105263,
      "grad_norm": 0.060957688838243484,
      "learning_rate": 2.00749063670412e-05,
      "loss": 2.6879,
      "step": 1350
    },
    {
      "epoch": 0.9162105263157895,
      "grad_norm": 0.15055014193058014,
      "learning_rate": 1.857677902621723e-05,
      "loss": 3.108,
      "step": 1360
    },
    {
      "epoch": 0.9229473684210526,
      "grad_norm": 0.1378874033689499,
      "learning_rate": 1.707865168539326e-05,
      "loss": 3.0428,
      "step": 1370
    },
    {
      "epoch": 0.9296842105263158,
      "grad_norm": 0.14901022613048553,
      "learning_rate": 1.558052434456929e-05,
      "loss": 2.8589,
      "step": 1380
    },
    {
      "epoch": 0.9364210526315789,
      "grad_norm": 0.17515867948532104,
      "learning_rate": 1.4082397003745318e-05,
      "loss": 2.8563,
      "step": 1390
    },
    {
      "epoch": 0.9431578947368421,
      "grad_norm": 0.11909812688827515,
      "learning_rate": 1.258426966292135e-05,
      "loss": 2.5759,
      "step": 1400
    },
    {
      "epoch": 0.9498947368421052,
      "grad_norm": 0.16348549723625183,
      "learning_rate": 1.1086142322097379e-05,
      "loss": 3.089,
      "step": 1410
    },
    {
      "epoch": 0.9566315789473684,
      "grad_norm": 0.08107765763998032,
      "learning_rate": 9.588014981273409e-06,
      "loss": 3.0145,
      "step": 1420
    },
    {
      "epoch": 0.9633684210526315,
      "grad_norm": 0.13251617550849915,
      "learning_rate": 8.089887640449438e-06,
      "loss": 2.8256,
      "step": 1430
    },
    {
      "epoch": 0.9701052631578947,
      "grad_norm": 0.10319063812494278,
      "learning_rate": 6.591760299625469e-06,
      "loss": 2.8456,
      "step": 1440
    },
    {
      "epoch": 0.9768421052631578,
      "grad_norm": 0.08950542658567429,
      "learning_rate": 5.093632958801498e-06,
      "loss": 2.605,
      "step": 1450
    },
    {
      "epoch": 0.983578947368421,
      "grad_norm": 0.08379487693309784,
      "learning_rate": 3.5955056179775286e-06,
      "loss": 3.0334,
      "step": 1460
    },
    {
      "epoch": 0.9903157894736843,
      "grad_norm": 0.1561821848154068,
      "learning_rate": 2.097378277153558e-06,
      "loss": 3.0357,
      "step": 1470
    },
    {
      "epoch": 0.9970526315789474,
      "grad_norm": 0.07574011385440826,
      "learning_rate": 5.992509363295881e-07,
      "loss": 2.7458,
      "step": 1480
    }
  ],
  "logging_steps": 10,
  "max_steps": 1484,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.32780044727799e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}