{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997473684210526,
"eval_steps": 500,
"global_step": 1484,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006736842105263158,
"grad_norm": 3.6967623233795166,
"learning_rate": 1.3422818791946309e-06,
"loss": 2.4093,
"step": 1
},
{
"epoch": 0.006736842105263158,
"grad_norm": 2.5490193367004395,
"learning_rate": 1.3422818791946309e-05,
"loss": 2.4939,
"step": 10
},
{
"epoch": 0.013473684210526317,
"grad_norm": 0.18483224511146545,
"learning_rate": 2.6845637583892618e-05,
"loss": 1.1877,
"step": 20
},
{
"epoch": 0.020210526315789474,
"grad_norm": 0.2031693309545517,
"learning_rate": 4.026845637583892e-05,
"loss": 0.8909,
"step": 30
},
{
"epoch": 0.026947368421052633,
"grad_norm": 0.6876732707023621,
"learning_rate": 5.3691275167785237e-05,
"loss": 0.7581,
"step": 40
},
{
"epoch": 0.03368421052631579,
"grad_norm": 0.09247241914272308,
"learning_rate": 6.711409395973155e-05,
"loss": 0.7594,
"step": 50
},
{
"epoch": 0.04042105263157895,
"grad_norm": 0.1324968934059143,
"learning_rate": 8.053691275167784e-05,
"loss": 0.7405,
"step": 60
},
{
"epoch": 0.04715789473684211,
"grad_norm": 0.05673883110284805,
"learning_rate": 9.395973154362417e-05,
"loss": 0.7065,
"step": 70
},
{
"epoch": 0.053894736842105266,
"grad_norm": 0.04617280140519142,
"learning_rate": 0.00010738255033557047,
"loss": 0.6817,
"step": 80
},
{
"epoch": 0.06063157894736842,
"grad_norm": 0.04381496459245682,
"learning_rate": 0.0001208053691275168,
"loss": 0.6789,
"step": 90
},
{
"epoch": 0.06736842105263158,
"grad_norm": 0.07428538799285889,
"learning_rate": 0.0001342281879194631,
"loss": 0.6816,
"step": 100
},
{
"epoch": 0.07410526315789474,
"grad_norm": 0.04249708354473114,
"learning_rate": 0.00014765100671140942,
"loss": 0.6997,
"step": 110
},
{
"epoch": 0.0808421052631579,
"grad_norm": 0.05957937240600586,
"learning_rate": 0.0001610738255033557,
"loss": 0.6807,
"step": 120
},
{
"epoch": 0.08757894736842105,
"grad_norm": 0.03975442424416542,
"learning_rate": 0.000174496644295302,
"loss": 0.6733,
"step": 130
},
{
"epoch": 0.09431578947368421,
"grad_norm": 0.04079463332891464,
"learning_rate": 0.00018791946308724833,
"loss": 0.6556,
"step": 140
},
{
"epoch": 0.10105263157894737,
"grad_norm": 0.04245497286319733,
"learning_rate": 0.00019985018726591762,
"loss": 0.6575,
"step": 150
},
{
"epoch": 0.10778947368421053,
"grad_norm": 0.09695123136043549,
"learning_rate": 0.00019835205992509364,
"loss": 0.6916,
"step": 160
},
{
"epoch": 0.11452631578947368,
"grad_norm": 0.03505201265215874,
"learning_rate": 0.00019685393258426966,
"loss": 0.6622,
"step": 170
},
{
"epoch": 0.12126315789473684,
"grad_norm": 0.02820334956049919,
"learning_rate": 0.0001953558052434457,
"loss": 0.6497,
"step": 180
},
{
"epoch": 0.128,
"grad_norm": 0.04135354235768318,
"learning_rate": 0.00019385767790262173,
"loss": 0.6671,
"step": 190
},
{
"epoch": 0.13473684210526315,
"grad_norm": 0.031461067497730255,
"learning_rate": 0.00019235955056179775,
"loss": 0.657,
"step": 200
},
{
"epoch": 0.1414736842105263,
"grad_norm": 0.04208710789680481,
"learning_rate": 0.0001908614232209738,
"loss": 0.6766,
"step": 210
},
{
"epoch": 0.1482105263157895,
"grad_norm": 3.495147705078125,
"learning_rate": 0.00018936329588014982,
"loss": 3.9378,
"step": 220
},
{
"epoch": 0.15494736842105264,
"grad_norm": 0.18893112242221832,
"learning_rate": 0.00018786516853932586,
"loss": 7.1374,
"step": 230
},
{
"epoch": 0.1616842105263158,
"grad_norm": 0.0959916040301323,
"learning_rate": 0.00018636704119850189,
"loss": 5.8104,
"step": 240
},
{
"epoch": 0.16842105263157894,
"grad_norm": 0.08286964148283005,
"learning_rate": 0.0001848689138576779,
"loss": 4.7292,
"step": 250
},
{
"epoch": 0.1751578947368421,
"grad_norm": 0.04510454833507538,
"learning_rate": 0.00018337078651685393,
"loss": 4.9858,
"step": 260
},
{
"epoch": 0.18189473684210528,
"grad_norm": 0.2256896197795868,
"learning_rate": 0.00018187265917602997,
"loss": 4.7463,
"step": 270
},
{
"epoch": 0.18863157894736843,
"grad_norm": 0.06342379748821259,
"learning_rate": 0.00018037453183520602,
"loss": 4.517,
"step": 280
},
{
"epoch": 0.19536842105263158,
"grad_norm": 0.07497289776802063,
"learning_rate": 0.00017887640449438204,
"loss": 4.4052,
"step": 290
},
{
"epoch": 0.20210526315789473,
"grad_norm": 0.08952877670526505,
"learning_rate": 0.00017737827715355806,
"loss": 3.9614,
"step": 300
},
{
"epoch": 0.20884210526315788,
"grad_norm": 0.044066932052373886,
"learning_rate": 0.00017588014981273408,
"loss": 4.5861,
"step": 310
},
{
"epoch": 0.21557894736842106,
"grad_norm": 0.08251778781414032,
"learning_rate": 0.0001743820224719101,
"loss": 4.5163,
"step": 320
},
{
"epoch": 0.22231578947368422,
"grad_norm": 0.04723803699016571,
"learning_rate": 0.00017288389513108615,
"loss": 4.1904,
"step": 330
},
{
"epoch": 0.22905263157894737,
"grad_norm": 0.09082615375518799,
"learning_rate": 0.0001713857677902622,
"loss": 4.1982,
"step": 340
},
{
"epoch": 0.23578947368421052,
"grad_norm": 0.04866361245512962,
"learning_rate": 0.00016988764044943822,
"loss": 3.8506,
"step": 350
},
{
"epoch": 0.24252631578947367,
"grad_norm": 0.04515402019023895,
"learning_rate": 0.00016838951310861424,
"loss": 4.4254,
"step": 360
},
{
"epoch": 0.24926315789473685,
"grad_norm": 0.14205284416675568,
"learning_rate": 0.00016689138576779026,
"loss": 4.4111,
"step": 370
},
{
"epoch": 0.256,
"grad_norm": 0.16082021594047546,
"learning_rate": 0.0001653932584269663,
"loss": 4.1119,
"step": 380
},
{
"epoch": 0.26273684210526316,
"grad_norm": 0.061411116272211075,
"learning_rate": 0.00016389513108614235,
"loss": 4.059,
"step": 390
},
{
"epoch": 0.2694736842105263,
"grad_norm": 0.058379318565130234,
"learning_rate": 0.00016239700374531837,
"loss": 3.7307,
"step": 400
},
{
"epoch": 0.27621052631578946,
"grad_norm": 0.048859789967536926,
"learning_rate": 0.0001608988764044944,
"loss": 4.3039,
"step": 410
},
{
"epoch": 0.2829473684210526,
"grad_norm": 0.06003361940383911,
"learning_rate": 0.0001594007490636704,
"loss": 4.2032,
"step": 420
},
{
"epoch": 0.28968421052631577,
"grad_norm": 0.10120591521263123,
"learning_rate": 0.00015790262172284646,
"loss": 3.9567,
"step": 430
},
{
"epoch": 0.296421052631579,
"grad_norm": 0.21033401787281036,
"learning_rate": 0.00015640449438202248,
"loss": 3.9369,
"step": 440
},
{
"epoch": 0.3031578947368421,
"grad_norm": 0.06378967314958572,
"learning_rate": 0.00015490636704119852,
"loss": 3.6318,
"step": 450
},
{
"epoch": 0.3098947368421053,
"grad_norm": 0.042198359966278076,
"learning_rate": 0.00015340823970037455,
"loss": 4.1789,
"step": 460
},
{
"epoch": 0.31663157894736843,
"grad_norm": 0.053648848086595535,
"learning_rate": 0.00015191011235955057,
"loss": 4.1562,
"step": 470
},
{
"epoch": 0.3233684210526316,
"grad_norm": 0.0808805301785469,
"learning_rate": 0.00015041198501872659,
"loss": 3.8883,
"step": 480
},
{
"epoch": 0.33010526315789473,
"grad_norm": 0.13895294070243835,
"learning_rate": 0.00014891385767790263,
"loss": 3.9055,
"step": 490
},
{
"epoch": 0.3368421052631579,
"grad_norm": 0.11999215185642242,
"learning_rate": 0.00014741573033707865,
"loss": 3.6025,
"step": 500
},
{
"epoch": 0.34357894736842104,
"grad_norm": 0.0969998687505722,
"learning_rate": 0.0001459176029962547,
"loss": 4.2401,
"step": 510
},
{
"epoch": 0.3503157894736842,
"grad_norm": 0.2578948438167572,
"learning_rate": 0.00014441947565543072,
"loss": 4.1355,
"step": 520
},
{
"epoch": 0.35705263157894734,
"grad_norm": 0.067634217441082,
"learning_rate": 0.00014292134831460674,
"loss": 3.8735,
"step": 530
},
{
"epoch": 0.36378947368421055,
"grad_norm": 0.1961352676153183,
"learning_rate": 0.0001414232209737828,
"loss": 3.7641,
"step": 540
},
{
"epoch": 0.3705263157894737,
"grad_norm": 0.07940343767404556,
"learning_rate": 0.0001399250936329588,
"loss": 3.5177,
"step": 550
},
{
"epoch": 0.37726315789473686,
"grad_norm": 1.3029491901397705,
"learning_rate": 0.00013842696629213483,
"loss": 4.1854,
"step": 560
},
{
"epoch": 0.384,
"grad_norm": 0.10544762760400772,
"learning_rate": 0.00013692883895131088,
"loss": 4.3064,
"step": 570
},
{
"epoch": 0.39073684210526316,
"grad_norm": 0.150394469499588,
"learning_rate": 0.0001354307116104869,
"loss": 3.9517,
"step": 580
},
{
"epoch": 0.3974736842105263,
"grad_norm": 0.06921563297510147,
"learning_rate": 0.00013393258426966294,
"loss": 3.8917,
"step": 590
},
{
"epoch": 0.40421052631578946,
"grad_norm": 0.06402010470628738,
"learning_rate": 0.00013243445692883896,
"loss": 3.5635,
"step": 600
},
{
"epoch": 0.4109473684210526,
"grad_norm": 0.08918313682079315,
"learning_rate": 0.00013093632958801498,
"loss": 4.1197,
"step": 610
},
{
"epoch": 0.41768421052631577,
"grad_norm": 0.054397523403167725,
"learning_rate": 0.000129438202247191,
"loss": 4.0442,
"step": 620
},
{
"epoch": 0.4244210526315789,
"grad_norm": 0.068702831864357,
"learning_rate": 0.00012794007490636705,
"loss": 3.7506,
"step": 630
},
{
"epoch": 0.43115789473684213,
"grad_norm": 0.14575353264808655,
"learning_rate": 0.0001264419475655431,
"loss": 3.7359,
"step": 640
},
{
"epoch": 0.4378947368421053,
"grad_norm": 0.1481335461139679,
"learning_rate": 0.00012494382022471912,
"loss": 3.3705,
"step": 650
},
{
"epoch": 0.44463157894736843,
"grad_norm": 0.06438197940587997,
"learning_rate": 0.00012344569288389514,
"loss": 4.0248,
"step": 660
},
{
"epoch": 0.4513684210526316,
"grad_norm": 0.38855019211769104,
"learning_rate": 0.00012194756554307116,
"loss": 4.0265,
"step": 670
},
{
"epoch": 0.45810526315789474,
"grad_norm": 0.20793034136295319,
"learning_rate": 0.00012044943820224719,
"loss": 3.7305,
"step": 680
},
{
"epoch": 0.4648421052631579,
"grad_norm": 0.11011853814125061,
"learning_rate": 0.00011895131086142324,
"loss": 3.6933,
"step": 690
},
{
"epoch": 0.47157894736842104,
"grad_norm": 0.06795340031385422,
"learning_rate": 0.00011745318352059926,
"loss": 3.3734,
"step": 700
},
{
"epoch": 0.4783157894736842,
"grad_norm": 0.07788679003715515,
"learning_rate": 0.00011595505617977529,
"loss": 3.9053,
"step": 710
},
{
"epoch": 0.48505263157894735,
"grad_norm": 0.07339611649513245,
"learning_rate": 0.00011445692883895131,
"loss": 3.8685,
"step": 720
},
{
"epoch": 0.4917894736842105,
"grad_norm": 0.16048288345336914,
"learning_rate": 0.00011295880149812735,
"loss": 3.5673,
"step": 730
},
{
"epoch": 0.4985263157894737,
"grad_norm": 0.2596355974674225,
"learning_rate": 0.00011146067415730337,
"loss": 3.5684,
"step": 740
},
{
"epoch": 0.5052631578947369,
"grad_norm": 0.10115884989500046,
"learning_rate": 0.00010996254681647941,
"loss": 3.2226,
"step": 750
},
{
"epoch": 0.512,
"grad_norm": 0.13997367024421692,
"learning_rate": 0.00010846441947565545,
"loss": 3.8579,
"step": 760
},
{
"epoch": 0.5187368421052632,
"grad_norm": 0.08359155058860779,
"learning_rate": 0.00010696629213483147,
"loss": 3.8313,
"step": 770
},
{
"epoch": 0.5254736842105263,
"grad_norm": 0.2407791018486023,
"learning_rate": 0.0001054681647940075,
"loss": 3.5257,
"step": 780
},
{
"epoch": 0.5322105263157895,
"grad_norm": 0.34615418314933777,
"learning_rate": 0.00010397003745318352,
"loss": 3.5113,
"step": 790
},
{
"epoch": 0.5389473684210526,
"grad_norm": 0.06987264007329941,
"learning_rate": 0.00010247191011235954,
"loss": 3.1525,
"step": 800
},
{
"epoch": 0.5456842105263158,
"grad_norm": 0.07933894544839859,
"learning_rate": 0.00010097378277153558,
"loss": 3.718,
"step": 810
},
{
"epoch": 0.5524210526315789,
"grad_norm": 0.12424171715974808,
"learning_rate": 9.947565543071161e-05,
"loss": 3.6641,
"step": 820
},
{
"epoch": 0.5591578947368421,
"grad_norm": 0.2515564262866974,
"learning_rate": 9.797752808988764e-05,
"loss": 3.4268,
"step": 830
},
{
"epoch": 0.5658947368421052,
"grad_norm": 0.30851560831069946,
"learning_rate": 9.647940074906368e-05,
"loss": 3.3856,
"step": 840
},
{
"epoch": 0.5726315789473684,
"grad_norm": 0.05149822682142258,
"learning_rate": 9.49812734082397e-05,
"loss": 3.1259,
"step": 850
},
{
"epoch": 0.5793684210526315,
"grad_norm": 0.17960771918296814,
"learning_rate": 9.348314606741574e-05,
"loss": 3.6767,
"step": 860
},
{
"epoch": 0.5861052631578947,
"grad_norm": 0.17523854970932007,
"learning_rate": 9.198501872659176e-05,
"loss": 3.5995,
"step": 870
},
{
"epoch": 0.592842105263158,
"grad_norm": 0.3186163008213043,
"learning_rate": 9.04868913857678e-05,
"loss": 3.3966,
"step": 880
},
{
"epoch": 0.5995789473684211,
"grad_norm": 0.21263690292835236,
"learning_rate": 8.898876404494383e-05,
"loss": 3.3526,
"step": 890
},
{
"epoch": 0.6063157894736843,
"grad_norm": 0.10399254411458969,
"learning_rate": 8.749063670411985e-05,
"loss": 3.0519,
"step": 900
},
{
"epoch": 0.6130526315789474,
"grad_norm": 0.13143524527549744,
"learning_rate": 8.599250936329589e-05,
"loss": 3.629,
"step": 910
},
{
"epoch": 0.6197894736842106,
"grad_norm": 0.15374666452407837,
"learning_rate": 8.449438202247192e-05,
"loss": 3.6895,
"step": 920
},
{
"epoch": 0.6265263157894737,
"grad_norm": 0.23757484555244446,
"learning_rate": 8.299625468164794e-05,
"loss": 3.3622,
"step": 930
},
{
"epoch": 0.6332631578947369,
"grad_norm": 0.1661984622478485,
"learning_rate": 8.149812734082397e-05,
"loss": 3.3248,
"step": 940
},
{
"epoch": 0.64,
"grad_norm": 0.08603614568710327,
"learning_rate": 8e-05,
"loss": 3.0086,
"step": 950
},
{
"epoch": 0.6467368421052632,
"grad_norm": 0.07694745808839798,
"learning_rate": 7.850187265917604e-05,
"loss": 3.5162,
"step": 960
},
{
"epoch": 0.6534736842105263,
"grad_norm": 0.16395558416843414,
"learning_rate": 7.700374531835206e-05,
"loss": 3.4812,
"step": 970
},
{
"epoch": 0.6602105263157895,
"grad_norm": 0.13817398250102997,
"learning_rate": 7.55056179775281e-05,
"loss": 3.2516,
"step": 980
},
{
"epoch": 0.6669473684210526,
"grad_norm": 0.25807198882102966,
"learning_rate": 7.400749063670413e-05,
"loss": 3.2101,
"step": 990
},
{
"epoch": 0.6736842105263158,
"grad_norm": 0.06848172843456268,
"learning_rate": 7.250936329588015e-05,
"loss": 2.93,
"step": 1000
},
{
"epoch": 0.6804210526315789,
"grad_norm": 1.089575171470642,
"learning_rate": 7.101123595505618e-05,
"loss": 3.4925,
"step": 1010
},
{
"epoch": 0.6871578947368421,
"grad_norm": 0.20126965641975403,
"learning_rate": 6.951310861423222e-05,
"loss": 3.4603,
"step": 1020
},
{
"epoch": 0.6938947368421052,
"grad_norm": 0.21779027581214905,
"learning_rate": 6.801498127340824e-05,
"loss": 3.1723,
"step": 1030
},
{
"epoch": 0.7006315789473684,
"grad_norm": 0.18239159882068634,
"learning_rate": 6.651685393258428e-05,
"loss": 3.1903,
"step": 1040
},
{
"epoch": 0.7073684210526315,
"grad_norm": 0.06677573919296265,
"learning_rate": 6.50187265917603e-05,
"loss": 2.8445,
"step": 1050
},
{
"epoch": 0.7141052631578947,
"grad_norm": 0.42619746923446655,
"learning_rate": 6.352059925093634e-05,
"loss": 3.4319,
"step": 1060
},
{
"epoch": 0.7208421052631578,
"grad_norm": 0.12023507058620453,
"learning_rate": 6.202247191011237e-05,
"loss": 3.3826,
"step": 1070
},
{
"epoch": 0.7275789473684211,
"grad_norm": 0.15099403262138367,
"learning_rate": 6.052434456928839e-05,
"loss": 3.1425,
"step": 1080
},
{
"epoch": 0.7343157894736843,
"grad_norm": 0.3474717438220978,
"learning_rate": 5.902621722846442e-05,
"loss": 3.1279,
"step": 1090
},
{
"epoch": 0.7410526315789474,
"grad_norm": 0.12225649505853653,
"learning_rate": 5.752808988764046e-05,
"loss": 2.9033,
"step": 1100
},
{
"epoch": 0.7477894736842106,
"grad_norm": 0.19639068841934204,
"learning_rate": 5.6029962546816485e-05,
"loss": 3.3681,
"step": 1110
},
{
"epoch": 0.7545263157894737,
"grad_norm": 0.10571427643299103,
"learning_rate": 5.453183520599251e-05,
"loss": 3.335,
"step": 1120
},
{
"epoch": 0.7612631578947369,
"grad_norm": 0.5154901146888733,
"learning_rate": 5.3033707865168545e-05,
"loss": 3.0952,
"step": 1130
},
{
"epoch": 0.768,
"grad_norm": 0.6122628450393677,
"learning_rate": 5.153558052434457e-05,
"loss": 3.1269,
"step": 1140
},
{
"epoch": 0.7747368421052632,
"grad_norm": 0.19698569178581238,
"learning_rate": 5.00374531835206e-05,
"loss": 2.8233,
"step": 1150
},
{
"epoch": 0.7814736842105263,
"grad_norm": 0.13018374145030975,
"learning_rate": 4.853932584269663e-05,
"loss": 3.3094,
"step": 1160
},
{
"epoch": 0.7882105263157895,
"grad_norm": 0.09522128850221634,
"learning_rate": 4.704119850187266e-05,
"loss": 3.2765,
"step": 1170
},
{
"epoch": 0.7949473684210526,
"grad_norm": 0.10098107159137726,
"learning_rate": 4.554307116104869e-05,
"loss": 3.0807,
"step": 1180
},
{
"epoch": 0.8016842105263158,
"grad_norm": 0.18019132316112518,
"learning_rate": 4.404494382022472e-05,
"loss": 3.0332,
"step": 1190
},
{
"epoch": 0.8084210526315789,
"grad_norm": 0.16289708018302917,
"learning_rate": 4.2546816479400754e-05,
"loss": 2.7374,
"step": 1200
},
{
"epoch": 0.8151578947368421,
"grad_norm": 0.12666673958301544,
"learning_rate": 4.104868913857678e-05,
"loss": 3.2118,
"step": 1210
},
{
"epoch": 0.8218947368421052,
"grad_norm": 0.16891352832317352,
"learning_rate": 3.955056179775281e-05,
"loss": 3.1902,
"step": 1220
},
{
"epoch": 0.8286315789473684,
"grad_norm": 0.10958009213209152,
"learning_rate": 3.805243445692884e-05,
"loss": 2.9862,
"step": 1230
},
{
"epoch": 0.8353684210526315,
"grad_norm": 0.10642745345830917,
"learning_rate": 3.655430711610487e-05,
"loss": 3.0052,
"step": 1240
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.05656813085079193,
"learning_rate": 3.50561797752809e-05,
"loss": 2.723,
"step": 1250
},
{
"epoch": 0.8488421052631578,
"grad_norm": 0.08322717994451523,
"learning_rate": 3.355805243445693e-05,
"loss": 3.234,
"step": 1260
},
{
"epoch": 0.8555789473684211,
"grad_norm": 0.13246551156044006,
"learning_rate": 3.2059925093632956e-05,
"loss": 3.212,
"step": 1270
},
{
"epoch": 0.8623157894736843,
"grad_norm": 0.10225304961204529,
"learning_rate": 3.056179775280899e-05,
"loss": 2.9484,
"step": 1280
},
{
"epoch": 0.8690526315789474,
"grad_norm": 0.19440552592277527,
"learning_rate": 2.9063670411985024e-05,
"loss": 2.9266,
"step": 1290
},
{
"epoch": 0.8757894736842106,
"grad_norm": 0.08913037180900574,
"learning_rate": 2.7565543071161047e-05,
"loss": 2.6801,
"step": 1300
},
{
"epoch": 0.8825263157894737,
"grad_norm": 0.10815408080816269,
"learning_rate": 2.606741573033708e-05,
"loss": 3.1505,
"step": 1310
},
{
"epoch": 0.8892631578947369,
"grad_norm": 0.14371147751808167,
"learning_rate": 2.4569288389513108e-05,
"loss": 3.1293,
"step": 1320
},
{
"epoch": 0.896,
"grad_norm": 0.1680973470211029,
"learning_rate": 2.3071161048689138e-05,
"loss": 2.8961,
"step": 1330
},
{
"epoch": 0.9027368421052632,
"grad_norm": 0.19012019038200378,
"learning_rate": 2.157303370786517e-05,
"loss": 2.9096,
"step": 1340
},
{
"epoch": 0.9094736842105263,
"grad_norm": 0.060957688838243484,
"learning_rate": 2.00749063670412e-05,
"loss": 2.6879,
"step": 1350
},
{
"epoch": 0.9162105263157895,
"grad_norm": 0.15055014193058014,
"learning_rate": 1.857677902621723e-05,
"loss": 3.108,
"step": 1360
},
{
"epoch": 0.9229473684210526,
"grad_norm": 0.1378874033689499,
"learning_rate": 1.707865168539326e-05,
"loss": 3.0428,
"step": 1370
},
{
"epoch": 0.9296842105263158,
"grad_norm": 0.14901022613048553,
"learning_rate": 1.558052434456929e-05,
"loss": 2.8589,
"step": 1380
},
{
"epoch": 0.9364210526315789,
"grad_norm": 0.17515867948532104,
"learning_rate": 1.4082397003745318e-05,
"loss": 2.8563,
"step": 1390
},
{
"epoch": 0.9431578947368421,
"grad_norm": 0.11909812688827515,
"learning_rate": 1.258426966292135e-05,
"loss": 2.5759,
"step": 1400
},
{
"epoch": 0.9498947368421052,
"grad_norm": 0.16348549723625183,
"learning_rate": 1.1086142322097379e-05,
"loss": 3.089,
"step": 1410
},
{
"epoch": 0.9566315789473684,
"grad_norm": 0.08107765763998032,
"learning_rate": 9.588014981273409e-06,
"loss": 3.0145,
"step": 1420
},
{
"epoch": 0.9633684210526315,
"grad_norm": 0.13251617550849915,
"learning_rate": 8.089887640449438e-06,
"loss": 2.8256,
"step": 1430
},
{
"epoch": 0.9701052631578947,
"grad_norm": 0.10319063812494278,
"learning_rate": 6.591760299625469e-06,
"loss": 2.8456,
"step": 1440
},
{
"epoch": 0.9768421052631578,
"grad_norm": 0.08950542658567429,
"learning_rate": 5.093632958801498e-06,
"loss": 2.605,
"step": 1450
},
{
"epoch": 0.983578947368421,
"grad_norm": 0.08379487693309784,
"learning_rate": 3.5955056179775286e-06,
"loss": 3.0334,
"step": 1460
},
{
"epoch": 0.9903157894736843,
"grad_norm": 0.1561821848154068,
"learning_rate": 2.097378277153558e-06,
"loss": 3.0357,
"step": 1470
},
{
"epoch": 0.9970526315789474,
"grad_norm": 0.07574011385440826,
"learning_rate": 5.992509363295881e-07,
"loss": 2.7458,
"step": 1480
}
],
"logging_steps": 10,
"max_steps": 1484,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.32780044727799e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}