finance-iter2 / training_log.json
jbae1213's picture
Initial model upload
72ad2fb verified
[
{
"loss": 3.1532291412353515,
"grad_norm": 0.7252150177955627,
"learning_rate": 4.166666666666666e-08,
"entropy": 0.909868873655796,
"num_tokens": 59625.0,
"mean_token_accuracy": 0.5191385246813297,
"epoch": 0.004185413832792717,
"step": 10
},
{
"loss": 3.2842838287353517,
"grad_norm": 0.8542681336402893,
"learning_rate": 8.796296296296296e-08,
"entropy": 0.9062351793050766,
"num_tokens": 115055.0,
"mean_token_accuracy": 0.5090377777814865,
"epoch": 0.008370827665585435,
"step": 20
},
{
"loss": 3.120407485961914,
"grad_norm": 0.8136049509048462,
"learning_rate": 1.3425925925925926e-07,
"entropy": 0.8844040989875793,
"num_tokens": 173001.0,
"mean_token_accuracy": 0.5300635486841202,
"epoch": 0.012556241498378152,
"step": 30
},
{
"loss": 3.1775161743164064,
"grad_norm": 0.7916132807731628,
"learning_rate": 1.8055555555555554e-07,
"entropy": 0.9001422733068466,
"num_tokens": 230759.0,
"mean_token_accuracy": 0.5147604435682297,
"epoch": 0.01674165533117087,
"step": 40
},
{
"loss": 3.1207983016967775,
"grad_norm": 0.8365122675895691,
"learning_rate": 2.2685185185185184e-07,
"entropy": 0.9065567880868912,
"num_tokens": 290037.0,
"mean_token_accuracy": 0.516672582924366,
"epoch": 0.020927069163963585,
"step": 50
},
{
"loss": 3.110250473022461,
"grad_norm": 0.8340612649917603,
"learning_rate": 2.731481481481481e-07,
"entropy": 0.894629393517971,
"num_tokens": 348666.0,
"mean_token_accuracy": 0.5243608556687832,
"epoch": 0.025112482996756304,
"step": 60
},
{
"loss": 3.213370513916016,
"grad_norm": 0.7208571434020996,
"learning_rate": 3.194444444444444e-07,
"entropy": 0.9139490529894829,
"num_tokens": 404064.0,
"mean_token_accuracy": 0.510741152614355,
"epoch": 0.02929789682954902,
"step": 70
},
{
"loss": 3.1525503158569337,
"grad_norm": 0.8069764971733093,
"learning_rate": 3.657407407407407e-07,
"entropy": 0.8901543036103249,
"num_tokens": 459991.0,
"mean_token_accuracy": 0.5182561405003071,
"epoch": 0.03348331066234174,
"step": 80
},
{
"loss": 3.2056060791015626,
"grad_norm": 0.7775021195411682,
"learning_rate": 4.12037037037037e-07,
"entropy": 0.9084297090768814,
"num_tokens": 515607.0,
"mean_token_accuracy": 0.5117158360779286,
"epoch": 0.037668724495134455,
"step": 90
},
{
"loss": 3.1619991302490233,
"grad_norm": 0.7517663240432739,
"learning_rate": 4.5833333333333327e-07,
"entropy": 0.9325117334723473,
"num_tokens": 571331.0,
"mean_token_accuracy": 0.5146001622080802,
"epoch": 0.04185413832792717,
"step": 100
},
{
"loss": 3.083780288696289,
"grad_norm": 0.7766073346138,
"learning_rate": 5.046296296296297e-07,
"entropy": 0.9166652098298073,
"num_tokens": 628370.0,
"mean_token_accuracy": 0.523348405212164,
"epoch": 0.04603955216071989,
"step": 110
},
{
"loss": 2.9515827178955076,
"grad_norm": 0.7737402319908142,
"learning_rate": 5.509259259259259e-07,
"entropy": 0.8990944147109985,
"num_tokens": 691820.0,
"mean_token_accuracy": 0.5330506779253483,
"epoch": 0.05022496599351261,
"step": 120
},
{
"loss": 3.009184646606445,
"grad_norm": 0.7089793086051941,
"learning_rate": 5.972222222222222e-07,
"entropy": 0.9393417268991471,
"num_tokens": 750187.0,
"mean_token_accuracy": 0.52148522362113,
"epoch": 0.054410379826305325,
"step": 130
},
{
"loss": 2.931972885131836,
"grad_norm": 0.6590924263000488,
"learning_rate": 6.435185185185184e-07,
"entropy": 0.9367253214120865,
"num_tokens": 810727.0,
"mean_token_accuracy": 0.52733798250556,
"epoch": 0.05859579365909804,
"step": 140
},
{
"loss": 2.9645376205444336,
"grad_norm": 0.8957489728927612,
"learning_rate": 6.898148148148148e-07,
"entropy": 0.9482719719409942,
"num_tokens": 869700.0,
"mean_token_accuracy": 0.5252767078578472,
"epoch": 0.06278120749189076,
"step": 150
},
{
"loss": 2.9582921981811525,
"grad_norm": 0.7582520842552185,
"learning_rate": 7.361111111111111e-07,
"entropy": 0.9640893578529358,
"num_tokens": 929913.0,
"mean_token_accuracy": 0.5209386304020882,
"epoch": 0.06696662132468348,
"step": 160
},
{
"loss": 2.942486381530762,
"grad_norm": 0.8944061994552612,
"learning_rate": 7.824074074074074e-07,
"entropy": 1.0264274433255196,
"num_tokens": 987456.0,
"mean_token_accuracy": 0.5134634062647819,
"epoch": 0.0711520351574762,
"step": 170
},
{
"loss": 2.8791921615600584,
"grad_norm": 0.8744268417358398,
"learning_rate": 8.287037037037036e-07,
"entropy": 1.0515225112438202,
"num_tokens": 1045578.0,
"mean_token_accuracy": 0.5203944936394691,
"epoch": 0.07533744899026891,
"step": 180
},
{
"loss": 2.671405029296875,
"grad_norm": 0.6432790160179138,
"learning_rate": 8.75e-07,
"entropy": 1.0437349081039429,
"num_tokens": 1105831.0,
"mean_token_accuracy": 0.5427842013537884,
"epoch": 0.07952286282306163,
"step": 190
},
{
"loss": 2.658889389038086,
"grad_norm": 0.606299877166748,
"learning_rate": 9.212962962962962e-07,
"entropy": 1.1231032446026803,
"num_tokens": 1162666.0,
"mean_token_accuracy": 0.5306987896561622,
"epoch": 0.08370827665585434,
"step": 200
},
{
"eval_loss": 2.64770245552063,
"eval_runtime": 46.2508,
"eval_samples_per_second": 137.749,
"eval_steps_per_second": 5.751,
"eval_entropy": 1.1016171117474263,
"eval_num_tokens": 1162666.0,
"eval_mean_token_accuracy": 0.5367447220741358,
"epoch": 0.08370827665585434,
"step": 200
},
{
"loss": 2.6236072540283204,
"grad_norm": 0.497915118932724,
"learning_rate": 9.675925925925926e-07,
"entropy": 1.17362399995327,
"num_tokens": 1219379.0,
"mean_token_accuracy": 0.5290989577770233,
"epoch": 0.08789369048864706,
"step": 210
},
{
"loss": 2.469617462158203,
"grad_norm": 0.46626466512680054,
"learning_rate": 9.999995407884271e-07,
"entropy": 1.1599584549665451,
"num_tokens": 1280477.0,
"mean_token_accuracy": 0.54561567902565,
"epoch": 0.09207910432143979,
"step": 220
},
{
"loss": 2.5247661590576174,
"grad_norm": 0.4741360545158386,
"learning_rate": 9.999913770505991e-07,
"entropy": 1.2275266259908677,
"num_tokens": 1337757.0,
"mean_token_accuracy": 0.5331762477755546,
"epoch": 0.0962645181542325,
"step": 230
},
{
"loss": 2.395884704589844,
"grad_norm": 0.48130306601524353,
"learning_rate": 9.999730088029378e-07,
"entropy": 1.2018450900912285,
"num_tokens": 1397229.0,
"mean_token_accuracy": 0.5442760087549686,
"epoch": 0.10044993198702522,
"step": 240
},
{
"loss": 2.3129932403564455,
"grad_norm": 0.43950778245925903,
"learning_rate": 9.99944436420327e-07,
"entropy": 1.2097549244761467,
"num_tokens": 1457262.0,
"mean_token_accuracy": 0.5529780350625515,
"epoch": 0.10463534581981794,
"step": 250
},
{
"loss": 2.4272939682006838,
"grad_norm": 0.4790073335170746,
"learning_rate": 9.999056604859114e-07,
"entropy": 1.2772572070360184,
"num_tokens": 1511629.0,
"mean_token_accuracy": 0.5349524199962616,
"epoch": 0.10882075965261065,
"step": 260
},
{
"loss": 2.3150350570678713,
"grad_norm": 0.3617287278175354,
"learning_rate": 9.998566817910835e-07,
"entropy": 1.2845856487751006,
"num_tokens": 1567691.0,
"mean_token_accuracy": 0.5448164060711861,
"epoch": 0.11300617348540337,
"step": 270
},
{
"loss": 2.2049863815307615,
"grad_norm": 0.31779831647872925,
"learning_rate": 9.997975013354675e-07,
"entropy": 1.271442210674286,
"num_tokens": 1628624.0,
"mean_token_accuracy": 0.5550637729465961,
"epoch": 0.11719158731819608,
"step": 280
},
{
"loss": 2.1529918670654298,
"grad_norm": 0.32820868492126465,
"learning_rate": 9.997281203268995e-07,
"entropy": 1.279099041223526,
"num_tokens": 1689782.0,
"mean_token_accuracy": 0.5614619500935077,
"epoch": 0.1213770011509888,
"step": 290
},
{
"loss": 2.1452268600463866,
"grad_norm": 0.3186478018760681,
"learning_rate": 9.99648540181402e-07,
"entropy": 1.323472622036934,
"num_tokens": 1749745.0,
"mean_token_accuracy": 0.556930074095726,
"epoch": 0.1255624149837815,
"step": 300
},
{
"loss": 2.134934997558594,
"grad_norm": 0.30079811811447144,
"learning_rate": 9.995587625231564e-07,
"entropy": 1.3632762670516967,
"num_tokens": 1807987.0,
"mean_token_accuracy": 0.5517515152692795,
"epoch": 0.12974782881657423,
"step": 310
},
{
"loss": 1.9839101791381837,
"grad_norm": 0.361982136964798,
"learning_rate": 9.994587891844675e-07,
"entropy": 1.3298138067126275,
"num_tokens": 1867566.0,
"mean_token_accuracy": 0.57241967394948,
"epoch": 0.13393324264936696,
"step": 320
},
{
"loss": 2.0360176086425783,
"grad_norm": 0.3045739233493805,
"learning_rate": 9.99348622205729e-07,
"entropy": 1.3972268849611282,
"num_tokens": 1926290.0,
"mean_token_accuracy": 0.5540065504610538,
"epoch": 0.13811865648215968,
"step": 330
},
{
"loss": 2.0286712646484375,
"grad_norm": 0.31182143092155457,
"learning_rate": 9.99228263835379e-07,
"entropy": 1.4252518475055695,
"num_tokens": 1985494.0,
"mean_token_accuracy": 0.5559632822871208,
"epoch": 0.1423040703149524,
"step": 340
},
{
"loss": 2.064646911621094,
"grad_norm": 0.32973718643188477,
"learning_rate": 9.990977165298569e-07,
"entropy": 1.4729229807853699,
"num_tokens": 2044451.0,
"mean_token_accuracy": 0.542438729852438,
"epoch": 0.1464894841477451,
"step": 350
},
{
"loss": 1.9804191589355469,
"grad_norm": 0.289235383272171,
"learning_rate": 9.989569829535508e-07,
"entropy": 1.465097615122795,
"num_tokens": 2104274.0,
"mean_token_accuracy": 0.5540824517607689,
"epoch": 0.15067489798053782,
"step": 360
},
{
"loss": 1.9926959991455078,
"grad_norm": 0.29901251196861267,
"learning_rate": 9.988060659787448e-07,
"entropy": 1.5359219849109649,
"num_tokens": 2159709.0,
"mean_token_accuracy": 0.5466853015124797,
"epoch": 0.15486031181333054,
"step": 370
},
{
"loss": 1.9560510635375976,
"grad_norm": 0.2495652139186859,
"learning_rate": 9.986449686855592e-07,
"entropy": 1.5187518745660782,
"num_tokens": 2215716.0,
"mean_token_accuracy": 0.5528687633574009,
"epoch": 0.15904572564612326,
"step": 380
},
{
"loss": 1.9273345947265625,
"grad_norm": 0.24777130782604218,
"learning_rate": 9.984736943618888e-07,
"entropy": 1.521226641535759,
"num_tokens": 2271587.0,
"mean_token_accuracy": 0.5515810877084732,
"epoch": 0.163231139478916,
"step": 390
},
{
"loss": 1.8669830322265626,
"grad_norm": 0.22564777731895447,
"learning_rate": 9.982922465033348e-07,
"entropy": 1.5213739037513734,
"num_tokens": 2333246.0,
"mean_token_accuracy": 0.5613327234983444,
"epoch": 0.16741655331170868,
"step": 400
},
{
"eval_loss": 1.8593320846557617,
"eval_runtime": 43.8766,
"eval_samples_per_second": 145.203,
"eval_steps_per_second": 6.062,
"eval_entropy": 1.4658072012707704,
"eval_num_tokens": 2333246.0,
"eval_mean_token_accuracy": 0.5688402788307434,
"epoch": 0.16741655331170868,
"step": 400
},
{
"loss": 1.804941177368164,
"grad_norm": 0.23039200901985168,
"learning_rate": 9.981006288131342e-07,
"entropy": 1.4880431205034257,
"num_tokens": 2391775.0,
"mean_token_accuracy": 0.5735716104507447,
"epoch": 0.1716019671445014,
"step": 410
},
{
"loss": 1.8068187713623047,
"grad_norm": 0.2117597907781601,
"learning_rate": 9.978988452020832e-07,
"entropy": 1.5207171112298965,
"num_tokens": 2449834.0,
"mean_token_accuracy": 0.5626526214182377,
"epoch": 0.17578738097729413,
"step": 420
},
{
"loss": 1.8408426284790038,
"grad_norm": 0.18858124315738678,
"learning_rate": 9.97686899788459e-07,
"entropy": 1.5431180804967881,
"num_tokens": 2509956.0,
"mean_token_accuracy": 0.5639997899532319,
"epoch": 0.17997279481008685,
"step": 430
},
{
"loss": 1.8002569198608398,
"grad_norm": 0.20246392488479614,
"learning_rate": 9.974647968979339e-07,
"entropy": 1.5480373591184615,
"num_tokens": 2566510.0,
"mean_token_accuracy": 0.5718876734375954,
"epoch": 0.18415820864287957,
"step": 440
},
{
"loss": 1.7867300033569335,
"grad_norm": 0.1937456578016281,
"learning_rate": 9.972325410634885e-07,
"entropy": 1.531213068962097,
"num_tokens": 2624567.0,
"mean_token_accuracy": 0.5752546131610871,
"epoch": 0.1883436224756723,
"step": 450
},
{
"loss": 1.755255889892578,
"grad_norm": 0.1926703304052353,
"learning_rate": 9.969901370253187e-07,
"entropy": 1.5252358853816985,
"num_tokens": 2682287.0,
"mean_token_accuracy": 0.5824674129486084,
"epoch": 0.192529036308465,
"step": 460
},
{
"loss": 1.772911834716797,
"grad_norm": 0.21854747831821442,
"learning_rate": 9.96737589730738e-07,
"entropy": 1.575288510322571,
"num_tokens": 2735734.0,
"mean_token_accuracy": 0.5827064469456673,
"epoch": 0.1967144501412577,
"step": 470
},
{
"loss": 1.6943496704101562,
"grad_norm": 0.16926386952400208,
"learning_rate": 9.964749043340788e-07,
"entropy": 1.4982535749673844,
"num_tokens": 2794073.0,
"mean_token_accuracy": 0.5961055085062981,
"epoch": 0.20089986397405044,
"step": 480
},
{
"loss": 1.759925651550293,
"grad_norm": 0.18107837438583374,
"learning_rate": 9.962020861965843e-07,
"entropy": 1.5479711294174194,
"num_tokens": 2852819.0,
"mean_token_accuracy": 0.5893502697348595,
"epoch": 0.20508527780684316,
"step": 490
},
{
"loss": 1.6459325790405273,
"grad_norm": 0.19736573100090027,
"learning_rate": 9.959191408863014e-07,
"entropy": 1.4799151957035064,
"num_tokens": 2916292.0,
"mean_token_accuracy": 0.6062492698431015,
"epoch": 0.20927069163963588,
"step": 500
},
{
"loss": 1.6710922241210937,
"grad_norm": 0.20262014865875244,
"learning_rate": 9.956260741779665e-07,
"entropy": 1.5181541979312896,
"num_tokens": 2975466.0,
"mean_token_accuracy": 0.6021158128976822,
"epoch": 0.21345610547242858,
"step": 510
},
{
"loss": 1.6909679412841796,
"grad_norm": 0.22447596490383148,
"learning_rate": 9.953228920528865e-07,
"entropy": 1.523398867249489,
"num_tokens": 3033907.0,
"mean_token_accuracy": 0.5984062060713768,
"epoch": 0.2176415193052213,
"step": 520
},
{
"loss": 1.6476320266723632,
"grad_norm": 0.20474065840244293,
"learning_rate": 9.950096006988182e-07,
"entropy": 1.5151092141866684,
"num_tokens": 3090006.0,
"mean_token_accuracy": 0.6024264812469482,
"epoch": 0.22182693313801402,
"step": 530
},
{
"loss": 1.656897735595703,
"grad_norm": 0.2609263062477112,
"learning_rate": 9.946862065098413e-07,
"entropy": 1.5152370780706406,
"num_tokens": 3145289.0,
"mean_token_accuracy": 0.6096841841936111,
"epoch": 0.22601234697080674,
"step": 540
},
{
"loss": 1.540487289428711,
"grad_norm": 0.21252916753292084,
"learning_rate": 9.943527160862281e-07,
"entropy": 1.4425812840461731,
"num_tokens": 3202970.0,
"mean_token_accuracy": 0.6310827702283859,
"epoch": 0.23019776080359947,
"step": 550
},
{
"loss": 1.5528440475463867,
"grad_norm": 0.16846199333667755,
"learning_rate": 9.940091362343086e-07,
"entropy": 1.4279247790575027,
"num_tokens": 3258895.0,
"mean_token_accuracy": 0.6345707163214683,
"epoch": 0.23438317463639216,
"step": 560
},
{
"loss": 1.5468204498291016,
"grad_norm": 0.1584591120481491,
"learning_rate": 9.936554739663315e-07,
"entropy": 1.4223629891872407,
"num_tokens": 3315602.0,
"mean_token_accuracy": 0.6405046731233597,
"epoch": 0.23856858846918488,
"step": 570
},
{
"loss": 1.5502227783203124,
"grad_norm": 0.1557629555463791,
"learning_rate": 9.932917365003216e-07,
"entropy": 1.3991417795419694,
"num_tokens": 3376637.0,
"mean_token_accuracy": 0.6378504887223244,
"epoch": 0.2427540023019776,
"step": 580
},
{
"loss": 1.47230224609375,
"grad_norm": 0.16370368003845215,
"learning_rate": 9.929179312599317e-07,
"entropy": 1.3776833653450011,
"num_tokens": 3435084.0,
"mean_token_accuracy": 0.655489268898964,
"epoch": 0.24693941613477033,
"step": 590
},
{
"loss": 1.5052314758300782,
"grad_norm": 0.15081895887851715,
"learning_rate": 9.925340658742926e-07,
"entropy": 1.4087885320186615,
"num_tokens": 3491147.0,
"mean_token_accuracy": 0.6534322142601013,
"epoch": 0.251124829967563,
"step": 600
},
{
"eval_loss": 1.5126348733901978,
"eval_runtime": 43.6936,
"eval_samples_per_second": 145.811,
"eval_steps_per_second": 6.088,
"eval_entropy": 1.3902130023877424,
"eval_num_tokens": 3491147.0,
"eval_mean_token_accuracy": 0.654797031018967,
"epoch": 0.251124829967563,
"step": 600
},
{
"loss": 1.5386703491210938,
"grad_norm": 0.13764511048793793,
"learning_rate": 9.921401481778548e-07,
"entropy": 1.4563434034585954,
"num_tokens": 3547282.0,
"mean_token_accuracy": 0.6465040192008018,
"epoch": 0.2553102438003558,
"step": 610
},
{
"loss": 1.5452125549316407,
"grad_norm": 0.12197960168123245,
"learning_rate": 9.917361862102316e-07,
"entropy": 1.4381413817405702,
"num_tokens": 3604669.0,
"mean_token_accuracy": 0.6443240866065025,
"epoch": 0.25949565763314847,
"step": 620
},
{
"loss": 1.5622711181640625,
"grad_norm": 0.11413100361824036,
"learning_rate": 9.913221882160325e-07,
"entropy": 1.4670737832784653,
"num_tokens": 3660716.0,
"mean_token_accuracy": 0.6438136756420135,
"epoch": 0.2636810714659412,
"step": 630
},
{
"loss": 1.5195579528808594,
"grad_norm": 0.11969699710607529,
"learning_rate": 9.908981626446967e-07,
"entropy": 1.44781274497509,
"num_tokens": 3716067.0,
"mean_token_accuracy": 0.6527407199144364,
"epoch": 0.2678664852987339,
"step": 640
},
{
"loss": 1.562466812133789,
"grad_norm": 0.10654503107070923,
"learning_rate": 9.904641181503193e-07,
"entropy": 1.4735447496175766,
"num_tokens": 3772158.0,
"mean_token_accuracy": 0.6453444182872772,
"epoch": 0.2720518991315266,
"step": 650
},
{
"loss": 1.4119970321655273,
"grad_norm": 0.11862610280513763,
"learning_rate": 9.900200635914762e-07,
"entropy": 1.3951878100633621,
"num_tokens": 3833284.0,
"mean_token_accuracy": 0.6668122097849846,
"epoch": 0.27623731296431936,
"step": 660
},
{
"loss": 1.4448695182800293,
"grad_norm": 0.11796533316373825,
"learning_rate": 9.895660080310418e-07,
"entropy": 1.4141918390989303,
"num_tokens": 3890126.0,
"mean_token_accuracy": 0.6602638632059097,
"epoch": 0.28042272679711205,
"step": 670
},
{
"loss": 1.4727934837341308,
"grad_norm": 0.10836026817560196,
"learning_rate": 9.891019607360042e-07,
"entropy": 1.4221189886331558,
"num_tokens": 3946816.0,
"mean_token_accuracy": 0.6601494640111923,
"epoch": 0.2846081406299048,
"step": 680
},
{
"loss": 1.4374773979187012,
"grad_norm": 0.11260558664798737,
"learning_rate": 9.88627931177278e-07,
"entropy": 1.388827046751976,
"num_tokens": 4006104.0,
"mean_token_accuracy": 0.6628721192479133,
"epoch": 0.2887935544626975,
"step": 690
},
{
"loss": 1.4522128105163574,
"grad_norm": 0.09237143397331238,
"learning_rate": 9.88143929029508e-07,
"entropy": 1.4040265291929246,
"num_tokens": 4064859.0,
"mean_token_accuracy": 0.6620885074138642,
"epoch": 0.2929789682954902,
"step": 700
},
{
"loss": 1.4630813598632812,
"grad_norm": 0.11288689821958542,
"learning_rate": 9.876499641708741e-07,
"entropy": 1.4170700162649155,
"num_tokens": 4122576.0,
"mean_token_accuracy": 0.6592713505029678,
"epoch": 0.29716438212828294,
"step": 710
},
{
"loss": 1.3941055297851563,
"grad_norm": 0.09874723106622696,
"learning_rate": 9.871460466828888e-07,
"entropy": 1.3975009769201279,
"num_tokens": 4180815.0,
"mean_token_accuracy": 0.6696879684925079,
"epoch": 0.30134979596107564,
"step": 720
},
{
"loss": 1.4689726829528809,
"grad_norm": 0.10610879957675934,
"learning_rate": 9.866321868501912e-07,
"entropy": 1.4678748458623887,
"num_tokens": 4239666.0,
"mean_token_accuracy": 0.6571864351630211,
"epoch": 0.3055352097938684,
"step": 730
},
{
"loss": 1.479258155822754,
"grad_norm": 0.12200459837913513,
"learning_rate": 9.861083951603377e-07,
"entropy": 1.430861946940422,
"num_tokens": 4297700.0,
"mean_token_accuracy": 0.6564601019024849,
"epoch": 0.3097206236266611,
"step": 740
},
{
"loss": 1.4389605522155762,
"grad_norm": 0.12583598494529724,
"learning_rate": 9.855746823035876e-07,
"entropy": 1.432998749613762,
"num_tokens": 4355152.0,
"mean_token_accuracy": 0.6649609237909317,
"epoch": 0.3139060374594538,
"step": 750
},
{
"loss": 1.451263999938965,
"grad_norm": 0.11383051425218582,
"learning_rate": 9.850310591726846e-07,
"entropy": 1.4290786892175675,
"num_tokens": 4414094.0,
"mean_token_accuracy": 0.6572059765458107,
"epoch": 0.31809145129224653,
"step": 760
},
{
"loss": 1.5145987510681151,
"grad_norm": 0.10066704452037811,
"learning_rate": 9.844775368626358e-07,
"entropy": 1.4577032029628754,
"num_tokens": 4472646.0,
"mean_token_accuracy": 0.6500703617930412,
"epoch": 0.3222768651250392,
"step": 770
},
{
"loss": 1.4831979751586915,
"grad_norm": 0.08656121045351028,
"learning_rate": 9.839141266704833e-07,
"entropy": 1.4568557769060135,
"num_tokens": 4529048.0,
"mean_token_accuracy": 0.6541818514466285,
"epoch": 0.326462278957832,
"step": 780
},
{
"loss": 1.422746181488037,
"grad_norm": 0.10030363500118256,
"learning_rate": 9.833408400950753e-07,
"entropy": 1.4236784011125565,
"num_tokens": 4587248.0,
"mean_token_accuracy": 0.6693221822381019,
"epoch": 0.33064769279062467,
"step": 790
},
{
"loss": 1.511890697479248,
"grad_norm": 0.1208115741610527,
"learning_rate": 9.827576888368306e-07,
"entropy": 1.4541470259428024,
"num_tokens": 4643727.0,
"mean_token_accuracy": 0.6536489054560661,
"epoch": 0.33483310662341736,
"step": 800
},
{
"eval_loss": 1.4582873582839966,
"eval_runtime": 43.7122,
"eval_samples_per_second": 145.749,
"eval_steps_per_second": 6.085,
"eval_entropy": 1.4234498008749539,
"eval_num_tokens": 4643727.0,
"eval_mean_token_accuracy": 0.6611158966569972,
"epoch": 0.33483310662341736,
"step": 800
},
{
"loss": 1.519627285003662,
"grad_norm": 0.10558852553367615,
"learning_rate": 9.821646847974998e-07,
"entropy": 1.4889809876680373,
"num_tokens": 4699602.0,
"mean_token_accuracy": 0.6542887255549431,
"epoch": 0.3390185204562101,
"step": 810
},
{
"loss": 1.4185623168945312,
"grad_norm": 0.11619652807712555,
"learning_rate": 9.815618400799228e-07,
"entropy": 1.4101483166217803,
"num_tokens": 4759712.0,
"mean_token_accuracy": 0.6648697286844254,
"epoch": 0.3432039342890028,
"step": 820
},
{
"loss": 1.4773643493652344,
"grad_norm": 0.10418440401554108,
"learning_rate": 9.809491669877815e-07,
"entropy": 1.4431717425584794,
"num_tokens": 4817721.0,
"mean_token_accuracy": 0.6592238992452621,
"epoch": 0.34738934812179556,
"step": 830
},
{
"loss": 1.4270614624023437,
"grad_norm": 0.09047893434762955,
"learning_rate": 9.803266780253487e-07,
"entropy": 1.4172182738780976,
"num_tokens": 4876476.0,
"mean_token_accuracy": 0.6634449914097786,
"epoch": 0.35157476195458826,
"step": 840
},
{
"loss": 1.4365344047546387,
"grad_norm": 0.11413703858852386,
"learning_rate": 9.796943858972328e-07,
"entropy": 1.424839785695076,
"num_tokens": 4935356.0,
"mean_token_accuracy": 0.664018252491951,
"epoch": 0.35576017578738095,
"step": 850
},
{
"loss": 1.4368658065795898,
"grad_norm": 0.10197298973798752,
"learning_rate": 9.790523035081194e-07,
"entropy": 1.4327729046344757,
"num_tokens": 4996023.0,
"mean_token_accuracy": 0.667100901901722,
"epoch": 0.3599455896201737,
"step": 860
},
{
"loss": 1.4285932540893556,
"grad_norm": 0.09999420493841171,
"learning_rate": 9.78400443962506e-07,
"entropy": 1.4451387345790863,
"num_tokens": 5054143.0,
"mean_token_accuracy": 0.6665249273180962,
"epoch": 0.3641310034529664,
"step": 870
},
{
"loss": 1.4537543296813964,
"grad_norm": 0.12824219465255737,
"learning_rate": 9.777388205644365e-07,
"entropy": 1.4365610003471374,
"num_tokens": 5109151.0,
"mean_token_accuracy": 0.6605026423931122,
"epoch": 0.36831641728575915,
"step": 880
},
{
"loss": 1.4052467346191406,
"grad_norm": 0.10136168450117111,
"learning_rate": 9.770674468172288e-07,
"entropy": 1.4461679026484489,
"num_tokens": 5169545.0,
"mean_token_accuracy": 0.6698134854435921,
"epoch": 0.37250183111855184,
"step": 890
},
{
"loss": 1.5341646194458007,
"grad_norm": 0.125015527009964,
"learning_rate": 9.763863364231995e-07,
"entropy": 1.4948209792375564,
"num_tokens": 5226362.0,
"mean_token_accuracy": 0.6506395027041435,
"epoch": 0.3766872449513446,
"step": 900
},
{
"loss": 1.451594066619873,
"grad_norm": 0.12184764444828033,
"learning_rate": 9.75695503283383e-07,
"entropy": 1.454634991288185,
"num_tokens": 5287471.0,
"mean_token_accuracy": 0.6617233619093895,
"epoch": 0.3808726587841373,
"step": 910
},
{
"loss": 1.3663444519042969,
"grad_norm": 0.09586543589830399,
"learning_rate": 9.749949614972505e-07,
"entropy": 1.4007300227880477,
"num_tokens": 5346427.0,
"mean_token_accuracy": 0.6761364534497261,
"epoch": 0.38505807261693,
"step": 920
},
{
"loss": 1.4433299064636231,
"grad_norm": 0.09879063069820404,
"learning_rate": 9.74284725362419e-07,
"entropy": 1.44069661796093,
"num_tokens": 5406471.0,
"mean_token_accuracy": 0.6573658585548401,
"epoch": 0.38924348644972273,
"step": 930
},
{
"loss": 1.3213248252868652,
"grad_norm": 0.09394767135381699,
"learning_rate": 9.735648093743621e-07,
"entropy": 1.3663470640778541,
"num_tokens": 5468090.0,
"mean_token_accuracy": 0.6877701610326767,
"epoch": 0.3934289002825154,
"step": 940
},
{
"loss": 1.3884021759033203,
"grad_norm": 0.10035385936498642,
"learning_rate": 9.728352282261124e-07,
"entropy": 1.4055696964263915,
"num_tokens": 5527409.0,
"mean_token_accuracy": 0.6716061800718307,
"epoch": 0.3976143141153082,
"step": 950
},
{
"loss": 1.4895167350769043,
"grad_norm": 0.1361590176820755,
"learning_rate": 9.72095996807963e-07,
"entropy": 1.4704587817192079,
"num_tokens": 5586447.0,
"mean_token_accuracy": 0.6556992784142495,
"epoch": 0.40179972794810087,
"step": 960
},
{
"loss": 1.422182846069336,
"grad_norm": 0.12393207103013992,
"learning_rate": 9.713471302071624e-07,
"entropy": 1.4276411414146424,
"num_tokens": 5644917.0,
"mean_token_accuracy": 0.663788178563118,
"epoch": 0.40598514178089357,
"step": 970
},
{
"loss": 1.4414152145385741,
"grad_norm": 0.12177922576665878,
"learning_rate": 9.705886437076078e-07,
"entropy": 1.4314857304096222,
"num_tokens": 5706907.0,
"mean_token_accuracy": 0.6635714635252953,
"epoch": 0.4101705556136863,
"step": 980
},
{
"loss": 1.4422160148620606,
"grad_norm": 0.09565871953964233,
"learning_rate": 9.698205527895317e-07,
"entropy": 1.4681658923625946,
"num_tokens": 5767067.0,
"mean_token_accuracy": 0.6618433445692062,
"epoch": 0.414355969446479,
"step": 990
},
{
"loss": 1.3973498344421387,
"grad_norm": 0.11843396723270416,
"learning_rate": 9.69042873129187e-07,
"entropy": 1.4245391979813575,
"num_tokens": 5826368.0,
"mean_token_accuracy": 0.6698687911033631,
"epoch": 0.41854138327927176,
"step": 1000
},
{
"eval_loss": 1.430882215499878,
"eval_runtime": 42.5472,
"eval_samples_per_second": 149.739,
"eval_steps_per_second": 6.252,
"eval_entropy": 1.4260875381025158,
"eval_num_tokens": 5826368.0,
"eval_mean_token_accuracy": 0.6650281033121553,
"epoch": 0.41854138327927176,
"step": 1000
},
{
"loss": 1.454050064086914,
"grad_norm": 0.11938533186912537,
"learning_rate": 9.682556205985273e-07,
"entropy": 1.447835522890091,
"num_tokens": 5882058.0,
"mean_token_accuracy": 0.6635008811950683,
"epoch": 0.42272679711206446,
"step": 1010
},
{
"loss": 1.3930376052856446,
"grad_norm": 0.1063380166888237,
"learning_rate": 9.674588112648819e-07,
"entropy": 1.4178766876459121,
"num_tokens": 5938913.0,
"mean_token_accuracy": 0.6699633210897445,
"epoch": 0.42691221094485715,
"step": 1020
},
{
"loss": 1.4532501220703125,
"grad_norm": 0.1084047332406044,
"learning_rate": 9.666524613906283e-07,
"entropy": 1.4572493433952332,
"num_tokens": 5993465.0,
"mean_token_accuracy": 0.6680980160832405,
"epoch": 0.4310976247776499,
"step": 1030
},
{
"loss": 1.4012516021728516,
"grad_norm": 0.10825818032026291,
"learning_rate": 9.658365874328613e-07,
"entropy": 1.434103360772133,
"num_tokens": 6049913.0,
"mean_token_accuracy": 0.670105955004692,
"epoch": 0.4352830386104426,
"step": 1040
},
{
"loss": 1.4673041343688964,
"grad_norm": 0.09340775012969971,
"learning_rate": 9.650112060430556e-07,
"entropy": 1.4505166023969651,
"num_tokens": 6106055.0,
"mean_token_accuracy": 0.6630285322666168,
"epoch": 0.43946845244323535,
"step": 1050
},
{
"loss": 1.425284481048584,
"grad_norm": 0.12269195914268494,
"learning_rate": 9.641763340667264e-07,
"entropy": 1.438645276427269,
"num_tokens": 6165982.0,
"mean_token_accuracy": 0.6631047874689102,
"epoch": 0.44365386627602804,
"step": 1060
},
{
"loss": 1.4093000411987304,
"grad_norm": 0.12008947134017944,
"learning_rate": 9.633319885430863e-07,
"entropy": 1.4247242331504821,
"num_tokens": 6221254.0,
"mean_token_accuracy": 0.668901015818119,
"epoch": 0.44783928010882074,
"step": 1070
},
{
"loss": 1.4575057983398438,
"grad_norm": 0.12600930035114288,
"learning_rate": 9.62478186704697e-07,
"entropy": 1.4588077813386917,
"num_tokens": 6281193.0,
"mean_token_accuracy": 0.659762179851532,
"epoch": 0.4520246939416135,
"step": 1080
},
{
"loss": 1.3971601486206056,
"grad_norm": 0.09669267386198044,
"learning_rate": 9.616149459771174e-07,
"entropy": 1.4378665208816528,
"num_tokens": 6338625.0,
"mean_token_accuracy": 0.6723957479000091,
"epoch": 0.4562101077744062,
"step": 1090
},
{
"loss": 1.4475428581237793,
"grad_norm": 0.10479287803173065,
"learning_rate": 9.607422839785487e-07,
"entropy": 1.4612567931413651,
"num_tokens": 6398522.0,
"mean_token_accuracy": 0.6564841374754906,
"epoch": 0.46039552160719893,
"step": 1100
},
{
"loss": 1.4175043106079102,
"grad_norm": 0.10358787328004837,
"learning_rate": 9.598602185194733e-07,
"entropy": 1.4467926740646362,
"num_tokens": 6458089.0,
"mean_token_accuracy": 0.669213418662548,
"epoch": 0.4645809354399916,
"step": 1110
},
{
"loss": 1.4430898666381835,
"grad_norm": 0.12753859162330627,
"learning_rate": 9.589687676022933e-07,
"entropy": 1.4614018350839615,
"num_tokens": 6517387.0,
"mean_token_accuracy": 0.663593128323555,
"epoch": 0.4687663492727843,
"step": 1120
},
{
"loss": 1.4554133415222168,
"grad_norm": 0.11801481246948242,
"learning_rate": 9.580679494209621e-07,
"entropy": 1.463664811849594,
"num_tokens": 6574281.0,
"mean_token_accuracy": 0.6585227012634277,
"epoch": 0.47295176310557707,
"step": 1130
},
{
"loss": 1.4516281127929687,
"grad_norm": 0.1230725646018982,
"learning_rate": 9.57157782360612e-07,
"entropy": 1.4588176727294921,
"num_tokens": 6632526.0,
"mean_token_accuracy": 0.6620682567358017,
"epoch": 0.47713717693836977,
"step": 1140
},
{
"loss": 1.3834566116333007,
"grad_norm": 0.10615360736846924,
"learning_rate": 9.562382849971814e-07,
"entropy": 1.4231864005327224,
"num_tokens": 6686576.0,
"mean_token_accuracy": 0.6769091472029686,
"epoch": 0.4813225907711625,
"step": 1150
},
{
"loss": 1.3678070068359376,
"grad_norm": 0.10580965131521225,
"learning_rate": 9.553094760970338e-07,
"entropy": 1.4144569963216782,
"num_tokens": 6743418.0,
"mean_token_accuracy": 0.6736478328704834,
"epoch": 0.4855080046039552,
"step": 1160
},
{
"loss": 1.4649283409118652,
"grad_norm": 0.11393830180168152,
"learning_rate": 9.543713746165746e-07,
"entropy": 1.461512914299965,
"num_tokens": 6801169.0,
"mean_token_accuracy": 0.6581070765852928,
"epoch": 0.4896934184367479,
"step": 1170
},
{
"loss": 1.3680376052856444,
"grad_norm": 0.19611844420433044,
"learning_rate": 9.534239997018663e-07,
"entropy": 1.4197842329740524,
"num_tokens": 6858807.0,
"mean_token_accuracy": 0.6744951158761978,
"epoch": 0.49387883226954066,
"step": 1180
},
{
"loss": 1.4589731216430664,
"grad_norm": 0.12470986694097519,
"learning_rate": 9.52467370688235e-07,
"entropy": 1.4711190968751908,
"num_tokens": 6915842.0,
"mean_token_accuracy": 0.6595605373382568,
"epoch": 0.49806424610233335,
"step": 1190
},
{
"loss": 1.3511184692382812,
"grad_norm": 0.1231166198849678,
"learning_rate": 9.515015070998781e-07,
"entropy": 1.3929312020540237,
"num_tokens": 6973364.0,
"mean_token_accuracy": 0.6785273075103759,
"epoch": 0.502249659935126,
"step": 1200
},
{
"eval_loss": 1.4083536863327026,
"eval_runtime": 43.0534,
"eval_samples_per_second": 147.979,
"eval_steps_per_second": 6.178,
"eval_entropy": 1.401145983907513,
"eval_num_tokens": 6973364.0,
"eval_mean_token_accuracy": 0.6672393374873283,
"epoch": 0.502249659935126,
"step": 1200
},
{
"loss": 1.4147989273071289,
"grad_norm": 0.10981585085391998,
"learning_rate": 9.505264286494644e-07,
"entropy": 1.4393782436847686,
"num_tokens": 7029183.0,
"mean_token_accuracy": 0.6653257578611373,
"epoch": 0.5064350737679189,
"step": 1210
},
{
"loss": 1.4123595237731934,
"grad_norm": 0.12332361936569214,
"learning_rate": 9.495421552377325e-07,
"entropy": 1.4351352035999299,
"num_tokens": 7089107.0,
"mean_token_accuracy": 0.6679085582494736,
"epoch": 0.5106204876007115,
"step": 1220
},
{
"loss": 1.35689115524292,
"grad_norm": 0.10939253121614456,
"learning_rate": 9.485487069530841e-07,
"entropy": 1.384123608469963,
"num_tokens": 7145731.0,
"mean_token_accuracy": 0.6764253750443459,
"epoch": 0.5148059014335042,
"step": 1230
},
{
"loss": 1.4721358299255372,
"grad_norm": 0.1354241967201233,
"learning_rate": 9.475461040711745e-07,
"entropy": 1.4555100411176682,
"num_tokens": 7201497.0,
"mean_token_accuracy": 0.6551220327615738,
"epoch": 0.5189913152662969,
"step": 1240
},
{
"loss": 1.406270408630371,
"grad_norm": 0.11071319878101349,
"learning_rate": 9.465343670544987e-07,
"entropy": 1.446416699886322,
"num_tokens": 7255249.0,
"mean_token_accuracy": 0.6669346168637276,
"epoch": 0.5231767290990896,
"step": 1250
},
{
"loss": 1.409125804901123,
"grad_norm": 0.1242227554321289,
"learning_rate": 9.455135165519734e-07,
"entropy": 1.4336748003959656,
"num_tokens": 7312069.0,
"mean_token_accuracy": 0.6685505136847496,
"epoch": 0.5273621429318824,
"step": 1260
},
{
"loss": 1.353925609588623,
"grad_norm": 0.12051878869533539,
"learning_rate": 9.444835733985157e-07,
"entropy": 1.3861510157585144,
"num_tokens": 7374935.0,
"mean_token_accuracy": 0.6735975816845894,
"epoch": 0.5315475567646751,
"step": 1270
},
{
"loss": 1.3926225662231446,
"grad_norm": 0.1231522411108017,
"learning_rate": 9.434445586146182e-07,
"entropy": 1.431991320848465,
"num_tokens": 7429456.0,
"mean_token_accuracy": 0.6716481134295463,
"epoch": 0.5357329705974678,
"step": 1280
},
{
"loss": 1.3677814483642579,
"grad_norm": 0.10811372101306915,
"learning_rate": 9.423964934059202e-07,
"entropy": 1.4019683420658111,
"num_tokens": 7487005.0,
"mean_token_accuracy": 0.6747205436229706,
"epoch": 0.5399183844302605,
"step": 1290
},
{
"loss": 1.3889549255371094,
"grad_norm": 0.12505528330802917,
"learning_rate": 9.413393991627736e-07,
"entropy": 1.3941765069961547,
"num_tokens": 7547594.0,
"mean_token_accuracy": 0.6716236621141434,
"epoch": 0.5441037982630532,
"step": 1300
},
{
"loss": 1.388343048095703,
"grad_norm": 0.11002212017774582,
"learning_rate": 9.40273297459808e-07,
"entropy": 1.4113761156797409,
"num_tokens": 7605828.0,
"mean_token_accuracy": 0.6661069095134735,
"epoch": 0.548289212095846,
"step": 1310
},
{
"loss": 1.3891004562377929,
"grad_norm": 0.14147064089775085,
"learning_rate": 9.391982100554889e-07,
"entropy": 1.4317275822162627,
"num_tokens": 7661455.0,
"mean_token_accuracy": 0.6669554397463798,
"epoch": 0.5524746259286387,
"step": 1320
},
{
"loss": 1.3904253959655761,
"grad_norm": 0.13139671087265015,
"learning_rate": 9.38114158891675e-07,
"entropy": 1.4096351087093353,
"num_tokens": 7719091.0,
"mean_token_accuracy": 0.671739687025547,
"epoch": 0.5566600397614314,
"step": 1330
},
{
"loss": 1.463707733154297,
"grad_norm": 0.09927231818437576,
"learning_rate": 9.370211660931693e-07,
"entropy": 1.4864629238843918,
"num_tokens": 7774511.0,
"mean_token_accuracy": 0.660004960000515,
"epoch": 0.5608454535942241,
"step": 1340
},
{
"loss": 1.3764376640319824,
"grad_norm": 0.11545363068580627,
"learning_rate": 9.35919253967268e-07,
"entropy": 1.3998028621077538,
"num_tokens": 7836251.0,
"mean_token_accuracy": 0.6720214635133743,
"epoch": 0.5650308674270168,
"step": 1350
},
{
"loss": 1.3152969360351563,
"grad_norm": 0.1053733229637146,
"learning_rate": 9.348084450033051e-07,
"entropy": 1.3938700079917907,
"num_tokens": 7893911.0,
"mean_token_accuracy": 0.6841806307435035,
"epoch": 0.5692162812598096,
"step": 1360
},
{
"loss": 1.422788143157959,
"grad_norm": 0.09823399037122726,
"learning_rate": 9.336887618721938e-07,
"entropy": 1.445565864443779,
"num_tokens": 7949863.0,
"mean_token_accuracy": 0.6624092936515809,
"epoch": 0.5734016950926023,
"step": 1370
},
{
"loss": 1.3210840225219727,
"grad_norm": 0.1335407942533493,
"learning_rate": 9.325602274259629e-07,
"entropy": 1.3757253885269165,
"num_tokens": 8008384.0,
"mean_token_accuracy": 0.6824934765696525,
"epoch": 0.577587108925395,
"step": 1380
},
{
"loss": 1.397932243347168,
"grad_norm": 0.09968513995409012,
"learning_rate": 9.314228646972919e-07,
"entropy": 1.4251334190368652,
"num_tokens": 8067031.0,
"mean_token_accuracy": 0.666124664247036,
"epoch": 0.5817725227581877,
"step": 1390
},
{
"loss": 1.312647533416748,
"grad_norm": 0.12575951218605042,
"learning_rate": 9.302766968990387e-07,
"entropy": 1.355531930923462,
"num_tokens": 8126287.0,
"mean_token_accuracy": 0.6826214835047721,
"epoch": 0.5859579365909804,
"step": 1400
},
{
"eval_loss": 1.386446237564087,
"eval_runtime": 42.6243,
"eval_samples_per_second": 149.469,
"eval_steps_per_second": 6.241,
"eval_entropy": 1.4034466080199508,
"eval_num_tokens": 8126287.0,
"eval_mean_token_accuracy": 0.6737931832335049,
"epoch": 0.5859579365909804,
"step": 1400
},
{
"loss": 1.38052396774292,
"grad_norm": 0.13619256019592285,
"learning_rate": 9.291217474237685e-07,
"entropy": 1.404805138707161,
"num_tokens": 8184847.0,
"mean_token_accuracy": 0.6700320944190026,
"epoch": 0.5901433504237732,
"step": 1410
},
{
"loss": 1.4232772827148437,
"grad_norm": 0.12265791743993759,
"learning_rate": 9.27958039843274e-07,
"entropy": 1.4586470276117325,
"num_tokens": 8243143.0,
"mean_token_accuracy": 0.6625824689865112,
"epoch": 0.5943287642565659,
"step": 1420
},
{
"loss": 1.3759157180786132,
"grad_norm": 0.12311021983623505,
"learning_rate": 9.267855979080959e-07,
"entropy": 1.4208383083343505,
"num_tokens": 8301096.0,
"mean_token_accuracy": 0.6705714225769043,
"epoch": 0.5985141780893586,
"step": 1430
},
{
"loss": 1.4408933639526367,
"grad_norm": 0.10979989171028137,
"learning_rate": 9.256044455470372e-07,
"entropy": 1.4562449276447296,
"num_tokens": 8357561.0,
"mean_token_accuracy": 0.6647118896245956,
"epoch": 0.6026995919221513,
"step": 1440
},
{
"loss": 1.4200193405151367,
"grad_norm": 0.10581167787313461,
"learning_rate": 9.244146068666756e-07,
"entropy": 1.4489133656024933,
"num_tokens": 8411021.0,
"mean_token_accuracy": 0.6702521324157715,
"epoch": 0.606885005754944,
"step": 1450
},
{
"loss": 1.3639183044433594,
"grad_norm": 0.12785717844963074,
"learning_rate": 9.232161061508707e-07,
"entropy": 1.3970074653625488,
"num_tokens": 8473715.0,
"mean_token_accuracy": 0.6738650560379028,
"epoch": 0.6110704195877368,
"step": 1460
},
{
"loss": 1.3117795944213868,
"grad_norm": 0.11914683878421783,
"learning_rate": 9.220089678602692e-07,
"entropy": 1.3731692731380463,
"num_tokens": 8536821.0,
"mean_token_accuracy": 0.6784457266330719,
"epoch": 0.6152558334205295,
"step": 1470
},
{
"loss": 1.3580459594726562,
"grad_norm": 0.10762108862400055,
"learning_rate": 9.20793216631805e-07,
"entropy": 1.3978804230690003,
"num_tokens": 8596217.0,
"mean_token_accuracy": 0.6741666734218598,
"epoch": 0.6194412472533222,
"step": 1480
},
{
"loss": 1.3858207702636718,
"grad_norm": 0.13189709186553955,
"learning_rate": 9.195688772781969e-07,
"entropy": 1.4172445833683014,
"num_tokens": 8649547.0,
"mean_token_accuracy": 0.6702063709497452,
"epoch": 0.6236266610861149,
"step": 1490
},
{
"loss": 1.3870158195495605,
"grad_norm": 0.13120818138122559,
"learning_rate": 9.183359747874416e-07,
"entropy": 1.424094271659851,
"num_tokens": 8704916.0,
"mean_token_accuracy": 0.669642123579979,
"epoch": 0.6278120749189076,
"step": 1500
},
{
"loss": 1.4398550033569335,
"grad_norm": 0.12010879069566727,
"learning_rate": 9.170945343223045e-07,
"entropy": 1.4305728733539582,
"num_tokens": 8760259.0,
"mean_token_accuracy": 0.6612218707799912,
"epoch": 0.6319974887517004,
"step": 1510
},
{
"loss": 1.3878154754638672,
"grad_norm": 0.1339423507452011,
"learning_rate": 9.15844581219805e-07,
"entropy": 1.3878618061542511,
"num_tokens": 8816700.0,
"mean_token_accuracy": 0.6718688145279884,
"epoch": 0.6361829025844931,
"step": 1520
},
{
"loss": 1.3522814750671386,
"grad_norm": 0.13170458376407623,
"learning_rate": 9.145861409907009e-07,
"entropy": 1.3895842641592027,
"num_tokens": 8876509.0,
"mean_token_accuracy": 0.6753421723842621,
"epoch": 0.6403683164172858,
"step": 1530
},
{
"loss": 1.3812095642089843,
"grad_norm": 0.1139625683426857,
"learning_rate": 9.133192393189664e-07,
"entropy": 1.4209527760744094,
"num_tokens": 8936438.0,
"mean_token_accuracy": 0.6720142468810082,
"epoch": 0.6445537302500784,
"step": 1540
},
{
"loss": 1.4154645919799804,
"grad_norm": 0.13268420100212097,
"learning_rate": 9.120439020612685e-07,
"entropy": 1.424301978945732,
"num_tokens": 8994731.0,
"mean_token_accuracy": 0.6668044954538346,
"epoch": 0.6487391440828711,
"step": 1550
},
{
"loss": 1.3785716056823731,
"grad_norm": 0.11167196929454803,
"learning_rate": 9.107601552464393e-07,
"entropy": 1.3881200447678566,
"num_tokens": 9052527.0,
"mean_token_accuracy": 0.6731992438435555,
"epoch": 0.652924557915664,
"step": 1560
},
{
"loss": 1.3963075637817384,
"grad_norm": 0.1282496154308319,
"learning_rate": 9.094680250749447e-07,
"entropy": 1.408314546942711,
"num_tokens": 9111578.0,
"mean_token_accuracy": 0.6680608317255974,
"epoch": 0.6571099717484566,
"step": 1570
},
{
"loss": 1.3251177787780761,
"grad_norm": 0.12457749992609024,
"learning_rate": 9.081675379183494e-07,
"entropy": 1.3645547151565551,
"num_tokens": 9171878.0,
"mean_token_accuracy": 0.6805019825696945,
"epoch": 0.6612953855812493,
"step": 1580
},
{
"loss": 1.3337480545043945,
"grad_norm": 0.10987865179777145,
"learning_rate": 9.068587203187794e-07,
"entropy": 1.3783577740192414,
"num_tokens": 9231431.0,
"mean_token_accuracy": 0.6761843442916871,
"epoch": 0.665480799414042,
"step": 1590
},
{
"loss": 1.3129050254821777,
"grad_norm": 0.11137118935585022,
"learning_rate": 9.055415989883792e-07,
"entropy": 1.3690737694501878,
"num_tokens": 9287759.0,
"mean_token_accuracy": 0.6817014619708062,
"epoch": 0.6696662132468347,
"step": 1600
},
{
"eval_loss": 1.3662420511245728,
"eval_runtime": 43.7555,
"eval_samples_per_second": 145.605,
"eval_steps_per_second": 6.079,
"eval_entropy": 1.3844989090037525,
"eval_num_tokens": 9287759.0,
"eval_mean_token_accuracy": 0.6762344077565616,
"epoch": 0.6696662132468347,
"step": 1600
},
{
"loss": 1.3789652824401855,
"grad_norm": 0.11303029209375381,
"learning_rate": 9.042162008087678e-07,
"entropy": 1.388508751988411,
"num_tokens": 9347815.0,
"mean_token_accuracy": 0.671443772315979,
"epoch": 0.6738516270796275,
"step": 1610
},
{
"loss": 1.3409759521484375,
"grad_norm": 0.12162081152200699,
"learning_rate": 9.028825528304891e-07,
"entropy": 1.3988509953022004,
"num_tokens": 9404534.0,
"mean_token_accuracy": 0.6778050258755683,
"epoch": 0.6780370409124202,
"step": 1620
},
{
"loss": 1.286928367614746,
"grad_norm": 0.1191353127360344,
"learning_rate": 9.015406822724603e-07,
"entropy": 1.3400784492492677,
"num_tokens": 9465006.0,
"mean_token_accuracy": 0.6883344247937202,
"epoch": 0.6822224547452129,
"step": 1630
},
{
"loss": 1.3931745529174804,
"grad_norm": 0.09988338500261307,
"learning_rate": 9.001906165214163e-07,
"entropy": 1.4158646211028099,
"num_tokens": 9523244.0,
"mean_token_accuracy": 0.6664687514305114,
"epoch": 0.6864078685780056,
"step": 1640
},
{
"loss": 1.3149008750915527,
"grad_norm": 0.1224365308880806,
"learning_rate": 8.988323831313509e-07,
"entropy": 1.3621025055646896,
"num_tokens": 9583571.0,
"mean_token_accuracy": 0.6805920660495758,
"epoch": 0.6905932824107983,
"step": 1650
},
{
"loss": 1.3128664016723632,
"grad_norm": 0.10845732688903809,
"learning_rate": 8.974660098229538e-07,
"entropy": 1.366037741303444,
"num_tokens": 9640353.0,
"mean_token_accuracy": 0.6822919920086861,
"epoch": 0.6947786962435911,
"step": 1660
},
{
"loss": 1.3836250305175781,
"grad_norm": 0.12312953174114227,
"learning_rate": 8.960915244830462e-07,
"entropy": 1.4012254863977431,
"num_tokens": 9701108.0,
"mean_token_accuracy": 0.6682980388402939,
"epoch": 0.6989641100763838,
"step": 1670
},
{
"loss": 1.298573875427246,
"grad_norm": 0.10932071506977081,
"learning_rate": 8.947089551640099e-07,
"entropy": 1.351333498954773,
"num_tokens": 9758477.0,
"mean_token_accuracy": 0.6857402086257934,
"epoch": 0.7031495239091765,
"step": 1680
},
{
"loss": 1.3268583297729493,
"grad_norm": 0.1166784018278122,
"learning_rate": 8.933183300832159e-07,
"entropy": 1.3652890086174012,
"num_tokens": 9816530.0,
"mean_token_accuracy": 0.6774859979748726,
"epoch": 0.7073349377419692,
"step": 1690
},
{
"loss": 1.37611722946167,
"grad_norm": 0.1278134286403656,
"learning_rate": 8.919196776224483e-07,
"entropy": 1.399143072962761,
"num_tokens": 9872452.0,
"mean_token_accuracy": 0.6704028770327568,
"epoch": 0.7115203515747619,
"step": 1700
},
{
"loss": 1.3107229232788087,
"grad_norm": 0.12152674794197083,
"learning_rate": 8.905130263273252e-07,
"entropy": 1.3753829419612884,
"num_tokens": 9934101.0,
"mean_token_accuracy": 0.68070268034935,
"epoch": 0.7157057654075547,
"step": 1710
},
{
"loss": 1.3585830688476563,
"grad_norm": 0.12099979817867279,
"learning_rate": 8.890984049067154e-07,
"entropy": 1.3618301630020142,
"num_tokens": 9993614.0,
"mean_token_accuracy": 0.6762332633137703,
"epoch": 0.7198911792403474,
"step": 1720
},
{
"loss": 1.302845287322998,
"grad_norm": 0.11998716741800308,
"learning_rate": 8.876758422321534e-07,
"entropy": 1.356363880634308,
"num_tokens": 10047945.0,
"mean_token_accuracy": 0.6853278845548629,
"epoch": 0.7240765930731401,
"step": 1730
},
{
"loss": 1.3057265281677246,
"grad_norm": 0.11447525024414062,
"learning_rate": 8.862453673372495e-07,
"entropy": 1.3511420711874962,
"num_tokens": 10105849.0,
"mean_token_accuracy": 0.6814648106694221,
"epoch": 0.7282620069059328,
"step": 1740
},
{
"loss": 1.379593563079834,
"grad_norm": 0.13615551590919495,
"learning_rate": 8.848070094170972e-07,
"entropy": 1.4266703605651856,
"num_tokens": 10160689.0,
"mean_token_accuracy": 0.6730331972241401,
"epoch": 0.7324474207387256,
"step": 1750
},
{
"loss": 1.3482324600219726,
"grad_norm": 0.1049669086933136,
"learning_rate": 8.833607978276782e-07,
"entropy": 1.365234938263893,
"num_tokens": 10219317.0,
"mean_token_accuracy": 0.6763183102011681,
"epoch": 0.7366328345715183,
"step": 1760
},
{
"loss": 1.308854579925537,
"grad_norm": 0.11895614117383957,
"learning_rate": 8.819067620852621e-07,
"entropy": 1.3593208014965057,
"num_tokens": 10281133.0,
"mean_token_accuracy": 0.6821026623249054,
"epoch": 0.740818248404311,
"step": 1770
},
{
"loss": 1.3750693321228027,
"grad_norm": 0.13367140293121338,
"learning_rate": 8.804449318658047e-07,
"entropy": 1.391082948446274,
"num_tokens": 10338588.0,
"mean_token_accuracy": 0.6708121821284294,
"epoch": 0.7450036622371037,
"step": 1780
},
{
"loss": 1.3176989555358887,
"grad_norm": 0.10955236107110977,
"learning_rate": 8.789753370043425e-07,
"entropy": 1.373031947016716,
"num_tokens": 10398744.0,
"mean_token_accuracy": 0.6810923710465431,
"epoch": 0.7491890760698964,
"step": 1790
},
{
"loss": 1.3639984130859375,
"grad_norm": 0.12343617528676987,
"learning_rate": 8.77498007494383e-07,
"entropy": 1.4000030606985092,
"num_tokens": 10458537.0,
"mean_token_accuracy": 0.6697928130626678,
"epoch": 0.7533744899026892,
"step": 1800
},
{
"eval_loss": 1.350634217262268,
"eval_runtime": 42.5241,
"eval_samples_per_second": 149.821,
"eval_steps_per_second": 6.255,
"eval_entropy": 1.3988571140102875,
"eval_num_tokens": 10458537.0,
"eval_mean_token_accuracy": 0.6771848898633082,
"epoch": 0.7533744899026892,
"step": 1800
},
{
"loss": 1.3396940231323242,
"grad_norm": 0.14190584421157837,
"learning_rate": 8.760129734872932e-07,
"entropy": 1.3851164013147355,
"num_tokens": 10516646.0,
"mean_token_accuracy": 0.6750243782997132,
"epoch": 0.7575599037354819,
"step": 1810
},
{
"loss": 1.3640681266784669,
"grad_norm": 0.11394577473402023,
"learning_rate": 8.745202652916841e-07,
"entropy": 1.400177638232708,
"num_tokens": 10576044.0,
"mean_token_accuracy": 0.6688720732927322,
"epoch": 0.7617453175682746,
"step": 1820
},
{
"loss": 1.4145827293395996,
"grad_norm": 0.1021205335855484,
"learning_rate": 8.73019913372792e-07,
"entropy": 1.4293284267187119,
"num_tokens": 10635490.0,
"mean_token_accuracy": 0.664357790350914,
"epoch": 0.7659307314010673,
"step": 1830
},
{
"loss": 1.3291969299316406,
"grad_norm": 0.104949451982975,
"learning_rate": 8.715119483518568e-07,
"entropy": 1.392353293299675,
"num_tokens": 10696235.0,
"mean_token_accuracy": 0.6753359526395798,
"epoch": 0.77011614523386,
"step": 1840
},
{
"loss": 1.3674373626708984,
"grad_norm": 0.13051320612430573,
"learning_rate": 8.699964010054972e-07,
"entropy": 1.3989370226860047,
"num_tokens": 10756113.0,
"mean_token_accuracy": 0.6702560499310494,
"epoch": 0.7743015590666528,
"step": 1850
},
{
"loss": 1.3329706192016602,
"grad_norm": 0.11483673751354218,
"learning_rate": 8.684733022650819e-07,
"entropy": 1.368683397769928,
"num_tokens": 10811097.0,
"mean_token_accuracy": 0.6795622929930687,
"epoch": 0.7784869728994455,
"step": 1860
},
{
"loss": 1.34647216796875,
"grad_norm": 0.12257901579141617,
"learning_rate": 8.669426832160995e-07,
"entropy": 1.3777292981743812,
"num_tokens": 10869645.0,
"mean_token_accuracy": 0.6771846890449524,
"epoch": 0.7826723867322382,
"step": 1870
},
{
"loss": 1.2678668022155761,
"grad_norm": 0.10710500180721283,
"learning_rate": 8.65404575097523e-07,
"entropy": 1.319590486586094,
"num_tokens": 10929506.0,
"mean_token_accuracy": 0.6875983402132988,
"epoch": 0.7868578005650309,
"step": 1880
},
{
"loss": 1.364974021911621,
"grad_norm": 0.11756409704685211,
"learning_rate": 8.638590093011722e-07,
"entropy": 1.400401759147644,
"num_tokens": 10984931.0,
"mean_token_accuracy": 0.67054513245821,
"epoch": 0.7910432143978235,
"step": 1890
},
{
"loss": 1.3333361625671387,
"grad_norm": 0.13867364823818207,
"learning_rate": 8.623060173710743e-07,
"entropy": 1.369761797785759,
"num_tokens": 11040065.0,
"mean_token_accuracy": 0.6754815384745598,
"epoch": 0.7952286282306164,
"step": 1900
},
{
"loss": 1.2708181381225585,
"grad_norm": 0.10772886127233505,
"learning_rate": 8.607456310028185e-07,
"entropy": 1.3362341210246087,
"num_tokens": 11101320.0,
"mean_token_accuracy": 0.6911322221159935,
"epoch": 0.799414042063409,
"step": 1910
},
{
"loss": 1.3482179641723633,
"grad_norm": 0.13811437785625458,
"learning_rate": 8.591778820429104e-07,
"entropy": 1.3786241382360458,
"num_tokens": 11159403.0,
"mean_token_accuracy": 0.676637114584446,
"epoch": 0.8035994558962017,
"step": 1920
},
{
"loss": 1.3001495361328126,
"grad_norm": 0.11261286586523056,
"learning_rate": 8.576028024881208e-07,
"entropy": 1.342548942565918,
"num_tokens": 11215300.0,
"mean_token_accuracy": 0.6838564172387123,
"epoch": 0.8077848697289944,
"step": 1930
},
{
"loss": 1.37919921875,
"grad_norm": 0.11729196459054947,
"learning_rate": 8.560204244848339e-07,
"entropy": 1.399843516945839,
"num_tokens": 11274016.0,
"mean_token_accuracy": 0.6692644655704498,
"epoch": 0.8119702835617871,
"step": 1940
},
{
"loss": 1.3153133392333984,
"grad_norm": 0.11101414263248444,
"learning_rate": 8.544307803283903e-07,
"entropy": 1.3550761044025421,
"num_tokens": 11331840.0,
"mean_token_accuracy": 0.6830023691058159,
"epoch": 0.8161556973945799,
"step": 1950
},
{
"loss": 1.3670063018798828,
"grad_norm": 0.10791585594415665,
"learning_rate": 8.528339024624287e-07,
"entropy": 1.3926001816987992,
"num_tokens": 11388250.0,
"mean_token_accuracy": 0.6758360341191292,
"epoch": 0.8203411112273726,
"step": 1960
},
{
"loss": 1.322612190246582,
"grad_norm": 0.12179048359394073,
"learning_rate": 8.512298234782227e-07,
"entropy": 1.3523173958063126,
"num_tokens": 11444623.0,
"mean_token_accuracy": 0.6819486439228057,
"epoch": 0.8245265250601653,
"step": 1970
},
{
"loss": 1.4050199508666992,
"grad_norm": 0.1269518882036209,
"learning_rate": 8.496185761140165e-07,
"entropy": 1.4183282285928727,
"num_tokens": 11501456.0,
"mean_token_accuracy": 0.6670055955648422,
"epoch": 0.828711938892958,
"step": 1980
},
{
"loss": 1.3573097229003905,
"grad_norm": 0.09794802963733673,
"learning_rate": 8.480001932543561e-07,
"entropy": 1.3888707369565965,
"num_tokens": 11562134.0,
"mean_token_accuracy": 0.6723511442542076,
"epoch": 0.8328973527257507,
"step": 1990
},
{
"loss": 1.3131244659423829,
"grad_norm": 0.12277819216251373,
"learning_rate": 8.463747079294192e-07,
"entropy": 1.3465208828449249,
"num_tokens": 11618831.0,
"mean_token_accuracy": 0.6795975625514984,
"epoch": 0.8370827665585435,
"step": 2000
},
{
"eval_loss": 1.3373528718948364,
"eval_runtime": 42.4003,
"eval_samples_per_second": 150.258,
"eval_steps_per_second": 6.274,
"eval_entropy": 1.3585354107663148,
"eval_num_tokens": 11618831.0,
"eval_mean_token_accuracy": 0.678231207947982,
"epoch": 0.8370827665585435,
"step": 2000
},
{
"loss": 1.3982874870300293,
"grad_norm": 0.13690534234046936,
"learning_rate": 8.447421533143396e-07,
"entropy": 1.4036804780364036,
"num_tokens": 11676394.0,
"mean_token_accuracy": 0.6648698434233665,
"epoch": 0.8412681803913362,
"step": 2010
},
{
"loss": 1.321161937713623,
"grad_norm": 0.16348762810230255,
"learning_rate": 8.431025627285313e-07,
"entropy": 1.349110186100006,
"num_tokens": 11730143.0,
"mean_token_accuracy": 0.6850418791174888,
"epoch": 0.8454535942241289,
"step": 2020
},
{
"loss": 1.337346076965332,
"grad_norm": 0.12358900159597397,
"learning_rate": 8.414559696350078e-07,
"entropy": 1.3770191550254822,
"num_tokens": 11786616.0,
"mean_token_accuracy": 0.6773856431245804,
"epoch": 0.8496390080569216,
"step": 2030
},
{
"loss": 1.341224193572998,
"grad_norm": 0.11463375389575958,
"learning_rate": 8.398024076396996e-07,
"entropy": 1.345393455028534,
"num_tokens": 11845477.0,
"mean_token_accuracy": 0.6754840731620788,
"epoch": 0.8538244218897143,
"step": 2040
},
{
"loss": 1.3237956047058106,
"grad_norm": 0.12505337595939636,
"learning_rate": 8.381419104907681e-07,
"entropy": 1.3643497437238694,
"num_tokens": 11901746.0,
"mean_token_accuracy": 0.6791232407093049,
"epoch": 0.8580098357225071,
"step": 2050
},
{
"loss": 1.3346891403198242,
"grad_norm": 0.15036678314208984,
"learning_rate": 8.364745120779164e-07,
"entropy": 1.3704555958509446,
"num_tokens": 11959605.0,
"mean_token_accuracy": 0.6759614482522011,
"epoch": 0.8621952495552998,
"step": 2060
},
{
"loss": 1.4080591201782227,
"grad_norm": 0.14488154649734497,
"learning_rate": 8.348002464316987e-07,
"entropy": 1.4137721806764603,
"num_tokens": 12018839.0,
"mean_token_accuracy": 0.6624691441655159,
"epoch": 0.8663806633880925,
"step": 2070
},
{
"loss": 1.3576594352722169,
"grad_norm": 0.1306961327791214,
"learning_rate": 8.331191477228246e-07,
"entropy": 1.4100464552640914,
"num_tokens": 12077962.0,
"mean_token_accuracy": 0.6744375959038734,
"epoch": 0.8705660772208852,
"step": 2080
},
{
"loss": 1.3189333915710448,
"grad_norm": 0.09990637004375458,
"learning_rate": 8.314312502614625e-07,
"entropy": 1.3474989101290702,
"num_tokens": 12137755.0,
"mean_token_accuracy": 0.6803866818547248,
"epoch": 0.8747514910536779,
"step": 2090
},
{
"loss": 1.350827980041504,
"grad_norm": 0.1305275708436966,
"learning_rate": 8.29736588496539e-07,
"entropy": 1.384324887394905,
"num_tokens": 12194836.0,
"mean_token_accuracy": 0.6731877833604812,
"epoch": 0.8789369048864707,
"step": 2100
},
{
"loss": 1.3458109855651856,
"grad_norm": 0.12695269286632538,
"learning_rate": 8.280351970150358e-07,
"entropy": 1.3462085962295531,
"num_tokens": 12254568.0,
"mean_token_accuracy": 0.6745196804404259,
"epoch": 0.8831223187192634,
"step": 2110
},
{
"loss": 1.3157236099243164,
"grad_norm": 0.12223149091005325,
"learning_rate": 8.263271105412843e-07,
"entropy": 1.345698779821396,
"num_tokens": 12313266.0,
"mean_token_accuracy": 0.6800820276141166,
"epoch": 0.8873077325520561,
"step": 2120
},
{
"loss": 1.3625286102294922,
"grad_norm": 0.12075755000114441,
"learning_rate": 8.246123639362557e-07,
"entropy": 1.3751042202115058,
"num_tokens": 12368266.0,
"mean_token_accuracy": 0.6779290676116944,
"epoch": 0.8914931463848488,
"step": 2130
},
{
"loss": 1.3247100830078125,
"grad_norm": 0.13140852749347687,
"learning_rate": 8.22890992196851e-07,
"entropy": 1.3399439036846161,
"num_tokens": 12427195.0,
"mean_token_accuracy": 0.6778766274452209,
"epoch": 0.8956785602176415,
"step": 2140
},
{
"loss": 1.3223968505859376,
"grad_norm": 0.11262480914592743,
"learning_rate": 8.211630304551856e-07,
"entropy": 1.3523710697889328,
"num_tokens": 12481690.0,
"mean_token_accuracy": 0.6801952719688416,
"epoch": 0.8998639740504343,
"step": 2150
},
{
"loss": 1.2305709838867187,
"grad_norm": 0.1140614002943039,
"learning_rate": 8.194285139778727e-07,
"entropy": 1.2833492413163186,
"num_tokens": 12544082.0,
"mean_token_accuracy": 0.696322962641716,
"epoch": 0.904049387883227,
"step": 2160
},
{
"loss": 1.2584315299987794,
"grad_norm": 0.1213318482041359,
"learning_rate": 8.176874781653042e-07,
"entropy": 1.2917151510715486,
"num_tokens": 12605884.0,
"mean_token_accuracy": 0.6918731480836868,
"epoch": 0.9082348017160197,
"step": 2170
},
{
"loss": 1.4043787956237792,
"grad_norm": 0.11265023797750473,
"learning_rate": 8.159399585509271e-07,
"entropy": 1.4147561937570572,
"num_tokens": 12662340.0,
"mean_token_accuracy": 0.6659792140126228,
"epoch": 0.9124202155488124,
"step": 2180
},
{
"loss": 1.2520899772644043,
"grad_norm": 0.12448090314865112,
"learning_rate": 8.14185990800518e-07,
"entropy": 1.278790497779846,
"num_tokens": 12718634.0,
"mean_token_accuracy": 0.6945044815540313,
"epoch": 0.9166056293816051,
"step": 2190
},
{
"loss": 1.3444849967956543,
"grad_norm": 0.12102659791707993,
"learning_rate": 8.124256107114569e-07,
"entropy": 1.3645626872777938,
"num_tokens": 12774125.0,
"mean_token_accuracy": 0.6725556075572967,
"epoch": 0.9207910432143979,
"step": 2200
},
{
"eval_loss": 1.3292649984359741,
"eval_runtime": 42.3319,
"eval_samples_per_second": 150.501,
"eval_steps_per_second": 6.284,
"eval_entropy": 1.3460397827894168,
"eval_num_tokens": 12774125.0,
"eval_mean_token_accuracy": 0.6789818390419609,
"epoch": 0.9207910432143979,
"step": 2200
},
{
"loss": 1.3911532402038573,
"grad_norm": 0.1305384337902069,
"learning_rate": 8.106588542119957e-07,
"entropy": 1.3969669669866562,
"num_tokens": 12832025.0,
"mean_token_accuracy": 0.6678112506866455,
"epoch": 0.9249764570471906,
"step": 2210
},
{
"loss": 1.3338760375976562,
"grad_norm": 0.1187131404876709,
"learning_rate": 8.088857573605237e-07,
"entropy": 1.375227126479149,
"num_tokens": 12888734.0,
"mean_token_accuracy": 0.6802457317709922,
"epoch": 0.9291618708799833,
"step": 2220
},
{
"loss": 1.2834582328796387,
"grad_norm": 0.14116276800632477,
"learning_rate": 8.071063563448339e-07,
"entropy": 1.3024362832307816,
"num_tokens": 12943886.0,
"mean_token_accuracy": 0.6881816878914833,
"epoch": 0.933347284712776,
"step": 2230
},
{
"loss": 1.312158203125,
"grad_norm": 0.12964707612991333,
"learning_rate": 8.053206874813829e-07,
"entropy": 1.364695656299591,
"num_tokens": 13000723.0,
"mean_token_accuracy": 0.6795030117034913,
"epoch": 0.9375326985455686,
"step": 2240
},
{
"loss": 1.3068957328796387,
"grad_norm": 0.12915638089179993,
"learning_rate": 8.035287872145502e-07,
"entropy": 1.3586914032697677,
"num_tokens": 13059283.0,
"mean_token_accuracy": 0.6813771218061447,
"epoch": 0.9417181123783614,
"step": 2250
},
{
"loss": 1.3330992698669433,
"grad_norm": 0.10278042405843735,
"learning_rate": 8.017306921158942e-07,
"entropy": 1.3742854058742524,
"num_tokens": 13118033.0,
"mean_token_accuracy": 0.6798395842313767,
"epoch": 0.9459035262111541,
"step": 2260
},
{
"loss": 1.4351073265075684,
"grad_norm": 0.14834155142307281,
"learning_rate": 7.99926438883406e-07,
"entropy": 1.409215834736824,
"num_tokens": 13173943.0,
"mean_token_accuracy": 0.6597715452313423,
"epoch": 0.9500889400439468,
"step": 2270
},
{
"loss": 1.2880861282348632,
"grad_norm": 0.10686289519071579,
"learning_rate": 7.981160643407603e-07,
"entropy": 1.3126128152012826,
"num_tokens": 13233131.0,
"mean_token_accuracy": 0.6831002920866013,
"epoch": 0.9542743538767395,
"step": 2280
},
{
"loss": 1.37518310546875,
"grad_norm": 0.1137382760643959,
"learning_rate": 7.962996054365642e-07,
"entropy": 1.3762210130691528,
"num_tokens": 13289669.0,
"mean_token_accuracy": 0.6739885672926903,
"epoch": 0.9584597677095322,
"step": 2290
},
{
"loss": 1.2979955673217773,
"grad_norm": 0.12840472161769867,
"learning_rate": 7.944770992436026e-07,
"entropy": 1.3360363632440566,
"num_tokens": 13347131.0,
"mean_token_accuracy": 0.6823042362928391,
"epoch": 0.962645181542325,
"step": 2300
},
{
"loss": 1.3241994857788086,
"grad_norm": 0.11081521958112717,
"learning_rate": 7.926485829580814e-07,
"entropy": 1.3549024030566215,
"num_tokens": 13405191.0,
"mean_token_accuracy": 0.6771049797534943,
"epoch": 0.9668305953751177,
"step": 2310
},
{
"loss": 1.3815485000610352,
"grad_norm": 0.11393143981695175,
"learning_rate": 7.908140938988692e-07,
"entropy": 1.3816259652376175,
"num_tokens": 13463913.0,
"mean_token_accuracy": 0.6688653215765953,
"epoch": 0.9710160092079104,
"step": 2320
},
{
"loss": 1.298836898803711,
"grad_norm": 0.12833324074745178,
"learning_rate": 7.889736695067348e-07,
"entropy": 1.3122636392712592,
"num_tokens": 13523958.0,
"mean_token_accuracy": 0.6812730312347413,
"epoch": 0.9752014230407031,
"step": 2330
},
{
"loss": 1.3618934631347657,
"grad_norm": 0.13094215095043182,
"learning_rate": 7.87127347343584e-07,
"entropy": 1.3717454001307487,
"num_tokens": 13581681.0,
"mean_token_accuracy": 0.6716930896043778,
"epoch": 0.9793868368734958,
"step": 2340
},
{
"loss": 1.3211997032165528,
"grad_norm": 0.14429600536823273,
"learning_rate": 7.852751650916917e-07,
"entropy": 1.3575677514076232,
"num_tokens": 13641102.0,
"mean_token_accuracy": 0.6812080055475235,
"epoch": 0.9835722507062886,
"step": 2350
},
{
"loss": 1.3058280944824219,
"grad_norm": 0.13168948888778687,
"learning_rate": 7.83417160552934e-07,
"entropy": 1.3377871721982957,
"num_tokens": 13697001.0,
"mean_token_accuracy": 0.6837321490049362,
"epoch": 0.9877576645390813,
"step": 2360
},
{
"loss": 1.320173168182373,
"grad_norm": 0.13248135149478912,
"learning_rate": 7.815533716480158e-07,
"entropy": 1.3715132981538773,
"num_tokens": 13754970.0,
"mean_token_accuracy": 0.6818105265498161,
"epoch": 0.991943078371874,
"step": 2370
},
{
"loss": 1.3184805870056153,
"grad_norm": 0.1117711067199707,
"learning_rate": 7.796838364156977e-07,
"entropy": 1.3519122838973998,
"num_tokens": 13814161.0,
"mean_token_accuracy": 0.6789533212780953,
"epoch": 0.9961284922046667,
"step": 2380
},
{
"loss": 1.381266212463379,
"grad_norm": 0.3463696539402008,
"learning_rate": 7.778085930120191e-07,
"entropy": 1.3519603207304671,
"num_tokens": 13865252.0,
"mean_token_accuracy": 0.6710431801306235,
"epoch": 1.0,
"step": 2390
},
{
"loss": 1.3650718688964845,
"grad_norm": 0.10732991993427277,
"learning_rate": 7.759276797095196e-07,
"entropy": 1.3758342564105988,
"num_tokens": 13925700.0,
"mean_token_accuracy": 0.6686381295323371,
"epoch": 1.0041854138327928,
"step": 2400
},
{
"eval_loss": 1.324312686920166,
"eval_runtime": 42.9132,
"eval_samples_per_second": 148.463,
"eval_steps_per_second": 6.199,
"eval_entropy": 1.3503220336777824,
"eval_num_tokens": 13925700.0,
"eval_mean_token_accuracy": 0.6799099082337287,
"epoch": 1.0041854138327928,
"step": 2400
},
{
"loss": 1.275872802734375,
"grad_norm": 0.10125313699245453,
"learning_rate": 7.740411348964576e-07,
"entropy": 1.3205101490020752,
"num_tokens": 13983023.0,
"mean_token_accuracy": 0.6841968685388565,
"epoch": 1.0083708276655854,
"step": 2410
},
{
"loss": 1.3197596549987793,
"grad_norm": 0.12487287819385529,
"learning_rate": 7.721489970760275e-07,
"entropy": 1.3373865127563476,
"num_tokens": 14044602.0,
"mean_token_accuracy": 0.677445650100708,
"epoch": 1.0125562414983782,
"step": 2420
},
{
"loss": 1.2989977836608886,
"grad_norm": 0.1548726111650467,
"learning_rate": 7.702513048655733e-07,
"entropy": 1.3116925165057183,
"num_tokens": 14104408.0,
"mean_token_accuracy": 0.6814423218369484,
"epoch": 1.0167416553311708,
"step": 2430
},
{
"loss": 1.2706897735595704,
"grad_norm": 0.11503283679485321,
"learning_rate": 7.683480969958003e-07,
"entropy": 1.310747703909874,
"num_tokens": 14162736.0,
"mean_token_accuracy": 0.6892054408788681,
"epoch": 1.0209270691639636,
"step": 2440
},
{
"loss": 1.2928138732910157,
"grad_norm": 0.12240534275770187,
"learning_rate": 7.664394123099853e-07,
"entropy": 1.3191738039255143,
"num_tokens": 14221626.0,
"mean_token_accuracy": 0.6869289621710777,
"epoch": 1.0251124829967564,
"step": 2450
},
{
"loss": 1.293262767791748,
"grad_norm": 0.11519357562065125,
"learning_rate": 7.64525289763184e-07,
"entropy": 1.320760977268219,
"num_tokens": 14280841.0,
"mean_token_accuracy": 0.6857645198702812,
"epoch": 1.029297896829549,
"step": 2460
},
{
"loss": 1.3051738739013672,
"grad_norm": 0.11736012250185013,
"learning_rate": 7.626057684214341e-07,
"entropy": 1.316636176407337,
"num_tokens": 14338816.0,
"mean_token_accuracy": 0.6803102239966392,
"epoch": 1.0334833106623418,
"step": 2470
},
{
"loss": 1.3561962127685547,
"grad_norm": 0.13388119637966156,
"learning_rate": 7.606808874609605e-07,
"entropy": 1.36598659157753,
"num_tokens": 14395539.0,
"mean_token_accuracy": 0.6725652754306793,
"epoch": 1.0376687244951344,
"step": 2480
},
{
"loss": 1.3418392181396483,
"grad_norm": 0.12838061153888702,
"learning_rate": 7.587506861673737e-07,
"entropy": 1.3244032382965087,
"num_tokens": 14451789.0,
"mean_token_accuracy": 0.6773718982934952,
"epoch": 1.0418541383279272,
"step": 2490
},
{
"loss": 1.2931674003601075,
"grad_norm": 0.12182667851448059,
"learning_rate": 7.568152039348695e-07,
"entropy": 1.3194489538669587,
"num_tokens": 14510441.0,
"mean_token_accuracy": 0.6842545494437218,
"epoch": 1.04603955216072,
"step": 2500
},
{
"loss": 1.2959155082702636,
"grad_norm": 0.11124531924724579,
"learning_rate": 7.548744802654241e-07,
"entropy": 1.3410497322678565,
"num_tokens": 14571458.0,
"mean_token_accuracy": 0.6812979131937027,
"epoch": 1.0502249659935126,
"step": 2510
},
{
"loss": 1.3612911224365234,
"grad_norm": 0.12837456166744232,
"learning_rate": 7.529285547679882e-07,
"entropy": 1.3736032456159593,
"num_tokens": 14627118.0,
"mean_token_accuracy": 0.672698700428009,
"epoch": 1.0544103798263054,
"step": 2520
},
{
"loss": 1.2740073204040527,
"grad_norm": 0.1267591416835785,
"learning_rate": 7.509774671576785e-07,
"entropy": 1.3048336684703827,
"num_tokens": 14685752.0,
"mean_token_accuracy": 0.6858905151486396,
"epoch": 1.058595793659098,
"step": 2530
},
{
"loss": 1.3410483360290528,
"grad_norm": 0.11439883708953857,
"learning_rate": 7.490212572549666e-07,
"entropy": 1.3314668446779252,
"num_tokens": 14742644.0,
"mean_token_accuracy": 0.6746952176094055,
"epoch": 1.0627812074918908,
"step": 2540
},
{
"loss": 1.2937799453735352,
"grad_norm": 0.12421438843011856,
"learning_rate": 7.470599649848681e-07,
"entropy": 1.3203342527151107,
"num_tokens": 14801546.0,
"mean_token_accuracy": 0.6863655790686607,
"epoch": 1.0669666213246836,
"step": 2550
},
{
"loss": 1.3474176406860352,
"grad_norm": 0.11059686541557312,
"learning_rate": 7.450936303761256e-07,
"entropy": 1.3507545605301856,
"num_tokens": 14861872.0,
"mean_token_accuracy": 0.6777540385723114,
"epoch": 1.0711520351574761,
"step": 2560
},
{
"loss": 1.2592041015625,
"grad_norm": 0.12262172996997833,
"learning_rate": 7.431222935603929e-07,
"entropy": 1.2903067260980605,
"num_tokens": 14919917.0,
"mean_token_accuracy": 0.6862245246767997,
"epoch": 1.075337448990269,
"step": 2570
},
{
"loss": 1.3285273551940917,
"grad_norm": 0.1249430701136589,
"learning_rate": 7.411459947714156e-07,
"entropy": 1.346482941508293,
"num_tokens": 14977173.0,
"mean_token_accuracy": 0.677224400639534,
"epoch": 1.0795228628230615,
"step": 2580
},
{
"loss": 1.3090217590332032,
"grad_norm": 0.15991806983947754,
"learning_rate": 7.391647743442103e-07,
"entropy": 1.3448469370603562,
"num_tokens": 15036719.0,
"mean_token_accuracy": 0.6807536914944649,
"epoch": 1.0837082766558543,
"step": 2590
},
{
"loss": 1.385681438446045,
"grad_norm": 0.12378425896167755,
"learning_rate": 7.37178672714241e-07,
"entropy": 1.4169642955064774,
"num_tokens": 15093272.0,
"mean_token_accuracy": 0.666177037358284,
"epoch": 1.0878936904886471,
"step": 2600
},
{
"eval_loss": 1.3201794624328613,
"eval_runtime": 43.6566,
"eval_samples_per_second": 145.934,
"eval_steps_per_second": 6.093,
"eval_entropy": 1.318832386705212,
"eval_num_tokens": 15093272.0,
"eval_mean_token_accuracy": 0.6805890722382337,
"epoch": 1.0878936904886471,
"step": 2600
},
{
"loss": 1.3124534606933593,
"grad_norm": 0.14278866350650787,
"learning_rate": 7.351877304165939e-07,
"entropy": 1.3207478374242783,
"num_tokens": 15151531.0,
"mean_token_accuracy": 0.6814302504062653,
"epoch": 1.0920791043214397,
"step": 2610
},
{
"loss": 1.310394859313965,
"grad_norm": 0.11016988754272461,
"learning_rate": 7.331919880851505e-07,
"entropy": 1.3247565850615501,
"num_tokens": 15208078.0,
"mean_token_accuracy": 0.6797660425305366,
"epoch": 1.0962645181542325,
"step": 2620
},
{
"loss": 1.3397459030151366,
"grad_norm": 0.12294236570596695,
"learning_rate": 7.311914864517574e-07,
"entropy": 1.344627757370472,
"num_tokens": 15262908.0,
"mean_token_accuracy": 0.6800432533025742,
"epoch": 1.1004499319870251,
"step": 2630
},
{
"loss": 1.3385157585144043,
"grad_norm": 0.1285414695739746,
"learning_rate": 7.291862663453963e-07,
"entropy": 1.342196998000145,
"num_tokens": 15323145.0,
"mean_token_accuracy": 0.6765275478363038,
"epoch": 1.104635345819818,
"step": 2640
},
{
"loss": 1.30029239654541,
"grad_norm": 0.13284096121788025,
"learning_rate": 7.271763686913493e-07,
"entropy": 1.3492845341563224,
"num_tokens": 15380781.0,
"mean_token_accuracy": 0.6857595443725586,
"epoch": 1.1088207596526107,
"step": 2650
},
{
"loss": 1.287161159515381,
"grad_norm": 0.12089403718709946,
"learning_rate": 7.251618345103646e-07,
"entropy": 1.3121826618909835,
"num_tokens": 15439602.0,
"mean_token_accuracy": 0.6850664153695106,
"epoch": 1.1130061734854033,
"step": 2660
},
{
"loss": 1.2762629508972168,
"grad_norm": 0.12427452206611633,
"learning_rate": 7.231427049178192e-07,
"entropy": 1.2992495775222779,
"num_tokens": 15495803.0,
"mean_token_accuracy": 0.6846798285841942,
"epoch": 1.1171915873181961,
"step": 2670
},
{
"loss": 1.274948501586914,
"grad_norm": 0.13808666169643402,
"learning_rate": 7.211190211228791e-07,
"entropy": 1.305306363105774,
"num_tokens": 15550588.0,
"mean_token_accuracy": 0.6887386977672577,
"epoch": 1.1213770011509887,
"step": 2680
},
{
"loss": 1.2809961318969727,
"grad_norm": 0.1604543924331665,
"learning_rate": 7.190908244276592e-07,
"entropy": 1.291318878531456,
"num_tokens": 15607839.0,
"mean_token_accuracy": 0.6838915839791297,
"epoch": 1.1255624149837815,
"step": 2690
},
{
"loss": 1.3102614402770996,
"grad_norm": 0.1264321208000183,
"learning_rate": 7.170581562263795e-07,
"entropy": 1.3290839582681655,
"num_tokens": 15666987.0,
"mean_token_accuracy": 0.6819840222597122,
"epoch": 1.1297478288165743,
"step": 2700
},
{
"loss": 1.3671725273132325,
"grad_norm": 0.1209392324090004,
"learning_rate": 7.150210580045207e-07,
"entropy": 1.3735456377267838,
"num_tokens": 15724955.0,
"mean_token_accuracy": 0.6725474014878273,
"epoch": 1.133933242649367,
"step": 2710
},
{
"loss": 1.3231231689453125,
"grad_norm": 0.12559957802295685,
"learning_rate": 7.129795713379776e-07,
"entropy": 1.340329071879387,
"num_tokens": 15782149.0,
"mean_token_accuracy": 0.6805369645357132,
"epoch": 1.1381186564821597,
"step": 2720
},
{
"loss": 1.2828726768493652,
"grad_norm": 0.13034865260124207,
"learning_rate": 7.109337378922102e-07,
"entropy": 1.2797758102416992,
"num_tokens": 15835973.0,
"mean_token_accuracy": 0.6902579948306083,
"epoch": 1.1423040703149523,
"step": 2730
},
{
"loss": 1.329068374633789,
"grad_norm": 0.1187472939491272,
"learning_rate": 7.088835994213937e-07,
"entropy": 1.3206837117671966,
"num_tokens": 15895605.0,
"mean_token_accuracy": 0.6760165989398956,
"epoch": 1.146489484147745,
"step": 2740
},
{
"loss": 1.2608001708984375,
"grad_norm": 0.11278735101222992,
"learning_rate": 7.068291977675661e-07,
"entropy": 1.314364343881607,
"num_tokens": 15956260.0,
"mean_token_accuracy": 0.6899202361702919,
"epoch": 1.150674897980538,
"step": 2750
},
{
"loss": 1.3079211235046386,
"grad_norm": 0.10432706028223038,
"learning_rate": 7.047705748597741e-07,
"entropy": 1.3454543590545653,
"num_tokens": 16013636.0,
"mean_token_accuracy": 0.6848849534988404,
"epoch": 1.1548603118133305,
"step": 2760
},
{
"loss": 1.3299365043640137,
"grad_norm": 0.1423172652721405,
"learning_rate": 7.027077727132178e-07,
"entropy": 1.3436584562063216,
"num_tokens": 16070788.0,
"mean_token_accuracy": 0.6782758548855782,
"epoch": 1.1590457256461233,
"step": 2770
},
{
"loss": 1.2507868766784669,
"grad_norm": 0.12985938787460327,
"learning_rate": 7.006408334283929e-07,
"entropy": 1.300880002975464,
"num_tokens": 16132003.0,
"mean_token_accuracy": 0.6908931702375412,
"epoch": 1.163231139478916,
"step": 2780
},
{
"loss": 1.3301843643188476,
"grad_norm": 0.14071504771709442,
"learning_rate": 6.985697991902313e-07,
"entropy": 1.3270384550094605,
"num_tokens": 16192149.0,
"mean_token_accuracy": 0.6777920231223107,
"epoch": 1.1674165533117087,
"step": 2790
},
{
"loss": 1.2743472099304198,
"grad_norm": 0.1140187457203865,
"learning_rate": 6.964947122672406e-07,
"entropy": 1.3053037211298943,
"num_tokens": 16251607.0,
"mean_token_accuracy": 0.6888150230050087,
"epoch": 1.1716019671445015,
"step": 2800
},
{
"eval_loss": 1.3166502714157104,
"eval_runtime": 43.6438,
"eval_samples_per_second": 145.977,
"eval_steps_per_second": 6.095,
"eval_entropy": 1.3201469900016498,
"eval_num_tokens": 16251607.0,
"eval_mean_token_accuracy": 0.6811264934844541,
"epoch": 1.1716019671445015,
"step": 2800
},
{
"loss": 1.3409744262695313,
"grad_norm": 0.10443054884672165,
"learning_rate": 6.944156150106407e-07,
"entropy": 1.342512857913971,
"num_tokens": 16312813.0,
"mean_token_accuracy": 0.6724711164832116,
"epoch": 1.175787380977294,
"step": 2810
},
{
"loss": 1.3988855361938477,
"grad_norm": 0.1189141720533371,
"learning_rate": 6.923325498535005e-07,
"entropy": 1.396900659799576,
"num_tokens": 16370227.0,
"mean_token_accuracy": 0.6674019232392311,
"epoch": 1.1799727948100869,
"step": 2820
},
{
"loss": 1.3619994163513183,
"grad_norm": 0.11577111482620239,
"learning_rate": 6.902455593098711e-07,
"entropy": 1.3739877551794053,
"num_tokens": 16431284.0,
"mean_token_accuracy": 0.6685123056173324,
"epoch": 1.1841582086428795,
"step": 2830
},
{
"loss": 1.2823293685913086,
"grad_norm": 0.1623101532459259,
"learning_rate": 6.881546859739178e-07,
"entropy": 1.287187758088112,
"num_tokens": 16490232.0,
"mean_token_accuracy": 0.6852916941046715,
"epoch": 1.1883436224756723,
"step": 2840
},
{
"loss": 1.295179557800293,
"grad_norm": 0.1286296844482422,
"learning_rate": 6.860599725190516e-07,
"entropy": 1.3181857854127883,
"num_tokens": 16549313.0,
"mean_token_accuracy": 0.6848940759897232,
"epoch": 1.192529036308465,
"step": 2850
},
{
"loss": 1.351776695251465,
"grad_norm": 0.12631654739379883,
"learning_rate": 6.839614616970579e-07,
"entropy": 1.3548940598964692,
"num_tokens": 16607551.0,
"mean_token_accuracy": 0.6714933633804321,
"epoch": 1.1967144501412577,
"step": 2860
},
{
"loss": 1.3425410270690918,
"grad_norm": 0.1304273158311844,
"learning_rate": 6.818591963372242e-07,
"entropy": 1.3392845541238785,
"num_tokens": 16667124.0,
"mean_token_accuracy": 0.6802757531404495,
"epoch": 1.2008998639740505,
"step": 2870
},
{
"loss": 1.3046100616455079,
"grad_norm": 0.12168211489915848,
"learning_rate": 6.797532193454654e-07,
"entropy": 1.3106303334236145,
"num_tokens": 16725868.0,
"mean_token_accuracy": 0.6820132330060005,
"epoch": 1.2050852778068433,
"step": 2880
},
{
"loss": 1.266930389404297,
"grad_norm": 0.16589786112308502,
"learning_rate": 6.776435737034484e-07,
"entropy": 1.2930086612701417,
"num_tokens": 16780751.0,
"mean_token_accuracy": 0.6916173666715622,
"epoch": 1.2092706916396359,
"step": 2890
},
{
"loss": 1.290895366668701,
"grad_norm": 0.11063241213560104,
"learning_rate": 6.755303024677153e-07,
"entropy": 1.3148932754993439,
"num_tokens": 16838274.0,
"mean_token_accuracy": 0.6836249440908432,
"epoch": 1.2134561054724287,
"step": 2900
},
{
"loss": 1.3228137016296386,
"grad_norm": 0.10892044007778168,
"learning_rate": 6.734134487688043e-07,
"entropy": 1.3384662061929702,
"num_tokens": 16896457.0,
"mean_token_accuracy": 0.6798223108053207,
"epoch": 1.2176415193052212,
"step": 2910
},
{
"loss": 1.321933650970459,
"grad_norm": 0.13741441071033478,
"learning_rate": 6.712930558103691e-07,
"entropy": 1.3460487127304077,
"num_tokens": 16955127.0,
"mean_token_accuracy": 0.6765735790133476,
"epoch": 1.221826933138014,
"step": 2920
},
{
"loss": 1.3986333847045898,
"grad_norm": 0.13116198778152466,
"learning_rate": 6.691691668682977e-07,
"entropy": 1.3796002447605134,
"num_tokens": 17010269.0,
"mean_token_accuracy": 0.6650555938482284,
"epoch": 1.2260123469708066,
"step": 2930
},
{
"loss": 1.3275323867797852,
"grad_norm": 0.1158343181014061,
"learning_rate": 6.670418252898284e-07,
"entropy": 1.3303757071495057,
"num_tokens": 17067471.0,
"mean_token_accuracy": 0.6782015576958657,
"epoch": 1.2301977608035994,
"step": 2940
},
{
"loss": 1.299326515197754,
"grad_norm": 0.13345105946063995,
"learning_rate": 6.649110744926669e-07,
"entropy": 1.319593369960785,
"num_tokens": 17123848.0,
"mean_token_accuracy": 0.6838883191347123,
"epoch": 1.2343831746363922,
"step": 2950
},
{
"loss": 1.2679595947265625,
"grad_norm": 0.13203385472297668,
"learning_rate": 6.627769579640975e-07,
"entropy": 1.2961439684033393,
"num_tokens": 17180001.0,
"mean_token_accuracy": 0.6859666183590889,
"epoch": 1.2385685884691848,
"step": 2960
},
{
"loss": 1.3538383483886718,
"grad_norm": 0.1471163034439087,
"learning_rate": 6.606395192600978e-07,
"entropy": 1.3352440029382706,
"num_tokens": 17233262.0,
"mean_token_accuracy": 0.6754373088479042,
"epoch": 1.2427540023019776,
"step": 2970
},
{
"loss": 1.3183878898620605,
"grad_norm": 0.12840019166469574,
"learning_rate": 6.584988020044485e-07,
"entropy": 1.349251627922058,
"num_tokens": 17287228.0,
"mean_token_accuracy": 0.6815307438373566,
"epoch": 1.2469394161347704,
"step": 2980
},
{
"loss": 1.2082359313964843,
"grad_norm": 0.15220077335834503,
"learning_rate": 6.563548498878438e-07,
"entropy": 1.2743981599807739,
"num_tokens": 17348380.0,
"mean_token_accuracy": 0.6978771463036537,
"epoch": 1.251124829967563,
"step": 2990
},
{
"loss": 1.3413416862487793,
"grad_norm": 0.12702776491641998,
"learning_rate": 6.542077066669993e-07,
"entropy": 1.338026624917984,
"num_tokens": 17403328.0,
"mean_token_accuracy": 0.675315049290657,
"epoch": 1.2553102438003558,
"step": 3000
},
{
"eval_loss": 1.3136601448059082,
"eval_runtime": 43.154,
"eval_samples_per_second": 147.634,
"eval_steps_per_second": 6.164,
"eval_entropy": 1.3217154624766874,
"eval_num_tokens": 17403328.0,
"eval_mean_token_accuracy": 0.6815465722317086,
"epoch": 1.2553102438003558,
"step": 3000
},
{
"loss": 1.3451406478881835,
"grad_norm": 0.1156093031167984,
"learning_rate": 6.52057416163759e-07,
"entropy": 1.3520446419715881,
"num_tokens": 17460648.0,
"mean_token_accuracy": 0.6702774554491043,
"epoch": 1.2594956576331484,
"step": 3010
},
{
"loss": 1.3822593688964844,
"grad_norm": 0.12327724695205688,
"learning_rate": 6.499040222642007e-07,
"entropy": 1.365411925315857,
"num_tokens": 17519410.0,
"mean_token_accuracy": 0.6692025378346443,
"epoch": 1.2636810714659412,
"step": 3020
},
{
"loss": 1.3682982444763183,
"grad_norm": 0.12616313993930817,
"learning_rate": 6.477475689177407e-07,
"entropy": 1.3488513588905335,
"num_tokens": 17575033.0,
"mean_token_accuracy": 0.6721004649996758,
"epoch": 1.2678664852987338,
"step": 3030
},
{
"loss": 1.3206647872924804,
"grad_norm": 0.11206343024969101,
"learning_rate": 6.455881001362372e-07,
"entropy": 1.3416712805628777,
"num_tokens": 17634281.0,
"mean_token_accuracy": 0.6792711272835732,
"epoch": 1.2720518991315266,
"step": 3040
},
{
"loss": 1.368018913269043,
"grad_norm": 0.1311446875333786,
"learning_rate": 6.434256599930909e-07,
"entropy": 1.37212732732296,
"num_tokens": 17689407.0,
"mean_token_accuracy": 0.6717290371656418,
"epoch": 1.2762373129643194,
"step": 3050
},
{
"loss": 1.3607032775878907,
"grad_norm": 0.14133571088314056,
"learning_rate": 6.412602926223464e-07,
"entropy": 1.3578105926513673,
"num_tokens": 17748080.0,
"mean_token_accuracy": 0.6729270294308662,
"epoch": 1.280422726797112,
"step": 3060
},
{
"loss": 1.259375,
"grad_norm": 0.12888510525226593,
"learning_rate": 6.390920422177909e-07,
"entropy": 1.2880975693464278,
"num_tokens": 17809457.0,
"mean_token_accuracy": 0.6906314134597779,
"epoch": 1.2846081406299048,
"step": 3070
},
{
"loss": 1.2357722282409669,
"grad_norm": 0.09784252196550369,
"learning_rate": 6.36920953032053e-07,
"entropy": 1.3111811935901643,
"num_tokens": 17871869.0,
"mean_token_accuracy": 0.6910292714834213,
"epoch": 1.2887935544626976,
"step": 3080
},
{
"loss": 1.2405315399169923,
"grad_norm": 0.13264605402946472,
"learning_rate": 6.347470693756987e-07,
"entropy": 1.2896562367677689,
"num_tokens": 17933114.0,
"mean_token_accuracy": 0.6923574149608612,
"epoch": 1.2929789682954902,
"step": 3090
},
{
"loss": 1.2628044128417968,
"grad_norm": 0.11528719961643219,
"learning_rate": 6.325704356163273e-07,
"entropy": 1.2994973942637444,
"num_tokens": 17989694.0,
"mean_token_accuracy": 0.6884831428527832,
"epoch": 1.297164382128283,
"step": 3100
},
{
"loss": 1.2996297836303712,
"grad_norm": 0.1078164130449295,
"learning_rate": 6.303910961776664e-07,
"entropy": 1.315569232404232,
"num_tokens": 18049760.0,
"mean_token_accuracy": 0.6808459624648094,
"epoch": 1.3013497959610756,
"step": 3110
},
{
"loss": 1.3285269737243652,
"grad_norm": 0.13121522963047028,
"learning_rate": 6.282090955386642e-07,
"entropy": 1.3389025837183,
"num_tokens": 18106326.0,
"mean_token_accuracy": 0.6797921672463417,
"epoch": 1.3055352097938684,
"step": 3120
},
{
"loss": 1.3413330078125,
"grad_norm": 0.1084539070725441,
"learning_rate": 6.260244782325829e-07,
"entropy": 1.3604058563709258,
"num_tokens": 18165478.0,
"mean_token_accuracy": 0.6726910755038261,
"epoch": 1.309720623626661,
"step": 3130
},
{
"loss": 1.3873212814331055,
"grad_norm": 0.11267993599176407,
"learning_rate": 6.238372888460892e-07,
"entropy": 1.404004666209221,
"num_tokens": 18221418.0,
"mean_token_accuracy": 0.6710177347064018,
"epoch": 1.3139060374594538,
"step": 3140
},
{
"loss": 1.312180519104004,
"grad_norm": 0.12451887875795364,
"learning_rate": 6.216475720183437e-07,
"entropy": 1.322364729642868,
"num_tokens": 18278027.0,
"mean_token_accuracy": 0.6799433350563049,
"epoch": 1.3180914512922466,
"step": 3150
},
{
"loss": 1.2946537017822266,
"grad_norm": 0.15065018832683563,
"learning_rate": 6.194553724400911e-07,
"entropy": 1.3054640024900437,
"num_tokens": 18334990.0,
"mean_token_accuracy": 0.6847308576107025,
"epoch": 1.3222768651250392,
"step": 3160
},
{
"loss": 1.2657323837280274,
"grad_norm": 0.11712754517793655,
"learning_rate": 6.172607348527474e-07,
"entropy": 1.2842485100030898,
"num_tokens": 18393253.0,
"mean_token_accuracy": 0.6883261352777481,
"epoch": 1.326462278957832,
"step": 3170
},
{
"loss": 1.3007762908935547,
"grad_norm": 0.16621780395507812,
"learning_rate": 6.150637040474868e-07,
"entropy": 1.3247014865279199,
"num_tokens": 18449407.0,
"mean_token_accuracy": 0.6868977710604668,
"epoch": 1.3306476927906248,
"step": 3180
},
{
"loss": 1.2898554801940918,
"grad_norm": 0.13524088263511658,
"learning_rate": 6.128643248643274e-07,
"entropy": 1.305448915064335,
"num_tokens": 18506773.0,
"mean_token_accuracy": 0.6856573060154915,
"epoch": 1.3348331066234174,
"step": 3190
},
{
"loss": 1.3588788986206055,
"grad_norm": 0.1192813366651535,
"learning_rate": 6.106626421912163e-07,
"entropy": 1.3537309616804123,
"num_tokens": 18568894.0,
"mean_token_accuracy": 0.6747590154409409,
"epoch": 1.3390185204562102,
"step": 3200
},
{
"eval_loss": 1.3109967708587646,
"eval_runtime": 43.5438,
"eval_samples_per_second": 146.312,
"eval_steps_per_second": 6.109,
"eval_entropy": 1.3213256283810264,
"eval_num_tokens": 18568894.0,
"eval_mean_token_accuracy": 0.6819047645518654,
"epoch": 1.3390185204562102,
"step": 3200
},
{
"loss": 1.3187339782714844,
"grad_norm": 0.12998685240745544,
"learning_rate": 6.084587009631135e-07,
"entropy": 1.3331160172820091,
"num_tokens": 18627669.0,
"mean_token_accuracy": 0.6809702217578888,
"epoch": 1.3432039342890028,
"step": 3210
},
{
"loss": 1.287522792816162,
"grad_norm": 0.11987276375293732,
"learning_rate": 6.062525461610746e-07,
"entropy": 1.3107565701007844,
"num_tokens": 18690323.0,
"mean_token_accuracy": 0.6874667569994927,
"epoch": 1.3473893481217956,
"step": 3220
},
{
"loss": 1.2994555473327636,
"grad_norm": 0.14244310557842255,
"learning_rate": 6.040442228113328e-07,
"entropy": 1.3177940219640731,
"num_tokens": 18749330.0,
"mean_token_accuracy": 0.6785065039992333,
"epoch": 1.3515747619545881,
"step": 3230
},
{
"loss": 1.348573875427246,
"grad_norm": 0.12585744261741638,
"learning_rate": 6.018337759843803e-07,
"entropy": 1.3356850504875184,
"num_tokens": 18805120.0,
"mean_token_accuracy": 0.676536102592945,
"epoch": 1.355760175787381,
"step": 3240
},
{
"loss": 1.3361006736755372,
"grad_norm": 0.1416776031255722,
"learning_rate": 5.996212507940475e-07,
"entropy": 1.355094811320305,
"num_tokens": 18861563.0,
"mean_token_accuracy": 0.6747770622372627,
"epoch": 1.3599455896201738,
"step": 3250
},
{
"loss": 1.2256298065185547,
"grad_norm": 0.10666567087173462,
"learning_rate": 5.974066923965835e-07,
"entropy": 1.2700331062078476,
"num_tokens": 18922654.0,
"mean_token_accuracy": 0.697255577147007,
"epoch": 1.3641310034529663,
"step": 3260
},
{
"loss": 1.3261382102966308,
"grad_norm": 0.1291145235300064,
"learning_rate": 5.951901459897337e-07,
"entropy": 1.351950439810753,
"num_tokens": 18980966.0,
"mean_token_accuracy": 0.6802997335791587,
"epoch": 1.3683164172857591,
"step": 3270
},
{
"loss": 1.339816188812256,
"grad_norm": 0.12637273967266083,
"learning_rate": 5.929716568118176e-07,
"entropy": 1.341824659705162,
"num_tokens": 19041925.0,
"mean_token_accuracy": 0.6740039184689521,
"epoch": 1.372501831118552,
"step": 3280
},
{
"loss": 1.322571086883545,
"grad_norm": 0.13360652327537537,
"learning_rate": 5.907512701408049e-07,
"entropy": 1.3231751516461372,
"num_tokens": 19097885.0,
"mean_token_accuracy": 0.6797602906823158,
"epoch": 1.3766872449513445,
"step": 3290
},
{
"loss": 1.2750181198120116,
"grad_norm": 0.1212676391005516,
"learning_rate": 5.885290312933929e-07,
"entropy": 1.2946186915040017,
"num_tokens": 19156636.0,
"mean_token_accuracy": 0.6879573374986648,
"epoch": 1.3808726587841373,
"step": 3300
},
{
"loss": 1.2478185653686524,
"grad_norm": 0.10930495709180832,
"learning_rate": 5.863049856240797e-07,
"entropy": 1.2818130880594254,
"num_tokens": 19215539.0,
"mean_token_accuracy": 0.6884996458888054,
"epoch": 1.38505807261693,
"step": 3310
},
{
"loss": 1.2877882957458495,
"grad_norm": 0.14618222415447235,
"learning_rate": 5.840791785242399e-07,
"entropy": 1.3158632695674897,
"num_tokens": 19275155.0,
"mean_token_accuracy": 0.6801917359232903,
"epoch": 1.3892434864497227,
"step": 3320
},
{
"loss": 1.3204275131225587,
"grad_norm": 0.13408797979354858,
"learning_rate": 5.818516554211983e-07,
"entropy": 1.3392174810171127,
"num_tokens": 19331554.0,
"mean_token_accuracy": 0.6769860580563545,
"epoch": 1.3934289002825153,
"step": 3330
},
{
"loss": 1.2603473663330078,
"grad_norm": 0.15371856093406677,
"learning_rate": 5.796224617773012e-07,
"entropy": 1.2774315923452377,
"num_tokens": 19389359.0,
"mean_token_accuracy": 0.6910146772861481,
"epoch": 1.3976143141153081,
"step": 3340
},
{
"loss": 1.3022661209106445,
"grad_norm": 0.1194000095129013,
"learning_rate": 5.773916430889905e-07,
"entropy": 1.3322788611054421,
"num_tokens": 19449266.0,
"mean_token_accuracy": 0.6764059454202652,
"epoch": 1.401799727948101,
"step": 3350
},
{
"loss": 1.280670738220215,
"grad_norm": 0.1414560228586197,
"learning_rate": 5.751592448858737e-07,
"entropy": 1.28292535841465,
"num_tokens": 19505413.0,
"mean_token_accuracy": 0.6831368803977966,
"epoch": 1.4059851417808935,
"step": 3360
},
{
"loss": 1.2672719955444336,
"grad_norm": 0.1158173456788063,
"learning_rate": 5.729253127297955e-07,
"entropy": 1.2811419636011123,
"num_tokens": 19564391.0,
"mean_token_accuracy": 0.6885835364460945,
"epoch": 1.4101705556136863,
"step": 3370
},
{
"loss": 1.3353286743164063,
"grad_norm": 0.12490648031234741,
"learning_rate": 5.706898922139074e-07,
"entropy": 1.3280266046524047,
"num_tokens": 19623582.0,
"mean_token_accuracy": 0.6795374467968941,
"epoch": 1.4143559694464791,
"step": 3380
},
{
"loss": 1.2236414909362794,
"grad_norm": 0.1268617957830429,
"learning_rate": 5.684530289617376e-07,
"entropy": 1.281736159324646,
"num_tokens": 19682008.0,
"mean_token_accuracy": 0.6963353782892228,
"epoch": 1.4185413832792717,
"step": 3390
},
{
"loss": 1.365687370300293,
"grad_norm": 0.12744104862213135,
"learning_rate": 5.662147686262595e-07,
"entropy": 1.3710105925798417,
"num_tokens": 19735892.0,
"mean_token_accuracy": 0.6697546020150185,
"epoch": 1.4227267971120645,
"step": 3400
},
{
"eval_loss": 1.3087373971939087,
"eval_runtime": 43.1953,
"eval_samples_per_second": 147.493,
"eval_steps_per_second": 6.158,
"eval_entropy": 1.314033669636662,
"eval_num_tokens": 19735892.0,
"eval_mean_token_accuracy": 0.6822930157632756,
"epoch": 1.4227267971120645,
"step": 3400
},
{
"loss": 1.2659673690795898,
"grad_norm": 0.12900042533874512,
"learning_rate": 5.639751568889601e-07,
"entropy": 1.2991064012050628,
"num_tokens": 19795701.0,
"mean_token_accuracy": 0.6906736105680465,
"epoch": 1.426912210944857,
"step": 3410
},
{
"loss": 1.3444564819335938,
"grad_norm": 0.141217440366745,
"learning_rate": 5.617342394589076e-07,
"entropy": 1.328627872467041,
"num_tokens": 19852770.0,
"mean_token_accuracy": 0.6754001170396805,
"epoch": 1.43109762477765,
"step": 3420
},
{
"loss": 1.306549644470215,
"grad_norm": 0.12908576428890228,
"learning_rate": 5.594920620718189e-07,
"entropy": 1.3152456805109978,
"num_tokens": 19912101.0,
"mean_token_accuracy": 0.6831103786826134,
"epoch": 1.4352830386104425,
"step": 3430
},
{
"loss": 1.4111559867858887,
"grad_norm": 0.12521252036094666,
"learning_rate": 5.572486704891254e-07,
"entropy": 1.3963622391223907,
"num_tokens": 19969427.0,
"mean_token_accuracy": 0.6657738149166107,
"epoch": 1.4394684524432353,
"step": 3440
},
{
"loss": 1.3437400817871095,
"grad_norm": 0.14173802733421326,
"learning_rate": 5.550041104970396e-07,
"entropy": 1.3275486350059509,
"num_tokens": 20030520.0,
"mean_token_accuracy": 0.6768685072660446,
"epoch": 1.443653866276028,
"step": 3450
},
{
"loss": 1.3257243156433105,
"grad_norm": 0.12309889495372772,
"learning_rate": 5.527584279056207e-07,
"entropy": 1.3419605940580368,
"num_tokens": 20088125.0,
"mean_token_accuracy": 0.6762049332261085,
"epoch": 1.4478392801088207,
"step": 3460
},
{
"loss": 1.226247215270996,
"grad_norm": 0.13416838645935059,
"learning_rate": 5.505116685478394e-07,
"entropy": 1.284440317749977,
"num_tokens": 20147039.0,
"mean_token_accuracy": 0.6947048246860504,
"epoch": 1.4520246939416135,
"step": 3470
},
{
"loss": 1.3330312728881837,
"grad_norm": 0.1232227310538292,
"learning_rate": 5.48263878278642e-07,
"entropy": 1.3413183093070984,
"num_tokens": 20205035.0,
"mean_token_accuracy": 0.67842618227005,
"epoch": 1.4562101077744063,
"step": 3480
},
{
"loss": 1.3752978324890137,
"grad_norm": 0.1453479379415512,
"learning_rate": 5.460151029740161e-07,
"entropy": 1.3477472990751267,
"num_tokens": 20260344.0,
"mean_token_accuracy": 0.6687687709927559,
"epoch": 1.4603955216071989,
"step": 3490
},
{
"loss": 1.221930980682373,
"grad_norm": 0.1277054399251938,
"learning_rate": 5.437653885300522e-07,
"entropy": 1.261066934466362,
"num_tokens": 20318023.0,
"mean_token_accuracy": 0.6955515563488006,
"epoch": 1.4645809354399917,
"step": 3500
},
{
"loss": 1.2911027908325194,
"grad_norm": 0.12536244094371796,
"learning_rate": 5.415147808620086e-07,
"entropy": 1.3049872070550919,
"num_tokens": 20376931.0,
"mean_token_accuracy": 0.6845586389303208,
"epoch": 1.4687663492727843,
"step": 3510
},
{
"loss": 1.289406967163086,
"grad_norm": 0.10206779837608337,
"learning_rate": 5.392633259033735e-07,
"entropy": 1.3262745544314385,
"num_tokens": 20435694.0,
"mean_token_accuracy": 0.6822992920875549,
"epoch": 1.472951763105577,
"step": 3520
},
{
"loss": 1.36239652633667,
"grad_norm": 0.12939555943012238,
"learning_rate": 5.370110696049282e-07,
"entropy": 1.353842854499817,
"num_tokens": 20494766.0,
"mean_token_accuracy": 0.6707737103104592,
"epoch": 1.4771371769383697,
"step": 3530
},
{
"loss": 1.2756587028503419,
"grad_norm": 0.11803429573774338,
"learning_rate": 5.34758057933808e-07,
"entropy": 1.3108687788248061,
"num_tokens": 20552412.0,
"mean_token_accuracy": 0.6879084140062333,
"epoch": 1.4813225907711625,
"step": 3540
},
{
"loss": 1.328935432434082,
"grad_norm": 0.13244083523750305,
"learning_rate": 5.325043368725662e-07,
"entropy": 1.3331556499004364,
"num_tokens": 20610804.0,
"mean_token_accuracy": 0.6783339202404022,
"epoch": 1.4855080046039553,
"step": 3550
},
{
"loss": 1.334804153442383,
"grad_norm": 0.14279146492481232,
"learning_rate": 5.302499524182327e-07,
"entropy": 1.3319466978311538,
"num_tokens": 20668475.0,
"mean_token_accuracy": 0.6799613311886787,
"epoch": 1.4896934184367479,
"step": 3560
},
{
"loss": 1.2445655822753907,
"grad_norm": 0.137944757938385,
"learning_rate": 5.279949505813783e-07,
"entropy": 1.2786899566650392,
"num_tokens": 20731142.0,
"mean_token_accuracy": 0.6872789070010186,
"epoch": 1.4938788322695407,
"step": 3570
},
{
"loss": 1.2799750328063966,
"grad_norm": 0.14270278811454773,
"learning_rate": 5.257393773851733e-07,
"entropy": 1.3207889288663863,
"num_tokens": 20791636.0,
"mean_token_accuracy": 0.6855424389243125,
"epoch": 1.4980642461023335,
"step": 3580
},
{
"loss": 1.2714473724365234,
"grad_norm": 0.13160590827465057,
"learning_rate": 5.234832788644492e-07,
"entropy": 1.2881531581282615,
"num_tokens": 20850942.0,
"mean_token_accuracy": 0.6868263691663742,
"epoch": 1.502249659935126,
"step": 3590
},
{
"loss": 1.377396297454834,
"grad_norm": 0.13724471628665924,
"learning_rate": 5.212267010647594e-07,
"entropy": 1.350425472855568,
"num_tokens": 20909794.0,
"mean_token_accuracy": 0.6673172801733017,
"epoch": 1.5064350737679189,
"step": 3600
},
{
"eval_loss": 1.3067371845245361,
"eval_runtime": 43.2705,
"eval_samples_per_second": 147.237,
"eval_steps_per_second": 6.147,
"eval_entropy": 1.3166241054248093,
"eval_num_tokens": 20909794.0,
"eval_mean_token_accuracy": 0.6824193930715546,
"epoch": 1.5064350737679189,
"step": 3600
},
{
"loss": 1.3214588165283203,
"grad_norm": 0.12896448373794556,
"learning_rate": 5.189696900414387e-07,
"entropy": 1.3233384594321251,
"num_tokens": 20966668.0,
"mean_token_accuracy": 0.6812221944332123,
"epoch": 1.5106204876007117,
"step": 3610
},
{
"loss": 1.2983431816101074,
"grad_norm": 0.11282876133918762,
"learning_rate": 5.167122918586641e-07,
"entropy": 1.3307133883237838,
"num_tokens": 21019757.0,
"mean_token_accuracy": 0.6807741552591324,
"epoch": 1.5148059014335042,
"step": 3620
},
{
"loss": 1.2715835571289062,
"grad_norm": 0.15096786618232727,
"learning_rate": 5.144545525885137e-07,
"entropy": 1.3070465952157975,
"num_tokens": 21077694.0,
"mean_token_accuracy": 0.6909742683172226,
"epoch": 1.5189913152662968,
"step": 3630
},
{
"loss": 1.2847407341003418,
"grad_norm": 0.13508452475070953,
"learning_rate": 5.121965183100278e-07,
"entropy": 1.2937607616186142,
"num_tokens": 21135607.0,
"mean_token_accuracy": 0.6900022774934769,
"epoch": 1.5231767290990896,
"step": 3640
},
{
"loss": 1.3469314575195312,
"grad_norm": 0.12108864635229111,
"learning_rate": 5.099382351082666e-07,
"entropy": 1.3381920427083969,
"num_tokens": 21196736.0,
"mean_token_accuracy": 0.6733641669154167,
"epoch": 1.5273621429318824,
"step": 3650
},
{
"loss": 1.3375173568725587,
"grad_norm": 0.10356143862009048,
"learning_rate": 5.076797490733718e-07,
"entropy": 1.339997085928917,
"num_tokens": 21253173.0,
"mean_token_accuracy": 0.6792127892374993,
"epoch": 1.531547556764675,
"step": 3660
},
{
"loss": 1.2826983451843261,
"grad_norm": 0.13479599356651306,
"learning_rate": 5.054211062996241e-07,
"entropy": 1.305300708115101,
"num_tokens": 21309039.0,
"mean_token_accuracy": 0.6865562707185745,
"epoch": 1.5357329705974678,
"step": 3670
},
{
"loss": 1.2372420310974122,
"grad_norm": 0.13392086327075958,
"learning_rate": 5.031623528845032e-07,
"entropy": 1.2712924674153327,
"num_tokens": 21368207.0,
"mean_token_accuracy": 0.6935058936476708,
"epoch": 1.5399183844302606,
"step": 3680
},
{
"loss": 1.232171630859375,
"grad_norm": 0.13666661083698273,
"learning_rate": 5.009035349277469e-07,
"entropy": 1.2765518009662629,
"num_tokens": 21425778.0,
"mean_token_accuracy": 0.6935865059494972,
"epoch": 1.5441037982630532,
"step": 3690
},
{
"loss": 1.2454364776611329,
"grad_norm": 0.11271411925554276,
"learning_rate": 4.986446985304105e-07,
"entropy": 1.2914676815271378,
"num_tokens": 21484225.0,
"mean_token_accuracy": 0.6902065351605415,
"epoch": 1.548289212095846,
"step": 3700
},
{
"loss": 1.3030299186706542,
"grad_norm": 0.1399720460176468,
"learning_rate": 4.963858897939254e-07,
"entropy": 1.3240129977464676,
"num_tokens": 21541427.0,
"mean_token_accuracy": 0.6800246313214302,
"epoch": 1.5524746259286388,
"step": 3710
},
{
"loss": 1.2892935752868653,
"grad_norm": 0.12150213122367859,
"learning_rate": 4.941271548191588e-07,
"entropy": 1.3206008851528168,
"num_tokens": 21600143.0,
"mean_token_accuracy": 0.6826386615633965,
"epoch": 1.5566600397614314,
"step": 3720
},
{
"loss": 1.257300853729248,
"grad_norm": 0.12151734530925751,
"learning_rate": 4.918685397054718e-07,
"entropy": 1.3101585179567337,
"num_tokens": 21656388.0,
"mean_token_accuracy": 0.6894284501671791,
"epoch": 1.560845453594224,
"step": 3730
},
{
"loss": 1.233230972290039,
"grad_norm": 0.11451518535614014,
"learning_rate": 4.896100905497803e-07,
"entropy": 1.2788519978523254,
"num_tokens": 21715109.0,
"mean_token_accuracy": 0.6912301525473594,
"epoch": 1.5650308674270168,
"step": 3740
},
{
"loss": 1.3578661918640136,
"grad_norm": 0.12948159873485565,
"learning_rate": 4.873518534456119e-07,
"entropy": 1.3681051909923554,
"num_tokens": 21772746.0,
"mean_token_accuracy": 0.6737246960401535,
"epoch": 1.5692162812598096,
"step": 3750
},
{
"loss": 1.3305506706237793,
"grad_norm": 0.132918581366539,
"learning_rate": 4.850938744821674e-07,
"entropy": 1.3546297058463097,
"num_tokens": 21830592.0,
"mean_token_accuracy": 0.6777592465281487,
"epoch": 1.5734016950926022,
"step": 3760
},
{
"loss": 1.2513206481933594,
"grad_norm": 0.11073267459869385,
"learning_rate": 4.828361997433782e-07,
"entropy": 1.3076282858848571,
"num_tokens": 21892167.0,
"mean_token_accuracy": 0.68986496925354,
"epoch": 1.577587108925395,
"step": 3770
},
{
"loss": 1.271761131286621,
"grad_norm": 0.1349179595708847,
"learning_rate": 4.805788753069673e-07,
"entropy": 1.3031177580356599,
"num_tokens": 21952326.0,
"mean_token_accuracy": 0.6844272211194038,
"epoch": 1.5817725227581878,
"step": 3780
},
{
"loss": 1.3118972778320312,
"grad_norm": 0.16216766834259033,
"learning_rate": 4.783219472435081e-07,
"entropy": 1.3089008510112763,
"num_tokens": 22012045.0,
"mean_token_accuracy": 0.6802516788244247,
"epoch": 1.5859579365909804,
"step": 3790
},
{
"loss": 1.276634979248047,
"grad_norm": 0.13579685986042023,
"learning_rate": 4.760654616154842e-07,
"entropy": 1.309640994668007,
"num_tokens": 22068798.0,
"mean_token_accuracy": 0.6864374697208404,
"epoch": 1.5901433504237732,
"step": 3800
},
{
"eval_loss": 1.3049228191375732,
"eval_runtime": 43.7061,
"eval_samples_per_second": 145.769,
"eval_steps_per_second": 6.086,
"eval_entropy": 1.3191418598469038,
"eval_num_tokens": 22068798.0,
"eval_mean_token_accuracy": 0.6827442256131566,
"epoch": 1.5901433504237732,
"step": 3800
},
{
"loss": 1.2272584915161133,
"grad_norm": 0.14792950451374054,
"learning_rate": 4.7380946447634935e-07,
"entropy": 1.2662998199462892,
"num_tokens": 22128395.0,
"mean_token_accuracy": 0.6912488013505935,
"epoch": 1.594328764256566,
"step": 3810
},
{
"loss": 1.3102972030639648,
"grad_norm": 0.13187845051288605,
"learning_rate": 4.7155400186958744e-07,
"entropy": 1.3161917060613633,
"num_tokens": 22185985.0,
"mean_token_accuracy": 0.6805871248245239,
"epoch": 1.5985141780893586,
"step": 3820
},
{
"loss": 1.2989760398864747,
"grad_norm": 0.11845917999744415,
"learning_rate": 4.6929911982777325e-07,
"entropy": 1.3359744518995285,
"num_tokens": 22241668.0,
"mean_token_accuracy": 0.6845213517546653,
"epoch": 1.6026995919221512,
"step": 3830
},
{
"loss": 1.3590301513671874,
"grad_norm": 0.15108934044837952,
"learning_rate": 4.670448643716322e-07,
"entropy": 1.3409444272518158,
"num_tokens": 22297005.0,
"mean_token_accuracy": 0.6736213758587837,
"epoch": 1.606885005754944,
"step": 3840
},
{
"loss": 1.3382845878601075,
"grad_norm": 0.1333889216184616,
"learning_rate": 4.6479128150910196e-07,
"entropy": 1.3449043482542038,
"num_tokens": 22357044.0,
"mean_token_accuracy": 0.675865213572979,
"epoch": 1.6110704195877368,
"step": 3850
},
{
"loss": 1.310408592224121,
"grad_norm": 0.13304699957370758,
"learning_rate": 4.625384172343926e-07,
"entropy": 1.3386895060539246,
"num_tokens": 22413961.0,
"mean_token_accuracy": 0.6803735584020615,
"epoch": 1.6152558334205294,
"step": 3860
},
{
"loss": 1.2976115226745606,
"grad_norm": 0.11845416575670242,
"learning_rate": 4.602863175270483e-07,
"entropy": 1.3106703519821168,
"num_tokens": 22473509.0,
"mean_token_accuracy": 0.6821468025445938,
"epoch": 1.6194412472533222,
"step": 3870
},
{
"loss": 1.2801057815551757,
"grad_norm": 0.13403619825839996,
"learning_rate": 4.580350283510088e-07,
"entropy": 1.295821413397789,
"num_tokens": 22527439.0,
"mean_token_accuracy": 0.688027186691761,
"epoch": 1.623626661086115,
"step": 3880
},
{
"loss": 1.2886218070983886,
"grad_norm": 0.14298778772354126,
"learning_rate": 4.55784595653672e-07,
"entropy": 1.300849825143814,
"num_tokens": 22586180.0,
"mean_token_accuracy": 0.6847355782985687,
"epoch": 1.6278120749189076,
"step": 3890
},
{
"loss": 1.3447054862976073,
"grad_norm": 0.13079994916915894,
"learning_rate": 4.535350653649549e-07,
"entropy": 1.3326701998710633,
"num_tokens": 22642360.0,
"mean_token_accuracy": 0.6774184912443161,
"epoch": 1.6319974887517004,
"step": 3900
},
{
"loss": 1.3529298782348633,
"grad_norm": 0.10864491760730743,
"learning_rate": 4.512864833963571e-07,
"entropy": 1.338020858168602,
"num_tokens": 22699591.0,
"mean_token_accuracy": 0.6726731553673744,
"epoch": 1.6361829025844932,
"step": 3910
},
{
"loss": 1.2351530075073243,
"grad_norm": 0.12713301181793213,
"learning_rate": 4.4903889564002394e-07,
"entropy": 1.2726581797003746,
"num_tokens": 22757192.0,
"mean_token_accuracy": 0.6924531191587449,
"epoch": 1.6403683164172858,
"step": 3920
},
{
"loss": 1.2388504028320313,
"grad_norm": 0.12611106038093567,
"learning_rate": 4.467923479678091e-07,
"entropy": 1.2651499658823013,
"num_tokens": 22813695.0,
"mean_token_accuracy": 0.6929998561739922,
"epoch": 1.6445537302500783,
"step": 3930
},
{
"loss": 1.2846209526062011,
"grad_norm": 0.1429147869348526,
"learning_rate": 4.4454688623033894e-07,
"entropy": 1.3101652726531028,
"num_tokens": 22873575.0,
"mean_token_accuracy": 0.6836878523230553,
"epoch": 1.6487391440828711,
"step": 3940
},
{
"loss": 1.3049224853515624,
"grad_norm": 0.1356530785560608,
"learning_rate": 4.4230255625607637e-07,
"entropy": 1.3245902001857757,
"num_tokens": 22930361.0,
"mean_token_accuracy": 0.6820057585835457,
"epoch": 1.652924557915664,
"step": 3950
},
{
"loss": 1.3374080657958984,
"grad_norm": 0.13884375989437103,
"learning_rate": 4.400594038503864e-07,
"entropy": 1.3170197814702989,
"num_tokens": 22987617.0,
"mean_token_accuracy": 0.6769963175058364,
"epoch": 1.6571099717484565,
"step": 3960
},
{
"loss": 1.3491817474365235,
"grad_norm": 0.1354888528585434,
"learning_rate": 4.3781747479459974e-07,
"entropy": 1.3501463949680328,
"num_tokens": 23042051.0,
"mean_token_accuracy": 0.6761467263102532,
"epoch": 1.6612953855812493,
"step": 3970
},
{
"loss": 1.3287680625915528,
"grad_norm": 0.16015098989009857,
"learning_rate": 4.355768148450799e-07,
"entropy": 1.3458044916391372,
"num_tokens": 23098670.0,
"mean_token_accuracy": 0.6781758189201355,
"epoch": 1.6654807994140421,
"step": 3980
},
{
"loss": 1.2587160110473632,
"grad_norm": 0.13395771384239197,
"learning_rate": 4.3333746973228854e-07,
"entropy": 1.2841592252254486,
"num_tokens": 23157362.0,
"mean_token_accuracy": 0.6881028071045876,
"epoch": 1.6696662132468347,
"step": 3990
},
{
"loss": 1.3319854736328125,
"grad_norm": 0.13594871759414673,
"learning_rate": 4.310994851598522e-07,
"entropy": 1.3213010758161545,
"num_tokens": 23212648.0,
"mean_token_accuracy": 0.6781036898493766,
"epoch": 1.6738516270796275,
"step": 4000
},
{
"eval_loss": 1.3033655881881714,
"eval_runtime": 43.5667,
"eval_samples_per_second": 146.235,
"eval_steps_per_second": 6.106,
"eval_entropy": 1.31763897399257,
"eval_num_tokens": 23212648.0,
"eval_mean_token_accuracy": 0.6829292187117096,
"epoch": 1.6738516270796275,
"step": 4000
},
{
"loss": 1.2960718154907227,
"grad_norm": 0.13045227527618408,
"learning_rate": 4.288629068036296e-07,
"entropy": 1.3475263714790344,
"num_tokens": 23274106.0,
"mean_token_accuracy": 0.6837118580937386,
"epoch": 1.6780370409124203,
"step": 4010
},
{
"loss": 1.2239046096801758,
"grad_norm": 0.1388995498418808,
"learning_rate": 4.2662778031077993e-07,
"entropy": 1.2441598355770112,
"num_tokens": 23333462.0,
"mean_token_accuracy": 0.6996882349252701,
"epoch": 1.682222454745213,
"step": 4020
},
{
"loss": 1.356397533416748,
"grad_norm": 0.13702833652496338,
"learning_rate": 4.243941512988304e-07,
"entropy": 1.363625492155552,
"num_tokens": 23392153.0,
"mean_token_accuracy": 0.6763727009296417,
"epoch": 1.6864078685780055,
"step": 4030
},
{
"loss": 1.2513771057128906,
"grad_norm": 0.14271850883960724,
"learning_rate": 4.221620653547454e-07,
"entropy": 1.2843372076749802,
"num_tokens": 23454405.0,
"mean_token_accuracy": 0.6873761117458344,
"epoch": 1.6905932824107983,
"step": 4040
},
{
"loss": 1.2633016586303711,
"grad_norm": 0.1479983925819397,
"learning_rate": 4.199315680339968e-07,
"entropy": 1.2902348592877388,
"num_tokens": 23515963.0,
"mean_token_accuracy": 0.6904997587203979,
"epoch": 1.6947786962435911,
"step": 4050
},
{
"loss": 1.3018023490905761,
"grad_norm": 0.12075834721326828,
"learning_rate": 4.1770270485963294e-07,
"entropy": 1.315699815750122,
"num_tokens": 23573373.0,
"mean_token_accuracy": 0.6817046746611595,
"epoch": 1.6989641100763837,
"step": 4060
},
{
"loss": 1.3296629905700683,
"grad_norm": 0.15214762091636658,
"learning_rate": 4.154755213213513e-07,
"entropy": 1.339156760275364,
"num_tokens": 23630153.0,
"mean_token_accuracy": 0.6760370403528213,
"epoch": 1.7031495239091765,
"step": 4070
},
{
"loss": 1.288606834411621,
"grad_norm": 0.1338847577571869,
"learning_rate": 4.132500628745681e-07,
"entropy": 1.308351318538189,
"num_tokens": 23689525.0,
"mean_token_accuracy": 0.6800839513540268,
"epoch": 1.7073349377419693,
"step": 4080
},
{
"loss": 1.3122243881225586,
"grad_norm": 0.13693219423294067,
"learning_rate": 4.110263749394918e-07,
"entropy": 1.310598623752594,
"num_tokens": 23746173.0,
"mean_token_accuracy": 0.6841694295406342,
"epoch": 1.711520351574762,
"step": 4090
},
{
"loss": 1.3211769104003905,
"grad_norm": 0.12190598249435425,
"learning_rate": 4.0880450290019594e-07,
"entropy": 1.3578921407461166,
"num_tokens": 23804574.0,
"mean_token_accuracy": 0.6757835909724236,
"epoch": 1.7157057654075547,
"step": 4100
},
{
"loss": 1.2779497146606444,
"grad_norm": 0.14035965502262115,
"learning_rate": 4.0658449210369295e-07,
"entropy": 1.311075533926487,
"num_tokens": 23859817.0,
"mean_token_accuracy": 0.6860598146915435,
"epoch": 1.7198911792403475,
"step": 4110
},
{
"loss": 1.289837646484375,
"grad_norm": 0.11380521208047867,
"learning_rate": 4.0436638785900797e-07,
"entropy": 1.3117400839924813,
"num_tokens": 23918028.0,
"mean_token_accuracy": 0.6838786184787751,
"epoch": 1.72407659307314,
"step": 4120
},
{
"loss": 1.2879505157470703,
"grad_norm": 0.15264193713665009,
"learning_rate": 4.0215023543625494e-07,
"entropy": 1.3319763213396072,
"num_tokens": 23977871.0,
"mean_token_accuracy": 0.6834924459457398,
"epoch": 1.7282620069059327,
"step": 4130
},
{
"loss": 1.2729723930358887,
"grad_norm": 0.13197912275791168,
"learning_rate": 3.999360800657121e-07,
"entropy": 1.3003861784934998,
"num_tokens": 24032724.0,
"mean_token_accuracy": 0.6865213885903358,
"epoch": 1.7324474207387257,
"step": 4140
},
{
"loss": 1.2687364578247071,
"grad_norm": 0.12328355014324188,
"learning_rate": 3.977239669368997e-07,
"entropy": 1.2848697736859322,
"num_tokens": 24091459.0,
"mean_token_accuracy": 0.686721895635128,
"epoch": 1.7366328345715183,
"step": 4150
},
{
"loss": 1.310719871520996,
"grad_norm": 0.13127021491527557,
"learning_rate": 3.955139411976564e-07,
"entropy": 1.3004455357789992,
"num_tokens": 24145064.0,
"mean_token_accuracy": 0.6823625862598419,
"epoch": 1.7408182484043109,
"step": 4160
},
{
"loss": 1.277029323577881,
"grad_norm": 0.13161002099514008,
"learning_rate": 3.9330604795321877e-07,
"entropy": 1.2868661388754845,
"num_tokens": 24202651.0,
"mean_token_accuracy": 0.6845416814088822,
"epoch": 1.7450036622371037,
"step": 4170
},
{
"loss": 1.2249256134033204,
"grad_norm": 0.1313517987728119,
"learning_rate": 3.911003322653009e-07,
"entropy": 1.2720478802919388,
"num_tokens": 24259573.0,
"mean_token_accuracy": 0.6942620486021042,
"epoch": 1.7491890760698965,
"step": 4180
},
{
"loss": 1.3295866966247558,
"grad_norm": 0.1546325832605362,
"learning_rate": 3.888968391511738e-07,
"entropy": 1.32426298558712,
"num_tokens": 24322196.0,
"mean_token_accuracy": 0.6730754569172859,
"epoch": 1.753374489902689,
"step": 4190
},
{
"loss": 1.3283195495605469,
"grad_norm": 0.13663379848003387,
"learning_rate": 3.866956135827475e-07,
"entropy": 1.3125308185815812,
"num_tokens": 24376182.0,
"mean_token_accuracy": 0.6829302325844765,
"epoch": 1.7575599037354819,
"step": 4200
},
{
"eval_loss": 1.3021423816680908,
"eval_runtime": 43.7163,
"eval_samples_per_second": 145.735,
"eval_steps_per_second": 6.085,
"eval_entropy": 1.3239203189548694,
"eval_num_tokens": 24376182.0,
"eval_mean_token_accuracy": 0.6831337011846385,
"epoch": 1.7575599037354819,
"step": 4200
},
{
"loss": 1.2734957695007325,
"grad_norm": 0.13826783001422882,
"learning_rate": 3.844967004856526e-07,
"entropy": 1.3006668120622635,
"num_tokens": 24433979.0,
"mean_token_accuracy": 0.6882383152842522,
"epoch": 1.7617453175682747,
"step": 4210
},
{
"loss": 1.298065757751465,
"grad_norm": 0.11341753602027893,
"learning_rate": 3.8230014473832386e-07,
"entropy": 1.3199127793312073,
"num_tokens": 24496717.0,
"mean_token_accuracy": 0.6763370648026467,
"epoch": 1.7659307314010673,
"step": 4220
},
{
"loss": 1.2942770004272461,
"grad_norm": 0.11990880221128464,
"learning_rate": 3.801059911710835e-07,
"entropy": 1.3037174761295318,
"num_tokens": 24556339.0,
"mean_token_accuracy": 0.6810034438967705,
"epoch": 1.7701161452338599,
"step": 4230
},
{
"loss": 1.3638200759887695,
"grad_norm": 0.12909641861915588,
"learning_rate": 3.779142845652275e-07,
"entropy": 1.37214894592762,
"num_tokens": 24610844.0,
"mean_token_accuracy": 0.6698960587382317,
"epoch": 1.7743015590666529,
"step": 4240
},
{
"loss": 1.4049521446228028,
"grad_norm": 0.137081578373909,
"learning_rate": 3.757250696521104e-07,
"entropy": 1.3875975281000137,
"num_tokens": 24663935.0,
"mean_token_accuracy": 0.6685925871133804,
"epoch": 1.7784869728994455,
"step": 4250
},
{
"loss": 1.2346957206726075,
"grad_norm": 0.15599705278873444,
"learning_rate": 3.7353839111223285e-07,
"entropy": 1.2952020585536956,
"num_tokens": 24724653.0,
"mean_token_accuracy": 0.6917664587497712,
"epoch": 1.782672386732238,
"step": 4260
},
{
"loss": 1.3122922897338867,
"grad_norm": 0.14105546474456787,
"learning_rate": 3.713542935743299e-07,
"entropy": 1.3242159157991409,
"num_tokens": 24783350.0,
"mean_token_accuracy": 0.6838966220617294,
"epoch": 1.7868578005650309,
"step": 4270
},
{
"loss": 1.2724437713623047,
"grad_norm": 0.14495964348316193,
"learning_rate": 3.6917282161445986e-07,
"entropy": 1.2849380433559419,
"num_tokens": 24840720.0,
"mean_token_accuracy": 0.6882339790463448,
"epoch": 1.7910432143978237,
"step": 4280
},
{
"loss": 1.2272959709167481,
"grad_norm": 0.1268715113401413,
"learning_rate": 3.66994019755095e-07,
"entropy": 1.2522281989455224,
"num_tokens": 24900268.0,
"mean_token_accuracy": 0.6954008027911186,
"epoch": 1.7952286282306162,
"step": 4290
},
{
"loss": 1.2925883293151856,
"grad_norm": 0.12049921602010727,
"learning_rate": 3.648179324642119e-07,
"entropy": 1.3150138720870017,
"num_tokens": 24955875.0,
"mean_token_accuracy": 0.6815980896353722,
"epoch": 1.799414042063409,
"step": 4300
},
{
"loss": 1.2687823295593261,
"grad_norm": 0.1410578191280365,
"learning_rate": 3.62644604154385e-07,
"entropy": 1.292095237970352,
"num_tokens": 25015757.0,
"mean_token_accuracy": 0.6861520081758499,
"epoch": 1.8035994558962019,
"step": 4310
},
{
"loss": 1.3122770309448242,
"grad_norm": 0.1278466135263443,
"learning_rate": 3.6047407918187923e-07,
"entropy": 1.32326979637146,
"num_tokens": 25073319.0,
"mean_token_accuracy": 0.6822131305932999,
"epoch": 1.8077848697289944,
"step": 4320
},
{
"loss": 1.239914321899414,
"grad_norm": 0.1450994610786438,
"learning_rate": 3.5830640184574567e-07,
"entropy": 1.2679915323853492,
"num_tokens": 25132470.0,
"mean_token_accuracy": 0.6903218165040016,
"epoch": 1.811970283561787,
"step": 4330
},
{
"loss": 1.3607330322265625,
"grad_norm": 0.14295367896556854,
"learning_rate": 3.5614161638691655e-07,
"entropy": 1.361120554804802,
"num_tokens": 25185195.0,
"mean_token_accuracy": 0.6752493545413017,
"epoch": 1.81615569739458,
"step": 4340
},
{
"loss": 1.2877880096435548,
"grad_norm": 0.1336987167596817,
"learning_rate": 3.539797669873029e-07,
"entropy": 1.294544619321823,
"num_tokens": 25241604.0,
"mean_token_accuracy": 0.6817367270588874,
"epoch": 1.8203411112273726,
"step": 4350
},
{
"loss": 1.2616640090942384,
"grad_norm": 0.12443723529577255,
"learning_rate": 3.518208977688924e-07,
"entropy": 1.3023397505283356,
"num_tokens": 25301515.0,
"mean_token_accuracy": 0.6868541851639748,
"epoch": 1.8245265250601652,
"step": 4360
},
{
"loss": 1.2237573623657227,
"grad_norm": 0.14553169906139374,
"learning_rate": 3.496650527928495e-07,
"entropy": 1.2511302560567856,
"num_tokens": 25357723.0,
"mean_token_accuracy": 0.6974032506346702,
"epoch": 1.828711938892958,
"step": 4370
},
{
"loss": 1.3280474662780761,
"grad_norm": 0.12313038110733032,
"learning_rate": 3.4751227605861544e-07,
"entropy": 1.3370114535093307,
"num_tokens": 25417249.0,
"mean_token_accuracy": 0.6781404823064804,
"epoch": 1.8328973527257508,
"step": 4380
},
{
"loss": 1.3116769790649414,
"grad_norm": 0.12443029880523682,
"learning_rate": 3.453626115030103e-07,
"entropy": 1.323847246170044,
"num_tokens": 25476722.0,
"mean_token_accuracy": 0.6824665144085884,
"epoch": 1.8370827665585434,
"step": 4390
},
{
"loss": 1.2541227340698242,
"grad_norm": 0.14306563138961792,
"learning_rate": 3.4321610299933754e-07,
"entropy": 1.275883974134922,
"num_tokens": 25536071.0,
"mean_token_accuracy": 0.6896202132105828,
"epoch": 1.8412681803913362,
"step": 4400
},
{
"eval_loss": 1.300899624824524,
"eval_runtime": 43.6667,
"eval_samples_per_second": 145.901,
"eval_steps_per_second": 6.092,
"eval_entropy": 1.31194052436298,
"eval_num_tokens": 25536071.0,
"eval_mean_token_accuracy": 0.6834116909736977,
"epoch": 1.8412681803913362,
"step": 4400
},
{
"loss": 1.3052658081054687,
"grad_norm": 0.12484145909547806,
"learning_rate": 3.410727943564865e-07,
"entropy": 1.304879105091095,
"num_tokens": 25592326.0,
"mean_token_accuracy": 0.6803866416215897,
"epoch": 1.845453594224129,
"step": 4410
},
{
"loss": 1.2852392196655273,
"grad_norm": 0.1245272308588028,
"learning_rate": 3.3893272931804004e-07,
"entropy": 1.2998150080442428,
"num_tokens": 25650560.0,
"mean_token_accuracy": 0.6859546720981597,
"epoch": 1.8496390080569216,
"step": 4420
},
{
"loss": 1.3093093872070312,
"grad_norm": 0.13694874942302704,
"learning_rate": 3.367959515613809e-07,
"entropy": 1.326773339509964,
"num_tokens": 25710390.0,
"mean_token_accuracy": 0.6779543459415436,
"epoch": 1.8538244218897142,
"step": 4430
},
{
"loss": 1.3431434631347656,
"grad_norm": 0.13425195217132568,
"learning_rate": 3.346625046968003e-07,
"entropy": 1.3320137143135071,
"num_tokens": 25765683.0,
"mean_token_accuracy": 0.6735352456569672,
"epoch": 1.8580098357225072,
"step": 4440
},
{
"loss": 1.2724491119384767,
"grad_norm": 0.13366232812404633,
"learning_rate": 3.325324322666081e-07,
"entropy": 1.27188421189785,
"num_tokens": 25824731.0,
"mean_token_accuracy": 0.689846420288086,
"epoch": 1.8621952495552998,
"step": 4450
},
{
"loss": 1.2903520584106445,
"grad_norm": 0.12611544132232666,
"learning_rate": 3.3040577774424437e-07,
"entropy": 1.3168232500553132,
"num_tokens": 25885073.0,
"mean_token_accuracy": 0.6854806423187256,
"epoch": 1.8663806633880924,
"step": 4460
},
{
"loss": 1.3013708114624023,
"grad_norm": 0.14317074418067932,
"learning_rate": 3.2828258453339155e-07,
"entropy": 1.3177704036235809,
"num_tokens": 25942626.0,
"mean_token_accuracy": 0.6822627365589142,
"epoch": 1.8705660772208852,
"step": 4470
},
{
"loss": 1.332556915283203,
"grad_norm": 0.14360307157039642,
"learning_rate": 3.261628959670889e-07,
"entropy": 1.3369245409965516,
"num_tokens": 25997260.0,
"mean_token_accuracy": 0.6774563640356064,
"epoch": 1.874751491053678,
"step": 4480
},
{
"loss": 1.2616004943847656,
"grad_norm": 0.13137495517730713,
"learning_rate": 3.240467553068481e-07,
"entropy": 1.2717559725046157,
"num_tokens": 26055446.0,
"mean_token_accuracy": 0.6905860707163811,
"epoch": 1.8789369048864706,
"step": 4490
},
{
"loss": 1.408462142944336,
"grad_norm": 0.13120928406715393,
"learning_rate": 3.2193420574177034e-07,
"entropy": 1.3706548005342483,
"num_tokens": 26111925.0,
"mean_token_accuracy": 0.6645766496658325,
"epoch": 1.8831223187192634,
"step": 4500
},
{
"loss": 1.2302813529968262,
"grad_norm": 0.15121367573738098,
"learning_rate": 3.1982529038766505e-07,
"entropy": 1.274702313542366,
"num_tokens": 26171418.0,
"mean_token_accuracy": 0.6942442029714584,
"epoch": 1.8873077325520562,
"step": 4510
},
{
"loss": 1.2387846946716308,
"grad_norm": 0.11012545973062515,
"learning_rate": 3.1772005228616933e-07,
"entropy": 1.2893740877509117,
"num_tokens": 26232638.0,
"mean_token_accuracy": 0.6922576785087585,
"epoch": 1.8914931463848488,
"step": 4520
},
{
"loss": 1.292118453979492,
"grad_norm": 0.14446091651916504,
"learning_rate": 3.156185344038699e-07,
"entropy": 1.3311437577009202,
"num_tokens": 26293065.0,
"mean_token_accuracy": 0.6810448184609413,
"epoch": 1.8956785602176414,
"step": 4530
},
{
"loss": 1.33145170211792,
"grad_norm": 0.14474819600582123,
"learning_rate": 3.135207796314263e-07,
"entropy": 1.3151442527770996,
"num_tokens": 26349311.0,
"mean_token_accuracy": 0.6806560069322586,
"epoch": 1.8998639740504344,
"step": 4540
},
{
"loss": 1.2638792037963866,
"grad_norm": 0.11845609545707703,
"learning_rate": 3.114268307826953e-07,
"entropy": 1.2752373963594437,
"num_tokens": 26407067.0,
"mean_token_accuracy": 0.6892140090465546,
"epoch": 1.904049387883227,
"step": 4550
},
{
"loss": 1.341792106628418,
"grad_norm": 0.1555166393518448,
"learning_rate": 3.093367305938572e-07,
"entropy": 1.3313662111759186,
"num_tokens": 26463271.0,
"mean_token_accuracy": 0.6772884294390679,
"epoch": 1.9082348017160196,
"step": 4560
},
{
"loss": 1.2394842147827148,
"grad_norm": 0.13164710998535156,
"learning_rate": 3.072505217225435e-07,
"entropy": 1.2927237793803215,
"num_tokens": 26519442.0,
"mean_token_accuracy": 0.688689187169075,
"epoch": 1.9124202155488124,
"step": 4570
},
{
"loss": 1.2526350021362305,
"grad_norm": 0.12433302402496338,
"learning_rate": 3.051682467469663e-07,
"entropy": 1.3005468085408212,
"num_tokens": 26576793.0,
"mean_token_accuracy": 0.6895026102662086,
"epoch": 1.9166056293816052,
"step": 4580
},
{
"loss": 1.300935935974121,
"grad_norm": 0.14517027139663696,
"learning_rate": 3.030899481650496e-07,
"entropy": 1.3120550215244293,
"num_tokens": 26632676.0,
"mean_token_accuracy": 0.6833431273698807,
"epoch": 1.9207910432143978,
"step": 4590
},
{
"loss": 1.305576705932617,
"grad_norm": 0.13038092851638794,
"learning_rate": 3.010156683935614e-07,
"entropy": 1.3124109566211701,
"num_tokens": 26690585.0,
"mean_token_accuracy": 0.6795723259449005,
"epoch": 1.9249764570471906,
"step": 4600
},
{
"eval_loss": 1.299921989440918,
"eval_runtime": 43.8297,
"eval_samples_per_second": 145.358,
"eval_steps_per_second": 6.069,
"eval_entropy": 1.3205744122204028,
"eval_num_tokens": 26690585.0,
"eval_mean_token_accuracy": 0.6834659193243299,
"epoch": 1.9249764570471906,
"step": 4600
},
{
"loss": 1.2759065628051758,
"grad_norm": 0.13065700232982635,
"learning_rate": 2.9894544976724845e-07,
"entropy": 1.3232569113373756,
"num_tokens": 26750126.0,
"mean_token_accuracy": 0.6856075286865234,
"epoch": 1.9291618708799834,
"step": 4610
},
{
"loss": 1.3960214614868165,
"grad_norm": 0.13134630024433136,
"learning_rate": 2.968793345379722e-07,
"entropy": 1.3706552177667617,
"num_tokens": 26809589.0,
"mean_token_accuracy": 0.6644584119319916,
"epoch": 1.933347284712776,
"step": 4620
},
{
"loss": 1.2981806755065919,
"grad_norm": 0.13195905089378357,
"learning_rate": 2.9481736487384615e-07,
"entropy": 1.2926361411809921,
"num_tokens": 26868122.0,
"mean_token_accuracy": 0.6837931454181672,
"epoch": 1.9375326985455685,
"step": 4630
},
{
"loss": 1.3205986022949219,
"grad_norm": 0.1443631947040558,
"learning_rate": 2.9275958285837567e-07,
"entropy": 1.3107433021068573,
"num_tokens": 26928383.0,
"mean_token_accuracy": 0.6803469866514206,
"epoch": 1.9417181123783616,
"step": 4640
},
{
"loss": 1.2733318328857421,
"grad_norm": 0.12333279103040695,
"learning_rate": 2.907060304895984e-07,
"entropy": 1.306384412944317,
"num_tokens": 26987347.0,
"mean_token_accuracy": 0.6883521243929863,
"epoch": 1.9459035262111541,
"step": 4650
},
{
"loss": 1.2352895736694336,
"grad_norm": 0.15071088075637817,
"learning_rate": 2.8865674967922815e-07,
"entropy": 1.2537823468446732,
"num_tokens": 27044802.0,
"mean_token_accuracy": 0.6937527641654014,
"epoch": 1.9500889400439467,
"step": 4660
},
{
"loss": 1.2543825149536132,
"grad_norm": 0.11400660872459412,
"learning_rate": 2.866117822517982e-07,
"entropy": 1.2866296932101249,
"num_tokens": 27102535.0,
"mean_token_accuracy": 0.6873494073748588,
"epoch": 1.9542743538767395,
"step": 4670
},
{
"loss": 1.3186541557312013,
"grad_norm": 0.13829229772090912,
"learning_rate": 2.8457116994380913e-07,
"entropy": 1.331754493713379,
"num_tokens": 27160092.0,
"mean_token_accuracy": 0.6785844698548317,
"epoch": 1.9584597677095323,
"step": 4680
},
{
"loss": 1.305195140838623,
"grad_norm": 0.13482722640037537,
"learning_rate": 2.8253495440287555e-07,
"entropy": 1.3345891624689101,
"num_tokens": 27216273.0,
"mean_token_accuracy": 0.6796174451708794,
"epoch": 1.962645181542325,
"step": 4690
},
{
"loss": 1.2968315124511718,
"grad_norm": 0.13567803800106049,
"learning_rate": 2.805031771868774e-07,
"entropy": 1.3210385277867318,
"num_tokens": 27274967.0,
"mean_token_accuracy": 0.6838418498635292,
"epoch": 1.9668305953751177,
"step": 4700
},
{
"loss": 1.2280590057373046,
"grad_norm": 0.11562594771385193,
"learning_rate": 2.784758797631113e-07,
"entropy": 1.2723073571920396,
"num_tokens": 27332706.0,
"mean_token_accuracy": 0.693420697748661,
"epoch": 1.9710160092079105,
"step": 4710
},
{
"loss": 1.2968725204467773,
"grad_norm": 0.1304372102022171,
"learning_rate": 2.7645310350744293e-07,
"entropy": 1.3245373040437698,
"num_tokens": 27391429.0,
"mean_token_accuracy": 0.685273765027523,
"epoch": 1.9752014230407031,
"step": 4720
},
{
"loss": 1.2400999069213867,
"grad_norm": 0.12765555083751678,
"learning_rate": 2.744348897034657e-07,
"entropy": 1.2704340279102326,
"num_tokens": 27449195.0,
"mean_token_accuracy": 0.6926597207784653,
"epoch": 1.9793868368734957,
"step": 4730
},
{
"loss": 1.313642406463623,
"grad_norm": 0.12744539976119995,
"learning_rate": 2.724212795416544e-07,
"entropy": 1.323761799931526,
"num_tokens": 27507409.0,
"mean_token_accuracy": 0.6781192749738694,
"epoch": 1.9835722507062887,
"step": 4740
},
{
"loss": 1.3404882431030274,
"grad_norm": 0.1226453185081482,
"learning_rate": 2.704123141185275e-07,
"entropy": 1.3297797441482544,
"num_tokens": 27562728.0,
"mean_token_accuracy": 0.6767387732863426,
"epoch": 1.9877576645390813,
"step": 4750
},
{
"loss": 1.3537111282348633,
"grad_norm": 0.1401350498199463,
"learning_rate": 2.6840803443580715e-07,
"entropy": 1.3468406647443771,
"num_tokens": 27622323.0,
"mean_token_accuracy": 0.6730136394500732,
"epoch": 1.991943078371874,
"step": 4760
},
{
"loss": 1.337536907196045,
"grad_norm": 0.13678760826587677,
"learning_rate": 2.664084813995818e-07,
"entropy": 1.3439167469739914,
"num_tokens": 27679189.0,
"mean_token_accuracy": 0.676570326089859,
"epoch": 1.9961284922046667,
"step": 4770
},
{
"loss": 1.2568793296813965,
"grad_norm": 0.37100762128829956,
"learning_rate": 2.644136958194727e-07,
"entropy": 1.2735960676863387,
"num_tokens": 27730417.0,
"mean_token_accuracy": 0.6950372699144725,
"epoch": 2.0,
"step": 4780
},
{
"loss": 1.2392065048217773,
"grad_norm": 0.12272343039512634,
"learning_rate": 2.624237184078004e-07,
"entropy": 1.2709258124232292,
"num_tokens": 27790663.0,
"mean_token_accuracy": 0.6911343216896058,
"epoch": 2.0041854138327926,
"step": 4790
},
{
"loss": 1.2543585777282715,
"grad_norm": 0.11581531912088394,
"learning_rate": 2.6043858977875287e-07,
"entropy": 1.3081357836723329,
"num_tokens": 27848150.0,
"mean_token_accuracy": 0.6878180950880051,
"epoch": 2.0083708276655856,
"step": 4800
},
{
"eval_loss": 1.2991561889648438,
"eval_runtime": 42.5158,
"eval_samples_per_second": 149.85,
"eval_steps_per_second": 6.256,
"eval_entropy": 1.2998529121391755,
"eval_num_tokens": 27848150.0,
"eval_mean_token_accuracy": 0.683588629378412,
"epoch": 2.0083708276655856,
"step": 4800
},
{
"loss": 1.26077880859375,
"grad_norm": 0.1441148817539215,
"learning_rate": 2.584583504475587e-07,
"entropy": 1.2779432222247125,
"num_tokens": 27905176.0,
"mean_token_accuracy": 0.6864334151148797,
"epoch": 2.012556241498378,
"step": 4810
},
{
"loss": 1.2669543266296386,
"grad_norm": 0.1253192126750946,
"learning_rate": 2.5648304082965775e-07,
"entropy": 1.2866142064332962,
"num_tokens": 27963167.0,
"mean_token_accuracy": 0.6874963492155075,
"epoch": 2.016741655331171,
"step": 4820
},
{
"loss": 1.3348177909851073,
"grad_norm": 0.14420834183692932,
"learning_rate": 2.5451270123987843e-07,
"entropy": 1.3393577009439468,
"num_tokens": 28020052.0,
"mean_token_accuracy": 0.6757590815424919,
"epoch": 2.020927069163964,
"step": 4830
},
{
"loss": 1.2622191429138183,
"grad_norm": 0.13085411489009857,
"learning_rate": 2.5254737189161373e-07,
"entropy": 1.3007038220763207,
"num_tokens": 28078981.0,
"mean_token_accuracy": 0.6892907366156578,
"epoch": 2.0251124829967564,
"step": 4840
},
{
"loss": 1.3640257835388183,
"grad_norm": 0.15776140987873077,
"learning_rate": 2.5058709289600067e-07,
"entropy": 1.3638720154762267,
"num_tokens": 28134625.0,
"mean_token_accuracy": 0.6710668623447418,
"epoch": 2.029297896829549,
"step": 4850
},
{
"loss": 1.2844989776611329,
"grad_norm": 0.11809894442558289,
"learning_rate": 2.486319042611019e-07,
"entropy": 1.2931891083717346,
"num_tokens": 28194798.0,
"mean_token_accuracy": 0.6872560605406761,
"epoch": 2.0334833106623416,
"step": 4860
},
{
"loss": 1.3038467407226562,
"grad_norm": 0.1337338089942932,
"learning_rate": 2.4668184589108867e-07,
"entropy": 1.33267682492733,
"num_tokens": 28252833.0,
"mean_token_accuracy": 0.6824387982487679,
"epoch": 2.0376687244951346,
"step": 4870
},
{
"loss": 1.3354660034179688,
"grad_norm": 0.15302719175815582,
"learning_rate": 2.4473695758542707e-07,
"entropy": 1.343110579252243,
"num_tokens": 28312031.0,
"mean_token_accuracy": 0.6740260154008866,
"epoch": 2.041854138327927,
"step": 4880
},
{
"loss": 1.2677290916442872,
"grad_norm": 0.13939176499843597,
"learning_rate": 2.4279727903806556e-07,
"entropy": 1.2891633421182633,
"num_tokens": 28370177.0,
"mean_token_accuracy": 0.6864463344216347,
"epoch": 2.0460395521607198,
"step": 4890
},
{
"loss": 1.2482310295104981,
"grad_norm": 0.1433423012495041,
"learning_rate": 2.408628498366242e-07,
"entropy": 1.2710548743605614,
"num_tokens": 28429335.0,
"mean_token_accuracy": 0.6896666899323464,
"epoch": 2.050224965993513,
"step": 4900
},
{
"loss": 1.2301183700561524,
"grad_norm": 0.12237236648797989,
"learning_rate": 2.389337094615875e-07,
"entropy": 1.278349894285202,
"num_tokens": 28489915.0,
"mean_token_accuracy": 0.6930847212672233,
"epoch": 2.0544103798263054,
"step": 4910
},
{
"loss": 1.2912443161010743,
"grad_norm": 0.17072777450084686,
"learning_rate": 2.370098972854987e-07,
"entropy": 1.277844424545765,
"num_tokens": 28547300.0,
"mean_token_accuracy": 0.68348438590765,
"epoch": 2.058595793659098,
"step": 4920
},
{
"loss": 1.2300211906433105,
"grad_norm": 0.12249208986759186,
"learning_rate": 2.3509145257215495e-07,
"entropy": 1.2522578805685043,
"num_tokens": 28607326.0,
"mean_token_accuracy": 0.6944442689418793,
"epoch": 2.0627812074918905,
"step": 4930
},
{
"loss": 1.2790916442871094,
"grad_norm": 0.22464126348495483,
"learning_rate": 2.3317841447580767e-07,
"entropy": 1.2809948831796647,
"num_tokens": 28661604.0,
"mean_token_accuracy": 0.6859934300184249,
"epoch": 2.0669666213246836,
"step": 4940
},
{
"loss": 1.2724569320678711,
"grad_norm": 0.13732433319091797,
"learning_rate": 2.312708220403623e-07,
"entropy": 1.31765376329422,
"num_tokens": 28718818.0,
"mean_token_accuracy": 0.6880935072898865,
"epoch": 2.071152035157476,
"step": 4950
},
{
"loss": 1.3442106246948242,
"grad_norm": 0.12912045419216156,
"learning_rate": 2.2936871419858194e-07,
"entropy": 1.3523348033428193,
"num_tokens": 28775584.0,
"mean_token_accuracy": 0.6736010074615478,
"epoch": 2.0753374489902687,
"step": 4960
},
{
"loss": 1.1986734390258789,
"grad_norm": 0.13469566404819489,
"learning_rate": 2.2747212977129217e-07,
"entropy": 1.2723553344607352,
"num_tokens": 28835156.0,
"mean_token_accuracy": 0.6995670750737191,
"epoch": 2.0795228628230618,
"step": 4970
},
{
"loss": 1.3475229263305664,
"grad_norm": 0.1290500909090042,
"learning_rate": 2.2558110746658953e-07,
"entropy": 1.3560008838772775,
"num_tokens": 28895271.0,
"mean_token_accuracy": 0.6725587636232376,
"epoch": 2.0837082766558543,
"step": 4980
},
{
"loss": 1.342457389831543,
"grad_norm": 0.1255948692560196,
"learning_rate": 2.236956858790513e-07,
"entropy": 1.3365329071879386,
"num_tokens": 28953018.0,
"mean_token_accuracy": 0.679172757267952,
"epoch": 2.087893690488647,
"step": 4990
},
{
"loss": 1.2630849838256837,
"grad_norm": 0.13795071840286255,
"learning_rate": 2.218159034889469e-07,
"entropy": 1.2892632216215134,
"num_tokens": 29012433.0,
"mean_token_accuracy": 0.6912027075886726,
"epoch": 2.09207910432144,
"step": 5000
},
{
"eval_loss": 1.298377513885498,
"eval_runtime": 43.7919,
"eval_samples_per_second": 145.484,
"eval_steps_per_second": 6.074,
"eval_entropy": 1.3099012760291422,
"eval_num_tokens": 29012433.0,
"eval_mean_token_accuracy": 0.6836510939257485,
"epoch": 2.09207910432144,
"step": 5000
},
{
"loss": 1.2960596084594727,
"grad_norm": 0.11307420581579208,
"learning_rate": 2.1994179866145396e-07,
"entropy": 1.3118484735488891,
"num_tokens": 29070217.0,
"mean_token_accuracy": 0.6797604545950889,
"epoch": 2.0962645181542325,
"step": 5010
},
{
"loss": 1.3223162651062013,
"grad_norm": 0.15304112434387207,
"learning_rate": 2.180734096458746e-07,
"entropy": 1.3404868721961976,
"num_tokens": 29126476.0,
"mean_token_accuracy": 0.6759276837110519,
"epoch": 2.100449931987025,
"step": 5020
},
{
"loss": 1.3450361251831056,
"grad_norm": 0.11615368723869324,
"learning_rate": 2.1621077457485427e-07,
"entropy": 1.3462235242128373,
"num_tokens": 29184125.0,
"mean_token_accuracy": 0.6701866090297699,
"epoch": 2.104635345819818,
"step": 5030
},
{
"loss": 1.3292051315307618,
"grad_norm": 0.1241302341222763,
"learning_rate": 2.1435393146360453e-07,
"entropy": 1.3317017763853074,
"num_tokens": 29243309.0,
"mean_token_accuracy": 0.6787121832370758,
"epoch": 2.1088207596526107,
"step": 5040
},
{
"loss": 1.309870719909668,
"grad_norm": 0.12809441983699799,
"learning_rate": 2.1250291820912648e-07,
"entropy": 1.3308863699436189,
"num_tokens": 29302274.0,
"mean_token_accuracy": 0.6813490375876426,
"epoch": 2.1130061734854033,
"step": 5050
},
{
"loss": 1.259312343597412,
"grad_norm": 0.11709679663181305,
"learning_rate": 2.1065777258943763e-07,
"entropy": 1.2945900693535806,
"num_tokens": 29359001.0,
"mean_token_accuracy": 0.6841064542531967,
"epoch": 2.117191587318196,
"step": 5060
},
{
"loss": 1.1917829513549805,
"grad_norm": 0.13013018667697906,
"learning_rate": 2.0881853226280082e-07,
"entropy": 1.252656841278076,
"num_tokens": 29417257.0,
"mean_token_accuracy": 0.7048160001635552,
"epoch": 2.121377001150989,
"step": 5070
},
{
"loss": 1.2949867248535156,
"grad_norm": 0.15123531222343445,
"learning_rate": 2.0698523476695506e-07,
"entropy": 1.316368493437767,
"num_tokens": 29474012.0,
"mean_token_accuracy": 0.6840205147862435,
"epoch": 2.1255624149837815,
"step": 5080
},
{
"loss": 1.231495475769043,
"grad_norm": 0.13549339771270752,
"learning_rate": 2.0515791751835066e-07,
"entropy": 1.261933021247387,
"num_tokens": 29535364.0,
"mean_token_accuracy": 0.6923261538147927,
"epoch": 2.129747828816574,
"step": 5090
},
{
"loss": 1.265492820739746,
"grad_norm": 0.12323841452598572,
"learning_rate": 2.0333661781138406e-07,
"entropy": 1.2891878262162209,
"num_tokens": 29594045.0,
"mean_token_accuracy": 0.6874890491366387,
"epoch": 2.133933242649367,
"step": 5100
},
{
"loss": 1.3455522537231446,
"grad_norm": 0.12925904989242554,
"learning_rate": 2.015213728176381e-07,
"entropy": 1.355113722383976,
"num_tokens": 29654672.0,
"mean_token_accuracy": 0.6736163109540939,
"epoch": 2.1381186564821597,
"step": 5110
},
{
"loss": 1.2876879692077636,
"grad_norm": 0.10625462979078293,
"learning_rate": 1.9971221958512259e-07,
"entropy": 1.308001670241356,
"num_tokens": 29713404.0,
"mean_token_accuracy": 0.6850254252552986,
"epoch": 2.1423040703149523,
"step": 5120
},
{
"loss": 1.269423484802246,
"grad_norm": 0.14946334064006805,
"learning_rate": 1.9790919503751786e-07,
"entropy": 1.2912926644086837,
"num_tokens": 29768834.0,
"mean_token_accuracy": 0.6910573810338974,
"epoch": 2.146489484147745,
"step": 5130
},
{
"loss": 1.3150415420532227,
"grad_norm": 0.15966582298278809,
"learning_rate": 1.961123359734222e-07,
"entropy": 1.3350969046354293,
"num_tokens": 29823986.0,
"mean_token_accuracy": 0.6827343329787254,
"epoch": 2.150674897980538,
"step": 5140
},
{
"loss": 1.2534076690673828,
"grad_norm": 0.13799019157886505,
"learning_rate": 1.9432167906560025e-07,
"entropy": 1.2794459909200668,
"num_tokens": 29882161.0,
"mean_token_accuracy": 0.6894301295280456,
"epoch": 2.1548603118133305,
"step": 5150
},
{
"loss": 1.226758861541748,
"grad_norm": 0.16427931189537048,
"learning_rate": 1.9253726086023376e-07,
"entropy": 1.2521668612957,
"num_tokens": 29938237.0,
"mean_token_accuracy": 0.6923803791403771,
"epoch": 2.159045725646123,
"step": 5160
},
{
"loss": 1.2537633895874023,
"grad_norm": 0.13021980226039886,
"learning_rate": 1.9075911777617776e-07,
"entropy": 1.2832251608371734,
"num_tokens": 29993951.0,
"mean_token_accuracy": 0.6919069468975068,
"epoch": 2.163231139478916,
"step": 5170
},
{
"loss": 1.2582441329956056,
"grad_norm": 0.13316968083381653,
"learning_rate": 1.8898728610421473e-07,
"entropy": 1.2960840493440628,
"num_tokens": 30053405.0,
"mean_token_accuracy": 0.6867008566856384,
"epoch": 2.1674165533117087,
"step": 5180
},
{
"loss": 1.2535063743591308,
"grad_norm": 0.1502976417541504,
"learning_rate": 1.8722180200631598e-07,
"entropy": 1.291701939702034,
"num_tokens": 30111434.0,
"mean_token_accuracy": 0.6882436692714691,
"epoch": 2.1716019671445013,
"step": 5190
},
{
"loss": 1.2767062187194824,
"grad_norm": 0.1319260597229004,
"learning_rate": 1.8546270151490278e-07,
"entropy": 1.298856572806835,
"num_tokens": 30168307.0,
"mean_token_accuracy": 0.68586795181036,
"epoch": 2.1757873809772943,
"step": 5200
},
{
"eval_loss": 1.297808051109314,
"eval_runtime": 43.6552,
"eval_samples_per_second": 145.939,
"eval_steps_per_second": 6.093,
"eval_entropy": 1.307837866750875,
"eval_num_tokens": 30168307.0,
"eval_mean_token_accuracy": 0.6838091374339914,
"epoch": 2.1757873809772943,
"step": 5200
},
{
"loss": 1.2791316032409668,
"grad_norm": 0.12315330654382706,
"learning_rate": 1.8371002053211048e-07,
"entropy": 1.3057184204459191,
"num_tokens": 30225681.0,
"mean_token_accuracy": 0.6861935615539551,
"epoch": 2.179972794810087,
"step": 5210
},
{
"loss": 1.32224760055542,
"grad_norm": 0.13483846187591553,
"learning_rate": 1.819637948290569e-07,
"entropy": 1.3247323662042618,
"num_tokens": 30283602.0,
"mean_token_accuracy": 0.677856071293354,
"epoch": 2.1841582086428795,
"step": 5220
},
{
"loss": 1.245813751220703,
"grad_norm": 0.12423646450042725,
"learning_rate": 1.8022406004511114e-07,
"entropy": 1.2820057839155197,
"num_tokens": 30343652.0,
"mean_token_accuracy": 0.6916850328445434,
"epoch": 2.1883436224756725,
"step": 5230
},
{
"loss": 1.313099193572998,
"grad_norm": 0.1301707625389099,
"learning_rate": 1.7849085168716704e-07,
"entropy": 1.3053890287876129,
"num_tokens": 30400983.0,
"mean_token_accuracy": 0.6804193690419197,
"epoch": 2.192529036308465,
"step": 5240
},
{
"loss": 1.2556833267211913,
"grad_norm": 0.1505342423915863,
"learning_rate": 1.7676420512891842e-07,
"entropy": 1.2873410269618035,
"num_tokens": 30459009.0,
"mean_token_accuracy": 0.6887684732675552,
"epoch": 2.1967144501412577,
"step": 5250
},
{
"loss": 1.2559351921081543,
"grad_norm": 0.13919785618782043,
"learning_rate": 1.7504415561013614e-07,
"entropy": 1.2811901897192002,
"num_tokens": 30516861.0,
"mean_token_accuracy": 0.6915321722626686,
"epoch": 2.2008998639740502,
"step": 5260
},
{
"loss": 1.2761926651000977,
"grad_norm": 0.12455730140209198,
"learning_rate": 1.7333073823595025e-07,
"entropy": 1.2844579115509986,
"num_tokens": 30575979.0,
"mean_token_accuracy": 0.6861526161432266,
"epoch": 2.2050852778068433,
"step": 5270
},
{
"loss": 1.2840014457702638,
"grad_norm": 0.13087549805641174,
"learning_rate": 1.7162398797613282e-07,
"entropy": 1.2940828785300256,
"num_tokens": 30633600.0,
"mean_token_accuracy": 0.685159420967102,
"epoch": 2.209270691639636,
"step": 5280
},
{
"loss": 1.3325956344604493,
"grad_norm": 0.15323391556739807,
"learning_rate": 1.6992393966438405e-07,
"entropy": 1.3237911939620972,
"num_tokens": 30693015.0,
"mean_token_accuracy": 0.6795177638530732,
"epoch": 2.2134561054724284,
"step": 5290
},
{
"loss": 1.310387420654297,
"grad_norm": 0.12490073591470718,
"learning_rate": 1.6823062799762205e-07,
"entropy": 1.3257877498865127,
"num_tokens": 30749233.0,
"mean_token_accuracy": 0.6818015187978744,
"epoch": 2.2176415193052215,
"step": 5300
},
{
"loss": 1.2663789749145509,
"grad_norm": 0.1327386498451233,
"learning_rate": 1.6654408753527361e-07,
"entropy": 1.3193859189748764,
"num_tokens": 30809674.0,
"mean_token_accuracy": 0.6879936501383781,
"epoch": 2.221826933138014,
"step": 5310
},
{
"loss": 1.3301811218261719,
"grad_norm": 0.14070047438144684,
"learning_rate": 1.6486435269856985e-07,
"entropy": 1.3461501210927964,
"num_tokens": 30867279.0,
"mean_token_accuracy": 0.6762196362018585,
"epoch": 2.2260123469708066,
"step": 5320
},
{
"loss": 1.2323862075805665,
"grad_norm": 0.14718832075595856,
"learning_rate": 1.6319145776984361e-07,
"entropy": 1.2663889586925507,
"num_tokens": 30923604.0,
"mean_token_accuracy": 0.6963629499077797,
"epoch": 2.2301977608035997,
"step": 5330
},
{
"loss": 1.4061556816101075,
"grad_norm": 0.11397302895784378,
"learning_rate": 1.6152543689182885e-07,
"entropy": 1.3801796600222587,
"num_tokens": 30983941.0,
"mean_token_accuracy": 0.6657746851444244,
"epoch": 2.2343831746363922,
"step": 5340
},
{
"loss": 1.3607137680053711,
"grad_norm": 0.13541868329048157,
"learning_rate": 1.5986632406696515e-07,
"entropy": 1.3243082225322724,
"num_tokens": 31042120.0,
"mean_token_accuracy": 0.6700464963912964,
"epoch": 2.238568588469185,
"step": 5350
},
{
"loss": 1.2886553764343263,
"grad_norm": 0.1389724761247635,
"learning_rate": 1.5821415315670251e-07,
"entropy": 1.3397713720798492,
"num_tokens": 31102163.0,
"mean_token_accuracy": 0.6880467623472214,
"epoch": 2.2427540023019774,
"step": 5360
},
{
"loss": 1.220026397705078,
"grad_norm": 0.1286465972661972,
"learning_rate": 1.5656895788081104e-07,
"entropy": 1.256170129776001,
"num_tokens": 31159675.0,
"mean_token_accuracy": 0.6972913891077042,
"epoch": 2.2469394161347704,
"step": 5370
},
{
"loss": 1.2820704460144043,
"grad_norm": 0.1213146299123764,
"learning_rate": 1.5493077181669272e-07,
"entropy": 1.2981676012277603,
"num_tokens": 31219684.0,
"mean_token_accuracy": 0.688413429260254,
"epoch": 2.251124829967563,
"step": 5380
},
{
"loss": 1.2337745666503905,
"grad_norm": 0.1330552101135254,
"learning_rate": 1.532996283986957e-07,
"entropy": 1.2481247037649155,
"num_tokens": 31284113.0,
"mean_token_accuracy": 0.694562304019928,
"epoch": 2.2553102438003556,
"step": 5390
},
{
"loss": 1.3011648178100585,
"grad_norm": 0.14552603662014008,
"learning_rate": 1.5167556091743238e-07,
"entropy": 1.3327119797468185,
"num_tokens": 31344186.0,
"mean_token_accuracy": 0.6838112965226173,
"epoch": 2.2594956576331486,
"step": 5400
},
{
"eval_loss": 1.297374963760376,
"eval_runtime": 43.7773,
"eval_samples_per_second": 145.532,
"eval_steps_per_second": 6.076,
"eval_entropy": 1.315984815135038,
"eval_num_tokens": 31344186.0,
"eval_mean_token_accuracy": 0.6838296656321762,
"epoch": 2.2594956576331486,
"step": 5400
},
{
"loss": 1.256122875213623,
"grad_norm": 0.11425146460533142,
"learning_rate": 1.5005860251909918e-07,
"entropy": 1.2993682414293288,
"num_tokens": 31399330.0,
"mean_token_accuracy": 0.6879714965820313,
"epoch": 2.263681071465941,
"step": 5410
},
{
"loss": 1.243597412109375,
"grad_norm": 0.14105035364627838,
"learning_rate": 1.4844878620480124e-07,
"entropy": 1.2901643484830856,
"num_tokens": 31458043.0,
"mean_token_accuracy": 0.6898476853966713,
"epoch": 2.267866485298734,
"step": 5420
},
{
"loss": 1.3062746047973632,
"grad_norm": 0.1269349455833435,
"learning_rate": 1.4684614482987805e-07,
"entropy": 1.3157608151435851,
"num_tokens": 31515675.0,
"mean_token_accuracy": 0.6781650841236114,
"epoch": 2.272051899131527,
"step": 5430
},
{
"loss": 1.3145343780517578,
"grad_norm": 0.13170845806598663,
"learning_rate": 1.452507111032329e-07,
"entropy": 1.3244775086641312,
"num_tokens": 31573727.0,
"mean_token_accuracy": 0.6795195579528809,
"epoch": 2.2762373129643194,
"step": 5440
},
{
"loss": 1.2743712425231934,
"grad_norm": 0.13130150735378265,
"learning_rate": 1.4366251758666558e-07,
"entropy": 1.3025973543524743,
"num_tokens": 31632527.0,
"mean_token_accuracy": 0.6849103718996048,
"epoch": 2.280422726797112,
"step": 5450
},
{
"loss": 1.2729061126708985,
"grad_norm": 0.14398252964019775,
"learning_rate": 1.4208159669420817e-07,
"entropy": 1.2966506034135818,
"num_tokens": 31688226.0,
"mean_token_accuracy": 0.6885296568274498,
"epoch": 2.2846081406299046,
"step": 5460
},
{
"loss": 1.308814811706543,
"grad_norm": 0.1466449350118637,
"learning_rate": 1.405079806914623e-07,
"entropy": 1.306171926856041,
"num_tokens": 31743518.0,
"mean_token_accuracy": 0.6793627932667732,
"epoch": 2.2887935544626976,
"step": 5470
},
{
"loss": 1.295256996154785,
"grad_norm": 0.12834027409553528,
"learning_rate": 1.389417016949419e-07,
"entropy": 1.316891822218895,
"num_tokens": 31800911.0,
"mean_token_accuracy": 0.6853345051407814,
"epoch": 2.29297896829549,
"step": 5480
},
{
"loss": 1.3327623367309571,
"grad_norm": 0.1662720888853073,
"learning_rate": 1.3738279167141725e-07,
"entropy": 1.3393938541412354,
"num_tokens": 31860118.0,
"mean_token_accuracy": 0.6735303267836571,
"epoch": 2.297164382128283,
"step": 5490
},
{
"loss": 1.279651165008545,
"grad_norm": 0.12805919349193573,
"learning_rate": 1.3583128243726227e-07,
"entropy": 1.2862314611673356,
"num_tokens": 31917654.0,
"mean_token_accuracy": 0.6885863587260246,
"epoch": 2.301349795961076,
"step": 5500
},
{
"loss": 1.2778766632080079,
"grad_norm": 0.16033422946929932,
"learning_rate": 1.3428720565780578e-07,
"entropy": 1.300406639277935,
"num_tokens": 31974868.0,
"mean_token_accuracy": 0.6882018774747849,
"epoch": 2.3055352097938684,
"step": 5510
},
{
"loss": 1.342056941986084,
"grad_norm": 0.16284961998462677,
"learning_rate": 1.327505928466842e-07,
"entropy": 1.3293492585420608,
"num_tokens": 32033943.0,
"mean_token_accuracy": 0.6768848091363907,
"epoch": 2.309720623626661,
"step": 5520
},
{
"loss": 1.206116008758545,
"grad_norm": 0.1340523660182953,
"learning_rate": 1.3122147536519985e-07,
"entropy": 1.258744315803051,
"num_tokens": 32095146.0,
"mean_token_accuracy": 0.6991240099072457,
"epoch": 2.3139060374594536,
"step": 5530
},
{
"loss": 1.255533218383789,
"grad_norm": 0.12596993148326874,
"learning_rate": 1.2969988442167934e-07,
"entropy": 1.2745139241218566,
"num_tokens": 32158070.0,
"mean_token_accuracy": 0.6862679213285446,
"epoch": 2.3180914512922466,
"step": 5540
},
{
"loss": 1.270913314819336,
"grad_norm": 0.14521045982837677,
"learning_rate": 1.2818585107083797e-07,
"entropy": 1.2917841017246245,
"num_tokens": 32213049.0,
"mean_token_accuracy": 0.688049279153347,
"epoch": 2.322276865125039,
"step": 5550
},
{
"loss": 1.2614711761474608,
"grad_norm": 0.12460001558065414,
"learning_rate": 1.2667940621314516e-07,
"entropy": 1.288702441751957,
"num_tokens": 32270375.0,
"mean_token_accuracy": 0.691080367565155,
"epoch": 2.326462278957832,
"step": 5560
},
{
"loss": 1.297041893005371,
"grad_norm": 0.12860845029354095,
"learning_rate": 1.2518058059419356e-07,
"entropy": 1.2874844074249268,
"num_tokens": 32327913.0,
"mean_token_accuracy": 0.6812907472252846,
"epoch": 2.330647692790625,
"step": 5570
},
{
"loss": 1.2307467460632324,
"grad_norm": 0.13343603909015656,
"learning_rate": 1.2368940480407242e-07,
"entropy": 1.2836890518665314,
"num_tokens": 32385583.0,
"mean_token_accuracy": 0.6963008731603623,
"epoch": 2.3348331066234174,
"step": 5580
},
{
"loss": 1.3668609619140626,
"grad_norm": 0.13145415484905243,
"learning_rate": 1.2220590927674286e-07,
"entropy": 1.3669442266225815,
"num_tokens": 32441025.0,
"mean_token_accuracy": 0.671772038936615,
"epoch": 2.33901852045621,
"step": 5590
},
{
"loss": 1.3068817138671875,
"grad_norm": 0.13144521415233612,
"learning_rate": 1.2073012428941588e-07,
"entropy": 1.3122945204377174,
"num_tokens": 32499899.0,
"mean_token_accuracy": 0.674852766096592,
"epoch": 2.343203934289003,
"step": 5600
},
{
"eval_loss": 1.2969086170196533,
"eval_runtime": 43.7803,
"eval_samples_per_second": 145.522,
"eval_steps_per_second": 6.076,
"eval_entropy": 1.312249709789018,
"eval_num_tokens": 32499899.0,
"eval_mean_token_accuracy": 0.6838675230965578,
"epoch": 2.343203934289003,
"step": 5600
},
{
"loss": 1.33385009765625,
"grad_norm": 0.125252828001976,
"learning_rate": 1.1926207996193638e-07,
"entropy": 1.3582130268216133,
"num_tokens": 32556560.0,
"mean_token_accuracy": 0.678239768743515,
"epoch": 2.3473893481217956,
"step": 5610
},
{
"loss": 1.247665023803711,
"grad_norm": 0.13559651374816895,
"learning_rate": 1.178018062561662e-07,
"entropy": 1.2727186426520347,
"num_tokens": 32617607.0,
"mean_token_accuracy": 0.6923329353332519,
"epoch": 2.351574761954588,
"step": 5620
},
{
"loss": 1.2945147514343263,
"grad_norm": 0.12413690984249115,
"learning_rate": 1.1634933297537425e-07,
"entropy": 1.3126976788043976,
"num_tokens": 32676183.0,
"mean_token_accuracy": 0.6811081647872925,
"epoch": 2.355760175787381,
"step": 5630
},
{
"loss": 1.2760995864868163,
"grad_norm": 0.15444409847259521,
"learning_rate": 1.1490468976362766e-07,
"entropy": 1.3008133977651597,
"num_tokens": 32732392.0,
"mean_token_accuracy": 0.6872219279408455,
"epoch": 2.3599455896201738,
"step": 5640
},
{
"loss": 1.302404022216797,
"grad_norm": 0.1389995664358139,
"learning_rate": 1.1346790610518636e-07,
"entropy": 1.3151475220918656,
"num_tokens": 32788966.0,
"mean_token_accuracy": 0.6797765508294106,
"epoch": 2.3641310034529663,
"step": 5650
},
{
"loss": 1.288191795349121,
"grad_norm": 0.14642177522182465,
"learning_rate": 1.1203901132390225e-07,
"entropy": 1.3152502685785294,
"num_tokens": 32849483.0,
"mean_token_accuracy": 0.6831487894058228,
"epoch": 2.368316417285759,
"step": 5660
},
{
"loss": 1.2493846893310547,
"grad_norm": 0.13448752462863922,
"learning_rate": 1.1061803458261976e-07,
"entropy": 1.2866099685430528,
"num_tokens": 32907775.0,
"mean_token_accuracy": 0.6911607295274734,
"epoch": 2.372501831118552,
"step": 5670
},
{
"loss": 1.2729656219482421,
"grad_norm": 0.1279105842113495,
"learning_rate": 1.0920500488258134e-07,
"entropy": 1.294448482990265,
"num_tokens": 32966950.0,
"mean_token_accuracy": 0.6881255716085434,
"epoch": 2.3766872449513445,
"step": 5680
},
{
"loss": 1.2728429794311524,
"grad_norm": 0.1403297632932663,
"learning_rate": 1.0779995106283552e-07,
"entropy": 1.2703639656305312,
"num_tokens": 33022913.0,
"mean_token_accuracy": 0.6847912818193436,
"epoch": 2.380872658784137,
"step": 5690
},
{
"loss": 1.299112606048584,
"grad_norm": 0.11831526458263397,
"learning_rate": 1.0640290179964756e-07,
"entropy": 1.324224580824375,
"num_tokens": 33079983.0,
"mean_token_accuracy": 0.6824282988905906,
"epoch": 2.38505807261693,
"step": 5700
},
{
"loss": 1.327120018005371,
"grad_norm": 0.13661810755729675,
"learning_rate": 1.0501388560591523e-07,
"entropy": 1.3056075662374496,
"num_tokens": 33136523.0,
"mean_token_accuracy": 0.677336810529232,
"epoch": 2.3892434864497227,
"step": 5710
},
{
"loss": 1.3516573905944824,
"grad_norm": 0.12402050942182541,
"learning_rate": 1.0363293083058622e-07,
"entropy": 1.3417491644620896,
"num_tokens": 33194784.0,
"mean_token_accuracy": 0.675346839427948,
"epoch": 2.3934289002825153,
"step": 5720
},
{
"loss": 1.3104659080505372,
"grad_norm": 0.13492602109909058,
"learning_rate": 1.0226006565807982e-07,
"entropy": 1.3131451904773712,
"num_tokens": 33251897.0,
"mean_token_accuracy": 0.6822021931409836,
"epoch": 2.3976143141153083,
"step": 5730
},
{
"loss": 1.3267166137695312,
"grad_norm": 0.13064873218536377,
"learning_rate": 1.0089531810771163e-07,
"entropy": 1.3214107781648636,
"num_tokens": 33307060.0,
"mean_token_accuracy": 0.6773762717843056,
"epoch": 2.401799727948101,
"step": 5740
},
{
"loss": 1.316312599182129,
"grad_norm": 0.15154863893985748,
"learning_rate": 9.953871603312141e-08,
"entropy": 1.3157601684331894,
"num_tokens": 33362416.0,
"mean_token_accuracy": 0.6785706043243408,
"epoch": 2.4059851417808935,
"step": 5750
},
{
"loss": 1.2769282341003418,
"grad_norm": 0.12917333841323853,
"learning_rate": 9.819028712170512e-08,
"entropy": 1.281336858868599,
"num_tokens": 33422722.0,
"mean_token_accuracy": 0.6885020643472671,
"epoch": 2.4101705556136865,
"step": 5760
},
{
"loss": 1.3640668869018555,
"grad_norm": 0.1428443044424057,
"learning_rate": 9.68500588940498e-08,
"entropy": 1.3538337886333465,
"num_tokens": 33483556.0,
"mean_token_accuracy": 0.6655726253986358,
"epoch": 2.414355969446479,
"step": 5770
},
{
"loss": 1.251881980895996,
"grad_norm": 0.14195656776428223,
"learning_rate": 9.551805870337104e-08,
"entropy": 1.2702584967017174,
"num_tokens": 33543254.0,
"mean_token_accuracy": 0.6900080740451813,
"epoch": 2.4185413832792717,
"step": 5780
},
{
"loss": 1.3439226150512695,
"grad_norm": 0.16237884759902954,
"learning_rate": 9.419431373495612e-08,
"entropy": 1.3545999929308892,
"num_tokens": 33601741.0,
"mean_token_accuracy": 0.6744641482830047,
"epoch": 2.4227267971120643,
"step": 5790
},
{
"loss": 1.314307975769043,
"grad_norm": 0.15124961733818054,
"learning_rate": 9.287885100560771e-08,
"entropy": 1.320368728041649,
"num_tokens": 33657327.0,
"mean_token_accuracy": 0.6819353699684143,
"epoch": 2.4269122109448573,
"step": 5800
},
{
"eval_loss": 1.2966619729995728,
"eval_runtime": 42.94,
"eval_samples_per_second": 148.37,
"eval_steps_per_second": 6.195,
"eval_entropy": 1.3098934839542646,
"eval_num_tokens": 33657327.0,
"eval_mean_token_accuracy": 0.6839417216921211,
"epoch": 2.4269122109448573,
"step": 5800
},
{
"loss": 1.2694414138793946,
"grad_norm": 0.1228335052728653,
"learning_rate": 9.157169736309384e-08,
"entropy": 1.29910968542099,
"num_tokens": 33713833.0,
"mean_token_accuracy": 0.6918980091810226,
"epoch": 2.43109762477765,
"step": 5810
},
{
"loss": 1.3777572631835937,
"grad_norm": 0.12483090162277222,
"learning_rate": 9.02728794855988e-08,
"entropy": 1.3498617202043532,
"num_tokens": 33771185.0,
"mean_token_accuracy": 0.6699911892414093,
"epoch": 2.4352830386104425,
"step": 5820
},
{
"loss": 1.3352084159851074,
"grad_norm": 0.14710542559623718,
"learning_rate": 8.898242388117949e-08,
"entropy": 1.3336048945784569,
"num_tokens": 33828941.0,
"mean_token_accuracy": 0.6756347686052322,
"epoch": 2.4394684524432355,
"step": 5830
},
{
"loss": 1.310356903076172,
"grad_norm": 0.16344612836837769,
"learning_rate": 8.770035688722399e-08,
"entropy": 1.3448477059602737,
"num_tokens": 33890202.0,
"mean_token_accuracy": 0.6772162079811096,
"epoch": 2.443653866276028,
"step": 5840
},
{
"loss": 1.2573143005371095,
"grad_norm": 0.13473457098007202,
"learning_rate": 8.642670466991381e-08,
"entropy": 1.27697846442461,
"num_tokens": 33945323.0,
"mean_token_accuracy": 0.6902707099914551,
"epoch": 2.4478392801088207,
"step": 5850
},
{
"loss": 1.2643320083618164,
"grad_norm": 0.12609098851680756,
"learning_rate": 8.516149322369054e-08,
"entropy": 1.3083055540919304,
"num_tokens": 34005115.0,
"mean_token_accuracy": 0.6905182540416718,
"epoch": 2.4520246939416133,
"step": 5860
},
{
"loss": 1.3385416984558105,
"grad_norm": 0.1266658753156662,
"learning_rate": 8.390474837072492e-08,
"entropy": 1.330283808708191,
"num_tokens": 34061458.0,
"mean_token_accuracy": 0.6766823455691338,
"epoch": 2.4562101077744063,
"step": 5870
},
{
"loss": 1.247739601135254,
"grad_norm": 0.1117442175745964,
"learning_rate": 8.265649576038946e-08,
"entropy": 1.2861711964011193,
"num_tokens": 34117371.0,
"mean_token_accuracy": 0.6933671846985817,
"epoch": 2.460395521607199,
"step": 5880
},
{
"loss": 1.3221072196960448,
"grad_norm": 0.1515118032693863,
"learning_rate": 8.141676086873573e-08,
"entropy": 1.3331793665885925,
"num_tokens": 34172239.0,
"mean_token_accuracy": 0.6770834714174271,
"epoch": 2.4645809354399915,
"step": 5890
},
{
"loss": 1.269434928894043,
"grad_norm": 0.13494186103343964,
"learning_rate": 8.018556899797396e-08,
"entropy": 1.2998870089650154,
"num_tokens": 34234355.0,
"mean_token_accuracy": 0.687006613612175,
"epoch": 2.4687663492727845,
"step": 5900
},
{
"loss": 1.326502799987793,
"grad_norm": 0.14256730675697327,
"learning_rate": 7.896294527595638e-08,
"entropy": 1.350116790831089,
"num_tokens": 34295462.0,
"mean_token_accuracy": 0.6760937020182609,
"epoch": 2.472951763105577,
"step": 5910
},
{
"loss": 1.3374545097351074,
"grad_norm": 0.15305058658123016,
"learning_rate": 7.774891465566518e-08,
"entropy": 1.3536745309829712,
"num_tokens": 34353670.0,
"mean_token_accuracy": 0.6751428216695785,
"epoch": 2.4771371769383697,
"step": 5920
},
{
"loss": 1.3062148094177246,
"grad_norm": 0.1269030123949051,
"learning_rate": 7.654350191470216e-08,
"entropy": 1.3079909563064576,
"num_tokens": 34409937.0,
"mean_token_accuracy": 0.6825913473963737,
"epoch": 2.4813225907711627,
"step": 5930
},
{
"loss": 1.3188300132751465,
"grad_norm": 0.11656031757593155,
"learning_rate": 7.534673165478417e-08,
"entropy": 1.3348352879285812,
"num_tokens": 34470681.0,
"mean_token_accuracy": 0.673864497244358,
"epoch": 2.4855080046039553,
"step": 5940
},
{
"loss": 1.263766098022461,
"grad_norm": 0.12488370388746262,
"learning_rate": 7.415862830124032e-08,
"entropy": 1.3003046184778213,
"num_tokens": 34530193.0,
"mean_token_accuracy": 0.6913181528449058,
"epoch": 2.489693418436748,
"step": 5950
},
{
"loss": 1.297060203552246,
"grad_norm": 0.14231456816196442,
"learning_rate": 7.297921610251323e-08,
"entropy": 1.3110292360186577,
"num_tokens": 34585018.0,
"mean_token_accuracy": 0.6845840275287628,
"epoch": 2.493878832269541,
"step": 5960
},
{
"loss": 1.2582796096801758,
"grad_norm": 0.13470889627933502,
"learning_rate": 7.180851912966501e-08,
"entropy": 1.276314914226532,
"num_tokens": 34640793.0,
"mean_token_accuracy": 0.6882349893450737,
"epoch": 2.4980642461023335,
"step": 5970
},
{
"loss": 1.2998489379882812,
"grad_norm": 0.15721286833286285,
"learning_rate": 7.064656127588508e-08,
"entropy": 1.3124357014894485,
"num_tokens": 34694819.0,
"mean_token_accuracy": 0.6838565751910209,
"epoch": 2.502249659935126,
"step": 5980
},
{
"loss": 1.306645965576172,
"grad_norm": 0.11785798519849777,
"learning_rate": 6.949336625600316e-08,
"entropy": 1.3165518283843993,
"num_tokens": 34751259.0,
"mean_token_accuracy": 0.6841330319643021,
"epoch": 2.5064350737679186,
"step": 5990
},
{
"loss": 1.3169086456298829,
"grad_norm": 0.11234049499034882,
"learning_rate": 6.834895760600517e-08,
"entropy": 1.3190216064453124,
"num_tokens": 34808644.0,
"mean_token_accuracy": 0.6796000450849533,
"epoch": 2.5106204876007117,
"step": 6000
},
{
"eval_loss": 1.2964025735855103,
"eval_runtime": 43.5438,
"eval_samples_per_second": 146.312,
"eval_steps_per_second": 6.109,
"eval_entropy": 1.3063210257910247,
"eval_num_tokens": 34808644.0,
"eval_mean_token_accuracy": 0.6839508241728732,
"epoch": 2.5106204876007117,
"step": 6000
},
{
"loss": 1.2840510368347169,
"grad_norm": 0.15145522356033325,
"learning_rate": 6.721335868255229e-08,
"entropy": 1.2826346635818482,
"num_tokens": 34863928.0,
"mean_token_accuracy": 0.6869580999016762,
"epoch": 2.5148059014335042,
"step": 6010
},
{
"loss": 1.267976665496826,
"grad_norm": 0.13854120671749115,
"learning_rate": 6.60865926625051e-08,
"entropy": 1.2938668191432954,
"num_tokens": 34926879.0,
"mean_token_accuracy": 0.6888927921652794,
"epoch": 2.518991315266297,
"step": 6020
},
{
"loss": 1.332705307006836,
"grad_norm": 0.13647040724754333,
"learning_rate": 6.496868254245025e-08,
"entropy": 1.3259623274207115,
"num_tokens": 34984932.0,
"mean_token_accuracy": 0.6760903507471084,
"epoch": 2.52317672909909,
"step": 6030
},
{
"loss": 1.2476840019226074,
"grad_norm": 0.13999158143997192,
"learning_rate": 6.385965113823039e-08,
"entropy": 1.2729045450687408,
"num_tokens": 35042011.0,
"mean_token_accuracy": 0.6900967061519623,
"epoch": 2.5273621429318824,
"step": 6040
},
{
"loss": 1.2344088554382324,
"grad_norm": 0.13583189249038696,
"learning_rate": 6.275952108448018e-08,
"entropy": 1.276967915892601,
"num_tokens": 35100640.0,
"mean_token_accuracy": 0.6938279047608376,
"epoch": 2.531547556764675,
"step": 6050
},
{
"loss": 1.253906536102295,
"grad_norm": 0.14375676214694977,
"learning_rate": 6.166831483416229e-08,
"entropy": 1.2771710246801375,
"num_tokens": 35158864.0,
"mean_token_accuracy": 0.6879908561706543,
"epoch": 2.5357329705974676,
"step": 6060
},
{
"loss": 1.3232227325439454,
"grad_norm": 0.12950512766838074,
"learning_rate": 6.058605465811085e-08,
"entropy": 1.3327802419662476,
"num_tokens": 35216659.0,
"mean_token_accuracy": 0.6788379862904549,
"epoch": 2.5399183844302606,
"step": 6070
},
{
"loss": 1.2316680908203126,
"grad_norm": 0.14246362447738647,
"learning_rate": 5.9512762644576054e-08,
"entropy": 1.2707349091768265,
"num_tokens": 35276829.0,
"mean_token_accuracy": 0.6953957095742226,
"epoch": 2.544103798263053,
"step": 6080
},
{
"loss": 1.2903837203979491,
"grad_norm": 0.14620284736156464,
"learning_rate": 5.844846069877329e-08,
"entropy": 1.3116936787962914,
"num_tokens": 35335255.0,
"mean_token_accuracy": 0.6844032138586045,
"epoch": 2.5482892120958462,
"step": 6090
},
{
"loss": 1.2689558029174806,
"grad_norm": 0.1156439483165741,
"learning_rate": 5.7393170542436694e-08,
"entropy": 1.2803223952651024,
"num_tokens": 35398265.0,
"mean_token_accuracy": 0.6874969124794006,
"epoch": 2.552474625928639,
"step": 6100
},
{
"loss": 1.2471202850341796,
"grad_norm": 0.14455300569534302,
"learning_rate": 5.6346913713375076e-08,
"entropy": 1.2767839536070824,
"num_tokens": 35457467.0,
"mean_token_accuracy": 0.6907158330082893,
"epoch": 2.5566600397614314,
"step": 6110
},
{
"loss": 1.2724437713623047,
"grad_norm": 0.1570362001657486,
"learning_rate": 5.5309711565033055e-08,
"entropy": 1.2919223070144654,
"num_tokens": 35514814.0,
"mean_token_accuracy": 0.688132356107235,
"epoch": 2.560845453594224,
"step": 6120
},
{
"loss": 1.3232336044311523,
"grad_norm": 0.15807446837425232,
"learning_rate": 5.4281585266054755e-08,
"entropy": 1.3095539420843125,
"num_tokens": 35571529.0,
"mean_token_accuracy": 0.6772763684391976,
"epoch": 2.5650308674270166,
"step": 6130
},
{
"loss": 1.2672816276550294,
"grad_norm": 0.12889625132083893,
"learning_rate": 5.326255579985173e-08,
"entropy": 1.2937352240085602,
"num_tokens": 35630131.0,
"mean_token_accuracy": 0.6905935257673264,
"epoch": 2.5692162812598096,
"step": 6140
},
{
"loss": 1.2375890731811523,
"grad_norm": 0.13400977849960327,
"learning_rate": 5.225264396417522e-08,
"entropy": 1.2529444962739944,
"num_tokens": 35690381.0,
"mean_token_accuracy": 0.6925143092870713,
"epoch": 2.573401695092602,
"step": 6150
},
{
"loss": 1.261359405517578,
"grad_norm": 0.10552431643009186,
"learning_rate": 5.125187037069123e-08,
"entropy": 1.2899439319968224,
"num_tokens": 35755420.0,
"mean_token_accuracy": 0.6874815404415131,
"epoch": 2.577587108925395,
"step": 6160
},
{
"loss": 1.2803380966186524,
"grad_norm": 0.12414208054542542,
"learning_rate": 5.026025544455986e-08,
"entropy": 1.3017450347542763,
"num_tokens": 35812900.0,
"mean_token_accuracy": 0.6863995373249054,
"epoch": 2.581772522758188,
"step": 6170
},
{
"loss": 1.2126134872436523,
"grad_norm": 0.11522486060857773,
"learning_rate": 4.9277819424018815e-08,
"entropy": 1.255126628279686,
"num_tokens": 35872604.0,
"mean_token_accuracy": 0.6964934691786766,
"epoch": 2.5859579365909804,
"step": 6180
},
{
"loss": 1.253915023803711,
"grad_norm": 0.11392216384410858,
"learning_rate": 4.830458235996976e-08,
"entropy": 1.2905526503920555,
"num_tokens": 35930850.0,
"mean_token_accuracy": 0.6916301295161247,
"epoch": 2.590143350423773,
"step": 6190
},
{
"loss": 1.2767410278320312,
"grad_norm": 0.12976758182048798,
"learning_rate": 4.7340564115569804e-08,
"entropy": 1.283238247036934,
"num_tokens": 35988841.0,
"mean_token_accuracy": 0.687064278125763,
"epoch": 2.594328764256566,
"step": 6200
},
{
"eval_loss": 1.2962790727615356,
"eval_runtime": 43.7745,
"eval_samples_per_second": 145.542,
"eval_steps_per_second": 6.077,
"eval_entropy": 1.3085140944423532,
"eval_num_tokens": 35988841.0,
"eval_mean_token_accuracy": 0.6838811584433219,
"epoch": 2.594328764256566,
"step": 6200
},
{
"loss": 1.3264198303222656,
"grad_norm": 0.13761702179908752,
"learning_rate": 4.638578436582552e-08,
"entropy": 1.3237684190273284,
"num_tokens": 36044753.0,
"mean_token_accuracy": 0.67743628770113,
"epoch": 2.5985141780893586,
"step": 6210
},
{
"loss": 1.257272720336914,
"grad_norm": 0.1342718005180359,
"learning_rate": 4.544026259719158e-08,
"entropy": 1.288350522518158,
"num_tokens": 36103553.0,
"mean_token_accuracy": 0.6897284865379334,
"epoch": 2.602699591922151,
"step": 6220
},
{
"loss": 1.2772136688232423,
"grad_norm": 0.12511885166168213,
"learning_rate": 4.4504018107173304e-08,
"entropy": 1.3188367202877997,
"num_tokens": 36162370.0,
"mean_token_accuracy": 0.6876938834786415,
"epoch": 2.606885005754944,
"step": 6230
},
{
"loss": 1.3926493644714355,
"grad_norm": 0.15498943626880646,
"learning_rate": 4.3577070003932234e-08,
"entropy": 1.3926087036728858,
"num_tokens": 36218877.0,
"mean_token_accuracy": 0.6645623058080673,
"epoch": 2.611070419587737,
"step": 6240
},
{
"loss": 1.2876951217651367,
"grad_norm": 0.13086406886577606,
"learning_rate": 4.265943720589688e-08,
"entropy": 1.3051115587353705,
"num_tokens": 36274783.0,
"mean_token_accuracy": 0.6853921875357628,
"epoch": 2.6152558334205294,
"step": 6250
},
{
"loss": 1.3133883476257324,
"grad_norm": 0.15468242764472961,
"learning_rate": 4.175113844137596e-08,
"entropy": 1.3004416555166245,
"num_tokens": 36330126.0,
"mean_token_accuracy": 0.6820774778723717,
"epoch": 2.619441247253322,
"step": 6260
},
{
"loss": 1.3524452209472657,
"grad_norm": 0.14072105288505554,
"learning_rate": 4.08521922481766e-08,
"entropy": 1.3288619458675384,
"num_tokens": 36386837.0,
"mean_token_accuracy": 0.6738715380430221,
"epoch": 2.623626661086115,
"step": 6270
},
{
"loss": 1.2715925216674804,
"grad_norm": 0.13218800723552704,
"learning_rate": 3.9962616973225784e-08,
"entropy": 1.3046256229281425,
"num_tokens": 36446096.0,
"mean_token_accuracy": 0.6843361258506775,
"epoch": 2.6278120749189076,
"step": 6280
},
{
"loss": 1.3278305053710937,
"grad_norm": 0.12469816952943802,
"learning_rate": 3.90824307721957e-08,
"entropy": 1.3449779450893402,
"num_tokens": 36502354.0,
"mean_token_accuracy": 0.6795141100883484,
"epoch": 2.6319974887517006,
"step": 6290
},
{
"loss": 1.2798616409301757,
"grad_norm": 0.12675845623016357,
"learning_rate": 3.821165160913381e-08,
"entropy": 1.290797685086727,
"num_tokens": 36560258.0,
"mean_token_accuracy": 0.6858594298362732,
"epoch": 2.636182902584493,
"step": 6300
},
{
"loss": 1.2590128898620605,
"grad_norm": 0.14704617857933044,
"learning_rate": 3.735029725609567e-08,
"entropy": 1.2577021181583405,
"num_tokens": 36616149.0,
"mean_token_accuracy": 0.6896016135811806,
"epoch": 2.6403683164172858,
"step": 6310
},
{
"loss": 1.2890483856201171,
"grad_norm": 0.14774031937122345,
"learning_rate": 3.649838529278232e-08,
"entropy": 1.3169309496879578,
"num_tokens": 36675723.0,
"mean_token_accuracy": 0.683777266740799,
"epoch": 2.6445537302500783,
"step": 6320
},
{
"loss": 1.3321017265319823,
"grad_norm": 0.16364073753356934,
"learning_rate": 3.565593310618165e-08,
"entropy": 1.3246647357940673,
"num_tokens": 36731369.0,
"mean_token_accuracy": 0.6758654475212097,
"epoch": 2.648739144082871,
"step": 6330
},
{
"loss": 1.3064552307128907,
"grad_norm": 0.1471003293991089,
"learning_rate": 3.48229578902135e-08,
"entropy": 1.3057933449745178,
"num_tokens": 36790065.0,
"mean_token_accuracy": 0.6810683965682983,
"epoch": 2.652924557915664,
"step": 6340
},
{
"loss": 1.279481792449951,
"grad_norm": 0.14192688465118408,
"learning_rate": 3.39994766453785e-08,
"entropy": 1.303479927778244,
"num_tokens": 36851965.0,
"mean_token_accuracy": 0.6806197896599769,
"epoch": 2.6571099717484565,
"step": 6350
},
{
"loss": 1.2731066703796388,
"grad_norm": 0.1496737152338028,
"learning_rate": 3.3185506178411593e-08,
"entropy": 1.3053823009133338,
"num_tokens": 36911260.0,
"mean_token_accuracy": 0.685496874153614,
"epoch": 2.6612953855812496,
"step": 6360
},
{
"loss": 1.3233325958251954,
"grad_norm": 0.1397167593240738,
"learning_rate": 3.238106310193822e-08,
"entropy": 1.3544006377458573,
"num_tokens": 36968655.0,
"mean_token_accuracy": 0.6773801222443581,
"epoch": 2.665480799414042,
"step": 6370
},
{
"loss": 1.273273754119873,
"grad_norm": 0.1353992074728012,
"learning_rate": 3.158616383413648e-08,
"entropy": 1.2890938267111778,
"num_tokens": 37023385.0,
"mean_token_accuracy": 0.6872632309794426,
"epoch": 2.6696662132468347,
"step": 6380
},
{
"loss": 1.269486427307129,
"grad_norm": 0.14086788892745972,
"learning_rate": 3.080082459840072e-08,
"entropy": 1.29090928286314,
"num_tokens": 37081937.0,
"mean_token_accuracy": 0.6892599433660507,
"epoch": 2.6738516270796273,
"step": 6390
},
{
"loss": 1.2413150787353515,
"grad_norm": 0.147287517786026,
"learning_rate": 3.0025061423011366e-08,
"entropy": 1.2670000731945037,
"num_tokens": 37139738.0,
"mean_token_accuracy": 0.6992234885692596,
"epoch": 2.6780370409124203,
"step": 6400
},
{
"eval_loss": 1.2961639165878296,
"eval_runtime": 43.8119,
"eval_samples_per_second": 145.417,
"eval_steps_per_second": 6.071,
"eval_entropy": 1.3104161902477867,
"eval_num_tokens": 37139738.0,
"eval_mean_token_accuracy": 0.6839385229842108,
"epoch": 2.6780370409124203,
"step": 6400
},
{
"loss": 1.2215076446533204,
"grad_norm": 0.12445386499166489,
"learning_rate": 2.92588901408074e-08,
"entropy": 1.2454605296254158,
"num_tokens": 37196528.0,
"mean_token_accuracy": 0.6982016503810883,
"epoch": 2.682222454745213,
"step": 6410
},
{
"loss": 1.3178998947143554,
"grad_norm": 0.14504940807819366,
"learning_rate": 2.8502326388863073e-08,
"entropy": 1.3217849105596542,
"num_tokens": 37251881.0,
"mean_token_accuracy": 0.6820187479257583,
"epoch": 2.6864078685780055,
"step": 6420
},
{
"loss": 1.2787066459655763,
"grad_norm": 0.1357499063014984,
"learning_rate": 2.7755385608169368e-08,
"entropy": 1.293387584388256,
"num_tokens": 37308322.0,
"mean_token_accuracy": 0.6850109323859215,
"epoch": 2.6905932824107985,
"step": 6430
},
{
"loss": 1.3130731582641602,
"grad_norm": 0.1391121745109558,
"learning_rate": 2.701808304331826e-08,
"entropy": 1.3160455033183098,
"num_tokens": 37367065.0,
"mean_token_accuracy": 0.6788717776536941,
"epoch": 2.694778696243591,
"step": 6440
},
{
"loss": 1.324008083343506,
"grad_norm": 0.17093950510025024,
"learning_rate": 2.6290433742191697e-08,
"entropy": 1.3303591817617417,
"num_tokens": 37423674.0,
"mean_token_accuracy": 0.6737320378422738,
"epoch": 2.6989641100763837,
"step": 6450
},
{
"loss": 1.3142354965209961,
"grad_norm": 0.11529888957738876,
"learning_rate": 2.5572452555654766e-08,
"entropy": 1.3235249876976014,
"num_tokens": 37478758.0,
"mean_token_accuracy": 0.6834891051054001,
"epoch": 2.7031495239091763,
"step": 6460
},
{
"loss": 1.3274757385253906,
"grad_norm": 0.14736518263816833,
"learning_rate": 2.4864154137252348e-08,
"entropy": 1.323398619890213,
"num_tokens": 37535301.0,
"mean_token_accuracy": 0.676533716917038,
"epoch": 2.7073349377419693,
"step": 6470
},
{
"loss": 1.3244875907897948,
"grad_norm": 0.1322164386510849,
"learning_rate": 2.4165552942910005e-08,
"entropy": 1.3245794102549553,
"num_tokens": 37593859.0,
"mean_token_accuracy": 0.6777049407362938,
"epoch": 2.711520351574762,
"step": 6480
},
{
"loss": 1.3350665092468261,
"grad_norm": 0.15845166146755219,
"learning_rate": 2.3476663230639294e-08,
"entropy": 1.311027655005455,
"num_tokens": 37650900.0,
"mean_token_accuracy": 0.6777532756328583,
"epoch": 2.715705765407555,
"step": 6490
},
{
"loss": 1.3243823051452637,
"grad_norm": 0.1440412998199463,
"learning_rate": 2.279749906024625e-08,
"entropy": 1.316267091035843,
"num_tokens": 37709121.0,
"mean_token_accuracy": 0.6794763222336769,
"epoch": 2.7198911792403475,
"step": 6500
},
{
"loss": 1.2479528427124023,
"grad_norm": 0.1294708549976349,
"learning_rate": 2.2128074293044973e-08,
"entropy": 1.2721221387386321,
"num_tokens": 37769916.0,
"mean_token_accuracy": 0.6923005178570747,
"epoch": 2.72407659307314,
"step": 6510
},
{
"loss": 1.2882716178894043,
"grad_norm": 0.1557369977235794,
"learning_rate": 2.1468402591574176e-08,
"entropy": 1.2828272953629494,
"num_tokens": 37827782.0,
"mean_token_accuracy": 0.6824876755475998,
"epoch": 2.7282620069059327,
"step": 6520
},
{
"loss": 1.3350841522216796,
"grad_norm": 0.14170175790786743,
"learning_rate": 2.0818497419318847e-08,
"entropy": 1.3358486652374268,
"num_tokens": 37889014.0,
"mean_token_accuracy": 0.6755576729774475,
"epoch": 2.7324474207387257,
"step": 6530
},
{
"loss": 1.2199977874755858,
"grad_norm": 0.1191650778055191,
"learning_rate": 2.017837204043521e-08,
"entropy": 1.2538551360368728,
"num_tokens": 37949279.0,
"mean_token_accuracy": 0.6934547841548919,
"epoch": 2.7366328345715183,
"step": 6540
},
{
"loss": 1.3062921524047852,
"grad_norm": 0.12893186509609222,
"learning_rate": 1.954803951947992e-08,
"entropy": 1.2967224359512328,
"num_tokens": 38003642.0,
"mean_token_accuracy": 0.679786990582943,
"epoch": 2.740818248404311,
"step": 6550
},
{
"loss": 1.3133017539978027,
"grad_norm": 0.15340933203697205,
"learning_rate": 1.8927512721143733e-08,
"entropy": 1.3341112226247787,
"num_tokens": 38061010.0,
"mean_token_accuracy": 0.6817509040236474,
"epoch": 2.745003662237104,
"step": 6560
},
{
"loss": 1.2640517234802247,
"grad_norm": 0.14441581070423126,
"learning_rate": 1.831680430998872e-08,
"entropy": 1.2839445233345033,
"num_tokens": 38118091.0,
"mean_token_accuracy": 0.6878686159849167,
"epoch": 2.7491890760698965,
"step": 6570
},
{
"loss": 1.3473605155944823,
"grad_norm": 0.1267869770526886,
"learning_rate": 1.7715926750189736e-08,
"entropy": 1.3488942801952362,
"num_tokens": 38173499.0,
"mean_token_accuracy": 0.673286820948124,
"epoch": 2.753374489902689,
"step": 6580
},
{
"loss": 1.2386550903320312,
"grad_norm": 0.1410847008228302,
"learning_rate": 1.7124892305280248e-08,
"entropy": 1.2806343123316766,
"num_tokens": 38232916.0,
"mean_token_accuracy": 0.6908275470137596,
"epoch": 2.7575599037354817,
"step": 6590
},
{
"loss": 1.2414596557617188,
"grad_norm": 0.15371793508529663,
"learning_rate": 1.6543713037901863e-08,
"entropy": 1.2626485541462897,
"num_tokens": 38293881.0,
"mean_token_accuracy": 0.6889653459191323,
"epoch": 2.7617453175682747,
"step": 6600
},
{
"eval_loss": 1.2960588932037354,
"eval_runtime": 43.7085,
"eval_samples_per_second": 145.761,
"eval_steps_per_second": 6.086,
"eval_entropy": 1.3095905193708892,
"eval_num_tokens": 38293881.0,
"eval_mean_token_accuracy": 0.6839702539426044,
"epoch": 2.7617453175682747,
"step": 6600
},
{
"loss": 1.2328216552734375,
"grad_norm": 0.14509297907352448,
"learning_rate": 1.5972400809558305e-08,
"entropy": 1.2864874497056007,
"num_tokens": 38354279.0,
"mean_token_accuracy": 0.6910390242934227,
"epoch": 2.7659307314010673,
"step": 6610
},
{
"loss": 1.3384916305541992,
"grad_norm": 0.13343819975852966,
"learning_rate": 1.541096728037322e-08,
"entropy": 1.325030580163002,
"num_tokens": 38412664.0,
"mean_token_accuracy": 0.6726853728294373,
"epoch": 2.77011614523386,
"step": 6620
},
{
"loss": 1.2254823684692382,
"grad_norm": 0.1242731511592865,
"learning_rate": 1.4859423908851976e-08,
"entropy": 1.2863211989402772,
"num_tokens": 38478157.0,
"mean_token_accuracy": 0.6928248971700668,
"epoch": 2.774301559066653,
"step": 6630
},
{
"loss": 1.3083304405212401,
"grad_norm": 0.13707546889781952,
"learning_rate": 1.43177819516484e-08,
"entropy": 1.3157197803258895,
"num_tokens": 38536491.0,
"mean_token_accuracy": 0.6797169283032417,
"epoch": 2.7784869728994455,
"step": 6640
},
{
"loss": 1.3308774948120117,
"grad_norm": 0.18031752109527588,
"learning_rate": 1.3786052463334363e-08,
"entropy": 1.3267883569002152,
"num_tokens": 38592693.0,
"mean_token_accuracy": 0.6799778997898102,
"epoch": 2.782672386732238,
"step": 6650
},
{
"loss": 1.254627799987793,
"grad_norm": 0.13183258473873138,
"learning_rate": 1.3264246296174675e-08,
"entropy": 1.2955256581306458,
"num_tokens": 38649528.0,
"mean_token_accuracy": 0.6876810878515244,
"epoch": 2.7868578005650306,
"step": 6660
},
{
"loss": 1.3457258224487305,
"grad_norm": 0.13854053616523743,
"learning_rate": 1.2752374099905371e-08,
"entropy": 1.33312628865242,
"num_tokens": 38706620.0,
"mean_token_accuracy": 0.6743656143546104,
"epoch": 2.7910432143978237,
"step": 6670
},
{
"loss": 1.3620613098144532,
"grad_norm": 0.15011081099510193,
"learning_rate": 1.2250446321516173e-08,
"entropy": 1.3549015790224075,
"num_tokens": 38764666.0,
"mean_token_accuracy": 0.6692588478326797,
"epoch": 2.7952286282306162,
"step": 6680
},
{
"loss": 1.259181022644043,
"grad_norm": 0.15715329349040985,
"learning_rate": 1.1758473205037812e-08,
"entropy": 1.279186724126339,
"num_tokens": 38823325.0,
"mean_token_accuracy": 0.6928275167942047,
"epoch": 2.7994140420634093,
"step": 6690
},
{
"loss": 1.2290419578552245,
"grad_norm": 0.1388121098279953,
"learning_rate": 1.127646479133243e-08,
"entropy": 1.2684768080711364,
"num_tokens": 38880457.0,
"mean_token_accuracy": 0.694522102177143,
"epoch": 2.803599455896202,
"step": 6700
},
{
"loss": 1.3010470390319824,
"grad_norm": 0.15357019007205963,
"learning_rate": 1.0804430917888795e-08,
"entropy": 1.3155488684773444,
"num_tokens": 38936834.0,
"mean_token_accuracy": 0.6843758165836334,
"epoch": 2.8077848697289944,
"step": 6710
},
{
"loss": 1.33712797164917,
"grad_norm": 0.1296703815460205,
"learning_rate": 1.0342381218621798e-08,
"entropy": 1.3263304769992827,
"num_tokens": 38992580.0,
"mean_token_accuracy": 0.6779214948415756,
"epoch": 2.811970283561787,
"step": 6720
},
{
"loss": 1.326553726196289,
"grad_norm": 0.11933048069477081,
"learning_rate": 9.890325123675324e-09,
"entropy": 1.3206409364938736,
"num_tokens": 39048629.0,
"mean_token_accuracy": 0.6810004383325576,
"epoch": 2.81615569739458,
"step": 6730
},
{
"loss": 1.2582717895507813,
"grad_norm": 0.13642576336860657,
"learning_rate": 9.44827185923036e-09,
"entropy": 1.3072202578186989,
"num_tokens": 39109993.0,
"mean_token_accuracy": 0.6876247569918632,
"epoch": 2.8203411112273726,
"step": 6740
},
{
"loss": 1.3292433738708496,
"grad_norm": 0.14473669230937958,
"learning_rate": 9.016230447316142e-09,
"entropy": 1.312994186580181,
"num_tokens": 39167300.0,
"mean_token_accuracy": 0.6826678797602653,
"epoch": 2.824526525060165,
"step": 6750
},
{
"loss": 1.2947887420654296,
"grad_norm": 0.14324665069580078,
"learning_rate": 8.59420970562652e-09,
"entropy": 1.301099643111229,
"num_tokens": 39225969.0,
"mean_token_accuracy": 0.6844487801194191,
"epoch": 2.8287119388929582,
"step": 6760
},
{
"loss": 1.3236183166503905,
"grad_norm": 0.12903346121311188,
"learning_rate": 8.182218247339557e-09,
"entropy": 1.3240256026387214,
"num_tokens": 39284458.0,
"mean_token_accuracy": 0.6791065171360969,
"epoch": 2.832897352725751,
"step": 6770
},
{
"loss": 1.2615336418151855,
"grad_norm": 0.13563166558742523,
"learning_rate": 7.7802644809421e-09,
"entropy": 1.2816553667187691,
"num_tokens": 39343117.0,
"mean_token_accuracy": 0.6875470012426377,
"epoch": 2.8370827665585434,
"step": 6780
},
{
"loss": 1.302596092224121,
"grad_norm": 0.1292281448841095,
"learning_rate": 7.388356610057878e-09,
"entropy": 1.3054527580738067,
"num_tokens": 39399894.0,
"mean_token_accuracy": 0.6824864596128464,
"epoch": 2.841268180391336,
"step": 6790
},
{
"loss": 1.266486930847168,
"grad_norm": 0.13657227158546448,
"learning_rate": 7.006502633280398e-09,
"entropy": 1.2712685942649842,
"num_tokens": 39460071.0,
"mean_token_accuracy": 0.6891940608620644,
"epoch": 2.845453594224129,
"step": 6800
},
{
"eval_loss": 1.2960532903671265,
"eval_runtime": 43.6876,
"eval_samples_per_second": 145.831,
"eval_steps_per_second": 6.089,
"eval_entropy": 1.3085124981134457,
"eval_num_tokens": 39460071.0,
"eval_mean_token_accuracy": 0.6840056230251054,
"epoch": 2.845453594224129,
"step": 6800
},
{
"loss": 1.2665786743164062,
"grad_norm": 0.13498687744140625,
"learning_rate": 6.6347103440092534e-09,
"entropy": 1.304034498333931,
"num_tokens": 39521096.0,
"mean_token_accuracy": 0.6852239608764649,
"epoch": 2.8496390080569216,
"step": 6810
},
{
"loss": 1.2787626266479493,
"grad_norm": 0.16180647909641266,
"learning_rate": 6.272987330291635e-09,
"entropy": 1.2805368885397912,
"num_tokens": 39577586.0,
"mean_token_accuracy": 0.6863118633627892,
"epoch": 2.853824421889714,
"step": 6820
},
{
"loss": 1.298048973083496,
"grad_norm": 0.14410941302776337,
"learning_rate": 5.921340974666733e-09,
"entropy": 1.315412837266922,
"num_tokens": 39635685.0,
"mean_token_accuracy": 0.6825838565826416,
"epoch": 2.858009835722507,
"step": 6830
},
{
"loss": 1.2789037704467774,
"grad_norm": 0.12575282156467438,
"learning_rate": 5.57977845401586e-09,
"entropy": 1.3064978927373887,
"num_tokens": 39698219.0,
"mean_token_accuracy": 0.6813527047634125,
"epoch": 2.8621952495553,
"step": 6840
},
{
"loss": 1.2924142837524415,
"grad_norm": 0.14836302399635315,
"learning_rate": 5.248306739415453e-09,
"entropy": 1.3290347814559937,
"num_tokens": 39758992.0,
"mean_token_accuracy": 0.6835691928863525,
"epoch": 2.8663806633880924,
"step": 6850
},
{
"loss": 1.3144015312194823,
"grad_norm": 0.1364041566848755,
"learning_rate": 4.926932595994804e-09,
"entropy": 1.3275774329900742,
"num_tokens": 39818224.0,
"mean_token_accuracy": 0.6781793549656868,
"epoch": 2.870566077220885,
"step": 6860
},
{
"loss": 1.2337275505065919,
"grad_norm": 0.15484337508678436,
"learning_rate": 4.61566258279833e-09,
"entropy": 1.2519328325986863,
"num_tokens": 39875194.0,
"mean_token_accuracy": 0.6923688799142838,
"epoch": 2.874751491053678,
"step": 6870
},
{
"loss": 1.3397406578063964,
"grad_norm": 0.13291706144809723,
"learning_rate": 4.314503052651408e-09,
"entropy": 1.3448736280202866,
"num_tokens": 39930172.0,
"mean_token_accuracy": 0.6757234945893288,
"epoch": 2.8789369048864706,
"step": 6880
},
{
"loss": 1.337942886352539,
"grad_norm": 0.1345345377922058,
"learning_rate": 4.023460152030811e-09,
"entropy": 1.3406923681497573,
"num_tokens": 39989342.0,
"mean_token_accuracy": 0.6747447595000267,
"epoch": 2.8831223187192636,
"step": 6890
},
{
"loss": 1.2957025527954102,
"grad_norm": 0.14848950505256653,
"learning_rate": 3.74253982093925e-09,
"entropy": 1.3245025753974915,
"num_tokens": 40044850.0,
"mean_token_accuracy": 0.6843625560402871,
"epoch": 2.887307732552056,
"step": 6900
},
{
"loss": 1.2321189880371093,
"grad_norm": 0.13322904706001282,
"learning_rate": 3.471747792784141e-09,
"entropy": 1.262104222178459,
"num_tokens": 40102787.0,
"mean_token_accuracy": 0.6951159760355949,
"epoch": 2.891493146384849,
"step": 6910
},
{
"loss": 1.2964617729187011,
"grad_norm": 0.11292250454425812,
"learning_rate": 3.211089594260585e-09,
"entropy": 1.3182623267173768,
"num_tokens": 40163366.0,
"mean_token_accuracy": 0.6819359913468361,
"epoch": 2.8956785602176414,
"step": 6920
},
{
"loss": 1.3324262619018554,
"grad_norm": 0.15124228596687317,
"learning_rate": 2.9605705452387943e-09,
"entropy": 1.3444043919444084,
"num_tokens": 40221526.0,
"mean_token_accuracy": 0.6710021272301674,
"epoch": 2.8998639740504344,
"step": 6930
},
{
"loss": 1.2928382873535156,
"grad_norm": 0.1519566923379898,
"learning_rate": 2.7201957586550084e-09,
"entropy": 1.299992610514164,
"num_tokens": 40277117.0,
"mean_token_accuracy": 0.6823776334524154,
"epoch": 2.904049387883227,
"step": 6940
},
{
"loss": 1.2859591484069823,
"grad_norm": 0.128694087266922,
"learning_rate": 2.489970140407638e-09,
"entropy": 1.3093111872673036,
"num_tokens": 40334263.0,
"mean_token_accuracy": 0.6845874279737473,
"epoch": 2.9082348017160196,
"step": 6950
},
{
"loss": 1.2404520988464356,
"grad_norm": 0.13099578022956848,
"learning_rate": 2.2698983892568413e-09,
"entropy": 1.2709315478801728,
"num_tokens": 40388570.0,
"mean_token_accuracy": 0.6908060133457183,
"epoch": 2.9124202155488126,
"step": 6960
},
{
"loss": 1.30335693359375,
"grad_norm": 0.12444788217544556,
"learning_rate": 2.0599849967287696e-09,
"entropy": 1.324267864227295,
"num_tokens": 40447336.0,
"mean_token_accuracy": 0.6855339229106903,
"epoch": 2.916605629381605,
"step": 6970
},
{
"loss": 1.3318076133728027,
"grad_norm": 0.14273810386657715,
"learning_rate": 1.860234247023973e-09,
"entropy": 1.345059370994568,
"num_tokens": 40509083.0,
"mean_token_accuracy": 0.6763337209820748,
"epoch": 2.9207910432143978,
"step": 6980
},
{
"loss": 1.2318530082702637,
"grad_norm": 0.13854491710662842,
"learning_rate": 1.6706502169296366e-09,
"entropy": 1.2656858801841735,
"num_tokens": 40566757.0,
"mean_token_accuracy": 0.6908913642168045,
"epoch": 2.9249764570471903,
"step": 6990
},
{
"loss": 1.2753348350524902,
"grad_norm": 0.1317194700241089,
"learning_rate": 1.4912367757366485e-09,
"entropy": 1.290731391310692,
"num_tokens": 40626221.0,
"mean_token_accuracy": 0.6856264978647232,
"epoch": 2.9291618708799834,
"step": 7000
},
{
"eval_loss": 1.296015739440918,
"eval_runtime": 42.3973,
"eval_samples_per_second": 150.269,
"eval_steps_per_second": 6.274,
"eval_entropy": 1.3096338595662798,
"eval_num_tokens": 40626221.0,
"eval_mean_token_accuracy": 0.6839476989624196,
"epoch": 2.9291618708799834,
"step": 7000
},
{
"loss": 1.3039022445678712,
"grad_norm": 0.1216905489563942,
"learning_rate": 1.3219975851607724e-09,
"entropy": 1.3058283895254135,
"num_tokens": 40684010.0,
"mean_token_accuracy": 0.6836184665560723,
"epoch": 2.933347284712776,
"step": 7010
},
{
"loss": 1.3275947570800781,
"grad_norm": 0.15599027276039124,
"learning_rate": 1.1629360992673754e-09,
"entropy": 1.3284942299127578,
"num_tokens": 40742081.0,
"mean_token_accuracy": 0.6771559327840805,
"epoch": 2.9375326985455685,
"step": 7020
},
{
"loss": 1.2864752769470216,
"grad_norm": 0.12884531915187836,
"learning_rate": 1.014055564401539e-09,
"entropy": 1.2828212678432465,
"num_tokens": 40801021.0,
"mean_token_accuracy": 0.68373833745718,
"epoch": 2.9417181123783616,
"step": 7030
},
{
"loss": 1.318696880340576,
"grad_norm": 0.1270311027765274,
"learning_rate": 8.753590191213356e-10,
"entropy": 1.3187800377607346,
"num_tokens": 40858738.0,
"mean_token_accuracy": 0.6756755083799362,
"epoch": 2.945903526211154,
"step": 7040
},
{
"loss": 1.274400520324707,
"grad_norm": 0.15278004109859467,
"learning_rate": 7.468492941362647e-10,
"entropy": 1.2988332599401473,
"num_tokens": 40917615.0,
"mean_token_accuracy": 0.6798348844051361,
"epoch": 2.9500889400439467,
"step": 7050
},
{
"loss": 1.2988153457641602,
"grad_norm": 0.1352374255657196,
"learning_rate": 6.285290122489128e-10,
"entropy": 1.3167090728878974,
"num_tokens": 40975462.0,
"mean_token_accuracy": 0.6834619447588921,
"epoch": 2.9542743538767393,
"step": 7060
},
{
"loss": 1.3265647888183594,
"grad_norm": 0.13059331476688385,
"learning_rate": 5.204005883019392e-10,
"entropy": 1.325848352909088,
"num_tokens": 41034824.0,
"mean_token_accuracy": 0.6760321959853173,
"epoch": 2.9584597677095323,
"step": 7070
},
{
"loss": 1.3572219848632812,
"grad_norm": 0.14386902749538422,
"learning_rate": 4.224662291285597e-10,
"entropy": 1.356474344432354,
"num_tokens": 41092574.0,
"mean_token_accuracy": 0.6717446967959404,
"epoch": 2.962645181542325,
"step": 7080
},
{
"loss": 1.362534523010254,
"grad_norm": 0.12292881309986115,
"learning_rate": 3.347279335074726e-10,
"entropy": 1.3564091578125954,
"num_tokens": 41151205.0,
"mean_token_accuracy": 0.6701879113912582,
"epoch": 2.966830595375118,
"step": 7090
},
{
"loss": 1.28519287109375,
"grad_norm": 0.17997978627681732,
"learning_rate": 2.571874921221129e-10,
"entropy": 1.2821272403001784,
"num_tokens": 41208932.0,
"mean_token_accuracy": 0.6873822212219238,
"epoch": 2.9710160092079105,
"step": 7100
},
{
"loss": 1.338641357421875,
"grad_norm": 0.13907863199710846,
"learning_rate": 1.8984648752429221e-10,
"entropy": 1.3435455054044723,
"num_tokens": 41265349.0,
"mean_token_accuracy": 0.6759589716792107,
"epoch": 2.975201423040703,
"step": 7110
},
{
"loss": 1.3811635971069336,
"grad_norm": 0.13205569982528687,
"learning_rate": 1.3270629410150335e-10,
"entropy": 1.4000085026025773,
"num_tokens": 41324164.0,
"mean_token_accuracy": 0.6713890418410301,
"epoch": 2.9793868368734957,
"step": 7120
},
{
"loss": 1.2836057662963867,
"grad_norm": 0.13349401950836182,
"learning_rate": 8.576807804921981e-11,
"entropy": 1.3049908488988877,
"num_tokens": 41383405.0,
"mean_token_accuracy": 0.6830727905035019,
"epoch": 2.9835722507062887,
"step": 7130
},
{
"loss": 1.2696043014526368,
"grad_norm": 0.17530304193496704,
"learning_rate": 4.903279734697063e-11,
"entropy": 1.2787989050149917,
"num_tokens": 41441198.0,
"mean_token_accuracy": 0.6855189517140389,
"epoch": 2.9877576645390813,
"step": 7140
},
{
"loss": 1.2740005493164062,
"grad_norm": 0.15486350655555725,
"learning_rate": 2.2501201738689414e-11,
"entropy": 1.293800377845764,
"num_tokens": 41496526.0,
"mean_token_accuracy": 0.6852918058633805,
"epoch": 2.991943078371874,
"step": 7150
},
{
"loss": 1.309797191619873,
"grad_norm": 0.15995003283023834,
"learning_rate": 6.173832717559779e-12,
"entropy": 1.321927347779274,
"num_tokens": 41546682.0,
"mean_token_accuracy": 0.6836003750562668,
"epoch": 2.996128492204667,
"step": 7160
},
{
"loss": 1.2691694259643556,
"grad_norm": 0.2798561751842499,
"learning_rate": 5.102351502417335e-14,
"entropy": 1.282310128211975,
"num_tokens": 41595613.0,
"mean_token_accuracy": 0.6916061446473405,
"epoch": 3.0,
"step": 7170
},
{
"train_runtime": 24564.0326,
"train_samples_per_second": 7.003,
"train_steps_per_second": 0.292,
"total_flos": 1.1844488431738552e+18,
"train_loss": 1.40203029108513,
"epoch": 3.0,
"step": 7170
}
]