| [ |
| { |
| "loss": 3.1532291412353515, |
| "grad_norm": 0.7252150177955627, |
| "learning_rate": 4.166666666666666e-08, |
| "entropy": 0.909868873655796, |
| "num_tokens": 59625.0, |
| "mean_token_accuracy": 0.5191385246813297, |
| "epoch": 0.004185413832792717, |
| "step": 10 |
| }, |
| { |
| "loss": 3.2842838287353517, |
| "grad_norm": 0.8542681336402893, |
| "learning_rate": 8.796296296296296e-08, |
| "entropy": 0.9062351793050766, |
| "num_tokens": 115055.0, |
| "mean_token_accuracy": 0.5090377777814865, |
| "epoch": 0.008370827665585435, |
| "step": 20 |
| }, |
| { |
| "loss": 3.120407485961914, |
| "grad_norm": 0.8136049509048462, |
| "learning_rate": 1.3425925925925926e-07, |
| "entropy": 0.8844040989875793, |
| "num_tokens": 173001.0, |
| "mean_token_accuracy": 0.5300635486841202, |
| "epoch": 0.012556241498378152, |
| "step": 30 |
| }, |
| { |
| "loss": 3.1775161743164064, |
| "grad_norm": 0.7916132807731628, |
| "learning_rate": 1.8055555555555554e-07, |
| "entropy": 0.9001422733068466, |
| "num_tokens": 230759.0, |
| "mean_token_accuracy": 0.5147604435682297, |
| "epoch": 0.01674165533117087, |
| "step": 40 |
| }, |
| { |
| "loss": 3.1207983016967775, |
| "grad_norm": 0.8365122675895691, |
| "learning_rate": 2.2685185185185184e-07, |
| "entropy": 0.9065567880868912, |
| "num_tokens": 290037.0, |
| "mean_token_accuracy": 0.516672582924366, |
| "epoch": 0.020927069163963585, |
| "step": 50 |
| }, |
| { |
| "loss": 3.110250473022461, |
| "grad_norm": 0.8340612649917603, |
| "learning_rate": 2.731481481481481e-07, |
| "entropy": 0.894629393517971, |
| "num_tokens": 348666.0, |
| "mean_token_accuracy": 0.5243608556687832, |
| "epoch": 0.025112482996756304, |
| "step": 60 |
| }, |
| { |
| "loss": 3.213370513916016, |
| "grad_norm": 0.7208571434020996, |
| "learning_rate": 3.194444444444444e-07, |
| "entropy": 0.9139490529894829, |
| "num_tokens": 404064.0, |
| "mean_token_accuracy": 0.510741152614355, |
| "epoch": 0.02929789682954902, |
| "step": 70 |
| }, |
| { |
| "loss": 3.1525503158569337, |
| "grad_norm": 0.8069764971733093, |
| "learning_rate": 3.657407407407407e-07, |
| "entropy": 0.8901543036103249, |
| "num_tokens": 459991.0, |
| "mean_token_accuracy": 0.5182561405003071, |
| "epoch": 0.03348331066234174, |
| "step": 80 |
| }, |
| { |
| "loss": 3.2056060791015626, |
| "grad_norm": 0.7775021195411682, |
| "learning_rate": 4.12037037037037e-07, |
| "entropy": 0.9084297090768814, |
| "num_tokens": 515607.0, |
| "mean_token_accuracy": 0.5117158360779286, |
| "epoch": 0.037668724495134455, |
| "step": 90 |
| }, |
| { |
| "loss": 3.1619991302490233, |
| "grad_norm": 0.7517663240432739, |
| "learning_rate": 4.5833333333333327e-07, |
| "entropy": 0.9325117334723473, |
| "num_tokens": 571331.0, |
| "mean_token_accuracy": 0.5146001622080802, |
| "epoch": 0.04185413832792717, |
| "step": 100 |
| }, |
| { |
| "loss": 3.083780288696289, |
| "grad_norm": 0.7766073346138, |
| "learning_rate": 5.046296296296297e-07, |
| "entropy": 0.9166652098298073, |
| "num_tokens": 628370.0, |
| "mean_token_accuracy": 0.523348405212164, |
| "epoch": 0.04603955216071989, |
| "step": 110 |
| }, |
| { |
| "loss": 2.9515827178955076, |
| "grad_norm": 0.7737402319908142, |
| "learning_rate": 5.509259259259259e-07, |
| "entropy": 0.8990944147109985, |
| "num_tokens": 691820.0, |
| "mean_token_accuracy": 0.5330506779253483, |
| "epoch": 0.05022496599351261, |
| "step": 120 |
| }, |
| { |
| "loss": 3.009184646606445, |
| "grad_norm": 0.7089793086051941, |
| "learning_rate": 5.972222222222222e-07, |
| "entropy": 0.9393417268991471, |
| "num_tokens": 750187.0, |
| "mean_token_accuracy": 0.52148522362113, |
| "epoch": 0.054410379826305325, |
| "step": 130 |
| }, |
| { |
| "loss": 2.931972885131836, |
| "grad_norm": 0.6590924263000488, |
| "learning_rate": 6.435185185185184e-07, |
| "entropy": 0.9367253214120865, |
| "num_tokens": 810727.0, |
| "mean_token_accuracy": 0.52733798250556, |
| "epoch": 0.05859579365909804, |
| "step": 140 |
| }, |
| { |
| "loss": 2.9645376205444336, |
| "grad_norm": 0.8957489728927612, |
| "learning_rate": 6.898148148148148e-07, |
| "entropy": 0.9482719719409942, |
| "num_tokens": 869700.0, |
| "mean_token_accuracy": 0.5252767078578472, |
| "epoch": 0.06278120749189076, |
| "step": 150 |
| }, |
| { |
| "loss": 2.9582921981811525, |
| "grad_norm": 0.7582520842552185, |
| "learning_rate": 7.361111111111111e-07, |
| "entropy": 0.9640893578529358, |
| "num_tokens": 929913.0, |
| "mean_token_accuracy": 0.5209386304020882, |
| "epoch": 0.06696662132468348, |
| "step": 160 |
| }, |
| { |
| "loss": 2.942486381530762, |
| "grad_norm": 0.8944061994552612, |
| "learning_rate": 7.824074074074074e-07, |
| "entropy": 1.0264274433255196, |
| "num_tokens": 987456.0, |
| "mean_token_accuracy": 0.5134634062647819, |
| "epoch": 0.0711520351574762, |
| "step": 170 |
| }, |
| { |
| "loss": 2.8791921615600584, |
| "grad_norm": 0.8744268417358398, |
| "learning_rate": 8.287037037037036e-07, |
| "entropy": 1.0515225112438202, |
| "num_tokens": 1045578.0, |
| "mean_token_accuracy": 0.5203944936394691, |
| "epoch": 0.07533744899026891, |
| "step": 180 |
| }, |
| { |
| "loss": 2.671405029296875, |
| "grad_norm": 0.6432790160179138, |
| "learning_rate": 8.75e-07, |
| "entropy": 1.0437349081039429, |
| "num_tokens": 1105831.0, |
| "mean_token_accuracy": 0.5427842013537884, |
| "epoch": 0.07952286282306163, |
| "step": 190 |
| }, |
| { |
| "loss": 2.658889389038086, |
| "grad_norm": 0.606299877166748, |
| "learning_rate": 9.212962962962962e-07, |
| "entropy": 1.1231032446026803, |
| "num_tokens": 1162666.0, |
| "mean_token_accuracy": 0.5306987896561622, |
| "epoch": 0.08370827665585434, |
| "step": 200 |
| }, |
| { |
| "eval_loss": 2.64770245552063, |
| "eval_runtime": 46.2508, |
| "eval_samples_per_second": 137.749, |
| "eval_steps_per_second": 5.751, |
| "eval_entropy": 1.1016171117474263, |
| "eval_num_tokens": 1162666.0, |
| "eval_mean_token_accuracy": 0.5367447220741358, |
| "epoch": 0.08370827665585434, |
| "step": 200 |
| }, |
| { |
| "loss": 2.6236072540283204, |
| "grad_norm": 0.497915118932724, |
| "learning_rate": 9.675925925925926e-07, |
| "entropy": 1.17362399995327, |
| "num_tokens": 1219379.0, |
| "mean_token_accuracy": 0.5290989577770233, |
| "epoch": 0.08789369048864706, |
| "step": 210 |
| }, |
| { |
| "loss": 2.469617462158203, |
| "grad_norm": 0.46626466512680054, |
| "learning_rate": 9.999995407884271e-07, |
| "entropy": 1.1599584549665451, |
| "num_tokens": 1280477.0, |
| "mean_token_accuracy": 0.54561567902565, |
| "epoch": 0.09207910432143979, |
| "step": 220 |
| }, |
| { |
| "loss": 2.5247661590576174, |
| "grad_norm": 0.4741360545158386, |
| "learning_rate": 9.999913770505991e-07, |
| "entropy": 1.2275266259908677, |
| "num_tokens": 1337757.0, |
| "mean_token_accuracy": 0.5331762477755546, |
| "epoch": 0.0962645181542325, |
| "step": 230 |
| }, |
| { |
| "loss": 2.395884704589844, |
| "grad_norm": 0.48130306601524353, |
| "learning_rate": 9.999730088029378e-07, |
| "entropy": 1.2018450900912285, |
| "num_tokens": 1397229.0, |
| "mean_token_accuracy": 0.5442760087549686, |
| "epoch": 0.10044993198702522, |
| "step": 240 |
| }, |
| { |
| "loss": 2.3129932403564455, |
| "grad_norm": 0.43950778245925903, |
| "learning_rate": 9.99944436420327e-07, |
| "entropy": 1.2097549244761467, |
| "num_tokens": 1457262.0, |
| "mean_token_accuracy": 0.5529780350625515, |
| "epoch": 0.10463534581981794, |
| "step": 250 |
| }, |
| { |
| "loss": 2.4272939682006838, |
| "grad_norm": 0.4790073335170746, |
| "learning_rate": 9.999056604859114e-07, |
| "entropy": 1.2772572070360184, |
| "num_tokens": 1511629.0, |
| "mean_token_accuracy": 0.5349524199962616, |
| "epoch": 0.10882075965261065, |
| "step": 260 |
| }, |
| { |
| "loss": 2.3150350570678713, |
| "grad_norm": 0.3617287278175354, |
| "learning_rate": 9.998566817910835e-07, |
| "entropy": 1.2845856487751006, |
| "num_tokens": 1567691.0, |
| "mean_token_accuracy": 0.5448164060711861, |
| "epoch": 0.11300617348540337, |
| "step": 270 |
| }, |
| { |
| "loss": 2.2049863815307615, |
| "grad_norm": 0.31779831647872925, |
| "learning_rate": 9.997975013354675e-07, |
| "entropy": 1.271442210674286, |
| "num_tokens": 1628624.0, |
| "mean_token_accuracy": 0.5550637729465961, |
| "epoch": 0.11719158731819608, |
| "step": 280 |
| }, |
| { |
| "loss": 2.1529918670654298, |
| "grad_norm": 0.32820868492126465, |
| "learning_rate": 9.997281203268995e-07, |
| "entropy": 1.279099041223526, |
| "num_tokens": 1689782.0, |
| "mean_token_accuracy": 0.5614619500935077, |
| "epoch": 0.1213770011509888, |
| "step": 290 |
| }, |
| { |
| "loss": 2.1452268600463866, |
| "grad_norm": 0.3186478018760681, |
| "learning_rate": 9.99648540181402e-07, |
| "entropy": 1.323472622036934, |
| "num_tokens": 1749745.0, |
| "mean_token_accuracy": 0.556930074095726, |
| "epoch": 0.1255624149837815, |
| "step": 300 |
| }, |
| { |
| "loss": 2.134934997558594, |
| "grad_norm": 0.30079811811447144, |
| "learning_rate": 9.995587625231564e-07, |
| "entropy": 1.3632762670516967, |
| "num_tokens": 1807987.0, |
| "mean_token_accuracy": 0.5517515152692795, |
| "epoch": 0.12974782881657423, |
| "step": 310 |
| }, |
| { |
| "loss": 1.9839101791381837, |
| "grad_norm": 0.361982136964798, |
| "learning_rate": 9.994587891844675e-07, |
| "entropy": 1.3298138067126275, |
| "num_tokens": 1867566.0, |
| "mean_token_accuracy": 0.57241967394948, |
| "epoch": 0.13393324264936696, |
| "step": 320 |
| }, |
| { |
| "loss": 2.0360176086425783, |
| "grad_norm": 0.3045739233493805, |
| "learning_rate": 9.99348622205729e-07, |
| "entropy": 1.3972268849611282, |
| "num_tokens": 1926290.0, |
| "mean_token_accuracy": 0.5540065504610538, |
| "epoch": 0.13811865648215968, |
| "step": 330 |
| }, |
| { |
| "loss": 2.0286712646484375, |
| "grad_norm": 0.31182143092155457, |
| "learning_rate": 9.99228263835379e-07, |
| "entropy": 1.4252518475055695, |
| "num_tokens": 1985494.0, |
| "mean_token_accuracy": 0.5559632822871208, |
| "epoch": 0.1423040703149524, |
| "step": 340 |
| }, |
| { |
| "loss": 2.064646911621094, |
| "grad_norm": 0.32973718643188477, |
| "learning_rate": 9.990977165298569e-07, |
| "entropy": 1.4729229807853699, |
| "num_tokens": 2044451.0, |
| "mean_token_accuracy": 0.542438729852438, |
| "epoch": 0.1464894841477451, |
| "step": 350 |
| }, |
| { |
| "loss": 1.9804191589355469, |
| "grad_norm": 0.289235383272171, |
| "learning_rate": 9.989569829535508e-07, |
| "entropy": 1.465097615122795, |
| "num_tokens": 2104274.0, |
| "mean_token_accuracy": 0.5540824517607689, |
| "epoch": 0.15067489798053782, |
| "step": 360 |
| }, |
| { |
| "loss": 1.9926959991455078, |
| "grad_norm": 0.29901251196861267, |
| "learning_rate": 9.988060659787448e-07, |
| "entropy": 1.5359219849109649, |
| "num_tokens": 2159709.0, |
| "mean_token_accuracy": 0.5466853015124797, |
| "epoch": 0.15486031181333054, |
| "step": 370 |
| }, |
| { |
| "loss": 1.9560510635375976, |
| "grad_norm": 0.2495652139186859, |
| "learning_rate": 9.986449686855592e-07, |
| "entropy": 1.5187518745660782, |
| "num_tokens": 2215716.0, |
| "mean_token_accuracy": 0.5528687633574009, |
| "epoch": 0.15904572564612326, |
| "step": 380 |
| }, |
| { |
| "loss": 1.9273345947265625, |
| "grad_norm": 0.24777130782604218, |
| "learning_rate": 9.984736943618888e-07, |
| "entropy": 1.521226641535759, |
| "num_tokens": 2271587.0, |
| "mean_token_accuracy": 0.5515810877084732, |
| "epoch": 0.163231139478916, |
| "step": 390 |
| }, |
| { |
| "loss": 1.8669830322265626, |
| "grad_norm": 0.22564777731895447, |
| "learning_rate": 9.982922465033348e-07, |
| "entropy": 1.5213739037513734, |
| "num_tokens": 2333246.0, |
| "mean_token_accuracy": 0.5613327234983444, |
| "epoch": 0.16741655331170868, |
| "step": 400 |
| }, |
| { |
| "eval_loss": 1.8593320846557617, |
| "eval_runtime": 43.8766, |
| "eval_samples_per_second": 145.203, |
| "eval_steps_per_second": 6.062, |
| "eval_entropy": 1.4658072012707704, |
| "eval_num_tokens": 2333246.0, |
| "eval_mean_token_accuracy": 0.5688402788307434, |
| "epoch": 0.16741655331170868, |
| "step": 400 |
| }, |
| { |
| "loss": 1.804941177368164, |
| "grad_norm": 0.23039200901985168, |
| "learning_rate": 9.981006288131342e-07, |
| "entropy": 1.4880431205034257, |
| "num_tokens": 2391775.0, |
| "mean_token_accuracy": 0.5735716104507447, |
| "epoch": 0.1716019671445014, |
| "step": 410 |
| }, |
| { |
| "loss": 1.8068187713623047, |
| "grad_norm": 0.2117597907781601, |
| "learning_rate": 9.978988452020832e-07, |
| "entropy": 1.5207171112298965, |
| "num_tokens": 2449834.0, |
| "mean_token_accuracy": 0.5626526214182377, |
| "epoch": 0.17578738097729413, |
| "step": 420 |
| }, |
| { |
| "loss": 1.8408426284790038, |
| "grad_norm": 0.18858124315738678, |
| "learning_rate": 9.97686899788459e-07, |
| "entropy": 1.5431180804967881, |
| "num_tokens": 2509956.0, |
| "mean_token_accuracy": 0.5639997899532319, |
| "epoch": 0.17997279481008685, |
| "step": 430 |
| }, |
| { |
| "loss": 1.8002569198608398, |
| "grad_norm": 0.20246392488479614, |
| "learning_rate": 9.974647968979339e-07, |
| "entropy": 1.5480373591184615, |
| "num_tokens": 2566510.0, |
| "mean_token_accuracy": 0.5718876734375954, |
| "epoch": 0.18415820864287957, |
| "step": 440 |
| }, |
| { |
| "loss": 1.7867300033569335, |
| "grad_norm": 0.1937456578016281, |
| "learning_rate": 9.972325410634885e-07, |
| "entropy": 1.531213068962097, |
| "num_tokens": 2624567.0, |
| "mean_token_accuracy": 0.5752546131610871, |
| "epoch": 0.1883436224756723, |
| "step": 450 |
| }, |
| { |
| "loss": 1.755255889892578, |
| "grad_norm": 0.1926703304052353, |
| "learning_rate": 9.969901370253187e-07, |
| "entropy": 1.5252358853816985, |
| "num_tokens": 2682287.0, |
| "mean_token_accuracy": 0.5824674129486084, |
| "epoch": 0.192529036308465, |
| "step": 460 |
| }, |
| { |
| "loss": 1.772911834716797, |
| "grad_norm": 0.21854747831821442, |
| "learning_rate": 9.96737589730738e-07, |
| "entropy": 1.575288510322571, |
| "num_tokens": 2735734.0, |
| "mean_token_accuracy": 0.5827064469456673, |
| "epoch": 0.1967144501412577, |
| "step": 470 |
| }, |
| { |
| "loss": 1.6943496704101562, |
| "grad_norm": 0.16926386952400208, |
| "learning_rate": 9.964749043340788e-07, |
| "entropy": 1.4982535749673844, |
| "num_tokens": 2794073.0, |
| "mean_token_accuracy": 0.5961055085062981, |
| "epoch": 0.20089986397405044, |
| "step": 480 |
| }, |
| { |
| "loss": 1.759925651550293, |
| "grad_norm": 0.18107837438583374, |
| "learning_rate": 9.962020861965843e-07, |
| "entropy": 1.5479711294174194, |
| "num_tokens": 2852819.0, |
| "mean_token_accuracy": 0.5893502697348595, |
| "epoch": 0.20508527780684316, |
| "step": 490 |
| }, |
| { |
| "loss": 1.6459325790405273, |
| "grad_norm": 0.19736573100090027, |
| "learning_rate": 9.959191408863014e-07, |
| "entropy": 1.4799151957035064, |
| "num_tokens": 2916292.0, |
| "mean_token_accuracy": 0.6062492698431015, |
| "epoch": 0.20927069163963588, |
| "step": 500 |
| }, |
| { |
| "loss": 1.6710922241210937, |
| "grad_norm": 0.20262014865875244, |
| "learning_rate": 9.956260741779665e-07, |
| "entropy": 1.5181541979312896, |
| "num_tokens": 2975466.0, |
| "mean_token_accuracy": 0.6021158128976822, |
| "epoch": 0.21345610547242858, |
| "step": 510 |
| }, |
| { |
| "loss": 1.6909679412841796, |
| "grad_norm": 0.22447596490383148, |
| "learning_rate": 9.953228920528865e-07, |
| "entropy": 1.523398867249489, |
| "num_tokens": 3033907.0, |
| "mean_token_accuracy": 0.5984062060713768, |
| "epoch": 0.2176415193052213, |
| "step": 520 |
| }, |
| { |
| "loss": 1.6476320266723632, |
| "grad_norm": 0.20474065840244293, |
| "learning_rate": 9.950096006988182e-07, |
| "entropy": 1.5151092141866684, |
| "num_tokens": 3090006.0, |
| "mean_token_accuracy": 0.6024264812469482, |
| "epoch": 0.22182693313801402, |
| "step": 530 |
| }, |
| { |
| "loss": 1.656897735595703, |
| "grad_norm": 0.2609263062477112, |
| "learning_rate": 9.946862065098413e-07, |
| "entropy": 1.5152370780706406, |
| "num_tokens": 3145289.0, |
| "mean_token_accuracy": 0.6096841841936111, |
| "epoch": 0.22601234697080674, |
| "step": 540 |
| }, |
| { |
| "loss": 1.540487289428711, |
| "grad_norm": 0.21252916753292084, |
| "learning_rate": 9.943527160862281e-07, |
| "entropy": 1.4425812840461731, |
| "num_tokens": 3202970.0, |
| "mean_token_accuracy": 0.6310827702283859, |
| "epoch": 0.23019776080359947, |
| "step": 550 |
| }, |
| { |
| "loss": 1.5528440475463867, |
| "grad_norm": 0.16846199333667755, |
| "learning_rate": 9.940091362343086e-07, |
| "entropy": 1.4279247790575027, |
| "num_tokens": 3258895.0, |
| "mean_token_accuracy": 0.6345707163214683, |
| "epoch": 0.23438317463639216, |
| "step": 560 |
| }, |
| { |
| "loss": 1.5468204498291016, |
| "grad_norm": 0.1584591120481491, |
| "learning_rate": 9.936554739663315e-07, |
| "entropy": 1.4223629891872407, |
| "num_tokens": 3315602.0, |
| "mean_token_accuracy": 0.6405046731233597, |
| "epoch": 0.23856858846918488, |
| "step": 570 |
| }, |
| { |
| "loss": 1.5502227783203124, |
| "grad_norm": 0.1557629555463791, |
| "learning_rate": 9.932917365003216e-07, |
| "entropy": 1.3991417795419694, |
| "num_tokens": 3376637.0, |
| "mean_token_accuracy": 0.6378504887223244, |
| "epoch": 0.2427540023019776, |
| "step": 580 |
| }, |
| { |
| "loss": 1.47230224609375, |
| "grad_norm": 0.16370368003845215, |
| "learning_rate": 9.929179312599317e-07, |
| "entropy": 1.3776833653450011, |
| "num_tokens": 3435084.0, |
| "mean_token_accuracy": 0.655489268898964, |
| "epoch": 0.24693941613477033, |
| "step": 590 |
| }, |
| { |
| "loss": 1.5052314758300782, |
| "grad_norm": 0.15081895887851715, |
| "learning_rate": 9.925340658742926e-07, |
| "entropy": 1.4087885320186615, |
| "num_tokens": 3491147.0, |
| "mean_token_accuracy": 0.6534322142601013, |
| "epoch": 0.251124829967563, |
| "step": 600 |
| }, |
| { |
| "eval_loss": 1.5126348733901978, |
| "eval_runtime": 43.6936, |
| "eval_samples_per_second": 145.811, |
| "eval_steps_per_second": 6.088, |
| "eval_entropy": 1.3902130023877424, |
| "eval_num_tokens": 3491147.0, |
| "eval_mean_token_accuracy": 0.654797031018967, |
| "epoch": 0.251124829967563, |
| "step": 600 |
| }, |
| { |
| "loss": 1.5386703491210938, |
| "grad_norm": 0.13764511048793793, |
| "learning_rate": 9.921401481778548e-07, |
| "entropy": 1.4563434034585954, |
| "num_tokens": 3547282.0, |
| "mean_token_accuracy": 0.6465040192008018, |
| "epoch": 0.2553102438003558, |
| "step": 610 |
| }, |
| { |
| "loss": 1.5452125549316407, |
| "grad_norm": 0.12197960168123245, |
| "learning_rate": 9.917361862102316e-07, |
| "entropy": 1.4381413817405702, |
| "num_tokens": 3604669.0, |
| "mean_token_accuracy": 0.6443240866065025, |
| "epoch": 0.25949565763314847, |
| "step": 620 |
| }, |
| { |
| "loss": 1.5622711181640625, |
| "grad_norm": 0.11413100361824036, |
| "learning_rate": 9.913221882160325e-07, |
| "entropy": 1.4670737832784653, |
| "num_tokens": 3660716.0, |
| "mean_token_accuracy": 0.6438136756420135, |
| "epoch": 0.2636810714659412, |
| "step": 630 |
| }, |
| { |
| "loss": 1.5195579528808594, |
| "grad_norm": 0.11969699710607529, |
| "learning_rate": 9.908981626446967e-07, |
| "entropy": 1.44781274497509, |
| "num_tokens": 3716067.0, |
| "mean_token_accuracy": 0.6527407199144364, |
| "epoch": 0.2678664852987339, |
| "step": 640 |
| }, |
| { |
| "loss": 1.562466812133789, |
| "grad_norm": 0.10654503107070923, |
| "learning_rate": 9.904641181503193e-07, |
| "entropy": 1.4735447496175766, |
| "num_tokens": 3772158.0, |
| "mean_token_accuracy": 0.6453444182872772, |
| "epoch": 0.2720518991315266, |
| "step": 650 |
| }, |
| { |
| "loss": 1.4119970321655273, |
| "grad_norm": 0.11862610280513763, |
| "learning_rate": 9.900200635914762e-07, |
| "entropy": 1.3951878100633621, |
| "num_tokens": 3833284.0, |
| "mean_token_accuracy": 0.6668122097849846, |
| "epoch": 0.27623731296431936, |
| "step": 660 |
| }, |
| { |
| "loss": 1.4448695182800293, |
| "grad_norm": 0.11796533316373825, |
| "learning_rate": 9.895660080310418e-07, |
| "entropy": 1.4141918390989303, |
| "num_tokens": 3890126.0, |
| "mean_token_accuracy": 0.6602638632059097, |
| "epoch": 0.28042272679711205, |
| "step": 670 |
| }, |
| { |
| "loss": 1.4727934837341308, |
| "grad_norm": 0.10836026817560196, |
| "learning_rate": 9.891019607360042e-07, |
| "entropy": 1.4221189886331558, |
| "num_tokens": 3946816.0, |
| "mean_token_accuracy": 0.6601494640111923, |
| "epoch": 0.2846081406299048, |
| "step": 680 |
| }, |
| { |
| "loss": 1.4374773979187012, |
| "grad_norm": 0.11260558664798737, |
| "learning_rate": 9.88627931177278e-07, |
| "entropy": 1.388827046751976, |
| "num_tokens": 4006104.0, |
| "mean_token_accuracy": 0.6628721192479133, |
| "epoch": 0.2887935544626975, |
| "step": 690 |
| }, |
| { |
| "loss": 1.4522128105163574, |
| "grad_norm": 0.09237143397331238, |
| "learning_rate": 9.88143929029508e-07, |
| "entropy": 1.4040265291929246, |
| "num_tokens": 4064859.0, |
| "mean_token_accuracy": 0.6620885074138642, |
| "epoch": 0.2929789682954902, |
| "step": 700 |
| }, |
| { |
| "loss": 1.4630813598632812, |
| "grad_norm": 0.11288689821958542, |
| "learning_rate": 9.876499641708741e-07, |
| "entropy": 1.4170700162649155, |
| "num_tokens": 4122576.0, |
| "mean_token_accuracy": 0.6592713505029678, |
| "epoch": 0.29716438212828294, |
| "step": 710 |
| }, |
| { |
| "loss": 1.3941055297851563, |
| "grad_norm": 0.09874723106622696, |
| "learning_rate": 9.871460466828888e-07, |
| "entropy": 1.3975009769201279, |
| "num_tokens": 4180815.0, |
| "mean_token_accuracy": 0.6696879684925079, |
| "epoch": 0.30134979596107564, |
| "step": 720 |
| }, |
| { |
| "loss": 1.4689726829528809, |
| "grad_norm": 0.10610879957675934, |
| "learning_rate": 9.866321868501912e-07, |
| "entropy": 1.4678748458623887, |
| "num_tokens": 4239666.0, |
| "mean_token_accuracy": 0.6571864351630211, |
| "epoch": 0.3055352097938684, |
| "step": 730 |
| }, |
| { |
| "loss": 1.479258155822754, |
| "grad_norm": 0.12200459837913513, |
| "learning_rate": 9.861083951603377e-07, |
| "entropy": 1.430861946940422, |
| "num_tokens": 4297700.0, |
| "mean_token_accuracy": 0.6564601019024849, |
| "epoch": 0.3097206236266611, |
| "step": 740 |
| }, |
| { |
| "loss": 1.4389605522155762, |
| "grad_norm": 0.12583598494529724, |
| "learning_rate": 9.855746823035876e-07, |
| "entropy": 1.432998749613762, |
| "num_tokens": 4355152.0, |
| "mean_token_accuracy": 0.6649609237909317, |
| "epoch": 0.3139060374594538, |
| "step": 750 |
| }, |
| { |
| "loss": 1.451263999938965, |
| "grad_norm": 0.11383051425218582, |
| "learning_rate": 9.850310591726846e-07, |
| "entropy": 1.4290786892175675, |
| "num_tokens": 4414094.0, |
| "mean_token_accuracy": 0.6572059765458107, |
| "epoch": 0.31809145129224653, |
| "step": 760 |
| }, |
| { |
| "loss": 1.5145987510681151, |
| "grad_norm": 0.10066704452037811, |
| "learning_rate": 9.844775368626358e-07, |
| "entropy": 1.4577032029628754, |
| "num_tokens": 4472646.0, |
| "mean_token_accuracy": 0.6500703617930412, |
| "epoch": 0.3222768651250392, |
| "step": 770 |
| }, |
| { |
| "loss": 1.4831979751586915, |
| "grad_norm": 0.08656121045351028, |
| "learning_rate": 9.839141266704833e-07, |
| "entropy": 1.4568557769060135, |
| "num_tokens": 4529048.0, |
| "mean_token_accuracy": 0.6541818514466285, |
| "epoch": 0.326462278957832, |
| "step": 780 |
| }, |
| { |
| "loss": 1.422746181488037, |
| "grad_norm": 0.10030363500118256, |
| "learning_rate": 9.833408400950753e-07, |
| "entropy": 1.4236784011125565, |
| "num_tokens": 4587248.0, |
| "mean_token_accuracy": 0.6693221822381019, |
| "epoch": 0.33064769279062467, |
| "step": 790 |
| }, |
| { |
| "loss": 1.511890697479248, |
| "grad_norm": 0.1208115741610527, |
| "learning_rate": 9.827576888368306e-07, |
| "entropy": 1.4541470259428024, |
| "num_tokens": 4643727.0, |
| "mean_token_accuracy": 0.6536489054560661, |
| "epoch": 0.33483310662341736, |
| "step": 800 |
| }, |
| { |
| "eval_loss": 1.4582873582839966, |
| "eval_runtime": 43.7122, |
| "eval_samples_per_second": 145.749, |
| "eval_steps_per_second": 6.085, |
| "eval_entropy": 1.4234498008749539, |
| "eval_num_tokens": 4643727.0, |
| "eval_mean_token_accuracy": 0.6611158966569972, |
| "epoch": 0.33483310662341736, |
| "step": 800 |
| }, |
| { |
| "loss": 1.519627285003662, |
| "grad_norm": 0.10558852553367615, |
| "learning_rate": 9.821646847974998e-07, |
| "entropy": 1.4889809876680373, |
| "num_tokens": 4699602.0, |
| "mean_token_accuracy": 0.6542887255549431, |
| "epoch": 0.3390185204562101, |
| "step": 810 |
| }, |
| { |
| "loss": 1.4185623168945312, |
| "grad_norm": 0.11619652807712555, |
| "learning_rate": 9.815618400799228e-07, |
| "entropy": 1.4101483166217803, |
| "num_tokens": 4759712.0, |
| "mean_token_accuracy": 0.6648697286844254, |
| "epoch": 0.3432039342890028, |
| "step": 820 |
| }, |
| { |
| "loss": 1.4773643493652344, |
| "grad_norm": 0.10418440401554108, |
| "learning_rate": 9.809491669877815e-07, |
| "entropy": 1.4431717425584794, |
| "num_tokens": 4817721.0, |
| "mean_token_accuracy": 0.6592238992452621, |
| "epoch": 0.34738934812179556, |
| "step": 830 |
| }, |
| { |
| "loss": 1.4270614624023437, |
| "grad_norm": 0.09047893434762955, |
| "learning_rate": 9.803266780253487e-07, |
| "entropy": 1.4172182738780976, |
| "num_tokens": 4876476.0, |
| "mean_token_accuracy": 0.6634449914097786, |
| "epoch": 0.35157476195458826, |
| "step": 840 |
| }, |
| { |
| "loss": 1.4365344047546387, |
| "grad_norm": 0.11413703858852386, |
| "learning_rate": 9.796943858972328e-07, |
| "entropy": 1.424839785695076, |
| "num_tokens": 4935356.0, |
| "mean_token_accuracy": 0.664018252491951, |
| "epoch": 0.35576017578738095, |
| "step": 850 |
| }, |
| { |
| "loss": 1.4368658065795898, |
| "grad_norm": 0.10197298973798752, |
| "learning_rate": 9.790523035081194e-07, |
| "entropy": 1.4327729046344757, |
| "num_tokens": 4996023.0, |
| "mean_token_accuracy": 0.667100901901722, |
| "epoch": 0.3599455896201737, |
| "step": 860 |
| }, |
| { |
| "loss": 1.4285932540893556, |
| "grad_norm": 0.09999420493841171, |
| "learning_rate": 9.78400443962506e-07, |
| "entropy": 1.4451387345790863, |
| "num_tokens": 5054143.0, |
| "mean_token_accuracy": 0.6665249273180962, |
| "epoch": 0.3641310034529664, |
| "step": 870 |
| }, |
| { |
| "loss": 1.4537543296813964, |
| "grad_norm": 0.12824219465255737, |
| "learning_rate": 9.777388205644365e-07, |
| "entropy": 1.4365610003471374, |
| "num_tokens": 5109151.0, |
| "mean_token_accuracy": 0.6605026423931122, |
| "epoch": 0.36831641728575915, |
| "step": 880 |
| }, |
| { |
| "loss": 1.4052467346191406, |
| "grad_norm": 0.10136168450117111, |
| "learning_rate": 9.770674468172288e-07, |
| "entropy": 1.4461679026484489, |
| "num_tokens": 5169545.0, |
| "mean_token_accuracy": 0.6698134854435921, |
| "epoch": 0.37250183111855184, |
| "step": 890 |
| }, |
| { |
| "loss": 1.5341646194458007, |
| "grad_norm": 0.125015527009964, |
| "learning_rate": 9.763863364231995e-07, |
| "entropy": 1.4948209792375564, |
| "num_tokens": 5226362.0, |
| "mean_token_accuracy": 0.6506395027041435, |
| "epoch": 0.3766872449513446, |
| "step": 900 |
| }, |
| { |
| "loss": 1.451594066619873, |
| "grad_norm": 0.12184764444828033, |
| "learning_rate": 9.75695503283383e-07, |
| "entropy": 1.454634991288185, |
| "num_tokens": 5287471.0, |
| "mean_token_accuracy": 0.6617233619093895, |
| "epoch": 0.3808726587841373, |
| "step": 910 |
| }, |
| { |
| "loss": 1.3663444519042969, |
| "grad_norm": 0.09586543589830399, |
| "learning_rate": 9.749949614972505e-07, |
| "entropy": 1.4007300227880477, |
| "num_tokens": 5346427.0, |
| "mean_token_accuracy": 0.6761364534497261, |
| "epoch": 0.38505807261693, |
| "step": 920 |
| }, |
| { |
| "loss": 1.4433299064636231, |
| "grad_norm": 0.09879063069820404, |
| "learning_rate": 9.74284725362419e-07, |
| "entropy": 1.44069661796093, |
| "num_tokens": 5406471.0, |
| "mean_token_accuracy": 0.6573658585548401, |
| "epoch": 0.38924348644972273, |
| "step": 930 |
| }, |
| { |
| "loss": 1.3213248252868652, |
| "grad_norm": 0.09394767135381699, |
| "learning_rate": 9.735648093743621e-07, |
| "entropy": 1.3663470640778541, |
| "num_tokens": 5468090.0, |
| "mean_token_accuracy": 0.6877701610326767, |
| "epoch": 0.3934289002825154, |
| "step": 940 |
| }, |
| { |
| "loss": 1.3884021759033203, |
| "grad_norm": 0.10035385936498642, |
| "learning_rate": 9.728352282261124e-07, |
| "entropy": 1.4055696964263915, |
| "num_tokens": 5527409.0, |
| "mean_token_accuracy": 0.6716061800718307, |
| "epoch": 0.3976143141153082, |
| "step": 950 |
| }, |
| { |
| "loss": 1.4895167350769043, |
| "grad_norm": 0.1361590176820755, |
| "learning_rate": 9.72095996807963e-07, |
| "entropy": 1.4704587817192079, |
| "num_tokens": 5586447.0, |
| "mean_token_accuracy": 0.6556992784142495, |
| "epoch": 0.40179972794810087, |
| "step": 960 |
| }, |
| { |
| "loss": 1.422182846069336, |
| "grad_norm": 0.12393207103013992, |
| "learning_rate": 9.713471302071624e-07, |
| "entropy": 1.4276411414146424, |
| "num_tokens": 5644917.0, |
| "mean_token_accuracy": 0.663788178563118, |
| "epoch": 0.40598514178089357, |
| "step": 970 |
| }, |
| { |
| "loss": 1.4414152145385741, |
| "grad_norm": 0.12177922576665878, |
| "learning_rate": 9.705886437076078e-07, |
| "entropy": 1.4314857304096222, |
| "num_tokens": 5706907.0, |
| "mean_token_accuracy": 0.6635714635252953, |
| "epoch": 0.4101705556136863, |
| "step": 980 |
| }, |
| { |
| "loss": 1.4422160148620606, |
| "grad_norm": 0.09565871953964233, |
| "learning_rate": 9.698205527895317e-07, |
| "entropy": 1.4681658923625946, |
| "num_tokens": 5767067.0, |
| "mean_token_accuracy": 0.6618433445692062, |
| "epoch": 0.414355969446479, |
| "step": 990 |
| }, |
| { |
| "loss": 1.3973498344421387, |
| "grad_norm": 0.11843396723270416, |
| "learning_rate": 9.69042873129187e-07, |
| "entropy": 1.4245391979813575, |
| "num_tokens": 5826368.0, |
| "mean_token_accuracy": 0.6698687911033631, |
| "epoch": 0.41854138327927176, |
| "step": 1000 |
| }, |
| { |
| "eval_loss": 1.430882215499878, |
| "eval_runtime": 42.5472, |
| "eval_samples_per_second": 149.739, |
| "eval_steps_per_second": 6.252, |
| "eval_entropy": 1.4260875381025158, |
| "eval_num_tokens": 5826368.0, |
| "eval_mean_token_accuracy": 0.6650281033121553, |
| "epoch": 0.41854138327927176, |
| "step": 1000 |
| }, |
| { |
| "loss": 1.454050064086914, |
| "grad_norm": 0.11938533186912537, |
| "learning_rate": 9.682556205985273e-07, |
| "entropy": 1.447835522890091, |
| "num_tokens": 5882058.0, |
| "mean_token_accuracy": 0.6635008811950683, |
| "epoch": 0.42272679711206446, |
| "step": 1010 |
| }, |
| { |
| "loss": 1.3930376052856446, |
| "grad_norm": 0.1063380166888237, |
| "learning_rate": 9.674588112648819e-07, |
| "entropy": 1.4178766876459121, |
| "num_tokens": 5938913.0, |
| "mean_token_accuracy": 0.6699633210897445, |
| "epoch": 0.42691221094485715, |
| "step": 1020 |
| }, |
| { |
| "loss": 1.4532501220703125, |
| "grad_norm": 0.1084047332406044, |
| "learning_rate": 9.666524613906283e-07, |
| "entropy": 1.4572493433952332, |
| "num_tokens": 5993465.0, |
| "mean_token_accuracy": 0.6680980160832405, |
| "epoch": 0.4310976247776499, |
| "step": 1030 |
| }, |
| { |
| "loss": 1.4012516021728516, |
| "grad_norm": 0.10825818032026291, |
| "learning_rate": 9.658365874328613e-07, |
| "entropy": 1.434103360772133, |
| "num_tokens": 6049913.0, |
| "mean_token_accuracy": 0.670105955004692, |
| "epoch": 0.4352830386104426, |
| "step": 1040 |
| }, |
| { |
| "loss": 1.4673041343688964, |
| "grad_norm": 0.09340775012969971, |
| "learning_rate": 9.650112060430556e-07, |
| "entropy": 1.4505166023969651, |
| "num_tokens": 6106055.0, |
| "mean_token_accuracy": 0.6630285322666168, |
| "epoch": 0.43946845244323535, |
| "step": 1050 |
| }, |
| { |
| "loss": 1.425284481048584, |
| "grad_norm": 0.12269195914268494, |
| "learning_rate": 9.641763340667264e-07, |
| "entropy": 1.438645276427269, |
| "num_tokens": 6165982.0, |
| "mean_token_accuracy": 0.6631047874689102, |
| "epoch": 0.44365386627602804, |
| "step": 1060 |
| }, |
| { |
| "loss": 1.4093000411987304, |
| "grad_norm": 0.12008947134017944, |
| "learning_rate": 9.633319885430863e-07, |
| "entropy": 1.4247242331504821, |
| "num_tokens": 6221254.0, |
| "mean_token_accuracy": 0.668901015818119, |
| "epoch": 0.44783928010882074, |
| "step": 1070 |
| }, |
| { |
| "loss": 1.4575057983398438, |
| "grad_norm": 0.12600930035114288, |
| "learning_rate": 9.62478186704697e-07, |
| "entropy": 1.4588077813386917, |
| "num_tokens": 6281193.0, |
| "mean_token_accuracy": 0.659762179851532, |
| "epoch": 0.4520246939416135, |
| "step": 1080 |
| }, |
| { |
| "loss": 1.3971601486206056, |
| "grad_norm": 0.09669267386198044, |
| "learning_rate": 9.616149459771174e-07, |
| "entropy": 1.4378665208816528, |
| "num_tokens": 6338625.0, |
| "mean_token_accuracy": 0.6723957479000091, |
| "epoch": 0.4562101077744062, |
| "step": 1090 |
| }, |
| { |
| "loss": 1.4475428581237793, |
| "grad_norm": 0.10479287803173065, |
| "learning_rate": 9.607422839785487e-07, |
| "entropy": 1.4612567931413651, |
| "num_tokens": 6398522.0, |
| "mean_token_accuracy": 0.6564841374754906, |
| "epoch": 0.46039552160719893, |
| "step": 1100 |
| }, |
| { |
| "loss": 1.4175043106079102, |
| "grad_norm": 0.10358787328004837, |
| "learning_rate": 9.598602185194733e-07, |
| "entropy": 1.4467926740646362, |
| "num_tokens": 6458089.0, |
| "mean_token_accuracy": 0.669213418662548, |
| "epoch": 0.4645809354399916, |
| "step": 1110 |
| }, |
| { |
| "loss": 1.4430898666381835, |
| "grad_norm": 0.12753859162330627, |
| "learning_rate": 9.589687676022933e-07, |
| "entropy": 1.4614018350839615, |
| "num_tokens": 6517387.0, |
| "mean_token_accuracy": 0.663593128323555, |
| "epoch": 0.4687663492727843, |
| "step": 1120 |
| }, |
| { |
| "loss": 1.4554133415222168, |
| "grad_norm": 0.11801481246948242, |
| "learning_rate": 9.580679494209621e-07, |
| "entropy": 1.463664811849594, |
| "num_tokens": 6574281.0, |
| "mean_token_accuracy": 0.6585227012634277, |
| "epoch": 0.47295176310557707, |
| "step": 1130 |
| }, |
| { |
| "loss": 1.4516281127929687, |
| "grad_norm": 0.1230725646018982, |
| "learning_rate": 9.57157782360612e-07, |
| "entropy": 1.4588176727294921, |
| "num_tokens": 6632526.0, |
| "mean_token_accuracy": 0.6620682567358017, |
| "epoch": 0.47713717693836977, |
| "step": 1140 |
| }, |
| { |
| "loss": 1.3834566116333007, |
| "grad_norm": 0.10615360736846924, |
| "learning_rate": 9.562382849971814e-07, |
| "entropy": 1.4231864005327224, |
| "num_tokens": 6686576.0, |
| "mean_token_accuracy": 0.6769091472029686, |
| "epoch": 0.4813225907711625, |
| "step": 1150 |
| }, |
| { |
| "loss": 1.3678070068359376, |
| "grad_norm": 0.10580965131521225, |
| "learning_rate": 9.553094760970338e-07, |
| "entropy": 1.4144569963216782, |
| "num_tokens": 6743418.0, |
| "mean_token_accuracy": 0.6736478328704834, |
| "epoch": 0.4855080046039552, |
| "step": 1160 |
| }, |
| { |
| "loss": 1.4649283409118652, |
| "grad_norm": 0.11393830180168152, |
| "learning_rate": 9.543713746165746e-07, |
| "entropy": 1.461512914299965, |
| "num_tokens": 6801169.0, |
| "mean_token_accuracy": 0.6581070765852928, |
| "epoch": 0.4896934184367479, |
| "step": 1170 |
| }, |
| { |
| "loss": 1.3680376052856444, |
| "grad_norm": 0.19611844420433044, |
| "learning_rate": 9.534239997018663e-07, |
| "entropy": 1.4197842329740524, |
| "num_tokens": 6858807.0, |
| "mean_token_accuracy": 0.6744951158761978, |
| "epoch": 0.49387883226954066, |
| "step": 1180 |
| }, |
| { |
| "loss": 1.4589731216430664, |
| "grad_norm": 0.12470986694097519, |
| "learning_rate": 9.52467370688235e-07, |
| "entropy": 1.4711190968751908, |
| "num_tokens": 6915842.0, |
| "mean_token_accuracy": 0.6595605373382568, |
| "epoch": 0.49806424610233335, |
| "step": 1190 |
| }, |
| { |
| "loss": 1.3511184692382812, |
| "grad_norm": 0.1231166198849678, |
| "learning_rate": 9.515015070998781e-07, |
| "entropy": 1.3929312020540237, |
| "num_tokens": 6973364.0, |
| "mean_token_accuracy": 0.6785273075103759, |
| "epoch": 0.502249659935126, |
| "step": 1200 |
| }, |
| { |
| "eval_loss": 1.4083536863327026, |
| "eval_runtime": 43.0534, |
| "eval_samples_per_second": 147.979, |
| "eval_steps_per_second": 6.178, |
| "eval_entropy": 1.401145983907513, |
| "eval_num_tokens": 6973364.0, |
| "eval_mean_token_accuracy": 0.6672393374873283, |
| "epoch": 0.502249659935126, |
| "step": 1200 |
| }, |
| { |
| "loss": 1.4147989273071289, |
| "grad_norm": 0.10981585085391998, |
| "learning_rate": 9.505264286494644e-07, |
| "entropy": 1.4393782436847686, |
| "num_tokens": 7029183.0, |
| "mean_token_accuracy": 0.6653257578611373, |
| "epoch": 0.5064350737679189, |
| "step": 1210 |
| }, |
| { |
| "loss": 1.4123595237731934, |
| "grad_norm": 0.12332361936569214, |
| "learning_rate": 9.495421552377325e-07, |
| "entropy": 1.4351352035999299, |
| "num_tokens": 7089107.0, |
| "mean_token_accuracy": 0.6679085582494736, |
| "epoch": 0.5106204876007115, |
| "step": 1220 |
| }, |
| { |
| "loss": 1.35689115524292, |
| "grad_norm": 0.10939253121614456, |
| "learning_rate": 9.485487069530841e-07, |
| "entropy": 1.384123608469963, |
| "num_tokens": 7145731.0, |
| "mean_token_accuracy": 0.6764253750443459, |
| "epoch": 0.5148059014335042, |
| "step": 1230 |
| }, |
| { |
| "loss": 1.4721358299255372, |
| "grad_norm": 0.1354241967201233, |
| "learning_rate": 9.475461040711745e-07, |
| "entropy": 1.4555100411176682, |
| "num_tokens": 7201497.0, |
| "mean_token_accuracy": 0.6551220327615738, |
| "epoch": 0.5189913152662969, |
| "step": 1240 |
| }, |
| { |
| "loss": 1.406270408630371, |
| "grad_norm": 0.11071319878101349, |
| "learning_rate": 9.465343670544987e-07, |
| "entropy": 1.446416699886322, |
| "num_tokens": 7255249.0, |
| "mean_token_accuracy": 0.6669346168637276, |
| "epoch": 0.5231767290990896, |
| "step": 1250 |
| }, |
| { |
| "loss": 1.409125804901123, |
| "grad_norm": 0.1242227554321289, |
| "learning_rate": 9.455135165519734e-07, |
| "entropy": 1.4336748003959656, |
| "num_tokens": 7312069.0, |
| "mean_token_accuracy": 0.6685505136847496, |
| "epoch": 0.5273621429318824, |
| "step": 1260 |
| }, |
| { |
| "loss": 1.353925609588623, |
| "grad_norm": 0.12051878869533539, |
| "learning_rate": 9.444835733985157e-07, |
| "entropy": 1.3861510157585144, |
| "num_tokens": 7374935.0, |
| "mean_token_accuracy": 0.6735975816845894, |
| "epoch": 0.5315475567646751, |
| "step": 1270 |
| }, |
| { |
| "loss": 1.3926225662231446, |
| "grad_norm": 0.1231522411108017, |
| "learning_rate": 9.434445586146182e-07, |
| "entropy": 1.431991320848465, |
| "num_tokens": 7429456.0, |
| "mean_token_accuracy": 0.6716481134295463, |
| "epoch": 0.5357329705974678, |
| "step": 1280 |
| }, |
| { |
| "loss": 1.3677814483642579, |
| "grad_norm": 0.10811372101306915, |
| "learning_rate": 9.423964934059202e-07, |
| "entropy": 1.4019683420658111, |
| "num_tokens": 7487005.0, |
| "mean_token_accuracy": 0.6747205436229706, |
| "epoch": 0.5399183844302605, |
| "step": 1290 |
| }, |
| { |
| "loss": 1.3889549255371094, |
| "grad_norm": 0.12505528330802917, |
| "learning_rate": 9.413393991627736e-07, |
| "entropy": 1.3941765069961547, |
| "num_tokens": 7547594.0, |
| "mean_token_accuracy": 0.6716236621141434, |
| "epoch": 0.5441037982630532, |
| "step": 1300 |
| }, |
| { |
| "loss": 1.388343048095703, |
| "grad_norm": 0.11002212017774582, |
| "learning_rate": 9.40273297459808e-07, |
| "entropy": 1.4113761156797409, |
| "num_tokens": 7605828.0, |
| "mean_token_accuracy": 0.6661069095134735, |
| "epoch": 0.548289212095846, |
| "step": 1310 |
| }, |
| { |
| "loss": 1.3891004562377929, |
| "grad_norm": 0.14147064089775085, |
| "learning_rate": 9.391982100554889e-07, |
| "entropy": 1.4317275822162627, |
| "num_tokens": 7661455.0, |
| "mean_token_accuracy": 0.6669554397463798, |
| "epoch": 0.5524746259286387, |
| "step": 1320 |
| }, |
| { |
| "loss": 1.3904253959655761, |
| "grad_norm": 0.13139671087265015, |
| "learning_rate": 9.38114158891675e-07, |
| "entropy": 1.4096351087093353, |
| "num_tokens": 7719091.0, |
| "mean_token_accuracy": 0.671739687025547, |
| "epoch": 0.5566600397614314, |
| "step": 1330 |
| }, |
| { |
| "loss": 1.463707733154297, |
| "grad_norm": 0.09927231818437576, |
| "learning_rate": 9.370211660931693e-07, |
| "entropy": 1.4864629238843918, |
| "num_tokens": 7774511.0, |
| "mean_token_accuracy": 0.660004960000515, |
| "epoch": 0.5608454535942241, |
| "step": 1340 |
| }, |
| { |
| "loss": 1.3764376640319824, |
| "grad_norm": 0.11545363068580627, |
| "learning_rate": 9.35919253967268e-07, |
| "entropy": 1.3998028621077538, |
| "num_tokens": 7836251.0, |
| "mean_token_accuracy": 0.6720214635133743, |
| "epoch": 0.5650308674270168, |
| "step": 1350 |
| }, |
| { |
| "loss": 1.3152969360351563, |
| "grad_norm": 0.1053733229637146, |
| "learning_rate": 9.348084450033051e-07, |
| "entropy": 1.3938700079917907, |
| "num_tokens": 7893911.0, |
| "mean_token_accuracy": 0.6841806307435035, |
| "epoch": 0.5692162812598096, |
| "step": 1360 |
| }, |
| { |
| "loss": 1.422788143157959, |
| "grad_norm": 0.09823399037122726, |
| "learning_rate": 9.336887618721938e-07, |
| "entropy": 1.445565864443779, |
| "num_tokens": 7949863.0, |
| "mean_token_accuracy": 0.6624092936515809, |
| "epoch": 0.5734016950926023, |
| "step": 1370 |
| }, |
| { |
| "loss": 1.3210840225219727, |
| "grad_norm": 0.1335407942533493, |
| "learning_rate": 9.325602274259629e-07, |
| "entropy": 1.3757253885269165, |
| "num_tokens": 8008384.0, |
| "mean_token_accuracy": 0.6824934765696525, |
| "epoch": 0.577587108925395, |
| "step": 1380 |
| }, |
| { |
| "loss": 1.397932243347168, |
| "grad_norm": 0.09968513995409012, |
| "learning_rate": 9.314228646972919e-07, |
| "entropy": 1.4251334190368652, |
| "num_tokens": 8067031.0, |
| "mean_token_accuracy": 0.666124664247036, |
| "epoch": 0.5817725227581877, |
| "step": 1390 |
| }, |
| { |
| "loss": 1.312647533416748, |
| "grad_norm": 0.12575951218605042, |
| "learning_rate": 9.302766968990387e-07, |
| "entropy": 1.355531930923462, |
| "num_tokens": 8126287.0, |
| "mean_token_accuracy": 0.6826214835047721, |
| "epoch": 0.5859579365909804, |
| "step": 1400 |
| }, |
| { |
| "eval_loss": 1.386446237564087, |
| "eval_runtime": 42.6243, |
| "eval_samples_per_second": 149.469, |
| "eval_steps_per_second": 6.241, |
| "eval_entropy": 1.4034466080199508, |
| "eval_num_tokens": 8126287.0, |
| "eval_mean_token_accuracy": 0.6737931832335049, |
| "epoch": 0.5859579365909804, |
| "step": 1400 |
| }, |
| { |
| "loss": 1.38052396774292, |
| "grad_norm": 0.13619256019592285, |
| "learning_rate": 9.291217474237685e-07, |
| "entropy": 1.404805138707161, |
| "num_tokens": 8184847.0, |
| "mean_token_accuracy": 0.6700320944190026, |
| "epoch": 0.5901433504237732, |
| "step": 1410 |
| }, |
| { |
| "loss": 1.4232772827148437, |
| "grad_norm": 0.12265791743993759, |
| "learning_rate": 9.27958039843274e-07, |
| "entropy": 1.4586470276117325, |
| "num_tokens": 8243143.0, |
| "mean_token_accuracy": 0.6625824689865112, |
| "epoch": 0.5943287642565659, |
| "step": 1420 |
| }, |
| { |
| "loss": 1.3759157180786132, |
| "grad_norm": 0.12311021983623505, |
| "learning_rate": 9.267855979080959e-07, |
| "entropy": 1.4208383083343505, |
| "num_tokens": 8301096.0, |
| "mean_token_accuracy": 0.6705714225769043, |
| "epoch": 0.5985141780893586, |
| "step": 1430 |
| }, |
| { |
| "loss": 1.4408933639526367, |
| "grad_norm": 0.10979989171028137, |
| "learning_rate": 9.256044455470372e-07, |
| "entropy": 1.4562449276447296, |
| "num_tokens": 8357561.0, |
| "mean_token_accuracy": 0.6647118896245956, |
| "epoch": 0.6026995919221513, |
| "step": 1440 |
| }, |
| { |
| "loss": 1.4200193405151367, |
| "grad_norm": 0.10581167787313461, |
| "learning_rate": 9.244146068666756e-07, |
| "entropy": 1.4489133656024933, |
| "num_tokens": 8411021.0, |
| "mean_token_accuracy": 0.6702521324157715, |
| "epoch": 0.606885005754944, |
| "step": 1450 |
| }, |
| { |
| "loss": 1.3639183044433594, |
| "grad_norm": 0.12785717844963074, |
| "learning_rate": 9.232161061508707e-07, |
| "entropy": 1.3970074653625488, |
| "num_tokens": 8473715.0, |
| "mean_token_accuracy": 0.6738650560379028, |
| "epoch": 0.6110704195877368, |
| "step": 1460 |
| }, |
| { |
| "loss": 1.3117795944213868, |
| "grad_norm": 0.11914683878421783, |
| "learning_rate": 9.220089678602692e-07, |
| "entropy": 1.3731692731380463, |
| "num_tokens": 8536821.0, |
| "mean_token_accuracy": 0.6784457266330719, |
| "epoch": 0.6152558334205295, |
| "step": 1470 |
| }, |
| { |
| "loss": 1.3580459594726562, |
| "grad_norm": 0.10762108862400055, |
| "learning_rate": 9.20793216631805e-07, |
| "entropy": 1.3978804230690003, |
| "num_tokens": 8596217.0, |
| "mean_token_accuracy": 0.6741666734218598, |
| "epoch": 0.6194412472533222, |
| "step": 1480 |
| }, |
| { |
| "loss": 1.3858207702636718, |
| "grad_norm": 0.13189709186553955, |
| "learning_rate": 9.195688772781969e-07, |
| "entropy": 1.4172445833683014, |
| "num_tokens": 8649547.0, |
| "mean_token_accuracy": 0.6702063709497452, |
| "epoch": 0.6236266610861149, |
| "step": 1490 |
| }, |
| { |
| "loss": 1.3870158195495605, |
| "grad_norm": 0.13120818138122559, |
| "learning_rate": 9.183359747874416e-07, |
| "entropy": 1.424094271659851, |
| "num_tokens": 8704916.0, |
| "mean_token_accuracy": 0.669642123579979, |
| "epoch": 0.6278120749189076, |
| "step": 1500 |
| }, |
| { |
| "loss": 1.4398550033569335, |
| "grad_norm": 0.12010879069566727, |
| "learning_rate": 9.170945343223045e-07, |
| "entropy": 1.4305728733539582, |
| "num_tokens": 8760259.0, |
| "mean_token_accuracy": 0.6612218707799912, |
| "epoch": 0.6319974887517004, |
| "step": 1510 |
| }, |
| { |
| "loss": 1.3878154754638672, |
| "grad_norm": 0.1339423507452011, |
| "learning_rate": 9.15844581219805e-07, |
| "entropy": 1.3878618061542511, |
| "num_tokens": 8816700.0, |
| "mean_token_accuracy": 0.6718688145279884, |
| "epoch": 0.6361829025844931, |
| "step": 1520 |
| }, |
| { |
| "loss": 1.3522814750671386, |
| "grad_norm": 0.13170458376407623, |
| "learning_rate": 9.145861409907009e-07, |
| "entropy": 1.3895842641592027, |
| "num_tokens": 8876509.0, |
| "mean_token_accuracy": 0.6753421723842621, |
| "epoch": 0.6403683164172858, |
| "step": 1530 |
| }, |
| { |
| "loss": 1.3812095642089843, |
| "grad_norm": 0.1139625683426857, |
| "learning_rate": 9.133192393189664e-07, |
| "entropy": 1.4209527760744094, |
| "num_tokens": 8936438.0, |
| "mean_token_accuracy": 0.6720142468810082, |
| "epoch": 0.6445537302500784, |
| "step": 1540 |
| }, |
| { |
| "loss": 1.4154645919799804, |
| "grad_norm": 0.13268420100212097, |
| "learning_rate": 9.120439020612685e-07, |
| "entropy": 1.424301978945732, |
| "num_tokens": 8994731.0, |
| "mean_token_accuracy": 0.6668044954538346, |
| "epoch": 0.6487391440828711, |
| "step": 1550 |
| }, |
| { |
| "loss": 1.3785716056823731, |
| "grad_norm": 0.11167196929454803, |
| "learning_rate": 9.107601552464393e-07, |
| "entropy": 1.3881200447678566, |
| "num_tokens": 9052527.0, |
| "mean_token_accuracy": 0.6731992438435555, |
| "epoch": 0.652924557915664, |
| "step": 1560 |
| }, |
| { |
| "loss": 1.3963075637817384, |
| "grad_norm": 0.1282496154308319, |
| "learning_rate": 9.094680250749447e-07, |
| "entropy": 1.408314546942711, |
| "num_tokens": 9111578.0, |
| "mean_token_accuracy": 0.6680608317255974, |
| "epoch": 0.6571099717484566, |
| "step": 1570 |
| }, |
| { |
| "loss": 1.3251177787780761, |
| "grad_norm": 0.12457749992609024, |
| "learning_rate": 9.081675379183494e-07, |
| "entropy": 1.3645547151565551, |
| "num_tokens": 9171878.0, |
| "mean_token_accuracy": 0.6805019825696945, |
| "epoch": 0.6612953855812493, |
| "step": 1580 |
| }, |
| { |
| "loss": 1.3337480545043945, |
| "grad_norm": 0.10987865179777145, |
| "learning_rate": 9.068587203187794e-07, |
| "entropy": 1.3783577740192414, |
| "num_tokens": 9231431.0, |
| "mean_token_accuracy": 0.6761843442916871, |
| "epoch": 0.665480799414042, |
| "step": 1590 |
| }, |
| { |
| "loss": 1.3129050254821777, |
| "grad_norm": 0.11137118935585022, |
| "learning_rate": 9.055415989883792e-07, |
| "entropy": 1.3690737694501878, |
| "num_tokens": 9287759.0, |
| "mean_token_accuracy": 0.6817014619708062, |
| "epoch": 0.6696662132468347, |
| "step": 1600 |
| }, |
| { |
| "eval_loss": 1.3662420511245728, |
| "eval_runtime": 43.7555, |
| "eval_samples_per_second": 145.605, |
| "eval_steps_per_second": 6.079, |
| "eval_entropy": 1.3844989090037525, |
| "eval_num_tokens": 9287759.0, |
| "eval_mean_token_accuracy": 0.6762344077565616, |
| "epoch": 0.6696662132468347, |
| "step": 1600 |
| }, |
| { |
| "loss": 1.3789652824401855, |
| "grad_norm": 0.11303029209375381, |
| "learning_rate": 9.042162008087678e-07, |
| "entropy": 1.388508751988411, |
| "num_tokens": 9347815.0, |
| "mean_token_accuracy": 0.671443772315979, |
| "epoch": 0.6738516270796275, |
| "step": 1610 |
| }, |
| { |
| "loss": 1.3409759521484375, |
| "grad_norm": 0.12162081152200699, |
| "learning_rate": 9.028825528304891e-07, |
| "entropy": 1.3988509953022004, |
| "num_tokens": 9404534.0, |
| "mean_token_accuracy": 0.6778050258755683, |
| "epoch": 0.6780370409124202, |
| "step": 1620 |
| }, |
| { |
| "loss": 1.286928367614746, |
| "grad_norm": 0.1191353127360344, |
| "learning_rate": 9.015406822724603e-07, |
| "entropy": 1.3400784492492677, |
| "num_tokens": 9465006.0, |
| "mean_token_accuracy": 0.6883344247937202, |
| "epoch": 0.6822224547452129, |
| "step": 1630 |
| }, |
| { |
| "loss": 1.3931745529174804, |
| "grad_norm": 0.09988338500261307, |
| "learning_rate": 9.001906165214163e-07, |
| "entropy": 1.4158646211028099, |
| "num_tokens": 9523244.0, |
| "mean_token_accuracy": 0.6664687514305114, |
| "epoch": 0.6864078685780056, |
| "step": 1640 |
| }, |
| { |
| "loss": 1.3149008750915527, |
| "grad_norm": 0.1224365308880806, |
| "learning_rate": 8.988323831313509e-07, |
| "entropy": 1.3621025055646896, |
| "num_tokens": 9583571.0, |
| "mean_token_accuracy": 0.6805920660495758, |
| "epoch": 0.6905932824107983, |
| "step": 1650 |
| }, |
| { |
| "loss": 1.3128664016723632, |
| "grad_norm": 0.10845732688903809, |
| "learning_rate": 8.974660098229538e-07, |
| "entropy": 1.366037741303444, |
| "num_tokens": 9640353.0, |
| "mean_token_accuracy": 0.6822919920086861, |
| "epoch": 0.6947786962435911, |
| "step": 1660 |
| }, |
| { |
| "loss": 1.3836250305175781, |
| "grad_norm": 0.12312953174114227, |
| "learning_rate": 8.960915244830462e-07, |
| "entropy": 1.4012254863977431, |
| "num_tokens": 9701108.0, |
| "mean_token_accuracy": 0.6682980388402939, |
| "epoch": 0.6989641100763838, |
| "step": 1670 |
| }, |
| { |
| "loss": 1.298573875427246, |
| "grad_norm": 0.10932071506977081, |
| "learning_rate": 8.947089551640099e-07, |
| "entropy": 1.351333498954773, |
| "num_tokens": 9758477.0, |
| "mean_token_accuracy": 0.6857402086257934, |
| "epoch": 0.7031495239091765, |
| "step": 1680 |
| }, |
| { |
| "loss": 1.3268583297729493, |
| "grad_norm": 0.1166784018278122, |
| "learning_rate": 8.933183300832159e-07, |
| "entropy": 1.3652890086174012, |
| "num_tokens": 9816530.0, |
| "mean_token_accuracy": 0.6774859979748726, |
| "epoch": 0.7073349377419692, |
| "step": 1690 |
| }, |
| { |
| "loss": 1.37611722946167, |
| "grad_norm": 0.1278134286403656, |
| "learning_rate": 8.919196776224483e-07, |
| "entropy": 1.399143072962761, |
| "num_tokens": 9872452.0, |
| "mean_token_accuracy": 0.6704028770327568, |
| "epoch": 0.7115203515747619, |
| "step": 1700 |
| }, |
| { |
| "loss": 1.3107229232788087, |
| "grad_norm": 0.12152674794197083, |
| "learning_rate": 8.905130263273252e-07, |
| "entropy": 1.3753829419612884, |
| "num_tokens": 9934101.0, |
| "mean_token_accuracy": 0.68070268034935, |
| "epoch": 0.7157057654075547, |
| "step": 1710 |
| }, |
| { |
| "loss": 1.3585830688476563, |
| "grad_norm": 0.12099979817867279, |
| "learning_rate": 8.890984049067154e-07, |
| "entropy": 1.3618301630020142, |
| "num_tokens": 9993614.0, |
| "mean_token_accuracy": 0.6762332633137703, |
| "epoch": 0.7198911792403474, |
| "step": 1720 |
| }, |
| { |
| "loss": 1.302845287322998, |
| "grad_norm": 0.11998716741800308, |
| "learning_rate": 8.876758422321534e-07, |
| "entropy": 1.356363880634308, |
| "num_tokens": 10047945.0, |
| "mean_token_accuracy": 0.6853278845548629, |
| "epoch": 0.7240765930731401, |
| "step": 1730 |
| }, |
| { |
| "loss": 1.3057265281677246, |
| "grad_norm": 0.11447525024414062, |
| "learning_rate": 8.862453673372495e-07, |
| "entropy": 1.3511420711874962, |
| "num_tokens": 10105849.0, |
| "mean_token_accuracy": 0.6814648106694221, |
| "epoch": 0.7282620069059328, |
| "step": 1740 |
| }, |
| { |
| "loss": 1.379593563079834, |
| "grad_norm": 0.13615551590919495, |
| "learning_rate": 8.848070094170972e-07, |
| "entropy": 1.4266703605651856, |
| "num_tokens": 10160689.0, |
| "mean_token_accuracy": 0.6730331972241401, |
| "epoch": 0.7324474207387256, |
| "step": 1750 |
| }, |
| { |
| "loss": 1.3482324600219726, |
| "grad_norm": 0.1049669086933136, |
| "learning_rate": 8.833607978276782e-07, |
| "entropy": 1.365234938263893, |
| "num_tokens": 10219317.0, |
| "mean_token_accuracy": 0.6763183102011681, |
| "epoch": 0.7366328345715183, |
| "step": 1760 |
| }, |
| { |
| "loss": 1.308854579925537, |
| "grad_norm": 0.11895614117383957, |
| "learning_rate": 8.819067620852621e-07, |
| "entropy": 1.3593208014965057, |
| "num_tokens": 10281133.0, |
| "mean_token_accuracy": 0.6821026623249054, |
| "epoch": 0.740818248404311, |
| "step": 1770 |
| }, |
| { |
| "loss": 1.3750693321228027, |
| "grad_norm": 0.13367140293121338, |
| "learning_rate": 8.804449318658047e-07, |
| "entropy": 1.391082948446274, |
| "num_tokens": 10338588.0, |
| "mean_token_accuracy": 0.6708121821284294, |
| "epoch": 0.7450036622371037, |
| "step": 1780 |
| }, |
| { |
| "loss": 1.3176989555358887, |
| "grad_norm": 0.10955236107110977, |
| "learning_rate": 8.789753370043425e-07, |
| "entropy": 1.373031947016716, |
| "num_tokens": 10398744.0, |
| "mean_token_accuracy": 0.6810923710465431, |
| "epoch": 0.7491890760698964, |
| "step": 1790 |
| }, |
| { |
| "loss": 1.3639984130859375, |
| "grad_norm": 0.12343617528676987, |
| "learning_rate": 8.77498007494383e-07, |
| "entropy": 1.4000030606985092, |
| "num_tokens": 10458537.0, |
| "mean_token_accuracy": 0.6697928130626678, |
| "epoch": 0.7533744899026892, |
| "step": 1800 |
| }, |
| { |
| "eval_loss": 1.350634217262268, |
| "eval_runtime": 42.5241, |
| "eval_samples_per_second": 149.821, |
| "eval_steps_per_second": 6.255, |
| "eval_entropy": 1.3988571140102875, |
| "eval_num_tokens": 10458537.0, |
| "eval_mean_token_accuracy": 0.6771848898633082, |
| "epoch": 0.7533744899026892, |
| "step": 1800 |
| }, |
| { |
| "loss": 1.3396940231323242, |
| "grad_norm": 0.14190584421157837, |
| "learning_rate": 8.760129734872932e-07, |
| "entropy": 1.3851164013147355, |
| "num_tokens": 10516646.0, |
| "mean_token_accuracy": 0.6750243782997132, |
| "epoch": 0.7575599037354819, |
| "step": 1810 |
| }, |
| { |
| "loss": 1.3640681266784669, |
| "grad_norm": 0.11394577473402023, |
| "learning_rate": 8.745202652916841e-07, |
| "entropy": 1.400177638232708, |
| "num_tokens": 10576044.0, |
| "mean_token_accuracy": 0.6688720732927322, |
| "epoch": 0.7617453175682746, |
| "step": 1820 |
| }, |
| { |
| "loss": 1.4145827293395996, |
| "grad_norm": 0.1021205335855484, |
| "learning_rate": 8.73019913372792e-07, |
| "entropy": 1.4293284267187119, |
| "num_tokens": 10635490.0, |
| "mean_token_accuracy": 0.664357790350914, |
| "epoch": 0.7659307314010673, |
| "step": 1830 |
| }, |
| { |
| "loss": 1.3291969299316406, |
| "grad_norm": 0.104949451982975, |
| "learning_rate": 8.715119483518568e-07, |
| "entropy": 1.392353293299675, |
| "num_tokens": 10696235.0, |
| "mean_token_accuracy": 0.6753359526395798, |
| "epoch": 0.77011614523386, |
| "step": 1840 |
| }, |
| { |
| "loss": 1.3674373626708984, |
| "grad_norm": 0.13051320612430573, |
| "learning_rate": 8.699964010054972e-07, |
| "entropy": 1.3989370226860047, |
| "num_tokens": 10756113.0, |
| "mean_token_accuracy": 0.6702560499310494, |
| "epoch": 0.7743015590666528, |
| "step": 1850 |
| }, |
| { |
| "loss": 1.3329706192016602, |
| "grad_norm": 0.11483673751354218, |
| "learning_rate": 8.684733022650819e-07, |
| "entropy": 1.368683397769928, |
| "num_tokens": 10811097.0, |
| "mean_token_accuracy": 0.6795622929930687, |
| "epoch": 0.7784869728994455, |
| "step": 1860 |
| }, |
| { |
| "loss": 1.34647216796875, |
| "grad_norm": 0.12257901579141617, |
| "learning_rate": 8.669426832160995e-07, |
| "entropy": 1.3777292981743812, |
| "num_tokens": 10869645.0, |
| "mean_token_accuracy": 0.6771846890449524, |
| "epoch": 0.7826723867322382, |
| "step": 1870 |
| }, |
| { |
| "loss": 1.2678668022155761, |
| "grad_norm": 0.10710500180721283, |
| "learning_rate": 8.65404575097523e-07, |
| "entropy": 1.319590486586094, |
| "num_tokens": 10929506.0, |
| "mean_token_accuracy": 0.6875983402132988, |
| "epoch": 0.7868578005650309, |
| "step": 1880 |
| }, |
| { |
| "loss": 1.364974021911621, |
| "grad_norm": 0.11756409704685211, |
| "learning_rate": 8.638590093011722e-07, |
| "entropy": 1.400401759147644, |
| "num_tokens": 10984931.0, |
| "mean_token_accuracy": 0.67054513245821, |
| "epoch": 0.7910432143978235, |
| "step": 1890 |
| }, |
| { |
| "loss": 1.3333361625671387, |
| "grad_norm": 0.13867364823818207, |
| "learning_rate": 8.623060173710743e-07, |
| "entropy": 1.369761797785759, |
| "num_tokens": 11040065.0, |
| "mean_token_accuracy": 0.6754815384745598, |
| "epoch": 0.7952286282306164, |
| "step": 1900 |
| }, |
| { |
| "loss": 1.2708181381225585, |
| "grad_norm": 0.10772886127233505, |
| "learning_rate": 8.607456310028185e-07, |
| "entropy": 1.3362341210246087, |
| "num_tokens": 11101320.0, |
| "mean_token_accuracy": 0.6911322221159935, |
| "epoch": 0.799414042063409, |
| "step": 1910 |
| }, |
| { |
| "loss": 1.3482179641723633, |
| "grad_norm": 0.13811437785625458, |
| "learning_rate": 8.591778820429104e-07, |
| "entropy": 1.3786241382360458, |
| "num_tokens": 11159403.0, |
| "mean_token_accuracy": 0.676637114584446, |
| "epoch": 0.8035994558962017, |
| "step": 1920 |
| }, |
| { |
| "loss": 1.3001495361328126, |
| "grad_norm": 0.11261286586523056, |
| "learning_rate": 8.576028024881208e-07, |
| "entropy": 1.342548942565918, |
| "num_tokens": 11215300.0, |
| "mean_token_accuracy": 0.6838564172387123, |
| "epoch": 0.8077848697289944, |
| "step": 1930 |
| }, |
| { |
| "loss": 1.37919921875, |
| "grad_norm": 0.11729196459054947, |
| "learning_rate": 8.560204244848339e-07, |
| "entropy": 1.399843516945839, |
| "num_tokens": 11274016.0, |
| "mean_token_accuracy": 0.6692644655704498, |
| "epoch": 0.8119702835617871, |
| "step": 1940 |
| }, |
| { |
| "loss": 1.3153133392333984, |
| "grad_norm": 0.11101414263248444, |
| "learning_rate": 8.544307803283903e-07, |
| "entropy": 1.3550761044025421, |
| "num_tokens": 11331840.0, |
| "mean_token_accuracy": 0.6830023691058159, |
| "epoch": 0.8161556973945799, |
| "step": 1950 |
| }, |
| { |
| "loss": 1.3670063018798828, |
| "grad_norm": 0.10791585594415665, |
| "learning_rate": 8.528339024624287e-07, |
| "entropy": 1.3926001816987992, |
| "num_tokens": 11388250.0, |
| "mean_token_accuracy": 0.6758360341191292, |
| "epoch": 0.8203411112273726, |
| "step": 1960 |
| }, |
| { |
| "loss": 1.322612190246582, |
| "grad_norm": 0.12179048359394073, |
| "learning_rate": 8.512298234782227e-07, |
| "entropy": 1.3523173958063126, |
| "num_tokens": 11444623.0, |
| "mean_token_accuracy": 0.6819486439228057, |
| "epoch": 0.8245265250601653, |
| "step": 1970 |
| }, |
| { |
| "loss": 1.4050199508666992, |
| "grad_norm": 0.1269518882036209, |
| "learning_rate": 8.496185761140165e-07, |
| "entropy": 1.4183282285928727, |
| "num_tokens": 11501456.0, |
| "mean_token_accuracy": 0.6670055955648422, |
| "epoch": 0.828711938892958, |
| "step": 1980 |
| }, |
| { |
| "loss": 1.3573097229003905, |
| "grad_norm": 0.09794802963733673, |
| "learning_rate": 8.480001932543561e-07, |
| "entropy": 1.3888707369565965, |
| "num_tokens": 11562134.0, |
| "mean_token_accuracy": 0.6723511442542076, |
| "epoch": 0.8328973527257507, |
| "step": 1990 |
| }, |
| { |
| "loss": 1.3131244659423829, |
| "grad_norm": 0.12277819216251373, |
| "learning_rate": 8.463747079294192e-07, |
| "entropy": 1.3465208828449249, |
| "num_tokens": 11618831.0, |
| "mean_token_accuracy": 0.6795975625514984, |
| "epoch": 0.8370827665585435, |
| "step": 2000 |
| }, |
| { |
| "eval_loss": 1.3373528718948364, |
| "eval_runtime": 42.4003, |
| "eval_samples_per_second": 150.258, |
| "eval_steps_per_second": 6.274, |
| "eval_entropy": 1.3585354107663148, |
| "eval_num_tokens": 11618831.0, |
| "eval_mean_token_accuracy": 0.678231207947982, |
| "epoch": 0.8370827665585435, |
| "step": 2000 |
| }, |
| { |
| "loss": 1.3982874870300293, |
| "grad_norm": 0.13690534234046936, |
| "learning_rate": 8.447421533143396e-07, |
| "entropy": 1.4036804780364036, |
| "num_tokens": 11676394.0, |
| "mean_token_accuracy": 0.6648698434233665, |
| "epoch": 0.8412681803913362, |
| "step": 2010 |
| }, |
| { |
| "loss": 1.321161937713623, |
| "grad_norm": 0.16348762810230255, |
| "learning_rate": 8.431025627285313e-07, |
| "entropy": 1.349110186100006, |
| "num_tokens": 11730143.0, |
| "mean_token_accuracy": 0.6850418791174888, |
| "epoch": 0.8454535942241289, |
| "step": 2020 |
| }, |
| { |
| "loss": 1.337346076965332, |
| "grad_norm": 0.12358900159597397, |
| "learning_rate": 8.414559696350078e-07, |
| "entropy": 1.3770191550254822, |
| "num_tokens": 11786616.0, |
| "mean_token_accuracy": 0.6773856431245804, |
| "epoch": 0.8496390080569216, |
| "step": 2030 |
| }, |
| { |
| "loss": 1.341224193572998, |
| "grad_norm": 0.11463375389575958, |
| "learning_rate": 8.398024076396996e-07, |
| "entropy": 1.345393455028534, |
| "num_tokens": 11845477.0, |
| "mean_token_accuracy": 0.6754840731620788, |
| "epoch": 0.8538244218897143, |
| "step": 2040 |
| }, |
| { |
| "loss": 1.3237956047058106, |
| "grad_norm": 0.12505337595939636, |
| "learning_rate": 8.381419104907681e-07, |
| "entropy": 1.3643497437238694, |
| "num_tokens": 11901746.0, |
| "mean_token_accuracy": 0.6791232407093049, |
| "epoch": 0.8580098357225071, |
| "step": 2050 |
| }, |
| { |
| "loss": 1.3346891403198242, |
| "grad_norm": 0.15036678314208984, |
| "learning_rate": 8.364745120779164e-07, |
| "entropy": 1.3704555958509446, |
| "num_tokens": 11959605.0, |
| "mean_token_accuracy": 0.6759614482522011, |
| "epoch": 0.8621952495552998, |
| "step": 2060 |
| }, |
| { |
| "loss": 1.4080591201782227, |
| "grad_norm": 0.14488154649734497, |
| "learning_rate": 8.348002464316987e-07, |
| "entropy": 1.4137721806764603, |
| "num_tokens": 12018839.0, |
| "mean_token_accuracy": 0.6624691441655159, |
| "epoch": 0.8663806633880925, |
| "step": 2070 |
| }, |
| { |
| "loss": 1.3576594352722169, |
| "grad_norm": 0.1306961327791214, |
| "learning_rate": 8.331191477228246e-07, |
| "entropy": 1.4100464552640914, |
| "num_tokens": 12077962.0, |
| "mean_token_accuracy": 0.6744375959038734, |
| "epoch": 0.8705660772208852, |
| "step": 2080 |
| }, |
| { |
| "loss": 1.3189333915710448, |
| "grad_norm": 0.09990637004375458, |
| "learning_rate": 8.314312502614625e-07, |
| "entropy": 1.3474989101290702, |
| "num_tokens": 12137755.0, |
| "mean_token_accuracy": 0.6803866818547248, |
| "epoch": 0.8747514910536779, |
| "step": 2090 |
| }, |
| { |
| "loss": 1.350827980041504, |
| "grad_norm": 0.1305275708436966, |
| "learning_rate": 8.29736588496539e-07, |
| "entropy": 1.384324887394905, |
| "num_tokens": 12194836.0, |
| "mean_token_accuracy": 0.6731877833604812, |
| "epoch": 0.8789369048864707, |
| "step": 2100 |
| }, |
| { |
| "loss": 1.3458109855651856, |
| "grad_norm": 0.12695269286632538, |
| "learning_rate": 8.280351970150358e-07, |
| "entropy": 1.3462085962295531, |
| "num_tokens": 12254568.0, |
| "mean_token_accuracy": 0.6745196804404259, |
| "epoch": 0.8831223187192634, |
| "step": 2110 |
| }, |
| { |
| "loss": 1.3157236099243164, |
| "grad_norm": 0.12223149091005325, |
| "learning_rate": 8.263271105412843e-07, |
| "entropy": 1.345698779821396, |
| "num_tokens": 12313266.0, |
| "mean_token_accuracy": 0.6800820276141166, |
| "epoch": 0.8873077325520561, |
| "step": 2120 |
| }, |
| { |
| "loss": 1.3625286102294922, |
| "grad_norm": 0.12075755000114441, |
| "learning_rate": 8.246123639362557e-07, |
| "entropy": 1.3751042202115058, |
| "num_tokens": 12368266.0, |
| "mean_token_accuracy": 0.6779290676116944, |
| "epoch": 0.8914931463848488, |
| "step": 2130 |
| }, |
| { |
| "loss": 1.3247100830078125, |
| "grad_norm": 0.13140852749347687, |
| "learning_rate": 8.22890992196851e-07, |
| "entropy": 1.3399439036846161, |
| "num_tokens": 12427195.0, |
| "mean_token_accuracy": 0.6778766274452209, |
| "epoch": 0.8956785602176415, |
| "step": 2140 |
| }, |
| { |
| "loss": 1.3223968505859376, |
| "grad_norm": 0.11262480914592743, |
| "learning_rate": 8.211630304551856e-07, |
| "entropy": 1.3523710697889328, |
| "num_tokens": 12481690.0, |
| "mean_token_accuracy": 0.6801952719688416, |
| "epoch": 0.8998639740504343, |
| "step": 2150 |
| }, |
| { |
| "loss": 1.2305709838867187, |
| "grad_norm": 0.1140614002943039, |
| "learning_rate": 8.194285139778727e-07, |
| "entropy": 1.2833492413163186, |
| "num_tokens": 12544082.0, |
| "mean_token_accuracy": 0.696322962641716, |
| "epoch": 0.904049387883227, |
| "step": 2160 |
| }, |
| { |
| "loss": 1.2584315299987794, |
| "grad_norm": 0.1213318482041359, |
| "learning_rate": 8.176874781653042e-07, |
| "entropy": 1.2917151510715486, |
| "num_tokens": 12605884.0, |
| "mean_token_accuracy": 0.6918731480836868, |
| "epoch": 0.9082348017160197, |
| "step": 2170 |
| }, |
| { |
| "loss": 1.4043787956237792, |
| "grad_norm": 0.11265023797750473, |
| "learning_rate": 8.159399585509271e-07, |
| "entropy": 1.4147561937570572, |
| "num_tokens": 12662340.0, |
| "mean_token_accuracy": 0.6659792140126228, |
| "epoch": 0.9124202155488124, |
| "step": 2180 |
| }, |
| { |
| "loss": 1.2520899772644043, |
| "grad_norm": 0.12448090314865112, |
| "learning_rate": 8.14185990800518e-07, |
| "entropy": 1.278790497779846, |
| "num_tokens": 12718634.0, |
| "mean_token_accuracy": 0.6945044815540313, |
| "epoch": 0.9166056293816051, |
| "step": 2190 |
| }, |
| { |
| "loss": 1.3444849967956543, |
| "grad_norm": 0.12102659791707993, |
| "learning_rate": 8.124256107114569e-07, |
| "entropy": 1.3645626872777938, |
| "num_tokens": 12774125.0, |
| "mean_token_accuracy": 0.6725556075572967, |
| "epoch": 0.9207910432143979, |
| "step": 2200 |
| }, |
| { |
| "eval_loss": 1.3292649984359741, |
| "eval_runtime": 42.3319, |
| "eval_samples_per_second": 150.501, |
| "eval_steps_per_second": 6.284, |
| "eval_entropy": 1.3460397827894168, |
| "eval_num_tokens": 12774125.0, |
| "eval_mean_token_accuracy": 0.6789818390419609, |
| "epoch": 0.9207910432143979, |
| "step": 2200 |
| }, |
| { |
| "loss": 1.3911532402038573, |
| "grad_norm": 0.1305384337902069, |
| "learning_rate": 8.106588542119957e-07, |
| "entropy": 1.3969669669866562, |
| "num_tokens": 12832025.0, |
| "mean_token_accuracy": 0.6678112506866455, |
| "epoch": 0.9249764570471906, |
| "step": 2210 |
| }, |
| { |
| "loss": 1.3338760375976562, |
| "grad_norm": 0.1187131404876709, |
| "learning_rate": 8.088857573605237e-07, |
| "entropy": 1.375227126479149, |
| "num_tokens": 12888734.0, |
| "mean_token_accuracy": 0.6802457317709922, |
| "epoch": 0.9291618708799833, |
| "step": 2220 |
| }, |
| { |
| "loss": 1.2834582328796387, |
| "grad_norm": 0.14116276800632477, |
| "learning_rate": 8.071063563448339e-07, |
| "entropy": 1.3024362832307816, |
| "num_tokens": 12943886.0, |
| "mean_token_accuracy": 0.6881816878914833, |
| "epoch": 0.933347284712776, |
| "step": 2230 |
| }, |
| { |
| "loss": 1.312158203125, |
| "grad_norm": 0.12964707612991333, |
| "learning_rate": 8.053206874813829e-07, |
| "entropy": 1.364695656299591, |
| "num_tokens": 13000723.0, |
| "mean_token_accuracy": 0.6795030117034913, |
| "epoch": 0.9375326985455686, |
| "step": 2240 |
| }, |
| { |
| "loss": 1.3068957328796387, |
| "grad_norm": 0.12915638089179993, |
| "learning_rate": 8.035287872145502e-07, |
| "entropy": 1.3586914032697677, |
| "num_tokens": 13059283.0, |
| "mean_token_accuracy": 0.6813771218061447, |
| "epoch": 0.9417181123783614, |
| "step": 2250 |
| }, |
| { |
| "loss": 1.3330992698669433, |
| "grad_norm": 0.10278042405843735, |
| "learning_rate": 8.017306921158942e-07, |
| "entropy": 1.3742854058742524, |
| "num_tokens": 13118033.0, |
| "mean_token_accuracy": 0.6798395842313767, |
| "epoch": 0.9459035262111541, |
| "step": 2260 |
| }, |
| { |
| "loss": 1.4351073265075684, |
| "grad_norm": 0.14834155142307281, |
| "learning_rate": 7.99926438883406e-07, |
| "entropy": 1.409215834736824, |
| "num_tokens": 13173943.0, |
| "mean_token_accuracy": 0.6597715452313423, |
| "epoch": 0.9500889400439468, |
| "step": 2270 |
| }, |
| { |
| "loss": 1.2880861282348632, |
| "grad_norm": 0.10686289519071579, |
| "learning_rate": 7.981160643407603e-07, |
| "entropy": 1.3126128152012826, |
| "num_tokens": 13233131.0, |
| "mean_token_accuracy": 0.6831002920866013, |
| "epoch": 0.9542743538767395, |
| "step": 2280 |
| }, |
| { |
| "loss": 1.37518310546875, |
| "grad_norm": 0.1137382760643959, |
| "learning_rate": 7.962996054365642e-07, |
| "entropy": 1.3762210130691528, |
| "num_tokens": 13289669.0, |
| "mean_token_accuracy": 0.6739885672926903, |
| "epoch": 0.9584597677095322, |
| "step": 2290 |
| }, |
| { |
| "loss": 1.2979955673217773, |
| "grad_norm": 0.12840472161769867, |
| "learning_rate": 7.944770992436026e-07, |
| "entropy": 1.3360363632440566, |
| "num_tokens": 13347131.0, |
| "mean_token_accuracy": 0.6823042362928391, |
| "epoch": 0.962645181542325, |
| "step": 2300 |
| }, |
| { |
| "loss": 1.3241994857788086, |
| "grad_norm": 0.11081521958112717, |
| "learning_rate": 7.926485829580814e-07, |
| "entropy": 1.3549024030566215, |
| "num_tokens": 13405191.0, |
| "mean_token_accuracy": 0.6771049797534943, |
| "epoch": 0.9668305953751177, |
| "step": 2310 |
| }, |
| { |
| "loss": 1.3815485000610352, |
| "grad_norm": 0.11393143981695175, |
| "learning_rate": 7.908140938988692e-07, |
| "entropy": 1.3816259652376175, |
| "num_tokens": 13463913.0, |
| "mean_token_accuracy": 0.6688653215765953, |
| "epoch": 0.9710160092079104, |
| "step": 2320 |
| }, |
| { |
| "loss": 1.298836898803711, |
| "grad_norm": 0.12833324074745178, |
| "learning_rate": 7.889736695067348e-07, |
| "entropy": 1.3122636392712592, |
| "num_tokens": 13523958.0, |
| "mean_token_accuracy": 0.6812730312347413, |
| "epoch": 0.9752014230407031, |
| "step": 2330 |
| }, |
| { |
| "loss": 1.3618934631347657, |
| "grad_norm": 0.13094215095043182, |
| "learning_rate": 7.87127347343584e-07, |
| "entropy": 1.3717454001307487, |
| "num_tokens": 13581681.0, |
| "mean_token_accuracy": 0.6716930896043778, |
| "epoch": 0.9793868368734958, |
| "step": 2340 |
| }, |
| { |
| "loss": 1.3211997032165528, |
| "grad_norm": 0.14429600536823273, |
| "learning_rate": 7.852751650916917e-07, |
| "entropy": 1.3575677514076232, |
| "num_tokens": 13641102.0, |
| "mean_token_accuracy": 0.6812080055475235, |
| "epoch": 0.9835722507062886, |
| "step": 2350 |
| }, |
| { |
| "loss": 1.3058280944824219, |
| "grad_norm": 0.13168948888778687, |
| "learning_rate": 7.83417160552934e-07, |
| "entropy": 1.3377871721982957, |
| "num_tokens": 13697001.0, |
| "mean_token_accuracy": 0.6837321490049362, |
| "epoch": 0.9877576645390813, |
| "step": 2360 |
| }, |
| { |
| "loss": 1.320173168182373, |
| "grad_norm": 0.13248135149478912, |
| "learning_rate": 7.815533716480158e-07, |
| "entropy": 1.3715132981538773, |
| "num_tokens": 13754970.0, |
| "mean_token_accuracy": 0.6818105265498161, |
| "epoch": 0.991943078371874, |
| "step": 2370 |
| }, |
| { |
| "loss": 1.3184805870056153, |
| "grad_norm": 0.1117711067199707, |
| "learning_rate": 7.796838364156977e-07, |
| "entropy": 1.3519122838973998, |
| "num_tokens": 13814161.0, |
| "mean_token_accuracy": 0.6789533212780953, |
| "epoch": 0.9961284922046667, |
| "step": 2380 |
| }, |
| { |
| "loss": 1.381266212463379, |
| "grad_norm": 0.3463696539402008, |
| "learning_rate": 7.778085930120191e-07, |
| "entropy": 1.3519603207304671, |
| "num_tokens": 13865252.0, |
| "mean_token_accuracy": 0.6710431801306235, |
| "epoch": 1.0, |
| "step": 2390 |
| }, |
| { |
| "loss": 1.3650718688964845, |
| "grad_norm": 0.10732991993427277, |
| "learning_rate": 7.759276797095196e-07, |
| "entropy": 1.3758342564105988, |
| "num_tokens": 13925700.0, |
| "mean_token_accuracy": 0.6686381295323371, |
| "epoch": 1.0041854138327928, |
| "step": 2400 |
| }, |
| { |
| "eval_loss": 1.324312686920166, |
| "eval_runtime": 42.9132, |
| "eval_samples_per_second": 148.463, |
| "eval_steps_per_second": 6.199, |
| "eval_entropy": 1.3503220336777824, |
| "eval_num_tokens": 13925700.0, |
| "eval_mean_token_accuracy": 0.6799099082337287, |
| "epoch": 1.0041854138327928, |
| "step": 2400 |
| }, |
| { |
| "loss": 1.275872802734375, |
| "grad_norm": 0.10125313699245453, |
| "learning_rate": 7.740411348964576e-07, |
| "entropy": 1.3205101490020752, |
| "num_tokens": 13983023.0, |
| "mean_token_accuracy": 0.6841968685388565, |
| "epoch": 1.0083708276655854, |
| "step": 2410 |
| }, |
| { |
| "loss": 1.3197596549987793, |
| "grad_norm": 0.12487287819385529, |
| "learning_rate": 7.721489970760275e-07, |
| "entropy": 1.3373865127563476, |
| "num_tokens": 14044602.0, |
| "mean_token_accuracy": 0.677445650100708, |
| "epoch": 1.0125562414983782, |
| "step": 2420 |
| }, |
| { |
| "loss": 1.2989977836608886, |
| "grad_norm": 0.1548726111650467, |
| "learning_rate": 7.702513048655733e-07, |
| "entropy": 1.3116925165057183, |
| "num_tokens": 14104408.0, |
| "mean_token_accuracy": 0.6814423218369484, |
| "epoch": 1.0167416553311708, |
| "step": 2430 |
| }, |
| { |
| "loss": 1.2706897735595704, |
| "grad_norm": 0.11503283679485321, |
| "learning_rate": 7.683480969958003e-07, |
| "entropy": 1.310747703909874, |
| "num_tokens": 14162736.0, |
| "mean_token_accuracy": 0.6892054408788681, |
| "epoch": 1.0209270691639636, |
| "step": 2440 |
| }, |
| { |
| "loss": 1.2928138732910157, |
| "grad_norm": 0.12240534275770187, |
| "learning_rate": 7.664394123099853e-07, |
| "entropy": 1.3191738039255143, |
| "num_tokens": 14221626.0, |
| "mean_token_accuracy": 0.6869289621710777, |
| "epoch": 1.0251124829967564, |
| "step": 2450 |
| }, |
| { |
| "loss": 1.293262767791748, |
| "grad_norm": 0.11519357562065125, |
| "learning_rate": 7.64525289763184e-07, |
| "entropy": 1.320760977268219, |
| "num_tokens": 14280841.0, |
| "mean_token_accuracy": 0.6857645198702812, |
| "epoch": 1.029297896829549, |
| "step": 2460 |
| }, |
| { |
| "loss": 1.3051738739013672, |
| "grad_norm": 0.11736012250185013, |
| "learning_rate": 7.626057684214341e-07, |
| "entropy": 1.316636176407337, |
| "num_tokens": 14338816.0, |
| "mean_token_accuracy": 0.6803102239966392, |
| "epoch": 1.0334833106623418, |
| "step": 2470 |
| }, |
| { |
| "loss": 1.3561962127685547, |
| "grad_norm": 0.13388119637966156, |
| "learning_rate": 7.606808874609605e-07, |
| "entropy": 1.36598659157753, |
| "num_tokens": 14395539.0, |
| "mean_token_accuracy": 0.6725652754306793, |
| "epoch": 1.0376687244951344, |
| "step": 2480 |
| }, |
| { |
| "loss": 1.3418392181396483, |
| "grad_norm": 0.12838061153888702, |
| "learning_rate": 7.587506861673737e-07, |
| "entropy": 1.3244032382965087, |
| "num_tokens": 14451789.0, |
| "mean_token_accuracy": 0.6773718982934952, |
| "epoch": 1.0418541383279272, |
| "step": 2490 |
| }, |
| { |
| "loss": 1.2931674003601075, |
| "grad_norm": 0.12182667851448059, |
| "learning_rate": 7.568152039348695e-07, |
| "entropy": 1.3194489538669587, |
| "num_tokens": 14510441.0, |
| "mean_token_accuracy": 0.6842545494437218, |
| "epoch": 1.04603955216072, |
| "step": 2500 |
| }, |
| { |
| "loss": 1.2959155082702636, |
| "grad_norm": 0.11124531924724579, |
| "learning_rate": 7.548744802654241e-07, |
| "entropy": 1.3410497322678565, |
| "num_tokens": 14571458.0, |
| "mean_token_accuracy": 0.6812979131937027, |
| "epoch": 1.0502249659935126, |
| "step": 2510 |
| }, |
| { |
| "loss": 1.3612911224365234, |
| "grad_norm": 0.12837456166744232, |
| "learning_rate": 7.529285547679882e-07, |
| "entropy": 1.3736032456159593, |
| "num_tokens": 14627118.0, |
| "mean_token_accuracy": 0.672698700428009, |
| "epoch": 1.0544103798263054, |
| "step": 2520 |
| }, |
| { |
| "loss": 1.2740073204040527, |
| "grad_norm": 0.1267591416835785, |
| "learning_rate": 7.509774671576785e-07, |
| "entropy": 1.3048336684703827, |
| "num_tokens": 14685752.0, |
| "mean_token_accuracy": 0.6858905151486396, |
| "epoch": 1.058595793659098, |
| "step": 2530 |
| }, |
| { |
| "loss": 1.3410483360290528, |
| "grad_norm": 0.11439883708953857, |
| "learning_rate": 7.490212572549666e-07, |
| "entropy": 1.3314668446779252, |
| "num_tokens": 14742644.0, |
| "mean_token_accuracy": 0.6746952176094055, |
| "epoch": 1.0627812074918908, |
| "step": 2540 |
| }, |
| { |
| "loss": 1.2937799453735352, |
| "grad_norm": 0.12421438843011856, |
| "learning_rate": 7.470599649848681e-07, |
| "entropy": 1.3203342527151107, |
| "num_tokens": 14801546.0, |
| "mean_token_accuracy": 0.6863655790686607, |
| "epoch": 1.0669666213246836, |
| "step": 2550 |
| }, |
| { |
| "loss": 1.3474176406860352, |
| "grad_norm": 0.11059686541557312, |
| "learning_rate": 7.450936303761256e-07, |
| "entropy": 1.3507545605301856, |
| "num_tokens": 14861872.0, |
| "mean_token_accuracy": 0.6777540385723114, |
| "epoch": 1.0711520351574761, |
| "step": 2560 |
| }, |
| { |
| "loss": 1.2592041015625, |
| "grad_norm": 0.12262172996997833, |
| "learning_rate": 7.431222935603929e-07, |
| "entropy": 1.2903067260980605, |
| "num_tokens": 14919917.0, |
| "mean_token_accuracy": 0.6862245246767997, |
| "epoch": 1.075337448990269, |
| "step": 2570 |
| }, |
| { |
| "loss": 1.3285273551940917, |
| "grad_norm": 0.1249430701136589, |
| "learning_rate": 7.411459947714156e-07, |
| "entropy": 1.346482941508293, |
| "num_tokens": 14977173.0, |
| "mean_token_accuracy": 0.677224400639534, |
| "epoch": 1.0795228628230615, |
| "step": 2580 |
| }, |
| { |
| "loss": 1.3090217590332032, |
| "grad_norm": 0.15991806983947754, |
| "learning_rate": 7.391647743442103e-07, |
| "entropy": 1.3448469370603562, |
| "num_tokens": 15036719.0, |
| "mean_token_accuracy": 0.6807536914944649, |
| "epoch": 1.0837082766558543, |
| "step": 2590 |
| }, |
| { |
| "loss": 1.385681438446045, |
| "grad_norm": 0.12378425896167755, |
| "learning_rate": 7.37178672714241e-07, |
| "entropy": 1.4169642955064774, |
| "num_tokens": 15093272.0, |
| "mean_token_accuracy": 0.666177037358284, |
| "epoch": 1.0878936904886471, |
| "step": 2600 |
| }, |
| { |
| "eval_loss": 1.3201794624328613, |
| "eval_runtime": 43.6566, |
| "eval_samples_per_second": 145.934, |
| "eval_steps_per_second": 6.093, |
| "eval_entropy": 1.318832386705212, |
| "eval_num_tokens": 15093272.0, |
| "eval_mean_token_accuracy": 0.6805890722382337, |
| "epoch": 1.0878936904886471, |
| "step": 2600 |
| }, |
| { |
| "loss": 1.3124534606933593, |
| "grad_norm": 0.14278866350650787, |
| "learning_rate": 7.351877304165939e-07, |
| "entropy": 1.3207478374242783, |
| "num_tokens": 15151531.0, |
| "mean_token_accuracy": 0.6814302504062653, |
| "epoch": 1.0920791043214397, |
| "step": 2610 |
| }, |
| { |
| "loss": 1.310394859313965, |
| "grad_norm": 0.11016988754272461, |
| "learning_rate": 7.331919880851505e-07, |
| "entropy": 1.3247565850615501, |
| "num_tokens": 15208078.0, |
| "mean_token_accuracy": 0.6797660425305366, |
| "epoch": 1.0962645181542325, |
| "step": 2620 |
| }, |
| { |
| "loss": 1.3397459030151366, |
| "grad_norm": 0.12294236570596695, |
| "learning_rate": 7.311914864517574e-07, |
| "entropy": 1.344627757370472, |
| "num_tokens": 15262908.0, |
| "mean_token_accuracy": 0.6800432533025742, |
| "epoch": 1.1004499319870251, |
| "step": 2630 |
| }, |
| { |
| "loss": 1.3385157585144043, |
| "grad_norm": 0.1285414695739746, |
| "learning_rate": 7.291862663453963e-07, |
| "entropy": 1.342196998000145, |
| "num_tokens": 15323145.0, |
| "mean_token_accuracy": 0.6765275478363038, |
| "epoch": 1.104635345819818, |
| "step": 2640 |
| }, |
| { |
| "loss": 1.30029239654541, |
| "grad_norm": 0.13284096121788025, |
| "learning_rate": 7.271763686913493e-07, |
| "entropy": 1.3492845341563224, |
| "num_tokens": 15380781.0, |
| "mean_token_accuracy": 0.6857595443725586, |
| "epoch": 1.1088207596526107, |
| "step": 2650 |
| }, |
| { |
| "loss": 1.287161159515381, |
| "grad_norm": 0.12089403718709946, |
| "learning_rate": 7.251618345103646e-07, |
| "entropy": 1.3121826618909835, |
| "num_tokens": 15439602.0, |
| "mean_token_accuracy": 0.6850664153695106, |
| "epoch": 1.1130061734854033, |
| "step": 2660 |
| }, |
| { |
| "loss": 1.2762629508972168, |
| "grad_norm": 0.12427452206611633, |
| "learning_rate": 7.231427049178192e-07, |
| "entropy": 1.2992495775222779, |
| "num_tokens": 15495803.0, |
| "mean_token_accuracy": 0.6846798285841942, |
| "epoch": 1.1171915873181961, |
| "step": 2670 |
| }, |
| { |
| "loss": 1.274948501586914, |
| "grad_norm": 0.13808666169643402, |
| "learning_rate": 7.211190211228791e-07, |
| "entropy": 1.305306363105774, |
| "num_tokens": 15550588.0, |
| "mean_token_accuracy": 0.6887386977672577, |
| "epoch": 1.1213770011509887, |
| "step": 2680 |
| }, |
| { |
| "loss": 1.2809961318969727, |
| "grad_norm": 0.1604543924331665, |
| "learning_rate": 7.190908244276592e-07, |
| "entropy": 1.291318878531456, |
| "num_tokens": 15607839.0, |
| "mean_token_accuracy": 0.6838915839791297, |
| "epoch": 1.1255624149837815, |
| "step": 2690 |
| }, |
| { |
| "loss": 1.3102614402770996, |
| "grad_norm": 0.1264321208000183, |
| "learning_rate": 7.170581562263795e-07, |
| "entropy": 1.3290839582681655, |
| "num_tokens": 15666987.0, |
| "mean_token_accuracy": 0.6819840222597122, |
| "epoch": 1.1297478288165743, |
| "step": 2700 |
| }, |
| { |
| "loss": 1.3671725273132325, |
| "grad_norm": 0.1209392324090004, |
| "learning_rate": 7.150210580045207e-07, |
| "entropy": 1.3735456377267838, |
| "num_tokens": 15724955.0, |
| "mean_token_accuracy": 0.6725474014878273, |
| "epoch": 1.133933242649367, |
| "step": 2710 |
| }, |
| { |
| "loss": 1.3231231689453125, |
| "grad_norm": 0.12559957802295685, |
| "learning_rate": 7.129795713379776e-07, |
| "entropy": 1.340329071879387, |
| "num_tokens": 15782149.0, |
| "mean_token_accuracy": 0.6805369645357132, |
| "epoch": 1.1381186564821597, |
| "step": 2720 |
| }, |
| { |
| "loss": 1.2828726768493652, |
| "grad_norm": 0.13034865260124207, |
| "learning_rate": 7.109337378922102e-07, |
| "entropy": 1.2797758102416992, |
| "num_tokens": 15835973.0, |
| "mean_token_accuracy": 0.6902579948306083, |
| "epoch": 1.1423040703149523, |
| "step": 2730 |
| }, |
| { |
| "loss": 1.329068374633789, |
| "grad_norm": 0.1187472939491272, |
| "learning_rate": 7.088835994213937e-07, |
| "entropy": 1.3206837117671966, |
| "num_tokens": 15895605.0, |
| "mean_token_accuracy": 0.6760165989398956, |
| "epoch": 1.146489484147745, |
| "step": 2740 |
| }, |
| { |
| "loss": 1.2608001708984375, |
| "grad_norm": 0.11278735101222992, |
| "learning_rate": 7.068291977675661e-07, |
| "entropy": 1.314364343881607, |
| "num_tokens": 15956260.0, |
| "mean_token_accuracy": 0.6899202361702919, |
| "epoch": 1.150674897980538, |
| "step": 2750 |
| }, |
| { |
| "loss": 1.3079211235046386, |
| "grad_norm": 0.10432706028223038, |
| "learning_rate": 7.047705748597741e-07, |
| "entropy": 1.3454543590545653, |
| "num_tokens": 16013636.0, |
| "mean_token_accuracy": 0.6848849534988404, |
| "epoch": 1.1548603118133305, |
| "step": 2760 |
| }, |
| { |
| "loss": 1.3299365043640137, |
| "grad_norm": 0.1423172652721405, |
| "learning_rate": 7.027077727132178e-07, |
| "entropy": 1.3436584562063216, |
| "num_tokens": 16070788.0, |
| "mean_token_accuracy": 0.6782758548855782, |
| "epoch": 1.1590457256461233, |
| "step": 2770 |
| }, |
| { |
| "loss": 1.2507868766784669, |
| "grad_norm": 0.12985938787460327, |
| "learning_rate": 7.006408334283929e-07, |
| "entropy": 1.300880002975464, |
| "num_tokens": 16132003.0, |
| "mean_token_accuracy": 0.6908931702375412, |
| "epoch": 1.163231139478916, |
| "step": 2780 |
| }, |
| { |
| "loss": 1.3301843643188476, |
| "grad_norm": 0.14071504771709442, |
| "learning_rate": 6.985697991902313e-07, |
| "entropy": 1.3270384550094605, |
| "num_tokens": 16192149.0, |
| "mean_token_accuracy": 0.6777920231223107, |
| "epoch": 1.1674165533117087, |
| "step": 2790 |
| }, |
| { |
| "loss": 1.2743472099304198, |
| "grad_norm": 0.1140187457203865, |
| "learning_rate": 6.964947122672406e-07, |
| "entropy": 1.3053037211298943, |
| "num_tokens": 16251607.0, |
| "mean_token_accuracy": 0.6888150230050087, |
| "epoch": 1.1716019671445015, |
| "step": 2800 |
| }, |
| { |
| "eval_loss": 1.3166502714157104, |
| "eval_runtime": 43.6438, |
| "eval_samples_per_second": 145.977, |
| "eval_steps_per_second": 6.095, |
| "eval_entropy": 1.3201469900016498, |
| "eval_num_tokens": 16251607.0, |
| "eval_mean_token_accuracy": 0.6811264934844541, |
| "epoch": 1.1716019671445015, |
| "step": 2800 |
| }, |
| { |
| "loss": 1.3409744262695313, |
| "grad_norm": 0.10443054884672165, |
| "learning_rate": 6.944156150106407e-07, |
| "entropy": 1.342512857913971, |
| "num_tokens": 16312813.0, |
| "mean_token_accuracy": 0.6724711164832116, |
| "epoch": 1.175787380977294, |
| "step": 2810 |
| }, |
| { |
| "loss": 1.3988855361938477, |
| "grad_norm": 0.1189141720533371, |
| "learning_rate": 6.923325498535005e-07, |
| "entropy": 1.396900659799576, |
| "num_tokens": 16370227.0, |
| "mean_token_accuracy": 0.6674019232392311, |
| "epoch": 1.1799727948100869, |
| "step": 2820 |
| }, |
| { |
| "loss": 1.3619994163513183, |
| "grad_norm": 0.11577111482620239, |
| "learning_rate": 6.902455593098711e-07, |
| "entropy": 1.3739877551794053, |
| "num_tokens": 16431284.0, |
| "mean_token_accuracy": 0.6685123056173324, |
| "epoch": 1.1841582086428795, |
| "step": 2830 |
| }, |
| { |
| "loss": 1.2823293685913086, |
| "grad_norm": 0.1623101532459259, |
| "learning_rate": 6.881546859739178e-07, |
| "entropy": 1.287187758088112, |
| "num_tokens": 16490232.0, |
| "mean_token_accuracy": 0.6852916941046715, |
| "epoch": 1.1883436224756723, |
| "step": 2840 |
| }, |
| { |
| "loss": 1.295179557800293, |
| "grad_norm": 0.1286296844482422, |
| "learning_rate": 6.860599725190516e-07, |
| "entropy": 1.3181857854127883, |
| "num_tokens": 16549313.0, |
| "mean_token_accuracy": 0.6848940759897232, |
| "epoch": 1.192529036308465, |
| "step": 2850 |
| }, |
| { |
| "loss": 1.351776695251465, |
| "grad_norm": 0.12631654739379883, |
| "learning_rate": 6.839614616970579e-07, |
| "entropy": 1.3548940598964692, |
| "num_tokens": 16607551.0, |
| "mean_token_accuracy": 0.6714933633804321, |
| "epoch": 1.1967144501412577, |
| "step": 2860 |
| }, |
| { |
| "loss": 1.3425410270690918, |
| "grad_norm": 0.1304273158311844, |
| "learning_rate": 6.818591963372242e-07, |
| "entropy": 1.3392845541238785, |
| "num_tokens": 16667124.0, |
| "mean_token_accuracy": 0.6802757531404495, |
| "epoch": 1.2008998639740505, |
| "step": 2870 |
| }, |
| { |
| "loss": 1.3046100616455079, |
| "grad_norm": 0.12168211489915848, |
| "learning_rate": 6.797532193454654e-07, |
| "entropy": 1.3106303334236145, |
| "num_tokens": 16725868.0, |
| "mean_token_accuracy": 0.6820132330060005, |
| "epoch": 1.2050852778068433, |
| "step": 2880 |
| }, |
| { |
| "loss": 1.266930389404297, |
| "grad_norm": 0.16589786112308502, |
| "learning_rate": 6.776435737034484e-07, |
| "entropy": 1.2930086612701417, |
| "num_tokens": 16780751.0, |
| "mean_token_accuracy": 0.6916173666715622, |
| "epoch": 1.2092706916396359, |
| "step": 2890 |
| }, |
| { |
| "loss": 1.290895366668701, |
| "grad_norm": 0.11063241213560104, |
| "learning_rate": 6.755303024677153e-07, |
| "entropy": 1.3148932754993439, |
| "num_tokens": 16838274.0, |
| "mean_token_accuracy": 0.6836249440908432, |
| "epoch": 1.2134561054724287, |
| "step": 2900 |
| }, |
| { |
| "loss": 1.3228137016296386, |
| "grad_norm": 0.10892044007778168, |
| "learning_rate": 6.734134487688043e-07, |
| "entropy": 1.3384662061929702, |
| "num_tokens": 16896457.0, |
| "mean_token_accuracy": 0.6798223108053207, |
| "epoch": 1.2176415193052212, |
| "step": 2910 |
| }, |
| { |
| "loss": 1.321933650970459, |
| "grad_norm": 0.13741441071033478, |
| "learning_rate": 6.712930558103691e-07, |
| "entropy": 1.3460487127304077, |
| "num_tokens": 16955127.0, |
| "mean_token_accuracy": 0.6765735790133476, |
| "epoch": 1.221826933138014, |
| "step": 2920 |
| }, |
| { |
| "loss": 1.3986333847045898, |
| "grad_norm": 0.13116198778152466, |
| "learning_rate": 6.691691668682977e-07, |
| "entropy": 1.3796002447605134, |
| "num_tokens": 17010269.0, |
| "mean_token_accuracy": 0.6650555938482284, |
| "epoch": 1.2260123469708066, |
| "step": 2930 |
| }, |
| { |
| "loss": 1.3275323867797852, |
| "grad_norm": 0.1158343181014061, |
| "learning_rate": 6.670418252898284e-07, |
| "entropy": 1.3303757071495057, |
| "num_tokens": 17067471.0, |
| "mean_token_accuracy": 0.6782015576958657, |
| "epoch": 1.2301977608035994, |
| "step": 2940 |
| }, |
| { |
| "loss": 1.299326515197754, |
| "grad_norm": 0.13345105946063995, |
| "learning_rate": 6.649110744926669e-07, |
| "entropy": 1.319593369960785, |
| "num_tokens": 17123848.0, |
| "mean_token_accuracy": 0.6838883191347123, |
| "epoch": 1.2343831746363922, |
| "step": 2950 |
| }, |
| { |
| "loss": 1.2679595947265625, |
| "grad_norm": 0.13203385472297668, |
| "learning_rate": 6.627769579640975e-07, |
| "entropy": 1.2961439684033393, |
| "num_tokens": 17180001.0, |
| "mean_token_accuracy": 0.6859666183590889, |
| "epoch": 1.2385685884691848, |
| "step": 2960 |
| }, |
| { |
| "loss": 1.3538383483886718, |
| "grad_norm": 0.1471163034439087, |
| "learning_rate": 6.606395192600978e-07, |
| "entropy": 1.3352440029382706, |
| "num_tokens": 17233262.0, |
| "mean_token_accuracy": 0.6754373088479042, |
| "epoch": 1.2427540023019776, |
| "step": 2970 |
| }, |
| { |
| "loss": 1.3183878898620605, |
| "grad_norm": 0.12840019166469574, |
| "learning_rate": 6.584988020044485e-07, |
| "entropy": 1.349251627922058, |
| "num_tokens": 17287228.0, |
| "mean_token_accuracy": 0.6815307438373566, |
| "epoch": 1.2469394161347704, |
| "step": 2980 |
| }, |
| { |
| "loss": 1.2082359313964843, |
| "grad_norm": 0.15220077335834503, |
| "learning_rate": 6.563548498878438e-07, |
| "entropy": 1.2743981599807739, |
| "num_tokens": 17348380.0, |
| "mean_token_accuracy": 0.6978771463036537, |
| "epoch": 1.251124829967563, |
| "step": 2990 |
| }, |
| { |
| "loss": 1.3413416862487793, |
| "grad_norm": 0.12702776491641998, |
| "learning_rate": 6.542077066669993e-07, |
| "entropy": 1.338026624917984, |
| "num_tokens": 17403328.0, |
| "mean_token_accuracy": 0.675315049290657, |
| "epoch": 1.2553102438003558, |
| "step": 3000 |
| }, |
| { |
| "eval_loss": 1.3136601448059082, |
| "eval_runtime": 43.154, |
| "eval_samples_per_second": 147.634, |
| "eval_steps_per_second": 6.164, |
| "eval_entropy": 1.3217154624766874, |
| "eval_num_tokens": 17403328.0, |
| "eval_mean_token_accuracy": 0.6815465722317086, |
| "epoch": 1.2553102438003558, |
| "step": 3000 |
| }, |
| { |
| "loss": 1.3451406478881835, |
| "grad_norm": 0.1156093031167984, |
| "learning_rate": 6.52057416163759e-07, |
| "entropy": 1.3520446419715881, |
| "num_tokens": 17460648.0, |
| "mean_token_accuracy": 0.6702774554491043, |
| "epoch": 1.2594956576331484, |
| "step": 3010 |
| }, |
| { |
| "loss": 1.3822593688964844, |
| "grad_norm": 0.12327724695205688, |
| "learning_rate": 6.499040222642007e-07, |
| "entropy": 1.365411925315857, |
| "num_tokens": 17519410.0, |
| "mean_token_accuracy": 0.6692025378346443, |
| "epoch": 1.2636810714659412, |
| "step": 3020 |
| }, |
| { |
| "loss": 1.3682982444763183, |
| "grad_norm": 0.12616313993930817, |
| "learning_rate": 6.477475689177407e-07, |
| "entropy": 1.3488513588905335, |
| "num_tokens": 17575033.0, |
| "mean_token_accuracy": 0.6721004649996758, |
| "epoch": 1.2678664852987338, |
| "step": 3030 |
| }, |
| { |
| "loss": 1.3206647872924804, |
| "grad_norm": 0.11206343024969101, |
| "learning_rate": 6.455881001362372e-07, |
| "entropy": 1.3416712805628777, |
| "num_tokens": 17634281.0, |
| "mean_token_accuracy": 0.6792711272835732, |
| "epoch": 1.2720518991315266, |
| "step": 3040 |
| }, |
| { |
| "loss": 1.368018913269043, |
| "grad_norm": 0.1311446875333786, |
| "learning_rate": 6.434256599930909e-07, |
| "entropy": 1.37212732732296, |
| "num_tokens": 17689407.0, |
| "mean_token_accuracy": 0.6717290371656418, |
| "epoch": 1.2762373129643194, |
| "step": 3050 |
| }, |
| { |
| "loss": 1.3607032775878907, |
| "grad_norm": 0.14133571088314056, |
| "learning_rate": 6.412602926223464e-07, |
| "entropy": 1.3578105926513673, |
| "num_tokens": 17748080.0, |
| "mean_token_accuracy": 0.6729270294308662, |
| "epoch": 1.280422726797112, |
| "step": 3060 |
| }, |
| { |
| "loss": 1.259375, |
| "grad_norm": 0.12888510525226593, |
| "learning_rate": 6.390920422177909e-07, |
| "entropy": 1.2880975693464278, |
| "num_tokens": 17809457.0, |
| "mean_token_accuracy": 0.6906314134597779, |
| "epoch": 1.2846081406299048, |
| "step": 3070 |
| }, |
| { |
| "loss": 1.2357722282409669, |
| "grad_norm": 0.09784252196550369, |
| "learning_rate": 6.36920953032053e-07, |
| "entropy": 1.3111811935901643, |
| "num_tokens": 17871869.0, |
| "mean_token_accuracy": 0.6910292714834213, |
| "epoch": 1.2887935544626976, |
| "step": 3080 |
| }, |
| { |
| "loss": 1.2405315399169923, |
| "grad_norm": 0.13264605402946472, |
| "learning_rate": 6.347470693756987e-07, |
| "entropy": 1.2896562367677689, |
| "num_tokens": 17933114.0, |
| "mean_token_accuracy": 0.6923574149608612, |
| "epoch": 1.2929789682954902, |
| "step": 3090 |
| }, |
| { |
| "loss": 1.2628044128417968, |
| "grad_norm": 0.11528719961643219, |
| "learning_rate": 6.325704356163273e-07, |
| "entropy": 1.2994973942637444, |
| "num_tokens": 17989694.0, |
| "mean_token_accuracy": 0.6884831428527832, |
| "epoch": 1.297164382128283, |
| "step": 3100 |
| }, |
| { |
| "loss": 1.2996297836303712, |
| "grad_norm": 0.1078164130449295, |
| "learning_rate": 6.303910961776664e-07, |
| "entropy": 1.315569232404232, |
| "num_tokens": 18049760.0, |
| "mean_token_accuracy": 0.6808459624648094, |
| "epoch": 1.3013497959610756, |
| "step": 3110 |
| }, |
| { |
| "loss": 1.3285269737243652, |
| "grad_norm": 0.13121522963047028, |
| "learning_rate": 6.282090955386642e-07, |
| "entropy": 1.3389025837183, |
| "num_tokens": 18106326.0, |
| "mean_token_accuracy": 0.6797921672463417, |
| "epoch": 1.3055352097938684, |
| "step": 3120 |
| }, |
| { |
| "loss": 1.3413330078125, |
| "grad_norm": 0.1084539070725441, |
| "learning_rate": 6.260244782325829e-07, |
| "entropy": 1.3604058563709258, |
| "num_tokens": 18165478.0, |
| "mean_token_accuracy": 0.6726910755038261, |
| "epoch": 1.309720623626661, |
| "step": 3130 |
| }, |
| { |
| "loss": 1.3873212814331055, |
| "grad_norm": 0.11267993599176407, |
| "learning_rate": 6.238372888460892e-07, |
| "entropy": 1.404004666209221, |
| "num_tokens": 18221418.0, |
| "mean_token_accuracy": 0.6710177347064018, |
| "epoch": 1.3139060374594538, |
| "step": 3140 |
| }, |
| { |
| "loss": 1.312180519104004, |
| "grad_norm": 0.12451887875795364, |
| "learning_rate": 6.216475720183437e-07, |
| "entropy": 1.322364729642868, |
| "num_tokens": 18278027.0, |
| "mean_token_accuracy": 0.6799433350563049, |
| "epoch": 1.3180914512922466, |
| "step": 3150 |
| }, |
| { |
| "loss": 1.2946537017822266, |
| "grad_norm": 0.15065018832683563, |
| "learning_rate": 6.194553724400911e-07, |
| "entropy": 1.3054640024900437, |
| "num_tokens": 18334990.0, |
| "mean_token_accuracy": 0.6847308576107025, |
| "epoch": 1.3222768651250392, |
| "step": 3160 |
| }, |
| { |
| "loss": 1.2657323837280274, |
| "grad_norm": 0.11712754517793655, |
| "learning_rate": 6.172607348527474e-07, |
| "entropy": 1.2842485100030898, |
| "num_tokens": 18393253.0, |
| "mean_token_accuracy": 0.6883261352777481, |
| "epoch": 1.326462278957832, |
| "step": 3170 |
| }, |
| { |
| "loss": 1.3007762908935547, |
| "grad_norm": 0.16621780395507812, |
| "learning_rate": 6.150637040474868e-07, |
| "entropy": 1.3247014865279199, |
| "num_tokens": 18449407.0, |
| "mean_token_accuracy": 0.6868977710604668, |
| "epoch": 1.3306476927906248, |
| "step": 3180 |
| }, |
| { |
| "loss": 1.2898554801940918, |
| "grad_norm": 0.13524088263511658, |
| "learning_rate": 6.128643248643274e-07, |
| "entropy": 1.305448915064335, |
| "num_tokens": 18506773.0, |
| "mean_token_accuracy": 0.6856573060154915, |
| "epoch": 1.3348331066234174, |
| "step": 3190 |
| }, |
| { |
| "loss": 1.3588788986206055, |
| "grad_norm": 0.1192813366651535, |
| "learning_rate": 6.106626421912163e-07, |
| "entropy": 1.3537309616804123, |
| "num_tokens": 18568894.0, |
| "mean_token_accuracy": 0.6747590154409409, |
| "epoch": 1.3390185204562102, |
| "step": 3200 |
| }, |
| { |
| "eval_loss": 1.3109967708587646, |
| "eval_runtime": 43.5438, |
| "eval_samples_per_second": 146.312, |
| "eval_steps_per_second": 6.109, |
| "eval_entropy": 1.3213256283810264, |
| "eval_num_tokens": 18568894.0, |
| "eval_mean_token_accuracy": 0.6819047645518654, |
| "epoch": 1.3390185204562102, |
| "step": 3200 |
| }, |
| { |
| "loss": 1.3187339782714844, |
| "grad_norm": 0.12998685240745544, |
| "learning_rate": 6.084587009631135e-07, |
| "entropy": 1.3331160172820091, |
| "num_tokens": 18627669.0, |
| "mean_token_accuracy": 0.6809702217578888, |
| "epoch": 1.3432039342890028, |
| "step": 3210 |
| }, |
| { |
| "loss": 1.287522792816162, |
| "grad_norm": 0.11987276375293732, |
| "learning_rate": 6.062525461610746e-07, |
| "entropy": 1.3107565701007844, |
| "num_tokens": 18690323.0, |
| "mean_token_accuracy": 0.6874667569994927, |
| "epoch": 1.3473893481217956, |
| "step": 3220 |
| }, |
| { |
| "loss": 1.2994555473327636, |
| "grad_norm": 0.14244310557842255, |
| "learning_rate": 6.040442228113328e-07, |
| "entropy": 1.3177940219640731, |
| "num_tokens": 18749330.0, |
| "mean_token_accuracy": 0.6785065039992333, |
| "epoch": 1.3515747619545881, |
| "step": 3230 |
| }, |
| { |
| "loss": 1.348573875427246, |
| "grad_norm": 0.12585744261741638, |
| "learning_rate": 6.018337759843803e-07, |
| "entropy": 1.3356850504875184, |
| "num_tokens": 18805120.0, |
| "mean_token_accuracy": 0.676536102592945, |
| "epoch": 1.355760175787381, |
| "step": 3240 |
| }, |
| { |
| "loss": 1.3361006736755372, |
| "grad_norm": 0.1416776031255722, |
| "learning_rate": 5.996212507940475e-07, |
| "entropy": 1.355094811320305, |
| "num_tokens": 18861563.0, |
| "mean_token_accuracy": 0.6747770622372627, |
| "epoch": 1.3599455896201738, |
| "step": 3250 |
| }, |
| { |
| "loss": 1.2256298065185547, |
| "grad_norm": 0.10666567087173462, |
| "learning_rate": 5.974066923965835e-07, |
| "entropy": 1.2700331062078476, |
| "num_tokens": 18922654.0, |
| "mean_token_accuracy": 0.697255577147007, |
| "epoch": 1.3641310034529663, |
| "step": 3260 |
| }, |
| { |
| "loss": 1.3261382102966308, |
| "grad_norm": 0.1291145235300064, |
| "learning_rate": 5.951901459897337e-07, |
| "entropy": 1.351950439810753, |
| "num_tokens": 18980966.0, |
| "mean_token_accuracy": 0.6802997335791587, |
| "epoch": 1.3683164172857591, |
| "step": 3270 |
| }, |
| { |
| "loss": 1.339816188812256, |
| "grad_norm": 0.12637273967266083, |
| "learning_rate": 5.929716568118176e-07, |
| "entropy": 1.341824659705162, |
| "num_tokens": 19041925.0, |
| "mean_token_accuracy": 0.6740039184689521, |
| "epoch": 1.372501831118552, |
| "step": 3280 |
| }, |
| { |
| "loss": 1.322571086883545, |
| "grad_norm": 0.13360652327537537, |
| "learning_rate": 5.907512701408049e-07, |
| "entropy": 1.3231751516461372, |
| "num_tokens": 19097885.0, |
| "mean_token_accuracy": 0.6797602906823158, |
| "epoch": 1.3766872449513445, |
| "step": 3290 |
| }, |
| { |
| "loss": 1.2750181198120116, |
| "grad_norm": 0.1212676391005516, |
| "learning_rate": 5.885290312933929e-07, |
| "entropy": 1.2946186915040017, |
| "num_tokens": 19156636.0, |
| "mean_token_accuracy": 0.6879573374986648, |
| "epoch": 1.3808726587841373, |
| "step": 3300 |
| }, |
| { |
| "loss": 1.2478185653686524, |
| "grad_norm": 0.10930495709180832, |
| "learning_rate": 5.863049856240797e-07, |
| "entropy": 1.2818130880594254, |
| "num_tokens": 19215539.0, |
| "mean_token_accuracy": 0.6884996458888054, |
| "epoch": 1.38505807261693, |
| "step": 3310 |
| }, |
| { |
| "loss": 1.2877882957458495, |
| "grad_norm": 0.14618222415447235, |
| "learning_rate": 5.840791785242399e-07, |
| "entropy": 1.3158632695674897, |
| "num_tokens": 19275155.0, |
| "mean_token_accuracy": 0.6801917359232903, |
| "epoch": 1.3892434864497227, |
| "step": 3320 |
| }, |
| { |
| "loss": 1.3204275131225587, |
| "grad_norm": 0.13408797979354858, |
| "learning_rate": 5.818516554211983e-07, |
| "entropy": 1.3392174810171127, |
| "num_tokens": 19331554.0, |
| "mean_token_accuracy": 0.6769860580563545, |
| "epoch": 1.3934289002825153, |
| "step": 3330 |
| }, |
| { |
| "loss": 1.2603473663330078, |
| "grad_norm": 0.15371856093406677, |
| "learning_rate": 5.796224617773012e-07, |
| "entropy": 1.2774315923452377, |
| "num_tokens": 19389359.0, |
| "mean_token_accuracy": 0.6910146772861481, |
| "epoch": 1.3976143141153081, |
| "step": 3340 |
| }, |
| { |
| "loss": 1.3022661209106445, |
| "grad_norm": 0.1194000095129013, |
| "learning_rate": 5.773916430889905e-07, |
| "entropy": 1.3322788611054421, |
| "num_tokens": 19449266.0, |
| "mean_token_accuracy": 0.6764059454202652, |
| "epoch": 1.401799727948101, |
| "step": 3350 |
| }, |
| { |
| "loss": 1.280670738220215, |
| "grad_norm": 0.1414560228586197, |
| "learning_rate": 5.751592448858737e-07, |
| "entropy": 1.28292535841465, |
| "num_tokens": 19505413.0, |
| "mean_token_accuracy": 0.6831368803977966, |
| "epoch": 1.4059851417808935, |
| "step": 3360 |
| }, |
| { |
| "loss": 1.2672719955444336, |
| "grad_norm": 0.1158173456788063, |
| "learning_rate": 5.729253127297955e-07, |
| "entropy": 1.2811419636011123, |
| "num_tokens": 19564391.0, |
| "mean_token_accuracy": 0.6885835364460945, |
| "epoch": 1.4101705556136863, |
| "step": 3370 |
| }, |
| { |
| "loss": 1.3353286743164063, |
| "grad_norm": 0.12490648031234741, |
| "learning_rate": 5.706898922139074e-07, |
| "entropy": 1.3280266046524047, |
| "num_tokens": 19623582.0, |
| "mean_token_accuracy": 0.6795374467968941, |
| "epoch": 1.4143559694464791, |
| "step": 3380 |
| }, |
| { |
| "loss": 1.2236414909362794, |
| "grad_norm": 0.1268617957830429, |
| "learning_rate": 5.684530289617376e-07, |
| "entropy": 1.281736159324646, |
| "num_tokens": 19682008.0, |
| "mean_token_accuracy": 0.6963353782892228, |
| "epoch": 1.4185413832792717, |
| "step": 3390 |
| }, |
| { |
| "loss": 1.365687370300293, |
| "grad_norm": 0.12744104862213135, |
| "learning_rate": 5.662147686262595e-07, |
| "entropy": 1.3710105925798417, |
| "num_tokens": 19735892.0, |
| "mean_token_accuracy": 0.6697546020150185, |
| "epoch": 1.4227267971120645, |
| "step": 3400 |
| }, |
| { |
| "eval_loss": 1.3087373971939087, |
| "eval_runtime": 43.1953, |
| "eval_samples_per_second": 147.493, |
| "eval_steps_per_second": 6.158, |
| "eval_entropy": 1.314033669636662, |
| "eval_num_tokens": 19735892.0, |
| "eval_mean_token_accuracy": 0.6822930157632756, |
| "epoch": 1.4227267971120645, |
| "step": 3400 |
| }, |
| { |
| "loss": 1.2659673690795898, |
| "grad_norm": 0.12900042533874512, |
| "learning_rate": 5.639751568889601e-07, |
| "entropy": 1.2991064012050628, |
| "num_tokens": 19795701.0, |
| "mean_token_accuracy": 0.6906736105680465, |
| "epoch": 1.426912210944857, |
| "step": 3410 |
| }, |
| { |
| "loss": 1.3444564819335938, |
| "grad_norm": 0.141217440366745, |
| "learning_rate": 5.617342394589076e-07, |
| "entropy": 1.328627872467041, |
| "num_tokens": 19852770.0, |
| "mean_token_accuracy": 0.6754001170396805, |
| "epoch": 1.43109762477765, |
| "step": 3420 |
| }, |
| { |
| "loss": 1.306549644470215, |
| "grad_norm": 0.12908576428890228, |
| "learning_rate": 5.594920620718189e-07, |
| "entropy": 1.3152456805109978, |
| "num_tokens": 19912101.0, |
| "mean_token_accuracy": 0.6831103786826134, |
| "epoch": 1.4352830386104425, |
| "step": 3430 |
| }, |
| { |
| "loss": 1.4111559867858887, |
| "grad_norm": 0.12521252036094666, |
| "learning_rate": 5.572486704891254e-07, |
| "entropy": 1.3963622391223907, |
| "num_tokens": 19969427.0, |
| "mean_token_accuracy": 0.6657738149166107, |
| "epoch": 1.4394684524432353, |
| "step": 3440 |
| }, |
| { |
| "loss": 1.3437400817871095, |
| "grad_norm": 0.14173802733421326, |
| "learning_rate": 5.550041104970396e-07, |
| "entropy": 1.3275486350059509, |
| "num_tokens": 20030520.0, |
| "mean_token_accuracy": 0.6768685072660446, |
| "epoch": 1.443653866276028, |
| "step": 3450 |
| }, |
| { |
| "loss": 1.3257243156433105, |
| "grad_norm": 0.12309889495372772, |
| "learning_rate": 5.527584279056207e-07, |
| "entropy": 1.3419605940580368, |
| "num_tokens": 20088125.0, |
| "mean_token_accuracy": 0.6762049332261085, |
| "epoch": 1.4478392801088207, |
| "step": 3460 |
| }, |
| { |
| "loss": 1.226247215270996, |
| "grad_norm": 0.13416838645935059, |
| "learning_rate": 5.505116685478394e-07, |
| "entropy": 1.284440317749977, |
| "num_tokens": 20147039.0, |
| "mean_token_accuracy": 0.6947048246860504, |
| "epoch": 1.4520246939416135, |
| "step": 3470 |
| }, |
| { |
| "loss": 1.3330312728881837, |
| "grad_norm": 0.1232227310538292, |
| "learning_rate": 5.48263878278642e-07, |
| "entropy": 1.3413183093070984, |
| "num_tokens": 20205035.0, |
| "mean_token_accuracy": 0.67842618227005, |
| "epoch": 1.4562101077744063, |
| "step": 3480 |
| }, |
| { |
| "loss": 1.3752978324890137, |
| "grad_norm": 0.1453479379415512, |
| "learning_rate": 5.460151029740161e-07, |
| "entropy": 1.3477472990751267, |
| "num_tokens": 20260344.0, |
| "mean_token_accuracy": 0.6687687709927559, |
| "epoch": 1.4603955216071989, |
| "step": 3490 |
| }, |
| { |
| "loss": 1.221930980682373, |
| "grad_norm": 0.1277054399251938, |
| "learning_rate": 5.437653885300522e-07, |
| "entropy": 1.261066934466362, |
| "num_tokens": 20318023.0, |
| "mean_token_accuracy": 0.6955515563488006, |
| "epoch": 1.4645809354399917, |
| "step": 3500 |
| }, |
| { |
| "loss": 1.2911027908325194, |
| "grad_norm": 0.12536244094371796, |
| "learning_rate": 5.415147808620086e-07, |
| "entropy": 1.3049872070550919, |
| "num_tokens": 20376931.0, |
| "mean_token_accuracy": 0.6845586389303208, |
| "epoch": 1.4687663492727843, |
| "step": 3510 |
| }, |
| { |
| "loss": 1.289406967163086, |
| "grad_norm": 0.10206779837608337, |
| "learning_rate": 5.392633259033735e-07, |
| "entropy": 1.3262745544314385, |
| "num_tokens": 20435694.0, |
| "mean_token_accuracy": 0.6822992920875549, |
| "epoch": 1.472951763105577, |
| "step": 3520 |
| }, |
| { |
| "loss": 1.36239652633667, |
| "grad_norm": 0.12939555943012238, |
| "learning_rate": 5.370110696049282e-07, |
| "entropy": 1.353842854499817, |
| "num_tokens": 20494766.0, |
| "mean_token_accuracy": 0.6707737103104592, |
| "epoch": 1.4771371769383697, |
| "step": 3530 |
| }, |
| { |
| "loss": 1.2756587028503419, |
| "grad_norm": 0.11803429573774338, |
| "learning_rate": 5.34758057933808e-07, |
| "entropy": 1.3108687788248061, |
| "num_tokens": 20552412.0, |
| "mean_token_accuracy": 0.6879084140062333, |
| "epoch": 1.4813225907711625, |
| "step": 3540 |
| }, |
| { |
| "loss": 1.328935432434082, |
| "grad_norm": 0.13244083523750305, |
| "learning_rate": 5.325043368725662e-07, |
| "entropy": 1.3331556499004364, |
| "num_tokens": 20610804.0, |
| "mean_token_accuracy": 0.6783339202404022, |
| "epoch": 1.4855080046039553, |
| "step": 3550 |
| }, |
| { |
| "loss": 1.334804153442383, |
| "grad_norm": 0.14279146492481232, |
| "learning_rate": 5.302499524182327e-07, |
| "entropy": 1.3319466978311538, |
| "num_tokens": 20668475.0, |
| "mean_token_accuracy": 0.6799613311886787, |
| "epoch": 1.4896934184367479, |
| "step": 3560 |
| }, |
| { |
| "loss": 1.2445655822753907, |
| "grad_norm": 0.137944757938385, |
| "learning_rate": 5.279949505813783e-07, |
| "entropy": 1.2786899566650392, |
| "num_tokens": 20731142.0, |
| "mean_token_accuracy": 0.6872789070010186, |
| "epoch": 1.4938788322695407, |
| "step": 3570 |
| }, |
| { |
| "loss": 1.2799750328063966, |
| "grad_norm": 0.14270278811454773, |
| "learning_rate": 5.257393773851733e-07, |
| "entropy": 1.3207889288663863, |
| "num_tokens": 20791636.0, |
| "mean_token_accuracy": 0.6855424389243125, |
| "epoch": 1.4980642461023335, |
| "step": 3580 |
| }, |
| { |
| "loss": 1.2714473724365234, |
| "grad_norm": 0.13160590827465057, |
| "learning_rate": 5.234832788644492e-07, |
| "entropy": 1.2881531581282615, |
| "num_tokens": 20850942.0, |
| "mean_token_accuracy": 0.6868263691663742, |
| "epoch": 1.502249659935126, |
| "step": 3590 |
| }, |
| { |
| "loss": 1.377396297454834, |
| "grad_norm": 0.13724471628665924, |
| "learning_rate": 5.212267010647594e-07, |
| "entropy": 1.350425472855568, |
| "num_tokens": 20909794.0, |
| "mean_token_accuracy": 0.6673172801733017, |
| "epoch": 1.5064350737679189, |
| "step": 3600 |
| }, |
| { |
| "eval_loss": 1.3067371845245361, |
| "eval_runtime": 43.2705, |
| "eval_samples_per_second": 147.237, |
| "eval_steps_per_second": 6.147, |
| "eval_entropy": 1.3166241054248093, |
| "eval_num_tokens": 20909794.0, |
| "eval_mean_token_accuracy": 0.6824193930715546, |
| "epoch": 1.5064350737679189, |
| "step": 3600 |
| }, |
| { |
| "loss": 1.3214588165283203, |
| "grad_norm": 0.12896448373794556, |
| "learning_rate": 5.189696900414387e-07, |
| "entropy": 1.3233384594321251, |
| "num_tokens": 20966668.0, |
| "mean_token_accuracy": 0.6812221944332123, |
| "epoch": 1.5106204876007117, |
| "step": 3610 |
| }, |
| { |
| "loss": 1.2983431816101074, |
| "grad_norm": 0.11282876133918762, |
| "learning_rate": 5.167122918586641e-07, |
| "entropy": 1.3307133883237838, |
| "num_tokens": 21019757.0, |
| "mean_token_accuracy": 0.6807741552591324, |
| "epoch": 1.5148059014335042, |
| "step": 3620 |
| }, |
| { |
| "loss": 1.2715835571289062, |
| "grad_norm": 0.15096786618232727, |
| "learning_rate": 5.144545525885137e-07, |
| "entropy": 1.3070465952157975, |
| "num_tokens": 21077694.0, |
| "mean_token_accuracy": 0.6909742683172226, |
| "epoch": 1.5189913152662968, |
| "step": 3630 |
| }, |
| { |
| "loss": 1.2847407341003418, |
| "grad_norm": 0.13508452475070953, |
| "learning_rate": 5.121965183100278e-07, |
| "entropy": 1.2937607616186142, |
| "num_tokens": 21135607.0, |
| "mean_token_accuracy": 0.6900022774934769, |
| "epoch": 1.5231767290990896, |
| "step": 3640 |
| }, |
| { |
| "loss": 1.3469314575195312, |
| "grad_norm": 0.12108864635229111, |
| "learning_rate": 5.099382351082666e-07, |
| "entropy": 1.3381920427083969, |
| "num_tokens": 21196736.0, |
| "mean_token_accuracy": 0.6733641669154167, |
| "epoch": 1.5273621429318824, |
| "step": 3650 |
| }, |
| { |
| "loss": 1.3375173568725587, |
| "grad_norm": 0.10356143862009048, |
| "learning_rate": 5.076797490733718e-07, |
| "entropy": 1.339997085928917, |
| "num_tokens": 21253173.0, |
| "mean_token_accuracy": 0.6792127892374993, |
| "epoch": 1.531547556764675, |
| "step": 3660 |
| }, |
| { |
| "loss": 1.2826983451843261, |
| "grad_norm": 0.13479599356651306, |
| "learning_rate": 5.054211062996241e-07, |
| "entropy": 1.305300708115101, |
| "num_tokens": 21309039.0, |
| "mean_token_accuracy": 0.6865562707185745, |
| "epoch": 1.5357329705974678, |
| "step": 3670 |
| }, |
| { |
| "loss": 1.2372420310974122, |
| "grad_norm": 0.13392086327075958, |
| "learning_rate": 5.031623528845032e-07, |
| "entropy": 1.2712924674153327, |
| "num_tokens": 21368207.0, |
| "mean_token_accuracy": 0.6935058936476708, |
| "epoch": 1.5399183844302606, |
| "step": 3680 |
| }, |
| { |
| "loss": 1.232171630859375, |
| "grad_norm": 0.13666661083698273, |
| "learning_rate": 5.009035349277469e-07, |
| "entropy": 1.2765518009662629, |
| "num_tokens": 21425778.0, |
| "mean_token_accuracy": 0.6935865059494972, |
| "epoch": 1.5441037982630532, |
| "step": 3690 |
| }, |
| { |
| "loss": 1.2454364776611329, |
| "grad_norm": 0.11271411925554276, |
| "learning_rate": 4.986446985304105e-07, |
| "entropy": 1.2914676815271378, |
| "num_tokens": 21484225.0, |
| "mean_token_accuracy": 0.6902065351605415, |
| "epoch": 1.548289212095846, |
| "step": 3700 |
| }, |
| { |
| "loss": 1.3030299186706542, |
| "grad_norm": 0.1399720460176468, |
| "learning_rate": 4.963858897939254e-07, |
| "entropy": 1.3240129977464676, |
| "num_tokens": 21541427.0, |
| "mean_token_accuracy": 0.6800246313214302, |
| "epoch": 1.5524746259286388, |
| "step": 3710 |
| }, |
| { |
| "loss": 1.2892935752868653, |
| "grad_norm": 0.12150213122367859, |
| "learning_rate": 4.941271548191588e-07, |
| "entropy": 1.3206008851528168, |
| "num_tokens": 21600143.0, |
| "mean_token_accuracy": 0.6826386615633965, |
| "epoch": 1.5566600397614314, |
| "step": 3720 |
| }, |
| { |
| "loss": 1.257300853729248, |
| "grad_norm": 0.12151734530925751, |
| "learning_rate": 4.918685397054718e-07, |
| "entropy": 1.3101585179567337, |
| "num_tokens": 21656388.0, |
| "mean_token_accuracy": 0.6894284501671791, |
| "epoch": 1.560845453594224, |
| "step": 3730 |
| }, |
| { |
| "loss": 1.233230972290039, |
| "grad_norm": 0.11451518535614014, |
| "learning_rate": 4.896100905497803e-07, |
| "entropy": 1.2788519978523254, |
| "num_tokens": 21715109.0, |
| "mean_token_accuracy": 0.6912301525473594, |
| "epoch": 1.5650308674270168, |
| "step": 3740 |
| }, |
| { |
| "loss": 1.3578661918640136, |
| "grad_norm": 0.12948159873485565, |
| "learning_rate": 4.873518534456119e-07, |
| "entropy": 1.3681051909923554, |
| "num_tokens": 21772746.0, |
| "mean_token_accuracy": 0.6737246960401535, |
| "epoch": 1.5692162812598096, |
| "step": 3750 |
| }, |
| { |
| "loss": 1.3305506706237793, |
| "grad_norm": 0.132918581366539, |
| "learning_rate": 4.850938744821674e-07, |
| "entropy": 1.3546297058463097, |
| "num_tokens": 21830592.0, |
| "mean_token_accuracy": 0.6777592465281487, |
| "epoch": 1.5734016950926022, |
| "step": 3760 |
| }, |
| { |
| "loss": 1.2513206481933594, |
| "grad_norm": 0.11073267459869385, |
| "learning_rate": 4.828361997433782e-07, |
| "entropy": 1.3076282858848571, |
| "num_tokens": 21892167.0, |
| "mean_token_accuracy": 0.68986496925354, |
| "epoch": 1.577587108925395, |
| "step": 3770 |
| }, |
| { |
| "loss": 1.271761131286621, |
| "grad_norm": 0.1349179595708847, |
| "learning_rate": 4.805788753069673e-07, |
| "entropy": 1.3031177580356599, |
| "num_tokens": 21952326.0, |
| "mean_token_accuracy": 0.6844272211194038, |
| "epoch": 1.5817725227581878, |
| "step": 3780 |
| }, |
| { |
| "loss": 1.3118972778320312, |
| "grad_norm": 0.16216766834259033, |
| "learning_rate": 4.783219472435081e-07, |
| "entropy": 1.3089008510112763, |
| "num_tokens": 22012045.0, |
| "mean_token_accuracy": 0.6802516788244247, |
| "epoch": 1.5859579365909804, |
| "step": 3790 |
| }, |
| { |
| "loss": 1.276634979248047, |
| "grad_norm": 0.13579685986042023, |
| "learning_rate": 4.760654616154842e-07, |
| "entropy": 1.309640994668007, |
| "num_tokens": 22068798.0, |
| "mean_token_accuracy": 0.6864374697208404, |
| "epoch": 1.5901433504237732, |
| "step": 3800 |
| }, |
| { |
| "eval_loss": 1.3049228191375732, |
| "eval_runtime": 43.7061, |
| "eval_samples_per_second": 145.769, |
| "eval_steps_per_second": 6.086, |
| "eval_entropy": 1.3191418598469038, |
| "eval_num_tokens": 22068798.0, |
| "eval_mean_token_accuracy": 0.6827442256131566, |
| "epoch": 1.5901433504237732, |
| "step": 3800 |
| }, |
| { |
| "loss": 1.2272584915161133, |
| "grad_norm": 0.14792950451374054, |
| "learning_rate": 4.7380946447634935e-07, |
| "entropy": 1.2662998199462892, |
| "num_tokens": 22128395.0, |
| "mean_token_accuracy": 0.6912488013505935, |
| "epoch": 1.594328764256566, |
| "step": 3810 |
| }, |
| { |
| "loss": 1.3102972030639648, |
| "grad_norm": 0.13187845051288605, |
| "learning_rate": 4.7155400186958744e-07, |
| "entropy": 1.3161917060613633, |
| "num_tokens": 22185985.0, |
| "mean_token_accuracy": 0.6805871248245239, |
| "epoch": 1.5985141780893586, |
| "step": 3820 |
| }, |
| { |
| "loss": 1.2989760398864747, |
| "grad_norm": 0.11845917999744415, |
| "learning_rate": 4.6929911982777325e-07, |
| "entropy": 1.3359744518995285, |
| "num_tokens": 22241668.0, |
| "mean_token_accuracy": 0.6845213517546653, |
| "epoch": 1.6026995919221512, |
| "step": 3830 |
| }, |
| { |
| "loss": 1.3590301513671874, |
| "grad_norm": 0.15108934044837952, |
| "learning_rate": 4.670448643716322e-07, |
| "entropy": 1.3409444272518158, |
| "num_tokens": 22297005.0, |
| "mean_token_accuracy": 0.6736213758587837, |
| "epoch": 1.606885005754944, |
| "step": 3840 |
| }, |
| { |
| "loss": 1.3382845878601075, |
| "grad_norm": 0.1333889216184616, |
| "learning_rate": 4.6479128150910196e-07, |
| "entropy": 1.3449043482542038, |
| "num_tokens": 22357044.0, |
| "mean_token_accuracy": 0.675865213572979, |
| "epoch": 1.6110704195877368, |
| "step": 3850 |
| }, |
| { |
| "loss": 1.310408592224121, |
| "grad_norm": 0.13304699957370758, |
| "learning_rate": 4.625384172343926e-07, |
| "entropy": 1.3386895060539246, |
| "num_tokens": 22413961.0, |
| "mean_token_accuracy": 0.6803735584020615, |
| "epoch": 1.6152558334205294, |
| "step": 3860 |
| }, |
| { |
| "loss": 1.2976115226745606, |
| "grad_norm": 0.11845416575670242, |
| "learning_rate": 4.602863175270483e-07, |
| "entropy": 1.3106703519821168, |
| "num_tokens": 22473509.0, |
| "mean_token_accuracy": 0.6821468025445938, |
| "epoch": 1.6194412472533222, |
| "step": 3870 |
| }, |
| { |
| "loss": 1.2801057815551757, |
| "grad_norm": 0.13403619825839996, |
| "learning_rate": 4.580350283510088e-07, |
| "entropy": 1.295821413397789, |
| "num_tokens": 22527439.0, |
| "mean_token_accuracy": 0.688027186691761, |
| "epoch": 1.623626661086115, |
| "step": 3880 |
| }, |
| { |
| "loss": 1.2886218070983886, |
| "grad_norm": 0.14298778772354126, |
| "learning_rate": 4.55784595653672e-07, |
| "entropy": 1.300849825143814, |
| "num_tokens": 22586180.0, |
| "mean_token_accuracy": 0.6847355782985687, |
| "epoch": 1.6278120749189076, |
| "step": 3890 |
| }, |
| { |
| "loss": 1.3447054862976073, |
| "grad_norm": 0.13079994916915894, |
| "learning_rate": 4.535350653649549e-07, |
| "entropy": 1.3326701998710633, |
| "num_tokens": 22642360.0, |
| "mean_token_accuracy": 0.6774184912443161, |
| "epoch": 1.6319974887517004, |
| "step": 3900 |
| }, |
| { |
| "loss": 1.3529298782348633, |
| "grad_norm": 0.10864491760730743, |
| "learning_rate": 4.512864833963571e-07, |
| "entropy": 1.338020858168602, |
| "num_tokens": 22699591.0, |
| "mean_token_accuracy": 0.6726731553673744, |
| "epoch": 1.6361829025844932, |
| "step": 3910 |
| }, |
| { |
| "loss": 1.2351530075073243, |
| "grad_norm": 0.12713301181793213, |
| "learning_rate": 4.4903889564002394e-07, |
| "entropy": 1.2726581797003746, |
| "num_tokens": 22757192.0, |
| "mean_token_accuracy": 0.6924531191587449, |
| "epoch": 1.6403683164172858, |
| "step": 3920 |
| }, |
| { |
| "loss": 1.2388504028320313, |
| "grad_norm": 0.12611106038093567, |
| "learning_rate": 4.467923479678091e-07, |
| "entropy": 1.2651499658823013, |
| "num_tokens": 22813695.0, |
| "mean_token_accuracy": 0.6929998561739922, |
| "epoch": 1.6445537302500783, |
| "step": 3930 |
| }, |
| { |
| "loss": 1.2846209526062011, |
| "grad_norm": 0.1429147869348526, |
| "learning_rate": 4.4454688623033894e-07, |
| "entropy": 1.3101652726531028, |
| "num_tokens": 22873575.0, |
| "mean_token_accuracy": 0.6836878523230553, |
| "epoch": 1.6487391440828711, |
| "step": 3940 |
| }, |
| { |
| "loss": 1.3049224853515624, |
| "grad_norm": 0.1356530785560608, |
| "learning_rate": 4.4230255625607637e-07, |
| "entropy": 1.3245902001857757, |
| "num_tokens": 22930361.0, |
| "mean_token_accuracy": 0.6820057585835457, |
| "epoch": 1.652924557915664, |
| "step": 3950 |
| }, |
| { |
| "loss": 1.3374080657958984, |
| "grad_norm": 0.13884375989437103, |
| "learning_rate": 4.400594038503864e-07, |
| "entropy": 1.3170197814702989, |
| "num_tokens": 22987617.0, |
| "mean_token_accuracy": 0.6769963175058364, |
| "epoch": 1.6571099717484565, |
| "step": 3960 |
| }, |
| { |
| "loss": 1.3491817474365235, |
| "grad_norm": 0.1354888528585434, |
| "learning_rate": 4.3781747479459974e-07, |
| "entropy": 1.3501463949680328, |
| "num_tokens": 23042051.0, |
| "mean_token_accuracy": 0.6761467263102532, |
| "epoch": 1.6612953855812493, |
| "step": 3970 |
| }, |
| { |
| "loss": 1.3287680625915528, |
| "grad_norm": 0.16015098989009857, |
| "learning_rate": 4.355768148450799e-07, |
| "entropy": 1.3458044916391372, |
| "num_tokens": 23098670.0, |
| "mean_token_accuracy": 0.6781758189201355, |
| "epoch": 1.6654807994140421, |
| "step": 3980 |
| }, |
| { |
| "loss": 1.2587160110473632, |
| "grad_norm": 0.13395771384239197, |
| "learning_rate": 4.3333746973228854e-07, |
| "entropy": 1.2841592252254486, |
| "num_tokens": 23157362.0, |
| "mean_token_accuracy": 0.6881028071045876, |
| "epoch": 1.6696662132468347, |
| "step": 3990 |
| }, |
| { |
| "loss": 1.3319854736328125, |
| "grad_norm": 0.13594871759414673, |
| "learning_rate": 4.310994851598522e-07, |
| "entropy": 1.3213010758161545, |
| "num_tokens": 23212648.0, |
| "mean_token_accuracy": 0.6781036898493766, |
| "epoch": 1.6738516270796275, |
| "step": 4000 |
| }, |
| { |
| "eval_loss": 1.3033655881881714, |
| "eval_runtime": 43.5667, |
| "eval_samples_per_second": 146.235, |
| "eval_steps_per_second": 6.106, |
| "eval_entropy": 1.31763897399257, |
| "eval_num_tokens": 23212648.0, |
| "eval_mean_token_accuracy": 0.6829292187117096, |
| "epoch": 1.6738516270796275, |
| "step": 4000 |
| }, |
| { |
| "loss": 1.2960718154907227, |
| "grad_norm": 0.13045227527618408, |
| "learning_rate": 4.288629068036296e-07, |
| "entropy": 1.3475263714790344, |
| "num_tokens": 23274106.0, |
| "mean_token_accuracy": 0.6837118580937386, |
| "epoch": 1.6780370409124203, |
| "step": 4010 |
| }, |
| { |
| "loss": 1.2239046096801758, |
| "grad_norm": 0.1388995498418808, |
| "learning_rate": 4.2662778031077993e-07, |
| "entropy": 1.2441598355770112, |
| "num_tokens": 23333462.0, |
| "mean_token_accuracy": 0.6996882349252701, |
| "epoch": 1.682222454745213, |
| "step": 4020 |
| }, |
| { |
| "loss": 1.356397533416748, |
| "grad_norm": 0.13702833652496338, |
| "learning_rate": 4.243941512988304e-07, |
| "entropy": 1.363625492155552, |
| "num_tokens": 23392153.0, |
| "mean_token_accuracy": 0.6763727009296417, |
| "epoch": 1.6864078685780055, |
| "step": 4030 |
| }, |
| { |
| "loss": 1.2513771057128906, |
| "grad_norm": 0.14271850883960724, |
| "learning_rate": 4.221620653547454e-07, |
| "entropy": 1.2843372076749802, |
| "num_tokens": 23454405.0, |
| "mean_token_accuracy": 0.6873761117458344, |
| "epoch": 1.6905932824107983, |
| "step": 4040 |
| }, |
| { |
| "loss": 1.2633016586303711, |
| "grad_norm": 0.1479983925819397, |
| "learning_rate": 4.199315680339968e-07, |
| "entropy": 1.2902348592877388, |
| "num_tokens": 23515963.0, |
| "mean_token_accuracy": 0.6904997587203979, |
| "epoch": 1.6947786962435911, |
| "step": 4050 |
| }, |
| { |
| "loss": 1.3018023490905761, |
| "grad_norm": 0.12075834721326828, |
| "learning_rate": 4.1770270485963294e-07, |
| "entropy": 1.315699815750122, |
| "num_tokens": 23573373.0, |
| "mean_token_accuracy": 0.6817046746611595, |
| "epoch": 1.6989641100763837, |
| "step": 4060 |
| }, |
| { |
| "loss": 1.3296629905700683, |
| "grad_norm": 0.15214762091636658, |
| "learning_rate": 4.154755213213513e-07, |
| "entropy": 1.339156760275364, |
| "num_tokens": 23630153.0, |
| "mean_token_accuracy": 0.6760370403528213, |
| "epoch": 1.7031495239091765, |
| "step": 4070 |
| }, |
| { |
| "loss": 1.288606834411621, |
| "grad_norm": 0.1338847577571869, |
| "learning_rate": 4.132500628745681e-07, |
| "entropy": 1.308351318538189, |
| "num_tokens": 23689525.0, |
| "mean_token_accuracy": 0.6800839513540268, |
| "epoch": 1.7073349377419693, |
| "step": 4080 |
| }, |
| { |
| "loss": 1.3122243881225586, |
| "grad_norm": 0.13693219423294067, |
| "learning_rate": 4.110263749394918e-07, |
| "entropy": 1.310598623752594, |
| "num_tokens": 23746173.0, |
| "mean_token_accuracy": 0.6841694295406342, |
| "epoch": 1.711520351574762, |
| "step": 4090 |
| }, |
| { |
| "loss": 1.3211769104003905, |
| "grad_norm": 0.12190598249435425, |
| "learning_rate": 4.0880450290019594e-07, |
| "entropy": 1.3578921407461166, |
| "num_tokens": 23804574.0, |
| "mean_token_accuracy": 0.6757835909724236, |
| "epoch": 1.7157057654075547, |
| "step": 4100 |
| }, |
| { |
| "loss": 1.2779497146606444, |
| "grad_norm": 0.14035965502262115, |
| "learning_rate": 4.0658449210369295e-07, |
| "entropy": 1.311075533926487, |
| "num_tokens": 23859817.0, |
| "mean_token_accuracy": 0.6860598146915435, |
| "epoch": 1.7198911792403475, |
| "step": 4110 |
| }, |
| { |
| "loss": 1.289837646484375, |
| "grad_norm": 0.11380521208047867, |
| "learning_rate": 4.0436638785900797e-07, |
| "entropy": 1.3117400839924813, |
| "num_tokens": 23918028.0, |
| "mean_token_accuracy": 0.6838786184787751, |
| "epoch": 1.72407659307314, |
| "step": 4120 |
| }, |
| { |
| "loss": 1.2879505157470703, |
| "grad_norm": 0.15264193713665009, |
| "learning_rate": 4.0215023543625494e-07, |
| "entropy": 1.3319763213396072, |
| "num_tokens": 23977871.0, |
| "mean_token_accuracy": 0.6834924459457398, |
| "epoch": 1.7282620069059327, |
| "step": 4130 |
| }, |
| { |
| "loss": 1.2729723930358887, |
| "grad_norm": 0.13197912275791168, |
| "learning_rate": 3.999360800657121e-07, |
| "entropy": 1.3003861784934998, |
| "num_tokens": 24032724.0, |
| "mean_token_accuracy": 0.6865213885903358, |
| "epoch": 1.7324474207387257, |
| "step": 4140 |
| }, |
| { |
| "loss": 1.2687364578247071, |
| "grad_norm": 0.12328355014324188, |
| "learning_rate": 3.977239669368997e-07, |
| "entropy": 1.2848697736859322, |
| "num_tokens": 24091459.0, |
| "mean_token_accuracy": 0.686721895635128, |
| "epoch": 1.7366328345715183, |
| "step": 4150 |
| }, |
| { |
| "loss": 1.310719871520996, |
| "grad_norm": 0.13127021491527557, |
| "learning_rate": 3.955139411976564e-07, |
| "entropy": 1.3004455357789992, |
| "num_tokens": 24145064.0, |
| "mean_token_accuracy": 0.6823625862598419, |
| "epoch": 1.7408182484043109, |
| "step": 4160 |
| }, |
| { |
| "loss": 1.277029323577881, |
| "grad_norm": 0.13161002099514008, |
| "learning_rate": 3.9330604795321877e-07, |
| "entropy": 1.2868661388754845, |
| "num_tokens": 24202651.0, |
| "mean_token_accuracy": 0.6845416814088822, |
| "epoch": 1.7450036622371037, |
| "step": 4170 |
| }, |
| { |
| "loss": 1.2249256134033204, |
| "grad_norm": 0.1313517987728119, |
| "learning_rate": 3.911003322653009e-07, |
| "entropy": 1.2720478802919388, |
| "num_tokens": 24259573.0, |
| "mean_token_accuracy": 0.6942620486021042, |
| "epoch": 1.7491890760698965, |
| "step": 4180 |
| }, |
| { |
| "loss": 1.3295866966247558, |
| "grad_norm": 0.1546325832605362, |
| "learning_rate": 3.888968391511738e-07, |
| "entropy": 1.32426298558712, |
| "num_tokens": 24322196.0, |
| "mean_token_accuracy": 0.6730754569172859, |
| "epoch": 1.753374489902689, |
| "step": 4190 |
| }, |
| { |
| "loss": 1.3283195495605469, |
| "grad_norm": 0.13663379848003387, |
| "learning_rate": 3.866956135827475e-07, |
| "entropy": 1.3125308185815812, |
| "num_tokens": 24376182.0, |
| "mean_token_accuracy": 0.6829302325844765, |
| "epoch": 1.7575599037354819, |
| "step": 4200 |
| }, |
| { |
| "eval_loss": 1.3021423816680908, |
| "eval_runtime": 43.7163, |
| "eval_samples_per_second": 145.735, |
| "eval_steps_per_second": 6.085, |
| "eval_entropy": 1.3239203189548694, |
| "eval_num_tokens": 24376182.0, |
| "eval_mean_token_accuracy": 0.6831337011846385, |
| "epoch": 1.7575599037354819, |
| "step": 4200 |
| }, |
| { |
| "loss": 1.2734957695007325, |
| "grad_norm": 0.13826783001422882, |
| "learning_rate": 3.844967004856526e-07, |
| "entropy": 1.3006668120622635, |
| "num_tokens": 24433979.0, |
| "mean_token_accuracy": 0.6882383152842522, |
| "epoch": 1.7617453175682747, |
| "step": 4210 |
| }, |
| { |
| "loss": 1.298065757751465, |
| "grad_norm": 0.11341753602027893, |
| "learning_rate": 3.8230014473832386e-07, |
| "entropy": 1.3199127793312073, |
| "num_tokens": 24496717.0, |
| "mean_token_accuracy": 0.6763370648026467, |
| "epoch": 1.7659307314010673, |
| "step": 4220 |
| }, |
| { |
| "loss": 1.2942770004272461, |
| "grad_norm": 0.11990880221128464, |
| "learning_rate": 3.801059911710835e-07, |
| "entropy": 1.3037174761295318, |
| "num_tokens": 24556339.0, |
| "mean_token_accuracy": 0.6810034438967705, |
| "epoch": 1.7701161452338599, |
| "step": 4230 |
| }, |
| { |
| "loss": 1.3638200759887695, |
| "grad_norm": 0.12909641861915588, |
| "learning_rate": 3.779142845652275e-07, |
| "entropy": 1.37214894592762, |
| "num_tokens": 24610844.0, |
| "mean_token_accuracy": 0.6698960587382317, |
| "epoch": 1.7743015590666529, |
| "step": 4240 |
| }, |
| { |
| "loss": 1.4049521446228028, |
| "grad_norm": 0.137081578373909, |
| "learning_rate": 3.757250696521104e-07, |
| "entropy": 1.3875975281000137, |
| "num_tokens": 24663935.0, |
| "mean_token_accuracy": 0.6685925871133804, |
| "epoch": 1.7784869728994455, |
| "step": 4250 |
| }, |
| { |
| "loss": 1.2346957206726075, |
| "grad_norm": 0.15599705278873444, |
| "learning_rate": 3.7353839111223285e-07, |
| "entropy": 1.2952020585536956, |
| "num_tokens": 24724653.0, |
| "mean_token_accuracy": 0.6917664587497712, |
| "epoch": 1.782672386732238, |
| "step": 4260 |
| }, |
| { |
| "loss": 1.3122922897338867, |
| "grad_norm": 0.14105546474456787, |
| "learning_rate": 3.713542935743299e-07, |
| "entropy": 1.3242159157991409, |
| "num_tokens": 24783350.0, |
| "mean_token_accuracy": 0.6838966220617294, |
| "epoch": 1.7868578005650309, |
| "step": 4270 |
| }, |
| { |
| "loss": 1.2724437713623047, |
| "grad_norm": 0.14495964348316193, |
| "learning_rate": 3.6917282161445986e-07, |
| "entropy": 1.2849380433559419, |
| "num_tokens": 24840720.0, |
| "mean_token_accuracy": 0.6882339790463448, |
| "epoch": 1.7910432143978237, |
| "step": 4280 |
| }, |
| { |
| "loss": 1.2272959709167481, |
| "grad_norm": 0.1268715113401413, |
| "learning_rate": 3.66994019755095e-07, |
| "entropy": 1.2522281989455224, |
| "num_tokens": 24900268.0, |
| "mean_token_accuracy": 0.6954008027911186, |
| "epoch": 1.7952286282306162, |
| "step": 4290 |
| }, |
| { |
| "loss": 1.2925883293151856, |
| "grad_norm": 0.12049921602010727, |
| "learning_rate": 3.648179324642119e-07, |
| "entropy": 1.3150138720870017, |
| "num_tokens": 24955875.0, |
| "mean_token_accuracy": 0.6815980896353722, |
| "epoch": 1.799414042063409, |
| "step": 4300 |
| }, |
| { |
| "loss": 1.2687823295593261, |
| "grad_norm": 0.1410578191280365, |
| "learning_rate": 3.62644604154385e-07, |
| "entropy": 1.292095237970352, |
| "num_tokens": 25015757.0, |
| "mean_token_accuracy": 0.6861520081758499, |
| "epoch": 1.8035994558962019, |
| "step": 4310 |
| }, |
| { |
| "loss": 1.3122770309448242, |
| "grad_norm": 0.1278466135263443, |
| "learning_rate": 3.6047407918187923e-07, |
| "entropy": 1.32326979637146, |
| "num_tokens": 25073319.0, |
| "mean_token_accuracy": 0.6822131305932999, |
| "epoch": 1.8077848697289944, |
| "step": 4320 |
| }, |
| { |
| "loss": 1.239914321899414, |
| "grad_norm": 0.1450994610786438, |
| "learning_rate": 3.5830640184574567e-07, |
| "entropy": 1.2679915323853492, |
| "num_tokens": 25132470.0, |
| "mean_token_accuracy": 0.6903218165040016, |
| "epoch": 1.811970283561787, |
| "step": 4330 |
| }, |
| { |
| "loss": 1.3607330322265625, |
| "grad_norm": 0.14295367896556854, |
| "learning_rate": 3.5614161638691655e-07, |
| "entropy": 1.361120554804802, |
| "num_tokens": 25185195.0, |
| "mean_token_accuracy": 0.6752493545413017, |
| "epoch": 1.81615569739458, |
| "step": 4340 |
| }, |
| { |
| "loss": 1.2877880096435548, |
| "grad_norm": 0.1336987167596817, |
| "learning_rate": 3.539797669873029e-07, |
| "entropy": 1.294544619321823, |
| "num_tokens": 25241604.0, |
| "mean_token_accuracy": 0.6817367270588874, |
| "epoch": 1.8203411112273726, |
| "step": 4350 |
| }, |
| { |
| "loss": 1.2616640090942384, |
| "grad_norm": 0.12443723529577255, |
| "learning_rate": 3.518208977688924e-07, |
| "entropy": 1.3023397505283356, |
| "num_tokens": 25301515.0, |
| "mean_token_accuracy": 0.6868541851639748, |
| "epoch": 1.8245265250601652, |
| "step": 4360 |
| }, |
| { |
| "loss": 1.2237573623657227, |
| "grad_norm": 0.14553169906139374, |
| "learning_rate": 3.496650527928495e-07, |
| "entropy": 1.2511302560567856, |
| "num_tokens": 25357723.0, |
| "mean_token_accuracy": 0.6974032506346702, |
| "epoch": 1.828711938892958, |
| "step": 4370 |
| }, |
| { |
| "loss": 1.3280474662780761, |
| "grad_norm": 0.12313038110733032, |
| "learning_rate": 3.4751227605861544e-07, |
| "entropy": 1.3370114535093307, |
| "num_tokens": 25417249.0, |
| "mean_token_accuracy": 0.6781404823064804, |
| "epoch": 1.8328973527257508, |
| "step": 4380 |
| }, |
| { |
| "loss": 1.3116769790649414, |
| "grad_norm": 0.12443029880523682, |
| "learning_rate": 3.453626115030103e-07, |
| "entropy": 1.323847246170044, |
| "num_tokens": 25476722.0, |
| "mean_token_accuracy": 0.6824665144085884, |
| "epoch": 1.8370827665585434, |
| "step": 4390 |
| }, |
| { |
| "loss": 1.2541227340698242, |
| "grad_norm": 0.14306563138961792, |
| "learning_rate": 3.4321610299933754e-07, |
| "entropy": 1.275883974134922, |
| "num_tokens": 25536071.0, |
| "mean_token_accuracy": 0.6896202132105828, |
| "epoch": 1.8412681803913362, |
| "step": 4400 |
| }, |
| { |
| "eval_loss": 1.300899624824524, |
| "eval_runtime": 43.6667, |
| "eval_samples_per_second": 145.901, |
| "eval_steps_per_second": 6.092, |
| "eval_entropy": 1.31194052436298, |
| "eval_num_tokens": 25536071.0, |
| "eval_mean_token_accuracy": 0.6834116909736977, |
| "epoch": 1.8412681803913362, |
| "step": 4400 |
| }, |
| { |
| "loss": 1.3052658081054687, |
| "grad_norm": 0.12484145909547806, |
| "learning_rate": 3.410727943564865e-07, |
| "entropy": 1.304879105091095, |
| "num_tokens": 25592326.0, |
| "mean_token_accuracy": 0.6803866416215897, |
| "epoch": 1.845453594224129, |
| "step": 4410 |
| }, |
| { |
| "loss": 1.2852392196655273, |
| "grad_norm": 0.1245272308588028, |
| "learning_rate": 3.3893272931804004e-07, |
| "entropy": 1.2998150080442428, |
| "num_tokens": 25650560.0, |
| "mean_token_accuracy": 0.6859546720981597, |
| "epoch": 1.8496390080569216, |
| "step": 4420 |
| }, |
| { |
| "loss": 1.3093093872070312, |
| "grad_norm": 0.13694874942302704, |
| "learning_rate": 3.367959515613809e-07, |
| "entropy": 1.326773339509964, |
| "num_tokens": 25710390.0, |
| "mean_token_accuracy": 0.6779543459415436, |
| "epoch": 1.8538244218897142, |
| "step": 4430 |
| }, |
| { |
| "loss": 1.3431434631347656, |
| "grad_norm": 0.13425195217132568, |
| "learning_rate": 3.346625046968003e-07, |
| "entropy": 1.3320137143135071, |
| "num_tokens": 25765683.0, |
| "mean_token_accuracy": 0.6735352456569672, |
| "epoch": 1.8580098357225072, |
| "step": 4440 |
| }, |
| { |
| "loss": 1.2724491119384767, |
| "grad_norm": 0.13366232812404633, |
| "learning_rate": 3.325324322666081e-07, |
| "entropy": 1.27188421189785, |
| "num_tokens": 25824731.0, |
| "mean_token_accuracy": 0.689846420288086, |
| "epoch": 1.8621952495552998, |
| "step": 4450 |
| }, |
| { |
| "loss": 1.2903520584106445, |
| "grad_norm": 0.12611544132232666, |
| "learning_rate": 3.3040577774424437e-07, |
| "entropy": 1.3168232500553132, |
| "num_tokens": 25885073.0, |
| "mean_token_accuracy": 0.6854806423187256, |
| "epoch": 1.8663806633880924, |
| "step": 4460 |
| }, |
| { |
| "loss": 1.3013708114624023, |
| "grad_norm": 0.14317074418067932, |
| "learning_rate": 3.2828258453339155e-07, |
| "entropy": 1.3177704036235809, |
| "num_tokens": 25942626.0, |
| "mean_token_accuracy": 0.6822627365589142, |
| "epoch": 1.8705660772208852, |
| "step": 4470 |
| }, |
| { |
| "loss": 1.332556915283203, |
| "grad_norm": 0.14360307157039642, |
| "learning_rate": 3.261628959670889e-07, |
| "entropy": 1.3369245409965516, |
| "num_tokens": 25997260.0, |
| "mean_token_accuracy": 0.6774563640356064, |
| "epoch": 1.874751491053678, |
| "step": 4480 |
| }, |
| { |
| "loss": 1.2616004943847656, |
| "grad_norm": 0.13137495517730713, |
| "learning_rate": 3.240467553068481e-07, |
| "entropy": 1.2717559725046157, |
| "num_tokens": 26055446.0, |
| "mean_token_accuracy": 0.6905860707163811, |
| "epoch": 1.8789369048864706, |
| "step": 4490 |
| }, |
| { |
| "loss": 1.408462142944336, |
| "grad_norm": 0.13120928406715393, |
| "learning_rate": 3.2193420574177034e-07, |
| "entropy": 1.3706548005342483, |
| "num_tokens": 26111925.0, |
| "mean_token_accuracy": 0.6645766496658325, |
| "epoch": 1.8831223187192634, |
| "step": 4500 |
| }, |
| { |
| "loss": 1.2302813529968262, |
| "grad_norm": 0.15121367573738098, |
| "learning_rate": 3.1982529038766505e-07, |
| "entropy": 1.274702313542366, |
| "num_tokens": 26171418.0, |
| "mean_token_accuracy": 0.6942442029714584, |
| "epoch": 1.8873077325520562, |
| "step": 4510 |
| }, |
| { |
| "loss": 1.2387846946716308, |
| "grad_norm": 0.11012545973062515, |
| "learning_rate": 3.1772005228616933e-07, |
| "entropy": 1.2893740877509117, |
| "num_tokens": 26232638.0, |
| "mean_token_accuracy": 0.6922576785087585, |
| "epoch": 1.8914931463848488, |
| "step": 4520 |
| }, |
| { |
| "loss": 1.292118453979492, |
| "grad_norm": 0.14446091651916504, |
| "learning_rate": 3.156185344038699e-07, |
| "entropy": 1.3311437577009202, |
| "num_tokens": 26293065.0, |
| "mean_token_accuracy": 0.6810448184609413, |
| "epoch": 1.8956785602176414, |
| "step": 4530 |
| }, |
| { |
| "loss": 1.33145170211792, |
| "grad_norm": 0.14474819600582123, |
| "learning_rate": 3.135207796314263e-07, |
| "entropy": 1.3151442527770996, |
| "num_tokens": 26349311.0, |
| "mean_token_accuracy": 0.6806560069322586, |
| "epoch": 1.8998639740504344, |
| "step": 4540 |
| }, |
| { |
| "loss": 1.2638792037963866, |
| "grad_norm": 0.11845609545707703, |
| "learning_rate": 3.114268307826953e-07, |
| "entropy": 1.2752373963594437, |
| "num_tokens": 26407067.0, |
| "mean_token_accuracy": 0.6892140090465546, |
| "epoch": 1.904049387883227, |
| "step": 4550 |
| }, |
| { |
| "loss": 1.341792106628418, |
| "grad_norm": 0.1555166393518448, |
| "learning_rate": 3.093367305938572e-07, |
| "entropy": 1.3313662111759186, |
| "num_tokens": 26463271.0, |
| "mean_token_accuracy": 0.6772884294390679, |
| "epoch": 1.9082348017160196, |
| "step": 4560 |
| }, |
| { |
| "loss": 1.2394842147827148, |
| "grad_norm": 0.13164710998535156, |
| "learning_rate": 3.072505217225435e-07, |
| "entropy": 1.2927237793803215, |
| "num_tokens": 26519442.0, |
| "mean_token_accuracy": 0.688689187169075, |
| "epoch": 1.9124202155488124, |
| "step": 4570 |
| }, |
| { |
| "loss": 1.2526350021362305, |
| "grad_norm": 0.12433302402496338, |
| "learning_rate": 3.051682467469663e-07, |
| "entropy": 1.3005468085408212, |
| "num_tokens": 26576793.0, |
| "mean_token_accuracy": 0.6895026102662086, |
| "epoch": 1.9166056293816052, |
| "step": 4580 |
| }, |
| { |
| "loss": 1.300935935974121, |
| "grad_norm": 0.14517027139663696, |
| "learning_rate": 3.030899481650496e-07, |
| "entropy": 1.3120550215244293, |
| "num_tokens": 26632676.0, |
| "mean_token_accuracy": 0.6833431273698807, |
| "epoch": 1.9207910432143978, |
| "step": 4590 |
| }, |
| { |
| "loss": 1.305576705932617, |
| "grad_norm": 0.13038092851638794, |
| "learning_rate": 3.010156683935614e-07, |
| "entropy": 1.3124109566211701, |
| "num_tokens": 26690585.0, |
| "mean_token_accuracy": 0.6795723259449005, |
| "epoch": 1.9249764570471906, |
| "step": 4600 |
| }, |
| { |
| "eval_loss": 1.299921989440918, |
| "eval_runtime": 43.8297, |
| "eval_samples_per_second": 145.358, |
| "eval_steps_per_second": 6.069, |
| "eval_entropy": 1.3205744122204028, |
| "eval_num_tokens": 26690585.0, |
| "eval_mean_token_accuracy": 0.6834659193243299, |
| "epoch": 1.9249764570471906, |
| "step": 4600 |
| }, |
| { |
| "loss": 1.2759065628051758, |
| "grad_norm": 0.13065700232982635, |
| "learning_rate": 2.9894544976724845e-07, |
| "entropy": 1.3232569113373756, |
| "num_tokens": 26750126.0, |
| "mean_token_accuracy": 0.6856075286865234, |
| "epoch": 1.9291618708799834, |
| "step": 4610 |
| }, |
| { |
| "loss": 1.3960214614868165, |
| "grad_norm": 0.13134630024433136, |
| "learning_rate": 2.968793345379722e-07, |
| "entropy": 1.3706552177667617, |
| "num_tokens": 26809589.0, |
| "mean_token_accuracy": 0.6644584119319916, |
| "epoch": 1.933347284712776, |
| "step": 4620 |
| }, |
| { |
| "loss": 1.2981806755065919, |
| "grad_norm": 0.13195905089378357, |
| "learning_rate": 2.9481736487384615e-07, |
| "entropy": 1.2926361411809921, |
| "num_tokens": 26868122.0, |
| "mean_token_accuracy": 0.6837931454181672, |
| "epoch": 1.9375326985455685, |
| "step": 4630 |
| }, |
| { |
| "loss": 1.3205986022949219, |
| "grad_norm": 0.1443631947040558, |
| "learning_rate": 2.9275958285837567e-07, |
| "entropy": 1.3107433021068573, |
| "num_tokens": 26928383.0, |
| "mean_token_accuracy": 0.6803469866514206, |
| "epoch": 1.9417181123783616, |
| "step": 4640 |
| }, |
| { |
| "loss": 1.2733318328857421, |
| "grad_norm": 0.12333279103040695, |
| "learning_rate": 2.907060304895984e-07, |
| "entropy": 1.306384412944317, |
| "num_tokens": 26987347.0, |
| "mean_token_accuracy": 0.6883521243929863, |
| "epoch": 1.9459035262111541, |
| "step": 4650 |
| }, |
| { |
| "loss": 1.2352895736694336, |
| "grad_norm": 0.15071088075637817, |
| "learning_rate": 2.8865674967922815e-07, |
| "entropy": 1.2537823468446732, |
| "num_tokens": 27044802.0, |
| "mean_token_accuracy": 0.6937527641654014, |
| "epoch": 1.9500889400439467, |
| "step": 4660 |
| }, |
| { |
| "loss": 1.2543825149536132, |
| "grad_norm": 0.11400660872459412, |
| "learning_rate": 2.866117822517982e-07, |
| "entropy": 1.2866296932101249, |
| "num_tokens": 27102535.0, |
| "mean_token_accuracy": 0.6873494073748588, |
| "epoch": 1.9542743538767395, |
| "step": 4670 |
| }, |
| { |
| "loss": 1.3186541557312013, |
| "grad_norm": 0.13829229772090912, |
| "learning_rate": 2.8457116994380913e-07, |
| "entropy": 1.331754493713379, |
| "num_tokens": 27160092.0, |
| "mean_token_accuracy": 0.6785844698548317, |
| "epoch": 1.9584597677095323, |
| "step": 4680 |
| }, |
| { |
| "loss": 1.305195140838623, |
| "grad_norm": 0.13482722640037537, |
| "learning_rate": 2.8253495440287555e-07, |
| "entropy": 1.3345891624689101, |
| "num_tokens": 27216273.0, |
| "mean_token_accuracy": 0.6796174451708794, |
| "epoch": 1.962645181542325, |
| "step": 4690 |
| }, |
| { |
| "loss": 1.2968315124511718, |
| "grad_norm": 0.13567803800106049, |
| "learning_rate": 2.805031771868774e-07, |
| "entropy": 1.3210385277867318, |
| "num_tokens": 27274967.0, |
| "mean_token_accuracy": 0.6838418498635292, |
| "epoch": 1.9668305953751177, |
| "step": 4700 |
| }, |
| { |
| "loss": 1.2280590057373046, |
| "grad_norm": 0.11562594771385193, |
| "learning_rate": 2.784758797631113e-07, |
| "entropy": 1.2723073571920396, |
| "num_tokens": 27332706.0, |
| "mean_token_accuracy": 0.693420697748661, |
| "epoch": 1.9710160092079105, |
| "step": 4710 |
| }, |
| { |
| "loss": 1.2968725204467773, |
| "grad_norm": 0.1304372102022171, |
| "learning_rate": 2.7645310350744293e-07, |
| "entropy": 1.3245373040437698, |
| "num_tokens": 27391429.0, |
| "mean_token_accuracy": 0.685273765027523, |
| "epoch": 1.9752014230407031, |
| "step": 4720 |
| }, |
| { |
| "loss": 1.2400999069213867, |
| "grad_norm": 0.12765555083751678, |
| "learning_rate": 2.744348897034657e-07, |
| "entropy": 1.2704340279102326, |
| "num_tokens": 27449195.0, |
| "mean_token_accuracy": 0.6926597207784653, |
| "epoch": 1.9793868368734957, |
| "step": 4730 |
| }, |
| { |
| "loss": 1.313642406463623, |
| "grad_norm": 0.12744539976119995, |
| "learning_rate": 2.724212795416544e-07, |
| "entropy": 1.323761799931526, |
| "num_tokens": 27507409.0, |
| "mean_token_accuracy": 0.6781192749738694, |
| "epoch": 1.9835722507062887, |
| "step": 4740 |
| }, |
| { |
| "loss": 1.3404882431030274, |
| "grad_norm": 0.1226453185081482, |
| "learning_rate": 2.704123141185275e-07, |
| "entropy": 1.3297797441482544, |
| "num_tokens": 27562728.0, |
| "mean_token_accuracy": 0.6767387732863426, |
| "epoch": 1.9877576645390813, |
| "step": 4750 |
| }, |
| { |
| "loss": 1.3537111282348633, |
| "grad_norm": 0.1401350498199463, |
| "learning_rate": 2.6840803443580715e-07, |
| "entropy": 1.3468406647443771, |
| "num_tokens": 27622323.0, |
| "mean_token_accuracy": 0.6730136394500732, |
| "epoch": 1.991943078371874, |
| "step": 4760 |
| }, |
| { |
| "loss": 1.337536907196045, |
| "grad_norm": 0.13678760826587677, |
| "learning_rate": 2.664084813995818e-07, |
| "entropy": 1.3439167469739914, |
| "num_tokens": 27679189.0, |
| "mean_token_accuracy": 0.676570326089859, |
| "epoch": 1.9961284922046667, |
| "step": 4770 |
| }, |
| { |
| "loss": 1.2568793296813965, |
| "grad_norm": 0.37100762128829956, |
| "learning_rate": 2.644136958194727e-07, |
| "entropy": 1.2735960676863387, |
| "num_tokens": 27730417.0, |
| "mean_token_accuracy": 0.6950372699144725, |
| "epoch": 2.0, |
| "step": 4780 |
| }, |
| { |
| "loss": 1.2392065048217773, |
| "grad_norm": 0.12272343039512634, |
| "learning_rate": 2.624237184078004e-07, |
| "entropy": 1.2709258124232292, |
| "num_tokens": 27790663.0, |
| "mean_token_accuracy": 0.6911343216896058, |
| "epoch": 2.0041854138327926, |
| "step": 4790 |
| }, |
| { |
| "loss": 1.2543585777282715, |
| "grad_norm": 0.11581531912088394, |
| "learning_rate": 2.6043858977875287e-07, |
| "entropy": 1.3081357836723329, |
| "num_tokens": 27848150.0, |
| "mean_token_accuracy": 0.6878180950880051, |
| "epoch": 2.0083708276655856, |
| "step": 4800 |
| }, |
| { |
| "eval_loss": 1.2991561889648438, |
| "eval_runtime": 42.5158, |
| "eval_samples_per_second": 149.85, |
| "eval_steps_per_second": 6.256, |
| "eval_entropy": 1.2998529121391755, |
| "eval_num_tokens": 27848150.0, |
| "eval_mean_token_accuracy": 0.683588629378412, |
| "epoch": 2.0083708276655856, |
| "step": 4800 |
| }, |
| { |
| "loss": 1.26077880859375, |
| "grad_norm": 0.1441148817539215, |
| "learning_rate": 2.584583504475587e-07, |
| "entropy": 1.2779432222247125, |
| "num_tokens": 27905176.0, |
| "mean_token_accuracy": 0.6864334151148797, |
| "epoch": 2.012556241498378, |
| "step": 4810 |
| }, |
| { |
| "loss": 1.2669543266296386, |
| "grad_norm": 0.1253192126750946, |
| "learning_rate": 2.5648304082965775e-07, |
| "entropy": 1.2866142064332962, |
| "num_tokens": 27963167.0, |
| "mean_token_accuracy": 0.6874963492155075, |
| "epoch": 2.016741655331171, |
| "step": 4820 |
| }, |
| { |
| "loss": 1.3348177909851073, |
| "grad_norm": 0.14420834183692932, |
| "learning_rate": 2.5451270123987843e-07, |
| "entropy": 1.3393577009439468, |
| "num_tokens": 28020052.0, |
| "mean_token_accuracy": 0.6757590815424919, |
| "epoch": 2.020927069163964, |
| "step": 4830 |
| }, |
| { |
| "loss": 1.2622191429138183, |
| "grad_norm": 0.13085411489009857, |
| "learning_rate": 2.5254737189161373e-07, |
| "entropy": 1.3007038220763207, |
| "num_tokens": 28078981.0, |
| "mean_token_accuracy": 0.6892907366156578, |
| "epoch": 2.0251124829967564, |
| "step": 4840 |
| }, |
| { |
| "loss": 1.3640257835388183, |
| "grad_norm": 0.15776140987873077, |
| "learning_rate": 2.5058709289600067e-07, |
| "entropy": 1.3638720154762267, |
| "num_tokens": 28134625.0, |
| "mean_token_accuracy": 0.6710668623447418, |
| "epoch": 2.029297896829549, |
| "step": 4850 |
| }, |
| { |
| "loss": 1.2844989776611329, |
| "grad_norm": 0.11809894442558289, |
| "learning_rate": 2.486319042611019e-07, |
| "entropy": 1.2931891083717346, |
| "num_tokens": 28194798.0, |
| "mean_token_accuracy": 0.6872560605406761, |
| "epoch": 2.0334833106623416, |
| "step": 4860 |
| }, |
| { |
| "loss": 1.3038467407226562, |
| "grad_norm": 0.1337338089942932, |
| "learning_rate": 2.4668184589108867e-07, |
| "entropy": 1.33267682492733, |
| "num_tokens": 28252833.0, |
| "mean_token_accuracy": 0.6824387982487679, |
| "epoch": 2.0376687244951346, |
| "step": 4870 |
| }, |
| { |
| "loss": 1.3354660034179688, |
| "grad_norm": 0.15302719175815582, |
| "learning_rate": 2.4473695758542707e-07, |
| "entropy": 1.343110579252243, |
| "num_tokens": 28312031.0, |
| "mean_token_accuracy": 0.6740260154008866, |
| "epoch": 2.041854138327927, |
| "step": 4880 |
| }, |
| { |
| "loss": 1.2677290916442872, |
| "grad_norm": 0.13939176499843597, |
| "learning_rate": 2.4279727903806556e-07, |
| "entropy": 1.2891633421182633, |
| "num_tokens": 28370177.0, |
| "mean_token_accuracy": 0.6864463344216347, |
| "epoch": 2.0460395521607198, |
| "step": 4890 |
| }, |
| { |
| "loss": 1.2482310295104981, |
| "grad_norm": 0.1433423012495041, |
| "learning_rate": 2.408628498366242e-07, |
| "entropy": 1.2710548743605614, |
| "num_tokens": 28429335.0, |
| "mean_token_accuracy": 0.6896666899323464, |
| "epoch": 2.050224965993513, |
| "step": 4900 |
| }, |
| { |
| "loss": 1.2301183700561524, |
| "grad_norm": 0.12237236648797989, |
| "learning_rate": 2.389337094615875e-07, |
| "entropy": 1.278349894285202, |
| "num_tokens": 28489915.0, |
| "mean_token_accuracy": 0.6930847212672233, |
| "epoch": 2.0544103798263054, |
| "step": 4910 |
| }, |
| { |
| "loss": 1.2912443161010743, |
| "grad_norm": 0.17072777450084686, |
| "learning_rate": 2.370098972854987e-07, |
| "entropy": 1.277844424545765, |
| "num_tokens": 28547300.0, |
| "mean_token_accuracy": 0.68348438590765, |
| "epoch": 2.058595793659098, |
| "step": 4920 |
| }, |
| { |
| "loss": 1.2300211906433105, |
| "grad_norm": 0.12249208986759186, |
| "learning_rate": 2.3509145257215495e-07, |
| "entropy": 1.2522578805685043, |
| "num_tokens": 28607326.0, |
| "mean_token_accuracy": 0.6944442689418793, |
| "epoch": 2.0627812074918905, |
| "step": 4930 |
| }, |
| { |
| "loss": 1.2790916442871094, |
| "grad_norm": 0.22464126348495483, |
| "learning_rate": 2.3317841447580767e-07, |
| "entropy": 1.2809948831796647, |
| "num_tokens": 28661604.0, |
| "mean_token_accuracy": 0.6859934300184249, |
| "epoch": 2.0669666213246836, |
| "step": 4940 |
| }, |
| { |
| "loss": 1.2724569320678711, |
| "grad_norm": 0.13732433319091797, |
| "learning_rate": 2.312708220403623e-07, |
| "entropy": 1.31765376329422, |
| "num_tokens": 28718818.0, |
| "mean_token_accuracy": 0.6880935072898865, |
| "epoch": 2.071152035157476, |
| "step": 4950 |
| }, |
| { |
| "loss": 1.3442106246948242, |
| "grad_norm": 0.12912045419216156, |
| "learning_rate": 2.2936871419858194e-07, |
| "entropy": 1.3523348033428193, |
| "num_tokens": 28775584.0, |
| "mean_token_accuracy": 0.6736010074615478, |
| "epoch": 2.0753374489902687, |
| "step": 4960 |
| }, |
| { |
| "loss": 1.1986734390258789, |
| "grad_norm": 0.13469566404819489, |
| "learning_rate": 2.2747212977129217e-07, |
| "entropy": 1.2723553344607352, |
| "num_tokens": 28835156.0, |
| "mean_token_accuracy": 0.6995670750737191, |
| "epoch": 2.0795228628230618, |
| "step": 4970 |
| }, |
| { |
| "loss": 1.3475229263305664, |
| "grad_norm": 0.1290500909090042, |
| "learning_rate": 2.2558110746658953e-07, |
| "entropy": 1.3560008838772775, |
| "num_tokens": 28895271.0, |
| "mean_token_accuracy": 0.6725587636232376, |
| "epoch": 2.0837082766558543, |
| "step": 4980 |
| }, |
| { |
| "loss": 1.342457389831543, |
| "grad_norm": 0.1255948692560196, |
| "learning_rate": 2.236956858790513e-07, |
| "entropy": 1.3365329071879386, |
| "num_tokens": 28953018.0, |
| "mean_token_accuracy": 0.679172757267952, |
| "epoch": 2.087893690488647, |
| "step": 4990 |
| }, |
| { |
| "loss": 1.2630849838256837, |
| "grad_norm": 0.13795071840286255, |
| "learning_rate": 2.218159034889469e-07, |
| "entropy": 1.2892632216215134, |
| "num_tokens": 29012433.0, |
| "mean_token_accuracy": 0.6912027075886726, |
| "epoch": 2.09207910432144, |
| "step": 5000 |
| }, |
| { |
| "eval_loss": 1.298377513885498, |
| "eval_runtime": 43.7919, |
| "eval_samples_per_second": 145.484, |
| "eval_steps_per_second": 6.074, |
| "eval_entropy": 1.3099012760291422, |
| "eval_num_tokens": 29012433.0, |
| "eval_mean_token_accuracy": 0.6836510939257485, |
| "epoch": 2.09207910432144, |
| "step": 5000 |
| }, |
| { |
| "loss": 1.2960596084594727, |
| "grad_norm": 0.11307420581579208, |
| "learning_rate": 2.1994179866145396e-07, |
| "entropy": 1.3118484735488891, |
| "num_tokens": 29070217.0, |
| "mean_token_accuracy": 0.6797604545950889, |
| "epoch": 2.0962645181542325, |
| "step": 5010 |
| }, |
| { |
| "loss": 1.3223162651062013, |
| "grad_norm": 0.15304112434387207, |
| "learning_rate": 2.180734096458746e-07, |
| "entropy": 1.3404868721961976, |
| "num_tokens": 29126476.0, |
| "mean_token_accuracy": 0.6759276837110519, |
| "epoch": 2.100449931987025, |
| "step": 5020 |
| }, |
| { |
| "loss": 1.3450361251831056, |
| "grad_norm": 0.11615368723869324, |
| "learning_rate": 2.1621077457485427e-07, |
| "entropy": 1.3462235242128373, |
| "num_tokens": 29184125.0, |
| "mean_token_accuracy": 0.6701866090297699, |
| "epoch": 2.104635345819818, |
| "step": 5030 |
| }, |
| { |
| "loss": 1.3292051315307618, |
| "grad_norm": 0.1241302341222763, |
| "learning_rate": 2.1435393146360453e-07, |
| "entropy": 1.3317017763853074, |
| "num_tokens": 29243309.0, |
| "mean_token_accuracy": 0.6787121832370758, |
| "epoch": 2.1088207596526107, |
| "step": 5040 |
| }, |
| { |
| "loss": 1.309870719909668, |
| "grad_norm": 0.12809441983699799, |
| "learning_rate": 2.1250291820912648e-07, |
| "entropy": 1.3308863699436189, |
| "num_tokens": 29302274.0, |
| "mean_token_accuracy": 0.6813490375876426, |
| "epoch": 2.1130061734854033, |
| "step": 5050 |
| }, |
| { |
| "loss": 1.259312343597412, |
| "grad_norm": 0.11709679663181305, |
| "learning_rate": 2.1065777258943763e-07, |
| "entropy": 1.2945900693535806, |
| "num_tokens": 29359001.0, |
| "mean_token_accuracy": 0.6841064542531967, |
| "epoch": 2.117191587318196, |
| "step": 5060 |
| }, |
| { |
| "loss": 1.1917829513549805, |
| "grad_norm": 0.13013018667697906, |
| "learning_rate": 2.0881853226280082e-07, |
| "entropy": 1.252656841278076, |
| "num_tokens": 29417257.0, |
| "mean_token_accuracy": 0.7048160001635552, |
| "epoch": 2.121377001150989, |
| "step": 5070 |
| }, |
| { |
| "loss": 1.2949867248535156, |
| "grad_norm": 0.15123531222343445, |
| "learning_rate": 2.0698523476695506e-07, |
| "entropy": 1.316368493437767, |
| "num_tokens": 29474012.0, |
| "mean_token_accuracy": 0.6840205147862435, |
| "epoch": 2.1255624149837815, |
| "step": 5080 |
| }, |
| { |
| "loss": 1.231495475769043, |
| "grad_norm": 0.13549339771270752, |
| "learning_rate": 2.0515791751835066e-07, |
| "entropy": 1.261933021247387, |
| "num_tokens": 29535364.0, |
| "mean_token_accuracy": 0.6923261538147927, |
| "epoch": 2.129747828816574, |
| "step": 5090 |
| }, |
| { |
| "loss": 1.265492820739746, |
| "grad_norm": 0.12323841452598572, |
| "learning_rate": 2.0333661781138406e-07, |
| "entropy": 1.2891878262162209, |
| "num_tokens": 29594045.0, |
| "mean_token_accuracy": 0.6874890491366387, |
| "epoch": 2.133933242649367, |
| "step": 5100 |
| }, |
| { |
| "loss": 1.3455522537231446, |
| "grad_norm": 0.12925904989242554, |
| "learning_rate": 2.015213728176381e-07, |
| "entropy": 1.355113722383976, |
| "num_tokens": 29654672.0, |
| "mean_token_accuracy": 0.6736163109540939, |
| "epoch": 2.1381186564821597, |
| "step": 5110 |
| }, |
| { |
| "loss": 1.2876879692077636, |
| "grad_norm": 0.10625462979078293, |
| "learning_rate": 1.9971221958512259e-07, |
| "entropy": 1.308001670241356, |
| "num_tokens": 29713404.0, |
| "mean_token_accuracy": 0.6850254252552986, |
| "epoch": 2.1423040703149523, |
| "step": 5120 |
| }, |
| { |
| "loss": 1.269423484802246, |
| "grad_norm": 0.14946334064006805, |
| "learning_rate": 1.9790919503751786e-07, |
| "entropy": 1.2912926644086837, |
| "num_tokens": 29768834.0, |
| "mean_token_accuracy": 0.6910573810338974, |
| "epoch": 2.146489484147745, |
| "step": 5130 |
| }, |
| { |
| "loss": 1.3150415420532227, |
| "grad_norm": 0.15966582298278809, |
| "learning_rate": 1.961123359734222e-07, |
| "entropy": 1.3350969046354293, |
| "num_tokens": 29823986.0, |
| "mean_token_accuracy": 0.6827343329787254, |
| "epoch": 2.150674897980538, |
| "step": 5140 |
| }, |
| { |
| "loss": 1.2534076690673828, |
| "grad_norm": 0.13799019157886505, |
| "learning_rate": 1.9432167906560025e-07, |
| "entropy": 1.2794459909200668, |
| "num_tokens": 29882161.0, |
| "mean_token_accuracy": 0.6894301295280456, |
| "epoch": 2.1548603118133305, |
| "step": 5150 |
| }, |
| { |
| "loss": 1.226758861541748, |
| "grad_norm": 0.16427931189537048, |
| "learning_rate": 1.9253726086023376e-07, |
| "entropy": 1.2521668612957, |
| "num_tokens": 29938237.0, |
| "mean_token_accuracy": 0.6923803791403771, |
| "epoch": 2.159045725646123, |
| "step": 5160 |
| }, |
| { |
| "loss": 1.2537633895874023, |
| "grad_norm": 0.13021980226039886, |
| "learning_rate": 1.9075911777617776e-07, |
| "entropy": 1.2832251608371734, |
| "num_tokens": 29993951.0, |
| "mean_token_accuracy": 0.6919069468975068, |
| "epoch": 2.163231139478916, |
| "step": 5170 |
| }, |
| { |
| "loss": 1.2582441329956056, |
| "grad_norm": 0.13316968083381653, |
| "learning_rate": 1.8898728610421473e-07, |
| "entropy": 1.2960840493440628, |
| "num_tokens": 30053405.0, |
| "mean_token_accuracy": 0.6867008566856384, |
| "epoch": 2.1674165533117087, |
| "step": 5180 |
| }, |
| { |
| "loss": 1.2535063743591308, |
| "grad_norm": 0.1502976417541504, |
| "learning_rate": 1.8722180200631598e-07, |
| "entropy": 1.291701939702034, |
| "num_tokens": 30111434.0, |
| "mean_token_accuracy": 0.6882436692714691, |
| "epoch": 2.1716019671445013, |
| "step": 5190 |
| }, |
| { |
| "loss": 1.2767062187194824, |
| "grad_norm": 0.1319260597229004, |
| "learning_rate": 1.8546270151490278e-07, |
| "entropy": 1.298856572806835, |
| "num_tokens": 30168307.0, |
| "mean_token_accuracy": 0.68586795181036, |
| "epoch": 2.1757873809772943, |
| "step": 5200 |
| }, |
| { |
| "eval_loss": 1.297808051109314, |
| "eval_runtime": 43.6552, |
| "eval_samples_per_second": 145.939, |
| "eval_steps_per_second": 6.093, |
| "eval_entropy": 1.307837866750875, |
| "eval_num_tokens": 30168307.0, |
| "eval_mean_token_accuracy": 0.6838091374339914, |
| "epoch": 2.1757873809772943, |
| "step": 5200 |
| }, |
| { |
| "loss": 1.2791316032409668, |
| "grad_norm": 0.12315330654382706, |
| "learning_rate": 1.8371002053211048e-07, |
| "entropy": 1.3057184204459191, |
| "num_tokens": 30225681.0, |
| "mean_token_accuracy": 0.6861935615539551, |
| "epoch": 2.179972794810087, |
| "step": 5210 |
| }, |
| { |
| "loss": 1.32224760055542, |
| "grad_norm": 0.13483846187591553, |
| "learning_rate": 1.819637948290569e-07, |
| "entropy": 1.3247323662042618, |
| "num_tokens": 30283602.0, |
| "mean_token_accuracy": 0.677856071293354, |
| "epoch": 2.1841582086428795, |
| "step": 5220 |
| }, |
| { |
| "loss": 1.245813751220703, |
| "grad_norm": 0.12423646450042725, |
| "learning_rate": 1.8022406004511114e-07, |
| "entropy": 1.2820057839155197, |
| "num_tokens": 30343652.0, |
| "mean_token_accuracy": 0.6916850328445434, |
| "epoch": 2.1883436224756725, |
| "step": 5230 |
| }, |
| { |
| "loss": 1.313099193572998, |
| "grad_norm": 0.1301707625389099, |
| "learning_rate": 1.7849085168716704e-07, |
| "entropy": 1.3053890287876129, |
| "num_tokens": 30400983.0, |
| "mean_token_accuracy": 0.6804193690419197, |
| "epoch": 2.192529036308465, |
| "step": 5240 |
| }, |
| { |
| "loss": 1.2556833267211913, |
| "grad_norm": 0.1505342423915863, |
| "learning_rate": 1.7676420512891842e-07, |
| "entropy": 1.2873410269618035, |
| "num_tokens": 30459009.0, |
| "mean_token_accuracy": 0.6887684732675552, |
| "epoch": 2.1967144501412577, |
| "step": 5250 |
| }, |
| { |
| "loss": 1.2559351921081543, |
| "grad_norm": 0.13919785618782043, |
| "learning_rate": 1.7504415561013614e-07, |
| "entropy": 1.2811901897192002, |
| "num_tokens": 30516861.0, |
| "mean_token_accuracy": 0.6915321722626686, |
| "epoch": 2.2008998639740502, |
| "step": 5260 |
| }, |
| { |
| "loss": 1.2761926651000977, |
| "grad_norm": 0.12455730140209198, |
| "learning_rate": 1.7333073823595025e-07, |
| "entropy": 1.2844579115509986, |
| "num_tokens": 30575979.0, |
| "mean_token_accuracy": 0.6861526161432266, |
| "epoch": 2.2050852778068433, |
| "step": 5270 |
| }, |
| { |
| "loss": 1.2840014457702638, |
| "grad_norm": 0.13087549805641174, |
| "learning_rate": 1.7162398797613282e-07, |
| "entropy": 1.2940828785300256, |
| "num_tokens": 30633600.0, |
| "mean_token_accuracy": 0.685159420967102, |
| "epoch": 2.209270691639636, |
| "step": 5280 |
| }, |
| { |
| "loss": 1.3325956344604493, |
| "grad_norm": 0.15323391556739807, |
| "learning_rate": 1.6992393966438405e-07, |
| "entropy": 1.3237911939620972, |
| "num_tokens": 30693015.0, |
| "mean_token_accuracy": 0.6795177638530732, |
| "epoch": 2.2134561054724284, |
| "step": 5290 |
| }, |
| { |
| "loss": 1.310387420654297, |
| "grad_norm": 0.12490073591470718, |
| "learning_rate": 1.6823062799762205e-07, |
| "entropy": 1.3257877498865127, |
| "num_tokens": 30749233.0, |
| "mean_token_accuracy": 0.6818015187978744, |
| "epoch": 2.2176415193052215, |
| "step": 5300 |
| }, |
| { |
| "loss": 1.2663789749145509, |
| "grad_norm": 0.1327386498451233, |
| "learning_rate": 1.6654408753527361e-07, |
| "entropy": 1.3193859189748764, |
| "num_tokens": 30809674.0, |
| "mean_token_accuracy": 0.6879936501383781, |
| "epoch": 2.221826933138014, |
| "step": 5310 |
| }, |
| { |
| "loss": 1.3301811218261719, |
| "grad_norm": 0.14070047438144684, |
| "learning_rate": 1.6486435269856985e-07, |
| "entropy": 1.3461501210927964, |
| "num_tokens": 30867279.0, |
| "mean_token_accuracy": 0.6762196362018585, |
| "epoch": 2.2260123469708066, |
| "step": 5320 |
| }, |
| { |
| "loss": 1.2323862075805665, |
| "grad_norm": 0.14718832075595856, |
| "learning_rate": 1.6319145776984361e-07, |
| "entropy": 1.2663889586925507, |
| "num_tokens": 30923604.0, |
| "mean_token_accuracy": 0.6963629499077797, |
| "epoch": 2.2301977608035997, |
| "step": 5330 |
| }, |
| { |
| "loss": 1.4061556816101075, |
| "grad_norm": 0.11397302895784378, |
| "learning_rate": 1.6152543689182885e-07, |
| "entropy": 1.3801796600222587, |
| "num_tokens": 30983941.0, |
| "mean_token_accuracy": 0.6657746851444244, |
| "epoch": 2.2343831746363922, |
| "step": 5340 |
| }, |
| { |
| "loss": 1.3607137680053711, |
| "grad_norm": 0.13541868329048157, |
| "learning_rate": 1.5986632406696515e-07, |
| "entropy": 1.3243082225322724, |
| "num_tokens": 31042120.0, |
| "mean_token_accuracy": 0.6700464963912964, |
| "epoch": 2.238568588469185, |
| "step": 5350 |
| }, |
| { |
| "loss": 1.2886553764343263, |
| "grad_norm": 0.1389724761247635, |
| "learning_rate": 1.5821415315670251e-07, |
| "entropy": 1.3397713720798492, |
| "num_tokens": 31102163.0, |
| "mean_token_accuracy": 0.6880467623472214, |
| "epoch": 2.2427540023019774, |
| "step": 5360 |
| }, |
| { |
| "loss": 1.220026397705078, |
| "grad_norm": 0.1286465972661972, |
| "learning_rate": 1.5656895788081104e-07, |
| "entropy": 1.256170129776001, |
| "num_tokens": 31159675.0, |
| "mean_token_accuracy": 0.6972913891077042, |
| "epoch": 2.2469394161347704, |
| "step": 5370 |
| }, |
| { |
| "loss": 1.2820704460144043, |
| "grad_norm": 0.1213146299123764, |
| "learning_rate": 1.5493077181669272e-07, |
| "entropy": 1.2981676012277603, |
| "num_tokens": 31219684.0, |
| "mean_token_accuracy": 0.688413429260254, |
| "epoch": 2.251124829967563, |
| "step": 5380 |
| }, |
| { |
| "loss": 1.2337745666503905, |
| "grad_norm": 0.1330552101135254, |
| "learning_rate": 1.532996283986957e-07, |
| "entropy": 1.2481247037649155, |
| "num_tokens": 31284113.0, |
| "mean_token_accuracy": 0.694562304019928, |
| "epoch": 2.2553102438003556, |
| "step": 5390 |
| }, |
| { |
| "loss": 1.3011648178100585, |
| "grad_norm": 0.14552603662014008, |
| "learning_rate": 1.5167556091743238e-07, |
| "entropy": 1.3327119797468185, |
| "num_tokens": 31344186.0, |
| "mean_token_accuracy": 0.6838112965226173, |
| "epoch": 2.2594956576331486, |
| "step": 5400 |
| }, |
| { |
| "eval_loss": 1.297374963760376, |
| "eval_runtime": 43.7773, |
| "eval_samples_per_second": 145.532, |
| "eval_steps_per_second": 6.076, |
| "eval_entropy": 1.315984815135038, |
| "eval_num_tokens": 31344186.0, |
| "eval_mean_token_accuracy": 0.6838296656321762, |
| "epoch": 2.2594956576331486, |
| "step": 5400 |
| }, |
| { |
| "loss": 1.256122875213623, |
| "grad_norm": 0.11425146460533142, |
| "learning_rate": 1.5005860251909918e-07, |
| "entropy": 1.2993682414293288, |
| "num_tokens": 31399330.0, |
| "mean_token_accuracy": 0.6879714965820313, |
| "epoch": 2.263681071465941, |
| "step": 5410 |
| }, |
| { |
| "loss": 1.243597412109375, |
| "grad_norm": 0.14105035364627838, |
| "learning_rate": 1.4844878620480124e-07, |
| "entropy": 1.2901643484830856, |
| "num_tokens": 31458043.0, |
| "mean_token_accuracy": 0.6898476853966713, |
| "epoch": 2.267866485298734, |
| "step": 5420 |
| }, |
| { |
| "loss": 1.3062746047973632, |
| "grad_norm": 0.1269349455833435, |
| "learning_rate": 1.4684614482987805e-07, |
| "entropy": 1.3157608151435851, |
| "num_tokens": 31515675.0, |
| "mean_token_accuracy": 0.6781650841236114, |
| "epoch": 2.272051899131527, |
| "step": 5430 |
| }, |
| { |
| "loss": 1.3145343780517578, |
| "grad_norm": 0.13170845806598663, |
| "learning_rate": 1.452507111032329e-07, |
| "entropy": 1.3244775086641312, |
| "num_tokens": 31573727.0, |
| "mean_token_accuracy": 0.6795195579528809, |
| "epoch": 2.2762373129643194, |
| "step": 5440 |
| }, |
| { |
| "loss": 1.2743712425231934, |
| "grad_norm": 0.13130150735378265, |
| "learning_rate": 1.4366251758666558e-07, |
| "entropy": 1.3025973543524743, |
| "num_tokens": 31632527.0, |
| "mean_token_accuracy": 0.6849103718996048, |
| "epoch": 2.280422726797112, |
| "step": 5450 |
| }, |
| { |
| "loss": 1.2729061126708985, |
| "grad_norm": 0.14398252964019775, |
| "learning_rate": 1.4208159669420817e-07, |
| "entropy": 1.2966506034135818, |
| "num_tokens": 31688226.0, |
| "mean_token_accuracy": 0.6885296568274498, |
| "epoch": 2.2846081406299046, |
| "step": 5460 |
| }, |
| { |
| "loss": 1.308814811706543, |
| "grad_norm": 0.1466449350118637, |
| "learning_rate": 1.405079806914623e-07, |
| "entropy": 1.306171926856041, |
| "num_tokens": 31743518.0, |
| "mean_token_accuracy": 0.6793627932667732, |
| "epoch": 2.2887935544626976, |
| "step": 5470 |
| }, |
| { |
| "loss": 1.295256996154785, |
| "grad_norm": 0.12834027409553528, |
| "learning_rate": 1.389417016949419e-07, |
| "entropy": 1.316891822218895, |
| "num_tokens": 31800911.0, |
| "mean_token_accuracy": 0.6853345051407814, |
| "epoch": 2.29297896829549, |
| "step": 5480 |
| }, |
| { |
| "loss": 1.3327623367309571, |
| "grad_norm": 0.1662720888853073, |
| "learning_rate": 1.3738279167141725e-07, |
| "entropy": 1.3393938541412354, |
| "num_tokens": 31860118.0, |
| "mean_token_accuracy": 0.6735303267836571, |
| "epoch": 2.297164382128283, |
| "step": 5490 |
| }, |
| { |
| "loss": 1.279651165008545, |
| "grad_norm": 0.12805919349193573, |
| "learning_rate": 1.3583128243726227e-07, |
| "entropy": 1.2862314611673356, |
| "num_tokens": 31917654.0, |
| "mean_token_accuracy": 0.6885863587260246, |
| "epoch": 2.301349795961076, |
| "step": 5500 |
| }, |
| { |
| "loss": 1.2778766632080079, |
| "grad_norm": 0.16033422946929932, |
| "learning_rate": 1.3428720565780578e-07, |
| "entropy": 1.300406639277935, |
| "num_tokens": 31974868.0, |
| "mean_token_accuracy": 0.6882018774747849, |
| "epoch": 2.3055352097938684, |
| "step": 5510 |
| }, |
| { |
| "loss": 1.342056941986084, |
| "grad_norm": 0.16284961998462677, |
| "learning_rate": 1.327505928466842e-07, |
| "entropy": 1.3293492585420608, |
| "num_tokens": 32033943.0, |
| "mean_token_accuracy": 0.6768848091363907, |
| "epoch": 2.309720623626661, |
| "step": 5520 |
| }, |
| { |
| "loss": 1.206116008758545, |
| "grad_norm": 0.1340523660182953, |
| "learning_rate": 1.3122147536519985e-07, |
| "entropy": 1.258744315803051, |
| "num_tokens": 32095146.0, |
| "mean_token_accuracy": 0.6991240099072457, |
| "epoch": 2.3139060374594536, |
| "step": 5530 |
| }, |
| { |
| "loss": 1.255533218383789, |
| "grad_norm": 0.12596993148326874, |
| "learning_rate": 1.2969988442167934e-07, |
| "entropy": 1.2745139241218566, |
| "num_tokens": 32158070.0, |
| "mean_token_accuracy": 0.6862679213285446, |
| "epoch": 2.3180914512922466, |
| "step": 5540 |
| }, |
| { |
| "loss": 1.270913314819336, |
| "grad_norm": 0.14521045982837677, |
| "learning_rate": 1.2818585107083797e-07, |
| "entropy": 1.2917841017246245, |
| "num_tokens": 32213049.0, |
| "mean_token_accuracy": 0.688049279153347, |
| "epoch": 2.322276865125039, |
| "step": 5550 |
| }, |
| { |
| "loss": 1.2614711761474608, |
| "grad_norm": 0.12460001558065414, |
| "learning_rate": 1.2667940621314516e-07, |
| "entropy": 1.288702441751957, |
| "num_tokens": 32270375.0, |
| "mean_token_accuracy": 0.691080367565155, |
| "epoch": 2.326462278957832, |
| "step": 5560 |
| }, |
| { |
| "loss": 1.297041893005371, |
| "grad_norm": 0.12860845029354095, |
| "learning_rate": 1.2518058059419356e-07, |
| "entropy": 1.2874844074249268, |
| "num_tokens": 32327913.0, |
| "mean_token_accuracy": 0.6812907472252846, |
| "epoch": 2.330647692790625, |
| "step": 5570 |
| }, |
| { |
| "loss": 1.2307467460632324, |
| "grad_norm": 0.13343603909015656, |
| "learning_rate": 1.2368940480407242e-07, |
| "entropy": 1.2836890518665314, |
| "num_tokens": 32385583.0, |
| "mean_token_accuracy": 0.6963008731603623, |
| "epoch": 2.3348331066234174, |
| "step": 5580 |
| }, |
| { |
| "loss": 1.3668609619140626, |
| "grad_norm": 0.13145415484905243, |
| "learning_rate": 1.2220590927674286e-07, |
| "entropy": 1.3669442266225815, |
| "num_tokens": 32441025.0, |
| "mean_token_accuracy": 0.671772038936615, |
| "epoch": 2.33901852045621, |
| "step": 5590 |
| }, |
| { |
| "loss": 1.3068817138671875, |
| "grad_norm": 0.13144521415233612, |
| "learning_rate": 1.2073012428941588e-07, |
| "entropy": 1.3122945204377174, |
| "num_tokens": 32499899.0, |
| "mean_token_accuracy": 0.674852766096592, |
| "epoch": 2.343203934289003, |
| "step": 5600 |
| }, |
| { |
| "eval_loss": 1.2969086170196533, |
| "eval_runtime": 43.7803, |
| "eval_samples_per_second": 145.522, |
| "eval_steps_per_second": 6.076, |
| "eval_entropy": 1.312249709789018, |
| "eval_num_tokens": 32499899.0, |
| "eval_mean_token_accuracy": 0.6838675230965578, |
| "epoch": 2.343203934289003, |
| "step": 5600 |
| }, |
| { |
| "loss": 1.33385009765625, |
| "grad_norm": 0.125252828001976, |
| "learning_rate": 1.1926207996193638e-07, |
| "entropy": 1.3582130268216133, |
| "num_tokens": 32556560.0, |
| "mean_token_accuracy": 0.678239768743515, |
| "epoch": 2.3473893481217956, |
| "step": 5610 |
| }, |
| { |
| "loss": 1.247665023803711, |
| "grad_norm": 0.13559651374816895, |
| "learning_rate": 1.178018062561662e-07, |
| "entropy": 1.2727186426520347, |
| "num_tokens": 32617607.0, |
| "mean_token_accuracy": 0.6923329353332519, |
| "epoch": 2.351574761954588, |
| "step": 5620 |
| }, |
| { |
| "loss": 1.2945147514343263, |
| "grad_norm": 0.12413690984249115, |
| "learning_rate": 1.1634933297537425e-07, |
| "entropy": 1.3126976788043976, |
| "num_tokens": 32676183.0, |
| "mean_token_accuracy": 0.6811081647872925, |
| "epoch": 2.355760175787381, |
| "step": 5630 |
| }, |
| { |
| "loss": 1.2760995864868163, |
| "grad_norm": 0.15444409847259521, |
| "learning_rate": 1.1490468976362766e-07, |
| "entropy": 1.3008133977651597, |
| "num_tokens": 32732392.0, |
| "mean_token_accuracy": 0.6872219279408455, |
| "epoch": 2.3599455896201738, |
| "step": 5640 |
| }, |
| { |
| "loss": 1.302404022216797, |
| "grad_norm": 0.1389995664358139, |
| "learning_rate": 1.1346790610518636e-07, |
| "entropy": 1.3151475220918656, |
| "num_tokens": 32788966.0, |
| "mean_token_accuracy": 0.6797765508294106, |
| "epoch": 2.3641310034529663, |
| "step": 5650 |
| }, |
| { |
| "loss": 1.288191795349121, |
| "grad_norm": 0.14642177522182465, |
| "learning_rate": 1.1203901132390225e-07, |
| "entropy": 1.3152502685785294, |
| "num_tokens": 32849483.0, |
| "mean_token_accuracy": 0.6831487894058228, |
| "epoch": 2.368316417285759, |
| "step": 5660 |
| }, |
| { |
| "loss": 1.2493846893310547, |
| "grad_norm": 0.13448752462863922, |
| "learning_rate": 1.1061803458261976e-07, |
| "entropy": 1.2866099685430528, |
| "num_tokens": 32907775.0, |
| "mean_token_accuracy": 0.6911607295274734, |
| "epoch": 2.372501831118552, |
| "step": 5670 |
| }, |
| { |
| "loss": 1.2729656219482421, |
| "grad_norm": 0.1279105842113495, |
| "learning_rate": 1.0920500488258134e-07, |
| "entropy": 1.294448482990265, |
| "num_tokens": 32966950.0, |
| "mean_token_accuracy": 0.6881255716085434, |
| "epoch": 2.3766872449513445, |
| "step": 5680 |
| }, |
| { |
| "loss": 1.2728429794311524, |
| "grad_norm": 0.1403297632932663, |
| "learning_rate": 1.0779995106283552e-07, |
| "entropy": 1.2703639656305312, |
| "num_tokens": 33022913.0, |
| "mean_token_accuracy": 0.6847912818193436, |
| "epoch": 2.380872658784137, |
| "step": 5690 |
| }, |
| { |
| "loss": 1.299112606048584, |
| "grad_norm": 0.11831526458263397, |
| "learning_rate": 1.0640290179964756e-07, |
| "entropy": 1.324224580824375, |
| "num_tokens": 33079983.0, |
| "mean_token_accuracy": 0.6824282988905906, |
| "epoch": 2.38505807261693, |
| "step": 5700 |
| }, |
| { |
| "loss": 1.327120018005371, |
| "grad_norm": 0.13661810755729675, |
| "learning_rate": 1.0501388560591523e-07, |
| "entropy": 1.3056075662374496, |
| "num_tokens": 33136523.0, |
| "mean_token_accuracy": 0.677336810529232, |
| "epoch": 2.3892434864497227, |
| "step": 5710 |
| }, |
| { |
| "loss": 1.3516573905944824, |
| "grad_norm": 0.12402050942182541, |
| "learning_rate": 1.0363293083058622e-07, |
| "entropy": 1.3417491644620896, |
| "num_tokens": 33194784.0, |
| "mean_token_accuracy": 0.675346839427948, |
| "epoch": 2.3934289002825153, |
| "step": 5720 |
| }, |
| { |
| "loss": 1.3104659080505372, |
| "grad_norm": 0.13492602109909058, |
| "learning_rate": 1.0226006565807982e-07, |
| "entropy": 1.3131451904773712, |
| "num_tokens": 33251897.0, |
| "mean_token_accuracy": 0.6822021931409836, |
| "epoch": 2.3976143141153083, |
| "step": 5730 |
| }, |
| { |
| "loss": 1.3267166137695312, |
| "grad_norm": 0.13064873218536377, |
| "learning_rate": 1.0089531810771163e-07, |
| "entropy": 1.3214107781648636, |
| "num_tokens": 33307060.0, |
| "mean_token_accuracy": 0.6773762717843056, |
| "epoch": 2.401799727948101, |
| "step": 5740 |
| }, |
| { |
| "loss": 1.316312599182129, |
| "grad_norm": 0.15154863893985748, |
| "learning_rate": 9.953871603312141e-08, |
| "entropy": 1.3157601684331894, |
| "num_tokens": 33362416.0, |
| "mean_token_accuracy": 0.6785706043243408, |
| "epoch": 2.4059851417808935, |
| "step": 5750 |
| }, |
| { |
| "loss": 1.2769282341003418, |
| "grad_norm": 0.12917333841323853, |
| "learning_rate": 9.819028712170512e-08, |
| "entropy": 1.281336858868599, |
| "num_tokens": 33422722.0, |
| "mean_token_accuracy": 0.6885020643472671, |
| "epoch": 2.4101705556136865, |
| "step": 5760 |
| }, |
| { |
| "loss": 1.3640668869018555, |
| "grad_norm": 0.1428443044424057, |
| "learning_rate": 9.68500588940498e-08, |
| "entropy": 1.3538337886333465, |
| "num_tokens": 33483556.0, |
| "mean_token_accuracy": 0.6655726253986358, |
| "epoch": 2.414355969446479, |
| "step": 5770 |
| }, |
| { |
| "loss": 1.251881980895996, |
| "grad_norm": 0.14195656776428223, |
| "learning_rate": 9.551805870337104e-08, |
| "entropy": 1.2702584967017174, |
| "num_tokens": 33543254.0, |
| "mean_token_accuracy": 0.6900080740451813, |
| "epoch": 2.4185413832792717, |
| "step": 5780 |
| }, |
| { |
| "loss": 1.3439226150512695, |
| "grad_norm": 0.16237884759902954, |
| "learning_rate": 9.419431373495612e-08, |
| "entropy": 1.3545999929308892, |
| "num_tokens": 33601741.0, |
| "mean_token_accuracy": 0.6744641482830047, |
| "epoch": 2.4227267971120643, |
| "step": 5790 |
| }, |
| { |
| "loss": 1.314307975769043, |
| "grad_norm": 0.15124961733818054, |
| "learning_rate": 9.287885100560771e-08, |
| "entropy": 1.320368728041649, |
| "num_tokens": 33657327.0, |
| "mean_token_accuracy": 0.6819353699684143, |
| "epoch": 2.4269122109448573, |
| "step": 5800 |
| }, |
| { |
| "eval_loss": 1.2966619729995728, |
| "eval_runtime": 42.94, |
| "eval_samples_per_second": 148.37, |
| "eval_steps_per_second": 6.195, |
| "eval_entropy": 1.3098934839542646, |
| "eval_num_tokens": 33657327.0, |
| "eval_mean_token_accuracy": 0.6839417216921211, |
| "epoch": 2.4269122109448573, |
| "step": 5800 |
| }, |
| { |
| "loss": 1.2694414138793946, |
| "grad_norm": 0.1228335052728653, |
| "learning_rate": 9.157169736309384e-08, |
| "entropy": 1.29910968542099, |
| "num_tokens": 33713833.0, |
| "mean_token_accuracy": 0.6918980091810226, |
| "epoch": 2.43109762477765, |
| "step": 5810 |
| }, |
| { |
| "loss": 1.3777572631835937, |
| "grad_norm": 0.12483090162277222, |
| "learning_rate": 9.02728794855988e-08, |
| "entropy": 1.3498617202043532, |
| "num_tokens": 33771185.0, |
| "mean_token_accuracy": 0.6699911892414093, |
| "epoch": 2.4352830386104425, |
| "step": 5820 |
| }, |
| { |
| "loss": 1.3352084159851074, |
| "grad_norm": 0.14710542559623718, |
| "learning_rate": 8.898242388117949e-08, |
| "entropy": 1.3336048945784569, |
| "num_tokens": 33828941.0, |
| "mean_token_accuracy": 0.6756347686052322, |
| "epoch": 2.4394684524432355, |
| "step": 5830 |
| }, |
| { |
| "loss": 1.310356903076172, |
| "grad_norm": 0.16344612836837769, |
| "learning_rate": 8.770035688722399e-08, |
| "entropy": 1.3448477059602737, |
| "num_tokens": 33890202.0, |
| "mean_token_accuracy": 0.6772162079811096, |
| "epoch": 2.443653866276028, |
| "step": 5840 |
| }, |
| { |
| "loss": 1.2573143005371095, |
| "grad_norm": 0.13473457098007202, |
| "learning_rate": 8.642670466991381e-08, |
| "entropy": 1.27697846442461, |
| "num_tokens": 33945323.0, |
| "mean_token_accuracy": 0.6902707099914551, |
| "epoch": 2.4478392801088207, |
| "step": 5850 |
| }, |
| { |
| "loss": 1.2643320083618164, |
| "grad_norm": 0.12609098851680756, |
| "learning_rate": 8.516149322369054e-08, |
| "entropy": 1.3083055540919304, |
| "num_tokens": 34005115.0, |
| "mean_token_accuracy": 0.6905182540416718, |
| "epoch": 2.4520246939416133, |
| "step": 5860 |
| }, |
| { |
| "loss": 1.3385416984558105, |
| "grad_norm": 0.1266658753156662, |
| "learning_rate": 8.390474837072492e-08, |
| "entropy": 1.330283808708191, |
| "num_tokens": 34061458.0, |
| "mean_token_accuracy": 0.6766823455691338, |
| "epoch": 2.4562101077744063, |
| "step": 5870 |
| }, |
| { |
| "loss": 1.247739601135254, |
| "grad_norm": 0.1117442175745964, |
| "learning_rate": 8.265649576038946e-08, |
| "entropy": 1.2861711964011193, |
| "num_tokens": 34117371.0, |
| "mean_token_accuracy": 0.6933671846985817, |
| "epoch": 2.460395521607199, |
| "step": 5880 |
| }, |
| { |
| "loss": 1.3221072196960448, |
| "grad_norm": 0.1515118032693863, |
| "learning_rate": 8.141676086873573e-08, |
| "entropy": 1.3331793665885925, |
| "num_tokens": 34172239.0, |
| "mean_token_accuracy": 0.6770834714174271, |
| "epoch": 2.4645809354399915, |
| "step": 5890 |
| }, |
| { |
| "loss": 1.269434928894043, |
| "grad_norm": 0.13494186103343964, |
| "learning_rate": 8.018556899797396e-08, |
| "entropy": 1.2998870089650154, |
| "num_tokens": 34234355.0, |
| "mean_token_accuracy": 0.687006613612175, |
| "epoch": 2.4687663492727845, |
| "step": 5900 |
| }, |
| { |
| "loss": 1.326502799987793, |
| "grad_norm": 0.14256730675697327, |
| "learning_rate": 7.896294527595638e-08, |
| "entropy": 1.350116790831089, |
| "num_tokens": 34295462.0, |
| "mean_token_accuracy": 0.6760937020182609, |
| "epoch": 2.472951763105577, |
| "step": 5910 |
| }, |
| { |
| "loss": 1.3374545097351074, |
| "grad_norm": 0.15305058658123016, |
| "learning_rate": 7.774891465566518e-08, |
| "entropy": 1.3536745309829712, |
| "num_tokens": 34353670.0, |
| "mean_token_accuracy": 0.6751428216695785, |
| "epoch": 2.4771371769383697, |
| "step": 5920 |
| }, |
| { |
| "loss": 1.3062148094177246, |
| "grad_norm": 0.1269030123949051, |
| "learning_rate": 7.654350191470216e-08, |
| "entropy": 1.3079909563064576, |
| "num_tokens": 34409937.0, |
| "mean_token_accuracy": 0.6825913473963737, |
| "epoch": 2.4813225907711627, |
| "step": 5930 |
| }, |
| { |
| "loss": 1.3188300132751465, |
| "grad_norm": 0.11656031757593155, |
| "learning_rate": 7.534673165478417e-08, |
| "entropy": 1.3348352879285812, |
| "num_tokens": 34470681.0, |
| "mean_token_accuracy": 0.673864497244358, |
| "epoch": 2.4855080046039553, |
| "step": 5940 |
| }, |
| { |
| "loss": 1.263766098022461, |
| "grad_norm": 0.12488370388746262, |
| "learning_rate": 7.415862830124032e-08, |
| "entropy": 1.3003046184778213, |
| "num_tokens": 34530193.0, |
| "mean_token_accuracy": 0.6913181528449058, |
| "epoch": 2.489693418436748, |
| "step": 5950 |
| }, |
| { |
| "loss": 1.297060203552246, |
| "grad_norm": 0.14231456816196442, |
| "learning_rate": 7.297921610251323e-08, |
| "entropy": 1.3110292360186577, |
| "num_tokens": 34585018.0, |
| "mean_token_accuracy": 0.6845840275287628, |
| "epoch": 2.493878832269541, |
| "step": 5960 |
| }, |
| { |
| "loss": 1.2582796096801758, |
| "grad_norm": 0.13470889627933502, |
| "learning_rate": 7.180851912966501e-08, |
| "entropy": 1.276314914226532, |
| "num_tokens": 34640793.0, |
| "mean_token_accuracy": 0.6882349893450737, |
| "epoch": 2.4980642461023335, |
| "step": 5970 |
| }, |
| { |
| "loss": 1.2998489379882812, |
| "grad_norm": 0.15721286833286285, |
| "learning_rate": 7.064656127588508e-08, |
| "entropy": 1.3124357014894485, |
| "num_tokens": 34694819.0, |
| "mean_token_accuracy": 0.6838565751910209, |
| "epoch": 2.502249659935126, |
| "step": 5980 |
| }, |
| { |
| "loss": 1.306645965576172, |
| "grad_norm": 0.11785798519849777, |
| "learning_rate": 6.949336625600316e-08, |
| "entropy": 1.3165518283843993, |
| "num_tokens": 34751259.0, |
| "mean_token_accuracy": 0.6841330319643021, |
| "epoch": 2.5064350737679186, |
| "step": 5990 |
| }, |
| { |
| "loss": 1.3169086456298829, |
| "grad_norm": 0.11234049499034882, |
| "learning_rate": 6.834895760600517e-08, |
| "entropy": 1.3190216064453124, |
| "num_tokens": 34808644.0, |
| "mean_token_accuracy": 0.6796000450849533, |
| "epoch": 2.5106204876007117, |
| "step": 6000 |
| }, |
| { |
| "eval_loss": 1.2964025735855103, |
| "eval_runtime": 43.5438, |
| "eval_samples_per_second": 146.312, |
| "eval_steps_per_second": 6.109, |
| "eval_entropy": 1.3063210257910247, |
| "eval_num_tokens": 34808644.0, |
| "eval_mean_token_accuracy": 0.6839508241728732, |
| "epoch": 2.5106204876007117, |
| "step": 6000 |
| }, |
| { |
| "loss": 1.2840510368347169, |
| "grad_norm": 0.15145522356033325, |
| "learning_rate": 6.721335868255229e-08, |
| "entropy": 1.2826346635818482, |
| "num_tokens": 34863928.0, |
| "mean_token_accuracy": 0.6869580999016762, |
| "epoch": 2.5148059014335042, |
| "step": 6010 |
| }, |
| { |
| "loss": 1.267976665496826, |
| "grad_norm": 0.13854120671749115, |
| "learning_rate": 6.60865926625051e-08, |
| "entropy": 1.2938668191432954, |
| "num_tokens": 34926879.0, |
| "mean_token_accuracy": 0.6888927921652794, |
| "epoch": 2.518991315266297, |
| "step": 6020 |
| }, |
| { |
| "loss": 1.332705307006836, |
| "grad_norm": 0.13647040724754333, |
| "learning_rate": 6.496868254245025e-08, |
| "entropy": 1.3259623274207115, |
| "num_tokens": 34984932.0, |
| "mean_token_accuracy": 0.6760903507471084, |
| "epoch": 2.52317672909909, |
| "step": 6030 |
| }, |
| { |
| "loss": 1.2476840019226074, |
| "grad_norm": 0.13999158143997192, |
| "learning_rate": 6.385965113823039e-08, |
| "entropy": 1.2729045450687408, |
| "num_tokens": 35042011.0, |
| "mean_token_accuracy": 0.6900967061519623, |
| "epoch": 2.5273621429318824, |
| "step": 6040 |
| }, |
| { |
| "loss": 1.2344088554382324, |
| "grad_norm": 0.13583189249038696, |
| "learning_rate": 6.275952108448018e-08, |
| "entropy": 1.276967915892601, |
| "num_tokens": 35100640.0, |
| "mean_token_accuracy": 0.6938279047608376, |
| "epoch": 2.531547556764675, |
| "step": 6050 |
| }, |
| { |
| "loss": 1.253906536102295, |
| "grad_norm": 0.14375676214694977, |
| "learning_rate": 6.166831483416229e-08, |
| "entropy": 1.2771710246801375, |
| "num_tokens": 35158864.0, |
| "mean_token_accuracy": 0.6879908561706543, |
| "epoch": 2.5357329705974676, |
| "step": 6060 |
| }, |
| { |
| "loss": 1.3232227325439454, |
| "grad_norm": 0.12950512766838074, |
| "learning_rate": 6.058605465811085e-08, |
| "entropy": 1.3327802419662476, |
| "num_tokens": 35216659.0, |
| "mean_token_accuracy": 0.6788379862904549, |
| "epoch": 2.5399183844302606, |
| "step": 6070 |
| }, |
| { |
| "loss": 1.2316680908203126, |
| "grad_norm": 0.14246362447738647, |
| "learning_rate": 5.9512762644576054e-08, |
| "entropy": 1.2707349091768265, |
| "num_tokens": 35276829.0, |
| "mean_token_accuracy": 0.6953957095742226, |
| "epoch": 2.544103798263053, |
| "step": 6080 |
| }, |
| { |
| "loss": 1.2903837203979491, |
| "grad_norm": 0.14620284736156464, |
| "learning_rate": 5.844846069877329e-08, |
| "entropy": 1.3116936787962914, |
| "num_tokens": 35335255.0, |
| "mean_token_accuracy": 0.6844032138586045, |
| "epoch": 2.5482892120958462, |
| "step": 6090 |
| }, |
| { |
| "loss": 1.2689558029174806, |
| "grad_norm": 0.1156439483165741, |
| "learning_rate": 5.7393170542436694e-08, |
| "entropy": 1.2803223952651024, |
| "num_tokens": 35398265.0, |
| "mean_token_accuracy": 0.6874969124794006, |
| "epoch": 2.552474625928639, |
| "step": 6100 |
| }, |
| { |
| "loss": 1.2471202850341796, |
| "grad_norm": 0.14455300569534302, |
| "learning_rate": 5.6346913713375076e-08, |
| "entropy": 1.2767839536070824, |
| "num_tokens": 35457467.0, |
| "mean_token_accuracy": 0.6907158330082893, |
| "epoch": 2.5566600397614314, |
| "step": 6110 |
| }, |
| { |
| "loss": 1.2724437713623047, |
| "grad_norm": 0.1570362001657486, |
| "learning_rate": 5.5309711565033055e-08, |
| "entropy": 1.2919223070144654, |
| "num_tokens": 35514814.0, |
| "mean_token_accuracy": 0.688132356107235, |
| "epoch": 2.560845453594224, |
| "step": 6120 |
| }, |
| { |
| "loss": 1.3232336044311523, |
| "grad_norm": 0.15807446837425232, |
| "learning_rate": 5.4281585266054755e-08, |
| "entropy": 1.3095539420843125, |
| "num_tokens": 35571529.0, |
| "mean_token_accuracy": 0.6772763684391976, |
| "epoch": 2.5650308674270166, |
| "step": 6130 |
| }, |
| { |
| "loss": 1.2672816276550294, |
| "grad_norm": 0.12889625132083893, |
| "learning_rate": 5.326255579985173e-08, |
| "entropy": 1.2937352240085602, |
| "num_tokens": 35630131.0, |
| "mean_token_accuracy": 0.6905935257673264, |
| "epoch": 2.5692162812598096, |
| "step": 6140 |
| }, |
| { |
| "loss": 1.2375890731811523, |
| "grad_norm": 0.13400977849960327, |
| "learning_rate": 5.225264396417522e-08, |
| "entropy": 1.2529444962739944, |
| "num_tokens": 35690381.0, |
| "mean_token_accuracy": 0.6925143092870713, |
| "epoch": 2.573401695092602, |
| "step": 6150 |
| }, |
| { |
| "loss": 1.261359405517578, |
| "grad_norm": 0.10552431643009186, |
| "learning_rate": 5.125187037069123e-08, |
| "entropy": 1.2899439319968224, |
| "num_tokens": 35755420.0, |
| "mean_token_accuracy": 0.6874815404415131, |
| "epoch": 2.577587108925395, |
| "step": 6160 |
| }, |
| { |
| "loss": 1.2803380966186524, |
| "grad_norm": 0.12414208054542542, |
| "learning_rate": 5.026025544455986e-08, |
| "entropy": 1.3017450347542763, |
| "num_tokens": 35812900.0, |
| "mean_token_accuracy": 0.6863995373249054, |
| "epoch": 2.581772522758188, |
| "step": 6170 |
| }, |
| { |
| "loss": 1.2126134872436523, |
| "grad_norm": 0.11522486060857773, |
| "learning_rate": 4.9277819424018815e-08, |
| "entropy": 1.255126628279686, |
| "num_tokens": 35872604.0, |
| "mean_token_accuracy": 0.6964934691786766, |
| "epoch": 2.5859579365909804, |
| "step": 6180 |
| }, |
| { |
| "loss": 1.253915023803711, |
| "grad_norm": 0.11392216384410858, |
| "learning_rate": 4.830458235996976e-08, |
| "entropy": 1.2905526503920555, |
| "num_tokens": 35930850.0, |
| "mean_token_accuracy": 0.6916301295161247, |
| "epoch": 2.590143350423773, |
| "step": 6190 |
| }, |
| { |
| "loss": 1.2767410278320312, |
| "grad_norm": 0.12976758182048798, |
| "learning_rate": 4.7340564115569804e-08, |
| "entropy": 1.283238247036934, |
| "num_tokens": 35988841.0, |
| "mean_token_accuracy": 0.687064278125763, |
| "epoch": 2.594328764256566, |
| "step": 6200 |
| }, |
| { |
| "eval_loss": 1.2962790727615356, |
| "eval_runtime": 43.7745, |
| "eval_samples_per_second": 145.542, |
| "eval_steps_per_second": 6.077, |
| "eval_entropy": 1.3085140944423532, |
| "eval_num_tokens": 35988841.0, |
| "eval_mean_token_accuracy": 0.6838811584433219, |
| "epoch": 2.594328764256566, |
| "step": 6200 |
| }, |
| { |
| "loss": 1.3264198303222656, |
| "grad_norm": 0.13761702179908752, |
| "learning_rate": 4.638578436582552e-08, |
| "entropy": 1.3237684190273284, |
| "num_tokens": 36044753.0, |
| "mean_token_accuracy": 0.67743628770113, |
| "epoch": 2.5985141780893586, |
| "step": 6210 |
| }, |
| { |
| "loss": 1.257272720336914, |
| "grad_norm": 0.1342718005180359, |
| "learning_rate": 4.544026259719158e-08, |
| "entropy": 1.288350522518158, |
| "num_tokens": 36103553.0, |
| "mean_token_accuracy": 0.6897284865379334, |
| "epoch": 2.602699591922151, |
| "step": 6220 |
| }, |
| { |
| "loss": 1.2772136688232423, |
| "grad_norm": 0.12511885166168213, |
| "learning_rate": 4.4504018107173304e-08, |
| "entropy": 1.3188367202877997, |
| "num_tokens": 36162370.0, |
| "mean_token_accuracy": 0.6876938834786415, |
| "epoch": 2.606885005754944, |
| "step": 6230 |
| }, |
| { |
| "loss": 1.3926493644714355, |
| "grad_norm": 0.15498943626880646, |
| "learning_rate": 4.3577070003932234e-08, |
| "entropy": 1.3926087036728858, |
| "num_tokens": 36218877.0, |
| "mean_token_accuracy": 0.6645623058080673, |
| "epoch": 2.611070419587737, |
| "step": 6240 |
| }, |
| { |
| "loss": 1.2876951217651367, |
| "grad_norm": 0.13086406886577606, |
| "learning_rate": 4.265943720589688e-08, |
| "entropy": 1.3051115587353705, |
| "num_tokens": 36274783.0, |
| "mean_token_accuracy": 0.6853921875357628, |
| "epoch": 2.6152558334205294, |
| "step": 6250 |
| }, |
| { |
| "loss": 1.3133883476257324, |
| "grad_norm": 0.15468242764472961, |
| "learning_rate": 4.175113844137596e-08, |
| "entropy": 1.3004416555166245, |
| "num_tokens": 36330126.0, |
| "mean_token_accuracy": 0.6820774778723717, |
| "epoch": 2.619441247253322, |
| "step": 6260 |
| }, |
| { |
| "loss": 1.3524452209472657, |
| "grad_norm": 0.14072105288505554, |
| "learning_rate": 4.08521922481766e-08, |
| "entropy": 1.3288619458675384, |
| "num_tokens": 36386837.0, |
| "mean_token_accuracy": 0.6738715380430221, |
| "epoch": 2.623626661086115, |
| "step": 6270 |
| }, |
| { |
| "loss": 1.2715925216674804, |
| "grad_norm": 0.13218800723552704, |
| "learning_rate": 3.9962616973225784e-08, |
| "entropy": 1.3046256229281425, |
| "num_tokens": 36446096.0, |
| "mean_token_accuracy": 0.6843361258506775, |
| "epoch": 2.6278120749189076, |
| "step": 6280 |
| }, |
| { |
| "loss": 1.3278305053710937, |
| "grad_norm": 0.12469816952943802, |
| "learning_rate": 3.90824307721957e-08, |
| "entropy": 1.3449779450893402, |
| "num_tokens": 36502354.0, |
| "mean_token_accuracy": 0.6795141100883484, |
| "epoch": 2.6319974887517006, |
| "step": 6290 |
| }, |
| { |
| "loss": 1.2798616409301757, |
| "grad_norm": 0.12675845623016357, |
| "learning_rate": 3.821165160913381e-08, |
| "entropy": 1.290797685086727, |
| "num_tokens": 36560258.0, |
| "mean_token_accuracy": 0.6858594298362732, |
| "epoch": 2.636182902584493, |
| "step": 6300 |
| }, |
| { |
| "loss": 1.2590128898620605, |
| "grad_norm": 0.14704617857933044, |
| "learning_rate": 3.735029725609567e-08, |
| "entropy": 1.2577021181583405, |
| "num_tokens": 36616149.0, |
| "mean_token_accuracy": 0.6896016135811806, |
| "epoch": 2.6403683164172858, |
| "step": 6310 |
| }, |
| { |
| "loss": 1.2890483856201171, |
| "grad_norm": 0.14774031937122345, |
| "learning_rate": 3.649838529278232e-08, |
| "entropy": 1.3169309496879578, |
| "num_tokens": 36675723.0, |
| "mean_token_accuracy": 0.683777266740799, |
| "epoch": 2.6445537302500783, |
| "step": 6320 |
| }, |
| { |
| "loss": 1.3321017265319823, |
| "grad_norm": 0.16364073753356934, |
| "learning_rate": 3.565593310618165e-08, |
| "entropy": 1.3246647357940673, |
| "num_tokens": 36731369.0, |
| "mean_token_accuracy": 0.6758654475212097, |
| "epoch": 2.648739144082871, |
| "step": 6330 |
| }, |
| { |
| "loss": 1.3064552307128907, |
| "grad_norm": 0.1471003293991089, |
| "learning_rate": 3.48229578902135e-08, |
| "entropy": 1.3057933449745178, |
| "num_tokens": 36790065.0, |
| "mean_token_accuracy": 0.6810683965682983, |
| "epoch": 2.652924557915664, |
| "step": 6340 |
| }, |
| { |
| "loss": 1.279481792449951, |
| "grad_norm": 0.14192688465118408, |
| "learning_rate": 3.39994766453785e-08, |
| "entropy": 1.303479927778244, |
| "num_tokens": 36851965.0, |
| "mean_token_accuracy": 0.6806197896599769, |
| "epoch": 2.6571099717484565, |
| "step": 6350 |
| }, |
| { |
| "loss": 1.2731066703796388, |
| "grad_norm": 0.1496737152338028, |
| "learning_rate": 3.3185506178411593e-08, |
| "entropy": 1.3053823009133338, |
| "num_tokens": 36911260.0, |
| "mean_token_accuracy": 0.685496874153614, |
| "epoch": 2.6612953855812496, |
| "step": 6360 |
| }, |
| { |
| "loss": 1.3233325958251954, |
| "grad_norm": 0.1397167593240738, |
| "learning_rate": 3.238106310193822e-08, |
| "entropy": 1.3544006377458573, |
| "num_tokens": 36968655.0, |
| "mean_token_accuracy": 0.6773801222443581, |
| "epoch": 2.665480799414042, |
| "step": 6370 |
| }, |
| { |
| "loss": 1.273273754119873, |
| "grad_norm": 0.1353992074728012, |
| "learning_rate": 3.158616383413648e-08, |
| "entropy": 1.2890938267111778, |
| "num_tokens": 37023385.0, |
| "mean_token_accuracy": 0.6872632309794426, |
| "epoch": 2.6696662132468347, |
| "step": 6380 |
| }, |
| { |
| "loss": 1.269486427307129, |
| "grad_norm": 0.14086788892745972, |
| "learning_rate": 3.080082459840072e-08, |
| "entropy": 1.29090928286314, |
| "num_tokens": 37081937.0, |
| "mean_token_accuracy": 0.6892599433660507, |
| "epoch": 2.6738516270796273, |
| "step": 6390 |
| }, |
| { |
| "loss": 1.2413150787353515, |
| "grad_norm": 0.147287517786026, |
| "learning_rate": 3.0025061423011366e-08, |
| "entropy": 1.2670000731945037, |
| "num_tokens": 37139738.0, |
| "mean_token_accuracy": 0.6992234885692596, |
| "epoch": 2.6780370409124203, |
| "step": 6400 |
| }, |
| { |
| "eval_loss": 1.2961639165878296, |
| "eval_runtime": 43.8119, |
| "eval_samples_per_second": 145.417, |
| "eval_steps_per_second": 6.071, |
| "eval_entropy": 1.3104161902477867, |
| "eval_num_tokens": 37139738.0, |
| "eval_mean_token_accuracy": 0.6839385229842108, |
| "epoch": 2.6780370409124203, |
| "step": 6400 |
| }, |
| { |
| "loss": 1.2215076446533204, |
| "grad_norm": 0.12445386499166489, |
| "learning_rate": 2.92588901408074e-08, |
| "entropy": 1.2454605296254158, |
| "num_tokens": 37196528.0, |
| "mean_token_accuracy": 0.6982016503810883, |
| "epoch": 2.682222454745213, |
| "step": 6410 |
| }, |
| { |
| "loss": 1.3178998947143554, |
| "grad_norm": 0.14504940807819366, |
| "learning_rate": 2.8502326388863073e-08, |
| "entropy": 1.3217849105596542, |
| "num_tokens": 37251881.0, |
| "mean_token_accuracy": 0.6820187479257583, |
| "epoch": 2.6864078685780055, |
| "step": 6420 |
| }, |
| { |
| "loss": 1.2787066459655763, |
| "grad_norm": 0.1357499063014984, |
| "learning_rate": 2.7755385608169368e-08, |
| "entropy": 1.293387584388256, |
| "num_tokens": 37308322.0, |
| "mean_token_accuracy": 0.6850109323859215, |
| "epoch": 2.6905932824107985, |
| "step": 6430 |
| }, |
| { |
| "loss": 1.3130731582641602, |
| "grad_norm": 0.1391121745109558, |
| "learning_rate": 2.701808304331826e-08, |
| "entropy": 1.3160455033183098, |
| "num_tokens": 37367065.0, |
| "mean_token_accuracy": 0.6788717776536941, |
| "epoch": 2.694778696243591, |
| "step": 6440 |
| }, |
| { |
| "loss": 1.324008083343506, |
| "grad_norm": 0.17093950510025024, |
| "learning_rate": 2.6290433742191697e-08, |
| "entropy": 1.3303591817617417, |
| "num_tokens": 37423674.0, |
| "mean_token_accuracy": 0.6737320378422738, |
| "epoch": 2.6989641100763837, |
| "step": 6450 |
| }, |
| { |
| "loss": 1.3142354965209961, |
| "grad_norm": 0.11529888957738876, |
| "learning_rate": 2.5572452555654766e-08, |
| "entropy": 1.3235249876976014, |
| "num_tokens": 37478758.0, |
| "mean_token_accuracy": 0.6834891051054001, |
| "epoch": 2.7031495239091763, |
| "step": 6460 |
| }, |
| { |
| "loss": 1.3274757385253906, |
| "grad_norm": 0.14736518263816833, |
| "learning_rate": 2.4864154137252348e-08, |
| "entropy": 1.323398619890213, |
| "num_tokens": 37535301.0, |
| "mean_token_accuracy": 0.676533716917038, |
| "epoch": 2.7073349377419693, |
| "step": 6470 |
| }, |
| { |
| "loss": 1.3244875907897948, |
| "grad_norm": 0.1322164386510849, |
| "learning_rate": 2.4165552942910005e-08, |
| "entropy": 1.3245794102549553, |
| "num_tokens": 37593859.0, |
| "mean_token_accuracy": 0.6777049407362938, |
| "epoch": 2.711520351574762, |
| "step": 6480 |
| }, |
| { |
| "loss": 1.3350665092468261, |
| "grad_norm": 0.15845166146755219, |
| "learning_rate": 2.3476663230639294e-08, |
| "entropy": 1.311027655005455, |
| "num_tokens": 37650900.0, |
| "mean_token_accuracy": 0.6777532756328583, |
| "epoch": 2.715705765407555, |
| "step": 6490 |
| }, |
| { |
| "loss": 1.3243823051452637, |
| "grad_norm": 0.1440412998199463, |
| "learning_rate": 2.279749906024625e-08, |
| "entropy": 1.316267091035843, |
| "num_tokens": 37709121.0, |
| "mean_token_accuracy": 0.6794763222336769, |
| "epoch": 2.7198911792403475, |
| "step": 6500 |
| }, |
| { |
| "loss": 1.2479528427124023, |
| "grad_norm": 0.1294708549976349, |
| "learning_rate": 2.2128074293044973e-08, |
| "entropy": 1.2721221387386321, |
| "num_tokens": 37769916.0, |
| "mean_token_accuracy": 0.6923005178570747, |
| "epoch": 2.72407659307314, |
| "step": 6510 |
| }, |
| { |
| "loss": 1.2882716178894043, |
| "grad_norm": 0.1557369977235794, |
| "learning_rate": 2.1468402591574176e-08, |
| "entropy": 1.2828272953629494, |
| "num_tokens": 37827782.0, |
| "mean_token_accuracy": 0.6824876755475998, |
| "epoch": 2.7282620069059327, |
| "step": 6520 |
| }, |
| { |
| "loss": 1.3350841522216796, |
| "grad_norm": 0.14170175790786743, |
| "learning_rate": 2.0818497419318847e-08, |
| "entropy": 1.3358486652374268, |
| "num_tokens": 37889014.0, |
| "mean_token_accuracy": 0.6755576729774475, |
| "epoch": 2.7324474207387257, |
| "step": 6530 |
| }, |
| { |
| "loss": 1.2199977874755858, |
| "grad_norm": 0.1191650778055191, |
| "learning_rate": 2.017837204043521e-08, |
| "entropy": 1.2538551360368728, |
| "num_tokens": 37949279.0, |
| "mean_token_accuracy": 0.6934547841548919, |
| "epoch": 2.7366328345715183, |
| "step": 6540 |
| }, |
| { |
| "loss": 1.3062921524047852, |
| "grad_norm": 0.12893186509609222, |
| "learning_rate": 1.954803951947992e-08, |
| "entropy": 1.2967224359512328, |
| "num_tokens": 38003642.0, |
| "mean_token_accuracy": 0.679786990582943, |
| "epoch": 2.740818248404311, |
| "step": 6550 |
| }, |
| { |
| "loss": 1.3133017539978027, |
| "grad_norm": 0.15340933203697205, |
| "learning_rate": 1.8927512721143733e-08, |
| "entropy": 1.3341112226247787, |
| "num_tokens": 38061010.0, |
| "mean_token_accuracy": 0.6817509040236474, |
| "epoch": 2.745003662237104, |
| "step": 6560 |
| }, |
| { |
| "loss": 1.2640517234802247, |
| "grad_norm": 0.14441581070423126, |
| "learning_rate": 1.831680430998872e-08, |
| "entropy": 1.2839445233345033, |
| "num_tokens": 38118091.0, |
| "mean_token_accuracy": 0.6878686159849167, |
| "epoch": 2.7491890760698965, |
| "step": 6570 |
| }, |
| { |
| "loss": 1.3473605155944823, |
| "grad_norm": 0.1267869770526886, |
| "learning_rate": 1.7715926750189736e-08, |
| "entropy": 1.3488942801952362, |
| "num_tokens": 38173499.0, |
| "mean_token_accuracy": 0.673286820948124, |
| "epoch": 2.753374489902689, |
| "step": 6580 |
| }, |
| { |
| "loss": 1.2386550903320312, |
| "grad_norm": 0.1410847008228302, |
| "learning_rate": 1.7124892305280248e-08, |
| "entropy": 1.2806343123316766, |
| "num_tokens": 38232916.0, |
| "mean_token_accuracy": 0.6908275470137596, |
| "epoch": 2.7575599037354817, |
| "step": 6590 |
| }, |
| { |
| "loss": 1.2414596557617188, |
| "grad_norm": 0.15371793508529663, |
| "learning_rate": 1.6543713037901863e-08, |
| "entropy": 1.2626485541462897, |
| "num_tokens": 38293881.0, |
| "mean_token_accuracy": 0.6889653459191323, |
| "epoch": 2.7617453175682747, |
| "step": 6600 |
| }, |
| { |
| "eval_loss": 1.2960588932037354, |
| "eval_runtime": 43.7085, |
| "eval_samples_per_second": 145.761, |
| "eval_steps_per_second": 6.086, |
| "eval_entropy": 1.3095905193708892, |
| "eval_num_tokens": 38293881.0, |
| "eval_mean_token_accuracy": 0.6839702539426044, |
| "epoch": 2.7617453175682747, |
| "step": 6600 |
| }, |
| { |
| "loss": 1.2328216552734375, |
| "grad_norm": 0.14509297907352448, |
| "learning_rate": 1.5972400809558305e-08, |
| "entropy": 1.2864874497056007, |
| "num_tokens": 38354279.0, |
| "mean_token_accuracy": 0.6910390242934227, |
| "epoch": 2.7659307314010673, |
| "step": 6610 |
| }, |
| { |
| "loss": 1.3384916305541992, |
| "grad_norm": 0.13343819975852966, |
| "learning_rate": 1.541096728037322e-08, |
| "entropy": 1.325030580163002, |
| "num_tokens": 38412664.0, |
| "mean_token_accuracy": 0.6726853728294373, |
| "epoch": 2.77011614523386, |
| "step": 6620 |
| }, |
| { |
| "loss": 1.2254823684692382, |
| "grad_norm": 0.1242731511592865, |
| "learning_rate": 1.4859423908851976e-08, |
| "entropy": 1.2863211989402772, |
| "num_tokens": 38478157.0, |
| "mean_token_accuracy": 0.6928248971700668, |
| "epoch": 2.774301559066653, |
| "step": 6630 |
| }, |
| { |
| "loss": 1.3083304405212401, |
| "grad_norm": 0.13707546889781952, |
| "learning_rate": 1.43177819516484e-08, |
| "entropy": 1.3157197803258895, |
| "num_tokens": 38536491.0, |
| "mean_token_accuracy": 0.6797169283032417, |
| "epoch": 2.7784869728994455, |
| "step": 6640 |
| }, |
| { |
| "loss": 1.3308774948120117, |
| "grad_norm": 0.18031752109527588, |
| "learning_rate": 1.3786052463334363e-08, |
| "entropy": 1.3267883569002152, |
| "num_tokens": 38592693.0, |
| "mean_token_accuracy": 0.6799778997898102, |
| "epoch": 2.782672386732238, |
| "step": 6650 |
| }, |
| { |
| "loss": 1.254627799987793, |
| "grad_norm": 0.13183258473873138, |
| "learning_rate": 1.3264246296174675e-08, |
| "entropy": 1.2955256581306458, |
| "num_tokens": 38649528.0, |
| "mean_token_accuracy": 0.6876810878515244, |
| "epoch": 2.7868578005650306, |
| "step": 6660 |
| }, |
| { |
| "loss": 1.3457258224487305, |
| "grad_norm": 0.13854053616523743, |
| "learning_rate": 1.2752374099905371e-08, |
| "entropy": 1.33312628865242, |
| "num_tokens": 38706620.0, |
| "mean_token_accuracy": 0.6743656143546104, |
| "epoch": 2.7910432143978237, |
| "step": 6670 |
| }, |
| { |
| "loss": 1.3620613098144532, |
| "grad_norm": 0.15011081099510193, |
| "learning_rate": 1.2250446321516173e-08, |
| "entropy": 1.3549015790224075, |
| "num_tokens": 38764666.0, |
| "mean_token_accuracy": 0.6692588478326797, |
| "epoch": 2.7952286282306162, |
| "step": 6680 |
| }, |
| { |
| "loss": 1.259181022644043, |
| "grad_norm": 0.15715329349040985, |
| "learning_rate": 1.1758473205037812e-08, |
| "entropy": 1.279186724126339, |
| "num_tokens": 38823325.0, |
| "mean_token_accuracy": 0.6928275167942047, |
| "epoch": 2.7994140420634093, |
| "step": 6690 |
| }, |
| { |
| "loss": 1.2290419578552245, |
| "grad_norm": 0.1388121098279953, |
| "learning_rate": 1.127646479133243e-08, |
| "entropy": 1.2684768080711364, |
| "num_tokens": 38880457.0, |
| "mean_token_accuracy": 0.694522102177143, |
| "epoch": 2.803599455896202, |
| "step": 6700 |
| }, |
| { |
| "loss": 1.3010470390319824, |
| "grad_norm": 0.15357019007205963, |
| "learning_rate": 1.0804430917888795e-08, |
| "entropy": 1.3155488684773444, |
| "num_tokens": 38936834.0, |
| "mean_token_accuracy": 0.6843758165836334, |
| "epoch": 2.8077848697289944, |
| "step": 6710 |
| }, |
| { |
| "loss": 1.33712797164917, |
| "grad_norm": 0.1296703815460205, |
| "learning_rate": 1.0342381218621798e-08, |
| "entropy": 1.3263304769992827, |
| "num_tokens": 38992580.0, |
| "mean_token_accuracy": 0.6779214948415756, |
| "epoch": 2.811970283561787, |
| "step": 6720 |
| }, |
| { |
| "loss": 1.326553726196289, |
| "grad_norm": 0.11933048069477081, |
| "learning_rate": 9.890325123675324e-09, |
| "entropy": 1.3206409364938736, |
| "num_tokens": 39048629.0, |
| "mean_token_accuracy": 0.6810004383325576, |
| "epoch": 2.81615569739458, |
| "step": 6730 |
| }, |
| { |
| "loss": 1.2582717895507813, |
| "grad_norm": 0.13642576336860657, |
| "learning_rate": 9.44827185923036e-09, |
| "entropy": 1.3072202578186989, |
| "num_tokens": 39109993.0, |
| "mean_token_accuracy": 0.6876247569918632, |
| "epoch": 2.8203411112273726, |
| "step": 6740 |
| }, |
| { |
| "loss": 1.3292433738708496, |
| "grad_norm": 0.14473669230937958, |
| "learning_rate": 9.016230447316142e-09, |
| "entropy": 1.312994186580181, |
| "num_tokens": 39167300.0, |
| "mean_token_accuracy": 0.6826678797602653, |
| "epoch": 2.824526525060165, |
| "step": 6750 |
| }, |
| { |
| "loss": 1.2947887420654296, |
| "grad_norm": 0.14324665069580078, |
| "learning_rate": 8.59420970562652e-09, |
| "entropy": 1.301099643111229, |
| "num_tokens": 39225969.0, |
| "mean_token_accuracy": 0.6844487801194191, |
| "epoch": 2.8287119388929582, |
| "step": 6760 |
| }, |
| { |
| "loss": 1.3236183166503905, |
| "grad_norm": 0.12903346121311188, |
| "learning_rate": 8.182218247339557e-09, |
| "entropy": 1.3240256026387214, |
| "num_tokens": 39284458.0, |
| "mean_token_accuracy": 0.6791065171360969, |
| "epoch": 2.832897352725751, |
| "step": 6770 |
| }, |
| { |
| "loss": 1.2615336418151855, |
| "grad_norm": 0.13563166558742523, |
| "learning_rate": 7.7802644809421e-09, |
| "entropy": 1.2816553667187691, |
| "num_tokens": 39343117.0, |
| "mean_token_accuracy": 0.6875470012426377, |
| "epoch": 2.8370827665585434, |
| "step": 6780 |
| }, |
| { |
| "loss": 1.302596092224121, |
| "grad_norm": 0.1292281448841095, |
| "learning_rate": 7.388356610057878e-09, |
| "entropy": 1.3054527580738067, |
| "num_tokens": 39399894.0, |
| "mean_token_accuracy": 0.6824864596128464, |
| "epoch": 2.841268180391336, |
| "step": 6790 |
| }, |
| { |
| "loss": 1.266486930847168, |
| "grad_norm": 0.13657227158546448, |
| "learning_rate": 7.006502633280398e-09, |
| "entropy": 1.2712685942649842, |
| "num_tokens": 39460071.0, |
| "mean_token_accuracy": 0.6891940608620644, |
| "epoch": 2.845453594224129, |
| "step": 6800 |
| }, |
| { |
| "eval_loss": 1.2960532903671265, |
| "eval_runtime": 43.6876, |
| "eval_samples_per_second": 145.831, |
| "eval_steps_per_second": 6.089, |
| "eval_entropy": 1.3085124981134457, |
| "eval_num_tokens": 39460071.0, |
| "eval_mean_token_accuracy": 0.6840056230251054, |
| "epoch": 2.845453594224129, |
| "step": 6800 |
| }, |
| { |
| "loss": 1.2665786743164062, |
| "grad_norm": 0.13498687744140625, |
| "learning_rate": 6.6347103440092534e-09, |
| "entropy": 1.304034498333931, |
| "num_tokens": 39521096.0, |
| "mean_token_accuracy": 0.6852239608764649, |
| "epoch": 2.8496390080569216, |
| "step": 6810 |
| }, |
| { |
| "loss": 1.2787626266479493, |
| "grad_norm": 0.16180647909641266, |
| "learning_rate": 6.272987330291635e-09, |
| "entropy": 1.2805368885397912, |
| "num_tokens": 39577586.0, |
| "mean_token_accuracy": 0.6863118633627892, |
| "epoch": 2.853824421889714, |
| "step": 6820 |
| }, |
| { |
| "loss": 1.298048973083496, |
| "grad_norm": 0.14410941302776337, |
| "learning_rate": 5.921340974666733e-09, |
| "entropy": 1.315412837266922, |
| "num_tokens": 39635685.0, |
| "mean_token_accuracy": 0.6825838565826416, |
| "epoch": 2.858009835722507, |
| "step": 6830 |
| }, |
| { |
| "loss": 1.2789037704467774, |
| "grad_norm": 0.12575282156467438, |
| "learning_rate": 5.57977845401586e-09, |
| "entropy": 1.3064978927373887, |
| "num_tokens": 39698219.0, |
| "mean_token_accuracy": 0.6813527047634125, |
| "epoch": 2.8621952495553, |
| "step": 6840 |
| }, |
| { |
| "loss": 1.2924142837524415, |
| "grad_norm": 0.14836302399635315, |
| "learning_rate": 5.248306739415453e-09, |
| "entropy": 1.3290347814559937, |
| "num_tokens": 39758992.0, |
| "mean_token_accuracy": 0.6835691928863525, |
| "epoch": 2.8663806633880924, |
| "step": 6850 |
| }, |
| { |
| "loss": 1.3144015312194823, |
| "grad_norm": 0.1364041566848755, |
| "learning_rate": 4.926932595994804e-09, |
| "entropy": 1.3275774329900742, |
| "num_tokens": 39818224.0, |
| "mean_token_accuracy": 0.6781793549656868, |
| "epoch": 2.870566077220885, |
| "step": 6860 |
| }, |
| { |
| "loss": 1.2337275505065919, |
| "grad_norm": 0.15484337508678436, |
| "learning_rate": 4.61566258279833e-09, |
| "entropy": 1.2519328325986863, |
| "num_tokens": 39875194.0, |
| "mean_token_accuracy": 0.6923688799142838, |
| "epoch": 2.874751491053678, |
| "step": 6870 |
| }, |
| { |
| "loss": 1.3397406578063964, |
| "grad_norm": 0.13291706144809723, |
| "learning_rate": 4.314503052651408e-09, |
| "entropy": 1.3448736280202866, |
| "num_tokens": 39930172.0, |
| "mean_token_accuracy": 0.6757234945893288, |
| "epoch": 2.8789369048864706, |
| "step": 6880 |
| }, |
| { |
| "loss": 1.337942886352539, |
| "grad_norm": 0.1345345377922058, |
| "learning_rate": 4.023460152030811e-09, |
| "entropy": 1.3406923681497573, |
| "num_tokens": 39989342.0, |
| "mean_token_accuracy": 0.6747447595000267, |
| "epoch": 2.8831223187192636, |
| "step": 6890 |
| }, |
| { |
| "loss": 1.2957025527954102, |
| "grad_norm": 0.14848950505256653, |
| "learning_rate": 3.74253982093925e-09, |
| "entropy": 1.3245025753974915, |
| "num_tokens": 40044850.0, |
| "mean_token_accuracy": 0.6843625560402871, |
| "epoch": 2.887307732552056, |
| "step": 6900 |
| }, |
| { |
| "loss": 1.2321189880371093, |
| "grad_norm": 0.13322904706001282, |
| "learning_rate": 3.471747792784141e-09, |
| "entropy": 1.262104222178459, |
| "num_tokens": 40102787.0, |
| "mean_token_accuracy": 0.6951159760355949, |
| "epoch": 2.891493146384849, |
| "step": 6910 |
| }, |
| { |
| "loss": 1.2964617729187011, |
| "grad_norm": 0.11292250454425812, |
| "learning_rate": 3.211089594260585e-09, |
| "entropy": 1.3182623267173768, |
| "num_tokens": 40163366.0, |
| "mean_token_accuracy": 0.6819359913468361, |
| "epoch": 2.8956785602176414, |
| "step": 6920 |
| }, |
| { |
| "loss": 1.3324262619018554, |
| "grad_norm": 0.15124228596687317, |
| "learning_rate": 2.9605705452387943e-09, |
| "entropy": 1.3444043919444084, |
| "num_tokens": 40221526.0, |
| "mean_token_accuracy": 0.6710021272301674, |
| "epoch": 2.8998639740504344, |
| "step": 6930 |
| }, |
| { |
| "loss": 1.2928382873535156, |
| "grad_norm": 0.1519566923379898, |
| "learning_rate": 2.7201957586550084e-09, |
| "entropy": 1.299992610514164, |
| "num_tokens": 40277117.0, |
| "mean_token_accuracy": 0.6823776334524154, |
| "epoch": 2.904049387883227, |
| "step": 6940 |
| }, |
| { |
| "loss": 1.2859591484069823, |
| "grad_norm": 0.128694087266922, |
| "learning_rate": 2.489970140407638e-09, |
| "entropy": 1.3093111872673036, |
| "num_tokens": 40334263.0, |
| "mean_token_accuracy": 0.6845874279737473, |
| "epoch": 2.9082348017160196, |
| "step": 6950 |
| }, |
| { |
| "loss": 1.2404520988464356, |
| "grad_norm": 0.13099578022956848, |
| "learning_rate": 2.2698983892568413e-09, |
| "entropy": 1.2709315478801728, |
| "num_tokens": 40388570.0, |
| "mean_token_accuracy": 0.6908060133457183, |
| "epoch": 2.9124202155488126, |
| "step": 6960 |
| }, |
| { |
| "loss": 1.30335693359375, |
| "grad_norm": 0.12444788217544556, |
| "learning_rate": 2.0599849967287696e-09, |
| "entropy": 1.324267864227295, |
| "num_tokens": 40447336.0, |
| "mean_token_accuracy": 0.6855339229106903, |
| "epoch": 2.916605629381605, |
| "step": 6970 |
| }, |
| { |
| "loss": 1.3318076133728027, |
| "grad_norm": 0.14273810386657715, |
| "learning_rate": 1.860234247023973e-09, |
| "entropy": 1.345059370994568, |
| "num_tokens": 40509083.0, |
| "mean_token_accuracy": 0.6763337209820748, |
| "epoch": 2.9207910432143978, |
| "step": 6980 |
| }, |
| { |
| "loss": 1.2318530082702637, |
| "grad_norm": 0.13854491710662842, |
| "learning_rate": 1.6706502169296366e-09, |
| "entropy": 1.2656858801841735, |
| "num_tokens": 40566757.0, |
| "mean_token_accuracy": 0.6908913642168045, |
| "epoch": 2.9249764570471903, |
| "step": 6990 |
| }, |
| { |
| "loss": 1.2753348350524902, |
| "grad_norm": 0.1317194700241089, |
| "learning_rate": 1.4912367757366485e-09, |
| "entropy": 1.290731391310692, |
| "num_tokens": 40626221.0, |
| "mean_token_accuracy": 0.6856264978647232, |
| "epoch": 2.9291618708799834, |
| "step": 7000 |
| }, |
| { |
| "eval_loss": 1.296015739440918, |
| "eval_runtime": 42.3973, |
| "eval_samples_per_second": 150.269, |
| "eval_steps_per_second": 6.274, |
| "eval_entropy": 1.3096338595662798, |
| "eval_num_tokens": 40626221.0, |
| "eval_mean_token_accuracy": 0.6839476989624196, |
| "epoch": 2.9291618708799834, |
| "step": 7000 |
| }, |
| { |
| "loss": 1.3039022445678712, |
| "grad_norm": 0.1216905489563942, |
| "learning_rate": 1.3219975851607724e-09, |
| "entropy": 1.3058283895254135, |
| "num_tokens": 40684010.0, |
| "mean_token_accuracy": 0.6836184665560723, |
| "epoch": 2.933347284712776, |
| "step": 7010 |
| }, |
| { |
| "loss": 1.3275947570800781, |
| "grad_norm": 0.15599027276039124, |
| "learning_rate": 1.1629360992673754e-09, |
| "entropy": 1.3284942299127578, |
| "num_tokens": 40742081.0, |
| "mean_token_accuracy": 0.6771559327840805, |
| "epoch": 2.9375326985455685, |
| "step": 7020 |
| }, |
| { |
| "loss": 1.2864752769470216, |
| "grad_norm": 0.12884531915187836, |
| "learning_rate": 1.014055564401539e-09, |
| "entropy": 1.2828212678432465, |
| "num_tokens": 40801021.0, |
| "mean_token_accuracy": 0.68373833745718, |
| "epoch": 2.9417181123783616, |
| "step": 7030 |
| }, |
| { |
| "loss": 1.318696880340576, |
| "grad_norm": 0.1270311027765274, |
| "learning_rate": 8.753590191213356e-10, |
| "entropy": 1.3187800377607346, |
| "num_tokens": 40858738.0, |
| "mean_token_accuracy": 0.6756755083799362, |
| "epoch": 2.945903526211154, |
| "step": 7040 |
| }, |
| { |
| "loss": 1.274400520324707, |
| "grad_norm": 0.15278004109859467, |
| "learning_rate": 7.468492941362647e-10, |
| "entropy": 1.2988332599401473, |
| "num_tokens": 40917615.0, |
| "mean_token_accuracy": 0.6798348844051361, |
| "epoch": 2.9500889400439467, |
| "step": 7050 |
| }, |
| { |
| "loss": 1.2988153457641602, |
| "grad_norm": 0.1352374255657196, |
| "learning_rate": 6.285290122489128e-10, |
| "entropy": 1.3167090728878974, |
| "num_tokens": 40975462.0, |
| "mean_token_accuracy": 0.6834619447588921, |
| "epoch": 2.9542743538767393, |
| "step": 7060 |
| }, |
| { |
| "loss": 1.3265647888183594, |
| "grad_norm": 0.13059331476688385, |
| "learning_rate": 5.204005883019392e-10, |
| "entropy": 1.325848352909088, |
| "num_tokens": 41034824.0, |
| "mean_token_accuracy": 0.6760321959853173, |
| "epoch": 2.9584597677095323, |
| "step": 7070 |
| }, |
| { |
| "loss": 1.3572219848632812, |
| "grad_norm": 0.14386902749538422, |
| "learning_rate": 4.224662291285597e-10, |
| "entropy": 1.356474344432354, |
| "num_tokens": 41092574.0, |
| "mean_token_accuracy": 0.6717446967959404, |
| "epoch": 2.962645181542325, |
| "step": 7080 |
| }, |
| { |
| "loss": 1.362534523010254, |
| "grad_norm": 0.12292881309986115, |
| "learning_rate": 3.347279335074726e-10, |
| "entropy": 1.3564091578125954, |
| "num_tokens": 41151205.0, |
| "mean_token_accuracy": 0.6701879113912582, |
| "epoch": 2.966830595375118, |
| "step": 7090 |
| }, |
| { |
| "loss": 1.28519287109375, |
| "grad_norm": 0.17997978627681732, |
| "learning_rate": 2.571874921221129e-10, |
| "entropy": 1.2821272403001784, |
| "num_tokens": 41208932.0, |
| "mean_token_accuracy": 0.6873822212219238, |
| "epoch": 2.9710160092079105, |
| "step": 7100 |
| }, |
| { |
| "loss": 1.338641357421875, |
| "grad_norm": 0.13907863199710846, |
| "learning_rate": 1.8984648752429221e-10, |
| "entropy": 1.3435455054044723, |
| "num_tokens": 41265349.0, |
| "mean_token_accuracy": 0.6759589716792107, |
| "epoch": 2.975201423040703, |
| "step": 7110 |
| }, |
| { |
| "loss": 1.3811635971069336, |
| "grad_norm": 0.13205569982528687, |
| "learning_rate": 1.3270629410150335e-10, |
| "entropy": 1.4000085026025773, |
| "num_tokens": 41324164.0, |
| "mean_token_accuracy": 0.6713890418410301, |
| "epoch": 2.9793868368734957, |
| "step": 7120 |
| }, |
| { |
| "loss": 1.2836057662963867, |
| "grad_norm": 0.13349401950836182, |
| "learning_rate": 8.576807804921981e-11, |
| "entropy": 1.3049908488988877, |
| "num_tokens": 41383405.0, |
| "mean_token_accuracy": 0.6830727905035019, |
| "epoch": 2.9835722507062887, |
| "step": 7130 |
| }, |
| { |
| "loss": 1.2696043014526368, |
| "grad_norm": 0.17530304193496704, |
| "learning_rate": 4.903279734697063e-11, |
| "entropy": 1.2787989050149917, |
| "num_tokens": 41441198.0, |
| "mean_token_accuracy": 0.6855189517140389, |
| "epoch": 2.9877576645390813, |
| "step": 7140 |
| }, |
| { |
| "loss": 1.2740005493164062, |
| "grad_norm": 0.15486350655555725, |
| "learning_rate": 2.2501201738689414e-11, |
| "entropy": 1.293800377845764, |
| "num_tokens": 41496526.0, |
| "mean_token_accuracy": 0.6852918058633805, |
| "epoch": 2.991943078371874, |
| "step": 7150 |
| }, |
| { |
| "loss": 1.309797191619873, |
| "grad_norm": 0.15995003283023834, |
| "learning_rate": 6.173832717559779e-12, |
| "entropy": 1.321927347779274, |
| "num_tokens": 41546682.0, |
| "mean_token_accuracy": 0.6836003750562668, |
| "epoch": 2.996128492204667, |
| "step": 7160 |
| }, |
| { |
| "loss": 1.2691694259643556, |
| "grad_norm": 0.2798561751842499, |
| "learning_rate": 5.102351502417335e-14, |
| "entropy": 1.282310128211975, |
| "num_tokens": 41595613.0, |
| "mean_token_accuracy": 0.6916061446473405, |
| "epoch": 3.0, |
| "step": 7170 |
| }, |
| { |
| "train_runtime": 24564.0326, |
| "train_samples_per_second": 7.003, |
| "train_steps_per_second": 0.292, |
| "total_flos": 1.1844488431738552e+18, |
| "train_loss": 1.40203029108513, |
| "epoch": 3.0, |
| "step": 7170 |
| } |
| ] |