{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 804, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.1481057405471802, "epoch": 0.0037418147801683817, "grad_norm": 0.40896540880203247, "learning_rate": 0.0002, "loss": 2.499051332473755, "mean_token_accuracy": 0.5305689871311188, "num_tokens": 16123.0, "step": 1 }, { "entropy": 1.239521712064743, "epoch": 0.007483629560336763, "grad_norm": 0.3786088228225708, "learning_rate": 0.0002, "loss": 2.1649975776672363, "mean_token_accuracy": 0.5674073547124863, "num_tokens": 32231.0, "step": 2 }, { "entropy": 1.4065836369991302, "epoch": 0.011225444340505144, "grad_norm": 0.2935435175895691, "learning_rate": 0.0002, "loss": 1.7277326583862305, "mean_token_accuracy": 0.5904076844453812, "num_tokens": 48717.0, "step": 3 }, { "entropy": 1.3739030063152313, "epoch": 0.014967259120673527, "grad_norm": 0.24068056046962738, "learning_rate": 0.0002, "loss": 1.4146925210952759, "mean_token_accuracy": 0.6330391019582748, "num_tokens": 64917.0, "step": 4 }, { "entropy": 1.3624942004680634, "epoch": 0.018709073900841908, "grad_norm": 0.2722117602825165, "learning_rate": 0.0002, "loss": 1.2977211475372314, "mean_token_accuracy": 0.6365498602390289, "num_tokens": 81360.0, "step": 5 }, { "entropy": 1.268439620733261, "epoch": 0.02245088868101029, "grad_norm": 0.13346025347709656, "learning_rate": 0.0002, "loss": 1.1922200918197632, "mean_token_accuracy": 0.6591676026582718, "num_tokens": 98033.0, "step": 6 }, { "entropy": 1.187461495399475, "epoch": 0.026192703461178673, "grad_norm": 0.10905587673187256, "learning_rate": 0.0002, "loss": 1.090636134147644, "mean_token_accuracy": 0.6683961004018784, "num_tokens": 114410.0, "step": 7 }, { "entropy": 1.1027202904224396, "epoch": 0.029934518241347054, "grad_norm": 0.10468754172325134, "learning_rate": 0.0002, "loss": 1.0090222358703613, "mean_token_accuracy": 0.6826278865337372, "num_tokens": 130663.0, "step": 8 }, { "entropy": 1.0241433680057526, "epoch": 0.03367633302151544, "grad_norm": 0.13387203216552734, "learning_rate": 0.0002, "loss": 0.9953913688659668, "mean_token_accuracy": 0.6843951940536499, "num_tokens": 147024.0, "step": 9 }, { "entropy": 1.0002675652503967, "epoch": 0.037418147801683815, "grad_norm": 0.1420045644044876, "learning_rate": 0.0002, "loss": 0.9541152119636536, "mean_token_accuracy": 0.6879138201475143, "num_tokens": 163186.0, "step": 10 }, { "entropy": 0.9888490438461304, "epoch": 0.0411599625818522, "grad_norm": 0.10480759292840958, "learning_rate": 0.0002, "loss": 0.8834772706031799, "mean_token_accuracy": 0.7008452415466309, "num_tokens": 179486.0, "step": 11 }, { "entropy": 0.9587634801864624, "epoch": 0.04490177736202058, "grad_norm": 0.1189962700009346, "learning_rate": 0.0002, "loss": 0.8404299020767212, "mean_token_accuracy": 0.7084675431251526, "num_tokens": 195940.0, "step": 12 }, { "entropy": 0.8834698051214218, "epoch": 0.04864359214218896, "grad_norm": 0.1070038452744484, "learning_rate": 0.0002, "loss": 0.816959798336029, "mean_token_accuracy": 0.7068669199943542, "num_tokens": 212384.0, "step": 13 }, { "entropy": 0.7648728787899017, "epoch": 0.052385406922357346, "grad_norm": 1.0202980041503906, "learning_rate": 0.0002, "loss": 0.7703532576560974, "mean_token_accuracy": 0.721884474158287, "num_tokens": 228462.0, "step": 14 }, { "entropy": 0.7483080476522446, "epoch": 0.05612722170252572, "grad_norm": 0.12461339682340622, "learning_rate": 0.0002, "loss": 0.745843231678009, "mean_token_accuracy": 0.7246550768613815, "num_tokens": 244599.0, "step": 15 }, { "entropy": 0.7499705106019974, "epoch": 0.05986903648269411, "grad_norm": 0.13838888704776764, "learning_rate": 0.0002, "loss": 0.7328222990036011, "mean_token_accuracy": 0.7272029221057892, "num_tokens": 261162.0, "step": 16 }, { "entropy": 0.7162831723690033, "epoch": 0.06361085126286249, "grad_norm": 0.0821700468659401, "learning_rate": 0.0002, "loss": 0.700190007686615, "mean_token_accuracy": 0.7368839830160141, "num_tokens": 277513.0, "step": 17 }, { "entropy": 0.66506028175354, "epoch": 0.06735266604303088, "grad_norm": 0.08271524310112, "learning_rate": 0.0002, "loss": 0.6616584062576294, "mean_token_accuracy": 0.7501807361841202, "num_tokens": 293628.0, "step": 18 }, { "entropy": 0.6652649641036987, "epoch": 0.07109448082319925, "grad_norm": 0.10451149940490723, "learning_rate": 0.0002, "loss": 0.6696457266807556, "mean_token_accuracy": 0.7403630912303925, "num_tokens": 309771.0, "step": 19 }, { "entropy": 0.671489492058754, "epoch": 0.07483629560336763, "grad_norm": 0.08111453801393509, "learning_rate": 0.0002, "loss": 0.6523128747940063, "mean_token_accuracy": 0.7449511885643005, "num_tokens": 326252.0, "step": 20 }, { "entropy": 0.6829328835010529, "epoch": 0.07857811038353602, "grad_norm": 0.07855828106403351, "learning_rate": 0.0002, "loss": 0.6548086404800415, "mean_token_accuracy": 0.7431468367576599, "num_tokens": 342569.0, "step": 21 }, { "entropy": 0.6616033613681793, "epoch": 0.0823199251637044, "grad_norm": 0.07543554902076721, "learning_rate": 0.0002, "loss": 0.6394403576850891, "mean_token_accuracy": 0.7484261393547058, "num_tokens": 359156.0, "step": 22 }, { "entropy": 0.6383623033761978, "epoch": 0.08606173994387278, "grad_norm": 0.07246740162372589, "learning_rate": 0.0002, "loss": 0.6292484998703003, "mean_token_accuracy": 0.7550594955682755, "num_tokens": 375388.0, "step": 23 }, { "entropy": 0.6223422735929489, "epoch": 0.08980355472404115, "grad_norm": 0.08016548305749893, "learning_rate": 0.0002, "loss": 0.6264731884002686, "mean_token_accuracy": 0.7548545002937317, "num_tokens": 391528.0, "step": 24 }, { "entropy": 0.5979716777801514, "epoch": 0.09354536950420954, "grad_norm": 0.07842142134904861, "learning_rate": 0.0002, "loss": 0.6038044691085815, "mean_token_accuracy": 0.764473095536232, "num_tokens": 407673.0, "step": 25 }, { "entropy": 0.5976411253213882, "epoch": 0.09728718428437792, "grad_norm": 0.0749603658914566, "learning_rate": 0.0002, "loss": 0.5980632305145264, "mean_token_accuracy": 0.7644072473049164, "num_tokens": 423781.0, "step": 26 }, { "entropy": 0.5957016050815582, "epoch": 0.10102899906454631, "grad_norm": 0.061034828424453735, "learning_rate": 0.0002, "loss": 0.5909260511398315, "mean_token_accuracy": 0.7682853490114212, "num_tokens": 439927.0, "step": 27 }, { "entropy": 0.6109822690486908, "epoch": 0.10477081384471469, "grad_norm": 0.061578188091516495, "learning_rate": 0.0002, "loss": 0.5998508334159851, "mean_token_accuracy": 0.7658420503139496, "num_tokens": 456218.0, "step": 28 }, { "entropy": 0.601639524102211, "epoch": 0.10851262862488306, "grad_norm": 0.0625869631767273, "learning_rate": 0.0002, "loss": 0.592888355255127, "mean_token_accuracy": 0.7679047584533691, "num_tokens": 472672.0, "step": 29 }, { "entropy": 0.5943656265735626, "epoch": 0.11225444340505145, "grad_norm": 0.05583951249718666, "learning_rate": 0.0002, "loss": 0.5944483280181885, "mean_token_accuracy": 0.7622693479061127, "num_tokens": 489114.0, "step": 30 }, { "entropy": 0.5988462120294571, "epoch": 0.11599625818521983, "grad_norm": 0.0581178143620491, "learning_rate": 0.0002, "loss": 0.6067461967468262, "mean_token_accuracy": 0.7607288658618927, "num_tokens": 505426.0, "step": 31 }, { "entropy": 0.5756160020828247, "epoch": 0.11973807296538821, "grad_norm": 0.05917786434292793, "learning_rate": 0.0002, "loss": 0.5832271575927734, "mean_token_accuracy": 0.770146518945694, "num_tokens": 521632.0, "step": 32 }, { "entropy": 0.5860312879085541, "epoch": 0.1234798877455566, "grad_norm": 0.057717982679605484, "learning_rate": 0.0002, "loss": 0.592366635799408, "mean_token_accuracy": 0.7664856016635895, "num_tokens": 538173.0, "step": 33 }, { "entropy": 0.5932987481355667, "epoch": 0.12722170252572498, "grad_norm": 0.051627833396196365, "learning_rate": 0.0002, "loss": 0.5942224860191345, "mean_token_accuracy": 0.7634450048208237, "num_tokens": 554522.0, "step": 34 }, { "entropy": 0.5781913548707962, "epoch": 0.13096351730589337, "grad_norm": 0.053737979382276535, "learning_rate": 0.0002, "loss": 0.5713843107223511, "mean_token_accuracy": 0.7748462855815887, "num_tokens": 570944.0, "step": 35 }, { "entropy": 0.5928207337856293, "epoch": 0.13470533208606175, "grad_norm": 0.0513126477599144, "learning_rate": 0.0002, "loss": 0.5946991443634033, "mean_token_accuracy": 0.7643233835697174, "num_tokens": 587342.0, "step": 36 }, { "entropy": 0.5689480155706406, "epoch": 0.1384471468662301, "grad_norm": 0.0563691221177578, "learning_rate": 0.0002, "loss": 0.5712450742721558, "mean_token_accuracy": 0.7735907435417175, "num_tokens": 603727.0, "step": 37 }, { "entropy": 0.5871619284152985, "epoch": 0.1421889616463985, "grad_norm": 0.043151870369911194, "learning_rate": 0.0002, "loss": 0.5806025862693787, "mean_token_accuracy": 0.768414631485939, "num_tokens": 620304.0, "step": 38 }, { "entropy": 0.5789511501789093, "epoch": 0.14593077642656688, "grad_norm": 0.057180438190698624, "learning_rate": 0.0002, "loss": 0.5829247832298279, "mean_token_accuracy": 0.7660035490989685, "num_tokens": 636613.0, "step": 39 }, { "entropy": 0.5511189699172974, "epoch": 0.14967259120673526, "grad_norm": 0.04785468429327011, "learning_rate": 0.0002, "loss": 0.5596879124641418, "mean_token_accuracy": 0.7737152278423309, "num_tokens": 652836.0, "step": 40 }, { "entropy": 0.5728544592857361, "epoch": 0.15341440598690365, "grad_norm": 0.047032520174980164, "learning_rate": 0.0002, "loss": 0.5756531953811646, "mean_token_accuracy": 0.7682489305734634, "num_tokens": 669348.0, "step": 41 }, { "entropy": 0.5809888541698456, "epoch": 0.15715622076707203, "grad_norm": 0.04996408522129059, "learning_rate": 0.0002, "loss": 0.5856860280036926, "mean_token_accuracy": 0.7646850347518921, "num_tokens": 685771.0, "step": 42 }, { "entropy": 0.5943491905927658, "epoch": 0.16089803554724041, "grad_norm": 0.04490286856889725, "learning_rate": 0.0002, "loss": 0.5864270329475403, "mean_token_accuracy": 0.7636495530605316, "num_tokens": 702211.0, "step": 43 }, { "entropy": 0.5895421206951141, "epoch": 0.1646398503274088, "grad_norm": 0.051186852157115936, "learning_rate": 0.0002, "loss": 0.5863322019577026, "mean_token_accuracy": 0.7648472040891647, "num_tokens": 718539.0, "step": 44 }, { "entropy": 0.573004424571991, "epoch": 0.16838166510757718, "grad_norm": 0.044179223477840424, "learning_rate": 0.0002, "loss": 0.5632967352867126, "mean_token_accuracy": 0.7742049247026443, "num_tokens": 734943.0, "step": 45 }, { "entropy": 0.5616976916790009, "epoch": 0.17212347988774557, "grad_norm": 0.04744846373796463, "learning_rate": 0.0002, "loss": 0.5611750483512878, "mean_token_accuracy": 0.7748160660266876, "num_tokens": 751206.0, "step": 46 }, { "entropy": 0.5663218796253204, "epoch": 0.17586529466791395, "grad_norm": 0.05421765521168709, "learning_rate": 0.0002, "loss": 0.5719538927078247, "mean_token_accuracy": 0.7716761082410812, "num_tokens": 767602.0, "step": 47 }, { "entropy": 0.5845721065998077, "epoch": 0.1796071094480823, "grad_norm": 0.04122321680188179, "learning_rate": 0.0002, "loss": 0.5887588858604431, "mean_token_accuracy": 0.7646526545286179, "num_tokens": 784029.0, "step": 48 }, { "entropy": 0.5674261897802353, "epoch": 0.1833489242282507, "grad_norm": 0.05335045978426933, "learning_rate": 0.0002, "loss": 0.5763436555862427, "mean_token_accuracy": 0.7674090713262558, "num_tokens": 800207.0, "step": 49 }, { "entropy": 0.5922754108905792, "epoch": 0.18709073900841908, "grad_norm": 0.04774358496069908, "learning_rate": 0.0002, "loss": 0.592854917049408, "mean_token_accuracy": 0.7636804282665253, "num_tokens": 816757.0, "step": 50 }, { "entropy": 0.5675703585147858, "epoch": 0.19083255378858746, "grad_norm": 0.046180881559848785, "learning_rate": 0.0002, "loss": 0.5643646121025085, "mean_token_accuracy": 0.7744234651327133, "num_tokens": 833143.0, "step": 51 }, { "entropy": 0.5735020041465759, "epoch": 0.19457436856875585, "grad_norm": 0.04306558147072792, "learning_rate": 0.0002, "loss": 0.5688086748123169, "mean_token_accuracy": 0.7720673680305481, "num_tokens": 849533.0, "step": 52 }, { "entropy": 0.5725302696228027, "epoch": 0.19831618334892423, "grad_norm": 0.044849518686532974, "learning_rate": 0.0002, "loss": 0.5705700516700745, "mean_token_accuracy": 0.7675163745880127, "num_tokens": 865711.0, "step": 53 }, { "entropy": 0.568488135933876, "epoch": 0.20205799812909261, "grad_norm": 0.03932643309235573, "learning_rate": 0.0002, "loss": 0.5707889795303345, "mean_token_accuracy": 0.7687725275754929, "num_tokens": 882150.0, "step": 54 }, { "entropy": 0.5733406245708466, "epoch": 0.205799812909261, "grad_norm": 0.044968072324991226, "learning_rate": 0.0002, "loss": 0.5740039348602295, "mean_token_accuracy": 0.7688336670398712, "num_tokens": 898618.0, "step": 55 }, { "entropy": 0.5666982084512711, "epoch": 0.20954162768942938, "grad_norm": 0.03931398317217827, "learning_rate": 0.0002, "loss": 0.5738785266876221, "mean_token_accuracy": 0.7679219394922256, "num_tokens": 914939.0, "step": 56 }, { "entropy": 0.5663618296384811, "epoch": 0.21328344246959777, "grad_norm": 0.0373641774058342, "learning_rate": 0.0002, "loss": 0.5636038780212402, "mean_token_accuracy": 0.7741107642650604, "num_tokens": 931291.0, "step": 57 }, { "entropy": 0.557570144534111, "epoch": 0.21702525724976612, "grad_norm": 0.04060584679245949, "learning_rate": 0.0002, "loss": 0.5589414238929749, "mean_token_accuracy": 0.7753962129354477, "num_tokens": 947611.0, "step": 58 }, { "entropy": 0.5627644211053848, "epoch": 0.2207670720299345, "grad_norm": 0.037169281393289566, "learning_rate": 0.0002, "loss": 0.5654425621032715, "mean_token_accuracy": 0.7718145698308945, "num_tokens": 963820.0, "step": 59 }, { "entropy": 0.58712999522686, "epoch": 0.2245088868101029, "grad_norm": 0.03782787546515465, "learning_rate": 0.0002, "loss": 0.5898170471191406, "mean_token_accuracy": 0.7635077238082886, "num_tokens": 980402.0, "step": 60 }, { "entropy": 0.5586348623037338, "epoch": 0.22825070159027128, "grad_norm": 0.03953346982598305, "learning_rate": 0.0002, "loss": 0.5562594532966614, "mean_token_accuracy": 0.7752978503704071, "num_tokens": 996502.0, "step": 61 }, { "entropy": 0.5691598951816559, "epoch": 0.23199251637043966, "grad_norm": 0.04252421110868454, "learning_rate": 0.0002, "loss": 0.5684412717819214, "mean_token_accuracy": 0.7712201923131943, "num_tokens": 1012676.0, "step": 62 }, { "entropy": 0.5714918673038483, "epoch": 0.23573433115060805, "grad_norm": 0.036386385560035706, "learning_rate": 0.0002, "loss": 0.5729389190673828, "mean_token_accuracy": 0.768106073141098, "num_tokens": 1028906.0, "step": 63 }, { "entropy": 0.5666227042675018, "epoch": 0.23947614593077643, "grad_norm": 0.037684470415115356, "learning_rate": 0.0002, "loss": 0.5600223541259766, "mean_token_accuracy": 0.7734655141830444, "num_tokens": 1045328.0, "step": 64 }, { "entropy": 0.5651632696390152, "epoch": 0.2432179607109448, "grad_norm": 0.03333243355154991, "learning_rate": 0.0002, "loss": 0.5639563798904419, "mean_token_accuracy": 0.771888479590416, "num_tokens": 1061791.0, "step": 65 }, { "entropy": 0.5851249843835831, "epoch": 0.2469597754911132, "grad_norm": 0.04036445543169975, "learning_rate": 0.0002, "loss": 0.5847532749176025, "mean_token_accuracy": 0.7656708210706711, "num_tokens": 1078293.0, "step": 66 }, { "entropy": 0.5670823901891708, "epoch": 0.2507015902712816, "grad_norm": 0.04222024604678154, "learning_rate": 0.0002, "loss": 0.5660995244979858, "mean_token_accuracy": 0.7720949500799179, "num_tokens": 1094672.0, "step": 67 }, { "entropy": 0.581654280424118, "epoch": 0.25444340505144997, "grad_norm": 0.03967028483748436, "learning_rate": 0.0002, "loss": 0.5889865159988403, "mean_token_accuracy": 0.760698065161705, "num_tokens": 1111068.0, "step": 68 }, { "entropy": 0.5533672720193863, "epoch": 0.25818521983161835, "grad_norm": 0.03658512607216835, "learning_rate": 0.0002, "loss": 0.5615257024765015, "mean_token_accuracy": 0.7765155285596848, "num_tokens": 1127289.0, "step": 69 }, { "entropy": 0.5607704222202301, "epoch": 0.26192703461178674, "grad_norm": 0.0379711352288723, "learning_rate": 0.0002, "loss": 0.5662075281143188, "mean_token_accuracy": 0.7751724272966385, "num_tokens": 1143569.0, "step": 70 }, { "entropy": 0.5778918713331223, "epoch": 0.2656688493919551, "grad_norm": 0.038288865238428116, "learning_rate": 0.0002, "loss": 0.5817552804946899, "mean_token_accuracy": 0.7655211091041565, "num_tokens": 1159646.0, "step": 71 }, { "entropy": 0.573161169886589, "epoch": 0.2694106641721235, "grad_norm": 0.038547221571207047, "learning_rate": 0.0002, "loss": 0.5695617198944092, "mean_token_accuracy": 0.7739016711711884, "num_tokens": 1175923.0, "step": 72 }, { "entropy": 0.5844559669494629, "epoch": 0.2731524789522919, "grad_norm": 0.03487812727689743, "learning_rate": 0.0002, "loss": 0.5778559446334839, "mean_token_accuracy": 0.7675636559724808, "num_tokens": 1192471.0, "step": 73 }, { "entropy": 0.578565388917923, "epoch": 0.2768942937324602, "grad_norm": 0.03859493136405945, "learning_rate": 0.0002, "loss": 0.5707017779350281, "mean_token_accuracy": 0.7693561762571335, "num_tokens": 1208749.0, "step": 74 }, { "entropy": 0.5591824799776077, "epoch": 0.2806361085126286, "grad_norm": 0.03378773108124733, "learning_rate": 0.0002, "loss": 0.557567298412323, "mean_token_accuracy": 0.7764061838388443, "num_tokens": 1224922.0, "step": 75 }, { "entropy": 0.568041980266571, "epoch": 0.284377923292797, "grad_norm": 0.03862875699996948, "learning_rate": 0.0002, "loss": 0.570695698261261, "mean_token_accuracy": 0.7686833739280701, "num_tokens": 1241294.0, "step": 76 }, { "entropy": 0.5530785471200943, "epoch": 0.28811973807296537, "grad_norm": 0.03997069224715233, "learning_rate": 0.0002, "loss": 0.5623512268066406, "mean_token_accuracy": 0.7745240479707718, "num_tokens": 1257616.0, "step": 77 }, { "entropy": 0.5595529079437256, "epoch": 0.29186155285313375, "grad_norm": 0.03598308190703392, "learning_rate": 0.0002, "loss": 0.5686611533164978, "mean_token_accuracy": 0.7718778103590012, "num_tokens": 1274217.0, "step": 78 }, { "entropy": 0.5654617100954056, "epoch": 0.29560336763330214, "grad_norm": 0.03698718175292015, "learning_rate": 0.0002, "loss": 0.5718352794647217, "mean_token_accuracy": 0.7710111141204834, "num_tokens": 1290502.0, "step": 79 }, { "entropy": 0.5769922882318497, "epoch": 0.2993451824134705, "grad_norm": 0.03608345612883568, "learning_rate": 0.0002, "loss": 0.5771495699882507, "mean_token_accuracy": 0.7671397477388382, "num_tokens": 1307057.0, "step": 80 }, { "entropy": 0.5775998532772064, "epoch": 0.3030869971936389, "grad_norm": 0.04129846766591072, "learning_rate": 0.0002, "loss": 0.5648953318595886, "mean_token_accuracy": 0.7740987688302994, "num_tokens": 1323158.0, "step": 81 }, { "entropy": 0.578661784529686, "epoch": 0.3068288119738073, "grad_norm": 0.04035583510994911, "learning_rate": 0.0002, "loss": 0.572229266166687, "mean_token_accuracy": 0.769649401307106, "num_tokens": 1339671.0, "step": 82 }, { "entropy": 0.5630823224782944, "epoch": 0.3105706267539757, "grad_norm": 0.035164687782526016, "learning_rate": 0.0002, "loss": 0.5634369254112244, "mean_token_accuracy": 0.7725345641374588, "num_tokens": 1355922.0, "step": 83 }, { "entropy": 0.5712268948554993, "epoch": 0.31431244153414406, "grad_norm": 0.038266371935606, "learning_rate": 0.0002, "loss": 0.5790088772773743, "mean_token_accuracy": 0.7660410851240158, "num_tokens": 1372241.0, "step": 84 }, { "entropy": 0.5503551959991455, "epoch": 0.31805425631431244, "grad_norm": 0.04355614632368088, "learning_rate": 0.0002, "loss": 0.5594754815101624, "mean_token_accuracy": 0.7743213176727295, "num_tokens": 1388447.0, "step": 85 }, { "entropy": 0.5567754805088043, "epoch": 0.32179607109448083, "grad_norm": 0.034040167927742004, "learning_rate": 0.0002, "loss": 0.5562305450439453, "mean_token_accuracy": 0.7782892882823944, "num_tokens": 1404595.0, "step": 86 }, { "entropy": 0.5897853374481201, "epoch": 0.3255378858746492, "grad_norm": 0.04141312837600708, "learning_rate": 0.0002, "loss": 0.5811256766319275, "mean_token_accuracy": 0.7645350694656372, "num_tokens": 1421046.0, "step": 87 }, { "entropy": 0.5651004612445831, "epoch": 0.3292797006548176, "grad_norm": 0.039186883717775345, "learning_rate": 0.0002, "loss": 0.5626670122146606, "mean_token_accuracy": 0.771001011133194, "num_tokens": 1437307.0, "step": 88 }, { "entropy": 0.5479820519685745, "epoch": 0.333021515434986, "grad_norm": 0.038090839982032776, "learning_rate": 0.0002, "loss": 0.5517987012863159, "mean_token_accuracy": 0.7779913991689682, "num_tokens": 1453625.0, "step": 89 }, { "entropy": 0.5513372272253036, "epoch": 0.33676333021515437, "grad_norm": 0.033073123544454575, "learning_rate": 0.0002, "loss": 0.5521109700202942, "mean_token_accuracy": 0.7770368456840515, "num_tokens": 1470001.0, "step": 90 }, { "entropy": 0.5538579821586609, "epoch": 0.34050514499532275, "grad_norm": 0.03432928025722504, "learning_rate": 0.0002, "loss": 0.5595468878746033, "mean_token_accuracy": 0.7756330221891403, "num_tokens": 1486202.0, "step": 91 }, { "entropy": 0.5441462099552155, "epoch": 0.34424695977549113, "grad_norm": 0.03260473906993866, "learning_rate": 0.0002, "loss": 0.5527001023292542, "mean_token_accuracy": 0.7777194529771805, "num_tokens": 1502337.0, "step": 92 }, { "entropy": 0.5642740428447723, "epoch": 0.3479887745556595, "grad_norm": 0.041720353066921234, "learning_rate": 0.0002, "loss": 0.5752084255218506, "mean_token_accuracy": 0.7667101472616196, "num_tokens": 1518821.0, "step": 93 }, { "entropy": 0.565082237124443, "epoch": 0.3517305893358279, "grad_norm": 0.03507543355226517, "learning_rate": 0.0002, "loss": 0.5699793696403503, "mean_token_accuracy": 0.770054817199707, "num_tokens": 1535163.0, "step": 94 }, { "entropy": 0.5870088040828705, "epoch": 0.35547240411599623, "grad_norm": 0.034236419945955276, "learning_rate": 0.0002, "loss": 0.5850114226341248, "mean_token_accuracy": 0.7608266621828079, "num_tokens": 1551565.0, "step": 95 }, { "entropy": 0.5530053824186325, "epoch": 0.3592142188961646, "grad_norm": 0.03369399905204773, "learning_rate": 0.0002, "loss": 0.5534529685974121, "mean_token_accuracy": 0.7759882658720016, "num_tokens": 1567750.0, "step": 96 }, { "entropy": 0.5754924863576889, "epoch": 0.362956033676333, "grad_norm": 0.036406002938747406, "learning_rate": 0.0002, "loss": 0.5705168843269348, "mean_token_accuracy": 0.7698172330856323, "num_tokens": 1584023.0, "step": 97 }, { "entropy": 0.5771925449371338, "epoch": 0.3666978484565014, "grad_norm": 0.032233767211437225, "learning_rate": 0.0002, "loss": 0.5738174319267273, "mean_token_accuracy": 0.7679109573364258, "num_tokens": 1600377.0, "step": 98 }, { "entropy": 0.566839799284935, "epoch": 0.37043966323666977, "grad_norm": 0.029388124123215675, "learning_rate": 0.0002, "loss": 0.5624303817749023, "mean_token_accuracy": 0.771264523267746, "num_tokens": 1616664.0, "step": 99 }, { "entropy": 0.5605880320072174, "epoch": 0.37418147801683815, "grad_norm": 0.034897759556770325, "learning_rate": 0.0002, "loss": 0.5609456896781921, "mean_token_accuracy": 0.7745639681816101, "num_tokens": 1632981.0, "step": 100 }, { "entropy": 0.5694979727268219, "epoch": 0.37792329279700654, "grad_norm": 0.03481722250580788, "learning_rate": 0.0002, "loss": 0.5728567838668823, "mean_token_accuracy": 0.7689409404993057, "num_tokens": 1649432.0, "step": 101 }, { "entropy": 0.5804490298032761, "epoch": 0.3816651075771749, "grad_norm": 0.03589940071105957, "learning_rate": 0.0002, "loss": 0.5847839713096619, "mean_token_accuracy": 0.7632083743810654, "num_tokens": 1666031.0, "step": 102 }, { "entropy": 0.5580839961767197, "epoch": 0.3854069223573433, "grad_norm": 0.031488265842199326, "learning_rate": 0.0002, "loss": 0.5667596459388733, "mean_token_accuracy": 0.7720794081687927, "num_tokens": 1682406.0, "step": 103 }, { "entropy": 0.5474104434251785, "epoch": 0.3891487371375117, "grad_norm": 0.03187083452939987, "learning_rate": 0.0002, "loss": 0.5499236583709717, "mean_token_accuracy": 0.7772009670734406, "num_tokens": 1698795.0, "step": 104 }, { "entropy": 0.5527014136314392, "epoch": 0.3928905519176801, "grad_norm": 0.03492984548211098, "learning_rate": 0.0002, "loss": 0.5512747168540955, "mean_token_accuracy": 0.776108130812645, "num_tokens": 1715480.0, "step": 105 }, { "entropy": 0.579165443778038, "epoch": 0.39663236669784846, "grad_norm": 0.03257554769515991, "learning_rate": 0.0002, "loss": 0.5810192823410034, "mean_token_accuracy": 0.7663566768169403, "num_tokens": 1731889.0, "step": 106 }, { "entropy": 0.5633712112903595, "epoch": 0.40037418147801684, "grad_norm": 0.03179244324564934, "learning_rate": 0.0002, "loss": 0.5622086524963379, "mean_token_accuracy": 0.7680526524782181, "num_tokens": 1748318.0, "step": 107 }, { "entropy": 0.5600844174623489, "epoch": 0.40411599625818523, "grad_norm": 0.029808223247528076, "learning_rate": 0.0002, "loss": 0.5606282949447632, "mean_token_accuracy": 0.7708232551813126, "num_tokens": 1764619.0, "step": 108 }, { "entropy": 0.5492478907108307, "epoch": 0.4078578110383536, "grad_norm": 0.031120680272579193, "learning_rate": 0.0002, "loss": 0.5484419465065002, "mean_token_accuracy": 0.775683268904686, "num_tokens": 1780851.0, "step": 109 }, { "entropy": 0.5517283380031586, "epoch": 0.411599625818522, "grad_norm": 0.03694352135062218, "learning_rate": 0.0002, "loss": 0.5580882430076599, "mean_token_accuracy": 0.774466261267662, "num_tokens": 1796890.0, "step": 110 }, { "entropy": 0.5656300336122513, "epoch": 0.4153414405986904, "grad_norm": 0.03588038682937622, "learning_rate": 0.0002, "loss": 0.5704593658447266, "mean_token_accuracy": 0.7691588401794434, "num_tokens": 1813404.0, "step": 111 }, { "entropy": 0.564102292060852, "epoch": 0.41908325537885877, "grad_norm": 0.03264907747507095, "learning_rate": 0.0002, "loss": 0.5655107498168945, "mean_token_accuracy": 0.7724602967500687, "num_tokens": 1829724.0, "step": 112 }, { "entropy": 0.5644495040178299, "epoch": 0.42282507015902715, "grad_norm": 0.03256542608141899, "learning_rate": 0.0002, "loss": 0.5646591782569885, "mean_token_accuracy": 0.7743334770202637, "num_tokens": 1846177.0, "step": 113 }, { "entropy": 0.545789897441864, "epoch": 0.42656688493919553, "grad_norm": 0.034160368144512177, "learning_rate": 0.0002, "loss": 0.5457491874694824, "mean_token_accuracy": 0.7793226093053818, "num_tokens": 1862412.0, "step": 114 }, { "entropy": 0.5670842975378036, "epoch": 0.4303086997193639, "grad_norm": 0.02954726107418537, "learning_rate": 0.0002, "loss": 0.5644434690475464, "mean_token_accuracy": 0.7711858153343201, "num_tokens": 1878518.0, "step": 115 }, { "entropy": 0.5647070705890656, "epoch": 0.43405051449953225, "grad_norm": 0.028261123225092888, "learning_rate": 0.0002, "loss": 0.5621106624603271, "mean_token_accuracy": 0.776775136590004, "num_tokens": 1895135.0, "step": 116 }, { "entropy": 0.529420793056488, "epoch": 0.43779232927970063, "grad_norm": 0.03301499783992767, "learning_rate": 0.0002, "loss": 0.536541759967804, "mean_token_accuracy": 0.7836042046546936, "num_tokens": 1911161.0, "step": 117 }, { "entropy": 0.5451334565877914, "epoch": 0.441534144059869, "grad_norm": 0.033271510154008865, "learning_rate": 0.0002, "loss": 0.5523592829704285, "mean_token_accuracy": 0.7769709676504135, "num_tokens": 1927550.0, "step": 118 }, { "entropy": 0.536512017250061, "epoch": 0.4452759588400374, "grad_norm": 0.03425843268632889, "learning_rate": 0.0002, "loss": 0.5380823612213135, "mean_token_accuracy": 0.780797928571701, "num_tokens": 1943788.0, "step": 119 }, { "entropy": 0.536301851272583, "epoch": 0.4490177736202058, "grad_norm": 0.03248719125986099, "learning_rate": 0.0002, "loss": 0.5470737218856812, "mean_token_accuracy": 0.7803975343704224, "num_tokens": 1959878.0, "step": 120 }, { "entropy": 0.5517153441905975, "epoch": 0.45275958840037417, "grad_norm": 0.03530304506421089, "learning_rate": 0.0002, "loss": 0.5577021241188049, "mean_token_accuracy": 0.7733452618122101, "num_tokens": 1976131.0, "step": 121 }, { "entropy": 0.5619277656078339, "epoch": 0.45650140318054255, "grad_norm": 0.03460797667503357, "learning_rate": 0.0002, "loss": 0.5516164898872375, "mean_token_accuracy": 0.7756523787975311, "num_tokens": 1992627.0, "step": 122 }, { "entropy": 0.5761916935443878, "epoch": 0.46024321796071094, "grad_norm": 0.03172283619642258, "learning_rate": 0.0002, "loss": 0.571029543876648, "mean_token_accuracy": 0.7667981088161469, "num_tokens": 2009019.0, "step": 123 }, { "entropy": 0.5743123888969421, "epoch": 0.4639850327408793, "grad_norm": 0.0364689975976944, "learning_rate": 0.0002, "loss": 0.5712283849716187, "mean_token_accuracy": 0.7701593190431595, "num_tokens": 2025188.0, "step": 124 }, { "entropy": 0.5582910478115082, "epoch": 0.4677268475210477, "grad_norm": 0.03056769073009491, "learning_rate": 0.0002, "loss": 0.56070876121521, "mean_token_accuracy": 0.7755492180585861, "num_tokens": 2041572.0, "step": 125 }, { "entropy": 0.5542439967393875, "epoch": 0.4714686623012161, "grad_norm": 0.03697546571493149, "learning_rate": 0.0002, "loss": 0.5604549646377563, "mean_token_accuracy": 0.7751918882131577, "num_tokens": 2057989.0, "step": 126 }, { "entropy": 0.5463303178548813, "epoch": 0.4752104770813845, "grad_norm": 0.033879246562719345, "learning_rate": 0.0002, "loss": 0.5539431571960449, "mean_token_accuracy": 0.7758707851171494, "num_tokens": 2074129.0, "step": 127 }, { "entropy": 0.5522827506065369, "epoch": 0.47895229186155286, "grad_norm": 0.03316348418593407, "learning_rate": 0.0002, "loss": 0.5581960082054138, "mean_token_accuracy": 0.7748778462409973, "num_tokens": 2090225.0, "step": 128 }, { "entropy": 0.5740112662315369, "epoch": 0.48269410664172124, "grad_norm": 0.03274102881550789, "learning_rate": 0.0002, "loss": 0.5653910040855408, "mean_token_accuracy": 0.7719868570566177, "num_tokens": 2106644.0, "step": 129 }, { "entropy": 0.5553925186395645, "epoch": 0.4864359214218896, "grad_norm": 0.028283284977078438, "learning_rate": 0.0002, "loss": 0.5513849258422852, "mean_token_accuracy": 0.7774856984615326, "num_tokens": 2123137.0, "step": 130 }, { "entropy": 0.5579676181077957, "epoch": 0.490177736202058, "grad_norm": 0.029911885038018227, "learning_rate": 0.0002, "loss": 0.5568463802337646, "mean_token_accuracy": 0.7730498015880585, "num_tokens": 2139285.0, "step": 131 }, { "entropy": 0.5664242058992386, "epoch": 0.4939195509822264, "grad_norm": 0.03227100148797035, "learning_rate": 0.0002, "loss": 0.5754393339157104, "mean_token_accuracy": 0.7667475491762161, "num_tokens": 2155517.0, "step": 132 }, { "entropy": 0.5501858294010162, "epoch": 0.4976613657623948, "grad_norm": 0.03013962134718895, "learning_rate": 0.0002, "loss": 0.5513433218002319, "mean_token_accuracy": 0.7747298777103424, "num_tokens": 2171722.0, "step": 133 }, { "entropy": 0.5627453327178955, "epoch": 0.5014031805425632, "grad_norm": 0.034450363367795944, "learning_rate": 0.0002, "loss": 0.5604255199432373, "mean_token_accuracy": 0.7740208506584167, "num_tokens": 2188054.0, "step": 134 }, { "entropy": 0.5634363293647766, "epoch": 0.5051449953227315, "grad_norm": 0.03803717717528343, "learning_rate": 0.0002, "loss": 0.558170735836029, "mean_token_accuracy": 0.7775739133358002, "num_tokens": 2204313.0, "step": 135 }, { "entropy": 0.5590767562389374, "epoch": 0.5088868101028999, "grad_norm": 0.029813330620527267, "learning_rate": 0.0002, "loss": 0.5652009844779968, "mean_token_accuracy": 0.7706311643123627, "num_tokens": 2220687.0, "step": 136 }, { "entropy": 0.5706852972507477, "epoch": 0.5126286248830683, "grad_norm": 0.0418686643242836, "learning_rate": 0.0002, "loss": 0.5734685063362122, "mean_token_accuracy": 0.7665899097919464, "num_tokens": 2237258.0, "step": 137 }, { "entropy": 0.5638300180435181, "epoch": 0.5163704396632367, "grad_norm": 0.03304136171936989, "learning_rate": 0.0002, "loss": 0.5663323402404785, "mean_token_accuracy": 0.7701692581176758, "num_tokens": 2253553.0, "step": 138 }, { "entropy": 0.5560389012098312, "epoch": 0.520112254443405, "grad_norm": 0.032340649515390396, "learning_rate": 0.0002, "loss": 0.5557302832603455, "mean_token_accuracy": 0.7773910611867905, "num_tokens": 2269787.0, "step": 139 }, { "entropy": 0.5491623729467392, "epoch": 0.5238540692235735, "grad_norm": 0.03743594512343407, "learning_rate": 0.0002, "loss": 0.5475925803184509, "mean_token_accuracy": 0.7796913385391235, "num_tokens": 2286052.0, "step": 140 }, { "entropy": 0.5624114125967026, "epoch": 0.5275958840037418, "grad_norm": 0.03084268979728222, "learning_rate": 0.0002, "loss": 0.5612790584564209, "mean_token_accuracy": 0.7745496481657028, "num_tokens": 2302516.0, "step": 141 }, { "entropy": 0.5638779103755951, "epoch": 0.5313376987839102, "grad_norm": 0.02851773053407669, "learning_rate": 0.0002, "loss": 0.568551778793335, "mean_token_accuracy": 0.7703356891870499, "num_tokens": 2318761.0, "step": 142 }, { "entropy": 0.5524759441614151, "epoch": 0.5350795135640786, "grad_norm": 0.03449970856308937, "learning_rate": 0.0002, "loss": 0.5582625865936279, "mean_token_accuracy": 0.7745357155799866, "num_tokens": 2335227.0, "step": 143 }, { "entropy": 0.5538729876279831, "epoch": 0.538821328344247, "grad_norm": 0.036926597356796265, "learning_rate": 0.0002, "loss": 0.5551813840866089, "mean_token_accuracy": 0.7734793871641159, "num_tokens": 2351743.0, "step": 144 }, { "entropy": 0.556109830737114, "epoch": 0.5425631431244153, "grad_norm": 0.032143596559762955, "learning_rate": 0.0002, "loss": 0.5621770620346069, "mean_token_accuracy": 0.7720111310482025, "num_tokens": 2368312.0, "step": 145 }, { "entropy": 0.5528390407562256, "epoch": 0.5463049579045838, "grad_norm": 0.027878830209374428, "learning_rate": 0.0002, "loss": 0.551728367805481, "mean_token_accuracy": 0.7765467911958694, "num_tokens": 2384834.0, "step": 146 }, { "entropy": 0.569217711687088, "epoch": 0.5500467726847521, "grad_norm": 0.03398638963699341, "learning_rate": 0.0002, "loss": 0.5663697123527527, "mean_token_accuracy": 0.7732102274894714, "num_tokens": 2401144.0, "step": 147 }, { "entropy": 0.5385106950998306, "epoch": 0.5537885874649204, "grad_norm": 0.034567005932331085, "learning_rate": 0.0002, "loss": 0.5383309721946716, "mean_token_accuracy": 0.781255692243576, "num_tokens": 2417158.0, "step": 148 }, { "entropy": 0.5630964189767838, "epoch": 0.5575304022450889, "grad_norm": 0.029897838830947876, "learning_rate": 0.0002, "loss": 0.5677754282951355, "mean_token_accuracy": 0.7685458660125732, "num_tokens": 2433487.0, "step": 149 }, { "entropy": 0.5507898777723312, "epoch": 0.5612722170252572, "grad_norm": 0.02974529378116131, "learning_rate": 0.0002, "loss": 0.5534771680831909, "mean_token_accuracy": 0.7748892605304718, "num_tokens": 2449770.0, "step": 150 }, { "entropy": 0.5639528781175613, "epoch": 0.5650140318054256, "grad_norm": 0.03235238045454025, "learning_rate": 0.0002, "loss": 0.5681154131889343, "mean_token_accuracy": 0.7700216770172119, "num_tokens": 2466229.0, "step": 151 }, { "entropy": 0.5683706551790237, "epoch": 0.568755846585594, "grad_norm": 0.028963793069124222, "learning_rate": 0.0002, "loss": 0.569283127784729, "mean_token_accuracy": 0.7688962519168854, "num_tokens": 2482737.0, "step": 152 }, { "entropy": 0.5595172494649887, "epoch": 0.5724976613657624, "grad_norm": 0.02971002459526062, "learning_rate": 0.0002, "loss": 0.5543393492698669, "mean_token_accuracy": 0.7762883901596069, "num_tokens": 2499145.0, "step": 153 }, { "entropy": 0.55421943962574, "epoch": 0.5762394761459307, "grad_norm": 0.030361918732523918, "learning_rate": 0.0002, "loss": 0.5593795776367188, "mean_token_accuracy": 0.7707612812519073, "num_tokens": 2515460.0, "step": 154 }, { "entropy": 0.5604497343301773, "epoch": 0.5799812909260992, "grad_norm": 0.03249987214803696, "learning_rate": 0.0002, "loss": 0.559572696685791, "mean_token_accuracy": 0.7736714631319046, "num_tokens": 2531731.0, "step": 155 }, { "entropy": 0.5572012811899185, "epoch": 0.5837231057062675, "grad_norm": 0.028877906501293182, "learning_rate": 0.0002, "loss": 0.5557632446289062, "mean_token_accuracy": 0.7749307751655579, "num_tokens": 2547934.0, "step": 156 }, { "entropy": 0.5711070001125336, "epoch": 0.587464920486436, "grad_norm": 0.030351407825946808, "learning_rate": 0.0002, "loss": 0.5682122707366943, "mean_token_accuracy": 0.7715558409690857, "num_tokens": 2564252.0, "step": 157 }, { "entropy": 0.5656052529811859, "epoch": 0.5912067352666043, "grad_norm": 0.029292697086930275, "learning_rate": 0.0002, "loss": 0.5643728375434875, "mean_token_accuracy": 0.7730299234390259, "num_tokens": 2580465.0, "step": 158 }, { "entropy": 0.5565295219421387, "epoch": 0.5949485500467727, "grad_norm": 0.028714049607515335, "learning_rate": 0.0002, "loss": 0.5634271502494812, "mean_token_accuracy": 0.7702697217464447, "num_tokens": 2596985.0, "step": 159 }, { "entropy": 0.5631282031536102, "epoch": 0.598690364826941, "grad_norm": 0.030091576278209686, "learning_rate": 0.0002, "loss": 0.5721826553344727, "mean_token_accuracy": 0.7689475417137146, "num_tokens": 2613206.0, "step": 160 }, { "entropy": 0.5607286393642426, "epoch": 0.6024321796071095, "grad_norm": 0.03013305738568306, "learning_rate": 0.0002, "loss": 0.5609285235404968, "mean_token_accuracy": 0.7740870416164398, "num_tokens": 2629766.0, "step": 161 }, { "entropy": 0.5548760294914246, "epoch": 0.6061739943872778, "grad_norm": 0.03615036979317665, "learning_rate": 0.0002, "loss": 0.561907172203064, "mean_token_accuracy": 0.7704312056303024, "num_tokens": 2645841.0, "step": 162 }, { "entropy": 0.5578597337007523, "epoch": 0.6099158091674463, "grad_norm": 0.029693420976400375, "learning_rate": 0.0002, "loss": 0.5573199391365051, "mean_token_accuracy": 0.7728497833013535, "num_tokens": 2662175.0, "step": 163 }, { "entropy": 0.5612762272357941, "epoch": 0.6136576239476146, "grad_norm": 0.030115241184830666, "learning_rate": 0.0002, "loss": 0.5610560178756714, "mean_token_accuracy": 0.7720479369163513, "num_tokens": 2678456.0, "step": 164 }, { "entropy": 0.5692281126976013, "epoch": 0.617399438727783, "grad_norm": 0.030713427811861038, "learning_rate": 0.0002, "loss": 0.567272961139679, "mean_token_accuracy": 0.7701284140348434, "num_tokens": 2694886.0, "step": 165 }, { "entropy": 0.5571814477443695, "epoch": 0.6211412535079514, "grad_norm": 0.030081165954470634, "learning_rate": 0.0002, "loss": 0.5578005313873291, "mean_token_accuracy": 0.7734847068786621, "num_tokens": 2711066.0, "step": 166 }, { "entropy": 0.5701806098222733, "epoch": 0.6248830682881198, "grad_norm": 0.024519717320799828, "learning_rate": 0.0002, "loss": 0.5707820057868958, "mean_token_accuracy": 0.765745609998703, "num_tokens": 2727604.0, "step": 167 }, { "entropy": 0.546685203909874, "epoch": 0.6286248830682881, "grad_norm": 0.030948853120207787, "learning_rate": 0.0002, "loss": 0.5538927912712097, "mean_token_accuracy": 0.7749418467283249, "num_tokens": 2743937.0, "step": 168 }, { "entropy": 0.5537951737642288, "epoch": 0.6323666978484564, "grad_norm": 0.03693117946386337, "learning_rate": 0.0002, "loss": 0.5586614608764648, "mean_token_accuracy": 0.7715347409248352, "num_tokens": 2760525.0, "step": 169 }, { "entropy": 0.5430830717086792, "epoch": 0.6361085126286249, "grad_norm": 0.029782412573695183, "learning_rate": 0.0002, "loss": 0.5412864685058594, "mean_token_accuracy": 0.7784539759159088, "num_tokens": 2776721.0, "step": 170 }, { "entropy": 0.5351588726043701, "epoch": 0.6398503274087932, "grad_norm": 0.03263084217905998, "learning_rate": 0.0002, "loss": 0.5388463139533997, "mean_token_accuracy": 0.781808465719223, "num_tokens": 2792933.0, "step": 171 }, { "entropy": 0.5568130016326904, "epoch": 0.6435921421889617, "grad_norm": 0.031154213473200798, "learning_rate": 0.0002, "loss": 0.5626617670059204, "mean_token_accuracy": 0.7720103710889816, "num_tokens": 2809451.0, "step": 172 }, { "entropy": 0.5607169568538666, "epoch": 0.64733395696913, "grad_norm": 0.03371235355734825, "learning_rate": 0.0002, "loss": 0.5647063255310059, "mean_token_accuracy": 0.7718498706817627, "num_tokens": 2825932.0, "step": 173 }, { "entropy": 0.555529311299324, "epoch": 0.6510757717492984, "grad_norm": 0.030816521495580673, "learning_rate": 0.0002, "loss": 0.5564374327659607, "mean_token_accuracy": 0.7758121490478516, "num_tokens": 2842314.0, "step": 174 }, { "entropy": 0.5513110458850861, "epoch": 0.6548175865294668, "grad_norm": 0.02944033220410347, "learning_rate": 0.0002, "loss": 0.5524051189422607, "mean_token_accuracy": 0.77901391685009, "num_tokens": 2858741.0, "step": 175 }, { "entropy": 0.5570909082889557, "epoch": 0.6585594013096352, "grad_norm": 0.030563851818442345, "learning_rate": 0.0002, "loss": 0.552980899810791, "mean_token_accuracy": 0.7785744369029999, "num_tokens": 2874790.0, "step": 176 }, { "entropy": 0.5531197637319565, "epoch": 0.6623012160898035, "grad_norm": 0.026769133284687996, "learning_rate": 0.0002, "loss": 0.5503875017166138, "mean_token_accuracy": 0.7756068855524063, "num_tokens": 2890991.0, "step": 177 }, { "entropy": 0.5576685070991516, "epoch": 0.666043030869972, "grad_norm": 0.031243668869137764, "learning_rate": 0.0002, "loss": 0.5595083236694336, "mean_token_accuracy": 0.7736776024103165, "num_tokens": 2907372.0, "step": 178 }, { "entropy": 0.561943918466568, "epoch": 0.6697848456501403, "grad_norm": 0.029022254049777985, "learning_rate": 0.0002, "loss": 0.5671570301055908, "mean_token_accuracy": 0.7722343951463699, "num_tokens": 2923921.0, "step": 179 }, { "entropy": 0.5484957844018936, "epoch": 0.6735266604303087, "grad_norm": 0.030121706426143646, "learning_rate": 0.0002, "loss": 0.5546964406967163, "mean_token_accuracy": 0.7751270681619644, "num_tokens": 2940247.0, "step": 180 }, { "entropy": 0.554192379117012, "epoch": 0.6772684752104771, "grad_norm": 0.030762923881411552, "learning_rate": 0.0002, "loss": 0.5602478981018066, "mean_token_accuracy": 0.7732126861810684, "num_tokens": 2956527.0, "step": 181 }, { "entropy": 0.5684338361024857, "epoch": 0.6810102899906455, "grad_norm": 0.036885276436805725, "learning_rate": 0.0002, "loss": 0.5655561685562134, "mean_token_accuracy": 0.769650399684906, "num_tokens": 2972654.0, "step": 182 }, { "entropy": 0.5733159780502319, "epoch": 0.6847521047708138, "grad_norm": 0.03168238326907158, "learning_rate": 0.0002, "loss": 0.5698360800743103, "mean_token_accuracy": 0.7700367867946625, "num_tokens": 2989101.0, "step": 183 }, { "entropy": 0.556915819644928, "epoch": 0.6884939195509823, "grad_norm": 0.03091347962617874, "learning_rate": 0.0002, "loss": 0.5448244214057922, "mean_token_accuracy": 0.7791603803634644, "num_tokens": 3005335.0, "step": 184 }, { "entropy": 0.5490943491458893, "epoch": 0.6922357343311506, "grad_norm": 0.032818131148815155, "learning_rate": 0.0002, "loss": 0.5487899780273438, "mean_token_accuracy": 0.7768953591585159, "num_tokens": 3021621.0, "step": 185 }, { "entropy": 0.5296357423067093, "epoch": 0.695977549111319, "grad_norm": 0.03200080245733261, "learning_rate": 0.0002, "loss": 0.5386063456535339, "mean_token_accuracy": 0.7796643227338791, "num_tokens": 3037785.0, "step": 186 }, { "entropy": 0.5606788247823715, "epoch": 0.6997193638914874, "grad_norm": 0.03352601081132889, "learning_rate": 0.0002, "loss": 0.5720128417015076, "mean_token_accuracy": 0.7676278650760651, "num_tokens": 3053806.0, "step": 187 }, { "entropy": 0.5525215566158295, "epoch": 0.7034611786716558, "grad_norm": 0.03217856585979462, "learning_rate": 0.0002, "loss": 0.5599426627159119, "mean_token_accuracy": 0.7706687748432159, "num_tokens": 3070070.0, "step": 188 }, { "entropy": 0.5785647034645081, "epoch": 0.7072029934518241, "grad_norm": 0.03108043409883976, "learning_rate": 0.0002, "loss": 0.5753121376037598, "mean_token_accuracy": 0.7674888074398041, "num_tokens": 3086407.0, "step": 189 }, { "entropy": 0.572156235575676, "epoch": 0.7109448082319925, "grad_norm": 0.036022067070007324, "learning_rate": 0.0002, "loss": 0.5567526817321777, "mean_token_accuracy": 0.7726783901453018, "num_tokens": 3102575.0, "step": 190 }, { "entropy": 0.5531092137098312, "epoch": 0.7146866230121609, "grad_norm": 0.028695300221443176, "learning_rate": 0.0002, "loss": 0.545417070388794, "mean_token_accuracy": 0.7790848612785339, "num_tokens": 3118942.0, "step": 191 }, { "entropy": 0.542072057723999, "epoch": 0.7184284377923292, "grad_norm": 0.02768511138856411, "learning_rate": 0.0002, "loss": 0.5424788594245911, "mean_token_accuracy": 0.7790149599313736, "num_tokens": 3134996.0, "step": 192 }, { "entropy": 0.5440382957458496, "epoch": 0.7221702525724977, "grad_norm": 0.044699691236019135, "learning_rate": 0.0002, "loss": 0.5630879402160645, "mean_token_accuracy": 0.7720867395401001, "num_tokens": 3151144.0, "step": 193 }, { "entropy": 0.5484438389539719, "epoch": 0.725912067352666, "grad_norm": 0.033284809440374374, "learning_rate": 0.0002, "loss": 0.5586625933647156, "mean_token_accuracy": 0.7742896676063538, "num_tokens": 3167431.0, "step": 194 }, { "entropy": 0.5585122853517532, "epoch": 0.7296538821328344, "grad_norm": 0.029940789565443993, "learning_rate": 0.0002, "loss": 0.5640571117401123, "mean_token_accuracy": 0.7736721932888031, "num_tokens": 3183584.0, "step": 195 }, { "entropy": 0.5803828984498978, "epoch": 0.7333956969130028, "grad_norm": 0.03922640532255173, "learning_rate": 0.0002, "loss": 0.5756028294563293, "mean_token_accuracy": 0.7650134712457657, "num_tokens": 3199936.0, "step": 196 }, { "entropy": 0.5695553570985794, "epoch": 0.7371375116931712, "grad_norm": 0.02914128266274929, "learning_rate": 0.0002, "loss": 0.5552971959114075, "mean_token_accuracy": 0.7738740146160126, "num_tokens": 3216327.0, "step": 197 }, { "entropy": 0.5402019023895264, "epoch": 0.7408793264733395, "grad_norm": 0.02753686159849167, "learning_rate": 0.0002, "loss": 0.5362023711204529, "mean_token_accuracy": 0.7808489948511124, "num_tokens": 3232411.0, "step": 198 }, { "entropy": 0.5661509037017822, "epoch": 0.744621141253508, "grad_norm": 0.029173044487833977, "learning_rate": 0.0002, "loss": 0.5666989088058472, "mean_token_accuracy": 0.7697858512401581, "num_tokens": 3248516.0, "step": 199 }, { "entropy": 0.5394262075424194, "epoch": 0.7483629560336763, "grad_norm": 0.03222000226378441, "learning_rate": 0.0002, "loss": 0.5493192076683044, "mean_token_accuracy": 0.7756218761205673, "num_tokens": 3264724.0, "step": 200 }, { "entropy": 0.5624162256717682, "epoch": 0.7521047708138447, "grad_norm": 0.03587524592876434, "learning_rate": 0.0002, "loss": 0.5728610157966614, "mean_token_accuracy": 0.7661173194646835, "num_tokens": 3280953.0, "step": 201 }, { "entropy": 0.5574640333652496, "epoch": 0.7558465855940131, "grad_norm": 0.030263541266322136, "learning_rate": 0.0002, "loss": 0.5545740127563477, "mean_token_accuracy": 0.7747018188238144, "num_tokens": 3297315.0, "step": 202 }, { "entropy": 0.5598777681589127, "epoch": 0.7595884003741815, "grad_norm": 0.0284356027841568, "learning_rate": 0.0002, "loss": 0.5577300190925598, "mean_token_accuracy": 0.7724722474813461, "num_tokens": 3313688.0, "step": 203 }, { "entropy": 0.5658386498689651, "epoch": 0.7633302151543498, "grad_norm": 0.03470136970281601, "learning_rate": 0.0002, "loss": 0.5591439008712769, "mean_token_accuracy": 0.7761197835206985, "num_tokens": 3329826.0, "step": 204 }, { "entropy": 0.5585865527391434, "epoch": 0.7670720299345183, "grad_norm": 0.027583830058574677, "learning_rate": 0.0002, "loss": 0.5561191439628601, "mean_token_accuracy": 0.7717861980199814, "num_tokens": 3346401.0, "step": 205 }, { "entropy": 0.5518056601285934, "epoch": 0.7708138447146866, "grad_norm": 0.034380193799734116, "learning_rate": 0.0002, "loss": 0.56368488073349, "mean_token_accuracy": 0.7690371572971344, "num_tokens": 3362862.0, "step": 206 }, { "entropy": 0.5423950105905533, "epoch": 0.774555659494855, "grad_norm": 0.027748677879571915, "learning_rate": 0.0002, "loss": 0.5500733256340027, "mean_token_accuracy": 0.7782405465841293, "num_tokens": 3379133.0, "step": 207 }, { "entropy": 0.5392836630344391, "epoch": 0.7782974742750234, "grad_norm": 0.030424097552895546, "learning_rate": 0.0002, "loss": 0.5452281832695007, "mean_token_accuracy": 0.7790029048919678, "num_tokens": 3395406.0, "step": 208 }, { "entropy": 0.5665347129106522, "epoch": 0.7820392890551918, "grad_norm": 0.02836509235203266, "learning_rate": 0.0002, "loss": 0.5655370950698853, "mean_token_accuracy": 0.768405556678772, "num_tokens": 3411686.0, "step": 209 }, { "entropy": 0.5624722540378571, "epoch": 0.7857811038353602, "grad_norm": 0.028227761387825012, "learning_rate": 0.0002, "loss": 0.5540167689323425, "mean_token_accuracy": 0.7740924656391144, "num_tokens": 3427914.0, "step": 210 }, { "entropy": 0.555148720741272, "epoch": 0.7895229186155285, "grad_norm": 0.03054502047598362, "learning_rate": 0.0002, "loss": 0.5572685599327087, "mean_token_accuracy": 0.7746326923370361, "num_tokens": 3444170.0, "step": 211 }, { "entropy": 0.5449056923389435, "epoch": 0.7932647333956969, "grad_norm": 0.03224708139896393, "learning_rate": 0.0002, "loss": 0.5572819113731384, "mean_token_accuracy": 0.7724157273769379, "num_tokens": 3460305.0, "step": 212 }, { "entropy": 0.5533578097820282, "epoch": 0.7970065481758652, "grad_norm": 0.031917959451675415, "learning_rate": 0.0002, "loss": 0.557055652141571, "mean_token_accuracy": 0.7715483158826828, "num_tokens": 3476772.0, "step": 213 }, { "entropy": 0.5611972808837891, "epoch": 0.8007483629560337, "grad_norm": 0.031701650470495224, "learning_rate": 0.0002, "loss": 0.5658101439476013, "mean_token_accuracy": 0.7677106559276581, "num_tokens": 3493499.0, "step": 214 }, { "entropy": 0.5572656095027924, "epoch": 0.804490177736202, "grad_norm": 0.02719227597117424, "learning_rate": 0.0002, "loss": 0.5549203157424927, "mean_token_accuracy": 0.774790808558464, "num_tokens": 3509811.0, "step": 215 }, { "entropy": 0.5471508800983429, "epoch": 0.8082319925163705, "grad_norm": 0.025823380798101425, "learning_rate": 0.0002, "loss": 0.5506555438041687, "mean_token_accuracy": 0.7770570069551468, "num_tokens": 3526157.0, "step": 216 }, { "entropy": 0.5587919056415558, "epoch": 0.8119738072965388, "grad_norm": 0.027526551857590675, "learning_rate": 0.0002, "loss": 0.5553531050682068, "mean_token_accuracy": 0.7733194231987, "num_tokens": 3542353.0, "step": 217 }, { "entropy": 0.5590764433145523, "epoch": 0.8157156220767072, "grad_norm": 0.027686061337590218, "learning_rate": 0.0002, "loss": 0.553832471370697, "mean_token_accuracy": 0.7726568281650543, "num_tokens": 3558723.0, "step": 218 }, { "entropy": 0.5684271901845932, "epoch": 0.8194574368568756, "grad_norm": 0.027071600779891014, "learning_rate": 0.0002, "loss": 0.5699101686477661, "mean_token_accuracy": 0.7687496989965439, "num_tokens": 3575290.0, "step": 219 }, { "entropy": 0.5384210348129272, "epoch": 0.823199251637044, "grad_norm": 0.030755044892430305, "learning_rate": 0.0002, "loss": 0.5439192652702332, "mean_token_accuracy": 0.7772842049598694, "num_tokens": 3591563.0, "step": 220 }, { "entropy": 0.524935394525528, "epoch": 0.8269410664172123, "grad_norm": 0.02740432508289814, "learning_rate": 0.0002, "loss": 0.529310941696167, "mean_token_accuracy": 0.784336507320404, "num_tokens": 3607814.0, "step": 221 }, { "entropy": 0.5532049238681793, "epoch": 0.8306828811973808, "grad_norm": 0.034083202481269836, "learning_rate": 0.0002, "loss": 0.5611142516136169, "mean_token_accuracy": 0.7706895172595978, "num_tokens": 3624047.0, "step": 222 }, { "entropy": 0.5380610376596451, "epoch": 0.8344246959775491, "grad_norm": 0.029454410076141357, "learning_rate": 0.0002, "loss": 0.5438103675842285, "mean_token_accuracy": 0.7790344655513763, "num_tokens": 3640194.0, "step": 223 }, { "entropy": 0.5661721527576447, "epoch": 0.8381665107577175, "grad_norm": 0.029397280886769295, "learning_rate": 0.0002, "loss": 0.558972954750061, "mean_token_accuracy": 0.7724218964576721, "num_tokens": 3656608.0, "step": 224 }, { "entropy": 0.5514093935489655, "epoch": 0.8419083255378859, "grad_norm": 0.029793422669172287, "learning_rate": 0.0002, "loss": 0.550917387008667, "mean_token_accuracy": 0.7733565121889114, "num_tokens": 3672523.0, "step": 225 }, { "entropy": 0.5508118569850922, "epoch": 0.8456501403180543, "grad_norm": 0.030908716842532158, "learning_rate": 0.0002, "loss": 0.5537383556365967, "mean_token_accuracy": 0.7725334316492081, "num_tokens": 3688658.0, "step": 226 }, { "entropy": 0.5521706193685532, "epoch": 0.8493919550982226, "grad_norm": 0.03186751529574394, "learning_rate": 0.0002, "loss": 0.5577634572982788, "mean_token_accuracy": 0.7732146978378296, "num_tokens": 3704875.0, "step": 227 }, { "entropy": 0.543274000287056, "epoch": 0.8531337698783911, "grad_norm": 0.030743638053536415, "learning_rate": 0.0002, "loss": 0.5453194379806519, "mean_token_accuracy": 0.7776961177587509, "num_tokens": 3720936.0, "step": 228 }, { "entropy": 0.5507763624191284, "epoch": 0.8568755846585594, "grad_norm": 0.030140401795506477, "learning_rate": 0.0002, "loss": 0.5504044890403748, "mean_token_accuracy": 0.7767813801765442, "num_tokens": 3737279.0, "step": 229 }, { "entropy": 0.5462870597839355, "epoch": 0.8606173994387278, "grad_norm": 0.026473646983504295, "learning_rate": 0.0002, "loss": 0.5481734275817871, "mean_token_accuracy": 0.7772915065288544, "num_tokens": 3753415.0, "step": 230 }, { "entropy": 0.5563444495201111, "epoch": 0.8643592142188962, "grad_norm": 0.02921387553215027, "learning_rate": 0.0002, "loss": 0.5546942949295044, "mean_token_accuracy": 0.7731446027755737, "num_tokens": 3769803.0, "step": 231 }, { "entropy": 0.559598296880722, "epoch": 0.8681010289990645, "grad_norm": 0.03972897306084633, "learning_rate": 0.0002, "loss": 0.5572680234909058, "mean_token_accuracy": 0.773430734872818, "num_tokens": 3785892.0, "step": 232 }, { "entropy": 0.539952963590622, "epoch": 0.8718428437792329, "grad_norm": 0.028981171548366547, "learning_rate": 0.0002, "loss": 0.5390475988388062, "mean_token_accuracy": 0.7811980247497559, "num_tokens": 3802184.0, "step": 233 }, { "entropy": 0.5387761145830154, "epoch": 0.8755846585594013, "grad_norm": 0.026351595297455788, "learning_rate": 0.0002, "loss": 0.5407798290252686, "mean_token_accuracy": 0.7787132114171982, "num_tokens": 3818418.0, "step": 234 }, { "entropy": 0.5693282037973404, "epoch": 0.8793264733395697, "grad_norm": 0.033158186823129654, "learning_rate": 0.0002, "loss": 0.5714267492294312, "mean_token_accuracy": 0.7690801620483398, "num_tokens": 3834874.0, "step": 235 }, { "entropy": 0.5534514784812927, "epoch": 0.883068288119738, "grad_norm": 0.0280459001660347, "learning_rate": 0.0002, "loss": 0.5574108362197876, "mean_token_accuracy": 0.7764205187559128, "num_tokens": 3851261.0, "step": 236 }, { "entropy": 0.5554600358009338, "epoch": 0.8868101028999065, "grad_norm": 0.027284014970064163, "learning_rate": 0.0002, "loss": 0.5592954754829407, "mean_token_accuracy": 0.7728679180145264, "num_tokens": 3867826.0, "step": 237 }, { "entropy": 0.5611312091350555, "epoch": 0.8905519176800748, "grad_norm": 0.027675554156303406, "learning_rate": 0.0002, "loss": 0.5633160471916199, "mean_token_accuracy": 0.7716223746538162, "num_tokens": 3884424.0, "step": 238 }, { "entropy": 0.5698042660951614, "epoch": 0.8942937324602432, "grad_norm": 0.02734820544719696, "learning_rate": 0.0002, "loss": 0.5722016096115112, "mean_token_accuracy": 0.767684668302536, "num_tokens": 3900993.0, "step": 239 }, { "entropy": 0.5487347990274429, "epoch": 0.8980355472404116, "grad_norm": 0.030463971197605133, "learning_rate": 0.0002, "loss": 0.5459187626838684, "mean_token_accuracy": 0.7788650244474411, "num_tokens": 3917455.0, "step": 240 }, { "entropy": 0.5684353709220886, "epoch": 0.90177736202058, "grad_norm": 0.028492476791143417, "learning_rate": 0.0002, "loss": 0.5674321055412292, "mean_token_accuracy": 0.7663144171237946, "num_tokens": 3934049.0, "step": 241 }, { "entropy": 0.5689758509397507, "epoch": 0.9055191768007483, "grad_norm": 0.02926958166062832, "learning_rate": 0.0002, "loss": 0.5745148658752441, "mean_token_accuracy": 0.7678453773260117, "num_tokens": 3950533.0, "step": 242 }, { "entropy": 0.549301877617836, "epoch": 0.9092609915809168, "grad_norm": 0.03295575827360153, "learning_rate": 0.0002, "loss": 0.5597534775733948, "mean_token_accuracy": 0.7714426666498184, "num_tokens": 3966986.0, "step": 243 }, { "entropy": 0.5338816940784454, "epoch": 0.9130028063610851, "grad_norm": 0.030206363648176193, "learning_rate": 0.0002, "loss": 0.5326100587844849, "mean_token_accuracy": 0.7836355268955231, "num_tokens": 3983434.0, "step": 244 }, { "entropy": 0.5674562901258469, "epoch": 0.9167446211412535, "grad_norm": 0.026608271524310112, "learning_rate": 0.0002, "loss": 0.5644797682762146, "mean_token_accuracy": 0.7716486304998398, "num_tokens": 3999756.0, "step": 245 }, { "entropy": 0.5831885486841202, "epoch": 0.9204864359214219, "grad_norm": 0.03711472824215889, "learning_rate": 0.0002, "loss": 0.5693003535270691, "mean_token_accuracy": 0.7677270174026489, "num_tokens": 4016084.0, "step": 246 }, { "entropy": 0.5590741783380508, "epoch": 0.9242282507015903, "grad_norm": 0.027594709768891335, "learning_rate": 0.0002, "loss": 0.5590558052062988, "mean_token_accuracy": 0.7732381373643875, "num_tokens": 4032464.0, "step": 247 }, { "entropy": 0.5414686352014542, "epoch": 0.9279700654817586, "grad_norm": 0.037102047353982925, "learning_rate": 0.0002, "loss": 0.5545523762702942, "mean_token_accuracy": 0.775322362780571, "num_tokens": 4048853.0, "step": 248 }, { "entropy": 0.5506337434053421, "epoch": 0.9317118802619271, "grad_norm": 0.03612777963280678, "learning_rate": 0.0002, "loss": 0.5673890709877014, "mean_token_accuracy": 0.7688823938369751, "num_tokens": 4065031.0, "step": 249 }, { "entropy": 0.542187824845314, "epoch": 0.9354536950420954, "grad_norm": 0.031235933303833008, "learning_rate": 0.0002, "loss": 0.5464475750923157, "mean_token_accuracy": 0.7789596170186996, "num_tokens": 4081635.0, "step": 250 }, { "entropy": 0.5568290203809738, "epoch": 0.9391955098222639, "grad_norm": 0.027413224801421165, "learning_rate": 0.0002, "loss": 0.5562602877616882, "mean_token_accuracy": 0.7737423926591873, "num_tokens": 4098011.0, "step": 251 }, { "entropy": 0.558889165520668, "epoch": 0.9429373246024322, "grad_norm": 0.029295574873685837, "learning_rate": 0.0002, "loss": 0.5547473430633545, "mean_token_accuracy": 0.7740904539823532, "num_tokens": 4114268.0, "step": 252 }, { "entropy": 0.5764719247817993, "epoch": 0.9466791393826005, "grad_norm": 0.03225071728229523, "learning_rate": 0.0002, "loss": 0.5729030966758728, "mean_token_accuracy": 0.7659229934215546, "num_tokens": 4130552.0, "step": 253 }, { "entropy": 0.5606585443019867, "epoch": 0.950420954162769, "grad_norm": 0.02834608033299446, "learning_rate": 0.0002, "loss": 0.5623061656951904, "mean_token_accuracy": 0.7708321511745453, "num_tokens": 4146844.0, "step": 254 }, { "entropy": 0.5444774627685547, "epoch": 0.9541627689429373, "grad_norm": 0.03255439177155495, "learning_rate": 0.0002, "loss": 0.5524637699127197, "mean_token_accuracy": 0.7744161784648895, "num_tokens": 4163084.0, "step": 255 }, { "entropy": 0.5229519456624985, "epoch": 0.9579045837231057, "grad_norm": 0.027845216915011406, "learning_rate": 0.0002, "loss": 0.5284432768821716, "mean_token_accuracy": 0.785067692399025, "num_tokens": 4179192.0, "step": 256 }, { "entropy": 0.5287301391363144, "epoch": 0.961646398503274, "grad_norm": 0.03511723130941391, "learning_rate": 0.0002, "loss": 0.5364463329315186, "mean_token_accuracy": 0.7782928943634033, "num_tokens": 4195604.0, "step": 257 }, { "entropy": 0.5621770173311234, "epoch": 0.9653882132834425, "grad_norm": 0.02962673269212246, "learning_rate": 0.0002, "loss": 0.5591749548912048, "mean_token_accuracy": 0.7710652500391006, "num_tokens": 4211743.0, "step": 258 }, { "entropy": 0.5636511147022247, "epoch": 0.9691300280636108, "grad_norm": 0.04087170958518982, "learning_rate": 0.0002, "loss": 0.5626160502433777, "mean_token_accuracy": 0.771452471613884, "num_tokens": 4228198.0, "step": 259 }, { "entropy": 0.5522175580263138, "epoch": 0.9728718428437793, "grad_norm": 0.029492903500795364, "learning_rate": 0.0002, "loss": 0.5516583323478699, "mean_token_accuracy": 0.7742890268564224, "num_tokens": 4244501.0, "step": 260 }, { "entropy": 0.5577979236841202, "epoch": 0.9766136576239476, "grad_norm": 0.02768765017390251, "learning_rate": 0.0002, "loss": 0.5573770403862, "mean_token_accuracy": 0.7728449106216431, "num_tokens": 4260800.0, "step": 261 }, { "entropy": 0.5833724588155746, "epoch": 0.980355472404116, "grad_norm": 0.030149318277835846, "learning_rate": 0.0002, "loss": 0.5790048837661743, "mean_token_accuracy": 0.7645868510007858, "num_tokens": 4277242.0, "step": 262 }, { "entropy": 0.5686817467212677, "epoch": 0.9840972871842844, "grad_norm": 0.03200973942875862, "learning_rate": 0.0002, "loss": 0.5704789161682129, "mean_token_accuracy": 0.7688680738210678, "num_tokens": 4293490.0, "step": 263 }, { "entropy": 0.5522599965333939, "epoch": 0.9878391019644528, "grad_norm": 0.02735111489892006, "learning_rate": 0.0002, "loss": 0.5483981370925903, "mean_token_accuracy": 0.7776431888341904, "num_tokens": 4309713.0, "step": 264 }, { "entropy": 0.5510786324739456, "epoch": 0.9915809167446211, "grad_norm": 0.027222398668527603, "learning_rate": 0.0002, "loss": 0.5519858598709106, "mean_token_accuracy": 0.7740090191364288, "num_tokens": 4325978.0, "step": 265 }, { "entropy": 0.5590775907039642, "epoch": 0.9953227315247896, "grad_norm": 0.030459199100732803, "learning_rate": 0.0002, "loss": 0.5638831853866577, "mean_token_accuracy": 0.7691285163164139, "num_tokens": 4342145.0, "step": 266 }, { "entropy": 0.5396278500556946, "epoch": 0.9990645463049579, "grad_norm": 0.029775220900774002, "learning_rate": 0.0002, "loss": 0.551082968711853, "mean_token_accuracy": 0.777344822883606, "num_tokens": 4358366.0, "step": 267 }, { "entropy": 0.5386617183685303, "epoch": 1.0, "grad_norm": 0.05107063427567482, "learning_rate": 0.0002, "loss": 0.56319260597229, "mean_token_accuracy": 0.7758007049560547, "num_tokens": 4359498.0, "step": 268 }, { "entropy": 0.5456036031246185, "epoch": 1.0037418147801684, "grad_norm": 0.034975565969944, "learning_rate": 0.0002, "loss": 0.5444031953811646, "mean_token_accuracy": 0.7782553881406784, "num_tokens": 4375874.0, "step": 269 }, { "entropy": 0.554328516125679, "epoch": 1.0074836295603367, "grad_norm": 0.030762778595089912, "learning_rate": 0.0002, "loss": 0.5493590235710144, "mean_token_accuracy": 0.7769091576337814, "num_tokens": 4392309.0, "step": 270 }, { "entropy": 0.544586181640625, "epoch": 1.011225444340505, "grad_norm": 0.027982227504253387, "learning_rate": 0.0002, "loss": 0.5366782546043396, "mean_token_accuracy": 0.7823053598403931, "num_tokens": 4408365.0, "step": 271 }, { "entropy": 0.5558233559131622, "epoch": 1.0149672591206735, "grad_norm": 0.029144754633307457, "learning_rate": 0.0002, "loss": 0.5538930296897888, "mean_token_accuracy": 0.7747932523488998, "num_tokens": 4424690.0, "step": 272 }, { "entropy": 0.5521434098482132, "epoch": 1.018709073900842, "grad_norm": 0.031630512326955795, "learning_rate": 0.0002, "loss": 0.5583912134170532, "mean_token_accuracy": 0.773905873298645, "num_tokens": 4441085.0, "step": 273 }, { "entropy": 0.5409824252128601, "epoch": 1.0224508886810102, "grad_norm": 0.03298581764101982, "learning_rate": 0.0002, "loss": 0.5436674356460571, "mean_token_accuracy": 0.7784581035375595, "num_tokens": 4457337.0, "step": 274 }, { "entropy": 0.5269698351621628, "epoch": 1.0261927034611786, "grad_norm": 0.03633208945393562, "learning_rate": 0.0002, "loss": 0.530029833316803, "mean_token_accuracy": 0.786719799041748, "num_tokens": 4473532.0, "step": 275 }, { "entropy": 0.572344645857811, "epoch": 1.029934518241347, "grad_norm": 0.03007793240249157, "learning_rate": 0.0002, "loss": 0.5664374828338623, "mean_token_accuracy": 0.768335297703743, "num_tokens": 4489887.0, "step": 276 }, { "entropy": 0.5445250272750854, "epoch": 1.0336763330215155, "grad_norm": 0.027243314310908318, "learning_rate": 0.0002, "loss": 0.5401641726493835, "mean_token_accuracy": 0.7808064818382263, "num_tokens": 4505862.0, "step": 277 }, { "entropy": 0.5509742796421051, "epoch": 1.0374181478016837, "grad_norm": 0.032545655965805054, "learning_rate": 0.0002, "loss": 0.5521466732025146, "mean_token_accuracy": 0.7762803286314011, "num_tokens": 4522135.0, "step": 278 }, { "entropy": 0.5502415001392365, "epoch": 1.0411599625818522, "grad_norm": 0.030756743624806404, "learning_rate": 0.0002, "loss": 0.5506622195243835, "mean_token_accuracy": 0.7758103907108307, "num_tokens": 4538594.0, "step": 279 }, { "entropy": 0.5414353311061859, "epoch": 1.0449017773620206, "grad_norm": 0.030841531231999397, "learning_rate": 0.0002, "loss": 0.5470583438873291, "mean_token_accuracy": 0.7776292413473129, "num_tokens": 4555119.0, "step": 280 }, { "entropy": 0.5487425029277802, "epoch": 1.048643592142189, "grad_norm": 0.03335481509566307, "learning_rate": 0.0002, "loss": 0.5511153936386108, "mean_token_accuracy": 0.7753961086273193, "num_tokens": 4571676.0, "step": 281 }, { "entropy": 0.5364932715892792, "epoch": 1.0523854069223573, "grad_norm": 0.03433723747730255, "learning_rate": 0.0002, "loss": 0.5388063788414001, "mean_token_accuracy": 0.7791535705327988, "num_tokens": 4587803.0, "step": 282 }, { "entropy": 0.5218682438135147, "epoch": 1.0561272217025257, "grad_norm": 0.03049764409661293, "learning_rate": 0.0002, "loss": 0.5254226922988892, "mean_token_accuracy": 0.7847179919481277, "num_tokens": 4603856.0, "step": 283 }, { "entropy": 0.5384526699781418, "epoch": 1.0598690364826941, "grad_norm": 0.02954094670712948, "learning_rate": 0.0002, "loss": 0.5442904829978943, "mean_token_accuracy": 0.7810987532138824, "num_tokens": 4619957.0, "step": 284 }, { "entropy": 0.5648271888494492, "epoch": 1.0636108512628626, "grad_norm": 0.029273223131895065, "learning_rate": 0.0002, "loss": 0.565851628780365, "mean_token_accuracy": 0.7694031447172165, "num_tokens": 4636366.0, "step": 285 }, { "entropy": 0.5445346832275391, "epoch": 1.0673526660430308, "grad_norm": 0.04154031351208687, "learning_rate": 0.0002, "loss": 0.5437869429588318, "mean_token_accuracy": 0.7786456942558289, "num_tokens": 4652409.0, "step": 286 }, { "entropy": 0.5666444450616837, "epoch": 1.0710944808231992, "grad_norm": 0.027274858206510544, "learning_rate": 0.0002, "loss": 0.5619191527366638, "mean_token_accuracy": 0.7713726609945297, "num_tokens": 4668805.0, "step": 287 }, { "entropy": 0.5560373812913895, "epoch": 1.0748362956033677, "grad_norm": 0.03042946569621563, "learning_rate": 0.0002, "loss": 0.5536933541297913, "mean_token_accuracy": 0.7707109302282333, "num_tokens": 4685281.0, "step": 288 }, { "entropy": 0.5522497296333313, "epoch": 1.078578110383536, "grad_norm": 0.026407577097415924, "learning_rate": 0.0002, "loss": 0.554541826248169, "mean_token_accuracy": 0.7723578214645386, "num_tokens": 4701429.0, "step": 289 }, { "entropy": 0.5493666082620621, "epoch": 1.0823199251637043, "grad_norm": 0.03922448307275772, "learning_rate": 0.0002, "loss": 0.5535799860954285, "mean_token_accuracy": 0.7752141654491425, "num_tokens": 4717787.0, "step": 290 }, { "entropy": 0.5579231083393097, "epoch": 1.0860617399438728, "grad_norm": 0.029233764857053757, "learning_rate": 0.0002, "loss": 0.5569900274276733, "mean_token_accuracy": 0.7733462601900101, "num_tokens": 4734144.0, "step": 291 }, { "entropy": 0.544972226023674, "epoch": 1.0898035547240412, "grad_norm": 0.030961396172642708, "learning_rate": 0.0002, "loss": 0.5413874983787537, "mean_token_accuracy": 0.7801695913076401, "num_tokens": 4750509.0, "step": 292 }, { "entropy": 0.550209566950798, "epoch": 1.0935453695042094, "grad_norm": 0.03252837061882019, "learning_rate": 0.0002, "loss": 0.5514767169952393, "mean_token_accuracy": 0.7740490287542343, "num_tokens": 4766708.0, "step": 293 }, { "entropy": 0.545928418636322, "epoch": 1.0972871842843779, "grad_norm": 0.02844078466296196, "learning_rate": 0.0002, "loss": 0.5454370975494385, "mean_token_accuracy": 0.7802854478359222, "num_tokens": 4783110.0, "step": 294 }, { "entropy": 0.550410658121109, "epoch": 1.1010289990645463, "grad_norm": 0.0395023413002491, "learning_rate": 0.0002, "loss": 0.5610683560371399, "mean_token_accuracy": 0.7725012004375458, "num_tokens": 4799492.0, "step": 295 }, { "entropy": 0.5291745737195015, "epoch": 1.1047708138447148, "grad_norm": 0.028669750317931175, "learning_rate": 0.0002, "loss": 0.5332962274551392, "mean_token_accuracy": 0.7820043116807938, "num_tokens": 4815864.0, "step": 296 }, { "entropy": 0.5454689562320709, "epoch": 1.108512628624883, "grad_norm": 0.02827887050807476, "learning_rate": 0.0002, "loss": 0.5511517524719238, "mean_token_accuracy": 0.7747574001550674, "num_tokens": 4832267.0, "step": 297 }, { "entropy": 0.5417342334985733, "epoch": 1.1122544434050514, "grad_norm": 0.026385854929685593, "learning_rate": 0.0002, "loss": 0.5412203669548035, "mean_token_accuracy": 0.780335083603859, "num_tokens": 4848653.0, "step": 298 }, { "entropy": 0.5629215389490128, "epoch": 1.1159962581852199, "grad_norm": 0.030779633671045303, "learning_rate": 0.0002, "loss": 0.5625781416893005, "mean_token_accuracy": 0.7703746110200882, "num_tokens": 4865192.0, "step": 299 }, { "entropy": 0.5278398767113686, "epoch": 1.1197380729653883, "grad_norm": 0.02865917608141899, "learning_rate": 0.0002, "loss": 0.5246303081512451, "mean_token_accuracy": 0.7881903648376465, "num_tokens": 4881315.0, "step": 300 }, { "entropy": 0.5360843688249588, "epoch": 1.1234798877455565, "grad_norm": 0.02863423153758049, "learning_rate": 0.0002, "loss": 0.5405621528625488, "mean_token_accuracy": 0.7765359878540039, "num_tokens": 4897572.0, "step": 301 }, { "entropy": 0.5270702391862869, "epoch": 1.127221702525725, "grad_norm": 0.027807647362351418, "learning_rate": 0.0002, "loss": 0.5271122455596924, "mean_token_accuracy": 0.7830122262239456, "num_tokens": 4913718.0, "step": 302 }, { "entropy": 0.5291232466697693, "epoch": 1.1309635173058934, "grad_norm": 0.03156433254480362, "learning_rate": 0.0002, "loss": 0.5328850746154785, "mean_token_accuracy": 0.7853387147188187, "num_tokens": 4930253.0, "step": 303 }, { "entropy": 0.5468447655439377, "epoch": 1.1347053320860618, "grad_norm": 0.033552881330251694, "learning_rate": 0.0002, "loss": 0.5545834898948669, "mean_token_accuracy": 0.7716294378042221, "num_tokens": 4946382.0, "step": 304 }, { "entropy": 0.5517953187227249, "epoch": 1.13844714686623, "grad_norm": 0.030561944469809532, "learning_rate": 0.0002, "loss": 0.5540879964828491, "mean_token_accuracy": 0.7759448438882828, "num_tokens": 4962652.0, "step": 305 }, { "entropy": 0.544833779335022, "epoch": 1.1421889616463985, "grad_norm": 0.030571507290005684, "learning_rate": 0.0002, "loss": 0.5443115234375, "mean_token_accuracy": 0.7782190293073654, "num_tokens": 4978959.0, "step": 306 }, { "entropy": 0.5475269705057144, "epoch": 1.145930776426567, "grad_norm": 0.0296931229531765, "learning_rate": 0.0002, "loss": 0.541431188583374, "mean_token_accuracy": 0.7753712236881256, "num_tokens": 4995357.0, "step": 307 }, { "entropy": 0.5446912348270416, "epoch": 1.1496725912067354, "grad_norm": 0.025116927921772003, "learning_rate": 0.0002, "loss": 0.5437968373298645, "mean_token_accuracy": 0.7787619084119797, "num_tokens": 5011590.0, "step": 308 }, { "entropy": 0.5292570069432259, "epoch": 1.1534144059869036, "grad_norm": 0.027315491810441017, "learning_rate": 0.0002, "loss": 0.5277875065803528, "mean_token_accuracy": 0.7833113670349121, "num_tokens": 5027873.0, "step": 309 }, { "entropy": 0.5242628306150436, "epoch": 1.157156220767072, "grad_norm": 0.027830073609948158, "learning_rate": 0.0002, "loss": 0.523070752620697, "mean_token_accuracy": 0.7879849672317505, "num_tokens": 5044361.0, "step": 310 }, { "entropy": 0.536102682352066, "epoch": 1.1608980355472405, "grad_norm": 0.031033379957079887, "learning_rate": 0.0002, "loss": 0.5378351211547852, "mean_token_accuracy": 0.7815344035625458, "num_tokens": 5060644.0, "step": 311 }, { "entropy": 0.5573316812515259, "epoch": 1.1646398503274087, "grad_norm": 0.03297853097319603, "learning_rate": 0.0002, "loss": 0.5643618106842041, "mean_token_accuracy": 0.7715043723583221, "num_tokens": 5077003.0, "step": 312 }, { "entropy": 0.526486948132515, "epoch": 1.1683816651075771, "grad_norm": 0.029532574117183685, "learning_rate": 0.0002, "loss": 0.5367429256439209, "mean_token_accuracy": 0.7818453460931778, "num_tokens": 5093120.0, "step": 313 }, { "entropy": 0.545007973909378, "epoch": 1.1721234798877456, "grad_norm": 0.0302292387932539, "learning_rate": 0.0002, "loss": 0.5474991798400879, "mean_token_accuracy": 0.7770297825336456, "num_tokens": 5109333.0, "step": 314 }, { "entropy": 0.5457079261541367, "epoch": 1.175865294667914, "grad_norm": 0.03628959506750107, "learning_rate": 0.0002, "loss": 0.5456429719924927, "mean_token_accuracy": 0.779505044221878, "num_tokens": 5125459.0, "step": 315 }, { "entropy": 0.5526050478219986, "epoch": 1.1796071094480822, "grad_norm": 0.031634826213121414, "learning_rate": 0.0002, "loss": 0.5504459738731384, "mean_token_accuracy": 0.7756629437208176, "num_tokens": 5141755.0, "step": 316 }, { "entropy": 0.5621381402015686, "epoch": 1.1833489242282507, "grad_norm": 0.02932395227253437, "learning_rate": 0.0002, "loss": 0.5631870627403259, "mean_token_accuracy": 0.767949178814888, "num_tokens": 5158305.0, "step": 317 }, { "entropy": 0.5412058234214783, "epoch": 1.187090739008419, "grad_norm": 0.03077547252178192, "learning_rate": 0.0002, "loss": 0.5441724061965942, "mean_token_accuracy": 0.7769438326358795, "num_tokens": 5174825.0, "step": 318 }, { "entropy": 0.5375640690326691, "epoch": 1.1908325537885875, "grad_norm": 0.0300463754683733, "learning_rate": 0.0002, "loss": 0.5393084287643433, "mean_token_accuracy": 0.782392755150795, "num_tokens": 5190829.0, "step": 319 }, { "entropy": 0.5544911473989487, "epoch": 1.1945743685687558, "grad_norm": 0.03089406155049801, "learning_rate": 0.0002, "loss": 0.5512977838516235, "mean_token_accuracy": 0.7745725959539413, "num_tokens": 5207283.0, "step": 320 }, { "entropy": 0.5496610552072525, "epoch": 1.1983161833489242, "grad_norm": 0.03022005409002304, "learning_rate": 0.0002, "loss": 0.5407426357269287, "mean_token_accuracy": 0.7819069474935532, "num_tokens": 5223759.0, "step": 321 }, { "entropy": 0.5536633729934692, "epoch": 1.2020579981290926, "grad_norm": 0.03297387808561325, "learning_rate": 0.0002, "loss": 0.5543879866600037, "mean_token_accuracy": 0.7727649062871933, "num_tokens": 5240096.0, "step": 322 }, { "entropy": 0.5441806763410568, "epoch": 1.205799812909261, "grad_norm": 0.029116200283169746, "learning_rate": 0.0002, "loss": 0.5444720387458801, "mean_token_accuracy": 0.7814431339502335, "num_tokens": 5256670.0, "step": 323 }, { "entropy": 0.5429923981428146, "epoch": 1.2095416276894293, "grad_norm": 0.03505397588014603, "learning_rate": 0.0002, "loss": 0.5506747961044312, "mean_token_accuracy": 0.7763912379741669, "num_tokens": 5272766.0, "step": 324 }, { "entropy": 0.5270697474479675, "epoch": 1.2132834424695977, "grad_norm": 0.039405617862939835, "learning_rate": 0.0002, "loss": 0.5409681797027588, "mean_token_accuracy": 0.7786189615726471, "num_tokens": 5289123.0, "step": 325 }, { "entropy": 0.558641791343689, "epoch": 1.2170252572497662, "grad_norm": 0.029413288459181786, "learning_rate": 0.0002, "loss": 0.5564137697219849, "mean_token_accuracy": 0.7740890085697174, "num_tokens": 5305503.0, "step": 326 }, { "entropy": 0.5550449192523956, "epoch": 1.2207670720299344, "grad_norm": 0.031028373166918755, "learning_rate": 0.0002, "loss": 0.5544853210449219, "mean_token_accuracy": 0.7716324329376221, "num_tokens": 5321885.0, "step": 327 }, { "entropy": 0.5564998090267181, "epoch": 1.2245088868101028, "grad_norm": 0.034970104694366455, "learning_rate": 0.0002, "loss": 0.5547239184379578, "mean_token_accuracy": 0.7719462513923645, "num_tokens": 5338376.0, "step": 328 }, { "entropy": 0.5593426823616028, "epoch": 1.2282507015902713, "grad_norm": 0.030654314905405045, "learning_rate": 0.0002, "loss": 0.5594889521598816, "mean_token_accuracy": 0.7690505534410477, "num_tokens": 5354745.0, "step": 329 }, { "entropy": 0.5594028532505035, "epoch": 1.2319925163704397, "grad_norm": 0.02985675260424614, "learning_rate": 0.0002, "loss": 0.560926079750061, "mean_token_accuracy": 0.771067887544632, "num_tokens": 5371364.0, "step": 330 }, { "entropy": 0.5444284975528717, "epoch": 1.2357343311506082, "grad_norm": 0.0331130288541317, "learning_rate": 0.0002, "loss": 0.5528807044029236, "mean_token_accuracy": 0.7744182050228119, "num_tokens": 5387884.0, "step": 331 }, { "entropy": 0.5535553693771362, "epoch": 1.2394761459307764, "grad_norm": 0.035860270261764526, "learning_rate": 0.0002, "loss": 0.5612154603004456, "mean_token_accuracy": 0.7728609591722488, "num_tokens": 5404143.0, "step": 332 }, { "entropy": 0.5594320446252823, "epoch": 1.2432179607109448, "grad_norm": 0.030857175588607788, "learning_rate": 0.0002, "loss": 0.5495461225509644, "mean_token_accuracy": 0.7783895283937454, "num_tokens": 5420613.0, "step": 333 }, { "entropy": 0.5738644152879715, "epoch": 1.2469597754911133, "grad_norm": 0.02752659097313881, "learning_rate": 0.0002, "loss": 0.5670571327209473, "mean_token_accuracy": 0.7706948518753052, "num_tokens": 5437025.0, "step": 334 }, { "entropy": 0.5468066483736038, "epoch": 1.2507015902712815, "grad_norm": 0.030105959624052048, "learning_rate": 0.0002, "loss": 0.5448632836341858, "mean_token_accuracy": 0.7777069211006165, "num_tokens": 5453431.0, "step": 335 }, { "entropy": 0.5508809983730316, "epoch": 1.25444340505145, "grad_norm": 0.031137077137827873, "learning_rate": 0.0002, "loss": 0.5581130981445312, "mean_token_accuracy": 0.7730289697647095, "num_tokens": 5469727.0, "step": 336 }, { "entropy": 0.5199557095766068, "epoch": 1.2581852198316184, "grad_norm": 0.033218562602996826, "learning_rate": 0.0002, "loss": 0.5353677272796631, "mean_token_accuracy": 0.7836348563432693, "num_tokens": 5485615.0, "step": 337 }, { "entropy": 0.5402327626943588, "epoch": 1.2619270346117868, "grad_norm": 0.02909061312675476, "learning_rate": 0.0002, "loss": 0.5445257425308228, "mean_token_accuracy": 0.7775768637657166, "num_tokens": 5501846.0, "step": 338 }, { "entropy": 0.5657909214496613, "epoch": 1.2656688493919552, "grad_norm": 0.03052118793129921, "learning_rate": 0.0002, "loss": 0.5672930479049683, "mean_token_accuracy": 0.7675611525774002, "num_tokens": 5518365.0, "step": 339 }, { "entropy": 0.5483649671077728, "epoch": 1.2694106641721234, "grad_norm": 0.02786743827164173, "learning_rate": 0.0002, "loss": 0.5456503033638, "mean_token_accuracy": 0.7791422605514526, "num_tokens": 5534639.0, "step": 340 }, { "entropy": 0.5500437468290329, "epoch": 1.2731524789522919, "grad_norm": 0.03155668452382088, "learning_rate": 0.0002, "loss": 0.545000433921814, "mean_token_accuracy": 0.7803118973970413, "num_tokens": 5551093.0, "step": 341 }, { "entropy": 0.5697951167821884, "epoch": 1.27689429373246, "grad_norm": 0.03075268305838108, "learning_rate": 0.0002, "loss": 0.5609626173973083, "mean_token_accuracy": 0.7723665684461594, "num_tokens": 5567707.0, "step": 342 }, { "entropy": 0.544351652264595, "epoch": 1.2806361085126285, "grad_norm": 0.03238390013575554, "learning_rate": 0.0002, "loss": 0.5533734560012817, "mean_token_accuracy": 0.7754608392715454, "num_tokens": 5584155.0, "step": 343 }, { "entropy": 0.5441059172153473, "epoch": 1.284377923292797, "grad_norm": 0.02793728932738304, "learning_rate": 0.0002, "loss": 0.5470475554466248, "mean_token_accuracy": 0.7781476378440857, "num_tokens": 5600585.0, "step": 344 }, { "entropy": 0.5576403886079788, "epoch": 1.2881197380729654, "grad_norm": 0.0332297645509243, "learning_rate": 0.0002, "loss": 0.5591012835502625, "mean_token_accuracy": 0.7717157751321793, "num_tokens": 5616865.0, "step": 345 }, { "entropy": 0.5582529455423355, "epoch": 1.2918615528531339, "grad_norm": 0.028861626982688904, "learning_rate": 0.0002, "loss": 0.5597870349884033, "mean_token_accuracy": 0.7722600847482681, "num_tokens": 5633131.0, "step": 346 }, { "entropy": 0.5537585616111755, "epoch": 1.295603367633302, "grad_norm": 0.027739623561501503, "learning_rate": 0.0002, "loss": 0.5517114996910095, "mean_token_accuracy": 0.7751765549182892, "num_tokens": 5649621.0, "step": 347 }, { "entropy": 0.5722759366035461, "epoch": 1.2993451824134705, "grad_norm": 0.029868733137845993, "learning_rate": 0.0002, "loss": 0.5697493553161621, "mean_token_accuracy": 0.769178032875061, "num_tokens": 5666058.0, "step": 348 }, { "entropy": 0.5482298284769058, "epoch": 1.303086997193639, "grad_norm": 0.02905650995671749, "learning_rate": 0.0002, "loss": 0.5505189895629883, "mean_token_accuracy": 0.7772009968757629, "num_tokens": 5682272.0, "step": 349 }, { "entropy": 0.5623439997434616, "epoch": 1.3068288119738072, "grad_norm": 0.028680406510829926, "learning_rate": 0.0002, "loss": 0.5615631937980652, "mean_token_accuracy": 0.7712025493383408, "num_tokens": 5698796.0, "step": 350 }, { "entropy": 0.5541074424982071, "epoch": 1.3105706267539756, "grad_norm": 0.03431180492043495, "learning_rate": 0.0002, "loss": 0.5617666244506836, "mean_token_accuracy": 0.7705400139093399, "num_tokens": 5714994.0, "step": 351 }, { "entropy": 0.5405305176973343, "epoch": 1.314312441534144, "grad_norm": 0.03283194825053215, "learning_rate": 0.0002, "loss": 0.538750946521759, "mean_token_accuracy": 0.7778624445199966, "num_tokens": 5731263.0, "step": 352 }, { "entropy": 0.5537361800670624, "epoch": 1.3180542563143125, "grad_norm": 0.03157467022538185, "learning_rate": 0.0002, "loss": 0.556831955909729, "mean_token_accuracy": 0.7720046639442444, "num_tokens": 5747576.0, "step": 353 }, { "entropy": 0.5540541112422943, "epoch": 1.321796071094481, "grad_norm": 0.03315872326493263, "learning_rate": 0.0002, "loss": 0.5560564398765564, "mean_token_accuracy": 0.7747179567813873, "num_tokens": 5763875.0, "step": 354 }, { "entropy": 0.5485205948352814, "epoch": 1.3255378858746492, "grad_norm": 0.029158933088183403, "learning_rate": 0.0002, "loss": 0.5474769473075867, "mean_token_accuracy": 0.7769359052181244, "num_tokens": 5780494.0, "step": 355 }, { "entropy": 0.5560560077428818, "epoch": 1.3292797006548176, "grad_norm": 0.03023948147892952, "learning_rate": 0.0002, "loss": 0.5578330159187317, "mean_token_accuracy": 0.7706339210271835, "num_tokens": 5796776.0, "step": 356 }, { "entropy": 0.5549474805593491, "epoch": 1.333021515434986, "grad_norm": 0.03123750351369381, "learning_rate": 0.0002, "loss": 0.5531733632087708, "mean_token_accuracy": 0.7738355994224548, "num_tokens": 5813225.0, "step": 357 }, { "entropy": 0.5446926355361938, "epoch": 1.3367633302151543, "grad_norm": 0.03854469954967499, "learning_rate": 0.0002, "loss": 0.5561398863792419, "mean_token_accuracy": 0.7719077616930008, "num_tokens": 5829411.0, "step": 358 }, { "entropy": 0.5601906925439835, "epoch": 1.3405051449953227, "grad_norm": 0.025615639984607697, "learning_rate": 0.0002, "loss": 0.5579116940498352, "mean_token_accuracy": 0.7725162506103516, "num_tokens": 5845753.0, "step": 359 }, { "entropy": 0.557614728808403, "epoch": 1.3442469597754911, "grad_norm": 0.026924598962068558, "learning_rate": 0.0002, "loss": 0.5500644445419312, "mean_token_accuracy": 0.7740714848041534, "num_tokens": 5861927.0, "step": 360 }, { "entropy": 0.5535576045513153, "epoch": 1.3479887745556596, "grad_norm": 0.031272657215595245, "learning_rate": 0.0002, "loss": 0.5418438911437988, "mean_token_accuracy": 0.780152902007103, "num_tokens": 5878289.0, "step": 361 }, { "entropy": 0.5407048761844635, "epoch": 1.351730589335828, "grad_norm": 0.031007423996925354, "learning_rate": 0.0002, "loss": 0.5493313670158386, "mean_token_accuracy": 0.7764623165130615, "num_tokens": 5894592.0, "step": 362 }, { "entropy": 0.5239751785993576, "epoch": 1.3554724041159962, "grad_norm": 0.03374086320400238, "learning_rate": 0.0002, "loss": 0.5344395041465759, "mean_token_accuracy": 0.7812817394733429, "num_tokens": 5910863.0, "step": 363 }, { "entropy": 0.5377437621355057, "epoch": 1.3592142188961647, "grad_norm": 0.04066803306341171, "learning_rate": 0.0002, "loss": 0.5502558946609497, "mean_token_accuracy": 0.7735230922698975, "num_tokens": 5927169.0, "step": 364 }, { "entropy": 0.5404135584831238, "epoch": 1.362956033676333, "grad_norm": 0.030103564262390137, "learning_rate": 0.0002, "loss": 0.5431765913963318, "mean_token_accuracy": 0.780334860086441, "num_tokens": 5943288.0, "step": 365 }, { "entropy": 0.5349705293774605, "epoch": 1.3666978484565013, "grad_norm": 0.031804051250219345, "learning_rate": 0.0002, "loss": 0.5298077464103699, "mean_token_accuracy": 0.7834766954183578, "num_tokens": 5959662.0, "step": 366 }, { "entropy": 0.5429814159870148, "epoch": 1.3704396632366698, "grad_norm": 0.04628051444888115, "learning_rate": 0.0002, "loss": 0.5361793041229248, "mean_token_accuracy": 0.7793655544519424, "num_tokens": 5976139.0, "step": 367 }, { "entropy": 0.5505317896604538, "epoch": 1.3741814780168382, "grad_norm": 0.03267182409763336, "learning_rate": 0.0002, "loss": 0.5444616675376892, "mean_token_accuracy": 0.7798040062189102, "num_tokens": 5992476.0, "step": 368 }, { "entropy": 0.5407690107822418, "epoch": 1.3779232927970066, "grad_norm": 0.0353633388876915, "learning_rate": 0.0002, "loss": 0.5501353740692139, "mean_token_accuracy": 0.7760691344738007, "num_tokens": 6008641.0, "step": 369 }, { "entropy": 0.5465443283319473, "epoch": 1.3816651075771749, "grad_norm": 0.044324446469545364, "learning_rate": 0.0002, "loss": 0.5564755201339722, "mean_token_accuracy": 0.775538980960846, "num_tokens": 6024769.0, "step": 370 }, { "entropy": 0.5609740614891052, "epoch": 1.3854069223573433, "grad_norm": 0.03593122959136963, "learning_rate": 0.0002, "loss": 0.5629419088363647, "mean_token_accuracy": 0.7691068351268768, "num_tokens": 6041060.0, "step": 371 }, { "entropy": 0.5421721637248993, "epoch": 1.3891487371375117, "grad_norm": 0.03346877172589302, "learning_rate": 0.0002, "loss": 0.5368991494178772, "mean_token_accuracy": 0.7809954136610031, "num_tokens": 6057328.0, "step": 372 }, { "entropy": 0.5421962440013885, "epoch": 1.39289055191768, "grad_norm": 0.036160413175821304, "learning_rate": 0.0002, "loss": 0.5371009111404419, "mean_token_accuracy": 0.7804526090621948, "num_tokens": 6073633.0, "step": 373 }, { "entropy": 0.5545593798160553, "epoch": 1.3966323666978484, "grad_norm": 0.03285996615886688, "learning_rate": 0.0002, "loss": 0.5528316497802734, "mean_token_accuracy": 0.7778345346450806, "num_tokens": 6090142.0, "step": 374 }, { "entropy": 0.5461311042308807, "epoch": 1.4003741814780168, "grad_norm": 0.03481744974851608, "learning_rate": 0.0002, "loss": 0.5470185279846191, "mean_token_accuracy": 0.7769876271486282, "num_tokens": 6106491.0, "step": 375 }, { "entropy": 0.5363553166389465, "epoch": 1.4041159962581853, "grad_norm": 0.029494671151041985, "learning_rate": 0.0002, "loss": 0.5371567010879517, "mean_token_accuracy": 0.78060382604599, "num_tokens": 6122724.0, "step": 376 }, { "entropy": 0.5401545614004135, "epoch": 1.4078578110383537, "grad_norm": 0.030447613447904587, "learning_rate": 0.0002, "loss": 0.5506365299224854, "mean_token_accuracy": 0.7772665321826935, "num_tokens": 6139127.0, "step": 377 }, { "entropy": 0.5432114005088806, "epoch": 1.411599625818522, "grad_norm": 0.03443232551217079, "learning_rate": 0.0002, "loss": 0.5483974814414978, "mean_token_accuracy": 0.7753057479858398, "num_tokens": 6155228.0, "step": 378 }, { "entropy": 0.5419820547103882, "epoch": 1.4153414405986904, "grad_norm": 0.030418474227190018, "learning_rate": 0.0002, "loss": 0.5432078838348389, "mean_token_accuracy": 0.7786633670330048, "num_tokens": 6171661.0, "step": 379 }, { "entropy": 0.5554294884204865, "epoch": 1.4190832553788588, "grad_norm": 0.028558963909745216, "learning_rate": 0.0002, "loss": 0.5531105995178223, "mean_token_accuracy": 0.7719776481389999, "num_tokens": 6187948.0, "step": 380 }, { "entropy": 0.5308730006217957, "epoch": 1.422825070159027, "grad_norm": 0.03490149602293968, "learning_rate": 0.0002, "loss": 0.5338871479034424, "mean_token_accuracy": 0.7831013798713684, "num_tokens": 6203996.0, "step": 381 }, { "entropy": 0.5621105879545212, "epoch": 1.4265668849391955, "grad_norm": 0.03489487245678902, "learning_rate": 0.0002, "loss": 0.5650954246520996, "mean_token_accuracy": 0.7674195319414139, "num_tokens": 6220346.0, "step": 382 }, { "entropy": 0.5624908655881882, "epoch": 1.430308699719364, "grad_norm": 0.02940392680466175, "learning_rate": 0.0002, "loss": 0.5624366998672485, "mean_token_accuracy": 0.769148588180542, "num_tokens": 6236743.0, "step": 383 }, { "entropy": 0.5363715589046478, "epoch": 1.4340505144995324, "grad_norm": 0.028942115604877472, "learning_rate": 0.0002, "loss": 0.5339908599853516, "mean_token_accuracy": 0.7834934592247009, "num_tokens": 6252708.0, "step": 384 }, { "entropy": 0.5408411026000977, "epoch": 1.4377923292797006, "grad_norm": 0.0305769219994545, "learning_rate": 0.0002, "loss": 0.5352215766906738, "mean_token_accuracy": 0.7860714495182037, "num_tokens": 6268903.0, "step": 385 }, { "entropy": 0.5410628318786621, "epoch": 1.441534144059869, "grad_norm": 0.029285579919815063, "learning_rate": 0.0002, "loss": 0.5426855087280273, "mean_token_accuracy": 0.7768432199954987, "num_tokens": 6284894.0, "step": 386 }, { "entropy": 0.5362880975008011, "epoch": 1.4452759588400375, "grad_norm": 0.03178134933114052, "learning_rate": 0.0002, "loss": 0.5503253936767578, "mean_token_accuracy": 0.7759049534797668, "num_tokens": 6301216.0, "step": 387 }, { "entropy": 0.5453620404005051, "epoch": 1.4490177736202057, "grad_norm": 0.029615160077810287, "learning_rate": 0.0002, "loss": 0.5539615154266357, "mean_token_accuracy": 0.7736871391534805, "num_tokens": 6317584.0, "step": 388 }, { "entropy": 0.5552696138620377, "epoch": 1.4527595884003741, "grad_norm": 0.03214653581380844, "learning_rate": 0.0002, "loss": 0.5597580671310425, "mean_token_accuracy": 0.7707493901252747, "num_tokens": 6333884.0, "step": 389 }, { "entropy": 0.553122490644455, "epoch": 1.4565014031805426, "grad_norm": 0.029804600402712822, "learning_rate": 0.0002, "loss": 0.552976131439209, "mean_token_accuracy": 0.778336301445961, "num_tokens": 6350141.0, "step": 390 }, { "entropy": 0.5826992094516754, "epoch": 1.460243217960711, "grad_norm": 0.03438711538910866, "learning_rate": 0.0002, "loss": 0.5765487551689148, "mean_token_accuracy": 0.7643037289381027, "num_tokens": 6366374.0, "step": 391 }, { "entropy": 0.5606750249862671, "epoch": 1.4639850327408794, "grad_norm": 0.030389849096536636, "learning_rate": 0.0002, "loss": 0.5595695376396179, "mean_token_accuracy": 0.7718200087547302, "num_tokens": 6382848.0, "step": 392 }, { "entropy": 0.5619854032993317, "epoch": 1.4677268475210477, "grad_norm": 0.032461296766996384, "learning_rate": 0.0002, "loss": 0.5576058030128479, "mean_token_accuracy": 0.7746401876211166, "num_tokens": 6399173.0, "step": 393 }, { "entropy": 0.5408260822296143, "epoch": 1.471468662301216, "grad_norm": 0.03529435396194458, "learning_rate": 0.0002, "loss": 0.5456345081329346, "mean_token_accuracy": 0.7788489162921906, "num_tokens": 6415565.0, "step": 394 }, { "entropy": 0.5425965934991837, "epoch": 1.4752104770813845, "grad_norm": 0.03692852333188057, "learning_rate": 0.0002, "loss": 0.5488424301147461, "mean_token_accuracy": 0.7782263904809952, "num_tokens": 6431912.0, "step": 395 }, { "entropy": 0.5516340583562851, "epoch": 1.4789522918615527, "grad_norm": 0.031000891700387, "learning_rate": 0.0002, "loss": 0.5553445219993591, "mean_token_accuracy": 0.7752650529146194, "num_tokens": 6448548.0, "step": 396 }, { "entropy": 0.538574829697609, "epoch": 1.4826941066417212, "grad_norm": 0.030864855274558067, "learning_rate": 0.0002, "loss": 0.5368215441703796, "mean_token_accuracy": 0.7809993326663971, "num_tokens": 6465030.0, "step": 397 }, { "entropy": 0.5717963427305222, "epoch": 1.4864359214218896, "grad_norm": 0.033221229910850525, "learning_rate": 0.0002, "loss": 0.571186363697052, "mean_token_accuracy": 0.7653579860925674, "num_tokens": 6481528.0, "step": 398 }, { "entropy": 0.5418017208576202, "epoch": 1.490177736202058, "grad_norm": 0.04067196696996689, "learning_rate": 0.0002, "loss": 0.5442001223564148, "mean_token_accuracy": 0.7763307839632034, "num_tokens": 6497840.0, "step": 399 }, { "entropy": 0.5547621697187424, "epoch": 1.4939195509822265, "grad_norm": 0.03348267823457718, "learning_rate": 0.0002, "loss": 0.5626781582832336, "mean_token_accuracy": 0.7712242007255554, "num_tokens": 6514349.0, "step": 400 }, { "entropy": 0.5494479835033417, "epoch": 1.4976613657623947, "grad_norm": 0.03362090513110161, "learning_rate": 0.0002, "loss": 0.548977792263031, "mean_token_accuracy": 0.7767577767372131, "num_tokens": 6530749.0, "step": 401 }, { "entropy": 0.5626181960105896, "epoch": 1.5014031805425632, "grad_norm": 0.03137248754501343, "learning_rate": 0.0002, "loss": 0.5654096603393555, "mean_token_accuracy": 0.7723931819200516, "num_tokens": 6547276.0, "step": 402 }, { "entropy": 0.5499662905931473, "epoch": 1.5051449953227314, "grad_norm": 0.034359052777290344, "learning_rate": 0.0002, "loss": 0.5508401393890381, "mean_token_accuracy": 0.7756681442260742, "num_tokens": 6563580.0, "step": 403 }, { "entropy": 0.5658421665430069, "epoch": 1.5088868101028998, "grad_norm": 0.030933788046240807, "learning_rate": 0.0002, "loss": 0.5622308254241943, "mean_token_accuracy": 0.769567608833313, "num_tokens": 6579736.0, "step": 404 }, { "entropy": 0.547087088227272, "epoch": 1.5126286248830683, "grad_norm": 0.030160700902342796, "learning_rate": 0.0002, "loss": 0.5470564961433411, "mean_token_accuracy": 0.7781479358673096, "num_tokens": 6596131.0, "step": 405 }, { "entropy": 0.5563077032566071, "epoch": 1.5163704396632367, "grad_norm": 0.029513506218791008, "learning_rate": 0.0002, "loss": 0.5557488799095154, "mean_token_accuracy": 0.7776722609996796, "num_tokens": 6612499.0, "step": 406 }, { "entropy": 0.5473329573869705, "epoch": 1.5201122544434051, "grad_norm": 0.031187692657113075, "learning_rate": 0.0002, "loss": 0.5444590449333191, "mean_token_accuracy": 0.7770859450101852, "num_tokens": 6628905.0, "step": 407 }, { "entropy": 0.5493151396512985, "epoch": 1.5238540692235736, "grad_norm": 0.027274703606963158, "learning_rate": 0.0002, "loss": 0.5559489130973816, "mean_token_accuracy": 0.774099811911583, "num_tokens": 6645207.0, "step": 408 }, { "entropy": 0.5369315445423126, "epoch": 1.5275958840037418, "grad_norm": 0.03280489146709442, "learning_rate": 0.0002, "loss": 0.5494750738143921, "mean_token_accuracy": 0.7781352549791336, "num_tokens": 6661441.0, "step": 409 }, { "entropy": 0.543188214302063, "epoch": 1.5313376987839102, "grad_norm": 0.0317704938352108, "learning_rate": 0.0002, "loss": 0.548348069190979, "mean_token_accuracy": 0.7779366374015808, "num_tokens": 6677890.0, "step": 410 }, { "entropy": 0.5514375120401382, "epoch": 1.5350795135640785, "grad_norm": 0.02904539741575718, "learning_rate": 0.0002, "loss": 0.5532687902450562, "mean_token_accuracy": 0.776079460978508, "num_tokens": 6694229.0, "step": 411 }, { "entropy": 0.5228893607854843, "epoch": 1.538821328344247, "grad_norm": 0.027841076254844666, "learning_rate": 0.0002, "loss": 0.522330641746521, "mean_token_accuracy": 0.7864255011081696, "num_tokens": 6710250.0, "step": 412 }, { "entropy": 0.5390310734510422, "epoch": 1.5425631431244153, "grad_norm": 0.02716185338795185, "learning_rate": 0.0002, "loss": 0.5395499467849731, "mean_token_accuracy": 0.7826422601938248, "num_tokens": 6726768.0, "step": 413 }, { "entropy": 0.5508141964673996, "epoch": 1.5463049579045838, "grad_norm": 0.030815092846751213, "learning_rate": 0.0002, "loss": 0.5503819584846497, "mean_token_accuracy": 0.7755144089460373, "num_tokens": 6743055.0, "step": 414 }, { "entropy": 0.5312939435243607, "epoch": 1.5500467726847522, "grad_norm": 0.028637485578656197, "learning_rate": 0.0002, "loss": 0.5298642516136169, "mean_token_accuracy": 0.7852569371461868, "num_tokens": 6759442.0, "step": 415 }, { "entropy": 0.5471786260604858, "epoch": 1.5537885874649204, "grad_norm": 0.030604762956500053, "learning_rate": 0.0002, "loss": 0.5502840876579285, "mean_token_accuracy": 0.7758130580186844, "num_tokens": 6775919.0, "step": 416 }, { "entropy": 0.5734788477420807, "epoch": 1.5575304022450889, "grad_norm": 0.033530574291944504, "learning_rate": 0.0002, "loss": 0.573567807674408, "mean_token_accuracy": 0.7666918784379959, "num_tokens": 6792496.0, "step": 417 }, { "entropy": 0.5556947290897369, "epoch": 1.561272217025257, "grad_norm": 0.029095808044075966, "learning_rate": 0.0002, "loss": 0.5506360530853271, "mean_token_accuracy": 0.7765111029148102, "num_tokens": 6809055.0, "step": 418 }, { "entropy": 0.5287731885910034, "epoch": 1.5650140318054255, "grad_norm": 0.03587370365858078, "learning_rate": 0.0002, "loss": 0.5343160033226013, "mean_token_accuracy": 0.7836072146892548, "num_tokens": 6825353.0, "step": 419 }, { "entropy": 0.5342409163713455, "epoch": 1.568755846585594, "grad_norm": 0.03603408485651016, "learning_rate": 0.0002, "loss": 0.5409013628959656, "mean_token_accuracy": 0.7804750800132751, "num_tokens": 6841745.0, "step": 420 }, { "entropy": 0.5486701726913452, "epoch": 1.5724976613657624, "grad_norm": 0.02864743210375309, "learning_rate": 0.0002, "loss": 0.5528161525726318, "mean_token_accuracy": 0.7741836905479431, "num_tokens": 6857942.0, "step": 421 }, { "entropy": 0.5741837024688721, "epoch": 1.5762394761459309, "grad_norm": 0.0320119671523571, "learning_rate": 0.0002, "loss": 0.5608420372009277, "mean_token_accuracy": 0.7707283794879913, "num_tokens": 6874193.0, "step": 422 }, { "entropy": 0.5495482236146927, "epoch": 1.5799812909260993, "grad_norm": 0.02604423463344574, "learning_rate": 0.0002, "loss": 0.5479333400726318, "mean_token_accuracy": 0.7773087471723557, "num_tokens": 6890547.0, "step": 423 }, { "entropy": 0.5387884378433228, "epoch": 1.5837231057062675, "grad_norm": 0.03170885518193245, "learning_rate": 0.0002, "loss": 0.5462484359741211, "mean_token_accuracy": 0.7735171020030975, "num_tokens": 6906920.0, "step": 424 }, { "entropy": 0.539916068315506, "epoch": 1.587464920486436, "grad_norm": 0.03372619301080704, "learning_rate": 0.0002, "loss": 0.542754590511322, "mean_token_accuracy": 0.7796132117509842, "num_tokens": 6923352.0, "step": 425 }, { "entropy": 0.5413663387298584, "epoch": 1.5912067352666042, "grad_norm": 0.02999868616461754, "learning_rate": 0.0002, "loss": 0.5444542765617371, "mean_token_accuracy": 0.7786892652511597, "num_tokens": 6939337.0, "step": 426 }, { "entropy": 0.556038424372673, "epoch": 1.5949485500467726, "grad_norm": 0.03419700264930725, "learning_rate": 0.0002, "loss": 0.550898015499115, "mean_token_accuracy": 0.7760495245456696, "num_tokens": 6955389.0, "step": 427 }, { "entropy": 0.5516718029975891, "epoch": 1.598690364826941, "grad_norm": 0.0298128854483366, "learning_rate": 0.0002, "loss": 0.5519053339958191, "mean_token_accuracy": 0.7739587277173996, "num_tokens": 6971808.0, "step": 428 }, { "entropy": 0.5532359778881073, "epoch": 1.6024321796071095, "grad_norm": 0.03213290125131607, "learning_rate": 0.0002, "loss": 0.5568399429321289, "mean_token_accuracy": 0.7753729224205017, "num_tokens": 6988128.0, "step": 429 }, { "entropy": 0.5382643342018127, "epoch": 1.606173994387278, "grad_norm": 0.031161464750766754, "learning_rate": 0.0002, "loss": 0.5440113544464111, "mean_token_accuracy": 0.7779531329870224, "num_tokens": 7004368.0, "step": 430 }, { "entropy": 0.5313677787780762, "epoch": 1.6099158091674464, "grad_norm": 0.036605071276426315, "learning_rate": 0.0002, "loss": 0.5367435216903687, "mean_token_accuracy": 0.7821811884641647, "num_tokens": 7020480.0, "step": 431 }, { "entropy": 0.5567297488451004, "epoch": 1.6136576239476146, "grad_norm": 0.027995243668556213, "learning_rate": 0.0002, "loss": 0.5547551512718201, "mean_token_accuracy": 0.7722228318452835, "num_tokens": 7036925.0, "step": 432 }, { "entropy": 0.5448314994573593, "epoch": 1.617399438727783, "grad_norm": 0.03725632280111313, "learning_rate": 0.0002, "loss": 0.5465018153190613, "mean_token_accuracy": 0.7780062705278397, "num_tokens": 7053019.0, "step": 433 }, { "entropy": 0.5258296579122543, "epoch": 1.6211412535079512, "grad_norm": 0.03214319422841072, "learning_rate": 0.0002, "loss": 0.5300624370574951, "mean_token_accuracy": 0.7829313278198242, "num_tokens": 7069021.0, "step": 434 }, { "entropy": 0.5569266527891159, "epoch": 1.6248830682881197, "grad_norm": 0.03432042896747589, "learning_rate": 0.0002, "loss": 0.5578755140304565, "mean_token_accuracy": 0.7711293399333954, "num_tokens": 7085450.0, "step": 435 }, { "entropy": 0.5638464391231537, "epoch": 1.6286248830682881, "grad_norm": 0.03862602636218071, "learning_rate": 0.0002, "loss": 0.5726134777069092, "mean_token_accuracy": 0.7694450467824936, "num_tokens": 7101666.0, "step": 436 }, { "entropy": 0.564548671245575, "epoch": 1.6323666978484566, "grad_norm": 0.032345570623874664, "learning_rate": 0.0002, "loss": 0.5651994943618774, "mean_token_accuracy": 0.7711433917284012, "num_tokens": 7117907.0, "step": 437 }, { "entropy": 0.5587478131055832, "epoch": 1.636108512628625, "grad_norm": 0.031082862988114357, "learning_rate": 0.0002, "loss": 0.5588955879211426, "mean_token_accuracy": 0.7725447416305542, "num_tokens": 7134131.0, "step": 438 }, { "entropy": 0.5472389608621597, "epoch": 1.6398503274087932, "grad_norm": 0.03695904091000557, "learning_rate": 0.0002, "loss": 0.5445616245269775, "mean_token_accuracy": 0.778590515255928, "num_tokens": 7150298.0, "step": 439 }, { "entropy": 0.5535961091518402, "epoch": 1.6435921421889617, "grad_norm": 0.031128892675042152, "learning_rate": 0.0002, "loss": 0.5437783598899841, "mean_token_accuracy": 0.7785230875015259, "num_tokens": 7166639.0, "step": 440 }, { "entropy": 0.5351960062980652, "epoch": 1.6473339569691299, "grad_norm": 0.03949431702494621, "learning_rate": 0.0002, "loss": 0.5358127355575562, "mean_token_accuracy": 0.7802053093910217, "num_tokens": 7182613.0, "step": 441 }, { "entropy": 0.524370513856411, "epoch": 1.6510757717492983, "grad_norm": 0.03402510657906532, "learning_rate": 0.0002, "loss": 0.5297942161560059, "mean_token_accuracy": 0.7861316353082657, "num_tokens": 7198598.0, "step": 442 }, { "entropy": 0.5440799742937088, "epoch": 1.6548175865294668, "grad_norm": 0.03908916562795639, "learning_rate": 0.0002, "loss": 0.5563719868659973, "mean_token_accuracy": 0.773345485329628, "num_tokens": 7214953.0, "step": 443 }, { "entropy": 0.5496329516172409, "epoch": 1.6585594013096352, "grad_norm": 0.036347340792417526, "learning_rate": 0.0002, "loss": 0.5566647052764893, "mean_token_accuracy": 0.7736042439937592, "num_tokens": 7231069.0, "step": 444 }, { "entropy": 0.5510213375091553, "epoch": 1.6623012160898036, "grad_norm": 0.027416400611400604, "learning_rate": 0.0002, "loss": 0.5495529174804688, "mean_token_accuracy": 0.7757058292627335, "num_tokens": 7247326.0, "step": 445 }, { "entropy": 0.5782728493213654, "epoch": 1.666043030869972, "grad_norm": 0.03216573968529701, "learning_rate": 0.0002, "loss": 0.5692035555839539, "mean_token_accuracy": 0.7700701951980591, "num_tokens": 7263765.0, "step": 446 }, { "entropy": 0.5769474655389786, "epoch": 1.6697848456501403, "grad_norm": 0.03461449593305588, "learning_rate": 0.0002, "loss": 0.5692911148071289, "mean_token_accuracy": 0.7688308656215668, "num_tokens": 7280095.0, "step": 447 }, { "entropy": 0.5636246651411057, "epoch": 1.6735266604303087, "grad_norm": 0.02763124369084835, "learning_rate": 0.0002, "loss": 0.5576487183570862, "mean_token_accuracy": 0.7748333811759949, "num_tokens": 7296592.0, "step": 448 }, { "entropy": 0.5515684485435486, "epoch": 1.677268475210477, "grad_norm": 0.03505739942193031, "learning_rate": 0.0002, "loss": 0.562554121017456, "mean_token_accuracy": 0.7732807844877243, "num_tokens": 7313071.0, "step": 449 }, { "entropy": 0.529756709933281, "epoch": 1.6810102899906454, "grad_norm": 0.035316504538059235, "learning_rate": 0.0002, "loss": 0.5393928289413452, "mean_token_accuracy": 0.7774565666913986, "num_tokens": 7329531.0, "step": 450 }, { "entropy": 0.5509119927883148, "epoch": 1.6847521047708138, "grad_norm": 0.03525395318865776, "learning_rate": 0.0002, "loss": 0.5650572180747986, "mean_token_accuracy": 0.7679217755794525, "num_tokens": 7345852.0, "step": 451 }, { "entropy": 0.5615872442722321, "epoch": 1.6884939195509823, "grad_norm": 0.032941099256277084, "learning_rate": 0.0002, "loss": 0.5626966953277588, "mean_token_accuracy": 0.7703739553689957, "num_tokens": 7362126.0, "step": 452 }, { "entropy": 0.555547222495079, "epoch": 1.6922357343311507, "grad_norm": 0.03228066489100456, "learning_rate": 0.0002, "loss": 0.544800877571106, "mean_token_accuracy": 0.7767430245876312, "num_tokens": 7378671.0, "step": 453 }, { "entropy": 0.554116278886795, "epoch": 1.6959775491113191, "grad_norm": 0.029597081243991852, "learning_rate": 0.0002, "loss": 0.5413352847099304, "mean_token_accuracy": 0.7784619033336639, "num_tokens": 7394967.0, "step": 454 }, { "entropy": 0.5580686628818512, "epoch": 1.6997193638914874, "grad_norm": 0.02839960716664791, "learning_rate": 0.0002, "loss": 0.5585195422172546, "mean_token_accuracy": 0.7723167389631271, "num_tokens": 7411309.0, "step": 455 }, { "entropy": 0.5392096787691116, "epoch": 1.7034611786716558, "grad_norm": 0.03588644042611122, "learning_rate": 0.0002, "loss": 0.5462691187858582, "mean_token_accuracy": 0.7782226353883743, "num_tokens": 7427429.0, "step": 456 }, { "entropy": 0.535987101495266, "epoch": 1.707202993451824, "grad_norm": 0.03534339368343353, "learning_rate": 0.0002, "loss": 0.549435019493103, "mean_token_accuracy": 0.7765841335058212, "num_tokens": 7443721.0, "step": 457 }, { "entropy": 0.5456487089395523, "epoch": 1.7109448082319925, "grad_norm": 0.03618441894650459, "learning_rate": 0.0002, "loss": 0.5485998392105103, "mean_token_accuracy": 0.7757130116224289, "num_tokens": 7460111.0, "step": 458 }, { "entropy": 0.5436663031578064, "epoch": 1.714686623012161, "grad_norm": 0.02979116700589657, "learning_rate": 0.0002, "loss": 0.5414945483207703, "mean_token_accuracy": 0.7812917977571487, "num_tokens": 7476124.0, "step": 459 }, { "entropy": 0.5709712207317352, "epoch": 1.7184284377923293, "grad_norm": 0.03200547397136688, "learning_rate": 0.0002, "loss": 0.5619422197341919, "mean_token_accuracy": 0.7735306322574615, "num_tokens": 7492499.0, "step": 460 }, { "entropy": 0.5626240521669388, "epoch": 1.7221702525724978, "grad_norm": 0.03815503418445587, "learning_rate": 0.0002, "loss": 0.5533303618431091, "mean_token_accuracy": 0.7753702253103256, "num_tokens": 7508641.0, "step": 461 }, { "entropy": 0.5480938106775284, "epoch": 1.725912067352666, "grad_norm": 0.03169892355799675, "learning_rate": 0.0002, "loss": 0.5524613261222839, "mean_token_accuracy": 0.7751649022102356, "num_tokens": 7525219.0, "step": 462 }, { "entropy": 0.5562078654766083, "epoch": 1.7296538821328344, "grad_norm": 0.03617829084396362, "learning_rate": 0.0002, "loss": 0.5619810819625854, "mean_token_accuracy": 0.7714113295078278, "num_tokens": 7541689.0, "step": 463 }, { "entropy": 0.5358584523200989, "epoch": 1.7333956969130027, "grad_norm": 0.03426409512758255, "learning_rate": 0.0002, "loss": 0.5471996068954468, "mean_token_accuracy": 0.7751270979642868, "num_tokens": 7558097.0, "step": 464 }, { "entropy": 0.5273950546979904, "epoch": 1.737137511693171, "grad_norm": 0.03135877847671509, "learning_rate": 0.0002, "loss": 0.5319076776504517, "mean_token_accuracy": 0.7831837683916092, "num_tokens": 7574193.0, "step": 465 }, { "entropy": 0.5745384991168976, "epoch": 1.7408793264733395, "grad_norm": 0.03335622698068619, "learning_rate": 0.0002, "loss": 0.5716018676757812, "mean_token_accuracy": 0.7669582962989807, "num_tokens": 7590824.0, "step": 466 }, { "entropy": 0.5475277155637741, "epoch": 1.744621141253508, "grad_norm": 0.02866513840854168, "learning_rate": 0.0002, "loss": 0.5436227321624756, "mean_token_accuracy": 0.777054488658905, "num_tokens": 7607042.0, "step": 467 }, { "entropy": 0.5518149137496948, "epoch": 1.7483629560336764, "grad_norm": 0.029388844966888428, "learning_rate": 0.0002, "loss": 0.5495098233222961, "mean_token_accuracy": 0.7773433327674866, "num_tokens": 7623420.0, "step": 468 }, { "entropy": 0.5374390631914139, "epoch": 1.7521047708138449, "grad_norm": 0.0325518473982811, "learning_rate": 0.0002, "loss": 0.5412787795066833, "mean_token_accuracy": 0.7788903117179871, "num_tokens": 7639630.0, "step": 469 }, { "entropy": 0.5380698144435883, "epoch": 1.755846585594013, "grad_norm": 0.029125649482011795, "learning_rate": 0.0002, "loss": 0.5411547422409058, "mean_token_accuracy": 0.7780955582857132, "num_tokens": 7655842.0, "step": 470 }, { "entropy": 0.5518491268157959, "epoch": 1.7595884003741815, "grad_norm": 0.03188946843147278, "learning_rate": 0.0002, "loss": 0.5559889674186707, "mean_token_accuracy": 0.7736992090940475, "num_tokens": 7672101.0, "step": 471 }, { "entropy": 0.5442283153533936, "epoch": 1.7633302151543497, "grad_norm": 0.034016743302345276, "learning_rate": 0.0002, "loss": 0.5500984191894531, "mean_token_accuracy": 0.7761438190937042, "num_tokens": 7688113.0, "step": 472 }, { "entropy": 0.5488689690828323, "epoch": 1.7670720299345182, "grad_norm": 0.02747703716158867, "learning_rate": 0.0002, "loss": 0.5475065112113953, "mean_token_accuracy": 0.775134801864624, "num_tokens": 7704497.0, "step": 473 }, { "entropy": 0.568826898932457, "epoch": 1.7708138447146866, "grad_norm": 0.03434092178940773, "learning_rate": 0.0002, "loss": 0.5651647448539734, "mean_token_accuracy": 0.7715141028165817, "num_tokens": 7720786.0, "step": 474 }, { "entropy": 0.5751989632844925, "epoch": 1.774555659494855, "grad_norm": 0.03127957507967949, "learning_rate": 0.0002, "loss": 0.5659101605415344, "mean_token_accuracy": 0.7694416791200638, "num_tokens": 7737241.0, "step": 475 }, { "entropy": 0.5532206594944, "epoch": 1.7782974742750235, "grad_norm": 0.02908439189195633, "learning_rate": 0.0002, "loss": 0.5514166355133057, "mean_token_accuracy": 0.7745979428291321, "num_tokens": 7753654.0, "step": 476 }, { "entropy": 0.5416929870843887, "epoch": 1.782039289055192, "grad_norm": 0.03806254267692566, "learning_rate": 0.0002, "loss": 0.5534486770629883, "mean_token_accuracy": 0.7739390730857849, "num_tokens": 7770019.0, "step": 477 }, { "entropy": 0.5363457053899765, "epoch": 1.7857811038353602, "grad_norm": 0.032926302403211594, "learning_rate": 0.0002, "loss": 0.5503825545310974, "mean_token_accuracy": 0.7768030315637589, "num_tokens": 7786449.0, "step": 478 }, { "entropy": 0.5420104712247849, "epoch": 1.7895229186155284, "grad_norm": 0.02965935505926609, "learning_rate": 0.0002, "loss": 0.5425794124603271, "mean_token_accuracy": 0.7801303416490555, "num_tokens": 7802671.0, "step": 479 }, { "entropy": 0.549240380525589, "epoch": 1.7932647333956968, "grad_norm": 0.029267581179738045, "learning_rate": 0.0002, "loss": 0.5447797179222107, "mean_token_accuracy": 0.7785746455192566, "num_tokens": 7819171.0, "step": 480 }, { "entropy": 0.5564038902521133, "epoch": 1.7970065481758652, "grad_norm": 0.027819465845823288, "learning_rate": 0.0002, "loss": 0.5569280385971069, "mean_token_accuracy": 0.7717359662055969, "num_tokens": 7835514.0, "step": 481 }, { "entropy": 0.5513341128826141, "epoch": 1.8007483629560337, "grad_norm": 0.032080937176942825, "learning_rate": 0.0002, "loss": 0.5565280318260193, "mean_token_accuracy": 0.7745318114757538, "num_tokens": 7851901.0, "step": 482 }, { "entropy": 0.5669872015714645, "epoch": 1.8044901777362021, "grad_norm": 0.031251415610313416, "learning_rate": 0.0002, "loss": 0.5653026103973389, "mean_token_accuracy": 0.7678168416023254, "num_tokens": 7868506.0, "step": 483 }, { "entropy": 0.5539208799600601, "epoch": 1.8082319925163706, "grad_norm": 0.02905306965112686, "learning_rate": 0.0002, "loss": 0.5545270442962646, "mean_token_accuracy": 0.7701525986194611, "num_tokens": 7884991.0, "step": 484 }, { "entropy": 0.5545967519283295, "epoch": 1.8119738072965388, "grad_norm": 0.028621984645724297, "learning_rate": 0.0002, "loss": 0.5514732003211975, "mean_token_accuracy": 0.7761166989803314, "num_tokens": 7901376.0, "step": 485 }, { "entropy": 0.5499511659145355, "epoch": 1.8157156220767072, "grad_norm": 0.03022296354174614, "learning_rate": 0.0002, "loss": 0.5498670339584351, "mean_token_accuracy": 0.7770126014947891, "num_tokens": 7917862.0, "step": 486 }, { "entropy": 0.5304104536771774, "epoch": 1.8194574368568754, "grad_norm": 0.03297071531414986, "learning_rate": 0.0002, "loss": 0.5350517630577087, "mean_token_accuracy": 0.7801762819290161, "num_tokens": 7933992.0, "step": 487 }, { "entropy": 0.5290692076086998, "epoch": 1.8231992516370439, "grad_norm": 0.03105652704834938, "learning_rate": 0.0002, "loss": 0.5332382917404175, "mean_token_accuracy": 0.7827692329883575, "num_tokens": 7949802.0, "step": 488 }, { "entropy": 0.5513493865728378, "epoch": 1.8269410664172123, "grad_norm": 0.027769237756729126, "learning_rate": 0.0002, "loss": 0.5537266135215759, "mean_token_accuracy": 0.7724474370479584, "num_tokens": 7966264.0, "step": 489 }, { "entropy": 0.559148445725441, "epoch": 1.8306828811973808, "grad_norm": 0.03133245185017586, "learning_rate": 0.0002, "loss": 0.5547972321510315, "mean_token_accuracy": 0.7729021608829498, "num_tokens": 7982562.0, "step": 490 }, { "entropy": 0.5613508969545364, "epoch": 1.8344246959775492, "grad_norm": 0.031487561762332916, "learning_rate": 0.0002, "loss": 0.5589193105697632, "mean_token_accuracy": 0.7691849023103714, "num_tokens": 7999101.0, "step": 491 }, { "entropy": 0.552077904343605, "epoch": 1.8381665107577176, "grad_norm": 0.030901558697223663, "learning_rate": 0.0002, "loss": 0.5548684597015381, "mean_token_accuracy": 0.7746628671884537, "num_tokens": 8015580.0, "step": 492 }, { "entropy": 0.5537288337945938, "epoch": 1.8419083255378859, "grad_norm": 0.032475873827934265, "learning_rate": 0.0002, "loss": 0.554737389087677, "mean_token_accuracy": 0.7736551910638809, "num_tokens": 8031933.0, "step": 493 }, { "entropy": 0.548131912946701, "epoch": 1.8456501403180543, "grad_norm": 0.034645676612854004, "learning_rate": 0.0002, "loss": 0.5518745183944702, "mean_token_accuracy": 0.7750734686851501, "num_tokens": 8048122.0, "step": 494 }, { "entropy": 0.5457621365785599, "epoch": 1.8493919550982225, "grad_norm": 0.0346519835293293, "learning_rate": 0.0002, "loss": 0.5511569380760193, "mean_token_accuracy": 0.774482324719429, "num_tokens": 8064371.0, "step": 495 }, { "entropy": 0.5622203350067139, "epoch": 1.853133769878391, "grad_norm": 0.04098769649863243, "learning_rate": 0.0002, "loss": 0.5641219615936279, "mean_token_accuracy": 0.7717546820640564, "num_tokens": 8080811.0, "step": 496 }, { "entropy": 0.5483545809984207, "epoch": 1.8568755846585594, "grad_norm": 0.03688424080610275, "learning_rate": 0.0002, "loss": 0.5510388612747192, "mean_token_accuracy": 0.7764346599578857, "num_tokens": 8097126.0, "step": 497 }, { "entropy": 0.5505103766918182, "epoch": 1.8606173994387278, "grad_norm": 0.03670699521899223, "learning_rate": 0.0002, "loss": 0.5573628544807434, "mean_token_accuracy": 0.7726601958274841, "num_tokens": 8113420.0, "step": 498 }, { "entropy": 0.529410183429718, "epoch": 1.8643592142188963, "grad_norm": 0.0299246683716774, "learning_rate": 0.0002, "loss": 0.5223079919815063, "mean_token_accuracy": 0.787264496088028, "num_tokens": 8129867.0, "step": 499 }, { "entropy": 0.5540086030960083, "epoch": 1.8681010289990645, "grad_norm": 0.03435957059264183, "learning_rate": 0.0002, "loss": 0.5479264259338379, "mean_token_accuracy": 0.7777916789054871, "num_tokens": 8146232.0, "step": 500 }, { "entropy": 0.5476558804512024, "epoch": 1.871842843779233, "grad_norm": 0.032948873937129974, "learning_rate": 0.0002, "loss": 0.5458691716194153, "mean_token_accuracy": 0.7800754606723785, "num_tokens": 8162478.0, "step": 501 }, { "entropy": 0.5278200954198837, "epoch": 1.8755846585594012, "grad_norm": 0.02974856086075306, "learning_rate": 0.0002, "loss": 0.5305043458938599, "mean_token_accuracy": 0.785199910402298, "num_tokens": 8179046.0, "step": 502 }, { "entropy": 0.5498995333909988, "epoch": 1.8793264733395696, "grad_norm": 0.035161007195711136, "learning_rate": 0.0002, "loss": 0.5587770342826843, "mean_token_accuracy": 0.7729851007461548, "num_tokens": 8195430.0, "step": 503 }, { "entropy": 0.5525415539741516, "epoch": 1.883068288119738, "grad_norm": 0.0358411967754364, "learning_rate": 0.0002, "loss": 0.5540306568145752, "mean_token_accuracy": 0.7763612270355225, "num_tokens": 8211820.0, "step": 504 }, { "entropy": 0.548132598400116, "epoch": 1.8868101028999065, "grad_norm": 0.030124109238386154, "learning_rate": 0.0002, "loss": 0.5509622693061829, "mean_token_accuracy": 0.7774811685085297, "num_tokens": 8228136.0, "step": 505 }, { "entropy": 0.5653504580259323, "epoch": 1.890551917680075, "grad_norm": 0.03144733980298042, "learning_rate": 0.0002, "loss": 0.5578948259353638, "mean_token_accuracy": 0.7719802111387253, "num_tokens": 8244600.0, "step": 506 }, { "entropy": 0.5680980533361435, "epoch": 1.8942937324602434, "grad_norm": 0.03786737844347954, "learning_rate": 0.0002, "loss": 0.5742643475532532, "mean_token_accuracy": 0.7682982087135315, "num_tokens": 8260924.0, "step": 507 }, { "entropy": 0.5519368350505829, "epoch": 1.8980355472404116, "grad_norm": 0.03175094351172447, "learning_rate": 0.0002, "loss": 0.553012490272522, "mean_token_accuracy": 0.7758679240942001, "num_tokens": 8277138.0, "step": 508 }, { "entropy": 0.550408124923706, "epoch": 1.90177736202058, "grad_norm": 0.03196226805448532, "learning_rate": 0.0002, "loss": 0.5527910590171814, "mean_token_accuracy": 0.7774336487054825, "num_tokens": 8293651.0, "step": 509 }, { "entropy": 0.551310807466507, "epoch": 1.9055191768007482, "grad_norm": 0.032158490270376205, "learning_rate": 0.0002, "loss": 0.5532134175300598, "mean_token_accuracy": 0.7765610069036484, "num_tokens": 8310166.0, "step": 510 }, { "entropy": 0.554396003484726, "epoch": 1.9092609915809167, "grad_norm": 0.03265155106782913, "learning_rate": 0.0002, "loss": 0.5611427426338196, "mean_token_accuracy": 0.770960658788681, "num_tokens": 8326460.0, "step": 511 }, { "entropy": 0.5533443540334702, "epoch": 1.913002806361085, "grad_norm": 0.03062952496111393, "learning_rate": 0.0002, "loss": 0.5535008311271667, "mean_token_accuracy": 0.7743202298879623, "num_tokens": 8342730.0, "step": 512 }, { "entropy": 0.557416245341301, "epoch": 1.9167446211412535, "grad_norm": 0.032427720725536346, "learning_rate": 0.0002, "loss": 0.555341899394989, "mean_token_accuracy": 0.7736751586198807, "num_tokens": 8358790.0, "step": 513 }, { "entropy": 0.5498823821544647, "epoch": 1.920486435921422, "grad_norm": 0.03641689941287041, "learning_rate": 0.0002, "loss": 0.5489510893821716, "mean_token_accuracy": 0.7756739258766174, "num_tokens": 8374932.0, "step": 514 }, { "entropy": 0.5567668825387955, "epoch": 1.9242282507015904, "grad_norm": 0.0356590710580349, "learning_rate": 0.0002, "loss": 0.5600458979606628, "mean_token_accuracy": 0.7731840312480927, "num_tokens": 8391373.0, "step": 515 }, { "entropy": 0.5492214262485504, "epoch": 1.9279700654817586, "grad_norm": 0.032011594623327255, "learning_rate": 0.0002, "loss": 0.5541006326675415, "mean_token_accuracy": 0.7760893553495407, "num_tokens": 8407637.0, "step": 516 }, { "entropy": 0.5398948937654495, "epoch": 1.931711880261927, "grad_norm": 0.03577565401792526, "learning_rate": 0.0002, "loss": 0.5467641949653625, "mean_token_accuracy": 0.775809720158577, "num_tokens": 8423916.0, "step": 517 }, { "entropy": 0.5437736511230469, "epoch": 1.9354536950420953, "grad_norm": 0.031068816781044006, "learning_rate": 0.0002, "loss": 0.5446307063102722, "mean_token_accuracy": 0.7766688168048859, "num_tokens": 8440387.0, "step": 518 }, { "entropy": 0.551026239991188, "epoch": 1.9391955098222637, "grad_norm": 0.03239775449037552, "learning_rate": 0.0002, "loss": 0.5448942184448242, "mean_token_accuracy": 0.7764843702316284, "num_tokens": 8456844.0, "step": 519 }, { "entropy": 0.5524020791053772, "epoch": 1.9429373246024322, "grad_norm": 0.03006759099662304, "learning_rate": 0.0002, "loss": 0.5508519411087036, "mean_token_accuracy": 0.7757467180490494, "num_tokens": 8473098.0, "step": 520 }, { "entropy": 0.5465254038572311, "epoch": 1.9466791393826006, "grad_norm": 0.03377439081668854, "learning_rate": 0.0002, "loss": 0.5440271496772766, "mean_token_accuracy": 0.7764104902744293, "num_tokens": 8489284.0, "step": 521 }, { "entropy": 0.5479972213506699, "epoch": 1.950420954162769, "grad_norm": 0.03804773464798927, "learning_rate": 0.0002, "loss": 0.5570059418678284, "mean_token_accuracy": 0.7720707058906555, "num_tokens": 8505659.0, "step": 522 }, { "entropy": 0.5531162023544312, "epoch": 1.9541627689429373, "grad_norm": 0.0431046187877655, "learning_rate": 0.0002, "loss": 0.5670960545539856, "mean_token_accuracy": 0.7688823044300079, "num_tokens": 8522329.0, "step": 523 }, { "entropy": 0.5688248574733734, "epoch": 1.9579045837231057, "grad_norm": 0.026841329410672188, "learning_rate": 0.0002, "loss": 0.5626019835472107, "mean_token_accuracy": 0.7691622525453568, "num_tokens": 8538842.0, "step": 524 }, { "entropy": 0.5459724515676498, "epoch": 1.961646398503274, "grad_norm": 0.03493349626660347, "learning_rate": 0.0002, "loss": 0.5443795919418335, "mean_token_accuracy": 0.7770666480064392, "num_tokens": 8554945.0, "step": 525 }, { "entropy": 0.5657712519168854, "epoch": 1.9653882132834424, "grad_norm": 0.03769686445593834, "learning_rate": 0.0002, "loss": 0.5527753829956055, "mean_token_accuracy": 0.7778369933366776, "num_tokens": 8570989.0, "step": 526 }, { "entropy": 0.550276130437851, "epoch": 1.9691300280636108, "grad_norm": 0.03369564935564995, "learning_rate": 0.0002, "loss": 0.5424638986587524, "mean_token_accuracy": 0.7803192138671875, "num_tokens": 8587072.0, "step": 527 }, { "entropy": 0.5489895343780518, "epoch": 1.9728718428437793, "grad_norm": 0.03569629415869713, "learning_rate": 0.0002, "loss": 0.559888482093811, "mean_token_accuracy": 0.7720399796962738, "num_tokens": 8603352.0, "step": 528 }, { "entropy": 0.530121460556984, "epoch": 1.9766136576239477, "grad_norm": 0.037291910499334335, "learning_rate": 0.0002, "loss": 0.5450345873832703, "mean_token_accuracy": 0.7796709537506104, "num_tokens": 8619760.0, "step": 529 }, { "entropy": 0.5523941069841385, "epoch": 1.9803554724041161, "grad_norm": 0.027196237817406654, "learning_rate": 0.0002, "loss": 0.5566985011100769, "mean_token_accuracy": 0.773260235786438, "num_tokens": 8636140.0, "step": 530 }, { "entropy": 0.5579734891653061, "epoch": 1.9840972871842844, "grad_norm": 0.029088523238897324, "learning_rate": 0.0002, "loss": 0.5540033578872681, "mean_token_accuracy": 0.7756596505641937, "num_tokens": 8652295.0, "step": 531 }, { "entropy": 0.5574969351291656, "epoch": 1.9878391019644528, "grad_norm": 0.029939375817775726, "learning_rate": 0.0002, "loss": 0.5501161217689514, "mean_token_accuracy": 0.7750376909971237, "num_tokens": 8668973.0, "step": 532 }, { "entropy": 0.5492955148220062, "epoch": 1.991580916744621, "grad_norm": 0.03092138096690178, "learning_rate": 0.0002, "loss": 0.5422185063362122, "mean_token_accuracy": 0.7804518193006516, "num_tokens": 8685148.0, "step": 533 }, { "entropy": 0.5466224402189255, "epoch": 1.9953227315247895, "grad_norm": 0.03692883625626564, "learning_rate": 0.0002, "loss": 0.5514038801193237, "mean_token_accuracy": 0.7737534046173096, "num_tokens": 8701543.0, "step": 534 }, { "entropy": 0.5537078529596329, "epoch": 1.999064546304958, "grad_norm": 0.03208556026220322, "learning_rate": 0.0002, "loss": 0.5545927286148071, "mean_token_accuracy": 0.777570441365242, "num_tokens": 8717790.0, "step": 535 }, { "entropy": 0.5328470468521118, "epoch": 2.0, "grad_norm": 0.056387241929769516, "learning_rate": 0.0002, "loss": 0.5407091379165649, "mean_token_accuracy": 0.7980132699012756, "num_tokens": 8719006.0, "step": 536 }, { "entropy": 0.5399350374937057, "epoch": 2.0037418147801684, "grad_norm": 0.030944975093007088, "learning_rate": 0.0002, "loss": 0.5385851263999939, "mean_token_accuracy": 0.7820405662059784, "num_tokens": 8735642.0, "step": 537 }, { "entropy": 0.5494481921195984, "epoch": 2.007483629560337, "grad_norm": 0.037696994841098785, "learning_rate": 0.0002, "loss": 0.5568894147872925, "mean_token_accuracy": 0.7728834450244904, "num_tokens": 8752037.0, "step": 538 }, { "entropy": 0.5218051299452782, "epoch": 2.0112254443405053, "grad_norm": 0.03197522833943367, "learning_rate": 0.0002, "loss": 0.5231513977050781, "mean_token_accuracy": 0.7889297753572464, "num_tokens": 8768180.0, "step": 539 }, { "entropy": 0.5204869955778122, "epoch": 2.0149672591206733, "grad_norm": 0.03365905210375786, "learning_rate": 0.0002, "loss": 0.5204414129257202, "mean_token_accuracy": 0.7887504994869232, "num_tokens": 8784385.0, "step": 540 }, { "entropy": 0.5250371545553207, "epoch": 2.0187090739008418, "grad_norm": 0.03206612914800644, "learning_rate": 0.0002, "loss": 0.5264713764190674, "mean_token_accuracy": 0.7865318804979324, "num_tokens": 8800264.0, "step": 541 }, { "entropy": 0.5362996757030487, "epoch": 2.02245088868101, "grad_norm": 0.035737182945013046, "learning_rate": 0.0002, "loss": 0.5328425765037537, "mean_token_accuracy": 0.7832369208335876, "num_tokens": 8816869.0, "step": 542 }, { "entropy": 0.5211998522281647, "epoch": 2.0261927034611786, "grad_norm": 0.03382508456707001, "learning_rate": 0.0002, "loss": 0.5247855186462402, "mean_token_accuracy": 0.7869311422109604, "num_tokens": 8833119.0, "step": 543 }, { "entropy": 0.5350741446018219, "epoch": 2.029934518241347, "grad_norm": 0.03478322923183441, "learning_rate": 0.0002, "loss": 0.5424962639808655, "mean_token_accuracy": 0.7780940532684326, "num_tokens": 8849384.0, "step": 544 }, { "entropy": 0.5465849786996841, "epoch": 2.0336763330215155, "grad_norm": 0.04140733554959297, "learning_rate": 0.0002, "loss": 0.5555759072303772, "mean_token_accuracy": 0.7771580815315247, "num_tokens": 8865580.0, "step": 545 }, { "entropy": 0.5315355062484741, "epoch": 2.037418147801684, "grad_norm": 0.037138681858778, "learning_rate": 0.0002, "loss": 0.5277940630912781, "mean_token_accuracy": 0.7869007289409637, "num_tokens": 8882160.0, "step": 546 }, { "entropy": 0.5415049940347672, "epoch": 2.0411599625818524, "grad_norm": 0.0382317453622818, "learning_rate": 0.0002, "loss": 0.52928626537323, "mean_token_accuracy": 0.783332422375679, "num_tokens": 8898284.0, "step": 547 }, { "entropy": 0.5444429516792297, "epoch": 2.0449017773620204, "grad_norm": 0.03212872892618179, "learning_rate": 0.0002, "loss": 0.5390786528587341, "mean_token_accuracy": 0.7800189107656479, "num_tokens": 8914317.0, "step": 548 }, { "entropy": 0.5368607640266418, "epoch": 2.048643592142189, "grad_norm": 0.03962872177362442, "learning_rate": 0.0002, "loss": 0.5424067974090576, "mean_token_accuracy": 0.7807967215776443, "num_tokens": 8930503.0, "step": 549 }, { "entropy": 0.5316442102193832, "epoch": 2.0523854069223573, "grad_norm": 0.04042808711528778, "learning_rate": 0.0002, "loss": 0.5394030809402466, "mean_token_accuracy": 0.7808849960565567, "num_tokens": 8946862.0, "step": 550 }, { "entropy": 0.5393616110086441, "epoch": 2.0561272217025257, "grad_norm": 0.04134383797645569, "learning_rate": 0.0002, "loss": 0.5422969460487366, "mean_token_accuracy": 0.778337299823761, "num_tokens": 8963159.0, "step": 551 }, { "entropy": 0.5272297635674477, "epoch": 2.059869036482694, "grad_norm": 0.03908038139343262, "learning_rate": 0.0002, "loss": 0.5269819498062134, "mean_token_accuracy": 0.7861954718828201, "num_tokens": 8979486.0, "step": 552 }, { "entropy": 0.5292486846446991, "epoch": 2.0636108512628626, "grad_norm": 0.03547659516334534, "learning_rate": 0.0002, "loss": 0.531383752822876, "mean_token_accuracy": 0.7845012545585632, "num_tokens": 8995728.0, "step": 553 }, { "entropy": 0.537693664431572, "epoch": 2.067352666043031, "grad_norm": 0.04505831003189087, "learning_rate": 0.0002, "loss": 0.5415912866592407, "mean_token_accuracy": 0.7810403853654861, "num_tokens": 9012262.0, "step": 554 }, { "entropy": 0.542693018913269, "epoch": 2.0710944808231995, "grad_norm": 0.03637455403804779, "learning_rate": 0.0002, "loss": 0.5454283356666565, "mean_token_accuracy": 0.7768286317586899, "num_tokens": 9028450.0, "step": 555 }, { "entropy": 0.5359488427639008, "epoch": 2.0748362956033675, "grad_norm": 0.038283299654722214, "learning_rate": 0.0002, "loss": 0.5341436266899109, "mean_token_accuracy": 0.7861706465482712, "num_tokens": 9044691.0, "step": 556 }, { "entropy": 0.5348773896694183, "epoch": 2.078578110383536, "grad_norm": 0.038720738142728806, "learning_rate": 0.0002, "loss": 0.5340168476104736, "mean_token_accuracy": 0.7848398089408875, "num_tokens": 9061090.0, "step": 557 }, { "entropy": 0.5301378965377808, "epoch": 2.0823199251637043, "grad_norm": 0.03610686585307121, "learning_rate": 0.0002, "loss": 0.5331196784973145, "mean_token_accuracy": 0.7825122624635696, "num_tokens": 9077457.0, "step": 558 }, { "entropy": 0.5627280175685883, "epoch": 2.086061739943873, "grad_norm": 0.0459170863032341, "learning_rate": 0.0002, "loss": 0.5622618198394775, "mean_token_accuracy": 0.7731509357690811, "num_tokens": 9093892.0, "step": 559 }, { "entropy": 0.5291252806782722, "epoch": 2.0898035547240412, "grad_norm": 0.03501354530453682, "learning_rate": 0.0002, "loss": 0.5241326689720154, "mean_token_accuracy": 0.7903649061918259, "num_tokens": 9110195.0, "step": 560 }, { "entropy": 0.5336360484361649, "epoch": 2.0935453695042097, "grad_norm": 0.03297366574406624, "learning_rate": 0.0002, "loss": 0.5302354097366333, "mean_token_accuracy": 0.7871804982423782, "num_tokens": 9126264.0, "step": 561 }, { "entropy": 0.5324128270149231, "epoch": 2.097287184284378, "grad_norm": 0.040097158402204514, "learning_rate": 0.0002, "loss": 0.5449591875076294, "mean_token_accuracy": 0.7766915112733841, "num_tokens": 9142405.0, "step": 562 }, { "entropy": 0.5327176600694656, "epoch": 2.101028999064546, "grad_norm": 0.03983257710933685, "learning_rate": 0.0002, "loss": 0.5427699089050293, "mean_token_accuracy": 0.780575692653656, "num_tokens": 9158550.0, "step": 563 }, { "entropy": 0.5298762768507004, "epoch": 2.1047708138447145, "grad_norm": 0.035936590284109116, "learning_rate": 0.0002, "loss": 0.5320777297019958, "mean_token_accuracy": 0.7820149213075638, "num_tokens": 9174783.0, "step": 564 }, { "entropy": 0.5250122100114822, "epoch": 2.108512628624883, "grad_norm": 0.03537021949887276, "learning_rate": 0.0002, "loss": 0.5220876932144165, "mean_token_accuracy": 0.7874044477939606, "num_tokens": 9190734.0, "step": 565 }, { "entropy": 0.5498971343040466, "epoch": 2.1122544434050514, "grad_norm": 0.03972788527607918, "learning_rate": 0.0002, "loss": 0.5416819453239441, "mean_token_accuracy": 0.7811024487018585, "num_tokens": 9207046.0, "step": 566 }, { "entropy": 0.5510820746421814, "epoch": 2.11599625818522, "grad_norm": 0.03674028813838959, "learning_rate": 0.0002, "loss": 0.5430952906608582, "mean_token_accuracy": 0.7772987484931946, "num_tokens": 9223541.0, "step": 567 }, { "entropy": 0.5243249386548996, "epoch": 2.1197380729653883, "grad_norm": 0.03868189826607704, "learning_rate": 0.0002, "loss": 0.5305947065353394, "mean_token_accuracy": 0.7821440249681473, "num_tokens": 9239944.0, "step": 568 }, { "entropy": 0.5186186358332634, "epoch": 2.1234798877455567, "grad_norm": 0.03420955687761307, "learning_rate": 0.0002, "loss": 0.5219792127609253, "mean_token_accuracy": 0.787507027387619, "num_tokens": 9256323.0, "step": 569 }, { "entropy": 0.5048380643129349, "epoch": 2.127221702525725, "grad_norm": 0.043813057243824005, "learning_rate": 0.0002, "loss": 0.511600911617279, "mean_token_accuracy": 0.7919255346059799, "num_tokens": 9272250.0, "step": 570 }, { "entropy": 0.5333007425069809, "epoch": 2.130963517305893, "grad_norm": 0.03591044992208481, "learning_rate": 0.0002, "loss": 0.5382859110832214, "mean_token_accuracy": 0.7790134996175766, "num_tokens": 9288633.0, "step": 571 }, { "entropy": 0.5432953387498856, "epoch": 2.1347053320860616, "grad_norm": 0.03850630670785904, "learning_rate": 0.0002, "loss": 0.5398726463317871, "mean_token_accuracy": 0.7803007066249847, "num_tokens": 9304977.0, "step": 572 }, { "entropy": 0.5424948632717133, "epoch": 2.13844714686623, "grad_norm": 0.042041826993227005, "learning_rate": 0.0002, "loss": 0.5371389389038086, "mean_token_accuracy": 0.7817080616950989, "num_tokens": 9321211.0, "step": 573 }, { "entropy": 0.5420571565628052, "epoch": 2.1421889616463985, "grad_norm": 0.03702463209629059, "learning_rate": 0.0002, "loss": 0.5405826568603516, "mean_token_accuracy": 0.7787773013114929, "num_tokens": 9337519.0, "step": 574 }, { "entropy": 0.5343386083841324, "epoch": 2.145930776426567, "grad_norm": 0.0367942713201046, "learning_rate": 0.0002, "loss": 0.5343334078788757, "mean_token_accuracy": 0.7813169211149216, "num_tokens": 9353930.0, "step": 575 }, { "entropy": 0.5107736587524414, "epoch": 2.1496725912067354, "grad_norm": 0.04816743731498718, "learning_rate": 0.0002, "loss": 0.5181273221969604, "mean_token_accuracy": 0.790352001786232, "num_tokens": 9370151.0, "step": 576 }, { "entropy": 0.5483916699886322, "epoch": 2.153414405986904, "grad_norm": 0.03954138606786728, "learning_rate": 0.0002, "loss": 0.5537930130958557, "mean_token_accuracy": 0.7744487076997757, "num_tokens": 9386529.0, "step": 577 }, { "entropy": 0.5222444832324982, "epoch": 2.157156220767072, "grad_norm": 0.04258863255381584, "learning_rate": 0.0002, "loss": 0.5331015586853027, "mean_token_accuracy": 0.7828160971403122, "num_tokens": 9402702.0, "step": 578 }, { "entropy": 0.5395079553127289, "epoch": 2.1608980355472402, "grad_norm": 0.036775294691324234, "learning_rate": 0.0002, "loss": 0.5392586588859558, "mean_token_accuracy": 0.7785846441984177, "num_tokens": 9418983.0, "step": 579 }, { "entropy": 0.5308848768472672, "epoch": 2.1646398503274087, "grad_norm": 0.041630957275629044, "learning_rate": 0.0002, "loss": 0.5223425030708313, "mean_token_accuracy": 0.7881145030260086, "num_tokens": 9435130.0, "step": 580 }, { "entropy": 0.5460510104894638, "epoch": 2.168381665107577, "grad_norm": 0.040873266756534576, "learning_rate": 0.0002, "loss": 0.5389937162399292, "mean_token_accuracy": 0.7796555161476135, "num_tokens": 9451384.0, "step": 581 }, { "entropy": 0.5144870802760124, "epoch": 2.1721234798877456, "grad_norm": 0.04395061731338501, "learning_rate": 0.0002, "loss": 0.5220937132835388, "mean_token_accuracy": 0.7867953330278397, "num_tokens": 9467676.0, "step": 582 }, { "entropy": 0.5361004173755646, "epoch": 2.175865294667914, "grad_norm": 0.03444032743573189, "learning_rate": 0.0002, "loss": 0.5381976962089539, "mean_token_accuracy": 0.7804248631000519, "num_tokens": 9484105.0, "step": 583 }, { "entropy": 0.5315199941396713, "epoch": 2.1796071094480824, "grad_norm": 0.04019028693437576, "learning_rate": 0.0002, "loss": 0.538859486579895, "mean_token_accuracy": 0.7802779376506805, "num_tokens": 9500441.0, "step": 584 }, { "entropy": 0.5049743205308914, "epoch": 2.183348924228251, "grad_norm": 0.038020916283130646, "learning_rate": 0.0002, "loss": 0.5077824592590332, "mean_token_accuracy": 0.794673815369606, "num_tokens": 9516632.0, "step": 585 }, { "entropy": 0.542245015501976, "epoch": 2.187090739008419, "grad_norm": 0.03803880140185356, "learning_rate": 0.0002, "loss": 0.5457203388214111, "mean_token_accuracy": 0.7765202075242996, "num_tokens": 9532790.0, "step": 586 }, { "entropy": 0.545234277844429, "epoch": 2.1908325537885873, "grad_norm": 0.03659515827894211, "learning_rate": 0.0002, "loss": 0.5328729748725891, "mean_token_accuracy": 0.7851473838090897, "num_tokens": 9549021.0, "step": 587 }, { "entropy": 0.5441733747720718, "epoch": 2.1945743685687558, "grad_norm": 0.03839794918894768, "learning_rate": 0.0002, "loss": 0.541313648223877, "mean_token_accuracy": 0.7806493043899536, "num_tokens": 9565414.0, "step": 588 }, { "entropy": 0.5392065942287445, "epoch": 2.198316183348924, "grad_norm": 0.03657695651054382, "learning_rate": 0.0002, "loss": 0.5446825623512268, "mean_token_accuracy": 0.7759186178445816, "num_tokens": 9581834.0, "step": 589 }, { "entropy": 0.5343391597270966, "epoch": 2.2020579981290926, "grad_norm": 0.03904880955815315, "learning_rate": 0.0002, "loss": 0.5319048166275024, "mean_token_accuracy": 0.7858142107725143, "num_tokens": 9598306.0, "step": 590 }, { "entropy": 0.5127864703536034, "epoch": 2.205799812909261, "grad_norm": 0.041219562292099, "learning_rate": 0.0002, "loss": 0.5198400616645813, "mean_token_accuracy": 0.7894931733608246, "num_tokens": 9614512.0, "step": 591 }, { "entropy": 0.5380221456289291, "epoch": 2.2095416276894295, "grad_norm": 0.03763064742088318, "learning_rate": 0.0002, "loss": 0.5350849032402039, "mean_token_accuracy": 0.779957503080368, "num_tokens": 9630831.0, "step": 592 }, { "entropy": 0.5404982268810272, "epoch": 2.213283442469598, "grad_norm": 0.03594009950757027, "learning_rate": 0.0002, "loss": 0.5446127653121948, "mean_token_accuracy": 0.7765700072050095, "num_tokens": 9647260.0, "step": 593 }, { "entropy": 0.5349030494689941, "epoch": 2.217025257249766, "grad_norm": 0.039131198078393936, "learning_rate": 0.0002, "loss": 0.5407675504684448, "mean_token_accuracy": 0.7807668596506119, "num_tokens": 9663454.0, "step": 594 }, { "entropy": 0.5357907861471176, "epoch": 2.2207670720299344, "grad_norm": 0.03754086792469025, "learning_rate": 0.0002, "loss": 0.5390987396240234, "mean_token_accuracy": 0.7814063429832458, "num_tokens": 9679665.0, "step": 595 }, { "entropy": 0.539327397942543, "epoch": 2.224508886810103, "grad_norm": 0.042121171951293945, "learning_rate": 0.0002, "loss": 0.5349074006080627, "mean_token_accuracy": 0.7835494577884674, "num_tokens": 9695690.0, "step": 596 }, { "entropy": 0.5527440309524536, "epoch": 2.2282507015902713, "grad_norm": 0.034759730100631714, "learning_rate": 0.0002, "loss": 0.546990156173706, "mean_token_accuracy": 0.7748693376779556, "num_tokens": 9711925.0, "step": 597 }, { "entropy": 0.5339156091213226, "epoch": 2.2319925163704397, "grad_norm": 0.03824164718389511, "learning_rate": 0.0002, "loss": 0.5315659642219543, "mean_token_accuracy": 0.7847660332918167, "num_tokens": 9728568.0, "step": 598 }, { "entropy": 0.5418261140584946, "epoch": 2.235734331150608, "grad_norm": 0.03952635079622269, "learning_rate": 0.0002, "loss": 0.5444273948669434, "mean_token_accuracy": 0.7786458134651184, "num_tokens": 9744937.0, "step": 599 }, { "entropy": 0.5325147211551666, "epoch": 2.2394761459307766, "grad_norm": 0.038507163524627686, "learning_rate": 0.0002, "loss": 0.538148045539856, "mean_token_accuracy": 0.7803481221199036, "num_tokens": 9761521.0, "step": 600 }, { "entropy": 0.5348295122385025, "epoch": 2.243217960710945, "grad_norm": 0.035764180123806, "learning_rate": 0.0002, "loss": 0.5350884199142456, "mean_token_accuracy": 0.7832496911287308, "num_tokens": 9777702.0, "step": 601 }, { "entropy": 0.549017146229744, "epoch": 2.246959775491113, "grad_norm": 0.037822045385837555, "learning_rate": 0.0002, "loss": 0.5440195798873901, "mean_token_accuracy": 0.7799560874700546, "num_tokens": 9794070.0, "step": 602 }, { "entropy": 0.5402355939149857, "epoch": 2.2507015902712815, "grad_norm": 0.04137027636170387, "learning_rate": 0.0002, "loss": 0.552240788936615, "mean_token_accuracy": 0.7787455171346664, "num_tokens": 9810307.0, "step": 603 }, { "entropy": 0.5575389862060547, "epoch": 2.25444340505145, "grad_norm": 0.03639021888375282, "learning_rate": 0.0002, "loss": 0.555095911026001, "mean_token_accuracy": 0.7715982496738434, "num_tokens": 9826944.0, "step": 604 }, { "entropy": 0.5453804582357407, "epoch": 2.2581852198316184, "grad_norm": 0.0329916886985302, "learning_rate": 0.0002, "loss": 0.5451047420501709, "mean_token_accuracy": 0.778001993894577, "num_tokens": 9843174.0, "step": 605 }, { "entropy": 0.5351513028144836, "epoch": 2.261927034611787, "grad_norm": 0.04027882218360901, "learning_rate": 0.0002, "loss": 0.5335583686828613, "mean_token_accuracy": 0.7831520736217499, "num_tokens": 9859568.0, "step": 606 }, { "entropy": 0.5303051620721817, "epoch": 2.2656688493919552, "grad_norm": 0.037942592054605484, "learning_rate": 0.0002, "loss": 0.5293945670127869, "mean_token_accuracy": 0.7875201851129532, "num_tokens": 9876127.0, "step": 607 }, { "entropy": 0.5205637887120247, "epoch": 2.2694106641721237, "grad_norm": 0.039965420961380005, "learning_rate": 0.0002, "loss": 0.5284023284912109, "mean_token_accuracy": 0.7851175218820572, "num_tokens": 9892336.0, "step": 608 }, { "entropy": 0.5270423293113708, "epoch": 2.2731524789522917, "grad_norm": 0.045534420758485794, "learning_rate": 0.0002, "loss": 0.5361034274101257, "mean_token_accuracy": 0.7813378870487213, "num_tokens": 9908677.0, "step": 609 }, { "entropy": 0.5461472570896149, "epoch": 2.27689429373246, "grad_norm": 0.03911803662776947, "learning_rate": 0.0002, "loss": 0.5419346690177917, "mean_token_accuracy": 0.7793000787496567, "num_tokens": 9925188.0, "step": 610 }, { "entropy": 0.5332899391651154, "epoch": 2.2806361085126285, "grad_norm": 0.03753461316227913, "learning_rate": 0.0002, "loss": 0.5261275172233582, "mean_token_accuracy": 0.7856169193983078, "num_tokens": 9941232.0, "step": 611 }, { "entropy": 0.5298324078321457, "epoch": 2.284377923292797, "grad_norm": 0.03578303009271622, "learning_rate": 0.0002, "loss": 0.525759220123291, "mean_token_accuracy": 0.7869399040937424, "num_tokens": 9957312.0, "step": 612 }, { "entropy": 0.5350215286016464, "epoch": 2.2881197380729654, "grad_norm": 0.04014569893479347, "learning_rate": 0.0002, "loss": 0.5390491485595703, "mean_token_accuracy": 0.7834457159042358, "num_tokens": 9973629.0, "step": 613 }, { "entropy": 0.5366346836090088, "epoch": 2.291861552853134, "grad_norm": 0.03635207563638687, "learning_rate": 0.0002, "loss": 0.5361836552619934, "mean_token_accuracy": 0.7822949439287186, "num_tokens": 9990003.0, "step": 614 }, { "entropy": 0.5358218550682068, "epoch": 2.2956033676333023, "grad_norm": 0.04499870166182518, "learning_rate": 0.0002, "loss": 0.5433334708213806, "mean_token_accuracy": 0.781024381518364, "num_tokens": 10006594.0, "step": 615 }, { "entropy": 0.5238985568284988, "epoch": 2.2993451824134707, "grad_norm": 0.041404612362384796, "learning_rate": 0.0002, "loss": 0.5319328308105469, "mean_token_accuracy": 0.7816060185432434, "num_tokens": 10022841.0, "step": 616 }, { "entropy": 0.5418704599142075, "epoch": 2.3030869971936387, "grad_norm": 0.03798811510205269, "learning_rate": 0.0002, "loss": 0.5385047793388367, "mean_token_accuracy": 0.781515583395958, "num_tokens": 10039191.0, "step": 617 }, { "entropy": 0.5519637167453766, "epoch": 2.306828811973807, "grad_norm": 0.03714706003665924, "learning_rate": 0.0002, "loss": 0.5444304347038269, "mean_token_accuracy": 0.779953271150589, "num_tokens": 10055793.0, "step": 618 }, { "entropy": 0.5363687425851822, "epoch": 2.3105706267539756, "grad_norm": 0.0435946062207222, "learning_rate": 0.0002, "loss": 0.538260817527771, "mean_token_accuracy": 0.7822400480508804, "num_tokens": 10072406.0, "step": 619 }, { "entropy": 0.5363148003816605, "epoch": 2.314312441534144, "grad_norm": 0.03934507444500923, "learning_rate": 0.0002, "loss": 0.5490261316299438, "mean_token_accuracy": 0.7775698453187943, "num_tokens": 10088893.0, "step": 620 }, { "entropy": 0.5337411910295486, "epoch": 2.3180542563143125, "grad_norm": 0.040114130824804306, "learning_rate": 0.0002, "loss": 0.5454047322273254, "mean_token_accuracy": 0.7799661755561829, "num_tokens": 10105348.0, "step": 621 }, { "entropy": 0.5429546684026718, "epoch": 2.321796071094481, "grad_norm": 0.04296046867966652, "learning_rate": 0.0002, "loss": 0.543846070766449, "mean_token_accuracy": 0.7779647558927536, "num_tokens": 10121753.0, "step": 622 }, { "entropy": 0.5331653952598572, "epoch": 2.3255378858746494, "grad_norm": 0.03862839564681053, "learning_rate": 0.0002, "loss": 0.5329957008361816, "mean_token_accuracy": 0.7838963121175766, "num_tokens": 10138069.0, "step": 623 }, { "entropy": 0.5332556366920471, "epoch": 2.3292797006548174, "grad_norm": 0.03637029603123665, "learning_rate": 0.0002, "loss": 0.5306488871574402, "mean_token_accuracy": 0.7843363881111145, "num_tokens": 10154386.0, "step": 624 }, { "entropy": 0.5389147102832794, "epoch": 2.333021515434986, "grad_norm": 0.04242001101374626, "learning_rate": 0.0002, "loss": 0.5379246473312378, "mean_token_accuracy": 0.7805036455392838, "num_tokens": 10170602.0, "step": 625 }, { "entropy": 0.529606968164444, "epoch": 2.3367633302151543, "grad_norm": 0.04366292059421539, "learning_rate": 0.0002, "loss": 0.5345982909202576, "mean_token_accuracy": 0.7849325835704803, "num_tokens": 10186681.0, "step": 626 }, { "entropy": 0.5343451648950577, "epoch": 2.3405051449953227, "grad_norm": 0.04901853948831558, "learning_rate": 0.0002, "loss": 0.5390074253082275, "mean_token_accuracy": 0.7809460461139679, "num_tokens": 10202735.0, "step": 627 }, { "entropy": 0.5364287346601486, "epoch": 2.344246959775491, "grad_norm": 0.03992681950330734, "learning_rate": 0.0002, "loss": 0.5428602695465088, "mean_token_accuracy": 0.7803080379962921, "num_tokens": 10219104.0, "step": 628 }, { "entropy": 0.5363292992115021, "epoch": 2.3479887745556596, "grad_norm": 0.04561900347471237, "learning_rate": 0.0002, "loss": 0.5422950983047485, "mean_token_accuracy": 0.7803726643323898, "num_tokens": 10235450.0, "step": 629 }, { "entropy": 0.5503382086753845, "epoch": 2.351730589335828, "grad_norm": 0.036633238196372986, "learning_rate": 0.0002, "loss": 0.5429909229278564, "mean_token_accuracy": 0.777814120054245, "num_tokens": 10251744.0, "step": 630 }, { "entropy": 0.5556712299585342, "epoch": 2.3554724041159965, "grad_norm": 0.03755469620227814, "learning_rate": 0.0002, "loss": 0.5372464060783386, "mean_token_accuracy": 0.7816385924816132, "num_tokens": 10268228.0, "step": 631 }, { "entropy": 0.54240882396698, "epoch": 2.3592142188961645, "grad_norm": 0.04244554787874222, "learning_rate": 0.0002, "loss": 0.5416730046272278, "mean_token_accuracy": 0.7805517017841339, "num_tokens": 10284594.0, "step": 632 }, { "entropy": 0.5457853078842163, "epoch": 2.362956033676333, "grad_norm": 0.03768390789628029, "learning_rate": 0.0002, "loss": 0.5503990054130554, "mean_token_accuracy": 0.7760391384363174, "num_tokens": 10300645.0, "step": 633 }, { "entropy": 0.5061568543314934, "epoch": 2.3666978484565013, "grad_norm": 0.04066069424152374, "learning_rate": 0.0002, "loss": 0.5147897601127625, "mean_token_accuracy": 0.7923619449138641, "num_tokens": 10317035.0, "step": 634 }, { "entropy": 0.5265238285064697, "epoch": 2.3704396632366698, "grad_norm": 0.045070137828588486, "learning_rate": 0.0002, "loss": 0.5342065691947937, "mean_token_accuracy": 0.7828978300094604, "num_tokens": 10333097.0, "step": 635 }, { "entropy": 0.5213058292865753, "epoch": 2.374181478016838, "grad_norm": 0.04251949489116669, "learning_rate": 0.0002, "loss": 0.5242940783500671, "mean_token_accuracy": 0.7875875681638718, "num_tokens": 10349477.0, "step": 636 }, { "entropy": 0.532469779253006, "epoch": 2.3779232927970066, "grad_norm": 0.04180033504962921, "learning_rate": 0.0002, "loss": 0.5338732600212097, "mean_token_accuracy": 0.7874448299407959, "num_tokens": 10365855.0, "step": 637 }, { "entropy": 0.5583899617195129, "epoch": 2.381665107577175, "grad_norm": 0.036461617797613144, "learning_rate": 0.0002, "loss": 0.5522404313087463, "mean_token_accuracy": 0.7765318900346756, "num_tokens": 10382454.0, "step": 638 }, { "entropy": 0.5361616462469101, "epoch": 2.385406922357343, "grad_norm": 0.03820829838514328, "learning_rate": 0.0002, "loss": 0.5331661701202393, "mean_token_accuracy": 0.7812754958868027, "num_tokens": 10398570.0, "step": 639 }, { "entropy": 0.5388377606868744, "epoch": 2.3891487371375115, "grad_norm": 0.03890148177742958, "learning_rate": 0.0002, "loss": 0.535783052444458, "mean_token_accuracy": 0.7837421149015427, "num_tokens": 10415136.0, "step": 640 }, { "entropy": 0.5403297692537308, "epoch": 2.39289055191768, "grad_norm": 0.037266530096530914, "learning_rate": 0.0002, "loss": 0.5458592176437378, "mean_token_accuracy": 0.7799215018749237, "num_tokens": 10431595.0, "step": 641 }, { "entropy": 0.5327188819646835, "epoch": 2.3966323666978484, "grad_norm": 0.04411016404628754, "learning_rate": 0.0002, "loss": 0.5372153520584106, "mean_token_accuracy": 0.7820907682180405, "num_tokens": 10448092.0, "step": 642 }, { "entropy": 0.5483715236186981, "epoch": 2.400374181478017, "grad_norm": 0.03909829258918762, "learning_rate": 0.0002, "loss": 0.5454411506652832, "mean_token_accuracy": 0.781398132443428, "num_tokens": 10464267.0, "step": 643 }, { "entropy": 0.5467081367969513, "epoch": 2.4041159962581853, "grad_norm": 0.04295220598578453, "learning_rate": 0.0002, "loss": 0.5442530512809753, "mean_token_accuracy": 0.7759910225868225, "num_tokens": 10480622.0, "step": 644 }, { "entropy": 0.545724093914032, "epoch": 2.4078578110383537, "grad_norm": 0.04099191352725029, "learning_rate": 0.0002, "loss": 0.5471324324607849, "mean_token_accuracy": 0.7780001610517502, "num_tokens": 10497093.0, "step": 645 }, { "entropy": 0.5526789277791977, "epoch": 2.411599625818522, "grad_norm": 0.03481397032737732, "learning_rate": 0.0002, "loss": 0.5524189472198486, "mean_token_accuracy": 0.7738725692033768, "num_tokens": 10513288.0, "step": 646 }, { "entropy": 0.5496002286672592, "epoch": 2.4153414405986906, "grad_norm": 0.04474830627441406, "learning_rate": 0.0002, "loss": 0.5568821430206299, "mean_token_accuracy": 0.7747314423322678, "num_tokens": 10529966.0, "step": 647 }, { "entropy": 0.5191539749503136, "epoch": 2.4190832553788586, "grad_norm": 0.04506181180477142, "learning_rate": 0.0002, "loss": 0.5247750878334045, "mean_token_accuracy": 0.7888272404670715, "num_tokens": 10546217.0, "step": 648 }, { "entropy": 0.5462011098861694, "epoch": 2.422825070159027, "grad_norm": 0.03946157172322273, "learning_rate": 0.0002, "loss": 0.5449219942092896, "mean_token_accuracy": 0.7763949930667877, "num_tokens": 10562587.0, "step": 649 }, { "entropy": 0.5374903529882431, "epoch": 2.4265668849391955, "grad_norm": 0.035694316029548645, "learning_rate": 0.0002, "loss": 0.5298718214035034, "mean_token_accuracy": 0.7844248116016388, "num_tokens": 10578673.0, "step": 650 }, { "entropy": 0.5490742027759552, "epoch": 2.430308699719364, "grad_norm": 0.040128957480192184, "learning_rate": 0.0002, "loss": 0.5476623773574829, "mean_token_accuracy": 0.7761844098567963, "num_tokens": 10594904.0, "step": 651 }, { "entropy": 0.5350600033998489, "epoch": 2.4340505144995324, "grad_norm": 0.04965779185295105, "learning_rate": 0.0002, "loss": 0.5467137694358826, "mean_token_accuracy": 0.7777107208967209, "num_tokens": 10611301.0, "step": 652 }, { "entropy": 0.5389928370714188, "epoch": 2.437792329279701, "grad_norm": 0.038716454058885574, "learning_rate": 0.0002, "loss": 0.5406030416488647, "mean_token_accuracy": 0.7798842638731003, "num_tokens": 10627924.0, "step": 653 }, { "entropy": 0.5396043509244919, "epoch": 2.441534144059869, "grad_norm": 0.04796689748764038, "learning_rate": 0.0002, "loss": 0.5485687255859375, "mean_token_accuracy": 0.7767132520675659, "num_tokens": 10643995.0, "step": 654 }, { "entropy": 0.5651813000440598, "epoch": 2.4452759588400372, "grad_norm": 0.03899235278367996, "learning_rate": 0.0002, "loss": 0.5558621883392334, "mean_token_accuracy": 0.7751055210828781, "num_tokens": 10660611.0, "step": 655 }, { "entropy": 0.5467101633548737, "epoch": 2.4490177736202057, "grad_norm": 0.041317425668239594, "learning_rate": 0.0002, "loss": 0.544463574886322, "mean_token_accuracy": 0.7791299223899841, "num_tokens": 10676939.0, "step": 656 }, { "entropy": 0.5405649244785309, "epoch": 2.452759588400374, "grad_norm": 0.03767058625817299, "learning_rate": 0.0002, "loss": 0.5359505414962769, "mean_token_accuracy": 0.7838631421327591, "num_tokens": 10693242.0, "step": 657 }, { "entropy": 0.5295758992433548, "epoch": 2.4565014031805426, "grad_norm": 0.03993664309382439, "learning_rate": 0.0002, "loss": 0.5338568091392517, "mean_token_accuracy": 0.7815168350934982, "num_tokens": 10709228.0, "step": 658 }, { "entropy": 0.5318661481142044, "epoch": 2.460243217960711, "grad_norm": 0.04673660546541214, "learning_rate": 0.0002, "loss": 0.5387503504753113, "mean_token_accuracy": 0.7823595702648163, "num_tokens": 10725743.0, "step": 659 }, { "entropy": 0.5362888127565384, "epoch": 2.4639850327408794, "grad_norm": 0.0443369522690773, "learning_rate": 0.0002, "loss": 0.5374599099159241, "mean_token_accuracy": 0.7816221117973328, "num_tokens": 10742450.0, "step": 660 }, { "entropy": 0.5324875563383102, "epoch": 2.467726847521048, "grad_norm": 0.037758708000183105, "learning_rate": 0.0002, "loss": 0.5326871871948242, "mean_token_accuracy": 0.7862564772367477, "num_tokens": 10758610.0, "step": 661 }, { "entropy": 0.5277500152587891, "epoch": 2.4714686623012163, "grad_norm": 0.042098864912986755, "learning_rate": 0.0002, "loss": 0.5331279635429382, "mean_token_accuracy": 0.7840241938829422, "num_tokens": 10774701.0, "step": 662 }, { "entropy": 0.5366615355014801, "epoch": 2.4752104770813843, "grad_norm": 0.040946412831544876, "learning_rate": 0.0002, "loss": 0.5397564768791199, "mean_token_accuracy": 0.7829322069883347, "num_tokens": 10790740.0, "step": 663 }, { "entropy": 0.5435209423303604, "epoch": 2.4789522918615527, "grad_norm": 0.04173668473958969, "learning_rate": 0.0002, "loss": 0.5457897186279297, "mean_token_accuracy": 0.7782775014638901, "num_tokens": 10806903.0, "step": 664 }, { "entropy": 0.5472803115844727, "epoch": 2.482694106641721, "grad_norm": 0.040667202323675156, "learning_rate": 0.0002, "loss": 0.5462859869003296, "mean_token_accuracy": 0.7769711166620255, "num_tokens": 10823042.0, "step": 665 }, { "entropy": 0.5469382554292679, "epoch": 2.4864359214218896, "grad_norm": 0.04248496890068054, "learning_rate": 0.0002, "loss": 0.5395170450210571, "mean_token_accuracy": 0.7798823863267899, "num_tokens": 10839340.0, "step": 666 }, { "entropy": 0.5202000439167023, "epoch": 2.490177736202058, "grad_norm": 0.03368566930294037, "learning_rate": 0.0002, "loss": 0.5234949588775635, "mean_token_accuracy": 0.786568820476532, "num_tokens": 10855502.0, "step": 667 }, { "entropy": 0.5273594409227371, "epoch": 2.4939195509822265, "grad_norm": 0.04516978561878204, "learning_rate": 0.0002, "loss": 0.5360161066055298, "mean_token_accuracy": 0.7853840887546539, "num_tokens": 10871840.0, "step": 668 }, { "entropy": 0.5393954515457153, "epoch": 2.497661365762395, "grad_norm": 0.03674040734767914, "learning_rate": 0.0002, "loss": 0.5378697514533997, "mean_token_accuracy": 0.7824258059263229, "num_tokens": 10888120.0, "step": 669 }, { "entropy": 0.5479197651147842, "epoch": 2.501403180542563, "grad_norm": 0.03727351129055023, "learning_rate": 0.0002, "loss": 0.5392875671386719, "mean_token_accuracy": 0.7811300158500671, "num_tokens": 10904483.0, "step": 670 }, { "entropy": 0.552995502948761, "epoch": 2.5051449953227314, "grad_norm": 0.036775074899196625, "learning_rate": 0.0002, "loss": 0.5475963950157166, "mean_token_accuracy": 0.7784164547920227, "num_tokens": 10920853.0, "step": 671 }, { "entropy": 0.5446810871362686, "epoch": 2.5088868101029, "grad_norm": 0.038499053567647934, "learning_rate": 0.0002, "loss": 0.5511402487754822, "mean_token_accuracy": 0.7761510908603668, "num_tokens": 10937231.0, "step": 672 }, { "entropy": 0.5175495520234108, "epoch": 2.5126286248830683, "grad_norm": 0.039775073528289795, "learning_rate": 0.0002, "loss": 0.5242205858230591, "mean_token_accuracy": 0.7848553359508514, "num_tokens": 10953429.0, "step": 673 }, { "entropy": 0.5237327665090561, "epoch": 2.5163704396632367, "grad_norm": 0.04171684384346008, "learning_rate": 0.0002, "loss": 0.5307218432426453, "mean_token_accuracy": 0.7838338315486908, "num_tokens": 10969808.0, "step": 674 }, { "entropy": 0.5405460149049759, "epoch": 2.520112254443405, "grad_norm": 0.04240800440311432, "learning_rate": 0.0002, "loss": 0.5408159494400024, "mean_token_accuracy": 0.7787611186504364, "num_tokens": 10986049.0, "step": 675 }, { "entropy": 0.5486787706613541, "epoch": 2.5238540692235736, "grad_norm": 0.039784692227840424, "learning_rate": 0.0002, "loss": 0.5455769896507263, "mean_token_accuracy": 0.7784162014722824, "num_tokens": 11002254.0, "step": 676 }, { "entropy": 0.5363409966230392, "epoch": 2.527595884003742, "grad_norm": 0.03736806660890579, "learning_rate": 0.0002, "loss": 0.5266451239585876, "mean_token_accuracy": 0.7866665124893188, "num_tokens": 11018914.0, "step": 677 }, { "entropy": 0.5279175043106079, "epoch": 2.5313376987839105, "grad_norm": 0.035363830626010895, "learning_rate": 0.0002, "loss": 0.5288829207420349, "mean_token_accuracy": 0.7874743491411209, "num_tokens": 11034952.0, "step": 678 }, { "entropy": 0.5376022309064865, "epoch": 2.5350795135640785, "grad_norm": 0.051831189543008804, "learning_rate": 0.0002, "loss": 0.5518858432769775, "mean_token_accuracy": 0.7750970423221588, "num_tokens": 11051172.0, "step": 679 }, { "entropy": 0.5426171720027924, "epoch": 2.538821328344247, "grad_norm": 0.04189771041274071, "learning_rate": 0.0002, "loss": 0.5544742345809937, "mean_token_accuracy": 0.7774394005537033, "num_tokens": 11067538.0, "step": 680 }, { "entropy": 0.5293037593364716, "epoch": 2.5425631431244153, "grad_norm": 0.04074425622820854, "learning_rate": 0.0002, "loss": 0.5310404896736145, "mean_token_accuracy": 0.7826415598392487, "num_tokens": 11083927.0, "step": 681 }, { "entropy": 0.5473333150148392, "epoch": 2.5463049579045838, "grad_norm": 0.03279516100883484, "learning_rate": 0.0002, "loss": 0.5383847951889038, "mean_token_accuracy": 0.7836183458566666, "num_tokens": 11100675.0, "step": 682 }, { "entropy": 0.5422270894050598, "epoch": 2.550046772684752, "grad_norm": 0.039768971502780914, "learning_rate": 0.0002, "loss": 0.543849766254425, "mean_token_accuracy": 0.7796186804771423, "num_tokens": 11116748.0, "step": 683 }, { "entropy": 0.5384610444307327, "epoch": 2.55378858746492, "grad_norm": 0.037385329604148865, "learning_rate": 0.0002, "loss": 0.54084312915802, "mean_token_accuracy": 0.7830232381820679, "num_tokens": 11133051.0, "step": 684 }, { "entropy": 0.5261296629905701, "epoch": 2.5575304022450887, "grad_norm": 0.039306074380874634, "learning_rate": 0.0002, "loss": 0.531363844871521, "mean_token_accuracy": 0.785315752029419, "num_tokens": 11149362.0, "step": 685 }, { "entropy": 0.5491520762443542, "epoch": 2.561272217025257, "grad_norm": 0.04143069311976433, "learning_rate": 0.0002, "loss": 0.5444177389144897, "mean_token_accuracy": 0.7807131856679916, "num_tokens": 11165746.0, "step": 686 }, { "entropy": 0.53914874792099, "epoch": 2.5650140318054255, "grad_norm": 0.03408098593354225, "learning_rate": 0.0002, "loss": 0.5294961929321289, "mean_token_accuracy": 0.7870545238256454, "num_tokens": 11182138.0, "step": 687 }, { "entropy": 0.5346123576164246, "epoch": 2.568755846585594, "grad_norm": 0.04301401227712631, "learning_rate": 0.0002, "loss": 0.5353041887283325, "mean_token_accuracy": 0.784915953874588, "num_tokens": 11198330.0, "step": 688 }, { "entropy": 0.5318583697080612, "epoch": 2.5724976613657624, "grad_norm": 0.04231448844075203, "learning_rate": 0.0002, "loss": 0.5399123430252075, "mean_token_accuracy": 0.7802146077156067, "num_tokens": 11214613.0, "step": 689 }, { "entropy": 0.5280211716890335, "epoch": 2.576239476145931, "grad_norm": 0.04549930989742279, "learning_rate": 0.0002, "loss": 0.5432953238487244, "mean_token_accuracy": 0.777678519487381, "num_tokens": 11230987.0, "step": 690 }, { "entropy": 0.5567438304424286, "epoch": 2.5799812909260993, "grad_norm": 0.03926197439432144, "learning_rate": 0.0002, "loss": 0.5588645339012146, "mean_token_accuracy": 0.7713411450386047, "num_tokens": 11247503.0, "step": 691 }, { "entropy": 0.542352095246315, "epoch": 2.5837231057062677, "grad_norm": 0.035485655069351196, "learning_rate": 0.0002, "loss": 0.5354308485984802, "mean_token_accuracy": 0.7822972387075424, "num_tokens": 11263949.0, "step": 692 }, { "entropy": 0.5373577028512955, "epoch": 2.587464920486436, "grad_norm": 0.04045470058917999, "learning_rate": 0.0002, "loss": 0.524779200553894, "mean_token_accuracy": 0.785191684961319, "num_tokens": 11280345.0, "step": 693 }, { "entropy": 0.5388759598135948, "epoch": 2.591206735266604, "grad_norm": 0.03759071230888367, "learning_rate": 0.0002, "loss": 0.5312530994415283, "mean_token_accuracy": 0.7809051126241684, "num_tokens": 11296587.0, "step": 694 }, { "entropy": 0.5210207849740982, "epoch": 2.5949485500467726, "grad_norm": 0.03664049878716469, "learning_rate": 0.0002, "loss": 0.526019275188446, "mean_token_accuracy": 0.7867360413074493, "num_tokens": 11313101.0, "step": 695 }, { "entropy": 0.5182994976639748, "epoch": 2.598690364826941, "grad_norm": 0.05368485301733017, "learning_rate": 0.0002, "loss": 0.5354053974151611, "mean_token_accuracy": 0.7826909422874451, "num_tokens": 11329367.0, "step": 696 }, { "entropy": 0.5452821850776672, "epoch": 2.6024321796071095, "grad_norm": 0.04641703888773918, "learning_rate": 0.0002, "loss": 0.5546022057533264, "mean_token_accuracy": 0.7768976241350174, "num_tokens": 11345547.0, "step": 697 }, { "entropy": 0.5391091257333755, "epoch": 2.606173994387278, "grad_norm": 0.04271511733531952, "learning_rate": 0.0002, "loss": 0.541153073310852, "mean_token_accuracy": 0.7804041355848312, "num_tokens": 11361574.0, "step": 698 }, { "entropy": 0.5462173670530319, "epoch": 2.6099158091674464, "grad_norm": 0.03939999267458916, "learning_rate": 0.0002, "loss": 0.5369886159896851, "mean_token_accuracy": 0.7804831266403198, "num_tokens": 11377812.0, "step": 699 }, { "entropy": 0.5714237540960312, "epoch": 2.6136576239476144, "grad_norm": 0.03745459020137787, "learning_rate": 0.0002, "loss": 0.5620177984237671, "mean_token_accuracy": 0.7719487398862839, "num_tokens": 11394403.0, "step": 700 }, { "entropy": 0.5377793908119202, "epoch": 2.617399438727783, "grad_norm": 0.03732477128505707, "learning_rate": 0.0002, "loss": 0.5375291109085083, "mean_token_accuracy": 0.7813573330640793, "num_tokens": 11410706.0, "step": 701 }, { "entropy": 0.5385070145130157, "epoch": 2.6211412535079512, "grad_norm": 0.04680998623371124, "learning_rate": 0.0002, "loss": 0.5455629825592041, "mean_token_accuracy": 0.776125431060791, "num_tokens": 11427143.0, "step": 702 }, { "entropy": 0.5411592125892639, "epoch": 2.6248830682881197, "grad_norm": 0.037070900201797485, "learning_rate": 0.0002, "loss": 0.5470774173736572, "mean_token_accuracy": 0.7772253155708313, "num_tokens": 11443536.0, "step": 703 }, { "entropy": 0.5268983989953995, "epoch": 2.628624883068288, "grad_norm": 0.04107747972011566, "learning_rate": 0.0002, "loss": 0.5320890545845032, "mean_token_accuracy": 0.7819889187812805, "num_tokens": 11459635.0, "step": 704 }, { "entropy": 0.5278744846582413, "epoch": 2.6323666978484566, "grad_norm": 0.03608566150069237, "learning_rate": 0.0002, "loss": 0.5288647413253784, "mean_token_accuracy": 0.7842333018779755, "num_tokens": 11476037.0, "step": 705 }, { "entropy": 0.5504002794623375, "epoch": 2.636108512628625, "grad_norm": 0.041055019944906235, "learning_rate": 0.0002, "loss": 0.5523802638053894, "mean_token_accuracy": 0.7737344652414322, "num_tokens": 11492344.0, "step": 706 }, { "entropy": 0.541622132062912, "epoch": 2.6398503274087934, "grad_norm": 0.03790360316634178, "learning_rate": 0.0002, "loss": 0.5410860776901245, "mean_token_accuracy": 0.7775967717170715, "num_tokens": 11508715.0, "step": 707 }, { "entropy": 0.53721022605896, "epoch": 2.643592142188962, "grad_norm": 0.048964016139507294, "learning_rate": 0.0002, "loss": 0.5369323492050171, "mean_token_accuracy": 0.7816558331251144, "num_tokens": 11525153.0, "step": 708 }, { "entropy": 0.5321754217147827, "epoch": 2.64733395696913, "grad_norm": 0.048466358333826065, "learning_rate": 0.0002, "loss": 0.5365191698074341, "mean_token_accuracy": 0.7804320156574249, "num_tokens": 11541270.0, "step": 709 }, { "entropy": 0.5573434978723526, "epoch": 2.6510757717492983, "grad_norm": 0.045038264244794846, "learning_rate": 0.0002, "loss": 0.5563772320747375, "mean_token_accuracy": 0.7737798243761063, "num_tokens": 11557694.0, "step": 710 }, { "entropy": 0.5524247735738754, "epoch": 2.6548175865294668, "grad_norm": 0.038673996925354004, "learning_rate": 0.0002, "loss": 0.5518113970756531, "mean_token_accuracy": 0.7768261432647705, "num_tokens": 11574308.0, "step": 711 }, { "entropy": 0.5358691960573196, "epoch": 2.658559401309635, "grad_norm": 0.03978041559457779, "learning_rate": 0.0002, "loss": 0.5338990688323975, "mean_token_accuracy": 0.7842043936252594, "num_tokens": 11590586.0, "step": 712 }, { "entropy": 0.5332267433404922, "epoch": 2.6623012160898036, "grad_norm": 0.03574821725487709, "learning_rate": 0.0002, "loss": 0.5405697822570801, "mean_token_accuracy": 0.7808981388807297, "num_tokens": 11606867.0, "step": 713 }, { "entropy": 0.5254797339439392, "epoch": 2.666043030869972, "grad_norm": 0.040162764489650726, "learning_rate": 0.0002, "loss": 0.5316233038902283, "mean_token_accuracy": 0.7839036136865616, "num_tokens": 11623321.0, "step": 714 }, { "entropy": 0.5194612145423889, "epoch": 2.66978484565014, "grad_norm": 0.0536888912320137, "learning_rate": 0.0002, "loss": 0.5308873057365417, "mean_token_accuracy": 0.7844232022762299, "num_tokens": 11639616.0, "step": 715 }, { "entropy": 0.5397140085697174, "epoch": 2.6735266604303085, "grad_norm": 0.034708283841609955, "learning_rate": 0.0002, "loss": 0.5418391227722168, "mean_token_accuracy": 0.7771459370851517, "num_tokens": 11655924.0, "step": 716 }, { "entropy": 0.5523687899112701, "epoch": 2.677268475210477, "grad_norm": 0.03549209609627724, "learning_rate": 0.0002, "loss": 0.5451604127883911, "mean_token_accuracy": 0.7780284285545349, "num_tokens": 11672448.0, "step": 717 }, { "entropy": 0.5573620796203613, "epoch": 2.6810102899906454, "grad_norm": 0.03517598658800125, "learning_rate": 0.0002, "loss": 0.5482261180877686, "mean_token_accuracy": 0.7732254415750504, "num_tokens": 11688985.0, "step": 718 }, { "entropy": 0.5521951466798782, "epoch": 2.684752104770814, "grad_norm": 0.03560207411646843, "learning_rate": 0.0002, "loss": 0.5395568609237671, "mean_token_accuracy": 0.7822758108377457, "num_tokens": 11705608.0, "step": 719 }, { "entropy": 0.5614044666290283, "epoch": 2.6884939195509823, "grad_norm": 0.04236432537436485, "learning_rate": 0.0002, "loss": 0.5560280084609985, "mean_token_accuracy": 0.7751108258962631, "num_tokens": 11721966.0, "step": 720 }, { "entropy": 0.5331545174121857, "epoch": 2.6922357343311507, "grad_norm": 0.03850049898028374, "learning_rate": 0.0002, "loss": 0.5384074449539185, "mean_token_accuracy": 0.7795211225748062, "num_tokens": 11738118.0, "step": 721 }, { "entropy": 0.5322619527578354, "epoch": 2.695977549111319, "grad_norm": 0.04224139824509621, "learning_rate": 0.0002, "loss": 0.5480450987815857, "mean_token_accuracy": 0.7758100479841232, "num_tokens": 11754350.0, "step": 722 }, { "entropy": 0.53462353348732, "epoch": 2.6997193638914876, "grad_norm": 0.03856648504734039, "learning_rate": 0.0002, "loss": 0.5420241355895996, "mean_token_accuracy": 0.7794053852558136, "num_tokens": 11770468.0, "step": 723 }, { "entropy": 0.5529629737138748, "epoch": 2.703461178671656, "grad_norm": 0.03881238028407097, "learning_rate": 0.0002, "loss": 0.5515606999397278, "mean_token_accuracy": 0.777623638510704, "num_tokens": 11786891.0, "step": 724 }, { "entropy": 0.5365050584077835, "epoch": 2.707202993451824, "grad_norm": 0.030840173363685608, "learning_rate": 0.0002, "loss": 0.5374981760978699, "mean_token_accuracy": 0.7810342460870743, "num_tokens": 11803202.0, "step": 725 }, { "entropy": 0.5490061491727829, "epoch": 2.7109448082319925, "grad_norm": 0.03318411111831665, "learning_rate": 0.0002, "loss": 0.5416221022605896, "mean_token_accuracy": 0.7810187339782715, "num_tokens": 11819633.0, "step": 726 }, { "entropy": 0.5287661999464035, "epoch": 2.714686623012161, "grad_norm": 0.033848777413368225, "learning_rate": 0.0002, "loss": 0.5285395383834839, "mean_token_accuracy": 0.785768449306488, "num_tokens": 11835951.0, "step": 727 }, { "entropy": 0.5228402391076088, "epoch": 2.7184284377923293, "grad_norm": 0.037826504558324814, "learning_rate": 0.0002, "loss": 0.5267374515533447, "mean_token_accuracy": 0.7853263914585114, "num_tokens": 11852172.0, "step": 728 }, { "entropy": 0.5451251715421677, "epoch": 2.722170252572498, "grad_norm": 0.03935185819864273, "learning_rate": 0.0002, "loss": 0.5431327223777771, "mean_token_accuracy": 0.7800047546625137, "num_tokens": 11868665.0, "step": 729 }, { "entropy": 0.5370529890060425, "epoch": 2.725912067352666, "grad_norm": 0.040121592581272125, "learning_rate": 0.0002, "loss": 0.5504775643348694, "mean_token_accuracy": 0.7777304202318192, "num_tokens": 11884782.0, "step": 730 }, { "entropy": 0.5336936116218567, "epoch": 2.729653882132834, "grad_norm": 0.046451181173324585, "learning_rate": 0.0002, "loss": 0.5401822328567505, "mean_token_accuracy": 0.7810492217540741, "num_tokens": 11900966.0, "step": 731 }, { "entropy": 0.5421666949987411, "epoch": 2.7333956969130027, "grad_norm": 0.03996991366147995, "learning_rate": 0.0002, "loss": 0.5425142645835876, "mean_token_accuracy": 0.7759256362915039, "num_tokens": 11917559.0, "step": 732 }, { "entropy": 0.5548020005226135, "epoch": 2.737137511693171, "grad_norm": 0.039705440402030945, "learning_rate": 0.0002, "loss": 0.5471047163009644, "mean_token_accuracy": 0.7788440138101578, "num_tokens": 11933791.0, "step": 733 }, { "entropy": 0.5459768623113632, "epoch": 2.7408793264733395, "grad_norm": 0.044193848967552185, "learning_rate": 0.0002, "loss": 0.5505638718605042, "mean_token_accuracy": 0.7753681987524033, "num_tokens": 11949788.0, "step": 734 }, { "entropy": 0.5197051167488098, "epoch": 2.744621141253508, "grad_norm": 0.04006953909993172, "learning_rate": 0.0002, "loss": 0.5269069671630859, "mean_token_accuracy": 0.7862325310707092, "num_tokens": 11965909.0, "step": 735 }, { "entropy": 0.5576485246419907, "epoch": 2.7483629560336764, "grad_norm": 0.03677723556756973, "learning_rate": 0.0002, "loss": 0.5640283823013306, "mean_token_accuracy": 0.7697114050388336, "num_tokens": 11982388.0, "step": 736 }, { "entropy": 0.5379237085580826, "epoch": 2.752104770813845, "grad_norm": 0.03523614630103111, "learning_rate": 0.0002, "loss": 0.5367957353591919, "mean_token_accuracy": 0.7794550508260727, "num_tokens": 11998589.0, "step": 737 }, { "entropy": 0.5357311069965363, "epoch": 2.7558465855940133, "grad_norm": 0.03599949926137924, "learning_rate": 0.0002, "loss": 0.5299929976463318, "mean_token_accuracy": 0.784047082066536, "num_tokens": 12014892.0, "step": 738 }, { "entropy": 0.5434677302837372, "epoch": 2.7595884003741817, "grad_norm": 0.03983872011303902, "learning_rate": 0.0002, "loss": 0.537936806678772, "mean_token_accuracy": 0.7832438200712204, "num_tokens": 12030925.0, "step": 739 }, { "entropy": 0.5472689718008041, "epoch": 2.7633302151543497, "grad_norm": 0.03287053480744362, "learning_rate": 0.0002, "loss": 0.5477735996246338, "mean_token_accuracy": 0.7759514302015305, "num_tokens": 12047168.0, "step": 740 }, { "entropy": 0.5356525778770447, "epoch": 2.767072029934518, "grad_norm": 0.03699969872832298, "learning_rate": 0.0002, "loss": 0.5401504635810852, "mean_token_accuracy": 0.7797222137451172, "num_tokens": 12063859.0, "step": 741 }, { "entropy": 0.522783175110817, "epoch": 2.7708138447146866, "grad_norm": 0.04751390591263771, "learning_rate": 0.0002, "loss": 0.5334336161613464, "mean_token_accuracy": 0.785777673125267, "num_tokens": 12080092.0, "step": 742 }, { "entropy": 0.5513002574443817, "epoch": 2.774555659494855, "grad_norm": 0.04812496900558472, "learning_rate": 0.0002, "loss": 0.5542380809783936, "mean_token_accuracy": 0.7760861963033676, "num_tokens": 12096314.0, "step": 743 }, { "entropy": 0.5436785966157913, "epoch": 2.7782974742750235, "grad_norm": 0.03719832003116608, "learning_rate": 0.0002, "loss": 0.5375255346298218, "mean_token_accuracy": 0.7817601412534714, "num_tokens": 12112385.0, "step": 744 }, { "entropy": 0.5392426550388336, "epoch": 2.782039289055192, "grad_norm": 0.036235589534044266, "learning_rate": 0.0002, "loss": 0.5315327644348145, "mean_token_accuracy": 0.783770278096199, "num_tokens": 12128749.0, "step": 745 }, { "entropy": 0.5371043086051941, "epoch": 2.78578110383536, "grad_norm": 0.04002665355801582, "learning_rate": 0.0002, "loss": 0.5355648994445801, "mean_token_accuracy": 0.7825834453105927, "num_tokens": 12145069.0, "step": 746 }, { "entropy": 0.5386099964380264, "epoch": 2.7895229186155284, "grad_norm": 0.0372973270714283, "learning_rate": 0.0002, "loss": 0.5449782609939575, "mean_token_accuracy": 0.7772656977176666, "num_tokens": 12161381.0, "step": 747 }, { "entropy": 0.49367938190698624, "epoch": 2.793264733395697, "grad_norm": 0.042931776493787766, "learning_rate": 0.0002, "loss": 0.49913763999938965, "mean_token_accuracy": 0.795563668012619, "num_tokens": 12177674.0, "step": 748 }, { "entropy": 0.5577136278152466, "epoch": 2.7970065481758652, "grad_norm": 0.03464139625430107, "learning_rate": 0.0002, "loss": 0.563284158706665, "mean_token_accuracy": 0.7712576389312744, "num_tokens": 12194200.0, "step": 749 }, { "entropy": 0.5163726359605789, "epoch": 2.8007483629560337, "grad_norm": 0.043806042522192, "learning_rate": 0.0002, "loss": 0.5230565071105957, "mean_token_accuracy": 0.7878428548574448, "num_tokens": 12210649.0, "step": 750 }, { "entropy": 0.5474874824285507, "epoch": 2.804490177736202, "grad_norm": 0.03748728707432747, "learning_rate": 0.0002, "loss": 0.5494849681854248, "mean_token_accuracy": 0.777756467461586, "num_tokens": 12226971.0, "step": 751 }, { "entropy": 0.5351517200469971, "epoch": 2.8082319925163706, "grad_norm": 0.045867737382650375, "learning_rate": 0.0002, "loss": 0.539400577545166, "mean_token_accuracy": 0.7824986279010773, "num_tokens": 12243263.0, "step": 752 }, { "entropy": 0.5563795119524002, "epoch": 2.811973807296539, "grad_norm": 0.03956415131688118, "learning_rate": 0.0002, "loss": 0.5521907210350037, "mean_token_accuracy": 0.7774280607700348, "num_tokens": 12259518.0, "step": 753 }, { "entropy": 0.56000916659832, "epoch": 2.8157156220767074, "grad_norm": 0.038831926882267, "learning_rate": 0.0002, "loss": 0.5568797588348389, "mean_token_accuracy": 0.7727828919887543, "num_tokens": 12276004.0, "step": 754 }, { "entropy": 0.5431783348321915, "epoch": 2.8194574368568754, "grad_norm": 0.04772892966866493, "learning_rate": 0.0002, "loss": 0.5474101901054382, "mean_token_accuracy": 0.7786049693822861, "num_tokens": 12292373.0, "step": 755 }, { "entropy": 0.5570650398731232, "epoch": 2.823199251637044, "grad_norm": 0.03613967075943947, "learning_rate": 0.0002, "loss": 0.5507438778877258, "mean_token_accuracy": 0.7748661190271378, "num_tokens": 12309010.0, "step": 756 }, { "entropy": 0.5275236368179321, "epoch": 2.8269410664172123, "grad_norm": 0.04989537596702576, "learning_rate": 0.0002, "loss": 0.5294247269630432, "mean_token_accuracy": 0.7852834612131119, "num_tokens": 12325334.0, "step": 757 }, { "entropy": 0.5346865504980087, "epoch": 2.8306828811973808, "grad_norm": 0.03763777017593384, "learning_rate": 0.0002, "loss": 0.536054790019989, "mean_token_accuracy": 0.7806695699691772, "num_tokens": 12341700.0, "step": 758 }, { "entropy": 0.5543745011091232, "epoch": 2.834424695977549, "grad_norm": 0.045101623982191086, "learning_rate": 0.0002, "loss": 0.5560649037361145, "mean_token_accuracy": 0.7761011719703674, "num_tokens": 12358184.0, "step": 759 }, { "entropy": 0.5500671565532684, "epoch": 2.8381665107577176, "grad_norm": 0.042196061462163925, "learning_rate": 0.0002, "loss": 0.5577619075775146, "mean_token_accuracy": 0.7745834439992905, "num_tokens": 12374727.0, "step": 760 }, { "entropy": 0.5422725081443787, "epoch": 2.8419083255378856, "grad_norm": 0.037925731390714645, "learning_rate": 0.0002, "loss": 0.5486158132553101, "mean_token_accuracy": 0.7735314965248108, "num_tokens": 12391054.0, "step": 761 }, { "entropy": 0.5447213500738144, "epoch": 2.845650140318054, "grad_norm": 0.039297524839639664, "learning_rate": 0.0002, "loss": 0.5439249277114868, "mean_token_accuracy": 0.7782430201768875, "num_tokens": 12407240.0, "step": 762 }, { "entropy": 0.5623101443052292, "epoch": 2.8493919550982225, "grad_norm": 0.03727223724126816, "learning_rate": 0.0002, "loss": 0.5529690980911255, "mean_token_accuracy": 0.7783486098051071, "num_tokens": 12423651.0, "step": 763 }, { "entropy": 0.5487337410449982, "epoch": 2.853133769878391, "grad_norm": 0.041605204343795776, "learning_rate": 0.0002, "loss": 0.5483216047286987, "mean_token_accuracy": 0.7777005285024643, "num_tokens": 12439865.0, "step": 764 }, { "entropy": 0.5403908789157867, "epoch": 2.8568755846585594, "grad_norm": 0.042009830474853516, "learning_rate": 0.0002, "loss": 0.5446419715881348, "mean_token_accuracy": 0.7782749831676483, "num_tokens": 12456283.0, "step": 765 }, { "entropy": 0.5366557389497757, "epoch": 2.860617399438728, "grad_norm": 0.03936697915196419, "learning_rate": 0.0002, "loss": 0.542513370513916, "mean_token_accuracy": 0.7779817581176758, "num_tokens": 12472812.0, "step": 766 }, { "entropy": 0.5674513280391693, "epoch": 2.8643592142188963, "grad_norm": 0.050604403018951416, "learning_rate": 0.0002, "loss": 0.5683247447013855, "mean_token_accuracy": 0.7713179588317871, "num_tokens": 12489449.0, "step": 767 }, { "entropy": 0.5182722359895706, "epoch": 2.8681010289990647, "grad_norm": 0.036767635494470596, "learning_rate": 0.0002, "loss": 0.5209700465202332, "mean_token_accuracy": 0.7906691282987595, "num_tokens": 12505831.0, "step": 768 }, { "entropy": 0.5400542318820953, "epoch": 2.871842843779233, "grad_norm": 0.0423893928527832, "learning_rate": 0.0002, "loss": 0.5363757014274597, "mean_token_accuracy": 0.7849675416946411, "num_tokens": 12522266.0, "step": 769 }, { "entropy": 0.5384216755628586, "epoch": 2.875584658559401, "grad_norm": 0.03423478081822395, "learning_rate": 0.0002, "loss": 0.539215087890625, "mean_token_accuracy": 0.7803387194871902, "num_tokens": 12538797.0, "step": 770 }, { "entropy": 0.5494250059127808, "epoch": 2.8793264733395696, "grad_norm": 0.03864506259560585, "learning_rate": 0.0002, "loss": 0.5536534786224365, "mean_token_accuracy": 0.7749843001365662, "num_tokens": 12554840.0, "step": 771 }, { "entropy": 0.5292802900075912, "epoch": 2.883068288119738, "grad_norm": 0.03668517246842384, "learning_rate": 0.0002, "loss": 0.531915009021759, "mean_token_accuracy": 0.7857315242290497, "num_tokens": 12571194.0, "step": 772 }, { "entropy": 0.5444097071886063, "epoch": 2.8868101028999065, "grad_norm": 0.03593030199408531, "learning_rate": 0.0002, "loss": 0.5466811060905457, "mean_token_accuracy": 0.7787587195634842, "num_tokens": 12587746.0, "step": 773 }, { "entropy": 0.5468859821557999, "epoch": 2.890551917680075, "grad_norm": 0.042690832167863846, "learning_rate": 0.0002, "loss": 0.5463913679122925, "mean_token_accuracy": 0.779534175992012, "num_tokens": 12604183.0, "step": 774 }, { "entropy": 0.5508814752101898, "epoch": 2.8942937324602434, "grad_norm": 0.04205498844385147, "learning_rate": 0.0002, "loss": 0.5481387376785278, "mean_token_accuracy": 0.776447519659996, "num_tokens": 12620732.0, "step": 775 }, { "entropy": 0.5370959490537643, "epoch": 2.8980355472404113, "grad_norm": 0.04001722112298012, "learning_rate": 0.0002, "loss": 0.5357980728149414, "mean_token_accuracy": 0.7828036099672318, "num_tokens": 12636847.0, "step": 776 }, { "entropy": 0.5336840003728867, "epoch": 2.90177736202058, "grad_norm": 0.04124586284160614, "learning_rate": 0.0002, "loss": 0.5350784063339233, "mean_token_accuracy": 0.7848693281412125, "num_tokens": 12653376.0, "step": 777 }, { "entropy": 0.5422462821006775, "epoch": 2.9055191768007482, "grad_norm": 0.04322974756360054, "learning_rate": 0.0002, "loss": 0.5437650680541992, "mean_token_accuracy": 0.7811295241117477, "num_tokens": 12669838.0, "step": 778 }, { "entropy": 0.5301967561244965, "epoch": 2.9092609915809167, "grad_norm": 0.040180791169404984, "learning_rate": 0.0002, "loss": 0.5413050055503845, "mean_token_accuracy": 0.7816843837499619, "num_tokens": 12686338.0, "step": 779 }, { "entropy": 0.5494007170200348, "epoch": 2.913002806361085, "grad_norm": 0.03727947920560837, "learning_rate": 0.0002, "loss": 0.551271915435791, "mean_token_accuracy": 0.7756839543581009, "num_tokens": 12702976.0, "step": 780 }, { "entropy": 0.557955801486969, "epoch": 2.9167446211412535, "grad_norm": 0.03641374036669731, "learning_rate": 0.0002, "loss": 0.5591468214988708, "mean_token_accuracy": 0.7722364217042923, "num_tokens": 12719319.0, "step": 781 }, { "entropy": 0.5437477082014084, "epoch": 2.920486435921422, "grad_norm": 0.03696129098534584, "learning_rate": 0.0002, "loss": 0.539549708366394, "mean_token_accuracy": 0.7802012413740158, "num_tokens": 12735691.0, "step": 782 }, { "entropy": 0.5459663569927216, "epoch": 2.9242282507015904, "grad_norm": 0.03394176810979843, "learning_rate": 0.0002, "loss": 0.5432969331741333, "mean_token_accuracy": 0.7803399115800858, "num_tokens": 12752042.0, "step": 783 }, { "entropy": 0.540153980255127, "epoch": 2.927970065481759, "grad_norm": 0.04523579031229019, "learning_rate": 0.0002, "loss": 0.5408099889755249, "mean_token_accuracy": 0.7797322869300842, "num_tokens": 12768264.0, "step": 784 }, { "entropy": 0.5484558641910553, "epoch": 2.9317118802619273, "grad_norm": 0.03857382759451866, "learning_rate": 0.0002, "loss": 0.554611325263977, "mean_token_accuracy": 0.7754960358142853, "num_tokens": 12784469.0, "step": 785 }, { "entropy": 0.5373403131961823, "epoch": 2.9354536950420953, "grad_norm": 0.04521877318620682, "learning_rate": 0.0002, "loss": 0.5412609577178955, "mean_token_accuracy": 0.7812603563070297, "num_tokens": 12800714.0, "step": 786 }, { "entropy": 0.5420941710472107, "epoch": 2.9391955098222637, "grad_norm": 0.037385161966085434, "learning_rate": 0.0002, "loss": 0.5446354746818542, "mean_token_accuracy": 0.7783695161342621, "num_tokens": 12816921.0, "step": 787 }, { "entropy": 0.5351656675338745, "epoch": 2.942937324602432, "grad_norm": 0.041876692324876785, "learning_rate": 0.0002, "loss": 0.5376321077346802, "mean_token_accuracy": 0.7807199209928513, "num_tokens": 12833350.0, "step": 788 }, { "entropy": 0.5680812299251556, "epoch": 2.9466791393826006, "grad_norm": 0.040565043687820435, "learning_rate": 0.0002, "loss": 0.5634538531303406, "mean_token_accuracy": 0.7689831405878067, "num_tokens": 12849646.0, "step": 789 }, { "entropy": 0.5357328206300735, "epoch": 2.950420954162769, "grad_norm": 0.04082103073596954, "learning_rate": 0.0002, "loss": 0.5352612733840942, "mean_token_accuracy": 0.7824973464012146, "num_tokens": 12865840.0, "step": 790 }, { "entropy": 0.5547877848148346, "epoch": 2.954162768942937, "grad_norm": 0.04521463066339493, "learning_rate": 0.0002, "loss": 0.5542868971824646, "mean_token_accuracy": 0.7752365618944168, "num_tokens": 12882266.0, "step": 791 }, { "entropy": 0.5343262106180191, "epoch": 2.9579045837231055, "grad_norm": 0.039067838340997696, "learning_rate": 0.0002, "loss": 0.5333149433135986, "mean_token_accuracy": 0.783295214176178, "num_tokens": 12898704.0, "step": 792 }, { "entropy": 0.5165642648935318, "epoch": 2.961646398503274, "grad_norm": 0.04161246493458748, "learning_rate": 0.0002, "loss": 0.5219287276268005, "mean_token_accuracy": 0.790781170129776, "num_tokens": 12914733.0, "step": 793 }, { "entropy": 0.5363114923238754, "epoch": 2.9653882132834424, "grad_norm": 0.03739769384264946, "learning_rate": 0.0002, "loss": 0.5376189351081848, "mean_token_accuracy": 0.7812457233667374, "num_tokens": 12931042.0, "step": 794 }, { "entropy": 0.5318800210952759, "epoch": 2.969130028063611, "grad_norm": 0.047191355377435684, "learning_rate": 0.0002, "loss": 0.5360404849052429, "mean_token_accuracy": 0.7821078598499298, "num_tokens": 12947442.0, "step": 795 }, { "entropy": 0.5284593552350998, "epoch": 2.9728718428437793, "grad_norm": 0.03614107519388199, "learning_rate": 0.0002, "loss": 0.5247491598129272, "mean_token_accuracy": 0.7871349304914474, "num_tokens": 12963611.0, "step": 796 }, { "entropy": 0.5265946090221405, "epoch": 2.9766136576239477, "grad_norm": 0.04248823598027229, "learning_rate": 0.0002, "loss": 0.53187096118927, "mean_token_accuracy": 0.78339883685112, "num_tokens": 12979965.0, "step": 797 }, { "entropy": 0.5121617913246155, "epoch": 2.980355472404116, "grad_norm": 0.042288120836019516, "learning_rate": 0.0002, "loss": 0.5201407670974731, "mean_token_accuracy": 0.7870761901140213, "num_tokens": 12996017.0, "step": 798 }, { "entropy": 0.5229809135198593, "epoch": 2.9840972871842846, "grad_norm": 0.040804166346788406, "learning_rate": 0.0002, "loss": 0.5307119488716125, "mean_token_accuracy": 0.7831887602806091, "num_tokens": 13012277.0, "step": 799 }, { "entropy": 0.5386293828487396, "epoch": 2.987839101964453, "grad_norm": 0.04149458184838295, "learning_rate": 0.0002, "loss": 0.5341092348098755, "mean_token_accuracy": 0.783338725566864, "num_tokens": 13028574.0, "step": 800 }, { "entropy": 0.5334920659661293, "epoch": 2.991580916744621, "grad_norm": 0.04282135143876076, "learning_rate": 0.0002, "loss": 0.531876802444458, "mean_token_accuracy": 0.7834694683551788, "num_tokens": 13044829.0, "step": 801 }, { "entropy": 0.5673989802598953, "epoch": 2.9953227315247895, "grad_norm": 0.03961246460676193, "learning_rate": 0.0002, "loss": 0.5678121447563171, "mean_token_accuracy": 0.7711912542581558, "num_tokens": 13061330.0, "step": 802 }, { "entropy": 0.531833752989769, "epoch": 2.999064546304958, "grad_norm": 0.03890501707792282, "learning_rate": 0.0002, "loss": 0.5328924655914307, "mean_token_accuracy": 0.7814844250679016, "num_tokens": 13077343.0, "step": 803 }, { "entropy": 0.5831514596939087, "epoch": 3.0, "grad_norm": 0.06591155380010605, "learning_rate": 0.0002, "loss": 0.5364804267883301, "mean_token_accuracy": 0.7760791182518005, "num_tokens": 13078463.0, "step": 804 } ], "logging_steps": 1, "max_steps": 804, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.218543283492356e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }