{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2857142857142857, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 2.379179924726486, "epoch": 0.002857142857142857, "grad_norm": 0.248466357588768, "learning_rate": 5.142857142857143e-06, "loss": 2.1112184524536133, "mean_token_accuracy": 0.521992451697588, "num_tokens": 490221.0, "step": 10 }, { "entropy": 2.275108999013901, "epoch": 0.005714285714285714, "grad_norm": 0.16482312977313995, "learning_rate": 1.0857142857142858e-05, "loss": 2.0270994186401365, "mean_token_accuracy": 0.5319361314177513, "num_tokens": 980654.0, "step": 20 }, { "entropy": 2.1243143171072005, "epoch": 0.008571428571428572, "grad_norm": 0.14482936263084412, "learning_rate": 1.657142857142857e-05, "loss": 1.947333526611328, "mean_token_accuracy": 0.5449633926153183, "num_tokens": 1471155.0, "step": 30 }, { "entropy": 2.0759591698646545, "epoch": 0.011428571428571429, "grad_norm": 0.13348551094532013, "learning_rate": 2.2285714285714287e-05, "loss": 1.8653924942016602, "mean_token_accuracy": 0.5575317889451981, "num_tokens": 1961803.0, "step": 40 }, { "entropy": 2.002236345410347, "epoch": 0.014285714285714285, "grad_norm": 0.12573261559009552, "learning_rate": 2.8000000000000003e-05, "loss": 1.8171972274780273, "mean_token_accuracy": 0.5663816690444946, "num_tokens": 2451485.0, "step": 50 }, { "entropy": 1.9370298832654953, "epoch": 0.017142857142857144, "grad_norm": 0.12988947331905365, "learning_rate": 3.3714285714285716e-05, "loss": 1.7517566680908203, "mean_token_accuracy": 0.5784050762653351, "num_tokens": 2940490.0, "step": 60 }, { "entropy": 1.9337659150362014, "epoch": 0.02, "grad_norm": 0.1357634961605072, "learning_rate": 3.942857142857143e-05, "loss": 1.7462808609008789, "mean_token_accuracy": 0.578274418413639, "num_tokens": 3430891.0, "step": 70 }, { "entropy": 1.853649941086769, "epoch": 0.022857142857142857, "grad_norm": 0.14244548976421356, "learning_rate": 4.514285714285714e-05, "loss": 1.675507164001465, "mean_token_accuracy": 0.5902435079216957, "num_tokens": 3921132.0, "step": 80 }, { "entropy": 1.8723271518945694, "epoch": 0.025714285714285714, "grad_norm": 0.1481512039899826, "learning_rate": 5.085714285714286e-05, "loss": 1.7015415191650392, "mean_token_accuracy": 0.5883241996169091, "num_tokens": 4412262.0, "step": 90 }, { "entropy": 1.8528674066066741, "epoch": 0.02857142857142857, "grad_norm": 0.15897610783576965, "learning_rate": 5.6571428571428574e-05, "loss": 1.6771810531616211, "mean_token_accuracy": 0.5901058956980705, "num_tokens": 4902976.0, "step": 100 }, { "entropy": 1.8283027499914168, "epoch": 0.03142857142857143, "grad_norm": 0.15304645895957947, "learning_rate": 6.22857142857143e-05, "loss": 1.6524412155151367, "mean_token_accuracy": 0.5927876815199852, "num_tokens": 5393172.0, "step": 110 }, { "entropy": 1.8105649381875992, "epoch": 0.03428571428571429, "grad_norm": 0.1467505544424057, "learning_rate": 6.800000000000001e-05, "loss": 1.6439619064331055, "mean_token_accuracy": 0.5953322932124138, "num_tokens": 5883816.0, "step": 120 }, { "entropy": 1.8044609040021897, "epoch": 0.037142857142857144, "grad_norm": 0.14967863261699677, "learning_rate": 7.371428571428572e-05, "loss": 1.6340011596679687, "mean_token_accuracy": 0.5963180974125862, "num_tokens": 6373950.0, "step": 130 }, { "entropy": 1.7858079195022583, "epoch": 0.04, "grad_norm": 0.16043871641159058, "learning_rate": 7.942857142857143e-05, "loss": 1.6211162567138673, "mean_token_accuracy": 0.5996491178870201, "num_tokens": 6864677.0, "step": 140 }, { "entropy": 1.7554930090904235, "epoch": 0.04285714285714286, "grad_norm": 0.17714525759220123, "learning_rate": 8.514285714285714e-05, "loss": 1.5993239402770996, "mean_token_accuracy": 0.6028685554862022, "num_tokens": 7354704.0, "step": 150 }, { "entropy": 1.7631021201610566, "epoch": 0.045714285714285714, "grad_norm": 0.1533629149198532, "learning_rate": 9.085714285714286e-05, "loss": 1.6020626068115233, "mean_token_accuracy": 0.6036920040845871, "num_tokens": 7844516.0, "step": 160 }, { "entropy": 1.7572406560182572, "epoch": 0.04857142857142857, "grad_norm": 0.13726936280727386, "learning_rate": 9.657142857142858e-05, "loss": 1.6002904891967773, "mean_token_accuracy": 0.6049441516399383, "num_tokens": 8335216.0, "step": 170 }, { "entropy": 1.7278330773115158, "epoch": 0.05142857142857143, "grad_norm": 0.1435551792383194, "learning_rate": 9.999964291145356e-05, "loss": 1.5778950691223144, "mean_token_accuracy": 0.6091306760907174, "num_tokens": 8825337.0, "step": 180 }, { "entropy": 1.7155232012271882, "epoch": 0.054285714285714284, "grad_norm": 0.12991751730442047, "learning_rate": 9.99956257238817e-05, "loss": 1.5530893325805664, "mean_token_accuracy": 0.6109549820423126, "num_tokens": 9315436.0, "step": 190 }, { "entropy": 1.7033455431461335, "epoch": 0.05714285714285714, "grad_norm": 0.12225023657083511, "learning_rate": 9.998714534787104e-05, "loss": 1.5497390747070312, "mean_token_accuracy": 0.6124480590224266, "num_tokens": 9806074.0, "step": 200 }, { "entropy": 1.704897478222847, "epoch": 0.06, "grad_norm": 0.13096989691257477, "learning_rate": 9.997420254047814e-05, "loss": 1.5562244415283204, "mean_token_accuracy": 0.6124653443694115, "num_tokens": 10296321.0, "step": 210 }, { "entropy": 1.687455916404724, "epoch": 0.06285714285714286, "grad_norm": 0.12014620006084442, "learning_rate": 9.995679845712782e-05, "loss": 1.5376283645629882, "mean_token_accuracy": 0.612839862704277, "num_tokens": 10786325.0, "step": 220 }, { "entropy": 1.6887607038021089, "epoch": 0.06571428571428571, "grad_norm": 0.1178581714630127, "learning_rate": 9.99349346515101e-05, "loss": 1.5371188163757323, "mean_token_accuracy": 0.6150592401623726, "num_tokens": 11276505.0, "step": 230 }, { "entropy": 1.6583371013402939, "epoch": 0.06857142857142857, "grad_norm": 0.12618421018123627, "learning_rate": 9.990861307544141e-05, "loss": 1.512394905090332, "mean_token_accuracy": 0.6184149146080017, "num_tokens": 11767090.0, "step": 240 }, { "entropy": 1.6633832812309266, "epoch": 0.07142857142857142, "grad_norm": 0.1347818523645401, "learning_rate": 9.987783607869043e-05, "loss": 1.5205227851867675, "mean_token_accuracy": 0.6188683018088341, "num_tokens": 12256334.0, "step": 250 }, { "entropy": 1.6662334859371186, "epoch": 0.07428571428571429, "grad_norm": 0.12107276916503906, "learning_rate": 9.984260640876821e-05, "loss": 1.5210847854614258, "mean_token_accuracy": 0.6181401565670968, "num_tokens": 12746242.0, "step": 260 }, { "entropy": 1.669426190853119, "epoch": 0.07714285714285714, "grad_norm": 0.13238908350467682, "learning_rate": 9.980292721068303e-05, "loss": 1.5255352020263673, "mean_token_accuracy": 0.6176877498626709, "num_tokens": 13236893.0, "step": 270 }, { "entropy": 1.6480093061923982, "epoch": 0.08, "grad_norm": 0.119101881980896, "learning_rate": 9.975880202665955e-05, "loss": 1.5022310256958007, "mean_token_accuracy": 0.6195305570960045, "num_tokens": 13727057.0, "step": 280 }, { "entropy": 1.6660549819469452, "epoch": 0.08285714285714285, "grad_norm": 0.12110637873411179, "learning_rate": 9.971023479582257e-05, "loss": 1.519071388244629, "mean_token_accuracy": 0.6170884057879448, "num_tokens": 14217233.0, "step": 290 }, { "entropy": 1.65420058965683, "epoch": 0.08571428571428572, "grad_norm": 0.1198863536119461, "learning_rate": 9.965722985384551e-05, "loss": 1.5074204444885253, "mean_token_accuracy": 0.6207710400223732, "num_tokens": 14706536.0, "step": 300 }, { "entropy": 1.653561717271805, "epoch": 0.08857142857142856, "grad_norm": 0.11553779989480972, "learning_rate": 9.959979193256321e-05, "loss": 1.503816509246826, "mean_token_accuracy": 0.620441184937954, "num_tokens": 15196956.0, "step": 310 }, { "entropy": 1.6494231253862381, "epoch": 0.09142857142857143, "grad_norm": 0.12458748370409012, "learning_rate": 9.953792615954956e-05, "loss": 1.5046157836914062, "mean_token_accuracy": 0.6211020454764367, "num_tokens": 15687053.0, "step": 320 }, { "entropy": 1.6357726126909256, "epoch": 0.09428571428571429, "grad_norm": 0.1191626638174057, "learning_rate": 9.94716380576598e-05, "loss": 1.4901004791259767, "mean_token_accuracy": 0.623300464451313, "num_tokens": 16177605.0, "step": 330 }, { "entropy": 1.6310274928808213, "epoch": 0.09714285714285714, "grad_norm": 0.1165379136800766, "learning_rate": 9.940093354453745e-05, "loss": 1.4859237670898438, "mean_token_accuracy": 0.6221878513693809, "num_tokens": 16668078.0, "step": 340 }, { "entropy": 1.6244249641895294, "epoch": 0.1, "grad_norm": 0.11486439406871796, "learning_rate": 9.932581893208602e-05, "loss": 1.4822301864624023, "mean_token_accuracy": 0.6249096423387528, "num_tokens": 17158239.0, "step": 350 }, { "entropy": 1.6261575251817704, "epoch": 0.10285714285714286, "grad_norm": 0.1269659847021103, "learning_rate": 9.924630092590552e-05, "loss": 1.4802236557006836, "mean_token_accuracy": 0.6244122132658958, "num_tokens": 17648337.0, "step": 360 }, { "entropy": 1.6262814104557037, "epoch": 0.10571428571428572, "grad_norm": 0.12204760313034058, "learning_rate": 9.916238662469393e-05, "loss": 1.4841039657592774, "mean_token_accuracy": 0.62349953353405, "num_tokens": 18139347.0, "step": 370 }, { "entropy": 1.6084025710821153, "epoch": 0.10857142857142857, "grad_norm": 0.11636750400066376, "learning_rate": 9.907408351961338e-05, "loss": 1.4650284767150878, "mean_token_accuracy": 0.6257219180464745, "num_tokens": 18629841.0, "step": 380 }, { "entropy": 1.6091126441955566, "epoch": 0.11142857142857143, "grad_norm": 0.12340579181909561, "learning_rate": 9.89813994936215e-05, "loss": 1.4654415130615235, "mean_token_accuracy": 0.627759400010109, "num_tokens": 19119573.0, "step": 390 }, { "entropy": 1.620647069811821, "epoch": 0.11428571428571428, "grad_norm": 0.1143694818019867, "learning_rate": 9.888434282076758e-05, "loss": 1.4784412384033203, "mean_token_accuracy": 0.6251529678702354, "num_tokens": 19609952.0, "step": 400 }, { "entropy": 1.6150473326444625, "epoch": 0.11714285714285715, "grad_norm": 0.11208155006170273, "learning_rate": 9.878292216545406e-05, "loss": 1.4707134246826172, "mean_token_accuracy": 0.6262766167521476, "num_tokens": 20100484.0, "step": 410 }, { "entropy": 1.6126654744148254, "epoch": 0.12, "grad_norm": 0.11389093846082687, "learning_rate": 9.867714658166294e-05, "loss": 1.4687725067138673, "mean_token_accuracy": 0.6262835577130318, "num_tokens": 20591061.0, "step": 420 }, { "entropy": 1.6087138205766678, "epoch": 0.12285714285714286, "grad_norm": 0.11683399975299835, "learning_rate": 9.856702551214758e-05, "loss": 1.4735491752624512, "mean_token_accuracy": 0.6256266921758652, "num_tokens": 21081884.0, "step": 430 }, { "entropy": 1.6129542291164398, "epoch": 0.12571428571428572, "grad_norm": 0.1123637780547142, "learning_rate": 9.84525687875897e-05, "loss": 1.4666884422302247, "mean_token_accuracy": 0.6269908130168915, "num_tokens": 21572985.0, "step": 440 }, { "entropy": 1.5876862674951553, "epoch": 0.12857142857142856, "grad_norm": 0.11302825808525085, "learning_rate": 9.833378662572183e-05, "loss": 1.44884033203125, "mean_token_accuracy": 0.6304112330079079, "num_tokens": 22062380.0, "step": 450 }, { "entropy": 1.6038869142532348, "epoch": 0.13142857142857142, "grad_norm": 0.11883237957954407, "learning_rate": 9.821068963041507e-05, "loss": 1.4640923500061036, "mean_token_accuracy": 0.6268730536103249, "num_tokens": 22553395.0, "step": 460 }, { "entropy": 1.616351565718651, "epoch": 0.13428571428571429, "grad_norm": 0.1148705706000328, "learning_rate": 9.808328879073251e-05, "loss": 1.4773920059204102, "mean_token_accuracy": 0.6249542534351349, "num_tokens": 23042805.0, "step": 470 }, { "entropy": 1.584571686387062, "epoch": 0.13714285714285715, "grad_norm": 0.12227737158536911, "learning_rate": 9.79515954799483e-05, "loss": 1.4490341186523437, "mean_token_accuracy": 0.6293571904301644, "num_tokens": 23532109.0, "step": 480 }, { "entropy": 1.5974321156740188, "epoch": 0.14, "grad_norm": 0.11681197583675385, "learning_rate": 9.781562145453212e-05, "loss": 1.458102035522461, "mean_token_accuracy": 0.6288253426551819, "num_tokens": 24022475.0, "step": 490 }, { "entropy": 1.5929092824459077, "epoch": 0.14285714285714285, "grad_norm": 0.11527484655380249, "learning_rate": 9.767537885309996e-05, "loss": 1.4527276039123536, "mean_token_accuracy": 0.6284167483448982, "num_tokens": 24512984.0, "step": 500 }, { "entropy": 1.5956292569637298, "epoch": 0.1457142857142857, "grad_norm": 0.11178956925868988, "learning_rate": 9.75308801953302e-05, "loss": 1.4558266639709472, "mean_token_accuracy": 0.628529217839241, "num_tokens": 25003761.0, "step": 510 }, { "entropy": 1.5775458842515946, "epoch": 0.14857142857142858, "grad_norm": 0.11768428981304169, "learning_rate": 9.738213838084621e-05, "loss": 1.4407743453979491, "mean_token_accuracy": 0.6300915256142616, "num_tokens": 25493579.0, "step": 520 }, { "entropy": 1.5778144687414168, "epoch": 0.15142857142857144, "grad_norm": 0.11278413981199265, "learning_rate": 9.722916668806454e-05, "loss": 1.4403908729553223, "mean_token_accuracy": 0.6314086809754371, "num_tokens": 25984228.0, "step": 530 }, { "entropy": 1.570985835790634, "epoch": 0.15428571428571428, "grad_norm": 0.1122731938958168, "learning_rate": 9.707197877300974e-05, "loss": 1.426032543182373, "mean_token_accuracy": 0.6325461342930794, "num_tokens": 26473677.0, "step": 540 }, { "entropy": 1.5796766877174377, "epoch": 0.15714285714285714, "grad_norm": 0.11975421756505966, "learning_rate": 9.691058866809514e-05, "loss": 1.4357484817504882, "mean_token_accuracy": 0.6324031576514244, "num_tokens": 26964375.0, "step": 550 }, { "entropy": 1.5689537853002549, "epoch": 0.16, "grad_norm": 0.11165549606084824, "learning_rate": 9.674501078087018e-05, "loss": 1.4325990676879883, "mean_token_accuracy": 0.6315152242779731, "num_tokens": 27454872.0, "step": 560 }, { "entropy": 1.5615802317857743, "epoch": 0.16285714285714287, "grad_norm": 0.11403413116931915, "learning_rate": 9.657525989273428e-05, "loss": 1.424344253540039, "mean_token_accuracy": 0.6343739420175553, "num_tokens": 27945254.0, "step": 570 }, { "entropy": 1.568941941857338, "epoch": 0.1657142857142857, "grad_norm": 0.11258367449045181, "learning_rate": 9.640135115761721e-05, "loss": 1.4347594261169434, "mean_token_accuracy": 0.6329293712973595, "num_tokens": 28435475.0, "step": 580 }, { "entropy": 1.5669189006090165, "epoch": 0.16857142857142857, "grad_norm": 0.1104324460029602, "learning_rate": 9.622330010062632e-05, "loss": 1.4304702758789063, "mean_token_accuracy": 0.6331642985343933, "num_tokens": 28925167.0, "step": 590 }, { "entropy": 1.574634400010109, "epoch": 0.17142857142857143, "grad_norm": 0.11263342946767807, "learning_rate": 9.604112261666054e-05, "loss": 1.436413288116455, "mean_token_accuracy": 0.6318994089961052, "num_tokens": 29415502.0, "step": 600 }, { "entropy": 1.56950443983078, "epoch": 0.1742857142857143, "grad_norm": 0.11262984573841095, "learning_rate": 9.58548349689915e-05, "loss": 1.424701976776123, "mean_token_accuracy": 0.6350988477468491, "num_tokens": 29906208.0, "step": 610 }, { "entropy": 1.5514621257781982, "epoch": 0.17714285714285713, "grad_norm": 0.11094693094491959, "learning_rate": 9.566445378781162e-05, "loss": 1.4180520057678223, "mean_token_accuracy": 0.6351837337017059, "num_tokens": 30396946.0, "step": 620 }, { "entropy": 1.5565964162349701, "epoch": 0.18, "grad_norm": 0.110066719353199, "learning_rate": 9.546999606874947e-05, "loss": 1.4229997634887694, "mean_token_accuracy": 0.6346639677882194, "num_tokens": 30887292.0, "step": 630 }, { "entropy": 1.5571098238229752, "epoch": 0.18285714285714286, "grad_norm": 0.11187391728162766, "learning_rate": 9.527147917135265e-05, "loss": 1.4242905616760253, "mean_token_accuracy": 0.634743233025074, "num_tokens": 31377366.0, "step": 640 }, { "entropy": 1.5575890451669694, "epoch": 0.18571428571428572, "grad_norm": 0.12017875164747238, "learning_rate": 9.5068920817538e-05, "loss": 1.4210159301757812, "mean_token_accuracy": 0.6346415132284164, "num_tokens": 31867728.0, "step": 650 }, { "entropy": 1.5610258907079697, "epoch": 0.18857142857142858, "grad_norm": 0.11284279823303223, "learning_rate": 9.486233909000957e-05, "loss": 1.4286124229431152, "mean_token_accuracy": 0.633992238342762, "num_tokens": 32357951.0, "step": 660 }, { "entropy": 1.570546704530716, "epoch": 0.19142857142857142, "grad_norm": 0.11221849918365479, "learning_rate": 9.465175243064428e-05, "loss": 1.427887535095215, "mean_token_accuracy": 0.6341994881629944, "num_tokens": 32848000.0, "step": 670 }, { "entropy": 1.5625760078430175, "epoch": 0.19428571428571428, "grad_norm": 0.11504076421260834, "learning_rate": 9.443717963884569e-05, "loss": 1.4254456520080567, "mean_token_accuracy": 0.6330446928739548, "num_tokens": 33338988.0, "step": 680 }, { "entropy": 1.5441237717866898, "epoch": 0.19714285714285715, "grad_norm": 0.11827517300844193, "learning_rate": 9.42186398698657e-05, "loss": 1.409364604949951, "mean_token_accuracy": 0.6384268119931221, "num_tokens": 33829458.0, "step": 690 }, { "entropy": 1.5519202262163163, "epoch": 0.2, "grad_norm": 0.11421947181224823, "learning_rate": 9.399615263309448e-05, "loss": 1.4179234504699707, "mean_token_accuracy": 0.6360488340258599, "num_tokens": 34319687.0, "step": 700 }, { "entropy": 1.5406184524297715, "epoch": 0.20285714285714285, "grad_norm": 0.11246901750564575, "learning_rate": 9.37697377903189e-05, "loss": 1.4044339179992675, "mean_token_accuracy": 0.6360796883702278, "num_tokens": 34810861.0, "step": 710 }, { "entropy": 1.5405683517456055, "epoch": 0.2057142857142857, "grad_norm": 0.10953953862190247, "learning_rate": 9.353941555394946e-05, "loss": 1.4075803756713867, "mean_token_accuracy": 0.6373428180813789, "num_tokens": 35301651.0, "step": 720 }, { "entropy": 1.5303823828697205, "epoch": 0.20857142857142857, "grad_norm": 0.11503418534994125, "learning_rate": 9.330520648521581e-05, "loss": 1.3944478034973145, "mean_token_accuracy": 0.6395626902580261, "num_tokens": 35792660.0, "step": 730 }, { "entropy": 1.546586701273918, "epoch": 0.21142857142857144, "grad_norm": 0.11355750262737274, "learning_rate": 9.306713149233126e-05, "loss": 1.4133389472961426, "mean_token_accuracy": 0.6359729379415512, "num_tokens": 36283021.0, "step": 740 }, { "entropy": 1.5666495472192765, "epoch": 0.21428571428571427, "grad_norm": 0.11290405690670013, "learning_rate": 9.282521182862629e-05, "loss": 1.4316218376159668, "mean_token_accuracy": 0.6324604853987694, "num_tokens": 36773401.0, "step": 750 }, { "entropy": 1.5476279586553574, "epoch": 0.21714285714285714, "grad_norm": 0.11210077255964279, "learning_rate": 9.25794690906512e-05, "loss": 1.4119671821594237, "mean_token_accuracy": 0.6370165795087814, "num_tokens": 37263273.0, "step": 760 }, { "entropy": 1.5502693444490432, "epoch": 0.22, "grad_norm": 0.1148453801870346, "learning_rate": 9.23299252162482e-05, "loss": 1.4147866249084473, "mean_token_accuracy": 0.636229844391346, "num_tokens": 37753396.0, "step": 770 }, { "entropy": 1.5336053550243378, "epoch": 0.22285714285714286, "grad_norm": 0.12296847254037857, "learning_rate": 9.20766024825929e-05, "loss": 1.4014326095581056, "mean_token_accuracy": 0.6389749109745025, "num_tokens": 38243813.0, "step": 780 }, { "entropy": 1.5418373078107834, "epoch": 0.2257142857142857, "grad_norm": 0.11040552705526352, "learning_rate": 9.181952350420568e-05, "loss": 1.408182144165039, "mean_token_accuracy": 0.6375694692134857, "num_tokens": 38734087.0, "step": 790 }, { "entropy": 1.540384903550148, "epoch": 0.22857142857142856, "grad_norm": 0.11459089070558548, "learning_rate": 9.15587112309328e-05, "loss": 1.4076971054077148, "mean_token_accuracy": 0.6367619827389717, "num_tokens": 39223587.0, "step": 800 }, { "entropy": 1.5353645473718642, "epoch": 0.23142857142857143, "grad_norm": 0.10925675183534622, "learning_rate": 9.129418894589765e-05, "loss": 1.4029156684875488, "mean_token_accuracy": 0.6391762167215347, "num_tokens": 39713378.0, "step": 810 }, { "entropy": 1.537180870771408, "epoch": 0.2342857142857143, "grad_norm": 0.11116520315408707, "learning_rate": 9.102598026342224e-05, "loss": 1.4027939796447755, "mean_token_accuracy": 0.6384590819478035, "num_tokens": 40204035.0, "step": 820 }, { "entropy": 1.5364642143249512, "epoch": 0.23714285714285716, "grad_norm": 0.11325648427009583, "learning_rate": 9.075410912691907e-05, "loss": 1.4042522430419921, "mean_token_accuracy": 0.6376483544707299, "num_tokens": 40693853.0, "step": 830 }, { "entropy": 1.5172248750925064, "epoch": 0.24, "grad_norm": 0.11164099723100662, "learning_rate": 9.04785998067537e-05, "loss": 1.38507022857666, "mean_token_accuracy": 0.6411049708724021, "num_tokens": 41184741.0, "step": 840 }, { "entropy": 1.5312542200088501, "epoch": 0.24285714285714285, "grad_norm": 0.11361584812402725, "learning_rate": 9.019947689807812e-05, "loss": 1.401794147491455, "mean_token_accuracy": 0.6388043731451034, "num_tokens": 41675456.0, "step": 850 }, { "entropy": 1.5356624484062196, "epoch": 0.24571428571428572, "grad_norm": 0.1110292598605156, "learning_rate": 8.991676531863508e-05, "loss": 1.398195171356201, "mean_token_accuracy": 0.6395324870944024, "num_tokens": 42165958.0, "step": 860 }, { "entropy": 1.5131745964288712, "epoch": 0.24857142857142858, "grad_norm": 0.11180949956178665, "learning_rate": 8.963049030653357e-05, "loss": 1.386609935760498, "mean_token_accuracy": 0.6415100902318954, "num_tokens": 42655563.0, "step": 870 }, { "entropy": 1.5264964669942855, "epoch": 0.25142857142857145, "grad_norm": 0.11161798238754272, "learning_rate": 8.934067741799588e-05, "loss": 1.3911283493041993, "mean_token_accuracy": 0.6405024617910385, "num_tokens": 43145690.0, "step": 880 }, { "entropy": 1.5238522559404373, "epoch": 0.2542857142857143, "grad_norm": 0.11398132890462875, "learning_rate": 8.90473525250761e-05, "loss": 1.3967710494995118, "mean_token_accuracy": 0.6410051316022873, "num_tokens": 43636186.0, "step": 890 }, { "entropy": 1.5147892564535141, "epoch": 0.2571428571428571, "grad_norm": 0.1122528463602066, "learning_rate": 8.875054181335054e-05, "loss": 1.3884020805358888, "mean_token_accuracy": 0.6408766448497772, "num_tokens": 44126324.0, "step": 900 }, { "entropy": 1.51045164167881, "epoch": 0.26, "grad_norm": 0.11045881360769272, "learning_rate": 8.845027177958e-05, "loss": 1.3785322189331055, "mean_token_accuracy": 0.6415310442447663, "num_tokens": 44616075.0, "step": 910 }, { "entropy": 1.5210172832012177, "epoch": 0.26285714285714284, "grad_norm": 0.10973014682531357, "learning_rate": 8.814656922934445e-05, "loss": 1.3884805679321288, "mean_token_accuracy": 0.6404986321926117, "num_tokens": 45106497.0, "step": 920 }, { "entropy": 1.5296362102031709, "epoch": 0.26571428571428574, "grad_norm": 0.11496967822313309, "learning_rate": 8.783946127465001e-05, "loss": 1.3945645332336425, "mean_token_accuracy": 0.6402764439582824, "num_tokens": 45597040.0, "step": 930 }, { "entropy": 1.5171466410160064, "epoch": 0.26857142857142857, "grad_norm": 0.11463408172130585, "learning_rate": 8.752897533150868e-05, "loss": 1.387949275970459, "mean_token_accuracy": 0.6412890180945396, "num_tokens": 46087347.0, "step": 940 }, { "entropy": 1.5124164909124374, "epoch": 0.2714285714285714, "grad_norm": 0.11042758822441101, "learning_rate": 8.721513911749073e-05, "loss": 1.3821532249450683, "mean_token_accuracy": 0.6414273202419281, "num_tokens": 46577723.0, "step": 950 }, { "entropy": 1.5339836180210114, "epoch": 0.2742857142857143, "grad_norm": 0.115251325070858, "learning_rate": 8.689798064925049e-05, "loss": 1.4023088455200194, "mean_token_accuracy": 0.6390077367424964, "num_tokens": 47067566.0, "step": 960 }, { "entropy": 1.4907908529043197, "epoch": 0.27714285714285714, "grad_norm": 0.10888004302978516, "learning_rate": 8.657752824002512e-05, "loss": 1.3607192993164063, "mean_token_accuracy": 0.6449671313166618, "num_tokens": 47557094.0, "step": 970 }, { "entropy": 1.5133179754018784, "epoch": 0.28, "grad_norm": 0.10961966216564178, "learning_rate": 8.625381049710711e-05, "loss": 1.387069320678711, "mean_token_accuracy": 0.6418288230895997, "num_tokens": 48047288.0, "step": 980 }, { "entropy": 1.5235115170478821, "epoch": 0.28285714285714286, "grad_norm": 0.11490974575281143, "learning_rate": 8.592685631929041e-05, "loss": 1.3894145011901855, "mean_token_accuracy": 0.6398768693208694, "num_tokens": 48537707.0, "step": 990 }, { "entropy": 1.5072008579969407, "epoch": 0.2857142857142857, "grad_norm": 0.11175917088985443, "learning_rate": 8.55966948942907e-05, "loss": 1.3789593696594238, "mean_token_accuracy": 0.6438135176897049, "num_tokens": 49027502.0, "step": 1000 } ], "logging_steps": 10, "max_steps": 3500, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.73130249895936e+17, "train_batch_size": 12, "trial_name": null, "trial_params": null }