| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 922, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.021719948415122516, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00019869848156182213, |
| "loss": 6.0531, |
| "mean_token_accuracy": 0.37881034857127815, |
| "num_tokens": 78573.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.04343989683024503, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00019725234996384672, |
| "loss": 5.9903, |
| "mean_token_accuracy": 0.3887314551044255, |
| "num_tokens": 157496.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.06515984524536754, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00019580621836587129, |
| "loss": 5.9929, |
| "mean_token_accuracy": 0.3850487937917933, |
| "num_tokens": 236512.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.08687979366049006, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00019436008676789588, |
| "loss": 5.986, |
| "mean_token_accuracy": 0.3865805763518438, |
| "num_tokens": 315608.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.10859974207561257, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00019291395516992047, |
| "loss": 6.0318, |
| "mean_token_accuracy": 0.38490854618139564, |
| "num_tokens": 394791.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.13031969049073508, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00019146782357194506, |
| "loss": 5.9875, |
| "mean_token_accuracy": 0.38546840590424836, |
| "num_tokens": 473456.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.1520396389058576, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00019002169197396965, |
| "loss": 5.9773, |
| "mean_token_accuracy": 0.3837947838823311, |
| "num_tokens": 552247.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.17375958732098012, |
| "grad_norm": 0.0, |
| "learning_rate": 0.0001885755603759942, |
| "loss": 6.0197, |
| "mean_token_accuracy": 0.38375892234034836, |
| "num_tokens": 631096.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.19547953573610263, |
| "grad_norm": 0.0, |
| "learning_rate": 0.0001871294287780188, |
| "loss": 6.0014, |
| "mean_token_accuracy": 0.38519658900331705, |
| "num_tokens": 710275.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.21719948415122514, |
| "grad_norm": 0.0, |
| "learning_rate": 0.0001856832971800434, |
| "loss": 6.0217, |
| "mean_token_accuracy": 0.3826989881345071, |
| "num_tokens": 789174.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.23891943256634765, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00018423716558206799, |
| "loss": 6.0294, |
| "mean_token_accuracy": 0.3842650496866554, |
| "num_tokens": 867922.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.26063938098147016, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00018279103398409255, |
| "loss": 5.9548, |
| "mean_token_accuracy": 0.3875111517496407, |
| "num_tokens": 947073.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.2823593293965927, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00018134490238611714, |
| "loss": 6.0071, |
| "mean_token_accuracy": 0.38376112943515184, |
| "num_tokens": 1026211.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.3040792778117152, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00017989877078814173, |
| "loss": 6.0105, |
| "mean_token_accuracy": 0.38463024909142407, |
| "num_tokens": 1105147.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.3257992262268377, |
| "grad_norm": 0.0, |
| "learning_rate": 0.0001784526391901663, |
| "loss": 6.0449, |
| "mean_token_accuracy": 0.38241248747799544, |
| "num_tokens": 1183837.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.34751917464196025, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00017700650759219091, |
| "loss": 5.9946, |
| "mean_token_accuracy": 0.3821917780907825, |
| "num_tokens": 1262676.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.36923912305708273, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00017556037599421548, |
| "loss": 6.0049, |
| "mean_token_accuracy": 0.3842250820598565, |
| "num_tokens": 1341759.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.39095907147220527, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00017411424439624007, |
| "loss": 5.9736, |
| "mean_token_accuracy": 0.3861917880363762, |
| "num_tokens": 1420987.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.41267901988732775, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00017266811279826466, |
| "loss": 6.0431, |
| "mean_token_accuracy": 0.38146835477091373, |
| "num_tokens": 1499903.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.4343989683024503, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00017122198120028922, |
| "loss": 6.0069, |
| "mean_token_accuracy": 0.3861038032686338, |
| "num_tokens": 1578880.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.4561189167175728, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00016977584960231381, |
| "loss": 6.0091, |
| "mean_token_accuracy": 0.3845075036631897, |
| "num_tokens": 1657709.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.4778388651326953, |
| "grad_norm": 0.0, |
| "learning_rate": 0.0001683297180043384, |
| "loss": 6.0029, |
| "mean_token_accuracy": 0.38288453239947556, |
| "num_tokens": 1736620.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.49955881354781784, |
| "grad_norm": 0.0, |
| "learning_rate": 0.000166883586406363, |
| "loss": 5.9933, |
| "mean_token_accuracy": 0.38531269936356694, |
| "num_tokens": 1815478.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.5212787619629403, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00016543745480838756, |
| "loss": 6.0334, |
| "mean_token_accuracy": 0.3833939728909172, |
| "num_tokens": 1894256.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.5429987103780628, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00016399132321041215, |
| "loss": 6.0484, |
| "mean_token_accuracy": 0.3841994108865038, |
| "num_tokens": 1972997.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.5647186587931854, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00016254519161243674, |
| "loss": 6.0285, |
| "mean_token_accuracy": 0.38443261398933826, |
| "num_tokens": 2051890.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.5864386072083079, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00016109906001446133, |
| "loss": 5.983, |
| "mean_token_accuracy": 0.385862308065407, |
| "num_tokens": 2130746.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.6081585556234304, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00015965292841648592, |
| "loss": 6.0046, |
| "mean_token_accuracy": 0.3859730801777914, |
| "num_tokens": 2209603.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.6298785040385529, |
| "grad_norm": 0.0, |
| "learning_rate": 0.0001582067968185105, |
| "loss": 5.9887, |
| "mean_token_accuracy": 0.38499277490191164, |
| "num_tokens": 2288743.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.6515984524536754, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00015676066522053508, |
| "loss": 5.9788, |
| "mean_token_accuracy": 0.3864025830756873, |
| "num_tokens": 2367648.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.6733184008687979, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00015531453362255964, |
| "loss": 6.0162, |
| "mean_token_accuracy": 0.38221894631860776, |
| "num_tokens": 2446256.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.6950383492839205, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00015386840202458423, |
| "loss": 6.0017, |
| "mean_token_accuracy": 0.38538019855041056, |
| "num_tokens": 2525065.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.716758297699043, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00015242227042660883, |
| "loss": 5.9569, |
| "mean_token_accuracy": 0.3852926092222333, |
| "num_tokens": 2604183.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.7384782461141655, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00015097613882863342, |
| "loss": 6.0072, |
| "mean_token_accuracy": 0.38616998645011336, |
| "num_tokens": 2683084.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.7601981945292879, |
| "grad_norm": 0.0, |
| "learning_rate": 0.000149530007230658, |
| "loss": 6.0428, |
| "mean_token_accuracy": 0.3831095602363348, |
| "num_tokens": 2761824.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.7819181429444105, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00014808387563268257, |
| "loss": 5.9781, |
| "mean_token_accuracy": 0.3861894382047467, |
| "num_tokens": 2840709.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.803638091359533, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00014663774403470716, |
| "loss": 5.9898, |
| "mean_token_accuracy": 0.3846066597965546, |
| "num_tokens": 2919385.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.8253580397746555, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00014519161243673173, |
| "loss": 6.0098, |
| "mean_token_accuracy": 0.38357423364650456, |
| "num_tokens": 2998308.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.8470779881897781, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00014374548083875634, |
| "loss": 6.0349, |
| "mean_token_accuracy": 0.3809299209038727, |
| "num_tokens": 3076962.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.8687979366049006, |
| "grad_norm": 0.0, |
| "learning_rate": 0.0001422993492407809, |
| "loss": 6.0294, |
| "mean_token_accuracy": 0.38217092433478683, |
| "num_tokens": 3155983.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.890517885020023, |
| "grad_norm": 0.0, |
| "learning_rate": 0.0001408532176428055, |
| "loss": 5.9728, |
| "mean_token_accuracy": 0.3870704318396747, |
| "num_tokens": 3234960.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.9122378334351456, |
| "grad_norm": 0.0, |
| "learning_rate": 0.0001394070860448301, |
| "loss": 6.0541, |
| "mean_token_accuracy": 0.38209462116938087, |
| "num_tokens": 3313797.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.9339577818502681, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00013796095444685465, |
| "loss": 5.9871, |
| "mean_token_accuracy": 0.38401242352556436, |
| "num_tokens": 3392880.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.9556777302653906, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00013651482284887927, |
| "loss": 5.9901, |
| "mean_token_accuracy": 0.38375150066567587, |
| "num_tokens": 3471597.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.9773976786805131, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00013506869125090384, |
| "loss": 5.9797, |
| "mean_token_accuracy": 0.38728573790285736, |
| "num_tokens": 3550697.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.9991176270956357, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00013362255965292843, |
| "loss": 5.9971, |
| "mean_token_accuracy": 0.38420643559657036, |
| "num_tokens": 3629481.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.0195479535736103, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00013217642805495302, |
| "loss": 6.0223, |
| "mean_token_accuracy": 0.3833757105509308, |
| "num_tokens": 3703746.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.0412679019887328, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00013073029645697758, |
| "loss": 6.0146, |
| "mean_token_accuracy": 0.38417146790307016, |
| "num_tokens": 3782783.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.0629878504038552, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00012928416485900217, |
| "loss": 5.9713, |
| "mean_token_accuracy": 0.3863212176831439, |
| "num_tokens": 3862048.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.0847077988189777, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00012783803326102676, |
| "loss": 5.9944, |
| "mean_token_accuracy": 0.38522618702845646, |
| "num_tokens": 3941220.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.1064277472341004, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00012639190166305135, |
| "loss": 5.9895, |
| "mean_token_accuracy": 0.38548907284857703, |
| "num_tokens": 4020089.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.128147695649223, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00012494577006507592, |
| "loss": 5.9901, |
| "mean_token_accuracy": 0.3841517207212746, |
| "num_tokens": 4098981.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.1498676440643454, |
| "grad_norm": 0.0, |
| "learning_rate": 0.0001234996384671005, |
| "loss": 6.0125, |
| "mean_token_accuracy": 0.38393973018974065, |
| "num_tokens": 4177983.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.1715875924794679, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00012205350686912509, |
| "loss": 5.9999, |
| "mean_token_accuracy": 0.3848019931232557, |
| "num_tokens": 4256818.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.1933075408945903, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00012060737527114966, |
| "loss": 6.0146, |
| "mean_token_accuracy": 0.3834400905063376, |
| "num_tokens": 4335754.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.2150274893097128, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00011916124367317427, |
| "loss": 5.9832, |
| "mean_token_accuracy": 0.38411646473687144, |
| "num_tokens": 4414480.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.2367474377248353, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00011771511207519885, |
| "loss": 6.0393, |
| "mean_token_accuracy": 0.3809544609161094, |
| "num_tokens": 4493321.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.2584673861399578, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00011626898047722344, |
| "loss": 5.9989, |
| "mean_token_accuracy": 0.38565440035890786, |
| "num_tokens": 4572297.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.2801873345550805, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00011482284887924801, |
| "loss": 5.9983, |
| "mean_token_accuracy": 0.3863610655302182, |
| "num_tokens": 4651247.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.301907282970203, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00011337671728127259, |
| "loss": 6.0116, |
| "mean_token_accuracy": 0.3831069741398096, |
| "num_tokens": 4730012.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.3236272313853255, |
| "grad_norm": 0.0, |
| "learning_rate": 0.0001119305856832972, |
| "loss": 6.0195, |
| "mean_token_accuracy": 0.3857766560278833, |
| "num_tokens": 4809013.0, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.345347179800448, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00011048445408532177, |
| "loss": 6.0108, |
| "mean_token_accuracy": 0.3835394912166521, |
| "num_tokens": 4888027.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.3670671282155704, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00010903832248734635, |
| "loss": 5.9558, |
| "mean_token_accuracy": 0.3883327366434969, |
| "num_tokens": 4966859.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.3887870766306931, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00010759219088937094, |
| "loss": 6.0088, |
| "mean_token_accuracy": 0.385754154715687, |
| "num_tokens": 5045905.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.4105070250458156, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00010614605929139552, |
| "loss": 6.0236, |
| "mean_token_accuracy": 0.3829167880234309, |
| "num_tokens": 5124413.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.432226973460938, |
| "grad_norm": 0.0, |
| "learning_rate": 0.0001046999276934201, |
| "loss": 6.0154, |
| "mean_token_accuracy": 0.3834054367849603, |
| "num_tokens": 5203273.0, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.4539469218760606, |
| "grad_norm": 0.0, |
| "learning_rate": 0.0001032537960954447, |
| "loss": 6.005, |
| "mean_token_accuracy": 0.3848568681394681, |
| "num_tokens": 5281984.0, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.475666870291183, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00010180766449746928, |
| "loss": 5.9805, |
| "mean_token_accuracy": 0.3862292483681813, |
| "num_tokens": 5360846.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.4973868187063055, |
| "grad_norm": 0.0, |
| "learning_rate": 0.00010036153289949386, |
| "loss": 5.9871, |
| "mean_token_accuracy": 0.3851381171494722, |
| "num_tokens": 5439758.0, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.519106767121428, |
| "grad_norm": 0.0, |
| "learning_rate": 9.891540130151843e-05, |
| "loss": 6.0254, |
| "mean_token_accuracy": 0.3831110308994539, |
| "num_tokens": 5518768.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.5408267155365505, |
| "grad_norm": 0.0, |
| "learning_rate": 9.746926970354303e-05, |
| "loss": 6.0339, |
| "mean_token_accuracy": 0.3808896674308926, |
| "num_tokens": 5597672.0, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.562546663951673, |
| "grad_norm": 0.0, |
| "learning_rate": 9.602313810556762e-05, |
| "loss": 5.994, |
| "mean_token_accuracy": 0.3865658549708314, |
| "num_tokens": 5676393.0, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.5842666123667957, |
| "grad_norm": 0.0, |
| "learning_rate": 9.45770065075922e-05, |
| "loss": 6.0026, |
| "mean_token_accuracy": 0.38676879862323404, |
| "num_tokens": 5755416.0, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.6059865607819181, |
| "grad_norm": 0.0, |
| "learning_rate": 9.313087490961678e-05, |
| "loss": 6.0175, |
| "mean_token_accuracy": 0.38389978763880206, |
| "num_tokens": 5834322.0, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.6277065091970406, |
| "grad_norm": 0.0, |
| "learning_rate": 9.168474331164136e-05, |
| "loss": 6.0224, |
| "mean_token_accuracy": 0.3840769061935134, |
| "num_tokens": 5912925.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.6494264576121633, |
| "grad_norm": 0.0, |
| "learning_rate": 9.023861171366594e-05, |
| "loss": 5.9683, |
| "mean_token_accuracy": 0.38740592543035746, |
| "num_tokens": 5991913.0, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.6711464060272858, |
| "grad_norm": 0.0, |
| "learning_rate": 8.879248011569053e-05, |
| "loss": 6.0136, |
| "mean_token_accuracy": 0.3827170055825263, |
| "num_tokens": 6070546.0, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.6928663544424083, |
| "grad_norm": 0.0, |
| "learning_rate": 8.734634851771512e-05, |
| "loss": 5.9688, |
| "mean_token_accuracy": 0.3871995336958207, |
| "num_tokens": 6149623.0, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.7145863028575308, |
| "grad_norm": 0.0, |
| "learning_rate": 8.59002169197397e-05, |
| "loss": 6.0639, |
| "mean_token_accuracy": 0.3800558194401674, |
| "num_tokens": 6228396.0, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.7363062512726533, |
| "grad_norm": 0.0, |
| "learning_rate": 8.445408532176429e-05, |
| "loss": 5.9778, |
| "mean_token_accuracy": 0.3854194077430293, |
| "num_tokens": 6307375.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.7580261996877757, |
| "grad_norm": 0.0, |
| "learning_rate": 8.300795372378887e-05, |
| "loss": 6.0294, |
| "mean_token_accuracy": 0.38350161886774004, |
| "num_tokens": 6386536.0, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.7797461481028982, |
| "grad_norm": 0.0, |
| "learning_rate": 8.156182212581345e-05, |
| "loss": 5.9736, |
| "mean_token_accuracy": 0.38414000715129076, |
| "num_tokens": 6465708.0, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.8014660965180207, |
| "grad_norm": 0.0, |
| "learning_rate": 8.011569052783804e-05, |
| "loss": 5.9847, |
| "mean_token_accuracy": 0.3877883184002712, |
| "num_tokens": 6544782.0, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.8231860449331432, |
| "grad_norm": 0.0, |
| "learning_rate": 7.866955892986261e-05, |
| "loss": 6.043, |
| "mean_token_accuracy": 0.38251175949117167, |
| "num_tokens": 6623497.0, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.8449059933482657, |
| "grad_norm": 0.0, |
| "learning_rate": 7.72234273318872e-05, |
| "loss": 6.0297, |
| "mean_token_accuracy": 0.38357739897910503, |
| "num_tokens": 6702159.0, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.8666259417633884, |
| "grad_norm": 0.0, |
| "learning_rate": 7.57772957339118e-05, |
| "loss": 6.0335, |
| "mean_token_accuracy": 0.38290644709486515, |
| "num_tokens": 6780886.0, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.8883458901785108, |
| "grad_norm": 0.0, |
| "learning_rate": 7.433116413593637e-05, |
| "loss": 6.0273, |
| "mean_token_accuracy": 0.38307004772359504, |
| "num_tokens": 6859533.0, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.9100658385936333, |
| "grad_norm": 0.0, |
| "learning_rate": 7.288503253796096e-05, |
| "loss": 5.9827, |
| "mean_token_accuracy": 0.385389854805544, |
| "num_tokens": 6938490.0, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.9317857870087558, |
| "grad_norm": 0.0, |
| "learning_rate": 7.143890093998554e-05, |
| "loss": 6.0212, |
| "mean_token_accuracy": 0.382453205762431, |
| "num_tokens": 7017297.0, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.9535057354238785, |
| "grad_norm": 0.0, |
| "learning_rate": 6.999276934201012e-05, |
| "loss": 6.0043, |
| "mean_token_accuracy": 0.38360455771908164, |
| "num_tokens": 7096419.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.975225683839001, |
| "grad_norm": 0.0, |
| "learning_rate": 6.854663774403471e-05, |
| "loss": 5.985, |
| "mean_token_accuracy": 0.38632277538999915, |
| "num_tokens": 7175650.0, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.9969456322541235, |
| "grad_norm": 0.0, |
| "learning_rate": 6.71005061460593e-05, |
| "loss": 6.009, |
| "mean_token_accuracy": 0.38311313565354793, |
| "num_tokens": 7254378.0, |
| "step": 920 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1383, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.107018416918856e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|