| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.9458527682011764, |
| "eval_steps": 30, |
| "global_step": 1200, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.01622388967755019, |
| "grad_norm": 8.625337600708008, |
| "learning_rate": 1.4516129032258066e-05, |
| "loss": 2.6755, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.03244777935510038, |
| "grad_norm": 0.859293520450592, |
| "learning_rate": 3.0645161290322585e-05, |
| "loss": 1.0056, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.04867166903265058, |
| "grad_norm": 0.7715856432914734, |
| "learning_rate": 4.67741935483871e-05, |
| "loss": 0.6365, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.04867166903265058, |
| "eval_loss": 0.5236709713935852, |
| "eval_runtime": 55.0101, |
| "eval_samples_per_second": 4.726, |
| "eval_steps_per_second": 4.726, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.06489555871020077, |
| "grad_norm": 0.5313697457313538, |
| "learning_rate": 6.290322580645161e-05, |
| "loss": 0.4295, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.08111944838775097, |
| "grad_norm": 1.1634626388549805, |
| "learning_rate": 7.903225806451613e-05, |
| "loss": 0.3387, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.09734333806530115, |
| "grad_norm": 0.7073950171470642, |
| "learning_rate": 9.516129032258065e-05, |
| "loss": 0.2797, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.09734333806530115, |
| "eval_loss": 0.2691636085510254, |
| "eval_runtime": 54.4477, |
| "eval_samples_per_second": 4.775, |
| "eval_steps_per_second": 4.775, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.11356722774285134, |
| "grad_norm": 0.5980345606803894, |
| "learning_rate": 0.00011129032258064515, |
| "loss": 0.2525, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.12979111742040153, |
| "grad_norm": 0.5646739602088928, |
| "learning_rate": 0.0001274193548387097, |
| "loss": 0.2208, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.14601500709795173, |
| "grad_norm": 0.33468499779701233, |
| "learning_rate": 0.00014354838709677422, |
| "loss": 0.2018, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.14601500709795173, |
| "eval_loss": 0.1886902004480362, |
| "eval_runtime": 54.4734, |
| "eval_samples_per_second": 4.773, |
| "eval_steps_per_second": 4.773, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.16223889677550193, |
| "grad_norm": 0.3952212929725647, |
| "learning_rate": 0.00015967741935483872, |
| "loss": 0.1841, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.1784627864530521, |
| "grad_norm": 0.34694239497184753, |
| "learning_rate": 0.00017580645161290325, |
| "loss": 0.1731, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.1946866761306023, |
| "grad_norm": 0.3307943046092987, |
| "learning_rate": 0.00019193548387096775, |
| "loss": 0.1736, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.1946866761306023, |
| "eval_loss": 0.1661035567522049, |
| "eval_runtime": 54.4184, |
| "eval_samples_per_second": 4.778, |
| "eval_steps_per_second": 4.778, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.2109105658081525, |
| "grad_norm": 0.2933768928050995, |
| "learning_rate": 0.0001999899871766749, |
| "loss": 0.1616, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.22713445548570269, |
| "grad_norm": 0.3262489438056946, |
| "learning_rate": 0.00019990989662046818, |
| "loss": 0.1619, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.2433583451632529, |
| "grad_norm": 0.18724548816680908, |
| "learning_rate": 0.00019974977965945, |
| "loss": 0.1482, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.2433583451632529, |
| "eval_loss": 0.14517094194889069, |
| "eval_runtime": 54.4554, |
| "eval_samples_per_second": 4.775, |
| "eval_steps_per_second": 4.775, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.25958223484080306, |
| "grad_norm": 0.39443448185920715, |
| "learning_rate": 0.0001995097645450266, |
| "loss": 0.1571, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.2758061245183533, |
| "grad_norm": 0.3215681314468384, |
| "learning_rate": 0.00019919004352588767, |
| "loss": 0.1518, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.29203001419590346, |
| "grad_norm": 0.1917540580034256, |
| "learning_rate": 0.0001987908726940178, |
| "loss": 0.1477, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.29203001419590346, |
| "eval_loss": 0.1380539983510971, |
| "eval_runtime": 54.5257, |
| "eval_samples_per_second": 4.768, |
| "eval_steps_per_second": 4.768, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.30825390387345364, |
| "grad_norm": 0.27752983570098877, |
| "learning_rate": 0.00019831257177957044, |
| "loss": 0.1352, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.32447779355100387, |
| "grad_norm": 0.3052772879600525, |
| "learning_rate": 0.00019775552389476864, |
| "loss": 0.1425, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.34070168322855404, |
| "grad_norm": 0.17911162972450256, |
| "learning_rate": 0.00019712017522703764, |
| "loss": 0.1411, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.34070168322855404, |
| "eval_loss": 0.13320107758045197, |
| "eval_runtime": 54.1867, |
| "eval_samples_per_second": 4.798, |
| "eval_steps_per_second": 4.798, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.3569255729061042, |
| "grad_norm": 0.15296442806720734, |
| "learning_rate": 0.0001964070346816151, |
| "loss": 0.1378, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.37314946258365445, |
| "grad_norm": 0.17346209287643433, |
| "learning_rate": 0.00019561667347392508, |
| "loss": 0.136, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.3893733522612046, |
| "grad_norm": 0.16340570151805878, |
| "learning_rate": 0.00019474972467204297, |
| "loss": 0.1395, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.3893733522612046, |
| "eval_loss": 0.13255620002746582, |
| "eval_runtime": 54.3562, |
| "eval_samples_per_second": 4.783, |
| "eval_steps_per_second": 4.783, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.4055972419387548, |
| "grad_norm": 0.2969210147857666, |
| "learning_rate": 0.0001938068826896166, |
| "loss": 0.1368, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.421821131616305, |
| "grad_norm": 0.20236295461654663, |
| "learning_rate": 0.00019278890272965096, |
| "loss": 0.1273, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.4380450212938552, |
| "grad_norm": 0.2891747057437897, |
| "learning_rate": 0.00019169660017960137, |
| "loss": 0.129, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.4380450212938552, |
| "eval_loss": 0.12807676196098328, |
| "eval_runtime": 54.3522, |
| "eval_samples_per_second": 4.784, |
| "eval_steps_per_second": 4.784, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.45426891097140537, |
| "grad_norm": 0.20254789292812347, |
| "learning_rate": 0.0001905308499582597, |
| "loss": 0.141, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.4704928006489556, |
| "grad_norm": 0.1715897023677826, |
| "learning_rate": 0.00018929258581495685, |
| "loss": 0.1329, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.4867166903265058, |
| "grad_norm": 0.3499029278755188, |
| "learning_rate": 0.00018798279958164295, |
| "loss": 0.1355, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.4867166903265058, |
| "eval_loss": 0.12350723892450333, |
| "eval_runtime": 54.5668, |
| "eval_samples_per_second": 4.765, |
| "eval_steps_per_second": 4.765, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.502940580004056, |
| "grad_norm": 0.15775884687900543, |
| "learning_rate": 0.00018660254037844388, |
| "loss": 0.126, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.5191644696816061, |
| "grad_norm": 0.19402213394641876, |
| "learning_rate": 0.00018515291377333112, |
| "loss": 0.1332, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.5353883593591564, |
| "grad_norm": 0.13237500190734863, |
| "learning_rate": 0.0001836350808965776, |
| "loss": 0.1441, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.5353883593591564, |
| "eval_loss": 0.12080405652523041, |
| "eval_runtime": 54.7379, |
| "eval_samples_per_second": 4.75, |
| "eval_steps_per_second": 4.75, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.5516122490367066, |
| "grad_norm": 0.29119789600372314, |
| "learning_rate": 0.00018205025751070875, |
| "loss": 0.1245, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.5678361387142568, |
| "grad_norm": 0.15898150205612183, |
| "learning_rate": 0.00018039971303669407, |
| "loss": 0.121, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.5840600283918069, |
| "grad_norm": 0.13710354268550873, |
| "learning_rate": 0.000178684769537159, |
| "loss": 0.1346, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.5840600283918069, |
| "eval_loss": 0.12007435411214828, |
| "eval_runtime": 54.7789, |
| "eval_samples_per_second": 4.746, |
| "eval_steps_per_second": 4.746, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.6002839180693571, |
| "grad_norm": 0.12323573976755142, |
| "learning_rate": 0.0001769068006574317, |
| "loss": 0.1311, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.6165078077469073, |
| "grad_norm": 0.14197634160518646, |
| "learning_rate": 0.00017506723052527242, |
| "loss": 0.122, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.6327316974244576, |
| "grad_norm": 0.13415870070457458, |
| "learning_rate": 0.00017316753261016783, |
| "loss": 0.1242, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.6327316974244576, |
| "eval_loss": 0.11815402656793594, |
| "eval_runtime": 54.9041, |
| "eval_samples_per_second": 4.736, |
| "eval_steps_per_second": 4.736, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.6489555871020077, |
| "grad_norm": 0.14391979575157166, |
| "learning_rate": 0.00017120922854310257, |
| "loss": 0.1235, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.6651794767795579, |
| "grad_norm": 0.12567928433418274, |
| "learning_rate": 0.00016919388689775464, |
| "loss": 0.1292, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.6814033664571081, |
| "grad_norm": 0.24638038873672485, |
| "learning_rate": 0.0001671231219340903, |
| "loss": 0.124, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.6814033664571081, |
| "eval_loss": 0.11765418201684952, |
| "eval_runtime": 54.8324, |
| "eval_samples_per_second": 4.742, |
| "eval_steps_per_second": 4.742, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.6976272561346583, |
| "grad_norm": 0.11872219294309616, |
| "learning_rate": 0.00016499859230536466, |
| "loss": 0.1284, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.7138511458122084, |
| "grad_norm": 0.14060816168785095, |
| "learning_rate": 0.00016282199972956425, |
| "loss": 0.1228, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.7300750354897587, |
| "grad_norm": 0.1108463779091835, |
| "learning_rate": 0.00016059508762635482, |
| "loss": 0.1315, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.7300750354897587, |
| "eval_loss": 0.11692557483911514, |
| "eval_runtime": 55.1996, |
| "eval_samples_per_second": 4.71, |
| "eval_steps_per_second": 4.71, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.7462989251673089, |
| "grad_norm": 0.13592351973056793, |
| "learning_rate": 0.00015831963972062733, |
| "loss": 0.1235, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.7625228148448591, |
| "grad_norm": 0.12181869149208069, |
| "learning_rate": 0.00015599747861375955, |
| "loss": 0.12, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.7787467045224092, |
| "grad_norm": 0.37740716338157654, |
| "learning_rate": 0.00015363046432373824, |
| "loss": 0.119, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.7787467045224092, |
| "eval_loss": 0.11858060956001282, |
| "eval_runtime": 55.2002, |
| "eval_samples_per_second": 4.71, |
| "eval_steps_per_second": 4.71, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.7949705941999594, |
| "grad_norm": 0.3189144432544708, |
| "learning_rate": 0.00015122049279531143, |
| "loss": 0.1322, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.8111944838775096, |
| "grad_norm": 0.15118102729320526, |
| "learning_rate": 0.00014876949438136347, |
| "loss": 0.1211, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8274183735550599, |
| "grad_norm": 0.09746048599481583, |
| "learning_rate": 0.0001462794322967299, |
| "loss": 0.124, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.8274183735550599, |
| "eval_loss": 0.11739031225442886, |
| "eval_runtime": 54.9138, |
| "eval_samples_per_second": 4.735, |
| "eval_steps_per_second": 4.735, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.84364226323261, |
| "grad_norm": 0.11682560294866562, |
| "learning_rate": 0.00014375230104569044, |
| "loss": 0.1219, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.8598661529101602, |
| "grad_norm": 0.10706628113985062, |
| "learning_rate": 0.0001411901248243993, |
| "loss": 0.1198, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.8760900425877104, |
| "grad_norm": 0.26870429515838623, |
| "learning_rate": 0.0001385949558995329, |
| "loss": 0.1253, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.8760900425877104, |
| "eval_loss": 0.11531012505292892, |
| "eval_runtime": 54.9326, |
| "eval_samples_per_second": 4.733, |
| "eval_steps_per_second": 4.733, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.8923139322652606, |
| "grad_norm": 0.09110305458307266, |
| "learning_rate": 0.0001359688729644536, |
| "loss": 0.1129, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.9085378219428107, |
| "grad_norm": 0.11204719543457031, |
| "learning_rate": 0.00013331397947420576, |
| "loss": 0.129, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.924761711620361, |
| "grad_norm": 0.15059269964694977, |
| "learning_rate": 0.00013063240196067836, |
| "loss": 0.1272, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.924761711620361, |
| "eval_loss": 0.11525142192840576, |
| "eval_runtime": 55.1098, |
| "eval_samples_per_second": 4.718, |
| "eval_steps_per_second": 4.718, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.9409856012979112, |
| "grad_norm": 0.11085380613803864, |
| "learning_rate": 0.00012792628832928302, |
| "loss": 0.1243, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.9572094909754614, |
| "grad_norm": 0.10796050727367401, |
| "learning_rate": 0.00012519780613851254, |
| "loss": 0.1197, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.9734333806530115, |
| "grad_norm": 0.08801472187042236, |
| "learning_rate": 0.00012244914086375724, |
| "loss": 0.1261, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.9734333806530115, |
| "eval_loss": 0.11538033187389374, |
| "eval_runtime": 54.7547, |
| "eval_samples_per_second": 4.748, |
| "eval_steps_per_second": 4.748, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.9896572703305617, |
| "grad_norm": 0.08395121991634369, |
| "learning_rate": 0.00011968249414677055, |
| "loss": 0.1215, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.0048671669032652, |
| "grad_norm": 0.10106801986694336, |
| "learning_rate": 0.00011690008203218493, |
| "loss": 0.1206, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.0210910565808153, |
| "grad_norm": 0.08509080857038498, |
| "learning_rate": 0.00011410413319249194, |
| "loss": 0.1125, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.0210910565808153, |
| "eval_loss": 0.11403891444206238, |
| "eval_runtime": 54.6547, |
| "eval_samples_per_second": 4.757, |
| "eval_steps_per_second": 4.757, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.0373149462583655, |
| "grad_norm": 0.09074535965919495, |
| "learning_rate": 0.00011129688714290729, |
| "loss": 0.1167, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.0535388359359157, |
| "grad_norm": 0.10673358291387558, |
| "learning_rate": 0.00010848059244755093, |
| "loss": 0.1082, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.0697627256134659, |
| "grad_norm": 0.11839364469051361, |
| "learning_rate": 0.00010565750491837925, |
| "loss": 0.1192, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.0697627256134659, |
| "eval_loss": 0.11325760185718536, |
| "eval_runtime": 54.7532, |
| "eval_samples_per_second": 4.749, |
| "eval_steps_per_second": 4.749, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.085986615291016, |
| "grad_norm": 0.08097315579652786, |
| "learning_rate": 0.00010282988580831183, |
| "loss": 0.1188, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.1022105049685662, |
| "grad_norm": 0.10252730548381805, |
| "learning_rate": 0.0001, |
| "loss": 0.1217, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.1184343946461164, |
| "grad_norm": 0.1087854653596878, |
| "learning_rate": 9.71701141916882e-05, |
| "loss": 0.1185, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.1184343946461164, |
| "eval_loss": 0.11274029314517975, |
| "eval_runtime": 54.6489, |
| "eval_samples_per_second": 4.758, |
| "eval_steps_per_second": 4.758, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.1346582843236666, |
| "grad_norm": 0.09857626259326935, |
| "learning_rate": 9.434249508162076e-05, |
| "loss": 0.1225, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.1508821740012167, |
| "grad_norm": 0.09853193908929825, |
| "learning_rate": 9.151940755244912e-05, |
| "loss": 0.1185, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.167106063678767, |
| "grad_norm": 0.0954899936914444, |
| "learning_rate": 8.870311285709274e-05, |
| "loss": 0.1336, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.167106063678767, |
| "eval_loss": 0.11281841993331909, |
| "eval_runtime": 54.5506, |
| "eval_samples_per_second": 4.766, |
| "eval_steps_per_second": 4.766, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.183329953356317, |
| "grad_norm": 0.0967244803905487, |
| "learning_rate": 8.58958668075081e-05, |
| "loss": 0.118, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.1995538430338675, |
| "grad_norm": 0.09406362473964691, |
| "learning_rate": 8.309991796781511e-05, |
| "loss": 0.1218, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.2157777327114176, |
| "grad_norm": 0.0989699587225914, |
| "learning_rate": 8.031750585322947e-05, |
| "loss": 0.1178, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.2157777327114176, |
| "eval_loss": 0.11203999817371368, |
| "eval_runtime": 54.636, |
| "eval_samples_per_second": 4.759, |
| "eval_steps_per_second": 4.759, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.2320016223889678, |
| "grad_norm": 0.10200025886297226, |
| "learning_rate": 7.755085913624274e-05, |
| "loss": 0.1126, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.248225512066518, |
| "grad_norm": 0.09863891452550888, |
| "learning_rate": 7.48021938614875e-05, |
| "loss": 0.1253, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.2644494017440682, |
| "grad_norm": 0.07942435145378113, |
| "learning_rate": 7.2073711670717e-05, |
| "loss": 0.1188, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.2644494017440682, |
| "eval_loss": 0.11259657144546509, |
| "eval_runtime": 54.5511, |
| "eval_samples_per_second": 4.766, |
| "eval_steps_per_second": 4.766, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.2806732914216183, |
| "grad_norm": 0.13004329800605774, |
| "learning_rate": 6.936759803932167e-05, |
| "loss": 0.1158, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.2968971810991685, |
| "grad_norm": 0.08646363765001297, |
| "learning_rate": 6.668602052579424e-05, |
| "loss": 0.1173, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.3131210707767187, |
| "grad_norm": 0.08868124336004257, |
| "learning_rate": 6.403112703554643e-05, |
| "loss": 0.1065, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.3131210707767187, |
| "eval_loss": 0.1115630567073822, |
| "eval_runtime": 55.0171, |
| "eval_samples_per_second": 4.726, |
| "eval_steps_per_second": 4.726, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.3293449604542689, |
| "grad_norm": 0.08065708726644516, |
| "learning_rate": 6.140504410046712e-05, |
| "loss": 0.1243, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.345568850131819, |
| "grad_norm": 0.08410122245550156, |
| "learning_rate": 5.880987517560075e-05, |
| "loss": 0.114, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.3617927398093692, |
| "grad_norm": 0.0926986113190651, |
| "learning_rate": 5.624769895430961e-05, |
| "loss": 0.1178, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.3617927398093692, |
| "eval_loss": 0.11092434078454971, |
| "eval_runtime": 55.2141, |
| "eval_samples_per_second": 4.709, |
| "eval_steps_per_second": 4.709, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.3780166294869196, |
| "grad_norm": 0.07351404428482056, |
| "learning_rate": 5.372056770327013e-05, |
| "loss": 0.1108, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.3942405191644696, |
| "grad_norm": 0.08835868537425995, |
| "learning_rate": 5.123050561863657e-05, |
| "loss": 0.1127, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.41046440884202, |
| "grad_norm": 0.08012858778238297, |
| "learning_rate": 4.877950720468859e-05, |
| "loss": 0.1118, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.41046440884202, |
| "eval_loss": 0.11060689389705658, |
| "eval_runtime": 55.1249, |
| "eval_samples_per_second": 4.717, |
| "eval_steps_per_second": 4.717, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.4266882985195701, |
| "grad_norm": 0.09063031524419785, |
| "learning_rate": 4.636953567626177e-05, |
| "loss": 0.1195, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.4429121881971203, |
| "grad_norm": 0.07680074125528336, |
| "learning_rate": 4.4002521386240466e-05, |
| "loss": 0.1133, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.4591360778746705, |
| "grad_norm": 0.09211437404155731, |
| "learning_rate": 4.168036027937267e-05, |
| "loss": 0.113, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.4591360778746705, |
| "eval_loss": 0.1109917163848877, |
| "eval_runtime": 54.6876, |
| "eval_samples_per_second": 4.754, |
| "eval_steps_per_second": 4.754, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.4753599675522207, |
| "grad_norm": 0.08036933839321136, |
| "learning_rate": 3.9404912373645185e-05, |
| "loss": 0.1183, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.4915838572297708, |
| "grad_norm": 0.07682196795940399, |
| "learning_rate": 3.717800027043576e-05, |
| "loss": 0.1198, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.507807746907321, |
| "grad_norm": 0.08601044863462448, |
| "learning_rate": 3.500140769463533e-05, |
| "loss": 0.1209, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.507807746907321, |
| "eval_loss": 0.1104922965168953, |
| "eval_runtime": 54.6525, |
| "eval_samples_per_second": 4.757, |
| "eval_steps_per_second": 4.757, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.5240316365848712, |
| "grad_norm": 0.08242760598659515, |
| "learning_rate": 3.287687806590971e-05, |
| "loss": 0.1127, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.5402555262624213, |
| "grad_norm": 0.08664223551750183, |
| "learning_rate": 3.080611310224539e-05, |
| "loss": 0.1193, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.5564794159399717, |
| "grad_norm": 0.08529651165008545, |
| "learning_rate": 2.879077145689746e-05, |
| "loss": 0.1126, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.5564794159399717, |
| "eval_loss": 0.10984186083078384, |
| "eval_runtime": 54.6191, |
| "eval_samples_per_second": 4.76, |
| "eval_steps_per_second": 4.76, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.5727033056175217, |
| "grad_norm": 0.07219547033309937, |
| "learning_rate": 2.6832467389832173e-05, |
| "loss": 0.1182, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.588927195295072, |
| "grad_norm": 0.10246000438928604, |
| "learning_rate": 2.493276947472756e-05, |
| "loss": 0.1109, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.605151084972622, |
| "grad_norm": 0.08859504014253616, |
| "learning_rate": 2.3093199342568318e-05, |
| "loss": 0.1137, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.605151084972622, |
| "eval_loss": 0.10924239456653595, |
| "eval_runtime": 54.6323, |
| "eval_samples_per_second": 4.759, |
| "eval_steps_per_second": 4.759, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.6213749746501724, |
| "grad_norm": 0.08224408328533173, |
| "learning_rate": 2.1315230462840985e-05, |
| "loss": 0.1168, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.6375988643277226, |
| "grad_norm": 0.09429515153169632, |
| "learning_rate": 1.9600286963305957e-05, |
| "loss": 0.114, |
| "step": 1010 |
| }, |
| { |
| "epoch": 1.6538227540052728, |
| "grad_norm": 0.08045922219753265, |
| "learning_rate": 1.7949742489291255e-05, |
| "loss": 0.111, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.6538227540052728, |
| "eval_loss": 0.10893593728542328, |
| "eval_runtime": 54.7365, |
| "eval_samples_per_second": 4.75, |
| "eval_steps_per_second": 4.75, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.670046643682823, |
| "grad_norm": 0.0847032219171524, |
| "learning_rate": 1.6364919103422393e-05, |
| "loss": 0.1142, |
| "step": 1030 |
| }, |
| { |
| "epoch": 1.6862705333603731, |
| "grad_norm": 0.08252795040607452, |
| "learning_rate": 1.4847086226668872e-05, |
| "loss": 0.1126, |
| "step": 1040 |
| }, |
| { |
| "epoch": 1.7024944230379233, |
| "grad_norm": 0.0862165316939354, |
| "learning_rate": 1.339745962155613e-05, |
| "loss": 0.1186, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.7024944230379233, |
| "eval_loss": 0.1090201735496521, |
| "eval_runtime": 54.7713, |
| "eval_samples_per_second": 4.747, |
| "eval_steps_per_second": 4.747, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.7187183127154735, |
| "grad_norm": 0.09021241962909698, |
| "learning_rate": 1.2017200418357078e-05, |
| "loss": 0.1178, |
| "step": 1060 |
| }, |
| { |
| "epoch": 1.7349422023930239, |
| "grad_norm": 0.07867942750453949, |
| "learning_rate": 1.0707414185043163e-05, |
| "loss": 0.1041, |
| "step": 1070 |
| }, |
| { |
| "epoch": 1.7511660920705738, |
| "grad_norm": 0.10106240212917328, |
| "learning_rate": 9.469150041740338e-06, |
| "loss": 0.1111, |
| "step": 1080 |
| }, |
| { |
| "epoch": 1.7511660920705738, |
| "eval_loss": 0.10924577713012695, |
| "eval_runtime": 54.6761, |
| "eval_samples_per_second": 4.755, |
| "eval_steps_per_second": 4.755, |
| "step": 1080 |
| }, |
| { |
| "epoch": 1.7673899817481242, |
| "grad_norm": 0.1259811371564865, |
| "learning_rate": 8.303399820398672e-06, |
| "loss": 0.1118, |
| "step": 1090 |
| }, |
| { |
| "epoch": 1.7836138714256742, |
| "grad_norm": 0.07995796203613281, |
| "learning_rate": 7.211097270349066e-06, |
| "loss": 0.1005, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.7998377611032246, |
| "grad_norm": 0.0811864510178566, |
| "learning_rate": 6.1931173103834115e-06, |
| "loss": 0.115, |
| "step": 1110 |
| }, |
| { |
| "epoch": 1.7998377611032246, |
| "eval_loss": 0.10901561379432678, |
| "eval_runtime": 54.5743, |
| "eval_samples_per_second": 4.764, |
| "eval_steps_per_second": 4.764, |
| "step": 1110 |
| }, |
| { |
| "epoch": 1.8160616507807745, |
| "grad_norm": 0.10539617389440536, |
| "learning_rate": 5.250275327957032e-06, |
| "loss": 0.1129, |
| "step": 1120 |
| }, |
| { |
| "epoch": 1.832285540458325, |
| "grad_norm": 0.15508656203746796, |
| "learning_rate": 4.383326526074916e-06, |
| "loss": 0.1258, |
| "step": 1130 |
| }, |
| { |
| "epoch": 1.848509430135875, |
| "grad_norm": 0.08507981151342392, |
| "learning_rate": 3.592965318384944e-06, |
| "loss": 0.1123, |
| "step": 1140 |
| }, |
| { |
| "epoch": 1.848509430135875, |
| "eval_loss": 0.1087607592344284, |
| "eval_runtime": 54.908, |
| "eval_samples_per_second": 4.735, |
| "eval_steps_per_second": 4.735, |
| "step": 1140 |
| }, |
| { |
| "epoch": 1.8647333198134253, |
| "grad_norm": 0.08362692594528198, |
| "learning_rate": 2.8798247729623806e-06, |
| "loss": 0.1164, |
| "step": 1150 |
| }, |
| { |
| "epoch": 1.8809572094909754, |
| "grad_norm": 0.08825893700122833, |
| "learning_rate": 2.2444761052313856e-06, |
| "loss": 0.1223, |
| "step": 1160 |
| }, |
| { |
| "epoch": 1.8971810991685256, |
| "grad_norm": 0.07475200295448303, |
| "learning_rate": 1.6874282204295766e-06, |
| "loss": 0.1101, |
| "step": 1170 |
| }, |
| { |
| "epoch": 1.8971810991685256, |
| "eval_loss": 0.108616404235363, |
| "eval_runtime": 55.1832, |
| "eval_samples_per_second": 4.712, |
| "eval_steps_per_second": 4.712, |
| "step": 1170 |
| }, |
| { |
| "epoch": 1.9134049888460758, |
| "grad_norm": 0.07395757734775543, |
| "learning_rate": 1.209127305982205e-06, |
| "loss": 0.1217, |
| "step": 1180 |
| }, |
| { |
| "epoch": 1.929628878523626, |
| "grad_norm": 0.1036606878042221, |
| "learning_rate": 8.099564741123166e-07, |
| "loss": 0.1167, |
| "step": 1190 |
| }, |
| { |
| "epoch": 1.9458527682011764, |
| "grad_norm": 0.07940636575222015, |
| "learning_rate": 4.902354549733978e-07, |
| "loss": 0.1111, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.9458527682011764, |
| "eval_loss": 0.10869967937469482, |
| "eval_runtime": 54.8601, |
| "eval_samples_per_second": 4.739, |
| "eval_steps_per_second": 4.739, |
| "step": 1200 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1234, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.675362213715753e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|