{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 2470, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020242914979757085, "grad_norm": 2.8825272538542044, "learning_rate": 5e-06, "loss": 0.7569, "step": 10 }, { "epoch": 0.04048582995951417, "grad_norm": 2.3606485385931157, "learning_rate": 5e-06, "loss": 0.6504, "step": 20 }, { "epoch": 0.06072874493927125, "grad_norm": 1.426935919321139, "learning_rate": 5e-06, "loss": 0.6287, "step": 30 }, { "epoch": 0.08097165991902834, "grad_norm": 1.9896158325770887, "learning_rate": 5e-06, "loss": 0.6136, "step": 40 }, { "epoch": 0.10121457489878542, "grad_norm": 1.2993488018928139, "learning_rate": 5e-06, "loss": 0.6036, "step": 50 }, { "epoch": 0.1214574898785425, "grad_norm": 1.3692905065095424, "learning_rate": 5e-06, "loss": 0.5974, "step": 60 }, { "epoch": 0.1417004048582996, "grad_norm": 1.817904836385203, "learning_rate": 5e-06, "loss": 0.5925, "step": 70 }, { "epoch": 0.16194331983805668, "grad_norm": 1.37930069536892, "learning_rate": 5e-06, "loss": 0.5878, "step": 80 }, { "epoch": 0.18218623481781376, "grad_norm": 1.9672481173378518, "learning_rate": 5e-06, "loss": 0.5869, "step": 90 }, { "epoch": 0.20242914979757085, "grad_norm": 1.5190200995442689, "learning_rate": 5e-06, "loss": 0.5895, "step": 100 }, { "epoch": 0.22267206477732793, "grad_norm": 1.467909545843528, "learning_rate": 5e-06, "loss": 0.5814, "step": 110 }, { "epoch": 0.242914979757085, "grad_norm": 1.2624328705735892, "learning_rate": 5e-06, "loss": 0.579, "step": 120 }, { "epoch": 0.2631578947368421, "grad_norm": 2.8330967598290644, "learning_rate": 5e-06, "loss": 0.5783, "step": 130 }, { "epoch": 0.2834008097165992, "grad_norm": 2.606189250371495, "learning_rate": 5e-06, "loss": 0.5692, "step": 140 }, { "epoch": 0.30364372469635625, "grad_norm": 2.0755945998925616, "learning_rate": 5e-06, "loss": 0.5778, "step": 150 }, { "epoch": 0.32388663967611336, "grad_norm": 2.112489158829737, "learning_rate": 5e-06, "loss": 0.5757, "step": 160 }, { "epoch": 0.3441295546558704, "grad_norm": 1.773147820799013, "learning_rate": 5e-06, "loss": 0.5757, "step": 170 }, { "epoch": 0.3643724696356275, "grad_norm": 1.9389286348778036, "learning_rate": 5e-06, "loss": 0.5699, "step": 180 }, { "epoch": 0.38461538461538464, "grad_norm": 2.0257261841165506, "learning_rate": 5e-06, "loss": 0.5708, "step": 190 }, { "epoch": 0.4048582995951417, "grad_norm": 1.6235586407175255, "learning_rate": 5e-06, "loss": 0.5664, "step": 200 }, { "epoch": 0.4251012145748988, "grad_norm": 1.3600120185742202, "learning_rate": 5e-06, "loss": 0.5624, "step": 210 }, { "epoch": 0.44534412955465585, "grad_norm": 1.5782442409144959, "learning_rate": 5e-06, "loss": 0.5686, "step": 220 }, { "epoch": 0.46558704453441296, "grad_norm": 1.9084679996506322, "learning_rate": 5e-06, "loss": 0.5681, "step": 230 }, { "epoch": 0.48582995951417, "grad_norm": 1.562642052863713, "learning_rate": 5e-06, "loss": 0.5643, "step": 240 }, { "epoch": 0.5060728744939271, "grad_norm": 1.9619840593224824, "learning_rate": 5e-06, "loss": 0.5666, "step": 250 }, { "epoch": 0.5263157894736842, "grad_norm": 1.5861604962625888, "learning_rate": 5e-06, "loss": 0.5518, "step": 260 }, { "epoch": 0.5465587044534413, "grad_norm": 1.7006302815881864, "learning_rate": 5e-06, "loss": 0.5605, "step": 270 }, { "epoch": 0.5668016194331984, "grad_norm": 1.6651788716675646, "learning_rate": 5e-06, "loss": 0.5568, "step": 280 }, { "epoch": 0.5870445344129555, "grad_norm": 2.2785089542841708, "learning_rate": 5e-06, "loss": 0.5565, "step": 290 }, { "epoch": 0.6072874493927125, "grad_norm": 1.5770155712986973, "learning_rate": 5e-06, "loss": 0.5575, "step": 300 }, { "epoch": 0.6275303643724697, "grad_norm": 1.3220149177446632, "learning_rate": 5e-06, "loss": 0.5622, "step": 310 }, { "epoch": 0.6477732793522267, "grad_norm": 1.3472535369102203, "learning_rate": 5e-06, "loss": 0.5594, "step": 320 }, { "epoch": 0.6680161943319838, "grad_norm": 1.3309997590990947, "learning_rate": 5e-06, "loss": 0.5601, "step": 330 }, { "epoch": 0.6882591093117408, "grad_norm": 1.4273218646615096, "learning_rate": 5e-06, "loss": 0.5601, "step": 340 }, { "epoch": 0.708502024291498, "grad_norm": 1.2085847860457979, "learning_rate": 5e-06, "loss": 0.5619, "step": 350 }, { "epoch": 0.728744939271255, "grad_norm": 1.122941072415825, "learning_rate": 5e-06, "loss": 0.564, "step": 360 }, { "epoch": 0.7489878542510121, "grad_norm": 1.3973216531402086, "learning_rate": 5e-06, "loss": 0.5571, "step": 370 }, { "epoch": 0.7692307692307693, "grad_norm": 1.18082932151255, "learning_rate": 5e-06, "loss": 0.5538, "step": 380 }, { "epoch": 0.7894736842105263, "grad_norm": 1.1780579152037594, "learning_rate": 5e-06, "loss": 0.557, "step": 390 }, { "epoch": 0.8097165991902834, "grad_norm": 1.3785246959799171, "learning_rate": 5e-06, "loss": 0.5576, "step": 400 }, { "epoch": 0.8299595141700404, "grad_norm": 1.2948034196229, "learning_rate": 5e-06, "loss": 0.5524, "step": 410 }, { "epoch": 0.8502024291497976, "grad_norm": 1.186346298990548, "learning_rate": 5e-06, "loss": 0.5598, "step": 420 }, { "epoch": 0.8704453441295547, "grad_norm": 1.1531795830071445, "learning_rate": 5e-06, "loss": 0.5532, "step": 430 }, { "epoch": 0.8906882591093117, "grad_norm": 1.238207775770585, "learning_rate": 5e-06, "loss": 0.5587, "step": 440 }, { "epoch": 0.9109311740890689, "grad_norm": 1.579785932754029, "learning_rate": 5e-06, "loss": 0.5499, "step": 450 }, { "epoch": 0.9311740890688259, "grad_norm": 1.0932455184252672, "learning_rate": 5e-06, "loss": 0.5547, "step": 460 }, { "epoch": 0.951417004048583, "grad_norm": 1.1455184663362503, "learning_rate": 5e-06, "loss": 0.5548, "step": 470 }, { "epoch": 0.97165991902834, "grad_norm": 1.0783335686803248, "learning_rate": 5e-06, "loss": 0.5515, "step": 480 }, { "epoch": 0.9919028340080972, "grad_norm": 1.221913718764988, "learning_rate": 5e-06, "loss": 0.5486, "step": 490 }, { "epoch": 1.0, "eval_loss": 0.27600792050361633, "eval_runtime": 127.9056, "eval_samples_per_second": 104.014, "eval_steps_per_second": 0.407, "step": 494 }, { "epoch": 1.0121457489878543, "grad_norm": 1.946031690795471, "learning_rate": 5e-06, "loss": 0.5099, "step": 500 }, { "epoch": 1.0323886639676114, "grad_norm": 1.9289680946263923, "learning_rate": 5e-06, "loss": 0.4748, "step": 510 }, { "epoch": 1.0526315789473684, "grad_norm": 1.353788705122084, "learning_rate": 5e-06, "loss": 0.468, "step": 520 }, { "epoch": 1.0728744939271255, "grad_norm": 1.205205720059485, "learning_rate": 5e-06, "loss": 0.4735, "step": 530 }, { "epoch": 1.0931174089068827, "grad_norm": 1.282155131195443, "learning_rate": 5e-06, "loss": 0.4657, "step": 540 }, { "epoch": 1.1133603238866396, "grad_norm": 1.5080942345199266, "learning_rate": 5e-06, "loss": 0.4699, "step": 550 }, { "epoch": 1.1336032388663968, "grad_norm": 2.00023442913857, "learning_rate": 5e-06, "loss": 0.4659, "step": 560 }, { "epoch": 1.1538461538461537, "grad_norm": 1.7909520333071323, "learning_rate": 5e-06, "loss": 0.4715, "step": 570 }, { "epoch": 1.174089068825911, "grad_norm": 1.6662712618465199, "learning_rate": 5e-06, "loss": 0.4705, "step": 580 }, { "epoch": 1.194331983805668, "grad_norm": 1.3259873704215552, "learning_rate": 5e-06, "loss": 0.4633, "step": 590 }, { "epoch": 1.214574898785425, "grad_norm": 1.6651495485618664, "learning_rate": 5e-06, "loss": 0.4706, "step": 600 }, { "epoch": 1.2348178137651822, "grad_norm": 1.477115453248806, "learning_rate": 5e-06, "loss": 0.4674, "step": 610 }, { "epoch": 1.2550607287449393, "grad_norm": 1.263440522189552, "learning_rate": 5e-06, "loss": 0.4715, "step": 620 }, { "epoch": 1.2753036437246963, "grad_norm": 1.4025183176779765, "learning_rate": 5e-06, "loss": 0.4715, "step": 630 }, { "epoch": 1.2955465587044535, "grad_norm": 1.3060745387835275, "learning_rate": 5e-06, "loss": 0.473, "step": 640 }, { "epoch": 1.3157894736842106, "grad_norm": 1.2936497262200077, "learning_rate": 5e-06, "loss": 0.4775, "step": 650 }, { "epoch": 1.3360323886639676, "grad_norm": 1.2889809170808153, "learning_rate": 5e-06, "loss": 0.4671, "step": 660 }, { "epoch": 1.3562753036437247, "grad_norm": 1.3063275232148253, "learning_rate": 5e-06, "loss": 0.4808, "step": 670 }, { "epoch": 1.376518218623482, "grad_norm": 1.1392335054785994, "learning_rate": 5e-06, "loss": 0.4765, "step": 680 }, { "epoch": 1.3967611336032388, "grad_norm": 1.1028492143829693, "learning_rate": 5e-06, "loss": 0.4759, "step": 690 }, { "epoch": 1.417004048582996, "grad_norm": 1.1765124226702088, "learning_rate": 5e-06, "loss": 0.4739, "step": 700 }, { "epoch": 1.4372469635627532, "grad_norm": 1.2359278437898007, "learning_rate": 5e-06, "loss": 0.4799, "step": 710 }, { "epoch": 1.45748987854251, "grad_norm": 1.206361807359358, "learning_rate": 5e-06, "loss": 0.4792, "step": 720 }, { "epoch": 1.4777327935222673, "grad_norm": 1.2584288631713865, "learning_rate": 5e-06, "loss": 0.4829, "step": 730 }, { "epoch": 1.4979757085020242, "grad_norm": 1.3266851287176349, "learning_rate": 5e-06, "loss": 0.4757, "step": 740 }, { "epoch": 1.5182186234817814, "grad_norm": 1.174062885309594, "learning_rate": 5e-06, "loss": 0.4786, "step": 750 }, { "epoch": 1.5384615384615383, "grad_norm": 1.217286859959994, "learning_rate": 5e-06, "loss": 0.48, "step": 760 }, { "epoch": 1.5587044534412957, "grad_norm": 1.2963071104766335, "learning_rate": 5e-06, "loss": 0.4771, "step": 770 }, { "epoch": 1.5789473684210527, "grad_norm": 1.1301017979019057, "learning_rate": 5e-06, "loss": 0.4788, "step": 780 }, { "epoch": 1.5991902834008096, "grad_norm": 1.2142771905592493, "learning_rate": 5e-06, "loss": 0.4813, "step": 790 }, { "epoch": 1.6194331983805668, "grad_norm": 1.1156710660198195, "learning_rate": 5e-06, "loss": 0.478, "step": 800 }, { "epoch": 1.639676113360324, "grad_norm": 1.3426333112263065, "learning_rate": 5e-06, "loss": 0.4712, "step": 810 }, { "epoch": 1.6599190283400809, "grad_norm": 1.3351933873992157, "learning_rate": 5e-06, "loss": 0.4782, "step": 820 }, { "epoch": 1.680161943319838, "grad_norm": 1.1798934266276804, "learning_rate": 5e-06, "loss": 0.4759, "step": 830 }, { "epoch": 1.7004048582995952, "grad_norm": 1.0719853459322617, "learning_rate": 5e-06, "loss": 0.4751, "step": 840 }, { "epoch": 1.7206477732793521, "grad_norm": 1.1539682879539461, "learning_rate": 5e-06, "loss": 0.4787, "step": 850 }, { "epoch": 1.7408906882591093, "grad_norm": 1.1331656389881875, "learning_rate": 5e-06, "loss": 0.4782, "step": 860 }, { "epoch": 1.7611336032388665, "grad_norm": 1.2305656250383992, "learning_rate": 5e-06, "loss": 0.4808, "step": 870 }, { "epoch": 1.7813765182186234, "grad_norm": 1.484161550142479, "learning_rate": 5e-06, "loss": 0.4829, "step": 880 }, { "epoch": 1.8016194331983806, "grad_norm": 1.180007815990233, "learning_rate": 5e-06, "loss": 0.4781, "step": 890 }, { "epoch": 1.8218623481781377, "grad_norm": 1.200692474334057, "learning_rate": 5e-06, "loss": 0.4779, "step": 900 }, { "epoch": 1.8421052631578947, "grad_norm": 1.1652057043804007, "learning_rate": 5e-06, "loss": 0.48, "step": 910 }, { "epoch": 1.8623481781376519, "grad_norm": 1.2546941772227673, "learning_rate": 5e-06, "loss": 0.4796, "step": 920 }, { "epoch": 1.882591093117409, "grad_norm": 1.1919644772038955, "learning_rate": 5e-06, "loss": 0.4754, "step": 930 }, { "epoch": 1.902834008097166, "grad_norm": 1.086719187935772, "learning_rate": 5e-06, "loss": 0.4798, "step": 940 }, { "epoch": 1.9230769230769231, "grad_norm": 1.1603089251078402, "learning_rate": 5e-06, "loss": 0.4816, "step": 950 }, { "epoch": 1.9433198380566803, "grad_norm": 1.082753063279857, "learning_rate": 5e-06, "loss": 0.4842, "step": 960 }, { "epoch": 1.9635627530364372, "grad_norm": 1.1470965026545579, "learning_rate": 5e-06, "loss": 0.4826, "step": 970 }, { "epoch": 1.9838056680161942, "grad_norm": 1.317035146950926, "learning_rate": 5e-06, "loss": 0.4834, "step": 980 }, { "epoch": 2.0, "eval_loss": 0.2760300040245056, "eval_runtime": 128.3279, "eval_samples_per_second": 103.672, "eval_steps_per_second": 0.405, "step": 988 }, { "epoch": 2.0040485829959516, "grad_norm": 2.423406809168664, "learning_rate": 5e-06, "loss": 0.4596, "step": 990 }, { "epoch": 2.0242914979757085, "grad_norm": 1.8171058614039022, "learning_rate": 5e-06, "loss": 0.3883, "step": 1000 }, { "epoch": 2.0445344129554655, "grad_norm": 1.5419534841689435, "learning_rate": 5e-06, "loss": 0.3836, "step": 1010 }, { "epoch": 2.064777327935223, "grad_norm": 1.476614885030913, "learning_rate": 5e-06, "loss": 0.3801, "step": 1020 }, { "epoch": 2.08502024291498, "grad_norm": 1.379440361306421, "learning_rate": 5e-06, "loss": 0.3784, "step": 1030 }, { "epoch": 2.1052631578947367, "grad_norm": 1.4083322026387224, "learning_rate": 5e-06, "loss": 0.3778, "step": 1040 }, { "epoch": 2.125506072874494, "grad_norm": 1.4696113259366608, "learning_rate": 5e-06, "loss": 0.3879, "step": 1050 }, { "epoch": 2.145748987854251, "grad_norm": 1.2713132554208233, "learning_rate": 5e-06, "loss": 0.3806, "step": 1060 }, { "epoch": 2.165991902834008, "grad_norm": 1.3917546218353265, "learning_rate": 5e-06, "loss": 0.386, "step": 1070 }, { "epoch": 2.1862348178137654, "grad_norm": 1.4708546369587632, "learning_rate": 5e-06, "loss": 0.3845, "step": 1080 }, { "epoch": 2.2064777327935223, "grad_norm": 1.595163601660772, "learning_rate": 5e-06, "loss": 0.3828, "step": 1090 }, { "epoch": 2.2267206477732793, "grad_norm": 1.4877074059133855, "learning_rate": 5e-06, "loss": 0.3897, "step": 1100 }, { "epoch": 2.246963562753036, "grad_norm": 1.7370364288636082, "learning_rate": 5e-06, "loss": 0.3875, "step": 1110 }, { "epoch": 2.2672064777327936, "grad_norm": 1.8541525908514815, "learning_rate": 5e-06, "loss": 0.3884, "step": 1120 }, { "epoch": 2.2874493927125505, "grad_norm": 1.5781496384158298, "learning_rate": 5e-06, "loss": 0.3894, "step": 1130 }, { "epoch": 2.3076923076923075, "grad_norm": 1.7708762236924664, "learning_rate": 5e-06, "loss": 0.3859, "step": 1140 }, { "epoch": 2.327935222672065, "grad_norm": 1.4475591252123796, "learning_rate": 5e-06, "loss": 0.3887, "step": 1150 }, { "epoch": 2.348178137651822, "grad_norm": 1.2226909125336796, "learning_rate": 5e-06, "loss": 0.3885, "step": 1160 }, { "epoch": 2.3684210526315788, "grad_norm": 1.3657056897603896, "learning_rate": 5e-06, "loss": 0.392, "step": 1170 }, { "epoch": 2.388663967611336, "grad_norm": 1.314590138508193, "learning_rate": 5e-06, "loss": 0.3926, "step": 1180 }, { "epoch": 2.408906882591093, "grad_norm": 1.4093248309707875, "learning_rate": 5e-06, "loss": 0.3929, "step": 1190 }, { "epoch": 2.42914979757085, "grad_norm": 1.3473340294150489, "learning_rate": 5e-06, "loss": 0.39, "step": 1200 }, { "epoch": 2.4493927125506074, "grad_norm": 1.4512860257939517, "learning_rate": 5e-06, "loss": 0.3902, "step": 1210 }, { "epoch": 2.4696356275303644, "grad_norm": 1.4529803502641332, "learning_rate": 5e-06, "loss": 0.3947, "step": 1220 }, { "epoch": 2.4898785425101213, "grad_norm": 1.384452479045095, "learning_rate": 5e-06, "loss": 0.3953, "step": 1230 }, { "epoch": 2.5101214574898787, "grad_norm": 1.2935497884972742, "learning_rate": 5e-06, "loss": 0.3986, "step": 1240 }, { "epoch": 2.5303643724696356, "grad_norm": 1.190834719805313, "learning_rate": 5e-06, "loss": 0.3914, "step": 1250 }, { "epoch": 2.5506072874493926, "grad_norm": 1.5623940633492222, "learning_rate": 5e-06, "loss": 0.3887, "step": 1260 }, { "epoch": 2.57085020242915, "grad_norm": 1.4071250725201772, "learning_rate": 5e-06, "loss": 0.3941, "step": 1270 }, { "epoch": 2.591093117408907, "grad_norm": 1.4686855669517231, "learning_rate": 5e-06, "loss": 0.3956, "step": 1280 }, { "epoch": 2.611336032388664, "grad_norm": 1.29119112691104, "learning_rate": 5e-06, "loss": 0.3998, "step": 1290 }, { "epoch": 2.6315789473684212, "grad_norm": 1.2673323070448081, "learning_rate": 5e-06, "loss": 0.3961, "step": 1300 }, { "epoch": 2.651821862348178, "grad_norm": 1.2450186102009178, "learning_rate": 5e-06, "loss": 0.395, "step": 1310 }, { "epoch": 2.672064777327935, "grad_norm": 1.3598218086289704, "learning_rate": 5e-06, "loss": 0.3969, "step": 1320 }, { "epoch": 2.6923076923076925, "grad_norm": 1.476188415686639, "learning_rate": 5e-06, "loss": 0.401, "step": 1330 }, { "epoch": 2.7125506072874495, "grad_norm": 1.2772087881864984, "learning_rate": 5e-06, "loss": 0.3981, "step": 1340 }, { "epoch": 2.7327935222672064, "grad_norm": 1.3901040643475941, "learning_rate": 5e-06, "loss": 0.399, "step": 1350 }, { "epoch": 2.753036437246964, "grad_norm": 1.2669956962100783, "learning_rate": 5e-06, "loss": 0.3936, "step": 1360 }, { "epoch": 2.7732793522267207, "grad_norm": 1.3605832094262316, "learning_rate": 5e-06, "loss": 0.3996, "step": 1370 }, { "epoch": 2.7935222672064777, "grad_norm": 1.4625604256084817, "learning_rate": 5e-06, "loss": 0.3997, "step": 1380 }, { "epoch": 2.813765182186235, "grad_norm": 1.784848571739763, "learning_rate": 5e-06, "loss": 0.4012, "step": 1390 }, { "epoch": 2.834008097165992, "grad_norm": 1.6211701201270445, "learning_rate": 5e-06, "loss": 0.3948, "step": 1400 }, { "epoch": 2.854251012145749, "grad_norm": 1.5362470053479813, "learning_rate": 5e-06, "loss": 0.4017, "step": 1410 }, { "epoch": 2.8744939271255063, "grad_norm": 1.4029392342160418, "learning_rate": 5e-06, "loss": 0.3992, "step": 1420 }, { "epoch": 2.8947368421052633, "grad_norm": 1.4342688190560124, "learning_rate": 5e-06, "loss": 0.395, "step": 1430 }, { "epoch": 2.91497975708502, "grad_norm": 1.2695118103828797, "learning_rate": 5e-06, "loss": 0.4008, "step": 1440 }, { "epoch": 2.9352226720647776, "grad_norm": 1.2741085284215934, "learning_rate": 5e-06, "loss": 0.4111, "step": 1450 }, { "epoch": 2.9554655870445345, "grad_norm": 1.2910941516817205, "learning_rate": 5e-06, "loss": 0.3975, "step": 1460 }, { "epoch": 2.9757085020242915, "grad_norm": 1.4853937254356013, "learning_rate": 5e-06, "loss": 0.4012, "step": 1470 }, { "epoch": 2.9959514170040484, "grad_norm": 1.4022248360408927, "learning_rate": 5e-06, "loss": 0.4034, "step": 1480 }, { "epoch": 3.0, "eval_loss": 0.29170867800712585, "eval_runtime": 128.4123, "eval_samples_per_second": 103.604, "eval_steps_per_second": 0.405, "step": 1482 }, { "epoch": 3.016194331983806, "grad_norm": 2.450301362827223, "learning_rate": 5e-06, "loss": 0.3253, "step": 1490 }, { "epoch": 3.0364372469635628, "grad_norm": 1.7227064520999624, "learning_rate": 5e-06, "loss": 0.2917, "step": 1500 }, { "epoch": 3.0566801619433197, "grad_norm": 1.708297211407951, "learning_rate": 5e-06, "loss": 0.2951, "step": 1510 }, { "epoch": 3.076923076923077, "grad_norm": 1.6773762742996041, "learning_rate": 5e-06, "loss": 0.2916, "step": 1520 }, { "epoch": 3.097165991902834, "grad_norm": 1.5835044005502448, "learning_rate": 5e-06, "loss": 0.2949, "step": 1530 }, { "epoch": 3.117408906882591, "grad_norm": 1.6991756969105973, "learning_rate": 5e-06, "loss": 0.2906, "step": 1540 }, { "epoch": 3.1376518218623484, "grad_norm": 1.6309085248071211, "learning_rate": 5e-06, "loss": 0.2887, "step": 1550 }, { "epoch": 3.1578947368421053, "grad_norm": 1.467082791239925, "learning_rate": 5e-06, "loss": 0.2958, "step": 1560 }, { "epoch": 3.1781376518218623, "grad_norm": 1.5977415982902385, "learning_rate": 5e-06, "loss": 0.2958, "step": 1570 }, { "epoch": 3.1983805668016196, "grad_norm": 1.794838585622722, "learning_rate": 5e-06, "loss": 0.2985, "step": 1580 }, { "epoch": 3.2186234817813766, "grad_norm": 1.6188047839576813, "learning_rate": 5e-06, "loss": 0.3009, "step": 1590 }, { "epoch": 3.2388663967611335, "grad_norm": 1.5996247856636512, "learning_rate": 5e-06, "loss": 0.2993, "step": 1600 }, { "epoch": 3.2591093117408905, "grad_norm": 1.5799607329692924, "learning_rate": 5e-06, "loss": 0.2981, "step": 1610 }, { "epoch": 3.279352226720648, "grad_norm": 1.6462393560491397, "learning_rate": 5e-06, "loss": 0.3014, "step": 1620 }, { "epoch": 3.299595141700405, "grad_norm": 1.743995264000295, "learning_rate": 5e-06, "loss": 0.3034, "step": 1630 }, { "epoch": 3.3198380566801617, "grad_norm": 1.6858202723993232, "learning_rate": 5e-06, "loss": 0.303, "step": 1640 }, { "epoch": 3.340080971659919, "grad_norm": 1.8191610290572566, "learning_rate": 5e-06, "loss": 0.3003, "step": 1650 }, { "epoch": 3.360323886639676, "grad_norm": 1.601413073895001, "learning_rate": 5e-06, "loss": 0.3051, "step": 1660 }, { "epoch": 3.380566801619433, "grad_norm": 1.7563790440673033, "learning_rate": 5e-06, "loss": 0.2999, "step": 1670 }, { "epoch": 3.4008097165991904, "grad_norm": 1.5579994158970751, "learning_rate": 5e-06, "loss": 0.2989, "step": 1680 }, { "epoch": 3.4210526315789473, "grad_norm": 1.516213960669368, "learning_rate": 5e-06, "loss": 0.3011, "step": 1690 }, { "epoch": 3.4412955465587043, "grad_norm": 1.658051304756316, "learning_rate": 5e-06, "loss": 0.3024, "step": 1700 }, { "epoch": 3.4615384615384617, "grad_norm": 1.5918375939252065, "learning_rate": 5e-06, "loss": 0.3056, "step": 1710 }, { "epoch": 3.4817813765182186, "grad_norm": 1.6167160427713403, "learning_rate": 5e-06, "loss": 0.3059, "step": 1720 }, { "epoch": 3.5020242914979756, "grad_norm": 1.633353659275257, "learning_rate": 5e-06, "loss": 0.3103, "step": 1730 }, { "epoch": 3.522267206477733, "grad_norm": 1.6652424042294116, "learning_rate": 5e-06, "loss": 0.3047, "step": 1740 }, { "epoch": 3.54251012145749, "grad_norm": 1.5945478465514338, "learning_rate": 5e-06, "loss": 0.3097, "step": 1750 }, { "epoch": 3.562753036437247, "grad_norm": 1.4266070240229398, "learning_rate": 5e-06, "loss": 0.3053, "step": 1760 }, { "epoch": 3.582995951417004, "grad_norm": 1.5417916459202972, "learning_rate": 5e-06, "loss": 0.3107, "step": 1770 }, { "epoch": 3.603238866396761, "grad_norm": 1.6806576746631992, "learning_rate": 5e-06, "loss": 0.3077, "step": 1780 }, { "epoch": 3.623481781376518, "grad_norm": 1.6366680075861002, "learning_rate": 5e-06, "loss": 0.3096, "step": 1790 }, { "epoch": 3.6437246963562755, "grad_norm": 1.8008610671747132, "learning_rate": 5e-06, "loss": 0.3095, "step": 1800 }, { "epoch": 3.6639676113360324, "grad_norm": 1.7218050046753175, "learning_rate": 5e-06, "loss": 0.3129, "step": 1810 }, { "epoch": 3.6842105263157894, "grad_norm": 1.54966815167092, "learning_rate": 5e-06, "loss": 0.3089, "step": 1820 }, { "epoch": 3.7044534412955468, "grad_norm": 1.5907699518804959, "learning_rate": 5e-06, "loss": 0.3115, "step": 1830 }, { "epoch": 3.7246963562753037, "grad_norm": 1.5950760981279428, "learning_rate": 5e-06, "loss": 0.3132, "step": 1840 }, { "epoch": 3.7449392712550607, "grad_norm": 1.6178462612967588, "learning_rate": 5e-06, "loss": 0.3112, "step": 1850 }, { "epoch": 3.765182186234818, "grad_norm": 1.6595052047937346, "learning_rate": 5e-06, "loss": 0.3144, "step": 1860 }, { "epoch": 3.785425101214575, "grad_norm": 1.5960921622659012, "learning_rate": 5e-06, "loss": 0.3181, "step": 1870 }, { "epoch": 3.805668016194332, "grad_norm": 1.8789757609659636, "learning_rate": 5e-06, "loss": 0.3128, "step": 1880 }, { "epoch": 3.8259109311740893, "grad_norm": 1.6017653115735, "learning_rate": 5e-06, "loss": 0.318, "step": 1890 }, { "epoch": 3.8461538461538463, "grad_norm": 1.4883871110556444, "learning_rate": 5e-06, "loss": 0.3168, "step": 1900 }, { "epoch": 3.866396761133603, "grad_norm": 1.7783804818718214, "learning_rate": 5e-06, "loss": 0.3114, "step": 1910 }, { "epoch": 3.8866396761133606, "grad_norm": 1.7097670030385954, "learning_rate": 5e-06, "loss": 0.3196, "step": 1920 }, { "epoch": 3.9068825910931175, "grad_norm": 1.519954953217676, "learning_rate": 5e-06, "loss": 0.3161, "step": 1930 }, { "epoch": 3.9271255060728745, "grad_norm": 1.5786075983313086, "learning_rate": 5e-06, "loss": 0.3142, "step": 1940 }, { "epoch": 3.9473684210526314, "grad_norm": 1.8339039956017331, "learning_rate": 5e-06, "loss": 0.3156, "step": 1950 }, { "epoch": 3.967611336032389, "grad_norm": 1.504586213513185, "learning_rate": 5e-06, "loss": 0.3173, "step": 1960 }, { "epoch": 3.9878542510121457, "grad_norm": 1.603597507259475, "learning_rate": 5e-06, "loss": 0.3182, "step": 1970 }, { "epoch": 4.0, "eval_loss": 0.3246242105960846, "eval_runtime": 128.1021, "eval_samples_per_second": 103.855, "eval_steps_per_second": 0.406, "step": 1976 }, { "epoch": 4.008097165991903, "grad_norm": 5.133136743732594, "learning_rate": 5e-06, "loss": 0.2774, "step": 1980 }, { "epoch": 4.02834008097166, "grad_norm": 2.5228879416463936, "learning_rate": 5e-06, "loss": 0.2123, "step": 1990 }, { "epoch": 4.048582995951417, "grad_norm": 2.0169504998470833, "learning_rate": 5e-06, "loss": 0.2057, "step": 2000 }, { "epoch": 4.068825910931174, "grad_norm": 1.9085723632427691, "learning_rate": 5e-06, "loss": 0.2061, "step": 2010 }, { "epoch": 4.089068825910931, "grad_norm": 1.8551214473550242, "learning_rate": 5e-06, "loss": 0.2058, "step": 2020 }, { "epoch": 4.109311740890688, "grad_norm": 1.8804505495475972, "learning_rate": 5e-06, "loss": 0.2076, "step": 2030 }, { "epoch": 4.129554655870446, "grad_norm": 1.8325377033435544, "learning_rate": 5e-06, "loss": 0.2055, "step": 2040 }, { "epoch": 4.149797570850202, "grad_norm": 1.892921271335565, "learning_rate": 5e-06, "loss": 0.2073, "step": 2050 }, { "epoch": 4.17004048582996, "grad_norm": 2.00985346837552, "learning_rate": 5e-06, "loss": 0.2084, "step": 2060 }, { "epoch": 4.190283400809717, "grad_norm": 1.820158031557001, "learning_rate": 5e-06, "loss": 0.209, "step": 2070 }, { "epoch": 4.2105263157894735, "grad_norm": 1.8327459405866524, "learning_rate": 5e-06, "loss": 0.2127, "step": 2080 }, { "epoch": 4.230769230769231, "grad_norm": 1.9394580410768052, "learning_rate": 5e-06, "loss": 0.2123, "step": 2090 }, { "epoch": 4.251012145748988, "grad_norm": 1.9657450113344472, "learning_rate": 5e-06, "loss": 0.2108, "step": 2100 }, { "epoch": 4.271255060728745, "grad_norm": 1.949924211787456, "learning_rate": 5e-06, "loss": 0.2139, "step": 2110 }, { "epoch": 4.291497975708502, "grad_norm": 1.8700037789000614, "learning_rate": 5e-06, "loss": 0.2146, "step": 2120 }, { "epoch": 4.3117408906882595, "grad_norm": 1.8948956761363887, "learning_rate": 5e-06, "loss": 0.2145, "step": 2130 }, { "epoch": 4.331983805668016, "grad_norm": 2.183456888675007, "learning_rate": 5e-06, "loss": 0.2168, "step": 2140 }, { "epoch": 4.352226720647773, "grad_norm": 2.01397562728079, "learning_rate": 5e-06, "loss": 0.2151, "step": 2150 }, { "epoch": 4.372469635627531, "grad_norm": 2.1466988238739697, "learning_rate": 5e-06, "loss": 0.2148, "step": 2160 }, { "epoch": 4.392712550607287, "grad_norm": 1.8790514185213776, "learning_rate": 5e-06, "loss": 0.2159, "step": 2170 }, { "epoch": 4.412955465587045, "grad_norm": 1.9845491426747, "learning_rate": 5e-06, "loss": 0.2179, "step": 2180 }, { "epoch": 4.433198380566802, "grad_norm": 1.9657898682355717, "learning_rate": 5e-06, "loss": 0.2173, "step": 2190 }, { "epoch": 4.4534412955465585, "grad_norm": 1.8874931048307386, "learning_rate": 5e-06, "loss": 0.2156, "step": 2200 }, { "epoch": 4.473684210526316, "grad_norm": 1.845963836099471, "learning_rate": 5e-06, "loss": 0.2188, "step": 2210 }, { "epoch": 4.493927125506072, "grad_norm": 1.9372215749139772, "learning_rate": 5e-06, "loss": 0.2191, "step": 2220 }, { "epoch": 4.51417004048583, "grad_norm": 2.150693124788377, "learning_rate": 5e-06, "loss": 0.2201, "step": 2230 }, { "epoch": 4.534412955465587, "grad_norm": 2.0442818699824317, "learning_rate": 5e-06, "loss": 0.22, "step": 2240 }, { "epoch": 4.554655870445345, "grad_norm": 1.9514620437414862, "learning_rate": 5e-06, "loss": 0.2185, "step": 2250 }, { "epoch": 4.574898785425101, "grad_norm": 1.8963967573781124, "learning_rate": 5e-06, "loss": 0.2213, "step": 2260 }, { "epoch": 4.5951417004048585, "grad_norm": 1.8526961706560494, "learning_rate": 5e-06, "loss": 0.2221, "step": 2270 }, { "epoch": 4.615384615384615, "grad_norm": 1.9821664496420446, "learning_rate": 5e-06, "loss": 0.2214, "step": 2280 }, { "epoch": 4.635627530364372, "grad_norm": 1.915513929466144, "learning_rate": 5e-06, "loss": 0.2237, "step": 2290 }, { "epoch": 4.65587044534413, "grad_norm": 1.8762006846363004, "learning_rate": 5e-06, "loss": 0.2249, "step": 2300 }, { "epoch": 4.676113360323887, "grad_norm": 1.9227849786208417, "learning_rate": 5e-06, "loss": 0.2265, "step": 2310 }, { "epoch": 4.696356275303644, "grad_norm": 1.9572380827157467, "learning_rate": 5e-06, "loss": 0.2265, "step": 2320 }, { "epoch": 4.716599190283401, "grad_norm": 1.8826393602367635, "learning_rate": 5e-06, "loss": 0.2252, "step": 2330 }, { "epoch": 4.7368421052631575, "grad_norm": 1.825743956937918, "learning_rate": 5e-06, "loss": 0.2234, "step": 2340 }, { "epoch": 4.757085020242915, "grad_norm": 2.194102804306542, "learning_rate": 5e-06, "loss": 0.2243, "step": 2350 }, { "epoch": 4.777327935222672, "grad_norm": 1.8765617982862395, "learning_rate": 5e-06, "loss": 0.2246, "step": 2360 }, { "epoch": 4.797570850202429, "grad_norm": 1.8687466226351828, "learning_rate": 5e-06, "loss": 0.2276, "step": 2370 }, { "epoch": 4.817813765182186, "grad_norm": 1.7759017848087686, "learning_rate": 5e-06, "loss": 0.2257, "step": 2380 }, { "epoch": 4.838056680161944, "grad_norm": 1.8906492602012772, "learning_rate": 5e-06, "loss": 0.2253, "step": 2390 }, { "epoch": 4.8582995951417, "grad_norm": 2.050008885412097, "learning_rate": 5e-06, "loss": 0.2294, "step": 2400 }, { "epoch": 4.8785425101214575, "grad_norm": 1.9449950754235916, "learning_rate": 5e-06, "loss": 0.2307, "step": 2410 }, { "epoch": 4.898785425101215, "grad_norm": 1.866407529076546, "learning_rate": 5e-06, "loss": 0.2293, "step": 2420 }, { "epoch": 4.919028340080971, "grad_norm": 1.9434490467739143, "learning_rate": 5e-06, "loss": 0.2285, "step": 2430 }, { "epoch": 4.939271255060729, "grad_norm": 1.9646396229024239, "learning_rate": 5e-06, "loss": 0.2306, "step": 2440 }, { "epoch": 4.959514170040486, "grad_norm": 1.8683726614276728, "learning_rate": 5e-06, "loss": 0.2277, "step": 2450 }, { "epoch": 4.979757085020243, "grad_norm": 1.984199504770925, "learning_rate": 5e-06, "loss": 0.2312, "step": 2460 }, { "epoch": 5.0, "grad_norm": 1.8362584513440876, "learning_rate": 5e-06, "loss": 0.2292, "step": 2470 }, { "epoch": 5.0, "eval_loss": 0.37886306643486023, "eval_runtime": 128.1269, "eval_samples_per_second": 103.835, "eval_steps_per_second": 0.406, "step": 2470 }, { "epoch": 5.0, "step": 2470, "total_flos": 4136504477614080.0, "train_loss": 0.39363470135430095, "train_runtime": 36289.8965, "train_samples_per_second": 34.825, "train_steps_per_second": 0.068 } ], "logging_steps": 10, "max_steps": 2470, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4136504477614080.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }