{ "best_global_step": 785, "best_metric": 0.04556597024202347, "best_model_checkpoint": "outputs/checkpoint-785", "epoch": 14.0, "eval_steps": 500, "global_step": 2198, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06369426751592357, "grad_norm": 0.5893169641494751, "learning_rate": 0.000199991726421274, "loss": 0.3173, "step": 10 }, { "epoch": 0.12738853503184713, "grad_norm": 0.4055931568145752, "learning_rate": 0.00019996312815368718, "loss": 0.1199, "step": 20 }, { "epoch": 0.1910828025477707, "grad_norm": 0.49983513355255127, "learning_rate": 0.00019991410889510497, "loss": 0.0987, "step": 30 }, { "epoch": 0.25477707006369427, "grad_norm": 0.4363257586956024, "learning_rate": 0.00019984467865943805, "loss": 0.0775, "step": 40 }, { "epoch": 0.3184713375796178, "grad_norm": 0.26337388157844543, "learning_rate": 0.00019975485163025835, "loss": 0.0573, "step": 50 }, { "epoch": 0.3821656050955414, "grad_norm": 0.24820806086063385, "learning_rate": 0.00019964464615790156, "loss": 0.0647, "step": 60 }, { "epoch": 0.445859872611465, "grad_norm": 0.1877005398273468, "learning_rate": 0.0001995140847557183, "loss": 0.0608, "step": 70 }, { "epoch": 0.5095541401273885, "grad_norm": 0.2699015736579895, "learning_rate": 0.00019936319409547513, "loss": 0.0533, "step": 80 }, { "epoch": 0.5732484076433121, "grad_norm": 0.32841813564300537, "learning_rate": 0.00019919200500190587, "loss": 0.0622, "step": 90 }, { "epoch": 0.6369426751592356, "grad_norm": 0.2413034588098526, "learning_rate": 0.00019900055244641447, "loss": 0.0664, "step": 100 }, { "epoch": 0.7006369426751592, "grad_norm": 0.3198252320289612, "learning_rate": 0.000198788875539931, "loss": 0.049, "step": 110 }, { "epoch": 0.7643312101910829, "grad_norm": 0.27829986810684204, "learning_rate": 0.00019855701752492176, "loss": 0.0574, "step": 120 }, { "epoch": 0.8280254777070064, "grad_norm": 0.2980740964412689, "learning_rate": 0.00019830502576655552, "loss": 0.0494, "step": 130 }, { "epoch": 0.89171974522293, "grad_norm": 0.3273659348487854, "learning_rate": 0.00019803295174302752, "loss": 0.0486, "step": 140 }, { "epoch": 0.9554140127388535, "grad_norm": 0.23626568913459778, "learning_rate": 0.00019774085103504326, "loss": 0.0408, "step": 150 }, { "epoch": 1.0, "eval_loss": 0.05023716390132904, "eval_runtime": 152.5451, "eval_samples_per_second": 2.74, "eval_steps_per_second": 0.347, "step": 157 }, { "epoch": 1.019108280254777, "grad_norm": 0.2541586458683014, "learning_rate": 0.00019742878331446414, "loss": 0.0473, "step": 160 }, { "epoch": 1.0828025477707006, "grad_norm": 0.27645406126976013, "learning_rate": 0.00019709681233211733, "loss": 0.0224, "step": 170 }, { "epoch": 1.1464968152866242, "grad_norm": 0.3336308002471924, "learning_rate": 0.0001967450059047726, "loss": 0.0478, "step": 180 }, { "epoch": 1.2101910828025477, "grad_norm": 0.1680356115102768, "learning_rate": 0.00019637343590128809, "loss": 0.0315, "step": 190 }, { "epoch": 1.2738853503184713, "grad_norm": 0.1790059506893158, "learning_rate": 0.00019598217822792892, "loss": 0.0326, "step": 200 }, { "epoch": 1.3375796178343948, "grad_norm": 0.22847947478294373, "learning_rate": 0.00019557131281286024, "loss": 0.0481, "step": 210 }, { "epoch": 1.4012738853503186, "grad_norm": 0.20187054574489594, "learning_rate": 0.0001951409235898194, "loss": 0.0407, "step": 220 }, { "epoch": 1.4649681528662422, "grad_norm": 0.14863349497318268, "learning_rate": 0.0001946910984809694, "loss": 0.0406, "step": 230 }, { "epoch": 1.5286624203821657, "grad_norm": 0.17791222035884857, "learning_rate": 0.00019422192937893775, "loss": 0.0328, "step": 240 }, { "epoch": 1.5923566878980893, "grad_norm": 0.15719226002693176, "learning_rate": 0.00019373351212804404, "loss": 0.0337, "step": 250 }, { "epoch": 1.6560509554140128, "grad_norm": 0.2113306075334549, "learning_rate": 0.0001932259465047206, "loss": 0.0353, "step": 260 }, { "epoch": 1.7197452229299364, "grad_norm": 0.19012148678302765, "learning_rate": 0.0001926993361971293, "loss": 0.0328, "step": 270 }, { "epoch": 1.78343949044586, "grad_norm": 0.1509159654378891, "learning_rate": 0.00019215378878397997, "loss": 0.0407, "step": 280 }, { "epoch": 1.8471337579617835, "grad_norm": 0.1625605821609497, "learning_rate": 0.00019158941571255337, "loss": 0.0379, "step": 290 }, { "epoch": 1.910828025477707, "grad_norm": 0.12683314085006714, "learning_rate": 0.0001910063322759343, "loss": 0.037, "step": 300 }, { "epoch": 1.9745222929936306, "grad_norm": 0.14843901991844177, "learning_rate": 0.00019040465758945883, "loss": 0.0353, "step": 310 }, { "epoch": 2.0, "eval_loss": 0.0473560094833374, "eval_runtime": 151.5547, "eval_samples_per_second": 2.758, "eval_steps_per_second": 0.35, "step": 314 }, { "epoch": 2.038216560509554, "grad_norm": 0.16260643303394318, "learning_rate": 0.00018978451456638088, "loss": 0.0365, "step": 320 }, { "epoch": 2.1019108280254777, "grad_norm": 0.09349235892295837, "learning_rate": 0.00018914602989276294, "loss": 0.0327, "step": 330 }, { "epoch": 2.1656050955414012, "grad_norm": 0.17592753469944, "learning_rate": 0.00018848933400159569, "loss": 0.0258, "step": 340 }, { "epoch": 2.229299363057325, "grad_norm": 0.1627691686153412, "learning_rate": 0.00018781456104615272, "loss": 0.0274, "step": 350 }, { "epoch": 2.2929936305732483, "grad_norm": 0.14966121315956116, "learning_rate": 0.00018712184887258494, "loss": 0.0293, "step": 360 }, { "epoch": 2.356687898089172, "grad_norm": 0.17441661655902863, "learning_rate": 0.0001864113389917606, "loss": 0.0304, "step": 370 }, { "epoch": 2.4203821656050954, "grad_norm": 0.09554579854011536, "learning_rate": 0.00018568317655035676, "loss": 0.0321, "step": 380 }, { "epoch": 2.484076433121019, "grad_norm": 0.2536601126194, "learning_rate": 0.00018493751030120793, "loss": 0.0257, "step": 390 }, { "epoch": 2.5477707006369426, "grad_norm": 0.16189263761043549, "learning_rate": 0.00018417449257291803, "loss": 0.0279, "step": 400 }, { "epoch": 2.611464968152866, "grad_norm": 0.1018817350268364, "learning_rate": 0.00018339427923874207, "loss": 0.0289, "step": 410 }, { "epoch": 2.6751592356687897, "grad_norm": 0.14323534071445465, "learning_rate": 0.00018259702968474327, "loss": 0.0275, "step": 420 }, { "epoch": 2.738853503184713, "grad_norm": 0.11969427019357681, "learning_rate": 0.00018178290677723312, "loss": 0.0281, "step": 430 }, { "epoch": 2.802547770700637, "grad_norm": 0.2024673968553543, "learning_rate": 0.00018095207682950005, "loss": 0.0314, "step": 440 }, { "epoch": 2.8662420382165603, "grad_norm": 0.17737647891044617, "learning_rate": 0.00018010470956783406, "loss": 0.0279, "step": 450 }, { "epoch": 2.9299363057324843, "grad_norm": 0.18298077583312988, "learning_rate": 0.00017924097809685424, "loss": 0.0257, "step": 460 }, { "epoch": 2.9936305732484074, "grad_norm": 0.2549296021461487, "learning_rate": 0.00017836105886414596, "loss": 0.0304, "step": 470 }, { "epoch": 3.0, "eval_loss": 0.05053602159023285, "eval_runtime": 152.6545, "eval_samples_per_second": 2.738, "eval_steps_per_second": 0.347, "step": 471 }, { "epoch": 3.0573248407643314, "grad_norm": 0.17169545590877533, "learning_rate": 0.00017746513162421535, "loss": 0.0208, "step": 480 }, { "epoch": 3.121019108280255, "grad_norm": 0.21220606565475464, "learning_rate": 0.00017655337940176793, "loss": 0.0226, "step": 490 }, { "epoch": 3.1847133757961785, "grad_norm": 0.12988293170928955, "learning_rate": 0.00017562598845431956, "loss": 0.0217, "step": 500 }, { "epoch": 3.248407643312102, "grad_norm": 0.3005841076374054, "learning_rate": 0.00017468314823414669, "loss": 0.0187, "step": 510 }, { "epoch": 3.3121019108280256, "grad_norm": 0.21529339253902435, "learning_rate": 0.000173725051349584, "loss": 0.022, "step": 520 }, { "epoch": 3.375796178343949, "grad_norm": 0.07448782026767731, "learning_rate": 0.00017275189352567745, "loss": 0.0311, "step": 530 }, { "epoch": 3.4394904458598727, "grad_norm": 0.10172971338033676, "learning_rate": 0.0001717638735642005, "loss": 0.0223, "step": 540 }, { "epoch": 3.5031847133757963, "grad_norm": 0.19833995401859283, "learning_rate": 0.0001707611933030419, "loss": 0.0249, "step": 550 }, { "epoch": 3.56687898089172, "grad_norm": 0.234901562333107, "learning_rate": 0.00016974405757497318, "loss": 0.0383, "step": 560 }, { "epoch": 3.6305732484076434, "grad_norm": 0.21425440907478333, "learning_rate": 0.0001687126741658041, "loss": 0.0266, "step": 570 }, { "epoch": 3.694267515923567, "grad_norm": 0.08396715670824051, "learning_rate": 0.00016766725377193557, "loss": 0.0265, "step": 580 }, { "epoch": 3.7579617834394905, "grad_norm": 0.08471404016017914, "learning_rate": 0.00016660800995731693, "loss": 0.0238, "step": 590 }, { "epoch": 3.821656050955414, "grad_norm": 0.0868527814745903, "learning_rate": 0.00016553515910981847, "loss": 0.0269, "step": 600 }, { "epoch": 3.8853503184713376, "grad_norm": 0.3129713535308838, "learning_rate": 0.0001644489203970263, "loss": 0.0223, "step": 610 }, { "epoch": 3.949044585987261, "grad_norm": 0.11113307625055313, "learning_rate": 0.00016334951572146965, "loss": 0.0238, "step": 620 }, { "epoch": 4.0, "eval_loss": 0.04629155993461609, "eval_runtime": 148.9174, "eval_samples_per_second": 2.807, "eval_steps_per_second": 0.356, "step": 628 }, { "epoch": 4.012738853503185, "grad_norm": 0.0971466675400734, "learning_rate": 0.00016223716967528958, "loss": 0.018, "step": 630 }, { "epoch": 4.076433121019108, "grad_norm": 0.15950708091259003, "learning_rate": 0.00016111210949435815, "loss": 0.0158, "step": 640 }, { "epoch": 4.140127388535032, "grad_norm": 0.20078063011169434, "learning_rate": 0.00015997456501185727, "loss": 0.0179, "step": 650 }, { "epoch": 4.203821656050955, "grad_norm": 0.1425529569387436, "learning_rate": 0.0001588247686113274, "loss": 0.0245, "step": 660 }, { "epoch": 4.267515923566879, "grad_norm": 0.1291467249393463, "learning_rate": 0.00015766295517919497, "loss": 0.0241, "step": 670 }, { "epoch": 4.3312101910828025, "grad_norm": 0.08179380744695663, "learning_rate": 0.00015648936205678838, "loss": 0.023, "step": 680 }, { "epoch": 4.3949044585987265, "grad_norm": 0.15069672465324402, "learning_rate": 0.00015530422899185298, "loss": 0.0304, "step": 690 }, { "epoch": 4.45859872611465, "grad_norm": 0.14441800117492676, "learning_rate": 0.00015410779808957385, "loss": 0.0246, "step": 700 }, { "epoch": 4.522292993630574, "grad_norm": 0.07218258827924728, "learning_rate": 0.0001529003137631175, "loss": 0.0232, "step": 710 }, { "epoch": 4.585987261146497, "grad_norm": 0.28358036279678345, "learning_rate": 0.0001516820226837017, "loss": 0.0357, "step": 720 }, { "epoch": 4.649681528662421, "grad_norm": 0.48727092146873474, "learning_rate": 0.00015045317373020426, "loss": 0.0192, "step": 730 }, { "epoch": 4.713375796178344, "grad_norm": 0.14064273238182068, "learning_rate": 0.00014921401793832094, "loss": 0.0221, "step": 740 }, { "epoch": 4.777070063694268, "grad_norm": 0.4577218294143677, "learning_rate": 0.00014796480844928218, "loss": 0.0171, "step": 750 }, { "epoch": 4.840764331210191, "grad_norm": 0.07277490198612213, "learning_rate": 0.0001467058004581404, "loss": 0.0244, "step": 760 }, { "epoch": 4.904458598726115, "grad_norm": 0.3607349693775177, "learning_rate": 0.0001454372511616373, "loss": 0.0227, "step": 770 }, { "epoch": 4.968152866242038, "grad_norm": 1.5265377759933472, "learning_rate": 0.00014415941970566233, "loss": 0.0239, "step": 780 }, { "epoch": 5.0, "eval_loss": 0.04556597024202347, "eval_runtime": 148.8315, "eval_samples_per_second": 2.809, "eval_steps_per_second": 0.356, "step": 785 }, { "epoch": 5.031847133757962, "grad_norm": 0.08108412474393845, "learning_rate": 0.00014287256713231314, "loss": 0.0179, "step": 790 }, { "epoch": 5.095541401273885, "grad_norm": 0.07250893861055374, "learning_rate": 0.00014157695632656837, "loss": 0.0128, "step": 800 }, { "epoch": 5.159235668789809, "grad_norm": 0.12970279157161713, "learning_rate": 0.00014027285196258426, "loss": 0.0127, "step": 810 }, { "epoch": 5.222929936305732, "grad_norm": 0.20146997272968292, "learning_rate": 0.00013896052044962557, "loss": 0.014, "step": 820 }, { "epoch": 5.286624203821656, "grad_norm": 0.165513277053833, "learning_rate": 0.00013764022987764209, "loss": 0.0182, "step": 830 }, { "epoch": 5.350318471337579, "grad_norm": 0.1810760200023651, "learning_rate": 0.00013631224996250185, "loss": 0.0164, "step": 840 }, { "epoch": 5.414012738853503, "grad_norm": 0.13012060523033142, "learning_rate": 0.00013497685199089217, "loss": 0.0162, "step": 850 }, { "epoch": 5.477707006369426, "grad_norm": 0.1861809343099594, "learning_rate": 0.00013363430876489976, "loss": 0.0141, "step": 860 }, { "epoch": 5.54140127388535, "grad_norm": 0.2427922487258911, "learning_rate": 0.00013228489454628127, "loss": 0.0191, "step": 870 }, { "epoch": 5.6050955414012735, "grad_norm": 0.06246360391378403, "learning_rate": 0.00013092888500043566, "loss": 0.0143, "step": 880 }, { "epoch": 5.6687898089171975, "grad_norm": 0.15271341800689697, "learning_rate": 0.0001295665571400899, "loss": 0.0185, "step": 890 }, { "epoch": 5.732484076433121, "grad_norm": 0.04112791642546654, "learning_rate": 0.00012819818926870942, "loss": 0.0122, "step": 900 }, { "epoch": 5.796178343949045, "grad_norm": 0.13756102323532104, "learning_rate": 0.00012682406092364446, "loss": 0.0205, "step": 910 }, { "epoch": 5.859872611464969, "grad_norm": 0.1089109405875206, "learning_rate": 0.00012544445281902512, "loss": 0.0175, "step": 920 }, { "epoch": 5.923566878980892, "grad_norm": 0.5035731792449951, "learning_rate": 0.00012405964678841556, "loss": 0.0164, "step": 930 }, { "epoch": 5.987261146496815, "grad_norm": 0.04808522015810013, "learning_rate": 0.0001226699257272393, "loss": 0.0178, "step": 940 }, { "epoch": 6.0, "eval_loss": 0.048118457198143005, "eval_runtime": 148.8942, "eval_samples_per_second": 2.807, "eval_steps_per_second": 0.356, "step": 942 }, { "epoch": 6.050955414012739, "grad_norm": 0.09140116721391678, "learning_rate": 0.00012127557353498806, "loss": 0.0095, "step": 950 }, { "epoch": 6.114649681528663, "grad_norm": 0.0554346963763237, "learning_rate": 0.00011987687505722532, "loss": 0.0102, "step": 960 }, { "epoch": 6.178343949044586, "grad_norm": 0.0764077678322792, "learning_rate": 0.00011847411602739645, "loss": 0.0097, "step": 970 }, { "epoch": 6.24203821656051, "grad_norm": 0.0922919437289238, "learning_rate": 0.00011706758300845771, "loss": 0.0127, "step": 980 }, { "epoch": 6.305732484076433, "grad_norm": 0.12583084404468536, "learning_rate": 0.0001156575633343355, "loss": 0.0119, "step": 990 }, { "epoch": 6.369426751592357, "grad_norm": 0.14942176640033722, "learning_rate": 0.00011424434505122851, "loss": 0.0132, "step": 1000 }, { "epoch": 6.43312101910828, "grad_norm": 0.2156478315591812, "learning_rate": 0.00011282821685876399, "loss": 0.012, "step": 1010 }, { "epoch": 6.496815286624204, "grad_norm": 0.12194344401359558, "learning_rate": 0.00011140946805102059, "loss": 0.0136, "step": 1020 }, { "epoch": 6.560509554140127, "grad_norm": 0.1328732818365097, "learning_rate": 0.00010998838845743011, "loss": 0.0131, "step": 1030 }, { "epoch": 6.624203821656051, "grad_norm": 0.30128493905067444, "learning_rate": 0.00010856526838356941, "loss": 0.0109, "step": 1040 }, { "epoch": 6.687898089171974, "grad_norm": 0.16975216567516327, "learning_rate": 0.00010714039855185539, "loss": 0.0149, "step": 1050 }, { "epoch": 6.751592356687898, "grad_norm": 0.08274857699871063, "learning_rate": 0.00010571407004215447, "loss": 0.0155, "step": 1060 }, { "epoch": 6.8152866242038215, "grad_norm": 0.1308615654706955, "learning_rate": 0.00010428657423231969, "loss": 0.0135, "step": 1070 }, { "epoch": 6.8789808917197455, "grad_norm": 0.05005017668008804, "learning_rate": 0.00010285820273866613, "loss": 0.0157, "step": 1080 }, { "epoch": 6.942675159235669, "grad_norm": 0.23810291290283203, "learning_rate": 0.00010142924735639819, "loss": 0.0141, "step": 1090 }, { "epoch": 7.0, "eval_loss": 0.047865718603134155, "eval_runtime": 148.8655, "eval_samples_per_second": 2.808, "eval_steps_per_second": 0.356, "step": 1099 }, { "epoch": 7.006369426751593, "grad_norm": 0.09416891634464264, "learning_rate": 0.0001, "loss": 0.0146, "step": 1100 }, { "epoch": 7.070063694267516, "grad_norm": 0.7533183097839355, "learning_rate": 9.857075264360185e-05, "loss": 0.0113, "step": 1110 }, { "epoch": 7.13375796178344, "grad_norm": 0.16101513803005219, "learning_rate": 9.714179726133388e-05, "loss": 0.0075, "step": 1120 }, { "epoch": 7.197452229299363, "grad_norm": 0.1908101737499237, "learning_rate": 9.571342576768035e-05, "loss": 0.009, "step": 1130 }, { "epoch": 7.261146496815287, "grad_norm": 0.06933945417404175, "learning_rate": 9.428592995784554e-05, "loss": 0.0089, "step": 1140 }, { "epoch": 7.32484076433121, "grad_norm": 0.054749008268117905, "learning_rate": 9.285960144814465e-05, "loss": 0.0097, "step": 1150 }, { "epoch": 7.388535031847134, "grad_norm": 0.21135513484477997, "learning_rate": 9.14347316164306e-05, "loss": 0.011, "step": 1160 }, { "epoch": 7.452229299363057, "grad_norm": 0.0507340133190155, "learning_rate": 9.00116115425699e-05, "loss": 0.0095, "step": 1170 }, { "epoch": 7.515923566878981, "grad_norm": 0.06733115762472153, "learning_rate": 8.859053194897942e-05, "loss": 0.0108, "step": 1180 }, { "epoch": 7.579617834394904, "grad_norm": 0.07039262354373932, "learning_rate": 8.717178314123605e-05, "loss": 0.0082, "step": 1190 }, { "epoch": 7.643312101910828, "grad_norm": 0.08534280955791473, "learning_rate": 8.575565494877147e-05, "loss": 0.0099, "step": 1200 }, { "epoch": 7.707006369426751, "grad_norm": 0.11997800320386887, "learning_rate": 8.434243666566451e-05, "loss": 0.011, "step": 1210 }, { "epoch": 7.770700636942675, "grad_norm": 0.05447472259402275, "learning_rate": 8.293241699154231e-05, "loss": 0.0089, "step": 1220 }, { "epoch": 7.834394904458598, "grad_norm": 0.045016925781965256, "learning_rate": 8.152588397260357e-05, "loss": 0.0087, "step": 1230 }, { "epoch": 7.898089171974522, "grad_norm": 0.10037513077259064, "learning_rate": 8.012312494277472e-05, "loss": 0.0088, "step": 1240 }, { "epoch": 7.961783439490446, "grad_norm": 0.09553356468677521, "learning_rate": 7.872442646501199e-05, "loss": 0.008, "step": 1250 }, { "epoch": 8.0, "eval_loss": 0.05469883605837822, "eval_runtime": 148.8965, "eval_samples_per_second": 2.807, "eval_steps_per_second": 0.356, "step": 1256 }, { "epoch": 8.02547770700637, "grad_norm": 0.099692702293396, "learning_rate": 7.733007427276075e-05, "loss": 0.008, "step": 1260 }, { "epoch": 8.089171974522293, "grad_norm": 0.05955551564693451, "learning_rate": 7.594035321158445e-05, "loss": 0.0062, "step": 1270 }, { "epoch": 8.152866242038217, "grad_norm": 0.055873848497867584, "learning_rate": 7.455554718097487e-05, "loss": 0.0051, "step": 1280 }, { "epoch": 8.21656050955414, "grad_norm": 0.11832093447446823, "learning_rate": 7.317593907635558e-05, "loss": 0.0068, "step": 1290 }, { "epoch": 8.280254777070065, "grad_norm": 0.11872788518667221, "learning_rate": 7.180181073129061e-05, "loss": 0.0055, "step": 1300 }, { "epoch": 8.343949044585987, "grad_norm": 0.12502700090408325, "learning_rate": 7.043344285991012e-05, "loss": 0.006, "step": 1310 }, { "epoch": 8.40764331210191, "grad_norm": 0.07949739694595337, "learning_rate": 6.907111499956439e-05, "loss": 0.0056, "step": 1320 }, { "epoch": 8.471337579617835, "grad_norm": 0.08610483258962631, "learning_rate": 6.77151054537188e-05, "loss": 0.0048, "step": 1330 }, { "epoch": 8.535031847133759, "grad_norm": 0.08261114358901978, "learning_rate": 6.636569123510027e-05, "loss": 0.0047, "step": 1340 }, { "epoch": 8.598726114649681, "grad_norm": 0.030890854075551033, "learning_rate": 6.502314800910785e-05, "loss": 0.0052, "step": 1350 }, { "epoch": 8.662420382165605, "grad_norm": 0.07963161170482635, "learning_rate": 6.368775003749816e-05, "loss": 0.0099, "step": 1360 }, { "epoch": 8.726114649681529, "grad_norm": 0.14875206351280212, "learning_rate": 6.235977012235792e-05, "loss": 0.006, "step": 1370 }, { "epoch": 8.789808917197453, "grad_norm": 0.21349501609802246, "learning_rate": 6.103947955037446e-05, "loss": 0.0047, "step": 1380 }, { "epoch": 8.853503184713375, "grad_norm": 0.05400541424751282, "learning_rate": 5.972714803741577e-05, "loss": 0.006, "step": 1390 }, { "epoch": 8.9171974522293, "grad_norm": 0.14428143203258514, "learning_rate": 5.842304367343161e-05, "loss": 0.0095, "step": 1400 }, { "epoch": 8.980891719745223, "grad_norm": 0.07769430428743362, "learning_rate": 5.712743286768687e-05, "loss": 0.0053, "step": 1410 }, { "epoch": 9.0, "eval_loss": 0.05676256865262985, "eval_runtime": 148.9241, "eval_samples_per_second": 2.807, "eval_steps_per_second": 0.356, "step": 1413 }, { "epoch": 9.044585987261147, "grad_norm": 0.10749530047178268, "learning_rate": 5.584058029433766e-05, "loss": 0.0038, "step": 1420 }, { "epoch": 9.10828025477707, "grad_norm": 0.016948334872722626, "learning_rate": 5.4562748838362735e-05, "loss": 0.0051, "step": 1430 }, { "epoch": 9.171974522292993, "grad_norm": 0.008234160952270031, "learning_rate": 5.329419954185965e-05, "loss": 0.0043, "step": 1440 }, { "epoch": 9.235668789808917, "grad_norm": 0.04994361847639084, "learning_rate": 5.203519155071785e-05, "loss": 0.0039, "step": 1450 }, { "epoch": 9.299363057324841, "grad_norm": 0.037435177713632584, "learning_rate": 5.078598206167912e-05, "loss": 0.0033, "step": 1460 }, { "epoch": 9.363057324840764, "grad_norm": 0.11694881319999695, "learning_rate": 4.9546826269795765e-05, "loss": 0.0036, "step": 1470 }, { "epoch": 9.426751592356688, "grad_norm": 0.05153834447264671, "learning_rate": 4.831797731629835e-05, "loss": 0.0042, "step": 1480 }, { "epoch": 9.490445859872612, "grad_norm": 0.09336938709020615, "learning_rate": 4.709968623688254e-05, "loss": 0.0028, "step": 1490 }, { "epoch": 9.554140127388536, "grad_norm": 0.03732943907380104, "learning_rate": 4.589220191042616e-05, "loss": 0.0034, "step": 1500 }, { "epoch": 9.617834394904458, "grad_norm": 0.14202427864074707, "learning_rate": 4.469577100814705e-05, "loss": 0.0031, "step": 1510 }, { "epoch": 9.681528662420382, "grad_norm": 0.09861844778060913, "learning_rate": 4.351063794321165e-05, "loss": 0.003, "step": 1520 }, { "epoch": 9.745222929936306, "grad_norm": 0.16652171313762665, "learning_rate": 4.233704482080504e-05, "loss": 0.0041, "step": 1530 }, { "epoch": 9.80891719745223, "grad_norm": 0.05778292566537857, "learning_rate": 4.11752313886726e-05, "loss": 0.0042, "step": 1540 }, { "epoch": 9.872611464968152, "grad_norm": 0.012399845756590366, "learning_rate": 4.0025434988142766e-05, "loss": 0.0037, "step": 1550 }, { "epoch": 9.936305732484076, "grad_norm": 0.0798059031367302, "learning_rate": 3.888789050564188e-05, "loss": 0.0047, "step": 1560 }, { "epoch": 10.0, "grad_norm": 0.04369504004716873, "learning_rate": 3.776283032471044e-05, "loss": 0.0029, "step": 1570 }, { "epoch": 10.0, "eval_loss": 0.07112779468297958, "eval_runtime": 148.9356, "eval_samples_per_second": 2.807, "eval_steps_per_second": 0.356, "step": 1570 }, { "epoch": 10.063694267515924, "grad_norm": 0.02837546356022358, "learning_rate": 3.6650484278530387e-05, "loss": 0.0023, "step": 1580 }, { "epoch": 10.127388535031848, "grad_norm": 0.00527458218857646, "learning_rate": 3.5551079602973734e-05, "loss": 0.0041, "step": 1590 }, { "epoch": 10.19108280254777, "grad_norm": 0.013476898893713951, "learning_rate": 3.446484089018153e-05, "loss": 0.0028, "step": 1600 }, { "epoch": 10.254777070063694, "grad_norm": 0.009121859446167946, "learning_rate": 3.3391990042683055e-05, "loss": 0.0032, "step": 1610 }, { "epoch": 10.318471337579618, "grad_norm": 0.035027824342250824, "learning_rate": 3.233274622806446e-05, "loss": 0.0038, "step": 1620 }, { "epoch": 10.382165605095542, "grad_norm": 0.05406291410326958, "learning_rate": 3.1287325834195915e-05, "loss": 0.0027, "step": 1630 }, { "epoch": 10.445859872611464, "grad_norm": 0.021644996479153633, "learning_rate": 3.025594242502684e-05, "loss": 0.0022, "step": 1640 }, { "epoch": 10.509554140127388, "grad_norm": 0.04471028223633766, "learning_rate": 2.9238806696958087e-05, "loss": 0.0034, "step": 1650 }, { "epoch": 10.573248407643312, "grad_norm": 0.014225292019546032, "learning_rate": 2.823612643579949e-05, "loss": 0.002, "step": 1660 }, { "epoch": 10.636942675159236, "grad_norm": 0.005709750112146139, "learning_rate": 2.7248106474322554e-05, "loss": 0.0021, "step": 1670 }, { "epoch": 10.700636942675159, "grad_norm": 0.015249662101268768, "learning_rate": 2.627494865041602e-05, "loss": 0.002, "step": 1680 }, { "epoch": 10.764331210191083, "grad_norm": 0.061606843024492264, "learning_rate": 2.5316851765853344e-05, "loss": 0.0035, "step": 1690 }, { "epoch": 10.828025477707007, "grad_norm": 0.056455183774232864, "learning_rate": 2.437401154568044e-05, "loss": 0.0016, "step": 1700 }, { "epoch": 10.89171974522293, "grad_norm": 0.050760120153427124, "learning_rate": 2.3446620598232104e-05, "loss": 0.0031, "step": 1710 }, { "epoch": 10.955414012738853, "grad_norm": 0.0020009365398436785, "learning_rate": 2.253486837578468e-05, "loss": 0.0024, "step": 1720 }, { "epoch": 11.0, "eval_loss": 0.07927798479795456, "eval_runtime": 148.9334, "eval_samples_per_second": 2.807, "eval_steps_per_second": 0.356, "step": 1727 }, { "epoch": 11.019108280254777, "grad_norm": 0.010104876011610031, "learning_rate": 2.163894113585404e-05, "loss": 0.0021, "step": 1730 }, { "epoch": 11.0828025477707, "grad_norm": 0.00586892431601882, "learning_rate": 2.075902190314578e-05, "loss": 0.0018, "step": 1740 }, { "epoch": 11.146496815286625, "grad_norm": 0.004691167734563351, "learning_rate": 1.9895290432165935e-05, "loss": 0.0021, "step": 1750 }, { "epoch": 11.210191082802547, "grad_norm": 0.0037749160546809435, "learning_rate": 1.904792317049996e-05, "loss": 0.0016, "step": 1760 }, { "epoch": 11.273885350318471, "grad_norm": 0.0812540128827095, "learning_rate": 1.82170932227669e-05, "loss": 0.0023, "step": 1770 }, { "epoch": 11.337579617834395, "grad_norm": 0.03934706375002861, "learning_rate": 1.740297031525674e-05, "loss": 0.0027, "step": 1780 }, { "epoch": 11.401273885350319, "grad_norm": 0.005765652749687433, "learning_rate": 1.660572076125797e-05, "loss": 0.0017, "step": 1790 }, { "epoch": 11.464968152866241, "grad_norm": 0.006609324831515551, "learning_rate": 1.5825507427081976e-05, "loss": 0.0019, "step": 1800 }, { "epoch": 11.528662420382165, "grad_norm": 0.046800799667835236, "learning_rate": 1.5062489698792082e-05, "loss": 0.0024, "step": 1810 }, { "epoch": 11.59235668789809, "grad_norm": 0.0012369153555482626, "learning_rate": 1.4316823449643257e-05, "loss": 0.0015, "step": 1820 }, { "epoch": 11.656050955414013, "grad_norm": 0.008111722767353058, "learning_rate": 1.3588661008239412e-05, "loss": 0.0023, "step": 1830 }, { "epoch": 11.719745222929935, "grad_norm": 0.0041403137147426605, "learning_rate": 1.2878151127415094e-05, "loss": 0.0021, "step": 1840 }, { "epoch": 11.78343949044586, "grad_norm": 0.19675485789775848, "learning_rate": 1.2185438953847328e-05, "loss": 0.0032, "step": 1850 }, { "epoch": 11.847133757961783, "grad_norm": 0.006231395993381739, "learning_rate": 1.1510665998404336e-05, "loss": 0.0022, "step": 1860 }, { "epoch": 11.910828025477707, "grad_norm": 0.0019039853941649199, "learning_rate": 1.0853970107237088e-05, "loss": 0.0028, "step": 1870 }, { "epoch": 11.97452229299363, "grad_norm": 0.0023176763206720352, "learning_rate": 1.0215485433619132e-05, "loss": 0.0017, "step": 1880 }, { "epoch": 12.0, "eval_loss": 0.08632908761501312, "eval_runtime": 148.9624, "eval_samples_per_second": 2.806, "eval_steps_per_second": 0.356, "step": 1884 }, { "epoch": 12.038216560509554, "grad_norm": 0.0012192321009933949, "learning_rate": 9.595342410541209e-06, "loss": 0.0017, "step": 1890 }, { "epoch": 12.101910828025478, "grad_norm": 0.01771487295627594, "learning_rate": 8.993667724065747e-06, "loss": 0.002, "step": 1900 }, { "epoch": 12.165605095541402, "grad_norm": 0.004799762275069952, "learning_rate": 8.410584287446643e-06, "loss": 0.0018, "step": 1910 }, { "epoch": 12.229299363057326, "grad_norm": 0.0028903819620609283, "learning_rate": 7.846211216020039e-06, "loss": 0.0017, "step": 1920 }, { "epoch": 12.292993630573248, "grad_norm": 0.002215326763689518, "learning_rate": 7.3006638028707e-06, "loss": 0.0018, "step": 1930 }, { "epoch": 12.356687898089172, "grad_norm": 0.0046812682412564754, "learning_rate": 6.77405349527942e-06, "loss": 0.0022, "step": 1940 }, { "epoch": 12.420382165605096, "grad_norm": 0.00792867224663496, "learning_rate": 6.266487871955962e-06, "loss": 0.0018, "step": 1950 }, { "epoch": 12.48407643312102, "grad_norm": 0.0029917878564447165, "learning_rate": 5.778070621062281e-06, "loss": 0.0019, "step": 1960 }, { "epoch": 12.547770700636942, "grad_norm": 0.0012242052471265197, "learning_rate": 5.308901519030607e-06, "loss": 0.0015, "step": 1970 }, { "epoch": 12.611464968152866, "grad_norm": 0.0016652451595291495, "learning_rate": 4.859076410180629e-06, "loss": 0.0018, "step": 1980 }, { "epoch": 12.67515923566879, "grad_norm": 0.0020181615836918354, "learning_rate": 4.42868718713978e-06, "loss": 0.0026, "step": 1990 }, { "epoch": 12.738853503184714, "grad_norm": 0.01197089534252882, "learning_rate": 4.017821772071084e-06, "loss": 0.0018, "step": 2000 }, { "epoch": 12.802547770700636, "grad_norm": 0.0036414351779967546, "learning_rate": 3.6265640987119042e-06, "loss": 0.0016, "step": 2010 }, { "epoch": 12.86624203821656, "grad_norm": 0.005194537341594696, "learning_rate": 3.2549940952274483e-06, "loss": 0.0017, "step": 2020 }, { "epoch": 12.929936305732484, "grad_norm": 0.0025584339164197445, "learning_rate": 2.903187667882701e-06, "loss": 0.0019, "step": 2030 }, { "epoch": 12.993630573248408, "grad_norm": 0.08787062019109726, "learning_rate": 2.5712166855359045e-06, "loss": 0.0027, "step": 2040 }, { "epoch": 13.0, "eval_loss": 0.09124071151018143, "eval_runtime": 148.9651, "eval_samples_per_second": 2.806, "eval_steps_per_second": 0.356, "step": 2041 }, { "epoch": 13.05732484076433, "grad_norm": 0.001067174132913351, "learning_rate": 2.2591489649567587e-06, "loss": 0.0019, "step": 2050 }, { "epoch": 13.121019108280255, "grad_norm": 0.04148571938276291, "learning_rate": 1.967048256972492e-06, "loss": 0.0018, "step": 2060 }, { "epoch": 13.184713375796179, "grad_norm": 0.0012596879387274384, "learning_rate": 1.6949742334445018e-06, "loss": 0.0017, "step": 2070 }, { "epoch": 13.248407643312103, "grad_norm": 0.0008848529541864991, "learning_rate": 1.4429824750782583e-06, "loss": 0.0019, "step": 2080 }, { "epoch": 13.312101910828025, "grad_norm": 0.004621226340532303, "learning_rate": 1.211124460069013e-06, "loss": 0.0022, "step": 2090 }, { "epoch": 13.375796178343949, "grad_norm": 0.0037146620452404022, "learning_rate": 9.99447553585542e-07, "loss": 0.0018, "step": 2100 }, { "epoch": 13.439490445859873, "grad_norm": 0.0031733817886561155, "learning_rate": 8.079949980941526e-07, "loss": 0.0016, "step": 2110 }, { "epoch": 13.503184713375797, "grad_norm": 0.04481673985719681, "learning_rate": 6.368059045248842e-07, "loss": 0.0021, "step": 2120 }, { "epoch": 13.566878980891719, "grad_norm": 0.008498159237205982, "learning_rate": 4.859152442817205e-07, "loss": 0.002, "step": 2130 }, { "epoch": 13.630573248407643, "grad_norm": 0.003298922209069133, "learning_rate": 3.5535384209846036e-07, "loss": 0.0017, "step": 2140 }, { "epoch": 13.694267515923567, "grad_norm": 0.003794416319578886, "learning_rate": 2.4514836974165454e-07, "loss": 0.0016, "step": 2150 }, { "epoch": 13.757961783439491, "grad_norm": 0.005654670298099518, "learning_rate": 1.5532134056196468e-07, "loss": 0.0017, "step": 2160 }, { "epoch": 13.821656050955415, "grad_norm": 0.0006748574669472873, "learning_rate": 8.589110489505281e-08, "loss": 0.0019, "step": 2170 }, { "epoch": 13.885350318471337, "grad_norm": 0.05673813074827194, "learning_rate": 3.687184631284701e-08, "loss": 0.0023, "step": 2180 }, { "epoch": 13.949044585987261, "grad_norm": 0.044436752796173096, "learning_rate": 8.273578726014642e-09, "loss": 0.002, "step": 2190 }, { "epoch": 14.0, "eval_loss": 0.09217014908790588, "eval_runtime": 148.9613, "eval_samples_per_second": 2.806, "eval_steps_per_second": 0.356, "step": 2198 } ], "logging_steps": 10, "max_steps": 2198, "num_input_tokens_seen": 0, "num_train_epochs": 14, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.05580823441408e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }