{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 4665,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.006430868167202572,
      "grad_norm": 4.730106830596924,
      "learning_rate": 5e-06,
      "loss": 0.8965,
      "step": 10
    },
    {
      "epoch": 0.012861736334405145,
      "grad_norm": 2.0916528701782227,
      "learning_rate": 5e-06,
      "loss": 0.8501,
      "step": 20
    },
    {
      "epoch": 0.01929260450160772,
      "grad_norm": 0.8616329431533813,
      "learning_rate": 5e-06,
      "loss": 0.8318,
      "step": 30
    },
    {
      "epoch": 0.02572347266881029,
      "grad_norm": 0.7073356509208679,
      "learning_rate": 5e-06,
      "loss": 0.8193,
      "step": 40
    },
    {
      "epoch": 0.03215434083601286,
      "grad_norm": 1.3553227186203003,
      "learning_rate": 5e-06,
      "loss": 0.8094,
      "step": 50
    },
    {
      "epoch": 0.03858520900321544,
      "grad_norm": 0.6813921928405762,
      "learning_rate": 5e-06,
      "loss": 0.8042,
      "step": 60
    },
    {
      "epoch": 0.04501607717041801,
      "grad_norm": 1.011812448501587,
      "learning_rate": 5e-06,
      "loss": 0.7912,
      "step": 70
    },
    {
      "epoch": 0.05144694533762058,
      "grad_norm": 2.1421170234680176,
      "learning_rate": 5e-06,
      "loss": 0.7894,
      "step": 80
    },
    {
      "epoch": 0.05787781350482315,
      "grad_norm": 1.2183887958526611,
      "learning_rate": 5e-06,
      "loss": 0.7799,
      "step": 90
    },
    {
      "epoch": 0.06430868167202572,
      "grad_norm": 1.1371192932128906,
      "learning_rate": 5e-06,
      "loss": 0.7832,
      "step": 100
    },
    {
      "epoch": 0.0707395498392283,
      "grad_norm": 1.3382622003555298,
      "learning_rate": 5e-06,
      "loss": 0.7791,
      "step": 110
    },
    {
      "epoch": 0.07717041800643087,
      "grad_norm": 0.6512683629989624,
      "learning_rate": 5e-06,
      "loss": 0.7784,
      "step": 120
    },
    {
      "epoch": 0.08360128617363344,
      "grad_norm": 0.632128894329071,
      "learning_rate": 5e-06,
      "loss": 0.7665,
      "step": 130
    },
    {
      "epoch": 0.09003215434083602,
      "grad_norm": 0.4840221405029297,
      "learning_rate": 5e-06,
      "loss": 0.7643,
      "step": 140
    },
    {
      "epoch": 0.09646302250803858,
      "grad_norm": 0.5068333745002747,
      "learning_rate": 5e-06,
      "loss": 0.7772,
      "step": 150
    },
    {
      "epoch": 0.10289389067524116,
      "grad_norm": 0.5646687150001526,
      "learning_rate": 5e-06,
      "loss": 0.7679,
      "step": 160
    },
    {
      "epoch": 0.10932475884244373,
      "grad_norm": 0.9486843943595886,
      "learning_rate": 5e-06,
      "loss": 0.7724,
      "step": 170
    },
    {
      "epoch": 0.1157556270096463,
      "grad_norm": 0.5386438369750977,
      "learning_rate": 5e-06,
      "loss": 0.7615,
      "step": 180
    },
    {
      "epoch": 0.12218649517684887,
      "grad_norm": 0.7876360416412354,
      "learning_rate": 5e-06,
      "loss": 0.767,
      "step": 190
    },
    {
      "epoch": 0.12861736334405144,
      "grad_norm": 0.5463624596595764,
      "learning_rate": 5e-06,
      "loss": 0.7666,
      "step": 200
    },
    {
      "epoch": 0.13504823151125403,
      "grad_norm": 0.7764928340911865,
      "learning_rate": 5e-06,
      "loss": 0.7564,
      "step": 210
    },
    {
      "epoch": 0.1414790996784566,
      "grad_norm": 0.8027799725532532,
      "learning_rate": 5e-06,
      "loss": 0.7546,
      "step": 220
    },
    {
      "epoch": 0.14790996784565916,
      "grad_norm": 1.3793781995773315,
      "learning_rate": 5e-06,
      "loss": 0.7597,
      "step": 230
    },
    {
      "epoch": 0.15434083601286175,
      "grad_norm": 0.4461333155632019,
      "learning_rate": 5e-06,
      "loss": 0.7637,
      "step": 240
    },
    {
      "epoch": 0.1607717041800643,
      "grad_norm": 0.5069792866706848,
      "learning_rate": 5e-06,
      "loss": 0.7663,
      "step": 250
    },
    {
      "epoch": 0.16720257234726688,
      "grad_norm": 0.4338971972465515,
      "learning_rate": 5e-06,
      "loss": 0.7567,
      "step": 260
    },
    {
      "epoch": 0.17363344051446947,
      "grad_norm": 0.5476683974266052,
      "learning_rate": 5e-06,
      "loss": 0.7581,
      "step": 270
    },
    {
      "epoch": 0.18006430868167203,
      "grad_norm": 0.5481943488121033,
      "learning_rate": 5e-06,
      "loss": 0.7591,
      "step": 280
    },
    {
      "epoch": 0.1864951768488746,
      "grad_norm": 0.5772653818130493,
      "learning_rate": 5e-06,
      "loss": 0.754,
      "step": 290
    },
    {
      "epoch": 0.19292604501607716,
      "grad_norm": 0.455953449010849,
      "learning_rate": 5e-06,
      "loss": 0.7601,
      "step": 300
    },
    {
      "epoch": 0.19935691318327975,
      "grad_norm": 0.5032763481140137,
      "learning_rate": 5e-06,
      "loss": 0.7534,
      "step": 310
    },
    {
      "epoch": 0.2057877813504823,
      "grad_norm": 0.5598179697990417,
      "learning_rate": 5e-06,
      "loss": 0.7567,
      "step": 320
    },
    {
      "epoch": 0.21221864951768488,
      "grad_norm": 0.5307891368865967,
      "learning_rate": 5e-06,
      "loss": 0.7701,
      "step": 330
    },
    {
      "epoch": 0.21864951768488747,
      "grad_norm": 0.654017984867096,
      "learning_rate": 5e-06,
      "loss": 0.7565,
      "step": 340
    },
    {
      "epoch": 0.22508038585209003,
      "grad_norm": 0.49319934844970703,
      "learning_rate": 5e-06,
      "loss": 0.7423,
      "step": 350
    },
    {
      "epoch": 0.2315112540192926,
      "grad_norm": 0.5508062243461609,
      "learning_rate": 5e-06,
      "loss": 0.7569,
      "step": 360
    },
    {
      "epoch": 0.2379421221864952,
      "grad_norm": 0.5229071378707886,
      "learning_rate": 5e-06,
      "loss": 0.7557,
      "step": 370
    },
    {
      "epoch": 0.24437299035369775,
      "grad_norm": 0.5349363684654236,
      "learning_rate": 5e-06,
      "loss": 0.7569,
      "step": 380
    },
    {
      "epoch": 0.2508038585209003,
      "grad_norm": 0.4683885872364044,
      "learning_rate": 5e-06,
      "loss": 0.7496,
      "step": 390
    },
    {
      "epoch": 0.2572347266881029,
      "grad_norm": 0.4715147316455841,
      "learning_rate": 5e-06,
      "loss": 0.7528,
      "step": 400
    },
    {
      "epoch": 0.26366559485530544,
      "grad_norm": 0.42438462376594543,
      "learning_rate": 5e-06,
      "loss": 0.7612,
      "step": 410
    },
    {
      "epoch": 0.27009646302250806,
      "grad_norm": 0.7097042798995972,
      "learning_rate": 5e-06,
      "loss": 0.749,
      "step": 420
    },
    {
      "epoch": 0.2765273311897106,
      "grad_norm": 0.43447116017341614,
      "learning_rate": 5e-06,
      "loss": 0.7396,
      "step": 430
    },
    {
      "epoch": 0.2829581993569132,
      "grad_norm": 0.5114174485206604,
      "learning_rate": 5e-06,
      "loss": 0.7471,
      "step": 440
    },
    {
      "epoch": 0.28938906752411575,
      "grad_norm": 0.4831767678260803,
      "learning_rate": 5e-06,
      "loss": 0.7449,
      "step": 450
    },
    {
      "epoch": 0.2958199356913183,
      "grad_norm": 0.49875640869140625,
      "learning_rate": 5e-06,
      "loss": 0.7483,
      "step": 460
    },
    {
      "epoch": 0.3022508038585209,
      "grad_norm": 0.489754855632782,
      "learning_rate": 5e-06,
      "loss": 0.7451,
      "step": 470
    },
    {
      "epoch": 0.3086816720257235,
      "grad_norm": 0.5207638144493103,
      "learning_rate": 5e-06,
      "loss": 0.7463,
      "step": 480
    },
    {
      "epoch": 0.31511254019292606,
      "grad_norm": 0.5335565805435181,
      "learning_rate": 5e-06,
      "loss": 0.7403,
      "step": 490
    },
    {
      "epoch": 0.3215434083601286,
      "grad_norm": 0.5709009766578674,
      "learning_rate": 5e-06,
      "loss": 0.7425,
      "step": 500
    },
    {
      "epoch": 0.3279742765273312,
      "grad_norm": 0.45602336525917053,
      "learning_rate": 5e-06,
      "loss": 0.7506,
      "step": 510
    },
    {
      "epoch": 0.33440514469453375,
      "grad_norm": 0.49307122826576233,
      "learning_rate": 5e-06,
      "loss": 0.7404,
      "step": 520
    },
    {
      "epoch": 0.3408360128617363,
      "grad_norm": 0.49949631094932556,
      "learning_rate": 5e-06,
      "loss": 0.7465,
      "step": 530
    },
    {
      "epoch": 0.34726688102893893,
      "grad_norm": 0.43178921937942505,
      "learning_rate": 5e-06,
      "loss": 0.7531,
      "step": 540
    },
    {
      "epoch": 0.3536977491961415,
      "grad_norm": 0.46599695086479187,
      "learning_rate": 5e-06,
      "loss": 0.7428,
      "step": 550
    },
    {
      "epoch": 0.36012861736334406,
      "grad_norm": 0.4541228115558624,
      "learning_rate": 5e-06,
      "loss": 0.7427,
      "step": 560
    },
    {
      "epoch": 0.3665594855305466,
      "grad_norm": 0.474062979221344,
      "learning_rate": 5e-06,
      "loss": 0.7408,
      "step": 570
    },
    {
      "epoch": 0.3729903536977492,
      "grad_norm": 0.47080013155937195,
      "learning_rate": 5e-06,
      "loss": 0.7423,
      "step": 580
    },
    {
      "epoch": 0.37942122186495175,
      "grad_norm": 0.46832209825515747,
      "learning_rate": 5e-06,
      "loss": 0.7383,
      "step": 590
    },
    {
      "epoch": 0.3858520900321543,
      "grad_norm": 0.4157029986381531,
      "learning_rate": 5e-06,
      "loss": 0.7509,
      "step": 600
    },
    {
      "epoch": 0.39228295819935693,
      "grad_norm": 0.512323260307312,
      "learning_rate": 5e-06,
      "loss": 0.7387,
      "step": 610
    },
    {
      "epoch": 0.3987138263665595,
      "grad_norm": 0.4567117393016815,
      "learning_rate": 5e-06,
      "loss": 0.7379,
      "step": 620
    },
    {
      "epoch": 0.40514469453376206,
      "grad_norm": 0.4581872522830963,
      "learning_rate": 5e-06,
      "loss": 0.7384,
      "step": 630
    },
    {
      "epoch": 0.4115755627009646,
      "grad_norm": 0.43767768144607544,
      "learning_rate": 5e-06,
      "loss": 0.7349,
      "step": 640
    },
    {
      "epoch": 0.4180064308681672,
      "grad_norm": 0.5342622399330139,
      "learning_rate": 5e-06,
      "loss": 0.7379,
      "step": 650
    },
    {
      "epoch": 0.42443729903536975,
      "grad_norm": 0.561100423336029,
      "learning_rate": 5e-06,
      "loss": 0.7361,
      "step": 660
    },
    {
      "epoch": 0.43086816720257237,
      "grad_norm": 0.4456179141998291,
      "learning_rate": 5e-06,
      "loss": 0.7373,
      "step": 670
    },
    {
      "epoch": 0.43729903536977494,
      "grad_norm": 0.5494213104248047,
      "learning_rate": 5e-06,
      "loss": 0.7428,
      "step": 680
    },
    {
      "epoch": 0.4437299035369775,
      "grad_norm": 0.6552912592887878,
      "learning_rate": 5e-06,
      "loss": 0.7397,
      "step": 690
    },
    {
      "epoch": 0.45016077170418006,
      "grad_norm": 0.6181358098983765,
      "learning_rate": 5e-06,
      "loss": 0.7375,
      "step": 700
    },
    {
      "epoch": 0.4565916398713826,
      "grad_norm": 0.45060378313064575,
      "learning_rate": 5e-06,
      "loss": 0.7352,
      "step": 710
    },
    {
      "epoch": 0.4630225080385852,
      "grad_norm": 0.4362952411174774,
      "learning_rate": 5e-06,
      "loss": 0.7385,
      "step": 720
    },
    {
      "epoch": 0.4694533762057878,
      "grad_norm": 0.4858558177947998,
      "learning_rate": 5e-06,
      "loss": 0.7364,
      "step": 730
    },
    {
      "epoch": 0.4758842443729904,
      "grad_norm": 0.5299956202507019,
      "learning_rate": 5e-06,
      "loss": 0.7368,
      "step": 740
    },
    {
      "epoch": 0.48231511254019294,
      "grad_norm": 0.6190028190612793,
      "learning_rate": 5e-06,
      "loss": 0.7286,
      "step": 750
    },
    {
      "epoch": 0.4887459807073955,
      "grad_norm": 0.42335936427116394,
      "learning_rate": 5e-06,
      "loss": 0.7368,
      "step": 760
    },
    {
      "epoch": 0.49517684887459806,
      "grad_norm": 0.5196449160575867,
      "learning_rate": 5e-06,
      "loss": 0.7432,
      "step": 770
    },
    {
      "epoch": 0.5016077170418006,
      "grad_norm": 0.587582528591156,
      "learning_rate": 5e-06,
      "loss": 0.7405,
      "step": 780
    },
    {
      "epoch": 0.5080385852090032,
      "grad_norm": 0.5125169157981873,
      "learning_rate": 5e-06,
      "loss": 0.7346,
      "step": 790
    },
    {
      "epoch": 0.5144694533762058,
      "grad_norm": 0.42861226201057434,
      "learning_rate": 5e-06,
      "loss": 0.7343,
      "step": 800
    },
    {
      "epoch": 0.5209003215434084,
      "grad_norm": 0.41551926732063293,
      "learning_rate": 5e-06,
      "loss": 0.7401,
      "step": 810
    },
    {
      "epoch": 0.5273311897106109,
      "grad_norm": 0.511359453201294,
      "learning_rate": 5e-06,
      "loss": 0.7333,
      "step": 820
    },
    {
      "epoch": 0.5337620578778135,
      "grad_norm": 0.4694862961769104,
      "learning_rate": 5e-06,
      "loss": 0.7405,
      "step": 830
    },
    {
      "epoch": 0.5401929260450161,
      "grad_norm": 0.48257577419281006,
      "learning_rate": 5e-06,
      "loss": 0.735,
      "step": 840
    },
    {
      "epoch": 0.5466237942122186,
      "grad_norm": 0.4408928155899048,
      "learning_rate": 5e-06,
      "loss": 0.7235,
      "step": 850
    },
    {
      "epoch": 0.5530546623794212,
      "grad_norm": 0.5172277092933655,
      "learning_rate": 5e-06,
      "loss": 0.7398,
      "step": 860
    },
    {
      "epoch": 0.5594855305466238,
      "grad_norm": 0.4982483983039856,
      "learning_rate": 5e-06,
      "loss": 0.7383,
      "step": 870
    },
    {
      "epoch": 0.5659163987138264,
      "grad_norm": 0.4628170132637024,
      "learning_rate": 5e-06,
      "loss": 0.741,
      "step": 880
    },
    {
      "epoch": 0.572347266881029,
      "grad_norm": 0.46205154061317444,
      "learning_rate": 5e-06,
      "loss": 0.7304,
      "step": 890
    },
    {
      "epoch": 0.5787781350482315,
      "grad_norm": 0.4182255268096924,
      "learning_rate": 5e-06,
      "loss": 0.7351,
      "step": 900
    },
    {
      "epoch": 0.5852090032154341,
      "grad_norm": 0.5075783729553223,
      "learning_rate": 5e-06,
      "loss": 0.732,
      "step": 910
    },
    {
      "epoch": 0.5916398713826366,
      "grad_norm": 0.4244738817214966,
      "learning_rate": 5e-06,
      "loss": 0.7357,
      "step": 920
    },
    {
      "epoch": 0.5980707395498392,
      "grad_norm": 0.473067969083786,
      "learning_rate": 5e-06,
      "loss": 0.739,
      "step": 930
    },
    {
      "epoch": 0.6045016077170418,
      "grad_norm": 0.4673498272895813,
      "learning_rate": 5e-06,
      "loss": 0.7421,
      "step": 940
    },
    {
      "epoch": 0.6109324758842444,
      "grad_norm": 0.41633403301239014,
      "learning_rate": 5e-06,
      "loss": 0.733,
      "step": 950
    },
    {
      "epoch": 0.617363344051447,
      "grad_norm": 0.4132135212421417,
      "learning_rate": 5e-06,
      "loss": 0.7306,
      "step": 960
    },
    {
      "epoch": 0.6237942122186495,
      "grad_norm": 0.48543354868888855,
      "learning_rate": 5e-06,
      "loss": 0.7267,
      "step": 970
    },
    {
      "epoch": 0.6302250803858521,
      "grad_norm": 0.45808976888656616,
      "learning_rate": 5e-06,
      "loss": 0.7339,
      "step": 980
    },
    {
      "epoch": 0.6366559485530546,
      "grad_norm": 0.46231311559677124,
      "learning_rate": 5e-06,
      "loss": 0.7321,
      "step": 990
    },
    {
      "epoch": 0.6430868167202572,
      "grad_norm": 0.43952593207359314,
      "learning_rate": 5e-06,
      "loss": 0.7244,
      "step": 1000
    },
    {
      "epoch": 0.6495176848874598,
      "grad_norm": 0.4417444169521332,
      "learning_rate": 5e-06,
      "loss": 0.7339,
      "step": 1010
    },
    {
      "epoch": 0.6559485530546624,
      "grad_norm": 0.460893839597702,
      "learning_rate": 5e-06,
      "loss": 0.7317,
      "step": 1020
    },
    {
      "epoch": 0.662379421221865,
      "grad_norm": 0.4568830728530884,
      "learning_rate": 5e-06,
      "loss": 0.7294,
      "step": 1030
    },
    {
      "epoch": 0.6688102893890675,
      "grad_norm": 0.4672453999519348,
      "learning_rate": 5e-06,
      "loss": 0.7306,
      "step": 1040
    },
    {
      "epoch": 0.6752411575562701,
      "grad_norm": 0.677759051322937,
      "learning_rate": 5e-06,
      "loss": 0.7234,
      "step": 1050
    },
    {
      "epoch": 0.6816720257234726,
      "grad_norm": 0.6139099597930908,
      "learning_rate": 5e-06,
      "loss": 0.7346,
      "step": 1060
    },
    {
      "epoch": 0.6881028938906752,
      "grad_norm": 0.4375672936439514,
      "learning_rate": 5e-06,
      "loss": 0.7323,
      "step": 1070
    },
    {
      "epoch": 0.6945337620578779,
      "grad_norm": 0.39761385321617126,
      "learning_rate": 5e-06,
      "loss": 0.7348,
      "step": 1080
    },
    {
      "epoch": 0.7009646302250804,
      "grad_norm": 0.4165497422218323,
      "learning_rate": 5e-06,
      "loss": 0.7267,
      "step": 1090
    },
    {
      "epoch": 0.707395498392283,
      "grad_norm": 0.4338400661945343,
      "learning_rate": 5e-06,
      "loss": 0.7305,
      "step": 1100
    },
    {
      "epoch": 0.7138263665594855,
      "grad_norm": 0.43742889165878296,
      "learning_rate": 5e-06,
      "loss": 0.7311,
      "step": 1110
    },
    {
      "epoch": 0.7202572347266881,
      "grad_norm": 0.45418253540992737,
      "learning_rate": 5e-06,
      "loss": 0.7277,
      "step": 1120
    },
    {
      "epoch": 0.7266881028938906,
      "grad_norm": 0.42107295989990234,
      "learning_rate": 5e-06,
      "loss": 0.7319,
      "step": 1130
    },
    {
      "epoch": 0.7331189710610932,
      "grad_norm": 0.40914231538772583,
      "learning_rate": 5e-06,
      "loss": 0.7251,
      "step": 1140
    },
    {
      "epoch": 0.7395498392282959,
      "grad_norm": 0.42050373554229736,
      "learning_rate": 5e-06,
      "loss": 0.7403,
      "step": 1150
    },
    {
      "epoch": 0.7459807073954984,
      "grad_norm": 0.4320564568042755,
      "learning_rate": 5e-06,
      "loss": 0.7319,
      "step": 1160
    },
    {
      "epoch": 0.752411575562701,
      "grad_norm": 0.4363013803958893,
      "learning_rate": 5e-06,
      "loss": 0.7315,
      "step": 1170
    },
    {
      "epoch": 0.7588424437299035,
      "grad_norm": 0.50197434425354,
      "learning_rate": 5e-06,
      "loss": 0.7287,
      "step": 1180
    },
    {
      "epoch": 0.7652733118971061,
      "grad_norm": 0.5218095183372498,
      "learning_rate": 5e-06,
      "loss": 0.7294,
      "step": 1190
    },
    {
      "epoch": 0.7717041800643086,
      "grad_norm": 0.4605177044868469,
      "learning_rate": 5e-06,
      "loss": 0.7218,
      "step": 1200
    },
    {
      "epoch": 0.7781350482315113,
      "grad_norm": 0.4640117287635803,
      "learning_rate": 5e-06,
      "loss": 0.7297,
      "step": 1210
    },
    {
      "epoch": 0.7845659163987139,
      "grad_norm": 0.43073731660842896,
      "learning_rate": 5e-06,
      "loss": 0.7299,
      "step": 1220
    },
    {
      "epoch": 0.7909967845659164,
      "grad_norm": 0.43865692615509033,
      "learning_rate": 5e-06,
      "loss": 0.7299,
      "step": 1230
    },
    {
      "epoch": 0.797427652733119,
      "grad_norm": 0.4125821590423584,
      "learning_rate": 5e-06,
      "loss": 0.738,
      "step": 1240
    },
    {
      "epoch": 0.8038585209003215,
      "grad_norm": 0.41375529766082764,
      "learning_rate": 5e-06,
      "loss": 0.7324,
      "step": 1250
    },
    {
      "epoch": 0.8102893890675241,
      "grad_norm": 0.43837442994117737,
      "learning_rate": 5e-06,
      "loss": 0.7219,
      "step": 1260
    },
    {
      "epoch": 0.8167202572347267,
      "grad_norm": 0.45211347937583923,
      "learning_rate": 5e-06,
      "loss": 0.725,
      "step": 1270
    },
    {
      "epoch": 0.8231511254019293,
      "grad_norm": 0.4578869640827179,
      "learning_rate": 5e-06,
      "loss": 0.7307,
      "step": 1280
    },
    {
      "epoch": 0.8295819935691319,
      "grad_norm": 0.42134717106819153,
      "learning_rate": 5e-06,
      "loss": 0.7203,
      "step": 1290
    },
    {
      "epoch": 0.8360128617363344,
      "grad_norm": 0.4580245912075043,
      "learning_rate": 5e-06,
      "loss": 0.7242,
      "step": 1300
    },
    {
      "epoch": 0.842443729903537,
      "grad_norm": 0.4085647165775299,
      "learning_rate": 5e-06,
      "loss": 0.7182,
      "step": 1310
    },
    {
      "epoch": 0.8488745980707395,
      "grad_norm": 0.5211206674575806,
      "learning_rate": 5e-06,
      "loss": 0.7259,
      "step": 1320
    },
    {
      "epoch": 0.8553054662379421,
      "grad_norm": 0.5008974671363831,
      "learning_rate": 5e-06,
      "loss": 0.7164,
      "step": 1330
    },
    {
      "epoch": 0.8617363344051447,
      "grad_norm": 0.4538878798484802,
      "learning_rate": 5e-06,
      "loss": 0.7198,
      "step": 1340
    },
    {
      "epoch": 0.8681672025723473,
      "grad_norm": 0.407697468996048,
      "learning_rate": 5e-06,
      "loss": 0.7255,
      "step": 1350
    },
    {
      "epoch": 0.8745980707395499,
      "grad_norm": 0.4785331189632416,
      "learning_rate": 5e-06,
      "loss": 0.7273,
      "step": 1360
    },
    {
      "epoch": 0.8810289389067524,
      "grad_norm": 0.4148114323616028,
      "learning_rate": 5e-06,
      "loss": 0.7247,
      "step": 1370
    },
    {
      "epoch": 0.887459807073955,
      "grad_norm": 0.38368263840675354,
      "learning_rate": 5e-06,
      "loss": 0.7268,
      "step": 1380
    },
    {
      "epoch": 0.8938906752411575,
      "grad_norm": 0.40352755784988403,
      "learning_rate": 5e-06,
      "loss": 0.7285,
      "step": 1390
    },
    {
      "epoch": 0.9003215434083601,
      "grad_norm": 0.4232112467288971,
      "learning_rate": 5e-06,
      "loss": 0.7249,
      "step": 1400
    },
    {
      "epoch": 0.9067524115755627,
      "grad_norm": 0.4366598129272461,
      "learning_rate": 5e-06,
      "loss": 0.7249,
      "step": 1410
    },
    {
      "epoch": 0.9131832797427653,
      "grad_norm": 0.4052753150463104,
      "learning_rate": 5e-06,
      "loss": 0.7245,
      "step": 1420
    },
    {
      "epoch": 0.9196141479099679,
      "grad_norm": 0.44511350989341736,
      "learning_rate": 5e-06,
      "loss": 0.7305,
      "step": 1430
    },
    {
      "epoch": 0.9260450160771704,
      "grad_norm": 0.44077491760253906,
      "learning_rate": 5e-06,
      "loss": 0.7294,
      "step": 1440
    },
    {
      "epoch": 0.932475884244373,
      "grad_norm": 0.48145055770874023,
      "learning_rate": 5e-06,
      "loss": 0.7215,
      "step": 1450
    },
    {
      "epoch": 0.9389067524115756,
      "grad_norm": 0.42547109723091125,
      "learning_rate": 5e-06,
      "loss": 0.7292,
      "step": 1460
    },
    {
      "epoch": 0.9453376205787781,
      "grad_norm": 0.4287051558494568,
      "learning_rate": 5e-06,
      "loss": 0.7155,
      "step": 1470
    },
    {
      "epoch": 0.9517684887459807,
      "grad_norm": 0.4230569005012512,
      "learning_rate": 5e-06,
      "loss": 0.7198,
      "step": 1480
    },
    {
      "epoch": 0.9581993569131833,
      "grad_norm": 0.4719720184803009,
      "learning_rate": 5e-06,
      "loss": 0.7297,
      "step": 1490
    },
    {
      "epoch": 0.9646302250803859,
      "grad_norm": 0.43969443440437317,
      "learning_rate": 5e-06,
      "loss": 0.7287,
      "step": 1500
    },
    {
      "epoch": 0.9710610932475884,
      "grad_norm": 0.4330804646015167,
      "learning_rate": 5e-06,
      "loss": 0.7209,
      "step": 1510
    },
    {
      "epoch": 0.977491961414791,
      "grad_norm": 0.38477885723114014,
      "learning_rate": 5e-06,
      "loss": 0.7246,
      "step": 1520
    },
    {
      "epoch": 0.9839228295819936,
      "grad_norm": 0.46622517704963684,
      "learning_rate": 5e-06,
      "loss": 0.7281,
      "step": 1530
    },
    {
      "epoch": 0.9903536977491961,
      "grad_norm": 0.47588425874710083,
      "learning_rate": 5e-06,
      "loss": 0.7264,
      "step": 1540
    },
    {
      "epoch": 0.9967845659163987,
      "grad_norm": 0.43833327293395996,
      "learning_rate": 5e-06,
      "loss": 0.7345,
      "step": 1550
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.7232183814048767,
      "eval_runtime": 140.4382,
      "eval_samples_per_second": 298.373,
      "eval_steps_per_second": 1.168,
      "step": 1555
    },
    {
      "epoch": 1.0032154340836013,
      "grad_norm": 0.6193146705627441,
      "learning_rate": 5e-06,
      "loss": 0.7137,
      "step": 1560
    },
    {
      "epoch": 1.0096463022508038,
      "grad_norm": 0.4350009560585022,
      "learning_rate": 5e-06,
      "loss": 0.682,
      "step": 1570
    },
    {
      "epoch": 1.0160771704180065,
      "grad_norm": 0.44338950514793396,
      "learning_rate": 5e-06,
      "loss": 0.6971,
      "step": 1580
    },
    {
      "epoch": 1.022508038585209,
      "grad_norm": 0.4447615444660187,
      "learning_rate": 5e-06,
      "loss": 0.687,
      "step": 1590
    },
    {
      "epoch": 1.0289389067524115,
      "grad_norm": 0.4388461112976074,
      "learning_rate": 5e-06,
      "loss": 0.6841,
      "step": 1600
    },
    {
      "epoch": 1.0353697749196142,
      "grad_norm": 0.523465096950531,
      "learning_rate": 5e-06,
      "loss": 0.6922,
      "step": 1610
    },
    {
      "epoch": 1.0418006430868167,
      "grad_norm": 0.46927884221076965,
      "learning_rate": 5e-06,
      "loss": 0.678,
      "step": 1620
    },
    {
      "epoch": 1.0482315112540193,
      "grad_norm": 0.47517913579940796,
      "learning_rate": 5e-06,
      "loss": 0.6735,
      "step": 1630
    },
    {
      "epoch": 1.0546623794212218,
      "grad_norm": 0.42782580852508545,
      "learning_rate": 5e-06,
      "loss": 0.6835,
      "step": 1640
    },
    {
      "epoch": 1.0610932475884245,
      "grad_norm": 0.4523870348930359,
      "learning_rate": 5e-06,
      "loss": 0.6902,
      "step": 1650
    },
    {
      "epoch": 1.067524115755627,
      "grad_norm": 0.46784067153930664,
      "learning_rate": 5e-06,
      "loss": 0.6814,
      "step": 1660
    },
    {
      "epoch": 1.0739549839228295,
      "grad_norm": 0.43110141158103943,
      "learning_rate": 5e-06,
      "loss": 0.6793,
      "step": 1670
    },
    {
      "epoch": 1.0803858520900322,
      "grad_norm": 0.44774794578552246,
      "learning_rate": 5e-06,
      "loss": 0.6853,
      "step": 1680
    },
    {
      "epoch": 1.0868167202572347,
      "grad_norm": 0.5212559103965759,
      "learning_rate": 5e-06,
      "loss": 0.6863,
      "step": 1690
    },
    {
      "epoch": 1.0932475884244373,
      "grad_norm": 0.49354031682014465,
      "learning_rate": 5e-06,
      "loss": 0.68,
      "step": 1700
    },
    {
      "epoch": 1.09967845659164,
      "grad_norm": 0.4270998239517212,
      "learning_rate": 5e-06,
      "loss": 0.6845,
      "step": 1710
    },
    {
      "epoch": 1.1061093247588425,
      "grad_norm": 0.43898463249206543,
      "learning_rate": 5e-06,
      "loss": 0.683,
      "step": 1720
    },
    {
      "epoch": 1.112540192926045,
      "grad_norm": 0.413997083902359,
      "learning_rate": 5e-06,
      "loss": 0.6755,
      "step": 1730
    },
    {
      "epoch": 1.1189710610932475,
      "grad_norm": 0.4593328833580017,
      "learning_rate": 5e-06,
      "loss": 0.6888,
      "step": 1740
    },
    {
      "epoch": 1.1254019292604502,
      "grad_norm": 0.47585126757621765,
      "learning_rate": 5e-06,
      "loss": 0.6865,
      "step": 1750
    },
    {
      "epoch": 1.1318327974276527,
      "grad_norm": 0.4210260808467865,
      "learning_rate": 5e-06,
      "loss": 0.6833,
      "step": 1760
    },
    {
      "epoch": 1.1382636655948553,
      "grad_norm": 0.44741642475128174,
      "learning_rate": 5e-06,
      "loss": 0.6849,
      "step": 1770
    },
    {
      "epoch": 1.144694533762058,
      "grad_norm": 0.437013179063797,
      "learning_rate": 5e-06,
      "loss": 0.6876,
      "step": 1780
    },
    {
      "epoch": 1.1511254019292605,
      "grad_norm": 0.45807886123657227,
      "learning_rate": 5e-06,
      "loss": 0.6858,
      "step": 1790
    },
    {
      "epoch": 1.157556270096463,
      "grad_norm": 0.46383801102638245,
      "learning_rate": 5e-06,
      "loss": 0.6718,
      "step": 1800
    },
    {
      "epoch": 1.1639871382636655,
      "grad_norm": 0.5218130946159363,
      "learning_rate": 5e-06,
      "loss": 0.6831,
      "step": 1810
    },
    {
      "epoch": 1.1704180064308682,
      "grad_norm": 0.4491232931613922,
      "learning_rate": 5e-06,
      "loss": 0.6837,
      "step": 1820
    },
    {
      "epoch": 1.1768488745980707,
      "grad_norm": 0.45633968710899353,
      "learning_rate": 5e-06,
      "loss": 0.6782,
      "step": 1830
    },
    {
      "epoch": 1.1832797427652733,
      "grad_norm": 0.4493134915828705,
      "learning_rate": 5e-06,
      "loss": 0.685,
      "step": 1840
    },
    {
      "epoch": 1.189710610932476,
      "grad_norm": 0.42465460300445557,
      "learning_rate": 5e-06,
      "loss": 0.6878,
      "step": 1850
    },
    {
      "epoch": 1.1961414790996785,
      "grad_norm": 0.4019971787929535,
      "learning_rate": 5e-06,
      "loss": 0.677,
      "step": 1860
    },
    {
      "epoch": 1.202572347266881,
      "grad_norm": 0.45171162486076355,
      "learning_rate": 5e-06,
      "loss": 0.686,
      "step": 1870
    },
    {
      "epoch": 1.2090032154340835,
      "grad_norm": 0.43534427881240845,
      "learning_rate": 5e-06,
      "loss": 0.6816,
      "step": 1880
    },
    {
      "epoch": 1.2154340836012862,
      "grad_norm": 0.4472478926181793,
      "learning_rate": 5e-06,
      "loss": 0.6839,
      "step": 1890
    },
    {
      "epoch": 1.2218649517684887,
      "grad_norm": 0.41748490929603577,
      "learning_rate": 5e-06,
      "loss": 0.6913,
      "step": 1900
    },
    {
      "epoch": 1.2282958199356913,
      "grad_norm": 0.4147058427333832,
      "learning_rate": 5e-06,
      "loss": 0.681,
      "step": 1910
    },
    {
      "epoch": 1.234726688102894,
      "grad_norm": 0.41148611903190613,
      "learning_rate": 5e-06,
      "loss": 0.6819,
      "step": 1920
    },
    {
      "epoch": 1.2411575562700965,
      "grad_norm": 0.4393286406993866,
      "learning_rate": 5e-06,
      "loss": 0.6903,
      "step": 1930
    },
    {
      "epoch": 1.247588424437299,
      "grad_norm": 0.4369371235370636,
      "learning_rate": 5e-06,
      "loss": 0.6872,
      "step": 1940
    },
    {
      "epoch": 1.2540192926045015,
      "grad_norm": 0.5068768858909607,
      "learning_rate": 5e-06,
      "loss": 0.6869,
      "step": 1950
    },
    {
      "epoch": 1.2604501607717042,
      "grad_norm": 0.43698254227638245,
      "learning_rate": 5e-06,
      "loss": 0.6853,
      "step": 1960
    },
    {
      "epoch": 1.2668810289389068,
      "grad_norm": 0.439619779586792,
      "learning_rate": 5e-06,
      "loss": 0.681,
      "step": 1970
    },
    {
      "epoch": 1.2733118971061093,
      "grad_norm": 0.4120563566684723,
      "learning_rate": 5e-06,
      "loss": 0.6815,
      "step": 1980
    },
    {
      "epoch": 1.279742765273312,
      "grad_norm": 0.39503830671310425,
      "learning_rate": 5e-06,
      "loss": 0.6789,
      "step": 1990
    },
    {
      "epoch": 1.2861736334405145,
      "grad_norm": 0.46114471554756165,
      "learning_rate": 5e-06,
      "loss": 0.6858,
      "step": 2000
    },
    {
      "epoch": 1.292604501607717,
      "grad_norm": 0.44008398056030273,
      "learning_rate": 5e-06,
      "loss": 0.6916,
      "step": 2010
    },
    {
      "epoch": 1.2990353697749195,
      "grad_norm": 0.41201072931289673,
      "learning_rate": 5e-06,
      "loss": 0.6832,
      "step": 2020
    },
    {
      "epoch": 1.3054662379421222,
      "grad_norm": 0.42483213543891907,
      "learning_rate": 5e-06,
      "loss": 0.6876,
      "step": 2030
    },
    {
      "epoch": 1.3118971061093248,
      "grad_norm": 0.4412234127521515,
      "learning_rate": 5e-06,
      "loss": 0.6736,
      "step": 2040
    },
    {
      "epoch": 1.3183279742765273,
      "grad_norm": 0.42483067512512207,
      "learning_rate": 5e-06,
      "loss": 0.6866,
      "step": 2050
    },
    {
      "epoch": 1.32475884244373,
      "grad_norm": 0.412567675113678,
      "learning_rate": 5e-06,
      "loss": 0.6877,
      "step": 2060
    },
    {
      "epoch": 1.3311897106109325,
      "grad_norm": 0.4586575925350189,
      "learning_rate": 5e-06,
      "loss": 0.6898,
      "step": 2070
    },
    {
      "epoch": 1.337620578778135,
      "grad_norm": 0.41622015833854675,
| "learning_rate": 5e-06, | |
| "loss": 0.6836, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 1.3440514469453375, | |
| "grad_norm": 0.4891818165779114, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6825, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 1.3504823151125402, | |
| "grad_norm": 0.42719897627830505, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6915, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.3569131832797428, | |
| "grad_norm": 0.4267047643661499, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6829, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 1.3633440514469453, | |
| "grad_norm": 0.48593926429748535, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6865, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 1.369774919614148, | |
| "grad_norm": 0.46337270736694336, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6784, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 1.3762057877813505, | |
| "grad_norm": 0.42621052265167236, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6815, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 1.382636655948553, | |
| "grad_norm": 0.4374221861362457, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6823, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.3890675241157555, | |
| "grad_norm": 0.49710190296173096, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6788, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 1.3954983922829582, | |
| "grad_norm": 0.43773987889289856, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6897, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 1.4019292604501608, | |
| "grad_norm": 0.420480877161026, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6877, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 1.4083601286173635, | |
| "grad_norm": 0.41698333621025085, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6782, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 1.414790996784566, | |
| "grad_norm": 0.45082971453666687, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6884, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.4212218649517685, | |
| "grad_norm": 0.44047510623931885, | |
| "learning_rate": 5e-06, | |
| "loss": 0.68, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 1.427652733118971, | |
| "grad_norm": 0.43505367636680603, | |
| "learning_rate": 5e-06, | |
| "loss": 0.684, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 1.4340836012861735, | |
| "grad_norm": 0.42423659563064575, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6852, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 1.4405144694533762, | |
| "grad_norm": 0.5248254537582397, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6796, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 1.4469453376205788, | |
| "grad_norm": 0.47595280408859253, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6812, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.4533762057877815, | |
| "grad_norm": 0.41155120730400085, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6894, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 1.459807073954984, | |
| "grad_norm": 0.4290079176425934, | |
| "learning_rate": 5e-06, | |
| "loss": 0.684, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 1.4662379421221865, | |
| "grad_norm": 0.4066310226917267, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6823, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 1.472668810289389, | |
| "grad_norm": 0.4360986351966858, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6886, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 1.4790996784565915, | |
| "grad_norm": 0.3978036642074585, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6849, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.4855305466237942, | |
| "grad_norm": 0.41997137665748596, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6834, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 1.4919614147909968, | |
| "grad_norm": 0.4409499764442444, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6873, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 1.4983922829581995, | |
| "grad_norm": 0.44297707080841064, | |
| "learning_rate": 5e-06, | |
| "loss": 0.682, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 1.504823151125402, | |
| "grad_norm": 0.4389215409755707, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6892, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 1.5112540192926045, | |
| "grad_norm": 0.44480717182159424, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6884, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.517684887459807, | |
| "grad_norm": 0.44355508685112, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6876, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 1.5241157556270095, | |
| "grad_norm": 0.46087756752967834, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6828, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 1.5305466237942122, | |
| "grad_norm": 0.4672592580318451, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6871, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 1.5369774919614148, | |
| "grad_norm": 0.3982642889022827, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6819, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 1.5434083601286175, | |
| "grad_norm": 0.40598487854003906, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6838, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.54983922829582, | |
| "grad_norm": 0.4345811605453491, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6815, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 1.5562700964630225, | |
| "grad_norm": 0.43568089604377747, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6911, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 1.562700964630225, | |
| "grad_norm": 0.43922844529151917, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6834, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 1.5691318327974275, | |
| "grad_norm": 0.4793626368045807, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6833, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 1.5755627009646302, | |
| "grad_norm": 0.45686307549476624, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6864, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 1.5819935691318328, | |
| "grad_norm": 0.41061604022979736, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6768, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 1.5884244372990355, | |
| "grad_norm": 0.45562943816185, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6743, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 1.594855305466238, | |
| "grad_norm": 0.4296809136867523, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6802, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 1.6012861736334405, | |
| "grad_norm": 0.4218539595603943, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6857, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 1.607717041800643, | |
| "grad_norm": 0.45991575717926025, | |
| "learning_rate": 5e-06, | |
| "loss": 0.684, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.6141479099678455, | |
| "grad_norm": 0.43671083450317383, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6872, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 1.6205787781350482, | |
| "grad_norm": 0.4566083550453186, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6808, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 1.6270096463022508, | |
| "grad_norm": 0.4394720792770386, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6841, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 1.6334405144694535, | |
| "grad_norm": 0.4831625521183014, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6846, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 1.639871382636656, | |
| "grad_norm": 0.4193684458732605, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6858, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 1.6463022508038585, | |
| "grad_norm": 0.47811901569366455, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6758, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 1.652733118971061, | |
| "grad_norm": 0.4748084545135498, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6861, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 1.6591639871382635, | |
| "grad_norm": 0.42245808243751526, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6745, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 1.6655948553054662, | |
| "grad_norm": 0.4233896732330322, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6862, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 1.6720257234726688, | |
| "grad_norm": 0.4050581455230713, | |
| "learning_rate": 5e-06, | |
| "loss": 0.676, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.6784565916398715, | |
| "grad_norm": 0.41761529445648193, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6802, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 1.684887459807074, | |
| "grad_norm": 0.4060108959674835, | |
| "learning_rate": 5e-06, | |
| "loss": 0.684, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 1.6913183279742765, | |
| "grad_norm": 0.40532317757606506, | |
| "learning_rate": 5e-06, | |
| "loss": 0.68, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 1.697749196141479, | |
| "grad_norm": 0.47850868105888367, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6879, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 1.7041800643086815, | |
| "grad_norm": 0.4595736861228943, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6861, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 1.7106109324758842, | |
| "grad_norm": 0.3999944031238556, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6811, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 1.717041800643087, | |
| "grad_norm": 0.4152866303920746, | |
| "learning_rate": 5e-06, | |
| "loss": 0.676, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 1.7234726688102895, | |
| "grad_norm": 0.5088204741477966, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6868, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 1.729903536977492, | |
| "grad_norm": 0.4383102059364319, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6818, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 1.7363344051446945, | |
| "grad_norm": 0.4219019114971161, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6767, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.742765273311897, | |
| "grad_norm": 0.4663357138633728, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6783, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 1.7491961414790995, | |
| "grad_norm": 0.43959125876426697, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6824, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 1.7556270096463023, | |
| "grad_norm": 0.43940484523773193, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6769, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 1.762057877813505, | |
| "grad_norm": 0.4071805477142334, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6893, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 1.7684887459807075, | |
| "grad_norm": 0.40912047028541565, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6877, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 1.77491961414791, | |
| "grad_norm": 0.4339078366756439, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6788, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 1.7813504823151125, | |
| "grad_norm": 0.4326707720756531, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6831, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 1.787781350482315, | |
| "grad_norm": 0.4307939410209656, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6878, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 1.7942122186495175, | |
| "grad_norm": 0.3918740749359131, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6912, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 1.8006430868167203, | |
| "grad_norm": 0.40045326948165894, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6852, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.807073954983923, | |
| "grad_norm": 0.40693676471710205, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6842, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 1.8135048231511255, | |
| "grad_norm": 0.3957662284374237, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6777, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 1.819935691318328, | |
| "grad_norm": 0.4147331714630127, | |
| "learning_rate": 5e-06, | |
| "loss": 0.679, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 1.8263665594855305, | |
| "grad_norm": 0.421351820230484, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6872, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 1.832797427652733, | |
| "grad_norm": 0.4355145990848541, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6847, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 1.8392282958199357, | |
| "grad_norm": 0.41382715106010437, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6777, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 1.8456591639871383, | |
| "grad_norm": 0.4280456006526947, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6781, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 1.852090032154341, | |
| "grad_norm": 0.49057596921920776, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6844, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 1.8585209003215435, | |
| "grad_norm": 0.41203203797340393, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6858, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 1.864951768488746, | |
| "grad_norm": 0.4137505888938904, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6808, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.8713826366559485, | |
| "grad_norm": 0.46201327443122864, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6798, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 1.877813504823151, | |
| "grad_norm": 0.4486853778362274, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6862, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 1.8842443729903537, | |
| "grad_norm": 0.4082019329071045, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6843, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 1.8906752411575563, | |
| "grad_norm": 0.48371437191963196, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6781, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 1.897106109324759, | |
| "grad_norm": 0.4703909456729889, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6843, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 1.9035369774919615, | |
| "grad_norm": 0.40100324153900146, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6838, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 1.909967845659164, | |
| "grad_norm": 0.4890632629394531, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6771, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 1.9163987138263665, | |
| "grad_norm": 0.4435786008834839, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6812, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 1.922829581993569, | |
| "grad_norm": 0.43092137575149536, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6723, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 1.9292604501607717, | |
| "grad_norm": 0.45887503027915955, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6811, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.9356913183279743, | |
| "grad_norm": 0.4193710386753082, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6801, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 1.942122186495177, | |
| "grad_norm": 0.45064231753349304, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6718, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 1.9485530546623795, | |
| "grad_norm": 0.42754465341567993, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6736, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 1.954983922829582, | |
| "grad_norm": 0.41854625940322876, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6818, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 1.9614147909967845, | |
| "grad_norm": 0.4096625745296478, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6787, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 1.967845659163987, | |
| "grad_norm": 0.41215813159942627, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6752, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 1.9742765273311897, | |
| "grad_norm": 0.47351738810539246, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6835, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 1.9807073954983923, | |
| "grad_norm": 0.4130474925041199, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6803, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 1.987138263665595, | |
| "grad_norm": 0.41312044858932495, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6729, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 1.9935691318327975, | |
| "grad_norm": 0.40591350197792053, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6722, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.40826907753944397, | |
| "learning_rate": 5e-06, | |
| "loss": 0.686, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.7126035690307617, | |
| "eval_runtime": 139.886, | |
| "eval_samples_per_second": 299.551, | |
| "eval_steps_per_second": 1.172, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 2.0064308681672025, | |
| "grad_norm": 0.561937153339386, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6404, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 2.012861736334405, | |
| "grad_norm": 0.5182865858078003, | |
| "learning_rate": 5e-06, | |
| "loss": 0.64, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 2.0192926045016075, | |
| "grad_norm": 0.5018469095230103, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6359, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 2.0257234726688105, | |
| "grad_norm": 0.4944702982902527, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6381, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 2.032154340836013, | |
| "grad_norm": 0.4453904330730438, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6372, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 2.0385852090032155, | |
| "grad_norm": 0.43754857778549194, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6394, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 2.045016077170418, | |
| "grad_norm": 0.43891504406929016, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6429, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 2.0514469453376205, | |
| "grad_norm": 0.42905229330062866, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6352, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 2.057877813504823, | |
| "grad_norm": 0.4976702928543091, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6394, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 2.0643086816720255, | |
| "grad_norm": 0.4487721621990204, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6367, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 2.0707395498392285, | |
| "grad_norm": 0.48409563302993774, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6364, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 2.077170418006431, | |
| "grad_norm": 0.4372307360172272, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6441, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 2.0836012861736335, | |
| "grad_norm": 0.46748873591423035, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6364, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 2.090032154340836, | |
| "grad_norm": 0.46158337593078613, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6334, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 2.0964630225080385, | |
| "grad_norm": 0.43315762281417847, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6395, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 2.102893890675241, | |
| "grad_norm": 0.4499836564064026, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6444, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 2.1093247588424435, | |
| "grad_norm": 0.45043113827705383, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6384, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 2.1157556270096465, | |
| "grad_norm": 0.4223954677581787, | |
| "learning_rate": 5e-06, | |
| "loss": 0.638, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 2.122186495176849, | |
| "grad_norm": 0.45464426279067993, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6466, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 2.1286173633440515, | |
| "grad_norm": 0.4449687898159027, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6297, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 2.135048231511254, | |
| "grad_norm": 0.4286421537399292, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6408, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 2.1414790996784565, | |
| "grad_norm": 0.4531044065952301, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6416, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 2.147909967845659, | |
| "grad_norm": 0.44140374660491943, | |
| "learning_rate": 5e-06, | |
| "loss": 0.636, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 2.154340836012862, | |
| "grad_norm": 0.4317072927951813, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6395, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 2.1607717041800645, | |
| "grad_norm": 0.47132372856140137, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6233, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 2.167202572347267, | |
| "grad_norm": 0.4424266815185547, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6417, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 2.1736334405144695, | |
| "grad_norm": 0.42858994007110596, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6418, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 2.180064308681672, | |
| "grad_norm": 0.4775191843509674, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6369, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 2.1864951768488745, | |
| "grad_norm": 0.46210339665412903, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6427, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 2.192926045016077, | |
| "grad_norm": 0.46885842084884644, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6456, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 2.19935691318328, | |
| "grad_norm": 0.4760117828845978, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6435, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 2.2057877813504825, | |
| "grad_norm": 0.5036518573760986, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6383, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 2.212218649517685, | |
| "grad_norm": 0.4869937002658844, | |
| "learning_rate": 5e-06, | |
| "loss": 0.634, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 2.2186495176848875, | |
| "grad_norm": 0.4235919117927551, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6383, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 2.22508038585209, | |
| "grad_norm": 0.4805637001991272, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6413, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 2.2315112540192925, | |
| "grad_norm": 0.4369070827960968, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6348, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 2.237942122186495, | |
| "grad_norm": 0.43729808926582336, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6489, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 2.244372990353698, | |
| "grad_norm": 0.514873206615448, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6451, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 2.2508038585209005, | |
| "grad_norm": 0.4235300123691559, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6355, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 2.257234726688103, | |
| "grad_norm": 0.4553775191307068, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6374, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 2.2636655948553055, | |
| "grad_norm": 0.444014310836792, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6504, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 2.270096463022508, | |
| "grad_norm": 0.4582684636116028, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6338, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 2.2765273311897105, | |
| "grad_norm": 0.5020269751548767, | |
| "learning_rate": 5e-06, | |
| "loss": 0.634, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 2.282958199356913, | |
| "grad_norm": 0.44650107622146606, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6382, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 2.289389067524116, | |
| "grad_norm": 0.46265995502471924, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6351, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 2.2958199356913185, | |
| "grad_norm": 0.44239240884780884, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6406, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 2.302250803858521, | |
| "grad_norm": 0.49851298332214355, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6373, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 2.3086816720257235, | |
| "grad_norm": 0.44977161288261414, | |
| "learning_rate": 5e-06, | |
| "loss": 0.637, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 2.315112540192926, | |
| "grad_norm": 0.47006750106811523, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6429, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 2.3215434083601285, | |
| "grad_norm": 0.43386831879615784, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6411, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 2.327974276527331, | |
| "grad_norm": 0.4304748475551605, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6404, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 2.334405144694534, | |
| "grad_norm": 0.4528113901615143, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6409, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 2.3408360128617365, | |
| "grad_norm": 0.43319591879844666, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6397, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 2.347266881028939, | |
| "grad_norm": 0.45046094059944153, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6437, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 2.3536977491961415, | |
| "grad_norm": 0.4646282494068146, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6473, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 2.360128617363344, | |
| "grad_norm": 0.4470469355583191, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6337, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 2.3665594855305465, | |
| "grad_norm": 0.45889395475387573, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6393, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 2.372990353697749, | |
| "grad_norm": 0.41356027126312256, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6354, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 2.379421221864952, | |
| "grad_norm": 0.4529729187488556, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6403, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 2.3858520900321545, | |
| "grad_norm": 0.4174403250217438, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6377, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 2.392282958199357, | |
| "grad_norm": 0.4575382173061371, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6391, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 2.3987138263665595, | |
| "grad_norm": 0.4538198709487915, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6415, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 2.405144694533762, | |
| "grad_norm": 0.4399804472923279, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6348, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 2.4115755627009645, | |
| "grad_norm": 0.4382459819316864, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6415, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 2.418006430868167, | |
| "grad_norm": 0.42854952812194824, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6377, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 2.42443729903537, | |
| "grad_norm": 0.4322606921195984, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6435, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 2.4308681672025725, | |
| "grad_norm": 0.45412078499794006, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6401, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 2.437299035369775, | |
| "grad_norm": 0.4531524181365967, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6454, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 2.4437299035369775, | |
| "grad_norm": 0.44421976804733276, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6484, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 2.45016077170418, | |
| "grad_norm": 0.4720653295516968, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6431, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 2.4565916398713825, | |
| "grad_norm": 0.4742264747619629, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6371, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 2.463022508038585, | |
| "grad_norm": 0.43964919447898865, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6398, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 2.469453376205788, | |
| "grad_norm": 0.43848881125450134, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6419, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 2.4758842443729905, | |
| "grad_norm": 0.4461026191711426, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6379, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 2.482315112540193, | |
| "grad_norm": 0.4415052831172943, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6439, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 2.4887459807073955, | |
| "grad_norm": 0.4191666543483734, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6446, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 2.495176848874598, | |
| "grad_norm": 0.43842655420303345, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6439, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 2.5016077170418005, | |
| "grad_norm": 0.4282081127166748, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6479, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 2.508038585209003, | |
| "grad_norm": 0.4626299738883972, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6448, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 2.514469453376206, | |
| "grad_norm": 0.44208213686943054, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6439, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 2.5209003215434085, | |
| "grad_norm": 0.43402865529060364, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6369, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 2.527331189710611, | |
| "grad_norm": 0.475366473197937, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6446, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 2.5337620578778135, | |
| "grad_norm": 0.4425564408302307, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6384, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 2.540192926045016, | |
| "grad_norm": 0.43618571758270264, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6408, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 2.5466237942122185, | |
| "grad_norm": 0.49698591232299805, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6303, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 2.553054662379421, | |
| "grad_norm": 0.5115300416946411, | |
| "learning_rate": 5e-06, | |
| "loss": 0.651, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 2.559485530546624, | |
| "grad_norm": 0.6572542190551758, | |
| "learning_rate": 5e-06, | |
| "loss": 0.643, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 2.5659163987138265, | |
| "grad_norm": 0.42369475960731506, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6388, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 2.572347266881029, | |
| "grad_norm": 0.44530463218688965, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6418, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 2.5787781350482315, | |
| "grad_norm": 0.49780553579330444, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6334, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 2.585209003215434, | |
| "grad_norm": 0.45106691122055054, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6398, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 2.5916398713826365, | |
| "grad_norm": 0.43149319291114807, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6392, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 2.598070739549839, | |
| "grad_norm": 0.44727641344070435, | |
| "learning_rate": 5e-06, | |
| "loss": 0.643, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 2.604501607717042, | |
| "grad_norm": 0.46136513352394104, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6407, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 2.6109324758842445, | |
| "grad_norm": 0.43373093008995056, | |
| "learning_rate": 5e-06, | |
| "loss": 0.635, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 2.617363344051447, | |
| "grad_norm": 0.42928022146224976, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6364, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 2.6237942122186495, | |
| "grad_norm": 0.4322867691516876, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6411, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 2.630225080385852, | |
| "grad_norm": 0.4528902471065521, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6382, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 2.6366559485530545, | |
| "grad_norm": 0.45928648114204407, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6429, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 2.643086816720257, | |
| "grad_norm": 0.4491860270500183, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6378, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 2.64951768488746, | |
| "grad_norm": 0.44847381114959717, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6444, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 2.6559485530546625, | |
| "grad_norm": 0.42893797159194946, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6424, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 2.662379421221865, | |
| "grad_norm": 0.4790765345096588, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6402, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 2.6688102893890675, | |
| "grad_norm": 0.4363596737384796, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6402, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 2.67524115755627, | |
| "grad_norm": 0.4559105634689331, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6475, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 2.6816720257234725, | |
| "grad_norm": 0.4630337059497833, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6443, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 2.688102893890675, | |
| "grad_norm": 0.4628975987434387, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6425, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 2.694533762057878, | |
| "grad_norm": 0.507043719291687, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6385, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 2.7009646302250805, | |
| "grad_norm": 0.4438357353210449, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6415, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 2.707395498392283, | |
| "grad_norm": 0.4772716462612152, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6392, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 2.7138263665594855, | |
| "grad_norm": 0.4417891502380371, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6365, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 2.720257234726688, | |
| "grad_norm": 0.44056421518325806, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6365, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 2.7266881028938905, | |
| "grad_norm": 0.4614117741584778, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6425, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 2.733118971061093, | |
| "grad_norm": 0.5050578117370605, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6471, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 2.739549839228296, | |
| "grad_norm": 0.4652516543865204, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6356, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 2.7459807073954985, | |
| "grad_norm": 0.44051194190979004, | |
| "learning_rate": 5e-06, | |
| "loss": 0.647, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 2.752411575562701, | |
| "grad_norm": 0.45722419023513794, | |
| "learning_rate": 5e-06, | |
| "loss": 0.64, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 2.7588424437299035, | |
| "grad_norm": 0.4401618242263794, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6439, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 2.765273311897106, | |
| "grad_norm": 0.4500727653503418, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6355, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 2.7717041800643085, | |
| "grad_norm": 0.44464367628097534, | |
| "learning_rate": 5e-06, | |
| "loss": 0.641, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 2.778135048231511, | |
| "grad_norm": 0.433173805475235, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6479, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 2.784565916398714, | |
| "grad_norm": 0.44234412908554077, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6474, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 2.7909967845659165, | |
| "grad_norm": 0.4479934275150299, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6428, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 2.797427652733119, | |
| "grad_norm": 0.44619840383529663, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6441, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 2.8038585209003215, | |
| "grad_norm": 0.4315251111984253, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6477, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 2.810289389067524, | |
| "grad_norm": 0.43586409091949463, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6504, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 2.816720257234727, | |
| "grad_norm": 0.4625302851200104, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6523, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 2.823151125401929, | |
| "grad_norm": 0.46072930097579956, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6423, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 2.829581993569132, | |
| "grad_norm": 0.4475337266921997, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6483, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 2.8360128617363345, | |
| "grad_norm": 0.43071475625038147, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6426, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 2.842443729903537, | |
| "grad_norm": 0.4402061700820923, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6383, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 2.8488745980707395, | |
| "grad_norm": 0.47659188508987427, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6378, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 2.855305466237942, | |
| "grad_norm": 0.44324415922164917, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6445, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 2.861736334405145, | |
| "grad_norm": 0.5019335746765137, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6429, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 2.868167202572347, | |
| "grad_norm": 0.47943827509880066, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6403, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 2.87459807073955, | |
| "grad_norm": 0.4461328387260437, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6423, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 2.8810289389067525, | |
| "grad_norm": 0.4263225495815277, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6379, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 2.887459807073955, | |
| "grad_norm": 0.45077577233314514, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6419, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 2.8938906752411575, | |
| "grad_norm": 0.4050474762916565, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6361, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 2.90032154340836, | |
| "grad_norm": 0.4340299069881439, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6386, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 2.906752411575563, | |
| "grad_norm": 0.42679664492607117, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6423, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 2.913183279742765, | |
| "grad_norm": 0.4645717144012451, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6422, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 2.919614147909968, | |
| "grad_norm": 0.471055805683136, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6333, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 2.9260450160771705, | |
| "grad_norm": 0.5037959814071655, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6419, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 2.932475884244373, | |
| "grad_norm": 0.4540598690509796, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6393, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 2.9389067524115755, | |
| "grad_norm": 0.44270047545433044, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6367, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 2.945337620578778, | |
| "grad_norm": 0.43946129083633423, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6402, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 2.951768488745981, | |
| "grad_norm": 0.46501582860946655, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6372, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 2.958199356913183, | |
| "grad_norm": 0.4638145864009857, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6479, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 2.964630225080386, | |
| "grad_norm": 0.4363510310649872, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6391, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 2.9710610932475885, | |
| "grad_norm": 0.42436662316322327, | |
| "learning_rate": 5e-06, | |
| "loss": 0.643, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 2.977491961414791, | |
| "grad_norm": 0.4569937288761139, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6369, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 2.9839228295819935, | |
| "grad_norm": 0.4475838243961334, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6402, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 2.990353697749196, | |
| "grad_norm": 0.47496703267097473, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6315, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 2.996784565916399, | |
| "grad_norm": 0.4460706114768982, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6421, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.7122252583503723, | |
| "eval_runtime": 140.1434, | |
| "eval_samples_per_second": 299.001, | |
| "eval_steps_per_second": 1.17, | |
| "step": 4665 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "step": 4665, | |
| "total_flos": 2.2026686217575832e+20, | |
| "train_loss": 0.6892199146274841, | |
| "train_runtime": 30399.4314, | |
| "train_samples_per_second": 78.569, | |
| "train_steps_per_second": 0.153 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 4665, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.2026686217575832e+20, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |