| { | |
| "best_metric": 1.9664931297302246, | |
| "best_model_checkpoint": "./lora_bn_resume/checkpoint-3000", | |
| "epoch": 1.9292604501607717, | |
| "eval_steps": 200, | |
| "global_step": 3000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.006430868167202572, | |
| "grad_norm": 0.7529953718185425, | |
| "learning_rate": 2.9999999999999997e-05, | |
| "loss": 2.01, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.012861736334405145, | |
| "grad_norm": 0.8143910765647888, | |
| "learning_rate": 5.9999999999999995e-05, | |
| "loss": 1.9794, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.01929260450160772, | |
| "grad_norm": 0.7554563283920288, | |
| "learning_rate": 8.999999999999999e-05, | |
| "loss": 1.9687, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.02572347266881029, | |
| "grad_norm": 0.701172411441803, | |
| "learning_rate": 0.00011999999999999999, | |
| "loss": 2.0374, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.03215434083601286, | |
| "grad_norm": 0.7426002621650696, | |
| "learning_rate": 0.00015, | |
| "loss": 1.8484, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.03858520900321544, | |
| "grad_norm": 0.7900332808494568, | |
| "learning_rate": 0.00017999999999999998, | |
| "loss": 1.91, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.04501607717041801, | |
| "grad_norm": 0.7825136184692383, | |
| "learning_rate": 0.00020999999999999998, | |
| "loss": 1.9625, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.05144694533762058, | |
| "grad_norm": 0.9338003993034363, | |
| "learning_rate": 0.00023999999999999998, | |
| "loss": 1.9668, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.05787781350482315, | |
| "grad_norm": 0.8660485148429871, | |
| "learning_rate": 0.00027, | |
| "loss": 2.0447, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.06430868167202572, | |
| "grad_norm": 0.8631746768951416, | |
| "learning_rate": 0.0003, | |
| "loss": 2.0347, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0707395498392283, | |
| "grad_norm": 0.9202760457992554, | |
| "learning_rate": 0.00029934282584884994, | |
| "loss": 2.0218, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.07717041800643087, | |
| "grad_norm": 0.8508992791175842, | |
| "learning_rate": 0.00029868565169769985, | |
| "loss": 1.9808, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.08360128617363344, | |
| "grad_norm": 0.9962050914764404, | |
| "learning_rate": 0.0002980284775465498, | |
| "loss": 1.9586, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.09003215434083602, | |
| "grad_norm": 0.9159810543060303, | |
| "learning_rate": 0.00029737130339539973, | |
| "loss": 2.0257, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.09646302250803858, | |
| "grad_norm": 0.8135138750076294, | |
| "learning_rate": 0.0002967141292442497, | |
| "loss": 2.0103, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.10289389067524116, | |
| "grad_norm": 0.7933633327484131, | |
| "learning_rate": 0.00029605695509309966, | |
| "loss": 2.028, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.10932475884244373, | |
| "grad_norm": 0.9258368611335754, | |
| "learning_rate": 0.00029539978094194957, | |
| "loss": 2.0654, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.1157556270096463, | |
| "grad_norm": 0.8758969902992249, | |
| "learning_rate": 0.00029474260679079954, | |
| "loss": 1.9928, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.12218649517684887, | |
| "grad_norm": 0.8316165804862976, | |
| "learning_rate": 0.00029408543263964945, | |
| "loss": 1.9748, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.12861736334405144, | |
| "grad_norm": 0.8353763222694397, | |
| "learning_rate": 0.0002934282584884994, | |
| "loss": 2.0167, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.12861736334405144, | |
| "eval_loss": 2.0699551105499268, | |
| "eval_runtime": 131.8406, | |
| "eval_samples_per_second": 15.17, | |
| "eval_steps_per_second": 1.896, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.13504823151125403, | |
| "grad_norm": 0.8024882078170776, | |
| "learning_rate": 0.0002927710843373494, | |
| "loss": 2.1039, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.1414790996784566, | |
| "grad_norm": 0.861377477645874, | |
| "learning_rate": 0.0002921139101861993, | |
| "loss": 2.023, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.14790996784565916, | |
| "grad_norm": 0.8247071504592896, | |
| "learning_rate": 0.00029145673603504926, | |
| "loss": 1.9341, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.15434083601286175, | |
| "grad_norm": 0.8182681202888489, | |
| "learning_rate": 0.0002907995618838992, | |
| "loss": 2.0137, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.1607717041800643, | |
| "grad_norm": 0.8556217551231384, | |
| "learning_rate": 0.00029014238773274913, | |
| "loss": 2.0638, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.16720257234726688, | |
| "grad_norm": 0.7721512913703918, | |
| "learning_rate": 0.0002894852135815991, | |
| "loss": 2.0061, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.17363344051446947, | |
| "grad_norm": 0.7948784828186035, | |
| "learning_rate": 0.000288828039430449, | |
| "loss": 1.9751, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.18006430868167203, | |
| "grad_norm": 0.7582404613494873, | |
| "learning_rate": 0.000288170865279299, | |
| "loss": 2.0254, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.1864951768488746, | |
| "grad_norm": 0.9620535969734192, | |
| "learning_rate": 0.00028751369112814894, | |
| "loss": 1.9978, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.19292604501607716, | |
| "grad_norm": 0.7374221682548523, | |
| "learning_rate": 0.00028685651697699885, | |
| "loss": 2.0631, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.19935691318327975, | |
| "grad_norm": 0.794651210308075, | |
| "learning_rate": 0.0002861993428258488, | |
| "loss": 1.9507, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.2057877813504823, | |
| "grad_norm": 0.7450920939445496, | |
| "learning_rate": 0.00028554216867469873, | |
| "loss": 2.0363, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.21221864951768488, | |
| "grad_norm": 0.7574348449707031, | |
| "learning_rate": 0.0002848849945235487, | |
| "loss": 2.0508, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.21864951768488747, | |
| "grad_norm": 0.9118533134460449, | |
| "learning_rate": 0.00028422782037239866, | |
| "loss": 2.0118, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.22508038585209003, | |
| "grad_norm": 0.8136394023895264, | |
| "learning_rate": 0.0002835706462212486, | |
| "loss": 2.1211, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.2315112540192926, | |
| "grad_norm": 0.9099079966545105, | |
| "learning_rate": 0.00028291347207009854, | |
| "loss": 2.0346, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.2379421221864952, | |
| "grad_norm": 0.830896258354187, | |
| "learning_rate": 0.0002822562979189485, | |
| "loss": 2.0494, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.24437299035369775, | |
| "grad_norm": 0.789002001285553, | |
| "learning_rate": 0.0002815991237677984, | |
| "loss": 1.9791, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.2508038585209003, | |
| "grad_norm": 0.8194644451141357, | |
| "learning_rate": 0.0002809419496166484, | |
| "loss": 2.0106, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.2572347266881029, | |
| "grad_norm": 0.8226191401481628, | |
| "learning_rate": 0.00028028477546549835, | |
| "loss": 2.0268, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.2572347266881029, | |
| "eval_loss": 2.057727575302124, | |
| "eval_runtime": 127.2637, | |
| "eval_samples_per_second": 15.715, | |
| "eval_steps_per_second": 1.964, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.26366559485530544, | |
| "grad_norm": 0.796454668045044, | |
| "learning_rate": 0.00027962760131434826, | |
| "loss": 2.0376, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.27009646302250806, | |
| "grad_norm": 0.8327352404594421, | |
| "learning_rate": 0.0002789704271631982, | |
| "loss": 2.0481, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.2765273311897106, | |
| "grad_norm": 0.8051420450210571, | |
| "learning_rate": 0.0002783132530120482, | |
| "loss": 1.99, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.2829581993569132, | |
| "grad_norm": 0.7519128322601318, | |
| "learning_rate": 0.0002776560788608981, | |
| "loss": 2.0339, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.28938906752411575, | |
| "grad_norm": 0.8251495957374573, | |
| "learning_rate": 0.00027699890470974807, | |
| "loss": 2.0289, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.2958199356913183, | |
| "grad_norm": 0.7058277130126953, | |
| "learning_rate": 0.000276341730558598, | |
| "loss": 2.0669, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.3022508038585209, | |
| "grad_norm": 0.8475114107131958, | |
| "learning_rate": 0.00027568455640744795, | |
| "loss": 2.0506, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.3086816720257235, | |
| "grad_norm": 0.7855744957923889, | |
| "learning_rate": 0.0002750273822562979, | |
| "loss": 1.97, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.31511254019292606, | |
| "grad_norm": 0.727988064289093, | |
| "learning_rate": 0.0002743702081051478, | |
| "loss": 2.0705, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.3215434083601286, | |
| "grad_norm": 0.7662935853004456, | |
| "learning_rate": 0.0002737130339539978, | |
| "loss": 1.9678, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.3279742765273312, | |
| "grad_norm": 0.9171555638313293, | |
| "learning_rate": 0.00027305585980284776, | |
| "loss": 1.9818, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.33440514469453375, | |
| "grad_norm": 0.7959179282188416, | |
| "learning_rate": 0.00027239868565169767, | |
| "loss": 2.0014, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.3408360128617363, | |
| "grad_norm": 0.9359775185585022, | |
| "learning_rate": 0.00027174151150054763, | |
| "loss": 2.0244, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.34726688102893893, | |
| "grad_norm": 0.7740966081619263, | |
| "learning_rate": 0.0002710843373493976, | |
| "loss": 2.0883, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.3536977491961415, | |
| "grad_norm": 0.868601381778717, | |
| "learning_rate": 0.0002704271631982475, | |
| "loss": 2.0226, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.36012861736334406, | |
| "grad_norm": 0.8721134662628174, | |
| "learning_rate": 0.0002697699890470975, | |
| "loss": 2.0965, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.3665594855305466, | |
| "grad_norm": 0.8080394268035889, | |
| "learning_rate": 0.00026911281489594744, | |
| "loss": 2.0082, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.3729903536977492, | |
| "grad_norm": 1.7169413566589355, | |
| "learning_rate": 0.00026845564074479735, | |
| "loss": 2.039, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.37942122186495175, | |
| "grad_norm": 0.8220880031585693, | |
| "learning_rate": 0.0002677984665936473, | |
| "loss": 2.0696, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.3858520900321543, | |
| "grad_norm": 0.7639694213867188, | |
| "learning_rate": 0.00026714129244249723, | |
| "loss": 2.0014, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.3858520900321543, | |
| "eval_loss": 2.0443177223205566, | |
| "eval_runtime": 133.8726, | |
| "eval_samples_per_second": 14.94, | |
| "eval_steps_per_second": 1.867, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.39228295819935693, | |
| "grad_norm": 0.817965567111969, | |
| "learning_rate": 0.0002664841182913472, | |
| "loss": 2.0553, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.3987138263665595, | |
| "grad_norm": 0.871166467666626, | |
| "learning_rate": 0.00026582694414019716, | |
| "loss": 2.0027, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.40514469453376206, | |
| "grad_norm": 0.7483948469161987, | |
| "learning_rate": 0.00026516976998904707, | |
| "loss": 2.0355, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.4115755627009646, | |
| "grad_norm": 0.8223303556442261, | |
| "learning_rate": 0.00026451259583789704, | |
| "loss": 2.0076, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.4180064308681672, | |
| "grad_norm": 0.80986088514328, | |
| "learning_rate": 0.00026385542168674695, | |
| "loss": 2.0781, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.42443729903536975, | |
| "grad_norm": 0.7527362704277039, | |
| "learning_rate": 0.0002631982475355969, | |
| "loss": 1.9727, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.43086816720257237, | |
| "grad_norm": 0.7571489810943604, | |
| "learning_rate": 0.0002625410733844469, | |
| "loss": 2.0205, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.43729903536977494, | |
| "grad_norm": 0.7976600527763367, | |
| "learning_rate": 0.0002618838992332968, | |
| "loss": 2.0505, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.4437299035369775, | |
| "grad_norm": 0.8057394623756409, | |
| "learning_rate": 0.00026122672508214676, | |
| "loss": 2.0351, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.45016077170418006, | |
| "grad_norm": 0.8420009016990662, | |
| "learning_rate": 0.0002605695509309967, | |
| "loss": 1.9655, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.4565916398713826, | |
| "grad_norm": 0.853597104549408, | |
| "learning_rate": 0.00025991237677984664, | |
| "loss": 1.9939, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.4630225080385852, | |
| "grad_norm": 0.7588443160057068, | |
| "learning_rate": 0.0002592552026286966, | |
| "loss": 2.032, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.4694533762057878, | |
| "grad_norm": 0.8099080920219421, | |
| "learning_rate": 0.0002585980284775465, | |
| "loss": 1.9817, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.4758842443729904, | |
| "grad_norm": 0.7894070148468018, | |
| "learning_rate": 0.0002579408543263965, | |
| "loss": 2.0001, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.48231511254019294, | |
| "grad_norm": 0.7474116683006287, | |
| "learning_rate": 0.00025728368017524644, | |
| "loss": 2.0077, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.4887459807073955, | |
| "grad_norm": 0.8076878786087036, | |
| "learning_rate": 0.00025662650602409636, | |
| "loss": 2.0394, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.49517684887459806, | |
| "grad_norm": 0.7559667825698853, | |
| "learning_rate": 0.0002559693318729463, | |
| "loss": 1.9753, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.5016077170418006, | |
| "grad_norm": 0.7402215600013733, | |
| "learning_rate": 0.00025531215772179623, | |
| "loss": 2.0353, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.5080385852090032, | |
| "grad_norm": 0.7112523317337036, | |
| "learning_rate": 0.0002546549835706462, | |
| "loss": 1.989, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.5144694533762058, | |
| "grad_norm": 0.7255666255950928, | |
| "learning_rate": 0.00025399780941949616, | |
| "loss": 1.9912, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.5144694533762058, | |
| "eval_loss": 2.0358893871307373, | |
| "eval_runtime": 131.9747, | |
| "eval_samples_per_second": 15.154, | |
| "eval_steps_per_second": 1.894, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.5209003215434084, | |
| "grad_norm": 0.7614848613739014, | |
| "learning_rate": 0.0002533406352683461, | |
| "loss": 1.9507, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.5273311897106109, | |
| "grad_norm": 0.7834282517433167, | |
| "learning_rate": 0.00025268346111719604, | |
| "loss": 2.0572, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.5337620578778135, | |
| "grad_norm": 0.8642615079879761, | |
| "learning_rate": 0.00025202628696604595, | |
| "loss": 1.9766, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.5401929260450161, | |
| "grad_norm": 0.7937222123146057, | |
| "learning_rate": 0.0002513691128148959, | |
| "loss": 1.9718, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.5466237942122186, | |
| "grad_norm": 0.7922580242156982, | |
| "learning_rate": 0.0002507119386637459, | |
| "loss": 2.0098, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.5530546623794212, | |
| "grad_norm": 0.7464605569839478, | |
| "learning_rate": 0.0002500547645125958, | |
| "loss": 1.9529, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.5594855305466238, | |
| "grad_norm": 0.7568275332450867, | |
| "learning_rate": 0.00024939759036144576, | |
| "loss": 1.989, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.5659163987138264, | |
| "grad_norm": 0.7011362910270691, | |
| "learning_rate": 0.00024874041621029573, | |
| "loss": 2.031, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.572347266881029, | |
| "grad_norm": 0.7106270790100098, | |
| "learning_rate": 0.00024808324205914564, | |
| "loss": 2.022, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.5787781350482315, | |
| "grad_norm": 0.7415210604667664, | |
| "learning_rate": 0.0002474260679079956, | |
| "loss": 2.0595, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.5852090032154341, | |
| "grad_norm": 0.7313567399978638, | |
| "learning_rate": 0.0002467688937568455, | |
| "loss": 2.0293, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.5916398713826366, | |
| "grad_norm": 0.692523181438446, | |
| "learning_rate": 0.0002461117196056955, | |
| "loss": 2.0746, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.5980707395498392, | |
| "grad_norm": 0.6929277181625366, | |
| "learning_rate": 0.00024545454545454545, | |
| "loss": 1.955, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.6045016077170418, | |
| "grad_norm": 0.7199161648750305, | |
| "learning_rate": 0.00024479737130339536, | |
| "loss": 2.0454, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.6109324758842444, | |
| "grad_norm": 0.767314076423645, | |
| "learning_rate": 0.00024414019715224533, | |
| "loss": 2.0428, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.617363344051447, | |
| "grad_norm": 0.8044443130493164, | |
| "learning_rate": 0.00024348302300109526, | |
| "loss": 1.9423, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.6237942122186495, | |
| "grad_norm": 0.702936589717865, | |
| "learning_rate": 0.0002428258488499452, | |
| "loss": 1.9271, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.6302250803858521, | |
| "grad_norm": 0.7394160032272339, | |
| "learning_rate": 0.00024216867469879517, | |
| "loss": 1.9674, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.6366559485530546, | |
| "grad_norm": 0.7981842160224915, | |
| "learning_rate": 0.0002415115005476451, | |
| "loss": 1.9932, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.6430868167202572, | |
| "grad_norm": 0.871896505355835, | |
| "learning_rate": 0.00024085432639649505, | |
| "loss": 2.0182, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.6430868167202572, | |
| "eval_loss": 2.024224281311035, | |
| "eval_runtime": 130.1041, | |
| "eval_samples_per_second": 15.372, | |
| "eval_steps_per_second": 1.922, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.6495176848874598, | |
| "grad_norm": 0.7123499512672424, | |
| "learning_rate": 0.00024019715224534498, | |
| "loss": 2.0923, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.6559485530546624, | |
| "grad_norm": 0.7226546406745911, | |
| "learning_rate": 0.00023953997809419495, | |
| "loss": 2.0035, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.662379421221865, | |
| "grad_norm": 0.7627468109130859, | |
| "learning_rate": 0.0002388828039430449, | |
| "loss": 1.9667, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.6688102893890675, | |
| "grad_norm": 0.8175467252731323, | |
| "learning_rate": 0.00023822562979189483, | |
| "loss": 1.948, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.6752411575562701, | |
| "grad_norm": 0.690073549747467, | |
| "learning_rate": 0.0002375684556407448, | |
| "loss": 2.0498, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.6816720257234726, | |
| "grad_norm": 0.9848446249961853, | |
| "learning_rate": 0.0002369112814895947, | |
| "loss": 1.9874, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.6881028938906752, | |
| "grad_norm": 0.7157571315765381, | |
| "learning_rate": 0.00023625410733844467, | |
| "loss": 2.0488, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.6945337620578779, | |
| "grad_norm": 0.8503302931785583, | |
| "learning_rate": 0.00023559693318729464, | |
| "loss": 1.9958, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.7009646302250804, | |
| "grad_norm": 0.7864677906036377, | |
| "learning_rate": 0.00023493975903614455, | |
| "loss": 2.0212, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.707395498392283, | |
| "grad_norm": 1.7837698459625244, | |
| "learning_rate": 0.0002342825848849945, | |
| "loss": 1.9828, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.7138263665594855, | |
| "grad_norm": 0.7183972001075745, | |
| "learning_rate": 0.00023362541073384445, | |
| "loss": 2.0652, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.7202572347266881, | |
| "grad_norm": 0.7377676963806152, | |
| "learning_rate": 0.0002329682365826944, | |
| "loss": 2.0123, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.7266881028938906, | |
| "grad_norm": 0.7170071601867676, | |
| "learning_rate": 0.00023231106243154436, | |
| "loss": 1.9759, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.7331189710610932, | |
| "grad_norm": 0.6442170143127441, | |
| "learning_rate": 0.00023165388828039427, | |
| "loss": 2.047, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.7395498392282959, | |
| "grad_norm": 0.7356306910514832, | |
| "learning_rate": 0.00023099671412924423, | |
| "loss": 2.0438, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.7459807073954984, | |
| "grad_norm": 0.7483031153678894, | |
| "learning_rate": 0.0002303395399780942, | |
| "loss": 2.0274, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.752411575562701, | |
| "grad_norm": 0.7624642848968506, | |
| "learning_rate": 0.0002296823658269441, | |
| "loss": 1.9938, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.7588424437299035, | |
| "grad_norm": 0.7435073256492615, | |
| "learning_rate": 0.00022902519167579408, | |
| "loss": 1.9848, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.7652733118971061, | |
| "grad_norm": 0.7327163219451904, | |
| "learning_rate": 0.000228368017524644, | |
| "loss": 2.0286, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.7717041800643086, | |
| "grad_norm": 0.8398700952529907, | |
| "learning_rate": 0.00022771084337349395, | |
| "loss": 1.999, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.7717041800643086, | |
| "eval_loss": 2.0166773796081543, | |
| "eval_runtime": 129.989, | |
| "eval_samples_per_second": 15.386, | |
| "eval_steps_per_second": 1.923, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.7781350482315113, | |
| "grad_norm": 0.6727181673049927, | |
| "learning_rate": 0.00022705366922234392, | |
| "loss": 2.0044, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.7845659163987139, | |
| "grad_norm": 0.8738404512405396, | |
| "learning_rate": 0.00022639649507119383, | |
| "loss": 2.0246, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.7909967845659164, | |
| "grad_norm": 0.760010302066803, | |
| "learning_rate": 0.0002257393209200438, | |
| "loss": 2.0058, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.797427652733119, | |
| "grad_norm": 0.701081395149231, | |
| "learning_rate": 0.00022508214676889373, | |
| "loss": 1.9974, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.8038585209003215, | |
| "grad_norm": 0.7346913814544678, | |
| "learning_rate": 0.00022442497261774367, | |
| "loss": 2.0884, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.8102893890675241, | |
| "grad_norm": 0.7433114647865295, | |
| "learning_rate": 0.00022376779846659364, | |
| "loss": 1.9927, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.8167202572347267, | |
| "grad_norm": 0.7781444787979126, | |
| "learning_rate": 0.00022311062431544358, | |
| "loss": 2.001, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.8231511254019293, | |
| "grad_norm": 0.7538995742797852, | |
| "learning_rate": 0.00022245345016429352, | |
| "loss": 1.9947, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.8295819935691319, | |
| "grad_norm": 0.7132537961006165, | |
| "learning_rate": 0.00022179627601314345, | |
| "loss": 1.9781, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.8360128617363344, | |
| "grad_norm": 0.7174340486526489, | |
| "learning_rate": 0.0002211391018619934, | |
| "loss": 1.9848, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.842443729903537, | |
| "grad_norm": 0.7245258092880249, | |
| "learning_rate": 0.00022048192771084336, | |
| "loss": 2.005, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.8488745980707395, | |
| "grad_norm": 0.667892336845398, | |
| "learning_rate": 0.0002198247535596933, | |
| "loss": 1.9939, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.8553054662379421, | |
| "grad_norm": 0.7173146605491638, | |
| "learning_rate": 0.00021916757940854324, | |
| "loss": 2.0636, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.8617363344051447, | |
| "grad_norm": 0.7765901684761047, | |
| "learning_rate": 0.0002185104052573932, | |
| "loss": 1.9966, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.8681672025723473, | |
| "grad_norm": 0.7077351808547974, | |
| "learning_rate": 0.00021785323110624314, | |
| "loss": 2.0078, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.8745980707395499, | |
| "grad_norm": 0.736723780632019, | |
| "learning_rate": 0.00021719605695509308, | |
| "loss": 2.0292, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.8810289389067524, | |
| "grad_norm": 0.732185959815979, | |
| "learning_rate": 0.00021653888280394302, | |
| "loss": 2.0223, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.887459807073955, | |
| "grad_norm": 0.7002454400062561, | |
| "learning_rate": 0.00021588170865279298, | |
| "loss": 2.0068, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.8938906752411575, | |
| "grad_norm": 0.75859534740448, | |
| "learning_rate": 0.00021522453450164292, | |
| "loss": 1.9556, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.9003215434083601, | |
| "grad_norm": 0.7475289106369019, | |
| "learning_rate": 0.00021456736035049286, | |
| "loss": 1.9792, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.9003215434083601, | |
| "eval_loss": 2.0089023113250732, | |
| "eval_runtime": 130.0325, | |
| "eval_samples_per_second": 15.381, | |
| "eval_steps_per_second": 1.923, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.9067524115755627, | |
| "grad_norm": 0.7917546629905701, | |
| "learning_rate": 0.00021391018619934283, | |
| "loss": 1.9999, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.9131832797427653, | |
| "grad_norm": 0.7062447667121887, | |
| "learning_rate": 0.00021325301204819274, | |
| "loss": 1.9779, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.9196141479099679, | |
| "grad_norm": 0.6973288655281067, | |
| "learning_rate": 0.0002125958378970427, | |
| "loss": 2.0511, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.9260450160771704, | |
| "grad_norm": 0.7297340035438538, | |
| "learning_rate": 0.00021193866374589267, | |
| "loss": 1.9764, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.932475884244373, | |
| "grad_norm": 0.9256350994110107, | |
| "learning_rate": 0.00021128148959474258, | |
| "loss": 1.9559, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.9389067524115756, | |
| "grad_norm": 0.6994000673294067, | |
| "learning_rate": 0.00021062431544359255, | |
| "loss": 2.0152, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.9453376205787781, | |
| "grad_norm": 0.7412806749343872, | |
| "learning_rate": 0.00020996714129244246, | |
| "loss": 1.9494, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.9517684887459807, | |
| "grad_norm": 0.729680061340332, | |
| "learning_rate": 0.00020930996714129242, | |
| "loss": 2.0272, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.9581993569131833, | |
| "grad_norm": 0.7601342797279358, | |
| "learning_rate": 0.0002086527929901424, | |
| "loss": 1.9714, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.9646302250803859, | |
| "grad_norm": 0.6875161528587341, | |
| "learning_rate": 0.0002079956188389923, | |
| "loss": 1.993, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.9710610932475884, | |
| "grad_norm": 0.7520968317985535, | |
| "learning_rate": 0.00020733844468784227, | |
| "loss": 2.0471, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.977491961414791, | |
| "grad_norm": 0.8061411380767822, | |
| "learning_rate": 0.00020668127053669218, | |
| "loss": 2.0145, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.9839228295819936, | |
| "grad_norm": 0.7837228775024414, | |
| "learning_rate": 0.00020602409638554214, | |
| "loss": 1.9889, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.9903536977491961, | |
| "grad_norm": 0.744296133518219, | |
| "learning_rate": 0.0002053669222343921, | |
| "loss": 1.9834, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.9967845659163987, | |
| "grad_norm": 0.7137749791145325, | |
| "learning_rate": 0.00020470974808324202, | |
| "loss": 2.0582, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.0032154340836013, | |
| "grad_norm": 0.718320906162262, | |
| "learning_rate": 0.000204052573932092, | |
| "loss": 1.9576, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.0096463022508038, | |
| "grad_norm": 0.719998836517334, | |
| "learning_rate": 0.00020339539978094195, | |
| "loss": 1.9138, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.0160771704180065, | |
| "grad_norm": 0.7154316306114197, | |
| "learning_rate": 0.00020273822562979186, | |
| "loss": 1.875, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.022508038585209, | |
| "grad_norm": 0.6565534472465515, | |
| "learning_rate": 0.00020208105147864183, | |
| "loss": 1.9994, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.0289389067524115, | |
| "grad_norm": 0.7222368121147156, | |
| "learning_rate": 0.00020142387732749177, | |
| "loss": 1.9591, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.0289389067524115, | |
| "eval_loss": 2.002497673034668, | |
| "eval_runtime": 131.2869, | |
| "eval_samples_per_second": 15.234, | |
| "eval_steps_per_second": 1.904, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.0353697749196142, | |
| "grad_norm": 0.7213057279586792, | |
| "learning_rate": 0.0002007667031763417, | |
| "loss": 1.9464, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.0418006430868167, | |
| "grad_norm": 0.6436830163002014, | |
| "learning_rate": 0.00020010952902519167, | |
| "loss": 1.8951, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.0482315112540193, | |
| "grad_norm": 0.7160071134567261, | |
| "learning_rate": 0.00019945235487404158, | |
| "loss": 1.9062, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.0546623794212218, | |
| "grad_norm": 0.6585739850997925, | |
| "learning_rate": 0.00019879518072289155, | |
| "loss": 1.9514, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.0610932475884245, | |
| "grad_norm": 0.7445241808891296, | |
| "learning_rate": 0.0001981380065717415, | |
| "loss": 1.8301, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.067524115755627, | |
| "grad_norm": 0.6654142141342163, | |
| "learning_rate": 0.00019748083242059143, | |
| "loss": 1.9048, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.0739549839228295, | |
| "grad_norm": 0.7550114393234253, | |
| "learning_rate": 0.0001968236582694414, | |
| "loss": 1.9266, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.0803858520900322, | |
| "grad_norm": 0.7276896834373474, | |
| "learning_rate": 0.00019616648411829133, | |
| "loss": 1.8942, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.0868167202572347, | |
| "grad_norm": 0.7431575059890747, | |
| "learning_rate": 0.00019550930996714127, | |
| "loss": 1.9148, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.0932475884244373, | |
| "grad_norm": 0.74256831407547, | |
| "learning_rate": 0.0001948521358159912, | |
| "loss": 1.942, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.09967845659164, | |
| "grad_norm": 0.7295734286308289, | |
| "learning_rate": 0.00019419496166484117, | |
| "loss": 1.9331, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.1061093247588425, | |
| "grad_norm": 0.7749672532081604, | |
| "learning_rate": 0.0001935377875136911, | |
| "loss": 1.9373, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.112540192926045, | |
| "grad_norm": 0.6896611452102661, | |
| "learning_rate": 0.00019288061336254105, | |
| "loss": 1.8813, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 1.1189710610932475, | |
| "grad_norm": 0.7282217741012573, | |
| "learning_rate": 0.00019222343921139102, | |
| "loss": 1.9634, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.1254019292604502, | |
| "grad_norm": 0.7761743068695068, | |
| "learning_rate": 0.00019156626506024093, | |
| "loss": 1.8708, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.1318327974276527, | |
| "grad_norm": 0.7596757411956787, | |
| "learning_rate": 0.0001909090909090909, | |
| "loss": 1.9446, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.1382636655948553, | |
| "grad_norm": 0.7023797631263733, | |
| "learning_rate": 0.00019025191675794086, | |
| "loss": 1.8837, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.144694533762058, | |
| "grad_norm": 0.7191573977470398, | |
| "learning_rate": 0.00018959474260679077, | |
| "loss": 1.9141, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.1511254019292605, | |
| "grad_norm": 0.784885048866272, | |
| "learning_rate": 0.00018893756845564074, | |
| "loss": 1.9506, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 1.157556270096463, | |
| "grad_norm": 0.710903525352478, | |
| "learning_rate": 0.00018828039430449068, | |
| "loss": 1.9157, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.157556270096463, | |
| "eval_loss": 1.998835563659668, | |
| "eval_runtime": 121.0458, | |
| "eval_samples_per_second": 16.523, | |
| "eval_steps_per_second": 2.065, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.1639871382636655, | |
| "grad_norm": 0.7552351355552673, | |
| "learning_rate": 0.00018762322015334062, | |
| "loss": 1.9139, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.1704180064308682, | |
| "grad_norm": 0.7722271084785461, | |
| "learning_rate": 0.00018696604600219058, | |
| "loss": 1.863, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.1768488745980707, | |
| "grad_norm": 0.7195548415184021, | |
| "learning_rate": 0.0001863088718510405, | |
| "loss": 1.8697, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.1832797427652733, | |
| "grad_norm": 0.7423893809318542, | |
| "learning_rate": 0.00018565169769989046, | |
| "loss": 1.9772, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.189710610932476, | |
| "grad_norm": 0.7222315073013306, | |
| "learning_rate": 0.00018499452354874042, | |
| "loss": 1.9308, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.1961414790996785, | |
| "grad_norm": 0.6815035939216614, | |
| "learning_rate": 0.00018433734939759034, | |
| "loss": 1.9675, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.202572347266881, | |
| "grad_norm": 0.7621594071388245, | |
| "learning_rate": 0.0001836801752464403, | |
| "loss": 1.9295, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.2090032154340835, | |
| "grad_norm": 0.7405025959014893, | |
| "learning_rate": 0.0001830230010952902, | |
| "loss": 1.9088, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.2154340836012862, | |
| "grad_norm": 0.6729809641838074, | |
| "learning_rate": 0.00018236582694414018, | |
| "loss": 1.9446, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.2218649517684887, | |
| "grad_norm": 0.7389471530914307, | |
| "learning_rate": 0.00018170865279299014, | |
| "loss": 1.8841, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.2282958199356913, | |
| "grad_norm": 0.6453628540039062, | |
| "learning_rate": 0.00018105147864184006, | |
| "loss": 1.8661, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 1.234726688102894, | |
| "grad_norm": 0.6971079111099243, | |
| "learning_rate": 0.00018039430449069002, | |
| "loss": 1.9807, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.2411575562700965, | |
| "grad_norm": 0.7807840704917908, | |
| "learning_rate": 0.00017973713033953996, | |
| "loss": 1.9475, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 1.247588424437299, | |
| "grad_norm": 0.78909832239151, | |
| "learning_rate": 0.0001790799561883899, | |
| "loss": 1.8439, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 1.2540192926045015, | |
| "grad_norm": 0.7715321183204651, | |
| "learning_rate": 0.00017842278203723986, | |
| "loss": 1.9478, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.2604501607717042, | |
| "grad_norm": 0.7786479592323303, | |
| "learning_rate": 0.0001777656078860898, | |
| "loss": 1.8773, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.2668810289389068, | |
| "grad_norm": 0.6935726404190063, | |
| "learning_rate": 0.00017710843373493974, | |
| "loss": 1.94, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 1.2733118971061093, | |
| "grad_norm": 0.7824066877365112, | |
| "learning_rate": 0.00017645125958378968, | |
| "loss": 1.8996, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.279742765273312, | |
| "grad_norm": 0.7019379138946533, | |
| "learning_rate": 0.00017579408543263962, | |
| "loss": 1.9114, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 1.2861736334405145, | |
| "grad_norm": 0.8215466737747192, | |
| "learning_rate": 0.00017513691128148958, | |
| "loss": 1.8294, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.2861736334405145, | |
| "eval_loss": 1.9947528839111328, | |
| "eval_runtime": 132.3397, | |
| "eval_samples_per_second": 15.113, | |
| "eval_steps_per_second": 1.889, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.292604501607717, | |
| "grad_norm": 0.7088531851768494, | |
| "learning_rate": 0.00017447973713033952, | |
| "loss": 1.9497, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 1.2990353697749195, | |
| "grad_norm": 0.7754150032997131, | |
| "learning_rate": 0.00017382256297918946, | |
| "loss": 1.9047, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 1.3054662379421222, | |
| "grad_norm": 0.7185202836990356, | |
| "learning_rate": 0.00017316538882803943, | |
| "loss": 1.8529, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 1.3118971061093248, | |
| "grad_norm": 0.7496573328971863, | |
| "learning_rate": 0.00017250821467688937, | |
| "loss": 1.8618, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 1.3183279742765273, | |
| "grad_norm": 0.6794284582138062, | |
| "learning_rate": 0.0001718510405257393, | |
| "loss": 1.898, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.32475884244373, | |
| "grad_norm": 0.7059448957443237, | |
| "learning_rate": 0.00017119386637458924, | |
| "loss": 1.9594, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 1.3311897106109325, | |
| "grad_norm": 0.7007871866226196, | |
| "learning_rate": 0.0001705366922234392, | |
| "loss": 1.9476, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 1.337620578778135, | |
| "grad_norm": 0.6973986029624939, | |
| "learning_rate": 0.00016987951807228915, | |
| "loss": 1.9567, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 1.3440514469453375, | |
| "grad_norm": 0.7169969081878662, | |
| "learning_rate": 0.00016922234392113909, | |
| "loss": 1.9685, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 1.3504823151125402, | |
| "grad_norm": 0.7009272575378418, | |
| "learning_rate": 0.00016856516976998905, | |
| "loss": 1.9714, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.3569131832797428, | |
| "grad_norm": 0.7070193290710449, | |
| "learning_rate": 0.00016790799561883896, | |
| "loss": 1.9695, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 1.3633440514469453, | |
| "grad_norm": 0.7268947958946228, | |
| "learning_rate": 0.00016725082146768893, | |
| "loss": 1.9107, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 1.369774919614148, | |
| "grad_norm": 0.7544928789138794, | |
| "learning_rate": 0.00016659364731653887, | |
| "loss": 1.8658, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 1.3762057877813505, | |
| "grad_norm": 0.6320627927780151, | |
| "learning_rate": 0.0001659364731653888, | |
| "loss": 1.8917, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 1.382636655948553, | |
| "grad_norm": 0.6863923668861389, | |
| "learning_rate": 0.00016527929901423877, | |
| "loss": 1.9237, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.3890675241157555, | |
| "grad_norm": 0.7775669097900391, | |
| "learning_rate": 0.00016462212486308868, | |
| "loss": 1.8548, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 1.3954983922829582, | |
| "grad_norm": 0.7198719382286072, | |
| "learning_rate": 0.00016396495071193865, | |
| "loss": 1.9145, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 1.4019292604501608, | |
| "grad_norm": 0.7938317656517029, | |
| "learning_rate": 0.00016330777656078861, | |
| "loss": 1.8939, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 1.4083601286173635, | |
| "grad_norm": 0.7361711263656616, | |
| "learning_rate": 0.00016265060240963853, | |
| "loss": 1.9642, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 1.414790996784566, | |
| "grad_norm": 0.7385576963424683, | |
| "learning_rate": 0.0001619934282584885, | |
| "loss": 1.9134, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.414790996784566, | |
| "eval_loss": 1.9883830547332764, | |
| "eval_runtime": 130.0767, | |
| "eval_samples_per_second": 15.376, | |
| "eval_steps_per_second": 1.922, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.4212218649517685, | |
| "grad_norm": 0.7863461971282959, | |
| "learning_rate": 0.0001613362541073384, | |
| "loss": 2.0157, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 1.427652733118971, | |
| "grad_norm": 0.7755898237228394, | |
| "learning_rate": 0.00016067907995618837, | |
| "loss": 1.8973, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 1.4340836012861735, | |
| "grad_norm": 0.7090388536453247, | |
| "learning_rate": 0.00016002190580503833, | |
| "loss": 1.9034, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 1.4405144694533762, | |
| "grad_norm": 0.6487644910812378, | |
| "learning_rate": 0.00015936473165388825, | |
| "loss": 1.906, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 1.4469453376205788, | |
| "grad_norm": 0.6597898006439209, | |
| "learning_rate": 0.0001587075575027382, | |
| "loss": 1.843, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.4533762057877815, | |
| "grad_norm": 0.7069796323776245, | |
| "learning_rate": 0.00015805038335158818, | |
| "loss": 1.9554, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 1.459807073954984, | |
| "grad_norm": 0.7358680367469788, | |
| "learning_rate": 0.0001573932092004381, | |
| "loss": 1.9268, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 1.4662379421221865, | |
| "grad_norm": 0.675457775592804, | |
| "learning_rate": 0.00015673603504928806, | |
| "loss": 1.8981, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 1.472668810289389, | |
| "grad_norm": 0.7369397878646851, | |
| "learning_rate": 0.000156078860898138, | |
| "loss": 1.9535, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 1.4790996784565915, | |
| "grad_norm": 0.666994035243988, | |
| "learning_rate": 0.00015542168674698793, | |
| "loss": 1.8657, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.4855305466237942, | |
| "grad_norm": 0.7241340279579163, | |
| "learning_rate": 0.0001547645125958379, | |
| "loss": 1.8097, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 1.4919614147909968, | |
| "grad_norm": 0.7224936485290527, | |
| "learning_rate": 0.0001541073384446878, | |
| "loss": 1.8397, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 1.4983922829581995, | |
| "grad_norm": 0.7167637348175049, | |
| "learning_rate": 0.00015345016429353778, | |
| "loss": 1.9225, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 1.504823151125402, | |
| "grad_norm": 0.7176666259765625, | |
| "learning_rate": 0.00015279299014238771, | |
| "loss": 1.8764, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 1.5112540192926045, | |
| "grad_norm": 0.735252857208252, | |
| "learning_rate": 0.00015213581599123765, | |
| "loss": 1.8935, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.517684887459807, | |
| "grad_norm": 0.6805827021598816, | |
| "learning_rate": 0.00015147864184008762, | |
| "loss": 1.9212, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 1.5241157556270095, | |
| "grad_norm": 0.7019375562667847, | |
| "learning_rate": 0.00015082146768893756, | |
| "loss": 1.9318, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 1.5305466237942122, | |
| "grad_norm": 0.6795372366905212, | |
| "learning_rate": 0.0001501642935377875, | |
| "loss": 1.9023, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 1.5369774919614148, | |
| "grad_norm": 0.6497982144355774, | |
| "learning_rate": 0.00014950711938663743, | |
| "loss": 1.9721, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 1.5434083601286175, | |
| "grad_norm": 0.7713346481323242, | |
| "learning_rate": 0.0001488499452354874, | |
| "loss": 1.9906, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.5434083601286175, | |
| "eval_loss": 1.9822700023651123, | |
| "eval_runtime": 130.376, | |
| "eval_samples_per_second": 15.34, | |
| "eval_steps_per_second": 1.918, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.54983922829582, | |
| "grad_norm": 0.7202898263931274, | |
| "learning_rate": 0.00014819277108433734, | |
| "loss": 1.8816, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 1.5562700964630225, | |
| "grad_norm": 0.7167313694953918, | |
| "learning_rate": 0.00014753559693318728, | |
| "loss": 1.9316, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 1.562700964630225, | |
| "grad_norm": 0.7133712768554688, | |
| "learning_rate": 0.00014687842278203724, | |
| "loss": 2.0053, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 1.5691318327974275, | |
| "grad_norm": 0.76304692029953, | |
| "learning_rate": 0.00014622124863088718, | |
| "loss": 1.8718, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 1.5755627009646302, | |
| "grad_norm": 0.667654812335968, | |
| "learning_rate": 0.00014556407447973712, | |
| "loss": 1.8727, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 1.5819935691318328, | |
| "grad_norm": 0.7308873534202576, | |
| "learning_rate": 0.00014490690032858706, | |
| "loss": 1.8918, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 1.5884244372990355, | |
| "grad_norm": 0.9376251697540283, | |
| "learning_rate": 0.00014424972617743702, | |
| "loss": 1.96, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 1.594855305466238, | |
| "grad_norm": 0.6924982666969299, | |
| "learning_rate": 0.00014359255202628696, | |
| "loss": 1.8744, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 1.6012861736334405, | |
| "grad_norm": 0.7420899868011475, | |
| "learning_rate": 0.0001429353778751369, | |
| "loss": 1.9112, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 1.607717041800643, | |
| "grad_norm": 0.7384818196296692, | |
| "learning_rate": 0.00014227820372398684, | |
| "loss": 1.9562, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.6141479099678455, | |
| "grad_norm": 0.7550799250602722, | |
| "learning_rate": 0.0001416210295728368, | |
| "loss": 1.891, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 1.6205787781350482, | |
| "grad_norm": 0.7184371948242188, | |
| "learning_rate": 0.00014096385542168674, | |
| "loss": 1.9361, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 1.6270096463022508, | |
| "grad_norm": 0.770914614200592, | |
| "learning_rate": 0.00014030668127053668, | |
| "loss": 1.9132, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 1.6334405144694535, | |
| "grad_norm": 0.7566716074943542, | |
| "learning_rate": 0.00013964950711938662, | |
| "loss": 1.8982, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 1.639871382636656, | |
| "grad_norm": 0.6670147776603699, | |
| "learning_rate": 0.00013899233296823656, | |
| "loss": 1.9211, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 1.6463022508038585, | |
| "grad_norm": 0.7093060612678528, | |
| "learning_rate": 0.00013833515881708653, | |
| "loss": 1.8881, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 1.652733118971061, | |
| "grad_norm": 0.6549977660179138, | |
| "learning_rate": 0.00013767798466593646, | |
| "loss": 1.9187, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 1.6591639871382635, | |
| "grad_norm": 0.7039531469345093, | |
| "learning_rate": 0.0001370208105147864, | |
| "loss": 1.9165, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 1.6655948553054662, | |
| "grad_norm": 0.7216307520866394, | |
| "learning_rate": 0.00013636363636363634, | |
| "loss": 1.9228, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 1.6720257234726688, | |
| "grad_norm": 0.6866537928581238, | |
| "learning_rate": 0.00013570646221248628, | |
| "loss": 1.9003, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.6720257234726688, | |
| "eval_loss": 1.977206826210022, | |
| "eval_runtime": 131.9243, | |
| "eval_samples_per_second": 15.16, | |
| "eval_steps_per_second": 1.895, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.6784565916398715, | |
| "grad_norm": 0.7328875660896301, | |
| "learning_rate": 0.00013504928806133625, | |
| "loss": 1.9, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 1.684887459807074, | |
| "grad_norm": 0.7623500227928162, | |
| "learning_rate": 0.00013439211391018618, | |
| "loss": 1.9117, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 1.6913183279742765, | |
| "grad_norm": 0.6996557712554932, | |
| "learning_rate": 0.00013373493975903612, | |
| "loss": 1.8342, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 1.697749196141479, | |
| "grad_norm": 0.6597011685371399, | |
| "learning_rate": 0.00013307776560788606, | |
| "loss": 1.911, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 1.7041800643086815, | |
| "grad_norm": 0.7154627442359924, | |
| "learning_rate": 0.00013242059145673603, | |
| "loss": 1.8955, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 1.7106109324758842, | |
| "grad_norm": 0.6822642087936401, | |
| "learning_rate": 0.00013176341730558597, | |
| "loss": 1.928, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 1.717041800643087, | |
| "grad_norm": 0.6770340204238892, | |
| "learning_rate": 0.0001311062431544359, | |
| "loss": 1.934, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 1.7234726688102895, | |
| "grad_norm": 0.7235671877861023, | |
| "learning_rate": 0.00013044906900328584, | |
| "loss": 1.9248, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 1.729903536977492, | |
| "grad_norm": 0.6428620219230652, | |
| "learning_rate": 0.0001297918948521358, | |
| "loss": 1.8998, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 1.7363344051446945, | |
| "grad_norm": 0.7132564783096313, | |
| "learning_rate": 0.00012913472070098575, | |
| "loss": 1.9353, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.742765273311897, | |
| "grad_norm": 0.7110019326210022, | |
| "learning_rate": 0.0001284775465498357, | |
| "loss": 1.8877, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 1.7491961414790995, | |
| "grad_norm": 0.7546197772026062, | |
| "learning_rate": 0.00012782037239868565, | |
| "loss": 1.9219, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 1.7556270096463023, | |
| "grad_norm": 0.8485615253448486, | |
| "learning_rate": 0.0001271631982475356, | |
| "loss": 1.9238, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 1.762057877813505, | |
| "grad_norm": 0.7058401703834534, | |
| "learning_rate": 0.00012650602409638553, | |
| "loss": 1.9012, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 1.7684887459807075, | |
| "grad_norm": 0.7222112417221069, | |
| "learning_rate": 0.00012584884994523547, | |
| "loss": 1.8442, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 1.77491961414791, | |
| "grad_norm": 0.7010639905929565, | |
| "learning_rate": 0.00012519167579408543, | |
| "loss": 1.9322, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 1.7813504823151125, | |
| "grad_norm": 0.6908234357833862, | |
| "learning_rate": 0.00012453450164293537, | |
| "loss": 1.9456, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 1.787781350482315, | |
| "grad_norm": 0.6615903973579407, | |
| "learning_rate": 0.0001238773274917853, | |
| "loss": 1.9052, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 1.7942122186495175, | |
| "grad_norm": 0.6688089370727539, | |
| "learning_rate": 0.00012322015334063528, | |
| "loss": 1.87, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 1.8006430868167203, | |
| "grad_norm": 0.7396994233131409, | |
| "learning_rate": 0.00012256297918948522, | |
| "loss": 1.9243, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.8006430868167203, | |
| "eval_loss": 1.974278450012207, | |
| "eval_runtime": 144.2243, | |
| "eval_samples_per_second": 13.867, | |
| "eval_steps_per_second": 1.733, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.807073954983923, | |
| "grad_norm": 0.6520466208457947, | |
| "learning_rate": 0.00012190580503833514, | |
| "loss": 1.902, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 1.8135048231511255, | |
| "grad_norm": 0.7591603398323059, | |
| "learning_rate": 0.00012124863088718509, | |
| "loss": 1.9079, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 1.819935691318328, | |
| "grad_norm": 0.6622514128684998, | |
| "learning_rate": 0.00012059145673603504, | |
| "loss": 1.9288, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 1.8263665594855305, | |
| "grad_norm": 0.7578607797622681, | |
| "learning_rate": 0.00011993428258488498, | |
| "loss": 1.8936, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 1.832797427652733, | |
| "grad_norm": 0.730093240737915, | |
| "learning_rate": 0.00011927710843373494, | |
| "loss": 1.8809, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 1.8392282958199357, | |
| "grad_norm": 0.6403250098228455, | |
| "learning_rate": 0.00011861993428258487, | |
| "loss": 1.8866, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 1.8456591639871383, | |
| "grad_norm": 0.7032350897789001, | |
| "learning_rate": 0.00011796276013143481, | |
| "loss": 1.938, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 1.852090032154341, | |
| "grad_norm": 0.7376342415809631, | |
| "learning_rate": 0.00011730558598028478, | |
| "loss": 1.8925, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 1.8585209003215435, | |
| "grad_norm": 0.7093110680580139, | |
| "learning_rate": 0.00011664841182913472, | |
| "loss": 1.9029, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 1.864951768488746, | |
| "grad_norm": 0.6826250553131104, | |
| "learning_rate": 0.00011599123767798466, | |
| "loss": 1.8956, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.8713826366559485, | |
| "grad_norm": 0.7709969282150269, | |
| "learning_rate": 0.0001153340635268346, | |
| "loss": 1.92, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 1.877813504823151, | |
| "grad_norm": 0.6641222238540649, | |
| "learning_rate": 0.00011467688937568453, | |
| "loss": 1.8998, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 1.8842443729903537, | |
| "grad_norm": 0.7321887612342834, | |
| "learning_rate": 0.0001140197152245345, | |
| "loss": 1.9257, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 1.8906752411575563, | |
| "grad_norm": 0.7000001668930054, | |
| "learning_rate": 0.00011336254107338444, | |
| "loss": 1.8944, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 1.897106109324759, | |
| "grad_norm": 0.7347818613052368, | |
| "learning_rate": 0.00011270536692223438, | |
| "loss": 1.9256, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 1.9035369774919615, | |
| "grad_norm": 0.708888590335846, | |
| "learning_rate": 0.00011204819277108433, | |
| "loss": 1.9307, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 1.909967845659164, | |
| "grad_norm": 0.6980915665626526, | |
| "learning_rate": 0.00011139101861993428, | |
| "loss": 1.883, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 1.9163987138263665, | |
| "grad_norm": 0.8052535653114319, | |
| "learning_rate": 0.00011073384446878422, | |
| "loss": 1.899, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 1.922829581993569, | |
| "grad_norm": 0.707011878490448, | |
| "learning_rate": 0.00011007667031763416, | |
| "loss": 1.9263, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 1.9292604501607717, | |
| "grad_norm": 0.7086938619613647, | |
| "learning_rate": 0.00010941949616648411, | |
| "loss": 1.883, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.9292604501607717, | |
| "eval_loss": 1.9664931297302246, | |
| "eval_runtime": 133.023, | |
| "eval_samples_per_second": 15.035, | |
| "eval_steps_per_second": 1.879, | |
| "step": 3000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 4665, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.0137669676957696e+17, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |