| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 5624, | |
| "global_step": 16872, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 64.30284881591797, | |
| "learning_rate": 2.9620853080568726e-09, | |
| "loss": 3.7905, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 74.81798553466797, | |
| "learning_rate": 9.478672985781992e-08, | |
| "loss": 4.4137, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 101.4776840209961, | |
| "learning_rate": 1.8957345971563984e-07, | |
| "loss": 4.2954, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 68.84349822998047, | |
| "learning_rate": 2.843601895734597e-07, | |
| "loss": 3.376, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 42.47146224975586, | |
| "learning_rate": 3.791469194312797e-07, | |
| "loss": 1.962, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 46.63043212890625, | |
| "learning_rate": 4.7393364928909956e-07, | |
| "loss": 1.168, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 2.709139347076416, | |
| "learning_rate": 5.687203791469194e-07, | |
| "loss": 0.4681, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 3.304001808166504, | |
| "learning_rate": 6.635071090047394e-07, | |
| "loss": 0.4059, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.5251598358154297, | |
| "learning_rate": 7.582938388625594e-07, | |
| "loss": 0.3999, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 1.9685628414154053, | |
| "learning_rate": 8.530805687203792e-07, | |
| "loss": 0.3904, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 1.7696113586425781, | |
| "learning_rate": 9.478672985781991e-07, | |
| "loss": 0.3769, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 1.8694322109222412, | |
| "learning_rate": 1.042654028436019e-06, | |
| "loss": 0.3718, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 1.7985926866531372, | |
| "learning_rate": 1.1374407582938388e-06, | |
| "loss": 0.3569, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 1.9462366104125977, | |
| "learning_rate": 1.2322274881516587e-06, | |
| "loss": 0.3432, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 1.916548490524292, | |
| "learning_rate": 1.3270142180094788e-06, | |
| "loss": 0.331, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 1.2683026790618896, | |
| "learning_rate": 1.4218009478672987e-06, | |
| "loss": 0.334, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 1.4642925262451172, | |
| "learning_rate": 1.5165876777251187e-06, | |
| "loss": 0.3386, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 1.3890858888626099, | |
| "learning_rate": 1.6113744075829384e-06, | |
| "loss": 0.3073, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 1.4565709829330444, | |
| "learning_rate": 1.7061611374407585e-06, | |
| "loss": 0.3263, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 1.349706768989563, | |
| "learning_rate": 1.8009478672985784e-06, | |
| "loss": 0.3165, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 1.7937098741531372, | |
| "learning_rate": 1.8957345971563982e-06, | |
| "loss": 0.3052, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 1.616604208946228, | |
| "learning_rate": 1.990521327014218e-06, | |
| "loss": 0.3138, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 1.3185547590255737, | |
| "learning_rate": 2.085308056872038e-06, | |
| "loss": 0.297, | |
| "step": 704 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 1.8201861381530762, | |
| "learning_rate": 2.180094786729858e-06, | |
| "loss": 0.3072, | |
| "step": 736 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 1.1981289386749268, | |
| "learning_rate": 2.2748815165876777e-06, | |
| "loss": 0.2957, | |
| "step": 768 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 1.7006583213806152, | |
| "learning_rate": 2.369668246445498e-06, | |
| "loss": 0.2941, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 2.1674447059631348, | |
| "learning_rate": 2.4644549763033174e-06, | |
| "loss": 0.2798, | |
| "step": 832 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 1.3364813327789307, | |
| "learning_rate": 2.5592417061611373e-06, | |
| "loss": 0.2996, | |
| "step": 864 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 1.2685174942016602, | |
| "learning_rate": 2.6540284360189576e-06, | |
| "loss": 0.3027, | |
| "step": 896 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 1.3952556848526, | |
| "learning_rate": 2.7488151658767775e-06, | |
| "loss": 0.2985, | |
| "step": 928 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 1.251714825630188, | |
| "learning_rate": 2.8436018957345973e-06, | |
| "loss": 0.2905, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 1.553152322769165, | |
| "learning_rate": 2.938388625592417e-06, | |
| "loss": 0.278, | |
| "step": 992 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 1.380920171737671, | |
| "learning_rate": 3.0331753554502375e-06, | |
| "loss": 0.2813, | |
| "step": 1024 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 1.5351643562316895, | |
| "learning_rate": 3.1279620853080574e-06, | |
| "loss": 0.2805, | |
| "step": 1056 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 1.4867887496948242, | |
| "learning_rate": 3.222748815165877e-06, | |
| "loss": 0.2767, | |
| "step": 1088 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 1.317229986190796, | |
| "learning_rate": 3.3175355450236967e-06, | |
| "loss": 0.2859, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 1.8770791292190552, | |
| "learning_rate": 3.412322274881517e-06, | |
| "loss": 0.2875, | |
| "step": 1152 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 1.4476697444915771, | |
| "learning_rate": 3.507109004739337e-06, | |
| "loss": 0.2884, | |
| "step": 1184 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 1.351965069770813, | |
| "learning_rate": 3.6018957345971567e-06, | |
| "loss": 0.2802, | |
| "step": 1216 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 1.4647209644317627, | |
| "learning_rate": 3.6966824644549766e-06, | |
| "loss": 0.2703, | |
| "step": 1248 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 1.901773452758789, | |
| "learning_rate": 3.7914691943127964e-06, | |
| "loss": 0.2815, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 1.139844536781311, | |
| "learning_rate": 3.886255924170616e-06, | |
| "loss": 0.2658, | |
| "step": 1312 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 1.1863261461257935, | |
| "learning_rate": 3.981042654028436e-06, | |
| "loss": 0.2707, | |
| "step": 1344 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 1.2720916271209717, | |
| "learning_rate": 4.075829383886256e-06, | |
| "loss": 0.2646, | |
| "step": 1376 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 1.6161096096038818, | |
| "learning_rate": 4.170616113744076e-06, | |
| "loss": 0.2748, | |
| "step": 1408 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 1.4303381443023682, | |
| "learning_rate": 4.265402843601897e-06, | |
| "loss": 0.2691, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 1.401880145072937, | |
| "learning_rate": 4.360189573459716e-06, | |
| "loss": 0.2699, | |
| "step": 1472 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 1.3495467901229858, | |
| "learning_rate": 4.4549763033175355e-06, | |
| "loss": 0.2772, | |
| "step": 1504 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 1.3915464878082275, | |
| "learning_rate": 4.549763033175355e-06, | |
| "loss": 0.2752, | |
| "step": 1536 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 1.3412673473358154, | |
| "learning_rate": 4.644549763033176e-06, | |
| "loss": 0.2751, | |
| "step": 1568 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 1.3777296543121338, | |
| "learning_rate": 4.739336492890996e-06, | |
| "loss": 0.2717, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 1.2612873315811157, | |
| "learning_rate": 4.834123222748816e-06, | |
| "loss": 0.2678, | |
| "step": 1632 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 1.2989717721939087, | |
| "learning_rate": 4.928909952606635e-06, | |
| "loss": 0.2778, | |
| "step": 1664 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 1.3525702953338623, | |
| "learning_rate": 4.999996575341721e-06, | |
| "loss": 0.2719, | |
| "step": 1696 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 1.4678899049758911, | |
| "learning_rate": 4.999914384012144e-06, | |
| "loss": 0.2755, | |
| "step": 1728 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 1.2093278169631958, | |
| "learning_rate": 4.999722607745944e-06, | |
| "loss": 0.2755, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 1.4915423393249512, | |
| "learning_rate": 4.999421254949728e-06, | |
| "loss": 0.2686, | |
| "step": 1792 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 1.1101861000061035, | |
| "learning_rate": 4.999010338833436e-06, | |
| "loss": 0.2594, | |
| "step": 1824 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 1.3432806730270386, | |
| "learning_rate": 4.9984898774097735e-06, | |
| "loss": 0.2658, | |
| "step": 1856 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 1.2808105945587158, | |
| "learning_rate": 4.997859893493414e-06, | |
| "loss": 0.2632, | |
| "step": 1888 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 1.3815045356750488, | |
| "learning_rate": 4.997120414700003e-06, | |
| "loss": 0.2557, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 1.4393643140792847, | |
| "learning_rate": 4.996271473444944e-06, | |
| "loss": 0.263, | |
| "step": 1952 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 1.138375163078308, | |
| "learning_rate": 4.995313106941982e-06, | |
| "loss": 0.2805, | |
| "step": 1984 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 1.6412934064865112, | |
| "learning_rate": 4.994245357201568e-06, | |
| "loss": 0.2641, | |
| "step": 2016 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 1.465922236442566, | |
| "learning_rate": 4.9930682710290205e-06, | |
| "loss": 0.2637, | |
| "step": 2048 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 1.4526797533035278, | |
| "learning_rate": 4.991781900022471e-06, | |
| "loss": 0.2596, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 1.504759669303894, | |
| "learning_rate": 4.990386300570607e-06, | |
| "loss": 0.2633, | |
| "step": 2112 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 1.5599263906478882, | |
| "learning_rate": 4.988881533850192e-06, | |
| "loss": 0.2658, | |
| "step": 2144 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 1.1662814617156982, | |
| "learning_rate": 4.987267665823392e-06, | |
| "loss": 0.2694, | |
| "step": 2176 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 1.3952819108963013, | |
| "learning_rate": 4.98554476723488e-06, | |
| "loss": 0.2449, | |
| "step": 2208 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 1.2887946367263794, | |
| "learning_rate": 4.983712913608736e-06, | |
| "loss": 0.2651, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 1.5893690586090088, | |
| "learning_rate": 4.981772185245135e-06, | |
| "loss": 0.2568, | |
| "step": 2272 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 1.228550672531128, | |
| "learning_rate": 4.979722667216829e-06, | |
| "loss": 0.2667, | |
| "step": 2304 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 1.2756662368774414, | |
| "learning_rate": 4.977564449365415e-06, | |
| "loss": 0.2508, | |
| "step": 2336 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 1.5225822925567627, | |
| "learning_rate": 4.975297626297399e-06, | |
| "loss": 0.2691, | |
| "step": 2368 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 1.2656946182250977, | |
| "learning_rate": 4.972922297380052e-06, | |
| "loss": 0.2704, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 1.3268104791641235, | |
| "learning_rate": 4.970438566737043e-06, | |
| "loss": 0.2577, | |
| "step": 2432 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 1.5536099672317505, | |
| "learning_rate": 4.96784654324389e-06, | |
| "loss": 0.2578, | |
| "step": 2464 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 1.1516194343566895, | |
| "learning_rate": 4.965146340523175e-06, | |
| "loss": 0.2446, | |
| "step": 2496 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 1.1923089027404785, | |
| "learning_rate": 4.962338076939569e-06, | |
| "loss": 0.2569, | |
| "step": 2528 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 1.124197006225586, | |
| "learning_rate": 4.959421875594643e-06, | |
| "loss": 0.2625, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 1.680388331413269, | |
| "learning_rate": 4.95639786432147e-06, | |
| "loss": 0.264, | |
| "step": 2592 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 1.3039462566375732, | |
| "learning_rate": 4.953266175679023e-06, | |
| "loss": 0.2624, | |
| "step": 2624 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 1.109054684638977, | |
| "learning_rate": 4.9500269469463655e-06, | |
| "loss": 0.2548, | |
| "step": 2656 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 1.2704750299453735, | |
| "learning_rate": 4.94668032011663e-06, | |
| "loss": 0.2569, | |
| "step": 2688 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 1.1952179670333862, | |
| "learning_rate": 4.943226441890794e-06, | |
| "loss": 0.2599, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 1.2229312658309937, | |
| "learning_rate": 4.939665463671255e-06, | |
| "loss": 0.2577, | |
| "step": 2752 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 1.3956924676895142, | |
| "learning_rate": 4.935997541555188e-06, | |
| "loss": 0.2642, | |
| "step": 2784 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 1.116629958152771, | |
| "learning_rate": 4.932222836327703e-06, | |
| "loss": 0.2587, | |
| "step": 2816 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 1.1389435529708862, | |
| "learning_rate": 4.928341513454801e-06, | |
| "loss": 0.2566, | |
| "step": 2848 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 1.3800580501556396, | |
| "learning_rate": 4.9243537430761155e-06, | |
| "loss": 0.2579, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 1.3852914571762085, | |
| "learning_rate": 4.920259699997461e-06, | |
| "loss": 0.2666, | |
| "step": 2912 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 1.31257963180542, | |
| "learning_rate": 4.916059563683162e-06, | |
| "loss": 0.2547, | |
| "step": 2944 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 1.599116563796997, | |
| "learning_rate": 4.911753518248194e-06, | |
| "loss": 0.2612, | |
| "step": 2976 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 1.2397140264511108, | |
| "learning_rate": 4.907341752450105e-06, | |
| "loss": 0.2589, | |
| "step": 3008 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 1.3178327083587646, | |
| "learning_rate": 4.9028244596807525e-06, | |
| "loss": 0.2605, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 1.7413417100906372, | |
| "learning_rate": 4.898201837957811e-06, | |
| "loss": 0.2565, | |
| "step": 3072 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 1.314085602760315, | |
| "learning_rate": 4.893474089916105e-06, | |
| "loss": 0.2498, | |
| "step": 3104 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 1.1399492025375366, | |
| "learning_rate": 4.888641422798719e-06, | |
| "loss": 0.2647, | |
| "step": 3136 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 1.3332985639572144, | |
| "learning_rate": 4.883704048447916e-06, | |
| "loss": 0.2594, | |
| "step": 3168 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 1.3460063934326172, | |
| "learning_rate": 4.87866218329585e-06, | |
| "loss": 0.2571, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 1.5006327629089355, | |
| "learning_rate": 4.87351604835508e-06, | |
| "loss": 0.2458, | |
| "step": 3232 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 1.1781283617019653, | |
| "learning_rate": 4.868265869208879e-06, | |
| "loss": 0.2452, | |
| "step": 3264 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 1.117686152458191, | |
| "learning_rate": 4.862911876001348e-06, | |
| "loss": 0.2469, | |
| "step": 3296 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.9969549775123596, | |
| "learning_rate": 4.857454303427328e-06, | |
| "loss": 0.2453, | |
| "step": 3328 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 1.4894945621490479, | |
| "learning_rate": 4.851893390722109e-06, | |
| "loss": 0.2457, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 1.106041431427002, | |
| "learning_rate": 4.846229381650946e-06, | |
| "loss": 0.2474, | |
| "step": 3392 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 1.035601019859314, | |
| "learning_rate": 4.840462524498372e-06, | |
| "loss": 0.2593, | |
| "step": 3424 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 1.7077690362930298, | |
| "learning_rate": 4.834593072057313e-06, | |
| "loss": 0.2506, | |
| "step": 3456 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 1.1017436981201172, | |
| "learning_rate": 4.8286212816180124e-06, | |
| "loss": 0.2506, | |
| "step": 3488 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 1.2720685005187988, | |
| "learning_rate": 4.8225474149567434e-06, | |
| "loss": 0.2567, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 1.328189730644226, | |
| "learning_rate": 4.816371738324343e-06, | |
| "loss": 0.2531, | |
| "step": 3552 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 1.2597825527191162, | |
| "learning_rate": 4.810094522434534e-06, | |
| "loss": 0.246, | |
| "step": 3584 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 1.244281530380249, | |
| "learning_rate": 4.803716042452063e-06, | |
| "loss": 0.2433, | |
| "step": 3616 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 1.4658986330032349, | |
| "learning_rate": 4.797236577980634e-06, | |
| "loss": 0.2496, | |
| "step": 3648 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 1.4121670722961426, | |
| "learning_rate": 4.7906564130506575e-06, | |
| "loss": 0.2531, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 1.1751240491867065, | |
| "learning_rate": 4.783975836106791e-06, | |
| "loss": 0.2515, | |
| "step": 3712 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 1.2011898756027222, | |
| "learning_rate": 4.777195139995308e-06, | |
| "loss": 0.2453, | |
| "step": 3744 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 1.5764689445495605, | |
| "learning_rate": 4.770314621951245e-06, | |
| "loss": 0.2496, | |
| "step": 3776 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 1.4584077596664429, | |
| "learning_rate": 4.763334583585388e-06, | |
| "loss": 0.2392, | |
| "step": 3808 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 1.0098185539245605, | |
| "learning_rate": 4.756255330871039e-06, | |
| "loss": 0.2393, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 1.3514459133148193, | |
| "learning_rate": 4.749077174130609e-06, | |
| "loss": 0.2572, | |
| "step": 3872 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 1.3888107538223267, | |
| "learning_rate": 4.741800428022014e-06, | |
| "loss": 0.2383, | |
| "step": 3904 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 1.3402737379074097, | |
| "learning_rate": 4.734425411524884e-06, | |
| "loss": 0.2556, | |
| "step": 3936 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 1.2175307273864746, | |
| "learning_rate": 4.726952447926576e-06, | |
| "loss": 0.2555, | |
| "step": 3968 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 1.386852502822876, | |
| "learning_rate": 4.719381864808005e-06, | |
| "loss": 0.2503, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 1.2774380445480347, | |
| "learning_rate": 4.711713994029284e-06, | |
| "loss": 0.2503, | |
| "step": 4032 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 1.177322268486023, | |
| "learning_rate": 4.703949171715179e-06, | |
| "loss": 0.2574, | |
| "step": 4064 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 1.269942283630371, | |
| "learning_rate": 4.69608773824037e-06, | |
| "loss": 0.2529, | |
| "step": 4096 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 1.2209409475326538, | |
| "learning_rate": 4.688130038214534e-06, | |
| "loss": 0.2536, | |
| "step": 4128 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 1.4368942975997925, | |
| "learning_rate": 4.6800764204672385e-06, | |
| "loss": 0.2378, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 1.6493048667907715, | |
| "learning_rate": 4.671927238032651e-06, | |
| "loss": 0.2538, | |
| "step": 4192 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 1.038549542427063, | |
| "learning_rate": 4.6636828481340594e-06, | |
| "loss": 0.2501, | |
| "step": 4224 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 1.343204379081726, | |
| "learning_rate": 4.655343612168219e-06, | |
| "loss": 0.251, | |
| "step": 4256 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 1.4020464420318604, | |
| "learning_rate": 4.646909895689508e-06, | |
| "loss": 0.2564, | |
| "step": 4288 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 1.1331307888031006, | |
| "learning_rate": 4.638382068393899e-06, | |
| "loss": 0.2505, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 1.3825620412826538, | |
| "learning_rate": 4.629760504102761e-06, | |
| "loss": 0.2513, | |
| "step": 4352 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 1.310570478439331, | |
| "learning_rate": 4.621045580746467e-06, | |
| "loss": 0.2464, | |
| "step": 4384 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 1.15547776222229, | |
| "learning_rate": 4.61223768034783e-06, | |
| "loss": 0.2515, | |
| "step": 4416 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 1.340010404586792, | |
| "learning_rate": 4.603337189005354e-06, | |
| "loss": 0.2473, | |
| "step": 4448 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.2413158416748047, | |
| "learning_rate": 4.594344496876313e-06, | |
| "loss": 0.2354, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.2394189834594727, | |
| "learning_rate": 4.585259998159646e-06, | |
| "loss": 0.2512, | |
| "step": 4512 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 1.2866027355194092, | |
| "learning_rate": 4.576084091078677e-06, | |
| "loss": 0.2364, | |
| "step": 4544 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 1.1080009937286377, | |
| "learning_rate": 4.5668171778636585e-06, | |
| "loss": 0.2432, | |
| "step": 4576 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 1.2469310760498047, | |
| "learning_rate": 4.5574596647341414e-06, | |
| "loss": 0.256, | |
| "step": 4608 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 1.0387507677078247, | |
| "learning_rate": 4.548011961881167e-06, | |
| "loss": 0.232, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 1.2382770776748657, | |
| "learning_rate": 4.538474483449286e-06, | |
| "loss": 0.2552, | |
| "step": 4672 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 1.2282336950302124, | |
| "learning_rate": 4.528847647518403e-06, | |
| "loss": 0.2525, | |
| "step": 4704 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 1.4016482830047607, | |
| "learning_rate": 4.5191318760854526e-06, | |
| "loss": 0.2582, | |
| "step": 4736 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 1.3214083909988403, | |
| "learning_rate": 4.509327595045898e-06, | |
| "loss": 0.2578, | |
| "step": 4768 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.9114232063293457, | |
| "learning_rate": 4.499435234175065e-06, | |
| "loss": 0.2533, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 1.172450065612793, | |
| "learning_rate": 4.4894552271093e-06, | |
| "loss": 0.264, | |
| "step": 4832 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 1.249770998954773, | |
| "learning_rate": 4.4793880113269595e-06, | |
| "loss": 0.2389, | |
| "step": 4864 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 1.0912755727767944, | |
| "learning_rate": 4.469234028129241e-06, | |
| "loss": 0.2456, | |
| "step": 4896 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 1.1503956317901611, | |
| "learning_rate": 4.458993722620827e-06, | |
| "loss": 0.2562, | |
| "step": 4928 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 1.1564654111862183, | |
| "learning_rate": 4.448667543690384e-06, | |
| "loss": 0.25, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 1.271000862121582, | |
| "learning_rate": 4.438255943990879e-06, | |
| "loss": 0.243, | |
| "step": 4992 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 1.0601048469543457, | |
| "learning_rate": 4.427759379919739e-06, | |
| "loss": 0.2397, | |
| "step": 5024 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 1.214858889579773, | |
| "learning_rate": 4.417178311598845e-06, | |
| "loss": 0.2442, | |
| "step": 5056 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 1.0516908168792725, | |
| "learning_rate": 4.406513202854363e-06, | |
| "loss": 0.2467, | |
| "step": 5088 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 1.326076865196228, | |
| "learning_rate": 4.3957645211964065e-06, | |
| "loss": 0.2488, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 1.173823356628418, | |
| "learning_rate": 4.384932737798554e-06, | |
| "loss": 0.241, | |
| "step": 5152 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 1.4526327848434448, | |
| "learning_rate": 4.3740183274771845e-06, | |
| "loss": 0.2553, | |
| "step": 5184 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 1.2346609830856323, | |
| "learning_rate": 4.363021768670668e-06, | |
| "loss": 0.242, | |
| "step": 5216 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.8957495093345642, | |
| "learning_rate": 4.351943543418392e-06, | |
| "loss": 0.2444, | |
| "step": 5248 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 1.097772479057312, | |
| "learning_rate": 4.340784137339632e-06, | |
| "loss": 0.2531, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 1.1537779569625854, | |
| "learning_rate": 4.329544039612264e-06, | |
| "loss": 0.2507, | |
| "step": 5312 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 1.1922253370285034, | |
| "learning_rate": 4.318223742951321e-06, | |
| "loss": 0.2335, | |
| "step": 5344 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 1.1036819219589233, | |
| "learning_rate": 4.306823743587394e-06, | |
| "loss": 0.2465, | |
| "step": 5376 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 1.229779839515686, | |
| "learning_rate": 4.295344541244879e-06, | |
| "loss": 0.2403, | |
| "step": 5408 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 1.4036519527435303, | |
| "learning_rate": 4.283786639120074e-06, | |
| "loss": 0.254, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.9732062816619873, | |
| "learning_rate": 4.272150543859117e-06, | |
| "loss": 0.2517, | |
| "step": 5472 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 1.3309801816940308, | |
| "learning_rate": 4.260436765535784e-06, | |
| "loss": 0.25, | |
| "step": 5504 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 1.3353493213653564, | |
| "learning_rate": 4.2486458176291176e-06, | |
| "loss": 0.2482, | |
| "step": 5536 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 1.6585358381271362, | |
| "learning_rate": 4.236778217000934e-06, | |
| "loss": 0.248, | |
| "step": 5568 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.9717461466789246, | |
| "learning_rate": 4.224834483873152e-06, | |
| "loss": 0.2366, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.9571962356567383, | |
| "learning_rate": 4.2128151418049976e-06, | |
| "loss": 0.2404, | |
| "step": 5632 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 1.0692377090454102, | |
| "learning_rate": 4.200720717670048e-06, | |
| "loss": 0.2135, | |
| "step": 5664 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 1.1159001588821411, | |
| "learning_rate": 4.188551741633144e-06, | |
| "loss": 0.1854, | |
| "step": 5696 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 1.4514949321746826, | |
| "learning_rate": 4.176308747127136e-06, | |
| "loss": 0.2095, | |
| "step": 5728 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 1.4603676795959473, | |
| "learning_rate": 4.1639922708295176e-06, | |
| "loss": 0.2015, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 1.1802875995635986, | |
| "learning_rate": 4.151602852638888e-06, | |
| "loss": 0.222, | |
| "step": 5792 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 1.2036052942276, | |
| "learning_rate": 4.139141035651288e-06, | |
| "loss": 0.2093, | |
| "step": 5824 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 1.1690653562545776, | |
| "learning_rate": 4.126607366136395e-06, | |
| "loss": 0.1925, | |
| "step": 5856 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 0.9996016621589661, | |
| "learning_rate": 4.114002393513577e-06, | |
| "loss": 0.206, | |
| "step": 5888 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 1.1670773029327393, | |
| "learning_rate": 4.101326670327807e-06, | |
| "loss": 0.2097, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 0.8733654022216797, | |
| "learning_rate": 4.0885807522254435e-06, | |
| "loss": 0.2015, | |
| "step": 5952 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 1.2280749082565308, | |
| "learning_rate": 4.075765197929872e-06, | |
| "loss": 0.2108, | |
| "step": 5984 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 1.1926356554031372, | |
| "learning_rate": 4.0628805692170105e-06, | |
| "loss": 0.2047, | |
| "step": 6016 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 1.0048396587371826, | |
| "learning_rate": 4.049927430890693e-06, | |
| "loss": 0.2077, | |
| "step": 6048 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 1.026442050933838, | |
| "learning_rate": 4.0369063507578995e-06, | |
| "loss": 0.2051, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 1.1310842037200928, | |
| "learning_rate": 4.023817899603875e-06, | |
| "loss": 0.2055, | |
| "step": 6112 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 1.1275712251663208, | |
| "learning_rate": 4.010662651167106e-06, | |
| "loss": 0.1965, | |
| "step": 6144 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 1.1789113283157349, | |
| "learning_rate": 3.997441182114164e-06, | |
| "loss": 0.2118, | |
| "step": 6176 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 1.3836599588394165, | |
| "learning_rate": 3.984154072014438e-06, | |
| "loss": 0.2056, | |
| "step": 6208 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 1.1013050079345703, | |
| "learning_rate": 3.970801903314722e-06, | |
| "loss": 0.2109, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 1.1018249988555908, | |
| "learning_rate": 3.957385261313685e-06, | |
| "loss": 0.202, | |
| "step": 6272 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 1.3906185626983643, | |
| "learning_rate": 3.943904734136213e-06, | |
| "loss": 0.2065, | |
| "step": 6304 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 1.2197610139846802, | |
| "learning_rate": 3.930360912707632e-06, | |
| "loss": 0.2096, | |
| "step": 6336 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 1.0342845916748047, | |
| "learning_rate": 3.916754390727795e-06, | |
| "loss": 0.2024, | |
| "step": 6368 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 1.236260175704956, | |
| "learning_rate": 3.90308576464507e-06, | |
| "loss": 0.216, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 1.3906086683273315, | |
| "learning_rate": 3.889355633630186e-06, | |
| "loss": 0.2153, | |
| "step": 6432 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 1.2441129684448242, | |
| "learning_rate": 3.875564599549968e-06, | |
| "loss": 0.2092, | |
| "step": 6464 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 1.2320338487625122, | |
| "learning_rate": 3.861713266940959e-06, | |
| "loss": 0.2038, | |
| "step": 6496 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 1.6422646045684814, | |
| "learning_rate": 3.847802242982915e-06, | |
| "loss": 0.205, | |
| "step": 6528 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 1.1179068088531494, | |
| "learning_rate": 3.83383213747219e-06, | |
| "loss": 0.2162, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 1.0986745357513428, | |
| "learning_rate": 3.8198035627950084e-06, | |
| "loss": 0.1956, | |
| "step": 6592 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 1.340859055519104, | |
| "learning_rate": 3.8057171339006138e-06, | |
| "loss": 0.2093, | |
| "step": 6624 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 1.7803446054458618, | |
| "learning_rate": 3.791573468274323e-06, | |
| "loss": 0.2133, | |
| "step": 6656 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 1.022388219833374, | |
| "learning_rate": 3.777373185910448e-06, | |
| "loss": 0.2182, | |
| "step": 6688 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 1.0795223712921143, | |
| "learning_rate": 3.7631169092851226e-06, | |
| "loss": 0.2051, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 1.0785856246948242, | |
| "learning_rate": 3.7488052633290174e-06, | |
| "loss": 0.2047, | |
| "step": 6752 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 1.0391508340835571, | |
| "learning_rate": 3.7344388753999434e-06, | |
| "loss": 0.2081, | |
| "step": 6784 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 1.3015925884246826, | |
| "learning_rate": 3.720018375255352e-06, | |
| "loss": 0.2013, | |
| "step": 6816 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 1.2382066249847412, | |
| "learning_rate": 3.7055443950247276e-06, | |
| "loss": 0.2037, | |
| "step": 6848 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 1.1386123895645142, | |
| "learning_rate": 3.691017569181882e-06, | |
| "loss": 0.2046, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 0.9857081770896912, | |
| "learning_rate": 3.6764385345171393e-06, | |
| "loss": 0.207, | |
| "step": 6912 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 1.1276394128799438, | |
| "learning_rate": 3.661807930109422e-06, | |
| "loss": 0.2134, | |
| "step": 6944 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 1.1821982860565186, | |
| "learning_rate": 3.647126397298234e-06, | |
| "loss": 0.2162, | |
| "step": 6976 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 1.218800663948059, | |
| "learning_rate": 3.632394579655555e-06, | |
| "loss": 0.2023, | |
| "step": 7008 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 1.083310842514038, | |
| "learning_rate": 3.6176131229576193e-06, | |
| "loss": 0.1999, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 1.0640002489089966, | |
| "learning_rate": 3.602782675156617e-06, | |
| "loss": 0.2125, | |
| "step": 7072 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 1.1672149896621704, | |
| "learning_rate": 3.5879038863522843e-06, | |
| "loss": 0.2157, | |
| "step": 7104 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 1.1732845306396484, | |
| "learning_rate": 3.572977408763407e-06, | |
| "loss": 0.2082, | |
| "step": 7136 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 1.1544941663742065, | |
| "learning_rate": 3.5580038966992344e-06, | |
| "loss": 0.2067, | |
| "step": 7168 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 1.2914546728134155, | |
| "learning_rate": 3.5429840065307924e-06, | |
| "loss": 0.2019, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 1.0473650693893433, | |
| "learning_rate": 3.527918396662115e-06, | |
| "loss": 0.1952, | |
| "step": 7232 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 1.2211614847183228, | |
| "learning_rate": 3.512807727501379e-06, | |
| "loss": 0.2093, | |
| "step": 7264 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 1.1035760641098022, | |
| "learning_rate": 3.4976526614319573e-06, | |
| "loss": 0.2007, | |
| "step": 7296 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 1.2120308876037598, | |
| "learning_rate": 3.4824538627833825e-06, | |
| "loss": 0.2205, | |
| "step": 7328 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 0.8647122979164124, | |
| "learning_rate": 3.4672119978022277e-06, | |
| "loss": 0.2063, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 1.1142189502716064, | |
| "learning_rate": 3.4519277346228953e-06, | |
| "loss": 0.2075, | |
| "step": 7392 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 1.3183207511901855, | |
| "learning_rate": 3.436601743238335e-06, | |
| "loss": 0.2094, | |
| "step": 7424 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 1.0320820808410645, | |
| "learning_rate": 3.421234695470673e-06, | |
| "loss": 0.2029, | |
| "step": 7456 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 1.3065481185913086, | |
| "learning_rate": 3.4058272649417607e-06, | |
| "loss": 0.2127, | |
| "step": 7488 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 1.209372639656067, | |
| "learning_rate": 3.3903801270436465e-06, | |
| "loss": 0.2015, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 1.0177247524261475, | |
| "learning_rate": 3.374893958908971e-06, | |
| "loss": 0.2075, | |
| "step": 7552 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 1.4457709789276123, | |
| "learning_rate": 3.3593694393812827e-06, | |
| "loss": 0.2098, | |
| "step": 7584 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 1.1711078882217407, | |
| "learning_rate": 3.3438072489852837e-06, | |
| "loss": 0.2088, | |
| "step": 7616 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 1.1928409337997437, | |
| "learning_rate": 3.3282080698969953e-06, | |
| "loss": 0.1918, | |
| "step": 7648 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 0.9215808510780334, | |
| "learning_rate": 3.3125725859138548e-06, | |
| "loss": 0.2106, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 1.3021633625030518, | |
| "learning_rate": 3.2969014824247436e-06, | |
| "loss": 0.2018, | |
| "step": 7712 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 1.1597398519515991, | |
| "learning_rate": 3.28119544637994e-06, | |
| "loss": 0.2035, | |
| "step": 7744 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 1.2015706300735474, | |
| "learning_rate": 3.265455166261009e-06, | |
| "loss": 0.2027, | |
| "step": 7776 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 1.1449264287948608, | |
| "learning_rate": 3.2496813320506183e-06, | |
| "loss": 0.2165, | |
| "step": 7808 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 1.1332265138626099, | |
| "learning_rate": 3.2338746352022965e-06, | |
| "loss": 0.2006, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 1.430891990661621, | |
| "learning_rate": 3.2180357686101226e-06, | |
| "loss": 0.2102, | |
| "step": 7872 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 1.4063985347747803, | |
| "learning_rate": 3.2021654265783505e-06, | |
| "loss": 0.196, | |
| "step": 7904 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 1.3970558643341064, | |
| "learning_rate": 3.1862643047909746e-06, | |
| "loss": 0.2161, | |
| "step": 7936 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 1.3233983516693115, | |
| "learning_rate": 3.170333100281236e-06, | |
| "loss": 0.1921, | |
| "step": 7968 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 1.2325806617736816, | |
| "learning_rate": 3.154372511401064e-06, | |
| "loss": 0.2042, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 1.0558186769485474, | |
| "learning_rate": 3.1383832377904676e-06, | |
| "loss": 0.2056, | |
| "step": 8032 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 1.189503788948059, | |
| "learning_rate": 3.1223659803468653e-06, | |
| "loss": 0.203, | |
| "step": 8064 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 1.1646627187728882, | |
| "learning_rate": 3.1063214411943576e-06, | |
| "loss": 0.2088, | |
| "step": 8096 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 1.1149977445602417, | |
| "learning_rate": 3.0902503236529533e-06, | |
| "loss": 0.2081, | |
| "step": 8128 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 1.5566740036010742, | |
| "learning_rate": 3.074153332207738e-06, | |
| "loss": 0.2141, | |
| "step": 8160 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 1.304262638092041, | |
| "learning_rate": 3.058031172477992e-06, | |
| "loss": 0.2006, | |
| "step": 8192 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 1.1247010231018066, | |
| "learning_rate": 3.041884551186258e-06, | |
| "loss": 0.2109, | |
| "step": 8224 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 1.4587311744689941, | |
| "learning_rate": 3.0257141761273627e-06, | |
| "loss": 0.2016, | |
| "step": 8256 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 1.1545838117599487, | |
| "learning_rate": 3.0095207561373935e-06, | |
| "loss": 0.183, | |
| "step": 8288 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 1.3221790790557861, | |
| "learning_rate": 2.9933050010626208e-06, | |
| "loss": 0.1985, | |
| "step": 8320 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 1.1717700958251953, | |
| "learning_rate": 2.9770676217283844e-06, | |
| "loss": 0.2113, | |
| "step": 8352 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 1.0709022283554077, | |
| "learning_rate": 2.960809329907934e-06, | |
| "loss": 0.2012, | |
| "step": 8384 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 1.4594495296478271, | |
| "learning_rate": 2.944530838291229e-06, | |
| "loss": 0.2039, | |
| "step": 8416 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 1.3135212659835815, | |
| "learning_rate": 2.928232860453694e-06, | |
| "loss": 0.206, | |
| "step": 8448 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 1.316394329071045, | |
| "learning_rate": 2.911916110824945e-06, | |
| "loss": 0.212, | |
| "step": 8480 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 1.3602452278137207, | |
| "learning_rate": 2.895581304657465e-06, | |
| "loss": 0.2068, | |
| "step": 8512 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 1.2073897123336792, | |
| "learning_rate": 2.8792291579952553e-06, | |
| "loss": 0.2098, | |
| "step": 8544 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 1.2983072996139526, | |
| "learning_rate": 2.8628603876424467e-06, | |
| "loss": 0.2086, | |
| "step": 8576 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 1.0781196355819702, | |
| "learning_rate": 2.846475711131877e-06, | |
| "loss": 0.201, | |
| "step": 8608 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 1.1917251348495483, | |
| "learning_rate": 2.8300758466936366e-06, | |
| "loss": 0.1982, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 1.2894983291625977, | |
| "learning_rate": 2.813661513223588e-06, | |
| "loss": 0.1943, | |
| "step": 8672 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 1.2249202728271484, | |
| "learning_rate": 2.7972334302518504e-06, | |
| "loss": 0.2145, | |
| "step": 8704 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 1.1947064399719238, | |
| "learning_rate": 2.7807923179112576e-06, | |
| "loss": 0.2003, | |
| "step": 8736 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 1.0660251379013062, | |
| "learning_rate": 2.764338896905792e-06, | |
| "loss": 0.1984, | |
| "step": 8768 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 1.0243247747421265, | |
| "learning_rate": 2.7478738884789934e-06, | |
| "loss": 0.2036, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 1.286199927330017, | |
| "learning_rate": 2.731398014382341e-06, | |
| "loss": 0.2027, | |
| "step": 8832 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 1.1617448329925537, | |
| "learning_rate": 2.714911996843617e-06, | |
| "loss": 0.2162, | |
| "step": 8864 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 1.1921496391296387, | |
| "learning_rate": 2.6984165585352435e-06, | |
| "loss": 0.2124, | |
| "step": 8896 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 1.2066140174865723, | |
| "learning_rate": 2.6819124225426085e-06, | |
| "loss": 0.199, | |
| "step": 8928 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 1.0459320545196533, | |
| "learning_rate": 2.665400312332368e-06, | |
| "loss": 0.2072, | |
| "step": 8960 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 1.2983636856079102, | |
| "learning_rate": 2.648880951720729e-06, | |
| "loss": 0.2024, | |
| "step": 8992 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 1.0876768827438354, | |
| "learning_rate": 2.6323550648417267e-06, | |
| "loss": 0.2143, | |
| "step": 9024 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 1.0047022104263306, | |
| "learning_rate": 2.6158233761154744e-06, | |
| "loss": 0.2043, | |
| "step": 9056 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 0.9878237247467041, | |
| "learning_rate": 2.5992866102164146e-06, | |
| "loss": 0.1991, | |
| "step": 9088 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 0.9894827604293823, | |
| "learning_rate": 2.58274549204155e-06, | |
| "loss": 0.1979, | |
| "step": 9120 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 0.9374232292175293, | |
| "learning_rate": 2.5662007466786674e-06, | |
| "loss": 0.2055, | |
| "step": 9152 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 1.259948492050171, | |
| "learning_rate": 2.5496530993745518e-06, | |
| "loss": 0.2057, | |
| "step": 9184 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 0.958737850189209, | |
| "learning_rate": 2.533103275503197e-06, | |
| "loss": 0.2029, | |
| "step": 9216 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 1.079717755317688, | |
| "learning_rate": 2.5165520005340082e-06, | |
| "loss": 0.2049, | |
| "step": 9248 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 1.1001982688903809, | |
| "learning_rate": 2.5e-06, | |
| "loss": 0.211, | |
| "step": 9280 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 1.2408779859542847, | |
| "learning_rate": 2.4834479994659926e-06, | |
| "loss": 0.2028, | |
| "step": 9312 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 1.0395313501358032, | |
| "learning_rate": 2.4668967244968035e-06, | |
| "loss": 0.1988, | |
| "step": 9344 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 1.0080056190490723, | |
| "learning_rate": 2.4503469006254487e-06, | |
| "loss": 0.1988, | |
| "step": 9376 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 1.4669593572616577, | |
| "learning_rate": 2.4337992533213334e-06, | |
| "loss": 0.1942, | |
| "step": 9408 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 1.3393524885177612, | |
| "learning_rate": 2.4172545079584508e-06, | |
| "loss": 0.1964, | |
| "step": 9440 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 0.9786149263381958, | |
| "learning_rate": 2.4007133897835863e-06, | |
| "loss": 0.1984, | |
| "step": 9472 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 1.1999776363372803, | |
| "learning_rate": 2.3841766238845264e-06, | |
| "loss": 0.2102, | |
| "step": 9504 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 1.3276174068450928, | |
| "learning_rate": 2.367644935158274e-06, | |
| "loss": 0.1941, | |
| "step": 9536 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 1.0124472379684448, | |
| "learning_rate": 2.3511190482792713e-06, | |
| "loss": 0.199, | |
| "step": 9568 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 1.258489966392517, | |
| "learning_rate": 2.3345996876676334e-06, | |
| "loss": 0.2008, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 1.1993016004562378, | |
| "learning_rate": 2.318087577457392e-06, | |
| "loss": 0.2154, | |
| "step": 9632 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 1.250908613204956, | |
| "learning_rate": 2.3015834414647573e-06, | |
| "loss": 0.2068, | |
| "step": 9664 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 1.211915373802185, | |
| "learning_rate": 2.2850880031563845e-06, | |
| "loss": 0.1946, | |
| "step": 9696 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 1.0278340578079224, | |
| "learning_rate": 2.26860198561766e-06, | |
| "loss": 0.1948, | |
| "step": 9728 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 1.2455780506134033, | |
| "learning_rate": 2.2521261115210074e-06, | |
| "loss": 0.197, | |
| "step": 9760 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 1.2321908473968506, | |
| "learning_rate": 2.2356611030942084e-06, | |
| "loss": 0.2075, | |
| "step": 9792 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 1.0618436336517334, | |
| "learning_rate": 2.219207682088743e-06, | |
| "loss": 0.1931, | |
| "step": 9824 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 1.36842942237854, | |
| "learning_rate": 2.20276656974815e-06, | |
| "loss": 0.1999, | |
| "step": 9856 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 1.033603310585022, | |
| "learning_rate": 2.186338486776412e-06, | |
| "loss": 0.2028, | |
| "step": 9888 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 1.303781270980835, | |
| "learning_rate": 2.169924153306363e-06, | |
| "loss": 0.214, | |
| "step": 9920 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 1.2051355838775635, | |
| "learning_rate": 2.153524288868124e-06, | |
| "loss": 0.2091, | |
| "step": 9952 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 0.9946267604827881, | |
| "learning_rate": 2.137139612357554e-06, | |
| "loss": 0.1942, | |
| "step": 9984 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 1.283492088317871, | |
| "learning_rate": 2.120770842004746e-06, | |
| "loss": 0.1971, | |
| "step": 10016 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 0.9329233169555664, | |
| "learning_rate": 2.1044186953425358e-06, | |
| "loss": 0.203, | |
| "step": 10048 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 1.082767367362976, | |
| "learning_rate": 2.0880838891750553e-06, | |
| "loss": 0.2012, | |
| "step": 10080 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 1.1007740497589111, | |
| "learning_rate": 2.0717671395463063e-06, | |
| "loss": 0.2028, | |
| "step": 10112 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 1.2502014636993408, | |
| "learning_rate": 2.0554691617087725e-06, | |
| "loss": 0.2121, | |
| "step": 10144 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 1.073034405708313, | |
| "learning_rate": 2.0391906700920667e-06, | |
| "loss": 0.1994, | |
| "step": 10176 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 0.9409189820289612, | |
| "learning_rate": 2.0229323782716156e-06, | |
| "loss": 0.2054, | |
| "step": 10208 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 1.197383999824524, | |
| "learning_rate": 2.0066949989373797e-06, | |
| "loss": 0.1946, | |
| "step": 10240 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 1.261612892150879, | |
| "learning_rate": 1.9904792438626074e-06, | |
| "loss": 0.2038, | |
| "step": 10272 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 1.4839472770690918, | |
| "learning_rate": 1.9742858238726377e-06, | |
| "loss": 0.2067, | |
| "step": 10304 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 1.0103521347045898, | |
| "learning_rate": 1.9581154488137425e-06, | |
| "loss": 0.2104, | |
| "step": 10336 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 1.2776283025741577, | |
| "learning_rate": 1.9419688275220085e-06, | |
| "loss": 0.196, | |
| "step": 10368 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 1.0784910917282104, | |
| "learning_rate": 1.9258466677922624e-06, | |
| "loss": 0.1975, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 0.9643808007240295, | |
| "learning_rate": 1.909749676347047e-06, | |
| "loss": 0.2111, | |
| "step": 10432 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 1.3432437181472778, | |
| "learning_rate": 1.8936785588056428e-06, | |
| "loss": 0.1923, | |
| "step": 10464 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 1.133470892906189, | |
| "learning_rate": 1.8776340196531351e-06, | |
| "loss": 0.2016, | |
| "step": 10496 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 1.0897003412246704, | |
| "learning_rate": 1.8616167622095328e-06, | |
| "loss": 0.193, | |
| "step": 10528 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 0.9629374146461487, | |
| "learning_rate": 1.8456274885989374e-06, | |
| "loss": 0.1937, | |
| "step": 10560 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 1.406630039215088, | |
| "learning_rate": 1.829666899718765e-06, | |
| "loss": 0.1997, | |
| "step": 10592 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 1.165366291999817, | |
| "learning_rate": 1.8137356952090258e-06, | |
| "loss": 0.1976, | |
| "step": 10624 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 1.2674609422683716, | |
| "learning_rate": 1.7978345734216502e-06, | |
| "loss": 0.1908, | |
| "step": 10656 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 1.2211626768112183, | |
| "learning_rate": 1.7819642313898783e-06, | |
| "loss": 0.1984, | |
| "step": 10688 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 1.1066653728485107, | |
| "learning_rate": 1.766125364797704e-06, | |
| "loss": 0.2035, | |
| "step": 10720 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 1.539157748222351, | |
| "learning_rate": 1.7503186679493821e-06, | |
| "loss": 0.201, | |
| "step": 10752 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 1.1897879838943481, | |
| "learning_rate": 1.7345448337389918e-06, | |
| "loss": 0.194, | |
| "step": 10784 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 1.162571668624878, | |
| "learning_rate": 1.7188045536200604e-06, | |
| "loss": 0.1899, | |
| "step": 10816 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 1.0616639852523804, | |
| "learning_rate": 1.7030985175752574e-06, | |
| "loss": 0.1978, | |
| "step": 10848 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 1.0391641855239868, | |
| "learning_rate": 1.687427414086146e-06, | |
| "loss": 0.2017, | |
| "step": 10880 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 1.0870113372802734, | |
| "learning_rate": 1.6717919301030055e-06, | |
| "loss": 0.2012, | |
| "step": 10912 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 1.143446922302246, | |
| "learning_rate": 1.6561927510147172e-06, | |
| "loss": 0.1911, | |
| "step": 10944 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 1.015080213546753, | |
| "learning_rate": 1.6406305606187183e-06, | |
| "loss": 0.198, | |
| "step": 10976 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 0.9722278714179993, | |
| "learning_rate": 1.6251060410910301e-06, | |
| "loss": 0.1862, | |
| "step": 11008 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 0.9311158061027527, | |
| "learning_rate": 1.6096198729563539e-06, | |
| "loss": 0.198, | |
| "step": 11040 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 1.3127020597457886, | |
| "learning_rate": 1.5941727350582399e-06, | |
| "loss": 0.2, | |
| "step": 11072 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 0.9450055956840515, | |
| "learning_rate": 1.5787653045293278e-06, | |
| "loss": 0.2015, | |
| "step": 11104 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 1.1553057432174683, | |
| "learning_rate": 1.5633982567616657e-06, | |
| "loss": 0.2068, | |
| "step": 11136 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 0.8617095351219177, | |
| "learning_rate": 1.548072265377105e-06, | |
| "loss": 0.2014, | |
| "step": 11168 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 0.9857013821601868, | |
| "learning_rate": 1.532788002197773e-06, | |
| "loss": 0.2031, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.0158861875534058, | |
| "learning_rate": 1.5175461372166177e-06, | |
| "loss": 0.1941, | |
| "step": 11232 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.1121022701263428, | |
| "learning_rate": 1.5023473385680438e-06, | |
| "loss": 0.1708, | |
| "step": 11264 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 1.1300593614578247, | |
| "learning_rate": 1.4871922724986215e-06, | |
| "loss": 0.1504, | |
| "step": 11296 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 1.232246994972229, | |
| "learning_rate": 1.4720816033378856e-06, | |
| "loss": 0.151, | |
| "step": 11328 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 1.2618398666381836, | |
| "learning_rate": 1.4570159934692085e-06, | |
| "loss": 0.1421, | |
| "step": 11360 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "grad_norm": 1.275038242340088, | |
| "learning_rate": 1.4419961033007669e-06, | |
| "loss": 0.1457, | |
| "step": 11392 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "grad_norm": 1.079405426979065, | |
| "learning_rate": 1.427022591236594e-06, | |
| "loss": 0.144, | |
| "step": 11424 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 1.1695400476455688, | |
| "learning_rate": 1.4120961136477168e-06, | |
| "loss": 0.1531, | |
| "step": 11456 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 1.008547306060791, | |
| "learning_rate": 1.3972173248433832e-06, | |
| "loss": 0.1453, | |
| "step": 11488 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 1.0746265649795532, | |
| "learning_rate": 1.3823868770423815e-06, | |
| "loss": 0.1446, | |
| "step": 11520 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 1.1596614122390747, | |
| "learning_rate": 1.3676054203444462e-06, | |
| "loss": 0.1477, | |
| "step": 11552 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 1.1029706001281738, | |
| "learning_rate": 1.3528736027017663e-06, | |
| "loss": 0.1477, | |
| "step": 11584 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 0.9398396015167236, | |
| "learning_rate": 1.3381920698905788e-06, | |
| "loss": 0.1477, | |
| "step": 11616 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 1.0209776163101196, | |
| "learning_rate": 1.3235614654828604e-06, | |
| "loss": 0.1448, | |
| "step": 11648 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 0.9415841102600098, | |
| "learning_rate": 1.3089824308181187e-06, | |
| "loss": 0.1481, | |
| "step": 11680 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 1.088069200515747, | |
| "learning_rate": 1.2944556049752726e-06, | |
| "loss": 0.149, | |
| "step": 11712 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 1.3269786834716797, | |
| "learning_rate": 1.2799816247446494e-06, | |
| "loss": 0.1497, | |
| "step": 11744 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 0.9119545817375183, | |
| "learning_rate": 1.265561124600057e-06, | |
| "loss": 0.1467, | |
| "step": 11776 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 1.0677683353424072, | |
| "learning_rate": 1.251194736670983e-06, | |
| "loss": 0.1448, | |
| "step": 11808 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 1.0756884813308716, | |
| "learning_rate": 1.2368830907148778e-06, | |
| "loss": 0.1363, | |
| "step": 11840 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 1.0578961372375488, | |
| "learning_rate": 1.2226268140895528e-06, | |
| "loss": 0.1527, | |
| "step": 11872 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 1.0562700033187866, | |
| "learning_rate": 1.2084265317256772e-06, | |
| "loss": 0.1449, | |
| "step": 11904 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 1.0958082675933838, | |
| "learning_rate": 1.1942828660993869e-06, | |
| "loss": 0.1474, | |
| "step": 11936 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 0.9672511219978333, | |
| "learning_rate": 1.1801964372049932e-06, | |
| "loss": 0.1459, | |
| "step": 11968 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 1.1125974655151367, | |
| "learning_rate": 1.1661678625278106e-06, | |
| "loss": 0.1483, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 1.227283239364624, | |
| "learning_rate": 1.152197757017086e-06, | |
| "loss": 0.1453, | |
| "step": 12032 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 1.3206896781921387, | |
| "learning_rate": 1.1382867330590414e-06, | |
| "loss": 0.1425, | |
| "step": 12064 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 0.9912922978401184, | |
| "learning_rate": 1.1244354004500335e-06, | |
| "loss": 0.1529, | |
| "step": 12096 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 0.8996345400810242, | |
| "learning_rate": 1.110644366369815e-06, | |
| "loss": 0.1437, | |
| "step": 12128 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 0.9117119312286377, | |
| "learning_rate": 1.0969142353549315e-06, | |
| "loss": 0.1429, | |
| "step": 12160 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 1.1916334629058838, | |
| "learning_rate": 1.0832456092722063e-06, | |
| "loss": 0.1509, | |
| "step": 12192 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 1.1345574855804443, | |
| "learning_rate": 1.0696390872923696e-06, | |
| "loss": 0.1547, | |
| "step": 12224 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 1.3311399221420288, | |
| "learning_rate": 1.0560952658637869e-06, | |
| "loss": 0.1428, | |
| "step": 12256 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 1.0195939540863037, | |
| "learning_rate": 1.042614738686315e-06, | |
| "loss": 0.1447, | |
| "step": 12288 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 1.1453065872192383, | |
| "learning_rate": 1.029198096685278e-06, | |
| "loss": 0.1384, | |
| "step": 12320 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 1.1899516582489014, | |
| "learning_rate": 1.0158459279855632e-06, | |
| "loss": 0.1433, | |
| "step": 12352 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 1.060065507888794, | |
| "learning_rate": 1.0025588178858372e-06, | |
| "loss": 0.1456, | |
| "step": 12384 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 1.1489146947860718, | |
| "learning_rate": 9.893373488328953e-07, | |
| "loss": 0.1433, | |
| "step": 12416 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 1.3114417791366577, | |
| "learning_rate": 9.761821003961246e-07, | |
| "loss": 0.1467, | |
| "step": 12448 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 1.3255183696746826, | |
| "learning_rate": 9.630936492421005e-07, | |
| "loss": 0.1463, | |
| "step": 12480 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 1.2642742395401, | |
| "learning_rate": 9.500725691093085e-07, | |
| "loss": 0.1525, | |
| "step": 12512 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 1.2281138896942139, | |
| "learning_rate": 9.371194307829895e-07, | |
| "loss": 0.1383, | |
| "step": 12544 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 0.9627875089645386, | |
| "learning_rate": 9.242348020701295e-07, | |
| "loss": 0.1642, | |
| "step": 12576 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 1.1187249422073364, | |
| "learning_rate": 9.114192477745568e-07, | |
| "loss": 0.1439, | |
| "step": 12608 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 1.0410840511322021, | |
| "learning_rate": 8.986733296721931e-07, | |
| "loss": 0.142, | |
| "step": 12640 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 1.2345024347305298, | |
| "learning_rate": 8.859976064864235e-07, | |
| "loss": 0.1512, | |
| "step": 12672 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 1.6558443307876587, | |
| "learning_rate": 8.733926338636056e-07, | |
| "loss": 0.1363, | |
| "step": 12704 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 1.3118538856506348, | |
| "learning_rate": 8.608589643487128e-07, | |
| "loss": 0.1471, | |
| "step": 12736 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 1.1155567169189453, | |
| "learning_rate": 8.483971473611133e-07, | |
| "loss": 0.1396, | |
| "step": 12768 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 1.0880179405212402, | |
| "learning_rate": 8.360077291704821e-07, | |
| "loss": 0.1413, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 0.9752321839332581, | |
| "learning_rate": 8.236912528728647e-07, | |
| "loss": 0.146, | |
| "step": 12832 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 0.9778379201889038, | |
| "learning_rate": 8.114482583668576e-07, | |
| "loss": 0.1403, | |
| "step": 12864 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 0.8760839700698853, | |
| "learning_rate": 7.99279282329952e-07, | |
| "loss": 0.148, | |
| "step": 12896 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 1.187658667564392, | |
| "learning_rate": 7.871848581950039e-07, | |
| "loss": 0.132, | |
| "step": 12928 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 0.9668059349060059, | |
| "learning_rate": 7.751655161268481e-07, | |
| "loss": 0.1424, | |
| "step": 12960 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 1.1318392753601074, | |
| "learning_rate": 7.632217829990668e-07, | |
| "loss": 0.1516, | |
| "step": 12992 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 1.3520994186401367, | |
| "learning_rate": 7.513541823708828e-07, | |
| "loss": 0.1495, | |
| "step": 13024 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 1.3352413177490234, | |
| "learning_rate": 7.395632344642173e-07, | |
| "loss": 0.1446, | |
| "step": 13056 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 1.0273305177688599, | |
| "learning_rate": 7.278494561408833e-07, | |
| "loss": 0.1391, | |
| "step": 13088 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 1.2872681617736816, | |
| "learning_rate": 7.162133608799271e-07, | |
| "loss": 0.1391, | |
| "step": 13120 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 1.0563528537750244, | |
| "learning_rate": 7.046554587551216e-07, | |
| "loss": 0.1521, | |
| "step": 13152 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 1.1487845182418823, | |
| "learning_rate": 6.931762564126074e-07, | |
| "loss": 0.1411, | |
| "step": 13184 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 1.058159351348877, | |
| "learning_rate": 6.817762570486791e-07, | |
| "loss": 0.1424, | |
| "step": 13216 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 1.249377727508545, | |
| "learning_rate": 6.704559603877367e-07, | |
| "loss": 0.1448, | |
| "step": 13248 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 0.9334893226623535, | |
| "learning_rate": 6.592158626603689e-07, | |
| "loss": 0.1384, | |
| "step": 13280 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 1.5639148950576782, | |
| "learning_rate": 6.480564565816091e-07, | |
| "loss": 0.1426, | |
| "step": 13312 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 1.0596867799758911, | |
| "learning_rate": 6.369782313293335e-07, | |
| "loss": 0.1358, | |
| "step": 13344 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 0.8567415475845337, | |
| "learning_rate": 6.259816725228158e-07, | |
| "loss": 0.1465, | |
| "step": 13376 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 1.1086764335632324, | |
| "learning_rate": 6.150672622014459e-07, | |
| "loss": 0.1538, | |
| "step": 13408 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 1.1636631488800049, | |
| "learning_rate": 6.042354788035943e-07, | |
| "loss": 0.1389, | |
| "step": 13440 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 1.274596929550171, | |
| "learning_rate": 5.934867971456384e-07, | |
| "loss": 0.1464, | |
| "step": 13472 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 1.173563003540039, | |
| "learning_rate": 5.828216884011553e-07, | |
| "loss": 0.1435, | |
| "step": 13504 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 1.0788921117782593, | |
| "learning_rate": 5.722406200802613e-07, | |
| "loss": 0.145, | |
| "step": 13536 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 1.1613490581512451, | |
| "learning_rate": 5.617440560091212e-07, | |
| "loss": 0.1474, | |
| "step": 13568 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 0.9075080156326294, | |
| "learning_rate": 5.513324563096167e-07, | |
| "loss": 0.1423, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 1.1296495199203491, | |
| "learning_rate": 5.41006277379173e-07, | |
| "loss": 0.1506, | |
| "step": 13632 | |
| }, | |
| { | |
| "epoch": 2.43, | |
| "grad_norm": 1.2199699878692627, | |
| "learning_rate": 5.307659718707603e-07, | |
| "loss": 0.1459, | |
| "step": 13664 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 1.2415364980697632, | |
| "learning_rate": 5.20611988673041e-07, | |
| "loss": 0.1459, | |
| "step": 13696 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 0.9814534187316895, | |
| "learning_rate": 5.105447728907012e-07, | |
| "loss": 0.1405, | |
| "step": 13728 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 1.1437889337539673, | |
| "learning_rate": 5.00564765824936e-07, | |
| "loss": 0.147, | |
| "step": 13760 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 1.1459242105484009, | |
| "learning_rate": 4.906724049541023e-07, | |
| "loss": 0.1454, | |
| "step": 13792 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 1.093807578086853, | |
| "learning_rate": 4.808681239145479e-07, | |
| "loss": 0.1448, | |
| "step": 13824 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 1.1457182168960571, | |
| "learning_rate": 4.711523524815978e-07, | |
| "loss": 0.1391, | |
| "step": 13856 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 1.0422513484954834, | |
| "learning_rate": 4.615255165507146e-07, | |
| "loss": 0.1435, | |
| "step": 13888 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 1.1171213388442993, | |
| "learning_rate": 4.5198803811883326e-07, | |
| "loss": 0.1545, | |
| "step": 13920 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 1.3410863876342773, | |
| "learning_rate": 4.4254033526585917e-07, | |
| "loss": 0.1526, | |
| "step": 13952 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 0.9821498394012451, | |
| "learning_rate": 4.331828221363424e-07, | |
| "loss": 0.1407, | |
| "step": 13984 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 1.2533886432647705, | |
| "learning_rate": 4.239159089213246e-07, | |
| "loss": 0.1358, | |
| "step": 14016 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.9848787784576416, | |
| "learning_rate": 4.147400018403544e-07, | |
| "loss": 0.1449, | |
| "step": 14048 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.9092288613319397, | |
| "learning_rate": 4.056555031236878e-07, | |
| "loss": 0.1433, | |
| "step": 14080 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 1.3839354515075684, | |
| "learning_rate": 3.966628109946469e-07, | |
| "loss": 0.1494, | |
| "step": 14112 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 0.9744381904602051, | |
| "learning_rate": 3.877623196521707e-07, | |
| "loss": 0.1426, | |
| "step": 14144 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 1.165624976158142, | |
| "learning_rate": 3.7895441925353356e-07, | |
| "loss": 0.1418, | |
| "step": 14176 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 0.9751477241516113, | |
| "learning_rate": 3.702394958972391e-07, | |
| "loss": 0.1479, | |
| "step": 14208 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 1.0645439624786377, | |
| "learning_rate": 3.616179316061011e-07, | |
| "loss": 0.1373, | |
| "step": 14240 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 0.9858296513557434, | |
| "learning_rate": 3.5309010431049284e-07, | |
| "loss": 0.1367, | |
| "step": 14272 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 1.0521584749221802, | |
| "learning_rate": 3.44656387831781e-07, | |
| "loss": 0.1421, | |
| "step": 14304 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 1.2546510696411133, | |
| "learning_rate": 3.363171518659408e-07, | |
| "loss": 0.1384, | |
| "step": 14336 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 1.0795034170150757, | |
| "learning_rate": 3.280727619673496e-07, | |
| "loss": 0.1463, | |
| "step": 14368 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 1.1764500141143799, | |
| "learning_rate": 3.199235795327615e-07, | |
| "loss": 0.1499, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 1.1234067678451538, | |
| "learning_rate": 3.1186996178546674e-07, | |
| "loss": 0.1497, | |
| "step": 14432 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 0.9825364947319031, | |
| "learning_rate": 3.039122617596302e-07, | |
| "loss": 0.1514, | |
| "step": 14464 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 1.263085961341858, | |
| "learning_rate": 2.960508282848215e-07, | |
| "loss": 0.1476, | |
| "step": 14496 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 1.084181308746338, | |
| "learning_rate": 2.8828600597071597e-07, | |
| "loss": 0.1308, | |
| "step": 14528 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 1.1697498559951782, | |
| "learning_rate": 2.8061813519199536e-07, | |
| "loss": 0.1348, | |
| "step": 14560 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 1.3982306718826294, | |
| "learning_rate": 2.7304755207342467e-07, | |
| "loss": 0.1455, | |
| "step": 14592 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 1.1802705526351929, | |
| "learning_rate": 2.655745884751157e-07, | |
| "loss": 0.1437, | |
| "step": 14624 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 1.0200531482696533, | |
| "learning_rate": 2.581995719779856e-07, | |
| "loss": 0.1394, | |
| "step": 14656 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 1.1693042516708374, | |
| "learning_rate": 2.5092282586939187e-07, | |
| "loss": 0.151, | |
| "step": 14688 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 1.116024374961853, | |
| "learning_rate": 2.437446691289616e-07, | |
| "loss": 0.1478, | |
| "step": 14720 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 1.05259370803833, | |
| "learning_rate": 2.3666541641461231e-07, | |
| "loss": 0.1436, | |
| "step": 14752 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 1.0545703172683716, | |
| "learning_rate": 2.2968537804875485e-07, | |
| "loss": 0.1379, | |
| "step": 14784 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 1.0618197917938232, | |
| "learning_rate": 2.228048600046928e-07, | |
| "loss": 0.1409, | |
| "step": 14816 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 1.389092206954956, | |
| "learning_rate": 2.1602416389320922e-07, | |
| "loss": 0.1499, | |
| "step": 14848 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 1.0467108488082886, | |
| "learning_rate": 2.0934358694934347e-07, | |
| "loss": 0.1406, | |
| "step": 14880 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 1.1932706832885742, | |
| "learning_rate": 2.0276342201936637e-07, | |
| "loss": 0.1468, | |
| "step": 14912 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 1.2286850214004517, | |
| "learning_rate": 1.9628395754793777e-07, | |
| "loss": 0.1457, | |
| "step": 14944 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 0.9705607891082764, | |
| "learning_rate": 1.899054775654663e-07, | |
| "loss": 0.1439, | |
| "step": 14976 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 0.9110348224639893, | |
| "learning_rate": 1.8362826167565796e-07, | |
| "loss": 0.1439, | |
| "step": 15008 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 0.9858996272087097, | |
| "learning_rate": 1.774525850432568e-07, | |
| "loss": 0.1528, | |
| "step": 15040 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 1.1253962516784668, | |
| "learning_rate": 1.7137871838198817e-07, | |
| "loss": 0.1408, | |
| "step": 15072 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 1.5971510410308838, | |
| "learning_rate": 1.654069279426873e-07, | |
| "loss": 0.1497, | |
| "step": 15104 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 0.8475412130355835, | |
| "learning_rate": 1.5953747550162907e-07, | |
| "loss": 0.1456, | |
| "step": 15136 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 0.9866968989372253, | |
| "learning_rate": 1.537706183490545e-07, | |
| "loss": 0.1349, | |
| "step": 15168 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 1.1067461967468262, | |
| "learning_rate": 1.481066092778913e-07, | |
| "loss": 0.1457, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 1.1080329418182373, | |
| "learning_rate": 1.4254569657267235e-07, | |
| "loss": 0.146, | |
| "step": 15232 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 0.992157518863678, | |
| "learning_rate": 1.370881239986524e-07, | |
| "loss": 0.1439, | |
| "step": 15264 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 1.032788872718811, | |
| "learning_rate": 1.3173413079112128e-07, | |
| "loss": 0.1369, | |
| "step": 15296 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 0.9706469774246216, | |
| "learning_rate": 1.264839516449204e-07, | |
| "loss": 0.136, | |
| "step": 15328 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 1.1187324523925781, | |
| "learning_rate": 1.2133781670415013e-07, | |
| "loss": 0.1359, | |
| "step": 15360 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 1.1595239639282227, | |
| "learning_rate": 1.1629595155208424e-07, | |
| "loss": 0.1401, | |
| "step": 15392 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 1.087785243988037, | |
| "learning_rate": 1.1135857720128151e-07, | |
| "loss": 0.1358, | |
| "step": 15424 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 1.0765306949615479, | |
| "learning_rate": 1.0652591008389557e-07, | |
| "loss": 0.1438, | |
| "step": 15456 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 1.1016918420791626, | |
| "learning_rate": 1.0179816204218928e-07, | |
| "loss": 0.1373, | |
| "step": 15488 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 1.0536975860595703, | |
| "learning_rate": 9.717554031924842e-08, | |
| "loss": 0.1349, | |
| "step": 15520 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 0.8933613300323486, | |
| "learning_rate": 9.265824754989467e-08, | |
| "loss": 0.1316, | |
| "step": 15552 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 0.9983497858047485, | |
| "learning_rate": 8.824648175180722e-08, | |
| "loss": 0.1346, | |
| "step": 15584 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 1.0600674152374268, | |
| "learning_rate": 8.394043631683862e-08, | |
| "loss": 0.1533, | |
| "step": 15616 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 1.0515772104263306, | |
| "learning_rate": 7.974030000253986e-08, | |
| "loss": 0.139, | |
| "step": 15648 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 1.4163565635681152, | |
| "learning_rate": 7.564625692388499e-08, | |
| "loss": 0.1323, | |
| "step": 15680 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 1.0619480609893799, | |
| "learning_rate": 7.165848654519969e-08, | |
| "loss": 0.1373, | |
| "step": 15712 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 0.9783567786216736, | |
| "learning_rate": 6.777716367229764e-08, | |
| "loss": 0.1525, | |
| "step": 15744 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 1.0433095693588257, | |
| "learning_rate": 6.400245844481262e-08, | |
| "loss": 0.1409, | |
| "step": 15776 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 1.258354663848877, | |
| "learning_rate": 6.033453632874498e-08, | |
| "loss": 0.1402, | |
| "step": 15808 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": 1.1823972463607788, | |
| "learning_rate": 5.677355810920604e-08, | |
| "loss": 0.1418, | |
| "step": 15840 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": 1.2745051383972168, | |
| "learning_rate": 5.3319679883370724e-08, | |
| "loss": 0.1471, | |
| "step": 15872 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 1.238215684890747, | |
| "learning_rate": 4.9973053053634365e-08, | |
| "loss": 0.1426, | |
| "step": 15904 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 1.0394669771194458, | |
| "learning_rate": 4.6733824320976674e-08, | |
| "loss": 0.1335, | |
| "step": 15936 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 1.2872110605239868, | |
| "learning_rate": 4.360213567853072e-08, | |
| "loss": 0.1544, | |
| "step": 15968 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 1.2786180973052979, | |
| "learning_rate": 4.057812440535797e-08, | |
| "loss": 0.1461, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 1.169412612915039, | |
| "learning_rate": 3.766192306043165e-08, | |
| "loss": 0.1413, | |
| "step": 16032 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 1.2436180114746094, | |
| "learning_rate": 3.485365947682562e-08, | |
| "loss": 0.1357, | |
| "step": 16064 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 1.1065387725830078, | |
| "learning_rate": 3.215345675611076e-08, | |
| "loss": 0.1472, | |
| "step": 16096 | |
| }, | |
| { | |
| "epoch": 2.87, | |
| "grad_norm": 1.00310218334198, | |
| "learning_rate": 2.9561433262957072e-08, | |
| "loss": 0.1499, | |
| "step": 16128 | |
| }, | |
| { | |
| "epoch": 2.87, | |
| "grad_norm": 0.9328859448432922, | |
| "learning_rate": 2.7077702619948963e-08, | |
| "loss": 0.1376, | |
| "step": 16160 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 1.122558832168579, | |
| "learning_rate": 2.4702373702600868e-08, | |
| "loss": 0.1461, | |
| "step": 16192 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 1.1374387741088867, | |
| "learning_rate": 2.2435550634585522e-08, | |
| "loss": 0.1427, | |
| "step": 16224 | |
| }, | |
| { | |
| "epoch": 2.89, | |
| "grad_norm": 1.102001428604126, | |
| "learning_rate": 2.027733278317151e-08, | |
| "loss": 0.1402, | |
| "step": 16256 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 1.3255332708358765, | |
| "learning_rate": 1.822781475486507e-08, | |
| "loss": 0.1427, | |
| "step": 16288 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 1.1129345893859863, | |
| "learning_rate": 1.628708639126425e-08, | |
| "loss": 0.1443, | |
| "step": 16320 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 0.9628246426582336, | |
| "learning_rate": 1.4455232765120397e-08, | |
| "loss": 0.1425, | |
| "step": 16352 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 1.2334058284759521, | |
| "learning_rate": 1.273233417660863e-08, | |
| "loss": 0.134, | |
| "step": 16384 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 1.1855486631393433, | |
| "learning_rate": 1.1118466149808994e-08, | |
| "loss": 0.1403, | |
| "step": 16416 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 1.0412129163742065, | |
| "learning_rate": 9.61369942939383e-09, | |
| "loss": 0.1369, | |
| "step": 16448 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "grad_norm": 1.2178360223770142, | |
| "learning_rate": 8.218099977528871e-09, | |
| "loss": 0.1346, | |
| "step": 16480 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 1.1358833312988281, | |
| "learning_rate": 6.9317289709799896e-09, | |
| "loss": 0.1504, | |
| "step": 16512 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 1.1772416830062866, | |
| "learning_rate": 5.754642798432297e-09, | |
| "loss": 0.144, | |
| "step": 16544 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 1.4765156507492065, | |
| "learning_rate": 4.686893058018227e-09, | |
| "loss": 0.1531, | |
| "step": 16576 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 1.0203588008880615, | |
| "learning_rate": 3.728526555056289e-09, | |
| "loss": 0.1439, | |
| "step": 16608 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 1.053261637687683, | |
| "learning_rate": 2.879585299997434e-09, | |
| "loss": 0.1438, | |
| "step": 16640 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 1.2388105392456055, | |
| "learning_rate": 2.1401065065859704e-09, | |
| "loss": 0.145, | |
| "step": 16672 | |
| }, | |
| { | |
| "epoch": 2.97, | |
| "grad_norm": 0.9954524040222168, | |
| "learning_rate": 1.5101225902267036e-09, | |
| "loss": 0.147, | |
| "step": 16704 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 1.2384732961654663, | |
| "learning_rate": 9.89661166564404e-10, | |
| "loss": 0.1492, | |
| "step": 16736 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 0.8787427544593811, | |
| "learning_rate": 5.787450502728331e-10, | |
| "loss": 0.1299, | |
| "step": 16768 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 0.9824705719947815, | |
| "learning_rate": 2.7739225405609694e-10, | |
| "loss": 0.1428, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 1.0306531190872192, | |
| "learning_rate": 8.561598785705727e-11, | |
| "loss": 0.1434, | |
| "step": 16832 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 1.0343343019485474, | |
| "learning_rate": 3.424658279460591e-12, | |
| "loss": 0.1521, | |
| "step": 16864 | |
| } | |
| ], | |
| "logging_steps": 32, | |
| "max_steps": 16872, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 5624, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |