| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.2461059190031152, |
| "eval_steps": 500, |
| "global_step": 300, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0, |
| "grad_norm": 0.012170583144449788, |
| "learning_rate": 3e-05, |
| "loss": 1.3728, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 0.012441205679409878, |
| "learning_rate": 3e-05, |
| "loss": 1.3427, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 0.012336052820426525, |
| "learning_rate": 3e-05, |
| "loss": 1.3983, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 0.013373499678550779, |
| "learning_rate": 3e-05, |
| "loss": 1.4146, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 0.016376857472937493, |
| "learning_rate": 3e-05, |
| "loss": 1.3665, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 0.01605485412905265, |
| "learning_rate": 3e-05, |
| "loss": 1.3865, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 0.01460238003538074, |
| "learning_rate": 3e-05, |
| "loss": 1.315, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 0.015553135144585159, |
| "learning_rate": 3e-05, |
| "loss": 1.3531, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 0.017868624998865858, |
| "learning_rate": 3e-05, |
| "loss": 1.3722, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 0.0164281315042581, |
| "learning_rate": 3e-05, |
| "loss": 1.3179, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 0.01913856813259346, |
| "learning_rate": 3e-05, |
| "loss": 1.3595, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 0.0197227271190587, |
| "learning_rate": 3e-05, |
| "loss": 1.3904, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 0.02003628325196358, |
| "learning_rate": 3e-05, |
| "loss": 1.4069, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 0.020818781168561763, |
| "learning_rate": 3e-05, |
| "loss": 1.4516, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 0.020893517671768724, |
| "learning_rate": 3e-05, |
| "loss": 1.3407, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 0.02002708390520797, |
| "learning_rate": 3e-05, |
| "loss": 1.2756, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 0.02147346418874998, |
| "learning_rate": 3e-05, |
| "loss": 1.4446, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 0.026756951404830708, |
| "learning_rate": 3e-05, |
| "loss": 1.3975, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.02147348610321771, |
| "learning_rate": 3e-05, |
| "loss": 1.3954, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.02000891892461215, |
| "learning_rate": 3e-05, |
| "loss": 1.3254, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 0.02051820688028316, |
| "learning_rate": 3e-05, |
| "loss": 1.3688, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 0.019940598635671183, |
| "learning_rate": 3e-05, |
| "loss": 1.3304, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.019387442345986872, |
| "learning_rate": 3e-05, |
| "loss": 1.3422, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.024538417724117162, |
| "learning_rate": 3e-05, |
| "loss": 1.403, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.0197067247131013, |
| "learning_rate": 3e-05, |
| "loss": 1.3088, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 0.019906918553002587, |
| "learning_rate": 3e-05, |
| "loss": 1.3156, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 0.01690103579384712, |
| "learning_rate": 3e-05, |
| "loss": 1.3548, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.016146757145965005, |
| "learning_rate": 3e-05, |
| "loss": 1.323, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.015853850291687382, |
| "learning_rate": 3e-05, |
| "loss": 1.2991, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.014301767220445117, |
| "learning_rate": 3e-05, |
| "loss": 1.299, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 0.01456552708533801, |
| "learning_rate": 3e-05, |
| "loss": 1.327, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 0.014037544424116656, |
| "learning_rate": 3e-05, |
| "loss": 1.33, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 0.014164388239542412, |
| "learning_rate": 3e-05, |
| "loss": 1.3307, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 0.012972207094917622, |
| "learning_rate": 3e-05, |
| "loss": 1.3028, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 0.012981889713494583, |
| "learning_rate": 3e-05, |
| "loss": 1.2963, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 0.013445913227694615, |
| "learning_rate": 3e-05, |
| "loss": 1.2444, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 0.012792648708552297, |
| "learning_rate": 3e-05, |
| "loss": 1.26, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 0.012141794711979773, |
| "learning_rate": 3e-05, |
| "loss": 1.3438, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 0.013862600605300035, |
| "learning_rate": 3e-05, |
| "loss": 1.3285, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 0.012129096713587641, |
| "learning_rate": 3e-05, |
| "loss": 1.3221, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 0.01871167034290605, |
| "learning_rate": 3e-05, |
| "loss": 1.2861, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 0.012436045483287136, |
| "learning_rate": 3e-05, |
| "loss": 1.3068, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 0.012364790923012296, |
| "learning_rate": 3e-05, |
| "loss": 1.3156, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 0.012376437364947857, |
| "learning_rate": 3e-05, |
| "loss": 1.3088, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 0.013969833036331685, |
| "learning_rate": 3e-05, |
| "loss": 1.3316, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 0.012765341814671809, |
| "learning_rate": 3e-05, |
| "loss": 1.3352, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 0.01261010445191611, |
| "learning_rate": 3e-05, |
| "loss": 1.3144, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 0.012151618759459493, |
| "learning_rate": 3e-05, |
| "loss": 1.3204, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 0.012998675059932196, |
| "learning_rate": 3e-05, |
| "loss": 1.2887, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 0.012169489046924829, |
| "learning_rate": 3e-05, |
| "loss": 1.3332, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 0.011920646653506707, |
| "learning_rate": 3e-05, |
| "loss": 1.3553, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 0.011751923105885437, |
| "learning_rate": 3e-05, |
| "loss": 1.3671, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 0.011025753112968974, |
| "learning_rate": 3e-05, |
| "loss": 1.2771, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 0.012584882940640368, |
| "learning_rate": 3e-05, |
| "loss": 1.2277, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 0.01179619281685698, |
| "learning_rate": 3e-05, |
| "loss": 1.323, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 0.014564873082288777, |
| "learning_rate": 3e-05, |
| "loss": 1.2949, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 0.019435936977958635, |
| "learning_rate": 3e-05, |
| "loss": 1.3098, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 0.012111351048084126, |
| "learning_rate": 3e-05, |
| "loss": 1.3918, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 0.012162697288403175, |
| "learning_rate": 3e-05, |
| "loss": 1.2861, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 0.011286641826996159, |
| "learning_rate": 3e-05, |
| "loss": 1.316, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 0.014424400496857113, |
| "learning_rate": 3e-05, |
| "loss": 1.2717, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 0.011730736205609419, |
| "learning_rate": 3e-05, |
| "loss": 1.3252, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 0.011179658792604664, |
| "learning_rate": 3e-05, |
| "loss": 1.3211, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 0.011909295790935862, |
| "learning_rate": 3e-05, |
| "loss": 1.2757, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 0.013362118144354595, |
| "learning_rate": 3e-05, |
| "loss": 1.2457, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 0.019726440026194825, |
| "learning_rate": 3e-05, |
| "loss": 1.3159, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 0.01151143382587086, |
| "learning_rate": 3e-05, |
| "loss": 1.2628, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 0.011159675986273602, |
| "learning_rate": 3e-05, |
| "loss": 1.2901, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 0.01052362952035569, |
| "learning_rate": 3e-05, |
| "loss": 1.2749, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 0.014028722939171852, |
| "learning_rate": 3e-05, |
| "loss": 1.33, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 0.010897615932695505, |
| "learning_rate": 3e-05, |
| "loss": 1.3084, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 0.011353909572748124, |
| "learning_rate": 3e-05, |
| "loss": 1.277, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 0.011100433604844168, |
| "learning_rate": 3e-05, |
| "loss": 1.3586, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.31, |
| "grad_norm": 0.011880796497294469, |
| "learning_rate": 3e-05, |
| "loss": 1.2959, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.31, |
| "grad_norm": 0.011759412083305202, |
| "learning_rate": 3e-05, |
| "loss": 1.2957, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 0.011480127470140948, |
| "learning_rate": 3e-05, |
| "loss": 1.3278, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 0.011449384427383405, |
| "learning_rate": 3e-05, |
| "loss": 1.2633, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 0.011764058753943114, |
| "learning_rate": 3e-05, |
| "loss": 1.2808, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 0.01124430953679315, |
| "learning_rate": 3e-05, |
| "loss": 1.2664, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 0.01057061147022858, |
| "learning_rate": 3e-05, |
| "loss": 1.2969, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 0.011616218617106901, |
| "learning_rate": 3e-05, |
| "loss": 1.2982, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 0.01144732781571693, |
| "learning_rate": 3e-05, |
| "loss": 1.2784, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 0.011077640894084905, |
| "learning_rate": 3e-05, |
| "loss": 1.2819, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 0.01613205804007413, |
| "learning_rate": 3e-05, |
| "loss": 1.311, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 0.013574606375148299, |
| "learning_rate": 3e-05, |
| "loss": 1.2561, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 0.01335654343450743, |
| "learning_rate": 3e-05, |
| "loss": 1.2731, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 0.012449392909288468, |
| "learning_rate": 3e-05, |
| "loss": 1.2822, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 0.012435933132224748, |
| "learning_rate": 3e-05, |
| "loss": 1.2447, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 0.011302776757667351, |
| "learning_rate": 3e-05, |
| "loss": 1.2786, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 0.012040795209866716, |
| "learning_rate": 3e-05, |
| "loss": 1.3479, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 0.011073079422530921, |
| "learning_rate": 3e-05, |
| "loss": 1.2794, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 0.013030304746973722, |
| "learning_rate": 3e-05, |
| "loss": 1.3131, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 0.012526490354635381, |
| "learning_rate": 3e-05, |
| "loss": 1.2458, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 0.012847228081473072, |
| "learning_rate": 3e-05, |
| "loss": 1.2614, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 0.012688536258452844, |
| "learning_rate": 3e-05, |
| "loss": 1.2399, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 0.012208675458581508, |
| "learning_rate": 3e-05, |
| "loss": 1.3144, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 0.010859781351210412, |
| "learning_rate": 3e-05, |
| "loss": 1.2324, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.41, |
| "grad_norm": 0.012478911974195483, |
| "learning_rate": 3e-05, |
| "loss": 1.2719, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.41, |
| "grad_norm": 0.012373249769099943, |
| "learning_rate": 3e-05, |
| "loss": 1.2622, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 0.013463086982838338, |
| "learning_rate": 3e-05, |
| "loss": 1.2776, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 0.016385409822984896, |
| "learning_rate": 3e-05, |
| "loss": 1.2808, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 0.012219344462321249, |
| "learning_rate": 3e-05, |
| "loss": 1.3073, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 0.01219020267598617, |
| "learning_rate": 3e-05, |
| "loss": 1.2182, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 0.012387262921564371, |
| "learning_rate": 3e-05, |
| "loss": 1.3265, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 0.01206079560473451, |
| "learning_rate": 3e-05, |
| "loss": 1.2228, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 0.01225473000339032, |
| "learning_rate": 3e-05, |
| "loss": 1.2843, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 0.01326596734337093, |
| "learning_rate": 3e-05, |
| "loss": 1.2789, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 0.014527084891658, |
| "learning_rate": 3e-05, |
| "loss": 1.2407, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 0.013506152190492582, |
| "learning_rate": 3e-05, |
| "loss": 1.3537, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 0.012350713785703687, |
| "learning_rate": 3e-05, |
| "loss": 1.299, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 0.013548036090856945, |
| "learning_rate": 3e-05, |
| "loss": 1.3407, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 0.012716548388196138, |
| "learning_rate": 3e-05, |
| "loss": 1.2837, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 0.012376449836024567, |
| "learning_rate": 3e-05, |
| "loss": 1.2408, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 0.013149525793997427, |
| "learning_rate": 3e-05, |
| "loss": 1.2552, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 0.012168644210731216, |
| "learning_rate": 3e-05, |
| "loss": 1.3157, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 0.012324914288283324, |
| "learning_rate": 3e-05, |
| "loss": 1.3123, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.49, |
| "grad_norm": 0.012072754893928983, |
| "learning_rate": 3e-05, |
| "loss": 1.2701, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.49, |
| "grad_norm": 0.01264419972632402, |
| "learning_rate": 3e-05, |
| "loss": 1.2983, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.49, |
| "grad_norm": 0.042949486922442916, |
| "learning_rate": 3e-05, |
| "loss": 1.3216, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 0.02568672309407492, |
| "learning_rate": 3e-05, |
| "loss": 1.3005, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 0.012811282089116556, |
| "learning_rate": 3e-05, |
| "loss": 1.3233, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.51, |
| "grad_norm": 0.017868025898532574, |
| "learning_rate": 3e-05, |
| "loss": 1.3054, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.51, |
| "grad_norm": 0.012993051949728001, |
| "learning_rate": 3e-05, |
| "loss": 1.2471, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 0.018403888249234238, |
| "learning_rate": 3e-05, |
| "loss": 1.3104, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 0.018015251441908964, |
| "learning_rate": 3e-05, |
| "loss": 1.2492, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 0.013906581394784246, |
| "learning_rate": 3e-05, |
| "loss": 1.2466, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.53, |
| "grad_norm": 0.014862930320357416, |
| "learning_rate": 3e-05, |
| "loss": 1.3047, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.53, |
| "grad_norm": 0.04412014629710122, |
| "learning_rate": 3e-05, |
| "loss": 1.2054, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.54, |
| "grad_norm": 0.026614261465937473, |
| "learning_rate": 3e-05, |
| "loss": 1.2655, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.54, |
| "grad_norm": 0.01376408626191131, |
| "learning_rate": 3e-05, |
| "loss": 1.2382, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.54, |
| "grad_norm": 0.015429717937867932, |
| "learning_rate": 3e-05, |
| "loss": 1.3278, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.55, |
| "grad_norm": 0.014533210702277371, |
| "learning_rate": 3e-05, |
| "loss": 1.3138, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.55, |
| "grad_norm": 0.012982144097939177, |
| "learning_rate": 3e-05, |
| "loss": 1.3109, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 0.0182304886411741, |
| "learning_rate": 3e-05, |
| "loss": 1.2985, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 0.017093917039141884, |
| "learning_rate": 3e-05, |
| "loss": 1.2468, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 0.014512262916267834, |
| "learning_rate": 3e-05, |
| "loss": 1.3445, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.57, |
| "grad_norm": 0.012373216012523584, |
| "learning_rate": 3e-05, |
| "loss": 1.2921, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.57, |
| "grad_norm": 0.014913292517795098, |
| "learning_rate": 3e-05, |
| "loss": 1.3327, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.58, |
| "grad_norm": 0.01548966880954737, |
| "learning_rate": 3e-05, |
| "loss": 1.3017, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.58, |
| "grad_norm": 0.0140787411578127, |
| "learning_rate": 3e-05, |
| "loss": 1.3248, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.59, |
| "grad_norm": 0.01421362111312793, |
| "learning_rate": 3e-05, |
| "loss": 1.3478, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.59, |
| "grad_norm": 0.012945172730785133, |
| "learning_rate": 3e-05, |
| "loss": 1.2547, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.59, |
| "grad_norm": 0.013197491856509017, |
| "learning_rate": 3e-05, |
| "loss": 1.2809, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 0.014138707993779835, |
| "learning_rate": 3e-05, |
| "loss": 1.2585, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 0.013999463137089363, |
| "learning_rate": 3e-05, |
| "loss": 1.316, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.61, |
| "grad_norm": 0.014733166701341028, |
| "learning_rate": 3e-05, |
| "loss": 1.3576, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.61, |
| "grad_norm": 0.012930967759936974, |
| "learning_rate": 3e-05, |
| "loss": 1.2939, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.61, |
| "grad_norm": 0.013560296387293834, |
| "learning_rate": 3e-05, |
| "loss": 1.3307, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.62, |
| "grad_norm": 0.017730136160404638, |
| "learning_rate": 3e-05, |
| "loss": 1.3118, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.62, |
| "grad_norm": 0.013975982126514604, |
| "learning_rate": 3e-05, |
| "loss": 1.2836, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.63, |
| "grad_norm": 0.013166737892207972, |
| "learning_rate": 3e-05, |
| "loss": 1.274, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.63, |
| "grad_norm": 0.01341654728143307, |
| "learning_rate": 3e-05, |
| "loss": 1.2447, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 0.015608064826784552, |
| "learning_rate": 3e-05, |
| "loss": 1.2743, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 0.014402235915901979, |
| "learning_rate": 3e-05, |
| "loss": 1.2351, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 0.013660194729217125, |
| "learning_rate": 3e-05, |
| "loss": 1.2584, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.65, |
| "grad_norm": 0.013437763813973025, |
| "learning_rate": 3e-05, |
| "loss": 1.2281, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.65, |
| "grad_norm": 0.01391064432526438, |
| "learning_rate": 3e-05, |
| "loss": 1.2922, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.66, |
| "grad_norm": 0.013854673514263218, |
| "learning_rate": 3e-05, |
| "loss": 1.2589, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.66, |
| "grad_norm": 0.013357866419948893, |
| "learning_rate": 3e-05, |
| "loss": 1.3146, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.66, |
| "grad_norm": 0.013746796253108871, |
| "learning_rate": 3e-05, |
| "loss": 1.2896, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.67, |
| "grad_norm": 0.01323955577224082, |
| "learning_rate": 3e-05, |
| "loss": 1.2661, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.67, |
| "grad_norm": 0.014094881922329888, |
| "learning_rate": 3e-05, |
| "loss": 1.2296, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.68, |
| "grad_norm": 0.021786851768047143, |
| "learning_rate": 3e-05, |
| "loss": 1.3146, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.68, |
| "grad_norm": 0.014115247163622169, |
| "learning_rate": 3e-05, |
| "loss": 1.2445, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.69, |
| "grad_norm": 0.013235405572388292, |
| "learning_rate": 3e-05, |
| "loss": 1.2828, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.69, |
| "grad_norm": 0.013292324416878417, |
| "learning_rate": 3e-05, |
| "loss": 1.2226, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.69, |
| "grad_norm": 0.014395884061164101, |
| "learning_rate": 3e-05, |
| "loss": 1.3279, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 0.013904468976095652, |
| "learning_rate": 3e-05, |
| "loss": 1.3052, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 0.01498037832336143, |
| "learning_rate": 3e-05, |
| "loss": 1.2751, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.71, |
| "grad_norm": 0.013935053913175984, |
| "learning_rate": 3e-05, |
| "loss": 1.2712, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.71, |
| "grad_norm": 0.013342634505562194, |
| "learning_rate": 3e-05, |
| "loss": 1.2478, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.71, |
| "grad_norm": 0.014280227334636308, |
| "learning_rate": 3e-05, |
| "loss": 1.2832, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 0.013662238228160744, |
| "learning_rate": 3e-05, |
| "loss": 1.3105, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 0.013883566745022063, |
| "learning_rate": 3e-05, |
| "loss": 1.2882, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.73, |
| "grad_norm": 0.014231700687422954, |
| "learning_rate": 3e-05, |
| "loss": 1.3134, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.73, |
| "grad_norm": 0.015856322519580097, |
| "learning_rate": 3e-05, |
| "loss": 1.3416, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.74, |
| "grad_norm": 0.014864139947341821, |
| "learning_rate": 3e-05, |
| "loss": 1.2528, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.74, |
| "grad_norm": 0.01384672514806748, |
| "learning_rate": 3e-05, |
| "loss": 1.2146, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.74, |
| "grad_norm": 0.015464948934868796, |
| "learning_rate": 3e-05, |
| "loss": 1.2165, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 0.015385891583895897, |
| "learning_rate": 3e-05, |
| "loss": 1.3034, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 0.015500556463836994, |
| "learning_rate": 3e-05, |
| "loss": 1.2462, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 0.015360964744432567, |
| "learning_rate": 3e-05, |
| "loss": 1.2508, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 0.018104130847576297, |
| "learning_rate": 3e-05, |
| "loss": 1.2617, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 0.017297392951343114, |
| "learning_rate": 3e-05, |
| "loss": 1.2694, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.77, |
| "grad_norm": 0.014605348589579433, |
| "learning_rate": 3e-05, |
| "loss": 1.2521, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.77, |
| "grad_norm": 0.014287950283906567, |
| "learning_rate": 3e-05, |
| "loss": 1.2583, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.78, |
| "grad_norm": 0.01496726177324148, |
| "learning_rate": 3e-05, |
| "loss": 1.2689, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.78, |
| "grad_norm": 0.015097305854620989, |
| "learning_rate": 3e-05, |
| "loss": 1.3264, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.79, |
| "grad_norm": 0.014393731181691946, |
| "learning_rate": 3e-05, |
| "loss": 1.2509, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.79, |
| "grad_norm": 0.014329101467007178, |
| "learning_rate": 3e-05, |
| "loss": 1.2791, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.79, |
| "grad_norm": 0.01432823143229832, |
| "learning_rate": 3e-05, |
| "loss": 1.3058, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 0.01409215280735208, |
| "learning_rate": 3e-05, |
| "loss": 1.298, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 0.014322129770126826, |
| "learning_rate": 3e-05, |
| "loss": 1.2822, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.81, |
| "grad_norm": 0.015535730832931206, |
| "learning_rate": 3e-05, |
| "loss": 1.2952, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.81, |
| "grad_norm": 0.014586001705999283, |
| "learning_rate": 3e-05, |
| "loss": 1.2916, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.81, |
| "grad_norm": 0.014346450671961445, |
| "learning_rate": 3e-05, |
| "loss": 1.286, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.82, |
| "grad_norm": 0.01465216301040808, |
| "learning_rate": 3e-05, |
| "loss": 1.3168, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.82, |
| "grad_norm": 0.014633342656678221, |
| "learning_rate": 3e-05, |
| "loss": 1.222, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.83, |
| "grad_norm": 0.019433028609002513, |
| "learning_rate": 3e-05, |
| "loss": 1.2868, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.83, |
| "grad_norm": 0.015999291729556466, |
| "learning_rate": 3e-05, |
| "loss": 1.2546, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.83, |
| "grad_norm": 0.013974319782604303, |
| "learning_rate": 3e-05, |
| "loss": 1.3313, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 0.019684778346554094, |
| "learning_rate": 3e-05, |
| "loss": 1.2183, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 0.033028122367380335, |
| "learning_rate": 3e-05, |
| "loss": 1.2943, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 0.015226007463866797, |
| "learning_rate": 3e-05, |
| "loss": 1.2398, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 0.018039902586464752, |
| "learning_rate": 3e-05, |
| "loss": 1.2804, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.86, |
| "grad_norm": 0.01458863130389662, |
| "learning_rate": 3e-05, |
| "loss": 1.3344, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.86, |
| "grad_norm": 0.015824538304288215, |
| "learning_rate": 3e-05, |
| "loss": 1.2351, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.86, |
| "grad_norm": 0.014833851868165835, |
| "learning_rate": 3e-05, |
| "loss": 1.2385, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.87, |
| "grad_norm": 0.013926386093476583, |
| "learning_rate": 3e-05, |
| "loss": 1.2263, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.87, |
| "grad_norm": 0.015632500062928826, |
| "learning_rate": 3e-05, |
| "loss": 1.2931, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 0.014724285681248521, |
| "learning_rate": 3e-05, |
| "loss": 1.3084, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 0.014461044938261114, |
| "learning_rate": 3e-05, |
| "loss": 1.3348, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 0.014888010433011196, |
| "learning_rate": 3e-05, |
| "loss": 1.2726, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.89, |
| "grad_norm": 0.015186640580123631, |
| "learning_rate": 3e-05, |
| "loss": 1.266, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.89, |
| "grad_norm": 0.014169851274828012, |
| "learning_rate": 3e-05, |
| "loss": 1.2275, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 0.014597768020650003, |
| "learning_rate": 3e-05, |
| "loss": 1.166, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 0.016348049435478462, |
| "learning_rate": 3e-05, |
| "loss": 1.3315, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.91, |
| "grad_norm": 0.01389791869445101, |
| "learning_rate": 3e-05, |
| "loss": 1.2339, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.91, |
| "grad_norm": 0.014730542143538614, |
| "learning_rate": 3e-05, |
| "loss": 1.2884, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.91, |
| "grad_norm": 0.01495965630015944, |
| "learning_rate": 3e-05, |
| "loss": 1.1792, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 0.016164599847288814, |
| "learning_rate": 3e-05, |
| "loss": 1.2511, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 0.014356323440192154, |
| "learning_rate": 3e-05, |
| "loss": 1.2653, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 0.014496008840759921, |
| "learning_rate": 3e-05, |
| "loss": 1.3067, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 0.014173980530200598, |
| "learning_rate": 3e-05, |
| "loss": 1.1765, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 0.019941463494425323, |
| "learning_rate": 3e-05, |
| "loss": 1.2627, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.94, |
| "grad_norm": 0.014559827444157727, |
| "learning_rate": 3e-05, |
| "loss": 1.2726, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.94, |
| "grad_norm": 0.015633910509735647, |
| "learning_rate": 3e-05, |
| "loss": 1.2062, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 0.014734762194709058, |
| "learning_rate": 3e-05, |
| "loss": 1.2472, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 0.014923021609054498, |
| "learning_rate": 3e-05, |
| "loss": 1.2648, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 0.01609643463392318, |
| "learning_rate": 3e-05, |
| "loss": 1.3097, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 0.015313840216276563, |
| "learning_rate": 3e-05, |
| "loss": 1.2462, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 0.014704752471017907, |
| "learning_rate": 3e-05, |
| "loss": 1.2852, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.97, |
| "grad_norm": 0.01565975211378067, |
| "learning_rate": 3e-05, |
| "loss": 1.2879, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.97, |
| "grad_norm": 0.014461185719430197, |
| "learning_rate": 3e-05, |
| "loss": 1.2475, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.98, |
| "grad_norm": 0.015730123295533558, |
| "learning_rate": 3e-05, |
| "loss": 1.3275, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.98, |
| "grad_norm": 0.014412110316894376, |
| "learning_rate": 3e-05, |
| "loss": 1.2307, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.98, |
| "grad_norm": 0.014929677218383007, |
| "learning_rate": 3e-05, |
| "loss": 1.2575, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.99, |
| "grad_norm": 0.015039383819204387, |
| "learning_rate": 3e-05, |
| "loss": 1.2906, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.99, |
| "grad_norm": 0.01682053180538288, |
| "learning_rate": 3e-05, |
| "loss": 1.2758, |
| "step": 239 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.014600617493742423, |
| "learning_rate": 3e-05, |
| "loss": 1.2972, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.01713356030477511, |
| "learning_rate": 3e-05, |
| "loss": 1.23, |
| "step": 241 |
| }, |
| { |
| "epoch": 1.01, |
| "grad_norm": 0.013713370678298656, |
| "learning_rate": 3e-05, |
| "loss": 1.2082, |
| "step": 242 |
| }, |
| { |
| "epoch": 1.01, |
| "grad_norm": 0.016029060499359975, |
| "learning_rate": 3e-05, |
| "loss": 1.2185, |
| "step": 243 |
| }, |
| { |
| "epoch": 1.01, |
| "grad_norm": 0.01455024577177708, |
| "learning_rate": 3e-05, |
| "loss": 1.3149, |
| "step": 244 |
| }, |
| { |
| "epoch": 1.02, |
| "grad_norm": 0.01596055912900773, |
| "learning_rate": 3e-05, |
| "loss": 1.2845, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.02, |
| "grad_norm": 0.015210011602171563, |
| "learning_rate": 3e-05, |
| "loss": 1.2585, |
| "step": 246 |
| }, |
| { |
| "epoch": 1.03, |
| "grad_norm": 0.015453757198715246, |
| "learning_rate": 3e-05, |
| "loss": 1.2693, |
| "step": 247 |
| }, |
| { |
| "epoch": 1.03, |
| "grad_norm": 0.01610457228670694, |
| "learning_rate": 3e-05, |
| "loss": 1.2811, |
| "step": 248 |
| }, |
| { |
| "epoch": 1.03, |
| "grad_norm": 0.015836760290716817, |
| "learning_rate": 3e-05, |
| "loss": 1.3358, |
| "step": 249 |
| }, |
| { |
| "epoch": 1.04, |
| "grad_norm": 0.015349071831974473, |
| "learning_rate": 3e-05, |
| "loss": 1.2107, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.04, |
| "grad_norm": 0.016164039133739174, |
| "learning_rate": 3e-05, |
| "loss": 1.2893, |
| "step": 251 |
| }, |
| { |
| "epoch": 1.05, |
| "grad_norm": 0.0159590525586263, |
| "learning_rate": 3e-05, |
| "loss": 1.2611, |
| "step": 252 |
| }, |
| { |
| "epoch": 1.05, |
| "grad_norm": 0.01565613651294786, |
| "learning_rate": 3e-05, |
| "loss": 1.2711, |
| "step": 253 |
| }, |
| { |
| "epoch": 1.06, |
| "grad_norm": 0.014721531061888017, |
| "learning_rate": 3e-05, |
| "loss": 1.2482, |
| "step": 254 |
| }, |
| { |
| "epoch": 1.06, |
| "grad_norm": 0.01753826817531843, |
| "learning_rate": 3e-05, |
| "loss": 1.2687, |
| "step": 255 |
| }, |
| { |
| "epoch": 1.06, |
| "grad_norm": 0.015353098066933114, |
| "learning_rate": 3e-05, |
| "loss": 1.2904, |
| "step": 256 |
| }, |
| { |
| "epoch": 1.07, |
| "grad_norm": 0.015204215112422145, |
| "learning_rate": 3e-05, |
| "loss": 1.2441, |
| "step": 257 |
| }, |
| { |
| "epoch": 1.07, |
| "grad_norm": 0.01627876478176434, |
| "learning_rate": 3e-05, |
| "loss": 1.2617, |
| "step": 258 |
| }, |
| { |
| "epoch": 1.08, |
| "grad_norm": 0.015173324308372222, |
| "learning_rate": 3e-05, |
| "loss": 1.2161, |
| "step": 259 |
| }, |
| { |
| "epoch": 1.08, |
| "grad_norm": 0.01821412576469944, |
| "learning_rate": 3e-05, |
| "loss": 1.2901, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.08, |
| "grad_norm": 0.06506686696796124, |
| "learning_rate": 3e-05, |
| "loss": 1.2874, |
| "step": 261 |
| }, |
| { |
| "epoch": 1.09, |
| "grad_norm": 0.01729616604201323, |
| "learning_rate": 3e-05, |
| "loss": 1.2356, |
| "step": 262 |
| }, |
| { |
| "epoch": 1.09, |
| "grad_norm": 0.01675033762473565, |
| "learning_rate": 3e-05, |
| "loss": 1.236, |
| "step": 263 |
| }, |
| { |
| "epoch": 1.1, |
| "grad_norm": 0.02192888065943148, |
| "learning_rate": 3e-05, |
| "loss": 1.2421, |
| "step": 264 |
| }, |
| { |
| "epoch": 1.1, |
| "grad_norm": 0.016455712121954427, |
| "learning_rate": 3e-05, |
| "loss": 1.247, |
| "step": 265 |
| }, |
| { |
| "epoch": 1.1, |
| "grad_norm": 0.01814348646038703, |
| "learning_rate": 3e-05, |
| "loss": 1.2485, |
| "step": 266 |
| }, |
| { |
| "epoch": 1.11, |
| "grad_norm": 0.01567451038250417, |
| "learning_rate": 3e-05, |
| "loss": 1.3521, |
| "step": 267 |
| }, |
| { |
| "epoch": 1.11, |
| "grad_norm": 0.016734589709319087, |
| "learning_rate": 3e-05, |
| "loss": 1.3147, |
| "step": 268 |
| }, |
| { |
| "epoch": 1.12, |
| "grad_norm": 0.01665607669544241, |
| "learning_rate": 3e-05, |
| "loss": 1.2557, |
| "step": 269 |
| }, |
| { |
| "epoch": 1.12, |
| "grad_norm": 0.015340602915616324, |
| "learning_rate": 3e-05, |
| "loss": 1.217, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.13, |
| "grad_norm": 0.015484796323673096, |
| "learning_rate": 3e-05, |
| "loss": 1.2644, |
| "step": 271 |
| }, |
| { |
| "epoch": 1.13, |
| "grad_norm": 0.01882586923200815, |
| "learning_rate": 3e-05, |
| "loss": 1.3565, |
| "step": 272 |
| }, |
| { |
| "epoch": 1.13, |
| "grad_norm": 0.01809392578664252, |
| "learning_rate": 3e-05, |
| "loss": 1.2218, |
| "step": 273 |
| }, |
| { |
| "epoch": 1.14, |
| "grad_norm": 0.01611243987811714, |
| "learning_rate": 3e-05, |
| "loss": 1.3112, |
| "step": 274 |
| }, |
| { |
| "epoch": 1.14, |
| "grad_norm": 0.015800275555133508, |
| "learning_rate": 3e-05, |
| "loss": 1.253, |
| "step": 275 |
| }, |
| { |
| "epoch": 1.15, |
| "grad_norm": 0.024573084585391664, |
| "learning_rate": 3e-05, |
| "loss": 1.3009, |
| "step": 276 |
| }, |
| { |
| "epoch": 1.15, |
| "grad_norm": 0.074452403888149, |
| "learning_rate": 3e-05, |
| "loss": 1.2592, |
| "step": 277 |
| }, |
| { |
| "epoch": 1.15, |
| "grad_norm": 0.017056904078373382, |
| "learning_rate": 3e-05, |
| "loss": 1.2774, |
| "step": 278 |
| }, |
| { |
| "epoch": 1.16, |
| "grad_norm": 0.01821583975502332, |
| "learning_rate": 3e-05, |
| "loss": 1.1952, |
| "step": 279 |
| }, |
| { |
| "epoch": 1.16, |
| "grad_norm": 0.015751526406141428, |
| "learning_rate": 3e-05, |
| "loss": 1.2365, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.17, |
| "grad_norm": 0.015753098191682944, |
| "learning_rate": 3e-05, |
| "loss": 1.2847, |
| "step": 281 |
| }, |
| { |
| "epoch": 1.17, |
| "grad_norm": 0.01588334509784229, |
| "learning_rate": 3e-05, |
| "loss": 1.3455, |
| "step": 282 |
| }, |
| { |
| "epoch": 1.18, |
| "grad_norm": 0.016152905035665025, |
| "learning_rate": 3e-05, |
| "loss": 1.2748, |
| "step": 283 |
| }, |
| { |
| "epoch": 1.18, |
| "grad_norm": 0.015193625382186535, |
| "learning_rate": 3e-05, |
| "loss": 1.2911, |
| "step": 284 |
| }, |
| { |
| "epoch": 1.18, |
| "grad_norm": 0.015901242732867416, |
| "learning_rate": 3e-05, |
| "loss": 1.2784, |
| "step": 285 |
| }, |
| { |
| "epoch": 1.19, |
| "grad_norm": 0.016971976726614598, |
| "learning_rate": 3e-05, |
| "loss": 1.2659, |
| "step": 286 |
| }, |
| { |
| "epoch": 1.19, |
| "grad_norm": 0.01612818830639207, |
| "learning_rate": 3e-05, |
| "loss": 1.2668, |
| "step": 287 |
| }, |
| { |
| "epoch": 1.2, |
| "grad_norm": 0.017395387712717487, |
| "learning_rate": 3e-05, |
| "loss": 1.3022, |
| "step": 288 |
| }, |
| { |
| "epoch": 1.2, |
| "grad_norm": 0.01731193234013385, |
| "learning_rate": 3e-05, |
| "loss": 1.2033, |
| "step": 289 |
| }, |
| { |
| "epoch": 1.2, |
| "grad_norm": 0.01651302780868134, |
| "learning_rate": 3e-05, |
| "loss": 1.251, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.21, |
| "grad_norm": 0.01646013792423402, |
| "learning_rate": 3e-05, |
| "loss": 1.3053, |
| "step": 291 |
| }, |
| { |
| "epoch": 1.21, |
| "grad_norm": 0.016605424458919063, |
| "learning_rate": 3e-05, |
| "loss": 1.2226, |
| "step": 292 |
| }, |
| { |
| "epoch": 1.22, |
| "grad_norm": 0.01642401242189507, |
| "learning_rate": 3e-05, |
| "loss": 1.2478, |
| "step": 293 |
| }, |
| { |
| "epoch": 1.22, |
| "grad_norm": 0.020526048019968234, |
| "learning_rate": 3e-05, |
| "loss": 1.2803, |
| "step": 294 |
| }, |
| { |
| "epoch": 1.23, |
| "grad_norm": 0.015357937815369777, |
| "learning_rate": 3e-05, |
| "loss": 1.3025, |
| "step": 295 |
| }, |
| { |
| "epoch": 1.23, |
| "grad_norm": 0.01661270914190639, |
| "learning_rate": 3e-05, |
| "loss": 1.297, |
| "step": 296 |
| }, |
| { |
| "epoch": 1.23, |
| "grad_norm": 0.017421945864776314, |
| "learning_rate": 3e-05, |
| "loss": 1.2958, |
| "step": 297 |
| }, |
| { |
| "epoch": 1.24, |
| "grad_norm": 0.017204965850903572, |
| "learning_rate": 3e-05, |
| "loss": 1.2675, |
| "step": 298 |
| }, |
| { |
| "epoch": 1.24, |
| "grad_norm": 0.016739239213025746, |
| "learning_rate": 3e-05, |
| "loss": 1.2631, |
| "step": 299 |
| }, |
| { |
| "epoch": 1.25, |
| "grad_norm": 0.015307923198409266, |
| "learning_rate": 3e-05, |
| "loss": 1.2664, |
| "step": 300 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 480, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 50, |
| "total_flos": 4174125975207936.0, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|