diff --git "a/checkpoint-700/trainer_state.json" "b/checkpoint-700/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-700/trainer_state.json" @@ -0,0 +1,4921 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.055607917059378, + "eval_steps": 500, + "global_step": 700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.210263192653656, + "learning_rate": 2.0100502512562817e-07, + "loss": 1.4183, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 0.24909919500350952, + "learning_rate": 4.0201005025125634e-07, + "loss": 1.4152, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 0.2290315181016922, + "learning_rate": 6.030150753768845e-07, + "loss": 1.3743, + "step": 3 + }, + { + "epoch": 0.01, + "grad_norm": 0.22113870084285736, + "learning_rate": 8.040201005025127e-07, + "loss": 1.4406, + "step": 4 + }, + { + "epoch": 0.01, + "grad_norm": 0.20976945757865906, + "learning_rate": 1.0050251256281409e-06, + "loss": 1.3628, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 0.21381014585494995, + "learning_rate": 1.206030150753769e-06, + "loss": 1.3938, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 0.19989733397960663, + "learning_rate": 1.407035175879397e-06, + "loss": 1.4001, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 0.20655588805675507, + "learning_rate": 1.6080402010050254e-06, + "loss": 1.3672, + "step": 8 + }, + { + "epoch": 0.01, + "grad_norm": 0.20128700137138367, + "learning_rate": 1.8090452261306535e-06, + "loss": 1.4108, + "step": 9 + }, + { + "epoch": 0.02, + "grad_norm": 0.1865876019001007, + "learning_rate": 2.0100502512562818e-06, + "loss": 1.3995, + "step": 10 + }, + { + "epoch": 0.02, + "grad_norm": 0.18870285153388977, + "learning_rate": 2.21105527638191e-06, + "loss": 1.4106, + "step": 11 + }, + { + "epoch": 0.02, + "grad_norm": 0.19342871010303497, + "learning_rate": 2.412060301507538e-06, + "loss": 1.4363, + "step": 12 + }, + { + "epoch": 0.02, + "grad_norm": 0.1840343475341797, + "learning_rate": 2.6130653266331663e-06, + "loss": 1.3765, + "step": 13 + }, + { + "epoch": 0.02, + "grad_norm": 0.18182729184627533, + "learning_rate": 2.814070351758794e-06, + "loss": 1.3622, + "step": 14 + }, + { + "epoch": 0.02, + "grad_norm": 0.1627752184867859, + "learning_rate": 3.015075376884422e-06, + "loss": 1.3835, + "step": 15 + }, + { + "epoch": 0.02, + "grad_norm": 0.19723327457904816, + "learning_rate": 3.2160804020100507e-06, + "loss": 1.3779, + "step": 16 + }, + { + "epoch": 0.03, + "grad_norm": 0.17026425898075104, + "learning_rate": 3.4170854271356786e-06, + "loss": 1.429, + "step": 17 + }, + { + "epoch": 0.03, + "grad_norm": 0.17131948471069336, + "learning_rate": 3.618090452261307e-06, + "loss": 1.351, + "step": 18 + }, + { + "epoch": 0.03, + "grad_norm": 0.23470793664455414, + "learning_rate": 3.819095477386935e-06, + "loss": 1.3856, + "step": 19 + }, + { + "epoch": 0.03, + "grad_norm": 0.1551675647497177, + "learning_rate": 4.0201005025125635e-06, + "loss": 1.3569, + "step": 20 + }, + { + "epoch": 0.03, + "grad_norm": 0.15824469923973083, + "learning_rate": 4.221105527638191e-06, + "loss": 1.3878, + "step": 21 + }, + { + "epoch": 0.03, + "grad_norm": 0.14669974148273468, + "learning_rate": 4.42211055276382e-06, + "loss": 1.3648, + "step": 22 + }, + { + "epoch": 0.03, + "grad_norm": 0.14185361564159393, + "learning_rate": 4.623115577889448e-06, + "loss": 1.4757, + "step": 23 + }, + { + "epoch": 0.04, + "grad_norm": 0.13700777292251587, + "learning_rate": 4.824120603015076e-06, + "loss": 1.3496, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 0.1328812837600708, + "learning_rate": 5.025125628140704e-06, + "loss": 1.3425, + "step": 25 + }, + { + "epoch": 0.04, + "grad_norm": 0.12427424639463425, + "learning_rate": 5.2261306532663325e-06, + "loss": 1.3551, + "step": 26 + }, + { + "epoch": 0.04, + "grad_norm": 0.1253511607646942, + "learning_rate": 5.42713567839196e-06, + "loss": 1.3634, + "step": 27 + }, + { + "epoch": 0.04, + "grad_norm": 0.12026986479759216, + "learning_rate": 5.628140703517588e-06, + "loss": 1.4022, + "step": 28 + }, + { + "epoch": 0.04, + "grad_norm": 0.13982507586479187, + "learning_rate": 5.829145728643216e-06, + "loss": 1.4141, + "step": 29 + }, + { + "epoch": 0.05, + "grad_norm": 0.18250446021556854, + "learning_rate": 6.030150753768844e-06, + "loss": 1.3279, + "step": 30 + }, + { + "epoch": 0.05, + "grad_norm": 0.12397727370262146, + "learning_rate": 6.231155778894474e-06, + "loss": 1.3424, + "step": 31 + }, + { + "epoch": 0.05, + "grad_norm": 0.12383382022380829, + "learning_rate": 6.4321608040201015e-06, + "loss": 1.3985, + "step": 32 + }, + { + "epoch": 0.05, + "grad_norm": 0.11650067567825317, + "learning_rate": 6.633165829145729e-06, + "loss": 1.3692, + "step": 33 + }, + { + "epoch": 0.05, + "grad_norm": 0.11204084008932114, + "learning_rate": 6.834170854271357e-06, + "loss": 1.3203, + "step": 34 + }, + { + "epoch": 0.05, + "grad_norm": 0.13849350810050964, + "learning_rate": 7.035175879396986e-06, + "loss": 1.4094, + "step": 35 + }, + { + "epoch": 0.05, + "grad_norm": 0.1125435009598732, + "learning_rate": 7.236180904522614e-06, + "loss": 1.3705, + "step": 36 + }, + { + "epoch": 0.06, + "grad_norm": 0.10300172120332718, + "learning_rate": 7.437185929648242e-06, + "loss": 1.351, + "step": 37 + }, + { + "epoch": 0.06, + "grad_norm": 0.11846320331096649, + "learning_rate": 7.63819095477387e-06, + "loss": 1.3901, + "step": 38 + }, + { + "epoch": 0.06, + "grad_norm": 0.10557560622692108, + "learning_rate": 7.839195979899498e-06, + "loss": 1.3445, + "step": 39 + }, + { + "epoch": 0.06, + "grad_norm": 0.1127225011587143, + "learning_rate": 8.040201005025127e-06, + "loss": 1.4358, + "step": 40 + }, + { + "epoch": 0.06, + "grad_norm": 0.1260991394519806, + "learning_rate": 8.241206030150754e-06, + "loss": 1.3948, + "step": 41 + }, + { + "epoch": 0.06, + "grad_norm": 0.10526841133832932, + "learning_rate": 8.442211055276383e-06, + "loss": 1.2704, + "step": 42 + }, + { + "epoch": 0.06, + "grad_norm": 1.0842941999435425, + "learning_rate": 8.64321608040201e-06, + "loss": 1.2788, + "step": 43 + }, + { + "epoch": 0.07, + "grad_norm": 0.11623168736696243, + "learning_rate": 8.84422110552764e-06, + "loss": 1.3541, + "step": 44 + }, + { + "epoch": 0.07, + "grad_norm": 0.12414997816085815, + "learning_rate": 9.045226130653267e-06, + "loss": 1.2478, + "step": 45 + }, + { + "epoch": 0.07, + "grad_norm": 0.16141410171985626, + "learning_rate": 9.246231155778896e-06, + "loss": 1.3026, + "step": 46 + }, + { + "epoch": 0.07, + "grad_norm": 0.12323018908500671, + "learning_rate": 9.447236180904523e-06, + "loss": 1.3331, + "step": 47 + }, + { + "epoch": 0.07, + "grad_norm": 0.11293739825487137, + "learning_rate": 9.648241206030152e-06, + "loss": 1.3711, + "step": 48 + }, + { + "epoch": 0.07, + "grad_norm": 0.28998640179634094, + "learning_rate": 9.84924623115578e-06, + "loss": 1.3316, + "step": 49 + }, + { + "epoch": 0.08, + "grad_norm": 0.10780947655439377, + "learning_rate": 1.0050251256281408e-05, + "loss": 1.3175, + "step": 50 + }, + { + "epoch": 0.08, + "grad_norm": 0.11442453414201736, + "learning_rate": 1.0251256281407036e-05, + "loss": 1.3146, + "step": 51 + }, + { + "epoch": 0.08, + "grad_norm": 0.10730454325675964, + "learning_rate": 1.0452261306532665e-05, + "loss": 1.3401, + "step": 52 + }, + { + "epoch": 0.08, + "grad_norm": 0.16564249992370605, + "learning_rate": 1.0653266331658292e-05, + "loss": 1.3425, + "step": 53 + }, + { + "epoch": 0.08, + "grad_norm": 0.10222669690847397, + "learning_rate": 1.085427135678392e-05, + "loss": 1.227, + "step": 54 + }, + { + "epoch": 0.08, + "grad_norm": 0.10880003124475479, + "learning_rate": 1.1055276381909548e-05, + "loss": 1.2967, + "step": 55 + }, + { + "epoch": 0.08, + "grad_norm": 1.0954923629760742, + "learning_rate": 1.1256281407035177e-05, + "loss": 1.4358, + "step": 56 + }, + { + "epoch": 0.09, + "grad_norm": 0.11673203110694885, + "learning_rate": 1.1457286432160805e-05, + "loss": 1.3099, + "step": 57 + }, + { + "epoch": 0.09, + "grad_norm": 0.1613866239786148, + "learning_rate": 1.1658291457286432e-05, + "loss": 1.3468, + "step": 58 + }, + { + "epoch": 0.09, + "grad_norm": 0.12227887660264969, + "learning_rate": 1.1859296482412061e-05, + "loss": 1.4211, + "step": 59 + }, + { + "epoch": 0.09, + "grad_norm": 0.11897600442171097, + "learning_rate": 1.2060301507537688e-05, + "loss": 1.3768, + "step": 60 + }, + { + "epoch": 0.09, + "grad_norm": 0.10784732550382614, + "learning_rate": 1.2261306532663317e-05, + "loss": 1.324, + "step": 61 + }, + { + "epoch": 0.09, + "grad_norm": 0.11652999371290207, + "learning_rate": 1.2462311557788947e-05, + "loss": 1.3655, + "step": 62 + }, + { + "epoch": 0.1, + "grad_norm": 0.11904505640268326, + "learning_rate": 1.2663316582914573e-05, + "loss": 1.3891, + "step": 63 + }, + { + "epoch": 0.1, + "grad_norm": 0.11254779994487762, + "learning_rate": 1.2864321608040203e-05, + "loss": 1.2749, + "step": 64 + }, + { + "epoch": 0.1, + "grad_norm": 0.11637745797634125, + "learning_rate": 1.3065326633165832e-05, + "loss": 1.3365, + "step": 65 + }, + { + "epoch": 0.1, + "grad_norm": 0.11625081300735474, + "learning_rate": 1.3266331658291459e-05, + "loss": 1.2813, + "step": 66 + }, + { + "epoch": 0.1, + "grad_norm": 0.11456788331270218, + "learning_rate": 1.3467336683417087e-05, + "loss": 1.3131, + "step": 67 + }, + { + "epoch": 0.1, + "grad_norm": 0.1079697236418724, + "learning_rate": 1.3668341708542715e-05, + "loss": 1.2449, + "step": 68 + }, + { + "epoch": 0.1, + "grad_norm": 0.11469058692455292, + "learning_rate": 1.3869346733668343e-05, + "loss": 1.2179, + "step": 69 + }, + { + "epoch": 0.11, + "grad_norm": 0.11703373491764069, + "learning_rate": 1.4070351758793972e-05, + "loss": 1.2839, + "step": 70 + }, + { + "epoch": 0.11, + "grad_norm": 0.10526423901319504, + "learning_rate": 1.4271356783919599e-05, + "loss": 1.2923, + "step": 71 + }, + { + "epoch": 0.11, + "grad_norm": 0.11774110794067383, + "learning_rate": 1.4472361809045228e-05, + "loss": 1.4061, + "step": 72 + }, + { + "epoch": 0.11, + "grad_norm": 0.11455068737268448, + "learning_rate": 1.4673366834170855e-05, + "loss": 1.4107, + "step": 73 + }, + { + "epoch": 0.11, + "grad_norm": 0.12205720692873001, + "learning_rate": 1.4874371859296483e-05, + "loss": 1.2833, + "step": 74 + }, + { + "epoch": 0.11, + "grad_norm": 0.12115150690078735, + "learning_rate": 1.5075376884422112e-05, + "loss": 1.316, + "step": 75 + }, + { + "epoch": 0.11, + "grad_norm": 0.1174568384885788, + "learning_rate": 1.527638190954774e-05, + "loss": 1.3618, + "step": 76 + }, + { + "epoch": 0.12, + "grad_norm": 0.1132812350988388, + "learning_rate": 1.547738693467337e-05, + "loss": 1.3542, + "step": 77 + }, + { + "epoch": 0.12, + "grad_norm": 0.10755915939807892, + "learning_rate": 1.5678391959798997e-05, + "loss": 1.3555, + "step": 78 + }, + { + "epoch": 0.12, + "grad_norm": 0.12377703189849854, + "learning_rate": 1.5879396984924624e-05, + "loss": 1.3475, + "step": 79 + }, + { + "epoch": 0.12, + "grad_norm": 0.13303188979625702, + "learning_rate": 1.6080402010050254e-05, + "loss": 1.3049, + "step": 80 + }, + { + "epoch": 0.12, + "grad_norm": 0.11295345425605774, + "learning_rate": 1.628140703517588e-05, + "loss": 1.2913, + "step": 81 + }, + { + "epoch": 0.12, + "grad_norm": 0.10876049846410751, + "learning_rate": 1.6482412060301508e-05, + "loss": 1.3252, + "step": 82 + }, + { + "epoch": 0.13, + "grad_norm": 0.1045972928404808, + "learning_rate": 1.6683417085427135e-05, + "loss": 1.333, + "step": 83 + }, + { + "epoch": 0.13, + "grad_norm": 0.12309195101261139, + "learning_rate": 1.6884422110552766e-05, + "loss": 1.3179, + "step": 84 + }, + { + "epoch": 0.13, + "grad_norm": 0.1403413861989975, + "learning_rate": 1.7085427135678393e-05, + "loss": 1.3049, + "step": 85 + }, + { + "epoch": 0.13, + "grad_norm": 0.13645939528942108, + "learning_rate": 1.728643216080402e-05, + "loss": 1.361, + "step": 86 + }, + { + "epoch": 0.13, + "grad_norm": 0.14400267601013184, + "learning_rate": 1.748743718592965e-05, + "loss": 1.3078, + "step": 87 + }, + { + "epoch": 0.13, + "grad_norm": 0.11410026252269745, + "learning_rate": 1.768844221105528e-05, + "loss": 1.3697, + "step": 88 + }, + { + "epoch": 0.13, + "grad_norm": 0.11151797324419022, + "learning_rate": 1.7889447236180904e-05, + "loss": 1.3144, + "step": 89 + }, + { + "epoch": 0.14, + "grad_norm": 0.13003084063529968, + "learning_rate": 1.8090452261306535e-05, + "loss": 1.3546, + "step": 90 + }, + { + "epoch": 0.14, + "grad_norm": 0.1052040383219719, + "learning_rate": 1.829145728643216e-05, + "loss": 1.2748, + "step": 91 + }, + { + "epoch": 0.14, + "grad_norm": 0.10659239441156387, + "learning_rate": 1.8492462311557792e-05, + "loss": 1.2383, + "step": 92 + }, + { + "epoch": 0.14, + "grad_norm": 0.11741909384727478, + "learning_rate": 1.869346733668342e-05, + "loss": 1.4332, + "step": 93 + }, + { + "epoch": 0.14, + "grad_norm": 0.11060652136802673, + "learning_rate": 1.8894472361809046e-05, + "loss": 1.295, + "step": 94 + }, + { + "epoch": 0.14, + "grad_norm": 0.10553266108036041, + "learning_rate": 1.9095477386934677e-05, + "loss": 1.2388, + "step": 95 + }, + { + "epoch": 0.14, + "grad_norm": 0.11126605421304703, + "learning_rate": 1.9296482412060304e-05, + "loss": 1.2539, + "step": 96 + }, + { + "epoch": 0.15, + "grad_norm": 0.14795471727848053, + "learning_rate": 1.949748743718593e-05, + "loss": 1.34, + "step": 97 + }, + { + "epoch": 0.15, + "grad_norm": 0.12964512407779694, + "learning_rate": 1.969849246231156e-05, + "loss": 1.3589, + "step": 98 + }, + { + "epoch": 0.15, + "grad_norm": 0.1271078735589981, + "learning_rate": 1.9899497487437188e-05, + "loss": 1.3351, + "step": 99 + }, + { + "epoch": 0.15, + "grad_norm": 0.16622687876224518, + "learning_rate": 2.0100502512562815e-05, + "loss": 1.2137, + "step": 100 + }, + { + "epoch": 0.15, + "grad_norm": 0.10592926293611526, + "learning_rate": 2.0301507537688442e-05, + "loss": 1.2803, + "step": 101 + }, + { + "epoch": 0.15, + "grad_norm": 0.11464520543813705, + "learning_rate": 2.0502512562814073e-05, + "loss": 1.3636, + "step": 102 + }, + { + "epoch": 0.16, + "grad_norm": 0.11597637087106705, + "learning_rate": 2.07035175879397e-05, + "loss": 1.3655, + "step": 103 + }, + { + "epoch": 0.16, + "grad_norm": 0.1618194431066513, + "learning_rate": 2.090452261306533e-05, + "loss": 1.4171, + "step": 104 + }, + { + "epoch": 0.16, + "grad_norm": 0.2717505097389221, + "learning_rate": 2.1105527638190954e-05, + "loss": 1.3915, + "step": 105 + }, + { + "epoch": 0.16, + "grad_norm": 0.10637391358613968, + "learning_rate": 2.1306532663316584e-05, + "loss": 1.2644, + "step": 106 + }, + { + "epoch": 0.16, + "grad_norm": 0.12094827741384506, + "learning_rate": 2.150753768844221e-05, + "loss": 1.3708, + "step": 107 + }, + { + "epoch": 0.16, + "grad_norm": 0.1092429831624031, + "learning_rate": 2.170854271356784e-05, + "loss": 1.2942, + "step": 108 + }, + { + "epoch": 0.16, + "grad_norm": 0.1192881315946579, + "learning_rate": 2.1909547738693472e-05, + "loss": 1.3798, + "step": 109 + }, + { + "epoch": 0.17, + "grad_norm": 0.12474899739027023, + "learning_rate": 2.2110552763819096e-05, + "loss": 1.326, + "step": 110 + }, + { + "epoch": 0.17, + "grad_norm": 0.15612299740314484, + "learning_rate": 2.2311557788944723e-05, + "loss": 1.3614, + "step": 111 + }, + { + "epoch": 0.17, + "grad_norm": 0.12052815407514572, + "learning_rate": 2.2512562814070353e-05, + "loss": 1.2885, + "step": 112 + }, + { + "epoch": 0.17, + "grad_norm": 0.11309295147657394, + "learning_rate": 2.2713567839195984e-05, + "loss": 1.3322, + "step": 113 + }, + { + "epoch": 0.17, + "grad_norm": 0.19794340431690216, + "learning_rate": 2.291457286432161e-05, + "loss": 1.3053, + "step": 114 + }, + { + "epoch": 0.17, + "grad_norm": 0.10213331878185272, + "learning_rate": 2.3115577889447238e-05, + "loss": 1.2523, + "step": 115 + }, + { + "epoch": 0.17, + "grad_norm": 0.10369960963726044, + "learning_rate": 2.3316582914572865e-05, + "loss": 1.2962, + "step": 116 + }, + { + "epoch": 0.18, + "grad_norm": 0.11149720847606659, + "learning_rate": 2.3517587939698495e-05, + "loss": 1.3309, + "step": 117 + }, + { + "epoch": 0.18, + "grad_norm": 0.10644638538360596, + "learning_rate": 2.3718592964824122e-05, + "loss": 1.3246, + "step": 118 + }, + { + "epoch": 0.18, + "grad_norm": 0.1313220113515854, + "learning_rate": 2.3919597989949753e-05, + "loss": 1.3743, + "step": 119 + }, + { + "epoch": 0.18, + "grad_norm": 0.1134113296866417, + "learning_rate": 2.4120603015075376e-05, + "loss": 1.3282, + "step": 120 + }, + { + "epoch": 0.18, + "grad_norm": 0.11324608325958252, + "learning_rate": 2.4321608040201007e-05, + "loss": 1.3839, + "step": 121 + }, + { + "epoch": 0.18, + "grad_norm": 0.12287817150354385, + "learning_rate": 2.4522613065326634e-05, + "loss": 1.3144, + "step": 122 + }, + { + "epoch": 0.19, + "grad_norm": 0.10702131688594818, + "learning_rate": 2.4723618090452264e-05, + "loss": 1.3413, + "step": 123 + }, + { + "epoch": 0.19, + "grad_norm": 0.09920056164264679, + "learning_rate": 2.4924623115577894e-05, + "loss": 1.2599, + "step": 124 + }, + { + "epoch": 0.19, + "grad_norm": 0.10186468809843063, + "learning_rate": 2.512562814070352e-05, + "loss": 1.2778, + "step": 125 + }, + { + "epoch": 0.19, + "grad_norm": 0.11593952029943466, + "learning_rate": 2.5326633165829145e-05, + "loss": 1.3145, + "step": 126 + }, + { + "epoch": 0.19, + "grad_norm": 0.10868196189403534, + "learning_rate": 2.5527638190954776e-05, + "loss": 1.2797, + "step": 127 + }, + { + "epoch": 0.19, + "grad_norm": 0.11534610390663147, + "learning_rate": 2.5728643216080406e-05, + "loss": 1.3008, + "step": 128 + }, + { + "epoch": 0.19, + "grad_norm": 0.26320281624794006, + "learning_rate": 2.5929648241206033e-05, + "loss": 1.3269, + "step": 129 + }, + { + "epoch": 0.2, + "grad_norm": 0.12328831851482391, + "learning_rate": 2.6130653266331663e-05, + "loss": 1.4226, + "step": 130 + }, + { + "epoch": 0.2, + "grad_norm": 0.12117883563041687, + "learning_rate": 2.6331658291457287e-05, + "loss": 1.2847, + "step": 131 + }, + { + "epoch": 0.2, + "grad_norm": 0.10399331897497177, + "learning_rate": 2.6532663316582917e-05, + "loss": 1.3781, + "step": 132 + }, + { + "epoch": 0.2, + "grad_norm": 0.10431778430938721, + "learning_rate": 2.6733668341708545e-05, + "loss": 1.2756, + "step": 133 + }, + { + "epoch": 0.2, + "grad_norm": 0.09728407114744186, + "learning_rate": 2.6934673366834175e-05, + "loss": 1.2437, + "step": 134 + }, + { + "epoch": 0.2, + "grad_norm": 0.1047501266002655, + "learning_rate": 2.7135678391959802e-05, + "loss": 1.297, + "step": 135 + }, + { + "epoch": 0.21, + "grad_norm": 0.10363683104515076, + "learning_rate": 2.733668341708543e-05, + "loss": 1.3615, + "step": 136 + }, + { + "epoch": 0.21, + "grad_norm": 0.10048768669366837, + "learning_rate": 2.7537688442211056e-05, + "loss": 1.313, + "step": 137 + }, + { + "epoch": 0.21, + "grad_norm": 0.10527737438678741, + "learning_rate": 2.7738693467336686e-05, + "loss": 1.3972, + "step": 138 + }, + { + "epoch": 0.21, + "grad_norm": 0.17517946660518646, + "learning_rate": 2.7939698492462314e-05, + "loss": 1.3066, + "step": 139 + }, + { + "epoch": 0.21, + "grad_norm": 0.10221141576766968, + "learning_rate": 2.8140703517587944e-05, + "loss": 1.3255, + "step": 140 + }, + { + "epoch": 0.21, + "grad_norm": 0.11932506412267685, + "learning_rate": 2.8341708542713568e-05, + "loss": 1.363, + "step": 141 + }, + { + "epoch": 0.21, + "grad_norm": 0.10982365906238556, + "learning_rate": 2.8542713567839198e-05, + "loss": 1.3888, + "step": 142 + }, + { + "epoch": 0.22, + "grad_norm": 0.10286767780780792, + "learning_rate": 2.8743718592964825e-05, + "loss": 1.3519, + "step": 143 + }, + { + "epoch": 0.22, + "grad_norm": 0.09760483354330063, + "learning_rate": 2.8944723618090455e-05, + "loss": 1.332, + "step": 144 + }, + { + "epoch": 0.22, + "grad_norm": 0.09790109097957611, + "learning_rate": 2.9145728643216086e-05, + "loss": 1.2453, + "step": 145 + }, + { + "epoch": 0.22, + "grad_norm": 0.1019718199968338, + "learning_rate": 2.934673366834171e-05, + "loss": 1.3745, + "step": 146 + }, + { + "epoch": 0.22, + "grad_norm": 0.10731212049722672, + "learning_rate": 2.954773869346734e-05, + "loss": 1.2644, + "step": 147 + }, + { + "epoch": 0.22, + "grad_norm": 0.10532107204198837, + "learning_rate": 2.9748743718592967e-05, + "loss": 1.3438, + "step": 148 + }, + { + "epoch": 0.22, + "grad_norm": 0.22097086906433105, + "learning_rate": 2.9949748743718597e-05, + "loss": 1.3349, + "step": 149 + }, + { + "epoch": 0.23, + "grad_norm": 0.09888072311878204, + "learning_rate": 3.0150753768844224e-05, + "loss": 1.2711, + "step": 150 + }, + { + "epoch": 0.23, + "grad_norm": 0.1759546548128128, + "learning_rate": 3.035175879396985e-05, + "loss": 1.319, + "step": 151 + }, + { + "epoch": 0.23, + "grad_norm": 0.10204854607582092, + "learning_rate": 3.055276381909548e-05, + "loss": 1.3303, + "step": 152 + }, + { + "epoch": 0.23, + "grad_norm": 0.11660198122262955, + "learning_rate": 3.0753768844221106e-05, + "loss": 1.2966, + "step": 153 + }, + { + "epoch": 0.23, + "grad_norm": 0.09345217794179916, + "learning_rate": 3.095477386934674e-05, + "loss": 1.2801, + "step": 154 + }, + { + "epoch": 0.23, + "grad_norm": 0.1068679466843605, + "learning_rate": 3.1155778894472366e-05, + "loss": 1.3135, + "step": 155 + }, + { + "epoch": 0.24, + "grad_norm": 0.10100909322500229, + "learning_rate": 3.1356783919597993e-05, + "loss": 1.321, + "step": 156 + }, + { + "epoch": 0.24, + "grad_norm": 0.09783848375082016, + "learning_rate": 3.155778894472362e-05, + "loss": 1.2777, + "step": 157 + }, + { + "epoch": 0.24, + "grad_norm": 0.09745295345783234, + "learning_rate": 3.175879396984925e-05, + "loss": 1.3265, + "step": 158 + }, + { + "epoch": 0.24, + "grad_norm": 0.11172570288181305, + "learning_rate": 3.1959798994974875e-05, + "loss": 1.3462, + "step": 159 + }, + { + "epoch": 0.24, + "grad_norm": 0.09654249995946884, + "learning_rate": 3.216080402010051e-05, + "loss": 1.2953, + "step": 160 + }, + { + "epoch": 0.24, + "grad_norm": 0.09633474797010422, + "learning_rate": 3.236180904522613e-05, + "loss": 1.3353, + "step": 161 + }, + { + "epoch": 0.24, + "grad_norm": 0.09879904240369797, + "learning_rate": 3.256281407035176e-05, + "loss": 1.2828, + "step": 162 + }, + { + "epoch": 0.25, + "grad_norm": 0.09953232854604721, + "learning_rate": 3.276381909547739e-05, + "loss": 1.34, + "step": 163 + }, + { + "epoch": 0.25, + "grad_norm": 0.10357166826725006, + "learning_rate": 3.2964824120603016e-05, + "loss": 1.2813, + "step": 164 + }, + { + "epoch": 0.25, + "grad_norm": 0.10495917499065399, + "learning_rate": 3.316582914572865e-05, + "loss": 1.3169, + "step": 165 + }, + { + "epoch": 0.25, + "grad_norm": 0.0949726328253746, + "learning_rate": 3.336683417085427e-05, + "loss": 1.3016, + "step": 166 + }, + { + "epoch": 0.25, + "grad_norm": 0.1002039760351181, + "learning_rate": 3.3567839195979904e-05, + "loss": 1.3942, + "step": 167 + }, + { + "epoch": 0.25, + "grad_norm": 0.09216566383838654, + "learning_rate": 3.376884422110553e-05, + "loss": 1.2328, + "step": 168 + }, + { + "epoch": 0.25, + "grad_norm": 0.09849084168672562, + "learning_rate": 3.396984924623116e-05, + "loss": 1.3503, + "step": 169 + }, + { + "epoch": 0.26, + "grad_norm": 0.09866531193256378, + "learning_rate": 3.4170854271356785e-05, + "loss": 1.2544, + "step": 170 + }, + { + "epoch": 0.26, + "grad_norm": 0.10076246410608292, + "learning_rate": 3.437185929648241e-05, + "loss": 1.2835, + "step": 171 + }, + { + "epoch": 0.26, + "grad_norm": 0.09949547797441483, + "learning_rate": 3.457286432160804e-05, + "loss": 1.3232, + "step": 172 + }, + { + "epoch": 0.26, + "grad_norm": 0.09655594825744629, + "learning_rate": 3.477386934673367e-05, + "loss": 1.2828, + "step": 173 + }, + { + "epoch": 0.26, + "grad_norm": 0.44782304763793945, + "learning_rate": 3.49748743718593e-05, + "loss": 1.2887, + "step": 174 + }, + { + "epoch": 0.26, + "grad_norm": 0.09793172776699066, + "learning_rate": 3.517587939698493e-05, + "loss": 1.2415, + "step": 175 + }, + { + "epoch": 0.27, + "grad_norm": 0.1036762222647667, + "learning_rate": 3.537688442211056e-05, + "loss": 1.2714, + "step": 176 + }, + { + "epoch": 0.27, + "grad_norm": 0.19909432530403137, + "learning_rate": 3.557788944723618e-05, + "loss": 1.315, + "step": 177 + }, + { + "epoch": 0.27, + "grad_norm": 0.09891911596059799, + "learning_rate": 3.577889447236181e-05, + "loss": 1.2656, + "step": 178 + }, + { + "epoch": 0.27, + "grad_norm": 0.09634336829185486, + "learning_rate": 3.597989949748744e-05, + "loss": 1.3001, + "step": 179 + }, + { + "epoch": 0.27, + "grad_norm": 0.09586773067712784, + "learning_rate": 3.618090452261307e-05, + "loss": 1.4021, + "step": 180 + }, + { + "epoch": 0.27, + "grad_norm": 0.11308431625366211, + "learning_rate": 3.6381909547738696e-05, + "loss": 1.3212, + "step": 181 + }, + { + "epoch": 0.27, + "grad_norm": 0.09527049213647842, + "learning_rate": 3.658291457286432e-05, + "loss": 1.3211, + "step": 182 + }, + { + "epoch": 0.28, + "grad_norm": 0.10926982015371323, + "learning_rate": 3.678391959798995e-05, + "loss": 1.3242, + "step": 183 + }, + { + "epoch": 0.28, + "grad_norm": 0.10241182148456573, + "learning_rate": 3.6984924623115584e-05, + "loss": 1.3411, + "step": 184 + }, + { + "epoch": 0.28, + "grad_norm": 0.11616214364767075, + "learning_rate": 3.718592964824121e-05, + "loss": 1.3032, + "step": 185 + }, + { + "epoch": 0.28, + "grad_norm": 0.09690311551094055, + "learning_rate": 3.738693467336684e-05, + "loss": 1.3796, + "step": 186 + }, + { + "epoch": 0.28, + "grad_norm": 0.09707053750753403, + "learning_rate": 3.7587939698492465e-05, + "loss": 1.3441, + "step": 187 + }, + { + "epoch": 0.28, + "grad_norm": 0.09699778258800507, + "learning_rate": 3.778894472361809e-05, + "loss": 1.3808, + "step": 188 + }, + { + "epoch": 0.29, + "grad_norm": 0.0989750325679779, + "learning_rate": 3.798994974874372e-05, + "loss": 1.2693, + "step": 189 + }, + { + "epoch": 0.29, + "grad_norm": 0.09727182239294052, + "learning_rate": 3.819095477386935e-05, + "loss": 1.3243, + "step": 190 + }, + { + "epoch": 0.29, + "grad_norm": 0.10256370157003403, + "learning_rate": 3.839195979899498e-05, + "loss": 1.3211, + "step": 191 + }, + { + "epoch": 0.29, + "grad_norm": 0.10001495480537415, + "learning_rate": 3.859296482412061e-05, + "loss": 1.2681, + "step": 192 + }, + { + "epoch": 0.29, + "grad_norm": 0.10139815509319305, + "learning_rate": 3.8793969849246234e-05, + "loss": 1.332, + "step": 193 + }, + { + "epoch": 0.29, + "grad_norm": 0.10533817112445831, + "learning_rate": 3.899497487437186e-05, + "loss": 1.3703, + "step": 194 + }, + { + "epoch": 0.29, + "grad_norm": 0.0879909098148346, + "learning_rate": 3.919597989949749e-05, + "loss": 1.2355, + "step": 195 + }, + { + "epoch": 0.3, + "grad_norm": 0.09673386067152023, + "learning_rate": 3.939698492462312e-05, + "loss": 1.258, + "step": 196 + }, + { + "epoch": 0.3, + "grad_norm": 0.09318586438894272, + "learning_rate": 3.959798994974874e-05, + "loss": 1.3312, + "step": 197 + }, + { + "epoch": 0.3, + "grad_norm": 0.10109449177980423, + "learning_rate": 3.9798994974874376e-05, + "loss": 1.2697, + "step": 198 + }, + { + "epoch": 0.3, + "grad_norm": 0.11375732719898224, + "learning_rate": 4e-05, + "loss": 1.3485, + "step": 199 + }, + { + "epoch": 0.3, + "grad_norm": 0.12646842002868652, + "learning_rate": 3.999996919696056e-05, + "loss": 1.3475, + "step": 200 + }, + { + "epoch": 0.3, + "grad_norm": 0.09267274290323257, + "learning_rate": 3.9999876787937116e-05, + "loss": 1.3154, + "step": 201 + }, + { + "epoch": 0.3, + "grad_norm": 0.08901134133338928, + "learning_rate": 3.999972277321432e-05, + "loss": 1.2643, + "step": 202 + }, + { + "epoch": 0.31, + "grad_norm": 0.0929809957742691, + "learning_rate": 3.999950715326658e-05, + "loss": 1.235, + "step": 203 + }, + { + "epoch": 0.31, + "grad_norm": 0.09264566004276276, + "learning_rate": 3.9999229928758075e-05, + "loss": 1.3377, + "step": 204 + }, + { + "epoch": 0.31, + "grad_norm": 0.10625364631414413, + "learning_rate": 3.999889110054274e-05, + "loss": 1.3187, + "step": 205 + }, + { + "epoch": 0.31, + "grad_norm": 0.09376952052116394, + "learning_rate": 3.999849066966427e-05, + "loss": 1.3649, + "step": 206 + }, + { + "epoch": 0.31, + "grad_norm": 0.09811846166849136, + "learning_rate": 3.999802863735611e-05, + "loss": 1.3262, + "step": 207 + }, + { + "epoch": 0.31, + "grad_norm": 0.09455687552690506, + "learning_rate": 3.999750500504146e-05, + "loss": 1.3481, + "step": 208 + }, + { + "epoch": 0.32, + "grad_norm": 0.10669883340597153, + "learning_rate": 3.999691977433327e-05, + "loss": 1.2883, + "step": 209 + }, + { + "epoch": 0.32, + "grad_norm": 0.09470435231924057, + "learning_rate": 3.9996272947034225e-05, + "loss": 1.3461, + "step": 210 + }, + { + "epoch": 0.32, + "grad_norm": 0.09564365446567535, + "learning_rate": 3.999556452513676e-05, + "loss": 1.3361, + "step": 211 + }, + { + "epoch": 0.32, + "grad_norm": 0.09272690862417221, + "learning_rate": 3.9994794510823015e-05, + "loss": 1.3515, + "step": 212 + }, + { + "epoch": 0.32, + "grad_norm": 0.10989698022603989, + "learning_rate": 3.999396290646487e-05, + "loss": 1.2228, + "step": 213 + }, + { + "epoch": 0.32, + "grad_norm": 0.09243155270814896, + "learning_rate": 3.9993069714623934e-05, + "loss": 1.3532, + "step": 214 + }, + { + "epoch": 0.32, + "grad_norm": 0.0882272869348526, + "learning_rate": 3.999211493805149e-05, + "loss": 1.276, + "step": 215 + }, + { + "epoch": 0.33, + "grad_norm": 0.08949492126703262, + "learning_rate": 3.999109857968855e-05, + "loss": 1.2981, + "step": 216 + }, + { + "epoch": 0.33, + "grad_norm": 0.0926244854927063, + "learning_rate": 3.9990020642665815e-05, + "loss": 1.3382, + "step": 217 + }, + { + "epoch": 0.33, + "grad_norm": 0.08923252671957016, + "learning_rate": 3.998888113030364e-05, + "loss": 1.2973, + "step": 218 + }, + { + "epoch": 0.33, + "grad_norm": 0.09143295139074326, + "learning_rate": 3.998768004611209e-05, + "loss": 1.284, + "step": 219 + }, + { + "epoch": 0.33, + "grad_norm": 0.09087208658456802, + "learning_rate": 3.998641739379085e-05, + "loss": 1.3586, + "step": 220 + }, + { + "epoch": 0.33, + "grad_norm": 0.10466624051332474, + "learning_rate": 3.9985093177229276e-05, + "loss": 1.3206, + "step": 221 + }, + { + "epoch": 0.33, + "grad_norm": 0.09122570604085922, + "learning_rate": 3.998370740050638e-05, + "loss": 1.2804, + "step": 222 + }, + { + "epoch": 0.34, + "grad_norm": 0.1117967963218689, + "learning_rate": 3.9982260067890737e-05, + "loss": 1.3034, + "step": 223 + }, + { + "epoch": 0.34, + "grad_norm": 0.09492266178131104, + "learning_rate": 3.998075118384061e-05, + "loss": 1.3842, + "step": 224 + }, + { + "epoch": 0.34, + "grad_norm": 0.08940907567739487, + "learning_rate": 3.997918075300379e-05, + "loss": 1.2819, + "step": 225 + }, + { + "epoch": 0.34, + "grad_norm": 0.08896268904209137, + "learning_rate": 3.99775487802177e-05, + "loss": 1.3225, + "step": 226 + }, + { + "epoch": 0.34, + "grad_norm": 0.0978945717215538, + "learning_rate": 3.997585527050931e-05, + "loss": 1.2499, + "step": 227 + }, + { + "epoch": 0.34, + "grad_norm": 0.09165825694799423, + "learning_rate": 3.997410022909514e-05, + "loss": 1.3174, + "step": 228 + }, + { + "epoch": 0.35, + "grad_norm": 0.09468834102153778, + "learning_rate": 3.997228366138125e-05, + "loss": 1.2563, + "step": 229 + }, + { + "epoch": 0.35, + "grad_norm": 0.09366954863071442, + "learning_rate": 3.9970405572963226e-05, + "loss": 1.2454, + "step": 230 + }, + { + "epoch": 0.35, + "grad_norm": 0.11515671759843826, + "learning_rate": 3.9968465969626146e-05, + "loss": 1.2864, + "step": 231 + }, + { + "epoch": 0.35, + "grad_norm": 0.09367504715919495, + "learning_rate": 3.996646485734458e-05, + "loss": 1.3034, + "step": 232 + }, + { + "epoch": 0.35, + "grad_norm": 0.10032579302787781, + "learning_rate": 3.9964402242282565e-05, + "loss": 1.2857, + "step": 233 + }, + { + "epoch": 0.35, + "grad_norm": 0.0859631672501564, + "learning_rate": 3.996227813079357e-05, + "loss": 1.3411, + "step": 234 + }, + { + "epoch": 0.35, + "grad_norm": 0.09757768362760544, + "learning_rate": 3.9960092529420524e-05, + "loss": 1.3795, + "step": 235 + }, + { + "epoch": 0.36, + "grad_norm": 0.10491438955068588, + "learning_rate": 3.995784544489573e-05, + "loss": 1.3314, + "step": 236 + }, + { + "epoch": 0.36, + "grad_norm": 0.10739076882600784, + "learning_rate": 3.995553688414089e-05, + "loss": 1.292, + "step": 237 + }, + { + "epoch": 0.36, + "grad_norm": 0.09025067090988159, + "learning_rate": 3.995316685426708e-05, + "loss": 1.2785, + "step": 238 + }, + { + "epoch": 0.36, + "grad_norm": 0.10969384759664536, + "learning_rate": 3.9950735362574716e-05, + "loss": 1.2447, + "step": 239 + }, + { + "epoch": 0.36, + "grad_norm": 0.09104477614164352, + "learning_rate": 3.994824241655352e-05, + "loss": 1.2467, + "step": 240 + }, + { + "epoch": 0.36, + "grad_norm": 0.09550480544567108, + "learning_rate": 3.994568802388252e-05, + "loss": 1.3962, + "step": 241 + }, + { + "epoch": 0.36, + "grad_norm": 0.16599306464195251, + "learning_rate": 3.994307219243004e-05, + "loss": 1.3304, + "step": 242 + }, + { + "epoch": 0.37, + "grad_norm": 0.09215097874403, + "learning_rate": 3.9940394930253614e-05, + "loss": 1.2156, + "step": 243 + }, + { + "epoch": 0.37, + "grad_norm": 0.09193315356969833, + "learning_rate": 3.9937656245600044e-05, + "loss": 1.2319, + "step": 244 + }, + { + "epoch": 0.37, + "grad_norm": 0.11010710895061493, + "learning_rate": 3.9934856146905304e-05, + "loss": 1.3113, + "step": 245 + }, + { + "epoch": 0.37, + "grad_norm": 0.08709528297185898, + "learning_rate": 3.993199464279455e-05, + "loss": 1.2888, + "step": 246 + }, + { + "epoch": 0.37, + "grad_norm": 0.09270288795232773, + "learning_rate": 3.992907174208207e-05, + "loss": 1.3151, + "step": 247 + }, + { + "epoch": 0.37, + "grad_norm": 0.09684521704912186, + "learning_rate": 3.9926087453771306e-05, + "loss": 1.3228, + "step": 248 + }, + { + "epoch": 0.38, + "grad_norm": 0.08628430962562561, + "learning_rate": 3.992304178705477e-05, + "loss": 1.2889, + "step": 249 + }, + { + "epoch": 0.38, + "grad_norm": 0.09040474891662598, + "learning_rate": 3.9919934751314026e-05, + "loss": 1.3189, + "step": 250 + }, + { + "epoch": 0.38, + "grad_norm": 0.09273010492324829, + "learning_rate": 3.991676635611971e-05, + "loss": 1.342, + "step": 251 + }, + { + "epoch": 0.38, + "grad_norm": 0.08744922280311584, + "learning_rate": 3.9913536611231425e-05, + "loss": 1.2523, + "step": 252 + }, + { + "epoch": 0.38, + "grad_norm": 0.09474039077758789, + "learning_rate": 3.9910245526597774e-05, + "loss": 1.2678, + "step": 253 + }, + { + "epoch": 0.38, + "grad_norm": 0.10110338777303696, + "learning_rate": 3.99068931123563e-05, + "loss": 1.3925, + "step": 254 + }, + { + "epoch": 0.38, + "grad_norm": 0.09279424697160721, + "learning_rate": 3.990347937883346e-05, + "loss": 1.3169, + "step": 255 + }, + { + "epoch": 0.39, + "grad_norm": 0.0888659805059433, + "learning_rate": 3.9900004336544566e-05, + "loss": 1.3002, + "step": 256 + }, + { + "epoch": 0.39, + "grad_norm": 0.09020593017339706, + "learning_rate": 3.989646799619384e-05, + "loss": 1.3132, + "step": 257 + }, + { + "epoch": 0.39, + "grad_norm": 0.09785262495279312, + "learning_rate": 3.9892870368674265e-05, + "loss": 1.3859, + "step": 258 + }, + { + "epoch": 0.39, + "grad_norm": 0.09490260481834412, + "learning_rate": 3.988921146506764e-05, + "loss": 1.3504, + "step": 259 + }, + { + "epoch": 0.39, + "grad_norm": 0.09417515993118286, + "learning_rate": 3.988549129664448e-05, + "loss": 1.3105, + "step": 260 + }, + { + "epoch": 0.39, + "grad_norm": 0.10338198393583298, + "learning_rate": 3.988170987486405e-05, + "loss": 1.3519, + "step": 261 + }, + { + "epoch": 0.4, + "grad_norm": 0.09932684898376465, + "learning_rate": 3.987786721137428e-05, + "loss": 1.3255, + "step": 262 + }, + { + "epoch": 0.4, + "grad_norm": 0.27185529470443726, + "learning_rate": 3.9873963318011734e-05, + "loss": 1.3155, + "step": 263 + }, + { + "epoch": 0.4, + "grad_norm": 0.09261494874954224, + "learning_rate": 3.9869998206801594e-05, + "loss": 1.3543, + "step": 264 + }, + { + "epoch": 0.4, + "grad_norm": 0.17083612084388733, + "learning_rate": 3.9865971889957604e-05, + "loss": 1.2713, + "step": 265 + }, + { + "epoch": 0.4, + "grad_norm": 0.08977536112070084, + "learning_rate": 3.986188437988205e-05, + "loss": 1.2593, + "step": 266 + }, + { + "epoch": 0.4, + "grad_norm": 0.10692433267831802, + "learning_rate": 3.985773568916569e-05, + "loss": 1.2575, + "step": 267 + }, + { + "epoch": 0.4, + "grad_norm": 0.10271864384412766, + "learning_rate": 3.985352583058777e-05, + "loss": 1.2751, + "step": 268 + }, + { + "epoch": 0.41, + "grad_norm": 0.0912923738360405, + "learning_rate": 3.9849254817115925e-05, + "loss": 1.2933, + "step": 269 + }, + { + "epoch": 0.41, + "grad_norm": 0.08729909360408783, + "learning_rate": 3.984492266190618e-05, + "loss": 1.3279, + "step": 270 + }, + { + "epoch": 0.41, + "grad_norm": 0.10055013000965118, + "learning_rate": 3.984052937830289e-05, + "loss": 1.2595, + "step": 271 + }, + { + "epoch": 0.41, + "grad_norm": 0.09685542434453964, + "learning_rate": 3.98360749798387e-05, + "loss": 1.2156, + "step": 272 + }, + { + "epoch": 0.41, + "grad_norm": 0.08521684259176254, + "learning_rate": 3.9831559480234506e-05, + "loss": 1.124, + "step": 273 + }, + { + "epoch": 0.41, + "grad_norm": 0.09162552654743195, + "learning_rate": 3.982698289339943e-05, + "loss": 1.2539, + "step": 274 + }, + { + "epoch": 0.41, + "grad_norm": 0.11223973333835602, + "learning_rate": 3.982234523343074e-05, + "loss": 1.3281, + "step": 275 + }, + { + "epoch": 0.42, + "grad_norm": 0.09942933171987534, + "learning_rate": 3.981764651461385e-05, + "loss": 1.322, + "step": 276 + }, + { + "epoch": 0.42, + "grad_norm": 0.0901501402258873, + "learning_rate": 3.9812886751422234e-05, + "loss": 1.2477, + "step": 277 + }, + { + "epoch": 0.42, + "grad_norm": 0.09549134969711304, + "learning_rate": 3.980806595851742e-05, + "loss": 1.3206, + "step": 278 + }, + { + "epoch": 0.42, + "grad_norm": 0.11076848208904266, + "learning_rate": 3.9803184150748895e-05, + "loss": 1.334, + "step": 279 + }, + { + "epoch": 0.42, + "grad_norm": 0.08751917630434036, + "learning_rate": 3.979824134315413e-05, + "loss": 1.2837, + "step": 280 + }, + { + "epoch": 0.42, + "grad_norm": 0.12577761709690094, + "learning_rate": 3.979323755095846e-05, + "loss": 1.3161, + "step": 281 + }, + { + "epoch": 0.43, + "grad_norm": 0.09072336554527283, + "learning_rate": 3.97881727895751e-05, + "loss": 1.3205, + "step": 282 + }, + { + "epoch": 0.43, + "grad_norm": 0.31922072172164917, + "learning_rate": 3.9783047074605046e-05, + "loss": 1.2184, + "step": 283 + }, + { + "epoch": 0.43, + "grad_norm": 0.08622689545154572, + "learning_rate": 3.977786042183706e-05, + "loss": 1.3115, + "step": 284 + }, + { + "epoch": 0.43, + "grad_norm": 0.08856454491615295, + "learning_rate": 3.97726128472476e-05, + "loss": 1.3125, + "step": 285 + }, + { + "epoch": 0.43, + "grad_norm": 0.08550390601158142, + "learning_rate": 3.976730436700081e-05, + "loss": 1.2277, + "step": 286 + }, + { + "epoch": 0.43, + "grad_norm": 0.09026214480400085, + "learning_rate": 3.976193499744841e-05, + "loss": 1.2338, + "step": 287 + }, + { + "epoch": 0.43, + "grad_norm": 0.0971190556883812, + "learning_rate": 3.9756504755129685e-05, + "loss": 1.3346, + "step": 288 + }, + { + "epoch": 0.44, + "grad_norm": 0.09054573625326157, + "learning_rate": 3.9751013656771446e-05, + "loss": 1.3248, + "step": 289 + }, + { + "epoch": 0.44, + "grad_norm": 0.10101917386054993, + "learning_rate": 3.974546171928793e-05, + "loss": 1.3252, + "step": 290 + }, + { + "epoch": 0.44, + "grad_norm": 0.09141317754983902, + "learning_rate": 3.973984895978081e-05, + "loss": 1.2967, + "step": 291 + }, + { + "epoch": 0.44, + "grad_norm": 0.09146902710199356, + "learning_rate": 3.973417539553908e-05, + "loss": 1.2733, + "step": 292 + }, + { + "epoch": 0.44, + "grad_norm": 0.09041708707809448, + "learning_rate": 3.972844104403904e-05, + "loss": 1.3458, + "step": 293 + }, + { + "epoch": 0.44, + "grad_norm": 0.10464771836996078, + "learning_rate": 3.972264592294424e-05, + "loss": 1.3056, + "step": 294 + }, + { + "epoch": 0.44, + "grad_norm": 0.09591035544872284, + "learning_rate": 3.971679005010541e-05, + "loss": 1.3111, + "step": 295 + }, + { + "epoch": 0.45, + "grad_norm": 0.08957832306623459, + "learning_rate": 3.971087344356042e-05, + "loss": 1.3352, + "step": 296 + }, + { + "epoch": 0.45, + "grad_norm": 0.10896068811416626, + "learning_rate": 3.970489612153423e-05, + "loss": 1.441, + "step": 297 + }, + { + "epoch": 0.45, + "grad_norm": 0.1001083105802536, + "learning_rate": 3.969885810243879e-05, + "loss": 1.3175, + "step": 298 + }, + { + "epoch": 0.45, + "grad_norm": 0.08651696145534515, + "learning_rate": 3.9692759404873036e-05, + "loss": 1.2785, + "step": 299 + }, + { + "epoch": 0.45, + "grad_norm": 0.09035685658454895, + "learning_rate": 3.968660004762282e-05, + "loss": 1.3457, + "step": 300 + }, + { + "epoch": 0.45, + "grad_norm": 0.09436280280351639, + "learning_rate": 3.968038004966082e-05, + "loss": 1.2321, + "step": 301 + }, + { + "epoch": 0.46, + "grad_norm": 0.08704681694507599, + "learning_rate": 3.9674099430146543e-05, + "loss": 1.1919, + "step": 302 + }, + { + "epoch": 0.46, + "grad_norm": 0.1024637520313263, + "learning_rate": 3.9667758208426184e-05, + "loss": 1.2424, + "step": 303 + }, + { + "epoch": 0.46, + "grad_norm": 0.09690110385417938, + "learning_rate": 3.966135640403265e-05, + "loss": 1.3631, + "step": 304 + }, + { + "epoch": 0.46, + "grad_norm": 0.09614802151918411, + "learning_rate": 3.9654894036685426e-05, + "loss": 1.3976, + "step": 305 + }, + { + "epoch": 0.46, + "grad_norm": 0.10086134821176529, + "learning_rate": 3.9648371126290585e-05, + "loss": 1.4004, + "step": 306 + }, + { + "epoch": 0.46, + "grad_norm": 0.0938752219080925, + "learning_rate": 3.964178769294067e-05, + "loss": 1.2791, + "step": 307 + }, + { + "epoch": 0.46, + "grad_norm": 0.11438926309347153, + "learning_rate": 3.963514375691464e-05, + "loss": 1.1532, + "step": 308 + }, + { + "epoch": 0.47, + "grad_norm": 0.11092479526996613, + "learning_rate": 3.962843933867786e-05, + "loss": 1.3017, + "step": 309 + }, + { + "epoch": 0.47, + "grad_norm": 0.0874851793050766, + "learning_rate": 3.962167445888196e-05, + "loss": 1.2625, + "step": 310 + }, + { + "epoch": 0.47, + "grad_norm": 0.08893942087888718, + "learning_rate": 3.961484913836484e-05, + "loss": 1.2932, + "step": 311 + }, + { + "epoch": 0.47, + "grad_norm": 0.09928154945373535, + "learning_rate": 3.960796339815055e-05, + "loss": 1.3172, + "step": 312 + }, + { + "epoch": 0.47, + "grad_norm": 0.08946632593870163, + "learning_rate": 3.9601017259449264e-05, + "loss": 1.3556, + "step": 313 + }, + { + "epoch": 0.47, + "grad_norm": 0.1440565139055252, + "learning_rate": 3.9594010743657206e-05, + "loss": 1.2899, + "step": 314 + }, + { + "epoch": 0.48, + "grad_norm": 0.08712323755025864, + "learning_rate": 3.958694387235657e-05, + "loss": 1.295, + "step": 315 + }, + { + "epoch": 0.48, + "grad_norm": 0.09696578979492188, + "learning_rate": 3.957981666731547e-05, + "loss": 1.323, + "step": 316 + }, + { + "epoch": 0.48, + "grad_norm": 0.08793231844902039, + "learning_rate": 3.9572629150487865e-05, + "loss": 1.3316, + "step": 317 + }, + { + "epoch": 0.48, + "grad_norm": 0.0858168676495552, + "learning_rate": 3.956538134401349e-05, + "loss": 1.23, + "step": 318 + }, + { + "epoch": 0.48, + "grad_norm": 0.08655478060245514, + "learning_rate": 3.9558073270217784e-05, + "loss": 1.2627, + "step": 319 + }, + { + "epoch": 0.48, + "grad_norm": 0.08557727932929993, + "learning_rate": 3.955070495161185e-05, + "loss": 1.245, + "step": 320 + }, + { + "epoch": 0.48, + "grad_norm": 0.11078085005283356, + "learning_rate": 3.9543276410892334e-05, + "loss": 1.3109, + "step": 321 + }, + { + "epoch": 0.49, + "grad_norm": 0.08862492442131042, + "learning_rate": 3.953578767094142e-05, + "loss": 1.2609, + "step": 322 + }, + { + "epoch": 0.49, + "grad_norm": 0.0872182697057724, + "learning_rate": 3.952823875482668e-05, + "loss": 1.2967, + "step": 323 + }, + { + "epoch": 0.49, + "grad_norm": 0.0848020613193512, + "learning_rate": 3.9520629685801075e-05, + "loss": 1.2122, + "step": 324 + }, + { + "epoch": 0.49, + "grad_norm": 0.08789847046136856, + "learning_rate": 3.951296048730286e-05, + "loss": 1.2725, + "step": 325 + }, + { + "epoch": 0.49, + "grad_norm": 0.12200132757425308, + "learning_rate": 3.95052311829555e-05, + "loss": 1.2661, + "step": 326 + }, + { + "epoch": 0.49, + "grad_norm": 0.09069744497537613, + "learning_rate": 3.949744179656759e-05, + "loss": 1.2852, + "step": 327 + }, + { + "epoch": 0.49, + "grad_norm": 0.09917408972978592, + "learning_rate": 3.9489592352132806e-05, + "loss": 1.2502, + "step": 328 + }, + { + "epoch": 0.5, + "grad_norm": 0.08770907670259476, + "learning_rate": 3.948168287382983e-05, + "loss": 1.2982, + "step": 329 + }, + { + "epoch": 0.5, + "grad_norm": 0.09041351079940796, + "learning_rate": 3.947371338602227e-05, + "loss": 1.3082, + "step": 330 + }, + { + "epoch": 0.5, + "grad_norm": 0.09154205024242401, + "learning_rate": 3.946568391325855e-05, + "loss": 1.2446, + "step": 331 + }, + { + "epoch": 0.5, + "grad_norm": 0.09865383803844452, + "learning_rate": 3.94575944802719e-05, + "loss": 1.1797, + "step": 332 + }, + { + "epoch": 0.5, + "grad_norm": 0.1080142930150032, + "learning_rate": 3.9449445111980214e-05, + "loss": 1.2742, + "step": 333 + }, + { + "epoch": 0.5, + "grad_norm": 0.09101110696792603, + "learning_rate": 3.9441235833486045e-05, + "loss": 1.3099, + "step": 334 + }, + { + "epoch": 0.51, + "grad_norm": 0.09537916630506516, + "learning_rate": 3.943296667007646e-05, + "loss": 1.383, + "step": 335 + }, + { + "epoch": 0.51, + "grad_norm": 0.09077882021665573, + "learning_rate": 3.9424637647222994e-05, + "loss": 1.2639, + "step": 336 + }, + { + "epoch": 0.51, + "grad_norm": 0.08973152190446854, + "learning_rate": 3.941624879058157e-05, + "loss": 1.3743, + "step": 337 + }, + { + "epoch": 0.51, + "grad_norm": 0.08761763572692871, + "learning_rate": 3.940780012599241e-05, + "loss": 1.2851, + "step": 338 + }, + { + "epoch": 0.51, + "grad_norm": 0.09210208803415298, + "learning_rate": 3.9399291679479974e-05, + "loss": 1.2034, + "step": 339 + }, + { + "epoch": 0.51, + "grad_norm": 0.08868398517370224, + "learning_rate": 3.9390723477252866e-05, + "loss": 1.258, + "step": 340 + }, + { + "epoch": 0.51, + "grad_norm": 0.2671092748641968, + "learning_rate": 3.9382095545703744e-05, + "loss": 1.2147, + "step": 341 + }, + { + "epoch": 0.52, + "grad_norm": 0.09890039265155792, + "learning_rate": 3.937340791140927e-05, + "loss": 1.3691, + "step": 342 + }, + { + "epoch": 0.52, + "grad_norm": 0.0892067700624466, + "learning_rate": 3.9364660601129996e-05, + "loss": 1.3063, + "step": 343 + }, + { + "epoch": 0.52, + "grad_norm": 0.0856950506567955, + "learning_rate": 3.9355853641810286e-05, + "loss": 1.2963, + "step": 344 + }, + { + "epoch": 0.52, + "grad_norm": 0.09282035380601883, + "learning_rate": 3.934698706057827e-05, + "loss": 1.2772, + "step": 345 + }, + { + "epoch": 0.52, + "grad_norm": 0.09212394803762436, + "learning_rate": 3.933806088474569e-05, + "loss": 1.3132, + "step": 346 + }, + { + "epoch": 0.52, + "grad_norm": 0.09096615016460419, + "learning_rate": 3.9329075141807906e-05, + "loss": 1.2305, + "step": 347 + }, + { + "epoch": 0.52, + "grad_norm": 0.08938733488321304, + "learning_rate": 3.932002985944372e-05, + "loss": 1.2999, + "step": 348 + }, + { + "epoch": 0.53, + "grad_norm": 0.0863930881023407, + "learning_rate": 3.9310925065515354e-05, + "loss": 1.3369, + "step": 349 + }, + { + "epoch": 0.53, + "grad_norm": 0.09227600693702698, + "learning_rate": 3.930176078806835e-05, + "loss": 1.2709, + "step": 350 + }, + { + "epoch": 0.53, + "grad_norm": 0.10280267149209976, + "learning_rate": 3.929253705533144e-05, + "loss": 1.3535, + "step": 351 + }, + { + "epoch": 0.53, + "grad_norm": 0.09201916307210922, + "learning_rate": 3.928325389571656e-05, + "loss": 1.3371, + "step": 352 + }, + { + "epoch": 0.53, + "grad_norm": 0.16000549495220184, + "learning_rate": 3.9273911337818654e-05, + "loss": 1.3661, + "step": 353 + }, + { + "epoch": 0.53, + "grad_norm": 0.09236800670623779, + "learning_rate": 3.9264509410415626e-05, + "loss": 1.3085, + "step": 354 + }, + { + "epoch": 0.54, + "grad_norm": 0.10606455057859421, + "learning_rate": 3.925504814246828e-05, + "loss": 1.2602, + "step": 355 + }, + { + "epoch": 0.54, + "grad_norm": 0.10082320868968964, + "learning_rate": 3.924552756312019e-05, + "loss": 1.3324, + "step": 356 + }, + { + "epoch": 0.54, + "grad_norm": 0.09485574811697006, + "learning_rate": 3.923594770169764e-05, + "loss": 1.2639, + "step": 357 + }, + { + "epoch": 0.54, + "grad_norm": 0.09355232864618301, + "learning_rate": 3.922630858770952e-05, + "loss": 1.2088, + "step": 358 + }, + { + "epoch": 0.54, + "grad_norm": 0.09845402091741562, + "learning_rate": 3.9216610250847216e-05, + "loss": 1.3115, + "step": 359 + }, + { + "epoch": 0.54, + "grad_norm": 0.09732622653245926, + "learning_rate": 3.9206852720984566e-05, + "loss": 1.3648, + "step": 360 + }, + { + "epoch": 0.54, + "grad_norm": 0.09608597308397293, + "learning_rate": 3.919703602817772e-05, + "loss": 1.3328, + "step": 361 + }, + { + "epoch": 0.55, + "grad_norm": 0.09924275428056717, + "learning_rate": 3.918716020266509e-05, + "loss": 1.2821, + "step": 362 + }, + { + "epoch": 0.55, + "grad_norm": 0.11267717182636261, + "learning_rate": 3.9177225274867196e-05, + "loss": 1.2426, + "step": 363 + }, + { + "epoch": 0.55, + "grad_norm": 0.09425169974565506, + "learning_rate": 3.916723127538666e-05, + "loss": 1.3103, + "step": 364 + }, + { + "epoch": 0.55, + "grad_norm": 0.08777039498090744, + "learning_rate": 3.915717823500802e-05, + "loss": 1.3051, + "step": 365 + }, + { + "epoch": 0.55, + "grad_norm": 0.10187118500471115, + "learning_rate": 3.9147066184697706e-05, + "loss": 1.3257, + "step": 366 + }, + { + "epoch": 0.55, + "grad_norm": 0.09155014902353287, + "learning_rate": 3.9136895155603904e-05, + "loss": 1.268, + "step": 367 + }, + { + "epoch": 0.55, + "grad_norm": 0.09189581871032715, + "learning_rate": 3.9126665179056474e-05, + "loss": 1.2971, + "step": 368 + }, + { + "epoch": 0.56, + "grad_norm": 0.08968351781368256, + "learning_rate": 3.911637628656685e-05, + "loss": 1.2954, + "step": 369 + }, + { + "epoch": 0.56, + "grad_norm": 0.0899188220500946, + "learning_rate": 3.9106028509827957e-05, + "loss": 1.2451, + "step": 370 + }, + { + "epoch": 0.56, + "grad_norm": 0.08969484269618988, + "learning_rate": 3.909562188071408e-05, + "loss": 1.2686, + "step": 371 + }, + { + "epoch": 0.56, + "grad_norm": 0.09562985599040985, + "learning_rate": 3.9085156431280814e-05, + "loss": 1.3731, + "step": 372 + }, + { + "epoch": 0.56, + "grad_norm": 0.09706033766269684, + "learning_rate": 3.907463219376491e-05, + "loss": 1.278, + "step": 373 + }, + { + "epoch": 0.56, + "grad_norm": 0.10006203502416611, + "learning_rate": 3.906404920058423e-05, + "loss": 1.3011, + "step": 374 + }, + { + "epoch": 0.57, + "grad_norm": 0.09256932884454727, + "learning_rate": 3.905340748433761e-05, + "loss": 1.2794, + "step": 375 + }, + { + "epoch": 0.57, + "grad_norm": 0.09249746799468994, + "learning_rate": 3.904270707780475e-05, + "loss": 1.2208, + "step": 376 + }, + { + "epoch": 0.57, + "grad_norm": 0.09291823208332062, + "learning_rate": 3.903194801394618e-05, + "loss": 1.3053, + "step": 377 + }, + { + "epoch": 0.57, + "grad_norm": 0.09092970192432404, + "learning_rate": 3.9021130325903076e-05, + "loss": 1.2138, + "step": 378 + }, + { + "epoch": 0.57, + "grad_norm": 0.128141388297081, + "learning_rate": 3.9010254046997205e-05, + "loss": 1.236, + "step": 379 + }, + { + "epoch": 0.57, + "grad_norm": 0.09919355809688568, + "learning_rate": 3.899931921073081e-05, + "loss": 1.3844, + "step": 380 + }, + { + "epoch": 0.57, + "grad_norm": 0.09970150887966156, + "learning_rate": 3.898832585078652e-05, + "loss": 1.2792, + "step": 381 + }, + { + "epoch": 0.58, + "grad_norm": 0.14203396439552307, + "learning_rate": 3.8977274001027206e-05, + "loss": 1.2745, + "step": 382 + }, + { + "epoch": 0.58, + "grad_norm": 0.09901085495948792, + "learning_rate": 3.8966163695495946e-05, + "loss": 1.3443, + "step": 383 + }, + { + "epoch": 0.58, + "grad_norm": 0.09638842940330505, + "learning_rate": 3.8954994968415844e-05, + "loss": 1.313, + "step": 384 + }, + { + "epoch": 0.58, + "grad_norm": 0.0943804606795311, + "learning_rate": 3.8943767854189984e-05, + "loss": 1.2727, + "step": 385 + }, + { + "epoch": 0.58, + "grad_norm": 0.09030349552631378, + "learning_rate": 3.8932482387401284e-05, + "loss": 1.283, + "step": 386 + }, + { + "epoch": 0.58, + "grad_norm": 0.09015900641679764, + "learning_rate": 3.89211386028124e-05, + "loss": 1.1901, + "step": 387 + }, + { + "epoch": 0.59, + "grad_norm": 0.09004110842943192, + "learning_rate": 3.8909736535365666e-05, + "loss": 1.249, + "step": 388 + }, + { + "epoch": 0.59, + "grad_norm": 0.09241710603237152, + "learning_rate": 3.889827622018289e-05, + "loss": 1.3375, + "step": 389 + }, + { + "epoch": 0.59, + "grad_norm": 0.09758353978395462, + "learning_rate": 3.888675769256533e-05, + "loss": 1.2276, + "step": 390 + }, + { + "epoch": 0.59, + "grad_norm": 0.09788186848163605, + "learning_rate": 3.8875180987993564e-05, + "loss": 1.3301, + "step": 391 + }, + { + "epoch": 0.59, + "grad_norm": 0.09194394201040268, + "learning_rate": 3.886354614212735e-05, + "loss": 1.3375, + "step": 392 + }, + { + "epoch": 0.59, + "grad_norm": 0.09099754691123962, + "learning_rate": 3.885185319080555e-05, + "loss": 1.3426, + "step": 393 + }, + { + "epoch": 0.59, + "grad_norm": 0.11997543275356293, + "learning_rate": 3.884010217004601e-05, + "loss": 1.3569, + "step": 394 + }, + { + "epoch": 0.6, + "grad_norm": 0.08870114386081696, + "learning_rate": 3.8828293116045455e-05, + "loss": 1.2033, + "step": 395 + }, + { + "epoch": 0.6, + "grad_norm": 0.09448851644992828, + "learning_rate": 3.881642606517934e-05, + "loss": 1.269, + "step": 396 + }, + { + "epoch": 0.6, + "grad_norm": 0.09142625331878662, + "learning_rate": 3.880450105400181e-05, + "loss": 1.1921, + "step": 397 + }, + { + "epoch": 0.6, + "grad_norm": 0.09058167785406113, + "learning_rate": 3.879251811924551e-05, + "loss": 1.2658, + "step": 398 + }, + { + "epoch": 0.6, + "grad_norm": 0.09190699458122253, + "learning_rate": 3.878047729782153e-05, + "loss": 1.3166, + "step": 399 + }, + { + "epoch": 0.6, + "grad_norm": 0.09996069967746735, + "learning_rate": 3.8768378626819254e-05, + "loss": 1.2286, + "step": 400 + }, + { + "epoch": 0.6, + "grad_norm": 0.0970202311873436, + "learning_rate": 3.875622214350627e-05, + "loss": 1.1907, + "step": 401 + }, + { + "epoch": 0.61, + "grad_norm": 0.09404029697179794, + "learning_rate": 3.874400788532823e-05, + "loss": 1.2374, + "step": 402 + }, + { + "epoch": 0.61, + "grad_norm": 0.09835929423570633, + "learning_rate": 3.8731735889908775e-05, + "loss": 1.1777, + "step": 403 + }, + { + "epoch": 0.61, + "grad_norm": 0.09699105471372604, + "learning_rate": 3.871940619504938e-05, + "loss": 1.2535, + "step": 404 + }, + { + "epoch": 0.61, + "grad_norm": 0.09607970714569092, + "learning_rate": 3.870701883872924e-05, + "loss": 1.1928, + "step": 405 + }, + { + "epoch": 0.61, + "grad_norm": 0.0941271260380745, + "learning_rate": 3.869457385910519e-05, + "loss": 1.3559, + "step": 406 + }, + { + "epoch": 0.61, + "grad_norm": 0.10668264329433441, + "learning_rate": 3.868207129451155e-05, + "loss": 1.2654, + "step": 407 + }, + { + "epoch": 0.62, + "grad_norm": 0.0934709683060646, + "learning_rate": 3.8669511183460014e-05, + "loss": 1.3092, + "step": 408 + }, + { + "epoch": 0.62, + "grad_norm": 0.18777990341186523, + "learning_rate": 3.865689356463954e-05, + "loss": 1.361, + "step": 409 + }, + { + "epoch": 0.62, + "grad_norm": 0.09203624725341797, + "learning_rate": 3.864421847691624e-05, + "loss": 1.2223, + "step": 410 + }, + { + "epoch": 0.62, + "grad_norm": 0.09202445298433304, + "learning_rate": 3.863148595933322e-05, + "loss": 1.2545, + "step": 411 + }, + { + "epoch": 0.62, + "grad_norm": 0.09081540256738663, + "learning_rate": 3.861869605111053e-05, + "loss": 1.3743, + "step": 412 + }, + { + "epoch": 0.62, + "grad_norm": 0.09260386228561401, + "learning_rate": 3.8605848791644935e-05, + "loss": 1.211, + "step": 413 + }, + { + "epoch": 0.62, + "grad_norm": 0.08667195588350296, + "learning_rate": 3.859294422050994e-05, + "loss": 1.2772, + "step": 414 + }, + { + "epoch": 0.63, + "grad_norm": 0.10777632892131805, + "learning_rate": 3.857998237745552e-05, + "loss": 1.3319, + "step": 415 + }, + { + "epoch": 0.63, + "grad_norm": 0.08903021365404129, + "learning_rate": 3.856696330240809e-05, + "loss": 1.2044, + "step": 416 + }, + { + "epoch": 0.63, + "grad_norm": 0.08992208540439606, + "learning_rate": 3.855388703547038e-05, + "loss": 1.1144, + "step": 417 + }, + { + "epoch": 0.63, + "grad_norm": 0.09120858460664749, + "learning_rate": 3.8540753616921255e-05, + "loss": 1.1396, + "step": 418 + }, + { + "epoch": 0.63, + "grad_norm": 0.08874053508043289, + "learning_rate": 3.8527563087215634e-05, + "loss": 1.1963, + "step": 419 + }, + { + "epoch": 0.63, + "grad_norm": 0.08650252968072891, + "learning_rate": 3.8514315486984353e-05, + "loss": 1.1986, + "step": 420 + }, + { + "epoch": 0.63, + "grad_norm": 0.1317654252052307, + "learning_rate": 3.850101085703406e-05, + "loss": 1.2851, + "step": 421 + }, + { + "epoch": 0.64, + "grad_norm": 0.12072870880365372, + "learning_rate": 3.848764923834704e-05, + "loss": 1.2521, + "step": 422 + }, + { + "epoch": 0.64, + "grad_norm": 0.09248242527246475, + "learning_rate": 3.8474230672081166e-05, + "loss": 1.2991, + "step": 423 + }, + { + "epoch": 0.64, + "grad_norm": 0.09300348907709122, + "learning_rate": 3.846075519956968e-05, + "loss": 1.3268, + "step": 424 + }, + { + "epoch": 0.64, + "grad_norm": 0.0924823060631752, + "learning_rate": 3.844722286232114e-05, + "loss": 1.2201, + "step": 425 + }, + { + "epoch": 0.64, + "grad_norm": 0.087965227663517, + "learning_rate": 3.843363370201926e-05, + "loss": 1.1802, + "step": 426 + }, + { + "epoch": 0.64, + "grad_norm": 0.09820380061864853, + "learning_rate": 3.841998776052278e-05, + "loss": 1.2884, + "step": 427 + }, + { + "epoch": 0.65, + "grad_norm": 0.09926720708608627, + "learning_rate": 3.8406285079865345e-05, + "loss": 1.3814, + "step": 428 + }, + { + "epoch": 0.65, + "grad_norm": 0.09610018879175186, + "learning_rate": 3.839252570225539e-05, + "loss": 1.3105, + "step": 429 + }, + { + "epoch": 0.65, + "grad_norm": 0.09590383619070053, + "learning_rate": 3.8378709670075964e-05, + "loss": 1.2749, + "step": 430 + }, + { + "epoch": 0.65, + "grad_norm": 0.09373871237039566, + "learning_rate": 3.836483702588465e-05, + "loss": 1.3538, + "step": 431 + }, + { + "epoch": 0.65, + "grad_norm": 0.09673967957496643, + "learning_rate": 3.8350907812413415e-05, + "loss": 1.3166, + "step": 432 + }, + { + "epoch": 0.65, + "grad_norm": 0.09773720055818558, + "learning_rate": 3.8336922072568466e-05, + "loss": 1.2087, + "step": 433 + }, + { + "epoch": 0.65, + "grad_norm": 0.097389817237854, + "learning_rate": 3.8322879849430126e-05, + "loss": 1.3319, + "step": 434 + }, + { + "epoch": 0.66, + "grad_norm": 0.09051577001810074, + "learning_rate": 3.830878118625272e-05, + "loss": 1.2758, + "step": 435 + }, + { + "epoch": 0.66, + "grad_norm": 0.09671849012374878, + "learning_rate": 3.8294626126464406e-05, + "loss": 1.2432, + "step": 436 + }, + { + "epoch": 0.66, + "grad_norm": 0.09156167507171631, + "learning_rate": 3.8280414713667086e-05, + "loss": 1.2259, + "step": 437 + }, + { + "epoch": 0.66, + "grad_norm": 0.09500493854284286, + "learning_rate": 3.8266146991636225e-05, + "loss": 1.2644, + "step": 438 + }, + { + "epoch": 0.66, + "grad_norm": 0.09015759080648422, + "learning_rate": 3.825182300432073e-05, + "loss": 1.2385, + "step": 439 + }, + { + "epoch": 0.66, + "grad_norm": 0.09304061532020569, + "learning_rate": 3.823744279584285e-05, + "loss": 1.3014, + "step": 440 + }, + { + "epoch": 0.67, + "grad_norm": 0.09482213109731674, + "learning_rate": 3.8223006410498e-05, + "loss": 1.2916, + "step": 441 + }, + { + "epoch": 0.67, + "grad_norm": 0.09148463606834412, + "learning_rate": 3.8208513892754614e-05, + "loss": 1.262, + "step": 442 + }, + { + "epoch": 0.67, + "grad_norm": 0.10267407447099686, + "learning_rate": 3.819396528725408e-05, + "loss": 1.3148, + "step": 443 + }, + { + "epoch": 0.67, + "grad_norm": 0.0900326743721962, + "learning_rate": 3.8179360638810503e-05, + "loss": 1.2096, + "step": 444 + }, + { + "epoch": 0.67, + "grad_norm": 0.09562507271766663, + "learning_rate": 3.816469999241065e-05, + "loss": 1.3104, + "step": 445 + }, + { + "epoch": 0.67, + "grad_norm": 0.08906915038824081, + "learning_rate": 3.814998339321376e-05, + "loss": 1.1844, + "step": 446 + }, + { + "epoch": 0.67, + "grad_norm": 0.09294212609529495, + "learning_rate": 3.813521088655144e-05, + "loss": 1.2458, + "step": 447 + }, + { + "epoch": 0.68, + "grad_norm": 0.09855691343545914, + "learning_rate": 3.81203825179275e-05, + "loss": 1.2412, + "step": 448 + }, + { + "epoch": 0.68, + "grad_norm": 0.09475833177566528, + "learning_rate": 3.810549833301782e-05, + "loss": 1.2138, + "step": 449 + }, + { + "epoch": 0.68, + "grad_norm": 0.09860041737556458, + "learning_rate": 3.80905583776702e-05, + "loss": 1.2639, + "step": 450 + }, + { + "epoch": 0.68, + "grad_norm": 0.16529129445552826, + "learning_rate": 3.8075562697904274e-05, + "loss": 1.3165, + "step": 451 + }, + { + "epoch": 0.68, + "grad_norm": 0.09372055530548096, + "learning_rate": 3.806051133991127e-05, + "loss": 1.3213, + "step": 452 + }, + { + "epoch": 0.68, + "grad_norm": 0.09442650526762009, + "learning_rate": 3.804540435005395e-05, + "loss": 1.2739, + "step": 453 + }, + { + "epoch": 0.68, + "grad_norm": 0.09894248843193054, + "learning_rate": 3.803024177486643e-05, + "loss": 1.2779, + "step": 454 + }, + { + "epoch": 0.69, + "grad_norm": 0.0943969115614891, + "learning_rate": 3.801502366105406e-05, + "loss": 1.1779, + "step": 455 + }, + { + "epoch": 0.69, + "grad_norm": 0.09841915965080261, + "learning_rate": 3.799975005549325e-05, + "loss": 1.3049, + "step": 456 + }, + { + "epoch": 0.69, + "grad_norm": 0.09713226556777954, + "learning_rate": 3.7984421005231356e-05, + "loss": 1.2766, + "step": 457 + }, + { + "epoch": 0.69, + "grad_norm": 0.10096335411071777, + "learning_rate": 3.7969036557486505e-05, + "loss": 1.2518, + "step": 458 + }, + { + "epoch": 0.69, + "grad_norm": 0.09753825515508652, + "learning_rate": 3.795359675964746e-05, + "loss": 1.3055, + "step": 459 + }, + { + "epoch": 0.69, + "grad_norm": 0.1474526971578598, + "learning_rate": 3.793810165927352e-05, + "loss": 1.3029, + "step": 460 + }, + { + "epoch": 0.7, + "grad_norm": 0.1883493959903717, + "learning_rate": 3.7922551304094275e-05, + "loss": 1.3089, + "step": 461 + }, + { + "epoch": 0.7, + "grad_norm": 0.11237627267837524, + "learning_rate": 3.790694574200957e-05, + "loss": 1.3217, + "step": 462 + }, + { + "epoch": 0.7, + "grad_norm": 0.10173798352479935, + "learning_rate": 3.789128502108925e-05, + "loss": 1.3224, + "step": 463 + }, + { + "epoch": 0.7, + "grad_norm": 0.09086309373378754, + "learning_rate": 3.7875569189573125e-05, + "loss": 1.2644, + "step": 464 + }, + { + "epoch": 0.7, + "grad_norm": 0.09624147415161133, + "learning_rate": 3.785979829587072e-05, + "loss": 1.2384, + "step": 465 + }, + { + "epoch": 0.7, + "grad_norm": 0.09342305362224579, + "learning_rate": 3.7843972388561174e-05, + "loss": 1.3013, + "step": 466 + }, + { + "epoch": 0.7, + "grad_norm": 0.10934414714574814, + "learning_rate": 3.782809151639311e-05, + "loss": 1.2427, + "step": 467 + }, + { + "epoch": 0.71, + "grad_norm": 0.09088652580976486, + "learning_rate": 3.781215572828442e-05, + "loss": 1.327, + "step": 468 + }, + { + "epoch": 0.71, + "grad_norm": 0.10025391727685928, + "learning_rate": 3.7796165073322194e-05, + "loss": 1.2259, + "step": 469 + }, + { + "epoch": 0.71, + "grad_norm": 0.09943482279777527, + "learning_rate": 3.7780119600762495e-05, + "loss": 1.3134, + "step": 470 + }, + { + "epoch": 0.71, + "grad_norm": 0.09569672495126724, + "learning_rate": 3.7764019360030255e-05, + "loss": 1.3153, + "step": 471 + }, + { + "epoch": 0.71, + "grad_norm": 0.10055287182331085, + "learning_rate": 3.774786440071913e-05, + "loss": 1.3079, + "step": 472 + }, + { + "epoch": 0.71, + "grad_norm": 0.19955043494701385, + "learning_rate": 3.7731654772591285e-05, + "loss": 1.3384, + "step": 473 + }, + { + "epoch": 0.71, + "grad_norm": 0.0957150086760521, + "learning_rate": 3.77153905255773e-05, + "loss": 1.2701, + "step": 474 + }, + { + "epoch": 0.72, + "grad_norm": 0.09553897380828857, + "learning_rate": 3.769907170977601e-05, + "loss": 1.2705, + "step": 475 + }, + { + "epoch": 0.72, + "grad_norm": 0.10279964655637741, + "learning_rate": 3.7682698375454325e-05, + "loss": 1.3458, + "step": 476 + }, + { + "epoch": 0.72, + "grad_norm": 0.09367337822914124, + "learning_rate": 3.766627057304708e-05, + "loss": 1.2557, + "step": 477 + }, + { + "epoch": 0.72, + "grad_norm": 0.09905607253313065, + "learning_rate": 3.764978835315692e-05, + "loss": 1.2258, + "step": 478 + }, + { + "epoch": 0.72, + "grad_norm": 0.09889770299196243, + "learning_rate": 3.763325176655409e-05, + "loss": 1.3232, + "step": 479 + }, + { + "epoch": 0.72, + "grad_norm": 0.09260537475347519, + "learning_rate": 3.7616660864176277e-05, + "loss": 1.2362, + "step": 480 + }, + { + "epoch": 0.73, + "grad_norm": 0.1085742637515068, + "learning_rate": 3.760001569712853e-05, + "loss": 1.2701, + "step": 481 + }, + { + "epoch": 0.73, + "grad_norm": 0.0957634299993515, + "learning_rate": 3.7583316316683014e-05, + "loss": 1.2644, + "step": 482 + }, + { + "epoch": 0.73, + "grad_norm": 0.09362093359231949, + "learning_rate": 3.7566562774278885e-05, + "loss": 1.3075, + "step": 483 + }, + { + "epoch": 0.73, + "grad_norm": 0.09374577552080154, + "learning_rate": 3.7549755121522165e-05, + "loss": 1.2638, + "step": 484 + }, + { + "epoch": 0.73, + "grad_norm": 0.10661576688289642, + "learning_rate": 3.753289341018552e-05, + "loss": 1.3212, + "step": 485 + }, + { + "epoch": 0.73, + "grad_norm": 0.23099136352539062, + "learning_rate": 3.7515977692208154e-05, + "loss": 1.2561, + "step": 486 + }, + { + "epoch": 0.73, + "grad_norm": 0.09974721819162369, + "learning_rate": 3.749900801969561e-05, + "loss": 1.2522, + "step": 487 + }, + { + "epoch": 0.74, + "grad_norm": 0.10135775059461594, + "learning_rate": 3.748198444491965e-05, + "loss": 1.2344, + "step": 488 + }, + { + "epoch": 0.74, + "grad_norm": 0.09234325587749481, + "learning_rate": 3.746490702031805e-05, + "loss": 1.2536, + "step": 489 + }, + { + "epoch": 0.74, + "grad_norm": 0.09407179802656174, + "learning_rate": 3.744777579849447e-05, + "loss": 1.2624, + "step": 490 + }, + { + "epoch": 0.74, + "grad_norm": 0.10846799612045288, + "learning_rate": 3.743059083221828e-05, + "loss": 1.2913, + "step": 491 + }, + { + "epoch": 0.74, + "grad_norm": 0.095631904900074, + "learning_rate": 3.74133521744244e-05, + "loss": 1.2566, + "step": 492 + }, + { + "epoch": 0.74, + "grad_norm": 0.09939732402563095, + "learning_rate": 3.739605987821313e-05, + "loss": 1.3523, + "step": 493 + }, + { + "epoch": 0.74, + "grad_norm": 0.09304359555244446, + "learning_rate": 3.737871399685001e-05, + "loss": 1.1763, + "step": 494 + }, + { + "epoch": 0.75, + "grad_norm": 0.09246867895126343, + "learning_rate": 3.7361314583765616e-05, + "loss": 1.2075, + "step": 495 + }, + { + "epoch": 0.75, + "grad_norm": 0.10237107425928116, + "learning_rate": 3.7343861692555435e-05, + "loss": 1.2416, + "step": 496 + }, + { + "epoch": 0.75, + "grad_norm": 0.09497663378715515, + "learning_rate": 3.7326355376979676e-05, + "loss": 1.3629, + "step": 497 + }, + { + "epoch": 0.75, + "grad_norm": 0.09394579380750656, + "learning_rate": 3.7308795690963106e-05, + "loss": 1.1942, + "step": 498 + }, + { + "epoch": 0.75, + "grad_norm": 0.09781768172979355, + "learning_rate": 3.72911826885949e-05, + "loss": 1.3069, + "step": 499 + }, + { + "epoch": 0.75, + "grad_norm": 0.39589712023735046, + "learning_rate": 3.7273516424128464e-05, + "loss": 1.2557, + "step": 500 + }, + { + "epoch": 0.76, + "grad_norm": 0.09347429126501083, + "learning_rate": 3.725579695198126e-05, + "loss": 1.2297, + "step": 501 + }, + { + "epoch": 0.76, + "grad_norm": 0.09679634869098663, + "learning_rate": 3.7238024326734635e-05, + "loss": 1.2854, + "step": 502 + }, + { + "epoch": 0.76, + "grad_norm": 0.10017355531454086, + "learning_rate": 3.722019860313369e-05, + "loss": 1.2725, + "step": 503 + }, + { + "epoch": 0.76, + "grad_norm": 0.11011228710412979, + "learning_rate": 3.7202319836087066e-05, + "loss": 1.3004, + "step": 504 + }, + { + "epoch": 0.76, + "grad_norm": 0.10220441222190857, + "learning_rate": 3.718438808066679e-05, + "loss": 1.2979, + "step": 505 + }, + { + "epoch": 0.76, + "grad_norm": 0.09771667420864105, + "learning_rate": 3.716640339210815e-05, + "loss": 1.2511, + "step": 506 + }, + { + "epoch": 0.76, + "grad_norm": 0.14080555737018585, + "learning_rate": 3.714836582580943e-05, + "loss": 1.2587, + "step": 507 + }, + { + "epoch": 0.77, + "grad_norm": 0.09860668331384659, + "learning_rate": 3.713027543733181e-05, + "loss": 1.2013, + "step": 508 + }, + { + "epoch": 0.77, + "grad_norm": 0.10123850405216217, + "learning_rate": 3.71121322823992e-05, + "loss": 1.2942, + "step": 509 + }, + { + "epoch": 0.77, + "grad_norm": 0.09241792559623718, + "learning_rate": 3.709393641689803e-05, + "loss": 1.2403, + "step": 510 + }, + { + "epoch": 0.77, + "grad_norm": 0.09487643837928772, + "learning_rate": 3.707568789687709e-05, + "loss": 1.287, + "step": 511 + }, + { + "epoch": 0.77, + "grad_norm": 0.10037890076637268, + "learning_rate": 3.7057386778547376e-05, + "loss": 1.2634, + "step": 512 + }, + { + "epoch": 0.77, + "grad_norm": 0.11450394988059998, + "learning_rate": 3.703903311828188e-05, + "loss": 1.2197, + "step": 513 + }, + { + "epoch": 0.78, + "grad_norm": 0.3487299382686615, + "learning_rate": 3.702062697261547e-05, + "loss": 1.2756, + "step": 514 + }, + { + "epoch": 0.78, + "grad_norm": 0.10744113475084305, + "learning_rate": 3.7002168398244665e-05, + "loss": 1.2993, + "step": 515 + }, + { + "epoch": 0.78, + "grad_norm": 0.09575808048248291, + "learning_rate": 3.698365745202748e-05, + "loss": 1.2115, + "step": 516 + }, + { + "epoch": 0.78, + "grad_norm": 0.0996333658695221, + "learning_rate": 3.6965094190983256e-05, + "loss": 1.2943, + "step": 517 + }, + { + "epoch": 0.78, + "grad_norm": 0.09348877519369125, + "learning_rate": 3.6946478672292486e-05, + "loss": 1.2575, + "step": 518 + }, + { + "epoch": 0.78, + "grad_norm": 0.09612870961427689, + "learning_rate": 3.692781095329662e-05, + "loss": 1.3455, + "step": 519 + }, + { + "epoch": 0.78, + "grad_norm": 0.09679670631885529, + "learning_rate": 3.6909091091497906e-05, + "loss": 1.1723, + "step": 520 + }, + { + "epoch": 0.79, + "grad_norm": 0.1148669496178627, + "learning_rate": 3.689031914455921e-05, + "loss": 1.2642, + "step": 521 + }, + { + "epoch": 0.79, + "grad_norm": 0.0929841473698616, + "learning_rate": 3.687149517030384e-05, + "loss": 1.2956, + "step": 522 + }, + { + "epoch": 0.79, + "grad_norm": 0.11430943012237549, + "learning_rate": 3.685261922671535e-05, + "loss": 1.3066, + "step": 523 + }, + { + "epoch": 0.79, + "grad_norm": 0.10125204175710678, + "learning_rate": 3.683369137193738e-05, + "loss": 1.2799, + "step": 524 + }, + { + "epoch": 0.79, + "grad_norm": 0.10533029586076736, + "learning_rate": 3.681471166427349e-05, + "loss": 1.2379, + "step": 525 + }, + { + "epoch": 0.79, + "grad_norm": 0.09584540873765945, + "learning_rate": 3.679568016218693e-05, + "loss": 1.2684, + "step": 526 + }, + { + "epoch": 0.79, + "grad_norm": 0.09271769970655441, + "learning_rate": 3.6776596924300525e-05, + "loss": 1.2618, + "step": 527 + }, + { + "epoch": 0.8, + "grad_norm": 0.1034354418516159, + "learning_rate": 3.6757462009396446e-05, + "loss": 1.345, + "step": 528 + }, + { + "epoch": 0.8, + "grad_norm": 0.09431083500385284, + "learning_rate": 3.673827547641604e-05, + "loss": 1.2289, + "step": 529 + }, + { + "epoch": 0.8, + "grad_norm": 0.0986984595656395, + "learning_rate": 3.671903738445967e-05, + "loss": 1.3366, + "step": 530 + }, + { + "epoch": 0.8, + "grad_norm": 0.10041950643062592, + "learning_rate": 3.6699747792786496e-05, + "loss": 1.1818, + "step": 531 + }, + { + "epoch": 0.8, + "grad_norm": 0.09923342615365982, + "learning_rate": 3.6680406760814336e-05, + "loss": 1.3051, + "step": 532 + }, + { + "epoch": 0.8, + "grad_norm": 0.09473925828933716, + "learning_rate": 3.6661014348119433e-05, + "loss": 1.2343, + "step": 533 + }, + { + "epoch": 0.81, + "grad_norm": 0.09728220850229263, + "learning_rate": 3.664157061443632e-05, + "loss": 1.2822, + "step": 534 + }, + { + "epoch": 0.81, + "grad_norm": 0.0991523414850235, + "learning_rate": 3.6622075619657606e-05, + "loss": 1.3052, + "step": 535 + }, + { + "epoch": 0.81, + "grad_norm": 0.10301706194877625, + "learning_rate": 3.6602529423833806e-05, + "loss": 1.2459, + "step": 536 + }, + { + "epoch": 0.81, + "grad_norm": 0.09386403858661652, + "learning_rate": 3.658293208717313e-05, + "loss": 1.247, + "step": 537 + }, + { + "epoch": 0.81, + "grad_norm": 0.11030415445566177, + "learning_rate": 3.6563283670041337e-05, + "loss": 1.3022, + "step": 538 + }, + { + "epoch": 0.81, + "grad_norm": 0.11287105828523636, + "learning_rate": 3.6543584232961535e-05, + "loss": 1.2078, + "step": 539 + }, + { + "epoch": 0.81, + "grad_norm": 0.12047088146209717, + "learning_rate": 3.652383383661396e-05, + "loss": 1.3637, + "step": 540 + }, + { + "epoch": 0.82, + "grad_norm": 0.09657835215330124, + "learning_rate": 3.650403254183585e-05, + "loss": 1.3079, + "step": 541 + }, + { + "epoch": 0.82, + "grad_norm": 0.10076452046632767, + "learning_rate": 3.648418040962121e-05, + "loss": 1.3558, + "step": 542 + }, + { + "epoch": 0.82, + "grad_norm": 0.09592485427856445, + "learning_rate": 3.646427750112063e-05, + "loss": 1.2828, + "step": 543 + }, + { + "epoch": 0.82, + "grad_norm": 0.09359516203403473, + "learning_rate": 3.644432387764113e-05, + "loss": 1.178, + "step": 544 + }, + { + "epoch": 0.82, + "grad_norm": 0.10380193591117859, + "learning_rate": 3.6424319600645925e-05, + "loss": 1.2955, + "step": 545 + }, + { + "epoch": 0.82, + "grad_norm": 0.10107357800006866, + "learning_rate": 3.640426473175427e-05, + "loss": 1.3021, + "step": 546 + }, + { + "epoch": 0.82, + "grad_norm": 0.10654807835817337, + "learning_rate": 3.638415933274127e-05, + "loss": 1.2851, + "step": 547 + }, + { + "epoch": 0.83, + "grad_norm": 0.0942719355225563, + "learning_rate": 3.636400346553765e-05, + "loss": 1.2524, + "step": 548 + }, + { + "epoch": 0.83, + "grad_norm": 0.10161761194467545, + "learning_rate": 3.634379719222961e-05, + "loss": 1.2514, + "step": 549 + }, + { + "epoch": 0.83, + "grad_norm": 0.09901178628206253, + "learning_rate": 3.632354057505862e-05, + "loss": 1.2728, + "step": 550 + }, + { + "epoch": 0.83, + "grad_norm": 0.1025756374001503, + "learning_rate": 3.6303233676421205e-05, + "loss": 1.2906, + "step": 551 + }, + { + "epoch": 0.83, + "grad_norm": 0.09773645550012589, + "learning_rate": 3.62828765588688e-05, + "loss": 1.2359, + "step": 552 + }, + { + "epoch": 0.83, + "grad_norm": 0.10127020627260208, + "learning_rate": 3.6262469285107505e-05, + "loss": 1.3059, + "step": 553 + }, + { + "epoch": 0.84, + "grad_norm": 0.1283726692199707, + "learning_rate": 3.6242011917997936e-05, + "loss": 1.2667, + "step": 554 + }, + { + "epoch": 0.84, + "grad_norm": 0.09700168669223785, + "learning_rate": 3.6221504520554983e-05, + "loss": 1.2946, + "step": 555 + }, + { + "epoch": 0.84, + "grad_norm": 0.09845522046089172, + "learning_rate": 3.620094715594768e-05, + "loss": 1.319, + "step": 556 + }, + { + "epoch": 0.84, + "grad_norm": 0.09200394153594971, + "learning_rate": 3.6180339887498953e-05, + "loss": 1.2773, + "step": 557 + }, + { + "epoch": 0.84, + "grad_norm": 0.1027708649635315, + "learning_rate": 3.615968277868545e-05, + "loss": 1.3582, + "step": 558 + }, + { + "epoch": 0.84, + "grad_norm": 0.10044080018997192, + "learning_rate": 3.613897589313735e-05, + "loss": 1.2858, + "step": 559 + }, + { + "epoch": 0.84, + "grad_norm": 0.10135126113891602, + "learning_rate": 3.611821929463815e-05, + "loss": 1.2375, + "step": 560 + }, + { + "epoch": 0.85, + "grad_norm": 0.10125390440225601, + "learning_rate": 3.6097413047124475e-05, + "loss": 1.3544, + "step": 561 + }, + { + "epoch": 0.85, + "grad_norm": 0.0995013639330864, + "learning_rate": 3.60765572146859e-05, + "loss": 1.3038, + "step": 562 + }, + { + "epoch": 0.85, + "grad_norm": 0.09311343729496002, + "learning_rate": 3.605565186156474e-05, + "loss": 1.2396, + "step": 563 + }, + { + "epoch": 0.85, + "grad_norm": 0.09740523993968964, + "learning_rate": 3.603469705215582e-05, + "loss": 1.2878, + "step": 564 + }, + { + "epoch": 0.85, + "grad_norm": 0.11362947523593903, + "learning_rate": 3.601369285100632e-05, + "loss": 1.3271, + "step": 565 + }, + { + "epoch": 0.85, + "grad_norm": 0.09949863702058792, + "learning_rate": 3.599263932281557e-05, + "loss": 1.3138, + "step": 566 + }, + { + "epoch": 0.86, + "grad_norm": 0.0992012619972229, + "learning_rate": 3.597153653243484e-05, + "loss": 1.2969, + "step": 567 + }, + { + "epoch": 0.86, + "grad_norm": 0.09764760732650757, + "learning_rate": 3.5950384544867133e-05, + "loss": 1.3148, + "step": 568 + }, + { + "epoch": 0.86, + "grad_norm": 0.1094299852848053, + "learning_rate": 3.5929183425267e-05, + "loss": 1.3693, + "step": 569 + }, + { + "epoch": 0.86, + "grad_norm": 0.10212252289056778, + "learning_rate": 3.5907933238940336e-05, + "loss": 1.3674, + "step": 570 + }, + { + "epoch": 0.86, + "grad_norm": 0.1036219522356987, + "learning_rate": 3.588663405134417e-05, + "loss": 1.278, + "step": 571 + }, + { + "epoch": 0.86, + "grad_norm": 0.1177903488278389, + "learning_rate": 3.5865285928086475e-05, + "loss": 1.2635, + "step": 572 + }, + { + "epoch": 0.86, + "grad_norm": 0.1065855324268341, + "learning_rate": 3.584388893492596e-05, + "loss": 1.3486, + "step": 573 + }, + { + "epoch": 0.87, + "grad_norm": 0.09521755576133728, + "learning_rate": 3.582244313777187e-05, + "loss": 1.247, + "step": 574 + }, + { + "epoch": 0.87, + "grad_norm": 0.09789706021547318, + "learning_rate": 3.580094860268377e-05, + "loss": 1.2417, + "step": 575 + }, + { + "epoch": 0.87, + "grad_norm": 0.09712113440036774, + "learning_rate": 3.577940539587137e-05, + "loss": 1.2698, + "step": 576 + }, + { + "epoch": 0.87, + "grad_norm": 0.11199524998664856, + "learning_rate": 3.5757813583694294e-05, + "loss": 1.2876, + "step": 577 + }, + { + "epoch": 0.87, + "grad_norm": 0.10178734362125397, + "learning_rate": 3.5736173232661876e-05, + "loss": 1.2512, + "step": 578 + }, + { + "epoch": 0.87, + "grad_norm": 0.09522188454866409, + "learning_rate": 3.5714484409432994e-05, + "loss": 1.3234, + "step": 579 + }, + { + "epoch": 0.87, + "grad_norm": 0.12323606014251709, + "learning_rate": 3.56927471808158e-05, + "loss": 1.2714, + "step": 580 + }, + { + "epoch": 0.88, + "grad_norm": 0.09996538609266281, + "learning_rate": 3.567096161376757e-05, + "loss": 1.2863, + "step": 581 + }, + { + "epoch": 0.88, + "grad_norm": 0.09796728193759918, + "learning_rate": 3.564912777539447e-05, + "loss": 1.2674, + "step": 582 + }, + { + "epoch": 0.88, + "grad_norm": 0.0988701730966568, + "learning_rate": 3.562724573295136e-05, + "loss": 1.3397, + "step": 583 + }, + { + "epoch": 0.88, + "grad_norm": 0.10059138387441635, + "learning_rate": 3.560531555384158e-05, + "loss": 1.2771, + "step": 584 + }, + { + "epoch": 0.88, + "grad_norm": 0.15794380009174347, + "learning_rate": 3.5583337305616755e-05, + "loss": 1.2181, + "step": 585 + }, + { + "epoch": 0.88, + "grad_norm": 0.09785021096467972, + "learning_rate": 3.5561311055976556e-05, + "loss": 1.2615, + "step": 586 + }, + { + "epoch": 0.89, + "grad_norm": 0.09910497814416885, + "learning_rate": 3.553923687276854e-05, + "loss": 1.2092, + "step": 587 + }, + { + "epoch": 0.89, + "grad_norm": 0.09988433122634888, + "learning_rate": 3.551711482398789e-05, + "loss": 1.2396, + "step": 588 + }, + { + "epoch": 0.89, + "grad_norm": 0.10087579488754272, + "learning_rate": 3.5494944977777245e-05, + "loss": 1.2102, + "step": 589 + }, + { + "epoch": 0.89, + "grad_norm": 0.09836583584547043, + "learning_rate": 3.5472727402426475e-05, + "loss": 1.2465, + "step": 590 + }, + { + "epoch": 0.89, + "grad_norm": 0.09926188737154007, + "learning_rate": 3.5450462166372455e-05, + "loss": 1.2807, + "step": 591 + }, + { + "epoch": 0.89, + "grad_norm": 0.09702812135219574, + "learning_rate": 3.5428149338198885e-05, + "loss": 1.2095, + "step": 592 + }, + { + "epoch": 0.89, + "grad_norm": 0.10201310366392136, + "learning_rate": 3.540578898663606e-05, + "loss": 1.2091, + "step": 593 + }, + { + "epoch": 0.9, + "grad_norm": 0.10112461447715759, + "learning_rate": 3.538338118056065e-05, + "loss": 1.258, + "step": 594 + }, + { + "epoch": 0.9, + "grad_norm": 0.12413302063941956, + "learning_rate": 3.536092598899552e-05, + "loss": 1.2773, + "step": 595 + }, + { + "epoch": 0.9, + "grad_norm": 0.10083062201738358, + "learning_rate": 3.533842348110947e-05, + "loss": 1.3226, + "step": 596 + }, + { + "epoch": 0.9, + "grad_norm": 0.10039084404706955, + "learning_rate": 3.531587372621708e-05, + "loss": 1.3209, + "step": 597 + }, + { + "epoch": 0.9, + "grad_norm": 0.22292360663414001, + "learning_rate": 3.5293276793778445e-05, + "loss": 1.314, + "step": 598 + }, + { + "epoch": 0.9, + "grad_norm": 0.11285489052534103, + "learning_rate": 3.527063275339898e-05, + "loss": 1.2625, + "step": 599 + }, + { + "epoch": 0.9, + "grad_norm": 0.10537991672754288, + "learning_rate": 3.524794167482921e-05, + "loss": 1.2545, + "step": 600 + }, + { + "epoch": 0.91, + "grad_norm": 0.10544916242361069, + "learning_rate": 3.522520362796456e-05, + "loss": 1.2781, + "step": 601 + }, + { + "epoch": 0.91, + "grad_norm": 0.09848339110612869, + "learning_rate": 3.520241868284513e-05, + "loss": 1.215, + "step": 602 + }, + { + "epoch": 0.91, + "grad_norm": 0.10107920318841934, + "learning_rate": 3.517958690965547e-05, + "loss": 1.2638, + "step": 603 + }, + { + "epoch": 0.91, + "grad_norm": 0.10027237236499786, + "learning_rate": 3.5156708378724365e-05, + "loss": 1.2571, + "step": 604 + }, + { + "epoch": 0.91, + "grad_norm": 0.10818390548229218, + "learning_rate": 3.5133783160524676e-05, + "loss": 1.1813, + "step": 605 + }, + { + "epoch": 0.91, + "grad_norm": 0.10106457024812698, + "learning_rate": 3.511081132567302e-05, + "loss": 1.2597, + "step": 606 + }, + { + "epoch": 0.92, + "grad_norm": 0.10528743267059326, + "learning_rate": 3.508779294492963e-05, + "loss": 1.2586, + "step": 607 + }, + { + "epoch": 0.92, + "grad_norm": 0.10281350463628769, + "learning_rate": 3.506472808919814e-05, + "loss": 1.1717, + "step": 608 + }, + { + "epoch": 0.92, + "grad_norm": 0.10465335845947266, + "learning_rate": 3.504161682952528e-05, + "loss": 1.262, + "step": 609 + }, + { + "epoch": 0.92, + "grad_norm": 0.10388662666082382, + "learning_rate": 3.501845923710079e-05, + "loss": 1.2159, + "step": 610 + }, + { + "epoch": 0.92, + "grad_norm": 0.09473969787359238, + "learning_rate": 3.499525538325706e-05, + "loss": 1.2097, + "step": 611 + }, + { + "epoch": 0.92, + "grad_norm": 0.12780483067035675, + "learning_rate": 3.4972005339469044e-05, + "loss": 1.2865, + "step": 612 + }, + { + "epoch": 0.92, + "grad_norm": 0.10141358524560928, + "learning_rate": 3.494870917735392e-05, + "loss": 1.251, + "step": 613 + }, + { + "epoch": 0.93, + "grad_norm": 0.10082539170980453, + "learning_rate": 3.4925366968670954e-05, + "loss": 1.1985, + "step": 614 + }, + { + "epoch": 0.93, + "grad_norm": 0.11620905995368958, + "learning_rate": 3.490197878532125e-05, + "loss": 1.2248, + "step": 615 + }, + { + "epoch": 0.93, + "grad_norm": 0.11251901090145111, + "learning_rate": 3.487854469934752e-05, + "loss": 1.2783, + "step": 616 + }, + { + "epoch": 0.93, + "grad_norm": 0.10215379297733307, + "learning_rate": 3.485506478293386e-05, + "loss": 1.2728, + "step": 617 + }, + { + "epoch": 0.93, + "grad_norm": 0.10099799931049347, + "learning_rate": 3.4831539108405554e-05, + "loss": 1.2482, + "step": 618 + }, + { + "epoch": 0.93, + "grad_norm": 0.10392336547374725, + "learning_rate": 3.4807967748228844e-05, + "loss": 1.3398, + "step": 619 + }, + { + "epoch": 0.93, + "grad_norm": 0.09926697611808777, + "learning_rate": 3.4784350775010666e-05, + "loss": 1.2032, + "step": 620 + }, + { + "epoch": 0.94, + "grad_norm": 0.11885109543800354, + "learning_rate": 3.476068826149849e-05, + "loss": 1.2758, + "step": 621 + }, + { + "epoch": 0.94, + "grad_norm": 0.11147400736808777, + "learning_rate": 3.473698028058004e-05, + "loss": 1.3395, + "step": 622 + }, + { + "epoch": 0.94, + "grad_norm": 0.10120638459920883, + "learning_rate": 3.471322690528311e-05, + "loss": 1.2443, + "step": 623 + }, + { + "epoch": 0.94, + "grad_norm": 0.0963161513209343, + "learning_rate": 3.46894282087753e-05, + "loss": 1.2044, + "step": 624 + }, + { + "epoch": 0.94, + "grad_norm": 0.09679801762104034, + "learning_rate": 3.4665584264363853e-05, + "loss": 1.2588, + "step": 625 + }, + { + "epoch": 0.94, + "grad_norm": 0.09973888844251633, + "learning_rate": 3.464169514549535e-05, + "loss": 1.3125, + "step": 626 + }, + { + "epoch": 0.95, + "grad_norm": 0.14279921352863312, + "learning_rate": 3.4617760925755535e-05, + "loss": 1.3325, + "step": 627 + }, + { + "epoch": 0.95, + "grad_norm": 0.09309370070695877, + "learning_rate": 3.4593781678869086e-05, + "loss": 1.2179, + "step": 628 + }, + { + "epoch": 0.95, + "grad_norm": 0.09645505994558334, + "learning_rate": 3.456975747869938e-05, + "loss": 1.3101, + "step": 629 + }, + { + "epoch": 0.95, + "grad_norm": 0.09843626618385315, + "learning_rate": 3.454568839924823e-05, + "loss": 1.2498, + "step": 630 + }, + { + "epoch": 0.95, + "grad_norm": 0.10125508904457092, + "learning_rate": 3.452157451465574e-05, + "loss": 1.3227, + "step": 631 + }, + { + "epoch": 0.95, + "grad_norm": 0.10442937910556793, + "learning_rate": 3.44974158992e-05, + "loss": 1.2261, + "step": 632 + }, + { + "epoch": 0.95, + "grad_norm": 0.12025795876979828, + "learning_rate": 3.447321262729689e-05, + "loss": 1.2882, + "step": 633 + }, + { + "epoch": 0.96, + "grad_norm": 0.12409981340169907, + "learning_rate": 3.444896477349983e-05, + "loss": 1.2798, + "step": 634 + }, + { + "epoch": 0.96, + "grad_norm": 0.1082785576581955, + "learning_rate": 3.442467241249959e-05, + "loss": 1.2554, + "step": 635 + }, + { + "epoch": 0.96, + "grad_norm": 0.10678684711456299, + "learning_rate": 3.440033561912402e-05, + "loss": 1.275, + "step": 636 + }, + { + "epoch": 0.96, + "grad_norm": 0.101189024746418, + "learning_rate": 3.437595446833785e-05, + "loss": 1.3104, + "step": 637 + }, + { + "epoch": 0.96, + "grad_norm": 0.09756095707416534, + "learning_rate": 3.435152903524243e-05, + "loss": 1.2176, + "step": 638 + }, + { + "epoch": 0.96, + "grad_norm": 0.10495869815349579, + "learning_rate": 3.432705939507551e-05, + "loss": 1.2767, + "step": 639 + }, + { + "epoch": 0.97, + "grad_norm": 0.11430257558822632, + "learning_rate": 3.430254562321103e-05, + "loss": 1.2273, + "step": 640 + }, + { + "epoch": 0.97, + "grad_norm": 0.09991898387670517, + "learning_rate": 3.427798779515886e-05, + "loss": 1.251, + "step": 641 + }, + { + "epoch": 0.97, + "grad_norm": 0.10373462736606598, + "learning_rate": 3.425338598656456e-05, + "loss": 1.3045, + "step": 642 + }, + { + "epoch": 0.97, + "grad_norm": 0.10063493996858597, + "learning_rate": 3.422874027320919e-05, + "loss": 1.3163, + "step": 643 + }, + { + "epoch": 0.97, + "grad_norm": 0.09804417937994003, + "learning_rate": 3.420405073100905e-05, + "loss": 1.2773, + "step": 644 + }, + { + "epoch": 0.97, + "grad_norm": 0.10426832735538483, + "learning_rate": 3.41793174360154e-05, + "loss": 1.31, + "step": 645 + }, + { + "epoch": 0.97, + "grad_norm": 0.10056082159280777, + "learning_rate": 3.415454046441433e-05, + "loss": 1.2994, + "step": 646 + }, + { + "epoch": 0.98, + "grad_norm": 0.10127512365579605, + "learning_rate": 3.412971989252644e-05, + "loss": 1.2515, + "step": 647 + }, + { + "epoch": 0.98, + "grad_norm": 0.1075519472360611, + "learning_rate": 3.410485579680664e-05, + "loss": 1.2516, + "step": 648 + }, + { + "epoch": 0.98, + "grad_norm": 0.1012885719537735, + "learning_rate": 3.407994825384389e-05, + "loss": 1.2629, + "step": 649 + }, + { + "epoch": 0.98, + "grad_norm": 0.5178219079971313, + "learning_rate": 3.4054997340361e-05, + "loss": 1.2238, + "step": 650 + }, + { + "epoch": 0.98, + "grad_norm": 0.10396286100149155, + "learning_rate": 3.403000313321437e-05, + "loss": 1.3326, + "step": 651 + }, + { + "epoch": 0.98, + "grad_norm": 0.11031419783830643, + "learning_rate": 3.4004965709393754e-05, + "loss": 1.3087, + "step": 652 + }, + { + "epoch": 0.98, + "grad_norm": 0.10638967901468277, + "learning_rate": 3.397988514602202e-05, + "loss": 1.2847, + "step": 653 + }, + { + "epoch": 0.99, + "grad_norm": 0.10948842763900757, + "learning_rate": 3.3954761520354934e-05, + "loss": 1.2085, + "step": 654 + }, + { + "epoch": 0.99, + "grad_norm": 0.10390760004520416, + "learning_rate": 3.3929594909780895e-05, + "loss": 1.2409, + "step": 655 + }, + { + "epoch": 0.99, + "grad_norm": 0.18467743694782257, + "learning_rate": 3.390438539182071e-05, + "loss": 1.2138, + "step": 656 + }, + { + "epoch": 0.99, + "grad_norm": 0.10950042307376862, + "learning_rate": 3.387913304412737e-05, + "loss": 1.2487, + "step": 657 + }, + { + "epoch": 0.99, + "grad_norm": 0.09976302832365036, + "learning_rate": 3.385383794448577e-05, + "loss": 1.2847, + "step": 658 + }, + { + "epoch": 0.99, + "grad_norm": 0.1013193428516388, + "learning_rate": 3.38285001708125e-05, + "loss": 1.1697, + "step": 659 + }, + { + "epoch": 1.0, + "grad_norm": 0.10355798900127411, + "learning_rate": 3.380311980115561e-05, + "loss": 1.2567, + "step": 660 + }, + { + "epoch": 1.0, + "grad_norm": 0.1062517762184143, + "learning_rate": 3.377769691369436e-05, + "loss": 1.206, + "step": 661 + }, + { + "epoch": 1.0, + "grad_norm": 0.10621873289346695, + "learning_rate": 3.375223158673897e-05, + "loss": 1.2086, + "step": 662 + }, + { + "epoch": 1.0, + "grad_norm": 0.1044023185968399, + "learning_rate": 3.372672389873037e-05, + "loss": 1.3169, + "step": 663 + }, + { + "epoch": 1.0, + "grad_norm": 0.17443831264972687, + "learning_rate": 3.370117392824001e-05, + "loss": 1.2068, + "step": 664 + }, + { + "epoch": 1.0, + "grad_norm": 0.15318287909030914, + "learning_rate": 3.367558175396956e-05, + "loss": 1.2492, + "step": 665 + }, + { + "epoch": 1.0, + "grad_norm": 0.138312429189682, + "learning_rate": 3.364994745475069e-05, + "loss": 1.1956, + "step": 666 + }, + { + "epoch": 1.01, + "grad_norm": 0.11741306632757187, + "learning_rate": 3.3624271109544836e-05, + "loss": 1.2354, + "step": 667 + }, + { + "epoch": 1.01, + "grad_norm": 0.11220765858888626, + "learning_rate": 3.359855279744295e-05, + "loss": 1.2772, + "step": 668 + }, + { + "epoch": 1.01, + "grad_norm": 0.11640415340662003, + "learning_rate": 3.3572792597665244e-05, + "loss": 1.2325, + "step": 669 + }, + { + "epoch": 1.01, + "grad_norm": 0.12331370264291763, + "learning_rate": 3.3546990589560974e-05, + "loss": 1.2054, + "step": 670 + }, + { + "epoch": 1.01, + "grad_norm": 0.43303507566452026, + "learning_rate": 3.352114685260815e-05, + "loss": 1.1879, + "step": 671 + }, + { + "epoch": 1.01, + "grad_norm": 0.11492155492305756, + "learning_rate": 3.3495261466413344e-05, + "loss": 1.1726, + "step": 672 + }, + { + "epoch": 1.01, + "grad_norm": 0.15229344367980957, + "learning_rate": 3.3469334510711415e-05, + "loss": 1.1263, + "step": 673 + }, + { + "epoch": 1.02, + "grad_norm": 0.12326847016811371, + "learning_rate": 3.3443366065365274e-05, + "loss": 1.115, + "step": 674 + }, + { + "epoch": 1.02, + "grad_norm": 0.1297662854194641, + "learning_rate": 3.3417356210365616e-05, + "loss": 1.1661, + "step": 675 + }, + { + "epoch": 1.02, + "grad_norm": 0.11710534244775772, + "learning_rate": 3.33913050258307e-05, + "loss": 1.2177, + "step": 676 + }, + { + "epoch": 1.02, + "grad_norm": 0.11790799349546432, + "learning_rate": 3.3365212592006096e-05, + "loss": 1.165, + "step": 677 + }, + { + "epoch": 1.02, + "grad_norm": 0.1255820244550705, + "learning_rate": 3.333907898926443e-05, + "loss": 1.1518, + "step": 678 + }, + { + "epoch": 1.02, + "grad_norm": 0.11197784543037415, + "learning_rate": 3.3312904298105136e-05, + "loss": 1.0982, + "step": 679 + }, + { + "epoch": 1.03, + "grad_norm": 0.13386082649230957, + "learning_rate": 3.328668859915423e-05, + "loss": 1.1454, + "step": 680 + }, + { + "epoch": 1.03, + "grad_norm": 0.12398520857095718, + "learning_rate": 3.326043197316402e-05, + "loss": 1.0849, + "step": 681 + }, + { + "epoch": 1.03, + "grad_norm": 0.11305395513772964, + "learning_rate": 3.3234134501012914e-05, + "loss": 1.2355, + "step": 682 + }, + { + "epoch": 1.03, + "grad_norm": 0.11172299087047577, + "learning_rate": 3.3207796263705094e-05, + "loss": 1.1795, + "step": 683 + }, + { + "epoch": 1.03, + "grad_norm": 0.10579285770654678, + "learning_rate": 3.318141734237035e-05, + "loss": 1.114, + "step": 684 + }, + { + "epoch": 1.03, + "grad_norm": 0.11569254845380783, + "learning_rate": 3.315499781826378e-05, + "loss": 1.2016, + "step": 685 + }, + { + "epoch": 1.03, + "grad_norm": 0.127850741147995, + "learning_rate": 3.3128537772765547e-05, + "loss": 1.1837, + "step": 686 + }, + { + "epoch": 1.04, + "grad_norm": 0.10873915255069733, + "learning_rate": 3.3102037287380624e-05, + "loss": 1.1717, + "step": 687 + }, + { + "epoch": 1.04, + "grad_norm": 0.1114748865365982, + "learning_rate": 3.307549644373857e-05, + "loss": 1.1275, + "step": 688 + }, + { + "epoch": 1.04, + "grad_norm": 0.11789058893918991, + "learning_rate": 3.304891532359325e-05, + "loss": 1.2325, + "step": 689 + }, + { + "epoch": 1.04, + "grad_norm": 0.1156744733452797, + "learning_rate": 3.302229400882259e-05, + "loss": 1.2373, + "step": 690 + }, + { + "epoch": 1.04, + "grad_norm": 0.11554136127233505, + "learning_rate": 3.299563258142833e-05, + "loss": 1.2358, + "step": 691 + }, + { + "epoch": 1.04, + "grad_norm": 0.1327182948589325, + "learning_rate": 3.296893112353577e-05, + "loss": 1.1234, + "step": 692 + }, + { + "epoch": 1.05, + "grad_norm": 0.12316634505987167, + "learning_rate": 3.2942189717393526e-05, + "loss": 1.2036, + "step": 693 + }, + { + "epoch": 1.05, + "grad_norm": 0.11200796812772751, + "learning_rate": 3.291540844537324e-05, + "loss": 1.2025, + "step": 694 + }, + { + "epoch": 1.05, + "grad_norm": 0.11107231676578522, + "learning_rate": 3.288858738996939e-05, + "loss": 1.2021, + "step": 695 + }, + { + "epoch": 1.05, + "grad_norm": 0.12234508246183395, + "learning_rate": 3.286172663379896e-05, + "loss": 1.2193, + "step": 696 + }, + { + "epoch": 1.05, + "grad_norm": 0.1103217676281929, + "learning_rate": 3.283482625960125e-05, + "loss": 1.2, + "step": 697 + }, + { + "epoch": 1.05, + "grad_norm": 0.11867187172174454, + "learning_rate": 3.280788635023758e-05, + "loss": 1.1411, + "step": 698 + }, + { + "epoch": 1.05, + "grad_norm": 0.12065901607275009, + "learning_rate": 3.278090698869108e-05, + "loss": 1.1952, + "step": 699 + }, + { + "epoch": 1.06, + "grad_norm": 0.12243732064962387, + "learning_rate": 3.2753888258066374e-05, + "loss": 1.2088, + "step": 700 + } + ], + "logging_steps": 1.0, + "max_steps": 1989, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 50, + "total_flos": 3.805551069631611e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}