diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7082 @@ +{ + "best_metric": 0.035888671875, + "best_model_checkpoint": "./results_morgangen/checkpoint-100000", + "epoch": 0.001, + "eval_steps": 20000, + "global_step": 100000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1e-06, + "grad_norm": 2.3610475063323975, + "learning_rate": 4.9328196339992464e-06, + "loss": 3.6864, + "step": 100 + }, + { + "epoch": 2e-06, + "grad_norm": 1.656525731086731, + "learning_rate": 5.719504324825564e-06, + "loss": 1.718, + "step": 200 + }, + { + "epoch": 3e-06, + "grad_norm": 1.8576802015304565, + "learning_rate": 6.1708683260303926e-06, + "loss": 1.4701, + "step": 300 + }, + { + "epoch": 4e-06, + "grad_norm": 1.383483648300171, + "learning_rate": 6.488740554563935e-06, + "loss": 1.3455, + "step": 400 + }, + { + "epoch": 5e-06, + "grad_norm": 1.4719737768173218, + "learning_rate": 6.734317372309117e-06, + "loss": 1.261, + "step": 500 + }, + { + "epoch": 6e-06, + "grad_norm": 1.5180269479751587, + "learning_rate": 6.934466112452983e-06, + "loss": 1.1993, + "step": 600 + }, + { + "epoch": 7e-06, + "grad_norm": 1.4714821577072144, + "learning_rate": 7.103398676137137e-06, + "loss": 1.1509, + "step": 700 + }, + { + "epoch": 8e-06, + "grad_norm": 1.4220333099365234, + "learning_rate": 7.249551256067741e-06, + "loss": 1.104, + "step": 800 + }, + { + "epoch": 9e-06, + "grad_norm": 2.056689977645874, + "learning_rate": 7.378343796989793e-06, + "loss": 1.0759, + "step": 900 + }, + { + "epoch": 1e-05, + "grad_norm": 1.5754709243774414, + "learning_rate": 7.493465960993282e-06, + "loss": 1.0397, + "step": 1000 + }, + { + "epoch": 1.1e-05, + "grad_norm": 1.4028830528259277, + "learning_rate": 7.596550404874257e-06, + "loss": 1.0055, + "step": 1100 + }, + { + "epoch": 1.2e-05, + "grad_norm": 1.4989030361175537, + "learning_rate": 7.691601109175854e-06, + "loss": 0.9769, + "step": 1200 + }, + { + "epoch": 1.3e-05, + "grad_norm": 1.9516693353652954, + "learning_rate": 7.778996312200985e-06, + "loss": 0.9499, + "step": 1300 + }, + { + "epoch": 1.4e-05, + "grad_norm": 1.7118744850158691, + "learning_rate": 7.859877791059908e-06, + "loss": 0.9245, + "step": 1400 + }, + { + "epoch": 1.5e-05, + "grad_norm": 1.9349831342697144, + "learning_rate": 7.935149519312563e-06, + "loss": 0.9002, + "step": 1500 + }, + { + "epoch": 1.6e-05, + "grad_norm": 1.3496636152267456, + "learning_rate": 8.005539439502828e-06, + "loss": 0.881, + "step": 1600 + }, + { + "epoch": 1.7e-05, + "grad_norm": 1.6026203632354736, + "learning_rate": 8.071642395272339e-06, + "loss": 0.8592, + "step": 1700 + }, + { + "epoch": 1.8e-05, + "grad_norm": 1.9675606489181519, + "learning_rate": 8.133950723905457e-06, + "loss": 0.841, + "step": 1800 + }, + { + "epoch": 1.9e-05, + "grad_norm": 2.057551145553589, + "learning_rate": 8.19287653490949e-06, + "loss": 0.8228, + "step": 1900 + }, + { + "epoch": 2e-05, + "grad_norm": 1.8387638330459595, + "learning_rate": 8.248223335219199e-06, + "loss": 0.8037, + "step": 2000 + }, + { + "epoch": 2.1e-05, + "grad_norm": 1.7323527336120605, + "learning_rate": 8.30140420048809e-06, + "loss": 0.7874, + "step": 2100 + }, + { + "epoch": 2.2e-05, + "grad_norm": 2.6023099422454834, + "learning_rate": 8.352101374530827e-06, + "loss": 0.7677, + "step": 2200 + }, + { + "epoch": 2.3e-05, + "grad_norm": 1.9935753345489502, + "learning_rate": 8.400536533238381e-06, + "loss": 0.7554, + "step": 2300 + }, + { + "epoch": 2.4e-05, + "grad_norm": 1.632101058959961, + "learning_rate": 8.446902938290931e-06, + "loss": 0.7376, + "step": 2400 + }, + { + "epoch": 2.5e-05, + "grad_norm": 2.2343554496765137, + "learning_rate": 8.491370094967829e-06, + "loss": 0.7202, + "step": 2500 + }, + { + "epoch": 2.6e-05, + "grad_norm": 2.182626962661743, + "learning_rate": 8.534087492996389e-06, + "loss": 0.706, + "step": 2600 + }, + { + "epoch": 2.7e-05, + "grad_norm": 1.9008549451828003, + "learning_rate": 8.575187638879847e-06, + "loss": 0.694, + "step": 2700 + }, + { + "epoch": 2.8e-05, + "grad_norm": 1.8491170406341553, + "learning_rate": 8.614788534877808e-06, + "loss": 0.6765, + "step": 2800 + }, + { + "epoch": 2.9e-05, + "grad_norm": 2.2471179962158203, + "learning_rate": 8.652995721556234e-06, + "loss": 0.6633, + "step": 2900 + }, + { + "epoch": 3e-05, + "grad_norm": 1.9031002521514893, + "learning_rate": 8.689903972981059e-06, + "loss": 0.6455, + "step": 3000 + }, + { + "epoch": 3.1e-05, + "grad_norm": 1.8448904752731323, + "learning_rate": 8.725598713115716e-06, + "loss": 0.6356, + "step": 3100 + }, + { + "epoch": 3.2e-05, + "grad_norm": 1.7731763124465942, + "learning_rate": 8.760157206696729e-06, + "loss": 0.6215, + "step": 3200 + }, + { + "epoch": 3.3e-05, + "grad_norm": 1.9336998462677002, + "learning_rate": 8.79364956635058e-06, + "loss": 0.6082, + "step": 3300 + }, + { + "epoch": 3.4e-05, + "grad_norm": 1.5020197629928589, + "learning_rate": 8.82613960896169e-06, + "loss": 0.5918, + "step": 3400 + }, + { + "epoch": 3.5e-05, + "grad_norm": 1.8765689134597778, + "learning_rate": 8.85768558758383e-06, + "loss": 0.5799, + "step": 3500 + }, + { + "epoch": 3.6e-05, + "grad_norm": 1.8304609060287476, + "learning_rate": 8.888340819988166e-06, + "loss": 0.5658, + "step": 3600 + }, + { + "epoch": 3.7e-05, + "grad_norm": 1.663233757019043, + "learning_rate": 8.918154230884686e-06, + "loss": 0.5574, + "step": 3700 + }, + { + "epoch": 3.8e-05, + "grad_norm": 1.8172345161437988, + "learning_rate": 8.947170821665072e-06, + "loss": 0.5465, + "step": 3800 + }, + { + "epoch": 3.9e-05, + "grad_norm": 1.7093075513839722, + "learning_rate": 8.975432078990786e-06, + "loss": 0.5315, + "step": 3900 + }, + { + "epoch": 4e-05, + "grad_norm": 1.7914036512374878, + "learning_rate": 9.002976331538332e-06, + "loss": 0.5227, + "step": 4000 + }, + { + "epoch": 4.1e-05, + "grad_norm": 1.441435694694519, + "learning_rate": 9.029839062600307e-06, + "loss": 0.5131, + "step": 4100 + }, + { + "epoch": 4.2e-05, + "grad_norm": 1.3954449892044067, + "learning_rate": 9.056053184939176e-06, + "loss": 0.4998, + "step": 4200 + }, + { + "epoch": 4.3e-05, + "grad_norm": 1.7888164520263672, + "learning_rate": 9.081649283234784e-06, + "loss": 0.4961, + "step": 4300 + }, + { + "epoch": 4.4e-05, + "grad_norm": 1.7587261199951172, + "learning_rate": 9.106655828605087e-06, + "loss": 0.4875, + "step": 4400 + }, + { + "epoch": 4.5e-05, + "grad_norm": 1.9113582372665405, + "learning_rate": 9.13109936897355e-06, + "loss": 0.4794, + "step": 4500 + }, + { + "epoch": 4.6e-05, + "grad_norm": 1.6648356914520264, + "learning_rate": 9.155004698474792e-06, + "loss": 0.4697, + "step": 4600 + }, + { + "epoch": 4.7e-05, + "grad_norm": 1.5259454250335693, + "learning_rate": 9.17839500860873e-06, + "loss": 0.4622, + "step": 4700 + }, + { + "epoch": 4.8e-05, + "grad_norm": 1.8361080884933472, + "learning_rate": 9.201292023453135e-06, + "loss": 0.453, + "step": 4800 + }, + { + "epoch": 4.9e-05, + "grad_norm": 1.6309137344360352, + "learning_rate": 9.22371612091062e-06, + "loss": 0.4429, + "step": 4900 + }, + { + "epoch": 5e-05, + "grad_norm": 1.7207796573638916, + "learning_rate": 9.245686441685918e-06, + "loss": 0.4382, + "step": 5000 + }, + { + "epoch": 5.1e-05, + "grad_norm": 1.552103042602539, + "learning_rate": 9.267220987454044e-06, + "loss": 0.4315, + "step": 5100 + }, + { + "epoch": 5.2e-05, + "grad_norm": 1.6008425951004028, + "learning_rate": 9.28833670948078e-06, + "loss": 0.4244, + "step": 5200 + }, + { + "epoch": 5.3e-05, + "grad_norm": 1.8220570087432861, + "learning_rate": 9.309049588788657e-06, + "loss": 0.4162, + "step": 5300 + }, + { + "epoch": 5.4e-05, + "grad_norm": 1.5230612754821777, + "learning_rate": 9.329374708818158e-06, + "loss": 0.4112, + "step": 5400 + }, + { + "epoch": 5.5e-05, + "grad_norm": 1.7809470891952515, + "learning_rate": 9.349326321411793e-06, + "loss": 0.4052, + "step": 5500 + }, + { + "epoch": 5.6e-05, + "grad_norm": 1.5959115028381348, + "learning_rate": 9.368917906844062e-06, + "loss": 0.4009, + "step": 5600 + }, + { + "epoch": 5.7e-05, + "grad_norm": 1.563692331314087, + "learning_rate": 9.388162228530614e-06, + "loss": 0.394, + "step": 5700 + }, + { + "epoch": 5.8e-05, + "grad_norm": 1.4869149923324585, + "learning_rate": 9.407071382972726e-06, + "loss": 0.3879, + "step": 5800 + }, + { + "epoch": 5.9e-05, + "grad_norm": 1.5701963901519775, + "learning_rate": 9.425656845426483e-06, + "loss": 0.3784, + "step": 5900 + }, + { + "epoch": 6e-05, + "grad_norm": 1.496894359588623, + "learning_rate": 9.443929511728523e-06, + "loss": 0.3746, + "step": 6000 + }, + { + "epoch": 6.1e-05, + "grad_norm": 1.4307634830474854, + "learning_rate": 9.461899736660011e-06, + "loss": 0.372, + "step": 6100 + }, + { + "epoch": 6.2e-05, + "grad_norm": 1.4771479368209839, + "learning_rate": 9.479577369187091e-06, + "loss": 0.3661, + "step": 6200 + }, + { + "epoch": 6.3e-05, + "grad_norm": 1.2904491424560547, + "learning_rate": 9.496971784878123e-06, + "loss": 0.3625, + "step": 6300 + }, + { + "epoch": 6.4e-05, + "grad_norm": 1.5488417148590088, + "learning_rate": 9.514091915764837e-06, + "loss": 0.3547, + "step": 6400 + }, + { + "epoch": 6.5e-05, + "grad_norm": 1.4266217947006226, + "learning_rate": 9.530946277885485e-06, + "loss": 0.3491, + "step": 6500 + }, + { + "epoch": 6.6e-05, + "grad_norm": 1.5423930883407593, + "learning_rate": 9.547542996722649e-06, + "loss": 0.3442, + "step": 6600 + }, + { + "epoch": 6.7e-05, + "grad_norm": 1.3324171304702759, + "learning_rate": 9.563889830725893e-06, + "loss": 0.3427, + "step": 6700 + }, + { + "epoch": 6.8e-05, + "grad_norm": 1.3407986164093018, + "learning_rate": 9.57999419308974e-06, + "loss": 0.3376, + "step": 6800 + }, + { + "epoch": 6.9e-05, + "grad_norm": 1.303074598312378, + "learning_rate": 9.595863171939976e-06, + "loss": 0.3346, + "step": 6900 + }, + { + "epoch": 7e-05, + "grad_norm": 1.3955286741256714, + "learning_rate": 9.611192939364202e-06, + "loss": 0.3283, + "step": 7000 + }, + { + "epoch": 7.1e-05, + "grad_norm": 1.3460999727249146, + "learning_rate": 9.626462440880078e-06, + "loss": 0.3269, + "step": 7100 + }, + { + "epoch": 7.2e-05, + "grad_norm": 1.5643832683563232, + "learning_rate": 9.641671209028838e-06, + "loss": 0.3235, + "step": 7200 + }, + { + "epoch": 7.3e-05, + "grad_norm": 1.503298044204712, + "learning_rate": 9.65666987557147e-06, + "loss": 0.3184, + "step": 7300 + }, + { + "epoch": 7.4e-05, + "grad_norm": 1.4040926694869995, + "learning_rate": 9.671464166396914e-06, + "loss": 0.3173, + "step": 7400 + }, + { + "epoch": 7.5e-05, + "grad_norm": 1.5793565511703491, + "learning_rate": 9.686059576466255e-06, + "loss": 0.3118, + "step": 7500 + }, + { + "epoch": 7.6e-05, + "grad_norm": 1.2530806064605713, + "learning_rate": 9.700461382066083e-06, + "loss": 0.3073, + "step": 7600 + }, + { + "epoch": 7.7e-05, + "grad_norm": 1.6009125709533691, + "learning_rate": 9.714674652259765e-06, + "loss": 0.3058, + "step": 7700 + }, + { + "epoch": 7.8e-05, + "grad_norm": 1.52604341506958, + "learning_rate": 9.7287042595988e-06, + "loss": 0.299, + "step": 7800 + }, + { + "epoch": 7.9e-05, + "grad_norm": 1.512654185295105, + "learning_rate": 9.742554890150908e-06, + "loss": 0.2997, + "step": 7900 + }, + { + "epoch": 8e-05, + "grad_norm": 1.3372293710708618, + "learning_rate": 9.75623105289651e-06, + "loss": 0.2959, + "step": 8000 + }, + { + "epoch": 8.1e-05, + "grad_norm": 1.3194124698638916, + "learning_rate": 9.769737088540707e-06, + "loss": 0.2915, + "step": 8100 + }, + { + "epoch": 8.2e-05, + "grad_norm": 1.3267931938171387, + "learning_rate": 9.783077177783901e-06, + "loss": 0.2883, + "step": 8200 + }, + { + "epoch": 8.3e-05, + "grad_norm": 1.4453672170639038, + "learning_rate": 9.796255349090433e-06, + "loss": 0.2857, + "step": 8300 + }, + { + "epoch": 8.4e-05, + "grad_norm": 1.2656625509262085, + "learning_rate": 9.809275485991406e-06, + "loss": 0.2824, + "step": 8400 + }, + { + "epoch": 8.5e-05, + "grad_norm": 1.347659707069397, + "learning_rate": 9.822141333954775e-06, + "loss": 0.2805, + "step": 8500 + }, + { + "epoch": 8.6e-05, + "grad_norm": 1.3958872556686401, + "learning_rate": 9.834856506853153e-06, + "loss": 0.2777, + "step": 8600 + }, + { + "epoch": 8.7e-05, + "grad_norm": 1.4277667999267578, + "learning_rate": 9.847424493057225e-06, + "loss": 0.2734, + "step": 8700 + }, + { + "epoch": 8.8e-05, + "grad_norm": 1.233550786972046, + "learning_rate": 9.85984866118054e-06, + "loss": 0.2727, + "step": 8800 + }, + { + "epoch": 8.9e-05, + "grad_norm": 1.499273657798767, + "learning_rate": 9.872132265499283e-06, + "loss": 0.2712, + "step": 8900 + }, + { + "epoch": 9e-05, + "grad_norm": 1.4485379457473755, + "learning_rate": 9.884278451068888e-06, + "loss": 0.2669, + "step": 9000 + }, + { + "epoch": 9.1e-05, + "grad_norm": 1.2728357315063477, + "learning_rate": 9.896051320131294e-06, + "loss": 0.2657, + "step": 9100 + }, + { + "epoch": 9.2e-05, + "grad_norm": 1.2725111246109009, + "learning_rate": 9.90793429093813e-06, + "loss": 0.2635, + "step": 9200 + }, + { + "epoch": 9.3e-05, + "grad_norm": 1.3907318115234375, + "learning_rate": 9.919688613870083e-06, + "loss": 0.2581, + "step": 9300 + }, + { + "epoch": 9.4e-05, + "grad_norm": 1.3836479187011719, + "learning_rate": 9.93131704466464e-06, + "loss": 0.2588, + "step": 9400 + }, + { + "epoch": 9.5e-05, + "grad_norm": 1.2773383855819702, + "learning_rate": 9.942822251451706e-06, + "loss": 0.2536, + "step": 9500 + }, + { + "epoch": 9.6e-05, + "grad_norm": 1.2910404205322266, + "learning_rate": 9.954206818428214e-06, + "loss": 0.2513, + "step": 9600 + }, + { + "epoch": 9.7e-05, + "grad_norm": 1.234729528427124, + "learning_rate": 9.96547324934206e-06, + "loss": 0.2476, + "step": 9700 + }, + { + "epoch": 9.8e-05, + "grad_norm": 1.1756150722503662, + "learning_rate": 9.976623970797134e-06, + "loss": 0.2471, + "step": 9800 + }, + { + "epoch": 9.9e-05, + "grad_norm": 1.261687159538269, + "learning_rate": 9.987661335390354e-06, + "loss": 0.2489, + "step": 9900 + }, + { + "epoch": 0.0001, + "grad_norm": 1.3017668724060059, + "learning_rate": 9.998587624690824e-06, + "loss": 0.2435, + "step": 10000 + }, + { + "epoch": 0.000101, + "grad_norm": 1.232535719871521, + "learning_rate": 1e-05, + "loss": 0.2425, + "step": 10100 + }, + { + "epoch": 0.000102, + "grad_norm": 1.306433081626892, + "learning_rate": 1e-05, + "loss": 0.2406, + "step": 10200 + }, + { + "epoch": 0.000103, + "grad_norm": 1.3659272193908691, + "learning_rate": 1e-05, + "loss": 0.2389, + "step": 10300 + }, + { + "epoch": 0.000104, + "grad_norm": 1.1521058082580566, + "learning_rate": 1e-05, + "loss": 0.2338, + "step": 10400 + }, + { + "epoch": 0.000105, + "grad_norm": 1.1397546529769897, + "learning_rate": 1e-05, + "loss": 0.2342, + "step": 10500 + }, + { + "epoch": 0.000106, + "grad_norm": 1.3130905628204346, + "learning_rate": 1e-05, + "loss": 0.2313, + "step": 10600 + }, + { + "epoch": 0.000107, + "grad_norm": 1.1320550441741943, + "learning_rate": 1e-05, + "loss": 0.2288, + "step": 10700 + }, + { + "epoch": 0.000108, + "grad_norm": 1.2157635688781738, + "learning_rate": 1e-05, + "loss": 0.2296, + "step": 10800 + }, + { + "epoch": 0.000109, + "grad_norm": 1.2038499116897583, + "learning_rate": 1e-05, + "loss": 0.2249, + "step": 10900 + }, + { + "epoch": 0.00011, + "grad_norm": 1.3213508129119873, + "learning_rate": 1e-05, + "loss": 0.2243, + "step": 11000 + }, + { + "epoch": 0.000111, + "grad_norm": 1.1428966522216797, + "learning_rate": 1e-05, + "loss": 0.2213, + "step": 11100 + }, + { + "epoch": 0.000112, + "grad_norm": 1.2259374856948853, + "learning_rate": 1e-05, + "loss": 0.2202, + "step": 11200 + }, + { + "epoch": 0.000113, + "grad_norm": 1.1567683219909668, + "learning_rate": 1e-05, + "loss": 0.2175, + "step": 11300 + }, + { + "epoch": 0.000114, + "grad_norm": 1.2655612230300903, + "learning_rate": 1e-05, + "loss": 0.216, + "step": 11400 + }, + { + "epoch": 0.000115, + "grad_norm": 1.1602586507797241, + "learning_rate": 1e-05, + "loss": 0.2146, + "step": 11500 + }, + { + "epoch": 0.000116, + "grad_norm": 1.1369308233261108, + "learning_rate": 1e-05, + "loss": 0.2126, + "step": 11600 + }, + { + "epoch": 0.000117, + "grad_norm": 1.1988592147827148, + "learning_rate": 1e-05, + "loss": 0.2121, + "step": 11700 + }, + { + "epoch": 0.000118, + "grad_norm": 1.087939977645874, + "learning_rate": 1e-05, + "loss": 0.2101, + "step": 11800 + }, + { + "epoch": 0.000119, + "grad_norm": 1.2805454730987549, + "learning_rate": 1e-05, + "loss": 0.2094, + "step": 11900 + }, + { + "epoch": 0.00012, + "grad_norm": 1.4006527662277222, + "learning_rate": 1e-05, + "loss": 0.2043, + "step": 12000 + }, + { + "epoch": 0.000121, + "grad_norm": 1.2651677131652832, + "learning_rate": 1e-05, + "loss": 0.205, + "step": 12100 + }, + { + "epoch": 0.000122, + "grad_norm": 1.3023113012313843, + "learning_rate": 1e-05, + "loss": 0.2066, + "step": 12200 + }, + { + "epoch": 0.000123, + "grad_norm": 1.0964651107788086, + "learning_rate": 1e-05, + "loss": 0.2019, + "step": 12300 + }, + { + "epoch": 0.000124, + "grad_norm": 1.1747757196426392, + "learning_rate": 1e-05, + "loss": 0.201, + "step": 12400 + }, + { + "epoch": 0.000125, + "grad_norm": 1.0360560417175293, + "learning_rate": 1e-05, + "loss": 0.1995, + "step": 12500 + }, + { + "epoch": 0.000126, + "grad_norm": 1.0915257930755615, + "learning_rate": 1e-05, + "loss": 0.1979, + "step": 12600 + }, + { + "epoch": 0.000127, + "grad_norm": 1.1433717012405396, + "learning_rate": 1e-05, + "loss": 0.2003, + "step": 12700 + }, + { + "epoch": 0.000128, + "grad_norm": 1.1049145460128784, + "learning_rate": 1e-05, + "loss": 0.1956, + "step": 12800 + }, + { + "epoch": 0.000129, + "grad_norm": 1.11701238155365, + "learning_rate": 1e-05, + "loss": 0.1951, + "step": 12900 + }, + { + "epoch": 0.00013, + "grad_norm": 1.1755869388580322, + "learning_rate": 1e-05, + "loss": 0.1936, + "step": 13000 + }, + { + "epoch": 0.000131, + "grad_norm": 1.0519227981567383, + "learning_rate": 1e-05, + "loss": 0.1914, + "step": 13100 + }, + { + "epoch": 0.000132, + "grad_norm": 1.1982672214508057, + "learning_rate": 1e-05, + "loss": 0.1895, + "step": 13200 + }, + { + "epoch": 0.000133, + "grad_norm": 1.135452389717102, + "learning_rate": 1e-05, + "loss": 0.1899, + "step": 13300 + }, + { + "epoch": 0.000134, + "grad_norm": 1.0130894184112549, + "learning_rate": 1e-05, + "loss": 0.1858, + "step": 13400 + }, + { + "epoch": 0.000135, + "grad_norm": 1.1471365690231323, + "learning_rate": 1e-05, + "loss": 0.1872, + "step": 13500 + }, + { + "epoch": 0.000136, + "grad_norm": 1.1107739210128784, + "learning_rate": 1e-05, + "loss": 0.1864, + "step": 13600 + }, + { + "epoch": 0.000137, + "grad_norm": 1.1473486423492432, + "learning_rate": 1e-05, + "loss": 0.1854, + "step": 13700 + }, + { + "epoch": 0.000138, + "grad_norm": 1.0697531700134277, + "learning_rate": 1e-05, + "loss": 0.1813, + "step": 13800 + }, + { + "epoch": 0.000139, + "grad_norm": 0.9683561325073242, + "learning_rate": 1e-05, + "loss": 0.1801, + "step": 13900 + }, + { + "epoch": 0.00014, + "grad_norm": 1.1696103811264038, + "learning_rate": 1e-05, + "loss": 0.1802, + "step": 14000 + }, + { + "epoch": 0.000141, + "grad_norm": 1.2879928350448608, + "learning_rate": 1e-05, + "loss": 0.1808, + "step": 14100 + }, + { + "epoch": 0.000142, + "grad_norm": 1.0318293571472168, + "learning_rate": 1e-05, + "loss": 0.1792, + "step": 14200 + }, + { + "epoch": 0.000143, + "grad_norm": 1.0072672367095947, + "learning_rate": 1e-05, + "loss": 0.1784, + "step": 14300 + }, + { + "epoch": 0.000144, + "grad_norm": 1.0204075574874878, + "learning_rate": 1e-05, + "loss": 0.1756, + "step": 14400 + }, + { + "epoch": 0.000145, + "grad_norm": 1.1072639226913452, + "learning_rate": 1e-05, + "loss": 0.174, + "step": 14500 + }, + { + "epoch": 0.000146, + "grad_norm": 1.1650497913360596, + "learning_rate": 1e-05, + "loss": 0.1699, + "step": 14600 + }, + { + "epoch": 0.000147, + "grad_norm": 1.1133906841278076, + "learning_rate": 1e-05, + "loss": 0.1712, + "step": 14700 + }, + { + "epoch": 0.000148, + "grad_norm": 1.2355847358703613, + "learning_rate": 1e-05, + "loss": 0.1712, + "step": 14800 + }, + { + "epoch": 0.000149, + "grad_norm": 1.0743693113327026, + "learning_rate": 1e-05, + "loss": 0.1701, + "step": 14900 + }, + { + "epoch": 0.00015, + "grad_norm": 1.1882842779159546, + "learning_rate": 1e-05, + "loss": 0.1703, + "step": 15000 + }, + { + "epoch": 0.000151, + "grad_norm": 1.0762616395950317, + "learning_rate": 1e-05, + "loss": 0.1692, + "step": 15100 + }, + { + "epoch": 0.000152, + "grad_norm": 1.0435552597045898, + "learning_rate": 1e-05, + "loss": 0.1675, + "step": 15200 + }, + { + "epoch": 0.000153, + "grad_norm": 1.0835367441177368, + "learning_rate": 1e-05, + "loss": 0.1668, + "step": 15300 + }, + { + "epoch": 0.000154, + "grad_norm": 1.0594781637191772, + "learning_rate": 1e-05, + "loss": 0.1638, + "step": 15400 + }, + { + "epoch": 0.000155, + "grad_norm": 1.0666881799697876, + "learning_rate": 1e-05, + "loss": 0.1636, + "step": 15500 + }, + { + "epoch": 0.000156, + "grad_norm": 0.9173826575279236, + "learning_rate": 1e-05, + "loss": 0.1632, + "step": 15600 + }, + { + "epoch": 0.000157, + "grad_norm": 1.1107499599456787, + "learning_rate": 1e-05, + "loss": 0.163, + "step": 15700 + }, + { + "epoch": 0.000158, + "grad_norm": 1.0352386236190796, + "learning_rate": 1e-05, + "loss": 0.1602, + "step": 15800 + }, + { + "epoch": 0.000159, + "grad_norm": 0.9977409839630127, + "learning_rate": 1e-05, + "loss": 0.1623, + "step": 15900 + }, + { + "epoch": 0.00016, + "grad_norm": 1.0943259000778198, + "learning_rate": 1e-05, + "loss": 0.1603, + "step": 16000 + }, + { + "epoch": 0.000161, + "grad_norm": 1.0809710025787354, + "learning_rate": 1e-05, + "loss": 0.1582, + "step": 16100 + }, + { + "epoch": 0.000162, + "grad_norm": 1.1283208131790161, + "learning_rate": 1e-05, + "loss": 0.1583, + "step": 16200 + }, + { + "epoch": 0.000163, + "grad_norm": 1.0325435400009155, + "learning_rate": 1e-05, + "loss": 0.158, + "step": 16300 + }, + { + "epoch": 0.000164, + "grad_norm": 1.0305627584457397, + "learning_rate": 1e-05, + "loss": 0.1573, + "step": 16400 + }, + { + "epoch": 0.000165, + "grad_norm": 1.0640127658843994, + "learning_rate": 1e-05, + "loss": 0.1551, + "step": 16500 + }, + { + "epoch": 0.000166, + "grad_norm": 0.9327529668807983, + "learning_rate": 1e-05, + "loss": 0.1562, + "step": 16600 + }, + { + "epoch": 0.000167, + "grad_norm": 1.0069410800933838, + "learning_rate": 1e-05, + "loss": 0.1533, + "step": 16700 + }, + { + "epoch": 0.000168, + "grad_norm": 1.040076494216919, + "learning_rate": 1e-05, + "loss": 0.1527, + "step": 16800 + }, + { + "epoch": 0.000169, + "grad_norm": 1.008461356163025, + "learning_rate": 1e-05, + "loss": 0.1525, + "step": 16900 + }, + { + "epoch": 0.00017, + "grad_norm": 1.0036898851394653, + "learning_rate": 1e-05, + "loss": 0.1517, + "step": 17000 + }, + { + "epoch": 0.000171, + "grad_norm": 0.9357483386993408, + "learning_rate": 1e-05, + "loss": 0.1511, + "step": 17100 + }, + { + "epoch": 0.000172, + "grad_norm": 1.0033488273620605, + "learning_rate": 1e-05, + "loss": 0.1468, + "step": 17200 + }, + { + "epoch": 0.000173, + "grad_norm": 1.0451477766036987, + "learning_rate": 1e-05, + "loss": 0.15, + "step": 17300 + }, + { + "epoch": 0.000174, + "grad_norm": 0.971612274646759, + "learning_rate": 1e-05, + "loss": 0.1476, + "step": 17400 + }, + { + "epoch": 0.000175, + "grad_norm": 1.079099416732788, + "learning_rate": 1e-05, + "loss": 0.1479, + "step": 17500 + }, + { + "epoch": 0.000176, + "grad_norm": 1.0661680698394775, + "learning_rate": 1e-05, + "loss": 0.1476, + "step": 17600 + }, + { + "epoch": 0.000177, + "grad_norm": 1.0154145956039429, + "learning_rate": 1e-05, + "loss": 0.1467, + "step": 17700 + }, + { + "epoch": 0.000178, + "grad_norm": 1.0474337339401245, + "learning_rate": 1e-05, + "loss": 0.1441, + "step": 17800 + }, + { + "epoch": 0.000179, + "grad_norm": 1.0646860599517822, + "learning_rate": 1e-05, + "loss": 0.1459, + "step": 17900 + }, + { + "epoch": 0.00018, + "grad_norm": 1.0854105949401855, + "learning_rate": 1e-05, + "loss": 0.1437, + "step": 18000 + }, + { + "epoch": 0.000181, + "grad_norm": 0.9846110939979553, + "learning_rate": 1e-05, + "loss": 0.1425, + "step": 18100 + }, + { + "epoch": 0.000182, + "grad_norm": 1.0286470651626587, + "learning_rate": 1e-05, + "loss": 0.1432, + "step": 18200 + }, + { + "epoch": 0.000183, + "grad_norm": 1.0388602018356323, + "learning_rate": 1e-05, + "loss": 0.1403, + "step": 18300 + }, + { + "epoch": 0.000184, + "grad_norm": 0.9657048583030701, + "learning_rate": 1e-05, + "loss": 0.1417, + "step": 18400 + }, + { + "epoch": 0.000185, + "grad_norm": 0.8501772880554199, + "learning_rate": 1e-05, + "loss": 0.1424, + "step": 18500 + }, + { + "epoch": 0.000186, + "grad_norm": 0.9153370261192322, + "learning_rate": 1e-05, + "loss": 0.1376, + "step": 18600 + }, + { + "epoch": 0.000187, + "grad_norm": 0.9047082662582397, + "learning_rate": 1e-05, + "loss": 0.1413, + "step": 18700 + }, + { + "epoch": 0.000188, + "grad_norm": 0.9566175937652588, + "learning_rate": 1e-05, + "loss": 0.1387, + "step": 18800 + }, + { + "epoch": 0.000189, + "grad_norm": 1.069942831993103, + "learning_rate": 1e-05, + "loss": 0.1355, + "step": 18900 + }, + { + "epoch": 0.00019, + "grad_norm": 1.019620656967163, + "learning_rate": 1e-05, + "loss": 0.1357, + "step": 19000 + }, + { + "epoch": 0.000191, + "grad_norm": 0.9842545390129089, + "learning_rate": 1e-05, + "loss": 0.1366, + "step": 19100 + }, + { + "epoch": 0.000192, + "grad_norm": 0.972135603427887, + "learning_rate": 1e-05, + "loss": 0.1357, + "step": 19200 + }, + { + "epoch": 0.000193, + "grad_norm": 0.9025226831436157, + "learning_rate": 1e-05, + "loss": 0.1347, + "step": 19300 + }, + { + "epoch": 0.000194, + "grad_norm": 0.9164988398551941, + "learning_rate": 1e-05, + "loss": 0.1338, + "step": 19400 + }, + { + "epoch": 0.000195, + "grad_norm": 0.8067638874053955, + "learning_rate": 1e-05, + "loss": 0.133, + "step": 19500 + }, + { + "epoch": 0.000196, + "grad_norm": 0.8477145433425903, + "learning_rate": 1e-05, + "loss": 0.1334, + "step": 19600 + }, + { + "epoch": 0.000197, + "grad_norm": 0.860883891582489, + "learning_rate": 1e-05, + "loss": 0.1327, + "step": 19700 + }, + { + "epoch": 0.000198, + "grad_norm": 0.9660979509353638, + "learning_rate": 1e-05, + "loss": 0.1332, + "step": 19800 + }, + { + "epoch": 0.000199, + "grad_norm": 0.8979732394218445, + "learning_rate": 1e-05, + "loss": 0.1317, + "step": 19900 + }, + { + "epoch": 0.0002, + "grad_norm": 0.8831902146339417, + "learning_rate": 1e-05, + "loss": 0.1313, + "step": 20000 + }, + { + "epoch": 0.0002, + "eval_loss": 0.0992431640625, + "eval_runtime": 152.8076, + "eval_samples_per_second": 327.209, + "eval_steps_per_second": 20.451, + "step": 20000 + }, + { + "epoch": 0.000201, + "grad_norm": 0.9081249833106995, + "learning_rate": 1e-05, + "loss": 0.1296, + "step": 20100 + }, + { + "epoch": 0.000202, + "grad_norm": 1.0295116901397705, + "learning_rate": 1e-05, + "loss": 0.1296, + "step": 20200 + }, + { + "epoch": 0.000203, + "grad_norm": 0.8534417152404785, + "learning_rate": 1e-05, + "loss": 0.1271, + "step": 20300 + }, + { + "epoch": 0.000204, + "grad_norm": 0.8878474235534668, + "learning_rate": 1e-05, + "loss": 0.1276, + "step": 20400 + }, + { + "epoch": 0.000205, + "grad_norm": 0.9492274522781372, + "learning_rate": 1e-05, + "loss": 0.1296, + "step": 20500 + }, + { + "epoch": 0.000206, + "grad_norm": 0.9542170166969299, + "learning_rate": 1e-05, + "loss": 0.1284, + "step": 20600 + }, + { + "epoch": 0.000207, + "grad_norm": 0.8887580633163452, + "learning_rate": 1e-05, + "loss": 0.1257, + "step": 20700 + }, + { + "epoch": 0.000208, + "grad_norm": 0.8237319588661194, + "learning_rate": 1e-05, + "loss": 0.1253, + "step": 20800 + }, + { + "epoch": 0.000209, + "grad_norm": 0.8409337401390076, + "learning_rate": 1e-05, + "loss": 0.1241, + "step": 20900 + }, + { + "epoch": 0.00021, + "grad_norm": 0.8566481471061707, + "learning_rate": 1e-05, + "loss": 0.1252, + "step": 21000 + }, + { + "epoch": 0.000211, + "grad_norm": 0.8407108783721924, + "learning_rate": 1e-05, + "loss": 0.1242, + "step": 21100 + }, + { + "epoch": 0.000212, + "grad_norm": 0.853947639465332, + "learning_rate": 1e-05, + "loss": 0.1246, + "step": 21200 + }, + { + "epoch": 0.000213, + "grad_norm": 0.8899252414703369, + "learning_rate": 1e-05, + "loss": 0.1237, + "step": 21300 + }, + { + "epoch": 0.000214, + "grad_norm": 0.8689791560173035, + "learning_rate": 1e-05, + "loss": 0.1225, + "step": 21400 + }, + { + "epoch": 0.000215, + "grad_norm": 0.9782620668411255, + "learning_rate": 1e-05, + "loss": 0.1229, + "step": 21500 + }, + { + "epoch": 0.000216, + "grad_norm": 0.9015646576881409, + "learning_rate": 1e-05, + "loss": 0.1241, + "step": 21600 + }, + { + "epoch": 0.000217, + "grad_norm": 0.9284467697143555, + "learning_rate": 1e-05, + "loss": 0.1216, + "step": 21700 + }, + { + "epoch": 0.000218, + "grad_norm": 0.8393162488937378, + "learning_rate": 1e-05, + "loss": 0.1219, + "step": 21800 + }, + { + "epoch": 0.000219, + "grad_norm": 0.9249029159545898, + "learning_rate": 1e-05, + "loss": 0.1222, + "step": 21900 + }, + { + "epoch": 0.00022, + "grad_norm": 0.931483805179596, + "learning_rate": 1e-05, + "loss": 0.1208, + "step": 22000 + }, + { + "epoch": 0.000221, + "grad_norm": 0.9092661142349243, + "learning_rate": 1e-05, + "loss": 0.1214, + "step": 22100 + }, + { + "epoch": 0.000222, + "grad_norm": 0.9886374473571777, + "learning_rate": 1e-05, + "loss": 0.1189, + "step": 22200 + }, + { + "epoch": 0.000223, + "grad_norm": 0.8833937644958496, + "learning_rate": 1e-05, + "loss": 0.1175, + "step": 22300 + }, + { + "epoch": 0.000224, + "grad_norm": 0.9673048257827759, + "learning_rate": 1e-05, + "loss": 0.1168, + "step": 22400 + }, + { + "epoch": 0.000225, + "grad_norm": 0.872240424156189, + "learning_rate": 1e-05, + "loss": 0.1177, + "step": 22500 + }, + { + "epoch": 0.000226, + "grad_norm": 0.849644660949707, + "learning_rate": 1e-05, + "loss": 0.1177, + "step": 22600 + }, + { + "epoch": 0.000227, + "grad_norm": 0.9396729469299316, + "learning_rate": 1e-05, + "loss": 0.1174, + "step": 22700 + }, + { + "epoch": 0.000228, + "grad_norm": 0.9100921750068665, + "learning_rate": 1e-05, + "loss": 0.1161, + "step": 22800 + }, + { + "epoch": 0.000229, + "grad_norm": 0.8232945203781128, + "learning_rate": 1e-05, + "loss": 0.1149, + "step": 22900 + }, + { + "epoch": 0.00023, + "grad_norm": 0.8654581904411316, + "learning_rate": 1e-05, + "loss": 0.116, + "step": 23000 + }, + { + "epoch": 0.000231, + "grad_norm": 0.8864552974700928, + "learning_rate": 1e-05, + "loss": 0.1161, + "step": 23100 + }, + { + "epoch": 0.000232, + "grad_norm": 0.9292982816696167, + "learning_rate": 1e-05, + "loss": 0.1126, + "step": 23200 + }, + { + "epoch": 0.000233, + "grad_norm": 0.8095874786376953, + "learning_rate": 1e-05, + "loss": 0.1141, + "step": 23300 + }, + { + "epoch": 0.000234, + "grad_norm": 1.1662276983261108, + "learning_rate": 1e-05, + "loss": 0.113, + "step": 23400 + }, + { + "epoch": 0.000235, + "grad_norm": 0.8531011343002319, + "learning_rate": 1e-05, + "loss": 0.1147, + "step": 23500 + }, + { + "epoch": 0.000236, + "grad_norm": 0.895802915096283, + "learning_rate": 1e-05, + "loss": 0.114, + "step": 23600 + }, + { + "epoch": 0.000237, + "grad_norm": 0.8489896655082703, + "learning_rate": 1e-05, + "loss": 0.1142, + "step": 23700 + }, + { + "epoch": 0.000238, + "grad_norm": 0.8372708559036255, + "learning_rate": 1e-05, + "loss": 0.1123, + "step": 23800 + }, + { + "epoch": 0.000239, + "grad_norm": 0.8919999003410339, + "learning_rate": 1e-05, + "loss": 0.1134, + "step": 23900 + }, + { + "epoch": 0.00024, + "grad_norm": 0.8561524152755737, + "learning_rate": 1e-05, + "loss": 0.112, + "step": 24000 + }, + { + "epoch": 0.000241, + "grad_norm": 0.8549727201461792, + "learning_rate": 1e-05, + "loss": 0.1123, + "step": 24100 + }, + { + "epoch": 0.000242, + "grad_norm": 0.8339006900787354, + "learning_rate": 1e-05, + "loss": 0.1116, + "step": 24200 + }, + { + "epoch": 0.000243, + "grad_norm": 0.8727480173110962, + "learning_rate": 1e-05, + "loss": 0.1113, + "step": 24300 + }, + { + "epoch": 0.000244, + "grad_norm": 0.881377637386322, + "learning_rate": 1e-05, + "loss": 0.1098, + "step": 24400 + }, + { + "epoch": 0.000245, + "grad_norm": 0.8690173029899597, + "learning_rate": 1e-05, + "loss": 0.1109, + "step": 24500 + }, + { + "epoch": 0.000246, + "grad_norm": 0.833027720451355, + "learning_rate": 1e-05, + "loss": 0.1094, + "step": 24600 + }, + { + "epoch": 0.000247, + "grad_norm": 0.8230149745941162, + "learning_rate": 1e-05, + "loss": 0.1094, + "step": 24700 + }, + { + "epoch": 0.000248, + "grad_norm": 0.8857430219650269, + "learning_rate": 1e-05, + "loss": 0.1077, + "step": 24800 + }, + { + "epoch": 0.000249, + "grad_norm": 0.9106509685516357, + "learning_rate": 1e-05, + "loss": 0.1081, + "step": 24900 + }, + { + "epoch": 0.00025, + "grad_norm": 0.9534709453582764, + "learning_rate": 1e-05, + "loss": 0.1084, + "step": 25000 + }, + { + "epoch": 0.000251, + "grad_norm": 0.8446188569068909, + "learning_rate": 1e-05, + "loss": 0.1069, + "step": 25100 + }, + { + "epoch": 0.000252, + "grad_norm": 0.8347111344337463, + "learning_rate": 1e-05, + "loss": 0.1077, + "step": 25200 + }, + { + "epoch": 0.000253, + "grad_norm": 0.8703511357307434, + "learning_rate": 1e-05, + "loss": 0.1069, + "step": 25300 + }, + { + "epoch": 0.000254, + "grad_norm": 0.8182582259178162, + "learning_rate": 1e-05, + "loss": 0.1058, + "step": 25400 + }, + { + "epoch": 0.000255, + "grad_norm": 0.8704941868782043, + "learning_rate": 1e-05, + "loss": 0.1063, + "step": 25500 + }, + { + "epoch": 0.000256, + "grad_norm": 0.8137685656547546, + "learning_rate": 1e-05, + "loss": 0.1041, + "step": 25600 + }, + { + "epoch": 0.000257, + "grad_norm": 0.7531348466873169, + "learning_rate": 1e-05, + "loss": 0.106, + "step": 25700 + }, + { + "epoch": 0.000258, + "grad_norm": 0.886814534664154, + "learning_rate": 1e-05, + "loss": 0.1051, + "step": 25800 + }, + { + "epoch": 0.000259, + "grad_norm": 0.8390068411827087, + "learning_rate": 1e-05, + "loss": 0.105, + "step": 25900 + }, + { + "epoch": 0.00026, + "grad_norm": 0.7962291836738586, + "learning_rate": 1e-05, + "loss": 0.1046, + "step": 26000 + }, + { + "epoch": 0.000261, + "grad_norm": 0.9102724194526672, + "learning_rate": 1e-05, + "loss": 0.1044, + "step": 26100 + }, + { + "epoch": 0.000262, + "grad_norm": 0.8715778589248657, + "learning_rate": 1e-05, + "loss": 0.1031, + "step": 26200 + }, + { + "epoch": 0.000263, + "grad_norm": 0.8876039385795593, + "learning_rate": 1e-05, + "loss": 0.103, + "step": 26300 + }, + { + "epoch": 0.000264, + "grad_norm": 0.7934551239013672, + "learning_rate": 1e-05, + "loss": 0.1017, + "step": 26400 + }, + { + "epoch": 0.000265, + "grad_norm": 0.9847850799560547, + "learning_rate": 1e-05, + "loss": 0.1032, + "step": 26500 + }, + { + "epoch": 0.000266, + "grad_norm": 0.8920612335205078, + "learning_rate": 1e-05, + "loss": 0.1032, + "step": 26600 + }, + { + "epoch": 0.000267, + "grad_norm": 0.9092204570770264, + "learning_rate": 1e-05, + "loss": 0.102, + "step": 26700 + }, + { + "epoch": 0.000268, + "grad_norm": 0.7922365069389343, + "learning_rate": 1e-05, + "loss": 0.1024, + "step": 26800 + }, + { + "epoch": 0.000269, + "grad_norm": 0.8614472150802612, + "learning_rate": 1e-05, + "loss": 0.1022, + "step": 26900 + }, + { + "epoch": 0.00027, + "grad_norm": 0.7870116829872131, + "learning_rate": 1e-05, + "loss": 0.1004, + "step": 27000 + }, + { + "epoch": 0.000271, + "grad_norm": 0.6980022192001343, + "learning_rate": 1e-05, + "loss": 0.1006, + "step": 27100 + }, + { + "epoch": 0.000272, + "grad_norm": 0.7720369100570679, + "learning_rate": 1e-05, + "loss": 0.1012, + "step": 27200 + }, + { + "epoch": 0.000273, + "grad_norm": 0.8154132962226868, + "learning_rate": 1e-05, + "loss": 0.1005, + "step": 27300 + }, + { + "epoch": 0.000274, + "grad_norm": 0.8288457989692688, + "learning_rate": 1e-05, + "loss": 0.0985, + "step": 27400 + }, + { + "epoch": 0.000275, + "grad_norm": 0.8117573261260986, + "learning_rate": 1e-05, + "loss": 0.1008, + "step": 27500 + }, + { + "epoch": 0.000276, + "grad_norm": 0.7800782918930054, + "learning_rate": 1e-05, + "loss": 0.0988, + "step": 27600 + }, + { + "epoch": 0.000277, + "grad_norm": 0.9139901399612427, + "learning_rate": 1e-05, + "loss": 0.0994, + "step": 27700 + }, + { + "epoch": 0.000278, + "grad_norm": 0.745152473449707, + "learning_rate": 1e-05, + "loss": 0.0982, + "step": 27800 + }, + { + "epoch": 0.000279, + "grad_norm": 0.7476614117622375, + "learning_rate": 1e-05, + "loss": 0.0965, + "step": 27900 + }, + { + "epoch": 0.00028, + "grad_norm": 0.7490776777267456, + "learning_rate": 1e-05, + "loss": 0.0972, + "step": 28000 + }, + { + "epoch": 0.000281, + "grad_norm": 0.7730040550231934, + "learning_rate": 1e-05, + "loss": 0.0976, + "step": 28100 + }, + { + "epoch": 0.000282, + "grad_norm": 0.7657092213630676, + "learning_rate": 1e-05, + "loss": 0.0982, + "step": 28200 + }, + { + "epoch": 0.000283, + "grad_norm": 0.9147765040397644, + "learning_rate": 1e-05, + "loss": 0.0978, + "step": 28300 + }, + { + "epoch": 0.000284, + "grad_norm": 0.7426789999008179, + "learning_rate": 1e-05, + "loss": 0.0968, + "step": 28400 + }, + { + "epoch": 0.000285, + "grad_norm": 0.8652293086051941, + "learning_rate": 1e-05, + "loss": 0.0981, + "step": 28500 + }, + { + "epoch": 0.000286, + "grad_norm": 0.6864128112792969, + "learning_rate": 1e-05, + "loss": 0.0963, + "step": 28600 + }, + { + "epoch": 0.000287, + "grad_norm": 0.7807822227478027, + "learning_rate": 1e-05, + "loss": 0.0962, + "step": 28700 + }, + { + "epoch": 0.000288, + "grad_norm": 0.8013282418251038, + "learning_rate": 1e-05, + "loss": 0.0964, + "step": 28800 + }, + { + "epoch": 0.000289, + "grad_norm": 0.7287372350692749, + "learning_rate": 1e-05, + "loss": 0.0966, + "step": 28900 + }, + { + "epoch": 0.00029, + "grad_norm": 0.7577667832374573, + "learning_rate": 1e-05, + "loss": 0.0958, + "step": 29000 + }, + { + "epoch": 0.000291, + "grad_norm": 0.7510080933570862, + "learning_rate": 1e-05, + "loss": 0.0947, + "step": 29100 + }, + { + "epoch": 0.000292, + "grad_norm": 0.8355770707130432, + "learning_rate": 1e-05, + "loss": 0.0946, + "step": 29200 + }, + { + "epoch": 0.000293, + "grad_norm": 0.8899005651473999, + "learning_rate": 1e-05, + "loss": 0.0948, + "step": 29300 + }, + { + "epoch": 0.000294, + "grad_norm": 0.8526831865310669, + "learning_rate": 1e-05, + "loss": 0.0947, + "step": 29400 + }, + { + "epoch": 0.000295, + "grad_norm": 0.740943968296051, + "learning_rate": 1e-05, + "loss": 0.0928, + "step": 29500 + }, + { + "epoch": 0.000296, + "grad_norm": 0.8096754550933838, + "learning_rate": 1e-05, + "loss": 0.0948, + "step": 29600 + }, + { + "epoch": 0.000297, + "grad_norm": 0.8890173435211182, + "learning_rate": 1e-05, + "loss": 0.0934, + "step": 29700 + }, + { + "epoch": 0.000298, + "grad_norm": 0.8200284838676453, + "learning_rate": 1e-05, + "loss": 0.0931, + "step": 29800 + }, + { + "epoch": 0.000299, + "grad_norm": 0.70655757188797, + "learning_rate": 1e-05, + "loss": 0.0946, + "step": 29900 + }, + { + "epoch": 0.0003, + "grad_norm": 0.7843393087387085, + "learning_rate": 1e-05, + "loss": 0.0924, + "step": 30000 + }, + { + "epoch": 0.000301, + "grad_norm": 0.6674346923828125, + "learning_rate": 1e-05, + "loss": 0.0925, + "step": 30100 + }, + { + "epoch": 0.000302, + "grad_norm": 0.7955383062362671, + "learning_rate": 1e-05, + "loss": 0.0927, + "step": 30200 + }, + { + "epoch": 0.000303, + "grad_norm": 0.7410333752632141, + "learning_rate": 1e-05, + "loss": 0.0923, + "step": 30300 + }, + { + "epoch": 0.000304, + "grad_norm": 0.716390073299408, + "learning_rate": 1e-05, + "loss": 0.0924, + "step": 30400 + }, + { + "epoch": 0.000305, + "grad_norm": 0.7392554879188538, + "learning_rate": 1e-05, + "loss": 0.0921, + "step": 30500 + }, + { + "epoch": 0.000306, + "grad_norm": 0.9256471991539001, + "learning_rate": 1e-05, + "loss": 0.091, + "step": 30600 + }, + { + "epoch": 0.000307, + "grad_norm": 0.7692530751228333, + "learning_rate": 1e-05, + "loss": 0.0928, + "step": 30700 + }, + { + "epoch": 0.000308, + "grad_norm": 0.7785292863845825, + "learning_rate": 1e-05, + "loss": 0.0906, + "step": 30800 + }, + { + "epoch": 0.000309, + "grad_norm": 0.8413007259368896, + "learning_rate": 1e-05, + "loss": 0.09, + "step": 30900 + }, + { + "epoch": 0.00031, + "grad_norm": 0.9082907438278198, + "learning_rate": 1e-05, + "loss": 0.0896, + "step": 31000 + }, + { + "epoch": 0.000311, + "grad_norm": 0.7937412261962891, + "learning_rate": 1e-05, + "loss": 0.0892, + "step": 31100 + }, + { + "epoch": 0.000312, + "grad_norm": 0.7778225541114807, + "learning_rate": 1e-05, + "loss": 0.088, + "step": 31200 + }, + { + "epoch": 0.000313, + "grad_norm": 0.7651337385177612, + "learning_rate": 1e-05, + "loss": 0.0897, + "step": 31300 + }, + { + "epoch": 0.000314, + "grad_norm": 0.7604988217353821, + "learning_rate": 1e-05, + "loss": 0.0901, + "step": 31400 + }, + { + "epoch": 0.000315, + "grad_norm": 0.779761016368866, + "learning_rate": 1e-05, + "loss": 0.0903, + "step": 31500 + }, + { + "epoch": 0.000316, + "grad_norm": 0.7517678737640381, + "learning_rate": 1e-05, + "loss": 0.0885, + "step": 31600 + }, + { + "epoch": 0.000317, + "grad_norm": 0.8016210794448853, + "learning_rate": 1e-05, + "loss": 0.0893, + "step": 31700 + }, + { + "epoch": 0.000318, + "grad_norm": 0.678521990776062, + "learning_rate": 1e-05, + "loss": 0.0886, + "step": 31800 + }, + { + "epoch": 0.000319, + "grad_norm": 0.7407852411270142, + "learning_rate": 1e-05, + "loss": 0.0899, + "step": 31900 + }, + { + "epoch": 0.00032, + "grad_norm": 0.8720430135726929, + "learning_rate": 1e-05, + "loss": 0.0876, + "step": 32000 + }, + { + "epoch": 0.000321, + "grad_norm": 0.7622641324996948, + "learning_rate": 1e-05, + "loss": 0.0881, + "step": 32100 + }, + { + "epoch": 0.000322, + "grad_norm": 0.6715940237045288, + "learning_rate": 1e-05, + "loss": 0.0867, + "step": 32200 + }, + { + "epoch": 0.000323, + "grad_norm": 0.8118298053741455, + "learning_rate": 1e-05, + "loss": 0.0887, + "step": 32300 + }, + { + "epoch": 0.000324, + "grad_norm": 0.7427231073379517, + "learning_rate": 1e-05, + "loss": 0.0878, + "step": 32400 + }, + { + "epoch": 0.000325, + "grad_norm": 0.7627066969871521, + "learning_rate": 1e-05, + "loss": 0.0879, + "step": 32500 + }, + { + "epoch": 0.000326, + "grad_norm": 0.7354280948638916, + "learning_rate": 1e-05, + "loss": 0.088, + "step": 32600 + }, + { + "epoch": 0.000327, + "grad_norm": 0.6953477263450623, + "learning_rate": 1e-05, + "loss": 0.0867, + "step": 32700 + }, + { + "epoch": 0.000328, + "grad_norm": 0.7861385345458984, + "learning_rate": 1e-05, + "loss": 0.0858, + "step": 32800 + }, + { + "epoch": 0.000329, + "grad_norm": 0.7112125158309937, + "learning_rate": 1e-05, + "loss": 0.0859, + "step": 32900 + }, + { + "epoch": 0.00033, + "grad_norm": 0.7531374096870422, + "learning_rate": 1e-05, + "loss": 0.0862, + "step": 33000 + }, + { + "epoch": 0.000331, + "grad_norm": 0.7147675156593323, + "learning_rate": 1e-05, + "loss": 0.0851, + "step": 33100 + }, + { + "epoch": 0.000332, + "grad_norm": 0.8516043424606323, + "learning_rate": 1e-05, + "loss": 0.0858, + "step": 33200 + }, + { + "epoch": 0.000333, + "grad_norm": 0.7007201313972473, + "learning_rate": 1e-05, + "loss": 0.0856, + "step": 33300 + }, + { + "epoch": 0.000334, + "grad_norm": 0.7700639963150024, + "learning_rate": 1e-05, + "loss": 0.085, + "step": 33400 + }, + { + "epoch": 0.000335, + "grad_norm": 0.7579879760742188, + "learning_rate": 1e-05, + "loss": 0.0844, + "step": 33500 + }, + { + "epoch": 0.000336, + "grad_norm": 0.7982689738273621, + "learning_rate": 1e-05, + "loss": 0.0849, + "step": 33600 + }, + { + "epoch": 0.000337, + "grad_norm": 0.69140625, + "learning_rate": 1e-05, + "loss": 0.0864, + "step": 33700 + }, + { + "epoch": 0.000338, + "grad_norm": 0.723205029964447, + "learning_rate": 1e-05, + "loss": 0.0858, + "step": 33800 + }, + { + "epoch": 0.000339, + "grad_norm": 0.7827596664428711, + "learning_rate": 1e-05, + "loss": 0.084, + "step": 33900 + }, + { + "epoch": 0.00034, + "grad_norm": 0.8219903111457825, + "learning_rate": 1e-05, + "loss": 0.0852, + "step": 34000 + }, + { + "epoch": 0.000341, + "grad_norm": 0.8129620552062988, + "learning_rate": 1e-05, + "loss": 0.0848, + "step": 34100 + }, + { + "epoch": 0.000342, + "grad_norm": 0.6510952115058899, + "learning_rate": 1e-05, + "loss": 0.0827, + "step": 34200 + }, + { + "epoch": 0.000343, + "grad_norm": 0.7110053896903992, + "learning_rate": 1e-05, + "loss": 0.0836, + "step": 34300 + }, + { + "epoch": 0.000344, + "grad_norm": 0.7686619162559509, + "learning_rate": 1e-05, + "loss": 0.0835, + "step": 34400 + }, + { + "epoch": 0.000345, + "grad_norm": 0.829767107963562, + "learning_rate": 1e-05, + "loss": 0.0827, + "step": 34500 + }, + { + "epoch": 0.000346, + "grad_norm": 0.7650629281997681, + "learning_rate": 1e-05, + "loss": 0.0826, + "step": 34600 + }, + { + "epoch": 0.000347, + "grad_norm": 0.6766960024833679, + "learning_rate": 1e-05, + "loss": 0.0831, + "step": 34700 + }, + { + "epoch": 0.000348, + "grad_norm": 0.7824012637138367, + "learning_rate": 1e-05, + "loss": 0.0831, + "step": 34800 + }, + { + "epoch": 0.000349, + "grad_norm": 0.697309136390686, + "learning_rate": 1e-05, + "loss": 0.0826, + "step": 34900 + }, + { + "epoch": 0.00035, + "grad_norm": 0.6359274387359619, + "learning_rate": 1e-05, + "loss": 0.0821, + "step": 35000 + }, + { + "epoch": 0.000351, + "grad_norm": 0.7838051915168762, + "learning_rate": 1e-05, + "loss": 0.0828, + "step": 35100 + }, + { + "epoch": 0.000352, + "grad_norm": 0.8149462938308716, + "learning_rate": 1e-05, + "loss": 0.0819, + "step": 35200 + }, + { + "epoch": 0.000353, + "grad_norm": 0.7315548062324524, + "learning_rate": 1e-05, + "loss": 0.081, + "step": 35300 + }, + { + "epoch": 0.000354, + "grad_norm": 0.6927749514579773, + "learning_rate": 1e-05, + "loss": 0.0802, + "step": 35400 + }, + { + "epoch": 0.000355, + "grad_norm": 0.7449594736099243, + "learning_rate": 1e-05, + "loss": 0.0822, + "step": 35500 + }, + { + "epoch": 0.000356, + "grad_norm": 0.6572420597076416, + "learning_rate": 1e-05, + "loss": 0.0809, + "step": 35600 + }, + { + "epoch": 0.000357, + "grad_norm": 0.7096725702285767, + "learning_rate": 1e-05, + "loss": 0.0805, + "step": 35700 + }, + { + "epoch": 0.000358, + "grad_norm": 0.8065080046653748, + "learning_rate": 1e-05, + "loss": 0.08, + "step": 35800 + }, + { + "epoch": 0.000359, + "grad_norm": 0.5750519633293152, + "learning_rate": 1e-05, + "loss": 0.0796, + "step": 35900 + }, + { + "epoch": 0.00036, + "grad_norm": 0.7987583875656128, + "learning_rate": 1e-05, + "loss": 0.0795, + "step": 36000 + }, + { + "epoch": 0.000361, + "grad_norm": 0.7741938233375549, + "learning_rate": 1e-05, + "loss": 0.0795, + "step": 36100 + }, + { + "epoch": 0.000362, + "grad_norm": 0.7459242343902588, + "learning_rate": 1e-05, + "loss": 0.0804, + "step": 36200 + }, + { + "epoch": 0.000363, + "grad_norm": 0.6847333312034607, + "learning_rate": 1e-05, + "loss": 0.0809, + "step": 36300 + }, + { + "epoch": 0.000364, + "grad_norm": 0.7405627369880676, + "learning_rate": 1e-05, + "loss": 0.0782, + "step": 36400 + }, + { + "epoch": 0.000365, + "grad_norm": 0.6119332909584045, + "learning_rate": 1e-05, + "loss": 0.0806, + "step": 36500 + }, + { + "epoch": 0.000366, + "grad_norm": 0.7295922636985779, + "learning_rate": 1e-05, + "loss": 0.0791, + "step": 36600 + }, + { + "epoch": 0.000367, + "grad_norm": 0.7362000346183777, + "learning_rate": 1e-05, + "loss": 0.0793, + "step": 36700 + }, + { + "epoch": 0.000368, + "grad_norm": 0.650321900844574, + "learning_rate": 1e-05, + "loss": 0.0787, + "step": 36800 + }, + { + "epoch": 0.000369, + "grad_norm": 0.6487528681755066, + "learning_rate": 1e-05, + "loss": 0.0788, + "step": 36900 + }, + { + "epoch": 0.00037, + "grad_norm": 0.6908884644508362, + "learning_rate": 1e-05, + "loss": 0.078, + "step": 37000 + }, + { + "epoch": 0.000371, + "grad_norm": 0.7823421359062195, + "learning_rate": 1e-05, + "loss": 0.0773, + "step": 37100 + }, + { + "epoch": 0.000372, + "grad_norm": 0.7242419719696045, + "learning_rate": 1e-05, + "loss": 0.0789, + "step": 37200 + }, + { + "epoch": 0.000373, + "grad_norm": 0.7191994786262512, + "learning_rate": 1e-05, + "loss": 0.0786, + "step": 37300 + }, + { + "epoch": 0.000374, + "grad_norm": 0.6352174282073975, + "learning_rate": 1e-05, + "loss": 0.0782, + "step": 37400 + }, + { + "epoch": 0.000375, + "grad_norm": 0.6456391215324402, + "learning_rate": 1e-05, + "loss": 0.0801, + "step": 37500 + }, + { + "epoch": 0.000376, + "grad_norm": 0.7176135182380676, + "learning_rate": 1e-05, + "loss": 0.0788, + "step": 37600 + }, + { + "epoch": 0.000377, + "grad_norm": 0.7592889666557312, + "learning_rate": 1e-05, + "loss": 0.0782, + "step": 37700 + }, + { + "epoch": 0.000378, + "grad_norm": 0.7405545115470886, + "learning_rate": 1e-05, + "loss": 0.0772, + "step": 37800 + }, + { + "epoch": 0.000379, + "grad_norm": 0.6966970562934875, + "learning_rate": 1e-05, + "loss": 0.0761, + "step": 37900 + }, + { + "epoch": 0.00038, + "grad_norm": 0.7346359491348267, + "learning_rate": 1e-05, + "loss": 0.0775, + "step": 38000 + }, + { + "epoch": 0.000381, + "grad_norm": 0.729246199131012, + "learning_rate": 1e-05, + "loss": 0.0767, + "step": 38100 + }, + { + "epoch": 0.000382, + "grad_norm": 0.8081512451171875, + "learning_rate": 1e-05, + "loss": 0.078, + "step": 38200 + }, + { + "epoch": 0.000383, + "grad_norm": 0.6851301193237305, + "learning_rate": 1e-05, + "loss": 0.0757, + "step": 38300 + }, + { + "epoch": 0.000384, + "grad_norm": 0.6699986457824707, + "learning_rate": 1e-05, + "loss": 0.0767, + "step": 38400 + }, + { + "epoch": 0.000385, + "grad_norm": 0.7026481032371521, + "learning_rate": 1e-05, + "loss": 0.0776, + "step": 38500 + }, + { + "epoch": 0.000386, + "grad_norm": 0.7267670035362244, + "learning_rate": 1e-05, + "loss": 0.0761, + "step": 38600 + }, + { + "epoch": 0.000387, + "grad_norm": 0.648714005947113, + "learning_rate": 1e-05, + "loss": 0.0749, + "step": 38700 + }, + { + "epoch": 0.000388, + "grad_norm": 0.7160006165504456, + "learning_rate": 1e-05, + "loss": 0.0756, + "step": 38800 + }, + { + "epoch": 0.000389, + "grad_norm": 0.7773024439811707, + "learning_rate": 1e-05, + "loss": 0.0759, + "step": 38900 + }, + { + "epoch": 0.00039, + "grad_norm": 0.7162371277809143, + "learning_rate": 1e-05, + "loss": 0.0749, + "step": 39000 + }, + { + "epoch": 0.000391, + "grad_norm": 0.7529783844947815, + "learning_rate": 1e-05, + "loss": 0.0746, + "step": 39100 + }, + { + "epoch": 0.000392, + "grad_norm": 0.866392195224762, + "learning_rate": 1e-05, + "loss": 0.0755, + "step": 39200 + }, + { + "epoch": 0.000393, + "grad_norm": 0.751728355884552, + "learning_rate": 1e-05, + "loss": 0.0752, + "step": 39300 + }, + { + "epoch": 0.000394, + "grad_norm": 0.6856648325920105, + "learning_rate": 1e-05, + "loss": 0.0753, + "step": 39400 + }, + { + "epoch": 0.000395, + "grad_norm": 0.683175265789032, + "learning_rate": 1e-05, + "loss": 0.0739, + "step": 39500 + }, + { + "epoch": 0.000396, + "grad_norm": 0.7458997368812561, + "learning_rate": 1e-05, + "loss": 0.0752, + "step": 39600 + }, + { + "epoch": 0.000397, + "grad_norm": 0.7095280885696411, + "learning_rate": 1e-05, + "loss": 0.0753, + "step": 39700 + }, + { + "epoch": 0.000398, + "grad_norm": 0.6352033019065857, + "learning_rate": 1e-05, + "loss": 0.0737, + "step": 39800 + }, + { + "epoch": 0.000399, + "grad_norm": 0.695184588432312, + "learning_rate": 1e-05, + "loss": 0.0738, + "step": 39900 + }, + { + "epoch": 0.0004, + "grad_norm": 0.6518137454986572, + "learning_rate": 1e-05, + "loss": 0.074, + "step": 40000 + }, + { + "epoch": 0.0004, + "eval_loss": 0.057464599609375, + "eval_runtime": 146.7084, + "eval_samples_per_second": 340.812, + "eval_steps_per_second": 21.301, + "step": 40000 + }, + { + "epoch": 0.000401, + "grad_norm": 0.7782549858093262, + "learning_rate": 1e-05, + "loss": 0.074, + "step": 40100 + }, + { + "epoch": 0.000402, + "grad_norm": 0.6919134855270386, + "learning_rate": 1e-05, + "loss": 0.0739, + "step": 40200 + }, + { + "epoch": 0.000403, + "grad_norm": 0.661824643611908, + "learning_rate": 1e-05, + "loss": 0.0744, + "step": 40300 + }, + { + "epoch": 0.000404, + "grad_norm": 0.6964775919914246, + "learning_rate": 1e-05, + "loss": 0.0732, + "step": 40400 + }, + { + "epoch": 0.000405, + "grad_norm": 0.860140860080719, + "learning_rate": 1e-05, + "loss": 0.0736, + "step": 40500 + }, + { + "epoch": 0.000406, + "grad_norm": 0.6227797865867615, + "learning_rate": 1e-05, + "loss": 0.0734, + "step": 40600 + }, + { + "epoch": 0.000407, + "grad_norm": 0.5687974095344543, + "learning_rate": 1e-05, + "loss": 0.0734, + "step": 40700 + }, + { + "epoch": 0.000408, + "grad_norm": 0.6930891871452332, + "learning_rate": 1e-05, + "loss": 0.074, + "step": 40800 + }, + { + "epoch": 0.000409, + "grad_norm": 0.6303442716598511, + "learning_rate": 1e-05, + "loss": 0.0728, + "step": 40900 + }, + { + "epoch": 0.00041, + "grad_norm": 0.6731743812561035, + "learning_rate": 1e-05, + "loss": 0.0742, + "step": 41000 + }, + { + "epoch": 0.000411, + "grad_norm": 0.6712822318077087, + "learning_rate": 1e-05, + "loss": 0.0737, + "step": 41100 + }, + { + "epoch": 0.000412, + "grad_norm": 0.6134166717529297, + "learning_rate": 1e-05, + "loss": 0.0728, + "step": 41200 + }, + { + "epoch": 0.000413, + "grad_norm": 0.6910662651062012, + "learning_rate": 1e-05, + "loss": 0.0726, + "step": 41300 + }, + { + "epoch": 0.000414, + "grad_norm": 0.6266744136810303, + "learning_rate": 1e-05, + "loss": 0.0719, + "step": 41400 + }, + { + "epoch": 0.000415, + "grad_norm": 0.600907027721405, + "learning_rate": 1e-05, + "loss": 0.0737, + "step": 41500 + }, + { + "epoch": 0.000416, + "grad_norm": 0.6139588356018066, + "learning_rate": 1e-05, + "loss": 0.0722, + "step": 41600 + }, + { + "epoch": 0.000417, + "grad_norm": 0.6445550918579102, + "learning_rate": 1e-05, + "loss": 0.0721, + "step": 41700 + }, + { + "epoch": 0.000418, + "grad_norm": 0.7176617980003357, + "learning_rate": 1e-05, + "loss": 0.0718, + "step": 41800 + }, + { + "epoch": 0.000419, + "grad_norm": 0.7564845085144043, + "learning_rate": 1e-05, + "loss": 0.0724, + "step": 41900 + }, + { + "epoch": 0.00042, + "grad_norm": 0.7683578133583069, + "learning_rate": 1e-05, + "loss": 0.0714, + "step": 42000 + }, + { + "epoch": 0.000421, + "grad_norm": 0.731192946434021, + "learning_rate": 1e-05, + "loss": 0.0707, + "step": 42100 + }, + { + "epoch": 0.000422, + "grad_norm": 0.6390314102172852, + "learning_rate": 1e-05, + "loss": 0.0706, + "step": 42200 + }, + { + "epoch": 0.000423, + "grad_norm": 0.6024550199508667, + "learning_rate": 1e-05, + "loss": 0.0714, + "step": 42300 + }, + { + "epoch": 0.000424, + "grad_norm": 0.6974002718925476, + "learning_rate": 1e-05, + "loss": 0.0712, + "step": 42400 + }, + { + "epoch": 0.000425, + "grad_norm": 0.6231324672698975, + "learning_rate": 1e-05, + "loss": 0.0719, + "step": 42500 + }, + { + "epoch": 0.000426, + "grad_norm": 0.6329951882362366, + "learning_rate": 1e-05, + "loss": 0.0708, + "step": 42600 + }, + { + "epoch": 0.000427, + "grad_norm": 0.7328572869300842, + "learning_rate": 1e-05, + "loss": 0.0713, + "step": 42700 + }, + { + "epoch": 0.000428, + "grad_norm": 0.6142467856407166, + "learning_rate": 1e-05, + "loss": 0.0721, + "step": 42800 + }, + { + "epoch": 0.000429, + "grad_norm": 0.7287197113037109, + "learning_rate": 1e-05, + "loss": 0.0706, + "step": 42900 + }, + { + "epoch": 0.00043, + "grad_norm": 0.6606420278549194, + "learning_rate": 1e-05, + "loss": 0.0699, + "step": 43000 + }, + { + "epoch": 0.000431, + "grad_norm": 0.7667610049247742, + "learning_rate": 1e-05, + "loss": 0.07, + "step": 43100 + }, + { + "epoch": 0.000432, + "grad_norm": 0.5734269618988037, + "learning_rate": 1e-05, + "loss": 0.0699, + "step": 43200 + }, + { + "epoch": 0.000433, + "grad_norm": 0.5326073169708252, + "learning_rate": 1e-05, + "loss": 0.07, + "step": 43300 + }, + { + "epoch": 0.000434, + "grad_norm": 0.7028875946998596, + "learning_rate": 1e-05, + "loss": 0.0696, + "step": 43400 + }, + { + "epoch": 0.000435, + "grad_norm": 0.6137057542800903, + "learning_rate": 1e-05, + "loss": 0.0691, + "step": 43500 + }, + { + "epoch": 0.000436, + "grad_norm": 0.5539369583129883, + "learning_rate": 1e-05, + "loss": 0.0688, + "step": 43600 + }, + { + "epoch": 0.000437, + "grad_norm": 0.7035527229309082, + "learning_rate": 1e-05, + "loss": 0.071, + "step": 43700 + }, + { + "epoch": 0.000438, + "grad_norm": 0.7055030465126038, + "learning_rate": 1e-05, + "loss": 0.0699, + "step": 43800 + }, + { + "epoch": 0.000439, + "grad_norm": 0.536948025226593, + "learning_rate": 1e-05, + "loss": 0.0697, + "step": 43900 + }, + { + "epoch": 0.00044, + "grad_norm": 0.6797453165054321, + "learning_rate": 1e-05, + "loss": 0.0677, + "step": 44000 + }, + { + "epoch": 0.000441, + "grad_norm": 0.6475409865379333, + "learning_rate": 1e-05, + "loss": 0.0696, + "step": 44100 + }, + { + "epoch": 0.000442, + "grad_norm": 0.5951113700866699, + "learning_rate": 1e-05, + "loss": 0.0683, + "step": 44200 + }, + { + "epoch": 0.000443, + "grad_norm": 0.7197650671005249, + "learning_rate": 1e-05, + "loss": 0.0696, + "step": 44300 + }, + { + "epoch": 0.000444, + "grad_norm": 0.6708860397338867, + "learning_rate": 1e-05, + "loss": 0.0692, + "step": 44400 + }, + { + "epoch": 0.000445, + "grad_norm": 0.6833498477935791, + "learning_rate": 1e-05, + "loss": 0.0694, + "step": 44500 + }, + { + "epoch": 0.000446, + "grad_norm": 0.6520599722862244, + "learning_rate": 1e-05, + "loss": 0.0694, + "step": 44600 + }, + { + "epoch": 0.000447, + "grad_norm": 0.7471343278884888, + "learning_rate": 1e-05, + "loss": 0.0679, + "step": 44700 + }, + { + "epoch": 0.000448, + "grad_norm": 0.6124304533004761, + "learning_rate": 1e-05, + "loss": 0.0685, + "step": 44800 + }, + { + "epoch": 0.000449, + "grad_norm": 0.6457110643386841, + "learning_rate": 1e-05, + "loss": 0.0683, + "step": 44900 + }, + { + "epoch": 0.00045, + "grad_norm": 0.8282802104949951, + "learning_rate": 1e-05, + "loss": 0.0675, + "step": 45000 + }, + { + "epoch": 0.000451, + "grad_norm": 0.7290102243423462, + "learning_rate": 1e-05, + "loss": 0.067, + "step": 45100 + }, + { + "epoch": 0.000452, + "grad_norm": 0.6666006445884705, + "learning_rate": 1e-05, + "loss": 0.0672, + "step": 45200 + }, + { + "epoch": 0.000453, + "grad_norm": 0.5930759906768799, + "learning_rate": 1e-05, + "loss": 0.0687, + "step": 45300 + }, + { + "epoch": 0.000454, + "grad_norm": 0.7391034960746765, + "learning_rate": 1e-05, + "loss": 0.0681, + "step": 45400 + }, + { + "epoch": 0.000455, + "grad_norm": 0.6331747770309448, + "learning_rate": 1e-05, + "loss": 0.0686, + "step": 45500 + }, + { + "epoch": 0.000456, + "grad_norm": 0.7175407409667969, + "learning_rate": 1e-05, + "loss": 0.0682, + "step": 45600 + }, + { + "epoch": 0.000457, + "grad_norm": 0.6839337348937988, + "learning_rate": 1e-05, + "loss": 0.068, + "step": 45700 + }, + { + "epoch": 0.000458, + "grad_norm": 0.7204523682594299, + "learning_rate": 1e-05, + "loss": 0.0674, + "step": 45800 + }, + { + "epoch": 0.000459, + "grad_norm": 0.6172782778739929, + "learning_rate": 1e-05, + "loss": 0.0672, + "step": 45900 + }, + { + "epoch": 0.00046, + "grad_norm": 0.6801437735557556, + "learning_rate": 1e-05, + "loss": 0.068, + "step": 46000 + }, + { + "epoch": 0.000461, + "grad_norm": 0.6950106620788574, + "learning_rate": 1e-05, + "loss": 0.0667, + "step": 46100 + }, + { + "epoch": 0.000462, + "grad_norm": 0.7430393099784851, + "learning_rate": 1e-05, + "loss": 0.0661, + "step": 46200 + }, + { + "epoch": 0.000463, + "grad_norm": 0.7335778474807739, + "learning_rate": 1e-05, + "loss": 0.0664, + "step": 46300 + }, + { + "epoch": 0.000464, + "grad_norm": 0.6109582185745239, + "learning_rate": 1e-05, + "loss": 0.0678, + "step": 46400 + }, + { + "epoch": 0.000465, + "grad_norm": 0.747843325138092, + "learning_rate": 1e-05, + "loss": 0.0666, + "step": 46500 + }, + { + "epoch": 0.000466, + "grad_norm": 0.5541141033172607, + "learning_rate": 1e-05, + "loss": 0.066, + "step": 46600 + }, + { + "epoch": 0.000467, + "grad_norm": 0.7821163535118103, + "learning_rate": 1e-05, + "loss": 0.0663, + "step": 46700 + }, + { + "epoch": 0.000468, + "grad_norm": 0.6927903294563293, + "learning_rate": 1e-05, + "loss": 0.0668, + "step": 46800 + }, + { + "epoch": 0.000469, + "grad_norm": 0.6270934343338013, + "learning_rate": 1e-05, + "loss": 0.0674, + "step": 46900 + }, + { + "epoch": 0.00047, + "grad_norm": 0.7509257197380066, + "learning_rate": 1e-05, + "loss": 0.0661, + "step": 47000 + }, + { + "epoch": 0.000471, + "grad_norm": 0.6083252429962158, + "learning_rate": 1e-05, + "loss": 0.0655, + "step": 47100 + }, + { + "epoch": 0.000472, + "grad_norm": 0.5622929334640503, + "learning_rate": 1e-05, + "loss": 0.065, + "step": 47200 + }, + { + "epoch": 0.000473, + "grad_norm": 0.5768439173698425, + "learning_rate": 1e-05, + "loss": 0.0663, + "step": 47300 + }, + { + "epoch": 0.000474, + "grad_norm": 0.7420287728309631, + "learning_rate": 1e-05, + "loss": 0.0647, + "step": 47400 + }, + { + "epoch": 0.000475, + "grad_norm": 0.6630219221115112, + "learning_rate": 1e-05, + "loss": 0.066, + "step": 47500 + }, + { + "epoch": 0.000476, + "grad_norm": 0.5590940713882446, + "learning_rate": 1e-05, + "loss": 0.0662, + "step": 47600 + }, + { + "epoch": 0.000477, + "grad_norm": 0.5448912382125854, + "learning_rate": 1e-05, + "loss": 0.0648, + "step": 47700 + }, + { + "epoch": 0.000478, + "grad_norm": 0.6090975999832153, + "learning_rate": 1e-05, + "loss": 0.0653, + "step": 47800 + }, + { + "epoch": 0.000479, + "grad_norm": 0.7398414611816406, + "learning_rate": 1e-05, + "loss": 0.0653, + "step": 47900 + }, + { + "epoch": 0.00048, + "grad_norm": 0.6005905270576477, + "learning_rate": 1e-05, + "loss": 0.0654, + "step": 48000 + }, + { + "epoch": 0.000481, + "grad_norm": 0.6361467838287354, + "learning_rate": 1e-05, + "loss": 0.0653, + "step": 48100 + }, + { + "epoch": 0.000482, + "grad_norm": 0.6767069101333618, + "learning_rate": 1e-05, + "loss": 0.0652, + "step": 48200 + }, + { + "epoch": 0.000483, + "grad_norm": 0.6184808015823364, + "learning_rate": 1e-05, + "loss": 0.0654, + "step": 48300 + }, + { + "epoch": 0.000484, + "grad_norm": 0.7021101117134094, + "learning_rate": 1e-05, + "loss": 0.0637, + "step": 48400 + }, + { + "epoch": 0.000485, + "grad_norm": 0.6103231310844421, + "learning_rate": 1e-05, + "loss": 0.0653, + "step": 48500 + }, + { + "epoch": 0.000486, + "grad_norm": 0.5976945161819458, + "learning_rate": 1e-05, + "loss": 0.0647, + "step": 48600 + }, + { + "epoch": 0.000487, + "grad_norm": 0.6222690343856812, + "learning_rate": 1e-05, + "loss": 0.0647, + "step": 48700 + }, + { + "epoch": 0.000488, + "grad_norm": 0.5408068299293518, + "learning_rate": 1e-05, + "loss": 0.0641, + "step": 48800 + }, + { + "epoch": 0.000489, + "grad_norm": 0.628935694694519, + "learning_rate": 1e-05, + "loss": 0.0642, + "step": 48900 + }, + { + "epoch": 0.00049, + "grad_norm": 0.6062678694725037, + "learning_rate": 1e-05, + "loss": 0.0645, + "step": 49000 + }, + { + "epoch": 0.000491, + "grad_norm": 0.6533873677253723, + "learning_rate": 1e-05, + "loss": 0.0648, + "step": 49100 + }, + { + "epoch": 0.000492, + "grad_norm": 0.6818357706069946, + "learning_rate": 1e-05, + "loss": 0.0642, + "step": 49200 + }, + { + "epoch": 0.000493, + "grad_norm": 0.5615854859352112, + "learning_rate": 1e-05, + "loss": 0.0649, + "step": 49300 + }, + { + "epoch": 0.000494, + "grad_norm": 0.5262526273727417, + "learning_rate": 1e-05, + "loss": 0.0645, + "step": 49400 + }, + { + "epoch": 0.000495, + "grad_norm": 0.5227097868919373, + "learning_rate": 1e-05, + "loss": 0.0634, + "step": 49500 + }, + { + "epoch": 0.000496, + "grad_norm": 0.5794950723648071, + "learning_rate": 1e-05, + "loss": 0.0632, + "step": 49600 + }, + { + "epoch": 0.000497, + "grad_norm": 0.5515991449356079, + "learning_rate": 1e-05, + "loss": 0.0639, + "step": 49700 + }, + { + "epoch": 0.000498, + "grad_norm": 0.5834317803382874, + "learning_rate": 1e-05, + "loss": 0.0633, + "step": 49800 + }, + { + "epoch": 0.000499, + "grad_norm": 0.6389098763465881, + "learning_rate": 1e-05, + "loss": 0.0637, + "step": 49900 + }, + { + "epoch": 0.0005, + "grad_norm": 0.6473069787025452, + "learning_rate": 1e-05, + "loss": 0.0634, + "step": 50000 + }, + { + "epoch": 0.000501, + "grad_norm": 0.5156600475311279, + "learning_rate": 1e-05, + "loss": 0.0638, + "step": 50100 + }, + { + "epoch": 0.000502, + "grad_norm": 0.6542375683784485, + "learning_rate": 1e-05, + "loss": 0.0635, + "step": 50200 + }, + { + "epoch": 0.000503, + "grad_norm": 0.8224967122077942, + "learning_rate": 1e-05, + "loss": 0.0631, + "step": 50300 + }, + { + "epoch": 0.000504, + "grad_norm": 0.6293924450874329, + "learning_rate": 1e-05, + "loss": 0.0619, + "step": 50400 + }, + { + "epoch": 0.000505, + "grad_norm": 0.7436028718948364, + "learning_rate": 1e-05, + "loss": 0.064, + "step": 50500 + }, + { + "epoch": 0.000506, + "grad_norm": 0.660367488861084, + "learning_rate": 1e-05, + "loss": 0.0639, + "step": 50600 + }, + { + "epoch": 0.000507, + "grad_norm": 0.5511479377746582, + "learning_rate": 1e-05, + "loss": 0.0625, + "step": 50700 + }, + { + "epoch": 0.000508, + "grad_norm": 0.5846619009971619, + "learning_rate": 1e-05, + "loss": 0.0634, + "step": 50800 + }, + { + "epoch": 0.000509, + "grad_norm": 0.5902076959609985, + "learning_rate": 1e-05, + "loss": 0.0637, + "step": 50900 + }, + { + "epoch": 0.00051, + "grad_norm": 0.5104527473449707, + "learning_rate": 1e-05, + "loss": 0.0627, + "step": 51000 + }, + { + "epoch": 0.000511, + "grad_norm": 0.592365026473999, + "learning_rate": 1e-05, + "loss": 0.0624, + "step": 51100 + }, + { + "epoch": 0.000512, + "grad_norm": 0.7283549904823303, + "learning_rate": 1e-05, + "loss": 0.0618, + "step": 51200 + }, + { + "epoch": 0.000513, + "grad_norm": 0.6117008328437805, + "learning_rate": 1e-05, + "loss": 0.0621, + "step": 51300 + }, + { + "epoch": 0.000514, + "grad_norm": 0.6155059933662415, + "learning_rate": 1e-05, + "loss": 0.0627, + "step": 51400 + }, + { + "epoch": 0.000515, + "grad_norm": 0.6605076789855957, + "learning_rate": 1e-05, + "loss": 0.0626, + "step": 51500 + }, + { + "epoch": 0.000516, + "grad_norm": 0.7391318082809448, + "learning_rate": 1e-05, + "loss": 0.0609, + "step": 51600 + }, + { + "epoch": 0.000517, + "grad_norm": 0.5673928260803223, + "learning_rate": 1e-05, + "loss": 0.0626, + "step": 51700 + }, + { + "epoch": 0.000518, + "grad_norm": 0.7229452729225159, + "learning_rate": 1e-05, + "loss": 0.0613, + "step": 51800 + }, + { + "epoch": 0.000519, + "grad_norm": 0.6015135049819946, + "learning_rate": 1e-05, + "loss": 0.0614, + "step": 51900 + }, + { + "epoch": 0.00052, + "grad_norm": 3.3136706352233887, + "learning_rate": 1e-05, + "loss": 0.0607, + "step": 52000 + }, + { + "epoch": 0.000521, + "grad_norm": 0.5922873616218567, + "learning_rate": 1e-05, + "loss": 0.0627, + "step": 52100 + }, + { + "epoch": 0.000522, + "grad_norm": 0.6967010498046875, + "learning_rate": 1e-05, + "loss": 0.0611, + "step": 52200 + }, + { + "epoch": 0.000523, + "grad_norm": 0.5986941456794739, + "learning_rate": 1e-05, + "loss": 0.0618, + "step": 52300 + }, + { + "epoch": 0.000524, + "grad_norm": 0.5476034879684448, + "learning_rate": 1e-05, + "loss": 0.0614, + "step": 52400 + }, + { + "epoch": 0.000525, + "grad_norm": 0.5859378576278687, + "learning_rate": 1e-05, + "loss": 0.0614, + "step": 52500 + }, + { + "epoch": 0.000526, + "grad_norm": 0.601116955280304, + "learning_rate": 1e-05, + "loss": 0.0618, + "step": 52600 + }, + { + "epoch": 0.000527, + "grad_norm": 0.5084663033485413, + "learning_rate": 1e-05, + "loss": 0.0622, + "step": 52700 + }, + { + "epoch": 0.000528, + "grad_norm": 0.5654129385948181, + "learning_rate": 1e-05, + "loss": 0.0625, + "step": 52800 + }, + { + "epoch": 0.000529, + "grad_norm": 0.5403587222099304, + "learning_rate": 1e-05, + "loss": 0.0605, + "step": 52900 + }, + { + "epoch": 0.00053, + "grad_norm": 0.5523150563240051, + "learning_rate": 1e-05, + "loss": 0.0615, + "step": 53000 + }, + { + "epoch": 0.000531, + "grad_norm": 0.6014654636383057, + "learning_rate": 1e-05, + "loss": 0.0613, + "step": 53100 + }, + { + "epoch": 0.000532, + "grad_norm": 0.6389763355255127, + "learning_rate": 1e-05, + "loss": 0.0618, + "step": 53200 + }, + { + "epoch": 0.000533, + "grad_norm": 0.6326813697814941, + "learning_rate": 1e-05, + "loss": 0.0621, + "step": 53300 + }, + { + "epoch": 0.000534, + "grad_norm": 0.5675824284553528, + "learning_rate": 1e-05, + "loss": 0.0603, + "step": 53400 + }, + { + "epoch": 0.000535, + "grad_norm": 0.6056302189826965, + "learning_rate": 1e-05, + "loss": 0.0604, + "step": 53500 + }, + { + "epoch": 0.000536, + "grad_norm": 0.7404552698135376, + "learning_rate": 1e-05, + "loss": 0.0617, + "step": 53600 + }, + { + "epoch": 0.000537, + "grad_norm": 0.5762139558792114, + "learning_rate": 1e-05, + "loss": 0.061, + "step": 53700 + }, + { + "epoch": 0.000538, + "grad_norm": 0.6377224922180176, + "learning_rate": 1e-05, + "loss": 0.0606, + "step": 53800 + }, + { + "epoch": 0.000539, + "grad_norm": 0.6007105708122253, + "learning_rate": 1e-05, + "loss": 0.0617, + "step": 53900 + }, + { + "epoch": 0.00054, + "grad_norm": 0.679589033126831, + "learning_rate": 1e-05, + "loss": 0.0609, + "step": 54000 + }, + { + "epoch": 0.000541, + "grad_norm": 0.6322323679924011, + "learning_rate": 1e-05, + "loss": 0.0611, + "step": 54100 + }, + { + "epoch": 0.000542, + "grad_norm": 0.7151752710342407, + "learning_rate": 1e-05, + "loss": 0.0594, + "step": 54200 + }, + { + "epoch": 0.000543, + "grad_norm": 0.5888739228248596, + "learning_rate": 1e-05, + "loss": 0.0608, + "step": 54300 + }, + { + "epoch": 0.000544, + "grad_norm": 0.5529482364654541, + "learning_rate": 1e-05, + "loss": 0.0616, + "step": 54400 + }, + { + "epoch": 0.000545, + "grad_norm": 0.5086714625358582, + "learning_rate": 1e-05, + "loss": 0.0599, + "step": 54500 + }, + { + "epoch": 0.000546, + "grad_norm": 0.5248231887817383, + "learning_rate": 1e-05, + "loss": 0.0611, + "step": 54600 + }, + { + "epoch": 0.000547, + "grad_norm": 0.48391416668891907, + "learning_rate": 1e-05, + "loss": 0.0603, + "step": 54700 + }, + { + "epoch": 0.000548, + "grad_norm": 0.6535386443138123, + "learning_rate": 1e-05, + "loss": 0.0599, + "step": 54800 + }, + { + "epoch": 0.000549, + "grad_norm": 0.6315100193023682, + "learning_rate": 1e-05, + "loss": 0.0599, + "step": 54900 + }, + { + "epoch": 0.00055, + "grad_norm": 0.5279924273490906, + "learning_rate": 1e-05, + "loss": 0.0601, + "step": 55000 + }, + { + "epoch": 0.000551, + "grad_norm": 0.5455300807952881, + "learning_rate": 1e-05, + "loss": 0.0601, + "step": 55100 + }, + { + "epoch": 0.000552, + "grad_norm": 0.556695282459259, + "learning_rate": 1e-05, + "loss": 0.06, + "step": 55200 + }, + { + "epoch": 0.000553, + "grad_norm": 0.5867908000946045, + "learning_rate": 1e-05, + "loss": 0.0584, + "step": 55300 + }, + { + "epoch": 0.000554, + "grad_norm": 0.6211426258087158, + "learning_rate": 1e-05, + "loss": 0.0594, + "step": 55400 + }, + { + "epoch": 0.000555, + "grad_norm": 0.6962873339653015, + "learning_rate": 1e-05, + "loss": 0.0588, + "step": 55500 + }, + { + "epoch": 0.000556, + "grad_norm": 0.5341864228248596, + "learning_rate": 1e-05, + "loss": 0.0594, + "step": 55600 + }, + { + "epoch": 0.000557, + "grad_norm": 0.5630548000335693, + "learning_rate": 1e-05, + "loss": 0.0596, + "step": 55700 + }, + { + "epoch": 0.000558, + "grad_norm": 0.6993235349655151, + "learning_rate": 1e-05, + "loss": 0.0592, + "step": 55800 + }, + { + "epoch": 0.000559, + "grad_norm": 0.5936434268951416, + "learning_rate": 1e-05, + "loss": 0.0589, + "step": 55900 + }, + { + "epoch": 0.00056, + "grad_norm": 0.6682338714599609, + "learning_rate": 1e-05, + "loss": 0.0592, + "step": 56000 + }, + { + "epoch": 0.000561, + "grad_norm": 0.5741124749183655, + "learning_rate": 1e-05, + "loss": 0.0586, + "step": 56100 + }, + { + "epoch": 0.000562, + "grad_norm": 0.5639105439186096, + "learning_rate": 1e-05, + "loss": 0.0596, + "step": 56200 + }, + { + "epoch": 0.000563, + "grad_norm": 0.6496306657791138, + "learning_rate": 1e-05, + "loss": 0.0588, + "step": 56300 + }, + { + "epoch": 0.000564, + "grad_norm": 0.6160135865211487, + "learning_rate": 1e-05, + "loss": 0.0593, + "step": 56400 + }, + { + "epoch": 0.000565, + "grad_norm": 0.6027793288230896, + "learning_rate": 1e-05, + "loss": 0.0579, + "step": 56500 + }, + { + "epoch": 0.000566, + "grad_norm": 0.6365297436714172, + "learning_rate": 1e-05, + "loss": 0.0592, + "step": 56600 + }, + { + "epoch": 0.000567, + "grad_norm": 0.6124427914619446, + "learning_rate": 1e-05, + "loss": 0.0584, + "step": 56700 + }, + { + "epoch": 0.000568, + "grad_norm": 0.5500183701515198, + "learning_rate": 1e-05, + "loss": 0.0603, + "step": 56800 + }, + { + "epoch": 0.000569, + "grad_norm": 0.6076985597610474, + "learning_rate": 1e-05, + "loss": 0.0586, + "step": 56900 + }, + { + "epoch": 0.00057, + "grad_norm": 0.5683192610740662, + "learning_rate": 1e-05, + "loss": 0.0577, + "step": 57000 + }, + { + "epoch": 0.000571, + "grad_norm": 0.6625038385391235, + "learning_rate": 1e-05, + "loss": 0.0581, + "step": 57100 + }, + { + "epoch": 0.000572, + "grad_norm": 0.40177464485168457, + "learning_rate": 1e-05, + "loss": 0.0586, + "step": 57200 + }, + { + "epoch": 0.000573, + "grad_norm": 0.6952741742134094, + "learning_rate": 1e-05, + "loss": 0.0584, + "step": 57300 + }, + { + "epoch": 0.000574, + "grad_norm": 0.6179869771003723, + "learning_rate": 1e-05, + "loss": 0.0589, + "step": 57400 + }, + { + "epoch": 0.000575, + "grad_norm": 0.5745118260383606, + "learning_rate": 1e-05, + "loss": 0.0578, + "step": 57500 + }, + { + "epoch": 0.000576, + "grad_norm": 0.4852728843688965, + "learning_rate": 1e-05, + "loss": 0.0584, + "step": 57600 + }, + { + "epoch": 0.000577, + "grad_norm": 0.6206620335578918, + "learning_rate": 1e-05, + "loss": 0.0583, + "step": 57700 + }, + { + "epoch": 0.000578, + "grad_norm": 0.6402736306190491, + "learning_rate": 1e-05, + "loss": 0.0574, + "step": 57800 + }, + { + "epoch": 0.000579, + "grad_norm": 0.5858712792396545, + "learning_rate": 1e-05, + "loss": 0.0582, + "step": 57900 + }, + { + "epoch": 0.00058, + "grad_norm": 0.5614802837371826, + "learning_rate": 1e-05, + "loss": 0.0586, + "step": 58000 + }, + { + "epoch": 0.000581, + "grad_norm": 0.6376156210899353, + "learning_rate": 1e-05, + "loss": 0.0574, + "step": 58100 + }, + { + "epoch": 0.000582, + "grad_norm": 0.5398702621459961, + "learning_rate": 1e-05, + "loss": 0.0567, + "step": 58200 + }, + { + "epoch": 0.000583, + "grad_norm": 0.6560328602790833, + "learning_rate": 1e-05, + "loss": 0.0586, + "step": 58300 + }, + { + "epoch": 0.000584, + "grad_norm": 0.48175305128097534, + "learning_rate": 1e-05, + "loss": 0.0579, + "step": 58400 + }, + { + "epoch": 0.000585, + "grad_norm": 0.47494786977767944, + "learning_rate": 1e-05, + "loss": 0.0565, + "step": 58500 + }, + { + "epoch": 0.000586, + "grad_norm": 0.6271668672561646, + "learning_rate": 1e-05, + "loss": 0.0572, + "step": 58600 + }, + { + "epoch": 0.000587, + "grad_norm": 0.5039101243019104, + "learning_rate": 1e-05, + "loss": 0.0578, + "step": 58700 + }, + { + "epoch": 0.000588, + "grad_norm": 0.5363636612892151, + "learning_rate": 1e-05, + "loss": 0.0578, + "step": 58800 + }, + { + "epoch": 0.000589, + "grad_norm": 0.6029368042945862, + "learning_rate": 1e-05, + "loss": 0.0567, + "step": 58900 + }, + { + "epoch": 0.00059, + "grad_norm": 0.5582793354988098, + "learning_rate": 1e-05, + "loss": 0.0579, + "step": 59000 + }, + { + "epoch": 0.000591, + "grad_norm": 0.5290389657020569, + "learning_rate": 1e-05, + "loss": 0.0575, + "step": 59100 + }, + { + "epoch": 0.000592, + "grad_norm": 0.5864163041114807, + "learning_rate": 1e-05, + "loss": 0.0565, + "step": 59200 + }, + { + "epoch": 0.000593, + "grad_norm": 0.49124574661254883, + "learning_rate": 1e-05, + "loss": 0.0584, + "step": 59300 + }, + { + "epoch": 0.000594, + "grad_norm": 0.5180615782737732, + "learning_rate": 1e-05, + "loss": 0.0555, + "step": 59400 + }, + { + "epoch": 0.000595, + "grad_norm": 0.5236871838569641, + "learning_rate": 1e-05, + "loss": 0.0574, + "step": 59500 + }, + { + "epoch": 0.000596, + "grad_norm": 0.7328921556472778, + "learning_rate": 1e-05, + "loss": 0.057, + "step": 59600 + }, + { + "epoch": 0.000597, + "grad_norm": 0.5635091662406921, + "learning_rate": 1e-05, + "loss": 0.0559, + "step": 59700 + }, + { + "epoch": 0.000598, + "grad_norm": 0.5094209313392639, + "learning_rate": 1e-05, + "loss": 0.057, + "step": 59800 + }, + { + "epoch": 0.000599, + "grad_norm": 0.5855716466903687, + "learning_rate": 1e-05, + "loss": 0.0566, + "step": 59900 + }, + { + "epoch": 0.0006, + "grad_norm": 0.6821003556251526, + "learning_rate": 1e-05, + "loss": 0.0559, + "step": 60000 + }, + { + "epoch": 0.0006, + "eval_loss": 0.045257568359375, + "eval_runtime": 142.735, + "eval_samples_per_second": 350.3, + "eval_steps_per_second": 21.894, + "step": 60000 + }, + { + "epoch": 0.000601, + "grad_norm": 0.5633527040481567, + "learning_rate": 1e-05, + "loss": 0.0562, + "step": 60100 + }, + { + "epoch": 0.000602, + "grad_norm": 0.5337314009666443, + "learning_rate": 1e-05, + "loss": 0.0562, + "step": 60200 + }, + { + "epoch": 0.000603, + "grad_norm": 0.5282440185546875, + "learning_rate": 1e-05, + "loss": 0.0549, + "step": 60300 + }, + { + "epoch": 0.000604, + "grad_norm": 0.5766568779945374, + "learning_rate": 1e-05, + "loss": 0.0576, + "step": 60400 + }, + { + "epoch": 0.000605, + "grad_norm": 0.5904074311256409, + "learning_rate": 1e-05, + "loss": 0.0563, + "step": 60500 + }, + { + "epoch": 0.000606, + "grad_norm": 0.6538689136505127, + "learning_rate": 1e-05, + "loss": 0.0566, + "step": 60600 + }, + { + "epoch": 0.000607, + "grad_norm": 0.45561283826828003, + "learning_rate": 1e-05, + "loss": 0.0561, + "step": 60700 + }, + { + "epoch": 0.000608, + "grad_norm": 0.47445598244667053, + "learning_rate": 1e-05, + "loss": 0.0561, + "step": 60800 + }, + { + "epoch": 0.000609, + "grad_norm": 0.7631045579910278, + "learning_rate": 1e-05, + "loss": 0.0556, + "step": 60900 + }, + { + "epoch": 0.00061, + "grad_norm": 0.5754849910736084, + "learning_rate": 1e-05, + "loss": 0.0553, + "step": 61000 + }, + { + "epoch": 0.000611, + "grad_norm": 0.6670407652854919, + "learning_rate": 1e-05, + "loss": 0.057, + "step": 61100 + }, + { + "epoch": 0.000612, + "grad_norm": 0.5728887319564819, + "learning_rate": 1e-05, + "loss": 0.0566, + "step": 61200 + }, + { + "epoch": 0.000613, + "grad_norm": 0.5342495441436768, + "learning_rate": 1e-05, + "loss": 0.0552, + "step": 61300 + }, + { + "epoch": 0.000614, + "grad_norm": 0.5812315344810486, + "learning_rate": 1e-05, + "loss": 0.0556, + "step": 61400 + }, + { + "epoch": 0.000615, + "grad_norm": 0.5818805694580078, + "learning_rate": 1e-05, + "loss": 0.0551, + "step": 61500 + }, + { + "epoch": 0.000616, + "grad_norm": 0.6204677224159241, + "learning_rate": 1e-05, + "loss": 0.0556, + "step": 61600 + }, + { + "epoch": 0.000617, + "grad_norm": 0.5443527102470398, + "learning_rate": 1e-05, + "loss": 0.0552, + "step": 61700 + }, + { + "epoch": 0.000618, + "grad_norm": 0.49102166295051575, + "learning_rate": 1e-05, + "loss": 0.0549, + "step": 61800 + }, + { + "epoch": 0.000619, + "grad_norm": 0.557538628578186, + "learning_rate": 1e-05, + "loss": 0.0543, + "step": 61900 + }, + { + "epoch": 0.00062, + "grad_norm": 0.620365560054779, + "learning_rate": 1e-05, + "loss": 0.0561, + "step": 62000 + }, + { + "epoch": 0.000621, + "grad_norm": 0.6253044009208679, + "learning_rate": 1e-05, + "loss": 0.0557, + "step": 62100 + }, + { + "epoch": 0.000622, + "grad_norm": 0.7837327122688293, + "learning_rate": 1e-05, + "loss": 0.055, + "step": 62200 + }, + { + "epoch": 0.000623, + "grad_norm": 0.5085681676864624, + "learning_rate": 1e-05, + "loss": 0.0552, + "step": 62300 + }, + { + "epoch": 0.000624, + "grad_norm": 0.608761191368103, + "learning_rate": 1e-05, + "loss": 0.0532, + "step": 62400 + }, + { + "epoch": 0.000625, + "grad_norm": 0.7588841915130615, + "learning_rate": 1e-05, + "loss": 0.0552, + "step": 62500 + }, + { + "epoch": 0.000626, + "grad_norm": 0.5510600209236145, + "learning_rate": 1e-05, + "loss": 0.0551, + "step": 62600 + }, + { + "epoch": 0.000627, + "grad_norm": 0.5801370739936829, + "learning_rate": 1e-05, + "loss": 0.0542, + "step": 62700 + }, + { + "epoch": 0.000628, + "grad_norm": 0.6703765988349915, + "learning_rate": 1e-05, + "loss": 0.0569, + "step": 62800 + }, + { + "epoch": 0.000629, + "grad_norm": 0.4344656467437744, + "learning_rate": 1e-05, + "loss": 0.0549, + "step": 62900 + }, + { + "epoch": 0.00063, + "grad_norm": 0.5678920745849609, + "learning_rate": 1e-05, + "loss": 0.0555, + "step": 63000 + }, + { + "epoch": 0.000631, + "grad_norm": 0.5048655271530151, + "learning_rate": 1e-05, + "loss": 0.0547, + "step": 63100 + }, + { + "epoch": 0.000632, + "grad_norm": 0.5324554443359375, + "learning_rate": 1e-05, + "loss": 0.0551, + "step": 63200 + }, + { + "epoch": 0.000633, + "grad_norm": 0.5735768675804138, + "learning_rate": 1e-05, + "loss": 0.0551, + "step": 63300 + }, + { + "epoch": 0.000634, + "grad_norm": 0.5694500803947449, + "learning_rate": 1e-05, + "loss": 0.0542, + "step": 63400 + }, + { + "epoch": 0.000635, + "grad_norm": 0.5009059906005859, + "learning_rate": 1e-05, + "loss": 0.0538, + "step": 63500 + }, + { + "epoch": 0.000636, + "grad_norm": 0.5886440277099609, + "learning_rate": 1e-05, + "loss": 0.0545, + "step": 63600 + }, + { + "epoch": 0.000637, + "grad_norm": 0.5673546195030212, + "learning_rate": 1e-05, + "loss": 0.0548, + "step": 63700 + }, + { + "epoch": 0.000638, + "grad_norm": 0.5466011762619019, + "learning_rate": 1e-05, + "loss": 0.054, + "step": 63800 + }, + { + "epoch": 0.000639, + "grad_norm": 0.5927892923355103, + "learning_rate": 1e-05, + "loss": 0.0548, + "step": 63900 + }, + { + "epoch": 0.00064, + "grad_norm": 0.7305207252502441, + "learning_rate": 1e-05, + "loss": 0.0536, + "step": 64000 + }, + { + "epoch": 0.000641, + "grad_norm": 0.5603034496307373, + "learning_rate": 1e-05, + "loss": 0.0536, + "step": 64100 + }, + { + "epoch": 0.000642, + "grad_norm": 0.6965247988700867, + "learning_rate": 1e-05, + "loss": 0.0546, + "step": 64200 + }, + { + "epoch": 0.000643, + "grad_norm": 0.57351154088974, + "learning_rate": 1e-05, + "loss": 0.0535, + "step": 64300 + }, + { + "epoch": 0.000644, + "grad_norm": 0.511005163192749, + "learning_rate": 1e-05, + "loss": 0.0548, + "step": 64400 + }, + { + "epoch": 0.000645, + "grad_norm": 0.5340495705604553, + "learning_rate": 1e-05, + "loss": 0.0543, + "step": 64500 + }, + { + "epoch": 0.000646, + "grad_norm": 0.6858961582183838, + "learning_rate": 1e-05, + "loss": 0.0538, + "step": 64600 + }, + { + "epoch": 0.000647, + "grad_norm": 0.6375705599784851, + "learning_rate": 1e-05, + "loss": 0.0536, + "step": 64700 + }, + { + "epoch": 0.000648, + "grad_norm": 0.48544806241989136, + "learning_rate": 1e-05, + "loss": 0.0529, + "step": 64800 + }, + { + "epoch": 0.000649, + "grad_norm": 0.49595892429351807, + "learning_rate": 1e-05, + "loss": 0.0529, + "step": 64900 + }, + { + "epoch": 0.00065, + "grad_norm": 0.4976153075695038, + "learning_rate": 1e-05, + "loss": 0.0538, + "step": 65000 + }, + { + "epoch": 0.000651, + "grad_norm": 0.5489813089370728, + "learning_rate": 1e-05, + "loss": 0.0528, + "step": 65100 + }, + { + "epoch": 0.000652, + "grad_norm": 0.4820660650730133, + "learning_rate": 1e-05, + "loss": 0.0536, + "step": 65200 + }, + { + "epoch": 0.000653, + "grad_norm": 0.5546014308929443, + "learning_rate": 1e-05, + "loss": 0.0529, + "step": 65300 + }, + { + "epoch": 0.000654, + "grad_norm": 0.4900113344192505, + "learning_rate": 1e-05, + "loss": 0.0535, + "step": 65400 + }, + { + "epoch": 0.000655, + "grad_norm": 0.6061577796936035, + "learning_rate": 1e-05, + "loss": 0.0533, + "step": 65500 + }, + { + "epoch": 0.000656, + "grad_norm": 0.6450973749160767, + "learning_rate": 1e-05, + "loss": 0.0542, + "step": 65600 + }, + { + "epoch": 0.000657, + "grad_norm": 0.677505612373352, + "learning_rate": 1e-05, + "loss": 0.0539, + "step": 65700 + }, + { + "epoch": 0.000658, + "grad_norm": 0.48482370376586914, + "learning_rate": 1e-05, + "loss": 0.0533, + "step": 65800 + }, + { + "epoch": 0.000659, + "grad_norm": 0.49198102951049805, + "learning_rate": 1e-05, + "loss": 0.0527, + "step": 65900 + }, + { + "epoch": 0.00066, + "grad_norm": 0.47996985912323, + "learning_rate": 1e-05, + "loss": 0.0544, + "step": 66000 + }, + { + "epoch": 0.000661, + "grad_norm": 0.548791229724884, + "learning_rate": 1e-05, + "loss": 0.0534, + "step": 66100 + }, + { + "epoch": 0.000662, + "grad_norm": 0.6156114935874939, + "learning_rate": 1e-05, + "loss": 0.0538, + "step": 66200 + }, + { + "epoch": 0.000663, + "grad_norm": 0.5212823748588562, + "learning_rate": 1e-05, + "loss": 0.0534, + "step": 66300 + }, + { + "epoch": 0.000664, + "grad_norm": 0.5812687873840332, + "learning_rate": 1e-05, + "loss": 0.0527, + "step": 66400 + }, + { + "epoch": 0.000665, + "grad_norm": 0.4992978572845459, + "learning_rate": 1e-05, + "loss": 0.0532, + "step": 66500 + }, + { + "epoch": 0.000666, + "grad_norm": 0.5525248050689697, + "learning_rate": 1e-05, + "loss": 0.0533, + "step": 66600 + }, + { + "epoch": 0.000667, + "grad_norm": 0.6456683874130249, + "learning_rate": 1e-05, + "loss": 0.0535, + "step": 66700 + }, + { + "epoch": 0.000668, + "grad_norm": 0.6112907528877258, + "learning_rate": 1e-05, + "loss": 0.0532, + "step": 66800 + }, + { + "epoch": 0.000669, + "grad_norm": 0.543624222278595, + "learning_rate": 1e-05, + "loss": 0.0539, + "step": 66900 + }, + { + "epoch": 0.00067, + "grad_norm": 0.5512799024581909, + "learning_rate": 1e-05, + "loss": 0.0513, + "step": 67000 + }, + { + "epoch": 0.000671, + "grad_norm": 0.631289005279541, + "learning_rate": 1e-05, + "loss": 0.0519, + "step": 67100 + }, + { + "epoch": 0.000672, + "grad_norm": 0.47048887610435486, + "learning_rate": 1e-05, + "loss": 0.0532, + "step": 67200 + }, + { + "epoch": 0.000673, + "grad_norm": 0.5930091142654419, + "learning_rate": 1e-05, + "loss": 0.0528, + "step": 67300 + }, + { + "epoch": 0.000674, + "grad_norm": 0.7611256837844849, + "learning_rate": 1e-05, + "loss": 0.0527, + "step": 67400 + }, + { + "epoch": 0.000675, + "grad_norm": 0.49624642729759216, + "learning_rate": 1e-05, + "loss": 0.0528, + "step": 67500 + }, + { + "epoch": 0.000676, + "grad_norm": 0.6547495126724243, + "learning_rate": 1e-05, + "loss": 0.0523, + "step": 67600 + }, + { + "epoch": 0.000677, + "grad_norm": 0.635519802570343, + "learning_rate": 1e-05, + "loss": 0.0521, + "step": 67700 + }, + { + "epoch": 0.000678, + "grad_norm": 0.606388509273529, + "learning_rate": 1e-05, + "loss": 0.052, + "step": 67800 + }, + { + "epoch": 0.000679, + "grad_norm": 0.4945245385169983, + "learning_rate": 1e-05, + "loss": 0.0522, + "step": 67900 + }, + { + "epoch": 0.00068, + "grad_norm": 0.4815261662006378, + "learning_rate": 1e-05, + "loss": 0.0533, + "step": 68000 + }, + { + "epoch": 0.000681, + "grad_norm": 0.47382187843322754, + "learning_rate": 1e-05, + "loss": 0.0519, + "step": 68100 + }, + { + "epoch": 0.000682, + "grad_norm": 0.549886167049408, + "learning_rate": 1e-05, + "loss": 0.0518, + "step": 68200 + }, + { + "epoch": 0.000683, + "grad_norm": 0.5204160213470459, + "learning_rate": 1e-05, + "loss": 0.0519, + "step": 68300 + }, + { + "epoch": 0.000684, + "grad_norm": 0.5802004933357239, + "learning_rate": 1e-05, + "loss": 0.0517, + "step": 68400 + }, + { + "epoch": 0.000685, + "grad_norm": 0.5576998591423035, + "learning_rate": 1e-05, + "loss": 0.0519, + "step": 68500 + }, + { + "epoch": 0.000686, + "grad_norm": 0.5708860158920288, + "learning_rate": 1e-05, + "loss": 0.0523, + "step": 68600 + }, + { + "epoch": 0.000687, + "grad_norm": 0.6270045042037964, + "learning_rate": 1e-05, + "loss": 0.0502, + "step": 68700 + }, + { + "epoch": 0.000688, + "grad_norm": 0.462593138217926, + "learning_rate": 1e-05, + "loss": 0.0517, + "step": 68800 + }, + { + "epoch": 0.000689, + "grad_norm": 0.4807493984699249, + "learning_rate": 1e-05, + "loss": 0.0524, + "step": 68900 + }, + { + "epoch": 0.00069, + "grad_norm": 0.5798048973083496, + "learning_rate": 1e-05, + "loss": 0.0527, + "step": 69000 + }, + { + "epoch": 0.000691, + "grad_norm": 0.44622689485549927, + "learning_rate": 1e-05, + "loss": 0.0528, + "step": 69100 + }, + { + "epoch": 0.000692, + "grad_norm": 0.5129225254058838, + "learning_rate": 1e-05, + "loss": 0.0528, + "step": 69200 + }, + { + "epoch": 0.000693, + "grad_norm": 0.5368632674217224, + "learning_rate": 1e-05, + "loss": 0.0524, + "step": 69300 + }, + { + "epoch": 0.000694, + "grad_norm": 0.559655487537384, + "learning_rate": 1e-05, + "loss": 0.0525, + "step": 69400 + }, + { + "epoch": 0.000695, + "grad_norm": 0.6121320128440857, + "learning_rate": 1e-05, + "loss": 0.0507, + "step": 69500 + }, + { + "epoch": 0.000696, + "grad_norm": 0.5470311045646667, + "learning_rate": 1e-05, + "loss": 0.0511, + "step": 69600 + }, + { + "epoch": 0.000697, + "grad_norm": 0.5142286419868469, + "learning_rate": 1e-05, + "loss": 0.0516, + "step": 69700 + }, + { + "epoch": 0.000698, + "grad_norm": 0.6724265217781067, + "learning_rate": 1e-05, + "loss": 0.0517, + "step": 69800 + }, + { + "epoch": 0.000699, + "grad_norm": 0.4707196354866028, + "learning_rate": 1e-05, + "loss": 0.0511, + "step": 69900 + }, + { + "epoch": 0.0007, + "grad_norm": 0.616026759147644, + "learning_rate": 1e-05, + "loss": 0.0517, + "step": 70000 + }, + { + "epoch": 0.000701, + "grad_norm": 0.5991165041923523, + "learning_rate": 1e-05, + "loss": 0.0512, + "step": 70100 + }, + { + "epoch": 0.000702, + "grad_norm": 0.5611563324928284, + "learning_rate": 1e-05, + "loss": 0.0509, + "step": 70200 + }, + { + "epoch": 0.000703, + "grad_norm": 0.46492424607276917, + "learning_rate": 1e-05, + "loss": 0.0511, + "step": 70300 + }, + { + "epoch": 0.000704, + "grad_norm": 0.5256513357162476, + "learning_rate": 1e-05, + "loss": 0.0518, + "step": 70400 + }, + { + "epoch": 0.000705, + "grad_norm": 0.499254435300827, + "learning_rate": 1e-05, + "loss": 0.0501, + "step": 70500 + }, + { + "epoch": 0.000706, + "grad_norm": 0.5403403043746948, + "learning_rate": 1e-05, + "loss": 0.0509, + "step": 70600 + }, + { + "epoch": 0.000707, + "grad_norm": 0.6283129453659058, + "learning_rate": 1e-05, + "loss": 0.0519, + "step": 70700 + }, + { + "epoch": 0.000708, + "grad_norm": 0.5229069590568542, + "learning_rate": 1e-05, + "loss": 0.051, + "step": 70800 + }, + { + "epoch": 0.000709, + "grad_norm": 0.48306700587272644, + "learning_rate": 1e-05, + "loss": 0.0504, + "step": 70900 + }, + { + "epoch": 0.00071, + "grad_norm": 0.5926072597503662, + "learning_rate": 1e-05, + "loss": 0.0506, + "step": 71000 + }, + { + "epoch": 0.000711, + "grad_norm": 0.5640701651573181, + "learning_rate": 1e-05, + "loss": 0.0506, + "step": 71100 + }, + { + "epoch": 0.000712, + "grad_norm": 0.49134358763694763, + "learning_rate": 1e-05, + "loss": 0.0512, + "step": 71200 + }, + { + "epoch": 0.000713, + "grad_norm": 0.4878164231777191, + "learning_rate": 1e-05, + "loss": 0.0512, + "step": 71300 + }, + { + "epoch": 0.000714, + "grad_norm": 0.6183532476425171, + "learning_rate": 1e-05, + "loss": 0.0508, + "step": 71400 + }, + { + "epoch": 0.000715, + "grad_norm": 0.5065814852714539, + "learning_rate": 1e-05, + "loss": 0.0505, + "step": 71500 + }, + { + "epoch": 0.000716, + "grad_norm": 0.548599898815155, + "learning_rate": 1e-05, + "loss": 0.0502, + "step": 71600 + }, + { + "epoch": 0.000717, + "grad_norm": 0.4534250795841217, + "learning_rate": 1e-05, + "loss": 0.05, + "step": 71700 + }, + { + "epoch": 0.000718, + "grad_norm": 0.5044461488723755, + "learning_rate": 1e-05, + "loss": 0.0502, + "step": 71800 + }, + { + "epoch": 0.000719, + "grad_norm": 0.5321183204650879, + "learning_rate": 1e-05, + "loss": 0.0498, + "step": 71900 + }, + { + "epoch": 0.00072, + "grad_norm": 0.4777474105358124, + "learning_rate": 1e-05, + "loss": 0.0503, + "step": 72000 + }, + { + "epoch": 0.000721, + "grad_norm": 0.6466835141181946, + "learning_rate": 1e-05, + "loss": 0.0507, + "step": 72100 + }, + { + "epoch": 0.000722, + "grad_norm": 0.5359812378883362, + "learning_rate": 1e-05, + "loss": 0.0506, + "step": 72200 + }, + { + "epoch": 0.000723, + "grad_norm": 0.4923792779445648, + "learning_rate": 1e-05, + "loss": 0.05, + "step": 72300 + }, + { + "epoch": 0.000724, + "grad_norm": 0.5708417892456055, + "learning_rate": 1e-05, + "loss": 0.0511, + "step": 72400 + }, + { + "epoch": 0.000725, + "grad_norm": 0.5016763806343079, + "learning_rate": 1e-05, + "loss": 0.0509, + "step": 72500 + }, + { + "epoch": 0.000726, + "grad_norm": 0.4299620985984802, + "learning_rate": 1e-05, + "loss": 0.0504, + "step": 72600 + }, + { + "epoch": 0.000727, + "grad_norm": 0.387928307056427, + "learning_rate": 1e-05, + "loss": 0.0493, + "step": 72700 + }, + { + "epoch": 0.000728, + "grad_norm": 0.5286259651184082, + "learning_rate": 1e-05, + "loss": 0.0508, + "step": 72800 + }, + { + "epoch": 0.000729, + "grad_norm": 0.511677622795105, + "learning_rate": 1e-05, + "loss": 0.0503, + "step": 72900 + }, + { + "epoch": 0.00073, + "grad_norm": 0.4648519456386566, + "learning_rate": 1e-05, + "loss": 0.0494, + "step": 73000 + }, + { + "epoch": 0.000731, + "grad_norm": 0.4918229877948761, + "learning_rate": 1e-05, + "loss": 0.0496, + "step": 73100 + }, + { + "epoch": 0.000732, + "grad_norm": 0.49148622155189514, + "learning_rate": 1e-05, + "loss": 0.0494, + "step": 73200 + }, + { + "epoch": 0.000733, + "grad_norm": 0.5078290104866028, + "learning_rate": 1e-05, + "loss": 0.0495, + "step": 73300 + }, + { + "epoch": 0.000734, + "grad_norm": 0.591152012348175, + "learning_rate": 1e-05, + "loss": 0.0506, + "step": 73400 + }, + { + "epoch": 0.000735, + "grad_norm": 0.5350937843322754, + "learning_rate": 1e-05, + "loss": 0.0499, + "step": 73500 + }, + { + "epoch": 0.000736, + "grad_norm": 0.4960618019104004, + "learning_rate": 1e-05, + "loss": 0.0495, + "step": 73600 + }, + { + "epoch": 0.000737, + "grad_norm": 0.46348682045936584, + "learning_rate": 1e-05, + "loss": 0.0493, + "step": 73700 + }, + { + "epoch": 0.000738, + "grad_norm": 0.6859008073806763, + "learning_rate": 1e-05, + "loss": 0.0506, + "step": 73800 + }, + { + "epoch": 0.000739, + "grad_norm": 0.5936481952667236, + "learning_rate": 1e-05, + "loss": 0.0504, + "step": 73900 + }, + { + "epoch": 0.00074, + "grad_norm": 0.6398313045501709, + "learning_rate": 1e-05, + "loss": 0.0498, + "step": 74000 + }, + { + "epoch": 0.000741, + "grad_norm": 0.6062189936637878, + "learning_rate": 1e-05, + "loss": 0.05, + "step": 74100 + }, + { + "epoch": 0.000742, + "grad_norm": 0.5730705261230469, + "learning_rate": 1e-05, + "loss": 0.0498, + "step": 74200 + }, + { + "epoch": 0.000743, + "grad_norm": 0.5183285474777222, + "learning_rate": 1e-05, + "loss": 0.05, + "step": 74300 + }, + { + "epoch": 0.000744, + "grad_norm": 0.4582626521587372, + "learning_rate": 1e-05, + "loss": 0.0493, + "step": 74400 + }, + { + "epoch": 0.000745, + "grad_norm": 0.4545513987541199, + "learning_rate": 1e-05, + "loss": 0.0497, + "step": 74500 + }, + { + "epoch": 0.000746, + "grad_norm": 0.6823522448539734, + "learning_rate": 1e-05, + "loss": 0.0494, + "step": 74600 + }, + { + "epoch": 0.000747, + "grad_norm": 0.5017057061195374, + "learning_rate": 1e-05, + "loss": 0.0498, + "step": 74700 + }, + { + "epoch": 0.000748, + "grad_norm": 0.4436599910259247, + "learning_rate": 1e-05, + "loss": 0.0507, + "step": 74800 + }, + { + "epoch": 0.000749, + "grad_norm": 0.5471747517585754, + "learning_rate": 1e-05, + "loss": 0.0491, + "step": 74900 + }, + { + "epoch": 0.00075, + "grad_norm": 0.4700005352497101, + "learning_rate": 1e-05, + "loss": 0.0493, + "step": 75000 + }, + { + "epoch": 0.000751, + "grad_norm": 0.5744854211807251, + "learning_rate": 1e-05, + "loss": 0.0494, + "step": 75100 + }, + { + "epoch": 0.000752, + "grad_norm": 0.4908376634120941, + "learning_rate": 1e-05, + "loss": 0.0493, + "step": 75200 + }, + { + "epoch": 0.000753, + "grad_norm": 0.5889230966567993, + "learning_rate": 1e-05, + "loss": 0.0497, + "step": 75300 + }, + { + "epoch": 0.000754, + "grad_norm": 0.5542328953742981, + "learning_rate": 1e-05, + "loss": 0.049, + "step": 75400 + }, + { + "epoch": 0.000755, + "grad_norm": 0.567498505115509, + "learning_rate": 1e-05, + "loss": 0.0487, + "step": 75500 + }, + { + "epoch": 0.000756, + "grad_norm": 0.4234246611595154, + "learning_rate": 1e-05, + "loss": 0.0494, + "step": 75600 + }, + { + "epoch": 0.000757, + "grad_norm": 0.7256674766540527, + "learning_rate": 1e-05, + "loss": 0.0496, + "step": 75700 + }, + { + "epoch": 0.000758, + "grad_norm": 0.6111962795257568, + "learning_rate": 1e-05, + "loss": 0.0494, + "step": 75800 + }, + { + "epoch": 0.000759, + "grad_norm": 0.5681432485580444, + "learning_rate": 1e-05, + "loss": 0.0483, + "step": 75900 + }, + { + "epoch": 0.00076, + "grad_norm": 0.44954606890678406, + "learning_rate": 1e-05, + "loss": 0.049, + "step": 76000 + }, + { + "epoch": 0.000761, + "grad_norm": 0.5693077445030212, + "learning_rate": 1e-05, + "loss": 0.0485, + "step": 76100 + }, + { + "epoch": 0.000762, + "grad_norm": 0.47221890091896057, + "learning_rate": 1e-05, + "loss": 0.0485, + "step": 76200 + }, + { + "epoch": 0.000763, + "grad_norm": 0.5012596249580383, + "learning_rate": 1e-05, + "loss": 0.0488, + "step": 76300 + }, + { + "epoch": 0.000764, + "grad_norm": 0.5051250457763672, + "learning_rate": 1e-05, + "loss": 0.0492, + "step": 76400 + }, + { + "epoch": 0.000765, + "grad_norm": 0.45128434896469116, + "learning_rate": 1e-05, + "loss": 0.0483, + "step": 76500 + }, + { + "epoch": 0.000766, + "grad_norm": 0.48324739933013916, + "learning_rate": 1e-05, + "loss": 0.0482, + "step": 76600 + }, + { + "epoch": 0.000767, + "grad_norm": 0.6752970814704895, + "learning_rate": 1e-05, + "loss": 0.0487, + "step": 76700 + }, + { + "epoch": 0.000768, + "grad_norm": 0.4630663990974426, + "learning_rate": 1e-05, + "loss": 0.0501, + "step": 76800 + }, + { + "epoch": 0.000769, + "grad_norm": 0.4887773394584656, + "learning_rate": 1e-05, + "loss": 0.0481, + "step": 76900 + }, + { + "epoch": 0.00077, + "grad_norm": 0.4609774947166443, + "learning_rate": 1e-05, + "loss": 0.0486, + "step": 77000 + }, + { + "epoch": 0.000771, + "grad_norm": 0.6502612233161926, + "learning_rate": 1e-05, + "loss": 0.0495, + "step": 77100 + }, + { + "epoch": 0.000772, + "grad_norm": 0.563583254814148, + "learning_rate": 1e-05, + "loss": 0.0493, + "step": 77200 + }, + { + "epoch": 0.000773, + "grad_norm": 0.5242981314659119, + "learning_rate": 1e-05, + "loss": 0.0485, + "step": 77300 + }, + { + "epoch": 0.000774, + "grad_norm": 0.5238550901412964, + "learning_rate": 1e-05, + "loss": 0.0482, + "step": 77400 + }, + { + "epoch": 0.000775, + "grad_norm": 0.38637349009513855, + "learning_rate": 1e-05, + "loss": 0.0482, + "step": 77500 + }, + { + "epoch": 0.000776, + "grad_norm": 0.5395223498344421, + "learning_rate": 1e-05, + "loss": 0.0482, + "step": 77600 + }, + { + "epoch": 0.000777, + "grad_norm": 0.5965639352798462, + "learning_rate": 1e-05, + "loss": 0.0482, + "step": 77700 + }, + { + "epoch": 0.000778, + "grad_norm": 0.4685559868812561, + "learning_rate": 1e-05, + "loss": 0.0474, + "step": 77800 + }, + { + "epoch": 0.000779, + "grad_norm": 0.46465954184532166, + "learning_rate": 1e-05, + "loss": 0.049, + "step": 77900 + }, + { + "epoch": 0.00078, + "grad_norm": 0.5408352017402649, + "learning_rate": 1e-05, + "loss": 0.0487, + "step": 78000 + }, + { + "epoch": 0.000781, + "grad_norm": 0.3893685042858124, + "learning_rate": 1e-05, + "loss": 0.0479, + "step": 78100 + }, + { + "epoch": 0.000782, + "grad_norm": 0.6658462285995483, + "learning_rate": 1e-05, + "loss": 0.048, + "step": 78200 + }, + { + "epoch": 0.000783, + "grad_norm": 0.6283921003341675, + "learning_rate": 1e-05, + "loss": 0.0488, + "step": 78300 + }, + { + "epoch": 0.000784, + "grad_norm": 0.4658546447753906, + "learning_rate": 1e-05, + "loss": 0.0486, + "step": 78400 + }, + { + "epoch": 0.000785, + "grad_norm": 0.5362129807472229, + "learning_rate": 1e-05, + "loss": 0.0488, + "step": 78500 + }, + { + "epoch": 0.000786, + "grad_norm": 0.5157918334007263, + "learning_rate": 1e-05, + "loss": 0.0482, + "step": 78600 + }, + { + "epoch": 0.000787, + "grad_norm": 0.5089668035507202, + "learning_rate": 1e-05, + "loss": 0.0485, + "step": 78700 + }, + { + "epoch": 0.000788, + "grad_norm": 0.49590611457824707, + "learning_rate": 1e-05, + "loss": 0.0476, + "step": 78800 + }, + { + "epoch": 0.000789, + "grad_norm": 0.4500684440135956, + "learning_rate": 1e-05, + "loss": 0.0482, + "step": 78900 + }, + { + "epoch": 0.00079, + "grad_norm": 0.4456005096435547, + "learning_rate": 1e-05, + "loss": 0.0479, + "step": 79000 + }, + { + "epoch": 0.000791, + "grad_norm": 0.502184271812439, + "learning_rate": 1e-05, + "loss": 0.0483, + "step": 79100 + }, + { + "epoch": 0.000792, + "grad_norm": 0.4004657566547394, + "learning_rate": 1e-05, + "loss": 0.0483, + "step": 79200 + }, + { + "epoch": 0.000793, + "grad_norm": 0.6616214513778687, + "learning_rate": 1e-05, + "loss": 0.048, + "step": 79300 + }, + { + "epoch": 0.000794, + "grad_norm": 0.5488511323928833, + "learning_rate": 1e-05, + "loss": 0.0473, + "step": 79400 + }, + { + "epoch": 0.000795, + "grad_norm": 0.5251606702804565, + "learning_rate": 1e-05, + "loss": 0.0489, + "step": 79500 + }, + { + "epoch": 0.000796, + "grad_norm": 0.43220826983451843, + "learning_rate": 1e-05, + "loss": 0.0469, + "step": 79600 + }, + { + "epoch": 0.000797, + "grad_norm": 0.5535863041877747, + "learning_rate": 1e-05, + "loss": 0.0487, + "step": 79700 + }, + { + "epoch": 0.000798, + "grad_norm": 0.4892144799232483, + "learning_rate": 1e-05, + "loss": 0.048, + "step": 79800 + }, + { + "epoch": 0.000799, + "grad_norm": 0.443042516708374, + "learning_rate": 1e-05, + "loss": 0.0484, + "step": 79900 + }, + { + "epoch": 0.0008, + "grad_norm": 0.4258803725242615, + "learning_rate": 1e-05, + "loss": 0.0476, + "step": 80000 + }, + { + "epoch": 0.0008, + "eval_loss": 0.0394287109375, + "eval_runtime": 147.8199, + "eval_samples_per_second": 338.249, + "eval_steps_per_second": 21.141, + "step": 80000 + }, + { + "epoch": 0.000801, + "grad_norm": 0.5370935201644897, + "learning_rate": 1e-05, + "loss": 0.0478, + "step": 80100 + }, + { + "epoch": 0.000802, + "grad_norm": 0.5561772584915161, + "learning_rate": 1e-05, + "loss": 0.0487, + "step": 80200 + }, + { + "epoch": 0.000803, + "grad_norm": 0.5092744827270508, + "learning_rate": 1e-05, + "loss": 0.0476, + "step": 80300 + }, + { + "epoch": 0.000804, + "grad_norm": 0.4691084623336792, + "learning_rate": 1e-05, + "loss": 0.0473, + "step": 80400 + }, + { + "epoch": 0.000805, + "grad_norm": 0.5660099387168884, + "learning_rate": 1e-05, + "loss": 0.0475, + "step": 80500 + }, + { + "epoch": 0.000806, + "grad_norm": 0.5250957012176514, + "learning_rate": 1e-05, + "loss": 0.0471, + "step": 80600 + }, + { + "epoch": 0.000807, + "grad_norm": 0.5492421388626099, + "learning_rate": 1e-05, + "loss": 0.0481, + "step": 80700 + }, + { + "epoch": 0.000808, + "grad_norm": 0.7874831557273865, + "learning_rate": 1e-05, + "loss": 0.0475, + "step": 80800 + }, + { + "epoch": 0.000809, + "grad_norm": 0.6476261615753174, + "learning_rate": 1e-05, + "loss": 0.0477, + "step": 80900 + }, + { + "epoch": 0.00081, + "grad_norm": 0.557145357131958, + "learning_rate": 1e-05, + "loss": 0.0477, + "step": 81000 + }, + { + "epoch": 0.000811, + "grad_norm": 0.5536689758300781, + "learning_rate": 1e-05, + "loss": 0.0475, + "step": 81100 + }, + { + "epoch": 0.000812, + "grad_norm": 0.5005760788917542, + "learning_rate": 1e-05, + "loss": 0.0472, + "step": 81200 + }, + { + "epoch": 0.000813, + "grad_norm": 0.43560323119163513, + "learning_rate": 1e-05, + "loss": 0.0473, + "step": 81300 + }, + { + "epoch": 0.000814, + "grad_norm": 0.49981963634490967, + "learning_rate": 1e-05, + "loss": 0.0468, + "step": 81400 + }, + { + "epoch": 0.000815, + "grad_norm": 0.5209627151489258, + "learning_rate": 1e-05, + "loss": 0.0476, + "step": 81500 + }, + { + "epoch": 0.000816, + "grad_norm": 0.7528536319732666, + "learning_rate": 1e-05, + "loss": 0.0471, + "step": 81600 + }, + { + "epoch": 0.000817, + "grad_norm": 0.6212517023086548, + "learning_rate": 1e-05, + "loss": 0.0476, + "step": 81700 + }, + { + "epoch": 0.000818, + "grad_norm": 0.45106619596481323, + "learning_rate": 1e-05, + "loss": 0.0475, + "step": 81800 + }, + { + "epoch": 0.000819, + "grad_norm": 0.5259119868278503, + "learning_rate": 1e-05, + "loss": 0.0473, + "step": 81900 + }, + { + "epoch": 0.00082, + "grad_norm": 0.4737171232700348, + "learning_rate": 1e-05, + "loss": 0.0478, + "step": 82000 + }, + { + "epoch": 0.000821, + "grad_norm": 0.5119843482971191, + "learning_rate": 1e-05, + "loss": 0.0467, + "step": 82100 + }, + { + "epoch": 0.000822, + "grad_norm": 0.3932953178882599, + "learning_rate": 1e-05, + "loss": 0.0465, + "step": 82200 + }, + { + "epoch": 0.000823, + "grad_norm": 0.43303382396698, + "learning_rate": 1e-05, + "loss": 0.047, + "step": 82300 + }, + { + "epoch": 0.000824, + "grad_norm": 0.5500777363777161, + "learning_rate": 1e-05, + "loss": 0.0461, + "step": 82400 + }, + { + "epoch": 0.000825, + "grad_norm": 0.5227336883544922, + "learning_rate": 1e-05, + "loss": 0.0477, + "step": 82500 + }, + { + "epoch": 0.000826, + "grad_norm": 0.5672751665115356, + "learning_rate": 1e-05, + "loss": 0.0476, + "step": 82600 + }, + { + "epoch": 0.000827, + "grad_norm": 0.5093204975128174, + "learning_rate": 1e-05, + "loss": 0.0468, + "step": 82700 + }, + { + "epoch": 0.000828, + "grad_norm": 0.47309496998786926, + "learning_rate": 1e-05, + "loss": 0.0464, + "step": 82800 + }, + { + "epoch": 0.000829, + "grad_norm": 0.4092000722885132, + "learning_rate": 1e-05, + "loss": 0.0467, + "step": 82900 + }, + { + "epoch": 0.00083, + "grad_norm": 0.42544227838516235, + "learning_rate": 1e-05, + "loss": 0.0455, + "step": 83000 + }, + { + "epoch": 0.000831, + "grad_norm": 0.5713441371917725, + "learning_rate": 1e-05, + "loss": 0.0457, + "step": 83100 + }, + { + "epoch": 0.000832, + "grad_norm": 0.5193179845809937, + "learning_rate": 1e-05, + "loss": 0.0463, + "step": 83200 + }, + { + "epoch": 0.000833, + "grad_norm": 0.43209248781204224, + "learning_rate": 1e-05, + "loss": 0.047, + "step": 83300 + }, + { + "epoch": 0.000834, + "grad_norm": 0.5342600345611572, + "learning_rate": 1e-05, + "loss": 0.0456, + "step": 83400 + }, + { + "epoch": 0.000835, + "grad_norm": 0.592204213142395, + "learning_rate": 1e-05, + "loss": 0.0472, + "step": 83500 + }, + { + "epoch": 0.000836, + "grad_norm": 0.5118575692176819, + "learning_rate": 1e-05, + "loss": 0.0467, + "step": 83600 + }, + { + "epoch": 0.000837, + "grad_norm": 0.4781627058982849, + "learning_rate": 1e-05, + "loss": 0.0463, + "step": 83700 + }, + { + "epoch": 0.000838, + "grad_norm": 0.4500192403793335, + "learning_rate": 1e-05, + "loss": 0.0468, + "step": 83800 + }, + { + "epoch": 0.000839, + "grad_norm": 0.49369123578071594, + "learning_rate": 1e-05, + "loss": 0.0463, + "step": 83900 + }, + { + "epoch": 0.00084, + "grad_norm": 0.48518478870391846, + "learning_rate": 1e-05, + "loss": 0.0466, + "step": 84000 + }, + { + "epoch": 0.000841, + "grad_norm": 0.4960392117500305, + "learning_rate": 1e-05, + "loss": 0.0464, + "step": 84100 + }, + { + "epoch": 0.000842, + "grad_norm": 0.4881882667541504, + "learning_rate": 1e-05, + "loss": 0.0461, + "step": 84200 + }, + { + "epoch": 0.000843, + "grad_norm": 0.45837706327438354, + "learning_rate": 1e-05, + "loss": 0.0462, + "step": 84300 + }, + { + "epoch": 0.000844, + "grad_norm": 0.4866684675216675, + "learning_rate": 1e-05, + "loss": 0.0456, + "step": 84400 + }, + { + "epoch": 0.000845, + "grad_norm": 0.5094208121299744, + "learning_rate": 1e-05, + "loss": 0.0466, + "step": 84500 + }, + { + "epoch": 0.000846, + "grad_norm": 0.45124098658561707, + "learning_rate": 1e-05, + "loss": 0.0474, + "step": 84600 + }, + { + "epoch": 0.000847, + "grad_norm": 0.5730771422386169, + "learning_rate": 1e-05, + "loss": 0.0465, + "step": 84700 + }, + { + "epoch": 0.000848, + "grad_norm": 0.48597007989883423, + "learning_rate": 1e-05, + "loss": 0.0463, + "step": 84800 + }, + { + "epoch": 0.000849, + "grad_norm": 0.46603092551231384, + "learning_rate": 1e-05, + "loss": 0.0465, + "step": 84900 + }, + { + "epoch": 0.00085, + "grad_norm": 0.5534038543701172, + "learning_rate": 1e-05, + "loss": 0.0469, + "step": 85000 + }, + { + "epoch": 0.000851, + "grad_norm": 0.42876607179641724, + "learning_rate": 1e-05, + "loss": 0.0465, + "step": 85100 + }, + { + "epoch": 0.000852, + "grad_norm": 0.39502009749412537, + "learning_rate": 1e-05, + "loss": 0.0466, + "step": 85200 + }, + { + "epoch": 0.000853, + "grad_norm": 0.44408953189849854, + "learning_rate": 1e-05, + "loss": 0.0468, + "step": 85300 + }, + { + "epoch": 0.000854, + "grad_norm": 0.444979190826416, + "learning_rate": 1e-05, + "loss": 0.0463, + "step": 85400 + }, + { + "epoch": 0.000855, + "grad_norm": 0.4805260896682739, + "learning_rate": 1e-05, + "loss": 0.0465, + "step": 85500 + }, + { + "epoch": 0.000856, + "grad_norm": 0.552291750907898, + "learning_rate": 1e-05, + "loss": 0.0462, + "step": 85600 + }, + { + "epoch": 0.000857, + "grad_norm": 0.5068393349647522, + "learning_rate": 1e-05, + "loss": 0.0461, + "step": 85700 + }, + { + "epoch": 0.000858, + "grad_norm": 0.41845035552978516, + "learning_rate": 1e-05, + "loss": 0.0461, + "step": 85800 + }, + { + "epoch": 0.000859, + "grad_norm": 0.4751891493797302, + "learning_rate": 1e-05, + "loss": 0.0458, + "step": 85900 + }, + { + "epoch": 0.00086, + "grad_norm": 0.5280572175979614, + "learning_rate": 1e-05, + "loss": 0.0458, + "step": 86000 + }, + { + "epoch": 0.000861, + "grad_norm": 0.68556147813797, + "learning_rate": 1e-05, + "loss": 0.046, + "step": 86100 + }, + { + "epoch": 0.000862, + "grad_norm": 0.5463889241218567, + "learning_rate": 1e-05, + "loss": 0.0457, + "step": 86200 + }, + { + "epoch": 0.000863, + "grad_norm": 0.44014325737953186, + "learning_rate": 1e-05, + "loss": 0.0459, + "step": 86300 + }, + { + "epoch": 0.000864, + "grad_norm": 0.5454211235046387, + "learning_rate": 1e-05, + "loss": 0.0457, + "step": 86400 + }, + { + "epoch": 0.000865, + "grad_norm": 0.5828255414962769, + "learning_rate": 1e-05, + "loss": 0.0453, + "step": 86500 + }, + { + "epoch": 0.000866, + "grad_norm": 0.4621482789516449, + "learning_rate": 1e-05, + "loss": 0.0456, + "step": 86600 + }, + { + "epoch": 0.000867, + "grad_norm": 0.4085827171802521, + "learning_rate": 1e-05, + "loss": 0.0458, + "step": 86700 + }, + { + "epoch": 0.000868, + "grad_norm": 0.504058301448822, + "learning_rate": 1e-05, + "loss": 0.0459, + "step": 86800 + }, + { + "epoch": 0.000869, + "grad_norm": 0.48852622509002686, + "learning_rate": 1e-05, + "loss": 0.0454, + "step": 86900 + }, + { + "epoch": 0.00087, + "grad_norm": 0.4814854860305786, + "learning_rate": 1e-05, + "loss": 0.0457, + "step": 87000 + }, + { + "epoch": 0.000871, + "grad_norm": 0.40433430671691895, + "learning_rate": 1e-05, + "loss": 0.0463, + "step": 87100 + }, + { + "epoch": 0.000872, + "grad_norm": 0.40531593561172485, + "learning_rate": 1e-05, + "loss": 0.0452, + "step": 87200 + }, + { + "epoch": 0.000873, + "grad_norm": 0.5245575308799744, + "learning_rate": 1e-05, + "loss": 0.0449, + "step": 87300 + }, + { + "epoch": 0.000874, + "grad_norm": 0.39926889538764954, + "learning_rate": 1e-05, + "loss": 0.0459, + "step": 87400 + }, + { + "epoch": 0.000875, + "grad_norm": 0.4549976587295532, + "learning_rate": 1e-05, + "loss": 0.0464, + "step": 87500 + }, + { + "epoch": 0.000876, + "grad_norm": 0.4379943013191223, + "learning_rate": 1e-05, + "loss": 0.0458, + "step": 87600 + }, + { + "epoch": 0.000877, + "grad_norm": 0.5028941035270691, + "learning_rate": 1e-05, + "loss": 0.0462, + "step": 87700 + }, + { + "epoch": 0.000878, + "grad_norm": 0.43268847465515137, + "learning_rate": 1e-05, + "loss": 0.0459, + "step": 87800 + }, + { + "epoch": 0.000879, + "grad_norm": 0.5015890002250671, + "learning_rate": 1e-05, + "loss": 0.0449, + "step": 87900 + }, + { + "epoch": 0.00088, + "grad_norm": 0.445121705532074, + "learning_rate": 1e-05, + "loss": 0.0457, + "step": 88000 + }, + { + "epoch": 0.000881, + "grad_norm": 0.49214833974838257, + "learning_rate": 1e-05, + "loss": 0.0459, + "step": 88100 + }, + { + "epoch": 0.000882, + "grad_norm": 0.4444495141506195, + "learning_rate": 1e-05, + "loss": 0.0455, + "step": 88200 + }, + { + "epoch": 0.000883, + "grad_norm": 0.49876669049263, + "learning_rate": 1e-05, + "loss": 0.0459, + "step": 88300 + }, + { + "epoch": 0.000884, + "grad_norm": 0.5114990472793579, + "learning_rate": 1e-05, + "loss": 0.045, + "step": 88400 + }, + { + "epoch": 0.000885, + "grad_norm": 0.48783600330352783, + "learning_rate": 1e-05, + "loss": 0.0461, + "step": 88500 + }, + { + "epoch": 0.000886, + "grad_norm": 0.45137009024620056, + "learning_rate": 1e-05, + "loss": 0.0451, + "step": 88600 + }, + { + "epoch": 0.000887, + "grad_norm": 0.5109623074531555, + "learning_rate": 1e-05, + "loss": 0.0453, + "step": 88700 + }, + { + "epoch": 0.000888, + "grad_norm": 0.57321697473526, + "learning_rate": 1e-05, + "loss": 0.0458, + "step": 88800 + }, + { + "epoch": 0.000889, + "grad_norm": 0.4072723686695099, + "learning_rate": 1e-05, + "loss": 0.0446, + "step": 88900 + }, + { + "epoch": 0.00089, + "grad_norm": 0.5093070268630981, + "learning_rate": 1e-05, + "loss": 0.045, + "step": 89000 + }, + { + "epoch": 0.000891, + "grad_norm": 0.5923020839691162, + "learning_rate": 1e-05, + "loss": 0.0444, + "step": 89100 + }, + { + "epoch": 0.000892, + "grad_norm": 0.4343903958797455, + "learning_rate": 1e-05, + "loss": 0.0445, + "step": 89200 + }, + { + "epoch": 0.000893, + "grad_norm": 0.6024598479270935, + "learning_rate": 1e-05, + "loss": 0.045, + "step": 89300 + }, + { + "epoch": 0.000894, + "grad_norm": 0.5708175301551819, + "learning_rate": 1e-05, + "loss": 0.0445, + "step": 89400 + }, + { + "epoch": 0.000895, + "grad_norm": 0.42085763812065125, + "learning_rate": 1e-05, + "loss": 0.0448, + "step": 89500 + }, + { + "epoch": 0.000896, + "grad_norm": 0.4565168023109436, + "learning_rate": 1e-05, + "loss": 0.0448, + "step": 89600 + }, + { + "epoch": 0.000897, + "grad_norm": 0.4638221561908722, + "learning_rate": 1e-05, + "loss": 0.0455, + "step": 89700 + }, + { + "epoch": 0.000898, + "grad_norm": 0.3921230435371399, + "learning_rate": 1e-05, + "loss": 0.0452, + "step": 89800 + }, + { + "epoch": 0.000899, + "grad_norm": 0.5701455473899841, + "learning_rate": 1e-05, + "loss": 0.045, + "step": 89900 + }, + { + "epoch": 0.0009, + "grad_norm": 0.5132615566253662, + "learning_rate": 1e-05, + "loss": 0.0458, + "step": 90000 + }, + { + "epoch": 0.000901, + "grad_norm": 0.43130597472190857, + "learning_rate": 1e-05, + "loss": 0.0449, + "step": 90100 + }, + { + "epoch": 0.000902, + "grad_norm": 0.4558640718460083, + "learning_rate": 1e-05, + "loss": 0.0446, + "step": 90200 + }, + { + "epoch": 0.000903, + "grad_norm": 0.4325823485851288, + "learning_rate": 1e-05, + "loss": 0.0449, + "step": 90300 + }, + { + "epoch": 0.000904, + "grad_norm": 0.5899006724357605, + "learning_rate": 1e-05, + "loss": 0.0446, + "step": 90400 + }, + { + "epoch": 0.000905, + "grad_norm": 0.6101588010787964, + "learning_rate": 1e-05, + "loss": 0.0455, + "step": 90500 + }, + { + "epoch": 0.000906, + "grad_norm": 0.5354421138763428, + "learning_rate": 1e-05, + "loss": 0.0447, + "step": 90600 + }, + { + "epoch": 0.000907, + "grad_norm": 0.4496416449546814, + "learning_rate": 1e-05, + "loss": 0.0444, + "step": 90700 + }, + { + "epoch": 0.000908, + "grad_norm": 0.40793660283088684, + "learning_rate": 1e-05, + "loss": 0.0447, + "step": 90800 + }, + { + "epoch": 0.000909, + "grad_norm": 0.5534836053848267, + "learning_rate": 1e-05, + "loss": 0.0444, + "step": 90900 + }, + { + "epoch": 0.00091, + "grad_norm": 0.4275030493736267, + "learning_rate": 1e-05, + "loss": 0.0448, + "step": 91000 + }, + { + "epoch": 0.000911, + "grad_norm": 0.5632148385047913, + "learning_rate": 1e-05, + "loss": 0.0443, + "step": 91100 + }, + { + "epoch": 0.000912, + "grad_norm": 0.43501216173171997, + "learning_rate": 1e-05, + "loss": 0.0439, + "step": 91200 + }, + { + "epoch": 0.000913, + "grad_norm": 0.54071444272995, + "learning_rate": 1e-05, + "loss": 0.0449, + "step": 91300 + }, + { + "epoch": 0.000914, + "grad_norm": 0.40895435214042664, + "learning_rate": 1e-05, + "loss": 0.0451, + "step": 91400 + }, + { + "epoch": 0.000915, + "grad_norm": 0.495510995388031, + "learning_rate": 1e-05, + "loss": 0.0441, + "step": 91500 + }, + { + "epoch": 0.000916, + "grad_norm": 0.3936554789543152, + "learning_rate": 1e-05, + "loss": 0.0437, + "step": 91600 + }, + { + "epoch": 0.000917, + "grad_norm": 0.4443312883377075, + "learning_rate": 1e-05, + "loss": 0.0443, + "step": 91700 + }, + { + "epoch": 0.000918, + "grad_norm": 0.5269384384155273, + "learning_rate": 1e-05, + "loss": 0.0442, + "step": 91800 + }, + { + "epoch": 0.000919, + "grad_norm": 0.43092164397239685, + "learning_rate": 1e-05, + "loss": 0.0442, + "step": 91900 + }, + { + "epoch": 0.00092, + "grad_norm": 0.498935729265213, + "learning_rate": 1e-05, + "loss": 0.044, + "step": 92000 + }, + { + "epoch": 0.000921, + "grad_norm": 0.4460262656211853, + "learning_rate": 1e-05, + "loss": 0.0448, + "step": 92100 + }, + { + "epoch": 0.000922, + "grad_norm": 0.4452255964279175, + "learning_rate": 1e-05, + "loss": 0.0441, + "step": 92200 + }, + { + "epoch": 0.000923, + "grad_norm": 0.5646675229072571, + "learning_rate": 1e-05, + "loss": 0.044, + "step": 92300 + }, + { + "epoch": 0.000924, + "grad_norm": 0.5320536494255066, + "learning_rate": 1e-05, + "loss": 0.0439, + "step": 92400 + }, + { + "epoch": 0.000925, + "grad_norm": 0.4475862681865692, + "learning_rate": 1e-05, + "loss": 0.0432, + "step": 92500 + }, + { + "epoch": 0.000926, + "grad_norm": 0.42607611417770386, + "learning_rate": 1e-05, + "loss": 0.0448, + "step": 92600 + }, + { + "epoch": 0.000927, + "grad_norm": 0.465669721364975, + "learning_rate": 1e-05, + "loss": 0.0447, + "step": 92700 + }, + { + "epoch": 0.000928, + "grad_norm": 0.47202736139297485, + "learning_rate": 1e-05, + "loss": 0.0449, + "step": 92800 + }, + { + "epoch": 0.000929, + "grad_norm": 0.45119792222976685, + "learning_rate": 1e-05, + "loss": 0.0443, + "step": 92900 + }, + { + "epoch": 0.00093, + "grad_norm": 0.4515833258628845, + "learning_rate": 1e-05, + "loss": 0.0446, + "step": 93000 + }, + { + "epoch": 0.000931, + "grad_norm": 0.43587127327919006, + "learning_rate": 1e-05, + "loss": 0.0436, + "step": 93100 + }, + { + "epoch": 0.000932, + "grad_norm": 0.4407802224159241, + "learning_rate": 1e-05, + "loss": 0.0446, + "step": 93200 + }, + { + "epoch": 0.000933, + "grad_norm": 0.4792422950267792, + "learning_rate": 1e-05, + "loss": 0.0439, + "step": 93300 + }, + { + "epoch": 0.000934, + "grad_norm": 0.5214342474937439, + "learning_rate": 1e-05, + "loss": 0.0438, + "step": 93400 + }, + { + "epoch": 0.000935, + "grad_norm": 0.5573062300682068, + "learning_rate": 1e-05, + "loss": 0.044, + "step": 93500 + }, + { + "epoch": 0.000936, + "grad_norm": 0.5918563008308411, + "learning_rate": 1e-05, + "loss": 0.0436, + "step": 93600 + }, + { + "epoch": 0.000937, + "grad_norm": 0.48166489601135254, + "learning_rate": 1e-05, + "loss": 0.0442, + "step": 93700 + }, + { + "epoch": 0.000938, + "grad_norm": 0.4840247631072998, + "learning_rate": 1e-05, + "loss": 0.0438, + "step": 93800 + }, + { + "epoch": 0.000939, + "grad_norm": 0.44477516412734985, + "learning_rate": 1e-05, + "loss": 0.044, + "step": 93900 + }, + { + "epoch": 0.00094, + "grad_norm": 0.5108721256256104, + "learning_rate": 1e-05, + "loss": 0.0434, + "step": 94000 + }, + { + "epoch": 0.000941, + "grad_norm": 0.5947906970977783, + "learning_rate": 1e-05, + "loss": 0.0441, + "step": 94100 + }, + { + "epoch": 0.000942, + "grad_norm": 0.4325408637523651, + "learning_rate": 1e-05, + "loss": 0.0434, + "step": 94200 + }, + { + "epoch": 0.000943, + "grad_norm": 0.5207073092460632, + "learning_rate": 1e-05, + "loss": 0.0432, + "step": 94300 + }, + { + "epoch": 0.000944, + "grad_norm": 0.4852275848388672, + "learning_rate": 1e-05, + "loss": 0.0439, + "step": 94400 + }, + { + "epoch": 0.000945, + "grad_norm": 0.5342420339584351, + "learning_rate": 1e-05, + "loss": 0.0436, + "step": 94500 + }, + { + "epoch": 0.000946, + "grad_norm": 0.6544240713119507, + "learning_rate": 1e-05, + "loss": 0.0436, + "step": 94600 + }, + { + "epoch": 0.000947, + "grad_norm": 0.456338107585907, + "learning_rate": 1e-05, + "loss": 0.0437, + "step": 94700 + }, + { + "epoch": 0.000948, + "grad_norm": 0.51591956615448, + "learning_rate": 1e-05, + "loss": 0.0429, + "step": 94800 + }, + { + "epoch": 0.000949, + "grad_norm": 0.5521871447563171, + "learning_rate": 1e-05, + "loss": 0.0437, + "step": 94900 + }, + { + "epoch": 0.00095, + "grad_norm": 0.46055886149406433, + "learning_rate": 1e-05, + "loss": 0.0433, + "step": 95000 + }, + { + "epoch": 0.000951, + "grad_norm": 0.5128651261329651, + "learning_rate": 1e-05, + "loss": 0.0441, + "step": 95100 + }, + { + "epoch": 0.000952, + "grad_norm": 0.5421969294548035, + "learning_rate": 1e-05, + "loss": 0.044, + "step": 95200 + }, + { + "epoch": 0.000953, + "grad_norm": 0.4281409680843353, + "learning_rate": 1e-05, + "loss": 0.0439, + "step": 95300 + }, + { + "epoch": 0.000954, + "grad_norm": 0.3867093622684479, + "learning_rate": 1e-05, + "loss": 0.0439, + "step": 95400 + }, + { + "epoch": 0.000955, + "grad_norm": 0.39425021409988403, + "learning_rate": 1e-05, + "loss": 0.0434, + "step": 95500 + }, + { + "epoch": 0.000956, + "grad_norm": 0.45868080854415894, + "learning_rate": 1e-05, + "loss": 0.0435, + "step": 95600 + }, + { + "epoch": 0.000957, + "grad_norm": 0.38381725549697876, + "learning_rate": 1e-05, + "loss": 0.0434, + "step": 95700 + }, + { + "epoch": 0.000958, + "grad_norm": 0.5100952386856079, + "learning_rate": 1e-05, + "loss": 0.043, + "step": 95800 + }, + { + "epoch": 0.000959, + "grad_norm": 0.45941147208213806, + "learning_rate": 1e-05, + "loss": 0.0442, + "step": 95900 + }, + { + "epoch": 0.00096, + "grad_norm": 0.3832944929599762, + "learning_rate": 1e-05, + "loss": 0.044, + "step": 96000 + }, + { + "epoch": 0.000961, + "grad_norm": 0.3378923535346985, + "learning_rate": 1e-05, + "loss": 0.0436, + "step": 96100 + }, + { + "epoch": 0.000962, + "grad_norm": 0.41457870602607727, + "learning_rate": 1e-05, + "loss": 0.0435, + "step": 96200 + }, + { + "epoch": 0.000963, + "grad_norm": 0.49303749203681946, + "learning_rate": 1e-05, + "loss": 0.0426, + "step": 96300 + }, + { + "epoch": 0.000964, + "grad_norm": 0.3703688383102417, + "learning_rate": 1e-05, + "loss": 0.0436, + "step": 96400 + }, + { + "epoch": 0.000965, + "grad_norm": 0.3742707371711731, + "learning_rate": 1e-05, + "loss": 0.0431, + "step": 96500 + }, + { + "epoch": 0.000966, + "grad_norm": 0.4352927505970001, + "learning_rate": 1e-05, + "loss": 0.0427, + "step": 96600 + }, + { + "epoch": 0.000967, + "grad_norm": 0.4979144334793091, + "learning_rate": 1e-05, + "loss": 0.044, + "step": 96700 + }, + { + "epoch": 0.000968, + "grad_norm": 0.38628560304641724, + "learning_rate": 1e-05, + "loss": 0.0434, + "step": 96800 + }, + { + "epoch": 0.000969, + "grad_norm": 0.5488578677177429, + "learning_rate": 1e-05, + "loss": 0.0442, + "step": 96900 + }, + { + "epoch": 0.00097, + "grad_norm": 0.3385869264602661, + "learning_rate": 1e-05, + "loss": 0.043, + "step": 97000 + }, + { + "epoch": 0.000971, + "grad_norm": 0.3328537046909332, + "learning_rate": 1e-05, + "loss": 0.0434, + "step": 97100 + }, + { + "epoch": 0.000972, + "grad_norm": 0.3595049977302551, + "learning_rate": 1e-05, + "loss": 0.0435, + "step": 97200 + }, + { + "epoch": 0.000973, + "grad_norm": 0.4202601909637451, + "learning_rate": 1e-05, + "loss": 0.0433, + "step": 97300 + }, + { + "epoch": 0.000974, + "grad_norm": 0.47522690892219543, + "learning_rate": 1e-05, + "loss": 0.0428, + "step": 97400 + }, + { + "epoch": 0.000975, + "grad_norm": 0.4936007857322693, + "learning_rate": 1e-05, + "loss": 0.0429, + "step": 97500 + }, + { + "epoch": 0.000976, + "grad_norm": 0.40649285912513733, + "learning_rate": 1e-05, + "loss": 0.0433, + "step": 97600 + }, + { + "epoch": 0.000977, + "grad_norm": 0.4392286241054535, + "learning_rate": 1e-05, + "loss": 0.0429, + "step": 97700 + }, + { + "epoch": 0.000978, + "grad_norm": 0.38572990894317627, + "learning_rate": 1e-05, + "loss": 0.0435, + "step": 97800 + }, + { + "epoch": 0.000979, + "grad_norm": 0.5374602675437927, + "learning_rate": 1e-05, + "loss": 0.0428, + "step": 97900 + }, + { + "epoch": 0.00098, + "grad_norm": 0.4686330258846283, + "learning_rate": 1e-05, + "loss": 0.0431, + "step": 98000 + }, + { + "epoch": 0.000981, + "grad_norm": 0.44734638929367065, + "learning_rate": 1e-05, + "loss": 0.0424, + "step": 98100 + }, + { + "epoch": 0.000982, + "grad_norm": 0.47658222913742065, + "learning_rate": 1e-05, + "loss": 0.0439, + "step": 98200 + }, + { + "epoch": 0.000983, + "grad_norm": 0.73811274766922, + "learning_rate": 1e-05, + "loss": 0.0428, + "step": 98300 + }, + { + "epoch": 0.000984, + "grad_norm": 0.4593341648578644, + "learning_rate": 1e-05, + "loss": 0.0429, + "step": 98400 + }, + { + "epoch": 0.000985, + "grad_norm": 0.4732546806335449, + "learning_rate": 1e-05, + "loss": 0.0433, + "step": 98500 + }, + { + "epoch": 0.000986, + "grad_norm": 0.37035250663757324, + "learning_rate": 1e-05, + "loss": 0.0424, + "step": 98600 + }, + { + "epoch": 0.000987, + "grad_norm": 0.47103026509284973, + "learning_rate": 1e-05, + "loss": 0.0435, + "step": 98700 + }, + { + "epoch": 0.000988, + "grad_norm": 0.47766396403312683, + "learning_rate": 1e-05, + "loss": 0.0431, + "step": 98800 + }, + { + "epoch": 0.000989, + "grad_norm": 0.44070738554000854, + "learning_rate": 1e-05, + "loss": 0.0431, + "step": 98900 + }, + { + "epoch": 0.00099, + "grad_norm": 0.44191232323646545, + "learning_rate": 1e-05, + "loss": 0.0429, + "step": 99000 + }, + { + "epoch": 0.000991, + "grad_norm": 0.4926696717739105, + "learning_rate": 1e-05, + "loss": 0.0426, + "step": 99100 + }, + { + "epoch": 0.000992, + "grad_norm": 0.3758436143398285, + "learning_rate": 1e-05, + "loss": 0.042, + "step": 99200 + }, + { + "epoch": 0.000993, + "grad_norm": 0.4165551960468292, + "learning_rate": 1e-05, + "loss": 0.0435, + "step": 99300 + }, + { + "epoch": 0.000994, + "grad_norm": 0.4664058983325958, + "learning_rate": 1e-05, + "loss": 0.0436, + "step": 99400 + }, + { + "epoch": 0.000995, + "grad_norm": 0.5242469906806946, + "learning_rate": 1e-05, + "loss": 0.0431, + "step": 99500 + }, + { + "epoch": 0.000996, + "grad_norm": 0.5722303986549377, + "learning_rate": 1e-05, + "loss": 0.0433, + "step": 99600 + }, + { + "epoch": 0.000997, + "grad_norm": 0.4828585684299469, + "learning_rate": 1e-05, + "loss": 0.0425, + "step": 99700 + }, + { + "epoch": 0.000998, + "grad_norm": 0.46811702847480774, + "learning_rate": 1e-05, + "loss": 0.0429, + "step": 99800 + }, + { + "epoch": 0.000999, + "grad_norm": 0.379393070936203, + "learning_rate": 1e-05, + "loss": 0.0432, + "step": 99900 + }, + { + "epoch": 0.001, + "grad_norm": 0.5672951340675354, + "learning_rate": 1e-05, + "loss": 0.0435, + "step": 100000 + }, + { + "epoch": 0.001, + "eval_loss": 0.035888671875, + "eval_runtime": 147.9294, + "eval_samples_per_second": 337.999, + "eval_steps_per_second": 21.125, + "step": 100000 + } + ], + "logging_steps": 100, + "max_steps": 100000000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 20000, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 200, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.951156666368e+18, + "train_batch_size": 128, + "trial_name": null, + "trial_params": null +}