| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 25.0, | |
| "eval_steps": 500, | |
| "global_step": 3125, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 6.874109268188477, | |
| "learning_rate": 2.5559105431309904e-07, | |
| "loss": 1.5574, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 6.183030128479004, | |
| "learning_rate": 8.306709265175719e-07, | |
| "loss": 1.4724, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 2.6565823554992676, | |
| "learning_rate": 1.4696485623003196e-06, | |
| "loss": 1.054, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 3.227349042892456, | |
| "learning_rate": 2.1086261980830672e-06, | |
| "loss": 0.8732, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 1.1049450635910034, | |
| "learning_rate": 2.747603833865815e-06, | |
| "loss": 0.7617, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 1.8624663352966309, | |
| "learning_rate": 3.386581469648563e-06, | |
| "loss": 0.7144, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 2.1665029525756836, | |
| "learning_rate": 4.02555910543131e-06, | |
| "loss": 0.7612, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 1.0787376165390015, | |
| "learning_rate": 4.664536741214058e-06, | |
| "loss": 0.6864, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 1.4465893507003784, | |
| "learning_rate": 5.303514376996806e-06, | |
| "loss": 0.6558, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.1737278699874878, | |
| "learning_rate": 5.8785942492012785e-06, | |
| "loss": 0.6295, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 1.7027487754821777, | |
| "learning_rate": 6.517571884984026e-06, | |
| "loss": 0.638, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 1.5483449697494507, | |
| "learning_rate": 7.156549520766773e-06, | |
| "loss": 0.6173, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 1.9829121828079224, | |
| "learning_rate": 7.795527156549521e-06, | |
| "loss": 0.5511, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 1.466599464416504, | |
| "learning_rate": 8.434504792332269e-06, | |
| "loss": 0.5366, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 1.5559449195861816, | |
| "learning_rate": 9.073482428115017e-06, | |
| "loss": 0.5662, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 1.33512282371521, | |
| "learning_rate": 9.712460063897765e-06, | |
| "loss": 0.5567, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.3599999999999999, | |
| "grad_norm": 0.943450927734375, | |
| "learning_rate": 1.035143769968051e-05, | |
| "loss": 0.5997, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 1.2756850719451904, | |
| "learning_rate": 1.099041533546326e-05, | |
| "loss": 0.5587, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 1.0768604278564453, | |
| "learning_rate": 1.1629392971246008e-05, | |
| "loss": 0.6288, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 1.082158088684082, | |
| "learning_rate": 1.2268370607028754e-05, | |
| "loss": 0.5475, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.6800000000000002, | |
| "grad_norm": 1.4872550964355469, | |
| "learning_rate": 1.2907348242811502e-05, | |
| "loss": 0.6002, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 2.0925095081329346, | |
| "learning_rate": 1.3546325878594251e-05, | |
| "loss": 0.5887, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.8399999999999999, | |
| "grad_norm": 1.4571633338928223, | |
| "learning_rate": 1.4185303514376998e-05, | |
| "loss": 0.6107, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 2.4871673583984375, | |
| "learning_rate": 1.4824281150159745e-05, | |
| "loss": 0.5582, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.3184454441070557, | |
| "learning_rate": 1.5463258785942495e-05, | |
| "loss": 0.582, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 1.9747495651245117, | |
| "learning_rate": 1.610223642172524e-05, | |
| "loss": 0.4687, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 1.7691447734832764, | |
| "learning_rate": 1.6741214057507987e-05, | |
| "loss": 0.5013, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 1.8902829885482788, | |
| "learning_rate": 1.7380191693290737e-05, | |
| "loss": 0.5008, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 1.3197141885757446, | |
| "learning_rate": 1.8019169329073486e-05, | |
| "loss": 0.5229, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 2.325404167175293, | |
| "learning_rate": 1.8658146964856232e-05, | |
| "loss": 0.4975, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 1.3323551416397095, | |
| "learning_rate": 1.929712460063898e-05, | |
| "loss": 0.5254, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 0.7770269513130188, | |
| "learning_rate": 1.9936102236421725e-05, | |
| "loss": 0.5304, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 1.887294054031372, | |
| "learning_rate": 1.999949450079496e-05, | |
| "loss": 0.4755, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "grad_norm": 1.2255381345748901, | |
| "learning_rate": 1.9997747161747696e-05, | |
| "loss": 0.4982, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 1.6865767240524292, | |
| "learning_rate": 1.9994751960168383e-05, | |
| "loss": 0.4857, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 1.2997519969940186, | |
| "learning_rate": 1.999050926990122e-05, | |
| "loss": 0.4987, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 1.4636356830596924, | |
| "learning_rate": 1.9985019620494935e-05, | |
| "loss": 0.4947, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 1.3720135688781738, | |
| "learning_rate": 1.9978283697136662e-05, | |
| "loss": 0.5135, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 3.12, | |
| "grad_norm": 1.6108952760696411, | |
| "learning_rate": 1.997030234056645e-05, | |
| "loss": 0.4405, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 2.1790759563446045, | |
| "learning_rate": 1.9961076546972304e-05, | |
| "loss": 0.4468, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 3.2800000000000002, | |
| "grad_norm": 4.406925678253174, | |
| "learning_rate": 1.9950607467865856e-05, | |
| "loss": 0.4675, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 3.36, | |
| "grad_norm": 1.7346851825714111, | |
| "learning_rate": 1.993889640993864e-05, | |
| "loss": 0.4243, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "grad_norm": 1.2865455150604248, | |
| "learning_rate": 1.9925944834898992e-05, | |
| "loss": 0.3787, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 1.321824550628662, | |
| "learning_rate": 1.991175435928962e-05, | |
| "loss": 0.3982, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "grad_norm": 2.45711350440979, | |
| "learning_rate": 1.989632675428582e-05, | |
| "loss": 0.4811, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "grad_norm": 2.5552310943603516, | |
| "learning_rate": 1.9879663945474416e-05, | |
| "loss": 0.4255, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 3.76, | |
| "grad_norm": 1.763244867324829, | |
| "learning_rate": 1.9861768012613435e-05, | |
| "loss": 0.4579, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "grad_norm": 1.0951169729232788, | |
| "learning_rate": 1.984264118937249e-05, | |
| "loss": 0.398, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 3.92, | |
| "grad_norm": 8.235908508300781, | |
| "learning_rate": 1.9822285863054e-05, | |
| "loss": 0.4101, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 19.39371681213379, | |
| "learning_rate": 1.9800704574295246e-05, | |
| "loss": 0.4645, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 4.08, | |
| "grad_norm": 1.2594205141067505, | |
| "learning_rate": 1.9777900016751224e-05, | |
| "loss": 0.3632, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 4.16, | |
| "grad_norm": 1.7697833776474, | |
| "learning_rate": 1.9753875036758464e-05, | |
| "loss": 0.3356, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 4.24, | |
| "grad_norm": 1.66884446144104, | |
| "learning_rate": 1.9728632632979746e-05, | |
| "loss": 0.382, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 4.32, | |
| "grad_norm": 1.7648805379867554, | |
| "learning_rate": 1.970217595602985e-05, | |
| "loss": 0.3997, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 4.4, | |
| "grad_norm": 1.6132514476776123, | |
| "learning_rate": 1.967450830808228e-05, | |
| "loss": 0.3476, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 4.48, | |
| "grad_norm": 1.3217155933380127, | |
| "learning_rate": 1.9645633142457143e-05, | |
| "loss": 0.3362, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 4.5600000000000005, | |
| "grad_norm": 1.5668702125549316, | |
| "learning_rate": 1.9615554063190098e-05, | |
| "loss": 0.3996, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 4.64, | |
| "grad_norm": 2.051454544067383, | |
| "learning_rate": 1.958427482458253e-05, | |
| "loss": 0.4104, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 4.72, | |
| "grad_norm": 1.84868586063385, | |
| "learning_rate": 1.9551799330732954e-05, | |
| "loss": 0.374, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "grad_norm": 1.3498121500015259, | |
| "learning_rate": 1.9518131635049745e-05, | |
| "loss": 0.3444, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 4.88, | |
| "grad_norm": 1.962553858757019, | |
| "learning_rate": 1.9483275939745184e-05, | |
| "loss": 0.3153, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 4.96, | |
| "grad_norm": 1.7132457494735718, | |
| "learning_rate": 1.944723659531099e-05, | |
| "loss": 0.383, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 5.04, | |
| "grad_norm": 1.8860396146774292, | |
| "learning_rate": 1.9410018099975297e-05, | |
| "loss": 0.327, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 5.12, | |
| "grad_norm": 1.9154820442199707, | |
| "learning_rate": 1.9371625099141223e-05, | |
| "loss": 0.2704, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 5.2, | |
| "grad_norm": 1.4077069759368896, | |
| "learning_rate": 1.9332062384807058e-05, | |
| "loss": 0.2659, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 5.28, | |
| "grad_norm": 1.8489642143249512, | |
| "learning_rate": 1.9291334894968133e-05, | |
| "loss": 0.2523, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 5.36, | |
| "grad_norm": 2.3917829990386963, | |
| "learning_rate": 1.9249447713000515e-05, | |
| "loss": 0.2885, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 5.44, | |
| "grad_norm": 2.5429599285125732, | |
| "learning_rate": 1.9206406067026506e-05, | |
| "loss": 0.3097, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 5.52, | |
| "grad_norm": 3.4684863090515137, | |
| "learning_rate": 1.9162215329262115e-05, | |
| "loss": 0.2868, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 5.6, | |
| "grad_norm": 2.662013292312622, | |
| "learning_rate": 1.9116881015346517e-05, | |
| "loss": 0.3281, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 5.68, | |
| "grad_norm": 2.223334312438965, | |
| "learning_rate": 1.9070408783653627e-05, | |
| "loss": 0.2799, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 5.76, | |
| "grad_norm": 2.992532730102539, | |
| "learning_rate": 1.9022804434585854e-05, | |
| "loss": 0.3252, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 5.84, | |
| "grad_norm": 1.6079481840133667, | |
| "learning_rate": 1.8974073909850125e-05, | |
| "loss": 0.2791, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 5.92, | |
| "grad_norm": 2.97104549407959, | |
| "learning_rate": 1.8924223291716274e-05, | |
| "loss": 0.3029, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 2.1889212131500244, | |
| "learning_rate": 1.887325880225789e-05, | |
| "loss": 0.293, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 6.08, | |
| "grad_norm": 2.123610019683838, | |
| "learning_rate": 1.882118680257572e-05, | |
| "loss": 0.2308, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 6.16, | |
| "grad_norm": 3.71563720703125, | |
| "learning_rate": 1.8768013792003683e-05, | |
| "loss": 0.216, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 6.24, | |
| "grad_norm": 2.459091901779175, | |
| "learning_rate": 1.8713746407297703e-05, | |
| "loss": 0.1972, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 6.32, | |
| "grad_norm": 2.669945240020752, | |
| "learning_rate": 1.8658391421807313e-05, | |
| "loss": 0.2478, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 6.4, | |
| "grad_norm": 2.1581931114196777, | |
| "learning_rate": 1.8601955744630255e-05, | |
| "loss": 0.2295, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 6.48, | |
| "grad_norm": 2.567908763885498, | |
| "learning_rate": 1.8544446419750125e-05, | |
| "loss": 0.2239, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 6.5600000000000005, | |
| "grad_norm": 2.0205740928649902, | |
| "learning_rate": 1.8485870625157186e-05, | |
| "loss": 0.216, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 6.64, | |
| "grad_norm": 2.2898030281066895, | |
| "learning_rate": 1.8426235671952452e-05, | |
| "loss": 0.2343, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 6.72, | |
| "grad_norm": 2.2003941535949707, | |
| "learning_rate": 1.836554900343514e-05, | |
| "loss": 0.2076, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 6.8, | |
| "grad_norm": 1.5117710828781128, | |
| "learning_rate": 1.8303818194173665e-05, | |
| "loss": 0.1904, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 6.88, | |
| "grad_norm": 1.2647337913513184, | |
| "learning_rate": 1.824105094906021e-05, | |
| "loss": 0.2341, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 6.96, | |
| "grad_norm": 2.00254225730896, | |
| "learning_rate": 1.8177255102349047e-05, | |
| "loss": 0.2288, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 7.04, | |
| "grad_norm": 2.516129493713379, | |
| "learning_rate": 1.8112438616678712e-05, | |
| "loss": 0.1685, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 7.12, | |
| "grad_norm": 3.6129150390625, | |
| "learning_rate": 1.8046609582078147e-05, | |
| "loss": 0.1625, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 7.2, | |
| "grad_norm": 3.1554768085479736, | |
| "learning_rate": 1.797977621495696e-05, | |
| "loss": 0.2011, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 7.28, | |
| "grad_norm": 3.8677282333374023, | |
| "learning_rate": 1.7911946857079886e-05, | |
| "loss": 0.1579, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 7.36, | |
| "grad_norm": 3.062568426132202, | |
| "learning_rate": 1.784312997452562e-05, | |
| "loss": 0.1705, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 7.44, | |
| "grad_norm": 1.8281478881835938, | |
| "learning_rate": 1.777333415663014e-05, | |
| "loss": 0.1708, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 7.52, | |
| "grad_norm": 1.1312201023101807, | |
| "learning_rate": 1.7702568114914607e-05, | |
| "loss": 0.1644, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 7.6, | |
| "grad_norm": 1.1482549905776978, | |
| "learning_rate": 1.7630840681998068e-05, | |
| "loss": 0.1297, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 7.68, | |
| "grad_norm": 3.109949827194214, | |
| "learning_rate": 1.755816081049501e-05, | |
| "loss": 0.1812, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 7.76, | |
| "grad_norm": 1.2507814168930054, | |
| "learning_rate": 1.7484537571897943e-05, | |
| "loss": 0.1331, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 7.84, | |
| "grad_norm": 2.898353338241577, | |
| "learning_rate": 1.740998015544514e-05, | |
| "loss": 0.1625, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 7.92, | |
| "grad_norm": 2.2018096446990967, | |
| "learning_rate": 1.7334497866973716e-05, | |
| "loss": 0.1441, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 2.1120376586914062, | |
| "learning_rate": 1.725810012775808e-05, | |
| "loss": 0.1804, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 8.08, | |
| "grad_norm": 1.815804362297058, | |
| "learning_rate": 1.7180796473334075e-05, | |
| "loss": 0.1145, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 8.16, | |
| "grad_norm": 2.5671639442443848, | |
| "learning_rate": 1.7102596552308765e-05, | |
| "loss": 0.1094, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 8.24, | |
| "grad_norm": 0.9062528610229492, | |
| "learning_rate": 1.7023510125156173e-05, | |
| "loss": 0.0924, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 8.32, | |
| "grad_norm": 3.5460164546966553, | |
| "learning_rate": 1.6943547062999027e-05, | |
| "loss": 0.1323, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 8.4, | |
| "grad_norm": 2.856273889541626, | |
| "learning_rate": 1.6862717346376706e-05, | |
| "loss": 0.0938, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 8.48, | |
| "grad_norm": 3.177924633026123, | |
| "learning_rate": 1.6781031063999515e-05, | |
| "loss": 0.1204, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 8.56, | |
| "grad_norm": 1.3371851444244385, | |
| "learning_rate": 1.6698498411489477e-05, | |
| "loss": 0.1342, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 8.64, | |
| "grad_norm": 2.0833916664123535, | |
| "learning_rate": 1.6615129690107773e-05, | |
| "loss": 0.1337, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 8.72, | |
| "grad_norm": 2.4857378005981445, | |
| "learning_rate": 1.6530935305469e-05, | |
| "loss": 0.1233, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 8.8, | |
| "grad_norm": 1.7040643692016602, | |
| "learning_rate": 1.6445925766242392e-05, | |
| "loss": 0.121, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 8.88, | |
| "grad_norm": 1.2290680408477783, | |
| "learning_rate": 1.6360111682840184e-05, | |
| "loss": 0.1037, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 8.96, | |
| "grad_norm": 3.9770867824554443, | |
| "learning_rate": 1.62735037660933e-05, | |
| "loss": 0.1133, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 9.04, | |
| "grad_norm": 1.0533943176269531, | |
| "learning_rate": 1.618611282591446e-05, | |
| "loss": 0.0943, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 9.12, | |
| "grad_norm": 1.9520882368087769, | |
| "learning_rate": 1.609794976994897e-05, | |
| "loss": 0.0725, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 9.2, | |
| "grad_norm": 3.610280990600586, | |
| "learning_rate": 1.600902560221329e-05, | |
| "loss": 0.1015, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 9.28, | |
| "grad_norm": 1.3682546615600586, | |
| "learning_rate": 1.5919351421721548e-05, | |
| "loss": 0.0945, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 9.36, | |
| "grad_norm": 2.125917673110962, | |
| "learning_rate": 1.5828938421100266e-05, | |
| "loss": 0.0733, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 9.44, | |
| "grad_norm": 2.5206658840179443, | |
| "learning_rate": 1.5737797885191316e-05, | |
| "loss": 0.0646, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 9.52, | |
| "grad_norm": 2.5717523097991943, | |
| "learning_rate": 1.5645941189643444e-05, | |
| "loss": 0.0836, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 9.6, | |
| "grad_norm": 2.1724965572357178, | |
| "learning_rate": 1.55533797994924e-05, | |
| "loss": 0.0933, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 9.68, | |
| "grad_norm": 3.0643951892852783, | |
| "learning_rate": 1.546012526772996e-05, | |
| "loss": 0.083, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 9.76, | |
| "grad_norm": 1.3585115671157837, | |
| "learning_rate": 1.5366189233861933e-05, | |
| "loss": 0.0804, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 9.84, | |
| "grad_norm": 2.106403112411499, | |
| "learning_rate": 1.5271583422455373e-05, | |
| "loss": 0.0941, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 9.92, | |
| "grad_norm": 1.7128456830978394, | |
| "learning_rate": 1.5176319641675213e-05, | |
| "loss": 0.0668, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 1.7037241458892822, | |
| "learning_rate": 1.5080409781810406e-05, | |
| "loss": 0.1043, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 10.08, | |
| "grad_norm": 0.8989495038986206, | |
| "learning_rate": 1.4983865813789869e-05, | |
| "loss": 0.0521, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 10.16, | |
| "grad_norm": 1.3013043403625488, | |
| "learning_rate": 1.488669978768833e-05, | |
| "loss": 0.0647, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 10.24, | |
| "grad_norm": 0.6377755999565125, | |
| "learning_rate": 1.47889238312223e-05, | |
| "loss": 0.0367, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 10.32, | |
| "grad_norm": 2.2679171562194824, | |
| "learning_rate": 1.4690550148236371e-05, | |
| "loss": 0.0769, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 10.4, | |
| "grad_norm": 2.3292899131774902, | |
| "learning_rate": 1.4591591017179993e-05, | |
| "loss": 0.0908, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 10.48, | |
| "grad_norm": 1.8322817087173462, | |
| "learning_rate": 1.4492058789574948e-05, | |
| "loss": 0.0551, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 10.56, | |
| "grad_norm": 5.285430431365967, | |
| "learning_rate": 1.4391965888473705e-05, | |
| "loss": 0.0584, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 10.64, | |
| "grad_norm": 0.768731415271759, | |
| "learning_rate": 1.4291324806908846e-05, | |
| "loss": 0.0833, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 10.72, | |
| "grad_norm": 1.1357346773147583, | |
| "learning_rate": 1.419014810633374e-05, | |
| "loss": 0.0596, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 10.8, | |
| "grad_norm": 0.9390424489974976, | |
| "learning_rate": 1.408844841505473e-05, | |
| "loss": 0.0436, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 10.88, | |
| "grad_norm": 0.775092363357544, | |
| "learning_rate": 1.3986238426654894e-05, | |
| "loss": 0.0663, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 10.96, | |
| "grad_norm": 1.6158931255340576, | |
| "learning_rate": 1.3883530898409736e-05, | |
| "loss": 0.0438, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 11.04, | |
| "grad_norm": 1.8279812335968018, | |
| "learning_rate": 1.3780338649694874e-05, | |
| "loss": 0.0944, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 11.12, | |
| "grad_norm": 1.8661407232284546, | |
| "learning_rate": 1.3676674560386018e-05, | |
| "loss": 0.0398, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 11.2, | |
| "grad_norm": 1.8592077493667603, | |
| "learning_rate": 1.357255156925136e-05, | |
| "loss": 0.0561, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 11.28, | |
| "grad_norm": 3.479532241821289, | |
| "learning_rate": 1.3467982672336633e-05, | |
| "loss": 0.0408, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 11.36, | |
| "grad_norm": 2.4556691646575928, | |
| "learning_rate": 1.336298092134302e-05, | |
| "loss": 0.0603, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 11.44, | |
| "grad_norm": 2.028334379196167, | |
| "learning_rate": 1.325755942199812e-05, | |
| "loss": 0.0423, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 11.52, | |
| "grad_norm": 0.6319141387939453, | |
| "learning_rate": 1.3151731332420152e-05, | |
| "loss": 0.0717, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 11.6, | |
| "grad_norm": 1.3395280838012695, | |
| "learning_rate": 1.3045509861475645e-05, | |
| "loss": 0.0505, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 11.68, | |
| "grad_norm": 2.619269847869873, | |
| "learning_rate": 1.293890826713077e-05, | |
| "loss": 0.0477, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 11.76, | |
| "grad_norm": 2.6248464584350586, | |
| "learning_rate": 1.2831939854796567e-05, | |
| "loss": 0.0426, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 11.84, | |
| "grad_norm": 1.672989845275879, | |
| "learning_rate": 1.2724617975668229e-05, | |
| "loss": 0.0533, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 11.92, | |
| "grad_norm": 5.669078350067139, | |
| "learning_rate": 1.2616956025058688e-05, | |
| "loss": 0.0413, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "grad_norm": 2.987652063369751, | |
| "learning_rate": 1.2508967440726689e-05, | |
| "loss": 0.0442, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 12.08, | |
| "grad_norm": 1.2909622192382812, | |
| "learning_rate": 1.2400665701199541e-05, | |
| "loss": 0.0565, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 12.16, | |
| "grad_norm": 0.264878511428833, | |
| "learning_rate": 1.2292064324090842e-05, | |
| "loss": 0.0243, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 12.24, | |
| "grad_norm": 2.0698373317718506, | |
| "learning_rate": 1.2183176864413262e-05, | |
| "loss": 0.0241, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 12.32, | |
| "grad_norm": 0.20062156021595, | |
| "learning_rate": 1.2074016912886683e-05, | |
| "loss": 0.0204, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 12.4, | |
| "grad_norm": 0.5076002478599548, | |
| "learning_rate": 1.1964598094241884e-05, | |
| "loss": 0.0462, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 12.48, | |
| "grad_norm": 1.1509472131729126, | |
| "learning_rate": 1.1854934065519986e-05, | |
| "loss": 0.0302, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 12.56, | |
| "grad_norm": 1.603930950164795, | |
| "learning_rate": 1.1745038514367853e-05, | |
| "loss": 0.0282, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 12.64, | |
| "grad_norm": 0.9757488965988159, | |
| "learning_rate": 1.1634925157329668e-05, | |
| "loss": 0.0387, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 12.72, | |
| "grad_norm": 1.058209776878357, | |
| "learning_rate": 1.1524607738134928e-05, | |
| "loss": 0.0445, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 12.8, | |
| "grad_norm": 2.3291449546813965, | |
| "learning_rate": 1.141410002598301e-05, | |
| "loss": 0.0572, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 12.88, | |
| "grad_norm": 0.8108656406402588, | |
| "learning_rate": 1.1303415813824599e-05, | |
| "loss": 0.0337, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 12.96, | |
| "grad_norm": 6.551490783691406, | |
| "learning_rate": 1.1192568916640107e-05, | |
| "loss": 0.0337, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 13.04, | |
| "grad_norm": 0.2524939477443695, | |
| "learning_rate": 1.1081573169715379e-05, | |
| "loss": 0.0218, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 13.12, | |
| "grad_norm": 0.17596031725406647, | |
| "learning_rate": 1.0970442426914847e-05, | |
| "loss": 0.0132, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 13.2, | |
| "grad_norm": 0.7680679559707642, | |
| "learning_rate": 1.085919055895237e-05, | |
| "loss": 0.0207, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 13.28, | |
| "grad_norm": 0.24066166579723358, | |
| "learning_rate": 1.0747831451659967e-05, | |
| "loss": 0.0142, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 13.36, | |
| "grad_norm": 3.3667285442352295, | |
| "learning_rate": 1.0636379004254665e-05, | |
| "loss": 0.0183, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 13.44, | |
| "grad_norm": 1.8831534385681152, | |
| "learning_rate": 1.0524847127603677e-05, | |
| "loss": 0.0253, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 13.52, | |
| "grad_norm": 1.329085111618042, | |
| "learning_rate": 1.0413249742488132e-05, | |
| "loss": 0.0118, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 13.6, | |
| "grad_norm": 2.0776255130767822, | |
| "learning_rate": 1.030160077786556e-05, | |
| "loss": 0.0128, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 13.68, | |
| "grad_norm": 1.02082097530365, | |
| "learning_rate": 1.0189914169131341e-05, | |
| "loss": 0.0406, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 13.76, | |
| "grad_norm": 0.7947062253952026, | |
| "learning_rate": 1.0078203856379394e-05, | |
| "loss": 0.0183, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 13.84, | |
| "grad_norm": 0.3098564147949219, | |
| "learning_rate": 9.966483782662233e-06, | |
| "loss": 0.0251, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 13.92, | |
| "grad_norm": 0.4250534772872925, | |
| "learning_rate": 9.854767892250692e-06, | |
| "loss": 0.0394, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "grad_norm": 1.0927317142486572, | |
| "learning_rate": 9.743070128893452e-06, | |
| "loss": 0.0187, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 14.08, | |
| "grad_norm": 0.5906019806861877, | |
| "learning_rate": 9.631404434076687e-06, | |
| "loss": 0.0042, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 14.16, | |
| "grad_norm": 0.4497601389884949, | |
| "learning_rate": 9.519784745283956e-06, | |
| "loss": 0.0402, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 14.24, | |
| "grad_norm": 0.2257859855890274, | |
| "learning_rate": 9.408224994256603e-06, | |
| "loss": 0.0039, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 14.32, | |
| "grad_norm": 0.9509989023208618, | |
| "learning_rate": 9.296739105254869e-06, | |
| "loss": 0.0156, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 14.4, | |
| "grad_norm": 1.7652183771133423, | |
| "learning_rate": 9.185340993319977e-06, | |
| "loss": 0.0121, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 14.48, | |
| "grad_norm": 8.208316802978516, | |
| "learning_rate": 9.074044562537284e-06, | |
| "loss": 0.0389, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 14.56, | |
| "grad_norm": 0.13252846896648407, | |
| "learning_rate": 8.962863704300893e-06, | |
| "loss": 0.011, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 14.64, | |
| "grad_norm": 0.23998035490512848, | |
| "learning_rate": 8.851812295579789e-06, | |
| "loss": 0.0248, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 14.72, | |
| "grad_norm": 0.10990134626626968, | |
| "learning_rate": 8.740904197185794e-06, | |
| "loss": 0.0204, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 14.8, | |
| "grad_norm": 3.542644500732422, | |
| "learning_rate": 8.630153252043543e-06, | |
| "loss": 0.0163, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 14.88, | |
| "grad_norm": 0.7273716926574707, | |
| "learning_rate": 8.519573283462688e-06, | |
| "loss": 0.0068, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 14.96, | |
| "grad_norm": 0.036577966064214706, | |
| "learning_rate": 8.409178093412549e-06, | |
| "loss": 0.0084, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 15.04, | |
| "grad_norm": 0.1642957627773285, | |
| "learning_rate": 8.298981460799426e-06, | |
| "loss": 0.0093, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 15.12, | |
| "grad_norm": 0.11017989367246628, | |
| "learning_rate": 8.188997139746807e-06, | |
| "loss": 0.0076, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 15.2, | |
| "grad_norm": 0.04475900158286095, | |
| "learning_rate": 8.079238857878631e-06, | |
| "loss": 0.0059, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 15.28, | |
| "grad_norm": 1.0872098207473755, | |
| "learning_rate": 7.969720314605915e-06, | |
| "loss": 0.0051, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 15.36, | |
| "grad_norm": 0.32729968428611755, | |
| "learning_rate": 7.860455179416837e-06, | |
| "loss": 0.0016, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 15.44, | |
| "grad_norm": 1.3896279335021973, | |
| "learning_rate": 7.751457090170616e-06, | |
| "loss": 0.0066, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 15.52, | |
| "grad_norm": 0.24193909764289856, | |
| "learning_rate": 7.642739651395295e-06, | |
| "loss": 0.0049, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 15.6, | |
| "grad_norm": 0.0498543456196785, | |
| "learning_rate": 7.534316432589706e-06, | |
| "loss": 0.0065, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 15.68, | |
| "grad_norm": 0.053042277693748474, | |
| "learning_rate": 7.426200966529795e-06, | |
| "loss": 0.0096, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 15.76, | |
| "grad_norm": 0.6754854917526245, | |
| "learning_rate": 7.318406747579556e-06, | |
| "loss": 0.0074, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 15.84, | |
| "grad_norm": 0.05438900366425514, | |
| "learning_rate": 7.210947230006713e-06, | |
| "loss": 0.0025, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 15.92, | |
| "grad_norm": 0.0401092991232872, | |
| "learning_rate": 7.103835826303451e-06, | |
| "loss": 0.0022, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "grad_norm": 0.05556059256196022, | |
| "learning_rate": 6.997085905512346e-06, | |
| "loss": 0.0049, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 16.08, | |
| "grad_norm": 0.07605039328336716, | |
| "learning_rate": 6.8907107915577075e-06, | |
| "loss": 0.0051, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 16.16, | |
| "grad_norm": 0.07472195476293564, | |
| "learning_rate": 6.7847237615825636e-06, | |
| "loss": 0.0031, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 16.24, | |
| "grad_norm": 0.203807532787323, | |
| "learning_rate": 6.6791380442914866e-06, | |
| "loss": 0.0014, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 16.32, | |
| "grad_norm": 0.08696573972702026, | |
| "learning_rate": 6.573966818299461e-06, | |
| "loss": 0.0024, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 16.4, | |
| "grad_norm": 0.025404011830687523, | |
| "learning_rate": 6.469223210486992e-06, | |
| "loss": 0.0007, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 16.48, | |
| "grad_norm": 0.05262196436524391, | |
| "learning_rate": 6.364920294361701e-06, | |
| "loss": 0.0006, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 16.56, | |
| "grad_norm": 0.029066545888781548, | |
| "learning_rate": 6.261071088426546e-06, | |
| "loss": 0.0004, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 16.64, | |
| "grad_norm": 0.02618943341076374, | |
| "learning_rate": 6.1576885545549355e-06, | |
| "loss": 0.0022, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 16.72, | |
| "grad_norm": 0.06179804354906082, | |
| "learning_rate": 6.054785596372894e-06, | |
| "loss": 0.0005, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 16.8, | |
| "grad_norm": 0.014377947896718979, | |
| "learning_rate": 5.952375057648509e-06, | |
| "loss": 0.0028, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 16.88, | |
| "grad_norm": 0.039646223187446594, | |
| "learning_rate": 5.850469720688847e-06, | |
| "loss": 0.001, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 16.96, | |
| "grad_norm": 0.44483599066734314, | |
| "learning_rate": 5.74908230474453e-06, | |
| "loss": 0.0044, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 17.04, | |
| "grad_norm": 0.024487623944878578, | |
| "learning_rate": 5.648225464422189e-06, | |
| "loss": 0.0007, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 17.12, | |
| "grad_norm": 0.0326782651245594, | |
| "learning_rate": 5.547911788105001e-06, | |
| "loss": 0.0008, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 17.2, | |
| "grad_norm": 0.015867168083786964, | |
| "learning_rate": 5.4481537963814675e-06, | |
| "loss": 0.0004, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 17.28, | |
| "grad_norm": 0.022902317345142365, | |
| "learning_rate": 5.348963940482663e-06, | |
| "loss": 0.0003, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 17.36, | |
| "grad_norm": 0.024737635627388954, | |
| "learning_rate": 5.25035460072814e-06, | |
| "loss": 0.0023, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 17.44, | |
| "grad_norm": 0.019413290545344353, | |
| "learning_rate": 5.15233808498071e-06, | |
| "loss": 0.0004, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 17.52, | |
| "grad_norm": 0.01702185347676277, | |
| "learning_rate": 5.054926627110208e-06, | |
| "loss": 0.0004, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 17.6, | |
| "grad_norm": 2.464230537414551, | |
| "learning_rate": 4.9581323854665695e-06, | |
| "loss": 0.0049, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 17.68, | |
| "grad_norm": 0.013668897561728954, | |
| "learning_rate": 4.861967441362262e-06, | |
| "loss": 0.0004, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 17.76, | |
| "grad_norm": 0.014037170447409153, | |
| "learning_rate": 4.766443797564375e-06, | |
| "loss": 0.0004, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 17.84, | |
| "grad_norm": 0.011271145194768906, | |
| "learning_rate": 4.671573376796511e-06, | |
| "loss": 0.0005, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 17.92, | |
| "grad_norm": 0.015100213699042797, | |
| "learning_rate": 4.57736802025065e-06, | |
| "loss": 0.0007, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "grad_norm": 0.1259368360042572, | |
| "learning_rate": 4.48383948610919e-06, | |
| "loss": 0.0003, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 18.08, | |
| "grad_norm": 0.04630790278315544, | |
| "learning_rate": 4.390999448077375e-06, | |
| "loss": 0.0004, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 18.16, | |
| "grad_norm": 0.03695525601506233, | |
| "learning_rate": 4.298859493926235e-06, | |
| "loss": 0.0003, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 18.24, | |
| "grad_norm": 0.00454062782227993, | |
| "learning_rate": 4.207431124046267e-06, | |
| "loss": 0.0003, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 18.32, | |
| "grad_norm": 0.016318323090672493, | |
| "learning_rate": 4.116725750012035e-06, | |
| "loss": 0.0003, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 18.4, | |
| "grad_norm": 0.02545909956097603, | |
| "learning_rate": 4.026754693157816e-06, | |
| "loss": 0.0003, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 18.48, | |
| "grad_norm": 0.014473868533968925, | |
| "learning_rate": 3.937529183164562e-06, | |
| "loss": 0.0003, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 18.56, | |
| "grad_norm": 0.014056684449315071, | |
| "learning_rate": 3.84906035665826e-06, | |
| "loss": 0.0002, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 18.64, | |
| "grad_norm": 0.013438849709928036, | |
| "learning_rate": 3.7613592558199162e-06, | |
| "loss": 0.0002, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 18.72, | |
| "grad_norm": 0.01617852970957756, | |
| "learning_rate": 3.6744368270073393e-06, | |
| "loss": 0.0007, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 18.8, | |
| "grad_norm": 0.013655665330588818, | |
| "learning_rate": 3.5883039193888914e-06, | |
| "loss": 0.0007, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 18.88, | |
| "grad_norm": 0.014764860272407532, | |
| "learning_rate": 3.502971283589326e-06, | |
| "loss": 0.0005, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 18.96, | |
| "grad_norm": 0.03937000408768654, | |
| "learning_rate": 3.418449570347986e-06, | |
| "loss": 0.0003, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 19.04, | |
| "grad_norm": 0.014079142361879349, | |
| "learning_rate": 3.334749329189415e-06, | |
| "loss": 0.0003, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 19.12, | |
| "grad_norm": 0.016212107613682747, | |
| "learning_rate": 3.2518810071066363e-06, | |
| "loss": 0.0002, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 19.2, | |
| "grad_norm": 0.015338711440563202, | |
| "learning_rate": 3.1698549472572203e-06, | |
| "loss": 0.0003, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 19.28, | |
| "grad_norm": 0.02485400065779686, | |
| "learning_rate": 3.0886813876723075e-06, | |
| "loss": 0.0002, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 19.36, | |
| "grad_norm": 0.017927484586834908, | |
| "learning_rate": 3.0083704599787423e-06, | |
| "loss": 0.0003, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 19.44, | |
| "grad_norm": 0.012980809435248375, | |
| "learning_rate": 2.9289321881345257e-06, | |
| "loss": 0.0005, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 19.52, | |
| "grad_norm": 0.03192094713449478, | |
| "learning_rate": 2.850376487177656e-06, | |
| "loss": 0.0002, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 19.6, | |
| "grad_norm": 0.019058849662542343, | |
| "learning_rate": 2.7727131619886017e-06, | |
| "loss": 0.0002, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 19.68, | |
| "grad_norm": 0.012099268846213818, | |
| "learning_rate": 2.6959519060665195e-06, | |
| "loss": 0.0002, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 19.76, | |
| "grad_norm": 0.010341337881982327, | |
| "learning_rate": 2.6201023003193437e-06, | |
| "loss": 0.0002, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 19.84, | |
| "grad_norm": 0.014265513978898525, | |
| "learning_rate": 2.545173811867977e-06, | |
| "loss": 0.0002, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 19.92, | |
| "grad_norm": 0.01814926601946354, | |
| "learning_rate": 2.471175792864642e-06, | |
| "loss": 0.001, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 0.011018522083759308, | |
| "learning_rate": 2.3981174793255956e-06, | |
| "loss": 0.0002, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 20.08, | |
| "grad_norm": 0.015308992005884647, | |
| "learning_rate": 2.3260079899783492e-06, | |
| "loss": 0.0008, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 20.16, | |
| "grad_norm": 0.010280388407409191, | |
| "learning_rate": 2.254856325123529e-06, | |
| "loss": 0.0002, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 20.24, | |
| "grad_norm": 0.011187535710632801, | |
| "learning_rate": 2.1846713655114836e-06, | |
| "loss": 0.0002, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 20.32, | |
| "grad_norm": 0.015644947066903114, | |
| "learning_rate": 2.115461871233867e-06, | |
| "loss": 0.0002, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 20.4, | |
| "grad_norm": 0.00875458400696516, | |
| "learning_rate": 2.0472364806302313e-06, | |
| "loss": 0.0002, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 20.48, | |
| "grad_norm": 0.017235323786735535, | |
| "learning_rate": 1.9800037092098477e-06, | |
| "loss": 0.0002, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 20.56, | |
| "grad_norm": 0.03400786966085434, | |
| "learning_rate": 1.9137719485888527e-06, | |
| "loss": 0.0002, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 20.64, | |
| "grad_norm": 0.011186143383383751, | |
| "learning_rate": 1.8485494654428482e-06, | |
| "loss": 0.0002, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 20.72, | |
| "grad_norm": 0.008517293259501457, | |
| "learning_rate": 1.784344400475093e-06, | |
| "loss": 0.0002, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 20.8, | |
| "grad_norm": 0.010979237034916878, | |
| "learning_rate": 1.7211647674004483e-06, | |
| "loss": 0.0002, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 20.88, | |
| "grad_norm": 0.01790277473628521, | |
| "learning_rate": 1.659018451945128e-06, | |
| "loss": 0.0002, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 20.96, | |
| "grad_norm": 0.01020762138068676, | |
| "learning_rate": 1.5979132108624572e-06, | |
| "loss": 0.0002, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 21.04, | |
| "grad_norm": 0.014280925504863262, | |
| "learning_rate": 1.5378566709647225e-06, | |
| "loss": 0.0003, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 21.12, | |
| "grad_norm": 0.012542898766696453, | |
| "learning_rate": 1.4788563281712253e-06, | |
| "loss": 0.0002, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 21.2, | |
| "grad_norm": 0.0072167362086474895, | |
| "learning_rate": 1.420919546572691e-06, | |
| "loss": 0.0002, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 21.28, | |
| "grad_norm": 0.010835397057235241, | |
| "learning_rate": 1.364053557512126e-06, | |
| "loss": 0.0002, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 21.36, | |
| "grad_norm": 0.010230830870568752, | |
| "learning_rate": 1.308265458682234e-06, | |
| "loss": 0.0004, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 21.44, | |
| "grad_norm": 0.00813596136868, | |
| "learning_rate": 1.2535622132395242e-06, | |
| "loss": 0.0005, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 21.52, | |
| "grad_norm": 0.02058684267103672, | |
| "learning_rate": 1.1999506489352208e-06, | |
| "loss": 0.0002, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 21.6, | |
| "grad_norm": 0.022641103714704514, | |
| "learning_rate": 1.1474374572630432e-06, | |
| "loss": 0.0002, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 21.68, | |
| "grad_norm": 0.015892324969172478, | |
| "learning_rate": 1.0960291926240263e-06, | |
| "loss": 0.0002, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 21.76, | |
| "grad_norm": 0.01219907309859991, | |
| "learning_rate": 1.0457322715084305e-06, | |
| "loss": 0.0002, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 21.84, | |
| "grad_norm": 0.009200166910886765, | |
| "learning_rate": 9.965529716948684e-07, | |
| "loss": 0.0002, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 21.92, | |
| "grad_norm": 0.00976441241800785, | |
| "learning_rate": 9.484974314667561e-07, | |
| "loss": 0.0002, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "grad_norm": 0.010954627767205238, | |
| "learning_rate": 9.015716488461656e-07, | |
| "loss": 0.0002, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 22.08, | |
| "grad_norm": 0.016307951882481575, | |
| "learning_rate": 8.557814808451737e-07, | |
| "loss": 0.0002, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 22.16, | |
| "grad_norm": 0.009793213568627834, | |
| "learning_rate": 8.11132642734841e-07, | |
| "loss": 0.0001, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 22.24, | |
| "grad_norm": 0.023596106097102165, | |
| "learning_rate": 7.676307073318479e-07, | |
| "loss": 0.0002, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 22.32, | |
| "grad_norm": 0.011904028244316578, | |
| "learning_rate": 7.252811043029373e-07, | |
| "loss": 0.0004, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 22.4, | |
| "grad_norm": 0.013149023056030273, | |
| "learning_rate": 6.840891194872112e-07, | |
| "loss": 0.0002, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 22.48, | |
| "grad_norm": 0.011410288512706757, | |
| "learning_rate": 6.440598942363796e-07, | |
| "loss": 0.0002, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 22.56, | |
| "grad_norm": 0.008377713151276112, | |
| "learning_rate": 6.051984247730447e-07, | |
| "loss": 0.0002, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 22.64, | |
| "grad_norm": 0.19204267859458923, | |
| "learning_rate": 5.675095615671144e-07, | |
| "loss": 0.0005, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 22.72, | |
| "grad_norm": 0.017059409990906715, | |
| "learning_rate": 5.309980087303713e-07, | |
| "loss": 0.0002, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 22.8, | |
| "grad_norm": 0.00994216650724411, | |
| "learning_rate": 4.956683234293491e-07, | |
| "loss": 0.0002, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 22.88, | |
| "grad_norm": 0.017344018444418907, | |
| "learning_rate": 4.61524915316528e-07, | |
| "loss": 0.0001, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 22.96, | |
| "grad_norm": 0.009385120123624802, | |
| "learning_rate": 4.285720459799425e-07, | |
| "loss": 0.0002, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 23.04, | |
| "grad_norm": 0.009774768725037575, | |
| "learning_rate": 3.9681382841128323e-07, | |
| "loss": 0.0002, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 23.12, | |
| "grad_norm": 0.13636986911296844, | |
| "learning_rate": 3.6625422649252617e-07, | |
| "loss": 0.0005, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 23.2, | |
| "grad_norm": 0.011067167855799198, | |
| "learning_rate": 3.368970545011874e-07, | |
| "loss": 0.0002, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 23.28, | |
| "grad_norm": 0.008906451985239983, | |
| "learning_rate": 3.0874597663425046e-07, | |
| "loss": 0.0002, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 23.36, | |
| "grad_norm": 0.011505813337862492, | |
| "learning_rate": 2.818045065508168e-07, | |
| "loss": 0.0001, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 23.44, | |
| "grad_norm": 0.01766866073012352, | |
| "learning_rate": 2.560760069335511e-07, | |
| "loss": 0.0002, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 23.52, | |
| "grad_norm": 0.01433554571121931, | |
| "learning_rate": 2.315636890689743e-07, | |
| "loss": 0.0002, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 23.6, | |
| "grad_norm": 0.009151219390332699, | |
| "learning_rate": 2.082706124466416e-07, | |
| "loss": 0.0002, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 23.68, | |
| "grad_norm": 0.014051095582544804, | |
| "learning_rate": 1.8619968437727954e-07, | |
| "loss": 0.0002, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 23.76, | |
| "grad_norm": 0.01479571033269167, | |
| "learning_rate": 1.6535365962991322e-07, | |
| "loss": 0.0002, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 23.84, | |
| "grad_norm": 0.17555338144302368, | |
| "learning_rate": 1.4573514008802693e-07, | |
| "loss": 0.0005, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 23.92, | |
| "grad_norm": 0.01707664132118225, | |
| "learning_rate": 1.2734657442481368e-07, | |
| "loss": 0.0002, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "grad_norm": 0.023542573675513268, | |
| "learning_rate": 1.1019025779754666e-07, | |
| "loss": 0.0002, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 24.08, | |
| "grad_norm": 0.008962544612586498, | |
| "learning_rate": 9.426833156111038e-08, | |
| "loss": 0.0002, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 24.16, | |
| "grad_norm": 0.009664681740105152, | |
| "learning_rate": 7.958278300072453e-08, | |
| "loss": 0.0002, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 24.24, | |
| "grad_norm": 0.0093710171058774, | |
| "learning_rate": 6.613544508391024e-08, | |
| "loss": 0.0001, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 24.32, | |
| "grad_norm": 0.016148734837770462, | |
| "learning_rate": 5.392799623170186e-08, | |
| "loss": 0.0002, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 24.4, | |
| "grad_norm": 0.018474169075489044, | |
| "learning_rate": 4.2961960109160205e-08, | |
| "loss": 0.0002, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 24.48, | |
| "grad_norm": 0.007969476282596588, | |
| "learning_rate": 3.323870543519458e-08, | |
| "loss": 0.0001, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 24.56, | |
| "grad_norm": 0.009026006795465946, | |
| "learning_rate": 2.475944581173173e-08, | |
| "loss": 0.0002, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 24.64, | |
| "grad_norm": 0.016195567324757576, | |
| "learning_rate": 1.752523957223362e-08, | |
| "loss": 0.0002, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 24.72, | |
| "grad_norm": 0.004977029282599688, | |
| "learning_rate": 1.1536989649608699e-08, | |
| "loss": 0.0002, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 24.8, | |
| "grad_norm": 0.008723889477550983, | |
| "learning_rate": 6.795443463509799e-09, | |
| "loss": 0.0002, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 24.88, | |
| "grad_norm": 0.01094700489193201, | |
| "learning_rate": 3.3011928270454406e-09, | |
| "loss": 0.0004, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 24.96, | |
| "grad_norm": 0.009475067257881165, | |
| "learning_rate": 1.054673872914469e-09, | |
| "loss": 0.0005, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "step": 3125, | |
| "total_flos": 1.246578685771776e+17, | |
| "train_loss": 0.15220735648881645, | |
| "train_runtime": 31927.3271, | |
| "train_samples_per_second": 0.392, | |
| "train_steps_per_second": 0.098 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3125, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 25, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.246578685771776e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |