| { | |
| "best_global_step": 6000, | |
| "best_metric": 0.15993832051753998, | |
| "best_model_checkpoint": "/data/alamparan/mattext_ckpt/results/2026-02-05/13-04-49/pretrain/checkpoints/robocrys_rep_test-pretrain/checkpoint-6000", | |
| "epoch": 1.938610662358643, | |
| "eval_steps": 50, | |
| "global_step": 6000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01615508885298869, | |
| "grad_norm": 1.0515121221542358, | |
| "learning_rate": 0.00019993667205169628, | |
| "loss": 6.0534, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.01615508885298869, | |
| "eval_loss": 4.074550151824951, | |
| "eval_runtime": 174.8477, | |
| "eval_samples_per_second": 108.689, | |
| "eval_steps_per_second": 2.265, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.03231017770597738, | |
| "grad_norm": 1.2968676090240479, | |
| "learning_rate": 0.00019987205169628432, | |
| "loss": 3.8424, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.03231017770597738, | |
| "eval_loss": 3.6214287281036377, | |
| "eval_runtime": 175.7646, | |
| "eval_samples_per_second": 108.122, | |
| "eval_steps_per_second": 2.253, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.048465266558966075, | |
| "grad_norm": 1.0487699508666992, | |
| "learning_rate": 0.0001998074313408724, | |
| "loss": 3.566, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.048465266558966075, | |
| "eval_loss": 3.430216073989868, | |
| "eval_runtime": 174.1519, | |
| "eval_samples_per_second": 109.123, | |
| "eval_steps_per_second": 2.274, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.06462035541195477, | |
| "grad_norm": 1.1100188493728638, | |
| "learning_rate": 0.00019974281098546044, | |
| "loss": 3.3895, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.06462035541195477, | |
| "eval_loss": 3.2904512882232666, | |
| "eval_runtime": 174.3178, | |
| "eval_samples_per_second": 109.019, | |
| "eval_steps_per_second": 2.272, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.08077544426494346, | |
| "grad_norm": 1.1103720664978027, | |
| "learning_rate": 0.00019967819063004846, | |
| "loss": 3.2487, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.08077544426494346, | |
| "eval_loss": 3.155512571334839, | |
| "eval_runtime": 175.7563, | |
| "eval_samples_per_second": 108.127, | |
| "eval_steps_per_second": 2.253, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.09693053311793215, | |
| "grad_norm": 0.9341705441474915, | |
| "learning_rate": 0.00019961357027463653, | |
| "loss": 3.1506, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.09693053311793215, | |
| "eval_loss": 3.034304618835449, | |
| "eval_runtime": 173.9011, | |
| "eval_samples_per_second": 109.28, | |
| "eval_steps_per_second": 2.277, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.11308562197092084, | |
| "grad_norm": 1.2189605236053467, | |
| "learning_rate": 0.00019954894991922457, | |
| "loss": 3.0199, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.11308562197092084, | |
| "eval_loss": 2.9081249237060547, | |
| "eval_runtime": 174.6288, | |
| "eval_samples_per_second": 108.825, | |
| "eval_steps_per_second": 2.268, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.12924071082390953, | |
| "grad_norm": 1.398779273033142, | |
| "learning_rate": 0.0001994843295638126, | |
| "loss": 2.9081, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.12924071082390953, | |
| "eval_loss": 2.8029439449310303, | |
| "eval_runtime": 175.8793, | |
| "eval_samples_per_second": 108.051, | |
| "eval_steps_per_second": 2.252, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.14539579967689822, | |
| "grad_norm": 1.3859950304031372, | |
| "learning_rate": 0.00019941970920840066, | |
| "loss": 2.7946, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.14539579967689822, | |
| "eval_loss": 2.647810935974121, | |
| "eval_runtime": 173.9196, | |
| "eval_samples_per_second": 109.269, | |
| "eval_steps_per_second": 2.277, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.16155088852988692, | |
| "grad_norm": 1.5629290342330933, | |
| "learning_rate": 0.0001993550888529887, | |
| "loss": 2.6013, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.16155088852988692, | |
| "eval_loss": 2.3850767612457275, | |
| "eval_runtime": 175.9348, | |
| "eval_samples_per_second": 108.017, | |
| "eval_steps_per_second": 2.251, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.1777059773828756, | |
| "grad_norm": 2.2329182624816895, | |
| "learning_rate": 0.00019929046849757675, | |
| "loss": 2.3045, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.1777059773828756, | |
| "eval_loss": 1.9701340198516846, | |
| "eval_runtime": 174.0917, | |
| "eval_samples_per_second": 109.161, | |
| "eval_steps_per_second": 2.275, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.1938610662358643, | |
| "grad_norm": 2.0642428398132324, | |
| "learning_rate": 0.0001992258481421648, | |
| "loss": 1.8478, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.1938610662358643, | |
| "eval_loss": 1.3872605562210083, | |
| "eval_runtime": 182.0251, | |
| "eval_samples_per_second": 104.403, | |
| "eval_steps_per_second": 2.176, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.210016155088853, | |
| "grad_norm": 1.663743495941162, | |
| "learning_rate": 0.00019916122778675284, | |
| "loss": 1.4464, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.210016155088853, | |
| "eval_loss": 1.0700706243515015, | |
| "eval_runtime": 184.3893, | |
| "eval_samples_per_second": 103.065, | |
| "eval_steps_per_second": 2.148, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.22617124394184168, | |
| "grad_norm": 1.1930618286132812, | |
| "learning_rate": 0.00019909660743134088, | |
| "loss": 1.1074, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.22617124394184168, | |
| "eval_loss": 0.8680539727210999, | |
| "eval_runtime": 178.1716, | |
| "eval_samples_per_second": 106.661, | |
| "eval_steps_per_second": 2.223, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.24232633279483037, | |
| "grad_norm": 1.1257418394088745, | |
| "learning_rate": 0.00019903198707592893, | |
| "loss": 0.9421, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.24232633279483037, | |
| "eval_loss": 0.7728319764137268, | |
| "eval_runtime": 176.3872, | |
| "eval_samples_per_second": 107.74, | |
| "eval_steps_per_second": 2.245, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.25848142164781907, | |
| "grad_norm": 0.774366021156311, | |
| "learning_rate": 0.00019896736672051697, | |
| "loss": 0.815, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.25848142164781907, | |
| "eval_loss": 0.7130251526832581, | |
| "eval_runtime": 177.1295, | |
| "eval_samples_per_second": 107.289, | |
| "eval_steps_per_second": 2.236, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.27463651050080773, | |
| "grad_norm": 0.8781099915504456, | |
| "learning_rate": 0.00019890274636510502, | |
| "loss": 0.7576, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.27463651050080773, | |
| "eval_loss": 0.6742915511131287, | |
| "eval_runtime": 175.5733, | |
| "eval_samples_per_second": 108.24, | |
| "eval_steps_per_second": 2.255, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.29079159935379645, | |
| "grad_norm": 0.7646985054016113, | |
| "learning_rate": 0.00019883812600969306, | |
| "loss": 0.7248, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.29079159935379645, | |
| "eval_loss": 0.6413119435310364, | |
| "eval_runtime": 182.9224, | |
| "eval_samples_per_second": 103.891, | |
| "eval_steps_per_second": 2.165, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.3069466882067851, | |
| "grad_norm": 0.7410991191864014, | |
| "learning_rate": 0.0001987735056542811, | |
| "loss": 0.706, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.3069466882067851, | |
| "eval_loss": 0.6033533215522766, | |
| "eval_runtime": 179.7077, | |
| "eval_samples_per_second": 105.75, | |
| "eval_steps_per_second": 2.204, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.32310177705977383, | |
| "grad_norm": 0.7484062910079956, | |
| "learning_rate": 0.00019870888529886915, | |
| "loss": 0.6663, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.32310177705977383, | |
| "eval_loss": 0.5805819034576416, | |
| "eval_runtime": 178.089, | |
| "eval_samples_per_second": 106.711, | |
| "eval_steps_per_second": 2.224, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.3392568659127625, | |
| "grad_norm": 0.749854326248169, | |
| "learning_rate": 0.0001986442649434572, | |
| "loss": 0.6597, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.3392568659127625, | |
| "eval_loss": 0.5619820356369019, | |
| "eval_runtime": 177.4502, | |
| "eval_samples_per_second": 107.095, | |
| "eval_steps_per_second": 2.232, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.3554119547657512, | |
| "grad_norm": 0.7001623511314392, | |
| "learning_rate": 0.00019857964458804524, | |
| "loss": 0.6189, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.3554119547657512, | |
| "eval_loss": 0.5405182242393494, | |
| "eval_runtime": 177.9338, | |
| "eval_samples_per_second": 106.804, | |
| "eval_steps_per_second": 2.226, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.3715670436187399, | |
| "grad_norm": 0.6307182312011719, | |
| "learning_rate": 0.00019851502423263328, | |
| "loss": 0.5879, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.3715670436187399, | |
| "eval_loss": 0.5224936604499817, | |
| "eval_runtime": 179.0782, | |
| "eval_samples_per_second": 106.121, | |
| "eval_steps_per_second": 2.211, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.3877221324717286, | |
| "grad_norm": 0.755315899848938, | |
| "learning_rate": 0.00019845040387722132, | |
| "loss": 0.5659, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.3877221324717286, | |
| "eval_loss": 0.4987003803253174, | |
| "eval_runtime": 180.4854, | |
| "eval_samples_per_second": 105.294, | |
| "eval_steps_per_second": 2.194, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.40387722132471726, | |
| "grad_norm": 0.6601123213768005, | |
| "learning_rate": 0.0001983857835218094, | |
| "loss": 0.5484, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.40387722132471726, | |
| "eval_loss": 0.48691466450691223, | |
| "eval_runtime": 185.1754, | |
| "eval_samples_per_second": 102.627, | |
| "eval_steps_per_second": 2.139, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.420032310177706, | |
| "grad_norm": 0.6862411499023438, | |
| "learning_rate": 0.0001983211631663974, | |
| "loss": 0.5404, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.420032310177706, | |
| "eval_loss": 0.4613674581050873, | |
| "eval_runtime": 184.3912, | |
| "eval_samples_per_second": 103.063, | |
| "eval_steps_per_second": 2.148, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.43618739903069464, | |
| "grad_norm": 0.705555260181427, | |
| "learning_rate": 0.00019825654281098546, | |
| "loss": 0.5047, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.43618739903069464, | |
| "eval_loss": 0.43498361110687256, | |
| "eval_runtime": 183.8266, | |
| "eval_samples_per_second": 103.38, | |
| "eval_steps_per_second": 2.154, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.45234248788368336, | |
| "grad_norm": 0.6185953617095947, | |
| "learning_rate": 0.00019819192245557353, | |
| "loss": 0.4912, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.45234248788368336, | |
| "eval_loss": 0.411850243806839, | |
| "eval_runtime": 174.5999, | |
| "eval_samples_per_second": 108.843, | |
| "eval_steps_per_second": 2.268, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.46849757673667203, | |
| "grad_norm": 0.6479601860046387, | |
| "learning_rate": 0.00019812730210016157, | |
| "loss": 0.4377, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.46849757673667203, | |
| "eval_loss": 0.4002995789051056, | |
| "eval_runtime": 174.5791, | |
| "eval_samples_per_second": 108.856, | |
| "eval_steps_per_second": 2.268, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.48465266558966075, | |
| "grad_norm": 0.7221212983131409, | |
| "learning_rate": 0.0001980626817447496, | |
| "loss": 0.4106, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.48465266558966075, | |
| "eval_loss": 0.38361039757728577, | |
| "eval_runtime": 176.2527, | |
| "eval_samples_per_second": 107.822, | |
| "eval_steps_per_second": 2.247, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.5008077544426495, | |
| "grad_norm": 0.6264716386795044, | |
| "learning_rate": 0.00019799806138933766, | |
| "loss": 0.416, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.5008077544426495, | |
| "eval_loss": 0.3711036145687103, | |
| "eval_runtime": 175.7893, | |
| "eval_samples_per_second": 108.107, | |
| "eval_steps_per_second": 2.253, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.5169628432956381, | |
| "grad_norm": 0.6347914934158325, | |
| "learning_rate": 0.0001979334410339257, | |
| "loss": 0.4034, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.5169628432956381, | |
| "eval_loss": 0.3516336679458618, | |
| "eval_runtime": 174.6119, | |
| "eval_samples_per_second": 108.836, | |
| "eval_steps_per_second": 2.268, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.5331179321486268, | |
| "grad_norm": 0.6225407123565674, | |
| "learning_rate": 0.00019786882067851372, | |
| "loss": 0.3762, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.5331179321486268, | |
| "eval_loss": 0.3361986577510834, | |
| "eval_runtime": 175.0444, | |
| "eval_samples_per_second": 108.567, | |
| "eval_steps_per_second": 2.262, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.5492730210016155, | |
| "grad_norm": 0.7712708711624146, | |
| "learning_rate": 0.0001978042003231018, | |
| "loss": 0.3669, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.5492730210016155, | |
| "eval_loss": 0.31636813282966614, | |
| "eval_runtime": 175.3287, | |
| "eval_samples_per_second": 108.391, | |
| "eval_steps_per_second": 2.259, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.5654281098546042, | |
| "grad_norm": 0.6715072989463806, | |
| "learning_rate": 0.00019773957996768984, | |
| "loss": 0.3441, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.5654281098546042, | |
| "eval_loss": 0.30088016390800476, | |
| "eval_runtime": 165.2219, | |
| "eval_samples_per_second": 115.021, | |
| "eval_steps_per_second": 2.397, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.5815831987075929, | |
| "grad_norm": 0.585584282875061, | |
| "learning_rate": 0.00019767495961227788, | |
| "loss": 0.3267, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.5815831987075929, | |
| "eval_loss": 0.28820616006851196, | |
| "eval_runtime": 163.4002, | |
| "eval_samples_per_second": 116.303, | |
| "eval_steps_per_second": 2.423, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.5977382875605816, | |
| "grad_norm": 0.7109031677246094, | |
| "learning_rate": 0.00019761033925686593, | |
| "loss": 0.3056, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.5977382875605816, | |
| "eval_loss": 0.2841907739639282, | |
| "eval_runtime": 163.4248, | |
| "eval_samples_per_second": 116.286, | |
| "eval_steps_per_second": 2.423, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.6138933764135702, | |
| "grad_norm": 0.5637331008911133, | |
| "learning_rate": 0.00019754571890145397, | |
| "loss": 0.3018, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.6138933764135702, | |
| "eval_loss": 0.27677446603775024, | |
| "eval_runtime": 168.0175, | |
| "eval_samples_per_second": 113.107, | |
| "eval_steps_per_second": 2.357, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.630048465266559, | |
| "grad_norm": 0.6805331110954285, | |
| "learning_rate": 0.00019748109854604202, | |
| "loss": 0.2974, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.630048465266559, | |
| "eval_loss": 0.2690126597881317, | |
| "eval_runtime": 169.2939, | |
| "eval_samples_per_second": 112.254, | |
| "eval_steps_per_second": 2.339, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.6462035541195477, | |
| "grad_norm": 0.5953163504600525, | |
| "learning_rate": 0.00019741647819063006, | |
| "loss": 0.2824, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.6462035541195477, | |
| "eval_loss": 0.2670688331127167, | |
| "eval_runtime": 165.9823, | |
| "eval_samples_per_second": 114.494, | |
| "eval_steps_per_second": 2.386, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.6623586429725363, | |
| "grad_norm": 0.6007310152053833, | |
| "learning_rate": 0.0001973518578352181, | |
| "loss": 0.2772, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.6623586429725363, | |
| "eval_loss": 0.26082342863082886, | |
| "eval_runtime": 171.6359, | |
| "eval_samples_per_second": 110.723, | |
| "eval_steps_per_second": 2.307, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.678513731825525, | |
| "grad_norm": 0.4502333700656891, | |
| "learning_rate": 0.00019728723747980615, | |
| "loss": 0.2689, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.678513731825525, | |
| "eval_loss": 0.2507447898387909, | |
| "eval_runtime": 165.8831, | |
| "eval_samples_per_second": 114.563, | |
| "eval_steps_per_second": 2.387, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.6946688206785138, | |
| "grad_norm": 0.5430248975753784, | |
| "learning_rate": 0.0001972226171243942, | |
| "loss": 0.2599, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.6946688206785138, | |
| "eval_loss": 0.247538760304451, | |
| "eval_runtime": 163.9496, | |
| "eval_samples_per_second": 115.914, | |
| "eval_steps_per_second": 2.415, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.7108239095315024, | |
| "grad_norm": 0.6169001460075378, | |
| "learning_rate": 0.00019715799676898224, | |
| "loss": 0.2568, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.7108239095315024, | |
| "eval_loss": 0.24606221914291382, | |
| "eval_runtime": 169.0704, | |
| "eval_samples_per_second": 112.403, | |
| "eval_steps_per_second": 2.342, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.7269789983844911, | |
| "grad_norm": 0.48222729563713074, | |
| "learning_rate": 0.00019709337641357028, | |
| "loss": 0.258, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.7269789983844911, | |
| "eval_loss": 0.24414058029651642, | |
| "eval_runtime": 165.7073, | |
| "eval_samples_per_second": 114.684, | |
| "eval_steps_per_second": 2.39, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.7431340872374798, | |
| "grad_norm": 0.5104192495346069, | |
| "learning_rate": 0.00019702875605815833, | |
| "loss": 0.2428, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.7431340872374798, | |
| "eval_loss": 0.2404615581035614, | |
| "eval_runtime": 168.1658, | |
| "eval_samples_per_second": 113.008, | |
| "eval_steps_per_second": 2.355, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.7592891760904685, | |
| "grad_norm": 0.4499678909778595, | |
| "learning_rate": 0.00019696413570274637, | |
| "loss": 0.2503, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.7592891760904685, | |
| "eval_loss": 0.23556502163410187, | |
| "eval_runtime": 166.7646, | |
| "eval_samples_per_second": 113.957, | |
| "eval_steps_per_second": 2.375, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.7754442649434572, | |
| "grad_norm": 0.5658778548240662, | |
| "learning_rate": 0.00019689951534733441, | |
| "loss": 0.2501, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.7754442649434572, | |
| "eval_loss": 0.2322016805410385, | |
| "eval_runtime": 165.6498, | |
| "eval_samples_per_second": 114.724, | |
| "eval_steps_per_second": 2.391, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.7915993537964459, | |
| "grad_norm": 0.5279048681259155, | |
| "learning_rate": 0.00019683489499192246, | |
| "loss": 0.2439, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.7915993537964459, | |
| "eval_loss": 0.22799938917160034, | |
| "eval_runtime": 172.0629, | |
| "eval_samples_per_second": 110.448, | |
| "eval_steps_per_second": 2.301, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.8077544426494345, | |
| "grad_norm": 0.5539276003837585, | |
| "learning_rate": 0.00019677027463651053, | |
| "loss": 0.2371, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.8077544426494345, | |
| "eval_loss": 0.22852076590061188, | |
| "eval_runtime": 164.2885, | |
| "eval_samples_per_second": 115.675, | |
| "eval_steps_per_second": 2.41, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.8239095315024233, | |
| "grad_norm": 0.46706199645996094, | |
| "learning_rate": 0.00019670565428109855, | |
| "loss": 0.2265, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.8239095315024233, | |
| "eval_loss": 0.22647793591022491, | |
| "eval_runtime": 166.7134, | |
| "eval_samples_per_second": 113.992, | |
| "eval_steps_per_second": 2.375, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.840064620355412, | |
| "grad_norm": 0.47346100211143494, | |
| "learning_rate": 0.0001966410339256866, | |
| "loss": 0.2389, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.840064620355412, | |
| "eval_loss": 0.22408178448677063, | |
| "eval_runtime": 163.986, | |
| "eval_samples_per_second": 115.888, | |
| "eval_steps_per_second": 2.415, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.8562197092084006, | |
| "grad_norm": 0.4981846809387207, | |
| "learning_rate": 0.00019657641357027466, | |
| "loss": 0.233, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.8562197092084006, | |
| "eval_loss": 0.21772241592407227, | |
| "eval_runtime": 164.9909, | |
| "eval_samples_per_second": 115.182, | |
| "eval_steps_per_second": 2.4, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.8723747980613893, | |
| "grad_norm": 0.39221659302711487, | |
| "learning_rate": 0.00019651179321486268, | |
| "loss": 0.2274, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.8723747980613893, | |
| "eval_loss": 0.21865881979465485, | |
| "eval_runtime": 164.8791, | |
| "eval_samples_per_second": 115.26, | |
| "eval_steps_per_second": 2.402, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.8885298869143781, | |
| "grad_norm": 0.3755420446395874, | |
| "learning_rate": 0.00019644717285945072, | |
| "loss": 0.2212, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.8885298869143781, | |
| "eval_loss": 0.2188514620065689, | |
| "eval_runtime": 163.9572, | |
| "eval_samples_per_second": 115.908, | |
| "eval_steps_per_second": 2.415, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.9046849757673667, | |
| "grad_norm": 0.4831067621707916, | |
| "learning_rate": 0.0001963825525040388, | |
| "loss": 0.2174, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.9046849757673667, | |
| "eval_loss": 0.2164352685213089, | |
| "eval_runtime": 164.722, | |
| "eval_samples_per_second": 115.37, | |
| "eval_steps_per_second": 2.404, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.9208400646203554, | |
| "grad_norm": 0.4211012125015259, | |
| "learning_rate": 0.00019631793214862684, | |
| "loss": 0.2193, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.9208400646203554, | |
| "eval_loss": 0.21268193423748016, | |
| "eval_runtime": 173.6453, | |
| "eval_samples_per_second": 109.442, | |
| "eval_steps_per_second": 2.281, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.9369951534733441, | |
| "grad_norm": 0.41934114694595337, | |
| "learning_rate": 0.00019625331179321486, | |
| "loss": 0.2146, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.9369951534733441, | |
| "eval_loss": 0.21176370978355408, | |
| "eval_runtime": 167.1164, | |
| "eval_samples_per_second": 113.717, | |
| "eval_steps_per_second": 2.37, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.9531502423263328, | |
| "grad_norm": 0.4452735483646393, | |
| "learning_rate": 0.00019618869143780293, | |
| "loss": 0.2073, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.9531502423263328, | |
| "eval_loss": 0.20772910118103027, | |
| "eval_runtime": 126.5423, | |
| "eval_samples_per_second": 150.179, | |
| "eval_steps_per_second": 3.129, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.9693053311793215, | |
| "grad_norm": 0.46377331018447876, | |
| "learning_rate": 0.00019612407108239097, | |
| "loss": 0.2093, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.9693053311793215, | |
| "eval_loss": 0.2042500376701355, | |
| "eval_runtime": 109.4901, | |
| "eval_samples_per_second": 173.568, | |
| "eval_steps_per_second": 3.617, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.9854604200323102, | |
| "grad_norm": 0.5356667041778564, | |
| "learning_rate": 0.000196059450726979, | |
| "loss": 0.2082, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.9854604200323102, | |
| "eval_loss": 0.20467719435691833, | |
| "eval_runtime": 108.3027, | |
| "eval_samples_per_second": 175.471, | |
| "eval_steps_per_second": 3.656, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 1.001615508885299, | |
| "grad_norm": 0.5465859770774841, | |
| "learning_rate": 0.00019599483037156706, | |
| "loss": 0.2021, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.001615508885299, | |
| "eval_loss": 0.2056320309638977, | |
| "eval_runtime": 112.7152, | |
| "eval_samples_per_second": 168.602, | |
| "eval_steps_per_second": 3.513, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.0177705977382876, | |
| "grad_norm": 0.5674402713775635, | |
| "learning_rate": 0.0001959302100161551, | |
| "loss": 0.2052, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 1.0177705977382876, | |
| "eval_loss": 0.20261099934577942, | |
| "eval_runtime": 114.1889, | |
| "eval_samples_per_second": 166.426, | |
| "eval_steps_per_second": 3.468, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 1.0339256865912763, | |
| "grad_norm": 0.44871240854263306, | |
| "learning_rate": 0.00019586558966074315, | |
| "loss": 0.2091, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.0339256865912763, | |
| "eval_loss": 0.19862857460975647, | |
| "eval_runtime": 111.4553, | |
| "eval_samples_per_second": 170.508, | |
| "eval_steps_per_second": 3.553, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.050080775444265, | |
| "grad_norm": 0.3680683374404907, | |
| "learning_rate": 0.0001958009693053312, | |
| "loss": 0.1998, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.050080775444265, | |
| "eval_loss": 0.1985771656036377, | |
| "eval_runtime": 108.0946, | |
| "eval_samples_per_second": 175.809, | |
| "eval_steps_per_second": 3.663, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.0662358642972536, | |
| "grad_norm": 0.4568157196044922, | |
| "learning_rate": 0.00019573634894991924, | |
| "loss": 0.19, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.0662358642972536, | |
| "eval_loss": 0.19872696697711945, | |
| "eval_runtime": 109.2206, | |
| "eval_samples_per_second": 173.997, | |
| "eval_steps_per_second": 3.626, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.0823909531502423, | |
| "grad_norm": 0.4335425794124603, | |
| "learning_rate": 0.00019567172859450728, | |
| "loss": 0.2, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.0823909531502423, | |
| "eval_loss": 0.19521376490592957, | |
| "eval_runtime": 109.0433, | |
| "eval_samples_per_second": 174.279, | |
| "eval_steps_per_second": 3.632, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.098546042003231, | |
| "grad_norm": 0.3882080316543579, | |
| "learning_rate": 0.00019560710823909533, | |
| "loss": 0.1937, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.098546042003231, | |
| "eval_loss": 0.19395685195922852, | |
| "eval_runtime": 111.4962, | |
| "eval_samples_per_second": 170.445, | |
| "eval_steps_per_second": 3.552, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.1147011308562198, | |
| "grad_norm": 0.4501712918281555, | |
| "learning_rate": 0.00019554248788368337, | |
| "loss": 0.2047, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.1147011308562198, | |
| "eval_loss": 0.1951920986175537, | |
| "eval_runtime": 108.988, | |
| "eval_samples_per_second": 174.368, | |
| "eval_steps_per_second": 3.633, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.1308562197092085, | |
| "grad_norm": 0.37776461243629456, | |
| "learning_rate": 0.00019547786752827141, | |
| "loss": 0.1935, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.1308562197092085, | |
| "eval_loss": 0.19314581155776978, | |
| "eval_runtime": 106.1233, | |
| "eval_samples_per_second": 179.075, | |
| "eval_steps_per_second": 3.732, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.1470113085621971, | |
| "grad_norm": 0.43368467688560486, | |
| "learning_rate": 0.00019541324717285946, | |
| "loss": 0.1915, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.1470113085621971, | |
| "eval_loss": 0.1909024566411972, | |
| "eval_runtime": 114.5384, | |
| "eval_samples_per_second": 165.918, | |
| "eval_steps_per_second": 3.457, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.1631663974151858, | |
| "grad_norm": 0.4507925510406494, | |
| "learning_rate": 0.0001953486268174475, | |
| "loss": 0.1893, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.1631663974151858, | |
| "eval_loss": 0.1882883608341217, | |
| "eval_runtime": 113.0984, | |
| "eval_samples_per_second": 168.031, | |
| "eval_steps_per_second": 3.501, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.1793214862681745, | |
| "grad_norm": 0.4633695185184479, | |
| "learning_rate": 0.00019528400646203555, | |
| "loss": 0.1877, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.1793214862681745, | |
| "eval_loss": 0.19144870340824127, | |
| "eval_runtime": 110.1685, | |
| "eval_samples_per_second": 172.499, | |
| "eval_steps_per_second": 3.594, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.1954765751211631, | |
| "grad_norm": 0.37872394919395447, | |
| "learning_rate": 0.0001952193861066236, | |
| "loss": 0.1899, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.1954765751211631, | |
| "eval_loss": 0.18969100713729858, | |
| "eval_runtime": 114.4541, | |
| "eval_samples_per_second": 166.04, | |
| "eval_steps_per_second": 3.46, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.2116316639741518, | |
| "grad_norm": 0.4261837899684906, | |
| "learning_rate": 0.00019515476575121164, | |
| "loss": 0.189, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.2116316639741518, | |
| "eval_loss": 0.18924662470817566, | |
| "eval_runtime": 111.8806, | |
| "eval_samples_per_second": 169.86, | |
| "eval_steps_per_second": 3.539, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.2277867528271407, | |
| "grad_norm": 0.4276933968067169, | |
| "learning_rate": 0.00019509014539579968, | |
| "loss": 0.1882, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.2277867528271407, | |
| "eval_loss": 0.18727388978004456, | |
| "eval_runtime": 114.5697, | |
| "eval_samples_per_second": 165.873, | |
| "eval_steps_per_second": 3.456, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.2439418416801293, | |
| "grad_norm": 0.451194167137146, | |
| "learning_rate": 0.00019502552504038772, | |
| "loss": 0.1876, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.2439418416801293, | |
| "eval_loss": 0.18400390446186066, | |
| "eval_runtime": 110.0659, | |
| "eval_samples_per_second": 172.66, | |
| "eval_steps_per_second": 3.598, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.260096930533118, | |
| "grad_norm": 0.42289528250694275, | |
| "learning_rate": 0.0001949609046849758, | |
| "loss": 0.1764, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.260096930533118, | |
| "eval_loss": 0.1845184564590454, | |
| "eval_runtime": 109.3321, | |
| "eval_samples_per_second": 173.819, | |
| "eval_steps_per_second": 3.622, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.2762520193861067, | |
| "grad_norm": 0.4245447814464569, | |
| "learning_rate": 0.0001948962843295638, | |
| "loss": 0.1843, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.2762520193861067, | |
| "eval_loss": 0.18434682488441467, | |
| "eval_runtime": 110.8391, | |
| "eval_samples_per_second": 171.456, | |
| "eval_steps_per_second": 3.573, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.2924071082390953, | |
| "grad_norm": 0.4191521406173706, | |
| "learning_rate": 0.00019483166397415186, | |
| "loss": 0.1851, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.2924071082390953, | |
| "eval_loss": 0.18124856054782867, | |
| "eval_runtime": 107.9981, | |
| "eval_samples_per_second": 175.966, | |
| "eval_steps_per_second": 3.667, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.308562197092084, | |
| "grad_norm": 0.3390869200229645, | |
| "learning_rate": 0.00019476704361873993, | |
| "loss": 0.1843, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.308562197092084, | |
| "eval_loss": 0.18725259602069855, | |
| "eval_runtime": 106.2756, | |
| "eval_samples_per_second": 178.818, | |
| "eval_steps_per_second": 3.726, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.3247172859450727, | |
| "grad_norm": 0.444640189409256, | |
| "learning_rate": 0.00019470242326332794, | |
| "loss": 0.1827, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.3247172859450727, | |
| "eval_loss": 0.1816088706254959, | |
| "eval_runtime": 110.0768, | |
| "eval_samples_per_second": 172.643, | |
| "eval_steps_per_second": 3.597, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.3408723747980613, | |
| "grad_norm": 0.42469364404678345, | |
| "learning_rate": 0.000194637802907916, | |
| "loss": 0.1839, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.3408723747980613, | |
| "eval_loss": 0.18320384621620178, | |
| "eval_runtime": 110.8523, | |
| "eval_samples_per_second": 171.435, | |
| "eval_steps_per_second": 3.572, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.35702746365105, | |
| "grad_norm": 0.48619943857192993, | |
| "learning_rate": 0.00019457318255250406, | |
| "loss": 0.1792, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.35702746365105, | |
| "eval_loss": 0.17806969583034515, | |
| "eval_runtime": 112.0106, | |
| "eval_samples_per_second": 169.663, | |
| "eval_steps_per_second": 3.535, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.3731825525040389, | |
| "grad_norm": 0.4447220265865326, | |
| "learning_rate": 0.0001945085621970921, | |
| "loss": 0.1772, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.3731825525040389, | |
| "eval_loss": 0.18053770065307617, | |
| "eval_runtime": 109.3432, | |
| "eval_samples_per_second": 173.801, | |
| "eval_steps_per_second": 3.622, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.3893376413570275, | |
| "grad_norm": 0.418562114238739, | |
| "learning_rate": 0.00019444394184168012, | |
| "loss": 0.1805, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.3893376413570275, | |
| "eval_loss": 0.17956310510635376, | |
| "eval_runtime": 109.4642, | |
| "eval_samples_per_second": 173.609, | |
| "eval_steps_per_second": 3.618, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.4054927302100162, | |
| "grad_norm": 0.4397905170917511, | |
| "learning_rate": 0.0001943793214862682, | |
| "loss": 0.176, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.4054927302100162, | |
| "eval_loss": 0.1766211986541748, | |
| "eval_runtime": 118.1975, | |
| "eval_samples_per_second": 160.782, | |
| "eval_steps_per_second": 3.35, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.4216478190630049, | |
| "grad_norm": 0.3560314476490021, | |
| "learning_rate": 0.00019431470113085624, | |
| "loss": 0.1719, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.4216478190630049, | |
| "eval_loss": 0.17780330777168274, | |
| "eval_runtime": 109.6268, | |
| "eval_samples_per_second": 173.352, | |
| "eval_steps_per_second": 3.612, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.4378029079159935, | |
| "grad_norm": 0.3879343271255493, | |
| "learning_rate": 0.00019425008077544425, | |
| "loss": 0.1733, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.4378029079159935, | |
| "eval_loss": 0.17606207728385925, | |
| "eval_runtime": 109.7272, | |
| "eval_samples_per_second": 173.193, | |
| "eval_steps_per_second": 3.609, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.4539579967689822, | |
| "grad_norm": 0.4320586919784546, | |
| "learning_rate": 0.00019418546042003233, | |
| "loss": 0.1721, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.4539579967689822, | |
| "eval_loss": 0.17723843455314636, | |
| "eval_runtime": 107.8177, | |
| "eval_samples_per_second": 176.261, | |
| "eval_steps_per_second": 3.673, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.4701130856219708, | |
| "grad_norm": 0.31035879254341125, | |
| "learning_rate": 0.00019412084006462037, | |
| "loss": 0.1689, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.4701130856219708, | |
| "eval_loss": 0.17564034461975098, | |
| "eval_runtime": 109.0906, | |
| "eval_samples_per_second": 174.204, | |
| "eval_steps_per_second": 3.63, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.4862681744749597, | |
| "grad_norm": 0.37017735838890076, | |
| "learning_rate": 0.00019405621970920841, | |
| "loss": 0.1771, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.4862681744749597, | |
| "eval_loss": 0.1736358106136322, | |
| "eval_runtime": 110.4481, | |
| "eval_samples_per_second": 172.063, | |
| "eval_steps_per_second": 3.585, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.5024232633279482, | |
| "grad_norm": 0.5021731853485107, | |
| "learning_rate": 0.00019399159935379646, | |
| "loss": 0.1717, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.5024232633279482, | |
| "eval_loss": 0.17395810782909393, | |
| "eval_runtime": 110.9158, | |
| "eval_samples_per_second": 171.337, | |
| "eval_steps_per_second": 3.57, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.518578352180937, | |
| "grad_norm": 0.3692797124385834, | |
| "learning_rate": 0.0001939269789983845, | |
| "loss": 0.17, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.518578352180937, | |
| "eval_loss": 0.17615529894828796, | |
| "eval_runtime": 111.7907, | |
| "eval_samples_per_second": 169.996, | |
| "eval_steps_per_second": 3.542, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.5347334410339257, | |
| "grad_norm": 0.44357678294181824, | |
| "learning_rate": 0.00019386235864297255, | |
| "loss": 0.1635, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.5347334410339257, | |
| "eval_loss": 0.17349159717559814, | |
| "eval_runtime": 108.9427, | |
| "eval_samples_per_second": 174.44, | |
| "eval_steps_per_second": 3.635, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.5508885298869144, | |
| "grad_norm": 0.4116641581058502, | |
| "learning_rate": 0.0001937977382875606, | |
| "loss": 0.166, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.5508885298869144, | |
| "eval_loss": 0.17150762677192688, | |
| "eval_runtime": 112.722, | |
| "eval_samples_per_second": 168.592, | |
| "eval_steps_per_second": 3.513, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.567043618739903, | |
| "grad_norm": 0.4795362651348114, | |
| "learning_rate": 0.00019373311793214864, | |
| "loss": 0.1635, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.567043618739903, | |
| "eval_loss": 0.17290830612182617, | |
| "eval_runtime": 109.9865, | |
| "eval_samples_per_second": 172.785, | |
| "eval_steps_per_second": 3.6, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.5831987075928917, | |
| "grad_norm": 0.4252488315105438, | |
| "learning_rate": 0.00019366849757673668, | |
| "loss": 0.1657, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.5831987075928917, | |
| "eval_loss": 0.1683739274740219, | |
| "eval_runtime": 110.2434, | |
| "eval_samples_per_second": 172.382, | |
| "eval_steps_per_second": 3.592, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.5993537964458806, | |
| "grad_norm": 0.3322688639163971, | |
| "learning_rate": 0.00019360387722132472, | |
| "loss": 0.1686, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.5993537964458806, | |
| "eval_loss": 0.16992688179016113, | |
| "eval_runtime": 109.0229, | |
| "eval_samples_per_second": 174.312, | |
| "eval_steps_per_second": 3.632, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.615508885298869, | |
| "grad_norm": 0.3991793394088745, | |
| "learning_rate": 0.00019353925686591277, | |
| "loss": 0.163, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.615508885298869, | |
| "eval_loss": 0.16902555525302887, | |
| "eval_runtime": 120.177, | |
| "eval_samples_per_second": 158.133, | |
| "eval_steps_per_second": 3.295, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.631663974151858, | |
| "grad_norm": 0.2803505063056946, | |
| "learning_rate": 0.0001934746365105008, | |
| "loss": 0.1574, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.631663974151858, | |
| "eval_loss": 0.17043916881084442, | |
| "eval_runtime": 108.4069, | |
| "eval_samples_per_second": 175.303, | |
| "eval_steps_per_second": 3.653, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.6478190630048464, | |
| "grad_norm": 0.39425361156463623, | |
| "learning_rate": 0.00019341001615508886, | |
| "loss": 0.1575, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.6478190630048464, | |
| "eval_loss": 0.17015740275382996, | |
| "eval_runtime": 110.4234, | |
| "eval_samples_per_second": 172.101, | |
| "eval_steps_per_second": 3.586, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.6639741518578353, | |
| "grad_norm": 0.3668546676635742, | |
| "learning_rate": 0.0001933453957996769, | |
| "loss": 0.1697, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.6639741518578353, | |
| "eval_loss": 0.17117071151733398, | |
| "eval_runtime": 131.1419, | |
| "eval_samples_per_second": 144.912, | |
| "eval_steps_per_second": 3.02, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.680129240710824, | |
| "grad_norm": 0.4108649790287018, | |
| "learning_rate": 0.00019328077544426494, | |
| "loss": 0.1637, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.680129240710824, | |
| "eval_loss": 0.1672579050064087, | |
| "eval_runtime": 116.3456, | |
| "eval_samples_per_second": 163.341, | |
| "eval_steps_per_second": 3.404, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.6962843295638126, | |
| "grad_norm": 0.3385171592235565, | |
| "learning_rate": 0.000193216155088853, | |
| "loss": 0.1645, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.6962843295638126, | |
| "eval_loss": 0.1657264679670334, | |
| "eval_runtime": 113.7291, | |
| "eval_samples_per_second": 167.099, | |
| "eval_steps_per_second": 3.482, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.7124394184168013, | |
| "grad_norm": 0.3096817135810852, | |
| "learning_rate": 0.00019315153473344106, | |
| "loss": 0.1637, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.7124394184168013, | |
| "eval_loss": 0.1663082391023636, | |
| "eval_runtime": 112.0716, | |
| "eval_samples_per_second": 169.57, | |
| "eval_steps_per_second": 3.533, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.72859450726979, | |
| "grad_norm": 0.34010905027389526, | |
| "learning_rate": 0.00019308691437802908, | |
| "loss": 0.1625, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.72859450726979, | |
| "eval_loss": 0.16775698959827423, | |
| "eval_runtime": 107.0653, | |
| "eval_samples_per_second": 177.499, | |
| "eval_steps_per_second": 3.699, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.7447495961227788, | |
| "grad_norm": 0.34848374128341675, | |
| "learning_rate": 0.00019302229402261712, | |
| "loss": 0.1611, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.7447495961227788, | |
| "eval_loss": 0.1658620685338974, | |
| "eval_runtime": 114.0524, | |
| "eval_samples_per_second": 166.625, | |
| "eval_steps_per_second": 3.472, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.7609046849757672, | |
| "grad_norm": 0.3644295334815979, | |
| "learning_rate": 0.0001929576736672052, | |
| "loss": 0.1611, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.7609046849757672, | |
| "eval_loss": 0.1643301099538803, | |
| "eval_runtime": 109.7052, | |
| "eval_samples_per_second": 173.228, | |
| "eval_steps_per_second": 3.61, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.7770597738287561, | |
| "grad_norm": 0.4286295473575592, | |
| "learning_rate": 0.0001928930533117932, | |
| "loss": 0.1572, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.7770597738287561, | |
| "eval_loss": 0.16254638135433197, | |
| "eval_runtime": 111.7116, | |
| "eval_samples_per_second": 170.117, | |
| "eval_steps_per_second": 3.545, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.7932148626817448, | |
| "grad_norm": 0.3320305645465851, | |
| "learning_rate": 0.00019282843295638125, | |
| "loss": 0.1588, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.7932148626817448, | |
| "eval_loss": 0.16255541145801544, | |
| "eval_runtime": 113.7525, | |
| "eval_samples_per_second": 167.064, | |
| "eval_steps_per_second": 3.481, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.8093699515347335, | |
| "grad_norm": 0.3314014673233032, | |
| "learning_rate": 0.00019276381260096933, | |
| "loss": 0.1571, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.8093699515347335, | |
| "eval_loss": 0.16239432990550995, | |
| "eval_runtime": 113.2812, | |
| "eval_samples_per_second": 167.76, | |
| "eval_steps_per_second": 3.496, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.8255250403877221, | |
| "grad_norm": 0.3537631034851074, | |
| "learning_rate": 0.00019269919224555737, | |
| "loss": 0.1551, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.8255250403877221, | |
| "eval_loss": 0.16390350461006165, | |
| "eval_runtime": 116.9761, | |
| "eval_samples_per_second": 162.461, | |
| "eval_steps_per_second": 3.385, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.8416801292407108, | |
| "grad_norm": 0.38042768836021423, | |
| "learning_rate": 0.0001926345718901454, | |
| "loss": 0.1521, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.8416801292407108, | |
| "eval_loss": 0.16476310789585114, | |
| "eval_runtime": 110.127, | |
| "eval_samples_per_second": 172.564, | |
| "eval_steps_per_second": 3.596, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.8578352180936997, | |
| "grad_norm": 0.29028207063674927, | |
| "learning_rate": 0.00019256995153473346, | |
| "loss": 0.1583, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 1.8578352180936997, | |
| "eval_loss": 0.16223183274269104, | |
| "eval_runtime": 108.7981, | |
| "eval_samples_per_second": 174.672, | |
| "eval_steps_per_second": 3.64, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 1.8739903069466881, | |
| "grad_norm": 0.28699570894241333, | |
| "learning_rate": 0.0001925053311793215, | |
| "loss": 0.1552, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.8739903069466881, | |
| "eval_loss": 0.16354109346866608, | |
| "eval_runtime": 108.9341, | |
| "eval_samples_per_second": 174.454, | |
| "eval_steps_per_second": 3.635, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.890145395799677, | |
| "grad_norm": 0.34706467390060425, | |
| "learning_rate": 0.00019244071082390952, | |
| "loss": 0.1596, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 1.890145395799677, | |
| "eval_loss": 0.16145038604736328, | |
| "eval_runtime": 111.1377, | |
| "eval_samples_per_second": 170.995, | |
| "eval_steps_per_second": 3.563, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 1.9063004846526654, | |
| "grad_norm": 0.46458888053894043, | |
| "learning_rate": 0.0001923760904684976, | |
| "loss": 0.1582, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.9063004846526654, | |
| "eval_loss": 0.165074422955513, | |
| "eval_runtime": 111.4361, | |
| "eval_samples_per_second": 170.537, | |
| "eval_steps_per_second": 3.554, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.9224555735056543, | |
| "grad_norm": 0.35297635197639465, | |
| "learning_rate": 0.00019231147011308564, | |
| "loss": 0.157, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 1.9224555735056543, | |
| "eval_loss": 0.1603960543870926, | |
| "eval_runtime": 114.7674, | |
| "eval_samples_per_second": 165.587, | |
| "eval_steps_per_second": 3.45, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 1.938610662358643, | |
| "grad_norm": 0.4734126031398773, | |
| "learning_rate": 0.00019224684975767368, | |
| "loss": 0.1531, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.938610662358643, | |
| "eval_loss": 0.15993832051753998, | |
| "eval_runtime": 113.255, | |
| "eval_samples_per_second": 167.798, | |
| "eval_steps_per_second": 3.497, | |
| "step": 6000 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 154750, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 50, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 10, | |
| "early_stopping_threshold": 0.001 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 1 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 6.052398008610816e+16, | |
| "train_batch_size": 96, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |