| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.13175905660515472, |
| "eval_steps": 2000, |
| "global_step": 18000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0007319947589175262, |
| "grad_norm": 800.0, |
| "learning_rate": 4.879238838741157e-07, |
| "loss": 0.952, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.0014639895178350524, |
| "grad_norm": 768.0, |
| "learning_rate": 9.758477677482314e-07, |
| "loss": 0.8458, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.0021959842767525785, |
| "grad_norm": 1600.0, |
| "learning_rate": 1.4637716516223471e-06, |
| "loss": 0.8841, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.0029279790356701047, |
| "grad_norm": 616.0, |
| "learning_rate": 1.951695535496463e-06, |
| "loss": 0.9277, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.003659973794587631, |
| "grad_norm": 436.0, |
| "learning_rate": 2.4396194193705783e-06, |
| "loss": 0.6913, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.004391968553505157, |
| "grad_norm": 1256.0, |
| "learning_rate": 2.9275433032446943e-06, |
| "loss": 0.6372, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.005123963312422683, |
| "grad_norm": 764.0, |
| "learning_rate": 3.41546718711881e-06, |
| "loss": 1.0322, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.005855958071340209, |
| "grad_norm": 1408.0, |
| "learning_rate": 3.903391070992926e-06, |
| "loss": 0.7853, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.006587952830257735, |
| "grad_norm": 43.5, |
| "learning_rate": 4.391314954867041e-06, |
| "loss": 0.9377, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.007319947589175262, |
| "grad_norm": 684.0, |
| "learning_rate": 4.879238838741157e-06, |
| "loss": 1.4695, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.008051942348092788, |
| "grad_norm": 524.0, |
| "learning_rate": 5.367162722615272e-06, |
| "loss": 1.4889, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.008783937107010314, |
| "grad_norm": 33.25, |
| "learning_rate": 5.8550866064893885e-06, |
| "loss": 1.2154, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.00951593186592784, |
| "grad_norm": 1.171875, |
| "learning_rate": 6.343010490363504e-06, |
| "loss": 1.3154, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.010247926624845366, |
| "grad_norm": 388.0, |
| "learning_rate": 6.83093437423762e-06, |
| "loss": 1.5739, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.010979921383762893, |
| "grad_norm": 251.0, |
| "learning_rate": 7.318858258111735e-06, |
| "loss": 0.7903, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.011711916142680419, |
| "grad_norm": 95.5, |
| "learning_rate": 7.806782141985851e-06, |
| "loss": 0.6962, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.012443910901597945, |
| "grad_norm": 516.0, |
| "learning_rate": 8.294706025859967e-06, |
| "loss": 1.2352, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.01317590566051547, |
| "grad_norm": 41.0, |
| "learning_rate": 8.782629909734082e-06, |
| "loss": 0.924, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.013907900419432996, |
| "grad_norm": 0.91015625, |
| "learning_rate": 9.270553793608198e-06, |
| "loss": 1.467, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.014639895178350524, |
| "grad_norm": 1.15625, |
| "learning_rate": 9.758477677482313e-06, |
| "loss": 0.9323, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.014639895178350524, |
| "eval_loss": 1.4799224138259888, |
| "eval_runtime": 27.9405, |
| "eval_samples_per_second": 17.895, |
| "eval_steps_per_second": 17.895, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.01537188993726805, |
| "grad_norm": 39.5, |
| "learning_rate": 1.0246401561356429e-05, |
| "loss": 1.2229, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.016103884696185577, |
| "grad_norm": 984.0, |
| "learning_rate": 1.0734325445230544e-05, |
| "loss": 1.7086, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.016835879455103103, |
| "grad_norm": 684.0, |
| "learning_rate": 1.122224932910466e-05, |
| "loss": 0.9654, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.01756787421402063, |
| "grad_norm": 840.0, |
| "learning_rate": 1.1710173212978777e-05, |
| "loss": 1.3702, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.018299868972938154, |
| "grad_norm": 388.0, |
| "learning_rate": 1.2198097096852893e-05, |
| "loss": 1.0888, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.01903186373185568, |
| "grad_norm": 832.0, |
| "learning_rate": 1.2686020980727008e-05, |
| "loss": 0.9989, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.019763858490773206, |
| "grad_norm": 0.3984375, |
| "learning_rate": 1.3173944864601122e-05, |
| "loss": 1.3161, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.02049585324969073, |
| "grad_norm": 644.0, |
| "learning_rate": 1.366186874847524e-05, |
| "loss": 1.1279, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.021227848008608257, |
| "grad_norm": 414.0, |
| "learning_rate": 1.4149792632349354e-05, |
| "loss": 1.2044, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.021959842767525786, |
| "grad_norm": 12.1875, |
| "learning_rate": 1.463771651622347e-05, |
| "loss": 1.0637, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.022691837526443312, |
| "grad_norm": 0.95703125, |
| "learning_rate": 1.5125640400097585e-05, |
| "loss": 1.2628, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.023423832285360838, |
| "grad_norm": 10.875, |
| "learning_rate": 1.5613564283971703e-05, |
| "loss": 1.3439, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.024155827044278363, |
| "grad_norm": 256.0, |
| "learning_rate": 1.6101488167845818e-05, |
| "loss": 1.2094, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.02488782180319589, |
| "grad_norm": 0.267578125, |
| "learning_rate": 1.6589412051719934e-05, |
| "loss": 1.4852, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.025619816562113415, |
| "grad_norm": 54.25, |
| "learning_rate": 1.707733593559405e-05, |
| "loss": 1.1689, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.02635181132103094, |
| "grad_norm": 308.0, |
| "learning_rate": 1.7565259819468165e-05, |
| "loss": 1.0845, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.027083806079948466, |
| "grad_norm": 264.0, |
| "learning_rate": 1.805318370334228e-05, |
| "loss": 1.2785, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.027815800838865992, |
| "grad_norm": 81.5, |
| "learning_rate": 1.8541107587216396e-05, |
| "loss": 1.0887, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.02854779559778352, |
| "grad_norm": 696.0, |
| "learning_rate": 1.902903147109051e-05, |
| "loss": 1.2968, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.029279790356701047, |
| "grad_norm": 186.0, |
| "learning_rate": 1.9516955354964627e-05, |
| "loss": 1.3166, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.029279790356701047, |
| "eval_loss": 0.8257483839988708, |
| "eval_runtime": 27.9567, |
| "eval_samples_per_second": 17.885, |
| "eval_steps_per_second": 17.885, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.030011785115618573, |
| "grad_norm": 1.9140625, |
| "learning_rate": 1.9999999997189743e-05, |
| "loss": 1.171, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.0307437798745361, |
| "grad_norm": 211.0, |
| "learning_rate": 1.9999971332569874e-05, |
| "loss": 1.1772, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.03147577463345363, |
| "grad_norm": 0.1416015625, |
| "learning_rate": 1.9999886462973602e-05, |
| "loss": 1.0697, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.032207769392371154, |
| "grad_norm": 0.220703125, |
| "learning_rate": 1.9999745388877933e-05, |
| "loss": 1.2177, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.03293976415128868, |
| "grad_norm": 0.283203125, |
| "learning_rate": 1.999954811107578e-05, |
| "loss": 1.1959, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.033671758910206205, |
| "grad_norm": 0.490234375, |
| "learning_rate": 1.9999294630675945e-05, |
| "loss": 1.1617, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.03440375366912373, |
| "grad_norm": 390.0, |
| "learning_rate": 1.999898494910312e-05, |
| "loss": 1.1348, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.03513574842804126, |
| "grad_norm": 0.279296875, |
| "learning_rate": 1.999861906809787e-05, |
| "loss": 1.1857, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.03586774318695878, |
| "grad_norm": 620.0, |
| "learning_rate": 1.9998196989716637e-05, |
| "loss": 1.1041, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.03659973794587631, |
| "grad_norm": 7.9375, |
| "learning_rate": 1.999771871633172e-05, |
| "loss": 1.2604, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.037331732704793834, |
| "grad_norm": 0.1328125, |
| "learning_rate": 1.9997184250631257e-05, |
| "loss": 1.1525, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.03806372746371136, |
| "grad_norm": 988.0, |
| "learning_rate": 1.999659359561922e-05, |
| "loss": 1.1125, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.038795722222628885, |
| "grad_norm": 528.0, |
| "learning_rate": 1.99959467546154e-05, |
| "loss": 1.0241, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.03952771698154641, |
| "grad_norm": 0.08203125, |
| "learning_rate": 1.999524373125537e-05, |
| "loss": 1.0007, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.04025971174046394, |
| "grad_norm": 0.06494140625, |
| "learning_rate": 1.9994484529490483e-05, |
| "loss": 1.7392, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.04099170649938146, |
| "grad_norm": 155.0, |
| "learning_rate": 1.9993669153587842e-05, |
| "loss": 1.6975, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.04172370125829899, |
| "grad_norm": 0.1787109375, |
| "learning_rate": 1.9992797608130284e-05, |
| "loss": 1.3126, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.042455696017216514, |
| "grad_norm": 102.5, |
| "learning_rate": 1.9991869898016337e-05, |
| "loss": 1.0694, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.04318769077613404, |
| "grad_norm": 282.0, |
| "learning_rate": 1.999088602846021e-05, |
| "loss": 1.1731, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.04391968553505157, |
| "grad_norm": 756.0, |
| "learning_rate": 1.998984600499175e-05, |
| "loss": 0.9569, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.04391968553505157, |
| "eval_loss": 1.0243369340896606, |
| "eval_runtime": 27.9367, |
| "eval_samples_per_second": 17.898, |
| "eval_steps_per_second": 17.898, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.0446516802939691, |
| "grad_norm": 0.08935546875, |
| "learning_rate": 1.9988749833456433e-05, |
| "loss": 0.8217, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.045383675052886624, |
| "grad_norm": 0.1650390625, |
| "learning_rate": 1.9987597520015302e-05, |
| "loss": 0.9041, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.04611566981180415, |
| "grad_norm": 70.0, |
| "learning_rate": 1.998638907114495e-05, |
| "loss": 1.0699, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.046847664570721675, |
| "grad_norm": 178.0, |
| "learning_rate": 1.998512449363748e-05, |
| "loss": 0.9322, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.0475796593296392, |
| "grad_norm": 0.1533203125, |
| "learning_rate": 1.9983803794600468e-05, |
| "loss": 0.9877, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.04831165408855673, |
| "grad_norm": 368.0, |
| "learning_rate": 1.998242698145692e-05, |
| "loss": 1.0714, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.04904364884747425, |
| "grad_norm": 0.279296875, |
| "learning_rate": 1.9980994061945238e-05, |
| "loss": 0.9344, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.04977564360639178, |
| "grad_norm": 2800.0, |
| "learning_rate": 1.997950504411916e-05, |
| "loss": 1.2076, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.050507638365309304, |
| "grad_norm": 0.31640625, |
| "learning_rate": 1.9977959936347732e-05, |
| "loss": 1.0685, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.05123963312422683, |
| "grad_norm": 29.75, |
| "learning_rate": 1.9976358747315254e-05, |
| "loss": 1.1026, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.051971627883144356, |
| "grad_norm": 2112.0, |
| "learning_rate": 1.9974701486021233e-05, |
| "loss": 1.0783, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.05270362264206188, |
| "grad_norm": 0.111328125, |
| "learning_rate": 1.997298816178033e-05, |
| "loss": 0.8777, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.05343561740097941, |
| "grad_norm": 0.07080078125, |
| "learning_rate": 1.9971218784222302e-05, |
| "loss": 0.9701, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.05416761215989693, |
| "grad_norm": 132.0, |
| "learning_rate": 1.9969393363291963e-05, |
| "loss": 0.9978, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.05489960691881446, |
| "grad_norm": 2.03125, |
| "learning_rate": 1.9967511909249118e-05, |
| "loss": 1.2451, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.055631601677731984, |
| "grad_norm": 912.0, |
| "learning_rate": 1.99655744326685e-05, |
| "loss": 0.8866, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.05636359643664951, |
| "grad_norm": 0.10986328125, |
| "learning_rate": 1.9963580944439732e-05, |
| "loss": 0.9139, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.05709559119556704, |
| "grad_norm": 0.1796875, |
| "learning_rate": 1.9961531455767233e-05, |
| "loss": 1.0991, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.05782758595448457, |
| "grad_norm": 0.45703125, |
| "learning_rate": 1.9959425978170187e-05, |
| "loss": 1.0318, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.058559580713402094, |
| "grad_norm": 161.0, |
| "learning_rate": 1.995726452348246e-05, |
| "loss": 1.0115, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.058559580713402094, |
| "eval_loss": 1.2017102241516113, |
| "eval_runtime": 27.9424, |
| "eval_samples_per_second": 17.894, |
| "eval_steps_per_second": 17.894, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.05929157547231962, |
| "grad_norm": 94.5, |
| "learning_rate": 1.9955047103852534e-05, |
| "loss": 1.3752, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.060023570231237146, |
| "grad_norm": 83.0, |
| "learning_rate": 1.995277373174345e-05, |
| "loss": 1.0333, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.06075556499015467, |
| "grad_norm": 0.1708984375, |
| "learning_rate": 1.9950444419932723e-05, |
| "loss": 1.0582, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.0614875597490722, |
| "grad_norm": 8.5, |
| "learning_rate": 1.994805918151229e-05, |
| "loss": 0.9273, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.06221955450798972, |
| "grad_norm": 0.1376953125, |
| "learning_rate": 1.9945618029888408e-05, |
| "loss": 0.8619, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.06295154926690726, |
| "grad_norm": 552.0, |
| "learning_rate": 1.994312097878161e-05, |
| "loss": 1.2394, |
| "step": 8600 |
| }, |
| { |
| "epoch": 0.06368354402582478, |
| "grad_norm": 0.62890625, |
| "learning_rate": 1.99405680422266e-05, |
| "loss": 0.8713, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.06441553878474231, |
| "grad_norm": 152.0, |
| "learning_rate": 1.9937959234572198e-05, |
| "loss": 0.9949, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.06514753354365983, |
| "grad_norm": 99.0, |
| "learning_rate": 1.993529457048124e-05, |
| "loss": 1.0313, |
| "step": 8900 |
| }, |
| { |
| "epoch": 0.06587952830257736, |
| "grad_norm": 1004.0, |
| "learning_rate": 1.993257406493051e-05, |
| "loss": 1.0299, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.06661152306149488, |
| "grad_norm": 0.16796875, |
| "learning_rate": 1.9929797733210644e-05, |
| "loss": 0.9293, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.06734351782041241, |
| "grad_norm": 0.75, |
| "learning_rate": 1.992696559092605e-05, |
| "loss": 1.04, |
| "step": 9200 |
| }, |
| { |
| "epoch": 0.06807551257932994, |
| "grad_norm": 5.15625, |
| "learning_rate": 1.992407765399483e-05, |
| "loss": 1.072, |
| "step": 9300 |
| }, |
| { |
| "epoch": 0.06880750733824746, |
| "grad_norm": 0.12890625, |
| "learning_rate": 1.992113393864867e-05, |
| "loss": 1.102, |
| "step": 9400 |
| }, |
| { |
| "epoch": 0.06953950209716499, |
| "grad_norm": 0.66796875, |
| "learning_rate": 1.9918134461432763e-05, |
| "loss": 1.0206, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.07027149685608251, |
| "grad_norm": 0.158203125, |
| "learning_rate": 1.991507923920571e-05, |
| "loss": 0.7945, |
| "step": 9600 |
| }, |
| { |
| "epoch": 0.07100349161500004, |
| "grad_norm": 4.75, |
| "learning_rate": 1.991196828913943e-05, |
| "loss": 1.1373, |
| "step": 9700 |
| }, |
| { |
| "epoch": 0.07173548637391756, |
| "grad_norm": 88.0, |
| "learning_rate": 1.9908801628719063e-05, |
| "loss": 1.0789, |
| "step": 9800 |
| }, |
| { |
| "epoch": 0.07246748113283509, |
| "grad_norm": 0.283203125, |
| "learning_rate": 1.9905579275742866e-05, |
| "loss": 0.9591, |
| "step": 9900 |
| }, |
| { |
| "epoch": 0.07319947589175262, |
| "grad_norm": 484.0, |
| "learning_rate": 1.990230124832212e-05, |
| "loss": 1.1461, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.07319947589175262, |
| "eval_loss": 0.7712569832801819, |
| "eval_runtime": 28.1186, |
| "eval_samples_per_second": 17.782, |
| "eval_steps_per_second": 17.782, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.07393147065067014, |
| "grad_norm": 696.0, |
| "learning_rate": 1.9898967564881014e-05, |
| "loss": 1.0556, |
| "step": 10100 |
| }, |
| { |
| "epoch": 0.07466346540958767, |
| "grad_norm": 9.4375, |
| "learning_rate": 1.9895578244156576e-05, |
| "loss": 1.1493, |
| "step": 10200 |
| }, |
| { |
| "epoch": 0.0753954601685052, |
| "grad_norm": 1.34375, |
| "learning_rate": 1.989213330519852e-05, |
| "loss": 0.8955, |
| "step": 10300 |
| }, |
| { |
| "epoch": 0.07612745492742272, |
| "grad_norm": 146.0, |
| "learning_rate": 1.988863276736918e-05, |
| "loss": 1.2152, |
| "step": 10400 |
| }, |
| { |
| "epoch": 0.07685944968634024, |
| "grad_norm": 756.0, |
| "learning_rate": 1.9885076650343364e-05, |
| "loss": 1.0884, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.07759144444525777, |
| "grad_norm": 151.0, |
| "learning_rate": 1.988146497410829e-05, |
| "loss": 1.1883, |
| "step": 10600 |
| }, |
| { |
| "epoch": 0.0783234392041753, |
| "grad_norm": 100.5, |
| "learning_rate": 1.987779775896343e-05, |
| "loss": 0.9924, |
| "step": 10700 |
| }, |
| { |
| "epoch": 0.07905543396309282, |
| "grad_norm": 0.77734375, |
| "learning_rate": 1.9874075025520417e-05, |
| "loss": 0.7545, |
| "step": 10800 |
| }, |
| { |
| "epoch": 0.07978742872201035, |
| "grad_norm": 2.0625, |
| "learning_rate": 1.987029679470292e-05, |
| "loss": 0.7715, |
| "step": 10900 |
| }, |
| { |
| "epoch": 0.08051942348092787, |
| "grad_norm": 0.33203125, |
| "learning_rate": 1.9866463087746544e-05, |
| "loss": 0.7923, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.0812514182398454, |
| "grad_norm": 0.53125, |
| "learning_rate": 1.986257392619869e-05, |
| "loss": 1.122, |
| "step": 11100 |
| }, |
| { |
| "epoch": 0.08198341299876293, |
| "grad_norm": 0.1845703125, |
| "learning_rate": 1.9858629331918445e-05, |
| "loss": 0.9972, |
| "step": 11200 |
| }, |
| { |
| "epoch": 0.08271540775768045, |
| "grad_norm": 130.0, |
| "learning_rate": 1.9854629327076454e-05, |
| "loss": 1.0698, |
| "step": 11300 |
| }, |
| { |
| "epoch": 0.08344740251659798, |
| "grad_norm": 119.5, |
| "learning_rate": 1.9850573934154798e-05, |
| "loss": 1.163, |
| "step": 11400 |
| }, |
| { |
| "epoch": 0.0841793972755155, |
| "grad_norm": 0.17578125, |
| "learning_rate": 1.9846463175946872e-05, |
| "loss": 0.8634, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.08491139203443303, |
| "grad_norm": 0.1806640625, |
| "learning_rate": 1.9842297075557243e-05, |
| "loss": 1.0536, |
| "step": 11600 |
| }, |
| { |
| "epoch": 0.08564338679335055, |
| "grad_norm": 0.12353515625, |
| "learning_rate": 1.9838075656401546e-05, |
| "loss": 0.826, |
| "step": 11700 |
| }, |
| { |
| "epoch": 0.08637538155226808, |
| "grad_norm": 3.5625, |
| "learning_rate": 1.9833798942206312e-05, |
| "loss": 0.9368, |
| "step": 11800 |
| }, |
| { |
| "epoch": 0.0871073763111856, |
| "grad_norm": 3712.0, |
| "learning_rate": 1.9829466957008884e-05, |
| "loss": 0.9388, |
| "step": 11900 |
| }, |
| { |
| "epoch": 0.08783937107010314, |
| "grad_norm": 352.0, |
| "learning_rate": 1.9825079725157236e-05, |
| "loss": 1.0504, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.08783937107010314, |
| "eval_loss": 0.770910382270813, |
| "eval_runtime": 27.9411, |
| "eval_samples_per_second": 17.895, |
| "eval_steps_per_second": 17.895, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.08857136582902067, |
| "grad_norm": 211.0, |
| "learning_rate": 1.982063727130987e-05, |
| "loss": 0.9014, |
| "step": 12100 |
| }, |
| { |
| "epoch": 0.0893033605879382, |
| "grad_norm": 258.0, |
| "learning_rate": 1.9816139620435657e-05, |
| "loss": 0.9101, |
| "step": 12200 |
| }, |
| { |
| "epoch": 0.09003535534685572, |
| "grad_norm": 0.765625, |
| "learning_rate": 1.9811586797813706e-05, |
| "loss": 1.0403, |
| "step": 12300 |
| }, |
| { |
| "epoch": 0.09076735010577325, |
| "grad_norm": 8.1875, |
| "learning_rate": 1.9806978829033218e-05, |
| "loss": 0.9556, |
| "step": 12400 |
| }, |
| { |
| "epoch": 0.09149934486469077, |
| "grad_norm": 0.232421875, |
| "learning_rate": 1.9802315739993346e-05, |
| "loss": 0.8063, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.0922313396236083, |
| "grad_norm": 0.396484375, |
| "learning_rate": 1.9797597556903048e-05, |
| "loss": 0.8704, |
| "step": 12600 |
| }, |
| { |
| "epoch": 0.09296333438252583, |
| "grad_norm": 94.5, |
| "learning_rate": 1.9792824306280934e-05, |
| "loss": 1.0443, |
| "step": 12700 |
| }, |
| { |
| "epoch": 0.09369532914144335, |
| "grad_norm": 0.09912109375, |
| "learning_rate": 1.9787996014955126e-05, |
| "loss": 0.9383, |
| "step": 12800 |
| }, |
| { |
| "epoch": 0.09442732390036088, |
| "grad_norm": 0.2216796875, |
| "learning_rate": 1.9783112710063098e-05, |
| "loss": 0.9516, |
| "step": 12900 |
| }, |
| { |
| "epoch": 0.0951593186592784, |
| "grad_norm": 0.1005859375, |
| "learning_rate": 1.9778174419051538e-05, |
| "loss": 0.9241, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.09589131341819593, |
| "grad_norm": 106.0, |
| "learning_rate": 1.977318116967618e-05, |
| "loss": 0.9661, |
| "step": 13100 |
| }, |
| { |
| "epoch": 0.09662330817711345, |
| "grad_norm": 0.314453125, |
| "learning_rate": 1.976813299000164e-05, |
| "loss": 1.0954, |
| "step": 13200 |
| }, |
| { |
| "epoch": 0.09735530293603098, |
| "grad_norm": 114.5, |
| "learning_rate": 1.9763029908401294e-05, |
| "loss": 0.9344, |
| "step": 13300 |
| }, |
| { |
| "epoch": 0.0980872976949485, |
| "grad_norm": 0.447265625, |
| "learning_rate": 1.9757871953557078e-05, |
| "loss": 1.0499, |
| "step": 13400 |
| }, |
| { |
| "epoch": 0.09881929245386603, |
| "grad_norm": 78.5, |
| "learning_rate": 1.975265915445934e-05, |
| "loss": 0.9215, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.09955128721278356, |
| "grad_norm": 0.1767578125, |
| "learning_rate": 1.97473915404067e-05, |
| "loss": 1.048, |
| "step": 13600 |
| }, |
| { |
| "epoch": 0.10028328197170108, |
| "grad_norm": 0.490234375, |
| "learning_rate": 1.9742069141005853e-05, |
| "loss": 1.0092, |
| "step": 13700 |
| }, |
| { |
| "epoch": 0.10101527673061861, |
| "grad_norm": 1.4296875, |
| "learning_rate": 1.9736691986171413e-05, |
| "loss": 0.964, |
| "step": 13800 |
| }, |
| { |
| "epoch": 0.10174727148953613, |
| "grad_norm": 0.2421875, |
| "learning_rate": 1.9731260106125757e-05, |
| "loss": 0.8828, |
| "step": 13900 |
| }, |
| { |
| "epoch": 0.10247926624845366, |
| "grad_norm": 88.0, |
| "learning_rate": 1.972577353139884e-05, |
| "loss": 1.0908, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.10247926624845366, |
| "eval_loss": 0.8814056515693665, |
| "eval_runtime": 27.9852, |
| "eval_samples_per_second": 17.867, |
| "eval_steps_per_second": 17.867, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.10321126100737119, |
| "grad_norm": 0.18359375, |
| "learning_rate": 1.9720232292828033e-05, |
| "loss": 0.9781, |
| "step": 14100 |
| }, |
| { |
| "epoch": 0.10394325576628871, |
| "grad_norm": 0.31640625, |
| "learning_rate": 1.971463642155794e-05, |
| "loss": 0.9888, |
| "step": 14200 |
| }, |
| { |
| "epoch": 0.10467525052520624, |
| "grad_norm": 0.0966796875, |
| "learning_rate": 1.9708985949040237e-05, |
| "loss": 1.0119, |
| "step": 14300 |
| }, |
| { |
| "epoch": 0.10540724528412376, |
| "grad_norm": 1.203125, |
| "learning_rate": 1.9703280907033475e-05, |
| "loss": 1.0127, |
| "step": 14400 |
| }, |
| { |
| "epoch": 0.10613924004304129, |
| "grad_norm": 0.53125, |
| "learning_rate": 1.9697521327602928e-05, |
| "loss": 1.0275, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.10687123480195881, |
| "grad_norm": 0.28515625, |
| "learning_rate": 1.9691707243120386e-05, |
| "loss": 0.869, |
| "step": 14600 |
| }, |
| { |
| "epoch": 0.10760322956087634, |
| "grad_norm": 1.3125, |
| "learning_rate": 1.9685838686263998e-05, |
| "loss": 0.8713, |
| "step": 14700 |
| }, |
| { |
| "epoch": 0.10833522431979387, |
| "grad_norm": 244.0, |
| "learning_rate": 1.9679915690018062e-05, |
| "loss": 1.0574, |
| "step": 14800 |
| }, |
| { |
| "epoch": 0.10906721907871139, |
| "grad_norm": 0.298828125, |
| "learning_rate": 1.9673938287672865e-05, |
| "loss": 0.8997, |
| "step": 14900 |
| }, |
| { |
| "epoch": 0.10979921383762892, |
| "grad_norm": 0.76171875, |
| "learning_rate": 1.966790651282447e-05, |
| "loss": 1.2234, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.11053120859654644, |
| "grad_norm": 0.2060546875, |
| "learning_rate": 1.9661820399374564e-05, |
| "loss": 0.8861, |
| "step": 15100 |
| }, |
| { |
| "epoch": 0.11126320335546397, |
| "grad_norm": 9.0, |
| "learning_rate": 1.9655679981530224e-05, |
| "loss": 0.9659, |
| "step": 15200 |
| }, |
| { |
| "epoch": 0.1119951981143815, |
| "grad_norm": 142.0, |
| "learning_rate": 1.964948529380375e-05, |
| "loss": 1.0234, |
| "step": 15300 |
| }, |
| { |
| "epoch": 0.11272719287329902, |
| "grad_norm": 0.171875, |
| "learning_rate": 1.964323637101247e-05, |
| "loss": 1.011, |
| "step": 15400 |
| }, |
| { |
| "epoch": 0.11345918763221656, |
| "grad_norm": 0.65625, |
| "learning_rate": 1.9636933248278545e-05, |
| "loss": 0.9565, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.11419118239113409, |
| "grad_norm": 76.0, |
| "learning_rate": 1.9630575961028765e-05, |
| "loss": 0.9768, |
| "step": 15600 |
| }, |
| { |
| "epoch": 0.11492317715005161, |
| "grad_norm": 242.0, |
| "learning_rate": 1.9624164544994343e-05, |
| "loss": 0.7916, |
| "step": 15700 |
| }, |
| { |
| "epoch": 0.11565517190896914, |
| "grad_norm": 160.0, |
| "learning_rate": 1.9617699036210737e-05, |
| "loss": 0.8392, |
| "step": 15800 |
| }, |
| { |
| "epoch": 0.11638716666788666, |
| "grad_norm": 0.5390625, |
| "learning_rate": 1.9611179471017423e-05, |
| "loss": 0.8403, |
| "step": 15900 |
| }, |
| { |
| "epoch": 0.11711916142680419, |
| "grad_norm": 3.453125, |
| "learning_rate": 1.9604605886057712e-05, |
| "loss": 0.7843, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.11711916142680419, |
| "eval_loss": 0.9412841796875, |
| "eval_runtime": 27.963, |
| "eval_samples_per_second": 17.881, |
| "eval_steps_per_second": 17.881, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.11785115618572171, |
| "grad_norm": 0.67578125, |
| "learning_rate": 1.9597978318278523e-05, |
| "loss": 1.0179, |
| "step": 16100 |
| }, |
| { |
| "epoch": 0.11858315094463924, |
| "grad_norm": 116.0, |
| "learning_rate": 1.9591296804930198e-05, |
| "loss": 0.9158, |
| "step": 16200 |
| }, |
| { |
| "epoch": 0.11931514570355677, |
| "grad_norm": 9.875, |
| "learning_rate": 1.958456138356627e-05, |
| "loss": 0.9174, |
| "step": 16300 |
| }, |
| { |
| "epoch": 0.12004714046247429, |
| "grad_norm": 1.1953125, |
| "learning_rate": 1.957777209204327e-05, |
| "loss": 0.942, |
| "step": 16400 |
| }, |
| { |
| "epoch": 0.12077913522139182, |
| "grad_norm": 288.0, |
| "learning_rate": 1.95709289685205e-05, |
| "loss": 0.8128, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.12151112998030934, |
| "grad_norm": 117.0, |
| "learning_rate": 1.956403205145984e-05, |
| "loss": 1.0152, |
| "step": 16600 |
| }, |
| { |
| "epoch": 0.12224312473922687, |
| "grad_norm": 90.0, |
| "learning_rate": 1.9557081379625494e-05, |
| "loss": 0.809, |
| "step": 16700 |
| }, |
| { |
| "epoch": 0.1229751194981444, |
| "grad_norm": 0.050537109375, |
| "learning_rate": 1.9550076992083818e-05, |
| "loss": 0.7162, |
| "step": 16800 |
| }, |
| { |
| "epoch": 0.12370711425706192, |
| "grad_norm": 0.18359375, |
| "learning_rate": 1.9543018928203066e-05, |
| "loss": 0.7201, |
| "step": 16900 |
| }, |
| { |
| "epoch": 0.12443910901597945, |
| "grad_norm": 0.1748046875, |
| "learning_rate": 1.9535907227653182e-05, |
| "loss": 1.279, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.12517110377489696, |
| "grad_norm": 0.1708984375, |
| "learning_rate": 1.952874193040558e-05, |
| "loss": 0.5654, |
| "step": 17100 |
| }, |
| { |
| "epoch": 0.1259030985338145, |
| "grad_norm": 0.0673828125, |
| "learning_rate": 1.9521523076732903e-05, |
| "loss": 0.7602, |
| "step": 17200 |
| }, |
| { |
| "epoch": 0.12663509329273204, |
| "grad_norm": 3760.0, |
| "learning_rate": 1.951425070720883e-05, |
| "loss": 0.9334, |
| "step": 17300 |
| }, |
| { |
| "epoch": 0.12736708805164956, |
| "grad_norm": 93.0, |
| "learning_rate": 1.9506924862707804e-05, |
| "loss": 1.1316, |
| "step": 17400 |
| }, |
| { |
| "epoch": 0.1280990828105671, |
| "grad_norm": 0.28125, |
| "learning_rate": 1.949954558440484e-05, |
| "loss": 1.0999, |
| "step": 17500 |
| }, |
| { |
| "epoch": 0.12883107756948461, |
| "grad_norm": 0.1806640625, |
| "learning_rate": 1.9492112913775273e-05, |
| "loss": 0.929, |
| "step": 17600 |
| }, |
| { |
| "epoch": 0.12956307232840214, |
| "grad_norm": 3904.0, |
| "learning_rate": 1.9484626892594525e-05, |
| "loss": 0.7699, |
| "step": 17700 |
| }, |
| { |
| "epoch": 0.13029506708731967, |
| "grad_norm": 0.447265625, |
| "learning_rate": 1.9477087562937888e-05, |
| "loss": 1.0148, |
| "step": 17800 |
| }, |
| { |
| "epoch": 0.1310270618462372, |
| "grad_norm": 8.25, |
| "learning_rate": 1.9469494967180262e-05, |
| "loss": 0.8446, |
| "step": 17900 |
| }, |
| { |
| "epoch": 0.13175905660515472, |
| "grad_norm": 148.0, |
| "learning_rate": 1.9461849147995942e-05, |
| "loss": 0.8187, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.13175905660515472, |
| "eval_loss": 0.8699705600738525, |
| "eval_runtime": 27.9351, |
| "eval_samples_per_second": 17.899, |
| "eval_steps_per_second": 17.899, |
| "step": 18000 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 136613, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 2000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 7.742241467994931e+16, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |