{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 14.0, "eval_steps": 500, "global_step": 14728, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09505703422053231, "grad_norm": 0.9250678420066833, "learning_rate": 3.95882818685669e-05, "loss": 6.6663, "step": 100 }, { "epoch": 0.19011406844106463, "grad_norm": 1.117205262184143, "learning_rate": 7.91765637371338e-05, "loss": 4.145, "step": 200 }, { "epoch": 0.28517110266159695, "grad_norm": 1.3325448036193848, "learning_rate": 0.00011876484560570071, "loss": 3.8608, "step": 300 }, { "epoch": 0.38022813688212925, "grad_norm": 0.9613128900527954, "learning_rate": 0.0001583531274742676, "loss": 3.768, "step": 400 }, { "epoch": 0.4752851711026616, "grad_norm": 1.1198444366455078, "learning_rate": 0.00019794140934283454, "loss": 3.7093, "step": 500 }, { "epoch": 0.5703422053231939, "grad_norm": 1.0210366249084473, "learning_rate": 0.00023752969121140142, "loss": 3.6664, "step": 600 }, { "epoch": 0.6653992395437263, "grad_norm": 0.9687440395355225, "learning_rate": 0.00027711797307996834, "loss": 3.5409, "step": 700 }, { "epoch": 0.7604562737642585, "grad_norm": 1.4981633424758911, "learning_rate": 0.0003167062549485352, "loss": 3.3992, "step": 800 }, { "epoch": 0.8555133079847909, "grad_norm": 0.8627603650093079, "learning_rate": 0.00035629453681710216, "loss": 3.1727, "step": 900 }, { "epoch": 0.9505703422053232, "grad_norm": 0.9925593733787537, "learning_rate": 0.0003958828186856691, "loss": 2.9724, "step": 1000 }, { "epoch": 1.0, "eval_loss": 2.6960370540618896, "eval_runtime": 3.7345, "eval_samples_per_second": 1897.152, "eval_steps_per_second": 118.622, "step": 1052 }, { "epoch": 1.0456273764258555, "grad_norm": 0.9267619848251343, "learning_rate": 0.00043547110055423594, "loss": 2.7022, "step": 1100 }, { "epoch": 1.1406844106463878, "grad_norm": 0.7666485905647278, "learning_rate": 0.00047505938242280285, "loss": 2.5641, "step": 1200 }, { "epoch": 1.2357414448669202, "grad_norm": 0.5969619154930115, "learning_rate": 0.0004990645699549983, "loss": 2.5058, "step": 1300 }, { "epoch": 1.3307984790874525, "grad_norm": 0.7782655358314514, "learning_rate": 0.0004965363806441826, "loss": 2.4665, "step": 1400 }, { "epoch": 1.4258555133079849, "grad_norm": 0.8928040266036987, "learning_rate": 0.000494008191333367, "loss": 2.4311, "step": 1500 }, { "epoch": 1.5209125475285172, "grad_norm": 0.8687949180603027, "learning_rate": 0.0004914800020225515, "loss": 2.3964, "step": 1600 }, { "epoch": 1.6159695817490496, "grad_norm": 0.6245518922805786, "learning_rate": 0.0004889518127117359, "loss": 2.374, "step": 1700 }, { "epoch": 1.7110266159695817, "grad_norm": 0.6903976202011108, "learning_rate": 0.0004864236234009203, "loss": 2.3606, "step": 1800 }, { "epoch": 1.806083650190114, "grad_norm": 0.8996257781982422, "learning_rate": 0.00048389543409010466, "loss": 2.3376, "step": 1900 }, { "epoch": 1.9011406844106464, "grad_norm": 0.734466016292572, "learning_rate": 0.0004813672447792891, "loss": 2.3226, "step": 2000 }, { "epoch": 1.9961977186311786, "grad_norm": 0.6836825013160706, "learning_rate": 0.0004788390554684735, "loss": 2.3108, "step": 2100 }, { "epoch": 2.0, "eval_loss": 2.285733461380005, "eval_runtime": 3.623, "eval_samples_per_second": 1955.579, "eval_steps_per_second": 122.275, "step": 2104 }, { "epoch": 2.091254752851711, "grad_norm": 0.5974160432815552, "learning_rate": 0.0004763108661576579, "loss": 2.2585, "step": 2200 }, { "epoch": 2.1863117870722433, "grad_norm": 0.788093626499176, "learning_rate": 0.0004737826768468423, "loss": 2.264, "step": 2300 }, { "epoch": 2.2813688212927756, "grad_norm": 0.7451100945472717, "learning_rate": 0.00047125448753602674, "loss": 2.2504, "step": 2400 }, { "epoch": 2.376425855513308, "grad_norm": 0.6724629998207092, "learning_rate": 0.0004687262982252111, "loss": 2.2358, "step": 2500 }, { "epoch": 2.4714828897338403, "grad_norm": 0.6606141924858093, "learning_rate": 0.00046619810891439554, "loss": 2.2301, "step": 2600 }, { "epoch": 2.5665399239543727, "grad_norm": 0.6599621772766113, "learning_rate": 0.0004636699196035799, "loss": 2.2268, "step": 2700 }, { "epoch": 2.661596958174905, "grad_norm": 0.6633493304252625, "learning_rate": 0.00046114173029276434, "loss": 2.2247, "step": 2800 }, { "epoch": 2.7566539923954374, "grad_norm": 0.6308265328407288, "learning_rate": 0.00045861354098194877, "loss": 2.2221, "step": 2900 }, { "epoch": 2.8517110266159698, "grad_norm": 0.6383451223373413, "learning_rate": 0.00045608535167113314, "loss": 2.2274, "step": 3000 }, { "epoch": 2.9467680608365017, "grad_norm": 0.61512291431427, "learning_rate": 0.00045355716236031757, "loss": 2.2067, "step": 3100 }, { "epoch": 3.0, "eval_loss": 2.2008087635040283, "eval_runtime": 3.5613, "eval_samples_per_second": 1989.445, "eval_steps_per_second": 124.393, "step": 3156 }, { "epoch": 3.041825095057034, "grad_norm": 0.7461186647415161, "learning_rate": 0.00045102897304950194, "loss": 2.1882, "step": 3200 }, { "epoch": 3.1368821292775664, "grad_norm": 0.6590662598609924, "learning_rate": 0.00044850078373868637, "loss": 2.1662, "step": 3300 }, { "epoch": 3.2319391634980987, "grad_norm": 0.5832785964012146, "learning_rate": 0.00044597259442787074, "loss": 2.1603, "step": 3400 }, { "epoch": 3.326996197718631, "grad_norm": 0.6356543898582458, "learning_rate": 0.00044344440511705517, "loss": 2.1601, "step": 3500 }, { "epoch": 3.4220532319391634, "grad_norm": 0.7197031378746033, "learning_rate": 0.0004409162158062396, "loss": 2.1567, "step": 3600 }, { "epoch": 3.517110266159696, "grad_norm": 0.5856086611747742, "learning_rate": 0.00043838802649542397, "loss": 2.1588, "step": 3700 }, { "epoch": 3.612167300380228, "grad_norm": 0.6212655305862427, "learning_rate": 0.00043585983718460834, "loss": 2.1565, "step": 3800 }, { "epoch": 3.7072243346007605, "grad_norm": 0.6765671968460083, "learning_rate": 0.0004333316478737928, "loss": 2.1667, "step": 3900 }, { "epoch": 3.802281368821293, "grad_norm": 0.6720090508460999, "learning_rate": 0.0004308034585629772, "loss": 2.1675, "step": 4000 }, { "epoch": 3.897338403041825, "grad_norm": 0.7150991559028625, "learning_rate": 0.00042827526925216157, "loss": 2.1474, "step": 4100 }, { "epoch": 3.9923954372623576, "grad_norm": 0.5831249356269836, "learning_rate": 0.00042574707994134605, "loss": 2.1485, "step": 4200 }, { "epoch": 4.0, "eval_loss": 2.15364408493042, "eval_runtime": 3.644, "eval_samples_per_second": 1944.292, "eval_steps_per_second": 121.57, "step": 4208 }, { "epoch": 4.08745247148289, "grad_norm": 0.6653150916099548, "learning_rate": 0.0004232188906305304, "loss": 2.0899, "step": 4300 }, { "epoch": 4.182509505703422, "grad_norm": 0.7235066294670105, "learning_rate": 0.0004206907013197148, "loss": 2.0982, "step": 4400 }, { "epoch": 4.277566539923955, "grad_norm": 0.7326545715332031, "learning_rate": 0.0004181625120088992, "loss": 2.1007, "step": 4500 }, { "epoch": 4.3726235741444865, "grad_norm": 0.6236776113510132, "learning_rate": 0.00041563432269808365, "loss": 2.1031, "step": 4600 }, { "epoch": 4.467680608365019, "grad_norm": 0.5669475197792053, "learning_rate": 0.000413106133387268, "loss": 2.1087, "step": 4700 }, { "epoch": 4.562737642585551, "grad_norm": 0.5483006834983826, "learning_rate": 0.00041057794407645245, "loss": 2.1034, "step": 4800 }, { "epoch": 4.657794676806084, "grad_norm": 0.5456926822662354, "learning_rate": 0.0004080497547656369, "loss": 2.1065, "step": 4900 }, { "epoch": 4.752851711026616, "grad_norm": 0.9545803666114807, "learning_rate": 0.00040552156545482125, "loss": 2.1168, "step": 5000 }, { "epoch": 4.847908745247148, "grad_norm": 0.5378767251968384, "learning_rate": 0.0004029933761440057, "loss": 2.1107, "step": 5100 }, { "epoch": 4.942965779467681, "grad_norm": 0.629880964756012, "learning_rate": 0.00040046518683319005, "loss": 2.0983, "step": 5200 }, { "epoch": 5.0, "eval_loss": 2.132718801498413, "eval_runtime": 3.6373, "eval_samples_per_second": 1947.857, "eval_steps_per_second": 121.793, "step": 5260 }, { "epoch": 5.038022813688213, "grad_norm": 0.5900342464447021, "learning_rate": 0.0003979369975223745, "loss": 2.0758, "step": 5300 }, { "epoch": 5.133079847908745, "grad_norm": 0.6181082129478455, "learning_rate": 0.0003954088082115589, "loss": 2.041, "step": 5400 }, { "epoch": 5.228136882129277, "grad_norm": 0.6756412386894226, "learning_rate": 0.0003928806189007433, "loss": 2.0548, "step": 5500 }, { "epoch": 5.32319391634981, "grad_norm": 0.6649320125579834, "learning_rate": 0.0003903524295899277, "loss": 2.0438, "step": 5600 }, { "epoch": 5.418250950570342, "grad_norm": 0.5628513693809509, "learning_rate": 0.00038782424027911214, "loss": 2.0485, "step": 5700 }, { "epoch": 5.513307984790875, "grad_norm": 0.6923677921295166, "learning_rate": 0.0003852960509682965, "loss": 2.063, "step": 5800 }, { "epoch": 5.608365019011407, "grad_norm": 0.6819363236427307, "learning_rate": 0.0003827678616574809, "loss": 2.0618, "step": 5900 }, { "epoch": 5.7034220532319395, "grad_norm": 0.6446284055709839, "learning_rate": 0.00038023967234666537, "loss": 2.0674, "step": 6000 }, { "epoch": 5.798479087452471, "grad_norm": 0.6319680213928223, "learning_rate": 0.00037771148303584974, "loss": 2.061, "step": 6100 }, { "epoch": 5.893536121673003, "grad_norm": 0.6318814754486084, "learning_rate": 0.0003751832937250341, "loss": 2.0656, "step": 6200 }, { "epoch": 5.988593155893536, "grad_norm": 0.6261875033378601, "learning_rate": 0.0003726551044142186, "loss": 2.0663, "step": 6300 }, { "epoch": 6.0, "eval_loss": 2.1098814010620117, "eval_runtime": 3.698, "eval_samples_per_second": 1915.889, "eval_steps_per_second": 119.794, "step": 6312 }, { "epoch": 6.083650190114068, "grad_norm": 0.6620230674743652, "learning_rate": 0.00037012691510340297, "loss": 1.9996, "step": 6400 }, { "epoch": 6.178707224334601, "grad_norm": 1.0794607400894165, "learning_rate": 0.00036759872579258734, "loss": 2.0018, "step": 6500 }, { "epoch": 6.273764258555133, "grad_norm": 1.372861385345459, "learning_rate": 0.00036507053648177177, "loss": 2.0059, "step": 6600 }, { "epoch": 6.3688212927756656, "grad_norm": 0.5926664471626282, "learning_rate": 0.0003625423471709562, "loss": 2.012, "step": 6700 }, { "epoch": 6.4638783269961975, "grad_norm": 0.7855852246284485, "learning_rate": 0.00036001415786014057, "loss": 2.0128, "step": 6800 }, { "epoch": 6.55893536121673, "grad_norm": 0.6684075593948364, "learning_rate": 0.000357485968549325, "loss": 2.0221, "step": 6900 }, { "epoch": 6.653992395437262, "grad_norm": 0.628013014793396, "learning_rate": 0.00035495777923850937, "loss": 2.0159, "step": 7000 }, { "epoch": 6.749049429657795, "grad_norm": 0.7943947911262512, "learning_rate": 0.0003524295899276938, "loss": 2.0223, "step": 7100 }, { "epoch": 6.844106463878327, "grad_norm": 0.645799994468689, "learning_rate": 0.0003499014006168782, "loss": 2.0206, "step": 7200 }, { "epoch": 6.93916349809886, "grad_norm": 0.6603648066520691, "learning_rate": 0.0003473732113060626, "loss": 2.0304, "step": 7300 }, { "epoch": 7.0, "eval_loss": 2.099062919616699, "eval_runtime": 3.631, "eval_samples_per_second": 1951.251, "eval_steps_per_second": 122.005, "step": 7364 }, { "epoch": 7.034220532319392, "grad_norm": 0.6082973480224609, "learning_rate": 0.000344845021995247, "loss": 2.0039, "step": 7400 }, { "epoch": 7.129277566539924, "grad_norm": 0.673995852470398, "learning_rate": 0.0003423168326844314, "loss": 1.9663, "step": 7500 }, { "epoch": 7.224334600760456, "grad_norm": 0.675037682056427, "learning_rate": 0.0003397886433736158, "loss": 1.9696, "step": 7600 }, { "epoch": 7.319391634980988, "grad_norm": 0.6488978266716003, "learning_rate": 0.0003372604540628002, "loss": 1.9701, "step": 7700 }, { "epoch": 7.414448669201521, "grad_norm": 0.8255399465560913, "learning_rate": 0.0003347322647519846, "loss": 1.9654, "step": 7800 }, { "epoch": 7.509505703422053, "grad_norm": 1.2661654949188232, "learning_rate": 0.00033220407544116905, "loss": 1.9736, "step": 7900 }, { "epoch": 7.604562737642586, "grad_norm": 0.6545805335044861, "learning_rate": 0.0003296758861303534, "loss": 1.9783, "step": 8000 }, { "epoch": 7.699619771863118, "grad_norm": 0.8890361189842224, "learning_rate": 0.00032714769681953785, "loss": 1.9807, "step": 8100 }, { "epoch": 7.79467680608365, "grad_norm": 0.6547899842262268, "learning_rate": 0.0003246195075087223, "loss": 1.9723, "step": 8200 }, { "epoch": 7.889733840304182, "grad_norm": 1.1239402294158936, "learning_rate": 0.00032209131819790665, "loss": 1.9734, "step": 8300 }, { "epoch": 7.984790874524715, "grad_norm": 0.6624830961227417, "learning_rate": 0.000319563128887091, "loss": 1.9869, "step": 8400 }, { "epoch": 8.0, "eval_loss": 2.1034328937530518, "eval_runtime": 3.6013, "eval_samples_per_second": 1967.337, "eval_steps_per_second": 123.011, "step": 8416 }, { "epoch": 8.079847908745247, "grad_norm": 0.6550971269607544, "learning_rate": 0.0003170349395762755, "loss": 1.9223, "step": 8500 }, { "epoch": 8.17490494296578, "grad_norm": 0.660987138748169, "learning_rate": 0.0003145067502654599, "loss": 1.9245, "step": 8600 }, { "epoch": 8.269961977186313, "grad_norm": 0.759884774684906, "learning_rate": 0.00031197856095464425, "loss": 1.9235, "step": 8700 }, { "epoch": 8.365019011406844, "grad_norm": 0.9319919347763062, "learning_rate": 0.00030945037164382874, "loss": 1.9239, "step": 8800 }, { "epoch": 8.460076045627376, "grad_norm": 0.6610597968101501, "learning_rate": 0.0003069221823330131, "loss": 1.928, "step": 8900 }, { "epoch": 8.55513307984791, "grad_norm": 0.7076143622398376, "learning_rate": 0.0003043939930221975, "loss": 1.9289, "step": 9000 }, { "epoch": 8.65019011406844, "grad_norm": 0.6368849873542786, "learning_rate": 0.0003018658037113819, "loss": 1.932, "step": 9100 }, { "epoch": 8.745247148288973, "grad_norm": 0.7639185786247253, "learning_rate": 0.00029933761440056634, "loss": 1.9485, "step": 9200 }, { "epoch": 8.840304182509506, "grad_norm": 1.0823330879211426, "learning_rate": 0.0002968094250897507, "loss": 1.9447, "step": 9300 }, { "epoch": 8.935361216730039, "grad_norm": 0.8542035222053528, "learning_rate": 0.00029428123577893514, "loss": 1.942, "step": 9400 }, { "epoch": 9.0, "eval_loss": 2.0947535037994385, "eval_runtime": 3.6147, "eval_samples_per_second": 1960.063, "eval_steps_per_second": 122.556, "step": 9468 }, { "epoch": 9.03041825095057, "grad_norm": 0.7601971626281738, "learning_rate": 0.00029175304646811956, "loss": 1.9243, "step": 9500 }, { "epoch": 9.125475285171103, "grad_norm": 0.7461040019989014, "learning_rate": 0.00028922485715730394, "loss": 1.8704, "step": 9600 }, { "epoch": 9.220532319391635, "grad_norm": 0.7719326019287109, "learning_rate": 0.00028669666784648836, "loss": 1.8832, "step": 9700 }, { "epoch": 9.315589353612168, "grad_norm": 0.716136634349823, "learning_rate": 0.00028416847853567274, "loss": 1.8787, "step": 9800 }, { "epoch": 9.4106463878327, "grad_norm": 0.6928532123565674, "learning_rate": 0.00028164028922485717, "loss": 1.8855, "step": 9900 }, { "epoch": 9.505703422053232, "grad_norm": 0.7696681618690491, "learning_rate": 0.0002791120999140416, "loss": 1.8855, "step": 10000 }, { "epoch": 9.600760456273765, "grad_norm": 0.8969391584396362, "learning_rate": 0.00027658391060322597, "loss": 1.9034, "step": 10100 }, { "epoch": 9.695817490494296, "grad_norm": 0.8469530940055847, "learning_rate": 0.00027405572129241034, "loss": 1.8965, "step": 10200 }, { "epoch": 9.790874524714829, "grad_norm": 0.7956866025924683, "learning_rate": 0.0002715275319815948, "loss": 1.9087, "step": 10300 }, { "epoch": 9.885931558935361, "grad_norm": 0.8293343782424927, "learning_rate": 0.0002689993426707792, "loss": 1.9177, "step": 10400 }, { "epoch": 9.980988593155894, "grad_norm": 0.7472631931304932, "learning_rate": 0.00026647115335996357, "loss": 1.9082, "step": 10500 }, { "epoch": 10.0, "eval_loss": 2.097904920578003, "eval_runtime": 3.5592, "eval_samples_per_second": 1990.641, "eval_steps_per_second": 124.468, "step": 10520 }, { "epoch": 10.076045627376425, "grad_norm": 0.7787309288978577, "learning_rate": 0.00026394296404914805, "loss": 1.8393, "step": 10600 }, { "epoch": 10.171102661596958, "grad_norm": 1.3328174352645874, "learning_rate": 0.0002614147747383324, "loss": 1.8283, "step": 10700 }, { "epoch": 10.26615969581749, "grad_norm": 0.7740694284439087, "learning_rate": 0.0002588865854275168, "loss": 1.8422, "step": 10800 }, { "epoch": 10.361216730038024, "grad_norm": 0.828940749168396, "learning_rate": 0.0002563583961167012, "loss": 1.8516, "step": 10900 }, { "epoch": 10.456273764258555, "grad_norm": 0.751752495765686, "learning_rate": 0.00025383020680588565, "loss": 1.8624, "step": 11000 }, { "epoch": 10.551330798479087, "grad_norm": 0.9940192103385925, "learning_rate": 0.00025130201749507, "loss": 1.8599, "step": 11100 }, { "epoch": 10.64638783269962, "grad_norm": 0.8591569066047668, "learning_rate": 0.00024877382818425445, "loss": 1.8581, "step": 11200 }, { "epoch": 10.741444866920151, "grad_norm": 0.7676281332969666, "learning_rate": 0.0002462456388734388, "loss": 1.8637, "step": 11300 }, { "epoch": 10.836501901140684, "grad_norm": 0.7896871566772461, "learning_rate": 0.00024371744956262325, "loss": 1.8606, "step": 11400 }, { "epoch": 10.931558935361217, "grad_norm": 0.8302274942398071, "learning_rate": 0.00024118926025180765, "loss": 1.8656, "step": 11500 }, { "epoch": 11.0, "eval_loss": 2.0961618423461914, "eval_runtime": 3.6362, "eval_samples_per_second": 1948.473, "eval_steps_per_second": 121.831, "step": 11572 }, { "epoch": 11.02661596958175, "grad_norm": 0.8891871571540833, "learning_rate": 0.00023866107094099208, "loss": 1.8522, "step": 11600 }, { "epoch": 11.12167300380228, "grad_norm": 0.7549653649330139, "learning_rate": 0.00023613288163017645, "loss": 1.7913, "step": 11700 }, { "epoch": 11.216730038022813, "grad_norm": 0.8127674460411072, "learning_rate": 0.00023360469231936088, "loss": 1.8102, "step": 11800 }, { "epoch": 11.311787072243346, "grad_norm": 0.841659426689148, "learning_rate": 0.0002310765030085453, "loss": 1.803, "step": 11900 }, { "epoch": 11.406844106463879, "grad_norm": 0.8460645079612732, "learning_rate": 0.00022854831369772968, "loss": 1.8201, "step": 12000 }, { "epoch": 11.50190114068441, "grad_norm": 0.7932580709457397, "learning_rate": 0.0002260201243869141, "loss": 1.811, "step": 12100 }, { "epoch": 11.596958174904943, "grad_norm": 0.8419378399848938, "learning_rate": 0.0002234919350760985, "loss": 1.8145, "step": 12200 }, { "epoch": 11.692015209125476, "grad_norm": 0.8346748352050781, "learning_rate": 0.0002209637457652829, "loss": 1.8328, "step": 12300 }, { "epoch": 11.787072243346007, "grad_norm": 1.019510269165039, "learning_rate": 0.0002184355564544673, "loss": 1.8257, "step": 12400 }, { "epoch": 11.88212927756654, "grad_norm": 0.8175719976425171, "learning_rate": 0.00021590736714365173, "loss": 1.8274, "step": 12500 }, { "epoch": 11.977186311787072, "grad_norm": 0.7476153373718262, "learning_rate": 0.00021337917783283614, "loss": 1.8361, "step": 12600 }, { "epoch": 12.0, "eval_loss": 2.1029505729675293, "eval_runtime": 3.5932, "eval_samples_per_second": 1971.782, "eval_steps_per_second": 123.289, "step": 12624 }, { "epoch": 12.072243346007605, "grad_norm": 0.8637651205062866, "learning_rate": 0.00021085098852202054, "loss": 1.7684, "step": 12700 }, { "epoch": 12.167300380228136, "grad_norm": 0.80800461769104, "learning_rate": 0.00020832279921120496, "loss": 1.7703, "step": 12800 }, { "epoch": 12.262357414448669, "grad_norm": 1.0111021995544434, "learning_rate": 0.00020579460990038934, "loss": 1.7809, "step": 12900 }, { "epoch": 12.357414448669202, "grad_norm": 0.8477798700332642, "learning_rate": 0.00020326642058957376, "loss": 1.7795, "step": 13000 }, { "epoch": 12.452471482889734, "grad_norm": 0.8284028172492981, "learning_rate": 0.00020073823127875814, "loss": 1.7803, "step": 13100 }, { "epoch": 12.547528517110266, "grad_norm": 0.7752136588096619, "learning_rate": 0.00019821004196794256, "loss": 1.7836, "step": 13200 }, { "epoch": 12.642585551330798, "grad_norm": 0.8929184675216675, "learning_rate": 0.00019568185265712696, "loss": 1.7724, "step": 13300 }, { "epoch": 12.737642585551331, "grad_norm": 0.8475900888442993, "learning_rate": 0.00019315366334631136, "loss": 1.7891, "step": 13400 }, { "epoch": 12.832699619771864, "grad_norm": 0.9029939770698547, "learning_rate": 0.0001906254740354958, "loss": 1.7888, "step": 13500 }, { "epoch": 12.927756653992395, "grad_norm": 0.841206967830658, "learning_rate": 0.0001880972847246802, "loss": 1.8005, "step": 13600 }, { "epoch": 13.0, "eval_loss": 2.1176211833953857, "eval_runtime": 3.6226, "eval_samples_per_second": 1955.796, "eval_steps_per_second": 122.289, "step": 13676 }, { "epoch": 13.022813688212928, "grad_norm": 0.786509096622467, "learning_rate": 0.0001855690954138646, "loss": 1.7784, "step": 13700 }, { "epoch": 13.11787072243346, "grad_norm": 0.8644747734069824, "learning_rate": 0.000183040906103049, "loss": 1.7234, "step": 13800 }, { "epoch": 13.212927756653992, "grad_norm": 0.8760172128677368, "learning_rate": 0.00018051271679223342, "loss": 1.7308, "step": 13900 }, { "epoch": 13.307984790874524, "grad_norm": 0.7858941555023193, "learning_rate": 0.0001779845274814178, "loss": 1.7318, "step": 14000 }, { "epoch": 13.403041825095057, "grad_norm": 0.8771238327026367, "learning_rate": 0.00017545633817060222, "loss": 1.7473, "step": 14100 }, { "epoch": 13.49809885931559, "grad_norm": 0.8886803984642029, "learning_rate": 0.00017292814885978665, "loss": 1.7491, "step": 14200 }, { "epoch": 13.593155893536121, "grad_norm": 0.8704127669334412, "learning_rate": 0.00017039995954897102, "loss": 1.7548, "step": 14300 }, { "epoch": 13.688212927756654, "grad_norm": 1.2635705471038818, "learning_rate": 0.00016787177023815545, "loss": 1.7532, "step": 14400 }, { "epoch": 13.783269961977187, "grad_norm": 0.9218750596046448, "learning_rate": 0.00016534358092733985, "loss": 1.7531, "step": 14500 }, { "epoch": 13.87832699619772, "grad_norm": 0.9513919353485107, "learning_rate": 0.00016281539161652425, "loss": 1.7618, "step": 14600 }, { "epoch": 13.97338403041825, "grad_norm": 1.010962963104248, "learning_rate": 0.00016028720230570865, "loss": 1.7646, "step": 14700 }, { "epoch": 14.0, "eval_loss": 2.130631923675537, "eval_runtime": 3.6539, "eval_samples_per_second": 1938.998, "eval_steps_per_second": 121.239, "step": 14728 } ], "logging_steps": 100, "max_steps": 21040, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.6215157665850184e+16, "train_batch_size": 128, "trial_name": null, "trial_params": null }