| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 5.0, |
| "eval_steps": 500, |
| "global_step": 2075, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.060350030175015085, |
| "grad_norm": 0.7244853377342224, |
| "learning_rate": 0.00011428571428571427, |
| "loss": 1.5091, |
| "mean_token_accuracy": 0.6793113535642624, |
| "num_tokens": 152165.0, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.12070006035003017, |
| "grad_norm": 0.8389242887496948, |
| "learning_rate": 0.0002333333333333333, |
| "loss": 0.8436, |
| "mean_token_accuracy": 0.7881802421808243, |
| "num_tokens": 267390.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.18105009052504525, |
| "grad_norm": 0.6344988942146301, |
| "learning_rate": 0.00029997787517981614, |
| "loss": 0.5527, |
| "mean_token_accuracy": 0.8469069242477417, |
| "num_tokens": 420975.0, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.24140012070006034, |
| "grad_norm": 0.7947192192077637, |
| "learning_rate": 0.0002997630832860032, |
| "loss": 0.4522, |
| "mean_token_accuracy": 0.870941441655159, |
| "num_tokens": 538380.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.30175015087507545, |
| "grad_norm": 0.43716728687286377, |
| "learning_rate": 0.0002993201135681549, |
| "loss": 0.3049, |
| "mean_token_accuracy": 0.9136220461130142, |
| "num_tokens": 690650.0, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.3621001810500905, |
| "grad_norm": 1.09097421169281, |
| "learning_rate": 0.0002986496409313553, |
| "loss": 0.3172, |
| "mean_token_accuracy": 0.91127048432827, |
| "num_tokens": 806066.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.4224502112251056, |
| "grad_norm": 0.3773705065250397, |
| "learning_rate": 0.0002977526869022985, |
| "loss": 0.2029, |
| "mean_token_accuracy": 0.9433162885904313, |
| "num_tokens": 960853.0, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.4828002414001207, |
| "grad_norm": 0.8292771577835083, |
| "learning_rate": 0.0002966306180728982, |
| "loss": 0.2274, |
| "mean_token_accuracy": 0.9385988712310791, |
| "num_tokens": 1077726.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.5431502715751357, |
| "grad_norm": 0.4765889346599579, |
| "learning_rate": 0.0002952851440181598, |
| "loss": 0.19, |
| "mean_token_accuracy": 0.9479016721248626, |
| "num_tokens": 1232263.0, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.6035003017501509, |
| "grad_norm": 0.9254749417304993, |
| "learning_rate": 0.0002937183146914856, |
| "loss": 0.1826, |
| "mean_token_accuracy": 0.9498224484920502, |
| "num_tokens": 1349057.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.663850331925166, |
| "grad_norm": 0.4938018023967743, |
| "learning_rate": 0.000291932517301382, |
| "loss": 0.1497, |
| "mean_token_accuracy": 0.9588899296522141, |
| "num_tokens": 1496867.0, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.724200362100181, |
| "grad_norm": 0.6995358467102051, |
| "learning_rate": 0.00028993047267432864, |
| "loss": 0.1578, |
| "mean_token_accuracy": 0.9568761509656906, |
| "num_tokens": 1610727.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.7845503922751962, |
| "grad_norm": 0.46799567341804504, |
| "learning_rate": 0.0002877152311093483, |
| "loss": 0.1351, |
| "mean_token_accuracy": 0.9633717983961105, |
| "num_tokens": 1762041.0, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.8449004224502112, |
| "grad_norm": 0.6729409098625183, |
| "learning_rate": 0.00028529016773059656, |
| "loss": 0.1206, |
| "mean_token_accuracy": 0.9687577307224273, |
| "num_tokens": 1877965.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.9052504526252263, |
| "grad_norm": 0.5820412635803223, |
| "learning_rate": 0.00028265897734504976, |
| "loss": 0.1183, |
| "mean_token_accuracy": 0.96822787463665, |
| "num_tokens": 2028343.0, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.9656004828002414, |
| "grad_norm": 0.8604497909545898, |
| "learning_rate": 0.0002798256688131267, |
| "loss": 0.1159, |
| "mean_token_accuracy": 0.9700725018978119, |
| "num_tokens": 2145044.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.1169130727648735, |
| "eval_mean_token_accuracy": 0.9691641559471955, |
| "eval_num_tokens": 2223513.0, |
| "eval_runtime": 60.5832, |
| "eval_samples_per_second": 6.091, |
| "eval_steps_per_second": 3.054, |
| "step": 415 |
| }, |
| { |
| "epoch": 1.024140012070006, |
| "grad_norm": 0.20096616446971893, |
| "learning_rate": 0.0002767945589408217, |
| "loss": 0.122, |
| "mean_token_accuracy": 0.9680000224064306, |
| "num_tokens": 2291746.0, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.0844900422450212, |
| "grad_norm": 0.34665247797966003, |
| "learning_rate": 0.0002735702659026533, |
| "loss": 0.0836, |
| "mean_token_accuracy": 0.9780776232481003, |
| "num_tokens": 2424528.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.1448400724200363, |
| "grad_norm": 0.30349963903427124, |
| "learning_rate": 0.0002701577022054515, |
| "loss": 0.1019, |
| "mean_token_accuracy": 0.9732917118072509, |
| "num_tokens": 2557091.0, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.2051901025950513, |
| "grad_norm": 0.3892677426338196, |
| "learning_rate": 0.0002665620672037014, |
| "loss": 0.0831, |
| "mean_token_accuracy": 0.9782004028558731, |
| "num_tokens": 2691527.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.2655401327700664, |
| "grad_norm": 0.29889699816703796, |
| "learning_rate": 0.0002627888391778493, |
| "loss": 0.1023, |
| "mean_token_accuracy": 0.9729781967401504, |
| "num_tokens": 2824699.0, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.3258901629450814, |
| "grad_norm": 0.393573522567749, |
| "learning_rate": 0.0002588437669876384, |
| "loss": 0.0779, |
| "mean_token_accuracy": 0.9795191860198975, |
| "num_tokens": 2958826.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.3862401931200965, |
| "grad_norm": 0.26299118995666504, |
| "learning_rate": 0.00025473286131319283, |
| "loss": 0.0988, |
| "mean_token_accuracy": 0.9739746767282486, |
| "num_tokens": 3092320.0, |
| "step": 575 |
| }, |
| { |
| "epoch": 1.4465902232951118, |
| "grad_norm": 0.3649594783782959, |
| "learning_rate": 0.0002504623854971937, |
| "loss": 0.0729, |
| "mean_token_accuracy": 0.9814109367132187, |
| "num_tokens": 3227452.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.5069402534701268, |
| "grad_norm": 0.28632357716560364, |
| "learning_rate": 0.00024603884600210097, |
| "loss": 0.0957, |
| "mean_token_accuracy": 0.9748889011144638, |
| "num_tokens": 3361210.0, |
| "step": 625 |
| }, |
| { |
| "epoch": 1.567290283645142, |
| "grad_norm": 0.25492990016937256, |
| "learning_rate": 0.00024146898249695974, |
| "loss": 0.075, |
| "mean_token_accuracy": 0.9806595808267593, |
| "num_tokens": 3497177.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.627640313820157, |
| "grad_norm": 0.37043872475624084, |
| "learning_rate": 0.00023675975758889506, |
| "loss": 0.0918, |
| "mean_token_accuracy": 0.9762868732213974, |
| "num_tokens": 3630834.0, |
| "step": 675 |
| }, |
| { |
| "epoch": 1.687990343995172, |
| "grad_norm": 0.26372411847114563, |
| "learning_rate": 0.00023191834621493968, |
| "loss": 0.0674, |
| "mean_token_accuracy": 0.9826526433229447, |
| "num_tokens": 3766598.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.748340374170187, |
| "grad_norm": 0.2400335669517517, |
| "learning_rate": 0.00022695212471035816, |
| "loss": 0.0807, |
| "mean_token_accuracy": 0.9793906199932099, |
| "num_tokens": 3899644.0, |
| "step": 725 |
| }, |
| { |
| "epoch": 1.8086904043452021, |
| "grad_norm": 0.19833268225193024, |
| "learning_rate": 0.0002218686595701219, |
| "loss": 0.0655, |
| "mean_token_accuracy": 0.9832920217514038, |
| "num_tokens": 4036037.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.8690404345202172, |
| "grad_norm": 0.17969554662704468, |
| "learning_rate": 0.0002166756959206587, |
| "loss": 0.0831, |
| "mean_token_accuracy": 0.9791438663005829, |
| "num_tokens": 4168035.0, |
| "step": 775 |
| }, |
| { |
| "epoch": 1.9293904646952322, |
| "grad_norm": 0.3069966733455658, |
| "learning_rate": 0.00021138114571944054, |
| "loss": 0.0624, |
| "mean_token_accuracy": 0.9839604765176773, |
| "num_tokens": 4302324.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.9897404948702473, |
| "grad_norm": 0.26080530881881714, |
| "learning_rate": 0.000205993075700389, |
| "loss": 0.0728, |
| "mean_token_accuracy": 0.9816776049137116, |
| "num_tokens": 4428521.0, |
| "step": 825 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.07739538699388504, |
| "eval_mean_token_accuracy": 0.9806474750106399, |
| "eval_num_tokens": 4447026.0, |
| "eval_runtime": 60.6735, |
| "eval_samples_per_second": 6.082, |
| "eval_steps_per_second": 3.049, |
| "step": 830 |
| }, |
| { |
| "epoch": 2.048280024140012, |
| "grad_norm": 0.32912909984588623, |
| "learning_rate": 0.00020051969508346498, |
| "loss": 0.0624, |
| "mean_token_accuracy": 0.98369190680612, |
| "num_tokens": 4571335.0, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.1086300543150274, |
| "grad_norm": 0.22884123027324677, |
| "learning_rate": 0.00019496934306716706, |
| "loss": 0.0543, |
| "mean_token_accuracy": 0.9862597143650055, |
| "num_tokens": 4694373.0, |
| "step": 875 |
| }, |
| { |
| "epoch": 2.1689800844900424, |
| "grad_norm": 0.15646718442440033, |
| "learning_rate": 0.00018935047612299625, |
| "loss": 0.0683, |
| "mean_token_accuracy": 0.9817469125986099, |
| "num_tokens": 4840032.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.2293301146650575, |
| "grad_norm": 0.32684165239334106, |
| "learning_rate": 0.00018367165511124414, |
| "loss": 0.0558, |
| "mean_token_accuracy": 0.9862085193395614, |
| "num_tokens": 4962900.0, |
| "step": 925 |
| }, |
| { |
| "epoch": 2.2896801448400725, |
| "grad_norm": 0.15353620052337646, |
| "learning_rate": 0.00017794153223773558, |
| "loss": 0.0649, |
| "mean_token_accuracy": 0.9830775827169418, |
| "num_tokens": 5107775.0, |
| "step": 950 |
| }, |
| { |
| "epoch": 2.3500301750150876, |
| "grad_norm": 0.13864906132221222, |
| "learning_rate": 0.00017216883787139772, |
| "loss": 0.0513, |
| "mean_token_accuracy": 0.9871918082237243, |
| "num_tokens": 5231159.0, |
| "step": 975 |
| }, |
| { |
| "epoch": 2.4103802051901027, |
| "grad_norm": 0.18856066465377808, |
| "learning_rate": 0.00016636236724274, |
| "loss": 0.0653, |
| "mean_token_accuracy": 0.9824860644340515, |
| "num_tokens": 5375658.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.4707302353651177, |
| "grad_norm": 0.1747666597366333, |
| "learning_rate": 0.00016053096704351255, |
| "loss": 0.0536, |
| "mean_token_accuracy": 0.9870379114151001, |
| "num_tokens": 5498792.0, |
| "step": 1025 |
| }, |
| { |
| "epoch": 2.5310802655401328, |
| "grad_norm": 0.08616527169942856, |
| "learning_rate": 0.00015468352194795791, |
| "loss": 0.0605, |
| "mean_token_accuracy": 0.9837486296892166, |
| "num_tokens": 5644155.0, |
| "step": 1050 |
| }, |
| { |
| "epoch": 2.591430295715148, |
| "grad_norm": 0.21047131717205048, |
| "learning_rate": 0.00014882894107619277, |
| "loss": 0.0502, |
| "mean_token_accuracy": 0.9874639976024627, |
| "num_tokens": 5768255.0, |
| "step": 1075 |
| }, |
| { |
| "epoch": 2.651780325890163, |
| "grad_norm": 0.09520892798900604, |
| "learning_rate": 0.00014297614442034518, |
| "loss": 0.0568, |
| "mean_token_accuracy": 0.9851021945476532, |
| "num_tokens": 5913228.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 2.712130356065178, |
| "grad_norm": 0.11644323915243149, |
| "learning_rate": 0.000137134049254126, |
| "loss": 0.0523, |
| "mean_token_accuracy": 0.9867914581298828, |
| "num_tokens": 6037285.0, |
| "step": 1125 |
| }, |
| { |
| "epoch": 2.772480386240193, |
| "grad_norm": 0.12872624397277832, |
| "learning_rate": 0.000131311556546543, |
| "loss": 0.0563, |
| "mean_token_accuracy": 0.9849929654598236, |
| "num_tokens": 6183361.0, |
| "step": 1150 |
| }, |
| { |
| "epoch": 2.832830416415208, |
| "grad_norm": 0.10195529460906982, |
| "learning_rate": 0.0001255175374004563, |
| "loss": 0.0501, |
| "mean_token_accuracy": 0.9871714848279953, |
| "num_tokens": 6305713.0, |
| "step": 1175 |
| }, |
| { |
| "epoch": 2.8931804465902236, |
| "grad_norm": 0.09452041983604431, |
| "learning_rate": 0.0001197608195366377, |
| "loss": 0.0581, |
| "mean_token_accuracy": 0.9840293884277344, |
| "num_tokens": 6451719.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.9535304767652386, |
| "grad_norm": 0.17165224254131317, |
| "learning_rate": 0.00011405017384392655, |
| "loss": 0.049, |
| "mean_token_accuracy": 0.9875269651412963, |
| "num_tokens": 6575211.0, |
| "step": 1225 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.06446010619401932, |
| "eval_mean_token_accuracy": 0.9841987928828677, |
| "eval_num_tokens": 6670539.0, |
| "eval_runtime": 60.4296, |
| "eval_samples_per_second": 6.106, |
| "eval_steps_per_second": 3.061, |
| "step": 1245 |
| }, |
| { |
| "epoch": 3.012070006035003, |
| "grad_norm": 0.08178732544183731, |
| "learning_rate": 0.00010839430101597464, |
| "loss": 0.0535, |
| "mean_token_accuracy": 0.9864560107594913, |
| "num_tokens": 6706527.0, |
| "step": 1250 |
| }, |
| { |
| "epoch": 3.0724200362100182, |
| "grad_norm": 0.0654640942811966, |
| "learning_rate": 0.00010280181829493925, |
| "loss": 0.042, |
| "mean_token_accuracy": 0.9891558569669724, |
| "num_tokens": 6845866.0, |
| "step": 1275 |
| }, |
| { |
| "epoch": 3.1327700663850333, |
| "grad_norm": 0.13900737464427948, |
| "learning_rate": 9.728124634232282e-05, |
| "loss": 0.0496, |
| "mean_token_accuracy": 0.9874085110425949, |
| "num_tokens": 6972947.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 3.1931200965600484, |
| "grad_norm": 0.05376769229769707, |
| "learning_rate": 9.184099625696183e-05, |
| "loss": 0.0415, |
| "mean_token_accuracy": 0.9890899294614792, |
| "num_tokens": 7115975.0, |
| "step": 1325 |
| }, |
| { |
| "epoch": 3.2534701267350634, |
| "grad_norm": 0.12443197518587112, |
| "learning_rate": 8.648935675994459e-05, |
| "loss": 0.0484, |
| "mean_token_accuracy": 0.987565501332283, |
| "num_tokens": 7243324.0, |
| "step": 1350 |
| }, |
| { |
| "epoch": 3.3138201569100785, |
| "grad_norm": 0.0667525976896286, |
| "learning_rate": 8.123448156598283e-05, |
| "loss": 0.0415, |
| "mean_token_accuracy": 0.9890210199356079, |
| "num_tokens": 7385182.0, |
| "step": 1375 |
| }, |
| { |
| "epoch": 3.3741701870850935, |
| "grad_norm": 0.12773087620735168, |
| "learning_rate": 7.608437696047756e-05, |
| "loss": 0.0487, |
| "mean_token_accuracy": 0.9873174405097962, |
| "num_tokens": 7509648.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 3.4345202172601086, |
| "grad_norm": 0.07510969042778015, |
| "learning_rate": 7.104688960120769e-05, |
| "loss": 0.0403, |
| "mean_token_accuracy": 0.989400810599327, |
| "num_tokens": 7650532.0, |
| "step": 1425 |
| }, |
| { |
| "epoch": 3.4948702474351236, |
| "grad_norm": 0.24315868318080902, |
| "learning_rate": 6.612969456322507e-05, |
| "loss": 0.0493, |
| "mean_token_accuracy": 0.987003293633461, |
| "num_tokens": 7779847.0, |
| "step": 1450 |
| }, |
| { |
| "epoch": 3.5552202776101387, |
| "grad_norm": 0.0974864810705185, |
| "learning_rate": 6.134028364517273e-05, |
| "loss": 0.0405, |
| "mean_token_accuracy": 0.9892659622430802, |
| "num_tokens": 7922087.0, |
| "step": 1475 |
| }, |
| { |
| "epoch": 3.6155703077851538, |
| "grad_norm": 0.112852543592453, |
| "learning_rate": 5.6685953954840553e-05, |
| "loss": 0.0476, |
| "mean_token_accuracy": 0.9879545611143112, |
| "num_tokens": 8049661.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 3.675920337960169, |
| "grad_norm": 0.09587077796459198, |
| "learning_rate": 5.2173796791351116e-05, |
| "loss": 0.0399, |
| "mean_token_accuracy": 0.9899050652980804, |
| "num_tokens": 8191357.0, |
| "step": 1525 |
| }, |
| { |
| "epoch": 3.736270368135184, |
| "grad_norm": 0.15348604321479797, |
| "learning_rate": 4.781068684091327e-05, |
| "loss": 0.047, |
| "mean_token_accuracy": 0.9878348118066788, |
| "num_tokens": 8317709.0, |
| "step": 1550 |
| }, |
| { |
| "epoch": 3.796620398310199, |
| "grad_norm": 0.10841736942529678, |
| "learning_rate": 4.360327170260604e-05, |
| "loss": 0.0398, |
| "mean_token_accuracy": 0.9894819515943527, |
| "num_tokens": 8460448.0, |
| "step": 1575 |
| }, |
| { |
| "epoch": 3.856970428485214, |
| "grad_norm": 0.10409346967935562, |
| "learning_rate": 3.955796176015015e-05, |
| "loss": 0.0467, |
| "mean_token_accuracy": 0.9879930222034454, |
| "num_tokens": 8587426.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 3.9173204586602295, |
| "grad_norm": 0.08520140498876572, |
| "learning_rate": 3.5680920415099366e-05, |
| "loss": 0.0406, |
| "mean_token_accuracy": 0.9894054895639419, |
| "num_tokens": 8728471.0, |
| "step": 1625 |
| }, |
| { |
| "epoch": 3.9776704888352445, |
| "grad_norm": 0.10589835047721863, |
| "learning_rate": 3.197805469633152e-05, |
| "loss": 0.0458, |
| "mean_token_accuracy": 0.9883326524496079, |
| "num_tokens": 8850332.0, |
| "step": 1650 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 0.06247411295771599, |
| "eval_mean_token_accuracy": 0.9854503702472996, |
| "eval_num_tokens": 8894052.0, |
| "eval_runtime": 60.5535, |
| "eval_samples_per_second": 6.094, |
| "eval_steps_per_second": 3.055, |
| "step": 1660 |
| }, |
| { |
| "epoch": 4.036210018105009, |
| "grad_norm": 0.1120341494679451, |
| "learning_rate": 2.8455006260147228e-05, |
| "loss": 0.0407, |
| "mean_token_accuracy": 0.990142793385024, |
| "num_tokens": 8992280.0, |
| "step": 1675 |
| }, |
| { |
| "epoch": 4.096560048280024, |
| "grad_norm": 0.09372762590646744, |
| "learning_rate": 2.5117142794687618e-05, |
| "loss": 0.039, |
| "mean_token_accuracy": 0.990278902053833, |
| "num_tokens": 9121058.0, |
| "step": 1700 |
| }, |
| { |
| "epoch": 4.15691007845504, |
| "grad_norm": 0.15355007350444794, |
| "learning_rate": 2.1969549841768168e-05, |
| "loss": 0.0403, |
| "mean_token_accuracy": 0.9900617271661758, |
| "num_tokens": 9260440.0, |
| "step": 1725 |
| }, |
| { |
| "epoch": 4.217260108630055, |
| "grad_norm": 0.07104801386594772, |
| "learning_rate": 1.901702304858842e-05, |
| "loss": 0.0385, |
| "mean_token_accuracy": 0.9903567266464234, |
| "num_tokens": 9389750.0, |
| "step": 1750 |
| }, |
| { |
| "epoch": 4.27761013880507, |
| "grad_norm": 0.09815526008605957, |
| "learning_rate": 1.6264060861122442e-05, |
| "loss": 0.0406, |
| "mean_token_accuracy": 0.9897827422618866, |
| "num_tokens": 9529209.0, |
| "step": 1775 |
| }, |
| { |
| "epoch": 4.337960168980085, |
| "grad_norm": 0.07184392958879471, |
| "learning_rate": 1.3714857670322927e-05, |
| "loss": 0.0387, |
| "mean_token_accuracy": 0.9904780793190002, |
| "num_tokens": 9658250.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 4.3983101991551, |
| "grad_norm": 0.03805818408727646, |
| "learning_rate": 1.1373297421581129e-05, |
| "loss": 0.039, |
| "mean_token_accuracy": 0.9900361305475235, |
| "num_tokens": 9798481.0, |
| "step": 1825 |
| }, |
| { |
| "epoch": 4.458660229330115, |
| "grad_norm": 0.10860127955675125, |
| "learning_rate": 9.242947697178927e-06, |
| "loss": 0.0388, |
| "mean_token_accuracy": 0.9901992106437683, |
| "num_tokens": 9927421.0, |
| "step": 1850 |
| }, |
| { |
| "epoch": 4.51901025950513, |
| "grad_norm": 0.08905451744794846, |
| "learning_rate": 7.3270542807491675e-06, |
| "loss": 0.039, |
| "mean_token_accuracy": 0.9904217219352722, |
| "num_tokens": 10065813.0, |
| "step": 1875 |
| }, |
| { |
| "epoch": 4.579360289680145, |
| "grad_norm": 0.09814044833183289, |
| "learning_rate": 5.628536212026197e-06, |
| "loss": 0.0389, |
| "mean_token_accuracy": 0.9902842086553574, |
| "num_tokens": 10193933.0, |
| "step": 1900 |
| }, |
| { |
| "epoch": 4.63971031985516, |
| "grad_norm": 0.08526572585105896, |
| "learning_rate": 4.149981339420344e-06, |
| "loss": 0.0391, |
| "mean_token_accuracy": 0.9901294547319412, |
| "num_tokens": 10333978.0, |
| "step": 1925 |
| }, |
| { |
| "epoch": 4.700060350030175, |
| "grad_norm": 0.07220665365457535, |
| "learning_rate": 2.8936423771929897e-06, |
| "loss": 0.0388, |
| "mean_token_accuracy": 0.9902608853578567, |
| "num_tokens": 10462278.0, |
| "step": 1950 |
| }, |
| { |
| "epoch": 4.76041038020519, |
| "grad_norm": 0.06212342530488968, |
| "learning_rate": 1.8614334732393544e-06, |
| "loss": 0.0411, |
| "mean_token_accuracy": 0.9896285820007324, |
| "num_tokens": 10600837.0, |
| "step": 1975 |
| }, |
| { |
| "epoch": 4.820760410380205, |
| "grad_norm": 0.06338857114315033, |
| "learning_rate": 1.0549272927081964e-06, |
| "loss": 0.0387, |
| "mean_token_accuracy": 0.9904201912879944, |
| "num_tokens": 10730440.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 4.88111044055522, |
| "grad_norm": 0.07437903434038162, |
| "learning_rate": 4.753526219018755e-07, |
| "loss": 0.0394, |
| "mean_token_accuracy": 0.9902550059556962, |
| "num_tokens": 10868096.0, |
| "step": 2025 |
| }, |
| { |
| "epoch": 4.941460470730235, |
| "grad_norm": 0.04579373076558113, |
| "learning_rate": 1.235924961075496e-07, |
| "loss": 0.0376, |
| "mean_token_accuracy": 0.9905123418569565, |
| "num_tokens": 10997848.0, |
| "step": 2050 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.4054584205150604, |
| "learning_rate": 1.8285421163888313e-10, |
| "loss": 0.0427, |
| "mean_token_accuracy": 0.989623335833402, |
| "num_tokens": 11117565.0, |
| "step": 2075 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 0.06487765908241272, |
| "eval_mean_token_accuracy": 0.9856334386645137, |
| "eval_num_tokens": 11117565.0, |
| "eval_runtime": 60.3982, |
| "eval_samples_per_second": 6.109, |
| "eval_steps_per_second": 3.063, |
| "step": 2075 |
| } |
| ], |
| "logging_steps": 25, |
| "max_steps": 2075, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.8081901975415194e+17, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|