| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 5.0, |
| "eval_steps": 500, |
| "global_step": 2075, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.060350030175015085, |
| "grad_norm": 0.21468117833137512, |
| "learning_rate": 8.18181818181818e-05, |
| "loss": 1.8508, |
| "mean_token_accuracy": 0.6162685614824295, |
| "num_tokens": 157389.0, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.12070006035003017, |
| "grad_norm": 0.28003624081611633, |
| "learning_rate": 0.00016704545454545452, |
| "loss": 1.0701, |
| "mean_token_accuracy": 0.7399352079629898, |
| "num_tokens": 284181.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.18105009052504525, |
| "grad_norm": 0.226594477891922, |
| "learning_rate": 0.0002522727272727273, |
| "loss": 0.6641, |
| "mean_token_accuracy": 0.8210779559612275, |
| "num_tokens": 442731.0, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.24140012070006034, |
| "grad_norm": 0.4477122724056244, |
| "learning_rate": 0.0002999887132933212, |
| "loss": 0.5388, |
| "mean_token_accuracy": 0.8498140323162079, |
| "num_tokens": 570887.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.30175015087507545, |
| "grad_norm": 0.270370751619339, |
| "learning_rate": 0.0002998791256978121, |
| "loss": 0.4147, |
| "mean_token_accuracy": 0.8810201096534729, |
| "num_tokens": 729150.0, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.3621001810500905, |
| "grad_norm": 0.3737528324127197, |
| "learning_rate": 0.0002996530399366737, |
| "loss": 0.3783, |
| "mean_token_accuracy": 0.8932037615776062, |
| "num_tokens": 855423.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.4224502112251056, |
| "grad_norm": 0.32675644755363464, |
| "learning_rate": 0.00029931063174202567, |
| "loss": 0.2747, |
| "mean_token_accuracy": 0.9195706987380982, |
| "num_tokens": 1013920.0, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.4828002414001207, |
| "grad_norm": 0.40882647037506104, |
| "learning_rate": 0.00029885216726118104, |
| "loss": 0.2362, |
| "mean_token_accuracy": 0.9326811420917511, |
| "num_tokens": 1139058.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.5431502715751357, |
| "grad_norm": 0.2929173409938812, |
| "learning_rate": 0.00029827800284977474, |
| "loss": 0.1925, |
| "mean_token_accuracy": 0.943121486902237, |
| "num_tokens": 1294639.0, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.6035003017501509, |
| "grad_norm": 0.49980688095092773, |
| "learning_rate": 0.00029758858479477575, |
| "loss": 0.1879, |
| "mean_token_accuracy": 0.9459474259614944, |
| "num_tokens": 1420667.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.663850331925166, |
| "grad_norm": 0.2513156533241272, |
| "learning_rate": 0.0002967844489675963, |
| "loss": 0.1538, |
| "mean_token_accuracy": 0.9570270645618438, |
| "num_tokens": 1577420.0, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.724200362100181, |
| "grad_norm": 0.48155277967453003, |
| "learning_rate": 0.00029586622040756957, |
| "loss": 0.1349, |
| "mean_token_accuracy": 0.9620789134502411, |
| "num_tokens": 1705571.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.7845503922751962, |
| "grad_norm": 0.16763296723365784, |
| "learning_rate": 0.0002948346128361186, |
| "loss": 0.1231, |
| "mean_token_accuracy": 0.9658524990081787, |
| "num_tokens": 1863982.0, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.8449004224502112, |
| "grad_norm": 0.3732389211654663, |
| "learning_rate": 0.00029369042810199416, |
| "loss": 0.11, |
| "mean_token_accuracy": 0.9696171337366104, |
| "num_tokens": 1989622.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.9052504526252263, |
| "grad_norm": 0.20290179550647736, |
| "learning_rate": 0.0002924345555580135, |
| "loss": 0.0928, |
| "mean_token_accuracy": 0.9740620368719101, |
| "num_tokens": 2146763.0, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.9656004828002414, |
| "grad_norm": 0.3237822651863098, |
| "learning_rate": 0.000291067971369783, |
| "loss": 0.0932, |
| "mean_token_accuracy": 0.9736461997032165, |
| "num_tokens": 2273110.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.08881417661905289, |
| "eval_mean_token_accuracy": 0.9754456977586489, |
| "eval_num_tokens": 2354180.0, |
| "eval_runtime": 62.77, |
| "eval_samples_per_second": 5.879, |
| "eval_steps_per_second": 2.947, |
| "step": 415 |
| }, |
| { |
| "epoch": 1.024140012070006, |
| "grad_norm": 0.13449090719223022, |
| "learning_rate": 0.0002895917377569438, |
| "loss": 0.0954, |
| "mean_token_accuracy": 0.9730558665757326, |
| "num_tokens": 2423130.0, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.0844900422450212, |
| "grad_norm": 0.2197084277868271, |
| "learning_rate": 0.00028800700216752875, |
| "loss": 0.0551, |
| "mean_token_accuracy": 0.9840189307928086, |
| "num_tokens": 2564094.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.1448400724200363, |
| "grad_norm": 0.13322922587394714, |
| "learning_rate": 0.00028631499638607285, |
| "loss": 0.0742, |
| "mean_token_accuracy": 0.979583665728569, |
| "num_tokens": 2704963.0, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.2051901025950513, |
| "grad_norm": 0.18199768662452698, |
| "learning_rate": 0.0002845170355761712, |
| "loss": 0.0578, |
| "mean_token_accuracy": 0.9836762601137161, |
| "num_tokens": 2847569.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.2655401327700664, |
| "grad_norm": 0.14187701046466827, |
| "learning_rate": 0.0002826145172582274, |
| "loss": 0.0688, |
| "mean_token_accuracy": 0.9806136053800583, |
| "num_tokens": 2989847.0, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.3258901629450814, |
| "grad_norm": 0.3071412146091461, |
| "learning_rate": 0.00028060892022318764, |
| "loss": 0.0552, |
| "mean_token_accuracy": 0.9846898633241653, |
| "num_tokens": 3134888.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.3862401931200965, |
| "grad_norm": 0.16651563346385956, |
| "learning_rate": 0.0002785018033831051, |
| "loss": 0.063, |
| "mean_token_accuracy": 0.9824958252906799, |
| "num_tokens": 3277522.0, |
| "step": 575 |
| }, |
| { |
| "epoch": 1.4465902232951118, |
| "grad_norm": 0.08527401834726334, |
| "learning_rate": 0.0002762948045594276, |
| "loss": 0.0445, |
| "mean_token_accuracy": 0.986923239827156, |
| "num_tokens": 3419296.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.5069402534701268, |
| "grad_norm": 0.09827699512243271, |
| "learning_rate": 0.0002739896392099502, |
| "loss": 0.0683, |
| "mean_token_accuracy": 0.9812941312789917, |
| "num_tokens": 3561482.0, |
| "step": 625 |
| }, |
| { |
| "epoch": 1.567290283645142, |
| "grad_norm": 0.1221788227558136, |
| "learning_rate": 0.00027158809909542307, |
| "loss": 0.0535, |
| "mean_token_accuracy": 0.98518315076828, |
| "num_tokens": 3704279.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.627640313820157, |
| "grad_norm": 0.10258518159389496, |
| "learning_rate": 0.00026909205088685, |
| "loss": 0.0616, |
| "mean_token_accuracy": 0.9829519605636596, |
| "num_tokens": 3845569.0, |
| "step": 675 |
| }, |
| { |
| "epoch": 1.687990343995172, |
| "grad_norm": 0.09674005955457687, |
| "learning_rate": 0.0002665034347145612, |
| "loss": 0.0422, |
| "mean_token_accuracy": 0.987600001692772, |
| "num_tokens": 3986667.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.748340374170187, |
| "grad_norm": 0.1226281225681305, |
| "learning_rate": 0.000263824262660187, |
| "loss": 0.0595, |
| "mean_token_accuracy": 0.9835411328077316, |
| "num_tokens": 4126147.0, |
| "step": 725 |
| }, |
| { |
| "epoch": 1.8086904043452021, |
| "grad_norm": 0.16110199689865112, |
| "learning_rate": 0.0002610566171927056, |
| "loss": 0.0399, |
| "mean_token_accuracy": 0.9882411390542984, |
| "num_tokens": 4268948.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.8690404345202172, |
| "grad_norm": 0.14910632371902466, |
| "learning_rate": 0.00025820264954977976, |
| "loss": 0.0627, |
| "mean_token_accuracy": 0.9824194663763046, |
| "num_tokens": 4410572.0, |
| "step": 775 |
| }, |
| { |
| "epoch": 1.9293904646952322, |
| "grad_norm": 0.1125180646777153, |
| "learning_rate": 0.00025526457806564136, |
| "loss": 0.0425, |
| "mean_token_accuracy": 0.987920394539833, |
| "num_tokens": 4553968.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.9897404948702473, |
| "grad_norm": 0.11598383635282516, |
| "learning_rate": 0.00025224468644682245, |
| "loss": 0.046, |
| "mean_token_accuracy": 0.98680981695652, |
| "num_tokens": 4687933.0, |
| "step": 825 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.05183422192931175, |
| "eval_mean_token_accuracy": 0.9854742443239367, |
| "eval_num_tokens": 4708360.0, |
| "eval_runtime": 62.7258, |
| "eval_samples_per_second": 5.883, |
| "eval_steps_per_second": 2.949, |
| "step": 830 |
| }, |
| { |
| "epoch": 2.048280024140012, |
| "grad_norm": 0.12139257043600082, |
| "learning_rate": 0.00024914532199707444, |
| "loss": 0.0467, |
| "mean_token_accuracy": 0.9864320312578654, |
| "num_tokens": 4837944.0, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.1086300543150274, |
| "grad_norm": 0.07626475393772125, |
| "learning_rate": 0.00024596889379285353, |
| "loss": 0.0334, |
| "mean_token_accuracy": 0.9904208928346634, |
| "num_tokens": 4969799.0, |
| "step": 875 |
| }, |
| { |
| "epoch": 2.1689800844900424, |
| "grad_norm": 0.0758512020111084, |
| "learning_rate": 0.00024271787081079228, |
| "loss": 0.044, |
| "mean_token_accuracy": 0.9870604687929153, |
| "num_tokens": 5120530.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.2293301146650575, |
| "grad_norm": 0.11110670864582062, |
| "learning_rate": 0.00023939478000861117, |
| "loss": 0.0318, |
| "mean_token_accuracy": 0.9904728019237519, |
| "num_tokens": 5253594.0, |
| "step": 925 |
| }, |
| { |
| "epoch": 2.2896801448400725, |
| "grad_norm": 0.05050364509224892, |
| "learning_rate": 0.00023600220436096318, |
| "loss": 0.0474, |
| "mean_token_accuracy": 0.9859576737880706, |
| "num_tokens": 5406515.0, |
| "step": 950 |
| }, |
| { |
| "epoch": 2.3500301750150876, |
| "grad_norm": 0.10038283467292786, |
| "learning_rate": 0.00023254278085173684, |
| "loss": 0.0317, |
| "mean_token_accuracy": 0.9904199486970902, |
| "num_tokens": 5540873.0, |
| "step": 975 |
| }, |
| { |
| "epoch": 2.4103802051901027, |
| "grad_norm": 0.06138943135738373, |
| "learning_rate": 0.00022901919842437972, |
| "loss": 0.0416, |
| "mean_token_accuracy": 0.9875155621767044, |
| "num_tokens": 5690736.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.4707302353651177, |
| "grad_norm": 0.053865499794483185, |
| "learning_rate": 0.00022543419589183397, |
| "loss": 0.0279, |
| "mean_token_accuracy": 0.9913687032461166, |
| "num_tokens": 5822410.0, |
| "step": 1025 |
| }, |
| { |
| "epoch": 2.5310802655401328, |
| "grad_norm": 0.06363498419523239, |
| "learning_rate": 0.00022179055980770993, |
| "loss": 0.0438, |
| "mean_token_accuracy": 0.9867295175790787, |
| "num_tokens": 5973587.0, |
| "step": 1050 |
| }, |
| { |
| "epoch": 2.591430295715148, |
| "grad_norm": 0.09079308062791824, |
| "learning_rate": 0.0002180911223003513, |
| "loss": 0.0283, |
| "mean_token_accuracy": 0.9913936233520508, |
| "num_tokens": 6106943.0, |
| "step": 1075 |
| }, |
| { |
| "epoch": 2.651780325890163, |
| "grad_norm": 0.06340842694044113, |
| "learning_rate": 0.00021433875887147627, |
| "loss": 0.0426, |
| "mean_token_accuracy": 0.9875069695711136, |
| "num_tokens": 6259442.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 2.712130356065178, |
| "grad_norm": 0.0556156300008297, |
| "learning_rate": 0.00021053638616110525, |
| "loss": 0.029, |
| "mean_token_accuracy": 0.9911601048707962, |
| "num_tokens": 6392790.0, |
| "step": 1125 |
| }, |
| { |
| "epoch": 2.772480386240193, |
| "grad_norm": 0.07941101491451263, |
| "learning_rate": 0.00020668695968051274, |
| "loss": 0.0435, |
| "mean_token_accuracy": 0.9869343960285186, |
| "num_tokens": 6545535.0, |
| "step": 1150 |
| }, |
| { |
| "epoch": 2.832830416415208, |
| "grad_norm": 0.053617168217897415, |
| "learning_rate": 0.00020279347151496482, |
| "loss": 0.0285, |
| "mean_token_accuracy": 0.9914081859588623, |
| "num_tokens": 6679597.0, |
| "step": 1175 |
| }, |
| { |
| "epoch": 2.8931804465902236, |
| "grad_norm": 0.059137940406799316, |
| "learning_rate": 0.00019885894799802922, |
| "loss": 0.0439, |
| "mean_token_accuracy": 0.98672642827034, |
| "num_tokens": 6830721.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.9535304767652386, |
| "grad_norm": 0.054504215717315674, |
| "learning_rate": 0.00019488644735926396, |
| "loss": 0.0289, |
| "mean_token_accuracy": 0.9915051358938217, |
| "num_tokens": 6962566.0, |
| "step": 1225 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.04319874569773674, |
| "eval_mean_token_accuracy": 0.9879072198996672, |
| "eval_num_tokens": 7062540.0, |
| "eval_runtime": 62.809, |
| "eval_samples_per_second": 5.875, |
| "eval_steps_per_second": 2.945, |
| "step": 1245 |
| }, |
| { |
| "epoch": 3.012070006035003, |
| "grad_norm": 0.07366561144590378, |
| "learning_rate": 0.00019087905734711452, |
| "loss": 0.0369, |
| "mean_token_accuracy": 0.9888100163223817, |
| "num_tokens": 7098485.0, |
| "step": 1250 |
| }, |
| { |
| "epoch": 3.0724200362100182, |
| "grad_norm": 0.04900413379073143, |
| "learning_rate": 0.00018683989282886613, |
| "loss": 0.0243, |
| "mean_token_accuracy": 0.992353920340538, |
| "num_tokens": 7246823.0, |
| "step": 1275 |
| }, |
| { |
| "epoch": 3.1327700663850333, |
| "grad_norm": 0.11869508028030396, |
| "learning_rate": 0.0001827720933695173, |
| "loss": 0.0293, |
| "mean_token_accuracy": 0.9912081670761108, |
| "num_tokens": 7381683.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 3.1931200965600484, |
| "grad_norm": 0.06294097751379013, |
| "learning_rate": 0.00017867882079145627, |
| "loss": 0.0268, |
| "mean_token_accuracy": 0.9917609655857086, |
| "num_tokens": 7528755.0, |
| "step": 1325 |
| }, |
| { |
| "epoch": 3.2534701267350634, |
| "grad_norm": 0.1576467901468277, |
| "learning_rate": 0.00017456325671683724, |
| "loss": 0.0286, |
| "mean_token_accuracy": 0.9911142766475678, |
| "num_tokens": 7663142.0, |
| "step": 1350 |
| }, |
| { |
| "epoch": 3.3138201569100785, |
| "grad_norm": 0.05235208570957184, |
| "learning_rate": 0.00017042860009456638, |
| "loss": 0.0259, |
| "mean_token_accuracy": 0.991670835018158, |
| "num_tokens": 7809523.0, |
| "step": 1375 |
| }, |
| { |
| "epoch": 3.3741701870850935, |
| "grad_norm": 0.0751878172159195, |
| "learning_rate": 0.00016627806471382066, |
| "loss": 0.0294, |
| "mean_token_accuracy": 0.9906228709220887, |
| "num_tokens": 7944654.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 3.4345202172601086, |
| "grad_norm": 0.03894800692796707, |
| "learning_rate": 0.00016211487670603078, |
| "loss": 0.0269, |
| "mean_token_accuracy": 0.9913842624425888, |
| "num_tokens": 8092398.0, |
| "step": 1425 |
| }, |
| { |
| "epoch": 3.4948702474351236, |
| "grad_norm": 0.09787385165691376, |
| "learning_rate": 0.0001579422720372715, |
| "loss": 0.0288, |
| "mean_token_accuracy": 0.9911862814426422, |
| "num_tokens": 8228938.0, |
| "step": 1450 |
| }, |
| { |
| "epoch": 3.5552202776101387, |
| "grad_norm": 0.05320660397410393, |
| "learning_rate": 0.00015376349399300745, |
| "loss": 0.0291, |
| "mean_token_accuracy": 0.9911050695180893, |
| "num_tokens": 8378749.0, |
| "step": 1475 |
| }, |
| { |
| "epoch": 3.6155703077851538, |
| "grad_norm": 0.08408358693122864, |
| "learning_rate": 0.0001495817906571492, |
| "loss": 0.0312, |
| "mean_token_accuracy": 0.9901037472486496, |
| "num_tokens": 8516085.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 3.675920337960169, |
| "grad_norm": 0.050662487745285034, |
| "learning_rate": 0.00014540041238738055, |
| "loss": 0.0243, |
| "mean_token_accuracy": 0.9920293813943863, |
| "num_tokens": 8664227.0, |
| "step": 1525 |
| }, |
| { |
| "epoch": 3.736270368135184, |
| "grad_norm": 0.11257333308458328, |
| "learning_rate": 0.00014122260928871734, |
| "loss": 0.0304, |
| "mean_token_accuracy": 0.990105789899826, |
| "num_tokens": 8802546.0, |
| "step": 1550 |
| }, |
| { |
| "epoch": 3.796620398310199, |
| "grad_norm": 0.05049088969826698, |
| "learning_rate": 0.00013705162868726396, |
| "loss": 0.0236, |
| "mean_token_accuracy": 0.992456527352333, |
| "num_tokens": 8952407.0, |
| "step": 1575 |
| }, |
| { |
| "epoch": 3.856970428485214, |
| "grad_norm": 0.07625644654035568, |
| "learning_rate": 0.00013289071260612855, |
| "loss": 0.027, |
| "mean_token_accuracy": 0.991376177072525, |
| "num_tokens": 9087703.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 3.9173204586602295, |
| "grad_norm": 0.03547623008489609, |
| "learning_rate": 0.00012874309524546083, |
| "loss": 0.0255, |
| "mean_token_accuracy": 0.991724653840065, |
| "num_tokens": 9236298.0, |
| "step": 1625 |
| }, |
| { |
| "epoch": 3.9776704888352445, |
| "grad_norm": 0.056914571672677994, |
| "learning_rate": 0.00012461200046857084, |
| "loss": 0.0258, |
| "mean_token_accuracy": 0.9917834293842316, |
| "num_tokens": 9369153.0, |
| "step": 1650 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 0.04025053605437279, |
| "eval_mean_token_accuracy": 0.9888840298394899, |
| "eval_num_tokens": 9416720.0, |
| "eval_runtime": 62.7742, |
| "eval_samples_per_second": 5.878, |
| "eval_steps_per_second": 2.947, |
| "step": 1660 |
| }, |
| { |
| "epoch": 4.036210018105009, |
| "grad_norm": 0.03895945847034454, |
| "learning_rate": 0.00012050063929608123, |
| "loss": 0.025, |
| "mean_token_accuracy": 0.9920010032113066, |
| "num_tokens": 9517464.0, |
| "step": 1675 |
| }, |
| { |
| "epoch": 4.096560048280024, |
| "grad_norm": 0.038321759551763535, |
| "learning_rate": 0.0001164122074100633, |
| "loss": 0.0196, |
| "mean_token_accuracy": 0.9934940075874329, |
| "num_tokens": 9654511.0, |
| "step": 1700 |
| }, |
| { |
| "epoch": 4.15691007845504, |
| "grad_norm": 0.048036105930805206, |
| "learning_rate": 0.00011234988267009415, |
| "loss": 0.0231, |
| "mean_token_accuracy": 0.9922078281641007, |
| "num_tokens": 9800672.0, |
| "step": 1725 |
| }, |
| { |
| "epoch": 4.217260108630055, |
| "grad_norm": 0.09493701159954071, |
| "learning_rate": 0.00010831682264316787, |
| "loss": 0.0204, |
| "mean_token_accuracy": 0.993373184800148, |
| "num_tokens": 9938087.0, |
| "step": 1750 |
| }, |
| { |
| "epoch": 4.27761013880507, |
| "grad_norm": 0.056498508900403976, |
| "learning_rate": 0.00010431616214937911, |
| "loss": 0.0227, |
| "mean_token_accuracy": 0.9926544779539108, |
| "num_tokens": 10084429.0, |
| "step": 1775 |
| }, |
| { |
| "epoch": 4.337960168980085, |
| "grad_norm": 0.04016057401895523, |
| "learning_rate": 0.00010035101082528777, |
| "loss": 0.0197, |
| "mean_token_accuracy": 0.9933375012874603, |
| "num_tokens": 10221190.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 4.3983101991551, |
| "grad_norm": 0.06675443053245544, |
| "learning_rate": 9.642445070685809e-05, |
| "loss": 0.0253, |
| "mean_token_accuracy": 0.992013863325119, |
| "num_tokens": 10367374.0, |
| "step": 1825 |
| }, |
| { |
| "epoch": 4.458660229330115, |
| "grad_norm": 0.06365606188774109, |
| "learning_rate": 9.253953383385157e-05, |
| "loss": 0.0198, |
| "mean_token_accuracy": 0.9934730947017669, |
| "num_tokens": 10505133.0, |
| "step": 1850 |
| }, |
| { |
| "epoch": 4.51901025950513, |
| "grad_norm": 0.04390507563948631, |
| "learning_rate": 8.869927987753459e-05, |
| "loss": 0.025, |
| "mean_token_accuracy": 0.9921487855911255, |
| "num_tokens": 10654059.0, |
| "step": 1875 |
| }, |
| { |
| "epoch": 4.579360289680145, |
| "grad_norm": 0.04688199982047081, |
| "learning_rate": 8.490667379354661e-05, |
| "loss": 0.0193, |
| "mean_token_accuracy": 0.9935662603378296, |
| "num_tokens": 10791488.0, |
| "step": 1900 |
| }, |
| { |
| "epoch": 4.63971031985516, |
| "grad_norm": 0.048728689551353455, |
| "learning_rate": 8.116466350175079e-05, |
| "loss": 0.0219, |
| "mean_token_accuracy": 0.9928493529558182, |
| "num_tokens": 10936918.0, |
| "step": 1925 |
| }, |
| { |
| "epoch": 4.700060350030175, |
| "grad_norm": 0.03529852256178856, |
| "learning_rate": 7.747615759487304e-05, |
| "loss": 0.019, |
| "mean_token_accuracy": 0.9935548371076584, |
| "num_tokens": 11074592.0, |
| "step": 1950 |
| }, |
| { |
| "epoch": 4.76041038020519, |
| "grad_norm": 0.04538232088088989, |
| "learning_rate": 7.38440230777085e-05, |
| "loss": 0.0231, |
| "mean_token_accuracy": 0.9924061894416809, |
| "num_tokens": 11221831.0, |
| "step": 1975 |
| }, |
| { |
| "epoch": 4.820760410380205, |
| "grad_norm": 0.05990992859005928, |
| "learning_rate": 7.027108313865378e-05, |
| "loss": 0.018, |
| "mean_token_accuracy": 0.9938581752777099, |
| "num_tokens": 11360739.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 4.88111044055522, |
| "grad_norm": 0.05956440791487694, |
| "learning_rate": 6.676011495529687e-05, |
| "loss": 0.0242, |
| "mean_token_accuracy": 0.9921341562271118, |
| "num_tokens": 11507676.0, |
| "step": 2025 |
| }, |
| { |
| "epoch": 4.941460470730235, |
| "grad_norm": 0.03628522902727127, |
| "learning_rate": 6.331384753577056e-05, |
| "loss": 0.0186, |
| "mean_token_accuracy": 0.9937015652656556, |
| "num_tokens": 11646256.0, |
| "step": 2050 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.12821151316165924, |
| "learning_rate": 5.993495959754631e-05, |
| "loss": 0.0215, |
| "mean_token_accuracy": 0.9930061482891595, |
| "num_tokens": 11770900.0, |
| "step": 2075 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 0.0397900752723217, |
| "eval_mean_token_accuracy": 0.9896509357400842, |
| "eval_num_tokens": 11770900.0, |
| "eval_runtime": 62.8881, |
| "eval_samples_per_second": 5.868, |
| "eval_steps_per_second": 2.942, |
| "step": 2075 |
| } |
| ], |
| "logging_steps": 25, |
| "max_steps": 2905, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 7, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.056626317959158e+17, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|