| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.9822904368358913, |
| "eval_steps": 500, |
| "global_step": 844, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.047225501770956316, |
| "grad_norm": 2.2224767208099365, |
| "learning_rate": 0.00019786729857819907, |
| "loss": 1.4046, |
| "mean_token_accuracy": 0.6860443703830242, |
| "num_tokens": 11258.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.09445100354191263, |
| "grad_norm": 2.9072351455688477, |
| "learning_rate": 0.00019549763033175358, |
| "loss": 0.9967, |
| "mean_token_accuracy": 0.796217393875122, |
| "num_tokens": 22389.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.14167650531286896, |
| "grad_norm": 1.7167646884918213, |
| "learning_rate": 0.00019312796208530806, |
| "loss": 0.4636, |
| "mean_token_accuracy": 0.9008516699075699, |
| "num_tokens": 33839.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.18890200708382526, |
| "grad_norm": 1.540899395942688, |
| "learning_rate": 0.00019075829383886258, |
| "loss": 0.4207, |
| "mean_token_accuracy": 0.9101088687777519, |
| "num_tokens": 45227.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.2361275088547816, |
| "grad_norm": 1.7775837182998657, |
| "learning_rate": 0.0001883886255924171, |
| "loss": 0.3663, |
| "mean_token_accuracy": 0.9167132675647736, |
| "num_tokens": 56969.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.2833530106257379, |
| "grad_norm": 1.2004725933074951, |
| "learning_rate": 0.00018601895734597157, |
| "loss": 0.3187, |
| "mean_token_accuracy": 0.9333658754825592, |
| "num_tokens": 68254.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.3305785123966942, |
| "grad_norm": 1.32843816280365, |
| "learning_rate": 0.0001836492890995261, |
| "loss": 0.3664, |
| "mean_token_accuracy": 0.9262594923377037, |
| "num_tokens": 79400.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.3778040141676505, |
| "grad_norm": 1.8211090564727783, |
| "learning_rate": 0.00018127962085308057, |
| "loss": 0.3767, |
| "mean_token_accuracy": 0.9262291938066483, |
| "num_tokens": 90951.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.42502951593860683, |
| "grad_norm": 1.4291584491729736, |
| "learning_rate": 0.00017890995260663508, |
| "loss": 0.2829, |
| "mean_token_accuracy": 0.936517083644867, |
| "num_tokens": 102424.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.4722550177095632, |
| "grad_norm": 1.4524176120758057, |
| "learning_rate": 0.0001765402843601896, |
| "loss": 0.3404, |
| "mean_token_accuracy": 0.9263656228780747, |
| "num_tokens": 114139.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.5194805194805194, |
| "grad_norm": 1.2539514303207397, |
| "learning_rate": 0.00017417061611374408, |
| "loss": 0.367, |
| "mean_token_accuracy": 0.9276282742619515, |
| "num_tokens": 125361.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.5667060212514758, |
| "grad_norm": 0.8698514103889465, |
| "learning_rate": 0.0001718009478672986, |
| "loss": 0.2972, |
| "mean_token_accuracy": 0.9385714828968048, |
| "num_tokens": 136673.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.6139315230224321, |
| "grad_norm": 1.112260341644287, |
| "learning_rate": 0.00016943127962085308, |
| "loss": 0.2655, |
| "mean_token_accuracy": 0.9430030316114426, |
| "num_tokens": 147942.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.6611570247933884, |
| "grad_norm": 1.448964238166809, |
| "learning_rate": 0.0001670616113744076, |
| "loss": 0.2198, |
| "mean_token_accuracy": 0.9480510011315346, |
| "num_tokens": 159547.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.7083825265643447, |
| "grad_norm": 1.364487886428833, |
| "learning_rate": 0.0001646919431279621, |
| "loss": 0.2652, |
| "mean_token_accuracy": 0.9435501232743263, |
| "num_tokens": 170741.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.755608028335301, |
| "grad_norm": 0.9347731471061707, |
| "learning_rate": 0.0001623222748815166, |
| "loss": 0.2316, |
| "mean_token_accuracy": 0.950811243057251, |
| "num_tokens": 181969.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.8028335301062574, |
| "grad_norm": 0.9598692655563354, |
| "learning_rate": 0.0001599526066350711, |
| "loss": 0.1798, |
| "mean_token_accuracy": 0.956840255856514, |
| "num_tokens": 193046.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.8500590318772137, |
| "grad_norm": 0.7902966737747192, |
| "learning_rate": 0.0001575829383886256, |
| "loss": 0.2031, |
| "mean_token_accuracy": 0.9510332688689231, |
| "num_tokens": 204429.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.89728453364817, |
| "grad_norm": 0.6174350380897522, |
| "learning_rate": 0.0001552132701421801, |
| "loss": 0.2149, |
| "mean_token_accuracy": 0.9503797248005867, |
| "num_tokens": 215842.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.9445100354191264, |
| "grad_norm": 1.0609785318374634, |
| "learning_rate": 0.00015284360189573462, |
| "loss": 0.1987, |
| "mean_token_accuracy": 0.9518642231822014, |
| "num_tokens": 227422.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.9917355371900827, |
| "grad_norm": 0.8327723741531372, |
| "learning_rate": 0.0001504739336492891, |
| "loss": 0.1987, |
| "mean_token_accuracy": 0.9499309301376343, |
| "num_tokens": 238784.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.19764918088912964, |
| "eval_mean_token_accuracy": 0.9525814542063961, |
| "eval_num_tokens": 240751.0, |
| "eval_runtime": 8.6341, |
| "eval_samples_per_second": 24.554, |
| "eval_steps_per_second": 3.127, |
| "step": 212 |
| }, |
| { |
| "epoch": 1.037780401416765, |
| "grad_norm": 0.6004464626312256, |
| "learning_rate": 0.0001481042654028436, |
| "loss": 0.1459, |
| "mean_token_accuracy": 0.9623932639757792, |
| "num_tokens": 249811.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.0850059031877213, |
| "grad_norm": 0.5023283362388611, |
| "learning_rate": 0.0001457345971563981, |
| "loss": 0.1085, |
| "mean_token_accuracy": 0.9685453534126282, |
| "num_tokens": 261566.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.1322314049586777, |
| "grad_norm": 0.9379103183746338, |
| "learning_rate": 0.0001433649289099526, |
| "loss": 0.0836, |
| "mean_token_accuracy": 0.9732989251613617, |
| "num_tokens": 273201.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.179456906729634, |
| "grad_norm": 1.0074262619018555, |
| "learning_rate": 0.00014099526066350712, |
| "loss": 0.1364, |
| "mean_token_accuracy": 0.9613454505801201, |
| "num_tokens": 284542.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.2266824085005903, |
| "grad_norm": 1.0366216897964478, |
| "learning_rate": 0.0001386255924170616, |
| "loss": 0.1628, |
| "mean_token_accuracy": 0.9535841554403305, |
| "num_tokens": 295925.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.2739079102715467, |
| "grad_norm": 0.7236562371253967, |
| "learning_rate": 0.00013625592417061612, |
| "loss": 0.124, |
| "mean_token_accuracy": 0.9644495561718941, |
| "num_tokens": 307173.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.321133412042503, |
| "grad_norm": 1.424338459968567, |
| "learning_rate": 0.0001338862559241706, |
| "loss": 0.1094, |
| "mean_token_accuracy": 0.9684907376766205, |
| "num_tokens": 318836.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.3683589138134593, |
| "grad_norm": 0.9334771037101746, |
| "learning_rate": 0.00013151658767772512, |
| "loss": 0.1226, |
| "mean_token_accuracy": 0.9643323451280594, |
| "num_tokens": 330284.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.4155844155844157, |
| "grad_norm": 1.1074577569961548, |
| "learning_rate": 0.00012914691943127963, |
| "loss": 0.1415, |
| "mean_token_accuracy": 0.9657251536846161, |
| "num_tokens": 341636.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.462809917355372, |
| "grad_norm": 0.4959653317928314, |
| "learning_rate": 0.00012677725118483412, |
| "loss": 0.0863, |
| "mean_token_accuracy": 0.9732676669955254, |
| "num_tokens": 353014.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.510035419126328, |
| "grad_norm": 1.0306661128997803, |
| "learning_rate": 0.00012440758293838863, |
| "loss": 0.0992, |
| "mean_token_accuracy": 0.9696009412407876, |
| "num_tokens": 364630.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.5572609208972845, |
| "grad_norm": 0.907910943031311, |
| "learning_rate": 0.00012203791469194314, |
| "loss": 0.0965, |
| "mean_token_accuracy": 0.9723138064146042, |
| "num_tokens": 376367.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.604486422668241, |
| "grad_norm": 0.7785394787788391, |
| "learning_rate": 0.00011966824644549763, |
| "loss": 0.1407, |
| "mean_token_accuracy": 0.9637758329510688, |
| "num_tokens": 387837.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.6517119244391971, |
| "grad_norm": 1.0611106157302856, |
| "learning_rate": 0.00011729857819905214, |
| "loss": 0.1064, |
| "mean_token_accuracy": 0.9677139699459076, |
| "num_tokens": 398928.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.6989374262101535, |
| "grad_norm": 0.7576094269752502, |
| "learning_rate": 0.00011492890995260664, |
| "loss": 0.1181, |
| "mean_token_accuracy": 0.9652754247188569, |
| "num_tokens": 410374.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.7461629279811097, |
| "grad_norm": 0.7488318085670471, |
| "learning_rate": 0.00011255924170616114, |
| "loss": 0.1012, |
| "mean_token_accuracy": 0.9696858197450637, |
| "num_tokens": 421264.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.7933884297520661, |
| "grad_norm": 0.6903710961341858, |
| "learning_rate": 0.00011018957345971565, |
| "loss": 0.1249, |
| "mean_token_accuracy": 0.964360211789608, |
| "num_tokens": 432294.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.8406139315230226, |
| "grad_norm": 1.0426132678985596, |
| "learning_rate": 0.00010781990521327015, |
| "loss": 0.1058, |
| "mean_token_accuracy": 0.9673917979001999, |
| "num_tokens": 443662.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.8878394332939787, |
| "grad_norm": 0.6279439330101013, |
| "learning_rate": 0.00010545023696682465, |
| "loss": 0.1265, |
| "mean_token_accuracy": 0.9684036031365395, |
| "num_tokens": 454594.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.935064935064935, |
| "grad_norm": 1.036986231803894, |
| "learning_rate": 0.00010308056872037915, |
| "loss": 0.078, |
| "mean_token_accuracy": 0.9755039691925049, |
| "num_tokens": 465724.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.9822904368358913, |
| "grad_norm": 0.7609688639640808, |
| "learning_rate": 0.00010071090047393366, |
| "loss": 0.0922, |
| "mean_token_accuracy": 0.9699518546462059, |
| "num_tokens": 477077.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.15497758984565735, |
| "eval_mean_token_accuracy": 0.9626010656356812, |
| "eval_num_tokens": 481502.0, |
| "eval_runtime": 8.619, |
| "eval_samples_per_second": 24.597, |
| "eval_steps_per_second": 3.133, |
| "step": 424 |
| }, |
| { |
| "epoch": 2.0283353010625738, |
| "grad_norm": 0.832304835319519, |
| "learning_rate": 9.834123222748816e-05, |
| "loss": 0.1073, |
| "mean_token_accuracy": 0.9689378264622811, |
| "num_tokens": 488024.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 2.07556080283353, |
| "grad_norm": 0.5486078858375549, |
| "learning_rate": 9.597156398104266e-05, |
| "loss": 0.0484, |
| "mean_token_accuracy": 0.9826492935419082, |
| "num_tokens": 499131.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 2.1227863046044866, |
| "grad_norm": 0.40537288784980774, |
| "learning_rate": 9.360189573459716e-05, |
| "loss": 0.0566, |
| "mean_token_accuracy": 0.9807873949408531, |
| "num_tokens": 510734.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 2.1700118063754426, |
| "grad_norm": 0.5876602530479431, |
| "learning_rate": 9.123222748815167e-05, |
| "loss": 0.0578, |
| "mean_token_accuracy": 0.9810668498277664, |
| "num_tokens": 521828.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 2.217237308146399, |
| "grad_norm": 0.5273938775062561, |
| "learning_rate": 8.886255924170617e-05, |
| "loss": 0.055, |
| "mean_token_accuracy": 0.9809592545032502, |
| "num_tokens": 533020.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 2.2644628099173554, |
| "grad_norm": 0.5727369785308838, |
| "learning_rate": 8.649289099526067e-05, |
| "loss": 0.0495, |
| "mean_token_accuracy": 0.9824301272630691, |
| "num_tokens": 544287.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 2.311688311688312, |
| "grad_norm": 0.609664261341095, |
| "learning_rate": 8.412322274881517e-05, |
| "loss": 0.0553, |
| "mean_token_accuracy": 0.9817688629031182, |
| "num_tokens": 555880.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 2.358913813459268, |
| "grad_norm": 0.48904091119766235, |
| "learning_rate": 8.175355450236967e-05, |
| "loss": 0.0561, |
| "mean_token_accuracy": 0.9802302822470665, |
| "num_tokens": 567454.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 2.406139315230224, |
| "grad_norm": 0.48052382469177246, |
| "learning_rate": 7.938388625592418e-05, |
| "loss": 0.049, |
| "mean_token_accuracy": 0.983304688334465, |
| "num_tokens": 578751.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 2.4533648170011806, |
| "grad_norm": 0.6199146509170532, |
| "learning_rate": 7.701421800947868e-05, |
| "loss": 0.0602, |
| "mean_token_accuracy": 0.9780631363391876, |
| "num_tokens": 590469.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 2.500590318772137, |
| "grad_norm": 0.753097414970398, |
| "learning_rate": 7.464454976303318e-05, |
| "loss": 0.0509, |
| "mean_token_accuracy": 0.9815936490893364, |
| "num_tokens": 602058.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 2.5478158205430934, |
| "grad_norm": 0.7676092386245728, |
| "learning_rate": 7.227488151658768e-05, |
| "loss": 0.052, |
| "mean_token_accuracy": 0.981397558748722, |
| "num_tokens": 613415.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 2.5950413223140494, |
| "grad_norm": 0.49483609199523926, |
| "learning_rate": 6.990521327014218e-05, |
| "loss": 0.051, |
| "mean_token_accuracy": 0.9827686205506325, |
| "num_tokens": 625010.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 2.642266824085006, |
| "grad_norm": 0.6355498433113098, |
| "learning_rate": 6.753554502369669e-05, |
| "loss": 0.0563, |
| "mean_token_accuracy": 0.980421070754528, |
| "num_tokens": 636527.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 2.689492325855962, |
| "grad_norm": 0.6222267150878906, |
| "learning_rate": 6.516587677725119e-05, |
| "loss": 0.0566, |
| "mean_token_accuracy": 0.9805225148797035, |
| "num_tokens": 647992.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 2.7367178276269186, |
| "grad_norm": 0.5963544845581055, |
| "learning_rate": 6.279620853080569e-05, |
| "loss": 0.0484, |
| "mean_token_accuracy": 0.9810224324464798, |
| "num_tokens": 659364.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 2.783943329397875, |
| "grad_norm": 0.48161807656288147, |
| "learning_rate": 6.0426540284360186e-05, |
| "loss": 0.0495, |
| "mean_token_accuracy": 0.9829053461551667, |
| "num_tokens": 670493.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 2.8311688311688314, |
| "grad_norm": 0.6928554773330688, |
| "learning_rate": 5.80568720379147e-05, |
| "loss": 0.0474, |
| "mean_token_accuracy": 0.9838150143623352, |
| "num_tokens": 681980.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.8783943329397874, |
| "grad_norm": 0.9183737635612488, |
| "learning_rate": 5.56872037914692e-05, |
| "loss": 0.0513, |
| "mean_token_accuracy": 0.9831859543919563, |
| "num_tokens": 692889.0, |
| "step": 610 |
| }, |
| { |
| "epoch": 2.925619834710744, |
| "grad_norm": 0.6690914034843445, |
| "learning_rate": 5.33175355450237e-05, |
| "loss": 0.0549, |
| "mean_token_accuracy": 0.9816744804382325, |
| "num_tokens": 704348.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 2.9728453364817002, |
| "grad_norm": 0.3908725678920746, |
| "learning_rate": 5.09478672985782e-05, |
| "loss": 0.0481, |
| "mean_token_accuracy": 0.9822039097547531, |
| "num_tokens": 715695.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.14640700817108154, |
| "eval_mean_token_accuracy": 0.967608372370402, |
| "eval_num_tokens": 722253.0, |
| "eval_runtime": 8.5998, |
| "eval_samples_per_second": 24.652, |
| "eval_steps_per_second": 3.14, |
| "step": 636 |
| }, |
| { |
| "epoch": 3.0188902007083827, |
| "grad_norm": 0.39226505160331726, |
| "learning_rate": 4.857819905213271e-05, |
| "loss": 0.0402, |
| "mean_token_accuracy": 0.9841358707501338, |
| "num_tokens": 726613.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 3.0661157024793386, |
| "grad_norm": 0.34782007336616516, |
| "learning_rate": 4.620853080568721e-05, |
| "loss": 0.0322, |
| "mean_token_accuracy": 0.9865604758262634, |
| "num_tokens": 737963.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 3.113341204250295, |
| "grad_norm": 0.7617977261543274, |
| "learning_rate": 4.383886255924171e-05, |
| "loss": 0.0352, |
| "mean_token_accuracy": 0.9863895252346992, |
| "num_tokens": 748987.0, |
| "step": 660 |
| }, |
| { |
| "epoch": 3.1605667060212514, |
| "grad_norm": 0.5488002896308899, |
| "learning_rate": 4.146919431279621e-05, |
| "loss": 0.0323, |
| "mean_token_accuracy": 0.9870850175619126, |
| "num_tokens": 760488.0, |
| "step": 670 |
| }, |
| { |
| "epoch": 3.207792207792208, |
| "grad_norm": 0.44978898763656616, |
| "learning_rate": 3.909952606635071e-05, |
| "loss": 0.0357, |
| "mean_token_accuracy": 0.9864787235856056, |
| "num_tokens": 771677.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 3.2550177095631643, |
| "grad_norm": 0.44440773129463196, |
| "learning_rate": 3.672985781990522e-05, |
| "loss": 0.0363, |
| "mean_token_accuracy": 0.9861913770437241, |
| "num_tokens": 783243.0, |
| "step": 690 |
| }, |
| { |
| "epoch": 3.3022432113341202, |
| "grad_norm": 0.41815730929374695, |
| "learning_rate": 3.4360189573459716e-05, |
| "loss": 0.0374, |
| "mean_token_accuracy": 0.9860232338309288, |
| "num_tokens": 794711.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 3.3494687131050767, |
| "grad_norm": 0.2978448271751404, |
| "learning_rate": 3.1990521327014215e-05, |
| "loss": 0.0308, |
| "mean_token_accuracy": 0.9869714677333832, |
| "num_tokens": 806385.0, |
| "step": 710 |
| }, |
| { |
| "epoch": 3.396694214876033, |
| "grad_norm": 0.46016019582748413, |
| "learning_rate": 2.962085308056872e-05, |
| "loss": 0.0343, |
| "mean_token_accuracy": 0.9866258546710014, |
| "num_tokens": 817664.0, |
| "step": 720 |
| }, |
| { |
| "epoch": 3.4439197166469895, |
| "grad_norm": 0.4907480478286743, |
| "learning_rate": 2.7251184834123224e-05, |
| "loss": 0.0356, |
| "mean_token_accuracy": 0.9860621899366379, |
| "num_tokens": 829118.0, |
| "step": 730 |
| }, |
| { |
| "epoch": 3.4911452184179455, |
| "grad_norm": 0.5607575178146362, |
| "learning_rate": 2.4881516587677726e-05, |
| "loss": 0.0375, |
| "mean_token_accuracy": 0.9857015043497086, |
| "num_tokens": 840540.0, |
| "step": 740 |
| }, |
| { |
| "epoch": 3.538370720188902, |
| "grad_norm": 0.5227943062782288, |
| "learning_rate": 2.251184834123223e-05, |
| "loss": 0.0371, |
| "mean_token_accuracy": 0.9858236253261566, |
| "num_tokens": 851987.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 3.5855962219598583, |
| "grad_norm": 0.28605079650878906, |
| "learning_rate": 2.014218009478673e-05, |
| "loss": 0.0306, |
| "mean_token_accuracy": 0.9882108762860298, |
| "num_tokens": 863423.0, |
| "step": 760 |
| }, |
| { |
| "epoch": 3.6328217237308147, |
| "grad_norm": 0.33974796533584595, |
| "learning_rate": 1.7772511848341233e-05, |
| "loss": 0.0381, |
| "mean_token_accuracy": 0.9858895480632782, |
| "num_tokens": 874997.0, |
| "step": 770 |
| }, |
| { |
| "epoch": 3.680047225501771, |
| "grad_norm": 0.5480939149856567, |
| "learning_rate": 1.5402843601895736e-05, |
| "loss": 0.0344, |
| "mean_token_accuracy": 0.9871362060308456, |
| "num_tokens": 886148.0, |
| "step": 780 |
| }, |
| { |
| "epoch": 3.7272727272727275, |
| "grad_norm": 0.4544774889945984, |
| "learning_rate": 1.3033175355450238e-05, |
| "loss": 0.0364, |
| "mean_token_accuracy": 0.9863489225506783, |
| "num_tokens": 897137.0, |
| "step": 790 |
| }, |
| { |
| "epoch": 3.7744982290436835, |
| "grad_norm": 0.6491249799728394, |
| "learning_rate": 1.066350710900474e-05, |
| "loss": 0.0336, |
| "mean_token_accuracy": 0.9858677625656128, |
| "num_tokens": 908688.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 3.82172373081464, |
| "grad_norm": 0.4520932137966156, |
| "learning_rate": 8.293838862559241e-06, |
| "loss": 0.0337, |
| "mean_token_accuracy": 0.9875034481287003, |
| "num_tokens": 920183.0, |
| "step": 810 |
| }, |
| { |
| "epoch": 3.8689492325855963, |
| "grad_norm": 0.45541301369667053, |
| "learning_rate": 5.924170616113745e-06, |
| "loss": 0.0337, |
| "mean_token_accuracy": 0.986977969110012, |
| "num_tokens": 931127.0, |
| "step": 820 |
| }, |
| { |
| "epoch": 3.9161747343565523, |
| "grad_norm": 0.4386422634124756, |
| "learning_rate": 3.5545023696682464e-06, |
| "loss": 0.0354, |
| "mean_token_accuracy": 0.9872412413358689, |
| "num_tokens": 942690.0, |
| "step": 830 |
| }, |
| { |
| "epoch": 3.9634002361275087, |
| "grad_norm": 0.5566153526306152, |
| "learning_rate": 1.1848341232227488e-06, |
| "loss": 0.0351, |
| "mean_token_accuracy": 0.9845622256398201, |
| "num_tokens": 954344.0, |
| "step": 840 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 844, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.631693863816397e+16, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|