{ "best_metric": 0.41329578, "best_model_checkpoint": "/group/40174/Zywoou/mm_math_reasoning/oly_output/SFT_text_40k_3B/v2-20250623-201026/checkpoint-900", "epoch": 3.426625145971195, "eval_steps": 100, "global_step": 1100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003114052160373686, "grad_norm": 9.090961456298828, "learning_rate": 2.469135802469136e-07, "loss": 0.8441067934036255, "memory(GiB)": 61.48, "step": 1, "token_acc": 0.7542614698998621, "train_speed(iter/s)": 0.013065 }, { "epoch": 0.015570260801868432, "grad_norm": 8.591418266296387, "learning_rate": 1.234567901234568e-06, "loss": 0.8394168615341187, "memory(GiB)": 61.48, "step": 5, "token_acc": 0.772884347485021, "train_speed(iter/s)": 0.016776 }, { "epoch": 0.031140521603736863, "grad_norm": 1.963486909866333, "learning_rate": 2.469135802469136e-06, "loss": 0.8379721641540527, "memory(GiB)": 61.48, "step": 10, "token_acc": 0.7595817060496037, "train_speed(iter/s)": 0.016413 }, { "epoch": 0.04671078240560529, "grad_norm": 1.29275381565094, "learning_rate": 3.7037037037037037e-06, "loss": 0.7905796527862549, "memory(GiB)": 61.48, "step": 15, "token_acc": 0.7768803131791101, "train_speed(iter/s)": 0.016697 }, { "epoch": 0.06228104320747373, "grad_norm": 1.3676708936691284, "learning_rate": 4.938271604938272e-06, "loss": 0.7433982372283936, "memory(GiB)": 61.48, "step": 20, "token_acc": 0.7680492430075545, "train_speed(iter/s)": 0.016714 }, { "epoch": 0.07785130400934216, "grad_norm": 0.9433161020278931, "learning_rate": 6.17283950617284e-06, "loss": 0.7095602989196778, "memory(GiB)": 61.48, "step": 25, "token_acc": 0.7874565609207318, "train_speed(iter/s)": 0.016902 }, { "epoch": 0.09342156481121058, "grad_norm": 0.6722971200942993, "learning_rate": 7.4074074074074075e-06, "loss": 0.672957468032837, "memory(GiB)": 61.48, "step": 30, "token_acc": 0.7957260974215338, "train_speed(iter/s)": 0.017038 }, { "epoch": 0.10899182561307902, "grad_norm": 0.4969067871570587, "learning_rate": 8.641975308641975e-06, "loss": 0.6527645587921143, "memory(GiB)": 61.48, "step": 35, "token_acc": 0.8015626837586735, "train_speed(iter/s)": 0.017137 }, { "epoch": 0.12456208641494745, "grad_norm": 0.4756340980529785, "learning_rate": 9.876543209876543e-06, "loss": 0.6334109306335449, "memory(GiB)": 61.48, "step": 40, "token_acc": 0.8020030272154918, "train_speed(iter/s)": 0.017137 }, { "epoch": 0.1401323472168159, "grad_norm": 0.37226057052612305, "learning_rate": 1.1111111111111113e-05, "loss": 0.6206116676330566, "memory(GiB)": 81.99, "step": 45, "token_acc": 0.7978761643835617, "train_speed(iter/s)": 0.017002 }, { "epoch": 0.15570260801868432, "grad_norm": 0.33603885769844055, "learning_rate": 1.234567901234568e-05, "loss": 0.6084653854370117, "memory(GiB)": 81.99, "step": 50, "token_acc": 0.8057346158430913, "train_speed(iter/s)": 0.01702 }, { "epoch": 0.17127286882055273, "grad_norm": 0.3247829079627991, "learning_rate": 1.3580246913580248e-05, "loss": 0.5855489730834961, "memory(GiB)": 81.99, "step": 55, "token_acc": 0.8133501940355266, "train_speed(iter/s)": 0.017086 }, { "epoch": 0.18684312962242117, "grad_norm": 0.3287549316883087, "learning_rate": 1.4814814814814815e-05, "loss": 0.5795706748962403, "memory(GiB)": 81.99, "step": 60, "token_acc": 0.8243709005928014, "train_speed(iter/s)": 0.017103 }, { "epoch": 0.2024133904242896, "grad_norm": 0.3348773121833801, "learning_rate": 1.6049382716049385e-05, "loss": 0.5844010353088379, "memory(GiB)": 81.99, "step": 65, "token_acc": 0.826077338385553, "train_speed(iter/s)": 0.017084 }, { "epoch": 0.21798365122615804, "grad_norm": 0.3951764702796936, "learning_rate": 1.728395061728395e-05, "loss": 0.5668695449829102, "memory(GiB)": 81.99, "step": 70, "token_acc": 0.8184763611920233, "train_speed(iter/s)": 0.017115 }, { "epoch": 0.23355391202802647, "grad_norm": 0.3419385850429535, "learning_rate": 1.851851851851852e-05, "loss": 0.5640019416809082, "memory(GiB)": 81.99, "step": 75, "token_acc": 0.8262526646713034, "train_speed(iter/s)": 0.017089 }, { "epoch": 0.2491241728298949, "grad_norm": 0.4079224467277527, "learning_rate": 1.9753086419753087e-05, "loss": 0.5657567501068115, "memory(GiB)": 81.99, "step": 80, "token_acc": 0.8255806686338495, "train_speed(iter/s)": 0.017058 }, { "epoch": 0.2646944336317633, "grad_norm": 0.4042387902736664, "learning_rate": 1.9999660048205748e-05, "loss": 0.5499643325805664, "memory(GiB)": 81.99, "step": 85, "token_acc": 0.8258961193684297, "train_speed(iter/s)": 0.01709 }, { "epoch": 0.2802646944336318, "grad_norm": 0.3471659719944, "learning_rate": 1.9998279033654883e-05, "loss": 0.5546986579895019, "memory(GiB)": 81.99, "step": 90, "token_acc": 0.8232631885048027, "train_speed(iter/s)": 0.01708 }, { "epoch": 0.2958349552355002, "grad_norm": 0.37730872631073, "learning_rate": 1.999583585595892e-05, "loss": 0.5496613502502441, "memory(GiB)": 81.99, "step": 95, "token_acc": 0.8184557633810109, "train_speed(iter/s)": 0.017135 }, { "epoch": 0.31140521603736865, "grad_norm": 0.35348325967788696, "learning_rate": 1.9992330774667867e-05, "loss": 0.5377495765686036, "memory(GiB)": 81.99, "step": 100, "token_acc": 0.8358910692831397, "train_speed(iter/s)": 0.017151 }, { "epoch": 0.31140521603736865, "eval_loss": 0.5029594302177429, "eval_runtime": 48.123, "eval_samples_per_second": 8.624, "eval_steps_per_second": 1.081, "eval_token_acc": 0.8289853121092036, "step": 100 }, { "epoch": 0.32697547683923706, "grad_norm": 0.43411314487457275, "learning_rate": 1.9987764162142615e-05, "loss": 0.5542641639709472, "memory(GiB)": 83.67, "step": 105, "token_acc": 0.822730121577026, "train_speed(iter/s)": 0.016973 }, { "epoch": 0.34254573764110546, "grad_norm": 0.3889370560646057, "learning_rate": 1.998213650351541e-05, "loss": 0.5661673545837402, "memory(GiB)": 83.67, "step": 110, "token_acc": 0.8248022938189019, "train_speed(iter/s)": 0.016968 }, { "epoch": 0.3581159984429739, "grad_norm": 0.4113948941230774, "learning_rate": 1.99754483966383e-05, "loss": 0.5373417377471924, "memory(GiB)": 83.67, "step": 115, "token_acc": 0.8353561888566422, "train_speed(iter/s)": 0.016986 }, { "epoch": 0.37368625924484233, "grad_norm": 0.3282926678657532, "learning_rate": 1.996770055201962e-05, "loss": 0.532097053527832, "memory(GiB)": 83.67, "step": 120, "token_acc": 0.8311512985373245, "train_speed(iter/s)": 0.017002 }, { "epoch": 0.3892565200467108, "grad_norm": 0.3860708475112915, "learning_rate": 1.9958893792748527e-05, "loss": 0.5377762794494629, "memory(GiB)": 83.67, "step": 125, "token_acc": 0.8319092733783512, "train_speed(iter/s)": 0.017043 }, { "epoch": 0.4048267808485792, "grad_norm": 0.3636181056499481, "learning_rate": 1.994902905440754e-05, "loss": 0.5360857009887695, "memory(GiB)": 83.67, "step": 130, "token_acc": 0.8289638007457961, "train_speed(iter/s)": 0.017049 }, { "epoch": 0.42039704165044767, "grad_norm": 0.44786185026168823, "learning_rate": 1.9938107384973165e-05, "loss": 0.5159939765930176, "memory(GiB)": 83.67, "step": 135, "token_acc": 0.8303873578325347, "train_speed(iter/s)": 0.017068 }, { "epoch": 0.4359673024523161, "grad_norm": 0.36897265911102295, "learning_rate": 1.9926129944704552e-05, "loss": 0.525636863708496, "memory(GiB)": 83.67, "step": 140, "token_acc": 0.825755096315691, "train_speed(iter/s)": 0.017067 }, { "epoch": 0.4515375632541845, "grad_norm": 0.3940599262714386, "learning_rate": 1.9913098006020245e-05, "loss": 0.5220311164855957, "memory(GiB)": 83.67, "step": 145, "token_acc": 0.8314430013298949, "train_speed(iter/s)": 0.017053 }, { "epoch": 0.46710782405605294, "grad_norm": 0.36770397424697876, "learning_rate": 1.9899012953363002e-05, "loss": 0.5308480262756348, "memory(GiB)": 83.67, "step": 150, "token_acc": 0.8356369708426136, "train_speed(iter/s)": 0.017061 }, { "epoch": 0.48267808485792135, "grad_norm": 0.3850296437740326, "learning_rate": 1.988387628305271e-05, "loss": 0.5177151679992675, "memory(GiB)": 83.67, "step": 155, "token_acc": 0.8446110002134257, "train_speed(iter/s)": 0.017066 }, { "epoch": 0.4982483456597898, "grad_norm": 0.41091373562812805, "learning_rate": 1.9867689603127448e-05, "loss": 0.5239609718322754, "memory(GiB)": 83.67, "step": 160, "token_acc": 0.8418038278766896, "train_speed(iter/s)": 0.017026 }, { "epoch": 0.5138186064616582, "grad_norm": 0.35084888339042664, "learning_rate": 1.9850454633172632e-05, "loss": 0.5135612487792969, "memory(GiB)": 83.67, "step": 165, "token_acc": 0.8403358434638878, "train_speed(iter/s)": 0.017025 }, { "epoch": 0.5293888672635266, "grad_norm": 0.3684956729412079, "learning_rate": 1.9832173204138358e-05, "loss": 0.5212111473083496, "memory(GiB)": 83.67, "step": 170, "token_acc": 0.8203030950800491, "train_speed(iter/s)": 0.01705 }, { "epoch": 0.5449591280653951, "grad_norm": 0.4466633200645447, "learning_rate": 1.981284725814487e-05, "loss": 0.5236361503601075, "memory(GiB)": 83.67, "step": 175, "token_acc": 0.8292287351630786, "train_speed(iter/s)": 0.017037 }, { "epoch": 0.5605293888672636, "grad_norm": 0.4126527011394501, "learning_rate": 1.979247884827625e-05, "loss": 0.5252516746520997, "memory(GiB)": 83.67, "step": 180, "token_acc": 0.8313978101236051, "train_speed(iter/s)": 0.017027 }, { "epoch": 0.576099649669132, "grad_norm": 0.3615601658821106, "learning_rate": 1.9771070138362326e-05, "loss": 0.5181349277496338, "memory(GiB)": 83.67, "step": 185, "token_acc": 0.8271949270166622, "train_speed(iter/s)": 0.017036 }, { "epoch": 0.5916699104710004, "grad_norm": 0.363862544298172, "learning_rate": 1.974862340274876e-05, "loss": 0.5171935081481933, "memory(GiB)": 83.67, "step": 190, "token_acc": 0.8262061487073518, "train_speed(iter/s)": 0.017046 }, { "epoch": 0.6072401712728688, "grad_norm": 0.35778218507766724, "learning_rate": 1.9725141026055473e-05, "loss": 0.5045164585113525, "memory(GiB)": 83.67, "step": 195, "token_acc": 0.8280538716190542, "train_speed(iter/s)": 0.017058 }, { "epoch": 0.6228104320747373, "grad_norm": 0.3499464690685272, "learning_rate": 1.9700625502923286e-05, "loss": 0.5087326049804688, "memory(GiB)": 83.67, "step": 200, "token_acc": 0.8393105379001429, "train_speed(iter/s)": 0.017074 }, { "epoch": 0.6228104320747373, "eval_loss": 0.47101157903671265, "eval_runtime": 48.3616, "eval_samples_per_second": 8.581, "eval_steps_per_second": 1.075, "eval_token_acc": 0.8365704554766953, "step": 200 }, { "epoch": 0.6383806928766057, "grad_norm": 0.3926837146282196, "learning_rate": 1.967507943774893e-05, "loss": 0.5087917804718017, "memory(GiB)": 83.78, "step": 205, "token_acc": 0.8356912249863274, "train_speed(iter/s)": 0.016969 }, { "epoch": 0.6539509536784741, "grad_norm": 0.44116681814193726, "learning_rate": 1.9648505544408343e-05, "loss": 0.5104311943054199, "memory(GiB)": 83.78, "step": 210, "token_acc": 0.8370874883557564, "train_speed(iter/s)": 0.016962 }, { "epoch": 0.6695212144803425, "grad_norm": 0.3881992995738983, "learning_rate": 1.962090664596838e-05, "loss": 0.49617815017700195, "memory(GiB)": 83.78, "step": 215, "token_acc": 0.8333252446460243, "train_speed(iter/s)": 0.016969 }, { "epoch": 0.6850914752822109, "grad_norm": 0.3824191987514496, "learning_rate": 1.9592285674386895e-05, "loss": 0.5057227134704589, "memory(GiB)": 83.78, "step": 220, "token_acc": 0.8328552368245496, "train_speed(iter/s)": 0.016954 }, { "epoch": 0.7006617360840794, "grad_norm": 0.3746967017650604, "learning_rate": 1.9562645670201278e-05, "loss": 0.5225645542144776, "memory(GiB)": 83.78, "step": 225, "token_acc": 0.8198400577125468, "train_speed(iter/s)": 0.016942 }, { "epoch": 0.7162319968859479, "grad_norm": 0.38867348432540894, "learning_rate": 1.9531989782205425e-05, "loss": 0.5209392547607422, "memory(GiB)": 83.78, "step": 230, "token_acc": 0.8268426966579727, "train_speed(iter/s)": 0.016954 }, { "epoch": 0.7318022576878163, "grad_norm": 0.34132710099220276, "learning_rate": 1.9500321267115253e-05, "loss": 0.506260871887207, "memory(GiB)": 83.78, "step": 235, "token_acc": 0.8383771649752925, "train_speed(iter/s)": 0.016942 }, { "epoch": 0.7473725184896847, "grad_norm": 0.3127667009830475, "learning_rate": 1.9467643489222704e-05, "loss": 0.5079313278198242, "memory(GiB)": 83.78, "step": 240, "token_acc": 0.8315731041446339, "train_speed(iter/s)": 0.016933 }, { "epoch": 0.7629427792915532, "grad_norm": 0.36045560240745544, "learning_rate": 1.9433959920038346e-05, "loss": 0.5103404521942139, "memory(GiB)": 83.78, "step": 245, "token_acc": 0.8284737724912421, "train_speed(iter/s)": 0.016938 }, { "epoch": 0.7785130400934216, "grad_norm": 0.3331986963748932, "learning_rate": 1.939927413792258e-05, "loss": 0.5129657745361328, "memory(GiB)": 83.78, "step": 250, "token_acc": 0.8337586241949231, "train_speed(iter/s)": 0.016947 }, { "epoch": 0.79408330089529, "grad_norm": 0.3113352954387665, "learning_rate": 1.9363589827705494e-05, "loss": 0.5070863723754883, "memory(GiB)": 83.78, "step": 255, "token_acc": 0.8234678436927421, "train_speed(iter/s)": 0.016943 }, { "epoch": 0.8096535616971584, "grad_norm": 0.36101603507995605, "learning_rate": 1.932691078029541e-05, "loss": 0.5083826541900635, "memory(GiB)": 83.78, "step": 260, "token_acc": 0.8253210564574751, "train_speed(iter/s)": 0.016957 }, { "epoch": 0.8252238224990268, "grad_norm": 0.35200613737106323, "learning_rate": 1.9289240892276156e-05, "loss": 0.5094104290008545, "memory(GiB)": 83.78, "step": 265, "token_acc": 0.8399629352836229, "train_speed(iter/s)": 0.016958 }, { "epoch": 0.8407940833008953, "grad_norm": 0.3463002145290375, "learning_rate": 1.9250584165493102e-05, "loss": 0.503563404083252, "memory(GiB)": 83.78, "step": 270, "token_acc": 0.8328735843753342, "train_speed(iter/s)": 0.016964 }, { "epoch": 0.8563643441027637, "grad_norm": 0.35573363304138184, "learning_rate": 1.9210944706628047e-05, "loss": 0.5095272064208984, "memory(GiB)": 83.78, "step": 275, "token_acc": 0.8259964544369949, "train_speed(iter/s)": 0.016968 }, { "epoch": 0.8719346049046321, "grad_norm": 0.37910547852516174, "learning_rate": 1.9170326726762935e-05, "loss": 0.512710428237915, "memory(GiB)": 83.78, "step": 280, "token_acc": 0.8316431444307558, "train_speed(iter/s)": 0.01696 }, { "epoch": 0.8875048657065006, "grad_norm": 0.36142924427986145, "learning_rate": 1.9128734540932494e-05, "loss": 0.5098121643066407, "memory(GiB)": 83.78, "step": 285, "token_acc": 0.8245892568215938, "train_speed(iter/s)": 0.016968 }, { "epoch": 0.903075126508369, "grad_norm": 0.3772912621498108, "learning_rate": 1.908617256766583e-05, "loss": 0.5062539577484131, "memory(GiB)": 83.78, "step": 290, "token_acc": 0.834810945744792, "train_speed(iter/s)": 0.016961 }, { "epoch": 0.9186453873102375, "grad_norm": 0.3878962993621826, "learning_rate": 1.904264532851702e-05, "loss": 0.4917923927307129, "memory(GiB)": 83.78, "step": 295, "token_acc": 0.8298348122666052, "train_speed(iter/s)": 0.016966 }, { "epoch": 0.9342156481121059, "grad_norm": 0.31322357058525085, "learning_rate": 1.899815744758478e-05, "loss": 0.49855747222900393, "memory(GiB)": 83.78, "step": 300, "token_acc": 0.8404212765465496, "train_speed(iter/s)": 0.016995 }, { "epoch": 0.9342156481121059, "eval_loss": 0.4527965784072876, "eval_runtime": 48.4158, "eval_samples_per_second": 8.572, "eval_steps_per_second": 1.074, "eval_token_acc": 0.8412428185484144, "step": 300 }, { "epoch": 0.9497859089139743, "grad_norm": 0.34280747175216675, "learning_rate": 1.8952713651021227e-05, "loss": 0.48580265045166016, "memory(GiB)": 83.78, "step": 305, "token_acc": 0.838886737615503, "train_speed(iter/s)": 0.016948 }, { "epoch": 0.9653561697158427, "grad_norm": 0.37516114115715027, "learning_rate": 1.890631876652977e-05, "loss": 0.49081811904907224, "memory(GiB)": 83.78, "step": 310, "token_acc": 0.8279721583939669, "train_speed(iter/s)": 0.016959 }, { "epoch": 0.9809264305177112, "grad_norm": 0.37829071283340454, "learning_rate": 1.8858977722852273e-05, "loss": 0.5034436225891114, "memory(GiB)": 83.78, "step": 315, "token_acc": 0.8373031558965061, "train_speed(iter/s)": 0.016951 }, { "epoch": 0.9964966913195796, "grad_norm": 0.3431848883628845, "learning_rate": 1.881069554924545e-05, "loss": 0.4938789367675781, "memory(GiB)": 83.78, "step": 320, "token_acc": 0.8419751634548625, "train_speed(iter/s)": 0.016944 }, { "epoch": 1.0124562086414948, "grad_norm": 0.3898485600948334, "learning_rate": 1.8761477374946548e-05, "loss": 0.549742317199707, "memory(GiB)": 83.78, "step": 325, "token_acc": 0.8478408314628672, "train_speed(iter/s)": 0.016925 }, { "epoch": 1.0280264694433632, "grad_norm": 0.3720360994338989, "learning_rate": 1.8711328428628492e-05, "loss": 0.4674954414367676, "memory(GiB)": 83.78, "step": 330, "token_acc": 0.8488305868952306, "train_speed(iter/s)": 0.016907 }, { "epoch": 1.0435967302452316, "grad_norm": 0.37718313932418823, "learning_rate": 1.866025403784439e-05, "loss": 0.4795982837677002, "memory(GiB)": 83.78, "step": 335, "token_acc": 0.8451706485573955, "train_speed(iter/s)": 0.016912 }, { "epoch": 1.0591669910471, "grad_norm": 0.35298970341682434, "learning_rate": 1.8608259628461568e-05, "loss": 0.46731414794921877, "memory(GiB)": 83.78, "step": 340, "token_acc": 0.8569462760162835, "train_speed(iter/s)": 0.016926 }, { "epoch": 1.0747372518489684, "grad_norm": 0.33034011721611023, "learning_rate": 1.855535072408516e-05, "loss": 0.4545105457305908, "memory(GiB)": 83.78, "step": 345, "token_acc": 0.8460660005038206, "train_speed(iter/s)": 0.016927 }, { "epoch": 1.0903075126508368, "grad_norm": 0.3065577745437622, "learning_rate": 1.850153294547131e-05, "loss": 0.45794997215270994, "memory(GiB)": 83.78, "step": 350, "token_acc": 0.8486942707726407, "train_speed(iter/s)": 0.016917 }, { "epoch": 1.1058777734527054, "grad_norm": 0.3462938368320465, "learning_rate": 1.8446812009930046e-05, "loss": 0.46443839073181153, "memory(GiB)": 83.78, "step": 355, "token_acc": 0.8502326066894884, "train_speed(iter/s)": 0.016914 }, { "epoch": 1.1214480342545738, "grad_norm": 0.32309216260910034, "learning_rate": 1.839119373071791e-05, "loss": 0.4771932601928711, "memory(GiB)": 83.78, "step": 360, "token_acc": 0.8482756332906233, "train_speed(iter/s)": 0.016898 }, { "epoch": 1.1370182950564423, "grad_norm": 0.33675771951675415, "learning_rate": 1.8334684016420383e-05, "loss": 0.4566344261169434, "memory(GiB)": 83.78, "step": 365, "token_acc": 0.8473928793757336, "train_speed(iter/s)": 0.016891 }, { "epoch": 1.1525885558583107, "grad_norm": 0.3071984648704529, "learning_rate": 1.82772888703242e-05, "loss": 0.4622032165527344, "memory(GiB)": 83.78, "step": 370, "token_acc": 0.8537222303050629, "train_speed(iter/s)": 0.016883 }, { "epoch": 1.168158816660179, "grad_norm": 0.339647114276886, "learning_rate": 1.8219014389779586e-05, "loss": 0.45253515243530273, "memory(GiB)": 83.78, "step": 375, "token_acc": 0.8437412853929243, "train_speed(iter/s)": 0.01689 }, { "epoch": 1.1837290774620475, "grad_norm": 0.3810037672519684, "learning_rate": 1.81598667655525e-05, "loss": 0.46290812492370603, "memory(GiB)": 83.78, "step": 380, "token_acc": 0.8509664132197645, "train_speed(iter/s)": 0.016878 }, { "epoch": 1.1992993382639159, "grad_norm": 0.3978622853755951, "learning_rate": 1.8099852281166974e-05, "loss": 0.4630721569061279, "memory(GiB)": 83.78, "step": 385, "token_acc": 0.8428528719403278, "train_speed(iter/s)": 0.016867 }, { "epoch": 1.2148695990657843, "grad_norm": 0.34080690145492554, "learning_rate": 1.8038977312237583e-05, "loss": 0.46429901123046874, "memory(GiB)": 83.78, "step": 390, "token_acc": 0.853177745668261, "train_speed(iter/s)": 0.016864 }, { "epoch": 1.2304398598676527, "grad_norm": 0.38950115442276, "learning_rate": 1.7977248325792117e-05, "loss": 0.4587130546569824, "memory(GiB)": 83.78, "step": 395, "token_acc": 0.8436527334397446, "train_speed(iter/s)": 0.016864 }, { "epoch": 1.246010120669521, "grad_norm": 0.33555251359939575, "learning_rate": 1.791467187958459e-05, "loss": 0.4662825584411621, "memory(GiB)": 83.78, "step": 400, "token_acc": 0.8447516930022574, "train_speed(iter/s)": 0.016854 }, { "epoch": 1.246010120669521, "eval_loss": 0.44290465116500854, "eval_runtime": 48.4727, "eval_samples_per_second": 8.562, "eval_steps_per_second": 1.073, "eval_token_acc": 0.8440737465212763, "step": 400 }, { "epoch": 1.2615803814713895, "grad_norm": 0.3424926996231079, "learning_rate": 1.785125462139855e-05, "loss": 0.45247802734375, "memory(GiB)": 85.46, "step": 405, "token_acc": 0.8476541229236075, "train_speed(iter/s)": 0.016819 }, { "epoch": 1.2771506422732581, "grad_norm": 0.3556825518608093, "learning_rate": 1.7787003288340873e-05, "loss": 0.4520209312438965, "memory(GiB)": 85.46, "step": 410, "token_acc": 0.85518420823792, "train_speed(iter/s)": 0.016827 }, { "epoch": 1.2927209030751265, "grad_norm": 0.3012397587299347, "learning_rate": 1.7721924706126045e-05, "loss": 0.4547447204589844, "memory(GiB)": 85.46, "step": 415, "token_acc": 0.8473332915910087, "train_speed(iter/s)": 0.016838 }, { "epoch": 1.308291163876995, "grad_norm": 0.3305128514766693, "learning_rate": 1.765602578835102e-05, "loss": 0.44603533744812013, "memory(GiB)": 85.46, "step": 420, "token_acc": 0.8563695561772267, "train_speed(iter/s)": 0.016851 }, { "epoch": 1.3238614246788634, "grad_norm": 0.375415563583374, "learning_rate": 1.7589313535760787e-05, "loss": 0.4534785270690918, "memory(GiB)": 85.46, "step": 425, "token_acc": 0.8450059826434574, "train_speed(iter/s)": 0.016859 }, { "epoch": 1.3394316854807318, "grad_norm": 0.3619174659252167, "learning_rate": 1.7521795035504618e-05, "loss": 0.46638121604919436, "memory(GiB)": 87.47, "step": 430, "token_acc": 0.8533132783257229, "train_speed(iter/s)": 0.016865 }, { "epoch": 1.3550019462826002, "grad_norm": 0.34234941005706787, "learning_rate": 1.745347746038319e-05, "loss": 0.45301074981689454, "memory(GiB)": 87.47, "step": 435, "token_acc": 0.8506241771102551, "train_speed(iter/s)": 0.016853 }, { "epoch": 1.3705722070844686, "grad_norm": 0.35721340775489807, "learning_rate": 1.738436806808657e-05, "loss": 0.4574443817138672, "memory(GiB)": 87.47, "step": 440, "token_acc": 0.8492071302651172, "train_speed(iter/s)": 0.016851 }, { "epoch": 1.3861424678863372, "grad_norm": 0.3377233147621155, "learning_rate": 1.731447420042321e-05, "loss": 0.4555491924285889, "memory(GiB)": 87.47, "step": 445, "token_acc": 0.8449323972958919, "train_speed(iter/s)": 0.016853 }, { "epoch": 1.4017127286882056, "grad_norm": 0.3378112316131592, "learning_rate": 1.724380328253998e-05, "loss": 0.456014347076416, "memory(GiB)": 87.47, "step": 450, "token_acc": 0.8491198343217546, "train_speed(iter/s)": 0.016866 }, { "epoch": 1.417282989490074, "grad_norm": 0.32281750440597534, "learning_rate": 1.7172362822133368e-05, "loss": 0.444715690612793, "memory(GiB)": 87.47, "step": 455, "token_acc": 0.8585264429436933, "train_speed(iter/s)": 0.01687 }, { "epoch": 1.4328532502919424, "grad_norm": 0.3332570791244507, "learning_rate": 1.7100160408651906e-05, "loss": 0.46764631271362306, "memory(GiB)": 87.47, "step": 460, "token_acc": 0.8437270092325083, "train_speed(iter/s)": 0.016885 }, { "epoch": 1.4484235110938108, "grad_norm": 0.3404083251953125, "learning_rate": 1.7027203712489902e-05, "loss": 0.4540658950805664, "memory(GiB)": 87.47, "step": 465, "token_acc": 0.8424792767766657, "train_speed(iter/s)": 0.016891 }, { "epoch": 1.4639937718956793, "grad_norm": 0.28665056824684143, "learning_rate": 1.6953500484172584e-05, "loss": 0.4646796703338623, "memory(GiB)": 87.47, "step": 470, "token_acc": 0.8517768926209016, "train_speed(iter/s)": 0.016894 }, { "epoch": 1.4795640326975477, "grad_norm": 0.3514065742492676, "learning_rate": 1.6879058553532708e-05, "loss": 0.4555992603302002, "memory(GiB)": 87.47, "step": 475, "token_acc": 0.8537480228233454, "train_speed(iter/s)": 0.016905 }, { "epoch": 1.495134293499416, "grad_norm": 0.36578574776649475, "learning_rate": 1.6803885828878798e-05, "loss": 0.4544710636138916, "memory(GiB)": 87.47, "step": 480, "token_acc": 0.8525084812486251, "train_speed(iter/s)": 0.016912 }, { "epoch": 1.5107045543012845, "grad_norm": 0.32191744446754456, "learning_rate": 1.6727990296154962e-05, "loss": 0.4602982521057129, "memory(GiB)": 87.47, "step": 485, "token_acc": 0.8430140101913102, "train_speed(iter/s)": 0.016906 }, { "epoch": 1.5262748151031529, "grad_norm": 0.37726181745529175, "learning_rate": 1.665138001809255e-05, "loss": 0.45351152420043944, "memory(GiB)": 87.47, "step": 490, "token_acc": 0.8455952326181131, "train_speed(iter/s)": 0.016915 }, { "epoch": 1.5418450759050213, "grad_norm": 0.3360103666782379, "learning_rate": 1.657406313335358e-05, "loss": 0.4640647411346436, "memory(GiB)": 87.47, "step": 495, "token_acc": 0.8416911128839417, "train_speed(iter/s)": 0.016901 }, { "epoch": 1.5574153367068897, "grad_norm": 0.3354435861110687, "learning_rate": 1.6496047855666166e-05, "loss": 0.45473790168762207, "memory(GiB)": 87.47, "step": 500, "token_acc": 0.8470284591147179, "train_speed(iter/s)": 0.016903 }, { "epoch": 1.5574153367068897, "eval_loss": 0.4339936375617981, "eval_runtime": 49.215, "eval_samples_per_second": 8.432, "eval_steps_per_second": 1.057, "eval_token_acc": 0.8464905695468675, "step": 500 }, { "epoch": 1.5729855975087583, "grad_norm": 0.29624515771865845, "learning_rate": 1.641734247295189e-05, "loss": 0.44854736328125, "memory(GiB)": 87.47, "step": 505, "token_acc": 0.8512255304674686, "train_speed(iter/s)": 0.016867 }, { "epoch": 1.5885558583106267, "grad_norm": 0.2959994375705719, "learning_rate": 1.633795534644538e-05, "loss": 0.44970054626464845, "memory(GiB)": 87.47, "step": 510, "token_acc": 0.8495887288243693, "train_speed(iter/s)": 0.016868 }, { "epoch": 1.6041261191124951, "grad_norm": 0.28208568692207336, "learning_rate": 1.625789490980604e-05, "loss": 0.45240216255187987, "memory(GiB)": 87.47, "step": 515, "token_acc": 0.8508599646850458, "train_speed(iter/s)": 0.016866 }, { "epoch": 1.6196963799143635, "grad_norm": 0.31556007266044617, "learning_rate": 1.61771696682221e-05, "loss": 0.46805973052978517, "memory(GiB)": 87.47, "step": 520, "token_acc": 0.8385705498249266, "train_speed(iter/s)": 0.016855 }, { "epoch": 1.635266640716232, "grad_norm": 0.33998918533325195, "learning_rate": 1.609578819750708e-05, "loss": 0.4480471611022949, "memory(GiB)": 87.47, "step": 525, "token_acc": 0.8535245057224707, "train_speed(iter/s)": 0.016854 }, { "epoch": 1.6508369015181006, "grad_norm": 0.32541388273239136, "learning_rate": 1.601375914318873e-05, "loss": 0.44594502449035645, "memory(GiB)": 87.47, "step": 530, "token_acc": 0.8513384035634705, "train_speed(iter/s)": 0.01686 }, { "epoch": 1.666407162319969, "grad_norm": 0.3565449118614197, "learning_rate": 1.5931091219590594e-05, "loss": 0.44635515213012694, "memory(GiB)": 87.47, "step": 535, "token_acc": 0.8587149590440981, "train_speed(iter/s)": 0.016864 }, { "epoch": 1.6819774231218374, "grad_norm": 0.30892956256866455, "learning_rate": 1.5847793208906228e-05, "loss": 0.4479209899902344, "memory(GiB)": 87.47, "step": 540, "token_acc": 0.8477150375810529, "train_speed(iter/s)": 0.016866 }, { "epoch": 1.6975476839237058, "grad_norm": 0.32783636450767517, "learning_rate": 1.5763873960266236e-05, "loss": 0.4361083984375, "memory(GiB)": 87.47, "step": 545, "token_acc": 0.8581115692629165, "train_speed(iter/s)": 0.01688 }, { "epoch": 1.7131179447255742, "grad_norm": 0.31219062209129333, "learning_rate": 1.567934238879819e-05, "loss": 0.44908871650695803, "memory(GiB)": 87.47, "step": 550, "token_acc": 0.8354629470446383, "train_speed(iter/s)": 0.01688 }, { "epoch": 1.7286882055274426, "grad_norm": 0.3076675534248352, "learning_rate": 1.5594207474679533e-05, "loss": 0.44863643646240237, "memory(GiB)": 87.47, "step": 555, "token_acc": 0.8530024926954223, "train_speed(iter/s)": 0.01688 }, { "epoch": 1.744258466329311, "grad_norm": 0.35262957215309143, "learning_rate": 1.5508478262183564e-05, "loss": 0.44416370391845705, "memory(GiB)": 87.47, "step": 560, "token_acc": 0.8449086194172916, "train_speed(iter/s)": 0.016884 }, { "epoch": 1.7598287271311794, "grad_norm": 0.31981098651885986, "learning_rate": 1.5422163858718632e-05, "loss": 0.4421844482421875, "memory(GiB)": 87.47, "step": 565, "token_acc": 0.8547054363189448, "train_speed(iter/s)": 0.016885 }, { "epoch": 1.7753989879330478, "grad_norm": 0.3417418301105499, "learning_rate": 1.533527343386062e-05, "loss": 0.45076637268066405, "memory(GiB)": 87.47, "step": 570, "token_acc": 0.8389380153741954, "train_speed(iter/s)": 0.016882 }, { "epoch": 1.7909692487349163, "grad_norm": 0.31734615564346313, "learning_rate": 1.5247816218378808e-05, "loss": 0.43622050285339353, "memory(GiB)": 87.47, "step": 575, "token_acc": 0.8538195434018485, "train_speed(iter/s)": 0.0169 }, { "epoch": 1.8065395095367847, "grad_norm": 0.34167781472206116, "learning_rate": 1.5159801503255245e-05, "loss": 0.4417697906494141, "memory(GiB)": 87.47, "step": 580, "token_acc": 0.85343586541145, "train_speed(iter/s)": 0.016913 }, { "epoch": 1.822109770338653, "grad_norm": 0.3014916777610779, "learning_rate": 1.5071238638697731e-05, "loss": 0.464891242980957, "memory(GiB)": 87.47, "step": 585, "token_acc": 0.8448814913208093, "train_speed(iter/s)": 0.016921 }, { "epoch": 1.8376800311405215, "grad_norm": 0.28431716561317444, "learning_rate": 1.4982137033146508e-05, "loss": 0.43960394859313967, "memory(GiB)": 87.47, "step": 590, "token_acc": 0.8527028466591703, "train_speed(iter/s)": 0.016926 }, { "epoch": 1.8532502919423899, "grad_norm": 0.2890400290489197, "learning_rate": 1.4892506152274743e-05, "loss": 0.43685274124145507, "memory(GiB)": 87.47, "step": 595, "token_acc": 0.8587665608002407, "train_speed(iter/s)": 0.016925 }, { "epoch": 1.8688205527442585, "grad_norm": 0.3061586916446686, "learning_rate": 1.4802355517982956e-05, "loss": 0.45107498168945315, "memory(GiB)": 87.47, "step": 600, "token_acc": 0.8508957683688835, "train_speed(iter/s)": 0.01692 }, { "epoch": 1.8688205527442585, "eval_loss": 0.4256907105445862, "eval_runtime": 48.605, "eval_samples_per_second": 8.538, "eval_steps_per_second": 1.07, "eval_token_acc": 0.8487490155926254, "step": 600 }, { "epoch": 1.884390813546127, "grad_norm": 0.3080097436904907, "learning_rate": 1.4711694707387459e-05, "loss": 0.4596552848815918, "memory(GiB)": 87.47, "step": 605, "token_acc": 0.8494166730021033, "train_speed(iter/s)": 0.016891 }, { "epoch": 1.8999610743479953, "grad_norm": 0.3166070282459259, "learning_rate": 1.462053335180294e-05, "loss": 0.44292964935302737, "memory(GiB)": 87.47, "step": 610, "token_acc": 0.8526988947012526, "train_speed(iter/s)": 0.016896 }, { "epoch": 1.9155313351498637, "grad_norm": 0.29401150345802307, "learning_rate": 1.452888113571929e-05, "loss": 0.4381908893585205, "memory(GiB)": 87.47, "step": 615, "token_acc": 0.8555842110978391, "train_speed(iter/s)": 0.016908 }, { "epoch": 1.9311015959517321, "grad_norm": 0.3151325285434723, "learning_rate": 1.4436747795772752e-05, "loss": 0.4259210109710693, "memory(GiB)": 87.47, "step": 620, "token_acc": 0.8545294649153147, "train_speed(iter/s)": 0.016922 }, { "epoch": 1.9466718567536008, "grad_norm": 0.29976552724838257, "learning_rate": 1.4344143119711585e-05, "loss": 0.44890317916870115, "memory(GiB)": 87.47, "step": 625, "token_acc": 0.8479431788087383, "train_speed(iter/s)": 0.016918 }, { "epoch": 1.9622421175554692, "grad_norm": 0.3326264023780823, "learning_rate": 1.4251076945356233e-05, "loss": 0.4403618335723877, "memory(GiB)": 87.47, "step": 630, "token_acc": 0.8553420249762108, "train_speed(iter/s)": 0.016916 }, { "epoch": 1.9778123783573376, "grad_norm": 0.3118704557418823, "learning_rate": 1.4157559159554244e-05, "loss": 0.4499207496643066, "memory(GiB)": 87.47, "step": 635, "token_acc": 0.8585684300402007, "train_speed(iter/s)": 0.01692 }, { "epoch": 1.993382639159206, "grad_norm": 0.31018197536468506, "learning_rate": 1.4063599697129912e-05, "loss": 0.43601245880126954, "memory(GiB)": 87.47, "step": 640, "token_acc": 0.8492618761832873, "train_speed(iter/s)": 0.016918 }, { "epoch": 2.009342156481121, "grad_norm": 0.4045466482639313, "learning_rate": 1.3969208539828873e-05, "loss": 0.5253468990325928, "memory(GiB)": 87.47, "step": 645, "token_acc": 0.8468511299166429, "train_speed(iter/s)": 0.016918 }, { "epoch": 2.0249124172829895, "grad_norm": 0.35480746626853943, "learning_rate": 1.3874395715257697e-05, "loss": 0.4091975212097168, "memory(GiB)": 87.47, "step": 650, "token_acc": 0.8652064686351988, "train_speed(iter/s)": 0.016917 }, { "epoch": 2.040482678084858, "grad_norm": 0.304674357175827, "learning_rate": 1.3779171295818606e-05, "loss": 0.4048311233520508, "memory(GiB)": 87.47, "step": 655, "token_acc": 0.860902665654438, "train_speed(iter/s)": 0.016915 }, { "epoch": 2.0560529388867264, "grad_norm": 0.34621867537498474, "learning_rate": 1.3683545397639433e-05, "loss": 0.4079150199890137, "memory(GiB)": 87.47, "step": 660, "token_acc": 0.8592790169293577, "train_speed(iter/s)": 0.01693 }, { "epoch": 2.0716231996885948, "grad_norm": 0.31017231941223145, "learning_rate": 1.3587528179498946e-05, "loss": 0.4034367561340332, "memory(GiB)": 87.47, "step": 665, "token_acc": 0.8592915642451773, "train_speed(iter/s)": 0.016927 }, { "epoch": 2.087193460490463, "grad_norm": 0.34436559677124023, "learning_rate": 1.3491129841747632e-05, "loss": 0.40624065399169923, "memory(GiB)": 87.47, "step": 670, "token_acc": 0.86241849685157, "train_speed(iter/s)": 0.016928 }, { "epoch": 2.1027637212923316, "grad_norm": 0.32138824462890625, "learning_rate": 1.3394360625224067e-05, "loss": 0.4064358711242676, "memory(GiB)": 87.47, "step": 675, "token_acc": 0.864174034962998, "train_speed(iter/s)": 0.016932 }, { "epoch": 2.1183339820942, "grad_norm": 0.32160255312919617, "learning_rate": 1.3297230810166979e-05, "loss": 0.4131148338317871, "memory(GiB)": 87.47, "step": 680, "token_acc": 0.8563570810274059, "train_speed(iter/s)": 0.016933 }, { "epoch": 2.1339042428960684, "grad_norm": 0.29857733845710754, "learning_rate": 1.3199750715123144e-05, "loss": 0.40442190170288084, "memory(GiB)": 87.47, "step": 685, "token_acc": 0.8606646118780595, "train_speed(iter/s)": 0.016936 }, { "epoch": 2.149474503697937, "grad_norm": 0.3053974211215973, "learning_rate": 1.3101930695851186e-05, "loss": 0.4091023921966553, "memory(GiB)": 87.47, "step": 690, "token_acc": 0.8537588049550644, "train_speed(iter/s)": 0.016931 }, { "epoch": 2.165044764499805, "grad_norm": 0.32609260082244873, "learning_rate": 1.300378114422144e-05, "loss": 0.4144451141357422, "memory(GiB)": 88.68, "step": 695, "token_acc": 0.8581630992954251, "train_speed(iter/s)": 0.016933 }, { "epoch": 2.1806150253016736, "grad_norm": 0.2846038043498993, "learning_rate": 1.2905312487111981e-05, "loss": 0.4058229923248291, "memory(GiB)": 88.68, "step": 700, "token_acc": 0.8490695870940025, "train_speed(iter/s)": 0.016924 }, { "epoch": 2.1806150253016736, "eval_loss": 0.4248170256614685, "eval_runtime": 48.4312, "eval_samples_per_second": 8.569, "eval_steps_per_second": 1.074, "eval_token_acc": 0.8499472561067161, "step": 700 }, { "epoch": 2.1961852861035425, "grad_norm": 0.3057588040828705, "learning_rate": 1.2806535185300931e-05, "loss": 0.39852018356323243, "memory(GiB)": 88.68, "step": 705, "token_acc": 0.859068653718373, "train_speed(iter/s)": 0.016907 }, { "epoch": 2.211755546905411, "grad_norm": 0.3422738015651703, "learning_rate": 1.2707459732355152e-05, "loss": 0.40930471420288084, "memory(GiB)": 88.68, "step": 710, "token_acc": 0.8593000519385292, "train_speed(iter/s)": 0.016916 }, { "epoch": 2.2273258077072793, "grad_norm": 0.32292571663856506, "learning_rate": 1.260809665351547e-05, "loss": 0.40809078216552735, "memory(GiB)": 88.68, "step": 715, "token_acc": 0.8623604255075267, "train_speed(iter/s)": 0.016914 }, { "epoch": 2.2428960685091477, "grad_norm": 0.30992391705513, "learning_rate": 1.2508456504578538e-05, "loss": 0.40337481498718264, "memory(GiB)": 88.68, "step": 720, "token_acc": 0.8515773998256702, "train_speed(iter/s)": 0.016923 }, { "epoch": 2.258466329311016, "grad_norm": 0.3296166956424713, "learning_rate": 1.2408549870775432e-05, "loss": 0.4040327548980713, "memory(GiB)": 88.68, "step": 725, "token_acc": 0.870754853952457, "train_speed(iter/s)": 0.016915 }, { "epoch": 2.2740365901128845, "grad_norm": 0.3059770166873932, "learning_rate": 1.230838736564715e-05, "loss": 0.388106107711792, "memory(GiB)": 88.68, "step": 730, "token_acc": 0.8662276135612913, "train_speed(iter/s)": 0.016923 }, { "epoch": 2.289606850914753, "grad_norm": 0.29819902777671814, "learning_rate": 1.2207979629917061e-05, "loss": 0.415024995803833, "memory(GiB)": 88.68, "step": 735, "token_acc": 0.8562834300703839, "train_speed(iter/s)": 0.016919 }, { "epoch": 2.3051771117166213, "grad_norm": 0.31571272015571594, "learning_rate": 1.2107337330360533e-05, "loss": 0.4108760833740234, "memory(GiB)": 88.68, "step": 740, "token_acc": 0.8624641478349758, "train_speed(iter/s)": 0.016924 }, { "epoch": 2.3207473725184897, "grad_norm": 0.30818915367126465, "learning_rate": 1.2006471158671702e-05, "loss": 0.41235151290893557, "memory(GiB)": 88.68, "step": 745, "token_acc": 0.8561366178899871, "train_speed(iter/s)": 0.01693 }, { "epoch": 2.336317633320358, "grad_norm": 0.293542742729187, "learning_rate": 1.1905391830327685e-05, "loss": 0.418719482421875, "memory(GiB)": 88.68, "step": 750, "token_acc": 0.8582308714036777, "train_speed(iter/s)": 0.016924 }, { "epoch": 2.3518878941222265, "grad_norm": 0.2972683310508728, "learning_rate": 1.180411008345021e-05, "loss": 0.40260977745056153, "memory(GiB)": 88.68, "step": 755, "token_acc": 0.8619026578825308, "train_speed(iter/s)": 0.016922 }, { "epoch": 2.367458154924095, "grad_norm": 0.2991423010826111, "learning_rate": 1.1702636677664844e-05, "loss": 0.399456262588501, "memory(GiB)": 88.68, "step": 760, "token_acc": 0.8674852160245288, "train_speed(iter/s)": 0.016926 }, { "epoch": 2.3830284157259634, "grad_norm": 0.28033456206321716, "learning_rate": 1.1600982392957978e-05, "loss": 0.40012359619140625, "memory(GiB)": 88.68, "step": 765, "token_acc": 0.8569271295496779, "train_speed(iter/s)": 0.016929 }, { "epoch": 2.3985986765278318, "grad_norm": 0.30392777919769287, "learning_rate": 1.1499158028531585e-05, "loss": 0.4144479274749756, "memory(GiB)": 88.68, "step": 770, "token_acc": 0.8572535511903578, "train_speed(iter/s)": 0.016923 }, { "epoch": 2.4141689373297, "grad_norm": 0.27619481086730957, "learning_rate": 1.1397174401656009e-05, "loss": 0.4138012886047363, "memory(GiB)": 88.68, "step": 775, "token_acc": 0.8541658201074043, "train_speed(iter/s)": 0.016917 }, { "epoch": 2.4297391981315686, "grad_norm": 0.27977001667022705, "learning_rate": 1.1295042346520755e-05, "loss": 0.4025775909423828, "memory(GiB)": 88.68, "step": 780, "token_acc": 0.869224874229117, "train_speed(iter/s)": 0.016916 }, { "epoch": 2.445309458933437, "grad_norm": 0.35665157437324524, "learning_rate": 1.1192772713083557e-05, "loss": 0.4065700054168701, "memory(GiB)": 88.68, "step": 785, "token_acc": 0.8511265419646967, "train_speed(iter/s)": 0.016918 }, { "epoch": 2.4608797197353054, "grad_norm": 0.3625037670135498, "learning_rate": 1.1090376365917724e-05, "loss": 0.40373077392578127, "memory(GiB)": 88.68, "step": 790, "token_acc": 0.8655929839902706, "train_speed(iter/s)": 0.016916 }, { "epoch": 2.476449980537174, "grad_norm": 0.30906039476394653, "learning_rate": 1.0987864183057943e-05, "loss": 0.4046307563781738, "memory(GiB)": 88.68, "step": 795, "token_acc": 0.8631747227753758, "train_speed(iter/s)": 0.016918 }, { "epoch": 2.492020241339042, "grad_norm": 0.2804671823978424, "learning_rate": 1.088524705484466e-05, "loss": 0.39722390174865724, "memory(GiB)": 88.68, "step": 800, "token_acc": 0.8720521927504471, "train_speed(iter/s)": 0.016919 }, { "epoch": 2.492020241339042, "eval_loss": 0.4193665385246277, "eval_runtime": 49.4368, "eval_samples_per_second": 8.395, "eval_steps_per_second": 1.052, "eval_token_acc": 0.851378460924476, "step": 800 }, { "epoch": 2.5075905021409106, "grad_norm": 0.276696115732193, "learning_rate": 1.0782535882767144e-05, "loss": 0.40638461112976076, "memory(GiB)": 88.68, "step": 805, "token_acc": 0.8657145358437709, "train_speed(iter/s)": 0.016894 }, { "epoch": 2.523160762942779, "grad_norm": 0.29846805334091187, "learning_rate": 1.067974157830539e-05, "loss": 0.40010814666748046, "memory(GiB)": 88.68, "step": 810, "token_acc": 0.8588209819736914, "train_speed(iter/s)": 0.016899 }, { "epoch": 2.538731023744648, "grad_norm": 0.27926602959632874, "learning_rate": 1.0576875061770913e-05, "loss": 0.4041747570037842, "memory(GiB)": 88.68, "step": 815, "token_acc": 0.8594588904095168, "train_speed(iter/s)": 0.016903 }, { "epoch": 2.5543012845465163, "grad_norm": 0.31468990445137024, "learning_rate": 1.0473947261146654e-05, "loss": 0.3997108697891235, "memory(GiB)": 88.68, "step": 820, "token_acc": 0.864746443340764, "train_speed(iter/s)": 0.016909 }, { "epoch": 2.5698715453483847, "grad_norm": 0.317765474319458, "learning_rate": 1.0370969110926052e-05, "loss": 0.40914144515991213, "memory(GiB)": 88.68, "step": 825, "token_acc": 0.8667216600585897, "train_speed(iter/s)": 0.016911 }, { "epoch": 2.585441806150253, "grad_norm": 0.2770572900772095, "learning_rate": 1.0267951550951406e-05, "loss": 0.4096653461456299, "memory(GiB)": 88.68, "step": 830, "token_acc": 0.8620899938038338, "train_speed(iter/s)": 0.016914 }, { "epoch": 2.6010120669521215, "grad_norm": 0.2760813534259796, "learning_rate": 1.0164905525251695e-05, "loss": 0.3938852310180664, "memory(GiB)": 88.68, "step": 835, "token_acc": 0.8624401122397716, "train_speed(iter/s)": 0.016914 }, { "epoch": 2.61658232775399, "grad_norm": 0.2749018967151642, "learning_rate": 1.0061841980879941e-05, "loss": 0.4151924133300781, "memory(GiB)": 88.68, "step": 840, "token_acc": 0.8581742617267449, "train_speed(iter/s)": 0.016909 }, { "epoch": 2.6321525885558583, "grad_norm": 0.30041322112083435, "learning_rate": 9.958771866750266e-06, "loss": 0.4036086082458496, "memory(GiB)": 88.68, "step": 845, "token_acc": 0.8535083801509132, "train_speed(iter/s)": 0.016911 }, { "epoch": 2.6477228493577267, "grad_norm": 0.3127138018608093, "learning_rate": 9.855706132474719e-06, "loss": 0.39623475074768066, "memory(GiB)": 88.68, "step": 850, "token_acc": 0.8613897832181449, "train_speed(iter/s)": 0.016914 }, { "epoch": 2.663293110159595, "grad_norm": 0.3154863119125366, "learning_rate": 9.752655727200051e-06, "loss": 0.40503616333007814, "memory(GiB)": 88.68, "step": 855, "token_acc": 0.8705114688096711, "train_speed(iter/s)": 0.016916 }, { "epoch": 2.6788633709614635, "grad_norm": 0.30002886056900024, "learning_rate": 9.649631598444557e-06, "loss": 0.39531519412994387, "memory(GiB)": 88.68, "step": 860, "token_acc": 0.8739137447179123, "train_speed(iter/s)": 0.01692 }, { "epoch": 2.694433631763332, "grad_norm": 0.2766549289226532, "learning_rate": 9.54664469093505e-06, "loss": 0.4008350372314453, "memory(GiB)": 88.68, "step": 865, "token_acc": 0.8610019064176141, "train_speed(iter/s)": 0.016918 }, { "epoch": 2.7100038925652004, "grad_norm": 0.28013867139816284, "learning_rate": 9.443705945444158e-06, "loss": 0.40520267486572265, "memory(GiB)": 88.68, "step": 870, "token_acc": 0.872128417616696, "train_speed(iter/s)": 0.016923 }, { "epoch": 2.7255741533670688, "grad_norm": 0.2968541085720062, "learning_rate": 9.34082629762803e-06, "loss": 0.40741329193115233, "memory(GiB)": 88.68, "step": 875, "token_acc": 0.8571625546526471, "train_speed(iter/s)": 0.016921 }, { "epoch": 2.741144414168937, "grad_norm": 0.2884249687194824, "learning_rate": 9.23801667686461e-06, "loss": 0.40064706802368166, "memory(GiB)": 88.68, "step": 880, "token_acc": 0.8599723495981721, "train_speed(iter/s)": 0.016922 }, { "epoch": 2.756714674970806, "grad_norm": 0.27596229314804077, "learning_rate": 9.135288005092546e-06, "loss": 0.39715871810913084, "memory(GiB)": 88.68, "step": 885, "token_acc": 0.8647707635744576, "train_speed(iter/s)": 0.016927 }, { "epoch": 2.7722849357726744, "grad_norm": 0.30293765664100647, "learning_rate": 9.032651195650884e-06, "loss": 0.3991700649261475, "memory(GiB)": 88.68, "step": 890, "token_acc": 0.8664495037451203, "train_speed(iter/s)": 0.01693 }, { "epoch": 2.787855196574543, "grad_norm": 0.2744984030723572, "learning_rate": 8.930117152119736e-06, "loss": 0.3990873575210571, "memory(GiB)": 88.68, "step": 895, "token_acc": 0.8639404074186683, "train_speed(iter/s)": 0.016932 }, { "epoch": 2.8034254573764112, "grad_norm": 0.29595863819122314, "learning_rate": 8.827696767161902e-06, "loss": 0.4118965148925781, "memory(GiB)": 88.68, "step": 900, "token_acc": 0.8585057335917827, "train_speed(iter/s)": 0.016929 }, { "epoch": 2.8034254573764112, "eval_loss": 0.41329577565193176, "eval_runtime": 48.7426, "eval_samples_per_second": 8.514, "eval_steps_per_second": 1.067, "eval_token_acc": 0.8528091814089642, "step": 900 }, { "epoch": 2.8189957181782797, "grad_norm": 0.26091545820236206, "learning_rate": 8.725400921365722e-06, "loss": 0.39678106307983396, "memory(GiB)": 88.68, "step": 905, "token_acc": 0.8594101414128966, "train_speed(iter/s)": 0.016913 }, { "epoch": 2.834565978980148, "grad_norm": 0.2783149778842926, "learning_rate": 8.623240482089153e-06, "loss": 0.3983915328979492, "memory(GiB)": 88.68, "step": 910, "token_acc": 0.8653014428764835, "train_speed(iter/s)": 0.01691 }, { "epoch": 2.8501362397820165, "grad_norm": 0.29711443185806274, "learning_rate": 8.52122630230531e-06, "loss": 0.3964498996734619, "memory(GiB)": 88.68, "step": 915, "token_acc": 0.8654651025002963, "train_speed(iter/s)": 0.016909 }, { "epoch": 2.865706500583885, "grad_norm": 0.3235342502593994, "learning_rate": 8.419369219449487e-06, "loss": 0.4111301422119141, "memory(GiB)": 88.68, "step": 920, "token_acc": 0.854048169486135, "train_speed(iter/s)": 0.016905 }, { "epoch": 2.8812767613857533, "grad_norm": 0.2915455400943756, "learning_rate": 8.317680054267834e-06, "loss": 0.3976348161697388, "memory(GiB)": 88.68, "step": 925, "token_acc": 0.8646497658549133, "train_speed(iter/s)": 0.016911 }, { "epoch": 2.8968470221876217, "grad_norm": 0.28624093532562256, "learning_rate": 8.216169609667854e-06, "loss": 0.3987285137176514, "memory(GiB)": 88.68, "step": 930, "token_acc": 0.8666723028265342, "train_speed(iter/s)": 0.016905 }, { "epoch": 2.91241728298949, "grad_norm": 0.2571397125720978, "learning_rate": 8.114848669570733e-06, "loss": 0.4107855796813965, "memory(GiB)": 88.68, "step": 935, "token_acc": 0.8626929739970473, "train_speed(iter/s)": 0.016902 }, { "epoch": 2.9279875437913585, "grad_norm": 0.266347736120224, "learning_rate": 8.013727997765724e-06, "loss": 0.39544177055358887, "memory(GiB)": 88.68, "step": 940, "token_acc": 0.8621273056885385, "train_speed(iter/s)": 0.016899 }, { "epoch": 2.943557804593227, "grad_norm": 0.26831647753715515, "learning_rate": 7.91281833676665e-06, "loss": 0.3936420202255249, "memory(GiB)": 88.68, "step": 945, "token_acc": 0.856834701996057, "train_speed(iter/s)": 0.016903 }, { "epoch": 2.9591280653950953, "grad_norm": 0.2694167494773865, "learning_rate": 7.812130406670699e-06, "loss": 0.4112107276916504, "memory(GiB)": 88.68, "step": 950, "token_acc": 0.8646875190971869, "train_speed(iter/s)": 0.016901 }, { "epoch": 2.9746983261969637, "grad_norm": 0.29274359345436096, "learning_rate": 7.71167490401956e-06, "loss": 0.39890074729919434, "memory(GiB)": 88.68, "step": 955, "token_acc": 0.8708542010096479, "train_speed(iter/s)": 0.016909 }, { "epoch": 2.990268586998832, "grad_norm": 0.2841947674751282, "learning_rate": 7.6114625006630885e-06, "loss": 0.3915250301361084, "memory(GiB)": 88.68, "step": 960, "token_acc": 0.8685056019696435, "train_speed(iter/s)": 0.016913 }, { "epoch": 3.0062281043207473, "grad_norm": 0.36610281467437744, "learning_rate": 7.511503842625576e-06, "loss": 0.46400060653686526, "memory(GiB)": 88.68, "step": 965, "token_acc": 0.8688302643312309, "train_speed(iter/s)": 0.016913 }, { "epoch": 3.0217983651226157, "grad_norm": 0.3299410939216614, "learning_rate": 7.411809548974792e-06, "loss": 0.37694129943847654, "memory(GiB)": 88.68, "step": 970, "token_acc": 0.8694388579532905, "train_speed(iter/s)": 0.016915 }, { "epoch": 3.037368625924484, "grad_norm": 0.3347257375717163, "learning_rate": 7.312390210693863e-06, "loss": 0.36944580078125, "memory(GiB)": 88.68, "step": 975, "token_acc": 0.878339156936005, "train_speed(iter/s)": 0.016917 }, { "epoch": 3.0529388867263525, "grad_norm": 0.2957051992416382, "learning_rate": 7.213256389556125e-06, "loss": 0.36371331214904784, "memory(GiB)": 88.68, "step": 980, "token_acc": 0.8717935493188025, "train_speed(iter/s)": 0.016917 }, { "epoch": 3.068509147528221, "grad_norm": 0.29925552010536194, "learning_rate": 7.114418617003137e-06, "loss": 0.37583396434783933, "memory(GiB)": 88.68, "step": 985, "token_acc": 0.8665699665764031, "train_speed(iter/s)": 0.016916 }, { "epoch": 3.0840794083300898, "grad_norm": 0.2758331298828125, "learning_rate": 7.015887393025847e-06, "loss": 0.3523877620697021, "memory(GiB)": 88.68, "step": 990, "token_acc": 0.8778131746205382, "train_speed(iter/s)": 0.016922 }, { "epoch": 3.099649669131958, "grad_norm": 0.2721407413482666, "learning_rate": 6.917673185049138e-06, "loss": 0.36904470920562743, "memory(GiB)": 88.68, "step": 995, "token_acc": 0.8740084948468113, "train_speed(iter/s)": 0.016923 }, { "epoch": 3.1152199299338266, "grad_norm": 0.27260729670524597, "learning_rate": 6.819786426819825e-06, "loss": 0.37018847465515137, "memory(GiB)": 88.68, "step": 1000, "token_acc": 0.8693169060405699, "train_speed(iter/s)": 0.016924 }, { "epoch": 3.1152199299338266, "eval_loss": 0.4171549081802368, "eval_runtime": 48.9505, "eval_samples_per_second": 8.478, "eval_steps_per_second": 1.062, "eval_token_acc": 0.8533666490046466, "step": 1000 }, { "epoch": 3.130790190735695, "grad_norm": 0.3021749258041382, "learning_rate": 6.722237517298232e-06, "loss": 0.3654526948928833, "memory(GiB)": 88.68, "step": 1005, "token_acc": 0.8657461259026207, "train_speed(iter/s)": 0.016908 }, { "epoch": 3.1463604515375634, "grad_norm": 0.2921292781829834, "learning_rate": 6.625036819553467e-06, "loss": 0.36435742378234864, "memory(GiB)": 88.68, "step": 1010, "token_acc": 0.8771943849326895, "train_speed(iter/s)": 0.016915 }, { "epoch": 3.161930712339432, "grad_norm": 0.2832075357437134, "learning_rate": 6.528194659662488e-06, "loss": 0.3685340881347656, "memory(GiB)": 88.68, "step": 1015, "token_acc": 0.8670015373143141, "train_speed(iter/s)": 0.016916 }, { "epoch": 3.1775009731413, "grad_norm": 0.2529529631137848, "learning_rate": 6.431721325613138e-06, "loss": 0.3727813720703125, "memory(GiB)": 88.68, "step": 1020, "token_acc": 0.8713205243098983, "train_speed(iter/s)": 0.016914 }, { "epoch": 3.1930712339431686, "grad_norm": 0.2848242521286011, "learning_rate": 6.335627066211196e-06, "loss": 0.3792572021484375, "memory(GiB)": 88.68, "step": 1025, "token_acc": 0.86777010721209, "train_speed(iter/s)": 0.01691 }, { "epoch": 3.208641494745037, "grad_norm": 0.2933395802974701, "learning_rate": 6.239922089991597e-06, "loss": 0.36856865882873535, "memory(GiB)": 88.68, "step": 1030, "token_acc": 0.8724858299595142, "train_speed(iter/s)": 0.01691 }, { "epoch": 3.2242117555469054, "grad_norm": 0.2859324514865875, "learning_rate": 6.144616564133927e-06, "loss": 0.36645007133483887, "memory(GiB)": 88.68, "step": 1035, "token_acc": 0.8778270121878196, "train_speed(iter/s)": 0.016913 }, { "epoch": 3.239782016348774, "grad_norm": 0.244709774851799, "learning_rate": 6.049720613382332e-06, "loss": 0.3758384704589844, "memory(GiB)": 88.68, "step": 1040, "token_acc": 0.8745641464981795, "train_speed(iter/s)": 0.016907 }, { "epoch": 3.2553522771506422, "grad_norm": 0.28737974166870117, "learning_rate": 5.955244318969913e-06, "loss": 0.37161884307861326, "memory(GiB)": 88.68, "step": 1045, "token_acc": 0.8789312516614523, "train_speed(iter/s)": 0.016907 }, { "epoch": 3.2709225379525106, "grad_norm": 0.30621790885925293, "learning_rate": 5.8611977175477355e-06, "loss": 0.36142144203186033, "memory(GiB)": 88.68, "step": 1050, "token_acc": 0.8809154383242824, "train_speed(iter/s)": 0.016914 }, { "epoch": 3.286492798754379, "grad_norm": 0.2538982033729553, "learning_rate": 5.767590800118621e-06, "loss": 0.3751323699951172, "memory(GiB)": 88.68, "step": 1055, "token_acc": 0.8687682142777692, "train_speed(iter/s)": 0.01691 }, { "epoch": 3.3020630595562475, "grad_norm": 0.2651020586490631, "learning_rate": 5.674433510975725e-06, "loss": 0.3630067825317383, "memory(GiB)": 88.68, "step": 1060, "token_acc": 0.8768329269920676, "train_speed(iter/s)": 0.016915 }, { "epoch": 3.317633320358116, "grad_norm": 0.28797048330307007, "learning_rate": 5.581735746646134e-06, "loss": 0.38075408935546873, "memory(GiB)": 88.68, "step": 1065, "token_acc": 0.8739769760426389, "train_speed(iter/s)": 0.016913 }, { "epoch": 3.3332035811599843, "grad_norm": 0.2768980860710144, "learning_rate": 5.4895073548394926e-06, "loss": 0.37256827354431155, "memory(GiB)": 88.68, "step": 1070, "token_acc": 0.8735899236301825, "train_speed(iter/s)": 0.016917 }, { "epoch": 3.3487738419618527, "grad_norm": 0.29908499121665955, "learning_rate": 5.397758133401849e-06, "loss": 0.37295982837677, "memory(GiB)": 88.68, "step": 1075, "token_acc": 0.8778628774722752, "train_speed(iter/s)": 0.016923 }, { "epoch": 3.364344102763721, "grad_norm": 0.31482502818107605, "learning_rate": 5.306497829274785e-06, "loss": 0.37373597621917726, "memory(GiB)": 88.68, "step": 1080, "token_acc": 0.8745586160071005, "train_speed(iter/s)": 0.016928 }, { "epoch": 3.3799143635655895, "grad_norm": 0.2600855827331543, "learning_rate": 5.215736137459932e-06, "loss": 0.36784698963165285, "memory(GiB)": 88.68, "step": 1085, "token_acc": 0.8707799198767635, "train_speed(iter/s)": 0.016929 }, { "epoch": 3.3954846243674583, "grad_norm": 0.29292717576026917, "learning_rate": 5.12548269998906e-06, "loss": 0.36927309036254885, "memory(GiB)": 88.68, "step": 1090, "token_acc": 0.88031051846326, "train_speed(iter/s)": 0.016927 }, { "epoch": 3.4110548851693268, "grad_norm": 0.2715342342853546, "learning_rate": 5.035747104899738e-06, "loss": 0.37144927978515624, "memory(GiB)": 88.68, "step": 1095, "token_acc": 0.8731032759416966, "train_speed(iter/s)": 0.01692 }, { "epoch": 3.426625145971195, "grad_norm": 0.2806420922279358, "learning_rate": 4.946538885216759e-06, "loss": 0.3772748470306396, "memory(GiB)": 88.68, "step": 1100, "token_acc": 0.8754309330659928, "train_speed(iter/s)": 0.016918 }, { "epoch": 3.426625145971195, "eval_loss": 0.41369661688804626, "eval_runtime": 48.7784, "eval_samples_per_second": 8.508, "eval_steps_per_second": 1.066, "eval_token_acc": 0.8540369662526263, "step": 1100 } ], "logging_steps": 5, "max_steps": 1605, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.5522242675121062e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }