| { |
| "best_metric": 1.6074212789535522, |
| "best_model_checkpoint": "./deepseek-MoE-lora/checkpoint-1000", |
| "epoch": 1.3736263736263736, |
| "eval_steps": 100, |
| "global_step": 1000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.013736263736263736, |
| "grad_norm": 29.239500045776367, |
| "learning_rate": 1.1441647597254004e-06, |
| "loss": 9.2221, |
| "mean_token_accuracy": 0.12948232870548965, |
| "num_tokens": 121829.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.027472527472527472, |
| "grad_norm": 30.485370635986328, |
| "learning_rate": 2.288329519450801e-06, |
| "loss": 8.9209, |
| "mean_token_accuracy": 0.13155823573470116, |
| "num_tokens": 232251.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.04120879120879121, |
| "grad_norm": 26.796140670776367, |
| "learning_rate": 3.4324942791762018e-06, |
| "loss": 8.6267, |
| "mean_token_accuracy": 0.1354693438857794, |
| "num_tokens": 349170.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.054945054945054944, |
| "grad_norm": 26.021697998046875, |
| "learning_rate": 4.576659038901602e-06, |
| "loss": 8.4408, |
| "mean_token_accuracy": 0.13846412636339664, |
| "num_tokens": 459909.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.06868131868131869, |
| "grad_norm": 29.667829513549805, |
| "learning_rate": 5.720823798627003e-06, |
| "loss": 8.2092, |
| "mean_token_accuracy": 0.13972207996994257, |
| "num_tokens": 570165.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.08241758241758242, |
| "grad_norm": 21.387392044067383, |
| "learning_rate": 6.8649885583524035e-06, |
| "loss": 7.3725, |
| "mean_token_accuracy": 0.15276159830391406, |
| "num_tokens": 684371.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.09615384615384616, |
| "grad_norm": 22.36768913269043, |
| "learning_rate": 8.009153318077805e-06, |
| "loss": 6.6599, |
| "mean_token_accuracy": 0.16514867953956128, |
| "num_tokens": 799113.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.10989010989010989, |
| "grad_norm": 12.144390106201172, |
| "learning_rate": 9.153318077803204e-06, |
| "loss": 5.8577, |
| "mean_token_accuracy": 0.18936714343726635, |
| "num_tokens": 919909.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.12362637362637363, |
| "grad_norm": 5.9125447273254395, |
| "learning_rate": 1.0297482837528604e-05, |
| "loss": 5.3595, |
| "mean_token_accuracy": 0.209349187836051, |
| "num_tokens": 1032496.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.13736263736263737, |
| "grad_norm": 4.612669944763184, |
| "learning_rate": 1.1441647597254006e-05, |
| "loss": 5.1678, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.13736263736263737, |
| "eval_loss": 4.984684944152832, |
| "eval_mean_token_accuracy": 0.2370274927881029, |
| "eval_num_tokens": 1141793.0, |
| "eval_runtime": 311.0531, |
| "eval_samples_per_second": 16.64, |
| "eval_steps_per_second": 1.042, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.1510989010989011, |
| "grad_norm": 4.400078296661377, |
| "learning_rate": 1.2585812356979407e-05, |
| "loss": 4.8803, |
| "mean_token_accuracy": 0.23492281436920165, |
| "num_tokens": 1254329.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.16483516483516483, |
| "grad_norm": 4.174052715301514, |
| "learning_rate": 1.3729977116704807e-05, |
| "loss": 4.5865, |
| "mean_token_accuracy": 0.2669687133282423, |
| "num_tokens": 1373912.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.17857142857142858, |
| "grad_norm": 4.239316463470459, |
| "learning_rate": 1.4874141876430206e-05, |
| "loss": 4.3418, |
| "mean_token_accuracy": 0.29036078676581384, |
| "num_tokens": 1491018.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.19230769230769232, |
| "grad_norm": 3.802030563354492, |
| "learning_rate": 1.601830663615561e-05, |
| "loss": 4.0019, |
| "mean_token_accuracy": 0.3230995647609234, |
| "num_tokens": 1604500.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.20604395604395603, |
| "grad_norm": 3.057298183441162, |
| "learning_rate": 1.716247139588101e-05, |
| "loss": 3.7445, |
| "mean_token_accuracy": 0.3446956820785999, |
| "num_tokens": 1723869.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.21978021978021978, |
| "grad_norm": 2.6854259967803955, |
| "learning_rate": 1.8306636155606407e-05, |
| "loss": 3.4938, |
| "mean_token_accuracy": 0.3709439463913441, |
| "num_tokens": 1840593.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.23351648351648352, |
| "grad_norm": 2.322840452194214, |
| "learning_rate": 1.945080091533181e-05, |
| "loss": 3.2394, |
| "mean_token_accuracy": 0.3960001617670059, |
| "num_tokens": 1948634.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.24725274725274726, |
| "grad_norm": 1.996971607208252, |
| "learning_rate": 2.0594965675057208e-05, |
| "loss": 3.0746, |
| "mean_token_accuracy": 0.41375042870640755, |
| "num_tokens": 2058974.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.260989010989011, |
| "grad_norm": 1.6177858114242554, |
| "learning_rate": 2.173913043478261e-05, |
| "loss": 2.8488, |
| "mean_token_accuracy": 0.4406682662665844, |
| "num_tokens": 2180905.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.27472527472527475, |
| "grad_norm": 1.3668256998062134, |
| "learning_rate": 2.2883295194508012e-05, |
| "loss": 2.7354, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.27472527472527475, |
| "eval_loss": 2.6662864685058594, |
| "eval_mean_token_accuracy": 0.4655627465726417, |
| "eval_num_tokens": 2297024.0, |
| "eval_runtime": 310.9656, |
| "eval_samples_per_second": 16.645, |
| "eval_steps_per_second": 1.042, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.28846153846153844, |
| "grad_norm": 1.2411565780639648, |
| "learning_rate": 2.402745995423341e-05, |
| "loss": 2.5765, |
| "mean_token_accuracy": 0.4659102514386177, |
| "num_tokens": 2415020.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.3021978021978022, |
| "grad_norm": 1.1622848510742188, |
| "learning_rate": 2.5171624713958813e-05, |
| "loss": 2.522, |
| "mean_token_accuracy": 0.4841405004262924, |
| "num_tokens": 2530662.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.3159340659340659, |
| "grad_norm": 1.0268805027008057, |
| "learning_rate": 2.6315789473684212e-05, |
| "loss": 2.4053, |
| "mean_token_accuracy": 0.49754476696252825, |
| "num_tokens": 2644661.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.32967032967032966, |
| "grad_norm": 1.050316572189331, |
| "learning_rate": 2.7459954233409614e-05, |
| "loss": 2.3136, |
| "mean_token_accuracy": 0.5062366120517254, |
| "num_tokens": 2764206.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.3434065934065934, |
| "grad_norm": 0.974026083946228, |
| "learning_rate": 2.8604118993135016e-05, |
| "loss": 2.2151, |
| "mean_token_accuracy": 0.5198402456939221, |
| "num_tokens": 2879504.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.35714285714285715, |
| "grad_norm": 0.95684415102005, |
| "learning_rate": 2.974828375286041e-05, |
| "loss": 2.192, |
| "mean_token_accuracy": 0.523027028143406, |
| "num_tokens": 2994634.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.3708791208791209, |
| "grad_norm": 1.0790362358093262, |
| "learning_rate": 3.089244851258582e-05, |
| "loss": 2.0856, |
| "mean_token_accuracy": 0.537685776501894, |
| "num_tokens": 3110559.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.38461538461538464, |
| "grad_norm": 0.8086666464805603, |
| "learning_rate": 3.203661327231122e-05, |
| "loss": 2.0586, |
| "mean_token_accuracy": 0.54344697073102, |
| "num_tokens": 3224576.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.3983516483516483, |
| "grad_norm": 0.8853408694267273, |
| "learning_rate": 3.3180778032036615e-05, |
| "loss": 1.9996, |
| "mean_token_accuracy": 0.5515147194266319, |
| "num_tokens": 3341645.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.41208791208791207, |
| "grad_norm": 0.9164928793907166, |
| "learning_rate": 3.432494279176202e-05, |
| "loss": 1.98, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.41208791208791207, |
| "eval_loss": 1.9627922773361206, |
| "eval_mean_token_accuracy": 0.5530768693597229, |
| "eval_num_tokens": 3452305.0, |
| "eval_runtime": 326.8485, |
| "eval_samples_per_second": 15.836, |
| "eval_steps_per_second": 0.991, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.4258241758241758, |
| "grad_norm": 0.91264408826828, |
| "learning_rate": 3.546910755148742e-05, |
| "loss": 1.9447, |
| "mean_token_accuracy": 0.5537945911288261, |
| "num_tokens": 3569623.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.43956043956043955, |
| "grad_norm": 0.77753084897995, |
| "learning_rate": 3.6613272311212814e-05, |
| "loss": 1.8766, |
| "mean_token_accuracy": 0.5657263264060021, |
| "num_tokens": 3686940.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.4532967032967033, |
| "grad_norm": 1.0137959718704224, |
| "learning_rate": 3.7757437070938216e-05, |
| "loss": 1.8847, |
| "mean_token_accuracy": 0.5627776235342026, |
| "num_tokens": 3802302.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.46703296703296704, |
| "grad_norm": 0.8629505038261414, |
| "learning_rate": 3.890160183066362e-05, |
| "loss": 1.8672, |
| "mean_token_accuracy": 0.5701652288436889, |
| "num_tokens": 3915449.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.4807692307692308, |
| "grad_norm": 0.916632354259491, |
| "learning_rate": 4.0045766590389014e-05, |
| "loss": 1.8513, |
| "mean_token_accuracy": 0.5717276155948638, |
| "num_tokens": 4032982.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.4945054945054945, |
| "grad_norm": 0.9153818488121033, |
| "learning_rate": 4.1189931350114416e-05, |
| "loss": 1.8632, |
| "mean_token_accuracy": 0.5680810511112213, |
| "num_tokens": 4144430.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.5082417582417582, |
| "grad_norm": 0.9300126433372498, |
| "learning_rate": 4.233409610983982e-05, |
| "loss": 1.7932, |
| "mean_token_accuracy": 0.5819852232933045, |
| "num_tokens": 4255862.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.521978021978022, |
| "grad_norm": 1.1194523572921753, |
| "learning_rate": 4.347826086956522e-05, |
| "loss": 1.8185, |
| "mean_token_accuracy": 0.5755217462778092, |
| "num_tokens": 4364858.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.5357142857142857, |
| "grad_norm": 0.8476864099502563, |
| "learning_rate": 4.462242562929062e-05, |
| "loss": 1.7662, |
| "mean_token_accuracy": 0.5817034646868706, |
| "num_tokens": 4478176.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.5494505494505495, |
| "grad_norm": 0.9283179640769958, |
| "learning_rate": 4.5766590389016025e-05, |
| "loss": 1.801, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.5494505494505495, |
| "eval_loss": 1.7755845785140991, |
| "eval_mean_token_accuracy": 0.5817039539048701, |
| "eval_num_tokens": 4596634.0, |
| "eval_runtime": 316.8533, |
| "eval_samples_per_second": 16.336, |
| "eval_steps_per_second": 1.023, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.5631868131868132, |
| "grad_norm": 1.0799726247787476, |
| "learning_rate": 4.691075514874142e-05, |
| "loss": 1.7898, |
| "mean_token_accuracy": 0.5777772672474384, |
| "num_tokens": 4708369.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.5769230769230769, |
| "grad_norm": 0.7919142246246338, |
| "learning_rate": 4.805491990846682e-05, |
| "loss": 1.7842, |
| "mean_token_accuracy": 0.5787179872393609, |
| "num_tokens": 4827961.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.5906593406593407, |
| "grad_norm": 0.9103861451148987, |
| "learning_rate": 4.9199084668192224e-05, |
| "loss": 1.7741, |
| "mean_token_accuracy": 0.5839256420731544, |
| "num_tokens": 4938283.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.6043956043956044, |
| "grad_norm": 0.864320695400238, |
| "learning_rate": 4.998937902711889e-05, |
| "loss": 1.7477, |
| "mean_token_accuracy": 0.5855789959430695, |
| "num_tokens": 5049612.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.6181318681318682, |
| "grad_norm": 1.1230181455612183, |
| "learning_rate": 4.995397578418183e-05, |
| "loss": 1.7505, |
| "mean_token_accuracy": 0.5864449262619018, |
| "num_tokens": 5164775.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.6318681318681318, |
| "grad_norm": 0.9053744673728943, |
| "learning_rate": 4.991857254124478e-05, |
| "loss": 1.7381, |
| "mean_token_accuracy": 0.5900515034794808, |
| "num_tokens": 5278039.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.6456043956043956, |
| "grad_norm": 1.0245819091796875, |
| "learning_rate": 4.9883169298307723e-05, |
| "loss": 1.7427, |
| "mean_token_accuracy": 0.5880876764655113, |
| "num_tokens": 5390124.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.6593406593406593, |
| "grad_norm": 0.8946035504341125, |
| "learning_rate": 4.9847766055370674e-05, |
| "loss": 1.71, |
| "mean_token_accuracy": 0.5933957740664482, |
| "num_tokens": 5497492.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.6730769230769231, |
| "grad_norm": 0.8355729579925537, |
| "learning_rate": 4.9812362812433624e-05, |
| "loss": 1.6985, |
| "mean_token_accuracy": 0.5948651686310769, |
| "num_tokens": 5613483.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.6868131868131868, |
| "grad_norm": 0.9272288680076599, |
| "learning_rate": 4.977695956949657e-05, |
| "loss": 1.7286, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.6868131868131868, |
| "eval_loss": 1.7027286291122437, |
| "eval_mean_token_accuracy": 0.593273350302084, |
| "eval_num_tokens": 5729153.0, |
| "eval_runtime": 311.0607, |
| "eval_samples_per_second": 16.64, |
| "eval_steps_per_second": 1.042, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.7005494505494505, |
| "grad_norm": 0.9919455051422119, |
| "learning_rate": 4.974155632655951e-05, |
| "loss": 1.7034, |
| "mean_token_accuracy": 0.5900716975331306, |
| "num_tokens": 5840054.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.7142857142857143, |
| "grad_norm": 0.9207066893577576, |
| "learning_rate": 4.970615308362247e-05, |
| "loss": 1.6816, |
| "mean_token_accuracy": 0.5953675732016563, |
| "num_tokens": 5951050.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.728021978021978, |
| "grad_norm": 1.035968542098999, |
| "learning_rate": 4.967074984068541e-05, |
| "loss": 1.6997, |
| "mean_token_accuracy": 0.5936009138822556, |
| "num_tokens": 6059659.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.7417582417582418, |
| "grad_norm": 0.9535728096961975, |
| "learning_rate": 4.9635346597748354e-05, |
| "loss": 1.6733, |
| "mean_token_accuracy": 0.597626781463623, |
| "num_tokens": 6166152.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.7554945054945055, |
| "grad_norm": 0.9060805439949036, |
| "learning_rate": 4.9599943354811304e-05, |
| "loss": 1.6729, |
| "mean_token_accuracy": 0.59917561262846, |
| "num_tokens": 6281975.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.7692307692307693, |
| "grad_norm": 0.9403394460678101, |
| "learning_rate": 4.9564540111874254e-05, |
| "loss": 1.6765, |
| "mean_token_accuracy": 0.5959734156727791, |
| "num_tokens": 6393517.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.782967032967033, |
| "grad_norm": 1.018798589706421, |
| "learning_rate": 4.95291368689372e-05, |
| "loss": 1.6512, |
| "mean_token_accuracy": 0.6024029836058616, |
| "num_tokens": 6512243.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.7967032967032966, |
| "grad_norm": 0.9490474462509155, |
| "learning_rate": 4.949373362600014e-05, |
| "loss": 1.7011, |
| "mean_token_accuracy": 0.592784532904625, |
| "num_tokens": 6627920.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.8104395604395604, |
| "grad_norm": 0.9688321948051453, |
| "learning_rate": 4.945833038306309e-05, |
| "loss": 1.6464, |
| "mean_token_accuracy": 0.6033695384860038, |
| "num_tokens": 6739972.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.8241758241758241, |
| "grad_norm": 1.112197756767273, |
| "learning_rate": 4.9422927140126034e-05, |
| "loss": 1.6757, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.8241758241758241, |
| "eval_loss": 1.6667677164077759, |
| "eval_mean_token_accuracy": 0.5988459940309878, |
| "eval_num_tokens": 6859022.0, |
| "eval_runtime": 311.0117, |
| "eval_samples_per_second": 16.642, |
| "eval_steps_per_second": 1.042, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.8379120879120879, |
| "grad_norm": 0.8909268379211426, |
| "learning_rate": 4.9387523897188984e-05, |
| "loss": 1.6889, |
| "mean_token_accuracy": 0.5944475166499614, |
| "num_tokens": 6970961.0, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.8516483516483516, |
| "grad_norm": 0.9639058709144592, |
| "learning_rate": 4.9352120654251934e-05, |
| "loss": 1.6563, |
| "mean_token_accuracy": 0.6006275966763497, |
| "num_tokens": 7081524.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.8653846153846154, |
| "grad_norm": 1.0538073778152466, |
| "learning_rate": 4.931671741131488e-05, |
| "loss": 1.645, |
| "mean_token_accuracy": 0.6039176091551781, |
| "num_tokens": 7191486.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.8791208791208791, |
| "grad_norm": 1.0572949647903442, |
| "learning_rate": 4.928131416837782e-05, |
| "loss": 1.6553, |
| "mean_token_accuracy": 0.6014241844415664, |
| "num_tokens": 7306262.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.8928571428571429, |
| "grad_norm": 1.0915870666503906, |
| "learning_rate": 4.924591092544078e-05, |
| "loss": 1.6737, |
| "mean_token_accuracy": 0.5958378210663795, |
| "num_tokens": 7418990.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.9065934065934066, |
| "grad_norm": 0.9485398530960083, |
| "learning_rate": 4.921050768250372e-05, |
| "loss": 1.6078, |
| "mean_token_accuracy": 0.6094794437289238, |
| "num_tokens": 7537671.0, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.9203296703296703, |
| "grad_norm": 1.1498547792434692, |
| "learning_rate": 4.9175104439566664e-05, |
| "loss": 1.6356, |
| "mean_token_accuracy": 0.6057997912168502, |
| "num_tokens": 7651191.0, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.9340659340659341, |
| "grad_norm": 1.0218619108200073, |
| "learning_rate": 4.9139701196629614e-05, |
| "loss": 1.6491, |
| "mean_token_accuracy": 0.6027492165565491, |
| "num_tokens": 7769287.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.9478021978021978, |
| "grad_norm": 1.183112621307373, |
| "learning_rate": 4.9104297953692564e-05, |
| "loss": 1.617, |
| "mean_token_accuracy": 0.6064566567540168, |
| "num_tokens": 7886626.0, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.9615384615384616, |
| "grad_norm": 1.1547908782958984, |
| "learning_rate": 4.906889471075551e-05, |
| "loss": 1.6585, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.9615384615384616, |
| "eval_loss": 1.6455810070037842, |
| "eval_mean_token_accuracy": 0.6022681238842599, |
| "eval_num_tokens": 7995973.0, |
| "eval_runtime": 310.453, |
| "eval_samples_per_second": 16.672, |
| "eval_steps_per_second": 1.044, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.9752747252747253, |
| "grad_norm": 0.961618959903717, |
| "learning_rate": 4.903349146781845e-05, |
| "loss": 1.6406, |
| "mean_token_accuracy": 0.6022139228880405, |
| "num_tokens": 8109073.0, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.989010989010989, |
| "grad_norm": 1.27222740650177, |
| "learning_rate": 4.89980882248814e-05, |
| "loss": 1.6678, |
| "mean_token_accuracy": 0.5999792292714119, |
| "num_tokens": 8222282.0, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.0027472527472527, |
| "grad_norm": 1.0560694932937622, |
| "learning_rate": 4.896268498194435e-05, |
| "loss": 1.638, |
| "mean_token_accuracy": 0.6034953713417053, |
| "num_tokens": 8335446.0, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.0164835164835164, |
| "grad_norm": 1.034684419631958, |
| "learning_rate": 4.8927281739007295e-05, |
| "loss": 1.621, |
| "mean_token_accuracy": 0.6036525964736938, |
| "num_tokens": 8450623.0, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.0302197802197801, |
| "grad_norm": 1.0682315826416016, |
| "learning_rate": 4.8891878496070245e-05, |
| "loss": 1.6376, |
| "mean_token_accuracy": 0.6037934944033623, |
| "num_tokens": 8561023.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.043956043956044, |
| "grad_norm": 1.2298721075057983, |
| "learning_rate": 4.885647525313319e-05, |
| "loss": 1.6339, |
| "mean_token_accuracy": 0.6060884281992912, |
| "num_tokens": 8675222.0, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.0576923076923077, |
| "grad_norm": 1.0562454462051392, |
| "learning_rate": 4.882107201019613e-05, |
| "loss": 1.6333, |
| "mean_token_accuracy": 0.6048480987548828, |
| "num_tokens": 8784755.0, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.0714285714285714, |
| "grad_norm": 0.9854257702827454, |
| "learning_rate": 4.878566876725909e-05, |
| "loss": 1.5999, |
| "mean_token_accuracy": 0.6091530665755271, |
| "num_tokens": 8900580.0, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.085164835164835, |
| "grad_norm": 1.1997156143188477, |
| "learning_rate": 4.875026552432203e-05, |
| "loss": 1.6677, |
| "mean_token_accuracy": 0.5998320803046227, |
| "num_tokens": 9016189.0, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.098901098901099, |
| "grad_norm": 1.0334490537643433, |
| "learning_rate": 4.8714862281384975e-05, |
| "loss": 1.6272, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.098901098901099, |
| "eval_loss": 1.6325246095657349, |
| "eval_mean_token_accuracy": 0.6042487479654359, |
| "eval_num_tokens": 9132964.0, |
| "eval_runtime": 310.2316, |
| "eval_samples_per_second": 16.684, |
| "eval_steps_per_second": 1.044, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.1126373626373627, |
| "grad_norm": 1.03342604637146, |
| "learning_rate": 4.8679459038447925e-05, |
| "loss": 1.6429, |
| "mean_token_accuracy": 0.6036744929850102, |
| "num_tokens": 9249801.0, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.1263736263736264, |
| "grad_norm": 1.1043306589126587, |
| "learning_rate": 4.8644055795510875e-05, |
| "loss": 1.6237, |
| "mean_token_accuracy": 0.6055251255631446, |
| "num_tokens": 9365589.0, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.14010989010989, |
| "grad_norm": 1.131428837776184, |
| "learning_rate": 4.860865255257382e-05, |
| "loss": 1.6363, |
| "mean_token_accuracy": 0.6017774865031242, |
| "num_tokens": 9475982.0, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.1538461538461537, |
| "grad_norm": 0.9910945892333984, |
| "learning_rate": 4.857324930963676e-05, |
| "loss": 1.6104, |
| "mean_token_accuracy": 0.6087021440267563, |
| "num_tokens": 9592179.0, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.1675824175824177, |
| "grad_norm": 1.1637762784957886, |
| "learning_rate": 4.853784606669971e-05, |
| "loss": 1.642, |
| "mean_token_accuracy": 0.6020917922258378, |
| "num_tokens": 9708684.0, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.1813186813186813, |
| "grad_norm": 1.1667078733444214, |
| "learning_rate": 4.850244282376266e-05, |
| "loss": 1.6368, |
| "mean_token_accuracy": 0.6061318039894104, |
| "num_tokens": 9822155.0, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.195054945054945, |
| "grad_norm": 1.1195344924926758, |
| "learning_rate": 4.8467039580825605e-05, |
| "loss": 1.5997, |
| "mean_token_accuracy": 0.6092845112085342, |
| "num_tokens": 9935041.0, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.2087912087912087, |
| "grad_norm": 1.0910887718200684, |
| "learning_rate": 4.8431636337888555e-05, |
| "loss": 1.6381, |
| "mean_token_accuracy": 0.6027830302715301, |
| "num_tokens": 10051143.0, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.2225274725274726, |
| "grad_norm": 1.171608805656433, |
| "learning_rate": 4.83962330949515e-05, |
| "loss": 1.604, |
| "mean_token_accuracy": 0.6083497762680053, |
| "num_tokens": 10164538.0, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.2362637362637363, |
| "grad_norm": 1.1128859519958496, |
| "learning_rate": 4.836082985201444e-05, |
| "loss": 1.6191, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.2362637362637363, |
| "eval_loss": 1.6174854040145874, |
| "eval_mean_token_accuracy": 0.605991005713557, |
| "eval_num_tokens": 10279282.0, |
| "eval_runtime": 310.7449, |
| "eval_samples_per_second": 16.657, |
| "eval_steps_per_second": 1.043, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.25, |
| "grad_norm": 1.0883232355117798, |
| "learning_rate": 4.83254266090774e-05, |
| "loss": 1.5946, |
| "mean_token_accuracy": 0.6100020207464695, |
| "num_tokens": 10387510.0, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.2637362637362637, |
| "grad_norm": 1.3262733221054077, |
| "learning_rate": 4.829002336614034e-05, |
| "loss": 1.6276, |
| "mean_token_accuracy": 0.606501379609108, |
| "num_tokens": 10499682.0, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.2774725274725274, |
| "grad_norm": 1.113751769065857, |
| "learning_rate": 4.8254620123203285e-05, |
| "loss": 1.5819, |
| "mean_token_accuracy": 0.6106113165616989, |
| "num_tokens": 10618406.0, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.2912087912087913, |
| "grad_norm": 1.1370590925216675, |
| "learning_rate": 4.8219216880266235e-05, |
| "loss": 1.6192, |
| "mean_token_accuracy": 0.6069919407367707, |
| "num_tokens": 10737112.0, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.304945054945055, |
| "grad_norm": 1.2001625299453735, |
| "learning_rate": 4.8183813637329185e-05, |
| "loss": 1.5989, |
| "mean_token_accuracy": 0.6121641129255295, |
| "num_tokens": 10855000.0, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.3186813186813187, |
| "grad_norm": 1.2227309942245483, |
| "learning_rate": 4.814841039439213e-05, |
| "loss": 1.6093, |
| "mean_token_accuracy": 0.6081742540001869, |
| "num_tokens": 10969157.0, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.3324175824175823, |
| "grad_norm": 1.0985106229782104, |
| "learning_rate": 4.811300715145507e-05, |
| "loss": 1.6226, |
| "mean_token_accuracy": 0.6075394690036774, |
| "num_tokens": 11084095.0, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.3461538461538463, |
| "grad_norm": 1.1740081310272217, |
| "learning_rate": 4.807760390851802e-05, |
| "loss": 1.5927, |
| "mean_token_accuracy": 0.6087717518210412, |
| "num_tokens": 11200093.0, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.35989010989011, |
| "grad_norm": 1.2875778675079346, |
| "learning_rate": 4.804220066558097e-05, |
| "loss": 1.6199, |
| "mean_token_accuracy": 0.6059487432241439, |
| "num_tokens": 11310360.0, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.3736263736263736, |
| "grad_norm": 1.12953519821167, |
| "learning_rate": 4.8006797422643916e-05, |
| "loss": 1.5867, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.3736263736263736, |
| "eval_loss": 1.6074212789535522, |
| "eval_mean_token_accuracy": 0.6077834514924038, |
| "eval_num_tokens": 11427781.0, |
| "eval_runtime": 309.2704, |
| "eval_samples_per_second": 16.736, |
| "eval_steps_per_second": 1.048, |
| "step": 1000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 14560, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 20, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.451849537677951e+18, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|