| { |
| "best_global_step": 1275, |
| "best_metric": 0.00177309, |
| "best_model_checkpoint": "/ext_hdd2/nhkoh/gelab-env/checkpoint/gui_exp/sft_448/v0-20260221_074940/checkpoint-1275", |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 1275, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0007847753580537571, |
| "grad_norm": 72.18675231933594, |
| "learning_rate": 1.5625e-07, |
| "loss": 2.0382871627807617, |
| "memory(GiB)": 68.03, |
| "step": 1, |
| "token_acc": 0.5797872340425532, |
| "train_speed(iter/s)": 0.024381 |
| }, |
| { |
| "epoch": 0.007847753580537572, |
| "grad_norm": 36.29729461669922, |
| "learning_rate": 1.5625e-06, |
| "loss": 1.7542770173814561, |
| "memory(GiB)": 78.33, |
| "step": 10, |
| "token_acc": 0.5938697318007663, |
| "train_speed(iter/s)": 0.075875 |
| }, |
| { |
| "epoch": 0.015695507161075144, |
| "grad_norm": 13.778337478637695, |
| "learning_rate": 3.125e-06, |
| "loss": 0.7909364223480224, |
| "memory(GiB)": 78.33, |
| "step": 20, |
| "token_acc": 0.7760314341846758, |
| "train_speed(iter/s)": 0.087711 |
| }, |
| { |
| "epoch": 0.023543260741612712, |
| "grad_norm": 27.69485855102539, |
| "learning_rate": 4.6875000000000004e-06, |
| "loss": 0.3084306240081787, |
| "memory(GiB)": 78.33, |
| "step": 30, |
| "token_acc": 0.911275415896488, |
| "train_speed(iter/s)": 0.09229 |
| }, |
| { |
| "epoch": 0.03139101432215029, |
| "grad_norm": 4.941364765167236, |
| "learning_rate": 6.25e-06, |
| "loss": 0.16059274673461915, |
| "memory(GiB)": 78.33, |
| "step": 40, |
| "token_acc": 0.944043321299639, |
| "train_speed(iter/s)": 0.094919 |
| }, |
| { |
| "epoch": 0.039238767902687856, |
| "grad_norm": 7.940003395080566, |
| "learning_rate": 7.8125e-06, |
| "loss": 0.1214432954788208, |
| "memory(GiB)": 78.33, |
| "step": 50, |
| "token_acc": 0.9553853086976115, |
| "train_speed(iter/s)": 0.096567 |
| }, |
| { |
| "epoch": 0.047086521483225424, |
| "grad_norm": 4.930227279663086, |
| "learning_rate": 9.375000000000001e-06, |
| "loss": 0.12964333295822145, |
| "memory(GiB)": 78.33, |
| "step": 60, |
| "token_acc": 0.9566682715454983, |
| "train_speed(iter/s)": 0.09759 |
| }, |
| { |
| "epoch": 0.054934275063763, |
| "grad_norm": 11.7340726852417, |
| "learning_rate": 9.999394317256736e-06, |
| "loss": 0.10636246204376221, |
| "memory(GiB)": 78.33, |
| "step": 70, |
| "token_acc": 0.9639681796911558, |
| "train_speed(iter/s)": 0.098428 |
| }, |
| { |
| "epoch": 0.06278202864430057, |
| "grad_norm": 3.2650105953216553, |
| "learning_rate": 9.995693454107632e-06, |
| "loss": 0.10599330663681031, |
| "memory(GiB)": 78.33, |
| "step": 80, |
| "token_acc": 0.9595461272816971, |
| "train_speed(iter/s)": 0.099074 |
| }, |
| { |
| "epoch": 0.07062978222483814, |
| "grad_norm": 3.0711424350738525, |
| "learning_rate": 9.988630705723449e-06, |
| "loss": 0.08438605070114136, |
| "memory(GiB)": 78.33, |
| "step": 90, |
| "token_acc": 0.9621403331650682, |
| "train_speed(iter/s)": 0.09951 |
| }, |
| { |
| "epoch": 0.07847753580537571, |
| "grad_norm": 4.047796726226807, |
| "learning_rate": 9.978210825027824e-06, |
| "loss": 0.07347342371940613, |
| "memory(GiB)": 78.33, |
| "step": 100, |
| "token_acc": 0.9733570159857904, |
| "train_speed(iter/s)": 0.099919 |
| }, |
| { |
| "epoch": 0.08632528938591329, |
| "grad_norm": 2.480022668838501, |
| "learning_rate": 9.964440824148982e-06, |
| "loss": 0.05940539240837097, |
| "memory(GiB)": 78.33, |
| "step": 110, |
| "token_acc": 0.9747619047619047, |
| "train_speed(iter/s)": 0.100252 |
| }, |
| { |
| "epoch": 0.09417304296645085, |
| "grad_norm": 7.3106865882873535, |
| "learning_rate": 9.94732996970087e-06, |
| "loss": 0.07703952789306641, |
| "memory(GiB)": 78.33, |
| "step": 120, |
| "token_acc": 0.9689497716894977, |
| "train_speed(iter/s)": 0.100517 |
| }, |
| { |
| "epoch": 0.10202079654698842, |
| "grad_norm": 3.5625627040863037, |
| "learning_rate": 9.926889776547134e-06, |
| "loss": 0.06253595352172851, |
| "memory(GiB)": 78.33, |
| "step": 130, |
| "token_acc": 0.9812667261373773, |
| "train_speed(iter/s)": 0.100722 |
| }, |
| { |
| "epoch": 0.109868550127526, |
| "grad_norm": 4.747700214385986, |
| "learning_rate": 9.903134000052106e-06, |
| "loss": 0.0579115629196167, |
| "memory(GiB)": 78.33, |
| "step": 140, |
| "token_acc": 0.9774236387782205, |
| "train_speed(iter/s)": 0.100934 |
| }, |
| { |
| "epoch": 0.11771630370806356, |
| "grad_norm": 2.209345579147339, |
| "learning_rate": 9.87607862682405e-06, |
| "loss": 0.05663343667984009, |
| "memory(GiB)": 78.33, |
| "step": 150, |
| "token_acc": 0.9818758495695514, |
| "train_speed(iter/s)": 0.101108 |
| }, |
| { |
| "epoch": 0.12556405728860115, |
| "grad_norm": 1.7758883237838745, |
| "learning_rate": 9.845741863956859e-06, |
| "loss": 0.05171079039573669, |
| "memory(GiB)": 78.33, |
| "step": 160, |
| "token_acc": 0.9778481012658228, |
| "train_speed(iter/s)": 0.101248 |
| }, |
| { |
| "epoch": 0.1334118108691387, |
| "grad_norm": 1.93281090259552, |
| "learning_rate": 9.812144126777474e-06, |
| "loss": 0.05034952163696289, |
| "memory(GiB)": 78.33, |
| "step": 170, |
| "token_acc": 0.982707509881423, |
| "train_speed(iter/s)": 0.101395 |
| }, |
| { |
| "epoch": 0.14125956444967627, |
| "grad_norm": 2.119216203689575, |
| "learning_rate": 9.77530802510725e-06, |
| "loss": 0.0457323968410492, |
| "memory(GiB)": 78.33, |
| "step": 180, |
| "token_acc": 0.9846005774783445, |
| "train_speed(iter/s)": 0.101512 |
| }, |
| { |
| "epoch": 0.14910731803021385, |
| "grad_norm": 3.984466314315796, |
| "learning_rate": 9.735258348046538e-06, |
| "loss": 0.049006104469299316, |
| "memory(GiB)": 78.33, |
| "step": 190, |
| "token_acc": 0.9822242479489517, |
| "train_speed(iter/s)": 0.101601 |
| }, |
| { |
| "epoch": 0.15695507161075142, |
| "grad_norm": 3.68289852142334, |
| "learning_rate": 9.692022047292672e-06, |
| "loss": 0.05341410040855408, |
| "memory(GiB)": 78.33, |
| "step": 200, |
| "token_acc": 0.9789666209419295, |
| "train_speed(iter/s)": 0.101709 |
| }, |
| { |
| "epoch": 0.164802825191289, |
| "grad_norm": 5.514003753662109, |
| "learning_rate": 9.645628219002667e-06, |
| "loss": 0.05571324825286865, |
| "memory(GiB)": 78.33, |
| "step": 210, |
| "token_acc": 0.9723809523809523, |
| "train_speed(iter/s)": 0.101789 |
| }, |
| { |
| "epoch": 0.17265057877182657, |
| "grad_norm": 2.3219399452209473, |
| "learning_rate": 9.596108084212752e-06, |
| "loss": 0.04433055818080902, |
| "memory(GiB)": 78.33, |
| "step": 220, |
| "token_acc": 0.984073359073359, |
| "train_speed(iter/s)": 0.101862 |
| }, |
| { |
| "epoch": 0.18049833235236415, |
| "grad_norm": 1.942451000213623, |
| "learning_rate": 9.543494967827972e-06, |
| "loss": 0.04440748691558838, |
| "memory(GiB)": 78.33, |
| "step": 230, |
| "token_acc": 0.9850483729111698, |
| "train_speed(iter/s)": 0.101945 |
| }, |
| { |
| "epoch": 0.1883460859329017, |
| "grad_norm": 1.6777188777923584, |
| "learning_rate": 9.48782427619597e-06, |
| "loss": 0.040869510173797606, |
| "memory(GiB)": 78.33, |
| "step": 240, |
| "token_acc": 0.9838155958803335, |
| "train_speed(iter/s)": 0.102011 |
| }, |
| { |
| "epoch": 0.19619383951343927, |
| "grad_norm": 1.4591469764709473, |
| "learning_rate": 9.429133473280043e-06, |
| "loss": 0.03583506345748901, |
| "memory(GiB)": 78.33, |
| "step": 250, |
| "token_acc": 0.9880838894184938, |
| "train_speed(iter/s)": 0.102067 |
| }, |
| { |
| "epoch": 0.20404159309397685, |
| "grad_norm": 1.7732504606246948, |
| "learning_rate": 9.367462055447528e-06, |
| "loss": 0.0392861932516098, |
| "memory(GiB)": 78.33, |
| "step": 260, |
| "token_acc": 0.9775227164036346, |
| "train_speed(iter/s)": 0.102128 |
| }, |
| { |
| "epoch": 0.21188934667451442, |
| "grad_norm": 1.3268229961395264, |
| "learning_rate": 9.302851524890452e-06, |
| "loss": 0.11516731977462769, |
| "memory(GiB)": 78.33, |
| "step": 270, |
| "token_acc": 0.9702209414024976, |
| "train_speed(iter/s)": 0.102181 |
| }, |
| { |
| "epoch": 0.219737100255052, |
| "grad_norm": 1.4181880950927734, |
| "learning_rate": 9.235345361696354e-06, |
| "loss": 0.029826369881629945, |
| "memory(GiB)": 78.33, |
| "step": 280, |
| "token_acc": 0.9828947368421053, |
| "train_speed(iter/s)": 0.102219 |
| }, |
| { |
| "epoch": 0.22758485383558957, |
| "grad_norm": 1.2937140464782715, |
| "learning_rate": 9.164988994588077e-06, |
| "loss": 0.032772365212440493, |
| "memory(GiB)": 78.33, |
| "step": 290, |
| "token_acc": 0.9853313100657562, |
| "train_speed(iter/s)": 0.102275 |
| }, |
| { |
| "epoch": 0.23543260741612712, |
| "grad_norm": 2.1428334712982178, |
| "learning_rate": 9.091829770352194e-06, |
| "loss": 0.026177412271499632, |
| "memory(GiB)": 78.33, |
| "step": 300, |
| "token_acc": 0.985909090909091, |
| "train_speed(iter/s)": 0.10232 |
| }, |
| { |
| "epoch": 0.2432803609966647, |
| "grad_norm": 1.6647447347640991, |
| "learning_rate": 9.015916921976684e-06, |
| "loss": 0.030026063323020935, |
| "memory(GiB)": 78.33, |
| "step": 310, |
| "token_acc": 0.9893975903614458, |
| "train_speed(iter/s)": 0.102353 |
| }, |
| { |
| "epoch": 0.2511281145772023, |
| "grad_norm": 1.532943606376648, |
| "learning_rate": 8.93730153551926e-06, |
| "loss": 0.028212451934814455, |
| "memory(GiB)": 78.33, |
| "step": 320, |
| "token_acc": 0.9867881548974943, |
| "train_speed(iter/s)": 0.102401 |
| }, |
| { |
| "epoch": 0.2589758681577399, |
| "grad_norm": 1.246038794517517, |
| "learning_rate": 8.856036515728666e-06, |
| "loss": 0.031563830375671384, |
| "memory(GiB)": 78.33, |
| "step": 330, |
| "token_acc": 0.9879013494648674, |
| "train_speed(iter/s)": 0.10244 |
| }, |
| { |
| "epoch": 0.2668236217382774, |
| "grad_norm": 1.293062448501587, |
| "learning_rate": 8.772176550442063e-06, |
| "loss": 0.027055150270462035, |
| "memory(GiB)": 78.33, |
| "step": 340, |
| "token_acc": 0.9899343544857768, |
| "train_speed(iter/s)": 0.102468 |
| }, |
| { |
| "epoch": 0.27467137531881497, |
| "grad_norm": 1.6155800819396973, |
| "learning_rate": 8.68577807378251e-06, |
| "loss": 0.02880167365074158, |
| "memory(GiB)": 78.33, |
| "step": 350, |
| "token_acc": 0.9876488751654169, |
| "train_speed(iter/s)": 0.102506 |
| }, |
| { |
| "epoch": 0.28251912889935255, |
| "grad_norm": 1.7026891708374023, |
| "learning_rate": 8.596899228181216e-06, |
| "loss": 0.023001885414123534, |
| "memory(GiB)": 78.33, |
| "step": 360, |
| "token_acc": 0.9908088235294118, |
| "train_speed(iter/s)": 0.102532 |
| }, |
| { |
| "epoch": 0.2903668824798901, |
| "grad_norm": 1.3324004411697388, |
| "learning_rate": 8.505599825250217e-06, |
| "loss": 0.026848217844963072, |
| "memory(GiB)": 78.33, |
| "step": 370, |
| "token_acc": 0.9889553612517257, |
| "train_speed(iter/s)": 0.102551 |
| }, |
| { |
| "epoch": 0.2982146360604277, |
| "grad_norm": 1.1974210739135742, |
| "learning_rate": 8.411941305531757e-06, |
| "loss": 0.026788771152496338, |
| "memory(GiB)": 78.33, |
| "step": 380, |
| "token_acc": 0.9888746255883611, |
| "train_speed(iter/s)": 0.102581 |
| }, |
| { |
| "epoch": 0.30606238964096527, |
| "grad_norm": 0.8245720863342285, |
| "learning_rate": 8.315986697151453e-06, |
| "loss": 0.025021129846572877, |
| "memory(GiB)": 78.33, |
| "step": 390, |
| "token_acc": 0.9893911439114391, |
| "train_speed(iter/s)": 0.102605 |
| }, |
| { |
| "epoch": 0.31391014322150285, |
| "grad_norm": 1.4364780187606812, |
| "learning_rate": 8.217800573403105e-06, |
| "loss": 0.02195422351360321, |
| "memory(GiB)": 78.33, |
| "step": 400, |
| "token_acc": 0.9930297397769516, |
| "train_speed(iter/s)": 0.102624 |
| }, |
| { |
| "epoch": 0.3217578968020404, |
| "grad_norm": 1.7430766820907593, |
| "learning_rate": 8.117449009293668e-06, |
| "loss": 0.027543401718139647, |
| "memory(GiB)": 78.33, |
| "step": 410, |
| "token_acc": 0.990978800180424, |
| "train_speed(iter/s)": 0.102653 |
| }, |
| { |
| "epoch": 0.329605650382578, |
| "grad_norm": 1.2770256996154785, |
| "learning_rate": 8.014999537077633e-06, |
| "loss": 0.02160567492246628, |
| "memory(GiB)": 78.33, |
| "step": 420, |
| "token_acc": 0.9921478060046189, |
| "train_speed(iter/s)": 0.102677 |
| }, |
| { |
| "epoch": 0.3374534039631156, |
| "grad_norm": 1.3733662366867065, |
| "learning_rate": 7.910521100810743e-06, |
| "loss": 0.026095324754714967, |
| "memory(GiB)": 78.33, |
| "step": 430, |
| "token_acc": 0.9870490286771508, |
| "train_speed(iter/s)": 0.102699 |
| }, |
| { |
| "epoch": 0.34530115754365315, |
| "grad_norm": 0.7466018199920654, |
| "learning_rate": 7.804084009953638e-06, |
| "loss": 0.023969930410385133, |
| "memory(GiB)": 78.33, |
| "step": 440, |
| "token_acc": 0.9892673821745217, |
| "train_speed(iter/s)": 0.102719 |
| }, |
| { |
| "epoch": 0.3531489111241907, |
| "grad_norm": 1.8762829303741455, |
| "learning_rate": 7.695759892056627e-06, |
| "loss": 0.026105433702468872, |
| "memory(GiB)": 78.33, |
| "step": 450, |
| "token_acc": 0.9867986798679867, |
| "train_speed(iter/s)": 0.102733 |
| }, |
| { |
| "epoch": 0.3609966647047283, |
| "grad_norm": 1.4278916120529175, |
| "learning_rate": 7.585621644557453e-06, |
| "loss": 0.02523442208766937, |
| "memory(GiB)": 78.33, |
| "step": 460, |
| "token_acc": 0.9880179730404394, |
| "train_speed(iter/s)": 0.102756 |
| }, |
| { |
| "epoch": 0.3688444182852658, |
| "grad_norm": 1.1567350625991821, |
| "learning_rate": 7.473743385724478e-06, |
| "loss": 0.018508574366569518, |
| "memory(GiB)": 78.33, |
| "step": 470, |
| "token_acc": 0.9911154985192497, |
| "train_speed(iter/s)": 0.102775 |
| }, |
| { |
| "epoch": 0.3766921718658034, |
| "grad_norm": 1.2503105401992798, |
| "learning_rate": 7.3602004047783e-06, |
| "loss": 0.0277862012386322, |
| "memory(GiB)": 78.33, |
| "step": 480, |
| "token_acc": 0.9869375907111756, |
| "train_speed(iter/s)": 0.102794 |
| }, |
| { |
| "epoch": 0.38453992544634097, |
| "grad_norm": 0.9864979982376099, |
| "learning_rate": 7.245069111225365e-06, |
| "loss": 0.020091001689434052, |
| "memory(GiB)": 78.33, |
| "step": 490, |
| "token_acc": 0.9874360167519777, |
| "train_speed(iter/s)": 0.102814 |
| }, |
| { |
| "epoch": 0.39238767902687854, |
| "grad_norm": 0.7295334339141846, |
| "learning_rate": 7.128426983437685e-06, |
| "loss": 0.018309633433818816, |
| "memory(GiB)": 78.33, |
| "step": 500, |
| "token_acc": 0.9927813163481953, |
| "train_speed(iter/s)": 0.102832 |
| }, |
| { |
| "epoch": 0.39238767902687854, |
| "eval_loss": 0.01768402010202408, |
| "eval_runtime": 17.8731, |
| "eval_samples_per_second": 17.233, |
| "eval_steps_per_second": 2.909, |
| "eval_token_acc": 0.9905556236967987, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.4002354326074161, |
| "grad_norm": 1.2380131483078003, |
| "learning_rate": 7.010352516513246e-06, |
| "loss": 0.021393966674804688, |
| "memory(GiB)": 78.33, |
| "step": 510, |
| "token_acc": 0.9890754602468137, |
| "train_speed(iter/s)": 0.1017 |
| }, |
| { |
| "epoch": 0.4080831861879537, |
| "grad_norm": 0.7806908488273621, |
| "learning_rate": 6.890925169452215e-06, |
| "loss": 0.01623480170965195, |
| "memory(GiB)": 78.33, |
| "step": 520, |
| "token_acc": 0.9933014354066986, |
| "train_speed(iter/s)": 0.101735 |
| }, |
| { |
| "epoch": 0.41593093976849127, |
| "grad_norm": 0.7129095792770386, |
| "learning_rate": 6.770225311684469e-06, |
| "loss": 0.018100659549236297, |
| "memory(GiB)": 78.33, |
| "step": 530, |
| "token_acc": 0.9937106918238994, |
| "train_speed(iter/s)": 0.10177 |
| }, |
| { |
| "epoch": 0.42377869334902885, |
| "grad_norm": 0.9304487705230713, |
| "learning_rate": 6.648334168984452e-06, |
| "loss": 0.014826363325119019, |
| "memory(GiB)": 78.33, |
| "step": 540, |
| "token_acc": 0.9924026590693258, |
| "train_speed(iter/s)": 0.101801 |
| }, |
| { |
| "epoch": 0.4316264469295664, |
| "grad_norm": 0.5562326908111572, |
| "learning_rate": 6.525333768809755e-06, |
| "loss": 0.017968928813934325, |
| "memory(GiB)": 78.33, |
| "step": 550, |
| "token_acc": 0.9962894248608535, |
| "train_speed(iter/s)": 0.101834 |
| }, |
| { |
| "epoch": 0.439474200510104, |
| "grad_norm": 0.6488431096076965, |
| "learning_rate": 6.4013068851001815e-06, |
| "loss": 0.01475011110305786, |
| "memory(GiB)": 78.33, |
| "step": 560, |
| "token_acc": 0.9914529914529915, |
| "train_speed(iter/s)": 0.101862 |
| }, |
| { |
| "epoch": 0.44732195409064157, |
| "grad_norm": 0.7067273259162903, |
| "learning_rate": 6.276336982574479e-06, |
| "loss": 0.0170462965965271, |
| "memory(GiB)": 78.33, |
| "step": 570, |
| "token_acc": 0.9939707149009475, |
| "train_speed(iter/s)": 0.101891 |
| }, |
| { |
| "epoch": 0.45516970767117915, |
| "grad_norm": 0.9754965305328369, |
| "learning_rate": 6.150508160562201e-06, |
| "loss": 0.012932208180427552, |
| "memory(GiB)": 78.33, |
| "step": 580, |
| "token_acc": 0.9925044091710759, |
| "train_speed(iter/s)": 0.101923 |
| }, |
| { |
| "epoch": 0.4630174612517167, |
| "grad_norm": 1.3289886713027954, |
| "learning_rate": 6.023905096408493e-06, |
| "loss": 0.015326529741287231, |
| "memory(GiB)": 78.33, |
| "step": 590, |
| "token_acc": 0.9921052631578947, |
| "train_speed(iter/s)": 0.101953 |
| }, |
| { |
| "epoch": 0.47086521483225424, |
| "grad_norm": 1.0586456060409546, |
| "learning_rate": 5.896612988489917e-06, |
| "loss": 0.014036232233047485, |
| "memory(GiB)": 78.33, |
| "step": 600, |
| "token_acc": 0.9930523390458545, |
| "train_speed(iter/s)": 0.101977 |
| }, |
| { |
| "epoch": 0.4787129684127918, |
| "grad_norm": 0.5072459578514099, |
| "learning_rate": 5.768717498879635e-06, |
| "loss": 0.01249450072646141, |
| "memory(GiB)": 78.33, |
| "step": 610, |
| "token_acc": 0.9956875898418783, |
| "train_speed(iter/s)": 0.102005 |
| }, |
| { |
| "epoch": 0.4865607219933294, |
| "grad_norm": 0.4904365539550781, |
| "learning_rate": 5.640304695700543e-06, |
| "loss": 0.009184502065181732, |
| "memory(GiB)": 78.33, |
| "step": 620, |
| "token_acc": 0.9970631424375918, |
| "train_speed(iter/s)": 0.10203 |
| }, |
| { |
| "epoch": 0.49440847557386697, |
| "grad_norm": 0.4811933636665344, |
| "learning_rate": 5.511460995205152e-06, |
| "loss": 0.009154336154460907, |
| "memory(GiB)": 78.33, |
| "step": 630, |
| "token_acc": 0.9952277657266811, |
| "train_speed(iter/s)": 0.102052 |
| }, |
| { |
| "epoch": 0.5022562291544046, |
| "grad_norm": 0.6274723410606384, |
| "learning_rate": 5.3822731036211975e-06, |
| "loss": 0.016800814867019655, |
| "memory(GiB)": 78.33, |
| "step": 640, |
| "token_acc": 0.9949977262391997, |
| "train_speed(iter/s)": 0.102079 |
| }, |
| { |
| "epoch": 0.5101039827349422, |
| "grad_norm": 0.4992922842502594, |
| "learning_rate": 5.252827958802104e-06, |
| "loss": 0.0129698246717453, |
| "memory(GiB)": 78.33, |
| "step": 650, |
| "token_acc": 0.9954934655250113, |
| "train_speed(iter/s)": 0.102099 |
| }, |
| { |
| "epoch": 0.5179517363154797, |
| "grad_norm": 0.20970365405082703, |
| "learning_rate": 5.123212671721576e-06, |
| "loss": 0.011136610805988312, |
| "memory(GiB)": 78.33, |
| "step": 660, |
| "token_acc": 0.9967289719626168, |
| "train_speed(iter/s)": 0.10212 |
| }, |
| { |
| "epoch": 0.5257994898960172, |
| "grad_norm": 0.6177439093589783, |
| "learning_rate": 4.99351446785169e-06, |
| "loss": 0.008509316295385361, |
| "memory(GiB)": 78.33, |
| "step": 670, |
| "token_acc": 0.9966634890371783, |
| "train_speed(iter/s)": 0.102147 |
| }, |
| { |
| "epoch": 0.5336472434765548, |
| "grad_norm": 1.3992984294891357, |
| "learning_rate": 4.863820628463925e-06, |
| "loss": 0.008021638542413712, |
| "memory(GiB)": 78.33, |
| "step": 680, |
| "token_acc": 0.9985155863433943, |
| "train_speed(iter/s)": 0.102167 |
| }, |
| { |
| "epoch": 0.5414949970570924, |
| "grad_norm": 0.5666137933731079, |
| "learning_rate": 4.734218431892659e-06, |
| "loss": 0.010254481434822082, |
| "memory(GiB)": 78.33, |
| "step": 690, |
| "token_acc": 0.9966918714555766, |
| "train_speed(iter/s)": 0.102186 |
| }, |
| { |
| "epoch": 0.5493427506376299, |
| "grad_norm": 0.8575289845466614, |
| "learning_rate": 4.604795094800618e-06, |
| "loss": 0.006985708326101303, |
| "memory(GiB)": 78.33, |
| "step": 700, |
| "token_acc": 0.9972093023255814, |
| "train_speed(iter/s)": 0.102208 |
| }, |
| { |
| "epoch": 0.5571905042181675, |
| "grad_norm": 0.5324018001556396, |
| "learning_rate": 4.475637713485853e-06, |
| "loss": 0.00994066745042801, |
| "memory(GiB)": 78.33, |
| "step": 710, |
| "token_acc": 0.995475113122172, |
| "train_speed(iter/s)": 0.102224 |
| }, |
| { |
| "epoch": 0.5650382577987051, |
| "grad_norm": 0.1626451015472412, |
| "learning_rate": 4.3468332052697e-06, |
| "loss": 0.007179060578346252, |
| "memory(GiB)": 78.33, |
| "step": 720, |
| "token_acc": 0.9971195391262602, |
| "train_speed(iter/s)": 0.10224 |
| }, |
| { |
| "epoch": 0.5728860113792427, |
| "grad_norm": 0.282648503780365, |
| "learning_rate": 4.218468250005189e-06, |
| "loss": 0.009923791885375977, |
| "memory(GiB)": 78.33, |
| "step": 730, |
| "token_acc": 0.9982158786797503, |
| "train_speed(iter/s)": 0.102261 |
| }, |
| { |
| "epoch": 0.5807337649597802, |
| "grad_norm": 1.230008840560913, |
| "learning_rate": 4.090629231745257e-06, |
| "loss": 0.010334306955337524, |
| "memory(GiB)": 78.33, |
| "step": 740, |
| "token_acc": 0.9966903073286052, |
| "train_speed(iter/s)": 0.102277 |
| }, |
| { |
| "epoch": 0.5885815185403178, |
| "grad_norm": 0.9169402122497559, |
| "learning_rate": 3.963402180610028e-06, |
| "loss": 0.007900170236825942, |
| "memory(GiB)": 78.33, |
| "step": 750, |
| "token_acc": 0.9972413793103448, |
| "train_speed(iter/s)": 0.102291 |
| }, |
| { |
| "epoch": 0.5964292721208554, |
| "grad_norm": 0.9331321716308594, |
| "learning_rate": 3.836872714892268e-06, |
| "loss": 0.0052720453590154644, |
| "memory(GiB)": 78.33, |
| "step": 760, |
| "token_acc": 0.9972489683631361, |
| "train_speed(iter/s)": 0.102309 |
| }, |
| { |
| "epoch": 0.604277025701393, |
| "grad_norm": 0.2847830057144165, |
| "learning_rate": 3.7111259834399776e-06, |
| "loss": 0.005255531892180443, |
| "memory(GiB)": 78.33, |
| "step": 770, |
| "token_acc": 0.9972776769509982, |
| "train_speed(iter/s)": 0.102325 |
| }, |
| { |
| "epoch": 0.6121247792819305, |
| "grad_norm": 0.5080392360687256, |
| "learning_rate": 3.5862466083549176e-06, |
| "loss": 0.003240898996591568, |
| "memory(GiB)": 78.33, |
| "step": 780, |
| "token_acc": 0.9979939819458375, |
| "train_speed(iter/s)": 0.102338 |
| }, |
| { |
| "epoch": 0.6199725328624681, |
| "grad_norm": 0.9272496104240417, |
| "learning_rate": 3.4623186280455938e-06, |
| "loss": 0.004520921036601067, |
| "memory(GiB)": 78.33, |
| "step": 790, |
| "token_acc": 0.9975049900199601, |
| "train_speed(iter/s)": 0.102355 |
| }, |
| { |
| "epoch": 0.6278202864430057, |
| "grad_norm": 0.6222965121269226, |
| "learning_rate": 3.339425440673049e-06, |
| "loss": 0.007227100431919098, |
| "memory(GiB)": 78.33, |
| "step": 800, |
| "token_acc": 0.9971014492753624, |
| "train_speed(iter/s)": 0.102372 |
| }, |
| { |
| "epoch": 0.6356680400235433, |
| "grad_norm": 1.0359742641448975, |
| "learning_rate": 3.2176497480275196e-06, |
| "loss": 0.0054885722696781155, |
| "memory(GiB)": 78.33, |
| "step": 810, |
| "token_acc": 0.9980601357904947, |
| "train_speed(iter/s)": 0.102383 |
| }, |
| { |
| "epoch": 0.6435157936040808, |
| "grad_norm": 0.13294534385204315, |
| "learning_rate": 3.0970734998737095e-06, |
| "loss": 0.005127144977450371, |
| "memory(GiB)": 78.33, |
| "step": 820, |
| "token_acc": 0.9975868725868726, |
| "train_speed(iter/s)": 0.1024 |
| }, |
| { |
| "epoch": 0.6513635471846184, |
| "grad_norm": 0.3067134618759155, |
| "learning_rate": 2.9777778388021508e-06, |
| "loss": 0.0035617969930171966, |
| "memory(GiB)": 78.33, |
| "step": 830, |
| "token_acc": 0.9981176470588236, |
| "train_speed(iter/s)": 0.102414 |
| }, |
| { |
| "epoch": 0.659211300765156, |
| "grad_norm": 0.9650315046310425, |
| "learning_rate": 2.859843045623753e-06, |
| "loss": 0.004638446867465973, |
| "memory(GiB)": 78.33, |
| "step": 840, |
| "token_acc": 0.9977127172918573, |
| "train_speed(iter/s)": 0.102425 |
| }, |
| { |
| "epoch": 0.6670590543456936, |
| "grad_norm": 0.6016029119491577, |
| "learning_rate": 2.743348485344307e-06, |
| "loss": 0.004326858744025231, |
| "memory(GiB)": 78.33, |
| "step": 850, |
| "token_acc": 0.9991146525011066, |
| "train_speed(iter/s)": 0.102441 |
| }, |
| { |
| "epoch": 0.6749068079262311, |
| "grad_norm": 0.0693819597363472, |
| "learning_rate": 2.6283725537552573e-06, |
| "loss": 0.004721887409687042, |
| "memory(GiB)": 78.33, |
| "step": 860, |
| "token_acc": 0.9976065102920058, |
| "train_speed(iter/s)": 0.102453 |
| }, |
| { |
| "epoch": 0.6827545615067687, |
| "grad_norm": 0.8225326538085938, |
| "learning_rate": 2.514992624676748e-06, |
| "loss": 0.0044202588498592375, |
| "memory(GiB)": 78.33, |
| "step": 870, |
| "token_acc": 0.9981176470588236, |
| "train_speed(iter/s)": 0.102466 |
| }, |
| { |
| "epoch": 0.6906023150873063, |
| "grad_norm": 0.1978958398103714, |
| "learning_rate": 2.403284997888381e-06, |
| "loss": 0.003972284868359566, |
| "memory(GiB)": 78.33, |
| "step": 880, |
| "token_acc": 0.9976915974145891, |
| "train_speed(iter/s)": 0.102478 |
| }, |
| { |
| "epoch": 0.6984500686678439, |
| "grad_norm": 0.12321511656045914, |
| "learning_rate": 2.2933248477827814e-06, |
| "loss": 0.006783504784107208, |
| "memory(GiB)": 78.33, |
| "step": 890, |
| "token_acc": 0.9984984984984985, |
| "train_speed(iter/s)": 0.102492 |
| }, |
| { |
| "epoch": 0.7062978222483814, |
| "grad_norm": 0.38810572028160095, |
| "learning_rate": 2.1851861727764815e-06, |
| "loss": 0.004711529612541199, |
| "memory(GiB)": 78.33, |
| "step": 900, |
| "token_acc": 0.9980544747081712, |
| "train_speed(iter/s)": 0.102507 |
| }, |
| { |
| "epoch": 0.714145575828919, |
| "grad_norm": 0.3537726402282715, |
| "learning_rate": 2.0789417455121964e-06, |
| "loss": 0.0040462717413902284, |
| "memory(GiB)": 78.33, |
| "step": 910, |
| "token_acc": 0.998610467809171, |
| "train_speed(iter/s)": 0.102515 |
| }, |
| { |
| "epoch": 0.7219933294094566, |
| "grad_norm": 0.5852633118629456, |
| "learning_rate": 1.9746630638859853e-06, |
| "loss": 0.002691943012177944, |
| "memory(GiB)": 78.33, |
| "step": 920, |
| "token_acc": 0.9984901862103673, |
| "train_speed(iter/s)": 0.102527 |
| }, |
| { |
| "epoch": 0.7298410829899941, |
| "grad_norm": 0.09465645998716354, |
| "learning_rate": 1.8724203029322684e-06, |
| "loss": 0.0028355952352285387, |
| "memory(GiB)": 78.33, |
| "step": 930, |
| "token_acc": 0.9995393827729157, |
| "train_speed(iter/s)": 0.102539 |
| }, |
| { |
| "epoch": 0.7376888365705316, |
| "grad_norm": 0.6381473541259766, |
| "learning_rate": 1.772282267599068e-06, |
| "loss": 0.0024913540109992027, |
| "memory(GiB)": 78.33, |
| "step": 940, |
| "token_acc": 0.9990740740740741, |
| "train_speed(iter/s)": 0.102548 |
| }, |
| { |
| "epoch": 0.7455365901510692, |
| "grad_norm": 0.6442692279815674, |
| "learning_rate": 1.6743163464452605e-06, |
| "loss": 0.0036306858062744142, |
| "memory(GiB)": 78.33, |
| "step": 950, |
| "token_acc": 0.9976213130352045, |
| "train_speed(iter/s)": 0.10256 |
| }, |
| { |
| "epoch": 0.7533843437316068, |
| "grad_norm": 0.5344854593276978, |
| "learning_rate": 1.5785884662909917e-06, |
| "loss": 0.0027463218197226525, |
| "memory(GiB)": 78.33, |
| "step": 960, |
| "token_acc": 0.9976819656930923, |
| "train_speed(iter/s)": 0.102572 |
| }, |
| { |
| "epoch": 0.7612320973121444, |
| "grad_norm": 1.05326247215271, |
| "learning_rate": 1.4851630478517942e-06, |
| "loss": 0.0029366277158260345, |
| "memory(GiB)": 78.33, |
| "step": 970, |
| "token_acc": 0.9985869053226566, |
| "train_speed(iter/s)": 0.102583 |
| }, |
| { |
| "epoch": 0.7690798508926819, |
| "grad_norm": 0.2340899109840393, |
| "learning_rate": 1.394102962386223e-06, |
| "loss": 0.0015608785673975945, |
| "memory(GiB)": 78.33, |
| "step": 980, |
| "token_acc": 0.999054820415879, |
| "train_speed(iter/s)": 0.102594 |
| }, |
| { |
| "epoch": 0.7769276044732195, |
| "grad_norm": 0.08281790465116501, |
| "learning_rate": 1.3054694893862341e-06, |
| "loss": 0.0035311192274093627, |
| "memory(GiB)": 78.33, |
| "step": 990, |
| "token_acc": 0.9981438515081207, |
| "train_speed(iter/s)": 0.102606 |
| }, |
| { |
| "epoch": 0.7847753580537571, |
| "grad_norm": 0.4216911494731903, |
| "learning_rate": 1.219322275338738e-06, |
| "loss": 0.003600326552987099, |
| "memory(GiB)": 78.33, |
| "step": 1000, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.102614 |
| }, |
| { |
| "epoch": 0.7847753580537571, |
| "eval_loss": 0.002305834786966443, |
| "eval_runtime": 17.6788, |
| "eval_samples_per_second": 17.422, |
| "eval_steps_per_second": 2.941, |
| "eval_token_acc": 0.9990187660983687, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.7926231116342947, |
| "grad_norm": 0.233176589012146, |
| "learning_rate": 1.1357192935860955e-06, |
| "loss": 0.001780262403190136, |
| "memory(GiB)": 78.33, |
| "step": 1010, |
| "token_acc": 0.9993978321959053, |
| "train_speed(iter/s)": 0.102038 |
| }, |
| { |
| "epoch": 0.8004708652148322, |
| "grad_norm": 0.4482191801071167, |
| "learning_rate": 1.0547168053125733e-06, |
| "loss": 0.004453697055578232, |
| "memory(GiB)": 78.33, |
| "step": 1020, |
| "token_acc": 0.999061473486626, |
| "train_speed(iter/s)": 0.102041 |
| }, |
| { |
| "epoch": 0.8083186187953698, |
| "grad_norm": 0.04691644757986069, |
| "learning_rate": 9.763693216830055e-07, |
| "loss": 0.0016242723912000656, |
| "memory(GiB)": 78.33, |
| "step": 1030, |
| "token_acc": 0.9991503823279524, |
| "train_speed(iter/s)": 0.102056 |
| }, |
| { |
| "epoch": 0.8161663723759074, |
| "grad_norm": 0.31022143363952637, |
| "learning_rate": 9.007295671591393e-07, |
| "loss": 0.003097619116306305, |
| "memory(GiB)": 78.33, |
| "step": 1040, |
| "token_acc": 0.999526066350711, |
| "train_speed(iter/s)": 0.10207 |
| }, |
| { |
| "epoch": 0.824014125956445, |
| "grad_norm": 0.44393110275268555, |
| "learning_rate": 8.278484440183549e-07, |
| "loss": 0.002246275171637535, |
| "memory(GiB)": 78.33, |
| "step": 1050, |
| "token_acc": 0.9991091314031181, |
| "train_speed(iter/s)": 0.102085 |
| }, |
| { |
| "epoch": 0.8318618795369825, |
| "grad_norm": 0.268406480550766, |
| "learning_rate": 7.577749980986443e-07, |
| "loss": 0.003959977626800537, |
| "memory(GiB)": 78.33, |
| "step": 1060, |
| "token_acc": 0.99800796812749, |
| "train_speed(iter/s)": 0.1021 |
| }, |
| { |
| "epoch": 0.8397096331175201, |
| "grad_norm": 0.8081741333007812, |
| "learning_rate": 6.905563857928838e-07, |
| "loss": 0.004642174392938614, |
| "memory(GiB)": 78.33, |
| "step": 1070, |
| "token_acc": 0.9976303317535545, |
| "train_speed(iter/s)": 0.102113 |
| }, |
| { |
| "epoch": 0.8475573866980577, |
| "grad_norm": 0.09950771182775497, |
| "learning_rate": 6.262378423146254e-07, |
| "loss": 0.0011267985217273235, |
| "memory(GiB)": 78.33, |
| "step": 1080, |
| "token_acc": 0.9995366079703429, |
| "train_speed(iter/s)": 0.102128 |
| }, |
| { |
| "epoch": 0.8554051402785953, |
| "grad_norm": 0.5595096945762634, |
| "learning_rate": 5.648626512567546e-07, |
| "loss": 0.001190672628581524, |
| "memory(GiB)": 78.33, |
| "step": 1090, |
| "token_acc": 0.9995657837603127, |
| "train_speed(iter/s)": 0.102142 |
| }, |
| { |
| "epoch": 0.8632528938591328, |
| "grad_norm": 0.3294273018836975, |
| "learning_rate": 5.064721154635155e-07, |
| "loss": 0.0030788829550147056, |
| "memory(GiB)": 78.33, |
| "step": 1100, |
| "token_acc": 0.9986708019494904, |
| "train_speed(iter/s)": 0.102152 |
| }, |
| { |
| "epoch": 0.8711006474396704, |
| "grad_norm": 1.3336243629455566, |
| "learning_rate": 4.511055292354799e-07, |
| "loss": 0.0034807972609996797, |
| "memory(GiB)": 78.33, |
| "step": 1110, |
| "token_acc": 0.9985250737463127, |
| "train_speed(iter/s)": 0.102168 |
| }, |
| { |
| "epoch": 0.878948401020208, |
| "grad_norm": 0.5120218992233276, |
| "learning_rate": 3.988001518861878e-07, |
| "loss": 0.003415053337812424, |
| "memory(GiB)": 78.33, |
| "step": 1120, |
| "token_acc": 0.9981299672744273, |
| "train_speed(iter/s)": 0.10218 |
| }, |
| { |
| "epoch": 0.8867961546007456, |
| "grad_norm": 0.2708381116390228, |
| "learning_rate": 3.495911826682441e-07, |
| "loss": 0.002040334790945053, |
| "memory(GiB)": 78.33, |
| "step": 1130, |
| "token_acc": 0.9990627928772259, |
| "train_speed(iter/s)": 0.102191 |
| }, |
| { |
| "epoch": 0.8946439081812831, |
| "grad_norm": 0.2763141989707947, |
| "learning_rate": 3.0351173708574657e-07, |
| "loss": 0.002172568999230862, |
| "memory(GiB)": 78.33, |
| "step": 1140, |
| "token_acc": 0.9994972347913524, |
| "train_speed(iter/s)": 0.102204 |
| }, |
| { |
| "epoch": 0.9024916617618207, |
| "grad_norm": 0.08879227936267853, |
| "learning_rate": 2.605928246089834e-07, |
| "loss": 0.0020642828196287153, |
| "memory(GiB)": 78.33, |
| "step": 1150, |
| "token_acc": 0.9995069033530573, |
| "train_speed(iter/s)": 0.102216 |
| }, |
| { |
| "epoch": 0.9103394153423583, |
| "grad_norm": 0.4619658291339874, |
| "learning_rate": 2.2086332780640928e-07, |
| "loss": 0.001702458970248699, |
| "memory(GiB)": 78.33, |
| "step": 1160, |
| "token_acc": 0.9995285242809995, |
| "train_speed(iter/s)": 0.102227 |
| }, |
| { |
| "epoch": 0.9181871689228959, |
| "grad_norm": 0.6599162220954895, |
| "learning_rate": 1.8434998290792373e-07, |
| "loss": 0.0055834796279668805, |
| "memory(GiB)": 78.33, |
| "step": 1170, |
| "token_acc": 0.9982070820259973, |
| "train_speed(iter/s)": 0.10224 |
| }, |
| { |
| "epoch": 0.9260349225034334, |
| "grad_norm": 0.07474014908075333, |
| "learning_rate": 1.510773618125494e-07, |
| "loss": 0.0019306868314743042, |
| "memory(GiB)": 78.33, |
| "step": 1180, |
| "token_acc": 0.9995256166982922, |
| "train_speed(iter/s)": 0.102253 |
| }, |
| { |
| "epoch": 0.9338826760839709, |
| "grad_norm": 0.12219434976577759, |
| "learning_rate": 1.2106785555260568e-07, |
| "loss": 0.0020278608426451683, |
| "memory(GiB)": 78.33, |
| "step": 1190, |
| "token_acc": 0.9995555555555555, |
| "train_speed(iter/s)": 0.102262 |
| }, |
| { |
| "epoch": 0.9417304296645085, |
| "grad_norm": 0.14843851327896118, |
| "learning_rate": 9.434165922551641e-08, |
| "loss": 0.0019979637116193773, |
| "memory(GiB)": 78.33, |
| "step": 1200, |
| "token_acc": 0.9995711835334476, |
| "train_speed(iter/s)": 0.102275 |
| }, |
| { |
| "epoch": 0.9495781832450461, |
| "grad_norm": 0.4590687155723572, |
| "learning_rate": 7.091675840338485e-08, |
| "loss": 0.0013441312126815318, |
| "memory(GiB)": 78.33, |
| "step": 1210, |
| "token_acc": 0.9995460735360872, |
| "train_speed(iter/s)": 0.102286 |
| }, |
| { |
| "epoch": 0.9574259368255836, |
| "grad_norm": 0.3629854619503021, |
| "learning_rate": 5.0808917029481205e-08, |
| "loss": 0.002632497064769268, |
| "memory(GiB)": 78.33, |
| "step": 1220, |
| "token_acc": 0.9995487364620939, |
| "train_speed(iter/s)": 0.102295 |
| }, |
| { |
| "epoch": 0.9652736904061212, |
| "grad_norm": 0.20997942984104156, |
| "learning_rate": 3.4031666809793974e-08, |
| "loss": 0.0015788381919264794, |
| "memory(GiB)": 78.33, |
| "step": 1230, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.102307 |
| }, |
| { |
| "epoch": 0.9731214439866588, |
| "grad_norm": 0.4643399715423584, |
| "learning_rate": 2.0596298106774214e-08, |
| "loss": 0.0017853409051895141, |
| "memory(GiB)": 78.33, |
| "step": 1240, |
| "token_acc": 0.9991235758106923, |
| "train_speed(iter/s)": 0.102317 |
| }, |
| { |
| "epoch": 0.9809691975671964, |
| "grad_norm": 0.1480502039194107, |
| "learning_rate": 1.051185234141494e-08, |
| "loss": 0.0018517106771469116, |
| "memory(GiB)": 78.33, |
| "step": 1250, |
| "token_acc": 0.9990913221263062, |
| "train_speed(iter/s)": 0.102326 |
| }, |
| { |
| "epoch": 0.9888169511477339, |
| "grad_norm": 0.6869596838951111, |
| "learning_rate": 3.7851159087665124e-09, |
| "loss": 0.0024400349706411363, |
| "memory(GiB)": 78.33, |
| "step": 1260, |
| "token_acc": 0.9990494296577946, |
| "train_speed(iter/s)": 0.102338 |
| }, |
| { |
| "epoch": 0.9966647047282715, |
| "grad_norm": 0.9563629031181335, |
| "learning_rate": 4.2061561098261093e-10, |
| "loss": 0.0032432712614536285, |
| "memory(GiB)": 78.33, |
| "step": 1270, |
| "token_acc": 0.9995429616087751, |
| "train_speed(iter/s)": 0.102349 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.0017730883555486798, |
| "eval_runtime": 18.1533, |
| "eval_samples_per_second": 16.967, |
| "eval_steps_per_second": 2.864, |
| "eval_token_acc": 0.9992640745737765, |
| "step": 1275 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1275, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.34975330481655e+18, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|