| { |
| "best_metric": 0.37647188, |
| "best_model_checkpoint": "/mnt/petrelfs/caoyuhang/InternLM-XComposer/finetune_audio/output/sft-continue_base_silence/qwen2-audio-7b-instruct/v0-20241120-155458/checkpoint-20000", |
| "epoch": 3.0, |
| "eval_steps": 2000, |
| "global_step": 32220, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 9.310986964618249e-05, |
| "grad_norm": 1.40625, |
| "learning_rate": 1.2414649286157668e-07, |
| "loss": 0.18691748, |
| "memory(GiB)": 26.35, |
| "step": 1, |
| "train_speed(iter/s)": 0.046198 |
| }, |
| { |
| "epoch": 0.00186219739292365, |
| "grad_norm": 1.59375, |
| "learning_rate": 2.4829298572315337e-06, |
| "loss": 0.2265442, |
| "memory(GiB)": 59.78, |
| "step": 20, |
| "train_speed(iter/s)": 0.434773 |
| }, |
| { |
| "epoch": 0.0037243947858473, |
| "grad_norm": 1.4140625, |
| "learning_rate": 4.965859714463067e-06, |
| "loss": 0.21457853, |
| "memory(GiB)": 59.78, |
| "step": 40, |
| "train_speed(iter/s)": 0.569546 |
| }, |
| { |
| "epoch": 0.00558659217877095, |
| "grad_norm": 1.5, |
| "learning_rate": 7.4487895716946e-06, |
| "loss": 0.2096719, |
| "memory(GiB)": 75.39, |
| "step": 60, |
| "train_speed(iter/s)": 0.627998 |
| }, |
| { |
| "epoch": 0.0074487895716946, |
| "grad_norm": 1.3046875, |
| "learning_rate": 9.931719428926135e-06, |
| "loss": 0.21570706, |
| "memory(GiB)": 75.39, |
| "step": 80, |
| "train_speed(iter/s)": 0.655146 |
| }, |
| { |
| "epoch": 0.00931098696461825, |
| "grad_norm": 1.9921875, |
| "learning_rate": 1.2414649286157666e-05, |
| "loss": 0.22957432, |
| "memory(GiB)": 75.39, |
| "step": 100, |
| "train_speed(iter/s)": 0.680076 |
| }, |
| { |
| "epoch": 0.0111731843575419, |
| "grad_norm": 1.1796875, |
| "learning_rate": 1.48975791433892e-05, |
| "loss": 0.22518969, |
| "memory(GiB)": 75.39, |
| "step": 120, |
| "train_speed(iter/s)": 0.695236 |
| }, |
| { |
| "epoch": 0.01303538175046555, |
| "grad_norm": 1.2265625, |
| "learning_rate": 1.7380509000620735e-05, |
| "loss": 0.20143158, |
| "memory(GiB)": 75.39, |
| "step": 140, |
| "train_speed(iter/s)": 0.709963 |
| }, |
| { |
| "epoch": 0.0148975791433892, |
| "grad_norm": 1.875, |
| "learning_rate": 1.986343885785227e-05, |
| "loss": 0.19077778, |
| "memory(GiB)": 75.39, |
| "step": 160, |
| "train_speed(iter/s)": 0.719808 |
| }, |
| { |
| "epoch": 0.01675977653631285, |
| "grad_norm": 1.390625, |
| "learning_rate": 2.2346368715083797e-05, |
| "loss": 0.2053205, |
| "memory(GiB)": 75.39, |
| "step": 180, |
| "train_speed(iter/s)": 0.729386 |
| }, |
| { |
| "epoch": 0.0186219739292365, |
| "grad_norm": 1.6015625, |
| "learning_rate": 2.482929857231533e-05, |
| "loss": 0.21171951, |
| "memory(GiB)": 75.39, |
| "step": 200, |
| "train_speed(iter/s)": 0.735649 |
| }, |
| { |
| "epoch": 0.020484171322160148, |
| "grad_norm": 1.5390625, |
| "learning_rate": 2.7312228429546866e-05, |
| "loss": 0.19139764, |
| "memory(GiB)": 75.39, |
| "step": 220, |
| "train_speed(iter/s)": 0.74262 |
| }, |
| { |
| "epoch": 0.0223463687150838, |
| "grad_norm": 1.421875, |
| "learning_rate": 2.97951582867784e-05, |
| "loss": 0.19102612, |
| "memory(GiB)": 75.39, |
| "step": 240, |
| "train_speed(iter/s)": 0.748473 |
| }, |
| { |
| "epoch": 0.024208566108007448, |
| "grad_norm": 1.6796875, |
| "learning_rate": 3.227808814400993e-05, |
| "loss": 0.22476618, |
| "memory(GiB)": 75.39, |
| "step": 260, |
| "train_speed(iter/s)": 0.749743 |
| }, |
| { |
| "epoch": 0.0260707635009311, |
| "grad_norm": 1.390625, |
| "learning_rate": 3.476101800124147e-05, |
| "loss": 0.20150518, |
| "memory(GiB)": 75.39, |
| "step": 280, |
| "train_speed(iter/s)": 0.751916 |
| }, |
| { |
| "epoch": 0.027932960893854747, |
| "grad_norm": 1.7734375, |
| "learning_rate": 3.7243947858473e-05, |
| "loss": 0.20568705, |
| "memory(GiB)": 75.39, |
| "step": 300, |
| "train_speed(iter/s)": 0.755403 |
| }, |
| { |
| "epoch": 0.0297951582867784, |
| "grad_norm": 1.328125, |
| "learning_rate": 3.972687771570454e-05, |
| "loss": 0.21228166, |
| "memory(GiB)": 75.39, |
| "step": 320, |
| "train_speed(iter/s)": 0.758702 |
| }, |
| { |
| "epoch": 0.03165735567970205, |
| "grad_norm": 1.5546875, |
| "learning_rate": 4.220980757293606e-05, |
| "loss": 0.23770952, |
| "memory(GiB)": 75.39, |
| "step": 340, |
| "train_speed(iter/s)": 0.760792 |
| }, |
| { |
| "epoch": 0.0335195530726257, |
| "grad_norm": 1.7265625, |
| "learning_rate": 4.4692737430167594e-05, |
| "loss": 0.1984645, |
| "memory(GiB)": 75.39, |
| "step": 360, |
| "train_speed(iter/s)": 0.761537 |
| }, |
| { |
| "epoch": 0.035381750465549346, |
| "grad_norm": 1.34375, |
| "learning_rate": 4.717566728739913e-05, |
| "loss": 0.20803604, |
| "memory(GiB)": 45.23, |
| "step": 380, |
| "train_speed(iter/s)": 0.762314 |
| }, |
| { |
| "epoch": 0.037243947858473, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.965859714463066e-05, |
| "loss": 0.21420815, |
| "memory(GiB)": 45.23, |
| "step": 400, |
| "train_speed(iter/s)": 0.764075 |
| }, |
| { |
| "epoch": 0.03910614525139665, |
| "grad_norm": 1.34375, |
| "learning_rate": 5.21415270018622e-05, |
| "loss": 0.20427363, |
| "memory(GiB)": 45.23, |
| "step": 420, |
| "train_speed(iter/s)": 0.765517 |
| }, |
| { |
| "epoch": 0.040968342644320296, |
| "grad_norm": 1.265625, |
| "learning_rate": 5.462445685909373e-05, |
| "loss": 0.20555434, |
| "memory(GiB)": 45.23, |
| "step": 440, |
| "train_speed(iter/s)": 0.765864 |
| }, |
| { |
| "epoch": 0.04283054003724395, |
| "grad_norm": 1.34375, |
| "learning_rate": 5.7107386716325263e-05, |
| "loss": 0.21655397, |
| "memory(GiB)": 45.23, |
| "step": 460, |
| "train_speed(iter/s)": 0.766872 |
| }, |
| { |
| "epoch": 0.0446927374301676, |
| "grad_norm": 1.6484375, |
| "learning_rate": 5.95903165735568e-05, |
| "loss": 0.19097025, |
| "memory(GiB)": 45.23, |
| "step": 480, |
| "train_speed(iter/s)": 0.768982 |
| }, |
| { |
| "epoch": 0.04655493482309125, |
| "grad_norm": 1.6796875, |
| "learning_rate": 6.207324643078833e-05, |
| "loss": 0.19996111, |
| "memory(GiB)": 45.23, |
| "step": 500, |
| "train_speed(iter/s)": 0.770034 |
| }, |
| { |
| "epoch": 0.048417132216014895, |
| "grad_norm": 1.484375, |
| "learning_rate": 6.455617628801986e-05, |
| "loss": 0.20735807, |
| "memory(GiB)": 45.23, |
| "step": 520, |
| "train_speed(iter/s)": 0.771256 |
| }, |
| { |
| "epoch": 0.05027932960893855, |
| "grad_norm": 1.8203125, |
| "learning_rate": 6.70391061452514e-05, |
| "loss": 0.20899239, |
| "memory(GiB)": 45.23, |
| "step": 540, |
| "train_speed(iter/s)": 0.773346 |
| }, |
| { |
| "epoch": 0.0521415270018622, |
| "grad_norm": 2.1875, |
| "learning_rate": 6.952203600248294e-05, |
| "loss": 0.21587, |
| "memory(GiB)": 45.23, |
| "step": 560, |
| "train_speed(iter/s)": 0.775101 |
| }, |
| { |
| "epoch": 0.054003724394785846, |
| "grad_norm": 1.6484375, |
| "learning_rate": 7.200496585971447e-05, |
| "loss": 0.20331898, |
| "memory(GiB)": 45.23, |
| "step": 580, |
| "train_speed(iter/s)": 0.776379 |
| }, |
| { |
| "epoch": 0.055865921787709494, |
| "grad_norm": 1.1796875, |
| "learning_rate": 7.4487895716946e-05, |
| "loss": 0.19293334, |
| "memory(GiB)": 45.23, |
| "step": 600, |
| "train_speed(iter/s)": 0.776493 |
| }, |
| { |
| "epoch": 0.05772811918063315, |
| "grad_norm": 1.8671875, |
| "learning_rate": 7.697082557417753e-05, |
| "loss": 0.23722024, |
| "memory(GiB)": 45.23, |
| "step": 620, |
| "train_speed(iter/s)": 0.777184 |
| }, |
| { |
| "epoch": 0.0595903165735568, |
| "grad_norm": 1.3984375, |
| "learning_rate": 7.945375543140908e-05, |
| "loss": 0.21692863, |
| "memory(GiB)": 45.23, |
| "step": 640, |
| "train_speed(iter/s)": 0.77775 |
| }, |
| { |
| "epoch": 0.061452513966480445, |
| "grad_norm": 1.6171875, |
| "learning_rate": 8.193668528864061e-05, |
| "loss": 0.22430882, |
| "memory(GiB)": 45.23, |
| "step": 660, |
| "train_speed(iter/s)": 0.778898 |
| }, |
| { |
| "epoch": 0.0633147113594041, |
| "grad_norm": 1.5546875, |
| "learning_rate": 8.441961514587213e-05, |
| "loss": 0.22223544, |
| "memory(GiB)": 45.23, |
| "step": 680, |
| "train_speed(iter/s)": 0.7801 |
| }, |
| { |
| "epoch": 0.06517690875232775, |
| "grad_norm": 1.6015625, |
| "learning_rate": 8.690254500310366e-05, |
| "loss": 0.23334203, |
| "memory(GiB)": 45.23, |
| "step": 700, |
| "train_speed(iter/s)": 0.781002 |
| }, |
| { |
| "epoch": 0.0670391061452514, |
| "grad_norm": 1.7890625, |
| "learning_rate": 8.938547486033519e-05, |
| "loss": 0.20012813, |
| "memory(GiB)": 45.23, |
| "step": 720, |
| "train_speed(iter/s)": 0.781167 |
| }, |
| { |
| "epoch": 0.06890130353817504, |
| "grad_norm": 2.0625, |
| "learning_rate": 9.186840471756673e-05, |
| "loss": 0.22085505, |
| "memory(GiB)": 45.23, |
| "step": 740, |
| "train_speed(iter/s)": 0.780622 |
| }, |
| { |
| "epoch": 0.07076350093109869, |
| "grad_norm": 1.4140625, |
| "learning_rate": 9.435133457479826e-05, |
| "loss": 0.213342, |
| "memory(GiB)": 45.23, |
| "step": 760, |
| "train_speed(iter/s)": 0.781779 |
| }, |
| { |
| "epoch": 0.07262569832402235, |
| "grad_norm": 1.4921875, |
| "learning_rate": 9.68342644320298e-05, |
| "loss": 0.21715076, |
| "memory(GiB)": 45.23, |
| "step": 780, |
| "train_speed(iter/s)": 0.782338 |
| }, |
| { |
| "epoch": 0.074487895716946, |
| "grad_norm": 1.671875, |
| "learning_rate": 9.931719428926133e-05, |
| "loss": 0.22635069, |
| "memory(GiB)": 45.23, |
| "step": 800, |
| "train_speed(iter/s)": 0.782662 |
| }, |
| { |
| "epoch": 0.07635009310986965, |
| "grad_norm": 1.5234375, |
| "learning_rate": 0.00010180012414649287, |
| "loss": 0.20717359, |
| "memory(GiB)": 45.23, |
| "step": 820, |
| "train_speed(iter/s)": 0.783065 |
| }, |
| { |
| "epoch": 0.0782122905027933, |
| "grad_norm": 1.7890625, |
| "learning_rate": 0.0001042830540037244, |
| "loss": 0.22582912, |
| "memory(GiB)": 45.23, |
| "step": 840, |
| "train_speed(iter/s)": 0.783863 |
| }, |
| { |
| "epoch": 0.08007448789571694, |
| "grad_norm": 1.7578125, |
| "learning_rate": 0.00010676598386095593, |
| "loss": 0.22400272, |
| "memory(GiB)": 45.23, |
| "step": 860, |
| "train_speed(iter/s)": 0.784412 |
| }, |
| { |
| "epoch": 0.08193668528864059, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.00010924891371818746, |
| "loss": 0.23082299, |
| "memory(GiB)": 45.23, |
| "step": 880, |
| "train_speed(iter/s)": 0.784183 |
| }, |
| { |
| "epoch": 0.08379888268156424, |
| "grad_norm": 1.46875, |
| "learning_rate": 0.000111731843575419, |
| "loss": 0.23772116, |
| "memory(GiB)": 45.23, |
| "step": 900, |
| "train_speed(iter/s)": 0.784761 |
| }, |
| { |
| "epoch": 0.0856610800744879, |
| "grad_norm": 1.796875, |
| "learning_rate": 0.00011421477343265053, |
| "loss": 0.23279626, |
| "memory(GiB)": 45.23, |
| "step": 920, |
| "train_speed(iter/s)": 0.785722 |
| }, |
| { |
| "epoch": 0.08752327746741155, |
| "grad_norm": 1.59375, |
| "learning_rate": 0.00011669770328988207, |
| "loss": 0.22019596, |
| "memory(GiB)": 45.23, |
| "step": 940, |
| "train_speed(iter/s)": 0.786441 |
| }, |
| { |
| "epoch": 0.0893854748603352, |
| "grad_norm": 1.6171875, |
| "learning_rate": 0.0001191806331471136, |
| "loss": 0.22586164, |
| "memory(GiB)": 45.23, |
| "step": 960, |
| "train_speed(iter/s)": 0.787059 |
| }, |
| { |
| "epoch": 0.09124767225325885, |
| "grad_norm": 1.7265625, |
| "learning_rate": 0.00012166356300434513, |
| "loss": 0.23982925, |
| "memory(GiB)": 45.23, |
| "step": 980, |
| "train_speed(iter/s)": 0.786758 |
| }, |
| { |
| "epoch": 0.0931098696461825, |
| "grad_norm": 2.234375, |
| "learning_rate": 0.00012414649286157667, |
| "loss": 0.22995653, |
| "memory(GiB)": 45.23, |
| "step": 1000, |
| "train_speed(iter/s)": 0.787241 |
| }, |
| { |
| "epoch": 0.09497206703910614, |
| "grad_norm": 1.8125, |
| "learning_rate": 0.0001266294227188082, |
| "loss": 0.25532598, |
| "memory(GiB)": 45.23, |
| "step": 1020, |
| "train_speed(iter/s)": 0.788014 |
| }, |
| { |
| "epoch": 0.09683426443202979, |
| "grad_norm": 1.5703125, |
| "learning_rate": 0.00012911235257603973, |
| "loss": 0.22124271, |
| "memory(GiB)": 45.23, |
| "step": 1040, |
| "train_speed(iter/s)": 0.788227 |
| }, |
| { |
| "epoch": 0.09869646182495345, |
| "grad_norm": 1.578125, |
| "learning_rate": 0.00013159528243327127, |
| "loss": 0.22865233, |
| "memory(GiB)": 45.23, |
| "step": 1060, |
| "train_speed(iter/s)": 0.788926 |
| }, |
| { |
| "epoch": 0.1005586592178771, |
| "grad_norm": 1.71875, |
| "learning_rate": 0.0001340782122905028, |
| "loss": 0.25925579, |
| "memory(GiB)": 45.23, |
| "step": 1080, |
| "train_speed(iter/s)": 0.788974 |
| }, |
| { |
| "epoch": 0.10242085661080075, |
| "grad_norm": 1.71875, |
| "learning_rate": 0.00013656114214773433, |
| "loss": 0.24655545, |
| "memory(GiB)": 45.23, |
| "step": 1100, |
| "train_speed(iter/s)": 0.789406 |
| }, |
| { |
| "epoch": 0.1042830540037244, |
| "grad_norm": 1.4921875, |
| "learning_rate": 0.00013904407200496588, |
| "loss": 0.25059803, |
| "memory(GiB)": 45.23, |
| "step": 1120, |
| "train_speed(iter/s)": 0.789646 |
| }, |
| { |
| "epoch": 0.10614525139664804, |
| "grad_norm": 1.90625, |
| "learning_rate": 0.0001415270018621974, |
| "loss": 0.26447239, |
| "memory(GiB)": 45.23, |
| "step": 1140, |
| "train_speed(iter/s)": 0.790032 |
| }, |
| { |
| "epoch": 0.10800744878957169, |
| "grad_norm": 1.40625, |
| "learning_rate": 0.00014400993171942894, |
| "loss": 0.28571393, |
| "memory(GiB)": 45.23, |
| "step": 1160, |
| "train_speed(iter/s)": 0.790468 |
| }, |
| { |
| "epoch": 0.10986964618249534, |
| "grad_norm": 2.140625, |
| "learning_rate": 0.00014649286157666046, |
| "loss": 0.26289692, |
| "memory(GiB)": 45.23, |
| "step": 1180, |
| "train_speed(iter/s)": 0.790894 |
| }, |
| { |
| "epoch": 0.11173184357541899, |
| "grad_norm": 1.6875, |
| "learning_rate": 0.000148975791433892, |
| "loss": 0.26204472, |
| "memory(GiB)": 45.23, |
| "step": 1200, |
| "train_speed(iter/s)": 0.791024 |
| }, |
| { |
| "epoch": 0.11359404096834265, |
| "grad_norm": 1.84375, |
| "learning_rate": 0.00015145872129112352, |
| "loss": 0.29556036, |
| "memory(GiB)": 45.23, |
| "step": 1220, |
| "train_speed(iter/s)": 0.791423 |
| }, |
| { |
| "epoch": 0.1154562383612663, |
| "grad_norm": 1.6328125, |
| "learning_rate": 0.00015394165114835507, |
| "loss": 0.27372065, |
| "memory(GiB)": 45.23, |
| "step": 1240, |
| "train_speed(iter/s)": 0.79099 |
| }, |
| { |
| "epoch": 0.11731843575418995, |
| "grad_norm": 2.03125, |
| "learning_rate": 0.00015642458100558658, |
| "loss": 0.28276806, |
| "memory(GiB)": 45.23, |
| "step": 1260, |
| "train_speed(iter/s)": 0.791011 |
| }, |
| { |
| "epoch": 0.1191806331471136, |
| "grad_norm": 1.53125, |
| "learning_rate": 0.00015890751086281816, |
| "loss": 0.28502092, |
| "memory(GiB)": 45.23, |
| "step": 1280, |
| "train_speed(iter/s)": 0.791642 |
| }, |
| { |
| "epoch": 0.12104283054003724, |
| "grad_norm": 1.8046875, |
| "learning_rate": 0.00016139044072004967, |
| "loss": 0.29006686, |
| "memory(GiB)": 45.23, |
| "step": 1300, |
| "train_speed(iter/s)": 0.791155 |
| }, |
| { |
| "epoch": 0.12290502793296089, |
| "grad_norm": 1.9296875, |
| "learning_rate": 0.00016387337057728122, |
| "loss": 0.29351676, |
| "memory(GiB)": 45.23, |
| "step": 1320, |
| "train_speed(iter/s)": 0.79116 |
| }, |
| { |
| "epoch": 0.12476722532588454, |
| "grad_norm": 1.5546875, |
| "learning_rate": 0.00016635630043451273, |
| "loss": 0.30054893, |
| "memory(GiB)": 45.23, |
| "step": 1340, |
| "train_speed(iter/s)": 0.791442 |
| }, |
| { |
| "epoch": 0.1266294227188082, |
| "grad_norm": 1.6796875, |
| "learning_rate": 0.00016883923029174425, |
| "loss": 0.284747, |
| "memory(GiB)": 45.23, |
| "step": 1360, |
| "train_speed(iter/s)": 0.791642 |
| }, |
| { |
| "epoch": 0.12849162011173185, |
| "grad_norm": 1.84375, |
| "learning_rate": 0.0001713221601489758, |
| "loss": 0.28408041, |
| "memory(GiB)": 45.23, |
| "step": 1380, |
| "train_speed(iter/s)": 0.792049 |
| }, |
| { |
| "epoch": 0.1303538175046555, |
| "grad_norm": 2.1875, |
| "learning_rate": 0.00017380509000620731, |
| "loss": 0.30376158, |
| "memory(GiB)": 45.23, |
| "step": 1400, |
| "train_speed(iter/s)": 0.79165 |
| }, |
| { |
| "epoch": 0.13221601489757914, |
| "grad_norm": 1.6640625, |
| "learning_rate": 0.00017628801986343886, |
| "loss": 0.30356815, |
| "memory(GiB)": 45.23, |
| "step": 1420, |
| "train_speed(iter/s)": 0.791951 |
| }, |
| { |
| "epoch": 0.1340782122905028, |
| "grad_norm": 1.796875, |
| "learning_rate": 0.00017877094972067038, |
| "loss": 0.29253561, |
| "memory(GiB)": 45.23, |
| "step": 1440, |
| "train_speed(iter/s)": 0.792277 |
| }, |
| { |
| "epoch": 0.13594040968342644, |
| "grad_norm": 1.828125, |
| "learning_rate": 0.00018125387957790195, |
| "loss": 0.31560664, |
| "memory(GiB)": 45.23, |
| "step": 1460, |
| "train_speed(iter/s)": 0.792471 |
| }, |
| { |
| "epoch": 0.1378026070763501, |
| "grad_norm": 1.9609375, |
| "learning_rate": 0.00018373680943513347, |
| "loss": 0.34103947, |
| "memory(GiB)": 45.23, |
| "step": 1480, |
| "train_speed(iter/s)": 0.792883 |
| }, |
| { |
| "epoch": 0.13966480446927373, |
| "grad_norm": 1.8125, |
| "learning_rate": 0.000186219739292365, |
| "loss": 0.32740824, |
| "memory(GiB)": 45.23, |
| "step": 1500, |
| "train_speed(iter/s)": 0.793276 |
| }, |
| { |
| "epoch": 0.14152700186219738, |
| "grad_norm": 2.046875, |
| "learning_rate": 0.00018870266914959653, |
| "loss": 0.32411356, |
| "memory(GiB)": 45.23, |
| "step": 1520, |
| "train_speed(iter/s)": 0.793438 |
| }, |
| { |
| "epoch": 0.14338919925512103, |
| "grad_norm": 2.375, |
| "learning_rate": 0.00019118559900682807, |
| "loss": 0.33667076, |
| "memory(GiB)": 45.23, |
| "step": 1540, |
| "train_speed(iter/s)": 0.793845 |
| }, |
| { |
| "epoch": 0.1452513966480447, |
| "grad_norm": 1.7734375, |
| "learning_rate": 0.0001936685288640596, |
| "loss": 0.3620486, |
| "memory(GiB)": 45.23, |
| "step": 1560, |
| "train_speed(iter/s)": 0.793711 |
| }, |
| { |
| "epoch": 0.14711359404096835, |
| "grad_norm": 2.0, |
| "learning_rate": 0.00019615145872129114, |
| "loss": 0.35051939, |
| "memory(GiB)": 45.23, |
| "step": 1580, |
| "train_speed(iter/s)": 0.793904 |
| }, |
| { |
| "epoch": 0.148975791433892, |
| "grad_norm": 2.09375, |
| "learning_rate": 0.00019863438857852265, |
| "loss": 0.32083566, |
| "memory(GiB)": 45.23, |
| "step": 1600, |
| "train_speed(iter/s)": 0.794229 |
| }, |
| { |
| "epoch": 0.15083798882681565, |
| "grad_norm": 1.796875, |
| "learning_rate": 0.00019999995733650257, |
| "loss": 0.34570508, |
| "memory(GiB)": 45.23, |
| "step": 1620, |
| "train_speed(iter/s)": 0.794576 |
| }, |
| { |
| "epoch": 0.1527001862197393, |
| "grad_norm": 1.8046875, |
| "learning_rate": 0.00019999955703731584, |
| "loss": 0.37476125, |
| "memory(GiB)": 45.23, |
| "step": 1640, |
| "train_speed(iter/s)": 0.794974 |
| }, |
| { |
| "epoch": 0.15456238361266295, |
| "grad_norm": 3.046875, |
| "learning_rate": 0.00019999873537223758, |
| "loss": 0.34583857, |
| "memory(GiB)": 45.23, |
| "step": 1660, |
| "train_speed(iter/s)": 0.79501 |
| }, |
| { |
| "epoch": 0.1564245810055866, |
| "grad_norm": 2.21875, |
| "learning_rate": 0.00019999749234473, |
| "loss": 0.36670647, |
| "memory(GiB)": 45.23, |
| "step": 1680, |
| "train_speed(iter/s)": 0.794806 |
| }, |
| { |
| "epoch": 0.15828677839851024, |
| "grad_norm": 2.140625, |
| "learning_rate": 0.00019999582796003082, |
| "loss": 0.35735064, |
| "memory(GiB)": 61.09, |
| "step": 1700, |
| "train_speed(iter/s)": 0.794864 |
| }, |
| { |
| "epoch": 0.1601489757914339, |
| "grad_norm": 1.9375, |
| "learning_rate": 0.00019999374222515323, |
| "loss": 0.37268331, |
| "memory(GiB)": 61.09, |
| "step": 1720, |
| "train_speed(iter/s)": 0.794839 |
| }, |
| { |
| "epoch": 0.16201117318435754, |
| "grad_norm": 1.734375, |
| "learning_rate": 0.00019999123514888583, |
| "loss": 0.35943956, |
| "memory(GiB)": 61.09, |
| "step": 1740, |
| "train_speed(iter/s)": 0.795031 |
| }, |
| { |
| "epoch": 0.16387337057728119, |
| "grad_norm": 1.765625, |
| "learning_rate": 0.00019998830674179265, |
| "loss": 0.36007235, |
| "memory(GiB)": 61.09, |
| "step": 1760, |
| "train_speed(iter/s)": 0.794945 |
| }, |
| { |
| "epoch": 0.16573556797020483, |
| "grad_norm": 1.6953125, |
| "learning_rate": 0.00019998495701621302, |
| "loss": 0.36344244, |
| "memory(GiB)": 61.09, |
| "step": 1780, |
| "train_speed(iter/s)": 0.794986 |
| }, |
| { |
| "epoch": 0.16759776536312848, |
| "grad_norm": 1.84375, |
| "learning_rate": 0.00019998118598626164, |
| "loss": 0.37410173, |
| "memory(GiB)": 61.09, |
| "step": 1800, |
| "train_speed(iter/s)": 0.794841 |
| }, |
| { |
| "epoch": 0.16945996275605213, |
| "grad_norm": 1.9140625, |
| "learning_rate": 0.0001999769936678284, |
| "loss": 0.36024194, |
| "memory(GiB)": 61.09, |
| "step": 1820, |
| "train_speed(iter/s)": 0.795138 |
| }, |
| { |
| "epoch": 0.1713221601489758, |
| "grad_norm": 1.9765625, |
| "learning_rate": 0.00019997238007857834, |
| "loss": 0.36868992, |
| "memory(GiB)": 61.09, |
| "step": 1840, |
| "train_speed(iter/s)": 0.795246 |
| }, |
| { |
| "epoch": 0.17318435754189945, |
| "grad_norm": 2.0, |
| "learning_rate": 0.0001999673452379517, |
| "loss": 0.39632101, |
| "memory(GiB)": 61.09, |
| "step": 1860, |
| "train_speed(iter/s)": 0.795603 |
| }, |
| { |
| "epoch": 0.1750465549348231, |
| "grad_norm": 2.53125, |
| "learning_rate": 0.00019996188916716366, |
| "loss": 0.38238664, |
| "memory(GiB)": 61.09, |
| "step": 1880, |
| "train_speed(iter/s)": 0.795803 |
| }, |
| { |
| "epoch": 0.17690875232774675, |
| "grad_norm": 1.671875, |
| "learning_rate": 0.00019995601188920432, |
| "loss": 0.39695168, |
| "memory(GiB)": 61.09, |
| "step": 1900, |
| "train_speed(iter/s)": 0.795572 |
| }, |
| { |
| "epoch": 0.1787709497206704, |
| "grad_norm": 2.28125, |
| "learning_rate": 0.00019994971342883865, |
| "loss": 0.37256937, |
| "memory(GiB)": 61.09, |
| "step": 1920, |
| "train_speed(iter/s)": 0.795652 |
| }, |
| { |
| "epoch": 0.18063314711359404, |
| "grad_norm": 2.34375, |
| "learning_rate": 0.00019994299381260634, |
| "loss": 0.38835607, |
| "memory(GiB)": 61.09, |
| "step": 1940, |
| "train_speed(iter/s)": 0.795347 |
| }, |
| { |
| "epoch": 0.1824953445065177, |
| "grad_norm": 1.7890625, |
| "learning_rate": 0.00019993585306882165, |
| "loss": 0.37124941, |
| "memory(GiB)": 61.09, |
| "step": 1960, |
| "train_speed(iter/s)": 0.795281 |
| }, |
| { |
| "epoch": 0.18435754189944134, |
| "grad_norm": 1.625, |
| "learning_rate": 0.00019992829122757343, |
| "loss": 0.40771437, |
| "memory(GiB)": 61.09, |
| "step": 1980, |
| "train_speed(iter/s)": 0.795167 |
| }, |
| { |
| "epoch": 0.186219739292365, |
| "grad_norm": 1.6484375, |
| "learning_rate": 0.0001999203083207248, |
| "loss": 0.40394459, |
| "memory(GiB)": 61.09, |
| "step": 2000, |
| "train_speed(iter/s)": 0.795156 |
| }, |
| { |
| "epoch": 0.186219739292365, |
| "eval_loss": 0.4044577181339264, |
| "eval_runtime": 77.5215, |
| "eval_samples_per_second": 179.112, |
| "eval_steps_per_second": 1.406, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.18808193668528864, |
| "grad_norm": 1.8125, |
| "learning_rate": 0.00019991190438191315, |
| "loss": 0.39065728, |
| "memory(GiB)": 61.09, |
| "step": 2020, |
| "train_speed(iter/s)": 0.762049 |
| }, |
| { |
| "epoch": 0.18994413407821228, |
| "grad_norm": 2.28125, |
| "learning_rate": 0.00019990307944654995, |
| "loss": 0.38545542, |
| "memory(GiB)": 61.09, |
| "step": 2040, |
| "train_speed(iter/s)": 0.762203 |
| }, |
| { |
| "epoch": 0.19180633147113593, |
| "grad_norm": 1.6796875, |
| "learning_rate": 0.0001998938335518207, |
| "loss": 0.39323997, |
| "memory(GiB)": 61.09, |
| "step": 2060, |
| "train_speed(iter/s)": 0.762592 |
| }, |
| { |
| "epoch": 0.19366852886405958, |
| "grad_norm": 1.8125, |
| "learning_rate": 0.0001998841667366846, |
| "loss": 0.40772767, |
| "memory(GiB)": 61.09, |
| "step": 2080, |
| "train_speed(iter/s)": 0.76303 |
| }, |
| { |
| "epoch": 0.19553072625698323, |
| "grad_norm": 1.5859375, |
| "learning_rate": 0.00019987407904187445, |
| "loss": 0.37801023, |
| "memory(GiB)": 61.09, |
| "step": 2100, |
| "train_speed(iter/s)": 0.763483 |
| }, |
| { |
| "epoch": 0.1973929236499069, |
| "grad_norm": 1.9140625, |
| "learning_rate": 0.00019986357050989656, |
| "loss": 0.37646284, |
| "memory(GiB)": 61.09, |
| "step": 2120, |
| "train_speed(iter/s)": 0.764105 |
| }, |
| { |
| "epoch": 0.19925512104283055, |
| "grad_norm": 2.390625, |
| "learning_rate": 0.0001998526411850305, |
| "loss": 0.39433451, |
| "memory(GiB)": 61.09, |
| "step": 2140, |
| "train_speed(iter/s)": 0.764481 |
| }, |
| { |
| "epoch": 0.2011173184357542, |
| "grad_norm": 2.1875, |
| "learning_rate": 0.00019984129111332895, |
| "loss": 0.37200718, |
| "memory(GiB)": 61.09, |
| "step": 2160, |
| "train_speed(iter/s)": 0.764802 |
| }, |
| { |
| "epoch": 0.20297951582867785, |
| "grad_norm": 2.28125, |
| "learning_rate": 0.00019982952034261744, |
| "loss": 0.39085593, |
| "memory(GiB)": 61.09, |
| "step": 2180, |
| "train_speed(iter/s)": 0.765212 |
| }, |
| { |
| "epoch": 0.2048417132216015, |
| "grad_norm": 2.046875, |
| "learning_rate": 0.00019981732892249414, |
| "loss": 0.41396513, |
| "memory(GiB)": 61.09, |
| "step": 2200, |
| "train_speed(iter/s)": 0.765596 |
| }, |
| { |
| "epoch": 0.20670391061452514, |
| "grad_norm": 1.859375, |
| "learning_rate": 0.00019980471690432986, |
| "loss": 0.38944612, |
| "memory(GiB)": 61.09, |
| "step": 2220, |
| "train_speed(iter/s)": 0.76564 |
| }, |
| { |
| "epoch": 0.2085661080074488, |
| "grad_norm": 1.515625, |
| "learning_rate": 0.00019979168434126752, |
| "loss": 0.3887634, |
| "memory(GiB)": 61.09, |
| "step": 2240, |
| "train_speed(iter/s)": 0.765877 |
| }, |
| { |
| "epoch": 0.21042830540037244, |
| "grad_norm": 2.265625, |
| "learning_rate": 0.00019977823128822215, |
| "loss": 0.40166087, |
| "memory(GiB)": 61.09, |
| "step": 2260, |
| "train_speed(iter/s)": 0.766381 |
| }, |
| { |
| "epoch": 0.2122905027932961, |
| "grad_norm": 2.171875, |
| "learning_rate": 0.00019976435780188058, |
| "loss": 0.39366639, |
| "memory(GiB)": 61.09, |
| "step": 2280, |
| "train_speed(iter/s)": 0.766888 |
| }, |
| { |
| "epoch": 0.21415270018621974, |
| "grad_norm": 2.078125, |
| "learning_rate": 0.00019975006394070118, |
| "loss": 0.3975667, |
| "memory(GiB)": 61.09, |
| "step": 2300, |
| "train_speed(iter/s)": 0.767231 |
| }, |
| { |
| "epoch": 0.21601489757914338, |
| "grad_norm": 1.6171875, |
| "learning_rate": 0.00019973534976491374, |
| "loss": 0.4033679, |
| "memory(GiB)": 61.09, |
| "step": 2320, |
| "train_speed(iter/s)": 0.767385 |
| }, |
| { |
| "epoch": 0.21787709497206703, |
| "grad_norm": 2.0, |
| "learning_rate": 0.000199720215336519, |
| "loss": 0.39846911, |
| "memory(GiB)": 61.09, |
| "step": 2340, |
| "train_speed(iter/s)": 0.767652 |
| }, |
| { |
| "epoch": 0.21973929236499068, |
| "grad_norm": 1.9453125, |
| "learning_rate": 0.0001997046607192886, |
| "loss": 0.38012104, |
| "memory(GiB)": 61.09, |
| "step": 2360, |
| "train_speed(iter/s)": 0.767841 |
| }, |
| { |
| "epoch": 0.22160148975791433, |
| "grad_norm": 2.265625, |
| "learning_rate": 0.0001996886859787646, |
| "loss": 0.40880508, |
| "memory(GiB)": 61.09, |
| "step": 2380, |
| "train_speed(iter/s)": 0.768088 |
| }, |
| { |
| "epoch": 0.22346368715083798, |
| "grad_norm": 2.484375, |
| "learning_rate": 0.0001996722911822595, |
| "loss": 0.40787139, |
| "memory(GiB)": 61.09, |
| "step": 2400, |
| "train_speed(iter/s)": 0.76849 |
| }, |
| { |
| "epoch": 0.22532588454376165, |
| "grad_norm": 1.640625, |
| "learning_rate": 0.00019965547639885564, |
| "loss": 0.37931633, |
| "memory(GiB)": 61.09, |
| "step": 2420, |
| "train_speed(iter/s)": 0.768893 |
| }, |
| { |
| "epoch": 0.2271880819366853, |
| "grad_norm": 2.0, |
| "learning_rate": 0.0001996382416994051, |
| "loss": 0.40943422, |
| "memory(GiB)": 61.09, |
| "step": 2440, |
| "train_speed(iter/s)": 0.76922 |
| }, |
| { |
| "epoch": 0.22905027932960895, |
| "grad_norm": 1.7578125, |
| "learning_rate": 0.00019962058715652936, |
| "loss": 0.39480171, |
| "memory(GiB)": 61.09, |
| "step": 2460, |
| "train_speed(iter/s)": 0.769501 |
| }, |
| { |
| "epoch": 0.2309124767225326, |
| "grad_norm": 1.84375, |
| "learning_rate": 0.00019960251284461896, |
| "loss": 0.39054556, |
| "memory(GiB)": 61.09, |
| "step": 2480, |
| "train_speed(iter/s)": 0.769688 |
| }, |
| { |
| "epoch": 0.23277467411545624, |
| "grad_norm": 2.3125, |
| "learning_rate": 0.00019958401883983325, |
| "loss": 0.41470366, |
| "memory(GiB)": 61.09, |
| "step": 2500, |
| "train_speed(iter/s)": 0.769923 |
| }, |
| { |
| "epoch": 0.2346368715083799, |
| "grad_norm": 2.40625, |
| "learning_rate": 0.00019956510522009993, |
| "loss": 0.40362306, |
| "memory(GiB)": 61.09, |
| "step": 2520, |
| "train_speed(iter/s)": 0.770155 |
| }, |
| { |
| "epoch": 0.23649906890130354, |
| "grad_norm": 1.6328125, |
| "learning_rate": 0.00019954577206511497, |
| "loss": 0.41036525, |
| "memory(GiB)": 61.09, |
| "step": 2540, |
| "train_speed(iter/s)": 0.770394 |
| }, |
| { |
| "epoch": 0.2383612662942272, |
| "grad_norm": 2.859375, |
| "learning_rate": 0.00019952601945634203, |
| "loss": 0.37247095, |
| "memory(GiB)": 61.09, |
| "step": 2560, |
| "train_speed(iter/s)": 0.770637 |
| }, |
| { |
| "epoch": 0.24022346368715083, |
| "grad_norm": 1.859375, |
| "learning_rate": 0.00019950584747701223, |
| "loss": 0.39711597, |
| "memory(GiB)": 61.09, |
| "step": 2580, |
| "train_speed(iter/s)": 0.770483 |
| }, |
| { |
| "epoch": 0.24208566108007448, |
| "grad_norm": 2.3125, |
| "learning_rate": 0.00019948525621212376, |
| "loss": 0.40421638, |
| "memory(GiB)": 61.09, |
| "step": 2600, |
| "train_speed(iter/s)": 0.770381 |
| }, |
| { |
| "epoch": 0.24394785847299813, |
| "grad_norm": 1.9296875, |
| "learning_rate": 0.00019946424574844164, |
| "loss": 0.38328767, |
| "memory(GiB)": 61.09, |
| "step": 2620, |
| "train_speed(iter/s)": 0.770361 |
| }, |
| { |
| "epoch": 0.24581005586592178, |
| "grad_norm": 1.65625, |
| "learning_rate": 0.00019944281617449713, |
| "loss": 0.40005746, |
| "memory(GiB)": 61.09, |
| "step": 2640, |
| "train_speed(iter/s)": 0.770354 |
| }, |
| { |
| "epoch": 0.24767225325884543, |
| "grad_norm": 2.453125, |
| "learning_rate": 0.00019942096758058758, |
| "loss": 0.43671055, |
| "memory(GiB)": 61.09, |
| "step": 2660, |
| "train_speed(iter/s)": 0.770409 |
| }, |
| { |
| "epoch": 0.24953445065176907, |
| "grad_norm": 2.09375, |
| "learning_rate": 0.00019939870005877589, |
| "loss": 0.40949144, |
| "memory(GiB)": 61.09, |
| "step": 2680, |
| "train_speed(iter/s)": 0.770232 |
| }, |
| { |
| "epoch": 0.25139664804469275, |
| "grad_norm": 1.609375, |
| "learning_rate": 0.00019937601370289022, |
| "loss": 0.39216471, |
| "memory(GiB)": 61.09, |
| "step": 2700, |
| "train_speed(iter/s)": 0.7704 |
| }, |
| { |
| "epoch": 0.2532588454376164, |
| "grad_norm": 2.015625, |
| "learning_rate": 0.0001993529086085236, |
| "loss": 0.42032986, |
| "memory(GiB)": 61.09, |
| "step": 2720, |
| "train_speed(iter/s)": 0.770556 |
| }, |
| { |
| "epoch": 0.25512104283054005, |
| "grad_norm": 1.6640625, |
| "learning_rate": 0.0001993293848730334, |
| "loss": 0.39470253, |
| "memory(GiB)": 61.09, |
| "step": 2740, |
| "train_speed(iter/s)": 0.770526 |
| }, |
| { |
| "epoch": 0.2569832402234637, |
| "grad_norm": 1.625, |
| "learning_rate": 0.00019930544259554107, |
| "loss": 0.40104327, |
| "memory(GiB)": 61.09, |
| "step": 2760, |
| "train_speed(iter/s)": 0.77006 |
| }, |
| { |
| "epoch": 0.25884543761638734, |
| "grad_norm": 1.4296875, |
| "learning_rate": 0.00019928108187693166, |
| "loss": 0.38365974, |
| "memory(GiB)": 61.09, |
| "step": 2780, |
| "train_speed(iter/s)": 0.769969 |
| }, |
| { |
| "epoch": 0.260707635009311, |
| "grad_norm": 2.46875, |
| "learning_rate": 0.0001992563028198534, |
| "loss": 0.41973991, |
| "memory(GiB)": 61.09, |
| "step": 2800, |
| "train_speed(iter/s)": 0.770041 |
| }, |
| { |
| "epoch": 0.26256983240223464, |
| "grad_norm": 1.59375, |
| "learning_rate": 0.0001992311055287172, |
| "loss": 0.39436083, |
| "memory(GiB)": 61.09, |
| "step": 2820, |
| "train_speed(iter/s)": 0.769916 |
| }, |
| { |
| "epoch": 0.2644320297951583, |
| "grad_norm": 1.84375, |
| "learning_rate": 0.00019920549010969633, |
| "loss": 0.40017633, |
| "memory(GiB)": 61.09, |
| "step": 2840, |
| "train_speed(iter/s)": 0.770067 |
| }, |
| { |
| "epoch": 0.26629422718808193, |
| "grad_norm": 1.65625, |
| "learning_rate": 0.00019917945667072596, |
| "loss": 0.41224184, |
| "memory(GiB)": 61.09, |
| "step": 2860, |
| "train_speed(iter/s)": 0.770021 |
| }, |
| { |
| "epoch": 0.2681564245810056, |
| "grad_norm": 2.171875, |
| "learning_rate": 0.00019915300532150255, |
| "loss": 0.39522376, |
| "memory(GiB)": 61.09, |
| "step": 2880, |
| "train_speed(iter/s)": 0.769932 |
| }, |
| { |
| "epoch": 0.27001862197392923, |
| "grad_norm": 1.7421875, |
| "learning_rate": 0.00019912613617348359, |
| "loss": 0.3813251, |
| "memory(GiB)": 61.09, |
| "step": 2900, |
| "train_speed(iter/s)": 0.770051 |
| }, |
| { |
| "epoch": 0.2718808193668529, |
| "grad_norm": 2.0625, |
| "learning_rate": 0.000199098849339887, |
| "loss": 0.40877409, |
| "memory(GiB)": 61.09, |
| "step": 2920, |
| "train_speed(iter/s)": 0.77 |
| }, |
| { |
| "epoch": 0.2737430167597765, |
| "grad_norm": 1.8515625, |
| "learning_rate": 0.00019907114493569067, |
| "loss": 0.3886503, |
| "memory(GiB)": 61.09, |
| "step": 2940, |
| "train_speed(iter/s)": 0.769917 |
| }, |
| { |
| "epoch": 0.2756052141527002, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.00019904302307763208, |
| "loss": 0.39552131, |
| "memory(GiB)": 61.09, |
| "step": 2960, |
| "train_speed(iter/s)": 0.769804 |
| }, |
| { |
| "epoch": 0.2774674115456238, |
| "grad_norm": 2.140625, |
| "learning_rate": 0.00019901448388420762, |
| "loss": 0.42156706, |
| "memory(GiB)": 61.09, |
| "step": 2980, |
| "train_speed(iter/s)": 0.769718 |
| }, |
| { |
| "epoch": 0.27932960893854747, |
| "grad_norm": 1.984375, |
| "learning_rate": 0.00019898552747567226, |
| "loss": 0.42330751, |
| "memory(GiB)": 61.09, |
| "step": 3000, |
| "train_speed(iter/s)": 0.769751 |
| }, |
| { |
| "epoch": 0.2811918063314711, |
| "grad_norm": 1.75, |
| "learning_rate": 0.00019895615397403903, |
| "loss": 0.40717449, |
| "memory(GiB)": 61.09, |
| "step": 3020, |
| "train_speed(iter/s)": 0.769716 |
| }, |
| { |
| "epoch": 0.28305400372439476, |
| "grad_norm": 1.8828125, |
| "learning_rate": 0.00019892636350307832, |
| "loss": 0.40095501, |
| "memory(GiB)": 61.09, |
| "step": 3040, |
| "train_speed(iter/s)": 0.769668 |
| }, |
| { |
| "epoch": 0.2849162011173184, |
| "grad_norm": 1.8359375, |
| "learning_rate": 0.0001988961561883176, |
| "loss": 0.40677986, |
| "memory(GiB)": 61.09, |
| "step": 3060, |
| "train_speed(iter/s)": 0.769698 |
| }, |
| { |
| "epoch": 0.28677839851024206, |
| "grad_norm": 1.96875, |
| "learning_rate": 0.00019886553215704074, |
| "loss": 0.43705454, |
| "memory(GiB)": 61.09, |
| "step": 3080, |
| "train_speed(iter/s)": 0.769667 |
| }, |
| { |
| "epoch": 0.2886405959031657, |
| "grad_norm": 2.015625, |
| "learning_rate": 0.00019883449153828756, |
| "loss": 0.42250085, |
| "memory(GiB)": 61.09, |
| "step": 3100, |
| "train_speed(iter/s)": 0.769504 |
| }, |
| { |
| "epoch": 0.2905027932960894, |
| "grad_norm": 2.015625, |
| "learning_rate": 0.0001988030344628532, |
| "loss": 0.4198226, |
| "memory(GiB)": 61.09, |
| "step": 3120, |
| "train_speed(iter/s)": 0.76958 |
| }, |
| { |
| "epoch": 0.29236499068901306, |
| "grad_norm": 2.4375, |
| "learning_rate": 0.00019877116106328764, |
| "loss": 0.39872463, |
| "memory(GiB)": 61.09, |
| "step": 3140, |
| "train_speed(iter/s)": 0.769603 |
| }, |
| { |
| "epoch": 0.2942271880819367, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.0001987388714738951, |
| "loss": 0.42246108, |
| "memory(GiB)": 61.09, |
| "step": 3160, |
| "train_speed(iter/s)": 0.769508 |
| }, |
| { |
| "epoch": 0.29608938547486036, |
| "grad_norm": 1.8984375, |
| "learning_rate": 0.00019870616583073352, |
| "loss": 0.40915985, |
| "memory(GiB)": 61.09, |
| "step": 3180, |
| "train_speed(iter/s)": 0.769716 |
| }, |
| { |
| "epoch": 0.297951582867784, |
| "grad_norm": 1.5703125, |
| "learning_rate": 0.0001986730442716139, |
| "loss": 0.3973412, |
| "memory(GiB)": 61.09, |
| "step": 3200, |
| "train_speed(iter/s)": 0.769624 |
| }, |
| { |
| "epoch": 0.29981378026070765, |
| "grad_norm": 2.03125, |
| "learning_rate": 0.00019863950693609986, |
| "loss": 0.40871854, |
| "memory(GiB)": 61.09, |
| "step": 3220, |
| "train_speed(iter/s)": 0.769628 |
| }, |
| { |
| "epoch": 0.3016759776536313, |
| "grad_norm": 3.96875, |
| "learning_rate": 0.0001986055539655069, |
| "loss": 0.40967684, |
| "memory(GiB)": 61.09, |
| "step": 3240, |
| "train_speed(iter/s)": 0.769597 |
| }, |
| { |
| "epoch": 0.30353817504655495, |
| "grad_norm": 1.546875, |
| "learning_rate": 0.00019857118550290186, |
| "loss": 0.40765038, |
| "memory(GiB)": 61.09, |
| "step": 3260, |
| "train_speed(iter/s)": 0.769707 |
| }, |
| { |
| "epoch": 0.3054003724394786, |
| "grad_norm": 1.46875, |
| "learning_rate": 0.0001985364016931024, |
| "loss": 0.39425554, |
| "memory(GiB)": 61.09, |
| "step": 3280, |
| "train_speed(iter/s)": 0.769702 |
| }, |
| { |
| "epoch": 0.30726256983240224, |
| "grad_norm": 1.734375, |
| "learning_rate": 0.00019850120268267624, |
| "loss": 0.43635969, |
| "memory(GiB)": 61.09, |
| "step": 3300, |
| "train_speed(iter/s)": 0.769851 |
| }, |
| { |
| "epoch": 0.3091247672253259, |
| "grad_norm": 1.984375, |
| "learning_rate": 0.0001984655886199407, |
| "loss": 0.38309081, |
| "memory(GiB)": 61.09, |
| "step": 3320, |
| "train_speed(iter/s)": 0.7701 |
| }, |
| { |
| "epoch": 0.31098696461824954, |
| "grad_norm": 1.7578125, |
| "learning_rate": 0.00019842955965496196, |
| "loss": 0.39160743, |
| "memory(GiB)": 61.09, |
| "step": 3340, |
| "train_speed(iter/s)": 0.770116 |
| }, |
| { |
| "epoch": 0.3128491620111732, |
| "grad_norm": 1.703125, |
| "learning_rate": 0.00019839311593955442, |
| "loss": 0.3894845, |
| "memory(GiB)": 61.09, |
| "step": 3360, |
| "train_speed(iter/s)": 0.770262 |
| }, |
| { |
| "epoch": 0.31471135940409684, |
| "grad_norm": 1.921875, |
| "learning_rate": 0.00019835625762728017, |
| "loss": 0.40365009, |
| "memory(GiB)": 61.09, |
| "step": 3380, |
| "train_speed(iter/s)": 0.770237 |
| }, |
| { |
| "epoch": 0.3165735567970205, |
| "grad_norm": 1.9140625, |
| "learning_rate": 0.00019831898487344827, |
| "loss": 0.39198034, |
| "memory(GiB)": 61.09, |
| "step": 3400, |
| "train_speed(iter/s)": 0.770325 |
| }, |
| { |
| "epoch": 0.31843575418994413, |
| "grad_norm": 1.8125, |
| "learning_rate": 0.00019828129783511404, |
| "loss": 0.40903959, |
| "memory(GiB)": 61.09, |
| "step": 3420, |
| "train_speed(iter/s)": 0.770274 |
| }, |
| { |
| "epoch": 0.3202979515828678, |
| "grad_norm": 1.78125, |
| "learning_rate": 0.00019824319667107859, |
| "loss": 0.39606228, |
| "memory(GiB)": 61.09, |
| "step": 3440, |
| "train_speed(iter/s)": 0.770391 |
| }, |
| { |
| "epoch": 0.3221601489757914, |
| "grad_norm": 1.703125, |
| "learning_rate": 0.00019820468154188781, |
| "loss": 0.40229883, |
| "memory(GiB)": 61.09, |
| "step": 3460, |
| "train_speed(iter/s)": 0.770531 |
| }, |
| { |
| "epoch": 0.3240223463687151, |
| "grad_norm": 1.5546875, |
| "learning_rate": 0.00019816575260983221, |
| "loss": 0.42437396, |
| "memory(GiB)": 61.09, |
| "step": 3480, |
| "train_speed(iter/s)": 0.770378 |
| }, |
| { |
| "epoch": 0.3258845437616387, |
| "grad_norm": 1.890625, |
| "learning_rate": 0.00019812641003894563, |
| "loss": 0.38859506, |
| "memory(GiB)": 61.09, |
| "step": 3500, |
| "train_speed(iter/s)": 0.770398 |
| }, |
| { |
| "epoch": 0.32774674115456237, |
| "grad_norm": 1.96875, |
| "learning_rate": 0.000198086653995005, |
| "loss": 0.40636191, |
| "memory(GiB)": 61.09, |
| "step": 3520, |
| "train_speed(iter/s)": 0.77057 |
| }, |
| { |
| "epoch": 0.329608938547486, |
| "grad_norm": 1.671875, |
| "learning_rate": 0.00019804648464552947, |
| "loss": 0.3950357, |
| "memory(GiB)": 61.09, |
| "step": 3540, |
| "train_speed(iter/s)": 0.770758 |
| }, |
| { |
| "epoch": 0.33147113594040967, |
| "grad_norm": 2.25, |
| "learning_rate": 0.00019800590215977978, |
| "loss": 0.39195304, |
| "memory(GiB)": 61.09, |
| "step": 3560, |
| "train_speed(iter/s)": 0.770732 |
| }, |
| { |
| "epoch": 0.3333333333333333, |
| "grad_norm": 2.125, |
| "learning_rate": 0.0001979649067087574, |
| "loss": 0.39345875, |
| "memory(GiB)": 61.09, |
| "step": 3580, |
| "train_speed(iter/s)": 0.770863 |
| }, |
| { |
| "epoch": 0.33519553072625696, |
| "grad_norm": 2.21875, |
| "learning_rate": 0.00019792349846520395, |
| "loss": 0.42576008, |
| "memory(GiB)": 61.09, |
| "step": 3600, |
| "train_speed(iter/s)": 0.770989 |
| }, |
| { |
| "epoch": 0.3370577281191806, |
| "grad_norm": 1.6875, |
| "learning_rate": 0.00019788167760360038, |
| "loss": 0.39812212, |
| "memory(GiB)": 61.09, |
| "step": 3620, |
| "train_speed(iter/s)": 0.770888 |
| }, |
| { |
| "epoch": 0.33891992551210426, |
| "grad_norm": 1.6015625, |
| "learning_rate": 0.00019783944430016637, |
| "loss": 0.39472914, |
| "memory(GiB)": 61.09, |
| "step": 3640, |
| "train_speed(iter/s)": 0.77104 |
| }, |
| { |
| "epoch": 0.3407821229050279, |
| "grad_norm": 1.5703125, |
| "learning_rate": 0.00019779679873285942, |
| "loss": 0.37525396, |
| "memory(GiB)": 61.09, |
| "step": 3660, |
| "train_speed(iter/s)": 0.771096 |
| }, |
| { |
| "epoch": 0.3426443202979516, |
| "grad_norm": 2.40625, |
| "learning_rate": 0.00019775374108137422, |
| "loss": 0.38320711, |
| "memory(GiB)": 61.09, |
| "step": 3680, |
| "train_speed(iter/s)": 0.771183 |
| }, |
| { |
| "epoch": 0.34450651769087526, |
| "grad_norm": 1.96875, |
| "learning_rate": 0.0001977102715271418, |
| "loss": 0.43915482, |
| "memory(GiB)": 61.09, |
| "step": 3700, |
| "train_speed(iter/s)": 0.771154 |
| }, |
| { |
| "epoch": 0.3463687150837989, |
| "grad_norm": 2.0625, |
| "learning_rate": 0.00019766639025332888, |
| "loss": 0.38033762, |
| "memory(GiB)": 61.09, |
| "step": 3720, |
| "train_speed(iter/s)": 0.771208 |
| }, |
| { |
| "epoch": 0.34823091247672255, |
| "grad_norm": 1.6640625, |
| "learning_rate": 0.00019762209744483698, |
| "loss": 0.39966769, |
| "memory(GiB)": 61.09, |
| "step": 3740, |
| "train_speed(iter/s)": 0.771161 |
| }, |
| { |
| "epoch": 0.3500931098696462, |
| "grad_norm": 1.8046875, |
| "learning_rate": 0.0001975773932883017, |
| "loss": 0.40170197, |
| "memory(GiB)": 61.09, |
| "step": 3760, |
| "train_speed(iter/s)": 0.77126 |
| }, |
| { |
| "epoch": 0.35195530726256985, |
| "grad_norm": 1.7265625, |
| "learning_rate": 0.00019753227797209198, |
| "loss": 0.38694088, |
| "memory(GiB)": 61.09, |
| "step": 3780, |
| "train_speed(iter/s)": 0.771333 |
| }, |
| { |
| "epoch": 0.3538175046554935, |
| "grad_norm": 2.71875, |
| "learning_rate": 0.00019748675168630917, |
| "loss": 0.41248221, |
| "memory(GiB)": 61.09, |
| "step": 3800, |
| "train_speed(iter/s)": 0.771472 |
| }, |
| { |
| "epoch": 0.35567970204841715, |
| "grad_norm": 3.296875, |
| "learning_rate": 0.0001974408146227864, |
| "loss": 0.39216695, |
| "memory(GiB)": 61.09, |
| "step": 3820, |
| "train_speed(iter/s)": 0.771659 |
| }, |
| { |
| "epoch": 0.3575418994413408, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.0001973944669750876, |
| "loss": 0.39854093, |
| "memory(GiB)": 61.09, |
| "step": 3840, |
| "train_speed(iter/s)": 0.771931 |
| }, |
| { |
| "epoch": 0.35940409683426444, |
| "grad_norm": 1.671875, |
| "learning_rate": 0.00019734770893850686, |
| "loss": 0.40031991, |
| "memory(GiB)": 61.09, |
| "step": 3860, |
| "train_speed(iter/s)": 0.772166 |
| }, |
| { |
| "epoch": 0.3612662942271881, |
| "grad_norm": 1.9375, |
| "learning_rate": 0.00019730054071006748, |
| "loss": 0.4241601, |
| "memory(GiB)": 61.09, |
| "step": 3880, |
| "train_speed(iter/s)": 0.772192 |
| }, |
| { |
| "epoch": 0.36312849162011174, |
| "grad_norm": 1.7109375, |
| "learning_rate": 0.0001972529624885211, |
| "loss": 0.38440259, |
| "memory(GiB)": 61.09, |
| "step": 3900, |
| "train_speed(iter/s)": 0.772255 |
| }, |
| { |
| "epoch": 0.3649906890130354, |
| "grad_norm": 1.84375, |
| "learning_rate": 0.00019720497447434706, |
| "loss": 0.4149951, |
| "memory(GiB)": 61.09, |
| "step": 3920, |
| "train_speed(iter/s)": 0.77238 |
| }, |
| { |
| "epoch": 0.36685288640595903, |
| "grad_norm": 1.953125, |
| "learning_rate": 0.00019715657686975138, |
| "loss": 0.37301335, |
| "memory(GiB)": 61.09, |
| "step": 3940, |
| "train_speed(iter/s)": 0.772593 |
| }, |
| { |
| "epoch": 0.3687150837988827, |
| "grad_norm": 1.953125, |
| "learning_rate": 0.00019710776987866594, |
| "loss": 0.40297618, |
| "memory(GiB)": 61.09, |
| "step": 3960, |
| "train_speed(iter/s)": 0.772734 |
| }, |
| { |
| "epoch": 0.37057728119180633, |
| "grad_norm": 1.9375, |
| "learning_rate": 0.00019705855370674767, |
| "loss": 0.39141912, |
| "memory(GiB)": 61.09, |
| "step": 3980, |
| "train_speed(iter/s)": 0.772939 |
| }, |
| { |
| "epoch": 0.37243947858473, |
| "grad_norm": 1.96875, |
| "learning_rate": 0.00019700892856137765, |
| "loss": 0.39458523, |
| "memory(GiB)": 61.09, |
| "step": 4000, |
| "train_speed(iter/s)": 0.773013 |
| }, |
| { |
| "epoch": 0.37243947858473, |
| "eval_loss": 0.43771594762802124, |
| "eval_runtime": 76.1164, |
| "eval_samples_per_second": 182.418, |
| "eval_steps_per_second": 1.432, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.3743016759776536, |
| "grad_norm": 1.9140625, |
| "learning_rate": 0.00019695889465166024, |
| "loss": 0.40729074, |
| "memory(GiB)": 61.09, |
| "step": 4020, |
| "train_speed(iter/s)": 0.757361 |
| }, |
| { |
| "epoch": 0.3761638733705773, |
| "grad_norm": 1.7890625, |
| "learning_rate": 0.0001969084521884222, |
| "loss": 0.40453234, |
| "memory(GiB)": 61.09, |
| "step": 4040, |
| "train_speed(iter/s)": 0.757463 |
| }, |
| { |
| "epoch": 0.3780260707635009, |
| "grad_norm": 1.9375, |
| "learning_rate": 0.00019685760138421181, |
| "loss": 0.37667882, |
| "memory(GiB)": 61.09, |
| "step": 4060, |
| "train_speed(iter/s)": 0.757733 |
| }, |
| { |
| "epoch": 0.37988826815642457, |
| "grad_norm": 2.09375, |
| "learning_rate": 0.00019680634245329798, |
| "loss": 0.38365791, |
| "memory(GiB)": 61.09, |
| "step": 4080, |
| "train_speed(iter/s)": 0.757943 |
| }, |
| { |
| "epoch": 0.3817504655493482, |
| "grad_norm": 1.7109375, |
| "learning_rate": 0.0001967546756116693, |
| "loss": 0.42101908, |
| "memory(GiB)": 61.09, |
| "step": 4100, |
| "train_speed(iter/s)": 0.758279 |
| }, |
| { |
| "epoch": 0.38361266294227186, |
| "grad_norm": 2.09375, |
| "learning_rate": 0.0001967026010770332, |
| "loss": 0.40190611, |
| "memory(GiB)": 61.09, |
| "step": 4120, |
| "train_speed(iter/s)": 0.758514 |
| }, |
| { |
| "epoch": 0.3854748603351955, |
| "grad_norm": 1.84375, |
| "learning_rate": 0.00019665011906881493, |
| "loss": 0.41981277, |
| "memory(GiB)": 61.09, |
| "step": 4140, |
| "train_speed(iter/s)": 0.75872 |
| }, |
| { |
| "epoch": 0.38733705772811916, |
| "grad_norm": 1.9765625, |
| "learning_rate": 0.0001965972298081568, |
| "loss": 0.38099627, |
| "memory(GiB)": 61.09, |
| "step": 4160, |
| "train_speed(iter/s)": 0.758977 |
| }, |
| { |
| "epoch": 0.3891992551210428, |
| "grad_norm": 1.875, |
| "learning_rate": 0.0001965439335179171, |
| "loss": 0.3816371, |
| "memory(GiB)": 61.09, |
| "step": 4180, |
| "train_speed(iter/s)": 0.759186 |
| }, |
| { |
| "epoch": 0.39106145251396646, |
| "grad_norm": 1.6796875, |
| "learning_rate": 0.00019649023042266924, |
| "loss": 0.39355574, |
| "memory(GiB)": 61.09, |
| "step": 4200, |
| "train_speed(iter/s)": 0.759226 |
| }, |
| { |
| "epoch": 0.3929236499068901, |
| "grad_norm": 1.7890625, |
| "learning_rate": 0.00019643612074870067, |
| "loss": 0.42333245, |
| "memory(GiB)": 61.09, |
| "step": 4220, |
| "train_speed(iter/s)": 0.75949 |
| }, |
| { |
| "epoch": 0.3947858472998138, |
| "grad_norm": 1.6015625, |
| "learning_rate": 0.00019638160472401216, |
| "loss": 0.38143654, |
| "memory(GiB)": 61.09, |
| "step": 4240, |
| "train_speed(iter/s)": 0.759673 |
| }, |
| { |
| "epoch": 0.39664804469273746, |
| "grad_norm": 1.7890625, |
| "learning_rate": 0.00019632668257831663, |
| "loss": 0.40620303, |
| "memory(GiB)": 61.09, |
| "step": 4260, |
| "train_speed(iter/s)": 0.759941 |
| }, |
| { |
| "epoch": 0.3985102420856611, |
| "grad_norm": 2.0625, |
| "learning_rate": 0.00019627135454303835, |
| "loss": 0.40267248, |
| "memory(GiB)": 61.09, |
| "step": 4280, |
| "train_speed(iter/s)": 0.760262 |
| }, |
| { |
| "epoch": 0.40037243947858475, |
| "grad_norm": 1.8203125, |
| "learning_rate": 0.00019621562085131179, |
| "loss": 0.40306973, |
| "memory(GiB)": 61.09, |
| "step": 4300, |
| "train_speed(iter/s)": 0.760521 |
| }, |
| { |
| "epoch": 0.4022346368715084, |
| "grad_norm": 2.125, |
| "learning_rate": 0.00019615948173798073, |
| "loss": 0.38690736, |
| "memory(GiB)": 61.09, |
| "step": 4320, |
| "train_speed(iter/s)": 0.76073 |
| }, |
| { |
| "epoch": 0.40409683426443205, |
| "grad_norm": 1.8828125, |
| "learning_rate": 0.0001961029374395973, |
| "loss": 0.40119696, |
| "memory(GiB)": 61.09, |
| "step": 4340, |
| "train_speed(iter/s)": 0.760992 |
| }, |
| { |
| "epoch": 0.4059590316573557, |
| "grad_norm": 1.8203125, |
| "learning_rate": 0.00019604598819442098, |
| "loss": 0.3982002, |
| "memory(GiB)": 61.09, |
| "step": 4360, |
| "train_speed(iter/s)": 0.761273 |
| }, |
| { |
| "epoch": 0.40782122905027934, |
| "grad_norm": 1.484375, |
| "learning_rate": 0.00019598863424241753, |
| "loss": 0.36936131, |
| "memory(GiB)": 61.09, |
| "step": 4380, |
| "train_speed(iter/s)": 0.76156 |
| }, |
| { |
| "epoch": 0.409683426443203, |
| "grad_norm": 2.015625, |
| "learning_rate": 0.000195930875825258, |
| "loss": 0.37518866, |
| "memory(GiB)": 61.09, |
| "step": 4400, |
| "train_speed(iter/s)": 0.76183 |
| }, |
| { |
| "epoch": 0.41154562383612664, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.0001958727131863177, |
| "loss": 0.39472764, |
| "memory(GiB)": 61.09, |
| "step": 4420, |
| "train_speed(iter/s)": 0.762004 |
| }, |
| { |
| "epoch": 0.4134078212290503, |
| "grad_norm": 1.6015625, |
| "learning_rate": 0.00019581414657067527, |
| "loss": 0.38476, |
| "memory(GiB)": 61.09, |
| "step": 4440, |
| "train_speed(iter/s)": 0.76225 |
| }, |
| { |
| "epoch": 0.41527001862197394, |
| "grad_norm": 2.03125, |
| "learning_rate": 0.00019575517622511153, |
| "loss": 0.44369678, |
| "memory(GiB)": 61.09, |
| "step": 4460, |
| "train_speed(iter/s)": 0.762464 |
| }, |
| { |
| "epoch": 0.4171322160148976, |
| "grad_norm": 1.8046875, |
| "learning_rate": 0.00019569580239810853, |
| "loss": 0.39095647, |
| "memory(GiB)": 61.09, |
| "step": 4480, |
| "train_speed(iter/s)": 0.762511 |
| }, |
| { |
| "epoch": 0.41899441340782123, |
| "grad_norm": 1.890625, |
| "learning_rate": 0.00019563602533984842, |
| "loss": 0.39244399, |
| "memory(GiB)": 61.09, |
| "step": 4500, |
| "train_speed(iter/s)": 0.762601 |
| }, |
| { |
| "epoch": 0.4208566108007449, |
| "grad_norm": 2.171875, |
| "learning_rate": 0.00019557584530221245, |
| "loss": 0.3898438, |
| "memory(GiB)": 61.09, |
| "step": 4520, |
| "train_speed(iter/s)": 0.762751 |
| }, |
| { |
| "epoch": 0.4227188081936685, |
| "grad_norm": 1.546875, |
| "learning_rate": 0.0001955152625387799, |
| "loss": 0.38730087, |
| "memory(GiB)": 61.09, |
| "step": 4540, |
| "train_speed(iter/s)": 0.762907 |
| }, |
| { |
| "epoch": 0.4245810055865922, |
| "grad_norm": 2.15625, |
| "learning_rate": 0.00019545427730482696, |
| "loss": 0.4196455, |
| "memory(GiB)": 61.09, |
| "step": 4560, |
| "train_speed(iter/s)": 0.763065 |
| }, |
| { |
| "epoch": 0.4264432029795158, |
| "grad_norm": 1.90625, |
| "learning_rate": 0.0001953928898573258, |
| "loss": 0.36989145, |
| "memory(GiB)": 61.09, |
| "step": 4580, |
| "train_speed(iter/s)": 0.763243 |
| }, |
| { |
| "epoch": 0.42830540037243947, |
| "grad_norm": 1.78125, |
| "learning_rate": 0.0001953311004549433, |
| "loss": 0.38576865, |
| "memory(GiB)": 61.09, |
| "step": 4600, |
| "train_speed(iter/s)": 0.763397 |
| }, |
| { |
| "epoch": 0.4301675977653631, |
| "grad_norm": 1.6796875, |
| "learning_rate": 0.0001952689093580401, |
| "loss": 0.37395325, |
| "memory(GiB)": 61.09, |
| "step": 4620, |
| "train_speed(iter/s)": 0.763554 |
| }, |
| { |
| "epoch": 0.43202979515828677, |
| "grad_norm": 1.7734375, |
| "learning_rate": 0.00019520631682866943, |
| "loss": 0.35659883, |
| "memory(GiB)": 61.09, |
| "step": 4640, |
| "train_speed(iter/s)": 0.763689 |
| }, |
| { |
| "epoch": 0.4338919925512104, |
| "grad_norm": 2.015625, |
| "learning_rate": 0.000195143323130576, |
| "loss": 0.37833457, |
| "memory(GiB)": 61.09, |
| "step": 4660, |
| "train_speed(iter/s)": 0.763955 |
| }, |
| { |
| "epoch": 0.43575418994413406, |
| "grad_norm": 1.3984375, |
| "learning_rate": 0.00019507992852919496, |
| "loss": 0.38467407, |
| "memory(GiB)": 61.09, |
| "step": 4680, |
| "train_speed(iter/s)": 0.76422 |
| }, |
| { |
| "epoch": 0.4376163873370577, |
| "grad_norm": 1.953125, |
| "learning_rate": 0.00019501613329165072, |
| "loss": 0.42341266, |
| "memory(GiB)": 61.09, |
| "step": 4700, |
| "train_speed(iter/s)": 0.764381 |
| }, |
| { |
| "epoch": 0.43947858472998136, |
| "grad_norm": 1.9140625, |
| "learning_rate": 0.00019495193768675586, |
| "loss": 0.39320512, |
| "memory(GiB)": 61.09, |
| "step": 4720, |
| "train_speed(iter/s)": 0.76442 |
| }, |
| { |
| "epoch": 0.441340782122905, |
| "grad_norm": 2.140625, |
| "learning_rate": 0.00019488734198500992, |
| "loss": 0.39242547, |
| "memory(GiB)": 61.09, |
| "step": 4740, |
| "train_speed(iter/s)": 0.764554 |
| }, |
| { |
| "epoch": 0.44320297951582865, |
| "grad_norm": 2.234375, |
| "learning_rate": 0.0001948223464585984, |
| "loss": 0.41076765, |
| "memory(GiB)": 61.09, |
| "step": 4760, |
| "train_speed(iter/s)": 0.764645 |
| }, |
| { |
| "epoch": 0.4450651769087523, |
| "grad_norm": 1.5234375, |
| "learning_rate": 0.0001947569513813915, |
| "loss": 0.39205194, |
| "memory(GiB)": 61.09, |
| "step": 4780, |
| "train_speed(iter/s)": 0.764725 |
| }, |
| { |
| "epoch": 0.44692737430167595, |
| "grad_norm": 1.5703125, |
| "learning_rate": 0.00019469115702894295, |
| "loss": 0.41007242, |
| "memory(GiB)": 61.09, |
| "step": 4800, |
| "train_speed(iter/s)": 0.764778 |
| }, |
| { |
| "epoch": 0.44878957169459965, |
| "grad_norm": 1.59375, |
| "learning_rate": 0.00019462496367848894, |
| "loss": 0.40207219, |
| "memory(GiB)": 61.09, |
| "step": 4820, |
| "train_speed(iter/s)": 0.764936 |
| }, |
| { |
| "epoch": 0.4506517690875233, |
| "grad_norm": 1.7109375, |
| "learning_rate": 0.00019455837160894693, |
| "loss": 0.37529342, |
| "memory(GiB)": 61.09, |
| "step": 4840, |
| "train_speed(iter/s)": 0.764978 |
| }, |
| { |
| "epoch": 0.45251396648044695, |
| "grad_norm": 1.7265625, |
| "learning_rate": 0.00019449138110091445, |
| "loss": 0.38617492, |
| "memory(GiB)": 61.09, |
| "step": 4860, |
| "train_speed(iter/s)": 0.765185 |
| }, |
| { |
| "epoch": 0.4543761638733706, |
| "grad_norm": 1.96875, |
| "learning_rate": 0.00019442399243666786, |
| "loss": 0.40544581, |
| "memory(GiB)": 61.09, |
| "step": 4880, |
| "train_speed(iter/s)": 0.765361 |
| }, |
| { |
| "epoch": 0.45623836126629425, |
| "grad_norm": 1.546875, |
| "learning_rate": 0.00019435620590016125, |
| "loss": 0.39625409, |
| "memory(GiB)": 61.09, |
| "step": 4900, |
| "train_speed(iter/s)": 0.765415 |
| }, |
| { |
| "epoch": 0.4581005586592179, |
| "grad_norm": 1.8984375, |
| "learning_rate": 0.00019428802177702529, |
| "loss": 0.37152231, |
| "memory(GiB)": 61.09, |
| "step": 4920, |
| "train_speed(iter/s)": 0.765621 |
| }, |
| { |
| "epoch": 0.45996275605214154, |
| "grad_norm": 2.171875, |
| "learning_rate": 0.00019421944035456586, |
| "loss": 0.40098886, |
| "memory(GiB)": 61.09, |
| "step": 4940, |
| "train_speed(iter/s)": 0.765762 |
| }, |
| { |
| "epoch": 0.4618249534450652, |
| "grad_norm": 1.4453125, |
| "learning_rate": 0.00019415046192176294, |
| "loss": 0.38226032, |
| "memory(GiB)": 61.09, |
| "step": 4960, |
| "train_speed(iter/s)": 0.765864 |
| }, |
| { |
| "epoch": 0.46368715083798884, |
| "grad_norm": 1.8046875, |
| "learning_rate": 0.00019408108676926942, |
| "loss": 0.4000948, |
| "memory(GiB)": 61.09, |
| "step": 4980, |
| "train_speed(iter/s)": 0.76605 |
| }, |
| { |
| "epoch": 0.4655493482309125, |
| "grad_norm": 1.625, |
| "learning_rate": 0.00019401131518940985, |
| "loss": 0.39587092, |
| "memory(GiB)": 61.09, |
| "step": 5000, |
| "train_speed(iter/s)": 0.766201 |
| }, |
| { |
| "epoch": 0.46741154562383613, |
| "grad_norm": 1.5234375, |
| "learning_rate": 0.00019394114747617916, |
| "loss": 0.37739358, |
| "memory(GiB)": 61.09, |
| "step": 5020, |
| "train_speed(iter/s)": 0.766333 |
| }, |
| { |
| "epoch": 0.4692737430167598, |
| "grad_norm": 1.6640625, |
| "learning_rate": 0.00019387058392524145, |
| "loss": 0.37761064, |
| "memory(GiB)": 61.09, |
| "step": 5040, |
| "train_speed(iter/s)": 0.766541 |
| }, |
| { |
| "epoch": 0.47113594040968343, |
| "grad_norm": 1.6484375, |
| "learning_rate": 0.0001937996248339288, |
| "loss": 0.36651943, |
| "memory(GiB)": 61.09, |
| "step": 5060, |
| "train_speed(iter/s)": 0.766602 |
| }, |
| { |
| "epoch": 0.4729981378026071, |
| "grad_norm": 1.9296875, |
| "learning_rate": 0.00019372827050123995, |
| "loss": 0.36461196, |
| "memory(GiB)": 61.09, |
| "step": 5080, |
| "train_speed(iter/s)": 0.766795 |
| }, |
| { |
| "epoch": 0.4748603351955307, |
| "grad_norm": 2.71875, |
| "learning_rate": 0.00019365652122783897, |
| "loss": 0.38799112, |
| "memory(GiB)": 61.09, |
| "step": 5100, |
| "train_speed(iter/s)": 0.766791 |
| }, |
| { |
| "epoch": 0.4767225325884544, |
| "grad_norm": 1.90625, |
| "learning_rate": 0.00019358437731605425, |
| "loss": 0.37337472, |
| "memory(GiB)": 61.09, |
| "step": 5120, |
| "train_speed(iter/s)": 0.766967 |
| }, |
| { |
| "epoch": 0.478584729981378, |
| "grad_norm": 1.5703125, |
| "learning_rate": 0.00019351183906987694, |
| "loss": 0.41348104, |
| "memory(GiB)": 61.09, |
| "step": 5140, |
| "train_speed(iter/s)": 0.767242 |
| }, |
| { |
| "epoch": 0.48044692737430167, |
| "grad_norm": 1.703125, |
| "learning_rate": 0.00019343890679495988, |
| "loss": 0.41450329, |
| "memory(GiB)": 61.09, |
| "step": 5160, |
| "train_speed(iter/s)": 0.767312 |
| }, |
| { |
| "epoch": 0.4823091247672253, |
| "grad_norm": 1.4921875, |
| "learning_rate": 0.00019336558079861607, |
| "loss": 0.35895565, |
| "memory(GiB)": 61.09, |
| "step": 5180, |
| "train_speed(iter/s)": 0.767456 |
| }, |
| { |
| "epoch": 0.48417132216014896, |
| "grad_norm": 1.875, |
| "learning_rate": 0.00019329186138981769, |
| "loss": 0.35140588, |
| "memory(GiB)": 61.09, |
| "step": 5200, |
| "train_speed(iter/s)": 0.767603 |
| }, |
| { |
| "epoch": 0.4860335195530726, |
| "grad_norm": 1.6484375, |
| "learning_rate": 0.0001932177488791945, |
| "loss": 0.39651599, |
| "memory(GiB)": 61.09, |
| "step": 5220, |
| "train_speed(iter/s)": 0.767676 |
| }, |
| { |
| "epoch": 0.48789571694599626, |
| "grad_norm": 1.6484375, |
| "learning_rate": 0.00019314324357903276, |
| "loss": 0.37009797, |
| "memory(GiB)": 61.09, |
| "step": 5240, |
| "train_speed(iter/s)": 0.767901 |
| }, |
| { |
| "epoch": 0.4897579143389199, |
| "grad_norm": 2.046875, |
| "learning_rate": 0.00019306834580327378, |
| "loss": 0.3629123, |
| "memory(GiB)": 61.09, |
| "step": 5260, |
| "train_speed(iter/s)": 0.768057 |
| }, |
| { |
| "epoch": 0.49162011173184356, |
| "grad_norm": 2.203125, |
| "learning_rate": 0.00019299305586751263, |
| "loss": 0.41036406, |
| "memory(GiB)": 61.09, |
| "step": 5280, |
| "train_speed(iter/s)": 0.768172 |
| }, |
| { |
| "epoch": 0.4934823091247672, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.0001929173740889968, |
| "loss": 0.3859787, |
| "memory(GiB)": 61.09, |
| "step": 5300, |
| "train_speed(iter/s)": 0.768303 |
| }, |
| { |
| "epoch": 0.49534450651769085, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.00019284130078662498, |
| "loss": 0.36947384, |
| "memory(GiB)": 61.09, |
| "step": 5320, |
| "train_speed(iter/s)": 0.768419 |
| }, |
| { |
| "epoch": 0.4972067039106145, |
| "grad_norm": 1.765625, |
| "learning_rate": 0.00019276483628094548, |
| "loss": 0.39193997, |
| "memory(GiB)": 61.09, |
| "step": 5340, |
| "train_speed(iter/s)": 0.768551 |
| }, |
| { |
| "epoch": 0.49906890130353815, |
| "grad_norm": 1.703125, |
| "learning_rate": 0.00019268798089415508, |
| "loss": 0.40856562, |
| "memory(GiB)": 61.09, |
| "step": 5360, |
| "train_speed(iter/s)": 0.768667 |
| }, |
| { |
| "epoch": 0.5009310986964618, |
| "grad_norm": 1.75, |
| "learning_rate": 0.0001926107349500976, |
| "loss": 0.38102791, |
| "memory(GiB)": 61.09, |
| "step": 5380, |
| "train_speed(iter/s)": 0.768775 |
| }, |
| { |
| "epoch": 0.5027932960893855, |
| "grad_norm": 1.578125, |
| "learning_rate": 0.00019253309877426255, |
| "loss": 0.39010181, |
| "memory(GiB)": 61.09, |
| "step": 5400, |
| "train_speed(iter/s)": 0.768826 |
| }, |
| { |
| "epoch": 0.5046554934823091, |
| "grad_norm": 1.5625, |
| "learning_rate": 0.00019245507269378376, |
| "loss": 0.36907117, |
| "memory(GiB)": 61.09, |
| "step": 5420, |
| "train_speed(iter/s)": 0.768871 |
| }, |
| { |
| "epoch": 0.5065176908752328, |
| "grad_norm": 1.6875, |
| "learning_rate": 0.00019237665703743795, |
| "loss": 0.39245234, |
| "memory(GiB)": 61.09, |
| "step": 5440, |
| "train_speed(iter/s)": 0.769044 |
| }, |
| { |
| "epoch": 0.5083798882681564, |
| "grad_norm": 2.234375, |
| "learning_rate": 0.0001922978521356434, |
| "loss": 0.37116551, |
| "memory(GiB)": 61.09, |
| "step": 5460, |
| "train_speed(iter/s)": 0.769157 |
| }, |
| { |
| "epoch": 0.5102420856610801, |
| "grad_norm": 1.625, |
| "learning_rate": 0.0001922186583204586, |
| "loss": 0.3747494, |
| "memory(GiB)": 61.09, |
| "step": 5480, |
| "train_speed(iter/s)": 0.769282 |
| }, |
| { |
| "epoch": 0.5121042830540037, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.00019213907592558068, |
| "loss": 0.39164233, |
| "memory(GiB)": 61.09, |
| "step": 5500, |
| "train_speed(iter/s)": 0.769399 |
| }, |
| { |
| "epoch": 0.5139664804469274, |
| "grad_norm": 1.734375, |
| "learning_rate": 0.0001920591052863443, |
| "loss": 0.36614444, |
| "memory(GiB)": 61.09, |
| "step": 5520, |
| "train_speed(iter/s)": 0.769497 |
| }, |
| { |
| "epoch": 0.515828677839851, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.0001919787467397199, |
| "loss": 0.36284633, |
| "memory(GiB)": 61.09, |
| "step": 5540, |
| "train_speed(iter/s)": 0.769655 |
| }, |
| { |
| "epoch": 0.5176908752327747, |
| "grad_norm": 1.8984375, |
| "learning_rate": 0.00019189800062431248, |
| "loss": 0.38447697, |
| "memory(GiB)": 61.09, |
| "step": 5560, |
| "train_speed(iter/s)": 0.769671 |
| }, |
| { |
| "epoch": 0.5195530726256983, |
| "grad_norm": 1.6640625, |
| "learning_rate": 0.0001918168672803601, |
| "loss": 0.3774585, |
| "memory(GiB)": 61.09, |
| "step": 5580, |
| "train_speed(iter/s)": 0.769848 |
| }, |
| { |
| "epoch": 0.521415270018622, |
| "grad_norm": 1.6015625, |
| "learning_rate": 0.00019173534704973255, |
| "loss": 0.37147908, |
| "memory(GiB)": 61.09, |
| "step": 5600, |
| "train_speed(iter/s)": 0.769928 |
| }, |
| { |
| "epoch": 0.5232774674115456, |
| "grad_norm": 1.9140625, |
| "learning_rate": 0.00019165344027592985, |
| "loss": 0.4088347, |
| "memory(GiB)": 61.09, |
| "step": 5620, |
| "train_speed(iter/s)": 0.77009 |
| }, |
| { |
| "epoch": 0.5251396648044693, |
| "grad_norm": 1.796875, |
| "learning_rate": 0.00019157114730408064, |
| "loss": 0.38829794, |
| "memory(GiB)": 61.09, |
| "step": 5640, |
| "train_speed(iter/s)": 0.770242 |
| }, |
| { |
| "epoch": 0.527001862197393, |
| "grad_norm": 2.140625, |
| "learning_rate": 0.000191488468480941, |
| "loss": 0.36397715, |
| "memory(GiB)": 61.09, |
| "step": 5660, |
| "train_speed(iter/s)": 0.770341 |
| }, |
| { |
| "epoch": 0.5288640595903166, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.0001914054041548929, |
| "loss": 0.38371773, |
| "memory(GiB)": 61.09, |
| "step": 5680, |
| "train_speed(iter/s)": 0.770391 |
| }, |
| { |
| "epoch": 0.5307262569832403, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00019132195467594254, |
| "loss": 0.3827935, |
| "memory(GiB)": 61.09, |
| "step": 5700, |
| "train_speed(iter/s)": 0.77059 |
| }, |
| { |
| "epoch": 0.5325884543761639, |
| "grad_norm": 1.890625, |
| "learning_rate": 0.00019123812039571915, |
| "loss": 0.38209565, |
| "memory(GiB)": 61.09, |
| "step": 5720, |
| "train_speed(iter/s)": 0.770819 |
| }, |
| { |
| "epoch": 0.5344506517690876, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.0001911539016674734, |
| "loss": 0.35899367, |
| "memory(GiB)": 61.09, |
| "step": 5740, |
| "train_speed(iter/s)": 0.771004 |
| }, |
| { |
| "epoch": 0.5363128491620112, |
| "grad_norm": 1.625, |
| "learning_rate": 0.00019106929884607577, |
| "loss": 0.38742561, |
| "memory(GiB)": 61.09, |
| "step": 5760, |
| "train_speed(iter/s)": 0.770977 |
| }, |
| { |
| "epoch": 0.5381750465549349, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.00019098431228801535, |
| "loss": 0.40184608, |
| "memory(GiB)": 61.09, |
| "step": 5780, |
| "train_speed(iter/s)": 0.770971 |
| }, |
| { |
| "epoch": 0.5400372439478585, |
| "grad_norm": 1.5625, |
| "learning_rate": 0.00019089894235139806, |
| "loss": 0.37216854, |
| "memory(GiB)": 61.09, |
| "step": 5800, |
| "train_speed(iter/s)": 0.771075 |
| }, |
| { |
| "epoch": 0.5418994413407822, |
| "grad_norm": 1.625, |
| "learning_rate": 0.00019081318939594532, |
| "loss": 0.38967266, |
| "memory(GiB)": 61.09, |
| "step": 5820, |
| "train_speed(iter/s)": 0.77124 |
| }, |
| { |
| "epoch": 0.5437616387337058, |
| "grad_norm": 1.78125, |
| "learning_rate": 0.00019072705378299237, |
| "loss": 0.36322145, |
| "memory(GiB)": 61.09, |
| "step": 5840, |
| "train_speed(iter/s)": 0.771282 |
| }, |
| { |
| "epoch": 0.5456238361266295, |
| "grad_norm": 1.7734375, |
| "learning_rate": 0.00019064053587548697, |
| "loss": 0.41168041, |
| "memory(GiB)": 61.09, |
| "step": 5860, |
| "train_speed(iter/s)": 0.771409 |
| }, |
| { |
| "epoch": 0.547486033519553, |
| "grad_norm": 1.515625, |
| "learning_rate": 0.00019055363603798765, |
| "loss": 0.38845997, |
| "memory(GiB)": 61.09, |
| "step": 5880, |
| "train_speed(iter/s)": 0.771601 |
| }, |
| { |
| "epoch": 0.5493482309124768, |
| "grad_norm": 2.203125, |
| "learning_rate": 0.00019046635463666232, |
| "loss": 0.35851908, |
| "memory(GiB)": 61.09, |
| "step": 5900, |
| "train_speed(iter/s)": 0.771596 |
| }, |
| { |
| "epoch": 0.5512104283054003, |
| "grad_norm": 2.109375, |
| "learning_rate": 0.00019037869203928671, |
| "loss": 0.38949237, |
| "memory(GiB)": 61.09, |
| "step": 5920, |
| "train_speed(iter/s)": 0.771651 |
| }, |
| { |
| "epoch": 0.553072625698324, |
| "grad_norm": 1.4921875, |
| "learning_rate": 0.00019029064861524267, |
| "loss": 0.36254482, |
| "memory(GiB)": 61.09, |
| "step": 5940, |
| "train_speed(iter/s)": 0.771719 |
| }, |
| { |
| "epoch": 0.5549348230912476, |
| "grad_norm": 1.5859375, |
| "learning_rate": 0.00019020222473551686, |
| "loss": 0.37663541, |
| "memory(GiB)": 61.09, |
| "step": 5960, |
| "train_speed(iter/s)": 0.771885 |
| }, |
| { |
| "epoch": 0.5567970204841713, |
| "grad_norm": 1.796875, |
| "learning_rate": 0.00019011342077269894, |
| "loss": 0.37530179, |
| "memory(GiB)": 61.09, |
| "step": 5980, |
| "train_speed(iter/s)": 0.77194 |
| }, |
| { |
| "epoch": 0.5586592178770949, |
| "grad_norm": 1.8203125, |
| "learning_rate": 0.00019002423710098022, |
| "loss": 0.33494754, |
| "memory(GiB)": 61.09, |
| "step": 6000, |
| "train_speed(iter/s)": 0.771986 |
| }, |
| { |
| "epoch": 0.5586592178770949, |
| "eval_loss": 0.44819557666778564, |
| "eval_runtime": 76.4302, |
| "eval_samples_per_second": 181.669, |
| "eval_steps_per_second": 1.426, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.5605214152700186, |
| "grad_norm": 1.7109375, |
| "learning_rate": 0.00018993467409615192, |
| "loss": 0.39943161, |
| "memory(GiB)": 61.09, |
| "step": 6020, |
| "train_speed(iter/s)": 0.761403 |
| }, |
| { |
| "epoch": 0.5623836126629422, |
| "grad_norm": 1.5546875, |
| "learning_rate": 0.00018984473213560366, |
| "loss": 0.37351987, |
| "memory(GiB)": 61.09, |
| "step": 6040, |
| "train_speed(iter/s)": 0.76152 |
| }, |
| { |
| "epoch": 0.5642458100558659, |
| "grad_norm": 1.6484375, |
| "learning_rate": 0.0001897544115983219, |
| "loss": 0.39573755, |
| "memory(GiB)": 61.09, |
| "step": 6060, |
| "train_speed(iter/s)": 0.761669 |
| }, |
| { |
| "epoch": 0.5661080074487895, |
| "grad_norm": 1.703125, |
| "learning_rate": 0.00018966371286488823, |
| "loss": 0.4023766, |
| "memory(GiB)": 61.09, |
| "step": 6080, |
| "train_speed(iter/s)": 0.761822 |
| }, |
| { |
| "epoch": 0.5679702048417132, |
| "grad_norm": 2.15625, |
| "learning_rate": 0.00018957263631747787, |
| "loss": 0.3705966, |
| "memory(GiB)": 61.09, |
| "step": 6100, |
| "train_speed(iter/s)": 0.76184 |
| }, |
| { |
| "epoch": 0.5698324022346368, |
| "grad_norm": 2.25, |
| "learning_rate": 0.00018948118233985803, |
| "loss": 0.38215985, |
| "memory(GiB)": 61.09, |
| "step": 6120, |
| "train_speed(iter/s)": 0.76194 |
| }, |
| { |
| "epoch": 0.5716945996275605, |
| "grad_norm": 1.484375, |
| "learning_rate": 0.0001893893513173863, |
| "loss": 0.37131, |
| "memory(GiB)": 61.09, |
| "step": 6140, |
| "train_speed(iter/s)": 0.762158 |
| }, |
| { |
| "epoch": 0.5735567970204841, |
| "grad_norm": 1.5859375, |
| "learning_rate": 0.00018929714363700897, |
| "loss": 0.38114462, |
| "memory(GiB)": 61.09, |
| "step": 6160, |
| "train_speed(iter/s)": 0.762317 |
| }, |
| { |
| "epoch": 0.5754189944134078, |
| "grad_norm": 1.78125, |
| "learning_rate": 0.0001892045596872595, |
| "loss": 0.36428151, |
| "memory(GiB)": 61.09, |
| "step": 6180, |
| "train_speed(iter/s)": 0.762461 |
| }, |
| { |
| "epoch": 0.5772811918063314, |
| "grad_norm": 1.546875, |
| "learning_rate": 0.00018911159985825684, |
| "loss": 0.38127549, |
| "memory(GiB)": 61.09, |
| "step": 6200, |
| "train_speed(iter/s)": 0.762497 |
| }, |
| { |
| "epoch": 0.5791433891992551, |
| "grad_norm": 3.015625, |
| "learning_rate": 0.00018901826454170367, |
| "loss": 0.38451765, |
| "memory(GiB)": 61.09, |
| "step": 6220, |
| "train_speed(iter/s)": 0.762625 |
| }, |
| { |
| "epoch": 0.5810055865921788, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.00018892455413088493, |
| "loss": 0.39861171, |
| "memory(GiB)": 61.09, |
| "step": 6240, |
| "train_speed(iter/s)": 0.762782 |
| }, |
| { |
| "epoch": 0.5828677839851024, |
| "grad_norm": 1.6796875, |
| "learning_rate": 0.0001888304690206661, |
| "loss": 0.35783043, |
| "memory(GiB)": 61.09, |
| "step": 6260, |
| "train_speed(iter/s)": 0.762874 |
| }, |
| { |
| "epoch": 0.5847299813780261, |
| "grad_norm": 1.375, |
| "learning_rate": 0.00018873600960749148, |
| "loss": 0.36647081, |
| "memory(GiB)": 61.09, |
| "step": 6280, |
| "train_speed(iter/s)": 0.762938 |
| }, |
| { |
| "epoch": 0.5865921787709497, |
| "grad_norm": 1.7109375, |
| "learning_rate": 0.0001886411762893826, |
| "loss": 0.3527771, |
| "memory(GiB)": 61.09, |
| "step": 6300, |
| "train_speed(iter/s)": 0.763086 |
| }, |
| { |
| "epoch": 0.5884543761638734, |
| "grad_norm": 1.8515625, |
| "learning_rate": 0.0001885459694659365, |
| "loss": 0.38488383, |
| "memory(GiB)": 61.09, |
| "step": 6320, |
| "train_speed(iter/s)": 0.763164 |
| }, |
| { |
| "epoch": 0.590316573556797, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.00018845038953832401, |
| "loss": 0.37899766, |
| "memory(GiB)": 61.09, |
| "step": 6340, |
| "train_speed(iter/s)": 0.763257 |
| }, |
| { |
| "epoch": 0.5921787709497207, |
| "grad_norm": 1.9296875, |
| "learning_rate": 0.00018835443690928813, |
| "loss": 0.36712158, |
| "memory(GiB)": 61.09, |
| "step": 6360, |
| "train_speed(iter/s)": 0.763341 |
| }, |
| { |
| "epoch": 0.5940409683426443, |
| "grad_norm": 1.8203125, |
| "learning_rate": 0.0001882581119831423, |
| "loss": 0.36925483, |
| "memory(GiB)": 61.09, |
| "step": 6380, |
| "train_speed(iter/s)": 0.7635 |
| }, |
| { |
| "epoch": 0.595903165735568, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.00018816141516576874, |
| "loss": 0.37719622, |
| "memory(GiB)": 61.09, |
| "step": 6400, |
| "train_speed(iter/s)": 0.763631 |
| }, |
| { |
| "epoch": 0.5977653631284916, |
| "grad_norm": 1.9765625, |
| "learning_rate": 0.0001880643468646166, |
| "loss": 0.37256207, |
| "memory(GiB)": 61.09, |
| "step": 6420, |
| "train_speed(iter/s)": 0.76375 |
| }, |
| { |
| "epoch": 0.5996275605214153, |
| "grad_norm": 1.703125, |
| "learning_rate": 0.00018796690748870045, |
| "loss": 0.36438172, |
| "memory(GiB)": 61.09, |
| "step": 6440, |
| "train_speed(iter/s)": 0.763808 |
| }, |
| { |
| "epoch": 0.6014897579143389, |
| "grad_norm": 1.6328125, |
| "learning_rate": 0.00018786909744859838, |
| "loss": 0.37020493, |
| "memory(GiB)": 61.09, |
| "step": 6460, |
| "train_speed(iter/s)": 0.763949 |
| }, |
| { |
| "epoch": 0.6033519553072626, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.0001877709171564504, |
| "loss": 0.37200084, |
| "memory(GiB)": 61.09, |
| "step": 6480, |
| "train_speed(iter/s)": 0.763933 |
| }, |
| { |
| "epoch": 0.6052141527001862, |
| "grad_norm": 1.3359375, |
| "learning_rate": 0.00018767236702595653, |
| "loss": 0.37319822, |
| "memory(GiB)": 61.09, |
| "step": 6500, |
| "train_speed(iter/s)": 0.764126 |
| }, |
| { |
| "epoch": 0.6070763500931099, |
| "grad_norm": 1.6015625, |
| "learning_rate": 0.00018757344747237536, |
| "loss": 0.37853823, |
| "memory(GiB)": 61.09, |
| "step": 6520, |
| "train_speed(iter/s)": 0.764271 |
| }, |
| { |
| "epoch": 0.6089385474860335, |
| "grad_norm": 1.703125, |
| "learning_rate": 0.00018747415891252187, |
| "loss": 0.37537384, |
| "memory(GiB)": 61.09, |
| "step": 6540, |
| "train_speed(iter/s)": 0.764402 |
| }, |
| { |
| "epoch": 0.6108007448789572, |
| "grad_norm": 1.78125, |
| "learning_rate": 0.00018737450176476613, |
| "loss": 0.36597242, |
| "memory(GiB)": 61.09, |
| "step": 6560, |
| "train_speed(iter/s)": 0.764565 |
| }, |
| { |
| "epoch": 0.6126629422718808, |
| "grad_norm": 1.9375, |
| "learning_rate": 0.0001872744764490312, |
| "loss": 0.35904284, |
| "memory(GiB)": 61.09, |
| "step": 6580, |
| "train_speed(iter/s)": 0.764671 |
| }, |
| { |
| "epoch": 0.6145251396648045, |
| "grad_norm": 1.7734375, |
| "learning_rate": 0.00018717408338679147, |
| "loss": 0.3644063, |
| "memory(GiB)": 61.09, |
| "step": 6600, |
| "train_speed(iter/s)": 0.764735 |
| }, |
| { |
| "epoch": 0.6163873370577281, |
| "grad_norm": 1.953125, |
| "learning_rate": 0.00018707332300107102, |
| "loss": 0.35240235, |
| "memory(GiB)": 61.09, |
| "step": 6620, |
| "train_speed(iter/s)": 0.764831 |
| }, |
| { |
| "epoch": 0.6182495344506518, |
| "grad_norm": 1.59375, |
| "learning_rate": 0.00018697219571644155, |
| "loss": 0.36407018, |
| "memory(GiB)": 61.09, |
| "step": 6640, |
| "train_speed(iter/s)": 0.765007 |
| }, |
| { |
| "epoch": 0.6201117318435754, |
| "grad_norm": 1.4296875, |
| "learning_rate": 0.0001868707019590209, |
| "loss": 0.35534208, |
| "memory(GiB)": 61.09, |
| "step": 6660, |
| "train_speed(iter/s)": 0.765128 |
| }, |
| { |
| "epoch": 0.6219739292364991, |
| "grad_norm": 1.484375, |
| "learning_rate": 0.00018676884215647094, |
| "loss": 0.34723496, |
| "memory(GiB)": 61.09, |
| "step": 6680, |
| "train_speed(iter/s)": 0.76517 |
| }, |
| { |
| "epoch": 0.6238361266294227, |
| "grad_norm": 1.6875, |
| "learning_rate": 0.00018666661673799614, |
| "loss": 0.36451449, |
| "memory(GiB)": 61.09, |
| "step": 6700, |
| "train_speed(iter/s)": 0.765259 |
| }, |
| { |
| "epoch": 0.6256983240223464, |
| "grad_norm": 1.828125, |
| "learning_rate": 0.00018656402613434142, |
| "loss": 0.36834097, |
| "memory(GiB)": 61.09, |
| "step": 6720, |
| "train_speed(iter/s)": 0.765415 |
| }, |
| { |
| "epoch": 0.62756052141527, |
| "grad_norm": 1.8671875, |
| "learning_rate": 0.00018646107077779046, |
| "loss": 0.3686954, |
| "memory(GiB)": 61.09, |
| "step": 6740, |
| "train_speed(iter/s)": 0.765555 |
| }, |
| { |
| "epoch": 0.6294227188081937, |
| "grad_norm": 1.75, |
| "learning_rate": 0.00018635775110216403, |
| "loss": 0.35826502, |
| "memory(GiB)": 61.09, |
| "step": 6760, |
| "train_speed(iter/s)": 0.765672 |
| }, |
| { |
| "epoch": 0.6312849162011173, |
| "grad_norm": 1.8984375, |
| "learning_rate": 0.00018625406754281784, |
| "loss": 0.36378212, |
| "memory(GiB)": 61.09, |
| "step": 6780, |
| "train_speed(iter/s)": 0.765766 |
| }, |
| { |
| "epoch": 0.633147113594041, |
| "grad_norm": 1.8203125, |
| "learning_rate": 0.00018615002053664104, |
| "loss": 0.36196086, |
| "memory(GiB)": 61.09, |
| "step": 6800, |
| "train_speed(iter/s)": 0.765831 |
| }, |
| { |
| "epoch": 0.6350093109869647, |
| "grad_norm": 1.546875, |
| "learning_rate": 0.00018604561052205413, |
| "loss": 0.38385079, |
| "memory(GiB)": 61.09, |
| "step": 6820, |
| "train_speed(iter/s)": 0.765852 |
| }, |
| { |
| "epoch": 0.6368715083798883, |
| "grad_norm": 1.3984375, |
| "learning_rate": 0.0001859408379390073, |
| "loss": 0.34547548, |
| "memory(GiB)": 61.09, |
| "step": 6840, |
| "train_speed(iter/s)": 0.765936 |
| }, |
| { |
| "epoch": 0.638733705772812, |
| "grad_norm": 1.5546875, |
| "learning_rate": 0.00018583570322897836, |
| "loss": 0.37987022, |
| "memory(GiB)": 61.09, |
| "step": 6860, |
| "train_speed(iter/s)": 0.76609 |
| }, |
| { |
| "epoch": 0.6405959031657356, |
| "grad_norm": 1.375, |
| "learning_rate": 0.00018573020683497113, |
| "loss": 0.34121661, |
| "memory(GiB)": 61.09, |
| "step": 6880, |
| "train_speed(iter/s)": 0.766189 |
| }, |
| { |
| "epoch": 0.6424581005586593, |
| "grad_norm": 2.046875, |
| "learning_rate": 0.00018562434920151336, |
| "loss": 0.38715105, |
| "memory(GiB)": 61.09, |
| "step": 6900, |
| "train_speed(iter/s)": 0.766299 |
| }, |
| { |
| "epoch": 0.6443202979515829, |
| "grad_norm": 1.7265625, |
| "learning_rate": 0.00018551813077465504, |
| "loss": 0.36055377, |
| "memory(GiB)": 61.09, |
| "step": 6920, |
| "train_speed(iter/s)": 0.766405 |
| }, |
| { |
| "epoch": 0.6461824953445066, |
| "grad_norm": 1.984375, |
| "learning_rate": 0.00018541155200196637, |
| "loss": 0.36063504, |
| "memory(GiB)": 61.09, |
| "step": 6940, |
| "train_speed(iter/s)": 0.766488 |
| }, |
| { |
| "epoch": 0.6480446927374302, |
| "grad_norm": 1.5390625, |
| "learning_rate": 0.0001853046133325359, |
| "loss": 0.36382327, |
| "memory(GiB)": 61.09, |
| "step": 6960, |
| "train_speed(iter/s)": 0.766578 |
| }, |
| { |
| "epoch": 0.6499068901303539, |
| "grad_norm": 1.5703125, |
| "learning_rate": 0.00018519731521696872, |
| "loss": 0.37230091, |
| "memory(GiB)": 61.09, |
| "step": 6980, |
| "train_speed(iter/s)": 0.766773 |
| }, |
| { |
| "epoch": 0.6517690875232774, |
| "grad_norm": 1.7265625, |
| "learning_rate": 0.00018508965810738447, |
| "loss": 0.35780354, |
| "memory(GiB)": 61.09, |
| "step": 7000, |
| "train_speed(iter/s)": 0.766857 |
| }, |
| { |
| "epoch": 0.6536312849162011, |
| "grad_norm": 1.8515625, |
| "learning_rate": 0.00018498164245741555, |
| "loss": 0.34599066, |
| "memory(GiB)": 61.09, |
| "step": 7020, |
| "train_speed(iter/s)": 0.76696 |
| }, |
| { |
| "epoch": 0.6554934823091247, |
| "grad_norm": 2.171875, |
| "learning_rate": 0.00018487326872220505, |
| "loss": 0.37994871, |
| "memory(GiB)": 61.09, |
| "step": 7040, |
| "train_speed(iter/s)": 0.767115 |
| }, |
| { |
| "epoch": 0.6573556797020484, |
| "grad_norm": 1.75, |
| "learning_rate": 0.00018476453735840492, |
| "loss": 0.37087135, |
| "memory(GiB)": 61.09, |
| "step": 7060, |
| "train_speed(iter/s)": 0.76724 |
| }, |
| { |
| "epoch": 0.659217877094972, |
| "grad_norm": 2.375, |
| "learning_rate": 0.00018465544882417413, |
| "loss": 0.3895617, |
| "memory(GiB)": 61.09, |
| "step": 7080, |
| "train_speed(iter/s)": 0.767307 |
| }, |
| { |
| "epoch": 0.6610800744878957, |
| "grad_norm": 1.75, |
| "learning_rate": 0.00018454600357917656, |
| "loss": 0.36390953, |
| "memory(GiB)": 61.09, |
| "step": 7100, |
| "train_speed(iter/s)": 0.76745 |
| }, |
| { |
| "epoch": 0.6629422718808193, |
| "grad_norm": 1.7890625, |
| "learning_rate": 0.0001844362020845792, |
| "loss": 0.36717396, |
| "memory(GiB)": 61.09, |
| "step": 7120, |
| "train_speed(iter/s)": 0.767583 |
| }, |
| { |
| "epoch": 0.664804469273743, |
| "grad_norm": 2.0, |
| "learning_rate": 0.00018432604480305011, |
| "loss": 0.37814822, |
| "memory(GiB)": 61.09, |
| "step": 7140, |
| "train_speed(iter/s)": 0.767624 |
| }, |
| { |
| "epoch": 0.6666666666666666, |
| "grad_norm": 1.9375, |
| "learning_rate": 0.00018421553219875658, |
| "loss": 0.35424995, |
| "memory(GiB)": 61.09, |
| "step": 7160, |
| "train_speed(iter/s)": 0.767752 |
| }, |
| { |
| "epoch": 0.6685288640595903, |
| "grad_norm": 1.625, |
| "learning_rate": 0.00018410466473736312, |
| "loss": 0.36877232, |
| "memory(GiB)": 61.09, |
| "step": 7180, |
| "train_speed(iter/s)": 0.767815 |
| }, |
| { |
| "epoch": 0.6703910614525139, |
| "grad_norm": 1.6640625, |
| "learning_rate": 0.0001839934428860294, |
| "loss": 0.34407656, |
| "memory(GiB)": 61.09, |
| "step": 7200, |
| "train_speed(iter/s)": 0.767919 |
| }, |
| { |
| "epoch": 0.6722532588454376, |
| "grad_norm": 1.6640625, |
| "learning_rate": 0.00018388186711340852, |
| "loss": 0.38237391, |
| "memory(GiB)": 61.09, |
| "step": 7220, |
| "train_speed(iter/s)": 0.768033 |
| }, |
| { |
| "epoch": 0.6741154562383612, |
| "grad_norm": 2.046875, |
| "learning_rate": 0.0001837699378896448, |
| "loss": 0.3648797, |
| "memory(GiB)": 61.09, |
| "step": 7240, |
| "train_speed(iter/s)": 0.768127 |
| }, |
| { |
| "epoch": 0.6759776536312849, |
| "grad_norm": 1.6875, |
| "learning_rate": 0.00018365765568637178, |
| "loss": 0.33588786, |
| "memory(GiB)": 61.09, |
| "step": 7260, |
| "train_speed(iter/s)": 0.768202 |
| }, |
| { |
| "epoch": 0.6778398510242085, |
| "grad_norm": 1.515625, |
| "learning_rate": 0.00018354502097671062, |
| "loss": 0.3664793, |
| "memory(GiB)": 61.09, |
| "step": 7280, |
| "train_speed(iter/s)": 0.768262 |
| }, |
| { |
| "epoch": 0.6797020484171322, |
| "grad_norm": 1.75, |
| "learning_rate": 0.00018343203423526762, |
| "loss": 0.34410334, |
| "memory(GiB)": 61.09, |
| "step": 7300, |
| "train_speed(iter/s)": 0.768354 |
| }, |
| { |
| "epoch": 0.6815642458100558, |
| "grad_norm": 1.6953125, |
| "learning_rate": 0.00018331869593813248, |
| "loss": 0.35052714, |
| "memory(GiB)": 61.09, |
| "step": 7320, |
| "train_speed(iter/s)": 0.768408 |
| }, |
| { |
| "epoch": 0.6834264432029795, |
| "grad_norm": 1.8671875, |
| "learning_rate": 0.00018320500656287617, |
| "loss": 0.3694984, |
| "memory(GiB)": 61.09, |
| "step": 7340, |
| "train_speed(iter/s)": 0.768539 |
| }, |
| { |
| "epoch": 0.6852886405959032, |
| "grad_norm": 1.4375, |
| "learning_rate": 0.00018309096658854917, |
| "loss": 0.3540817, |
| "memory(GiB)": 61.09, |
| "step": 7360, |
| "train_speed(iter/s)": 0.76859 |
| }, |
| { |
| "epoch": 0.6871508379888268, |
| "grad_norm": 1.5546875, |
| "learning_rate": 0.0001829765764956791, |
| "loss": 0.32105043, |
| "memory(GiB)": 61.09, |
| "step": 7380, |
| "train_speed(iter/s)": 0.768719 |
| }, |
| { |
| "epoch": 0.6890130353817505, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00018286183676626894, |
| "loss": 0.36089745, |
| "memory(GiB)": 61.09, |
| "step": 7400, |
| "train_speed(iter/s)": 0.768705 |
| }, |
| { |
| "epoch": 0.6908752327746741, |
| "grad_norm": 1.5546875, |
| "learning_rate": 0.0001827467478837949, |
| "loss": 0.33547273, |
| "memory(GiB)": 61.09, |
| "step": 7420, |
| "train_speed(iter/s)": 0.768804 |
| }, |
| { |
| "epoch": 0.6927374301675978, |
| "grad_norm": 1.8046875, |
| "learning_rate": 0.00018263131033320447, |
| "loss": 0.32867005, |
| "memory(GiB)": 61.09, |
| "step": 7440, |
| "train_speed(iter/s)": 0.768897 |
| }, |
| { |
| "epoch": 0.6945996275605214, |
| "grad_norm": 1.859375, |
| "learning_rate": 0.00018251552460091423, |
| "loss": 0.34813719, |
| "memory(GiB)": 61.09, |
| "step": 7460, |
| "train_speed(iter/s)": 0.769038 |
| }, |
| { |
| "epoch": 0.6964618249534451, |
| "grad_norm": 1.8984375, |
| "learning_rate": 0.00018239939117480796, |
| "loss": 0.34680479, |
| "memory(GiB)": 61.09, |
| "step": 7480, |
| "train_speed(iter/s)": 0.769117 |
| }, |
| { |
| "epoch": 0.6983240223463687, |
| "grad_norm": 1.6796875, |
| "learning_rate": 0.00018228291054423444, |
| "loss": 0.34514585, |
| "memory(GiB)": 61.09, |
| "step": 7500, |
| "train_speed(iter/s)": 0.769217 |
| }, |
| { |
| "epoch": 0.7001862197392924, |
| "grad_norm": 1.703125, |
| "learning_rate": 0.00018216608320000547, |
| "loss": 0.36143909, |
| "memory(GiB)": 61.09, |
| "step": 7520, |
| "train_speed(iter/s)": 0.769262 |
| }, |
| { |
| "epoch": 0.702048417132216, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00018204890963439388, |
| "loss": 0.34863019, |
| "memory(GiB)": 61.09, |
| "step": 7540, |
| "train_speed(iter/s)": 0.769313 |
| }, |
| { |
| "epoch": 0.7039106145251397, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.00018193139034113122, |
| "loss": 0.33534517, |
| "memory(GiB)": 61.09, |
| "step": 7560, |
| "train_speed(iter/s)": 0.769396 |
| }, |
| { |
| "epoch": 0.7057728119180633, |
| "grad_norm": 3.421875, |
| "learning_rate": 0.000181813525815406, |
| "loss": 0.37421381, |
| "memory(GiB)": 61.09, |
| "step": 7580, |
| "train_speed(iter/s)": 0.769524 |
| }, |
| { |
| "epoch": 0.707635009310987, |
| "grad_norm": 1.75, |
| "learning_rate": 0.0001816953165538612, |
| "loss": 0.35957568, |
| "memory(GiB)": 61.09, |
| "step": 7600, |
| "train_speed(iter/s)": 0.769611 |
| }, |
| { |
| "epoch": 0.7094972067039106, |
| "grad_norm": 1.8046875, |
| "learning_rate": 0.00018157676305459268, |
| "loss": 0.34478993, |
| "memory(GiB)": 61.09, |
| "step": 7620, |
| "train_speed(iter/s)": 0.769707 |
| }, |
| { |
| "epoch": 0.7113594040968343, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.00018145786581714654, |
| "loss": 0.34837961, |
| "memory(GiB)": 61.09, |
| "step": 7640, |
| "train_speed(iter/s)": 0.769876 |
| }, |
| { |
| "epoch": 0.7132216014897579, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.00018133862534251748, |
| "loss": 0.3299273, |
| "memory(GiB)": 61.09, |
| "step": 7660, |
| "train_speed(iter/s)": 0.770039 |
| }, |
| { |
| "epoch": 0.7150837988826816, |
| "grad_norm": 1.40625, |
| "learning_rate": 0.00018121904213314632, |
| "loss": 0.3494205, |
| "memory(GiB)": 61.09, |
| "step": 7680, |
| "train_speed(iter/s)": 0.770117 |
| }, |
| { |
| "epoch": 0.7169459962756052, |
| "grad_norm": 2.640625, |
| "learning_rate": 0.0001810991166929183, |
| "loss": 0.33510184, |
| "memory(GiB)": 61.09, |
| "step": 7700, |
| "train_speed(iter/s)": 0.77021 |
| }, |
| { |
| "epoch": 0.7188081936685289, |
| "grad_norm": 2.0625, |
| "learning_rate": 0.0001809788495271604, |
| "loss": 0.34384561, |
| "memory(GiB)": 61.09, |
| "step": 7720, |
| "train_speed(iter/s)": 0.770295 |
| }, |
| { |
| "epoch": 0.7206703910614525, |
| "grad_norm": 1.875, |
| "learning_rate": 0.00018085824114263981, |
| "loss": 0.35482507, |
| "memory(GiB)": 61.09, |
| "step": 7740, |
| "train_speed(iter/s)": 0.770398 |
| }, |
| { |
| "epoch": 0.7225325884543762, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.0001807372920475613, |
| "loss": 0.3394609, |
| "memory(GiB)": 61.09, |
| "step": 7760, |
| "train_speed(iter/s)": 0.770488 |
| }, |
| { |
| "epoch": 0.7243947858472998, |
| "grad_norm": 1.453125, |
| "learning_rate": 0.0001806160027515654, |
| "loss": 0.3570467, |
| "memory(GiB)": 61.09, |
| "step": 7780, |
| "train_speed(iter/s)": 0.770601 |
| }, |
| { |
| "epoch": 0.7262569832402235, |
| "grad_norm": 1.5390625, |
| "learning_rate": 0.00018049437376572603, |
| "loss": 0.32628872, |
| "memory(GiB)": 61.09, |
| "step": 7800, |
| "train_speed(iter/s)": 0.77061 |
| }, |
| { |
| "epoch": 0.7281191806331471, |
| "grad_norm": 1.890625, |
| "learning_rate": 0.0001803724056025486, |
| "loss": 0.34269819, |
| "memory(GiB)": 61.09, |
| "step": 7820, |
| "train_speed(iter/s)": 0.770763 |
| }, |
| { |
| "epoch": 0.7299813780260708, |
| "grad_norm": 1.546875, |
| "learning_rate": 0.00018025009877596757, |
| "loss": 0.35294495, |
| "memory(GiB)": 61.09, |
| "step": 7840, |
| "train_speed(iter/s)": 0.770808 |
| }, |
| { |
| "epoch": 0.7318435754189944, |
| "grad_norm": 1.796875, |
| "learning_rate": 0.00018012745380134452, |
| "loss": 0.34454529, |
| "memory(GiB)": 61.09, |
| "step": 7860, |
| "train_speed(iter/s)": 0.770915 |
| }, |
| { |
| "epoch": 0.7337057728119181, |
| "grad_norm": 1.6171875, |
| "learning_rate": 0.0001800044711954658, |
| "loss": 0.32856019, |
| "memory(GiB)": 61.09, |
| "step": 7880, |
| "train_speed(iter/s)": 0.771015 |
| }, |
| { |
| "epoch": 0.7355679702048417, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00017988115147654044, |
| "loss": 0.35125015, |
| "memory(GiB)": 61.09, |
| "step": 7900, |
| "train_speed(iter/s)": 0.771089 |
| }, |
| { |
| "epoch": 0.7374301675977654, |
| "grad_norm": 1.7890625, |
| "learning_rate": 0.000179757495164198, |
| "loss": 0.36670623, |
| "memory(GiB)": 61.09, |
| "step": 7920, |
| "train_speed(iter/s)": 0.771186 |
| }, |
| { |
| "epoch": 0.7392923649906891, |
| "grad_norm": 1.703125, |
| "learning_rate": 0.00017963350277948634, |
| "loss": 0.3566488, |
| "memory(GiB)": 61.09, |
| "step": 7940, |
| "train_speed(iter/s)": 0.771297 |
| }, |
| { |
| "epoch": 0.7411545623836127, |
| "grad_norm": 1.734375, |
| "learning_rate": 0.00017950917484486936, |
| "loss": 0.33071799, |
| "memory(GiB)": 61.09, |
| "step": 7960, |
| "train_speed(iter/s)": 0.771381 |
| }, |
| { |
| "epoch": 0.7430167597765364, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.00017938451188422488, |
| "loss": 0.34926617, |
| "memory(GiB)": 61.09, |
| "step": 7980, |
| "train_speed(iter/s)": 0.771511 |
| }, |
| { |
| "epoch": 0.74487895716946, |
| "grad_norm": 1.8515625, |
| "learning_rate": 0.00017925951442284243, |
| "loss": 0.36137218, |
| "memory(GiB)": 61.09, |
| "step": 8000, |
| "train_speed(iter/s)": 0.771625 |
| }, |
| { |
| "epoch": 0.74487895716946, |
| "eval_loss": 0.44029951095581055, |
| "eval_runtime": 76.4367, |
| "eval_samples_per_second": 181.654, |
| "eval_steps_per_second": 1.426, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.7467411545623837, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.00017913418298742104, |
| "loss": 0.34908738, |
| "memory(GiB)": 61.09, |
| "step": 8020, |
| "train_speed(iter/s)": 0.763602 |
| }, |
| { |
| "epoch": 0.7486033519553073, |
| "grad_norm": 1.4453125, |
| "learning_rate": 0.00017900851810606692, |
| "loss": 0.36671166, |
| "memory(GiB)": 61.09, |
| "step": 8040, |
| "train_speed(iter/s)": 0.763666 |
| }, |
| { |
| "epoch": 0.750465549348231, |
| "grad_norm": 1.8515625, |
| "learning_rate": 0.00017888252030829138, |
| "loss": 0.34246197, |
| "memory(GiB)": 61.09, |
| "step": 8060, |
| "train_speed(iter/s)": 0.763721 |
| }, |
| { |
| "epoch": 0.7523277467411545, |
| "grad_norm": 1.4921875, |
| "learning_rate": 0.00017875619012500856, |
| "loss": 0.33407593, |
| "memory(GiB)": 61.09, |
| "step": 8080, |
| "train_speed(iter/s)": 0.763808 |
| }, |
| { |
| "epoch": 0.7541899441340782, |
| "grad_norm": 1.734375, |
| "learning_rate": 0.00017862952808853305, |
| "loss": 0.32741477, |
| "memory(GiB)": 61.09, |
| "step": 8100, |
| "train_speed(iter/s)": 0.763914 |
| }, |
| { |
| "epoch": 0.7560521415270018, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.00017850253473257788, |
| "loss": 0.35220075, |
| "memory(GiB)": 61.09, |
| "step": 8120, |
| "train_speed(iter/s)": 0.76395 |
| }, |
| { |
| "epoch": 0.7579143389199255, |
| "grad_norm": 1.6875, |
| "learning_rate": 0.00017837521059225212, |
| "loss": 0.33165274, |
| "memory(GiB)": 61.09, |
| "step": 8140, |
| "train_speed(iter/s)": 0.7641 |
| }, |
| { |
| "epoch": 0.7597765363128491, |
| "grad_norm": 1.5859375, |
| "learning_rate": 0.0001782475562040586, |
| "loss": 0.36606779, |
| "memory(GiB)": 61.09, |
| "step": 8160, |
| "train_speed(iter/s)": 0.764186 |
| }, |
| { |
| "epoch": 0.7616387337057728, |
| "grad_norm": 1.7109375, |
| "learning_rate": 0.00017811957210589176, |
| "loss": 0.33115237, |
| "memory(GiB)": 61.09, |
| "step": 8180, |
| "train_speed(iter/s)": 0.764259 |
| }, |
| { |
| "epoch": 0.7635009310986964, |
| "grad_norm": 1.609375, |
| "learning_rate": 0.00017799125883703537, |
| "loss": 0.34380808, |
| "memory(GiB)": 61.09, |
| "step": 8200, |
| "train_speed(iter/s)": 0.764342 |
| }, |
| { |
| "epoch": 0.7653631284916201, |
| "grad_norm": 1.5078125, |
| "learning_rate": 0.00017786261693816013, |
| "loss": 0.34643345, |
| "memory(GiB)": 61.09, |
| "step": 8220, |
| "train_speed(iter/s)": 0.764458 |
| }, |
| { |
| "epoch": 0.7672253258845437, |
| "grad_norm": 1.5546875, |
| "learning_rate": 0.00017773364695132152, |
| "loss": 0.3174742, |
| "memory(GiB)": 61.09, |
| "step": 8240, |
| "train_speed(iter/s)": 0.764536 |
| }, |
| { |
| "epoch": 0.7690875232774674, |
| "grad_norm": 1.9296875, |
| "learning_rate": 0.00017760434941995753, |
| "loss": 0.35655365, |
| "memory(GiB)": 61.09, |
| "step": 8260, |
| "train_speed(iter/s)": 0.764581 |
| }, |
| { |
| "epoch": 0.770949720670391, |
| "grad_norm": 1.7578125, |
| "learning_rate": 0.00017747472488888622, |
| "loss": 0.3317188, |
| "memory(GiB)": 61.09, |
| "step": 8280, |
| "train_speed(iter/s)": 0.764703 |
| }, |
| { |
| "epoch": 0.7728119180633147, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.00017734477390430359, |
| "loss": 0.32900805, |
| "memory(GiB)": 61.09, |
| "step": 8300, |
| "train_speed(iter/s)": 0.764809 |
| }, |
| { |
| "epoch": 0.7746741154562383, |
| "grad_norm": 1.375, |
| "learning_rate": 0.0001772144970137812, |
| "loss": 0.33518133, |
| "memory(GiB)": 61.09, |
| "step": 8320, |
| "train_speed(iter/s)": 0.764899 |
| }, |
| { |
| "epoch": 0.776536312849162, |
| "grad_norm": 2.03125, |
| "learning_rate": 0.00017708389476626385, |
| "loss": 0.3434902, |
| "memory(GiB)": 61.09, |
| "step": 8340, |
| "train_speed(iter/s)": 0.764975 |
| }, |
| { |
| "epoch": 0.7783985102420856, |
| "grad_norm": 1.6640625, |
| "learning_rate": 0.00017695296771206732, |
| "loss": 0.32565141, |
| "memory(GiB)": 61.09, |
| "step": 8360, |
| "train_speed(iter/s)": 0.765046 |
| }, |
| { |
| "epoch": 0.7802607076350093, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00017682171640287598, |
| "loss": 0.33638844, |
| "memory(GiB)": 61.09, |
| "step": 8380, |
| "train_speed(iter/s)": 0.765188 |
| }, |
| { |
| "epoch": 0.7821229050279329, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.00017669014139174054, |
| "loss": 0.33523929, |
| "memory(GiB)": 61.09, |
| "step": 8400, |
| "train_speed(iter/s)": 0.765285 |
| }, |
| { |
| "epoch": 0.7839851024208566, |
| "grad_norm": 1.625, |
| "learning_rate": 0.0001765582432330757, |
| "loss": 0.34908547, |
| "memory(GiB)": 61.09, |
| "step": 8420, |
| "train_speed(iter/s)": 0.765416 |
| }, |
| { |
| "epoch": 0.7858472998137802, |
| "grad_norm": 1.53125, |
| "learning_rate": 0.00017642602248265767, |
| "loss": 0.31645274, |
| "memory(GiB)": 61.09, |
| "step": 8440, |
| "train_speed(iter/s)": 0.765534 |
| }, |
| { |
| "epoch": 0.7877094972067039, |
| "grad_norm": 1.484375, |
| "learning_rate": 0.0001762934796976222, |
| "loss": 0.34835916, |
| "memory(GiB)": 61.09, |
| "step": 8460, |
| "train_speed(iter/s)": 0.765627 |
| }, |
| { |
| "epoch": 0.7895716945996276, |
| "grad_norm": 1.5390625, |
| "learning_rate": 0.00017616061543646177, |
| "loss": 0.34466844, |
| "memory(GiB)": 61.09, |
| "step": 8480, |
| "train_speed(iter/s)": 0.765714 |
| }, |
| { |
| "epoch": 0.7914338919925512, |
| "grad_norm": 1.6796875, |
| "learning_rate": 0.00017602743025902355, |
| "loss": 0.34232378, |
| "memory(GiB)": 61.09, |
| "step": 8500, |
| "train_speed(iter/s)": 0.765833 |
| }, |
| { |
| "epoch": 0.7932960893854749, |
| "grad_norm": 1.453125, |
| "learning_rate": 0.00017589392472650697, |
| "loss": 0.33207898, |
| "memory(GiB)": 61.09, |
| "step": 8520, |
| "train_speed(iter/s)": 0.765895 |
| }, |
| { |
| "epoch": 0.7951582867783985, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.00017576009940146128, |
| "loss": 0.31665926, |
| "memory(GiB)": 61.09, |
| "step": 8540, |
| "train_speed(iter/s)": 0.765888 |
| }, |
| { |
| "epoch": 0.7970204841713222, |
| "grad_norm": 1.7265625, |
| "learning_rate": 0.00017562595484778322, |
| "loss": 0.33440113, |
| "memory(GiB)": 61.09, |
| "step": 8560, |
| "train_speed(iter/s)": 0.766027 |
| }, |
| { |
| "epoch": 0.7988826815642458, |
| "grad_norm": 2.328125, |
| "learning_rate": 0.0001754914916307147, |
| "loss": 0.34350667, |
| "memory(GiB)": 61.09, |
| "step": 8580, |
| "train_speed(iter/s)": 0.766145 |
| }, |
| { |
| "epoch": 0.8007448789571695, |
| "grad_norm": 1.359375, |
| "learning_rate": 0.00017535671031684041, |
| "loss": 0.33459611, |
| "memory(GiB)": 61.09, |
| "step": 8600, |
| "train_speed(iter/s)": 0.76629 |
| }, |
| { |
| "epoch": 0.8026070763500931, |
| "grad_norm": 1.6015625, |
| "learning_rate": 0.00017522161147408532, |
| "loss": 0.34758761, |
| "memory(GiB)": 61.09, |
| "step": 8620, |
| "train_speed(iter/s)": 0.766406 |
| }, |
| { |
| "epoch": 0.8044692737430168, |
| "grad_norm": 1.640625, |
| "learning_rate": 0.0001750861956717124, |
| "loss": 0.31787, |
| "memory(GiB)": 61.09, |
| "step": 8640, |
| "train_speed(iter/s)": 0.766498 |
| }, |
| { |
| "epoch": 0.8063314711359404, |
| "grad_norm": 1.65625, |
| "learning_rate": 0.00017495046348032015, |
| "loss": 0.3213912, |
| "memory(GiB)": 61.09, |
| "step": 8660, |
| "train_speed(iter/s)": 0.766583 |
| }, |
| { |
| "epoch": 0.8081936685288641, |
| "grad_norm": 1.6796875, |
| "learning_rate": 0.0001748144154718403, |
| "loss": 0.34279273, |
| "memory(GiB)": 61.09, |
| "step": 8680, |
| "train_speed(iter/s)": 0.766674 |
| }, |
| { |
| "epoch": 0.8100558659217877, |
| "grad_norm": 1.59375, |
| "learning_rate": 0.00017467805221953525, |
| "loss": 0.30648351, |
| "memory(GiB)": 61.09, |
| "step": 8700, |
| "train_speed(iter/s)": 0.76673 |
| }, |
| { |
| "epoch": 0.8119180633147114, |
| "grad_norm": 1.6484375, |
| "learning_rate": 0.00017454137429799582, |
| "loss": 0.34818337, |
| "memory(GiB)": 61.09, |
| "step": 8720, |
| "train_speed(iter/s)": 0.766821 |
| }, |
| { |
| "epoch": 0.813780260707635, |
| "grad_norm": 1.8515625, |
| "learning_rate": 0.0001744043822831387, |
| "loss": 0.35243084, |
| "memory(GiB)": 61.09, |
| "step": 8740, |
| "train_speed(iter/s)": 0.766816 |
| }, |
| { |
| "epoch": 0.8156424581005587, |
| "grad_norm": 1.859375, |
| "learning_rate": 0.00017426707675220407, |
| "loss": 0.3365181, |
| "memory(GiB)": 61.09, |
| "step": 8760, |
| "train_speed(iter/s)": 0.766823 |
| }, |
| { |
| "epoch": 0.8175046554934823, |
| "grad_norm": 2.0625, |
| "learning_rate": 0.00017412945828375315, |
| "loss": 0.30240817, |
| "memory(GiB)": 61.09, |
| "step": 8780, |
| "train_speed(iter/s)": 0.766921 |
| }, |
| { |
| "epoch": 0.819366852886406, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.00017399152745766584, |
| "loss": 0.32030184, |
| "memory(GiB)": 61.09, |
| "step": 8800, |
| "train_speed(iter/s)": 0.767073 |
| }, |
| { |
| "epoch": 0.8212290502793296, |
| "grad_norm": 1.765625, |
| "learning_rate": 0.00017385328485513806, |
| "loss": 0.3264648, |
| "memory(GiB)": 61.09, |
| "step": 8820, |
| "train_speed(iter/s)": 0.767141 |
| }, |
| { |
| "epoch": 0.8230912476722533, |
| "grad_norm": 1.5234375, |
| "learning_rate": 0.0001737147310586797, |
| "loss": 0.32821541, |
| "memory(GiB)": 61.09, |
| "step": 8840, |
| "train_speed(iter/s)": 0.76719 |
| }, |
| { |
| "epoch": 0.8249534450651769, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.0001735758666521117, |
| "loss": 0.35671031, |
| "memory(GiB)": 61.09, |
| "step": 8860, |
| "train_speed(iter/s)": 0.767275 |
| }, |
| { |
| "epoch": 0.8268156424581006, |
| "grad_norm": 1.5234375, |
| "learning_rate": 0.00017343669222056395, |
| "loss": 0.33457453, |
| "memory(GiB)": 61.09, |
| "step": 8880, |
| "train_speed(iter/s)": 0.767361 |
| }, |
| { |
| "epoch": 0.8286778398510242, |
| "grad_norm": 1.6640625, |
| "learning_rate": 0.00017329720835047255, |
| "loss": 0.32296019, |
| "memory(GiB)": 61.09, |
| "step": 8900, |
| "train_speed(iter/s)": 0.767461 |
| }, |
| { |
| "epoch": 0.8305400372439479, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.00017315741562957767, |
| "loss": 0.32958663, |
| "memory(GiB)": 61.09, |
| "step": 8920, |
| "train_speed(iter/s)": 0.767551 |
| }, |
| { |
| "epoch": 0.8324022346368715, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.0001730173146469207, |
| "loss": 0.32108679, |
| "memory(GiB)": 61.09, |
| "step": 8940, |
| "train_speed(iter/s)": 0.767576 |
| }, |
| { |
| "epoch": 0.8342644320297952, |
| "grad_norm": 1.578125, |
| "learning_rate": 0.00017287690599284198, |
| "loss": 0.32883315, |
| "memory(GiB)": 61.09, |
| "step": 8960, |
| "train_speed(iter/s)": 0.767694 |
| }, |
| { |
| "epoch": 0.8361266294227188, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.00017273619025897835, |
| "loss": 0.33148849, |
| "memory(GiB)": 61.09, |
| "step": 8980, |
| "train_speed(iter/s)": 0.767813 |
| }, |
| { |
| "epoch": 0.8379888268156425, |
| "grad_norm": 2.09375, |
| "learning_rate": 0.00017259516803826052, |
| "loss": 0.32300215, |
| "memory(GiB)": 61.09, |
| "step": 9000, |
| "train_speed(iter/s)": 0.767839 |
| }, |
| { |
| "epoch": 0.839851024208566, |
| "grad_norm": 1.578125, |
| "learning_rate": 0.00017245383992491072, |
| "loss": 0.3056927, |
| "memory(GiB)": 61.09, |
| "step": 9020, |
| "train_speed(iter/s)": 0.767908 |
| }, |
| { |
| "epoch": 0.8417132216014898, |
| "grad_norm": 1.8203125, |
| "learning_rate": 0.00017231220651443997, |
| "loss": 0.32388425, |
| "memory(GiB)": 61.09, |
| "step": 9040, |
| "train_speed(iter/s)": 0.767979 |
| }, |
| { |
| "epoch": 0.8435754189944135, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00017217026840364588, |
| "loss": 0.32691157, |
| "memory(GiB)": 61.09, |
| "step": 9060, |
| "train_speed(iter/s)": 0.768042 |
| }, |
| { |
| "epoch": 0.845437616387337, |
| "grad_norm": 1.4296875, |
| "learning_rate": 0.0001720280261906098, |
| "loss": 0.31360347, |
| "memory(GiB)": 61.09, |
| "step": 9080, |
| "train_speed(iter/s)": 0.768054 |
| }, |
| { |
| "epoch": 0.8472998137802608, |
| "grad_norm": 1.4921875, |
| "learning_rate": 0.0001718854804746946, |
| "loss": 0.33109579, |
| "memory(GiB)": 61.09, |
| "step": 9100, |
| "train_speed(iter/s)": 0.76809 |
| }, |
| { |
| "epoch": 0.8491620111731844, |
| "grad_norm": 1.6796875, |
| "learning_rate": 0.00017174263185654195, |
| "loss": 0.315205, |
| "memory(GiB)": 61.09, |
| "step": 9120, |
| "train_speed(iter/s)": 0.768076 |
| }, |
| { |
| "epoch": 0.851024208566108, |
| "grad_norm": 2.15625, |
| "learning_rate": 0.00017159948093806987, |
| "loss": 0.33985429, |
| "memory(GiB)": 61.09, |
| "step": 9140, |
| "train_speed(iter/s)": 0.768161 |
| }, |
| { |
| "epoch": 0.8528864059590316, |
| "grad_norm": 1.4375, |
| "learning_rate": 0.00017145602832247024, |
| "loss": 0.32494226, |
| "memory(GiB)": 61.09, |
| "step": 9160, |
| "train_speed(iter/s)": 0.768242 |
| }, |
| { |
| "epoch": 0.8547486033519553, |
| "grad_norm": 1.46875, |
| "learning_rate": 0.00017131227461420604, |
| "loss": 0.33872006, |
| "memory(GiB)": 61.09, |
| "step": 9180, |
| "train_speed(iter/s)": 0.76826 |
| }, |
| { |
| "epoch": 0.8566108007448789, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.00017116822041900908, |
| "loss": 0.33576746, |
| "memory(GiB)": 61.09, |
| "step": 9200, |
| "train_speed(iter/s)": 0.768325 |
| }, |
| { |
| "epoch": 0.8584729981378026, |
| "grad_norm": 1.53125, |
| "learning_rate": 0.00017102386634387733, |
| "loss": 0.32657223, |
| "memory(GiB)": 61.09, |
| "step": 9220, |
| "train_speed(iter/s)": 0.768388 |
| }, |
| { |
| "epoch": 0.8603351955307262, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.00017087921299707227, |
| "loss": 0.32885175, |
| "memory(GiB)": 61.09, |
| "step": 9240, |
| "train_speed(iter/s)": 0.768406 |
| }, |
| { |
| "epoch": 0.8621973929236499, |
| "grad_norm": 1.6484375, |
| "learning_rate": 0.00017073426098811645, |
| "loss": 0.3327816, |
| "memory(GiB)": 61.09, |
| "step": 9260, |
| "train_speed(iter/s)": 0.768456 |
| }, |
| { |
| "epoch": 0.8640595903165735, |
| "grad_norm": 1.6171875, |
| "learning_rate": 0.00017058901092779097, |
| "loss": 0.31467597, |
| "memory(GiB)": 61.09, |
| "step": 9280, |
| "train_speed(iter/s)": 0.768571 |
| }, |
| { |
| "epoch": 0.8659217877094972, |
| "grad_norm": 1.3984375, |
| "learning_rate": 0.0001704434634281327, |
| "loss": 0.33298354, |
| "memory(GiB)": 61.09, |
| "step": 9300, |
| "train_speed(iter/s)": 0.76864 |
| }, |
| { |
| "epoch": 0.8677839851024208, |
| "grad_norm": 1.375, |
| "learning_rate": 0.00017029761910243186, |
| "loss": 0.33006821, |
| "memory(GiB)": 61.09, |
| "step": 9320, |
| "train_speed(iter/s)": 0.768736 |
| }, |
| { |
| "epoch": 0.8696461824953445, |
| "grad_norm": 1.5078125, |
| "learning_rate": 0.0001701514785652295, |
| "loss": 0.32524149, |
| "memory(GiB)": 61.09, |
| "step": 9340, |
| "train_speed(iter/s)": 0.768775 |
| }, |
| { |
| "epoch": 0.8715083798882681, |
| "grad_norm": 1.3515625, |
| "learning_rate": 0.00017000504243231464, |
| "loss": 0.31099353, |
| "memory(GiB)": 61.09, |
| "step": 9360, |
| "train_speed(iter/s)": 0.768862 |
| }, |
| { |
| "epoch": 0.8733705772811918, |
| "grad_norm": 1.7578125, |
| "learning_rate": 0.00016985831132072197, |
| "loss": 0.35106015, |
| "memory(GiB)": 61.09, |
| "step": 9380, |
| "train_speed(iter/s)": 0.768879 |
| }, |
| { |
| "epoch": 0.8752327746741154, |
| "grad_norm": 1.7265625, |
| "learning_rate": 0.0001697112858487291, |
| "loss": 0.34133174, |
| "memory(GiB)": 61.09, |
| "step": 9400, |
| "train_speed(iter/s)": 0.768871 |
| }, |
| { |
| "epoch": 0.8770949720670391, |
| "grad_norm": 1.7265625, |
| "learning_rate": 0.0001695639666358539, |
| "loss": 0.32821767, |
| "memory(GiB)": 61.09, |
| "step": 9420, |
| "train_speed(iter/s)": 0.768959 |
| }, |
| { |
| "epoch": 0.8789571694599627, |
| "grad_norm": 1.375, |
| "learning_rate": 0.00016941635430285206, |
| "loss": 0.30042934, |
| "memory(GiB)": 61.09, |
| "step": 9440, |
| "train_speed(iter/s)": 0.769071 |
| }, |
| { |
| "epoch": 0.8808193668528864, |
| "grad_norm": 1.765625, |
| "learning_rate": 0.00016926844947171444, |
| "loss": 0.30185919, |
| "memory(GiB)": 61.09, |
| "step": 9460, |
| "train_speed(iter/s)": 0.769178 |
| }, |
| { |
| "epoch": 0.88268156424581, |
| "grad_norm": 1.484375, |
| "learning_rate": 0.0001691202527656642, |
| "loss": 0.32123606, |
| "memory(GiB)": 61.09, |
| "step": 9480, |
| "train_speed(iter/s)": 0.76926 |
| }, |
| { |
| "epoch": 0.8845437616387337, |
| "grad_norm": 1.90625, |
| "learning_rate": 0.00016897176480915453, |
| "loss": 0.33448987, |
| "memory(GiB)": 61.09, |
| "step": 9500, |
| "train_speed(iter/s)": 0.769379 |
| }, |
| { |
| "epoch": 0.8864059590316573, |
| "grad_norm": 1.578125, |
| "learning_rate": 0.00016882298622786582, |
| "loss": 0.32368143, |
| "memory(GiB)": 61.09, |
| "step": 9520, |
| "train_speed(iter/s)": 0.769475 |
| }, |
| { |
| "epoch": 0.888268156424581, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.000168673917648703, |
| "loss": 0.28515136, |
| "memory(GiB)": 61.09, |
| "step": 9540, |
| "train_speed(iter/s)": 0.769458 |
| }, |
| { |
| "epoch": 0.8901303538175046, |
| "grad_norm": 1.75, |
| "learning_rate": 0.00016852455969979302, |
| "loss": 0.32660358, |
| "memory(GiB)": 61.09, |
| "step": 9560, |
| "train_speed(iter/s)": 0.769559 |
| }, |
| { |
| "epoch": 0.8919925512104283, |
| "grad_norm": 1.7578125, |
| "learning_rate": 0.00016837491301048213, |
| "loss": 0.31508813, |
| "memory(GiB)": 61.09, |
| "step": 9580, |
| "train_speed(iter/s)": 0.769661 |
| }, |
| { |
| "epoch": 0.8938547486033519, |
| "grad_norm": 1.671875, |
| "learning_rate": 0.00016822497821133322, |
| "loss": 0.3051513, |
| "memory(GiB)": 61.09, |
| "step": 9600, |
| "train_speed(iter/s)": 0.769764 |
| }, |
| { |
| "epoch": 0.8957169459962756, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00016807475593412315, |
| "loss": 0.29644897, |
| "memory(GiB)": 61.09, |
| "step": 9620, |
| "train_speed(iter/s)": 0.769809 |
| }, |
| { |
| "epoch": 0.8975791433891993, |
| "grad_norm": 1.375, |
| "learning_rate": 0.00016792424681184024, |
| "loss": 0.327367, |
| "memory(GiB)": 61.09, |
| "step": 9640, |
| "train_speed(iter/s)": 0.76979 |
| }, |
| { |
| "epoch": 0.8994413407821229, |
| "grad_norm": 1.59375, |
| "learning_rate": 0.00016777345147868137, |
| "loss": 0.28873682, |
| "memory(GiB)": 61.09, |
| "step": 9660, |
| "train_speed(iter/s)": 0.769893 |
| }, |
| { |
| "epoch": 0.9013035381750466, |
| "grad_norm": 1.671875, |
| "learning_rate": 0.00016762237057004945, |
| "loss": 0.3337399, |
| "memory(GiB)": 61.09, |
| "step": 9680, |
| "train_speed(iter/s)": 0.769961 |
| }, |
| { |
| "epoch": 0.9031657355679702, |
| "grad_norm": 1.4140625, |
| "learning_rate": 0.0001674710047225507, |
| "loss": 0.31436739, |
| "memory(GiB)": 61.09, |
| "step": 9700, |
| "train_speed(iter/s)": 0.770068 |
| }, |
| { |
| "epoch": 0.9050279329608939, |
| "grad_norm": 1.5625, |
| "learning_rate": 0.00016731935457399205, |
| "loss": 0.32119958, |
| "memory(GiB)": 61.09, |
| "step": 9720, |
| "train_speed(iter/s)": 0.770142 |
| }, |
| { |
| "epoch": 0.9068901303538175, |
| "grad_norm": 1.53125, |
| "learning_rate": 0.00016716742076337826, |
| "loss": 0.31439784, |
| "memory(GiB)": 61.09, |
| "step": 9740, |
| "train_speed(iter/s)": 0.7702 |
| }, |
| { |
| "epoch": 0.9087523277467412, |
| "grad_norm": 1.5859375, |
| "learning_rate": 0.00016701520393090947, |
| "loss": 0.30380776, |
| "memory(GiB)": 61.09, |
| "step": 9760, |
| "train_speed(iter/s)": 0.770272 |
| }, |
| { |
| "epoch": 0.9106145251396648, |
| "grad_norm": 1.921875, |
| "learning_rate": 0.00016686270471797832, |
| "loss": 0.32591763, |
| "memory(GiB)": 61.09, |
| "step": 9780, |
| "train_speed(iter/s)": 0.770365 |
| }, |
| { |
| "epoch": 0.9124767225325885, |
| "grad_norm": 1.6328125, |
| "learning_rate": 0.00016670992376716735, |
| "loss": 0.31843548, |
| "memory(GiB)": 61.09, |
| "step": 9800, |
| "train_speed(iter/s)": 0.770333 |
| }, |
| { |
| "epoch": 0.9143389199255121, |
| "grad_norm": 1.6484375, |
| "learning_rate": 0.0001665568617222462, |
| "loss": 0.33044133, |
| "memory(GiB)": 61.09, |
| "step": 9820, |
| "train_speed(iter/s)": 0.770405 |
| }, |
| { |
| "epoch": 0.9162011173184358, |
| "grad_norm": 2.09375, |
| "learning_rate": 0.00016640351922816898, |
| "loss": 0.30755372, |
| "memory(GiB)": 61.09, |
| "step": 9840, |
| "train_speed(iter/s)": 0.770498 |
| }, |
| { |
| "epoch": 0.9180633147113594, |
| "grad_norm": 1.6640625, |
| "learning_rate": 0.0001662498969310715, |
| "loss": 0.31706464, |
| "memory(GiB)": 61.09, |
| "step": 9860, |
| "train_speed(iter/s)": 0.770551 |
| }, |
| { |
| "epoch": 0.9199255121042831, |
| "grad_norm": 1.65625, |
| "learning_rate": 0.00016609599547826864, |
| "loss": 0.32224698, |
| "memory(GiB)": 61.09, |
| "step": 9880, |
| "train_speed(iter/s)": 0.770596 |
| }, |
| { |
| "epoch": 0.9217877094972067, |
| "grad_norm": 1.9453125, |
| "learning_rate": 0.00016594181551825147, |
| "loss": 0.3317049, |
| "memory(GiB)": 61.09, |
| "step": 9900, |
| "train_speed(iter/s)": 0.770655 |
| }, |
| { |
| "epoch": 0.9236499068901304, |
| "grad_norm": 1.859375, |
| "learning_rate": 0.00016578735770068464, |
| "loss": 0.28957124, |
| "memory(GiB)": 61.09, |
| "step": 9920, |
| "train_speed(iter/s)": 0.7707 |
| }, |
| { |
| "epoch": 0.925512104283054, |
| "grad_norm": 2.203125, |
| "learning_rate": 0.00016563262267640355, |
| "loss": 0.31882176, |
| "memory(GiB)": 61.09, |
| "step": 9940, |
| "train_speed(iter/s)": 0.77078 |
| }, |
| { |
| "epoch": 0.9273743016759777, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00016547761109741177, |
| "loss": 0.29138837, |
| "memory(GiB)": 61.09, |
| "step": 9960, |
| "train_speed(iter/s)": 0.770913 |
| }, |
| { |
| "epoch": 0.9292364990689013, |
| "grad_norm": 1.5234375, |
| "learning_rate": 0.00016532232361687806, |
| "loss": 0.32336102, |
| "memory(GiB)": 61.09, |
| "step": 9980, |
| "train_speed(iter/s)": 0.771 |
| }, |
| { |
| "epoch": 0.931098696461825, |
| "grad_norm": 1.359375, |
| "learning_rate": 0.00016516676088913386, |
| "loss": 0.30521989, |
| "memory(GiB)": 61.09, |
| "step": 10000, |
| "train_speed(iter/s)": 0.771064 |
| }, |
| { |
| "epoch": 0.931098696461825, |
| "eval_loss": 0.4275904893875122, |
| "eval_runtime": 76.1303, |
| "eval_samples_per_second": 182.385, |
| "eval_steps_per_second": 1.432, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.9329608938547486, |
| "grad_norm": 1.4296875, |
| "learning_rate": 0.0001650109235696703, |
| "loss": 0.3077956, |
| "memory(GiB)": 61.09, |
| "step": 10020, |
| "train_speed(iter/s)": 0.764779 |
| }, |
| { |
| "epoch": 0.9348230912476723, |
| "grad_norm": 1.5234375, |
| "learning_rate": 0.00016485481231513557, |
| "loss": 0.30179648, |
| "memory(GiB)": 61.09, |
| "step": 10040, |
| "train_speed(iter/s)": 0.764916 |
| }, |
| { |
| "epoch": 0.9366852886405959, |
| "grad_norm": 1.3359375, |
| "learning_rate": 0.00016469842778333224, |
| "loss": 0.29367597, |
| "memory(GiB)": 61.09, |
| "step": 10060, |
| "train_speed(iter/s)": 0.764969 |
| }, |
| { |
| "epoch": 0.9385474860335196, |
| "grad_norm": 1.578125, |
| "learning_rate": 0.00016454177063321423, |
| "loss": 0.32096641, |
| "memory(GiB)": 61.09, |
| "step": 10080, |
| "train_speed(iter/s)": 0.765074 |
| }, |
| { |
| "epoch": 0.9404096834264432, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.00016438484152488437, |
| "loss": 0.33787951, |
| "memory(GiB)": 61.09, |
| "step": 10100, |
| "train_speed(iter/s)": 0.765195 |
| }, |
| { |
| "epoch": 0.9422718808193669, |
| "grad_norm": 1.6796875, |
| "learning_rate": 0.00016422764111959117, |
| "loss": 0.33140016, |
| "memory(GiB)": 61.09, |
| "step": 10120, |
| "train_speed(iter/s)": 0.765314 |
| }, |
| { |
| "epoch": 0.9441340782122905, |
| "grad_norm": 1.3359375, |
| "learning_rate": 0.00016407017007972655, |
| "loss": 0.3267303, |
| "memory(GiB)": 61.09, |
| "step": 10140, |
| "train_speed(iter/s)": 0.765392 |
| }, |
| { |
| "epoch": 0.9459962756052142, |
| "grad_norm": 1.609375, |
| "learning_rate": 0.0001639124290688227, |
| "loss": 0.30506415, |
| "memory(GiB)": 61.09, |
| "step": 10160, |
| "train_speed(iter/s)": 0.765479 |
| }, |
| { |
| "epoch": 0.9478584729981379, |
| "grad_norm": 1.4140625, |
| "learning_rate": 0.0001637544187515494, |
| "loss": 0.32266409, |
| "memory(GiB)": 61.09, |
| "step": 10180, |
| "train_speed(iter/s)": 0.765539 |
| }, |
| { |
| "epoch": 0.9497206703910615, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.00016359613979371112, |
| "loss": 0.31371219, |
| "memory(GiB)": 61.09, |
| "step": 10200, |
| "train_speed(iter/s)": 0.765598 |
| }, |
| { |
| "epoch": 0.9515828677839852, |
| "grad_norm": 1.53125, |
| "learning_rate": 0.00016343759286224438, |
| "loss": 0.32517624, |
| "memory(GiB)": 61.09, |
| "step": 10220, |
| "train_speed(iter/s)": 0.765643 |
| }, |
| { |
| "epoch": 0.9534450651769087, |
| "grad_norm": 1.671875, |
| "learning_rate": 0.00016327877862521483, |
| "loss": 0.29492621, |
| "memory(GiB)": 61.09, |
| "step": 10240, |
| "train_speed(iter/s)": 0.765654 |
| }, |
| { |
| "epoch": 0.9553072625698324, |
| "grad_norm": 1.453125, |
| "learning_rate": 0.00016311969775181448, |
| "loss": 0.31768193, |
| "memory(GiB)": 61.09, |
| "step": 10260, |
| "train_speed(iter/s)": 0.765711 |
| }, |
| { |
| "epoch": 0.957169459962756, |
| "grad_norm": 1.5546875, |
| "learning_rate": 0.00016296035091235882, |
| "loss": 0.31737647, |
| "memory(GiB)": 61.09, |
| "step": 10280, |
| "train_speed(iter/s)": 0.765782 |
| }, |
| { |
| "epoch": 0.9590316573556797, |
| "grad_norm": 1.6328125, |
| "learning_rate": 0.00016280073877828407, |
| "loss": 0.29982419, |
| "memory(GiB)": 61.09, |
| "step": 10300, |
| "train_speed(iter/s)": 0.765856 |
| }, |
| { |
| "epoch": 0.9608938547486033, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.0001626408620221443, |
| "loss": 0.29922385, |
| "memory(GiB)": 61.09, |
| "step": 10320, |
| "train_speed(iter/s)": 0.765951 |
| }, |
| { |
| "epoch": 0.962756052141527, |
| "grad_norm": 1.3359375, |
| "learning_rate": 0.0001624807213176086, |
| "loss": 0.31153214, |
| "memory(GiB)": 61.09, |
| "step": 10340, |
| "train_speed(iter/s)": 0.765997 |
| }, |
| { |
| "epoch": 0.9646182495344506, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.00016232031733945824, |
| "loss": 0.32357917, |
| "memory(GiB)": 61.09, |
| "step": 10360, |
| "train_speed(iter/s)": 0.766017 |
| }, |
| { |
| "epoch": 0.9664804469273743, |
| "grad_norm": 1.5, |
| "learning_rate": 0.0001621596507635839, |
| "loss": 0.3050452, |
| "memory(GiB)": 61.09, |
| "step": 10380, |
| "train_speed(iter/s)": 0.766116 |
| }, |
| { |
| "epoch": 0.9683426443202979, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.00016199872226698277, |
| "loss": 0.30685663, |
| "memory(GiB)": 61.09, |
| "step": 10400, |
| "train_speed(iter/s)": 0.766219 |
| }, |
| { |
| "epoch": 0.9702048417132216, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.00016183753252775558, |
| "loss": 0.31804857, |
| "memory(GiB)": 61.09, |
| "step": 10420, |
| "train_speed(iter/s)": 0.766245 |
| }, |
| { |
| "epoch": 0.9720670391061452, |
| "grad_norm": 1.3203125, |
| "learning_rate": 0.00016167608222510392, |
| "loss": 0.28163512, |
| "memory(GiB)": 61.09, |
| "step": 10440, |
| "train_speed(iter/s)": 0.766326 |
| }, |
| { |
| "epoch": 0.9739292364990689, |
| "grad_norm": 1.5546875, |
| "learning_rate": 0.00016151437203932738, |
| "loss": 0.31363811, |
| "memory(GiB)": 61.09, |
| "step": 10460, |
| "train_speed(iter/s)": 0.766343 |
| }, |
| { |
| "epoch": 0.9757914338919925, |
| "grad_norm": 1.6484375, |
| "learning_rate": 0.00016135240265182046, |
| "loss": 0.31705124, |
| "memory(GiB)": 61.09, |
| "step": 10480, |
| "train_speed(iter/s)": 0.766417 |
| }, |
| { |
| "epoch": 0.9776536312849162, |
| "grad_norm": 1.6796875, |
| "learning_rate": 0.00016119017474507002, |
| "loss": 0.3048178, |
| "memory(GiB)": 61.09, |
| "step": 10500, |
| "train_speed(iter/s)": 0.766326 |
| }, |
| { |
| "epoch": 0.9795158286778398, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00016102768900265208, |
| "loss": 0.2878325, |
| "memory(GiB)": 61.09, |
| "step": 10520, |
| "train_speed(iter/s)": 0.766347 |
| }, |
| { |
| "epoch": 0.9813780260707635, |
| "grad_norm": 1.5, |
| "learning_rate": 0.00016086494610922917, |
| "loss": 0.30582104, |
| "memory(GiB)": 61.09, |
| "step": 10540, |
| "train_speed(iter/s)": 0.766427 |
| }, |
| { |
| "epoch": 0.9832402234636871, |
| "grad_norm": 1.703125, |
| "learning_rate": 0.00016070194675054747, |
| "loss": 0.30753689, |
| "memory(GiB)": 61.09, |
| "step": 10560, |
| "train_speed(iter/s)": 0.766485 |
| }, |
| { |
| "epoch": 0.9851024208566108, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.00016053869161343362, |
| "loss": 0.30274782, |
| "memory(GiB)": 61.09, |
| "step": 10580, |
| "train_speed(iter/s)": 0.766535 |
| }, |
| { |
| "epoch": 0.9869646182495344, |
| "grad_norm": 1.765625, |
| "learning_rate": 0.00016037518138579214, |
| "loss": 0.32131209, |
| "memory(GiB)": 61.09, |
| "step": 10600, |
| "train_speed(iter/s)": 0.766567 |
| }, |
| { |
| "epoch": 0.9888268156424581, |
| "grad_norm": 1.5703125, |
| "learning_rate": 0.00016021141675660248, |
| "loss": 0.30618265, |
| "memory(GiB)": 61.09, |
| "step": 10620, |
| "train_speed(iter/s)": 0.766659 |
| }, |
| { |
| "epoch": 0.9906890130353817, |
| "grad_norm": 1.859375, |
| "learning_rate": 0.00016004739841591592, |
| "loss": 0.29824829, |
| "memory(GiB)": 61.09, |
| "step": 10640, |
| "train_speed(iter/s)": 0.76669 |
| }, |
| { |
| "epoch": 0.9925512104283054, |
| "grad_norm": 1.75, |
| "learning_rate": 0.0001598831270548529, |
| "loss": 0.3051553, |
| "memory(GiB)": 61.09, |
| "step": 10660, |
| "train_speed(iter/s)": 0.766785 |
| }, |
| { |
| "epoch": 0.994413407821229, |
| "grad_norm": 1.5703125, |
| "learning_rate": 0.00015971860336559996, |
| "loss": 0.30366507, |
| "memory(GiB)": 61.09, |
| "step": 10680, |
| "train_speed(iter/s)": 0.766815 |
| }, |
| { |
| "epoch": 0.9962756052141527, |
| "grad_norm": 1.515625, |
| "learning_rate": 0.00015955382804140687, |
| "loss": 0.31902528, |
| "memory(GiB)": 61.09, |
| "step": 10700, |
| "train_speed(iter/s)": 0.76688 |
| }, |
| { |
| "epoch": 0.9981378026070763, |
| "grad_norm": 1.578125, |
| "learning_rate": 0.00015938880177658374, |
| "loss": 0.30881336, |
| "memory(GiB)": 61.09, |
| "step": 10720, |
| "train_speed(iter/s)": 0.766982 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 1.453125, |
| "learning_rate": 0.00015922352526649803, |
| "loss": 0.30783999, |
| "memory(GiB)": 61.09, |
| "step": 10740, |
| "train_speed(iter/s)": 0.767075 |
| }, |
| { |
| "epoch": 1.0018621973929236, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.00015905799920757163, |
| "loss": 0.2917798, |
| "memory(GiB)": 61.09, |
| "step": 10760, |
| "train_speed(iter/s)": 0.766514 |
| }, |
| { |
| "epoch": 1.0037243947858474, |
| "grad_norm": 1.7109375, |
| "learning_rate": 0.00015889222429727807, |
| "loss": 0.2846385, |
| "memory(GiB)": 61.09, |
| "step": 10780, |
| "train_speed(iter/s)": 0.766588 |
| }, |
| { |
| "epoch": 1.005586592178771, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.00015872620123413928, |
| "loss": 0.29505885, |
| "memory(GiB)": 61.09, |
| "step": 10800, |
| "train_speed(iter/s)": 0.76667 |
| }, |
| { |
| "epoch": 1.0074487895716946, |
| "grad_norm": 1.53125, |
| "learning_rate": 0.000158559930717723, |
| "loss": 0.27733662, |
| "memory(GiB)": 61.09, |
| "step": 10820, |
| "train_speed(iter/s)": 0.76677 |
| }, |
| { |
| "epoch": 1.0093109869646182, |
| "grad_norm": 1.6015625, |
| "learning_rate": 0.00015839341344863953, |
| "loss": 0.29862165, |
| "memory(GiB)": 61.09, |
| "step": 10840, |
| "train_speed(iter/s)": 0.766803 |
| }, |
| { |
| "epoch": 1.011173184357542, |
| "grad_norm": 1.7734375, |
| "learning_rate": 0.00015822665012853895, |
| "loss": 0.3004262, |
| "memory(GiB)": 61.09, |
| "step": 10860, |
| "train_speed(iter/s)": 0.766875 |
| }, |
| { |
| "epoch": 1.0130353817504656, |
| "grad_norm": 1.75, |
| "learning_rate": 0.00015805964146010815, |
| "loss": 0.32919612, |
| "memory(GiB)": 61.09, |
| "step": 10880, |
| "train_speed(iter/s)": 0.766929 |
| }, |
| { |
| "epoch": 1.0148975791433892, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.0001578923881470678, |
| "loss": 0.31359372, |
| "memory(GiB)": 61.09, |
| "step": 10900, |
| "train_speed(iter/s)": 0.766932 |
| }, |
| { |
| "epoch": 1.0167597765363128, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.00015772489089416943, |
| "loss": 0.3069638, |
| "memory(GiB)": 61.09, |
| "step": 10920, |
| "train_speed(iter/s)": 0.767026 |
| }, |
| { |
| "epoch": 1.0186219739292366, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.00015755715040719243, |
| "loss": 0.29340148, |
| "memory(GiB)": 61.09, |
| "step": 10940, |
| "train_speed(iter/s)": 0.7671 |
| }, |
| { |
| "epoch": 1.0204841713221602, |
| "grad_norm": 1.7109375, |
| "learning_rate": 0.00015738916739294122, |
| "loss": 0.29378917, |
| "memory(GiB)": 61.09, |
| "step": 10960, |
| "train_speed(iter/s)": 0.767181 |
| }, |
| { |
| "epoch": 1.0223463687150838, |
| "grad_norm": 1.5859375, |
| "learning_rate": 0.000157220942559242, |
| "loss": 0.30050142, |
| "memory(GiB)": 61.09, |
| "step": 10980, |
| "train_speed(iter/s)": 0.767282 |
| }, |
| { |
| "epoch": 1.0242085661080074, |
| "grad_norm": 1.6953125, |
| "learning_rate": 0.00015705247661493992, |
| "loss": 0.28569994, |
| "memory(GiB)": 61.09, |
| "step": 11000, |
| "train_speed(iter/s)": 0.767289 |
| }, |
| { |
| "epoch": 1.0260707635009312, |
| "grad_norm": 1.3984375, |
| "learning_rate": 0.00015688377026989626, |
| "loss": 0.28935771, |
| "memory(GiB)": 69.04, |
| "step": 11020, |
| "train_speed(iter/s)": 0.767361 |
| }, |
| { |
| "epoch": 1.0279329608938548, |
| "grad_norm": 1.5, |
| "learning_rate": 0.0001567148242349851, |
| "loss": 0.28245692, |
| "memory(GiB)": 31.11, |
| "step": 11040, |
| "train_speed(iter/s)": 0.767465 |
| }, |
| { |
| "epoch": 1.0297951582867784, |
| "grad_norm": 1.328125, |
| "learning_rate": 0.0001565456392220906, |
| "loss": 0.28632317, |
| "memory(GiB)": 31.11, |
| "step": 11060, |
| "train_speed(iter/s)": 0.767508 |
| }, |
| { |
| "epoch": 1.031657355679702, |
| "grad_norm": 1.484375, |
| "learning_rate": 0.00015637621594410383, |
| "loss": 0.30479004, |
| "memory(GiB)": 31.11, |
| "step": 11080, |
| "train_speed(iter/s)": 0.767594 |
| }, |
| { |
| "epoch": 1.0335195530726258, |
| "grad_norm": 1.453125, |
| "learning_rate": 0.00015620655511491984, |
| "loss": 0.27974844, |
| "memory(GiB)": 31.11, |
| "step": 11100, |
| "train_speed(iter/s)": 0.767652 |
| }, |
| { |
| "epoch": 1.0353817504655494, |
| "grad_norm": 1.5625, |
| "learning_rate": 0.00015603665744943473, |
| "loss": 0.26321175, |
| "memory(GiB)": 31.11, |
| "step": 11120, |
| "train_speed(iter/s)": 0.767723 |
| }, |
| { |
| "epoch": 1.037243947858473, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.0001558665236635424, |
| "loss": 0.27321138, |
| "memory(GiB)": 31.11, |
| "step": 11140, |
| "train_speed(iter/s)": 0.767764 |
| }, |
| { |
| "epoch": 1.0391061452513966, |
| "grad_norm": 1.765625, |
| "learning_rate": 0.00015569615447413184, |
| "loss": 0.28301878, |
| "memory(GiB)": 31.11, |
| "step": 11160, |
| "train_speed(iter/s)": 0.767868 |
| }, |
| { |
| "epoch": 1.0409683426443204, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00015552555059908382, |
| "loss": 0.28938766, |
| "memory(GiB)": 31.11, |
| "step": 11180, |
| "train_speed(iter/s)": 0.767873 |
| }, |
| { |
| "epoch": 1.042830540037244, |
| "grad_norm": 1.4375, |
| "learning_rate": 0.00015535471275726813, |
| "loss": 0.27508519, |
| "memory(GiB)": 31.11, |
| "step": 11200, |
| "train_speed(iter/s)": 0.767984 |
| }, |
| { |
| "epoch": 1.0446927374301676, |
| "grad_norm": 1.546875, |
| "learning_rate": 0.0001551836416685403, |
| "loss": 0.27724741, |
| "memory(GiB)": 31.11, |
| "step": 11220, |
| "train_speed(iter/s)": 0.768059 |
| }, |
| { |
| "epoch": 1.0465549348230911, |
| "grad_norm": 1.390625, |
| "learning_rate": 0.00015501233805373876, |
| "loss": 0.26528058, |
| "memory(GiB)": 31.11, |
| "step": 11240, |
| "train_speed(iter/s)": 0.768095 |
| }, |
| { |
| "epoch": 1.048417132216015, |
| "grad_norm": 1.3515625, |
| "learning_rate": 0.0001548408026346817, |
| "loss": 0.28528447, |
| "memory(GiB)": 31.11, |
| "step": 11260, |
| "train_speed(iter/s)": 0.768155 |
| }, |
| { |
| "epoch": 1.0502793296089385, |
| "grad_norm": 1.875, |
| "learning_rate": 0.0001546690361341641, |
| "loss": 0.29369481, |
| "memory(GiB)": 31.11, |
| "step": 11280, |
| "train_speed(iter/s)": 0.768221 |
| }, |
| { |
| "epoch": 1.0521415270018621, |
| "grad_norm": 2.15625, |
| "learning_rate": 0.0001544970392759546, |
| "loss": 0.29468055, |
| "memory(GiB)": 31.11, |
| "step": 11300, |
| "train_speed(iter/s)": 0.768263 |
| }, |
| { |
| "epoch": 1.0540037243947857, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.0001543248127847925, |
| "loss": 0.28771296, |
| "memory(GiB)": 31.11, |
| "step": 11320, |
| "train_speed(iter/s)": 0.768313 |
| }, |
| { |
| "epoch": 1.0558659217877095, |
| "grad_norm": 1.515625, |
| "learning_rate": 0.00015415235738638468, |
| "loss": 0.30344753, |
| "memory(GiB)": 31.11, |
| "step": 11340, |
| "train_speed(iter/s)": 0.768354 |
| }, |
| { |
| "epoch": 1.0577281191806331, |
| "grad_norm": 1.4296875, |
| "learning_rate": 0.00015397967380740264, |
| "loss": 0.283692, |
| "memory(GiB)": 31.11, |
| "step": 11360, |
| "train_speed(iter/s)": 0.76837 |
| }, |
| { |
| "epoch": 1.0595903165735567, |
| "grad_norm": 1.609375, |
| "learning_rate": 0.00015380676277547927, |
| "loss": 0.270574, |
| "memory(GiB)": 31.11, |
| "step": 11380, |
| "train_speed(iter/s)": 0.768391 |
| }, |
| { |
| "epoch": 1.0614525139664805, |
| "grad_norm": 1.703125, |
| "learning_rate": 0.0001536336250192059, |
| "loss": 0.2845499, |
| "memory(GiB)": 31.11, |
| "step": 11400, |
| "train_speed(iter/s)": 0.768457 |
| }, |
| { |
| "epoch": 1.0633147113594041, |
| "grad_norm": 1.609375, |
| "learning_rate": 0.00015346026126812924, |
| "loss": 0.26141229, |
| "memory(GiB)": 31.11, |
| "step": 11420, |
| "train_speed(iter/s)": 0.768433 |
| }, |
| { |
| "epoch": 1.0651769087523277, |
| "grad_norm": 1.4453125, |
| "learning_rate": 0.0001532866722527482, |
| "loss": 0.28916895, |
| "memory(GiB)": 31.11, |
| "step": 11440, |
| "train_speed(iter/s)": 0.768458 |
| }, |
| { |
| "epoch": 1.0670391061452513, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.000153112858704511, |
| "loss": 0.29057674, |
| "memory(GiB)": 31.11, |
| "step": 11460, |
| "train_speed(iter/s)": 0.768521 |
| }, |
| { |
| "epoch": 1.0689013035381751, |
| "grad_norm": 1.5703125, |
| "learning_rate": 0.0001529388213558118, |
| "loss": 0.29088101, |
| "memory(GiB)": 31.11, |
| "step": 11480, |
| "train_speed(iter/s)": 0.76856 |
| }, |
| { |
| "epoch": 1.0707635009310987, |
| "grad_norm": 1.3125, |
| "learning_rate": 0.0001527645609399879, |
| "loss": 0.31489258, |
| "memory(GiB)": 31.11, |
| "step": 11500, |
| "train_speed(iter/s)": 0.768607 |
| }, |
| { |
| "epoch": 1.0726256983240223, |
| "grad_norm": 1.7265625, |
| "learning_rate": 0.00015259007819131657, |
| "loss": 0.29952948, |
| "memory(GiB)": 31.11, |
| "step": 11520, |
| "train_speed(iter/s)": 0.768637 |
| }, |
| { |
| "epoch": 1.074487895716946, |
| "grad_norm": 1.53125, |
| "learning_rate": 0.0001524153738450118, |
| "loss": 0.28699169, |
| "memory(GiB)": 31.11, |
| "step": 11540, |
| "train_speed(iter/s)": 0.76869 |
| }, |
| { |
| "epoch": 1.0763500931098697, |
| "grad_norm": 2.078125, |
| "learning_rate": 0.00015224044863722136, |
| "loss": 0.2873806, |
| "memory(GiB)": 31.11, |
| "step": 11560, |
| "train_speed(iter/s)": 0.768776 |
| }, |
| { |
| "epoch": 1.0782122905027933, |
| "grad_norm": 1.6484375, |
| "learning_rate": 0.0001520653033050237, |
| "loss": 0.26496642, |
| "memory(GiB)": 31.11, |
| "step": 11580, |
| "train_speed(iter/s)": 0.768813 |
| }, |
| { |
| "epoch": 1.080074487895717, |
| "grad_norm": 1.453125, |
| "learning_rate": 0.0001518899385864248, |
| "loss": 0.29042182, |
| "memory(GiB)": 31.11, |
| "step": 11600, |
| "train_speed(iter/s)": 0.768799 |
| }, |
| { |
| "epoch": 1.0819366852886405, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.000151714355220355, |
| "loss": 0.26191492, |
| "memory(GiB)": 31.11, |
| "step": 11620, |
| "train_speed(iter/s)": 0.768819 |
| }, |
| { |
| "epoch": 1.0837988826815643, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.00015153855394666604, |
| "loss": 0.29819686, |
| "memory(GiB)": 31.11, |
| "step": 11640, |
| "train_speed(iter/s)": 0.768815 |
| }, |
| { |
| "epoch": 1.085661080074488, |
| "grad_norm": 1.625, |
| "learning_rate": 0.0001513625355061278, |
| "loss": 0.26143956, |
| "memory(GiB)": 31.11, |
| "step": 11660, |
| "train_speed(iter/s)": 0.76891 |
| }, |
| { |
| "epoch": 1.0875232774674115, |
| "grad_norm": 1.546875, |
| "learning_rate": 0.0001511863006404252, |
| "loss": 0.2926599, |
| "memory(GiB)": 31.11, |
| "step": 11680, |
| "train_speed(iter/s)": 0.76896 |
| }, |
| { |
| "epoch": 1.089385474860335, |
| "grad_norm": 1.390625, |
| "learning_rate": 0.0001510098500921552, |
| "loss": 0.26912103, |
| "memory(GiB)": 31.11, |
| "step": 11700, |
| "train_speed(iter/s)": 0.768983 |
| }, |
| { |
| "epoch": 1.091247672253259, |
| "grad_norm": 1.6875, |
| "learning_rate": 0.00015083318460482344, |
| "loss": 0.2636065, |
| "memory(GiB)": 31.11, |
| "step": 11720, |
| "train_speed(iter/s)": 0.769047 |
| }, |
| { |
| "epoch": 1.0931098696461825, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00015065630492284136, |
| "loss": 0.2775203, |
| "memory(GiB)": 31.11, |
| "step": 11740, |
| "train_speed(iter/s)": 0.769099 |
| }, |
| { |
| "epoch": 1.094972067039106, |
| "grad_norm": 1.6015625, |
| "learning_rate": 0.0001504792117915229, |
| "loss": 0.27552454, |
| "memory(GiB)": 31.11, |
| "step": 11760, |
| "train_speed(iter/s)": 0.769134 |
| }, |
| { |
| "epoch": 1.0968342644320297, |
| "grad_norm": 1.5234375, |
| "learning_rate": 0.0001503019059570815, |
| "loss": 0.23976431, |
| "memory(GiB)": 31.11, |
| "step": 11780, |
| "train_speed(iter/s)": 0.769196 |
| }, |
| { |
| "epoch": 1.0986964618249535, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.00015012438816662665, |
| "loss": 0.2832912, |
| "memory(GiB)": 31.11, |
| "step": 11800, |
| "train_speed(iter/s)": 0.76928 |
| }, |
| { |
| "epoch": 1.100558659217877, |
| "grad_norm": 1.453125, |
| "learning_rate": 0.00014994665916816117, |
| "loss": 0.28068914, |
| "memory(GiB)": 31.11, |
| "step": 11820, |
| "train_speed(iter/s)": 0.769327 |
| }, |
| { |
| "epoch": 1.1024208566108007, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00014976871971057775, |
| "loss": 0.26431036, |
| "memory(GiB)": 31.11, |
| "step": 11840, |
| "train_speed(iter/s)": 0.769318 |
| }, |
| { |
| "epoch": 1.1042830540037243, |
| "grad_norm": 1.546875, |
| "learning_rate": 0.0001495905705436559, |
| "loss": 0.28522236, |
| "memory(GiB)": 31.11, |
| "step": 11860, |
| "train_speed(iter/s)": 0.769396 |
| }, |
| { |
| "epoch": 1.106145251396648, |
| "grad_norm": 1.59375, |
| "learning_rate": 0.00014941221241805867, |
| "loss": 0.28218701, |
| "memory(GiB)": 31.11, |
| "step": 11880, |
| "train_speed(iter/s)": 0.76947 |
| }, |
| { |
| "epoch": 1.1080074487895717, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.00014923364608532982, |
| "loss": 0.26047482, |
| "memory(GiB)": 31.11, |
| "step": 11900, |
| "train_speed(iter/s)": 0.769479 |
| }, |
| { |
| "epoch": 1.1098696461824953, |
| "grad_norm": 1.5234375, |
| "learning_rate": 0.00014905487229789022, |
| "loss": 0.29261403, |
| "memory(GiB)": 31.11, |
| "step": 11920, |
| "train_speed(iter/s)": 0.769492 |
| }, |
| { |
| "epoch": 1.111731843575419, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.00014887589180903504, |
| "loss": 0.2624248, |
| "memory(GiB)": 31.11, |
| "step": 11940, |
| "train_speed(iter/s)": 0.769556 |
| }, |
| { |
| "epoch": 1.1135940409683427, |
| "grad_norm": 1.375, |
| "learning_rate": 0.0001486967053729303, |
| "loss": 0.25690923, |
| "memory(GiB)": 31.11, |
| "step": 11960, |
| "train_speed(iter/s)": 0.76963 |
| }, |
| { |
| "epoch": 1.1154562383612663, |
| "grad_norm": 1.390625, |
| "learning_rate": 0.00014851731374460988, |
| "loss": 0.28093395, |
| "memory(GiB)": 31.11, |
| "step": 11980, |
| "train_speed(iter/s)": 0.769697 |
| }, |
| { |
| "epoch": 1.1173184357541899, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.0001483377176799723, |
| "loss": 0.28581772, |
| "memory(GiB)": 31.11, |
| "step": 12000, |
| "train_speed(iter/s)": 0.769759 |
| }, |
| { |
| "epoch": 1.1173184357541899, |
| "eval_loss": 0.41399523615837097, |
| "eval_runtime": 77.0372, |
| "eval_samples_per_second": 180.237, |
| "eval_steps_per_second": 1.415, |
| "step": 12000 |
| }, |
| { |
| "epoch": 1.1191806331471137, |
| "grad_norm": 1.5703125, |
| "learning_rate": 0.0001481579179357774, |
| "loss": 0.28383641, |
| "memory(GiB)": 41.92, |
| "step": 12020, |
| "train_speed(iter/s)": 0.7643 |
| }, |
| { |
| "epoch": 1.1210428305400373, |
| "grad_norm": 1.6953125, |
| "learning_rate": 0.0001479779152696434, |
| "loss": 0.30731783, |
| "memory(GiB)": 41.92, |
| "step": 12040, |
| "train_speed(iter/s)": 0.764324 |
| }, |
| { |
| "epoch": 1.1229050279329609, |
| "grad_norm": 1.4296875, |
| "learning_rate": 0.00014779771044004346, |
| "loss": 0.26912603, |
| "memory(GiB)": 41.92, |
| "step": 12060, |
| "train_speed(iter/s)": 0.764388 |
| }, |
| { |
| "epoch": 1.1247672253258845, |
| "grad_norm": 1.5703125, |
| "learning_rate": 0.00014761730420630265, |
| "loss": 0.27448189, |
| "memory(GiB)": 41.92, |
| "step": 12080, |
| "train_speed(iter/s)": 0.764451 |
| }, |
| { |
| "epoch": 1.1266294227188083, |
| "grad_norm": 1.6953125, |
| "learning_rate": 0.00014743669732859464, |
| "loss": 0.28500652, |
| "memory(GiB)": 41.92, |
| "step": 12100, |
| "train_speed(iter/s)": 0.764504 |
| }, |
| { |
| "epoch": 1.1284916201117319, |
| "grad_norm": 1.140625, |
| "learning_rate": 0.0001472558905679386, |
| "loss": 0.26959038, |
| "memory(GiB)": 41.92, |
| "step": 12120, |
| "train_speed(iter/s)": 0.764615 |
| }, |
| { |
| "epoch": 1.1303538175046555, |
| "grad_norm": 1.390625, |
| "learning_rate": 0.00014707488468619596, |
| "loss": 0.28465414, |
| "memory(GiB)": 41.92, |
| "step": 12140, |
| "train_speed(iter/s)": 0.764627 |
| }, |
| { |
| "epoch": 1.132216014897579, |
| "grad_norm": 1.6328125, |
| "learning_rate": 0.00014689368044606708, |
| "loss": 0.2677213, |
| "memory(GiB)": 41.92, |
| "step": 12160, |
| "train_speed(iter/s)": 0.764649 |
| }, |
| { |
| "epoch": 1.1340782122905029, |
| "grad_norm": 1.6015625, |
| "learning_rate": 0.00014671227861108826, |
| "loss": 0.26625967, |
| "memory(GiB)": 41.92, |
| "step": 12180, |
| "train_speed(iter/s)": 0.764713 |
| }, |
| { |
| "epoch": 1.1359404096834265, |
| "grad_norm": 1.65625, |
| "learning_rate": 0.00014653067994562833, |
| "loss": 0.26140971, |
| "memory(GiB)": 41.92, |
| "step": 12200, |
| "train_speed(iter/s)": 0.764762 |
| }, |
| { |
| "epoch": 1.13780260707635, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.00014634888521488547, |
| "loss": 0.27225456, |
| "memory(GiB)": 41.92, |
| "step": 12220, |
| "train_speed(iter/s)": 0.764812 |
| }, |
| { |
| "epoch": 1.1396648044692737, |
| "grad_norm": 1.484375, |
| "learning_rate": 0.00014616689518488416, |
| "loss": 0.27721281, |
| "memory(GiB)": 41.92, |
| "step": 12240, |
| "train_speed(iter/s)": 0.764902 |
| }, |
| { |
| "epoch": 1.1415270018621975, |
| "grad_norm": 1.1328125, |
| "learning_rate": 0.00014598471062247158, |
| "loss": 0.27392306, |
| "memory(GiB)": 41.92, |
| "step": 12260, |
| "train_speed(iter/s)": 0.764971 |
| }, |
| { |
| "epoch": 1.143389199255121, |
| "grad_norm": 1.46875, |
| "learning_rate": 0.00014580233229531482, |
| "loss": 0.26567361, |
| "memory(GiB)": 41.92, |
| "step": 12280, |
| "train_speed(iter/s)": 0.764996 |
| }, |
| { |
| "epoch": 1.1452513966480447, |
| "grad_norm": 1.859375, |
| "learning_rate": 0.00014561976097189736, |
| "loss": 0.25700495, |
| "memory(GiB)": 41.92, |
| "step": 12300, |
| "train_speed(iter/s)": 0.765049 |
| }, |
| { |
| "epoch": 1.1471135940409685, |
| "grad_norm": 1.9140625, |
| "learning_rate": 0.00014543699742151586, |
| "loss": 0.26121554, |
| "memory(GiB)": 41.92, |
| "step": 12320, |
| "train_speed(iter/s)": 0.76512 |
| }, |
| { |
| "epoch": 1.148975791433892, |
| "grad_norm": 1.5, |
| "learning_rate": 0.00014525404241427697, |
| "loss": 0.26690025, |
| "memory(GiB)": 41.92, |
| "step": 12340, |
| "train_speed(iter/s)": 0.765123 |
| }, |
| { |
| "epoch": 1.1508379888268156, |
| "grad_norm": 1.484375, |
| "learning_rate": 0.0001450708967210942, |
| "loss": 0.26092589, |
| "memory(GiB)": 41.92, |
| "step": 12360, |
| "train_speed(iter/s)": 0.765214 |
| }, |
| { |
| "epoch": 1.1527001862197392, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.0001448875611136843, |
| "loss": 0.24557509, |
| "memory(GiB)": 41.92, |
| "step": 12380, |
| "train_speed(iter/s)": 0.765292 |
| }, |
| { |
| "epoch": 1.1545623836126628, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.00014470403636456455, |
| "loss": 0.26972108, |
| "memory(GiB)": 41.92, |
| "step": 12400, |
| "train_speed(iter/s)": 0.765326 |
| }, |
| { |
| "epoch": 1.1564245810055866, |
| "grad_norm": 3.21875, |
| "learning_rate": 0.000144520323247049, |
| "loss": 0.26979203, |
| "memory(GiB)": 41.92, |
| "step": 12420, |
| "train_speed(iter/s)": 0.765415 |
| }, |
| { |
| "epoch": 1.1582867783985102, |
| "grad_norm": 1.3125, |
| "learning_rate": 0.0001443364225352455, |
| "loss": 0.26735349, |
| "memory(GiB)": 41.92, |
| "step": 12440, |
| "train_speed(iter/s)": 0.765454 |
| }, |
| { |
| "epoch": 1.1601489757914338, |
| "grad_norm": 1.78125, |
| "learning_rate": 0.00014415233500405237, |
| "loss": 0.2736985, |
| "memory(GiB)": 41.92, |
| "step": 12460, |
| "train_speed(iter/s)": 0.7655 |
| }, |
| { |
| "epoch": 1.1620111731843576, |
| "grad_norm": 1.859375, |
| "learning_rate": 0.0001439680614291551, |
| "loss": 0.27279396, |
| "memory(GiB)": 41.92, |
| "step": 12480, |
| "train_speed(iter/s)": 0.765512 |
| }, |
| { |
| "epoch": 1.1638733705772812, |
| "grad_norm": 1.53125, |
| "learning_rate": 0.00014378360258702311, |
| "loss": 0.2811708, |
| "memory(GiB)": 41.92, |
| "step": 12500, |
| "train_speed(iter/s)": 0.765564 |
| }, |
| { |
| "epoch": 1.1657355679702048, |
| "grad_norm": 1.3984375, |
| "learning_rate": 0.00014359895925490656, |
| "loss": 0.27203159, |
| "memory(GiB)": 41.92, |
| "step": 12520, |
| "train_speed(iter/s)": 0.765622 |
| }, |
| { |
| "epoch": 1.1675977653631284, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.00014341413221083282, |
| "loss": 0.25222397, |
| "memory(GiB)": 41.92, |
| "step": 12540, |
| "train_speed(iter/s)": 0.76566 |
| }, |
| { |
| "epoch": 1.169459962756052, |
| "grad_norm": 1.609375, |
| "learning_rate": 0.0001432291222336035, |
| "loss": 0.28008556, |
| "memory(GiB)": 41.92, |
| "step": 12560, |
| "train_speed(iter/s)": 0.765712 |
| }, |
| { |
| "epoch": 1.1713221601489758, |
| "grad_norm": 1.4921875, |
| "learning_rate": 0.00014304393010279104, |
| "loss": 0.27260323, |
| "memory(GiB)": 41.92, |
| "step": 12580, |
| "train_speed(iter/s)": 0.765741 |
| }, |
| { |
| "epoch": 1.1731843575418994, |
| "grad_norm": 1.1484375, |
| "learning_rate": 0.0001428585565987353, |
| "loss": 0.26112268, |
| "memory(GiB)": 41.92, |
| "step": 12600, |
| "train_speed(iter/s)": 0.765776 |
| }, |
| { |
| "epoch": 1.175046554934823, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.0001426730025025405, |
| "loss": 0.27053833, |
| "memory(GiB)": 41.92, |
| "step": 12620, |
| "train_speed(iter/s)": 0.765859 |
| }, |
| { |
| "epoch": 1.1769087523277468, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.00014248726859607174, |
| "loss": 0.25856524, |
| "memory(GiB)": 41.92, |
| "step": 12640, |
| "train_speed(iter/s)": 0.765898 |
| }, |
| { |
| "epoch": 1.1787709497206704, |
| "grad_norm": 1.3203125, |
| "learning_rate": 0.00014230135566195185, |
| "loss": 0.27097459, |
| "memory(GiB)": 41.92, |
| "step": 12660, |
| "train_speed(iter/s)": 0.765943 |
| }, |
| { |
| "epoch": 1.180633147113594, |
| "grad_norm": 1.5078125, |
| "learning_rate": 0.000142115264483558, |
| "loss": 0.24883418, |
| "memory(GiB)": 41.92, |
| "step": 12680, |
| "train_speed(iter/s)": 0.766016 |
| }, |
| { |
| "epoch": 1.1824953445065176, |
| "grad_norm": 1.96875, |
| "learning_rate": 0.00014192899584501841, |
| "loss": 0.27689738, |
| "memory(GiB)": 41.92, |
| "step": 12700, |
| "train_speed(iter/s)": 0.766087 |
| }, |
| { |
| "epoch": 1.1843575418994414, |
| "grad_norm": 1.59375, |
| "learning_rate": 0.00014174255053120904, |
| "loss": 0.26999578, |
| "memory(GiB)": 41.92, |
| "step": 12720, |
| "train_speed(iter/s)": 0.766176 |
| }, |
| { |
| "epoch": 1.186219739292365, |
| "grad_norm": 1.1484375, |
| "learning_rate": 0.00014155592932775033, |
| "loss": 0.25672078, |
| "memory(GiB)": 41.92, |
| "step": 12740, |
| "train_speed(iter/s)": 0.766239 |
| }, |
| { |
| "epoch": 1.1880819366852886, |
| "grad_norm": 1.6875, |
| "learning_rate": 0.0001413691330210039, |
| "loss": 0.24652848, |
| "memory(GiB)": 41.92, |
| "step": 12760, |
| "train_speed(iter/s)": 0.766343 |
| }, |
| { |
| "epoch": 1.1899441340782122, |
| "grad_norm": 1.46875, |
| "learning_rate": 0.0001411821623980691, |
| "loss": 0.26364939, |
| "memory(GiB)": 41.92, |
| "step": 12780, |
| "train_speed(iter/s)": 0.766431 |
| }, |
| { |
| "epoch": 1.191806331471136, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00014099501824677992, |
| "loss": 0.27025316, |
| "memory(GiB)": 41.92, |
| "step": 12800, |
| "train_speed(iter/s)": 0.766497 |
| }, |
| { |
| "epoch": 1.1936685288640596, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.00014080770135570142, |
| "loss": 0.22930434, |
| "memory(GiB)": 41.92, |
| "step": 12820, |
| "train_speed(iter/s)": 0.766501 |
| }, |
| { |
| "epoch": 1.1955307262569832, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.0001406202125141266, |
| "loss": 0.23245554, |
| "memory(GiB)": 41.92, |
| "step": 12840, |
| "train_speed(iter/s)": 0.766545 |
| }, |
| { |
| "epoch": 1.197392923649907, |
| "grad_norm": 1.5078125, |
| "learning_rate": 0.000140432552512073, |
| "loss": 0.25615866, |
| "memory(GiB)": 41.92, |
| "step": 12860, |
| "train_speed(iter/s)": 0.766559 |
| }, |
| { |
| "epoch": 1.1992551210428306, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.00014024472214027939, |
| "loss": 0.24658422, |
| "memory(GiB)": 41.92, |
| "step": 12880, |
| "train_speed(iter/s)": 0.766583 |
| }, |
| { |
| "epoch": 1.2011173184357542, |
| "grad_norm": 1.46875, |
| "learning_rate": 0.00014005672219020234, |
| "loss": 0.24690557, |
| "memory(GiB)": 41.92, |
| "step": 12900, |
| "train_speed(iter/s)": 0.766626 |
| }, |
| { |
| "epoch": 1.2029795158286778, |
| "grad_norm": 1.6796875, |
| "learning_rate": 0.00013986855345401303, |
| "loss": 0.27605991, |
| "memory(GiB)": 41.92, |
| "step": 12920, |
| "train_speed(iter/s)": 0.76671 |
| }, |
| { |
| "epoch": 1.2048417132216014, |
| "grad_norm": 1.5703125, |
| "learning_rate": 0.00013968021672459388, |
| "loss": 0.24593453, |
| "memory(GiB)": 41.92, |
| "step": 12940, |
| "train_speed(iter/s)": 0.766726 |
| }, |
| { |
| "epoch": 1.2067039106145252, |
| "grad_norm": 1.390625, |
| "learning_rate": 0.00013949171279553516, |
| "loss": 0.2621069, |
| "memory(GiB)": 41.92, |
| "step": 12960, |
| "train_speed(iter/s)": 0.766796 |
| }, |
| { |
| "epoch": 1.2085661080074488, |
| "grad_norm": 1.71875, |
| "learning_rate": 0.00013930304246113158, |
| "loss": 0.26049728, |
| "memory(GiB)": 41.92, |
| "step": 12980, |
| "train_speed(iter/s)": 0.766867 |
| }, |
| { |
| "epoch": 1.2104283054003724, |
| "grad_norm": 1.4140625, |
| "learning_rate": 0.0001391142065163791, |
| "loss": 0.25473256, |
| "memory(GiB)": 41.92, |
| "step": 13000, |
| "train_speed(iter/s)": 0.76689 |
| }, |
| { |
| "epoch": 1.2122905027932962, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.00013892520575697163, |
| "loss": 0.25336313, |
| "memory(GiB)": 41.92, |
| "step": 13020, |
| "train_speed(iter/s)": 0.766867 |
| }, |
| { |
| "epoch": 1.2141527001862198, |
| "grad_norm": 1.359375, |
| "learning_rate": 0.0001387360409792973, |
| "loss": 0.26464946, |
| "memory(GiB)": 41.92, |
| "step": 13040, |
| "train_speed(iter/s)": 0.766959 |
| }, |
| { |
| "epoch": 1.2160148975791434, |
| "grad_norm": 1.453125, |
| "learning_rate": 0.00013854671298043557, |
| "loss": 0.26744711, |
| "memory(GiB)": 41.92, |
| "step": 13060, |
| "train_speed(iter/s)": 0.766995 |
| }, |
| { |
| "epoch": 1.217877094972067, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.0001383572225581535, |
| "loss": 0.25843668, |
| "memory(GiB)": 41.92, |
| "step": 13080, |
| "train_speed(iter/s)": 0.766992 |
| }, |
| { |
| "epoch": 1.2197392923649906, |
| "grad_norm": 1.3125, |
| "learning_rate": 0.00013816757051090273, |
| "loss": 0.24495487, |
| "memory(GiB)": 41.92, |
| "step": 13100, |
| "train_speed(iter/s)": 0.767065 |
| }, |
| { |
| "epoch": 1.2216014897579144, |
| "grad_norm": 1.453125, |
| "learning_rate": 0.00013797775763781578, |
| "loss": 0.24947252, |
| "memory(GiB)": 41.92, |
| "step": 13120, |
| "train_speed(iter/s)": 0.767141 |
| }, |
| { |
| "epoch": 1.223463687150838, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.0001377877847387029, |
| "loss": 0.24399962, |
| "memory(GiB)": 41.92, |
| "step": 13140, |
| "train_speed(iter/s)": 0.767172 |
| }, |
| { |
| "epoch": 1.2253258845437616, |
| "grad_norm": 1.359375, |
| "learning_rate": 0.00013759765261404865, |
| "loss": 0.2646559, |
| "memory(GiB)": 41.92, |
| "step": 13160, |
| "train_speed(iter/s)": 0.767248 |
| }, |
| { |
| "epoch": 1.2271880819366854, |
| "grad_norm": 1.5, |
| "learning_rate": 0.00013740736206500852, |
| "loss": 0.23537984, |
| "memory(GiB)": 41.92, |
| "step": 13180, |
| "train_speed(iter/s)": 0.767296 |
| }, |
| { |
| "epoch": 1.229050279329609, |
| "grad_norm": 1.8359375, |
| "learning_rate": 0.0001372169138934055, |
| "loss": 0.2611448, |
| "memory(GiB)": 41.92, |
| "step": 13200, |
| "train_speed(iter/s)": 0.767357 |
| }, |
| { |
| "epoch": 1.2309124767225326, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.00013702630890172681, |
| "loss": 0.24709506, |
| "memory(GiB)": 41.92, |
| "step": 13220, |
| "train_speed(iter/s)": 0.767381 |
| }, |
| { |
| "epoch": 1.2327746741154562, |
| "grad_norm": 1.7421875, |
| "learning_rate": 0.00013683554789312039, |
| "loss": 0.25667541, |
| "memory(GiB)": 41.92, |
| "step": 13240, |
| "train_speed(iter/s)": 0.767469 |
| }, |
| { |
| "epoch": 1.23463687150838, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.00013664463167139165, |
| "loss": 0.25362358, |
| "memory(GiB)": 41.92, |
| "step": 13260, |
| "train_speed(iter/s)": 0.767528 |
| }, |
| { |
| "epoch": 1.2364990689013036, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.000136453561041, |
| "loss": 0.27490563, |
| "memory(GiB)": 41.92, |
| "step": 13280, |
| "train_speed(iter/s)": 0.767627 |
| }, |
| { |
| "epoch": 1.2383612662942272, |
| "grad_norm": 1.3984375, |
| "learning_rate": 0.00013626233680705548, |
| "loss": 0.2458004, |
| "memory(GiB)": 41.92, |
| "step": 13300, |
| "train_speed(iter/s)": 0.767693 |
| }, |
| { |
| "epoch": 1.2402234636871508, |
| "grad_norm": 1.375, |
| "learning_rate": 0.0001360709597753153, |
| "loss": 0.25720315, |
| "memory(GiB)": 41.92, |
| "step": 13320, |
| "train_speed(iter/s)": 0.767713 |
| }, |
| { |
| "epoch": 1.2420856610800746, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.0001358794307521806, |
| "loss": 0.22600164, |
| "memory(GiB)": 41.92, |
| "step": 13340, |
| "train_speed(iter/s)": 0.767729 |
| }, |
| { |
| "epoch": 1.2439478584729982, |
| "grad_norm": 1.5546875, |
| "learning_rate": 0.000135687750544693, |
| "loss": 0.24354362, |
| "memory(GiB)": 41.92, |
| "step": 13360, |
| "train_speed(iter/s)": 0.767783 |
| }, |
| { |
| "epoch": 1.2458100558659218, |
| "grad_norm": 1.3125, |
| "learning_rate": 0.000135495919960531, |
| "loss": 0.24208927, |
| "memory(GiB)": 41.92, |
| "step": 13380, |
| "train_speed(iter/s)": 0.767773 |
| }, |
| { |
| "epoch": 1.2476722532588453, |
| "grad_norm": 1.453125, |
| "learning_rate": 0.00013530393980800686, |
| "loss": 0.24553039, |
| "memory(GiB)": 41.92, |
| "step": 13400, |
| "train_speed(iter/s)": 0.767854 |
| }, |
| { |
| "epoch": 1.2495344506517692, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00013511181089606304, |
| "loss": 0.23603408, |
| "memory(GiB)": 41.92, |
| "step": 13420, |
| "train_speed(iter/s)": 0.767905 |
| }, |
| { |
| "epoch": 1.2513966480446927, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00013491953403426887, |
| "loss": 0.25591078, |
| "memory(GiB)": 41.92, |
| "step": 13440, |
| "train_speed(iter/s)": 0.767918 |
| }, |
| { |
| "epoch": 1.2532588454376163, |
| "grad_norm": 1.6171875, |
| "learning_rate": 0.00013472711003281696, |
| "loss": 0.23547509, |
| "memory(GiB)": 41.92, |
| "step": 13460, |
| "train_speed(iter/s)": 0.767974 |
| }, |
| { |
| "epoch": 1.25512104283054, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.00013453453970252013, |
| "loss": 0.25424879, |
| "memory(GiB)": 41.92, |
| "step": 13480, |
| "train_speed(iter/s)": 0.768055 |
| }, |
| { |
| "epoch": 1.2569832402234637, |
| "grad_norm": 1.5546875, |
| "learning_rate": 0.00013434182385480757, |
| "loss": 0.24909823, |
| "memory(GiB)": 41.92, |
| "step": 13500, |
| "train_speed(iter/s)": 0.768135 |
| }, |
| { |
| "epoch": 1.2588454376163873, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00013414896330172176, |
| "loss": 0.24665921, |
| "memory(GiB)": 41.92, |
| "step": 13520, |
| "train_speed(iter/s)": 0.76815 |
| }, |
| { |
| "epoch": 1.260707635009311, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.0001339559588559149, |
| "loss": 0.25871291, |
| "memory(GiB)": 41.92, |
| "step": 13540, |
| "train_speed(iter/s)": 0.768194 |
| }, |
| { |
| "epoch": 1.2625698324022347, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.0001337628113306455, |
| "loss": 0.25705509, |
| "memory(GiB)": 41.92, |
| "step": 13560, |
| "train_speed(iter/s)": 0.768234 |
| }, |
| { |
| "epoch": 1.2644320297951583, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.00013356952153977488, |
| "loss": 0.22989726, |
| "memory(GiB)": 41.92, |
| "step": 13580, |
| "train_speed(iter/s)": 0.768319 |
| }, |
| { |
| "epoch": 1.266294227188082, |
| "grad_norm": 1.546875, |
| "learning_rate": 0.00013337609029776402, |
| "loss": 0.25790679, |
| "memory(GiB)": 41.92, |
| "step": 13600, |
| "train_speed(iter/s)": 0.76841 |
| }, |
| { |
| "epoch": 1.2681564245810055, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.00013318251841966974, |
| "loss": 0.24724863, |
| "memory(GiB)": 41.92, |
| "step": 13620, |
| "train_speed(iter/s)": 0.768461 |
| }, |
| { |
| "epoch": 1.2700186219739291, |
| "grad_norm": 1.40625, |
| "learning_rate": 0.0001329888067211415, |
| "loss": 0.25862865, |
| "memory(GiB)": 41.92, |
| "step": 13640, |
| "train_speed(iter/s)": 0.768508 |
| }, |
| { |
| "epoch": 1.271880819366853, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00013279495601841798, |
| "loss": 0.24088619, |
| "memory(GiB)": 41.92, |
| "step": 13660, |
| "train_speed(iter/s)": 0.768585 |
| }, |
| { |
| "epoch": 1.2737430167597765, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.00013260096712832355, |
| "loss": 0.25227604, |
| "memory(GiB)": 41.92, |
| "step": 13680, |
| "train_speed(iter/s)": 0.768636 |
| }, |
| { |
| "epoch": 1.2756052141527001, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.00013240684086826483, |
| "loss": 0.24656982, |
| "memory(GiB)": 41.92, |
| "step": 13700, |
| "train_speed(iter/s)": 0.768727 |
| }, |
| { |
| "epoch": 1.277467411545624, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00013221257805622728, |
| "loss": 0.25998006, |
| "memory(GiB)": 41.92, |
| "step": 13720, |
| "train_speed(iter/s)": 0.768793 |
| }, |
| { |
| "epoch": 1.2793296089385475, |
| "grad_norm": 1.3203125, |
| "learning_rate": 0.00013201817951077175, |
| "loss": 0.24141338, |
| "memory(GiB)": 41.92, |
| "step": 13740, |
| "train_speed(iter/s)": 0.768859 |
| }, |
| { |
| "epoch": 1.2811918063314711, |
| "grad_norm": 1.375, |
| "learning_rate": 0.00013182364605103107, |
| "loss": 0.22881098, |
| "memory(GiB)": 41.92, |
| "step": 13760, |
| "train_speed(iter/s)": 0.768936 |
| }, |
| { |
| "epoch": 1.2830540037243947, |
| "grad_norm": 1.6015625, |
| "learning_rate": 0.0001316289784967065, |
| "loss": 0.21602743, |
| "memory(GiB)": 41.92, |
| "step": 13780, |
| "train_speed(iter/s)": 0.769004 |
| }, |
| { |
| "epoch": 1.2849162011173183, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.0001314341776680643, |
| "loss": 0.23749352, |
| "memory(GiB)": 41.92, |
| "step": 13800, |
| "train_speed(iter/s)": 0.769016 |
| }, |
| { |
| "epoch": 1.2867783985102421, |
| "grad_norm": 1.6171875, |
| "learning_rate": 0.00013123924438593245, |
| "loss": 0.23468378, |
| "memory(GiB)": 41.92, |
| "step": 13820, |
| "train_speed(iter/s)": 0.769095 |
| }, |
| { |
| "epoch": 1.2886405959031657, |
| "grad_norm": 1.1484375, |
| "learning_rate": 0.0001310441794716969, |
| "loss": 0.23531232, |
| "memory(GiB)": 58.24, |
| "step": 13840, |
| "train_speed(iter/s)": 0.769107 |
| }, |
| { |
| "epoch": 1.2905027932960893, |
| "grad_norm": 1.375, |
| "learning_rate": 0.00013084898374729825, |
| "loss": 0.25231352, |
| "memory(GiB)": 58.24, |
| "step": 13860, |
| "train_speed(iter/s)": 0.769127 |
| }, |
| { |
| "epoch": 1.2923649906890131, |
| "grad_norm": 1.3984375, |
| "learning_rate": 0.00013065365803522846, |
| "loss": 0.22567735, |
| "memory(GiB)": 58.24, |
| "step": 13880, |
| "train_speed(iter/s)": 0.769166 |
| }, |
| { |
| "epoch": 1.2942271880819367, |
| "grad_norm": 1.5625, |
| "learning_rate": 0.00013045820315852702, |
| "loss": 0.21936474, |
| "memory(GiB)": 58.24, |
| "step": 13900, |
| "train_speed(iter/s)": 0.769171 |
| }, |
| { |
| "epoch": 1.2960893854748603, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.0001302626199407778, |
| "loss": 0.24280531, |
| "memory(GiB)": 58.24, |
| "step": 13920, |
| "train_speed(iter/s)": 0.769216 |
| }, |
| { |
| "epoch": 1.2979515828677841, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.0001300669092061054, |
| "loss": 0.23217328, |
| "memory(GiB)": 58.24, |
| "step": 13940, |
| "train_speed(iter/s)": 0.769238 |
| }, |
| { |
| "epoch": 1.2998137802607077, |
| "grad_norm": 1.5703125, |
| "learning_rate": 0.00012987107177917176, |
| "loss": 0.22496073, |
| "memory(GiB)": 58.24, |
| "step": 13960, |
| "train_speed(iter/s)": 0.769301 |
| }, |
| { |
| "epoch": 1.3016759776536313, |
| "grad_norm": 1.609375, |
| "learning_rate": 0.00012967510848517266, |
| "loss": 0.23814521, |
| "memory(GiB)": 58.24, |
| "step": 13980, |
| "train_speed(iter/s)": 0.769345 |
| }, |
| { |
| "epoch": 1.303538175046555, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00012947902014983425, |
| "loss": 0.22658379, |
| "memory(GiB)": 58.24, |
| "step": 14000, |
| "train_speed(iter/s)": 0.769362 |
| }, |
| { |
| "epoch": 1.303538175046555, |
| "eval_loss": 0.40084439516067505, |
| "eval_runtime": 77.0745, |
| "eval_samples_per_second": 180.15, |
| "eval_steps_per_second": 1.414, |
| "step": 14000 |
| }, |
| { |
| "epoch": 1.3054003724394785, |
| "grad_norm": 1.515625, |
| "learning_rate": 0.0001292828075994095, |
| "loss": 0.21561022, |
| "memory(GiB)": 58.24, |
| "step": 14020, |
| "train_speed(iter/s)": 0.764795 |
| }, |
| { |
| "epoch": 1.3072625698324023, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00012908647166067494, |
| "loss": 0.22430439, |
| "memory(GiB)": 58.24, |
| "step": 14040, |
| "train_speed(iter/s)": 0.764864 |
| }, |
| { |
| "epoch": 1.309124767225326, |
| "grad_norm": 1.8203125, |
| "learning_rate": 0.00012889001316092682, |
| "loss": 0.22065244, |
| "memory(GiB)": 58.24, |
| "step": 14060, |
| "train_speed(iter/s)": 0.76492 |
| }, |
| { |
| "epoch": 1.3109869646182495, |
| "grad_norm": 1.0859375, |
| "learning_rate": 0.00012869343292797795, |
| "loss": 0.24164536, |
| "memory(GiB)": 58.24, |
| "step": 14080, |
| "train_speed(iter/s)": 0.764972 |
| }, |
| { |
| "epoch": 1.3128491620111733, |
| "grad_norm": 1.375, |
| "learning_rate": 0.00012849673179015402, |
| "loss": 0.2178977, |
| "memory(GiB)": 58.24, |
| "step": 14100, |
| "train_speed(iter/s)": 0.764989 |
| }, |
| { |
| "epoch": 1.314711359404097, |
| "grad_norm": 1.390625, |
| "learning_rate": 0.00012829991057629022, |
| "loss": 0.24479382, |
| "memory(GiB)": 58.24, |
| "step": 14120, |
| "train_speed(iter/s)": 0.765055 |
| }, |
| { |
| "epoch": 1.3165735567970205, |
| "grad_norm": 1.578125, |
| "learning_rate": 0.00012810297011572771, |
| "loss": 0.24034092, |
| "memory(GiB)": 58.24, |
| "step": 14140, |
| "train_speed(iter/s)": 0.765145 |
| }, |
| { |
| "epoch": 1.318435754189944, |
| "grad_norm": 1.546875, |
| "learning_rate": 0.0001279059112383101, |
| "loss": 0.2230279, |
| "memory(GiB)": 58.24, |
| "step": 14160, |
| "train_speed(iter/s)": 0.765184 |
| }, |
| { |
| "epoch": 1.3202979515828677, |
| "grad_norm": 1.515625, |
| "learning_rate": 0.00012770873477437994, |
| "loss": 0.23335476, |
| "memory(GiB)": 58.24, |
| "step": 14180, |
| "train_speed(iter/s)": 0.765219 |
| }, |
| { |
| "epoch": 1.3221601489757915, |
| "grad_norm": 1.5390625, |
| "learning_rate": 0.00012751144155477527, |
| "loss": 0.23615379, |
| "memory(GiB)": 58.24, |
| "step": 14200, |
| "train_speed(iter/s)": 0.765242 |
| }, |
| { |
| "epoch": 1.324022346368715, |
| "grad_norm": 1.25, |
| "learning_rate": 0.0001273140324108261, |
| "loss": 0.22382288, |
| "memory(GiB)": 58.24, |
| "step": 14220, |
| "train_speed(iter/s)": 0.76528 |
| }, |
| { |
| "epoch": 1.3258845437616387, |
| "grad_norm": 1.8828125, |
| "learning_rate": 0.00012711650817435091, |
| "loss": 0.24886432, |
| "memory(GiB)": 58.24, |
| "step": 14240, |
| "train_speed(iter/s)": 0.765335 |
| }, |
| { |
| "epoch": 1.3277467411545625, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.0001269188696776532, |
| "loss": 0.25288081, |
| "memory(GiB)": 58.24, |
| "step": 14260, |
| "train_speed(iter/s)": 0.765389 |
| }, |
| { |
| "epoch": 1.329608938547486, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.00012672111775351781, |
| "loss": 0.23030953, |
| "memory(GiB)": 58.24, |
| "step": 14280, |
| "train_speed(iter/s)": 0.765397 |
| }, |
| { |
| "epoch": 1.3314711359404097, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.00012652325323520762, |
| "loss": 0.23378878, |
| "memory(GiB)": 58.24, |
| "step": 14300, |
| "train_speed(iter/s)": 0.765441 |
| }, |
| { |
| "epoch": 1.3333333333333333, |
| "grad_norm": 1.3203125, |
| "learning_rate": 0.00012632527695645993, |
| "loss": 0.22340455, |
| "memory(GiB)": 58.24, |
| "step": 14320, |
| "train_speed(iter/s)": 0.765486 |
| }, |
| { |
| "epoch": 1.3351955307262569, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.0001261271897514829, |
| "loss": 0.22818685, |
| "memory(GiB)": 58.24, |
| "step": 14340, |
| "train_speed(iter/s)": 0.765531 |
| }, |
| { |
| "epoch": 1.3370577281191807, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.0001259289924549522, |
| "loss": 0.22412224, |
| "memory(GiB)": 58.24, |
| "step": 14360, |
| "train_speed(iter/s)": 0.765583 |
| }, |
| { |
| "epoch": 1.3389199255121043, |
| "grad_norm": 1.3984375, |
| "learning_rate": 0.00012573068590200731, |
| "loss": 0.23063211, |
| "memory(GiB)": 58.24, |
| "step": 14380, |
| "train_speed(iter/s)": 0.765598 |
| }, |
| { |
| "epoch": 1.3407821229050279, |
| "grad_norm": 1.5, |
| "learning_rate": 0.0001255322709282481, |
| "loss": 0.21774821, |
| "memory(GiB)": 58.24, |
| "step": 14400, |
| "train_speed(iter/s)": 0.765623 |
| }, |
| { |
| "epoch": 1.3426443202979517, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.0001253337483697313, |
| "loss": 0.22045803, |
| "memory(GiB)": 58.24, |
| "step": 14420, |
| "train_speed(iter/s)": 0.76571 |
| }, |
| { |
| "epoch": 1.3445065176908753, |
| "grad_norm": 1.671875, |
| "learning_rate": 0.00012513511906296696, |
| "loss": 0.23508878, |
| "memory(GiB)": 58.24, |
| "step": 14440, |
| "train_speed(iter/s)": 0.765764 |
| }, |
| { |
| "epoch": 1.3463687150837989, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.00012493638384491492, |
| "loss": 0.20511746, |
| "memory(GiB)": 58.24, |
| "step": 14460, |
| "train_speed(iter/s)": 0.765794 |
| }, |
| { |
| "epoch": 1.3482309124767227, |
| "grad_norm": 1.078125, |
| "learning_rate": 0.00012473754355298136, |
| "loss": 0.24029357, |
| "memory(GiB)": 58.24, |
| "step": 14480, |
| "train_speed(iter/s)": 0.76582 |
| }, |
| { |
| "epoch": 1.3500931098696463, |
| "grad_norm": 1.1328125, |
| "learning_rate": 0.00012453859902501516, |
| "loss": 0.23441067, |
| "memory(GiB)": 58.24, |
| "step": 14500, |
| "train_speed(iter/s)": 0.765812 |
| }, |
| { |
| "epoch": 1.3519553072625698, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.00012433955109930432, |
| "loss": 0.2113512, |
| "memory(GiB)": 58.24, |
| "step": 14520, |
| "train_speed(iter/s)": 0.76589 |
| }, |
| { |
| "epoch": 1.3538175046554934, |
| "grad_norm": 1.53125, |
| "learning_rate": 0.00012414040061457275, |
| "loss": 0.21798153, |
| "memory(GiB)": 58.24, |
| "step": 14540, |
| "train_speed(iter/s)": 0.765908 |
| }, |
| { |
| "epoch": 1.355679702048417, |
| "grad_norm": 1.140625, |
| "learning_rate": 0.00012394114840997634, |
| "loss": 0.21171179, |
| "memory(GiB)": 58.24, |
| "step": 14560, |
| "train_speed(iter/s)": 0.765942 |
| }, |
| { |
| "epoch": 1.3575418994413408, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.00012374179532509958, |
| "loss": 0.2068325, |
| "memory(GiB)": 58.24, |
| "step": 14580, |
| "train_speed(iter/s)": 0.765988 |
| }, |
| { |
| "epoch": 1.3594040968342644, |
| "grad_norm": 1.140625, |
| "learning_rate": 0.00012354234219995214, |
| "loss": 0.21365037, |
| "memory(GiB)": 58.24, |
| "step": 14600, |
| "train_speed(iter/s)": 0.766031 |
| }, |
| { |
| "epoch": 1.361266294227188, |
| "grad_norm": 1.3359375, |
| "learning_rate": 0.00012334278987496518, |
| "loss": 0.21723585, |
| "memory(GiB)": 58.24, |
| "step": 14620, |
| "train_speed(iter/s)": 0.766081 |
| }, |
| { |
| "epoch": 1.3631284916201118, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00012314313919098787, |
| "loss": 0.21648164, |
| "memory(GiB)": 58.24, |
| "step": 14640, |
| "train_speed(iter/s)": 0.76614 |
| }, |
| { |
| "epoch": 1.3649906890130354, |
| "grad_norm": 1.1328125, |
| "learning_rate": 0.00012294339098928381, |
| "loss": 0.22302213, |
| "memory(GiB)": 58.24, |
| "step": 14660, |
| "train_speed(iter/s)": 0.766204 |
| }, |
| { |
| "epoch": 1.366852886405959, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.00012274354611152753, |
| "loss": 0.22709019, |
| "memory(GiB)": 58.24, |
| "step": 14680, |
| "train_speed(iter/s)": 0.766276 |
| }, |
| { |
| "epoch": 1.3687150837988826, |
| "grad_norm": 1.484375, |
| "learning_rate": 0.00012254360539980087, |
| "loss": 0.21800654, |
| "memory(GiB)": 58.24, |
| "step": 14700, |
| "train_speed(iter/s)": 0.76635 |
| }, |
| { |
| "epoch": 1.3705772811918062, |
| "grad_norm": 1.3515625, |
| "learning_rate": 0.0001223435696965896, |
| "loss": 0.22402124, |
| "memory(GiB)": 58.24, |
| "step": 14720, |
| "train_speed(iter/s)": 0.766429 |
| }, |
| { |
| "epoch": 1.37243947858473, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.0001221434398447796, |
| "loss": 0.210607, |
| "memory(GiB)": 58.24, |
| "step": 14740, |
| "train_speed(iter/s)": 0.766452 |
| }, |
| { |
| "epoch": 1.3743016759776536, |
| "grad_norm": 1.4296875, |
| "learning_rate": 0.00012194321668765356, |
| "loss": 0.2248714, |
| "memory(GiB)": 58.24, |
| "step": 14760, |
| "train_speed(iter/s)": 0.766523 |
| }, |
| { |
| "epoch": 1.3761638733705772, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.00012174290106888731, |
| "loss": 0.21417339, |
| "memory(GiB)": 58.24, |
| "step": 14780, |
| "train_speed(iter/s)": 0.766543 |
| }, |
| { |
| "epoch": 1.378026070763501, |
| "grad_norm": 1.546875, |
| "learning_rate": 0.00012154249383254632, |
| "loss": 0.234969, |
| "memory(GiB)": 58.24, |
| "step": 14800, |
| "train_speed(iter/s)": 0.76655 |
| }, |
| { |
| "epoch": 1.3798882681564246, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00012134199582308204, |
| "loss": 0.21318011, |
| "memory(GiB)": 58.24, |
| "step": 14820, |
| "train_speed(iter/s)": 0.766531 |
| }, |
| { |
| "epoch": 1.3817504655493482, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.00012114140788532842, |
| "loss": 0.22573395, |
| "memory(GiB)": 58.24, |
| "step": 14840, |
| "train_speed(iter/s)": 0.766585 |
| }, |
| { |
| "epoch": 1.3836126629422718, |
| "grad_norm": 1.4140625, |
| "learning_rate": 0.00012094073086449837, |
| "loss": 0.21785412, |
| "memory(GiB)": 58.24, |
| "step": 14860, |
| "train_speed(iter/s)": 0.766626 |
| }, |
| { |
| "epoch": 1.3854748603351954, |
| "grad_norm": 1.375, |
| "learning_rate": 0.00012073996560618019, |
| "loss": 0.24774947, |
| "memory(GiB)": 58.24, |
| "step": 14880, |
| "train_speed(iter/s)": 0.766689 |
| }, |
| { |
| "epoch": 1.3873370577281192, |
| "grad_norm": 1.4453125, |
| "learning_rate": 0.0001205391129563339, |
| "loss": 0.23350339, |
| "memory(GiB)": 58.24, |
| "step": 14900, |
| "train_speed(iter/s)": 0.766772 |
| }, |
| { |
| "epoch": 1.3891992551210428, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.0001203381737612878, |
| "loss": 0.19333084, |
| "memory(GiB)": 58.24, |
| "step": 14920, |
| "train_speed(iter/s)": 0.766812 |
| }, |
| { |
| "epoch": 1.3910614525139664, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.0001201371488677349, |
| "loss": 0.21416593, |
| "memory(GiB)": 58.24, |
| "step": 14940, |
| "train_speed(iter/s)": 0.766851 |
| }, |
| { |
| "epoch": 1.3929236499068902, |
| "grad_norm": 1.4296875, |
| "learning_rate": 0.0001199360391227293, |
| "loss": 0.21936598, |
| "memory(GiB)": 58.24, |
| "step": 14960, |
| "train_speed(iter/s)": 0.766868 |
| }, |
| { |
| "epoch": 1.3947858472998138, |
| "grad_norm": 0.94140625, |
| "learning_rate": 0.0001197348453736826, |
| "loss": 0.22091618, |
| "memory(GiB)": 58.24, |
| "step": 14980, |
| "train_speed(iter/s)": 0.766953 |
| }, |
| { |
| "epoch": 1.3966480446927374, |
| "grad_norm": 1.328125, |
| "learning_rate": 0.00011953356846836037, |
| "loss": 0.21947927, |
| "memory(GiB)": 58.24, |
| "step": 15000, |
| "train_speed(iter/s)": 0.767002 |
| }, |
| { |
| "epoch": 1.3985102420856612, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.00011933220925487862, |
| "loss": 0.23145449, |
| "memory(GiB)": 58.24, |
| "step": 15020, |
| "train_speed(iter/s)": 0.767081 |
| }, |
| { |
| "epoch": 1.4003724394785848, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.0001191307685817002, |
| "loss": 0.22147784, |
| "memory(GiB)": 58.24, |
| "step": 15040, |
| "train_speed(iter/s)": 0.767163 |
| }, |
| { |
| "epoch": 1.4022346368715084, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00011892924729763109, |
| "loss": 0.19200894, |
| "memory(GiB)": 58.24, |
| "step": 15060, |
| "train_speed(iter/s)": 0.767203 |
| }, |
| { |
| "epoch": 1.404096834264432, |
| "grad_norm": 1.484375, |
| "learning_rate": 0.00011872764625181701, |
| "loss": 0.21943903, |
| "memory(GiB)": 58.24, |
| "step": 15080, |
| "train_speed(iter/s)": 0.767267 |
| }, |
| { |
| "epoch": 1.4059590316573556, |
| "grad_norm": 1.125, |
| "learning_rate": 0.00011852596629373977, |
| "loss": 0.21660228, |
| "memory(GiB)": 58.24, |
| "step": 15100, |
| "train_speed(iter/s)": 0.767307 |
| }, |
| { |
| "epoch": 1.4078212290502794, |
| "grad_norm": 0.94921875, |
| "learning_rate": 0.00011832420827321373, |
| "loss": 0.21862273, |
| "memory(GiB)": 58.24, |
| "step": 15120, |
| "train_speed(iter/s)": 0.767327 |
| }, |
| { |
| "epoch": 1.409683426443203, |
| "grad_norm": 1.3125, |
| "learning_rate": 0.00011812237304038209, |
| "loss": 0.24118257, |
| "memory(GiB)": 58.24, |
| "step": 15140, |
| "train_speed(iter/s)": 0.767389 |
| }, |
| { |
| "epoch": 1.4115456238361266, |
| "grad_norm": 1.40625, |
| "learning_rate": 0.00011792046144571348, |
| "loss": 0.20173512, |
| "memory(GiB)": 58.24, |
| "step": 15160, |
| "train_speed(iter/s)": 0.767455 |
| }, |
| { |
| "epoch": 1.4134078212290504, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00011771847433999823, |
| "loss": 0.19333193, |
| "memory(GiB)": 58.24, |
| "step": 15180, |
| "train_speed(iter/s)": 0.767496 |
| }, |
| { |
| "epoch": 1.415270018621974, |
| "grad_norm": 1.5390625, |
| "learning_rate": 0.00011751641257434492, |
| "loss": 0.21858253, |
| "memory(GiB)": 58.24, |
| "step": 15200, |
| "train_speed(iter/s)": 0.767558 |
| }, |
| { |
| "epoch": 1.4171322160148976, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00011731427700017663, |
| "loss": 0.20099108, |
| "memory(GiB)": 58.24, |
| "step": 15220, |
| "train_speed(iter/s)": 0.76761 |
| }, |
| { |
| "epoch": 1.4189944134078212, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.00011711206846922753, |
| "loss": 0.20006487, |
| "memory(GiB)": 58.24, |
| "step": 15240, |
| "train_speed(iter/s)": 0.767685 |
| }, |
| { |
| "epoch": 1.4208566108007448, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.00011690978783353914, |
| "loss": 0.20852513, |
| "memory(GiB)": 58.24, |
| "step": 15260, |
| "train_speed(iter/s)": 0.767757 |
| }, |
| { |
| "epoch": 1.4227188081936686, |
| "grad_norm": 1.015625, |
| "learning_rate": 0.00011670743594545692, |
| "loss": 0.19811587, |
| "memory(GiB)": 58.24, |
| "step": 15280, |
| "train_speed(iter/s)": 0.767769 |
| }, |
| { |
| "epoch": 1.4245810055865922, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.00011650501365762638, |
| "loss": 0.23192661, |
| "memory(GiB)": 58.24, |
| "step": 15300, |
| "train_speed(iter/s)": 0.76783 |
| }, |
| { |
| "epoch": 1.4264432029795158, |
| "grad_norm": 1.40625, |
| "learning_rate": 0.00011630252182298985, |
| "loss": 0.20126216, |
| "memory(GiB)": 58.24, |
| "step": 15320, |
| "train_speed(iter/s)": 0.767878 |
| }, |
| { |
| "epoch": 1.4283054003724396, |
| "grad_norm": 1.25, |
| "learning_rate": 0.00011609996129478257, |
| "loss": 0.1996449, |
| "memory(GiB)": 58.24, |
| "step": 15340, |
| "train_speed(iter/s)": 0.767869 |
| }, |
| { |
| "epoch": 1.4301675977653632, |
| "grad_norm": 1.375, |
| "learning_rate": 0.00011589733292652938, |
| "loss": 0.18534939, |
| "memory(GiB)": 58.24, |
| "step": 15360, |
| "train_speed(iter/s)": 0.767893 |
| }, |
| { |
| "epoch": 1.4320297951582868, |
| "grad_norm": 1.109375, |
| "learning_rate": 0.00011569463757204084, |
| "loss": 0.21407509, |
| "memory(GiB)": 58.24, |
| "step": 15380, |
| "train_speed(iter/s)": 0.76794 |
| }, |
| { |
| "epoch": 1.4338919925512104, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00011549187608540983, |
| "loss": 0.18790734, |
| "memory(GiB)": 58.24, |
| "step": 15400, |
| "train_speed(iter/s)": 0.767948 |
| }, |
| { |
| "epoch": 1.435754189944134, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.00011528904932100786, |
| "loss": 0.21195495, |
| "memory(GiB)": 58.24, |
| "step": 15420, |
| "train_speed(iter/s)": 0.768007 |
| }, |
| { |
| "epoch": 1.4376163873370578, |
| "grad_norm": 1.1015625, |
| "learning_rate": 0.00011508615813348156, |
| "loss": 0.22268994, |
| "memory(GiB)": 58.24, |
| "step": 15440, |
| "train_speed(iter/s)": 0.768083 |
| }, |
| { |
| "epoch": 1.4394785847299814, |
| "grad_norm": 1.4453125, |
| "learning_rate": 0.00011488320337774893, |
| "loss": 0.21396356, |
| "memory(GiB)": 58.24, |
| "step": 15460, |
| "train_speed(iter/s)": 0.768135 |
| }, |
| { |
| "epoch": 1.441340782122905, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.00011468018590899591, |
| "loss": 0.18995935, |
| "memory(GiB)": 58.24, |
| "step": 15480, |
| "train_speed(iter/s)": 0.768203 |
| }, |
| { |
| "epoch": 1.4432029795158288, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.00011447710658267266, |
| "loss": 0.2040406, |
| "memory(GiB)": 58.24, |
| "step": 15500, |
| "train_speed(iter/s)": 0.768246 |
| }, |
| { |
| "epoch": 1.4450651769087524, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.00011427396625448999, |
| "loss": 0.19963667, |
| "memory(GiB)": 58.24, |
| "step": 15520, |
| "train_speed(iter/s)": 0.768292 |
| }, |
| { |
| "epoch": 1.446927374301676, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.00011407076578041569, |
| "loss": 0.20383754, |
| "memory(GiB)": 58.24, |
| "step": 15540, |
| "train_speed(iter/s)": 0.768307 |
| }, |
| { |
| "epoch": 1.4487895716945998, |
| "grad_norm": 1.3359375, |
| "learning_rate": 0.00011386750601667111, |
| "loss": 0.20976281, |
| "memory(GiB)": 58.24, |
| "step": 15560, |
| "train_speed(iter/s)": 0.768352 |
| }, |
| { |
| "epoch": 1.4506517690875234, |
| "grad_norm": 1.3359375, |
| "learning_rate": 0.00011366418781972732, |
| "loss": 0.21814973, |
| "memory(GiB)": 58.24, |
| "step": 15580, |
| "train_speed(iter/s)": 0.768338 |
| }, |
| { |
| "epoch": 1.452513966480447, |
| "grad_norm": 1.5078125, |
| "learning_rate": 0.00011346081204630166, |
| "loss": 0.20122726, |
| "memory(GiB)": 58.24, |
| "step": 15600, |
| "train_speed(iter/s)": 0.768344 |
| }, |
| { |
| "epoch": 1.4543761638733705, |
| "grad_norm": 1.03125, |
| "learning_rate": 0.00011325737955335405, |
| "loss": 0.20199561, |
| "memory(GiB)": 58.24, |
| "step": 15620, |
| "train_speed(iter/s)": 0.768395 |
| }, |
| { |
| "epoch": 1.4562383612662941, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.00011305389119808345, |
| "loss": 0.19181294, |
| "memory(GiB)": 58.24, |
| "step": 15640, |
| "train_speed(iter/s)": 0.76846 |
| }, |
| { |
| "epoch": 1.458100558659218, |
| "grad_norm": 1.5078125, |
| "learning_rate": 0.00011285034783792416, |
| "loss": 0.20035157, |
| "memory(GiB)": 58.24, |
| "step": 15660, |
| "train_speed(iter/s)": 0.768503 |
| }, |
| { |
| "epoch": 1.4599627560521415, |
| "grad_norm": 1.0703125, |
| "learning_rate": 0.0001126467503305423, |
| "loss": 0.20264366, |
| "memory(GiB)": 58.24, |
| "step": 15680, |
| "train_speed(iter/s)": 0.768589 |
| }, |
| { |
| "epoch": 1.4618249534450651, |
| "grad_norm": 1.109375, |
| "learning_rate": 0.0001124430995338321, |
| "loss": 0.19577993, |
| "memory(GiB)": 58.24, |
| "step": 15700, |
| "train_speed(iter/s)": 0.76866 |
| }, |
| { |
| "epoch": 1.463687150837989, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.00011223939630591238, |
| "loss": 0.22569623, |
| "memory(GiB)": 58.24, |
| "step": 15720, |
| "train_speed(iter/s)": 0.768719 |
| }, |
| { |
| "epoch": 1.4655493482309125, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00011203564150512282, |
| "loss": 0.20180607, |
| "memory(GiB)": 58.24, |
| "step": 15740, |
| "train_speed(iter/s)": 0.768778 |
| }, |
| { |
| "epoch": 1.4674115456238361, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00011183183599002052, |
| "loss": 0.21258569, |
| "memory(GiB)": 58.24, |
| "step": 15760, |
| "train_speed(iter/s)": 0.768849 |
| }, |
| { |
| "epoch": 1.4692737430167597, |
| "grad_norm": 1.4140625, |
| "learning_rate": 0.00011162798061937614, |
| "loss": 0.22123888, |
| "memory(GiB)": 58.24, |
| "step": 15780, |
| "train_speed(iter/s)": 0.768889 |
| }, |
| { |
| "epoch": 1.4711359404096833, |
| "grad_norm": 1.3984375, |
| "learning_rate": 0.00011142407625217051, |
| "loss": 0.20667183, |
| "memory(GiB)": 58.24, |
| "step": 15800, |
| "train_speed(iter/s)": 0.768946 |
| }, |
| { |
| "epoch": 1.4729981378026071, |
| "grad_norm": 1.6171875, |
| "learning_rate": 0.0001112201237475909, |
| "loss": 0.19118414, |
| "memory(GiB)": 58.24, |
| "step": 15820, |
| "train_speed(iter/s)": 0.768992 |
| }, |
| { |
| "epoch": 1.4748603351955307, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.00011101612396502743, |
| "loss": 0.204529, |
| "memory(GiB)": 58.24, |
| "step": 15840, |
| "train_speed(iter/s)": 0.769069 |
| }, |
| { |
| "epoch": 1.4767225325884543, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.00011081207776406936, |
| "loss": 0.22140031, |
| "memory(GiB)": 58.24, |
| "step": 15860, |
| "train_speed(iter/s)": 0.769122 |
| }, |
| { |
| "epoch": 1.4785847299813781, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00011060798600450165, |
| "loss": 0.20754693, |
| "memory(GiB)": 58.24, |
| "step": 15880, |
| "train_speed(iter/s)": 0.769183 |
| }, |
| { |
| "epoch": 1.4804469273743017, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.0001104038495463011, |
| "loss": 0.19853688, |
| "memory(GiB)": 58.24, |
| "step": 15900, |
| "train_speed(iter/s)": 0.76922 |
| }, |
| { |
| "epoch": 1.4823091247672253, |
| "grad_norm": 1.125, |
| "learning_rate": 0.00011019966924963301, |
| "loss": 0.21250277, |
| "memory(GiB)": 58.24, |
| "step": 15920, |
| "train_speed(iter/s)": 0.769279 |
| }, |
| { |
| "epoch": 1.484171322160149, |
| "grad_norm": 1.5, |
| "learning_rate": 0.00010999544597484723, |
| "loss": 0.19000928, |
| "memory(GiB)": 58.24, |
| "step": 15940, |
| "train_speed(iter/s)": 0.769266 |
| }, |
| { |
| "epoch": 1.4860335195530725, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.00010979118058247486, |
| "loss": 0.20164049, |
| "memory(GiB)": 58.24, |
| "step": 15960, |
| "train_speed(iter/s)": 0.769316 |
| }, |
| { |
| "epoch": 1.4878957169459963, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00010958687393322435, |
| "loss": 0.18680851, |
| "memory(GiB)": 58.24, |
| "step": 15980, |
| "train_speed(iter/s)": 0.769344 |
| }, |
| { |
| "epoch": 1.48975791433892, |
| "grad_norm": 1.40625, |
| "learning_rate": 0.00010938252688797814, |
| "loss": 0.1877938, |
| "memory(GiB)": 58.24, |
| "step": 16000, |
| "train_speed(iter/s)": 0.769392 |
| }, |
| { |
| "epoch": 1.48975791433892, |
| "eval_loss": 0.3874351382255554, |
| "eval_runtime": 76.1306, |
| "eval_samples_per_second": 182.384, |
| "eval_steps_per_second": 1.432, |
| "step": 16000 |
| }, |
| { |
| "epoch": 1.4916201117318435, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00010917814030778871, |
| "loss": 0.194278, |
| "memory(GiB)": 58.24, |
| "step": 16020, |
| "train_speed(iter/s)": 0.765509 |
| }, |
| { |
| "epoch": 1.4934823091247673, |
| "grad_norm": 1.5859375, |
| "learning_rate": 0.00010897371505387522, |
| "loss": 0.20364323, |
| "memory(GiB)": 58.24, |
| "step": 16040, |
| "train_speed(iter/s)": 0.765554 |
| }, |
| { |
| "epoch": 1.495344506517691, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.0001087692519876198, |
| "loss": 0.19808676, |
| "memory(GiB)": 58.24, |
| "step": 16060, |
| "train_speed(iter/s)": 0.765634 |
| }, |
| { |
| "epoch": 1.4972067039106145, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.0001085647519705639, |
| "loss": 0.18924752, |
| "memory(GiB)": 58.24, |
| "step": 16080, |
| "train_speed(iter/s)": 0.765667 |
| }, |
| { |
| "epoch": 1.499068901303538, |
| "grad_norm": 1.484375, |
| "learning_rate": 0.00010836021586440461, |
| "loss": 0.16571351, |
| "memory(GiB)": 58.24, |
| "step": 16100, |
| "train_speed(iter/s)": 0.765745 |
| }, |
| { |
| "epoch": 1.5009310986964617, |
| "grad_norm": 1.40625, |
| "learning_rate": 0.00010815564453099116, |
| "loss": 0.20310044, |
| "memory(GiB)": 58.24, |
| "step": 16120, |
| "train_speed(iter/s)": 0.765771 |
| }, |
| { |
| "epoch": 1.5027932960893855, |
| "grad_norm": 1.3515625, |
| "learning_rate": 0.0001079510388323212, |
| "loss": 0.19717693, |
| "memory(GiB)": 58.24, |
| "step": 16140, |
| "train_speed(iter/s)": 0.765833 |
| }, |
| { |
| "epoch": 1.504655493482309, |
| "grad_norm": 1.4921875, |
| "learning_rate": 0.00010774639963053721, |
| "loss": 0.19102185, |
| "memory(GiB)": 58.24, |
| "step": 16160, |
| "train_speed(iter/s)": 0.765855 |
| }, |
| { |
| "epoch": 1.5065176908752327, |
| "grad_norm": 1.46875, |
| "learning_rate": 0.00010754172778792273, |
| "loss": 0.20653129, |
| "memory(GiB)": 58.24, |
| "step": 16180, |
| "train_speed(iter/s)": 0.765871 |
| }, |
| { |
| "epoch": 1.5083798882681565, |
| "grad_norm": 1.4921875, |
| "learning_rate": 0.00010733702416689894, |
| "loss": 0.17359197, |
| "memory(GiB)": 58.24, |
| "step": 16200, |
| "train_speed(iter/s)": 0.76593 |
| }, |
| { |
| "epoch": 1.51024208566108, |
| "grad_norm": 1.1484375, |
| "learning_rate": 0.00010713228963002095, |
| "loss": 0.20052829, |
| "memory(GiB)": 58.24, |
| "step": 16220, |
| "train_speed(iter/s)": 0.766002 |
| }, |
| { |
| "epoch": 1.5121042830540037, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00010692752503997405, |
| "loss": 0.19778749, |
| "memory(GiB)": 58.24, |
| "step": 16240, |
| "train_speed(iter/s)": 0.766039 |
| }, |
| { |
| "epoch": 1.5139664804469275, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.00010672273125957018, |
| "loss": 0.20196233, |
| "memory(GiB)": 58.24, |
| "step": 16260, |
| "train_speed(iter/s)": 0.766025 |
| }, |
| { |
| "epoch": 1.5158286778398509, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.00010651790915174431, |
| "loss": 0.18381786, |
| "memory(GiB)": 58.24, |
| "step": 16280, |
| "train_speed(iter/s)": 0.766031 |
| }, |
| { |
| "epoch": 1.5176908752327747, |
| "grad_norm": 1.0703125, |
| "learning_rate": 0.0001063130595795508, |
| "loss": 0.20333216, |
| "memory(GiB)": 58.24, |
| "step": 16300, |
| "train_speed(iter/s)": 0.766055 |
| }, |
| { |
| "epoch": 1.5195530726256983, |
| "grad_norm": 1.078125, |
| "learning_rate": 0.00010610818340615967, |
| "loss": 0.20140872, |
| "memory(GiB)": 58.24, |
| "step": 16320, |
| "train_speed(iter/s)": 0.76611 |
| }, |
| { |
| "epoch": 1.5214152700186219, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.00010590328149485304, |
| "loss": 0.20071106, |
| "memory(GiB)": 58.24, |
| "step": 16340, |
| "train_speed(iter/s)": 0.76614 |
| }, |
| { |
| "epoch": 1.5232774674115457, |
| "grad_norm": 1.390625, |
| "learning_rate": 0.00010569835470902154, |
| "loss": 0.18391163, |
| "memory(GiB)": 58.24, |
| "step": 16360, |
| "train_speed(iter/s)": 0.766176 |
| }, |
| { |
| "epoch": 1.5251396648044693, |
| "grad_norm": 1.53125, |
| "learning_rate": 0.00010549340391216058, |
| "loss": 0.19652946, |
| "memory(GiB)": 58.24, |
| "step": 16380, |
| "train_speed(iter/s)": 0.76621 |
| }, |
| { |
| "epoch": 1.5270018621973929, |
| "grad_norm": 1.609375, |
| "learning_rate": 0.0001052884299678667, |
| "loss": 0.20681992, |
| "memory(GiB)": 58.24, |
| "step": 16400, |
| "train_speed(iter/s)": 0.766211 |
| }, |
| { |
| "epoch": 1.5288640595903167, |
| "grad_norm": 0.99609375, |
| "learning_rate": 0.00010508343373983403, |
| "loss": 0.17298584, |
| "memory(GiB)": 58.24, |
| "step": 16420, |
| "train_speed(iter/s)": 0.76628 |
| }, |
| { |
| "epoch": 1.5307262569832403, |
| "grad_norm": 1.3125, |
| "learning_rate": 0.00010487841609185054, |
| "loss": 0.20368049, |
| "memory(GiB)": 58.24, |
| "step": 16440, |
| "train_speed(iter/s)": 0.766332 |
| }, |
| { |
| "epoch": 1.5325884543761639, |
| "grad_norm": 1.6484375, |
| "learning_rate": 0.00010467337788779456, |
| "loss": 0.1869107, |
| "memory(GiB)": 58.24, |
| "step": 16460, |
| "train_speed(iter/s)": 0.766398 |
| }, |
| { |
| "epoch": 1.5344506517690877, |
| "grad_norm": 0.96484375, |
| "learning_rate": 0.00010446831999163097, |
| "loss": 0.1824265, |
| "memory(GiB)": 58.24, |
| "step": 16480, |
| "train_speed(iter/s)": 0.766432 |
| }, |
| { |
| "epoch": 1.536312849162011, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00010426324326740759, |
| "loss": 0.18272617, |
| "memory(GiB)": 58.24, |
| "step": 16500, |
| "train_speed(iter/s)": 0.766466 |
| }, |
| { |
| "epoch": 1.5381750465549349, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.00010405814857925163, |
| "loss": 0.18423979, |
| "memory(GiB)": 58.24, |
| "step": 16520, |
| "train_speed(iter/s)": 0.766478 |
| }, |
| { |
| "epoch": 1.5400372439478585, |
| "grad_norm": 1.4296875, |
| "learning_rate": 0.000103853036791366, |
| "loss": 0.19179487, |
| "memory(GiB)": 58.24, |
| "step": 16540, |
| "train_speed(iter/s)": 0.766515 |
| }, |
| { |
| "epoch": 1.541899441340782, |
| "grad_norm": 1.015625, |
| "learning_rate": 0.00010364790876802564, |
| "loss": 0.18541435, |
| "memory(GiB)": 58.24, |
| "step": 16560, |
| "train_speed(iter/s)": 0.766542 |
| }, |
| { |
| "epoch": 1.5437616387337059, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00010344276537357385, |
| "loss": 0.19410855, |
| "memory(GiB)": 58.24, |
| "step": 16580, |
| "train_speed(iter/s)": 0.76658 |
| }, |
| { |
| "epoch": 1.5456238361266295, |
| "grad_norm": 1.3984375, |
| "learning_rate": 0.00010323760747241879, |
| "loss": 0.19085188, |
| "memory(GiB)": 58.24, |
| "step": 16600, |
| "train_speed(iter/s)": 0.766623 |
| }, |
| { |
| "epoch": 1.547486033519553, |
| "grad_norm": 1.8046875, |
| "learning_rate": 0.00010303243592902973, |
| "loss": 0.19986777, |
| "memory(GiB)": 58.24, |
| "step": 16620, |
| "train_speed(iter/s)": 0.766689 |
| }, |
| { |
| "epoch": 1.5493482309124769, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.00010282725160793338, |
| "loss": 0.18039792, |
| "memory(GiB)": 58.24, |
| "step": 16640, |
| "train_speed(iter/s)": 0.76671 |
| }, |
| { |
| "epoch": 1.5512104283054002, |
| "grad_norm": 1.5703125, |
| "learning_rate": 0.00010262205537371032, |
| "loss": 0.18429933, |
| "memory(GiB)": 58.24, |
| "step": 16660, |
| "train_speed(iter/s)": 0.766755 |
| }, |
| { |
| "epoch": 1.553072625698324, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.0001024168480909913, |
| "loss": 0.18001516, |
| "memory(GiB)": 58.24, |
| "step": 16680, |
| "train_speed(iter/s)": 0.766784 |
| }, |
| { |
| "epoch": 1.5549348230912476, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.00010221163062445367, |
| "loss": 0.17717788, |
| "memory(GiB)": 58.24, |
| "step": 16700, |
| "train_speed(iter/s)": 0.76683 |
| }, |
| { |
| "epoch": 1.5567970204841712, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00010200640383881765, |
| "loss": 0.18292215, |
| "memory(GiB)": 58.24, |
| "step": 16720, |
| "train_speed(iter/s)": 0.766884 |
| }, |
| { |
| "epoch": 1.558659217877095, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00010180116859884278, |
| "loss": 0.20042725, |
| "memory(GiB)": 58.24, |
| "step": 16740, |
| "train_speed(iter/s)": 0.766896 |
| }, |
| { |
| "epoch": 1.5605214152700186, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.00010159592576932418, |
| "loss": 0.1987066, |
| "memory(GiB)": 58.24, |
| "step": 16760, |
| "train_speed(iter/s)": 0.766911 |
| }, |
| { |
| "epoch": 1.5623836126629422, |
| "grad_norm": 1.25, |
| "learning_rate": 0.0001013906762150889, |
| "loss": 0.17853866, |
| "memory(GiB)": 58.24, |
| "step": 16780, |
| "train_speed(iter/s)": 0.766976 |
| }, |
| { |
| "epoch": 1.564245810055866, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.00010118542080099247, |
| "loss": 0.16571226, |
| "memory(GiB)": 58.24, |
| "step": 16800, |
| "train_speed(iter/s)": 0.767027 |
| }, |
| { |
| "epoch": 1.5661080074487894, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.00010098016039191496, |
| "loss": 0.17807512, |
| "memory(GiB)": 58.24, |
| "step": 16820, |
| "train_speed(iter/s)": 0.767079 |
| }, |
| { |
| "epoch": 1.5679702048417132, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.00010077489585275758, |
| "loss": 0.17454565, |
| "memory(GiB)": 58.24, |
| "step": 16840, |
| "train_speed(iter/s)": 0.767166 |
| }, |
| { |
| "epoch": 1.5698324022346368, |
| "grad_norm": 1.109375, |
| "learning_rate": 0.00010056962804843892, |
| "loss": 0.18413407, |
| "memory(GiB)": 58.24, |
| "step": 16860, |
| "train_speed(iter/s)": 0.767203 |
| }, |
| { |
| "epoch": 1.5716945996275604, |
| "grad_norm": 1.578125, |
| "learning_rate": 0.00010036435784389135, |
| "loss": 0.17773303, |
| "memory(GiB)": 58.24, |
| "step": 16880, |
| "train_speed(iter/s)": 0.767245 |
| }, |
| { |
| "epoch": 1.5735567970204842, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.0001001590861040573, |
| "loss": 0.19698086, |
| "memory(GiB)": 58.24, |
| "step": 16900, |
| "train_speed(iter/s)": 0.767309 |
| }, |
| { |
| "epoch": 1.5754189944134078, |
| "grad_norm": 1.1953125, |
| "learning_rate": 9.99538136938857e-05, |
| "loss": 0.18003293, |
| "memory(GiB)": 58.24, |
| "step": 16920, |
| "train_speed(iter/s)": 0.767369 |
| }, |
| { |
| "epoch": 1.5772811918063314, |
| "grad_norm": 1.0859375, |
| "learning_rate": 9.974854147832826e-05, |
| "loss": 0.18390901, |
| "memory(GiB)": 58.24, |
| "step": 16940, |
| "train_speed(iter/s)": 0.767439 |
| }, |
| { |
| "epoch": 1.5791433891992552, |
| "grad_norm": 1.1015625, |
| "learning_rate": 9.954327032233599e-05, |
| "loss": 0.17549541, |
| "memory(GiB)": 58.24, |
| "step": 16960, |
| "train_speed(iter/s)": 0.767475 |
| }, |
| { |
| "epoch": 1.5810055865921788, |
| "grad_norm": 1.1640625, |
| "learning_rate": 9.933800109085529e-05, |
| "loss": 0.17035363, |
| "memory(GiB)": 58.24, |
| "step": 16980, |
| "train_speed(iter/s)": 0.767506 |
| }, |
| { |
| "epoch": 1.5828677839851024, |
| "grad_norm": 1.078125, |
| "learning_rate": 9.913273464882459e-05, |
| "loss": 0.16605544, |
| "memory(GiB)": 58.24, |
| "step": 17000, |
| "train_speed(iter/s)": 0.767541 |
| }, |
| { |
| "epoch": 1.5847299813780262, |
| "grad_norm": 1.40625, |
| "learning_rate": 9.892747186117047e-05, |
| "loss": 0.1817858, |
| "memory(GiB)": 58.24, |
| "step": 17020, |
| "train_speed(iter/s)": 0.767592 |
| }, |
| { |
| "epoch": 1.5865921787709496, |
| "grad_norm": 1.3203125, |
| "learning_rate": 9.872221359280408e-05, |
| "loss": 0.17891808, |
| "memory(GiB)": 58.24, |
| "step": 17040, |
| "train_speed(iter/s)": 0.767651 |
| }, |
| { |
| "epoch": 1.5884543761638734, |
| "grad_norm": 1.296875, |
| "learning_rate": 9.851696070861766e-05, |
| "loss": 0.18936658, |
| "memory(GiB)": 58.24, |
| "step": 17060, |
| "train_speed(iter/s)": 0.767704 |
| }, |
| { |
| "epoch": 1.590316573556797, |
| "grad_norm": 1.296875, |
| "learning_rate": 9.831171407348064e-05, |
| "loss": 0.18166518, |
| "memory(GiB)": 58.24, |
| "step": 17080, |
| "train_speed(iter/s)": 0.767746 |
| }, |
| { |
| "epoch": 1.5921787709497206, |
| "grad_norm": 1.328125, |
| "learning_rate": 9.810647455223616e-05, |
| "loss": 0.19375865, |
| "memory(GiB)": 58.24, |
| "step": 17100, |
| "train_speed(iter/s)": 0.767823 |
| }, |
| { |
| "epoch": 1.5940409683426444, |
| "grad_norm": 1.2578125, |
| "learning_rate": 9.790124300969743e-05, |
| "loss": 0.18132524, |
| "memory(GiB)": 58.24, |
| "step": 17120, |
| "train_speed(iter/s)": 0.767869 |
| }, |
| { |
| "epoch": 1.595903165735568, |
| "grad_norm": 1.21875, |
| "learning_rate": 9.769602031064394e-05, |
| "loss": 0.16648118, |
| "memory(GiB)": 58.24, |
| "step": 17140, |
| "train_speed(iter/s)": 0.767933 |
| }, |
| { |
| "epoch": 1.5977653631284916, |
| "grad_norm": 1.3046875, |
| "learning_rate": 9.749080731981806e-05, |
| "loss": 0.17684975, |
| "memory(GiB)": 58.24, |
| "step": 17160, |
| "train_speed(iter/s)": 0.76797 |
| }, |
| { |
| "epoch": 1.5996275605214154, |
| "grad_norm": 1.34375, |
| "learning_rate": 9.728560490192112e-05, |
| "loss": 0.17072924, |
| "memory(GiB)": 58.24, |
| "step": 17180, |
| "train_speed(iter/s)": 0.767983 |
| }, |
| { |
| "epoch": 1.6014897579143388, |
| "grad_norm": 0.9921875, |
| "learning_rate": 9.708041392160992e-05, |
| "loss": 0.17846963, |
| "memory(GiB)": 58.24, |
| "step": 17200, |
| "train_speed(iter/s)": 0.768026 |
| }, |
| { |
| "epoch": 1.6033519553072626, |
| "grad_norm": 1.4609375, |
| "learning_rate": 9.687523524349314e-05, |
| "loss": 0.19516928, |
| "memory(GiB)": 58.24, |
| "step": 17220, |
| "train_speed(iter/s)": 0.768095 |
| }, |
| { |
| "epoch": 1.6052141527001862, |
| "grad_norm": 1.03125, |
| "learning_rate": 9.667006973212755e-05, |
| "loss": 0.16764045, |
| "memory(GiB)": 58.24, |
| "step": 17240, |
| "train_speed(iter/s)": 0.768152 |
| }, |
| { |
| "epoch": 1.6070763500931098, |
| "grad_norm": 1.09375, |
| "learning_rate": 9.646491825201447e-05, |
| "loss": 0.18242341, |
| "memory(GiB)": 58.24, |
| "step": 17260, |
| "train_speed(iter/s)": 0.768197 |
| }, |
| { |
| "epoch": 1.6089385474860336, |
| "grad_norm": 1.0546875, |
| "learning_rate": 9.625978166759612e-05, |
| "loss": 0.17120929, |
| "memory(GiB)": 58.24, |
| "step": 17280, |
| "train_speed(iter/s)": 0.768261 |
| }, |
| { |
| "epoch": 1.6108007448789572, |
| "grad_norm": 1.1484375, |
| "learning_rate": 9.605466084325186e-05, |
| "loss": 0.16303382, |
| "memory(GiB)": 58.24, |
| "step": 17300, |
| "train_speed(iter/s)": 0.768326 |
| }, |
| { |
| "epoch": 1.6126629422718808, |
| "grad_norm": 1.5234375, |
| "learning_rate": 9.58495566432948e-05, |
| "loss": 0.16613436, |
| "memory(GiB)": 58.24, |
| "step": 17320, |
| "train_speed(iter/s)": 0.768393 |
| }, |
| { |
| "epoch": 1.6145251396648046, |
| "grad_norm": 0.890625, |
| "learning_rate": 9.564446993196785e-05, |
| "loss": 0.16981788, |
| "memory(GiB)": 58.24, |
| "step": 17340, |
| "train_speed(iter/s)": 0.768432 |
| }, |
| { |
| "epoch": 1.616387337057728, |
| "grad_norm": 1.21875, |
| "learning_rate": 9.543940157344029e-05, |
| "loss": 0.18024995, |
| "memory(GiB)": 58.24, |
| "step": 17360, |
| "train_speed(iter/s)": 0.768451 |
| }, |
| { |
| "epoch": 1.6182495344506518, |
| "grad_norm": 1.234375, |
| "learning_rate": 9.523435243180407e-05, |
| "loss": 0.18168037, |
| "memory(GiB)": 58.24, |
| "step": 17380, |
| "train_speed(iter/s)": 0.768465 |
| }, |
| { |
| "epoch": 1.6201117318435754, |
| "grad_norm": 1.0625, |
| "learning_rate": 9.502932337107018e-05, |
| "loss": 0.17241347, |
| "memory(GiB)": 58.24, |
| "step": 17400, |
| "train_speed(iter/s)": 0.768438 |
| }, |
| { |
| "epoch": 1.621973929236499, |
| "grad_norm": 1.3515625, |
| "learning_rate": 9.482431525516494e-05, |
| "loss": 0.1620393, |
| "memory(GiB)": 58.24, |
| "step": 17420, |
| "train_speed(iter/s)": 0.768459 |
| }, |
| { |
| "epoch": 1.6238361266294228, |
| "grad_norm": 1.3125, |
| "learning_rate": 9.46193289479265e-05, |
| "loss": 0.18191284, |
| "memory(GiB)": 58.24, |
| "step": 17440, |
| "train_speed(iter/s)": 0.768493 |
| }, |
| { |
| "epoch": 1.6256983240223464, |
| "grad_norm": 1.3125, |
| "learning_rate": 9.4414365313101e-05, |
| "loss": 0.17514572, |
| "memory(GiB)": 58.24, |
| "step": 17460, |
| "train_speed(iter/s)": 0.768567 |
| }, |
| { |
| "epoch": 1.62756052141527, |
| "grad_norm": 1.3671875, |
| "learning_rate": 9.420942521433918e-05, |
| "loss": 0.17852993, |
| "memory(GiB)": 58.24, |
| "step": 17480, |
| "train_speed(iter/s)": 0.768601 |
| }, |
| { |
| "epoch": 1.6294227188081938, |
| "grad_norm": 1.4140625, |
| "learning_rate": 9.400450951519253e-05, |
| "loss": 0.16387472, |
| "memory(GiB)": 58.24, |
| "step": 17500, |
| "train_speed(iter/s)": 0.768606 |
| }, |
| { |
| "epoch": 1.6312849162011172, |
| "grad_norm": 2.0, |
| "learning_rate": 9.379961907910967e-05, |
| "loss": 0.16921017, |
| "memory(GiB)": 58.24, |
| "step": 17520, |
| "train_speed(iter/s)": 0.76869 |
| }, |
| { |
| "epoch": 1.633147113594041, |
| "grad_norm": 1.1875, |
| "learning_rate": 9.359475476943292e-05, |
| "loss": 0.17136528, |
| "memory(GiB)": 58.24, |
| "step": 17540, |
| "train_speed(iter/s)": 0.768707 |
| }, |
| { |
| "epoch": 1.6350093109869648, |
| "grad_norm": 1.3125, |
| "learning_rate": 9.338991744939442e-05, |
| "loss": 0.18011303, |
| "memory(GiB)": 58.24, |
| "step": 17560, |
| "train_speed(iter/s)": 0.768748 |
| }, |
| { |
| "epoch": 1.6368715083798882, |
| "grad_norm": 1.3046875, |
| "learning_rate": 9.318510798211253e-05, |
| "loss": 0.16851079, |
| "memory(GiB)": 58.24, |
| "step": 17580, |
| "train_speed(iter/s)": 0.768783 |
| }, |
| { |
| "epoch": 1.638733705772812, |
| "grad_norm": 1.3359375, |
| "learning_rate": 9.298032723058838e-05, |
| "loss": 0.18129635, |
| "memory(GiB)": 58.24, |
| "step": 17600, |
| "train_speed(iter/s)": 0.768787 |
| }, |
| { |
| "epoch": 1.6405959031657356, |
| "grad_norm": 1.390625, |
| "learning_rate": 9.277557605770194e-05, |
| "loss": 0.15408287, |
| "memory(GiB)": 58.24, |
| "step": 17620, |
| "train_speed(iter/s)": 0.768849 |
| }, |
| { |
| "epoch": 1.6424581005586592, |
| "grad_norm": 0.99609375, |
| "learning_rate": 9.257085532620875e-05, |
| "loss": 0.1833734, |
| "memory(GiB)": 58.24, |
| "step": 17640, |
| "train_speed(iter/s)": 0.768885 |
| }, |
| { |
| "epoch": 1.644320297951583, |
| "grad_norm": 1.140625, |
| "learning_rate": 9.236616589873593e-05, |
| "loss": 0.16197611, |
| "memory(GiB)": 58.24, |
| "step": 17660, |
| "train_speed(iter/s)": 0.768904 |
| }, |
| { |
| "epoch": 1.6461824953445066, |
| "grad_norm": 1.1484375, |
| "learning_rate": 9.216150863777865e-05, |
| "loss": 0.16501439, |
| "memory(GiB)": 58.24, |
| "step": 17680, |
| "train_speed(iter/s)": 0.768969 |
| }, |
| { |
| "epoch": 1.6480446927374302, |
| "grad_norm": 1.25, |
| "learning_rate": 9.195688440569675e-05, |
| "loss": 0.17173784, |
| "memory(GiB)": 58.24, |
| "step": 17700, |
| "train_speed(iter/s)": 0.769007 |
| }, |
| { |
| "epoch": 1.649906890130354, |
| "grad_norm": 1.3828125, |
| "learning_rate": 9.175229406471069e-05, |
| "loss": 0.15976682, |
| "memory(GiB)": 58.24, |
| "step": 17720, |
| "train_speed(iter/s)": 0.769066 |
| }, |
| { |
| "epoch": 1.6517690875232773, |
| "grad_norm": 1.2578125, |
| "learning_rate": 9.15477384768982e-05, |
| "loss": 0.16614034, |
| "memory(GiB)": 58.24, |
| "step": 17740, |
| "train_speed(iter/s)": 0.769113 |
| }, |
| { |
| "epoch": 1.6536312849162011, |
| "grad_norm": 1.1171875, |
| "learning_rate": 9.134321850419062e-05, |
| "loss": 0.19003222, |
| "memory(GiB)": 58.24, |
| "step": 17760, |
| "train_speed(iter/s)": 0.769151 |
| }, |
| { |
| "epoch": 1.6554934823091247, |
| "grad_norm": 1.0078125, |
| "learning_rate": 9.11387350083691e-05, |
| "loss": 0.17415506, |
| "memory(GiB)": 58.24, |
| "step": 17780, |
| "train_speed(iter/s)": 0.769173 |
| }, |
| { |
| "epoch": 1.6573556797020483, |
| "grad_norm": 1.203125, |
| "learning_rate": 9.093428885106127e-05, |
| "loss": 0.16781898, |
| "memory(GiB)": 58.24, |
| "step": 17800, |
| "train_speed(iter/s)": 0.769224 |
| }, |
| { |
| "epoch": 1.6592178770949721, |
| "grad_norm": 1.15625, |
| "learning_rate": 9.072988089373724e-05, |
| "loss": 0.16520228, |
| "memory(GiB)": 58.24, |
| "step": 17820, |
| "train_speed(iter/s)": 0.769277 |
| }, |
| { |
| "epoch": 1.6610800744878957, |
| "grad_norm": 1.578125, |
| "learning_rate": 9.052551199770623e-05, |
| "loss": 0.17041247, |
| "memory(GiB)": 58.24, |
| "step": 17840, |
| "train_speed(iter/s)": 0.769331 |
| }, |
| { |
| "epoch": 1.6629422718808193, |
| "grad_norm": 1.09375, |
| "learning_rate": 9.032118302411293e-05, |
| "loss": 0.18079939, |
| "memory(GiB)": 58.24, |
| "step": 17860, |
| "train_speed(iter/s)": 0.769372 |
| }, |
| { |
| "epoch": 1.6648044692737431, |
| "grad_norm": 1.015625, |
| "learning_rate": 9.011689483393375e-05, |
| "loss": 0.15872812, |
| "memory(GiB)": 58.24, |
| "step": 17880, |
| "train_speed(iter/s)": 0.76943 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 1.515625, |
| "learning_rate": 8.991264828797319e-05, |
| "loss": 0.16760997, |
| "memory(GiB)": 58.24, |
| "step": 17900, |
| "train_speed(iter/s)": 0.769459 |
| }, |
| { |
| "epoch": 1.6685288640595903, |
| "grad_norm": 1.125, |
| "learning_rate": 8.970844424686041e-05, |
| "loss": 0.16810267, |
| "memory(GiB)": 58.24, |
| "step": 17920, |
| "train_speed(iter/s)": 0.769493 |
| }, |
| { |
| "epoch": 1.670391061452514, |
| "grad_norm": 1.25, |
| "learning_rate": 8.950428357104535e-05, |
| "loss": 0.16381236, |
| "memory(GiB)": 58.24, |
| "step": 17940, |
| "train_speed(iter/s)": 0.769526 |
| }, |
| { |
| "epoch": 1.6722532588454375, |
| "grad_norm": 0.8828125, |
| "learning_rate": 8.930016712079531e-05, |
| "loss": 0.15553318, |
| "memory(GiB)": 58.24, |
| "step": 17960, |
| "train_speed(iter/s)": 0.76956 |
| }, |
| { |
| "epoch": 1.6741154562383613, |
| "grad_norm": 0.96875, |
| "learning_rate": 8.909609575619121e-05, |
| "loss": 0.17401633, |
| "memory(GiB)": 58.24, |
| "step": 17980, |
| "train_speed(iter/s)": 0.76963 |
| }, |
| { |
| "epoch": 1.675977653631285, |
| "grad_norm": 1.5703125, |
| "learning_rate": 8.889207033712392e-05, |
| "loss": 0.16966208, |
| "memory(GiB)": 58.24, |
| "step": 18000, |
| "train_speed(iter/s)": 0.76966 |
| }, |
| { |
| "epoch": 1.675977653631285, |
| "eval_loss": 0.3787587583065033, |
| "eval_runtime": 76.3917, |
| "eval_samples_per_second": 181.761, |
| "eval_steps_per_second": 1.427, |
| "step": 18000 |
| }, |
| { |
| "epoch": 1.6778398510242085, |
| "grad_norm": 1.140625, |
| "learning_rate": 8.868809172329084e-05, |
| "loss": 0.16671758, |
| "memory(GiB)": 58.24, |
| "step": 18020, |
| "train_speed(iter/s)": 0.766139 |
| }, |
| { |
| "epoch": 1.6797020484171323, |
| "grad_norm": 1.15625, |
| "learning_rate": 8.848416077419208e-05, |
| "loss": 0.1657429, |
| "memory(GiB)": 58.24, |
| "step": 18040, |
| "train_speed(iter/s)": 0.766174 |
| }, |
| { |
| "epoch": 1.6815642458100557, |
| "grad_norm": 1.359375, |
| "learning_rate": 8.828027834912686e-05, |
| "loss": 0.16353419, |
| "memory(GiB)": 58.24, |
| "step": 18060, |
| "train_speed(iter/s)": 0.766179 |
| }, |
| { |
| "epoch": 1.6834264432029795, |
| "grad_norm": 1.28125, |
| "learning_rate": 8.807644530719002e-05, |
| "loss": 0.15319026, |
| "memory(GiB)": 58.24, |
| "step": 18080, |
| "train_speed(iter/s)": 0.766193 |
| }, |
| { |
| "epoch": 1.6852886405959033, |
| "grad_norm": 1.171875, |
| "learning_rate": 8.787266250726824e-05, |
| "loss": 0.15553379, |
| "memory(GiB)": 58.24, |
| "step": 18100, |
| "train_speed(iter/s)": 0.76623 |
| }, |
| { |
| "epoch": 1.6871508379888267, |
| "grad_norm": 1.40625, |
| "learning_rate": 8.766893080803659e-05, |
| "loss": 0.16313032, |
| "memory(GiB)": 58.24, |
| "step": 18120, |
| "train_speed(iter/s)": 0.766265 |
| }, |
| { |
| "epoch": 1.6890130353817505, |
| "grad_norm": 1.140625, |
| "learning_rate": 8.746525106795476e-05, |
| "loss": 0.16240375, |
| "memory(GiB)": 58.24, |
| "step": 18140, |
| "train_speed(iter/s)": 0.766296 |
| }, |
| { |
| "epoch": 1.690875232774674, |
| "grad_norm": 1.234375, |
| "learning_rate": 8.726162414526343e-05, |
| "loss": 0.14353172, |
| "memory(GiB)": 58.24, |
| "step": 18160, |
| "train_speed(iter/s)": 0.766374 |
| }, |
| { |
| "epoch": 1.6927374301675977, |
| "grad_norm": 1.203125, |
| "learning_rate": 8.705805089798089e-05, |
| "loss": 0.15949094, |
| "memory(GiB)": 58.24, |
| "step": 18180, |
| "train_speed(iter/s)": 0.766413 |
| }, |
| { |
| "epoch": 1.6945996275605215, |
| "grad_norm": 1.1484375, |
| "learning_rate": 8.685453218389914e-05, |
| "loss": 0.15888497, |
| "memory(GiB)": 58.24, |
| "step": 18200, |
| "train_speed(iter/s)": 0.76646 |
| }, |
| { |
| "epoch": 1.696461824953445, |
| "grad_norm": 1.265625, |
| "learning_rate": 8.665106886058037e-05, |
| "loss": 0.16196404, |
| "memory(GiB)": 58.24, |
| "step": 18220, |
| "train_speed(iter/s)": 0.766503 |
| }, |
| { |
| "epoch": 1.6983240223463687, |
| "grad_norm": 1.1484375, |
| "learning_rate": 8.644766178535351e-05, |
| "loss": 0.15065837, |
| "memory(GiB)": 58.24, |
| "step": 18240, |
| "train_speed(iter/s)": 0.766528 |
| }, |
| { |
| "epoch": 1.7001862197392925, |
| "grad_norm": 1.25, |
| "learning_rate": 8.624431181531029e-05, |
| "loss": 0.15646677, |
| "memory(GiB)": 58.24, |
| "step": 18260, |
| "train_speed(iter/s)": 0.766571 |
| }, |
| { |
| "epoch": 1.7020484171322159, |
| "grad_norm": 1.2578125, |
| "learning_rate": 8.604101980730202e-05, |
| "loss": 0.17705973, |
| "memory(GiB)": 58.24, |
| "step": 18280, |
| "train_speed(iter/s)": 0.766602 |
| }, |
| { |
| "epoch": 1.7039106145251397, |
| "grad_norm": 0.96875, |
| "learning_rate": 8.583778661793565e-05, |
| "loss": 0.1656255, |
| "memory(GiB)": 58.24, |
| "step": 18300, |
| "train_speed(iter/s)": 0.76666 |
| }, |
| { |
| "epoch": 1.7057728119180633, |
| "grad_norm": 1.4765625, |
| "learning_rate": 8.563461310357023e-05, |
| "loss": 0.16021589, |
| "memory(GiB)": 58.24, |
| "step": 18320, |
| "train_speed(iter/s)": 0.766706 |
| }, |
| { |
| "epoch": 1.7076350093109869, |
| "grad_norm": 1.4921875, |
| "learning_rate": 8.543150012031357e-05, |
| "loss": 0.15275285, |
| "memory(GiB)": 58.24, |
| "step": 18340, |
| "train_speed(iter/s)": 0.766732 |
| }, |
| { |
| "epoch": 1.7094972067039107, |
| "grad_norm": 1.1484375, |
| "learning_rate": 8.522844852401824e-05, |
| "loss": 0.16219683, |
| "memory(GiB)": 58.24, |
| "step": 18360, |
| "train_speed(iter/s)": 0.766794 |
| }, |
| { |
| "epoch": 1.7113594040968343, |
| "grad_norm": 1.09375, |
| "learning_rate": 8.502545917027817e-05, |
| "loss": 0.1585273, |
| "memory(GiB)": 58.24, |
| "step": 18380, |
| "train_speed(iter/s)": 0.7668 |
| }, |
| { |
| "epoch": 1.7132216014897579, |
| "grad_norm": 0.8359375, |
| "learning_rate": 8.482253291442508e-05, |
| "loss": 0.15480431, |
| "memory(GiB)": 58.24, |
| "step": 18400, |
| "train_speed(iter/s)": 0.766861 |
| }, |
| { |
| "epoch": 1.7150837988826817, |
| "grad_norm": 1.125, |
| "learning_rate": 8.461967061152475e-05, |
| "loss": 0.16871202, |
| "memory(GiB)": 58.24, |
| "step": 18420, |
| "train_speed(iter/s)": 0.766888 |
| }, |
| { |
| "epoch": 1.716945996275605, |
| "grad_norm": 1.203125, |
| "learning_rate": 8.441687311637358e-05, |
| "loss": 0.14869682, |
| "memory(GiB)": 58.24, |
| "step": 18440, |
| "train_speed(iter/s)": 0.766939 |
| }, |
| { |
| "epoch": 1.7188081936685289, |
| "grad_norm": 1.0546875, |
| "learning_rate": 8.421414128349478e-05, |
| "loss": 0.15027819, |
| "memory(GiB)": 58.24, |
| "step": 18460, |
| "train_speed(iter/s)": 0.76699 |
| }, |
| { |
| "epoch": 1.7206703910614525, |
| "grad_norm": 1.1171875, |
| "learning_rate": 8.401147596713492e-05, |
| "loss": 0.1574163, |
| "memory(GiB)": 58.24, |
| "step": 18480, |
| "train_speed(iter/s)": 0.767002 |
| }, |
| { |
| "epoch": 1.722532588454376, |
| "grad_norm": 1.0703125, |
| "learning_rate": 8.380887802126036e-05, |
| "loss": 0.1544909, |
| "memory(GiB)": 58.24, |
| "step": 18500, |
| "train_speed(iter/s)": 0.767039 |
| }, |
| { |
| "epoch": 1.7243947858472999, |
| "grad_norm": 1.1484375, |
| "learning_rate": 8.360634829955349e-05, |
| "loss": 0.14190108, |
| "memory(GiB)": 58.24, |
| "step": 18520, |
| "train_speed(iter/s)": 0.767069 |
| }, |
| { |
| "epoch": 1.7262569832402235, |
| "grad_norm": 1.1328125, |
| "learning_rate": 8.340388765540923e-05, |
| "loss": 0.15983729, |
| "memory(GiB)": 58.24, |
| "step": 18540, |
| "train_speed(iter/s)": 0.76707 |
| }, |
| { |
| "epoch": 1.728119180633147, |
| "grad_norm": 1.3046875, |
| "learning_rate": 8.320149694193147e-05, |
| "loss": 0.16751406, |
| "memory(GiB)": 58.24, |
| "step": 18560, |
| "train_speed(iter/s)": 0.767086 |
| }, |
| { |
| "epoch": 1.7299813780260709, |
| "grad_norm": 1.3125, |
| "learning_rate": 8.299917701192943e-05, |
| "loss": 0.16194601, |
| "memory(GiB)": 58.24, |
| "step": 18580, |
| "train_speed(iter/s)": 0.767106 |
| }, |
| { |
| "epoch": 1.7318435754189943, |
| "grad_norm": 1.1796875, |
| "learning_rate": 8.279692871791411e-05, |
| "loss": 0.16382195, |
| "memory(GiB)": 58.24, |
| "step": 18600, |
| "train_speed(iter/s)": 0.767163 |
| }, |
| { |
| "epoch": 1.733705772811918, |
| "grad_norm": 1.40625, |
| "learning_rate": 8.259475291209454e-05, |
| "loss": 0.15488023, |
| "memory(GiB)": 58.24, |
| "step": 18620, |
| "train_speed(iter/s)": 0.767205 |
| }, |
| { |
| "epoch": 1.7355679702048417, |
| "grad_norm": 1.25, |
| "learning_rate": 8.239265044637438e-05, |
| "loss": 0.1414247, |
| "memory(GiB)": 58.24, |
| "step": 18640, |
| "train_speed(iter/s)": 0.767229 |
| }, |
| { |
| "epoch": 1.7374301675977653, |
| "grad_norm": 1.2890625, |
| "learning_rate": 8.219062217234832e-05, |
| "loss": 0.15719596, |
| "memory(GiB)": 58.24, |
| "step": 18660, |
| "train_speed(iter/s)": 0.76727 |
| }, |
| { |
| "epoch": 1.739292364990689, |
| "grad_norm": 1.28125, |
| "learning_rate": 8.198866894129832e-05, |
| "loss": 0.15911665, |
| "memory(GiB)": 58.24, |
| "step": 18680, |
| "train_speed(iter/s)": 0.767339 |
| }, |
| { |
| "epoch": 1.7411545623836127, |
| "grad_norm": 1.0, |
| "learning_rate": 8.178679160419018e-05, |
| "loss": 0.14015193, |
| "memory(GiB)": 58.24, |
| "step": 18700, |
| "train_speed(iter/s)": 0.767386 |
| }, |
| { |
| "epoch": 1.7430167597765363, |
| "grad_norm": 1.2890625, |
| "learning_rate": 8.158499101166997e-05, |
| "loss": 0.16646228, |
| "memory(GiB)": 58.24, |
| "step": 18720, |
| "train_speed(iter/s)": 0.767387 |
| }, |
| { |
| "epoch": 1.74487895716946, |
| "grad_norm": 1.015625, |
| "learning_rate": 8.138326801406025e-05, |
| "loss": 0.13748178, |
| "memory(GiB)": 58.24, |
| "step": 18740, |
| "train_speed(iter/s)": 0.767449 |
| }, |
| { |
| "epoch": 1.7467411545623837, |
| "grad_norm": 1.2109375, |
| "learning_rate": 8.118162346135674e-05, |
| "loss": 0.16049628, |
| "memory(GiB)": 58.24, |
| "step": 18760, |
| "train_speed(iter/s)": 0.767496 |
| }, |
| { |
| "epoch": 1.7486033519553073, |
| "grad_norm": 0.92578125, |
| "learning_rate": 8.098005820322455e-05, |
| "loss": 0.14880733, |
| "memory(GiB)": 58.24, |
| "step": 18780, |
| "train_speed(iter/s)": 0.767549 |
| }, |
| { |
| "epoch": 1.750465549348231, |
| "grad_norm": 1.078125, |
| "learning_rate": 8.077857308899465e-05, |
| "loss": 0.16304884, |
| "memory(GiB)": 58.24, |
| "step": 18800, |
| "train_speed(iter/s)": 0.767576 |
| }, |
| { |
| "epoch": 1.7523277467411544, |
| "grad_norm": 1.1640625, |
| "learning_rate": 8.057716896766044e-05, |
| "loss": 0.14319315, |
| "memory(GiB)": 58.24, |
| "step": 18820, |
| "train_speed(iter/s)": 0.767627 |
| }, |
| { |
| "epoch": 1.7541899441340782, |
| "grad_norm": 1.34375, |
| "learning_rate": 8.037584668787391e-05, |
| "loss": 0.1478158, |
| "memory(GiB)": 58.24, |
| "step": 18840, |
| "train_speed(iter/s)": 0.767631 |
| }, |
| { |
| "epoch": 1.7560521415270018, |
| "grad_norm": 1.1484375, |
| "learning_rate": 8.017460709794216e-05, |
| "loss": 0.1471365, |
| "memory(GiB)": 58.24, |
| "step": 18860, |
| "train_speed(iter/s)": 0.767666 |
| }, |
| { |
| "epoch": 1.7579143389199254, |
| "grad_norm": 1.140625, |
| "learning_rate": 7.997345104582405e-05, |
| "loss": 0.13758318, |
| "memory(GiB)": 58.24, |
| "step": 18880, |
| "train_speed(iter/s)": 0.767694 |
| }, |
| { |
| "epoch": 1.7597765363128492, |
| "grad_norm": 1.265625, |
| "learning_rate": 7.97723793791263e-05, |
| "loss": 0.14244239, |
| "memory(GiB)": 58.24, |
| "step": 18900, |
| "train_speed(iter/s)": 0.767758 |
| }, |
| { |
| "epoch": 1.7616387337057728, |
| "grad_norm": 0.984375, |
| "learning_rate": 7.957139294510003e-05, |
| "loss": 0.15055372, |
| "memory(GiB)": 58.24, |
| "step": 18920, |
| "train_speed(iter/s)": 0.767801 |
| }, |
| { |
| "epoch": 1.7635009310986964, |
| "grad_norm": 1.109375, |
| "learning_rate": 7.937049259063734e-05, |
| "loss": 0.15460105, |
| "memory(GiB)": 58.24, |
| "step": 18940, |
| "train_speed(iter/s)": 0.767834 |
| }, |
| { |
| "epoch": 1.7653631284916202, |
| "grad_norm": 1.1484375, |
| "learning_rate": 7.916967916226747e-05, |
| "loss": 0.13544981, |
| "memory(GiB)": 58.24, |
| "step": 18960, |
| "train_speed(iter/s)": 0.767892 |
| }, |
| { |
| "epoch": 1.7672253258845436, |
| "grad_norm": 1.40625, |
| "learning_rate": 7.896895350615357e-05, |
| "loss": 0.15087662, |
| "memory(GiB)": 58.24, |
| "step": 18980, |
| "train_speed(iter/s)": 0.76795 |
| }, |
| { |
| "epoch": 1.7690875232774674, |
| "grad_norm": 0.8671875, |
| "learning_rate": 7.876831646808874e-05, |
| "loss": 0.13152529, |
| "memory(GiB)": 58.24, |
| "step": 19000, |
| "train_speed(iter/s)": 0.767981 |
| }, |
| { |
| "epoch": 1.770949720670391, |
| "grad_norm": 1.3515625, |
| "learning_rate": 7.856776889349279e-05, |
| "loss": 0.1433506, |
| "memory(GiB)": 58.24, |
| "step": 19020, |
| "train_speed(iter/s)": 0.768017 |
| }, |
| { |
| "epoch": 1.7728119180633146, |
| "grad_norm": 1.3203125, |
| "learning_rate": 7.836731162740857e-05, |
| "loss": 0.16239972, |
| "memory(GiB)": 58.24, |
| "step": 19040, |
| "train_speed(iter/s)": 0.768055 |
| }, |
| { |
| "epoch": 1.7746741154562384, |
| "grad_norm": 1.0546875, |
| "learning_rate": 7.816694551449838e-05, |
| "loss": 0.15868897, |
| "memory(GiB)": 58.24, |
| "step": 19060, |
| "train_speed(iter/s)": 0.768099 |
| }, |
| { |
| "epoch": 1.776536312849162, |
| "grad_norm": 1.15625, |
| "learning_rate": 7.796667139904036e-05, |
| "loss": 0.13739709, |
| "memory(GiB)": 58.24, |
| "step": 19080, |
| "train_speed(iter/s)": 0.768138 |
| }, |
| { |
| "epoch": 1.7783985102420856, |
| "grad_norm": 1.2734375, |
| "learning_rate": 7.77664901249251e-05, |
| "loss": 0.14739815, |
| "memory(GiB)": 58.24, |
| "step": 19100, |
| "train_speed(iter/s)": 0.768153 |
| }, |
| { |
| "epoch": 1.7802607076350094, |
| "grad_norm": 1.0859375, |
| "learning_rate": 7.75664025356519e-05, |
| "loss": 0.15729291, |
| "memory(GiB)": 58.24, |
| "step": 19120, |
| "train_speed(iter/s)": 0.768177 |
| }, |
| { |
| "epoch": 1.7821229050279328, |
| "grad_norm": 1.3359375, |
| "learning_rate": 7.736640947432544e-05, |
| "loss": 0.14836711, |
| "memory(GiB)": 58.24, |
| "step": 19140, |
| "train_speed(iter/s)": 0.768201 |
| }, |
| { |
| "epoch": 1.7839851024208566, |
| "grad_norm": 1.0859375, |
| "learning_rate": 7.716651178365193e-05, |
| "loss": 0.1370422, |
| "memory(GiB)": 58.24, |
| "step": 19160, |
| "train_speed(iter/s)": 0.768264 |
| }, |
| { |
| "epoch": 1.7858472998137802, |
| "grad_norm": 1.1640625, |
| "learning_rate": 7.696671030593575e-05, |
| "loss": 0.13977002, |
| "memory(GiB)": 58.24, |
| "step": 19180, |
| "train_speed(iter/s)": 0.768294 |
| }, |
| { |
| "epoch": 1.7877094972067038, |
| "grad_norm": 0.953125, |
| "learning_rate": 7.676700588307601e-05, |
| "loss": 0.13385307, |
| "memory(GiB)": 58.24, |
| "step": 19200, |
| "train_speed(iter/s)": 0.768336 |
| }, |
| { |
| "epoch": 1.7895716945996276, |
| "grad_norm": 1.421875, |
| "learning_rate": 7.656739935656271e-05, |
| "loss": 0.15971838, |
| "memory(GiB)": 58.24, |
| "step": 19220, |
| "train_speed(iter/s)": 0.768358 |
| }, |
| { |
| "epoch": 1.7914338919925512, |
| "grad_norm": 0.9609375, |
| "learning_rate": 7.636789156747338e-05, |
| "loss": 0.14956834, |
| "memory(GiB)": 58.24, |
| "step": 19240, |
| "train_speed(iter/s)": 0.76839 |
| }, |
| { |
| "epoch": 1.7932960893854748, |
| "grad_norm": 1.34375, |
| "learning_rate": 7.616848335646956e-05, |
| "loss": 0.14564731, |
| "memory(GiB)": 58.24, |
| "step": 19260, |
| "train_speed(iter/s)": 0.76842 |
| }, |
| { |
| "epoch": 1.7951582867783986, |
| "grad_norm": 1.3125, |
| "learning_rate": 7.596917556379305e-05, |
| "loss": 0.14346001, |
| "memory(GiB)": 58.24, |
| "step": 19280, |
| "train_speed(iter/s)": 0.768463 |
| }, |
| { |
| "epoch": 1.7970204841713222, |
| "grad_norm": 1.015625, |
| "learning_rate": 7.576996902926278e-05, |
| "loss": 0.14654245, |
| "memory(GiB)": 58.24, |
| "step": 19300, |
| "train_speed(iter/s)": 0.768515 |
| }, |
| { |
| "epoch": 1.7988826815642458, |
| "grad_norm": 1.1953125, |
| "learning_rate": 7.557086459227078e-05, |
| "loss": 0.15021694, |
| "memory(GiB)": 58.24, |
| "step": 19320, |
| "train_speed(iter/s)": 0.768549 |
| }, |
| { |
| "epoch": 1.8007448789571696, |
| "grad_norm": 1.1171875, |
| "learning_rate": 7.537186309177892e-05, |
| "loss": 0.15023569, |
| "memory(GiB)": 58.24, |
| "step": 19340, |
| "train_speed(iter/s)": 0.768581 |
| }, |
| { |
| "epoch": 1.802607076350093, |
| "grad_norm": 1.2890625, |
| "learning_rate": 7.517296536631544e-05, |
| "loss": 0.15608531, |
| "memory(GiB)": 58.24, |
| "step": 19360, |
| "train_speed(iter/s)": 0.768616 |
| }, |
| { |
| "epoch": 1.8044692737430168, |
| "grad_norm": 1.0546875, |
| "learning_rate": 7.497417225397118e-05, |
| "loss": 0.15757583, |
| "memory(GiB)": 58.24, |
| "step": 19380, |
| "train_speed(iter/s)": 0.768653 |
| }, |
| { |
| "epoch": 1.8063314711359404, |
| "grad_norm": 1.046875, |
| "learning_rate": 7.477548459239623e-05, |
| "loss": 0.14360132, |
| "memory(GiB)": 58.24, |
| "step": 19400, |
| "train_speed(iter/s)": 0.768689 |
| }, |
| { |
| "epoch": 1.808193668528864, |
| "grad_norm": 1.3671875, |
| "learning_rate": 7.457690321879632e-05, |
| "loss": 0.15476027, |
| "memory(GiB)": 58.24, |
| "step": 19420, |
| "train_speed(iter/s)": 0.768724 |
| }, |
| { |
| "epoch": 1.8100558659217878, |
| "grad_norm": 1.21875, |
| "learning_rate": 7.437842896992932e-05, |
| "loss": 0.13898827, |
| "memory(GiB)": 58.24, |
| "step": 19440, |
| "train_speed(iter/s)": 0.768768 |
| }, |
| { |
| "epoch": 1.8119180633147114, |
| "grad_norm": 1.109375, |
| "learning_rate": 7.418006268210178e-05, |
| "loss": 0.14609106, |
| "memory(GiB)": 58.24, |
| "step": 19460, |
| "train_speed(iter/s)": 0.768823 |
| }, |
| { |
| "epoch": 1.813780260707635, |
| "grad_norm": 1.359375, |
| "learning_rate": 7.398180519116526e-05, |
| "loss": 0.14926387, |
| "memory(GiB)": 58.24, |
| "step": 19480, |
| "train_speed(iter/s)": 0.768889 |
| }, |
| { |
| "epoch": 1.8156424581005588, |
| "grad_norm": 1.2578125, |
| "learning_rate": 7.378365733251284e-05, |
| "loss": 0.14744906, |
| "memory(GiB)": 58.24, |
| "step": 19500, |
| "train_speed(iter/s)": 0.768922 |
| }, |
| { |
| "epoch": 1.8175046554934822, |
| "grad_norm": 1.2890625, |
| "learning_rate": 7.35856199410758e-05, |
| "loss": 0.15171263, |
| "memory(GiB)": 58.24, |
| "step": 19520, |
| "train_speed(iter/s)": 0.768968 |
| }, |
| { |
| "epoch": 1.819366852886406, |
| "grad_norm": 1.2109375, |
| "learning_rate": 7.33876938513198e-05, |
| "loss": 0.14976683, |
| "memory(GiB)": 58.24, |
| "step": 19540, |
| "train_speed(iter/s)": 0.769005 |
| }, |
| { |
| "epoch": 1.8212290502793296, |
| "grad_norm": 1.4609375, |
| "learning_rate": 7.31898798972416e-05, |
| "loss": 0.13972685, |
| "memory(GiB)": 58.24, |
| "step": 19560, |
| "train_speed(iter/s)": 0.769011 |
| }, |
| { |
| "epoch": 1.8230912476722532, |
| "grad_norm": 1.2421875, |
| "learning_rate": 7.29921789123654e-05, |
| "loss": 0.14240303, |
| "memory(GiB)": 58.24, |
| "step": 19580, |
| "train_speed(iter/s)": 0.769071 |
| }, |
| { |
| "epoch": 1.824953445065177, |
| "grad_norm": 1.1328125, |
| "learning_rate": 7.27945917297394e-05, |
| "loss": 0.15212356, |
| "memory(GiB)": 58.24, |
| "step": 19600, |
| "train_speed(iter/s)": 0.769081 |
| }, |
| { |
| "epoch": 1.8268156424581006, |
| "grad_norm": 1.4609375, |
| "learning_rate": 7.25971191819323e-05, |
| "loss": 0.13351725, |
| "memory(GiB)": 58.24, |
| "step": 19620, |
| "train_speed(iter/s)": 0.769122 |
| }, |
| { |
| "epoch": 1.8286778398510242, |
| "grad_norm": 1.203125, |
| "learning_rate": 7.239976210102977e-05, |
| "loss": 0.14566996, |
| "memory(GiB)": 58.24, |
| "step": 19640, |
| "train_speed(iter/s)": 0.769148 |
| }, |
| { |
| "epoch": 1.830540037243948, |
| "grad_norm": 0.93359375, |
| "learning_rate": 7.220252131863084e-05, |
| "loss": 0.1393986, |
| "memory(GiB)": 58.24, |
| "step": 19660, |
| "train_speed(iter/s)": 0.769214 |
| }, |
| { |
| "epoch": 1.8324022346368714, |
| "grad_norm": 0.96484375, |
| "learning_rate": 7.200539766584467e-05, |
| "loss": 0.14144776, |
| "memory(GiB)": 58.24, |
| "step": 19680, |
| "train_speed(iter/s)": 0.76923 |
| }, |
| { |
| "epoch": 1.8342644320297952, |
| "grad_norm": 0.87109375, |
| "learning_rate": 7.180839197328674e-05, |
| "loss": 0.13021162, |
| "memory(GiB)": 58.24, |
| "step": 19700, |
| "train_speed(iter/s)": 0.769244 |
| }, |
| { |
| "epoch": 1.8361266294227188, |
| "grad_norm": 1.046875, |
| "learning_rate": 7.161150507107547e-05, |
| "loss": 0.14698963, |
| "memory(GiB)": 58.24, |
| "step": 19720, |
| "train_speed(iter/s)": 0.769265 |
| }, |
| { |
| "epoch": 1.8379888268156424, |
| "grad_norm": 1.140625, |
| "learning_rate": 7.141473778882883e-05, |
| "loss": 0.1384059, |
| "memory(GiB)": 58.24, |
| "step": 19740, |
| "train_speed(iter/s)": 0.769284 |
| }, |
| { |
| "epoch": 1.8398510242085662, |
| "grad_norm": 1.3203125, |
| "learning_rate": 7.121809095566072e-05, |
| "loss": 0.13622018, |
| "memory(GiB)": 58.24, |
| "step": 19760, |
| "train_speed(iter/s)": 0.769313 |
| }, |
| { |
| "epoch": 1.8417132216014898, |
| "grad_norm": 0.8828125, |
| "learning_rate": 7.102156540017748e-05, |
| "loss": 0.13797989, |
| "memory(GiB)": 58.24, |
| "step": 19780, |
| "train_speed(iter/s)": 0.769333 |
| }, |
| { |
| "epoch": 1.8435754189944134, |
| "grad_norm": 1.1015625, |
| "learning_rate": 7.082516195047443e-05, |
| "loss": 0.15423731, |
| "memory(GiB)": 58.24, |
| "step": 19800, |
| "train_speed(iter/s)": 0.769378 |
| }, |
| { |
| "epoch": 1.8454376163873372, |
| "grad_norm": 1.390625, |
| "learning_rate": 7.062888143413235e-05, |
| "loss": 0.14557085, |
| "memory(GiB)": 58.24, |
| "step": 19820, |
| "train_speed(iter/s)": 0.769413 |
| }, |
| { |
| "epoch": 1.8472998137802608, |
| "grad_norm": 0.875, |
| "learning_rate": 7.043272467821412e-05, |
| "loss": 0.11753706, |
| "memory(GiB)": 58.24, |
| "step": 19840, |
| "train_speed(iter/s)": 0.769451 |
| }, |
| { |
| "epoch": 1.8491620111731844, |
| "grad_norm": 1.28125, |
| "learning_rate": 7.023669250926103e-05, |
| "loss": 0.13218291, |
| "memory(GiB)": 58.24, |
| "step": 19860, |
| "train_speed(iter/s)": 0.769492 |
| }, |
| { |
| "epoch": 1.8510242085661082, |
| "grad_norm": 1.1484375, |
| "learning_rate": 7.00407857532894e-05, |
| "loss": 0.13124348, |
| "memory(GiB)": 58.24, |
| "step": 19880, |
| "train_speed(iter/s)": 0.769509 |
| }, |
| { |
| "epoch": 1.8528864059590315, |
| "grad_norm": 1.1015625, |
| "learning_rate": 6.98450052357872e-05, |
| "loss": 0.1268985, |
| "memory(GiB)": 58.24, |
| "step": 19900, |
| "train_speed(iter/s)": 0.769551 |
| }, |
| { |
| "epoch": 1.8547486033519553, |
| "grad_norm": 0.9765625, |
| "learning_rate": 6.964935178171037e-05, |
| "loss": 0.13166354, |
| "memory(GiB)": 58.24, |
| "step": 19920, |
| "train_speed(iter/s)": 0.769587 |
| }, |
| { |
| "epoch": 1.856610800744879, |
| "grad_norm": 1.046875, |
| "learning_rate": 6.945382621547951e-05, |
| "loss": 0.13858917, |
| "memory(GiB)": 58.24, |
| "step": 19940, |
| "train_speed(iter/s)": 0.769637 |
| }, |
| { |
| "epoch": 1.8584729981378025, |
| "grad_norm": 1.2265625, |
| "learning_rate": 6.925842936097629e-05, |
| "loss": 0.14260222, |
| "memory(GiB)": 58.24, |
| "step": 19960, |
| "train_speed(iter/s)": 0.769684 |
| }, |
| { |
| "epoch": 1.8603351955307263, |
| "grad_norm": 1.0625, |
| "learning_rate": 6.906316204154002e-05, |
| "loss": 0.14681377, |
| "memory(GiB)": 58.24, |
| "step": 19980, |
| "train_speed(iter/s)": 0.769741 |
| }, |
| { |
| "epoch": 1.86219739292365, |
| "grad_norm": 1.3671875, |
| "learning_rate": 6.886802507996432e-05, |
| "loss": 0.1321399, |
| "memory(GiB)": 58.24, |
| "step": 20000, |
| "train_speed(iter/s)": 0.769748 |
| }, |
| { |
| "epoch": 1.86219739292365, |
| "eval_loss": 0.3764718770980835, |
| "eval_runtime": 77.0126, |
| "eval_samples_per_second": 180.295, |
| "eval_steps_per_second": 1.415, |
| "step": 20000 |
| }, |
| { |
| "epoch": 1.8640595903165735, |
| "grad_norm": 1.109375, |
| "learning_rate": 6.867301929849337e-05, |
| "loss": 0.13974258, |
| "memory(GiB)": 58.24, |
| "step": 20020, |
| "train_speed(iter/s)": 0.766626 |
| }, |
| { |
| "epoch": 1.8659217877094973, |
| "grad_norm": 1.1015625, |
| "learning_rate": 6.847814551881861e-05, |
| "loss": 0.13684764, |
| "memory(GiB)": 58.24, |
| "step": 20040, |
| "train_speed(iter/s)": 0.766677 |
| }, |
| { |
| "epoch": 1.8677839851024207, |
| "grad_norm": 1.390625, |
| "learning_rate": 6.828340456207542e-05, |
| "loss": 0.13937807, |
| "memory(GiB)": 58.24, |
| "step": 20060, |
| "train_speed(iter/s)": 0.766727 |
| }, |
| { |
| "epoch": 1.8696461824953445, |
| "grad_norm": 1.390625, |
| "learning_rate": 6.808879724883935e-05, |
| "loss": 0.14577636, |
| "memory(GiB)": 58.24, |
| "step": 20080, |
| "train_speed(iter/s)": 0.766742 |
| }, |
| { |
| "epoch": 1.8715083798882681, |
| "grad_norm": 1.109375, |
| "learning_rate": 6.789432439912289e-05, |
| "loss": 0.12374585, |
| "memory(GiB)": 58.24, |
| "step": 20100, |
| "train_speed(iter/s)": 0.766781 |
| }, |
| { |
| "epoch": 1.8733705772811917, |
| "grad_norm": 1.046875, |
| "learning_rate": 6.769998683237193e-05, |
| "loss": 0.12786458, |
| "memory(GiB)": 58.24, |
| "step": 20120, |
| "train_speed(iter/s)": 0.766806 |
| }, |
| { |
| "epoch": 1.8752327746741155, |
| "grad_norm": 1.1875, |
| "learning_rate": 6.750578536746224e-05, |
| "loss": 0.12823834, |
| "memory(GiB)": 58.24, |
| "step": 20140, |
| "train_speed(iter/s)": 0.766836 |
| }, |
| { |
| "epoch": 1.8770949720670391, |
| "grad_norm": 1.6171875, |
| "learning_rate": 6.73117208226963e-05, |
| "loss": 0.14647946, |
| "memory(GiB)": 58.24, |
| "step": 20160, |
| "train_speed(iter/s)": 0.766845 |
| }, |
| { |
| "epoch": 1.8789571694599627, |
| "grad_norm": 0.9921875, |
| "learning_rate": 6.711779401579947e-05, |
| "loss": 0.12733679, |
| "memory(GiB)": 58.24, |
| "step": 20180, |
| "train_speed(iter/s)": 0.766908 |
| }, |
| { |
| "epoch": 1.8808193668528865, |
| "grad_norm": 1.1484375, |
| "learning_rate": 6.692400576391675e-05, |
| "loss": 0.1478996, |
| "memory(GiB)": 58.24, |
| "step": 20200, |
| "train_speed(iter/s)": 0.76694 |
| }, |
| { |
| "epoch": 1.88268156424581, |
| "grad_norm": 1.3203125, |
| "learning_rate": 6.673035688360944e-05, |
| "loss": 0.13632859, |
| "memory(GiB)": 58.24, |
| "step": 20220, |
| "train_speed(iter/s)": 0.767002 |
| }, |
| { |
| "epoch": 1.8845437616387337, |
| "grad_norm": 1.4921875, |
| "learning_rate": 6.653684819085142e-05, |
| "loss": 0.13835206, |
| "memory(GiB)": 58.24, |
| "step": 20240, |
| "train_speed(iter/s)": 0.767049 |
| }, |
| { |
| "epoch": 1.8864059590316573, |
| "grad_norm": 2.453125, |
| "learning_rate": 6.634348050102597e-05, |
| "loss": 0.15622594, |
| "memory(GiB)": 58.24, |
| "step": 20260, |
| "train_speed(iter/s)": 0.76704 |
| }, |
| { |
| "epoch": 1.888268156424581, |
| "grad_norm": 1.0, |
| "learning_rate": 6.615025462892218e-05, |
| "loss": 0.12626829, |
| "memory(GiB)": 58.24, |
| "step": 20280, |
| "train_speed(iter/s)": 0.767074 |
| }, |
| { |
| "epoch": 1.8901303538175047, |
| "grad_norm": 1.1796875, |
| "learning_rate": 6.59571713887315e-05, |
| "loss": 0.13793883, |
| "memory(GiB)": 58.24, |
| "step": 20300, |
| "train_speed(iter/s)": 0.767106 |
| }, |
| { |
| "epoch": 1.8919925512104283, |
| "grad_norm": 1.390625, |
| "learning_rate": 6.576423159404456e-05, |
| "loss": 0.12859577, |
| "memory(GiB)": 58.24, |
| "step": 20320, |
| "train_speed(iter/s)": 0.767136 |
| }, |
| { |
| "epoch": 1.893854748603352, |
| "grad_norm": 1.4296875, |
| "learning_rate": 6.557143605784742e-05, |
| "loss": 0.13382497, |
| "memory(GiB)": 58.24, |
| "step": 20340, |
| "train_speed(iter/s)": 0.767179 |
| }, |
| { |
| "epoch": 1.8957169459962757, |
| "grad_norm": 0.93359375, |
| "learning_rate": 6.537878559251824e-05, |
| "loss": 0.12739023, |
| "memory(GiB)": 58.24, |
| "step": 20360, |
| "train_speed(iter/s)": 0.767227 |
| }, |
| { |
| "epoch": 1.8975791433891993, |
| "grad_norm": 1.09375, |
| "learning_rate": 6.518628100982406e-05, |
| "loss": 0.1451817, |
| "memory(GiB)": 58.24, |
| "step": 20380, |
| "train_speed(iter/s)": 0.767254 |
| }, |
| { |
| "epoch": 1.899441340782123, |
| "grad_norm": 1.140625, |
| "learning_rate": 6.499392312091707e-05, |
| "loss": 0.14085768, |
| "memory(GiB)": 58.24, |
| "step": 20400, |
| "train_speed(iter/s)": 0.767302 |
| }, |
| { |
| "epoch": 1.9013035381750467, |
| "grad_norm": 1.1171875, |
| "learning_rate": 6.480171273633143e-05, |
| "loss": 0.14016054, |
| "memory(GiB)": 58.24, |
| "step": 20420, |
| "train_speed(iter/s)": 0.767337 |
| }, |
| { |
| "epoch": 1.90316573556797, |
| "grad_norm": 1.2265625, |
| "learning_rate": 6.46096506659797e-05, |
| "loss": 0.14577733, |
| "memory(GiB)": 58.24, |
| "step": 20440, |
| "train_speed(iter/s)": 0.767358 |
| }, |
| { |
| "epoch": 1.905027932960894, |
| "grad_norm": 1.296875, |
| "learning_rate": 6.441773771914947e-05, |
| "loss": 0.14134233, |
| "memory(GiB)": 58.24, |
| "step": 20460, |
| "train_speed(iter/s)": 0.767418 |
| }, |
| { |
| "epoch": 1.9068901303538175, |
| "grad_norm": 1.25, |
| "learning_rate": 6.422597470450014e-05, |
| "loss": 0.14478569, |
| "memory(GiB)": 58.24, |
| "step": 20480, |
| "train_speed(iter/s)": 0.767431 |
| }, |
| { |
| "epoch": 1.908752327746741, |
| "grad_norm": 1.1640625, |
| "learning_rate": 6.403436243005915e-05, |
| "loss": 0.12952732, |
| "memory(GiB)": 58.24, |
| "step": 20500, |
| "train_speed(iter/s)": 0.767457 |
| }, |
| { |
| "epoch": 1.910614525139665, |
| "grad_norm": 1.25, |
| "learning_rate": 6.384290170321881e-05, |
| "loss": 0.13815956, |
| "memory(GiB)": 58.24, |
| "step": 20520, |
| "train_speed(iter/s)": 0.767502 |
| }, |
| { |
| "epoch": 1.9124767225325885, |
| "grad_norm": 1.03125, |
| "learning_rate": 6.365159333073298e-05, |
| "loss": 0.14318891, |
| "memory(GiB)": 58.24, |
| "step": 20540, |
| "train_speed(iter/s)": 0.76757 |
| }, |
| { |
| "epoch": 1.914338919925512, |
| "grad_norm": 1.109375, |
| "learning_rate": 6.34604381187134e-05, |
| "loss": 0.13616923, |
| "memory(GiB)": 58.24, |
| "step": 20560, |
| "train_speed(iter/s)": 0.767628 |
| }, |
| { |
| "epoch": 1.916201117318436, |
| "grad_norm": 1.4140625, |
| "learning_rate": 6.326943687262656e-05, |
| "loss": 0.12770704, |
| "memory(GiB)": 58.24, |
| "step": 20580, |
| "train_speed(iter/s)": 0.767647 |
| }, |
| { |
| "epoch": 1.9180633147113593, |
| "grad_norm": 1.3359375, |
| "learning_rate": 6.307859039729007e-05, |
| "loss": 0.12648449, |
| "memory(GiB)": 58.24, |
| "step": 20600, |
| "train_speed(iter/s)": 0.767683 |
| }, |
| { |
| "epoch": 1.919925512104283, |
| "grad_norm": 1.515625, |
| "learning_rate": 6.288789949686941e-05, |
| "loss": 0.13040155, |
| "memory(GiB)": 58.24, |
| "step": 20620, |
| "train_speed(iter/s)": 0.767726 |
| }, |
| { |
| "epoch": 1.9217877094972067, |
| "grad_norm": 1.1484375, |
| "learning_rate": 6.269736497487469e-05, |
| "loss": 0.13270816, |
| "memory(GiB)": 58.24, |
| "step": 20640, |
| "train_speed(iter/s)": 0.767769 |
| }, |
| { |
| "epoch": 1.9236499068901303, |
| "grad_norm": 1.109375, |
| "learning_rate": 6.250698763415683e-05, |
| "loss": 0.14179718, |
| "memory(GiB)": 58.24, |
| "step": 20660, |
| "train_speed(iter/s)": 0.767801 |
| }, |
| { |
| "epoch": 1.925512104283054, |
| "grad_norm": 0.9140625, |
| "learning_rate": 6.231676827690457e-05, |
| "loss": 0.12922909, |
| "memory(GiB)": 58.24, |
| "step": 20680, |
| "train_speed(iter/s)": 0.767832 |
| }, |
| { |
| "epoch": 1.9273743016759777, |
| "grad_norm": 0.87109375, |
| "learning_rate": 6.212670770464102e-05, |
| "loss": 0.11642365, |
| "memory(GiB)": 58.24, |
| "step": 20700, |
| "train_speed(iter/s)": 0.76785 |
| }, |
| { |
| "epoch": 1.9292364990689013, |
| "grad_norm": 1.2578125, |
| "learning_rate": 6.19368067182201e-05, |
| "loss": 0.14179863, |
| "memory(GiB)": 58.24, |
| "step": 20720, |
| "train_speed(iter/s)": 0.767879 |
| }, |
| { |
| "epoch": 1.931098696461825, |
| "grad_norm": 1.0078125, |
| "learning_rate": 6.174706611782336e-05, |
| "loss": 0.14035552, |
| "memory(GiB)": 58.24, |
| "step": 20740, |
| "train_speed(iter/s)": 0.76793 |
| }, |
| { |
| "epoch": 1.9329608938547485, |
| "grad_norm": 1.265625, |
| "learning_rate": 6.155748670295654e-05, |
| "loss": 0.13533392, |
| "memory(GiB)": 58.24, |
| "step": 20760, |
| "train_speed(iter/s)": 0.767974 |
| }, |
| { |
| "epoch": 1.9348230912476723, |
| "grad_norm": 1.3671875, |
| "learning_rate": 6.136806927244607e-05, |
| "loss": 0.14196831, |
| "memory(GiB)": 58.24, |
| "step": 20780, |
| "train_speed(iter/s)": 0.767996 |
| }, |
| { |
| "epoch": 1.9366852886405959, |
| "grad_norm": 1.0234375, |
| "learning_rate": 6.117881462443604e-05, |
| "loss": 0.11872478, |
| "memory(GiB)": 58.24, |
| "step": 20800, |
| "train_speed(iter/s)": 0.76802 |
| }, |
| { |
| "epoch": 1.9385474860335195, |
| "grad_norm": 1.2890625, |
| "learning_rate": 6.098972355638448e-05, |
| "loss": 0.12733064, |
| "memory(GiB)": 58.24, |
| "step": 20820, |
| "train_speed(iter/s)": 0.768081 |
| }, |
| { |
| "epoch": 1.9404096834264433, |
| "grad_norm": 1.2890625, |
| "learning_rate": 6.080079686506014e-05, |
| "loss": 0.12513154, |
| "memory(GiB)": 58.24, |
| "step": 20840, |
| "train_speed(iter/s)": 0.768129 |
| }, |
| { |
| "epoch": 1.9422718808193669, |
| "grad_norm": 1.2421875, |
| "learning_rate": 6.061203534653926e-05, |
| "loss": 0.14031601, |
| "memory(GiB)": 58.24, |
| "step": 20860, |
| "train_speed(iter/s)": 0.768162 |
| }, |
| { |
| "epoch": 1.9441340782122905, |
| "grad_norm": 1.1953125, |
| "learning_rate": 6.042343979620197e-05, |
| "loss": 0.14372622, |
| "memory(GiB)": 58.24, |
| "step": 20880, |
| "train_speed(iter/s)": 0.768224 |
| }, |
| { |
| "epoch": 1.9459962756052143, |
| "grad_norm": 1.1640625, |
| "learning_rate": 6.0235011008729134e-05, |
| "loss": 0.13222244, |
| "memory(GiB)": 58.24, |
| "step": 20900, |
| "train_speed(iter/s)": 0.768263 |
| }, |
| { |
| "epoch": 1.9478584729981379, |
| "grad_norm": 0.9765625, |
| "learning_rate": 6.0046749778098875e-05, |
| "loss": 0.11751642, |
| "memory(GiB)": 58.24, |
| "step": 20920, |
| "train_speed(iter/s)": 0.768299 |
| }, |
| { |
| "epoch": 1.9497206703910615, |
| "grad_norm": 1.21875, |
| "learning_rate": 5.985865689758337e-05, |
| "loss": 0.13005948, |
| "memory(GiB)": 58.24, |
| "step": 20940, |
| "train_speed(iter/s)": 0.768318 |
| }, |
| { |
| "epoch": 1.9515828677839853, |
| "grad_norm": 1.4609375, |
| "learning_rate": 5.967073315974534e-05, |
| "loss": 0.13441616, |
| "memory(GiB)": 58.24, |
| "step": 20960, |
| "train_speed(iter/s)": 0.768371 |
| }, |
| { |
| "epoch": 1.9534450651769086, |
| "grad_norm": 1.0703125, |
| "learning_rate": 5.9482979356434834e-05, |
| "loss": 0.12724516, |
| "memory(GiB)": 58.24, |
| "step": 20980, |
| "train_speed(iter/s)": 0.768402 |
| }, |
| { |
| "epoch": 1.9553072625698324, |
| "grad_norm": 1.0859375, |
| "learning_rate": 5.929539627878582e-05, |
| "loss": 0.13387883, |
| "memory(GiB)": 58.24, |
| "step": 21000, |
| "train_speed(iter/s)": 0.768462 |
| }, |
| { |
| "epoch": 1.957169459962756, |
| "grad_norm": 0.8984375, |
| "learning_rate": 5.910798471721295e-05, |
| "loss": 0.13106408, |
| "memory(GiB)": 58.24, |
| "step": 21020, |
| "train_speed(iter/s)": 0.768497 |
| }, |
| { |
| "epoch": 1.9590316573556796, |
| "grad_norm": 0.84375, |
| "learning_rate": 5.892074546140809e-05, |
| "loss": 0.11956599, |
| "memory(GiB)": 58.24, |
| "step": 21040, |
| "train_speed(iter/s)": 0.76853 |
| }, |
| { |
| "epoch": 1.9608938547486034, |
| "grad_norm": 1.015625, |
| "learning_rate": 5.87336793003371e-05, |
| "loss": 0.12412496, |
| "memory(GiB)": 58.24, |
| "step": 21060, |
| "train_speed(iter/s)": 0.768568 |
| }, |
| { |
| "epoch": 1.962756052141527, |
| "grad_norm": 0.81640625, |
| "learning_rate": 5.854678702223648e-05, |
| "loss": 0.1311347, |
| "memory(GiB)": 58.24, |
| "step": 21080, |
| "train_speed(iter/s)": 0.768605 |
| }, |
| { |
| "epoch": 1.9646182495344506, |
| "grad_norm": 1.09375, |
| "learning_rate": 5.8360069414610016e-05, |
| "loss": 0.11528455, |
| "memory(GiB)": 58.24, |
| "step": 21100, |
| "train_speed(iter/s)": 0.768642 |
| }, |
| { |
| "epoch": 1.9664804469273744, |
| "grad_norm": 1.078125, |
| "learning_rate": 5.817352726422551e-05, |
| "loss": 0.14151897, |
| "memory(GiB)": 58.24, |
| "step": 21120, |
| "train_speed(iter/s)": 0.76868 |
| }, |
| { |
| "epoch": 1.9683426443202978, |
| "grad_norm": 1.1640625, |
| "learning_rate": 5.798716135711149e-05, |
| "loss": 0.13168578, |
| "memory(GiB)": 58.24, |
| "step": 21140, |
| "train_speed(iter/s)": 0.768704 |
| }, |
| { |
| "epoch": 1.9702048417132216, |
| "grad_norm": 0.82421875, |
| "learning_rate": 5.780097247855371e-05, |
| "loss": 0.11186194, |
| "memory(GiB)": 58.24, |
| "step": 21160, |
| "train_speed(iter/s)": 0.768737 |
| }, |
| { |
| "epoch": 1.9720670391061452, |
| "grad_norm": 1.3125, |
| "learning_rate": 5.76149614130921e-05, |
| "loss": 0.13154147, |
| "memory(GiB)": 58.24, |
| "step": 21180, |
| "train_speed(iter/s)": 0.768775 |
| }, |
| { |
| "epoch": 1.9739292364990688, |
| "grad_norm": 1.2578125, |
| "learning_rate": 5.742912894451735e-05, |
| "loss": 0.1238625, |
| "memory(GiB)": 58.24, |
| "step": 21200, |
| "train_speed(iter/s)": 0.768828 |
| }, |
| { |
| "epoch": 1.9757914338919926, |
| "grad_norm": 0.984375, |
| "learning_rate": 5.724347585586755e-05, |
| "loss": 0.11985067, |
| "memory(GiB)": 58.24, |
| "step": 21220, |
| "train_speed(iter/s)": 0.768858 |
| }, |
| { |
| "epoch": 1.9776536312849162, |
| "grad_norm": 1.28125, |
| "learning_rate": 5.705800292942498e-05, |
| "loss": 0.12159057, |
| "memory(GiB)": 58.24, |
| "step": 21240, |
| "train_speed(iter/s)": 0.768883 |
| }, |
| { |
| "epoch": 1.9795158286778398, |
| "grad_norm": 1.1953125, |
| "learning_rate": 5.687271094671266e-05, |
| "loss": 0.1149127, |
| "memory(GiB)": 58.24, |
| "step": 21260, |
| "train_speed(iter/s)": 0.768853 |
| }, |
| { |
| "epoch": 1.9813780260707636, |
| "grad_norm": 1.015625, |
| "learning_rate": 5.668760068849141e-05, |
| "loss": 0.13043272, |
| "memory(GiB)": 58.24, |
| "step": 21280, |
| "train_speed(iter/s)": 0.768896 |
| }, |
| { |
| "epoch": 1.983240223463687, |
| "grad_norm": 1.5, |
| "learning_rate": 5.650267293475605e-05, |
| "loss": 0.11858026, |
| "memory(GiB)": 58.24, |
| "step": 21300, |
| "train_speed(iter/s)": 0.768961 |
| }, |
| { |
| "epoch": 1.9851024208566108, |
| "grad_norm": 1.0546875, |
| "learning_rate": 5.631792846473256e-05, |
| "loss": 0.11108406, |
| "memory(GiB)": 58.24, |
| "step": 21320, |
| "train_speed(iter/s)": 0.769007 |
| }, |
| { |
| "epoch": 1.9869646182495344, |
| "grad_norm": 0.76953125, |
| "learning_rate": 5.6133368056874616e-05, |
| "loss": 0.1135129, |
| "memory(GiB)": 58.24, |
| "step": 21340, |
| "train_speed(iter/s)": 0.769044 |
| }, |
| { |
| "epoch": 1.988826815642458, |
| "grad_norm": 1.1015625, |
| "learning_rate": 5.594899248886012e-05, |
| "loss": 0.12456424, |
| "memory(GiB)": 58.24, |
| "step": 21360, |
| "train_speed(iter/s)": 0.769086 |
| }, |
| { |
| "epoch": 1.9906890130353818, |
| "grad_norm": 0.98828125, |
| "learning_rate": 5.5764802537588465e-05, |
| "loss": 0.12759299, |
| "memory(GiB)": 58.24, |
| "step": 21380, |
| "train_speed(iter/s)": 0.769135 |
| }, |
| { |
| "epoch": 1.9925512104283054, |
| "grad_norm": 1.3046875, |
| "learning_rate": 5.5580798979176595e-05, |
| "loss": 0.12331834, |
| "memory(GiB)": 58.24, |
| "step": 21400, |
| "train_speed(iter/s)": 0.769187 |
| }, |
| { |
| "epoch": 1.994413407821229, |
| "grad_norm": 1.0, |
| "learning_rate": 5.53969825889562e-05, |
| "loss": 0.11813304, |
| "memory(GiB)": 58.24, |
| "step": 21420, |
| "train_speed(iter/s)": 0.769246 |
| }, |
| { |
| "epoch": 1.9962756052141528, |
| "grad_norm": 1.2890625, |
| "learning_rate": 5.5213354141470294e-05, |
| "loss": 0.11984769, |
| "memory(GiB)": 58.24, |
| "step": 21440, |
| "train_speed(iter/s)": 0.769291 |
| }, |
| { |
| "epoch": 1.9981378026070762, |
| "grad_norm": 1.1328125, |
| "learning_rate": 5.502991441046998e-05, |
| "loss": 0.11140869, |
| "memory(GiB)": 58.24, |
| "step": 21460, |
| "train_speed(iter/s)": 0.769304 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 1.2734375, |
| "learning_rate": 5.484666416891109e-05, |
| "loss": 0.12352898, |
| "memory(GiB)": 58.24, |
| "step": 21480, |
| "train_speed(iter/s)": 0.769312 |
| }, |
| { |
| "epoch": 2.001862197392924, |
| "grad_norm": 1.0625, |
| "learning_rate": 5.466360418895107e-05, |
| "loss": 0.12280267, |
| "memory(GiB)": 58.24, |
| "step": 21500, |
| "train_speed(iter/s)": 0.769021 |
| }, |
| { |
| "epoch": 2.003724394785847, |
| "grad_norm": 0.8671875, |
| "learning_rate": 5.4480735241945635e-05, |
| "loss": 0.12903731, |
| "memory(GiB)": 58.24, |
| "step": 21520, |
| "train_speed(iter/s)": 0.769074 |
| }, |
| { |
| "epoch": 2.005586592178771, |
| "grad_norm": 1.140625, |
| "learning_rate": 5.429805809844562e-05, |
| "loss": 0.13171415, |
| "memory(GiB)": 58.24, |
| "step": 21540, |
| "train_speed(iter/s)": 0.769104 |
| }, |
| { |
| "epoch": 2.007448789571695, |
| "grad_norm": 0.85546875, |
| "learning_rate": 5.4115573528193605e-05, |
| "loss": 0.10338333, |
| "memory(GiB)": 58.24, |
| "step": 21560, |
| "train_speed(iter/s)": 0.769152 |
| }, |
| { |
| "epoch": 2.009310986964618, |
| "grad_norm": 1.1328125, |
| "learning_rate": 5.3933282300120624e-05, |
| "loss": 0.12971883, |
| "memory(GiB)": 58.24, |
| "step": 21580, |
| "train_speed(iter/s)": 0.769168 |
| }, |
| { |
| "epoch": 2.011173184357542, |
| "grad_norm": 1.0859375, |
| "learning_rate": 5.3751185182343324e-05, |
| "loss": 0.12211378, |
| "memory(GiB)": 58.24, |
| "step": 21600, |
| "train_speed(iter/s)": 0.769214 |
| }, |
| { |
| "epoch": 2.0130353817504654, |
| "grad_norm": 0.8359375, |
| "learning_rate": 5.356928294216008e-05, |
| "loss": 0.11860096, |
| "memory(GiB)": 58.24, |
| "step": 21620, |
| "train_speed(iter/s)": 0.769226 |
| }, |
| { |
| "epoch": 2.014897579143389, |
| "grad_norm": 1.140625, |
| "learning_rate": 5.338757634604836e-05, |
| "loss": 0.12931483, |
| "memory(GiB)": 58.24, |
| "step": 21640, |
| "train_speed(iter/s)": 0.769265 |
| }, |
| { |
| "epoch": 2.016759776536313, |
| "grad_norm": 1.390625, |
| "learning_rate": 5.320606615966117e-05, |
| "loss": 0.11598758, |
| "memory(GiB)": 58.24, |
| "step": 21660, |
| "train_speed(iter/s)": 0.769256 |
| }, |
| { |
| "epoch": 2.0186219739292364, |
| "grad_norm": 1.234375, |
| "learning_rate": 5.302475314782389e-05, |
| "loss": 0.12075157, |
| "memory(GiB)": 58.24, |
| "step": 21680, |
| "train_speed(iter/s)": 0.769291 |
| }, |
| { |
| "epoch": 2.02048417132216, |
| "grad_norm": 0.984375, |
| "learning_rate": 5.284363807453117e-05, |
| "loss": 0.12393267, |
| "memory(GiB)": 58.24, |
| "step": 21700, |
| "train_speed(iter/s)": 0.769313 |
| }, |
| { |
| "epoch": 2.022346368715084, |
| "grad_norm": 0.875, |
| "learning_rate": 5.266272170294342e-05, |
| "loss": 0.11194068, |
| "memory(GiB)": 58.24, |
| "step": 21720, |
| "train_speed(iter/s)": 0.76932 |
| }, |
| { |
| "epoch": 2.0242085661080074, |
| "grad_norm": 1.015625, |
| "learning_rate": 5.248200479538394e-05, |
| "loss": 0.11720626, |
| "memory(GiB)": 58.24, |
| "step": 21740, |
| "train_speed(iter/s)": 0.769353 |
| }, |
| { |
| "epoch": 2.026070763500931, |
| "grad_norm": 0.875, |
| "learning_rate": 5.230148811333555e-05, |
| "loss": 0.10890026, |
| "memory(GiB)": 58.24, |
| "step": 21760, |
| "train_speed(iter/s)": 0.769387 |
| }, |
| { |
| "epoch": 2.0279329608938546, |
| "grad_norm": 1.0625, |
| "learning_rate": 5.212117241743735e-05, |
| "loss": 0.11610082, |
| "memory(GiB)": 58.24, |
| "step": 21780, |
| "train_speed(iter/s)": 0.769434 |
| }, |
| { |
| "epoch": 2.0297951582867784, |
| "grad_norm": 0.96875, |
| "learning_rate": 5.194105846748143e-05, |
| "loss": 0.13138523, |
| "memory(GiB)": 58.24, |
| "step": 21800, |
| "train_speed(iter/s)": 0.769494 |
| }, |
| { |
| "epoch": 2.031657355679702, |
| "grad_norm": 1.0078125, |
| "learning_rate": 5.176114702241006e-05, |
| "loss": 0.12315972, |
| "memory(GiB)": 58.24, |
| "step": 21820, |
| "train_speed(iter/s)": 0.76951 |
| }, |
| { |
| "epoch": 2.0335195530726256, |
| "grad_norm": 0.9765625, |
| "learning_rate": 5.158143884031197e-05, |
| "loss": 0.11700615, |
| "memory(GiB)": 58.24, |
| "step": 21840, |
| "train_speed(iter/s)": 0.769544 |
| }, |
| { |
| "epoch": 2.0353817504655494, |
| "grad_norm": 1.3203125, |
| "learning_rate": 5.1401934678419506e-05, |
| "loss": 0.12096615, |
| "memory(GiB)": 58.24, |
| "step": 21860, |
| "train_speed(iter/s)": 0.769577 |
| }, |
| { |
| "epoch": 2.037243947858473, |
| "grad_norm": 1.1640625, |
| "learning_rate": 5.122263529310538e-05, |
| "loss": 0.122633, |
| "memory(GiB)": 58.24, |
| "step": 21880, |
| "train_speed(iter/s)": 0.769619 |
| }, |
| { |
| "epoch": 2.0391061452513966, |
| "grad_norm": 1.0078125, |
| "learning_rate": 5.1043541439879286e-05, |
| "loss": 0.11288965, |
| "memory(GiB)": 58.24, |
| "step": 21900, |
| "train_speed(iter/s)": 0.769682 |
| }, |
| { |
| "epoch": 2.0409683426443204, |
| "grad_norm": 1.1171875, |
| "learning_rate": 5.0864653873385124e-05, |
| "loss": 0.12267628, |
| "memory(GiB)": 58.24, |
| "step": 21920, |
| "train_speed(iter/s)": 0.769729 |
| }, |
| { |
| "epoch": 2.0428305400372437, |
| "grad_norm": 1.4453125, |
| "learning_rate": 5.068597334739731e-05, |
| "loss": 0.11504929, |
| "memory(GiB)": 58.24, |
| "step": 21940, |
| "train_speed(iter/s)": 0.769754 |
| }, |
| { |
| "epoch": 2.0446927374301676, |
| "grad_norm": 1.203125, |
| "learning_rate": 5.0507500614817995e-05, |
| "loss": 0.11222873, |
| "memory(GiB)": 58.24, |
| "step": 21960, |
| "train_speed(iter/s)": 0.769798 |
| }, |
| { |
| "epoch": 2.0465549348230914, |
| "grad_norm": 0.84765625, |
| "learning_rate": 5.032923642767374e-05, |
| "loss": 0.11789845, |
| "memory(GiB)": 58.24, |
| "step": 21980, |
| "train_speed(iter/s)": 0.769844 |
| }, |
| { |
| "epoch": 2.0484171322160147, |
| "grad_norm": 1.140625, |
| "learning_rate": 5.015118153711237e-05, |
| "loss": 0.13255039, |
| "memory(GiB)": 58.24, |
| "step": 22000, |
| "train_speed(iter/s)": 0.769864 |
| }, |
| { |
| "epoch": 2.0484171322160147, |
| "eval_loss": 0.377461701631546, |
| "eval_runtime": 77.0906, |
| "eval_samples_per_second": 180.113, |
| "eval_steps_per_second": 1.414, |
| "step": 22000 |
| }, |
| { |
| "epoch": 2.0502793296089385, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.9973336693399795e-05, |
| "loss": 0.10642745, |
| "memory(GiB)": 58.24, |
| "step": 22020, |
| "train_speed(iter/s)": 0.766966 |
| }, |
| { |
| "epoch": 2.0521415270018624, |
| "grad_norm": 1.1640625, |
| "learning_rate": 4.97957026459168e-05, |
| "loss": 0.11735952, |
| "memory(GiB)": 58.24, |
| "step": 22040, |
| "train_speed(iter/s)": 0.766974 |
| }, |
| { |
| "epoch": 2.0540037243947857, |
| "grad_norm": 1.0234375, |
| "learning_rate": 4.961828014315599e-05, |
| "loss": 0.11628766, |
| "memory(GiB)": 58.24, |
| "step": 22060, |
| "train_speed(iter/s)": 0.766999 |
| }, |
| { |
| "epoch": 2.0558659217877095, |
| "grad_norm": 0.8359375, |
| "learning_rate": 4.944106993271863e-05, |
| "loss": 0.13654802, |
| "memory(GiB)": 58.24, |
| "step": 22080, |
| "train_speed(iter/s)": 0.767032 |
| }, |
| { |
| "epoch": 2.0577281191806334, |
| "grad_norm": 1.3515625, |
| "learning_rate": 4.926407276131141e-05, |
| "loss": 0.13798956, |
| "memory(GiB)": 58.24, |
| "step": 22100, |
| "train_speed(iter/s)": 0.767046 |
| }, |
| { |
| "epoch": 2.0595903165735567, |
| "grad_norm": 1.03125, |
| "learning_rate": 4.908728937474325e-05, |
| "loss": 0.10691235, |
| "memory(GiB)": 58.24, |
| "step": 22120, |
| "train_speed(iter/s)": 0.767074 |
| }, |
| { |
| "epoch": 2.0614525139664805, |
| "grad_norm": 0.9609375, |
| "learning_rate": 4.891072051792249e-05, |
| "loss": 0.11395538, |
| "memory(GiB)": 58.24, |
| "step": 22140, |
| "train_speed(iter/s)": 0.76709 |
| }, |
| { |
| "epoch": 2.063314711359404, |
| "grad_norm": 0.91796875, |
| "learning_rate": 4.873436693485325e-05, |
| "loss": 0.11044333, |
| "memory(GiB)": 58.24, |
| "step": 22160, |
| "train_speed(iter/s)": 0.767114 |
| }, |
| { |
| "epoch": 2.0651769087523277, |
| "grad_norm": 1.125, |
| "learning_rate": 4.8558229368632714e-05, |
| "loss": 0.11576631, |
| "memory(GiB)": 58.24, |
| "step": 22180, |
| "train_speed(iter/s)": 0.76713 |
| }, |
| { |
| "epoch": 2.0670391061452515, |
| "grad_norm": 1.171875, |
| "learning_rate": 4.838230856144781e-05, |
| "loss": 0.13391452, |
| "memory(GiB)": 58.24, |
| "step": 22200, |
| "train_speed(iter/s)": 0.767181 |
| }, |
| { |
| "epoch": 2.068901303538175, |
| "grad_norm": 1.2421875, |
| "learning_rate": 4.820660525457201e-05, |
| "loss": 0.10914934, |
| "memory(GiB)": 58.24, |
| "step": 22220, |
| "train_speed(iter/s)": 0.76721 |
| }, |
| { |
| "epoch": 2.0707635009310987, |
| "grad_norm": 1.1796875, |
| "learning_rate": 4.8031120188362546e-05, |
| "loss": 0.12316871, |
| "memory(GiB)": 58.24, |
| "step": 22240, |
| "train_speed(iter/s)": 0.767259 |
| }, |
| { |
| "epoch": 2.0726256983240225, |
| "grad_norm": 1.296875, |
| "learning_rate": 4.785585410225678e-05, |
| "loss": 0.11936328, |
| "memory(GiB)": 58.24, |
| "step": 22260, |
| "train_speed(iter/s)": 0.767306 |
| }, |
| { |
| "epoch": 2.074487895716946, |
| "grad_norm": 1.46875, |
| "learning_rate": 4.768080773476955e-05, |
| "loss": 0.11963451, |
| "memory(GiB)": 58.24, |
| "step": 22280, |
| "train_speed(iter/s)": 0.767332 |
| }, |
| { |
| "epoch": 2.0763500931098697, |
| "grad_norm": 1.234375, |
| "learning_rate": 4.7505981823489785e-05, |
| "loss": 0.12376244, |
| "memory(GiB)": 58.24, |
| "step": 22300, |
| "train_speed(iter/s)": 0.767376 |
| }, |
| { |
| "epoch": 2.078212290502793, |
| "grad_norm": 1.2890625, |
| "learning_rate": 4.7331377105077537e-05, |
| "loss": 0.12010782, |
| "memory(GiB)": 58.24, |
| "step": 22320, |
| "train_speed(iter/s)": 0.76741 |
| }, |
| { |
| "epoch": 2.080074487895717, |
| "grad_norm": 1.234375, |
| "learning_rate": 4.71569943152608e-05, |
| "loss": 0.10957891, |
| "memory(GiB)": 58.24, |
| "step": 22340, |
| "train_speed(iter/s)": 0.767455 |
| }, |
| { |
| "epoch": 2.0819366852886407, |
| "grad_norm": 0.92578125, |
| "learning_rate": 4.698283418883237e-05, |
| "loss": 0.10824106, |
| "memory(GiB)": 58.24, |
| "step": 22360, |
| "train_speed(iter/s)": 0.76747 |
| }, |
| { |
| "epoch": 2.083798882681564, |
| "grad_norm": 1.1796875, |
| "learning_rate": 4.680889745964689e-05, |
| "loss": 0.11550634, |
| "memory(GiB)": 58.24, |
| "step": 22380, |
| "train_speed(iter/s)": 0.767518 |
| }, |
| { |
| "epoch": 2.085661080074488, |
| "grad_norm": 0.97265625, |
| "learning_rate": 4.6635184860617675e-05, |
| "loss": 0.12425991, |
| "memory(GiB)": 58.24, |
| "step": 22400, |
| "train_speed(iter/s)": 0.767554 |
| }, |
| { |
| "epoch": 2.0875232774674117, |
| "grad_norm": 1.234375, |
| "learning_rate": 4.646169712371364e-05, |
| "loss": 0.1120156, |
| "memory(GiB)": 58.24, |
| "step": 22420, |
| "train_speed(iter/s)": 0.767568 |
| }, |
| { |
| "epoch": 2.089385474860335, |
| "grad_norm": 1.0078125, |
| "learning_rate": 4.628843497995606e-05, |
| "loss": 0.10975211, |
| "memory(GiB)": 58.24, |
| "step": 22440, |
| "train_speed(iter/s)": 0.767614 |
| }, |
| { |
| "epoch": 2.091247672253259, |
| "grad_norm": 1.3515625, |
| "learning_rate": 4.611539915941588e-05, |
| "loss": 0.1215034, |
| "memory(GiB)": 58.24, |
| "step": 22460, |
| "train_speed(iter/s)": 0.767646 |
| }, |
| { |
| "epoch": 2.0931098696461823, |
| "grad_norm": 0.97265625, |
| "learning_rate": 4.594259039121017e-05, |
| "loss": 0.1136503, |
| "memory(GiB)": 58.24, |
| "step": 22480, |
| "train_speed(iter/s)": 0.767685 |
| }, |
| { |
| "epoch": 2.094972067039106, |
| "grad_norm": 0.93359375, |
| "learning_rate": 4.577000940349939e-05, |
| "loss": 0.11370624, |
| "memory(GiB)": 58.24, |
| "step": 22500, |
| "train_speed(iter/s)": 0.76774 |
| }, |
| { |
| "epoch": 2.09683426443203, |
| "grad_norm": 0.96875, |
| "learning_rate": 4.559765692348421e-05, |
| "loss": 0.11183515, |
| "memory(GiB)": 58.24, |
| "step": 22520, |
| "train_speed(iter/s)": 0.767789 |
| }, |
| { |
| "epoch": 2.0986964618249533, |
| "grad_norm": 0.85546875, |
| "learning_rate": 4.54255336774023e-05, |
| "loss": 0.11438358, |
| "memory(GiB)": 58.24, |
| "step": 22540, |
| "train_speed(iter/s)": 0.767817 |
| }, |
| { |
| "epoch": 2.100558659217877, |
| "grad_norm": 0.94140625, |
| "learning_rate": 4.525364039052568e-05, |
| "loss": 0.10709119, |
| "memory(GiB)": 58.24, |
| "step": 22560, |
| "train_speed(iter/s)": 0.767825 |
| }, |
| { |
| "epoch": 2.102420856610801, |
| "grad_norm": 1.46875, |
| "learning_rate": 4.508197778715711e-05, |
| "loss": 0.12675226, |
| "memory(GiB)": 58.24, |
| "step": 22580, |
| "train_speed(iter/s)": 0.767876 |
| }, |
| { |
| "epoch": 2.1042830540037243, |
| "grad_norm": 1.125, |
| "learning_rate": 4.4910546590627525e-05, |
| "loss": 0.11741807, |
| "memory(GiB)": 58.24, |
| "step": 22600, |
| "train_speed(iter/s)": 0.767894 |
| }, |
| { |
| "epoch": 2.106145251396648, |
| "grad_norm": 1.21875, |
| "learning_rate": 4.473934752329269e-05, |
| "loss": 0.12141221, |
| "memory(GiB)": 58.24, |
| "step": 22620, |
| "train_speed(iter/s)": 0.767948 |
| }, |
| { |
| "epoch": 2.1080074487895715, |
| "grad_norm": 1.0546875, |
| "learning_rate": 4.456838130653026e-05, |
| "loss": 0.11729333, |
| "memory(GiB)": 58.24, |
| "step": 22640, |
| "train_speed(iter/s)": 0.767948 |
| }, |
| { |
| "epoch": 2.1098696461824953, |
| "grad_norm": 0.70703125, |
| "learning_rate": 4.439764866073682e-05, |
| "loss": 0.1138922, |
| "memory(GiB)": 58.24, |
| "step": 22660, |
| "train_speed(iter/s)": 0.767955 |
| }, |
| { |
| "epoch": 2.111731843575419, |
| "grad_norm": 1.3671875, |
| "learning_rate": 4.4227150305324605e-05, |
| "loss": 0.11240854, |
| "memory(GiB)": 58.24, |
| "step": 22680, |
| "train_speed(iter/s)": 0.768003 |
| }, |
| { |
| "epoch": 2.1135940409683425, |
| "grad_norm": 1.0625, |
| "learning_rate": 4.4056886958718744e-05, |
| "loss": 0.10561801, |
| "memory(GiB)": 58.24, |
| "step": 22700, |
| "train_speed(iter/s)": 0.768013 |
| }, |
| { |
| "epoch": 2.1154562383612663, |
| "grad_norm": 1.0078125, |
| "learning_rate": 4.388685933835409e-05, |
| "loss": 0.11626101, |
| "memory(GiB)": 58.24, |
| "step": 22720, |
| "train_speed(iter/s)": 0.768045 |
| }, |
| { |
| "epoch": 2.11731843575419, |
| "grad_norm": 1.03125, |
| "learning_rate": 4.3717068160672244e-05, |
| "loss": 0.10951095, |
| "memory(GiB)": 58.24, |
| "step": 22740, |
| "train_speed(iter/s)": 0.768074 |
| }, |
| { |
| "epoch": 2.1191806331471135, |
| "grad_norm": 0.9609375, |
| "learning_rate": 4.3547514141118394e-05, |
| "loss": 0.09885264, |
| "memory(GiB)": 58.24, |
| "step": 22760, |
| "train_speed(iter/s)": 0.768116 |
| }, |
| { |
| "epoch": 2.1210428305400373, |
| "grad_norm": 0.8984375, |
| "learning_rate": 4.337819799413864e-05, |
| "loss": 0.08843653, |
| "memory(GiB)": 58.24, |
| "step": 22780, |
| "train_speed(iter/s)": 0.768167 |
| }, |
| { |
| "epoch": 2.122905027932961, |
| "grad_norm": 0.9609375, |
| "learning_rate": 4.320912043317652e-05, |
| "loss": 0.10544789, |
| "memory(GiB)": 58.24, |
| "step": 22800, |
| "train_speed(iter/s)": 0.768184 |
| }, |
| { |
| "epoch": 2.1247672253258845, |
| "grad_norm": 0.9921875, |
| "learning_rate": 4.304028217067041e-05, |
| "loss": 0.10042415, |
| "memory(GiB)": 58.24, |
| "step": 22820, |
| "train_speed(iter/s)": 0.768232 |
| }, |
| { |
| "epoch": 2.1266294227188083, |
| "grad_norm": 1.328125, |
| "learning_rate": 4.287168391805031e-05, |
| "loss": 0.11463039, |
| "memory(GiB)": 58.24, |
| "step": 22840, |
| "train_speed(iter/s)": 0.768245 |
| }, |
| { |
| "epoch": 2.1284916201117317, |
| "grad_norm": 0.984375, |
| "learning_rate": 4.27033263857349e-05, |
| "loss": 0.11318433, |
| "memory(GiB)": 58.24, |
| "step": 22860, |
| "train_speed(iter/s)": 0.768273 |
| }, |
| { |
| "epoch": 2.1303538175046555, |
| "grad_norm": 1.0625, |
| "learning_rate": 4.253521028312857e-05, |
| "loss": 0.10956602, |
| "memory(GiB)": 58.24, |
| "step": 22880, |
| "train_speed(iter/s)": 0.768307 |
| }, |
| { |
| "epoch": 2.1322160148975793, |
| "grad_norm": 0.94921875, |
| "learning_rate": 4.23673363186183e-05, |
| "loss": 0.10085559, |
| "memory(GiB)": 58.24, |
| "step": 22900, |
| "train_speed(iter/s)": 0.768345 |
| }, |
| { |
| "epoch": 2.1340782122905027, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.2199705199570885e-05, |
| "loss": 0.10715935, |
| "memory(GiB)": 58.24, |
| "step": 22920, |
| "train_speed(iter/s)": 0.768374 |
| }, |
| { |
| "epoch": 2.1359404096834265, |
| "grad_norm": 1.1953125, |
| "learning_rate": 4.203231763232982e-05, |
| "loss": 0.12393291, |
| "memory(GiB)": 58.24, |
| "step": 22940, |
| "train_speed(iter/s)": 0.768377 |
| }, |
| { |
| "epoch": 2.1378026070763503, |
| "grad_norm": 1.125, |
| "learning_rate": 4.186517432221234e-05, |
| "loss": 0.10514045, |
| "memory(GiB)": 58.24, |
| "step": 22960, |
| "train_speed(iter/s)": 0.768416 |
| }, |
| { |
| "epoch": 2.1396648044692737, |
| "grad_norm": 1.4140625, |
| "learning_rate": 4.1698275973506464e-05, |
| "loss": 0.12016675, |
| "memory(GiB)": 58.24, |
| "step": 22980, |
| "train_speed(iter/s)": 0.768455 |
| }, |
| { |
| "epoch": 2.1415270018621975, |
| "grad_norm": 0.98046875, |
| "learning_rate": 4.153162328946806e-05, |
| "loss": 0.10668141, |
| "memory(GiB)": 58.24, |
| "step": 23000, |
| "train_speed(iter/s)": 0.76848 |
| }, |
| { |
| "epoch": 2.143389199255121, |
| "grad_norm": 0.921875, |
| "learning_rate": 4.136521697231773e-05, |
| "loss": 0.1124084, |
| "memory(GiB)": 58.24, |
| "step": 23020, |
| "train_speed(iter/s)": 0.768523 |
| }, |
| { |
| "epoch": 2.1452513966480447, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.1199057723238085e-05, |
| "loss": 0.11913981, |
| "memory(GiB)": 58.24, |
| "step": 23040, |
| "train_speed(iter/s)": 0.768557 |
| }, |
| { |
| "epoch": 2.1471135940409685, |
| "grad_norm": 0.9296875, |
| "learning_rate": 4.1033146242370656e-05, |
| "loss": 0.1068146, |
| "memory(GiB)": 58.24, |
| "step": 23060, |
| "train_speed(iter/s)": 0.768601 |
| }, |
| { |
| "epoch": 2.148975791433892, |
| "grad_norm": 0.8046875, |
| "learning_rate": 4.086748322881283e-05, |
| "loss": 0.10721408, |
| "memory(GiB)": 58.24, |
| "step": 23080, |
| "train_speed(iter/s)": 0.76861 |
| }, |
| { |
| "epoch": 2.1508379888268156, |
| "grad_norm": 1.0078125, |
| "learning_rate": 4.070206938061527e-05, |
| "loss": 0.11343349, |
| "memory(GiB)": 58.24, |
| "step": 23100, |
| "train_speed(iter/s)": 0.768649 |
| }, |
| { |
| "epoch": 2.1527001862197395, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.053690539477847e-05, |
| "loss": 0.11392477, |
| "memory(GiB)": 58.24, |
| "step": 23120, |
| "train_speed(iter/s)": 0.768699 |
| }, |
| { |
| "epoch": 2.154562383612663, |
| "grad_norm": 1.0625, |
| "learning_rate": 4.0371991967250336e-05, |
| "loss": 0.11984729, |
| "memory(GiB)": 58.24, |
| "step": 23140, |
| "train_speed(iter/s)": 0.768728 |
| }, |
| { |
| "epoch": 2.1564245810055866, |
| "grad_norm": 1.015625, |
| "learning_rate": 4.020732979292278e-05, |
| "loss": 0.12230155, |
| "memory(GiB)": 58.24, |
| "step": 23160, |
| "train_speed(iter/s)": 0.768733 |
| }, |
| { |
| "epoch": 2.1582867783985105, |
| "grad_norm": 0.9609375, |
| "learning_rate": 4.0042919565629155e-05, |
| "loss": 0.11182994, |
| "memory(GiB)": 58.24, |
| "step": 23180, |
| "train_speed(iter/s)": 0.768712 |
| }, |
| { |
| "epoch": 2.160148975791434, |
| "grad_norm": 0.91796875, |
| "learning_rate": 3.987876197814119e-05, |
| "loss": 0.1028029, |
| "memory(GiB)": 58.24, |
| "step": 23200, |
| "train_speed(iter/s)": 0.768722 |
| }, |
| { |
| "epoch": 2.1620111731843576, |
| "grad_norm": 0.96484375, |
| "learning_rate": 3.9714857722165956e-05, |
| "loss": 0.1047961, |
| "memory(GiB)": 58.24, |
| "step": 23220, |
| "train_speed(iter/s)": 0.768762 |
| }, |
| { |
| "epoch": 2.163873370577281, |
| "grad_norm": 1.8203125, |
| "learning_rate": 3.9551207488343165e-05, |
| "loss": 0.10931444, |
| "memory(GiB)": 58.24, |
| "step": 23240, |
| "train_speed(iter/s)": 0.768788 |
| }, |
| { |
| "epoch": 2.165735567970205, |
| "grad_norm": 1.0625, |
| "learning_rate": 3.938781196624215e-05, |
| "loss": 0.11954699, |
| "memory(GiB)": 58.24, |
| "step": 23260, |
| "train_speed(iter/s)": 0.768814 |
| }, |
| { |
| "epoch": 2.1675977653631286, |
| "grad_norm": 1.171875, |
| "learning_rate": 3.9224671844358964e-05, |
| "loss": 0.12002516, |
| "memory(GiB)": 58.24, |
| "step": 23280, |
| "train_speed(iter/s)": 0.768837 |
| }, |
| { |
| "epoch": 2.169459962756052, |
| "grad_norm": 1.21875, |
| "learning_rate": 3.906178781011347e-05, |
| "loss": 0.11104758, |
| "memory(GiB)": 58.24, |
| "step": 23300, |
| "train_speed(iter/s)": 0.768871 |
| }, |
| { |
| "epoch": 2.171322160148976, |
| "grad_norm": 1.25, |
| "learning_rate": 3.8899160549846504e-05, |
| "loss": 0.10640566, |
| "memory(GiB)": 58.24, |
| "step": 23320, |
| "train_speed(iter/s)": 0.768908 |
| }, |
| { |
| "epoch": 2.1731843575418996, |
| "grad_norm": 1.1171875, |
| "learning_rate": 3.873679074881685e-05, |
| "loss": 0.09920184, |
| "memory(GiB)": 58.24, |
| "step": 23340, |
| "train_speed(iter/s)": 0.768945 |
| }, |
| { |
| "epoch": 2.175046554934823, |
| "grad_norm": 1.0234375, |
| "learning_rate": 3.857467909119853e-05, |
| "loss": 0.09962378, |
| "memory(GiB)": 58.24, |
| "step": 23360, |
| "train_speed(iter/s)": 0.768952 |
| }, |
| { |
| "epoch": 2.176908752327747, |
| "grad_norm": 1.0078125, |
| "learning_rate": 3.841282626007784e-05, |
| "loss": 0.10315701, |
| "memory(GiB)": 74.75, |
| "step": 23380, |
| "train_speed(iter/s)": 0.768953 |
| }, |
| { |
| "epoch": 2.17877094972067, |
| "grad_norm": 1.09375, |
| "learning_rate": 3.82512329374503e-05, |
| "loss": 0.10988283, |
| "memory(GiB)": 74.75, |
| "step": 23400, |
| "train_speed(iter/s)": 0.768963 |
| }, |
| { |
| "epoch": 2.180633147113594, |
| "grad_norm": 1.0, |
| "learning_rate": 3.808989980421823e-05, |
| "loss": 0.10468407, |
| "memory(GiB)": 74.75, |
| "step": 23420, |
| "train_speed(iter/s)": 0.768989 |
| }, |
| { |
| "epoch": 2.182495344506518, |
| "grad_norm": 1.4609375, |
| "learning_rate": 3.7928827540187296e-05, |
| "loss": 0.1051132, |
| "memory(GiB)": 74.75, |
| "step": 23440, |
| "train_speed(iter/s)": 0.769015 |
| }, |
| { |
| "epoch": 2.184357541899441, |
| "grad_norm": 0.953125, |
| "learning_rate": 3.776801682406421e-05, |
| "loss": 0.11219141, |
| "memory(GiB)": 74.75, |
| "step": 23460, |
| "train_speed(iter/s)": 0.769046 |
| }, |
| { |
| "epoch": 2.186219739292365, |
| "grad_norm": 1.1796875, |
| "learning_rate": 3.7607468333453386e-05, |
| "loss": 0.09865897, |
| "memory(GiB)": 74.75, |
| "step": 23480, |
| "train_speed(iter/s)": 0.769062 |
| }, |
| { |
| "epoch": 2.188081936685289, |
| "grad_norm": 0.98046875, |
| "learning_rate": 3.744718274485445e-05, |
| "loss": 0.10719773, |
| "memory(GiB)": 74.75, |
| "step": 23500, |
| "train_speed(iter/s)": 0.769096 |
| }, |
| { |
| "epoch": 2.189944134078212, |
| "grad_norm": 0.921875, |
| "learning_rate": 3.728716073365921e-05, |
| "loss": 0.10016888, |
| "memory(GiB)": 74.75, |
| "step": 23520, |
| "train_speed(iter/s)": 0.769102 |
| }, |
| { |
| "epoch": 2.191806331471136, |
| "grad_norm": 0.90234375, |
| "learning_rate": 3.712740297414875e-05, |
| "loss": 0.09913735, |
| "memory(GiB)": 74.75, |
| "step": 23540, |
| "train_speed(iter/s)": 0.769151 |
| }, |
| { |
| "epoch": 2.1936685288640594, |
| "grad_norm": 0.7578125, |
| "learning_rate": 3.696791013949081e-05, |
| "loss": 0.11304789, |
| "memory(GiB)": 74.75, |
| "step": 23560, |
| "train_speed(iter/s)": 0.7692 |
| }, |
| { |
| "epoch": 2.195530726256983, |
| "grad_norm": 0.97265625, |
| "learning_rate": 3.680868290173677e-05, |
| "loss": 0.11336186, |
| "memory(GiB)": 74.75, |
| "step": 23580, |
| "train_speed(iter/s)": 0.769216 |
| }, |
| { |
| "epoch": 2.197392923649907, |
| "grad_norm": 1.296875, |
| "learning_rate": 3.664972193181887e-05, |
| "loss": 0.09854193, |
| "memory(GiB)": 74.75, |
| "step": 23600, |
| "train_speed(iter/s)": 0.769212 |
| }, |
| { |
| "epoch": 2.1992551210428304, |
| "grad_norm": 0.90234375, |
| "learning_rate": 3.649102789954738e-05, |
| "loss": 0.09684347, |
| "memory(GiB)": 74.75, |
| "step": 23620, |
| "train_speed(iter/s)": 0.769264 |
| }, |
| { |
| "epoch": 2.201117318435754, |
| "grad_norm": 1.1328125, |
| "learning_rate": 3.633260147360783e-05, |
| "loss": 0.10960507, |
| "memory(GiB)": 74.75, |
| "step": 23640, |
| "train_speed(iter/s)": 0.769308 |
| }, |
| { |
| "epoch": 2.202979515828678, |
| "grad_norm": 1.078125, |
| "learning_rate": 3.617444332155803e-05, |
| "loss": 0.11710701, |
| "memory(GiB)": 74.75, |
| "step": 23660, |
| "train_speed(iter/s)": 0.769343 |
| }, |
| { |
| "epoch": 2.2048417132216014, |
| "grad_norm": 0.859375, |
| "learning_rate": 3.601655410982545e-05, |
| "loss": 0.10495639, |
| "memory(GiB)": 74.75, |
| "step": 23680, |
| "train_speed(iter/s)": 0.769394 |
| }, |
| { |
| "epoch": 2.206703910614525, |
| "grad_norm": 0.89453125, |
| "learning_rate": 3.585893450370439e-05, |
| "loss": 0.10962081, |
| "memory(GiB)": 74.75, |
| "step": 23700, |
| "train_speed(iter/s)": 0.769409 |
| }, |
| { |
| "epoch": 2.2085661080074486, |
| "grad_norm": 0.875, |
| "learning_rate": 3.5701585167352924e-05, |
| "loss": 0.11209228, |
| "memory(GiB)": 74.75, |
| "step": 23720, |
| "train_speed(iter/s)": 0.769435 |
| }, |
| { |
| "epoch": 2.2104283054003724, |
| "grad_norm": 1.3359375, |
| "learning_rate": 3.554450676379056e-05, |
| "loss": 0.10384083, |
| "memory(GiB)": 74.75, |
| "step": 23740, |
| "train_speed(iter/s)": 0.769455 |
| }, |
| { |
| "epoch": 2.212290502793296, |
| "grad_norm": 0.9375, |
| "learning_rate": 3.538769995489494e-05, |
| "loss": 0.10817424, |
| "memory(GiB)": 74.75, |
| "step": 23760, |
| "train_speed(iter/s)": 0.769461 |
| }, |
| { |
| "epoch": 2.2141527001862196, |
| "grad_norm": 0.84375, |
| "learning_rate": 3.523116540139949e-05, |
| "loss": 0.11751704, |
| "memory(GiB)": 74.75, |
| "step": 23780, |
| "train_speed(iter/s)": 0.769484 |
| }, |
| { |
| "epoch": 2.2160148975791434, |
| "grad_norm": 1.21875, |
| "learning_rate": 3.507490376289029e-05, |
| "loss": 0.11738909, |
| "memory(GiB)": 74.75, |
| "step": 23800, |
| "train_speed(iter/s)": 0.769518 |
| }, |
| { |
| "epoch": 2.217877094972067, |
| "grad_norm": 1.2109375, |
| "learning_rate": 3.49189156978035e-05, |
| "loss": 0.10404855, |
| "memory(GiB)": 74.75, |
| "step": 23820, |
| "train_speed(iter/s)": 0.769528 |
| }, |
| { |
| "epoch": 2.2197392923649906, |
| "grad_norm": 0.82421875, |
| "learning_rate": 3.476320186342259e-05, |
| "loss": 0.10161319, |
| "memory(GiB)": 74.75, |
| "step": 23840, |
| "train_speed(iter/s)": 0.76957 |
| }, |
| { |
| "epoch": 2.2216014897579144, |
| "grad_norm": 1.140625, |
| "learning_rate": 3.4607762915875374e-05, |
| "loss": 0.10832006, |
| "memory(GiB)": 74.75, |
| "step": 23860, |
| "train_speed(iter/s)": 0.7696 |
| }, |
| { |
| "epoch": 2.223463687150838, |
| "grad_norm": 1.34375, |
| "learning_rate": 3.445259951013149e-05, |
| "loss": 0.1030372, |
| "memory(GiB)": 74.75, |
| "step": 23880, |
| "train_speed(iter/s)": 0.76962 |
| }, |
| { |
| "epoch": 2.2253258845437616, |
| "grad_norm": 0.86328125, |
| "learning_rate": 3.4297712299999515e-05, |
| "loss": 0.09935099, |
| "memory(GiB)": 74.75, |
| "step": 23900, |
| "train_speed(iter/s)": 0.769668 |
| }, |
| { |
| "epoch": 2.2271880819366854, |
| "grad_norm": 1.515625, |
| "learning_rate": 3.4143101938124223e-05, |
| "loss": 0.11144433, |
| "memory(GiB)": 74.75, |
| "step": 23920, |
| "train_speed(iter/s)": 0.769691 |
| }, |
| { |
| "epoch": 2.2290502793296088, |
| "grad_norm": 0.99609375, |
| "learning_rate": 3.398876907598379e-05, |
| "loss": 0.11325089, |
| "memory(GiB)": 74.75, |
| "step": 23940, |
| "train_speed(iter/s)": 0.769745 |
| }, |
| { |
| "epoch": 2.2309124767225326, |
| "grad_norm": 1.0, |
| "learning_rate": 3.383471436388722e-05, |
| "loss": 0.10212806, |
| "memory(GiB)": 74.75, |
| "step": 23960, |
| "train_speed(iter/s)": 0.769766 |
| }, |
| { |
| "epoch": 2.2327746741154564, |
| "grad_norm": 1.078125, |
| "learning_rate": 3.36809384509713e-05, |
| "loss": 0.12155344, |
| "memory(GiB)": 74.75, |
| "step": 23980, |
| "train_speed(iter/s)": 0.769802 |
| }, |
| { |
| "epoch": 2.2346368715083798, |
| "grad_norm": 1.0625, |
| "learning_rate": 3.352744198519818e-05, |
| "loss": 0.1001812, |
| "memory(GiB)": 74.75, |
| "step": 24000, |
| "train_speed(iter/s)": 0.769804 |
| }, |
| { |
| "epoch": 2.2346368715083798, |
| "eval_loss": 0.38177013397216797, |
| "eval_runtime": 77.0804, |
| "eval_samples_per_second": 180.137, |
| "eval_steps_per_second": 1.414, |
| "step": 24000 |
| }, |
| { |
| "epoch": 2.2364990689013036, |
| "grad_norm": 1.140625, |
| "learning_rate": 3.337422561335249e-05, |
| "loss": 0.10155301, |
| "memory(GiB)": 74.75, |
| "step": 24020, |
| "train_speed(iter/s)": 0.767136 |
| }, |
| { |
| "epoch": 2.2383612662942274, |
| "grad_norm": 0.84765625, |
| "learning_rate": 3.322128998103863e-05, |
| "loss": 0.10558331, |
| "memory(GiB)": 74.75, |
| "step": 24040, |
| "train_speed(iter/s)": 0.767149 |
| }, |
| { |
| "epoch": 2.2402234636871508, |
| "grad_norm": 1.2421875, |
| "learning_rate": 3.3068635732678044e-05, |
| "loss": 0.10001249, |
| "memory(GiB)": 74.75, |
| "step": 24060, |
| "train_speed(iter/s)": 0.767192 |
| }, |
| { |
| "epoch": 2.2420856610800746, |
| "grad_norm": 1.0859375, |
| "learning_rate": 3.2916263511506453e-05, |
| "loss": 0.10083565, |
| "memory(GiB)": 74.75, |
| "step": 24080, |
| "train_speed(iter/s)": 0.767228 |
| }, |
| { |
| "epoch": 2.243947858472998, |
| "grad_norm": 1.0, |
| "learning_rate": 3.276417395957138e-05, |
| "loss": 0.11419288, |
| "memory(GiB)": 74.75, |
| "step": 24100, |
| "train_speed(iter/s)": 0.767252 |
| }, |
| { |
| "epoch": 2.2458100558659218, |
| "grad_norm": 1.2578125, |
| "learning_rate": 3.2612367717729054e-05, |
| "loss": 0.10783464, |
| "memory(GiB)": 74.75, |
| "step": 24120, |
| "train_speed(iter/s)": 0.767284 |
| }, |
| { |
| "epoch": 2.2476722532588456, |
| "grad_norm": 0.9453125, |
| "learning_rate": 3.2460845425642095e-05, |
| "loss": 0.11875585, |
| "memory(GiB)": 74.75, |
| "step": 24140, |
| "train_speed(iter/s)": 0.767304 |
| }, |
| { |
| "epoch": 2.249534450651769, |
| "grad_norm": 1.375, |
| "learning_rate": 3.230960772177656e-05, |
| "loss": 0.09926898, |
| "memory(GiB)": 74.75, |
| "step": 24160, |
| "train_speed(iter/s)": 0.767332 |
| }, |
| { |
| "epoch": 2.2513966480446927, |
| "grad_norm": 1.0234375, |
| "learning_rate": 3.215865524339943e-05, |
| "loss": 0.09872712, |
| "memory(GiB)": 74.75, |
| "step": 24180, |
| "train_speed(iter/s)": 0.767362 |
| }, |
| { |
| "epoch": 2.2532588454376166, |
| "grad_norm": 1.3046875, |
| "learning_rate": 3.200798862657568e-05, |
| "loss": 0.10884237, |
| "memory(GiB)": 74.75, |
| "step": 24200, |
| "train_speed(iter/s)": 0.767404 |
| }, |
| { |
| "epoch": 2.25512104283054, |
| "grad_norm": 1.0390625, |
| "learning_rate": 3.185760850616592e-05, |
| "loss": 0.11168021, |
| "memory(GiB)": 74.75, |
| "step": 24220, |
| "train_speed(iter/s)": 0.767435 |
| }, |
| { |
| "epoch": 2.2569832402234637, |
| "grad_norm": 0.76953125, |
| "learning_rate": 3.170751551582347e-05, |
| "loss": 0.10452518, |
| "memory(GiB)": 74.75, |
| "step": 24240, |
| "train_speed(iter/s)": 0.767454 |
| }, |
| { |
| "epoch": 2.2588454376163876, |
| "grad_norm": 0.8359375, |
| "learning_rate": 3.155771028799182e-05, |
| "loss": 0.10053955, |
| "memory(GiB)": 74.75, |
| "step": 24260, |
| "train_speed(iter/s)": 0.767471 |
| }, |
| { |
| "epoch": 2.260707635009311, |
| "grad_norm": 1.109375, |
| "learning_rate": 3.140819345390191e-05, |
| "loss": 0.11290877, |
| "memory(GiB)": 74.75, |
| "step": 24280, |
| "train_speed(iter/s)": 0.767523 |
| }, |
| { |
| "epoch": 2.2625698324022347, |
| "grad_norm": 1.046875, |
| "learning_rate": 3.125896564356938e-05, |
| "loss": 0.0997721, |
| "memory(GiB)": 74.75, |
| "step": 24300, |
| "train_speed(iter/s)": 0.767559 |
| }, |
| { |
| "epoch": 2.264432029795158, |
| "grad_norm": 0.7265625, |
| "learning_rate": 3.111002748579226e-05, |
| "loss": 0.10257297, |
| "memory(GiB)": 74.75, |
| "step": 24320, |
| "train_speed(iter/s)": 0.767592 |
| }, |
| { |
| "epoch": 2.266294227188082, |
| "grad_norm": 1.15625, |
| "learning_rate": 3.09613796081478e-05, |
| "loss": 0.1102082, |
| "memory(GiB)": 74.75, |
| "step": 24340, |
| "train_speed(iter/s)": 0.767628 |
| }, |
| { |
| "epoch": 2.2681564245810057, |
| "grad_norm": 0.9765625, |
| "learning_rate": 3.0813022636990275e-05, |
| "loss": 0.11674392, |
| "memory(GiB)": 74.75, |
| "step": 24360, |
| "train_speed(iter/s)": 0.767675 |
| }, |
| { |
| "epoch": 2.270018621973929, |
| "grad_norm": 1.25, |
| "learning_rate": 3.066495719744815e-05, |
| "loss": 0.09853878, |
| "memory(GiB)": 74.75, |
| "step": 24380, |
| "train_speed(iter/s)": 0.767684 |
| }, |
| { |
| "epoch": 2.271880819366853, |
| "grad_norm": 0.83984375, |
| "learning_rate": 3.0517183913421367e-05, |
| "loss": 0.1074196, |
| "memory(GiB)": 74.75, |
| "step": 24400, |
| "train_speed(iter/s)": 0.767675 |
| }, |
| { |
| "epoch": 2.2737430167597763, |
| "grad_norm": 1.453125, |
| "learning_rate": 3.036970340757903e-05, |
| "loss": 0.09881254, |
| "memory(GiB)": 74.75, |
| "step": 24420, |
| "train_speed(iter/s)": 0.767677 |
| }, |
| { |
| "epoch": 2.2756052141527, |
| "grad_norm": 1.1171875, |
| "learning_rate": 3.022251630135633e-05, |
| "loss": 0.11979437, |
| "memory(GiB)": 74.75, |
| "step": 24440, |
| "train_speed(iter/s)": 0.767719 |
| }, |
| { |
| "epoch": 2.277467411545624, |
| "grad_norm": 1.4609375, |
| "learning_rate": 3.007562321495231e-05, |
| "loss": 0.10171646, |
| "memory(GiB)": 74.75, |
| "step": 24460, |
| "train_speed(iter/s)": 0.767738 |
| }, |
| { |
| "epoch": 2.2793296089385473, |
| "grad_norm": 1.203125, |
| "learning_rate": 2.9929024767327086e-05, |
| "loss": 0.10340936, |
| "memory(GiB)": 74.75, |
| "step": 24480, |
| "train_speed(iter/s)": 0.767767 |
| }, |
| { |
| "epoch": 2.281191806331471, |
| "grad_norm": 1.1953125, |
| "learning_rate": 2.9782721576199268e-05, |
| "loss": 0.09857596, |
| "memory(GiB)": 74.75, |
| "step": 24500, |
| "train_speed(iter/s)": 0.767785 |
| }, |
| { |
| "epoch": 2.283054003724395, |
| "grad_norm": 1.0546875, |
| "learning_rate": 2.9636714258043273e-05, |
| "loss": 0.10468978, |
| "memory(GiB)": 74.75, |
| "step": 24520, |
| "train_speed(iter/s)": 0.767803 |
| }, |
| { |
| "epoch": 2.2849162011173183, |
| "grad_norm": 1.046875, |
| "learning_rate": 2.9491003428086905e-05, |
| "loss": 0.10225184, |
| "memory(GiB)": 74.75, |
| "step": 24540, |
| "train_speed(iter/s)": 0.767846 |
| }, |
| { |
| "epoch": 2.286778398510242, |
| "grad_norm": 0.84765625, |
| "learning_rate": 2.934558970030862e-05, |
| "loss": 0.09291599, |
| "memory(GiB)": 74.75, |
| "step": 24560, |
| "train_speed(iter/s)": 0.767871 |
| }, |
| { |
| "epoch": 2.288640595903166, |
| "grad_norm": 1.078125, |
| "learning_rate": 2.920047368743497e-05, |
| "loss": 0.102846, |
| "memory(GiB)": 74.75, |
| "step": 24580, |
| "train_speed(iter/s)": 0.7679 |
| }, |
| { |
| "epoch": 2.2905027932960893, |
| "grad_norm": 1.1015625, |
| "learning_rate": 2.9055656000938092e-05, |
| "loss": 0.09924616, |
| "memory(GiB)": 74.75, |
| "step": 24600, |
| "train_speed(iter/s)": 0.767908 |
| }, |
| { |
| "epoch": 2.292364990689013, |
| "grad_norm": 1.4921875, |
| "learning_rate": 2.8911137251032915e-05, |
| "loss": 0.11189671, |
| "memory(GiB)": 74.75, |
| "step": 24620, |
| "train_speed(iter/s)": 0.767926 |
| }, |
| { |
| "epoch": 2.294227188081937, |
| "grad_norm": 1.0078125, |
| "learning_rate": 2.8766918046674995e-05, |
| "loss": 0.10130365, |
| "memory(GiB)": 74.75, |
| "step": 24640, |
| "train_speed(iter/s)": 0.767934 |
| }, |
| { |
| "epoch": 2.2960893854748603, |
| "grad_norm": 1.359375, |
| "learning_rate": 2.862299899555746e-05, |
| "loss": 0.10901361, |
| "memory(GiB)": 74.75, |
| "step": 24660, |
| "train_speed(iter/s)": 0.767971 |
| }, |
| { |
| "epoch": 2.297951582867784, |
| "grad_norm": 0.875, |
| "learning_rate": 2.8479380704108815e-05, |
| "loss": 0.10119644, |
| "memory(GiB)": 74.75, |
| "step": 24680, |
| "train_speed(iter/s)": 0.768011 |
| }, |
| { |
| "epoch": 2.2998137802607075, |
| "grad_norm": 0.9375, |
| "learning_rate": 2.8336063777490275e-05, |
| "loss": 0.10090758, |
| "memory(GiB)": 74.75, |
| "step": 24700, |
| "train_speed(iter/s)": 0.768031 |
| }, |
| { |
| "epoch": 2.3016759776536313, |
| "grad_norm": 1.3203125, |
| "learning_rate": 2.819304881959306e-05, |
| "loss": 0.11179039, |
| "memory(GiB)": 74.75, |
| "step": 24720, |
| "train_speed(iter/s)": 0.768069 |
| }, |
| { |
| "epoch": 2.303538175046555, |
| "grad_norm": 0.953125, |
| "learning_rate": 2.8050336433036216e-05, |
| "loss": 0.09195191, |
| "memory(GiB)": 74.75, |
| "step": 24740, |
| "train_speed(iter/s)": 0.768106 |
| }, |
| { |
| "epoch": 2.3054003724394785, |
| "grad_norm": 1.0, |
| "learning_rate": 2.7907927219163654e-05, |
| "loss": 0.09057497, |
| "memory(GiB)": 74.75, |
| "step": 24760, |
| "train_speed(iter/s)": 0.768137 |
| }, |
| { |
| "epoch": 2.3072625698324023, |
| "grad_norm": 1.0546875, |
| "learning_rate": 2.776582177804189e-05, |
| "loss": 0.10486867, |
| "memory(GiB)": 74.75, |
| "step": 24780, |
| "train_speed(iter/s)": 0.768152 |
| }, |
| { |
| "epoch": 2.3091247672253257, |
| "grad_norm": 0.98828125, |
| "learning_rate": 2.7624020708457442e-05, |
| "loss": 0.10587823, |
| "memory(GiB)": 74.75, |
| "step": 24800, |
| "train_speed(iter/s)": 0.768172 |
| }, |
| { |
| "epoch": 2.3109869646182495, |
| "grad_norm": 1.15625, |
| "learning_rate": 2.748252460791433e-05, |
| "loss": 0.10775185, |
| "memory(GiB)": 74.75, |
| "step": 24820, |
| "train_speed(iter/s)": 0.768207 |
| }, |
| { |
| "epoch": 2.3128491620111733, |
| "grad_norm": 1.1328125, |
| "learning_rate": 2.7341334072631454e-05, |
| "loss": 0.09222878, |
| "memory(GiB)": 74.75, |
| "step": 24840, |
| "train_speed(iter/s)": 0.768225 |
| }, |
| { |
| "epoch": 2.3147113594040967, |
| "grad_norm": 1.3203125, |
| "learning_rate": 2.720044969754021e-05, |
| "loss": 0.10134428, |
| "memory(GiB)": 74.75, |
| "step": 24860, |
| "train_speed(iter/s)": 0.768226 |
| }, |
| { |
| "epoch": 2.3165735567970205, |
| "grad_norm": 1.0859375, |
| "learning_rate": 2.7059872076281956e-05, |
| "loss": 0.09082563, |
| "memory(GiB)": 74.75, |
| "step": 24880, |
| "train_speed(iter/s)": 0.768223 |
| }, |
| { |
| "epoch": 2.3184357541899443, |
| "grad_norm": 0.95703125, |
| "learning_rate": 2.691960180120544e-05, |
| "loss": 0.10442271, |
| "memory(GiB)": 74.75, |
| "step": 24900, |
| "train_speed(iter/s)": 0.768261 |
| }, |
| { |
| "epoch": 2.3202979515828677, |
| "grad_norm": 1.171875, |
| "learning_rate": 2.6779639463364438e-05, |
| "loss": 0.1070719, |
| "memory(GiB)": 74.75, |
| "step": 24920, |
| "train_speed(iter/s)": 0.768305 |
| }, |
| { |
| "epoch": 2.3221601489757915, |
| "grad_norm": 1.0234375, |
| "learning_rate": 2.6639985652514987e-05, |
| "loss": 0.10250396, |
| "memory(GiB)": 74.75, |
| "step": 24940, |
| "train_speed(iter/s)": 0.768353 |
| }, |
| { |
| "epoch": 2.3240223463687153, |
| "grad_norm": 0.84765625, |
| "learning_rate": 2.6500640957113364e-05, |
| "loss": 0.09741715, |
| "memory(GiB)": 74.75, |
| "step": 24960, |
| "train_speed(iter/s)": 0.768386 |
| }, |
| { |
| "epoch": 2.3258845437616387, |
| "grad_norm": 1.1328125, |
| "learning_rate": 2.6361605964313084e-05, |
| "loss": 0.09976978, |
| "memory(GiB)": 74.75, |
| "step": 24980, |
| "train_speed(iter/s)": 0.768395 |
| }, |
| { |
| "epoch": 2.3277467411545625, |
| "grad_norm": 0.8828125, |
| "learning_rate": 2.6222881259962828e-05, |
| "loss": 0.09276388, |
| "memory(GiB)": 74.75, |
| "step": 25000, |
| "train_speed(iter/s)": 0.768413 |
| }, |
| { |
| "epoch": 2.329608938547486, |
| "grad_norm": 1.1953125, |
| "learning_rate": 2.6084467428603788e-05, |
| "loss": 0.10102336, |
| "memory(GiB)": 74.75, |
| "step": 25020, |
| "train_speed(iter/s)": 0.768435 |
| }, |
| { |
| "epoch": 2.3314711359404097, |
| "grad_norm": 1.0390625, |
| "learning_rate": 2.5946365053467126e-05, |
| "loss": 0.09177208, |
| "memory(GiB)": 74.75, |
| "step": 25040, |
| "train_speed(iter/s)": 0.768475 |
| }, |
| { |
| "epoch": 2.3333333333333335, |
| "grad_norm": 0.87890625, |
| "learning_rate": 2.5808574716471856e-05, |
| "loss": 0.10831335, |
| "memory(GiB)": 74.75, |
| "step": 25060, |
| "train_speed(iter/s)": 0.768501 |
| }, |
| { |
| "epoch": 2.335195530726257, |
| "grad_norm": 0.765625, |
| "learning_rate": 2.5671096998221943e-05, |
| "loss": 0.08765578, |
| "memory(GiB)": 74.75, |
| "step": 25080, |
| "train_speed(iter/s)": 0.768518 |
| }, |
| { |
| "epoch": 2.3370577281191807, |
| "grad_norm": 1.2578125, |
| "learning_rate": 2.5533932478004186e-05, |
| "loss": 0.10742819, |
| "memory(GiB)": 74.75, |
| "step": 25100, |
| "train_speed(iter/s)": 0.76853 |
| }, |
| { |
| "epoch": 2.338919925512104, |
| "grad_norm": 1.0546875, |
| "learning_rate": 2.5397081733785654e-05, |
| "loss": 0.09265711, |
| "memory(GiB)": 74.75, |
| "step": 25120, |
| "train_speed(iter/s)": 0.768552 |
| }, |
| { |
| "epoch": 2.340782122905028, |
| "grad_norm": 1.1328125, |
| "learning_rate": 2.5260545342211295e-05, |
| "loss": 0.11310081, |
| "memory(GiB)": 74.75, |
| "step": 25140, |
| "train_speed(iter/s)": 0.768573 |
| }, |
| { |
| "epoch": 2.3426443202979517, |
| "grad_norm": 1.1796875, |
| "learning_rate": 2.5124323878601342e-05, |
| "loss": 0.09951773, |
| "memory(GiB)": 74.75, |
| "step": 25160, |
| "train_speed(iter/s)": 0.768598 |
| }, |
| { |
| "epoch": 2.344506517690875, |
| "grad_norm": 0.96484375, |
| "learning_rate": 2.498841791694925e-05, |
| "loss": 0.0946602, |
| "memory(GiB)": 74.75, |
| "step": 25180, |
| "train_speed(iter/s)": 0.768608 |
| }, |
| { |
| "epoch": 2.346368715083799, |
| "grad_norm": 0.9375, |
| "learning_rate": 2.4852828029918817e-05, |
| "loss": 0.0985311, |
| "memory(GiB)": 74.75, |
| "step": 25200, |
| "train_speed(iter/s)": 0.768644 |
| }, |
| { |
| "epoch": 2.3482309124767227, |
| "grad_norm": 0.80078125, |
| "learning_rate": 2.4717554788842156e-05, |
| "loss": 0.10430787, |
| "memory(GiB)": 74.75, |
| "step": 25220, |
| "train_speed(iter/s)": 0.768664 |
| }, |
| { |
| "epoch": 2.350093109869646, |
| "grad_norm": 1.078125, |
| "learning_rate": 2.4582598763717125e-05, |
| "loss": 0.10131855, |
| "memory(GiB)": 74.75, |
| "step": 25240, |
| "train_speed(iter/s)": 0.768693 |
| }, |
| { |
| "epoch": 2.35195530726257, |
| "grad_norm": 0.88671875, |
| "learning_rate": 2.4447960523204795e-05, |
| "loss": 0.09341418, |
| "memory(GiB)": 74.75, |
| "step": 25260, |
| "train_speed(iter/s)": 0.768703 |
| }, |
| { |
| "epoch": 2.3538175046554937, |
| "grad_norm": 0.81640625, |
| "learning_rate": 2.431364063462743e-05, |
| "loss": 0.1005939, |
| "memory(GiB)": 74.75, |
| "step": 25280, |
| "train_speed(iter/s)": 0.768753 |
| }, |
| { |
| "epoch": 2.355679702048417, |
| "grad_norm": 1.015625, |
| "learning_rate": 2.417963966396565e-05, |
| "loss": 0.09922752, |
| "memory(GiB)": 74.75, |
| "step": 25300, |
| "train_speed(iter/s)": 0.76878 |
| }, |
| { |
| "epoch": 2.357541899441341, |
| "grad_norm": 1.140625, |
| "learning_rate": 2.4045958175856354e-05, |
| "loss": 0.10236137, |
| "memory(GiB)": 74.75, |
| "step": 25320, |
| "train_speed(iter/s)": 0.768813 |
| }, |
| { |
| "epoch": 2.3594040968342647, |
| "grad_norm": 0.99609375, |
| "learning_rate": 2.3912596733590242e-05, |
| "loss": 0.10635381, |
| "memory(GiB)": 74.75, |
| "step": 25340, |
| "train_speed(iter/s)": 0.768832 |
| }, |
| { |
| "epoch": 2.361266294227188, |
| "grad_norm": 0.953125, |
| "learning_rate": 2.377955589910943e-05, |
| "loss": 0.10511913, |
| "memory(GiB)": 74.75, |
| "step": 25360, |
| "train_speed(iter/s)": 0.768863 |
| }, |
| { |
| "epoch": 2.363128491620112, |
| "grad_norm": 0.91796875, |
| "learning_rate": 2.364683623300513e-05, |
| "loss": 0.08750724, |
| "memory(GiB)": 74.75, |
| "step": 25380, |
| "train_speed(iter/s)": 0.7689 |
| }, |
| { |
| "epoch": 2.364990689013035, |
| "grad_norm": 0.90625, |
| "learning_rate": 2.3514438294515172e-05, |
| "loss": 0.08906246, |
| "memory(GiB)": 74.75, |
| "step": 25400, |
| "train_speed(iter/s)": 0.768923 |
| }, |
| { |
| "epoch": 2.366852886405959, |
| "grad_norm": 1.0546875, |
| "learning_rate": 2.3382362641521803e-05, |
| "loss": 0.10484225, |
| "memory(GiB)": 74.75, |
| "step": 25420, |
| "train_speed(iter/s)": 0.768941 |
| }, |
| { |
| "epoch": 2.368715083798883, |
| "grad_norm": 0.921875, |
| "learning_rate": 2.325060983054924e-05, |
| "loss": 0.10104049, |
| "memory(GiB)": 74.75, |
| "step": 25440, |
| "train_speed(iter/s)": 0.768962 |
| }, |
| { |
| "epoch": 2.370577281191806, |
| "grad_norm": 0.9609375, |
| "learning_rate": 2.3119180416761378e-05, |
| "loss": 0.10175326, |
| "memory(GiB)": 74.75, |
| "step": 25460, |
| "train_speed(iter/s)": 0.768988 |
| }, |
| { |
| "epoch": 2.37243947858473, |
| "grad_norm": 1.21875, |
| "learning_rate": 2.2988074953959317e-05, |
| "loss": 0.096415, |
| "memory(GiB)": 74.75, |
| "step": 25480, |
| "train_speed(iter/s)": 0.769004 |
| }, |
| { |
| "epoch": 2.3743016759776534, |
| "grad_norm": 1.0859375, |
| "learning_rate": 2.28572939945793e-05, |
| "loss": 0.09378529, |
| "memory(GiB)": 74.75, |
| "step": 25500, |
| "train_speed(iter/s)": 0.768992 |
| }, |
| { |
| "epoch": 2.376163873370577, |
| "grad_norm": 1.03125, |
| "learning_rate": 2.272683808969004e-05, |
| "loss": 0.10414027, |
| "memory(GiB)": 74.75, |
| "step": 25520, |
| "train_speed(iter/s)": 0.769017 |
| }, |
| { |
| "epoch": 2.378026070763501, |
| "grad_norm": 0.87109375, |
| "learning_rate": 2.259670778899071e-05, |
| "loss": 0.10215082, |
| "memory(GiB)": 74.75, |
| "step": 25540, |
| "train_speed(iter/s)": 0.769021 |
| }, |
| { |
| "epoch": 2.3798882681564244, |
| "grad_norm": 0.984375, |
| "learning_rate": 2.2466903640808446e-05, |
| "loss": 0.09546254, |
| "memory(GiB)": 74.75, |
| "step": 25560, |
| "train_speed(iter/s)": 0.769032 |
| }, |
| { |
| "epoch": 2.381750465549348, |
| "grad_norm": 1.296875, |
| "learning_rate": 2.2337426192096002e-05, |
| "loss": 0.10957, |
| "memory(GiB)": 74.75, |
| "step": 25580, |
| "train_speed(iter/s)": 0.769066 |
| }, |
| { |
| "epoch": 2.383612662942272, |
| "grad_norm": 1.15625, |
| "learning_rate": 2.2208275988429716e-05, |
| "loss": 0.09794831, |
| "memory(GiB)": 74.75, |
| "step": 25600, |
| "train_speed(iter/s)": 0.769101 |
| }, |
| { |
| "epoch": 2.3854748603351954, |
| "grad_norm": 0.76953125, |
| "learning_rate": 2.2079453574006814e-05, |
| "loss": 0.09378864, |
| "memory(GiB)": 74.75, |
| "step": 25620, |
| "train_speed(iter/s)": 0.769125 |
| }, |
| { |
| "epoch": 2.387337057728119, |
| "grad_norm": 1.015625, |
| "learning_rate": 2.1950959491643454e-05, |
| "loss": 0.10166427, |
| "memory(GiB)": 74.75, |
| "step": 25640, |
| "train_speed(iter/s)": 0.769157 |
| }, |
| { |
| "epoch": 2.389199255121043, |
| "grad_norm": 0.78515625, |
| "learning_rate": 2.1822794282772253e-05, |
| "loss": 0.09407473, |
| "memory(GiB)": 74.75, |
| "step": 25660, |
| "train_speed(iter/s)": 0.769197 |
| }, |
| { |
| "epoch": 2.3910614525139664, |
| "grad_norm": 1.0625, |
| "learning_rate": 2.169495848744012e-05, |
| "loss": 0.09880165, |
| "memory(GiB)": 74.75, |
| "step": 25680, |
| "train_speed(iter/s)": 0.769245 |
| }, |
| { |
| "epoch": 2.39292364990689, |
| "grad_norm": 1.2265625, |
| "learning_rate": 2.1567452644305818e-05, |
| "loss": 0.0979923, |
| "memory(GiB)": 74.75, |
| "step": 25700, |
| "train_speed(iter/s)": 0.769264 |
| }, |
| { |
| "epoch": 2.394785847299814, |
| "grad_norm": 1.09375, |
| "learning_rate": 2.14402772906379e-05, |
| "loss": 0.09793384, |
| "memory(GiB)": 74.75, |
| "step": 25720, |
| "train_speed(iter/s)": 0.769313 |
| }, |
| { |
| "epoch": 2.3966480446927374, |
| "grad_norm": 0.8046875, |
| "learning_rate": 2.1313432962312284e-05, |
| "loss": 0.10001029, |
| "memory(GiB)": 74.75, |
| "step": 25740, |
| "train_speed(iter/s)": 0.769316 |
| }, |
| { |
| "epoch": 2.398510242085661, |
| "grad_norm": 1.0546875, |
| "learning_rate": 2.118692019381008e-05, |
| "loss": 0.11119788, |
| "memory(GiB)": 74.75, |
| "step": 25760, |
| "train_speed(iter/s)": 0.769331 |
| }, |
| { |
| "epoch": 2.4003724394785846, |
| "grad_norm": 0.8515625, |
| "learning_rate": 2.1060739518215333e-05, |
| "loss": 0.09229028, |
| "memory(GiB)": 74.75, |
| "step": 25780, |
| "train_speed(iter/s)": 0.769372 |
| }, |
| { |
| "epoch": 2.4022346368715084, |
| "grad_norm": 1.03125, |
| "learning_rate": 2.0934891467212647e-05, |
| "loss": 0.09729228, |
| "memory(GiB)": 74.75, |
| "step": 25800, |
| "train_speed(iter/s)": 0.769395 |
| }, |
| { |
| "epoch": 2.404096834264432, |
| "grad_norm": 0.9296875, |
| "learning_rate": 2.0809376571085238e-05, |
| "loss": 0.09595935, |
| "memory(GiB)": 74.75, |
| "step": 25820, |
| "train_speed(iter/s)": 0.769411 |
| }, |
| { |
| "epoch": 2.4059590316573556, |
| "grad_norm": 1.1484375, |
| "learning_rate": 2.068419535871233e-05, |
| "loss": 0.09269361, |
| "memory(GiB)": 74.75, |
| "step": 25840, |
| "train_speed(iter/s)": 0.769431 |
| }, |
| { |
| "epoch": 2.4078212290502794, |
| "grad_norm": 1.3046875, |
| "learning_rate": 2.055934835756722e-05, |
| "loss": 0.09150513, |
| "memory(GiB)": 74.75, |
| "step": 25860, |
| "train_speed(iter/s)": 0.769471 |
| }, |
| { |
| "epoch": 2.4096834264432028, |
| "grad_norm": 0.8515625, |
| "learning_rate": 2.0434836093714937e-05, |
| "loss": 0.09933028, |
| "memory(GiB)": 74.75, |
| "step": 25880, |
| "train_speed(iter/s)": 0.769502 |
| }, |
| { |
| "epoch": 2.4115456238361266, |
| "grad_norm": 1.125, |
| "learning_rate": 2.0310659091809948e-05, |
| "loss": 0.09195083, |
| "memory(GiB)": 74.75, |
| "step": 25900, |
| "train_speed(iter/s)": 0.76953 |
| }, |
| { |
| "epoch": 2.4134078212290504, |
| "grad_norm": 0.83984375, |
| "learning_rate": 2.0186817875094198e-05, |
| "loss": 0.10260382, |
| "memory(GiB)": 74.75, |
| "step": 25920, |
| "train_speed(iter/s)": 0.769561 |
| }, |
| { |
| "epoch": 2.4152700186219738, |
| "grad_norm": 1.1953125, |
| "learning_rate": 2.0063312965394564e-05, |
| "loss": 0.0909433, |
| "memory(GiB)": 74.75, |
| "step": 25940, |
| "train_speed(iter/s)": 0.769595 |
| }, |
| { |
| "epoch": 2.4171322160148976, |
| "grad_norm": 1.53125, |
| "learning_rate": 1.9940144883120958e-05, |
| "loss": 0.10011367, |
| "memory(GiB)": 74.75, |
| "step": 25960, |
| "train_speed(iter/s)": 0.769627 |
| }, |
| { |
| "epoch": 2.4189944134078214, |
| "grad_norm": 1.3203125, |
| "learning_rate": 1.9817314147263945e-05, |
| "loss": 0.10159645, |
| "memory(GiB)": 74.75, |
| "step": 25980, |
| "train_speed(iter/s)": 0.769625 |
| }, |
| { |
| "epoch": 2.4208566108007448, |
| "grad_norm": 1.125, |
| "learning_rate": 1.9694821275392682e-05, |
| "loss": 0.09749501, |
| "memory(GiB)": 74.75, |
| "step": 26000, |
| "train_speed(iter/s)": 0.769664 |
| }, |
| { |
| "epoch": 2.4208566108007448, |
| "eval_loss": 0.38522472977638245, |
| "eval_runtime": 76.9483, |
| "eval_samples_per_second": 180.446, |
| "eval_steps_per_second": 1.417, |
| "step": 26000 |
| }, |
| { |
| "epoch": 2.4227188081936686, |
| "grad_norm": 0.9609375, |
| "learning_rate": 1.9572666783652582e-05, |
| "loss": 0.09559954, |
| "memory(GiB)": 74.75, |
| "step": 26020, |
| "train_speed(iter/s)": 0.767242 |
| }, |
| { |
| "epoch": 2.4245810055865924, |
| "grad_norm": 1.1796875, |
| "learning_rate": 1.945085118676332e-05, |
| "loss": 0.09020438, |
| "memory(GiB)": 74.75, |
| "step": 26040, |
| "train_speed(iter/s)": 0.767278 |
| }, |
| { |
| "epoch": 2.4264432029795158, |
| "grad_norm": 1.0, |
| "learning_rate": 1.9329374998016537e-05, |
| "loss": 0.10400308, |
| "memory(GiB)": 74.75, |
| "step": 26060, |
| "train_speed(iter/s)": 0.767312 |
| }, |
| { |
| "epoch": 2.4283054003724396, |
| "grad_norm": 1.046875, |
| "learning_rate": 1.9208238729273732e-05, |
| "loss": 0.10132781, |
| "memory(GiB)": 74.75, |
| "step": 26080, |
| "train_speed(iter/s)": 0.767353 |
| }, |
| { |
| "epoch": 2.430167597765363, |
| "grad_norm": 1.078125, |
| "learning_rate": 1.9087442890964102e-05, |
| "loss": 0.09796514, |
| "memory(GiB)": 74.75, |
| "step": 26100, |
| "train_speed(iter/s)": 0.76738 |
| }, |
| { |
| "epoch": 2.4320297951582868, |
| "grad_norm": 1.015625, |
| "learning_rate": 1.8966987992082296e-05, |
| "loss": 0.0868296, |
| "memory(GiB)": 74.75, |
| "step": 26120, |
| "train_speed(iter/s)": 0.7674 |
| }, |
| { |
| "epoch": 2.4338919925512106, |
| "grad_norm": 1.1015625, |
| "learning_rate": 1.884687454018652e-05, |
| "loss": 0.09946807, |
| "memory(GiB)": 74.75, |
| "step": 26140, |
| "train_speed(iter/s)": 0.767421 |
| }, |
| { |
| "epoch": 2.435754189944134, |
| "grad_norm": 0.86328125, |
| "learning_rate": 1.8727103041396056e-05, |
| "loss": 0.08833747, |
| "memory(GiB)": 74.75, |
| "step": 26160, |
| "train_speed(iter/s)": 0.767423 |
| }, |
| { |
| "epoch": 2.4376163873370578, |
| "grad_norm": 0.8359375, |
| "learning_rate": 1.860767400038941e-05, |
| "loss": 0.08898092, |
| "memory(GiB)": 74.75, |
| "step": 26180, |
| "train_speed(iter/s)": 0.76743 |
| }, |
| { |
| "epoch": 2.439478584729981, |
| "grad_norm": 1.0078125, |
| "learning_rate": 1.8488587920402046e-05, |
| "loss": 0.08464298, |
| "memory(GiB)": 74.75, |
| "step": 26200, |
| "train_speed(iter/s)": 0.767463 |
| }, |
| { |
| "epoch": 2.441340782122905, |
| "grad_norm": 0.875, |
| "learning_rate": 1.8369845303224308e-05, |
| "loss": 0.09342575, |
| "memory(GiB)": 74.75, |
| "step": 26220, |
| "train_speed(iter/s)": 0.767475 |
| }, |
| { |
| "epoch": 2.4432029795158288, |
| "grad_norm": 0.96875, |
| "learning_rate": 1.8251446649199355e-05, |
| "loss": 0.09570937, |
| "memory(GiB)": 74.75, |
| "step": 26240, |
| "train_speed(iter/s)": 0.767504 |
| }, |
| { |
| "epoch": 2.445065176908752, |
| "grad_norm": 0.78515625, |
| "learning_rate": 1.8133392457220856e-05, |
| "loss": 0.08847888, |
| "memory(GiB)": 74.75, |
| "step": 26260, |
| "train_speed(iter/s)": 0.767526 |
| }, |
| { |
| "epoch": 2.446927374301676, |
| "grad_norm": 1.125, |
| "learning_rate": 1.801568322473115e-05, |
| "loss": 0.1068172, |
| "memory(GiB)": 74.75, |
| "step": 26280, |
| "train_speed(iter/s)": 0.767557 |
| }, |
| { |
| "epoch": 2.4487895716945998, |
| "grad_norm": 1.234375, |
| "learning_rate": 1.7898319447718993e-05, |
| "loss": 0.10401188, |
| "memory(GiB)": 74.75, |
| "step": 26300, |
| "train_speed(iter/s)": 0.767592 |
| }, |
| { |
| "epoch": 2.450651769087523, |
| "grad_norm": 1.21875, |
| "learning_rate": 1.778130162071753e-05, |
| "loss": 0.10306834, |
| "memory(GiB)": 74.75, |
| "step": 26320, |
| "train_speed(iter/s)": 0.767625 |
| }, |
| { |
| "epoch": 2.452513966480447, |
| "grad_norm": 0.875, |
| "learning_rate": 1.7664630236802073e-05, |
| "loss": 0.09694492, |
| "memory(GiB)": 74.75, |
| "step": 26340, |
| "train_speed(iter/s)": 0.767644 |
| }, |
| { |
| "epoch": 2.4543761638733708, |
| "grad_norm": 1.328125, |
| "learning_rate": 1.7548305787588314e-05, |
| "loss": 0.1060955, |
| "memory(GiB)": 74.75, |
| "step": 26360, |
| "train_speed(iter/s)": 0.767692 |
| }, |
| { |
| "epoch": 2.456238361266294, |
| "grad_norm": 1.109375, |
| "learning_rate": 1.7432328763229912e-05, |
| "loss": 0.09546111, |
| "memory(GiB)": 74.75, |
| "step": 26380, |
| "train_speed(iter/s)": 0.767709 |
| }, |
| { |
| "epoch": 2.458100558659218, |
| "grad_norm": 1.0703125, |
| "learning_rate": 1.731669965241668e-05, |
| "loss": 0.11217393, |
| "memory(GiB)": 74.75, |
| "step": 26400, |
| "train_speed(iter/s)": 0.767736 |
| }, |
| { |
| "epoch": 2.4599627560521418, |
| "grad_norm": 1.1171875, |
| "learning_rate": 1.7201418942372415e-05, |
| "loss": 0.09984755, |
| "memory(GiB)": 74.75, |
| "step": 26420, |
| "train_speed(iter/s)": 0.767754 |
| }, |
| { |
| "epoch": 2.461824953445065, |
| "grad_norm": 1.0703125, |
| "learning_rate": 1.708648711885281e-05, |
| "loss": 0.09662188, |
| "memory(GiB)": 74.75, |
| "step": 26440, |
| "train_speed(iter/s)": 0.767787 |
| }, |
| { |
| "epoch": 2.463687150837989, |
| "grad_norm": 0.9375, |
| "learning_rate": 1.69719046661436e-05, |
| "loss": 0.09171115, |
| "memory(GiB)": 74.75, |
| "step": 26460, |
| "train_speed(iter/s)": 0.767817 |
| }, |
| { |
| "epoch": 2.4655493482309123, |
| "grad_norm": 1.203125, |
| "learning_rate": 1.6857672067058216e-05, |
| "loss": 0.09821294, |
| "memory(GiB)": 74.75, |
| "step": 26480, |
| "train_speed(iter/s)": 0.767854 |
| }, |
| { |
| "epoch": 2.467411545623836, |
| "grad_norm": 0.734375, |
| "learning_rate": 1.674378980293605e-05, |
| "loss": 0.10482373, |
| "memory(GiB)": 74.75, |
| "step": 26500, |
| "train_speed(iter/s)": 0.767907 |
| }, |
| { |
| "epoch": 2.46927374301676, |
| "grad_norm": 1.078125, |
| "learning_rate": 1.6630258353640217e-05, |
| "loss": 0.08855894, |
| "memory(GiB)": 74.75, |
| "step": 26520, |
| "train_speed(iter/s)": 0.767921 |
| }, |
| { |
| "epoch": 2.4711359404096833, |
| "grad_norm": 0.96484375, |
| "learning_rate": 1.6517078197555647e-05, |
| "loss": 0.08362047, |
| "memory(GiB)": 74.75, |
| "step": 26540, |
| "train_speed(iter/s)": 0.767932 |
| }, |
| { |
| "epoch": 2.472998137802607, |
| "grad_norm": 1.0390625, |
| "learning_rate": 1.6404249811587056e-05, |
| "loss": 0.09590976, |
| "memory(GiB)": 74.75, |
| "step": 26560, |
| "train_speed(iter/s)": 0.767948 |
| }, |
| { |
| "epoch": 2.4748603351955305, |
| "grad_norm": 1.2265625, |
| "learning_rate": 1.629177367115684e-05, |
| "loss": 0.11099143, |
| "memory(GiB)": 74.75, |
| "step": 26580, |
| "train_speed(iter/s)": 0.767995 |
| }, |
| { |
| "epoch": 2.4767225325884543, |
| "grad_norm": 1.140625, |
| "learning_rate": 1.6179650250203216e-05, |
| "loss": 0.09504373, |
| "memory(GiB)": 74.75, |
| "step": 26600, |
| "train_speed(iter/s)": 0.768019 |
| }, |
| { |
| "epoch": 2.478584729981378, |
| "grad_norm": 0.88671875, |
| "learning_rate": 1.6067880021178105e-05, |
| "loss": 0.09536514, |
| "memory(GiB)": 74.75, |
| "step": 26620, |
| "train_speed(iter/s)": 0.768043 |
| }, |
| { |
| "epoch": 2.4804469273743015, |
| "grad_norm": 0.95703125, |
| "learning_rate": 1.5956463455045267e-05, |
| "loss": 0.10592208, |
| "memory(GiB)": 74.75, |
| "step": 26640, |
| "train_speed(iter/s)": 0.768061 |
| }, |
| { |
| "epoch": 2.4823091247672253, |
| "grad_norm": 1.015625, |
| "learning_rate": 1.5845401021278106e-05, |
| "loss": 0.10959908, |
| "memory(GiB)": 74.75, |
| "step": 26660, |
| "train_speed(iter/s)": 0.768072 |
| }, |
| { |
| "epoch": 2.484171322160149, |
| "grad_norm": 1.1015625, |
| "learning_rate": 1.5734693187857997e-05, |
| "loss": 0.09669304, |
| "memory(GiB)": 74.75, |
| "step": 26680, |
| "train_speed(iter/s)": 0.768115 |
| }, |
| { |
| "epoch": 2.4860335195530725, |
| "grad_norm": 0.96875, |
| "learning_rate": 1.5624340421272e-05, |
| "loss": 0.10210849, |
| "memory(GiB)": 74.75, |
| "step": 26700, |
| "train_speed(iter/s)": 0.76816 |
| }, |
| { |
| "epoch": 2.4878957169459963, |
| "grad_norm": 1.4140625, |
| "learning_rate": 1.5514343186511104e-05, |
| "loss": 0.10455359, |
| "memory(GiB)": 74.75, |
| "step": 26720, |
| "train_speed(iter/s)": 0.768193 |
| }, |
| { |
| "epoch": 2.48975791433892, |
| "grad_norm": 1.21875, |
| "learning_rate": 1.5404701947068224e-05, |
| "loss": 0.10247712, |
| "memory(GiB)": 74.75, |
| "step": 26740, |
| "train_speed(iter/s)": 0.768229 |
| }, |
| { |
| "epoch": 2.4916201117318435, |
| "grad_norm": 1.078125, |
| "learning_rate": 1.529541716493612e-05, |
| "loss": 0.09632077, |
| "memory(GiB)": 74.75, |
| "step": 26760, |
| "train_speed(iter/s)": 0.768262 |
| }, |
| { |
| "epoch": 2.4934823091247673, |
| "grad_norm": 1.140625, |
| "learning_rate": 1.518648930060571e-05, |
| "loss": 0.10718331, |
| "memory(GiB)": 74.75, |
| "step": 26780, |
| "train_speed(iter/s)": 0.768295 |
| }, |
| { |
| "epoch": 2.4953445065176907, |
| "grad_norm": 1.09375, |
| "learning_rate": 1.507791881306384e-05, |
| "loss": 0.09734278, |
| "memory(GiB)": 74.75, |
| "step": 26800, |
| "train_speed(iter/s)": 0.768348 |
| }, |
| { |
| "epoch": 2.4972067039106145, |
| "grad_norm": 1.1640625, |
| "learning_rate": 1.4969706159791564e-05, |
| "loss": 0.10756767, |
| "memory(GiB)": 74.75, |
| "step": 26820, |
| "train_speed(iter/s)": 0.768362 |
| }, |
| { |
| "epoch": 2.4990689013035383, |
| "grad_norm": 1.140625, |
| "learning_rate": 1.4861851796762094e-05, |
| "loss": 0.10674, |
| "memory(GiB)": 74.75, |
| "step": 26840, |
| "train_speed(iter/s)": 0.768384 |
| }, |
| { |
| "epoch": 2.5009310986964617, |
| "grad_norm": 1.0546875, |
| "learning_rate": 1.4754356178438954e-05, |
| "loss": 0.10347006, |
| "memory(GiB)": 74.75, |
| "step": 26860, |
| "train_speed(iter/s)": 0.768408 |
| }, |
| { |
| "epoch": 2.5027932960893855, |
| "grad_norm": 0.875, |
| "learning_rate": 1.4647219757774045e-05, |
| "loss": 0.08911729, |
| "memory(GiB)": 74.75, |
| "step": 26880, |
| "train_speed(iter/s)": 0.768442 |
| }, |
| { |
| "epoch": 2.504655493482309, |
| "grad_norm": 0.9765625, |
| "learning_rate": 1.454044298620566e-05, |
| "loss": 0.09411467, |
| "memory(GiB)": 74.75, |
| "step": 26900, |
| "train_speed(iter/s)": 0.768473 |
| }, |
| { |
| "epoch": 2.5065176908752327, |
| "grad_norm": 1.0, |
| "learning_rate": 1.443402631365669e-05, |
| "loss": 0.08597951, |
| "memory(GiB)": 74.75, |
| "step": 26920, |
| "train_speed(iter/s)": 0.768502 |
| }, |
| { |
| "epoch": 2.5083798882681565, |
| "grad_norm": 1.3515625, |
| "learning_rate": 1.4327970188532702e-05, |
| "loss": 0.10816082, |
| "memory(GiB)": 74.75, |
| "step": 26940, |
| "train_speed(iter/s)": 0.768536 |
| }, |
| { |
| "epoch": 2.51024208566108, |
| "grad_norm": 1.3046875, |
| "learning_rate": 1.4222275057720024e-05, |
| "loss": 0.08901605, |
| "memory(GiB)": 74.75, |
| "step": 26960, |
| "train_speed(iter/s)": 0.768546 |
| }, |
| { |
| "epoch": 2.5121042830540037, |
| "grad_norm": 1.1015625, |
| "learning_rate": 1.4116941366583802e-05, |
| "loss": 0.0930239, |
| "memory(GiB)": 74.75, |
| "step": 26980, |
| "train_speed(iter/s)": 0.768569 |
| }, |
| { |
| "epoch": 2.5139664804469275, |
| "grad_norm": 0.9609375, |
| "learning_rate": 1.4011969558966331e-05, |
| "loss": 0.10327775, |
| "memory(GiB)": 74.75, |
| "step": 27000, |
| "train_speed(iter/s)": 0.768586 |
| }, |
| { |
| "epoch": 2.515828677839851, |
| "grad_norm": 1.28125, |
| "learning_rate": 1.3907360077184906e-05, |
| "loss": 0.10326664, |
| "memory(GiB)": 74.75, |
| "step": 27020, |
| "train_speed(iter/s)": 0.768624 |
| }, |
| { |
| "epoch": 2.5176908752327747, |
| "grad_norm": 1.3828125, |
| "learning_rate": 1.3803113362030151e-05, |
| "loss": 0.09636109, |
| "memory(GiB)": 74.75, |
| "step": 27040, |
| "train_speed(iter/s)": 0.76864 |
| }, |
| { |
| "epoch": 2.5195530726256985, |
| "grad_norm": 0.99609375, |
| "learning_rate": 1.3699229852764172e-05, |
| "loss": 0.10351133, |
| "memory(GiB)": 74.75, |
| "step": 27060, |
| "train_speed(iter/s)": 0.768667 |
| }, |
| { |
| "epoch": 2.521415270018622, |
| "grad_norm": 1.0390625, |
| "learning_rate": 1.3595709987118487e-05, |
| "loss": 0.09553545, |
| "memory(GiB)": 74.75, |
| "step": 27080, |
| "train_speed(iter/s)": 0.76871 |
| }, |
| { |
| "epoch": 2.5232774674115457, |
| "grad_norm": 0.921875, |
| "learning_rate": 1.349255420129253e-05, |
| "loss": 0.08632871, |
| "memory(GiB)": 74.75, |
| "step": 27100, |
| "train_speed(iter/s)": 0.768728 |
| }, |
| { |
| "epoch": 2.5251396648044695, |
| "grad_norm": 0.96484375, |
| "learning_rate": 1.3389762929951467e-05, |
| "loss": 0.10044611, |
| "memory(GiB)": 74.75, |
| "step": 27120, |
| "train_speed(iter/s)": 0.768769 |
| }, |
| { |
| "epoch": 2.527001862197393, |
| "grad_norm": 0.8515625, |
| "learning_rate": 1.3287336606224576e-05, |
| "loss": 0.09492825, |
| "memory(GiB)": 74.75, |
| "step": 27140, |
| "train_speed(iter/s)": 0.76879 |
| }, |
| { |
| "epoch": 2.5288640595903167, |
| "grad_norm": 1.1640625, |
| "learning_rate": 1.318527566170339e-05, |
| "loss": 0.10425776, |
| "memory(GiB)": 74.75, |
| "step": 27160, |
| "train_speed(iter/s)": 0.768817 |
| }, |
| { |
| "epoch": 2.5307262569832405, |
| "grad_norm": 1.1328125, |
| "learning_rate": 1.3083580526439787e-05, |
| "loss": 0.10316639, |
| "memory(GiB)": 74.75, |
| "step": 27180, |
| "train_speed(iter/s)": 0.768838 |
| }, |
| { |
| "epoch": 2.532588454376164, |
| "grad_norm": 1.171875, |
| "learning_rate": 1.298225162894433e-05, |
| "loss": 0.08870911, |
| "memory(GiB)": 74.75, |
| "step": 27200, |
| "train_speed(iter/s)": 0.768852 |
| }, |
| { |
| "epoch": 2.5344506517690877, |
| "grad_norm": 1.0, |
| "learning_rate": 1.288128939618426e-05, |
| "loss": 0.09396772, |
| "memory(GiB)": 74.75, |
| "step": 27220, |
| "train_speed(iter/s)": 0.768884 |
| }, |
| { |
| "epoch": 2.536312849162011, |
| "grad_norm": 0.90625, |
| "learning_rate": 1.2780694253581905e-05, |
| "loss": 0.09058628, |
| "memory(GiB)": 74.75, |
| "step": 27240, |
| "train_speed(iter/s)": 0.768915 |
| }, |
| { |
| "epoch": 2.538175046554935, |
| "grad_norm": 1.171875, |
| "learning_rate": 1.2680466625012766e-05, |
| "loss": 0.09303147, |
| "memory(GiB)": 74.75, |
| "step": 27260, |
| "train_speed(iter/s)": 0.76895 |
| }, |
| { |
| "epoch": 2.5400372439478582, |
| "grad_norm": 1.109375, |
| "learning_rate": 1.2580606932803762e-05, |
| "loss": 0.09020774, |
| "memory(GiB)": 74.75, |
| "step": 27280, |
| "train_speed(iter/s)": 0.768973 |
| }, |
| { |
| "epoch": 2.541899441340782, |
| "grad_norm": 1.0546875, |
| "learning_rate": 1.2481115597731385e-05, |
| "loss": 0.0921455, |
| "memory(GiB)": 74.75, |
| "step": 27300, |
| "train_speed(iter/s)": 0.769017 |
| }, |
| { |
| "epoch": 2.543761638733706, |
| "grad_norm": 0.80078125, |
| "learning_rate": 1.2381993039020123e-05, |
| "loss": 0.10183475, |
| "memory(GiB)": 74.75, |
| "step": 27320, |
| "train_speed(iter/s)": 0.769039 |
| }, |
| { |
| "epoch": 2.5456238361266292, |
| "grad_norm": 0.828125, |
| "learning_rate": 1.228323967434043e-05, |
| "loss": 0.08973705, |
| "memory(GiB)": 74.75, |
| "step": 27340, |
| "train_speed(iter/s)": 0.769069 |
| }, |
| { |
| "epoch": 2.547486033519553, |
| "grad_norm": 1.0234375, |
| "learning_rate": 1.2184855919807148e-05, |
| "loss": 0.08727324, |
| "memory(GiB)": 74.75, |
| "step": 27360, |
| "train_speed(iter/s)": 0.769103 |
| }, |
| { |
| "epoch": 2.549348230912477, |
| "grad_norm": 1.0234375, |
| "learning_rate": 1.2086842189977698e-05, |
| "loss": 0.09381813, |
| "memory(GiB)": 74.75, |
| "step": 27380, |
| "train_speed(iter/s)": 0.769124 |
| }, |
| { |
| "epoch": 2.5512104283054002, |
| "grad_norm": 1.0703125, |
| "learning_rate": 1.1989198897850352e-05, |
| "loss": 0.09397247, |
| "memory(GiB)": 74.75, |
| "step": 27400, |
| "train_speed(iter/s)": 0.769158 |
| }, |
| { |
| "epoch": 2.553072625698324, |
| "grad_norm": 1.078125, |
| "learning_rate": 1.1891926454862468e-05, |
| "loss": 0.10348772, |
| "memory(GiB)": 74.75, |
| "step": 27420, |
| "train_speed(iter/s)": 0.769173 |
| }, |
| { |
| "epoch": 2.554934823091248, |
| "grad_norm": 1.0078125, |
| "learning_rate": 1.1795025270888726e-05, |
| "loss": 0.10088123, |
| "memory(GiB)": 74.75, |
| "step": 27440, |
| "train_speed(iter/s)": 0.769208 |
| }, |
| { |
| "epoch": 2.5567970204841712, |
| "grad_norm": 1.2890625, |
| "learning_rate": 1.169849575423948e-05, |
| "loss": 0.10310076, |
| "memory(GiB)": 74.75, |
| "step": 27460, |
| "train_speed(iter/s)": 0.769257 |
| }, |
| { |
| "epoch": 2.558659217877095, |
| "grad_norm": 1.0, |
| "learning_rate": 1.1602338311659012e-05, |
| "loss": 0.08614667, |
| "memory(GiB)": 74.75, |
| "step": 27480, |
| "train_speed(iter/s)": 0.769299 |
| }, |
| { |
| "epoch": 2.560521415270019, |
| "grad_norm": 1.203125, |
| "learning_rate": 1.1506553348323757e-05, |
| "loss": 0.10040137, |
| "memory(GiB)": 74.75, |
| "step": 27500, |
| "train_speed(iter/s)": 0.769316 |
| }, |
| { |
| "epoch": 2.5623836126629422, |
| "grad_norm": 1.0234375, |
| "learning_rate": 1.1411141267840675e-05, |
| "loss": 0.08545375, |
| "memory(GiB)": 74.75, |
| "step": 27520, |
| "train_speed(iter/s)": 0.769353 |
| }, |
| { |
| "epoch": 2.564245810055866, |
| "grad_norm": 0.95703125, |
| "learning_rate": 1.131610247224555e-05, |
| "loss": 0.1016354, |
| "memory(GiB)": 74.75, |
| "step": 27540, |
| "train_speed(iter/s)": 0.769373 |
| }, |
| { |
| "epoch": 2.5661080074487894, |
| "grad_norm": 0.9609375, |
| "learning_rate": 1.1221437362001153e-05, |
| "loss": 0.09240752, |
| "memory(GiB)": 74.75, |
| "step": 27560, |
| "train_speed(iter/s)": 0.769391 |
| }, |
| { |
| "epoch": 2.5679702048417132, |
| "grad_norm": 1.0625, |
| "learning_rate": 1.1127146335995774e-05, |
| "loss": 0.09214135, |
| "memory(GiB)": 74.75, |
| "step": 27580, |
| "train_speed(iter/s)": 0.76942 |
| }, |
| { |
| "epoch": 2.5698324022346366, |
| "grad_norm": 1.1015625, |
| "learning_rate": 1.1033229791541422e-05, |
| "loss": 0.10229049, |
| "memory(GiB)": 74.75, |
| "step": 27600, |
| "train_speed(iter/s)": 0.769439 |
| }, |
| { |
| "epoch": 2.5716945996275604, |
| "grad_norm": 1.4609375, |
| "learning_rate": 1.093968812437206e-05, |
| "loss": 0.11024711, |
| "memory(GiB)": 74.75, |
| "step": 27620, |
| "train_speed(iter/s)": 0.769455 |
| }, |
| { |
| "epoch": 2.5735567970204842, |
| "grad_norm": 1.09375, |
| "learning_rate": 1.0846521728642201e-05, |
| "loss": 0.10305281, |
| "memory(GiB)": 74.75, |
| "step": 27640, |
| "train_speed(iter/s)": 0.769477 |
| }, |
| { |
| "epoch": 2.5754189944134076, |
| "grad_norm": 0.95703125, |
| "learning_rate": 1.0753730996924926e-05, |
| "loss": 0.10268775, |
| "memory(GiB)": 74.75, |
| "step": 27660, |
| "train_speed(iter/s)": 0.769492 |
| }, |
| { |
| "epoch": 2.5772811918063314, |
| "grad_norm": 0.83203125, |
| "learning_rate": 1.0661316320210524e-05, |
| "loss": 0.08912667, |
| "memory(GiB)": 74.75, |
| "step": 27680, |
| "train_speed(iter/s)": 0.769514 |
| }, |
| { |
| "epoch": 2.5791433891992552, |
| "grad_norm": 1.03125, |
| "learning_rate": 1.0569278087904588e-05, |
| "loss": 0.09018875, |
| "memory(GiB)": 74.75, |
| "step": 27700, |
| "train_speed(iter/s)": 0.769546 |
| }, |
| { |
| "epoch": 2.5810055865921786, |
| "grad_norm": 1.15625, |
| "learning_rate": 1.0477616687826596e-05, |
| "loss": 0.08439552, |
| "memory(GiB)": 74.75, |
| "step": 27720, |
| "train_speed(iter/s)": 0.76954 |
| }, |
| { |
| "epoch": 2.5828677839851024, |
| "grad_norm": 0.84765625, |
| "learning_rate": 1.038633250620813e-05, |
| "loss": 0.10151093, |
| "memory(GiB)": 74.75, |
| "step": 27740, |
| "train_speed(iter/s)": 0.76954 |
| }, |
| { |
| "epoch": 2.5847299813780262, |
| "grad_norm": 1.0546875, |
| "learning_rate": 1.029542592769126e-05, |
| "loss": 0.10534508, |
| "memory(GiB)": 74.75, |
| "step": 27760, |
| "train_speed(iter/s)": 0.769567 |
| }, |
| { |
| "epoch": 2.5865921787709496, |
| "grad_norm": 1.03125, |
| "learning_rate": 1.0204897335327047e-05, |
| "loss": 0.09673885, |
| "memory(GiB)": 74.75, |
| "step": 27780, |
| "train_speed(iter/s)": 0.769612 |
| }, |
| { |
| "epoch": 2.5884543761638734, |
| "grad_norm": 1.453125, |
| "learning_rate": 1.0114747110573753e-05, |
| "loss": 0.10991576, |
| "memory(GiB)": 74.75, |
| "step": 27800, |
| "train_speed(iter/s)": 0.769639 |
| }, |
| { |
| "epoch": 2.5903165735567972, |
| "grad_norm": 1.1796875, |
| "learning_rate": 1.0024975633295386e-05, |
| "loss": 0.08816065, |
| "memory(GiB)": 74.75, |
| "step": 27820, |
| "train_speed(iter/s)": 0.769669 |
| }, |
| { |
| "epoch": 2.5921787709497206, |
| "grad_norm": 0.87890625, |
| "learning_rate": 9.935583281759986e-06, |
| "loss": 0.10151916, |
| "memory(GiB)": 74.75, |
| "step": 27840, |
| "train_speed(iter/s)": 0.769699 |
| }, |
| { |
| "epoch": 2.5940409683426444, |
| "grad_norm": 0.96484375, |
| "learning_rate": 9.846570432638147e-06, |
| "loss": 0.10038137, |
| "memory(GiB)": 74.75, |
| "step": 27860, |
| "train_speed(iter/s)": 0.769711 |
| }, |
| { |
| "epoch": 2.5959031657355682, |
| "grad_norm": 0.98828125, |
| "learning_rate": 9.757937461001244e-06, |
| "loss": 0.08528215, |
| "memory(GiB)": 74.75, |
| "step": 27880, |
| "train_speed(iter/s)": 0.769713 |
| }, |
| { |
| "epoch": 2.5977653631284916, |
| "grad_norm": 0.85546875, |
| "learning_rate": 9.669684740320096e-06, |
| "loss": 0.08982363, |
| "memory(GiB)": 74.75, |
| "step": 27900, |
| "train_speed(iter/s)": 0.76973 |
| }, |
| { |
| "epoch": 2.5996275605214154, |
| "grad_norm": 1.125, |
| "learning_rate": 9.581812642463206e-06, |
| "loss": 0.11246864, |
| "memory(GiB)": 74.75, |
| "step": 27920, |
| "train_speed(iter/s)": 0.769746 |
| }, |
| { |
| "epoch": 2.601489757914339, |
| "grad_norm": 1.0859375, |
| "learning_rate": 9.494321537695216e-06, |
| "loss": 0.100134, |
| "memory(GiB)": 74.75, |
| "step": 27940, |
| "train_speed(iter/s)": 0.769769 |
| }, |
| { |
| "epoch": 2.6033519553072626, |
| "grad_norm": 1.0546875, |
| "learning_rate": 9.407211794675508e-06, |
| "loss": 0.09557046, |
| "memory(GiB)": 74.75, |
| "step": 27960, |
| "train_speed(iter/s)": 0.76978 |
| }, |
| { |
| "epoch": 2.605214152700186, |
| "grad_norm": 0.83984375, |
| "learning_rate": 9.32048378045638e-06, |
| "loss": 0.09010305, |
| "memory(GiB)": 74.75, |
| "step": 27980, |
| "train_speed(iter/s)": 0.769779 |
| }, |
| { |
| "epoch": 2.60707635009311, |
| "grad_norm": 0.91015625, |
| "learning_rate": 9.234137860481796e-06, |
| "loss": 0.09046291, |
| "memory(GiB)": 45.75, |
| "step": 28000, |
| "train_speed(iter/s)": 0.769799 |
| }, |
| { |
| "epoch": 2.60707635009311, |
| "eval_loss": 0.38696086406707764, |
| "eval_runtime": 77.0234, |
| "eval_samples_per_second": 180.27, |
| "eval_steps_per_second": 1.415, |
| "step": 28000 |
| }, |
| { |
| "epoch": 2.6089385474860336, |
| "grad_norm": 0.97265625, |
| "learning_rate": 9.148174398585564e-06, |
| "loss": 0.09225785, |
| "memory(GiB)": 45.75, |
| "step": 28020, |
| "train_speed(iter/s)": 0.767502 |
| }, |
| { |
| "epoch": 2.610800744878957, |
| "grad_norm": 0.89453125, |
| "learning_rate": 9.062593756990012e-06, |
| "loss": 0.10355859, |
| "memory(GiB)": 45.75, |
| "step": 28040, |
| "train_speed(iter/s)": 0.767513 |
| }, |
| { |
| "epoch": 2.612662942271881, |
| "grad_norm": 1.0546875, |
| "learning_rate": 8.977396296304385e-06, |
| "loss": 0.09487821, |
| "memory(GiB)": 45.75, |
| "step": 28060, |
| "train_speed(iter/s)": 0.767532 |
| }, |
| { |
| "epoch": 2.6145251396648046, |
| "grad_norm": 0.86328125, |
| "learning_rate": 8.892582375523295e-06, |
| "loss": 0.08763714, |
| "memory(GiB)": 45.75, |
| "step": 28080, |
| "train_speed(iter/s)": 0.767557 |
| }, |
| { |
| "epoch": 2.616387337057728, |
| "grad_norm": 0.78515625, |
| "learning_rate": 8.808152352025267e-06, |
| "loss": 0.09426429, |
| "memory(GiB)": 45.75, |
| "step": 28100, |
| "train_speed(iter/s)": 0.767589 |
| }, |
| { |
| "epoch": 2.618249534450652, |
| "grad_norm": 1.0078125, |
| "learning_rate": 8.7241065815712e-06, |
| "loss": 0.10946054, |
| "memory(GiB)": 45.75, |
| "step": 28120, |
| "train_speed(iter/s)": 0.767604 |
| }, |
| { |
| "epoch": 2.6201117318435756, |
| "grad_norm": 1.03125, |
| "learning_rate": 8.640445418302856e-06, |
| "loss": 0.1012117, |
| "memory(GiB)": 45.75, |
| "step": 28140, |
| "train_speed(iter/s)": 0.767624 |
| }, |
| { |
| "epoch": 2.621973929236499, |
| "grad_norm": 1.1640625, |
| "learning_rate": 8.557169214741435e-06, |
| "loss": 0.09337143, |
| "memory(GiB)": 45.75, |
| "step": 28160, |
| "train_speed(iter/s)": 0.767651 |
| }, |
| { |
| "epoch": 2.623836126629423, |
| "grad_norm": 1.484375, |
| "learning_rate": 8.474278321785988e-06, |
| "loss": 0.08732111, |
| "memory(GiB)": 45.75, |
| "step": 28180, |
| "train_speed(iter/s)": 0.767693 |
| }, |
| { |
| "epoch": 2.6256983240223466, |
| "grad_norm": 1.09375, |
| "learning_rate": 8.391773088712018e-06, |
| "loss": 0.09296905, |
| "memory(GiB)": 45.75, |
| "step": 28200, |
| "train_speed(iter/s)": 0.767704 |
| }, |
| { |
| "epoch": 2.62756052141527, |
| "grad_norm": 0.98046875, |
| "learning_rate": 8.309653863169964e-06, |
| "loss": 0.09660787, |
| "memory(GiB)": 45.75, |
| "step": 28220, |
| "train_speed(iter/s)": 0.767745 |
| }, |
| { |
| "epoch": 2.629422718808194, |
| "grad_norm": 1.0078125, |
| "learning_rate": 8.22792099118379e-06, |
| "loss": 0.08955661, |
| "memory(GiB)": 45.75, |
| "step": 28240, |
| "train_speed(iter/s)": 0.767763 |
| }, |
| { |
| "epoch": 2.631284916201117, |
| "grad_norm": 0.890625, |
| "learning_rate": 8.146574817149411e-06, |
| "loss": 0.09369433, |
| "memory(GiB)": 45.75, |
| "step": 28260, |
| "train_speed(iter/s)": 0.767786 |
| }, |
| { |
| "epoch": 2.633147113594041, |
| "grad_norm": 0.98046875, |
| "learning_rate": 8.065615683833462e-06, |
| "loss": 0.09985462, |
| "memory(GiB)": 45.75, |
| "step": 28280, |
| "train_speed(iter/s)": 0.767812 |
| }, |
| { |
| "epoch": 2.635009310986965, |
| "grad_norm": 1.078125, |
| "learning_rate": 7.985043932371539e-06, |
| "loss": 0.09353604, |
| "memory(GiB)": 45.75, |
| "step": 28300, |
| "train_speed(iter/s)": 0.767841 |
| }, |
| { |
| "epoch": 2.636871508379888, |
| "grad_norm": 1.0078125, |
| "learning_rate": 7.904859902267114e-06, |
| "loss": 0.08616074, |
| "memory(GiB)": 45.75, |
| "step": 28320, |
| "train_speed(iter/s)": 0.767875 |
| }, |
| { |
| "epoch": 2.638733705772812, |
| "grad_norm": 1.234375, |
| "learning_rate": 7.825063931389764e-06, |
| "loss": 0.09571852, |
| "memory(GiB)": 45.75, |
| "step": 28340, |
| "train_speed(iter/s)": 0.767919 |
| }, |
| { |
| "epoch": 2.6405959031657353, |
| "grad_norm": 1.328125, |
| "learning_rate": 7.745656355974018e-06, |
| "loss": 0.10770638, |
| "memory(GiB)": 45.75, |
| "step": 28360, |
| "train_speed(iter/s)": 0.767942 |
| }, |
| { |
| "epoch": 2.642458100558659, |
| "grad_norm": 1.1015625, |
| "learning_rate": 7.666637510617791e-06, |
| "loss": 0.08916503, |
| "memory(GiB)": 45.75, |
| "step": 28380, |
| "train_speed(iter/s)": 0.767976 |
| }, |
| { |
| "epoch": 2.644320297951583, |
| "grad_norm": 1.15625, |
| "learning_rate": 7.58800772828101e-06, |
| "loss": 0.1024822, |
| "memory(GiB)": 45.75, |
| "step": 28400, |
| "train_speed(iter/s)": 0.768012 |
| }, |
| { |
| "epoch": 2.6461824953445063, |
| "grad_norm": 1.109375, |
| "learning_rate": 7.509767340284235e-06, |
| "loss": 0.08160416, |
| "memory(GiB)": 45.75, |
| "step": 28420, |
| "train_speed(iter/s)": 0.768045 |
| }, |
| { |
| "epoch": 2.64804469273743, |
| "grad_norm": 1.046875, |
| "learning_rate": 7.431916676307238e-06, |
| "loss": 0.09086616, |
| "memory(GiB)": 45.75, |
| "step": 28440, |
| "train_speed(iter/s)": 0.768057 |
| }, |
| { |
| "epoch": 2.649906890130354, |
| "grad_norm": 1.0859375, |
| "learning_rate": 7.354456064387605e-06, |
| "loss": 0.10261321, |
| "memory(GiB)": 45.75, |
| "step": 28460, |
| "train_speed(iter/s)": 0.768083 |
| }, |
| { |
| "epoch": 2.6517690875232773, |
| "grad_norm": 1.2734375, |
| "learning_rate": 7.277385830919381e-06, |
| "loss": 0.09744512, |
| "memory(GiB)": 45.75, |
| "step": 28480, |
| "train_speed(iter/s)": 0.768101 |
| }, |
| { |
| "epoch": 2.653631284916201, |
| "grad_norm": 0.98046875, |
| "learning_rate": 7.200706300651705e-06, |
| "loss": 0.09409308, |
| "memory(GiB)": 45.75, |
| "step": 28500, |
| "train_speed(iter/s)": 0.768142 |
| }, |
| { |
| "epoch": 2.655493482309125, |
| "grad_norm": 1.1484375, |
| "learning_rate": 7.124417796687366e-06, |
| "loss": 0.09326741, |
| "memory(GiB)": 45.75, |
| "step": 28520, |
| "train_speed(iter/s)": 0.768152 |
| }, |
| { |
| "epoch": 2.6573556797020483, |
| "grad_norm": 0.78125, |
| "learning_rate": 7.048520640481526e-06, |
| "loss": 0.097636, |
| "memory(GiB)": 45.75, |
| "step": 28540, |
| "train_speed(iter/s)": 0.768179 |
| }, |
| { |
| "epoch": 2.659217877094972, |
| "grad_norm": 1.125, |
| "learning_rate": 6.973015151840334e-06, |
| "loss": 0.09104769, |
| "memory(GiB)": 45.75, |
| "step": 28560, |
| "train_speed(iter/s)": 0.768207 |
| }, |
| { |
| "epoch": 2.661080074487896, |
| "grad_norm": 1.125, |
| "learning_rate": 6.897901648919591e-06, |
| "loss": 0.1032981, |
| "memory(GiB)": 45.75, |
| "step": 28580, |
| "train_speed(iter/s)": 0.768224 |
| }, |
| { |
| "epoch": 2.6629422718808193, |
| "grad_norm": 0.8828125, |
| "learning_rate": 6.8231804482233655e-06, |
| "loss": 0.08662249, |
| "memory(GiB)": 45.75, |
| "step": 28600, |
| "train_speed(iter/s)": 0.768234 |
| }, |
| { |
| "epoch": 2.664804469273743, |
| "grad_norm": 1.1640625, |
| "learning_rate": 6.74885186460269e-06, |
| "loss": 0.1023387, |
| "memory(GiB)": 45.75, |
| "step": 28620, |
| "train_speed(iter/s)": 0.768249 |
| }, |
| { |
| "epoch": 2.6666666666666665, |
| "grad_norm": 1.2890625, |
| "learning_rate": 6.674916211254289e-06, |
| "loss": 0.09208263, |
| "memory(GiB)": 45.75, |
| "step": 28640, |
| "train_speed(iter/s)": 0.768252 |
| }, |
| { |
| "epoch": 2.6685288640595903, |
| "grad_norm": 1.1875, |
| "learning_rate": 6.601373799719135e-06, |
| "loss": 0.09997191, |
| "memory(GiB)": 45.75, |
| "step": 28660, |
| "train_speed(iter/s)": 0.768271 |
| }, |
| { |
| "epoch": 2.6703910614525137, |
| "grad_norm": 1.0625, |
| "learning_rate": 6.528224939881256e-06, |
| "loss": 0.09428074, |
| "memory(GiB)": 45.75, |
| "step": 28680, |
| "train_speed(iter/s)": 0.768303 |
| }, |
| { |
| "epoch": 2.6722532588454375, |
| "grad_norm": 0.984375, |
| "learning_rate": 6.455469939966352e-06, |
| "loss": 0.09120364, |
| "memory(GiB)": 45.75, |
| "step": 28700, |
| "train_speed(iter/s)": 0.768333 |
| }, |
| { |
| "epoch": 2.6741154562383613, |
| "grad_norm": 1.1171875, |
| "learning_rate": 6.383109106540552e-06, |
| "loss": 0.09465919, |
| "memory(GiB)": 45.75, |
| "step": 28720, |
| "train_speed(iter/s)": 0.768359 |
| }, |
| { |
| "epoch": 2.6759776536312847, |
| "grad_norm": 1.515625, |
| "learning_rate": 6.311142744509058e-06, |
| "loss": 0.10171504, |
| "memory(GiB)": 45.75, |
| "step": 28740, |
| "train_speed(iter/s)": 0.768388 |
| }, |
| { |
| "epoch": 2.6778398510242085, |
| "grad_norm": 0.9921875, |
| "learning_rate": 6.239571157114921e-06, |
| "loss": 0.08548477, |
| "memory(GiB)": 45.75, |
| "step": 28760, |
| "train_speed(iter/s)": 0.768436 |
| }, |
| { |
| "epoch": 2.6797020484171323, |
| "grad_norm": 1.140625, |
| "learning_rate": 6.168394645937725e-06, |
| "loss": 0.10516393, |
| "memory(GiB)": 45.75, |
| "step": 28780, |
| "train_speed(iter/s)": 0.768465 |
| }, |
| { |
| "epoch": 2.6815642458100557, |
| "grad_norm": 0.96484375, |
| "learning_rate": 6.0976135108923636e-06, |
| "loss": 0.10106995, |
| "memory(GiB)": 45.75, |
| "step": 28800, |
| "train_speed(iter/s)": 0.768484 |
| }, |
| { |
| "epoch": 2.6834264432029795, |
| "grad_norm": 0.83984375, |
| "learning_rate": 6.027228050227718e-06, |
| "loss": 0.08965033, |
| "memory(GiB)": 45.75, |
| "step": 28820, |
| "train_speed(iter/s)": 0.768529 |
| }, |
| { |
| "epoch": 2.6852886405959033, |
| "grad_norm": 0.8828125, |
| "learning_rate": 5.957238560525391e-06, |
| "loss": 0.09630392, |
| "memory(GiB)": 45.75, |
| "step": 28840, |
| "train_speed(iter/s)": 0.768562 |
| }, |
| { |
| "epoch": 2.6871508379888267, |
| "grad_norm": 1.15625, |
| "learning_rate": 5.887645336698566e-06, |
| "loss": 0.08954536, |
| "memory(GiB)": 45.75, |
| "step": 28860, |
| "train_speed(iter/s)": 0.768579 |
| }, |
| { |
| "epoch": 2.6890130353817505, |
| "grad_norm": 0.85546875, |
| "learning_rate": 5.818448671990628e-06, |
| "loss": 0.10974171, |
| "memory(GiB)": 45.75, |
| "step": 28880, |
| "train_speed(iter/s)": 0.768584 |
| }, |
| { |
| "epoch": 2.6908752327746743, |
| "grad_norm": 1.15625, |
| "learning_rate": 5.7496488579740105e-06, |
| "loss": 0.10110396, |
| "memory(GiB)": 45.75, |
| "step": 28900, |
| "train_speed(iter/s)": 0.768602 |
| }, |
| { |
| "epoch": 2.6927374301675977, |
| "grad_norm": 0.98046875, |
| "learning_rate": 5.681246184548972e-06, |
| "loss": 0.11610439, |
| "memory(GiB)": 45.75, |
| "step": 28920, |
| "train_speed(iter/s)": 0.768609 |
| }, |
| { |
| "epoch": 2.6945996275605215, |
| "grad_norm": 0.9609375, |
| "learning_rate": 5.613240939942277e-06, |
| "loss": 0.09422116, |
| "memory(GiB)": 45.75, |
| "step": 28940, |
| "train_speed(iter/s)": 0.768625 |
| }, |
| { |
| "epoch": 2.6964618249534453, |
| "grad_norm": 0.75, |
| "learning_rate": 5.5456334107061635e-06, |
| "loss": 0.10199471, |
| "memory(GiB)": 45.75, |
| "step": 28960, |
| "train_speed(iter/s)": 0.768654 |
| }, |
| { |
| "epoch": 2.6983240223463687, |
| "grad_norm": 1.1328125, |
| "learning_rate": 5.478423881716899e-06, |
| "loss": 0.0880377, |
| "memory(GiB)": 45.75, |
| "step": 28980, |
| "train_speed(iter/s)": 0.768691 |
| }, |
| { |
| "epoch": 2.7001862197392925, |
| "grad_norm": 1.09375, |
| "learning_rate": 5.4116126361738125e-06, |
| "loss": 0.08939071, |
| "memory(GiB)": 45.75, |
| "step": 29000, |
| "train_speed(iter/s)": 0.76867 |
| }, |
| { |
| "epoch": 2.702048417132216, |
| "grad_norm": 1.1484375, |
| "learning_rate": 5.345199955597924e-06, |
| "loss": 0.10153615, |
| "memory(GiB)": 45.75, |
| "step": 29020, |
| "train_speed(iter/s)": 0.768684 |
| }, |
| { |
| "epoch": 2.7039106145251397, |
| "grad_norm": 0.8359375, |
| "learning_rate": 5.279186119830892e-06, |
| "loss": 0.09276768, |
| "memory(GiB)": 45.75, |
| "step": 29040, |
| "train_speed(iter/s)": 0.768714 |
| }, |
| { |
| "epoch": 2.705772811918063, |
| "grad_norm": 0.890625, |
| "learning_rate": 5.21357140703369e-06, |
| "loss": 0.09366176, |
| "memory(GiB)": 45.75, |
| "step": 29060, |
| "train_speed(iter/s)": 0.768739 |
| }, |
| { |
| "epoch": 2.707635009310987, |
| "grad_norm": 1.125, |
| "learning_rate": 5.148356093685591e-06, |
| "loss": 0.09835015, |
| "memory(GiB)": 45.75, |
| "step": 29080, |
| "train_speed(iter/s)": 0.768747 |
| }, |
| { |
| "epoch": 2.7094972067039107, |
| "grad_norm": 1.0078125, |
| "learning_rate": 5.083540454582891e-06, |
| "loss": 0.09732973, |
| "memory(GiB)": 45.75, |
| "step": 29100, |
| "train_speed(iter/s)": 0.768793 |
| }, |
| { |
| "epoch": 2.711359404096834, |
| "grad_norm": 0.90625, |
| "learning_rate": 5.019124762837801e-06, |
| "loss": 0.09556629, |
| "memory(GiB)": 45.75, |
| "step": 29120, |
| "train_speed(iter/s)": 0.768826 |
| }, |
| { |
| "epoch": 2.713221601489758, |
| "grad_norm": 0.84375, |
| "learning_rate": 4.95510928987728e-06, |
| "loss": 0.10411631, |
| "memory(GiB)": 45.75, |
| "step": 29140, |
| "train_speed(iter/s)": 0.768842 |
| }, |
| { |
| "epoch": 2.7150837988826817, |
| "grad_norm": 1.4765625, |
| "learning_rate": 4.891494305441868e-06, |
| "loss": 0.09335988, |
| "memory(GiB)": 45.75, |
| "step": 29160, |
| "train_speed(iter/s)": 0.768872 |
| }, |
| { |
| "epoch": 2.716945996275605, |
| "grad_norm": 0.953125, |
| "learning_rate": 4.8282800775846345e-06, |
| "loss": 0.08496674, |
| "memory(GiB)": 45.75, |
| "step": 29180, |
| "train_speed(iter/s)": 0.768898 |
| }, |
| { |
| "epoch": 2.718808193668529, |
| "grad_norm": 0.984375, |
| "learning_rate": 4.765466872669921e-06, |
| "loss": 0.10083444, |
| "memory(GiB)": 45.75, |
| "step": 29200, |
| "train_speed(iter/s)": 0.768937 |
| }, |
| { |
| "epoch": 2.7206703910614527, |
| "grad_norm": 1.3046875, |
| "learning_rate": 4.703054955372344e-06, |
| "loss": 0.11718053, |
| "memory(GiB)": 45.75, |
| "step": 29220, |
| "train_speed(iter/s)": 0.768969 |
| }, |
| { |
| "epoch": 2.722532588454376, |
| "grad_norm": 0.9453125, |
| "learning_rate": 4.6410445886755914e-06, |
| "loss": 0.09782855, |
| "memory(GiB)": 45.75, |
| "step": 29240, |
| "train_speed(iter/s)": 0.768995 |
| }, |
| { |
| "epoch": 2.7243947858473, |
| "grad_norm": 0.73828125, |
| "learning_rate": 4.579436033871342e-06, |
| "loss": 0.09005135, |
| "memory(GiB)": 45.75, |
| "step": 29260, |
| "train_speed(iter/s)": 0.76898 |
| }, |
| { |
| "epoch": 2.7262569832402237, |
| "grad_norm": 1.125, |
| "learning_rate": 4.518229550558218e-06, |
| "loss": 0.1048403, |
| "memory(GiB)": 45.75, |
| "step": 29280, |
| "train_speed(iter/s)": 0.769008 |
| }, |
| { |
| "epoch": 2.728119180633147, |
| "grad_norm": 0.90234375, |
| "learning_rate": 4.457425396640558e-06, |
| "loss": 0.08753203, |
| "memory(GiB)": 45.75, |
| "step": 29300, |
| "train_speed(iter/s)": 0.769034 |
| }, |
| { |
| "epoch": 2.729981378026071, |
| "grad_norm": 1.140625, |
| "learning_rate": 4.397023828327496e-06, |
| "loss": 0.1012768, |
| "memory(GiB)": 45.75, |
| "step": 29320, |
| "train_speed(iter/s)": 0.76906 |
| }, |
| { |
| "epoch": 2.7318435754189943, |
| "grad_norm": 1.15625, |
| "learning_rate": 4.337025100131764e-06, |
| "loss": 0.09988154, |
| "memory(GiB)": 45.75, |
| "step": 29340, |
| "train_speed(iter/s)": 0.769086 |
| }, |
| { |
| "epoch": 2.733705772811918, |
| "grad_norm": 0.74609375, |
| "learning_rate": 4.277429464868654e-06, |
| "loss": 0.10575411, |
| "memory(GiB)": 45.75, |
| "step": 29360, |
| "train_speed(iter/s)": 0.769107 |
| }, |
| { |
| "epoch": 2.7355679702048414, |
| "grad_norm": 1.0546875, |
| "learning_rate": 4.2182371736549375e-06, |
| "loss": 0.0899995, |
| "memory(GiB)": 45.75, |
| "step": 29380, |
| "train_speed(iter/s)": 0.769123 |
| }, |
| { |
| "epoch": 2.7374301675977653, |
| "grad_norm": 0.9375, |
| "learning_rate": 4.159448475907879e-06, |
| "loss": 0.08668512, |
| "memory(GiB)": 45.75, |
| "step": 29400, |
| "train_speed(iter/s)": 0.76917 |
| }, |
| { |
| "epoch": 2.739292364990689, |
| "grad_norm": 0.98046875, |
| "learning_rate": 4.10106361934407e-06, |
| "loss": 0.09873737, |
| "memory(GiB)": 45.75, |
| "step": 29420, |
| "train_speed(iter/s)": 0.769201 |
| }, |
| { |
| "epoch": 2.7411545623836124, |
| "grad_norm": 1.1953125, |
| "learning_rate": 4.043082849978475e-06, |
| "loss": 0.11074016, |
| "memory(GiB)": 45.75, |
| "step": 29440, |
| "train_speed(iter/s)": 0.769209 |
| }, |
| { |
| "epoch": 2.7430167597765363, |
| "grad_norm": 1.2265625, |
| "learning_rate": 3.985506412123397e-06, |
| "loss": 0.10686636, |
| "memory(GiB)": 45.75, |
| "step": 29460, |
| "train_speed(iter/s)": 0.769252 |
| }, |
| { |
| "epoch": 2.74487895716946, |
| "grad_norm": 0.8203125, |
| "learning_rate": 3.928334548387313e-06, |
| "loss": 0.09317576, |
| "memory(GiB)": 45.75, |
| "step": 29480, |
| "train_speed(iter/s)": 0.769275 |
| }, |
| { |
| "epoch": 2.7467411545623834, |
| "grad_norm": 1.3671875, |
| "learning_rate": 3.8715674996740894e-06, |
| "loss": 0.10190638, |
| "memory(GiB)": 45.75, |
| "step": 29500, |
| "train_speed(iter/s)": 0.769309 |
| }, |
| { |
| "epoch": 2.7486033519553073, |
| "grad_norm": 1.109375, |
| "learning_rate": 3.8152055051817405e-06, |
| "loss": 0.1021665, |
| "memory(GiB)": 45.75, |
| "step": 29520, |
| "train_speed(iter/s)": 0.76935 |
| }, |
| { |
| "epoch": 2.750465549348231, |
| "grad_norm": 1.875, |
| "learning_rate": 3.7592488024015272e-06, |
| "loss": 0.09051661, |
| "memory(GiB)": 45.75, |
| "step": 29540, |
| "train_speed(iter/s)": 0.769382 |
| }, |
| { |
| "epoch": 2.7523277467411544, |
| "grad_norm": 0.9921875, |
| "learning_rate": 3.7036976271169864e-06, |
| "loss": 0.10837436, |
| "memory(GiB)": 45.75, |
| "step": 29560, |
| "train_speed(iter/s)": 0.769402 |
| }, |
| { |
| "epoch": 2.7541899441340782, |
| "grad_norm": 1.125, |
| "learning_rate": 3.6485522134028005e-06, |
| "loss": 0.09573312, |
| "memory(GiB)": 45.75, |
| "step": 29580, |
| "train_speed(iter/s)": 0.769426 |
| }, |
| { |
| "epoch": 2.756052141527002, |
| "grad_norm": 1.2265625, |
| "learning_rate": 3.593812793624041e-06, |
| "loss": 0.10327983, |
| "memory(GiB)": 45.75, |
| "step": 29600, |
| "train_speed(iter/s)": 0.76945 |
| }, |
| { |
| "epoch": 2.7579143389199254, |
| "grad_norm": 1.1015625, |
| "learning_rate": 3.5394795984349138e-06, |
| "loss": 0.09151828, |
| "memory(GiB)": 45.75, |
| "step": 29620, |
| "train_speed(iter/s)": 0.76947 |
| }, |
| { |
| "epoch": 2.7597765363128492, |
| "grad_norm": 0.8203125, |
| "learning_rate": 3.485552856778007e-06, |
| "loss": 0.09154891, |
| "memory(GiB)": 45.75, |
| "step": 29640, |
| "train_speed(iter/s)": 0.769473 |
| }, |
| { |
| "epoch": 2.761638733705773, |
| "grad_norm": 1.0, |
| "learning_rate": 3.432032795883222e-06, |
| "loss": 0.086561, |
| "memory(GiB)": 45.75, |
| "step": 29660, |
| "train_speed(iter/s)": 0.769506 |
| }, |
| { |
| "epoch": 2.7635009310986964, |
| "grad_norm": 1.296875, |
| "learning_rate": 3.3789196412668424e-06, |
| "loss": 0.08920512, |
| "memory(GiB)": 45.75, |
| "step": 29680, |
| "train_speed(iter/s)": 0.769517 |
| }, |
| { |
| "epoch": 2.7653631284916202, |
| "grad_norm": 1.0078125, |
| "learning_rate": 3.326213616730578e-06, |
| "loss": 0.09615018, |
| "memory(GiB)": 45.75, |
| "step": 29700, |
| "train_speed(iter/s)": 0.769545 |
| }, |
| { |
| "epoch": 2.7672253258845436, |
| "grad_norm": 0.93359375, |
| "learning_rate": 3.273914944360601e-06, |
| "loss": 0.08047535, |
| "memory(GiB)": 45.75, |
| "step": 29720, |
| "train_speed(iter/s)": 0.769564 |
| }, |
| { |
| "epoch": 2.7690875232774674, |
| "grad_norm": 1.109375, |
| "learning_rate": 3.2220238445266892e-06, |
| "loss": 0.08651033, |
| "memory(GiB)": 45.75, |
| "step": 29740, |
| "train_speed(iter/s)": 0.769571 |
| }, |
| { |
| "epoch": 2.770949720670391, |
| "grad_norm": 1.1484375, |
| "learning_rate": 3.1705405358811723e-06, |
| "loss": 0.09964473, |
| "memory(GiB)": 45.75, |
| "step": 29760, |
| "train_speed(iter/s)": 0.76961 |
| }, |
| { |
| "epoch": 2.7728119180633146, |
| "grad_norm": 1.140625, |
| "learning_rate": 3.1194652353581545e-06, |
| "loss": 0.09600849, |
| "memory(GiB)": 45.75, |
| "step": 29780, |
| "train_speed(iter/s)": 0.769634 |
| }, |
| { |
| "epoch": 2.7746741154562384, |
| "grad_norm": 1.1015625, |
| "learning_rate": 3.068798158172437e-06, |
| "loss": 0.10160735, |
| "memory(GiB)": 45.75, |
| "step": 29800, |
| "train_speed(iter/s)": 0.769656 |
| }, |
| { |
| "epoch": 2.776536312849162, |
| "grad_norm": 0.8828125, |
| "learning_rate": 3.0185395178187857e-06, |
| "loss": 0.10089248, |
| "memory(GiB)": 45.75, |
| "step": 29820, |
| "train_speed(iter/s)": 0.769644 |
| }, |
| { |
| "epoch": 2.7783985102420856, |
| "grad_norm": 1.0234375, |
| "learning_rate": 2.968689526070889e-06, |
| "loss": 0.09260511, |
| "memory(GiB)": 45.75, |
| "step": 29840, |
| "train_speed(iter/s)": 0.769688 |
| }, |
| { |
| "epoch": 2.7802607076350094, |
| "grad_norm": 1.0546875, |
| "learning_rate": 2.9192483929805335e-06, |
| "loss": 0.09912217, |
| "memory(GiB)": 45.75, |
| "step": 29860, |
| "train_speed(iter/s)": 0.769712 |
| }, |
| { |
| "epoch": 2.782122905027933, |
| "grad_norm": 0.83984375, |
| "learning_rate": 2.870216326876729e-06, |
| "loss": 0.09143957, |
| "memory(GiB)": 45.75, |
| "step": 29880, |
| "train_speed(iter/s)": 0.769748 |
| }, |
| { |
| "epoch": 2.7839851024208566, |
| "grad_norm": 1.0, |
| "learning_rate": 2.821593534364786e-06, |
| "loss": 0.10889904, |
| "memory(GiB)": 45.75, |
| "step": 29900, |
| "train_speed(iter/s)": 0.769769 |
| }, |
| { |
| "epoch": 2.7858472998137804, |
| "grad_norm": 0.9921875, |
| "learning_rate": 2.7733802203254832e-06, |
| "loss": 0.10050044, |
| "memory(GiB)": 45.75, |
| "step": 29920, |
| "train_speed(iter/s)": 0.769786 |
| }, |
| { |
| "epoch": 2.787709497206704, |
| "grad_norm": 1.328125, |
| "learning_rate": 2.7255765879141693e-06, |
| "loss": 0.08751726, |
| "memory(GiB)": 45.75, |
| "step": 29940, |
| "train_speed(iter/s)": 0.769797 |
| }, |
| { |
| "epoch": 2.7895716945996276, |
| "grad_norm": 1.1015625, |
| "learning_rate": 2.6781828385599394e-06, |
| "loss": 0.09062733, |
| "memory(GiB)": 45.75, |
| "step": 29960, |
| "train_speed(iter/s)": 0.769807 |
| }, |
| { |
| "epoch": 2.7914338919925514, |
| "grad_norm": 0.9375, |
| "learning_rate": 2.631199171964771e-06, |
| "loss": 0.11372201, |
| "memory(GiB)": 45.75, |
| "step": 29980, |
| "train_speed(iter/s)": 0.769828 |
| }, |
| { |
| "epoch": 2.793296089385475, |
| "grad_norm": 1.0625, |
| "learning_rate": 2.5846257861027014e-06, |
| "loss": 0.09804424, |
| "memory(GiB)": 45.75, |
| "step": 30000, |
| "train_speed(iter/s)": 0.769849 |
| }, |
| { |
| "epoch": 2.793296089385475, |
| "eval_loss": 0.3881955146789551, |
| "eval_runtime": 77.0253, |
| "eval_samples_per_second": 180.266, |
| "eval_steps_per_second": 1.415, |
| "step": 30000 |
| }, |
| { |
| "epoch": 2.7951582867783986, |
| "grad_norm": 1.2421875, |
| "learning_rate": 2.5384628772189388e-06, |
| "loss": 0.08849491, |
| "memory(GiB)": 45.75, |
| "step": 30020, |
| "train_speed(iter/s)": 0.767704 |
| }, |
| { |
| "epoch": 2.7970204841713224, |
| "grad_norm": 1.2421875, |
| "learning_rate": 2.492710639829121e-06, |
| "loss": 0.10444663, |
| "memory(GiB)": 45.75, |
| "step": 30040, |
| "train_speed(iter/s)": 0.767745 |
| }, |
| { |
| "epoch": 2.798882681564246, |
| "grad_norm": 1.03125, |
| "learning_rate": 2.4473692667184133e-06, |
| "loss": 0.09703537, |
| "memory(GiB)": 45.75, |
| "step": 30060, |
| "train_speed(iter/s)": 0.767778 |
| }, |
| { |
| "epoch": 2.8007448789571696, |
| "grad_norm": 0.828125, |
| "learning_rate": 2.402438948940744e-06, |
| "loss": 0.09352793, |
| "memory(GiB)": 45.75, |
| "step": 30080, |
| "train_speed(iter/s)": 0.767825 |
| }, |
| { |
| "epoch": 2.802607076350093, |
| "grad_norm": 0.97265625, |
| "learning_rate": 2.3579198758179934e-06, |
| "loss": 0.10403392, |
| "memory(GiB)": 45.75, |
| "step": 30100, |
| "train_speed(iter/s)": 0.767855 |
| }, |
| { |
| "epoch": 2.804469273743017, |
| "grad_norm": 1.109375, |
| "learning_rate": 2.3138122349391612e-06, |
| "loss": 0.08864436, |
| "memory(GiB)": 45.75, |
| "step": 30120, |
| "train_speed(iter/s)": 0.767874 |
| }, |
| { |
| "epoch": 2.80633147113594, |
| "grad_norm": 0.890625, |
| "learning_rate": 2.2701162121596454e-06, |
| "loss": 0.10232602, |
| "memory(GiB)": 45.75, |
| "step": 30140, |
| "train_speed(iter/s)": 0.767912 |
| }, |
| { |
| "epoch": 2.808193668528864, |
| "grad_norm": 1.125, |
| "learning_rate": 2.226831991600398e-06, |
| "loss": 0.09759098, |
| "memory(GiB)": 45.75, |
| "step": 30160, |
| "train_speed(iter/s)": 0.767926 |
| }, |
| { |
| "epoch": 2.810055865921788, |
| "grad_norm": 1.203125, |
| "learning_rate": 2.183959755647147e-06, |
| "loss": 0.08362782, |
| "memory(GiB)": 45.75, |
| "step": 30180, |
| "train_speed(iter/s)": 0.767937 |
| }, |
| { |
| "epoch": 2.811918063314711, |
| "grad_norm": 1.140625, |
| "learning_rate": 2.141499684949688e-06, |
| "loss": 0.09115112, |
| "memory(GiB)": 45.75, |
| "step": 30200, |
| "train_speed(iter/s)": 0.767948 |
| }, |
| { |
| "epoch": 2.813780260707635, |
| "grad_norm": 0.8515625, |
| "learning_rate": 2.0994519584210728e-06, |
| "loss": 0.09945311, |
| "memory(GiB)": 45.75, |
| "step": 30220, |
| "train_speed(iter/s)": 0.767951 |
| }, |
| { |
| "epoch": 2.815642458100559, |
| "grad_norm": 0.98046875, |
| "learning_rate": 2.057816753236874e-06, |
| "loss": 0.09534336, |
| "memory(GiB)": 45.75, |
| "step": 30240, |
| "train_speed(iter/s)": 0.767979 |
| }, |
| { |
| "epoch": 2.817504655493482, |
| "grad_norm": 1.359375, |
| "learning_rate": 2.016594244834391e-06, |
| "loss": 0.08807804, |
| "memory(GiB)": 45.75, |
| "step": 30260, |
| "train_speed(iter/s)": 0.768021 |
| }, |
| { |
| "epoch": 2.819366852886406, |
| "grad_norm": 0.9296875, |
| "learning_rate": 1.9757846069119903e-06, |
| "loss": 0.09050249, |
| "memory(GiB)": 45.75, |
| "step": 30280, |
| "train_speed(iter/s)": 0.768066 |
| }, |
| { |
| "epoch": 2.82122905027933, |
| "grad_norm": 1.3203125, |
| "learning_rate": 1.9353880114283517e-06, |
| "loss": 0.09604942, |
| "memory(GiB)": 45.75, |
| "step": 30300, |
| "train_speed(iter/s)": 0.768089 |
| }, |
| { |
| "epoch": 2.823091247672253, |
| "grad_norm": 0.86328125, |
| "learning_rate": 1.8954046286016936e-06, |
| "loss": 0.08841636, |
| "memory(GiB)": 45.75, |
| "step": 30320, |
| "train_speed(iter/s)": 0.768128 |
| }, |
| { |
| "epoch": 2.824953445065177, |
| "grad_norm": 0.96875, |
| "learning_rate": 1.8558346269090698e-06, |
| "loss": 0.09765224, |
| "memory(GiB)": 45.75, |
| "step": 30340, |
| "train_speed(iter/s)": 0.768171 |
| }, |
| { |
| "epoch": 2.826815642458101, |
| "grad_norm": 1.171875, |
| "learning_rate": 1.816678173085762e-06, |
| "loss": 0.09548703, |
| "memory(GiB)": 45.75, |
| "step": 30360, |
| "train_speed(iter/s)": 0.768206 |
| }, |
| { |
| "epoch": 2.828677839851024, |
| "grad_norm": 0.96875, |
| "learning_rate": 1.7779354321244e-06, |
| "loss": 0.08739243, |
| "memory(GiB)": 45.75, |
| "step": 30380, |
| "train_speed(iter/s)": 0.768232 |
| }, |
| { |
| "epoch": 2.830540037243948, |
| "grad_norm": 0.8359375, |
| "learning_rate": 1.7396065672744211e-06, |
| "loss": 0.09241841, |
| "memory(GiB)": 45.75, |
| "step": 30400, |
| "train_speed(iter/s)": 0.768247 |
| }, |
| { |
| "epoch": 2.8324022346368714, |
| "grad_norm": 0.8046875, |
| "learning_rate": 1.7016917400413001e-06, |
| "loss": 0.09562107, |
| "memory(GiB)": 45.75, |
| "step": 30420, |
| "train_speed(iter/s)": 0.76828 |
| }, |
| { |
| "epoch": 2.834264432029795, |
| "grad_norm": 1.3515625, |
| "learning_rate": 1.6641911101858865e-06, |
| "loss": 0.0923029, |
| "memory(GiB)": 45.75, |
| "step": 30440, |
| "train_speed(iter/s)": 0.768309 |
| }, |
| { |
| "epoch": 2.8361266294227185, |
| "grad_norm": 0.890625, |
| "learning_rate": 1.6271048357237695e-06, |
| "loss": 0.09228848, |
| "memory(GiB)": 45.75, |
| "step": 30460, |
| "train_speed(iter/s)": 0.768324 |
| }, |
| { |
| "epoch": 2.8379888268156424, |
| "grad_norm": 0.96875, |
| "learning_rate": 1.5904330729245243e-06, |
| "loss": 0.08994877, |
| "memory(GiB)": 45.75, |
| "step": 30480, |
| "train_speed(iter/s)": 0.768352 |
| }, |
| { |
| "epoch": 2.839851024208566, |
| "grad_norm": 1.140625, |
| "learning_rate": 1.5541759763111452e-06, |
| "loss": 0.09694709, |
| "memory(GiB)": 45.75, |
| "step": 30500, |
| "train_speed(iter/s)": 0.768384 |
| }, |
| { |
| "epoch": 2.8417132216014895, |
| "grad_norm": 1.0546875, |
| "learning_rate": 1.5183336986593465e-06, |
| "loss": 0.10389121, |
| "memory(GiB)": 45.75, |
| "step": 30520, |
| "train_speed(iter/s)": 0.768416 |
| }, |
| { |
| "epoch": 2.8435754189944134, |
| "grad_norm": 1.109375, |
| "learning_rate": 1.482906390996941e-06, |
| "loss": 0.08865334, |
| "memory(GiB)": 45.75, |
| "step": 30540, |
| "train_speed(iter/s)": 0.768442 |
| }, |
| { |
| "epoch": 2.845437616387337, |
| "grad_norm": 0.86328125, |
| "learning_rate": 1.4478942026031727e-06, |
| "loss": 0.10575958, |
| "memory(GiB)": 45.75, |
| "step": 30560, |
| "train_speed(iter/s)": 0.768483 |
| }, |
| { |
| "epoch": 2.8472998137802605, |
| "grad_norm": 0.9453125, |
| "learning_rate": 1.4132972810080969e-06, |
| "loss": 0.091862, |
| "memory(GiB)": 45.75, |
| "step": 30580, |
| "train_speed(iter/s)": 0.768508 |
| }, |
| { |
| "epoch": 2.8491620111731844, |
| "grad_norm": 1.2734375, |
| "learning_rate": 1.3791157719920122e-06, |
| "loss": 0.09626628, |
| "memory(GiB)": 45.75, |
| "step": 30600, |
| "train_speed(iter/s)": 0.768524 |
| }, |
| { |
| "epoch": 2.851024208566108, |
| "grad_norm": 1.640625, |
| "learning_rate": 1.3453498195847624e-06, |
| "loss": 0.09501921, |
| "memory(GiB)": 45.75, |
| "step": 30620, |
| "train_speed(iter/s)": 0.768565 |
| }, |
| { |
| "epoch": 2.8528864059590315, |
| "grad_norm": 1.046875, |
| "learning_rate": 1.311999566065203e-06, |
| "loss": 0.10152951, |
| "memory(GiB)": 45.75, |
| "step": 30640, |
| "train_speed(iter/s)": 0.768572 |
| }, |
| { |
| "epoch": 2.8547486033519553, |
| "grad_norm": 1.5390625, |
| "learning_rate": 1.2790651519605234e-06, |
| "loss": 0.09824728, |
| "memory(GiB)": 45.75, |
| "step": 30660, |
| "train_speed(iter/s)": 0.768607 |
| }, |
| { |
| "epoch": 2.856610800744879, |
| "grad_norm": 1.2421875, |
| "learning_rate": 1.246546716045749e-06, |
| "loss": 0.10510842, |
| "memory(GiB)": 45.75, |
| "step": 30680, |
| "train_speed(iter/s)": 0.768631 |
| }, |
| { |
| "epoch": 2.8584729981378025, |
| "grad_norm": 1.1171875, |
| "learning_rate": 1.2144443953430617e-06, |
| "loss": 0.10336709, |
| "memory(GiB)": 45.75, |
| "step": 30700, |
| "train_speed(iter/s)": 0.76866 |
| }, |
| { |
| "epoch": 2.8603351955307263, |
| "grad_norm": 1.0078125, |
| "learning_rate": 1.1827583251213026e-06, |
| "loss": 0.09153603, |
| "memory(GiB)": 45.75, |
| "step": 30720, |
| "train_speed(iter/s)": 0.768687 |
| }, |
| { |
| "epoch": 2.86219739292365, |
| "grad_norm": 1.0078125, |
| "learning_rate": 1.1514886388953706e-06, |
| "loss": 0.08601239, |
| "memory(GiB)": 45.75, |
| "step": 30740, |
| "train_speed(iter/s)": 0.768721 |
| }, |
| { |
| "epoch": 2.8640595903165735, |
| "grad_norm": 1.0703125, |
| "learning_rate": 1.1206354684256015e-06, |
| "loss": 0.09374793, |
| "memory(GiB)": 45.75, |
| "step": 30760, |
| "train_speed(iter/s)": 0.768745 |
| }, |
| { |
| "epoch": 2.8659217877094973, |
| "grad_norm": 0.7421875, |
| "learning_rate": 1.0901989437173577e-06, |
| "loss": 0.09772741, |
| "memory(GiB)": 45.75, |
| "step": 30780, |
| "train_speed(iter/s)": 0.768781 |
| }, |
| { |
| "epoch": 2.8677839851024207, |
| "grad_norm": 1.0546875, |
| "learning_rate": 1.0601791930203053e-06, |
| "loss": 0.10399811, |
| "memory(GiB)": 45.75, |
| "step": 30800, |
| "train_speed(iter/s)": 0.768806 |
| }, |
| { |
| "epoch": 2.8696461824953445, |
| "grad_norm": 1.1171875, |
| "learning_rate": 1.0305763428280046e-06, |
| "loss": 0.10219818, |
| "memory(GiB)": 45.75, |
| "step": 30820, |
| "train_speed(iter/s)": 0.768816 |
| }, |
| { |
| "epoch": 2.871508379888268, |
| "grad_norm": 0.88671875, |
| "learning_rate": 1.0013905178773208e-06, |
| "loss": 0.1023465, |
| "memory(GiB)": 45.75, |
| "step": 30840, |
| "train_speed(iter/s)": 0.768834 |
| }, |
| { |
| "epoch": 2.8733705772811917, |
| "grad_norm": 0.98046875, |
| "learning_rate": 9.726218411479137e-07, |
| "loss": 0.08770326, |
| "memory(GiB)": 45.75, |
| "step": 30860, |
| "train_speed(iter/s)": 0.768871 |
| }, |
| { |
| "epoch": 2.8752327746741155, |
| "grad_norm": 0.921875, |
| "learning_rate": 9.442704338617158e-07, |
| "loss": 0.10444794, |
| "memory(GiB)": 45.75, |
| "step": 30880, |
| "train_speed(iter/s)": 0.768893 |
| }, |
| { |
| "epoch": 2.877094972067039, |
| "grad_norm": 1.1015625, |
| "learning_rate": 9.163364154823994e-07, |
| "loss": 0.0871839, |
| "memory(GiB)": 45.75, |
| "step": 30900, |
| "train_speed(iter/s)": 0.768916 |
| }, |
| { |
| "epoch": 2.8789571694599627, |
| "grad_norm": 1.1328125, |
| "learning_rate": 8.888199037149325e-07, |
| "loss": 0.09597771, |
| "memory(GiB)": 45.75, |
| "step": 30920, |
| "train_speed(iter/s)": 0.768935 |
| }, |
| { |
| "epoch": 2.8808193668528865, |
| "grad_norm": 0.76953125, |
| "learning_rate": 8.617210145050014e-07, |
| "loss": 0.09385652, |
| "memory(GiB)": 45.75, |
| "step": 30940, |
| "train_speed(iter/s)": 0.768951 |
| }, |
| { |
| "epoch": 2.88268156424581, |
| "grad_norm": 0.6640625, |
| "learning_rate": 8.350398620386113e-07, |
| "loss": 0.11076908, |
| "memory(GiB)": 45.75, |
| "step": 30960, |
| "train_speed(iter/s)": 0.768962 |
| }, |
| { |
| "epoch": 2.8845437616387337, |
| "grad_norm": 0.98046875, |
| "learning_rate": 8.087765587415197e-07, |
| "loss": 0.08995301, |
| "memory(GiB)": 45.75, |
| "step": 30980, |
| "train_speed(iter/s)": 0.768978 |
| }, |
| { |
| "epoch": 2.8864059590316575, |
| "grad_norm": 0.81640625, |
| "learning_rate": 7.82931215278826e-07, |
| "loss": 0.10020612, |
| "memory(GiB)": 45.75, |
| "step": 31000, |
| "train_speed(iter/s)": 0.768993 |
| }, |
| { |
| "epoch": 2.888268156424581, |
| "grad_norm": 1.0, |
| "learning_rate": 7.575039405544715e-07, |
| "loss": 0.08659879, |
| "memory(GiB)": 45.75, |
| "step": 31020, |
| "train_speed(iter/s)": 0.76903 |
| }, |
| { |
| "epoch": 2.8901303538175047, |
| "grad_norm": 0.96875, |
| "learning_rate": 7.324948417107957e-07, |
| "loss": 0.08481673, |
| "memory(GiB)": 45.75, |
| "step": 31040, |
| "train_speed(iter/s)": 0.769054 |
| }, |
| { |
| "epoch": 2.8919925512104285, |
| "grad_norm": 1.0390625, |
| "learning_rate": 7.07904024128081e-07, |
| "loss": 0.09589967, |
| "memory(GiB)": 45.75, |
| "step": 31060, |
| "train_speed(iter/s)": 0.769068 |
| }, |
| { |
| "epoch": 2.893854748603352, |
| "grad_norm": 0.984375, |
| "learning_rate": 6.837315914240972e-07, |
| "loss": 0.08965684, |
| "memory(GiB)": 45.75, |
| "step": 31080, |
| "train_speed(iter/s)": 0.769106 |
| }, |
| { |
| "epoch": 2.8957169459962757, |
| "grad_norm": 0.89453125, |
| "learning_rate": 6.59977645453691e-07, |
| "loss": 0.09921396, |
| "memory(GiB)": 45.75, |
| "step": 31100, |
| "train_speed(iter/s)": 0.769127 |
| }, |
| { |
| "epoch": 2.8975791433891995, |
| "grad_norm": 0.70703125, |
| "learning_rate": 6.366422863083199e-07, |
| "loss": 0.09246094, |
| "memory(GiB)": 45.75, |
| "step": 31120, |
| "train_speed(iter/s)": 0.769148 |
| }, |
| { |
| "epoch": 2.899441340782123, |
| "grad_norm": 1.0078125, |
| "learning_rate": 6.137256123156631e-07, |
| "loss": 0.09109651, |
| "memory(GiB)": 45.75, |
| "step": 31140, |
| "train_speed(iter/s)": 0.769182 |
| }, |
| { |
| "epoch": 2.9013035381750467, |
| "grad_norm": 0.703125, |
| "learning_rate": 5.912277200392113e-07, |
| "loss": 0.08994749, |
| "memory(GiB)": 45.75, |
| "step": 31160, |
| "train_speed(iter/s)": 0.769206 |
| }, |
| { |
| "epoch": 2.90316573556797, |
| "grad_norm": 0.88671875, |
| "learning_rate": 5.69148704277811e-07, |
| "loss": 0.08762764, |
| "memory(GiB)": 45.75, |
| "step": 31180, |
| "train_speed(iter/s)": 0.769234 |
| }, |
| { |
| "epoch": 2.905027932960894, |
| "grad_norm": 1.234375, |
| "learning_rate": 5.474886580653315e-07, |
| "loss": 0.0921519, |
| "memory(GiB)": 45.75, |
| "step": 31200, |
| "train_speed(iter/s)": 0.769261 |
| }, |
| { |
| "epoch": 2.9068901303538173, |
| "grad_norm": 0.9296875, |
| "learning_rate": 5.262476726702215e-07, |
| "loss": 0.09754463, |
| "memory(GiB)": 45.75, |
| "step": 31220, |
| "train_speed(iter/s)": 0.769276 |
| }, |
| { |
| "epoch": 2.908752327746741, |
| "grad_norm": 0.9921875, |
| "learning_rate": 5.054258375951415e-07, |
| "loss": 0.08906224, |
| "memory(GiB)": 45.75, |
| "step": 31240, |
| "train_speed(iter/s)": 0.769297 |
| }, |
| { |
| "epoch": 2.910614525139665, |
| "grad_norm": 1.078125, |
| "learning_rate": 4.850232405765875e-07, |
| "loss": 0.09115668, |
| "memory(GiB)": 45.75, |
| "step": 31260, |
| "train_speed(iter/s)": 0.769324 |
| }, |
| { |
| "epoch": 2.9124767225325883, |
| "grad_norm": 1.03125, |
| "learning_rate": 4.6503996758452407e-07, |
| "loss": 0.08493429, |
| "memory(GiB)": 45.75, |
| "step": 31280, |
| "train_speed(iter/s)": 0.76933 |
| }, |
| { |
| "epoch": 2.914338919925512, |
| "grad_norm": 0.9765625, |
| "learning_rate": 4.454761028220289e-07, |
| "loss": 0.09017068, |
| "memory(GiB)": 45.75, |
| "step": 31300, |
| "train_speed(iter/s)": 0.769354 |
| }, |
| { |
| "epoch": 2.916201117318436, |
| "grad_norm": 1.0390625, |
| "learning_rate": 4.263317287249158e-07, |
| "loss": 0.08491018, |
| "memory(GiB)": 45.75, |
| "step": 31320, |
| "train_speed(iter/s)": 0.769377 |
| }, |
| { |
| "epoch": 2.9180633147113593, |
| "grad_norm": 1.0390625, |
| "learning_rate": 4.0760692596140124e-07, |
| "loss": 0.08563709, |
| "memory(GiB)": 45.75, |
| "step": 31340, |
| "train_speed(iter/s)": 0.769395 |
| }, |
| { |
| "epoch": 2.919925512104283, |
| "grad_norm": 1.0078125, |
| "learning_rate": 3.8930177343177165e-07, |
| "loss": 0.08611808, |
| "memory(GiB)": 45.75, |
| "step": 31360, |
| "train_speed(iter/s)": 0.769403 |
| }, |
| { |
| "epoch": 2.921787709497207, |
| "grad_norm": 1.15625, |
| "learning_rate": 3.714163482680388e-07, |
| "loss": 0.09184546, |
| "memory(GiB)": 45.75, |
| "step": 31380, |
| "train_speed(iter/s)": 0.76942 |
| }, |
| { |
| "epoch": 2.9236499068901303, |
| "grad_norm": 1.03125, |
| "learning_rate": 3.5395072583361833e-07, |
| "loss": 0.09664055, |
| "memory(GiB)": 45.75, |
| "step": 31400, |
| "train_speed(iter/s)": 0.769447 |
| }, |
| { |
| "epoch": 2.925512104283054, |
| "grad_norm": 1.171875, |
| "learning_rate": 3.369049797230073e-07, |
| "loss": 0.10272588, |
| "memory(GiB)": 45.75, |
| "step": 31420, |
| "train_speed(iter/s)": 0.769475 |
| }, |
| { |
| "epoch": 2.927374301675978, |
| "grad_norm": 0.8125, |
| "learning_rate": 3.2027918176148474e-07, |
| "loss": 0.08928733, |
| "memory(GiB)": 45.75, |
| "step": 31440, |
| "train_speed(iter/s)": 0.769516 |
| }, |
| { |
| "epoch": 2.9292364990689013, |
| "grad_norm": 1.015625, |
| "learning_rate": 3.040734020048008e-07, |
| "loss": 0.10981673, |
| "memory(GiB)": 45.75, |
| "step": 31460, |
| "train_speed(iter/s)": 0.769512 |
| }, |
| { |
| "epoch": 2.931098696461825, |
| "grad_norm": 1.078125, |
| "learning_rate": 2.88287708738888e-07, |
| "loss": 0.0986732, |
| "memory(GiB)": 45.75, |
| "step": 31480, |
| "train_speed(iter/s)": 0.769534 |
| }, |
| { |
| "epoch": 2.9329608938547485, |
| "grad_norm": 1.03125, |
| "learning_rate": 2.7292216847957244e-07, |
| "loss": 0.0985683, |
| "memory(GiB)": 45.75, |
| "step": 31500, |
| "train_speed(iter/s)": 0.769578 |
| }, |
| { |
| "epoch": 2.9348230912476723, |
| "grad_norm": 1.171875, |
| "learning_rate": 2.579768459722853e-07, |
| "loss": 0.09273306, |
| "memory(GiB)": 45.75, |
| "step": 31520, |
| "train_speed(iter/s)": 0.769616 |
| }, |
| { |
| "epoch": 2.9366852886405956, |
| "grad_norm": 1.203125, |
| "learning_rate": 2.434518041917966e-07, |
| "loss": 0.10807085, |
| "memory(GiB)": 45.75, |
| "step": 31540, |
| "train_speed(iter/s)": 0.769631 |
| }, |
| { |
| "epoch": 2.9385474860335195, |
| "grad_norm": 0.8984375, |
| "learning_rate": 2.2934710434194818e-07, |
| "loss": 0.10285647, |
| "memory(GiB)": 45.75, |
| "step": 31560, |
| "train_speed(iter/s)": 0.769659 |
| }, |
| { |
| "epoch": 2.9404096834264433, |
| "grad_norm": 0.9296875, |
| "learning_rate": 2.1566280585539888e-07, |
| "loss": 0.09199415, |
| "memory(GiB)": 45.75, |
| "step": 31580, |
| "train_speed(iter/s)": 0.76968 |
| }, |
| { |
| "epoch": 2.9422718808193666, |
| "grad_norm": 1.265625, |
| "learning_rate": 2.0239896639336898e-07, |
| "loss": 0.09799184, |
| "memory(GiB)": 45.75, |
| "step": 31600, |
| "train_speed(iter/s)": 0.769719 |
| }, |
| { |
| "epoch": 2.9441340782122905, |
| "grad_norm": 0.98828125, |
| "learning_rate": 1.8955564184539587e-07, |
| "loss": 0.0983022, |
| "memory(GiB)": 45.75, |
| "step": 31620, |
| "train_speed(iter/s)": 0.769745 |
| }, |
| { |
| "epoch": 2.9459962756052143, |
| "grad_norm": 0.8671875, |
| "learning_rate": 1.771328863291233e-07, |
| "loss": 0.09433325, |
| "memory(GiB)": 45.75, |
| "step": 31640, |
| "train_speed(iter/s)": 0.769762 |
| }, |
| { |
| "epoch": 2.9478584729981376, |
| "grad_norm": 0.875, |
| "learning_rate": 1.6513075219001252e-07, |
| "loss": 0.09435837, |
| "memory(GiB)": 45.75, |
| "step": 31660, |
| "train_speed(iter/s)": 0.769781 |
| }, |
| { |
| "epoch": 2.9497206703910615, |
| "grad_norm": 0.96875, |
| "learning_rate": 1.5354929000122032e-07, |
| "loss": 0.09303812, |
| "memory(GiB)": 45.75, |
| "step": 31680, |
| "train_speed(iter/s)": 0.769783 |
| }, |
| { |
| "epoch": 2.9515828677839853, |
| "grad_norm": 1.125, |
| "learning_rate": 1.423885485632659e-07, |
| "loss": 0.09572449, |
| "memory(GiB)": 45.75, |
| "step": 31700, |
| "train_speed(iter/s)": 0.769797 |
| }, |
| { |
| "epoch": 2.9534450651769086, |
| "grad_norm": 1.53125, |
| "learning_rate": 1.316485749039309e-07, |
| "loss": 0.10269543, |
| "memory(GiB)": 45.75, |
| "step": 31720, |
| "train_speed(iter/s)": 0.769825 |
| }, |
| { |
| "epoch": 2.9553072625698324, |
| "grad_norm": 1.203125, |
| "learning_rate": 1.2132941427799304e-07, |
| "loss": 0.0916387, |
| "memory(GiB)": 45.75, |
| "step": 31740, |
| "train_speed(iter/s)": 0.769837 |
| }, |
| { |
| "epoch": 2.9571694599627563, |
| "grad_norm": 1.0078125, |
| "learning_rate": 1.1143111016707063e-07, |
| "loss": 0.09537607, |
| "memory(GiB)": 45.75, |
| "step": 31760, |
| "train_speed(iter/s)": 0.769875 |
| }, |
| { |
| "epoch": 2.9590316573556796, |
| "grad_norm": 1.03125, |
| "learning_rate": 1.0195370427943385e-07, |
| "loss": 0.10331657, |
| "memory(GiB)": 45.75, |
| "step": 31780, |
| "train_speed(iter/s)": 0.76991 |
| }, |
| { |
| "epoch": 2.9608938547486034, |
| "grad_norm": 1.2421875, |
| "learning_rate": 9.289723654980486e-08, |
| "loss": 0.0982655, |
| "memory(GiB)": 45.75, |
| "step": 31800, |
| "train_speed(iter/s)": 0.769926 |
| }, |
| { |
| "epoch": 2.9627560521415273, |
| "grad_norm": 0.984375, |
| "learning_rate": 8.426174513922469e-08, |
| "loss": 0.09388782, |
| "memory(GiB)": 45.75, |
| "step": 31820, |
| "train_speed(iter/s)": 0.769945 |
| }, |
| { |
| "epoch": 2.9646182495344506, |
| "grad_norm": 1.0, |
| "learning_rate": 7.604726643486438e-08, |
| "loss": 0.09825534, |
| "memory(GiB)": 45.75, |
| "step": 31840, |
| "train_speed(iter/s)": 0.769959 |
| }, |
| { |
| "epoch": 2.9664804469273744, |
| "grad_norm": 1.0546875, |
| "learning_rate": 6.82538350498918e-08, |
| "loss": 0.10639879, |
| "memory(GiB)": 45.75, |
| "step": 31860, |
| "train_speed(iter/s)": 0.76997 |
| }, |
| { |
| "epoch": 2.968342644320298, |
| "grad_norm": 1.1171875, |
| "learning_rate": 6.088148382331627e-08, |
| "loss": 0.09718776, |
| "memory(GiB)": 45.75, |
| "step": 31880, |
| "train_speed(iter/s)": 0.769995 |
| }, |
| { |
| "epoch": 2.9702048417132216, |
| "grad_norm": 1.140625, |
| "learning_rate": 5.393024381984413e-08, |
| "loss": 0.08618087, |
| "memory(GiB)": 45.75, |
| "step": 31900, |
| "train_speed(iter/s)": 0.770024 |
| }, |
| { |
| "epoch": 2.972067039106145, |
| "grad_norm": 0.7890625, |
| "learning_rate": 4.74001443297678e-08, |
| "loss": 0.09072049, |
| "memory(GiB)": 45.75, |
| "step": 31920, |
| "train_speed(iter/s)": 0.770048 |
| }, |
| { |
| "epoch": 2.973929236499069, |
| "grad_norm": 1.3046875, |
| "learning_rate": 4.129121286879922e-08, |
| "loss": 0.09811844, |
| "memory(GiB)": 45.75, |
| "step": 31940, |
| "train_speed(iter/s)": 0.770087 |
| }, |
| { |
| "epoch": 2.9757914338919926, |
| "grad_norm": 0.8828125, |
| "learning_rate": 3.5603475178036526e-08, |
| "loss": 0.08870932, |
| "memory(GiB)": 45.75, |
| "step": 31960, |
| "train_speed(iter/s)": 0.770105 |
| }, |
| { |
| "epoch": 2.977653631284916, |
| "grad_norm": 1.21875, |
| "learning_rate": 3.033695522375313e-08, |
| "loss": 0.09320135, |
| "memory(GiB)": 45.75, |
| "step": 31980, |
| "train_speed(iter/s)": 0.770131 |
| }, |
| { |
| "epoch": 2.97951582867784, |
| "grad_norm": 0.94921875, |
| "learning_rate": 2.5491675197375498e-08, |
| "loss": 0.10496864, |
| "memory(GiB)": 45.75, |
| "step": 32000, |
| "train_speed(iter/s)": 0.770166 |
| }, |
| { |
| "epoch": 2.97951582867784, |
| "eval_loss": 0.3874885141849518, |
| "eval_runtime": 76.9471, |
| "eval_samples_per_second": 180.449, |
| "eval_steps_per_second": 1.417, |
| "step": 32000 |
| }, |
| { |
| "epoch": 2.9813780260707636, |
| "grad_norm": 1.1328125, |
| "learning_rate": 2.106765551534995e-08, |
| "loss": 0.09619021, |
| "memory(GiB)": 45.75, |
| "step": 32020, |
| "train_speed(iter/s)": 0.768206 |
| }, |
| { |
| "epoch": 2.983240223463687, |
| "grad_norm": 0.75, |
| "learning_rate": 1.7064914819064915e-08, |
| "loss": 0.08765938, |
| "memory(GiB)": 45.75, |
| "step": 32040, |
| "train_speed(iter/s)": 0.768239 |
| }, |
| { |
| "epoch": 2.985102420856611, |
| "grad_norm": 1.171875, |
| "learning_rate": 1.3483469974773233e-08, |
| "loss": 0.1008575, |
| "memory(GiB)": 45.75, |
| "step": 32060, |
| "train_speed(iter/s)": 0.768255 |
| }, |
| { |
| "epoch": 2.9869646182495346, |
| "grad_norm": 1.140625, |
| "learning_rate": 1.0323336073536638e-08, |
| "loss": 0.10057193, |
| "memory(GiB)": 45.75, |
| "step": 32080, |
| "train_speed(iter/s)": 0.768274 |
| }, |
| { |
| "epoch": 2.988826815642458, |
| "grad_norm": 1.1484375, |
| "learning_rate": 7.584526431148043e-09, |
| "loss": 0.08665556, |
| "memory(GiB)": 45.75, |
| "step": 32100, |
| "train_speed(iter/s)": 0.768288 |
| }, |
| { |
| "epoch": 2.990689013035382, |
| "grad_norm": 0.91796875, |
| "learning_rate": 5.267052588053822e-09, |
| "loss": 0.09432332, |
| "memory(GiB)": 45.75, |
| "step": 32120, |
| "train_speed(iter/s)": 0.768312 |
| }, |
| { |
| "epoch": 2.9925512104283056, |
| "grad_norm": 0.90625, |
| "learning_rate": 3.370924309342716e-09, |
| "loss": 0.10474067, |
| "memory(GiB)": 45.75, |
| "step": 32140, |
| "train_speed(iter/s)": 0.768318 |
| }, |
| { |
| "epoch": 2.994413407821229, |
| "grad_norm": 1.203125, |
| "learning_rate": 1.8961495847014122e-09, |
| "loss": 0.08654416, |
| "memory(GiB)": 45.75, |
| "step": 32160, |
| "train_speed(iter/s)": 0.768345 |
| }, |
| { |
| "epoch": 2.996275605214153, |
| "grad_norm": 1.15625, |
| "learning_rate": 8.427346283479409e-10, |
| "loss": 0.1108827, |
| "memory(GiB)": 45.75, |
| "step": 32180, |
| "train_speed(iter/s)": 0.768396 |
| }, |
| { |
| "epoch": 2.998137802607076, |
| "grad_norm": 1.1015625, |
| "learning_rate": 2.1068387902056786e-10, |
| "loss": 0.09056249, |
| "memory(GiB)": 45.75, |
| "step": 32200, |
| "train_speed(iter/s)": 0.768395 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.0, |
| "loss": 0.08908014, |
| "memory(GiB)": 45.75, |
| "step": 32220, |
| "train_speed(iter/s)": 0.76839 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.3871558606624603, |
| "eval_runtime": 76.4163, |
| "eval_samples_per_second": 181.702, |
| "eval_steps_per_second": 1.426, |
| "step": 32220 |
| } |
| ], |
| "logging_steps": 20, |
| "max_steps": 32220, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 2000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.14434535928968e+18, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|