| { |
| "best_global_step": 1485, |
| "best_metric": 0.3291038, |
| "best_model_checkpoint": "/raid/shared/mem1/models/Qwen2.5-7B-search-sft-v2/v0-20250511-083818/checkpoint-1485", |
| "epoch": 0.9994531611492029, |
| "eval_steps": 50, |
| "global_step": 1485, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0006730324317503049, |
| "grad_norm": 179.0, |
| "learning_rate": 9.999988811118231e-05, |
| "loss": 0.9820185899734497, |
| "memory(GiB)": 71.61, |
| "step": 1, |
| "token_acc": 0.8044692737430168, |
| "train_speed(iter/s)": 0.218302 |
| }, |
| { |
| "epoch": 0.003365162158751525, |
| "grad_norm": 2928.0, |
| "learning_rate": 9.999720280459576e-05, |
| "loss": 5.42672061920166, |
| "memory(GiB)": 73.26, |
| "step": 5, |
| "token_acc": 0.3954864964853866, |
| "train_speed(iter/s)": 0.397179 |
| }, |
| { |
| "epoch": 0.00673032431750305, |
| "grad_norm": 35.75, |
| "learning_rate": 9.99888115313551e-05, |
| "loss": 1.4307238578796386, |
| "memory(GiB)": 73.26, |
| "step": 10, |
| "token_acc": 0.7211833231146536, |
| "train_speed(iter/s)": 0.450251 |
| }, |
| { |
| "epoch": 0.010095486476254575, |
| "grad_norm": 7.40625, |
| "learning_rate": 9.997482711915927e-05, |
| "loss": 1.1120232582092284, |
| "memory(GiB)": 73.26, |
| "step": 15, |
| "token_acc": 0.7599109131403118, |
| "train_speed(iter/s)": 0.467479 |
| }, |
| { |
| "epoch": 0.0134606486350061, |
| "grad_norm": 5.9375, |
| "learning_rate": 9.99552511326936e-05, |
| "loss": 0.819661808013916, |
| "memory(GiB)": 73.26, |
| "step": 20, |
| "token_acc": 0.7947040995374063, |
| "train_speed(iter/s)": 0.482418 |
| }, |
| { |
| "epoch": 0.016825810793757626, |
| "grad_norm": 5.4375, |
| "learning_rate": 9.993008576227247e-05, |
| "loss": 0.9263886451721192, |
| "memory(GiB)": 73.26, |
| "step": 25, |
| "token_acc": 0.7752791563275434, |
| "train_speed(iter/s)": 0.490393 |
| }, |
| { |
| "epoch": 0.02019097295250915, |
| "grad_norm": 4.1875, |
| "learning_rate": 9.989933382359422e-05, |
| "loss": 0.7815323352813721, |
| "memory(GiB)": 73.26, |
| "step": 30, |
| "token_acc": 0.8033576869267838, |
| "train_speed(iter/s)": 0.499449 |
| }, |
| { |
| "epoch": 0.023556135111260673, |
| "grad_norm": 4.0625, |
| "learning_rate": 9.986299875742613e-05, |
| "loss": 0.713004732131958, |
| "memory(GiB)": 73.26, |
| "step": 35, |
| "token_acc": 0.8128159139083646, |
| "train_speed(iter/s)": 0.503737 |
| }, |
| { |
| "epoch": 0.0269212972700122, |
| "grad_norm": 9.4375, |
| "learning_rate": 9.982108462921937e-05, |
| "loss": 0.7091834068298339, |
| "memory(GiB)": 73.26, |
| "step": 40, |
| "token_acc": 0.82666015625, |
| "train_speed(iter/s)": 0.502146 |
| }, |
| { |
| "epoch": 0.030286459428763724, |
| "grad_norm": 6.1875, |
| "learning_rate": 9.977359612865423e-05, |
| "loss": 0.6785173892974854, |
| "memory(GiB)": 73.26, |
| "step": 45, |
| "token_acc": 0.8104975044276284, |
| "train_speed(iter/s)": 0.505359 |
| }, |
| { |
| "epoch": 0.03365162158751525, |
| "grad_norm": 4.65625, |
| "learning_rate": 9.972053856911534e-05, |
| "loss": 0.7147142887115479, |
| "memory(GiB)": 73.26, |
| "step": 50, |
| "token_acc": 0.8093519535540562, |
| "train_speed(iter/s)": 0.508235 |
| }, |
| { |
| "epoch": 0.03365162158751525, |
| "eval_loss": 0.6807255148887634, |
| "eval_runtime": 6.4656, |
| "eval_samples_per_second": 37.119, |
| "eval_steps_per_second": 37.119, |
| "eval_token_acc": 0.8191331923890064, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.037016783746266775, |
| "grad_norm": 3.875, |
| "learning_rate": 9.966191788709716e-05, |
| "loss": 0.7501668453216552, |
| "memory(GiB)": 73.26, |
| "step": 55, |
| "token_acc": 0.81598110608148, |
| "train_speed(iter/s)": 0.312271 |
| }, |
| { |
| "epoch": 0.0403819459050183, |
| "grad_norm": 5.15625, |
| "learning_rate": 9.959774064153977e-05, |
| "loss": 0.7556098461151123, |
| "memory(GiB)": 73.26, |
| "step": 60, |
| "token_acc": 0.812223746380125, |
| "train_speed(iter/s)": 0.323476 |
| }, |
| { |
| "epoch": 0.04374710806376982, |
| "grad_norm": 3.28125, |
| "learning_rate": 9.952801401309503e-05, |
| "loss": 0.6282004833221435, |
| "memory(GiB)": 73.26, |
| "step": 65, |
| "token_acc": 0.8382288469969311, |
| "train_speed(iter/s)": 0.332657 |
| }, |
| { |
| "epoch": 0.047112270222521346, |
| "grad_norm": 3.84375, |
| "learning_rate": 9.945274580332316e-05, |
| "loss": 0.6862215042114258, |
| "memory(GiB)": 73.26, |
| "step": 70, |
| "token_acc": 0.8230115830115831, |
| "train_speed(iter/s)": 0.341959 |
| }, |
| { |
| "epoch": 0.05047743238127287, |
| "grad_norm": 4.96875, |
| "learning_rate": 9.937194443381972e-05, |
| "loss": 0.8654034614562989, |
| "memory(GiB)": 73.26, |
| "step": 75, |
| "token_acc": 0.787591859807801, |
| "train_speed(iter/s)": 0.351409 |
| }, |
| { |
| "epoch": 0.0538425945400244, |
| "grad_norm": 3.734375, |
| "learning_rate": 9.928561894527353e-05, |
| "loss": 0.7333785057067871, |
| "memory(GiB)": 73.26, |
| "step": 80, |
| "token_acc": 0.8105247240284289, |
| "train_speed(iter/s)": 0.358757 |
| }, |
| { |
| "epoch": 0.057207756698775925, |
| "grad_norm": 10.6875, |
| "learning_rate": 9.919377899645497e-05, |
| "loss": 0.6422113418579102, |
| "memory(GiB)": 73.26, |
| "step": 85, |
| "token_acc": 0.8280427771176371, |
| "train_speed(iter/s)": 0.366331 |
| }, |
| { |
| "epoch": 0.06057291885752745, |
| "grad_norm": 3.265625, |
| "learning_rate": 9.909643486313533e-05, |
| "loss": 0.7867285251617432, |
| "memory(GiB)": 73.26, |
| "step": 90, |
| "token_acc": 0.8057151496824917, |
| "train_speed(iter/s)": 0.373092 |
| }, |
| { |
| "epoch": 0.06393808101627897, |
| "grad_norm": 3.0625, |
| "learning_rate": 9.899359743693714e-05, |
| "loss": 0.675608491897583, |
| "memory(GiB)": 73.26, |
| "step": 95, |
| "token_acc": 0.8164676304211188, |
| "train_speed(iter/s)": 0.379215 |
| }, |
| { |
| "epoch": 0.0673032431750305, |
| "grad_norm": 3.390625, |
| "learning_rate": 9.888527822411543e-05, |
| "loss": 0.7589282989501953, |
| "memory(GiB)": 73.26, |
| "step": 100, |
| "token_acc": 0.810183048761729, |
| "train_speed(iter/s)": 0.385325 |
| }, |
| { |
| "epoch": 0.0673032431750305, |
| "eval_loss": 0.6772852540016174, |
| "eval_runtime": 6.3754, |
| "eval_samples_per_second": 37.645, |
| "eval_steps_per_second": 37.645, |
| "eval_token_acc": 0.8261627906976744, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.07066840533378202, |
| "grad_norm": 3.859375, |
| "learning_rate": 9.877148934427037e-05, |
| "loss": 0.6617954730987549, |
| "memory(GiB)": 73.26, |
| "step": 105, |
| "token_acc": 0.8273892727345341, |
| "train_speed(iter/s)": 0.311491 |
| }, |
| { |
| "epoch": 0.07403356749253355, |
| "grad_norm": 3.15625, |
| "learning_rate": 9.865224352899119e-05, |
| "loss": 0.7310012340545654, |
| "memory(GiB)": 73.26, |
| "step": 110, |
| "token_acc": 0.8089593596059114, |
| "train_speed(iter/s)": 0.317239 |
| }, |
| { |
| "epoch": 0.07739872965128507, |
| "grad_norm": 3.828125, |
| "learning_rate": 9.85275541204318e-05, |
| "loss": 0.703582763671875, |
| "memory(GiB)": 73.26, |
| "step": 115, |
| "token_acc": 0.8229212819376753, |
| "train_speed(iter/s)": 0.32253 |
| }, |
| { |
| "epoch": 0.0807638918100366, |
| "grad_norm": 4.1875, |
| "learning_rate": 9.839743506981782e-05, |
| "loss": 0.7135389804840088, |
| "memory(GiB)": 73.26, |
| "step": 120, |
| "token_acc": 0.8188603416104493, |
| "train_speed(iter/s)": 0.327653 |
| }, |
| { |
| "epoch": 0.08412905396878811, |
| "grad_norm": 3.5, |
| "learning_rate": 9.826190093588563e-05, |
| "loss": 0.6105506420135498, |
| "memory(GiB)": 73.26, |
| "step": 125, |
| "token_acc": 0.8411754713776117, |
| "train_speed(iter/s)": 0.332914 |
| }, |
| { |
| "epoch": 0.08749421612753965, |
| "grad_norm": 3.0625, |
| "learning_rate": 9.812096688325354e-05, |
| "loss": 0.7001046657562255, |
| "memory(GiB)": 73.26, |
| "step": 130, |
| "token_acc": 0.8249736406085254, |
| "train_speed(iter/s)": 0.337969 |
| }, |
| { |
| "epoch": 0.09085937828629118, |
| "grad_norm": 2.9375, |
| "learning_rate": 9.797464868072488e-05, |
| "loss": 0.6970377922058105, |
| "memory(GiB)": 73.26, |
| "step": 135, |
| "token_acc": 0.8269230769230769, |
| "train_speed(iter/s)": 0.342842 |
| }, |
| { |
| "epoch": 0.09422454044504269, |
| "grad_norm": 3.59375, |
| "learning_rate": 9.78229626995238e-05, |
| "loss": 0.6484662055969238, |
| "memory(GiB)": 73.26, |
| "step": 140, |
| "token_acc": 0.8294907944932824, |
| "train_speed(iter/s)": 0.347497 |
| }, |
| { |
| "epoch": 0.09758970260379422, |
| "grad_norm": 2.9375, |
| "learning_rate": 9.766592591146352e-05, |
| "loss": 0.6710952281951904, |
| "memory(GiB)": 73.26, |
| "step": 145, |
| "token_acc": 0.8295923041685753, |
| "train_speed(iter/s)": 0.352009 |
| }, |
| { |
| "epoch": 0.10095486476254574, |
| "grad_norm": 3.140625, |
| "learning_rate": 9.750355588704727e-05, |
| "loss": 0.6715181350708008, |
| "memory(GiB)": 73.26, |
| "step": 150, |
| "token_acc": 0.8266999559406668, |
| "train_speed(iter/s)": 0.355725 |
| }, |
| { |
| "epoch": 0.10095486476254574, |
| "eval_loss": 0.65446937084198, |
| "eval_runtime": 6.2927, |
| "eval_samples_per_second": 38.139, |
| "eval_steps_per_second": 38.139, |
| "eval_token_acc": 0.8305496828752643, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.10432002692129727, |
| "grad_norm": 2.859375, |
| "learning_rate": 9.733587079350252e-05, |
| "loss": 0.6584550857543945, |
| "memory(GiB)": 73.26, |
| "step": 155, |
| "token_acc": 0.8323802163833076, |
| "train_speed(iter/s)": 0.310504 |
| }, |
| { |
| "epoch": 0.1076851890800488, |
| "grad_norm": 3.109375, |
| "learning_rate": 9.716288939274819e-05, |
| "loss": 0.7138989925384521, |
| "memory(GiB)": 73.26, |
| "step": 160, |
| "token_acc": 0.8222054380664653, |
| "train_speed(iter/s)": 0.314403 |
| }, |
| { |
| "epoch": 0.11105035123880032, |
| "grad_norm": 22.25, |
| "learning_rate": 9.698463103929542e-05, |
| "loss": 0.6830341339111328, |
| "memory(GiB)": 73.26, |
| "step": 165, |
| "token_acc": 0.8247949233864726, |
| "train_speed(iter/s)": 0.318566 |
| }, |
| { |
| "epoch": 0.11441551339755185, |
| "grad_norm": 2.859375, |
| "learning_rate": 9.680111567808213e-05, |
| "loss": 0.6824192047119141, |
| "memory(GiB)": 73.26, |
| "step": 170, |
| "token_acc": 0.836785661818716, |
| "train_speed(iter/s)": 0.3222 |
| }, |
| { |
| "epoch": 0.11778067555630337, |
| "grad_norm": 2.734375, |
| "learning_rate": 9.661236384224129e-05, |
| "loss": 0.6676050186157226, |
| "memory(GiB)": 73.26, |
| "step": 175, |
| "token_acc": 0.8297106664747373, |
| "train_speed(iter/s)": 0.325905 |
| }, |
| { |
| "epoch": 0.1211458377150549, |
| "grad_norm": 3.5, |
| "learning_rate": 9.641839665080363e-05, |
| "loss": 0.6630958557128906, |
| "memory(GiB)": 73.26, |
| "step": 180, |
| "token_acc": 0.8245426829268293, |
| "train_speed(iter/s)": 0.329211 |
| }, |
| { |
| "epoch": 0.12451099987380641, |
| "grad_norm": 3.140625, |
| "learning_rate": 9.62192358063346e-05, |
| "loss": 0.7024449348449707, |
| "memory(GiB)": 73.26, |
| "step": 185, |
| "token_acc": 0.8313099041533546, |
| "train_speed(iter/s)": 0.332825 |
| }, |
| { |
| "epoch": 0.12787616203255794, |
| "grad_norm": 3.953125, |
| "learning_rate": 9.601490359250615e-05, |
| "loss": 0.6541357517242432, |
| "memory(GiB)": 73.26, |
| "step": 190, |
| "token_acc": 0.829295154185022, |
| "train_speed(iter/s)": 0.336139 |
| }, |
| { |
| "epoch": 0.13124132419130946, |
| "grad_norm": 3.515625, |
| "learning_rate": 9.580542287160348e-05, |
| "loss": 0.6423999786376953, |
| "memory(GiB)": 73.26, |
| "step": 195, |
| "token_acc": 0.8379697413372377, |
| "train_speed(iter/s)": 0.339594 |
| }, |
| { |
| "epoch": 0.134606486350061, |
| "grad_norm": 3.515625, |
| "learning_rate": 9.559081708196696e-05, |
| "loss": 0.7582132339477539, |
| "memory(GiB)": 73.26, |
| "step": 200, |
| "token_acc": 0.8089741740008657, |
| "train_speed(iter/s)": 0.342609 |
| }, |
| { |
| "epoch": 0.134606486350061, |
| "eval_loss": 0.6314957737922668, |
| "eval_runtime": 6.3575, |
| "eval_samples_per_second": 37.75, |
| "eval_steps_per_second": 37.75, |
| "eval_token_acc": 0.8350422832980973, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.13797164850881252, |
| "grad_norm": 3.96875, |
| "learning_rate": 9.537111023536973e-05, |
| "loss": 0.6429227352142334, |
| "memory(GiB)": 73.26, |
| "step": 205, |
| "token_acc": 0.8360507956416086, |
| "train_speed(iter/s)": 0.306958 |
| }, |
| { |
| "epoch": 0.14133681066756404, |
| "grad_norm": 3.109375, |
| "learning_rate": 9.514632691433107e-05, |
| "loss": 0.6970784187316894, |
| "memory(GiB)": 73.26, |
| "step": 210, |
| "token_acc": 0.8279309788743751, |
| "train_speed(iter/s)": 0.310052 |
| }, |
| { |
| "epoch": 0.14470197282631556, |
| "grad_norm": 2.84375, |
| "learning_rate": 9.491649226936585e-05, |
| "loss": 0.7154839515686036, |
| "memory(GiB)": 73.26, |
| "step": 215, |
| "token_acc": 0.8265466495213601, |
| "train_speed(iter/s)": 0.313148 |
| }, |
| { |
| "epoch": 0.1480671349850671, |
| "grad_norm": 3.5, |
| "learning_rate": 9.468163201617062e-05, |
| "loss": 0.6052781105041504, |
| "memory(GiB)": 73.26, |
| "step": 220, |
| "token_acc": 0.8488794669897032, |
| "train_speed(iter/s)": 0.315973 |
| }, |
| { |
| "epoch": 0.15143229714381862, |
| "grad_norm": 2.828125, |
| "learning_rate": 9.444177243274618e-05, |
| "loss": 0.5735151290893554, |
| "memory(GiB)": 73.26, |
| "step": 225, |
| "token_acc": 0.8521346213773762, |
| "train_speed(iter/s)": 0.318863 |
| }, |
| { |
| "epoch": 0.15479745930257013, |
| "grad_norm": 3.140625, |
| "learning_rate": 9.419694035645751e-05, |
| "loss": 0.6527684211730957, |
| "memory(GiB)": 73.26, |
| "step": 230, |
| "token_acc": 0.8357325655790148, |
| "train_speed(iter/s)": 0.32165 |
| }, |
| { |
| "epoch": 0.15816262146132168, |
| "grad_norm": 3.015625, |
| "learning_rate": 9.394716318103098e-05, |
| "loss": 0.672496223449707, |
| "memory(GiB)": 73.26, |
| "step": 235, |
| "token_acc": 0.8255225893459204, |
| "train_speed(iter/s)": 0.324423 |
| }, |
| { |
| "epoch": 0.1615277836200732, |
| "grad_norm": 2.5, |
| "learning_rate": 9.369246885348926e-05, |
| "loss": 0.5730775356292724, |
| "memory(GiB)": 73.26, |
| "step": 240, |
| "token_acc": 0.8565380231232699, |
| "train_speed(iter/s)": 0.327159 |
| }, |
| { |
| "epoch": 0.1648929457788247, |
| "grad_norm": 2.859375, |
| "learning_rate": 9.343288587102443e-05, |
| "loss": 0.6417149543762207, |
| "memory(GiB)": 73.26, |
| "step": 245, |
| "token_acc": 0.8340062808434275, |
| "train_speed(iter/s)": 0.329689 |
| }, |
| { |
| "epoch": 0.16825810793757623, |
| "grad_norm": 2.671875, |
| "learning_rate": 9.316844327780955e-05, |
| "loss": 0.6126539707183838, |
| "memory(GiB)": 73.26, |
| "step": 250, |
| "token_acc": 0.8416943761746422, |
| "train_speed(iter/s)": 0.332312 |
| }, |
| { |
| "epoch": 0.16825810793757623, |
| "eval_loss": 0.6005221009254456, |
| "eval_runtime": 6.383, |
| "eval_samples_per_second": 37.6, |
| "eval_steps_per_second": 37.6, |
| "eval_token_acc": 0.8414904862579281, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.17162327009632777, |
| "grad_norm": 3.640625, |
| "learning_rate": 9.289917066174886e-05, |
| "loss": 0.5673539161682128, |
| "memory(GiB)": 73.78, |
| "step": 255, |
| "token_acc": 0.8447506770750358, |
| "train_speed(iter/s)": 0.305561 |
| }, |
| { |
| "epoch": 0.1749884322550793, |
| "grad_norm": 2.9375, |
| "learning_rate": 9.262509815116732e-05, |
| "loss": 0.696702241897583, |
| "memory(GiB)": 73.78, |
| "step": 260, |
| "token_acc": 0.8255023183925811, |
| "train_speed(iter/s)": 0.308253 |
| }, |
| { |
| "epoch": 0.1783535944138308, |
| "grad_norm": 3.34375, |
| "learning_rate": 9.23462564114396e-05, |
| "loss": 0.6081646919250489, |
| "memory(GiB)": 73.78, |
| "step": 265, |
| "token_acc": 0.83946592144077, |
| "train_speed(iter/s)": 0.310782 |
| }, |
| { |
| "epoch": 0.18171875657258235, |
| "grad_norm": 3.09375, |
| "learning_rate": 9.206267664155907e-05, |
| "loss": 0.6581857681274415, |
| "memory(GiB)": 73.78, |
| "step": 270, |
| "token_acc": 0.8402509652509652, |
| "train_speed(iter/s)": 0.31337 |
| }, |
| { |
| "epoch": 0.18508391873133387, |
| "grad_norm": 2.625, |
| "learning_rate": 9.177439057064683e-05, |
| "loss": 0.5923350334167481, |
| "memory(GiB)": 74.6, |
| "step": 275, |
| "token_acc": 0.8534911648653285, |
| "train_speed(iter/s)": 0.315773 |
| }, |
| { |
| "epoch": 0.18844908089008539, |
| "grad_norm": 3.640625, |
| "learning_rate": 9.14814304544018e-05, |
| "loss": 0.636703634262085, |
| "memory(GiB)": 74.6, |
| "step": 280, |
| "token_acc": 0.8385491895361901, |
| "train_speed(iter/s)": 0.318259 |
| }, |
| { |
| "epoch": 0.1918142430488369, |
| "grad_norm": 3.109375, |
| "learning_rate": 9.118382907149165e-05, |
| "loss": 0.6206272125244141, |
| "memory(GiB)": 74.6, |
| "step": 285, |
| "token_acc": 0.8444787644787645, |
| "train_speed(iter/s)": 0.320478 |
| }, |
| { |
| "epoch": 0.19517940520758845, |
| "grad_norm": 2.546875, |
| "learning_rate": 9.088161971988516e-05, |
| "loss": 0.6622869491577148, |
| "memory(GiB)": 74.6, |
| "step": 290, |
| "token_acc": 0.8280620155038759, |
| "train_speed(iter/s)": 0.322612 |
| }, |
| { |
| "epoch": 0.19854456736633996, |
| "grad_norm": 2.546875, |
| "learning_rate": 9.057483621312671e-05, |
| "loss": 0.5659195899963378, |
| "memory(GiB)": 74.6, |
| "step": 295, |
| "token_acc": 0.8537543198240654, |
| "train_speed(iter/s)": 0.324748 |
| }, |
| { |
| "epoch": 0.20190972952509148, |
| "grad_norm": 2.78125, |
| "learning_rate": 9.026351287655294e-05, |
| "loss": 0.576479721069336, |
| "memory(GiB)": 74.6, |
| "step": 300, |
| "token_acc": 0.8472282845918813, |
| "train_speed(iter/s)": 0.326837 |
| }, |
| { |
| "epoch": 0.20190972952509148, |
| "eval_loss": 0.6099406480789185, |
| "eval_runtime": 6.5229, |
| "eval_samples_per_second": 36.793, |
| "eval_steps_per_second": 36.793, |
| "eval_token_acc": 0.8406448202959831, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.20527489168384302, |
| "grad_norm": 2.828125, |
| "learning_rate": 8.994768454345206e-05, |
| "loss": 0.6150260448455811, |
| "memory(GiB)": 74.6, |
| "step": 305, |
| "token_acc": 0.8427843137254902, |
| "train_speed(iter/s)": 0.30445 |
| }, |
| { |
| "epoch": 0.20864005384259454, |
| "grad_norm": 3.140625, |
| "learning_rate": 8.962738655116658e-05, |
| "loss": 0.6955391883850097, |
| "memory(GiB)": 74.6, |
| "step": 310, |
| "token_acc": 0.8351194121249235, |
| "train_speed(iter/s)": 0.306547 |
| }, |
| { |
| "epoch": 0.21200521600134606, |
| "grad_norm": 3.375, |
| "learning_rate": 8.930265473713938e-05, |
| "loss": 0.5725995063781738, |
| "memory(GiB)": 74.6, |
| "step": 315, |
| "token_acc": 0.854539641943734, |
| "train_speed(iter/s)": 0.308729 |
| }, |
| { |
| "epoch": 0.2153703781600976, |
| "grad_norm": 2.78125, |
| "learning_rate": 8.897352543490395e-05, |
| "loss": 0.5337778568267822, |
| "memory(GiB)": 74.6, |
| "step": 320, |
| "token_acc": 0.856396866840731, |
| "train_speed(iter/s)": 0.310883 |
| }, |
| { |
| "epoch": 0.21873554031884912, |
| "grad_norm": 3.0625, |
| "learning_rate": 8.864003547001915e-05, |
| "loss": 0.6348609447479248, |
| "memory(GiB)": 75.82, |
| "step": 325, |
| "token_acc": 0.8423275457531675, |
| "train_speed(iter/s)": 0.312767 |
| }, |
| { |
| "epoch": 0.22210070247760064, |
| "grad_norm": 3.09375, |
| "learning_rate": 8.83022221559489e-05, |
| "loss": 0.6023256778717041, |
| "memory(GiB)": 75.82, |
| "step": 330, |
| "token_acc": 0.8492975734355045, |
| "train_speed(iter/s)": 0.314836 |
| }, |
| { |
| "epoch": 0.22546586463635215, |
| "grad_norm": 3.171875, |
| "learning_rate": 8.796012328988716e-05, |
| "loss": 0.7502017498016358, |
| "memory(GiB)": 75.82, |
| "step": 335, |
| "token_acc": 0.8177310293012773, |
| "train_speed(iter/s)": 0.316755 |
| }, |
| { |
| "epoch": 0.2288310267951037, |
| "grad_norm": 3.125, |
| "learning_rate": 8.761377714852899e-05, |
| "loss": 0.5663125038146972, |
| "memory(GiB)": 75.82, |
| "step": 340, |
| "token_acc": 0.8523967726625534, |
| "train_speed(iter/s)": 0.318692 |
| }, |
| { |
| "epoch": 0.23219618895385521, |
| "grad_norm": 2.6875, |
| "learning_rate": 8.726322248378775e-05, |
| "loss": 0.6224043846130372, |
| "memory(GiB)": 75.82, |
| "step": 345, |
| "token_acc": 0.8415334471519479, |
| "train_speed(iter/s)": 0.32067 |
| }, |
| { |
| "epoch": 0.23556135111260673, |
| "grad_norm": 3.0625, |
| "learning_rate": 8.690849851845933e-05, |
| "loss": 0.6304502010345459, |
| "memory(GiB)": 75.82, |
| "step": 350, |
| "token_acc": 0.8429706005294691, |
| "train_speed(iter/s)": 0.322382 |
| }, |
| { |
| "epoch": 0.23556135111260673, |
| "eval_loss": 0.584740936756134, |
| "eval_runtime": 6.3513, |
| "eval_samples_per_second": 37.787, |
| "eval_steps_per_second": 37.787, |
| "eval_token_acc": 0.8461945031712473, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.23892651327135828, |
| "grad_norm": 3.65625, |
| "learning_rate": 8.654964494183358e-05, |
| "loss": 0.6737657070159913, |
| "memory(GiB)": 75.82, |
| "step": 355, |
| "token_acc": 0.8434381584701626, |
| "train_speed(iter/s)": 0.303665 |
| }, |
| { |
| "epoch": 0.2422916754301098, |
| "grad_norm": 2.765625, |
| "learning_rate": 8.618670190525352e-05, |
| "loss": 0.6179668426513671, |
| "memory(GiB)": 75.82, |
| "step": 360, |
| "token_acc": 0.8392533779077866, |
| "train_speed(iter/s)": 0.305585 |
| }, |
| { |
| "epoch": 0.2456568375888613, |
| "grad_norm": 3.15625, |
| "learning_rate": 8.581971001762286e-05, |
| "loss": 0.626660680770874, |
| "memory(GiB)": 75.82, |
| "step": 365, |
| "token_acc": 0.8430995837335895, |
| "train_speed(iter/s)": 0.307436 |
| }, |
| { |
| "epoch": 0.24902199974761283, |
| "grad_norm": 3.265625, |
| "learning_rate": 8.54487103408625e-05, |
| "loss": 0.546476411819458, |
| "memory(GiB)": 75.82, |
| "step": 370, |
| "token_acc": 0.856482219741668, |
| "train_speed(iter/s)": 0.309304 |
| }, |
| { |
| "epoch": 0.25238716190636434, |
| "grad_norm": 3.09375, |
| "learning_rate": 8.507374438531607e-05, |
| "loss": 0.6368942260742188, |
| "memory(GiB)": 75.82, |
| "step": 375, |
| "token_acc": 0.8362845604224914, |
| "train_speed(iter/s)": 0.311038 |
| }, |
| { |
| "epoch": 0.2557523240651159, |
| "grad_norm": 2.921875, |
| "learning_rate": 8.469485410510545e-05, |
| "loss": 0.6205560684204101, |
| "memory(GiB)": 75.82, |
| "step": 380, |
| "token_acc": 0.8416890480453596, |
| "train_speed(iter/s)": 0.312832 |
| }, |
| { |
| "epoch": 0.25911748622386743, |
| "grad_norm": 3.1875, |
| "learning_rate": 8.43120818934367e-05, |
| "loss": 0.5687759399414063, |
| "memory(GiB)": 75.82, |
| "step": 385, |
| "token_acc": 0.8516474854169951, |
| "train_speed(iter/s)": 0.314546 |
| }, |
| { |
| "epoch": 0.2624826483826189, |
| "grad_norm": 3.375, |
| "learning_rate": 8.392547057785661e-05, |
| "loss": 0.6561696529388428, |
| "memory(GiB)": 75.82, |
| "step": 390, |
| "token_acc": 0.8435968137254902, |
| "train_speed(iter/s)": 0.316291 |
| }, |
| { |
| "epoch": 0.26584781054137047, |
| "grad_norm": 2.84375, |
| "learning_rate": 8.353506341546104e-05, |
| "loss": 0.6340418815612793, |
| "memory(GiB)": 75.82, |
| "step": 395, |
| "token_acc": 0.8422232182877634, |
| "train_speed(iter/s)": 0.317983 |
| }, |
| { |
| "epoch": 0.269212972700122, |
| "grad_norm": 2.921875, |
| "learning_rate": 8.314090408805482e-05, |
| "loss": 0.5887197017669678, |
| "memory(GiB)": 75.82, |
| "step": 400, |
| "token_acc": 0.8538324420677362, |
| "train_speed(iter/s)": 0.319589 |
| }, |
| { |
| "epoch": 0.269212972700122, |
| "eval_loss": 0.5566386580467224, |
| "eval_runtime": 6.2654, |
| "eval_samples_per_second": 38.306, |
| "eval_steps_per_second": 38.306, |
| "eval_token_acc": 0.8535412262156448, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.2725781348588735, |
| "grad_norm": 3.828125, |
| "learning_rate": 8.274303669726426e-05, |
| "loss": 0.5180852890014649, |
| "memory(GiB)": 75.82, |
| "step": 405, |
| "token_acc": 0.8568248957953948, |
| "train_speed(iter/s)": 0.303898 |
| }, |
| { |
| "epoch": 0.27594329701762504, |
| "grad_norm": 3.171875, |
| "learning_rate": 8.234150575960288e-05, |
| "loss": 0.6065554618835449, |
| "memory(GiB)": 75.82, |
| "step": 410, |
| "token_acc": 0.8468217054263566, |
| "train_speed(iter/s)": 0.305556 |
| }, |
| { |
| "epoch": 0.2793084591763766, |
| "grad_norm": 2.078125, |
| "learning_rate": 8.19363562014904e-05, |
| "loss": 0.563250207901001, |
| "memory(GiB)": 75.82, |
| "step": 415, |
| "token_acc": 0.8505627962085308, |
| "train_speed(iter/s)": 0.307215 |
| }, |
| { |
| "epoch": 0.2826736213351281, |
| "grad_norm": 3.203125, |
| "learning_rate": 8.152763335422613e-05, |
| "loss": 0.5627524375915527, |
| "memory(GiB)": 75.82, |
| "step": 420, |
| "token_acc": 0.8593019632284201, |
| "train_speed(iter/s)": 0.308773 |
| }, |
| { |
| "epoch": 0.2860387834938796, |
| "grad_norm": 2.390625, |
| "learning_rate": 8.111538294891684e-05, |
| "loss": 0.5277237892150879, |
| "memory(GiB)": 75.82, |
| "step": 425, |
| "token_acc": 0.8635786802030457, |
| "train_speed(iter/s)": 0.310337 |
| }, |
| { |
| "epoch": 0.2894039456526311, |
| "grad_norm": 2.796875, |
| "learning_rate": 8.06996511113601e-05, |
| "loss": 0.610354232788086, |
| "memory(GiB)": 75.82, |
| "step": 430, |
| "token_acc": 0.842520795150148, |
| "train_speed(iter/s)": 0.311764 |
| }, |
| { |
| "epoch": 0.29276910781138266, |
| "grad_norm": 4.3125, |
| "learning_rate": 8.028048435688333e-05, |
| "loss": 0.5220999717712402, |
| "memory(GiB)": 75.82, |
| "step": 435, |
| "token_acc": 0.866747609652451, |
| "train_speed(iter/s)": 0.31318 |
| }, |
| { |
| "epoch": 0.2961342699701342, |
| "grad_norm": 2.890625, |
| "learning_rate": 7.985792958513931e-05, |
| "loss": 0.6387944698333741, |
| "memory(GiB)": 75.82, |
| "step": 440, |
| "token_acc": 0.8430452550325412, |
| "train_speed(iter/s)": 0.314765 |
| }, |
| { |
| "epoch": 0.2994994321288857, |
| "grad_norm": 2.265625, |
| "learning_rate": 7.943203407485864e-05, |
| "loss": 0.44078569412231444, |
| "memory(GiB)": 75.82, |
| "step": 445, |
| "token_acc": 0.8829911533967618, |
| "train_speed(iter/s)": 0.316117 |
| }, |
| { |
| "epoch": 0.30286459428763723, |
| "grad_norm": 2.34375, |
| "learning_rate": 7.900284547855991e-05, |
| "loss": 0.5065964698791504, |
| "memory(GiB)": 75.82, |
| "step": 450, |
| "token_acc": 0.8605196982397317, |
| "train_speed(iter/s)": 0.317607 |
| }, |
| { |
| "epoch": 0.30286459428763723, |
| "eval_loss": 0.5302485823631287, |
| "eval_runtime": 6.3503, |
| "eval_samples_per_second": 37.794, |
| "eval_steps_per_second": 37.794, |
| "eval_token_acc": 0.8599894291754757, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.3062297564463888, |
| "grad_norm": 2.0625, |
| "learning_rate": 7.857041181721787e-05, |
| "loss": 0.4503211975097656, |
| "memory(GiB)": 75.82, |
| "step": 455, |
| "token_acc": 0.8636938646426312, |
| "train_speed(iter/s)": 0.303058 |
| }, |
| { |
| "epoch": 0.30959491860514027, |
| "grad_norm": 3.359375, |
| "learning_rate": 7.813478147489052e-05, |
| "loss": 0.5654148578643798, |
| "memory(GiB)": 75.82, |
| "step": 460, |
| "token_acc": 0.8542982030111704, |
| "train_speed(iter/s)": 0.304484 |
| }, |
| { |
| "epoch": 0.3129600807638918, |
| "grad_norm": 2.546875, |
| "learning_rate": 7.769600319330552e-05, |
| "loss": 0.47755861282348633, |
| "memory(GiB)": 75.82, |
| "step": 465, |
| "token_acc": 0.8762720077531901, |
| "train_speed(iter/s)": 0.305941 |
| }, |
| { |
| "epoch": 0.31632524292264336, |
| "grad_norm": 2.609375, |
| "learning_rate": 7.725412606640658e-05, |
| "loss": 0.5518892288208008, |
| "memory(GiB)": 75.82, |
| "step": 470, |
| "token_acc": 0.8566623959000641, |
| "train_speed(iter/s)": 0.307401 |
| }, |
| { |
| "epoch": 0.31969040508139485, |
| "grad_norm": 3.265625, |
| "learning_rate": 7.680919953486048e-05, |
| "loss": 0.5913249492645264, |
| "memory(GiB)": 75.82, |
| "step": 475, |
| "token_acc": 0.8497417957687823, |
| "train_speed(iter/s)": 0.308848 |
| }, |
| { |
| "epoch": 0.3230555672401464, |
| "grad_norm": 2.65625, |
| "learning_rate": 7.636127338052512e-05, |
| "loss": 0.5384829044342041, |
| "memory(GiB)": 75.82, |
| "step": 480, |
| "token_acc": 0.8643092105263158, |
| "train_speed(iter/s)": 0.31026 |
| }, |
| { |
| "epoch": 0.32642072939889794, |
| "grad_norm": 3.265625, |
| "learning_rate": 7.591039772087977e-05, |
| "loss": 0.5349913120269776, |
| "memory(GiB)": 75.82, |
| "step": 485, |
| "token_acc": 0.8635131063573366, |
| "train_speed(iter/s)": 0.311583 |
| }, |
| { |
| "epoch": 0.3297858915576494, |
| "grad_norm": 3.046875, |
| "learning_rate": 7.545662300341736e-05, |
| "loss": 0.48796830177307127, |
| "memory(GiB)": 75.82, |
| "step": 490, |
| "token_acc": 0.8717186726102031, |
| "train_speed(iter/s)": 0.312855 |
| }, |
| { |
| "epoch": 0.33315105371640097, |
| "grad_norm": 2.421875, |
| "learning_rate": 7.500000000000001e-05, |
| "loss": 0.5078158378601074, |
| "memory(GiB)": 75.82, |
| "step": 495, |
| "token_acc": 0.8613126649076517, |
| "train_speed(iter/s)": 0.314225 |
| }, |
| { |
| "epoch": 0.33651621587515246, |
| "grad_norm": 2.328125, |
| "learning_rate": 7.454057980117841e-05, |
| "loss": 0.484033203125, |
| "memory(GiB)": 75.82, |
| "step": 500, |
| "token_acc": 0.8767056530214425, |
| "train_speed(iter/s)": 0.315565 |
| }, |
| { |
| "epoch": 0.33651621587515246, |
| "eval_loss": 0.5204777121543884, |
| "eval_runtime": 6.2841, |
| "eval_samples_per_second": 38.192, |
| "eval_steps_per_second": 38.192, |
| "eval_token_acc": 0.8624207188160676, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.339881378033904, |
| "grad_norm": 3.25, |
| "learning_rate": 7.407841381047532e-05, |
| "loss": 0.5047823905944824, |
| "memory(GiB)": 75.82, |
| "step": 505, |
| "token_acc": 0.8646571869925139, |
| "train_speed(iter/s)": 0.303291 |
| }, |
| { |
| "epoch": 0.34324654019265555, |
| "grad_norm": 2.5, |
| "learning_rate": 7.361355373863414e-05, |
| "loss": 0.5279562950134278, |
| "memory(GiB)": 75.82, |
| "step": 510, |
| "token_acc": 0.8671359436867576, |
| "train_speed(iter/s)": 0.304627 |
| }, |
| { |
| "epoch": 0.34661170235140704, |
| "grad_norm": 2.28125, |
| "learning_rate": 7.314605159783314e-05, |
| "loss": 0.5070261001586914, |
| "memory(GiB)": 75.82, |
| "step": 515, |
| "token_acc": 0.8705286483064261, |
| "train_speed(iter/s)": 0.305896 |
| }, |
| { |
| "epoch": 0.3499768645101586, |
| "grad_norm": 2.671875, |
| "learning_rate": 7.267595969586589e-05, |
| "loss": 0.49044408798217776, |
| "memory(GiB)": 75.82, |
| "step": 520, |
| "token_acc": 0.8736138290932811, |
| "train_speed(iter/s)": 0.307204 |
| }, |
| { |
| "epoch": 0.3533420266689101, |
| "grad_norm": 2.46875, |
| "learning_rate": 7.220333063028872e-05, |
| "loss": 0.5966588497161865, |
| "memory(GiB)": 75.82, |
| "step": 525, |
| "token_acc": 0.8524286815728604, |
| "train_speed(iter/s)": 0.30845 |
| }, |
| { |
| "epoch": 0.3567071888276616, |
| "grad_norm": 2.796875, |
| "learning_rate": 7.172821728253562e-05, |
| "loss": 0.5701375007629395, |
| "memory(GiB)": 75.82, |
| "step": 530, |
| "token_acc": 0.8540377863233573, |
| "train_speed(iter/s)": 0.309712 |
| }, |
| { |
| "epoch": 0.36007235098641316, |
| "grad_norm": 2.796875, |
| "learning_rate": 7.12506728120015e-05, |
| "loss": 0.4613838195800781, |
| "memory(GiB)": 75.82, |
| "step": 535, |
| "token_acc": 0.8718804641551423, |
| "train_speed(iter/s)": 0.310968 |
| }, |
| { |
| "epoch": 0.3634375131451647, |
| "grad_norm": 2.59375, |
| "learning_rate": 7.077075065009433e-05, |
| "loss": 0.5259300708770752, |
| "memory(GiB)": 75.82, |
| "step": 540, |
| "token_acc": 0.8661874904419636, |
| "train_speed(iter/s)": 0.312118 |
| }, |
| { |
| "epoch": 0.3668026753039162, |
| "grad_norm": 2.671875, |
| "learning_rate": 7.02885044942567e-05, |
| "loss": 0.5487593173980713, |
| "memory(GiB)": 75.82, |
| "step": 545, |
| "token_acc": 0.8593700787401575, |
| "train_speed(iter/s)": 0.313405 |
| }, |
| { |
| "epoch": 0.37016783746266774, |
| "grad_norm": 2.40625, |
| "learning_rate": 6.980398830195785e-05, |
| "loss": 0.4660326957702637, |
| "memory(GiB)": 75.82, |
| "step": 550, |
| "token_acc": 0.8734921592279855, |
| "train_speed(iter/s)": 0.31456 |
| }, |
| { |
| "epoch": 0.37016783746266774, |
| "eval_loss": 0.49012547731399536, |
| "eval_runtime": 6.3431, |
| "eval_samples_per_second": 37.837, |
| "eval_steps_per_second": 37.837, |
| "eval_token_acc": 0.8673890063424947, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.3735329996214193, |
| "grad_norm": 2.5625, |
| "learning_rate": 6.931725628465643e-05, |
| "loss": 0.5410624027252198, |
| "memory(GiB)": 75.82, |
| "step": 555, |
| "token_acc": 0.8664497667672768, |
| "train_speed(iter/s)": 0.302648 |
| }, |
| { |
| "epoch": 0.37689816178017077, |
| "grad_norm": 2.328125, |
| "learning_rate": 6.882836290173493e-05, |
| "loss": 0.5354323387145996, |
| "memory(GiB)": 75.82, |
| "step": 560, |
| "token_acc": 0.8606255012028869, |
| "train_speed(iter/s)": 0.303869 |
| }, |
| { |
| "epoch": 0.3802633239389223, |
| "grad_norm": 2.390625, |
| "learning_rate": 6.833736285440632e-05, |
| "loss": 0.4386926174163818, |
| "memory(GiB)": 75.82, |
| "step": 565, |
| "token_acc": 0.8871541196475499, |
| "train_speed(iter/s)": 0.30496 |
| }, |
| { |
| "epoch": 0.3836284860976738, |
| "grad_norm": 1.6796875, |
| "learning_rate": 6.784431107959359e-05, |
| "loss": 0.5115750789642334, |
| "memory(GiB)": 75.82, |
| "step": 570, |
| "token_acc": 0.8649976962064199, |
| "train_speed(iter/s)": 0.306182 |
| }, |
| { |
| "epoch": 0.38699364825642535, |
| "grad_norm": 2.4375, |
| "learning_rate": 6.734926274378312e-05, |
| "loss": 0.48287324905395507, |
| "memory(GiB)": 75.82, |
| "step": 575, |
| "token_acc": 0.8724030754130542, |
| "train_speed(iter/s)": 0.307381 |
| }, |
| { |
| "epoch": 0.3903588104151769, |
| "grad_norm": 2.703125, |
| "learning_rate": 6.685227323685209e-05, |
| "loss": 0.5082109451293946, |
| "memory(GiB)": 75.82, |
| "step": 580, |
| "token_acc": 0.8686852331606217, |
| "train_speed(iter/s)": 0.30846 |
| }, |
| { |
| "epoch": 0.3937239725739284, |
| "grad_norm": 2.296875, |
| "learning_rate": 6.635339816587109e-05, |
| "loss": 0.46943073272705077, |
| "memory(GiB)": 75.82, |
| "step": 585, |
| "token_acc": 0.8746086412022542, |
| "train_speed(iter/s)": 0.309547 |
| }, |
| { |
| "epoch": 0.3970891347326799, |
| "grad_norm": 2.265625, |
| "learning_rate": 6.585269334888234e-05, |
| "loss": 0.4492472171783447, |
| "memory(GiB)": 75.82, |
| "step": 590, |
| "token_acc": 0.8817360438851243, |
| "train_speed(iter/s)": 0.31066 |
| }, |
| { |
| "epoch": 0.40045429689143147, |
| "grad_norm": 2.703125, |
| "learning_rate": 6.535021480865439e-05, |
| "loss": 0.4906127452850342, |
| "memory(GiB)": 75.82, |
| "step": 595, |
| "token_acc": 0.8699199748940845, |
| "train_speed(iter/s)": 0.311715 |
| }, |
| { |
| "epoch": 0.40381945905018296, |
| "grad_norm": 1.9375, |
| "learning_rate": 6.484601876641375e-05, |
| "loss": 0.4776750564575195, |
| "memory(GiB)": 75.82, |
| "step": 600, |
| "token_acc": 0.8830860534124629, |
| "train_speed(iter/s)": 0.312795 |
| }, |
| { |
| "epoch": 0.40381945905018296, |
| "eval_loss": 0.4834407567977905, |
| "eval_runtime": 6.359, |
| "eval_samples_per_second": 37.742, |
| "eval_steps_per_second": 37.742, |
| "eval_token_acc": 0.8705602536997886, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.4071846212089345, |
| "grad_norm": 2.09375, |
| "learning_rate": 6.434016163555452e-05, |
| "loss": 0.42650303840637205, |
| "memory(GiB)": 75.82, |
| "step": 605, |
| "token_acc": 0.873349786500568, |
| "train_speed(iter/s)": 0.302036 |
| }, |
| { |
| "epoch": 0.41054978336768605, |
| "grad_norm": 2.375, |
| "learning_rate": 6.383270001532635e-05, |
| "loss": 0.47733469009399415, |
| "memory(GiB)": 75.82, |
| "step": 610, |
| "token_acc": 0.8673865361903155, |
| "train_speed(iter/s)": 0.303179 |
| }, |
| { |
| "epoch": 0.41391494552643754, |
| "grad_norm": 2.3125, |
| "learning_rate": 6.332369068450174e-05, |
| "loss": 0.4712835788726807, |
| "memory(GiB)": 75.82, |
| "step": 615, |
| "token_acc": 0.8805620608899297, |
| "train_speed(iter/s)": 0.304281 |
| }, |
| { |
| "epoch": 0.4172801076851891, |
| "grad_norm": 2.625, |
| "learning_rate": 6.281319059502313e-05, |
| "loss": 0.45713419914245607, |
| "memory(GiB)": 75.82, |
| "step": 620, |
| "token_acc": 0.877295995182174, |
| "train_speed(iter/s)": 0.305295 |
| }, |
| { |
| "epoch": 0.42064526984394063, |
| "grad_norm": 2.453125, |
| "learning_rate": 6.230125686563068e-05, |
| "loss": 0.3812277317047119, |
| "memory(GiB)": 75.82, |
| "step": 625, |
| "token_acc": 0.8977045908183633, |
| "train_speed(iter/s)": 0.306432 |
| }, |
| { |
| "epoch": 0.4240104320026921, |
| "grad_norm": 2.125, |
| "learning_rate": 6.178794677547137e-05, |
| "loss": 0.48100833892822265, |
| "memory(GiB)": 75.82, |
| "step": 630, |
| "token_acc": 0.8763653633053665, |
| "train_speed(iter/s)": 0.307582 |
| }, |
| { |
| "epoch": 0.42737559416144366, |
| "grad_norm": 2.8125, |
| "learning_rate": 6.127331775769023e-05, |
| "loss": 0.42731170654296874, |
| "memory(GiB)": 75.82, |
| "step": 635, |
| "token_acc": 0.8863417762103238, |
| "train_speed(iter/s)": 0.308599 |
| }, |
| { |
| "epoch": 0.4307407563201952, |
| "grad_norm": 2.09375, |
| "learning_rate": 6.0757427393004195e-05, |
| "loss": 0.3901322603225708, |
| "memory(GiB)": 75.82, |
| "step": 640, |
| "token_acc": 0.8963465035543065, |
| "train_speed(iter/s)": 0.309577 |
| }, |
| { |
| "epoch": 0.4341059184789467, |
| "grad_norm": 2.28125, |
| "learning_rate": 6.024033340325954e-05, |
| "loss": 0.41823792457580566, |
| "memory(GiB)": 75.82, |
| "step": 645, |
| "token_acc": 0.8839628681177977, |
| "train_speed(iter/s)": 0.310568 |
| }, |
| { |
| "epoch": 0.43747108063769824, |
| "grad_norm": 2.625, |
| "learning_rate": 5.9722093644973546e-05, |
| "loss": 0.45659918785095216, |
| "memory(GiB)": 75.82, |
| "step": 650, |
| "token_acc": 0.8752002563281, |
| "train_speed(iter/s)": 0.311578 |
| }, |
| { |
| "epoch": 0.43747108063769824, |
| "eval_loss": 0.4680798649787903, |
| "eval_runtime": 6.359, |
| "eval_samples_per_second": 37.742, |
| "eval_steps_per_second": 37.742, |
| "eval_token_acc": 0.8731501057082452, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.44083624279644973, |
| "grad_norm": 2.34375, |
| "learning_rate": 5.920276610286102e-05, |
| "loss": 0.45874710083007814, |
| "memory(GiB)": 75.82, |
| "step": 655, |
| "token_acc": 0.874843798812871, |
| "train_speed(iter/s)": 0.303426 |
| }, |
| { |
| "epoch": 0.4442014049552013, |
| "grad_norm": 1.890625, |
| "learning_rate": 5.868240888334653e-05, |
| "loss": 0.40706768035888674, |
| "memory(GiB)": 75.82, |
| "step": 660, |
| "token_acc": 0.8987694831829368, |
| "train_speed(iter/s)": 0.304386 |
| }, |
| { |
| "epoch": 0.4475665671139528, |
| "grad_norm": 2.3125, |
| "learning_rate": 5.816108020806297e-05, |
| "loss": 0.4790656566619873, |
| "memory(GiB)": 75.82, |
| "step": 665, |
| "token_acc": 0.8695376820772641, |
| "train_speed(iter/s)": 0.30531 |
| }, |
| { |
| "epoch": 0.4509317292727043, |
| "grad_norm": 2.625, |
| "learning_rate": 5.763883840733736e-05, |
| "loss": 0.4840695858001709, |
| "memory(GiB)": 75.82, |
| "step": 670, |
| "token_acc": 0.8794635643884311, |
| "train_speed(iter/s)": 0.306292 |
| }, |
| { |
| "epoch": 0.45429689143145585, |
| "grad_norm": 2.703125, |
| "learning_rate": 5.7115741913664264e-05, |
| "loss": 0.4588040351867676, |
| "memory(GiB)": 75.82, |
| "step": 675, |
| "token_acc": 0.8740804106073568, |
| "train_speed(iter/s)": 0.307251 |
| }, |
| { |
| "epoch": 0.4576620535902074, |
| "grad_norm": 2.3125, |
| "learning_rate": 5.6591849255168015e-05, |
| "loss": 0.39728033542633057, |
| "memory(GiB)": 75.82, |
| "step": 680, |
| "token_acc": 0.893990116371752, |
| "train_speed(iter/s)": 0.308265 |
| }, |
| { |
| "epoch": 0.4610272157489589, |
| "grad_norm": 2.1875, |
| "learning_rate": 5.60672190490541e-05, |
| "loss": 0.422639799118042, |
| "memory(GiB)": 75.82, |
| "step": 685, |
| "token_acc": 0.8848245180425112, |
| "train_speed(iter/s)": 0.309233 |
| }, |
| { |
| "epoch": 0.46439237790771043, |
| "grad_norm": 2.265625, |
| "learning_rate": 5.5541909995050554e-05, |
| "loss": 0.39331207275390623, |
| "memory(GiB)": 75.82, |
| "step": 690, |
| "token_acc": 0.8947537301459971, |
| "train_speed(iter/s)": 0.310207 |
| }, |
| { |
| "epoch": 0.467757540066462, |
| "grad_norm": 2.21875, |
| "learning_rate": 5.501598086884025e-05, |
| "loss": 0.43639063835144043, |
| "memory(GiB)": 75.82, |
| "step": 695, |
| "token_acc": 0.8884950048340315, |
| "train_speed(iter/s)": 0.311107 |
| }, |
| { |
| "epoch": 0.47112270222521346, |
| "grad_norm": 2.375, |
| "learning_rate": 5.448949051548459e-05, |
| "loss": 0.413299560546875, |
| "memory(GiB)": 75.82, |
| "step": 700, |
| "token_acc": 0.8879505353641984, |
| "train_speed(iter/s)": 0.312013 |
| }, |
| { |
| "epoch": 0.47112270222521346, |
| "eval_loss": 0.4406000077724457, |
| "eval_runtime": 6.3948, |
| "eval_samples_per_second": 37.531, |
| "eval_steps_per_second": 37.531, |
| "eval_token_acc": 0.8806025369978858, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.474487864383965, |
| "grad_norm": 2.421875, |
| "learning_rate": 5.396249784283942e-05, |
| "loss": 0.43725104331970216, |
| "memory(GiB)": 75.82, |
| "step": 705, |
| "token_acc": 0.8810175054704595, |
| "train_speed(iter/s)": 0.304357 |
| }, |
| { |
| "epoch": 0.47785302654271655, |
| "grad_norm": 2.34375, |
| "learning_rate": 5.343506181496405e-05, |
| "loss": 0.41141476631164553, |
| "memory(GiB)": 75.82, |
| "step": 710, |
| "token_acc": 0.8912708204811844, |
| "train_speed(iter/s)": 0.305281 |
| }, |
| { |
| "epoch": 0.48121818870146804, |
| "grad_norm": 2.609375, |
| "learning_rate": 5.290724144552379e-05, |
| "loss": 0.5024977684020996, |
| "memory(GiB)": 75.82, |
| "step": 715, |
| "token_acc": 0.8692831144168381, |
| "train_speed(iter/s)": 0.306216 |
| }, |
| { |
| "epoch": 0.4845833508602196, |
| "grad_norm": 1.8828125, |
| "learning_rate": 5.2379095791187124e-05, |
| "loss": 0.37138142585754397, |
| "memory(GiB)": 75.82, |
| "step": 720, |
| "token_acc": 0.8965241069998399, |
| "train_speed(iter/s)": 0.307078 |
| }, |
| { |
| "epoch": 0.4879485130189711, |
| "grad_norm": 2.484375, |
| "learning_rate": 5.185068394501791e-05, |
| "loss": 0.46549081802368164, |
| "memory(GiB)": 75.82, |
| "step": 725, |
| "token_acc": 0.8741355463347165, |
| "train_speed(iter/s)": 0.308012 |
| }, |
| { |
| "epoch": 0.4913136751777226, |
| "grad_norm": 2.828125, |
| "learning_rate": 5.132206502986368e-05, |
| "loss": 0.5339263916015625, |
| "memory(GiB)": 75.82, |
| "step": 730, |
| "token_acc": 0.8623881049916553, |
| "train_speed(iter/s)": 0.308868 |
| }, |
| { |
| "epoch": 0.49467883733647416, |
| "grad_norm": 2.5625, |
| "learning_rate": 5.0793298191740404e-05, |
| "loss": 0.4308777809143066, |
| "memory(GiB)": 75.82, |
| "step": 735, |
| "token_acc": 0.8834385624089364, |
| "train_speed(iter/s)": 0.309717 |
| }, |
| { |
| "epoch": 0.49804399949522565, |
| "grad_norm": 3.0, |
| "learning_rate": 5.026444259321489e-05, |
| "loss": 0.3827210903167725, |
| "memory(GiB)": 75.82, |
| "step": 740, |
| "token_acc": 0.8980582524271845, |
| "train_speed(iter/s)": 0.310525 |
| }, |
| { |
| "epoch": 0.5014091616539772, |
| "grad_norm": 2.1875, |
| "learning_rate": 4.973555740678511e-05, |
| "loss": 0.4466721534729004, |
| "memory(GiB)": 75.82, |
| "step": 745, |
| "token_acc": 0.8842266462480858, |
| "train_speed(iter/s)": 0.311445 |
| }, |
| { |
| "epoch": 0.5047743238127287, |
| "grad_norm": 2.765625, |
| "learning_rate": 4.92067018082596e-05, |
| "loss": 0.5502868175506592, |
| "memory(GiB)": 75.82, |
| "step": 750, |
| "token_acc": 0.8586033117350612, |
| "train_speed(iter/s)": 0.312324 |
| }, |
| { |
| "epoch": 0.5047743238127287, |
| "eval_loss": 0.42037147283554077, |
| "eval_runtime": 6.3843, |
| "eval_samples_per_second": 37.592, |
| "eval_steps_per_second": 37.592, |
| "eval_token_acc": 0.8845665961945032, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.5081394859714803, |
| "grad_norm": 2.125, |
| "learning_rate": 4.8677934970136335e-05, |
| "loss": 0.5509189128875732, |
| "memory(GiB)": 75.82, |
| "step": 755, |
| "token_acc": 0.8773990147783252, |
| "train_speed(iter/s)": 0.304846 |
| }, |
| { |
| "epoch": 0.5115046481302318, |
| "grad_norm": 1.8203125, |
| "learning_rate": 4.8149316054982095e-05, |
| "loss": 0.392488431930542, |
| "memory(GiB)": 75.82, |
| "step": 760, |
| "token_acc": 0.8881675052751177, |
| "train_speed(iter/s)": 0.305699 |
| }, |
| { |
| "epoch": 0.5148698102889833, |
| "grad_norm": 2.125, |
| "learning_rate": 4.762090420881289e-05, |
| "loss": 0.34769492149353026, |
| "memory(GiB)": 75.82, |
| "step": 765, |
| "token_acc": 0.904643578195372, |
| "train_speed(iter/s)": 0.306491 |
| }, |
| { |
| "epoch": 0.5182349724477349, |
| "grad_norm": 1.9453125, |
| "learning_rate": 4.709275855447621e-05, |
| "loss": 0.34389894008636473, |
| "memory(GiB)": 75.82, |
| "step": 770, |
| "token_acc": 0.9075886411038023, |
| "train_speed(iter/s)": 0.307358 |
| }, |
| { |
| "epoch": 0.5216001346064864, |
| "grad_norm": 2.234375, |
| "learning_rate": 4.6564938185035956e-05, |
| "loss": 0.3195344924926758, |
| "memory(GiB)": 75.82, |
| "step": 775, |
| "token_acc": 0.9082976621666118, |
| "train_speed(iter/s)": 0.308159 |
| }, |
| { |
| "epoch": 0.5249652967652378, |
| "grad_norm": 2.21875, |
| "learning_rate": 4.603750215716057e-05, |
| "loss": 0.38263275623321535, |
| "memory(GiB)": 75.82, |
| "step": 780, |
| "token_acc": 0.8930994539136191, |
| "train_speed(iter/s)": 0.30896 |
| }, |
| { |
| "epoch": 0.5283304589239894, |
| "grad_norm": 2.4375, |
| "learning_rate": 4.551050948451542e-05, |
| "loss": 0.4862419605255127, |
| "memory(GiB)": 75.82, |
| "step": 785, |
| "token_acc": 0.8823529411764706, |
| "train_speed(iter/s)": 0.309802 |
| }, |
| { |
| "epoch": 0.5316956210827409, |
| "grad_norm": 2.359375, |
| "learning_rate": 4.498401913115975e-05, |
| "loss": 0.46417646408081054, |
| "memory(GiB)": 75.82, |
| "step": 790, |
| "token_acc": 0.8798179059180576, |
| "train_speed(iter/s)": 0.310657 |
| }, |
| { |
| "epoch": 0.5350607832414924, |
| "grad_norm": 2.03125, |
| "learning_rate": 4.445809000494946e-05, |
| "loss": 0.5157633304595948, |
| "memory(GiB)": 75.82, |
| "step": 795, |
| "token_acc": 0.8697006636868482, |
| "train_speed(iter/s)": 0.311416 |
| }, |
| { |
| "epoch": 0.538425945400244, |
| "grad_norm": 2.28125, |
| "learning_rate": 4.393278095094591e-05, |
| "loss": 0.36940808296203614, |
| "memory(GiB)": 75.82, |
| "step": 800, |
| "token_acc": 0.8981329839502129, |
| "train_speed(iter/s)": 0.312237 |
| }, |
| { |
| "epoch": 0.538425945400244, |
| "eval_loss": 0.3978128135204315, |
| "eval_runtime": 6.3728, |
| "eval_samples_per_second": 37.66, |
| "eval_steps_per_second": 37.66, |
| "eval_token_acc": 0.8903276955602537, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.5417911075589955, |
| "grad_norm": 1.9453125, |
| "learning_rate": 4.340815074483199e-05, |
| "loss": 0.34389219284057615, |
| "memory(GiB)": 75.82, |
| "step": 805, |
| "token_acc": 0.8938571824626718, |
| "train_speed(iter/s)": 0.305594 |
| }, |
| { |
| "epoch": 0.545156269717747, |
| "grad_norm": 2.109375, |
| "learning_rate": 4.288425808633575e-05, |
| "loss": 0.31910243034362795, |
| "memory(GiB)": 75.82, |
| "step": 810, |
| "token_acc": 0.9097661188369153, |
| "train_speed(iter/s)": 0.306364 |
| }, |
| { |
| "epoch": 0.5485214318764986, |
| "grad_norm": 2.109375, |
| "learning_rate": 4.236116159266265e-05, |
| "loss": 0.3916430950164795, |
| "memory(GiB)": 75.82, |
| "step": 815, |
| "token_acc": 0.8918385922330098, |
| "train_speed(iter/s)": 0.307181 |
| }, |
| { |
| "epoch": 0.5518865940352501, |
| "grad_norm": 2.265625, |
| "learning_rate": 4.1838919791937034e-05, |
| "loss": 0.38680903911590575, |
| "memory(GiB)": 75.82, |
| "step": 820, |
| "token_acc": 0.8982239382239382, |
| "train_speed(iter/s)": 0.307962 |
| }, |
| { |
| "epoch": 0.5552517561940016, |
| "grad_norm": 3.09375, |
| "learning_rate": 4.131759111665349e-05, |
| "loss": 0.39566311836242674, |
| "memory(GiB)": 75.82, |
| "step": 825, |
| "token_acc": 0.8838128359152703, |
| "train_speed(iter/s)": 0.308723 |
| }, |
| { |
| "epoch": 0.5586169183527532, |
| "grad_norm": 2.609375, |
| "learning_rate": 4.0797233897138985e-05, |
| "loss": 0.3857170820236206, |
| "memory(GiB)": 75.82, |
| "step": 830, |
| "token_acc": 0.8954257979114576, |
| "train_speed(iter/s)": 0.309512 |
| }, |
| { |
| "epoch": 0.5619820805115047, |
| "grad_norm": 2.109375, |
| "learning_rate": 4.027790635502646e-05, |
| "loss": 0.4497522354125977, |
| "memory(GiB)": 75.82, |
| "step": 835, |
| "token_acc": 0.8830426939266386, |
| "train_speed(iter/s)": 0.310275 |
| }, |
| { |
| "epoch": 0.5653472426702562, |
| "grad_norm": 2.484375, |
| "learning_rate": 3.9759666596740476e-05, |
| "loss": 0.36929664611816404, |
| "memory(GiB)": 75.82, |
| "step": 840, |
| "token_acc": 0.9050632911392406, |
| "train_speed(iter/s)": 0.31104 |
| }, |
| { |
| "epoch": 0.5687124048290076, |
| "grad_norm": 2.828125, |
| "learning_rate": 3.924257260699583e-05, |
| "loss": 0.4202712535858154, |
| "memory(GiB)": 75.82, |
| "step": 845, |
| "token_acc": 0.8854133418448771, |
| "train_speed(iter/s)": 0.311816 |
| }, |
| { |
| "epoch": 0.5720775669877592, |
| "grad_norm": 1.875, |
| "learning_rate": 3.8726682242309794e-05, |
| "loss": 0.3440741777420044, |
| "memory(GiB)": 75.82, |
| "step": 850, |
| "token_acc": 0.9036319612590799, |
| "train_speed(iter/s)": 0.312528 |
| }, |
| { |
| "epoch": 0.5720775669877592, |
| "eval_loss": 0.38969123363494873, |
| "eval_runtime": 6.3806, |
| "eval_samples_per_second": 37.614, |
| "eval_steps_per_second": 37.614, |
| "eval_token_acc": 0.8914376321353066, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.5754427291465107, |
| "grad_norm": 2.25, |
| "learning_rate": 3.821205322452863e-05, |
| "loss": 0.3591428756713867, |
| "memory(GiB)": 75.82, |
| "step": 855, |
| "token_acc": 0.8932054420676646, |
| "train_speed(iter/s)": 0.30595 |
| }, |
| { |
| "epoch": 0.5788078913052622, |
| "grad_norm": 2.109375, |
| "learning_rate": 3.769874313436933e-05, |
| "loss": 0.4184281349182129, |
| "memory(GiB)": 75.82, |
| "step": 860, |
| "token_acc": 0.8862865449846551, |
| "train_speed(iter/s)": 0.306767 |
| }, |
| { |
| "epoch": 0.5821730534640138, |
| "grad_norm": 1.890625, |
| "learning_rate": 3.718680940497687e-05, |
| "loss": 0.4002545833587646, |
| "memory(GiB)": 75.82, |
| "step": 865, |
| "token_acc": 0.896395693555937, |
| "train_speed(iter/s)": 0.307516 |
| }, |
| { |
| "epoch": 0.5855382156227653, |
| "grad_norm": 2.71875, |
| "learning_rate": 3.6676309315498256e-05, |
| "loss": 0.43825640678405764, |
| "memory(GiB)": 75.82, |
| "step": 870, |
| "token_acc": 0.8854260764829871, |
| "train_speed(iter/s)": 0.308318 |
| }, |
| { |
| "epoch": 0.5889033777815168, |
| "grad_norm": 2.109375, |
| "learning_rate": 3.616729998467365e-05, |
| "loss": 0.4464766025543213, |
| "memory(GiB)": 75.82, |
| "step": 875, |
| "token_acc": 0.8829486224869695, |
| "train_speed(iter/s)": 0.309057 |
| }, |
| { |
| "epoch": 0.5922685399402684, |
| "grad_norm": 2.328125, |
| "learning_rate": 3.5659838364445505e-05, |
| "loss": 0.3885576486587524, |
| "memory(GiB)": 75.82, |
| "step": 880, |
| "token_acc": 0.8979990239141045, |
| "train_speed(iter/s)": 0.309821 |
| }, |
| { |
| "epoch": 0.5956337020990199, |
| "grad_norm": 2.359375, |
| "learning_rate": 3.515398123358627e-05, |
| "loss": 0.29079430103302, |
| "memory(GiB)": 75.82, |
| "step": 885, |
| "token_acc": 0.9171314741035856, |
| "train_speed(iter/s)": 0.310587 |
| }, |
| { |
| "epoch": 0.5989988642577714, |
| "grad_norm": 1.8984375, |
| "learning_rate": 3.464978519134561e-05, |
| "loss": 0.35250732898712156, |
| "memory(GiB)": 75.82, |
| "step": 890, |
| "token_acc": 0.9011910753229324, |
| "train_speed(iter/s)": 0.311324 |
| }, |
| { |
| "epoch": 0.602364026416523, |
| "grad_norm": 2.796875, |
| "learning_rate": 3.414730665111766e-05, |
| "loss": 0.5763841152191163, |
| "memory(GiB)": 75.82, |
| "step": 895, |
| "token_acc": 0.850735294117647, |
| "train_speed(iter/s)": 0.312078 |
| }, |
| { |
| "epoch": 0.6057291885752745, |
| "grad_norm": 1.8125, |
| "learning_rate": 3.364660183412892e-05, |
| "loss": 0.3474919319152832, |
| "memory(GiB)": 75.82, |
| "step": 900, |
| "token_acc": 0.9043635170603674, |
| "train_speed(iter/s)": 0.312842 |
| }, |
| { |
| "epoch": 0.6057291885752745, |
| "eval_loss": 0.3748236298561096, |
| "eval_runtime": 6.318, |
| "eval_samples_per_second": 37.987, |
| "eval_steps_per_second": 37.987, |
| "eval_token_acc": 0.896247357293869, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.609094350734026, |
| "grad_norm": 2.453125, |
| "learning_rate": 3.314772676314791e-05, |
| "loss": 0.3573255777359009, |
| "memory(GiB)": 75.82, |
| "step": 905, |
| "token_acc": 0.8959614555995142, |
| "train_speed(iter/s)": 0.306732 |
| }, |
| { |
| "epoch": 0.6124595128927776, |
| "grad_norm": 2.671875, |
| "learning_rate": 3.2650737256216886e-05, |
| "loss": 0.35031719207763673, |
| "memory(GiB)": 75.82, |
| "step": 910, |
| "token_acc": 0.901930971567584, |
| "train_speed(iter/s)": 0.307389 |
| }, |
| { |
| "epoch": 0.615824675051529, |
| "grad_norm": 2.359375, |
| "learning_rate": 3.215568892040641e-05, |
| "loss": 0.42179441452026367, |
| "memory(GiB)": 75.82, |
| "step": 915, |
| "token_acc": 0.8914620966496835, |
| "train_speed(iter/s)": 0.308116 |
| }, |
| { |
| "epoch": 0.6191898372102805, |
| "grad_norm": 1.9375, |
| "learning_rate": 3.16626371455937e-05, |
| "loss": 0.3349519968032837, |
| "memory(GiB)": 75.82, |
| "step": 920, |
| "token_acc": 0.9102564102564102, |
| "train_speed(iter/s)": 0.3088 |
| }, |
| { |
| "epoch": 0.6225549993690321, |
| "grad_norm": 2.265625, |
| "learning_rate": 3.1171637098265064e-05, |
| "loss": 0.3286914587020874, |
| "memory(GiB)": 75.82, |
| "step": 925, |
| "token_acc": 0.9120385232744783, |
| "train_speed(iter/s)": 0.309527 |
| }, |
| { |
| "epoch": 0.6259201615277836, |
| "grad_norm": 2.078125, |
| "learning_rate": 3.0682743715343564e-05, |
| "loss": 0.32795827388763427, |
| "memory(GiB)": 75.82, |
| "step": 930, |
| "token_acc": 0.9038667278007031, |
| "train_speed(iter/s)": 0.3102 |
| }, |
| { |
| "epoch": 0.6292853236865351, |
| "grad_norm": 1.7890625, |
| "learning_rate": 3.019601169804216e-05, |
| "loss": 0.35293123722076414, |
| "memory(GiB)": 75.82, |
| "step": 935, |
| "token_acc": 0.9023519870235198, |
| "train_speed(iter/s)": 0.310922 |
| }, |
| { |
| "epoch": 0.6326504858452867, |
| "grad_norm": 1.9609375, |
| "learning_rate": 2.9711495505743313e-05, |
| "loss": 0.2731185436248779, |
| "memory(GiB)": 75.82, |
| "step": 940, |
| "token_acc": 0.923992673992674, |
| "train_speed(iter/s)": 0.311559 |
| }, |
| { |
| "epoch": 0.6360156480040382, |
| "grad_norm": 2.25, |
| "learning_rate": 2.9229249349905684e-05, |
| "loss": 0.46314468383789065, |
| "memory(GiB)": 75.82, |
| "step": 945, |
| "token_acc": 0.8777718407694363, |
| "train_speed(iter/s)": 0.312255 |
| }, |
| { |
| "epoch": 0.6393808101627897, |
| "grad_norm": 2.109375, |
| "learning_rate": 2.8749327187998515e-05, |
| "loss": 0.3386242151260376, |
| "memory(GiB)": 75.82, |
| "step": 950, |
| "token_acc": 0.906275336468299, |
| "train_speed(iter/s)": 0.312929 |
| }, |
| { |
| "epoch": 0.6393808101627897, |
| "eval_loss": 0.36514538526535034, |
| "eval_runtime": 6.288, |
| "eval_samples_per_second": 38.168, |
| "eval_steps_per_second": 38.168, |
| "eval_token_acc": 0.9002642706131079, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.6427459723215413, |
| "grad_norm": 2.0625, |
| "learning_rate": 2.827178271746441e-05, |
| "loss": 0.3628067970275879, |
| "memory(GiB)": 75.82, |
| "step": 955, |
| "token_acc": 0.9006407689227073, |
| "train_speed(iter/s)": 0.307148 |
| }, |
| { |
| "epoch": 0.6461111344802928, |
| "grad_norm": 2.171875, |
| "learning_rate": 2.7796669369711294e-05, |
| "loss": 0.35329625606536863, |
| "memory(GiB)": 75.82, |
| "step": 960, |
| "token_acc": 0.9036697247706422, |
| "train_speed(iter/s)": 0.307809 |
| }, |
| { |
| "epoch": 0.6494762966390443, |
| "grad_norm": 2.03125, |
| "learning_rate": 2.7324040304134123e-05, |
| "loss": 0.3087867259979248, |
| "memory(GiB)": 75.82, |
| "step": 965, |
| "token_acc": 0.915299187800431, |
| "train_speed(iter/s)": 0.30851 |
| }, |
| { |
| "epoch": 0.6528414587977959, |
| "grad_norm": 2.25, |
| "learning_rate": 2.6853948402166878e-05, |
| "loss": 0.3155955791473389, |
| "memory(GiB)": 75.82, |
| "step": 970, |
| "token_acc": 0.912190414924413, |
| "train_speed(iter/s)": 0.309237 |
| }, |
| { |
| "epoch": 0.6562066209565474, |
| "grad_norm": 2.078125, |
| "learning_rate": 2.638644626136587e-05, |
| "loss": 0.31089558601379397, |
| "memory(GiB)": 75.82, |
| "step": 975, |
| "token_acc": 0.9141963109354414, |
| "train_speed(iter/s)": 0.30992 |
| }, |
| { |
| "epoch": 0.6595717831152988, |
| "grad_norm": 2.390625, |
| "learning_rate": 2.5921586189524694e-05, |
| "loss": 0.38663172721862793, |
| "memory(GiB)": 75.82, |
| "step": 980, |
| "token_acc": 0.9041765169424744, |
| "train_speed(iter/s)": 0.310581 |
| }, |
| { |
| "epoch": 0.6629369452740504, |
| "grad_norm": 2.140625, |
| "learning_rate": 2.5459420198821605e-05, |
| "loss": 0.34139630794525144, |
| "memory(GiB)": 75.82, |
| "step": 985, |
| "token_acc": 0.908329455560726, |
| "train_speed(iter/s)": 0.311278 |
| }, |
| { |
| "epoch": 0.6663021074328019, |
| "grad_norm": 1.953125, |
| "learning_rate": 2.500000000000001e-05, |
| "loss": 0.30913376808166504, |
| "memory(GiB)": 75.82, |
| "step": 990, |
| "token_acc": 0.9187433922368222, |
| "train_speed(iter/s)": 0.311944 |
| }, |
| { |
| "epoch": 0.6696672695915534, |
| "grad_norm": 2.53125, |
| "learning_rate": 2.454337699658267e-05, |
| "loss": 0.3810436248779297, |
| "memory(GiB)": 75.82, |
| "step": 995, |
| "token_acc": 0.8920236336779911, |
| "train_speed(iter/s)": 0.312598 |
| }, |
| { |
| "epoch": 0.6730324317503049, |
| "grad_norm": 2.546875, |
| "learning_rate": 2.4089602279120222e-05, |
| "loss": 0.3346914768218994, |
| "memory(GiB)": 75.82, |
| "step": 1000, |
| "token_acc": 0.9072181670721817, |
| "train_speed(iter/s)": 0.313245 |
| }, |
| { |
| "epoch": 0.6730324317503049, |
| "eval_loss": 0.35263723134994507, |
| "eval_runtime": 6.3269, |
| "eval_samples_per_second": 37.933, |
| "eval_steps_per_second": 37.933, |
| "eval_token_acc": 0.9024841437632135, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.6763975939090565, |
| "grad_norm": 2.53125, |
| "learning_rate": 2.363872661947488e-05, |
| "loss": 0.3666444063186646, |
| "memory(GiB)": 75.82, |
| "step": 1005, |
| "token_acc": 0.901596274033677, |
| "train_speed(iter/s)": 0.307343 |
| }, |
| { |
| "epoch": 0.679762756067808, |
| "grad_norm": 1.8828125, |
| "learning_rate": 2.319080046513954e-05, |
| "loss": 0.2809803247451782, |
| "memory(GiB)": 75.82, |
| "step": 1010, |
| "token_acc": 0.9265139116202946, |
| "train_speed(iter/s)": 0.308004 |
| }, |
| { |
| "epoch": 0.6831279182265595, |
| "grad_norm": 1.734375, |
| "learning_rate": 2.274587393359342e-05, |
| "loss": 0.3413016557693481, |
| "memory(GiB)": 75.82, |
| "step": 1015, |
| "token_acc": 0.9119178921568627, |
| "train_speed(iter/s)": 0.30864 |
| }, |
| { |
| "epoch": 0.6864930803853111, |
| "grad_norm": 2.625, |
| "learning_rate": 2.2303996806694488e-05, |
| "loss": 0.378632116317749, |
| "memory(GiB)": 75.82, |
| "step": 1020, |
| "token_acc": 0.9010562286424355, |
| "train_speed(iter/s)": 0.309282 |
| }, |
| { |
| "epoch": 0.6898582425440626, |
| "grad_norm": 1.8359375, |
| "learning_rate": 2.1865218525109495e-05, |
| "loss": 0.32521207332611085, |
| "memory(GiB)": 75.82, |
| "step": 1025, |
| "token_acc": 0.9113677264547091, |
| "train_speed(iter/s)": 0.309938 |
| }, |
| { |
| "epoch": 0.6932234047028141, |
| "grad_norm": 2.125, |
| "learning_rate": 2.1429588182782144e-05, |
| "loss": 0.3491218090057373, |
| "memory(GiB)": 75.82, |
| "step": 1030, |
| "token_acc": 0.9001129578828465, |
| "train_speed(iter/s)": 0.310579 |
| }, |
| { |
| "epoch": 0.6965885668615657, |
| "grad_norm": 1.9921875, |
| "learning_rate": 2.09971545214401e-05, |
| "loss": 0.32745966911315916, |
| "memory(GiB)": 75.82, |
| "step": 1035, |
| "token_acc": 0.9061001818482394, |
| "train_speed(iter/s)": 0.311215 |
| }, |
| { |
| "epoch": 0.6999537290203172, |
| "grad_norm": 2.40625, |
| "learning_rate": 2.0567965925141363e-05, |
| "loss": 0.4002220153808594, |
| "memory(GiB)": 75.82, |
| "step": 1040, |
| "token_acc": 0.8949858088930936, |
| "train_speed(iter/s)": 0.311883 |
| }, |
| { |
| "epoch": 0.7033188911790687, |
| "grad_norm": 2.234375, |
| "learning_rate": 2.0142070414860704e-05, |
| "loss": 0.33180482387542726, |
| "memory(GiB)": 75.82, |
| "step": 1045, |
| "token_acc": 0.9061082552162081, |
| "train_speed(iter/s)": 0.31252 |
| }, |
| { |
| "epoch": 0.7066840533378203, |
| "grad_norm": 2.375, |
| "learning_rate": 1.9719515643116674e-05, |
| "loss": 0.2780953884124756, |
| "memory(GiB)": 75.82, |
| "step": 1050, |
| "token_acc": 0.9193521731945133, |
| "train_speed(iter/s)": 0.313162 |
| }, |
| { |
| "epoch": 0.7066840533378203, |
| "eval_loss": 0.3458307981491089, |
| "eval_runtime": 6.3244, |
| "eval_samples_per_second": 37.948, |
| "eval_steps_per_second": 37.948, |
| "eval_token_acc": 0.9045983086680761, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.7100492154965717, |
| "grad_norm": 1.8359375, |
| "learning_rate": 1.9300348888639914e-05, |
| "loss": 0.2843871355056763, |
| "memory(GiB)": 75.82, |
| "step": 1055, |
| "token_acc": 0.908830434955629, |
| "train_speed(iter/s)": 0.307659 |
| }, |
| { |
| "epoch": 0.7134143776553232, |
| "grad_norm": 2.265625, |
| "learning_rate": 1.888461705108318e-05, |
| "loss": 0.31880433559417726, |
| "memory(GiB)": 75.82, |
| "step": 1060, |
| "token_acc": 0.9162755488266465, |
| "train_speed(iter/s)": 0.308265 |
| }, |
| { |
| "epoch": 0.7167795398140748, |
| "grad_norm": 2.53125, |
| "learning_rate": 1.847236664577389e-05, |
| "loss": 0.3458749055862427, |
| "memory(GiB)": 75.82, |
| "step": 1065, |
| "token_acc": 0.905449976441024, |
| "train_speed(iter/s)": 0.308846 |
| }, |
| { |
| "epoch": 0.7201447019728263, |
| "grad_norm": 2.296875, |
| "learning_rate": 1.8063643798509593e-05, |
| "loss": 0.3110387325286865, |
| "memory(GiB)": 75.82, |
| "step": 1070, |
| "token_acc": 0.9163458691145988, |
| "train_speed(iter/s)": 0.309494 |
| }, |
| { |
| "epoch": 0.7235098641315778, |
| "grad_norm": 2.484375, |
| "learning_rate": 1.7658494240397126e-05, |
| "loss": 0.3040132522583008, |
| "memory(GiB)": 75.82, |
| "step": 1075, |
| "token_acc": 0.9117370892018779, |
| "train_speed(iter/s)": 0.310049 |
| }, |
| { |
| "epoch": 0.7268750262903294, |
| "grad_norm": 2.125, |
| "learning_rate": 1.725696330273575e-05, |
| "loss": 0.28740246295928956, |
| "memory(GiB)": 75.82, |
| "step": 1080, |
| "token_acc": 0.9206708975521306, |
| "train_speed(iter/s)": 0.310661 |
| }, |
| { |
| "epoch": 0.7302401884490809, |
| "grad_norm": 2.171875, |
| "learning_rate": 1.68590959119452e-05, |
| "loss": 0.3025235176086426, |
| "memory(GiB)": 75.82, |
| "step": 1085, |
| "token_acc": 0.912094861660079, |
| "train_speed(iter/s)": 0.311236 |
| }, |
| { |
| "epoch": 0.7336053506078324, |
| "grad_norm": 1.9375, |
| "learning_rate": 1.646493658453896e-05, |
| "loss": 0.3215456485748291, |
| "memory(GiB)": 75.82, |
| "step": 1090, |
| "token_acc": 0.914180252230083, |
| "train_speed(iter/s)": 0.31184 |
| }, |
| { |
| "epoch": 0.736970512766584, |
| "grad_norm": 2.609375, |
| "learning_rate": 1.60745294221434e-05, |
| "loss": 0.35790162086486815, |
| "memory(GiB)": 75.82, |
| "step": 1095, |
| "token_acc": 0.8988936693300553, |
| "train_speed(iter/s)": 0.31246 |
| }, |
| { |
| "epoch": 0.7403356749253355, |
| "grad_norm": 2.078125, |
| "learning_rate": 1.5687918106563326e-05, |
| "loss": 0.3286574840545654, |
| "memory(GiB)": 75.82, |
| "step": 1100, |
| "token_acc": 0.9104522765088489, |
| "train_speed(iter/s)": 0.313052 |
| }, |
| { |
| "epoch": 0.7403356749253355, |
| "eval_loss": 0.34071242809295654, |
| "eval_runtime": 6.4053, |
| "eval_samples_per_second": 37.469, |
| "eval_steps_per_second": 37.469, |
| "eval_token_acc": 0.9052325581395348, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.743700837084087, |
| "grad_norm": 2.5625, |
| "learning_rate": 1.5305145894894547e-05, |
| "loss": 0.3178743600845337, |
| "memory(GiB)": 75.82, |
| "step": 1105, |
| "token_acc": 0.9053732762719924, |
| "train_speed(iter/s)": 0.30749 |
| }, |
| { |
| "epoch": 0.7470659992428386, |
| "grad_norm": 1.9453125, |
| "learning_rate": 1.4926255614683932e-05, |
| "loss": 0.28967604637145994, |
| "memory(GiB)": 75.82, |
| "step": 1110, |
| "token_acc": 0.9175757575757576, |
| "train_speed(iter/s)": 0.308051 |
| }, |
| { |
| "epoch": 0.75043116140159, |
| "grad_norm": 1.921875, |
| "learning_rate": 1.4551289659137496e-05, |
| "loss": 0.3481321096420288, |
| "memory(GiB)": 75.82, |
| "step": 1115, |
| "token_acc": 0.9100428367444074, |
| "train_speed(iter/s)": 0.308681 |
| }, |
| { |
| "epoch": 0.7537963235603415, |
| "grad_norm": 2.34375, |
| "learning_rate": 1.4180289982377137e-05, |
| "loss": 0.3199401617050171, |
| "memory(GiB)": 75.82, |
| "step": 1120, |
| "token_acc": 0.9106353591160221, |
| "train_speed(iter/s)": 0.309223 |
| }, |
| { |
| "epoch": 0.7571614857190931, |
| "grad_norm": 2.421875, |
| "learning_rate": 1.3813298094746491e-05, |
| "loss": 0.2976421356201172, |
| "memory(GiB)": 75.82, |
| "step": 1125, |
| "token_acc": 0.9175590435675517, |
| "train_speed(iter/s)": 0.309803 |
| }, |
| { |
| "epoch": 0.7605266478778446, |
| "grad_norm": 3.046875, |
| "learning_rate": 1.345035505816642e-05, |
| "loss": 0.2948709726333618, |
| "memory(GiB)": 75.82, |
| "step": 1130, |
| "token_acc": 0.9145778364116095, |
| "train_speed(iter/s)": 0.310376 |
| }, |
| { |
| "epoch": 0.7638918100365961, |
| "grad_norm": 2.21875, |
| "learning_rate": 1.3091501481540674e-05, |
| "loss": 0.3075523853302002, |
| "memory(GiB)": 75.82, |
| "step": 1135, |
| "token_acc": 0.9111648285239462, |
| "train_speed(iter/s)": 0.310961 |
| }, |
| { |
| "epoch": 0.7672569721953476, |
| "grad_norm": 2.390625, |
| "learning_rate": 1.2736777516212266e-05, |
| "loss": 0.2970130205154419, |
| "memory(GiB)": 75.82, |
| "step": 1140, |
| "token_acc": 0.914568783498457, |
| "train_speed(iter/s)": 0.311547 |
| }, |
| { |
| "epoch": 0.7706221343540992, |
| "grad_norm": 2.171875, |
| "learning_rate": 1.238622285147103e-05, |
| "loss": 0.29731974601745603, |
| "memory(GiB)": 75.82, |
| "step": 1145, |
| "token_acc": 0.9114565731666103, |
| "train_speed(iter/s)": 0.312152 |
| }, |
| { |
| "epoch": 0.7739872965128507, |
| "grad_norm": 2.8125, |
| "learning_rate": 1.2039876710112847e-05, |
| "loss": 0.3596015930175781, |
| "memory(GiB)": 75.82, |
| "step": 1150, |
| "token_acc": 0.9056197074672825, |
| "train_speed(iter/s)": 0.312745 |
| }, |
| { |
| "epoch": 0.7739872965128507, |
| "eval_loss": 0.33650800585746765, |
| "eval_runtime": 6.306, |
| "eval_samples_per_second": 38.059, |
| "eval_steps_per_second": 38.059, |
| "eval_token_acc": 0.9056025369978858, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.7773524586716022, |
| "grad_norm": 1.8203125, |
| "learning_rate": 1.1697777844051105e-05, |
| "loss": 0.280198860168457, |
| "memory(GiB)": 75.82, |
| "step": 1155, |
| "token_acc": 0.9082849646179534, |
| "train_speed(iter/s)": 0.306862 |
| }, |
| { |
| "epoch": 0.7807176208303538, |
| "grad_norm": 3.359375, |
| "learning_rate": 1.1359964529980849e-05, |
| "loss": 0.31822154521942136, |
| "memory(GiB)": 75.82, |
| "step": 1160, |
| "token_acc": 0.9165397502284496, |
| "train_speed(iter/s)": 0.307449 |
| }, |
| { |
| "epoch": 0.7840827829891053, |
| "grad_norm": 2.046875, |
| "learning_rate": 1.1026474565096068e-05, |
| "loss": 0.3568312883377075, |
| "memory(GiB)": 75.82, |
| "step": 1165, |
| "token_acc": 0.9036163277976098, |
| "train_speed(iter/s)": 0.30803 |
| }, |
| { |
| "epoch": 0.7874479451478568, |
| "grad_norm": 2.984375, |
| "learning_rate": 1.0697345262860636e-05, |
| "loss": 0.3734995603561401, |
| "memory(GiB)": 75.82, |
| "step": 1170, |
| "token_acc": 0.9021773935318604, |
| "train_speed(iter/s)": 0.308568 |
| }, |
| { |
| "epoch": 0.7908131073066084, |
| "grad_norm": 2.5, |
| "learning_rate": 1.037261344883343e-05, |
| "loss": 0.3525618314743042, |
| "memory(GiB)": 75.82, |
| "step": 1175, |
| "token_acc": 0.9067342073897497, |
| "train_speed(iter/s)": 0.309122 |
| }, |
| { |
| "epoch": 0.7941782694653599, |
| "grad_norm": 1.5859375, |
| "learning_rate": 1.0052315456547934e-05, |
| "loss": 0.2623563289642334, |
| "memory(GiB)": 75.82, |
| "step": 1180, |
| "token_acc": 0.9251740139211136, |
| "train_speed(iter/s)": 0.309652 |
| }, |
| { |
| "epoch": 0.7975434316241113, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.73648712344707e-06, |
| "loss": 0.27451634407043457, |
| "memory(GiB)": 75.82, |
| "step": 1185, |
| "token_acc": 0.9259454705364996, |
| "train_speed(iter/s)": 0.310245 |
| }, |
| { |
| "epoch": 0.8009085937828629, |
| "grad_norm": 2.1875, |
| "learning_rate": 9.425163786873292e-06, |
| "loss": 0.3021913290023804, |
| "memory(GiB)": 75.82, |
| "step": 1190, |
| "token_acc": 0.9174669867947179, |
| "train_speed(iter/s)": 0.310785 |
| }, |
| { |
| "epoch": 0.8042737559416144, |
| "grad_norm": 2.578125, |
| "learning_rate": 9.118380280114857e-06, |
| "loss": 0.3496464014053345, |
| "memory(GiB)": 75.82, |
| "step": 1195, |
| "token_acc": 0.9070032573289902, |
| "train_speed(iter/s)": 0.311393 |
| }, |
| { |
| "epoch": 0.8076389181003659, |
| "grad_norm": 1.9765625, |
| "learning_rate": 8.816170928508365e-06, |
| "loss": 0.25614950656890867, |
| "memory(GiB)": 75.82, |
| "step": 1200, |
| "token_acc": 0.9283973187081048, |
| "train_speed(iter/s)": 0.311948 |
| }, |
| { |
| "epoch": 0.8076389181003659, |
| "eval_loss": 0.3316649794578552, |
| "eval_runtime": 6.3957, |
| "eval_samples_per_second": 37.525, |
| "eval_steps_per_second": 37.525, |
| "eval_token_acc": 0.9070824524312896, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.8110040802591175, |
| "grad_norm": 2.1875, |
| "learning_rate": 8.5185695455982e-06, |
| "loss": 0.3539767026901245, |
| "memory(GiB)": 75.82, |
| "step": 1205, |
| "token_acc": 0.9054973715310323, |
| "train_speed(iter/s)": 0.306783 |
| }, |
| { |
| "epoch": 0.814369242417869, |
| "grad_norm": 1.5859375, |
| "learning_rate": 8.225609429353187e-06, |
| "loss": 0.24862987995147706, |
| "memory(GiB)": 76.54, |
| "step": 1210, |
| "token_acc": 0.9274929223026109, |
| "train_speed(iter/s)": 0.307289 |
| }, |
| { |
| "epoch": 0.8177344045766205, |
| "grad_norm": 2.875, |
| "learning_rate": 7.937323358440935e-06, |
| "loss": 0.30047638416290284, |
| "memory(GiB)": 76.54, |
| "step": 1215, |
| "token_acc": 0.9201830198271479, |
| "train_speed(iter/s)": 0.307808 |
| }, |
| { |
| "epoch": 0.8210995667353721, |
| "grad_norm": 2.21875, |
| "learning_rate": 7.653743588560386e-06, |
| "loss": 0.34635608196258544, |
| "memory(GiB)": 77.35, |
| "step": 1220, |
| "token_acc": 0.90292348580221, |
| "train_speed(iter/s)": 0.308334 |
| }, |
| { |
| "epoch": 0.8244647288941236, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.374901848832683e-06, |
| "loss": 0.2774034976959229, |
| "memory(GiB)": 77.35, |
| "step": 1225, |
| "token_acc": 0.9153208206023571, |
| "train_speed(iter/s)": 0.30888 |
| }, |
| { |
| "epoch": 0.8278298910528751, |
| "grad_norm": 2.765625, |
| "learning_rate": 7.100829338251147e-06, |
| "loss": 0.3617110729217529, |
| "memory(GiB)": 77.35, |
| "step": 1230, |
| "token_acc": 0.901881936625382, |
| "train_speed(iter/s)": 0.309428 |
| }, |
| { |
| "epoch": 0.8311950532116267, |
| "grad_norm": 2.515625, |
| "learning_rate": 6.831556722190452e-06, |
| "loss": 0.3457359790802002, |
| "memory(GiB)": 77.35, |
| "step": 1235, |
| "token_acc": 0.9088393543428133, |
| "train_speed(iter/s)": 0.309927 |
| }, |
| { |
| "epoch": 0.8345602153703782, |
| "grad_norm": 2.140625, |
| "learning_rate": 6.567114128975571e-06, |
| "loss": 0.2862051486968994, |
| "memory(GiB)": 77.35, |
| "step": 1240, |
| "token_acc": 0.9152869313615174, |
| "train_speed(iter/s)": 0.310429 |
| }, |
| { |
| "epoch": 0.8379253775291297, |
| "grad_norm": 2.09375, |
| "learning_rate": 6.3075311465107535e-06, |
| "loss": 0.32160158157348634, |
| "memory(GiB)": 77.35, |
| "step": 1245, |
| "token_acc": 0.9136561235197121, |
| "train_speed(iter/s)": 0.310932 |
| }, |
| { |
| "epoch": 0.8412905396878813, |
| "grad_norm": 2.09375, |
| "learning_rate": 6.052836818969026e-06, |
| "loss": 0.3995193958282471, |
| "memory(GiB)": 78.27, |
| "step": 1250, |
| "token_acc": 0.8993502188038721, |
| "train_speed(iter/s)": 0.311402 |
| }, |
| { |
| "epoch": 0.8412905396878813, |
| "eval_loss": 0.33036008477211, |
| "eval_runtime": 6.3924, |
| "eval_samples_per_second": 37.544, |
| "eval_steps_per_second": 37.544, |
| "eval_token_acc": 0.9071881606765327, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.8446557018466327, |
| "grad_norm": 1.8046875, |
| "learning_rate": 5.803059643542491e-06, |
| "loss": 0.3073962926864624, |
| "memory(GiB)": 78.27, |
| "step": 1255, |
| "token_acc": 0.9106831510540497, |
| "train_speed(iter/s)": 0.30711 |
| }, |
| { |
| "epoch": 0.8480208640053842, |
| "grad_norm": 1.9140625, |
| "learning_rate": 5.558227567253832e-06, |
| "loss": 0.2895121812820435, |
| "memory(GiB)": 78.27, |
| "step": 1260, |
| "token_acc": 0.9245490196078432, |
| "train_speed(iter/s)": 0.307631 |
| }, |
| { |
| "epoch": 0.8513860261641358, |
| "grad_norm": 2.484375, |
| "learning_rate": 5.318367983829392e-06, |
| "loss": 0.354207706451416, |
| "memory(GiB)": 78.27, |
| "step": 1265, |
| "token_acc": 0.9037365421152628, |
| "train_speed(iter/s)": 0.308129 |
| }, |
| { |
| "epoch": 0.8547511883228873, |
| "grad_norm": 2.734375, |
| "learning_rate": 5.083507730634152e-06, |
| "loss": 0.3204244375228882, |
| "memory(GiB)": 78.27, |
| "step": 1270, |
| "token_acc": 0.9147783251231527, |
| "train_speed(iter/s)": 0.308617 |
| }, |
| { |
| "epoch": 0.8581163504816388, |
| "grad_norm": 2.328125, |
| "learning_rate": 4.853673085668947e-06, |
| "loss": 0.29280381202697753, |
| "memory(GiB)": 78.27, |
| "step": 1275, |
| "token_acc": 0.9152905198776758, |
| "train_speed(iter/s)": 0.309122 |
| }, |
| { |
| "epoch": 0.8614815126403904, |
| "grad_norm": 1.7578125, |
| "learning_rate": 4.6288897646302785e-06, |
| "loss": 0.29522113800048827, |
| "memory(GiB)": 78.27, |
| "step": 1280, |
| "token_acc": 0.9161471321695761, |
| "train_speed(iter/s)": 0.309632 |
| }, |
| { |
| "epoch": 0.8648466747991419, |
| "grad_norm": 2.265625, |
| "learning_rate": 4.4091829180330505e-06, |
| "loss": 0.36226553916931153, |
| "memory(GiB)": 78.27, |
| "step": 1285, |
| "token_acc": 0.9011563440519563, |
| "train_speed(iter/s)": 0.310141 |
| }, |
| { |
| "epoch": 0.8682118369578934, |
| "grad_norm": 2.5, |
| "learning_rate": 4.19457712839652e-06, |
| "loss": 0.31820580959320066, |
| "memory(GiB)": 78.27, |
| "step": 1290, |
| "token_acc": 0.9157977883096367, |
| "train_speed(iter/s)": 0.310634 |
| }, |
| { |
| "epoch": 0.8715769991166449, |
| "grad_norm": 1.875, |
| "learning_rate": 3.9850964074938375e-06, |
| "loss": 0.2922437906265259, |
| "memory(GiB)": 78.27, |
| "step": 1295, |
| "token_acc": 0.9246329526916802, |
| "train_speed(iter/s)": 0.311131 |
| }, |
| { |
| "epoch": 0.8749421612753965, |
| "grad_norm": 2.421875, |
| "learning_rate": 3.780764193665398e-06, |
| "loss": 0.2996021509170532, |
| "memory(GiB)": 78.27, |
| "step": 1300, |
| "token_acc": 0.912474373127267, |
| "train_speed(iter/s)": 0.311627 |
| }, |
| { |
| "epoch": 0.8749421612753965, |
| "eval_loss": 0.3295805752277374, |
| "eval_runtime": 6.303, |
| "eval_samples_per_second": 38.077, |
| "eval_steps_per_second": 38.077, |
| "eval_token_acc": 0.9073467230443975, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.878307323434148, |
| "grad_norm": 2.25, |
| "learning_rate": 3.581603349196372e-06, |
| "loss": 0.31300716400146483, |
| "memory(GiB)": 78.27, |
| "step": 1305, |
| "token_acc": 0.9083045669166369, |
| "train_speed(iter/s)": 0.307342 |
| }, |
| { |
| "epoch": 0.8816724855928995, |
| "grad_norm": 1.8671875, |
| "learning_rate": 3.3876361577587113e-06, |
| "loss": 0.2798715114593506, |
| "memory(GiB)": 78.27, |
| "step": 1310, |
| "token_acc": 0.9237044145873321, |
| "train_speed(iter/s)": 0.307815 |
| }, |
| { |
| "epoch": 0.8850376477516511, |
| "grad_norm": 2.921875, |
| "learning_rate": 3.1988843219178777e-06, |
| "loss": 0.3821584701538086, |
| "memory(GiB)": 78.27, |
| "step": 1315, |
| "token_acc": 0.9002973861324151, |
| "train_speed(iter/s)": 0.3083 |
| }, |
| { |
| "epoch": 0.8884028099104025, |
| "grad_norm": 2.296875, |
| "learning_rate": 3.0153689607045845e-06, |
| "loss": 0.28262839317321775, |
| "memory(GiB)": 78.27, |
| "step": 1320, |
| "token_acc": 0.921765601217656, |
| "train_speed(iter/s)": 0.308778 |
| }, |
| { |
| "epoch": 0.891767972069154, |
| "grad_norm": 2.421875, |
| "learning_rate": 2.8371106072518195e-06, |
| "loss": 0.3282342433929443, |
| "memory(GiB)": 78.27, |
| "step": 1325, |
| "token_acc": 0.9107642467972317, |
| "train_speed(iter/s)": 0.309285 |
| }, |
| { |
| "epoch": 0.8951331342279056, |
| "grad_norm": 2.390625, |
| "learning_rate": 2.664129206497479e-06, |
| "loss": 0.3914219617843628, |
| "memory(GiB)": 78.27, |
| "step": 1330, |
| "token_acc": 0.8952534427190155, |
| "train_speed(iter/s)": 0.309779 |
| }, |
| { |
| "epoch": 0.8984982963866571, |
| "grad_norm": 1.71875, |
| "learning_rate": 2.496444112952734e-06, |
| "loss": 0.315179443359375, |
| "memory(GiB)": 78.27, |
| "step": 1335, |
| "token_acc": 0.9142040038131554, |
| "train_speed(iter/s)": 0.310272 |
| }, |
| { |
| "epoch": 0.9018634585454086, |
| "grad_norm": 2.265625, |
| "learning_rate": 2.334074088536492e-06, |
| "loss": 0.36266069412231444, |
| "memory(GiB)": 78.27, |
| "step": 1340, |
| "token_acc": 0.9021089077746302, |
| "train_speed(iter/s)": 0.310752 |
| }, |
| { |
| "epoch": 0.9052286207041602, |
| "grad_norm": 2.5625, |
| "learning_rate": 2.1770373004762035e-06, |
| "loss": 0.4528657913208008, |
| "memory(GiB)": 78.27, |
| "step": 1345, |
| "token_acc": 0.8829328404189772, |
| "train_speed(iter/s)": 0.31123 |
| }, |
| { |
| "epoch": 0.9085937828629117, |
| "grad_norm": 2.390625, |
| "learning_rate": 2.0253513192751373e-06, |
| "loss": 0.3175913095474243, |
| "memory(GiB)": 78.27, |
| "step": 1350, |
| "token_acc": 0.9110584518167456, |
| "train_speed(iter/s)": 0.311745 |
| }, |
| { |
| "epoch": 0.9085937828629117, |
| "eval_loss": 0.32911160588264465, |
| "eval_runtime": 6.3063, |
| "eval_samples_per_second": 38.057, |
| "eval_steps_per_second": 38.057, |
| "eval_token_acc": 0.9075052854122622, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.9119589450216632, |
| "grad_norm": 2.875, |
| "learning_rate": 1.879033116746476e-06, |
| "loss": 0.33309078216552734, |
| "memory(GiB)": 78.27, |
| "step": 1355, |
| "token_acc": 0.9085878012402857, |
| "train_speed(iter/s)": 0.30713 |
| }, |
| { |
| "epoch": 0.9153241071804148, |
| "grad_norm": 1.8046875, |
| "learning_rate": 1.738099064114368e-06, |
| "loss": 0.27440056800842283, |
| "memory(GiB)": 78.27, |
| "step": 1360, |
| "token_acc": 0.9196319018404908, |
| "train_speed(iter/s)": 0.307612 |
| }, |
| { |
| "epoch": 0.9186892693391663, |
| "grad_norm": 2.1875, |
| "learning_rate": 1.6025649301821876e-06, |
| "loss": 0.3164578199386597, |
| "memory(GiB)": 78.27, |
| "step": 1365, |
| "token_acc": 0.912046908315565, |
| "train_speed(iter/s)": 0.308062 |
| }, |
| { |
| "epoch": 0.9220544314979178, |
| "grad_norm": 2.359375, |
| "learning_rate": 1.4724458795681962e-06, |
| "loss": 0.35011940002441405, |
| "memory(GiB)": 78.27, |
| "step": 1370, |
| "token_acc": 0.9021322288694585, |
| "train_speed(iter/s)": 0.30853 |
| }, |
| { |
| "epoch": 0.9254195936566694, |
| "grad_norm": 1.78125, |
| "learning_rate": 1.3477564710088098e-06, |
| "loss": 0.26722261905670164, |
| "memory(GiB)": 78.27, |
| "step": 1375, |
| "token_acc": 0.929877564000636, |
| "train_speed(iter/s)": 0.309022 |
| }, |
| { |
| "epoch": 0.9287847558154209, |
| "grad_norm": 2.09375, |
| "learning_rate": 1.2285106557296477e-06, |
| "loss": 0.2892286777496338, |
| "memory(GiB)": 78.27, |
| "step": 1380, |
| "token_acc": 0.9129415442325727, |
| "train_speed(iter/s)": 0.309492 |
| }, |
| { |
| "epoch": 0.9321499179741723, |
| "grad_norm": 6.71875, |
| "learning_rate": 1.1147217758845751e-06, |
| "loss": 0.32126703262329104, |
| "memory(GiB)": 78.27, |
| "step": 1385, |
| "token_acc": 0.9072528883183568, |
| "train_speed(iter/s)": 0.309961 |
| }, |
| { |
| "epoch": 0.935515080132924, |
| "grad_norm": 2.46875, |
| "learning_rate": 1.0064025630628582e-06, |
| "loss": 0.29770004749298096, |
| "memory(GiB)": 78.27, |
| "step": 1390, |
| "token_acc": 0.9165990588998865, |
| "train_speed(iter/s)": 0.310404 |
| }, |
| { |
| "epoch": 0.9388802422916754, |
| "grad_norm": 2.0, |
| "learning_rate": 9.035651368646648e-07, |
| "loss": 0.32735161781311034, |
| "memory(GiB)": 78.27, |
| "step": 1395, |
| "token_acc": 0.910242711036483, |
| "train_speed(iter/s)": 0.310858 |
| }, |
| { |
| "epoch": 0.9422454044504269, |
| "grad_norm": 1.7578125, |
| "learning_rate": 8.062210035450379e-07, |
| "loss": 0.3620461463928223, |
| "memory(GiB)": 78.27, |
| "step": 1400, |
| "token_acc": 0.9046025104602511, |
| "train_speed(iter/s)": 0.311345 |
| }, |
| { |
| "epoch": 0.9422454044504269, |
| "eval_loss": 0.3291800916194916, |
| "eval_runtime": 6.3197, |
| "eval_samples_per_second": 37.976, |
| "eval_steps_per_second": 37.976, |
| "eval_token_acc": 0.90776955602537, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.9456105666091785, |
| "grad_norm": 1.7734375, |
| "learning_rate": 7.143810547264762e-07, |
| "loss": 0.3065107583999634, |
| "memory(GiB)": 78.27, |
| "step": 1405, |
| "token_acc": 0.9103534858174114, |
| "train_speed(iter/s)": 0.307325 |
| }, |
| { |
| "epoch": 0.94897572876793, |
| "grad_norm": 1.59375, |
| "learning_rate": 6.280555661802856e-07, |
| "loss": 0.260418701171875, |
| "memory(GiB)": 78.27, |
| "step": 1410, |
| "token_acc": 0.9281273692191054, |
| "train_speed(iter/s)": 0.307789 |
| }, |
| { |
| "epoch": 0.9523408909266815, |
| "grad_norm": 1.53125, |
| "learning_rate": 5.472541966768551e-07, |
| "loss": 0.23598823547363282, |
| "memory(GiB)": 78.27, |
| "step": 1415, |
| "token_acc": 0.9302064991195774, |
| "train_speed(iter/s)": 0.308273 |
| }, |
| { |
| "epoch": 0.9557060530854331, |
| "grad_norm": 1.8125, |
| "learning_rate": 4.7198598690496585e-07, |
| "loss": 0.2811413288116455, |
| "memory(GiB)": 78.27, |
| "step": 1420, |
| "token_acc": 0.9181454836131095, |
| "train_speed(iter/s)": 0.308702 |
| }, |
| { |
| "epoch": 0.9590712152441846, |
| "grad_norm": 2.28125, |
| "learning_rate": 4.02259358460233e-07, |
| "loss": 0.23685033321380616, |
| "memory(GiB)": 78.27, |
| "step": 1425, |
| "token_acc": 0.9318723201524536, |
| "train_speed(iter/s)": 0.30913 |
| }, |
| { |
| "epoch": 0.9624363774029361, |
| "grad_norm": 2.6875, |
| "learning_rate": 3.380821129028489e-07, |
| "loss": 0.28534321784973143, |
| "memory(GiB)": 78.27, |
| "step": 1430, |
| "token_acc": 0.9202698558724318, |
| "train_speed(iter/s)": 0.309604 |
| }, |
| { |
| "epoch": 0.9658015395616877, |
| "grad_norm": 1.765625, |
| "learning_rate": 2.794614308846644e-07, |
| "loss": 0.30334537029266356, |
| "memory(GiB)": 78.27, |
| "step": 1435, |
| "token_acc": 0.9193307439498059, |
| "train_speed(iter/s)": 0.310037 |
| }, |
| { |
| "epoch": 0.9691667017204392, |
| "grad_norm": 2.796875, |
| "learning_rate": 2.2640387134577058e-07, |
| "loss": 0.30445032119750975, |
| "memory(GiB)": 78.27, |
| "step": 1440, |
| "token_acc": 0.9128602730490477, |
| "train_speed(iter/s)": 0.310511 |
| }, |
| { |
| "epoch": 0.9725318638791907, |
| "grad_norm": 2.265625, |
| "learning_rate": 1.789153707806357e-07, |
| "loss": 0.3070082187652588, |
| "memory(GiB)": 78.27, |
| "step": 1445, |
| "token_acc": 0.9169588779088301, |
| "train_speed(iter/s)": 0.310939 |
| }, |
| { |
| "epoch": 0.9758970260379422, |
| "grad_norm": 2.1875, |
| "learning_rate": 1.3700124257388092e-07, |
| "loss": 0.2871255874633789, |
| "memory(GiB)": 78.27, |
| "step": 1450, |
| "token_acc": 0.9249602543720191, |
| "train_speed(iter/s)": 0.311389 |
| }, |
| { |
| "epoch": 0.9758970260379422, |
| "eval_loss": 0.32921192049980164, |
| "eval_runtime": 6.3471, |
| "eval_samples_per_second": 37.813, |
| "eval_steps_per_second": 37.813, |
| "eval_token_acc": 0.9072938689217759, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.9792621881966938, |
| "grad_norm": 2.1875, |
| "learning_rate": 1.0066617640578368e-07, |
| "loss": 0.32105896472930906, |
| "memory(GiB)": 78.27, |
| "step": 1455, |
| "token_acc": 0.9072366364488903, |
| "train_speed(iter/s)": 0.307431 |
| }, |
| { |
| "epoch": 0.9826273503554452, |
| "grad_norm": 2.625, |
| "learning_rate": 6.991423772753636e-08, |
| "loss": 0.3220836639404297, |
| "memory(GiB)": 78.27, |
| "step": 1460, |
| "token_acc": 0.9141094834232845, |
| "train_speed(iter/s)": 0.307867 |
| }, |
| { |
| "epoch": 0.9859925125141967, |
| "grad_norm": 2.609375, |
| "learning_rate": 4.474886730641004e-08, |
| "loss": 0.330595874786377, |
| "memory(GiB)": 78.27, |
| "step": 1465, |
| "token_acc": 0.9087763447625039, |
| "train_speed(iter/s)": 0.308304 |
| }, |
| { |
| "epoch": 0.9893576746729483, |
| "grad_norm": 1.8515625, |
| "learning_rate": 2.5172880840745873e-08, |
| "loss": 0.3304997444152832, |
| "memory(GiB)": 78.27, |
| "step": 1470, |
| "token_acc": 0.9131480090157776, |
| "train_speed(iter/s)": 0.308715 |
| }, |
| { |
| "epoch": 0.9927228368316998, |
| "grad_norm": 2.5625, |
| "learning_rate": 1.1188468644907079e-08, |
| "loss": 0.29615800380706786, |
| "memory(GiB)": 78.27, |
| "step": 1475, |
| "token_acc": 0.919311727363849, |
| "train_speed(iter/s)": 0.309186 |
| }, |
| { |
| "epoch": 0.9960879989904513, |
| "grad_norm": 2.515625, |
| "learning_rate": 2.797195404247166e-09, |
| "loss": 0.37999179363250735, |
| "memory(GiB)": 78.27, |
| "step": 1480, |
| "token_acc": 0.8994573890839451, |
| "train_speed(iter/s)": 0.309628 |
| }, |
| { |
| "epoch": 0.9994531611492029, |
| "grad_norm": 2.453125, |
| "learning_rate": 0.0, |
| "loss": 0.29352550506591796, |
| "memory(GiB)": 78.27, |
| "step": 1485, |
| "token_acc": 0.9119153858866303, |
| "train_speed(iter/s)": 0.310084 |
| }, |
| { |
| "epoch": 0.9994531611492029, |
| "eval_loss": 0.3291037976741791, |
| "eval_runtime": 6.3842, |
| "eval_samples_per_second": 37.593, |
| "eval_steps_per_second": 37.593, |
| "eval_token_acc": 0.9075581395348837, |
| "step": 1485 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 1485, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 6.771720449253366e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|