{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.1520737327188941, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01152073732718894, "grad_norm": 1.6566706895828247, "learning_rate": 1.8e-05, "loss": 0.9518, "step": 10 }, { "epoch": 0.02304147465437788, "grad_norm": 0.8684583902359009, "learning_rate": 3.8e-05, "loss": 0.1942, "step": 20 }, { "epoch": 0.03456221198156682, "grad_norm": 0.6393296718597412, "learning_rate": 5.8e-05, "loss": 0.125, "step": 30 }, { "epoch": 0.04608294930875576, "grad_norm": 0.35887211561203003, "learning_rate": 7.800000000000001e-05, "loss": 0.0976, "step": 40 }, { "epoch": 0.0576036866359447, "grad_norm": 0.7574331164360046, "learning_rate": 9.8e-05, "loss": 0.0844, "step": 50 }, { "epoch": 0.06912442396313365, "grad_norm": 0.7963987588882446, "learning_rate": 9.997785653888835e-05, "loss": 0.0697, "step": 60 }, { "epoch": 0.08064516129032258, "grad_norm": 0.7024580240249634, "learning_rate": 9.990133642141359e-05, "loss": 0.0661, "step": 70 }, { "epoch": 0.09216589861751152, "grad_norm": 0.6337135434150696, "learning_rate": 9.977024992520602e-05, "loss": 0.0577, "step": 80 }, { "epoch": 0.10368663594470046, "grad_norm": 0.3510747253894806, "learning_rate": 9.95847403914247e-05, "loss": 0.0537, "step": 90 }, { "epoch": 0.1152073732718894, "grad_norm": 0.4563221037387848, "learning_rate": 9.934501067202117e-05, "loss": 0.052, "step": 100 }, { "epoch": 0.12672811059907835, "grad_norm": 0.3978365957736969, "learning_rate": 9.905132290792394e-05, "loss": 0.0445, "step": 110 }, { "epoch": 0.1382488479262673, "grad_norm": 0.40588557720184326, "learning_rate": 9.870399824239117e-05, "loss": 0.0427, "step": 120 }, { "epoch": 0.1497695852534562, "grad_norm": 0.5251225233078003, "learning_rate": 9.830341646984521e-05, "loss": 0.0378, "step": 130 }, { "epoch": 0.16129032258064516, "grad_norm": 0.38299882411956787, "learning_rate": 9.785001562057309e-05, "loss": 0.0404, "step": 140 }, { "epoch": 0.1728110599078341, "grad_norm": 0.2926655411720276, "learning_rate": 9.734429148174675e-05, "loss": 0.0395, "step": 150 }, { "epoch": 0.18433179723502305, "grad_norm": 0.4306221306324005, "learning_rate": 9.6786797055287e-05, "loss": 0.0361, "step": 160 }, { "epoch": 0.195852534562212, "grad_norm": 0.3377871811389923, "learning_rate": 9.617814195316411e-05, "loss": 0.035, "step": 170 }, { "epoch": 0.2073732718894009, "grad_norm": 0.40714597702026367, "learning_rate": 9.551899173079607e-05, "loss": 0.0332, "step": 180 }, { "epoch": 0.21889400921658986, "grad_norm": 0.40230241417884827, "learning_rate": 9.481006715927351e-05, "loss": 0.0321, "step": 190 }, { "epoch": 0.2304147465437788, "grad_norm": 0.41335731744766235, "learning_rate": 9.405214343720707e-05, "loss": 0.0319, "step": 200 }, { "epoch": 0.24193548387096775, "grad_norm": 0.20366613566875458, "learning_rate": 9.32460493430591e-05, "loss": 0.0276, "step": 210 }, { "epoch": 0.2534562211981567, "grad_norm": 0.24148087203502655, "learning_rate": 9.239266632888659e-05, "loss": 0.0331, "step": 220 }, { "epoch": 0.26497695852534564, "grad_norm": 0.31080925464630127, "learning_rate": 9.14929275564863e-05, "loss": 0.0305, "step": 230 }, { "epoch": 0.2764976958525346, "grad_norm": 0.2609555125236511, "learning_rate": 9.0547816876996e-05, "loss": 0.0277, "step": 240 }, { "epoch": 0.2880184331797235, "grad_norm": 0.35045984387397766, "learning_rate": 8.955836775506776e-05, "loss": 0.0254, "step": 250 }, { "epoch": 0.2995391705069124, "grad_norm": 0.2834389805793762, "learning_rate": 8.852566213878947e-05, "loss": 0.0272, "step": 260 }, { "epoch": 0.31105990783410137, "grad_norm": 0.48804351687431335, "learning_rate": 8.745082927659047e-05, "loss": 0.0273, "step": 270 }, { "epoch": 0.3225806451612903, "grad_norm": 0.36219653487205505, "learning_rate": 8.633504448242505e-05, "loss": 0.0274, "step": 280 }, { "epoch": 0.33410138248847926, "grad_norm": 0.28334954380989075, "learning_rate": 8.517952785058385e-05, "loss": 0.0244, "step": 290 }, { "epoch": 0.3456221198156682, "grad_norm": 0.37658995389938354, "learning_rate": 8.398554292153866e-05, "loss": 0.0234, "step": 300 }, { "epoch": 0.35714285714285715, "grad_norm": 0.4306541979312897, "learning_rate": 8.275439530027948e-05, "loss": 0.0255, "step": 310 }, { "epoch": 0.3686635944700461, "grad_norm": 0.3718424141407013, "learning_rate": 8.148743122865463e-05, "loss": 0.026, "step": 320 }, { "epoch": 0.38018433179723504, "grad_norm": 0.40250667929649353, "learning_rate": 8.018603611327504e-05, "loss": 0.0234, "step": 330 }, { "epoch": 0.391705069124424, "grad_norm": 0.3549322187900543, "learning_rate": 7.88516330105925e-05, "loss": 0.0246, "step": 340 }, { "epoch": 0.4032258064516129, "grad_norm": 0.3418162763118744, "learning_rate": 7.748568107080832e-05, "loss": 0.0214, "step": 350 }, { "epoch": 0.4147465437788018, "grad_norm": 0.4048205614089966, "learning_rate": 7.608967394231387e-05, "loss": 0.021, "step": 360 }, { "epoch": 0.42626728110599077, "grad_norm": 0.2962040603160858, "learning_rate": 7.466513813840825e-05, "loss": 0.0229, "step": 370 }, { "epoch": 0.4377880184331797, "grad_norm": 0.26386022567749023, "learning_rate": 7.32136313680782e-05, "loss": 0.021, "step": 380 }, { "epoch": 0.44930875576036866, "grad_norm": 0.18067125976085663, "learning_rate": 7.173674083266624e-05, "loss": 0.0207, "step": 390 }, { "epoch": 0.4608294930875576, "grad_norm": 0.24752575159072876, "learning_rate": 7.023608149028937e-05, "loss": 0.023, "step": 400 }, { "epoch": 0.47235023041474655, "grad_norm": 0.24577368795871735, "learning_rate": 6.871329428990602e-05, "loss": 0.0199, "step": 410 }, { "epoch": 0.4838709677419355, "grad_norm": 0.20847314596176147, "learning_rate": 6.71700443769625e-05, "loss": 0.0214, "step": 420 }, { "epoch": 0.49539170506912444, "grad_norm": 0.19042600691318512, "learning_rate": 6.56080192725808e-05, "loss": 0.019, "step": 430 }, { "epoch": 0.5069124423963134, "grad_norm": 0.3057345449924469, "learning_rate": 6.402892702827916e-05, "loss": 0.0204, "step": 440 }, { "epoch": 0.5184331797235023, "grad_norm": 0.27462145686149597, "learning_rate": 6.243449435824276e-05, "loss": 0.0198, "step": 450 }, { "epoch": 0.5299539170506913, "grad_norm": 0.2001851201057434, "learning_rate": 6.0826464751186994e-05, "loss": 0.0189, "step": 460 }, { "epoch": 0.5414746543778802, "grad_norm": 0.2672295570373535, "learning_rate": 5.9206596563878357e-05, "loss": 0.0178, "step": 470 }, { "epoch": 0.5529953917050692, "grad_norm": 0.3161514699459076, "learning_rate": 5.757666109839702e-05, "loss": 0.0213, "step": 480 }, { "epoch": 0.5645161290322581, "grad_norm": 0.22500912845134735, "learning_rate": 5.5938440665244006e-05, "loss": 0.0191, "step": 490 }, { "epoch": 0.576036866359447, "grad_norm": 0.32964590191841125, "learning_rate": 5.4293726634410855e-05, "loss": 0.0194, "step": 500 }, { "epoch": 0.5875576036866359, "grad_norm": 0.2184433490037918, "learning_rate": 5.264431747654284e-05, "loss": 0.0181, "step": 510 }, { "epoch": 0.5990783410138248, "grad_norm": 0.22752192616462708, "learning_rate": 5.0992016796337686e-05, "loss": 0.0153, "step": 520 }, { "epoch": 0.6105990783410138, "grad_norm": 0.17904232442378998, "learning_rate": 4.93386313603304e-05, "loss": 0.017, "step": 530 }, { "epoch": 0.6221198156682027, "grad_norm": 0.30245211720466614, "learning_rate": 4.7685969121220456e-05, "loss": 0.017, "step": 540 }, { "epoch": 0.6336405529953917, "grad_norm": 0.23163466155529022, "learning_rate": 4.60358372409022e-05, "loss": 0.0181, "step": 550 }, { "epoch": 0.6451612903225806, "grad_norm": 0.22935254871845245, "learning_rate": 4.439004011435979e-05, "loss": 0.0181, "step": 560 }, { "epoch": 0.6566820276497696, "grad_norm": 0.220436692237854, "learning_rate": 4.275037739658771e-05, "loss": 0.017, "step": 570 }, { "epoch": 0.6682027649769585, "grad_norm": 0.26229238510131836, "learning_rate": 4.111864203469457e-05, "loss": 0.0178, "step": 580 }, { "epoch": 0.6797235023041475, "grad_norm": 0.19217033684253693, "learning_rate": 3.949661830734172e-05, "loss": 0.0152, "step": 590 }, { "epoch": 0.6912442396313364, "grad_norm": 0.20372073352336884, "learning_rate": 3.788607987366069e-05, "loss": 0.0159, "step": 600 }, { "epoch": 0.7027649769585254, "grad_norm": 0.1933118999004364, "learning_rate": 3.628878783378302e-05, "loss": 0.0157, "step": 610 }, { "epoch": 0.7142857142857143, "grad_norm": 0.19851423799991608, "learning_rate": 3.470648880310313e-05, "loss": 0.0145, "step": 620 }, { "epoch": 0.7258064516129032, "grad_norm": 0.16878198087215424, "learning_rate": 3.3140913002379995e-05, "loss": 0.0157, "step": 630 }, { "epoch": 0.7373271889400922, "grad_norm": 0.21986301243305206, "learning_rate": 3.1593772365766105e-05, "loss": 0.0162, "step": 640 }, { "epoch": 0.7488479262672811, "grad_norm": 0.13134035468101501, "learning_rate": 3.006675866883275e-05, "loss": 0.0157, "step": 650 }, { "epoch": 0.7603686635944701, "grad_norm": 0.1785222440958023, "learning_rate": 2.8561541678638142e-05, "loss": 0.0147, "step": 660 }, { "epoch": 0.771889400921659, "grad_norm": 0.16605904698371887, "learning_rate": 2.707976732786166e-05, "loss": 0.0133, "step": 670 }, { "epoch": 0.783410138248848, "grad_norm": 0.18675336241722107, "learning_rate": 2.562305591500069e-05, "loss": 0.0145, "step": 680 }, { "epoch": 0.7949308755760369, "grad_norm": 0.18020185828208923, "learning_rate": 2.419300033259798e-05, "loss": 0.0151, "step": 690 }, { "epoch": 0.8064516129032258, "grad_norm": 0.1856432557106018, "learning_rate": 2.279116432543705e-05, "loss": 0.0137, "step": 700 }, { "epoch": 0.8179723502304147, "grad_norm": 0.1251407414674759, "learning_rate": 2.1419080780610123e-05, "loss": 0.0126, "step": 710 }, { "epoch": 0.8294930875576036, "grad_norm": 0.20187409222126007, "learning_rate": 2.0078250051328784e-05, "loss": 0.0139, "step": 720 }, { "epoch": 0.8410138248847926, "grad_norm": 0.16873343288898468, "learning_rate": 1.877013831630961e-05, "loss": 0.0142, "step": 730 }, { "epoch": 0.8525345622119815, "grad_norm": 0.11382901668548584, "learning_rate": 1.749617597652934e-05, "loss": 0.0129, "step": 740 }, { "epoch": 0.8640552995391705, "grad_norm": 0.1710187941789627, "learning_rate": 1.62577560911024e-05, "loss": 0.0152, "step": 750 }, { "epoch": 0.8755760368663594, "grad_norm": 0.12709008157253265, "learning_rate": 1.5056232853991209e-05, "loss": 0.0139, "step": 760 }, { "epoch": 0.8870967741935484, "grad_norm": 0.15011294186115265, "learning_rate": 1.389292011321498e-05, "loss": 0.0136, "step": 770 }, { "epoch": 0.8986175115207373, "grad_norm": 0.17724603414535522, "learning_rate": 1.2769089934176126e-05, "loss": 0.0125, "step": 780 }, { "epoch": 0.9101382488479263, "grad_norm": 0.16647891700267792, "learning_rate": 1.1685971208675539e-05, "loss": 0.0129, "step": 790 }, { "epoch": 0.9216589861751152, "grad_norm": 0.1402997523546219, "learning_rate": 1.0644748311137376e-05, "loss": 0.0123, "step": 800 }, { "epoch": 0.9331797235023042, "grad_norm": 0.11741903424263, "learning_rate": 9.646559803512994e-06, "loss": 0.0119, "step": 810 }, { "epoch": 0.9447004608294931, "grad_norm": 0.14751701056957245, "learning_rate": 8.692497190280224e-06, "loss": 0.0133, "step": 820 }, { "epoch": 0.956221198156682, "grad_norm": 0.14610819518566132, "learning_rate": 7.783603724899257e-06, "loss": 0.0128, "step": 830 }, { "epoch": 0.967741935483871, "grad_norm": 0.1284884661436081, "learning_rate": 6.92087326903022e-06, "loss": 0.0116, "step": 840 }, { "epoch": 0.9792626728110599, "grad_norm": 0.14618724584579468, "learning_rate": 6.1052492057601275e-06, "loss": 0.0137, "step": 850 }, { "epoch": 0.9907834101382489, "grad_norm": 0.14276568591594696, "learning_rate": 5.337623408027293e-06, "loss": 0.0136, "step": 860 }, { "epoch": 1.0023041474654377, "grad_norm": 0.10031285136938095, "learning_rate": 4.618835263371396e-06, "loss": 0.0119, "step": 870 }, { "epoch": 1.0138248847926268, "grad_norm": 0.10763294994831085, "learning_rate": 3.949670756075447e-06, "loss": 0.0111, "step": 880 }, { "epoch": 1.0253456221198156, "grad_norm": 0.1076810285449028, "learning_rate": 3.3308616077036115e-06, "loss": 0.0109, "step": 890 }, { "epoch": 1.0368663594470047, "grad_norm": 0.13353325426578522, "learning_rate": 2.7630844769743757e-06, "loss": 0.011, "step": 900 }, { "epoch": 1.0483870967741935, "grad_norm": 0.10967040807008743, "learning_rate": 2.2469602198441573e-06, "loss": 0.0117, "step": 910 }, { "epoch": 1.0599078341013826, "grad_norm": 0.16759690642356873, "learning_rate": 1.7830532106104747e-06, "loss": 0.0118, "step": 920 }, { "epoch": 1.0714285714285714, "grad_norm": 0.15041400492191315, "learning_rate": 1.3718707247769135e-06, "loss": 0.0121, "step": 930 }, { "epoch": 1.0829493087557605, "grad_norm": 0.11416321247816086, "learning_rate": 1.0138623843548078e-06, "loss": 0.0127, "step": 940 }, { "epoch": 1.0944700460829493, "grad_norm": 0.12047427892684937, "learning_rate": 7.094196662081831e-07, "loss": 0.0117, "step": 950 }, { "epoch": 1.1059907834101383, "grad_norm": 0.13395977020263672, "learning_rate": 4.5887547397955864e-07, "loss": 0.0108, "step": 960 }, { "epoch": 1.1175115207373272, "grad_norm": 0.08437898010015488, "learning_rate": 2.625037740646763e-07, "loss": 0.0123, "step": 970 }, { "epoch": 1.129032258064516, "grad_norm": 0.07629328966140747, "learning_rate": 1.2051929603428825e-07, "loss": 0.0111, "step": 980 }, { "epoch": 1.140552995391705, "grad_norm": 0.10458842664957047, "learning_rate": 3.3077297830541584e-08, "loss": 0.0128, "step": 990 }, { "epoch": 1.1520737327188941, "grad_norm": 0.11208489537239075, "learning_rate": 2.7339599464326627e-10, "loss": 0.0123, "step": 1000 }, { "epoch": 1.1520737327188941, "step": 1000, "total_flos": 0.0, "train_loss": 0.03519474593549967, "train_runtime": 1870.4299, "train_samples_per_second": 53.464, "train_steps_per_second": 0.535 } ], "logging_steps": 10, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 100, "trial_name": null, "trial_params": null }