{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.89875173370319, "eval_steps": 500, "global_step": 900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11095700416088766, "grad_norm": 0.31662145256996155, "learning_rate": 7.407407407407407e-05, "loss": 0.5604, "step": 10 }, { "epoch": 0.22191400832177532, "grad_norm": 0.38665255904197693, "learning_rate": 0.00014814814814814815, "loss": 0.3448, "step": 20 }, { "epoch": 0.332871012482663, "grad_norm": 0.38282278180122375, "learning_rate": 0.00019999417253661235, "loss": 0.1345, "step": 30 }, { "epoch": 0.44382801664355065, "grad_norm": 0.33959391713142395, "learning_rate": 0.000199890592080658, "loss": 0.1206, "step": 40 }, { "epoch": 0.5547850208044383, "grad_norm": 0.2943621873855591, "learning_rate": 0.00019965766682369186, "loss": 0.1234, "step": 50 }, { "epoch": 0.665742024965326, "grad_norm": 0.25359126925468445, "learning_rate": 0.00019929569837240564, "loss": 0.1039, "step": 60 }, { "epoch": 0.7766990291262136, "grad_norm": 0.23930878937244415, "learning_rate": 0.0001988051554269675, "loss": 0.102, "step": 70 }, { "epoch": 0.8876560332871013, "grad_norm": 0.2013150006532669, "learning_rate": 0.00019818667317411865, "loss": 0.0974, "step": 80 }, { "epoch": 0.9986130374479889, "grad_norm": 0.25096118450164795, "learning_rate": 0.00019744105246469263, "loss": 0.099, "step": 90 }, { "epoch": 1.0998613037447988, "grad_norm": 0.25178226828575134, "learning_rate": 0.0001965692587766216, "loss": 0.0714, "step": 100 }, { "epoch": 1.2108183079056865, "grad_norm": 0.2704208195209503, "learning_rate": 0.00019557242096477327, "loss": 0.0771, "step": 110 }, { "epoch": 1.3217753120665743, "grad_norm": 0.22107760608196259, "learning_rate": 0.00019445182979923654, "loss": 0.0703, "step": 120 }, { "epoch": 1.4327323162274619, "grad_norm": 0.26953792572021484, "learning_rate": 0.00019320893629394873, "loss": 0.0753, "step": 130 }, { "epoch": 1.5436893203883495, "grad_norm": 0.2142401486635208, "learning_rate": 0.00019184534982782904, "loss": 0.0724, "step": 140 }, { "epoch": 1.6546463245492373, "grad_norm": 0.25699618458747864, "learning_rate": 0.00019036283606085053, "loss": 0.0648, "step": 150 }, { "epoch": 1.765603328710125, "grad_norm": 0.2224379926919937, "learning_rate": 0.00018876331464774945, "loss": 0.0706, "step": 160 }, { "epoch": 1.8765603328710125, "grad_norm": 0.23435620963573456, "learning_rate": 0.0001870488567523318, "loss": 0.0695, "step": 170 }, { "epoch": 1.9875173370319001, "grad_norm": 0.18676415085792542, "learning_rate": 0.00018522168236559695, "loss": 0.0615, "step": 180 }, { "epoch": 2.08876560332871, "grad_norm": 0.24162153899669647, "learning_rate": 0.00018328415743114912, "loss": 0.0445, "step": 190 }, { "epoch": 2.1997226074895977, "grad_norm": 0.3869277536869049, "learning_rate": 0.00018123879078162097, "loss": 0.0502, "step": 200 }, { "epoch": 2.3106796116504853, "grad_norm": 0.3037394881248474, "learning_rate": 0.00017908823089007457, "loss": 0.0482, "step": 210 }, { "epoch": 2.421636615811373, "grad_norm": 0.18976379930973053, "learning_rate": 0.00017683526244058716, "loss": 0.0528, "step": 220 }, { "epoch": 2.5325936199722605, "grad_norm": 0.30705705285072327, "learning_rate": 0.00017448280272246212, "loss": 0.0521, "step": 230 }, { "epoch": 2.6435506241331486, "grad_norm": 0.21610881388187408, "learning_rate": 0.000172033897852734, "loss": 0.0535, "step": 240 }, { "epoch": 2.754507628294036, "grad_norm": 0.18693220615386963, "learning_rate": 0.00016949171883185918, "loss": 0.0517, "step": 250 }, { "epoch": 2.8654646324549238, "grad_norm": 0.3321268558502197, "learning_rate": 0.0001668595574376992, "loss": 0.0407, "step": 260 }, { "epoch": 2.9764216366158114, "grad_norm": 0.20721495151519775, "learning_rate": 0.000164140821963114, "loss": 0.0417, "step": 270 }, { "epoch": 3.0776699029126213, "grad_norm": 0.20151656866073608, "learning_rate": 0.00016133903280268362, "loss": 0.0373, "step": 280 }, { "epoch": 3.188626907073509, "grad_norm": 0.3590203821659088, "learning_rate": 0.00015845781789427377, "loss": 0.0358, "step": 290 }, { "epoch": 3.2995839112343965, "grad_norm": 0.20630675554275513, "learning_rate": 0.000155500908021347, "loss": 0.0299, "step": 300 }, { "epoch": 3.410540915395284, "grad_norm": 0.3287246525287628, "learning_rate": 0.000152472131982103, "loss": 0.0331, "step": 310 }, { "epoch": 3.5214979195561718, "grad_norm": 0.24394913017749786, "learning_rate": 0.0001493754116317029, "loss": 0.0368, "step": 320 }, { "epoch": 3.63245492371706, "grad_norm": 0.20165830850601196, "learning_rate": 0.0001462147568039977, "loss": 0.0336, "step": 330 }, { "epoch": 3.7434119278779474, "grad_norm": 0.2538021504878998, "learning_rate": 0.00014299426011933568, "loss": 0.0295, "step": 340 }, { "epoch": 3.854368932038835, "grad_norm": 0.36229604482650757, "learning_rate": 0.00013971809168517298, "loss": 0.0358, "step": 350 }, { "epoch": 3.9653259361997226, "grad_norm": 0.4092184603214264, "learning_rate": 0.00013639049369634876, "loss": 0.034, "step": 360 }, { "epoch": 4.066574202496533, "grad_norm": 0.11960680782794952, "learning_rate": 0.00013301577494201664, "loss": 0.0233, "step": 370 }, { "epoch": 4.17753120665742, "grad_norm": 0.26415354013442993, "learning_rate": 0.00012959830522634596, "loss": 0.02, "step": 380 }, { "epoch": 4.288488210818308, "grad_norm": 0.21966516971588135, "learning_rate": 0.00012614250971021657, "loss": 0.0225, "step": 390 }, { "epoch": 4.399445214979195, "grad_norm": 0.2905697524547577, "learning_rate": 0.00012265286318123415, "loss": 0.0244, "step": 400 }, { "epoch": 4.510402219140083, "grad_norm": 0.24163606762886047, "learning_rate": 0.00011913388425948584, "loss": 0.017, "step": 410 }, { "epoch": 4.621359223300971, "grad_norm": 0.40009695291519165, "learning_rate": 0.00011559012954653865, "loss": 0.0219, "step": 420 }, { "epoch": 4.732316227461858, "grad_norm": 0.1963382512331009, "learning_rate": 0.0001120261877252568, "loss": 0.0179, "step": 430 }, { "epoch": 4.843273231622746, "grad_norm": 0.33989155292510986, "learning_rate": 0.00010844667361807842, "loss": 0.0198, "step": 440 }, { "epoch": 4.954230235783633, "grad_norm": 0.38484710454940796, "learning_rate": 0.00010485622221144484, "loss": 0.0249, "step": 450 }, { "epoch": 5.055478502080444, "grad_norm": 0.18945415318012238, "learning_rate": 0.00010125948265412033, "loss": 0.0177, "step": 460 }, { "epoch": 5.166435506241331, "grad_norm": 0.25906893610954285, "learning_rate": 9.766111223717352e-05, "loss": 0.0127, "step": 470 }, { "epoch": 5.277392510402219, "grad_norm": 0.23804187774658203, "learning_rate": 9.406577036341548e-05, "loss": 0.0128, "step": 480 }, { "epoch": 5.388349514563107, "grad_norm": 0.20456787943840027, "learning_rate": 9.047811251410376e-05, "loss": 0.0111, "step": 490 }, { "epoch": 5.499306518723994, "grad_norm": 0.15757159888744354, "learning_rate": 8.690278422072384e-05, "loss": 0.0101, "step": 500 }, { "epoch": 5.610263522884882, "grad_norm": 0.16691505908966064, "learning_rate": 8.334441504965455e-05, "loss": 0.0115, "step": 510 }, { "epoch": 5.721220527045769, "grad_norm": 0.5055399537086487, "learning_rate": 7.980761260750607e-05, "loss": 0.0088, "step": 520 }, { "epoch": 5.832177531206657, "grad_norm": 0.15076065063476562, "learning_rate": 7.629695657489257e-05, "loss": 0.0117, "step": 530 }, { "epoch": 5.943134535367545, "grad_norm": 0.09655993431806564, "learning_rate": 7.281699277636572e-05, "loss": 0.0111, "step": 540 }, { "epoch": 6.044382801664355, "grad_norm": 0.4866645336151123, "learning_rate": 6.93722272941869e-05, "loss": 0.0092, "step": 550 }, { "epoch": 6.155339805825243, "grad_norm": 0.1816895604133606, "learning_rate": 6.59671206335602e-05, "loss": 0.0082, "step": 560 }, { "epoch": 6.26629680998613, "grad_norm": 0.22271257638931274, "learning_rate": 6.260608194688206e-05, "loss": 0.0046, "step": 570 }, { "epoch": 6.377253814147018, "grad_norm": 0.06787201762199402, "learning_rate": 5.929346332448511e-05, "loss": 0.0051, "step": 580 }, { "epoch": 6.4882108183079055, "grad_norm": 0.09298055619001389, "learning_rate": 5.6033554159270294e-05, "loss": 0.0054, "step": 590 }, { "epoch": 6.599167822468793, "grad_norm": 0.03731105476617813, "learning_rate": 5.283057559252341e-05, "loss": 0.0053, "step": 600 }, { "epoch": 6.710124826629681, "grad_norm": 0.10652171820402145, "learning_rate": 4.96886750481082e-05, "loss": 0.0057, "step": 610 }, { "epoch": 6.821081830790568, "grad_norm": 0.2607424259185791, "learning_rate": 4.661192086211366e-05, "loss": 0.0077, "step": 620 }, { "epoch": 6.932038834951456, "grad_norm": 0.11328639835119247, "learning_rate": 4.360429701490934e-05, "loss": 0.0073, "step": 630 }, { "epoch": 7.033287101248266, "grad_norm": 0.0941685363650322, "learning_rate": 4.06696979724298e-05, "loss": 0.0039, "step": 640 }, { "epoch": 7.144244105409154, "grad_norm": 0.45776239037513733, "learning_rate": 3.7811923643367974e-05, "loss": 0.0032, "step": 650 }, { "epoch": 7.2552011095700415, "grad_norm": 0.08863729238510132, "learning_rate": 3.503467445880789e-05, "loss": 0.0026, "step": 660 }, { "epoch": 7.366158113730929, "grad_norm": 0.04661976918578148, "learning_rate": 3.2341546580666796e-05, "loss": 0.0024, "step": 670 }, { "epoch": 7.477115117891817, "grad_norm": 0.08003357797861099, "learning_rate": 2.9736027245152275e-05, "loss": 0.0022, "step": 680 }, { "epoch": 7.588072122052704, "grad_norm": 0.15967042744159698, "learning_rate": 2.722149024726307e-05, "loss": 0.0024, "step": 690 }, { "epoch": 7.699029126213592, "grad_norm": 0.0572751984000206, "learning_rate": 2.480119157218108e-05, "loss": 0.003, "step": 700 }, { "epoch": 7.8099861303744795, "grad_norm": 0.0780700072646141, "learning_rate": 2.247826517921121e-05, "loss": 0.0035, "step": 710 }, { "epoch": 7.920943134535367, "grad_norm": 0.19474399089813232, "learning_rate": 2.025571894372794e-05, "loss": 0.0027, "step": 720 }, { "epoch": 8.022191400832178, "grad_norm": 0.12848657369613647, "learning_rate": 1.813643076238375e-05, "loss": 0.002, "step": 730 }, { "epoch": 8.133148404993065, "grad_norm": 0.05772533640265465, "learning_rate": 1.6123144826622504e-05, "loss": 0.0017, "step": 740 }, { "epoch": 8.244105409153953, "grad_norm": 0.14121367037296295, "learning_rate": 1.4218468069322578e-05, "loss": 0.0013, "step": 750 }, { "epoch": 8.35506241331484, "grad_norm": 0.14342299103736877, "learning_rate": 1.2424866789171729e-05, "loss": 0.0016, "step": 760 }, { "epoch": 8.466019417475728, "grad_norm": 0.03438349440693855, "learning_rate": 1.0744663457143878e-05, "loss": 0.0011, "step": 770 }, { "epoch": 8.576976421636616, "grad_norm": 0.0756613090634346, "learning_rate": 9.180033709213454e-06, "loss": 0.0017, "step": 780 }, { "epoch": 8.687933425797503, "grad_norm": 0.0464102178812027, "learning_rate": 7.733003529201278e-06, "loss": 0.0014, "step": 790 }, { "epoch": 8.79889042995839, "grad_norm": 0.12452979385852814, "learning_rate": 6.405446625399481e-06, "loss": 0.0015, "step": 800 }, { "epoch": 8.909847434119278, "grad_norm": 0.08071909099817276, "learning_rate": 5.199082004372957e-06, "loss": 0.0014, "step": 810 }, { "epoch": 9.011095700416089, "grad_norm": 0.06948132812976837, "learning_rate": 4.115471745078314e-06, "loss": 0.0012, "step": 820 }, { "epoch": 9.122052704576976, "grad_norm": 0.07605510950088501, "learning_rate": 3.1560189761830728e-06, "loss": 0.0009, "step": 830 }, { "epoch": 9.233009708737864, "grad_norm": 0.0312280785292387, "learning_rate": 2.3219660592038285e-06, "loss": 0.0012, "step": 840 }, { "epoch": 9.343966712898752, "grad_norm": 0.02329327166080475, "learning_rate": 1.6143929798162704e-06, "loss": 0.001, "step": 850 }, { "epoch": 9.45492371705964, "grad_norm": 0.08054498583078384, "learning_rate": 1.034215949419748e-06, "loss": 0.0012, "step": 860 }, { "epoch": 9.565880721220527, "grad_norm": 0.09850303828716278, "learning_rate": 5.821862187675775e-07, "loss": 0.0011, "step": 870 }, { "epoch": 9.676837725381414, "grad_norm": 0.08373916149139404, "learning_rate": 2.588891051988895e-07, "loss": 0.0019, "step": 880 }, { "epoch": 9.787794729542302, "grad_norm": 0.017217393964529037, "learning_rate": 6.474323473194543e-08, "loss": 0.0009, "step": 890 }, { "epoch": 9.89875173370319, "grad_norm": 0.04848321154713631, "learning_rate": 0.0, "loss": 0.0009, "step": 900 } ], "logging_steps": 10, "max_steps": 900, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.301284175906406e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }