{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 735, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.040878896269800714, "grad_norm": 6.032278537750244, "learning_rate": 0.00019755102040816327, "loss": 2.6293, "step": 10 }, { "epoch": 0.08175779253960143, "grad_norm": 3.237109899520874, "learning_rate": 0.0001948299319727891, "loss": 2.008, "step": 20 }, { "epoch": 0.12263668880940215, "grad_norm": 2.7581236362457275, "learning_rate": 0.000192108843537415, "loss": 1.855, "step": 30 }, { "epoch": 0.16351558507920286, "grad_norm": 3.083601474761963, "learning_rate": 0.00018938775510204083, "loss": 1.7576, "step": 40 }, { "epoch": 0.20439448134900357, "grad_norm": 2.7807652950286865, "learning_rate": 0.0001866666666666667, "loss": 1.7434, "step": 50 }, { "epoch": 0.2452733776188043, "grad_norm": 2.6213436126708984, "learning_rate": 0.00018394557823129252, "loss": 1.682, "step": 60 }, { "epoch": 0.286152273888605, "grad_norm": 2.6907846927642822, "learning_rate": 0.00018122448979591838, "loss": 1.6749, "step": 70 }, { "epoch": 0.3270311701584057, "grad_norm": 5.332385540008545, "learning_rate": 0.00017850340136054421, "loss": 1.5744, "step": 80 }, { "epoch": 0.36791006642820645, "grad_norm": 3.23760986328125, "learning_rate": 0.00017578231292517008, "loss": 1.5338, "step": 90 }, { "epoch": 0.40878896269800713, "grad_norm": 3.479809522628784, "learning_rate": 0.00017306122448979594, "loss": 1.4143, "step": 100 }, { "epoch": 0.44966785896780787, "grad_norm": 2.7317419052124023, "learning_rate": 0.0001703401360544218, "loss": 1.4603, "step": 110 }, { "epoch": 0.4905467552376086, "grad_norm": 2.8279190063476562, "learning_rate": 0.00016761904761904763, "loss": 1.5169, "step": 120 }, { "epoch": 0.5314256515074093, "grad_norm": 3.096827983856201, "learning_rate": 0.0001648979591836735, "loss": 1.4396, "step": 130 }, { "epoch": 0.57230454777721, "grad_norm": 2.9124717712402344, "learning_rate": 0.00016217687074829932, "loss": 1.3824, "step": 140 }, { "epoch": 0.6131834440470108, "grad_norm": 3.073765754699707, "learning_rate": 0.00015945578231292519, "loss": 1.3974, "step": 150 }, { "epoch": 0.6540623403168114, "grad_norm": 2.5229296684265137, "learning_rate": 0.00015673469387755102, "loss": 1.4211, "step": 160 }, { "epoch": 0.6949412365866121, "grad_norm": 2.8301663398742676, "learning_rate": 0.00015401360544217688, "loss": 1.3567, "step": 170 }, { "epoch": 0.7358201328564129, "grad_norm": 2.6760520935058594, "learning_rate": 0.00015129251700680274, "loss": 1.3552, "step": 180 }, { "epoch": 0.7766990291262136, "grad_norm": 2.741240978240967, "learning_rate": 0.00014857142857142857, "loss": 1.408, "step": 190 }, { "epoch": 0.8175779253960143, "grad_norm": 2.028970956802368, "learning_rate": 0.00014585034013605443, "loss": 1.3497, "step": 200 }, { "epoch": 0.858456821665815, "grad_norm": 2.6956677436828613, "learning_rate": 0.00014312925170068027, "loss": 1.3592, "step": 210 }, { "epoch": 0.8993357179356157, "grad_norm": 2.448338270187378, "learning_rate": 0.00014040816326530613, "loss": 1.3309, "step": 220 }, { "epoch": 0.9402146142054164, "grad_norm": 2.267707347869873, "learning_rate": 0.00013768707482993196, "loss": 1.329, "step": 230 }, { "epoch": 0.9810935104752172, "grad_norm": 2.4610676765441895, "learning_rate": 0.00013496598639455782, "loss": 1.3194, "step": 240 }, { "epoch": 1.0204394481349004, "grad_norm": 2.539577007293701, "learning_rate": 0.00013224489795918368, "loss": 1.2862, "step": 250 }, { "epoch": 1.0613183444047012, "grad_norm": 2.377258777618408, "learning_rate": 0.00012952380952380954, "loss": 1.2052, "step": 260 }, { "epoch": 1.1021972406745018, "grad_norm": 2.4516634941101074, "learning_rate": 0.00012680272108843538, "loss": 1.1859, "step": 270 }, { "epoch": 1.1430761369443025, "grad_norm": 3.4123401641845703, "learning_rate": 0.00012408163265306124, "loss": 1.2547, "step": 280 }, { "epoch": 1.1839550332141031, "grad_norm": 2.9211533069610596, "learning_rate": 0.00012136054421768707, "loss": 1.187, "step": 290 }, { "epoch": 1.224833929483904, "grad_norm": 2.8444571495056152, "learning_rate": 0.00011863945578231292, "loss": 1.1871, "step": 300 }, { "epoch": 1.2657128257537047, "grad_norm": 2.8576762676239014, "learning_rate": 0.00011591836734693877, "loss": 1.2033, "step": 310 }, { "epoch": 1.3065917220235055, "grad_norm": 2.8612818717956543, "learning_rate": 0.00011319727891156464, "loss": 1.1889, "step": 320 }, { "epoch": 1.347470618293306, "grad_norm": 2.666062355041504, "learning_rate": 0.00011047619047619049, "loss": 1.1122, "step": 330 }, { "epoch": 1.3883495145631068, "grad_norm": 2.7399775981903076, "learning_rate": 0.00010775510204081634, "loss": 1.2343, "step": 340 }, { "epoch": 1.4292284108329074, "grad_norm": 2.89231538772583, "learning_rate": 0.00010503401360544218, "loss": 1.2481, "step": 350 }, { "epoch": 1.4701073071027082, "grad_norm": 2.9003829956054688, "learning_rate": 0.00010231292517006803, "loss": 1.1994, "step": 360 }, { "epoch": 1.510986203372509, "grad_norm": 3.2669005393981934, "learning_rate": 9.959183673469388e-05, "loss": 1.1804, "step": 370 }, { "epoch": 1.5518650996423098, "grad_norm": 3.1723055839538574, "learning_rate": 9.687074829931974e-05, "loss": 1.1427, "step": 380 }, { "epoch": 1.5927439959121104, "grad_norm": 3.0546045303344727, "learning_rate": 9.414965986394559e-05, "loss": 1.1616, "step": 390 }, { "epoch": 1.633622892181911, "grad_norm": 3.060725450515747, "learning_rate": 9.142857142857143e-05, "loss": 1.2045, "step": 400 }, { "epoch": 1.6745017884517117, "grad_norm": 2.8336000442504883, "learning_rate": 8.870748299319729e-05, "loss": 1.1922, "step": 410 }, { "epoch": 1.7153806847215125, "grad_norm": 2.888324499130249, "learning_rate": 8.598639455782314e-05, "loss": 1.1646, "step": 420 }, { "epoch": 1.7562595809913133, "grad_norm": 2.559555768966675, "learning_rate": 8.326530612244899e-05, "loss": 1.2204, "step": 430 }, { "epoch": 1.797138477261114, "grad_norm": 3.291555643081665, "learning_rate": 8.054421768707483e-05, "loss": 1.1561, "step": 440 }, { "epoch": 1.8380173735309147, "grad_norm": 2.7448337078094482, "learning_rate": 7.782312925170068e-05, "loss": 1.1783, "step": 450 }, { "epoch": 1.8788962698007152, "grad_norm": 3.201742649078369, "learning_rate": 7.510204081632653e-05, "loss": 1.1499, "step": 460 }, { "epoch": 1.919775166070516, "grad_norm": 3.3023853302001953, "learning_rate": 7.238095238095238e-05, "loss": 1.1929, "step": 470 }, { "epoch": 1.9606540623403168, "grad_norm": 2.9392805099487305, "learning_rate": 6.965986394557822e-05, "loss": 1.2178, "step": 480 }, { "epoch": 2.0, "grad_norm": 3.641012668609619, "learning_rate": 6.693877551020408e-05, "loss": 1.1721, "step": 490 }, { "epoch": 2.040878896269801, "grad_norm": 3.4774110317230225, "learning_rate": 6.421768707482993e-05, "loss": 1.0634, "step": 500 }, { "epoch": 2.0817577925396016, "grad_norm": 3.280494213104248, "learning_rate": 6.149659863945578e-05, "loss": 1.0549, "step": 510 }, { "epoch": 2.1226366888094024, "grad_norm": 3.7747931480407715, "learning_rate": 5.877551020408164e-05, "loss": 1.0331, "step": 520 }, { "epoch": 2.1635155850792027, "grad_norm": 3.1277899742126465, "learning_rate": 5.6054421768707486e-05, "loss": 1.0384, "step": 530 }, { "epoch": 2.2043944813490035, "grad_norm": 3.6484415531158447, "learning_rate": 5.333333333333333e-05, "loss": 1.0136, "step": 540 }, { "epoch": 2.2452733776188043, "grad_norm": 3.5595340728759766, "learning_rate": 5.061224489795918e-05, "loss": 1.0227, "step": 550 }, { "epoch": 2.286152273888605, "grad_norm": 3.5124669075012207, "learning_rate": 4.7891156462585034e-05, "loss": 1.0109, "step": 560 }, { "epoch": 2.327031170158406, "grad_norm": 3.5096631050109863, "learning_rate": 4.517006802721089e-05, "loss": 1.0131, "step": 570 }, { "epoch": 2.3679100664282062, "grad_norm": 3.6057543754577637, "learning_rate": 4.2448979591836735e-05, "loss": 1.0791, "step": 580 }, { "epoch": 2.408788962698007, "grad_norm": 3.254915714263916, "learning_rate": 3.972789115646259e-05, "loss": 0.9997, "step": 590 }, { "epoch": 2.449667858967808, "grad_norm": 4.024539947509766, "learning_rate": 3.7006802721088437e-05, "loss": 1.0364, "step": 600 }, { "epoch": 2.4905467552376086, "grad_norm": 3.6602048873901367, "learning_rate": 3.428571428571429e-05, "loss": 0.9388, "step": 610 }, { "epoch": 2.5314256515074094, "grad_norm": 3.5694384574890137, "learning_rate": 3.156462585034014e-05, "loss": 1.0256, "step": 620 }, { "epoch": 2.57230454777721, "grad_norm": 2.8324692249298096, "learning_rate": 2.8843537414965988e-05, "loss": 1.0396, "step": 630 }, { "epoch": 2.613183444047011, "grad_norm": 3.545496702194214, "learning_rate": 2.612244897959184e-05, "loss": 1.0569, "step": 640 }, { "epoch": 2.6540623403168113, "grad_norm": 3.602918863296509, "learning_rate": 2.3401360544217686e-05, "loss": 1.0485, "step": 650 }, { "epoch": 2.694941236586612, "grad_norm": 3.526660442352295, "learning_rate": 2.0680272108843536e-05, "loss": 1.0541, "step": 660 }, { "epoch": 2.735820132856413, "grad_norm": 3.0326685905456543, "learning_rate": 1.7959183673469387e-05, "loss": 0.9885, "step": 670 }, { "epoch": 2.7766990291262137, "grad_norm": 3.41005277633667, "learning_rate": 1.5238095238095241e-05, "loss": 1.0277, "step": 680 }, { "epoch": 2.817577925396014, "grad_norm": 3.461618423461914, "learning_rate": 1.251700680272109e-05, "loss": 1.0553, "step": 690 }, { "epoch": 2.858456821665815, "grad_norm": 3.59061861038208, "learning_rate": 9.795918367346939e-06, "loss": 1.0436, "step": 700 }, { "epoch": 2.8993357179356156, "grad_norm": 3.4869349002838135, "learning_rate": 7.07482993197279e-06, "loss": 1.0493, "step": 710 }, { "epoch": 2.9402146142054164, "grad_norm": 3.2588963508605957, "learning_rate": 4.353741496598639e-06, "loss": 1.0311, "step": 720 }, { "epoch": 2.981093510475217, "grad_norm": 3.391941785812378, "learning_rate": 1.63265306122449e-06, "loss": 0.9652, "step": 730 } ], "logging_steps": 10, "max_steps": 735, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.5661456203854643e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }