| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 9.89875173370319, | |
| "eval_steps": 500, | |
| "global_step": 900, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.11095700416088766, | |
| "grad_norm": 0.31662145256996155, | |
| "learning_rate": 7.407407407407407e-05, | |
| "loss": 0.5604, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.22191400832177532, | |
| "grad_norm": 0.38665255904197693, | |
| "learning_rate": 0.00014814814814814815, | |
| "loss": 0.3448, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.332871012482663, | |
| "grad_norm": 0.38282278180122375, | |
| "learning_rate": 0.00019999417253661235, | |
| "loss": 0.1345, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.44382801664355065, | |
| "grad_norm": 0.33959391713142395, | |
| "learning_rate": 0.000199890592080658, | |
| "loss": 0.1206, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.5547850208044383, | |
| "grad_norm": 0.2943621873855591, | |
| "learning_rate": 0.00019965766682369186, | |
| "loss": 0.1234, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.665742024965326, | |
| "grad_norm": 0.25359126925468445, | |
| "learning_rate": 0.00019929569837240564, | |
| "loss": 0.1039, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.7766990291262136, | |
| "grad_norm": 0.23930878937244415, | |
| "learning_rate": 0.0001988051554269675, | |
| "loss": 0.102, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.8876560332871013, | |
| "grad_norm": 0.2013150006532669, | |
| "learning_rate": 0.00019818667317411865, | |
| "loss": 0.0974, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.9986130374479889, | |
| "grad_norm": 0.25096118450164795, | |
| "learning_rate": 0.00019744105246469263, | |
| "loss": 0.099, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.0998613037447988, | |
| "grad_norm": 0.25178226828575134, | |
| "learning_rate": 0.0001965692587766216, | |
| "loss": 0.0714, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.2108183079056865, | |
| "grad_norm": 0.2704208195209503, | |
| "learning_rate": 0.00019557242096477327, | |
| "loss": 0.0771, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.3217753120665743, | |
| "grad_norm": 0.22107760608196259, | |
| "learning_rate": 0.00019445182979923654, | |
| "loss": 0.0703, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.4327323162274619, | |
| "grad_norm": 0.26953792572021484, | |
| "learning_rate": 0.00019320893629394873, | |
| "loss": 0.0753, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.5436893203883495, | |
| "grad_norm": 0.2142401486635208, | |
| "learning_rate": 0.00019184534982782904, | |
| "loss": 0.0724, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.6546463245492373, | |
| "grad_norm": 0.25699618458747864, | |
| "learning_rate": 0.00019036283606085053, | |
| "loss": 0.0648, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.765603328710125, | |
| "grad_norm": 0.2224379926919937, | |
| "learning_rate": 0.00018876331464774945, | |
| "loss": 0.0706, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.8765603328710125, | |
| "grad_norm": 0.23435620963573456, | |
| "learning_rate": 0.0001870488567523318, | |
| "loss": 0.0695, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.9875173370319001, | |
| "grad_norm": 0.18676415085792542, | |
| "learning_rate": 0.00018522168236559695, | |
| "loss": 0.0615, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 2.08876560332871, | |
| "grad_norm": 0.24162153899669647, | |
| "learning_rate": 0.00018328415743114912, | |
| "loss": 0.0445, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 2.1997226074895977, | |
| "grad_norm": 0.3869277536869049, | |
| "learning_rate": 0.00018123879078162097, | |
| "loss": 0.0502, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 2.3106796116504853, | |
| "grad_norm": 0.3037394881248474, | |
| "learning_rate": 0.00017908823089007457, | |
| "loss": 0.0482, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 2.421636615811373, | |
| "grad_norm": 0.18976379930973053, | |
| "learning_rate": 0.00017683526244058716, | |
| "loss": 0.0528, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 2.5325936199722605, | |
| "grad_norm": 0.30705705285072327, | |
| "learning_rate": 0.00017448280272246212, | |
| "loss": 0.0521, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 2.6435506241331486, | |
| "grad_norm": 0.21610881388187408, | |
| "learning_rate": 0.000172033897852734, | |
| "loss": 0.0535, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 2.754507628294036, | |
| "grad_norm": 0.18693220615386963, | |
| "learning_rate": 0.00016949171883185918, | |
| "loss": 0.0517, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.8654646324549238, | |
| "grad_norm": 0.3321268558502197, | |
| "learning_rate": 0.0001668595574376992, | |
| "loss": 0.0407, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.9764216366158114, | |
| "grad_norm": 0.20721495151519775, | |
| "learning_rate": 0.000164140821963114, | |
| "loss": 0.0417, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 3.0776699029126213, | |
| "grad_norm": 0.20151656866073608, | |
| "learning_rate": 0.00016133903280268362, | |
| "loss": 0.0373, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 3.188626907073509, | |
| "grad_norm": 0.3590203821659088, | |
| "learning_rate": 0.00015845781789427377, | |
| "loss": 0.0358, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 3.2995839112343965, | |
| "grad_norm": 0.20630675554275513, | |
| "learning_rate": 0.000155500908021347, | |
| "loss": 0.0299, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 3.410540915395284, | |
| "grad_norm": 0.3287246525287628, | |
| "learning_rate": 0.000152472131982103, | |
| "loss": 0.0331, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 3.5214979195561718, | |
| "grad_norm": 0.24394913017749786, | |
| "learning_rate": 0.0001493754116317029, | |
| "loss": 0.0368, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 3.63245492371706, | |
| "grad_norm": 0.20165830850601196, | |
| "learning_rate": 0.0001462147568039977, | |
| "loss": 0.0336, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 3.7434119278779474, | |
| "grad_norm": 0.2538021504878998, | |
| "learning_rate": 0.00014299426011933568, | |
| "loss": 0.0295, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 3.854368932038835, | |
| "grad_norm": 0.36229604482650757, | |
| "learning_rate": 0.00013971809168517298, | |
| "loss": 0.0358, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 3.9653259361997226, | |
| "grad_norm": 0.4092184603214264, | |
| "learning_rate": 0.00013639049369634876, | |
| "loss": 0.034, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 4.066574202496533, | |
| "grad_norm": 0.11960680782794952, | |
| "learning_rate": 0.00013301577494201664, | |
| "loss": 0.0233, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 4.17753120665742, | |
| "grad_norm": 0.26415354013442993, | |
| "learning_rate": 0.00012959830522634596, | |
| "loss": 0.02, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 4.288488210818308, | |
| "grad_norm": 0.21966516971588135, | |
| "learning_rate": 0.00012614250971021657, | |
| "loss": 0.0225, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 4.399445214979195, | |
| "grad_norm": 0.2905697524547577, | |
| "learning_rate": 0.00012265286318123415, | |
| "loss": 0.0244, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 4.510402219140083, | |
| "grad_norm": 0.24163606762886047, | |
| "learning_rate": 0.00011913388425948584, | |
| "loss": 0.017, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 4.621359223300971, | |
| "grad_norm": 0.40009695291519165, | |
| "learning_rate": 0.00011559012954653865, | |
| "loss": 0.0219, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 4.732316227461858, | |
| "grad_norm": 0.1963382512331009, | |
| "learning_rate": 0.0001120261877252568, | |
| "loss": 0.0179, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 4.843273231622746, | |
| "grad_norm": 0.33989155292510986, | |
| "learning_rate": 0.00010844667361807842, | |
| "loss": 0.0198, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 4.954230235783633, | |
| "grad_norm": 0.38484710454940796, | |
| "learning_rate": 0.00010485622221144484, | |
| "loss": 0.0249, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 5.055478502080444, | |
| "grad_norm": 0.18945415318012238, | |
| "learning_rate": 0.00010125948265412033, | |
| "loss": 0.0177, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 5.166435506241331, | |
| "grad_norm": 0.25906893610954285, | |
| "learning_rate": 9.766111223717352e-05, | |
| "loss": 0.0127, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 5.277392510402219, | |
| "grad_norm": 0.23804187774658203, | |
| "learning_rate": 9.406577036341548e-05, | |
| "loss": 0.0128, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 5.388349514563107, | |
| "grad_norm": 0.20456787943840027, | |
| "learning_rate": 9.047811251410376e-05, | |
| "loss": 0.0111, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 5.499306518723994, | |
| "grad_norm": 0.15757159888744354, | |
| "learning_rate": 8.690278422072384e-05, | |
| "loss": 0.0101, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 5.610263522884882, | |
| "grad_norm": 0.16691505908966064, | |
| "learning_rate": 8.334441504965455e-05, | |
| "loss": 0.0115, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 5.721220527045769, | |
| "grad_norm": 0.5055399537086487, | |
| "learning_rate": 7.980761260750607e-05, | |
| "loss": 0.0088, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 5.832177531206657, | |
| "grad_norm": 0.15076065063476562, | |
| "learning_rate": 7.629695657489257e-05, | |
| "loss": 0.0117, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 5.943134535367545, | |
| "grad_norm": 0.09655993431806564, | |
| "learning_rate": 7.281699277636572e-05, | |
| "loss": 0.0111, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 6.044382801664355, | |
| "grad_norm": 0.4866645336151123, | |
| "learning_rate": 6.93722272941869e-05, | |
| "loss": 0.0092, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 6.155339805825243, | |
| "grad_norm": 0.1816895604133606, | |
| "learning_rate": 6.59671206335602e-05, | |
| "loss": 0.0082, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 6.26629680998613, | |
| "grad_norm": 0.22271257638931274, | |
| "learning_rate": 6.260608194688206e-05, | |
| "loss": 0.0046, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 6.377253814147018, | |
| "grad_norm": 0.06787201762199402, | |
| "learning_rate": 5.929346332448511e-05, | |
| "loss": 0.0051, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 6.4882108183079055, | |
| "grad_norm": 0.09298055619001389, | |
| "learning_rate": 5.6033554159270294e-05, | |
| "loss": 0.0054, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 6.599167822468793, | |
| "grad_norm": 0.03731105476617813, | |
| "learning_rate": 5.283057559252341e-05, | |
| "loss": 0.0053, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 6.710124826629681, | |
| "grad_norm": 0.10652171820402145, | |
| "learning_rate": 4.96886750481082e-05, | |
| "loss": 0.0057, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 6.821081830790568, | |
| "grad_norm": 0.2607424259185791, | |
| "learning_rate": 4.661192086211366e-05, | |
| "loss": 0.0077, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 6.932038834951456, | |
| "grad_norm": 0.11328639835119247, | |
| "learning_rate": 4.360429701490934e-05, | |
| "loss": 0.0073, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 7.033287101248266, | |
| "grad_norm": 0.0941685363650322, | |
| "learning_rate": 4.06696979724298e-05, | |
| "loss": 0.0039, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 7.144244105409154, | |
| "grad_norm": 0.45776239037513733, | |
| "learning_rate": 3.7811923643367974e-05, | |
| "loss": 0.0032, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 7.2552011095700415, | |
| "grad_norm": 0.08863729238510132, | |
| "learning_rate": 3.503467445880789e-05, | |
| "loss": 0.0026, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 7.366158113730929, | |
| "grad_norm": 0.04661976918578148, | |
| "learning_rate": 3.2341546580666796e-05, | |
| "loss": 0.0024, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 7.477115117891817, | |
| "grad_norm": 0.08003357797861099, | |
| "learning_rate": 2.9736027245152275e-05, | |
| "loss": 0.0022, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 7.588072122052704, | |
| "grad_norm": 0.15967042744159698, | |
| "learning_rate": 2.722149024726307e-05, | |
| "loss": 0.0024, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 7.699029126213592, | |
| "grad_norm": 0.0572751984000206, | |
| "learning_rate": 2.480119157218108e-05, | |
| "loss": 0.003, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 7.8099861303744795, | |
| "grad_norm": 0.0780700072646141, | |
| "learning_rate": 2.247826517921121e-05, | |
| "loss": 0.0035, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 7.920943134535367, | |
| "grad_norm": 0.19474399089813232, | |
| "learning_rate": 2.025571894372794e-05, | |
| "loss": 0.0027, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 8.022191400832178, | |
| "grad_norm": 0.12848657369613647, | |
| "learning_rate": 1.813643076238375e-05, | |
| "loss": 0.002, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 8.133148404993065, | |
| "grad_norm": 0.05772533640265465, | |
| "learning_rate": 1.6123144826622504e-05, | |
| "loss": 0.0017, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 8.244105409153953, | |
| "grad_norm": 0.14121367037296295, | |
| "learning_rate": 1.4218468069322578e-05, | |
| "loss": 0.0013, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 8.35506241331484, | |
| "grad_norm": 0.14342299103736877, | |
| "learning_rate": 1.2424866789171729e-05, | |
| "loss": 0.0016, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 8.466019417475728, | |
| "grad_norm": 0.03438349440693855, | |
| "learning_rate": 1.0744663457143878e-05, | |
| "loss": 0.0011, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 8.576976421636616, | |
| "grad_norm": 0.0756613090634346, | |
| "learning_rate": 9.180033709213454e-06, | |
| "loss": 0.0017, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 8.687933425797503, | |
| "grad_norm": 0.0464102178812027, | |
| "learning_rate": 7.733003529201278e-06, | |
| "loss": 0.0014, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 8.79889042995839, | |
| "grad_norm": 0.12452979385852814, | |
| "learning_rate": 6.405446625399481e-06, | |
| "loss": 0.0015, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 8.909847434119278, | |
| "grad_norm": 0.08071909099817276, | |
| "learning_rate": 5.199082004372957e-06, | |
| "loss": 0.0014, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 9.011095700416089, | |
| "grad_norm": 0.06948132812976837, | |
| "learning_rate": 4.115471745078314e-06, | |
| "loss": 0.0012, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 9.122052704576976, | |
| "grad_norm": 0.07605510950088501, | |
| "learning_rate": 3.1560189761830728e-06, | |
| "loss": 0.0009, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 9.233009708737864, | |
| "grad_norm": 0.0312280785292387, | |
| "learning_rate": 2.3219660592038285e-06, | |
| "loss": 0.0012, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 9.343966712898752, | |
| "grad_norm": 0.02329327166080475, | |
| "learning_rate": 1.6143929798162704e-06, | |
| "loss": 0.001, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 9.45492371705964, | |
| "grad_norm": 0.08054498583078384, | |
| "learning_rate": 1.034215949419748e-06, | |
| "loss": 0.0012, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 9.565880721220527, | |
| "grad_norm": 0.09850303828716278, | |
| "learning_rate": 5.821862187675775e-07, | |
| "loss": 0.0011, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 9.676837725381414, | |
| "grad_norm": 0.08373916149139404, | |
| "learning_rate": 2.588891051988895e-07, | |
| "loss": 0.0019, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 9.787794729542302, | |
| "grad_norm": 0.017217393964529037, | |
| "learning_rate": 6.474323473194543e-08, | |
| "loss": 0.0009, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 9.89875173370319, | |
| "grad_norm": 0.04848321154713631, | |
| "learning_rate": 0.0, | |
| "loss": 0.0009, | |
| "step": 900 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 900, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 9.301284175906406e+16, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |