{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 439, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011389521640091117, "grad_norm": 1.0344932079315186, "learning_rate": 1.090909090909091e-06, "loss": 1.3125, "step": 5 }, { "epoch": 0.022779043280182234, "grad_norm": 1.0441877841949463, "learning_rate": 2.454545454545455e-06, "loss": 1.3976, "step": 10 }, { "epoch": 0.03416856492027335, "grad_norm": 0.8714914917945862, "learning_rate": 3.818181818181818e-06, "loss": 1.3306, "step": 15 }, { "epoch": 0.04555808656036447, "grad_norm": 0.6306040287017822, "learning_rate": 5.181818181818182e-06, "loss": 1.3198, "step": 20 }, { "epoch": 0.05694760820045558, "grad_norm": 0.6439117193222046, "learning_rate": 6.545454545454545e-06, "loss": 1.3367, "step": 25 }, { "epoch": 0.0683371298405467, "grad_norm": 0.5493311882019043, "learning_rate": 7.909090909090909e-06, "loss": 1.3273, "step": 30 }, { "epoch": 0.07972665148063782, "grad_norm": 0.5211251974105835, "learning_rate": 9.272727272727273e-06, "loss": 1.2805, "step": 35 }, { "epoch": 0.09111617312072894, "grad_norm": 0.4951925277709961, "learning_rate": 1.0636363636363636e-05, "loss": 1.2461, "step": 40 }, { "epoch": 0.10250569476082004, "grad_norm": 0.5532836318016052, "learning_rate": 1.2e-05, "loss": 1.2666, "step": 45 }, { "epoch": 0.11389521640091116, "grad_norm": 0.49684393405914307, "learning_rate": 1.3363636363636364e-05, "loss": 1.2291, "step": 50 }, { "epoch": 0.1252847380410023, "grad_norm": 0.42352697253227234, "learning_rate": 1.4727272727272728e-05, "loss": 1.2539, "step": 55 }, { "epoch": 0.1366742596810934, "grad_norm": 0.443546324968338, "learning_rate": 1.6090909090909092e-05, "loss": 1.1928, "step": 60 }, { "epoch": 0.1480637813211845, "grad_norm": 0.48440542817115784, "learning_rate": 1.7454545454545456e-05, "loss": 1.2362, "step": 65 }, { "epoch": 0.15945330296127563, "grad_norm": 0.5563086867332458, "learning_rate": 1.881818181818182e-05, "loss": 1.2254, "step": 70 }, { "epoch": 0.17084282460136674, "grad_norm": 0.43645480275154114, "learning_rate": 2.0181818181818183e-05, "loss": 1.2314, "step": 75 }, { "epoch": 0.18223234624145787, "grad_norm": 0.46460843086242676, "learning_rate": 2.1545454545454544e-05, "loss": 1.213, "step": 80 }, { "epoch": 0.19362186788154898, "grad_norm": 0.5368738174438477, "learning_rate": 2.290909090909091e-05, "loss": 1.2347, "step": 85 }, { "epoch": 0.20501138952164008, "grad_norm": 0.6184517741203308, "learning_rate": 2.4272727272727275e-05, "loss": 1.1732, "step": 90 }, { "epoch": 0.2164009111617312, "grad_norm": 0.4807546138763428, "learning_rate": 2.5636363636363635e-05, "loss": 1.1216, "step": 95 }, { "epoch": 0.22779043280182232, "grad_norm": 0.508878767490387, "learning_rate": 2.7000000000000002e-05, "loss": 1.1522, "step": 100 }, { "epoch": 0.23917995444191345, "grad_norm": 0.484258234500885, "learning_rate": 2.8363636363636363e-05, "loss": 1.1326, "step": 105 }, { "epoch": 0.2505694760820046, "grad_norm": 0.5640544891357422, "learning_rate": 2.972727272727273e-05, "loss": 1.1838, "step": 110 }, { "epoch": 0.2619589977220957, "grad_norm": 0.5793269276618958, "learning_rate": 2.9999727562078572e-05, "loss": 1.1362, "step": 115 }, { "epoch": 0.2733485193621868, "grad_norm": 0.6144348978996277, "learning_rate": 2.9998620799983725e-05, "loss": 1.1469, "step": 120 }, { "epoch": 0.2847380410022779, "grad_norm": 0.5140024423599243, "learning_rate": 2.9996662749114444e-05, "loss": 1.113, "step": 125 }, { "epoch": 0.296127562642369, "grad_norm": 0.5761210918426514, "learning_rate": 2.999385352060538e-05, "loss": 1.1337, "step": 130 }, { "epoch": 0.30751708428246016, "grad_norm": 0.5640386343002319, "learning_rate": 2.9990193273902185e-05, "loss": 1.1025, "step": 135 }, { "epoch": 0.31890660592255127, "grad_norm": 0.5742846727371216, "learning_rate": 2.9985682216752418e-05, "loss": 1.0998, "step": 140 }, { "epoch": 0.33029612756264237, "grad_norm": 0.5623965263366699, "learning_rate": 2.9980320605193775e-05, "loss": 1.1085, "step": 145 }, { "epoch": 0.3416856492027335, "grad_norm": 0.684952974319458, "learning_rate": 2.9974108743539558e-05, "loss": 1.0412, "step": 150 }, { "epoch": 0.3530751708428246, "grad_norm": 0.6107752323150635, "learning_rate": 2.996704698436138e-05, "loss": 1.1049, "step": 155 }, { "epoch": 0.36446469248291574, "grad_norm": 0.5499274134635925, "learning_rate": 2.9959135728469197e-05, "loss": 1.084, "step": 160 }, { "epoch": 0.37585421412300685, "grad_norm": 0.6297823190689087, "learning_rate": 2.99503754248885e-05, "loss": 1.0479, "step": 165 }, { "epoch": 0.38724373576309795, "grad_norm": 0.579071581363678, "learning_rate": 2.994076657083489e-05, "loss": 1.0339, "step": 170 }, { "epoch": 0.39863325740318906, "grad_norm": 0.6385654211044312, "learning_rate": 2.993030971168582e-05, "loss": 1.0807, "step": 175 }, { "epoch": 0.41002277904328016, "grad_norm": 0.6286646127700806, "learning_rate": 2.9919005440949648e-05, "loss": 1.052, "step": 180 }, { "epoch": 0.4214123006833713, "grad_norm": 0.6202685832977295, "learning_rate": 2.9906854400231942e-05, "loss": 0.9854, "step": 185 }, { "epoch": 0.4328018223234624, "grad_norm": 0.6805747747421265, "learning_rate": 2.9893857279199103e-05, "loss": 1.0216, "step": 190 }, { "epoch": 0.44419134396355353, "grad_norm": 0.7440136671066284, "learning_rate": 2.9880014815539158e-05, "loss": 1.048, "step": 195 }, { "epoch": 0.45558086560364464, "grad_norm": 0.7811765074729919, "learning_rate": 2.9865327794919954e-05, "loss": 0.9698, "step": 200 }, { "epoch": 0.46697038724373574, "grad_norm": 0.6883424520492554, "learning_rate": 2.9849797050944517e-05, "loss": 0.9668, "step": 205 }, { "epoch": 0.4783599088838269, "grad_norm": 0.7067654728889465, "learning_rate": 2.9833423465103763e-05, "loss": 1.0072, "step": 210 }, { "epoch": 0.489749430523918, "grad_norm": 0.7656168937683105, "learning_rate": 2.981620796672647e-05, "loss": 0.9862, "step": 215 }, { "epoch": 0.5011389521640092, "grad_norm": 0.7423420548439026, "learning_rate": 2.9798151532926504e-05, "loss": 0.9068, "step": 220 }, { "epoch": 0.5125284738041003, "grad_norm": 0.7688584923744202, "learning_rate": 2.977925518854739e-05, "loss": 0.992, "step": 225 }, { "epoch": 0.5239179954441914, "grad_norm": 0.817163348197937, "learning_rate": 2.9759520006104146e-05, "loss": 0.952, "step": 230 }, { "epoch": 0.5353075170842825, "grad_norm": 0.8287720680236816, "learning_rate": 2.9738947105722373e-05, "loss": 0.9268, "step": 235 }, { "epoch": 0.5466970387243736, "grad_norm": 0.8259539008140564, "learning_rate": 2.9717537655074708e-05, "loss": 0.8883, "step": 240 }, { "epoch": 0.5580865603644647, "grad_norm": 0.7533463835716248, "learning_rate": 2.969529286931455e-05, "loss": 0.9673, "step": 245 }, { "epoch": 0.5694760820045558, "grad_norm": 0.8228470087051392, "learning_rate": 2.9672214011007087e-05, "loss": 0.9552, "step": 250 }, { "epoch": 0.5808656036446469, "grad_norm": 0.858070433139801, "learning_rate": 2.964830239005762e-05, "loss": 0.8839, "step": 255 }, { "epoch": 0.592255125284738, "grad_norm": 0.9179785847663879, "learning_rate": 2.9623559363637234e-05, "loss": 0.8456, "step": 260 }, { "epoch": 0.6036446469248291, "grad_norm": 0.9425612092018127, "learning_rate": 2.9597986336105776e-05, "loss": 0.9096, "step": 265 }, { "epoch": 0.6150341685649203, "grad_norm": 0.7805739045143127, "learning_rate": 2.9571584758932108e-05, "loss": 0.8979, "step": 270 }, { "epoch": 0.6264236902050114, "grad_norm": 0.8105698823928833, "learning_rate": 2.954435613061177e-05, "loss": 0.8979, "step": 275 }, { "epoch": 0.6378132118451025, "grad_norm": 0.8522108793258667, "learning_rate": 2.9516301996581906e-05, "loss": 0.8944, "step": 280 }, { "epoch": 0.6492027334851936, "grad_norm": 0.8105028867721558, "learning_rate": 2.948742394913354e-05, "loss": 0.8501, "step": 285 }, { "epoch": 0.6605922551252847, "grad_norm": 1.0309525728225708, "learning_rate": 2.945772362732122e-05, "loss": 0.8477, "step": 290 }, { "epoch": 0.6719817767653758, "grad_norm": 0.8388428092002869, "learning_rate": 2.9427202716869988e-05, "loss": 0.8436, "step": 295 }, { "epoch": 0.683371298405467, "grad_norm": 0.9071732759475708, "learning_rate": 2.9395862950079672e-05, "loss": 0.7864, "step": 300 }, { "epoch": 0.6947608200455581, "grad_norm": 0.8124439120292664, "learning_rate": 2.9363706105726613e-05, "loss": 0.8854, "step": 305 }, { "epoch": 0.7061503416856492, "grad_norm": 0.8722319006919861, "learning_rate": 2.9330734008962666e-05, "loss": 0.8301, "step": 310 }, { "epoch": 0.7175398633257403, "grad_norm": 1.59921133518219, "learning_rate": 2.9296948531211626e-05, "loss": 0.8504, "step": 315 }, { "epoch": 0.7289293849658315, "grad_norm": 0.8519591093063354, "learning_rate": 2.9262351590063004e-05, "loss": 0.7793, "step": 320 }, { "epoch": 0.7403189066059226, "grad_norm": 0.8795806765556335, "learning_rate": 2.9226945149163193e-05, "loss": 0.8116, "step": 325 }, { "epoch": 0.7517084282460137, "grad_norm": 0.9364266991615295, "learning_rate": 2.9190731218104018e-05, "loss": 0.8115, "step": 330 }, { "epoch": 0.7630979498861048, "grad_norm": 0.8541845679283142, "learning_rate": 2.915371185230866e-05, "loss": 0.8512, "step": 335 }, { "epoch": 0.7744874715261959, "grad_norm": 0.9761788249015808, "learning_rate": 2.9115889152915017e-05, "loss": 0.7849, "step": 340 }, { "epoch": 0.785876993166287, "grad_norm": 0.9925262331962585, "learning_rate": 2.9077265266656436e-05, "loss": 0.7846, "step": 345 }, { "epoch": 0.7972665148063781, "grad_norm": 0.9060655832290649, "learning_rate": 2.9037842385739875e-05, "loss": 0.803, "step": 350 }, { "epoch": 0.8086560364464692, "grad_norm": 1.0514848232269287, "learning_rate": 2.8997622747721457e-05, "loss": 0.8081, "step": 355 }, { "epoch": 0.8200455580865603, "grad_norm": 0.9304701089859009, "learning_rate": 2.8956608635379504e-05, "loss": 0.7955, "step": 360 }, { "epoch": 0.8314350797266514, "grad_norm": 0.8657699823379517, "learning_rate": 2.8914802376584958e-05, "loss": 0.8052, "step": 365 }, { "epoch": 0.8428246013667426, "grad_norm": 0.9663517475128174, "learning_rate": 2.8872206344169244e-05, "loss": 0.7788, "step": 370 }, { "epoch": 0.8542141230068337, "grad_norm": 0.9335425496101379, "learning_rate": 2.8828822955789597e-05, "loss": 0.7237, "step": 375 }, { "epoch": 0.8656036446469249, "grad_norm": 0.9053182005882263, "learning_rate": 2.8784654673791867e-05, "loss": 0.7874, "step": 380 }, { "epoch": 0.876993166287016, "grad_norm": 1.0356011390686035, "learning_rate": 2.873970400507073e-05, "loss": 0.804, "step": 385 }, { "epoch": 0.8883826879271071, "grad_norm": 1.020293116569519, "learning_rate": 2.8693973500927407e-05, "loss": 0.7587, "step": 390 }, { "epoch": 0.8997722095671982, "grad_norm": 0.9958662986755371, "learning_rate": 2.864746575692489e-05, "loss": 0.7968, "step": 395 }, { "epoch": 0.9111617312072893, "grad_norm": 1.0016448497772217, "learning_rate": 2.8600183412740562e-05, "loss": 0.7648, "step": 400 }, { "epoch": 0.9225512528473804, "grad_norm": 0.9963740706443787, "learning_rate": 2.8552129152016444e-05, "loss": 0.7726, "step": 405 }, { "epoch": 0.9339407744874715, "grad_norm": 0.92083340883255, "learning_rate": 2.850330570220683e-05, "loss": 0.7348, "step": 410 }, { "epoch": 0.9453302961275627, "grad_norm": 1.0512831211090088, "learning_rate": 2.845371583442349e-05, "loss": 0.8, "step": 415 }, { "epoch": 0.9567198177676538, "grad_norm": 1.076445460319519, "learning_rate": 2.8403362363278407e-05, "loss": 0.7718, "step": 420 }, { "epoch": 0.9681093394077449, "grad_norm": 0.9659481644630432, "learning_rate": 2.835224814672401e-05, "loss": 0.7325, "step": 425 }, { "epoch": 0.979498861047836, "grad_norm": 1.0408178567886353, "learning_rate": 2.8300376085890966e-05, "loss": 0.7066, "step": 430 }, { "epoch": 0.9908883826879271, "grad_norm": 1.320637583732605, "learning_rate": 2.824774912492352e-05, "loss": 0.6915, "step": 435 } ], "logging_steps": 5, "max_steps": 2195, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.281137357177487e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }