{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 439,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.011389521640091117,
      "grad_norm": 1.0344932079315186,
      "learning_rate": 1.090909090909091e-06,
      "loss": 1.3125,
      "step": 5
    },
    {
      "epoch": 0.022779043280182234,
      "grad_norm": 1.0441877841949463,
      "learning_rate": 2.454545454545455e-06,
      "loss": 1.3976,
      "step": 10
    },
    {
      "epoch": 0.03416856492027335,
      "grad_norm": 0.8714914917945862,
      "learning_rate": 3.818181818181818e-06,
      "loss": 1.3306,
      "step": 15
    },
    {
      "epoch": 0.04555808656036447,
      "grad_norm": 0.6306040287017822,
      "learning_rate": 5.181818181818182e-06,
      "loss": 1.3198,
      "step": 20
    },
    {
      "epoch": 0.05694760820045558,
      "grad_norm": 0.6439117193222046,
      "learning_rate": 6.545454545454545e-06,
      "loss": 1.3367,
      "step": 25
    },
    {
      "epoch": 0.0683371298405467,
      "grad_norm": 0.5493311882019043,
      "learning_rate": 7.909090909090909e-06,
      "loss": 1.3273,
      "step": 30
    },
    {
      "epoch": 0.07972665148063782,
      "grad_norm": 0.5211251974105835,
      "learning_rate": 9.272727272727273e-06,
      "loss": 1.2805,
      "step": 35
    },
    {
      "epoch": 0.09111617312072894,
      "grad_norm": 0.4951925277709961,
      "learning_rate": 1.0636363636363636e-05,
      "loss": 1.2461,
      "step": 40
    },
    {
      "epoch": 0.10250569476082004,
      "grad_norm": 0.5532836318016052,
      "learning_rate": 1.2e-05,
      "loss": 1.2666,
      "step": 45
    },
    {
      "epoch": 0.11389521640091116,
      "grad_norm": 0.49684393405914307,
      "learning_rate": 1.3363636363636364e-05,
      "loss": 1.2291,
      "step": 50
    },
    {
      "epoch": 0.1252847380410023,
      "grad_norm": 0.42352697253227234,
      "learning_rate": 1.4727272727272728e-05,
      "loss": 1.2539,
      "step": 55
    },
    {
      "epoch": 0.1366742596810934,
      "grad_norm": 0.443546324968338,
      "learning_rate": 1.6090909090909092e-05,
      "loss": 1.1928,
      "step": 60
    },
    {
      "epoch": 0.1480637813211845,
      "grad_norm": 0.48440542817115784,
      "learning_rate": 1.7454545454545456e-05,
      "loss": 1.2362,
      "step": 65
    },
    {
      "epoch": 0.15945330296127563,
      "grad_norm": 0.5563086867332458,
      "learning_rate": 1.881818181818182e-05,
      "loss": 1.2254,
      "step": 70
    },
    {
      "epoch": 0.17084282460136674,
      "grad_norm": 0.43645480275154114,
      "learning_rate": 2.0181818181818183e-05,
      "loss": 1.2314,
      "step": 75
    },
    {
      "epoch": 0.18223234624145787,
      "grad_norm": 0.46460843086242676,
      "learning_rate": 2.1545454545454544e-05,
      "loss": 1.213,
      "step": 80
    },
    {
      "epoch": 0.19362186788154898,
      "grad_norm": 0.5368738174438477,
      "learning_rate": 2.290909090909091e-05,
      "loss": 1.2347,
      "step": 85
    },
    {
      "epoch": 0.20501138952164008,
      "grad_norm": 0.6184517741203308,
      "learning_rate": 2.4272727272727275e-05,
      "loss": 1.1732,
      "step": 90
    },
    {
      "epoch": 0.2164009111617312,
      "grad_norm": 0.4807546138763428,
      "learning_rate": 2.5636363636363635e-05,
      "loss": 1.1216,
      "step": 95
    },
    {
      "epoch": 0.22779043280182232,
      "grad_norm": 0.508878767490387,
      "learning_rate": 2.7000000000000002e-05,
      "loss": 1.1522,
      "step": 100
    },
    {
      "epoch": 0.23917995444191345,
      "grad_norm": 0.484258234500885,
      "learning_rate": 2.8363636363636363e-05,
      "loss": 1.1326,
      "step": 105
    },
    {
      "epoch": 0.2505694760820046,
      "grad_norm": 0.5640544891357422,
      "learning_rate": 2.972727272727273e-05,
      "loss": 1.1838,
      "step": 110
    },
    {
      "epoch": 0.2619589977220957,
      "grad_norm": 0.5793269276618958,
      "learning_rate": 2.9999727562078572e-05,
      "loss": 1.1362,
      "step": 115
    },
    {
      "epoch": 0.2733485193621868,
      "grad_norm": 0.6144348978996277,
      "learning_rate": 2.9998620799983725e-05,
      "loss": 1.1469,
      "step": 120
    },
    {
      "epoch": 0.2847380410022779,
      "grad_norm": 0.5140024423599243,
      "learning_rate": 2.9996662749114444e-05,
      "loss": 1.113,
      "step": 125
    },
    {
      "epoch": 0.296127562642369,
      "grad_norm": 0.5761210918426514,
      "learning_rate": 2.999385352060538e-05,
      "loss": 1.1337,
      "step": 130
    },
    {
      "epoch": 0.30751708428246016,
      "grad_norm": 0.5640386343002319,
      "learning_rate": 2.9990193273902185e-05,
      "loss": 1.1025,
      "step": 135
    },
    {
      "epoch": 0.31890660592255127,
      "grad_norm": 0.5742846727371216,
      "learning_rate": 2.9985682216752418e-05,
      "loss": 1.0998,
      "step": 140
    },
    {
      "epoch": 0.33029612756264237,
      "grad_norm": 0.5623965263366699,
      "learning_rate": 2.9980320605193775e-05,
      "loss": 1.1085,
      "step": 145
    },
    {
      "epoch": 0.3416856492027335,
      "grad_norm": 0.684952974319458,
      "learning_rate": 2.9974108743539558e-05,
      "loss": 1.0412,
      "step": 150
    },
    {
      "epoch": 0.3530751708428246,
      "grad_norm": 0.6107752323150635,
      "learning_rate": 2.996704698436138e-05,
      "loss": 1.1049,
      "step": 155
    },
    {
      "epoch": 0.36446469248291574,
      "grad_norm": 0.5499274134635925,
      "learning_rate": 2.9959135728469197e-05,
      "loss": 1.084,
      "step": 160
    },
    {
      "epoch": 0.37585421412300685,
      "grad_norm": 0.6297823190689087,
      "learning_rate": 2.99503754248885e-05,
      "loss": 1.0479,
      "step": 165
    },
    {
      "epoch": 0.38724373576309795,
      "grad_norm": 0.579071581363678,
      "learning_rate": 2.994076657083489e-05,
      "loss": 1.0339,
      "step": 170
    },
    {
      "epoch": 0.39863325740318906,
      "grad_norm": 0.6385654211044312,
      "learning_rate": 2.993030971168582e-05,
      "loss": 1.0807,
      "step": 175
    },
    {
      "epoch": 0.41002277904328016,
      "grad_norm": 0.6286646127700806,
      "learning_rate": 2.9919005440949648e-05,
      "loss": 1.052,
      "step": 180
    },
    {
      "epoch": 0.4214123006833713,
      "grad_norm": 0.6202685832977295,
      "learning_rate": 2.9906854400231942e-05,
      "loss": 0.9854,
      "step": 185
    },
    {
      "epoch": 0.4328018223234624,
      "grad_norm": 0.6805747747421265,
      "learning_rate": 2.9893857279199103e-05,
      "loss": 1.0216,
      "step": 190
    },
    {
      "epoch": 0.44419134396355353,
      "grad_norm": 0.7440136671066284,
      "learning_rate": 2.9880014815539158e-05,
      "loss": 1.048,
      "step": 195
    },
    {
      "epoch": 0.45558086560364464,
      "grad_norm": 0.7811765074729919,
      "learning_rate": 2.9865327794919954e-05,
      "loss": 0.9698,
      "step": 200
    },
    {
      "epoch": 0.46697038724373574,
      "grad_norm": 0.6883424520492554,
      "learning_rate": 2.9849797050944517e-05,
      "loss": 0.9668,
      "step": 205
    },
    {
      "epoch": 0.4783599088838269,
      "grad_norm": 0.7067654728889465,
      "learning_rate": 2.9833423465103763e-05,
      "loss": 1.0072,
      "step": 210
    },
    {
      "epoch": 0.489749430523918,
      "grad_norm": 0.7656168937683105,
      "learning_rate": 2.981620796672647e-05,
      "loss": 0.9862,
      "step": 215
    },
    {
      "epoch": 0.5011389521640092,
      "grad_norm": 0.7423420548439026,
      "learning_rate": 2.9798151532926504e-05,
      "loss": 0.9068,
      "step": 220
    },
    {
      "epoch": 0.5125284738041003,
      "grad_norm": 0.7688584923744202,
      "learning_rate": 2.977925518854739e-05,
      "loss": 0.992,
      "step": 225
    },
    {
      "epoch": 0.5239179954441914,
      "grad_norm": 0.817163348197937,
      "learning_rate": 2.9759520006104146e-05,
      "loss": 0.952,
      "step": 230
    },
    {
      "epoch": 0.5353075170842825,
      "grad_norm": 0.8287720680236816,
      "learning_rate": 2.9738947105722373e-05,
      "loss": 0.9268,
      "step": 235
    },
    {
      "epoch": 0.5466970387243736,
      "grad_norm": 0.8259539008140564,
      "learning_rate": 2.9717537655074708e-05,
      "loss": 0.8883,
      "step": 240
    },
    {
      "epoch": 0.5580865603644647,
      "grad_norm": 0.7533463835716248,
      "learning_rate": 2.969529286931455e-05,
      "loss": 0.9673,
      "step": 245
    },
    {
      "epoch": 0.5694760820045558,
      "grad_norm": 0.8228470087051392,
      "learning_rate": 2.9672214011007087e-05,
      "loss": 0.9552,
      "step": 250
    },
    {
      "epoch": 0.5808656036446469,
      "grad_norm": 0.858070433139801,
      "learning_rate": 2.964830239005762e-05,
      "loss": 0.8839,
      "step": 255
    },
    {
      "epoch": 0.592255125284738,
      "grad_norm": 0.9179785847663879,
      "learning_rate": 2.9623559363637234e-05,
      "loss": 0.8456,
      "step": 260
    },
    {
      "epoch": 0.6036446469248291,
      "grad_norm": 0.9425612092018127,
      "learning_rate": 2.9597986336105776e-05,
      "loss": 0.9096,
      "step": 265
    },
    {
      "epoch": 0.6150341685649203,
      "grad_norm": 0.7805739045143127,
      "learning_rate": 2.9571584758932108e-05,
      "loss": 0.8979,
      "step": 270
    },
    {
      "epoch": 0.6264236902050114,
      "grad_norm": 0.8105698823928833,
      "learning_rate": 2.954435613061177e-05,
      "loss": 0.8979,
      "step": 275
    },
    {
      "epoch": 0.6378132118451025,
      "grad_norm": 0.8522108793258667,
      "learning_rate": 2.9516301996581906e-05,
      "loss": 0.8944,
      "step": 280
    },
    {
      "epoch": 0.6492027334851936,
      "grad_norm": 0.8105028867721558,
      "learning_rate": 2.948742394913354e-05,
      "loss": 0.8501,
      "step": 285
    },
    {
      "epoch": 0.6605922551252847,
      "grad_norm": 1.0309525728225708,
      "learning_rate": 2.945772362732122e-05,
      "loss": 0.8477,
      "step": 290
    },
    {
      "epoch": 0.6719817767653758,
      "grad_norm": 0.8388428092002869,
      "learning_rate": 2.9427202716869988e-05,
      "loss": 0.8436,
      "step": 295
    },
    {
      "epoch": 0.683371298405467,
      "grad_norm": 0.9071732759475708,
      "learning_rate": 2.9395862950079672e-05,
      "loss": 0.7864,
      "step": 300
    },
    {
      "epoch": 0.6947608200455581,
      "grad_norm": 0.8124439120292664,
      "learning_rate": 2.9363706105726613e-05,
      "loss": 0.8854,
      "step": 305
    },
    {
      "epoch": 0.7061503416856492,
      "grad_norm": 0.8722319006919861,
      "learning_rate": 2.9330734008962666e-05,
      "loss": 0.8301,
      "step": 310
    },
    {
      "epoch": 0.7175398633257403,
      "grad_norm": 1.59921133518219,
      "learning_rate": 2.9296948531211626e-05,
      "loss": 0.8504,
      "step": 315
    },
    {
      "epoch": 0.7289293849658315,
      "grad_norm": 0.8519591093063354,
      "learning_rate": 2.9262351590063004e-05,
      "loss": 0.7793,
      "step": 320
    },
    {
      "epoch": 0.7403189066059226,
      "grad_norm": 0.8795806765556335,
      "learning_rate": 2.9226945149163193e-05,
      "loss": 0.8116,
      "step": 325
    },
    {
      "epoch": 0.7517084282460137,
      "grad_norm": 0.9364266991615295,
      "learning_rate": 2.9190731218104018e-05,
      "loss": 0.8115,
      "step": 330
    },
    {
      "epoch": 0.7630979498861048,
      "grad_norm": 0.8541845679283142,
      "learning_rate": 2.915371185230866e-05,
      "loss": 0.8512,
      "step": 335
    },
    {
      "epoch": 0.7744874715261959,
      "grad_norm": 0.9761788249015808,
      "learning_rate": 2.9115889152915017e-05,
      "loss": 0.7849,
      "step": 340
    },
    {
      "epoch": 0.785876993166287,
      "grad_norm": 0.9925262331962585,
      "learning_rate": 2.9077265266656436e-05,
      "loss": 0.7846,
      "step": 345
    },
    {
      "epoch": 0.7972665148063781,
      "grad_norm": 0.9060655832290649,
      "learning_rate": 2.9037842385739875e-05,
      "loss": 0.803,
      "step": 350
    },
    {
      "epoch": 0.8086560364464692,
      "grad_norm": 1.0514848232269287,
      "learning_rate": 2.8997622747721457e-05,
      "loss": 0.8081,
      "step": 355
    },
    {
      "epoch": 0.8200455580865603,
      "grad_norm": 0.9304701089859009,
      "learning_rate": 2.8956608635379504e-05,
      "loss": 0.7955,
      "step": 360
    },
    {
      "epoch": 0.8314350797266514,
      "grad_norm": 0.8657699823379517,
      "learning_rate": 2.8914802376584958e-05,
      "loss": 0.8052,
      "step": 365
    },
    {
      "epoch": 0.8428246013667426,
      "grad_norm": 0.9663517475128174,
      "learning_rate": 2.8872206344169244e-05,
      "loss": 0.7788,
      "step": 370
    },
    {
      "epoch": 0.8542141230068337,
      "grad_norm": 0.9335425496101379,
      "learning_rate": 2.8828822955789597e-05,
      "loss": 0.7237,
      "step": 375
    },
    {
      "epoch": 0.8656036446469249,
      "grad_norm": 0.9053182005882263,
      "learning_rate": 2.8784654673791867e-05,
      "loss": 0.7874,
      "step": 380
    },
    {
      "epoch": 0.876993166287016,
      "grad_norm": 1.0356011390686035,
      "learning_rate": 2.873970400507073e-05,
      "loss": 0.804,
      "step": 385
    },
    {
      "epoch": 0.8883826879271071,
      "grad_norm": 1.020293116569519,
      "learning_rate": 2.8693973500927407e-05,
      "loss": 0.7587,
      "step": 390
    },
    {
      "epoch": 0.8997722095671982,
      "grad_norm": 0.9958662986755371,
      "learning_rate": 2.864746575692489e-05,
      "loss": 0.7968,
      "step": 395
    },
    {
      "epoch": 0.9111617312072893,
      "grad_norm": 1.0016448497772217,
      "learning_rate": 2.8600183412740562e-05,
      "loss": 0.7648,
      "step": 400
    },
    {
      "epoch": 0.9225512528473804,
      "grad_norm": 0.9963740706443787,
      "learning_rate": 2.8552129152016444e-05,
      "loss": 0.7726,
      "step": 405
    },
    {
      "epoch": 0.9339407744874715,
      "grad_norm": 0.92083340883255,
      "learning_rate": 2.850330570220683e-05,
      "loss": 0.7348,
      "step": 410
    },
    {
      "epoch": 0.9453302961275627,
      "grad_norm": 1.0512831211090088,
      "learning_rate": 2.845371583442349e-05,
      "loss": 0.8,
      "step": 415
    },
    {
      "epoch": 0.9567198177676538,
      "grad_norm": 1.076445460319519,
      "learning_rate": 2.8403362363278407e-05,
      "loss": 0.7718,
      "step": 420
    },
    {
      "epoch": 0.9681093394077449,
      "grad_norm": 0.9659481644630432,
      "learning_rate": 2.835224814672401e-05,
      "loss": 0.7325,
      "step": 425
    },
    {
      "epoch": 0.979498861047836,
      "grad_norm": 1.0408178567886353,
      "learning_rate": 2.8300376085890966e-05,
      "loss": 0.7066,
      "step": 430
    },
    {
      "epoch": 0.9908883826879271,
      "grad_norm": 1.320637583732605,
      "learning_rate": 2.824774912492352e-05,
      "loss": 0.6915,
      "step": 435
    }
  ],
  "logging_steps": 5,
  "max_steps": 2195,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 6.281137357177487e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}