| { | |
| "best_global_step": 250, | |
| "best_metric": 0.4610269367694855, | |
| "best_model_checkpoint": "/root/autodl-tmp/model/lora-textui/stage1_cap_func/checkpoint-250", | |
| "epoch": 2.4482338611449452, | |
| "eval_steps": 25, | |
| "global_step": 250, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0097442143727162, | |
| "grad_norm": 1.4819260835647583, | |
| "learning_rate": 0.0, | |
| "loss": 0.9971, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0194884287454324, | |
| "grad_norm": 1.1177597045898438, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 0.8357, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.029232643118148598, | |
| "grad_norm": 1.5783437490463257, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 1.0968, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.0389768574908648, | |
| "grad_norm": 1.3322019577026367, | |
| "learning_rate": 1.2e-05, | |
| "loss": 0.9436, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.048721071863580996, | |
| "grad_norm": 1.1600998640060425, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 0.8405, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.058465286236297195, | |
| "grad_norm": 3.9179673194885254, | |
| "learning_rate": 2e-05, | |
| "loss": 0.8719, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.0682095006090134, | |
| "grad_norm": 0.7671970725059509, | |
| "learning_rate": 2.4e-05, | |
| "loss": 0.7487, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.0779537149817296, | |
| "grad_norm": 0.6959292888641357, | |
| "learning_rate": 2.8000000000000003e-05, | |
| "loss": 0.8104, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.0876979293544458, | |
| "grad_norm": 0.4416683316230774, | |
| "learning_rate": 3.2000000000000005e-05, | |
| "loss": 0.7356, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.09744214372716199, | |
| "grad_norm": 0.48265624046325684, | |
| "learning_rate": 3.6e-05, | |
| "loss": 0.7642, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.1071863580998782, | |
| "grad_norm": 0.45329996943473816, | |
| "learning_rate": 4e-05, | |
| "loss": 0.734, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.11693057247259439, | |
| "grad_norm": 0.506803572177887, | |
| "learning_rate": 4.4000000000000006e-05, | |
| "loss": 0.7155, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.12667478684531058, | |
| "grad_norm": 0.5821135640144348, | |
| "learning_rate": 4.8e-05, | |
| "loss": 0.7207, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.1364190012180268, | |
| "grad_norm": 0.6181040406227112, | |
| "learning_rate": 5.2000000000000004e-05, | |
| "loss": 0.7882, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.146163215590743, | |
| "grad_norm": 0.47155848145484924, | |
| "learning_rate": 5.6000000000000006e-05, | |
| "loss": 0.7188, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.1559074299634592, | |
| "grad_norm": 0.3960055112838745, | |
| "learning_rate": 6e-05, | |
| "loss": 0.7002, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.1656516443361754, | |
| "grad_norm": 0.4034963846206665, | |
| "learning_rate": 6.400000000000001e-05, | |
| "loss": 0.7174, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.1753958587088916, | |
| "grad_norm": 0.408160924911499, | |
| "learning_rate": 6.800000000000001e-05, | |
| "loss": 0.7085, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.1851400730816078, | |
| "grad_norm": 0.4035159647464752, | |
| "learning_rate": 7.2e-05, | |
| "loss": 0.788, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.19488428745432398, | |
| "grad_norm": 0.38708168268203735, | |
| "learning_rate": 7.6e-05, | |
| "loss": 0.6491, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.2046285018270402, | |
| "grad_norm": 0.42144861817359924, | |
| "learning_rate": 8e-05, | |
| "loss": 0.6749, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.2143727161997564, | |
| "grad_norm": 0.3642968237400055, | |
| "learning_rate": 8.4e-05, | |
| "loss": 0.6431, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.2241169305724726, | |
| "grad_norm": 0.4132453203201294, | |
| "learning_rate": 8.800000000000001e-05, | |
| "loss": 0.6089, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.23386114494518878, | |
| "grad_norm": 0.4148790240287781, | |
| "learning_rate": 9.200000000000001e-05, | |
| "loss": 0.605, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.243605359317905, | |
| "grad_norm": 0.46465402841567993, | |
| "learning_rate": 9.6e-05, | |
| "loss": 0.6383, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.243605359317905, | |
| "eval_loss": 0.6978875398635864, | |
| "eval_runtime": 116.3355, | |
| "eval_samples_per_second": 7.367, | |
| "eval_steps_per_second": 0.464, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.25334957369062117, | |
| "grad_norm": 0.38448578119277954, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6573, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.2630937880633374, | |
| "grad_norm": 0.386096715927124, | |
| "learning_rate": 9.999512620046522e-05, | |
| "loss": 0.6503, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.2728380024360536, | |
| "grad_norm": 0.4173428416252136, | |
| "learning_rate": 9.998050575201771e-05, | |
| "loss": 0.6056, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.28258221680876977, | |
| "grad_norm": 0.3838996887207031, | |
| "learning_rate": 9.995614150494293e-05, | |
| "loss": 0.6291, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.292326431181486, | |
| "grad_norm": 0.37374845147132874, | |
| "learning_rate": 9.992203820909906e-05, | |
| "loss": 0.5929, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.3020706455542022, | |
| "grad_norm": 0.38990911841392517, | |
| "learning_rate": 9.987820251299122e-05, | |
| "loss": 0.6166, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.3118148599269184, | |
| "grad_norm": 0.3717946410179138, | |
| "learning_rate": 9.982464296247522e-05, | |
| "loss": 0.631, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.3215590742996346, | |
| "grad_norm": 0.38515135645866394, | |
| "learning_rate": 9.976136999909156e-05, | |
| "loss": 0.6309, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.3313032886723508, | |
| "grad_norm": 0.4617915749549866, | |
| "learning_rate": 9.968839595802982e-05, | |
| "loss": 0.6514, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.341047503045067, | |
| "grad_norm": 0.36573508381843567, | |
| "learning_rate": 9.96057350657239e-05, | |
| "loss": 0.5839, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.3507917174177832, | |
| "grad_norm": 0.38394173979759216, | |
| "learning_rate": 9.951340343707852e-05, | |
| "loss": 0.5927, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.36053593179049936, | |
| "grad_norm": 0.41442376375198364, | |
| "learning_rate": 9.941141907232765e-05, | |
| "loss": 0.6388, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.3702801461632156, | |
| "grad_norm": 0.3883739709854126, | |
| "learning_rate": 9.929980185352526e-05, | |
| "loss": 0.6241, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.3800243605359318, | |
| "grad_norm": 0.3784323036670685, | |
| "learning_rate": 9.917857354066931e-05, | |
| "loss": 0.5968, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.38976857490864797, | |
| "grad_norm": 0.38477954268455505, | |
| "learning_rate": 9.904775776745958e-05, | |
| "loss": 0.6051, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.3995127892813642, | |
| "grad_norm": 0.42043524980545044, | |
| "learning_rate": 9.890738003669029e-05, | |
| "loss": 0.6269, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.4092570036540804, | |
| "grad_norm": 0.4155956208705902, | |
| "learning_rate": 9.875746771527816e-05, | |
| "loss": 0.6137, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.4190012180267966, | |
| "grad_norm": 0.39818403124809265, | |
| "learning_rate": 9.859805002892732e-05, | |
| "loss": 0.5831, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.4287454323995128, | |
| "grad_norm": 0.4027486741542816, | |
| "learning_rate": 9.842915805643155e-05, | |
| "loss": 0.5671, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.438489646772229, | |
| "grad_norm": 0.3745126724243164, | |
| "learning_rate": 9.825082472361557e-05, | |
| "loss": 0.5605, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.4482338611449452, | |
| "grad_norm": 0.4132031798362732, | |
| "learning_rate": 9.806308479691595e-05, | |
| "loss": 0.6192, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.4579780755176614, | |
| "grad_norm": 0.42464327812194824, | |
| "learning_rate": 9.786597487660337e-05, | |
| "loss": 0.5856, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.46772228989037756, | |
| "grad_norm": 0.42480695247650146, | |
| "learning_rate": 9.765953338964735e-05, | |
| "loss": 0.6433, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.4774665042630938, | |
| "grad_norm": 0.4505927562713623, | |
| "learning_rate": 9.744380058222483e-05, | |
| "loss": 0.6637, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.48721071863581, | |
| "grad_norm": 0.40766066312789917, | |
| "learning_rate": 9.721881851187406e-05, | |
| "loss": 0.5722, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.48721071863581, | |
| "eval_loss": 0.6059486865997314, | |
| "eval_runtime": 116.3481, | |
| "eval_samples_per_second": 7.366, | |
| "eval_steps_per_second": 0.464, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.49695493300852617, | |
| "grad_norm": 0.45385217666625977, | |
| "learning_rate": 9.698463103929542e-05, | |
| "loss": 0.6223, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.5066991473812423, | |
| "grad_norm": 0.4196796417236328, | |
| "learning_rate": 9.674128381980072e-05, | |
| "loss": 0.572, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.5164433617539586, | |
| "grad_norm": 0.4284449517726898, | |
| "learning_rate": 9.648882429441257e-05, | |
| "loss": 0.5225, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.5261875761266748, | |
| "grad_norm": 0.45584678649902344, | |
| "learning_rate": 9.622730168061567e-05, | |
| "loss": 0.5592, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.535931790499391, | |
| "grad_norm": 0.4218933582305908, | |
| "learning_rate": 9.595676696276172e-05, | |
| "loss": 0.5857, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.5456760048721072, | |
| "grad_norm": 0.45864835381507874, | |
| "learning_rate": 9.567727288213005e-05, | |
| "loss": 0.5908, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.5554202192448234, | |
| "grad_norm": 0.4483995735645294, | |
| "learning_rate": 9.538887392664544e-05, | |
| "loss": 0.5695, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.5651644336175395, | |
| "grad_norm": 0.48954787850379944, | |
| "learning_rate": 9.50916263202557e-05, | |
| "loss": 0.5314, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.5749086479902558, | |
| "grad_norm": 0.515805184841156, | |
| "learning_rate": 9.478558801197065e-05, | |
| "loss": 0.6366, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.584652862362972, | |
| "grad_norm": 0.4741595387458801, | |
| "learning_rate": 9.447081866456489e-05, | |
| "loss": 0.4933, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.5943970767356882, | |
| "grad_norm": 0.4829859137535095, | |
| "learning_rate": 9.414737964294636e-05, | |
| "loss": 0.6337, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.6041412911084044, | |
| "grad_norm": 0.45033347606658936, | |
| "learning_rate": 9.381533400219318e-05, | |
| "loss": 0.5714, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.6138855054811205, | |
| "grad_norm": 0.4400843381881714, | |
| "learning_rate": 9.347474647526095e-05, | |
| "loss": 0.5756, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.6236297198538368, | |
| "grad_norm": 0.45498228073120117, | |
| "learning_rate": 9.312568346036288e-05, | |
| "loss": 0.5635, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.633373934226553, | |
| "grad_norm": 0.4397442936897278, | |
| "learning_rate": 9.276821300802534e-05, | |
| "loss": 0.5508, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.6431181485992692, | |
| "grad_norm": 0.465465247631073, | |
| "learning_rate": 9.24024048078213e-05, | |
| "loss": 0.5118, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.6528623629719854, | |
| "grad_norm": 0.4904666244983673, | |
| "learning_rate": 9.202833017478422e-05, | |
| "loss": 0.5555, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.6626065773447016, | |
| "grad_norm": 0.4522901177406311, | |
| "learning_rate": 9.164606203550497e-05, | |
| "loss": 0.5712, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.6723507917174177, | |
| "grad_norm": 0.46516191959381104, | |
| "learning_rate": 9.125567491391476e-05, | |
| "loss": 0.54, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.682095006090134, | |
| "grad_norm": 0.563292384147644, | |
| "learning_rate": 9.085724491675642e-05, | |
| "loss": 0.5334, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.6918392204628502, | |
| "grad_norm": 0.4758321940898895, | |
| "learning_rate": 9.045084971874738e-05, | |
| "loss": 0.558, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.7015834348355664, | |
| "grad_norm": 0.4505232870578766, | |
| "learning_rate": 9.003656854743667e-05, | |
| "loss": 0.5592, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.7113276492082826, | |
| "grad_norm": 0.49200811982154846, | |
| "learning_rate": 8.961448216775954e-05, | |
| "loss": 0.562, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.7210718635809987, | |
| "grad_norm": 0.49411916732788086, | |
| "learning_rate": 8.9184672866292e-05, | |
| "loss": 0.5502, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.730816077953715, | |
| "grad_norm": 0.49506062269210815, | |
| "learning_rate": 8.874722443520899e-05, | |
| "loss": 0.5268, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.730816077953715, | |
| "eval_loss": 0.5484524369239807, | |
| "eval_runtime": 116.2855, | |
| "eval_samples_per_second": 7.37, | |
| "eval_steps_per_second": 0.464, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.7405602923264312, | |
| "grad_norm": 0.4459410607814789, | |
| "learning_rate": 8.83022221559489e-05, | |
| "loss": 0.5651, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.7503045066991474, | |
| "grad_norm": 0.4649423360824585, | |
| "learning_rate": 8.784975278258783e-05, | |
| "loss": 0.507, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.7600487210718636, | |
| "grad_norm": 0.5267335176467896, | |
| "learning_rate": 8.73899045249266e-05, | |
| "loss": 0.5846, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.7697929354445798, | |
| "grad_norm": 0.48191651701927185, | |
| "learning_rate": 8.692276703129421e-05, | |
| "loss": 0.4785, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.7795371498172959, | |
| "grad_norm": 0.4713154137134552, | |
| "learning_rate": 8.644843137107059e-05, | |
| "loss": 0.5115, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.7892813641900122, | |
| "grad_norm": 0.48336002230644226, | |
| "learning_rate": 8.596699001693255e-05, | |
| "loss": 0.4972, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.7990255785627284, | |
| "grad_norm": 0.5115373730659485, | |
| "learning_rate": 8.547853682682604e-05, | |
| "loss": 0.5548, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.8087697929354446, | |
| "grad_norm": 0.4678809344768524, | |
| "learning_rate": 8.498316702566828e-05, | |
| "loss": 0.5017, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.8185140073081608, | |
| "grad_norm": 0.46973857283592224, | |
| "learning_rate": 8.44809771867835e-05, | |
| "loss": 0.5304, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.8282582216808769, | |
| "grad_norm": 0.5022630095481873, | |
| "learning_rate": 8.397206521307584e-05, | |
| "loss": 0.4957, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.8380024360535931, | |
| "grad_norm": 0.47750282287597656, | |
| "learning_rate": 8.345653031794292e-05, | |
| "loss": 0.4853, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.8477466504263094, | |
| "grad_norm": 0.48647361993789673, | |
| "learning_rate": 8.293447300593402e-05, | |
| "loss": 0.499, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.8574908647990256, | |
| "grad_norm": 0.5240001082420349, | |
| "learning_rate": 8.240599505315655e-05, | |
| "loss": 0.5061, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.8672350791717418, | |
| "grad_norm": 0.4904966354370117, | |
| "learning_rate": 8.18711994874345e-05, | |
| "loss": 0.5139, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.876979293544458, | |
| "grad_norm": 0.5042212605476379, | |
| "learning_rate": 8.133019056822304e-05, | |
| "loss": 0.5355, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.8867235079171741, | |
| "grad_norm": 0.4775819778442383, | |
| "learning_rate": 8.07830737662829e-05, | |
| "loss": 0.4889, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.8964677222898904, | |
| "grad_norm": 0.5047743320465088, | |
| "learning_rate": 8.022995574311876e-05, | |
| "loss": 0.4788, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.9062119366626066, | |
| "grad_norm": 0.4905475676059723, | |
| "learning_rate": 7.967094433018508e-05, | |
| "loss": 0.5104, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.9159561510353228, | |
| "grad_norm": 0.49578583240509033, | |
| "learning_rate": 7.910614850786448e-05, | |
| "loss": 0.4903, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.925700365408039, | |
| "grad_norm": 0.5329849123954773, | |
| "learning_rate": 7.85356783842216e-05, | |
| "loss": 0.4926, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.9354445797807551, | |
| "grad_norm": 0.5267957448959351, | |
| "learning_rate": 7.795964517353735e-05, | |
| "loss": 0.5514, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.9451887941534713, | |
| "grad_norm": 0.5095996856689453, | |
| "learning_rate": 7.737816117462752e-05, | |
| "loss": 0.4886, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.9549330085261876, | |
| "grad_norm": 0.46476784348487854, | |
| "learning_rate": 7.679133974894983e-05, | |
| "loss": 0.4776, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.9646772228989038, | |
| "grad_norm": 0.516237735748291, | |
| "learning_rate": 7.619929529850397e-05, | |
| "loss": 0.5343, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.97442143727162, | |
| "grad_norm": 0.5128530859947205, | |
| "learning_rate": 7.560214324352858e-05, | |
| "loss": 0.492, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.97442143727162, | |
| "eval_loss": 0.5159465074539185, | |
| "eval_runtime": 116.1788, | |
| "eval_samples_per_second": 7.377, | |
| "eval_steps_per_second": 0.465, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.9841656516443362, | |
| "grad_norm": 0.525909960269928, | |
| "learning_rate": 7.500000000000001e-05, | |
| "loss": 0.5276, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.9939098660170523, | |
| "grad_norm": 0.4755876958370209, | |
| "learning_rate": 7.439298295693665e-05, | |
| "loss": 0.4887, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 1.0097442143727162, | |
| "grad_norm": 0.898318350315094, | |
| "learning_rate": 7.378121045351378e-05, | |
| "loss": 0.896, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 1.0194884287454324, | |
| "grad_norm": 0.5060648918151855, | |
| "learning_rate": 7.316480175599309e-05, | |
| "loss": 0.4999, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 1.0292326431181487, | |
| "grad_norm": 0.46954262256622314, | |
| "learning_rate": 7.254387703447154e-05, | |
| "loss": 0.4633, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 1.0389768574908649, | |
| "grad_norm": 0.4725915193557739, | |
| "learning_rate": 7.191855733945387e-05, | |
| "loss": 0.4491, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 1.048721071863581, | |
| "grad_norm": 0.4837670624256134, | |
| "learning_rate": 7.128896457825364e-05, | |
| "loss": 0.4508, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 1.058465286236297, | |
| "grad_norm": 0.5884471535682678, | |
| "learning_rate": 7.06552214912271e-05, | |
| "loss": 0.4297, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 1.0682095006090133, | |
| "grad_norm": 0.5307117104530334, | |
| "learning_rate": 7.001745162784477e-05, | |
| "loss": 0.4592, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 1.0779537149817295, | |
| "grad_norm": 0.5245863795280457, | |
| "learning_rate": 6.937577932260515e-05, | |
| "loss": 0.4819, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.0876979293544458, | |
| "grad_norm": 0.5525639653205872, | |
| "learning_rate": 6.873032967079561e-05, | |
| "loss": 0.4678, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 1.097442143727162, | |
| "grad_norm": 0.5659409761428833, | |
| "learning_rate": 6.808122850410461e-05, | |
| "loss": 0.4635, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 1.1071863580998782, | |
| "grad_norm": 0.5493988394737244, | |
| "learning_rate": 6.742860236609077e-05, | |
| "loss": 0.4307, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 1.1169305724725944, | |
| "grad_norm": 0.5591247081756592, | |
| "learning_rate": 6.677257848751277e-05, | |
| "loss": 0.416, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 1.1266747868453106, | |
| "grad_norm": 0.4955357611179352, | |
| "learning_rate": 6.611328476152557e-05, | |
| "loss": 0.4662, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 1.1364190012180269, | |
| "grad_norm": 0.5241750478744507, | |
| "learning_rate": 6.545084971874738e-05, | |
| "loss": 0.4301, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 1.146163215590743, | |
| "grad_norm": 0.4876319468021393, | |
| "learning_rate": 6.478540250220234e-05, | |
| "loss": 0.4564, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 1.155907429963459, | |
| "grad_norm": 0.5395438075065613, | |
| "learning_rate": 6.411707284214384e-05, | |
| "loss": 0.502, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 1.1656516443361755, | |
| "grad_norm": 0.5435388684272766, | |
| "learning_rate": 6.344599103076329e-05, | |
| "loss": 0.4536, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 1.1753958587088915, | |
| "grad_norm": 0.526055097579956, | |
| "learning_rate": 6.277228789678953e-05, | |
| "loss": 0.4576, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.1851400730816077, | |
| "grad_norm": 0.5060620903968811, | |
| "learning_rate": 6.209609477998338e-05, | |
| "loss": 0.4814, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 1.194884287454324, | |
| "grad_norm": 0.4862349033355713, | |
| "learning_rate": 6.141754350553279e-05, | |
| "loss": 0.4277, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 1.2046285018270402, | |
| "grad_norm": 0.5572500824928284, | |
| "learning_rate": 6.073676635835317e-05, | |
| "loss": 0.5118, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 1.2143727161997564, | |
| "grad_norm": 0.5726194381713867, | |
| "learning_rate": 6.005389605729824e-05, | |
| "loss": 0.4725, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 1.2241169305724726, | |
| "grad_norm": 0.5314717888832092, | |
| "learning_rate": 5.9369065729286245e-05, | |
| "loss": 0.4719, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.2241169305724726, | |
| "eval_loss": 0.4989548325538635, | |
| "eval_runtime": 116.3548, | |
| "eval_samples_per_second": 7.365, | |
| "eval_steps_per_second": 0.464, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.2338611449451888, | |
| "grad_norm": 0.5402824878692627, | |
| "learning_rate": 5.868240888334653e-05, | |
| "loss": 0.4735, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 1.243605359317905, | |
| "grad_norm": 0.5422326922416687, | |
| "learning_rate": 5.799405938459175e-05, | |
| "loss": 0.4729, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 1.2533495736906213, | |
| "grad_norm": 0.5375432968139648, | |
| "learning_rate": 5.730415142812059e-05, | |
| "loss": 0.4589, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 1.2630937880633373, | |
| "grad_norm": 0.5534482002258301, | |
| "learning_rate": 5.661281951285613e-05, | |
| "loss": 0.4464, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 1.2728380024360537, | |
| "grad_norm": 0.5432469844818115, | |
| "learning_rate": 5.5920198415325064e-05, | |
| "loss": 0.4537, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.2825822168087697, | |
| "grad_norm": 0.5186154246330261, | |
| "learning_rate": 5.522642316338268e-05, | |
| "loss": 0.4158, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 1.292326431181486, | |
| "grad_norm": 0.5382589101791382, | |
| "learning_rate": 5.453162900988902e-05, | |
| "loss": 0.4176, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 1.3020706455542022, | |
| "grad_norm": 0.5565094351768494, | |
| "learning_rate": 5.383595140634093e-05, | |
| "loss": 0.4287, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 1.3118148599269184, | |
| "grad_norm": 0.5403843522071838, | |
| "learning_rate": 5.313952597646568e-05, | |
| "loss": 0.4118, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 1.3215590742996346, | |
| "grad_norm": 0.5338913798332214, | |
| "learning_rate": 5.244248848978067e-05, | |
| "loss": 0.3948, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 1.3313032886723508, | |
| "grad_norm": 0.5245387554168701, | |
| "learning_rate": 5.174497483512506e-05, | |
| "loss": 0.4579, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 1.341047503045067, | |
| "grad_norm": 0.51710045337677, | |
| "learning_rate": 5.104712099416785e-05, | |
| "loss": 0.4025, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 1.3507917174177833, | |
| "grad_norm": 0.56063312292099, | |
| "learning_rate": 5.034906301489808e-05, | |
| "loss": 0.4956, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 1.3605359317904995, | |
| "grad_norm": 0.6041215658187866, | |
| "learning_rate": 4.965093698510193e-05, | |
| "loss": 0.4336, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 1.3702801461632155, | |
| "grad_norm": 0.5541957020759583, | |
| "learning_rate": 4.895287900583216e-05, | |
| "loss": 0.4456, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.380024360535932, | |
| "grad_norm": 0.5618797540664673, | |
| "learning_rate": 4.825502516487497e-05, | |
| "loss": 0.3996, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 1.389768574908648, | |
| "grad_norm": 0.533968448638916, | |
| "learning_rate": 4.755751151021934e-05, | |
| "loss": 0.4155, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 1.3995127892813641, | |
| "grad_norm": 0.5411080121994019, | |
| "learning_rate": 4.6860474023534335e-05, | |
| "loss": 0.4325, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 1.4092570036540804, | |
| "grad_norm": 0.5761292576789856, | |
| "learning_rate": 4.616404859365907e-05, | |
| "loss": 0.4166, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 1.4190012180267966, | |
| "grad_norm": 0.5804659128189087, | |
| "learning_rate": 4.5468370990111006e-05, | |
| "loss": 0.4436, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 1.4287454323995128, | |
| "grad_norm": 0.5537294745445251, | |
| "learning_rate": 4.477357683661734e-05, | |
| "loss": 0.3795, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 1.438489646772229, | |
| "grad_norm": 0.5477214455604553, | |
| "learning_rate": 4.407980158467495e-05, | |
| "loss": 0.3896, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 1.4482338611449452, | |
| "grad_norm": 0.5110722780227661, | |
| "learning_rate": 4.3387180487143876e-05, | |
| "loss": 0.4036, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 1.4579780755176615, | |
| "grad_norm": 0.5480209589004517, | |
| "learning_rate": 4.269584857187943e-05, | |
| "loss": 0.4451, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 1.4677222898903777, | |
| "grad_norm": 0.5255818367004395, | |
| "learning_rate": 4.2005940615408264e-05, | |
| "loss": 0.4031, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.4677222898903777, | |
| "eval_loss": 0.4808570146560669, | |
| "eval_runtime": 116.3103, | |
| "eval_samples_per_second": 7.368, | |
| "eval_steps_per_second": 0.464, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.4774665042630937, | |
| "grad_norm": 0.5485397577285767, | |
| "learning_rate": 4.131759111665349e-05, | |
| "loss": 0.462, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 1.4872107186358101, | |
| "grad_norm": 0.5463838577270508, | |
| "learning_rate": 4.063093427071376e-05, | |
| "loss": 0.4265, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 1.4969549330085261, | |
| "grad_norm": 0.5511963367462158, | |
| "learning_rate": 3.9946103942701777e-05, | |
| "loss": 0.3964, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 1.5066991473812423, | |
| "grad_norm": 0.5571511387825012, | |
| "learning_rate": 3.926323364164684e-05, | |
| "loss": 0.4642, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 1.5164433617539586, | |
| "grad_norm": 0.5894230604171753, | |
| "learning_rate": 3.858245649446721e-05, | |
| "loss": 0.4511, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 1.5261875761266748, | |
| "grad_norm": 0.5373761653900146, | |
| "learning_rate": 3.790390522001662e-05, | |
| "loss": 0.388, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 1.535931790499391, | |
| "grad_norm": 0.5737512707710266, | |
| "learning_rate": 3.7227712103210486e-05, | |
| "loss": 0.375, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 1.5456760048721072, | |
| "grad_norm": 0.5502772331237793, | |
| "learning_rate": 3.655400896923672e-05, | |
| "loss": 0.4433, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 1.5554202192448234, | |
| "grad_norm": 0.5434299111366272, | |
| "learning_rate": 3.588292715785617e-05, | |
| "loss": 0.4002, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 1.5651644336175394, | |
| "grad_norm": 0.5924922227859497, | |
| "learning_rate": 3.5214597497797684e-05, | |
| "loss": 0.4141, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.5749086479902559, | |
| "grad_norm": 0.6273884177207947, | |
| "learning_rate": 3.4549150281252636e-05, | |
| "loss": 0.4277, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 1.5846528623629719, | |
| "grad_norm": 0.578081488609314, | |
| "learning_rate": 3.388671523847445e-05, | |
| "loss": 0.3956, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 1.5943970767356883, | |
| "grad_norm": 0.5401508212089539, | |
| "learning_rate": 3.322742151248725e-05, | |
| "loss": 0.4305, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 1.6041412911084043, | |
| "grad_norm": 0.6144226789474487, | |
| "learning_rate": 3.257139763390925e-05, | |
| "loss": 0.4126, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 1.6138855054811205, | |
| "grad_norm": 0.5739960670471191, | |
| "learning_rate": 3.1918771495895396e-05, | |
| "loss": 0.404, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 1.6236297198538368, | |
| "grad_norm": 0.5783050060272217, | |
| "learning_rate": 3.12696703292044e-05, | |
| "loss": 0.4209, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 1.633373934226553, | |
| "grad_norm": 0.5928875207901001, | |
| "learning_rate": 3.062422067739485e-05, | |
| "loss": 0.4317, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 1.6431181485992692, | |
| "grad_norm": 0.5249680876731873, | |
| "learning_rate": 2.9982548372155263e-05, | |
| "loss": 0.3964, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 1.6528623629719854, | |
| "grad_norm": 0.5612180233001709, | |
| "learning_rate": 2.934477850877292e-05, | |
| "loss": 0.3919, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 1.6626065773447016, | |
| "grad_norm": 0.5700953006744385, | |
| "learning_rate": 2.8711035421746367e-05, | |
| "loss": 0.4403, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.6723507917174176, | |
| "grad_norm": 0.5822266936302185, | |
| "learning_rate": 2.8081442660546125e-05, | |
| "loss": 0.4265, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 1.682095006090134, | |
| "grad_norm": 0.5738089680671692, | |
| "learning_rate": 2.7456122965528475e-05, | |
| "loss": 0.4612, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 1.69183922046285, | |
| "grad_norm": 0.5751779079437256, | |
| "learning_rate": 2.6835198244006927e-05, | |
| "loss": 0.3983, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 1.7015834348355665, | |
| "grad_norm": 0.5598529577255249, | |
| "learning_rate": 2.6218789546486234e-05, | |
| "loss": 0.4338, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 1.7113276492082825, | |
| "grad_norm": 0.5587442517280579, | |
| "learning_rate": 2.560701704306336e-05, | |
| "loss": 0.3825, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.7113276492082825, | |
| "eval_loss": 0.47023555636405945, | |
| "eval_runtime": 116.2392, | |
| "eval_samples_per_second": 7.373, | |
| "eval_steps_per_second": 0.465, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.7210718635809987, | |
| "grad_norm": 0.5803806185722351, | |
| "learning_rate": 2.500000000000001e-05, | |
| "loss": 0.4265, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 1.730816077953715, | |
| "grad_norm": 0.5986764430999756, | |
| "learning_rate": 2.4397856756471432e-05, | |
| "loss": 0.4299, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 1.7405602923264312, | |
| "grad_norm": 0.6137004494667053, | |
| "learning_rate": 2.3800704701496053e-05, | |
| "loss": 0.4355, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 1.7503045066991474, | |
| "grad_norm": 0.5777759552001953, | |
| "learning_rate": 2.3208660251050158e-05, | |
| "loss": 0.4235, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 1.7600487210718636, | |
| "grad_norm": 0.599251389503479, | |
| "learning_rate": 2.2621838825372493e-05, | |
| "loss": 0.4078, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.7697929354445798, | |
| "grad_norm": 0.5639720559120178, | |
| "learning_rate": 2.2040354826462668e-05, | |
| "loss": 0.4046, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 1.7795371498172958, | |
| "grad_norm": 0.5639522075653076, | |
| "learning_rate": 2.1464321615778422e-05, | |
| "loss": 0.433, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 1.7892813641900123, | |
| "grad_norm": 0.5366583466529846, | |
| "learning_rate": 2.0893851492135537e-05, | |
| "loss": 0.4318, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 1.7990255785627283, | |
| "grad_norm": 0.5910248160362244, | |
| "learning_rate": 2.0329055669814934e-05, | |
| "loss": 0.4184, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 1.8087697929354447, | |
| "grad_norm": 0.5742807388305664, | |
| "learning_rate": 1.977004425688126e-05, | |
| "loss": 0.4346, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 1.8185140073081607, | |
| "grad_norm": 0.5683897137641907, | |
| "learning_rate": 1.9216926233717085e-05, | |
| "loss": 0.4007, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 1.828258221680877, | |
| "grad_norm": 0.5249460339546204, | |
| "learning_rate": 1.866980943177699e-05, | |
| "loss": 0.4165, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 1.8380024360535931, | |
| "grad_norm": 0.5429449677467346, | |
| "learning_rate": 1.8128800512565513e-05, | |
| "loss": 0.378, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 1.8477466504263094, | |
| "grad_norm": 0.5251733660697937, | |
| "learning_rate": 1.7594004946843456e-05, | |
| "loss": 0.3824, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 1.8574908647990256, | |
| "grad_norm": 0.5763549208641052, | |
| "learning_rate": 1.7065526994065973e-05, | |
| "loss": 0.3788, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.8672350791717418, | |
| "grad_norm": 0.5946308970451355, | |
| "learning_rate": 1.6543469682057106e-05, | |
| "loss": 0.448, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 1.876979293544458, | |
| "grad_norm": 0.5707454085350037, | |
| "learning_rate": 1.602793478692419e-05, | |
| "loss": 0.3816, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 1.886723507917174, | |
| "grad_norm": 0.5635607838630676, | |
| "learning_rate": 1.551902281321651e-05, | |
| "loss": 0.3979, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 1.8964677222898905, | |
| "grad_norm": 1.6317293643951416, | |
| "learning_rate": 1.5016832974331724e-05, | |
| "loss": 0.3654, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 1.9062119366626065, | |
| "grad_norm": 0.5927464365959167, | |
| "learning_rate": 1.4521463173173965e-05, | |
| "loss": 0.4432, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 1.915956151035323, | |
| "grad_norm": 0.5671654343605042, | |
| "learning_rate": 1.4033009983067452e-05, | |
| "loss": 0.364, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 1.925700365408039, | |
| "grad_norm": 0.5950899124145508, | |
| "learning_rate": 1.3551568628929434e-05, | |
| "loss": 0.4442, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 1.9354445797807551, | |
| "grad_norm": 0.5787199139595032, | |
| "learning_rate": 1.3077232968705805e-05, | |
| "loss": 0.4042, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 1.9451887941534713, | |
| "grad_norm": 0.5602390766143799, | |
| "learning_rate": 1.2610095475073414e-05, | |
| "loss": 0.4252, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 1.9549330085261876, | |
| "grad_norm": 0.5912858843803406, | |
| "learning_rate": 1.2150247217412186e-05, | |
| "loss": 0.3944, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.9549330085261876, | |
| "eval_loss": 0.4625219404697418, | |
| "eval_runtime": 116.3528, | |
| "eval_samples_per_second": 7.366, | |
| "eval_steps_per_second": 0.464, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.9646772228989038, | |
| "grad_norm": 0.5247693657875061, | |
| "learning_rate": 1.1697777844051105e-05, | |
| "loss": 0.397, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 1.97442143727162, | |
| "grad_norm": 0.5671912431716919, | |
| "learning_rate": 1.1252775564791024e-05, | |
| "loss": 0.4191, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 1.9841656516443362, | |
| "grad_norm": 0.5699170827865601, | |
| "learning_rate": 1.0815327133708015e-05, | |
| "loss": 0.4126, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 1.9939098660170522, | |
| "grad_norm": 0.5686872005462646, | |
| "learning_rate": 1.0385517832240471e-05, | |
| "loss": 0.3936, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 2.009744214372716, | |
| "grad_norm": 0.9932243824005127, | |
| "learning_rate": 9.963431452563332e-06, | |
| "loss": 0.8769, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 2.0194884287454324, | |
| "grad_norm": 0.5336227416992188, | |
| "learning_rate": 9.549150281252633e-06, | |
| "loss": 0.3756, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 2.0292326431181484, | |
| "grad_norm": 0.5651218891143799, | |
| "learning_rate": 9.142755083243576e-06, | |
| "loss": 0.4197, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 2.038976857490865, | |
| "grad_norm": 0.5759946703910828, | |
| "learning_rate": 8.744325086085248e-06, | |
| "loss": 0.4043, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 2.048721071863581, | |
| "grad_norm": 0.529643177986145, | |
| "learning_rate": 8.353937964495029e-06, | |
| "loss": 0.3806, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 2.0584652862362973, | |
| "grad_norm": 0.563081681728363, | |
| "learning_rate": 7.971669825215788e-06, | |
| "loss": 0.3838, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 2.0682095006090133, | |
| "grad_norm": 0.5327872037887573, | |
| "learning_rate": 7.597595192178702e-06, | |
| "loss": 0.3778, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 2.0779537149817298, | |
| "grad_norm": 0.5525795817375183, | |
| "learning_rate": 7.2317869919746705e-06, | |
| "loss": 0.4119, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 2.0876979293544458, | |
| "grad_norm": 0.5779188871383667, | |
| "learning_rate": 6.874316539637127e-06, | |
| "loss": 0.3804, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 2.097442143727162, | |
| "grad_norm": 0.5842592716217041, | |
| "learning_rate": 6.52525352473905e-06, | |
| "loss": 0.3932, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 2.107186358099878, | |
| "grad_norm": 0.5471872091293335, | |
| "learning_rate": 6.184665997806832e-06, | |
| "loss": 0.3658, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 2.116930572472594, | |
| "grad_norm": 0.5467835664749146, | |
| "learning_rate": 5.852620357053651e-06, | |
| "loss": 0.3683, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 2.1266747868453106, | |
| "grad_norm": 0.5607587099075317, | |
| "learning_rate": 5.529181335435124e-06, | |
| "loss": 0.3635, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 2.1364190012180266, | |
| "grad_norm": 0.5682641267776489, | |
| "learning_rate": 5.214411988029355e-06, | |
| "loss": 0.3929, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 2.146163215590743, | |
| "grad_norm": 0.5493900775909424, | |
| "learning_rate": 4.908373679744316e-06, | |
| "loss": 0.3579, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 2.155907429963459, | |
| "grad_norm": 0.5399561524391174, | |
| "learning_rate": 4.611126073354571e-06, | |
| "loss": 0.397, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 2.1656516443361755, | |
| "grad_norm": 0.566953182220459, | |
| "learning_rate": 4.322727117869951e-06, | |
| "loss": 0.3996, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 2.1753958587088915, | |
| "grad_norm": 0.5458848476409912, | |
| "learning_rate": 4.043233037238281e-06, | |
| "loss": 0.3797, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 2.185140073081608, | |
| "grad_norm": 0.5842291116714478, | |
| "learning_rate": 3.772698319384349e-06, | |
| "loss": 0.4177, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 2.194884287454324, | |
| "grad_norm": 0.5811557769775391, | |
| "learning_rate": 3.511175705587433e-06, | |
| "loss": 0.4057, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 2.2046285018270404, | |
| "grad_norm": 0.5538292527198792, | |
| "learning_rate": 3.258716180199278e-06, | |
| "loss": 0.3411, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 2.2046285018270404, | |
| "eval_loss": 0.461640328168869, | |
| "eval_runtime": 116.2914, | |
| "eval_samples_per_second": 7.369, | |
| "eval_steps_per_second": 0.464, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 2.2143727161997564, | |
| "grad_norm": 0.5587625503540039, | |
| "learning_rate": 3.0153689607045845e-06, | |
| "loss": 0.3767, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 2.2241169305724724, | |
| "grad_norm": 0.6179119348526001, | |
| "learning_rate": 2.7811814881259503e-06, | |
| "loss": 0.3909, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 2.233861144945189, | |
| "grad_norm": 0.5846551060676575, | |
| "learning_rate": 2.5561994177751737e-06, | |
| "loss": 0.377, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 2.243605359317905, | |
| "grad_norm": 0.5826045274734497, | |
| "learning_rate": 2.340466610352654e-06, | |
| "loss": 0.3621, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 2.2533495736906213, | |
| "grad_norm": 0.5827783942222595, | |
| "learning_rate": 2.134025123396638e-06, | |
| "loss": 0.4011, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 2.2630937880633373, | |
| "grad_norm": 0.5747119188308716, | |
| "learning_rate": 1.9369152030840556e-06, | |
| "loss": 0.3517, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 2.2728380024360537, | |
| "grad_norm": 0.5735805630683899, | |
| "learning_rate": 1.7491752763844293e-06, | |
| "loss": 0.3749, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 2.2825822168087697, | |
| "grad_norm": 0.5682513117790222, | |
| "learning_rate": 1.5708419435684462e-06, | |
| "loss": 0.3629, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 2.292326431181486, | |
| "grad_norm": 0.587968111038208, | |
| "learning_rate": 1.4019499710726913e-06, | |
| "loss": 0.3971, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 2.302070645554202, | |
| "grad_norm": 0.5531240105628967, | |
| "learning_rate": 1.2425322847218368e-06, | |
| "loss": 0.3643, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 2.311814859926918, | |
| "grad_norm": 0.5963295698165894, | |
| "learning_rate": 1.0926199633097157e-06, | |
| "loss": 0.3678, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 2.3215590742996346, | |
| "grad_norm": 0.593429684638977, | |
| "learning_rate": 9.522422325404235e-07, | |
| "loss": 0.4407, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 2.331303288672351, | |
| "grad_norm": 0.5391296148300171, | |
| "learning_rate": 8.214264593307098e-07, | |
| "loss": 0.3228, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 2.341047503045067, | |
| "grad_norm": 0.5678541660308838, | |
| "learning_rate": 7.001981464747565e-07, | |
| "loss": 0.3825, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 2.350791717417783, | |
| "grad_norm": 0.5584573149681091, | |
| "learning_rate": 5.885809276723608e-07, | |
| "loss": 0.3306, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 2.3605359317904995, | |
| "grad_norm": 0.5642800331115723, | |
| "learning_rate": 4.865965629214819e-07, | |
| "loss": 0.368, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 2.3702801461632155, | |
| "grad_norm": 0.5758326053619385, | |
| "learning_rate": 3.9426493427611177e-07, | |
| "loss": 0.3755, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 2.380024360535932, | |
| "grad_norm": 0.5884226560592651, | |
| "learning_rate": 3.1160404197018154e-07, | |
| "loss": 0.389, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 2.389768574908648, | |
| "grad_norm": 0.5593277812004089, | |
| "learning_rate": 2.386300009084408e-07, | |
| "loss": 0.3565, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 2.3995127892813644, | |
| "grad_norm": 0.5782268643379211, | |
| "learning_rate": 1.753570375247815e-07, | |
| "loss": 0.3858, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 2.4092570036540804, | |
| "grad_norm": 0.5798349380493164, | |
| "learning_rate": 1.2179748700879012e-07, | |
| "loss": 0.3403, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 2.4190012180267964, | |
| "grad_norm": 0.596785843372345, | |
| "learning_rate": 7.796179090094891e-08, | |
| "loss": 0.3451, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 2.428745432399513, | |
| "grad_norm": 0.5822293162345886, | |
| "learning_rate": 4.385849505708084e-08, | |
| "loss": 0.3843, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 2.4384896467722292, | |
| "grad_norm": 0.5595471262931824, | |
| "learning_rate": 1.949424798228239e-08, | |
| "loss": 0.409, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 2.4482338611449452, | |
| "grad_norm": 0.5454703569412231, | |
| "learning_rate": 4.873799534788059e-09, | |
| "loss": 0.3588, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.4482338611449452, | |
| "eval_loss": 0.4610269367694855, | |
| "eval_runtime": 116.2672, | |
| "eval_samples_per_second": 7.371, | |
| "eval_steps_per_second": 0.464, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.4482338611449452, | |
| "step": 250, | |
| "total_flos": 2.3792355349217935e+18, | |
| "train_loss": 0.49713707935810086, | |
| "train_runtime": 9531.3935, | |
| "train_samples_per_second": 2.518, | |
| "train_steps_per_second": 0.026 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 250, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.3792355349217935e+18, | |
| "train_batch_size": 3, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |