{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.12,
  "eval_steps": 500,
  "global_step": 700,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.008,
      "grad_norm": 0.8485889434814453,
      "learning_rate": 0.0001999964908278481,
      "loss": 1.2049,
      "step": 5
    },
    {
      "epoch": 0.016,
      "grad_norm": 0.47789862751960754,
      "learning_rate": 0.00019998596355767805,
      "loss": 0.9333,
      "step": 10
    },
    {
      "epoch": 0.024,
      "grad_norm": 1.017558217048645,
      "learning_rate": 0.00019996841892833,
      "loss": 0.8671,
      "step": 15
    },
    {
      "epoch": 0.032,
      "grad_norm": 0.6610977053642273,
      "learning_rate": 0.00019994385817114646,
      "loss": 0.7979,
      "step": 20
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.6075429320335388,
      "learning_rate": 0.00019991228300988585,
      "loss": 0.7662,
      "step": 25
    },
    {
      "epoch": 0.048,
      "grad_norm": 0.6595763564109802,
      "learning_rate": 0.00019987369566060176,
      "loss": 0.7929,
      "step": 30
    },
    {
      "epoch": 0.056,
      "grad_norm": 0.6968618035316467,
      "learning_rate": 0.00019982809883148722,
      "loss": 0.7683,
      "step": 35
    },
    {
      "epoch": 0.064,
      "grad_norm": 0.4889592230319977,
      "learning_rate": 0.00019977549572268468,
      "loss": 0.8667,
      "step": 40
    },
    {
      "epoch": 0.072,
      "grad_norm": 0.6651108264923096,
      "learning_rate": 0.0001997158900260614,
      "loss": 0.8446,
      "step": 45
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.5898510217666626,
      "learning_rate": 0.00019964928592495045,
      "loss": 0.9051,
      "step": 50
    },
    {
      "epoch": 0.088,
      "grad_norm": 0.4398016035556793,
      "learning_rate": 0.00019957568809385694,
      "loss": 0.7235,
      "step": 55
    },
    {
      "epoch": 0.096,
      "grad_norm": 0.6901968121528625,
      "learning_rate": 0.00019949510169813003,
      "loss": 0.8169,
      "step": 60
    },
    {
      "epoch": 0.104,
      "grad_norm": 0.6267213225364685,
      "learning_rate": 0.00019940753239360047,
      "loss": 0.8266,
      "step": 65
    },
    {
      "epoch": 0.112,
      "grad_norm": 0.48524895310401917,
      "learning_rate": 0.00019931298632618356,
      "loss": 0.758,
      "step": 70
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.5294132232666016,
      "learning_rate": 0.0001992114701314478,
      "loss": 0.7759,
      "step": 75
    },
    {
      "epoch": 0.128,
      "grad_norm": 0.48957982659339905,
      "learning_rate": 0.0001991029909341493,
      "loss": 0.7797,
      "step": 80
    },
    {
      "epoch": 0.136,
      "grad_norm": 0.645412802696228,
      "learning_rate": 0.00019898755634773158,
      "loss": 0.7437,
      "step": 85
    },
    {
      "epoch": 0.144,
      "grad_norm": 0.43297675251960754,
      "learning_rate": 0.0001988651744737914,
      "loss": 0.8043,
      "step": 90
    },
    {
      "epoch": 0.152,
      "grad_norm": 0.5513920783996582,
      "learning_rate": 0.00019873585390151003,
      "loss": 0.7701,
      "step": 95
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.8462435007095337,
      "learning_rate": 0.0001985996037070505,
      "loss": 0.709,
      "step": 100
    },
    {
      "epoch": 0.168,
      "grad_norm": 0.6892585158348083,
      "learning_rate": 0.00019845643345292054,
      "loss": 0.7377,
      "step": 105
    },
    {
      "epoch": 0.176,
      "grad_norm": 0.4617864191532135,
      "learning_rate": 0.00019830635318730154,
      "loss": 0.8352,
      "step": 110
    },
    {
      "epoch": 0.184,
      "grad_norm": 0.6300354599952698,
      "learning_rate": 0.0001981493734433433,
      "loss": 0.7738,
      "step": 115
    },
    {
      "epoch": 0.192,
      "grad_norm": 0.8086859583854675,
      "learning_rate": 0.0001979855052384247,
      "loss": 0.8067,
      "step": 120
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.6272985935211182,
      "learning_rate": 0.00019781476007338058,
      "loss": 0.7456,
      "step": 125
    },
    {
      "epoch": 0.208,
      "grad_norm": 0.44750839471817017,
      "learning_rate": 0.00019763714993169452,
      "loss": 0.758,
      "step": 130
    },
    {
      "epoch": 0.216,
      "grad_norm": 0.5053977370262146,
      "learning_rate": 0.00019745268727865774,
      "loss": 0.7895,
      "step": 135
    },
    {
      "epoch": 0.224,
      "grad_norm": 0.41920769214630127,
      "learning_rate": 0.00019726138506049438,
      "loss": 0.7302,
      "step": 140
    },
    {
      "epoch": 0.232,
      "grad_norm": 0.38280290365219116,
      "learning_rate": 0.00019706325670345275,
      "loss": 0.8152,
      "step": 145
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.554710865020752,
      "learning_rate": 0.0001968583161128631,
      "loss": 0.8461,
      "step": 150
    },
    {
      "epoch": 0.248,
      "grad_norm": 0.5612509250640869,
      "learning_rate": 0.00019664657767216176,
      "loss": 0.7787,
      "step": 155
    },
    {
      "epoch": 0.256,
      "grad_norm": 0.610614538192749,
      "learning_rate": 0.00019642805624188147,
      "loss": 0.7574,
      "step": 160
    },
    {
      "epoch": 0.264,
      "grad_norm": 0.679517924785614,
      "learning_rate": 0.0001962027671586086,
      "loss": 0.8487,
      "step": 165
    },
    {
      "epoch": 0.272,
      "grad_norm": 0.6685434579849243,
      "learning_rate": 0.00019597072623390668,
      "loss": 0.6611,
      "step": 170
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.480293869972229,
      "learning_rate": 0.00019573194975320673,
      "loss": 0.7802,
      "step": 175
    },
    {
      "epoch": 0.288,
      "grad_norm": 0.7727369070053101,
      "learning_rate": 0.00019548645447466431,
      "loss": 0.6727,
      "step": 180
    },
    {
      "epoch": 0.296,
      "grad_norm": 0.6371043920516968,
      "learning_rate": 0.00019523425762798329,
      "loss": 0.7502,
      "step": 185
    },
    {
      "epoch": 0.304,
      "grad_norm": 0.6399966478347778,
      "learning_rate": 0.00019497537691320668,
      "loss": 0.8401,
      "step": 190
    },
    {
      "epoch": 0.312,
      "grad_norm": 0.7263137698173523,
      "learning_rate": 0.00019470983049947444,
      "loss": 0.7494,
      "step": 195
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.402416467666626,
      "learning_rate": 0.00019443763702374812,
      "loss": 0.7842,
      "step": 200
    },
    {
      "epoch": 0.328,
      "grad_norm": 0.6639626026153564,
      "learning_rate": 0.00019415881558950302,
      "loss": 0.8082,
      "step": 205
    },
    {
      "epoch": 0.336,
      "grad_norm": 0.5801042914390564,
      "learning_rate": 0.00019387338576538744,
      "loss": 0.7883,
      "step": 210
    },
    {
      "epoch": 0.344,
      "grad_norm": 0.5533607006072998,
      "learning_rate": 0.00019358136758384912,
      "loss": 0.7356,
      "step": 215
    },
    {
      "epoch": 0.352,
      "grad_norm": 0.6019654273986816,
      "learning_rate": 0.00019328278153972947,
      "loss": 0.7891,
      "step": 220
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.5344104170799255,
      "learning_rate": 0.00019297764858882514,
      "loss": 0.7671,
      "step": 225
    },
    {
      "epoch": 0.368,
      "grad_norm": 0.5494843125343323,
      "learning_rate": 0.0001926659901464172,
      "loss": 0.6608,
      "step": 230
    },
    {
      "epoch": 0.376,
      "grad_norm": 0.465420126914978,
      "learning_rate": 0.00019234782808576824,
      "loss": 0.647,
      "step": 235
    },
    {
      "epoch": 0.384,
      "grad_norm": 0.5202775001525879,
      "learning_rate": 0.00019202318473658705,
      "loss": 0.729,
      "step": 240
    },
    {
      "epoch": 0.392,
      "grad_norm": 0.5757818222045898,
      "learning_rate": 0.00019169208288346166,
      "loss": 0.6713,
      "step": 245
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.46555572748184204,
      "learning_rate": 0.0001913545457642601,
      "loss": 0.7049,
      "step": 250
    },
    {
      "epoch": 0.408,
      "grad_norm": 0.5101790428161621,
      "learning_rate": 0.00019101059706849957,
      "loss": 0.7419,
      "step": 255
    },
    {
      "epoch": 0.416,
      "grad_norm": 0.6083744764328003,
      "learning_rate": 0.00019066026093568378,
      "loss": 0.7148,
      "step": 260
    },
    {
      "epoch": 0.424,
      "grad_norm": 0.4719640612602234,
      "learning_rate": 0.00019030356195360874,
      "loss": 0.7493,
      "step": 265
    },
    {
      "epoch": 0.432,
      "grad_norm": 0.7365225553512573,
      "learning_rate": 0.0001899405251566371,
      "loss": 0.7652,
      "step": 270
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.4452705383300781,
      "learning_rate": 0.0001895711760239413,
      "loss": 0.7438,
      "step": 275
    },
    {
      "epoch": 0.448,
      "grad_norm": 0.6071786284446716,
      "learning_rate": 0.0001891955404777151,
      "loss": 0.7683,
      "step": 280
    },
    {
      "epoch": 0.456,
      "grad_norm": 0.5774498581886292,
      "learning_rate": 0.00018881364488135448,
      "loss": 0.8115,
      "step": 285
    },
    {
      "epoch": 0.464,
      "grad_norm": 0.6134682893753052,
      "learning_rate": 0.00018842551603760724,
      "loss": 0.8335,
      "step": 290
    },
    {
      "epoch": 0.472,
      "grad_norm": 0.4869893193244934,
      "learning_rate": 0.00018803118118669202,
      "loss": 0.6933,
      "step": 295
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.6457111239433289,
      "learning_rate": 0.00018763066800438636,
      "loss": 0.7515,
      "step": 300
    },
    {
      "epoch": 0.488,
      "grad_norm": 0.59674471616745,
      "learning_rate": 0.0001872240046000844,
      "loss": 0.6931,
      "step": 305
    },
    {
      "epoch": 0.496,
      "grad_norm": 0.44608160853385925,
      "learning_rate": 0.00018681121951482393,
      "loss": 0.782,
      "step": 310
    },
    {
      "epoch": 0.504,
      "grad_norm": 0.5934664607048035,
      "learning_rate": 0.00018639234171928353,
      "loss": 0.7361,
      "step": 315
    },
    {
      "epoch": 0.512,
      "grad_norm": 0.49716323614120483,
      "learning_rate": 0.0001859674006117491,
      "loss": 0.7443,
      "step": 320
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.47995495796203613,
      "learning_rate": 0.00018553642601605068,
      "loss": 0.7221,
      "step": 325
    },
    {
      "epoch": 0.528,
      "grad_norm": 0.5177399516105652,
      "learning_rate": 0.00018509944817946922,
      "loss": 0.7622,
      "step": 330
    },
    {
      "epoch": 0.536,
      "grad_norm": 0.6638798713684082,
      "learning_rate": 0.0001846564977706138,
      "loss": 0.8556,
      "step": 335
    },
    {
      "epoch": 0.544,
      "grad_norm": 0.5056771636009216,
      "learning_rate": 0.00018420760587726923,
      "loss": 0.7814,
      "step": 340
    },
    {
      "epoch": 0.552,
      "grad_norm": 0.44543707370758057,
      "learning_rate": 0.0001837528040042142,
      "loss": 0.722,
      "step": 345
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.6765120625495911,
      "learning_rate": 0.00018329212407100994,
      "loss": 0.7903,
      "step": 350
    },
    {
      "epoch": 0.568,
      "grad_norm": 0.49232372641563416,
      "learning_rate": 0.00018282559840976042,
      "loss": 0.6996,
      "step": 355
    },
    {
      "epoch": 0.576,
      "grad_norm": 0.47392791509628296,
      "learning_rate": 0.00018235325976284275,
      "loss": 0.773,
      "step": 360
    },
    {
      "epoch": 0.584,
      "grad_norm": 0.5056615471839905,
      "learning_rate": 0.00018187514128060946,
      "loss": 0.728,
      "step": 365
    },
    {
      "epoch": 0.592,
      "grad_norm": 0.5857616662979126,
      "learning_rate": 0.00018139127651906184,
      "loss": 0.7659,
      "step": 370
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.5966864228248596,
      "learning_rate": 0.00018090169943749476,
      "loss": 0.7039,
      "step": 375
    },
    {
      "epoch": 0.608,
      "grad_norm": 0.4524347484111786,
      "learning_rate": 0.00018040644439611348,
      "loss": 0.7125,
      "step": 380
    },
    {
      "epoch": 0.616,
      "grad_norm": 0.5570976138114929,
      "learning_rate": 0.00017990554615362198,
      "loss": 0.698,
      "step": 385
    },
    {
      "epoch": 0.624,
      "grad_norm": 0.6045777201652527,
      "learning_rate": 0.00017939903986478355,
      "loss": 0.8255,
      "step": 390
    },
    {
      "epoch": 0.632,
      "grad_norm": 0.6149687767028809,
      "learning_rate": 0.00017888696107795342,
      "loss": 0.6616,
      "step": 395
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.4873579144477844,
      "learning_rate": 0.000178369345732584,
      "loss": 0.7452,
      "step": 400
    },
    {
      "epoch": 0.648,
      "grad_norm": 0.5569061636924744,
      "learning_rate": 0.00017784623015670238,
      "loss": 0.7652,
      "step": 405
    },
    {
      "epoch": 0.656,
      "grad_norm": 0.5825181603431702,
      "learning_rate": 0.00017731765106436073,
      "loss": 0.7793,
      "step": 410
    },
    {
      "epoch": 0.664,
      "grad_norm": 0.4047383666038513,
      "learning_rate": 0.00017678364555305978,
      "loss": 0.6875,
      "step": 415
    },
    {
      "epoch": 0.672,
      "grad_norm": 0.5080836415290833,
      "learning_rate": 0.0001762442511011448,
      "loss": 0.7465,
      "step": 420
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.5825940370559692,
      "learning_rate": 0.00017569950556517566,
      "loss": 0.7205,
      "step": 425
    },
    {
      "epoch": 0.688,
      "grad_norm": 0.476992666721344,
      "learning_rate": 0.00017514944717726962,
      "loss": 0.6589,
      "step": 430
    },
    {
      "epoch": 0.696,
      "grad_norm": 0.7424727082252502,
      "learning_rate": 0.00017459411454241822,
      "loss": 0.7035,
      "step": 435
    },
    {
      "epoch": 0.704,
      "grad_norm": 0.6544787287712097,
      "learning_rate": 0.00017403354663577783,
      "loss": 0.787,
      "step": 440
    },
    {
      "epoch": 0.712,
      "grad_norm": 0.49425187706947327,
      "learning_rate": 0.00017346778279993415,
      "loss": 0.7515,
      "step": 445
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.5473236441612244,
      "learning_rate": 0.00017289686274214118,
      "loss": 0.7199,
      "step": 450
    },
    {
      "epoch": 0.728,
      "grad_norm": 0.6773544549942017,
      "learning_rate": 0.00017232082653153422,
      "loss": 0.8037,
      "step": 455
    },
    {
      "epoch": 0.736,
      "grad_norm": 0.6355096101760864,
      "learning_rate": 0.00017173971459631787,
      "loss": 0.7502,
      "step": 460
    },
    {
      "epoch": 0.744,
      "grad_norm": 0.47867000102996826,
      "learning_rate": 0.00017115356772092857,
      "loss": 0.7446,
      "step": 465
    },
    {
      "epoch": 0.752,
      "grad_norm": 0.5135357975959778,
      "learning_rate": 0.0001705624270431721,
      "loss": 0.6507,
      "step": 470
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.48866042494773865,
      "learning_rate": 0.00016996633405133655,
      "loss": 0.7164,
      "step": 475
    },
    {
      "epoch": 0.768,
      "grad_norm": 0.5892354249954224,
      "learning_rate": 0.0001693653305812805,
      "loss": 0.7621,
      "step": 480
    },
    {
      "epoch": 0.776,
      "grad_norm": 0.6633970141410828,
      "learning_rate": 0.00016875945881349676,
      "loss": 0.7623,
      "step": 485
    },
    {
      "epoch": 0.784,
      "grad_norm": 0.6444060802459717,
      "learning_rate": 0.000168148761270152,
      "loss": 0.6606,
      "step": 490
    },
    {
      "epoch": 0.792,
      "grad_norm": 0.7012648582458496,
      "learning_rate": 0.00016753328081210245,
      "loss": 0.6941,
      "step": 495
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.7064160704612732,
      "learning_rate": 0.00016691306063588583,
      "loss": 0.6841,
      "step": 500
    },
    {
      "epoch": 0.808,
      "grad_norm": 0.7241398096084595,
      "learning_rate": 0.00016628814427068953,
      "loss": 0.6996,
      "step": 505
    },
    {
      "epoch": 0.816,
      "grad_norm": 0.7807374596595764,
      "learning_rate": 0.00016565857557529566,
      "loss": 0.7542,
      "step": 510
    },
    {
      "epoch": 0.824,
      "grad_norm": 0.763768196105957,
      "learning_rate": 0.00016502439873500289,
      "loss": 0.7175,
      "step": 515
    },
    {
      "epoch": 0.832,
      "grad_norm": 0.6105090379714966,
      "learning_rate": 0.0001643856582585254,
      "loss": 0.7565,
      "step": 520
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.5686540603637695,
      "learning_rate": 0.000163742398974869,
      "loss": 0.7339,
      "step": 525
    },
    {
      "epoch": 0.848,
      "grad_norm": 0.5341500043869019,
      "learning_rate": 0.00016309466603018496,
      "loss": 0.569,
      "step": 530
    },
    {
      "epoch": 0.856,
      "grad_norm": 0.7274748682975769,
      "learning_rate": 0.00016244250488460158,
      "loss": 0.7556,
      "step": 535
    },
    {
      "epoch": 0.864,
      "grad_norm": 0.7321165204048157,
      "learning_rate": 0.00016178596130903344,
      "loss": 0.7084,
      "step": 540
    },
    {
      "epoch": 0.872,
      "grad_norm": 0.5086159110069275,
      "learning_rate": 0.00016112508138196917,
      "loss": 0.6935,
      "step": 545
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.4714389443397522,
      "learning_rate": 0.0001604599114862375,
      "loss": 0.7076,
      "step": 550
    },
    {
      "epoch": 0.888,
      "grad_norm": 0.5031452178955078,
      "learning_rate": 0.0001597904983057519,
      "loss": 0.7151,
      "step": 555
    },
    {
      "epoch": 0.896,
      "grad_norm": 0.7745943665504456,
      "learning_rate": 0.0001591168888222342,
      "loss": 0.7001,
      "step": 560
    },
    {
      "epoch": 0.904,
      "grad_norm": 0.6076303124427795,
      "learning_rate": 0.00015843913031191723,
      "loss": 0.7285,
      "step": 565
    },
    {
      "epoch": 0.912,
      "grad_norm": 0.7456529140472412,
      "learning_rate": 0.00015775727034222675,
      "loss": 0.8041,
      "step": 570
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.5760998725891113,
      "learning_rate": 0.0001570713567684432,
      "loss": 0.7353,
      "step": 575
    },
    {
      "epoch": 0.928,
      "grad_norm": 0.7057327032089233,
      "learning_rate": 0.00015638143773034267,
      "loss": 0.7792,
      "step": 580
    },
    {
      "epoch": 0.936,
      "grad_norm": 0.7615967392921448,
      "learning_rate": 0.00015568756164881882,
      "loss": 1.0121,
      "step": 585
    },
    {
      "epoch": 0.944,
      "grad_norm": 0.6304950714111328,
      "learning_rate": 0.000154989777222484,
      "loss": 0.7727,
      "step": 590
    },
    {
      "epoch": 0.952,
      "grad_norm": 0.6852543950080872,
      "learning_rate": 0.00015428813342425177,
      "loss": 0.741,
      "step": 595
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.6379660964012146,
      "learning_rate": 0.00015358267949789966,
      "loss": 0.6919,
      "step": 600
    },
    {
      "epoch": 0.968,
      "grad_norm": 0.5846463441848755,
      "learning_rate": 0.00015287346495461315,
      "loss": 0.7163,
      "step": 605
    },
    {
      "epoch": 0.976,
      "grad_norm": 0.5999557971954346,
      "learning_rate": 0.0001521605395695108,
      "loss": 0.8152,
      "step": 610
    },
    {
      "epoch": 0.984,
      "grad_norm": 0.5806307196617126,
      "learning_rate": 0.00015144395337815064,
      "loss": 0.6709,
      "step": 615
    },
    {
      "epoch": 0.992,
      "grad_norm": 0.6559942960739136,
      "learning_rate": 0.00015072375667301893,
      "loss": 0.6527,
      "step": 620
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.6287715435028076,
      "learning_rate": 0.00015000000000000001,
      "loss": 0.8194,
      "step": 625
    },
    {
      "epoch": 1.008,
      "grad_norm": 0.616222620010376,
      "learning_rate": 0.00014927273415482915,
      "loss": 0.6627,
      "step": 630
    },
    {
      "epoch": 1.016,
      "grad_norm": 0.4750412106513977,
      "learning_rate": 0.0001485420101795274,
      "loss": 0.6366,
      "step": 635
    },
    {
      "epoch": 1.024,
      "grad_norm": 0.5122964978218079,
      "learning_rate": 0.00014780787935881923,
      "loss": 0.6717,
      "step": 640
    },
    {
      "epoch": 1.032,
      "grad_norm": 0.7382633090019226,
      "learning_rate": 0.0001470703932165333,
      "loss": 0.6483,
      "step": 645
    },
    {
      "epoch": 1.04,
      "grad_norm": 0.6540554761886597,
      "learning_rate": 0.00014632960351198618,
      "loss": 0.6151,
      "step": 650
    },
    {
      "epoch": 1.048,
      "grad_norm": 0.4776591956615448,
      "learning_rate": 0.00014558556223635003,
      "loss": 0.6707,
      "step": 655
    },
    {
      "epoch": 1.056,
      "grad_norm": 0.8012662529945374,
      "learning_rate": 0.00014483832160900326,
      "loss": 0.6125,
      "step": 660
    },
    {
      "epoch": 1.064,
      "grad_norm": 0.6735953092575073,
      "learning_rate": 0.00014408793407386588,
      "loss": 0.6206,
      "step": 665
    },
    {
      "epoch": 1.072,
      "grad_norm": 0.5640230774879456,
      "learning_rate": 0.00014333445229571873,
      "loss": 0.6161,
      "step": 670
    },
    {
      "epoch": 1.08,
      "grad_norm": 0.5928654074668884,
      "learning_rate": 0.00014257792915650728,
      "loss": 0.6583,
      "step": 675
    },
    {
      "epoch": 1.088,
      "grad_norm": 0.7347397208213806,
      "learning_rate": 0.00014181841775163013,
      "loss": 0.6222,
      "step": 680
    },
    {
      "epoch": 1.096,
      "grad_norm": 0.593773365020752,
      "learning_rate": 0.0001410559713862128,
      "loss": 0.716,
      "step": 685
    },
    {
      "epoch": 1.104,
      "grad_norm": 0.6244611144065857,
      "learning_rate": 0.00014029064357136628,
      "loss": 0.6198,
      "step": 690
    },
    {
      "epoch": 1.112,
      "grad_norm": 0.5083370804786682,
      "learning_rate": 0.00013952248802043165,
      "loss": 0.6389,
      "step": 695
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.5241413116455078,
      "learning_rate": 0.0001387515586452103,
      "loss": 0.6842,
      "step": 700
    }
  ],
  "logging_steps": 5,
  "max_steps": 1875,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 4.320122286794342e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}