| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 10.0, |
| "eval_steps": 5000, |
| "global_step": 87900, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.11376564277588168, |
| "grad_norm": 1.9390705823898315, |
| "learning_rate": 0.0007909078498293515, |
| "loss": 1.5809, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.22753128555176336, |
| "grad_norm": 1.703497052192688, |
| "learning_rate": 0.000781806598407281, |
| "loss": 1.54, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.3412969283276451, |
| "grad_norm": 1.7551511526107788, |
| "learning_rate": 0.0007727144482366326, |
| "loss": 1.5087, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.4550625711035267, |
| "grad_norm": 1.5709869861602783, |
| "learning_rate": 0.000763613196814562, |
| "loss": 1.4773, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.5688282138794084, |
| "grad_norm": 1.5395598411560059, |
| "learning_rate": 0.0007545119453924914, |
| "loss": 1.4546, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.5688282138794084, |
| "eval_accuracy": 0.647436, |
| "eval_loss": 1.4382679462432861, |
| "eval_runtime": 16.1443, |
| "eval_samples_per_second": 15485.324, |
| "eval_steps_per_second": 30.289, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.6825938566552902, |
| "grad_norm": 1.6133095026016235, |
| "learning_rate": 0.0007454106939704209, |
| "loss": 1.4513, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.7963594994311718, |
| "grad_norm": 1.3529345989227295, |
| "learning_rate": 0.0007363185437997725, |
| "loss": 1.459, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.9101251422070534, |
| "grad_norm": 1.4212840795516968, |
| "learning_rate": 0.000727217292377702, |
| "loss": 1.4393, |
| "step": 8000 |
| }, |
| { |
| "epoch": 1.023890784982935, |
| "grad_norm": 1.3942997455596924, |
| "learning_rate": 0.0007181342434584756, |
| "loss": 1.4183, |
| "step": 9000 |
| }, |
| { |
| "epoch": 1.1376564277588168, |
| "grad_norm": 1.584731936454773, |
| "learning_rate": 0.0007090329920364051, |
| "loss": 1.3759, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.1376564277588168, |
| "eval_accuracy": 0.660984, |
| "eval_loss": 1.38503897190094, |
| "eval_runtime": 16.2019, |
| "eval_samples_per_second": 15430.245, |
| "eval_steps_per_second": 30.182, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.2514220705346986, |
| "grad_norm": 1.4144625663757324, |
| "learning_rate": 0.0006999317406143345, |
| "loss": 1.375, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.36518771331058, |
| "grad_norm": 1.3004510402679443, |
| "learning_rate": 0.0006908395904436861, |
| "loss": 1.3729, |
| "step": 12000 |
| }, |
| { |
| "epoch": 1.4789533560864618, |
| "grad_norm": 1.3783901929855347, |
| "learning_rate": 0.0006817474402730376, |
| "loss": 1.3562, |
| "step": 13000 |
| }, |
| { |
| "epoch": 1.5927189988623436, |
| "grad_norm": 1.309706449508667, |
| "learning_rate": 0.000672646188850967, |
| "loss": 1.355, |
| "step": 14000 |
| }, |
| { |
| "epoch": 1.7064846416382253, |
| "grad_norm": 3.742795944213867, |
| "learning_rate": 0.0006635540386803186, |
| "loss": 1.3508, |
| "step": 15000 |
| }, |
| { |
| "epoch": 1.7064846416382253, |
| "eval_accuracy": 0.673728, |
| "eval_loss": 1.316284418106079, |
| "eval_runtime": 16.2031, |
| "eval_samples_per_second": 15429.139, |
| "eval_steps_per_second": 30.179, |
| "step": 15000 |
| }, |
| { |
| "epoch": 1.820250284414107, |
| "grad_norm": 1.2620598077774048, |
| "learning_rate": 0.0006544527872582481, |
| "loss": 1.3472, |
| "step": 16000 |
| }, |
| { |
| "epoch": 1.9340159271899886, |
| "grad_norm": 1.3602592945098877, |
| "learning_rate": 0.0006453515358361775, |
| "loss": 1.3371, |
| "step": 17000 |
| }, |
| { |
| "epoch": 2.04778156996587, |
| "grad_norm": 1.3070189952850342, |
| "learning_rate": 0.000636259385665529, |
| "loss": 1.3145, |
| "step": 18000 |
| }, |
| { |
| "epoch": 2.161547212741752, |
| "grad_norm": 1.2134970426559448, |
| "learning_rate": 0.0006271581342434585, |
| "loss": 1.2917, |
| "step": 19000 |
| }, |
| { |
| "epoch": 2.2753128555176336, |
| "grad_norm": 1.3796401023864746, |
| "learning_rate": 0.00061806598407281, |
| "loss": 1.294, |
| "step": 20000 |
| }, |
| { |
| "epoch": 2.2753128555176336, |
| "eval_accuracy": 0.682924, |
| "eval_loss": 1.283160924911499, |
| "eval_runtime": 16.1194, |
| "eval_samples_per_second": 15509.309, |
| "eval_steps_per_second": 30.336, |
| "step": 20000 |
| }, |
| { |
| "epoch": 2.3890784982935154, |
| "grad_norm": 1.357393741607666, |
| "learning_rate": 0.0006089738339021616, |
| "loss": 1.2936, |
| "step": 21000 |
| }, |
| { |
| "epoch": 2.502844141069397, |
| "grad_norm": 1.2381339073181152, |
| "learning_rate": 0.0005998725824800911, |
| "loss": 1.2859, |
| "step": 22000 |
| }, |
| { |
| "epoch": 2.616609783845279, |
| "grad_norm": 1.256423830986023, |
| "learning_rate": 0.0005907713310580204, |
| "loss": 1.2899, |
| "step": 23000 |
| }, |
| { |
| "epoch": 2.73037542662116, |
| "grad_norm": 1.1443513631820679, |
| "learning_rate": 0.000581679180887372, |
| "loss": 1.2846, |
| "step": 24000 |
| }, |
| { |
| "epoch": 2.8441410693970424, |
| "grad_norm": 1.2000058889389038, |
| "learning_rate": 0.0005725870307167236, |
| "loss": 1.2811, |
| "step": 25000 |
| }, |
| { |
| "epoch": 2.8441410693970424, |
| "eval_accuracy": 0.688052, |
| "eval_loss": 1.2580605745315552, |
| "eval_runtime": 16.1237, |
| "eval_samples_per_second": 15505.095, |
| "eval_steps_per_second": 30.328, |
| "step": 25000 |
| }, |
| { |
| "epoch": 2.9579067121729237, |
| "grad_norm": 1.2849873304367065, |
| "learning_rate": 0.0005634857792946531, |
| "loss": 1.2779, |
| "step": 26000 |
| }, |
| { |
| "epoch": 3.0716723549488054, |
| "grad_norm": 1.2703396081924438, |
| "learning_rate": 0.0005543936291240047, |
| "loss": 1.2444, |
| "step": 27000 |
| }, |
| { |
| "epoch": 3.185437997724687, |
| "grad_norm": 1.356720209121704, |
| "learning_rate": 0.000545292377701934, |
| "loss": 1.2303, |
| "step": 28000 |
| }, |
| { |
| "epoch": 3.299203640500569, |
| "grad_norm": 1.128195881843567, |
| "learning_rate": 0.0005361911262798635, |
| "loss": 1.2321, |
| "step": 29000 |
| }, |
| { |
| "epoch": 3.4129692832764507, |
| "grad_norm": 1.2033754587173462, |
| "learning_rate": 0.0005270989761092151, |
| "loss": 1.2331, |
| "step": 30000 |
| }, |
| { |
| "epoch": 3.4129692832764507, |
| "eval_accuracy": 0.69262, |
| "eval_loss": 1.2387434244155884, |
| "eval_runtime": 16.2457, |
| "eval_samples_per_second": 15388.688, |
| "eval_steps_per_second": 30.1, |
| "step": 30000 |
| }, |
| { |
| "epoch": 3.526734926052332, |
| "grad_norm": 1.2216309309005737, |
| "learning_rate": 0.0005179977246871446, |
| "loss": 1.2384, |
| "step": 31000 |
| }, |
| { |
| "epoch": 3.640500568828214, |
| "grad_norm": 1.3189234733581543, |
| "learning_rate": 0.000508896473265074, |
| "loss": 1.239, |
| "step": 32000 |
| }, |
| { |
| "epoch": 3.7542662116040955, |
| "grad_norm": 1.193328857421875, |
| "learning_rate": 0.0004998043230944255, |
| "loss": 1.2282, |
| "step": 33000 |
| }, |
| { |
| "epoch": 3.868031854379977, |
| "grad_norm": 1.3810237646102905, |
| "learning_rate": 0.000490703071672355, |
| "loss": 1.2301, |
| "step": 34000 |
| }, |
| { |
| "epoch": 3.981797497155859, |
| "grad_norm": 1.477654218673706, |
| "learning_rate": 0.0004816018202502845, |
| "loss": 1.2276, |
| "step": 35000 |
| }, |
| { |
| "epoch": 3.981797497155859, |
| "eval_accuracy": 0.697844, |
| "eval_loss": 1.2226529121398926, |
| "eval_runtime": 16.1466, |
| "eval_samples_per_second": 15483.136, |
| "eval_steps_per_second": 30.285, |
| "step": 35000 |
| }, |
| { |
| "epoch": 4.09556313993174, |
| "grad_norm": 2.5721781253814697, |
| "learning_rate": 0.00047250056882821396, |
| "loss": 1.2011, |
| "step": 36000 |
| }, |
| { |
| "epoch": 4.2093287827076225, |
| "grad_norm": 1.233066439628601, |
| "learning_rate": 0.00046340841865756544, |
| "loss": 1.1882, |
| "step": 37000 |
| }, |
| { |
| "epoch": 4.323094425483504, |
| "grad_norm": 15.391983032226562, |
| "learning_rate": 0.0004543071672354949, |
| "loss": 1.1856, |
| "step": 38000 |
| }, |
| { |
| "epoch": 4.436860068259386, |
| "grad_norm": 1.2283698320388794, |
| "learning_rate": 0.0004452059158134244, |
| "loss": 1.1972, |
| "step": 39000 |
| }, |
| { |
| "epoch": 4.550625711035267, |
| "grad_norm": 1.1042656898498535, |
| "learning_rate": 0.0004361046643913539, |
| "loss": 1.1964, |
| "step": 40000 |
| }, |
| { |
| "epoch": 4.550625711035267, |
| "eval_accuracy": 0.698972, |
| "eval_loss": 1.2195725440979004, |
| "eval_runtime": 16.22, |
| "eval_samples_per_second": 15413.078, |
| "eval_steps_per_second": 30.148, |
| "step": 40000 |
| }, |
| { |
| "epoch": 4.664391353811149, |
| "grad_norm": 1.2379703521728516, |
| "learning_rate": 0.00042701251422070535, |
| "loss": 1.194, |
| "step": 41000 |
| }, |
| { |
| "epoch": 4.778156996587031, |
| "grad_norm": 1.3536499738693237, |
| "learning_rate": 0.00041792036405005693, |
| "loss": 1.1939, |
| "step": 42000 |
| }, |
| { |
| "epoch": 4.891922639362912, |
| "grad_norm": 1.1571460962295532, |
| "learning_rate": 0.00040881911262798635, |
| "loss": 1.1952, |
| "step": 43000 |
| }, |
| { |
| "epoch": 5.005688282138794, |
| "grad_norm": 1.1833922863006592, |
| "learning_rate": 0.00039972696245733794, |
| "loss": 1.1908, |
| "step": 44000 |
| }, |
| { |
| "epoch": 5.1194539249146755, |
| "grad_norm": 1.4700716733932495, |
| "learning_rate": 0.00039062571103526736, |
| "loss": 1.1498, |
| "step": 45000 |
| }, |
| { |
| "epoch": 5.1194539249146755, |
| "eval_accuracy": 0.703608, |
| "eval_loss": 1.1993978023529053, |
| "eval_runtime": 16.3707, |
| "eval_samples_per_second": 15271.187, |
| "eval_steps_per_second": 29.87, |
| "step": 45000 |
| }, |
| { |
| "epoch": 5.233219567690558, |
| "grad_norm": 1.3525902032852173, |
| "learning_rate": 0.00038152445961319684, |
| "loss": 1.1507, |
| "step": 46000 |
| }, |
| { |
| "epoch": 5.346985210466439, |
| "grad_norm": 1.3642832040786743, |
| "learning_rate": 0.0003724232081911263, |
| "loss": 1.1551, |
| "step": 47000 |
| }, |
| { |
| "epoch": 5.460750853242321, |
| "grad_norm": 1.2102240324020386, |
| "learning_rate": 0.0003633219567690558, |
| "loss": 1.1574, |
| "step": 48000 |
| }, |
| { |
| "epoch": 5.5745164960182025, |
| "grad_norm": 1.1597959995269775, |
| "learning_rate": 0.0003542207053469852, |
| "loss": 1.1545, |
| "step": 49000 |
| }, |
| { |
| "epoch": 5.688282138794084, |
| "grad_norm": 1.2223830223083496, |
| "learning_rate": 0.00034512855517633675, |
| "loss": 1.1548, |
| "step": 50000 |
| }, |
| { |
| "epoch": 5.688282138794084, |
| "eval_accuracy": 0.705224, |
| "eval_loss": 1.1899733543395996, |
| "eval_runtime": 16.029, |
| "eval_samples_per_second": 15596.716, |
| "eval_steps_per_second": 30.507, |
| "step": 50000 |
| }, |
| { |
| "epoch": 5.802047781569966, |
| "grad_norm": 1.1772878170013428, |
| "learning_rate": 0.0003360364050056883, |
| "loss": 1.1543, |
| "step": 51000 |
| }, |
| { |
| "epoch": 5.915813424345847, |
| "grad_norm": 1.286970615386963, |
| "learning_rate": 0.00032693515358361776, |
| "loss": 1.1566, |
| "step": 52000 |
| }, |
| { |
| "epoch": 6.0295790671217295, |
| "grad_norm": 1.1497869491577148, |
| "learning_rate": 0.00031783390216154724, |
| "loss": 1.1471, |
| "step": 53000 |
| }, |
| { |
| "epoch": 6.143344709897611, |
| "grad_norm": 1.2324450016021729, |
| "learning_rate": 0.00030873265073947667, |
| "loss": 1.1141, |
| "step": 54000 |
| }, |
| { |
| "epoch": 6.257110352673493, |
| "grad_norm": 1.175905466079712, |
| "learning_rate": 0.00029963139931740615, |
| "loss": 1.1232, |
| "step": 55000 |
| }, |
| { |
| "epoch": 6.257110352673493, |
| "eval_accuracy": 0.707532, |
| "eval_loss": 1.183059573173523, |
| "eval_runtime": 16.1679, |
| "eval_samples_per_second": 15462.772, |
| "eval_steps_per_second": 30.245, |
| "step": 55000 |
| }, |
| { |
| "epoch": 6.370875995449374, |
| "grad_norm": 1.133489966392517, |
| "learning_rate": 0.00029053924914675767, |
| "loss": 1.1213, |
| "step": 56000 |
| }, |
| { |
| "epoch": 6.484641638225256, |
| "grad_norm": 1.3633593320846558, |
| "learning_rate": 0.00028143799772468715, |
| "loss": 1.1206, |
| "step": 57000 |
| }, |
| { |
| "epoch": 6.598407281001138, |
| "grad_norm": 1.2622781991958618, |
| "learning_rate": 0.00027233674630261663, |
| "loss": 1.1241, |
| "step": 58000 |
| }, |
| { |
| "epoch": 6.712172923777019, |
| "grad_norm": 1.2032582759857178, |
| "learning_rate": 0.00026324459613196816, |
| "loss": 1.1276, |
| "step": 59000 |
| }, |
| { |
| "epoch": 6.825938566552901, |
| "grad_norm": 1.166924238204956, |
| "learning_rate": 0.00025414334470989764, |
| "loss": 1.1264, |
| "step": 60000 |
| }, |
| { |
| "epoch": 6.825938566552901, |
| "eval_accuracy": 0.710036, |
| "eval_loss": 1.1695001125335693, |
| "eval_runtime": 16.198, |
| "eval_samples_per_second": 15434.001, |
| "eval_steps_per_second": 30.189, |
| "step": 60000 |
| }, |
| { |
| "epoch": 6.939704209328783, |
| "grad_norm": 1.236396074295044, |
| "learning_rate": 0.00024505119453924917, |
| "loss": 1.1196, |
| "step": 61000 |
| }, |
| { |
| "epoch": 7.053469852104665, |
| "grad_norm": 1.2301005125045776, |
| "learning_rate": 0.00023594994311717865, |
| "loss": 1.1065, |
| "step": 62000 |
| }, |
| { |
| "epoch": 7.167235494880546, |
| "grad_norm": 1.1987460851669312, |
| "learning_rate": 0.00022685779294653017, |
| "loss": 1.0845, |
| "step": 63000 |
| }, |
| { |
| "epoch": 7.281001137656427, |
| "grad_norm": 1.367330551147461, |
| "learning_rate": 0.0002177565415244596, |
| "loss": 1.0915, |
| "step": 64000 |
| }, |
| { |
| "epoch": 7.39476678043231, |
| "grad_norm": 1.2554900646209717, |
| "learning_rate": 0.00020865529010238908, |
| "loss": 1.0896, |
| "step": 65000 |
| }, |
| { |
| "epoch": 7.39476678043231, |
| "eval_accuracy": 0.712788, |
| "eval_loss": 1.1583917140960693, |
| "eval_runtime": 15.94, |
| "eval_samples_per_second": 15683.855, |
| "eval_steps_per_second": 30.678, |
| "step": 65000 |
| }, |
| { |
| "epoch": 7.508532423208191, |
| "grad_norm": 1.1475346088409424, |
| "learning_rate": 0.00019955403868031853, |
| "loss": 1.0937, |
| "step": 66000 |
| }, |
| { |
| "epoch": 7.622298065984073, |
| "grad_norm": 1.2330896854400635, |
| "learning_rate": 0.000190452787258248, |
| "loss": 1.095, |
| "step": 67000 |
| }, |
| { |
| "epoch": 7.736063708759954, |
| "grad_norm": 1.3467962741851807, |
| "learning_rate": 0.0001813515358361775, |
| "loss": 1.0945, |
| "step": 68000 |
| }, |
| { |
| "epoch": 7.849829351535837, |
| "grad_norm": 1.144555926322937, |
| "learning_rate": 0.00017225938566552902, |
| "loss": 1.0943, |
| "step": 69000 |
| }, |
| { |
| "epoch": 7.963594994311718, |
| "grad_norm": 1.39180326461792, |
| "learning_rate": 0.0001631581342434585, |
| "loss": 1.0917, |
| "step": 70000 |
| }, |
| { |
| "epoch": 7.963594994311718, |
| "eval_accuracy": 0.715496, |
| "eval_loss": 1.1535059213638306, |
| "eval_runtime": 16.0681, |
| "eval_samples_per_second": 15558.787, |
| "eval_steps_per_second": 30.433, |
| "step": 70000 |
| }, |
| { |
| "epoch": 8.0773606370876, |
| "grad_norm": 1.277241587638855, |
| "learning_rate": 0.00015405688282138795, |
| "loss": 1.0693, |
| "step": 71000 |
| }, |
| { |
| "epoch": 8.19112627986348, |
| "grad_norm": 1.3388996124267578, |
| "learning_rate": 0.00014496473265073948, |
| "loss": 1.064, |
| "step": 72000 |
| }, |
| { |
| "epoch": 8.304891922639364, |
| "grad_norm": 1.1635925769805908, |
| "learning_rate": 0.00013588168373151308, |
| "loss": 1.0617, |
| "step": 73000 |
| }, |
| { |
| "epoch": 8.418657565415245, |
| "grad_norm": 1.1681923866271973, |
| "learning_rate": 0.00012678043230944256, |
| "loss": 1.0664, |
| "step": 74000 |
| }, |
| { |
| "epoch": 8.532423208191126, |
| "grad_norm": 1.3212028741836548, |
| "learning_rate": 0.00011767918088737203, |
| "loss": 1.0654, |
| "step": 75000 |
| }, |
| { |
| "epoch": 8.532423208191126, |
| "eval_accuracy": 0.714384, |
| "eval_loss": 1.154496192932129, |
| "eval_runtime": 16.158, |
| "eval_samples_per_second": 15472.18, |
| "eval_steps_per_second": 30.264, |
| "step": 75000 |
| }, |
| { |
| "epoch": 8.646188850967008, |
| "grad_norm": 1.341015100479126, |
| "learning_rate": 0.00010857792946530148, |
| "loss": 1.0618, |
| "step": 76000 |
| }, |
| { |
| "epoch": 8.759954493742889, |
| "grad_norm": 1.2505824565887451, |
| "learning_rate": 9.947667804323096e-05, |
| "loss": 1.0674, |
| "step": 77000 |
| }, |
| { |
| "epoch": 8.873720136518772, |
| "grad_norm": 1.2615190744400024, |
| "learning_rate": 9.037542662116041e-05, |
| "loss": 1.0638, |
| "step": 78000 |
| }, |
| { |
| "epoch": 8.987485779294653, |
| "grad_norm": 1.2935796976089478, |
| "learning_rate": 8.128327645051195e-05, |
| "loss": 1.0616, |
| "step": 79000 |
| }, |
| { |
| "epoch": 9.101251422070535, |
| "grad_norm": 1.3248777389526367, |
| "learning_rate": 7.218202502844142e-05, |
| "loss": 1.0395, |
| "step": 80000 |
| }, |
| { |
| "epoch": 9.101251422070535, |
| "eval_accuracy": 0.716892, |
| "eval_loss": 1.1470571756362915, |
| "eval_runtime": 16.0825, |
| "eval_samples_per_second": 15544.827, |
| "eval_steps_per_second": 30.406, |
| "step": 80000 |
| }, |
| { |
| "epoch": 9.215017064846416, |
| "grad_norm": 1.379506230354309, |
| "learning_rate": 6.308077360637088e-05, |
| "loss": 1.0436, |
| "step": 81000 |
| }, |
| { |
| "epoch": 9.328782707622299, |
| "grad_norm": 1.1906781196594238, |
| "learning_rate": 5.398862343572242e-05, |
| "loss": 1.0417, |
| "step": 82000 |
| }, |
| { |
| "epoch": 9.44254835039818, |
| "grad_norm": 1.1397643089294434, |
| "learning_rate": 4.489647326507395e-05, |
| "loss": 1.0376, |
| "step": 83000 |
| }, |
| { |
| "epoch": 9.556313993174061, |
| "grad_norm": 1.0807147026062012, |
| "learning_rate": 3.5813424345847554e-05, |
| "loss": 1.0381, |
| "step": 84000 |
| }, |
| { |
| "epoch": 9.670079635949943, |
| "grad_norm": 1.3149391412734985, |
| "learning_rate": 2.6712172923777017e-05, |
| "loss": 1.0383, |
| "step": 85000 |
| }, |
| { |
| "epoch": 9.670079635949943, |
| "eval_accuracy": 0.713636, |
| "eval_loss": 1.1722280979156494, |
| "eval_runtime": 16.186, |
| "eval_samples_per_second": 15445.423, |
| "eval_steps_per_second": 30.211, |
| "step": 85000 |
| }, |
| { |
| "epoch": 9.783845278725824, |
| "grad_norm": 1.227634072303772, |
| "learning_rate": 1.7610921501706483e-05, |
| "loss": 1.0359, |
| "step": 86000 |
| }, |
| { |
| "epoch": 9.897610921501707, |
| "grad_norm": 1.2846591472625732, |
| "learning_rate": 8.509670079635951e-06, |
| "loss": 1.0337, |
| "step": 87000 |
| }, |
| { |
| "epoch": 10.0, |
| "step": 87900, |
| "total_flos": 5.6417821488e+17, |
| "train_loss": 1.2023330011465443, |
| "train_runtime": 3087.8654, |
| "train_samples_per_second": 14573.174, |
| "train_steps_per_second": 28.466 |
| } |
| ], |
| "logging_steps": 1000, |
| "max_steps": 87900, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 5000, |
| "total_flos": 5.6417821488e+17, |
| "train_batch_size": 512, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|