{
  "best_metric": 0.75,
  "best_model_checkpoint": "2024_08_13/checkpoint-62",
  "epoch": 29.76,
  "eval_steps": 500,
  "global_step": 930,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.32,
      "grad_norm": 10.543134689331055,
      "learning_rate": 1.0752688172043011e-07,
      "loss": 0.7672,
      "step": 10
    },
    {
      "epoch": 0.64,
      "grad_norm": 3.446369171142578,
      "learning_rate": 2.1505376344086022e-07,
      "loss": 0.7681,
      "step": 20
    },
    {
      "epoch": 0.96,
      "grad_norm": 11.52937126159668,
      "learning_rate": 3.225806451612903e-07,
      "loss": 0.7191,
      "step": 30
    },
    {
      "epoch": 0.992,
      "eval_accuracy": 0.25,
      "eval_loss": 0.7458651065826416,
      "eval_runtime": 1.1044,
      "eval_samples_per_second": 28.975,
      "eval_steps_per_second": 28.975,
      "step": 31
    },
    {
      "epoch": 1.28,
      "grad_norm": 5.57147741317749,
      "learning_rate": 4.3010752688172043e-07,
      "loss": 0.7269,
      "step": 40
    },
    {
      "epoch": 1.6,
      "grad_norm": 5.154497146606445,
      "learning_rate": 5.376344086021505e-07,
      "loss": 0.7094,
      "step": 50
    },
    {
      "epoch": 1.92,
      "grad_norm": 7.2774434089660645,
      "learning_rate": 6.451612903225806e-07,
      "loss": 0.6894,
      "step": 60
    },
    {
      "epoch": 1.984,
      "eval_accuracy": 0.75,
      "eval_loss": 0.6786516308784485,
      "eval_runtime": 1.183,
      "eval_samples_per_second": 27.05,
      "eval_steps_per_second": 27.05,
      "step": 62
    },
    {
      "epoch": 2.24,
      "grad_norm": 3.891305923461914,
      "learning_rate": 7.526881720430107e-07,
      "loss": 0.6601,
      "step": 70
    },
    {
      "epoch": 2.56,
      "grad_norm": 4.829281330108643,
      "learning_rate": 8.602150537634409e-07,
      "loss": 0.6542,
      "step": 80
    },
    {
      "epoch": 2.88,
      "grad_norm": 4.894190311431885,
      "learning_rate": 9.67741935483871e-07,
      "loss": 0.5993,
      "step": 90
    },
    {
      "epoch": 2.976,
      "eval_accuracy": 0.75,
      "eval_loss": 0.6089950799942017,
      "eval_runtime": 1.1112,
      "eval_samples_per_second": 28.798,
      "eval_steps_per_second": 28.798,
      "step": 93
    },
    {
      "epoch": 3.2,
      "grad_norm": 3.572125196456909,
      "learning_rate": 9.91636798088411e-07,
      "loss": 0.5845,
      "step": 100
    },
    {
      "epoch": 3.52,
      "grad_norm": 6.358020305633545,
      "learning_rate": 9.79689366786141e-07,
      "loss": 0.5637,
      "step": 110
    },
    {
      "epoch": 3.84,
      "grad_norm": 2.5889291763305664,
      "learning_rate": 9.67741935483871e-07,
      "loss": 0.5858,
      "step": 120
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.75,
      "eval_loss": 0.5701560974121094,
      "eval_runtime": 1.1543,
      "eval_samples_per_second": 27.722,
      "eval_steps_per_second": 27.722,
      "step": 125
    },
    {
      "epoch": 4.16,
      "grad_norm": 5.341928482055664,
      "learning_rate": 9.557945041816009e-07,
      "loss": 0.559,
      "step": 130
    },
    {
      "epoch": 4.48,
      "grad_norm": 5.699573993682861,
      "learning_rate": 9.438470728793309e-07,
      "loss": 0.4752,
      "step": 140
    },
    {
      "epoch": 4.8,
      "grad_norm": 6.19121789932251,
      "learning_rate": 9.318996415770609e-07,
      "loss": 0.5407,
      "step": 150
    },
    {
      "epoch": 4.992,
      "eval_accuracy": 0.75,
      "eval_loss": 0.5572408437728882,
      "eval_runtime": 1.1052,
      "eval_samples_per_second": 28.954,
      "eval_steps_per_second": 28.954,
      "step": 156
    },
    {
      "epoch": 5.12,
      "grad_norm": 2.574436664581299,
      "learning_rate": 9.199522102747909e-07,
      "loss": 0.6231,
      "step": 160
    },
    {
      "epoch": 5.44,
      "grad_norm": 5.261692523956299,
      "learning_rate": 9.080047789725208e-07,
      "loss": 0.4838,
      "step": 170
    },
    {
      "epoch": 5.76,
      "grad_norm": 5.266414642333984,
      "learning_rate": 8.960573476702509e-07,
      "loss": 0.6552,
      "step": 180
    },
    {
      "epoch": 5.984,
      "eval_accuracy": 0.75,
      "eval_loss": 0.5552529692649841,
      "eval_runtime": 1.1351,
      "eval_samples_per_second": 28.192,
      "eval_steps_per_second": 28.192,
      "step": 187
    },
    {
      "epoch": 6.08,
      "grad_norm": 5.3043341636657715,
      "learning_rate": 8.841099163679809e-07,
      "loss": 0.5321,
      "step": 190
    },
    {
      "epoch": 6.4,
      "grad_norm": 3.8107211589813232,
      "learning_rate": 8.721624850657109e-07,
      "loss": 0.5797,
      "step": 200
    },
    {
      "epoch": 6.72,
      "grad_norm": 13.660761833190918,
      "learning_rate": 8.602150537634409e-07,
      "loss": 0.5562,
      "step": 210
    },
    {
      "epoch": 6.976,
      "eval_accuracy": 0.75,
      "eval_loss": 0.552901029586792,
      "eval_runtime": 1.1121,
      "eval_samples_per_second": 28.774,
      "eval_steps_per_second": 28.774,
      "step": 218
    },
    {
      "epoch": 7.04,
      "grad_norm": 5.3051605224609375,
      "learning_rate": 8.482676224611708e-07,
      "loss": 0.4844,
      "step": 220
    },
    {
      "epoch": 7.36,
      "grad_norm": 3.601945400238037,
      "learning_rate": 8.363201911589009e-07,
      "loss": 0.6067,
      "step": 230
    },
    {
      "epoch": 7.68,
      "grad_norm": 5.2441229820251465,
      "learning_rate": 8.243727598566307e-07,
      "loss": 0.519,
      "step": 240
    },
    {
      "epoch": 8.0,
      "grad_norm": 8.390249252319336,
      "learning_rate": 8.124253285543607e-07,
      "loss": 0.6054,
      "step": 250
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.75,
      "eval_loss": 0.5519319772720337,
      "eval_runtime": 1.1259,
      "eval_samples_per_second": 28.423,
      "eval_steps_per_second": 28.423,
      "step": 250
    },
    {
      "epoch": 8.32,
      "grad_norm": 4.872560024261475,
      "learning_rate": 8.004778972520908e-07,
      "loss": 0.4244,
      "step": 260
    },
    {
      "epoch": 8.64,
      "grad_norm": 4.932515621185303,
      "learning_rate": 7.885304659498207e-07,
      "loss": 0.501,
      "step": 270
    },
    {
      "epoch": 8.96,
      "grad_norm": 10.174718856811523,
      "learning_rate": 7.765830346475507e-07,
      "loss": 0.7563,
      "step": 280
    },
    {
      "epoch": 8.992,
      "eval_accuracy": 0.75,
      "eval_loss": 0.5517733097076416,
      "eval_runtime": 1.1268,
      "eval_samples_per_second": 28.399,
      "eval_steps_per_second": 28.399,
      "step": 281
    },
    {
      "epoch": 9.28,
      "grad_norm": 10.531723022460938,
      "learning_rate": 7.646356033452807e-07,
      "loss": 0.7409,
      "step": 290
    },
    {
      "epoch": 9.6,
      "grad_norm": 5.187292098999023,
      "learning_rate": 7.526881720430107e-07,
      "loss": 0.4602,
      "step": 300
    },
    {
      "epoch": 9.92,
      "grad_norm": 5.227697849273682,
      "learning_rate": 7.407407407407406e-07,
      "loss": 0.5174,
      "step": 310
    },
    {
      "epoch": 9.984,
      "eval_accuracy": 0.75,
      "eval_loss": 0.5523006319999695,
      "eval_runtime": 1.1419,
      "eval_samples_per_second": 28.023,
      "eval_steps_per_second": 28.023,
      "step": 312
    },
    {
      "epoch": 10.24,
      "grad_norm": 3.8940961360931396,
      "learning_rate": 7.287933094384707e-07,
      "loss": 0.4655,
      "step": 320
    },
    {
      "epoch": 10.56,
      "grad_norm": 9.157276153564453,
      "learning_rate": 7.168458781362007e-07,
      "loss": 0.8298,
      "step": 330
    },
    {
      "epoch": 10.88,
      "grad_norm": 6.260276794433594,
      "learning_rate": 7.048984468339306e-07,
      "loss": 0.3765,
      "step": 340
    },
    {
      "epoch": 10.975999999999999,
      "eval_accuracy": 0.75,
      "eval_loss": 0.5513983964920044,
      "eval_runtime": 1.3718,
      "eval_samples_per_second": 23.327,
      "eval_steps_per_second": 23.327,
      "step": 343
    },
    {
      "epoch": 11.2,
      "grad_norm": 6.0307416915893555,
      "learning_rate": 6.929510155316607e-07,
      "loss": 0.409,
      "step": 350
    },
    {
      "epoch": 11.52,
      "grad_norm": 7.104811191558838,
      "learning_rate": 6.810035842293906e-07,
      "loss": 0.5515,
      "step": 360
    },
    {
      "epoch": 11.84,
      "grad_norm": 1.7931679487228394,
      "learning_rate": 6.690561529271206e-07,
      "loss": 0.5727,
      "step": 370
    },
    {
      "epoch": 12.0,
      "eval_accuracy": 0.75,
      "eval_loss": 0.5506787300109863,
      "eval_runtime": 1.1592,
      "eval_samples_per_second": 27.606,
      "eval_steps_per_second": 27.606,
      "step": 375
    },
    {
      "epoch": 12.16,
      "grad_norm": 15.06185245513916,
      "learning_rate": 6.571087216248506e-07,
      "loss": 0.6646,
      "step": 380
    },
    {
      "epoch": 12.48,
      "grad_norm": 4.5719828605651855,
      "learning_rate": 6.451612903225806e-07,
      "loss": 0.6321,
      "step": 390
    },
    {
      "epoch": 12.8,
      "grad_norm": 10.500142097473145,
      "learning_rate": 6.332138590203107e-07,
      "loss": 0.5613,
      "step": 400
    },
    {
      "epoch": 12.992,
      "eval_accuracy": 0.75,
      "eval_loss": 0.5510138273239136,
      "eval_runtime": 1.1313,
      "eval_samples_per_second": 28.287,
      "eval_steps_per_second": 28.287,
      "step": 406
    },
    {
      "epoch": 13.12,
      "grad_norm": 3.0991406440734863,
      "learning_rate": 6.212664277180406e-07,
      "loss": 0.3966,
      "step": 410
    },
    {
      "epoch": 13.44,
      "grad_norm": 2.3058762550354004,
      "learning_rate": 6.093189964157706e-07,
      "loss": 0.5845,
      "step": 420
    },
    {
      "epoch": 13.76,
      "grad_norm": 2.215249538421631,
      "learning_rate": 5.973715651135006e-07,
      "loss": 0.568,
      "step": 430
    },
    {
      "epoch": 13.984,
      "eval_accuracy": 0.75,
      "eval_loss": 0.5510228872299194,
      "eval_runtime": 1.3316,
      "eval_samples_per_second": 24.031,
      "eval_steps_per_second": 24.031,
      "step": 437
    },
    {
      "epoch": 14.08,
      "grad_norm": 2.978492021560669,
      "learning_rate": 5.854241338112306e-07,
      "loss": 0.5611,
      "step": 440
    },
    {
      "epoch": 14.4,
      "grad_norm": 3.9042763710021973,
      "learning_rate": 5.734767025089605e-07,
      "loss": 0.4335,
      "step": 450
    },
    {
      "epoch": 14.72,
      "grad_norm": 8.8019380569458,
      "learning_rate": 5.615292712066906e-07,
      "loss": 0.6655,
      "step": 460
    },
    {
      "epoch": 14.975999999999999,
      "eval_accuracy": 0.75,
      "eval_loss": 0.5513969659805298,
      "eval_runtime": 1.1115,
      "eval_samples_per_second": 28.791,
      "eval_steps_per_second": 28.791,
      "step": 468
    },
    {
      "epoch": 15.04,
      "grad_norm": 3.463810920715332,
      "learning_rate": 5.495818399044206e-07,
      "loss": 0.5628,
      "step": 470
    },
    {
      "epoch": 15.36,
      "grad_norm": 1.9772050380706787,
      "learning_rate": 5.376344086021505e-07,
      "loss": 0.504,
      "step": 480
    },
    {
      "epoch": 15.68,
      "grad_norm": 2.6561172008514404,
      "learning_rate": 5.256869772998806e-07,
      "loss": 0.7277,
      "step": 490
    },
    {
      "epoch": 16.0,
      "grad_norm": 5.404987335205078,
      "learning_rate": 5.137395459976105e-07,
      "loss": 0.4883,
      "step": 500
    },
    {
      "epoch": 16.0,
      "eval_accuracy": 0.75,
      "eval_loss": 0.5522246956825256,
      "eval_runtime": 1.157,
      "eval_samples_per_second": 27.658,
      "eval_steps_per_second": 27.658,
      "step": 500
    },
    {
      "epoch": 16.32,
      "grad_norm": 4.906336307525635,
      "learning_rate": 5.017921146953405e-07,
      "loss": 0.4576,
      "step": 510
    },
    {
      "epoch": 16.64,
      "grad_norm": 3.543666124343872,
      "learning_rate": 4.898446833930704e-07,
      "loss": 0.687,
      "step": 520
    },
    {
      "epoch": 16.96,
      "grad_norm": 5.162899017333984,
      "learning_rate": 4.778972520908004e-07,
      "loss": 0.5317,
      "step": 530
    },
    {
      "epoch": 16.992,
      "eval_accuracy": 0.75,
      "eval_loss": 0.5518386960029602,
      "eval_runtime": 1.1469,
      "eval_samples_per_second": 27.901,
      "eval_steps_per_second": 27.901,
      "step": 531
    },
    {
      "epoch": 17.28,
      "grad_norm": 3.6676950454711914,
      "learning_rate": 4.6594982078853044e-07,
      "loss": 0.5024,
      "step": 540
    },
    {
      "epoch": 17.6,
      "grad_norm": 10.735907554626465,
      "learning_rate": 4.540023894862604e-07,
      "loss": 0.6743,
      "step": 550
    },
    {
      "epoch": 17.92,
      "grad_norm": 4.575161457061768,
      "learning_rate": 4.4205495818399044e-07,
      "loss": 0.4501,
      "step": 560
    },
    {
      "epoch": 17.984,
      "eval_accuracy": 0.75,
      "eval_loss": 0.5519962906837463,
      "eval_runtime": 1.0972,
      "eval_samples_per_second": 29.165,
      "eval_steps_per_second": 29.165,
      "step": 562
    },
    {
      "epoch": 18.24,
      "grad_norm": 6.966436862945557,
      "learning_rate": 4.3010752688172043e-07,
      "loss": 0.7276,
      "step": 570
    },
    {
      "epoch": 18.56,
      "grad_norm": 5.1026763916015625,
      "learning_rate": 4.1816009557945043e-07,
      "loss": 0.4801,
      "step": 580
    },
    {
      "epoch": 18.88,
      "grad_norm": 6.751893043518066,
      "learning_rate": 4.0621266427718037e-07,
      "loss": 0.4616,
      "step": 590
    },
    {
      "epoch": 18.976,
      "eval_accuracy": 0.75,
      "eval_loss": 0.551902174949646,
      "eval_runtime": 1.27,
      "eval_samples_per_second": 25.197,
      "eval_steps_per_second": 25.197,
      "step": 593
    },
    {
      "epoch": 19.2,
      "grad_norm": 5.350219249725342,
      "learning_rate": 3.9426523297491037e-07,
      "loss": 0.4631,
      "step": 600
    },
    {
      "epoch": 19.52,
      "grad_norm": 5.136310577392578,
      "learning_rate": 3.8231780167264037e-07,
      "loss": 0.5746,
      "step": 610
    },
    {
      "epoch": 19.84,
      "grad_norm": 4.849793910980225,
      "learning_rate": 3.703703703703703e-07,
      "loss": 0.4522,
      "step": 620
    },
    {
      "epoch": 20.0,
      "eval_accuracy": 0.75,
      "eval_loss": 0.5509653091430664,
      "eval_runtime": 1.1115,
      "eval_samples_per_second": 28.789,
      "eval_steps_per_second": 28.789,
      "step": 625
    },
    {
      "epoch": 20.16,
      "grad_norm": 8.096334457397461,
      "learning_rate": 3.5842293906810036e-07,
      "loss": 0.66,
      "step": 630
    },
    {
      "epoch": 20.48,
      "grad_norm": 11.139561653137207,
      "learning_rate": 3.4647550776583036e-07,
      "loss": 0.4573,
      "step": 640
    },
    {
      "epoch": 20.8,
      "grad_norm": 5.0489583015441895,
      "learning_rate": 3.345280764635603e-07,
      "loss": 0.6326,
      "step": 650
    },
    {
      "epoch": 20.992,
      "eval_accuracy": 0.75,
      "eval_loss": 0.5507452487945557,
      "eval_runtime": 1.2933,
      "eval_samples_per_second": 24.742,
      "eval_steps_per_second": 24.742,
      "step": 656
    },
    {
      "epoch": 21.12,
      "grad_norm": 12.198716163635254,
      "learning_rate": 3.225806451612903e-07,
      "loss": 0.7282,
      "step": 660
    },
    {
      "epoch": 21.44,
      "grad_norm": 4.501183986663818,
      "learning_rate": 3.106332138590203e-07,
      "loss": 0.51,
      "step": 670
    },
    {
      "epoch": 21.76,
      "grad_norm": 5.399625778198242,
      "learning_rate": 2.986857825567503e-07,
      "loss": 0.3828,
      "step": 680
    },
    {
      "epoch": 21.984,
      "eval_accuracy": 0.75,
      "eval_loss": 0.5508217811584473,
      "eval_runtime": 1.1768,
      "eval_samples_per_second": 27.192,
      "eval_steps_per_second": 27.192,
      "step": 687
    },
    {
      "epoch": 22.08,
      "grad_norm": 3.282414436340332,
      "learning_rate": 2.8673835125448024e-07,
      "loss": 0.6789,
      "step": 690
    },
    {
      "epoch": 22.4,
      "grad_norm": 4.816540718078613,
      "learning_rate": 2.747909199522103e-07,
      "loss": 0.5746,
      "step": 700
    },
    {
      "epoch": 22.72,
      "grad_norm": 3.5417306423187256,
      "learning_rate": 2.628434886499403e-07,
      "loss": 0.4283,
      "step": 710
    },
    {
      "epoch": 22.976,
      "eval_accuracy": 0.75,
      "eval_loss": 0.5509472489356995,
      "eval_runtime": 1.2114,
      "eval_samples_per_second": 26.417,
      "eval_steps_per_second": 26.417,
      "step": 718
    },
    {
      "epoch": 23.04,
      "grad_norm": 5.404343605041504,
      "learning_rate": 2.508960573476702e-07,
      "loss": 0.5891,
      "step": 720
    },
    {
      "epoch": 23.36,
      "grad_norm": 4.924590587615967,
      "learning_rate": 2.389486260454002e-07,
      "loss": 0.4529,
      "step": 730
    },
    {
      "epoch": 23.68,
      "grad_norm": 3.1310863494873047,
      "learning_rate": 2.270011947431302e-07,
      "loss": 0.5812,
      "step": 740
    },
    {
      "epoch": 24.0,
      "grad_norm": 5.183323383331299,
      "learning_rate": 2.1505376344086022e-07,
      "loss": 0.6701,
      "step": 750
    },
    {
      "epoch": 24.0,
      "eval_accuracy": 0.75,
      "eval_loss": 0.5505539178848267,
      "eval_runtime": 1.1047,
      "eval_samples_per_second": 28.967,
      "eval_steps_per_second": 28.967,
      "step": 750
    },
    {
      "epoch": 24.32,
      "grad_norm": 11.777995109558105,
      "learning_rate": 2.0310633213859019e-07,
      "loss": 0.5262,
      "step": 760
    },
    {
      "epoch": 24.64,
      "grad_norm": 2.3787074089050293,
      "learning_rate": 1.9115890083632018e-07,
      "loss": 0.4884,
      "step": 770
    },
    {
      "epoch": 24.96,
      "grad_norm": 5.162797927856445,
      "learning_rate": 1.7921146953405018e-07,
      "loss": 0.6157,
      "step": 780
    },
    {
      "epoch": 24.992,
      "eval_accuracy": 0.75,
      "eval_loss": 0.5503212213516235,
      "eval_runtime": 1.2194,
      "eval_samples_per_second": 26.243,
      "eval_steps_per_second": 26.243,
      "step": 781
    },
    {
      "epoch": 25.28,
      "grad_norm": 4.399529933929443,
      "learning_rate": 1.6726403823178015e-07,
      "loss": 0.5277,
      "step": 790
    },
    {
      "epoch": 25.6,
      "grad_norm": 8.869352340698242,
      "learning_rate": 1.5531660692951015e-07,
      "loss": 0.6222,
      "step": 800
    },
    {
      "epoch": 25.92,
      "grad_norm": 3.5912718772888184,
      "learning_rate": 1.4336917562724012e-07,
      "loss": 0.5657,
      "step": 810
    },
    {
      "epoch": 25.984,
      "eval_accuracy": 0.75,
      "eval_loss": 0.5502746105194092,
      "eval_runtime": 1.0991,
      "eval_samples_per_second": 29.116,
      "eval_steps_per_second": 29.116,
      "step": 812
    },
    {
      "epoch": 26.24,
      "grad_norm": 4.004587173461914,
      "learning_rate": 1.3142174432497014e-07,
      "loss": 0.5406,
      "step": 820
    },
    {
      "epoch": 26.56,
      "grad_norm": 6.145273208618164,
      "learning_rate": 1.194743130227001e-07,
      "loss": 0.6507,
      "step": 830
    },
    {
      "epoch": 26.88,
      "grad_norm": 2.8329687118530273,
      "learning_rate": 1.0752688172043011e-07,
      "loss": 0.5127,
      "step": 840
    },
    {
      "epoch": 26.976,
      "eval_accuracy": 0.75,
      "eval_loss": 0.5503281354904175,
      "eval_runtime": 1.277,
      "eval_samples_per_second": 25.058,
      "eval_steps_per_second": 25.058,
      "step": 843
    },
    {
      "epoch": 27.2,
      "grad_norm": 5.049647808074951,
      "learning_rate": 9.557945041816009e-08,
      "loss": 0.4542,
      "step": 850
    },
    {
      "epoch": 27.52,
      "grad_norm": 2.4863924980163574,
      "learning_rate": 8.363201911589008e-08,
      "loss": 0.5172,
      "step": 860
    },
    {
      "epoch": 27.84,
      "grad_norm": 5.455885887145996,
      "learning_rate": 7.168458781362006e-08,
      "loss": 0.6178,
      "step": 870
    },
    {
      "epoch": 28.0,
      "eval_accuracy": 0.75,
      "eval_loss": 0.5502800941467285,
      "eval_runtime": 1.1133,
      "eval_samples_per_second": 28.744,
      "eval_steps_per_second": 28.744,
      "step": 875
    },
    {
      "epoch": 28.16,
      "grad_norm": 5.835262298583984,
      "learning_rate": 5.973715651135006e-08,
      "loss": 0.4629,
      "step": 880
    },
    {
      "epoch": 28.48,
      "grad_norm": 5.468852519989014,
      "learning_rate": 4.7789725209080046e-08,
      "loss": 0.381,
      "step": 890
    },
    {
      "epoch": 28.8,
      "grad_norm": 4.890566825866699,
      "learning_rate": 3.584229390681003e-08,
      "loss": 0.5679,
      "step": 900
    },
    {
      "epoch": 28.992,
      "eval_accuracy": 0.75,
      "eval_loss": 0.5502068996429443,
      "eval_runtime": 1.1769,
      "eval_samples_per_second": 27.191,
      "eval_steps_per_second": 27.191,
      "step": 906
    },
    {
      "epoch": 29.12,
      "grad_norm": 5.2326884269714355,
      "learning_rate": 2.3894862604540023e-08,
      "loss": 0.7681,
      "step": 910
    },
    {
      "epoch": 29.44,
      "grad_norm": 8.419669151306152,
      "learning_rate": 1.1947431302270011e-08,
      "loss": 0.6077,
      "step": 920
    },
    {
      "epoch": 29.76,
      "grad_norm": 2.1025705337524414,
      "learning_rate": 0.0,
      "loss": 0.6102,
      "step": 930
    },
    {
      "epoch": 29.76,
      "eval_accuracy": 0.75,
      "eval_loss": 0.5502274632453918,
      "eval_runtime": 1.6314,
      "eval_samples_per_second": 19.615,
      "eval_steps_per_second": 19.615,
      "step": 930
    },
    {
      "epoch": 29.76,
      "step": 930,
      "total_flos": 3.8132430847082496e+17,
      "train_loss": 0.5704158216394404,
      "train_runtime": 536.3007,
      "train_samples_per_second": 6.992,
      "train_steps_per_second": 1.734
    }
  ],
  "logging_steps": 10,
  "max_steps": 930,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 30,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.8132430847082496e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}