| { | |
| "best_metric": 0.17754687368869781, | |
| "best_model_checkpoint": "results/checkpoint-25000", | |
| "epoch": 9.998720081914758, | |
| "eval_steps": 500, | |
| "global_step": 26040, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.038397542557276336, | |
| "grad_norm": 1.0079567432403564, | |
| "learning_rate": 9.999643338380885e-06, | |
| "loss": 5.5723, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.07679508511455267, | |
| "grad_norm": 0.6461474299430847, | |
| "learning_rate": 9.998558958654982e-06, | |
| "loss": 2.2782, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.115192627671829, | |
| "grad_norm": 0.4909125566482544, | |
| "learning_rate": 9.996746982275233e-06, | |
| "loss": 1.8047, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.15359017022910534, | |
| "grad_norm": 0.47547289729118347, | |
| "learning_rate": 9.994207672995245e-06, | |
| "loss": 1.5821, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.19198771278638166, | |
| "grad_norm": 0.41358837485313416, | |
| "learning_rate": 9.99094140044013e-06, | |
| "loss": 1.4754, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.19198771278638166, | |
| "eval_valid_loss": 1.4288749694824219, | |
| "eval_valid_runtime": 4.7117, | |
| "eval_valid_samples_per_second": 212.238, | |
| "eval_valid_steps_per_second": 6.792, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.19198771278638166, | |
| "eval_valid_target_loss": 1.4590624570846558, | |
| "eval_valid_target_runtime": 4.684, | |
| "eval_valid_target_samples_per_second": 213.493, | |
| "eval_valid_target_steps_per_second": 6.832, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.230385255343658, | |
| "grad_norm": 0.43229448795318604, | |
| "learning_rate": 9.986948640052719e-06, | |
| "loss": 1.4087, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.26878279790093434, | |
| "grad_norm": 0.528977632522583, | |
| "learning_rate": 9.982229973024328e-06, | |
| "loss": 1.3245, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.3071803404582107, | |
| "grad_norm": 0.5489594340324402, | |
| "learning_rate": 9.976786086210186e-06, | |
| "loss": 1.0455, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.34557788301548703, | |
| "grad_norm": 0.5119125843048096, | |
| "learning_rate": 9.970617772029439e-06, | |
| "loss": 0.7605, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.3839754255727633, | |
| "grad_norm": 0.5092576146125793, | |
| "learning_rate": 9.963725928349814e-06, | |
| "loss": 0.6005, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.3839754255727633, | |
| "eval_valid_loss": 0.49165624380111694, | |
| "eval_valid_runtime": 4.674, | |
| "eval_valid_samples_per_second": 213.951, | |
| "eval_valid_steps_per_second": 6.846, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.3839754255727633, | |
| "eval_valid_target_loss": 0.5142187476158142, | |
| "eval_valid_target_runtime": 4.6758, | |
| "eval_valid_target_samples_per_second": 213.869, | |
| "eval_valid_target_steps_per_second": 6.844, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.42237296813003966, | |
| "grad_norm": 0.43798330426216125, | |
| "learning_rate": 9.956111558356915e-06, | |
| "loss": 0.4887, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.460770510687316, | |
| "grad_norm": 0.3737218379974365, | |
| "learning_rate": 9.947775770408207e-06, | |
| "loss": 0.4307, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.49916805324459235, | |
| "grad_norm": 0.4303857386112213, | |
| "learning_rate": 9.938719777871674e-06, | |
| "loss": 0.4027, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.5375655958018687, | |
| "grad_norm": 0.3709202706813812, | |
| "learning_rate": 9.92894489894921e-06, | |
| "loss": 0.3799, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.575963138359145, | |
| "grad_norm": 0.3918135464191437, | |
| "learning_rate": 9.918452556484728e-06, | |
| "loss": 0.3633, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.575963138359145, | |
| "eval_valid_loss": 0.33228906989097595, | |
| "eval_valid_runtime": 4.7244, | |
| "eval_valid_samples_per_second": 211.669, | |
| "eval_valid_steps_per_second": 6.773, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.575963138359145, | |
| "eval_valid_target_loss": 0.3428671956062317, | |
| "eval_valid_target_runtime": 4.6595, | |
| "eval_valid_target_samples_per_second": 214.617, | |
| "eval_valid_target_steps_per_second": 6.868, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.6143606809164214, | |
| "grad_norm": 0.3580816686153412, | |
| "learning_rate": 9.907244277757053e-06, | |
| "loss": 0.3565, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.6527582234736977, | |
| "grad_norm": 0.34893009066581726, | |
| "learning_rate": 9.895321694257617e-06, | |
| "loss": 0.3443, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.6911557660309741, | |
| "grad_norm": 0.3050221800804138, | |
| "learning_rate": 9.882686541452967e-06, | |
| "loss": 0.3339, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.7295533085882504, | |
| "grad_norm": 0.3123306632041931, | |
| "learning_rate": 9.869340658532151e-06, | |
| "loss": 0.3278, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.7679508511455266, | |
| "grad_norm": 0.31590646505355835, | |
| "learning_rate": 9.85528598813901e-06, | |
| "loss": 0.32, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.7679508511455266, | |
| "eval_valid_loss": 0.29783594608306885, | |
| "eval_valid_runtime": 4.6776, | |
| "eval_valid_samples_per_second": 213.785, | |
| "eval_valid_steps_per_second": 6.841, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.7679508511455266, | |
| "eval_valid_target_loss": 0.3129218816757202, | |
| "eval_valid_target_runtime": 4.6598, | |
| "eval_valid_target_samples_per_second": 214.601, | |
| "eval_valid_target_steps_per_second": 6.867, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.806348393702803, | |
| "grad_norm": 0.2684693932533264, | |
| "learning_rate": 9.840524576089392e-06, | |
| "loss": 0.3194, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.8447459362600793, | |
| "grad_norm": 0.30888476967811584, | |
| "learning_rate": 9.82505857107337e-06, | |
| "loss": 0.3108, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.8831434788173557, | |
| "grad_norm": 0.2777215242385864, | |
| "learning_rate": 9.808890224342476e-06, | |
| "loss": 0.3105, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.921541021374632, | |
| "grad_norm": 0.30601397156715393, | |
| "learning_rate": 9.792021889381995e-06, | |
| "loss": 0.3055, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.9599385639319084, | |
| "grad_norm": 0.2972748875617981, | |
| "learning_rate": 9.774456021568404e-06, | |
| "loss": 0.3008, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.9599385639319084, | |
| "eval_valid_loss": 0.27842968702316284, | |
| "eval_valid_runtime": 4.68, | |
| "eval_valid_samples_per_second": 213.675, | |
| "eval_valid_steps_per_second": 6.838, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.9599385639319084, | |
| "eval_valid_target_loss": 0.2943359315395355, | |
| "eval_valid_target_runtime": 4.6764, | |
| "eval_valid_target_samples_per_second": 213.838, | |
| "eval_valid_target_steps_per_second": 6.843, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.9983361064891847, | |
| "grad_norm": 0.2891447842121124, | |
| "learning_rate": 9.756195177811953e-06, | |
| "loss": 0.2969, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.036733649046461, | |
| "grad_norm": 0.30049994587898254, | |
| "learning_rate": 9.737242016184486e-06, | |
| "loss": 0.2913, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.0751311916037374, | |
| "grad_norm": 0.25728341937065125, | |
| "learning_rate": 9.717599295532518e-06, | |
| "loss": 0.2911, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.1135287341610136, | |
| "grad_norm": 0.31619054079055786, | |
| "learning_rate": 9.697269875075667e-06, | |
| "loss": 0.2879, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.15192627671829, | |
| "grad_norm": 0.3005208671092987, | |
| "learning_rate": 9.676256713990448e-06, | |
| "loss": 0.2839, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.15192627671829, | |
| "eval_valid_loss": 0.2648593783378601, | |
| "eval_valid_runtime": 4.6888, | |
| "eval_valid_samples_per_second": 213.274, | |
| "eval_valid_steps_per_second": 6.825, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.15192627671829, | |
| "eval_valid_target_loss": 0.2817968726158142, | |
| "eval_valid_target_runtime": 4.6695, | |
| "eval_valid_target_samples_per_second": 214.155, | |
| "eval_valid_target_steps_per_second": 6.853, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.1903238192755663, | |
| "grad_norm": 0.24846772849559784, | |
| "learning_rate": 9.654562870979545e-06, | |
| "loss": 0.2803, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.2287213618328428, | |
| "grad_norm": 0.23501113057136536, | |
| "learning_rate": 9.632191503826574e-06, | |
| "loss": 0.278, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.267118904390119, | |
| "grad_norm": 0.27793240547180176, | |
| "learning_rate": 9.609145868936434e-06, | |
| "loss": 0.2776, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.3055164469473954, | |
| "grad_norm": 0.2599338889122009, | |
| "learning_rate": 9.5854293208613e-06, | |
| "loss": 0.275, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.3439139895046717, | |
| "grad_norm": 0.2431340515613556, | |
| "learning_rate": 9.561045311812335e-06, | |
| "loss": 0.2722, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.3439139895046717, | |
| "eval_valid_loss": 0.2545468807220459, | |
| "eval_valid_runtime": 4.7131, | |
| "eval_valid_samples_per_second": 212.177, | |
| "eval_valid_steps_per_second": 6.79, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.3439139895046717, | |
| "eval_valid_target_loss": 0.27326563000679016, | |
| "eval_valid_target_runtime": 4.6635, | |
| "eval_valid_target_samples_per_second": 214.433, | |
| "eval_valid_target_steps_per_second": 6.862, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.382311532061948, | |
| "grad_norm": 0.28658226132392883, | |
| "learning_rate": 9.535997391157174e-06, | |
| "loss": 0.2693, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.4207090746192244, | |
| "grad_norm": 0.26118528842926025, | |
| "learning_rate": 9.510289204903273e-06, | |
| "loss": 0.2667, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.4591066171765008, | |
| "grad_norm": 0.2761940062046051, | |
| "learning_rate": 9.483924495167204e-06, | |
| "loss": 0.2654, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.497504159733777, | |
| "grad_norm": 0.2712952792644501, | |
| "learning_rate": 9.456907099629933e-06, | |
| "loss": 0.2642, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.5359017022910533, | |
| "grad_norm": 0.23100949823856354, | |
| "learning_rate": 9.429240950978212e-06, | |
| "loss": 0.2622, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.5359017022910533, | |
| "eval_valid_loss": 0.24485936760902405, | |
| "eval_valid_runtime": 4.6751, | |
| "eval_valid_samples_per_second": 213.9, | |
| "eval_valid_steps_per_second": 6.845, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.5359017022910533, | |
| "eval_valid_target_loss": 0.2641640603542328, | |
| "eval_valid_target_runtime": 4.6656, | |
| "eval_valid_target_samples_per_second": 214.335, | |
| "eval_valid_target_steps_per_second": 6.859, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.5742992448483297, | |
| "grad_norm": 0.2676081359386444, | |
| "learning_rate": 9.400930076332126e-06, | |
| "loss": 0.2602, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.6126967874056062, | |
| "grad_norm": 0.24242335557937622, | |
| "learning_rate": 9.371978596658904e-06, | |
| "loss": 0.2573, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.6510943299628824, | |
| "grad_norm": 0.27868130803108215, | |
| "learning_rate": 9.342390726173065e-06, | |
| "loss": 0.2574, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.6894918725201586, | |
| "grad_norm": 0.2644180655479431, | |
| "learning_rate": 9.31217077172299e-06, | |
| "loss": 0.255, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.727889415077435, | |
| "grad_norm": 0.2352069914340973, | |
| "learning_rate": 9.281323132164013e-06, | |
| "loss": 0.2538, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.727889415077435, | |
| "eval_valid_loss": 0.2354765683412552, | |
| "eval_valid_runtime": 4.7068, | |
| "eval_valid_samples_per_second": 212.461, | |
| "eval_valid_steps_per_second": 6.799, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.727889415077435, | |
| "eval_valid_target_loss": 0.25502344965934753, | |
| "eval_valid_target_runtime": 4.6612, | |
| "eval_valid_target_samples_per_second": 214.536, | |
| "eval_valid_target_steps_per_second": 6.865, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.7662869576347113, | |
| "grad_norm": 0.28041261434555054, | |
| "learning_rate": 9.249852297718116e-06, | |
| "loss": 0.2507, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.8046845001919878, | |
| "grad_norm": 0.2735440135002136, | |
| "learning_rate": 9.217762849320334e-06, | |
| "loss": 0.2496, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.843082042749264, | |
| "grad_norm": 0.26316097378730774, | |
| "learning_rate": 9.185059457951933e-06, | |
| "loss": 0.2479, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.8814795853065402, | |
| "grad_norm": 0.23891638219356537, | |
| "learning_rate": 9.151746883960512e-06, | |
| "loss": 0.2457, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.9198771278638167, | |
| "grad_norm": 0.22432874143123627, | |
| "learning_rate": 9.117829976367072e-06, | |
| "loss": 0.2446, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.9198771278638167, | |
| "eval_valid_loss": 0.2283046841621399, | |
| "eval_valid_runtime": 4.6829, | |
| "eval_valid_samples_per_second": 213.544, | |
| "eval_valid_steps_per_second": 6.833, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.9198771278638167, | |
| "eval_valid_target_loss": 0.24809375405311584, | |
| "eval_valid_target_runtime": 4.6694, | |
| "eval_valid_target_samples_per_second": 214.162, | |
| "eval_valid_target_steps_per_second": 6.853, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.9582746704210932, | |
| "grad_norm": 0.27488961815834045, | |
| "learning_rate": 9.08331367216019e-06, | |
| "loss": 0.2434, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.9966722129783694, | |
| "grad_norm": 0.2284267097711563, | |
| "learning_rate": 9.048202995577383e-06, | |
| "loss": 0.24, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 2.0350697555356456, | |
| "grad_norm": 0.2710357904434204, | |
| "learning_rate": 9.012503057373769e-06, | |
| "loss": 0.2399, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 2.073467298092922, | |
| "grad_norm": 0.24398750066757202, | |
| "learning_rate": 8.976219054078147e-06, | |
| "loss": 0.2391, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 2.1118648406501985, | |
| "grad_norm": 0.24732039868831635, | |
| "learning_rate": 8.939356267236582e-06, | |
| "loss": 0.2374, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 2.1118648406501985, | |
| "eval_valid_loss": 0.22253906726837158, | |
| "eval_valid_runtime": 4.6969, | |
| "eval_valid_samples_per_second": 212.904, | |
| "eval_valid_steps_per_second": 6.813, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 2.1118648406501985, | |
| "eval_valid_target_loss": 0.24240624904632568, | |
| "eval_valid_target_runtime": 4.6761, | |
| "eval_valid_target_samples_per_second": 213.853, | |
| "eval_valid_target_steps_per_second": 6.843, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 2.1502623832074748, | |
| "grad_norm": 0.23949123919010162, | |
| "learning_rate": 8.901920062643607e-06, | |
| "loss": 0.2368, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 2.188659925764751, | |
| "grad_norm": 0.26010605692863464, | |
| "learning_rate": 8.863915889561188e-06, | |
| "loss": 0.2351, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 2.2270574683220272, | |
| "grad_norm": 0.2524034380912781, | |
| "learning_rate": 8.825349279925506e-06, | |
| "loss": 0.2333, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 2.265455010879304, | |
| "grad_norm": 0.24745632708072662, | |
| "learning_rate": 8.78622584754173e-06, | |
| "loss": 0.2323, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 2.30385255343658, | |
| "grad_norm": 0.2586907148361206, | |
| "learning_rate": 8.746551287266863e-06, | |
| "loss": 0.2312, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.30385255343658, | |
| "eval_valid_loss": 0.216859370470047, | |
| "eval_valid_runtime": 4.6709, | |
| "eval_valid_samples_per_second": 214.092, | |
| "eval_valid_steps_per_second": 6.851, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.30385255343658, | |
| "eval_valid_target_loss": 0.23771093785762787, | |
| "eval_valid_target_runtime": 4.6848, | |
| "eval_valid_target_samples_per_second": 213.455, | |
| "eval_valid_target_steps_per_second": 6.831, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.3422500959938564, | |
| "grad_norm": 0.24499697983264923, | |
| "learning_rate": 8.706331374180792e-06, | |
| "loss": 0.2301, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 2.3806476385511326, | |
| "grad_norm": 0.24237163364887238, | |
| "learning_rate": 8.665571962745655e-06, | |
| "loss": 0.2304, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 2.419045181108409, | |
| "grad_norm": 0.27395910024642944, | |
| "learning_rate": 8.624278985953665e-06, | |
| "loss": 0.2287, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 2.4574427236656855, | |
| "grad_norm": 0.2500033378601074, | |
| "learning_rate": 8.582458454463493e-06, | |
| "loss": 0.2279, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 2.4958402662229617, | |
| "grad_norm": 0.2605977952480316, | |
| "learning_rate": 8.540116455725346e-06, | |
| "loss": 0.2277, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.4958402662229617, | |
| "eval_valid_loss": 0.21196874976158142, | |
| "eval_valid_runtime": 4.6941, | |
| "eval_valid_samples_per_second": 213.035, | |
| "eval_valid_steps_per_second": 6.817, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.4958402662229617, | |
| "eval_valid_target_loss": 0.23328906297683716, | |
| "eval_valid_target_runtime": 4.6792, | |
| "eval_valid_target_samples_per_second": 213.712, | |
| "eval_valid_target_steps_per_second": 6.839, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.534237808780238, | |
| "grad_norm": 0.2220095992088318, | |
| "learning_rate": 8.497259153094875e-06, | |
| "loss": 0.2254, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 2.5726353513375146, | |
| "grad_norm": 0.24707047641277313, | |
| "learning_rate": 8.453892784936022e-06, | |
| "loss": 0.2239, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 2.611032893894791, | |
| "grad_norm": 0.23103290796279907, | |
| "learning_rate": 8.41002366371297e-06, | |
| "loss": 0.224, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 2.649430436452067, | |
| "grad_norm": 0.2249547839164734, | |
| "learning_rate": 8.36565817507127e-06, | |
| "loss": 0.2227, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.6878279790093433, | |
| "grad_norm": 0.24457262456417084, | |
| "learning_rate": 8.32080277690836e-06, | |
| "loss": 0.2209, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.6878279790093433, | |
| "eval_valid_loss": 0.20793749392032623, | |
| "eval_valid_runtime": 4.6727, | |
| "eval_valid_samples_per_second": 214.01, | |
| "eval_valid_steps_per_second": 6.848, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.6878279790093433, | |
| "eval_valid_target_loss": 0.22950781881809235, | |
| "eval_valid_target_runtime": 4.6848, | |
| "eval_valid_target_samples_per_second": 213.456, | |
| "eval_valid_target_steps_per_second": 6.831, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.7262255215666196, | |
| "grad_norm": 0.23176012933254242, | |
| "learning_rate": 8.275463998433537e-06, | |
| "loss": 0.2206, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.764623064123896, | |
| "grad_norm": 0.21723733842372894, | |
| "learning_rate": 8.229648439217552e-06, | |
| "loss": 0.2203, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.8030206066811725, | |
| "grad_norm": 0.2428179383277893, | |
| "learning_rate": 8.183362768231971e-06, | |
| "loss": 0.2192, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.8414181492384487, | |
| "grad_norm": 0.2162482738494873, | |
| "learning_rate": 8.136613722878437e-06, | |
| "loss": 0.2183, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.879815691795725, | |
| "grad_norm": 0.22231200337409973, | |
| "learning_rate": 8.08940810800796e-06, | |
| "loss": 0.2177, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.879815691795725, | |
| "eval_valid_loss": 0.20469531416893005, | |
| "eval_valid_runtime": 4.6819, | |
| "eval_valid_samples_per_second": 213.587, | |
| "eval_valid_steps_per_second": 6.835, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.879815691795725, | |
| "eval_valid_target_loss": 0.2264062464237213, | |
| "eval_valid_target_runtime": 4.6647, | |
| "eval_valid_target_samples_per_second": 214.376, | |
| "eval_valid_target_steps_per_second": 6.86, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.9182132343530016, | |
| "grad_norm": 0.2663327157497406, | |
| "learning_rate": 8.041752794930389e-06, | |
| "loss": 0.2172, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 2.956610776910278, | |
| "grad_norm": 0.2545444369316101, | |
| "learning_rate": 7.993654720414227e-06, | |
| "loss": 0.216, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 2.995008319467554, | |
| "grad_norm": 0.2252371460199356, | |
| "learning_rate": 7.9451208856769e-06, | |
| "loss": 0.2154, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 3.0334058620248303, | |
| "grad_norm": 0.2507840394973755, | |
| "learning_rate": 7.896158355365643e-06, | |
| "loss": 0.2151, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 3.0718034045821065, | |
| "grad_norm": 0.22570189833641052, | |
| "learning_rate": 7.846774256529178e-06, | |
| "loss": 0.2131, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 3.0718034045821065, | |
| "eval_valid_loss": 0.2014453113079071, | |
| "eval_valid_runtime": 4.6924, | |
| "eval_valid_samples_per_second": 213.111, | |
| "eval_valid_steps_per_second": 6.82, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 3.0718034045821065, | |
| "eval_valid_target_loss": 0.22346094250679016, | |
| "eval_valid_target_runtime": 4.6655, | |
| "eval_valid_target_samples_per_second": 214.339, | |
| "eval_valid_target_steps_per_second": 6.859, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 3.110200947139383, | |
| "grad_norm": 0.24750301241874695, | |
| "learning_rate": 7.796975777580276e-06, | |
| "loss": 0.2133, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 3.1485984896966595, | |
| "grad_norm": 0.2118765264749527, | |
| "learning_rate": 7.746770167249413e-06, | |
| "loss": 0.2124, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 3.1869960322539357, | |
| "grad_norm": 0.22295965254306793, | |
| "learning_rate": 7.696164733529628e-06, | |
| "loss": 0.2123, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 3.225393574811212, | |
| "grad_norm": 0.2226712554693222, | |
| "learning_rate": 7.645166842612766e-06, | |
| "loss": 0.2115, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 3.2637911173684886, | |
| "grad_norm": 0.22712872922420502, | |
| "learning_rate": 7.593783917817248e-06, | |
| "loss": 0.211, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 3.2637911173684886, | |
| "eval_valid_loss": 0.19893750548362732, | |
| "eval_valid_runtime": 4.6876, | |
| "eval_valid_samples_per_second": 213.327, | |
| "eval_valid_steps_per_second": 6.826, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 3.2637911173684886, | |
| "eval_valid_target_loss": 0.22138281166553497, | |
| "eval_valid_target_runtime": 4.6684, | |
| "eval_valid_target_samples_per_second": 214.206, | |
| "eval_valid_target_steps_per_second": 6.855, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 3.302188659925765, | |
| "grad_norm": 0.20663662254810333, | |
| "learning_rate": 7.5420234385075155e-06, | |
| "loss": 0.211, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 3.340586202483041, | |
| "grad_norm": 0.24639233946800232, | |
| "learning_rate": 7.489892939005333e-06, | |
| "loss": 0.2099, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 3.3789837450403173, | |
| "grad_norm": 0.21435491740703583, | |
| "learning_rate": 7.437400007493079e-06, | |
| "loss": 0.209, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 3.4173812875975935, | |
| "grad_norm": 0.21131959557533264, | |
| "learning_rate": 7.384552284909195e-06, | |
| "loss": 0.2081, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 3.45577883015487, | |
| "grad_norm": 0.2295517921447754, | |
| "learning_rate": 7.3313574638359734e-06, | |
| "loss": 0.2084, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 3.45577883015487, | |
| "eval_valid_loss": 0.19658593833446503, | |
| "eval_valid_runtime": 4.6935, | |
| "eval_valid_samples_per_second": 213.059, | |
| "eval_valid_steps_per_second": 6.818, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 3.45577883015487, | |
| "eval_valid_target_loss": 0.2188750058412552, | |
| "eval_valid_target_runtime": 4.6686, | |
| "eval_valid_target_samples_per_second": 214.199, | |
| "eval_valid_target_steps_per_second": 6.854, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 3.4941763727121464, | |
| "grad_norm": 0.2244088351726532, | |
| "learning_rate": 7.277823287379801e-06, | |
| "loss": 0.2084, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 3.5325739152694227, | |
| "grad_norm": 0.2267696112394333, | |
| "learning_rate": 7.2239575480440774e-06, | |
| "loss": 0.2085, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 3.5709714578266993, | |
| "grad_norm": 0.20846766233444214, | |
| "learning_rate": 7.169768086594913e-06, | |
| "loss": 0.2063, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 3.6093690003839756, | |
| "grad_norm": 0.23632733523845673, | |
| "learning_rate": 7.115262790919827e-06, | |
| "loss": 0.2068, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 3.647766542941252, | |
| "grad_norm": 0.20877471566200256, | |
| "learning_rate": 7.060449594879573e-06, | |
| "loss": 0.2059, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 3.647766542941252, | |
| "eval_valid_loss": 0.19441406428813934, | |
| "eval_valid_runtime": 4.6671, | |
| "eval_valid_samples_per_second": 214.264, | |
| "eval_valid_steps_per_second": 6.856, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 3.647766542941252, | |
| "eval_valid_target_loss": 0.21704687178134918, | |
| "eval_valid_target_runtime": 4.6648, | |
| "eval_valid_target_samples_per_second": 214.371, | |
| "eval_valid_target_steps_per_second": 6.86, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 3.686164085498528, | |
| "grad_norm": 0.20587915182113647, | |
| "learning_rate": 7.0053364771532805e-06, | |
| "loss": 0.2058, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 3.7245616280558043, | |
| "grad_norm": 0.208708256483078, | |
| "learning_rate": 6.949931460077058e-06, | |
| "loss": 0.2052, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 3.7629591706130805, | |
| "grad_norm": 0.21517980098724365, | |
| "learning_rate": 6.894242608476263e-06, | |
| "loss": 0.2049, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 3.801356713170357, | |
| "grad_norm": 0.22570070624351501, | |
| "learning_rate": 6.8382780284915685e-06, | |
| "loss": 0.2047, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 3.8397542557276334, | |
| "grad_norm": 0.22346258163452148, | |
| "learning_rate": 6.782045866399023e-06, | |
| "loss": 0.2037, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 3.8397542557276334, | |
| "eval_valid_loss": 0.1928359419107437, | |
| "eval_valid_runtime": 4.6748, | |
| "eval_valid_samples_per_second": 213.912, | |
| "eval_valid_steps_per_second": 6.845, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 3.8397542557276334, | |
| "eval_valid_target_loss": 0.21531249582767487, | |
| "eval_valid_target_runtime": 4.6773, | |
| "eval_valid_target_samples_per_second": 213.8, | |
| "eval_valid_target_steps_per_second": 6.842, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 3.8781517982849096, | |
| "grad_norm": 0.2544507086277008, | |
| "learning_rate": 6.725554307424274e-06, | |
| "loss": 0.2036, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 3.9165493408421863, | |
| "grad_norm": 0.27723318338394165, | |
| "learning_rate": 6.668811574551106e-06, | |
| "loss": 0.2039, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 3.9549468833994625, | |
| "grad_norm": 0.22496485710144043, | |
| "learning_rate": 6.6118259273245065e-06, | |
| "loss": 0.2032, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 3.9933444259567388, | |
| "grad_norm": 0.22093619406223297, | |
| "learning_rate": 6.55460566064838e-06, | |
| "loss": 0.2027, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 4.031741968514015, | |
| "grad_norm": 0.2137976437807083, | |
| "learning_rate": 6.497159103578143e-06, | |
| "loss": 0.2016, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 4.031741968514015, | |
| "eval_valid_loss": 0.19111718237400055, | |
| "eval_valid_runtime": 4.6833, | |
| "eval_valid_samples_per_second": 213.523, | |
| "eval_valid_steps_per_second": 6.833, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 4.031741968514015, | |
| "eval_valid_target_loss": 0.2142656296491623, | |
| "eval_valid_target_runtime": 4.6587, | |
| "eval_valid_target_samples_per_second": 214.65, | |
| "eval_valid_target_steps_per_second": 6.869, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 4.070139511071291, | |
| "grad_norm": 0.20360158383846283, | |
| "learning_rate": 6.439494618108332e-06, | |
| "loss": 0.2013, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 4.1085370536285675, | |
| "grad_norm": 0.21878282725811005, | |
| "learning_rate": 6.38162059795542e-06, | |
| "loss": 0.2006, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 4.146934596185844, | |
| "grad_norm": 0.2319776862859726, | |
| "learning_rate": 6.323545467336017e-06, | |
| "loss": 0.2012, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 4.185332138743121, | |
| "grad_norm": 0.20898312330245972, | |
| "learning_rate": 6.26527767974063e-06, | |
| "loss": 0.2005, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 4.223729681300397, | |
| "grad_norm": 0.21366915106773376, | |
| "learning_rate": 6.206825716703166e-06, | |
| "loss": 0.2, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 4.223729681300397, | |
| "eval_valid_loss": 0.18977344036102295, | |
| "eval_valid_runtime": 4.7328, | |
| "eval_valid_samples_per_second": 211.293, | |
| "eval_valid_steps_per_second": 6.761, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 4.223729681300397, | |
| "eval_valid_target_loss": 0.21274219453334808, | |
| "eval_valid_target_runtime": 4.6506, | |
| "eval_valid_target_samples_per_second": 215.026, | |
| "eval_valid_target_steps_per_second": 6.881, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 4.262127223857673, | |
| "grad_norm": 0.20968745648860931, | |
| "learning_rate": 6.1481980865663405e-06, | |
| "loss": 0.1993, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 4.3005247664149495, | |
| "grad_norm": 0.20683012902736664, | |
| "learning_rate": 6.089403323243203e-06, | |
| "loss": 0.1992, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 4.338922308972226, | |
| "grad_norm": 0.20785097777843475, | |
| "learning_rate": 6.030449984974916e-06, | |
| "loss": 0.199, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 4.377319851529502, | |
| "grad_norm": 0.20532238483428955, | |
| "learning_rate": 5.971346653085025e-06, | |
| "loss": 0.199, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 4.415717394086778, | |
| "grad_norm": 0.21589842438697815, | |
| "learning_rate": 5.912101930730329e-06, | |
| "loss": 0.1992, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 4.415717394086778, | |
| "eval_valid_loss": 0.18833594024181366, | |
| "eval_valid_runtime": 4.6904, | |
| "eval_valid_samples_per_second": 213.203, | |
| "eval_valid_steps_per_second": 6.823, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 4.415717394086778, | |
| "eval_valid_target_loss": 0.211976557970047, | |
| "eval_valid_target_runtime": 4.658, | |
| "eval_valid_target_samples_per_second": 214.686, | |
| "eval_valid_target_steps_per_second": 6.87, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 4.4541149366440544, | |
| "grad_norm": 0.2021540254354477, | |
| "learning_rate": 5.852724441648614e-06, | |
| "loss": 0.1987, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 4.492512479201331, | |
| "grad_norm": 0.24406403303146362, | |
| "learning_rate": 5.7932228289033506e-06, | |
| "loss": 0.1984, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 4.530910021758608, | |
| "grad_norm": 0.20519228279590607, | |
| "learning_rate": 5.7336057536256216e-06, | |
| "loss": 0.1984, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 4.569307564315884, | |
| "grad_norm": 0.21227143704891205, | |
| "learning_rate": 5.67388189375337e-06, | |
| "loss": 0.1976, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 4.60770510687316, | |
| "grad_norm": 0.2325662076473236, | |
| "learning_rate": 5.614059942768254e-06, | |
| "loss": 0.1977, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 4.60770510687316, | |
| "eval_valid_loss": 0.18742187321186066, | |
| "eval_valid_runtime": 4.6831, | |
| "eval_valid_samples_per_second": 213.535, | |
| "eval_valid_steps_per_second": 6.833, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 4.60770510687316, | |
| "eval_valid_target_loss": 0.21108593046665192, | |
| "eval_valid_target_runtime": 4.6502, | |
| "eval_valid_target_samples_per_second": 215.046, | |
| "eval_valid_target_steps_per_second": 6.881, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 4.6461026494304365, | |
| "grad_norm": 0.2245544046163559, | |
| "learning_rate": 5.554148608430192e-06, | |
| "loss": 0.1965, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 4.684500191987713, | |
| "grad_norm": 0.22662824392318726, | |
| "learning_rate": 5.4941566115098614e-06, | |
| "loss": 0.1971, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 4.722897734544989, | |
| "grad_norm": 0.19245535135269165, | |
| "learning_rate": 5.4340926845192874e-06, | |
| "loss": 0.1974, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 4.761295277102265, | |
| "grad_norm": 0.18942756950855255, | |
| "learning_rate": 5.373965570440729e-06, | |
| "loss": 0.1966, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 4.799692819659541, | |
| "grad_norm": 0.1962059736251831, | |
| "learning_rate": 5.3137840214540395e-06, | |
| "loss": 0.1958, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 4.799692819659541, | |
| "eval_valid_loss": 0.18663281202316284, | |
| "eval_valid_runtime": 4.6972, | |
| "eval_valid_samples_per_second": 212.895, | |
| "eval_valid_steps_per_second": 6.813, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 4.799692819659541, | |
| "eval_valid_target_loss": 0.21009375154972076, | |
| "eval_valid_target_runtime": 4.6708, | |
| "eval_valid_target_samples_per_second": 214.096, | |
| "eval_valid_target_steps_per_second": 6.851, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 4.838090362216818, | |
| "grad_norm": 0.2151457667350769, | |
| "learning_rate": 5.2535567976626846e-06, | |
| "loss": 0.1963, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 4.876487904774095, | |
| "grad_norm": 0.18380814790725708, | |
| "learning_rate": 5.1932926658186166e-06, | |
| "loss": 0.1959, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 4.914885447331371, | |
| "grad_norm": 0.19516663253307343, | |
| "learning_rate": 5.133000398046168e-06, | |
| "loss": 0.1953, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 4.953282989888647, | |
| "grad_norm": 0.24182352423667908, | |
| "learning_rate": 5.072688770565177e-06, | |
| "loss": 0.1953, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 4.9916805324459235, | |
| "grad_norm": 0.23720215260982513, | |
| "learning_rate": 5.012366562413501e-06, | |
| "loss": 0.1955, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 4.9916805324459235, | |
| "eval_valid_loss": 0.18524999916553497, | |
| "eval_valid_runtime": 4.6908, | |
| "eval_valid_samples_per_second": 213.184, | |
| "eval_valid_steps_per_second": 6.822, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 4.9916805324459235, | |
| "eval_valid_target_loss": 0.20893749594688416, | |
| "eval_valid_target_runtime": 4.6667, | |
| "eval_valid_target_samples_per_second": 214.285, | |
| "eval_valid_target_steps_per_second": 6.857, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 5.0300780750032, | |
| "grad_norm": 0.20271484553813934, | |
| "learning_rate": 4.952042554169138e-06, | |
| "loss": 0.1948, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 5.068475617560476, | |
| "grad_norm": 0.2053770273923874, | |
| "learning_rate": 4.891725526672107e-06, | |
| "loss": 0.1947, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 5.106873160117752, | |
| "grad_norm": 0.20811979472637177, | |
| "learning_rate": 4.8314242597463e-06, | |
| "loss": 0.1939, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 5.145270702675028, | |
| "grad_norm": 0.19889037311077118, | |
| "learning_rate": 4.771147530921483e-06, | |
| "loss": 0.1943, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 5.1836682452323055, | |
| "grad_norm": 0.2038932591676712, | |
| "learning_rate": 4.710904114155621e-06, | |
| "loss": 0.1938, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 5.1836682452323055, | |
| "eval_valid_loss": 0.1847265660762787, | |
| "eval_valid_runtime": 4.698, | |
| "eval_valid_samples_per_second": 212.854, | |
| "eval_valid_steps_per_second": 6.811, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 5.1836682452323055, | |
| "eval_valid_target_loss": 0.20839843153953552, | |
| "eval_valid_target_runtime": 4.6593, | |
| "eval_valid_target_samples_per_second": 214.626, | |
| "eval_valid_target_steps_per_second": 6.868, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 5.222065787789582, | |
| "grad_norm": 0.19585560262203217, | |
| "learning_rate": 4.650702778557736e-06, | |
| "loss": 0.1932, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 5.260463330346858, | |
| "grad_norm": 0.23953603208065033, | |
| "learning_rate": 4.59055228711146e-06, | |
| "loss": 0.1933, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 5.298860872904134, | |
| "grad_norm": 0.21477288007736206, | |
| "learning_rate": 4.530461395399485e-06, | |
| "loss": 0.1929, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 5.33725841546141, | |
| "grad_norm": 0.22662727534770966, | |
| "learning_rate": 4.470438850329089e-06, | |
| "loss": 0.1935, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 5.375655958018687, | |
| "grad_norm": 0.18912354111671448, | |
| "learning_rate": 4.410493388858925e-06, | |
| "loss": 0.1931, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 5.375655958018687, | |
| "eval_valid_loss": 0.18379686772823334, | |
| "eval_valid_runtime": 4.6729, | |
| "eval_valid_samples_per_second": 214.001, | |
| "eval_valid_steps_per_second": 6.848, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 5.375655958018687, | |
| "eval_valid_target_loss": 0.20746874809265137, | |
| "eval_valid_target_runtime": 4.6581, | |
| "eval_valid_target_samples_per_second": 214.682, | |
| "eval_valid_target_steps_per_second": 6.87, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 5.414053500575963, | |
| "grad_norm": 0.21155835688114166, | |
| "learning_rate": 4.350633736727259e-06, | |
| "loss": 0.193, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 5.452451043133239, | |
| "grad_norm": 0.2160138338804245, | |
| "learning_rate": 4.29086860718184e-06, | |
| "loss": 0.1931, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 5.490848585690516, | |
| "grad_norm": 0.19270409643650055, | |
| "learning_rate": 4.231206699711587e-06, | |
| "loss": 0.1925, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 5.5292461282477925, | |
| "grad_norm": 0.18501386046409607, | |
| "learning_rate": 4.171656698780281e-06, | |
| "loss": 0.1925, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 5.567643670805069, | |
| "grad_norm": 0.20564299821853638, | |
| "learning_rate": 4.112227272562447e-06, | |
| "loss": 0.1918, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 5.567643670805069, | |
| "eval_valid_loss": 0.18317969143390656, | |
| "eval_valid_runtime": 4.679, | |
| "eval_valid_samples_per_second": 213.72, | |
| "eval_valid_steps_per_second": 6.839, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 5.567643670805069, | |
| "eval_valid_target_loss": 0.20700781047344208, | |
| "eval_valid_target_runtime": 4.674, | |
| "eval_valid_target_samples_per_second": 213.95, | |
| "eval_valid_target_steps_per_second": 6.846, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 5.606041213362345, | |
| "grad_norm": 0.21509169042110443, | |
| "learning_rate": 4.052927071681593e-06, | |
| "loss": 0.1919, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 5.644438755919621, | |
| "grad_norm": 0.18730491399765015, | |
| "learning_rate": 3.99376472795103e-06, | |
| "loss": 0.1921, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 5.682836298476897, | |
| "grad_norm": 0.21269969642162323, | |
| "learning_rate": 3.934748853117398e-06, | |
| "loss": 0.1918, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 5.721233841034174, | |
| "grad_norm": 0.18910899758338928, | |
| "learning_rate": 3.8758880376071415e-06, | |
| "loss": 0.1914, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 5.75963138359145, | |
| "grad_norm": 0.22251802682876587, | |
| "learning_rate": 3.8171908492760665e-06, | |
| "loss": 0.1916, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 5.75963138359145, | |
| "eval_valid_loss": 0.18259374797344208, | |
| "eval_valid_runtime": 4.67, | |
| "eval_valid_samples_per_second": 214.134, | |
| "eval_valid_steps_per_second": 6.852, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 5.75963138359145, | |
| "eval_valid_target_loss": 0.20646093785762787, | |
| "eval_valid_target_runtime": 4.67, | |
| "eval_valid_target_samples_per_second": 214.131, | |
| "eval_valid_target_steps_per_second": 6.852, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 5.798028926148726, | |
| "grad_norm": 0.17328619956970215, | |
| "learning_rate": 3.758665832162203e-06, | |
| "loss": 0.1911, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 5.836426468706003, | |
| "grad_norm": 0.20850612223148346, | |
| "learning_rate": 3.7003215052421116e-06, | |
| "loss": 0.1915, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 5.8748240112632795, | |
| "grad_norm": 0.1912785917520523, | |
| "learning_rate": 3.642166361190859e-06, | |
| "loss": 0.1908, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 5.913221553820556, | |
| "grad_norm": 0.2138790339231491, | |
| "learning_rate": 3.584208865145812e-06, | |
| "loss": 0.1907, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 5.951619096377832, | |
| "grad_norm": 0.19723013043403625, | |
| "learning_rate": 3.5264574534744373e-06, | |
| "loss": 0.1913, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 5.951619096377832, | |
| "eval_valid_loss": 0.1817968785762787, | |
| "eval_valid_runtime": 4.6726, | |
| "eval_valid_samples_per_second": 214.016, | |
| "eval_valid_steps_per_second": 6.849, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 5.951619096377832, | |
| "eval_valid_target_loss": 0.20574218034744263, | |
| "eval_valid_target_runtime": 4.6817, | |
| "eval_valid_target_samples_per_second": 213.599, | |
| "eval_valid_target_steps_per_second": 6.835, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 5.990016638935108, | |
| "grad_norm": 0.19212548434734344, | |
| "learning_rate": 3.4689205325462997e-06, | |
| "loss": 0.1907, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 6.028414181492384, | |
| "grad_norm": 0.19529464840888977, | |
| "learning_rate": 3.4116064775094126e-06, | |
| "loss": 0.1901, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 6.066811724049661, | |
| "grad_norm": 0.2088070809841156, | |
| "learning_rate": 3.354523631071147e-06, | |
| "loss": 0.1902, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 6.105209266606937, | |
| "grad_norm": 0.19294045865535736, | |
| "learning_rate": 3.2976803022838514e-06, | |
| "loss": 0.1903, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 6.143606809164213, | |
| "grad_norm": 0.20844899117946625, | |
| "learning_rate": 3.2410847653353805e-06, | |
| "loss": 0.1897, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 6.143606809164213, | |
| "eval_valid_loss": 0.1809999942779541, | |
| "eval_valid_runtime": 4.6789, | |
| "eval_valid_samples_per_second": 213.724, | |
| "eval_valid_steps_per_second": 6.839, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 6.143606809164213, | |
| "eval_valid_target_loss": 0.20546874403953552, | |
| "eval_valid_target_runtime": 4.6614, | |
| "eval_valid_target_samples_per_second": 214.53, | |
| "eval_valid_target_steps_per_second": 6.865, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 6.18200435172149, | |
| "grad_norm": 0.19932307302951813, | |
| "learning_rate": 3.184745258344688e-06, | |
| "loss": 0.1894, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 6.220401894278766, | |
| "grad_norm": 0.19776058197021484, | |
| "learning_rate": 3.128669982162681e-06, | |
| "loss": 0.1899, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 6.258799436836043, | |
| "grad_norm": 0.20467509329319, | |
| "learning_rate": 3.07286709917849e-06, | |
| "loss": 0.1898, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 6.297196979393319, | |
| "grad_norm": 0.19593088328838348, | |
| "learning_rate": 3.017344732131342e-06, | |
| "loss": 0.1895, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 6.335594521950595, | |
| "grad_norm": 0.20078891515731812, | |
| "learning_rate": 2.9621109629282064e-06, | |
| "loss": 0.1897, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 6.335594521950595, | |
| "eval_valid_loss": 0.1807578057050705, | |
| "eval_valid_runtime": 4.7017, | |
| "eval_valid_samples_per_second": 212.687, | |
| "eval_valid_steps_per_second": 6.806, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 6.335594521950595, | |
| "eval_valid_target_loss": 0.2052578181028366, | |
| "eval_valid_target_runtime": 4.6726, | |
| "eval_valid_target_samples_per_second": 214.013, | |
| "eval_valid_target_steps_per_second": 6.848, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 6.373992064507871, | |
| "grad_norm": 0.17822235822677612, | |
| "learning_rate": 2.9071738314673758e-06, | |
| "loss": 0.1889, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 6.412389607065148, | |
| "grad_norm": 0.21160703897476196, | |
| "learning_rate": 2.8525413344681797e-06, | |
| "loss": 0.1889, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 6.450787149622424, | |
| "grad_norm": 0.19472962617874146, | |
| "learning_rate": 2.798221424306953e-06, | |
| "loss": 0.1894, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 6.4891846921797, | |
| "grad_norm": 0.17923222482204437, | |
| "learning_rate": 2.744222007859506e-06, | |
| "loss": 0.1891, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 6.527582234736977, | |
| "grad_norm": 0.18077126145362854, | |
| "learning_rate": 2.690550945350157e-06, | |
| "loss": 0.1886, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 6.527582234736977, | |
| "eval_valid_loss": 0.18031249940395355, | |
| "eval_valid_runtime": 4.6828, | |
| "eval_valid_samples_per_second": 213.548, | |
| "eval_valid_steps_per_second": 6.834, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 6.527582234736977, | |
| "eval_valid_target_loss": 0.20450781285762787, | |
| "eval_valid_target_runtime": 4.6685, | |
| "eval_valid_target_samples_per_second": 214.203, | |
| "eval_valid_target_steps_per_second": 6.854, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 6.565979777294253, | |
| "grad_norm": 0.19065329432487488, | |
| "learning_rate": 2.637216049207615e-06, | |
| "loss": 0.188, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 6.60437731985153, | |
| "grad_norm": 0.20368430018424988, | |
| "learning_rate": 2.5842250829277724e-06, | |
| "loss": 0.189, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 6.642774862408806, | |
| "grad_norm": 0.21131780743598938, | |
| "learning_rate": 2.5315857599436575e-06, | |
| "loss": 0.1887, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 6.681172404966082, | |
| "grad_norm": 0.2033446729183197, | |
| "learning_rate": 2.4793057425026467e-06, | |
| "loss": 0.1887, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 6.719569947523358, | |
| "grad_norm": 0.19689294695854187, | |
| "learning_rate": 2.427392640551137e-06, | |
| "loss": 0.1887, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 6.719569947523358, | |
| "eval_valid_loss": 0.17996874451637268, | |
| "eval_valid_runtime": 4.7043, | |
| "eval_valid_samples_per_second": 212.57, | |
| "eval_valid_steps_per_second": 6.802, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 6.719569947523358, | |
| "eval_valid_target_loss": 0.20432811975479126, | |
| "eval_valid_target_runtime": 4.6638, | |
| "eval_valid_target_samples_per_second": 214.416, | |
| "eval_valid_target_steps_per_second": 6.861, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 6.757967490080635, | |
| "grad_norm": 0.1994999349117279, | |
| "learning_rate": 2.3758540106268406e-06, | |
| "loss": 0.1881, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 6.796365032637911, | |
| "grad_norm": 0.19650602340698242, | |
| "learning_rate": 2.32469735475884e-06, | |
| "loss": 0.1881, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 6.834762575195187, | |
| "grad_norm": 0.21248474717140198, | |
| "learning_rate": 2.273930119375586e-06, | |
| "loss": 0.1882, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 6.873160117752464, | |
| "grad_norm": 0.19042810797691345, | |
| "learning_rate": 2.2235596942209776e-06, | |
| "loss": 0.188, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 6.91155766030974, | |
| "grad_norm": 0.23096908628940582, | |
| "learning_rate": 2.173593411278714e-06, | |
| "loss": 0.1886, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 6.91155766030974, | |
| "eval_valid_loss": 0.17952343821525574, | |
| "eval_valid_runtime": 4.6878, | |
| "eval_valid_samples_per_second": 213.321, | |
| "eval_valid_steps_per_second": 6.826, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 6.91155766030974, | |
| "eval_valid_target_loss": 0.20391406118869781, | |
| "eval_valid_target_runtime": 4.6595, | |
| "eval_valid_target_samples_per_second": 214.617, | |
| "eval_valid_target_steps_per_second": 6.868, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 6.949955202867017, | |
| "grad_norm": 0.21275204420089722, | |
| "learning_rate": 2.124038543705034e-06, | |
| "loss": 0.1878, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 6.988352745424293, | |
| "grad_norm": 0.20453621447086334, | |
| "learning_rate": 2.0749023047700285e-06, | |
| "loss": 0.188, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 7.026750287981569, | |
| "grad_norm": 0.20724526047706604, | |
| "learning_rate": 2.026191846807663e-06, | |
| "loss": 0.1883, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 7.065147830538845, | |
| "grad_norm": 0.1886543333530426, | |
| "learning_rate": 1.9779142601746825e-06, | |
| "loss": 0.1874, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 7.1035453730961216, | |
| "grad_norm": 0.20411571860313416, | |
| "learning_rate": 1.9300765722185265e-06, | |
| "loss": 0.187, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 7.1035453730961216, | |
| "eval_valid_loss": 0.17924219369888306, | |
| "eval_valid_runtime": 4.6825, | |
| "eval_valid_samples_per_second": 213.561, | |
| "eval_valid_steps_per_second": 6.834, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 7.1035453730961216, | |
| "eval_valid_target_loss": 0.20393750071525574, | |
| "eval_valid_target_runtime": 4.6736, | |
| "eval_valid_target_samples_per_second": 213.97, | |
| "eval_valid_target_steps_per_second": 6.847, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 7.141942915653398, | |
| "grad_norm": 0.18996645510196686, | |
| "learning_rate": 1.8826857462544129e-06, | |
| "loss": 0.1871, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 7.180340458210675, | |
| "grad_norm": 0.21018381416797638, | |
| "learning_rate": 1.8357486805517615e-06, | |
| "loss": 0.1874, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 7.218738000767951, | |
| "grad_norm": 0.19617675244808197, | |
| "learning_rate": 1.7892722073300627e-06, | |
| "loss": 0.1869, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 7.257135543325227, | |
| "grad_norm": 0.2340448796749115, | |
| "learning_rate": 1.743263091764379e-06, | |
| "loss": 0.187, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 7.295533085882504, | |
| "grad_norm": 0.22970305383205414, | |
| "learning_rate": 1.6977280310005845e-06, | |
| "loss": 0.1873, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 7.295533085882504, | |
| "eval_valid_loss": 0.1788671910762787, | |
| "eval_valid_runtime": 4.6706, | |
| "eval_valid_samples_per_second": 214.105, | |
| "eval_valid_steps_per_second": 6.851, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 7.295533085882504, | |
| "eval_valid_target_loss": 0.20334374904632568, | |
| "eval_valid_target_runtime": 4.6842, | |
| "eval_valid_target_samples_per_second": 213.484, | |
| "eval_valid_target_steps_per_second": 6.831, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 7.33393062843978, | |
| "grad_norm": 0.20527499914169312, | |
| "learning_rate": 1.6526736531805354e-06, | |
| "loss": 0.1873, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 7.372328170997056, | |
| "grad_norm": 0.1835908442735672, | |
| "learning_rate": 1.6081065164772624e-06, | |
| "loss": 0.187, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 7.410725713554332, | |
| "grad_norm": 0.18936371803283691, | |
| "learning_rate": 1.564033108140348e-06, | |
| "loss": 0.1865, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 7.4491232561116085, | |
| "grad_norm": 0.19136998057365417, | |
| "learning_rate": 1.520459843551646e-06, | |
| "loss": 0.1872, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 7.487520798668886, | |
| "grad_norm": 0.19691316783428192, | |
| "learning_rate": 1.4773930652914426e-06, | |
| "loss": 0.187, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 7.487520798668886, | |
| "eval_valid_loss": 0.17878125607967377, | |
| "eval_valid_runtime": 4.6602, | |
| "eval_valid_samples_per_second": 214.581, | |
| "eval_valid_steps_per_second": 6.867, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 7.487520798668886, | |
| "eval_valid_target_loss": 0.20325781404972076, | |
| "eval_valid_target_runtime": 4.6796, | |
| "eval_valid_target_samples_per_second": 213.695, | |
| "eval_valid_target_steps_per_second": 6.838, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 7.525918341226162, | |
| "grad_norm": 0.18792080879211426, | |
| "learning_rate": 1.434839042215227e-06, | |
| "loss": 0.1868, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 7.564315883783438, | |
| "grad_norm": 0.1945939064025879, | |
| "learning_rate": 1.3928039685411793e-06, | |
| "loss": 0.1869, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 7.602713426340714, | |
| "grad_norm": 0.17974095046520233, | |
| "learning_rate": 1.3512939629485456e-06, | |
| "loss": 0.187, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 7.641110968897991, | |
| "grad_norm": 0.22416825592517853, | |
| "learning_rate": 1.3103150676869864e-06, | |
| "loss": 0.1871, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 7.679508511455267, | |
| "grad_norm": 0.19613422453403473, | |
| "learning_rate": 1.2698732476970627e-06, | |
| "loss": 0.1869, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 7.679508511455267, | |
| "eval_valid_loss": 0.1783437430858612, | |
| "eval_valid_runtime": 4.6716, | |
| "eval_valid_samples_per_second": 214.058, | |
| "eval_valid_steps_per_second": 6.85, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 7.679508511455267, | |
| "eval_valid_target_loss": 0.2031562477350235, | |
| "eval_valid_target_runtime": 4.6803, | |
| "eval_valid_target_samples_per_second": 213.661, | |
| "eval_valid_target_steps_per_second": 6.837, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 7.717906054012543, | |
| "grad_norm": 0.20145875215530396, | |
| "learning_rate": 1.229974389741964e-06, | |
| "loss": 0.187, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 7.756303596569819, | |
| "grad_norm": 0.18396620452404022, | |
| "learning_rate": 1.1906243015506375e-06, | |
| "loss": 0.1867, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 7.7947011391270955, | |
| "grad_norm": 0.18105918169021606, | |
| "learning_rate": 1.1518287109723958e-06, | |
| "loss": 0.1862, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 7.833098681684373, | |
| "grad_norm": 0.20986780524253845, | |
| "learning_rate": 1.1135932651431651e-06, | |
| "loss": 0.1863, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 7.871496224241649, | |
| "grad_norm": 0.21804456412792206, | |
| "learning_rate": 1.075923529663489e-06, | |
| "loss": 0.1869, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 7.871496224241649, | |
| "eval_valid_loss": 0.17836718261241913, | |
| "eval_valid_runtime": 4.6832, | |
| "eval_valid_samples_per_second": 213.531, | |
| "eval_valid_steps_per_second": 6.833, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 7.871496224241649, | |
| "eval_valid_target_loss": 0.20322656631469727, | |
| "eval_valid_target_runtime": 4.6763, | |
| "eval_valid_target_samples_per_second": 213.843, | |
| "eval_valid_target_steps_per_second": 6.843, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 7.909893766798925, | |
| "grad_norm": 0.22019818425178528, | |
| "learning_rate": 1.0388249877883827e-06, | |
| "loss": 0.1858, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 7.948291309356201, | |
| "grad_norm": 0.1965310275554657, | |
| "learning_rate": 1.0023030396291916e-06, | |
| "loss": 0.1866, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 7.9866888519134775, | |
| "grad_norm": 0.18218408524990082, | |
| "learning_rate": 9.66363001367534e-07, | |
| "loss": 0.1869, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 8.025086394470755, | |
| "grad_norm": 0.1850380003452301, | |
| "learning_rate": 9.310101044814835e-07, | |
| "loss": 0.1861, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 8.06348393702803, | |
| "grad_norm": 0.18823818862438202, | |
| "learning_rate": 8.962494949840577e-07, | |
| "loss": 0.186, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 8.06348393702803, | |
| "eval_valid_loss": 0.17808593809604645, | |
| "eval_valid_runtime": 4.6916, | |
| "eval_valid_samples_per_second": 213.147, | |
| "eval_valid_steps_per_second": 6.821, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 8.06348393702803, | |
| "eval_valid_target_loss": 0.20311719179153442, | |
| "eval_valid_target_runtime": 4.6653, | |
| "eval_valid_target_samples_per_second": 214.347, | |
| "eval_valid_target_steps_per_second": 6.859, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 8.101881479585307, | |
| "grad_norm": 0.20501789450645447, | |
| "learning_rate": 8.620862326741658e-07, | |
| "loss": 0.1862, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 8.140279022142582, | |
| "grad_norm": 0.19500133395195007, | |
| "learning_rate": 8.285252904000906e-07, | |
| "loss": 0.1862, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 8.17867656469986, | |
| "grad_norm": 0.18742544949054718, | |
| "learning_rate": 7.955715533356367e-07, | |
| "loss": 0.1863, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 8.217074107257135, | |
| "grad_norm": 0.20386624336242676, | |
| "learning_rate": 7.632298182690473e-07, | |
| "loss": 0.186, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 8.255471649814412, | |
| "grad_norm": 0.17727358639240265, | |
| "learning_rate": 7.315047929047608e-07, | |
| "loss": 0.1861, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 8.255471649814412, | |
| "eval_valid_loss": 0.17788280546665192, | |
| "eval_valid_runtime": 4.679, | |
| "eval_valid_samples_per_second": 213.72, | |
| "eval_valid_steps_per_second": 6.839, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 8.255471649814412, | |
| "eval_valid_target_loss": 0.2026640623807907, | |
| "eval_valid_target_runtime": 4.6709, | |
| "eval_valid_target_samples_per_second": 214.093, | |
| "eval_valid_target_steps_per_second": 6.851, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 8.293869192371687, | |
| "grad_norm": 0.19971401989459991, | |
| "learning_rate": 7.004010951781648e-07, | |
| "loss": 0.1858, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 8.332266734928965, | |
| "grad_norm": 0.17827193439006805, | |
| "learning_rate": 6.699232525833987e-07, | |
| "loss": 0.1868, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 8.370664277486242, | |
| "grad_norm": 0.18275295197963715, | |
| "learning_rate": 6.400757015143266e-07, | |
| "loss": 0.1858, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 8.409061820043517, | |
| "grad_norm": 0.19496768712997437, | |
| "learning_rate": 6.108627866187661e-07, | |
| "loss": 0.1854, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 8.447459362600794, | |
| "grad_norm": 0.19046269357204437, | |
| "learning_rate": 5.822887601660832e-07, | |
| "loss": 0.1862, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 8.447459362600794, | |
| "eval_valid_loss": 0.17781250178813934, | |
| "eval_valid_runtime": 4.6746, | |
| "eval_valid_samples_per_second": 213.921, | |
| "eval_valid_steps_per_second": 6.845, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 8.447459362600794, | |
| "eval_valid_target_loss": 0.2026640623807907, | |
| "eval_valid_target_runtime": 4.6755, | |
| "eval_valid_target_samples_per_second": 213.88, | |
| "eval_valid_target_steps_per_second": 6.844, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 8.48585690515807, | |
| "grad_norm": 0.20896296203136444, | |
| "learning_rate": 5.543577814282219e-07, | |
| "loss": 0.1856, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 8.524254447715347, | |
| "grad_norm": 0.19562530517578125, | |
| "learning_rate": 5.270739160742738e-07, | |
| "loss": 0.1857, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 8.562651990272622, | |
| "grad_norm": 0.1972120851278305, | |
| "learning_rate": 5.004411355786792e-07, | |
| "loss": 0.1863, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 8.601049532829899, | |
| "grad_norm": 0.19712330400943756, | |
| "learning_rate": 4.7446331664312786e-07, | |
| "loss": 0.1855, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 8.639447075387174, | |
| "grad_norm": 0.20409992337226868, | |
| "learning_rate": 4.4914424063226937e-07, | |
| "loss": 0.1857, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 8.639447075387174, | |
| "eval_valid_loss": 0.17765624821186066, | |
| "eval_valid_runtime": 4.6769, | |
| "eval_valid_samples_per_second": 213.818, | |
| "eval_valid_steps_per_second": 6.842, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 8.639447075387174, | |
| "eval_valid_target_loss": 0.2025781273841858, | |
| "eval_valid_target_runtime": 4.6696, | |
| "eval_valid_target_samples_per_second": 214.151, | |
| "eval_valid_target_steps_per_second": 6.853, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 8.677844617944451, | |
| "grad_norm": 0.21083636581897736, | |
| "learning_rate": 4.2448759302328336e-07, | |
| "loss": 0.1861, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 8.716242160501729, | |
| "grad_norm": 0.18778979778289795, | |
| "learning_rate": 4.0049696286942496e-07, | |
| "loss": 0.1862, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 8.754639703059004, | |
| "grad_norm": 0.18586015701293945, | |
| "learning_rate": 3.7717584227759117e-07, | |
| "loss": 0.1857, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 8.793037245616281, | |
| "grad_norm": 0.1977422684431076, | |
| "learning_rate": 3.54527625900013e-07, | |
| "loss": 0.1856, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 8.831434788173556, | |
| "grad_norm": 0.18881608545780182, | |
| "learning_rate": 3.3255561044011564e-07, | |
| "loss": 0.1857, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 8.831434788173556, | |
| "eval_valid_loss": 0.17771874368190765, | |
| "eval_valid_runtime": 4.6727, | |
| "eval_valid_samples_per_second": 214.01, | |
| "eval_valid_steps_per_second": 6.848, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 8.831434788173556, | |
| "eval_valid_target_loss": 0.20250000059604645, | |
| "eval_valid_target_runtime": 4.6666, | |
| "eval_valid_target_samples_per_second": 214.287, | |
| "eval_valid_target_steps_per_second": 6.857, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 8.869832330730834, | |
| "grad_norm": 0.2037239372730255, | |
| "learning_rate": 3.112629941726547e-07, | |
| "loss": 0.1856, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 8.908229873288109, | |
| "grad_norm": 0.18967826664447784, | |
| "learning_rate": 2.9065287647816744e-07, | |
| "loss": 0.1855, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 8.946627415845386, | |
| "grad_norm": 0.17752571403980255, | |
| "learning_rate": 2.707282573918213e-07, | |
| "loss": 0.1858, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 8.985024958402661, | |
| "grad_norm": 0.18709731101989746, | |
| "learning_rate": 2.514920371667301e-07, | |
| "loss": 0.1854, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 9.023422500959938, | |
| "grad_norm": 0.21643956005573273, | |
| "learning_rate": 2.3294701585178213e-07, | |
| "loss": 0.1858, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 9.023422500959938, | |
| "eval_valid_loss": 0.17762500047683716, | |
| "eval_valid_runtime": 4.6791, | |
| "eval_valid_samples_per_second": 213.717, | |
| "eval_valid_steps_per_second": 6.839, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 9.023422500959938, | |
| "eval_valid_target_loss": 0.20255468785762787, | |
| "eval_valid_target_runtime": 4.7103, | |
| "eval_valid_target_samples_per_second": 212.301, | |
| "eval_valid_target_steps_per_second": 6.794, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 9.061820043517216, | |
| "grad_norm": 0.18775244057178497, | |
| "learning_rate": 2.1509589288407183e-07, | |
| "loss": 0.1855, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 9.100217586074491, | |
| "grad_norm": 0.17277489602565765, | |
| "learning_rate": 1.9794126669595403e-07, | |
| "loss": 0.1859, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 9.138615128631768, | |
| "grad_norm": 0.18996348977088928, | |
| "learning_rate": 1.8148563433682264e-07, | |
| "loss": 0.1852, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 9.177012671189043, | |
| "grad_norm": 0.1894453912973404, | |
| "learning_rate": 1.6573139110963087e-07, | |
| "loss": 0.1854, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 9.21541021374632, | |
| "grad_norm": 0.2011975795030594, | |
| "learning_rate": 1.5068083022223346e-07, | |
| "loss": 0.1855, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 9.21541021374632, | |
| "eval_valid_loss": 0.17754687368869781, | |
| "eval_valid_runtime": 4.6668, | |
| "eval_valid_samples_per_second": 214.279, | |
| "eval_valid_steps_per_second": 6.857, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 9.21541021374632, | |
| "eval_valid_target_loss": 0.20250000059604645, | |
| "eval_valid_target_runtime": 4.6766, | |
| "eval_valid_target_samples_per_second": 213.829, | |
| "eval_valid_target_steps_per_second": 6.843, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 9.253807756303596, | |
| "grad_norm": 0.2087700515985489, | |
| "learning_rate": 1.3633614245357807e-07, | |
| "loss": 0.1858, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 9.292205298860873, | |
| "grad_norm": 0.18402153253555298, | |
| "learning_rate": 1.2269941583481548e-07, | |
| "loss": 0.1859, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 9.330602841418148, | |
| "grad_norm": 0.17724697291851044, | |
| "learning_rate": 1.0977263534536597e-07, | |
| "loss": 0.1856, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 9.369000383975425, | |
| "grad_norm": 0.1847800761461258, | |
| "learning_rate": 9.755768262397936e-08, | |
| "loss": 0.1858, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 9.407397926532703, | |
| "grad_norm": 0.1905263364315033, | |
| "learning_rate": 8.605633569484184e-08, | |
| "loss": 0.1856, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 9.407397926532703, | |
| "eval_valid_loss": 0.1775546818971634, | |
| "eval_valid_runtime": 4.6591, | |
| "eval_valid_samples_per_second": 214.636, | |
| "eval_valid_steps_per_second": 6.868, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 9.407397926532703, | |
| "eval_valid_target_loss": 0.2024531215429306, | |
| "eval_valid_target_runtime": 4.6763, | |
| "eval_valid_target_samples_per_second": 213.844, | |
| "eval_valid_target_steps_per_second": 6.843, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 9.445795469089978, | |
| "grad_norm": 0.17600856721401215, | |
| "learning_rate": 7.52702687087653e-08, | |
| "loss": 0.1855, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 9.484193011647255, | |
| "grad_norm": 0.19071801006793976, | |
| "learning_rate": 6.520105169949609e-08, | |
| "loss": 0.1856, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 9.52259055420453, | |
| "grad_norm": 0.20268982648849487, | |
| "learning_rate": 5.5850150355178936e-08, | |
| "loss": 0.1855, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 9.560988096761807, | |
| "grad_norm": 0.18069659173488617, | |
| "learning_rate": 4.721892580500709e-08, | |
| "loss": 0.1852, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 9.599385639319083, | |
| "grad_norm": 0.19809788465499878, | |
| "learning_rate": 3.9308634421098e-08, | |
| "loss": 0.1853, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 9.599385639319083, | |
| "eval_valid_loss": 0.17754687368869781, | |
| "eval_valid_runtime": 4.6689, | |
| "eval_valid_samples_per_second": 214.182, | |
| "eval_valid_steps_per_second": 6.854, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 9.599385639319083, | |
| "eval_valid_target_loss": 0.20237499475479126, | |
| "eval_valid_target_runtime": 4.688, | |
| "eval_valid_target_samples_per_second": 213.313, | |
| "eval_valid_target_steps_per_second": 6.826, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 9.63778318187636, | |
| "grad_norm": 0.1990041732788086, | |
| "learning_rate": 3.2120427635613517e-08, | |
| "loss": 0.1852, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 9.676180724433635, | |
| "grad_norm": 0.20578785240650177, | |
| "learning_rate": 2.565535177315226e-08, | |
| "loss": 0.185, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 9.714578266990912, | |
| "grad_norm": 0.19831426441669464, | |
| "learning_rate": 1.991434789845037e-08, | |
| "loss": 0.1858, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 9.75297580954819, | |
| "grad_norm": 0.18692290782928467, | |
| "learning_rate": 1.489825167939607e-08, | |
| "loss": 0.1848, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 9.791373352105465, | |
| "grad_norm": 0.20175856351852417, | |
| "learning_rate": 1.0607793265389742e-08, | |
| "loss": 0.1854, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 9.791373352105465, | |
| "eval_valid_loss": 0.17751562595367432, | |
| "eval_valid_runtime": 4.667, | |
| "eval_valid_samples_per_second": 214.272, | |
| "eval_valid_steps_per_second": 6.857, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 9.791373352105465, | |
| "eval_valid_target_loss": 0.20240625739097595, | |
| "eval_valid_target_runtime": 4.6781, | |
| "eval_valid_target_samples_per_second": 213.763, | |
| "eval_valid_target_steps_per_second": 6.84, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 9.829770894662742, | |
| "grad_norm": 0.20650416612625122, | |
| "learning_rate": 7.0435971810606244e-09, | |
| "loss": 0.1859, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 9.868168437220017, | |
| "grad_norm": 0.1880464404821396, | |
| "learning_rate": 4.206182235363399e-09, | |
| "loss": 0.1857, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 9.906565979777294, | |
| "grad_norm": 0.19517436623573303, | |
| "learning_rate": 2.095961446056949e-09, | |
| "loss": 0.1851, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 9.94496352233457, | |
| "grad_norm": 0.21848323941230774, | |
| "learning_rate": 7.132419795868872e-10, | |
| "loss": 0.1858, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 9.983361064891847, | |
| "grad_norm": 0.20499403774738312, | |
| "learning_rate": 5.82251063713235e-11, | |
| "loss": 0.1851, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 9.983361064891847, | |
| "eval_valid_loss": 0.1775234341621399, | |
| "eval_valid_runtime": 4.6706, | |
| "eval_valid_samples_per_second": 214.106, | |
| "eval_valid_steps_per_second": 6.851, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 9.983361064891847, | |
| "eval_valid_target_loss": 0.20242968201637268, | |
| "eval_valid_target_runtime": 4.679, | |
| "eval_valid_target_samples_per_second": 213.721, | |
| "eval_valid_target_steps_per_second": 6.839, | |
| "step": 26000 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 26040, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 5000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.475781022436819e+19, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |