| { |
| "best_metric": 0.004960117861628532, |
| "best_model_checkpoint": "miner_id_24/checkpoint-450", |
| "epoch": 1.9692307692307693, |
| "eval_steps": 25, |
| "global_step": 600, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.003282051282051282, |
| "grad_norm": 8.107928276062012, |
| "learning_rate": 8.333333333333334e-06, |
| "loss": 13.7808, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.003282051282051282, |
| "eval_loss": 16.358562469482422, |
| "eval_runtime": 1.6836, |
| "eval_samples_per_second": 29.698, |
| "eval_steps_per_second": 29.698, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.006564102564102564, |
| "grad_norm": 8.200773239135742, |
| "learning_rate": 1.6666666666666667e-05, |
| "loss": 14.0115, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.009846153846153846, |
| "grad_norm": 8.451738357543945, |
| "learning_rate": 2.5e-05, |
| "loss": 14.201, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.013128205128205127, |
| "grad_norm": 8.04790210723877, |
| "learning_rate": 3.3333333333333335e-05, |
| "loss": 14.1223, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.01641025641025641, |
| "grad_norm": 8.250162124633789, |
| "learning_rate": 4.166666666666667e-05, |
| "loss": 14.0582, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.019692307692307693, |
| "grad_norm": 8.590646743774414, |
| "learning_rate": 5e-05, |
| "loss": 13.9467, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.022974358974358976, |
| "grad_norm": 9.05842399597168, |
| "learning_rate": 5.833333333333334e-05, |
| "loss": 13.9617, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.026256410256410255, |
| "grad_norm": 9.801227569580078, |
| "learning_rate": 6.666666666666667e-05, |
| "loss": 13.5738, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.029538461538461538, |
| "grad_norm": 10.93070125579834, |
| "learning_rate": 7.500000000000001e-05, |
| "loss": 13.7217, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.03282051282051282, |
| "grad_norm": 12.238622665405273, |
| "learning_rate": 8.333333333333334e-05, |
| "loss": 13.0686, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0361025641025641, |
| "grad_norm": 14.860128402709961, |
| "learning_rate": 9.166666666666667e-05, |
| "loss": 13.3774, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.039384615384615386, |
| "grad_norm": 14.760673522949219, |
| "learning_rate": 0.0001, |
| "loss": 13.4816, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.042666666666666665, |
| "grad_norm": 13.52017593383789, |
| "learning_rate": 0.00010833333333333333, |
| "loss": 12.5105, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.04594871794871795, |
| "grad_norm": 15.965612411499023, |
| "learning_rate": 0.00011666666666666668, |
| "loss": 12.7767, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.04923076923076923, |
| "grad_norm": 14.593632698059082, |
| "learning_rate": 0.000125, |
| "loss": 11.4924, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.05251282051282051, |
| "grad_norm": 14.693791389465332, |
| "learning_rate": 0.00013333333333333334, |
| "loss": 10.9891, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.055794871794871796, |
| "grad_norm": 16.373519897460938, |
| "learning_rate": 0.00014166666666666668, |
| "loss": 9.5602, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.059076923076923075, |
| "grad_norm": 16.455101013183594, |
| "learning_rate": 0.00015000000000000001, |
| "loss": 8.4045, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.06235897435897436, |
| "grad_norm": 15.62342357635498, |
| "learning_rate": 0.00015833333333333332, |
| "loss": 6.9826, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.06564102564102564, |
| "grad_norm": 12.31684684753418, |
| "learning_rate": 0.0001666666666666667, |
| "loss": 5.5791, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.06892307692307692, |
| "grad_norm": 9.89257526397705, |
| "learning_rate": 0.000175, |
| "loss": 4.5438, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.0722051282051282, |
| "grad_norm": 7.226674556732178, |
| "learning_rate": 0.00018333333333333334, |
| "loss": 3.9863, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.07548717948717949, |
| "grad_norm": 3.454035520553589, |
| "learning_rate": 0.00019166666666666667, |
| "loss": 3.3812, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.07876923076923077, |
| "grad_norm": 4.084013938903809, |
| "learning_rate": 0.0002, |
| "loss": 3.3663, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.08205128205128205, |
| "grad_norm": 4.125749111175537, |
| "learning_rate": 0.00019999866135254795, |
| "loss": 3.1984, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.08205128205128205, |
| "eval_loss": 3.513918399810791, |
| "eval_runtime": 1.7168, |
| "eval_samples_per_second": 29.124, |
| "eval_steps_per_second": 29.124, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.08533333333333333, |
| "grad_norm": 3.0909500122070312, |
| "learning_rate": 0.0001999946454500135, |
| "loss": 2.8699, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.08861538461538461, |
| "grad_norm": 3.214726686477661, |
| "learning_rate": 0.00019998795241186058, |
| "loss": 2.6662, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.0918974358974359, |
| "grad_norm": 3.6891489028930664, |
| "learning_rate": 0.00019997858243719183, |
| "loss": 2.455, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.09517948717948718, |
| "grad_norm": 3.7046895027160645, |
| "learning_rate": 0.00019996653580474266, |
| "loss": 2.2075, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.09846153846153846, |
| "grad_norm": 4.107726097106934, |
| "learning_rate": 0.00019995181287287293, |
| "loss": 1.92, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.10174358974358974, |
| "grad_norm": 5.024882793426514, |
| "learning_rate": 0.0001999344140795563, |
| "loss": 1.6797, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.10502564102564102, |
| "grad_norm": 4.104298114776611, |
| "learning_rate": 0.0001999143399423672, |
| "loss": 1.3627, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.10830769230769231, |
| "grad_norm": 3.537320137023926, |
| "learning_rate": 0.00019989159105846555, |
| "loss": 1.0682, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.11158974358974359, |
| "grad_norm": 3.1266891956329346, |
| "learning_rate": 0.00019986616810457867, |
| "loss": 0.8258, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.11487179487179487, |
| "grad_norm": 2.4511232376098633, |
| "learning_rate": 0.00019983807183698163, |
| "loss": 0.6316, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.11815384615384615, |
| "grad_norm": 3.5290353298187256, |
| "learning_rate": 0.00019980730309147434, |
| "loss": 0.6164, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.12143589743589743, |
| "grad_norm": 1.588909387588501, |
| "learning_rate": 0.0001997738627833568, |
| "loss": 0.3606, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.12471794871794872, |
| "grad_norm": 1.2943755388259888, |
| "learning_rate": 0.000199737751907402, |
| "loss": 0.2683, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.128, |
| "grad_norm": 1.034509539604187, |
| "learning_rate": 0.00019969897153782623, |
| "loss": 0.1998, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.13128205128205128, |
| "grad_norm": 0.7319996356964111, |
| "learning_rate": 0.00019965752282825712, |
| "loss": 0.1469, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.13456410256410256, |
| "grad_norm": 0.6485500931739807, |
| "learning_rate": 0.00019961340701169926, |
| "loss": 0.1237, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.13784615384615384, |
| "grad_norm": 0.43038779497146606, |
| "learning_rate": 0.00019956662540049773, |
| "loss": 0.0857, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.14112820512820512, |
| "grad_norm": 0.31721484661102295, |
| "learning_rate": 0.0001995171793862988, |
| "loss": 0.0624, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.1444102564102564, |
| "grad_norm": 0.2403116375207901, |
| "learning_rate": 0.00019946507044000877, |
| "loss": 0.0492, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.1476923076923077, |
| "grad_norm": 0.21824949979782104, |
| "learning_rate": 0.00019941030011175, |
| "loss": 0.0469, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.15097435897435899, |
| "grad_norm": 0.1729796677827835, |
| "learning_rate": 0.00019935287003081494, |
| "loss": 0.0372, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.15425641025641026, |
| "grad_norm": 0.1362195611000061, |
| "learning_rate": 0.00019929278190561767, |
| "loss": 0.0307, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.15753846153846154, |
| "grad_norm": 0.11314882338047028, |
| "learning_rate": 0.00019923003752364297, |
| "loss": 0.0259, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.16082051282051282, |
| "grad_norm": 0.1020006462931633, |
| "learning_rate": 0.00019916463875139316, |
| "loss": 0.0239, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.1641025641025641, |
| "grad_norm": 0.12704280018806458, |
| "learning_rate": 0.00019909658753433272, |
| "loss": 0.0235, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.1641025641025641, |
| "eval_loss": 0.10366738587617874, |
| "eval_runtime": 1.6921, |
| "eval_samples_per_second": 29.548, |
| "eval_steps_per_second": 29.548, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.16738461538461538, |
| "grad_norm": 4.014084815979004, |
| "learning_rate": 0.0001990258858968303, |
| "loss": 0.1188, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.17066666666666666, |
| "grad_norm": 3.2102086544036865, |
| "learning_rate": 0.0001989525359420985, |
| "loss": 0.0992, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.17394871794871794, |
| "grad_norm": 1.2787097692489624, |
| "learning_rate": 0.00019887653985213124, |
| "loss": 0.0536, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.17723076923076922, |
| "grad_norm": 1.2789212465286255, |
| "learning_rate": 0.00019879789988763914, |
| "loss": 0.0527, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.18051282051282053, |
| "grad_norm": 0.691673755645752, |
| "learning_rate": 0.0001987166183879818, |
| "loss": 0.0349, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.1837948717948718, |
| "grad_norm": 0.8248271942138672, |
| "learning_rate": 0.00019863269777109873, |
| "loss": 0.0326, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.18707692307692309, |
| "grad_norm": 0.2020619660615921, |
| "learning_rate": 0.00019854614053343696, |
| "loss": 0.0209, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.19035897435897436, |
| "grad_norm": 0.10885163396596909, |
| "learning_rate": 0.0001984569492498771, |
| "loss": 0.0165, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.19364102564102564, |
| "grad_norm": 0.08642534166574478, |
| "learning_rate": 0.00019836512657365657, |
| "loss": 0.0146, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.19692307692307692, |
| "grad_norm": 0.7858078479766846, |
| "learning_rate": 0.00019827067523629075, |
| "loss": 0.0332, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.2002051282051282, |
| "grad_norm": 3.002739191055298, |
| "learning_rate": 0.00019817359804749166, |
| "loss": 0.021, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.20348717948717948, |
| "grad_norm": 1.7393077611923218, |
| "learning_rate": 0.00019807389789508445, |
| "loss": 0.0173, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.20676923076923076, |
| "grad_norm": 0.1218792125582695, |
| "learning_rate": 0.0001979715777449215, |
| "loss": 0.0135, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.21005128205128204, |
| "grad_norm": 0.11387912929058075, |
| "learning_rate": 0.00019786664064079401, |
| "loss": 0.0133, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.21333333333333335, |
| "grad_norm": 0.08703344315290451, |
| "learning_rate": 0.0001977590897043418, |
| "loss": 0.0126, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.21661538461538463, |
| "grad_norm": 0.0711798295378685, |
| "learning_rate": 0.00019764892813496003, |
| "loss": 0.0123, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.2198974358974359, |
| "grad_norm": 0.056465234607458115, |
| "learning_rate": 0.00019753615920970442, |
| "loss": 0.0116, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.22317948717948718, |
| "grad_norm": 0.051314447075128555, |
| "learning_rate": 0.00019742078628319355, |
| "loss": 0.0111, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.22646153846153846, |
| "grad_norm": 0.04775601997971535, |
| "learning_rate": 0.00019730281278750898, |
| "loss": 0.0109, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.22974358974358974, |
| "grad_norm": 0.04364297538995743, |
| "learning_rate": 0.00019718224223209342, |
| "loss": 0.0105, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.23302564102564102, |
| "grad_norm": 0.041208017617464066, |
| "learning_rate": 0.00019705907820364603, |
| "loss": 0.0101, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.2363076923076923, |
| "grad_norm": 0.03828458860516548, |
| "learning_rate": 0.00019693332436601614, |
| "loss": 0.0096, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.23958974358974358, |
| "grad_norm": 0.03837643936276436, |
| "learning_rate": 0.0001968049844600938, |
| "loss": 0.0097, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.24287179487179486, |
| "grad_norm": 0.03652190417051315, |
| "learning_rate": 0.00019667406230369864, |
| "loss": 0.0094, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.24615384615384617, |
| "grad_norm": 0.03440447524189949, |
| "learning_rate": 0.00019654056179146658, |
| "loss": 0.0091, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.24615384615384617, |
| "eval_loss": 0.010809546336531639, |
| "eval_runtime": 1.7246, |
| "eval_samples_per_second": 28.993, |
| "eval_steps_per_second": 28.993, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.24943589743589745, |
| "grad_norm": 0.03175151348114014, |
| "learning_rate": 0.0001964044868947336, |
| "loss": 0.0087, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.2527179487179487, |
| "grad_norm": 0.03136426582932472, |
| "learning_rate": 0.00019626584166141777, |
| "loss": 0.0087, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.256, |
| "grad_norm": 0.030030284076929092, |
| "learning_rate": 0.0001961246302158988, |
| "loss": 0.0085, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.2592820512820513, |
| "grad_norm": 0.028109928593039513, |
| "learning_rate": 0.00019598085675889547, |
| "loss": 0.0083, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.26256410256410256, |
| "grad_norm": 0.026804205030202866, |
| "learning_rate": 0.00019583452556734044, |
| "loss": 0.008, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.26584615384615384, |
| "grad_norm": 0.025468742474913597, |
| "learning_rate": 0.0001956856409942532, |
| "loss": 0.0079, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.2691282051282051, |
| "grad_norm": 0.02471235767006874, |
| "learning_rate": 0.00019553420746861052, |
| "loss": 0.0078, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.2724102564102564, |
| "grad_norm": 0.023358380421996117, |
| "learning_rate": 0.00019538022949521465, |
| "loss": 0.0078, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.2756923076923077, |
| "grad_norm": 0.022298390045762062, |
| "learning_rate": 0.00019522371165455954, |
| "loss": 0.0076, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.27897435897435896, |
| "grad_norm": 0.021381327882409096, |
| "learning_rate": 0.0001950646586026941, |
| "loss": 0.0076, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.28225641025641024, |
| "grad_norm": 0.02039244771003723, |
| "learning_rate": 0.00019490307507108426, |
| "loss": 0.0075, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.2855384615384615, |
| "grad_norm": 0.1511549949645996, |
| "learning_rate": 0.00019473896586647186, |
| "loss": 0.0085, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.2888205128205128, |
| "grad_norm": 0.01761717163026333, |
| "learning_rate": 0.00019457233587073176, |
| "loss": 0.007, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.2921025641025641, |
| "grad_norm": 0.01719754748046398, |
| "learning_rate": 0.0001944031900407266, |
| "loss": 0.007, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.2953846153846154, |
| "grad_norm": 0.016939733177423477, |
| "learning_rate": 0.0001942315334081593, |
| "loss": 0.0072, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.2986666666666667, |
| "grad_norm": 0.01624459959566593, |
| "learning_rate": 0.00019405737107942362, |
| "loss": 0.007, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.30194871794871797, |
| "grad_norm": 0.016138238832354546, |
| "learning_rate": 0.00019388070823545187, |
| "loss": 0.007, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.30523076923076925, |
| "grad_norm": 0.01595952734351158, |
| "learning_rate": 0.0001937015501315611, |
| "loss": 0.007, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.30851282051282053, |
| "grad_norm": 0.015773704275488853, |
| "learning_rate": 0.00019351990209729662, |
| "loss": 0.0069, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.3117948717948718, |
| "grad_norm": 0.015645822510123253, |
| "learning_rate": 0.0001933357695362735, |
| "loss": 0.0068, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.3150769230769231, |
| "grad_norm": 0.015485318377614021, |
| "learning_rate": 0.00019314915792601581, |
| "loss": 0.0068, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.31835897435897437, |
| "grad_norm": 0.016759535297751427, |
| "learning_rate": 0.00019296007281779373, |
| "loss": 0.0069, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.32164102564102565, |
| "grad_norm": 0.016155634075403214, |
| "learning_rate": 0.0001927685198364583, |
| "loss": 0.0067, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.3249230769230769, |
| "grad_norm": 0.0162162147462368, |
| "learning_rate": 0.0001925745046802742, |
| "loss": 0.0069, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.3282051282051282, |
| "grad_norm": 0.01534635853022337, |
| "learning_rate": 0.00019237803312075028, |
| "loss": 0.0068, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.3282051282051282, |
| "eval_loss": 0.00823766179382801, |
| "eval_runtime": 1.7026, |
| "eval_samples_per_second": 29.368, |
| "eval_steps_per_second": 29.368, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.3314871794871795, |
| "grad_norm": 0.5894611477851868, |
| "learning_rate": 0.00019217911100246756, |
| "loss": 0.0151, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.33476923076923076, |
| "grad_norm": 0.1130584329366684, |
| "learning_rate": 0.00019197774424290582, |
| "loss": 0.0103, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.33805128205128204, |
| "grad_norm": 0.0547531358897686, |
| "learning_rate": 0.0001917739388322673, |
| "loss": 0.0098, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.3413333333333333, |
| "grad_norm": 0.03562573716044426, |
| "learning_rate": 0.0001915677008332985, |
| "loss": 0.0087, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.3446153846153846, |
| "grad_norm": 0.030168889090418816, |
| "learning_rate": 0.00019135903638110993, |
| "loss": 0.0083, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.3478974358974359, |
| "grad_norm": 0.023503584787249565, |
| "learning_rate": 0.00019114795168299347, |
| "loss": 0.0076, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.35117948717948716, |
| "grad_norm": 0.020446596667170525, |
| "learning_rate": 0.00019093445301823788, |
| "loss": 0.0073, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.35446153846153844, |
| "grad_norm": 0.01846328377723694, |
| "learning_rate": 0.00019071854673794196, |
| "loss": 0.0069, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.3577435897435897, |
| "grad_norm": 0.016864923760294914, |
| "learning_rate": 0.00019050023926482548, |
| "loss": 0.0066, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.36102564102564105, |
| "grad_norm": 0.017281338572502136, |
| "learning_rate": 0.00019027953709303827, |
| "loss": 0.0066, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.36430769230769233, |
| "grad_norm": 0.017325421795248985, |
| "learning_rate": 0.00019005644678796705, |
| "loss": 0.0065, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.3675897435897436, |
| "grad_norm": 0.020572949200868607, |
| "learning_rate": 0.00018983097498603995, |
| "loss": 0.0062, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.3708717948717949, |
| "grad_norm": 0.018894601613283157, |
| "learning_rate": 0.00018960312839452932, |
| "loss": 0.0064, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.37415384615384617, |
| "grad_norm": 0.01940176449716091, |
| "learning_rate": 0.00018937291379135196, |
| "loss": 0.0063, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.37743589743589745, |
| "grad_norm": 2.0286033153533936, |
| "learning_rate": 0.00018914033802486775, |
| "loss": 0.0516, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.38071794871794873, |
| "grad_norm": 0.01834729313850403, |
| "learning_rate": 0.00018890540801367572, |
| "loss": 0.0063, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.384, |
| "grad_norm": 0.016609683632850647, |
| "learning_rate": 0.0001886681307464083, |
| "loss": 0.0061, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.3872820512820513, |
| "grad_norm": 0.014862019568681717, |
| "learning_rate": 0.00018842851328152355, |
| "loss": 0.0059, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.39056410256410257, |
| "grad_norm": 0.014286825433373451, |
| "learning_rate": 0.00018818656274709493, |
| "loss": 0.006, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.39384615384615385, |
| "grad_norm": 0.013413852080702782, |
| "learning_rate": 0.0001879422863405995, |
| "loss": 0.0062, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.3971282051282051, |
| "grad_norm": 0.012395060621201992, |
| "learning_rate": 0.00018769569132870366, |
| "loss": 0.0061, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.4004102564102564, |
| "grad_norm": 0.012121280655264854, |
| "learning_rate": 0.0001874467850470471, |
| "loss": 0.006, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.4036923076923077, |
| "grad_norm": 0.012034958228468895, |
| "learning_rate": 0.0001871955749000245, |
| "loss": 0.0059, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.40697435897435896, |
| "grad_norm": 0.012840136885643005, |
| "learning_rate": 0.0001869420683605652, |
| "loss": 0.0059, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.41025641025641024, |
| "grad_norm": 0.01383453793823719, |
| "learning_rate": 0.0001866862729699111, |
| "loss": 0.006, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.41025641025641024, |
| "eval_loss": 0.008009692654013634, |
| "eval_runtime": 1.7144, |
| "eval_samples_per_second": 29.165, |
| "eval_steps_per_second": 29.165, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.4135384615384615, |
| "grad_norm": 0.015620230697095394, |
| "learning_rate": 0.0001864281963373921, |
| "loss": 0.006, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.4168205128205128, |
| "grad_norm": 0.01630019024014473, |
| "learning_rate": 0.00018616784614019995, |
| "loss": 0.0061, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.4201025641025641, |
| "grad_norm": 0.016993194818496704, |
| "learning_rate": 0.00018590523012315972, |
| "loss": 0.006, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.42338461538461536, |
| "grad_norm": 0.016546938568353653, |
| "learning_rate": 0.00018564035609849945, |
| "loss": 0.0058, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.4266666666666667, |
| "grad_norm": 0.018067970871925354, |
| "learning_rate": 0.0001853732319456177, |
| "loss": 0.0059, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.429948717948718, |
| "grad_norm": 0.017609402537345886, |
| "learning_rate": 0.0001851038656108494, |
| "loss": 0.0059, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.43323076923076925, |
| "grad_norm": 0.015436948277056217, |
| "learning_rate": 0.0001848322651072291, |
| "loss": 0.0058, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.43651282051282053, |
| "grad_norm": 0.014602603390812874, |
| "learning_rate": 0.00018455843851425283, |
| "loss": 0.0059, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.4397948717948718, |
| "grad_norm": 0.0129224993288517, |
| "learning_rate": 0.00018428239397763775, |
| "loss": 0.0056, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.4430769230769231, |
| "grad_norm": 0.011858578771352768, |
| "learning_rate": 0.00018400413970907974, |
| "loss": 0.0057, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.44635897435897437, |
| "grad_norm": 0.011236813850700855, |
| "learning_rate": 0.00018372368398600927, |
| "loss": 0.0056, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.44964102564102565, |
| "grad_norm": 0.01114533469080925, |
| "learning_rate": 0.00018344103515134492, |
| "loss": 0.0058, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.45292307692307693, |
| "grad_norm": 0.009413519874215126, |
| "learning_rate": 0.00018315620161324538, |
| "loss": 0.0056, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.4562051282051282, |
| "grad_norm": 0.009282637387514114, |
| "learning_rate": 0.0001828691918448594, |
| "loss": 0.0056, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.4594871794871795, |
| "grad_norm": 0.008740304969251156, |
| "learning_rate": 0.00018258001438407344, |
| "loss": 0.0057, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.46276923076923077, |
| "grad_norm": 0.008844515308737755, |
| "learning_rate": 0.00018228867783325804, |
| "loss": 0.0056, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.46605128205128205, |
| "grad_norm": 0.008487056940793991, |
| "learning_rate": 0.00018199519085901165, |
| "loss": 0.0055, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.4693333333333333, |
| "grad_norm": 0.008857525885105133, |
| "learning_rate": 0.000181699562191903, |
| "loss": 0.0057, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.4726153846153846, |
| "grad_norm": 0.008444939740002155, |
| "learning_rate": 0.00018140180062621117, |
| "loss": 0.0056, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.4758974358974359, |
| "grad_norm": 0.008454709313809872, |
| "learning_rate": 0.00018110191501966423, |
| "loss": 0.0056, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.47917948717948716, |
| "grad_norm": 0.008842523209750652, |
| "learning_rate": 0.00018079991429317553, |
| "loss": 0.0056, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.48246153846153844, |
| "grad_norm": 0.008386999368667603, |
| "learning_rate": 0.00018049580743057853, |
| "loss": 0.0055, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.4857435897435897, |
| "grad_norm": 0.008592522703111172, |
| "learning_rate": 0.00018018960347835936, |
| "loss": 0.0055, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.489025641025641, |
| "grad_norm": 0.008794533088803291, |
| "learning_rate": 0.00017988131154538783, |
| "loss": 0.0058, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.49230769230769234, |
| "grad_norm": 0.008663009852170944, |
| "learning_rate": 0.00017957094080264634, |
| "loss": 0.0055, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.49230769230769234, |
| "eval_loss": 0.007762688212096691, |
| "eval_runtime": 1.6988, |
| "eval_samples_per_second": 29.433, |
| "eval_steps_per_second": 29.433, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.4955897435897436, |
| "grad_norm": 0.0282682366669178, |
| "learning_rate": 0.00017925850048295725, |
| "loss": 0.0074, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.4988717948717949, |
| "grad_norm": 0.027021881192922592, |
| "learning_rate": 0.00017894399988070803, |
| "loss": 0.0072, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.5021538461538462, |
| "grad_norm": 0.028071371838450432, |
| "learning_rate": 0.00017862744835157494, |
| "loss": 0.0074, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.5054358974358975, |
| "grad_norm": 0.026930296793580055, |
| "learning_rate": 0.00017830885531224457, |
| "loss": 0.0072, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.5087179487179487, |
| "grad_norm": 0.022877950221300125, |
| "learning_rate": 0.00017798823024013383, |
| "loss": 0.0069, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.512, |
| "grad_norm": 0.02152320370078087, |
| "learning_rate": 0.00017766558267310798, |
| "loss": 0.0068, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.5152820512820513, |
| "grad_norm": 0.01885562390089035, |
| "learning_rate": 0.00017734092220919682, |
| "loss": 0.0065, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.5185641025641026, |
| "grad_norm": 0.017150552943348885, |
| "learning_rate": 0.00017701425850630937, |
| "loss": 0.0062, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.5218461538461538, |
| "grad_norm": 0.01740305684506893, |
| "learning_rate": 0.00017668560128194635, |
| "loss": 0.0061, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.5251282051282051, |
| "grad_norm": 0.0199392419308424, |
| "learning_rate": 0.00017635496031291115, |
| "loss": 0.0061, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.5284102564102564, |
| "grad_norm": 0.014733769930899143, |
| "learning_rate": 0.00017602234543501928, |
| "loss": 0.0059, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.5316923076923077, |
| "grad_norm": 0.010931742377579212, |
| "learning_rate": 0.0001756877665428052, |
| "loss": 0.0055, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.534974358974359, |
| "grad_norm": 0.009115135297179222, |
| "learning_rate": 0.00017535123358922866, |
| "loss": 0.0054, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.5382564102564102, |
| "grad_norm": 0.008291061967611313, |
| "learning_rate": 0.000175012756585378, |
| "loss": 0.0054, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.5415384615384615, |
| "grad_norm": 0.007542457897216082, |
| "learning_rate": 0.00017467234560017284, |
| "loss": 0.0053, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.5448205128205128, |
| "grad_norm": 0.007635565474629402, |
| "learning_rate": 0.0001743300107600642, |
| "loss": 0.0053, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.5481025641025641, |
| "grad_norm": 0.00773365143686533, |
| "learning_rate": 0.0001739857622487334, |
| "loss": 0.0053, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.5513846153846154, |
| "grad_norm": 0.007943989709019661, |
| "learning_rate": 0.00017363961030678927, |
| "loss": 0.0052, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.5546666666666666, |
| "grad_norm": 0.008328091353178024, |
| "learning_rate": 0.00017329156523146323, |
| "loss": 0.0054, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.5579487179487179, |
| "grad_norm": 0.008655044250190258, |
| "learning_rate": 0.00017294163737630305, |
| "loss": 0.0052, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.5612307692307692, |
| "grad_norm": 0.008740806020796299, |
| "learning_rate": 0.00017258983715086505, |
| "loss": 0.0054, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.5645128205128205, |
| "grad_norm": 0.00872563011944294, |
| "learning_rate": 0.00017223617502040427, |
| "loss": 0.0053, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.5677948717948718, |
| "grad_norm": 0.009066494181752205, |
| "learning_rate": 0.00017188066150556307, |
| "loss": 0.0054, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.571076923076923, |
| "grad_norm": 0.008819897659122944, |
| "learning_rate": 0.0001715233071820584, |
| "loss": 0.0053, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.5743589743589743, |
| "grad_norm": 0.009242737665772438, |
| "learning_rate": 0.00017116412268036708, |
| "loss": 0.0053, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.5743589743589743, |
| "eval_loss": 0.008101023733615875, |
| "eval_runtime": 1.6961, |
| "eval_samples_per_second": 29.479, |
| "eval_steps_per_second": 29.479, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.5776410256410256, |
| "grad_norm": 0.009131861850619316, |
| "learning_rate": 0.00017080311868540943, |
| "loss": 0.0052, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.5809230769230769, |
| "grad_norm": 0.008674906566739082, |
| "learning_rate": 0.00017044030593623167, |
| "loss": 0.0052, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.5842051282051282, |
| "grad_norm": 0.008941737934947014, |
| "learning_rate": 0.00017007569522568627, |
| "loss": 0.0053, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.5874871794871794, |
| "grad_norm": 0.008659431710839272, |
| "learning_rate": 0.00016970929740011103, |
| "loss": 0.0053, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.5907692307692308, |
| "grad_norm": 0.008444879204034805, |
| "learning_rate": 0.00016934112335900621, |
| "loss": 0.0052, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.5940512820512821, |
| "grad_norm": 0.008231930434703827, |
| "learning_rate": 0.0001689711840547106, |
| "loss": 0.0052, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.5973333333333334, |
| "grad_norm": 0.0077408417128026485, |
| "learning_rate": 0.0001685994904920754, |
| "loss": 0.0053, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.6006153846153847, |
| "grad_norm": 0.007263463456183672, |
| "learning_rate": 0.00016822605372813717, |
| "loss": 0.0051, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.6038974358974359, |
| "grad_norm": 0.0077566043473780155, |
| "learning_rate": 0.00016785088487178854, |
| "loss": 0.0051, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.6071794871794872, |
| "grad_norm": 0.0069167339242994785, |
| "learning_rate": 0.00016747399508344808, |
| "loss": 0.0052, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.6104615384615385, |
| "grad_norm": 0.006657553371042013, |
| "learning_rate": 0.0001670953955747281, |
| "loss": 0.0051, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.6137435897435898, |
| "grad_norm": 0.00632864935323596, |
| "learning_rate": 0.0001667150976081012, |
| "loss": 0.0051, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.6170256410256411, |
| "grad_norm": 0.006999644450843334, |
| "learning_rate": 0.00016633311249656535, |
| "loss": 0.0053, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.6203076923076923, |
| "grad_norm": 0.006461160257458687, |
| "learning_rate": 0.000165949451603307, |
| "loss": 0.0052, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.6235897435897436, |
| "grad_norm": 0.00639855582267046, |
| "learning_rate": 0.00016556412634136347, |
| "loss": 0.0052, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.6268717948717949, |
| "grad_norm": 0.006086940411478281, |
| "learning_rate": 0.0001651771481732832, |
| "loss": 0.0051, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.6301538461538462, |
| "grad_norm": 0.006296528037637472, |
| "learning_rate": 0.00016478852861078486, |
| "loss": 0.0052, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.6334358974358975, |
| "grad_norm": 0.006205971818417311, |
| "learning_rate": 0.0001643982792144148, |
| "loss": 0.0052, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.6367179487179487, |
| "grad_norm": 0.006357920356094837, |
| "learning_rate": 0.0001640064115932033, |
| "loss": 0.0052, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 0.006061731372028589, |
| "learning_rate": 0.00016361293740431904, |
| "loss": 0.0052, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.6432820512820513, |
| "grad_norm": 0.005866146180778742, |
| "learning_rate": 0.00016321786835272244, |
| "loss": 0.0051, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.6465641025641026, |
| "grad_norm": 0.005625640973448753, |
| "learning_rate": 0.00016282121619081753, |
| "loss": 0.0051, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.6498461538461539, |
| "grad_norm": 0.005495929159224033, |
| "learning_rate": 0.0001624229927181022, |
| "loss": 0.0051, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.6531282051282051, |
| "grad_norm": 0.005448007490485907, |
| "learning_rate": 0.0001620232097808173, |
| "loss": 0.0051, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.6564102564102564, |
| "grad_norm": 0.005435483064502478, |
| "learning_rate": 0.00016162187927159415, |
| "loss": 0.0051, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.6564102564102564, |
| "eval_loss": 0.008042293600738049, |
| "eval_runtime": 1.0817, |
| "eval_samples_per_second": 46.222, |
| "eval_steps_per_second": 46.222, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.6596923076923077, |
| "grad_norm": 0.015730086714029312, |
| "learning_rate": 0.00016121901312910085, |
| "loss": 0.006, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.662974358974359, |
| "grad_norm": 0.01854288950562477, |
| "learning_rate": 0.00016081462333768703, |
| "loss": 0.0063, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.6662564102564102, |
| "grad_norm": 0.016778547316789627, |
| "learning_rate": 0.0001604087219270275, |
| "loss": 0.0062, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.6695384615384615, |
| "grad_norm": 0.014833272434771061, |
| "learning_rate": 0.00016000132097176422, |
| "loss": 0.0059, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.6728205128205128, |
| "grad_norm": 0.016074592247605324, |
| "learning_rate": 0.0001595924325911472, |
| "loss": 0.0061, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.6761025641025641, |
| "grad_norm": 0.01580972597002983, |
| "learning_rate": 0.0001591820689486739, |
| "loss": 0.0059, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.6793846153846154, |
| "grad_norm": 0.01449266355484724, |
| "learning_rate": 0.00015877024225172766, |
| "loss": 0.0058, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.6826666666666666, |
| "grad_norm": 0.012255052104592323, |
| "learning_rate": 0.00015835696475121418, |
| "loss": 0.0056, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.6859487179487179, |
| "grad_norm": 0.01258911658078432, |
| "learning_rate": 0.0001579422487411972, |
| "loss": 0.0055, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.6892307692307692, |
| "grad_norm": 0.014708973467350006, |
| "learning_rate": 0.00015752610655853314, |
| "loss": 0.0056, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.6925128205128205, |
| "grad_norm": 0.012465777806937695, |
| "learning_rate": 0.00015710855058250346, |
| "loss": 0.0054, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.6957948717948718, |
| "grad_norm": 0.008663519285619259, |
| "learning_rate": 0.00015668959323444695, |
| "loss": 0.0051, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.699076923076923, |
| "grad_norm": 0.006837547291070223, |
| "learning_rate": 0.00015626924697738993, |
| "loss": 0.0051, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.7023589743589743, |
| "grad_norm": 0.0062984684482216835, |
| "learning_rate": 0.00015584752431567578, |
| "loss": 0.0049, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.7056410256410256, |
| "grad_norm": 0.005604996811598539, |
| "learning_rate": 0.00015542443779459247, |
| "loss": 0.005, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.7089230769230769, |
| "grad_norm": 0.005338320974260569, |
| "learning_rate": 0.000155, |
| "loss": 0.005, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.7122051282051282, |
| "grad_norm": 0.004744368139654398, |
| "learning_rate": 0.00015457422355795545, |
| "loss": 0.0049, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.7154871794871794, |
| "grad_norm": 0.004858638159930706, |
| "learning_rate": 0.0001541471211343377, |
| "loss": 0.0049, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.7187692307692307, |
| "grad_norm": 0.0049782246351242065, |
| "learning_rate": 0.0001537187054344706, |
| "loss": 0.005, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.7220512820512821, |
| "grad_norm": 0.005176792852580547, |
| "learning_rate": 0.0001532889892027449, |
| "loss": 0.005, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.7253333333333334, |
| "grad_norm": 0.00572836771607399, |
| "learning_rate": 0.00015285798522223922, |
| "loss": 0.0052, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.7286153846153847, |
| "grad_norm": 0.005091918632388115, |
| "learning_rate": 0.0001524257063143398, |
| "loss": 0.0049, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.7318974358974359, |
| "grad_norm": 0.005498278420418501, |
| "learning_rate": 0.00015199216533835904, |
| "loss": 0.0049, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.7351794871794872, |
| "grad_norm": 0.0057638660073280334, |
| "learning_rate": 0.00015155737519115307, |
| "loss": 0.005, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.7384615384615385, |
| "grad_norm": 0.005555428098887205, |
| "learning_rate": 0.00015112134880673788, |
| "loss": 0.005, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.7384615384615385, |
| "eval_loss": 0.00847731251269579, |
| "eval_runtime": 1.7103, |
| "eval_samples_per_second": 29.235, |
| "eval_steps_per_second": 29.235, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.7417435897435898, |
| "grad_norm": 0.005984974093735218, |
| "learning_rate": 0.0001506840991559048, |
| "loss": 0.005, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.7450256410256411, |
| "grad_norm": 0.006116104777902365, |
| "learning_rate": 0.0001502456392458345, |
| "loss": 0.005, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.7483076923076923, |
| "grad_norm": 0.0061400169506669044, |
| "learning_rate": 0.00014980598211971014, |
| "loss": 0.005, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.7515897435897436, |
| "grad_norm": 0.010314466431736946, |
| "learning_rate": 0.0001493651408563293, |
| "loss": 0.0051, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.7548717948717949, |
| "grad_norm": 0.005545389838516712, |
| "learning_rate": 0.00014892312856971496, |
| "loss": 0.005, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.7581538461538462, |
| "grad_norm": 0.005526201333850622, |
| "learning_rate": 0.0001484799584087254, |
| "loss": 0.005, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.7614358974358975, |
| "grad_norm": 0.0059164236299693584, |
| "learning_rate": 0.00014803564355666296, |
| "loss": 0.0051, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.7647179487179487, |
| "grad_norm": 0.005439561791718006, |
| "learning_rate": 0.00014759019723088198, |
| "loss": 0.0048, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.768, |
| "grad_norm": 0.0054712677374482155, |
| "learning_rate": 0.00014714363268239554, |
| "loss": 0.005, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.7712820512820513, |
| "grad_norm": 0.031185509636998177, |
| "learning_rate": 0.00014669596319548132, |
| "loss": 0.0053, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.7745641025641026, |
| "grad_norm": 0.005096518434584141, |
| "learning_rate": 0.00014624720208728637, |
| "loss": 0.0049, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.7778461538461539, |
| "grad_norm": 0.004755071364343166, |
| "learning_rate": 0.000145797362707431, |
| "loss": 0.0049, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.7811282051282051, |
| "grad_norm": 0.004966154228895903, |
| "learning_rate": 0.00014534645843761168, |
| "loss": 0.005, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.7844102564102564, |
| "grad_norm": 0.0049712988547980785, |
| "learning_rate": 0.00014489450269120286, |
| "loss": 0.005, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.7876923076923077, |
| "grad_norm": 0.004542510025203228, |
| "learning_rate": 0.00014444150891285807, |
| "loss": 0.0049, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.790974358974359, |
| "grad_norm": 0.004633238539099693, |
| "learning_rate": 0.00014398749057810997, |
| "loss": 0.0049, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.7942564102564103, |
| "grad_norm": 0.004606141243129969, |
| "learning_rate": 0.0001435324611929693, |
| "loss": 0.005, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.7975384615384615, |
| "grad_norm": 0.004728773143142462, |
| "learning_rate": 0.00014307643429352333, |
| "loss": 0.005, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.8008205128205128, |
| "grad_norm": 0.005116627085953951, |
| "learning_rate": 0.00014261942344553314, |
| "loss": 0.005, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.8041025641025641, |
| "grad_norm": 0.006209938321262598, |
| "learning_rate": 0.00014216144224403002, |
| "loss": 0.0052, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.8073846153846154, |
| "grad_norm": 0.004638911224901676, |
| "learning_rate": 0.00014170250431291105, |
| "loss": 0.0049, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.8106666666666666, |
| "grad_norm": 0.0047111185267567635, |
| "learning_rate": 0.00014124262330453375, |
| "loss": 0.0048, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.8139487179487179, |
| "grad_norm": 0.004355636890977621, |
| "learning_rate": 0.0001407818128993102, |
| "loss": 0.005, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.8172307692307692, |
| "grad_norm": 0.005983169190585613, |
| "learning_rate": 0.0001403200868052998, |
| "loss": 0.0052, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.8205128205128205, |
| "grad_norm": 0.8480205535888672, |
| "learning_rate": 0.00013985745875780173, |
| "loss": 0.0246, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.8205128205128205, |
| "eval_loss": 0.00733967823907733, |
| "eval_runtime": 1.7185, |
| "eval_samples_per_second": 29.095, |
| "eval_steps_per_second": 29.095, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.8237948717948718, |
| "grad_norm": 0.01247889269143343, |
| "learning_rate": 0.00013939394251894603, |
| "loss": 0.0056, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.827076923076923, |
| "grad_norm": 0.012979921884834766, |
| "learning_rate": 0.00013892955187728455, |
| "loss": 0.0058, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.8303589743589743, |
| "grad_norm": 0.01275433786213398, |
| "learning_rate": 0.00013846430064738064, |
| "loss": 0.0058, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.8336410256410256, |
| "grad_norm": 0.013448723591864109, |
| "learning_rate": 0.00013799820266939818, |
| "loss": 0.0058, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.8369230769230769, |
| "grad_norm": 0.013343177735805511, |
| "learning_rate": 0.00013753127180868982, |
| "loss": 0.0059, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.8402051282051282, |
| "grad_norm": 0.013337034732103348, |
| "learning_rate": 0.00013706352195538458, |
| "loss": 0.0057, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.8434871794871794, |
| "grad_norm": 0.014081962406635284, |
| "learning_rate": 0.0001365949670239747, |
| "loss": 0.0059, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.8467692307692307, |
| "grad_norm": 0.01269819401204586, |
| "learning_rate": 0.0001361256209529016, |
| "loss": 0.0056, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.850051282051282, |
| "grad_norm": 0.012389056384563446, |
| "learning_rate": 0.0001356554977041414, |
| "loss": 0.0055, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.8533333333333334, |
| "grad_norm": 0.013030463829636574, |
| "learning_rate": 0.00013518461126278933, |
| "loss": 0.0055, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.8566153846153847, |
| "grad_norm": 0.013525651767849922, |
| "learning_rate": 0.00013471297563664392, |
| "loss": 0.0053, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.859897435897436, |
| "grad_norm": 0.012331862933933735, |
| "learning_rate": 0.0001342406048557904, |
| "loss": 0.0053, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.8631794871794872, |
| "grad_norm": 0.009830540977418423, |
| "learning_rate": 0.00013376751297218287, |
| "loss": 0.0051, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.8664615384615385, |
| "grad_norm": 0.008250358514487743, |
| "learning_rate": 0.00013329371405922688, |
| "loss": 0.005, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.8697435897435898, |
| "grad_norm": 0.0070004030130803585, |
| "learning_rate": 0.00013281922221136037, |
| "loss": 0.0049, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.8730256410256411, |
| "grad_norm": 0.006113375071436167, |
| "learning_rate": 0.00013234405154363446, |
| "loss": 0.005, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.8763076923076923, |
| "grad_norm": 0.005196304526180029, |
| "learning_rate": 0.00013186821619129378, |
| "loss": 0.0048, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.8795897435897436, |
| "grad_norm": 0.004914364777505398, |
| "learning_rate": 0.0001313917303093556, |
| "loss": 0.0049, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.8828717948717949, |
| "grad_norm": 0.004722020123153925, |
| "learning_rate": 0.00013091460807218913, |
| "loss": 0.0049, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.8861538461538462, |
| "grad_norm": 0.004352330230176449, |
| "learning_rate": 0.0001304368636730936, |
| "loss": 0.0049, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.8894358974358975, |
| "grad_norm": 0.004314970225095749, |
| "learning_rate": 0.00012995851132387623, |
| "loss": 0.0049, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.8927179487179487, |
| "grad_norm": 0.004017701838165522, |
| "learning_rate": 0.00012947956525442925, |
| "loss": 0.0048, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.896, |
| "grad_norm": 0.004261241294443607, |
| "learning_rate": 0.00012900003971230684, |
| "loss": 0.0048, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.8992820512820513, |
| "grad_norm": 0.004208185710012913, |
| "learning_rate": 0.00012851994896230116, |
| "loss": 0.0048, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.9025641025641026, |
| "grad_norm": 0.004194408655166626, |
| "learning_rate": 0.00012803930728601785, |
| "loss": 0.0048, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.9025641025641026, |
| "eval_loss": 0.005147330928593874, |
| "eval_runtime": 1.6972, |
| "eval_samples_per_second": 29.46, |
| "eval_steps_per_second": 29.46, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.9058461538461539, |
| "grad_norm": 0.00444167573004961, |
| "learning_rate": 0.00012755812898145155, |
| "loss": 0.0047, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.9091282051282051, |
| "grad_norm": 0.004847261123359203, |
| "learning_rate": 0.0001270764283625603, |
| "loss": 0.005, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.9124102564102564, |
| "grad_norm": 0.004441663157194853, |
| "learning_rate": 0.0001265942197588397, |
| "loss": 0.0047, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.9156923076923077, |
| "grad_norm": 0.0051227472722530365, |
| "learning_rate": 0.00012611151751489697, |
| "loss": 0.0049, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.918974358974359, |
| "grad_norm": 0.004962059669196606, |
| "learning_rate": 0.00012562833599002375, |
| "loss": 0.0049, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.9222564102564103, |
| "grad_norm": 0.004343735985457897, |
| "learning_rate": 0.00012514468955776936, |
| "loss": 0.0048, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.9255384615384615, |
| "grad_norm": 0.004809789825230837, |
| "learning_rate": 0.000124660592605513, |
| "loss": 0.0049, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.9288205128205128, |
| "grad_norm": 0.022584544494748116, |
| "learning_rate": 0.0001241760595340358, |
| "loss": 0.005, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.9321025641025641, |
| "grad_norm": 0.004884497728198767, |
| "learning_rate": 0.0001236911047570925, |
| "loss": 0.0048, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.9353846153846154, |
| "grad_norm": 0.0044205994345247746, |
| "learning_rate": 0.00012320574270098254, |
| "loss": 0.0049, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.9386666666666666, |
| "grad_norm": 0.0042507946491241455, |
| "learning_rate": 0.0001227199878041211, |
| "loss": 0.0048, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.9419487179487179, |
| "grad_norm": 0.003997748717665672, |
| "learning_rate": 0.0001222338545166093, |
| "loss": 0.0047, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.9452307692307692, |
| "grad_norm": 0.005669133272022009, |
| "learning_rate": 0.00012174735729980466, |
| "loss": 0.0049, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.9485128205128205, |
| "grad_norm": 0.004083422012627125, |
| "learning_rate": 0.00012126051062589075, |
| "loss": 0.0048, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.9517948717948718, |
| "grad_norm": 0.00390887726098299, |
| "learning_rate": 0.00012077332897744662, |
| "loss": 0.0048, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.955076923076923, |
| "grad_norm": 0.003818151541054249, |
| "learning_rate": 0.0001202858268470162, |
| "loss": 0.0048, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.9583589743589743, |
| "grad_norm": 0.004068729933351278, |
| "learning_rate": 0.00011979801873667682, |
| "loss": 0.0049, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.9616410256410256, |
| "grad_norm": 0.003813754068687558, |
| "learning_rate": 0.00011930991915760819, |
| "loss": 0.0049, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.9649230769230769, |
| "grad_norm": 0.004149145446717739, |
| "learning_rate": 0.0001188215426296605, |
| "loss": 0.0049, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.9682051282051282, |
| "grad_norm": 0.0038277863059192896, |
| "learning_rate": 0.00011833290368092243, |
| "loss": 0.0049, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.9714871794871794, |
| "grad_norm": 0.005801186431199312, |
| "learning_rate": 0.00011784401684728925, |
| "loss": 0.0048, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.9747692307692307, |
| "grad_norm": 0.00392820592969656, |
| "learning_rate": 0.00011735489667203014, |
| "loss": 0.005, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.978051282051282, |
| "grad_norm": 0.0036806361749768257, |
| "learning_rate": 0.00011686555770535575, |
| "loss": 0.0048, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.9813333333333333, |
| "grad_norm": 0.0037270234897732735, |
| "learning_rate": 0.00011637601450398507, |
| "loss": 0.0048, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.9846153846153847, |
| "grad_norm": 0.032717231661081314, |
| "learning_rate": 0.00011588628163071289, |
| "loss": 0.0054, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.9846153846153847, |
| "eval_loss": 0.005068204831331968, |
| "eval_runtime": 1.6978, |
| "eval_samples_per_second": 29.45, |
| "eval_steps_per_second": 29.45, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.987897435897436, |
| "grad_norm": 0.009295407682657242, |
| "learning_rate": 0.0001153963736539761, |
| "loss": 0.0053, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.9911794871794872, |
| "grad_norm": 0.006343138869851828, |
| "learning_rate": 0.00011490630514742058, |
| "loss": 0.0049, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.9944615384615385, |
| "grad_norm": 0.004029371310025454, |
| "learning_rate": 0.00011441609068946764, |
| "loss": 0.0047, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.9977435897435898, |
| "grad_norm": 0.004377037286758423, |
| "learning_rate": 0.00011392574486288026, |
| "loss": 0.0048, |
| "step": 304 |
| }, |
| { |
| "epoch": 1.001025641025641, |
| "grad_norm": 0.004222679417580366, |
| "learning_rate": 0.00011343528225432935, |
| "loss": 0.0065, |
| "step": 305 |
| }, |
| { |
| "epoch": 1.0043076923076923, |
| "grad_norm": 0.011929094791412354, |
| "learning_rate": 0.00011294471745395987, |
| "loss": 0.0057, |
| "step": 306 |
| }, |
| { |
| "epoch": 1.0075897435897436, |
| "grad_norm": 0.011988217942416668, |
| "learning_rate": 0.00011245406505495668, |
| "loss": 0.0055, |
| "step": 307 |
| }, |
| { |
| "epoch": 1.010871794871795, |
| "grad_norm": 0.011446448042988777, |
| "learning_rate": 0.00011196333965311053, |
| "loss": 0.0054, |
| "step": 308 |
| }, |
| { |
| "epoch": 1.0141538461538462, |
| "grad_norm": 0.010524057783186436, |
| "learning_rate": 0.00011147255584638383, |
| "loss": 0.0054, |
| "step": 309 |
| }, |
| { |
| "epoch": 1.0174358974358975, |
| "grad_norm": 0.01129401195794344, |
| "learning_rate": 0.00011098172823447641, |
| "loss": 0.0056, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.0207179487179487, |
| "grad_norm": 0.010420313104987144, |
| "learning_rate": 0.00011049087141839126, |
| "loss": 0.0054, |
| "step": 311 |
| }, |
| { |
| "epoch": 1.024, |
| "grad_norm": 0.009146089665591717, |
| "learning_rate": 0.00011000000000000002, |
| "loss": 0.0052, |
| "step": 312 |
| }, |
| { |
| "epoch": 1.0272820512820513, |
| "grad_norm": 0.008935105055570602, |
| "learning_rate": 0.00010950912858160875, |
| "loss": 0.0052, |
| "step": 313 |
| }, |
| { |
| "epoch": 1.0305641025641026, |
| "grad_norm": 0.009841774590313435, |
| "learning_rate": 0.0001090182717655236, |
| "loss": 0.0051, |
| "step": 314 |
| }, |
| { |
| "epoch": 1.0338461538461539, |
| "grad_norm": 0.010123356245458126, |
| "learning_rate": 0.0001085274441536162, |
| "loss": 0.005, |
| "step": 315 |
| }, |
| { |
| "epoch": 1.0371282051282051, |
| "grad_norm": 0.00827121827751398, |
| "learning_rate": 0.00010803666034688951, |
| "loss": 0.0051, |
| "step": 316 |
| }, |
| { |
| "epoch": 1.0404102564102564, |
| "grad_norm": 0.0060403901152312756, |
| "learning_rate": 0.00010754593494504334, |
| "loss": 0.0048, |
| "step": 317 |
| }, |
| { |
| "epoch": 1.0436923076923077, |
| "grad_norm": 0.005550421308726072, |
| "learning_rate": 0.00010705528254604016, |
| "loss": 0.0047, |
| "step": 318 |
| }, |
| { |
| "epoch": 1.046974358974359, |
| "grad_norm": 0.005134978331625462, |
| "learning_rate": 0.00010656471774567066, |
| "loss": 0.0048, |
| "step": 319 |
| }, |
| { |
| "epoch": 1.0502564102564103, |
| "grad_norm": 0.004197476897388697, |
| "learning_rate": 0.00010607425513711977, |
| "loss": 0.0048, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.0535384615384615, |
| "grad_norm": 0.0037515375297516584, |
| "learning_rate": 0.0001055839093105324, |
| "loss": 0.0047, |
| "step": 321 |
| }, |
| { |
| "epoch": 1.0568205128205128, |
| "grad_norm": 0.0034930245019495487, |
| "learning_rate": 0.00010509369485257942, |
| "loss": 0.0048, |
| "step": 322 |
| }, |
| { |
| "epoch": 1.060102564102564, |
| "grad_norm": 0.003294882597401738, |
| "learning_rate": 0.00010460362634602392, |
| "loss": 0.0047, |
| "step": 323 |
| }, |
| { |
| "epoch": 1.0633846153846154, |
| "grad_norm": 0.0033549105282872915, |
| "learning_rate": 0.00010411371836928712, |
| "loss": 0.0047, |
| "step": 324 |
| }, |
| { |
| "epoch": 1.0666666666666667, |
| "grad_norm": 0.0033656777814030647, |
| "learning_rate": 0.00010362398549601493, |
| "loss": 0.0049, |
| "step": 325 |
| }, |
| { |
| "epoch": 1.0666666666666667, |
| "eval_loss": 0.005108626559376717, |
| "eval_runtime": 1.7164, |
| "eval_samples_per_second": 29.131, |
| "eval_steps_per_second": 29.131, |
| "step": 325 |
| }, |
| { |
| "epoch": 1.069948717948718, |
| "grad_norm": 0.0033352887257933617, |
| "learning_rate": 0.00010313444229464429, |
| "loss": 0.0048, |
| "step": 326 |
| }, |
| { |
| "epoch": 1.0732307692307692, |
| "grad_norm": 0.0032802869100123644, |
| "learning_rate": 0.00010264510332796991, |
| "loss": 0.0048, |
| "step": 327 |
| }, |
| { |
| "epoch": 1.0765128205128205, |
| "grad_norm": 0.003124071517959237, |
| "learning_rate": 0.00010215598315271076, |
| "loss": 0.0048, |
| "step": 328 |
| }, |
| { |
| "epoch": 1.0797948717948718, |
| "grad_norm": 0.0035496705677360296, |
| "learning_rate": 0.00010166709631907762, |
| "loss": 0.0047, |
| "step": 329 |
| }, |
| { |
| "epoch": 1.083076923076923, |
| "grad_norm": 0.0036747653502970934, |
| "learning_rate": 0.00010117845737033956, |
| "loss": 0.0048, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.0863589743589743, |
| "grad_norm": 0.0033675089944154024, |
| "learning_rate": 0.00010069008084239182, |
| "loss": 0.0048, |
| "step": 331 |
| }, |
| { |
| "epoch": 1.0896410256410256, |
| "grad_norm": 0.0036912565119564533, |
| "learning_rate": 0.00010020198126332321, |
| "loss": 0.0048, |
| "step": 332 |
| }, |
| { |
| "epoch": 1.0929230769230769, |
| "grad_norm": 0.003969641402363777, |
| "learning_rate": 9.971417315298381e-05, |
| "loss": 0.0048, |
| "step": 333 |
| }, |
| { |
| "epoch": 1.0962051282051282, |
| "grad_norm": 0.003843925893306732, |
| "learning_rate": 9.92266710225534e-05, |
| "loss": 0.0049, |
| "step": 334 |
| }, |
| { |
| "epoch": 1.0994871794871794, |
| "grad_norm": 0.004053202457726002, |
| "learning_rate": 9.873948937410929e-05, |
| "loss": 0.0049, |
| "step": 335 |
| }, |
| { |
| "epoch": 1.1027692307692307, |
| "grad_norm": 0.00362073234282434, |
| "learning_rate": 9.825264270019538e-05, |
| "loss": 0.0048, |
| "step": 336 |
| }, |
| { |
| "epoch": 1.106051282051282, |
| "grad_norm": 0.004117041826248169, |
| "learning_rate": 9.776614548339074e-05, |
| "loss": 0.0049, |
| "step": 337 |
| }, |
| { |
| "epoch": 1.1093333333333333, |
| "grad_norm": 0.003928050398826599, |
| "learning_rate": 9.728001219587897e-05, |
| "loss": 0.0048, |
| "step": 338 |
| }, |
| { |
| "epoch": 1.1126153846153846, |
| "grad_norm": 0.004026252776384354, |
| "learning_rate": 9.679425729901746e-05, |
| "loss": 0.0049, |
| "step": 339 |
| }, |
| { |
| "epoch": 1.1158974358974358, |
| "grad_norm": 0.003736741142347455, |
| "learning_rate": 9.630889524290749e-05, |
| "loss": 0.0048, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.1191794871794871, |
| "grad_norm": 0.0036812988109886646, |
| "learning_rate": 9.582394046596421e-05, |
| "loss": 0.0049, |
| "step": 341 |
| }, |
| { |
| "epoch": 1.1224615384615384, |
| "grad_norm": 0.0035057140048593283, |
| "learning_rate": 9.533940739448703e-05, |
| "loss": 0.0048, |
| "step": 342 |
| }, |
| { |
| "epoch": 1.1257435897435897, |
| "grad_norm": 0.0037068051751703024, |
| "learning_rate": 9.485531044223068e-05, |
| "loss": 0.0048, |
| "step": 343 |
| }, |
| { |
| "epoch": 1.129025641025641, |
| "grad_norm": 0.0039064399898052216, |
| "learning_rate": 9.437166400997628e-05, |
| "loss": 0.0047, |
| "step": 344 |
| }, |
| { |
| "epoch": 1.1323076923076922, |
| "grad_norm": 0.00368549139238894, |
| "learning_rate": 9.388848248510309e-05, |
| "loss": 0.0047, |
| "step": 345 |
| }, |
| { |
| "epoch": 1.1355897435897435, |
| "grad_norm": 0.0034059761092066765, |
| "learning_rate": 9.340578024116031e-05, |
| "loss": 0.0048, |
| "step": 346 |
| }, |
| { |
| "epoch": 1.1388717948717948, |
| "grad_norm": 0.0036056831013411283, |
| "learning_rate": 9.292357163743977e-05, |
| "loss": 0.0048, |
| "step": 347 |
| }, |
| { |
| "epoch": 1.142153846153846, |
| "grad_norm": 0.0035778083838522434, |
| "learning_rate": 9.244187101854847e-05, |
| "loss": 0.0047, |
| "step": 348 |
| }, |
| { |
| "epoch": 1.1454358974358974, |
| "grad_norm": 0.0036830275785177946, |
| "learning_rate": 9.196069271398216e-05, |
| "loss": 0.0048, |
| "step": 349 |
| }, |
| { |
| "epoch": 1.1487179487179486, |
| "grad_norm": 0.0038206533063203096, |
| "learning_rate": 9.148005103769887e-05, |
| "loss": 0.0049, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.1487179487179486, |
| "eval_loss": 0.005071515217423439, |
| "eval_runtime": 1.7105, |
| "eval_samples_per_second": 29.231, |
| "eval_steps_per_second": 29.231, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.152, |
| "grad_norm": 0.003768439870327711, |
| "learning_rate": 9.099996028769313e-05, |
| "loss": 0.0047, |
| "step": 351 |
| }, |
| { |
| "epoch": 1.1552820512820512, |
| "grad_norm": 0.0035322019830346107, |
| "learning_rate": 9.052043474557075e-05, |
| "loss": 0.0047, |
| "step": 352 |
| }, |
| { |
| "epoch": 1.1585641025641025, |
| "grad_norm": 0.003624577773734927, |
| "learning_rate": 9.004148867612379e-05, |
| "loss": 0.0048, |
| "step": 353 |
| }, |
| { |
| "epoch": 1.1618461538461538, |
| "grad_norm": 0.00348327262327075, |
| "learning_rate": 8.956313632690642e-05, |
| "loss": 0.0047, |
| "step": 354 |
| }, |
| { |
| "epoch": 1.1651282051282053, |
| "grad_norm": 0.0037093586288392544, |
| "learning_rate": 8.908539192781092e-05, |
| "loss": 0.005, |
| "step": 355 |
| }, |
| { |
| "epoch": 1.1684102564102563, |
| "grad_norm": 0.00967591442167759, |
| "learning_rate": 8.860826969064444e-05, |
| "loss": 0.0055, |
| "step": 356 |
| }, |
| { |
| "epoch": 1.1716923076923078, |
| "grad_norm": 0.009883382357656956, |
| "learning_rate": 8.813178380870625e-05, |
| "loss": 0.0053, |
| "step": 357 |
| }, |
| { |
| "epoch": 1.1749743589743589, |
| "grad_norm": 0.009646824561059475, |
| "learning_rate": 8.765594845636553e-05, |
| "loss": 0.0053, |
| "step": 358 |
| }, |
| { |
| "epoch": 1.1782564102564104, |
| "grad_norm": 0.009988558478653431, |
| "learning_rate": 8.718077778863966e-05, |
| "loss": 0.0054, |
| "step": 359 |
| }, |
| { |
| "epoch": 1.1815384615384614, |
| "grad_norm": 0.009603265672922134, |
| "learning_rate": 8.670628594077313e-05, |
| "loss": 0.0054, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.184820512820513, |
| "grad_norm": 0.008899768814444542, |
| "learning_rate": 8.623248702781716e-05, |
| "loss": 0.0052, |
| "step": 361 |
| }, |
| { |
| "epoch": 1.188102564102564, |
| "grad_norm": 0.009325512684881687, |
| "learning_rate": 8.575939514420967e-05, |
| "loss": 0.0053, |
| "step": 362 |
| }, |
| { |
| "epoch": 1.1913846153846155, |
| "grad_norm": 0.008173607289791107, |
| "learning_rate": 8.528702436335611e-05, |
| "loss": 0.0052, |
| "step": 363 |
| }, |
| { |
| "epoch": 1.1946666666666665, |
| "grad_norm": 0.008188599720597267, |
| "learning_rate": 8.481538873721074e-05, |
| "loss": 0.005, |
| "step": 364 |
| }, |
| { |
| "epoch": 1.197948717948718, |
| "grad_norm": 0.008771805092692375, |
| "learning_rate": 8.434450229585867e-05, |
| "loss": 0.0051, |
| "step": 365 |
| }, |
| { |
| "epoch": 1.2012307692307693, |
| "grad_norm": 0.008419954217970371, |
| "learning_rate": 8.38743790470984e-05, |
| "loss": 0.0049, |
| "step": 366 |
| }, |
| { |
| "epoch": 1.2045128205128206, |
| "grad_norm": 0.007303427904844284, |
| "learning_rate": 8.340503297602529e-05, |
| "loss": 0.0048, |
| "step": 367 |
| }, |
| { |
| "epoch": 1.2077948717948719, |
| "grad_norm": 0.006724661216139793, |
| "learning_rate": 8.293647804461544e-05, |
| "loss": 0.0048, |
| "step": 368 |
| }, |
| { |
| "epoch": 1.2110769230769232, |
| "grad_norm": 0.005520752165466547, |
| "learning_rate": 8.24687281913102e-05, |
| "loss": 0.0047, |
| "step": 369 |
| }, |
| { |
| "epoch": 1.2143589743589744, |
| "grad_norm": 0.005496680270880461, |
| "learning_rate": 8.200179733060183e-05, |
| "loss": 0.0049, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.2176410256410257, |
| "grad_norm": 0.004724535159766674, |
| "learning_rate": 8.153569935261935e-05, |
| "loss": 0.0048, |
| "step": 371 |
| }, |
| { |
| "epoch": 1.220923076923077, |
| "grad_norm": 0.004228896461427212, |
| "learning_rate": 8.10704481227155e-05, |
| "loss": 0.0047, |
| "step": 372 |
| }, |
| { |
| "epoch": 1.2242051282051283, |
| "grad_norm": 0.003649340244010091, |
| "learning_rate": 8.060605748105404e-05, |
| "loss": 0.0049, |
| "step": 373 |
| }, |
| { |
| "epoch": 1.2274871794871796, |
| "grad_norm": 0.0032480864319950342, |
| "learning_rate": 8.014254124219835e-05, |
| "loss": 0.0048, |
| "step": 374 |
| }, |
| { |
| "epoch": 1.2307692307692308, |
| "grad_norm": 0.003284406615421176, |
| "learning_rate": 7.96799131947002e-05, |
| "loss": 0.0047, |
| "step": 375 |
| }, |
| { |
| "epoch": 1.2307692307692308, |
| "eval_loss": 0.005098435096442699, |
| "eval_runtime": 1.7095, |
| "eval_samples_per_second": 29.248, |
| "eval_steps_per_second": 29.248, |
| "step": 375 |
| }, |
| { |
| "epoch": 1.2340512820512821, |
| "grad_norm": 0.0030009394977241755, |
| "learning_rate": 7.921818710068983e-05, |
| "loss": 0.0048, |
| "step": 376 |
| }, |
| { |
| "epoch": 1.2373333333333334, |
| "grad_norm": 0.0028776165563613176, |
| "learning_rate": 7.875737669546627e-05, |
| "loss": 0.0048, |
| "step": 377 |
| }, |
| { |
| "epoch": 1.2406153846153847, |
| "grad_norm": 0.0028720826376229525, |
| "learning_rate": 7.829749568708899e-05, |
| "loss": 0.0048, |
| "step": 378 |
| }, |
| { |
| "epoch": 1.243897435897436, |
| "grad_norm": 0.0026297715958207846, |
| "learning_rate": 7.783855775597e-05, |
| "loss": 0.0047, |
| "step": 379 |
| }, |
| { |
| "epoch": 1.2471794871794872, |
| "grad_norm": 0.002790766768157482, |
| "learning_rate": 7.738057655446687e-05, |
| "loss": 0.005, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.2504615384615385, |
| "grad_norm": 0.002748630242422223, |
| "learning_rate": 7.69235657064767e-05, |
| "loss": 0.0047, |
| "step": 381 |
| }, |
| { |
| "epoch": 1.2537435897435898, |
| "grad_norm": 0.0028022800106555223, |
| "learning_rate": 7.646753880703074e-05, |
| "loss": 0.0048, |
| "step": 382 |
| }, |
| { |
| "epoch": 1.257025641025641, |
| "grad_norm": 0.0030396939255297184, |
| "learning_rate": 7.601250942189009e-05, |
| "loss": 0.0048, |
| "step": 383 |
| }, |
| { |
| "epoch": 1.2603076923076924, |
| "grad_norm": 0.003127284813672304, |
| "learning_rate": 7.555849108714192e-05, |
| "loss": 0.0049, |
| "step": 384 |
| }, |
| { |
| "epoch": 1.2635897435897436, |
| "grad_norm": 0.0029194881208240986, |
| "learning_rate": 7.510549730879715e-05, |
| "loss": 0.0047, |
| "step": 385 |
| }, |
| { |
| "epoch": 1.266871794871795, |
| "grad_norm": 0.0032365506049245596, |
| "learning_rate": 7.465354156238835e-05, |
| "loss": 0.0048, |
| "step": 386 |
| }, |
| { |
| "epoch": 1.2701538461538462, |
| "grad_norm": 0.003188034286722541, |
| "learning_rate": 7.420263729256902e-05, |
| "loss": 0.0047, |
| "step": 387 |
| }, |
| { |
| "epoch": 1.2734358974358975, |
| "grad_norm": 0.0029549768660217524, |
| "learning_rate": 7.375279791271368e-05, |
| "loss": 0.0048, |
| "step": 388 |
| }, |
| { |
| "epoch": 1.2767179487179487, |
| "grad_norm": 0.09431561827659607, |
| "learning_rate": 7.330403680451869e-05, |
| "loss": 0.0054, |
| "step": 389 |
| }, |
| { |
| "epoch": 1.28, |
| "grad_norm": 0.0030596228316426277, |
| "learning_rate": 7.285636731760448e-05, |
| "loss": 0.0046, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.2832820512820513, |
| "grad_norm": 0.0031989929266273975, |
| "learning_rate": 7.240980276911804e-05, |
| "loss": 0.0048, |
| "step": 391 |
| }, |
| { |
| "epoch": 1.2865641025641026, |
| "grad_norm": 0.003118851687759161, |
| "learning_rate": 7.196435644333708e-05, |
| "loss": 0.0048, |
| "step": 392 |
| }, |
| { |
| "epoch": 1.2898461538461539, |
| "grad_norm": 0.003467726055532694, |
| "learning_rate": 7.152004159127463e-05, |
| "loss": 0.0048, |
| "step": 393 |
| }, |
| { |
| "epoch": 1.2931282051282051, |
| "grad_norm": 0.0033981874585151672, |
| "learning_rate": 7.107687143028502e-05, |
| "loss": 0.0048, |
| "step": 394 |
| }, |
| { |
| "epoch": 1.2964102564102564, |
| "grad_norm": 0.002850554184988141, |
| "learning_rate": 7.063485914367075e-05, |
| "loss": 0.0047, |
| "step": 395 |
| }, |
| { |
| "epoch": 1.2996923076923077, |
| "grad_norm": 0.0028371524531394243, |
| "learning_rate": 7.019401788028993e-05, |
| "loss": 0.0047, |
| "step": 396 |
| }, |
| { |
| "epoch": 1.302974358974359, |
| "grad_norm": 0.003020528005436063, |
| "learning_rate": 6.975436075416555e-05, |
| "loss": 0.0049, |
| "step": 397 |
| }, |
| { |
| "epoch": 1.3062564102564103, |
| "grad_norm": 0.0028517835307866335, |
| "learning_rate": 6.931590084409524e-05, |
| "loss": 0.0047, |
| "step": 398 |
| }, |
| { |
| "epoch": 1.3095384615384615, |
| "grad_norm": 0.003879575990140438, |
| "learning_rate": 6.887865119326214e-05, |
| "loss": 0.0049, |
| "step": 399 |
| }, |
| { |
| "epoch": 1.3128205128205128, |
| "grad_norm": 0.0037416021805256605, |
| "learning_rate": 6.844262480884697e-05, |
| "loss": 0.0049, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.3128205128205128, |
| "eval_loss": 0.0050128186121582985, |
| "eval_runtime": 1.8163, |
| "eval_samples_per_second": 27.529, |
| "eval_steps_per_second": 27.529, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.316102564102564, |
| "grad_norm": 0.0030078617855906487, |
| "learning_rate": 6.800783466164098e-05, |
| "loss": 0.0047, |
| "step": 401 |
| }, |
| { |
| "epoch": 1.3193846153846154, |
| "grad_norm": 0.0030800465028733015, |
| "learning_rate": 6.757429368566022e-05, |
| "loss": 0.0048, |
| "step": 402 |
| }, |
| { |
| "epoch": 1.3226666666666667, |
| "grad_norm": 0.003120367182418704, |
| "learning_rate": 6.71420147777608e-05, |
| "loss": 0.0047, |
| "step": 403 |
| }, |
| { |
| "epoch": 1.325948717948718, |
| "grad_norm": 0.003493399592116475, |
| "learning_rate": 6.671101079725513e-05, |
| "loss": 0.0049, |
| "step": 404 |
| }, |
| { |
| "epoch": 1.3292307692307692, |
| "grad_norm": 0.0038059516809880733, |
| "learning_rate": 6.62812945655294e-05, |
| "loss": 0.005, |
| "step": 405 |
| }, |
| { |
| "epoch": 1.3325128205128205, |
| "grad_norm": 0.009474639780819416, |
| "learning_rate": 6.58528788656623e-05, |
| "loss": 0.0052, |
| "step": 406 |
| }, |
| { |
| "epoch": 1.3357948717948718, |
| "grad_norm": 0.009293424896895885, |
| "learning_rate": 6.542577644204456e-05, |
| "loss": 0.0053, |
| "step": 407 |
| }, |
| { |
| "epoch": 1.339076923076923, |
| "grad_norm": 0.009816371835768223, |
| "learning_rate": 6.500000000000002e-05, |
| "loss": 0.0055, |
| "step": 408 |
| }, |
| { |
| "epoch": 1.3423589743589743, |
| "grad_norm": 0.009543578140437603, |
| "learning_rate": 6.45755622054075e-05, |
| "loss": 0.0053, |
| "step": 409 |
| }, |
| { |
| "epoch": 1.3456410256410256, |
| "grad_norm": 0.008382177911698818, |
| "learning_rate": 6.415247568432425e-05, |
| "loss": 0.0052, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.348923076923077, |
| "grad_norm": 0.00873810425400734, |
| "learning_rate": 6.373075302261006e-05, |
| "loss": 0.0052, |
| "step": 411 |
| }, |
| { |
| "epoch": 1.3522051282051282, |
| "grad_norm": 0.007860679179430008, |
| "learning_rate": 6.331040676555306e-05, |
| "loss": 0.0052, |
| "step": 412 |
| }, |
| { |
| "epoch": 1.3554871794871794, |
| "grad_norm": 0.007655597757548094, |
| "learning_rate": 6.289144941749656e-05, |
| "loss": 0.005, |
| "step": 413 |
| }, |
| { |
| "epoch": 1.3587692307692307, |
| "grad_norm": 0.008586070500314236, |
| "learning_rate": 6.247389344146688e-05, |
| "loss": 0.0051, |
| "step": 414 |
| }, |
| { |
| "epoch": 1.362051282051282, |
| "grad_norm": 0.009542787447571754, |
| "learning_rate": 6.20577512588028e-05, |
| "loss": 0.005, |
| "step": 415 |
| }, |
| { |
| "epoch": 1.3653333333333333, |
| "grad_norm": 0.00885209534317255, |
| "learning_rate": 6.164303524878586e-05, |
| "loss": 0.005, |
| "step": 416 |
| }, |
| { |
| "epoch": 1.3686153846153846, |
| "grad_norm": 0.006266025826334953, |
| "learning_rate": 6.122975774827238e-05, |
| "loss": 0.0048, |
| "step": 417 |
| }, |
| { |
| "epoch": 1.3718974358974358, |
| "grad_norm": 0.0066079627722501755, |
| "learning_rate": 6.081793105132611e-05, |
| "loss": 0.0048, |
| "step": 418 |
| }, |
| { |
| "epoch": 1.3751794871794871, |
| "grad_norm": 0.00554714584723115, |
| "learning_rate": 6.0407567408852874e-05, |
| "loss": 0.0049, |
| "step": 419 |
| }, |
| { |
| "epoch": 1.3784615384615384, |
| "grad_norm": 0.004965722095221281, |
| "learning_rate": 5.9998679028235824e-05, |
| "loss": 0.0047, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.3817435897435897, |
| "grad_norm": 0.004424337763339281, |
| "learning_rate": 5.959127807297251e-05, |
| "loss": 0.0047, |
| "step": 421 |
| }, |
| { |
| "epoch": 1.385025641025641, |
| "grad_norm": 0.003751178737729788, |
| "learning_rate": 5.918537666231296e-05, |
| "loss": 0.0047, |
| "step": 422 |
| }, |
| { |
| "epoch": 1.3883076923076922, |
| "grad_norm": 0.0038334885612130165, |
| "learning_rate": 5.8780986870899144e-05, |
| "loss": 0.0048, |
| "step": 423 |
| }, |
| { |
| "epoch": 1.3915897435897435, |
| "grad_norm": 0.0035623824223876, |
| "learning_rate": 5.8378120728405885e-05, |
| "loss": 0.0047, |
| "step": 424 |
| }, |
| { |
| "epoch": 1.3948717948717948, |
| "grad_norm": 0.003139911452308297, |
| "learning_rate": 5.797679021918272e-05, |
| "loss": 0.0048, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.3948717948717948, |
| "eval_loss": 0.005003854166716337, |
| "eval_runtime": 1.5778, |
| "eval_samples_per_second": 31.689, |
| "eval_steps_per_second": 31.689, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.398153846153846, |
| "grad_norm": 0.0028019817546010017, |
| "learning_rate": 5.7577007281897824e-05, |
| "loss": 0.0047, |
| "step": 426 |
| }, |
| { |
| "epoch": 1.4014358974358974, |
| "grad_norm": 0.002678703283891082, |
| "learning_rate": 5.717878380918251e-05, |
| "loss": 0.0046, |
| "step": 427 |
| }, |
| { |
| "epoch": 1.4047179487179486, |
| "grad_norm": 0.00241168774664402, |
| "learning_rate": 5.678213164727761e-05, |
| "loss": 0.0047, |
| "step": 428 |
| }, |
| { |
| "epoch": 1.408, |
| "grad_norm": 0.0023575942032039165, |
| "learning_rate": 5.6387062595681006e-05, |
| "loss": 0.0046, |
| "step": 429 |
| }, |
| { |
| "epoch": 1.4112820512820514, |
| "grad_norm": 0.002579533262178302, |
| "learning_rate": 5.599358840679673e-05, |
| "loss": 0.0047, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.4145641025641025, |
| "grad_norm": 0.002402591984719038, |
| "learning_rate": 5.560172078558521e-05, |
| "loss": 0.0047, |
| "step": 431 |
| }, |
| { |
| "epoch": 1.417846153846154, |
| "grad_norm": 0.0023617083206772804, |
| "learning_rate": 5.521147138921514e-05, |
| "loss": 0.0047, |
| "step": 432 |
| }, |
| { |
| "epoch": 1.421128205128205, |
| "grad_norm": 0.0024777904618531466, |
| "learning_rate": 5.4822851826716814e-05, |
| "loss": 0.0046, |
| "step": 433 |
| }, |
| { |
| "epoch": 1.4244102564102565, |
| "grad_norm": 0.002682394813746214, |
| "learning_rate": 5.443587365863657e-05, |
| "loss": 0.0048, |
| "step": 434 |
| }, |
| { |
| "epoch": 1.4276923076923076, |
| "grad_norm": 0.0025969718117266893, |
| "learning_rate": 5.405054839669306e-05, |
| "loss": 0.0047, |
| "step": 435 |
| }, |
| { |
| "epoch": 1.430974358974359, |
| "grad_norm": 0.0027512714732438326, |
| "learning_rate": 5.3666887503434693e-05, |
| "loss": 0.0048, |
| "step": 436 |
| }, |
| { |
| "epoch": 1.4342564102564102, |
| "grad_norm": 0.002662122482433915, |
| "learning_rate": 5.3284902391898795e-05, |
| "loss": 0.0047, |
| "step": 437 |
| }, |
| { |
| "epoch": 1.4375384615384617, |
| "grad_norm": 0.002524281619116664, |
| "learning_rate": 5.290460442527192e-05, |
| "loss": 0.0047, |
| "step": 438 |
| }, |
| { |
| "epoch": 1.4408205128205127, |
| "grad_norm": 0.003124300390481949, |
| "learning_rate": 5.252600491655193e-05, |
| "loss": 0.0047, |
| "step": 439 |
| }, |
| { |
| "epoch": 1.4441025641025642, |
| "grad_norm": 0.0028491078410297632, |
| "learning_rate": 5.214911512821145e-05, |
| "loss": 0.0047, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.4473846153846153, |
| "grad_norm": 0.003151330165565014, |
| "learning_rate": 5.177394627186285e-05, |
| "loss": 0.0047, |
| "step": 441 |
| }, |
| { |
| "epoch": 1.4506666666666668, |
| "grad_norm": 0.002938151592388749, |
| "learning_rate": 5.1400509507924596e-05, |
| "loss": 0.0047, |
| "step": 442 |
| }, |
| { |
| "epoch": 1.4539487179487178, |
| "grad_norm": 0.0029112198390066624, |
| "learning_rate": 5.102881594528941e-05, |
| "loss": 0.0047, |
| "step": 443 |
| }, |
| { |
| "epoch": 1.4572307692307693, |
| "grad_norm": 0.003463789587840438, |
| "learning_rate": 5.06588766409938e-05, |
| "loss": 0.0048, |
| "step": 444 |
| }, |
| { |
| "epoch": 1.4605128205128204, |
| "grad_norm": 0.003294401103630662, |
| "learning_rate": 5.0290702599889016e-05, |
| "loss": 0.0047, |
| "step": 445 |
| }, |
| { |
| "epoch": 1.4637948717948719, |
| "grad_norm": 0.0029868704732507467, |
| "learning_rate": 4.9924304774313756e-05, |
| "loss": 0.0047, |
| "step": 446 |
| }, |
| { |
| "epoch": 1.467076923076923, |
| "grad_norm": 0.003178369253873825, |
| "learning_rate": 4.955969406376835e-05, |
| "loss": 0.0047, |
| "step": 447 |
| }, |
| { |
| "epoch": 1.4703589743589744, |
| "grad_norm": 0.0042068324983119965, |
| "learning_rate": 4.919688131459058e-05, |
| "loss": 0.0048, |
| "step": 448 |
| }, |
| { |
| "epoch": 1.4736410256410255, |
| "grad_norm": 0.00410189013928175, |
| "learning_rate": 4.883587731963295e-05, |
| "loss": 0.0048, |
| "step": 449 |
| }, |
| { |
| "epoch": 1.476923076923077, |
| "grad_norm": 0.0037490760441869497, |
| "learning_rate": 4.847669281794158e-05, |
| "loss": 0.005, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.476923076923077, |
| "eval_loss": 0.004960117861628532, |
| "eval_runtime": 1.0676, |
| "eval_samples_per_second": 46.836, |
| "eval_steps_per_second": 46.836, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.4802051282051283, |
| "grad_norm": 0.003720939392223954, |
| "learning_rate": 4.811933849443693e-05, |
| "loss": 0.0048, |
| "step": 451 |
| }, |
| { |
| "epoch": 1.4834871794871796, |
| "grad_norm": 0.003880757139995694, |
| "learning_rate": 4.776382497959577e-05, |
| "loss": 0.0049, |
| "step": 452 |
| }, |
| { |
| "epoch": 1.4867692307692308, |
| "grad_norm": 0.005387154407799244, |
| "learning_rate": 4.741016284913496e-05, |
| "loss": 0.0049, |
| "step": 453 |
| }, |
| { |
| "epoch": 1.4900512820512821, |
| "grad_norm": 0.0035007649566978216, |
| "learning_rate": 4.705836262369696e-05, |
| "loss": 0.0048, |
| "step": 454 |
| }, |
| { |
| "epoch": 1.4933333333333334, |
| "grad_norm": 0.003741365857422352, |
| "learning_rate": 4.670843476853683e-05, |
| "loss": 0.005, |
| "step": 455 |
| }, |
| { |
| "epoch": 1.4966153846153847, |
| "grad_norm": 0.009279366582632065, |
| "learning_rate": 4.6360389693210735e-05, |
| "loss": 0.0053, |
| "step": 456 |
| }, |
| { |
| "epoch": 1.499897435897436, |
| "grad_norm": 0.00897167343646288, |
| "learning_rate": 4.601423775126657e-05, |
| "loss": 0.0052, |
| "step": 457 |
| }, |
| { |
| "epoch": 1.5031794871794872, |
| "grad_norm": 0.009041314013302326, |
| "learning_rate": 4.566998923993585e-05, |
| "loss": 0.0053, |
| "step": 458 |
| }, |
| { |
| "epoch": 1.5064615384615383, |
| "grad_norm": 0.008953151293098927, |
| "learning_rate": 4.5327654399827175e-05, |
| "loss": 0.0053, |
| "step": 459 |
| }, |
| { |
| "epoch": 1.5097435897435898, |
| "grad_norm": 0.008846296928822994, |
| "learning_rate": 4.4987243414622004e-05, |
| "loss": 0.0052, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.513025641025641, |
| "grad_norm": 0.008246372453868389, |
| "learning_rate": 4.464876641077137e-05, |
| "loss": 0.0052, |
| "step": 461 |
| }, |
| { |
| "epoch": 1.5163076923076924, |
| "grad_norm": 0.0073341550305485725, |
| "learning_rate": 4.431223345719482e-05, |
| "loss": 0.0051, |
| "step": 462 |
| }, |
| { |
| "epoch": 1.5195897435897436, |
| "grad_norm": 0.006810352671891451, |
| "learning_rate": 4.397765456498075e-05, |
| "loss": 0.0051, |
| "step": 463 |
| }, |
| { |
| "epoch": 1.522871794871795, |
| "grad_norm": 0.007108463905751705, |
| "learning_rate": 4.364503968708885e-05, |
| "loss": 0.005, |
| "step": 464 |
| }, |
| { |
| "epoch": 1.5261538461538462, |
| "grad_norm": 0.00809982419013977, |
| "learning_rate": 4.33143987180537e-05, |
| "loss": 0.005, |
| "step": 465 |
| }, |
| { |
| "epoch": 1.5294358974358975, |
| "grad_norm": 0.007939063012599945, |
| "learning_rate": 4.298574149369064e-05, |
| "loss": 0.005, |
| "step": 466 |
| }, |
| { |
| "epoch": 1.5327179487179488, |
| "grad_norm": 0.006030023563653231, |
| "learning_rate": 4.2659077790803183e-05, |
| "loss": 0.0048, |
| "step": 467 |
| }, |
| { |
| "epoch": 1.536, |
| "grad_norm": 0.0052713961340487, |
| "learning_rate": 4.233441732689205e-05, |
| "loss": 0.0047, |
| "step": 468 |
| }, |
| { |
| "epoch": 1.5392820512820513, |
| "grad_norm": 0.0051575591787695885, |
| "learning_rate": 4.201176975986618e-05, |
| "loss": 0.0047, |
| "step": 469 |
| }, |
| { |
| "epoch": 1.5425641025641026, |
| "grad_norm": 0.004836643114686012, |
| "learning_rate": 4.1691144687755434e-05, |
| "loss": 0.0048, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.5458461538461539, |
| "grad_norm": 0.004236603155732155, |
| "learning_rate": 4.137255164842508e-05, |
| "loss": 0.0047, |
| "step": 471 |
| }, |
| { |
| "epoch": 1.5491282051282051, |
| "grad_norm": 0.0034698331728577614, |
| "learning_rate": 4.1056000119291995e-05, |
| "loss": 0.0047, |
| "step": 472 |
| }, |
| { |
| "epoch": 1.5524102564102564, |
| "grad_norm": 0.0033043124713003635, |
| "learning_rate": 4.074149951704279e-05, |
| "loss": 0.0048, |
| "step": 473 |
| }, |
| { |
| "epoch": 1.5556923076923077, |
| "grad_norm": 0.00292124948464334, |
| "learning_rate": 4.042905919735367e-05, |
| "loss": 0.0047, |
| "step": 474 |
| }, |
| { |
| "epoch": 1.558974358974359, |
| "grad_norm": 0.0029612670186907053, |
| "learning_rate": 4.0118688454612205e-05, |
| "loss": 0.0048, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.558974358974359, |
| "eval_loss": 0.004941979423165321, |
| "eval_runtime": 1.0895, |
| "eval_samples_per_second": 45.893, |
| "eval_steps_per_second": 45.893, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.5622564102564103, |
| "grad_norm": 0.0030873967334628105, |
| "learning_rate": 3.9810396521640656e-05, |
| "loss": 0.0048, |
| "step": 476 |
| }, |
| { |
| "epoch": 1.5655384615384615, |
| "grad_norm": 0.0027741712983697653, |
| "learning_rate": 3.9504192569421475e-05, |
| "loss": 0.0047, |
| "step": 477 |
| }, |
| { |
| "epoch": 1.5688205128205128, |
| "grad_norm": 0.0026183989830315113, |
| "learning_rate": 3.9200085706824475e-05, |
| "loss": 0.0047, |
| "step": 478 |
| }, |
| { |
| "epoch": 1.572102564102564, |
| "grad_norm": 0.002464702120050788, |
| "learning_rate": 3.88980849803358e-05, |
| "loss": 0.0047, |
| "step": 479 |
| }, |
| { |
| "epoch": 1.5753846153846154, |
| "grad_norm": 0.002672748640179634, |
| "learning_rate": 3.8598199373788846e-05, |
| "loss": 0.0048, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.5786666666666667, |
| "grad_norm": 0.002426745370030403, |
| "learning_rate": 3.8300437808097e-05, |
| "loss": 0.0048, |
| "step": 481 |
| }, |
| { |
| "epoch": 1.581948717948718, |
| "grad_norm": 0.002388445660471916, |
| "learning_rate": 3.800480914098834e-05, |
| "loss": 0.0048, |
| "step": 482 |
| }, |
| { |
| "epoch": 1.5852307692307692, |
| "grad_norm": 0.0023456227499991655, |
| "learning_rate": 3.771132216674197e-05, |
| "loss": 0.0048, |
| "step": 483 |
| }, |
| { |
| "epoch": 1.5885128205128205, |
| "grad_norm": 0.0024677433539181948, |
| "learning_rate": 3.741998561592657e-05, |
| "loss": 0.0048, |
| "step": 484 |
| }, |
| { |
| "epoch": 1.5917948717948718, |
| "grad_norm": 0.002437222981825471, |
| "learning_rate": 3.713080815514063e-05, |
| "loss": 0.0048, |
| "step": 485 |
| }, |
| { |
| "epoch": 1.595076923076923, |
| "grad_norm": 0.0025231228210031986, |
| "learning_rate": 3.684379838675464e-05, |
| "loss": 0.0048, |
| "step": 486 |
| }, |
| { |
| "epoch": 1.5983589743589743, |
| "grad_norm": 0.0025096021126955748, |
| "learning_rate": 3.655896484865512e-05, |
| "loss": 0.0047, |
| "step": 487 |
| }, |
| { |
| "epoch": 1.6016410256410256, |
| "grad_norm": 0.0026138813700526953, |
| "learning_rate": 3.627631601399073e-05, |
| "loss": 0.0048, |
| "step": 488 |
| }, |
| { |
| "epoch": 1.604923076923077, |
| "grad_norm": 0.0025557177141308784, |
| "learning_rate": 3.599586029092027e-05, |
| "loss": 0.0047, |
| "step": 489 |
| }, |
| { |
| "epoch": 1.6082051282051282, |
| "grad_norm": 0.0025317317340523005, |
| "learning_rate": 3.571760602236226e-05, |
| "loss": 0.0047, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.6114871794871795, |
| "grad_norm": 0.0025278881657868624, |
| "learning_rate": 3.54415614857472e-05, |
| "loss": 0.0048, |
| "step": 491 |
| }, |
| { |
| "epoch": 1.6147692307692307, |
| "grad_norm": 0.003274059621617198, |
| "learning_rate": 3.516773489277092e-05, |
| "loss": 0.0049, |
| "step": 492 |
| }, |
| { |
| "epoch": 1.618051282051282, |
| "grad_norm": 0.0029409686103463173, |
| "learning_rate": 3.489613438915061e-05, |
| "loss": 0.0048, |
| "step": 493 |
| }, |
| { |
| "epoch": 1.6213333333333333, |
| "grad_norm": 0.003028150415048003, |
| "learning_rate": 3.4626768054382305e-05, |
| "loss": 0.0048, |
| "step": 494 |
| }, |
| { |
| "epoch": 1.6246153846153846, |
| "grad_norm": 0.0030735956970602274, |
| "learning_rate": 3.435964390150057e-05, |
| "loss": 0.0049, |
| "step": 495 |
| }, |
| { |
| "epoch": 1.6278974358974359, |
| "grad_norm": 0.0029834897723048925, |
| "learning_rate": 3.409476987684031e-05, |
| "loss": 0.0048, |
| "step": 496 |
| }, |
| { |
| "epoch": 1.6311794871794871, |
| "grad_norm": 0.0033205023501068354, |
| "learning_rate": 3.3832153859800054e-05, |
| "loss": 0.0048, |
| "step": 497 |
| }, |
| { |
| "epoch": 1.6344615384615384, |
| "grad_norm": 0.0034721517004072666, |
| "learning_rate": 3.357180366260791e-05, |
| "loss": 0.0048, |
| "step": 498 |
| }, |
| { |
| "epoch": 1.6377435897435897, |
| "grad_norm": 0.003484040265902877, |
| "learning_rate": 3.3313727030088934e-05, |
| "loss": 0.0048, |
| "step": 499 |
| }, |
| { |
| "epoch": 1.641025641025641, |
| "grad_norm": 0.003361676586791873, |
| "learning_rate": 3.305793163943483e-05, |
| "loss": 0.0048, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.641025641025641, |
| "eval_loss": 0.004984674043953419, |
| "eval_runtime": 1.057, |
| "eval_samples_per_second": 47.306, |
| "eval_steps_per_second": 47.306, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.6443076923076925, |
| "grad_norm": 0.0035249050706624985, |
| "learning_rate": 3.2804425099975525e-05, |
| "loss": 0.0048, |
| "step": 501 |
| }, |
| { |
| "epoch": 1.6475897435897435, |
| "grad_norm": 0.0035485997796058655, |
| "learning_rate": 3.25532149529529e-05, |
| "loss": 0.0048, |
| "step": 502 |
| }, |
| { |
| "epoch": 1.650871794871795, |
| "grad_norm": 0.0034246218856424093, |
| "learning_rate": 3.2304308671296355e-05, |
| "loss": 0.0048, |
| "step": 503 |
| }, |
| { |
| "epoch": 1.654153846153846, |
| "grad_norm": 0.0036572501994669437, |
| "learning_rate": 3.205771365940052e-05, |
| "loss": 0.0048, |
| "step": 504 |
| }, |
| { |
| "epoch": 1.6574358974358976, |
| "grad_norm": 0.003463857341557741, |
| "learning_rate": 3.1813437252905096e-05, |
| "loss": 0.0049, |
| "step": 505 |
| }, |
| { |
| "epoch": 1.6607179487179486, |
| "grad_norm": 0.008703382685780525, |
| "learning_rate": 3.157148671847649e-05, |
| "loss": 0.0052, |
| "step": 506 |
| }, |
| { |
| "epoch": 1.6640000000000001, |
| "grad_norm": 0.00902031920850277, |
| "learning_rate": 3.133186925359172e-05, |
| "loss": 0.0053, |
| "step": 507 |
| }, |
| { |
| "epoch": 1.6672820512820512, |
| "grad_norm": 0.00848945789039135, |
| "learning_rate": 3.109459198632431e-05, |
| "loss": 0.0052, |
| "step": 508 |
| }, |
| { |
| "epoch": 1.6705641025641027, |
| "grad_norm": 0.008561553433537483, |
| "learning_rate": 3.085966197513227e-05, |
| "loss": 0.0053, |
| "step": 509 |
| }, |
| { |
| "epoch": 1.6738461538461538, |
| "grad_norm": 0.008840641938149929, |
| "learning_rate": 3.062708620864806e-05, |
| "loss": 0.0053, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.6771282051282053, |
| "grad_norm": 0.008055276237428188, |
| "learning_rate": 3.0396871605470702e-05, |
| "loss": 0.0052, |
| "step": 511 |
| }, |
| { |
| "epoch": 1.6804102564102563, |
| "grad_norm": 0.007263275794684887, |
| "learning_rate": 3.0169025013960052e-05, |
| "loss": 0.0052, |
| "step": 512 |
| }, |
| { |
| "epoch": 1.6836923076923078, |
| "grad_norm": 0.006828081328421831, |
| "learning_rate": 2.9943553212032964e-05, |
| "loss": 0.005, |
| "step": 513 |
| }, |
| { |
| "epoch": 1.6869743589743589, |
| "grad_norm": 0.006435649003833532, |
| "learning_rate": 2.972046290696173e-05, |
| "loss": 0.0051, |
| "step": 514 |
| }, |
| { |
| "epoch": 1.6902564102564104, |
| "grad_norm": 0.007080839481204748, |
| "learning_rate": 2.9499760735174537e-05, |
| "loss": 0.0049, |
| "step": 515 |
| }, |
| { |
| "epoch": 1.6935384615384614, |
| "grad_norm": 0.006745288148522377, |
| "learning_rate": 2.928145326205806e-05, |
| "loss": 0.0049, |
| "step": 516 |
| }, |
| { |
| "epoch": 1.696820512820513, |
| "grad_norm": 0.00520884245634079, |
| "learning_rate": 2.906554698176213e-05, |
| "loss": 0.0047, |
| "step": 517 |
| }, |
| { |
| "epoch": 1.700102564102564, |
| "grad_norm": 0.00431660795584321, |
| "learning_rate": 2.8852048317006565e-05, |
| "loss": 0.0047, |
| "step": 518 |
| }, |
| { |
| "epoch": 1.7033846153846155, |
| "grad_norm": 0.004076897166669369, |
| "learning_rate": 2.8640963618890103e-05, |
| "loss": 0.0046, |
| "step": 519 |
| }, |
| { |
| "epoch": 1.7066666666666666, |
| "grad_norm": 0.0038497080095112324, |
| "learning_rate": 2.8432299166701508e-05, |
| "loss": 0.0048, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.709948717948718, |
| "grad_norm": 0.0030418243259191513, |
| "learning_rate": 2.8226061167732704e-05, |
| "loss": 0.0047, |
| "step": 521 |
| }, |
| { |
| "epoch": 1.7132307692307691, |
| "grad_norm": 0.0033263147342950106, |
| "learning_rate": 2.8022255757094174e-05, |
| "loss": 0.0046, |
| "step": 522 |
| }, |
| { |
| "epoch": 1.7165128205128206, |
| "grad_norm": 0.0030696901958435774, |
| "learning_rate": 2.7820888997532464e-05, |
| "loss": 0.0047, |
| "step": 523 |
| }, |
| { |
| "epoch": 1.7197948717948717, |
| "grad_norm": 0.002596198348328471, |
| "learning_rate": 2.7621966879249762e-05, |
| "loss": 0.0048, |
| "step": 524 |
| }, |
| { |
| "epoch": 1.7230769230769232, |
| "grad_norm": 0.00273293349891901, |
| "learning_rate": 2.7425495319725793e-05, |
| "loss": 0.0047, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.7230769230769232, |
| "eval_loss": 0.004907587543129921, |
| "eval_runtime": 1.1425, |
| "eval_samples_per_second": 43.766, |
| "eval_steps_per_second": 43.766, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.7263589743589742, |
| "grad_norm": 0.0025641010142862797, |
| "learning_rate": 2.72314801635417e-05, |
| "loss": 0.0047, |
| "step": 526 |
| }, |
| { |
| "epoch": 1.7296410256410257, |
| "grad_norm": 0.002465146593749523, |
| "learning_rate": 2.7039927182206293e-05, |
| "loss": 0.0047, |
| "step": 527 |
| }, |
| { |
| "epoch": 1.7329230769230768, |
| "grad_norm": 0.0026465991977602243, |
| "learning_rate": 2.6850842073984196e-05, |
| "loss": 0.0047, |
| "step": 528 |
| }, |
| { |
| "epoch": 1.7362051282051283, |
| "grad_norm": 0.002503578085452318, |
| "learning_rate": 2.666423046372651e-05, |
| "loss": 0.0048, |
| "step": 529 |
| }, |
| { |
| "epoch": 1.7394871794871793, |
| "grad_norm": 0.002414755057543516, |
| "learning_rate": 2.6480097902703404e-05, |
| "loss": 0.0047, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.7427692307692308, |
| "grad_norm": 0.00238244840875268, |
| "learning_rate": 2.629844986843892e-05, |
| "loss": 0.0046, |
| "step": 531 |
| }, |
| { |
| "epoch": 1.746051282051282, |
| "grad_norm": 0.002321184379979968, |
| "learning_rate": 2.611929176454814e-05, |
| "loss": 0.0047, |
| "step": 532 |
| }, |
| { |
| "epoch": 1.7493333333333334, |
| "grad_norm": 0.002244308590888977, |
| "learning_rate": 2.59426289205764e-05, |
| "loss": 0.0047, |
| "step": 533 |
| }, |
| { |
| "epoch": 1.7526153846153845, |
| "grad_norm": 0.0020898343063890934, |
| "learning_rate": 2.5768466591840707e-05, |
| "loss": 0.0047, |
| "step": 534 |
| }, |
| { |
| "epoch": 1.755897435897436, |
| "grad_norm": 0.002154160290956497, |
| "learning_rate": 2.5596809959273432e-05, |
| "loss": 0.0048, |
| "step": 535 |
| }, |
| { |
| "epoch": 1.759179487179487, |
| "grad_norm": 0.0022296863608062267, |
| "learning_rate": 2.542766412926825e-05, |
| "loss": 0.0047, |
| "step": 536 |
| }, |
| { |
| "epoch": 1.7624615384615385, |
| "grad_norm": 0.0023085817229002714, |
| "learning_rate": 2.5261034133528138e-05, |
| "loss": 0.0048, |
| "step": 537 |
| }, |
| { |
| "epoch": 1.7657435897435896, |
| "grad_norm": 0.0021941603627055883, |
| "learning_rate": 2.5096924928915733e-05, |
| "loss": 0.0047, |
| "step": 538 |
| }, |
| { |
| "epoch": 1.769025641025641, |
| "grad_norm": 0.002463526790961623, |
| "learning_rate": 2.4935341397305903e-05, |
| "loss": 0.0047, |
| "step": 539 |
| }, |
| { |
| "epoch": 1.7723076923076924, |
| "grad_norm": 0.0022917832247912884, |
| "learning_rate": 2.4776288345440503e-05, |
| "loss": 0.0048, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.7755897435897436, |
| "grad_norm": 0.0023666713386774063, |
| "learning_rate": 2.461977050478534e-05, |
| "loss": 0.0049, |
| "step": 541 |
| }, |
| { |
| "epoch": 1.778871794871795, |
| "grad_norm": 0.0024120372254401445, |
| "learning_rate": 2.4465792531389504e-05, |
| "loss": 0.0047, |
| "step": 542 |
| }, |
| { |
| "epoch": 1.7821538461538462, |
| "grad_norm": 0.0027782933320850134, |
| "learning_rate": 2.4314359005746817e-05, |
| "loss": 0.0047, |
| "step": 543 |
| }, |
| { |
| "epoch": 1.7854358974358975, |
| "grad_norm": 0.0026203745510429144, |
| "learning_rate": 2.4165474432659588e-05, |
| "loss": 0.0047, |
| "step": 544 |
| }, |
| { |
| "epoch": 1.7887179487179488, |
| "grad_norm": 0.0026280086021870375, |
| "learning_rate": 2.401914324110456e-05, |
| "loss": 0.0046, |
| "step": 545 |
| }, |
| { |
| "epoch": 1.792, |
| "grad_norm": 0.002855598460882902, |
| "learning_rate": 2.387536978410121e-05, |
| "loss": 0.0047, |
| "step": 546 |
| }, |
| { |
| "epoch": 1.7952820512820513, |
| "grad_norm": 0.0029618304688483477, |
| "learning_rate": 2.373415833858226e-05, |
| "loss": 0.0047, |
| "step": 547 |
| }, |
| { |
| "epoch": 1.7985641025641026, |
| "grad_norm": 0.0031228596344590187, |
| "learning_rate": 2.359551310526643e-05, |
| "loss": 0.0048, |
| "step": 548 |
| }, |
| { |
| "epoch": 1.8018461538461539, |
| "grad_norm": 0.003316520480439067, |
| "learning_rate": 2.345943820853342e-05, |
| "loss": 0.0047, |
| "step": 549 |
| }, |
| { |
| "epoch": 1.8051282051282052, |
| "grad_norm": 0.004680112935602665, |
| "learning_rate": 2.332593769630136e-05, |
| "loss": 0.0048, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.8051282051282052, |
| "eval_loss": 0.005017252638936043, |
| "eval_runtime": 1.0529, |
| "eval_samples_per_second": 47.487, |
| "eval_steps_per_second": 47.487, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.8084102564102564, |
| "grad_norm": 0.0034276428632438183, |
| "learning_rate": 2.3195015539906243e-05, |
| "loss": 0.0049, |
| "step": 551 |
| }, |
| { |
| "epoch": 1.8116923076923077, |
| "grad_norm": 0.0030685942620038986, |
| "learning_rate": 2.3066675633983865e-05, |
| "loss": 0.0048, |
| "step": 552 |
| }, |
| { |
| "epoch": 1.814974358974359, |
| "grad_norm": 0.0046894908882677555, |
| "learning_rate": 2.2940921796353956e-05, |
| "loss": 0.0047, |
| "step": 553 |
| }, |
| { |
| "epoch": 1.8182564102564103, |
| "grad_norm": 0.0035674276296049356, |
| "learning_rate": 2.2817757767906625e-05, |
| "loss": 0.0048, |
| "step": 554 |
| }, |
| { |
| "epoch": 1.8215384615384616, |
| "grad_norm": 0.005269620567560196, |
| "learning_rate": 2.2697187212491044e-05, |
| "loss": 0.0051, |
| "step": 555 |
| }, |
| { |
| "epoch": 1.8248205128205128, |
| "grad_norm": 0.008738451637327671, |
| "learning_rate": 2.2579213716806474e-05, |
| "loss": 0.0052, |
| "step": 556 |
| }, |
| { |
| "epoch": 1.828102564102564, |
| "grad_norm": 0.008472139947116375, |
| "learning_rate": 2.2463840790295566e-05, |
| "loss": 0.0051, |
| "step": 557 |
| }, |
| { |
| "epoch": 1.8313846153846154, |
| "grad_norm": 0.008605373091995716, |
| "learning_rate": 2.2351071865039974e-05, |
| "loss": 0.0051, |
| "step": 558 |
| }, |
| { |
| "epoch": 1.8346666666666667, |
| "grad_norm": 0.02175315096974373, |
| "learning_rate": 2.224091029565824e-05, |
| "loss": 0.0053, |
| "step": 559 |
| }, |
| { |
| "epoch": 1.837948717948718, |
| "grad_norm": 0.008465359918773174, |
| "learning_rate": 2.2133359359206e-05, |
| "loss": 0.0052, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.8412307692307692, |
| "grad_norm": 0.007232977543026209, |
| "learning_rate": 2.2028422255078542e-05, |
| "loss": 0.0052, |
| "step": 561 |
| }, |
| { |
| "epoch": 1.8445128205128205, |
| "grad_norm": 0.007051311433315277, |
| "learning_rate": 2.1926102104915553e-05, |
| "loss": 0.0051, |
| "step": 562 |
| }, |
| { |
| "epoch": 1.8477948717948718, |
| "grad_norm": 0.006151077803224325, |
| "learning_rate": 2.182640195250835e-05, |
| "loss": 0.005, |
| "step": 563 |
| }, |
| { |
| "epoch": 1.851076923076923, |
| "grad_norm": 0.006573867984116077, |
| "learning_rate": 2.1729324763709264e-05, |
| "loss": 0.0051, |
| "step": 564 |
| }, |
| { |
| "epoch": 1.8543589743589743, |
| "grad_norm": 0.00678396737203002, |
| "learning_rate": 2.1634873426343427e-05, |
| "loss": 0.0049, |
| "step": 565 |
| }, |
| { |
| "epoch": 1.8576410256410256, |
| "grad_norm": 0.005578219890594482, |
| "learning_rate": 2.1543050750122902e-05, |
| "loss": 0.0048, |
| "step": 566 |
| }, |
| { |
| "epoch": 1.860923076923077, |
| "grad_norm": 0.0040833973325788975, |
| "learning_rate": 2.145385946656303e-05, |
| "loss": 0.0047, |
| "step": 567 |
| }, |
| { |
| "epoch": 1.8642051282051282, |
| "grad_norm": 0.004177347291260958, |
| "learning_rate": 2.1367302228901282e-05, |
| "loss": 0.0046, |
| "step": 568 |
| }, |
| { |
| "epoch": 1.8674871794871795, |
| "grad_norm": 0.0036663906648755074, |
| "learning_rate": 2.128338161201819e-05, |
| "loss": 0.0047, |
| "step": 569 |
| }, |
| { |
| "epoch": 1.8707692307692307, |
| "grad_norm": 0.003597427159547806, |
| "learning_rate": 2.1202100112360894e-05, |
| "loss": 0.0048, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.874051282051282, |
| "grad_norm": 0.0029398370534181595, |
| "learning_rate": 2.1123460147868763e-05, |
| "loss": 0.0048, |
| "step": 571 |
| }, |
| { |
| "epoch": 1.8773333333333333, |
| "grad_norm": 0.003072077641263604, |
| "learning_rate": 2.1047464057901542e-05, |
| "loss": 0.0048, |
| "step": 572 |
| }, |
| { |
| "epoch": 1.8806153846153846, |
| "grad_norm": 0.002605011221021414, |
| "learning_rate": 2.0974114103169712e-05, |
| "loss": 0.0048, |
| "step": 573 |
| }, |
| { |
| "epoch": 1.8838974358974359, |
| "grad_norm": 0.002371675567701459, |
| "learning_rate": 2.0903412465667293e-05, |
| "loss": 0.0047, |
| "step": 574 |
| }, |
| { |
| "epoch": 1.8871794871794871, |
| "grad_norm": 0.002911495743319392, |
| "learning_rate": 2.0835361248606867e-05, |
| "loss": 0.0047, |
| "step": 575 |
| }, |
| { |
| "epoch": 1.8871794871794871, |
| "eval_loss": 0.0050178528763353825, |
| "eval_runtime": 1.0828, |
| "eval_samples_per_second": 46.176, |
| "eval_steps_per_second": 46.176, |
| "step": 575 |
| }, |
| { |
| "epoch": 1.8904615384615384, |
| "grad_norm": 0.0025259945541620255, |
| "learning_rate": 2.0769962476357068e-05, |
| "loss": 0.0047, |
| "step": 576 |
| }, |
| { |
| "epoch": 1.8937435897435897, |
| "grad_norm": 0.0023200158029794693, |
| "learning_rate": 2.070721809438233e-05, |
| "loss": 0.0047, |
| "step": 577 |
| }, |
| { |
| "epoch": 1.897025641025641, |
| "grad_norm": 0.0023292931728065014, |
| "learning_rate": 2.0647129969185046e-05, |
| "loss": 0.0048, |
| "step": 578 |
| }, |
| { |
| "epoch": 1.9003076923076923, |
| "grad_norm": 0.0025951117277145386, |
| "learning_rate": 2.058969988825001e-05, |
| "loss": 0.0047, |
| "step": 579 |
| }, |
| { |
| "epoch": 1.9035897435897438, |
| "grad_norm": 0.0026415924075990915, |
| "learning_rate": 2.0534929559991233e-05, |
| "loss": 0.0047, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.9068717948717948, |
| "grad_norm": 0.0020874382462352514, |
| "learning_rate": 2.0482820613701192e-05, |
| "loss": 0.0046, |
| "step": 581 |
| }, |
| { |
| "epoch": 1.9101538461538463, |
| "grad_norm": 0.002052360912784934, |
| "learning_rate": 2.043337459950229e-05, |
| "loss": 0.0046, |
| "step": 582 |
| }, |
| { |
| "epoch": 1.9134358974358974, |
| "grad_norm": 0.0021120973397046328, |
| "learning_rate": 2.0386592988300747e-05, |
| "loss": 0.0046, |
| "step": 583 |
| }, |
| { |
| "epoch": 1.9167179487179489, |
| "grad_norm": 0.0021454044617712498, |
| "learning_rate": 2.03424771717429e-05, |
| "loss": 0.0047, |
| "step": 584 |
| }, |
| { |
| "epoch": 1.92, |
| "grad_norm": 0.0023362315259873867, |
| "learning_rate": 2.0301028462173774e-05, |
| "loss": 0.0048, |
| "step": 585 |
| }, |
| { |
| "epoch": 1.9232820512820514, |
| "grad_norm": 0.002209689933806658, |
| "learning_rate": 2.0262248092598006e-05, |
| "loss": 0.0048, |
| "step": 586 |
| }, |
| { |
| "epoch": 1.9265641025641025, |
| "grad_norm": 0.0022381660528481007, |
| "learning_rate": 2.0226137216643222e-05, |
| "loss": 0.0048, |
| "step": 587 |
| }, |
| { |
| "epoch": 1.929846153846154, |
| "grad_norm": 0.002202109433710575, |
| "learning_rate": 2.019269690852569e-05, |
| "loss": 0.0047, |
| "step": 588 |
| }, |
| { |
| "epoch": 1.933128205128205, |
| "grad_norm": 0.0021981867030262947, |
| "learning_rate": 2.016192816301837e-05, |
| "loss": 0.0046, |
| "step": 589 |
| }, |
| { |
| "epoch": 1.9364102564102565, |
| "grad_norm": 0.002059696475043893, |
| "learning_rate": 2.0133831895421322e-05, |
| "loss": 0.0047, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.9396923076923076, |
| "grad_norm": 0.0020739359315484762, |
| "learning_rate": 2.0108408941534486e-05, |
| "loss": 0.0046, |
| "step": 591 |
| }, |
| { |
| "epoch": 1.942974358974359, |
| "grad_norm": 0.0024034185335040092, |
| "learning_rate": 2.00856600576328e-05, |
| "loss": 0.0047, |
| "step": 592 |
| }, |
| { |
| "epoch": 1.9462564102564102, |
| "grad_norm": 0.0022281610872596502, |
| "learning_rate": 2.006558592044373e-05, |
| "loss": 0.0048, |
| "step": 593 |
| }, |
| { |
| "epoch": 1.9495384615384617, |
| "grad_norm": 0.0029593328945338726, |
| "learning_rate": 2.0048187127127092e-05, |
| "loss": 0.0049, |
| "step": 594 |
| }, |
| { |
| "epoch": 1.9528205128205127, |
| "grad_norm": 0.002573527628555894, |
| "learning_rate": 2.003346419525735e-05, |
| "loss": 0.0048, |
| "step": 595 |
| }, |
| { |
| "epoch": 1.9561025641025642, |
| "grad_norm": 0.002822197275236249, |
| "learning_rate": 2.002141756280818e-05, |
| "loss": 0.0047, |
| "step": 596 |
| }, |
| { |
| "epoch": 1.9593846153846153, |
| "grad_norm": 0.002600959734991193, |
| "learning_rate": 2.001204758813944e-05, |
| "loss": 0.0047, |
| "step": 597 |
| }, |
| { |
| "epoch": 1.9626666666666668, |
| "grad_norm": 0.003187810303643346, |
| "learning_rate": 2.0005354549986523e-05, |
| "loss": 0.0047, |
| "step": 598 |
| }, |
| { |
| "epoch": 1.9659487179487178, |
| "grad_norm": 0.0029263379983603954, |
| "learning_rate": 2.0001338647452058e-05, |
| "loss": 0.0048, |
| "step": 599 |
| }, |
| { |
| "epoch": 1.9692307692307693, |
| "grad_norm": 0.003195718163624406, |
| "learning_rate": 2e-05, |
| "loss": 0.0048, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.9692307692307693, |
| "eval_loss": 0.004978457931429148, |
| "eval_runtime": 1.0723, |
| "eval_samples_per_second": 46.628, |
| "eval_steps_per_second": 46.628, |
| "step": 600 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 600, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 80, |
| "early_stopping_threshold": 0.0 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 3 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.654552359691878e+16, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|