Invalid JSON: Unexpected token 'N' — the document contains `"eval_loss": NaN`, which is not valid JSON (RFC 8259 forbids NaN/Infinity literals; represent missing numeric values as null instead).
| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.995163584637269, | |
| "eval_steps": 50, | |
| "global_step": 987, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0030346135609293505, | |
| "grad_norm": 5.274968147277832, | |
| "learning_rate": 2.0202020202020205e-07, | |
| "loss": 0.8522, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.006069227121858701, | |
| "grad_norm": 5.754838466644287, | |
| "learning_rate": 4.040404040404041e-07, | |
| "loss": 0.854, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.009103840682788052, | |
| "grad_norm": 5.632802963256836, | |
| "learning_rate": 6.060606060606061e-07, | |
| "loss": 0.873, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.012138454243717402, | |
| "grad_norm": 5.394160747528076, | |
| "learning_rate": 8.080808080808082e-07, | |
| "loss": 0.8563, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.015173067804646752, | |
| "grad_norm": 4.5878424644470215, | |
| "learning_rate": 1.01010101010101e-06, | |
| "loss": 0.8425, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.018207681365576104, | |
| "grad_norm": 4.280163764953613, | |
| "learning_rate": 1.2121212121212122e-06, | |
| "loss": 0.8569, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.021242294926505454, | |
| "grad_norm": 3.5766515731811523, | |
| "learning_rate": 1.4141414141414143e-06, | |
| "loss": 0.8309, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.024276908487434804, | |
| "grad_norm": 4.070709228515625, | |
| "learning_rate": 1.6161616161616164e-06, | |
| "loss": 0.825, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.027311522048364154, | |
| "grad_norm": 4.395944595336914, | |
| "learning_rate": 1.8181818181818183e-06, | |
| "loss": 0.8419, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.030346135609293504, | |
| "grad_norm": 3.765310764312744, | |
| "learning_rate": 2.02020202020202e-06, | |
| "loss": 0.8426, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.03338074917022286, | |
| "grad_norm": 2.906996488571167, | |
| "learning_rate": 2.222222222222222e-06, | |
| "loss": 0.8263, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.03641536273115221, | |
| "grad_norm": 2.8421216011047363, | |
| "learning_rate": 2.4242424242424244e-06, | |
| "loss": 0.8242, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.03944997629208156, | |
| "grad_norm": 2.6818017959594727, | |
| "learning_rate": 2.6262626262626267e-06, | |
| "loss": 0.8206, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.04248458985301091, | |
| "grad_norm": 2.8597841262817383, | |
| "learning_rate": 2.8282828282828286e-06, | |
| "loss": 0.8201, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.04551920341394026, | |
| "grad_norm": 2.673666000366211, | |
| "learning_rate": 3.0303030303030305e-06, | |
| "loss": 0.8215, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.04855381697486961, | |
| "grad_norm": 2.636281728744507, | |
| "learning_rate": 3.232323232323233e-06, | |
| "loss": 0.8037, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.05158843053579896, | |
| "grad_norm": 2.502042770385742, | |
| "learning_rate": 3.4343434343434347e-06, | |
| "loss": 0.8162, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.05462304409672831, | |
| "grad_norm": 2.562307834625244, | |
| "learning_rate": 3.6363636363636366e-06, | |
| "loss": 0.8066, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.05765765765765766, | |
| "grad_norm": 2.6667816638946533, | |
| "learning_rate": 3.8383838383838385e-06, | |
| "loss": 0.8197, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.06069227121858701, | |
| "grad_norm": 2.293628215789795, | |
| "learning_rate": 4.04040404040404e-06, | |
| "loss": 0.8092, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.06372688477951635, | |
| "grad_norm": 2.4843485355377197, | |
| "learning_rate": 4.242424242424243e-06, | |
| "loss": 0.8082, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.06676149834044572, | |
| "grad_norm": 2.5178024768829346, | |
| "learning_rate": 4.444444444444444e-06, | |
| "loss": 0.8165, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.06979611190137507, | |
| "grad_norm": 2.6686830520629883, | |
| "learning_rate": 4.646464646464647e-06, | |
| "loss": 0.773, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.07283072546230442, | |
| "grad_norm": 2.2187390327453613, | |
| "learning_rate": 4.848484848484849e-06, | |
| "loss": 0.8026, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.07586533902323377, | |
| "grad_norm": 2.4837453365325928, | |
| "learning_rate": 5.0505050505050515e-06, | |
| "loss": 0.7962, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.07889995258416312, | |
| "grad_norm": 2.3953895568847656, | |
| "learning_rate": 5.252525252525253e-06, | |
| "loss": 0.7902, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.08193456614509247, | |
| "grad_norm": 2.384993553161621, | |
| "learning_rate": 5.4545454545454545e-06, | |
| "loss": 0.8133, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.08496917970602182, | |
| "grad_norm": 2.5588831901550293, | |
| "learning_rate": 5.656565656565657e-06, | |
| "loss": 0.8043, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.08800379326695117, | |
| "grad_norm": 2.254196882247925, | |
| "learning_rate": 5.858585858585859e-06, | |
| "loss": 0.7967, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.09103840682788052, | |
| "grad_norm": 2.2686617374420166, | |
| "learning_rate": 6.060606060606061e-06, | |
| "loss": 0.8036, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.09407302038880987, | |
| "grad_norm": 2.3785650730133057, | |
| "learning_rate": 6.262626262626264e-06, | |
| "loss": 0.799, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.09710763394973922, | |
| "grad_norm": 2.4035139083862305, | |
| "learning_rate": 6.464646464646466e-06, | |
| "loss": 0.8086, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.10014224751066857, | |
| "grad_norm": 2.2935521602630615, | |
| "learning_rate": 6.666666666666667e-06, | |
| "loss": 0.7842, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.10317686107159792, | |
| "grad_norm": 2.3163795471191406, | |
| "learning_rate": 6.868686868686869e-06, | |
| "loss": 0.8041, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.10621147463252727, | |
| "grad_norm": 2.3953793048858643, | |
| "learning_rate": 7.070707070707071e-06, | |
| "loss": 0.7929, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.10924608819345662, | |
| "grad_norm": 2.4116768836975098, | |
| "learning_rate": 7.272727272727273e-06, | |
| "loss": 0.7943, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.11228070175438597, | |
| "grad_norm": 2.5221264362335205, | |
| "learning_rate": 7.474747474747476e-06, | |
| "loss": 0.8036, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.11531531531531532, | |
| "grad_norm": 2.4730491638183594, | |
| "learning_rate": 7.676767676767677e-06, | |
| "loss": 0.8063, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.11834992887624467, | |
| "grad_norm": 2.1653268337249756, | |
| "learning_rate": 7.87878787878788e-06, | |
| "loss": 0.8009, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.12138454243717402, | |
| "grad_norm": 2.266632080078125, | |
| "learning_rate": 8.08080808080808e-06, | |
| "loss": 0.7825, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.12441915599810337, | |
| "grad_norm": 2.2630982398986816, | |
| "learning_rate": 8.282828282828283e-06, | |
| "loss": 0.8004, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.1274537695590327, | |
| "grad_norm": 2.403125762939453, | |
| "learning_rate": 8.484848484848486e-06, | |
| "loss": 0.7911, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.13048838311996205, | |
| "grad_norm": 2.4483842849731445, | |
| "learning_rate": 8.686868686868687e-06, | |
| "loss": 0.7861, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.13352299668089143, | |
| "grad_norm": 2.346780776977539, | |
| "learning_rate": 8.888888888888888e-06, | |
| "loss": 0.7969, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.13655761024182078, | |
| "grad_norm": 2.3152859210968018, | |
| "learning_rate": 9.090909090909091e-06, | |
| "loss": 0.7842, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.13959222380275013, | |
| "grad_norm": 2.2333388328552246, | |
| "learning_rate": 9.292929292929294e-06, | |
| "loss": 0.7886, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.14262683736367948, | |
| "grad_norm": 2.360898017883301, | |
| "learning_rate": 9.494949494949497e-06, | |
| "loss": 0.8146, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.14566145092460883, | |
| "grad_norm": 2.5461983680725098, | |
| "learning_rate": 9.696969696969698e-06, | |
| "loss": 0.8005, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.14869606448553818, | |
| "grad_norm": 2.4684877395629883, | |
| "learning_rate": 9.8989898989899e-06, | |
| "loss": 0.8031, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.15173067804646753, | |
| "grad_norm": 2.3025074005126953, | |
| "learning_rate": 1.0101010101010103e-05, | |
| "loss": 0.7828, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.15173067804646753, | |
|         "eval_loss": null, | |
| "eval_runtime": 233.599, | |
| "eval_samples_per_second": 17.8, | |
| "eval_steps_per_second": 4.452, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.15476529160739688, | |
| "grad_norm": 2.4276065826416016, | |
| "learning_rate": 1.0303030303030304e-05, | |
| "loss": 0.7801, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.15779990516832623, | |
| "grad_norm": 2.279287815093994, | |
| "learning_rate": 1.0505050505050507e-05, | |
| "loss": 0.8055, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.16083451872925558, | |
| "grad_norm": 2.7459404468536377, | |
| "learning_rate": 1.0707070707070708e-05, | |
| "loss": 0.7864, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.16386913229018493, | |
| "grad_norm": 2.3488147258758545, | |
| "learning_rate": 1.0909090909090909e-05, | |
| "loss": 0.7869, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.16690374585111428, | |
| "grad_norm": 2.547166109085083, | |
| "learning_rate": 1.1111111111111113e-05, | |
| "loss": 0.7869, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.16993835941204363, | |
| "grad_norm": 2.4360105991363525, | |
| "learning_rate": 1.1313131313131314e-05, | |
| "loss": 0.7883, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.17297297297297298, | |
| "grad_norm": 2.8864669799804688, | |
| "learning_rate": 1.1515151515151517e-05, | |
| "loss": 0.7927, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.17600758653390233, | |
| "grad_norm": 2.3167998790740967, | |
| "learning_rate": 1.1717171717171718e-05, | |
| "loss": 0.7999, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.17904220009483168, | |
| "grad_norm": 3.075058698654175, | |
| "learning_rate": 1.191919191919192e-05, | |
| "loss": 0.7888, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.18207681365576103, | |
| "grad_norm": 2.38899827003479, | |
| "learning_rate": 1.2121212121212122e-05, | |
| "loss": 0.787, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.18511142721669038, | |
| "grad_norm": 3.121044874191284, | |
| "learning_rate": 1.2323232323232323e-05, | |
| "loss": 0.8053, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.18814604077761973, | |
| "grad_norm": 2.580725908279419, | |
| "learning_rate": 1.2525252525252527e-05, | |
| "loss": 0.7819, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.19118065433854908, | |
| "grad_norm": 3.1028575897216797, | |
| "learning_rate": 1.2727272727272728e-05, | |
| "loss": 0.7886, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.19421526789947843, | |
| "grad_norm": 2.6575424671173096, | |
| "learning_rate": 1.2929292929292931e-05, | |
| "loss": 0.7904, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.19724988146040778, | |
| "grad_norm": 2.8595755100250244, | |
| "learning_rate": 1.3131313131313132e-05, | |
| "loss": 0.7955, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.20028449502133713, | |
| "grad_norm": 2.235410451889038, | |
| "learning_rate": 1.3333333333333333e-05, | |
| "loss": 0.796, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.20331910858226648, | |
| "grad_norm": 2.7857375144958496, | |
| "learning_rate": 1.3535353535353538e-05, | |
| "loss": 0.8008, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.20635372214319583, | |
| "grad_norm": 2.310331106185913, | |
| "learning_rate": 1.3737373737373739e-05, | |
| "loss": 0.7802, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.20938833570412518, | |
| "grad_norm": 2.8039300441741943, | |
| "learning_rate": 1.3939393939393942e-05, | |
| "loss": 0.7946, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.21242294926505453, | |
| "grad_norm": 2.345369815826416, | |
| "learning_rate": 1.4141414141414143e-05, | |
| "loss": 0.7835, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.21545756282598388, | |
| "grad_norm": 2.8384764194488525, | |
| "learning_rate": 1.4343434343434344e-05, | |
| "loss": 0.7855, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.21849217638691323, | |
| "grad_norm": 2.371159076690674, | |
| "learning_rate": 1.4545454545454546e-05, | |
| "loss": 0.7689, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.22152678994784258, | |
| "grad_norm": 2.9578475952148438, | |
| "learning_rate": 1.4747474747474747e-05, | |
| "loss": 0.7918, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.22456140350877193, | |
| "grad_norm": 2.8900325298309326, | |
| "learning_rate": 1.4949494949494952e-05, | |
| "loss": 0.7746, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.22759601706970128, | |
| "grad_norm": 2.4469008445739746, | |
| "learning_rate": 1.5151515151515153e-05, | |
| "loss": 0.7919, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.23063063063063063, | |
| "grad_norm": 2.410154104232788, | |
| "learning_rate": 1.5353535353535354e-05, | |
| "loss": 0.7987, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.23366524419155998, | |
| "grad_norm": 2.599086284637451, | |
| "learning_rate": 1.555555555555556e-05, | |
| "loss": 0.7714, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.23669985775248933, | |
| "grad_norm": 2.4329092502593994, | |
| "learning_rate": 1.575757575757576e-05, | |
| "loss": 0.8041, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.23973447131341868, | |
| "grad_norm": 2.7239413261413574, | |
| "learning_rate": 1.595959595959596e-05, | |
| "loss": 0.7941, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.24276908487434803, | |
| "grad_norm": 2.8684301376342773, | |
| "learning_rate": 1.616161616161616e-05, | |
| "loss": 0.8076, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.24580369843527738, | |
| "grad_norm": 3.004608631134033, | |
| "learning_rate": 1.6363636363636366e-05, | |
| "loss": 0.7893, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.24883831199620673, | |
| "grad_norm": 2.6391353607177734, | |
| "learning_rate": 1.6565656565656567e-05, | |
| "loss": 0.8116, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.2518729255571361, | |
| "grad_norm": 3.3293023109436035, | |
| "learning_rate": 1.6767676767676768e-05, | |
| "loss": 0.7952, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.2549075391180654, | |
| "grad_norm": 2.5788087844848633, | |
| "learning_rate": 1.6969696969696972e-05, | |
| "loss": 0.7905, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.2579421526789948, | |
| "grad_norm": 3.4907588958740234, | |
| "learning_rate": 1.7171717171717173e-05, | |
| "loss": 0.7903, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.2609767662399241, | |
| "grad_norm": 2.8050403594970703, | |
| "learning_rate": 1.7373737373737375e-05, | |
| "loss": 0.7813, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.2640113798008535, | |
| "grad_norm": 3.180819034576416, | |
| "learning_rate": 1.7575757575757576e-05, | |
| "loss": 0.7797, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.26704599336178286, | |
| "grad_norm": 2.614614486694336, | |
| "learning_rate": 1.7777777777777777e-05, | |
| "loss": 0.7903, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.2700806069227122, | |
| "grad_norm": 3.212218761444092, | |
| "learning_rate": 1.797979797979798e-05, | |
| "loss": 0.7927, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.27311522048364156, | |
| "grad_norm": 2.6879336833953857, | |
| "learning_rate": 1.8181818181818182e-05, | |
| "loss": 0.8099, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.2761498340445709, | |
| "grad_norm": 3.2700448036193848, | |
| "learning_rate": 1.8383838383838387e-05, | |
| "loss": 0.794, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.27918444760550026, | |
| "grad_norm": 3.1249783039093018, | |
| "learning_rate": 1.8585858585858588e-05, | |
| "loss": 0.7807, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.2822190611664296, | |
| "grad_norm": 2.4789459705352783, | |
| "learning_rate": 1.8787878787878792e-05, | |
| "loss": 0.7829, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.28525367472735896, | |
| "grad_norm": 2.624887466430664, | |
| "learning_rate": 1.8989898989898993e-05, | |
| "loss": 0.8043, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.2882882882882883, | |
| "grad_norm": 2.3336539268493652, | |
| "learning_rate": 1.9191919191919194e-05, | |
| "loss": 0.7827, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.29132290184921766, | |
| "grad_norm": 2.5490119457244873, | |
| "learning_rate": 1.9393939393939395e-05, | |
| "loss": 0.8072, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.294357515410147, | |
| "grad_norm": 2.5659990310668945, | |
| "learning_rate": 1.9595959595959596e-05, | |
| "loss": 0.7986, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.29739212897107636, | |
| "grad_norm": 2.3554656505584717, | |
| "learning_rate": 1.97979797979798e-05, | |
| "loss": 0.8059, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.3004267425320057, | |
| "grad_norm": 2.4364328384399414, | |
| "learning_rate": 2e-05, | |
| "loss": 0.7874, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.30346135609293506, | |
| "grad_norm": 2.3030965328216553, | |
| "learning_rate": 1.9999937418875125e-05, | |
| "loss": 0.7854, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.30346135609293506, | |
|         "eval_loss": null, | |
| "eval_runtime": 204.7053, | |
| "eval_samples_per_second": 20.312, | |
| "eval_steps_per_second": 5.08, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.3064959696538644, | |
| "grad_norm": 2.318124294281006, | |
| "learning_rate": 1.9999749676283775e-05, | |
| "loss": 0.7964, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.30953058321479376, | |
| "grad_norm": 2.2906177043914795, | |
| "learning_rate": 1.9999436774575783e-05, | |
| "loss": 0.8049, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.3125651967757231, | |
| "grad_norm": 2.2947778701782227, | |
| "learning_rate": 1.999899871766749e-05, | |
| "loss": 0.7881, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.31559981033665246, | |
| "grad_norm": 2.548788547515869, | |
| "learning_rate": 1.999843551104172e-05, | |
| "loss": 0.803, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.3186344238975818, | |
| "grad_norm": 2.3735954761505127, | |
| "learning_rate": 1.9997747161747696e-05, | |
| "loss": 0.8044, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.32166903745851116, | |
| "grad_norm": 2.620364189147949, | |
| "learning_rate": 1.9996933678400948e-05, | |
| "loss": 0.7806, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.3247036510194405, | |
| "grad_norm": 2.3897509574890137, | |
| "learning_rate": 1.999599507118322e-05, | |
| "loss": 0.7862, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.32773826458036986, | |
| "grad_norm": 2.6068966388702393, | |
| "learning_rate": 1.9994931351842327e-05, | |
| "loss": 0.8051, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.3307728781412992, | |
| "grad_norm": 2.560683488845825, | |
| "learning_rate": 1.999374253369202e-05, | |
| "loss": 0.7956, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.33380749170222856, | |
| "grad_norm": 2.1629045009613037, | |
| "learning_rate": 1.999242863161182e-05, | |
| "loss": 0.8074, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.3368421052631579, | |
| "grad_norm": 2.653731346130371, | |
| "learning_rate": 1.999098966204682e-05, | |
| "loss": 0.8071, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.33987671882408726, | |
| "grad_norm": 2.484330177307129, | |
| "learning_rate": 1.9989425643007475e-05, | |
| "loss": 0.8135, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.3429113323850166, | |
| "grad_norm": 2.3897571563720703, | |
| "learning_rate": 1.9987736594069417e-05, | |
| "loss": 0.8066, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.34594594594594597, | |
| "grad_norm": 2.653904676437378, | |
| "learning_rate": 1.998592253637315e-05, | |
| "loss": 0.7908, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.3489805595068753, | |
| "grad_norm": 2.2755186557769775, | |
| "learning_rate": 1.9983983492623832e-05, | |
| "loss": 0.7824, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.35201517306780467, | |
| "grad_norm": 2.260126829147339, | |
| "learning_rate": 1.9981919487090973e-05, | |
| "loss": 0.7916, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.355049786628734, | |
| "grad_norm": 2.6538352966308594, | |
| "learning_rate": 1.9979730545608128e-05, | |
| "loss": 0.7927, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.35808440018966337, | |
| "grad_norm": 2.2571558952331543, | |
| "learning_rate": 1.9977416695572577e-05, | |
| "loss": 0.7826, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.3611190137505927, | |
| "grad_norm": 2.5154271125793457, | |
| "learning_rate": 1.9974977965945e-05, | |
| "loss": 0.807, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.36415362731152207, | |
| "grad_norm": 2.4908971786499023, | |
| "learning_rate": 1.9972414387249074e-05, | |
| "loss": 0.7831, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.3671882408724514, | |
| "grad_norm": 2.3925859928131104, | |
| "learning_rate": 1.996972599157113e-05, | |
| "loss": 0.7844, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.37022285443338077, | |
| "grad_norm": 2.3527793884277344, | |
| "learning_rate": 1.9966912812559733e-05, | |
| "loss": 0.7921, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.3732574679943101, | |
| "grad_norm": 2.2694365978240967, | |
| "learning_rate": 1.9963974885425267e-05, | |
| "loss": 0.7816, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.37629208155523947, | |
| "grad_norm": 2.436676025390625, | |
| "learning_rate": 1.9960912246939486e-05, | |
| "loss": 0.7782, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.3793266951161688, | |
| "grad_norm": 2.3587653636932373, | |
| "learning_rate": 1.9957724935435065e-05, | |
| "loss": 0.8024, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.38236130867709817, | |
| "grad_norm": 2.3145172595977783, | |
| "learning_rate": 1.9954412990805107e-05, | |
| "loss": 0.8115, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.3853959222380275, | |
| "grad_norm": 2.271946430206299, | |
| "learning_rate": 1.995097645450266e-05, | |
| "loss": 0.7975, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.38843053579895687, | |
| "grad_norm": 2.2533860206604004, | |
| "learning_rate": 1.994741536954019e-05, | |
| "loss": 0.8187, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.3914651493598862, | |
| "grad_norm": 2.5504581928253174, | |
| "learning_rate": 1.994372978048903e-05, | |
| "loss": 0.7913, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.39449976292081557, | |
| "grad_norm": 2.3467888832092285, | |
| "learning_rate": 1.993991973347884e-05, | |
| "loss": 0.7955, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.3975343764817449, | |
| "grad_norm": 2.356933832168579, | |
| "learning_rate": 1.9935985276197033e-05, | |
| "loss": 0.808, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.40056899004267427, | |
| "grad_norm": 2.608546257019043, | |
| "learning_rate": 1.9931926457888155e-05, | |
| "loss": 0.785, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.4036036036036036, | |
| "grad_norm": 2.2730495929718018, | |
| "learning_rate": 1.9927743329353295e-05, | |
| "loss": 0.79, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.40663821716453297, | |
| "grad_norm": 2.2720224857330322, | |
| "learning_rate": 1.992343594294943e-05, | |
| "loss": 0.8084, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.4096728307254623, | |
| "grad_norm": 2.2325122356414795, | |
| "learning_rate": 1.9919004352588768e-05, | |
| "loss": 0.8008, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.41270744428639167, | |
| "grad_norm": 2.4259414672851562, | |
| "learning_rate": 1.9914448613738107e-05, | |
| "loss": 0.7827, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.415742057847321, | |
| "grad_norm": 2.2663819789886475, | |
| "learning_rate": 1.9909768783418086e-05, | |
| "loss": 0.8059, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.41877667140825037, | |
| "grad_norm": 2.910830020904541, | |
| "learning_rate": 1.990496492020252e-05, | |
| "loss": 0.8159, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.4218112849691797, | |
| "grad_norm": 2.4485902786254883, | |
| "learning_rate": 1.9900037084217637e-05, | |
| "loss": 0.7921, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.42484589853010907, | |
| "grad_norm": 2.3199424743652344, | |
| "learning_rate": 1.989498533714135e-05, | |
| "loss": 0.8006, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.4278805120910384, | |
| "grad_norm": 2.529900550842285, | |
| "learning_rate": 1.9889809742202454e-05, | |
| "loss": 0.7805, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.43091512565196777, | |
| "grad_norm": 2.5135438442230225, | |
| "learning_rate": 1.988451036417986e-05, | |
| "loss": 0.8088, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.4339497392128971, | |
| "grad_norm": 2.359200954437256, | |
| "learning_rate": 1.9879087269401782e-05, | |
| "loss": 0.7963, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.43698435277382647, | |
| "grad_norm": 2.307164192199707, | |
| "learning_rate": 1.9873540525744888e-05, | |
| "loss": 0.79, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.4400189663347558, | |
| "grad_norm": 2.341834545135498, | |
| "learning_rate": 1.986787020263347e-05, | |
| "loss": 0.7955, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.44305357989568517, | |
| "grad_norm": 2.409449338912964, | |
| "learning_rate": 1.986207637103857e-05, | |
| "loss": 0.7761, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.4460881934566145, | |
| "grad_norm": 2.3119232654571533, | |
| "learning_rate": 1.9856159103477085e-05, | |
| "loss": 0.7992, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.44912280701754387, | |
| "grad_norm": 2.3376715183258057, | |
| "learning_rate": 1.9850118474010873e-05, | |
| "loss": 0.784, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.4521574205784732, | |
| "grad_norm": 2.387392997741699, | |
| "learning_rate": 1.98439545582458e-05, | |
| "loss": 0.7806, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.45519203413940257, | |
| "grad_norm": 2.2223150730133057, | |
| "learning_rate": 1.983766743333084e-05, | |
| "loss": 0.7914, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.45519203413940257, | |
|         "eval_loss": null, | |
| "eval_runtime": 205.0592, | |
| "eval_samples_per_second": 20.277, | |
| "eval_steps_per_second": 5.072, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.4582266477003319, | |
| "grad_norm": 2.727046012878418, | |
| "learning_rate": 1.9831257177957045e-05, | |
| "loss": 0.793, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.46126126126126127, | |
| "grad_norm": 2.266788959503174, | |
| "learning_rate": 1.9824723872356623e-05, | |
| "loss": 0.8127, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.4642958748221906, | |
| "grad_norm": 2.530904531478882, | |
| "learning_rate": 1.9818067598301894e-05, | |
| "loss": 0.7973, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.46733048838311997, | |
| "grad_norm": 2.3305609226226807, | |
| "learning_rate": 1.981128843910428e-05, | |
| "loss": 0.7961, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.4703651019440493, | |
| "grad_norm": 2.1240079402923584, | |
| "learning_rate": 1.9804386479613268e-05, | |
| "loss": 0.8093, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.47339971550497867, | |
| "grad_norm": 2.625185489654541, | |
| "learning_rate": 1.9797361806215335e-05, | |
| "loss": 0.7988, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.476434329065908, | |
| "grad_norm": 2.4407734870910645, | |
| "learning_rate": 1.9790214506832868e-05, | |
| "loss": 0.8166, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.47946894262683737, | |
| "grad_norm": 2.4013476371765137, | |
| "learning_rate": 1.9782944670923075e-05, | |
| "loss": 0.7935, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.4825035561877667, | |
| "grad_norm": 2.357010841369629, | |
| "learning_rate": 1.9775552389476865e-05, | |
| "loss": 0.7818, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.48553816974869607, | |
| "grad_norm": 2.1862401962280273, | |
| "learning_rate": 1.9768037755017687e-05, | |
| "loss": 0.7879, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.4885727833096254, | |
| "grad_norm": 2.3130927085876465, | |
| "learning_rate": 1.97604008616004e-05, | |
| "loss": 0.7914, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.49160739687055477, | |
| "grad_norm": 2.0661509037017822, | |
| "learning_rate": 1.9752641804810083e-05, | |
| "loss": 0.8023, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.4946420104314841, | |
| "grad_norm": 2.2117955684661865, | |
| "learning_rate": 1.9744760681760832e-05, | |
| "loss": 0.7972, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.49767662399241347, | |
| "grad_norm": 2.603163242340088, | |
| "learning_rate": 1.973675759109456e-05, | |
| "loss": 0.7913, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.5007112375533428, | |
| "grad_norm": 2.2511062622070312, | |
| "learning_rate": 1.9728632632979746e-05, | |
| "loss": 0.7914, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.5037458511142722, | |
| "grad_norm": 2.375213861465454, | |
| "learning_rate": 1.9720385909110197e-05, | |
| "loss": 0.7928, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.5067804646752015, | |
| "grad_norm": 2.5348660945892334, | |
| "learning_rate": 1.9712017522703764e-05, | |
| "loss": 0.7894, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.5098150782361308, | |
| "grad_norm": 2.2094035148620605, | |
| "learning_rate": 1.9703527578501052e-05, | |
| "loss": 0.7813, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.5128496917970602, | |
| "grad_norm": 2.2283380031585693, | |
| "learning_rate": 1.9694916182764113e-05, | |
| "loss": 0.7877, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.5158843053579896, | |
| "grad_norm": 2.189119338989258, | |
| "learning_rate": 1.9686183443275118e-05, | |
| "loss": 0.7989, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.518918918918919, | |
| "grad_norm": 2.2636640071868896, | |
| "learning_rate": 1.967732946933499e-05, | |
| "loss": 0.8059, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.5219535324798482, | |
| "grad_norm": 2.243251085281372, | |
| "learning_rate": 1.9668354371762066e-05, | |
| "loss": 0.7904, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.5249881460407776, | |
| "grad_norm": 2.215536117553711, | |
| "learning_rate": 1.9659258262890683e-05, | |
| "loss": 0.7912, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.528022759601707, | |
| "grad_norm": 2.2998199462890625, | |
| "learning_rate": 1.9650041256569792e-05, | |
| "loss": 0.797, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.5310573731626363, | |
| "grad_norm": 2.3586950302124023, | |
| "learning_rate": 1.9640703468161508e-05, | |
| "loss": 0.7907, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.5340919867235657, | |
| "grad_norm": 2.257404088973999, | |
| "learning_rate": 1.96312450145397e-05, | |
| "loss": 0.7977, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.537126600284495, | |
| "grad_norm": 2.4808526039123535, | |
| "learning_rate": 1.9621666014088495e-05, | |
| "loss": 0.7929, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.5401612138454244, | |
| "grad_norm": 2.208704710006714, | |
| "learning_rate": 1.9611966586700825e-05, | |
| "loss": 0.7975, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.5431958274063537, | |
| "grad_norm": 2.4615161418914795, | |
| "learning_rate": 1.9602146853776894e-05, | |
| "loss": 0.7991, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.5462304409672831, | |
| "grad_norm": 2.4766757488250732, | |
| "learning_rate": 1.9592206938222703e-05, | |
| "loss": 0.7911, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.5492650545282124, | |
| "grad_norm": 2.2009432315826416, | |
| "learning_rate": 1.9582146964448457e-05, | |
| "loss": 0.788, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.5522996680891418, | |
| "grad_norm": 2.258129358291626, | |
| "learning_rate": 1.9571967058367067e-05, | |
| "loss": 0.7893, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.5553342816500711, | |
| "grad_norm": 2.1914985179901123, | |
| "learning_rate": 1.956166734739251e-05, | |
| "loss": 0.8057, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.5583688952110005, | |
| "grad_norm": 2.2815279960632324, | |
| "learning_rate": 1.9551247960438298e-05, | |
| "loss": 0.7823, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.5614035087719298, | |
| "grad_norm": 2.2393579483032227, | |
| "learning_rate": 1.954070902791582e-05, | |
| "loss": 0.7899, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.5644381223328592, | |
| "grad_norm": 2.56640625, | |
| "learning_rate": 1.953005068173272e-05, | |
| "loss": 0.7731, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.5674727358937885, | |
| "grad_norm": 2.3896234035491943, | |
| "learning_rate": 1.9519273055291266e-05, | |
| "loss": 0.7936, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.5705073494547179, | |
| "grad_norm": 2.21494722366333, | |
| "learning_rate": 1.9508376283486653e-05, | |
| "loss": 0.8121, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.5735419630156472, | |
| "grad_norm": 2.400538444519043, | |
| "learning_rate": 1.949736050270532e-05, | |
| "loss": 0.7938, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.5765765765765766, | |
| "grad_norm": 2.1337621212005615, | |
| "learning_rate": 1.9486225850823265e-05, | |
| "loss": 0.8049, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.579611190137506, | |
| "grad_norm": 2.314168930053711, | |
| "learning_rate": 1.9474972467204298e-05, | |
| "loss": 0.8109, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.5826458036984353, | |
| "grad_norm": 2.2364132404327393, | |
| "learning_rate": 1.9463600492698297e-05, | |
| "loss": 0.78, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.5856804172593646, | |
| "grad_norm": 2.204206705093384, | |
| "learning_rate": 1.945211006963945e-05, | |
| "loss": 0.8187, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.588715030820294, | |
| "grad_norm": 2.1865625381469727, | |
| "learning_rate": 1.9440501341844484e-05, | |
| "loss": 0.7844, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.5917496443812233, | |
| "grad_norm": 2.143092155456543, | |
| "learning_rate": 1.9428774454610845e-05, | |
| "loss": 0.7967, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.5947842579421527, | |
| "grad_norm": 2.3440985679626465, | |
| "learning_rate": 1.9416929554714887e-05, | |
| "loss": 0.8096, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.597818871503082, | |
| "grad_norm": 2.43977689743042, | |
| "learning_rate": 1.9404966790410047e-05, | |
| "loss": 0.795, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.6008534850640114, | |
| "grad_norm": 2.2740390300750732, | |
| "learning_rate": 1.9392886311424975e-05, | |
| "loss": 0.7937, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.6038880986249408, | |
| "grad_norm": 2.1775126457214355, | |
| "learning_rate": 1.938068826896166e-05, | |
| "loss": 0.8019, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.6069227121858701, | |
| "grad_norm": 2.1858487129211426, | |
| "learning_rate": 1.9368372815693547e-05, | |
| "loss": 0.7962, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.6069227121858701, | |
| "eval_loss": null, | |
| "eval_runtime": 204.6421, | |
| "eval_samples_per_second": 20.318, | |
| "eval_steps_per_second": 5.082, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.6099573257467994, | |
| "grad_norm": 2.2958755493164062, | |
| "learning_rate": 1.9355940105763622e-05, | |
| "loss": 0.8003, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.6129919393077288, | |
| "grad_norm": 2.1330931186676025, | |
| "learning_rate": 1.934339029478248e-05, | |
| "loss": 0.7803, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.6160265528686582, | |
| "grad_norm": 2.3881402015686035, | |
| "learning_rate": 1.9330723539826373e-05, | |
| "loss": 0.8044, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.6190611664295875, | |
| "grad_norm": 2.2404513359069824, | |
| "learning_rate": 1.9317939999435262e-05, | |
| "loss": 0.8097, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.6220957799905168, | |
| "grad_norm": 2.194645404815674, | |
| "learning_rate": 1.930503983361081e-05, | |
| "loss": 0.8178, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.6251303935514462, | |
| "grad_norm": 2.509723424911499, | |
| "learning_rate": 1.92920232038144e-05, | |
| "loss": 0.7936, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.6281650071123756, | |
| "grad_norm": 2.2457869052886963, | |
| "learning_rate": 1.9278890272965097e-05, | |
| "loss": 0.8162, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.6311996206733049, | |
| "grad_norm": 2.2989683151245117, | |
| "learning_rate": 1.9265641205437612e-05, | |
| "loss": 0.8012, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.6342342342342342, | |
| "grad_norm": 2.3188092708587646, | |
| "learning_rate": 1.925227616706026e-05, | |
| "loss": 0.8094, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.6372688477951636, | |
| "grad_norm": 2.2978076934814453, | |
| "learning_rate": 1.9238795325112867e-05, | |
| "loss": 0.7811, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.640303461356093, | |
| "grad_norm": 2.331542730331421, | |
| "learning_rate": 1.9225198848324687e-05, | |
| "loss": 0.7946, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.6433380749170223, | |
| "grad_norm": 2.1989738941192627, | |
| "learning_rate": 1.921148690687228e-05, | |
| "loss": 0.7983, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.6463726884779516, | |
| "grad_norm": 2.4261419773101807, | |
| "learning_rate": 1.9197659672377388e-05, | |
| "loss": 0.8134, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.649407302038881, | |
| "grad_norm": 2.3790082931518555, | |
| "learning_rate": 1.918371731790479e-05, | |
| "loss": 0.79, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.6524419155998104, | |
| "grad_norm": 2.2966949939727783, | |
| "learning_rate": 1.9169660017960135e-05, | |
| "loss": 0.8029, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.6554765291607397, | |
| "grad_norm": 2.5911426544189453, | |
| "learning_rate": 1.915548794848775e-05, | |
| "loss": 0.8118, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.658511142721669, | |
| "grad_norm": 2.3282856941223145, | |
| "learning_rate": 1.9141201286868435e-05, | |
| "loss": 0.8092, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.6615457562825984, | |
| "grad_norm": 2.415398359298706, | |
| "learning_rate": 1.9126800211917277e-05, | |
| "loss": 0.8156, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.6645803698435278, | |
| "grad_norm": 2.2823410034179688, | |
| "learning_rate": 1.911228490388136e-05, | |
| "loss": 0.8004, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.6676149834044571, | |
| "grad_norm": 2.2104527950286865, | |
| "learning_rate": 1.9097655544437544e-05, | |
| "loss": 0.8023, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.6706495969653864, | |
| "grad_norm": 2.3354063034057617, | |
| "learning_rate": 1.908291231669019e-05, | |
| "loss": 0.8117, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.6736842105263158, | |
| "grad_norm": 2.230656147003174, | |
| "learning_rate": 1.906805540516885e-05, | |
| "loss": 0.7797, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.6767188240872452, | |
| "grad_norm": 2.464111328125, | |
| "learning_rate": 1.905308499582597e-05, | |
| "loss": 0.7929, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.6797534376481745, | |
| "grad_norm": 2.188788652420044, | |
| "learning_rate": 1.903800127603456e-05, | |
| "loss": 0.7971, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.6827880512091038, | |
| "grad_norm": 2.202427387237549, | |
| "learning_rate": 1.9022804434585854e-05, | |
| "loss": 0.8026, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.6858226647700332, | |
| "grad_norm": 2.2621190547943115, | |
| "learning_rate": 1.9007494661686937e-05, | |
| "loss": 0.8112, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.6888572783309626, | |
| "grad_norm": 2.333603620529175, | |
| "learning_rate": 1.8992072148958368e-05, | |
| "loss": 0.7937, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.6918918918918919, | |
| "grad_norm": 2.626451253890991, | |
| "learning_rate": 1.8976537089431793e-05, | |
| "loss": 0.8005, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.6949265054528212, | |
| "grad_norm": 2.4107227325439453, | |
| "learning_rate": 1.8960889677547506e-05, | |
| "loss": 0.7813, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.6979611190137506, | |
| "grad_norm": 2.7147607803344727, | |
| "learning_rate": 1.8945130109152035e-05, | |
| "loss": 0.8036, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.70099573257468, | |
| "grad_norm": 2.281703233718872, | |
| "learning_rate": 1.8929258581495688e-05, | |
| "loss": 0.7946, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.7040303461356093, | |
| "grad_norm": 3.016942262649536, | |
| "learning_rate": 1.891327529323007e-05, | |
| "loss": 0.7786, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.7070649596965386, | |
| "grad_norm": 2.5729317665100098, | |
| "learning_rate": 1.8897180444405615e-05, | |
| "loss": 0.8141, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.710099573257468, | |
| "grad_norm": 2.75722336769104, | |
| "learning_rate": 1.888097423646907e-05, | |
| "loss": 0.8079, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.7131341868183974, | |
| "grad_norm": 2.843980073928833, | |
| "learning_rate": 1.8864656872260985e-05, | |
| "loss": 0.795, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.7161688003793267, | |
| "grad_norm": 3.3999879360198975, | |
| "learning_rate": 1.884822855601316e-05, | |
| "loss": 0.8086, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.719203413940256, | |
| "grad_norm": 3.1997487545013428, | |
| "learning_rate": 1.8831689493346095e-05, | |
| "loss": 0.7919, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.7222380275011854, | |
| "grad_norm": 2.4851367473602295, | |
| "learning_rate": 1.881503989126642e-05, | |
| "loss": 0.7914, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.7252726410621148, | |
| "grad_norm": 2.738428831100464, | |
| "learning_rate": 1.8798279958164295e-05, | |
| "loss": 0.7929, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.7283072546230441, | |
| "grad_norm": 2.1467783451080322, | |
| "learning_rate": 1.8781409903810823e-05, | |
| "loss": 0.815, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.7313418681839734, | |
| "grad_norm": 16.773521423339844, | |
| "learning_rate": 1.8764429939355394e-05, | |
| "loss": 0.8014, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.7343764817449028, | |
| "grad_norm": 3.171633720397949, | |
| "learning_rate": 1.874734027732306e-05, | |
| "loss": 0.805, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.7374110953058322, | |
| "grad_norm": 2.249514102935791, | |
| "learning_rate": 1.8730141131611882e-05, | |
| "loss": 0.8109, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.7404457088667615, | |
| "grad_norm": 2.816488742828369, | |
| "learning_rate": 1.8712832717490238e-05, | |
| "loss": 0.8044, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.7434803224276908, | |
| "grad_norm": 2.558295488357544, | |
| "learning_rate": 1.8695415251594123e-05, | |
| "loss": 0.8269, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.7465149359886202, | |
| "grad_norm": 2.5777034759521484, | |
| "learning_rate": 1.8677888951924473e-05, | |
| "loss": 0.7971, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.7495495495495496, | |
| "grad_norm": 2.394287586212158, | |
| "learning_rate": 1.866025403784439e-05, | |
| "loss": 0.8092, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.7525841631104789, | |
| "grad_norm": 2.2798614501953125, | |
| "learning_rate": 1.864251073007642e-05, | |
| "loss": 0.7964, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.7556187766714082, | |
| "grad_norm": 2.3587262630462646, | |
| "learning_rate": 1.8624659250699807e-05, | |
| "loss": 0.7928, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.7586533902323376, | |
| "grad_norm": 2.189763307571411, | |
| "learning_rate": 1.8606699823147675e-05, | |
| "loss": 0.7812, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.7586533902323376, | |
| "eval_loss": null, | |
| "eval_runtime": 204.0596, | |
| "eval_samples_per_second": 20.376, | |
| "eval_steps_per_second": 5.097, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.761688003793267, | |
| "grad_norm": 2.433803081512451, | |
| "learning_rate": 1.8588632672204264e-05, | |
| "loss": 0.8111, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.7647226173541963, | |
| "grad_norm": 2.28267765045166, | |
| "learning_rate": 1.8570458024002094e-05, | |
| "loss": 0.8001, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.7677572309151256, | |
| "grad_norm": 2.3339545726776123, | |
| "learning_rate": 1.8552176106019156e-05, | |
| "loss": 0.8158, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.770791844476055, | |
| "grad_norm": 2.284759998321533, | |
| "learning_rate": 1.8533787147076046e-05, | |
| "loss": 0.7852, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.7738264580369844, | |
| "grad_norm": 2.35969614982605, | |
| "learning_rate": 1.8515291377333114e-05, | |
| "loss": 0.7909, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.7768610715979137, | |
| "grad_norm": 2.4525341987609863, | |
| "learning_rate": 1.8496689028287572e-05, | |
| "loss": 0.817, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.779895685158843, | |
| "grad_norm": 2.7126755714416504, | |
| "learning_rate": 1.847798033277061e-05, | |
| "loss": 0.797, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.7829302987197724, | |
| "grad_norm": 2.2292561531066895, | |
| "learning_rate": 1.8459165524944463e-05, | |
| "loss": 0.8044, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.7859649122807018, | |
| "grad_norm": 2.447347640991211, | |
| "learning_rate": 1.8440244840299507e-05, | |
| "loss": 0.7979, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.7889995258416311, | |
| "grad_norm": 2.2336087226867676, | |
| "learning_rate": 1.842121851565128e-05, | |
| "loss": 0.8036, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.7920341394025604, | |
| "grad_norm": 2.3720502853393555, | |
| "learning_rate": 1.8402086789137547e-05, | |
| "loss": 0.7979, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.7950687529634898, | |
| "grad_norm": 2.1780807971954346, | |
| "learning_rate": 1.8382849900215297e-05, | |
| "loss": 0.7876, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.7981033665244192, | |
| "grad_norm": 2.3325858116149902, | |
| "learning_rate": 1.8363508089657763e-05, | |
| "loss": 0.7997, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.8011379800853485, | |
| "grad_norm": 2.3341164588928223, | |
| "learning_rate": 1.8344061599551397e-05, | |
| "loss": 0.7844, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.8041725936462779, | |
| "grad_norm": 2.4678280353546143, | |
| "learning_rate": 1.8324510673292844e-05, | |
| "loss": 0.7946, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.8072072072072072, | |
| "grad_norm": 2.424893856048584, | |
| "learning_rate": 1.8304855555585893e-05, | |
| "loss": 0.7916, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.8102418207681366, | |
| "grad_norm": 2.5433976650238037, | |
| "learning_rate": 1.8285096492438424e-05, | |
| "loss": 0.7983, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.8132764343290659, | |
| "grad_norm": 2.499178647994995, | |
| "learning_rate": 1.826523373115931e-05, | |
| "loss": 0.7944, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.8163110478899953, | |
| "grad_norm": 2.2453994750976562, | |
| "learning_rate": 1.8245267520355348e-05, | |
| "loss": 0.8148, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.8193456614509246, | |
| "grad_norm": 2.350146770477295, | |
| "learning_rate": 1.8225198109928116e-05, | |
| "loss": 0.7986, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.822380275011854, | |
| "grad_norm": 2.2048559188842773, | |
| "learning_rate": 1.8205025751070878e-05, | |
| "loss": 0.8093, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.8254148885727833, | |
| "grad_norm": 2.2730185985565186, | |
| "learning_rate": 1.8184750696265408e-05, | |
| "loss": 0.7787, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.8284495021337127, | |
| "grad_norm": 2.423301935195923, | |
| "learning_rate": 1.8164373199278858e-05, | |
| "loss": 0.823, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.831484115694642, | |
| "grad_norm": 2.309649705886841, | |
| "learning_rate": 1.8143893515160565e-05, | |
| "loss": 0.7901, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.8345187292555714, | |
| "grad_norm": 2.249284267425537, | |
| "learning_rate": 1.812331190023886e-05, | |
| "loss": 0.8095, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.8375533428165007, | |
| "grad_norm": 2.2063703536987305, | |
| "learning_rate": 1.8102628612117868e-05, | |
| "loss": 0.8008, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.8405879563774301, | |
| "grad_norm": 2.2518839836120605, | |
| "learning_rate": 1.8081843909674277e-05, | |
| "loss": 0.8051, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.8436225699383594, | |
| "grad_norm": 2.226356267929077, | |
| "learning_rate": 1.8060958053054095e-05, | |
| "loss": 0.8036, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.8466571834992888, | |
| "grad_norm": 2.186485767364502, | |
| "learning_rate": 1.8039971303669407e-05, | |
| "loss": 0.8025, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.8496917970602181, | |
| "grad_norm": 2.4235646724700928, | |
| "learning_rate": 1.8018883924195085e-05, | |
| "loss": 0.7799, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.8527264106211475, | |
| "grad_norm": 2.2470943927764893, | |
| "learning_rate": 1.799769617856552e-05, | |
| "loss": 0.8025, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.8557610241820768, | |
| "grad_norm": 2.2514889240264893, | |
| "learning_rate": 1.79764083319713e-05, | |
| "loss": 0.7978, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.8587956377430062, | |
| "grad_norm": 2.220952033996582, | |
| "learning_rate": 1.79550206508559e-05, | |
| "loss": 0.7978, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.8618302513039355, | |
| "grad_norm": 2.4699270725250244, | |
| "learning_rate": 1.7933533402912354e-05, | |
| "loss": 0.7835, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.8648648648648649, | |
| "grad_norm": 2.3011207580566406, | |
| "learning_rate": 1.7911946857079886e-05, | |
| "loss": 0.8008, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.8678994784257942, | |
| "grad_norm": 2.2239327430725098, | |
| "learning_rate": 1.7890261283540563e-05, | |
| "loss": 0.7904, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.8709340919867236, | |
| "grad_norm": 2.077845573425293, | |
| "learning_rate": 1.78684769537159e-05, | |
| "loss": 0.7962, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.8739687055476529, | |
| "grad_norm": 2.2492687702178955, | |
| "learning_rate": 1.7846594140263475e-05, | |
| "loss": 0.8076, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.8770033191085823, | |
| "grad_norm": 2.1000773906707764, | |
| "learning_rate": 1.78246131170735e-05, | |
| "loss": 0.8107, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.8800379326695116, | |
| "grad_norm": 2.2440242767333984, | |
| "learning_rate": 1.7802534159265407e-05, | |
| "loss": 0.784, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.883072546230441, | |
| "grad_norm": 2.1370065212249756, | |
| "learning_rate": 1.7780357543184396e-05, | |
| "loss": 0.7926, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.8861071597913703, | |
| "grad_norm": 2.2182703018188477, | |
| "learning_rate": 1.775808354639799e-05, | |
| "loss": 0.8079, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.8891417733522997, | |
| "grad_norm": 2.236370325088501, | |
| "learning_rate": 1.773571244769254e-05, | |
| "loss": 0.7851, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.892176386913229, | |
| "grad_norm": 2.216042995452881, | |
| "learning_rate": 1.771324452706975e-05, | |
| "loss": 0.798, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.8952110004741584, | |
| "grad_norm": 2.2804715633392334, | |
| "learning_rate": 1.769068006574317e-05, | |
| "loss": 0.7916, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.8982456140350877, | |
| "grad_norm": 2.188271999359131, | |
| "learning_rate": 1.7668019346134674e-05, | |
| "loss": 0.7993, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.9012802275960171, | |
| "grad_norm": 2.372596025466919, | |
| "learning_rate": 1.7645262651870926e-05, | |
| "loss": 0.816, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.9043148411569464, | |
| "grad_norm": 2.174302339553833, | |
| "learning_rate": 1.7622410267779834e-05, | |
| "loss": 0.8247, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.9073494547178758, | |
| "grad_norm": 2.0944302082061768, | |
| "learning_rate": 1.7599462479886976e-05, | |
| "loss": 0.7979, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.9103840682788051, | |
| "grad_norm": 2.118502140045166, | |
| "learning_rate": 1.7576419575412028e-05, | |
| "loss": 0.8007, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.9103840682788051, | |
| "eval_loss": null, | |
| "eval_runtime": 203.6703, | |
| "eval_samples_per_second": 20.415, | |
| "eval_steps_per_second": 5.106, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.9134186818397345, | |
| "grad_norm": 2.2053334712982178, | |
| "learning_rate": 1.755328184276517e-05, | |
| "loss": 0.7913, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.9164532954006638, | |
| "grad_norm": 2.243788957595825, | |
| "learning_rate": 1.7530049571543464e-05, | |
| "loss": 0.801, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.9194879089615932, | |
| "grad_norm": 2.363306760787964, | |
| "learning_rate": 1.7506723052527243e-05, | |
| "loss": 0.8278, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.9225225225225225, | |
| "grad_norm": 2.2566967010498047, | |
| "learning_rate": 1.7483302577676475e-05, | |
| "loss": 0.7929, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.9255571360834519, | |
| "grad_norm": 2.1566100120544434, | |
| "learning_rate": 1.7459788440127083e-05, | |
| "loss": 0.7953, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.9285917496443812, | |
| "grad_norm": 2.279130697250366, | |
| "learning_rate": 1.7436180934187307e-05, | |
| "loss": 0.8125, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.9316263632053106, | |
| "grad_norm": 2.172891616821289, | |
| "learning_rate": 1.7412480355334006e-05, | |
| "loss": 0.8007, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.9346609767662399, | |
| "grad_norm": 2.2873098850250244, | |
| "learning_rate": 1.738868700020895e-05, | |
| "loss": 0.779, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.9376955903271693, | |
| "grad_norm": 2.3155357837677, | |
| "learning_rate": 1.7364801166615124e-05, | |
| "loss": 0.8025, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.9407302038880986, | |
| "grad_norm": 2.20151686668396, | |
| "learning_rate": 1.7340823153513003e-05, | |
| "loss": 0.8173, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.943764817449028, | |
| "grad_norm": 2.2033958435058594, | |
| "learning_rate": 1.7316753261016782e-05, | |
| "loss": 0.8095, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.9467994310099573, | |
| "grad_norm": 2.2805070877075195, | |
| "learning_rate": 1.7292591790390668e-05, | |
| "loss": 0.8139, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.9498340445708867, | |
| "grad_norm": 2.335238456726074, | |
| "learning_rate": 1.7268339044045044e-05, | |
| "loss": 0.7898, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.952868658131816, | |
| "grad_norm": 2.1889333724975586, | |
| "learning_rate": 1.7243995325532755e-05, | |
| "loss": 0.8002, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.9559032716927454, | |
| "grad_norm": 2.5007987022399902, | |
| "learning_rate": 1.7219560939545246e-05, | |
| "loss": 0.7892, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.9589378852536747, | |
| "grad_norm": 2.1342813968658447, | |
| "learning_rate": 1.7195036191908798e-05, | |
| "loss": 0.8028, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.9619724988146041, | |
| "grad_norm": 2.199307918548584, | |
| "learning_rate": 1.7170421389580666e-05, | |
| "loss": 0.793, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.9650071123755334, | |
| "grad_norm": 2.1955904960632324, | |
| "learning_rate": 1.7145716840645253e-05, | |
| "loss": 0.8085, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.9680417259364628, | |
| "grad_norm": 2.41741943359375, | |
| "learning_rate": 1.712092285431026e-05, | |
| "loss": 0.7964, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.9710763394973921, | |
| "grad_norm": 2.3595402240753174, | |
| "learning_rate": 1.7096039740902782e-05, | |
| "loss": 0.7999, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.9741109530583215, | |
| "grad_norm": 2.15049147605896, | |
| "learning_rate": 1.7071067811865477e-05, | |
| "loss": 0.7813, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.9771455666192508, | |
| "grad_norm": 2.240618944168091, | |
| "learning_rate": 1.7046007379752624e-05, | |
| "loss": 0.8038, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.9801801801801802, | |
| "grad_norm": 2.418973922729492, | |
| "learning_rate": 1.702085875822623e-05, | |
| "loss": 0.7672, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.9832147937411095, | |
| "grad_norm": 2.2924294471740723, | |
| "learning_rate": 1.6995622262052093e-05, | |
| "loss": 0.8013, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.9862494073020389, | |
| "grad_norm": 2.2135136127471924, | |
| "learning_rate": 1.6970298207095887e-05, | |
| "loss": 0.8112, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.9892840208629682, | |
| "grad_norm": 2.2720751762390137, | |
| "learning_rate": 1.6944886910319173e-05, | |
| "loss": 0.7896, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.9923186344238976, | |
| "grad_norm": 2.172165632247925, | |
| "learning_rate": 1.6919388689775463e-05, | |
| "loss": 0.8084, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.9953532479848269, | |
| "grad_norm": 2.1608591079711914, | |
| "learning_rate": 1.6893803864606224e-05, | |
| "loss": 0.7914, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.9983878615457563, | |
| "grad_norm": 2.153231143951416, | |
| "learning_rate": 1.6868132755036875e-05, | |
| "loss": 0.803, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 1.0014224751066856, | |
| "grad_norm": 2.242035150527954, | |
| "learning_rate": 1.6842375682372803e-05, | |
| "loss": 0.7964, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.004457088667615, | |
| "grad_norm": 2.490118980407715, | |
| "learning_rate": 1.681653296899533e-05, | |
| "loss": 0.7645, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 1.0074917022285443, | |
| "grad_norm": 2.258561372756958, | |
| "learning_rate": 1.6790604938357664e-05, | |
| "loss": 0.7579, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 1.0105263157894737, | |
| "grad_norm": 2.3750267028808594, | |
| "learning_rate": 1.676459191498087e-05, | |
| "loss": 0.7849, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 1.013560929350403, | |
| "grad_norm": 2.5088164806365967, | |
| "learning_rate": 1.6738494224449802e-05, | |
| "loss": 0.7814, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 1.0165955429113325, | |
| "grad_norm": 2.4220824241638184, | |
| "learning_rate": 1.6712312193409032e-05, | |
| "loss": 0.7774, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 1.0196301564722616, | |
| "grad_norm": 2.2030773162841797, | |
| "learning_rate": 1.6686046149558736e-05, | |
| "loss": 0.7495, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 1.022664770033191, | |
| "grad_norm": 2.336583375930786, | |
| "learning_rate": 1.6659696421650645e-05, | |
| "loss": 0.7508, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 1.0256993835941204, | |
| "grad_norm": 2.3112359046936035, | |
| "learning_rate": 1.6633263339483867e-05, | |
| "loss": 0.7525, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 1.0287339971550498, | |
| "grad_norm": 2.165022611618042, | |
| "learning_rate": 1.6606747233900816e-05, | |
| "loss": 0.787, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 1.0317686107159791, | |
| "grad_norm": 2.3545596599578857, | |
| "learning_rate": 1.658014843678303e-05, | |
| "loss": 0.7665, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.0348032242769085, | |
| "grad_norm": 2.231351137161255, | |
| "learning_rate": 1.655346728104704e-05, | |
| "loss": 0.7726, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 1.037837837837838, | |
| "grad_norm": 2.292428970336914, | |
| "learning_rate": 1.652670410064019e-05, | |
| "loss": 0.7722, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 1.0408724513987673, | |
| "grad_norm": 2.2555713653564453, | |
| "learning_rate": 1.6499859230536468e-05, | |
| "loss": 0.755, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 1.0439070649596967, | |
| "grad_norm": 2.5129449367523193, | |
| "learning_rate": 1.647293300673231e-05, | |
| "loss": 0.7736, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 1.0469416785206258, | |
| "grad_norm": 2.269122362136841, | |
| "learning_rate": 1.6445925766242392e-05, | |
| "loss": 0.7898, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 1.0499762920815552, | |
| "grad_norm": 2.21991229057312, | |
| "learning_rate": 1.641883784709541e-05, | |
| "loss": 0.7767, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 1.0530109056424846, | |
| "grad_norm": 2.324253797531128, | |
| "learning_rate": 1.639166958832985e-05, | |
| "loss": 0.7728, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 1.056045519203414, | |
| "grad_norm": 2.3205628395080566, | |
| "learning_rate": 1.6364421329989758e-05, | |
| "loss": 0.7845, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 1.0590801327643433, | |
| "grad_norm": 2.361678123474121, | |
| "learning_rate": 1.6337093413120463e-05, | |
| "loss": 0.7455, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 1.0621147463252727, | |
| "grad_norm": 2.3375606536865234, | |
| "learning_rate": 1.6309686179764317e-05, | |
| "loss": 0.7754, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.0621147463252727, | |
| "eval_loss": null, | |
| "eval_runtime": 203.6306, | |
| "eval_samples_per_second": 20.419, | |
| "eval_steps_per_second": 5.107, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.065149359886202, | |
| "grad_norm": 2.3522422313690186, | |
| "learning_rate": 1.6282199972956425e-05, | |
| "loss": 0.7759, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 1.0681839734471312, | |
| "grad_norm": 2.4227213859558105, | |
| "learning_rate": 1.6254635136720328e-05, | |
| "loss": 0.7772, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 1.0712185870080606, | |
| "grad_norm": 2.235722303390503, | |
| "learning_rate": 1.6226992016063726e-05, | |
| "loss": 0.7694, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 1.07425320056899, | |
| "grad_norm": 2.2417314052581787, | |
| "learning_rate": 1.6199270956974128e-05, | |
| "loss": 0.7628, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 1.0772878141299194, | |
| "grad_norm": 2.348954200744629, | |
| "learning_rate": 1.6171472306414554e-05, | |
| "loss": 0.7656, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 1.0803224276908487, | |
| "grad_norm": 2.346963882446289, | |
| "learning_rate": 1.614359641231916e-05, | |
| "loss": 0.7839, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 1.0833570412517781, | |
| "grad_norm": 2.2969138622283936, | |
| "learning_rate": 1.6115643623588915e-05, | |
| "loss": 0.7728, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 1.0863916548127075, | |
| "grad_norm": 2.3338327407836914, | |
| "learning_rate": 1.608761429008721e-05, | |
| "loss": 0.7902, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 1.0894262683736369, | |
| "grad_norm": 2.2462401390075684, | |
| "learning_rate": 1.6059508762635482e-05, | |
| "loss": 0.7906, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 1.0924608819345663, | |
| "grad_norm": 2.2091758251190186, | |
| "learning_rate": 1.6031327393008848e-05, | |
| "loss": 0.7587, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.0954954954954954, | |
| "grad_norm": 2.2392489910125732, | |
| "learning_rate": 1.6003070533931657e-05, | |
| "loss": 0.7598, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 1.0985301090564248, | |
| "grad_norm": 2.193833112716675, | |
| "learning_rate": 1.5974738539073125e-05, | |
| "loss": 0.7622, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 1.1015647226173542, | |
| "grad_norm": 2.187610149383545, | |
| "learning_rate": 1.594633176304287e-05, | |
| "loss": 0.7796, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 1.1045993361782835, | |
| "grad_norm": 2.2753069400787354, | |
| "learning_rate": 1.5917850561386487e-05, | |
| "loss": 0.7783, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 1.107633949739213, | |
| "grad_norm": 2.2835614681243896, | |
| "learning_rate": 1.588929529058111e-05, | |
| "loss": 0.7801, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 1.1106685633001423, | |
| "grad_norm": 2.46498441696167, | |
| "learning_rate": 1.5860666308030933e-05, | |
| "loss": 0.7683, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 1.1137031768610717, | |
| "grad_norm": 2.361351490020752, | |
| "learning_rate": 1.5831963972062734e-05, | |
| "loss": 0.783, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 1.1167377904220008, | |
| "grad_norm": 2.2396347522735596, | |
| "learning_rate": 1.5803188641921417e-05, | |
| "loss": 0.7563, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 1.1197724039829302, | |
| "grad_norm": 2.2810609340667725, | |
| "learning_rate": 1.5774340677765483e-05, | |
| "loss": 0.7865, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 1.1228070175438596, | |
| "grad_norm": 2.147937297821045, | |
| "learning_rate": 1.5745420440662543e-05, | |
| "loss": 0.7684, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.125841631104789, | |
| "grad_norm": 2.355337142944336, | |
| "learning_rate": 1.5716428292584788e-05, | |
| "loss": 0.7693, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 1.1288762446657183, | |
| "grad_norm": 2.4234957695007324, | |
| "learning_rate": 1.568736459640447e-05, | |
| "loss": 0.7567, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 1.1319108582266477, | |
| "grad_norm": 2.2321126461029053, | |
| "learning_rate": 1.5658229715889345e-05, | |
| "loss": 0.7984, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 1.134945471787577, | |
| "grad_norm": 2.2270772457122803, | |
| "learning_rate": 1.5629024015698137e-05, | |
| "loss": 0.7868, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 1.1379800853485065, | |
| "grad_norm": 2.4906022548675537, | |
| "learning_rate": 1.5599747861375957e-05, | |
| "loss": 0.7761, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.1410146989094359, | |
| "grad_norm": 2.4099533557891846, | |
| "learning_rate": 1.5570401619349737e-05, | |
| "loss": 0.7727, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 1.144049312470365, | |
| "grad_norm": 2.167451858520508, | |
| "learning_rate": 1.5540985656923648e-05, | |
| "loss": 0.7449, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 1.1470839260312944, | |
| "grad_norm": 2.4137990474700928, | |
| "learning_rate": 1.551150034227449e-05, | |
| "loss": 0.7836, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 1.1501185395922238, | |
| "grad_norm": 2.0170676708221436, | |
| "learning_rate": 1.54819460444471e-05, | |
| "loss": 0.771, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 1.1531531531531531, | |
| "grad_norm": 2.3556909561157227, | |
| "learning_rate": 1.5452323133349712e-05, | |
| "loss": 0.769, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.1561877667140825, | |
| "grad_norm": 2.1490001678466797, | |
| "learning_rate": 1.5422631979749354e-05, | |
| "loss": 0.7467, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 1.159222380275012, | |
| "grad_norm": 2.240185260772705, | |
| "learning_rate": 1.5392872955267176e-05, | |
| "loss": 0.7677, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 1.1622569938359413, | |
| "grad_norm": 2.1402430534362793, | |
| "learning_rate": 1.5363046432373824e-05, | |
| "loss": 0.7706, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 1.1652916073968704, | |
| "grad_norm": 2.043536424636841, | |
| "learning_rate": 1.5333152784384777e-05, | |
| "loss": 0.7679, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 1.1683262209577998, | |
| "grad_norm": 2.146432399749756, | |
| "learning_rate": 1.5303192385455652e-05, | |
| "loss": 0.7746, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 1.1713608345187292, | |
| "grad_norm": 2.216525077819824, | |
| "learning_rate": 1.5273165610577543e-05, | |
| "loss": 0.7735, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 1.1743954480796586, | |
| "grad_norm": 2.134411334991455, | |
| "learning_rate": 1.5243072835572319e-05, | |
| "loss": 0.771, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 1.177430061640588, | |
| "grad_norm": 2.210275173187256, | |
| "learning_rate": 1.5212914437087921e-05, | |
| "loss": 0.7665, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 1.1804646752015173, | |
| "grad_norm": 2.2126176357269287, | |
| "learning_rate": 1.5182690792593659e-05, | |
| "loss": 0.7658, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 1.1834992887624467, | |
| "grad_norm": 1.9885146617889404, | |
| "learning_rate": 1.5152402280375454e-05, | |
| "loss": 0.7509, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.186533902323376, | |
| "grad_norm": 2.2027952671051025, | |
| "learning_rate": 1.5122049279531143e-05, | |
| "loss": 0.7811, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 1.1895685158843055, | |
| "grad_norm": 2.1936960220336914, | |
| "learning_rate": 1.509163216996572e-05, | |
| "loss": 0.7785, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 1.1926031294452346, | |
| "grad_norm": 2.166518449783325, | |
| "learning_rate": 1.5061151332386565e-05, | |
| "loss": 0.7775, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 1.195637743006164, | |
| "grad_norm": 2.0909955501556396, | |
| "learning_rate": 1.5030607148298697e-05, | |
| "loss": 0.7783, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 1.1986723565670934, | |
| "grad_norm": 2.287322521209717, | |
| "learning_rate": 1.5000000000000002e-05, | |
| "loss": 0.7678, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 1.2017069701280227, | |
| "grad_norm": 2.265106678009033, | |
| "learning_rate": 1.4969330270576428e-05, | |
| "loss": 0.7772, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 1.2047415836889521, | |
| "grad_norm": 2.2475264072418213, | |
| "learning_rate": 1.4938598343897215e-05, | |
| "loss": 0.7509, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 1.2077761972498815, | |
| "grad_norm": 2.3240981101989746, | |
| "learning_rate": 1.4907804604610064e-05, | |
| "loss": 0.7849, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 1.2108108108108109, | |
| "grad_norm": 2.148869276046753, | |
| "learning_rate": 1.4876949438136348e-05, | |
| "loss": 0.7781, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 1.2138454243717403, | |
| "grad_norm": 2.318875789642334, | |
| "learning_rate": 1.484603323066627e-05, | |
| "loss": 0.7987, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.2138454243717403, | |
| "eval_loss": null, | |
| "eval_runtime": 204.8025, | |
| "eval_samples_per_second": 20.302, | |
| "eval_steps_per_second": 5.078, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.2168800379326696, | |
| "grad_norm": 2.1469035148620605, | |
| "learning_rate": 1.4815056369154039e-05, | |
| "loss": 0.7924, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 1.2199146514935988, | |
| "grad_norm": 2.3983654975891113, | |
| "learning_rate": 1.4784019241313025e-05, | |
| "loss": 0.7431, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 1.2229492650545282, | |
| "grad_norm": 2.1171765327453613, | |
| "learning_rate": 1.47529222356109e-05, | |
| "loss": 0.7583, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 1.2259838786154575, | |
| "grad_norm": 2.3186557292938232, | |
| "learning_rate": 1.4721765741264786e-05, | |
| "loss": 0.7545, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 1.229018492176387, | |
| "grad_norm": 2.308945417404175, | |
| "learning_rate": 1.4690550148236371e-05, | |
| "loss": 0.7752, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 1.2320531057373163, | |
| "grad_norm": 2.141418933868408, | |
| "learning_rate": 1.4659275847227044e-05, | |
| "loss": 0.7501, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 1.2350877192982457, | |
| "grad_norm": 2.1447696685791016, | |
| "learning_rate": 1.4627943229672992e-05, | |
| "loss": 0.7446, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 1.238122332859175, | |
| "grad_norm": 2.062683582305908, | |
| "learning_rate": 1.4596552687740304e-05, | |
| "loss": 0.7729, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 1.2411569464201042, | |
| "grad_norm": 2.283247232437134, | |
| "learning_rate": 1.4565104614320065e-05, | |
| "loss": 0.7752, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 1.2441915599810336, | |
| "grad_norm": 2.299151659011841, | |
| "learning_rate": 1.453359940302344e-05, | |
| "loss": 0.7794, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.247226173541963, | |
| "grad_norm": 2.2340760231018066, | |
| "learning_rate": 1.4502037448176734e-05, | |
| "loss": 0.7811, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 1.2502607871028923, | |
| "grad_norm": 2.305233955383301, | |
| "learning_rate": 1.4470419144816483e-05, | |
| "loss": 0.7663, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 1.2532954006638217, | |
| "grad_norm": 2.1460888385772705, | |
| "learning_rate": 1.4438744888684481e-05, | |
| "loss": 0.7584, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 1.256330014224751, | |
| "grad_norm": 2.3122851848602295, | |
| "learning_rate": 1.4407015076222845e-05, | |
| "loss": 0.7817, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 1.2593646277856805, | |
| "grad_norm": 2.0583643913269043, | |
| "learning_rate": 1.4375230104569044e-05, | |
| "loss": 0.7695, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 1.2623992413466096, | |
| "grad_norm": 2.262274980545044, | |
| "learning_rate": 1.4343390371550936e-05, | |
| "loss": 0.7739, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 1.2654338549075392, | |
| "grad_norm": 2.151893377304077, | |
| "learning_rate": 1.4311496275681785e-05, | |
| "loss": 0.7789, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 1.2684684684684684, | |
| "grad_norm": 2.28367280960083, | |
| "learning_rate": 1.4279548216155265e-05, | |
| "loss": 0.775, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 1.2715030820293978, | |
| "grad_norm": 2.3059751987457275, | |
| "learning_rate": 1.424754659284048e-05, | |
| "loss": 0.7613, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 1.2745376955903271, | |
| "grad_norm": 2.436896800994873, | |
| "learning_rate": 1.4215491806276944e-05, | |
| "loss": 0.7835, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.2775723091512565, | |
| "grad_norm": 2.102220058441162, | |
| "learning_rate": 1.418338425766958e-05, | |
| "loss": 0.7932, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 1.280606922712186, | |
| "grad_norm": 2.1711723804473877, | |
| "learning_rate": 1.4151224348883692e-05, | |
| "loss": 0.7668, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 1.2836415362731153, | |
| "grad_norm": 2.213289737701416, | |
| "learning_rate": 1.4119012482439929e-05, | |
| "loss": 0.7745, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 1.2866761498340447, | |
| "grad_norm": 2.0960137844085693, | |
| "learning_rate": 1.408674906150926e-05, | |
| "loss": 0.7742, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 1.2897107633949738, | |
| "grad_norm": 2.081200122833252, | |
| "learning_rate": 1.4054434489907916e-05, | |
| "loss": 0.7652, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 1.2927453769559032, | |
| "grad_norm": 2.284423351287842, | |
| "learning_rate": 1.4022069172092354e-05, | |
| "loss": 0.7762, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 1.2957799905168326, | |
| "grad_norm": 2.292639970779419, | |
| "learning_rate": 1.3989653513154165e-05, | |
| "loss": 0.7644, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 1.298814604077762, | |
| "grad_norm": 2.282759428024292, | |
| "learning_rate": 1.3957187918815032e-05, | |
| "loss": 0.7658, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 1.3018492176386913, | |
| "grad_norm": 2.2170190811157227, | |
| "learning_rate": 1.3924672795421638e-05, | |
| "loss": 0.7661, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 1.3048838311996207, | |
| "grad_norm": 2.202991247177124, | |
| "learning_rate": 1.3892108549940583e-05, | |
| "loss": 0.7881, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.30791844476055, | |
| "grad_norm": 2.148986339569092, | |
| "learning_rate": 1.3859495589953289e-05, | |
| "loss": 0.7865, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 1.3109530583214792, | |
| "grad_norm": 2.1265058517456055, | |
| "learning_rate": 1.3826834323650899e-05, | |
| "loss": 0.789, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 1.3139876718824088, | |
| "grad_norm": 2.1798737049102783, | |
| "learning_rate": 1.3794125159829173e-05, | |
| "loss": 0.7707, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 1.317022285443338, | |
| "grad_norm": 2.2072978019714355, | |
| "learning_rate": 1.376136850788336e-05, | |
| "loss": 0.763, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 1.3200568990042674, | |
| "grad_norm": 2.12349534034729, | |
| "learning_rate": 1.3728564777803089e-05, | |
| "loss": 0.7505, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 1.3230915125651967, | |
| "grad_norm": 2.104276180267334, | |
| "learning_rate": 1.3695714380167221e-05, | |
| "loss": 0.7891, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 1.3261261261261261, | |
| "grad_norm": 2.038515090942383, | |
| "learning_rate": 1.3662817726138729e-05, | |
| "loss": 0.7668, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 1.3291607396870555, | |
| "grad_norm": 2.0557668209075928, | |
| "learning_rate": 1.3629875227459532e-05, | |
| "loss": 0.7685, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 1.3321953532479849, | |
| "grad_norm": 2.221299886703491, | |
| "learning_rate": 1.359688729644536e-05, | |
| "loss": 0.7765, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 1.3352299668089143, | |
| "grad_norm": 2.383873462677002, | |
| "learning_rate": 1.356385434598057e-05, | |
| "loss": 0.7863, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.3382645803698434, | |
| "grad_norm": 2.144969940185547, | |
| "learning_rate": 1.3530776789513009e-05, | |
| "loss": 0.7854, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 1.341299193930773, | |
| "grad_norm": 2.3431177139282227, | |
| "learning_rate": 1.3497655041048812e-05, | |
| "loss": 0.7491, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 1.3443338074917022, | |
| "grad_norm": 2.1558756828308105, | |
| "learning_rate": 1.3464489515147239e-05, | |
| "loss": 0.7935, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 1.3473684210526315, | |
| "grad_norm": 2.4032328128814697, | |
| "learning_rate": 1.3431280626915466e-05, | |
| "loss": 0.765, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 1.350403034613561, | |
| "grad_norm": 2.373549461364746, | |
| "learning_rate": 1.3398028792003413e-05, | |
| "loss": 0.766, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 1.3534376481744903, | |
| "grad_norm": 2.38440203666687, | |
| "learning_rate": 1.3364734426598527e-05, | |
| "loss": 0.7849, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 1.3564722617354197, | |
| "grad_norm": 2.319101333618164, | |
| "learning_rate": 1.3331397947420578e-05, | |
| "loss": 0.7738, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 1.359506875296349, | |
| "grad_norm": 2.1911866664886475, | |
| "learning_rate": 1.3298019771716435e-05, | |
| "loss": 0.7779, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 1.3625414888572784, | |
| "grad_norm": 2.273451328277588, | |
| "learning_rate": 1.3264600317254854e-05, | |
| "loss": 0.76, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 1.3655761024182076, | |
| "grad_norm": 2.2576355934143066, | |
| "learning_rate": 1.3231140002321252e-05, | |
| "loss": 0.7687, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.3655761024182076, | |
| "eval_loss": null, | |
| "eval_runtime": 203.9814, | |
| "eval_samples_per_second": 20.384, | |
| "eval_steps_per_second": 5.099, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.368610715979137, | |
| "grad_norm": 2.2846243381500244, | |
| "learning_rate": 1.3197639245712454e-05, | |
| "loss": 0.77, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 1.3716453295400663, | |
| "grad_norm": 2.1583635807037354, | |
| "learning_rate": 1.3164098466731467e-05, | |
| "loss": 0.7681, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 1.3746799431009957, | |
| "grad_norm": 2.126995325088501, | |
| "learning_rate": 1.3130518085182224e-05, | |
| "loss": 0.7755, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 1.377714556661925, | |
| "grad_norm": 2.2357747554779053, | |
| "learning_rate": 1.3096898521364338e-05, | |
| "loss": 0.7509, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 1.3807491702228545, | |
| "grad_norm": 2.0657386779785156, | |
| "learning_rate": 1.3063240196067837e-05, | |
| "loss": 0.8043, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 1.3837837837837839, | |
| "grad_norm": 2.1653802394866943, | |
| "learning_rate": 1.3029543530567884e-05, | |
| "loss": 0.7676, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 1.386818397344713, | |
| "grad_norm": 2.161508798599243, | |
| "learning_rate": 1.2995808946619533e-05, | |
| "loss": 0.7735, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 1.3898530109056426, | |
| "grad_norm": 2.185350179672241, | |
| "learning_rate": 1.2962036866452423e-05, | |
| "loss": 0.7891, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 1.3928876244665718, | |
| "grad_norm": 2.198807954788208, | |
| "learning_rate": 1.2928227712765504e-05, | |
| "loss": 0.7657, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 1.3959222380275011, | |
| "grad_norm": 2.2574117183685303, | |
| "learning_rate": 1.2894381908721757e-05, | |
| "loss": 0.7893, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.3989568515884305, | |
| "grad_norm": 2.2577221393585205, | |
| "learning_rate": 1.2860499877942876e-05, | |
| "loss": 0.7787, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 1.40199146514936, | |
| "grad_norm": 2.265421152114868, | |
| "learning_rate": 1.282658204450398e-05, | |
| "loss": 0.7714, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 1.4050260787102893, | |
| "grad_norm": 2.0731489658355713, | |
| "learning_rate": 1.2792628832928302e-05, | |
| "loss": 0.782, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 1.4080606922712187, | |
| "grad_norm": 2.130533218383789, | |
| "learning_rate": 1.275864066818188e-05, | |
| "loss": 0.7873, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 1.411095305832148, | |
| "grad_norm": 2.0858917236328125, | |
| "learning_rate": 1.2724617975668229e-05, | |
| "loss": 0.8005, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 1.4141299193930772, | |
| "grad_norm": 2.0890703201293945, | |
| "learning_rate": 1.2690561181223024e-05, | |
| "loss": 0.775, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 1.4171645329540066, | |
| "grad_norm": 2.3934412002563477, | |
| "learning_rate": 1.2656470711108763e-05, | |
| "loss": 0.7931, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 1.420199146514936, | |
| "grad_norm": 2.2045512199401855, | |
| "learning_rate": 1.2622346992009447e-05, | |
| "loss": 0.7747, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 1.4232337600758653, | |
| "grad_norm": 2.371346950531006, | |
| "learning_rate": 1.2588190451025209e-05, | |
| "loss": 0.7581, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 1.4262683736367947, | |
| "grad_norm": 2.1523914337158203, | |
| "learning_rate": 1.2554001515667009e-05, | |
| "loss": 0.7741, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.429302987197724, | |
| "grad_norm": 2.2948853969573975, | |
| "learning_rate": 1.2519780613851254e-05, | |
| "loss": 0.7925, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 1.4323376007586535, | |
| "grad_norm": 2.2751922607421875, | |
| "learning_rate": 1.2485528173894447e-05, | |
| "loss": 0.7784, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 1.4353722143195826, | |
| "grad_norm": 2.1596484184265137, | |
| "learning_rate": 1.2451244624507831e-05, | |
| "loss": 0.7895, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 1.4384068278805122, | |
| "grad_norm": 2.2292542457580566, | |
| "learning_rate": 1.2416930394792026e-05, | |
| "loss": 0.7698, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 1.4414414414414414, | |
| "grad_norm": 2.1559245586395264, | |
| "learning_rate": 1.238258591423165e-05, | |
| "loss": 0.776, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 1.4444760550023708, | |
| "grad_norm": 2.072768211364746, | |
| "learning_rate": 1.234821161268995e-05, | |
| "loss": 0.7665, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 1.4475106685633001, | |
| "grad_norm": 2.3749032020568848, | |
| "learning_rate": 1.2313807920403419e-05, | |
| "loss": 0.7765, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 1.4505452821242295, | |
| "grad_norm": 2.1614534854888916, | |
| "learning_rate": 1.22793752679764e-05, | |
| "loss": 0.7908, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 1.4535798956851589, | |
| "grad_norm": 2.2183918952941895, | |
| "learning_rate": 1.2244914086375726e-05, | |
| "loss": 0.7662, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 1.4566145092460883, | |
| "grad_norm": 2.230243444442749, | |
| "learning_rate": 1.22104248069253e-05, | |
| "loss": 0.7758, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.4596491228070176, | |
| "grad_norm": 2.503591775894165, | |
| "learning_rate": 1.2175907861300698e-05, | |
| "loss": 0.7739, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 1.4626837363679468, | |
| "grad_norm": 2.4481165409088135, | |
| "learning_rate": 1.2141363681523777e-05, | |
| "loss": 0.788, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 1.4657183499288762, | |
| "grad_norm": 2.30068302154541, | |
| "learning_rate": 1.2106792699957264e-05, | |
| "loss": 0.7905, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 1.4687529634898056, | |
| "grad_norm": 2.2382349967956543, | |
| "learning_rate": 1.2072195349299344e-05, | |
| "loss": 0.7617, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 1.471787577050735, | |
| "grad_norm": 2.2054882049560547, | |
| "learning_rate": 1.2037572062578238e-05, | |
| "loss": 0.7802, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 1.4748221906116643, | |
| "grad_norm": 2.282318115234375, | |
| "learning_rate": 1.2002923273146793e-05, | |
| "loss": 0.7711, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 1.4778568041725937, | |
| "grad_norm": 2.1077611446380615, | |
| "learning_rate": 1.1968249414677055e-05, | |
| "loss": 0.761, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 1.480891417733523, | |
| "grad_norm": 2.1558871269226074, | |
| "learning_rate": 1.1933550921154836e-05, | |
| "loss": 0.7872, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 1.4839260312944522, | |
| "grad_norm": 2.332897901535034, | |
| "learning_rate": 1.1898828226874284e-05, | |
| "loss": 0.7791, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 1.4869606448553818, | |
| "grad_norm": 2.0427420139312744, | |
| "learning_rate": 1.1864081766432457e-05, | |
| "loss": 0.782, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.489995258416311, | |
| "grad_norm": 2.2689926624298096, | |
| "learning_rate": 1.1829311974723868e-05, | |
| "loss": 0.7659, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 1.4930298719772404, | |
| "grad_norm": 2.2432594299316406, | |
| "learning_rate": 1.1794519286935056e-05, | |
| "loss": 0.7543, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 1.4960644855381697, | |
| "grad_norm": 2.0946452617645264, | |
| "learning_rate": 1.1759704138539121e-05, | |
| "loss": 0.7712, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 1.499099099099099, | |
| "grad_norm": 2.1154541969299316, | |
| "learning_rate": 1.1724866965290302e-05, | |
| "loss": 0.7732, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 1.5021337126600285, | |
| "grad_norm": 2.195223569869995, | |
| "learning_rate": 1.1690008203218493e-05, | |
| "loss": 0.7596, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 1.5051683262209576, | |
| "grad_norm": 2.105151653289795, | |
| "learning_rate": 1.1655128288623803e-05, | |
| "loss": 0.775, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 1.5082029397818872, | |
| "grad_norm": 2.196159601211548, | |
| "learning_rate": 1.1620227658071088e-05, | |
| "loss": 0.7893, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 1.5112375533428164, | |
| "grad_norm": 2.3409769535064697, | |
| "learning_rate": 1.158530674838449e-05, | |
| "loss": 0.7494, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 1.514272166903746, | |
| "grad_norm": 2.177128553390503, | |
| "learning_rate": 1.155036599664198e-05, | |
| "loss": 0.7595, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 1.5173067804646752, | |
| "grad_norm": 2.3704683780670166, | |
| "learning_rate": 1.1515405840169861e-05, | |
| "loss": 0.7607, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.5173067804646752, | |
| "eval_loss": null, | |
| "eval_runtime": 204.5697, | |
| "eval_samples_per_second": 20.326, | |
| "eval_steps_per_second": 5.084, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.5203413940256045, | |
| "grad_norm": 2.175325393676758, | |
| "learning_rate": 1.1480426716537316e-05, | |
| "loss": 0.7626, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 1.523376007586534, | |
| "grad_norm": 2.093395471572876, | |
| "learning_rate": 1.1445429063550925e-05, | |
| "loss": 0.7787, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 1.5264106211474633, | |
| "grad_norm": 2.2766175270080566, | |
| "learning_rate": 1.1410413319249193e-05, | |
| "loss": 0.7592, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 1.5294452347083927, | |
| "grad_norm": 2.00833797454834, | |
| "learning_rate": 1.1375379921897052e-05, | |
| "loss": 0.7685, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 1.5324798482693218, | |
| "grad_norm": 2.2018191814422607, | |
| "learning_rate": 1.1340329309980379e-05, | |
| "loss": 0.753, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 1.5355144618302514, | |
| "grad_norm": 2.228724479675293, | |
| "learning_rate": 1.130526192220052e-05, | |
| "loss": 0.7687, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 1.5385490753911806, | |
| "grad_norm": 2.241725444793701, | |
| "learning_rate": 1.1270178197468788e-05, | |
| "loss": 0.7674, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 1.54158368895211, | |
| "grad_norm": 2.1697564125061035, | |
| "learning_rate": 1.1235078574900984e-05, | |
| "loss": 0.7726, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 1.5446183025130393, | |
| "grad_norm": 2.197449207305908, | |
| "learning_rate": 1.119996349381187e-05, | |
| "loss": 0.7672, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 1.5476529160739687, | |
| "grad_norm": 2.0595834255218506, | |
| "learning_rate": 1.1164833393709707e-05, | |
| "loss": 0.7706, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.550687529634898, | |
| "grad_norm": 2.3002591133117676, | |
| "learning_rate": 1.112968871429073e-05, | |
| "loss": 0.7875, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 1.5537221431958272, | |
| "grad_norm": 2.175219774246216, | |
| "learning_rate": 1.1094529895433653e-05, | |
| "loss": 0.7809, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 1.5567567567567568, | |
| "grad_norm": 2.1368846893310547, | |
| "learning_rate": 1.1059357377194161e-05, | |
| "loss": 0.7878, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 1.559791370317686, | |
| "grad_norm": 2.210344076156616, | |
| "learning_rate": 1.102417159979941e-05, | |
| "loss": 0.7543, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 1.5628259838786156, | |
| "grad_norm": 2.1888577938079834, | |
| "learning_rate": 1.09889730036425e-05, | |
| "loss": 0.7731, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 1.5658605974395448, | |
| "grad_norm": 2.3701112270355225, | |
| "learning_rate": 1.0953762029276982e-05, | |
| "loss": 0.7689, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 1.5688952110004741, | |
| "grad_norm": 2.069556713104248, | |
| "learning_rate": 1.0918539117411334e-05, | |
| "loss": 0.767, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 1.5719298245614035, | |
| "grad_norm": 2.209773302078247, | |
| "learning_rate": 1.0883304708903441e-05, | |
| "loss": 0.7696, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 1.5749644381223329, | |
| "grad_norm": 2.1156795024871826, | |
| "learning_rate": 1.0848059244755093e-05, | |
| "loss": 0.7827, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 1.5779990516832623, | |
| "grad_norm": 2.359513521194458, | |
| "learning_rate": 1.0812803166106445e-05, | |
| "loss": 0.7612, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.5810336652441914, | |
| "grad_norm": 2.2499759197235107, | |
| "learning_rate": 1.0777536914230509e-05, | |
| "loss": 0.7671, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 1.584068278805121, | |
| "grad_norm": 2.219525098800659, | |
| "learning_rate": 1.0742260930527625e-05, | |
| "loss": 0.776, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 1.5871028923660502, | |
| "grad_norm": 2.339210033416748, | |
| "learning_rate": 1.0706975656519946e-05, | |
| "loss": 0.7669, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 1.5901375059269798, | |
| "grad_norm": 2.3282480239868164, | |
| "learning_rate": 1.06716815338459e-05, | |
| "loss": 0.7843, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 1.593172119487909, | |
| "grad_norm": 2.09635329246521, | |
| "learning_rate": 1.0636379004254665e-05, | |
| "loss": 0.7598, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 1.5962067330488383, | |
| "grad_norm": 2.3128199577331543, | |
| "learning_rate": 1.0601068509600642e-05, | |
| "loss": 0.7673, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 1.5992413466097677, | |
| "grad_norm": 2.3669700622558594, | |
| "learning_rate": 1.0565750491837925e-05, | |
| "loss": 0.7697, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 1.602275960170697, | |
| "grad_norm": 2.2540953159332275, | |
| "learning_rate": 1.0530425393014773e-05, | |
| "loss": 0.7641, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 1.6053105737316264, | |
| "grad_norm": 2.3089439868927, | |
| "learning_rate": 1.049509365526807e-05, | |
| "loss": 0.768, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 1.6083451872925556, | |
| "grad_norm": 2.0084259510040283, | |
| "learning_rate": 1.0459755720817797e-05, | |
| "loss": 0.7504, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.6113798008534852, | |
| "grad_norm": 2.261798143386841, | |
| "learning_rate": 1.0424412031961485e-05, | |
| "loss": 0.7629, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 1.6144144144144144, | |
| "grad_norm": 2.2108161449432373, | |
| "learning_rate": 1.0389063031068698e-05, | |
| "loss": 0.738, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 1.6174490279753437, | |
| "grad_norm": 2.2087652683258057, | |
| "learning_rate": 1.0353709160575488e-05, | |
| "loss": 0.7758, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 1.6204836415362731, | |
| "grad_norm": 2.218838691711426, | |
| "learning_rate": 1.0318350862978848e-05, | |
| "loss": 0.7641, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 1.6235182550972025, | |
| "grad_norm": 2.156991720199585, | |
| "learning_rate": 1.0282988580831183e-05, | |
| "loss": 0.7577, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 1.6265528686581319, | |
| "grad_norm": 2.1950886249542236, | |
| "learning_rate": 1.0247622756734775e-05, | |
| "loss": 0.7888, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 1.629587482219061, | |
| "grad_norm": 2.114649534225464, | |
| "learning_rate": 1.0212253833336237e-05, | |
| "loss": 0.7766, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 1.6326220957799906, | |
| "grad_norm": 2.217928409576416, | |
| "learning_rate": 1.0176882253320968e-05, | |
| "loss": 0.7529, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 1.6356567093409198, | |
| "grad_norm": 2.2367630004882812, | |
| "learning_rate": 1.0141508459407622e-05, | |
| "loss": 0.7699, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 1.6386913229018494, | |
| "grad_norm": 2.1163787841796875, | |
| "learning_rate": 1.0106132894342564e-05, | |
| "loss": 0.7637, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.6417259364627785, | |
| "grad_norm": 2.083292245864868, | |
| "learning_rate": 1.0070756000894321e-05, | |
| "loss": 0.783, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 1.644760550023708, | |
| "grad_norm": 2.1091578006744385, | |
| "learning_rate": 1.0035378221848053e-05, | |
| "loss": 0.7848, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 1.6477951635846373, | |
| "grad_norm": 2.0994367599487305, | |
| "learning_rate": 1e-05, | |
| "loss": 0.7698, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 1.6508297771455667, | |
| "grad_norm": 2.2539420127868652, | |
| "learning_rate": 9.964621778151947e-06, | |
| "loss": 0.769, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 1.653864390706496, | |
| "grad_norm": 2.154353380203247, | |
| "learning_rate": 9.929243999105682e-06, | |
| "loss": 0.7496, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 1.6568990042674252, | |
| "grad_norm": 2.265385150909424, | |
| "learning_rate": 9.89386710565744e-06, | |
| "loss": 0.768, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 1.6599336178283548, | |
| "grad_norm": 2.073289632797241, | |
| "learning_rate": 9.858491540592383e-06, | |
| "loss": 0.7807, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 1.662968231389284, | |
| "grad_norm": 2.135737657546997, | |
| "learning_rate": 9.823117746679034e-06, | |
| "loss": 0.766, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 1.6660028449502133, | |
| "grad_norm": 2.1423635482788086, | |
| "learning_rate": 9.787746166663765e-06, | |
| "loss": 0.7588, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 1.6690374585111427, | |
| "grad_norm": 2.0772769451141357, | |
| "learning_rate": 9.752377243265229e-06, | |
| "loss": 0.7647, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.6690374585111427, | |
| "eval_loss": null, | |
| "eval_runtime": 204.4783, | |
| "eval_samples_per_second": 20.335, | |
| "eval_steps_per_second": 5.086, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.672072072072072, | |
| "grad_norm": 2.118854284286499, | |
| "learning_rate": 9.71701141916882e-06, | |
| "loss": 0.7728, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 1.6751066856330015, | |
| "grad_norm": 2.1113169193267822, | |
| "learning_rate": 9.681649137021158e-06, | |
| "loss": 0.7796, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 1.6781412991939306, | |
| "grad_norm": 2.064993143081665, | |
| "learning_rate": 9.646290839424515e-06, | |
| "loss": 0.762, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 1.6811759127548602, | |
| "grad_norm": 2.151028633117676, | |
| "learning_rate": 9.610936968931302e-06, | |
| "loss": 0.7709, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 1.6842105263157894, | |
| "grad_norm": 2.201429843902588, | |
| "learning_rate": 9.57558796803852e-06, | |
| "loss": 0.7619, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 1.687245139876719, | |
| "grad_norm": 2.065720796585083, | |
| "learning_rate": 9.540244279182206e-06, | |
| "loss": 0.798, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 1.6902797534376481, | |
| "grad_norm": 2.1895217895507812, | |
| "learning_rate": 9.504906344731933e-06, | |
| "loss": 0.7802, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 1.6933143669985775, | |
| "grad_norm": 2.2065324783325195, | |
| "learning_rate": 9.46957460698523e-06, | |
| "loss": 0.7767, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 1.696348980559507, | |
| "grad_norm": 2.17842698097229, | |
| "learning_rate": 9.434249508162076e-06, | |
| "loss": 0.7604, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 1.6993835941204363, | |
| "grad_norm": 2.164586305618286, | |
| "learning_rate": 9.398931490399363e-06, | |
| "loss": 0.756, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.7024182076813656, | |
| "grad_norm": 2.235797643661499, | |
| "learning_rate": 9.363620995745337e-06, | |
| "loss": 0.777, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 1.7054528212422948, | |
| "grad_norm": 2.084228038787842, | |
| "learning_rate": 9.328318466154102e-06, | |
| "loss": 0.765, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 1.7084874348032244, | |
| "grad_norm": 2.1382224559783936, | |
| "learning_rate": 9.293024343480056e-06, | |
| "loss": 0.7712, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 1.7115220483641536, | |
| "grad_norm": 2.304201126098633, | |
| "learning_rate": 9.257739069472375e-06, | |
| "loss": 0.7748, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 1.714556661925083, | |
| "grad_norm": 2.1342921257019043, | |
| "learning_rate": 9.222463085769495e-06, | |
| "loss": 0.762, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 1.7175912754860123, | |
| "grad_norm": 2.189845323562622, | |
| "learning_rate": 9.187196833893559e-06, | |
| "loss": 0.7605, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 1.7206258890469417, | |
| "grad_norm": 2.3078930377960205, | |
| "learning_rate": 9.151940755244912e-06, | |
| "loss": 0.7677, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 1.723660502607871, | |
| "grad_norm": 2.303213357925415, | |
| "learning_rate": 9.11669529109656e-06, | |
| "loss": 0.7735, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 1.7266951161688002, | |
| "grad_norm": 2.3082435131073, | |
| "learning_rate": 9.081460882588668e-06, | |
| "loss": 0.7683, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 1.7297297297297298, | |
| "grad_norm": 2.2995338439941406, | |
| "learning_rate": 9.046237970723022e-06, | |
| "loss": 0.752, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.732764343290659, | |
| "grad_norm": 2.0647215843200684, | |
| "learning_rate": 9.011026996357504e-06, | |
| "loss": 0.7731, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 1.7357989568515886, | |
| "grad_norm": 2.160496473312378, | |
| "learning_rate": 8.975828400200592e-06, | |
| "loss": 0.7543, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 1.7388335704125177, | |
| "grad_norm": 2.3624351024627686, | |
| "learning_rate": 8.94064262280584e-06, | |
| "loss": 0.774, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 1.7418681839734471, | |
| "grad_norm": 2.165922164916992, | |
| "learning_rate": 8.90547010456635e-06, | |
| "loss": 0.7815, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 1.7449027975343765, | |
| "grad_norm": 2.138921022415161, | |
| "learning_rate": 8.870311285709274e-06, | |
| "loss": 0.7593, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 1.7479374110953059, | |
| "grad_norm": 2.122746706008911, | |
| "learning_rate": 8.835166606290295e-06, | |
| "loss": 0.783, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 1.7509720246562352, | |
| "grad_norm": 2.119318723678589, | |
| "learning_rate": 8.80003650618813e-06, | |
| "loss": 0.7778, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 1.7540066382171644, | |
| "grad_norm": 2.092092990875244, | |
| "learning_rate": 8.76492142509902e-06, | |
| "loss": 0.765, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 1.757041251778094, | |
| "grad_norm": 2.156517267227173, | |
| "learning_rate": 8.729821802531213e-06, | |
| "loss": 0.7583, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 1.7600758653390232, | |
| "grad_norm": 2.2096939086914062, | |
| "learning_rate": 8.694738077799487e-06, | |
| "loss": 0.7698, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.7631104788999525, | |
| "grad_norm": 1.981307029724121, | |
| "learning_rate": 8.659670690019626e-06, | |
| "loss": 0.7771, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 1.766145092460882, | |
| "grad_norm": 2.1369874477386475, | |
| "learning_rate": 8.624620078102952e-06, | |
| "loss": 0.777, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 1.7691797060218113, | |
| "grad_norm": 2.1329455375671387, | |
| "learning_rate": 8.58958668075081e-06, | |
| "loss": 0.7748, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 1.7722143195827407, | |
| "grad_norm": 2.1630642414093018, | |
| "learning_rate": 8.554570936449073e-06, | |
| "loss": 0.7675, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 1.7752489331436698, | |
| "grad_norm": 2.1383938789367676, | |
| "learning_rate": 8.519573283462688e-06, | |
| "loss": 0.7702, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 1.7782835467045994, | |
| "grad_norm": 2.001569986343384, | |
| "learning_rate": 8.484594159830142e-06, | |
| "loss": 0.7553, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 1.7813181602655286, | |
| "grad_norm": 2.1266000270843506, | |
| "learning_rate": 8.449634003358022e-06, | |
| "loss": 0.7757, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 1.7843527738264582, | |
| "grad_norm": 2.172466278076172, | |
| "learning_rate": 8.414693251615513e-06, | |
| "loss": 0.7625, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 1.7873873873873873, | |
| "grad_norm": 2.2140755653381348, | |
| "learning_rate": 8.379772341928916e-06, | |
| "loss": 0.7653, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 1.7904220009483167, | |
| "grad_norm": 2.1517114639282227, | |
| "learning_rate": 8.344871711376202e-06, | |
| "loss": 0.7744, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.793456614509246, | |
| "grad_norm": 2.1749329566955566, | |
| "learning_rate": 8.309991796781512e-06, | |
| "loss": 0.7663, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 1.7964912280701755, | |
| "grad_norm": 2.2055773735046387, | |
| "learning_rate": 8.2751330347097e-06, | |
| "loss": 0.7768, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 1.7995258416311048, | |
| "grad_norm": 2.117532253265381, | |
| "learning_rate": 8.24029586146088e-06, | |
| "loss": 0.7774, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 1.802560455192034, | |
| "grad_norm": 2.2061290740966797, | |
| "learning_rate": 8.205480713064947e-06, | |
| "loss": 0.7819, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 1.8055950687529636, | |
| "grad_norm": 2.2168619632720947, | |
| "learning_rate": 8.170688025276134e-06, | |
| "loss": 0.7646, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 1.8086296823138928, | |
| "grad_norm": 2.238391637802124, | |
| "learning_rate": 8.135918233567545e-06, | |
| "loss": 0.7782, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 1.8116642958748221, | |
| "grad_norm": 2.105041265487671, | |
| "learning_rate": 8.101171773125716e-06, | |
| "loss": 0.7525, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 1.8146989094357515, | |
| "grad_norm": 2.0234577655792236, | |
| "learning_rate": 8.066449078845168e-06, | |
| "loss": 0.7658, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 1.817733522996681, | |
| "grad_norm": 2.188751220703125, | |
| "learning_rate": 8.031750585322948e-06, | |
| "loss": 0.7616, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 1.8207681365576103, | |
| "grad_norm": 2.1460800170898438, | |
| "learning_rate": 7.99707672685321e-06, | |
| "loss": 0.7617, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.8207681365576103, | |
| "eval_loss": null, | |
| "eval_runtime": 204.6188, | |
| "eval_samples_per_second": 20.321, | |
| "eval_steps_per_second": 5.083, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.8238027501185394, | |
| "grad_norm": 1.964609146118164, | |
| "learning_rate": 7.962427937421763e-06, | |
| "loss": 0.7876, | |
| "step": 601 | |
| }, | |
| { | |
| "epoch": 1.826837363679469, | |
| "grad_norm": 2.0918426513671875, | |
| "learning_rate": 7.92780465070066e-06, | |
| "loss": 0.7744, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 1.8298719772403982, | |
| "grad_norm": 2.059992790222168, | |
| "learning_rate": 7.89320730004274e-06, | |
| "loss": 0.7665, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 1.8329065908013278, | |
| "grad_norm": 2.0450618267059326, | |
| "learning_rate": 7.858636318476226e-06, | |
| "loss": 0.7636, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 1.835941204362257, | |
| "grad_norm": 2.1291024684906006, | |
| "learning_rate": 7.824092138699307e-06, | |
| "loss": 0.7652, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 1.8389758179231863, | |
| "grad_norm": 2.0631322860717773, | |
| "learning_rate": 7.789575193074703e-06, | |
| "loss": 0.7819, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 1.8420104314841157, | |
| "grad_norm": 2.1132242679595947, | |
| "learning_rate": 7.755085913624274e-06, | |
| "loss": 0.7507, | |
| "step": 607 | |
| }, | |
| { | |
| "epoch": 1.845045045045045, | |
| "grad_norm": 2.1836376190185547, | |
| "learning_rate": 7.720624732023604e-06, | |
| "loss": 0.7798, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 1.8480796586059745, | |
| "grad_norm": 2.087453842163086, | |
| "learning_rate": 7.686192079596586e-06, | |
| "loss": 0.7817, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 1.8511142721669036, | |
| "grad_norm": 2.0749568939208984, | |
| "learning_rate": 7.651788387310053e-06, | |
| "loss": 0.7893, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.8541488857278332, | |
| "grad_norm": 2.085495710372925, | |
| "learning_rate": 7.617414085768352e-06, | |
| "loss": 0.7765, | |
| "step": 611 | |
| }, | |
| { | |
| "epoch": 1.8571834992887624, | |
| "grad_norm": 2.1710901260375977, | |
| "learning_rate": 7.5830696052079754e-06, | |
| "loss": 0.7745, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 1.860218112849692, | |
| "grad_norm": 2.12780499458313, | |
| "learning_rate": 7.548755375492173e-06, | |
| "loss": 0.7845, | |
| "step": 613 | |
| }, | |
| { | |
| "epoch": 1.8632527264106211, | |
| "grad_norm": 2.240459680557251, | |
| "learning_rate": 7.514471826105556e-06, | |
| "loss": 0.7731, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 1.8662873399715505, | |
| "grad_norm": 2.130253553390503, | |
| "learning_rate": 7.480219386148751e-06, | |
| "loss": 0.7904, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 1.8693219535324799, | |
| "grad_norm": 2.216806650161743, | |
| "learning_rate": 7.445998484332994e-06, | |
| "loss": 0.7529, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 1.8723565670934093, | |
| "grad_norm": 2.250821352005005, | |
| "learning_rate": 7.411809548974792e-06, | |
| "loss": 0.7566, | |
| "step": 617 | |
| }, | |
| { | |
| "epoch": 1.8753911806543386, | |
| "grad_norm": 2.088843822479248, | |
| "learning_rate": 7.377653007990559e-06, | |
| "loss": 0.7568, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 1.8784257942152678, | |
| "grad_norm": 2.108356237411499, | |
| "learning_rate": 7.343529288891239e-06, | |
| "loss": 0.7769, | |
| "step": 619 | |
| }, | |
| { | |
| "epoch": 1.8814604077761974, | |
| "grad_norm": 2.1201224327087402, | |
| "learning_rate": 7.3094388187769815e-06, | |
| "loss": 0.7651, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.8844950213371265, | |
| "grad_norm": 2.066652536392212, | |
| "learning_rate": 7.275382024331773e-06, | |
| "loss": 0.7773, | |
| "step": 621 | |
| }, | |
| { | |
| "epoch": 1.887529634898056, | |
| "grad_norm": 2.1200613975524902, | |
| "learning_rate": 7.241359331818121e-06, | |
| "loss": 0.7835, | |
| "step": 622 | |
| }, | |
| { | |
| "epoch": 1.8905642484589853, | |
| "grad_norm": 2.160304546356201, | |
| "learning_rate": 7.2073711670717e-06, | |
| "loss": 0.7489, | |
| "step": 623 | |
| }, | |
| { | |
| "epoch": 1.8935988620199147, | |
| "grad_norm": 2.2403597831726074, | |
| "learning_rate": 7.173417955496025e-06, | |
| "loss": 0.7579, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 1.896633475580844, | |
| "grad_norm": 2.083029270172119, | |
| "learning_rate": 7.13950012205713e-06, | |
| "loss": 0.7715, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 1.8996680891417732, | |
| "grad_norm": 2.031341075897217, | |
| "learning_rate": 7.105618091278246e-06, | |
| "loss": 0.7494, | |
| "step": 626 | |
| }, | |
| { | |
| "epoch": 1.9027027027027028, | |
| "grad_norm": 2.155595541000366, | |
| "learning_rate": 7.071772287234497e-06, | |
| "loss": 0.7797, | |
| "step": 627 | |
| }, | |
| { | |
| "epoch": 1.905737316263632, | |
| "grad_norm": 2.2580630779266357, | |
| "learning_rate": 7.037963133547583e-06, | |
| "loss": 0.7801, | |
| "step": 628 | |
| }, | |
| { | |
| "epoch": 1.9087719298245616, | |
| "grad_norm": 2.2416763305664062, | |
| "learning_rate": 7.004191053380469e-06, | |
| "loss": 0.7753, | |
| "step": 629 | |
| }, | |
| { | |
| "epoch": 1.9118065433854907, | |
| "grad_norm": 2.0690417289733887, | |
| "learning_rate": 6.970456469432116e-06, | |
| "loss": 0.7693, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.91484115694642, | |
| "grad_norm": 2.0331270694732666, | |
| "learning_rate": 6.936759803932167e-06, | |
| "loss": 0.7578, | |
| "step": 631 | |
| }, | |
| { | |
| "epoch": 1.9178757705073495, | |
| "grad_norm": 2.1094417572021484, | |
| "learning_rate": 6.903101478635663e-06, | |
| "loss": 0.7589, | |
| "step": 632 | |
| }, | |
| { | |
| "epoch": 1.9209103840682789, | |
| "grad_norm": 2.042919635772705, | |
| "learning_rate": 6.869481914817779e-06, | |
| "loss": 0.7659, | |
| "step": 633 | |
| }, | |
| { | |
| "epoch": 1.9239449976292082, | |
| "grad_norm": 2.0610294342041016, | |
| "learning_rate": 6.835901533268536e-06, | |
| "loss": 0.7648, | |
| "step": 634 | |
| }, | |
| { | |
| "epoch": 1.9269796111901374, | |
| "grad_norm": 2.0473015308380127, | |
| "learning_rate": 6.802360754287548e-06, | |
| "loss": 0.7763, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 1.930014224751067, | |
| "grad_norm": 2.1137821674346924, | |
| "learning_rate": 6.768859997678751e-06, | |
| "loss": 0.7849, | |
| "step": 636 | |
| }, | |
| { | |
| "epoch": 1.9330488383119961, | |
| "grad_norm": 2.12430477142334, | |
| "learning_rate": 6.735399682745145e-06, | |
| "loss": 0.7644, | |
| "step": 637 | |
| }, | |
| { | |
| "epoch": 1.9360834518729255, | |
| "grad_norm": 2.0620853900909424, | |
| "learning_rate": 6.701980228283569e-06, | |
| "loss": 0.7733, | |
| "step": 638 | |
| }, | |
| { | |
| "epoch": 1.939118065433855, | |
| "grad_norm": 2.124861478805542, | |
| "learning_rate": 6.668602052579425e-06, | |
| "loss": 0.7538, | |
| "step": 639 | |
| }, | |
| { | |
| "epoch": 1.9421526789947843, | |
| "grad_norm": 2.003215789794922, | |
| "learning_rate": 6.635265573401475e-06, | |
| "loss": 0.7574, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.9451872925557137, | |
| "grad_norm": 2.1591827869415283, | |
| "learning_rate": 6.601971207996592e-06, | |
| "loss": 0.7591, | |
| "step": 641 | |
| }, | |
| { | |
| "epoch": 1.9482219061166428, | |
| "grad_norm": 2.1029608249664307, | |
| "learning_rate": 6.5687193730845375e-06, | |
| "loss": 0.7927, | |
| "step": 642 | |
| }, | |
| { | |
| "epoch": 1.9512565196775724, | |
| "grad_norm": 2.069796085357666, | |
| "learning_rate": 6.535510484852767e-06, | |
| "loss": 0.7491, | |
| "step": 643 | |
| }, | |
| { | |
| "epoch": 1.9542911332385016, | |
| "grad_norm": 2.0876927375793457, | |
| "learning_rate": 6.50234495895119e-06, | |
| "loss": 0.7615, | |
| "step": 644 | |
| }, | |
| { | |
| "epoch": 1.9573257467994312, | |
| "grad_norm": 2.0911192893981934, | |
| "learning_rate": 6.469223210486992e-06, | |
| "loss": 0.7579, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 1.9603603603603603, | |
| "grad_norm": 2.2714855670928955, | |
| "learning_rate": 6.4361456540194325e-06, | |
| "loss": 0.7543, | |
| "step": 646 | |
| }, | |
| { | |
| "epoch": 1.9633949739212897, | |
| "grad_norm": 2.075011968612671, | |
| "learning_rate": 6.403112703554643e-06, | |
| "loss": 0.752, | |
| "step": 647 | |
| }, | |
| { | |
| "epoch": 1.966429587482219, | |
| "grad_norm": 2.097029447555542, | |
| "learning_rate": 6.370124772540469e-06, | |
| "loss": 0.7338, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 1.9694642010431485, | |
| "grad_norm": 2.147951602935791, | |
| "learning_rate": 6.337182273861273e-06, | |
| "loss": 0.7735, | |
| "step": 649 | |
| }, | |
| { | |
| "epoch": 1.9724988146040778, | |
| "grad_norm": 2.0643298625946045, | |
| "learning_rate": 6.3042856198327795e-06, | |
| "loss": 0.7775, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.9724988146040778, | |
| "eval_loss": null, | |
| "eval_runtime": 204.7127, | |
| "eval_samples_per_second": 20.311, | |
| "eval_steps_per_second": 5.08, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.975533428165007, | |
| "grad_norm": 2.043440580368042, | |
| "learning_rate": 6.2714352221969155e-06, | |
| "loss": 0.7593, | |
| "step": 651 | |
| }, | |
| { | |
| "epoch": 1.9785680417259366, | |
| "grad_norm": 2.250242233276367, | |
| "learning_rate": 6.238631492116643e-06, | |
| "loss": 0.7657, | |
| "step": 652 | |
| }, | |
| { | |
| "epoch": 1.9816026552868657, | |
| "grad_norm": 2.068997383117676, | |
| "learning_rate": 6.205874840170833e-06, | |
| "loss": 0.7774, | |
| "step": 653 | |
| }, | |
| { | |
| "epoch": 1.9846372688477951, | |
| "grad_norm": 2.1658010482788086, | |
| "learning_rate": 6.173165676349103e-06, | |
| "loss": 0.7792, | |
| "step": 654 | |
| }, | |
| { | |
| "epoch": 1.9876718824087245, | |
| "grad_norm": 2.1947412490844727, | |
| "learning_rate": 6.140504410046712e-06, | |
| "loss": 0.7631, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 1.9907064959696539, | |
| "grad_norm": 2.082087278366089, | |
| "learning_rate": 6.107891450059419e-06, | |
| "loss": 0.7564, | |
| "step": 656 | |
| }, | |
| { | |
| "epoch": 1.9937411095305833, | |
| "grad_norm": 2.043905019760132, | |
| "learning_rate": 6.075327204578363e-06, | |
| "loss": 0.7834, | |
| "step": 657 | |
| }, | |
| { | |
| "epoch": 1.9967757230915124, | |
| "grad_norm": 2.1089298725128174, | |
| "learning_rate": 6.042812081184973e-06, | |
| "loss": 0.7595, | |
| "step": 658 | |
| }, | |
| { | |
| "epoch": 1.999810336652442, | |
| "grad_norm": 2.0419466495513916, | |
| "learning_rate": 6.010346486845837e-06, | |
| "loss": 0.7481, | |
| "step": 659 | |
| }, | |
| { | |
| "epoch": 2.002844950213371, | |
| "grad_norm": 2.1154048442840576, | |
| "learning_rate": 5.97793082790765e-06, | |
| "loss": 0.7479, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 2.0058795637743008, | |
| "grad_norm": 1.9878915548324585, | |
| "learning_rate": 5.945565510092086e-06, | |
| "loss": 0.7547, | |
| "step": 661 | |
| }, | |
| { | |
| "epoch": 2.00891417733523, | |
| "grad_norm": 2.1136927604675293, | |
| "learning_rate": 5.913250938490744e-06, | |
| "loss": 0.7294, | |
| "step": 662 | |
| }, | |
| { | |
| "epoch": 2.0119487908961595, | |
| "grad_norm": 2.1599817276000977, | |
| "learning_rate": 5.880987517560075e-06, | |
| "loss": 0.744, | |
| "step": 663 | |
| }, | |
| { | |
| "epoch": 2.0149834044570887, | |
| "grad_norm": 2.200779438018799, | |
| "learning_rate": 5.84877565111631e-06, | |
| "loss": 0.7353, | |
| "step": 664 | |
| }, | |
| { | |
| "epoch": 2.018018018018018, | |
| "grad_norm": 2.05635404586792, | |
| "learning_rate": 5.81661574233042e-06, | |
| "loss": 0.7534, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 2.0210526315789474, | |
| "grad_norm": 2.193406820297241, | |
| "learning_rate": 5.784508193723058e-06, | |
| "loss": 0.7443, | |
| "step": 666 | |
| }, | |
| { | |
| "epoch": 2.0240872451398766, | |
| "grad_norm": 2.120541572570801, | |
| "learning_rate": 5.752453407159521e-06, | |
| "loss": 0.7414, | |
| "step": 667 | |
| }, | |
| { | |
| "epoch": 2.027121858700806, | |
| "grad_norm": 2.075017213821411, | |
| "learning_rate": 5.720451783844741e-06, | |
| "loss": 0.7439, | |
| "step": 668 | |
| }, | |
| { | |
| "epoch": 2.0301564722617353, | |
| "grad_norm": 2.1643104553222656, | |
| "learning_rate": 5.688503724318217e-06, | |
| "loss": 0.7565, | |
| "step": 669 | |
| }, | |
| { | |
| "epoch": 2.033191085822665, | |
| "grad_norm": 2.148364782333374, | |
| "learning_rate": 5.656609628449064e-06, | |
| "loss": 0.7353, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 2.036225699383594, | |
| "grad_norm": 2.205618143081665, | |
| "learning_rate": 5.6247698954309616e-06, | |
| "loss": 0.7529, | |
| "step": 671 | |
| }, | |
| { | |
| "epoch": 2.0392603129445233, | |
| "grad_norm": 2.1727452278137207, | |
| "learning_rate": 5.592984923777156e-06, | |
| "loss": 0.7439, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 2.042294926505453, | |
| "grad_norm": 2.1897435188293457, | |
| "learning_rate": 5.561255111315525e-06, | |
| "loss": 0.7578, | |
| "step": 673 | |
| }, | |
| { | |
| "epoch": 2.045329540066382, | |
| "grad_norm": 2.155768871307373, | |
| "learning_rate": 5.529580855183518e-06, | |
| "loss": 0.7687, | |
| "step": 674 | |
| }, | |
| { | |
| "epoch": 2.0483641536273116, | |
| "grad_norm": 2.100489616394043, | |
| "learning_rate": 5.497962551823266e-06, | |
| "loss": 0.7333, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 2.0513987671882408, | |
| "grad_norm": 2.0687026977539062, | |
| "learning_rate": 5.466400596976568e-06, | |
| "loss": 0.745, | |
| "step": 676 | |
| }, | |
| { | |
| "epoch": 2.0544333807491704, | |
| "grad_norm": 2.144482135772705, | |
| "learning_rate": 5.434895385679937e-06, | |
| "loss": 0.7369, | |
| "step": 677 | |
| }, | |
| { | |
| "epoch": 2.0574679943100995, | |
| "grad_norm": 2.1690595149993896, | |
| "learning_rate": 5.403447312259702e-06, | |
| "loss": 0.7443, | |
| "step": 678 | |
| }, | |
| { | |
| "epoch": 2.060502607871029, | |
| "grad_norm": 2.0905041694641113, | |
| "learning_rate": 5.3720567703270135e-06, | |
| "loss": 0.7333, | |
| "step": 679 | |
| }, | |
| { | |
| "epoch": 2.0635372214319583, | |
| "grad_norm": 2.205411911010742, | |
| "learning_rate": 5.340724152772956e-06, | |
| "loss": 0.7399, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 2.0665718349928874, | |
| "grad_norm": 2.3007540702819824, | |
| "learning_rate": 5.3094498517636324e-06, | |
| "loss": 0.7452, | |
| "step": 681 | |
| }, | |
| { | |
| "epoch": 2.069606448553817, | |
| "grad_norm": 2.2882180213928223, | |
| "learning_rate": 5.278234258735215e-06, | |
| "loss": 0.7285, | |
| "step": 682 | |
| }, | |
| { | |
| "epoch": 2.072641062114746, | |
| "grad_norm": 2.2169337272644043, | |
| "learning_rate": 5.247077764389099e-06, | |
| "loss": 0.7437, | |
| "step": 683 | |
| }, | |
| { | |
| "epoch": 2.075675675675676, | |
| "grad_norm": 2.1462368965148926, | |
| "learning_rate": 5.215980758686978e-06, | |
| "loss": 0.7366, | |
| "step": 684 | |
| }, | |
| { | |
| "epoch": 2.078710289236605, | |
| "grad_norm": 2.200030565261841, | |
| "learning_rate": 5.18494363084596e-06, | |
| "loss": 0.7546, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 2.0817449027975345, | |
| "grad_norm": 2.1369688510894775, | |
| "learning_rate": 5.153966769333734e-06, | |
| "loss": 0.7388, | |
| "step": 686 | |
| }, | |
| { | |
| "epoch": 2.0847795163584637, | |
| "grad_norm": 2.191958427429199, | |
| "learning_rate": 5.1230505618636575e-06, | |
| "loss": 0.7297, | |
| "step": 687 | |
| }, | |
| { | |
| "epoch": 2.0878141299193933, | |
| "grad_norm": 2.1999082565307617, | |
| "learning_rate": 5.092195395389937e-06, | |
| "loss": 0.7428, | |
| "step": 688 | |
| }, | |
| { | |
| "epoch": 2.0908487434803225, | |
| "grad_norm": 2.373140335083008, | |
| "learning_rate": 5.061401656102791e-06, | |
| "loss": 0.7585, | |
| "step": 689 | |
| }, | |
| { | |
| "epoch": 2.0938833570412516, | |
| "grad_norm": 2.2395219802856445, | |
| "learning_rate": 5.030669729423572e-06, | |
| "loss": 0.7382, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 2.096917970602181, | |
| "grad_norm": 2.1638119220733643, | |
| "learning_rate": 5.000000000000003e-06, | |
| "loss": 0.7538, | |
| "step": 691 | |
| }, | |
| { | |
| "epoch": 2.0999525841631104, | |
| "grad_norm": 2.173945188522339, | |
| "learning_rate": 4.969392851701306e-06, | |
| "loss": 0.7403, | |
| "step": 692 | |
| }, | |
| { | |
| "epoch": 2.10298719772404, | |
| "grad_norm": 2.2739264965057373, | |
| "learning_rate": 4.938848667613436e-06, | |
| "loss": 0.7459, | |
| "step": 693 | |
| }, | |
| { | |
| "epoch": 2.106021811284969, | |
| "grad_norm": 2.2303757667541504, | |
| "learning_rate": 4.908367830034284e-06, | |
| "loss": 0.717, | |
| "step": 694 | |
| }, | |
| { | |
| "epoch": 2.1090564248458987, | |
| "grad_norm": 2.083385944366455, | |
| "learning_rate": 4.8779507204688595e-06, | |
| "loss": 0.747, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 2.112091038406828, | |
| "grad_norm": 2.252095937728882, | |
| "learning_rate": 4.84759771962455e-06, | |
| "loss": 0.7414, | |
| "step": 696 | |
| }, | |
| { | |
| "epoch": 2.115125651967757, | |
| "grad_norm": 2.192476272583008, | |
| "learning_rate": 4.817309207406347e-06, | |
| "loss": 0.7078, | |
| "step": 697 | |
| }, | |
| { | |
| "epoch": 2.1181602655286866, | |
| "grad_norm": 2.170762062072754, | |
| "learning_rate": 4.787085562912076e-06, | |
| "loss": 0.7482, | |
| "step": 698 | |
| }, | |
| { | |
| "epoch": 2.121194879089616, | |
| "grad_norm": 2.1275007724761963, | |
| "learning_rate": 4.756927164427685e-06, | |
| "loss": 0.7415, | |
| "step": 699 | |
| }, | |
| { | |
| "epoch": 2.1242294926505454, | |
| "grad_norm": 2.1509687900543213, | |
| "learning_rate": 4.726834389422461e-06, | |
| "loss": 0.7326, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 2.1242294926505454, | |
| "eval_loss": null, | |
| "eval_runtime": 204.0742, | |
| "eval_samples_per_second": 20.375, | |
| "eval_steps_per_second": 5.096, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 2.1272641062114745, | |
| "grad_norm": 2.0226032733917236, | |
| "learning_rate": 4.696807614544352e-06, | |
| "loss": 0.7411, | |
| "step": 701 | |
| }, | |
| { | |
| "epoch": 2.130298719772404, | |
| "grad_norm": 2.1866512298583984, | |
| "learning_rate": 4.666847215615225e-06, | |
| "loss": 0.7394, | |
| "step": 702 | |
| }, | |
| { | |
| "epoch": 2.1333333333333333, | |
| "grad_norm": 2.2013630867004395, | |
| "learning_rate": 4.636953567626176e-06, | |
| "loss": 0.7356, | |
| "step": 703 | |
| }, | |
| { | |
| "epoch": 2.1363679468942625, | |
| "grad_norm": 2.4076430797576904, | |
| "learning_rate": 4.607127044732827e-06, | |
| "loss": 0.7501, | |
| "step": 704 | |
| }, | |
| { | |
| "epoch": 2.139402560455192, | |
| "grad_norm": 2.1296753883361816, | |
| "learning_rate": 4.57736802025065e-06, | |
| "loss": 0.7399, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 2.142437174016121, | |
| "grad_norm": 2.22512149810791, | |
| "learning_rate": 4.5476768666502895e-06, | |
| "loss": 0.7553, | |
| "step": 706 | |
| }, | |
| { | |
| "epoch": 2.145471787577051, | |
| "grad_norm": 2.178199052810669, | |
| "learning_rate": 4.518053955552903e-06, | |
| "loss": 0.7458, | |
| "step": 707 | |
| }, | |
| { | |
| "epoch": 2.14850640113798, | |
| "grad_norm": 2.0975606441497803, | |
| "learning_rate": 4.488499657725511e-06, | |
| "loss": 0.7234, | |
| "step": 708 | |
| }, | |
| { | |
| "epoch": 2.1515410146989096, | |
| "grad_norm": 2.1103413105010986, | |
| "learning_rate": 4.459014343076356e-06, | |
| "loss": 0.7431, | |
| "step": 709 | |
| }, | |
| { | |
| "epoch": 2.1545756282598387, | |
| "grad_norm": 2.2594456672668457, | |
| "learning_rate": 4.429598380650266e-06, | |
| "loss": 0.7578, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 2.1576102418207683, | |
| "grad_norm": 2.2270963191986084, | |
| "learning_rate": 4.400252138624047e-06, | |
| "loss": 0.7461, | |
| "step": 711 | |
| }, | |
| { | |
| "epoch": 2.1606448553816975, | |
| "grad_norm": 2.191631555557251, | |
| "learning_rate": 4.370975984301866e-06, | |
| "loss": 0.7425, | |
| "step": 712 | |
| }, | |
| { | |
| "epoch": 2.1636794689426266, | |
| "grad_norm": 2.124616861343384, | |
| "learning_rate": 4.341770284110655e-06, | |
| "loss": 0.7495, | |
| "step": 713 | |
| }, | |
| { | |
| "epoch": 2.1667140825035562, | |
| "grad_norm": 2.18497371673584, | |
| "learning_rate": 4.312635403595532e-06, | |
| "loss": 0.7239, | |
| "step": 714 | |
| }, | |
| { | |
| "epoch": 2.1697486960644854, | |
| "grad_norm": 2.149658441543579, | |
| "learning_rate": 4.283571707415214e-06, | |
| "loss": 0.7325, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 2.172783309625415, | |
| "grad_norm": 2.133171558380127, | |
| "learning_rate": 4.25457955933746e-06, | |
| "loss": 0.7324, | |
| "step": 716 | |
| }, | |
| { | |
| "epoch": 2.175817923186344, | |
| "grad_norm": 2.15606689453125, | |
| "learning_rate": 4.2256593222345185e-06, | |
| "loss": 0.7273, | |
| "step": 717 | |
| }, | |
| { | |
| "epoch": 2.1788525367472737, | |
| "grad_norm": 2.237285614013672, | |
| "learning_rate": 4.196811358078585e-06, | |
| "loss": 0.7463, | |
| "step": 718 | |
| }, | |
| { | |
| "epoch": 2.181887150308203, | |
| "grad_norm": 2.267974853515625, | |
| "learning_rate": 4.168036027937267e-06, | |
| "loss": 0.7405, | |
| "step": 719 | |
| }, | |
| { | |
| "epoch": 2.1849217638691325, | |
| "grad_norm": 2.0772793292999268, | |
| "learning_rate": 4.139333691969071e-06, | |
| "loss": 0.7418, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 2.1879563774300617, | |
| "grad_norm": 2.1171600818634033, | |
| "learning_rate": 4.1107047094188946e-06, | |
| "loss": 0.7222, | |
| "step": 721 | |
| }, | |
| { | |
| "epoch": 2.190990990990991, | |
| "grad_norm": 2.3039426803588867, | |
| "learning_rate": 4.082149438613514e-06, | |
| "loss": 0.731, | |
| "step": 722 | |
| }, | |
| { | |
| "epoch": 2.1940256045519204, | |
| "grad_norm": 2.207404375076294, | |
| "learning_rate": 4.053668236957135e-06, | |
| "loss": 0.7499, | |
| "step": 723 | |
| }, | |
| { | |
| "epoch": 2.1970602181128496, | |
| "grad_norm": 2.1732053756713867, | |
| "learning_rate": 4.025261460926877e-06, | |
| "loss": 0.7617, | |
| "step": 724 | |
| }, | |
| { | |
| "epoch": 2.200094831673779, | |
| "grad_norm": 2.537623167037964, | |
| "learning_rate": 3.996929466068344e-06, | |
| "loss": 0.72, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 2.2031294452347083, | |
| "grad_norm": 2.2418384552001953, | |
| "learning_rate": 3.96867260699116e-06, | |
| "loss": 0.727, | |
| "step": 726 | |
| }, | |
| { | |
| "epoch": 2.206164058795638, | |
| "grad_norm": 2.189655065536499, | |
| "learning_rate": 3.940491237364519e-06, | |
| "loss": 0.7321, | |
| "step": 727 | |
| }, | |
| { | |
| "epoch": 2.209198672356567, | |
| "grad_norm": 2.1620960235595703, | |
| "learning_rate": 3.912385709912794e-06, | |
| "loss": 0.7421, | |
| "step": 728 | |
| }, | |
| { | |
| "epoch": 2.2122332859174962, | |
| "grad_norm": 2.216374397277832, | |
| "learning_rate": 3.884356376411089e-06, | |
| "loss": 0.7632, | |
| "step": 729 | |
| }, | |
| { | |
| "epoch": 2.215267899478426, | |
| "grad_norm": 2.152038335800171, | |
| "learning_rate": 3.8564035876808405e-06, | |
| "loss": 0.7349, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 2.218302513039355, | |
| "grad_norm": 2.1349170207977295, | |
| "learning_rate": 3.828527693585451e-06, | |
| "loss": 0.7373, | |
| "step": 731 | |
| }, | |
| { | |
| "epoch": 2.2213371266002846, | |
| "grad_norm": 2.1102511882781982, | |
| "learning_rate": 3.8007290430258712e-06, | |
| "loss": 0.7278, | |
| "step": 732 | |
| }, | |
| { | |
| "epoch": 2.2243717401612138, | |
| "grad_norm": 2.124293088912964, | |
| "learning_rate": 3.7730079839362755e-06, | |
| "loss": 0.7315, | |
| "step": 733 | |
| }, | |
| { | |
| "epoch": 2.2274063537221434, | |
| "grad_norm": 2.0747625827789307, | |
| "learning_rate": 3.7453648632796745e-06, | |
| "loss": 0.7391, | |
| "step": 734 | |
| }, | |
| { | |
| "epoch": 2.2304409672830725, | |
| "grad_norm": 2.1584184169769287, | |
| "learning_rate": 3.7178000270435765e-06, | |
| "loss": 0.7526, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 2.2334755808440017, | |
| "grad_norm": 2.0907371044158936, | |
| "learning_rate": 3.690313820235686e-06, | |
| "loss": 0.7176, | |
| "step": 736 | |
| }, | |
| { | |
| "epoch": 2.2365101944049313, | |
| "grad_norm": 2.100431442260742, | |
| "learning_rate": 3.662906586879542e-06, | |
| "loss": 0.7619, | |
| "step": 737 | |
| }, | |
| { | |
| "epoch": 2.2395448079658604, | |
| "grad_norm": 2.1690993309020996, | |
| "learning_rate": 3.6355786700102426e-06, | |
| "loss": 0.7385, | |
| "step": 738 | |
| }, | |
| { | |
| "epoch": 2.24257942152679, | |
| "grad_norm": 2.2165980339050293, | |
| "learning_rate": 3.6083304116701535e-06, | |
| "loss": 0.7577, | |
| "step": 739 | |
| }, | |
| { | |
| "epoch": 2.245614035087719, | |
| "grad_norm": 2.1825928688049316, | |
| "learning_rate": 3.5811621529045927e-06, | |
| "loss": 0.7353, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 2.2486486486486488, | |
| "grad_norm": 2.1485071182250977, | |
| "learning_rate": 3.5540742337576083e-06, | |
| "loss": 0.7534, | |
| "step": 741 | |
| }, | |
| { | |
| "epoch": 2.251683262209578, | |
| "grad_norm": 2.193574905395508, | |
| "learning_rate": 3.5270669932676926e-06, | |
| "loss": 0.7395, | |
| "step": 742 | |
| }, | |
| { | |
| "epoch": 2.2547178757705075, | |
| "grad_norm": 2.152099847793579, | |
| "learning_rate": 3.5001407694635326e-06, | |
| "loss": 0.7265, | |
| "step": 743 | |
| }, | |
| { | |
| "epoch": 2.2577524893314367, | |
| "grad_norm": 2.2395544052124023, | |
| "learning_rate": 3.4732958993598153e-06, | |
| "loss": 0.7482, | |
| "step": 744 | |
| }, | |
| { | |
| "epoch": 2.2607871028923663, | |
| "grad_norm": 2.209214448928833, | |
| "learning_rate": 3.446532718952966e-06, | |
| "loss": 0.7366, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 2.2638217164532954, | |
| "grad_norm": 2.282007932662964, | |
| "learning_rate": 3.4198515632169703e-06, | |
| "loss": 0.7641, | |
| "step": 746 | |
| }, | |
| { | |
| "epoch": 2.2668563300142246, | |
| "grad_norm": 2.1614651679992676, | |
| "learning_rate": 3.3932527660991877e-06, | |
| "loss": 0.7502, | |
| "step": 747 | |
| }, | |
| { | |
| "epoch": 2.269890943575154, | |
| "grad_norm": 2.306121826171875, | |
| "learning_rate": 3.3667366605161322e-06, | |
| "loss": 0.7464, | |
| "step": 748 | |
| }, | |
| { | |
| "epoch": 2.2729255571360834, | |
| "grad_norm": 2.2220146656036377, | |
| "learning_rate": 3.340303578349361e-06, | |
| "loss": 0.7497, | |
| "step": 749 | |
| }, | |
| { | |
| "epoch": 2.275960170697013, | |
| "grad_norm": 2.1184372901916504, | |
| "learning_rate": 3.313953850441266e-06, | |
| "loss": 0.7434, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 2.275960170697013, | |
| "eval_loss": null, | |
| "eval_runtime": 203.9554, | |
| "eval_samples_per_second": 20.387, | |
| "eval_steps_per_second": 5.099, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 2.278994784257942, | |
| "grad_norm": 2.2895419597625732, | |
| "learning_rate": 3.2876878065909714e-06, | |
| "loss": 0.7362, | |
| "step": 751 | |
| }, | |
| { | |
| "epoch": 2.2820293978188717, | |
| "grad_norm": 2.148529291152954, | |
| "learning_rate": 3.2615057755502e-06, | |
| "loss": 0.7558, | |
| "step": 752 | |
| }, | |
| { | |
| "epoch": 2.285064011379801, | |
| "grad_norm": 2.1223018169403076, | |
| "learning_rate": 3.2354080850191328e-06, | |
| "loss": 0.7591, | |
| "step": 753 | |
| }, | |
| { | |
| "epoch": 2.28809862494073, | |
| "grad_norm": 2.2450056076049805, | |
| "learning_rate": 3.2093950616423397e-06, | |
| "loss": 0.7335, | |
| "step": 754 | |
| }, | |
| { | |
| "epoch": 2.2911332385016596, | |
| "grad_norm": 2.173351526260376, | |
| "learning_rate": 3.1834670310046735e-06, | |
| "loss": 0.7546, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 2.2941678520625888, | |
| "grad_norm": 2.0341904163360596, | |
| "learning_rate": 3.157624317627195e-06, | |
| "loss": 0.7447, | |
| "step": 756 | |
| }, | |
| { | |
| "epoch": 2.2972024656235184, | |
| "grad_norm": 2.0900380611419678, | |
| "learning_rate": 3.1318672449631283e-06, | |
| "loss": 0.7544, | |
| "step": 757 | |
| }, | |
| { | |
| "epoch": 2.3002370791844475, | |
| "grad_norm": 2.2536418437957764, | |
| "learning_rate": 3.106196135393782e-06, | |
| "loss": 0.7326, | |
| "step": 758 | |
| }, | |
| { | |
| "epoch": 2.303271692745377, | |
| "grad_norm": 2.1906511783599854, | |
| "learning_rate": 3.0806113102245395e-06, | |
| "loss": 0.7522, | |
| "step": 759 | |
| }, | |
| { | |
| "epoch": 2.3063063063063063, | |
| "grad_norm": 2.1476638317108154, | |
| "learning_rate": 3.055113089680829e-06, | |
| "loss": 0.7423, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 2.3093409198672354, | |
| "grad_norm": 2.1808698177337646, | |
| "learning_rate": 3.029701792904117e-06, | |
| "loss": 0.7612, | |
| "step": 761 | |
| }, | |
| { | |
| "epoch": 2.312375533428165, | |
| "grad_norm": 2.085947036743164, | |
| "learning_rate": 3.00437773794791e-06, | |
| "loss": 0.7439, | |
| "step": 762 | |
| }, | |
| { | |
| "epoch": 2.315410146989094, | |
| "grad_norm": 2.1122984886169434, | |
| "learning_rate": 2.979141241773775e-06, | |
| "loss": 0.7383, | |
| "step": 763 | |
| }, | |
| { | |
| "epoch": 2.318444760550024, | |
| "grad_norm": 2.1743266582489014, | |
| "learning_rate": 2.953992620247379e-06, | |
| "loss": 0.7458, | |
| "step": 764 | |
| }, | |
| { | |
| "epoch": 2.321479374110953, | |
| "grad_norm": 2.1469690799713135, | |
| "learning_rate": 2.9289321881345257e-06, | |
| "loss": 0.756, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 2.3245139876718826, | |
| "grad_norm": 2.077439069747925, | |
| "learning_rate": 2.9039602590972203e-06, | |
| "loss": 0.7316, | |
| "step": 766 | |
| }, | |
| { | |
| "epoch": 2.3275486012328117, | |
| "grad_norm": 2.4407780170440674, | |
| "learning_rate": 2.879077145689746e-06, | |
| "loss": 0.7343, | |
| "step": 767 | |
| }, | |
| { | |
| "epoch": 2.330583214793741, | |
| "grad_norm": 2.1479241847991943, | |
| "learning_rate": 2.8542831593547483e-06, | |
| "loss": 0.7407, | |
| "step": 768 | |
| }, | |
| { | |
| "epoch": 2.3336178283546705, | |
| "grad_norm": 2.2073941230773926, | |
| "learning_rate": 2.829578610419337e-06, | |
| "loss": 0.7471, | |
| "step": 769 | |
| }, | |
| { | |
| "epoch": 2.3366524419155996, | |
| "grad_norm": 2.0292317867279053, | |
| "learning_rate": 2.8049638080912045e-06, | |
| "loss": 0.7332, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 2.339687055476529, | |
| "grad_norm": 2.2681589126586914, | |
| "learning_rate": 2.780439060454756e-06, | |
| "loss": 0.7436, | |
| "step": 771 | |
| }, | |
| { | |
| "epoch": 2.3427216690374584, | |
| "grad_norm": 2.057676076889038, | |
| "learning_rate": 2.75600467446725e-06, | |
| "loss": 0.7352, | |
| "step": 772 | |
| }, | |
| { | |
| "epoch": 2.345756282598388, | |
| "grad_norm": 2.2097206115722656, | |
| "learning_rate": 2.7316609559549568e-06, | |
| "loss": 0.7275, | |
| "step": 773 | |
| }, | |
| { | |
| "epoch": 2.348790896159317, | |
| "grad_norm": 2.3039231300354004, | |
| "learning_rate": 2.707408209609339e-06, | |
| "loss": 0.7556, | |
| "step": 774 | |
| }, | |
| { | |
| "epoch": 2.3518255097202467, | |
| "grad_norm": 2.097167730331421, | |
| "learning_rate": 2.683246738983217e-06, | |
| "loss": 0.7404, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 2.354860123281176, | |
| "grad_norm": 2.083677053451538, | |
| "learning_rate": 2.6591768464870016e-06, | |
| "loss": 0.7359, | |
| "step": 776 | |
| }, | |
| { | |
| "epoch": 2.3578947368421055, | |
| "grad_norm": 2.0094263553619385, | |
| "learning_rate": 2.6351988333848787e-06, | |
| "loss": 0.7297, | |
| "step": 777 | |
| }, | |
| { | |
| "epoch": 2.3609293504030346, | |
| "grad_norm": 2.13173770904541, | |
| "learning_rate": 2.611312999791055e-06, | |
| "loss": 0.7315, | |
| "step": 778 | |
| }, | |
| { | |
| "epoch": 2.363963963963964, | |
| "grad_norm": 2.1343095302581787, | |
| "learning_rate": 2.587519644666001e-06, | |
| "loss": 0.7309, | |
| "step": 779 | |
| }, | |
| { | |
| "epoch": 2.3669985775248934, | |
| "grad_norm": 2.122774600982666, | |
| "learning_rate": 2.5638190658126937e-06, | |
| "loss": 0.7428, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 2.3700331910858226, | |
| "grad_norm": 2.1483750343322754, | |
| "learning_rate": 2.5402115598729182e-06, | |
| "loss": 0.7386, | |
| "step": 781 | |
| }, | |
| { | |
| "epoch": 2.373067804646752, | |
| "grad_norm": 2.189150810241699, | |
| "learning_rate": 2.5166974223235295e-06, | |
| "loss": 0.7237, | |
| "step": 782 | |
| }, | |
| { | |
| "epoch": 2.3761024182076813, | |
| "grad_norm": 2.156003475189209, | |
| "learning_rate": 2.493276947472756e-06, | |
| "loss": 0.7614, | |
| "step": 783 | |
| }, | |
| { | |
| "epoch": 2.379137031768611, | |
| "grad_norm": 2.100486993789673, | |
| "learning_rate": 2.4699504284565413e-06, | |
| "loss": 0.7543, | |
| "step": 784 | |
| }, | |
| { | |
| "epoch": 2.38217164532954, | |
| "grad_norm": 2.1420738697052, | |
| "learning_rate": 2.446718157234832e-06, | |
| "loss": 0.7475, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 2.385206258890469, | |
| "grad_norm": 2.205798625946045, | |
| "learning_rate": 2.4235804245879723e-06, | |
| "loss": 0.7362, | |
| "step": 786 | |
| }, | |
| { | |
| "epoch": 2.388240872451399, | |
| "grad_norm": 2.193894147872925, | |
| "learning_rate": 2.4005375201130275e-06, | |
| "loss": 0.7456, | |
| "step": 787 | |
| }, | |
| { | |
| "epoch": 2.391275486012328, | |
| "grad_norm": 2.1630735397338867, | |
| "learning_rate": 2.3775897322201667e-06, | |
| "loss": 0.7553, | |
| "step": 788 | |
| }, | |
| { | |
| "epoch": 2.3943100995732576, | |
| "grad_norm": 2.247066020965576, | |
| "learning_rate": 2.354737348129077e-06, | |
| "loss": 0.7324, | |
| "step": 789 | |
| }, | |
| { | |
| "epoch": 2.3973447131341867, | |
| "grad_norm": 2.1332616806030273, | |
| "learning_rate": 2.33198065386533e-06, | |
| "loss": 0.7413, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 2.4003793266951163, | |
| "grad_norm": 2.1438822746276855, | |
| "learning_rate": 2.3093199342568316e-06, | |
| "loss": 0.7215, | |
| "step": 791 | |
| }, | |
| { | |
| "epoch": 2.4034139402560455, | |
| "grad_norm": 2.1765286922454834, | |
| "learning_rate": 2.2867554729302545e-06, | |
| "loss": 0.7362, | |
| "step": 792 | |
| }, | |
| { | |
| "epoch": 2.4064485538169746, | |
| "grad_norm": 2.059136390686035, | |
| "learning_rate": 2.2642875523074613e-06, | |
| "loss": 0.7373, | |
| "step": 793 | |
| }, | |
| { | |
| "epoch": 2.4094831673779042, | |
| "grad_norm": 2.195065975189209, | |
| "learning_rate": 2.2419164536020112e-06, | |
| "loss": 0.7363, | |
| "step": 794 | |
| }, | |
| { | |
| "epoch": 2.4125177809388334, | |
| "grad_norm": 2.2990760803222656, | |
| "learning_rate": 2.2196424568156073e-06, | |
| "loss": 0.7348, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 2.415552394499763, | |
| "grad_norm": 2.251936435699463, | |
| "learning_rate": 2.197465840734596e-06, | |
| "loss": 0.7257, | |
| "step": 796 | |
| }, | |
| { | |
| "epoch": 2.418587008060692, | |
| "grad_norm": 2.142132520675659, | |
| "learning_rate": 2.1753868829265046e-06, | |
| "loss": 0.7526, | |
| "step": 797 | |
| }, | |
| { | |
| "epoch": 2.4216216216216218, | |
| "grad_norm": 2.235971450805664, | |
| "learning_rate": 2.1534058597365284e-06, | |
| "loss": 0.7402, | |
| "step": 798 | |
| }, | |
| { | |
| "epoch": 2.424656235182551, | |
| "grad_norm": 2.1040003299713135, | |
| "learning_rate": 2.1315230462840985e-06, | |
| "loss": 0.7271, | |
| "step": 799 | |
| }, | |
| { | |
| "epoch": 2.4276908487434805, | |
| "grad_norm": 2.153122663497925, | |
| "learning_rate": 2.1097387164594406e-06, | |
| "loss": 0.7274, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.4276908487434805, | |
| "eval_loss": null, | |
| "eval_runtime": 204.5921, | |
| "eval_samples_per_second": 20.323, | |
| "eval_steps_per_second": 5.083, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.4307254623044097, | |
| "grad_norm": 2.2538816928863525, | |
| "learning_rate": 2.0880531429201146e-06, | |
| "loss": 0.749, | |
| "step": 801 | |
| }, | |
| { | |
| "epoch": 2.4337600758653393, | |
| "grad_norm": 2.194967031478882, | |
| "learning_rate": 2.0664665970876496e-06, | |
| "loss": 0.7395, | |
| "step": 802 | |
| }, | |
| { | |
| "epoch": 2.4367946894262684, | |
| "grad_norm": 2.1779520511627197, | |
| "learning_rate": 2.0449793491441026e-06, | |
| "loss": 0.7476, | |
| "step": 803 | |
| }, | |
| { | |
| "epoch": 2.4398293029871976, | |
| "grad_norm": 2.243781805038452, | |
| "learning_rate": 2.0235916680287015e-06, | |
| "loss": 0.7268, | |
| "step": 804 | |
| }, | |
| { | |
| "epoch": 2.442863916548127, | |
| "grad_norm": 2.1497793197631836, | |
| "learning_rate": 2.0023038214344827e-06, | |
| "loss": 0.76, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 2.4458985301090563, | |
| "grad_norm": 2.209360122680664, | |
| "learning_rate": 1.9811160758049163e-06, | |
| "loss": 0.7133, | |
| "step": 806 | |
| }, | |
| { | |
| "epoch": 2.448933143669986, | |
| "grad_norm": 2.1285481452941895, | |
| "learning_rate": 1.960028696330596e-06, | |
| "loss": 0.7349, | |
| "step": 807 | |
| }, | |
| { | |
| "epoch": 2.451967757230915, | |
| "grad_norm": 2.16249680519104, | |
| "learning_rate": 1.9390419469459066e-06, | |
| "loss": 0.7392, | |
| "step": 808 | |
| }, | |
| { | |
| "epoch": 2.4550023707918447, | |
| "grad_norm": 2.1789779663085938, | |
| "learning_rate": 1.9181560903257234e-06, | |
| "loss": 0.7299, | |
| "step": 809 | |
| }, | |
| { | |
| "epoch": 2.458036984352774, | |
| "grad_norm": 2.196904420852661, | |
| "learning_rate": 1.8973713878821343e-06, | |
| "loss": 0.74, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 2.461071597913703, | |
| "grad_norm": 2.087092399597168, | |
| "learning_rate": 1.8766880997611424e-06, | |
| "loss": 0.7503, | |
| "step": 811 | |
| }, | |
| { | |
| "epoch": 2.4641062114746326, | |
| "grad_norm": 2.023298978805542, | |
| "learning_rate": 1.8561064848394384e-06, | |
| "loss": 0.7409, | |
| "step": 812 | |
| }, | |
| { | |
| "epoch": 2.4671408250355618, | |
| "grad_norm": 2.2470693588256836, | |
| "learning_rate": 1.8356268007211442e-06, | |
| "loss": 0.7553, | |
| "step": 813 | |
| }, | |
| { | |
| "epoch": 2.4701754385964914, | |
| "grad_norm": 2.0922911167144775, | |
| "learning_rate": 1.8152493037345942e-06, | |
| "loss": 0.7257, | |
| "step": 814 | |
| }, | |
| { | |
| "epoch": 2.4732100521574205, | |
| "grad_norm": 2.100867986679077, | |
| "learning_rate": 1.7949742489291256e-06, | |
| "loss": 0.7275, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 2.47624466571835, | |
| "grad_norm": 2.309476375579834, | |
| "learning_rate": 1.7748018900718856e-06, | |
| "loss": 0.7349, | |
| "step": 816 | |
| }, | |
| { | |
| "epoch": 2.4792792792792793, | |
| "grad_norm": 2.1686556339263916, | |
| "learning_rate": 1.7547324796446553e-06, | |
| "loss": 0.746, | |
| "step": 817 | |
| }, | |
| { | |
| "epoch": 2.4823138928402084, | |
| "grad_norm": 2.161126136779785, | |
| "learning_rate": 1.7347662688406907e-06, | |
| "loss": 0.7637, | |
| "step": 818 | |
| }, | |
| { | |
| "epoch": 2.485348506401138, | |
| "grad_norm": 2.0931811332702637, | |
| "learning_rate": 1.7149035075615795e-06, | |
| "loss": 0.7417, | |
| "step": 819 | |
| }, | |
| { | |
| "epoch": 2.488383119962067, | |
| "grad_norm": 2.220621347427368, | |
| "learning_rate": 1.6951444444141084e-06, | |
| "loss": 0.7426, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 2.491417733522997, | |
| "grad_norm": 2.1453609466552734, | |
| "learning_rate": 1.6754893267071593e-06, | |
| "loss": 0.7277, | |
| "step": 821 | |
| }, | |
| { | |
| "epoch": 2.494452347083926, | |
| "grad_norm": 2.204213857650757, | |
| "learning_rate": 1.6559384004486057e-06, | |
| "loss": 0.7456, | |
| "step": 822 | |
| }, | |
| { | |
| "epoch": 2.4974869606448555, | |
| "grad_norm": 2.1252691745758057, | |
| "learning_rate": 1.6364919103422394e-06, | |
| "loss": 0.7423, | |
| "step": 823 | |
| }, | |
| { | |
| "epoch": 2.5005215742057847, | |
| "grad_norm": 2.285757064819336, | |
| "learning_rate": 1.6171500997847056e-06, | |
| "loss": 0.7377, | |
| "step": 824 | |
| }, | |
| { | |
| "epoch": 2.503556187766714, | |
| "grad_norm": 2.1519298553466797, | |
| "learning_rate": 1.5979132108624572e-06, | |
| "loss": 0.7397, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 2.5065908013276434, | |
| "grad_norm": 2.1684153079986572, | |
| "learning_rate": 1.5787814843487226e-06, | |
| "loss": 0.7355, | |
| "step": 826 | |
| }, | |
| { | |
| "epoch": 2.509625414888573, | |
| "grad_norm": 2.1787893772125244, | |
| "learning_rate": 1.5597551597004968e-06, | |
| "loss": 0.7356, | |
| "step": 827 | |
| }, | |
| { | |
| "epoch": 2.512660028449502, | |
| "grad_norm": 2.22249436378479, | |
| "learning_rate": 1.5408344750555382e-06, | |
| "loss": 0.7439, | |
| "step": 828 | |
| }, | |
| { | |
| "epoch": 2.5156946420104314, | |
| "grad_norm": 2.2051734924316406, | |
| "learning_rate": 1.522019667229393e-06, | |
| "loss": 0.7271, | |
| "step": 829 | |
| }, | |
| { | |
| "epoch": 2.518729255571361, | |
| "grad_norm": 2.0986456871032715, | |
| "learning_rate": 1.5033109717124284e-06, | |
| "loss": 0.7385, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 2.52176386913229, | |
| "grad_norm": 2.115734100341797, | |
| "learning_rate": 1.4847086226668871e-06, | |
| "loss": 0.7317, | |
| "step": 831 | |
| }, | |
| { | |
| "epoch": 2.5247984826932193, | |
| "grad_norm": 2.02058482170105, | |
| "learning_rate": 1.4662128529239572e-06, | |
| "loss": 0.7524, | |
| "step": 832 | |
| }, | |
| { | |
| "epoch": 2.527833096254149, | |
| "grad_norm": 2.1982414722442627, | |
| "learning_rate": 1.4478238939808454e-06, | |
| "loss": 0.7409, | |
| "step": 833 | |
| }, | |
| { | |
| "epoch": 2.5308677098150785, | |
| "grad_norm": 2.1642115116119385, | |
| "learning_rate": 1.4295419759979079e-06, | |
| "loss": 0.7365, | |
| "step": 834 | |
| }, | |
| { | |
| "epoch": 2.5339023233760076, | |
| "grad_norm": 2.118788003921509, | |
| "learning_rate": 1.4113673277957395e-06, | |
| "loss": 0.7276, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 2.536936936936937, | |
| "grad_norm": 2.0929243564605713, | |
| "learning_rate": 1.393300176852327e-06, | |
| "loss": 0.7449, | |
| "step": 836 | |
| }, | |
| { | |
| "epoch": 2.5399715504978664, | |
| "grad_norm": 2.1339359283447266, | |
| "learning_rate": 1.3753407493001968e-06, | |
| "loss": 0.7241, | |
| "step": 837 | |
| }, | |
| { | |
| "epoch": 2.5430061640587955, | |
| "grad_norm": 2.15981125831604, | |
| "learning_rate": 1.3574892699235798e-06, | |
| "loss": 0.7373, | |
| "step": 838 | |
| }, | |
| { | |
| "epoch": 2.546040777619725, | |
| "grad_norm": 2.184105396270752, | |
| "learning_rate": 1.339745962155613e-06, | |
| "loss": 0.7283, | |
| "step": 839 | |
| }, | |
| { | |
| "epoch": 2.5490753911806543, | |
| "grad_norm": 2.0236942768096924, | |
| "learning_rate": 1.3221110480755306e-06, | |
| "loss": 0.7279, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 2.552110004741584, | |
| "grad_norm": 2.0451905727386475, | |
| "learning_rate": 1.3045847484058748e-06, | |
| "loss": 0.7417, | |
| "step": 841 | |
| }, | |
| { | |
| "epoch": 2.555144618302513, | |
| "grad_norm": 2.113330602645874, | |
| "learning_rate": 1.287167282509767e-06, | |
| "loss": 0.7604, | |
| "step": 842 | |
| }, | |
| { | |
| "epoch": 2.558179231863442, | |
| "grad_norm": 2.0830495357513428, | |
| "learning_rate": 1.2698588683881185e-06, | |
| "loss": 0.7579, | |
| "step": 843 | |
| }, | |
| { | |
| "epoch": 2.561213845424372, | |
| "grad_norm": 2.065068483352661, | |
| "learning_rate": 1.252659722676941e-06, | |
| "loss": 0.7268, | |
| "step": 844 | |
| }, | |
| { | |
| "epoch": 2.564248458985301, | |
| "grad_norm": 2.157485008239746, | |
| "learning_rate": 1.2355700606446119e-06, | |
| "loss": 0.7305, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 2.5672830725462306, | |
| "grad_norm": 2.135056257247925, | |
| "learning_rate": 1.2185900961891794e-06, | |
| "loss": 0.7332, | |
| "step": 846 | |
| }, | |
| { | |
| "epoch": 2.5703176861071597, | |
| "grad_norm": 2.1909451484680176, | |
| "learning_rate": 1.2017200418357077e-06, | |
| "loss": 0.7402, | |
| "step": 847 | |
| }, | |
| { | |
| "epoch": 2.5733522996680893, | |
| "grad_norm": 2.2838058471679688, | |
| "learning_rate": 1.184960108733586e-06, | |
| "loss": 0.7336, | |
| "step": 848 | |
| }, | |
| { | |
| "epoch": 2.5763869132290185, | |
| "grad_norm": 2.0348660945892334, | |
| "learning_rate": 1.1683105066539068e-06, | |
| "loss": 0.7367, | |
| "step": 849 | |
| }, | |
| { | |
| "epoch": 2.5794215267899476, | |
| "grad_norm": 2.088353157043457, | |
| "learning_rate": 1.151771443986842e-06, | |
| "loss": 0.749, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.5794215267899476, | |
| "eval_loss": null, | |
| "eval_runtime": 205.2076, | |
| "eval_samples_per_second": 20.262, | |
| "eval_steps_per_second": 5.068, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.5824561403508772, | |
| "grad_norm": 2.1730659008026123, | |
| "learning_rate": 1.1353431277390125e-06, | |
| "loss": 0.7359, | |
| "step": 851 | |
| }, | |
| { | |
| "epoch": 2.5854907539118064, | |
| "grad_norm": 2.10697078704834, | |
| "learning_rate": 1.1190257635309276e-06, | |
| "loss": 0.7223, | |
| "step": 852 | |
| }, | |
| { | |
| "epoch": 2.588525367472736, | |
| "grad_norm": 2.1502668857574463, | |
| "learning_rate": 1.1028195555943877e-06, | |
| "loss": 0.7265, | |
| "step": 853 | |
| }, | |
| { | |
| "epoch": 2.591559981033665, | |
| "grad_norm": 2.0735018253326416, | |
| "learning_rate": 1.0867247067699315e-06, | |
| "loss": 0.7345, | |
| "step": 854 | |
| }, | |
| { | |
| "epoch": 2.5945945945945947, | |
| "grad_norm": 2.1325292587280273, | |
| "learning_rate": 1.0707414185043163e-06, | |
| "loss": 0.746, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 2.597629208155524, | |
| "grad_norm": 2.1418328285217285, | |
| "learning_rate": 1.0548698908479671e-06, | |
| "loss": 0.7401, | |
| "step": 856 | |
| }, | |
| { | |
| "epoch": 2.600663821716453, | |
| "grad_norm": 2.061847448348999, | |
| "learning_rate": 1.0391103224524957e-06, | |
| "loss": 0.7319, | |
| "step": 857 | |
| }, | |
| { | |
| "epoch": 2.6036984352773827, | |
| "grad_norm": 2.1931376457214355, | |
| "learning_rate": 1.0234629105682104e-06, | |
| "loss": 0.7429, | |
| "step": 858 | |
| }, | |
| { | |
| "epoch": 2.6067330488383122, | |
| "grad_norm": 2.1396734714508057, | |
| "learning_rate": 1.0079278510416313e-06, | |
| "loss": 0.7369, | |
| "step": 859 | |
| }, | |
| { | |
| "epoch": 2.6097676623992414, | |
| "grad_norm": 2.2338297367095947, | |
| "learning_rate": 9.925053383130667e-07, | |
| "loss": 0.745, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 2.6128022759601706, | |
| "grad_norm": 2.120897054672241, | |
| "learning_rate": 9.771955654141496e-07, | |
| "loss": 0.7444, | |
| "step": 861 | |
| }, | |
| { | |
| "epoch": 2.6158368895211, | |
| "grad_norm": 2.204875946044922, | |
| "learning_rate": 9.619987239654405e-07, | |
| "loss": 0.7293, | |
| "step": 862 | |
| }, | |
| { | |
| "epoch": 2.6188715030820293, | |
| "grad_norm": 2.228181838989258, | |
| "learning_rate": 9.469150041740338e-07, | |
| "loss": 0.7251, | |
| "step": 863 | |
| }, | |
| { | |
| "epoch": 2.6219061166429585, | |
| "grad_norm": 2.170053720474243, | |
| "learning_rate": 9.319445948311534e-07, | |
| "loss": 0.7444, | |
| "step": 864 | |
| }, | |
| { | |
| "epoch": 2.624940730203888, | |
| "grad_norm": 2.163527011871338, | |
| "learning_rate": 9.170876833098119e-07, | |
| "loss": 0.7568, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 2.6279753437648177, | |
| "grad_norm": 2.1692657470703125, | |
| "learning_rate": 9.023444555624572e-07, | |
| "loss": 0.736, | |
| "step": 866 | |
| }, | |
| { | |
| "epoch": 2.631009957325747, | |
| "grad_norm": 2.0737569332122803, | |
| "learning_rate": 8.87715096118642e-07, | |
| "loss": 0.7368, | |
| "step": 867 | |
| }, | |
| { | |
| "epoch": 2.634044570886676, | |
| "grad_norm": 2.05617094039917, | |
| "learning_rate": 8.731997880827258e-07, | |
| "loss": 0.7334, | |
| "step": 868 | |
| }, | |
| { | |
| "epoch": 2.6370791844476056, | |
| "grad_norm": 1.9645661115646362, | |
| "learning_rate": 8.587987131315656e-07, | |
| "loss": 0.7422, | |
| "step": 869 | |
| }, | |
| { | |
| "epoch": 2.6401137980085347, | |
| "grad_norm": 2.0881567001342773, | |
| "learning_rate": 8.445120515122552e-07, | |
| "loss": 0.7296, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 2.6431484115694643, | |
| "grad_norm": 2.2319769859313965, | |
| "learning_rate": 8.303399820398672e-07, | |
| "loss": 0.7502, | |
| "step": 871 | |
| }, | |
| { | |
| "epoch": 2.6461830251303935, | |
| "grad_norm": 2.1117403507232666, | |
| "learning_rate": 8.162826820952097e-07, | |
| "loss": 0.7319, | |
| "step": 872 | |
| }, | |
| { | |
| "epoch": 2.649217638691323, | |
| "grad_norm": 2.056861400604248, | |
| "learning_rate": 8.023403276226127e-07, | |
| "loss": 0.7429, | |
| "step": 873 | |
| }, | |
| { | |
| "epoch": 2.6522522522522523, | |
| "grad_norm": 2.140435218811035, | |
| "learning_rate": 7.885130931277219e-07, | |
| "loss": 0.7418, | |
| "step": 874 | |
| }, | |
| { | |
| "epoch": 2.6552868658131814, | |
| "grad_norm": 2.11114239692688, | |
| "learning_rate": 7.74801151675314e-07, | |
| "loss": 0.7521, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 2.658321479374111, | |
| "grad_norm": 2.0694892406463623, | |
| "learning_rate": 7.612046748871327e-07, | |
| "loss": 0.7385, | |
| "step": 876 | |
| }, | |
| { | |
| "epoch": 2.66135609293504, | |
| "grad_norm": 2.155579090118408, | |
| "learning_rate": 7.477238329397419e-07, | |
| "loss": 0.7427, | |
| "step": 877 | |
| }, | |
| { | |
| "epoch": 2.6643907064959698, | |
| "grad_norm": 2.223646402359009, | |
| "learning_rate": 7.343587945623908e-07, | |
| "loss": 0.731, | |
| "step": 878 | |
| }, | |
| { | |
| "epoch": 2.667425320056899, | |
| "grad_norm": 2.047801971435547, | |
| "learning_rate": 7.211097270349065e-07, | |
| "loss": 0.7426, | |
| "step": 879 | |
| }, | |
| { | |
| "epoch": 2.6704599336178285, | |
| "grad_norm": 2.119241714477539, | |
| "learning_rate": 7.07976796185601e-07, | |
| "loss": 0.7407, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 2.6734945471787577, | |
| "grad_norm": 2.2282469272613525, | |
| "learning_rate": 6.949601663891891e-07, | |
| "loss": 0.7541, | |
| "step": 881 | |
| }, | |
| { | |
| "epoch": 2.676529160739687, | |
| "grad_norm": 2.175909996032715, | |
| "learning_rate": 6.820600005647382e-07, | |
| "loss": 0.7461, | |
| "step": 882 | |
| }, | |
| { | |
| "epoch": 2.6795637743006164, | |
| "grad_norm": 2.047321081161499, | |
| "learning_rate": 6.692764601736268e-07, | |
| "loss": 0.7197, | |
| "step": 883 | |
| }, | |
| { | |
| "epoch": 2.682598387861546, | |
| "grad_norm": 2.0702617168426514, | |
| "learning_rate": 6.566097052175213e-07, | |
| "loss": 0.7656, | |
| "step": 884 | |
| }, | |
| { | |
| "epoch": 2.685633001422475, | |
| "grad_norm": 2.1683459281921387, | |
| "learning_rate": 6.440598942363796e-07, | |
| "loss": 0.7521, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 2.6886676149834043, | |
| "grad_norm": 2.067746162414551, | |
| "learning_rate": 6.316271843064536e-07, | |
| "loss": 0.7531, | |
| "step": 886 | |
| }, | |
| { | |
| "epoch": 2.691702228544334, | |
| "grad_norm": 2.1310875415802, | |
| "learning_rate": 6.193117310383412e-07, | |
| "loss": 0.7538, | |
| "step": 887 | |
| }, | |
| { | |
| "epoch": 2.694736842105263, | |
| "grad_norm": 2.1403868198394775, | |
| "learning_rate": 6.071136885750272e-07, | |
| "loss": 0.7373, | |
| "step": 888 | |
| }, | |
| { | |
| "epoch": 2.6977714556661923, | |
| "grad_norm": 2.049807071685791, | |
| "learning_rate": 5.950332095899547e-07, | |
| "loss": 0.738, | |
| "step": 889 | |
| }, | |
| { | |
| "epoch": 2.700806069227122, | |
| "grad_norm": 2.080238103866577, | |
| "learning_rate": 5.830704452851166e-07, | |
| "loss": 0.7212, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 2.7038406827880515, | |
| "grad_norm": 2.1021158695220947, | |
| "learning_rate": 5.71225545389158e-07, | |
| "loss": 0.7411, | |
| "step": 891 | |
| }, | |
| { | |
| "epoch": 2.7068752963489806, | |
| "grad_norm": 2.0948758125305176, | |
| "learning_rate": 5.594986581555173e-07, | |
| "loss": 0.7369, | |
| "step": 892 | |
| }, | |
| { | |
| "epoch": 2.7099099099099098, | |
| "grad_norm": 2.1047518253326416, | |
| "learning_rate": 5.478899303605512e-07, | |
| "loss": 0.7289, | |
| "step": 893 | |
| }, | |
| { | |
| "epoch": 2.7129445234708394, | |
| "grad_norm": 2.1207687854766846, | |
| "learning_rate": 5.363995073017047e-07, | |
| "loss": 0.7385, | |
| "step": 894 | |
| }, | |
| { | |
| "epoch": 2.7159791370317685, | |
| "grad_norm": 2.051896572113037, | |
| "learning_rate": 5.250275327957033e-07, | |
| "loss": 0.7244, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 2.719013750592698, | |
| "grad_norm": 2.220273017883301, | |
| "learning_rate": 5.137741491767345e-07, | |
| "loss": 0.7522, | |
| "step": 896 | |
| }, | |
| { | |
| "epoch": 2.7220483641536273, | |
| "grad_norm": 2.1100893020629883, | |
| "learning_rate": 5.026394972946813e-07, | |
| "loss": 0.7276, | |
| "step": 897 | |
| }, | |
| { | |
| "epoch": 2.725082977714557, | |
| "grad_norm": 2.300666332244873, | |
| "learning_rate": 4.91623716513352e-07, | |
| "loss": 0.7404, | |
| "step": 898 | |
| }, | |
| { | |
| "epoch": 2.728117591275486, | |
| "grad_norm": 2.2343952655792236, | |
| "learning_rate": 4.807269447087348e-07, | |
| "loss": 0.7474, | |
| "step": 899 | |
| }, | |
| { | |
| "epoch": 2.731152204836415, | |
| "grad_norm": 2.1582469940185547, | |
| "learning_rate": 4.6994931826728094e-07, | |
| "loss": 0.7275, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.731152204836415, | |
|             "eval_loss": null, | |
| "eval_runtime": 204.8855, | |
| "eval_samples_per_second": 20.294, | |
| "eval_steps_per_second": 5.076, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.734186818397345, | |
| "grad_norm": 2.057573080062866, | |
| "learning_rate": 4.592909720841843e-07, | |
| "loss": 0.7351, | |
| "step": 901 | |
| }, | |
| { | |
| "epoch": 2.737221431958274, | |
| "grad_norm": 2.320993423461914, | |
| "learning_rate": 4.487520395617029e-07, | |
| "loss": 0.7086, | |
| "step": 902 | |
| }, | |
| { | |
| "epoch": 2.7402560455192035, | |
| "grad_norm": 2.2511098384857178, | |
| "learning_rate": 4.3833265260749157e-07, | |
| "loss": 0.723, | |
| "step": 903 | |
| }, | |
| { | |
| "epoch": 2.7432906590801327, | |
| "grad_norm": 2.182513952255249, | |
| "learning_rate": 4.280329416329365e-07, | |
| "loss": 0.7196, | |
| "step": 904 | |
| }, | |
| { | |
| "epoch": 2.7463252726410623, | |
| "grad_norm": 2.1579484939575195, | |
| "learning_rate": 4.178530355515409e-07, | |
| "loss": 0.7334, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 2.7493598862019915, | |
| "grad_norm": 2.091196060180664, | |
| "learning_rate": 4.077930617773007e-07, | |
| "loss": 0.7387, | |
| "step": 906 | |
| }, | |
| { | |
| "epoch": 2.7523944997629206, | |
| "grad_norm": 2.1611766815185547, | |
| "learning_rate": 3.97853146223105e-07, | |
| "loss": 0.7515, | |
| "step": 907 | |
| }, | |
| { | |
| "epoch": 2.75542911332385, | |
| "grad_norm": 2.240387201309204, | |
| "learning_rate": 3.880334132991792e-07, | |
| "loss": 0.7377, | |
| "step": 908 | |
| }, | |
| { | |
| "epoch": 2.7584637268847794, | |
| "grad_norm": 2.1546630859375, | |
| "learning_rate": 3.783339859115065e-07, | |
| "loss": 0.7388, | |
| "step": 909 | |
| }, | |
| { | |
| "epoch": 2.761498340445709, | |
| "grad_norm": 2.0892395973205566, | |
| "learning_rate": 3.687549854603023e-07, | |
| "loss": 0.7488, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 2.764532954006638, | |
| "grad_norm": 2.182509422302246, | |
| "learning_rate": 3.5929653183849444e-07, | |
| "loss": 0.736, | |
| "step": 911 | |
| }, | |
| { | |
| "epoch": 2.7675675675675677, | |
| "grad_norm": 2.198559522628784, | |
| "learning_rate": 3.49958743430211e-07, | |
| "loss": 0.736, | |
| "step": 912 | |
| }, | |
| { | |
| "epoch": 2.770602181128497, | |
| "grad_norm": 2.188133955001831, | |
| "learning_rate": 3.4074173710931804e-07, | |
| "loss": 0.7345, | |
| "step": 913 | |
| }, | |
| { | |
| "epoch": 2.773636794689426, | |
| "grad_norm": 2.133892774581909, | |
| "learning_rate": 3.3164562823793654e-07, | |
| "loss": 0.7275, | |
| "step": 914 | |
| }, | |
| { | |
| "epoch": 2.7766714082503556, | |
| "grad_norm": 2.159824848175049, | |
| "learning_rate": 3.226705306650113e-07, | |
| "loss": 0.7198, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 2.7797060218112852, | |
| "grad_norm": 2.1087584495544434, | |
| "learning_rate": 3.1381655672488634e-07, | |
| "loss": 0.7381, | |
| "step": 916 | |
| }, | |
| { | |
| "epoch": 2.7827406353722144, | |
| "grad_norm": 2.0807528495788574, | |
| "learning_rate": 3.050838172358883e-07, | |
| "loss": 0.7386, | |
| "step": 917 | |
| }, | |
| { | |
| "epoch": 2.7857752489331435, | |
| "grad_norm": 2.1473801136016846, | |
| "learning_rate": 2.9647242149895005e-07, | |
| "loss": 0.7336, | |
| "step": 918 | |
| }, | |
| { | |
| "epoch": 2.788809862494073, | |
| "grad_norm": 2.219571352005005, | |
| "learning_rate": 2.879824772962381e-07, | |
| "loss": 0.7438, | |
| "step": 919 | |
| }, | |
| { | |
| "epoch": 2.7918444760550023, | |
| "grad_norm": 2.144059419631958, | |
| "learning_rate": 2.796140908898026e-07, | |
| "loss": 0.7338, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 2.7948790896159315, | |
| "grad_norm": 2.1552734375, | |
| "learning_rate": 2.7136736702025436e-07, | |
| "loss": 0.7345, | |
| "step": 921 | |
| }, | |
| { | |
| "epoch": 2.797913703176861, | |
| "grad_norm": 2.18730092048645, | |
| "learning_rate": 2.632424089054419e-07, | |
| "loss": 0.7222, | |
| "step": 922 | |
| }, | |
| { | |
| "epoch": 2.8009483167377907, | |
| "grad_norm": 2.233304500579834, | |
| "learning_rate": 2.552393182391677e-07, | |
| "loss": 0.7474, | |
| "step": 923 | |
| }, | |
| { | |
| "epoch": 2.80398293029872, | |
| "grad_norm": 2.041757583618164, | |
| "learning_rate": 2.473581951899184e-07, | |
| "loss": 0.7329, | |
| "step": 924 | |
| }, | |
| { | |
| "epoch": 2.807017543859649, | |
| "grad_norm": 2.1639747619628906, | |
| "learning_rate": 2.395991383995999e-07, | |
| "loss": 0.748, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 2.8100521574205786, | |
| "grad_norm": 2.0612905025482178, | |
| "learning_rate": 2.3196224498231447e-07, | |
| "loss": 0.7311, | |
| "step": 926 | |
| }, | |
| { | |
| "epoch": 2.8130867709815077, | |
| "grad_norm": 2.260965347290039, | |
| "learning_rate": 2.2444761052313857e-07, | |
| "loss": 0.7353, | |
| "step": 927 | |
| }, | |
| { | |
| "epoch": 2.8161213845424373, | |
| "grad_norm": 2.1008665561676025, | |
| "learning_rate": 2.1705532907692617e-07, | |
| "loss": 0.7677, | |
| "step": 928 | |
| }, | |
| { | |
| "epoch": 2.8191559981033665, | |
| "grad_norm": 2.20302414894104, | |
| "learning_rate": 2.0978549316713615e-07, | |
| "loss": 0.7447, | |
| "step": 929 | |
| }, | |
| { | |
| "epoch": 2.822190611664296, | |
| "grad_norm": 2.100362539291382, | |
| "learning_rate": 2.0263819378466888e-07, | |
| "loss": 0.7393, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 2.8252252252252252, | |
| "grad_norm": 2.2228076457977295, | |
| "learning_rate": 1.9561352038673264e-07, | |
| "loss": 0.7501, | |
| "step": 931 | |
| }, | |
| { | |
| "epoch": 2.8282598387861544, | |
| "grad_norm": 2.2403063774108887, | |
| "learning_rate": 1.8871156089572018e-07, | |
| "loss": 0.7508, | |
| "step": 932 | |
| }, | |
| { | |
| "epoch": 2.831294452347084, | |
| "grad_norm": 2.1292290687561035, | |
| "learning_rate": 1.8193240169810943e-07, | |
| "loss": 0.7362, | |
| "step": 933 | |
| }, | |
| { | |
| "epoch": 2.834329065908013, | |
| "grad_norm": 2.029907703399658, | |
| "learning_rate": 1.752761276433801e-07, | |
| "loss": 0.7504, | |
| "step": 934 | |
| }, | |
| { | |
| "epoch": 2.8373636794689427, | |
| "grad_norm": 2.175046682357788, | |
| "learning_rate": 1.6874282204295765e-07, | |
| "loss": 0.7437, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 2.840398293029872, | |
| "grad_norm": 2.150054693222046, | |
| "learning_rate": 1.623325666691644e-07, | |
| "loss": 0.7388, | |
| "step": 936 | |
| }, | |
| { | |
| "epoch": 2.8434329065908015, | |
| "grad_norm": 2.2034387588500977, | |
| "learning_rate": 1.5604544175419901e-07, | |
| "loss": 0.7451, | |
| "step": 937 | |
| }, | |
| { | |
| "epoch": 2.8464675201517307, | |
| "grad_norm": 2.1042325496673584, | |
| "learning_rate": 1.4988152598913063e-07, | |
| "loss": 0.7535, | |
| "step": 938 | |
| }, | |
| { | |
| "epoch": 2.84950213371266, | |
| "grad_norm": 2.1048696041107178, | |
| "learning_rate": 1.4384089652291544e-07, | |
| "loss": 0.745, | |
| "step": 939 | |
| }, | |
| { | |
| "epoch": 2.8525367472735894, | |
| "grad_norm": 2.3105156421661377, | |
| "learning_rate": 1.3792362896143164e-07, | |
| "loss": 0.7233, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 2.8555713608345186, | |
| "grad_norm": 2.1417715549468994, | |
| "learning_rate": 1.3212979736653142e-07, | |
| "loss": 0.7467, | |
| "step": 941 | |
| }, | |
| { | |
| "epoch": 2.858605974395448, | |
| "grad_norm": 2.1809537410736084, | |
| "learning_rate": 1.2645947425511397e-07, | |
| "loss": 0.7432, | |
| "step": 942 | |
| }, | |
| { | |
| "epoch": 2.8616405879563773, | |
| "grad_norm": 2.2112855911254883, | |
| "learning_rate": 1.209127305982205e-07, | |
| "loss": 0.7487, | |
| "step": 943 | |
| }, | |
| { | |
| "epoch": 2.864675201517307, | |
| "grad_norm": 2.059002161026001, | |
| "learning_rate": 1.1548963582013961e-07, | |
| "loss": 0.734, | |
| "step": 944 | |
| }, | |
| { | |
| "epoch": 2.867709815078236, | |
| "grad_norm": 2.04377818107605, | |
| "learning_rate": 1.1019025779754666e-07, | |
| "loss": 0.7213, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 2.8707444286391652, | |
| "grad_norm": 2.0912530422210693, | |
| "learning_rate": 1.0501466285865124e-07, | |
| "loss": 0.7548, | |
| "step": 946 | |
| }, | |
| { | |
| "epoch": 2.873779042200095, | |
| "grad_norm": 2.2096190452575684, | |
| "learning_rate": 9.996291578236228e-08, | |
| "loss": 0.7246, | |
| "step": 947 | |
| }, | |
| { | |
| "epoch": 2.8768136557610244, | |
| "grad_norm": 2.284653663635254, | |
| "learning_rate": 9.503507979748305e-08, | |
| "loss": 0.7339, | |
| "step": 948 | |
| }, | |
| { | |
| "epoch": 2.8798482693219536, | |
| "grad_norm": 2.097752332687378, | |
| "learning_rate": 9.023121658191636e-08, | |
| "loss": 0.7303, | |
| "step": 949 | |
| }, | |
| { | |
| "epoch": 2.8828828828828827, | |
| "grad_norm": 2.171391010284424, | |
| "learning_rate": 8.555138626189619e-08, | |
| "loss": 0.7503, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 2.8828828828828827, | |
|             "eval_loss": null, | |
| "eval_runtime": 204.3654, | |
| "eval_samples_per_second": 20.346, | |
| "eval_steps_per_second": 5.089, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 2.8859174964438123, | |
| "grad_norm": 2.1366286277770996, | |
| "learning_rate": 8.099564741123167e-08, | |
| "loss": 0.7241, | |
| "step": 951 | |
| }, | |
| { | |
| "epoch": 2.8889521100047415, | |
| "grad_norm": 2.0811874866485596, | |
| "learning_rate": 7.656405705057434e-08, | |
| "loss": 0.7504, | |
| "step": 952 | |
| }, | |
| { | |
| "epoch": 2.8919867235656707, | |
| "grad_norm": 2.080226421356201, | |
| "learning_rate": 7.225667064670761e-08, | |
| "loss": 0.7355, | |
| "step": 953 | |
| }, | |
| { | |
| "epoch": 2.8950213371266003, | |
| "grad_norm": 2.200331211090088, | |
| "learning_rate": 6.807354211184613e-08, | |
| "loss": 0.7427, | |
| "step": 954 | |
| }, | |
| { | |
| "epoch": 2.89805595068753, | |
| "grad_norm": 2.038591146469116, | |
| "learning_rate": 6.401472380297091e-08, | |
| "loss": 0.7379, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 2.901090564248459, | |
| "grad_norm": 2.1141886711120605, | |
| "learning_rate": 6.008026652116306e-08, | |
| "loss": 0.7376, | |
| "step": 956 | |
| }, | |
| { | |
| "epoch": 2.904125177809388, | |
| "grad_norm": 2.0803427696228027, | |
| "learning_rate": 5.6270219510975445e-08, | |
| "loss": 0.7424, | |
| "step": 957 | |
| }, | |
| { | |
| "epoch": 2.9071597913703178, | |
| "grad_norm": 2.0900638103485107, | |
| "learning_rate": 5.258463045981432e-08, | |
| "loss": 0.7243, | |
| "step": 958 | |
| }, | |
| { | |
| "epoch": 2.910194404931247, | |
| "grad_norm": 2.133312463760376, | |
| "learning_rate": 4.902354549733979e-08, | |
| "loss": 0.7257, | |
| "step": 959 | |
| }, | |
| { | |
| "epoch": 2.9132290184921765, | |
| "grad_norm": 2.2082343101501465, | |
| "learning_rate": 4.5587009194894005e-08, | |
| "loss": 0.7346, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 2.9162636320531057, | |
| "grad_norm": 2.0872416496276855, | |
| "learning_rate": 4.227506456493835e-08, | |
| "loss": 0.7437, | |
| "step": 961 | |
| }, | |
| { | |
| "epoch": 2.9192982456140353, | |
| "grad_norm": 2.136866807937622, | |
| "learning_rate": 3.908775306051604e-08, | |
| "loss": 0.7362, | |
| "step": 962 | |
| }, | |
| { | |
| "epoch": 2.9223328591749644, | |
| "grad_norm": 2.125523805618286, | |
| "learning_rate": 3.602511457473479e-08, | |
| "loss": 0.7298, | |
| "step": 963 | |
| }, | |
| { | |
| "epoch": 2.9253674727358936, | |
| "grad_norm": 2.055690050125122, | |
| "learning_rate": 3.3087187440268284e-08, | |
| "loss": 0.7285, | |
| "step": 964 | |
| }, | |
| { | |
| "epoch": 2.928402086296823, | |
| "grad_norm": 2.26811146736145, | |
| "learning_rate": 3.027400842887218e-08, | |
| "loss": 0.732, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 2.9314366998577523, | |
| "grad_norm": 2.0984137058258057, | |
| "learning_rate": 2.758561275092886e-08, | |
| "loss": 0.7238, | |
| "step": 966 | |
| }, | |
| { | |
| "epoch": 2.934471313418682, | |
| "grad_norm": 2.117258310317993, | |
| "learning_rate": 2.5022034055003363e-08, | |
| "loss": 0.7604, | |
| "step": 967 | |
| }, | |
| { | |
| "epoch": 2.937505926979611, | |
| "grad_norm": 2.135441541671753, | |
| "learning_rate": 2.2583304427421516e-08, | |
| "loss": 0.7358, | |
| "step": 968 | |
| }, | |
| { | |
| "epoch": 2.9405405405405407, | |
| "grad_norm": 2.1416280269622803, | |
| "learning_rate": 2.0269454391874665e-08, | |
| "loss": 0.7436, | |
| "step": 969 | |
| }, | |
| { | |
| "epoch": 2.94357515410147, | |
| "grad_norm": 2.1733927726745605, | |
| "learning_rate": 1.80805129090289e-08, | |
| "loss": 0.7308, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 2.946609767662399, | |
| "grad_norm": 2.1229727268218994, | |
| "learning_rate": 1.6016507376169776e-08, | |
| "loss": 0.7577, | |
| "step": 971 | |
| }, | |
| { | |
| "epoch": 2.9496443812233286, | |
| "grad_norm": 2.283834218978882, | |
| "learning_rate": 1.4077463626852584e-08, | |
| "loss": 0.748, | |
| "step": 972 | |
| }, | |
| { | |
| "epoch": 2.952678994784258, | |
| "grad_norm": 2.205937623977661, | |
| "learning_rate": 1.2263405930585947e-08, | |
| "loss": 0.7335, | |
| "step": 973 | |
| }, | |
| { | |
| "epoch": 2.9557136083451874, | |
| "grad_norm": 2.111551284790039, | |
| "learning_rate": 1.0574356992525403e-08, | |
| "loss": 0.7295, | |
| "step": 974 | |
| }, | |
| { | |
| "epoch": 2.9587482219061165, | |
| "grad_norm": 2.058469295501709, | |
| "learning_rate": 9.010337953185843e-09, | |
| "loss": 0.7547, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 2.961782835467046, | |
| "grad_norm": 2.1459245681762695, | |
| "learning_rate": 7.571368388181732e-09, | |
| "loss": 0.7368, | |
| "step": 976 | |
| }, | |
| { | |
| "epoch": 2.9648174490279753, | |
| "grad_norm": 2.1419780254364014, | |
| "learning_rate": 6.257466307980631e-09, | |
| "loss": 0.7364, | |
| "step": 977 | |
| }, | |
| { | |
| "epoch": 2.9678520625889044, | |
| "grad_norm": 2.1368930339813232, | |
| "learning_rate": 5.068648157675604e-09, | |
| "loss": 0.7243, | |
| "step": 978 | |
| }, | |
| { | |
| "epoch": 2.970886676149834, | |
| "grad_norm": 2.072345733642578, | |
| "learning_rate": 4.00492881678427e-09, | |
| "loss": 0.7229, | |
| "step": 979 | |
| }, | |
| { | |
| "epoch": 2.9739212897107636, | |
| "grad_norm": 2.173828363418579, | |
| "learning_rate": 3.0663215990534013e-09, | |
| "loss": 0.711, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 2.976955903271693, | |
| "grad_norm": 2.117293357849121, | |
| "learning_rate": 2.2528382523057115e-09, | |
| "loss": 0.7648, | |
| "step": 981 | |
| }, | |
| { | |
| "epoch": 2.979990516832622, | |
| "grad_norm": 2.126967191696167, | |
| "learning_rate": 1.564488958279986e-09, | |
| "loss": 0.7359, | |
| "step": 982 | |
| }, | |
| { | |
| "epoch": 2.9830251303935515, | |
| "grad_norm": 2.086338520050049, | |
| "learning_rate": 1.0012823325111776e-09, | |
| "loss": 0.7575, | |
| "step": 983 | |
| }, | |
| { | |
| "epoch": 2.9860597439544807, | |
| "grad_norm": 2.0174200534820557, | |
| "learning_rate": 5.632254242204926e-10, | |
| "loss": 0.7324, | |
| "step": 984 | |
| }, | |
| { | |
| "epoch": 2.9890943575154103, | |
| "grad_norm": 2.0716025829315186, | |
| "learning_rate": 2.503237162254646e-10, | |
| "loss": 0.7482, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 2.9921289710763395, | |
| "grad_norm": 2.0842347145080566, | |
| "learning_rate": 6.258112487667056e-11, | |
| "loss": 0.7334, | |
| "step": 986 | |
| }, | |
| { | |
| "epoch": 2.995163584637269, | |
| "grad_norm": 2.1061208248138428, | |
| "learning_rate": 0.0, | |
| "loss": 0.7299, | |
| "step": 987 | |
| }, | |
| { | |
| "epoch": 2.995163584637269, | |
| "step": 987, | |
| "total_flos": 4.415483185831556e+19, | |
| "train_loss": 0.7698779804849093, | |
| "train_runtime": 131039.1076, | |
| "train_samples_per_second": 3.863, | |
| "train_steps_per_second": 0.008 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 987, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.415483185831556e+19, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |