| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.18851918182675087, | |
| "eval_steps": 500, | |
| "global_step": 1000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00018851918182675087, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 5e-06, | |
| "loss": 2.7659, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.00037703836365350174, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 1e-05, | |
| "loss": 2.5842, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.0005655575454802526, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 1.5e-05, | |
| "loss": 2.8169, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.0007540767273070035, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 2e-05, | |
| "loss": 2.6938, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.0009425959091337543, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 2.5e-05, | |
| "loss": 2.7862, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.0011311150909605052, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 3e-05, | |
| "loss": 2.8844, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.0013196342727872562, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 3.5000000000000004e-05, | |
| "loss": 2.8254, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.001508153454614007, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 4e-05, | |
| "loss": 2.7735, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.001696672636440758, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 4.4999999999999996e-05, | |
| "loss": 2.8222, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.0018851918182675087, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 5e-05, | |
| "loss": 2.6943, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0020737110000942595, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 5.5e-05, | |
| "loss": 2.6735, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.0022622301819210104, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 6e-05, | |
| "loss": 2.6482, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.0024507493637477614, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 6.500000000000001e-05, | |
| "loss": 2.8788, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.0026392685455745124, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 7.000000000000001e-05, | |
| "loss": 2.7531, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.002827787727401263, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 7.5e-05, | |
| "loss": 2.7911, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.003016306909228014, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 8e-05, | |
| "loss": 2.7358, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.003204826091054765, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 8.5e-05, | |
| "loss": 2.7272, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.003393345272881516, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 8.999999999999999e-05, | |
| "loss": 2.7176, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.0035818644547082664, | |
| "grad_norm": 1.5, | |
| "learning_rate": 9.5e-05, | |
| "loss": 2.8573, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.0037703836365350174, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 0.0001, | |
| "loss": 2.7512, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.003958902818361768, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 0.000105, | |
| "loss": 2.7962, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.004147422000188519, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 0.00011, | |
| "loss": 2.7, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.00433594118201527, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 0.000115, | |
| "loss": 2.7128, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.004524460363842021, | |
| "grad_norm": 1.2890625, | |
| "learning_rate": 0.00012, | |
| "loss": 2.729, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.004712979545668771, | |
| "grad_norm": 1.25, | |
| "learning_rate": 0.000125, | |
| "loss": 2.698, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.004901498727495523, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 0.00013000000000000002, | |
| "loss": 2.7461, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.005090017909322273, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 0.000135, | |
| "loss": 2.7315, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.005278537091149025, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 0.00014000000000000001, | |
| "loss": 2.7089, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.005467056272975775, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 0.000145, | |
| "loss": 2.6724, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.005655575454802526, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 0.00015, | |
| "loss": 2.799, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.005844094636629277, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 0.000155, | |
| "loss": 2.7939, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.006032613818456028, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 0.00016, | |
| "loss": 2.8004, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.006221133000282778, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 0.000165, | |
| "loss": 2.6322, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.00640965218210953, | |
| "grad_norm": 1.0, | |
| "learning_rate": 0.00017, | |
| "loss": 2.7095, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.00659817136393628, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 0.000175, | |
| "loss": 2.7111, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.006786690545763032, | |
| "grad_norm": 0.91796875, | |
| "learning_rate": 0.00017999999999999998, | |
| "loss": 2.7666, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.006975209727589782, | |
| "grad_norm": 0.921875, | |
| "learning_rate": 0.000185, | |
| "loss": 2.7779, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.007163728909416533, | |
| "grad_norm": 0.91015625, | |
| "learning_rate": 0.00019, | |
| "loss": 2.7684, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.007352248091243284, | |
| "grad_norm": 0.875, | |
| "learning_rate": 0.00019500000000000002, | |
| "loss": 2.8674, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.007540767273070035, | |
| "grad_norm": 0.8515625, | |
| "learning_rate": 0.0002, | |
| "loss": 2.7694, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.007729286454896786, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 0.000205, | |
| "loss": 2.6799, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.007917805636723537, | |
| "grad_norm": 0.8046875, | |
| "learning_rate": 0.00021, | |
| "loss": 2.6289, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.008106324818550288, | |
| "grad_norm": 0.859375, | |
| "learning_rate": 0.000215, | |
| "loss": 2.7937, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.008294844000377038, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 0.00022, | |
| "loss": 2.78, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.00848336318220379, | |
| "grad_norm": 0.765625, | |
| "learning_rate": 0.00022500000000000002, | |
| "loss": 2.6351, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.00867188236403054, | |
| "grad_norm": 0.7890625, | |
| "learning_rate": 0.00023, | |
| "loss": 2.8156, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.00886040154585729, | |
| "grad_norm": 0.7890625, | |
| "learning_rate": 0.000235, | |
| "loss": 2.8304, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.009048920727684042, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 0.00024, | |
| "loss": 2.7148, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.009237439909510793, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 0.000245, | |
| "loss": 2.7169, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.009425959091337543, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 0.00025, | |
| "loss": 2.8345, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.009614478273164294, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.000255, | |
| "loss": 2.8149, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.009802997454991046, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 0.00026000000000000003, | |
| "loss": 2.8182, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.009991516636817797, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 0.00026500000000000004, | |
| "loss": 2.8114, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.010180035818644547, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.00027, | |
| "loss": 2.803, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.010368555000471298, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.000275, | |
| "loss": 2.7979, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.01055707418229805, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 0.00028000000000000003, | |
| "loss": 2.8062, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.0107455933641248, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 0.000285, | |
| "loss": 2.6728, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.01093411254595155, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.00029, | |
| "loss": 2.7547, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.011122631727778302, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.000295, | |
| "loss": 2.6773, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.011311150909605052, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0003, | |
| "loss": 2.7238, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.011499670091431803, | |
| "grad_norm": 0.77734375, | |
| "learning_rate": 0.000305, | |
| "loss": 2.6842, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.011688189273258555, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.00031, | |
| "loss": 2.8449, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.011876708455085304, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.000315, | |
| "loss": 2.6828, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.012065227636912056, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.00032, | |
| "loss": 2.7663, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.012253746818738807, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.00032500000000000004, | |
| "loss": 2.6127, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.012442266000565557, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.00033, | |
| "loss": 2.6333, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.012630785182392308, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.000335, | |
| "loss": 2.7669, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.01281930436421906, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 0.00034, | |
| "loss": 2.7363, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.013007823546045811, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.000345, | |
| "loss": 2.6626, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.01319634272787256, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.00035, | |
| "loss": 2.7896, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.013384861909699312, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.000355, | |
| "loss": 2.7407, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.013573381091526063, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.00035999999999999997, | |
| "loss": 2.804, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.013761900273352813, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.000365, | |
| "loss": 2.781, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.013950419455179565, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.00037, | |
| "loss": 2.5436, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.014138938637006316, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.000375, | |
| "loss": 2.7272, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.014327457818833066, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.00038, | |
| "loss": 2.6777, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.014515977000659817, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 0.00038500000000000003, | |
| "loss": 2.8211, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.014704496182486568, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.00039000000000000005, | |
| "loss": 2.7639, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.014893015364313318, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.000395, | |
| "loss": 2.6884, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.01508153454614007, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 0.0004, | |
| "loss": 2.6492, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.015270053727966821, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.00040500000000000003, | |
| "loss": 2.8072, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.015458572909793572, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.00041, | |
| "loss": 2.7446, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.015647092091620324, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.000415, | |
| "loss": 2.7554, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.015835611273447073, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.00042, | |
| "loss": 2.7212, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.016024130455273823, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.000425, | |
| "loss": 2.6933, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.016212649637100576, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.00043, | |
| "loss": 2.7461, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.016401168818927326, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.000435, | |
| "loss": 2.7079, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.016589688000754076, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.00044, | |
| "loss": 2.8562, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.01677820718258083, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.00044500000000000003, | |
| "loss": 2.6606, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.01696672636440758, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.00045000000000000004, | |
| "loss": 2.7817, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.017155245546234328, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 0.000455, | |
| "loss": 2.7714, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.01734376472806108, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.00046, | |
| "loss": 2.7217, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.01753228390988783, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.000465, | |
| "loss": 2.6855, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.01772080309171458, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.00047, | |
| "loss": 2.7111, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.017909322273541334, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.000475, | |
| "loss": 2.6868, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.018097841455368083, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.00048, | |
| "loss": 2.7355, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.018286360637194833, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.00048499999999999997, | |
| "loss": 2.7172, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.018474879819021586, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.00049, | |
| "loss": 2.8204, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.018663399000848336, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.000495, | |
| "loss": 2.6965, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.018851918182675086, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0005, | |
| "loss": 2.7988, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.01904043736450184, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.000505, | |
| "loss": 2.7069, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.01922895654632859, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.00051, | |
| "loss": 2.6942, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.019417475728155338, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.000515, | |
| "loss": 2.7497, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.01960599490998209, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0005200000000000001, | |
| "loss": 2.6381, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.01979451409180884, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0005250000000000001, | |
| "loss": 2.6969, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.019983033273635594, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0005300000000000001, | |
| "loss": 2.7247, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.020171552455462344, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.000535, | |
| "loss": 2.828, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.020360071637289093, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 0.00054, | |
| "loss": 2.7309, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.020548590819115847, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.000545, | |
| "loss": 2.8354, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.020737110000942596, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.00055, | |
| "loss": 2.8101, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.020925629182769346, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.000555, | |
| "loss": 2.7837, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.0211141483645961, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0005600000000000001, | |
| "loss": 2.6813, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.02130266754642285, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.000565, | |
| "loss": 2.7035, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.0214911867282496, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 0.00057, | |
| "loss": 2.6901, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.02167970591007635, | |
| "grad_norm": 0.75, | |
| "learning_rate": 0.000575, | |
| "loss": 2.7001, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.0218682250919031, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.00058, | |
| "loss": 2.7508, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.02205674427372985, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.000585, | |
| "loss": 2.7348, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.022245263455556604, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.00059, | |
| "loss": 2.7434, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.022433782637383354, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0005949999999999999, | |
| "loss": 2.6735, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.022622301819210103, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0006, | |
| "loss": 2.6258, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.022810821001036857, | |
| "grad_norm": 0.75, | |
| "learning_rate": 0.000605, | |
| "loss": 2.7676, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.022999340182863606, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.00061, | |
| "loss": 2.7045, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.023187859364690356, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.000615, | |
| "loss": 2.6322, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.02337637854651711, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.00062, | |
| "loss": 2.6953, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.02356489772834386, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.000625, | |
| "loss": 2.6045, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.02375341691017061, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.00063, | |
| "loss": 2.6551, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.02394193609199736, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.000635, | |
| "loss": 2.656, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.02413045527382411, | |
| "grad_norm": 0.81640625, | |
| "learning_rate": 0.00064, | |
| "loss": 2.791, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.02431897445565086, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0006450000000000001, | |
| "loss": 2.6599, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.024507493637477614, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 0.0006500000000000001, | |
| "loss": 2.633, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.024696012819304364, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 0.0006550000000000001, | |
| "loss": 2.6002, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.024884532001131113, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.00066, | |
| "loss": 2.7593, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.025073051182957867, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.000665, | |
| "loss": 2.706, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.025261570364784616, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.00067, | |
| "loss": 2.7094, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.02545008954661137, | |
| "grad_norm": 0.796875, | |
| "learning_rate": 0.000675, | |
| "loss": 2.6961, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.02563860872843812, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 0.00068, | |
| "loss": 2.7805, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.02582712791026487, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.0006850000000000001, | |
| "loss": 2.6559, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.026015647092091622, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.00069, | |
| "loss": 2.7455, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.02620416627391837, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.000695, | |
| "loss": 2.7533, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.02639268545574512, | |
| "grad_norm": 0.75, | |
| "learning_rate": 0.0007, | |
| "loss": 2.7434, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.026581204637571874, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.000705, | |
| "loss": 2.7018, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.026769723819398624, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.00071, | |
| "loss": 2.6182, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.026958243001225374, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.000715, | |
| "loss": 2.5742, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.027146762183052127, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0007199999999999999, | |
| "loss": 2.6547, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.027335281364878877, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 0.000725, | |
| "loss": 2.7054, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.027523800546705626, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 0.00073, | |
| "loss": 2.5809, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.02771231972853238, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.000735, | |
| "loss": 2.6474, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.02790083891035913, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.00074, | |
| "loss": 2.7606, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.02808935809218588, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 0.000745, | |
| "loss": 2.6923, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.028277877274012632, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 0.00075, | |
| "loss": 2.782, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.02846639645583938, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.000755, | |
| "loss": 2.7369, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.02865491563766613, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 0.00076, | |
| "loss": 2.6287, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.028843434819492884, | |
| "grad_norm": 0.765625, | |
| "learning_rate": 0.0007650000000000001, | |
| "loss": 2.6649, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.029031954001319634, | |
| "grad_norm": 0.875, | |
| "learning_rate": 0.0007700000000000001, | |
| "loss": 2.7421, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.029220473183146384, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.0007750000000000001, | |
| "loss": 2.5988, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.029408992364973137, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.0007800000000000001, | |
| "loss": 2.6876, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.029597511546799887, | |
| "grad_norm": 0.796875, | |
| "learning_rate": 0.000785, | |
| "loss": 2.6846, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.029786030728626636, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 0.00079, | |
| "loss": 2.7869, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.02997454991045339, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.000795, | |
| "loss": 2.6972, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.03016306909228014, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 0.0008, | |
| "loss": 2.7664, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.03035158827410689, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.000805, | |
| "loss": 2.6554, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.030540107455933642, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0008100000000000001, | |
| "loss": 2.662, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.03072862663776039, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.000815, | |
| "loss": 2.622, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.030917145819587145, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 0.00082, | |
| "loss": 2.6071, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.031105665001413894, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 0.000825, | |
| "loss": 2.6724, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.03129418418324065, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.00083, | |
| "loss": 2.5888, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.031482703365067394, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 0.000835, | |
| "loss": 2.7932, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.03167122254689415, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.00084, | |
| "loss": 2.6234, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.0318597417287209, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 0.0008449999999999999, | |
| "loss": 2.6725, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.032048260910547646, | |
| "grad_norm": 0.8515625, | |
| "learning_rate": 0.00085, | |
| "loss": 2.6502, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.0322367800923744, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.000855, | |
| "loss": 2.7151, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.03242529927420115, | |
| "grad_norm": 0.84375, | |
| "learning_rate": 0.00086, | |
| "loss": 2.8332, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.0326138184560279, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.000865, | |
| "loss": 2.8183, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.03280233763785465, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.00087, | |
| "loss": 2.6777, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.032990856819681405, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 0.000875, | |
| "loss": 2.6281, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.03317937600150815, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.00088, | |
| "loss": 2.7047, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.033367895183334904, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.000885, | |
| "loss": 2.6637, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.03355641436516166, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 0.0008900000000000001, | |
| "loss": 2.7817, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.033744933546988404, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 0.0008950000000000001, | |
| "loss": 2.6216, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.03393345272881516, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.0009000000000000001, | |
| "loss": 2.6608, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.03412197191064191, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0009050000000000001, | |
| "loss": 2.712, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.034310491092468656, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.00091, | |
| "loss": 2.6812, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.03449901027429541, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 0.000915, | |
| "loss": 2.6181, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.03468752945612216, | |
| "grad_norm": 0.77734375, | |
| "learning_rate": 0.00092, | |
| "loss": 2.5939, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.03487604863794891, | |
| "grad_norm": 0.75, | |
| "learning_rate": 0.000925, | |
| "loss": 2.6378, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.03506456781977566, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 0.00093, | |
| "loss": 2.658, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.035253087001602415, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 0.0009350000000000001, | |
| "loss": 2.6324, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.03544160618342916, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.00094, | |
| "loss": 2.7615, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.035630125365255914, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 0.000945, | |
| "loss": 2.8334, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.03581864454708267, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 0.00095, | |
| "loss": 2.8026, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.036007163728909414, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.000955, | |
| "loss": 2.6532, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.03619568291073617, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.00096, | |
| "loss": 2.5541, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.03638420209256292, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.000965, | |
| "loss": 2.6375, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.036572721274389666, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0009699999999999999, | |
| "loss": 2.5705, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.03676124045621642, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 0.000975, | |
| "loss": 2.6405, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.03694975963804317, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.00098, | |
| "loss": 2.7821, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.03713827881986992, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.000985, | |
| "loss": 2.6889, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.03732679800169667, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 0.00099, | |
| "loss": 2.6658, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.037515317183523425, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.000995, | |
| "loss": 2.6969, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.03770383636535017, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.001, | |
| "loss": 2.5479, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.037892355547176924, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 0.0009998040752351098, | |
| "loss": 2.7177, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.03808087472900368, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 0.0009996081504702195, | |
| "loss": 2.7224, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.038269393910830424, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0009994122257053293, | |
| "loss": 2.6316, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.03845791309265718, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 0.0009992163009404388, | |
| "loss": 2.8178, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.03864643227448393, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.0009990203761755486, | |
| "loss": 2.7619, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.038834951456310676, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0009988244514106584, | |
| "loss": 2.5739, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.03902347063813743, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 0.0009986285266457681, | |
| "loss": 2.7797, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.03921198981996418, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 0.0009984326018808779, | |
| "loss": 2.695, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.039400509001790936, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009982366771159876, | |
| "loss": 2.7551, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.03958902818361768, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 0.0009980407523510972, | |
| "loss": 2.7898, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.039777547365444435, | |
| "grad_norm": 0.765625, | |
| "learning_rate": 0.000997844827586207, | |
| "loss": 2.6824, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.03996606654727119, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0009976489028213167, | |
| "loss": 2.8341, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.040154585729097934, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0009974529780564262, | |
| "loss": 2.6885, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.04034310491092469, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.000997257053291536, | |
| "loss": 2.5722, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.04053162409275144, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0009970611285266457, | |
| "loss": 2.7023, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.04072014327457819, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.0009968652037617555, | |
| "loss": 2.6429, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.04090866245640494, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 0.0009966692789968653, | |
| "loss": 2.7053, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.04109718163823169, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.000996473354231975, | |
| "loss": 2.7841, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.04128570082005844, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0009962774294670846, | |
| "loss": 2.6687, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.04147422000188519, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.0009960815047021943, | |
| "loss": 2.7893, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.041662739183711946, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.000995885579937304, | |
| "loss": 2.5992, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.04185125836553869, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.0009956896551724138, | |
| "loss": 2.7238, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.042039777547365445, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.0009954937304075236, | |
| "loss": 2.7477, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.0422282967291922, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 0.0009952978056426334, | |
| "loss": 2.6079, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.042416815911018944, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.000995101880877743, | |
| "loss": 2.6389, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.0426053350928457, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0009949059561128527, | |
| "loss": 2.6014, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.04279385427467245, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009947100313479624, | |
| "loss": 2.6708, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.0429823734564992, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0009945141065830722, | |
| "loss": 2.7032, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.04317089263832595, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.0009943181818181817, | |
| "loss": 2.7911, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.0433594118201527, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.0009941222570532915, | |
| "loss": 2.5071, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.04354793100197945, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.0009939263322884012, | |
| "loss": 2.695, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.0437364501838062, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.000993730407523511, | |
| "loss": 2.5969, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.043924969365632956, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0009935344827586207, | |
| "loss": 2.6602, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.0441134885474597, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0009933385579937305, | |
| "loss": 2.6561, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.044302007729286455, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.00099314263322884, | |
| "loss": 2.6442, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.04449052691111321, | |
| "grad_norm": 0.859375, | |
| "learning_rate": 0.0009929467084639498, | |
| "loss": 2.7465, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.044679046092939954, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 0.0009927507836990596, | |
| "loss": 2.7102, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.04486756527476671, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0009925548589341693, | |
| "loss": 2.7074, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.04505608445659346, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 0.000992358934169279, | |
| "loss": 2.6626, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.04524460363842021, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0009921630094043888, | |
| "loss": 2.5579, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.04543312282024696, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 0.0009919670846394984, | |
| "loss": 2.7225, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.04562164200207371, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0009917711598746081, | |
| "loss": 2.6952, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.04581016118390046, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 0.000991575235109718, | |
| "loss": 2.6886, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.04599868036572721, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0009913793103448277, | |
| "loss": 2.6096, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.046187199547553966, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 0.0009911833855799374, | |
| "loss": 2.7612, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.04637571872938071, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.0009909874608150472, | |
| "loss": 2.6082, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.046564237911207465, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 0.0009907915360501567, | |
| "loss": 2.7621, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.04675275709303422, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0009905956112852665, | |
| "loss": 2.6764, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.046941276274860964, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.0009903996865203762, | |
| "loss": 2.6527, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.04712979545668772, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0009902037617554858, | |
| "loss": 2.5762, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.04731831463851447, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 0.0009900078369905955, | |
| "loss": 2.7241, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.04750683382034122, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009898119122257053, | |
| "loss": 2.6935, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.04769535300216797, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 0.000989615987460815, | |
| "loss": 2.776, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.04788387218399472, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0009894200626959248, | |
| "loss": 2.7799, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.04807239136582147, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009892241379310346, | |
| "loss": 2.7589, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.04826091054764822, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009890282131661443, | |
| "loss": 2.646, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.048449429729474976, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0009888322884012539, | |
| "loss": 2.7226, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.04863794891130172, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0009886363636363636, | |
| "loss": 2.6825, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.048826468093128475, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0009884404388714734, | |
| "loss": 2.6494, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.04901498727495523, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0009882445141065831, | |
| "loss": 2.7586, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.049203506456781974, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.000988048589341693, | |
| "loss": 2.7986, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.04939202563860873, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0009878526645768027, | |
| "loss": 2.624, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.04958054482043548, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0009876567398119122, | |
| "loss": 2.4967, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.04976906400226223, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.000987460815047022, | |
| "loss": 2.5694, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.04995758318408898, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.0009872648902821317, | |
| "loss": 2.7369, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.05014610236591573, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 0.0009870689655172413, | |
| "loss": 2.641, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.05033462154774248, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.000986873040752351, | |
| "loss": 2.5988, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.05052314072956923, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 0.0009866771159874608, | |
| "loss": 2.6935, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.050711659911395986, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.0009864811912225705, | |
| "loss": 2.6573, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.05090017909322274, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.0009862852664576803, | |
| "loss": 2.5501, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.051088698275049485, | |
| "grad_norm": 0.8515625, | |
| "learning_rate": 0.00098608934169279, | |
| "loss": 2.7173, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.05127721745687624, | |
| "grad_norm": 0.765625, | |
| "learning_rate": 0.0009858934169278996, | |
| "loss": 2.7147, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.05146573663870299, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009856974921630094, | |
| "loss": 2.6823, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.05165425582052974, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.0009855015673981191, | |
| "loss": 2.7399, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.05184277500235649, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 0.0009853056426332289, | |
| "loss": 2.8052, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.052031294184183244, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 0.0009851097178683386, | |
| "loss": 2.6471, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.05221981336600999, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0009849137931034484, | |
| "loss": 2.5997, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.05240833254783674, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.000984717868338558, | |
| "loss": 2.6933, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.052596851729663496, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.0009845219435736677, | |
| "loss": 2.7849, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.05278537091149024, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 0.0009843260188087774, | |
| "loss": 2.7277, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.052973890093316996, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0009841300940438872, | |
| "loss": 2.7328, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.05316240927514375, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.000983934169278997, | |
| "loss": 2.8041, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.053350928456970495, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0009837382445141067, | |
| "loss": 2.6497, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.05353944763879725, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 0.0009835423197492165, | |
| "loss": 2.6852, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.053727966820624, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.000983346394984326, | |
| "loss": 2.6116, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.05391648600245075, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.0009831504702194358, | |
| "loss": 2.5864, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.0541050051842775, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0009829545454545455, | |
| "loss": 2.6291, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.054293524366104254, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.000982758620689655, | |
| "loss": 2.672, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.054482043547931, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 0.0009825626959247648, | |
| "loss": 2.6036, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.05467056272975775, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 0.0009823667711598746, | |
| "loss": 2.4802, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.054859081911584506, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 0.0009821708463949844, | |
| "loss": 2.721, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.05504760109341125, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0009819749216300941, | |
| "loss": 2.6039, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.055236120275238006, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0009817789968652039, | |
| "loss": 2.7125, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.05542463945706476, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.0009815830721003134, | |
| "loss": 2.7176, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.055613158638891505, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 0.0009813871473354232, | |
| "loss": 2.7061, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.05580167782071826, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.000981191222570533, | |
| "loss": 2.7324, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.05599019700254501, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0009809952978056427, | |
| "loss": 2.6318, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.05617871618437176, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0009807993730407524, | |
| "loss": 2.637, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.05636723536619851, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0009806034482758622, | |
| "loss": 2.5645, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.056555754548025264, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0009804075235109717, | |
| "loss": 2.7011, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.05674427372985201, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0009802115987460815, | |
| "loss": 2.7293, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.05693279291167876, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0009800156739811913, | |
| "loss": 2.5779, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.057121312093505516, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.000979819749216301, | |
| "loss": 2.7574, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.05730983127533226, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0009796238244514106, | |
| "loss": 2.7168, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.057498350457159016, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0009794278996865203, | |
| "loss": 2.6531, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.05768686963898577, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.00097923197492163, | |
| "loss": 2.6852, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.057875388820812515, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009790360501567398, | |
| "loss": 2.8098, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.05806390800263927, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0009788401253918496, | |
| "loss": 2.5938, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.05825242718446602, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009786442006269591, | |
| "loss": 2.6858, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.05844094636629277, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 0.000978448275862069, | |
| "loss": 2.6455, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.05862946554811952, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.0009782523510971787, | |
| "loss": 2.7194, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.058817984729946274, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.0009780564263322884, | |
| "loss": 2.5933, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.05900650391177302, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0009778605015673982, | |
| "loss": 2.7103, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.05919502309359977, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.000977664576802508, | |
| "loss": 2.7317, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.059383542275426526, | |
| "grad_norm": 0.796875, | |
| "learning_rate": 0.0009774686520376177, | |
| "loss": 2.6629, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.05957206145725327, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.0009772727272727272, | |
| "loss": 2.811, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.059760580639080026, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.000977076802507837, | |
| "loss": 2.679, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.05994909982090678, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.0009768808777429468, | |
| "loss": 2.7421, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.060137619002733525, | |
| "grad_norm": 0.77734375, | |
| "learning_rate": 0.0009766849529780565, | |
| "loss": 2.7717, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.06032613818456028, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 0.0009764890282131662, | |
| "loss": 2.7456, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.06051465736638703, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.0009762931034482759, | |
| "loss": 2.6342, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.06070317654821378, | |
| "grad_norm": 0.80078125, | |
| "learning_rate": 0.0009760971786833856, | |
| "loss": 2.7088, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.06089169573004053, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.0009759012539184952, | |
| "loss": 2.651, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.061080214911867284, | |
| "grad_norm": 0.890625, | |
| "learning_rate": 0.000975705329153605, | |
| "loss": 2.7472, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.06126873409369403, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 0.0009755094043887147, | |
| "loss": 2.8417, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.06145725327552078, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0009753134796238245, | |
| "loss": 2.7298, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.061645772457347536, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0009751175548589341, | |
| "loss": 2.4934, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.06183429163917429, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 0.0009749216300940439, | |
| "loss": 2.6646, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.062022810821001036, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 0.0009747257053291537, | |
| "loss": 2.6718, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.06221133000282779, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0009745297805642633, | |
| "loss": 2.5689, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.06239984918465454, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0009743338557993731, | |
| "loss": 2.5188, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.0625883683664813, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 0.0009741379310344828, | |
| "loss": 2.7064, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.06277688754830804, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.0009739420062695925, | |
| "loss": 2.726, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.06296540673013479, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0009737460815047022, | |
| "loss": 2.7389, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.06315392591196155, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 0.000973550156739812, | |
| "loss": 2.8134, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.0633424450937883, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.0009733542319749216, | |
| "loss": 2.7394, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.06353096427561504, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0009731583072100314, | |
| "loss": 2.6256, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.0637194834574418, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 0.0009729623824451412, | |
| "loss": 2.7413, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.06390800263926855, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0009727664576802508, | |
| "loss": 2.7725, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.06409652182109529, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0009725705329153606, | |
| "loss": 2.8092, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.06428504100292205, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0009723746081504702, | |
| "loss": 2.7276, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.0644735601847488, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0009721786833855799, | |
| "loss": 2.5861, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.06466207936657555, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 0.0009719827586206896, | |
| "loss": 2.6467, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.0648505985484023, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.0009717868338557994, | |
| "loss": 2.7404, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.06503911773022905, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.000971590909090909, | |
| "loss": 2.6333, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.0652276369120558, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 0.0009713949843260188, | |
| "loss": 2.6079, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.06541615609388256, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0009711990595611286, | |
| "loss": 2.5708, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.0656046752757093, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.0009710031347962382, | |
| "loss": 2.6675, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.06579319445753605, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.000970807210031348, | |
| "loss": 2.7782, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.06598171363936281, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0009706112852664577, | |
| "loss": 2.6853, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.06617023282118956, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.0009704153605015674, | |
| "loss": 2.7684, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.0663587520030163, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0009702194357366771, | |
| "loss": 2.5759, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.06654727118484306, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 0.0009700235109717869, | |
| "loss": 2.7151, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.06673579036666981, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 0.0009698275862068966, | |
| "loss": 2.6346, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.06692430954849656, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.0009696316614420063, | |
| "loss": 2.5878, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.06711282873032332, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.0009694357366771161, | |
| "loss": 2.6841, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.06730134791215006, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.0009692398119122258, | |
| "loss": 2.5688, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.06748986709397681, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0009690438871473355, | |
| "loss": 2.5057, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.06767838627580357, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 0.0009688479623824452, | |
| "loss": 2.6444, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.06786690545763031, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.0009686520376175549, | |
| "loss": 2.6894, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.06805542463945706, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.0009684561128526645, | |
| "loss": 2.5921, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.06824394382128382, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 0.0009682601880877743, | |
| "loss": 2.7547, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.06843246300311057, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 0.000968064263322884, | |
| "loss": 2.7235, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.06862098218493731, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009678683385579937, | |
| "loss": 2.6726, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.06880950136676407, | |
| "grad_norm": 0.77734375, | |
| "learning_rate": 0.0009676724137931034, | |
| "loss": 2.7688, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.06899802054859082, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0009674764890282132, | |
| "loss": 2.6567, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.06918653973041756, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009672805642633229, | |
| "loss": 2.7241, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.06937505891224433, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0009670846394984326, | |
| "loss": 2.603, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.06956357809407107, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0009668887147335424, | |
| "loss": 2.6863, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.06975209727589782, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.000966692789968652, | |
| "loss": 2.6655, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.06994061645772458, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0009664968652037618, | |
| "loss": 2.5301, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.07012913563955132, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.0009663009404388715, | |
| "loss": 2.7405, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.07031765482137807, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0009661050156739812, | |
| "loss": 2.7326, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.07050617400320483, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 0.000965909090909091, | |
| "loss": 2.6685, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.07069469318503158, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.0009657131661442007, | |
| "loss": 2.6664, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.07088321236685832, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0009655172413793104, | |
| "loss": 2.5892, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.07107173154868508, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0009653213166144201, | |
| "loss": 2.6351, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.07126025073051183, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 0.0009651253918495299, | |
| "loss": 2.6587, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.07144876991233857, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0009649294670846394, | |
| "loss": 2.7744, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.07163728909416534, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009647335423197492, | |
| "loss": 2.7516, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.07182580827599208, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 0.0009645376175548589, | |
| "loss": 2.6607, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.07201432745781883, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.0009643416927899687, | |
| "loss": 2.7513, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.07220284663964559, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.0009641457680250783, | |
| "loss": 2.607, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.07239136582147233, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0009639498432601881, | |
| "loss": 2.5463, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.07257988500329908, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 0.0009637539184952979, | |
| "loss": 2.6368, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.07276840418512584, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.0009635579937304075, | |
| "loss": 2.5846, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.07295692336695259, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.0009633620689655173, | |
| "loss": 2.7072, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.07314544254877933, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.000963166144200627, | |
| "loss": 2.6918, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.07333396173060609, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0009629702194357367, | |
| "loss": 2.6682, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.07352248091243284, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0009627742946708464, | |
| "loss": 2.6512, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.07371100009425958, | |
| "grad_norm": 0.796875, | |
| "learning_rate": 0.0009625783699059562, | |
| "loss": 2.718, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.07389951927608635, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 0.0009623824451410658, | |
| "loss": 2.7208, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.07408803845791309, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.0009621865203761756, | |
| "loss": 2.7411, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.07427655763973984, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009619905956112854, | |
| "loss": 2.6763, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.0744650768215666, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.000961794670846395, | |
| "loss": 2.6919, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.07465359600339334, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 0.0009615987460815048, | |
| "loss": 2.767, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.07484211518522009, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0009614028213166145, | |
| "loss": 2.6868, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.07503063436704685, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0009612068965517241, | |
| "loss": 2.6393, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.0752191535488736, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 0.0009610109717868338, | |
| "loss": 2.5917, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.07540767273070034, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.0009608150470219436, | |
| "loss": 2.709, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.0755961919125271, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.0009606191222570532, | |
| "loss": 2.6591, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 0.07578471109435385, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.000960423197492163, | |
| "loss": 2.7638, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.0759732302761806, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0009602272727272728, | |
| "loss": 2.58, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 0.07616174945800736, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0009600313479623824, | |
| "loss": 2.5257, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.0763502686398341, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 0.0009598354231974922, | |
| "loss": 2.6512, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.07653878782166085, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0009596394984326019, | |
| "loss": 2.6432, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.07672730700348761, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.0009594435736677116, | |
| "loss": 2.6028, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 0.07691582618531435, | |
| "grad_norm": 0.87890625, | |
| "learning_rate": 0.0009592476489028213, | |
| "loss": 2.707, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.0771043453671411, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 0.0009590517241379311, | |
| "loss": 2.5831, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 0.07729286454896786, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.0009588557993730408, | |
| "loss": 2.7447, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.0774813837307946, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.0009586598746081505, | |
| "loss": 2.6452, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.07766990291262135, | |
| "grad_norm": 0.75, | |
| "learning_rate": 0.0009584639498432603, | |
| "loss": 2.7138, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.07785842209444811, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 0.00095826802507837, | |
| "loss": 2.5726, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.07804694127627486, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.0009580721003134797, | |
| "loss": 2.7128, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.0782354604581016, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.0009578761755485894, | |
| "loss": 2.5482, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.07842397963992837, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 0.0009576802507836991, | |
| "loss": 2.768, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.07861249882175511, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 0.0009574843260188087, | |
| "loss": 2.774, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 0.07880101800358187, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0009572884012539185, | |
| "loss": 2.7388, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 0.07898953718540862, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 0.0009570924764890282, | |
| "loss": 2.5905, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 0.07917805636723536, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.0009568965517241379, | |
| "loss": 2.7478, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.07936657554906212, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0009567006269592476, | |
| "loss": 2.699, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 0.07955509473088887, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0009565047021943574, | |
| "loss": 2.5614, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 0.07974361391271562, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009563087774294671, | |
| "loss": 2.6036, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 0.07993213309454238, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009561128526645768, | |
| "loss": 2.6515, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 0.08012065227636912, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009559169278996866, | |
| "loss": 2.7026, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.08030917145819587, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0009557210031347962, | |
| "loss": 2.7497, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 0.08049769064002263, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.000955525078369906, | |
| "loss": 2.6129, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 0.08068620982184938, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.0009553291536050157, | |
| "loss": 2.6113, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 0.08087472900367612, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.0009551332288401254, | |
| "loss": 2.4547, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 0.08106324818550288, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0009549373040752351, | |
| "loss": 2.6197, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.08125176736732963, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0009547413793103449, | |
| "loss": 2.684, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 0.08144028654915637, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.0009545454545454546, | |
| "loss": 2.6874, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 0.08162880573098313, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.0009543495297805643, | |
| "loss": 2.6019, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 0.08181732491280988, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0009541536050156741, | |
| "loss": 2.6309, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 0.08200584409463663, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 0.0009539576802507836, | |
| "loss": 2.6848, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.08219436327646339, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.0009537617554858934, | |
| "loss": 2.7124, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 0.08238288245829013, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0009535658307210031, | |
| "loss": 2.5744, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 0.08257140164011688, | |
| "grad_norm": 0.75, | |
| "learning_rate": 0.0009533699059561129, | |
| "loss": 2.7689, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 0.08275992082194364, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 0.0009531739811912225, | |
| "loss": 2.7709, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 0.08294844000377039, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.0009529780564263323, | |
| "loss": 2.5495, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.08313695918559713, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0009527821316614421, | |
| "loss": 2.696, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 0.08332547836742389, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0009525862068965517, | |
| "loss": 2.6657, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 0.08351399754925064, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0009523902821316615, | |
| "loss": 2.6998, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 0.08370251673107738, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0009521943573667712, | |
| "loss": 2.7154, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 0.08389103591290414, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0009519984326018809, | |
| "loss": 2.6478, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.08407955509473089, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0009518025078369906, | |
| "loss": 2.6899, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 0.08426807427655764, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.0009516065830721004, | |
| "loss": 2.7137, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 0.0844565934583844, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.00095141065830721, | |
| "loss": 2.6207, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.08464511264021114, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.0009512147335423198, | |
| "loss": 2.7149, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 0.08483363182203789, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0009510188087774296, | |
| "loss": 2.7011, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.08502215100386465, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 0.0009508228840125392, | |
| "loss": 2.6496, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 0.0852106701856914, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.000950626959247649, | |
| "loss": 2.6714, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 0.08539918936751814, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 0.0009504310344827587, | |
| "loss": 2.6271, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 0.0855877085493449, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0009502351097178683, | |
| "loss": 2.6513, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 0.08577622773117165, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.000950039184952978, | |
| "loss": 2.6638, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.0859647469129984, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0009498432601880878, | |
| "loss": 2.7398, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 0.08615326609482515, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009496473354231974, | |
| "loss": 2.7013, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 0.0863417852766519, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0009494514106583072, | |
| "loss": 2.6336, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 0.08653030445847865, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.000949255485893417, | |
| "loss": 2.5915, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 0.0867188236403054, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0009490595611285266, | |
| "loss": 2.6545, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.08690734282213215, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.0009488636363636364, | |
| "loss": 2.6792, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 0.0870958620039589, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0009486677115987461, | |
| "loss": 2.6238, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 0.08728438118578566, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.0009484717868338558, | |
| "loss": 2.6929, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 0.0874729003676124, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0009482758620689655, | |
| "loss": 2.7269, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 0.08766141954943915, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0009480799373040753, | |
| "loss": 2.6728, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.08784993873126591, | |
| "grad_norm": 0.87890625, | |
| "learning_rate": 0.000947884012539185, | |
| "loss": 2.667, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 0.08803845791309266, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 0.0009476880877742947, | |
| "loss": 2.7706, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 0.0882269770949194, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.0009474921630094045, | |
| "loss": 2.7464, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 0.08841549627674616, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 0.0009472962382445142, | |
| "loss": 2.6004, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 0.08860401545857291, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0009471003134796239, | |
| "loss": 2.6237, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.08879253464039966, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0009469043887147336, | |
| "loss": 2.6628, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 0.08898105382222642, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.0009467084639498434, | |
| "loss": 2.7066, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 0.08916957300405316, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.0009465125391849529, | |
| "loss": 2.6655, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 0.08935809218587991, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 0.0009463166144200627, | |
| "loss": 2.6333, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 0.08954661136770667, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.0009461206896551724, | |
| "loss": 2.5766, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.08973513054953342, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 0.0009459247648902821, | |
| "loss": 2.7387, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 0.08992364973136016, | |
| "grad_norm": 0.7890625, | |
| "learning_rate": 0.0009457288401253918, | |
| "loss": 2.7342, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 0.09011216891318692, | |
| "grad_norm": 0.75, | |
| "learning_rate": 0.0009455329153605016, | |
| "loss": 2.6416, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 0.09030068809501367, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.0009453369905956113, | |
| "loss": 2.6143, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 0.09048920727684041, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.000945141065830721, | |
| "loss": 2.7185, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.09067772645866717, | |
| "grad_norm": 0.8359375, | |
| "learning_rate": 0.0009449451410658308, | |
| "loss": 2.6152, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 0.09086624564049392, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0009447492163009404, | |
| "loss": 2.6592, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 0.09105476482232067, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0009445532915360502, | |
| "loss": 2.5181, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 0.09124328400414743, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0009443573667711599, | |
| "loss": 2.6332, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 0.09143180318597417, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 0.0009441614420062696, | |
| "loss": 2.521, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.09162032236780092, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.0009439655172413793, | |
| "loss": 2.6339, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 0.09180884154962768, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.0009437695924764891, | |
| "loss": 2.6627, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 0.09199736073145443, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0009435736677115988, | |
| "loss": 2.6227, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 0.09218587991328117, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.0009433777429467085, | |
| "loss": 2.784, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 0.09237439909510793, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 0.0009431818181818183, | |
| "loss": 2.5622, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.09256291827693468, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 0.0009429858934169278, | |
| "loss": 2.6712, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 0.09275143745876142, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.0009427899686520376, | |
| "loss": 2.5781, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 0.09293995664058818, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 0.0009425940438871473, | |
| "loss": 2.6193, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 0.09312847582241493, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0009423981191222571, | |
| "loss": 2.716, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 0.09331699500424168, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 0.0009422021943573667, | |
| "loss": 2.745, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.09350551418606844, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0009420062695924765, | |
| "loss": 2.5251, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 0.09369403336789518, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009418103448275863, | |
| "loss": 2.7023, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 0.09388255254972193, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0009416144200626959, | |
| "loss": 2.7697, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 0.09407107173154869, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.0009414184952978057, | |
| "loss": 2.6253, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 0.09425959091337544, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 0.0009412225705329154, | |
| "loss": 2.6668, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.09425959091337544, | |
| "eval_runtime": 58.5785, | |
| "eval_samples_per_second": 17.481, | |
| "eval_steps_per_second": 0.546, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.09425959091337544, | |
| "eval/hellaswag_acc": 0.37572196773551086, | |
| "eval/hellaswag_acc_norm": 0.4714200358494324, | |
| "eval_hellaswag_elapsed_time": 195.95180106163025, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.09444811009520218, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0009410266457680251, | |
| "loss": 2.6645, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 0.09463662927702894, | |
| "grad_norm": 0.77734375, | |
| "learning_rate": 0.0009408307210031348, | |
| "loss": 2.7233, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 0.09482514845885569, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0009406347962382446, | |
| "loss": 2.6959, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 0.09501366764068243, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0009404388714733542, | |
| "loss": 2.747, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 0.0952021868225092, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 0.000940242946708464, | |
| "loss": 2.521, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.09539070600433594, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0009400470219435738, | |
| "loss": 2.7368, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 0.09557922518616269, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0009398510971786834, | |
| "loss": 2.6509, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 0.09576774436798945, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.0009396551724137932, | |
| "loss": 2.785, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 0.09595626354981619, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0009394592476489029, | |
| "loss": 2.5647, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 0.09614478273164294, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.0009392633228840125, | |
| "loss": 2.6087, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.0963333019134697, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0009390673981191222, | |
| "loss": 2.6032, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 0.09652182109529645, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.000938871473354232, | |
| "loss": 2.6934, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 0.09671034027712319, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0009386755485893416, | |
| "loss": 2.7077, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 0.09689885945894995, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.0009384796238244514, | |
| "loss": 2.7372, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 0.0970873786407767, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009382836990595611, | |
| "loss": 2.5907, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.09727589782260344, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 0.0009380877742946708, | |
| "loss": 2.5623, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 0.0974644170044302, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0009378918495297806, | |
| "loss": 2.6949, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 0.09765293618625695, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 0.0009376959247648903, | |
| "loss": 2.6505, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 0.0978414553680837, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.0009375, | |
| "loss": 2.6902, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 0.09802997454991046, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.0009373040752351097, | |
| "loss": 2.6529, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.0982184937317372, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009371081504702195, | |
| "loss": 2.5571, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 0.09840701291356395, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.0009369122257053292, | |
| "loss": 2.7718, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 0.09859553209539071, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.0009367163009404389, | |
| "loss": 2.7112, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 0.09878405127721746, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0009365203761755486, | |
| "loss": 2.5318, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 0.0989725704590442, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0009363244514106584, | |
| "loss": 2.6242, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.09916108964087096, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0009361285266457681, | |
| "loss": 2.6603, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 0.09934960882269771, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0009359326018808778, | |
| "loss": 2.7204, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 0.09953812800452445, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0009357366771159876, | |
| "loss": 2.6355, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 0.09972664718635121, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0009355407523510971, | |
| "loss": 2.6126, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 0.09991516636817796, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 0.0009353448275862069, | |
| "loss": 2.5042, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.1001036855500047, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0009351489028213166, | |
| "loss": 2.639, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 0.10029220473183147, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0009349529780564263, | |
| "loss": 2.6981, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 0.10048072391365821, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.000934757053291536, | |
| "loss": 2.6578, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 0.10066924309548496, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.0009345611285266458, | |
| "loss": 2.7651, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 0.10085776227731172, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0009343652037617555, | |
| "loss": 2.6639, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.10104628145913847, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.0009341692789968652, | |
| "loss": 2.6911, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 0.10123480064096523, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.000933973354231975, | |
| "loss": 2.6213, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 0.10142331982279197, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0009337774294670846, | |
| "loss": 2.6084, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 0.10161183900461872, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.0009335815047021944, | |
| "loss": 2.6893, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 0.10180035818644548, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.0009333855799373041, | |
| "loss": 2.547, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.10198887736827222, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.0009331896551724138, | |
| "loss": 2.7084, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 0.10217739655009897, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.0009329937304075235, | |
| "loss": 2.6611, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 0.10236591573192573, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0009327978056426333, | |
| "loss": 2.658, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 0.10255443491375248, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.000932601880877743, | |
| "loss": 2.7423, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 0.10274295409557922, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 0.0009324059561128527, | |
| "loss": 2.6237, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.10293147327740598, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0009322100313479625, | |
| "loss": 2.6846, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 0.10311999245923273, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0009320141065830722, | |
| "loss": 2.5963, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 0.10330851164105948, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 0.0009318181818181818, | |
| "loss": 2.6334, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 0.10349703082288624, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009316222570532915, | |
| "loss": 2.6657, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 0.10368555000471298, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0009314263322884013, | |
| "loss": 2.7307, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.10387406918653973, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0009312304075235109, | |
| "loss": 2.7773, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 0.10406258836836649, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0009310344827586207, | |
| "loss": 2.7054, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 0.10425110755019323, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0009308385579937305, | |
| "loss": 2.6325, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 0.10443962673201998, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.0009306426332288401, | |
| "loss": 2.5011, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 0.10462814591384674, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0009304467084639499, | |
| "loss": 2.6252, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.10481666509567349, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0009302507836990596, | |
| "loss": 2.6176, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 0.10500518427750023, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009300548589341693, | |
| "loss": 2.699, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 0.10519370345932699, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.000929858934169279, | |
| "loss": 2.5058, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 0.10538222264115374, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.0009296630094043888, | |
| "loss": 2.6034, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 0.10557074182298049, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 0.0009294670846394984, | |
| "loss": 2.5785, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.10575926100480725, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 0.0009292711598746082, | |
| "loss": 2.7846, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 0.10594778018663399, | |
| "grad_norm": 0.86328125, | |
| "learning_rate": 0.000929075235109718, | |
| "loss": 2.7049, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 0.10613629936846074, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0009288793103448276, | |
| "loss": 2.72, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 0.1063248185502875, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.0009286833855799374, | |
| "loss": 2.7468, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 0.10651333773211424, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0009284874608150471, | |
| "loss": 2.706, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.10670185691394099, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.0009282915360501567, | |
| "loss": 2.6664, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 0.10689037609576775, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.0009280956112852664, | |
| "loss": 2.6685, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 0.1070788952775945, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 0.0009278996865203762, | |
| "loss": 2.6295, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 0.10726741445942124, | |
| "grad_norm": 0.75, | |
| "learning_rate": 0.0009277037617554858, | |
| "loss": 2.7556, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 0.107455933641248, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0009275078369905956, | |
| "loss": 2.6027, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.10764445282307475, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 0.0009273119122257053, | |
| "loss": 2.622, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 0.1078329720049015, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 0.000927115987460815, | |
| "loss": 2.6136, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 0.10802149118672826, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 0.0009269200626959248, | |
| "loss": 2.6196, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 0.108210010368555, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.0009267241379310345, | |
| "loss": 2.6569, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 0.10839852955038175, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009265282131661443, | |
| "loss": 2.7018, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.10858704873220851, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.0009263322884012539, | |
| "loss": 2.5521, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 0.10877556791403525, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0009261363636363637, | |
| "loss": 2.6091, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 0.108964087095862, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0009259404388714734, | |
| "loss": 2.5969, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 0.10915260627768876, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.0009257445141065831, | |
| "loss": 2.6032, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 0.1093411254595155, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0009255485893416928, | |
| "loss": 2.6755, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.10952964464134225, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0009253526645768026, | |
| "loss": 2.6504, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 0.10971816382316901, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0009251567398119123, | |
| "loss": 2.7218, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 0.10990668300499576, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.000924960815047022, | |
| "loss": 2.6733, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 0.1100952021868225, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0009247648902821318, | |
| "loss": 2.7395, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 0.11028372136864927, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0009245689655172413, | |
| "loss": 2.7391, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.11047224055047601, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0009243730407523511, | |
| "loss": 2.5436, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 0.11066075973230276, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0009241771159874608, | |
| "loss": 2.6671, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 0.11084927891412952, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.0009239811912225705, | |
| "loss": 2.7918, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 0.11103779809595626, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.0009237852664576802, | |
| "loss": 2.6591, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 0.11122631727778301, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.00092358934169279, | |
| "loss": 2.5917, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.11141483645960977, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009233934169278996, | |
| "loss": 2.5555, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 0.11160335564143652, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 0.0009231974921630094, | |
| "loss": 2.7138, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 0.11179187482326326, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0009230015673981192, | |
| "loss": 2.7638, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 0.11198039400509002, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.0009228056426332288, | |
| "loss": 2.6662, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 0.11216891318691677, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.0009226097178683386, | |
| "loss": 2.7772, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.11235743236874352, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 0.0009224137931034483, | |
| "loss": 2.7929, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 0.11254595155057028, | |
| "grad_norm": 0.75, | |
| "learning_rate": 0.000922217868338558, | |
| "loss": 2.7971, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 0.11273447073239702, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0009220219435736677, | |
| "loss": 2.7352, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 0.11292298991422377, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.0009218260188087775, | |
| "loss": 2.6516, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 0.11311150909605053, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.0009216300940438871, | |
| "loss": 2.7391, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.11330002827787727, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.0009214341692789969, | |
| "loss": 2.6186, | |
| "step": 601 | |
| }, | |
| { | |
| "epoch": 0.11348854745970402, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0009212382445141067, | |
| "loss": 2.7238, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 0.11367706664153078, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 0.0009210423197492164, | |
| "loss": 2.5381, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 0.11386558582335753, | |
| "grad_norm": 0.83203125, | |
| "learning_rate": 0.000920846394984326, | |
| "loss": 2.6816, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 0.11405410500518427, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.0009206504702194357, | |
| "loss": 2.5738, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.11424262418701103, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0009204545454545455, | |
| "loss": 2.6701, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 0.11443114336883778, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.0009202586206896551, | |
| "loss": 2.668, | |
| "step": 607 | |
| }, | |
| { | |
| "epoch": 0.11461966255066453, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 0.0009200626959247649, | |
| "loss": 2.6853, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 0.11480818173249129, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.0009198667711598747, | |
| "loss": 2.6926, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 0.11499670091431803, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0009196708463949843, | |
| "loss": 2.7387, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.11518522009614478, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 0.0009194749216300941, | |
| "loss": 2.613, | |
| "step": 611 | |
| }, | |
| { | |
| "epoch": 0.11537373927797154, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009192789968652038, | |
| "loss": 2.5144, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 0.11556225845979828, | |
| "grad_norm": 0.765625, | |
| "learning_rate": 0.0009190830721003135, | |
| "loss": 2.7777, | |
| "step": 613 | |
| }, | |
| { | |
| "epoch": 0.11575077764162503, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 0.0009188871473354232, | |
| "loss": 2.6103, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 0.11593929682345179, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.000918691222570533, | |
| "loss": 2.6806, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.11612781600527854, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.0009184952978056426, | |
| "loss": 2.6122, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 0.11631633518710528, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0009182993730407524, | |
| "loss": 2.6198, | |
| "step": 617 | |
| }, | |
| { | |
| "epoch": 0.11650485436893204, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0009181034482758622, | |
| "loss": 2.5324, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 0.11669337355075879, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 0.0009179075235109718, | |
| "loss": 2.6424, | |
| "step": 619 | |
| }, | |
| { | |
| "epoch": 0.11688189273258554, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0009177115987460816, | |
| "loss": 2.6544, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.1170704119144123, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0009175156739811913, | |
| "loss": 2.6725, | |
| "step": 621 | |
| }, | |
| { | |
| "epoch": 0.11725893109623904, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.000917319749216301, | |
| "loss": 2.6113, | |
| "step": 622 | |
| }, | |
| { | |
| "epoch": 0.11744745027806579, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0009171238244514106, | |
| "loss": 2.6232, | |
| "step": 623 | |
| }, | |
| { | |
| "epoch": 0.11763596945989255, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0009169278996865204, | |
| "loss": 2.5457, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 0.1178244886417193, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.00091673197492163, | |
| "loss": 2.5777, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.11801300782354604, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.0009165360501567398, | |
| "loss": 2.6668, | |
| "step": 626 | |
| }, | |
| { | |
| "epoch": 0.1182015270053728, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.0009163401253918495, | |
| "loss": 2.7571, | |
| "step": 627 | |
| }, | |
| { | |
| "epoch": 0.11839004618719955, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0009161442006269592, | |
| "loss": 2.6457, | |
| "step": 628 | |
| }, | |
| { | |
| "epoch": 0.11857856536902629, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.000915948275862069, | |
| "loss": 2.5717, | |
| "step": 629 | |
| }, | |
| { | |
| "epoch": 0.11876708455085305, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.0009157523510971787, | |
| "loss": 2.6518, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.1189556037326798, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009155564263322885, | |
| "loss": 2.6215, | |
| "step": 631 | |
| }, | |
| { | |
| "epoch": 0.11914412291450655, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0009153605015673981, | |
| "loss": 2.6829, | |
| "step": 632 | |
| }, | |
| { | |
| "epoch": 0.1193326420963333, | |
| "grad_norm": 0.75, | |
| "learning_rate": 0.0009151645768025079, | |
| "loss": 2.6905, | |
| "step": 633 | |
| }, | |
| { | |
| "epoch": 0.11952116127816005, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.0009149686520376176, | |
| "loss": 2.5993, | |
| "step": 634 | |
| }, | |
| { | |
| "epoch": 0.1197096804599868, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0009147727272727273, | |
| "loss": 2.6115, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.11989819964181356, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.000914576802507837, | |
| "loss": 2.7553, | |
| "step": 636 | |
| }, | |
| { | |
| "epoch": 0.1200867188236403, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.0009143808777429468, | |
| "loss": 2.685, | |
| "step": 637 | |
| }, | |
| { | |
| "epoch": 0.12027523800546705, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0009141849529780565, | |
| "loss": 2.6026, | |
| "step": 638 | |
| }, | |
| { | |
| "epoch": 0.12046375718729381, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.0009139890282131662, | |
| "loss": 2.7128, | |
| "step": 639 | |
| }, | |
| { | |
| "epoch": 0.12065227636912056, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.000913793103448276, | |
| "loss": 2.5892, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.1208407955509473, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0009135971786833855, | |
| "loss": 2.6472, | |
| "step": 641 | |
| }, | |
| { | |
| "epoch": 0.12102931473277406, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0009134012539184953, | |
| "loss": 2.6738, | |
| "step": 642 | |
| }, | |
| { | |
| "epoch": 0.12121783391460081, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.000913205329153605, | |
| "loss": 2.6913, | |
| "step": 643 | |
| }, | |
| { | |
| "epoch": 0.12140635309642756, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0009130094043887147, | |
| "loss": 2.6355, | |
| "step": 644 | |
| }, | |
| { | |
| "epoch": 0.12159487227825432, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.0009128134796238244, | |
| "loss": 2.5979, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.12178339146008106, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0009126175548589342, | |
| "loss": 2.7308, | |
| "step": 646 | |
| }, | |
| { | |
| "epoch": 0.12197191064190781, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0009124216300940438, | |
| "loss": 2.6809, | |
| "step": 647 | |
| }, | |
| { | |
| "epoch": 0.12216042982373457, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0009122257053291536, | |
| "loss": 2.7713, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 0.12234894900556131, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 0.0009120297805642634, | |
| "loss": 2.4518, | |
| "step": 649 | |
| }, | |
| { | |
| "epoch": 0.12253746818738806, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.000911833855799373, | |
| "loss": 2.5884, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.12272598736921482, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0009116379310344828, | |
| "loss": 2.6641, | |
| "step": 651 | |
| }, | |
| { | |
| "epoch": 0.12291450655104157, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0009114420062695925, | |
| "loss": 2.5444, | |
| "step": 652 | |
| }, | |
| { | |
| "epoch": 0.12310302573286831, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0009112460815047022, | |
| "loss": 2.6773, | |
| "step": 653 | |
| }, | |
| { | |
| "epoch": 0.12329154491469507, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0009110501567398119, | |
| "loss": 2.6689, | |
| "step": 654 | |
| }, | |
| { | |
| "epoch": 0.12348006409652182, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0009108542319749217, | |
| "loss": 2.8371, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.12366858327834858, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0009106583072100313, | |
| "loss": 2.6491, | |
| "step": 656 | |
| }, | |
| { | |
| "epoch": 0.12385710246017533, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.0009104623824451411, | |
| "loss": 2.7192, | |
| "step": 657 | |
| }, | |
| { | |
| "epoch": 0.12404562164200207, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0009102664576802509, | |
| "loss": 2.7254, | |
| "step": 658 | |
| }, | |
| { | |
| "epoch": 0.12423414082382883, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.0009100705329153606, | |
| "loss": 2.6403, | |
| "step": 659 | |
| }, | |
| { | |
| "epoch": 0.12442266000565558, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0009098746081504702, | |
| "loss": 2.6217, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.12461117918748232, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 0.0009096786833855799, | |
| "loss": 2.7402, | |
| "step": 661 | |
| }, | |
| { | |
| "epoch": 0.12479969836930908, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0009094827586206897, | |
| "loss": 2.7237, | |
| "step": 662 | |
| }, | |
| { | |
| "epoch": 0.12498821755113583, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0009092868338557993, | |
| "loss": 2.5321, | |
| "step": 663 | |
| }, | |
| { | |
| "epoch": 0.1251767367329626, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0009090909090909091, | |
| "loss": 2.4766, | |
| "step": 664 | |
| }, | |
| { | |
| "epoch": 0.12536525591478934, | |
| "grad_norm": 0.625, | |
| "learning_rate": 0.0009088949843260188, | |
| "loss": 2.5655, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.12555377509661608, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0009086990595611285, | |
| "loss": 2.4495, | |
| "step": 666 | |
| }, | |
| { | |
| "epoch": 0.12574229427844283, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0009085031347962383, | |
| "loss": 2.7035, | |
| "step": 667 | |
| }, | |
| { | |
| "epoch": 0.12593081346026958, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.000908307210031348, | |
| "loss": 2.5528, | |
| "step": 668 | |
| }, | |
| { | |
| "epoch": 0.12611933264209632, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0009081112852664577, | |
| "loss": 2.5787, | |
| "step": 669 | |
| }, | |
| { | |
| "epoch": 0.1263078518239231, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.0009079153605015674, | |
| "loss": 2.6167, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.12649637100574984, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0009077194357366772, | |
| "loss": 2.7147, | |
| "step": 671 | |
| }, | |
| { | |
| "epoch": 0.1266848901875766, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0009075235109717868, | |
| "loss": 2.7819, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 0.12687340936940333, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009073275862068966, | |
| "loss": 2.5718, | |
| "step": 673 | |
| }, | |
| { | |
| "epoch": 0.12706192855123008, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.0009071316614420063, | |
| "loss": 2.6887, | |
| "step": 674 | |
| }, | |
| { | |
| "epoch": 0.12725044773305683, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.000906935736677116, | |
| "loss": 2.6037, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.1274389669148836, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.0009067398119122258, | |
| "loss": 2.6148, | |
| "step": 676 | |
| }, | |
| { | |
| "epoch": 0.12762748609671035, | |
| "grad_norm": 0.796875, | |
| "learning_rate": 0.0009065438871473355, | |
| "loss": 2.6737, | |
| "step": 677 | |
| }, | |
| { | |
| "epoch": 0.1278160052785371, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 0.0009063479623824452, | |
| "loss": 2.7679, | |
| "step": 678 | |
| }, | |
| { | |
| "epoch": 0.12800452446036384, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0009061520376175548, | |
| "loss": 2.6948, | |
| "step": 679 | |
| }, | |
| { | |
| "epoch": 0.12819304364219059, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0009059561128526646, | |
| "loss": 2.6185, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.12838156282401733, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 0.0009057601880877742, | |
| "loss": 2.7351, | |
| "step": 681 | |
| }, | |
| { | |
| "epoch": 0.1285700820058441, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.000905564263322884, | |
| "loss": 2.6394, | |
| "step": 682 | |
| }, | |
| { | |
| "epoch": 0.12875860118767085, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0009053683385579937, | |
| "loss": 2.7473, | |
| "step": 683 | |
| }, | |
| { | |
| "epoch": 0.1289471203694976, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0009051724137931034, | |
| "loss": 2.5965, | |
| "step": 684 | |
| }, | |
| { | |
| "epoch": 0.12913563955132434, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0009049764890282132, | |
| "loss": 2.8092, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.1293241587331511, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.0009047805642633229, | |
| "loss": 2.58, | |
| "step": 686 | |
| }, | |
| { | |
| "epoch": 0.12951267791497784, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009045846394984327, | |
| "loss": 2.6549, | |
| "step": 687 | |
| }, | |
| { | |
| "epoch": 0.1297011970968046, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0009043887147335423, | |
| "loss": 2.6374, | |
| "step": 688 | |
| }, | |
| { | |
| "epoch": 0.12988971627863136, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0009041927899686521, | |
| "loss": 2.5968, | |
| "step": 689 | |
| }, | |
| { | |
| "epoch": 0.1300782354604581, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0009039968652037618, | |
| "loss": 2.6792, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.13026675464228485, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.0009038009404388715, | |
| "loss": 2.6631, | |
| "step": 691 | |
| }, | |
| { | |
| "epoch": 0.1304552738241116, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 0.0009036050156739812, | |
| "loss": 2.7008, | |
| "step": 692 | |
| }, | |
| { | |
| "epoch": 0.13064379300593834, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.000903409090909091, | |
| "loss": 2.5822, | |
| "step": 693 | |
| }, | |
| { | |
| "epoch": 0.13083231218776512, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.0009032131661442007, | |
| "loss": 2.5259, | |
| "step": 694 | |
| }, | |
| { | |
| "epoch": 0.13102083136959186, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0009030172413793104, | |
| "loss": 2.5717, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.1312093505514186, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.0009028213166144202, | |
| "loss": 2.7658, | |
| "step": 696 | |
| }, | |
| { | |
| "epoch": 0.13139786973324535, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 0.0009026253918495298, | |
| "loss": 2.6304, | |
| "step": 697 | |
| }, | |
| { | |
| "epoch": 0.1315863889150721, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0009024294670846395, | |
| "loss": 2.6773, | |
| "step": 698 | |
| }, | |
| { | |
| "epoch": 0.13177490809689885, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 0.0009022335423197492, | |
| "loss": 2.6761, | |
| "step": 699 | |
| }, | |
| { | |
| "epoch": 0.13196342727872562, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 0.0009020376175548589, | |
| "loss": 2.6741, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.13215194646055237, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0009018416927899686, | |
| "loss": 2.5493, | |
| "step": 701 | |
| }, | |
| { | |
| "epoch": 0.1323404656423791, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0009016457680250784, | |
| "loss": 2.6143, | |
| "step": 702 | |
| }, | |
| { | |
| "epoch": 0.13252898482420586, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 0.000901449843260188, | |
| "loss": 2.6671, | |
| "step": 703 | |
| }, | |
| { | |
| "epoch": 0.1327175040060326, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009012539184952978, | |
| "loss": 2.6108, | |
| "step": 704 | |
| }, | |
| { | |
| "epoch": 0.13290602318785935, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0009010579937304076, | |
| "loss": 2.6649, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.13309454236968613, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0009008620689655172, | |
| "loss": 2.6925, | |
| "step": 706 | |
| }, | |
| { | |
| "epoch": 0.13328306155151287, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.000900666144200627, | |
| "loss": 2.6506, | |
| "step": 707 | |
| }, | |
| { | |
| "epoch": 0.13347158073333962, | |
| "grad_norm": 0.8046875, | |
| "learning_rate": 0.0009004702194357367, | |
| "loss": 2.7378, | |
| "step": 708 | |
| }, | |
| { | |
| "epoch": 0.13366009991516636, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0009002742946708464, | |
| "loss": 2.6705, | |
| "step": 709 | |
| }, | |
| { | |
| "epoch": 0.1338486190969931, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.0009000783699059561, | |
| "loss": 2.7019, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.13403713827881986, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.0008998824451410659, | |
| "loss": 2.6219, | |
| "step": 711 | |
| }, | |
| { | |
| "epoch": 0.13422565746064663, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 0.0008996865203761755, | |
| "loss": 2.5641, | |
| "step": 712 | |
| }, | |
| { | |
| "epoch": 0.13441417664247338, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 0.0008994905956112853, | |
| "loss": 2.6977, | |
| "step": 713 | |
| }, | |
| { | |
| "epoch": 0.13460269582430012, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0008992946708463951, | |
| "loss": 2.7332, | |
| "step": 714 | |
| }, | |
| { | |
| "epoch": 0.13479121500612687, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0008990987460815048, | |
| "loss": 2.5868, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.13497973418795361, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0008989028213166145, | |
| "loss": 2.6311, | |
| "step": 716 | |
| }, | |
| { | |
| "epoch": 0.1351682533697804, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.0008987068965517241, | |
| "loss": 2.6369, | |
| "step": 717 | |
| }, | |
| { | |
| "epoch": 0.13535677255160714, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0008985109717868339, | |
| "loss": 2.778, | |
| "step": 718 | |
| }, | |
| { | |
| "epoch": 0.13554529173343388, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 0.0008983150470219435, | |
| "loss": 2.6355, | |
| "step": 719 | |
| }, | |
| { | |
| "epoch": 0.13573381091526063, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0008981191222570533, | |
| "loss": 2.6524, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.13592233009708737, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.000897923197492163, | |
| "loss": 2.7543, | |
| "step": 721 | |
| }, | |
| { | |
| "epoch": 0.13611084927891412, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.0008977272727272727, | |
| "loss": 2.7062, | |
| "step": 722 | |
| }, | |
| { | |
| "epoch": 0.1362993684607409, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 0.0008975313479623825, | |
| "loss": 2.7506, | |
| "step": 723 | |
| }, | |
| { | |
| "epoch": 0.13648788764256764, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 0.0008973354231974922, | |
| "loss": 2.6015, | |
| "step": 724 | |
| }, | |
| { | |
| "epoch": 0.1366764068243944, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 0.0008971394984326019, | |
| "loss": 2.6391, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.13686492600622113, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 0.0008969435736677116, | |
| "loss": 2.7587, | |
| "step": 726 | |
| }, | |
| { | |
| "epoch": 0.13705344518804788, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.0008967476489028214, | |
| "loss": 2.7175, | |
| "step": 727 | |
| }, | |
| { | |
| "epoch": 0.13724196436987462, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.000896551724137931, | |
| "loss": 2.6185, | |
| "step": 728 | |
| }, | |
| { | |
| "epoch": 0.1374304835517014, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0008963557993730408, | |
| "loss": 2.5831, | |
| "step": 729 | |
| }, | |
| { | |
| "epoch": 0.13761900273352815, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 0.0008961598746081505, | |
| "loss": 2.6364, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.1378075219153549, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.0008959639498432602, | |
| "loss": 2.5928, | |
| "step": 731 | |
| }, | |
| { | |
| "epoch": 0.13799604109718164, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.00089576802507837, | |
| "loss": 2.6144, | |
| "step": 732 | |
| }, | |
| { | |
| "epoch": 0.13818456027900838, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0008955721003134797, | |
| "loss": 2.5887, | |
| "step": 733 | |
| }, | |
| { | |
| "epoch": 0.13837307946083513, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 0.0008953761755485894, | |
| "loss": 2.5888, | |
| "step": 734 | |
| }, | |
| { | |
| "epoch": 0.1385615986426619, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 0.000895180250783699, | |
| "loss": 2.5938, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.13875011782448865, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 0.0008949843260188088, | |
| "loss": 2.5876, | |
| "step": 736 | |
| }, | |
| { | |
| "epoch": 0.1389386370063154, | |
| "grad_norm": 0.62890625, | |
| "learning_rate": 0.0008947884012539184, | |
| "loss": 2.6677, | |
| "step": 737 | |
| }, | |
| { | |
| "epoch": 0.13912715618814214, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.0008945924764890282, | |
| "loss": 2.5932, | |
| "step": 738 | |
| }, | |
| { | |
| "epoch": 0.1393156753699689, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0008943965517241379, | |
| "loss": 2.6358, | |
| "step": 739 | |
| }, | |
| { | |
| "epoch": 0.13950419455179563, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0008942006269592476, | |
| "loss": 2.6529, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.1396927137336224, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0008940047021943573, | |
| "loss": 2.6557, | |
| "step": 741 | |
| }, | |
| { | |
| "epoch": 0.13988123291544916, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0008938087774294671, | |
| "loss": 2.6333, | |
| "step": 742 | |
| }, | |
| { | |
| "epoch": 0.1400697520972759, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.0008936128526645769, | |
| "loss": 2.5974, | |
| "step": 743 | |
| }, | |
| { | |
| "epoch": 0.14025827127910265, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.0008934169278996865, | |
| "loss": 2.6484, | |
| "step": 744 | |
| }, | |
| { | |
| "epoch": 0.1404467904609294, | |
| "grad_norm": 0.97265625, | |
| "learning_rate": 0.0008932210031347963, | |
| "loss": 2.617, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 0.14063530964275614, | |
| "grad_norm": 0.765625, | |
| "learning_rate": 0.000893025078369906, | |
| "loss": 2.6803, | |
| "step": 746 | |
| }, | |
| { | |
| "epoch": 0.14082382882458291, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0008928291536050157, | |
| "loss": 2.6882, | |
| "step": 747 | |
| }, | |
| { | |
| "epoch": 0.14101234800640966, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 0.0008926332288401254, | |
| "loss": 2.6814, | |
| "step": 748 | |
| }, | |
| { | |
| "epoch": 0.1412008671882364, | |
| "grad_norm": 0.75, | |
| "learning_rate": 0.0008924373040752352, | |
| "loss": 2.7618, | |
| "step": 749 | |
| }, | |
| { | |
| "epoch": 0.14138938637006315, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0008922413793103448, | |
| "loss": 2.7598, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.1415779055518899, | |
| "grad_norm": 0.91015625, | |
| "learning_rate": 0.0008920454545454546, | |
| "loss": 2.6827, | |
| "step": 751 | |
| }, | |
| { | |
| "epoch": 0.14176642473371664, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 0.0008918495297805644, | |
| "loss": 2.7531, | |
| "step": 752 | |
| }, | |
| { | |
| "epoch": 0.14195494391554342, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 0.000891653605015674, | |
| "loss": 2.5641, | |
| "step": 753 | |
| }, | |
| { | |
| "epoch": 0.14214346309737017, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.0008914576802507837, | |
| "loss": 2.6333, | |
| "step": 754 | |
| }, | |
| { | |
| "epoch": 0.1423319822791969, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0008912617554858934, | |
| "loss": 2.7232, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 0.14252050146102366, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.0008910658307210031, | |
| "loss": 2.6999, | |
| "step": 756 | |
| }, | |
| { | |
| "epoch": 0.1427090206428504, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0008908699059561128, | |
| "loss": 2.6267, | |
| "step": 757 | |
| }, | |
| { | |
| "epoch": 0.14289753982467715, | |
| "grad_norm": 0.765625, | |
| "learning_rate": 0.0008906739811912226, | |
| "loss": 2.6947, | |
| "step": 758 | |
| }, | |
| { | |
| "epoch": 0.14308605900650392, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 0.0008904780564263322, | |
| "loss": 2.6448, | |
| "step": 759 | |
| }, | |
| { | |
| "epoch": 0.14327457818833067, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.000890282131661442, | |
| "loss": 2.7433, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.14346309737015742, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.0008900862068965518, | |
| "loss": 2.6372, | |
| "step": 761 | |
| }, | |
| { | |
| "epoch": 0.14365161655198416, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0008898902821316614, | |
| "loss": 2.7054, | |
| "step": 762 | |
| }, | |
| { | |
| "epoch": 0.1438401357338109, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.0008896943573667712, | |
| "loss": 2.6433, | |
| "step": 763 | |
| }, | |
| { | |
| "epoch": 0.14402865491563765, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0008894984326018809, | |
| "loss": 2.6648, | |
| "step": 764 | |
| }, | |
| { | |
| "epoch": 0.14421717409746443, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0008893025078369906, | |
| "loss": 2.6961, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 0.14440569327929118, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0008891065830721003, | |
| "loss": 2.5835, | |
| "step": 766 | |
| }, | |
| { | |
| "epoch": 0.14459421246111792, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0008889106583072101, | |
| "loss": 2.6922, | |
| "step": 767 | |
| }, | |
| { | |
| "epoch": 0.14478273164294467, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.0008887147335423197, | |
| "loss": 2.5899, | |
| "step": 768 | |
| }, | |
| { | |
| "epoch": 0.1449712508247714, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.0008885188087774295, | |
| "loss": 2.6164, | |
| "step": 769 | |
| }, | |
| { | |
| "epoch": 0.14515977000659816, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0008883228840125393, | |
| "loss": 2.708, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.14534828918842493, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.000888126959247649, | |
| "loss": 2.59, | |
| "step": 771 | |
| }, | |
| { | |
| "epoch": 0.14553680837025168, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0008879310344827587, | |
| "loss": 2.5905, | |
| "step": 772 | |
| }, | |
| { | |
| "epoch": 0.14572532755207843, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0008877351097178683, | |
| "loss": 2.6069, | |
| "step": 773 | |
| }, | |
| { | |
| "epoch": 0.14591384673390517, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.0008875391849529781, | |
| "loss": 2.695, | |
| "step": 774 | |
| }, | |
| { | |
| "epoch": 0.14610236591573192, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0008873432601880877, | |
| "loss": 2.7418, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.14629088509755866, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0008871473354231975, | |
| "loss": 2.6499, | |
| "step": 776 | |
| }, | |
| { | |
| "epoch": 0.14647940427938544, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0008869514106583072, | |
| "loss": 2.6959, | |
| "step": 777 | |
| }, | |
| { | |
| "epoch": 0.14666792346121219, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0008867554858934169, | |
| "loss": 2.6255, | |
| "step": 778 | |
| }, | |
| { | |
| "epoch": 0.14685644264303893, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0008865595611285267, | |
| "loss": 2.6885, | |
| "step": 779 | |
| }, | |
| { | |
| "epoch": 0.14704496182486568, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0008863636363636364, | |
| "loss": 2.6154, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.14723348100669242, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 0.0008861677115987461, | |
| "loss": 2.6715, | |
| "step": 781 | |
| }, | |
| { | |
| "epoch": 0.14742200018851917, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0008859717868338558, | |
| "loss": 2.6356, | |
| "step": 782 | |
| }, | |
| { | |
| "epoch": 0.14761051937034594, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0008857758620689656, | |
| "loss": 2.6181, | |
| "step": 783 | |
| }, | |
| { | |
| "epoch": 0.1477990385521727, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.0008855799373040752, | |
| "loss": 2.6921, | |
| "step": 784 | |
| }, | |
| { | |
| "epoch": 0.14798755773399944, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 0.000885384012539185, | |
| "loss": 2.5067, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 0.14817607691582618, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0008851880877742947, | |
| "loss": 2.729, | |
| "step": 786 | |
| }, | |
| { | |
| "epoch": 0.14836459609765293, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.0008849921630094044, | |
| "loss": 2.6272, | |
| "step": 787 | |
| }, | |
| { | |
| "epoch": 0.14855311527947967, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 0.0008847962382445142, | |
| "loss": 2.5603, | |
| "step": 788 | |
| }, | |
| { | |
| "epoch": 0.14874163446130645, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0008846003134796239, | |
| "loss": 2.7072, | |
| "step": 789 | |
| }, | |
| { | |
| "epoch": 0.1489301536431332, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0008844043887147336, | |
| "loss": 2.6431, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.14911867282495994, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0008842084639498433, | |
| "loss": 2.6058, | |
| "step": 791 | |
| }, | |
| { | |
| "epoch": 0.1493071920067867, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 0.000884012539184953, | |
| "loss": 2.6369, | |
| "step": 792 | |
| }, | |
| { | |
| "epoch": 0.14949571118861343, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 0.0008838166144200626, | |
| "loss": 2.5803, | |
| "step": 793 | |
| }, | |
| { | |
| "epoch": 0.14968423037044018, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.0008836206896551724, | |
| "loss": 2.6228, | |
| "step": 794 | |
| }, | |
| { | |
| "epoch": 0.14987274955226695, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.0008834247648902821, | |
| "loss": 2.5875, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 0.1500612687340937, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 0.0008832288401253918, | |
| "loss": 2.6705, | |
| "step": 796 | |
| }, | |
| { | |
| "epoch": 0.15024978791592045, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 0.0008830329153605015, | |
| "loss": 2.6356, | |
| "step": 797 | |
| }, | |
| { | |
| "epoch": 0.1504383070977472, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.0008828369905956113, | |
| "loss": 2.6354, | |
| "step": 798 | |
| }, | |
| { | |
| "epoch": 0.15062682627957394, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0008826410658307211, | |
| "loss": 2.7054, | |
| "step": 799 | |
| }, | |
| { | |
| "epoch": 0.15081534546140068, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0008824451410658307, | |
| "loss": 2.5472, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.15100386464322746, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 0.0008822492163009405, | |
| "loss": 2.5917, | |
| "step": 801 | |
| }, | |
| { | |
| "epoch": 0.1511923838250542, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0008820532915360502, | |
| "loss": 2.6797, | |
| "step": 802 | |
| }, | |
| { | |
| "epoch": 0.15138090300688095, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0008818573667711599, | |
| "loss": 2.6894, | |
| "step": 803 | |
| }, | |
| { | |
| "epoch": 0.1515694221887077, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 0.0008816614420062696, | |
| "loss": 2.6542, | |
| "step": 804 | |
| }, | |
| { | |
| "epoch": 0.15175794137053444, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.0008814655172413794, | |
| "loss": 2.7165, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 0.1519464605523612, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.000881269592476489, | |
| "loss": 2.6396, | |
| "step": 806 | |
| }, | |
| { | |
| "epoch": 0.15213497973418796, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0008810736677115988, | |
| "loss": 2.5562, | |
| "step": 807 | |
| }, | |
| { | |
| "epoch": 0.1523234989160147, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0008808777429467086, | |
| "loss": 2.5097, | |
| "step": 808 | |
| }, | |
| { | |
| "epoch": 0.15251201809784146, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0008806818181818182, | |
| "loss": 2.6022, | |
| "step": 809 | |
| }, | |
| { | |
| "epoch": 0.1527005372796682, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0008804858934169279, | |
| "loss": 2.6144, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.15288905646149495, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0008802899686520376, | |
| "loss": 2.7956, | |
| "step": 811 | |
| }, | |
| { | |
| "epoch": 0.1530775756433217, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0008800940438871473, | |
| "loss": 2.6763, | |
| "step": 812 | |
| }, | |
| { | |
| "epoch": 0.15326609482514847, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.000879898119122257, | |
| "loss": 2.7261, | |
| "step": 813 | |
| }, | |
| { | |
| "epoch": 0.15345461400697522, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 0.0008797021943573668, | |
| "loss": 2.5969, | |
| "step": 814 | |
| }, | |
| { | |
| "epoch": 0.15364313318880196, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0008795062695924764, | |
| "loss": 2.5595, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 0.1538316523706287, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0008793103448275862, | |
| "loss": 2.4956, | |
| "step": 816 | |
| }, | |
| { | |
| "epoch": 0.15402017155245545, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.000879114420062696, | |
| "loss": 2.6691, | |
| "step": 817 | |
| }, | |
| { | |
| "epoch": 0.1542086907342822, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.0008789184952978056, | |
| "loss": 2.5205, | |
| "step": 818 | |
| }, | |
| { | |
| "epoch": 0.15439720991610897, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0008787225705329154, | |
| "loss": 2.5277, | |
| "step": 819 | |
| }, | |
| { | |
| "epoch": 0.15458572909793572, | |
| "grad_norm": 0.62890625, | |
| "learning_rate": 0.0008785266457680251, | |
| "loss": 2.6255, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.15477424827976247, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0008783307210031348, | |
| "loss": 2.6624, | |
| "step": 821 | |
| }, | |
| { | |
| "epoch": 0.1549627674615892, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 0.0008781347962382445, | |
| "loss": 2.6192, | |
| "step": 822 | |
| }, | |
| { | |
| "epoch": 0.15515128664341596, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0008779388714733543, | |
| "loss": 2.6637, | |
| "step": 823 | |
| }, | |
| { | |
| "epoch": 0.1553398058252427, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.000877742946708464, | |
| "loss": 2.6116, | |
| "step": 824 | |
| }, | |
| { | |
| "epoch": 0.15552832500706948, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0008775470219435737, | |
| "loss": 2.7295, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.15571684418889623, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0008773510971786835, | |
| "loss": 2.6604, | |
| "step": 826 | |
| }, | |
| { | |
| "epoch": 0.15590536337072297, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0008771551724137932, | |
| "loss": 2.661, | |
| "step": 827 | |
| }, | |
| { | |
| "epoch": 0.15609388255254972, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0008769592476489029, | |
| "loss": 2.6784, | |
| "step": 828 | |
| }, | |
| { | |
| "epoch": 0.15628240173437646, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0008767633228840125, | |
| "loss": 2.6322, | |
| "step": 829 | |
| }, | |
| { | |
| "epoch": 0.1564709209162032, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0008765673981191223, | |
| "loss": 2.6958, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.15665944009802998, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 0.0008763714733542319, | |
| "loss": 2.614, | |
| "step": 831 | |
| }, | |
| { | |
| "epoch": 0.15684795927985673, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0008761755485893417, | |
| "loss": 2.61, | |
| "step": 832 | |
| }, | |
| { | |
| "epoch": 0.15703647846168348, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0008759796238244514, | |
| "loss": 2.6638, | |
| "step": 833 | |
| }, | |
| { | |
| "epoch": 0.15722499764351022, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0008757836990595611, | |
| "loss": 2.5964, | |
| "step": 834 | |
| }, | |
| { | |
| "epoch": 0.15741351682533697, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0008755877742946709, | |
| "loss": 2.6957, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 0.15760203600716374, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 0.0008753918495297806, | |
| "loss": 2.5888, | |
| "step": 836 | |
| }, | |
| { | |
| "epoch": 0.1577905551889905, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.0008751959247648903, | |
| "loss": 2.644, | |
| "step": 837 | |
| }, | |
| { | |
| "epoch": 0.15797907437081724, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.000875, | |
| "loss": 2.772, | |
| "step": 838 | |
| }, | |
| { | |
| "epoch": 0.15816759355264398, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.0008748040752351098, | |
| "loss": 2.6483, | |
| "step": 839 | |
| }, | |
| { | |
| "epoch": 0.15835611273447073, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0008746081504702194, | |
| "loss": 2.5755, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.15854463191629747, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.0008744122257053292, | |
| "loss": 2.6584, | |
| "step": 841 | |
| }, | |
| { | |
| "epoch": 0.15873315109812425, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0008742163009404389, | |
| "loss": 2.6448, | |
| "step": 842 | |
| }, | |
| { | |
| "epoch": 0.158921670279951, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.0008740203761755486, | |
| "loss": 2.6187, | |
| "step": 843 | |
| }, | |
| { | |
| "epoch": 0.15911018946177774, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0008738244514106584, | |
| "loss": 2.7043, | |
| "step": 844 | |
| }, | |
| { | |
| "epoch": 0.1592987086436045, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0008736285266457681, | |
| "loss": 2.5718, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 0.15948722782543123, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0008734326018808778, | |
| "loss": 2.6218, | |
| "step": 846 | |
| }, | |
| { | |
| "epoch": 0.15967574700725798, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 0.0008732366771159875, | |
| "loss": 2.7162, | |
| "step": 847 | |
| }, | |
| { | |
| "epoch": 0.15986426618908475, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0008730407523510972, | |
| "loss": 2.7002, | |
| "step": 848 | |
| }, | |
| { | |
| "epoch": 0.1600527853709115, | |
| "grad_norm": 0.984375, | |
| "learning_rate": 0.0008728448275862068, | |
| "loss": 2.5591, | |
| "step": 849 | |
| }, | |
| { | |
| "epoch": 0.16024130455273825, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.0008726489028213166, | |
| "loss": 2.68, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.160429823734565, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0008724529780564263, | |
| "loss": 2.6753, | |
| "step": 851 | |
| }, | |
| { | |
| "epoch": 0.16061834291639174, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.000872257053291536, | |
| "loss": 2.6676, | |
| "step": 852 | |
| }, | |
| { | |
| "epoch": 0.16080686209821848, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 0.0008720611285266457, | |
| "loss": 2.6622, | |
| "step": 853 | |
| }, | |
| { | |
| "epoch": 0.16099538128004526, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 0.0008718652037617555, | |
| "loss": 2.8021, | |
| "step": 854 | |
| }, | |
| { | |
| "epoch": 0.161183900461872, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0008716692789968653, | |
| "loss": 2.6823, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 0.16137241964369875, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 0.0008714733542319749, | |
| "loss": 2.6915, | |
| "step": 856 | |
| }, | |
| { | |
| "epoch": 0.1615609388255255, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0008712774294670847, | |
| "loss": 2.6373, | |
| "step": 857 | |
| }, | |
| { | |
| "epoch": 0.16174945800735224, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0008710815047021944, | |
| "loss": 2.6126, | |
| "step": 858 | |
| }, | |
| { | |
| "epoch": 0.161937977189179, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0008708855799373041, | |
| "loss": 2.544, | |
| "step": 859 | |
| }, | |
| { | |
| "epoch": 0.16212649637100576, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0008706896551724138, | |
| "loss": 2.5462, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.1623150155528325, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0008704937304075236, | |
| "loss": 2.6703, | |
| "step": 861 | |
| }, | |
| { | |
| "epoch": 0.16250353473465926, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 0.0008702978056426332, | |
| "loss": 2.606, | |
| "step": 862 | |
| }, | |
| { | |
| "epoch": 0.162692053916486, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.000870101880877743, | |
| "loss": 2.5978, | |
| "step": 863 | |
| }, | |
| { | |
| "epoch": 0.16288057309831275, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.0008699059561128528, | |
| "loss": 2.6027, | |
| "step": 864 | |
| }, | |
| { | |
| "epoch": 0.1630690922801395, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.0008697100313479624, | |
| "loss": 2.654, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 0.16325761146196627, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 0.0008695141065830722, | |
| "loss": 2.4922, | |
| "step": 866 | |
| }, | |
| { | |
| "epoch": 0.16344613064379301, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0008693181818181818, | |
| "loss": 2.54, | |
| "step": 867 | |
| }, | |
| { | |
| "epoch": 0.16363464982561976, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.0008691222570532915, | |
| "loss": 2.6802, | |
| "step": 868 | |
| }, | |
| { | |
| "epoch": 0.1638231690074465, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 0.0008689263322884012, | |
| "loss": 2.6451, | |
| "step": 869 | |
| }, | |
| { | |
| "epoch": 0.16401168818927325, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 0.000868730407523511, | |
| "loss": 2.6301, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.1642002073711, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.0008685344827586206, | |
| "loss": 2.6155, | |
| "step": 871 | |
| }, | |
| { | |
| "epoch": 0.16438872655292677, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.0008683385579937304, | |
| "loss": 2.5815, | |
| "step": 872 | |
| }, | |
| { | |
| "epoch": 0.16457724573475352, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.0008681426332288402, | |
| "loss": 2.6225, | |
| "step": 873 | |
| }, | |
| { | |
| "epoch": 0.16476576491658027, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0008679467084639498, | |
| "loss": 2.6562, | |
| "step": 874 | |
| }, | |
| { | |
| "epoch": 0.164954284098407, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0008677507836990596, | |
| "loss": 2.6155, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.16514280328023376, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0008675548589341693, | |
| "loss": 2.6055, | |
| "step": 876 | |
| }, | |
| { | |
| "epoch": 0.1653313224620605, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 0.000867358934169279, | |
| "loss": 2.637, | |
| "step": 877 | |
| }, | |
| { | |
| "epoch": 0.16551984164388728, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0008671630094043887, | |
| "loss": 2.6913, | |
| "step": 878 | |
| }, | |
| { | |
| "epoch": 0.16570836082571402, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 0.0008669670846394985, | |
| "loss": 2.5859, | |
| "step": 879 | |
| }, | |
| { | |
| "epoch": 0.16589688000754077, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0008667711598746082, | |
| "loss": 2.7039, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.16608539918936752, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0008665752351097179, | |
| "loss": 2.662, | |
| "step": 881 | |
| }, | |
| { | |
| "epoch": 0.16627391837119426, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.0008663793103448277, | |
| "loss": 2.6888, | |
| "step": 882 | |
| }, | |
| { | |
| "epoch": 0.166462437553021, | |
| "grad_norm": 0.625, | |
| "learning_rate": 0.0008661833855799374, | |
| "loss": 2.6346, | |
| "step": 883 | |
| }, | |
| { | |
| "epoch": 0.16665095673484778, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.0008659874608150471, | |
| "loss": 2.6536, | |
| "step": 884 | |
| }, | |
| { | |
| "epoch": 0.16683947591667453, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.0008657915360501567, | |
| "loss": 2.5995, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 0.16702799509850128, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 0.0008655956112852665, | |
| "loss": 2.7111, | |
| "step": 886 | |
| }, | |
| { | |
| "epoch": 0.16721651428032802, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.0008653996865203761, | |
| "loss": 2.6629, | |
| "step": 887 | |
| }, | |
| { | |
| "epoch": 0.16740503346215477, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.0008652037617554859, | |
| "loss": 2.5553, | |
| "step": 888 | |
| }, | |
| { | |
| "epoch": 0.1675935526439815, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0008650078369905956, | |
| "loss": 2.7594, | |
| "step": 889 | |
| }, | |
| { | |
| "epoch": 0.1677820718258083, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0008648119122257053, | |
| "loss": 2.6729, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.16797059100763503, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.000864615987460815, | |
| "loss": 2.7915, | |
| "step": 891 | |
| }, | |
| { | |
| "epoch": 0.16815911018946178, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.0008644200626959248, | |
| "loss": 2.5758, | |
| "step": 892 | |
| }, | |
| { | |
| "epoch": 0.16834762937128853, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 0.0008642241379310345, | |
| "loss": 2.5708, | |
| "step": 893 | |
| }, | |
| { | |
| "epoch": 0.16853614855311527, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0008640282131661442, | |
| "loss": 2.6254, | |
| "step": 894 | |
| }, | |
| { | |
| "epoch": 0.16872466773494202, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.000863832288401254, | |
| "loss": 2.5748, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 0.1689131869167688, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0008636363636363636, | |
| "loss": 2.7726, | |
| "step": 896 | |
| }, | |
| { | |
| "epoch": 0.16910170609859554, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0008634404388714734, | |
| "loss": 2.6658, | |
| "step": 897 | |
| }, | |
| { | |
| "epoch": 0.16929022528042229, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.0008632445141065831, | |
| "loss": 2.6508, | |
| "step": 898 | |
| }, | |
| { | |
| "epoch": 0.16947874446224903, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 0.0008630485893416928, | |
| "loss": 2.6266, | |
| "step": 899 | |
| }, | |
| { | |
| "epoch": 0.16966726364407578, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.0008628526645768026, | |
| "loss": 2.7459, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.16985578282590252, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0008626567398119123, | |
| "loss": 2.587, | |
| "step": 901 | |
| }, | |
| { | |
| "epoch": 0.1700443020077293, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 0.000862460815047022, | |
| "loss": 2.6079, | |
| "step": 902 | |
| }, | |
| { | |
| "epoch": 0.17023282118955604, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0008622648902821317, | |
| "loss": 2.7354, | |
| "step": 903 | |
| }, | |
| { | |
| "epoch": 0.1704213403713828, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 0.0008620689655172414, | |
| "loss": 2.5455, | |
| "step": 904 | |
| }, | |
| { | |
| "epoch": 0.17060985955320954, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.000861873040752351, | |
| "loss": 2.6302, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 0.17079837873503628, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 0.0008616771159874608, | |
| "loss": 2.4757, | |
| "step": 906 | |
| }, | |
| { | |
| "epoch": 0.17098689791686303, | |
| "grad_norm": 0.75, | |
| "learning_rate": 0.0008614811912225705, | |
| "loss": 2.6884, | |
| "step": 907 | |
| }, | |
| { | |
| "epoch": 0.1711754170986898, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0008612852664576803, | |
| "loss": 2.5877, | |
| "step": 908 | |
| }, | |
| { | |
| "epoch": 0.17136393628051655, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.0008610893416927899, | |
| "loss": 2.6323, | |
| "step": 909 | |
| }, | |
| { | |
| "epoch": 0.1715524554623433, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0008608934169278997, | |
| "loss": 2.657, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.17174097464417004, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0008606974921630095, | |
| "loss": 2.6591, | |
| "step": 911 | |
| }, | |
| { | |
| "epoch": 0.1719294938259968, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0008605015673981191, | |
| "loss": 2.6385, | |
| "step": 912 | |
| }, | |
| { | |
| "epoch": 0.17211801300782353, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 0.0008603056426332289, | |
| "loss": 2.7853, | |
| "step": 913 | |
| }, | |
| { | |
| "epoch": 0.1723065321896503, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.0008601097178683386, | |
| "loss": 2.7299, | |
| "step": 914 | |
| }, | |
| { | |
| "epoch": 0.17249505137147705, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0008599137931034483, | |
| "loss": 2.7032, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 0.1726835705533038, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.000859717868338558, | |
| "loss": 2.6568, | |
| "step": 916 | |
| }, | |
| { | |
| "epoch": 0.17287208973513055, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.0008595219435736678, | |
| "loss": 2.8574, | |
| "step": 917 | |
| }, | |
| { | |
| "epoch": 0.1730606089169573, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0008593260188087774, | |
| "loss": 2.7283, | |
| "step": 918 | |
| }, | |
| { | |
| "epoch": 0.17324912809878404, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.0008591300940438872, | |
| "loss": 2.712, | |
| "step": 919 | |
| }, | |
| { | |
| "epoch": 0.1734376472806108, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.000858934169278997, | |
| "loss": 2.5807, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.17362616646243756, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0008587382445141066, | |
| "loss": 2.6998, | |
| "step": 921 | |
| }, | |
| { | |
| "epoch": 0.1738146856442643, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0008585423197492164, | |
| "loss": 2.5734, | |
| "step": 922 | |
| }, | |
| { | |
| "epoch": 0.17400320482609105, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.000858346394984326, | |
| "loss": 2.6374, | |
| "step": 923 | |
| }, | |
| { | |
| "epoch": 0.1741917240079178, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 0.0008581504702194357, | |
| "loss": 2.5408, | |
| "step": 924 | |
| }, | |
| { | |
| "epoch": 0.17438024318974454, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.0008579545454545454, | |
| "loss": 2.661, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.17456876237157132, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0008577586206896552, | |
| "loss": 2.701, | |
| "step": 926 | |
| }, | |
| { | |
| "epoch": 0.17475728155339806, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0008575626959247648, | |
| "loss": 2.71, | |
| "step": 927 | |
| }, | |
| { | |
| "epoch": 0.1749458007352248, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0008573667711598746, | |
| "loss": 2.6468, | |
| "step": 928 | |
| }, | |
| { | |
| "epoch": 0.17513431991705156, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 0.0008571708463949844, | |
| "loss": 2.5039, | |
| "step": 929 | |
| }, | |
| { | |
| "epoch": 0.1753228390988783, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.000856974921630094, | |
| "loss": 2.6299, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.17551135828070505, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0008567789968652038, | |
| "loss": 2.7097, | |
| "step": 931 | |
| }, | |
| { | |
| "epoch": 0.17569987746253182, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.0008565830721003135, | |
| "loss": 2.662, | |
| "step": 932 | |
| }, | |
| { | |
| "epoch": 0.17588839664435857, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0008563871473354232, | |
| "loss": 2.6554, | |
| "step": 933 | |
| }, | |
| { | |
| "epoch": 0.17607691582618532, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0008561912225705329, | |
| "loss": 2.5157, | |
| "step": 934 | |
| }, | |
| { | |
| "epoch": 0.17626543500801206, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0008559952978056427, | |
| "loss": 2.658, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 0.1764539541898388, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0008557993730407524, | |
| "loss": 2.5697, | |
| "step": 936 | |
| }, | |
| { | |
| "epoch": 0.17664247337166555, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0008556034482758621, | |
| "loss": 2.6467, | |
| "step": 937 | |
| }, | |
| { | |
| "epoch": 0.17683099255349233, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0008554075235109719, | |
| "loss": 2.7308, | |
| "step": 938 | |
| }, | |
| { | |
| "epoch": 0.17701951173531907, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0008552115987460816, | |
| "loss": 2.5552, | |
| "step": 939 | |
| }, | |
| { | |
| "epoch": 0.17720803091714582, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.0008550156739811913, | |
| "loss": 2.7391, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.17739655009897257, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 0.000854819749216301, | |
| "loss": 2.5914, | |
| "step": 941 | |
| }, | |
| { | |
| "epoch": 0.1775850692807993, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0008546238244514107, | |
| "loss": 2.6652, | |
| "step": 942 | |
| }, | |
| { | |
| "epoch": 0.17777358846262606, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0008544278996865203, | |
| "loss": 2.7119, | |
| "step": 943 | |
| }, | |
| { | |
| "epoch": 0.17796210764445283, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0008542319749216301, | |
| "loss": 2.7099, | |
| "step": 944 | |
| }, | |
| { | |
| "epoch": 0.17815062682627958, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0008540360501567398, | |
| "loss": 2.6452, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 0.17833914600810633, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0008538401253918495, | |
| "loss": 2.5148, | |
| "step": 946 | |
| }, | |
| { | |
| "epoch": 0.17852766518993307, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0008536442006269592, | |
| "loss": 2.6379, | |
| "step": 947 | |
| }, | |
| { | |
| "epoch": 0.17871618437175982, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.000853448275862069, | |
| "loss": 2.6155, | |
| "step": 948 | |
| }, | |
| { | |
| "epoch": 0.17890470355358656, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0008532523510971787, | |
| "loss": 2.5122, | |
| "step": 949 | |
| }, | |
| { | |
| "epoch": 0.17909322273541334, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0008530564263322884, | |
| "loss": 2.7232, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.17928174191724008, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.0008528605015673982, | |
| "loss": 2.7196, | |
| "step": 951 | |
| }, | |
| { | |
| "epoch": 0.17947026109906683, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 0.0008526645768025078, | |
| "loss": 2.5332, | |
| "step": 952 | |
| }, | |
| { | |
| "epoch": 0.17965878028089358, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 0.0008524686520376176, | |
| "loss": 2.7149, | |
| "step": 953 | |
| }, | |
| { | |
| "epoch": 0.17984729946272032, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.0008522727272727273, | |
| "loss": 2.7601, | |
| "step": 954 | |
| }, | |
| { | |
| "epoch": 0.1800358186445471, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.000852076802507837, | |
| "loss": 2.6729, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 0.18022433782637384, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0008518808777429467, | |
| "loss": 2.5701, | |
| "step": 956 | |
| }, | |
| { | |
| "epoch": 0.1804128570082006, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0008516849529780565, | |
| "loss": 2.6001, | |
| "step": 957 | |
| }, | |
| { | |
| "epoch": 0.18060137619002734, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0008514890282131662, | |
| "loss": 2.7079, | |
| "step": 958 | |
| }, | |
| { | |
| "epoch": 0.18078989537185408, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0008512931034482759, | |
| "loss": 2.6712, | |
| "step": 959 | |
| }, | |
| { | |
| "epoch": 0.18097841455368083, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 0.0008510971786833856, | |
| "loss": 2.722, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.1811669337355076, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.0008509012539184952, | |
| "loss": 2.5951, | |
| "step": 961 | |
| }, | |
| { | |
| "epoch": 0.18135545291733435, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 0.000850705329153605, | |
| "loss": 2.6927, | |
| "step": 962 | |
| }, | |
| { | |
| "epoch": 0.1815439720991611, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0008505094043887147, | |
| "loss": 2.6872, | |
| "step": 963 | |
| }, | |
| { | |
| "epoch": 0.18173249128098784, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0008503134796238245, | |
| "loss": 2.498, | |
| "step": 964 | |
| }, | |
| { | |
| "epoch": 0.1819210104628146, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0008501175548589341, | |
| "loss": 2.7254, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 0.18210952964464133, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0008499216300940439, | |
| "loss": 2.6044, | |
| "step": 966 | |
| }, | |
| { | |
| "epoch": 0.1822980488264681, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 0.0008497257053291537, | |
| "loss": 2.6202, | |
| "step": 967 | |
| }, | |
| { | |
| "epoch": 0.18248656800829485, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0008495297805642633, | |
| "loss": 2.6671, | |
| "step": 968 | |
| }, | |
| { | |
| "epoch": 0.1826750871901216, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.0008493338557993731, | |
| "loss": 2.7925, | |
| "step": 969 | |
| }, | |
| { | |
| "epoch": 0.18286360637194835, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.0008491379310344828, | |
| "loss": 2.5893, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.1830521255537751, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 0.0008489420062695925, | |
| "loss": 2.6704, | |
| "step": 971 | |
| }, | |
| { | |
| "epoch": 0.18324064473560184, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0008487460815047022, | |
| "loss": 2.6642, | |
| "step": 972 | |
| }, | |
| { | |
| "epoch": 0.1834291639174286, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.000848550156739812, | |
| "loss": 2.6579, | |
| "step": 973 | |
| }, | |
| { | |
| "epoch": 0.18361768309925536, | |
| "grad_norm": 0.62890625, | |
| "learning_rate": 0.0008483542319749216, | |
| "loss": 2.47, | |
| "step": 974 | |
| }, | |
| { | |
| "epoch": 0.1838062022810821, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0008481583072100314, | |
| "loss": 2.5976, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.18399472146290885, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0008479623824451412, | |
| "loss": 2.7708, | |
| "step": 976 | |
| }, | |
| { | |
| "epoch": 0.1841832406447356, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0008477664576802508, | |
| "loss": 2.6823, | |
| "step": 977 | |
| }, | |
| { | |
| "epoch": 0.18437175982656234, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 0.0008475705329153606, | |
| "loss": 2.6927, | |
| "step": 978 | |
| }, | |
| { | |
| "epoch": 0.18456027900838912, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0008473746081504702, | |
| "loss": 2.682, | |
| "step": 979 | |
| }, | |
| { | |
| "epoch": 0.18474879819021586, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0008471786833855799, | |
| "loss": 2.6509, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.1849373173720426, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0008469827586206896, | |
| "loss": 2.7625, | |
| "step": 981 | |
| }, | |
| { | |
| "epoch": 0.18512583655386936, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0008467868338557994, | |
| "loss": 2.7144, | |
| "step": 982 | |
| }, | |
| { | |
| "epoch": 0.1853143557356961, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.000846590909090909, | |
| "loss": 2.7065, | |
| "step": 983 | |
| }, | |
| { | |
| "epoch": 0.18550287491752285, | |
| "grad_norm": 0.625, | |
| "learning_rate": 0.0008463949843260188, | |
| "loss": 2.5608, | |
| "step": 984 | |
| }, | |
| { | |
| "epoch": 0.18569139409934962, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 0.0008461990595611286, | |
| "loss": 2.587, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 0.18587991328117637, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0008460031347962382, | |
| "loss": 2.5719, | |
| "step": 986 | |
| }, | |
| { | |
| "epoch": 0.18606843246300311, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.000845807210031348, | |
| "loss": 2.6672, | |
| "step": 987 | |
| }, | |
| { | |
| "epoch": 0.18625695164482986, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.0008456112852664577, | |
| "loss": 2.6857, | |
| "step": 988 | |
| }, | |
| { | |
| "epoch": 0.1864454708266566, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0008454153605015674, | |
| "loss": 2.6708, | |
| "step": 989 | |
| }, | |
| { | |
| "epoch": 0.18663399000848335, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0008452194357366771, | |
| "loss": 2.6183, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.18682250919031013, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0008450235109717869, | |
| "loss": 2.6452, | |
| "step": 991 | |
| }, | |
| { | |
| "epoch": 0.18701102837213687, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 0.0008448275862068966, | |
| "loss": 2.704, | |
| "step": 992 | |
| }, | |
| { | |
| "epoch": 0.18719954755396362, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0008446316614420063, | |
| "loss": 2.674, | |
| "step": 993 | |
| }, | |
| { | |
| "epoch": 0.18738806673579037, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.000844435736677116, | |
| "loss": 2.7137, | |
| "step": 994 | |
| }, | |
| { | |
| "epoch": 0.1875765859176171, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0008442398119122258, | |
| "loss": 2.7582, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 0.18776510509944386, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0008440438871473355, | |
| "loss": 2.6009, | |
| "step": 996 | |
| }, | |
| { | |
| "epoch": 0.18795362428127063, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.0008438479623824452, | |
| "loss": 2.6595, | |
| "step": 997 | |
| }, | |
| { | |
| "epoch": 0.18814214346309738, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0008436520376175549, | |
| "loss": 2.552, | |
| "step": 998 | |
| }, | |
| { | |
| "epoch": 0.18833066264492412, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0008434561128526645, | |
| "loss": 2.5806, | |
| "step": 999 | |
| }, | |
| { | |
| "epoch": 0.18851918182675087, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0008432601880877743, | |
| "loss": 2.6469, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.18851918182675087, | |
| "eval_runtime": 16.219, | |
| "eval_samples_per_second": 63.136, | |
| "eval_steps_per_second": 1.973, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.18851918182675087, | |
| "eval/hellaswag_acc": 0.3743278231428002, | |
| "eval/hellaswag_acc_norm": 0.4706233817964549, | |
| "eval_hellaswag_elapsed_time": 116.27660393714905, | |
| "step": 1000 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 5304, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.325965577388032e+18, | |
| "train_batch_size": 12, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |