| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 4.0, | |
| "eval_steps": 500, | |
| "global_step": 816, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.004920049200492005, | |
| "grad_norm": 483.4133605957031, | |
| "learning_rate": 0.0, | |
| "loss": 0.95, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.00984009840098401, | |
| "grad_norm": 414.1131286621094, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 0.7919, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.014760147601476014, | |
| "grad_norm": 20.664552688598633, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 0.2639, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.01968019680196802, | |
| "grad_norm": 19.735389709472656, | |
| "learning_rate": 2.4e-05, | |
| "loss": 0.3098, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.024600246002460024, | |
| "grad_norm": 630.93115234375, | |
| "learning_rate": 3.2000000000000005e-05, | |
| "loss": 1.3057, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.02952029520295203, | |
| "grad_norm": 108.70830535888672, | |
| "learning_rate": 4e-05, | |
| "loss": 0.784, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.03444034440344403, | |
| "grad_norm": 25.684120178222656, | |
| "learning_rate": 4.8e-05, | |
| "loss": 0.6977, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.03936039360393604, | |
| "grad_norm": 28.066545486450195, | |
| "learning_rate": 5.6000000000000006e-05, | |
| "loss": 0.6476, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.04428044280442804, | |
| "grad_norm": 76.08965301513672, | |
| "learning_rate": 6.400000000000001e-05, | |
| "loss": 0.5767, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.04920049200492005, | |
| "grad_norm": 14.550333023071289, | |
| "learning_rate": 7.2e-05, | |
| "loss": 0.4199, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.05412054120541206, | |
| "grad_norm": 66.70437622070312, | |
| "learning_rate": 8e-05, | |
| "loss": 0.6942, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.05904059040590406, | |
| "grad_norm": 73.81010437011719, | |
| "learning_rate": 8.800000000000001e-05, | |
| "loss": 1.517, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.06396063960639606, | |
| "grad_norm": 6.138183116912842, | |
| "learning_rate": 9.6e-05, | |
| "loss": 0.3904, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.06888068880688807, | |
| "grad_norm": 11.106021881103516, | |
| "learning_rate": 0.00010400000000000001, | |
| "loss": 0.4124, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.07380073800738007, | |
| "grad_norm": 12.445630073547363, | |
| "learning_rate": 0.00011200000000000001, | |
| "loss": 0.4418, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.07872078720787208, | |
| "grad_norm": 7.307021141052246, | |
| "learning_rate": 0.00012, | |
| "loss": 0.4751, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.08364083640836409, | |
| "grad_norm": 19.08457374572754, | |
| "learning_rate": 0.00012800000000000002, | |
| "loss": 0.5671, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.08856088560885608, | |
| "grad_norm": 11.03348159790039, | |
| "learning_rate": 0.00013600000000000003, | |
| "loss": 0.4441, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.09348093480934809, | |
| "grad_norm": 155.23716735839844, | |
| "learning_rate": 0.000144, | |
| "loss": 0.5707, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.0984009840098401, | |
| "grad_norm": 7.583343029022217, | |
| "learning_rate": 0.000152, | |
| "loss": 0.4662, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.1033210332103321, | |
| "grad_norm": 6.289183139801025, | |
| "learning_rate": 0.00016, | |
| "loss": 0.4997, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.10824108241082411, | |
| "grad_norm": 24.76793098449707, | |
| "learning_rate": 0.000168, | |
| "loss": 0.491, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.11316113161131611, | |
| "grad_norm": 17.512184143066406, | |
| "learning_rate": 0.00017600000000000002, | |
| "loss": 0.4472, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.11808118081180811, | |
| "grad_norm": 16.47793960571289, | |
| "learning_rate": 0.00018400000000000003, | |
| "loss": 0.5235, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.12300123001230012, | |
| "grad_norm": 8.312434196472168, | |
| "learning_rate": 0.000192, | |
| "loss": 0.4818, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.12792127921279212, | |
| "grad_norm": 83.81122589111328, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5259, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.13284132841328414, | |
| "grad_norm": 8.704654693603516, | |
| "learning_rate": 0.000199999211292062, | |
| "loss": 0.4744, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.13776137761377613, | |
| "grad_norm": 2.8565006256103516, | |
| "learning_rate": 0.00019999684518068916, | |
| "loss": 0.4066, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.14268142681426815, | |
| "grad_norm": 5.916236877441406, | |
| "learning_rate": 0.00019999290170320485, | |
| "loss": 0.4154, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.14760147601476015, | |
| "grad_norm": 5.697567462921143, | |
| "learning_rate": 0.00019998738092181421, | |
| "loss": 0.4639, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.15252152521525214, | |
| "grad_norm": 1.615671157836914, | |
| "learning_rate": 0.00019998028292360286, | |
| "loss": 0.4108, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.15744157441574416, | |
| "grad_norm": 11.121039390563965, | |
| "learning_rate": 0.00019997160782053578, | |
| "loss": 0.449, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.16236162361623616, | |
| "grad_norm": 7.386440277099609, | |
| "learning_rate": 0.00019996135574945544, | |
| "loss": 0.4216, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.16728167281672818, | |
| "grad_norm": 84.96491241455078, | |
| "learning_rate": 0.00019994952687207954, | |
| "loss": 0.5282, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.17220172201722017, | |
| "grad_norm": 6.692220211029053, | |
| "learning_rate": 0.00019993612137499876, | |
| "loss": 0.5036, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.17712177121771217, | |
| "grad_norm": 5.1402363777160645, | |
| "learning_rate": 0.00019992113946967353, | |
| "loss": 0.4041, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.1820418204182042, | |
| "grad_norm": 3.2179603576660156, | |
| "learning_rate": 0.00019990458139243077, | |
| "loss": 0.398, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.18696186961869618, | |
| "grad_norm": 5.34651517868042, | |
| "learning_rate": 0.00019988644740446022, | |
| "loss": 0.4233, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.1918819188191882, | |
| "grad_norm": 4.085568428039551, | |
| "learning_rate": 0.00019986673779181033, | |
| "loss": 0.3935, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.1968019680196802, | |
| "grad_norm": 1.318534255027771, | |
| "learning_rate": 0.0001998454528653836, | |
| "loss": 0.3458, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.2017220172201722, | |
| "grad_norm": 3.834606409072876, | |
| "learning_rate": 0.0001998225929609319, | |
| "loss": 0.3819, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.2066420664206642, | |
| "grad_norm": 4.840269088745117, | |
| "learning_rate": 0.00019979815843905097, | |
| "loss": 0.473, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.2115621156211562, | |
| "grad_norm": 498.64990234375, | |
| "learning_rate": 0.0001997721496851748, | |
| "loss": 3.6745, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.21648216482164823, | |
| "grad_norm": 4.956181526184082, | |
| "learning_rate": 0.00019974456710956964, | |
| "loss": 0.3385, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.22140221402214022, | |
| "grad_norm": 6.580547332763672, | |
| "learning_rate": 0.00019971541114732741, | |
| "loss": 0.4277, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.22632226322263221, | |
| "grad_norm": 37.05827713012695, | |
| "learning_rate": 0.0001996846822583589, | |
| "loss": 0.8045, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.23124231242312424, | |
| "grad_norm": 5.152987480163574, | |
| "learning_rate": 0.00019965238092738643, | |
| "loss": 0.4173, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.23616236162361623, | |
| "grad_norm": 277.78857421875, | |
| "learning_rate": 0.0001996185076639364, | |
| "loss": 0.8626, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.24108241082410825, | |
| "grad_norm": 2.9399607181549072, | |
| "learning_rate": 0.00019958306300233098, | |
| "loss": 0.3167, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.24600246002460024, | |
| "grad_norm": 3.668168306350708, | |
| "learning_rate": 0.00019954604750167993, | |
| "loss": 0.4422, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.25092250922509224, | |
| "grad_norm": 4.103700637817383, | |
| "learning_rate": 0.00019950746174587163, | |
| "loss": 0.3683, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.25584255842558423, | |
| "grad_norm": 0.7908763885498047, | |
| "learning_rate": 0.0001994673063435639, | |
| "loss": 0.3834, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.2607626076260763, | |
| "grad_norm": 1.0205233097076416, | |
| "learning_rate": 0.0001994255819281744, | |
| "loss": 0.375, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.2656826568265683, | |
| "grad_norm": 3.6355137825012207, | |
| "learning_rate": 0.0001993822891578708, | |
| "loss": 0.4557, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.27060270602706027, | |
| "grad_norm": 6.3725409507751465, | |
| "learning_rate": 0.00019933742871556, | |
| "loss": 0.4183, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.27552275522755226, | |
| "grad_norm": 6.519746780395508, | |
| "learning_rate": 0.00019929100130887782, | |
| "loss": 0.4416, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.28044280442804426, | |
| "grad_norm": 3.950495719909668, | |
| "learning_rate": 0.0001992430076701775, | |
| "loss": 0.4255, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.2853628536285363, | |
| "grad_norm": 2.0773677825927734, | |
| "learning_rate": 0.00019919344855651833, | |
| "loss": 0.3393, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.2902829028290283, | |
| "grad_norm": 8.755096435546875, | |
| "learning_rate": 0.00019914232474965365, | |
| "loss": 0.5312, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.2952029520295203, | |
| "grad_norm": 4.103138446807861, | |
| "learning_rate": 0.00019908963705601846, | |
| "loss": 0.4104, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.3001230012300123, | |
| "grad_norm": 3.1862292289733887, | |
| "learning_rate": 0.0001990353863067169, | |
| "loss": 0.4211, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.3050430504305043, | |
| "grad_norm": 3.5777299404144287, | |
| "learning_rate": 0.00019897957335750878, | |
| "loss": 0.38, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.30996309963099633, | |
| "grad_norm": 3.0276339054107666, | |
| "learning_rate": 0.00019892219908879653, | |
| "loss": 0.4561, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.3148831488314883, | |
| "grad_norm": 1.853022575378418, | |
| "learning_rate": 0.00019886326440561093, | |
| "loss": 0.3874, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.3198031980319803, | |
| "grad_norm": 5.521330833435059, | |
| "learning_rate": 0.00019880277023759702, | |
| "loss": 0.459, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.3247232472324723, | |
| "grad_norm": 8.374741554260254, | |
| "learning_rate": 0.0001987407175389994, | |
| "loss": 0.4025, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.3296432964329643, | |
| "grad_norm": 3.8265085220336914, | |
| "learning_rate": 0.0001986771072886472, | |
| "loss": 0.4654, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.33456334563345635, | |
| "grad_norm": 2.002042770385742, | |
| "learning_rate": 0.00019861194048993863, | |
| "loss": 0.312, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.33948339483394835, | |
| "grad_norm": 6.2479634284973145, | |
| "learning_rate": 0.0001985452181708251, | |
| "loss": 0.4739, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.34440344403444034, | |
| "grad_norm": 3.072579860687256, | |
| "learning_rate": 0.00019847694138379506, | |
| "loss": 0.4282, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.34932349323493234, | |
| "grad_norm": 1.4464001655578613, | |
| "learning_rate": 0.0001984071112058574, | |
| "loss": 0.4165, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.35424354243542433, | |
| "grad_norm": 1.2664532661437988, | |
| "learning_rate": 0.00019833572873852444, | |
| "loss": 0.4575, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.3591635916359164, | |
| "grad_norm": 9.184704780578613, | |
| "learning_rate": 0.00019826279510779454, | |
| "loss": 0.4957, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.3640836408364084, | |
| "grad_norm": 8.479774475097656, | |
| "learning_rate": 0.00019818831146413434, | |
| "loss": 0.5062, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.36900369003690037, | |
| "grad_norm": 6.585694789886475, | |
| "learning_rate": 0.0001981122789824607, | |
| "loss": 0.4461, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.37392373923739236, | |
| "grad_norm": 2.46947979927063, | |
| "learning_rate": 0.0001980346988621221, | |
| "loss": 0.4175, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.37884378843788435, | |
| "grad_norm": 5.467379570007324, | |
| "learning_rate": 0.00019795557232687956, | |
| "loss": 0.4634, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.3837638376383764, | |
| "grad_norm": 3.7511723041534424, | |
| "learning_rate": 0.0001978749006248877, | |
| "loss": 0.466, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.3886838868388684, | |
| "grad_norm": 4.01120138168335, | |
| "learning_rate": 0.00019779268502867473, | |
| "loss": 0.5087, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.3936039360393604, | |
| "grad_norm": 3.0289227962493896, | |
| "learning_rate": 0.0001977089268351225, | |
| "loss": 0.4315, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.3985239852398524, | |
| "grad_norm": 2.6868069171905518, | |
| "learning_rate": 0.00019762362736544607, | |
| "loss": 0.3795, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.4034440344403444, | |
| "grad_norm": 3.252519130706787, | |
| "learning_rate": 0.00019753678796517282, | |
| "loss": 0.3436, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.40836408364083643, | |
| "grad_norm": 3.5098648071289062, | |
| "learning_rate": 0.00019744841000412123, | |
| "loss": 0.3921, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.4132841328413284, | |
| "grad_norm": 4.654256820678711, | |
| "learning_rate": 0.00019735849487637929, | |
| "loss": 0.4444, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.4182041820418204, | |
| "grad_norm": 1.7671858072280884, | |
| "learning_rate": 0.0001972670440002825, | |
| "loss": 0.3749, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.4231242312423124, | |
| "grad_norm": 2.725391387939453, | |
| "learning_rate": 0.00019717405881839145, | |
| "loss": 0.4183, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.4280442804428044, | |
| "grad_norm": 1.985857605934143, | |
| "learning_rate": 0.00019707954079746927, | |
| "loss": 0.3992, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.43296432964329645, | |
| "grad_norm": 4.41717004776001, | |
| "learning_rate": 0.00019698349142845814, | |
| "loss": 0.4746, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.43788437884378845, | |
| "grad_norm": 4.98541784286499, | |
| "learning_rate": 0.00019688591222645607, | |
| "loss": 0.3828, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.44280442804428044, | |
| "grad_norm": 5.29671573638916, | |
| "learning_rate": 0.00019678680473069293, | |
| "loss": 0.3513, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.44772447724477243, | |
| "grad_norm": 2.3669795989990234, | |
| "learning_rate": 0.00019668617050450603, | |
| "loss": 0.3433, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.45264452644526443, | |
| "grad_norm": 3.3942222595214844, | |
| "learning_rate": 0.00019658401113531565, | |
| "loss": 0.4033, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.4575645756457565, | |
| "grad_norm": 2.1776537895202637, | |
| "learning_rate": 0.00019648032823459994, | |
| "loss": 0.2924, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.46248462484624847, | |
| "grad_norm": 3.5817902088165283, | |
| "learning_rate": 0.00019637512343786937, | |
| "loss": 0.3886, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.46740467404674046, | |
| "grad_norm": 5.881927490234375, | |
| "learning_rate": 0.00019626839840464119, | |
| "loss": 0.5516, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.47232472324723246, | |
| "grad_norm": 3.89084529876709, | |
| "learning_rate": 0.0001961601548184129, | |
| "loss": 0.5291, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.47724477244772445, | |
| "grad_norm": 1.7908971309661865, | |
| "learning_rate": 0.00019605039438663614, | |
| "loss": 0.4671, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.4821648216482165, | |
| "grad_norm": 3.8980045318603516, | |
| "learning_rate": 0.0001959391188406893, | |
| "loss": 0.4105, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.4870848708487085, | |
| "grad_norm": 5.375885486602783, | |
| "learning_rate": 0.00019582632993585052, | |
| "loss": 0.4652, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.4920049200492005, | |
| "grad_norm": 4.329046249389648, | |
| "learning_rate": 0.00019571202945126994, | |
| "loss": 0.4507, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.4969249692496925, | |
| "grad_norm": 3.3222126960754395, | |
| "learning_rate": 0.0001955962191899415, | |
| "loss": 0.43, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.5018450184501845, | |
| "grad_norm": 5.076950550079346, | |
| "learning_rate": 0.00019547890097867468, | |
| "loss": 0.3756, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.5067650676506765, | |
| "grad_norm": 4.1895294189453125, | |
| "learning_rate": 0.00019536007666806556, | |
| "loss": 0.4545, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.5116851168511685, | |
| "grad_norm": 2.738429069519043, | |
| "learning_rate": 0.00019523974813246767, | |
| "loss": 0.3971, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.5166051660516605, | |
| "grad_norm": 1.7775121927261353, | |
| "learning_rate": 0.00019511791726996243, | |
| "loss": 0.3815, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.5215252152521526, | |
| "grad_norm": 0.5655261874198914, | |
| "learning_rate": 0.0001949945860023292, | |
| "loss": 0.308, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.5264452644526445, | |
| "grad_norm": 2.861567258834839, | |
| "learning_rate": 0.00019486975627501502, | |
| "loss": 0.349, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.5313653136531366, | |
| "grad_norm": 0.9508899450302124, | |
| "learning_rate": 0.0001947434300571038, | |
| "loss": 0.3095, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.5362853628536285, | |
| "grad_norm": 4.669578552246094, | |
| "learning_rate": 0.00019461560934128533, | |
| "loss": 0.5094, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.5412054120541205, | |
| "grad_norm": 0.9468059539794922, | |
| "learning_rate": 0.0001944862961438239, | |
| "loss": 0.2996, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.5461254612546126, | |
| "grad_norm": 3.7661190032958984, | |
| "learning_rate": 0.00019435549250452645, | |
| "loss": 0.3556, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.5510455104551045, | |
| "grad_norm": 3.7507622241973877, | |
| "learning_rate": 0.0001942232004867103, | |
| "loss": 0.4196, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.5559655596555966, | |
| "grad_norm": 1.455446481704712, | |
| "learning_rate": 0.0001940894221771708, | |
| "loss": 0.3751, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.5608856088560885, | |
| "grad_norm": 5.634677886962891, | |
| "learning_rate": 0.00019395415968614813, | |
| "loss": 0.4659, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.5658056580565806, | |
| "grad_norm": 1.9559741020202637, | |
| "learning_rate": 0.00019381741514729443, | |
| "loss": 0.4113, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.5707257072570726, | |
| "grad_norm": 2.967988967895508, | |
| "learning_rate": 0.0001936791907176397, | |
| "loss": 0.4565, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.5756457564575646, | |
| "grad_norm": 6.420986175537109, | |
| "learning_rate": 0.00019353948857755803, | |
| "loss": 0.4465, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.5805658056580566, | |
| "grad_norm": 5.503588676452637, | |
| "learning_rate": 0.00019339831093073318, | |
| "loss": 0.4705, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.5854858548585485, | |
| "grad_norm": 5.966702461242676, | |
| "learning_rate": 0.00019325566000412376, | |
| "loss": 0.4205, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.5904059040590406, | |
| "grad_norm": 2.865349054336548, | |
| "learning_rate": 0.0001931115380479281, | |
| "loss": 0.3988, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.5953259532595326, | |
| "grad_norm": 1.7353636026382446, | |
| "learning_rate": 0.00019296594733554892, | |
| "loss": 0.4364, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.6002460024600246, | |
| "grad_norm": 3.236239194869995, | |
| "learning_rate": 0.0001928188901635571, | |
| "loss": 0.4553, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.6051660516605166, | |
| "grad_norm": 2.1501266956329346, | |
| "learning_rate": 0.00019267036885165588, | |
| "loss": 0.4121, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.6100861008610086, | |
| "grad_norm": 2.416868209838867, | |
| "learning_rate": 0.00019252038574264405, | |
| "loss": 0.3616, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.6150061500615006, | |
| "grad_norm": 2.7004334926605225, | |
| "learning_rate": 0.00019236894320237894, | |
| "loss": 0.3443, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.6199261992619927, | |
| "grad_norm": 1.992761492729187, | |
| "learning_rate": 0.00019221604361973919, | |
| "loss": 0.4068, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.6248462484624846, | |
| "grad_norm": 0.9761249423027039, | |
| "learning_rate": 0.00019206168940658712, | |
| "loss": 0.3952, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.6297662976629766, | |
| "grad_norm": 1.5893077850341797, | |
| "learning_rate": 0.00019190588299773062, | |
| "loss": 0.383, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.6346863468634686, | |
| "grad_norm": 1.1404695510864258, | |
| "learning_rate": 0.00019174862685088472, | |
| "loss": 0.4274, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.6396063960639606, | |
| "grad_norm": 1.63871431350708, | |
| "learning_rate": 0.0001915899234466328, | |
| "loss": 0.3883, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.6445264452644527, | |
| "grad_norm": 1.9504516124725342, | |
| "learning_rate": 0.00019142977528838762, | |
| "loss": 0.3932, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.6494464944649446, | |
| "grad_norm": 1.6097129583358765, | |
| "learning_rate": 0.0001912681849023516, | |
| "loss": 0.4028, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.6543665436654367, | |
| "grad_norm": 2.071103572845459, | |
| "learning_rate": 0.00019110515483747716, | |
| "loss": 0.4016, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.6592865928659286, | |
| "grad_norm": 2.177647352218628, | |
| "learning_rate": 0.0001909406876654264, | |
| "loss": 0.3975, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.6642066420664207, | |
| "grad_norm": 1.2018887996673584, | |
| "learning_rate": 0.00019077478598053063, | |
| "loss": 0.3846, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.6691266912669127, | |
| "grad_norm": 3.688076972961426, | |
| "learning_rate": 0.00019060745239974936, | |
| "loss": 0.4432, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.6740467404674046, | |
| "grad_norm": 2.5613861083984375, | |
| "learning_rate": 0.0001904386895626291, | |
| "loss": 0.3704, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.6789667896678967, | |
| "grad_norm": 2.255295753479004, | |
| "learning_rate": 0.00019026850013126157, | |
| "loss": 0.3267, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.6838868388683886, | |
| "grad_norm": 3.4777557849884033, | |
| "learning_rate": 0.0001900968867902419, | |
| "loss": 0.5534, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.6888068880688807, | |
| "grad_norm": 2.0409767627716064, | |
| "learning_rate": 0.00018992385224662623, | |
| "loss": 0.4607, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.6937269372693727, | |
| "grad_norm": 2.408515691757202, | |
| "learning_rate": 0.00018974939922988883, | |
| "loss": 0.3969, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.6986469864698647, | |
| "grad_norm": 1.8554408550262451, | |
| "learning_rate": 0.00018957353049187936, | |
| "loss": 0.3385, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.7035670356703567, | |
| "grad_norm": 3.487424612045288, | |
| "learning_rate": 0.00018939624880677918, | |
| "loss": 0.4132, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.7084870848708487, | |
| "grad_norm": 3.606100559234619, | |
| "learning_rate": 0.0001892175569710577, | |
| "loss": 0.4644, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.7134071340713407, | |
| "grad_norm": 3.1930618286132812, | |
| "learning_rate": 0.00018903745780342839, | |
| "loss": 0.4235, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.7183271832718328, | |
| "grad_norm": 3.3350257873535156, | |
| "learning_rate": 0.00018885595414480405, | |
| "loss": 0.4837, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.7232472324723247, | |
| "grad_norm": 2.0124611854553223, | |
| "learning_rate": 0.0001886730488582522, | |
| "loss": 0.4149, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.7281672816728167, | |
| "grad_norm": 3.699632167816162, | |
| "learning_rate": 0.00018848874482894993, | |
| "loss": 0.376, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.7330873308733087, | |
| "grad_norm": 2.4049108028411865, | |
| "learning_rate": 0.00018830304496413822, | |
| "loss": 0.4215, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.7380073800738007, | |
| "grad_norm": 2.440385341644287, | |
| "learning_rate": 0.00018811595219307622, | |
| "loss": 0.4041, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.7429274292742928, | |
| "grad_norm": 2.6796436309814453, | |
| "learning_rate": 0.000187927469466995, | |
| "loss": 0.3949, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.7478474784747847, | |
| "grad_norm": 1.940114974975586, | |
| "learning_rate": 0.00018773759975905098, | |
| "loss": 0.4411, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.7527675276752768, | |
| "grad_norm": 3.338021755218506, | |
| "learning_rate": 0.00018754634606427914, | |
| "loss": 0.4607, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.7576875768757687, | |
| "grad_norm": 2.3407375812530518, | |
| "learning_rate": 0.00018735371139954558, | |
| "loss": 0.416, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.7626076260762608, | |
| "grad_norm": 1.1078053712844849, | |
| "learning_rate": 0.0001871596988035001, | |
| "loss": 0.34, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.7675276752767528, | |
| "grad_norm": 1.750227928161621, | |
| "learning_rate": 0.00018696431133652817, | |
| "loss": 0.3084, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.7724477244772447, | |
| "grad_norm": 2.9180145263671875, | |
| "learning_rate": 0.00018676755208070275, | |
| "loss": 0.4109, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.7773677736777368, | |
| "grad_norm": 2.675165891647339, | |
| "learning_rate": 0.00018656942413973555, | |
| "loss": 0.4438, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.7822878228782287, | |
| "grad_norm": 3.3854095935821533, | |
| "learning_rate": 0.0001863699306389282, | |
| "loss": 0.4418, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.7872078720787208, | |
| "grad_norm": 0.5620162487030029, | |
| "learning_rate": 0.0001861690747251228, | |
| "loss": 0.3806, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.7921279212792128, | |
| "grad_norm": 1.223493218421936, | |
| "learning_rate": 0.00018596685956665245, | |
| "loss": 0.3758, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.7970479704797048, | |
| "grad_norm": 1.9586799144744873, | |
| "learning_rate": 0.00018576328835329117, | |
| "loss": 0.3354, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.8019680196801968, | |
| "grad_norm": 3.2813546657562256, | |
| "learning_rate": 0.00018555836429620358, | |
| "loss": 0.4063, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.8068880688806888, | |
| "grad_norm": 2.182837963104248, | |
| "learning_rate": 0.00018535209062789433, | |
| "loss": 0.3697, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.8118081180811808, | |
| "grad_norm": 1.3659495115280151, | |
| "learning_rate": 0.00018514447060215698, | |
| "loss": 0.3351, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.8167281672816729, | |
| "grad_norm": 1.170257329940796, | |
| "learning_rate": 0.00018493550749402278, | |
| "loss": 0.3225, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.8216482164821648, | |
| "grad_norm": 4.230517387390137, | |
| "learning_rate": 0.00018472520459970898, | |
| "loss": 0.4448, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.8265682656826568, | |
| "grad_norm": 2.8111300468444824, | |
| "learning_rate": 0.0001845135652365668, | |
| "loss": 0.3761, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.8314883148831488, | |
| "grad_norm": 5.860655307769775, | |
| "learning_rate": 0.00018430059274302917, | |
| "loss": 0.4974, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.8364083640836408, | |
| "grad_norm": 3.6116364002227783, | |
| "learning_rate": 0.00018408629047855804, | |
| "loss": 0.4327, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.8413284132841329, | |
| "grad_norm": 2.6450071334838867, | |
| "learning_rate": 0.00018387066182359133, | |
| "loss": 0.3813, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.8462484624846248, | |
| "grad_norm": 2.9791674613952637, | |
| "learning_rate": 0.00018365371017948964, | |
| "loss": 0.4184, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.8511685116851169, | |
| "grad_norm": 1.7529772520065308, | |
| "learning_rate": 0.00018343543896848273, | |
| "loss": 0.3489, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.8560885608856088, | |
| "grad_norm": 3.5216493606567383, | |
| "learning_rate": 0.00018321585163361527, | |
| "loss": 0.3988, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.8610086100861009, | |
| "grad_norm": 2.470106840133667, | |
| "learning_rate": 0.00018299495163869275, | |
| "loss": 0.3919, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.8659286592865929, | |
| "grad_norm": 3.1759798526763916, | |
| "learning_rate": 0.0001827727424682268, | |
| "loss": 0.3853, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.8708487084870848, | |
| "grad_norm": 3.581413745880127, | |
| "learning_rate": 0.00018254922762738008, | |
| "loss": 0.4041, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.8757687576875769, | |
| "grad_norm": 1.35221266746521, | |
| "learning_rate": 0.00018232441064191125, | |
| "loss": 0.3564, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.8806888068880688, | |
| "grad_norm": 2.2829418182373047, | |
| "learning_rate": 0.0001820982950581191, | |
| "loss": 0.443, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.8856088560885609, | |
| "grad_norm": 2.931074619293213, | |
| "learning_rate": 0.00018187088444278674, | |
| "loss": 0.4088, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.8905289052890529, | |
| "grad_norm": 3.7436723709106445, | |
| "learning_rate": 0.00018164218238312535, | |
| "loss": 0.4888, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.8954489544895449, | |
| "grad_norm": 2.7169697284698486, | |
| "learning_rate": 0.00018141219248671745, | |
| "loss": 0.4432, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.9003690036900369, | |
| "grad_norm": 1.1118288040161133, | |
| "learning_rate": 0.00018118091838146029, | |
| "loss": 0.3677, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.9052890528905289, | |
| "grad_norm": 3.0052273273468018, | |
| "learning_rate": 0.00018094836371550824, | |
| "loss": 0.3169, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.9102091020910209, | |
| "grad_norm": 2.911255121231079, | |
| "learning_rate": 0.00018071453215721554, | |
| "loss": 0.4721, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.915129151291513, | |
| "grad_norm": 2.483900547027588, | |
| "learning_rate": 0.00018047942739507836, | |
| "loss": 0.3812, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.9200492004920049, | |
| "grad_norm": 1.8842488527297974, | |
| "learning_rate": 0.00018024305313767646, | |
| "loss": 0.436, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.9249692496924969, | |
| "grad_norm": 1.5961415767669678, | |
| "learning_rate": 0.000180005413113615, | |
| "loss": 0.3808, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.9298892988929889, | |
| "grad_norm": 2.0334715843200684, | |
| "learning_rate": 0.00017976651107146533, | |
| "loss": 0.4548, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.9348093480934809, | |
| "grad_norm": 1.6422673463821411, | |
| "learning_rate": 0.0001795263507797063, | |
| "loss": 0.406, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.939729397293973, | |
| "grad_norm": 2.5241055488586426, | |
| "learning_rate": 0.00017928493602666445, | |
| "loss": 0.3661, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.9446494464944649, | |
| "grad_norm": 2.3822920322418213, | |
| "learning_rate": 0.00017904227062045437, | |
| "loss": 0.4581, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.949569495694957, | |
| "grad_norm": 3.649919271469116, | |
| "learning_rate": 0.00017879835838891875, | |
| "loss": 0.4743, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.9544895448954489, | |
| "grad_norm": 1.9197454452514648, | |
| "learning_rate": 0.00017855320317956784, | |
| "loss": 0.3857, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.959409594095941, | |
| "grad_norm": 1.4304083585739136, | |
| "learning_rate": 0.00017830680885951887, | |
| "loss": 0.3935, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.964329643296433, | |
| "grad_norm": 0.4576971232891083, | |
| "learning_rate": 0.00017805917931543492, | |
| "loss": 0.4147, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.9692496924969249, | |
| "grad_norm": 1.4386779069900513, | |
| "learning_rate": 0.00017781031845346375, | |
| "loss": 0.3927, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.974169741697417, | |
| "grad_norm": 1.496974229812622, | |
| "learning_rate": 0.00017756023019917607, | |
| "loss": 0.3666, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.9790897908979089, | |
| "grad_norm": 1.221921682357788, | |
| "learning_rate": 0.00017730891849750377, | |
| "loss": 0.3938, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.984009840098401, | |
| "grad_norm": 1.7949525117874146, | |
| "learning_rate": 0.0001770563873126775, | |
| "loss": 0.4118, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.988929889298893, | |
| "grad_norm": 1.1061089038848877, | |
| "learning_rate": 0.0001768026406281642, | |
| "loss": 0.4086, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.993849938499385, | |
| "grad_norm": 3.217977523803711, | |
| "learning_rate": 0.00017654768244660448, | |
| "loss": 0.4018, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.998769987699877, | |
| "grad_norm": 0.9173564314842224, | |
| "learning_rate": 0.00017629151678974907, | |
| "loss": 0.3952, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 5.783997058868408, | |
| "learning_rate": 0.00017603414769839577, | |
| "loss": 0.375, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 1.004920049200492, | |
| "grad_norm": 3.309582471847534, | |
| "learning_rate": 0.00017577557923232546, | |
| "loss": 0.4257, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 1.009840098400984, | |
| "grad_norm": 1.2689415216445923, | |
| "learning_rate": 0.00017551581547023819, | |
| "loss": 0.4078, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 1.014760147601476, | |
| "grad_norm": 1.5618160963058472, | |
| "learning_rate": 0.00017525486050968875, | |
| "loss": 0.3948, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 1.019680196801968, | |
| "grad_norm": 2.377791166305542, | |
| "learning_rate": 0.00017499271846702213, | |
| "loss": 0.3407, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 1.0246002460024601, | |
| "grad_norm": 1.7102715969085693, | |
| "learning_rate": 0.00017472939347730856, | |
| "loss": 0.3997, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 1.029520295202952, | |
| "grad_norm": 1.2720469236373901, | |
| "learning_rate": 0.0001744648896942782, | |
| "loss": 0.339, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.034440344403444, | |
| "grad_norm": 2.7468247413635254, | |
| "learning_rate": 0.00017419921129025576, | |
| "loss": 0.3818, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 1.039360393603936, | |
| "grad_norm": 3.501011371612549, | |
| "learning_rate": 0.0001739323624560945, | |
| "loss": 0.451, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 1.044280442804428, | |
| "grad_norm": 1.6358418464660645, | |
| "learning_rate": 0.00017366434740111037, | |
| "loss": 0.3493, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 1.04920049200492, | |
| "grad_norm": 3.540642023086548, | |
| "learning_rate": 0.00017339517035301532, | |
| "loss": 0.455, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 1.054120541205412, | |
| "grad_norm": 2.093965530395508, | |
| "learning_rate": 0.00017312483555785086, | |
| "loss": 0.3673, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 1.0590405904059041, | |
| "grad_norm": 4.747845649719238, | |
| "learning_rate": 0.000172853347279921, | |
| "loss": 0.5216, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 1.063960639606396, | |
| "grad_norm": 2.5414655208587646, | |
| "learning_rate": 0.00017258070980172494, | |
| "loss": 0.4571, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 1.068880688806888, | |
| "grad_norm": 1.3232766389846802, | |
| "learning_rate": 0.0001723069274238895, | |
| "loss": 0.4014, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 1.07380073800738, | |
| "grad_norm": 2.045196771621704, | |
| "learning_rate": 0.0001720320044651014, | |
| "loss": 0.4119, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 1.0787207872078721, | |
| "grad_norm": 5.2392096519470215, | |
| "learning_rate": 0.00017175594526203905, | |
| "loss": 0.3691, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.083640836408364, | |
| "grad_norm": 2.589878797531128, | |
| "learning_rate": 0.00017147875416930416, | |
| "loss": 0.4317, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 1.088560885608856, | |
| "grad_norm": 1.5000386238098145, | |
| "learning_rate": 0.00017120043555935298, | |
| "loss": 0.4135, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 1.0934809348093482, | |
| "grad_norm": 0.8919417262077332, | |
| "learning_rate": 0.00017092099382242748, | |
| "loss": 0.4183, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 1.09840098400984, | |
| "grad_norm": 1.059650182723999, | |
| "learning_rate": 0.00017064043336648599, | |
| "loss": 0.3791, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 1.103321033210332, | |
| "grad_norm": 1.8085955381393433, | |
| "learning_rate": 0.0001703587586171337, | |
| "loss": 0.3893, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 1.1082410824108242, | |
| "grad_norm": 2.2094881534576416, | |
| "learning_rate": 0.00017007597401755276, | |
| "loss": 0.3871, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 1.1131611316113161, | |
| "grad_norm": 3.2818965911865234, | |
| "learning_rate": 0.00016979208402843237, | |
| "loss": 0.4412, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 1.118081180811808, | |
| "grad_norm": 1.4197732210159302, | |
| "learning_rate": 0.00016950709312789833, | |
| "loss": 0.3248, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 1.1230012300123002, | |
| "grad_norm": 3.690911054611206, | |
| "learning_rate": 0.00016922100581144228, | |
| "loss": 0.4552, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 1.1279212792127922, | |
| "grad_norm": 4.1127424240112305, | |
| "learning_rate": 0.00016893382659185105, | |
| "loss": 0.4887, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.132841328413284, | |
| "grad_norm": 2.658750295639038, | |
| "learning_rate": 0.00016864555999913518, | |
| "loss": 0.4037, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 1.137761377613776, | |
| "grad_norm": 1.4189069271087646, | |
| "learning_rate": 0.0001683562105804577, | |
| "loss": 0.3705, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 1.1426814268142682, | |
| "grad_norm": 2.599860191345215, | |
| "learning_rate": 0.00016806578290006225, | |
| "loss": 0.3975, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 1.1476014760147601, | |
| "grad_norm": 3.787053108215332, | |
| "learning_rate": 0.0001677742815392012, | |
| "loss": 0.4294, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 1.152521525215252, | |
| "grad_norm": 3.4738941192626953, | |
| "learning_rate": 0.00016748171109606328, | |
| "loss": 0.3847, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 1.1574415744157442, | |
| "grad_norm": 2.0890064239501953, | |
| "learning_rate": 0.00016718807618570106, | |
| "loss": 0.4156, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 1.1623616236162362, | |
| "grad_norm": 2.275296211242676, | |
| "learning_rate": 0.00016689338143995833, | |
| "loss": 0.4598, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 1.1672816728167281, | |
| "grad_norm": 0.8225153088569641, | |
| "learning_rate": 0.00016659763150739677, | |
| "loss": 0.3495, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 1.17220172201722, | |
| "grad_norm": 0.9762566685676575, | |
| "learning_rate": 0.00016630083105322266, | |
| "loss": 0.3705, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 1.1771217712177122, | |
| "grad_norm": 3.3081791400909424, | |
| "learning_rate": 0.00016600298475921365, | |
| "loss": 0.4167, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.1820418204182042, | |
| "grad_norm": 4.026612281799316, | |
| "learning_rate": 0.00016570409732364437, | |
| "loss": 0.4859, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 1.186961869618696, | |
| "grad_norm": 2.193952798843384, | |
| "learning_rate": 0.0001654041734612127, | |
| "loss": 0.4207, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 1.1918819188191883, | |
| "grad_norm": 2.2682714462280273, | |
| "learning_rate": 0.00016510321790296525, | |
| "loss": 0.4344, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 1.1968019680196802, | |
| "grad_norm": 3.3938522338867188, | |
| "learning_rate": 0.00016480123539622281, | |
| "loss": 0.4628, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 1.2017220172201721, | |
| "grad_norm": 4.911561489105225, | |
| "learning_rate": 0.00016449823070450531, | |
| "loss": 0.3449, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 1.2066420664206643, | |
| "grad_norm": 2.2653610706329346, | |
| "learning_rate": 0.00016419420860745699, | |
| "loss": 0.3965, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 1.2115621156211562, | |
| "grad_norm": 1.626495361328125, | |
| "learning_rate": 0.00016388917390077054, | |
| "loss": 0.3818, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 1.2164821648216482, | |
| "grad_norm": 2.9067697525024414, | |
| "learning_rate": 0.00016358313139611195, | |
| "loss": 0.4184, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 1.2214022140221403, | |
| "grad_norm": 1.6488162279129028, | |
| "learning_rate": 0.0001632760859210442, | |
| "loss": 0.3561, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 1.2263222632226323, | |
| "grad_norm": 1.5693081617355347, | |
| "learning_rate": 0.00016296804231895142, | |
| "loss": 0.404, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.2312423124231242, | |
| "grad_norm": 2.674132823944092, | |
| "learning_rate": 0.00016265900544896225, | |
| "loss": 0.4402, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 1.2361623616236161, | |
| "grad_norm": 1.432892918586731, | |
| "learning_rate": 0.00016234898018587337, | |
| "loss": 0.3073, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 1.2410824108241083, | |
| "grad_norm": 1.0736567974090576, | |
| "learning_rate": 0.0001620379714200725, | |
| "loss": 0.3551, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 1.2460024600246002, | |
| "grad_norm": 2.692246675491333, | |
| "learning_rate": 0.00016172598405746124, | |
| "loss": 0.4585, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 1.2509225092250922, | |
| "grad_norm": 1.1363232135772705, | |
| "learning_rate": 0.00016141302301937786, | |
| "loss": 0.3566, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 1.2558425584255843, | |
| "grad_norm": 2.9427497386932373, | |
| "learning_rate": 0.0001610990932425194, | |
| "loss": 0.4541, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 1.2607626076260763, | |
| "grad_norm": 1.8412046432495117, | |
| "learning_rate": 0.00016078419967886402, | |
| "loss": 0.4018, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 1.2656826568265682, | |
| "grad_norm": 1.6179234981536865, | |
| "learning_rate": 0.0001604683472955928, | |
| "loss": 0.4115, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 1.2706027060270602, | |
| "grad_norm": 1.2234046459197998, | |
| "learning_rate": 0.00016015154107501133, | |
| "loss": 0.4339, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 1.2755227552275523, | |
| "grad_norm": 1.4952470064163208, | |
| "learning_rate": 0.00015983378601447127, | |
| "loss": 0.4079, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.2804428044280443, | |
| "grad_norm": 0.6399968266487122, | |
| "learning_rate": 0.0001595150871262914, | |
| "loss": 0.4262, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 1.2853628536285364, | |
| "grad_norm": 2.328315258026123, | |
| "learning_rate": 0.00015919544943767856, | |
| "loss": 0.4236, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 1.2902829028290284, | |
| "grad_norm": 2.3608176708221436, | |
| "learning_rate": 0.00015887487799064838, | |
| "loss": 0.3888, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 1.2952029520295203, | |
| "grad_norm": 1.258406639099121, | |
| "learning_rate": 0.00015855337784194577, | |
| "loss": 0.405, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 1.3001230012300122, | |
| "grad_norm": 1.4067128896713257, | |
| "learning_rate": 0.00015823095406296514, | |
| "loss": 0.426, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 1.3050430504305042, | |
| "grad_norm": 2.433593988418579, | |
| "learning_rate": 0.00015790761173967036, | |
| "loss": 0.404, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 1.3099630996309963, | |
| "grad_norm": 1.216810703277588, | |
| "learning_rate": 0.00015758335597251458, | |
| "loss": 0.3607, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 1.3148831488314883, | |
| "grad_norm": 3.8141870498657227, | |
| "learning_rate": 0.00015725819187635968, | |
| "loss": 0.487, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 1.3198031980319804, | |
| "grad_norm": 0.3127140700817108, | |
| "learning_rate": 0.00015693212458039584, | |
| "loss": 0.3555, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 1.3247232472324724, | |
| "grad_norm": 3.0095064640045166, | |
| "learning_rate": 0.00015660515922806027, | |
| "loss": 0.4525, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.3296432964329643, | |
| "grad_norm": 1.490798830986023, | |
| "learning_rate": 0.00015627730097695638, | |
| "loss": 0.3951, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 1.3345633456334562, | |
| "grad_norm": 2.4473958015441895, | |
| "learning_rate": 0.0001559485549987723, | |
| "loss": 0.3184, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 1.3394833948339484, | |
| "grad_norm": 1.3399827480316162, | |
| "learning_rate": 0.0001556189264791992, | |
| "loss": 0.401, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 1.3444034440344403, | |
| "grad_norm": 1.9885616302490234, | |
| "learning_rate": 0.0001552884206178498, | |
| "loss": 0.4482, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 1.3493234932349323, | |
| "grad_norm": 1.2298444509506226, | |
| "learning_rate": 0.00015495704262817597, | |
| "loss": 0.4295, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 1.3542435424354244, | |
| "grad_norm": 1.3658753633499146, | |
| "learning_rate": 0.0001546247977373867, | |
| "loss": 0.3741, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 1.3591635916359164, | |
| "grad_norm": 3.8412437438964844, | |
| "learning_rate": 0.00015429169118636566, | |
| "loss": 0.356, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 1.3640836408364083, | |
| "grad_norm": 2.24770188331604, | |
| "learning_rate": 0.00015395772822958845, | |
| "loss": 0.3911, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 1.3690036900369003, | |
| "grad_norm": 1.0389429330825806, | |
| "learning_rate": 0.00015362291413503984, | |
| "loss": 0.4239, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 1.3739237392373924, | |
| "grad_norm": 2.6337220668792725, | |
| "learning_rate": 0.00015328725418413045, | |
| "loss": 0.3546, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.3788437884378844, | |
| "grad_norm": 1.609165906906128, | |
| "learning_rate": 0.00015295075367161367, | |
| "loss": 0.4083, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 1.3837638376383765, | |
| "grad_norm": 2.580286741256714, | |
| "learning_rate": 0.00015261341790550196, | |
| "loss": 0.3493, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 1.3886838868388685, | |
| "grad_norm": 2.396101474761963, | |
| "learning_rate": 0.0001522752522069833, | |
| "loss": 0.4164, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 1.3936039360393604, | |
| "grad_norm": 1.4685685634613037, | |
| "learning_rate": 0.00015193626191033712, | |
| "loss": 0.3765, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 1.3985239852398523, | |
| "grad_norm": 2.472041368484497, | |
| "learning_rate": 0.0001515964523628501, | |
| "loss": 0.4015, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 1.4034440344403443, | |
| "grad_norm": 2.8179895877838135, | |
| "learning_rate": 0.00015125582892473204, | |
| "loss": 0.4108, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 1.4083640836408364, | |
| "grad_norm": 2.4579968452453613, | |
| "learning_rate": 0.00015091439696903115, | |
| "loss": 0.4333, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 1.4132841328413284, | |
| "grad_norm": 2.46209716796875, | |
| "learning_rate": 0.00015057216188154928, | |
| "loss": 0.468, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 1.4182041820418205, | |
| "grad_norm": 1.7040590047836304, | |
| "learning_rate": 0.00015022912906075702, | |
| "loss": 0.421, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 1.4231242312423125, | |
| "grad_norm": 4.77639102935791, | |
| "learning_rate": 0.00014988530391770856, | |
| "loss": 0.4105, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.4280442804428044, | |
| "grad_norm": 1.7009060382843018, | |
| "learning_rate": 0.00014954069187595633, | |
| "loss": 0.4034, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 1.4329643296432963, | |
| "grad_norm": 1.7801786661148071, | |
| "learning_rate": 0.00014919529837146528, | |
| "loss": 0.3962, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 1.4378843788437885, | |
| "grad_norm": 0.9307105541229248, | |
| "learning_rate": 0.0001488491288525275, | |
| "loss": 0.3895, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 1.4428044280442804, | |
| "grad_norm": 1.3841267824172974, | |
| "learning_rate": 0.0001485021887796759, | |
| "loss": 0.3871, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 1.4477244772447724, | |
| "grad_norm": 1.3614524602890015, | |
| "learning_rate": 0.00014815448362559826, | |
| "loss": 0.4055, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 1.4526445264452645, | |
| "grad_norm": 3.51263165473938, | |
| "learning_rate": 0.00014780601887505088, | |
| "loss": 0.2708, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 1.4575645756457565, | |
| "grad_norm": 2.4436159133911133, | |
| "learning_rate": 0.00014745680002477203, | |
| "loss": 0.388, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 1.4624846248462484, | |
| "grad_norm": 0.775227427482605, | |
| "learning_rate": 0.00014710683258339536, | |
| "loss": 0.3506, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 1.4674046740467404, | |
| "grad_norm": 1.1680070161819458, | |
| "learning_rate": 0.0001467561220713628, | |
| "loss": 0.3227, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 1.4723247232472325, | |
| "grad_norm": 2.8542237281799316, | |
| "learning_rate": 0.0001464046740208377, | |
| "loss": 0.3588, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.4772447724477245, | |
| "grad_norm": 2.2465827465057373, | |
| "learning_rate": 0.00014605249397561736, | |
| "loss": 0.4161, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 1.4821648216482166, | |
| "grad_norm": 3.5913736820220947, | |
| "learning_rate": 0.00014569958749104575, | |
| "loss": 0.4758, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 1.4870848708487086, | |
| "grad_norm": 0.5437675714492798, | |
| "learning_rate": 0.00014534596013392575, | |
| "loss": 0.3388, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 1.4920049200492005, | |
| "grad_norm": 2.386204242706299, | |
| "learning_rate": 0.00014499161748243147, | |
| "loss": 0.4425, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 1.4969249692496924, | |
| "grad_norm": 1.160514235496521, | |
| "learning_rate": 0.0001446365651260201, | |
| "loss": 0.3747, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 1.5018450184501844, | |
| "grad_norm": 0.49445146322250366, | |
| "learning_rate": 0.00014428080866534396, | |
| "loss": 0.3707, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 1.5067650676506765, | |
| "grad_norm": 1.3350694179534912, | |
| "learning_rate": 0.00014392435371216185, | |
| "loss": 0.3455, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 1.5116851168511685, | |
| "grad_norm": 1.1016676425933838, | |
| "learning_rate": 0.0001435672058892509, | |
| "loss": 0.4095, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 1.5166051660516606, | |
| "grad_norm": 2.0227558612823486, | |
| "learning_rate": 0.00014320937083031748, | |
| "loss": 0.3706, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 1.5215252152521526, | |
| "grad_norm": 3.2734158039093018, | |
| "learning_rate": 0.0001428508541799086, | |
| "loss": 0.3384, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.5264452644526445, | |
| "grad_norm": 1.967950701713562, | |
| "learning_rate": 0.0001424916615933229, | |
| "loss": 0.4321, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 1.5313653136531364, | |
| "grad_norm": 1.8679777383804321, | |
| "learning_rate": 0.00014213179873652127, | |
| "loss": 0.3597, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 1.5362853628536284, | |
| "grad_norm": 1.111864447593689, | |
| "learning_rate": 0.00014177127128603745, | |
| "loss": 0.3639, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 1.5412054120541205, | |
| "grad_norm": 1.1539496183395386, | |
| "learning_rate": 0.0001414100849288888, | |
| "loss": 0.3735, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 1.5461254612546127, | |
| "grad_norm": 2.5453989505767822, | |
| "learning_rate": 0.00014104824536248614, | |
| "loss": 0.4241, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 1.5510455104551046, | |
| "grad_norm": 1.5490731000900269, | |
| "learning_rate": 0.00014068575829454436, | |
| "loss": 0.38, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 1.5559655596555966, | |
| "grad_norm": 2.0369129180908203, | |
| "learning_rate": 0.00014032262944299194, | |
| "loss": 0.432, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 1.5608856088560885, | |
| "grad_norm": 1.938671588897705, | |
| "learning_rate": 0.00013995886453588104, | |
| "loss": 0.4407, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 1.5658056580565805, | |
| "grad_norm": 1.5802247524261475, | |
| "learning_rate": 0.00013959446931129704, | |
| "loss": 0.4174, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 1.5707257072570726, | |
| "grad_norm": 1.7823857069015503, | |
| "learning_rate": 0.0001392294495172681, | |
| "loss": 0.3608, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.5756457564575646, | |
| "grad_norm": 1.5793462991714478, | |
| "learning_rate": 0.0001388638109116744, | |
| "loss": 0.4049, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 1.5805658056580567, | |
| "grad_norm": 2.478447437286377, | |
| "learning_rate": 0.00013849755926215735, | |
| "loss": 0.3822, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 1.5854858548585486, | |
| "grad_norm": 3.0512235164642334, | |
| "learning_rate": 0.00013813070034602863, | |
| "loss": 0.3729, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 1.5904059040590406, | |
| "grad_norm": 2.298110008239746, | |
| "learning_rate": 0.00013776323995017898, | |
| "loss": 0.3757, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 1.5953259532595325, | |
| "grad_norm": 1.1335664987564087, | |
| "learning_rate": 0.00013739518387098705, | |
| "loss": 0.3436, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 1.6002460024600245, | |
| "grad_norm": 1.9639568328857422, | |
| "learning_rate": 0.0001370265379142279, | |
| "loss": 0.4321, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 1.6051660516605166, | |
| "grad_norm": 2.0375776290893555, | |
| "learning_rate": 0.0001366573078949813, | |
| "loss": 0.3924, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 1.6100861008610086, | |
| "grad_norm": 2.925692558288574, | |
| "learning_rate": 0.00013628749963754026, | |
| "loss": 0.447, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 1.6150061500615007, | |
| "grad_norm": 2.7469842433929443, | |
| "learning_rate": 0.0001359171189753189, | |
| "loss": 0.4045, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 1.6199261992619927, | |
| "grad_norm": 1.8784551620483398, | |
| "learning_rate": 0.00013554617175076062, | |
| "loss": 0.398, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.6248462484624846, | |
| "grad_norm": 3.961890459060669, | |
| "learning_rate": 0.0001351746638152458, | |
| "loss": 0.4837, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 1.6297662976629765, | |
| "grad_norm": 1.2118688821792603, | |
| "learning_rate": 0.00013480260102899966, | |
| "loss": 0.3792, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 1.6346863468634685, | |
| "grad_norm": 0.8447842597961426, | |
| "learning_rate": 0.0001344299892609996, | |
| "loss": 0.3939, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 1.6396063960639606, | |
| "grad_norm": 0.8459701538085938, | |
| "learning_rate": 0.00013405683438888282, | |
| "loss": 0.4246, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 1.6445264452644528, | |
| "grad_norm": 2.4549758434295654, | |
| "learning_rate": 0.00013368314229885347, | |
| "loss": 0.3642, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 1.6494464944649447, | |
| "grad_norm": 3.813248872756958, | |
| "learning_rate": 0.00013330891888559002, | |
| "loss": 0.4009, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 1.6543665436654367, | |
| "grad_norm": 3.946821689605713, | |
| "learning_rate": 0.00013293417005215188, | |
| "loss": 0.3961, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 1.6592865928659286, | |
| "grad_norm": 1.7004120349884033, | |
| "learning_rate": 0.0001325589017098867, | |
| "loss": 0.4145, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 1.6642066420664205, | |
| "grad_norm": 2.824493169784546, | |
| "learning_rate": 0.00013218311977833687, | |
| "loss": 0.403, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 1.6691266912669127, | |
| "grad_norm": 2.5144972801208496, | |
| "learning_rate": 0.0001318068301851463, | |
| "loss": 0.4236, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.6740467404674046, | |
| "grad_norm": 2.8602144718170166, | |
| "learning_rate": 0.00013143003886596669, | |
| "loss": 0.4267, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 1.6789667896678968, | |
| "grad_norm": 1.5796253681182861, | |
| "learning_rate": 0.0001310527517643642, | |
| "loss": 0.4181, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 1.6838868388683887, | |
| "grad_norm": 1.748310923576355, | |
| "learning_rate": 0.00013067497483172538, | |
| "loss": 0.3817, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 1.6888068880688807, | |
| "grad_norm": 1.1796998977661133, | |
| "learning_rate": 0.00013029671402716366, | |
| "loss": 0.3891, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 1.6937269372693726, | |
| "grad_norm": 0.8031755089759827, | |
| "learning_rate": 0.00012991797531742492, | |
| "loss": 0.3746, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 1.6986469864698646, | |
| "grad_norm": 2.449571132659912, | |
| "learning_rate": 0.00012953876467679373, | |
| "loss": 0.3759, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 1.7035670356703567, | |
| "grad_norm": 2.167459726333618, | |
| "learning_rate": 0.00012915908808699893, | |
| "loss": 0.4026, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 1.7084870848708487, | |
| "grad_norm": 0.8750459551811218, | |
| "learning_rate": 0.00012877895153711935, | |
| "loss": 0.3612, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 1.7134071340713408, | |
| "grad_norm": 0.9922705292701721, | |
| "learning_rate": 0.00012839836102348926, | |
| "loss": 0.3894, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 1.7183271832718328, | |
| "grad_norm": 2.778425693511963, | |
| "learning_rate": 0.00012801732254960388, | |
| "loss": 0.5034, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.7232472324723247, | |
| "grad_norm": 2.2978157997131348, | |
| "learning_rate": 0.00012763584212602453, | |
| "loss": 0.3919, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 1.7281672816728166, | |
| "grad_norm": 0.7636315822601318, | |
| "learning_rate": 0.00012725392577028402, | |
| "loss": 0.3465, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 1.7330873308733086, | |
| "grad_norm": 0.9501156210899353, | |
| "learning_rate": 0.0001268715795067916, | |
| "loss": 0.353, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 1.7380073800738007, | |
| "grad_norm": 1.9191248416900635, | |
| "learning_rate": 0.00012648880936673787, | |
| "loss": 0.4535, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 1.742927429274293, | |
| "grad_norm": 0.5128054618835449, | |
| "learning_rate": 0.00012610562138799978, | |
| "loss": 0.3761, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 1.7478474784747848, | |
| "grad_norm": 1.427462100982666, | |
| "learning_rate": 0.00012572202161504543, | |
| "loss": 0.3871, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 1.7527675276752768, | |
| "grad_norm": 0.44678959250450134, | |
| "learning_rate": 0.00012533801609883842, | |
| "loss": 0.3858, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 1.7576875768757687, | |
| "grad_norm": 1.5311493873596191, | |
| "learning_rate": 0.00012495361089674285, | |
| "loss": 0.3638, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 1.7626076260762606, | |
| "grad_norm": 1.7714836597442627, | |
| "learning_rate": 0.00012456881207242732, | |
| "loss": 0.324, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 1.7675276752767528, | |
| "grad_norm": 0.6259622573852539, | |
| "learning_rate": 0.00012418362569576965, | |
| "loss": 0.3832, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.7724477244772447, | |
| "grad_norm": 1.4297990798950195, | |
| "learning_rate": 0.00012379805784276082, | |
| "loss": 0.3464, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 1.777367773677737, | |
| "grad_norm": 0.9419127702713013, | |
| "learning_rate": 0.0001234121145954094, | |
| "loss": 0.3605, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 1.7822878228782288, | |
| "grad_norm": 1.7905269861221313, | |
| "learning_rate": 0.00012302580204164541, | |
| "loss": 0.4042, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 1.7872078720787208, | |
| "grad_norm": 2.3646910190582275, | |
| "learning_rate": 0.0001226391262752245, | |
| "loss": 0.4208, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 1.7921279212792127, | |
| "grad_norm": 1.26406991481781, | |
| "learning_rate": 0.00012225209339563145, | |
| "loss": 0.3653, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 1.7970479704797047, | |
| "grad_norm": 2.371533155441284, | |
| "learning_rate": 0.00012186470950798445, | |
| "loss": 0.4039, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 1.8019680196801968, | |
| "grad_norm": 3.1603784561157227, | |
| "learning_rate": 0.00012147698072293842, | |
| "loss": 0.4911, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 1.8068880688806888, | |
| "grad_norm": 2.687168836593628, | |
| "learning_rate": 0.00012108891315658879, | |
| "loss": 0.4356, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 1.811808118081181, | |
| "grad_norm": 3.9243521690368652, | |
| "learning_rate": 0.00012070051293037492, | |
| "loss": 0.434, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 1.8167281672816729, | |
| "grad_norm": 2.8489391803741455, | |
| "learning_rate": 0.00012031178617098371, | |
| "loss": 0.3572, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.8216482164821648, | |
| "grad_norm": 2.8946075439453125, | |
| "learning_rate": 0.00011992273901025269, | |
| "loss": 0.3993, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 1.8265682656826567, | |
| "grad_norm": 1.3082534074783325, | |
| "learning_rate": 0.0001195333775850736, | |
| "loss": 0.4137, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 1.8314883148831487, | |
| "grad_norm": 1.9355298280715942, | |
| "learning_rate": 0.00011914370803729533, | |
| "loss": 0.3746, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 1.8364083640836408, | |
| "grad_norm": 2.1702141761779785, | |
| "learning_rate": 0.00011875373651362727, | |
| "loss": 0.3622, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 1.841328413284133, | |
| "grad_norm": 1.4988595247268677, | |
| "learning_rate": 0.00011836346916554205, | |
| "loss": 0.3619, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.846248462484625, | |
| "grad_norm": 1.761991262435913, | |
| "learning_rate": 0.00011797291214917881, | |
| "loss": 0.4358, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 1.8511685116851169, | |
| "grad_norm": 0.745695173740387, | |
| "learning_rate": 0.00011758207162524598, | |
| "loss": 0.3995, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 1.8560885608856088, | |
| "grad_norm": 1.9512763023376465, | |
| "learning_rate": 0.00011719095375892396, | |
| "loss": 0.4432, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 1.8610086100861007, | |
| "grad_norm": 2.6219289302825928, | |
| "learning_rate": 0.00011679956471976814, | |
| "loss": 0.4677, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 1.865928659286593, | |
| "grad_norm": 3.995495080947876, | |
| "learning_rate": 0.0001164079106816113, | |
| "loss": 0.2968, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.8708487084870848, | |
| "grad_norm": 0.8375853300094604, | |
| "learning_rate": 0.00011601599782246646, | |
| "loss": 0.4035, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 1.875768757687577, | |
| "grad_norm": 1.5365298986434937, | |
| "learning_rate": 0.00011562383232442926, | |
| "loss": 0.417, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 1.880688806888069, | |
| "grad_norm": 0.9506827592849731, | |
| "learning_rate": 0.0001152314203735805, | |
| "loss": 0.3772, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 1.8856088560885609, | |
| "grad_norm": 0.900571346282959, | |
| "learning_rate": 0.00011483876815988867, | |
| "loss": 0.3805, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 1.8905289052890528, | |
| "grad_norm": 1.0722607374191284, | |
| "learning_rate": 0.00011444588187711205, | |
| "loss": 0.4088, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 1.8954489544895448, | |
| "grad_norm": 1.7054160833358765, | |
| "learning_rate": 0.00011405276772270126, | |
| "loss": 0.3956, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 1.900369003690037, | |
| "grad_norm": 0.986569881439209, | |
| "learning_rate": 0.0001136594318977014, | |
| "loss": 0.4079, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 1.9052890528905289, | |
| "grad_norm": 0.8886733651161194, | |
| "learning_rate": 0.0001132658806066542, | |
| "loss": 0.3842, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 1.910209102091021, | |
| "grad_norm": 0.8961542248725891, | |
| "learning_rate": 0.00011287212005750024, | |
| "loss": 0.3881, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 1.915129151291513, | |
| "grad_norm": 2.585698127746582, | |
| "learning_rate": 0.00011247815646148087, | |
| "loss": 0.4671, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.920049200492005, | |
| "grad_norm": 1.6980706453323364, | |
| "learning_rate": 0.00011208399603304047, | |
| "loss": 0.3499, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 1.9249692496924968, | |
| "grad_norm": 0.8087127804756165, | |
| "learning_rate": 0.00011168964498972818, | |
| "loss": 0.3786, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 1.9298892988929888, | |
| "grad_norm": 1.2001378536224365, | |
| "learning_rate": 0.00011129510955209996, | |
| "loss": 0.3502, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 1.934809348093481, | |
| "grad_norm": 1.2342605590820312, | |
| "learning_rate": 0.00011090039594362045, | |
| "loss": 0.3924, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 1.939729397293973, | |
| "grad_norm": 1.3440324068069458, | |
| "learning_rate": 0.00011050551039056479, | |
| "loss": 0.4143, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 1.944649446494465, | |
| "grad_norm": 0.7715713977813721, | |
| "learning_rate": 0.00011011045912192035, | |
| "loss": 0.3371, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 1.949569495694957, | |
| "grad_norm": 0.9439634084701538, | |
| "learning_rate": 0.0001097152483692886, | |
| "loss": 0.3916, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 1.954489544895449, | |
| "grad_norm": 1.3055254220962524, | |
| "learning_rate": 0.00010931988436678666, | |
| "loss": 0.4016, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 1.9594095940959408, | |
| "grad_norm": 0.8916832208633423, | |
| "learning_rate": 0.00010892437335094912, | |
| "loss": 0.3525, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 1.964329643296433, | |
| "grad_norm": 1.3914546966552734, | |
| "learning_rate": 0.00010852872156062946, | |
| "loss": 0.3771, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.969249692496925, | |
| "grad_norm": 0.9857692122459412, | |
| "learning_rate": 0.00010813293523690191, | |
| "loss": 0.3393, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 1.974169741697417, | |
| "grad_norm": 2.0917956829071045, | |
| "learning_rate": 0.00010773702062296273, | |
| "loss": 0.4354, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 1.979089790897909, | |
| "grad_norm": 1.275038719177246, | |
| "learning_rate": 0.00010734098396403192, | |
| "loss": 0.3398, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 1.984009840098401, | |
| "grad_norm": 2.0591840744018555, | |
| "learning_rate": 0.00010694483150725458, | |
| "loss": 0.4068, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 1.988929889298893, | |
| "grad_norm": 4.258203506469727, | |
| "learning_rate": 0.00010654856950160253, | |
| "loss": 0.5179, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 1.9938499384993849, | |
| "grad_norm": 1.638200283050537, | |
| "learning_rate": 0.00010615220419777548, | |
| "loss": 0.3814, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 1.998769987699877, | |
| "grad_norm": 1.5387883186340332, | |
| "learning_rate": 0.00010575574184810269, | |
| "loss": 0.3818, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 4.10122013092041, | |
| "learning_rate": 0.0001053591887064442, | |
| "loss": 0.3577, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 2.004920049200492, | |
| "grad_norm": 1.2864503860473633, | |
| "learning_rate": 0.00010496255102809223, | |
| "loss": 0.3394, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 2.009840098400984, | |
| "grad_norm": 0.9785577654838562, | |
| "learning_rate": 0.00010456583506967248, | |
| "loss": 0.3734, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 2.014760147601476, | |
| "grad_norm": 0.43325719237327576, | |
| "learning_rate": 0.00010416904708904548, | |
| "loss": 0.3805, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 2.019680196801968, | |
| "grad_norm": 3.109909772872925, | |
| "learning_rate": 0.00010377219334520783, | |
| "loss": 0.4594, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 2.02460024600246, | |
| "grad_norm": 1.8757784366607666, | |
| "learning_rate": 0.00010337528009819344, | |
| "loss": 0.4087, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 2.029520295202952, | |
| "grad_norm": 3.7887425422668457, | |
| "learning_rate": 0.00010297831360897492, | |
| "loss": 0.436, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 2.034440344403444, | |
| "grad_norm": 0.3496626019477844, | |
| "learning_rate": 0.00010258130013936474, | |
| "loss": 0.3854, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 2.039360393603936, | |
| "grad_norm": 1.3036730289459229, | |
| "learning_rate": 0.00010218424595191631, | |
| "loss": 0.3803, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 2.044280442804428, | |
| "grad_norm": 4.245285987854004, | |
| "learning_rate": 0.00010178715730982549, | |
| "loss": 0.4952, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 2.0492004920049203, | |
| "grad_norm": 3.09157133102417, | |
| "learning_rate": 0.00010139004047683151, | |
| "loss": 0.3944, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 2.054120541205412, | |
| "grad_norm": 1.5304118394851685, | |
| "learning_rate": 0.00010099290171711841, | |
| "loss": 0.4174, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 2.059040590405904, | |
| "grad_norm": 2.2359232902526855, | |
| "learning_rate": 0.00010059574729521595, | |
| "loss": 0.3358, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 2.063960639606396, | |
| "grad_norm": 0.6024315357208252, | |
| "learning_rate": 0.0001001985834759011, | |
| "loss": 0.3981, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 2.068880688806888, | |
| "grad_norm": 1.2679041624069214, | |
| "learning_rate": 9.980141652409895e-05, | |
| "loss": 0.4225, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 2.07380073800738, | |
| "grad_norm": 0.895416796207428, | |
| "learning_rate": 9.940425270478407e-05, | |
| "loss": 0.388, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 2.078720787207872, | |
| "grad_norm": 1.567826747894287, | |
| "learning_rate": 9.900709828288164e-05, | |
| "loss": 0.3704, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 2.0836408364083643, | |
| "grad_norm": 1.9329123497009277, | |
| "learning_rate": 9.860995952316851e-05, | |
| "loss": 0.4234, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 2.088560885608856, | |
| "grad_norm": 0.44675880670547485, | |
| "learning_rate": 9.821284269017455e-05, | |
| "loss": 0.3876, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 2.093480934809348, | |
| "grad_norm": 1.7258495092391968, | |
| "learning_rate": 9.781575404808371e-05, | |
| "loss": 0.4297, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 2.09840098400984, | |
| "grad_norm": 1.092556118965149, | |
| "learning_rate": 9.741869986063526e-05, | |
| "loss": 0.4026, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 2.103321033210332, | |
| "grad_norm": 1.4725236892700195, | |
| "learning_rate": 9.702168639102509e-05, | |
| "loss": 0.4385, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 2.108241082410824, | |
| "grad_norm": 2.7385778427124023, | |
| "learning_rate": 9.662471990180657e-05, | |
| "loss": 0.4424, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 2.113161131611316, | |
| "grad_norm": 2.480210542678833, | |
| "learning_rate": 9.622780665479222e-05, | |
| "loss": 0.4206, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 2.1180811808118083, | |
| "grad_norm": 0.8362523913383484, | |
| "learning_rate": 9.583095291095453e-05, | |
| "loss": 0.3986, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 2.1230012300123002, | |
| "grad_norm": 2.0622987747192383, | |
| "learning_rate": 9.543416493032757e-05, | |
| "loss": 0.3485, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 2.127921279212792, | |
| "grad_norm": 2.7538793087005615, | |
| "learning_rate": 9.503744897190778e-05, | |
| "loss": 0.3756, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 2.132841328413284, | |
| "grad_norm": 0.8746367692947388, | |
| "learning_rate": 9.464081129355586e-05, | |
| "loss": 0.3751, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 2.137761377613776, | |
| "grad_norm": 1.8478419780731201, | |
| "learning_rate": 9.424425815189733e-05, | |
| "loss": 0.4292, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 2.142681426814268, | |
| "grad_norm": 1.6647083759307861, | |
| "learning_rate": 9.384779580222453e-05, | |
| "loss": 0.339, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 2.14760147601476, | |
| "grad_norm": 4.109962463378906, | |
| "learning_rate": 9.345143049839749e-05, | |
| "loss": 0.4544, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 2.1525215252152523, | |
| "grad_norm": 1.978119969367981, | |
| "learning_rate": 9.305516849274541e-05, | |
| "loss": 0.3702, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 2.1574415744157442, | |
| "grad_norm": 1.917183518409729, | |
| "learning_rate": 9.265901603596811e-05, | |
| "loss": 0.4077, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 2.162361623616236, | |
| "grad_norm": 0.47567835450172424, | |
| "learning_rate": 9.226297937703728e-05, | |
| "loss": 0.3356, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 2.167281672816728, | |
| "grad_norm": 2.546321392059326, | |
| "learning_rate": 9.186706476309812e-05, | |
| "loss": 0.4337, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 2.17220172201722, | |
| "grad_norm": 2.111480236053467, | |
| "learning_rate": 9.147127843937055e-05, | |
| "loss": 0.4024, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 2.177121771217712, | |
| "grad_norm": 1.1858526468276978, | |
| "learning_rate": 9.107562664905093e-05, | |
| "loss": 0.3637, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 2.1820418204182044, | |
| "grad_norm": 1.404078722000122, | |
| "learning_rate": 9.068011563321336e-05, | |
| "loss": 0.4173, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 2.1869618696186963, | |
| "grad_norm": 1.1295206546783447, | |
| "learning_rate": 9.028475163071141e-05, | |
| "loss": 0.3856, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 2.1918819188191883, | |
| "grad_norm": 1.2605645656585693, | |
| "learning_rate": 8.988954087807968e-05, | |
| "loss": 0.4193, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 2.19680196801968, | |
| "grad_norm": 1.1261564493179321, | |
| "learning_rate": 8.949448960943524e-05, | |
| "loss": 0.407, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 2.201722017220172, | |
| "grad_norm": 2.366487979888916, | |
| "learning_rate": 8.909960405637958e-05, | |
| "loss": 0.3946, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 2.206642066420664, | |
| "grad_norm": 2.1479427814483643, | |
| "learning_rate": 8.870489044790006e-05, | |
| "loss": 0.3728, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 2.211562115621156, | |
| "grad_norm": 2.990525245666504, | |
| "learning_rate": 8.831035501027186e-05, | |
| "loss": 0.3367, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 2.2164821648216484, | |
| "grad_norm": 1.812566876411438, | |
| "learning_rate": 8.791600396695954e-05, | |
| "loss": 0.3689, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 2.2214022140221403, | |
| "grad_norm": 0.5948531031608582, | |
| "learning_rate": 8.752184353851916e-05, | |
| "loss": 0.4018, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 2.2263222632226323, | |
| "grad_norm": 1.8020761013031006, | |
| "learning_rate": 8.712787994249979e-05, | |
| "loss": 0.3965, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 2.231242312423124, | |
| "grad_norm": 1.5464495420455933, | |
| "learning_rate": 8.673411939334581e-05, | |
| "loss": 0.3353, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 2.236162361623616, | |
| "grad_norm": 1.8382320404052734, | |
| "learning_rate": 8.634056810229862e-05, | |
| "loss": 0.3916, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 2.241082410824108, | |
| "grad_norm": 1.5499740839004517, | |
| "learning_rate": 8.594723227729875e-05, | |
| "loss": 0.3895, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 2.2460024600246005, | |
| "grad_norm": 4.030876636505127, | |
| "learning_rate": 8.555411812288798e-05, | |
| "loss": 0.4616, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 2.2509225092250924, | |
| "grad_norm": 1.1098424196243286, | |
| "learning_rate": 8.516123184011135e-05, | |
| "loss": 0.2977, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 2.2558425584255843, | |
| "grad_norm": 1.2961804866790771, | |
| "learning_rate": 8.47685796264195e-05, | |
| "loss": 0.36, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 2.2607626076260763, | |
| "grad_norm": 1.140372633934021, | |
| "learning_rate": 8.437616767557077e-05, | |
| "loss": 0.351, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 2.265682656826568, | |
| "grad_norm": 3.4962611198425293, | |
| "learning_rate": 8.398400217753357e-05, | |
| "loss": 0.4724, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 2.27060270602706, | |
| "grad_norm": 2.6000497341156006, | |
| "learning_rate": 8.359208931838871e-05, | |
| "loss": 0.4618, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 2.275522755227552, | |
| "grad_norm": 2.955470323562622, | |
| "learning_rate": 8.320043528023188e-05, | |
| "loss": 0.461, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 2.280442804428044, | |
| "grad_norm": 0.9662995934486389, | |
| "learning_rate": 8.280904624107606e-05, | |
| "loss": 0.3457, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 2.2853628536285364, | |
| "grad_norm": 0.8392460346221924, | |
| "learning_rate": 8.241792837475405e-05, | |
| "loss": 0.354, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 2.2902829028290284, | |
| "grad_norm": 2.8896520137786865, | |
| "learning_rate": 8.202708785082121e-05, | |
| "loss": 0.416, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 2.2952029520295203, | |
| "grad_norm": 1.6311709880828857, | |
| "learning_rate": 8.163653083445799e-05, | |
| "loss": 0.399, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 2.3001230012300122, | |
| "grad_norm": 0.4800054132938385, | |
| "learning_rate": 8.124626348637279e-05, | |
| "loss": 0.3758, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 2.305043050430504, | |
| "grad_norm": 1.0817440748214722, | |
| "learning_rate": 8.085629196270469e-05, | |
| "loss": 0.3764, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 2.3099630996309966, | |
| "grad_norm": 1.6088804006576538, | |
| "learning_rate": 8.046662241492645e-05, | |
| "loss": 0.4473, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 2.3148831488314885, | |
| "grad_norm": 1.0749715566635132, | |
| "learning_rate": 8.007726098974734e-05, | |
| "loss": 0.3703, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 2.3198031980319804, | |
| "grad_norm": 1.5354204177856445, | |
| "learning_rate": 7.96882138290163e-05, | |
| "loss": 0.4164, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 2.3247232472324724, | |
| "grad_norm": 1.117240309715271, | |
| "learning_rate": 7.929948706962508e-05, | |
| "loss": 0.4144, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 2.3296432964329643, | |
| "grad_norm": 0.9542057514190674, | |
| "learning_rate": 7.891108684341121e-05, | |
| "loss": 0.4162, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 2.3345633456334562, | |
| "grad_norm": 1.411424994468689, | |
| "learning_rate": 7.852301927706159e-05, | |
| "loss": 0.4402, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 2.339483394833948, | |
| "grad_norm": 1.8303946256637573, | |
| "learning_rate": 7.813529049201556e-05, | |
| "loss": 0.423, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 2.34440344403444, | |
| "grad_norm": 1.6640418767929077, | |
| "learning_rate": 7.774790660436858e-05, | |
| "loss": 0.3943, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 2.3493234932349325, | |
| "grad_norm": 0.5208873152732849, | |
| "learning_rate": 7.736087372477554e-05, | |
| "loss": 0.4215, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 2.3542435424354244, | |
| "grad_norm": 2.1671223640441895, | |
| "learning_rate": 7.69741979583546e-05, | |
| "loss": 0.3839, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 2.3591635916359164, | |
| "grad_norm": 2.075159788131714, | |
| "learning_rate": 7.658788540459062e-05, | |
| "loss": 0.3851, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 2.3640836408364083, | |
| "grad_norm": 1.8642665147781372, | |
| "learning_rate": 7.620194215723919e-05, | |
| "loss": 0.3669, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 2.3690036900369003, | |
| "grad_norm": 2.8715755939483643, | |
| "learning_rate": 7.581637430423037e-05, | |
| "loss": 0.4352, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 2.373923739237392, | |
| "grad_norm": 0.8020451664924622, | |
| "learning_rate": 7.543118792757266e-05, | |
| "loss": 0.3657, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 2.3788437884378846, | |
| "grad_norm": 2.100980758666992, | |
| "learning_rate": 7.504638910325717e-05, | |
| "loss": 0.3141, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 2.3837638376383765, | |
| "grad_norm": 3.8309755325317383, | |
| "learning_rate": 7.466198390116158e-05, | |
| "loss": 0.494, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 2.3886838868388685, | |
| "grad_norm": 1.7863093614578247, | |
| "learning_rate": 7.427797838495463e-05, | |
| "loss": 0.3792, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 2.3936039360393604, | |
| "grad_norm": 1.1884002685546875, | |
| "learning_rate": 7.389437861200024e-05, | |
| "loss": 0.3928, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 2.3985239852398523, | |
| "grad_norm": 1.9756462574005127, | |
| "learning_rate": 7.35111906332622e-05, | |
| "loss": 0.4218, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 2.4034440344403443, | |
| "grad_norm": 3.6889054775238037, | |
| "learning_rate": 7.312842049320844e-05, | |
| "loss": 0.4441, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 2.408364083640836, | |
| "grad_norm": 2.106717109680176, | |
| "learning_rate": 7.2746074229716e-05, | |
| "loss": 0.3783, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 2.4132841328413286, | |
| "grad_norm": 1.312242865562439, | |
| "learning_rate": 7.236415787397548e-05, | |
| "loss": 0.3342, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 2.4182041820418205, | |
| "grad_norm": 0.7120693325996399, | |
| "learning_rate": 7.198267745039612e-05, | |
| "loss": 0.3846, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 2.4231242312423125, | |
| "grad_norm": 1.6067770719528198, | |
| "learning_rate": 7.160163897651075e-05, | |
| "loss": 0.4396, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 2.4280442804428044, | |
| "grad_norm": 4.197781562805176, | |
| "learning_rate": 7.122104846288064e-05, | |
| "loss": 0.2713, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 2.4329643296432963, | |
| "grad_norm": 1.1666693687438965, | |
| "learning_rate": 7.08409119130011e-05, | |
| "loss": 0.3647, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 2.4378843788437883, | |
| "grad_norm": 1.3826804161071777, | |
| "learning_rate": 7.04612353232063e-05, | |
| "loss": 0.3739, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 2.4428044280442807, | |
| "grad_norm": 1.140659213066101, | |
| "learning_rate": 7.008202468257514e-05, | |
| "loss": 0.4207, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 2.4477244772447726, | |
| "grad_norm": 2.2047266960144043, | |
| "learning_rate": 6.970328597283637e-05, | |
| "loss": 0.3767, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 2.4526445264452645, | |
| "grad_norm": 2.385573148727417, | |
| "learning_rate": 6.932502516827461e-05, | |
| "loss": 0.3369, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.4575645756457565, | |
| "grad_norm": 1.869011402130127, | |
| "learning_rate": 6.894724823563583e-05, | |
| "loss": 0.3521, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 2.4624846248462484, | |
| "grad_norm": 1.2904314994812012, | |
| "learning_rate": 6.85699611340333e-05, | |
| "loss": 0.3519, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 2.4674046740467404, | |
| "grad_norm": 1.8398619890213013, | |
| "learning_rate": 6.819316981485372e-05, | |
| "loss": 0.3123, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 2.4723247232472323, | |
| "grad_norm": 2.567601442337036, | |
| "learning_rate": 6.781688022166311e-05, | |
| "loss": 0.4435, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 2.4772447724477242, | |
| "grad_norm": 1.0559594631195068, | |
| "learning_rate": 6.744109829011332e-05, | |
| "loss": 0.3921, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 2.4821648216482166, | |
| "grad_norm": 1.4271594285964966, | |
| "learning_rate": 6.706582994784814e-05, | |
| "loss": 0.382, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 2.4870848708487086, | |
| "grad_norm": 2.100080728530884, | |
| "learning_rate": 6.669108111441003e-05, | |
| "loss": 0.4241, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 2.4920049200492005, | |
| "grad_norm": 2.3189799785614014, | |
| "learning_rate": 6.631685770114654e-05, | |
| "loss": 0.4492, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 2.4969249692496924, | |
| "grad_norm": 1.2089158296585083, | |
| "learning_rate": 6.594316561111724e-05, | |
| "loss": 0.3763, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 2.5018450184501844, | |
| "grad_norm": 2.086798906326294, | |
| "learning_rate": 6.557001073900044e-05, | |
| "loss": 0.4291, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 2.5067650676506767, | |
| "grad_norm": 1.3246550559997559, | |
| "learning_rate": 6.519739897100034e-05, | |
| "loss": 0.4328, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 2.5116851168511687, | |
| "grad_norm": 3.522636890411377, | |
| "learning_rate": 6.482533618475422e-05, | |
| "loss": 0.3572, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 2.5166051660516606, | |
| "grad_norm": 2.3924167156219482, | |
| "learning_rate": 6.445382824923938e-05, | |
| "loss": 0.4262, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 2.5215252152521526, | |
| "grad_norm": 3.661113739013672, | |
| "learning_rate": 6.408288102468113e-05, | |
| "loss": 0.3796, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 2.5264452644526445, | |
| "grad_norm": 1.2376595735549927, | |
| "learning_rate": 6.371250036245976e-05, | |
| "loss": 0.3796, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 2.5313653136531364, | |
| "grad_norm": 1.8098406791687012, | |
| "learning_rate": 6.334269210501875e-05, | |
| "loss": 0.3707, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 2.5362853628536284, | |
| "grad_norm": 1.7512861490249634, | |
| "learning_rate": 6.297346208577213e-05, | |
| "loss": 0.3753, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 2.5412054120541203, | |
| "grad_norm": 2.3691437244415283, | |
| "learning_rate": 6.260481612901299e-05, | |
| "loss": 0.3678, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 2.5461254612546127, | |
| "grad_norm": 2.599379539489746, | |
| "learning_rate": 6.223676004982105e-05, | |
| "loss": 0.4462, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 2.5510455104551046, | |
| "grad_norm": 1.492092251777649, | |
| "learning_rate": 6.18692996539714e-05, | |
| "loss": 0.4379, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 2.5559655596555966, | |
| "grad_norm": 1.2708606719970703, | |
| "learning_rate": 6.150244073784266e-05, | |
| "loss": 0.3573, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 2.5608856088560885, | |
| "grad_norm": 2.399810552597046, | |
| "learning_rate": 6.113618908832561e-05, | |
| "loss": 0.4584, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 2.5658056580565805, | |
| "grad_norm": 3.6795196533203125, | |
| "learning_rate": 6.0770550482731924e-05, | |
| "loss": 0.247, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 2.570725707257073, | |
| "grad_norm": 1.5731709003448486, | |
| "learning_rate": 6.0405530688702986e-05, | |
| "loss": 0.4207, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 2.5756457564575648, | |
| "grad_norm": 2.2327213287353516, | |
| "learning_rate": 6.0041135464119024e-05, | |
| "loss": 0.389, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 2.5805658056580567, | |
| "grad_norm": 2.482600688934326, | |
| "learning_rate": 5.9677370557008104e-05, | |
| "loss": 0.4297, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 2.5854858548585486, | |
| "grad_norm": 0.5008729696273804, | |
| "learning_rate": 5.9314241705455674e-05, | |
| "loss": 0.37, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 2.5904059040590406, | |
| "grad_norm": 1.346571683883667, | |
| "learning_rate": 5.895175463751385e-05, | |
| "loss": 0.398, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 2.5953259532595325, | |
| "grad_norm": 1.3295096158981323, | |
| "learning_rate": 5.858991507111122e-05, | |
| "loss": 0.4046, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 2.6002460024600245, | |
| "grad_norm": 2.531033515930176, | |
| "learning_rate": 5.8228728713962543e-05, | |
| "loss": 0.441, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 2.6051660516605164, | |
| "grad_norm": 1.4059702157974243, | |
| "learning_rate": 5.786820126347876e-05, | |
| "loss": 0.3887, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 2.6100861008610083, | |
| "grad_norm": 0.8365688323974609, | |
| "learning_rate": 5.750833840667711e-05, | |
| "loss": 0.3926, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 2.6150061500615007, | |
| "grad_norm": 1.1072005033493042, | |
| "learning_rate": 5.7149145820091385e-05, | |
| "loss": 0.4331, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 2.6199261992619927, | |
| "grad_norm": 4.232044219970703, | |
| "learning_rate": 5.6790629169682564e-05, | |
| "loss": 0.313, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 2.6248462484624846, | |
| "grad_norm": 1.9496935606002808, | |
| "learning_rate": 5.6432794110749134e-05, | |
| "loss": 0.388, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 2.6297662976629765, | |
| "grad_norm": 2.281867265701294, | |
| "learning_rate": 5.607564628783817e-05, | |
| "loss": 0.3739, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 2.6346863468634685, | |
| "grad_norm": 1.5007566213607788, | |
| "learning_rate": 5.571919133465605e-05, | |
| "loss": 0.4018, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 2.639606396063961, | |
| "grad_norm": 1.5338659286499023, | |
| "learning_rate": 5.5363434873979903e-05, | |
| "loss": 0.3782, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 2.644526445264453, | |
| "grad_norm": 1.8886133432388306, | |
| "learning_rate": 5.500838251756857e-05, | |
| "loss": 0.4441, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 2.6494464944649447, | |
| "grad_norm": 3.30102801322937, | |
| "learning_rate": 5.465403986607426e-05, | |
| "loss": 0.3137, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 2.6543665436654367, | |
| "grad_norm": 1.8262077569961548, | |
| "learning_rate": 5.430041250895428e-05, | |
| "loss": 0.4104, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 2.6592865928659286, | |
| "grad_norm": 1.551676869392395, | |
| "learning_rate": 5.3947506024382665e-05, | |
| "loss": 0.3337, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 2.6642066420664205, | |
| "grad_norm": 2.0609912872314453, | |
| "learning_rate": 5.359532597916233e-05, | |
| "loss": 0.3059, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 2.6691266912669125, | |
| "grad_norm": 2.948434829711914, | |
| "learning_rate": 5.324387792863719e-05, | |
| "loss": 0.4629, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 2.6740467404674044, | |
| "grad_norm": 1.7256718873977661, | |
| "learning_rate": 5.289316741660466e-05, | |
| "loss": 0.3752, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 2.678966789667897, | |
| "grad_norm": 3.3157119750976562, | |
| "learning_rate": 5.254319997522796e-05, | |
| "loss": 0.4715, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 2.6838868388683887, | |
| "grad_norm": 2.951591730117798, | |
| "learning_rate": 5.21939811249492e-05, | |
| "loss": 0.4372, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 2.6888068880688807, | |
| "grad_norm": 1.9655730724334717, | |
| "learning_rate": 5.1845516374401784e-05, | |
| "loss": 0.3728, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 2.6937269372693726, | |
| "grad_norm": 2.9351847171783447, | |
| "learning_rate": 5.14978112203241e-05, | |
| "loss": 0.4404, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 2.6986469864698646, | |
| "grad_norm": 1.8943357467651367, | |
| "learning_rate": 5.11508711474725e-05, | |
| "loss": 0.2844, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 2.703567035670357, | |
| "grad_norm": 0.9512324333190918, | |
| "learning_rate": 5.080470162853472e-05, | |
| "loss": 0.3566, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 2.708487084870849, | |
| "grad_norm": 1.0358315706253052, | |
| "learning_rate": 5.0459308124043715e-05, | |
| "loss": 0.3267, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 2.713407134071341, | |
| "grad_norm": 0.828611433506012, | |
| "learning_rate": 5.0114696082291425e-05, | |
| "loss": 0.3766, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 2.7183271832718328, | |
| "grad_norm": 1.0593851804733276, | |
| "learning_rate": 4.9770870939242986e-05, | |
| "loss": 0.3895, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 2.7232472324723247, | |
| "grad_norm": 1.6970057487487793, | |
| "learning_rate": 4.942783811845074e-05, | |
| "loss": 0.3349, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 2.7281672816728166, | |
| "grad_norm": 0.8009957671165466, | |
| "learning_rate": 4.908560303096887e-05, | |
| "loss": 0.3741, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 2.7330873308733086, | |
| "grad_norm": 1.5965189933776855, | |
| "learning_rate": 4.874417107526795e-05, | |
| "loss": 0.326, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 2.7380073800738005, | |
| "grad_norm": 1.6400642395019531, | |
| "learning_rate": 4.840354763714991e-05, | |
| "loss": 0.3416, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 2.742927429274293, | |
| "grad_norm": 2.1281847953796387, | |
| "learning_rate": 4.8063738089662926e-05, | |
| "loss": 0.3142, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 2.747847478474785, | |
| "grad_norm": 1.0202291011810303, | |
| "learning_rate": 4.772474779301669e-05, | |
| "loss": 0.389, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 2.7527675276752768, | |
| "grad_norm": 1.6404527425765991, | |
| "learning_rate": 4.738658209449805e-05, | |
| "loss": 0.3891, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 2.7576875768757687, | |
| "grad_norm": 0.9586972594261169, | |
| "learning_rate": 4.704924632838636e-05, | |
| "loss": 0.3888, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 2.7626076260762606, | |
| "grad_norm": 3.16829776763916, | |
| "learning_rate": 4.671274581586958e-05, | |
| "loss": 0.4402, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 2.767527675276753, | |
| "grad_norm": 1.2215882539749146, | |
| "learning_rate": 4.637708586496018e-05, | |
| "loss": 0.3508, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 2.772447724477245, | |
| "grad_norm": 1.4616819620132446, | |
| "learning_rate": 4.604227177041156e-05, | |
| "loss": 0.4277, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 2.777367773677737, | |
| "grad_norm": 1.0973330736160278, | |
| "learning_rate": 4.570830881363439e-05, | |
| "loss": 0.4127, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 2.782287822878229, | |
| "grad_norm": 1.676638126373291, | |
| "learning_rate": 4.537520226261333e-05, | |
| "loss": 0.4243, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 2.787207872078721, | |
| "grad_norm": 2.636601448059082, | |
| "learning_rate": 4.5042957371824057e-05, | |
| "loss": 0.3116, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 2.7921279212792127, | |
| "grad_norm": 1.0604605674743652, | |
| "learning_rate": 4.471157938215017e-05, | |
| "loss": 0.4186, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 2.7970479704797047, | |
| "grad_norm": 1.1565591096878052, | |
| "learning_rate": 4.438107352080076e-05, | |
| "loss": 0.4068, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 2.8019680196801966, | |
| "grad_norm": 0.5449007153511047, | |
| "learning_rate": 4.405144500122772e-05, | |
| "loss": 0.3739, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 2.8068880688806885, | |
| "grad_norm": 1.7176798582077026, | |
| "learning_rate": 4.372269902304363e-05, | |
| "loss": 0.4036, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 2.811808118081181, | |
| "grad_norm": 1.9100306034088135, | |
| "learning_rate": 4.339484077193974e-05, | |
| "loss": 0.3436, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 2.816728167281673, | |
| "grad_norm": 0.950062096118927, | |
| "learning_rate": 4.3067875419604184e-05, | |
| "loss": 0.4206, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 2.821648216482165, | |
| "grad_norm": 1.950170636177063, | |
| "learning_rate": 4.2741808123640335e-05, | |
| "loss": 0.4187, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 2.8265682656826567, | |
| "grad_norm": 1.7998218536376953, | |
| "learning_rate": 4.241664402748544e-05, | |
| "loss": 0.3643, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 2.8314883148831487, | |
| "grad_norm": 0.5359982252120972, | |
| "learning_rate": 4.209238826032965e-05, | |
| "loss": 0.4071, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 2.836408364083641, | |
| "grad_norm": 2.177288055419922, | |
| "learning_rate": 4.1769045937034876e-05, | |
| "loss": 0.4656, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 2.841328413284133, | |
| "grad_norm": 1.8691096305847168, | |
| "learning_rate": 4.144662215805426e-05, | |
| "loss": 0.4428, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 2.846248462484625, | |
| "grad_norm": 1.3971619606018066, | |
| "learning_rate": 4.1125122009351634e-05, | |
| "loss": 0.3774, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 2.851168511685117, | |
| "grad_norm": 1.863781213760376, | |
| "learning_rate": 4.080455056232147e-05, | |
| "loss": 0.3686, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 2.856088560885609, | |
| "grad_norm": 2.2776503562927246, | |
| "learning_rate": 4.048491287370863e-05, | |
| "loss": 0.4009, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 2.8610086100861007, | |
| "grad_norm": 0.3682532012462616, | |
| "learning_rate": 4.016621398552877e-05, | |
| "loss": 0.4106, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 2.8659286592865927, | |
| "grad_norm": 1.6590131521224976, | |
| "learning_rate": 3.9848458924988684e-05, | |
| "loss": 0.4113, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 2.8708487084870846, | |
| "grad_norm": 0.8245828747749329, | |
| "learning_rate": 3.953165270440721e-05, | |
| "loss": 0.3874, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 2.875768757687577, | |
| "grad_norm": 1.1494457721710205, | |
| "learning_rate": 3.921580032113602e-05, | |
| "loss": 0.4041, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 2.880688806888069, | |
| "grad_norm": 0.3291958272457123, | |
| "learning_rate": 3.8900906757480614e-05, | |
| "loss": 0.4009, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 2.885608856088561, | |
| "grad_norm": 3.5113492012023926, | |
| "learning_rate": 3.858697698062217e-05, | |
| "loss": 0.4783, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 2.890528905289053, | |
| "grad_norm": 0.7835597991943359, | |
| "learning_rate": 3.8274015942538745e-05, | |
| "loss": 0.3928, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 2.8954489544895448, | |
| "grad_norm": 1.4036983251571655, | |
| "learning_rate": 3.7962028579927555e-05, | |
| "loss": 0.3694, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 2.900369003690037, | |
| "grad_norm": 1.1807712316513062, | |
| "learning_rate": 3.7651019814126654e-05, | |
| "loss": 0.385, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 2.905289052890529, | |
| "grad_norm": 2.0742995738983154, | |
| "learning_rate": 3.734099455103779e-05, | |
| "loss": 0.4164, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 2.910209102091021, | |
| "grad_norm": 2.6549105644226074, | |
| "learning_rate": 3.7031957681048604e-05, | |
| "loss": 0.347, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 2.915129151291513, | |
| "grad_norm": 1.3094247579574585, | |
| "learning_rate": 3.6723914078955825e-05, | |
| "loss": 0.4112, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 2.920049200492005, | |
| "grad_norm": 0.5627428293228149, | |
| "learning_rate": 3.64168686038881e-05, | |
| "loss": 0.3947, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 2.924969249692497, | |
| "grad_norm": 1.4705300331115723, | |
| "learning_rate": 3.6110826099229453e-05, | |
| "loss": 0.3828, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 2.9298892988929888, | |
| "grad_norm": 0.9498153924942017, | |
| "learning_rate": 3.580579139254303e-05, | |
| "loss": 0.3829, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 2.9348093480934807, | |
| "grad_norm": 2.052823781967163, | |
| "learning_rate": 3.550176929549468e-05, | |
| "loss": 0.3334, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 2.939729397293973, | |
| "grad_norm": 0.9632225036621094, | |
| "learning_rate": 3.5198764603777235e-05, | |
| "loss": 0.3681, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 2.944649446494465, | |
| "grad_norm": 1.2577297687530518, | |
| "learning_rate": 3.489678209703475e-05, | |
| "loss": 0.3469, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 2.949569495694957, | |
| "grad_norm": 1.42790949344635, | |
| "learning_rate": 3.459582653878731e-05, | |
| "loss": 0.4072, | |
| "step": 601 | |
| }, | |
| { | |
| "epoch": 2.954489544895449, | |
| "grad_norm": 2.504870653152466, | |
| "learning_rate": 3.429590267635565e-05, | |
| "loss": 0.4232, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 2.959409594095941, | |
| "grad_norm": 2.3047032356262207, | |
| "learning_rate": 3.399701524078635e-05, | |
| "loss": 0.3763, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 2.9643296432964332, | |
| "grad_norm": 1.7464078664779663, | |
| "learning_rate": 3.369916894677733e-05, | |
| "loss": 0.3354, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 2.969249692496925, | |
| "grad_norm": 1.6479971408843994, | |
| "learning_rate": 3.340236849260324e-05, | |
| "loss": 0.3798, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 2.974169741697417, | |
| "grad_norm": 1.558695673942566, | |
| "learning_rate": 3.31066185600417e-05, | |
| "loss": 0.3488, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 2.979089790897909, | |
| "grad_norm": 3.189610719680786, | |
| "learning_rate": 3.281192381429894e-05, | |
| "loss": 0.441, | |
| "step": 607 | |
| }, | |
| { | |
| "epoch": 2.984009840098401, | |
| "grad_norm": 0.9114331603050232, | |
| "learning_rate": 3.251828890393677e-05, | |
| "loss": 0.3922, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 2.988929889298893, | |
| "grad_norm": 0.84954833984375, | |
| "learning_rate": 3.222571846079881e-05, | |
| "loss": 0.3682, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 2.993849938499385, | |
| "grad_norm": 2.6202147006988525, | |
| "learning_rate": 3.193421709993779e-05, | |
| "loss": 0.453, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 2.998769987699877, | |
| "grad_norm": 1.6845208406448364, | |
| "learning_rate": 3.1643789419542324e-05, | |
| "loss": 0.3606, | |
| "step": 611 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 4.893674850463867, | |
| "learning_rate": 3.135444000086485e-05, | |
| "loss": 0.5199, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 3.004920049200492, | |
| "grad_norm": 1.350771427154541, | |
| "learning_rate": 3.1066173408148955e-05, | |
| "loss": 0.3319, | |
| "step": 613 | |
| }, | |
| { | |
| "epoch": 3.009840098400984, | |
| "grad_norm": 2.239192247390747, | |
| "learning_rate": 3.077899418855772e-05, | |
| "loss": 0.4358, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 3.014760147601476, | |
| "grad_norm": 2.0310704708099365, | |
| "learning_rate": 3.04929068721017e-05, | |
| "loss": 0.4024, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 3.019680196801968, | |
| "grad_norm": 0.5520709156990051, | |
| "learning_rate": 3.0207915971567624e-05, | |
| "loss": 0.3869, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 3.02460024600246, | |
| "grad_norm": 1.409179925918579, | |
| "learning_rate": 2.992402598244727e-05, | |
| "loss": 0.343, | |
| "step": 617 | |
| }, | |
| { | |
| "epoch": 3.029520295202952, | |
| "grad_norm": 3.2636709213256836, | |
| "learning_rate": 2.9641241382866348e-05, | |
| "loss": 0.3208, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 3.034440344403444, | |
| "grad_norm": 1.3331984281539917, | |
| "learning_rate": 2.9359566633514037e-05, | |
| "loss": 0.4065, | |
| "step": 619 | |
| }, | |
| { | |
| "epoch": 3.039360393603936, | |
| "grad_norm": 1.49379563331604, | |
| "learning_rate": 2.907900617757252e-05, | |
| "loss": 0.3844, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 3.044280442804428, | |
| "grad_norm": 1.0063300132751465, | |
| "learning_rate": 2.879956444064703e-05, | |
| "loss": 0.4103, | |
| "step": 621 | |
| }, | |
| { | |
| "epoch": 3.0492004920049203, | |
| "grad_norm": 1.5763076543807983, | |
| "learning_rate": 2.8521245830695864e-05, | |
| "loss": 0.4199, | |
| "step": 622 | |
| }, | |
| { | |
| "epoch": 3.054120541205412, | |
| "grad_norm": 1.9557186365127563, | |
| "learning_rate": 2.8244054737960935e-05, | |
| "loss": 0.3928, | |
| "step": 623 | |
| }, | |
| { | |
| "epoch": 3.059040590405904, | |
| "grad_norm": 1.7936758995056152, | |
| "learning_rate": 2.7967995534898596e-05, | |
| "loss": 0.3503, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 3.063960639606396, | |
| "grad_norm": 2.0918500423431396, | |
| "learning_rate": 2.7693072576110514e-05, | |
| "loss": 0.3772, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 3.068880688806888, | |
| "grad_norm": 1.531785249710083, | |
| "learning_rate": 2.7419290198275095e-05, | |
| "loss": 0.413, | |
| "step": 626 | |
| }, | |
| { | |
| "epoch": 3.07380073800738, | |
| "grad_norm": 0.7834340929985046, | |
| "learning_rate": 2.7146652720079003e-05, | |
| "loss": 0.3919, | |
| "step": 627 | |
| }, | |
| { | |
| "epoch": 3.078720787207872, | |
| "grad_norm": 1.8467501401901245, | |
| "learning_rate": 2.6875164442149147e-05, | |
| "loss": 0.368, | |
| "step": 628 | |
| }, | |
| { | |
| "epoch": 3.0836408364083643, | |
| "grad_norm": 1.6197096109390259, | |
| "learning_rate": 2.6604829646984686e-05, | |
| "loss": 0.3476, | |
| "step": 629 | |
| }, | |
| { | |
| "epoch": 3.088560885608856, | |
| "grad_norm": 2.2266929149627686, | |
| "learning_rate": 2.6335652598889683e-05, | |
| "loss": 0.3692, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 3.093480934809348, | |
| "grad_norm": 1.0801973342895508, | |
| "learning_rate": 2.60676375439055e-05, | |
| "loss": 0.4145, | |
| "step": 631 | |
| }, | |
| { | |
| "epoch": 3.09840098400984, | |
| "grad_norm": 0.6759971976280212, | |
| "learning_rate": 2.5800788709744227e-05, | |
| "loss": 0.3621, | |
| "step": 632 | |
| }, | |
| { | |
| "epoch": 3.103321033210332, | |
| "grad_norm": 1.5428274869918823, | |
| "learning_rate": 2.5535110305721776e-05, | |
| "loss": 0.3946, | |
| "step": 633 | |
| }, | |
| { | |
| "epoch": 3.108241082410824, | |
| "grad_norm": 0.4800112843513489, | |
| "learning_rate": 2.5270606522691443e-05, | |
| "loss": 0.3695, | |
| "step": 634 | |
| }, | |
| { | |
| "epoch": 3.113161131611316, | |
| "grad_norm": 1.418677568435669, | |
| "learning_rate": 2.500728153297788e-05, | |
| "loss": 0.3413, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 3.1180811808118083, | |
| "grad_norm": 1.384252667427063, | |
| "learning_rate": 2.4745139490311254e-05, | |
| "loss": 0.3376, | |
| "step": 636 | |
| }, | |
| { | |
| "epoch": 3.1230012300123002, | |
| "grad_norm": 0.7807061672210693, | |
| "learning_rate": 2.4484184529761834e-05, | |
| "loss": 0.4, | |
| "step": 637 | |
| }, | |
| { | |
| "epoch": 3.127921279212792, | |
| "grad_norm": 1.9366016387939453, | |
| "learning_rate": 2.4224420767674562e-05, | |
| "loss": 0.3731, | |
| "step": 638 | |
| }, | |
| { | |
| "epoch": 3.132841328413284, | |
| "grad_norm": 2.5923564434051514, | |
| "learning_rate": 2.3965852301604254e-05, | |
| "loss": 0.4395, | |
| "step": 639 | |
| }, | |
| { | |
| "epoch": 3.137761377613776, | |
| "grad_norm": 0.9284645318984985, | |
| "learning_rate": 2.370848321025093e-05, | |
| "loss": 0.3901, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 3.142681426814268, | |
| "grad_norm": 1.9988764524459839, | |
| "learning_rate": 2.345231755339554e-05, | |
| "loss": 0.4379, | |
| "step": 641 | |
| }, | |
| { | |
| "epoch": 3.14760147601476, | |
| "grad_norm": 1.626031517982483, | |
| "learning_rate": 2.3197359371835802e-05, | |
| "loss": 0.4256, | |
| "step": 642 | |
| }, | |
| { | |
| "epoch": 3.1525215252152523, | |
| "grad_norm": 2.1211905479431152, | |
| "learning_rate": 2.2943612687322525e-05, | |
| "loss": 0.3934, | |
| "step": 643 | |
| }, | |
| { | |
| "epoch": 3.1574415744157442, | |
| "grad_norm": 1.0140880346298218, | |
| "learning_rate": 2.2691081502496246e-05, | |
| "loss": 0.3604, | |
| "step": 644 | |
| }, | |
| { | |
| "epoch": 3.162361623616236, | |
| "grad_norm": 2.3775453567504883, | |
| "learning_rate": 2.243976980082394e-05, | |
| "loss": 0.4068, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 3.167281672816728, | |
| "grad_norm": 2.1912922859191895, | |
| "learning_rate": 2.218968154653629e-05, | |
| "loss": 0.3614, | |
| "step": 646 | |
| }, | |
| { | |
| "epoch": 3.17220172201722, | |
| "grad_norm": 1.8802082538604736, | |
| "learning_rate": 2.194082068456509e-05, | |
| "loss": 0.3843, | |
| "step": 647 | |
| }, | |
| { | |
| "epoch": 3.177121771217712, | |
| "grad_norm": 1.67764151096344, | |
| "learning_rate": 2.169319114048114e-05, | |
| "loss": 0.3707, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 3.1820418204182044, | |
| "grad_norm": 1.9697654247283936, | |
| "learning_rate": 2.1446796820432167e-05, | |
| "loss": 0.3357, | |
| "step": 649 | |
| }, | |
| { | |
| "epoch": 3.1869618696186963, | |
| "grad_norm": 1.7767447233200073, | |
| "learning_rate": 2.1201641611081246e-05, | |
| "loss": 0.3937, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 3.1918819188191883, | |
| "grad_norm": 1.3625164031982422, | |
| "learning_rate": 2.0957729379545655e-05, | |
| "loss": 0.3593, | |
| "step": 651 | |
| }, | |
| { | |
| "epoch": 3.19680196801968, | |
| "grad_norm": 1.0841906070709229, | |
| "learning_rate": 2.0715063973335568e-05, | |
| "loss": 0.393, | |
| "step": 652 | |
| }, | |
| { | |
| "epoch": 3.201722017220172, | |
| "grad_norm": 3.0648295879364014, | |
| "learning_rate": 2.04736492202937e-05, | |
| "loss": 0.3615, | |
| "step": 653 | |
| }, | |
| { | |
| "epoch": 3.206642066420664, | |
| "grad_norm": 1.1780354976654053, | |
| "learning_rate": 2.0233488928534673e-05, | |
| "loss": 0.3733, | |
| "step": 654 | |
| }, | |
| { | |
| "epoch": 3.211562115621156, | |
| "grad_norm": 2.0348012447357178, | |
| "learning_rate": 1.9994586886385046e-05, | |
| "loss": 0.3895, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 3.2164821648216484, | |
| "grad_norm": 0.7234269380569458, | |
| "learning_rate": 1.9756946862323535e-05, | |
| "loss": 0.3621, | |
| "step": 656 | |
| }, | |
| { | |
| "epoch": 3.2214022140221403, | |
| "grad_norm": 2.2290384769439697, | |
| "learning_rate": 1.9520572604921672e-05, | |
| "loss": 0.4369, | |
| "step": 657 | |
| }, | |
| { | |
| "epoch": 3.2263222632226323, | |
| "grad_norm": 0.7513899803161621, | |
| "learning_rate": 1.9285467842784467e-05, | |
| "loss": 0.3614, | |
| "step": 658 | |
| }, | |
| { | |
| "epoch": 3.231242312423124, | |
| "grad_norm": 2.5259876251220703, | |
| "learning_rate": 1.9051636284491757e-05, | |
| "loss": 0.3877, | |
| "step": 659 | |
| }, | |
| { | |
| "epoch": 3.236162361623616, | |
| "grad_norm": 2.885737180709839, | |
| "learning_rate": 1.8819081618539723e-05, | |
| "loss": 0.4691, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 3.241082410824108, | |
| "grad_norm": 1.888336181640625, | |
| "learning_rate": 1.858780751328255e-05, | |
| "loss": 0.433, | |
| "step": 661 | |
| }, | |
| { | |
| "epoch": 3.2460024600246005, | |
| "grad_norm": 0.801278293132782, | |
| "learning_rate": 1.8357817616874694e-05, | |
| "loss": 0.3704, | |
| "step": 662 | |
| }, | |
| { | |
| "epoch": 3.2509225092250924, | |
| "grad_norm": 1.2432537078857422, | |
| "learning_rate": 1.8129115557213262e-05, | |
| "loss": 0.3552, | |
| "step": 663 | |
| }, | |
| { | |
| "epoch": 3.2558425584255843, | |
| "grad_norm": 1.9892895221710205, | |
| "learning_rate": 1.7901704941880914e-05, | |
| "loss": 0.3551, | |
| "step": 664 | |
| }, | |
| { | |
| "epoch": 3.2607626076260763, | |
| "grad_norm": 1.448431372642517, | |
| "learning_rate": 1.7675589358088763e-05, | |
| "loss": 0.4053, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 3.265682656826568, | |
| "grad_norm": 2.4297046661376953, | |
| "learning_rate": 1.745077237261994e-05, | |
| "loss": 0.4334, | |
| "step": 666 | |
| }, | |
| { | |
| "epoch": 3.27060270602706, | |
| "grad_norm": 1.624751329421997, | |
| "learning_rate": 1.7227257531773223e-05, | |
| "loss": 0.4296, | |
| "step": 667 | |
| }, | |
| { | |
| "epoch": 3.275522755227552, | |
| "grad_norm": 1.1023207902908325, | |
| "learning_rate": 1.7005048361307262e-05, | |
| "loss": 0.375, | |
| "step": 668 | |
| }, | |
| { | |
| "epoch": 3.280442804428044, | |
| "grad_norm": 1.6138256788253784, | |
| "learning_rate": 1.6784148366384754e-05, | |
| "loss": 0.3394, | |
| "step": 669 | |
| }, | |
| { | |
| "epoch": 3.2853628536285364, | |
| "grad_norm": 0.9887522459030151, | |
| "learning_rate": 1.656456103151728e-05, | |
| "loss": 0.3597, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 3.2902829028290284, | |
| "grad_norm": 1.7043898105621338, | |
| "learning_rate": 1.6346289820510363e-05, | |
| "loss": 0.3417, | |
| "step": 671 | |
| }, | |
| { | |
| "epoch": 3.2952029520295203, | |
| "grad_norm": 1.6882188320159912, | |
| "learning_rate": 1.612933817640868e-05, | |
| "loss": 0.436, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 3.3001230012300122, | |
| "grad_norm": 0.7217171788215637, | |
| "learning_rate": 1.5913709521441988e-05, | |
| "loss": 0.3997, | |
| "step": 673 | |
| }, | |
| { | |
| "epoch": 3.305043050430504, | |
| "grad_norm": 2.6820271015167236, | |
| "learning_rate": 1.5699407256970833e-05, | |
| "loss": 0.3115, | |
| "step": 674 | |
| }, | |
| { | |
| "epoch": 3.3099630996309966, | |
| "grad_norm": 1.4860421419143677, | |
| "learning_rate": 1.5486434763433222e-05, | |
| "loss": 0.3516, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 3.3148831488314885, | |
| "grad_norm": 1.136051893234253, | |
| "learning_rate": 1.527479540029104e-05, | |
| "loss": 0.4023, | |
| "step": 676 | |
| }, | |
| { | |
| "epoch": 3.3198031980319804, | |
| "grad_norm": 2.500821828842163, | |
| "learning_rate": 1.5064492505977234e-05, | |
| "loss": 0.4225, | |
| "step": 677 | |
| }, | |
| { | |
| "epoch": 3.3247232472324724, | |
| "grad_norm": 0.5306374430656433, | |
| "learning_rate": 1.4855529397843038e-05, | |
| "loss": 0.3675, | |
| "step": 678 | |
| }, | |
| { | |
| "epoch": 3.3296432964329643, | |
| "grad_norm": 1.5522453784942627, | |
| "learning_rate": 1.4647909372105672e-05, | |
| "loss": 0.3182, | |
| "step": 679 | |
| }, | |
| { | |
| "epoch": 3.3345633456334562, | |
| "grad_norm": 1.6273597478866577, | |
| "learning_rate": 1.4441635703796408e-05, | |
| "loss": 0.3548, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 3.339483394833948, | |
| "grad_norm": 1.7513864040374756, | |
| "learning_rate": 1.4236711646708844e-05, | |
| "loss": 0.3177, | |
| "step": 681 | |
| }, | |
| { | |
| "epoch": 3.34440344403444, | |
| "grad_norm": 1.033565878868103, | |
| "learning_rate": 1.4033140433347569e-05, | |
| "loss": 0.3639, | |
| "step": 682 | |
| }, | |
| { | |
| "epoch": 3.3493234932349325, | |
| "grad_norm": 1.3103158473968506, | |
| "learning_rate": 1.3830925274877216e-05, | |
| "loss": 0.4256, | |
| "step": 683 | |
| }, | |
| { | |
| "epoch": 3.3542435424354244, | |
| "grad_norm": 2.1008458137512207, | |
| "learning_rate": 1.363006936107183e-05, | |
| "loss": 0.4194, | |
| "step": 684 | |
| }, | |
| { | |
| "epoch": 3.3591635916359164, | |
| "grad_norm": 1.350831151008606, | |
| "learning_rate": 1.343057586026446e-05, | |
| "loss": 0.3792, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 3.3640836408364083, | |
| "grad_norm": 3.0984957218170166, | |
| "learning_rate": 1.3232447919297274e-05, | |
| "loss": 0.4341, | |
| "step": 686 | |
| }, | |
| { | |
| "epoch": 3.3690036900369003, | |
| "grad_norm": 0.47078070044517517, | |
| "learning_rate": 1.3035688663471834e-05, | |
| "loss": 0.3664, | |
| "step": 687 | |
| }, | |
| { | |
| "epoch": 3.373923739237392, | |
| "grad_norm": 1.277298927307129, | |
| "learning_rate": 1.2840301196499893e-05, | |
| "loss": 0.3714, | |
| "step": 688 | |
| }, | |
| { | |
| "epoch": 3.3788437884378846, | |
| "grad_norm": 2.4945287704467773, | |
| "learning_rate": 1.2646288600454448e-05, | |
| "loss": 0.3517, | |
| "step": 689 | |
| }, | |
| { | |
| "epoch": 3.3837638376383765, | |
| "grad_norm": 0.9373493194580078, | |
| "learning_rate": 1.2453653935720867e-05, | |
| "loss": 0.3881, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 3.3886838868388685, | |
| "grad_norm": 4.251840591430664, | |
| "learning_rate": 1.2262400240949023e-05, | |
| "loss": 0.305, | |
| "step": 691 | |
| }, | |
| { | |
| "epoch": 3.3936039360393604, | |
| "grad_norm": 2.382617950439453, | |
| "learning_rate": 1.2072530533005012e-05, | |
| "loss": 0.4376, | |
| "step": 692 | |
| }, | |
| { | |
| "epoch": 3.3985239852398523, | |
| "grad_norm": 1.3531382083892822, | |
| "learning_rate": 1.1884047806923815e-05, | |
| "loss": 0.4127, | |
| "step": 693 | |
| }, | |
| { | |
| "epoch": 3.4034440344403443, | |
| "grad_norm": 0.8284920454025269, | |
| "learning_rate": 1.169695503586179e-05, | |
| "loss": 0.406, | |
| "step": 694 | |
| }, | |
| { | |
| "epoch": 3.408364083640836, | |
| "grad_norm": 0.6216104030609131, | |
| "learning_rate": 1.1511255171050084e-05, | |
| "loss": 0.3963, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 3.4132841328413286, | |
| "grad_norm": 2.1421051025390625, | |
| "learning_rate": 1.1326951141747788e-05, | |
| "loss": 0.449, | |
| "step": 696 | |
| }, | |
| { | |
| "epoch": 3.4182041820418205, | |
| "grad_norm": 1.2773298025131226, | |
| "learning_rate": 1.1144045855195973e-05, | |
| "loss": 0.3583, | |
| "step": 697 | |
| }, | |
| { | |
| "epoch": 3.4231242312423125, | |
| "grad_norm": 1.9336838722229004, | |
| "learning_rate": 1.0962542196571634e-05, | |
| "loss": 0.363, | |
| "step": 698 | |
| }, | |
| { | |
| "epoch": 3.4280442804428044, | |
| "grad_norm": 2.467573881149292, | |
| "learning_rate": 1.078244302894229e-05, | |
| "loss": 0.4245, | |
| "step": 699 | |
| }, | |
| { | |
| "epoch": 3.4329643296432963, | |
| "grad_norm": 2.337416648864746, | |
| "learning_rate": 1.0603751193220846e-05, | |
| "loss": 0.4083, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 3.4378843788437883, | |
| "grad_norm": 2.5366225242614746, | |
| "learning_rate": 1.0426469508120662e-05, | |
| "loss": 0.353, | |
| "step": 701 | |
| }, | |
| { | |
| "epoch": 3.4428044280442807, | |
| "grad_norm": 1.9000239372253418, | |
| "learning_rate": 1.0250600770111185e-05, | |
| "loss": 0.4028, | |
| "step": 702 | |
| }, | |
| { | |
| "epoch": 3.4477244772447726, | |
| "grad_norm": 1.7372283935546875, | |
| "learning_rate": 1.0076147753373789e-05, | |
| "loss": 0.4029, | |
| "step": 703 | |
| }, | |
| { | |
| "epoch": 3.4526445264452645, | |
| "grad_norm": 1.1029900312423706, | |
| "learning_rate": 9.903113209758096e-06, | |
| "loss": 0.3817, | |
| "step": 704 | |
| }, | |
| { | |
| "epoch": 3.4575645756457565, | |
| "grad_norm": 1.5212130546569824, | |
| "learning_rate": 9.731499868738447e-06, | |
| "loss": 0.3745, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 3.4624846248462484, | |
| "grad_norm": 1.2530347108840942, | |
| "learning_rate": 9.561310437370907e-06, | |
| "loss": 0.4198, | |
| "step": 706 | |
| }, | |
| { | |
| "epoch": 3.4674046740467404, | |
| "grad_norm": 1.090973138809204, | |
| "learning_rate": 9.392547600250634e-06, | |
| "loss": 0.3743, | |
| "step": 707 | |
| }, | |
| { | |
| "epoch": 3.4723247232472323, | |
| "grad_norm": 0.8587853312492371, | |
| "learning_rate": 9.225214019469385e-06, | |
| "loss": 0.3928, | |
| "step": 708 | |
| }, | |
| { | |
| "epoch": 3.4772447724477242, | |
| "grad_norm": 1.6450562477111816, | |
| "learning_rate": 9.059312334573633e-06, | |
| "loss": 0.3529, | |
| "step": 709 | |
| }, | |
| { | |
| "epoch": 3.4821648216482166, | |
| "grad_norm": 1.3053218126296997, | |
| "learning_rate": 8.89484516252287e-06, | |
| "loss": 0.3634, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 3.4870848708487086, | |
| "grad_norm": 2.639911413192749, | |
| "learning_rate": 8.731815097648433e-06, | |
| "loss": 0.4159, | |
| "step": 711 | |
| }, | |
| { | |
| "epoch": 3.4920049200492005, | |
| "grad_norm": 0.9935341477394104, | |
| "learning_rate": 8.570224711612385e-06, | |
| "loss": 0.3803, | |
| "step": 712 | |
| }, | |
| { | |
| "epoch": 3.4969249692496924, | |
| "grad_norm": 1.752165675163269, | |
| "learning_rate": 8.410076553367208e-06, | |
| "loss": 0.4104, | |
| "step": 713 | |
| }, | |
| { | |
| "epoch": 3.5018450184501844, | |
| "grad_norm": 1.270850419998169, | |
| "learning_rate": 8.251373149115293e-06, | |
| "loss": 0.4122, | |
| "step": 714 | |
| }, | |
| { | |
| "epoch": 3.5067650676506767, | |
| "grad_norm": 2.370002508163452, | |
| "learning_rate": 8.094117002269363e-06, | |
| "loss": 0.4529, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 3.5116851168511687, | |
| "grad_norm": 2.229987382888794, | |
| "learning_rate": 7.938310593412879e-06, | |
| "loss": 0.4117, | |
| "step": 716 | |
| }, | |
| { | |
| "epoch": 3.5166051660516606, | |
| "grad_norm": 1.700907588005066, | |
| "learning_rate": 7.783956380260837e-06, | |
| "loss": 0.3801, | |
| "step": 717 | |
| }, | |
| { | |
| "epoch": 3.5215252152521526, | |
| "grad_norm": 1.5140172243118286, | |
| "learning_rate": 7.631056797621106e-06, | |
| "loss": 0.3708, | |
| "step": 718 | |
| }, | |
| { | |
| "epoch": 3.5264452644526445, | |
| "grad_norm": 1.4080220460891724, | |
| "learning_rate": 7.479614257355971e-06, | |
| "loss": 0.3763, | |
| "step": 719 | |
| }, | |
| { | |
| "epoch": 3.5313653136531364, | |
| "grad_norm": 1.585070252418518, | |
| "learning_rate": 7.329631148344118e-06, | |
| "loss": 0.358, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 3.5362853628536284, | |
| "grad_norm": 2.044015645980835, | |
| "learning_rate": 7.181109836442912e-06, | |
| "loss": 0.3774, | |
| "step": 721 | |
| }, | |
| { | |
| "epoch": 3.5412054120541203, | |
| "grad_norm": 0.8359534740447998, | |
| "learning_rate": 7.034052664451118e-06, | |
| "loss": 0.3663, | |
| "step": 722 | |
| }, | |
| { | |
| "epoch": 3.5461254612546127, | |
| "grad_norm": 2.3022444248199463, | |
| "learning_rate": 6.88846195207189e-06, | |
| "loss": 0.3065, | |
| "step": 723 | |
| }, | |
| { | |
| "epoch": 3.5510455104551046, | |
| "grad_norm": 1.8175033330917358, | |
| "learning_rate": 6.7443399958762584e-06, | |
| "loss": 0.4242, | |
| "step": 724 | |
| }, | |
| { | |
| "epoch": 3.5559655596555966, | |
| "grad_norm": 1.7454516887664795, | |
| "learning_rate": 6.6016890692668364e-06, | |
| "loss": 0.3996, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 3.5608856088560885, | |
| "grad_norm": 2.403921604156494, | |
| "learning_rate": 6.460511422441984e-06, | |
| "loss": 0.4444, | |
| "step": 726 | |
| }, | |
| { | |
| "epoch": 3.5658056580565805, | |
| "grad_norm": 1.0997297763824463, | |
| "learning_rate": 6.320809282360319e-06, | |
| "loss": 0.4124, | |
| "step": 727 | |
| }, | |
| { | |
| "epoch": 3.570725707257073, | |
| "grad_norm": 3.04303240776062, | |
| "learning_rate": 6.1825848527055865e-06, | |
| "loss": 0.4291, | |
| "step": 728 | |
| }, | |
| { | |
| "epoch": 3.5756457564575648, | |
| "grad_norm": 0.9251189827919006, | |
| "learning_rate": 6.04584031385188e-06, | |
| "loss": 0.3733, | |
| "step": 729 | |
| }, | |
| { | |
| "epoch": 3.5805658056580567, | |
| "grad_norm": 1.9034310579299927, | |
| "learning_rate": 5.910577822829233e-06, | |
| "loss": 0.3884, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 3.5854858548585486, | |
| "grad_norm": 1.187487244606018, | |
| "learning_rate": 5.77679951328971e-06, | |
| "loss": 0.4108, | |
| "step": 731 | |
| }, | |
| { | |
| "epoch": 3.5904059040590406, | |
| "grad_norm": 1.513329267501831, | |
| "learning_rate": 5.644507495473572e-06, | |
| "loss": 0.4008, | |
| "step": 732 | |
| }, | |
| { | |
| "epoch": 3.5953259532595325, | |
| "grad_norm": 2.4123191833496094, | |
| "learning_rate": 5.5137038561761115e-06, | |
| "loss": 0.4162, | |
| "step": 733 | |
| }, | |
| { | |
| "epoch": 3.6002460024600245, | |
| "grad_norm": 1.3358474969863892, | |
| "learning_rate": 5.3843906587146886e-06, | |
| "loss": 0.4287, | |
| "step": 734 | |
| }, | |
| { | |
| "epoch": 3.6051660516605164, | |
| "grad_norm": 1.746752142906189, | |
| "learning_rate": 5.256569942896217e-06, | |
| "loss": 0.341, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 3.6100861008610083, | |
| "grad_norm": 1.716902732849121, | |
| "learning_rate": 5.130243724984995e-06, | |
| "loss": 0.4344, | |
| "step": 736 | |
| }, | |
| { | |
| "epoch": 3.6150061500615007, | |
| "grad_norm": 0.44636377692222595, | |
| "learning_rate": 5.005413997670816e-06, | |
| "loss": 0.3995, | |
| "step": 737 | |
| }, | |
| { | |
| "epoch": 3.6199261992619927, | |
| "grad_norm": 0.6673928499221802, | |
| "learning_rate": 4.8820827300376075e-06, | |
| "loss": 0.3771, | |
| "step": 738 | |
| }, | |
| { | |
| "epoch": 3.6248462484624846, | |
| "grad_norm": 1.8165249824523926, | |
| "learning_rate": 4.760251867532362e-06, | |
| "loss": 0.4214, | |
| "step": 739 | |
| }, | |
| { | |
| "epoch": 3.6297662976629765, | |
| "grad_norm": 1.8206608295440674, | |
| "learning_rate": 4.639923331934471e-06, | |
| "loss": 0.3361, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 3.6346863468634685, | |
| "grad_norm": 1.2049740552902222, | |
| "learning_rate": 4.521099021325336e-06, | |
| "loss": 0.4241, | |
| "step": 741 | |
| }, | |
| { | |
| "epoch": 3.639606396063961, | |
| "grad_norm": 2.151357650756836, | |
| "learning_rate": 4.403780810058511e-06, | |
| "loss": 0.3934, | |
| "step": 742 | |
| }, | |
| { | |
| "epoch": 3.644526445264453, | |
| "grad_norm": 2.024153470993042, | |
| "learning_rate": 4.287970548730069e-06, | |
| "loss": 0.4109, | |
| "step": 743 | |
| }, | |
| { | |
| "epoch": 3.6494464944649447, | |
| "grad_norm": 0.612326979637146, | |
| "learning_rate": 4.173670064149482e-06, | |
| "loss": 0.4119, | |
| "step": 744 | |
| }, | |
| { | |
| "epoch": 3.6543665436654367, | |
| "grad_norm": 1.2650341987609863, | |
| "learning_rate": 4.060881159310725e-06, | |
| "loss": 0.4048, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 3.6592865928659286, | |
| "grad_norm": 1.5588371753692627, | |
| "learning_rate": 3.949605613363882e-06, | |
| "loss": 0.3616, | |
| "step": 746 | |
| }, | |
| { | |
| "epoch": 3.6642066420664205, | |
| "grad_norm": 0.8163132667541504, | |
| "learning_rate": 3.839845181587098e-06, | |
| "loss": 0.4051, | |
| "step": 747 | |
| }, | |
| { | |
| "epoch": 3.6691266912669125, | |
| "grad_norm": 2.6811370849609375, | |
| "learning_rate": 3.7316015953588467e-06, | |
| "loss": 0.4446, | |
| "step": 748 | |
| }, | |
| { | |
| "epoch": 3.6740467404674044, | |
| "grad_norm": 3.0077154636383057, | |
| "learning_rate": 3.6248765621306414e-06, | |
| "loss": 0.3562, | |
| "step": 749 | |
| }, | |
| { | |
| "epoch": 3.678966789667897, | |
| "grad_norm": 1.8142826557159424, | |
| "learning_rate": 3.519671765400079e-06, | |
| "loss": 0.3967, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 3.6838868388683887, | |
| "grad_norm": 4.520020008087158, | |
| "learning_rate": 3.4159888646843495e-06, | |
| "loss": 0.4737, | |
| "step": 751 | |
| }, | |
| { | |
| "epoch": 3.6888068880688807, | |
| "grad_norm": 2.5950474739074707, | |
| "learning_rate": 3.313829495493992e-06, | |
| "loss": 0.3269, | |
| "step": 752 | |
| }, | |
| { | |
| "epoch": 3.6937269372693726, | |
| "grad_norm": 0.9162222146987915, | |
| "learning_rate": 3.2131952693070898e-06, | |
| "loss": 0.4284, | |
| "step": 753 | |
| }, | |
| { | |
| "epoch": 3.6986469864698646, | |
| "grad_norm": 2.3598175048828125, | |
| "learning_rate": 3.1140877735439387e-06, | |
| "loss": 0.4268, | |
| "step": 754 | |
| }, | |
| { | |
| "epoch": 3.703567035670357, | |
| "grad_norm": 2.1901378631591797, | |
| "learning_rate": 3.0165085715418763e-06, | |
| "loss": 0.3514, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 3.708487084870849, | |
| "grad_norm": 1.2730752229690552, | |
| "learning_rate": 2.9204592025307566e-06, | |
| "loss": 0.3697, | |
| "step": 756 | |
| }, | |
| { | |
| "epoch": 3.713407134071341, | |
| "grad_norm": 1.7523503303527832, | |
| "learning_rate": 2.8259411816085492e-06, | |
| "loss": 0.3626, | |
| "step": 757 | |
| }, | |
| { | |
| "epoch": 3.7183271832718328, | |
| "grad_norm": 0.7201489806175232, | |
| "learning_rate": 2.732955999717546e-06, | |
| "loss": 0.4082, | |
| "step": 758 | |
| }, | |
| { | |
| "epoch": 3.7232472324723247, | |
| "grad_norm": 2.6464169025421143, | |
| "learning_rate": 2.6415051236207355e-06, | |
| "loss": 0.3311, | |
| "step": 759 | |
| }, | |
| { | |
| "epoch": 3.7281672816728166, | |
| "grad_norm": 1.9799178838729858, | |
| "learning_rate": 2.551589995878789e-06, | |
| "loss": 0.392, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 3.7330873308733086, | |
| "grad_norm": 1.5155545473098755, | |
| "learning_rate": 2.4632120348272003e-06, | |
| "loss": 0.3762, | |
| "step": 761 | |
| }, | |
| { | |
| "epoch": 3.7380073800738005, | |
| "grad_norm": 1.5089105367660522, | |
| "learning_rate": 2.376372634553936e-06, | |
| "loss": 0.3995, | |
| "step": 762 | |
| }, | |
| { | |
| "epoch": 3.742927429274293, | |
| "grad_norm": 1.772503137588501, | |
| "learning_rate": 2.291073164877511e-06, | |
| "loss": 0.3853, | |
| "step": 763 | |
| }, | |
| { | |
| "epoch": 3.747847478474785, | |
| "grad_norm": 2.189436435699463, | |
| "learning_rate": 2.207314971325292e-06, | |
| "loss": 0.3494, | |
| "step": 764 | |
| }, | |
| { | |
| "epoch": 3.7527675276752768, | |
| "grad_norm": 1.9785796403884888, | |
| "learning_rate": 2.125099375112316e-06, | |
| "loss": 0.3675, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 3.7576875768757687, | |
| "grad_norm": 2.732494831085205, | |
| "learning_rate": 2.0444276731204415e-06, | |
| "loss": 0.4188, | |
| "step": 766 | |
| }, | |
| { | |
| "epoch": 3.7626076260762606, | |
| "grad_norm": 1.5634301900863647, | |
| "learning_rate": 1.9653011378779283e-06, | |
| "loss": 0.4186, | |
| "step": 767 | |
| }, | |
| { | |
| "epoch": 3.767527675276753, | |
| "grad_norm": 0.6259942650794983, | |
| "learning_rate": 1.88772101753929e-06, | |
| "loss": 0.3834, | |
| "step": 768 | |
| }, | |
| { | |
| "epoch": 3.772447724477245, | |
| "grad_norm": 1.3457146883010864, | |
| "learning_rate": 1.8116885358656744e-06, | |
| "loss": 0.3696, | |
| "step": 769 | |
| }, | |
| { | |
| "epoch": 3.777367773677737, | |
| "grad_norm": 1.3714008331298828, | |
| "learning_rate": 1.7372048922054906e-06, | |
| "loss": 0.3921, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 3.782287822878229, | |
| "grad_norm": 1.4138679504394531, | |
| "learning_rate": 1.6642712614755695e-06, | |
| "loss": 0.4379, | |
| "step": 771 | |
| }, | |
| { | |
| "epoch": 3.787207872078721, | |
| "grad_norm": 0.921842634677887, | |
| "learning_rate": 1.5928887941426107e-06, | |
| "loss": 0.3714, | |
| "step": 772 | |
| }, | |
| { | |
| "epoch": 3.7921279212792127, | |
| "grad_norm": 2.7711589336395264, | |
| "learning_rate": 1.523058616204942e-06, | |
| "loss": 0.3689, | |
| "step": 773 | |
| }, | |
| { | |
| "epoch": 3.7970479704797047, | |
| "grad_norm": 2.5462987422943115, | |
| "learning_rate": 1.4547818291749115e-06, | |
| "loss": 0.4578, | |
| "step": 774 | |
| }, | |
| { | |
| "epoch": 3.8019680196801966, | |
| "grad_norm": 2.8806490898132324, | |
| "learning_rate": 1.3880595100613792e-06, | |
| "loss": 0.3297, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 3.8068880688806885, | |
| "grad_norm": 1.5188145637512207, | |
| "learning_rate": 1.3228927113528189e-06, | |
| "loss": 0.3871, | |
| "step": 776 | |
| }, | |
| { | |
| "epoch": 3.811808118081181, | |
| "grad_norm": 0.9707936644554138, | |
| "learning_rate": 1.2592824610006215e-06, | |
| "loss": 0.3656, | |
| "step": 777 | |
| }, | |
| { | |
| "epoch": 3.816728167281673, | |
| "grad_norm": 1.8770543336868286, | |
| "learning_rate": 1.1972297624030072e-06, | |
| "loss": 0.3981, | |
| "step": 778 | |
| }, | |
| { | |
| "epoch": 3.821648216482165, | |
| "grad_norm": 2.3081560134887695, | |
| "learning_rate": 1.1367355943890823e-06, | |
| "loss": 0.341, | |
| "step": 779 | |
| }, | |
| { | |
| "epoch": 3.8265682656826567, | |
| "grad_norm": 1.113144040107727, | |
| "learning_rate": 1.0778009112034748e-06, | |
| "loss": 0.3586, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 3.8314883148831487, | |
| "grad_norm": 0.5980240702629089, | |
| "learning_rate": 1.0204266424912123e-06, | |
| "loss": 0.376, | |
| "step": 781 | |
| }, | |
| { | |
| "epoch": 3.836408364083641, | |
| "grad_norm": 0.6723970174789429, | |
| "learning_rate": 9.64613693283123e-07, | |
| "loss": 0.4038, | |
| "step": 782 | |
| }, | |
| { | |
| "epoch": 3.841328413284133, | |
| "grad_norm": 2.4948697090148926, | |
| "learning_rate": 9.103629439815354e-07, | |
| "loss": 0.3738, | |
| "step": 783 | |
| }, | |
| { | |
| "epoch": 3.846248462484625, | |
| "grad_norm": 1.11293625831604, | |
| "learning_rate": 8.57675250346368e-07, | |
| "loss": 0.3866, | |
| "step": 784 | |
| }, | |
| { | |
| "epoch": 3.851168511685117, | |
| "grad_norm": 2.0996763706207275, | |
| "learning_rate": 8.065514434816845e-07, | |
| "loss": 0.4064, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 3.856088560885609, | |
| "grad_norm": 1.6557263135910034, | |
| "learning_rate": 7.569923298225146e-07, | |
| "loss": 0.3567, | |
| "step": 786 | |
| }, | |
| { | |
| "epoch": 3.8610086100861007, | |
| "grad_norm": 1.717772364616394, | |
| "learning_rate": 7.08998691122198e-07, | |
| "loss": 0.3856, | |
| "step": 787 | |
| }, | |
| { | |
| "epoch": 3.8659286592865927, | |
| "grad_norm": 1.4299819469451904, | |
| "learning_rate": 6.625712844400056e-07, | |
| "loss": 0.3652, | |
| "step": 788 | |
| }, | |
| { | |
| "epoch": 3.8708487084870846, | |
| "grad_norm": 2.8910887241363525, | |
| "learning_rate": 6.177108421292266e-07, | |
| "loss": 0.4677, | |
| "step": 789 | |
| }, | |
| { | |
| "epoch": 3.875768757687577, | |
| "grad_norm": 1.175137996673584, | |
| "learning_rate": 5.744180718255776e-07, | |
| "loss": 0.4193, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 3.880688806888069, | |
| "grad_norm": 1.1175763607025146, | |
| "learning_rate": 5.326936564361118e-07, | |
| "loss": 0.3875, | |
| "step": 791 | |
| }, | |
| { | |
| "epoch": 3.885608856088561, | |
| "grad_norm": 0.9984952211380005, | |
| "learning_rate": 4.92538254128383e-07, | |
| "loss": 0.3799, | |
| "step": 792 | |
| }, | |
| { | |
| "epoch": 3.890528905289053, | |
| "grad_norm": 1.142543077468872, | |
| "learning_rate": 4.5395249832007604e-07, | |
| "loss": 0.4194, | |
| "step": 793 | |
| }, | |
| { | |
| "epoch": 3.8954489544895448, | |
| "grad_norm": 1.1013692617416382, | |
| "learning_rate": 4.1693699766902626e-07, | |
| "loss": 0.3853, | |
| "step": 794 | |
| }, | |
| { | |
| "epoch": 3.900369003690037, | |
| "grad_norm": 1.5713825225830078, | |
| "learning_rate": 3.814923360636158e-07, | |
| "loss": 0.4418, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 3.905289052890529, | |
| "grad_norm": 3.3740017414093018, | |
| "learning_rate": 3.4761907261356976e-07, | |
| "loss": 0.3226, | |
| "step": 796 | |
| }, | |
| { | |
| "epoch": 3.910209102091021, | |
| "grad_norm": 2.347411870956421, | |
| "learning_rate": 3.1531774164111903e-07, | |
| "loss": 0.4269, | |
| "step": 797 | |
| }, | |
| { | |
| "epoch": 3.915129151291513, | |
| "grad_norm": 0.46610283851623535, | |
| "learning_rate": 2.8458885267260705e-07, | |
| "loss": 0.3861, | |
| "step": 798 | |
| }, | |
| { | |
| "epoch": 3.920049200492005, | |
| "grad_norm": 2.183335304260254, | |
| "learning_rate": 2.554328904303738e-07, | |
| "loss": 0.4076, | |
| "step": 799 | |
| }, | |
| { | |
| "epoch": 3.924969249692497, | |
| "grad_norm": 0.9739826321601868, | |
| "learning_rate": 2.2785031482521758e-07, | |
| "loss": 0.366, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 3.9298892988929888, | |
| "grad_norm": 1.9975255727767944, | |
| "learning_rate": 2.0184156094905648e-07, | |
| "loss": 0.4491, | |
| "step": 801 | |
| }, | |
| { | |
| "epoch": 3.9348093480934807, | |
| "grad_norm": 2.732900619506836, | |
| "learning_rate": 1.7740703906810042e-07, | |
| "loss": 0.3248, | |
| "step": 802 | |
| }, | |
| { | |
| "epoch": 3.939729397293973, | |
| "grad_norm": 0.8809100389480591, | |
| "learning_rate": 1.545471346164007e-07, | |
| "loss": 0.3633, | |
| "step": 803 | |
| }, | |
| { | |
| "epoch": 3.944649446494465, | |
| "grad_norm": 0.5867434740066528, | |
| "learning_rate": 1.3326220818968838e-07, | |
| "loss": 0.3881, | |
| "step": 804 | |
| }, | |
| { | |
| "epoch": 3.949569495694957, | |
| "grad_norm": 0.8650780320167542, | |
| "learning_rate": 1.1355259553978981e-07, | |
| "loss": 0.3669, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 3.954489544895449, | |
| "grad_norm": 1.4509629011154175, | |
| "learning_rate": 9.541860756925314e-08, | |
| "loss": 0.3649, | |
| "step": 806 | |
| }, | |
| { | |
| "epoch": 3.959409594095941, | |
| "grad_norm": 2.9854180812835693, | |
| "learning_rate": 7.886053032649665e-08, | |
| "loss": 0.3379, | |
| "step": 807 | |
| }, | |
| { | |
| "epoch": 3.9643296432964332, | |
| "grad_norm": 3.3452847003936768, | |
| "learning_rate": 6.387862500125685e-08, | |
| "loss": 0.3104, | |
| "step": 808 | |
| }, | |
| { | |
| "epoch": 3.969249692496925, | |
| "grad_norm": 1.342034935951233, | |
| "learning_rate": 5.047312792046954e-08, | |
| "loss": 0.3895, | |
| "step": 809 | |
| }, | |
| { | |
| "epoch": 3.974169741697417, | |
| "grad_norm": 1.3684653043746948, | |
| "learning_rate": 3.8644250544594975e-08, | |
| "loss": 0.3729, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 3.979089790897909, | |
| "grad_norm": 2.351048231124878, | |
| "learning_rate": 2.839217946422057e-08, | |
| "loss": 0.4621, | |
| "step": 811 | |
| }, | |
| { | |
| "epoch": 3.984009840098401, | |
| "grad_norm": 0.49089106917381287, | |
| "learning_rate": 1.971707639712994e-08, | |
| "loss": 0.3819, | |
| "step": 812 | |
| }, | |
| { | |
| "epoch": 3.988929889298893, | |
| "grad_norm": 1.8144298791885376, | |
| "learning_rate": 1.2619078185793776e-08, | |
| "loss": 0.4157, | |
| "step": 813 | |
| }, | |
| { | |
| "epoch": 3.993849938499385, | |
| "grad_norm": 1.8721059560775757, | |
| "learning_rate": 7.098296795138293e-09, | |
| "loss": 0.3468, | |
| "step": 814 | |
| }, | |
| { | |
| "epoch": 3.998769987699877, | |
| "grad_norm": 1.0250661373138428, | |
| "learning_rate": 3.154819310868806e-09, | |
| "loss": 0.401, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 1.8312103748321533, | |
| "learning_rate": 7.887079380153317e-10, | |
| "loss": 0.3332, | |
| "step": 816 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "step": 816, | |
| "total_flos": 1.3456927249947034e+17, | |
| "train_loss": 0.40810306406780783, | |
| "train_runtime": 2344.5136, | |
| "train_samples_per_second": 11.093, | |
| "train_steps_per_second": 0.348 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 816, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 2400000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.3456927249947034e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |