| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 313, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0032, | |
| "grad_norm": 2.369070291519165, | |
| "learning_rate": 0.0, | |
| "loss": 2.6135, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0064, | |
| "grad_norm": 4.346841335296631, | |
| "learning_rate": 1.5625e-06, | |
| "loss": 2.7665, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.0096, | |
|         "grad_norm": null, | |
| "learning_rate": 3.125e-06, | |
| "loss": 3.7654, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.0128, | |
| "grad_norm": 3.766371488571167, | |
| "learning_rate": 3.125e-06, | |
| "loss": 2.8535, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 10.782400131225586, | |
| "learning_rate": 4.6875000000000004e-06, | |
| "loss": 3.599, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.0192, | |
| "grad_norm": 11.706145286560059, | |
| "learning_rate": 6.25e-06, | |
| "loss": 3.446, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.0224, | |
| "grad_norm": 5.429175853729248, | |
| "learning_rate": 7.8125e-06, | |
| "loss": 3.0459, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.0256, | |
| "grad_norm": 9.394842147827148, | |
| "learning_rate": 9.375000000000001e-06, | |
| "loss": 3.2699, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.0288, | |
| "grad_norm": 11.506270408630371, | |
| "learning_rate": 1.09375e-05, | |
| "loss": 3.3714, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.032, | |
|         "grad_norm": null, | |
| "learning_rate": 1.25e-05, | |
| "loss": 3.6752, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0352, | |
| "grad_norm": 1.7662224769592285, | |
| "learning_rate": 1.25e-05, | |
| "loss": 2.7507, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.0384, | |
| "grad_norm": 1.310247778892517, | |
| "learning_rate": 1.4062500000000001e-05, | |
| "loss": 2.4075, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.0416, | |
| "grad_norm": 9.33992862701416, | |
| "learning_rate": 1.5625e-05, | |
| "loss": 2.8547, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.0448, | |
| "grad_norm": 13.393060684204102, | |
| "learning_rate": 1.71875e-05, | |
| "loss": 2.4172, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 4.501781463623047, | |
| "learning_rate": 1.8750000000000002e-05, | |
| "loss": 2.2421, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.0512, | |
| "grad_norm": 3.861884593963623, | |
| "learning_rate": 2.0312500000000002e-05, | |
| "loss": 2.5123, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.0544, | |
| "grad_norm": 2.095228433609009, | |
| "learning_rate": 2.1875e-05, | |
| "loss": 2.5304, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.0576, | |
| "grad_norm": 3.1186673641204834, | |
| "learning_rate": 2.34375e-05, | |
| "loss": 2.2436, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.0608, | |
| "grad_norm": 2.2644600868225098, | |
| "learning_rate": 2.5e-05, | |
| "loss": 2.5769, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 2.515594482421875, | |
| "learning_rate": 2.6562500000000002e-05, | |
| "loss": 2.2113, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0672, | |
| "grad_norm": 1.5017852783203125, | |
| "learning_rate": 2.8125000000000003e-05, | |
| "loss": 2.0214, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.0704, | |
| "grad_norm": 1.3775278329849243, | |
| "learning_rate": 2.96875e-05, | |
| "loss": 2.4895, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.0736, | |
| "grad_norm": 1.2877167463302612, | |
| "learning_rate": 3.125e-05, | |
| "loss": 2.4068, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.0768, | |
| "grad_norm": 1.2189050912857056, | |
| "learning_rate": 3.2812500000000005e-05, | |
| "loss": 2.4256, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 1.0954622030258179, | |
| "learning_rate": 3.4375e-05, | |
| "loss": 2.1232, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.0832, | |
| "grad_norm": 1.1209958791732788, | |
| "learning_rate": 3.59375e-05, | |
| "loss": 2.2687, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.0864, | |
| "grad_norm": 1.6779496669769287, | |
| "learning_rate": 3.7500000000000003e-05, | |
| "loss": 2.2828, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.0896, | |
| "grad_norm": 1.3499761819839478, | |
| "learning_rate": 3.90625e-05, | |
| "loss": 2.1665, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.0928, | |
| "grad_norm": 0.9469571113586426, | |
| "learning_rate": 4.0625000000000005e-05, | |
| "loss": 2.4005, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 1.0315577983856201, | |
| "learning_rate": 4.21875e-05, | |
| "loss": 2.1806, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0992, | |
| "grad_norm": 4.572431564331055, | |
| "learning_rate": 4.375e-05, | |
| "loss": 1.9205, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.1024, | |
| "grad_norm": 2.5722362995147705, | |
| "learning_rate": 4.5312500000000004e-05, | |
| "loss": 2.0675, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.1056, | |
| "grad_norm": 0.9897894263267517, | |
| "learning_rate": 4.6875e-05, | |
| "loss": 2.3044, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.1088, | |
| "grad_norm": 0.9320685267448425, | |
| "learning_rate": 4.8437500000000005e-05, | |
| "loss": 2.125, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": 0.898413360118866, | |
| "learning_rate": 5e-05, | |
| "loss": 2.0403, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.1152, | |
| "grad_norm": 1.007996678352356, | |
| "learning_rate": 4.999843759868819e-05, | |
| "loss": 2.3807, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.1184, | |
| "grad_norm": 0.9936152696609497, | |
| "learning_rate": 4.9993750590040575e-05, | |
| "loss": 2.1271, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.1216, | |
| "grad_norm": 1.0016372203826904, | |
| "learning_rate": 4.998593955989626e-05, | |
| "loss": 1.94, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.1248, | |
| "grad_norm": 0.996575117111206, | |
| "learning_rate": 4.9975005484572305e-05, | |
| "loss": 2.0956, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 1.0652625560760498, | |
| "learning_rate": 4.996094973074183e-05, | |
| "loss": 1.8066, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.1312, | |
| "grad_norm": 0.9658384919166565, | |
| "learning_rate": 4.994377405526308e-05, | |
| "loss": 2.2093, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.1344, | |
| "grad_norm": 1.4187798500061035, | |
| "learning_rate": 4.992348060495989e-05, | |
| "loss": 2.1982, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.1376, | |
| "grad_norm": 0.8778465986251831, | |
| "learning_rate": 4.990007191635334e-05, | |
| "loss": 2.1151, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.1408, | |
| "grad_norm": 0.9850378036499023, | |
| "learning_rate": 4.987355091534468e-05, | |
| "loss": 2.4274, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "grad_norm": 1.1433888673782349, | |
| "learning_rate": 4.9843920916849645e-05, | |
| "loss": 2.0115, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.1472, | |
| "grad_norm": 1.1455744504928589, | |
| "learning_rate": 4.981118562438414e-05, | |
| "loss": 2.0401, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.1504, | |
| "grad_norm": 1.2628226280212402, | |
| "learning_rate": 4.9775349129601243e-05, | |
| "loss": 1.861, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.1536, | |
| "grad_norm": 0.9735330939292908, | |
| "learning_rate": 4.973641591177991e-05, | |
| "loss": 1.8776, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.1568, | |
| "grad_norm": 1.3270994424819946, | |
| "learning_rate": 4.969439083726496e-05, | |
| "loss": 1.9652, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 1.0269558429718018, | |
| "learning_rate": 4.964927915885893e-05, | |
| "loss": 1.8055, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.1632, | |
| "grad_norm": 0.9380667209625244, | |
| "learning_rate": 4.960108651516545e-05, | |
| "loss": 2.2103, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.1664, | |
| "grad_norm": 1.07145357131958, | |
| "learning_rate": 4.954981892988451e-05, | |
| "loss": 2.3342, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.1696, | |
| "grad_norm": 6.015671730041504, | |
| "learning_rate": 4.949548281105951e-05, | |
| "loss": 2.2024, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.1728, | |
| "grad_norm": 0.9981369376182556, | |
| "learning_rate": 4.943808495027631e-05, | |
| "loss": 1.8096, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "grad_norm": 0.9215571880340576, | |
| "learning_rate": 4.937763252181434e-05, | |
| "loss": 2.3134, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.1792, | |
| "grad_norm": 1.0013606548309326, | |
| "learning_rate": 4.93141330817499e-05, | |
| "loss": 2.2369, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.1824, | |
| "grad_norm": 0.859836995601654, | |
| "learning_rate": 4.924759456701167e-05, | |
| "loss": 1.6866, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.1856, | |
| "grad_norm": 0.9404245615005493, | |
| "learning_rate": 4.917802529438864e-05, | |
| "loss": 1.8875, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.1888, | |
| "grad_norm": 0.9107878804206848, | |
| "learning_rate": 4.910543395949067e-05, | |
| "loss": 2.161, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 0.8832803964614868, | |
| "learning_rate": 4.9029829635661475e-05, | |
| "loss": 1.9762, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.1952, | |
| "grad_norm": 0.8990869522094727, | |
| "learning_rate": 4.895122177284465e-05, | |
| "loss": 2.1535, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.1984, | |
| "grad_norm": 0.8530718684196472, | |
| "learning_rate": 4.8869620196402436e-05, | |
| "loss": 2.1623, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.2016, | |
| "grad_norm": 0.869974672794342, | |
| "learning_rate": 4.878503510588765e-05, | |
| "loss": 2.0117, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.2048, | |
| "grad_norm": 1.0152883529663086, | |
| "learning_rate": 4.8697477073768766e-05, | |
| "loss": 1.9535, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.208, | |
| "grad_norm": 0.8475722670555115, | |
| "learning_rate": 4.8606957044108556e-05, | |
| "loss": 1.7775, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.2112, | |
| "grad_norm": 0.8312771916389465, | |
| "learning_rate": 4.851348633119606e-05, | |
| "loss": 1.7419, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.2144, | |
| "grad_norm": 0.8250728249549866, | |
| "learning_rate": 4.8417076618132426e-05, | |
| "loss": 1.9348, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.2176, | |
| "grad_norm": 0.8720153570175171, | |
| "learning_rate": 4.8317739955370636e-05, | |
| "loss": 1.8901, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.2208, | |
| "grad_norm": 0.9559609293937683, | |
| "learning_rate": 4.821548875920927e-05, | |
| "loss": 2.121, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 0.8743850588798523, | |
| "learning_rate": 4.811033581024056e-05, | |
| "loss": 2.0368, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.2272, | |
| "grad_norm": 0.9637888669967651, | |
| "learning_rate": 4.800229425175294e-05, | |
| "loss": 2.1733, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.2304, | |
| "grad_norm": 0.8457867503166199, | |
| "learning_rate": 4.7891377588088223e-05, | |
| "loss": 1.9019, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.2336, | |
| "grad_norm": 0.9266003370285034, | |
| "learning_rate": 4.777759968295369e-05, | |
| "loss": 1.9502, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.2368, | |
| "grad_norm": 0.8747023940086365, | |
| "learning_rate": 4.766097475768919e-05, | |
| "loss": 1.8326, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.8718913793563843, | |
| "learning_rate": 4.754151738948962e-05, | |
| "loss": 1.9553, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.2432, | |
| "grad_norm": 0.841492235660553, | |
| "learning_rate": 4.741924250958289e-05, | |
| "loss": 1.9343, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.2464, | |
| "grad_norm": 0.9255774617195129, | |
| "learning_rate": 4.729416540136361e-05, | |
| "loss": 2.0969, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.2496, | |
| "grad_norm": 1.050870418548584, | |
| "learning_rate": 4.7166301698482815e-05, | |
| "loss": 2.0594, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.2528, | |
| "grad_norm": 1.2079211473464966, | |
| "learning_rate": 4.703566738289389e-05, | |
| "loss": 1.6793, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 0.9625230431556702, | |
| "learning_rate": 4.69022787828549e-05, | |
| "loss": 2.1397, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.2592, | |
| "grad_norm": 0.8908649682998657, | |
| "learning_rate": 4.676615257088776e-05, | |
| "loss": 2.1244, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.2624, | |
| "grad_norm": 0.8250916004180908, | |
| "learning_rate": 4.662730576169423e-05, | |
| "loss": 1.8623, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.2656, | |
| "grad_norm": 0.8161985278129578, | |
| "learning_rate": 4.6485755710029256e-05, | |
| "loss": 1.7176, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.2688, | |
| "grad_norm": 0.8897624015808105, | |
| "learning_rate": 4.6341520108531746e-05, | |
| "loss": 1.8418, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.272, | |
| "grad_norm": 1.2834111452102661, | |
| "learning_rate": 4.619461698551315e-05, | |
| "loss": 2.1907, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.2752, | |
| "grad_norm": 0.8536003232002258, | |
| "learning_rate": 4.604506470270403e-05, | |
| "loss": 1.8812, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.2784, | |
| "grad_norm": 0.9330596327781677, | |
| "learning_rate": 4.589288195295901e-05, | |
| "loss": 2.1898, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.2816, | |
| "grad_norm": 0.896048367023468, | |
| "learning_rate": 4.573808775792033e-05, | |
| "loss": 1.946, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.2848, | |
| "grad_norm": 0.8645696043968201, | |
| "learning_rate": 4.5580701465640254e-05, | |
| "loss": 2.0109, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 0.9180052280426025, | |
| "learning_rate": 4.5420742748162734e-05, | |
| "loss": 2.244, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.2912, | |
| "grad_norm": 0.8442833423614502, | |
| "learning_rate": 4.525823159906459e-05, | |
| "loss": 2.1774, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.2944, | |
| "grad_norm": 1.0204061269760132, | |
| "learning_rate": 4.509318833095642e-05, | |
| "loss": 2.0481, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.2976, | |
| "grad_norm": 0.9492475986480713, | |
| "learning_rate": 4.492563357294369e-05, | |
| "loss": 1.8785, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.3008, | |
| "grad_norm": 0.8547176122665405, | |
| "learning_rate": 4.475558826804833e-05, | |
| "loss": 2.0108, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.304, | |
| "grad_norm": 0.8365771174430847, | |
| "learning_rate": 4.458307367059092e-05, | |
| "loss": 1.707, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.3072, | |
| "grad_norm": 1.0437438488006592, | |
| "learning_rate": 4.440811134353412e-05, | |
| "loss": 2.0576, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.3104, | |
| "grad_norm": 0.9977460503578186, | |
| "learning_rate": 4.42307231557875e-05, | |
| "loss": 2.013, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.3136, | |
| "grad_norm": 0.894542396068573, | |
| "learning_rate": 4.4050931279474015e-05, | |
| "loss": 1.9567, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.3168, | |
| "grad_norm": 0.97090744972229, | |
| "learning_rate": 4.386875818715874e-05, | |
| "loss": 1.8304, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.9039097428321838, | |
| "learning_rate": 4.368422664903997e-05, | |
| "loss": 1.8483, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.3232, | |
| "grad_norm": 0.8699350357055664, | |
| "learning_rate": 4.349735973010305e-05, | |
| "loss": 1.8927, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.3264, | |
| "grad_norm": 0.943987250328064, | |
| "learning_rate": 4.330818078723755e-05, | |
| "loss": 1.8737, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.3296, | |
| "grad_norm": 2.9347333908081055, | |
| "learning_rate": 4.311671346631774e-05, | |
| "loss": 2.1136, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.3328, | |
| "grad_norm": 0.8647440671920776, | |
| "learning_rate": 4.292298169924709e-05, | |
| "loss": 2.0747, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.336, | |
| "grad_norm": 1.7402944564819336, | |
| "learning_rate": 4.272700970096696e-05, | |
| "loss": 1.8312, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.3392, | |
| "grad_norm": 2.0673842430114746, | |
| "learning_rate": 4.252882196642992e-05, | |
| "loss": 1.8589, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.3424, | |
| "grad_norm": 0.865168571472168, | |
| "learning_rate": 4.23284432675381e-05, | |
| "loss": 2.2354, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.3456, | |
| "grad_norm": 0.8473728895187378, | |
| "learning_rate": 4.212589865004684e-05, | |
| "loss": 2.0795, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.3488, | |
| "grad_norm": 1.0328521728515625, | |
| "learning_rate": 4.192121343043424e-05, | |
| "loss": 1.7663, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 0.871279776096344, | |
| "learning_rate": 4.1714413192736754e-05, | |
| "loss": 2.0111, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.3552, | |
| "grad_norm": 0.7886031270027161, | |
| "learning_rate": 4.150552378535137e-05, | |
| "loss": 1.6976, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.3584, | |
| "grad_norm": 1.020559310913086, | |
| "learning_rate": 4.1294571317804854e-05, | |
| "loss": 2.1983, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.3616, | |
| "grad_norm": 0.9146674275398254, | |
| "learning_rate": 4.108158215749014e-05, | |
| "loss": 1.9464, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.3648, | |
| "grad_norm": 0.9817960262298584, | |
| "learning_rate": 4.0866582926370725e-05, | |
| "loss": 2.1838, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.368, | |
| "grad_norm": 0.8889943361282349, | |
| "learning_rate": 4.064960049765304e-05, | |
| "loss": 2.3109, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.3712, | |
| "grad_norm": 0.8687418699264526, | |
| "learning_rate": 4.043066199242762e-05, | |
| "loss": 1.8089, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.3744, | |
| "grad_norm": 0.8281345963478088, | |
| "learning_rate": 4.020979477627907e-05, | |
| "loss": 1.885, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.3776, | |
| "grad_norm": 1.014325499534607, | |
| "learning_rate": 3.998702645586565e-05, | |
| "loss": 1.8577, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.3808, | |
| "grad_norm": 0.8055040240287781, | |
| "learning_rate": 3.976238487546864e-05, | |
| "loss": 1.8949, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 0.7790651917457581, | |
| "learning_rate": 3.953589811351204e-05, | |
| "loss": 1.7247, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.3872, | |
| "grad_norm": 1.029447317123413, | |
| "learning_rate": 3.930759447905298e-05, | |
| "loss": 2.1379, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.3904, | |
| "grad_norm": 0.8778262734413147, | |
| "learning_rate": 3.907750250824327e-05, | |
| "loss": 1.7193, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.3936, | |
| "grad_norm": 0.9018813967704773, | |
| "learning_rate": 3.884565096076269e-05, | |
| "loss": 2.1621, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.3968, | |
| "grad_norm": 0.9208047389984131, | |
| "learning_rate": 3.861206881622419e-05, | |
| "loss": 1.5739, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.866239070892334, | |
| "learning_rate": 3.837678527055168e-05, | |
| "loss": 1.7945, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.4032, | |
| "grad_norm": 0.825024425983429, | |
| "learning_rate": 3.813982973233083e-05, | |
| "loss": 1.9337, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.4064, | |
| "grad_norm": 0.9117039442062378, | |
| "learning_rate": 3.7901231819133105e-05, | |
| "loss": 2.1744, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.4096, | |
| "grad_norm": 0.8706339001655579, | |
| "learning_rate": 3.766102135381393e-05, | |
| "loss": 1.7783, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.4128, | |
| "grad_norm": 1.1152760982513428, | |
| "learning_rate": 3.741922836078499e-05, | |
| "loss": 1.9098, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 0.8744529485702515, | |
| "learning_rate": 3.717588306226143e-05, | |
| "loss": 1.9069, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.4192, | |
| "grad_norm": 0.8513407707214355, | |
| "learning_rate": 3.693101587448436e-05, | |
| "loss": 1.8288, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.4224, | |
| "grad_norm": 0.8916627168655396, | |
| "learning_rate": 3.6684657403919005e-05, | |
| "loss": 2.0371, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.4256, | |
| "grad_norm": 0.810296356678009, | |
| "learning_rate": 3.6436838443429175e-05, | |
| "loss": 1.8719, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.4288, | |
| "grad_norm": 0.8865705728530884, | |
| "learning_rate": 3.618758996842839e-05, | |
| "loss": 2.322, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.432, | |
| "grad_norm": 0.8098453879356384, | |
| "learning_rate": 3.5936943133008183e-05, | |
| "loss": 1.7499, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.4352, | |
| "grad_norm": 0.8217070698738098, | |
| "learning_rate": 3.568492926604412e-05, | |
| "loss": 1.8611, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.4384, | |
| "grad_norm": 0.8360695838928223, | |
| "learning_rate": 3.5431579867279905e-05, | |
| "loss": 1.8808, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.4416, | |
| "grad_norm": 0.8596831560134888, | |
| "learning_rate": 3.517692660339018e-05, | |
| "loss": 1.7403, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.4448, | |
| "grad_norm": 0.8024532794952393, | |
| "learning_rate": 3.492100130402242e-05, | |
| "loss": 1.8293, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 0.8411235809326172, | |
| "learning_rate": 3.4663835957818515e-05, | |
| "loss": 1.821, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.4512, | |
| "grad_norm": 0.9146133065223694, | |
| "learning_rate": 3.440546270841639e-05, | |
| "loss": 2.2463, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.4544, | |
| "grad_norm": 0.887295663356781, | |
| "learning_rate": 3.414591385043237e-05, | |
| "loss": 1.7169, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.4576, | |
| "grad_norm": 1.2334985733032227, | |
| "learning_rate": 3.3885221825424537e-05, | |
| "loss": 2.0077, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.4608, | |
| "grad_norm": 0.9017387628555298, | |
| "learning_rate": 3.362341921783784e-05, | |
| "loss": 1.9779, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.464, | |
| "grad_norm": 0.8526884317398071, | |
| "learning_rate": 3.336053875093128e-05, | |
| "loss": 1.9481, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.4672, | |
| "grad_norm": 0.9560144543647766, | |
| "learning_rate": 3.309661328268776e-05, | |
| "loss": 2.1816, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.4704, | |
| "grad_norm": 0.8322418332099915, | |
| "learning_rate": 3.283167580170712e-05, | |
| "loss": 1.7626, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.4736, | |
| "grad_norm": 0.9681915640830994, | |
| "learning_rate": 3.256575942308278e-05, | |
| "loss": 2.0646, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.4768, | |
| "grad_norm": 0.9159672260284424, | |
| "learning_rate": 3.229889738426264e-05, | |
| "loss": 2.0736, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.9347404837608337, | |
| "learning_rate": 3.203112304089466e-05, | |
| "loss": 1.8036, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.4832, | |
| "grad_norm": 0.8153049945831299, | |
| "learning_rate": 3.176246986265767e-05, | |
| "loss": 1.9047, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.4864, | |
| "grad_norm": 0.8197898864746094, | |
| "learning_rate": 3.149297142907792e-05, | |
| "loss": 1.8461, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.4896, | |
| "grad_norm": 0.8855655193328857, | |
| "learning_rate": 3.122266142533191e-05, | |
| "loss": 2.0462, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.4928, | |
| "grad_norm": 0.85092693567276, | |
| "learning_rate": 3.095157363803598e-05, | |
| "loss": 1.7231, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.496, | |
| "grad_norm": 3.6206603050231934, | |
| "learning_rate": 3.06797419510233e-05, | |
| "loss": 1.9591, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.4992, | |
| "grad_norm": 0.8123088479042053, | |
| "learning_rate": 3.0407200341108617e-05, | |
| "loss": 1.8609, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.5024, | |
| "grad_norm": 0.891497015953064, | |
| "learning_rate": 3.013398287384144e-05, | |
| "loss": 1.8931, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.5056, | |
| "grad_norm": 0.8028988242149353, | |
| "learning_rate": 2.986012369924811e-05, | |
| "loss": 1.8709, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.5088, | |
| "grad_norm": 0.806952178478241, | |
| "learning_rate": 2.9585657047563315e-05, | |
| "loss": 1.8017, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 0.9176279306411743, | |
| "learning_rate": 2.931061722495159e-05, | |
| "loss": 1.8632, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.5152, | |
| "grad_norm": 0.903057873249054, | |
| "learning_rate": 2.9035038609219306e-05, | |
| "loss": 1.9366, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.5184, | |
| "grad_norm": 0.7879480123519897, | |
| "learning_rate": 2.875895564551772e-05, | |
| "loss": 1.5944, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.5216, | |
| "grad_norm": 0.8131020069122314, | |
| "learning_rate": 2.8482402842037614e-05, | |
| "loss": 1.6836, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.5248, | |
| "grad_norm": 0.9431760907173157, | |
| "learning_rate": 2.8205414765696003e-05, | |
| "loss": 2.1004, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.528, | |
| "grad_norm": 0.9120539426803589, | |
| "learning_rate": 2.792802603781562e-05, | |
| "loss": 1.9059, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.5312, | |
| "grad_norm": 0.8896984457969666, | |
| "learning_rate": 2.7650271329797427e-05, | |
| "loss": 1.7062, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.5344, | |
| "grad_norm": 0.8532401323318481, | |
| "learning_rate": 2.737218535878705e-05, | |
| "loss": 1.9162, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.5376, | |
| "grad_norm": 0.985470175743103, | |
| "learning_rate": 2.7093802883335357e-05, | |
| "loss": 2.0339, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.5408, | |
| "grad_norm": 0.8058697581291199, | |
| "learning_rate": 2.6815158699053932e-05, | |
| "loss": 1.7464, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 0.9152082800865173, | |
| "learning_rate": 2.6536287634265918e-05, | |
| "loss": 2.1235, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.5472, | |
| "grad_norm": 0.8978695869445801, | |
| "learning_rate": 2.6257224545652688e-05, | |
| "loss": 1.9461, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.5504, | |
| "grad_norm": 0.8205640316009521, | |
| "learning_rate": 2.5978004313897104e-05, | |
| "loss": 1.9596, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.5536, | |
| "grad_norm": 0.8047122359275818, | |
| "learning_rate": 2.569866183932368e-05, | |
| "loss": 1.8327, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.5568, | |
| "grad_norm": 0.8385019898414612, | |
| "learning_rate": 2.5419232037536316e-05, | |
| "loss": 1.7761, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.8739506602287292, | |
| "learning_rate": 2.5139749835054123e-05, | |
| "loss": 1.7779, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.5632, | |
| "grad_norm": 0.867371141910553, | |
| "learning_rate": 2.4860250164945876e-05, | |
| "loss": 1.9015, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.5664, | |
| "grad_norm": 0.8940768241882324, | |
| "learning_rate": 2.4580767962463687e-05, | |
| "loss": 1.9762, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.5696, | |
| "grad_norm": 0.8470816612243652, | |
| "learning_rate": 2.4301338160676324e-05, | |
| "loss": 1.8437, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.5728, | |
| "grad_norm": 0.9534160494804382, | |
| "learning_rate": 2.40219956861029e-05, | |
| "loss": 1.8555, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 0.8401153683662415, | |
| "learning_rate": 2.374277545434732e-05, | |
| "loss": 1.6986, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.5792, | |
| "grad_norm": 0.8277825713157654, | |
| "learning_rate": 2.346371236573409e-05, | |
| "loss": 1.8941, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.5824, | |
| "grad_norm": 0.9646350145339966, | |
| "learning_rate": 2.318484130094607e-05, | |
| "loss": 1.8884, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.5856, | |
| "grad_norm": 0.8772605061531067, | |
| "learning_rate": 2.2906197116664653e-05, | |
| "loss": 2.0836, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.5888, | |
| "grad_norm": 0.8015424013137817, | |
| "learning_rate": 2.262781464121296e-05, | |
| "loss": 1.8136, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.592, | |
| "grad_norm": 0.8925164937973022, | |
| "learning_rate": 2.2349728670202582e-05, | |
| "loss": 1.93, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.5952, | |
| "grad_norm": 0.8571159243583679, | |
| "learning_rate": 2.2071973962184384e-05, | |
| "loss": 1.7503, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.5984, | |
| "grad_norm": 0.9113547205924988, | |
| "learning_rate": 2.1794585234303993e-05, | |
| "loss": 1.749, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.6016, | |
| "grad_norm": 1.083584189414978, | |
| "learning_rate": 2.1517597157962392e-05, | |
| "loss": 2.0044, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.6048, | |
| "grad_norm": 0.8615824580192566, | |
| "learning_rate": 2.124104435448228e-05, | |
| "loss": 1.8804, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 0.8497064113616943, | |
| "learning_rate": 2.0964961390780703e-05, | |
| "loss": 1.9249, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.6112, | |
| "grad_norm": 0.8820909857749939, | |
| "learning_rate": 2.0689382775048418e-05, | |
| "loss": 1.8348, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.6144, | |
| "grad_norm": 0.8180757761001587, | |
| "learning_rate": 2.0414342952436694e-05, | |
| "loss": 1.6585, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.6176, | |
| "grad_norm": 0.8933749794960022, | |
| "learning_rate": 2.0139876300751904e-05, | |
| "loss": 2.0343, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.6208, | |
| "grad_norm": 0.8355430960655212, | |
| "learning_rate": 1.9866017126158574e-05, | |
| "loss": 1.896, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.624, | |
| "grad_norm": 0.8331963419914246, | |
| "learning_rate": 1.9592799658891385e-05, | |
| "loss": 1.8097, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.6272, | |
| "grad_norm": 0.8662923574447632, | |
| "learning_rate": 1.9320258048976702e-05, | |
| "loss": 2.0222, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.6304, | |
| "grad_norm": 0.8453770279884338, | |
| "learning_rate": 1.904842636196402e-05, | |
| "loss": 1.8906, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.6336, | |
| "grad_norm": 0.9572663307189941, | |
| "learning_rate": 1.8777338574668095e-05, | |
| "loss": 2.1215, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.6368, | |
| "grad_norm": 0.8225608468055725, | |
| "learning_rate": 1.850702857092208e-05, | |
| "loss": 1.9586, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.8892737627029419, | |
| "learning_rate": 1.8237530137342335e-05, | |
| "loss": 2.1155, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.6432, | |
| "grad_norm": 0.8747507333755493, | |
| "learning_rate": 1.796887695910535e-05, | |
| "loss": 1.9199, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.6464, | |
| "grad_norm": 0.8982093334197998, | |
| "learning_rate": 1.7701102615737368e-05, | |
| "loss": 2.0063, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.6496, | |
| "grad_norm": 0.8300778865814209, | |
| "learning_rate": 1.7434240576917226e-05, | |
| "loss": 1.8665, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.6528, | |
| "grad_norm": 0.8988847732543945, | |
| "learning_rate": 1.7168324198292888e-05, | |
| "loss": 2.0139, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.656, | |
| "grad_norm": 0.845274806022644, | |
| "learning_rate": 1.6903386717312236e-05, | |
| "loss": 2.17, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.6592, | |
| "grad_norm": 0.8392491340637207, | |
| "learning_rate": 1.6639461249068726e-05, | |
| "loss": 1.8705, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.6624, | |
| "grad_norm": 0.8753416538238525, | |
| "learning_rate": 1.637658078216217e-05, | |
| "loss": 1.6344, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.6656, | |
| "grad_norm": 0.9340012669563293, | |
| "learning_rate": 1.6114778174575473e-05, | |
| "loss": 2.317, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.6688, | |
| "grad_norm": 0.9441534876823425, | |
| "learning_rate": 1.585408614956763e-05, | |
| "loss": 1.7811, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 0.8446135520935059, | |
| "learning_rate": 1.559453729158361e-05, | |
| "loss": 1.8073, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.6752, | |
| "grad_norm": 0.8299307823181152, | |
| "learning_rate": 1.5336164042181494e-05, | |
| "loss": 1.8219, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.6784, | |
| "grad_norm": 0.9229199290275574, | |
| "learning_rate": 1.5078998695977586e-05, | |
| "loss": 2.1888, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.6816, | |
| "grad_norm": 0.9439289569854736, | |
| "learning_rate": 1.482307339660983e-05, | |
| "loss": 2.1184, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.6848, | |
| "grad_norm": 0.8746132850646973, | |
| "learning_rate": 1.4568420132720106e-05, | |
| "loss": 2.0046, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.688, | |
| "grad_norm": 0.9310538172721863, | |
| "learning_rate": 1.4315070733955888e-05, | |
| "loss": 1.9485, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.6912, | |
| "grad_norm": 0.7922965884208679, | |
| "learning_rate": 1.4063056866991826e-05, | |
| "loss": 1.6378, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.6944, | |
| "grad_norm": 0.8741850256919861, | |
| "learning_rate": 1.381241003157162e-05, | |
| "loss": 1.923, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.6976, | |
| "grad_norm": 0.8461443781852722, | |
| "learning_rate": 1.3563161556570826e-05, | |
| "loss": 1.7054, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.7008, | |
| "grad_norm": 0.8542807102203369, | |
| "learning_rate": 1.3315342596080996e-05, | |
| "loss": 1.966, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 0.8851953744888306, | |
| "learning_rate": 1.3068984125515644e-05, | |
| "loss": 1.6682, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.7072, | |
| "grad_norm": 0.874228835105896, | |
| "learning_rate": 1.2824116937738579e-05, | |
| "loss": 2.0335, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.7104, | |
| "grad_norm": 0.8620577454566956, | |
| "learning_rate": 1.2580771639215027e-05, | |
| "loss": 1.7941, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.7136, | |
| "grad_norm": 0.8221322894096375, | |
| "learning_rate": 1.2338978646186084e-05, | |
| "loss": 1.7023, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.7168, | |
| "grad_norm": 0.8699591159820557, | |
| "learning_rate": 1.2098768180866895e-05, | |
| "loss": 1.8151, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.7773942947387695, | |
| "learning_rate": 1.1860170267669174e-05, | |
| "loss": 1.7476, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.7232, | |
| "grad_norm": 0.7643865942955017, | |
| "learning_rate": 1.1623214729448317e-05, | |
| "loss": 1.6205, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.7264, | |
| "grad_norm": 0.8884011507034302, | |
| "learning_rate": 1.1387931183775822e-05, | |
| "loss": 1.8415, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.7296, | |
| "grad_norm": 0.853610634803772, | |
| "learning_rate": 1.1154349039237322e-05, | |
| "loss": 1.7808, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.7328, | |
| "grad_norm": 0.9273776412010193, | |
| "learning_rate": 1.0922497491756734e-05, | |
| "loss": 2.0959, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 1.2236069440841675, | |
| "learning_rate": 1.0692405520947028e-05, | |
| "loss": 1.9245, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.7392, | |
| "grad_norm": 0.8130596280097961, | |
| "learning_rate": 1.0464101886487958e-05, | |
| "loss": 1.7531, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.7424, | |
| "grad_norm": 0.838800311088562, | |
| "learning_rate": 1.0237615124531363e-05, | |
| "loss": 1.8452, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.7456, | |
| "grad_norm": 0.807829737663269, | |
| "learning_rate": 1.0012973544134358e-05, | |
| "loss": 1.7922, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.7488, | |
| "grad_norm": 0.9993897676467896, | |
| "learning_rate": 9.79020522372093e-06, | |
| "loss": 1.9489, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.752, | |
| "grad_norm": 0.8562806844711304, | |
| "learning_rate": 9.569338007572382e-06, | |
| "loss": 1.7506, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.7552, | |
| "grad_norm": 0.8807257413864136, | |
| "learning_rate": 9.35039950234696e-06, | |
| "loss": 1.9989, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.7584, | |
| "grad_norm": 0.8456102013587952, | |
| "learning_rate": 9.133417073629289e-06, | |
| "loss": 1.8108, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.7616, | |
| "grad_norm": 0.8205898404121399, | |
| "learning_rate": 8.918417842509867e-06, | |
| "loss": 1.8678, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.7648, | |
| "grad_norm": 0.8826982975006104, | |
| "learning_rate": 8.705428682195155e-06, | |
| "loss": 1.9144, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 0.856715202331543, | |
| "learning_rate": 8.494476214648626e-06, | |
| "loss": 1.947, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.7712, | |
| "grad_norm": 0.8198254108428955, | |
| "learning_rate": 8.285586807263254e-06, | |
| "loss": 1.7441, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.7744, | |
| "grad_norm": 0.8953871130943298, | |
| "learning_rate": 8.078786569565763e-06, | |
| "loss": 2.0205, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.7776, | |
| "grad_norm": 0.8237789273262024, | |
| "learning_rate": 7.874101349953167e-06, | |
| "loss": 1.6816, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.7808, | |
| "grad_norm": 0.8389543294906616, | |
| "learning_rate": 7.671556732461905e-06, | |
| "loss": 2.0749, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.784, | |
| "grad_norm": 0.7939170598983765, | |
| "learning_rate": 7.471178033570081e-06, | |
| "loss": 1.694, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.7872, | |
| "grad_norm": 0.9527349472045898, | |
| "learning_rate": 7.272990299033045e-06, | |
| "loss": 1.8355, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.7904, | |
| "grad_norm": 0.8178287148475647, | |
| "learning_rate": 7.077018300752916e-06, | |
| "loss": 1.9606, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.7936, | |
| "grad_norm": 0.9523343443870544, | |
| "learning_rate": 6.883286533682265e-06, | |
| "loss": 2.1264, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.7968, | |
| "grad_norm": 0.8723739981651306, | |
| "learning_rate": 6.691819212762454e-06, | |
| "loss": 1.7615, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.9021463394165039, | |
| "learning_rate": 6.502640269896953e-06, | |
| "loss": 2.2895, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.8032, | |
| "grad_norm": 0.8240249156951904, | |
| "learning_rate": 6.3157733509600355e-06, | |
| "loss": 1.717, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.8064, | |
| "grad_norm": 0.7992557883262634, | |
| "learning_rate": 6.1312418128412565e-06, | |
| "loss": 1.5757, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.8096, | |
| "grad_norm": 0.8500726222991943, | |
| "learning_rate": 5.949068720525991e-06, | |
| "loss": 1.9647, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.8128, | |
| "grad_norm": 1.9370149374008179, | |
| "learning_rate": 5.769276844212501e-06, | |
| "loss": 1.6064, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.816, | |
| "grad_norm": 0.7748081684112549, | |
| "learning_rate": 5.591888656465874e-06, | |
| "loss": 1.7576, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.8192, | |
| "grad_norm": 0.9065202474594116, | |
| "learning_rate": 5.416926329409083e-06, | |
| "loss": 1.9247, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.8224, | |
| "grad_norm": 1.0182149410247803, | |
| "learning_rate": 5.244411731951671e-06, | |
| "loss": 1.8929, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.8256, | |
| "grad_norm": 0.8464491963386536, | |
| "learning_rate": 5.074366427056309e-06, | |
| "loss": 1.7739, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.8288, | |
| "grad_norm": 0.7988802194595337, | |
| "learning_rate": 4.90681166904359e-06, | |
| "loss": 1.7561, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 0.8608583211898804, | |
| "learning_rate": 4.741768400935417e-06, | |
| "loss": 1.9507, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.8352, | |
| "grad_norm": 0.8953453302383423, | |
| "learning_rate": 4.579257251837271e-06, | |
| "loss": 1.9238, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.8384, | |
| "grad_norm": 0.8528779745101929, | |
| "learning_rate": 4.419298534359759e-06, | |
| "loss": 1.7754, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.8416, | |
| "grad_norm": 0.8692997097969055, | |
| "learning_rate": 4.261912242079674e-06, | |
| "loss": 1.9674, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.8448, | |
| "grad_norm": 1.1034767627716064, | |
| "learning_rate": 4.107118047040995e-06, | |
| "loss": 1.7799, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.848, | |
| "grad_norm": 0.8734923005104065, | |
| "learning_rate": 3.954935297295975e-06, | |
| "loss": 1.9235, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.8512, | |
| "grad_norm": 0.8586539030075073, | |
| "learning_rate": 3.8053830144868547e-06, | |
| "loss": 1.8945, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.8544, | |
| "grad_norm": 0.8804144859313965, | |
| "learning_rate": 3.6584798914682582e-06, | |
| "loss": 2.1428, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.8576, | |
| "grad_norm": 0.903613805770874, | |
| "learning_rate": 3.514244289970753e-06, | |
| "loss": 2.0652, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.8608, | |
| "grad_norm": 0.7958067655563354, | |
| "learning_rate": 3.3726942383057763e-06, | |
| "loss": 1.5834, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 0.8151316046714783, | |
| "learning_rate": 3.233847429112244e-06, | |
| "loss": 1.8321, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.8672, | |
| "grad_norm": 0.8837787508964539, | |
| "learning_rate": 3.0977212171451e-06, | |
| "loss": 1.6769, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.8704, | |
| "grad_norm": 0.831078290939331, | |
| "learning_rate": 2.9643326171061165e-06, | |
| "loss": 1.6826, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.8736, | |
| "grad_norm": 0.9044331312179565, | |
| "learning_rate": 2.833698301517185e-06, | |
| "loss": 2.0594, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.8768, | |
| "grad_norm": 0.9114620685577393, | |
| "learning_rate": 2.7058345986363974e-06, | |
| "loss": 2.2189, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.8622688055038452, | |
| "learning_rate": 2.5807574904171155e-06, | |
| "loss": 1.907, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.8832, | |
| "grad_norm": 0.8636655807495117, | |
| "learning_rate": 2.4584826105103764e-06, | |
| "loss": 1.8381, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.8864, | |
| "grad_norm": 0.8480424284934998, | |
| "learning_rate": 2.3390252423108076e-06, | |
| "loss": 1.4082, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.8896, | |
| "grad_norm": 0.9045252203941345, | |
| "learning_rate": 2.222400317046308e-06, | |
| "loss": 1.9188, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.8928, | |
| "grad_norm": 0.9049263596534729, | |
| "learning_rate": 2.108622411911773e-06, | |
| "loss": 2.0128, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 0.8421158194541931, | |
| "learning_rate": 1.997705748247067e-06, | |
| "loss": 1.7721, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.8992, | |
| "grad_norm": 0.8193771839141846, | |
| "learning_rate": 1.8896641897594492e-06, | |
| "loss": 1.8937, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.9024, | |
| "grad_norm": 0.8462507128715515, | |
| "learning_rate": 1.78451124079074e-06, | |
| "loss": 2.017, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.9056, | |
| "grad_norm": 0.9032920598983765, | |
| "learning_rate": 1.6822600446293636e-06, | |
| "loss": 1.7346, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.9088, | |
| "grad_norm": 0.8496252298355103, | |
| "learning_rate": 1.5829233818675766e-06, | |
| "loss": 1.8274, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.912, | |
| "grad_norm": 0.8702892065048218, | |
| "learning_rate": 1.486513668803946e-06, | |
| "loss": 1.6593, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.9152, | |
| "grad_norm": 0.8834957480430603, | |
| "learning_rate": 1.3930429558914494e-06, | |
| "loss": 1.9114, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.9184, | |
| "grad_norm": 0.8452805876731873, | |
| "learning_rate": 1.3025229262312366e-06, | |
| "loss": 1.8522, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.9216, | |
| "grad_norm": 0.8840207457542419, | |
| "learning_rate": 1.214964894112361e-06, | |
| "loss": 1.7781, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.9248, | |
| "grad_norm": 0.8479911684989929, | |
| "learning_rate": 1.1303798035975643e-06, | |
| "loss": 2.0021, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 0.8397225737571716, | |
| "learning_rate": 1.0487782271553504e-06, | |
| "loss": 1.8152, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.9312, | |
| "grad_norm": 0.8048727512359619, | |
| "learning_rate": 9.701703643385295e-07, | |
| "loss": 1.8098, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.9344, | |
| "grad_norm": 0.9148857593536377, | |
| "learning_rate": 8.94566040509337e-07, | |
| "loss": 2.1158, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.9376, | |
| "grad_norm": 0.8682562112808228, | |
| "learning_rate": 8.219747056113586e-07, | |
| "loss": 1.6974, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.9408, | |
| "grad_norm": 0.8475475311279297, | |
| "learning_rate": 7.524054329883346e-07, | |
| "loss": 1.8491, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.944, | |
| "grad_norm": 0.8906409740447998, | |
| "learning_rate": 6.858669182500971e-07, | |
| "loss": 2.0995, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.9472, | |
| "grad_norm": 0.906276285648346, | |
| "learning_rate": 6.223674781856592e-07, | |
| "loss": 2.2162, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.9504, | |
| "grad_norm": 0.9206042885780334, | |
| "learning_rate": 5.619150497236992e-07, | |
| "loss": 2.1362, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.9536, | |
| "grad_norm": 0.7929039597511292, | |
| "learning_rate": 5.045171889404954e-07, | |
| "loss": 1.8142, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.9568, | |
| "grad_norm": 0.7848972082138062, | |
| "learning_rate": 4.501810701154907e-07, | |
| "loss": 1.3785, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.8327988982200623, | |
| "learning_rate": 3.98913484834551e-07, | |
| "loss": 1.7493, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.9632, | |
| "grad_norm": 0.9012575149536133, | |
| "learning_rate": 3.507208411410778e-07, | |
| "loss": 1.8922, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.9664, | |
| "grad_norm": 0.9032233953475952, | |
| "learning_rate": 3.0560916273504325e-07, | |
| "loss": 1.8345, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.9696, | |
| "grad_norm": 0.8583292961120605, | |
| "learning_rate": 2.635840882200924e-07, | |
| "loss": 1.9222, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.9728, | |
| "grad_norm": 0.9118805527687073, | |
| "learning_rate": 2.246508703987543e-07, | |
| "loss": 1.7877, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.976, | |
| "grad_norm": 0.8609216809272766, | |
| "learning_rate": 1.8881437561586722e-07, | |
| "loss": 2.0112, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.9792, | |
| "grad_norm": 0.8714916706085205, | |
| "learning_rate": 1.5607908315035667e-07, | |
| "loss": 1.949, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.9824, | |
| "grad_norm": 0.8294073939323425, | |
| "learning_rate": 1.264490846553279e-07, | |
| "loss": 1.7595, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.9856, | |
| "grad_norm": 0.9029958248138428, | |
| "learning_rate": 9.992808364666373e-08, | |
| "loss": 1.9646, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.9888, | |
| "grad_norm": 0.7959998846054077, | |
| "learning_rate": 7.651939504010885e-08, | |
| "loss": 1.6563, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.992, | |
| "grad_norm": 0.8054251074790955, | |
| "learning_rate": 5.622594473692067e-08, | |
| "loss": 1.838, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.9952, | |
| "grad_norm": 0.9047015905380249, | |
| "learning_rate": 3.90502692581729e-08, | |
| "loss": 1.713, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.9984, | |
| "grad_norm": 0.856833279132843, | |
| "learning_rate": 2.4994515427695374e-08, | |
| "loss": 1.8898, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.3098646402359009, | |
| "learning_rate": 1.4060440103746964e-08, | |
| "loss": 1.8755, | |
| "step": 313 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 313, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 6.23639789568e+16, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |