diff --git "a/resemble_to_hit_frequency_5039/checkpoint-40000/trainer_state.json" "b/resemble_to_hit_frequency_5039/checkpoint-40000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/resemble_to_hit_frequency_5039/checkpoint-40000/trainer_state.json" @@ -0,0 +1,6003 @@ +{ + "best_global_step": 40000, + "best_metric": 3.5570318698883057, + "best_model_checkpoint": "/scratch/cl5625/exceptions/models/resemble_to_hit_frequency_5039/checkpoint-40000", + "epoch": 11.655011655011656, + "eval_steps": 1000, + "global_step": 40000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.014568764568764568, + "grad_norm": 1.6134361028671265, + "learning_rate": 0.000294, + "loss": 8.4822, + "step": 50 + }, + { + "epoch": 0.029137529137529136, + "grad_norm": 0.6467522382736206, + "learning_rate": 0.0005939999999999999, + "loss": 6.7172, + "step": 100 + }, + { + "epoch": 0.043706293706293704, + "grad_norm": 0.4256949722766876, + "learning_rate": 0.0005998285714285713, + "loss": 6.3649, + "step": 150 + }, + { + "epoch": 0.05827505827505827, + "grad_norm": 0.4922082722187042, + "learning_rate": 0.0005996536443148687, + "loss": 6.1487, + "step": 200 + }, + { + "epoch": 0.07284382284382285, + "grad_norm": 0.4709911346435547, + "learning_rate": 0.0005994787172011662, + "loss": 6.0174, + "step": 250 + }, + { + "epoch": 0.08741258741258741, + "grad_norm": 0.48052042722702026, + "learning_rate": 0.0005993037900874635, + "loss": 5.8773, + "step": 300 + }, + { + "epoch": 0.10198135198135198, + "grad_norm": 0.5878971219062805, + "learning_rate": 0.0005991288629737609, + "loss": 5.7603, + "step": 350 + }, + { + "epoch": 0.11655011655011654, + "grad_norm": 0.4191853702068329, + "learning_rate": 0.0005989539358600582, + "loss": 5.6447, + "step": 400 + }, + { + "epoch": 0.13111888111888112, + "grad_norm": 0.49607983231544495, + "learning_rate": 0.0005987790087463557, + "loss": 5.5178, + "step": 450 + }, + { + "epoch": 0.1456876456876457, + "grad_norm": 0.4248947501182556, + "learning_rate": 0.000598604081632653, + "loss": 5.4198, + "step": 500 + }, + { + "epoch": 0.16025641025641027, + "grad_norm": 0.48206138610839844, + "learning_rate": 0.0005984291545189504, + "loss": 5.3384, + "step": 550 + }, + { + "epoch": 0.17482517482517482, + "grad_norm": 0.4465309679508209, + "learning_rate": 0.0005982542274052477, + "loss": 5.2645, + "step": 600 + }, + { + "epoch": 0.1893939393939394, + "grad_norm": 0.42823970317840576, + "learning_rate": 0.0005980793002915452, + "loss": 5.2088, + "step": 650 + }, + { + "epoch": 0.20396270396270397, + "grad_norm": 0.4172956943511963, + "learning_rate": 0.0005979043731778425, + "loss": 5.1419, + "step": 700 + }, + { + "epoch": 0.21853146853146854, + "grad_norm": 0.424402117729187, + "learning_rate": 0.0005977294460641399, + "loss": 5.0645, + "step": 750 + }, + { + "epoch": 0.2331002331002331, + "grad_norm": 0.4406491816043854, + "learning_rate": 0.0005975545189504372, + "loss": 5.0224, + "step": 800 + }, + { + "epoch": 0.24766899766899766, + "grad_norm": 0.4717820882797241, + "learning_rate": 0.0005973795918367347, + "loss": 5.003, + "step": 850 + }, + { + "epoch": 0.26223776223776224, + "grad_norm": 0.4521999657154083, + "learning_rate": 0.000597204664723032, + "loss": 4.9142, + "step": 900 + }, + { + "epoch": 0.2768065268065268, + "grad_norm": 0.4754863679409027, + "learning_rate": 0.0005970297376093294, + "loss": 4.8762, + "step": 950 + }, + { + "epoch": 0.2913752913752914, + "grad_norm": 0.41961297392845154, + "learning_rate": 0.0005968548104956268, + "loss": 4.8402, + "step": 1000 + }, + { + "epoch": 0.2913752913752914, + "eval_accuracy": 0.2549595710849709, + "eval_loss": 4.753758430480957, + "eval_runtime": 180.4427, + "eval_samples_per_second": 92.229, + "eval_steps_per_second": 5.769, + "step": 1000 + }, + { + "epoch": 0.30594405594405594, + "grad_norm": 0.6563605070114136, + "learning_rate": 0.0005966798833819242, + "loss": 4.7815, + "step": 1050 + }, + { + "epoch": 0.32051282051282054, + "grad_norm": 0.4702153503894806, + "learning_rate": 0.0005965049562682215, + "loss": 4.7461, + "step": 1100 + }, + { + "epoch": 0.3350815850815851, + "grad_norm": 0.4264092743396759, + "learning_rate": 0.0005963300291545189, + "loss": 4.6878, + "step": 1150 + }, + { + "epoch": 0.34965034965034963, + "grad_norm": 0.4903077185153961, + "learning_rate": 0.0005961551020408162, + "loss": 4.6656, + "step": 1200 + }, + { + "epoch": 0.36421911421911424, + "grad_norm": 0.4931991994380951, + "learning_rate": 0.0005959801749271137, + "loss": 4.6333, + "step": 1250 + }, + { + "epoch": 0.3787878787878788, + "grad_norm": 0.43286022543907166, + "learning_rate": 0.000595805247813411, + "loss": 4.6066, + "step": 1300 + }, + { + "epoch": 0.39335664335664333, + "grad_norm": 0.40360233187675476, + "learning_rate": 0.0005956303206997084, + "loss": 4.5706, + "step": 1350 + }, + { + "epoch": 0.40792540792540793, + "grad_norm": 0.420491486787796, + "learning_rate": 0.0005954553935860059, + "loss": 4.5591, + "step": 1400 + }, + { + "epoch": 0.4224941724941725, + "grad_norm": 0.4152667820453644, + "learning_rate": 0.0005952804664723032, + "loss": 4.5331, + "step": 1450 + }, + { + "epoch": 0.4370629370629371, + "grad_norm": 0.4153015613555908, + "learning_rate": 0.0005951055393586005, + "loss": 4.5102, + "step": 1500 + }, + { + "epoch": 0.45163170163170163, + "grad_norm": 0.4187549352645874, + "learning_rate": 0.0005949306122448979, + "loss": 4.4927, + "step": 1550 + }, + { + "epoch": 0.4662004662004662, + "grad_norm": 0.4402385354042053, + "learning_rate": 0.0005947556851311952, + "loss": 4.4652, + "step": 1600 + }, + { + "epoch": 0.4807692307692308, + "grad_norm": 0.41887184977531433, + "learning_rate": 0.0005945807580174927, + "loss": 4.4431, + "step": 1650 + }, + { + "epoch": 0.49533799533799533, + "grad_norm": 0.4349214434623718, + "learning_rate": 0.00059440583090379, + "loss": 4.4302, + "step": 1700 + }, + { + "epoch": 0.5099067599067599, + "grad_norm": 0.416457861661911, + "learning_rate": 0.0005942309037900874, + "loss": 4.4188, + "step": 1750 + }, + { + "epoch": 0.5244755244755245, + "grad_norm": 0.3888656198978424, + "learning_rate": 0.0005940559766763847, + "loss": 4.3824, + "step": 1800 + }, + { + "epoch": 0.539044289044289, + "grad_norm": 0.38429805636405945, + "learning_rate": 0.0005938810495626822, + "loss": 4.38, + "step": 1850 + }, + { + "epoch": 0.5536130536130536, + "grad_norm": 0.4373445510864258, + "learning_rate": 0.0005937061224489796, + "loss": 4.3664, + "step": 1900 + }, + { + "epoch": 0.5681818181818182, + "grad_norm": 0.43909236788749695, + "learning_rate": 0.0005935311953352769, + "loss": 4.3394, + "step": 1950 + }, + { + "epoch": 0.5827505827505828, + "grad_norm": 0.3650919795036316, + "learning_rate": 0.0005933562682215743, + "loss": 4.3353, + "step": 2000 + }, + { + "epoch": 0.5827505827505828, + "eval_accuracy": 0.30004649542771444, + "eval_loss": 4.282717704772949, + "eval_runtime": 180.3042, + "eval_samples_per_second": 92.3, + "eval_steps_per_second": 5.774, + "step": 2000 + }, + { + "epoch": 0.5973193473193473, + "grad_norm": 0.4164070188999176, + "learning_rate": 0.0005931813411078717, + "loss": 4.3193, + "step": 2050 + }, + { + "epoch": 0.6118881118881119, + "grad_norm": 0.370237797498703, + "learning_rate": 0.000593006413994169, + "loss": 4.3105, + "step": 2100 + }, + { + "epoch": 0.6264568764568764, + "grad_norm": 0.4082745611667633, + "learning_rate": 0.0005928314868804664, + "loss": 4.2941, + "step": 2150 + }, + { + "epoch": 0.6410256410256411, + "grad_norm": 0.3935624957084656, + "learning_rate": 0.0005926565597667638, + "loss": 4.2833, + "step": 2200 + }, + { + "epoch": 0.6555944055944056, + "grad_norm": 0.3984358310699463, + "learning_rate": 0.0005924816326530612, + "loss": 4.2679, + "step": 2250 + }, + { + "epoch": 0.6701631701631702, + "grad_norm": 0.3854668140411377, + "learning_rate": 0.0005923067055393586, + "loss": 4.2746, + "step": 2300 + }, + { + "epoch": 0.6847319347319347, + "grad_norm": 0.37700507044792175, + "learning_rate": 0.0005921317784256559, + "loss": 4.2429, + "step": 2350 + }, + { + "epoch": 0.6993006993006993, + "grad_norm": 0.3662063777446747, + "learning_rate": 0.0005919568513119533, + "loss": 4.2584, + "step": 2400 + }, + { + "epoch": 0.7138694638694638, + "grad_norm": 0.38568246364593506, + "learning_rate": 0.0005917819241982507, + "loss": 4.2344, + "step": 2450 + }, + { + "epoch": 0.7284382284382285, + "grad_norm": 0.43455713987350464, + "learning_rate": 0.000591606997084548, + "loss": 4.2328, + "step": 2500 + }, + { + "epoch": 0.743006993006993, + "grad_norm": 0.3856061100959778, + "learning_rate": 0.0005914320699708454, + "loss": 4.2092, + "step": 2550 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 0.38549911975860596, + "learning_rate": 0.0005912571428571428, + "loss": 4.218, + "step": 2600 + }, + { + "epoch": 0.7721445221445221, + "grad_norm": 0.39746782183647156, + "learning_rate": 0.0005910822157434402, + "loss": 4.1968, + "step": 2650 + }, + { + "epoch": 0.7867132867132867, + "grad_norm": 0.39923539757728577, + "learning_rate": 0.0005909072886297376, + "loss": 4.1886, + "step": 2700 + }, + { + "epoch": 0.8012820512820513, + "grad_norm": 0.3931720554828644, + "learning_rate": 0.0005907323615160349, + "loss": 4.1767, + "step": 2750 + }, + { + "epoch": 0.8158508158508159, + "grad_norm": 0.3459513783454895, + "learning_rate": 0.0005905574344023324, + "loss": 4.1785, + "step": 2800 + }, + { + "epoch": 0.8304195804195804, + "grad_norm": 0.39997783303260803, + "learning_rate": 0.0005903825072886297, + "loss": 4.1691, + "step": 2850 + }, + { + "epoch": 0.844988344988345, + "grad_norm": 0.3813166618347168, + "learning_rate": 0.000590207580174927, + "loss": 4.1596, + "step": 2900 + }, + { + "epoch": 0.8595571095571095, + "grad_norm": 0.37703993916511536, + "learning_rate": 0.0005900326530612244, + "loss": 4.1536, + "step": 2950 + }, + { + "epoch": 0.8741258741258742, + "grad_norm": 0.3787032961845398, + "learning_rate": 0.0005898577259475218, + "loss": 4.1411, + "step": 3000 + }, + { + "epoch": 0.8741258741258742, + "eval_accuracy": 0.31596194853706383, + "eval_loss": 4.094162940979004, + "eval_runtime": 180.2783, + "eval_samples_per_second": 92.313, + "eval_steps_per_second": 5.774, + "step": 3000 + }, + { + "epoch": 0.8886946386946387, + "grad_norm": 0.35312145948410034, + "learning_rate": 0.0005896827988338192, + "loss": 4.1438, + "step": 3050 + }, + { + "epoch": 0.9032634032634033, + "grad_norm": 0.38526853919029236, + "learning_rate": 0.0005895078717201166, + "loss": 4.1301, + "step": 3100 + }, + { + "epoch": 0.9178321678321678, + "grad_norm": 0.36466994881629944, + "learning_rate": 0.000589332944606414, + "loss": 4.1235, + "step": 3150 + }, + { + "epoch": 0.9324009324009324, + "grad_norm": 0.3571998178958893, + "learning_rate": 0.0005891580174927114, + "loss": 4.1292, + "step": 3200 + }, + { + "epoch": 0.946969696969697, + "grad_norm": 0.3403795063495636, + "learning_rate": 0.0005889830903790087, + "loss": 4.1082, + "step": 3250 + }, + { + "epoch": 0.9615384615384616, + "grad_norm": 0.38671016693115234, + "learning_rate": 0.000588808163265306, + "loss": 4.096, + "step": 3300 + }, + { + "epoch": 0.9761072261072261, + "grad_norm": 0.3343498408794403, + "learning_rate": 0.0005886332361516035, + "loss": 4.0914, + "step": 3350 + }, + { + "epoch": 0.9906759906759907, + "grad_norm": 0.3740348815917969, + "learning_rate": 0.0005884583090379008, + "loss": 4.1069, + "step": 3400 + }, + { + "epoch": 1.0052447552447552, + "grad_norm": 0.37788498401641846, + "learning_rate": 0.0005882833819241982, + "loss": 4.0461, + "step": 3450 + }, + { + "epoch": 1.0198135198135199, + "grad_norm": 0.3331277072429657, + "learning_rate": 0.0005881084548104955, + "loss": 4.0172, + "step": 3500 + }, + { + "epoch": 1.0343822843822843, + "grad_norm": 0.38395169377326965, + "learning_rate": 0.000587933527696793, + "loss": 4.014, + "step": 3550 + }, + { + "epoch": 1.048951048951049, + "grad_norm": 0.35354799032211304, + "learning_rate": 0.0005877586005830904, + "loss": 4.0174, + "step": 3600 + }, + { + "epoch": 1.0635198135198136, + "grad_norm": 0.35139200091362, + "learning_rate": 0.0005875836734693877, + "loss": 4.0323, + "step": 3650 + }, + { + "epoch": 1.078088578088578, + "grad_norm": 0.36168310046195984, + "learning_rate": 0.0005874087463556851, + "loss": 4.0024, + "step": 3700 + }, + { + "epoch": 1.0926573426573427, + "grad_norm": 0.3537745773792267, + "learning_rate": 0.0005872338192419825, + "loss": 4.0106, + "step": 3750 + }, + { + "epoch": 1.1072261072261071, + "grad_norm": 0.3509289026260376, + "learning_rate": 0.0005870588921282798, + "loss": 3.9951, + "step": 3800 + }, + { + "epoch": 1.1217948717948718, + "grad_norm": 0.3399880826473236, + "learning_rate": 0.0005868839650145772, + "loss": 3.9968, + "step": 3850 + }, + { + "epoch": 1.1363636363636362, + "grad_norm": 0.31629034876823425, + "learning_rate": 0.0005867090379008745, + "loss": 3.9957, + "step": 3900 + }, + { + "epoch": 1.150932400932401, + "grad_norm": 0.34750989079475403, + "learning_rate": 0.000586534110787172, + "loss": 3.9916, + "step": 3950 + }, + { + "epoch": 1.1655011655011656, + "grad_norm": 0.34375834465026855, + "learning_rate": 0.0005863591836734694, + "loss": 3.994, + "step": 4000 + }, + { + "epoch": 1.1655011655011656, + "eval_accuracy": 0.3255772359138492, + "eval_loss": 3.989028215408325, + "eval_runtime": 180.1395, + "eval_samples_per_second": 92.384, + "eval_steps_per_second": 5.779, + "step": 4000 + }, + { + "epoch": 1.18006993006993, + "grad_norm": 0.37413716316223145, + "learning_rate": 0.0005861842565597667, + "loss": 3.9981, + "step": 4050 + }, + { + "epoch": 1.1946386946386947, + "grad_norm": 0.35629984736442566, + "learning_rate": 0.0005860093294460641, + "loss": 3.975, + "step": 4100 + }, + { + "epoch": 1.2092074592074593, + "grad_norm": 0.3381233215332031, + "learning_rate": 0.0005858344023323615, + "loss": 3.9851, + "step": 4150 + }, + { + "epoch": 1.2237762237762237, + "grad_norm": 0.34585943818092346, + "learning_rate": 0.0005856594752186588, + "loss": 3.9808, + "step": 4200 + }, + { + "epoch": 1.2383449883449884, + "grad_norm": 0.35993272066116333, + "learning_rate": 0.0005854845481049562, + "loss": 3.975, + "step": 4250 + }, + { + "epoch": 1.2529137529137528, + "grad_norm": 0.32540130615234375, + "learning_rate": 0.0005853096209912535, + "loss": 3.9714, + "step": 4300 + }, + { + "epoch": 1.2674825174825175, + "grad_norm": 0.36224445700645447, + "learning_rate": 0.000585134693877551, + "loss": 3.9754, + "step": 4350 + }, + { + "epoch": 1.282051282051282, + "grad_norm": 0.3662620186805725, + "learning_rate": 0.0005849597667638484, + "loss": 3.9694, + "step": 4400 + }, + { + "epoch": 1.2966200466200466, + "grad_norm": 0.35438838601112366, + "learning_rate": 0.0005847848396501457, + "loss": 3.9519, + "step": 4450 + }, + { + "epoch": 1.3111888111888113, + "grad_norm": 0.34450942277908325, + "learning_rate": 0.0005846099125364432, + "loss": 3.963, + "step": 4500 + }, + { + "epoch": 1.3257575757575757, + "grad_norm": 0.351962685585022, + "learning_rate": 0.0005844349854227405, + "loss": 3.9591, + "step": 4550 + }, + { + "epoch": 1.3403263403263403, + "grad_norm": 0.3839578926563263, + "learning_rate": 0.0005842600583090379, + "loss": 3.9561, + "step": 4600 + }, + { + "epoch": 1.354895104895105, + "grad_norm": 0.32113179564476013, + "learning_rate": 0.0005840851311953352, + "loss": 3.949, + "step": 4650 + }, + { + "epoch": 1.3694638694638694, + "grad_norm": 0.33071938157081604, + "learning_rate": 0.0005839102040816325, + "loss": 3.9608, + "step": 4700 + }, + { + "epoch": 1.384032634032634, + "grad_norm": 0.33803558349609375, + "learning_rate": 0.00058373527696793, + "loss": 3.9482, + "step": 4750 + }, + { + "epoch": 1.3986013986013985, + "grad_norm": 0.31636884808540344, + "learning_rate": 0.0005835603498542273, + "loss": 3.9437, + "step": 4800 + }, + { + "epoch": 1.4131701631701632, + "grad_norm": 0.3646225035190582, + "learning_rate": 0.0005833854227405247, + "loss": 3.9303, + "step": 4850 + }, + { + "epoch": 1.4277389277389276, + "grad_norm": 0.3559642732143402, + "learning_rate": 0.0005832104956268222, + "loss": 3.9403, + "step": 4900 + }, + { + "epoch": 1.4423076923076923, + "grad_norm": 0.3481752276420593, + "learning_rate": 0.0005830355685131195, + "loss": 3.9357, + "step": 4950 + }, + { + "epoch": 1.456876456876457, + "grad_norm": 0.313125878572464, + "learning_rate": 0.0005828606413994169, + "loss": 3.9303, + "step": 5000 + }, + { + "epoch": 1.456876456876457, + "eval_accuracy": 0.3321257535516557, + "eval_loss": 3.9129536151885986, + "eval_runtime": 180.4532, + "eval_samples_per_second": 92.223, + "eval_steps_per_second": 5.769, + "step": 5000 + }, + { + "epoch": 1.4714452214452214, + "grad_norm": 0.33051010966300964, + "learning_rate": 0.0005826857142857142, + "loss": 3.9226, + "step": 5050 + }, + { + "epoch": 1.486013986013986, + "grad_norm": 0.3060428500175476, + "learning_rate": 0.0005825107871720116, + "loss": 3.9254, + "step": 5100 + }, + { + "epoch": 1.5005827505827507, + "grad_norm": 0.34262314438819885, + "learning_rate": 0.000582335860058309, + "loss": 3.9131, + "step": 5150 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 0.33539673686027527, + "learning_rate": 0.0005821609329446063, + "loss": 3.9158, + "step": 5200 + }, + { + "epoch": 1.5297202797202796, + "grad_norm": 0.3277048170566559, + "learning_rate": 0.0005819860058309037, + "loss": 3.9228, + "step": 5250 + }, + { + "epoch": 1.5442890442890445, + "grad_norm": 0.31714221835136414, + "learning_rate": 0.0005818110787172012, + "loss": 3.9245, + "step": 5300 + }, + { + "epoch": 1.558857808857809, + "grad_norm": 0.329098105430603, + "learning_rate": 0.0005816361516034985, + "loss": 3.9212, + "step": 5350 + }, + { + "epoch": 1.5734265734265733, + "grad_norm": 0.33248335123062134, + "learning_rate": 0.0005814612244897959, + "loss": 3.9066, + "step": 5400 + }, + { + "epoch": 1.587995337995338, + "grad_norm": 0.3300471305847168, + "learning_rate": 0.0005812862973760932, + "loss": 3.9076, + "step": 5450 + }, + { + "epoch": 1.6025641025641026, + "grad_norm": 0.3110630214214325, + "learning_rate": 0.0005811113702623907, + "loss": 3.8996, + "step": 5500 + }, + { + "epoch": 1.617132867132867, + "grad_norm": 0.34096479415893555, + "learning_rate": 0.000580936443148688, + "loss": 3.8914, + "step": 5550 + }, + { + "epoch": 1.6317016317016317, + "grad_norm": 0.3256978690624237, + "learning_rate": 0.0005807615160349853, + "loss": 3.8901, + "step": 5600 + }, + { + "epoch": 1.6462703962703964, + "grad_norm": 0.3170398771762848, + "learning_rate": 0.0005805865889212827, + "loss": 3.9086, + "step": 5650 + }, + { + "epoch": 1.6608391608391608, + "grad_norm": 0.32134151458740234, + "learning_rate": 0.0005804116618075802, + "loss": 3.8843, + "step": 5700 + }, + { + "epoch": 1.6754079254079253, + "grad_norm": 0.3455315828323364, + "learning_rate": 0.0005802367346938775, + "loss": 3.8936, + "step": 5750 + }, + { + "epoch": 1.68997668997669, + "grad_norm": 0.33487361669540405, + "learning_rate": 0.0005800618075801749, + "loss": 3.903, + "step": 5800 + }, + { + "epoch": 1.7045454545454546, + "grad_norm": 0.3249671459197998, + "learning_rate": 0.0005798868804664722, + "loss": 3.8913, + "step": 5850 + }, + { + "epoch": 1.719114219114219, + "grad_norm": 0.35598769783973694, + "learning_rate": 0.0005797119533527697, + "loss": 3.8821, + "step": 5900 + }, + { + "epoch": 1.7336829836829837, + "grad_norm": 0.34034013748168945, + "learning_rate": 0.000579537026239067, + "loss": 3.8849, + "step": 5950 + }, + { + "epoch": 1.7482517482517483, + "grad_norm": 0.33674389123916626, + "learning_rate": 0.0005793620991253643, + "loss": 3.8992, + "step": 6000 + }, + { + "epoch": 1.7482517482517483, + "eval_accuracy": 0.33753787307759514, + "eval_loss": 3.85577654838562, + "eval_runtime": 180.1447, + "eval_samples_per_second": 92.381, + "eval_steps_per_second": 5.779, + "step": 6000 + }, + { + "epoch": 1.7628205128205128, + "grad_norm": 0.32885122299194336, + "learning_rate": 0.0005791871720116617, + "loss": 3.8805, + "step": 6050 + }, + { + "epoch": 1.7773892773892774, + "grad_norm": 0.32068461179733276, + "learning_rate": 0.0005790122448979591, + "loss": 3.8668, + "step": 6100 + }, + { + "epoch": 1.791958041958042, + "grad_norm": 0.3308079242706299, + "learning_rate": 0.0005788373177842565, + "loss": 3.8776, + "step": 6150 + }, + { + "epoch": 1.8065268065268065, + "grad_norm": 0.32728639245033264, + "learning_rate": 0.0005786623906705539, + "loss": 3.8633, + "step": 6200 + }, + { + "epoch": 1.821095571095571, + "grad_norm": 0.3404487073421478, + "learning_rate": 0.0005784874635568512, + "loss": 3.8712, + "step": 6250 + }, + { + "epoch": 1.8356643356643356, + "grad_norm": 0.32237741351127625, + "learning_rate": 0.0005783125364431487, + "loss": 3.8582, + "step": 6300 + }, + { + "epoch": 1.8502331002331003, + "grad_norm": 0.3479669392108917, + "learning_rate": 0.000578137609329446, + "loss": 3.8647, + "step": 6350 + }, + { + "epoch": 1.8648018648018647, + "grad_norm": 0.3184560239315033, + "learning_rate": 0.0005779626822157434, + "loss": 3.847, + "step": 6400 + }, + { + "epoch": 1.8793706293706294, + "grad_norm": 0.3197358548641205, + "learning_rate": 0.0005777877551020408, + "loss": 3.8617, + "step": 6450 + }, + { + "epoch": 1.893939393939394, + "grad_norm": 0.2957116663455963, + "learning_rate": 0.0005776128279883381, + "loss": 3.854, + "step": 6500 + }, + { + "epoch": 1.9085081585081585, + "grad_norm": 0.3220060169696808, + "learning_rate": 0.0005774379008746355, + "loss": 3.851, + "step": 6550 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 0.3108726441860199, + "learning_rate": 0.0005772629737609329, + "loss": 3.8559, + "step": 6600 + }, + { + "epoch": 1.9376456876456878, + "grad_norm": 0.33560416102409363, + "learning_rate": 0.0005770880466472303, + "loss": 3.8505, + "step": 6650 + }, + { + "epoch": 1.9522144522144522, + "grad_norm": 0.33253157138824463, + "learning_rate": 0.0005769131195335277, + "loss": 3.8468, + "step": 6700 + }, + { + "epoch": 1.9667832167832167, + "grad_norm": 0.3143483102321625, + "learning_rate": 0.000576738192419825, + "loss": 3.8416, + "step": 6750 + }, + { + "epoch": 1.9813519813519813, + "grad_norm": 0.32564249634742737, + "learning_rate": 0.0005765632653061224, + "loss": 3.843, + "step": 6800 + }, + { + "epoch": 1.995920745920746, + "grad_norm": 0.33519217371940613, + "learning_rate": 0.0005763883381924198, + "loss": 3.8431, + "step": 6850 + }, + { + "epoch": 2.0104895104895104, + "grad_norm": 0.32294219732284546, + "learning_rate": 0.0005762134110787171, + "loss": 3.7722, + "step": 6900 + }, + { + "epoch": 2.025058275058275, + "grad_norm": 0.3262682557106018, + "learning_rate": 0.0005760384839650145, + "loss": 3.7428, + "step": 6950 + }, + { + "epoch": 2.0396270396270397, + "grad_norm": 0.3397265374660492, + "learning_rate": 0.0005758635568513119, + "loss": 3.7487, + "step": 7000 + }, + { + "epoch": 2.0396270396270397, + "eval_accuracy": 0.3415744146738347, + "eval_loss": 3.8127987384796143, + "eval_runtime": 180.1348, + "eval_samples_per_second": 92.386, + "eval_steps_per_second": 5.779, + "step": 7000 + }, + { + "epoch": 2.054195804195804, + "grad_norm": 0.3330610990524292, + "learning_rate": 0.0005756886297376093, + "loss": 3.7405, + "step": 7050 + }, + { + "epoch": 2.0687645687645686, + "grad_norm": 0.3221195638179779, + "learning_rate": 0.0005755137026239067, + "loss": 3.7561, + "step": 7100 + }, + { + "epoch": 2.0833333333333335, + "grad_norm": 0.32453685998916626, + "learning_rate": 0.000575338775510204, + "loss": 3.7532, + "step": 7150 + }, + { + "epoch": 2.097902097902098, + "grad_norm": 0.3615976870059967, + "learning_rate": 0.0005751638483965014, + "loss": 3.7618, + "step": 7200 + }, + { + "epoch": 2.1124708624708624, + "grad_norm": 0.323742538690567, + "learning_rate": 0.0005749889212827988, + "loss": 3.7508, + "step": 7250 + }, + { + "epoch": 2.1270396270396272, + "grad_norm": 0.3381347954273224, + "learning_rate": 0.0005748139941690962, + "loss": 3.7588, + "step": 7300 + }, + { + "epoch": 2.1416083916083917, + "grad_norm": 0.3426363468170166, + "learning_rate": 0.0005746390670553935, + "loss": 3.7579, + "step": 7350 + }, + { + "epoch": 2.156177156177156, + "grad_norm": 0.31964731216430664, + "learning_rate": 0.000574464139941691, + "loss": 3.7528, + "step": 7400 + }, + { + "epoch": 2.1707459207459205, + "grad_norm": 0.3354383111000061, + "learning_rate": 0.0005742892128279883, + "loss": 3.7556, + "step": 7450 + }, + { + "epoch": 2.1853146853146854, + "grad_norm": 0.3251858353614807, + "learning_rate": 0.0005741142857142857, + "loss": 3.7556, + "step": 7500 + }, + { + "epoch": 2.19988344988345, + "grad_norm": 0.3399089276790619, + "learning_rate": 0.000573939358600583, + "loss": 3.7415, + "step": 7550 + }, + { + "epoch": 2.2144522144522143, + "grad_norm": 0.3444349467754364, + "learning_rate": 0.0005737644314868805, + "loss": 3.7515, + "step": 7600 + }, + { + "epoch": 2.229020979020979, + "grad_norm": 0.31715652346611023, + "learning_rate": 0.0005735895043731778, + "loss": 3.7618, + "step": 7650 + }, + { + "epoch": 2.2435897435897436, + "grad_norm": 0.34369540214538574, + "learning_rate": 0.0005734145772594752, + "loss": 3.7687, + "step": 7700 + }, + { + "epoch": 2.258158508158508, + "grad_norm": 0.3494495153427124, + "learning_rate": 0.0005732396501457726, + "loss": 3.748, + "step": 7750 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 0.31449177861213684, + "learning_rate": 0.0005730647230320698, + "loss": 3.7541, + "step": 7800 + }, + { + "epoch": 2.2872960372960374, + "grad_norm": 0.3397660553455353, + "learning_rate": 0.0005728897959183673, + "loss": 3.7624, + "step": 7850 + }, + { + "epoch": 2.301864801864802, + "grad_norm": 0.34240466356277466, + "learning_rate": 0.0005727148688046647, + "loss": 3.7432, + "step": 7900 + }, + { + "epoch": 2.3164335664335667, + "grad_norm": 0.3217261731624603, + "learning_rate": 0.000572539941690962, + "loss": 3.7499, + "step": 7950 + }, + { + "epoch": 2.331002331002331, + "grad_norm": 0.3246598243713379, + "learning_rate": 0.0005723650145772595, + "loss": 3.7619, + "step": 8000 + }, + { + "epoch": 2.331002331002331, + "eval_accuracy": 0.34476242059382917, + "eval_loss": 3.7828927040100098, + "eval_runtime": 179.9962, + "eval_samples_per_second": 92.458, + "eval_steps_per_second": 5.783, + "step": 8000 + }, + { + "epoch": 2.3455710955710956, + "grad_norm": 0.3367806673049927, + "learning_rate": 0.0005721900874635568, + "loss": 3.7485, + "step": 8050 + }, + { + "epoch": 2.36013986013986, + "grad_norm": 0.3171541392803192, + "learning_rate": 0.0005720151603498542, + "loss": 3.7546, + "step": 8100 + }, + { + "epoch": 2.374708624708625, + "grad_norm": 0.33225518465042114, + "learning_rate": 0.0005718402332361515, + "loss": 3.7429, + "step": 8150 + }, + { + "epoch": 2.3892773892773893, + "grad_norm": 0.3193056881427765, + "learning_rate": 0.000571665306122449, + "loss": 3.7622, + "step": 8200 + }, + { + "epoch": 2.4038461538461537, + "grad_norm": 0.3187880218029022, + "learning_rate": 0.0005714903790087463, + "loss": 3.7435, + "step": 8250 + }, + { + "epoch": 2.4184149184149186, + "grad_norm": 0.33991068601608276, + "learning_rate": 0.0005713154518950437, + "loss": 3.7494, + "step": 8300 + }, + { + "epoch": 2.432983682983683, + "grad_norm": 0.3092400133609772, + "learning_rate": 0.000571140524781341, + "loss": 3.7612, + "step": 8350 + }, + { + "epoch": 2.4475524475524475, + "grad_norm": 0.31092721223831177, + "learning_rate": 0.0005709655976676385, + "loss": 3.7488, + "step": 8400 + }, + { + "epoch": 2.462121212121212, + "grad_norm": 0.32930874824523926, + "learning_rate": 0.0005707906705539358, + "loss": 3.758, + "step": 8450 + }, + { + "epoch": 2.476689976689977, + "grad_norm": 0.32361528277397156, + "learning_rate": 0.0005706157434402332, + "loss": 3.7454, + "step": 8500 + }, + { + "epoch": 2.4912587412587412, + "grad_norm": 0.33115440607070923, + "learning_rate": 0.0005704408163265305, + "loss": 3.7402, + "step": 8550 + }, + { + "epoch": 2.5058275058275057, + "grad_norm": 0.328485369682312, + "learning_rate": 0.000570265889212828, + "loss": 3.7372, + "step": 8600 + }, + { + "epoch": 2.5203962703962706, + "grad_norm": 0.35709500312805176, + "learning_rate": 0.0005700909620991253, + "loss": 3.7461, + "step": 8650 + }, + { + "epoch": 2.534965034965035, + "grad_norm": 0.32163530588150024, + "learning_rate": 0.0005699160349854227, + "loss": 3.7541, + "step": 8700 + }, + { + "epoch": 2.5495337995337994, + "grad_norm": 0.31789329648017883, + "learning_rate": 0.00056974110787172, + "loss": 3.7438, + "step": 8750 + }, + { + "epoch": 2.564102564102564, + "grad_norm": 0.3170648515224457, + "learning_rate": 0.0005695661807580175, + "loss": 3.7557, + "step": 8800 + }, + { + "epoch": 2.5786713286713288, + "grad_norm": 0.3424239158630371, + "learning_rate": 0.0005693912536443148, + "loss": 3.7398, + "step": 8850 + }, + { + "epoch": 2.593240093240093, + "grad_norm": 0.318135529756546, + "learning_rate": 0.0005692163265306122, + "loss": 3.7284, + "step": 8900 + }, + { + "epoch": 2.607808857808858, + "grad_norm": 0.33802515268325806, + "learning_rate": 0.0005690413994169095, + "loss": 3.738, + "step": 8950 + }, + { + "epoch": 2.6223776223776225, + "grad_norm": 0.32018738985061646, + "learning_rate": 0.000568866472303207, + "loss": 3.74, + "step": 9000 + }, + { + "epoch": 2.6223776223776225, + "eval_accuracy": 0.3477287677347602, + "eval_loss": 3.751537799835205, + "eval_runtime": 180.2979, + "eval_samples_per_second": 92.303, + "eval_steps_per_second": 5.774, + "step": 9000 + }, + { + "epoch": 2.636946386946387, + "grad_norm": 0.3212384283542633, + "learning_rate": 0.0005686915451895044, + "loss": 3.7381, + "step": 9050 + }, + { + "epoch": 2.6515151515151514, + "grad_norm": 0.3253323435783386, + "learning_rate": 0.0005685166180758016, + "loss": 3.739, + "step": 9100 + }, + { + "epoch": 2.666083916083916, + "grad_norm": 0.3387431502342224, + "learning_rate": 0.000568341690962099, + "loss": 3.7248, + "step": 9150 + }, + { + "epoch": 2.6806526806526807, + "grad_norm": 0.32496801018714905, + "learning_rate": 0.0005681667638483965, + "loss": 3.7298, + "step": 9200 + }, + { + "epoch": 2.695221445221445, + "grad_norm": 0.32816433906555176, + "learning_rate": 0.0005679918367346938, + "loss": 3.7296, + "step": 9250 + }, + { + "epoch": 2.70979020979021, + "grad_norm": 0.3408059775829315, + "learning_rate": 0.0005678169096209912, + "loss": 3.7364, + "step": 9300 + }, + { + "epoch": 2.7243589743589745, + "grad_norm": 0.33964434266090393, + "learning_rate": 0.0005676419825072885, + "loss": 3.7332, + "step": 9350 + }, + { + "epoch": 2.738927738927739, + "grad_norm": 0.31630218029022217, + "learning_rate": 0.000567467055393586, + "loss": 3.7283, + "step": 9400 + }, + { + "epoch": 2.7534965034965033, + "grad_norm": 0.34303176403045654, + "learning_rate": 0.0005672921282798833, + "loss": 3.7337, + "step": 9450 + }, + { + "epoch": 2.768065268065268, + "grad_norm": 0.30772241950035095, + "learning_rate": 0.0005671172011661807, + "loss": 3.7223, + "step": 9500 + }, + { + "epoch": 2.7826340326340326, + "grad_norm": 0.3346325755119324, + "learning_rate": 0.000566942274052478, + "loss": 3.7366, + "step": 9550 + }, + { + "epoch": 2.797202797202797, + "grad_norm": 0.321429580450058, + "learning_rate": 0.0005667673469387755, + "loss": 3.7289, + "step": 9600 + }, + { + "epoch": 2.811771561771562, + "grad_norm": 0.3273778259754181, + "learning_rate": 0.0005665924198250728, + "loss": 3.7253, + "step": 9650 + }, + { + "epoch": 2.8263403263403264, + "grad_norm": 0.33299872279167175, + "learning_rate": 0.0005664174927113702, + "loss": 3.7264, + "step": 9700 + }, + { + "epoch": 2.840909090909091, + "grad_norm": 0.31705546379089355, + "learning_rate": 0.0005662425655976676, + "loss": 3.7263, + "step": 9750 + }, + { + "epoch": 2.8554778554778553, + "grad_norm": 0.34314480423927307, + "learning_rate": 0.000566067638483965, + "loss": 3.7151, + "step": 9800 + }, + { + "epoch": 2.87004662004662, + "grad_norm": 0.32017573714256287, + "learning_rate": 0.0005658927113702623, + "loss": 3.7329, + "step": 9850 + }, + { + "epoch": 2.8846153846153846, + "grad_norm": 0.31930816173553467, + "learning_rate": 0.0005657177842565597, + "loss": 3.7235, + "step": 9900 + }, + { + "epoch": 2.8991841491841495, + "grad_norm": 0.31949570775032043, + "learning_rate": 0.0005655428571428572, + "loss": 3.7227, + "step": 9950 + }, + { + "epoch": 2.913752913752914, + "grad_norm": 0.30999991297721863, + "learning_rate": 0.0005653679300291545, + "loss": 3.7152, + "step": 10000 + }, + { + "epoch": 2.913752913752914, + "eval_accuracy": 0.34985880864932545, + "eval_loss": 3.7270307540893555, + "eval_runtime": 180.2671, + "eval_samples_per_second": 92.319, + "eval_steps_per_second": 5.775, + "step": 10000 + }, + { + "epoch": 2.9283216783216783, + "grad_norm": 0.3184822201728821, + "learning_rate": 0.0005651930029154518, + "loss": 3.7289, + "step": 10050 + }, + { + "epoch": 2.9428904428904428, + "grad_norm": 0.31392183899879456, + "learning_rate": 0.0005650180758017492, + "loss": 3.7275, + "step": 10100 + }, + { + "epoch": 2.957459207459207, + "grad_norm": 0.3100379407405853, + "learning_rate": 0.0005648431486880466, + "loss": 3.7078, + "step": 10150 + }, + { + "epoch": 2.972027972027972, + "grad_norm": 0.3107777237892151, + "learning_rate": 0.000564668221574344, + "loss": 3.7191, + "step": 10200 + }, + { + "epoch": 2.9865967365967365, + "grad_norm": 0.31457746028900146, + "learning_rate": 0.0005644932944606413, + "loss": 3.7216, + "step": 10250 + }, + { + "epoch": 3.001165501165501, + "grad_norm": 0.3300207555294037, + "learning_rate": 0.0005643183673469387, + "loss": 3.7241, + "step": 10300 + }, + { + "epoch": 3.015734265734266, + "grad_norm": 0.33615049719810486, + "learning_rate": 0.0005641434402332362, + "loss": 3.6097, + "step": 10350 + }, + { + "epoch": 3.0303030303030303, + "grad_norm": 0.32839542627334595, + "learning_rate": 0.0005639685131195335, + "loss": 3.6192, + "step": 10400 + }, + { + "epoch": 3.0448717948717947, + "grad_norm": 0.32775548100471497, + "learning_rate": 0.0005637935860058308, + "loss": 3.6201, + "step": 10450 + }, + { + "epoch": 3.0594405594405596, + "grad_norm": 0.3305208086967468, + "learning_rate": 0.0005636186588921282, + "loss": 3.6244, + "step": 10500 + }, + { + "epoch": 3.074009324009324, + "grad_norm": 0.3248291015625, + "learning_rate": 0.0005634437317784256, + "loss": 3.6289, + "step": 10550 + }, + { + "epoch": 3.0885780885780885, + "grad_norm": 0.334089070558548, + "learning_rate": 0.000563268804664723, + "loss": 3.6128, + "step": 10600 + }, + { + "epoch": 3.1031468531468533, + "grad_norm": 0.33667150139808655, + "learning_rate": 0.0005630938775510203, + "loss": 3.6316, + "step": 10650 + }, + { + "epoch": 3.117715617715618, + "grad_norm": 0.3139183223247528, + "learning_rate": 0.0005629189504373177, + "loss": 3.6267, + "step": 10700 + }, + { + "epoch": 3.132284382284382, + "grad_norm": 0.3240184187889099, + "learning_rate": 0.0005627440233236151, + "loss": 3.6155, + "step": 10750 + }, + { + "epoch": 3.1468531468531467, + "grad_norm": 0.3177716135978699, + "learning_rate": 0.0005625690962099125, + "loss": 3.6157, + "step": 10800 + }, + { + "epoch": 3.1614219114219115, + "grad_norm": 0.32491302490234375, + "learning_rate": 0.0005623941690962099, + "loss": 3.6529, + "step": 10850 + }, + { + "epoch": 3.175990675990676, + "grad_norm": 0.3269357681274414, + "learning_rate": 0.0005622192419825073, + "loss": 3.6252, + "step": 10900 + }, + { + "epoch": 3.1905594405594404, + "grad_norm": 0.33358559012413025, + "learning_rate": 0.0005620443148688046, + "loss": 3.6477, + "step": 10950 + }, + { + "epoch": 3.2051282051282053, + "grad_norm": 0.32112857699394226, + "learning_rate": 0.000561869387755102, + "loss": 3.6367, + "step": 11000 + }, + { + "epoch": 3.2051282051282053, + "eval_accuracy": 0.3516898159961675, + "eval_loss": 3.7140629291534424, + "eval_runtime": 180.3296, + "eval_samples_per_second": 92.287, + "eval_steps_per_second": 5.773, + "step": 11000 + }, + { + "epoch": 3.2196969696969697, + "grad_norm": 0.328512042760849, + "learning_rate": 0.0005616944606413993, + "loss": 3.6396, + "step": 11050 + }, + { + "epoch": 3.234265734265734, + "grad_norm": 0.3449825644493103, + "learning_rate": 0.0005615195335276968, + "loss": 3.6327, + "step": 11100 + }, + { + "epoch": 3.248834498834499, + "grad_norm": 0.32266926765441895, + "learning_rate": 0.0005613446064139941, + "loss": 3.6382, + "step": 11150 + }, + { + "epoch": 3.2634032634032635, + "grad_norm": 0.3263072073459625, + "learning_rate": 0.0005611696793002915, + "loss": 3.6265, + "step": 11200 + }, + { + "epoch": 3.277972027972028, + "grad_norm": 0.32438746094703674, + "learning_rate": 0.0005609947521865889, + "loss": 3.6519, + "step": 11250 + }, + { + "epoch": 3.2925407925407923, + "grad_norm": 0.3556417226791382, + "learning_rate": 0.0005608198250728863, + "loss": 3.6388, + "step": 11300 + }, + { + "epoch": 3.3071095571095572, + "grad_norm": 0.31459367275238037, + "learning_rate": 0.0005606448979591836, + "loss": 3.6413, + "step": 11350 + }, + { + "epoch": 3.3216783216783217, + "grad_norm": 0.3164815902709961, + "learning_rate": 0.000560469970845481, + "loss": 3.6394, + "step": 11400 + }, + { + "epoch": 3.336247086247086, + "grad_norm": 0.3238040804862976, + "learning_rate": 0.0005602950437317783, + "loss": 3.639, + "step": 11450 + }, + { + "epoch": 3.350815850815851, + "grad_norm": 0.31536027789115906, + "learning_rate": 0.0005601201166180758, + "loss": 3.651, + "step": 11500 + }, + { + "epoch": 3.3653846153846154, + "grad_norm": 0.3251273036003113, + "learning_rate": 0.0005599451895043731, + "loss": 3.6398, + "step": 11550 + }, + { + "epoch": 3.37995337995338, + "grad_norm": 0.3183720111846924, + "learning_rate": 0.0005597702623906705, + "loss": 3.6425, + "step": 11600 + }, + { + "epoch": 3.3945221445221447, + "grad_norm": 0.3452969193458557, + "learning_rate": 0.0005595953352769679, + "loss": 3.6396, + "step": 11650 + }, + { + "epoch": 3.409090909090909, + "grad_norm": 0.31187903881073, + "learning_rate": 0.0005594204081632653, + "loss": 3.6399, + "step": 11700 + }, + { + "epoch": 3.4236596736596736, + "grad_norm": 0.3159955143928528, + "learning_rate": 0.0005592454810495627, + "loss": 3.6371, + "step": 11750 + }, + { + "epoch": 3.438228438228438, + "grad_norm": 0.3242449462413788, + "learning_rate": 0.00055907055393586, + "loss": 3.6376, + "step": 11800 + }, + { + "epoch": 3.452797202797203, + "grad_norm": 0.33960285782814026, + "learning_rate": 0.0005588956268221573, + "loss": 3.6397, + "step": 11850 + }, + { + "epoch": 3.4673659673659674, + "grad_norm": 0.34514838457107544, + "learning_rate": 0.0005587206997084548, + "loss": 3.6349, + "step": 11900 + }, + { + "epoch": 3.481934731934732, + "grad_norm": 0.33326658606529236, + "learning_rate": 0.0005585457725947521, + "loss": 3.6432, + "step": 11950 + }, + { + "epoch": 3.4965034965034967, + "grad_norm": 0.3219590187072754, + "learning_rate": 0.0005583708454810495, + "loss": 3.642, + "step": 12000 + }, + { + "epoch": 3.4965034965034967, + "eval_accuracy": 0.35353681570054407, + "eval_loss": 3.697685480117798, + "eval_runtime": 180.3495, + "eval_samples_per_second": 92.276, + "eval_steps_per_second": 5.772, + "step": 12000 + }, + { + "epoch": 3.511072261072261, + "grad_norm": 0.315857470035553, + "learning_rate": 0.0005581959183673468, + "loss": 3.6484, + "step": 12050 + }, + { + "epoch": 3.5256410256410255, + "grad_norm": 0.33714818954467773, + "learning_rate": 0.0005580209912536443, + "loss": 3.6465, + "step": 12100 + }, + { + "epoch": 3.54020979020979, + "grad_norm": 0.3196263909339905, + "learning_rate": 0.0005578460641399417, + "loss": 3.6444, + "step": 12150 + }, + { + "epoch": 3.554778554778555, + "grad_norm": 0.34034839272499084, + "learning_rate": 0.000557671137026239, + "loss": 3.6403, + "step": 12200 + }, + { + "epoch": 3.5693473193473193, + "grad_norm": 0.32852211594581604, + "learning_rate": 0.0005574962099125363, + "loss": 3.6461, + "step": 12250 + }, + { + "epoch": 3.583916083916084, + "grad_norm": 0.3598001003265381, + "learning_rate": 0.0005573212827988338, + "loss": 3.6392, + "step": 12300 + }, + { + "epoch": 3.5984848484848486, + "grad_norm": 0.3342962861061096, + "learning_rate": 0.0005571463556851311, + "loss": 3.6414, + "step": 12350 + }, + { + "epoch": 3.613053613053613, + "grad_norm": 0.316803514957428, + "learning_rate": 0.0005569714285714285, + "loss": 3.6486, + "step": 12400 + }, + { + "epoch": 3.6276223776223775, + "grad_norm": 0.31796908378601074, + "learning_rate": 0.0005567965014577258, + "loss": 3.6369, + "step": 12450 + }, + { + "epoch": 3.642191142191142, + "grad_norm": 0.309007465839386, + "learning_rate": 0.0005566215743440233, + "loss": 3.6429, + "step": 12500 + }, + { + "epoch": 3.656759906759907, + "grad_norm": 0.3321513831615448, + "learning_rate": 0.0005564466472303207, + "loss": 3.6487, + "step": 12550 + }, + { + "epoch": 3.6713286713286712, + "grad_norm": 0.35138118267059326, + "learning_rate": 0.000556271720116618, + "loss": 3.6527, + "step": 12600 + }, + { + "epoch": 3.685897435897436, + "grad_norm": 0.3067615032196045, + "learning_rate": 0.0005560967930029155, + "loss": 3.6444, + "step": 12650 + }, + { + "epoch": 3.7004662004662006, + "grad_norm": 0.33694183826446533, + "learning_rate": 0.0005559218658892128, + "loss": 3.6325, + "step": 12700 + }, + { + "epoch": 3.715034965034965, + "grad_norm": 0.31776705384254456, + "learning_rate": 0.0005557469387755101, + "loss": 3.6527, + "step": 12750 + }, + { + "epoch": 3.7296037296037294, + "grad_norm": 0.3377169668674469, + "learning_rate": 0.0005555720116618075, + "loss": 3.6424, + "step": 12800 + }, + { + "epoch": 3.7441724941724943, + "grad_norm": 0.3101692199707031, + "learning_rate": 0.0005553970845481049, + "loss": 3.6359, + "step": 12850 + }, + { + "epoch": 3.7587412587412588, + "grad_norm": 0.3166581392288208, + "learning_rate": 0.0005552221574344023, + "loss": 3.6416, + "step": 12900 + }, + { + "epoch": 3.773310023310023, + "grad_norm": 0.31438636779785156, + "learning_rate": 0.0005550472303206997, + "loss": 3.6336, + "step": 12950 + }, + { + "epoch": 3.787878787878788, + "grad_norm": 0.3247930705547333, + "learning_rate": 0.000554872303206997, + "loss": 3.6416, + "step": 13000 + }, + { + "epoch": 3.787878787878788, + "eval_accuracy": 0.3546786229921654, + "eval_loss": 3.6786322593688965, + "eval_runtime": 180.419, + "eval_samples_per_second": 92.241, + "eval_steps_per_second": 5.77, + "step": 13000 + }, + { + "epoch": 3.8024475524475525, + "grad_norm": 0.3598824441432953, + "learning_rate": 0.0005546973760932945, + "loss": 3.6428, + "step": 13050 + }, + { + "epoch": 3.817016317016317, + "grad_norm": 0.32811933755874634, + "learning_rate": 0.0005545224489795918, + "loss": 3.6448, + "step": 13100 + }, + { + "epoch": 3.8315850815850814, + "grad_norm": 0.3222385346889496, + "learning_rate": 0.0005543475218658891, + "loss": 3.6489, + "step": 13150 + }, + { + "epoch": 3.8461538461538463, + "grad_norm": 0.326913058757782, + "learning_rate": 0.0005541725947521865, + "loss": 3.6217, + "step": 13200 + }, + { + "epoch": 3.8607226107226107, + "grad_norm": 0.31770044565200806, + "learning_rate": 0.0005539976676384839, + "loss": 3.6383, + "step": 13250 + }, + { + "epoch": 3.875291375291375, + "grad_norm": 0.3197103440761566, + "learning_rate": 0.0005538227405247813, + "loss": 3.6432, + "step": 13300 + }, + { + "epoch": 3.88986013986014, + "grad_norm": 0.33483409881591797, + "learning_rate": 0.0005536478134110787, + "loss": 3.6325, + "step": 13350 + }, + { + "epoch": 3.9044289044289044, + "grad_norm": 0.3026617765426636, + "learning_rate": 0.000553472886297376, + "loss": 3.6343, + "step": 13400 + }, + { + "epoch": 3.918997668997669, + "grad_norm": 0.2976735532283783, + "learning_rate": 0.0005532979591836735, + "loss": 3.6483, + "step": 13450 + }, + { + "epoch": 3.9335664335664333, + "grad_norm": 0.3455604612827301, + "learning_rate": 0.0005531230320699708, + "loss": 3.6413, + "step": 13500 + }, + { + "epoch": 3.948135198135198, + "grad_norm": 0.3204672932624817, + "learning_rate": 0.0005529481049562682, + "loss": 3.6384, + "step": 13550 + }, + { + "epoch": 3.9627039627039626, + "grad_norm": 0.340648889541626, + "learning_rate": 0.0005527731778425655, + "loss": 3.6425, + "step": 13600 + }, + { + "epoch": 3.9772727272727275, + "grad_norm": 0.3379724323749542, + "learning_rate": 0.0005525982507288629, + "loss": 3.6327, + "step": 13650 + }, + { + "epoch": 3.991841491841492, + "grad_norm": 0.3036077320575714, + "learning_rate": 0.0005524233236151603, + "loss": 3.6375, + "step": 13700 + }, + { + "epoch": 4.006410256410256, + "grad_norm": 0.34318360686302185, + "learning_rate": 0.0005522483965014576, + "loss": 3.5803, + "step": 13750 + }, + { + "epoch": 4.020979020979021, + "grad_norm": 0.3264276087284088, + "learning_rate": 0.000552073469387755, + "loss": 3.5362, + "step": 13800 + }, + { + "epoch": 4.035547785547785, + "grad_norm": 0.3238934278488159, + "learning_rate": 0.0005518985422740525, + "loss": 3.5332, + "step": 13850 + }, + { + "epoch": 4.05011655011655, + "grad_norm": 0.32926997542381287, + "learning_rate": 0.0005517236151603498, + "loss": 3.5372, + "step": 13900 + }, + { + "epoch": 4.064685314685315, + "grad_norm": 0.32314813137054443, + "learning_rate": 0.0005515486880466472, + "loss": 3.5272, + "step": 13950 + }, + { + "epoch": 4.0792540792540795, + "grad_norm": 0.3332814872264862, + "learning_rate": 0.0005513737609329446, + "loss": 3.5382, + "step": 14000 + }, + { + "epoch": 4.0792540792540795, + "eval_accuracy": 0.3562127134068402, + "eval_loss": 3.6715903282165527, + "eval_runtime": 180.15, + "eval_samples_per_second": 92.379, + "eval_steps_per_second": 5.779, + "step": 14000 + }, + { + "epoch": 4.093822843822844, + "grad_norm": 0.33132901787757874, + "learning_rate": 0.0005511988338192419, + "loss": 3.549, + "step": 14050 + }, + { + "epoch": 4.108391608391608, + "grad_norm": 0.32595717906951904, + "learning_rate": 0.0005510239067055393, + "loss": 3.5445, + "step": 14100 + }, + { + "epoch": 4.122960372960373, + "grad_norm": 0.3297913670539856, + "learning_rate": 0.0005508489795918366, + "loss": 3.5392, + "step": 14150 + }, + { + "epoch": 4.137529137529137, + "grad_norm": 0.35622304677963257, + "learning_rate": 0.0005506740524781341, + "loss": 3.5387, + "step": 14200 + }, + { + "epoch": 4.1520979020979025, + "grad_norm": 0.32156145572662354, + "learning_rate": 0.0005504991253644315, + "loss": 3.5461, + "step": 14250 + }, + { + "epoch": 4.166666666666667, + "grad_norm": Infinity, + "learning_rate": 0.0005503241982507288, + "loss": 3.5555, + "step": 14300 + }, + { + "epoch": 4.181235431235431, + "grad_norm": 0.32054704427719116, + "learning_rate": 0.0005501492711370262, + "loss": 3.5634, + "step": 14350 + }, + { + "epoch": 4.195804195804196, + "grad_norm": 0.3304331302642822, + "learning_rate": 0.0005499743440233236, + "loss": 3.557, + "step": 14400 + }, + { + "epoch": 4.21037296037296, + "grad_norm": 0.33280083537101746, + "learning_rate": 0.000549799416909621, + "loss": 3.5636, + "step": 14450 + }, + { + "epoch": 4.224941724941725, + "grad_norm": 0.3097744584083557, + "learning_rate": 0.0005496244897959183, + "loss": 3.5591, + "step": 14500 + }, + { + "epoch": 4.239510489510489, + "grad_norm": 0.3197658658027649, + "learning_rate": 0.0005494495626822156, + "loss": 3.5661, + "step": 14550 + }, + { + "epoch": 4.2540792540792545, + "grad_norm": 0.3759899437427521, + "learning_rate": 0.0005492746355685131, + "loss": 3.5621, + "step": 14600 + }, + { + "epoch": 4.268648018648019, + "grad_norm": 0.34865570068359375, + "learning_rate": 0.0005490997084548105, + "loss": 3.5642, + "step": 14650 + }, + { + "epoch": 4.283216783216783, + "grad_norm": 0.3441263735294342, + "learning_rate": 0.0005489247813411078, + "loss": 3.5676, + "step": 14700 + }, + { + "epoch": 4.297785547785548, + "grad_norm": 0.33596622943878174, + "learning_rate": 0.0005487498542274052, + "loss": 3.5693, + "step": 14750 + }, + { + "epoch": 4.312354312354312, + "grad_norm": 0.3372125029563904, + "learning_rate": 0.0005485749271137026, + "loss": 3.5674, + "step": 14800 + }, + { + "epoch": 4.326923076923077, + "grad_norm": 0.3590675890445709, + "learning_rate": 0.0005484, + "loss": 3.5677, + "step": 14850 + }, + { + "epoch": 4.341491841491841, + "grad_norm": 0.3344537615776062, + "learning_rate": 0.0005482250728862973, + "loss": 3.5582, + "step": 14900 + }, + { + "epoch": 4.356060606060606, + "grad_norm": 0.3320492208003998, + "learning_rate": 0.0005480501457725946, + "loss": 3.5648, + "step": 14950 + }, + { + "epoch": 4.370629370629371, + "grad_norm": 0.336557537317276, + "learning_rate": 0.0005478752186588921, + "loss": 3.5647, + "step": 15000 + }, + { + "epoch": 4.370629370629371, + "eval_accuracy": 0.3576591986276676, + "eval_loss": 3.6587440967559814, + "eval_runtime": 180.1159, + "eval_samples_per_second": 92.396, + "eval_steps_per_second": 5.78, + "step": 15000 + }, + { + "epoch": 4.385198135198135, + "grad_norm": 0.3224494159221649, + "learning_rate": 0.0005477002915451894, + "loss": 3.5672, + "step": 15050 + }, + { + "epoch": 4.3997668997669, + "grad_norm": 0.3005123734474182, + "learning_rate": 0.0005475253644314868, + "loss": 3.567, + "step": 15100 + }, + { + "epoch": 4.414335664335664, + "grad_norm": 0.31343790888786316, + "learning_rate": 0.0005473504373177842, + "loss": 3.5575, + "step": 15150 + }, + { + "epoch": 4.428904428904429, + "grad_norm": 0.3172782361507416, + "learning_rate": 0.0005471755102040816, + "loss": 3.5665, + "step": 15200 + }, + { + "epoch": 4.443473193473194, + "grad_norm": 0.32356658577919006, + "learning_rate": 0.000547000583090379, + "loss": 3.5787, + "step": 15250 + }, + { + "epoch": 4.458041958041958, + "grad_norm": 0.35370585322380066, + "learning_rate": 0.0005468256559766763, + "loss": 3.5677, + "step": 15300 + }, + { + "epoch": 4.472610722610723, + "grad_norm": 0.32113948464393616, + "learning_rate": 0.0005466507288629738, + "loss": 3.5588, + "step": 15350 + }, + { + "epoch": 4.487179487179487, + "grad_norm": 0.3236648738384247, + "learning_rate": 0.0005464758017492711, + "loss": 3.5723, + "step": 15400 + }, + { + "epoch": 4.501748251748252, + "grad_norm": 0.32024386525154114, + "learning_rate": 0.0005463008746355684, + "loss": 3.582, + "step": 15450 + }, + { + "epoch": 4.516317016317016, + "grad_norm": 0.34335383772850037, + "learning_rate": 0.0005461259475218658, + "loss": 3.5728, + "step": 15500 + }, + { + "epoch": 4.5308857808857805, + "grad_norm": 0.3075568377971649, + "learning_rate": 0.0005459510204081633, + "loss": 3.5652, + "step": 15550 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 0.3292197585105896, + "learning_rate": 0.0005457760932944606, + "loss": 3.565, + "step": 15600 + }, + { + "epoch": 4.56002331002331, + "grad_norm": 0.35107719898223877, + "learning_rate": 0.000545601166180758, + "loss": 3.5702, + "step": 15650 + }, + { + "epoch": 4.574592074592075, + "grad_norm": 0.3471798598766327, + "learning_rate": 0.0005454262390670553, + "loss": 3.5681, + "step": 15700 + }, + { + "epoch": 4.589160839160839, + "grad_norm": 0.31821051239967346, + "learning_rate": 0.0005452513119533528, + "loss": 3.582, + "step": 15750 + }, + { + "epoch": 4.603729603729604, + "grad_norm": 0.3309209644794464, + "learning_rate": 0.0005450763848396501, + "loss": 3.5883, + "step": 15800 + }, + { + "epoch": 4.618298368298368, + "grad_norm": 0.33727866411209106, + "learning_rate": 0.0005449014577259474, + "loss": 3.5817, + "step": 15850 + }, + { + "epoch": 4.632867132867133, + "grad_norm": 0.3144679069519043, + "learning_rate": 0.0005447265306122448, + "loss": 3.5724, + "step": 15900 + }, + { + "epoch": 4.647435897435898, + "grad_norm": 0.32342618703842163, + "learning_rate": 0.0005445516034985423, + "loss": 3.5855, + "step": 15950 + }, + { + "epoch": 4.662004662004662, + "grad_norm": 0.3141750395298004, + "learning_rate": 0.0005443766763848396, + "loss": 3.5807, + "step": 16000 + }, + { + "epoch": 4.662004662004662, + "eval_accuracy": 0.3584899780834147, + "eval_loss": 3.6451687812805176, + "eval_runtime": 180.0666, + "eval_samples_per_second": 92.421, + "eval_steps_per_second": 5.781, + "step": 16000 + }, + { + "epoch": 4.676573426573427, + "grad_norm": 0.318861186504364, + "learning_rate": 0.000544201749271137, + "loss": 3.5705, + "step": 16050 + }, + { + "epoch": 4.691142191142191, + "grad_norm": 0.31984490156173706, + "learning_rate": 0.0005440268221574343, + "loss": 3.5858, + "step": 16100 + }, + { + "epoch": 4.7057109557109555, + "grad_norm": 0.3313526511192322, + "learning_rate": 0.0005438518950437318, + "loss": 3.5778, + "step": 16150 + }, + { + "epoch": 4.72027972027972, + "grad_norm": 0.332089900970459, + "learning_rate": 0.0005436769679300291, + "loss": 3.5776, + "step": 16200 + }, + { + "epoch": 4.734848484848484, + "grad_norm": 0.33302974700927734, + "learning_rate": 0.0005435020408163265, + "loss": 3.5832, + "step": 16250 + }, + { + "epoch": 4.74941724941725, + "grad_norm": 0.3242354691028595, + "learning_rate": 0.0005433271137026238, + "loss": 3.5848, + "step": 16300 + }, + { + "epoch": 4.763986013986014, + "grad_norm": 0.3078085482120514, + "learning_rate": 0.0005431521865889212, + "loss": 3.5824, + "step": 16350 + }, + { + "epoch": 4.778554778554779, + "grad_norm": 0.3317912220954895, + "learning_rate": 0.0005429772594752186, + "loss": 3.5782, + "step": 16400 + }, + { + "epoch": 4.793123543123543, + "grad_norm": 0.30730515718460083, + "learning_rate": 0.000542802332361516, + "loss": 3.5779, + "step": 16450 + }, + { + "epoch": 4.8076923076923075, + "grad_norm": 0.35136038064956665, + "learning_rate": 0.0005426274052478133, + "loss": 3.583, + "step": 16500 + }, + { + "epoch": 4.822261072261072, + "grad_norm": 0.3428604304790497, + "learning_rate": 0.0005424524781341108, + "loss": 3.578, + "step": 16550 + }, + { + "epoch": 4.836829836829837, + "grad_norm": 0.3045051693916321, + "learning_rate": 0.0005422775510204081, + "loss": 3.5811, + "step": 16600 + }, + { + "epoch": 4.851398601398602, + "grad_norm": 0.3164063096046448, + "learning_rate": 0.0005421026239067055, + "loss": 3.5821, + "step": 16650 + }, + { + "epoch": 4.865967365967366, + "grad_norm": 0.33561450242996216, + "learning_rate": 0.0005419276967930028, + "loss": 3.5749, + "step": 16700 + }, + { + "epoch": 4.880536130536131, + "grad_norm": 0.3375592529773712, + "learning_rate": 0.0005417527696793002, + "loss": 3.5713, + "step": 16750 + }, + { + "epoch": 4.895104895104895, + "grad_norm": 0.3262588083744049, + "learning_rate": 0.0005415778425655976, + "loss": 3.5773, + "step": 16800 + }, + { + "epoch": 4.909673659673659, + "grad_norm": 0.33031025528907776, + "learning_rate": 0.000541402915451895, + "loss": 3.5719, + "step": 16850 + }, + { + "epoch": 4.924242424242424, + "grad_norm": 0.32215115427970886, + "learning_rate": 0.0005412279883381923, + "loss": 3.5679, + "step": 16900 + }, + { + "epoch": 4.938811188811189, + "grad_norm": 0.3194146156311035, + "learning_rate": 0.0005410530612244898, + "loss": 3.5837, + "step": 16950 + }, + { + "epoch": 4.953379953379954, + "grad_norm": 0.3187941312789917, + "learning_rate": 0.0005408781341107871, + "loss": 3.5693, + "step": 17000 + }, + { + "epoch": 4.953379953379954, + "eval_accuracy": 0.35988401777879797, + "eval_loss": 3.632936477661133, + "eval_runtime": 180.0341, + "eval_samples_per_second": 92.438, + "eval_steps_per_second": 5.782, + "step": 17000 + }, + { + "epoch": 4.967948717948718, + "grad_norm": 0.32214635610580444, + "learning_rate": 0.0005407032069970845, + "loss": 3.5817, + "step": 17050 + }, + { + "epoch": 4.9825174825174825, + "grad_norm": 0.3381812870502472, + "learning_rate": 0.0005405282798833819, + "loss": 3.5721, + "step": 17100 + }, + { + "epoch": 4.997086247086247, + "grad_norm": 0.328273206949234, + "learning_rate": 0.0005403533527696793, + "loss": 3.5866, + "step": 17150 + }, + { + "epoch": 5.011655011655011, + "grad_norm": 0.32486042380332947, + "learning_rate": 0.0005401784256559766, + "loss": 3.4864, + "step": 17200 + }, + { + "epoch": 5.026223776223776, + "grad_norm": 0.3191656172275543, + "learning_rate": 0.000540003498542274, + "loss": 3.4736, + "step": 17250 + }, + { + "epoch": 5.040792540792541, + "grad_norm": 0.3504127264022827, + "learning_rate": 0.0005398285714285714, + "loss": 3.469, + "step": 17300 + }, + { + "epoch": 5.055361305361306, + "grad_norm": 0.3454863727092743, + "learning_rate": 0.0005396536443148688, + "loss": 3.4665, + "step": 17350 + }, + { + "epoch": 5.06993006993007, + "grad_norm": 0.30901169776916504, + "learning_rate": 0.0005394787172011661, + "loss": 3.4741, + "step": 17400 + }, + { + "epoch": 5.084498834498834, + "grad_norm": 0.33311742544174194, + "learning_rate": 0.0005393037900874635, + "loss": 3.4876, + "step": 17450 + }, + { + "epoch": 5.099067599067599, + "grad_norm": 0.33518463373184204, + "learning_rate": 0.0005391288629737609, + "loss": 3.4755, + "step": 17500 + }, + { + "epoch": 5.113636363636363, + "grad_norm": 0.33938467502593994, + "learning_rate": 0.0005389539358600583, + "loss": 3.489, + "step": 17550 + }, + { + "epoch": 5.128205128205128, + "grad_norm": 0.3346013128757477, + "learning_rate": 0.0005387790087463557, + "loss": 3.4899, + "step": 17600 + }, + { + "epoch": 5.142773892773893, + "grad_norm": 0.3396677076816559, + "learning_rate": 0.0005386040816326529, + "loss": 3.4791, + "step": 17650 + }, + { + "epoch": 5.1573426573426575, + "grad_norm": 0.32493624091148376, + "learning_rate": 0.0005384291545189504, + "loss": 3.4999, + "step": 17700 + }, + { + "epoch": 5.171911421911422, + "grad_norm": 0.34523579478263855, + "learning_rate": 0.0005382542274052478, + "loss": 3.4942, + "step": 17750 + }, + { + "epoch": 5.186480186480186, + "grad_norm": 0.34241601824760437, + "learning_rate": 0.0005380793002915451, + "loss": 3.4986, + "step": 17800 + }, + { + "epoch": 5.201048951048951, + "grad_norm": 0.3449043035507202, + "learning_rate": 0.0005379043731778425, + "loss": 3.5096, + "step": 17850 + }, + { + "epoch": 5.215617715617715, + "grad_norm": 0.33027029037475586, + "learning_rate": 0.0005377294460641399, + "loss": 3.5021, + "step": 17900 + }, + { + "epoch": 5.230186480186481, + "grad_norm": 0.33586353063583374, + "learning_rate": 0.0005375545189504373, + "loss": 3.4964, + "step": 17950 + }, + { + "epoch": 5.244755244755245, + "grad_norm": 0.3348841071128845, + "learning_rate": 0.0005373795918367346, + "loss": 3.5152, + "step": 18000 + }, + { + "epoch": 5.244755244755245, + "eval_accuracy": 0.3607333765910926, + "eval_loss": 3.6339569091796875, + "eval_runtime": 180.5183, + "eval_samples_per_second": 92.19, + "eval_steps_per_second": 5.767, + "step": 18000 + }, + { + "epoch": 5.2593240093240095, + "grad_norm": 0.33033329248428345, + "learning_rate": 0.000537204664723032, + "loss": 3.4922, + "step": 18050 + }, + { + "epoch": 5.273892773892774, + "grad_norm": 0.32480764389038086, + "learning_rate": 0.0005370297376093294, + "loss": 3.5049, + "step": 18100 + }, + { + "epoch": 5.288461538461538, + "grad_norm": 0.3114669919013977, + "learning_rate": 0.0005368548104956268, + "loss": 3.5045, + "step": 18150 + }, + { + "epoch": 5.303030303030303, + "grad_norm": 0.32912948727607727, + "learning_rate": 0.0005366798833819241, + "loss": 3.5039, + "step": 18200 + }, + { + "epoch": 5.317599067599067, + "grad_norm": 0.325888067483902, + "learning_rate": 0.0005365049562682215, + "loss": 3.5107, + "step": 18250 + }, + { + "epoch": 5.3321678321678325, + "grad_norm": 0.3258603811264038, + "learning_rate": 0.0005363300291545189, + "loss": 3.5079, + "step": 18300 + }, + { + "epoch": 5.346736596736597, + "grad_norm": 0.34344643354415894, + "learning_rate": 0.0005361551020408163, + "loss": 3.5056, + "step": 18350 + }, + { + "epoch": 5.361305361305361, + "grad_norm": 0.34246399998664856, + "learning_rate": 0.0005359801749271136, + "loss": 3.5118, + "step": 18400 + }, + { + "epoch": 5.375874125874126, + "grad_norm": 0.35261663794517517, + "learning_rate": 0.000535805247813411, + "loss": 3.5154, + "step": 18450 + }, + { + "epoch": 5.39044289044289, + "grad_norm": 0.33429020643234253, + "learning_rate": 0.0005356303206997085, + "loss": 3.515, + "step": 18500 + }, + { + "epoch": 5.405011655011655, + "grad_norm": 0.3388688266277313, + "learning_rate": 0.0005354553935860058, + "loss": 3.5011, + "step": 18550 + }, + { + "epoch": 5.41958041958042, + "grad_norm": 0.31441932916641235, + "learning_rate": 0.0005352804664723031, + "loss": 3.524, + "step": 18600 + }, + { + "epoch": 5.4341491841491845, + "grad_norm": 0.33346623182296753, + "learning_rate": 0.0005351055393586006, + "loss": 3.5096, + "step": 18650 + }, + { + "epoch": 5.448717948717949, + "grad_norm": 0.3645952045917511, + "learning_rate": 0.0005349306122448979, + "loss": 3.5162, + "step": 18700 + }, + { + "epoch": 5.463286713286713, + "grad_norm": 0.3252617120742798, + "learning_rate": 0.0005347556851311953, + "loss": 3.5166, + "step": 18750 + }, + { + "epoch": 5.477855477855478, + "grad_norm": 0.32356569170951843, + "learning_rate": 0.0005345807580174926, + "loss": 3.5259, + "step": 18800 + }, + { + "epoch": 5.492424242424242, + "grad_norm": 0.32452526688575745, + "learning_rate": 0.0005344058309037901, + "loss": 3.5419, + "step": 18850 + }, + { + "epoch": 5.506993006993007, + "grad_norm": 0.3109516501426697, + "learning_rate": 0.0005342309037900875, + "loss": 3.523, + "step": 18900 + }, + { + "epoch": 5.521561771561771, + "grad_norm": 0.32956892251968384, + "learning_rate": 0.0005340559766763848, + "loss": 3.5346, + "step": 18950 + }, + { + "epoch": 5.536130536130536, + "grad_norm": 0.347649484872818, + "learning_rate": 0.0005338810495626821, + "loss": 3.5148, + "step": 19000 + }, + { + "epoch": 5.536130536130536, + "eval_accuracy": 0.3614395097307616, + "eval_loss": 3.624124526977539, + "eval_runtime": 180.5076, + "eval_samples_per_second": 92.196, + "eval_steps_per_second": 5.767, + "step": 19000 + }, + { + "epoch": 5.550699300699301, + "grad_norm": 0.34442394971847534, + "learning_rate": 0.0005337061224489796, + "loss": 3.508, + "step": 19050 + }, + { + "epoch": 5.565268065268065, + "grad_norm": 0.3646959960460663, + "learning_rate": 0.0005335311953352769, + "loss": 3.5272, + "step": 19100 + }, + { + "epoch": 5.57983682983683, + "grad_norm": 0.34306755661964417, + "learning_rate": 0.0005333562682215743, + "loss": 3.5253, + "step": 19150 + }, + { + "epoch": 5.594405594405594, + "grad_norm": 0.34549543261528015, + "learning_rate": 0.0005331813411078716, + "loss": 3.5349, + "step": 19200 + }, + { + "epoch": 5.608974358974359, + "grad_norm": 0.3486803472042084, + "learning_rate": 0.0005330064139941691, + "loss": 3.518, + "step": 19250 + }, + { + "epoch": 5.623543123543124, + "grad_norm": 0.3553147315979004, + "learning_rate": 0.0005328314868804665, + "loss": 3.5229, + "step": 19300 + }, + { + "epoch": 5.638111888111888, + "grad_norm": 0.3389810025691986, + "learning_rate": 0.0005326565597667638, + "loss": 3.5184, + "step": 19350 + }, + { + "epoch": 5.652680652680653, + "grad_norm": 0.3389154076576233, + "learning_rate": 0.0005324816326530612, + "loss": 3.5242, + "step": 19400 + }, + { + "epoch": 5.667249417249417, + "grad_norm": 0.31988218426704407, + "learning_rate": 0.0005323067055393586, + "loss": 3.5365, + "step": 19450 + }, + { + "epoch": 5.681818181818182, + "grad_norm": 0.32239192724227905, + "learning_rate": 0.0005321317784256559, + "loss": 3.5347, + "step": 19500 + }, + { + "epoch": 5.696386946386946, + "grad_norm": 0.3520359694957733, + "learning_rate": 0.0005319568513119533, + "loss": 3.5332, + "step": 19550 + }, + { + "epoch": 5.7109557109557105, + "grad_norm": 0.3352511525154114, + "learning_rate": 0.0005317819241982506, + "loss": 3.534, + "step": 19600 + }, + { + "epoch": 5.725524475524476, + "grad_norm": 0.3281591236591339, + "learning_rate": 0.0005316069970845481, + "loss": 3.5274, + "step": 19650 + }, + { + "epoch": 5.74009324009324, + "grad_norm": 0.33789217472076416, + "learning_rate": 0.0005314320699708454, + "loss": 3.5266, + "step": 19700 + }, + { + "epoch": 5.754662004662005, + "grad_norm": 0.34207120537757874, + "learning_rate": 0.0005312571428571428, + "loss": 3.5315, + "step": 19750 + }, + { + "epoch": 5.769230769230769, + "grad_norm": 0.351068913936615, + "learning_rate": 0.0005310822157434403, + "loss": 3.5341, + "step": 19800 + }, + { + "epoch": 5.783799533799534, + "grad_norm": 0.3352493643760681, + "learning_rate": 0.0005309072886297376, + "loss": 3.53, + "step": 19850 + }, + { + "epoch": 5.798368298368298, + "grad_norm": 0.327741801738739, + "learning_rate": 0.0005307323615160349, + "loss": 3.5304, + "step": 19900 + }, + { + "epoch": 5.812937062937063, + "grad_norm": 0.32836633920669556, + "learning_rate": 0.0005305574344023323, + "loss": 3.5286, + "step": 19950 + }, + { + "epoch": 5.827505827505828, + "grad_norm": 0.3504875600337982, + "learning_rate": 0.0005303825072886296, + "loss": 3.5384, + "step": 20000 + }, + { + "epoch": 5.827505827505828, + "eval_accuracy": 0.3625647367105273, + "eval_loss": 3.612804412841797, + "eval_runtime": 180.4713, + "eval_samples_per_second": 92.214, + "eval_steps_per_second": 5.768, + "step": 20000 + }, + { + "epoch": 5.842074592074592, + "grad_norm": 0.3540632426738739, + "learning_rate": 0.0005302075801749271, + "loss": 3.5366, + "step": 20050 + }, + { + "epoch": 5.856643356643357, + "grad_norm": 0.34035322070121765, + "learning_rate": 0.0005300326530612244, + "loss": 3.5303, + "step": 20100 + }, + { + "epoch": 5.871212121212121, + "grad_norm": 0.31729087233543396, + "learning_rate": 0.0005298577259475218, + "loss": 3.5335, + "step": 20150 + }, + { + "epoch": 5.8857808857808855, + "grad_norm": 0.3735673427581787, + "learning_rate": 0.0005296827988338193, + "loss": 3.5277, + "step": 20200 + }, + { + "epoch": 5.90034965034965, + "grad_norm": 0.314452201128006, + "learning_rate": 0.0005295078717201166, + "loss": 3.5406, + "step": 20250 + }, + { + "epoch": 5.914918414918415, + "grad_norm": 0.3204086422920227, + "learning_rate": 0.000529332944606414, + "loss": 3.5359, + "step": 20300 + }, + { + "epoch": 5.92948717948718, + "grad_norm": 0.3485746681690216, + "learning_rate": 0.0005291580174927113, + "loss": 3.5245, + "step": 20350 + }, + { + "epoch": 5.944055944055944, + "grad_norm": 0.34968072175979614, + "learning_rate": 0.0005289830903790087, + "loss": 3.54, + "step": 20400 + }, + { + "epoch": 5.958624708624709, + "grad_norm": 0.3806632161140442, + "learning_rate": 0.0005288081632653061, + "loss": 3.525, + "step": 20450 + }, + { + "epoch": 5.973193473193473, + "grad_norm": 0.3304056227207184, + "learning_rate": 0.0005286332361516034, + "loss": 3.5232, + "step": 20500 + }, + { + "epoch": 5.9877622377622375, + "grad_norm": 0.33363205194473267, + "learning_rate": 0.0005284583090379008, + "loss": 3.5174, + "step": 20550 + }, + { + "epoch": 6.002331002331002, + "grad_norm": 0.3507980704307556, + "learning_rate": 0.0005282833819241983, + "loss": 3.5095, + "step": 20600 + }, + { + "epoch": 6.016899766899767, + "grad_norm": 0.3389154374599457, + "learning_rate": 0.0005281084548104956, + "loss": 3.4025, + "step": 20650 + }, + { + "epoch": 6.031468531468532, + "grad_norm": 0.33325284719467163, + "learning_rate": 0.000527933527696793, + "loss": 3.4253, + "step": 20700 + }, + { + "epoch": 6.046037296037296, + "grad_norm": 0.34633567929267883, + "learning_rate": 0.0005277586005830903, + "loss": 3.4288, + "step": 20750 + }, + { + "epoch": 6.0606060606060606, + "grad_norm": 0.33911773562431335, + "learning_rate": 0.0005275836734693877, + "loss": 3.4302, + "step": 20800 + }, + { + "epoch": 6.075174825174825, + "grad_norm": 0.3277522027492523, + "learning_rate": 0.0005274087463556851, + "loss": 3.4381, + "step": 20850 + }, + { + "epoch": 6.089743589743589, + "grad_norm": 0.3419731855392456, + "learning_rate": 0.0005272338192419824, + "loss": 3.4431, + "step": 20900 + }, + { + "epoch": 6.104312354312355, + "grad_norm": 0.35028308629989624, + "learning_rate": 0.0005270588921282798, + "loss": 3.4435, + "step": 20950 + }, + { + "epoch": 6.118881118881119, + "grad_norm": 0.3204551339149475, + "learning_rate": 0.0005268839650145772, + "loss": 3.4338, + "step": 21000 + }, + { + "epoch": 6.118881118881119, + "eval_accuracy": 0.3627538228202005, + "eval_loss": 3.615595579147339, + "eval_runtime": 180.6199, + "eval_samples_per_second": 92.138, + "eval_steps_per_second": 5.763, + "step": 21000 + }, + { + "epoch": 6.133449883449884, + "grad_norm": 0.3347219228744507, + "learning_rate": 0.0005267090379008746, + "loss": 3.4486, + "step": 21050 + }, + { + "epoch": 6.148018648018648, + "grad_norm": 0.3284785747528076, + "learning_rate": 0.000526534110787172, + "loss": 3.4548, + "step": 21100 + }, + { + "epoch": 6.1625874125874125, + "grad_norm": 0.33264586329460144, + "learning_rate": 0.0005263591836734693, + "loss": 3.4476, + "step": 21150 + }, + { + "epoch": 6.177156177156177, + "grad_norm": 0.3285725712776184, + "learning_rate": 0.0005261842565597668, + "loss": 3.4675, + "step": 21200 + }, + { + "epoch": 6.191724941724941, + "grad_norm": 0.3390142321586609, + "learning_rate": 0.0005260093294460641, + "loss": 3.455, + "step": 21250 + }, + { + "epoch": 6.206293706293707, + "grad_norm": 0.33934858441352844, + "learning_rate": 0.0005258344023323614, + "loss": 3.4463, + "step": 21300 + }, + { + "epoch": 6.220862470862471, + "grad_norm": 0.3672083914279938, + "learning_rate": 0.0005256594752186588, + "loss": 3.4512, + "step": 21350 + }, + { + "epoch": 6.235431235431236, + "grad_norm": 0.3115769624710083, + "learning_rate": 0.0005254845481049562, + "loss": 3.4634, + "step": 21400 + }, + { + "epoch": 6.25, + "grad_norm": 0.32785558700561523, + "learning_rate": 0.0005253096209912536, + "loss": 3.4688, + "step": 21450 + }, + { + "epoch": 6.264568764568764, + "grad_norm": 0.3327209949493408, + "learning_rate": 0.000525134693877551, + "loss": 3.4517, + "step": 21500 + }, + { + "epoch": 6.279137529137529, + "grad_norm": 0.34631094336509705, + "learning_rate": 0.0005249597667638484, + "loss": 3.4574, + "step": 21550 + }, + { + "epoch": 6.293706293706293, + "grad_norm": 0.3532359004020691, + "learning_rate": 0.0005247848396501458, + "loss": 3.4656, + "step": 21600 + }, + { + "epoch": 6.308275058275059, + "grad_norm": 0.36950933933258057, + "learning_rate": 0.0005246099125364431, + "loss": 3.4769, + "step": 21650 + }, + { + "epoch": 6.322843822843823, + "grad_norm": 0.336834579706192, + "learning_rate": 0.0005244349854227404, + "loss": 3.4637, + "step": 21700 + }, + { + "epoch": 6.3374125874125875, + "grad_norm": 0.30184629559516907, + "learning_rate": 0.0005242600583090379, + "loss": 3.4716, + "step": 21750 + }, + { + "epoch": 6.351981351981352, + "grad_norm": 0.34009432792663574, + "learning_rate": 0.0005240851311953352, + "loss": 3.4698, + "step": 21800 + }, + { + "epoch": 6.366550116550116, + "grad_norm": 0.32678115367889404, + "learning_rate": 0.0005239102040816326, + "loss": 3.4706, + "step": 21850 + }, + { + "epoch": 6.381118881118881, + "grad_norm": 0.34370940923690796, + "learning_rate": 0.00052373527696793, + "loss": 3.4649, + "step": 21900 + }, + { + "epoch": 6.395687645687646, + "grad_norm": 0.31767651438713074, + "learning_rate": 0.0005235603498542274, + "loss": 3.4903, + "step": 21950 + }, + { + "epoch": 6.410256410256411, + "grad_norm": 0.35483428835868835, + "learning_rate": 0.0005233854227405248, + "loss": 3.4762, + "step": 22000 + }, + { + "epoch": 6.410256410256411, + "eval_accuracy": 0.3631498688509091, + "eval_loss": 3.6074860095977783, + "eval_runtime": 180.0487, + "eval_samples_per_second": 92.431, + "eval_steps_per_second": 5.782, + "step": 22000 + }, + { + "epoch": 6.424825174825175, + "grad_norm": 0.31931906938552856, + "learning_rate": 0.0005232104956268221, + "loss": 3.4758, + "step": 22050 + }, + { + "epoch": 6.4393939393939394, + "grad_norm": 0.3227771818637848, + "learning_rate": 0.0005230355685131195, + "loss": 3.4678, + "step": 22100 + }, + { + "epoch": 6.453962703962704, + "grad_norm": 0.35156136751174927, + "learning_rate": 0.0005228606413994169, + "loss": 3.4803, + "step": 22150 + }, + { + "epoch": 6.468531468531468, + "grad_norm": 0.33394086360931396, + "learning_rate": 0.0005226857142857142, + "loss": 3.471, + "step": 22200 + }, + { + "epoch": 6.483100233100233, + "grad_norm": 0.3395681381225586, + "learning_rate": 0.0005225107871720116, + "loss": 3.4759, + "step": 22250 + }, + { + "epoch": 6.497668997668998, + "grad_norm": 0.32322457432746887, + "learning_rate": 0.0005223358600583089, + "loss": 3.48, + "step": 22300 + }, + { + "epoch": 6.5122377622377625, + "grad_norm": 0.32809075713157654, + "learning_rate": 0.0005221609329446064, + "loss": 3.4774, + "step": 22350 + }, + { + "epoch": 6.526806526806527, + "grad_norm": 0.32868528366088867, + "learning_rate": 0.0005219860058309038, + "loss": 3.4811, + "step": 22400 + }, + { + "epoch": 6.541375291375291, + "grad_norm": 0.33489176630973816, + "learning_rate": 0.0005218110787172011, + "loss": 3.4916, + "step": 22450 + }, + { + "epoch": 6.555944055944056, + "grad_norm": 0.3436543941497803, + "learning_rate": 0.0005216361516034985, + "loss": 3.4859, + "step": 22500 + }, + { + "epoch": 6.57051282051282, + "grad_norm": 0.3015133738517761, + "learning_rate": 0.0005214612244897959, + "loss": 3.4779, + "step": 22550 + }, + { + "epoch": 6.585081585081585, + "grad_norm": 0.3797510862350464, + "learning_rate": 0.0005212862973760932, + "loss": 3.4846, + "step": 22600 + }, + { + "epoch": 6.59965034965035, + "grad_norm": 0.327371209859848, + "learning_rate": 0.0005211113702623906, + "loss": 3.4941, + "step": 22650 + }, + { + "epoch": 6.6142191142191145, + "grad_norm": 0.3728986084461212, + "learning_rate": 0.0005209364431486879, + "loss": 3.4986, + "step": 22700 + }, + { + "epoch": 6.628787878787879, + "grad_norm": 0.3234831988811493, + "learning_rate": 0.0005207615160349854, + "loss": 3.4824, + "step": 22750 + }, + { + "epoch": 6.643356643356643, + "grad_norm": 0.3303401470184326, + "learning_rate": 0.0005205865889212828, + "loss": 3.4857, + "step": 22800 + }, + { + "epoch": 6.657925407925408, + "grad_norm": 0.3562447726726532, + "learning_rate": 0.0005204116618075801, + "loss": 3.4825, + "step": 22850 + }, + { + "epoch": 6.672494172494172, + "grad_norm": 0.3363456428050995, + "learning_rate": 0.0005202367346938776, + "loss": 3.4786, + "step": 22900 + }, + { + "epoch": 6.687062937062937, + "grad_norm": 0.337936669588089, + "learning_rate": 0.0005200618075801749, + "loss": 3.4894, + "step": 22950 + }, + { + "epoch": 6.701631701631702, + "grad_norm": 0.34164348244667053, + "learning_rate": 0.0005198868804664723, + "loss": 3.4815, + "step": 23000 + }, + { + "epoch": 6.701631701631702, + "eval_accuracy": 0.3637440554878363, + "eval_loss": 3.6008543968200684, + "eval_runtime": 180.4988, + "eval_samples_per_second": 92.2, + "eval_steps_per_second": 5.767, + "step": 23000 + }, + { + "epoch": 6.716200466200466, + "grad_norm": 0.3702085018157959, + "learning_rate": 0.0005197119533527696, + "loss": 3.4993, + "step": 23050 + }, + { + "epoch": 6.730769230769231, + "grad_norm": 0.33993563055992126, + "learning_rate": 0.000519537026239067, + "loss": 3.4772, + "step": 23100 + }, + { + "epoch": 6.745337995337995, + "grad_norm": 0.33401525020599365, + "learning_rate": 0.0005193620991253644, + "loss": 3.4976, + "step": 23150 + }, + { + "epoch": 6.75990675990676, + "grad_norm": 0.37840354442596436, + "learning_rate": 0.0005191871720116618, + "loss": 3.4828, + "step": 23200 + }, + { + "epoch": 6.774475524475524, + "grad_norm": 0.3243924379348755, + "learning_rate": 0.0005190122448979591, + "loss": 3.4938, + "step": 23250 + }, + { + "epoch": 6.7890442890442895, + "grad_norm": 0.3309505581855774, + "learning_rate": 0.0005188373177842566, + "loss": 3.4723, + "step": 23300 + }, + { + "epoch": 6.803613053613054, + "grad_norm": 0.35153377056121826, + "learning_rate": 0.0005186623906705539, + "loss": 3.4872, + "step": 23350 + }, + { + "epoch": 6.818181818181818, + "grad_norm": 0.3381296396255493, + "learning_rate": 0.0005184874635568513, + "loss": 3.4899, + "step": 23400 + }, + { + "epoch": 6.832750582750583, + "grad_norm": 0.3551500737667084, + "learning_rate": 0.0005183125364431486, + "loss": 3.4895, + "step": 23450 + }, + { + "epoch": 6.847319347319347, + "grad_norm": 0.33850058913230896, + "learning_rate": 0.000518137609329446, + "loss": 3.4793, + "step": 23500 + }, + { + "epoch": 6.861888111888112, + "grad_norm": 0.3279431164264679, + "learning_rate": 0.0005179626822157434, + "loss": 3.4967, + "step": 23550 + }, + { + "epoch": 6.876456876456876, + "grad_norm": 0.3145736753940582, + "learning_rate": 0.0005177877551020407, + "loss": 3.5046, + "step": 23600 + }, + { + "epoch": 6.891025641025641, + "grad_norm": 0.3533722162246704, + "learning_rate": 0.0005176128279883381, + "loss": 3.4892, + "step": 23650 + }, + { + "epoch": 6.905594405594406, + "grad_norm": 0.3434518575668335, + "learning_rate": 0.0005174379008746356, + "loss": 3.4818, + "step": 23700 + }, + { + "epoch": 6.92016317016317, + "grad_norm": 0.30422964692115784, + "learning_rate": 0.0005172629737609329, + "loss": 3.4961, + "step": 23750 + }, + { + "epoch": 6.934731934731935, + "grad_norm": 0.34872138500213623, + "learning_rate": 0.0005170880466472303, + "loss": 3.4941, + "step": 23800 + }, + { + "epoch": 6.949300699300699, + "grad_norm": 0.3359842598438263, + "learning_rate": 0.0005169131195335276, + "loss": 3.4905, + "step": 23850 + }, + { + "epoch": 6.963869463869464, + "grad_norm": 0.3362923264503479, + "learning_rate": 0.0005167381924198251, + "loss": 3.4967, + "step": 23900 + }, + { + "epoch": 6.978438228438229, + "grad_norm": 0.33967387676239014, + "learning_rate": 0.0005165632653061224, + "loss": 3.4997, + "step": 23950 + }, + { + "epoch": 6.993006993006993, + "grad_norm": 0.326475590467453, + "learning_rate": 0.0005163883381924197, + "loss": 3.4942, + "step": 24000 + }, + { + "epoch": 6.993006993006993, + "eval_accuracy": 0.36491220313304396, + "eval_loss": 3.5894508361816406, + "eval_runtime": 180.3515, + "eval_samples_per_second": 92.275, + "eval_steps_per_second": 5.772, + "step": 24000 + }, + { + "epoch": 7.007575757575758, + "grad_norm": 0.35610419511795044, + "learning_rate": 0.0005162134110787171, + "loss": 3.4349, + "step": 24050 + }, + { + "epoch": 7.022144522144522, + "grad_norm": 0.3531475067138672, + "learning_rate": 0.0005160384839650146, + "loss": 3.3823, + "step": 24100 + }, + { + "epoch": 7.036713286713287, + "grad_norm": 0.3476791977882385, + "learning_rate": 0.0005158635568513119, + "loss": 3.4016, + "step": 24150 + }, + { + "epoch": 7.051282051282051, + "grad_norm": 0.35229551792144775, + "learning_rate": 0.0005156886297376093, + "loss": 3.3868, + "step": 24200 + }, + { + "epoch": 7.0658508158508155, + "grad_norm": 0.3391362428665161, + "learning_rate": 0.0005155137026239066, + "loss": 3.3928, + "step": 24250 + }, + { + "epoch": 7.08041958041958, + "grad_norm": 0.34460726380348206, + "learning_rate": 0.0005153387755102041, + "loss": 3.3913, + "step": 24300 + }, + { + "epoch": 7.094988344988345, + "grad_norm": 0.35683906078338623, + "learning_rate": 0.0005151638483965014, + "loss": 3.3972, + "step": 24350 + }, + { + "epoch": 7.10955710955711, + "grad_norm": 0.3500906825065613, + "learning_rate": 0.0005149889212827987, + "loss": 3.4121, + "step": 24400 + }, + { + "epoch": 7.124125874125874, + "grad_norm": 0.32340437173843384, + "learning_rate": 0.0005148139941690961, + "loss": 3.4042, + "step": 24450 + }, + { + "epoch": 7.138694638694639, + "grad_norm": 0.36307796835899353, + "learning_rate": 0.0005146390670553936, + "loss": 3.4152, + "step": 24500 + }, + { + "epoch": 7.153263403263403, + "grad_norm": 0.35622280836105347, + "learning_rate": 0.0005144641399416909, + "loss": 3.4038, + "step": 24550 + }, + { + "epoch": 7.1678321678321675, + "grad_norm": 0.34201404452323914, + "learning_rate": 0.0005142892128279883, + "loss": 3.413, + "step": 24600 + }, + { + "epoch": 7.182400932400933, + "grad_norm": 0.3477611243724823, + "learning_rate": 0.0005141142857142856, + "loss": 3.4147, + "step": 24650 + }, + { + "epoch": 7.196969696969697, + "grad_norm": 0.3193877339363098, + "learning_rate": 0.0005139393586005831, + "loss": 3.4349, + "step": 24700 + }, + { + "epoch": 7.211538461538462, + "grad_norm": 0.3370342254638672, + "learning_rate": 0.0005137644314868804, + "loss": 3.4269, + "step": 24750 + }, + { + "epoch": 7.226107226107226, + "grad_norm": 0.35344481468200684, + "learning_rate": 0.0005135895043731778, + "loss": 3.4046, + "step": 24800 + }, + { + "epoch": 7.2406759906759905, + "grad_norm": 0.3530924320220947, + "learning_rate": 0.0005134145772594752, + "loss": 3.4115, + "step": 24850 + }, + { + "epoch": 7.255244755244755, + "grad_norm": 0.3493140935897827, + "learning_rate": 0.0005132396501457726, + "loss": 3.4237, + "step": 24900 + }, + { + "epoch": 7.269813519813519, + "grad_norm": 0.33685219287872314, + "learning_rate": 0.0005130647230320699, + "loss": 3.4313, + "step": 24950 + }, + { + "epoch": 7.284382284382285, + "grad_norm": 0.3504573702812195, + "learning_rate": 0.0005128897959183673, + "loss": 3.4237, + "step": 25000 + }, + { + "epoch": 7.284382284382285, + "eval_accuracy": 0.3649218455839104, + "eval_loss": 3.599851369857788, + "eval_runtime": 180.4257, + "eval_samples_per_second": 92.237, + "eval_steps_per_second": 5.77, + "step": 25000 + }, + { + "epoch": 7.298951048951049, + "grad_norm": 0.34710603952407837, + "learning_rate": 0.0005127148688046647, + "loss": 3.4347, + "step": 25050 + }, + { + "epoch": 7.313519813519814, + "grad_norm": 0.3456078767776489, + "learning_rate": 0.0005125399416909621, + "loss": 3.4325, + "step": 25100 + }, + { + "epoch": 7.328088578088578, + "grad_norm": 0.36139947175979614, + "learning_rate": 0.0005123650145772594, + "loss": 3.4531, + "step": 25150 + }, + { + "epoch": 7.3426573426573425, + "grad_norm": 0.3331305980682373, + "learning_rate": 0.0005121900874635568, + "loss": 3.4372, + "step": 25200 + }, + { + "epoch": 7.357226107226107, + "grad_norm": 0.3419002294540405, + "learning_rate": 0.0005120151603498543, + "loss": 3.4222, + "step": 25250 + }, + { + "epoch": 7.371794871794872, + "grad_norm": 0.37077078223228455, + "learning_rate": 0.0005118402332361515, + "loss": 3.438, + "step": 25300 + }, + { + "epoch": 7.386363636363637, + "grad_norm": 0.37061864137649536, + "learning_rate": 0.0005116653061224489, + "loss": 3.4384, + "step": 25350 + }, + { + "epoch": 7.400932400932401, + "grad_norm": 0.33451831340789795, + "learning_rate": 0.0005114903790087463, + "loss": 3.4323, + "step": 25400 + }, + { + "epoch": 7.415501165501166, + "grad_norm": 0.36487630009651184, + "learning_rate": 0.0005113154518950437, + "loss": 3.4338, + "step": 25450 + }, + { + "epoch": 7.43006993006993, + "grad_norm": 0.34303170442581177, + "learning_rate": 0.0005111405247813411, + "loss": 3.446, + "step": 25500 + }, + { + "epoch": 7.444638694638694, + "grad_norm": 0.3491624593734741, + "learning_rate": 0.0005109655976676384, + "loss": 3.4407, + "step": 25550 + }, + { + "epoch": 7.459207459207459, + "grad_norm": 0.3570358455181122, + "learning_rate": 0.0005107906705539358, + "loss": 3.4499, + "step": 25600 + }, + { + "epoch": 7.473776223776224, + "grad_norm": 0.3398280739784241, + "learning_rate": 0.0005106157434402332, + "loss": 3.438, + "step": 25650 + }, + { + "epoch": 7.488344988344989, + "grad_norm": 0.3448866307735443, + "learning_rate": 0.0005104408163265306, + "loss": 3.4396, + "step": 25700 + }, + { + "epoch": 7.502913752913753, + "grad_norm": 0.35469329357147217, + "learning_rate": 0.0005102658892128279, + "loss": 3.4361, + "step": 25750 + }, + { + "epoch": 7.5174825174825175, + "grad_norm": 0.35180532932281494, + "learning_rate": 0.0005100909620991253, + "loss": 3.4589, + "step": 25800 + }, + { + "epoch": 7.532051282051282, + "grad_norm": 0.3383461833000183, + "learning_rate": 0.0005099160349854227, + "loss": 3.446, + "step": 25850 + }, + { + "epoch": 7.546620046620046, + "grad_norm": 0.35350677371025085, + "learning_rate": 0.0005097411078717201, + "loss": 3.4507, + "step": 25900 + }, + { + "epoch": 7.561188811188811, + "grad_norm": 0.3186721205711365, + "learning_rate": 0.0005095661807580174, + "loss": 3.4341, + "step": 25950 + }, + { + "epoch": 7.575757575757576, + "grad_norm": 0.3171408474445343, + "learning_rate": 0.0005093912536443149, + "loss": 3.4501, + "step": 26000 + }, + { + "epoch": 7.575757575757576, + "eval_accuracy": 0.3657292244576768, + "eval_loss": 3.5900423526763916, + "eval_runtime": 180.2145, + "eval_samples_per_second": 92.346, + "eval_steps_per_second": 5.776, + "step": 26000 + }, + { + "epoch": 7.590326340326341, + "grad_norm": 0.35610276460647583, + "learning_rate": 0.0005092163265306122, + "loss": 3.4641, + "step": 26050 + }, + { + "epoch": 7.604895104895105, + "grad_norm": 0.37525665760040283, + "learning_rate": 0.0005090413994169096, + "loss": 3.4647, + "step": 26100 + }, + { + "epoch": 7.619463869463869, + "grad_norm": 0.33461683988571167, + "learning_rate": 0.000508866472303207, + "loss": 3.4597, + "step": 26150 + }, + { + "epoch": 7.634032634032634, + "grad_norm": 0.3235708773136139, + "learning_rate": 0.0005086915451895044, + "loss": 3.4497, + "step": 26200 + }, + { + "epoch": 7.648601398601398, + "grad_norm": 0.34801435470581055, + "learning_rate": 0.0005085166180758017, + "loss": 3.4504, + "step": 26250 + }, + { + "epoch": 7.663170163170163, + "grad_norm": 0.32955285906791687, + "learning_rate": 0.0005083416909620991, + "loss": 3.449, + "step": 26300 + }, + { + "epoch": 7.677738927738928, + "grad_norm": 0.3284403383731842, + "learning_rate": 0.0005081667638483964, + "loss": 3.4546, + "step": 26350 + }, + { + "epoch": 7.6923076923076925, + "grad_norm": 0.32493704557418823, + "learning_rate": 0.0005079918367346939, + "loss": 3.4384, + "step": 26400 + }, + { + "epoch": 7.706876456876457, + "grad_norm": 0.34628820419311523, + "learning_rate": 0.0005078169096209912, + "loss": 3.4534, + "step": 26450 + }, + { + "epoch": 7.721445221445221, + "grad_norm": 0.33644041419029236, + "learning_rate": 0.0005076419825072886, + "loss": 3.4552, + "step": 26500 + }, + { + "epoch": 7.736013986013986, + "grad_norm": 0.34094300866127014, + "learning_rate": 0.000507467055393586, + "loss": 3.4536, + "step": 26550 + }, + { + "epoch": 7.75058275058275, + "grad_norm": 0.34397369623184204, + "learning_rate": 0.0005072921282798834, + "loss": 3.4635, + "step": 26600 + }, + { + "epoch": 7.765151515151516, + "grad_norm": 0.3402233123779297, + "learning_rate": 0.0005071172011661807, + "loss": 3.4708, + "step": 26650 + }, + { + "epoch": 7.77972027972028, + "grad_norm": 0.3712950050830841, + "learning_rate": 0.0005069422740524781, + "loss": 3.4713, + "step": 26700 + }, + { + "epoch": 7.7942890442890445, + "grad_norm": 0.3284025490283966, + "learning_rate": 0.0005067673469387754, + "loss": 3.4556, + "step": 26750 + }, + { + "epoch": 7.808857808857809, + "grad_norm": 0.34438565373420715, + "learning_rate": 0.0005065924198250729, + "loss": 3.4744, + "step": 26800 + }, + { + "epoch": 7.823426573426573, + "grad_norm": 0.33172059059143066, + "learning_rate": 0.0005064174927113702, + "loss": 3.4534, + "step": 26850 + }, + { + "epoch": 7.837995337995338, + "grad_norm": 0.3375876843929291, + "learning_rate": 0.0005062425655976676, + "loss": 3.4484, + "step": 26900 + }, + { + "epoch": 7.852564102564102, + "grad_norm": 0.3456272780895233, + "learning_rate": 0.0005060676384839649, + "loss": 3.4474, + "step": 26950 + }, + { + "epoch": 7.867132867132867, + "grad_norm": 0.3476708233356476, + "learning_rate": 0.0005058927113702624, + "loss": 3.4529, + "step": 27000 + }, + { + "epoch": 7.867132867132867, + "eval_accuracy": 0.36633469981756955, + "eval_loss": 3.5783565044403076, + "eval_runtime": 182.5033, + "eval_samples_per_second": 91.187, + "eval_steps_per_second": 5.704, + "step": 27000 + }, + { + "epoch": 7.881701631701632, + "grad_norm": 0.357236385345459, + "learning_rate": 0.0005057177842565598, + "loss": 3.4582, + "step": 27050 + }, + { + "epoch": 7.896270396270396, + "grad_norm": 0.3404090404510498, + "learning_rate": 0.0005055428571428571, + "loss": 3.4633, + "step": 27100 + }, + { + "epoch": 7.910839160839161, + "grad_norm": 0.341049462556839, + "learning_rate": 0.0005053679300291544, + "loss": 3.4626, + "step": 27150 + }, + { + "epoch": 7.925407925407925, + "grad_norm": 0.321346640586853, + "learning_rate": 0.0005051930029154519, + "loss": 3.4529, + "step": 27200 + }, + { + "epoch": 7.93997668997669, + "grad_norm": 0.31583067774772644, + "learning_rate": 0.0005050180758017492, + "loss": 3.4681, + "step": 27250 + }, + { + "epoch": 7.954545454545455, + "grad_norm": 0.36198437213897705, + "learning_rate": 0.0005048431486880466, + "loss": 3.452, + "step": 27300 + }, + { + "epoch": 7.9691142191142195, + "grad_norm": 0.34580230712890625, + "learning_rate": 0.0005046682215743439, + "loss": 3.4541, + "step": 27350 + }, + { + "epoch": 7.983682983682984, + "grad_norm": 0.3525956869125366, + "learning_rate": 0.0005044932944606414, + "loss": 3.4677, + "step": 27400 + }, + { + "epoch": 7.998251748251748, + "grad_norm": 0.312714546918869, + "learning_rate": 0.0005043183673469388, + "loss": 3.453, + "step": 27450 + }, + { + "epoch": 8.012820512820513, + "grad_norm": 0.3349739909172058, + "learning_rate": 0.0005041434402332361, + "loss": 3.3605, + "step": 27500 + }, + { + "epoch": 8.027389277389277, + "grad_norm": 0.35149267315864563, + "learning_rate": 0.0005039685131195334, + "loss": 3.3568, + "step": 27550 + }, + { + "epoch": 8.041958041958042, + "grad_norm": 0.35762932896614075, + "learning_rate": 0.0005037935860058309, + "loss": 3.3594, + "step": 27600 + }, + { + "epoch": 8.056526806526806, + "grad_norm": 0.3227376341819763, + "learning_rate": 0.0005036186588921282, + "loss": 3.3618, + "step": 27650 + }, + { + "epoch": 8.07109557109557, + "grad_norm": 0.3321167826652527, + "learning_rate": 0.0005034437317784256, + "loss": 3.3757, + "step": 27700 + }, + { + "epoch": 8.085664335664335, + "grad_norm": 0.34823182225227356, + "learning_rate": 0.000503268804664723, + "loss": 3.368, + "step": 27750 + }, + { + "epoch": 8.1002331002331, + "grad_norm": 0.3144749701023102, + "learning_rate": 0.0005030938775510204, + "loss": 3.3762, + "step": 27800 + }, + { + "epoch": 8.114801864801866, + "grad_norm": 0.33065780997276306, + "learning_rate": 0.0005029189504373178, + "loss": 3.3705, + "step": 27850 + }, + { + "epoch": 8.12937062937063, + "grad_norm": 0.3569163382053375, + "learning_rate": 0.0005027440233236151, + "loss": 3.3917, + "step": 27900 + }, + { + "epoch": 8.143939393939394, + "grad_norm": 0.336088091135025, + "learning_rate": 0.0005025690962099126, + "loss": 3.387, + "step": 27950 + }, + { + "epoch": 8.158508158508159, + "grad_norm": 0.31934666633605957, + "learning_rate": 0.0005023941690962099, + "loss": 3.3725, + "step": 28000 + }, + { + "epoch": 8.158508158508159, + "eval_accuracy": 0.3663358757262118, + "eval_loss": 3.590759754180908, + "eval_runtime": 182.8908, + "eval_samples_per_second": 90.994, + "eval_steps_per_second": 5.692, + "step": 28000 + }, + { + "epoch": 8.173076923076923, + "grad_norm": 0.36414483189582825, + "learning_rate": 0.0005022192419825072, + "loss": 3.3922, + "step": 28050 + }, + { + "epoch": 8.187645687645688, + "grad_norm": 0.3432634472846985, + "learning_rate": 0.0005020443148688046, + "loss": 3.3869, + "step": 28100 + }, + { + "epoch": 8.202214452214452, + "grad_norm": 0.34101763367652893, + "learning_rate": 0.000501869387755102, + "loss": 3.3947, + "step": 28150 + }, + { + "epoch": 8.216783216783217, + "grad_norm": 0.36927416920661926, + "learning_rate": 0.0005016944606413994, + "loss": 3.3831, + "step": 28200 + }, + { + "epoch": 8.231351981351981, + "grad_norm": 0.3365326523780823, + "learning_rate": 0.0005015195335276967, + "loss": 3.3901, + "step": 28250 + }, + { + "epoch": 8.245920745920746, + "grad_norm": 0.36989808082580566, + "learning_rate": 0.0005013446064139941, + "loss": 3.3948, + "step": 28300 + }, + { + "epoch": 8.26048951048951, + "grad_norm": 0.3689005374908447, + "learning_rate": 0.0005011696793002916, + "loss": 3.3917, + "step": 28350 + }, + { + "epoch": 8.275058275058274, + "grad_norm": 0.3656901717185974, + "learning_rate": 0.0005009947521865889, + "loss": 3.3954, + "step": 28400 + }, + { + "epoch": 8.289627039627039, + "grad_norm": 0.3664736747741699, + "learning_rate": 0.0005008198250728862, + "loss": 3.4009, + "step": 28450 + }, + { + "epoch": 8.304195804195805, + "grad_norm": 0.3412057161331177, + "learning_rate": 0.0005006448979591836, + "loss": 3.3916, + "step": 28500 + }, + { + "epoch": 8.31876456876457, + "grad_norm": 0.37343958020210266, + "learning_rate": 0.000500469970845481, + "loss": 3.3981, + "step": 28550 + }, + { + "epoch": 8.333333333333334, + "grad_norm": 0.40472412109375, + "learning_rate": 0.0005002950437317784, + "loss": 3.4063, + "step": 28600 + }, + { + "epoch": 8.347902097902098, + "grad_norm": 0.33591440320014954, + "learning_rate": 0.0005001201166180757, + "loss": 3.3988, + "step": 28650 + }, + { + "epoch": 8.362470862470863, + "grad_norm": 0.3387737572193146, + "learning_rate": 0.0004999451895043731, + "loss": 3.4103, + "step": 28700 + }, + { + "epoch": 8.377039627039627, + "grad_norm": 0.3714272975921631, + "learning_rate": 0.0004997702623906706, + "loss": 3.4127, + "step": 28750 + }, + { + "epoch": 8.391608391608392, + "grad_norm": 0.34964922070503235, + "learning_rate": 0.0004995953352769679, + "loss": 3.409, + "step": 28800 + }, + { + "epoch": 8.406177156177156, + "grad_norm": 0.34254536032676697, + "learning_rate": 0.0004994204081632653, + "loss": 3.4076, + "step": 28850 + }, + { + "epoch": 8.42074592074592, + "grad_norm": 0.34269341826438904, + "learning_rate": 0.0004992454810495626, + "loss": 3.3974, + "step": 28900 + }, + { + "epoch": 8.435314685314685, + "grad_norm": 0.32962408661842346, + "learning_rate": 0.00049907055393586, + "loss": 3.3982, + "step": 28950 + }, + { + "epoch": 8.44988344988345, + "grad_norm": 0.37458404898643494, + "learning_rate": 0.0004988956268221574, + "loss": 3.4032, + "step": 29000 + }, + { + "epoch": 8.44988344988345, + "eval_accuracy": 0.3665615325946589, + "eval_loss": 3.5825419425964355, + "eval_runtime": 180.8696, + "eval_samples_per_second": 92.011, + "eval_steps_per_second": 5.756, + "step": 29000 + }, + { + "epoch": 8.464452214452214, + "grad_norm": 0.34700310230255127, + "learning_rate": 0.0004987206997084547, + "loss": 3.4302, + "step": 29050 + }, + { + "epoch": 8.479020979020978, + "grad_norm": 0.3363369405269623, + "learning_rate": 0.0004985457725947521, + "loss": 3.4196, + "step": 29100 + }, + { + "epoch": 8.493589743589745, + "grad_norm": 0.34493017196655273, + "learning_rate": 0.0004983708454810496, + "loss": 3.4233, + "step": 29150 + }, + { + "epoch": 8.508158508158509, + "grad_norm": 0.3357371389865875, + "learning_rate": 0.0004981959183673469, + "loss": 3.4124, + "step": 29200 + }, + { + "epoch": 8.522727272727273, + "grad_norm": 0.3642560541629791, + "learning_rate": 0.0004980209912536443, + "loss": 3.4161, + "step": 29250 + }, + { + "epoch": 8.537296037296038, + "grad_norm": 0.3482314944267273, + "learning_rate": 0.0004978460641399417, + "loss": 3.4204, + "step": 29300 + }, + { + "epoch": 8.551864801864802, + "grad_norm": 0.3307981491088867, + "learning_rate": 0.000497671137026239, + "loss": 3.4275, + "step": 29350 + }, + { + "epoch": 8.566433566433567, + "grad_norm": 0.3394106924533844, + "learning_rate": 0.0004974962099125364, + "loss": 3.4057, + "step": 29400 + }, + { + "epoch": 8.581002331002331, + "grad_norm": 0.3372842073440552, + "learning_rate": 0.0004973212827988337, + "loss": 3.4092, + "step": 29450 + }, + { + "epoch": 8.595571095571096, + "grad_norm": 0.32758432626724243, + "learning_rate": 0.0004971463556851312, + "loss": 3.4188, + "step": 29500 + }, + { + "epoch": 8.61013986013986, + "grad_norm": 0.3386209309101105, + "learning_rate": 0.0004969714285714286, + "loss": 3.4205, + "step": 29550 + }, + { + "epoch": 8.624708624708624, + "grad_norm": 0.3470524549484253, + "learning_rate": 0.0004967965014577259, + "loss": 3.4182, + "step": 29600 + }, + { + "epoch": 8.639277389277389, + "grad_norm": 0.3339594900608063, + "learning_rate": 0.0004966215743440233, + "loss": 3.4176, + "step": 29650 + }, + { + "epoch": 8.653846153846153, + "grad_norm": 0.3515772223472595, + "learning_rate": 0.0004964466472303207, + "loss": 3.4288, + "step": 29700 + }, + { + "epoch": 8.668414918414918, + "grad_norm": 0.35019543766975403, + "learning_rate": 0.000496271720116618, + "loss": 3.4287, + "step": 29750 + }, + { + "epoch": 8.682983682983682, + "grad_norm": 0.3279973566532135, + "learning_rate": 0.0004960967930029154, + "loss": 3.4211, + "step": 29800 + }, + { + "epoch": 8.697552447552448, + "grad_norm": 0.33548232913017273, + "learning_rate": 0.0004959218658892127, + "loss": 3.4308, + "step": 29850 + }, + { + "epoch": 8.712121212121213, + "grad_norm": 0.3599195182323456, + "learning_rate": 0.0004957469387755102, + "loss": 3.4228, + "step": 29900 + }, + { + "epoch": 8.726689976689977, + "grad_norm": 0.34652629494667053, + "learning_rate": 0.0004955720116618075, + "loss": 3.4285, + "step": 29950 + }, + { + "epoch": 8.741258741258742, + "grad_norm": 0.332381933927536, + "learning_rate": 0.0004953970845481049, + "loss": 3.4275, + "step": 30000 + }, + { + "epoch": 8.741258741258742, + "eval_accuracy": 0.36736185601657184, + "eval_loss": 3.5740106105804443, + "eval_runtime": 182.2015, + "eval_samples_per_second": 91.338, + "eval_steps_per_second": 5.713, + "step": 30000 + }, + { + "epoch": 8.755827505827506, + "grad_norm": 0.3524293601512909, + "learning_rate": 0.0004952221574344023, + "loss": 3.4245, + "step": 30050 + }, + { + "epoch": 8.77039627039627, + "grad_norm": 0.33680975437164307, + "learning_rate": 0.0004950472303206997, + "loss": 3.4345, + "step": 30100 + }, + { + "epoch": 8.784965034965035, + "grad_norm": 0.34272924065589905, + "learning_rate": 0.0004948723032069971, + "loss": 3.4335, + "step": 30150 + }, + { + "epoch": 8.7995337995338, + "grad_norm": 0.3409082591533661, + "learning_rate": 0.0004946973760932944, + "loss": 3.4283, + "step": 30200 + }, + { + "epoch": 8.814102564102564, + "grad_norm": 0.36862286925315857, + "learning_rate": 0.0004945224489795917, + "loss": 3.4235, + "step": 30250 + }, + { + "epoch": 8.828671328671328, + "grad_norm": 0.3254280388355255, + "learning_rate": 0.0004943475218658892, + "loss": 3.4312, + "step": 30300 + }, + { + "epoch": 8.843240093240093, + "grad_norm": 0.3392513394355774, + "learning_rate": 0.0004941725947521865, + "loss": 3.4257, + "step": 30350 + }, + { + "epoch": 8.857808857808857, + "grad_norm": 0.3554167151451111, + "learning_rate": 0.0004939976676384839, + "loss": 3.4304, + "step": 30400 + }, + { + "epoch": 8.872377622377622, + "grad_norm": 0.35996973514556885, + "learning_rate": 0.0004938227405247813, + "loss": 3.4399, + "step": 30450 + }, + { + "epoch": 8.886946386946388, + "grad_norm": 0.36442074179649353, + "learning_rate": 0.0004936478134110787, + "loss": 3.4316, + "step": 30500 + }, + { + "epoch": 8.901515151515152, + "grad_norm": 0.36240333318710327, + "learning_rate": 0.0004934728862973761, + "loss": 3.4262, + "step": 30550 + }, + { + "epoch": 8.916083916083917, + "grad_norm": 0.33148348331451416, + "learning_rate": 0.0004932979591836734, + "loss": 3.4361, + "step": 30600 + }, + { + "epoch": 8.930652680652681, + "grad_norm": 0.3203504681587219, + "learning_rate": 0.0004931230320699707, + "loss": 3.4466, + "step": 30650 + }, + { + "epoch": 8.945221445221446, + "grad_norm": 0.357393741607666, + "learning_rate": 0.0004929481049562682, + "loss": 3.4402, + "step": 30700 + }, + { + "epoch": 8.95979020979021, + "grad_norm": 0.36473795771598816, + "learning_rate": 0.0004927731778425655, + "loss": 3.4386, + "step": 30750 + }, + { + "epoch": 8.974358974358974, + "grad_norm": 0.31827130913734436, + "learning_rate": 0.0004925982507288629, + "loss": 3.4416, + "step": 30800 + }, + { + "epoch": 8.988927738927739, + "grad_norm": 0.35165274143218994, + "learning_rate": 0.0004924233236151604, + "loss": 3.4313, + "step": 30850 + }, + { + "epoch": 9.003496503496503, + "grad_norm": 0.3829018771648407, + "learning_rate": 0.0004922483965014577, + "loss": 3.4134, + "step": 30900 + }, + { + "epoch": 9.018065268065268, + "grad_norm": 0.3232770264148712, + "learning_rate": 0.0004920734693877551, + "loss": 3.3232, + "step": 30950 + }, + { + "epoch": 9.032634032634032, + "grad_norm": 0.33512336015701294, + "learning_rate": 0.0004918985422740524, + "loss": 3.3293, + "step": 31000 + }, + { + "epoch": 9.032634032634032, + "eval_accuracy": 0.3672867154543323, + "eval_loss": 3.579836130142212, + "eval_runtime": 181.2154, + "eval_samples_per_second": 91.835, + "eval_steps_per_second": 5.745, + "step": 31000 + }, + { + "epoch": 9.047202797202797, + "grad_norm": 0.35247862339019775, + "learning_rate": 0.0004917236151603499, + "loss": 3.323, + "step": 31050 + }, + { + "epoch": 9.061771561771561, + "grad_norm": 0.33538663387298584, + "learning_rate": 0.0004915486880466472, + "loss": 3.3462, + "step": 31100 + }, + { + "epoch": 9.076340326340326, + "grad_norm": 0.3494170010089874, + "learning_rate": 0.0004913737609329445, + "loss": 3.3328, + "step": 31150 + }, + { + "epoch": 9.090909090909092, + "grad_norm": 0.35296133160591125, + "learning_rate": 0.0004911988338192419, + "loss": 3.3354, + "step": 31200 + }, + { + "epoch": 9.105477855477856, + "grad_norm": 0.3609370291233063, + "learning_rate": 0.0004910239067055393, + "loss": 3.3555, + "step": 31250 + }, + { + "epoch": 9.12004662004662, + "grad_norm": 0.3352583050727844, + "learning_rate": 0.0004908489795918367, + "loss": 3.3444, + "step": 31300 + }, + { + "epoch": 9.134615384615385, + "grad_norm": 0.3525612950325012, + "learning_rate": 0.0004906740524781341, + "loss": 3.3484, + "step": 31350 + }, + { + "epoch": 9.14918414918415, + "grad_norm": 0.37619081139564514, + "learning_rate": 0.0004904991253644314, + "loss": 3.3576, + "step": 31400 + }, + { + "epoch": 9.163752913752914, + "grad_norm": 0.3352401852607727, + "learning_rate": 0.0004903241982507289, + "loss": 3.3542, + "step": 31450 + }, + { + "epoch": 9.178321678321678, + "grad_norm": 0.3672662675380707, + "learning_rate": 0.0004901492711370262, + "loss": 3.3445, + "step": 31500 + }, + { + "epoch": 9.192890442890443, + "grad_norm": 0.36354750394821167, + "learning_rate": 0.0004899743440233235, + "loss": 3.3598, + "step": 31550 + }, + { + "epoch": 9.207459207459207, + "grad_norm": 0.332333505153656, + "learning_rate": 0.0004897994169096209, + "loss": 3.3689, + "step": 31600 + }, + { + "epoch": 9.222027972027972, + "grad_norm": 0.35484346747398376, + "learning_rate": 0.0004896244897959183, + "loss": 3.3683, + "step": 31650 + }, + { + "epoch": 9.236596736596736, + "grad_norm": 0.3454098403453827, + "learning_rate": 0.0004894495626822157, + "loss": 3.3646, + "step": 31700 + }, + { + "epoch": 9.2511655011655, + "grad_norm": 0.34342116117477417, + "learning_rate": 0.0004892746355685131, + "loss": 3.3622, + "step": 31750 + }, + { + "epoch": 9.265734265734265, + "grad_norm": 0.3866739273071289, + "learning_rate": 0.0004890997084548104, + "loss": 3.3679, + "step": 31800 + }, + { + "epoch": 9.280303030303031, + "grad_norm": 0.3526863753795624, + "learning_rate": 0.0004889247813411079, + "loss": 3.3813, + "step": 31850 + }, + { + "epoch": 9.294871794871796, + "grad_norm": 0.36674419045448303, + "learning_rate": 0.0004887498542274052, + "loss": 3.3658, + "step": 31900 + }, + { + "epoch": 9.30944055944056, + "grad_norm": 0.36833953857421875, + "learning_rate": 0.0004885749271137026, + "loss": 3.3734, + "step": 31950 + }, + { + "epoch": 9.324009324009324, + "grad_norm": 0.3428957462310791, + "learning_rate": 0.0004883999999999999, + "loss": 3.3803, + "step": 32000 + }, + { + "epoch": 9.324009324009324, + "eval_accuracy": 0.36745534075363046, + "eval_loss": 3.579030990600586, + "eval_runtime": 180.7616, + "eval_samples_per_second": 92.066, + "eval_steps_per_second": 5.759, + "step": 32000 + }, + { + "epoch": 9.338578088578089, + "grad_norm": 0.3581482172012329, + "learning_rate": 0.0004882250728862973, + "loss": 3.3766, + "step": 32050 + }, + { + "epoch": 9.353146853146853, + "grad_norm": 0.3714257478713989, + "learning_rate": 0.0004880501457725947, + "loss": 3.3814, + "step": 32100 + }, + { + "epoch": 9.367715617715618, + "grad_norm": 0.361931174993515, + "learning_rate": 0.00048787521865889207, + "loss": 3.3851, + "step": 32150 + }, + { + "epoch": 9.382284382284382, + "grad_norm": 0.3409428596496582, + "learning_rate": 0.00048770029154518945, + "loss": 3.384, + "step": 32200 + }, + { + "epoch": 9.396853146853147, + "grad_norm": 0.40810930728912354, + "learning_rate": 0.0004875253644314868, + "loss": 3.3816, + "step": 32250 + }, + { + "epoch": 9.411421911421911, + "grad_norm": 0.3254898190498352, + "learning_rate": 0.0004873504373177842, + "loss": 3.381, + "step": 32300 + }, + { + "epoch": 9.425990675990676, + "grad_norm": 0.354233056306839, + "learning_rate": 0.00048717551020408163, + "loss": 3.3847, + "step": 32350 + }, + { + "epoch": 9.44055944055944, + "grad_norm": 0.3318980038166046, + "learning_rate": 0.000487000583090379, + "loss": 3.3792, + "step": 32400 + }, + { + "epoch": 9.455128205128204, + "grad_norm": 0.32618919014930725, + "learning_rate": 0.00048682565597667633, + "loss": 3.3899, + "step": 32450 + }, + { + "epoch": 9.469696969696969, + "grad_norm": 0.39949190616607666, + "learning_rate": 0.0004866507288629737, + "loss": 3.3837, + "step": 32500 + }, + { + "epoch": 9.484265734265735, + "grad_norm": 0.3685564398765564, + "learning_rate": 0.0004864758017492711, + "loss": 3.3851, + "step": 32550 + }, + { + "epoch": 9.4988344988345, + "grad_norm": 0.359235018491745, + "learning_rate": 0.00048630087463556845, + "loss": 3.3893, + "step": 32600 + }, + { + "epoch": 9.513403263403264, + "grad_norm": 0.33161383867263794, + "learning_rate": 0.00048612594752186583, + "loss": 3.4009, + "step": 32650 + }, + { + "epoch": 9.527972027972028, + "grad_norm": 0.3646078109741211, + "learning_rate": 0.0004859510204081632, + "loss": 3.4062, + "step": 32700 + }, + { + "epoch": 9.542540792540793, + "grad_norm": 0.32304298877716064, + "learning_rate": 0.00048577609329446064, + "loss": 3.4058, + "step": 32750 + }, + { + "epoch": 9.557109557109557, + "grad_norm": 0.340385764837265, + "learning_rate": 0.000485601166180758, + "loss": 3.4003, + "step": 32800 + }, + { + "epoch": 9.571678321678322, + "grad_norm": 0.353704571723938, + "learning_rate": 0.0004854262390670554, + "loss": 3.3916, + "step": 32850 + }, + { + "epoch": 9.586247086247086, + "grad_norm": 0.3353423476219177, + "learning_rate": 0.0004852513119533527, + "loss": 3.4019, + "step": 32900 + }, + { + "epoch": 9.60081585081585, + "grad_norm": 0.3232695758342743, + "learning_rate": 0.0004850763848396501, + "loss": 3.3974, + "step": 32950 + }, + { + "epoch": 9.615384615384615, + "grad_norm": 0.36285659670829773, + "learning_rate": 0.00048490145772594746, + "loss": 3.3931, + "step": 33000 + }, + { + "epoch": 9.615384615384615, + "eval_accuracy": 0.368203101059235, + "eval_loss": 3.5726845264434814, + "eval_runtime": 180.0842, + "eval_samples_per_second": 92.412, + "eval_steps_per_second": 5.781, + "step": 33000 + }, + { + "epoch": 9.62995337995338, + "grad_norm": 0.3308947682380676, + "learning_rate": 0.00048472653061224484, + "loss": 3.4049, + "step": 33050 + }, + { + "epoch": 9.644522144522144, + "grad_norm": 0.3408724367618561, + "learning_rate": 0.0004845516034985422, + "loss": 3.404, + "step": 33100 + }, + { + "epoch": 9.659090909090908, + "grad_norm": 0.34324896335601807, + "learning_rate": 0.0004843766763848396, + "loss": 3.399, + "step": 33150 + }, + { + "epoch": 9.673659673659674, + "grad_norm": 0.34077367186546326, + "learning_rate": 0.000484201749271137, + "loss": 3.3953, + "step": 33200 + }, + { + "epoch": 9.688228438228439, + "grad_norm": 0.35905328392982483, + "learning_rate": 0.0004840268221574344, + "loss": 3.3853, + "step": 33250 + }, + { + "epoch": 9.702797202797203, + "grad_norm": 0.3622050881385803, + "learning_rate": 0.00048385189504373177, + "loss": 3.4025, + "step": 33300 + }, + { + "epoch": 9.717365967365968, + "grad_norm": 0.34367215633392334, + "learning_rate": 0.0004836769679300291, + "loss": 3.4029, + "step": 33350 + }, + { + "epoch": 9.731934731934732, + "grad_norm": 0.32383468747138977, + "learning_rate": 0.00048350204081632647, + "loss": 3.4049, + "step": 33400 + }, + { + "epoch": 9.746503496503497, + "grad_norm": 0.36959537863731384, + "learning_rate": 0.00048332711370262384, + "loss": 3.405, + "step": 33450 + }, + { + "epoch": 9.761072261072261, + "grad_norm": 0.3404758870601654, + "learning_rate": 0.0004831521865889212, + "loss": 3.4005, + "step": 33500 + }, + { + "epoch": 9.775641025641026, + "grad_norm": 0.36188212037086487, + "learning_rate": 0.0004829772594752186, + "loss": 3.4074, + "step": 33550 + }, + { + "epoch": 9.79020979020979, + "grad_norm": 0.38642576336860657, + "learning_rate": 0.00048280233236151597, + "loss": 3.4068, + "step": 33600 + }, + { + "epoch": 9.804778554778554, + "grad_norm": 0.32433605194091797, + "learning_rate": 0.0004826274052478134, + "loss": 3.4092, + "step": 33650 + }, + { + "epoch": 9.819347319347319, + "grad_norm": 0.3639720678329468, + "learning_rate": 0.0004824524781341108, + "loss": 3.3985, + "step": 33700 + }, + { + "epoch": 9.833916083916083, + "grad_norm": 0.3690209686756134, + "learning_rate": 0.00048227755102040815, + "loss": 3.407, + "step": 33750 + }, + { + "epoch": 9.848484848484848, + "grad_norm": 0.32806217670440674, + "learning_rate": 0.0004821026239067055, + "loss": 3.4117, + "step": 33800 + }, + { + "epoch": 9.863053613053612, + "grad_norm": 0.32632794976234436, + "learning_rate": 0.00048192769679300285, + "loss": 3.4169, + "step": 33850 + }, + { + "epoch": 9.877622377622378, + "grad_norm": 0.34658604860305786, + "learning_rate": 0.0004817527696793002, + "loss": 3.4117, + "step": 33900 + }, + { + "epoch": 9.892191142191143, + "grad_norm": 0.34974268078804016, + "learning_rate": 0.0004815778425655976, + "loss": 3.4073, + "step": 33950 + }, + { + "epoch": 9.906759906759907, + "grad_norm": 0.3343101739883423, + "learning_rate": 0.000481402915451895, + "loss": 3.4063, + "step": 34000 + }, + { + "epoch": 9.906759906759907, + "eval_accuracy": 0.3688429129514813, + "eval_loss": 3.5587732791900635, + "eval_runtime": 180.2379, + "eval_samples_per_second": 92.334, + "eval_steps_per_second": 5.776, + "step": 34000 + }, + { + "epoch": 9.921328671328672, + "grad_norm": 0.33629804849624634, + "learning_rate": 0.0004812279883381924, + "loss": 3.4184, + "step": 34050 + }, + { + "epoch": 9.935897435897436, + "grad_norm": 0.35826265811920166, + "learning_rate": 0.0004810530612244898, + "loss": 3.4062, + "step": 34100 + }, + { + "epoch": 9.9504662004662, + "grad_norm": 0.3323402404785156, + "learning_rate": 0.00048087813411078716, + "loss": 3.4029, + "step": 34150 + }, + { + "epoch": 9.965034965034965, + "grad_norm": 0.3231922388076782, + "learning_rate": 0.00048070320699708453, + "loss": 3.4137, + "step": 34200 + }, + { + "epoch": 9.97960372960373, + "grad_norm": 0.35591524839401245, + "learning_rate": 0.00048052827988338186, + "loss": 3.4172, + "step": 34250 + }, + { + "epoch": 9.994172494172494, + "grad_norm": 0.3526099920272827, + "learning_rate": 0.00048035335276967923, + "loss": 3.4215, + "step": 34300 + }, + { + "epoch": 10.008741258741258, + "grad_norm": 0.367563933134079, + "learning_rate": 0.0004801784256559766, + "loss": 3.3311, + "step": 34350 + }, + { + "epoch": 10.023310023310023, + "grad_norm": 0.34572193026542664, + "learning_rate": 0.000480003498542274, + "loss": 3.3062, + "step": 34400 + }, + { + "epoch": 10.037878787878787, + "grad_norm": 0.362204909324646, + "learning_rate": 0.00047982857142857136, + "loss": 3.3028, + "step": 34450 + }, + { + "epoch": 10.052447552447552, + "grad_norm": 0.3749389946460724, + "learning_rate": 0.0004796536443148688, + "loss": 3.3031, + "step": 34500 + }, + { + "epoch": 10.067016317016318, + "grad_norm": 0.3729357421398163, + "learning_rate": 0.00047947871720116616, + "loss": 3.3036, + "step": 34550 + }, + { + "epoch": 10.081585081585082, + "grad_norm": 0.3892238140106201, + "learning_rate": 0.00047930379008746354, + "loss": 3.3145, + "step": 34600 + }, + { + "epoch": 10.096153846153847, + "grad_norm": 0.3650963008403778, + "learning_rate": 0.0004791288629737609, + "loss": 3.3232, + "step": 34650 + }, + { + "epoch": 10.110722610722611, + "grad_norm": 0.3529200851917267, + "learning_rate": 0.00047895393586005824, + "loss": 3.3166, + "step": 34700 + }, + { + "epoch": 10.125291375291376, + "grad_norm": 0.3430958390235901, + "learning_rate": 0.0004787790087463556, + "loss": 3.3311, + "step": 34750 + }, + { + "epoch": 10.13986013986014, + "grad_norm": 0.35546183586120605, + "learning_rate": 0.000478604081632653, + "loss": 3.3229, + "step": 34800 + }, + { + "epoch": 10.154428904428904, + "grad_norm": 0.3477681279182434, + "learning_rate": 0.00047842915451895037, + "loss": 3.3211, + "step": 34850 + }, + { + "epoch": 10.168997668997669, + "grad_norm": 0.35804784297943115, + "learning_rate": 0.0004782542274052478, + "loss": 3.318, + "step": 34900 + }, + { + "epoch": 10.183566433566433, + "grad_norm": 0.3714865744113922, + "learning_rate": 0.00047807930029154517, + "loss": 3.3529, + "step": 34950 + }, + { + "epoch": 10.198135198135198, + "grad_norm": 0.37744787335395813, + "learning_rate": 0.00047790437317784255, + "loss": 3.3379, + "step": 35000 + }, + { + "epoch": 10.198135198135198, + "eval_accuracy": 0.36837090322248356, + "eval_loss": 3.5747363567352295, + "eval_runtime": 180.0894, + "eval_samples_per_second": 92.41, + "eval_steps_per_second": 5.78, + "step": 35000 + }, + { + "epoch": 10.212703962703962, + "grad_norm": 0.3652697801589966, + "learning_rate": 0.0004777294460641399, + "loss": 3.3403, + "step": 35050 + }, + { + "epoch": 10.227272727272727, + "grad_norm": 0.3565238118171692, + "learning_rate": 0.0004775545189504373, + "loss": 3.3517, + "step": 35100 + }, + { + "epoch": 10.241841491841491, + "grad_norm": 0.3647816777229309, + "learning_rate": 0.0004773795918367346, + "loss": 3.3465, + "step": 35150 + }, + { + "epoch": 10.256410256410255, + "grad_norm": 0.3312961161136627, + "learning_rate": 0.000477204664723032, + "loss": 3.3448, + "step": 35200 + }, + { + "epoch": 10.270979020979022, + "grad_norm": 0.3463350534439087, + "learning_rate": 0.00047702973760932937, + "loss": 3.329, + "step": 35250 + }, + { + "epoch": 10.285547785547786, + "grad_norm": 0.36243367195129395, + "learning_rate": 0.00047685481049562675, + "loss": 3.3469, + "step": 35300 + }, + { + "epoch": 10.30011655011655, + "grad_norm": 0.3585239350795746, + "learning_rate": 0.0004766798833819242, + "loss": 3.3488, + "step": 35350 + }, + { + "epoch": 10.314685314685315, + "grad_norm": 0.33923816680908203, + "learning_rate": 0.00047650495626822155, + "loss": 3.357, + "step": 35400 + }, + { + "epoch": 10.32925407925408, + "grad_norm": 0.3626267910003662, + "learning_rate": 0.00047633002915451893, + "loss": 3.356, + "step": 35450 + }, + { + "epoch": 10.343822843822844, + "grad_norm": 0.36127206683158875, + "learning_rate": 0.0004761551020408163, + "loss": 3.3728, + "step": 35500 + }, + { + "epoch": 10.358391608391608, + "grad_norm": 0.3516559600830078, + "learning_rate": 0.0004759801749271137, + "loss": 3.3548, + "step": 35550 + }, + { + "epoch": 10.372960372960373, + "grad_norm": 0.38914352655410767, + "learning_rate": 0.000475805247813411, + "loss": 3.3593, + "step": 35600 + }, + { + "epoch": 10.387529137529137, + "grad_norm": 0.3629930317401886, + "learning_rate": 0.0004756303206997084, + "loss": 3.3497, + "step": 35650 + }, + { + "epoch": 10.402097902097902, + "grad_norm": 0.34036391973495483, + "learning_rate": 0.00047545539358600575, + "loss": 3.3635, + "step": 35700 + }, + { + "epoch": 10.416666666666666, + "grad_norm": 0.35723787546157837, + "learning_rate": 0.00047528046647230313, + "loss": 3.364, + "step": 35750 + }, + { + "epoch": 10.43123543123543, + "grad_norm": 0.3406592309474945, + "learning_rate": 0.00047510553935860056, + "loss": 3.3589, + "step": 35800 + }, + { + "epoch": 10.445804195804195, + "grad_norm": 0.3650604784488678, + "learning_rate": 0.00047493061224489794, + "loss": 3.3673, + "step": 35850 + }, + { + "epoch": 10.460372960372961, + "grad_norm": 0.33995601534843445, + "learning_rate": 0.0004747556851311953, + "loss": 3.3702, + "step": 35900 + }, + { + "epoch": 10.474941724941726, + "grad_norm": 0.3596780002117157, + "learning_rate": 0.0004745807580174927, + "loss": 3.3651, + "step": 35950 + }, + { + "epoch": 10.48951048951049, + "grad_norm": 0.358271062374115, + "learning_rate": 0.00047440583090379006, + "loss": 3.3768, + "step": 36000 + }, + { + "epoch": 10.48951048951049, + "eval_accuracy": 0.3689066471998911, + "eval_loss": 3.565972089767456, + "eval_runtime": 180.2039, + "eval_samples_per_second": 92.351, + "eval_steps_per_second": 5.777, + "step": 36000 + }, + { + "epoch": 10.504079254079254, + "grad_norm": 0.3587784767150879, + "learning_rate": 0.0004742309037900874, + "loss": 3.3685, + "step": 36050 + }, + { + "epoch": 10.518648018648019, + "grad_norm": 0.36644667387008667, + "learning_rate": 0.00047405597667638476, + "loss": 3.3731, + "step": 36100 + }, + { + "epoch": 10.533216783216783, + "grad_norm": 0.3659219741821289, + "learning_rate": 0.00047388104956268214, + "loss": 3.3799, + "step": 36150 + }, + { + "epoch": 10.547785547785548, + "grad_norm": 0.36219388246536255, + "learning_rate": 0.00047370612244897957, + "loss": 3.366, + "step": 36200 + }, + { + "epoch": 10.562354312354312, + "grad_norm": 0.3452727496623993, + "learning_rate": 0.00047353119533527694, + "loss": 3.3727, + "step": 36250 + }, + { + "epoch": 10.576923076923077, + "grad_norm": 0.34664297103881836, + "learning_rate": 0.0004733562682215743, + "loss": 3.359, + "step": 36300 + }, + { + "epoch": 10.591491841491841, + "grad_norm": 0.34712809324264526, + "learning_rate": 0.0004731813411078717, + "loss": 3.3701, + "step": 36350 + }, + { + "epoch": 10.606060606060606, + "grad_norm": 0.34347906708717346, + "learning_rate": 0.00047300641399416907, + "loss": 3.3803, + "step": 36400 + }, + { + "epoch": 10.62062937062937, + "grad_norm": 0.37337714433670044, + "learning_rate": 0.00047283148688046645, + "loss": 3.3882, + "step": 36450 + }, + { + "epoch": 10.635198135198134, + "grad_norm": 0.36376672983169556, + "learning_rate": 0.00047265655976676377, + "loss": 3.383, + "step": 36500 + }, + { + "epoch": 10.649766899766899, + "grad_norm": 0.34523946046829224, + "learning_rate": 0.00047248163265306114, + "loss": 3.3846, + "step": 36550 + }, + { + "epoch": 10.664335664335665, + "grad_norm": 0.3508089482784271, + "learning_rate": 0.0004723067055393585, + "loss": 3.3739, + "step": 36600 + }, + { + "epoch": 10.67890442890443, + "grad_norm": 0.3470657467842102, + "learning_rate": 0.00047213177842565595, + "loss": 3.3717, + "step": 36650 + }, + { + "epoch": 10.693473193473194, + "grad_norm": 0.3334925174713135, + "learning_rate": 0.0004719568513119533, + "loss": 3.3814, + "step": 36700 + }, + { + "epoch": 10.708041958041958, + "grad_norm": 0.3517080545425415, + "learning_rate": 0.0004717819241982507, + "loss": 3.3845, + "step": 36750 + }, + { + "epoch": 10.722610722610723, + "grad_norm": 0.3703469932079315, + "learning_rate": 0.0004716069970845481, + "loss": 3.3785, + "step": 36800 + }, + { + "epoch": 10.737179487179487, + "grad_norm": 0.3503482937812805, + "learning_rate": 0.00047143206997084545, + "loss": 3.3877, + "step": 36850 + }, + { + "epoch": 10.751748251748252, + "grad_norm": 0.36413902044296265, + "learning_rate": 0.00047125714285714283, + "loss": 3.3901, + "step": 36900 + }, + { + "epoch": 10.766317016317016, + "grad_norm": 0.35273477435112, + "learning_rate": 0.00047108221574344015, + "loss": 3.3989, + "step": 36950 + }, + { + "epoch": 10.78088578088578, + "grad_norm": 0.3469065725803375, + "learning_rate": 0.0004709072886297375, + "loss": 3.3929, + "step": 37000 + }, + { + "epoch": 10.78088578088578, + "eval_accuracy": 0.36930116454936474, + "eval_loss": 3.5597054958343506, + "eval_runtime": 180.1588, + "eval_samples_per_second": 92.374, + "eval_steps_per_second": 5.778, + "step": 37000 + }, + { + "epoch": 10.795454545454545, + "grad_norm": 0.347210556268692, + "learning_rate": 0.00047073236151603495, + "loss": 3.3819, + "step": 37050 + }, + { + "epoch": 10.81002331002331, + "grad_norm": 0.35915273427963257, + "learning_rate": 0.00047055743440233233, + "loss": 3.3801, + "step": 37100 + }, + { + "epoch": 10.824592074592074, + "grad_norm": 0.3388284146785736, + "learning_rate": 0.0004703825072886297, + "loss": 3.3866, + "step": 37150 + }, + { + "epoch": 10.83916083916084, + "grad_norm": 0.3657146990299225, + "learning_rate": 0.0004702075801749271, + "loss": 3.4009, + "step": 37200 + }, + { + "epoch": 10.853729603729604, + "grad_norm": 0.35583174228668213, + "learning_rate": 0.00047003265306122446, + "loss": 3.387, + "step": 37250 + }, + { + "epoch": 10.868298368298369, + "grad_norm": 0.3616805672645569, + "learning_rate": 0.00046985772594752183, + "loss": 3.3672, + "step": 37300 + }, + { + "epoch": 10.882867132867133, + "grad_norm": 0.34906110167503357, + "learning_rate": 0.0004696827988338192, + "loss": 3.3822, + "step": 37350 + }, + { + "epoch": 10.897435897435898, + "grad_norm": 0.37446925044059753, + "learning_rate": 0.00046950787172011653, + "loss": 3.3935, + "step": 37400 + }, + { + "epoch": 10.912004662004662, + "grad_norm": 0.3785672187805176, + "learning_rate": 0.0004693329446064139, + "loss": 3.3824, + "step": 37450 + }, + { + "epoch": 10.926573426573427, + "grad_norm": 0.37299731373786926, + "learning_rate": 0.00046915801749271134, + "loss": 3.3865, + "step": 37500 + }, + { + "epoch": 10.941142191142191, + "grad_norm": 0.3548412621021271, + "learning_rate": 0.0004689830903790087, + "loss": 3.3952, + "step": 37550 + }, + { + "epoch": 10.955710955710956, + "grad_norm": 0.36777183413505554, + "learning_rate": 0.0004688081632653061, + "loss": 3.3878, + "step": 37600 + }, + { + "epoch": 10.97027972027972, + "grad_norm": 0.36412835121154785, + "learning_rate": 0.00046863323615160346, + "loss": 3.4091, + "step": 37650 + }, + { + "epoch": 10.984848484848484, + "grad_norm": 0.3270232379436493, + "learning_rate": 0.00046845830903790084, + "loss": 3.3996, + "step": 37700 + }, + { + "epoch": 10.999417249417249, + "grad_norm": 0.3319988250732422, + "learning_rate": 0.0004682833819241982, + "loss": 3.3991, + "step": 37750 + }, + { + "epoch": 11.013986013986013, + "grad_norm": 0.35844141244888306, + "learning_rate": 0.0004681084548104956, + "loss": 3.2767, + "step": 37800 + }, + { + "epoch": 11.028554778554778, + "grad_norm": 0.3383696377277374, + "learning_rate": 0.0004679335276967929, + "loss": 3.2718, + "step": 37850 + }, + { + "epoch": 11.043123543123544, + "grad_norm": 0.3634346127510071, + "learning_rate": 0.0004677586005830903, + "loss": 3.2706, + "step": 37900 + }, + { + "epoch": 11.057692307692308, + "grad_norm": 0.3992638885974884, + "learning_rate": 0.0004675836734693877, + "loss": 3.2881, + "step": 37950 + }, + { + "epoch": 11.072261072261073, + "grad_norm": 0.35264912247657776, + "learning_rate": 0.0004674087463556851, + "loss": 3.2905, + "step": 38000 + }, + { + "epoch": 11.072261072261073, + "eval_accuracy": 0.36926294751849176, + "eval_loss": 3.5672919750213623, + "eval_runtime": 180.0525, + "eval_samples_per_second": 92.429, + "eval_steps_per_second": 5.782, + "step": 38000 + }, + { + "epoch": 11.086829836829837, + "grad_norm": 0.38650333881378174, + "learning_rate": 0.00046723381924198247, + "loss": 3.3106, + "step": 38050 + }, + { + "epoch": 11.101398601398602, + "grad_norm": 0.3478892743587494, + "learning_rate": 0.00046705889212827985, + "loss": 3.3016, + "step": 38100 + }, + { + "epoch": 11.115967365967366, + "grad_norm": 0.3671860992908478, + "learning_rate": 0.0004668839650145772, + "loss": 3.2985, + "step": 38150 + }, + { + "epoch": 11.13053613053613, + "grad_norm": 0.3565201461315155, + "learning_rate": 0.0004667090379008746, + "loss": 3.3071, + "step": 38200 + }, + { + "epoch": 11.145104895104895, + "grad_norm": 0.3274824321269989, + "learning_rate": 0.000466534110787172, + "loss": 3.3222, + "step": 38250 + }, + { + "epoch": 11.15967365967366, + "grad_norm": 0.3710516691207886, + "learning_rate": 0.0004663591836734693, + "loss": 3.3109, + "step": 38300 + }, + { + "epoch": 11.174242424242424, + "grad_norm": 0.37232545018196106, + "learning_rate": 0.0004661842565597667, + "loss": 3.3054, + "step": 38350 + }, + { + "epoch": 11.188811188811188, + "grad_norm": 0.3739616274833679, + "learning_rate": 0.0004660093294460641, + "loss": 3.3147, + "step": 38400 + }, + { + "epoch": 11.203379953379953, + "grad_norm": 0.35690245032310486, + "learning_rate": 0.0004658344023323615, + "loss": 3.3187, + "step": 38450 + }, + { + "epoch": 11.217948717948717, + "grad_norm": 0.3522016704082489, + "learning_rate": 0.00046565947521865885, + "loss": 3.321, + "step": 38500 + }, + { + "epoch": 11.232517482517483, + "grad_norm": 0.379158079624176, + "learning_rate": 0.00046548454810495623, + "loss": 3.3273, + "step": 38550 + }, + { + "epoch": 11.247086247086248, + "grad_norm": 0.37325507402420044, + "learning_rate": 0.0004653096209912536, + "loss": 3.3222, + "step": 38600 + }, + { + "epoch": 11.261655011655012, + "grad_norm": 0.3767625093460083, + "learning_rate": 0.000465134693877551, + "loss": 3.3269, + "step": 38650 + }, + { + "epoch": 11.276223776223777, + "grad_norm": 0.3531850278377533, + "learning_rate": 0.0004649597667638484, + "loss": 3.3361, + "step": 38700 + }, + { + "epoch": 11.290792540792541, + "grad_norm": 0.35781583189964294, + "learning_rate": 0.0004647848396501457, + "loss": 3.3308, + "step": 38750 + }, + { + "epoch": 11.305361305361306, + "grad_norm": 0.35981640219688416, + "learning_rate": 0.0004646099125364431, + "loss": 3.3252, + "step": 38800 + }, + { + "epoch": 11.31993006993007, + "grad_norm": 0.36371827125549316, + "learning_rate": 0.0004644349854227405, + "loss": 3.3374, + "step": 38850 + }, + { + "epoch": 11.334498834498834, + "grad_norm": 0.37464508414268494, + "learning_rate": 0.00046426005830903786, + "loss": 3.3461, + "step": 38900 + }, + { + "epoch": 11.349067599067599, + "grad_norm": 0.38214632868766785, + "learning_rate": 0.00046408513119533523, + "loss": 3.3348, + "step": 38950 + }, + { + "epoch": 11.363636363636363, + "grad_norm": 0.40841469168663025, + "learning_rate": 0.0004639102040816326, + "loss": 3.3375, + "step": 39000 + }, + { + "epoch": 11.363636363636363, + "eval_accuracy": 0.3695302315528744, + "eval_loss": 3.563751220703125, + "eval_runtime": 180.0277, + "eval_samples_per_second": 92.441, + "eval_steps_per_second": 5.782, + "step": 39000 + }, + { + "epoch": 11.378205128205128, + "grad_norm": 0.35644689202308655, + "learning_rate": 0.00046373527696793, + "loss": 3.3485, + "step": 39050 + }, + { + "epoch": 11.392773892773892, + "grad_norm": 0.3444243371486664, + "learning_rate": 0.00046356034985422736, + "loss": 3.3417, + "step": 39100 + }, + { + "epoch": 11.407342657342657, + "grad_norm": 0.3749789893627167, + "learning_rate": 0.0004633854227405248, + "loss": 3.3419, + "step": 39150 + }, + { + "epoch": 11.421911421911421, + "grad_norm": 0.3557623326778412, + "learning_rate": 0.0004632104956268221, + "loss": 3.3325, + "step": 39200 + }, + { + "epoch": 11.436480186480187, + "grad_norm": 0.36125391721725464, + "learning_rate": 0.0004630355685131195, + "loss": 3.3398, + "step": 39250 + }, + { + "epoch": 11.451048951048952, + "grad_norm": 0.3687732517719269, + "learning_rate": 0.00046286064139941687, + "loss": 3.3518, + "step": 39300 + }, + { + "epoch": 11.465617715617716, + "grad_norm": 0.3502034842967987, + "learning_rate": 0.00046268571428571424, + "loss": 3.3484, + "step": 39350 + }, + { + "epoch": 11.48018648018648, + "grad_norm": 0.3895909786224365, + "learning_rate": 0.0004625107871720116, + "loss": 3.3564, + "step": 39400 + }, + { + "epoch": 11.494755244755245, + "grad_norm": 0.3652609884738922, + "learning_rate": 0.000462335860058309, + "loss": 3.346, + "step": 39450 + }, + { + "epoch": 11.50932400932401, + "grad_norm": 0.372211754322052, + "learning_rate": 0.00046216093294460637, + "loss": 3.3468, + "step": 39500 + }, + { + "epoch": 11.523892773892774, + "grad_norm": 0.3634597063064575, + "learning_rate": 0.0004619860058309038, + "loss": 3.343, + "step": 39550 + }, + { + "epoch": 11.538461538461538, + "grad_norm": 0.3725431561470032, + "learning_rate": 0.0004618110787172012, + "loss": 3.3475, + "step": 39600 + }, + { + "epoch": 11.553030303030303, + "grad_norm": 0.3666999042034149, + "learning_rate": 0.0004616361516034985, + "loss": 3.3463, + "step": 39650 + }, + { + "epoch": 11.567599067599067, + "grad_norm": 0.33625391125679016, + "learning_rate": 0.00046146122448979587, + "loss": 3.3364, + "step": 39700 + }, + { + "epoch": 11.582167832167832, + "grad_norm": 0.35108792781829834, + "learning_rate": 0.00046128629737609325, + "loss": 3.3491, + "step": 39750 + }, + { + "epoch": 11.596736596736596, + "grad_norm": 0.36968687176704407, + "learning_rate": 0.0004611113702623906, + "loss": 3.3587, + "step": 39800 + }, + { + "epoch": 11.61130536130536, + "grad_norm": 0.37255340814590454, + "learning_rate": 0.000460936443148688, + "loss": 3.3613, + "step": 39850 + }, + { + "epoch": 11.625874125874127, + "grad_norm": 0.37071385979652405, + "learning_rate": 0.0004607615160349854, + "loss": 3.3637, + "step": 39900 + }, + { + "epoch": 11.640442890442891, + "grad_norm": 0.3244622051715851, + "learning_rate": 0.00046058658892128275, + "loss": 3.347, + "step": 39950 + }, + { + "epoch": 11.655011655011656, + "grad_norm": 0.33037108182907104, + "learning_rate": 0.0004604116618075802, + "loss": 3.352, + "step": 40000 + }, + { + "epoch": 11.655011655011656, + "eval_accuracy": 0.3698865318714751, + "eval_loss": 3.5570318698883057, + "eval_runtime": 179.9937, + "eval_samples_per_second": 92.459, + "eval_steps_per_second": 5.784, + "step": 40000 + } + ], + "logging_steps": 50, + "max_steps": 171600, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 10000, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 20, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.36123005550592e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}