{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.564458263573307,
  "eval_steps": 500,
  "global_step": 3842,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.09567089213106912,
      "grad_norm": 19.851242065429688,
      "learning_rate": 0.00011666666666666667,
      "loss": 121.1525,
      "step": 50
    },
    {
      "epoch": 0.19134178426213824,
      "grad_norm": 13.892197608947754,
      "learning_rate": 0.00023333333333333333,
      "loss": 77.1477,
      "step": 100
    },
    {
      "epoch": 0.2870126763932074,
      "grad_norm": 12.60777759552002,
      "learning_rate": 0.00035,
      "loss": 64.8839,
      "step": 150
    },
    {
      "epoch": 0.3826835685242765,
      "grad_norm": 8.890196800231934,
      "learning_rate": 0.00046666666666666666,
      "loss": 59.8849,
      "step": 200
    },
    {
      "epoch": 0.4783544606553456,
      "grad_norm": 8.705379486083984,
      "learning_rate": 0.0005833333333333334,
      "loss": 57.5299,
      "step": 250
    },
    {
      "epoch": 0.5740253527864148,
      "grad_norm": 7.457199573516846,
      "learning_rate": 0.0007,
      "loss": 56.2806,
      "step": 300
    },
    {
      "epoch": 0.6696962449174838,
      "grad_norm": 7.943981647491455,
      "learning_rate": 0.000699821634561209,
      "loss": 55.3251,
      "step": 350
    },
    {
      "epoch": 0.765367137048553,
      "grad_norm": 7.716305732727051,
      "learning_rate": 0.0006992867200404345,
      "loss": 52.4636,
      "step": 400
    },
    {
      "epoch": 0.8610380291796221,
      "grad_norm": 7.123641014099121,
      "learning_rate": 0.0006983958016391807,
      "loss": 52.215,
      "step": 450
    },
    {
      "epoch": 0.9567089213106912,
      "grad_norm": 7.223691463470459,
      "learning_rate": 0.0006971497874091708,
      "loss": 52.1157,
      "step": 500
    },
    {
      "epoch": 0.9988041138483617,
      "eval_accuracy": 0.0,
      "eval_loss": 8.116789817810059,
      "eval_normalizer": 685885.0,
      "eval_runtime": 103.9352,
      "eval_samples_per_second": 514.33,
      "eval_steps_per_second": 1.01,
      "step": 522
    },
    {
      "epoch": 1.0535756995933987,
      "grad_norm": 7.065810203552246,
      "learning_rate": 0.0006955499473268326,
      "loss": 50.0351,
      "step": 550
    },
    {
      "epoch": 1.1492465917244679,
      "grad_norm": 7.027311325073242,
      "learning_rate": 0.0006935979119988993,
      "loss": 48.6995,
      "step": 600
    },
    {
      "epoch": 1.244917483855537,
      "grad_norm": 6.899056911468506,
      "learning_rate": 0.0006912956710004438,
      "loss": 48.9226,
      "step": 650
    },
    {
      "epoch": 1.3405883759866062,
      "grad_norm": 7.2765116691589355,
      "learning_rate": 0.0006886455708470427,
      "loss": 48.9019,
      "step": 700
    },
    {
      "epoch": 1.4362592681176751,
      "grad_norm": 8.293883323669434,
      "learning_rate": 0.0006856503126031346,
      "loss": 44.5473,
      "step": 750
    },
    {
      "epoch": 1.5319301602487443,
      "grad_norm": 7.498640537261963,
      "learning_rate": 0.0006823129491290102,
      "loss": 45.238,
      "step": 800
    },
    {
      "epoch": 1.6276010523798135,
      "grad_norm": 7.582442283630371,
      "learning_rate": 0.0006786368819692442,
      "loss": 45.6905,
      "step": 850
    },
    {
      "epoch": 1.7232719445108826,
      "grad_norm": 8.016205787658691,
      "learning_rate": 0.0006746258578857331,
      "loss": 42.9074,
      "step": 900
    },
    {
      "epoch": 1.8189428366419516,
      "grad_norm": 8.324728012084961,
      "learning_rate": 0.000670283965038881,
      "loss": 40.5995,
      "step": 950
    },
    {
      "epoch": 1.9146137287730207,
      "grad_norm": 8.61531925201416,
      "learning_rate": 0.0006656156288208179,
      "loss": 41.5048,
      "step": 1000
    },
    {
      "epoch": 1.9988041138483617,
      "eval_accuracy": 0.0,
      "eval_loss": 8.638134956359863,
      "eval_normalizer": 685885.0,
      "eval_runtime": 104.5141,
      "eval_samples_per_second": 511.481,
      "eval_steps_per_second": 1.005,
      "step": 1044
    },
    {
      "epoch": 2.0114805070557282,
      "grad_norm": 119.22761535644531,
      "learning_rate": 0.000660625607344904,
      "loss": 52.7863,
      "step": 1050
    },
    {
      "epoch": 2.1071513991867974,
      "grad_norm": 7.394373893737793,
      "learning_rate": 0.0006553189865961112,
      "loss": 87.7065,
      "step": 1100
    },
    {
      "epoch": 2.2028222913178666,
      "grad_norm": 5.8150200843811035,
      "learning_rate": 0.0006497011752472301,
      "loss": 68.8677,
      "step": 1150
    },
    {
      "epoch": 2.2984931834489357,
      "grad_norm": 8.597038269042969,
      "learning_rate": 0.0006437778991461825,
      "loss": 59.723,
      "step": 1200
    },
    {
      "epoch": 2.3597225544128198,
      "eval_accuracy": 0.0,
      "eval_loss": 5.760004997253418,
      "eval_normalizer": 685885.0,
      "eval_runtime": 102.8572,
      "eval_samples_per_second": 519.721,
      "eval_steps_per_second": 1.021,
      "step": 1232
    },
    {
      "epoch": 3.0344415211671847,
      "grad_norm": 9.546870231628418,
      "learning_rate": 0.0006375551954800587,
      "loss": 49.7983,
      "step": 1250
    },
    {
      "epoch": 3.130112413298254,
      "grad_norm": 11.537946701049805,
      "learning_rate": 0.0006310394066218296,
      "loss": 38.324,
      "step": 1300
    },
    {
      "epoch": 3.225783305429323,
      "grad_norm": 11.162705421447754,
      "learning_rate": 0.0006242371736660025,
      "loss": 27.47,
      "step": 1350
    },
    {
      "epoch": 3.321454197560392,
      "grad_norm": 12.32886791229248,
      "learning_rate": 0.000617155429659811,
      "loss": 18.7372,
      "step": 1400
    },
    {
      "epoch": 3.3597225544128198,
      "eval_accuracy": 0.0,
      "eval_loss": 7.427099227905273,
      "eval_normalizer": 685885.0,
      "eval_runtime": 110.0904,
      "eval_samples_per_second": 485.573,
      "eval_steps_per_second": 0.954,
      "step": 1420
    },
    {
      "epoch": 4.057402535278642,
      "grad_norm": 13.549798011779785,
      "learning_rate": 0.0006098013925368385,
      "loss": 57.3773,
      "step": 1450
    },
    {
      "epoch": 4.15307342740971,
      "grad_norm": 10.063131332397461,
      "learning_rate": 0.0006021825577602754,
      "loss": 71.4705,
      "step": 1500
    },
    {
      "epoch": 4.24874431954078,
      "grad_norm": 9.314085006713867,
      "learning_rate": 0.0005943066906833104,
      "loss": 69.1017,
      "step": 1550
    },
    {
      "epoch": 4.344415211671849,
      "grad_norm": 8.48709774017334,
      "learning_rate": 0.0005861818186344407,
      "loss": 66.7627,
      "step": 1600
    },
    {
      "epoch": 4.440086103802918,
      "grad_norm": 9.457844734191895,
      "learning_rate": 0.00057781622273577,
      "loss": 62.5878,
      "step": 1650
    },
    {
      "epoch": 4.535756995933987,
      "grad_norm": 8.216409683227539,
      "learning_rate": 0.0005692184294626307,
      "loss": 63.0275,
      "step": 1700
    },
    {
      "epoch": 4.631427888065057,
      "grad_norm": 8.80875301361084,
      "learning_rate": 0.0005603972019531362,
      "loss": 62.8742,
      "step": 1750
    },
    {
      "epoch": 4.727098780196125,
      "grad_norm": 9.469932556152344,
      "learning_rate": 0.0005513615310765172,
      "loss": 56.6002,
      "step": 1800
    },
    {
      "epoch": 4.822769672327194,
      "grad_norm": 9.208184242248535,
      "learning_rate": 0.0005421206262693491,
      "loss": 55.5842,
      "step": 1850
    },
    {
      "epoch": 4.918440564458264,
      "grad_norm": 9.542802810668945,
      "learning_rate": 0.0005326839061490078,
      "loss": 56.4509,
      "step": 1900
    },
    {
      "epoch": 4.998804113848362,
      "eval_accuracy": 0.0,
      "eval_loss": 5.560527801513672,
      "eval_normalizer": 685885.0,
      "eval_runtime": 102.9429,
      "eval_samples_per_second": 519.288,
      "eval_steps_per_second": 1.02,
      "step": 1942
    },
    {
      "epoch": 5.015307342740971,
      "grad_norm": 10.22252082824707,
      "learning_rate": 0.0005230609889139216,
      "loss": 52.8616,
      "step": 1950
    },
    {
      "epoch": 5.11097823487204,
      "grad_norm": 11.31312370300293,
      "learning_rate": 0.0005132616825404055,
      "loss": 45.5469,
      "step": 2000
    },
    {
      "epoch": 5.206649127003109,
      "grad_norm": 11.319524765014648,
      "learning_rate": 0.0005032959747860662,
      "loss": 47.203,
      "step": 2050
    },
    {
      "epoch": 5.302320019134179,
      "grad_norm": 11.748647689819336,
      "learning_rate": 0.000493174023009969,
      "loss": 48.3289,
      "step": 2100
    },
    {
      "epoch": 5.397990911265247,
      "grad_norm": 12.092212677001953,
      "learning_rate": 0.00048290614381994235,
      "loss": 39.7512,
      "step": 2150
    },
    {
      "epoch": 5.493661803396317,
      "grad_norm": 12.723077774047852,
      "learning_rate": 0.00047250280255757023,
      "loss": 38.0036,
      "step": 2200
    },
    {
      "epoch": 5.589332695527386,
      "grad_norm": 13.034537315368652,
      "learning_rate": 0.0004619746026315906,
      "loss": 39.4596,
      "step": 2250
    },
    {
      "epoch": 5.685003587658455,
      "grad_norm": 11.695505142211914,
      "learning_rate": 0.00045133227471057203,
      "loss": 36.7652,
      "step": 2300
    },
    {
      "epoch": 5.780674479789524,
      "grad_norm": 12.36651611328125,
      "learning_rate": 0.00044058666578588224,
      "loss": 30.5917,
      "step": 2350
    },
    {
      "epoch": 5.876345371920593,
      "grad_norm": 12.983572006225586,
      "learning_rate": 0.0004297487281160982,
      "loss": 32.1428,
      "step": 2400
    },
    {
      "epoch": 5.972016264051662,
      "grad_norm": 13.80376148223877,
      "learning_rate": 0.00041882950806412285,
      "loss": 33.0843,
      "step": 2450
    },
    {
      "epoch": 5.998804113848362,
      "eval_accuracy": 0.0,
      "eval_loss": 6.923346042633057,
      "eval_normalizer": 685885.0,
      "eval_runtime": 103.7481,
      "eval_samples_per_second": 515.258,
      "eval_steps_per_second": 1.012,
      "step": 2464
    },
    {
      "epoch": 6.068883042334369,
      "grad_norm": 9.631818771362305,
      "learning_rate": 0.0004078401348383897,
      "loss": 65.9342,
      "step": 2500
    },
    {
      "epoch": 6.164553934465439,
      "grad_norm": 7.13682746887207,
      "learning_rate": 0.00039679180914962693,
      "loss": 70.9535,
      "step": 2550
    },
    {
      "epoch": 6.260224826596508,
      "grad_norm": 7.001780033111572,
      "learning_rate": 0.00038569579179474536,
      "loss": 65.2834,
      "step": 2600
    },
    {
      "epoch": 6.355895718727577,
      "grad_norm": 8.191680908203125,
      "learning_rate": 0.00037456339217948394,
      "loss": 60.6176,
      "step": 2650
    },
    {
      "epoch": 6.451566610858646,
      "grad_norm": 8.527460098266602,
      "learning_rate": 0.0003634059567915124,
      "loss": 57.9774,
      "step": 2700
    },
    {
      "epoch": 6.547237502989716,
      "grad_norm": 10.255255699157715,
      "learning_rate": 0.00035223485763573775,
      "loss": 48.2276,
      "step": 2750
    },
    {
      "epoch": 6.642908395120784,
      "grad_norm": 11.824259757995605,
      "learning_rate": 0.00034106148064360405,
      "loss": 44.7545,
      "step": 2800
    },
    {
      "epoch": 6.738579287251854,
      "grad_norm": 12.938841819763184,
      "learning_rate": 0.0003298972140681969,
      "loss": 39.2877,
      "step": 2850
    },
    {
      "epoch": 6.753886629992825,
      "eval_accuracy": 0.0,
      "eval_loss": 5.779562950134277,
      "eval_normalizer": 685885.0,
      "eval_runtime": 104.2115,
      "eval_samples_per_second": 512.967,
      "eval_steps_per_second": 1.008,
      "step": 2858
    },
    {
      "epoch": 7.080363549390098,
      "grad_norm": 13.708643913269043,
      "learning_rate": 0.00031875343687698203,
      "loss": 31.5294,
      "step": 2900
    },
    {
      "epoch": 7.176034441521167,
      "grad_norm": 13.05090618133545,
      "learning_rate": 0.000307641507154008,
      "loss": 28.8423,
      "step": 2950
    },
    {
      "epoch": 7.271705333652236,
      "grad_norm": 15.017513275146484,
      "learning_rate": 0.0002965727505233939,
      "loss": 23.1749,
      "step": 3000
    },
    {
      "epoch": 7.367376225783305,
      "grad_norm": 13.55999755859375,
      "learning_rate": 0.0002855584486059016,
      "loss": 19.6325,
      "step": 3050
    },
    {
      "epoch": 7.463047117914375,
      "grad_norm": 10.594040870666504,
      "learning_rate": 0.00027460982752035653,
      "loss": 18.3422,
      "step": 3100
    },
    {
      "epoch": 7.558718010045443,
      "grad_norm": 13.867926597595215,
      "learning_rate": 0.00026373804644163934,
      "loss": 14.5862,
      "step": 3150
    },
    {
      "epoch": 7.654388902176513,
      "grad_norm": 11.124190330505371,
      "learning_rate": 0.00025295418622690716,
      "loss": 13.728,
      "step": 3200
    },
    {
      "epoch": 7.750059794307582,
      "grad_norm": 13.195384979248047,
      "learning_rate": 0.00024226923812164069,
      "loss": 13.0752,
      "step": 3250
    },
    {
      "epoch": 7.753886629992825,
      "eval_accuracy": 0.0,
      "eval_loss": 7.642109394073486,
      "eval_normalizer": 685885.0,
      "eval_runtime": 103.2383,
      "eval_samples_per_second": 517.802,
      "eval_steps_per_second": 1.017,
      "step": 3252
    },
    {
      "epoch": 8.091844056445826,
      "grad_norm": 7.070100784301758,
      "learning_rate": 0.0002316940925570241,
      "loss": 83.8433,
      "step": 3300
    },
    {
      "epoch": 8.187514948576895,
      "grad_norm": 7.2077250480651855,
      "learning_rate": 0.0002212395280500796,
      "loss": 73.3812,
      "step": 3350
    },
    {
      "epoch": 8.283185840707965,
      "grad_norm": 8.034393310546875,
      "learning_rate": 0.00021091620021786818,
      "loss": 68.4622,
      "step": 3400
    },
    {
      "epoch": 8.378856732839033,
      "grad_norm": 9.863322257995605,
      "learning_rate": 0.0002007346309169531,
      "loss": 63.8324,
      "step": 3450
    },
    {
      "epoch": 8.474527624970102,
      "grad_norm": 11.584691047668457,
      "learning_rate": 0.00019070519751919701,
      "loss": 58.9669,
      "step": 3500
    },
    {
      "epoch": 8.564458263573307,
      "eval_accuracy": 0.0,
      "eval_loss": 5.5344061851501465,
      "eval_normalizer": 685885.0,
      "eval_runtime": 103.9205,
      "eval_samples_per_second": 514.403,
      "eval_steps_per_second": 1.01,
      "step": 3547
    },
    {
      "epoch": 9.005740253527865,
      "grad_norm": 11.757286071777344,
      "learning_rate": 0.0001808381223348215,
      "loss": 54.5028,
      "step": 3550
    },
    {
      "epoch": 9.101411145658933,
      "grad_norm": 14.31762981414795,
      "learning_rate": 0.0001711434621935113,
      "loss": 46.9798,
      "step": 3600
    },
    {
      "epoch": 9.197082037790002,
      "grad_norm": 16.303457260131836,
      "learning_rate": 0.00016163109819418135,
      "loss": 40.9249,
      "step": 3650
    },
    {
      "epoch": 9.292752929921072,
      "grad_norm": 17.622509002685547,
      "learning_rate": 0.0001523107256338523,
      "loss": 35.8961,
      "step": 3700
    },
    {
      "epoch": 9.388423822052141,
      "grad_norm": 17.22403335571289,
      "learning_rate": 0.00014319184412590392,
      "loss": 31.1963,
      "step": 3750
    },
    {
      "epoch": 9.48409471418321,
      "grad_norm": 17.277957916259766,
      "learning_rate": 0.00013428374791777268,
      "loss": 26.8592,
      "step": 3800
    },
    {
      "epoch": 9.564458263573307,
      "eval_accuracy": 0.0,
      "eval_loss": 7.6266303062438965,
      "eval_normalizer": 685885.0,
      "eval_runtime": 104.3587,
      "eval_samples_per_second": 512.243,
      "eval_steps_per_second": 1.006,
      "step": 3842
    }
  ],
  "logging_steps": 50,
  "max_steps": 5220,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 4.780357216788741e+17,
  "train_batch_size": 128,
  "trial_name": null,
  "trial_params": null
}