{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.999972803176589,
  "eval_steps": 1000,
  "global_step": 9192,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005439364682205118,
      "grad_norm": 0.7295230031013489,
      "learning_rate": 9.946692776327242e-05,
      "loss": 0.2937,
      "step": 50
    },
    {
      "epoch": 0.010878729364410237,
      "grad_norm": 0.6098010540008545,
      "learning_rate": 9.892297650130549e-05,
      "loss": 0.0771,
      "step": 100
    },
    {
      "epoch": 0.016318094046615356,
      "grad_norm": 0.7312902808189392,
      "learning_rate": 9.837902523933856e-05,
      "loss": 0.0662,
      "step": 150
    },
    {
      "epoch": 0.021757458728820473,
      "grad_norm": 0.531105637550354,
      "learning_rate": 9.783507397737163e-05,
      "loss": 0.0591,
      "step": 200
    },
    {
      "epoch": 0.02719682341102559,
      "grad_norm": 0.3680162727832794,
      "learning_rate": 9.72911227154047e-05,
      "loss": 0.0593,
      "step": 250
    },
    {
      "epoch": 0.03263618809323071,
      "grad_norm": 0.5968620777130127,
      "learning_rate": 9.674717145343778e-05,
      "loss": 0.0534,
      "step": 300
    },
    {
      "epoch": 0.03807555277543583,
      "grad_norm": 0.31027188897132874,
      "learning_rate": 9.620322019147084e-05,
      "loss": 0.0471,
      "step": 350
    },
    {
      "epoch": 0.043514917457640946,
      "grad_norm": 0.3172386884689331,
      "learning_rate": 9.565926892950392e-05,
      "loss": 0.0486,
      "step": 400
    },
    {
      "epoch": 0.04895428213984607,
      "grad_norm": 0.3238416314125061,
      "learning_rate": 9.5115317667537e-05,
      "loss": 0.0483,
      "step": 450
    },
    {
      "epoch": 0.05439364682205118,
      "grad_norm": 0.5560753345489502,
      "learning_rate": 9.457136640557006e-05,
      "loss": 0.0392,
      "step": 500
    },
    {
      "epoch": 0.0598330115042563,
      "grad_norm": 0.4322672188282013,
      "learning_rate": 9.402741514360314e-05,
      "loss": 0.0395,
      "step": 550
    },
    {
      "epoch": 0.06527237618646142,
      "grad_norm": 0.29789817333221436,
      "learning_rate": 9.348346388163621e-05,
      "loss": 0.0396,
      "step": 600
    },
    {
      "epoch": 0.07071174086866654,
      "grad_norm": 0.2505125403404236,
      "learning_rate": 9.293951261966927e-05,
      "loss": 0.0312,
      "step": 650
    },
    {
      "epoch": 0.07615110555087166,
      "grad_norm": 0.4463261663913727,
      "learning_rate": 9.239556135770236e-05,
      "loss": 0.0382,
      "step": 700
    },
    {
      "epoch": 0.08159047023307678,
      "grad_norm": 0.30546480417251587,
      "learning_rate": 9.185161009573543e-05,
      "loss": 0.0373,
      "step": 750
    },
    {
      "epoch": 0.08702983491528189,
      "grad_norm": 0.1717098206281662,
      "learning_rate": 9.130765883376849e-05,
      "loss": 0.0405,
      "step": 800
    },
    {
      "epoch": 0.09246919959748702,
      "grad_norm": 0.2477688491344452,
      "learning_rate": 9.076370757180158e-05,
      "loss": 0.0364,
      "step": 850
    },
    {
      "epoch": 0.09790856427969213,
      "grad_norm": 0.4148896038532257,
      "learning_rate": 9.021975630983465e-05,
      "loss": 0.0327,
      "step": 900
    },
    {
      "epoch": 0.10334792896189725,
      "grad_norm": 0.36908024549484253,
      "learning_rate": 8.967580504786771e-05,
      "loss": 0.0307,
      "step": 950
    },
    {
      "epoch": 0.10878729364410236,
      "grad_norm": 0.612694263458252,
      "learning_rate": 8.913185378590078e-05,
      "loss": 0.0291,
      "step": 1000
    },
    {
      "epoch": 0.10878729364410236,
      "eval_loss": 0.031227048486471176,
      "eval_runtime": 623.3239,
      "eval_samples_per_second": 26.218,
      "eval_steps_per_second": 3.278,
      "step": 1000
    },
    {
      "epoch": 0.11422665832630749,
      "grad_norm": 0.13878417015075684,
      "learning_rate": 8.858790252393387e-05,
      "loss": 0.0314,
      "step": 1050
    },
    {
      "epoch": 0.1196660230085126,
      "grad_norm": 0.3287246823310852,
      "learning_rate": 8.804395126196693e-05,
      "loss": 0.0289,
      "step": 1100
    },
    {
      "epoch": 0.12510538769071772,
      "grad_norm": 0.2922157943248749,
      "learning_rate": 8.75e-05,
      "loss": 0.0327,
      "step": 1150
    },
    {
      "epoch": 0.13054475237292285,
      "grad_norm": 0.2656305432319641,
      "learning_rate": 8.695604873803309e-05,
      "loss": 0.0306,
      "step": 1200
    },
    {
      "epoch": 0.13598411705512797,
      "grad_norm": 0.1364014893770218,
      "learning_rate": 8.641209747606614e-05,
      "loss": 0.0296,
      "step": 1250
    },
    {
      "epoch": 0.14142348173733307,
      "grad_norm": 0.19203521311283112,
      "learning_rate": 8.586814621409922e-05,
      "loss": 0.0305,
      "step": 1300
    },
    {
      "epoch": 0.1468628464195382,
      "grad_norm": 0.1480610966682434,
      "learning_rate": 8.53241949521323e-05,
      "loss": 0.0217,
      "step": 1350
    },
    {
      "epoch": 0.15230221110174333,
      "grad_norm": 0.04526514559984207,
      "learning_rate": 8.478024369016536e-05,
      "loss": 0.0216,
      "step": 1400
    },
    {
      "epoch": 0.15774157578394843,
      "grad_norm": 0.7239785194396973,
      "learning_rate": 8.423629242819843e-05,
      "loss": 0.0293,
      "step": 1450
    },
    {
      "epoch": 0.16318094046615356,
      "grad_norm": 0.28628188371658325,
      "learning_rate": 8.369234116623151e-05,
      "loss": 0.0266,
      "step": 1500
    },
    {
      "epoch": 0.16862030514835868,
      "grad_norm": 0.3383677005767822,
      "learning_rate": 8.314838990426458e-05,
      "loss": 0.0279,
      "step": 1550
    },
    {
      "epoch": 0.17405966983056378,
      "grad_norm": 0.1695743054151535,
      "learning_rate": 8.260443864229765e-05,
      "loss": 0.0273,
      "step": 1600
    },
    {
      "epoch": 0.1794990345127689,
      "grad_norm": 0.08721613138914108,
      "learning_rate": 8.206048738033073e-05,
      "loss": 0.0281,
      "step": 1650
    },
    {
      "epoch": 0.18493839919497404,
      "grad_norm": 0.28628501296043396,
      "learning_rate": 8.15165361183638e-05,
      "loss": 0.0286,
      "step": 1700
    },
    {
      "epoch": 0.19037776387717914,
      "grad_norm": 0.11635430157184601,
      "learning_rate": 8.097258485639687e-05,
      "loss": 0.0295,
      "step": 1750
    },
    {
      "epoch": 0.19581712855938427,
      "grad_norm": 0.17274004220962524,
      "learning_rate": 8.042863359442994e-05,
      "loss": 0.0276,
      "step": 1800
    },
    {
      "epoch": 0.2012564932415894,
      "grad_norm": 0.3431914150714874,
      "learning_rate": 7.988468233246302e-05,
      "loss": 0.0236,
      "step": 1850
    },
    {
      "epoch": 0.2066958579237945,
      "grad_norm": 0.29347464442253113,
      "learning_rate": 7.934073107049609e-05,
      "loss": 0.0259,
      "step": 1900
    },
    {
      "epoch": 0.21213522260599962,
      "grad_norm": 0.2548673450946808,
      "learning_rate": 7.879677980852916e-05,
      "loss": 0.0247,
      "step": 1950
    },
    {
      "epoch": 0.21757458728820472,
      "grad_norm": 0.1950218230485916,
      "learning_rate": 7.825282854656223e-05,
      "loss": 0.0245,
      "step": 2000
    },
    {
      "epoch": 0.21757458728820472,
      "eval_loss": 0.024380268529057503,
      "eval_runtime": 612.7819,
      "eval_samples_per_second": 26.669,
      "eval_steps_per_second": 3.334,
      "step": 2000
    },
    {
      "epoch": 0.22301395197040985,
      "grad_norm": 0.14043785631656647,
      "learning_rate": 7.77088772845953e-05,
      "loss": 0.0258,
      "step": 2050
    },
    {
      "epoch": 0.22845331665261498,
      "grad_norm": 0.2565617263317108,
      "learning_rate": 7.716492602262838e-05,
      "loss": 0.026,
      "step": 2100
    },
    {
      "epoch": 0.23389268133482008,
      "grad_norm": 0.19237670302391052,
      "learning_rate": 7.662097476066145e-05,
      "loss": 0.0219,
      "step": 2150
    },
    {
      "epoch": 0.2393320460170252,
      "grad_norm": 0.17301031947135925,
      "learning_rate": 7.607702349869452e-05,
      "loss": 0.0273,
      "step": 2200
    },
    {
      "epoch": 0.24477141069923034,
      "grad_norm": 0.4380134344100952,
      "learning_rate": 7.55330722367276e-05,
      "loss": 0.0206,
      "step": 2250
    },
    {
      "epoch": 0.25021077538143544,
      "grad_norm": 0.5898825526237488,
      "learning_rate": 7.498912097476067e-05,
      "loss": 0.0221,
      "step": 2300
    },
    {
      "epoch": 0.25565014006364056,
      "grad_norm": 0.38086146116256714,
      "learning_rate": 7.444516971279374e-05,
      "loss": 0.0219,
      "step": 2350
    },
    {
      "epoch": 0.2610895047458457,
      "grad_norm": 0.4513295888900757,
      "learning_rate": 7.390121845082681e-05,
      "loss": 0.0232,
      "step": 2400
    },
    {
      "epoch": 0.2665288694280508,
      "grad_norm": 0.09053909033536911,
      "learning_rate": 7.335726718885987e-05,
      "loss": 0.0221,
      "step": 2450
    },
    {
      "epoch": 0.27196823411025595,
      "grad_norm": 0.20339776575565338,
      "learning_rate": 7.281331592689296e-05,
      "loss": 0.0214,
      "step": 2500
    },
    {
      "epoch": 0.277407598792461,
      "grad_norm": 0.09539163112640381,
      "learning_rate": 7.226936466492603e-05,
      "loss": 0.0186,
      "step": 2550
    },
    {
      "epoch": 0.28284696347466615,
      "grad_norm": 0.8186866044998169,
      "learning_rate": 7.172541340295909e-05,
      "loss": 0.0216,
      "step": 2600
    },
    {
      "epoch": 0.2882863281568713,
      "grad_norm": 0.2770097553730011,
      "learning_rate": 7.118146214099218e-05,
      "loss": 0.0203,
      "step": 2650
    },
    {
      "epoch": 0.2937256928390764,
      "grad_norm": 0.2176240086555481,
      "learning_rate": 7.063751087902525e-05,
      "loss": 0.0284,
      "step": 2700
    },
    {
      "epoch": 0.29916505752128153,
      "grad_norm": 0.3172820210456848,
      "learning_rate": 7.009355961705831e-05,
      "loss": 0.0196,
      "step": 2750
    },
    {
      "epoch": 0.30460442220348666,
      "grad_norm": 0.5219048261642456,
      "learning_rate": 6.95496083550914e-05,
      "loss": 0.0209,
      "step": 2800
    },
    {
      "epoch": 0.31004378688569173,
      "grad_norm": 0.16620703041553497,
      "learning_rate": 6.900565709312445e-05,
      "loss": 0.0253,
      "step": 2850
    },
    {
      "epoch": 0.31548315156789686,
      "grad_norm": 0.33126798272132874,
      "learning_rate": 6.846170583115753e-05,
      "loss": 0.0217,
      "step": 2900
    },
    {
      "epoch": 0.320922516250102,
      "grad_norm": 0.07635556906461716,
      "learning_rate": 6.791775456919061e-05,
      "loss": 0.0232,
      "step": 2950
    },
    {
      "epoch": 0.3263618809323071,
      "grad_norm": 0.3889711797237396,
      "learning_rate": 6.737380330722367e-05,
      "loss": 0.0212,
      "step": 3000
    },
    {
      "epoch": 0.3263618809323071,
      "eval_loss": 0.021349932998418808,
      "eval_runtime": 613.1791,
      "eval_samples_per_second": 26.651,
      "eval_steps_per_second": 3.332,
      "step": 3000
    },
    {
      "epoch": 0.33180124561451224,
      "grad_norm": 0.3016066551208496,
      "learning_rate": 6.682985204525674e-05,
      "loss": 0.0211,
      "step": 3050
    },
    {
      "epoch": 0.33724061029671737,
      "grad_norm": 0.07768701761960983,
      "learning_rate": 6.628590078328982e-05,
      "loss": 0.0178,
      "step": 3100
    },
    {
      "epoch": 0.34267997497892244,
      "grad_norm": 0.39988207817077637,
      "learning_rate": 6.574194952132289e-05,
      "loss": 0.0183,
      "step": 3150
    },
    {
      "epoch": 0.34811933966112757,
      "grad_norm": 0.17848381400108337,
      "learning_rate": 6.519799825935596e-05,
      "loss": 0.0177,
      "step": 3200
    },
    {
      "epoch": 0.3535587043433327,
      "grad_norm": 0.3475474417209625,
      "learning_rate": 6.465404699738903e-05,
      "loss": 0.0202,
      "step": 3250
    },
    {
      "epoch": 0.3589980690255378,
      "grad_norm": 0.13056176900863647,
      "learning_rate": 6.411009573542211e-05,
      "loss": 0.0191,
      "step": 3300
    },
    {
      "epoch": 0.36443743370774295,
      "grad_norm": 0.1627858579158783,
      "learning_rate": 6.356614447345518e-05,
      "loss": 0.0189,
      "step": 3350
    },
    {
      "epoch": 0.3698767983899481,
      "grad_norm": 0.3630661070346832,
      "learning_rate": 6.302219321148825e-05,
      "loss": 0.0223,
      "step": 3400
    },
    {
      "epoch": 0.37531616307215315,
      "grad_norm": 0.18658381700515747,
      "learning_rate": 6.247824194952132e-05,
      "loss": 0.0208,
      "step": 3450
    },
    {
      "epoch": 0.3807555277543583,
      "grad_norm": 0.255504310131073,
      "learning_rate": 6.19342906875544e-05,
      "loss": 0.0177,
      "step": 3500
    },
    {
      "epoch": 0.3861948924365634,
      "grad_norm": 0.28671759366989136,
      "learning_rate": 6.139033942558747e-05,
      "loss": 0.0204,
      "step": 3550
    },
    {
      "epoch": 0.39163425711876854,
      "grad_norm": 0.10439465939998627,
      "learning_rate": 6.0846388163620536e-05,
      "loss": 0.0148,
      "step": 3600
    },
    {
      "epoch": 0.39707362180097366,
      "grad_norm": 0.1532258689403534,
      "learning_rate": 6.0302436901653615e-05,
      "loss": 0.0173,
      "step": 3650
    },
    {
      "epoch": 0.4025129864831788,
      "grad_norm": 0.45614758133888245,
      "learning_rate": 5.975848563968669e-05,
      "loss": 0.015,
      "step": 3700
    },
    {
      "epoch": 0.40795235116538386,
      "grad_norm": 0.19495117664337158,
      "learning_rate": 5.9214534377719754e-05,
      "loss": 0.0186,
      "step": 3750
    },
    {
      "epoch": 0.413391715847589,
      "grad_norm": 0.1256154626607895,
      "learning_rate": 5.867058311575283e-05,
      "loss": 0.0156,
      "step": 3800
    },
    {
      "epoch": 0.4188310805297941,
      "grad_norm": 0.10156747698783875,
      "learning_rate": 5.8126631853785905e-05,
      "loss": 0.0188,
      "step": 3850
    },
    {
      "epoch": 0.42427044521199925,
      "grad_norm": 0.2215280681848526,
      "learning_rate": 5.758268059181897e-05,
      "loss": 0.0191,
      "step": 3900
    },
    {
      "epoch": 0.4297098098942044,
      "grad_norm": 0.5749198198318481,
      "learning_rate": 5.703872932985205e-05,
      "loss": 0.0161,
      "step": 3950
    },
    {
      "epoch": 0.43514917457640945,
      "grad_norm": 0.2301347702741623,
      "learning_rate": 5.649477806788512e-05,
      "loss": 0.018,
      "step": 4000
    },
    {
      "epoch": 0.43514917457640945,
      "eval_loss": 0.017653847113251686,
      "eval_runtime": 611.9222,
      "eval_samples_per_second": 26.706,
      "eval_steps_per_second": 3.339,
      "step": 4000
    },
    {
      "epoch": 0.4405885392586146,
      "grad_norm": 0.18594112992286682,
      "learning_rate": 5.595082680591819e-05,
      "loss": 0.0177,
      "step": 4050
    },
    {
      "epoch": 0.4460279039408197,
      "grad_norm": 0.18527820706367493,
      "learning_rate": 5.540687554395126e-05,
      "loss": 0.0167,
      "step": 4100
    },
    {
      "epoch": 0.45146726862302483,
      "grad_norm": 0.6579691171646118,
      "learning_rate": 5.486292428198434e-05,
      "loss": 0.0162,
      "step": 4150
    },
    {
      "epoch": 0.45690663330522996,
      "grad_norm": 0.03042120486497879,
      "learning_rate": 5.431897302001741e-05,
      "loss": 0.0144,
      "step": 4200
    },
    {
      "epoch": 0.4623459979874351,
      "grad_norm": 0.2600632309913635,
      "learning_rate": 5.377502175805048e-05,
      "loss": 0.0154,
      "step": 4250
    },
    {
      "epoch": 0.46778536266964016,
      "grad_norm": 0.20093311369419098,
      "learning_rate": 5.323107049608356e-05,
      "loss": 0.0168,
      "step": 4300
    },
    {
      "epoch": 0.4732247273518453,
      "grad_norm": 0.307822048664093,
      "learning_rate": 5.2687119234116625e-05,
      "loss": 0.0148,
      "step": 4350
    },
    {
      "epoch": 0.4786640920340504,
      "grad_norm": 0.23011909425258636,
      "learning_rate": 5.21431679721497e-05,
      "loss": 0.0153,
      "step": 4400
    },
    {
      "epoch": 0.48410345671625554,
      "grad_norm": 0.10574093461036682,
      "learning_rate": 5.1599216710182777e-05,
      "loss": 0.0171,
      "step": 4450
    },
    {
      "epoch": 0.48954282139846067,
      "grad_norm": 0.2588224709033966,
      "learning_rate": 5.1055265448215836e-05,
      "loss": 0.0181,
      "step": 4500
    },
    {
      "epoch": 0.4949821860806658,
      "grad_norm": 0.10799703747034073,
      "learning_rate": 5.0511314186248915e-05,
      "loss": 0.0169,
      "step": 4550
    },
    {
      "epoch": 0.5004215507628709,
      "grad_norm": 0.10647980868816376,
      "learning_rate": 4.996736292428199e-05,
      "loss": 0.0134,
      "step": 4600
    },
    {
      "epoch": 0.505860915445076,
      "grad_norm": 0.026128219440579414,
      "learning_rate": 4.942341166231506e-05,
      "loss": 0.0164,
      "step": 4650
    },
    {
      "epoch": 0.5113002801272811,
      "grad_norm": 0.22313055396080017,
      "learning_rate": 4.887946040034813e-05,
      "loss": 0.0139,
      "step": 4700
    },
    {
      "epoch": 0.5167396448094862,
      "grad_norm": 0.41442468762397766,
      "learning_rate": 4.8335509138381205e-05,
      "loss": 0.0135,
      "step": 4750
    },
    {
      "epoch": 0.5221790094916914,
      "grad_norm": 0.13141965866088867,
      "learning_rate": 4.779155787641428e-05,
      "loss": 0.0138,
      "step": 4800
    },
    {
      "epoch": 0.5276183741738965,
      "grad_norm": 0.23391123116016388,
      "learning_rate": 4.724760661444735e-05,
      "loss": 0.0148,
      "step": 4850
    },
    {
      "epoch": 0.5330577388561016,
      "grad_norm": 0.2290557622909546,
      "learning_rate": 4.6703655352480416e-05,
      "loss": 0.014,
      "step": 4900
    },
    {
      "epoch": 0.5384971035383067,
      "grad_norm": 0.263883113861084,
      "learning_rate": 4.6159704090513496e-05,
      "loss": 0.0129,
      "step": 4950
    },
    {
      "epoch": 0.5439364682205119,
      "grad_norm": 0.4367043673992157,
      "learning_rate": 4.561575282854657e-05,
      "loss": 0.0138,
      "step": 5000
    },
    {
      "epoch": 0.5439364682205119,
      "eval_loss": 0.015788141638040543,
      "eval_runtime": 612.4211,
      "eval_samples_per_second": 26.684,
      "eval_steps_per_second": 3.336,
      "step": 5000
    },
    {
      "epoch": 0.549375832902717,
      "grad_norm": 0.17054913938045502,
      "learning_rate": 4.5071801566579634e-05,
      "loss": 0.0122,
      "step": 5050
    },
    {
      "epoch": 0.554815197584922,
      "grad_norm": 0.47613638639450073,
      "learning_rate": 4.4527850304612713e-05,
      "loss": 0.0148,
      "step": 5100
    },
    {
      "epoch": 0.5602545622671272,
      "grad_norm": 0.1685837209224701,
      "learning_rate": 4.398389904264578e-05,
      "loss": 0.0166,
      "step": 5150
    },
    {
      "epoch": 0.5656939269493323,
      "grad_norm": 0.23199672996997833,
      "learning_rate": 4.343994778067885e-05,
      "loss": 0.0139,
      "step": 5200
    },
    {
      "epoch": 0.5711332916315375,
      "grad_norm": 0.5401563048362732,
      "learning_rate": 4.2895996518711924e-05,
      "loss": 0.0134,
      "step": 5250
    },
    {
      "epoch": 0.5765726563137425,
      "grad_norm": 0.1670864224433899,
      "learning_rate": 4.2352045256745e-05,
      "loss": 0.0143,
      "step": 5300
    },
    {
      "epoch": 0.5820120209959476,
      "grad_norm": 0.08487512916326523,
      "learning_rate": 4.180809399477807e-05,
      "loss": 0.0152,
      "step": 5350
    },
    {
      "epoch": 0.5874513856781528,
      "grad_norm": 0.07271425426006317,
      "learning_rate": 4.126414273281114e-05,
      "loss": 0.0166,
      "step": 5400
    },
    {
      "epoch": 0.5928907503603579,
      "grad_norm": 0.27096590399742126,
      "learning_rate": 4.0720191470844215e-05,
      "loss": 0.0134,
      "step": 5450
    },
    {
      "epoch": 0.5983301150425631,
      "grad_norm": 0.1035747304558754,
      "learning_rate": 4.017624020887729e-05,
      "loss": 0.0144,
      "step": 5500
    },
    {
      "epoch": 0.6037694797247681,
      "grad_norm": 0.34544578194618225,
      "learning_rate": 3.963228894691035e-05,
      "loss": 0.0138,
      "step": 5550
    },
    {
      "epoch": 0.6092088444069733,
      "grad_norm": 0.4660258889198303,
      "learning_rate": 3.908833768494343e-05,
      "loss": 0.0166,
      "step": 5600
    },
    {
      "epoch": 0.6146482090891784,
      "grad_norm": 0.12394747883081436,
      "learning_rate": 3.8544386422976505e-05,
      "loss": 0.0124,
      "step": 5650
    },
    {
      "epoch": 0.6200875737713835,
      "grad_norm": 0.13275495171546936,
      "learning_rate": 3.800043516100957e-05,
      "loss": 0.0144,
      "step": 5700
    },
    {
      "epoch": 0.6255269384535886,
      "grad_norm": 0.23382355272769928,
      "learning_rate": 3.745648389904265e-05,
      "loss": 0.0163,
      "step": 5750
    },
    {
      "epoch": 0.6309663031357937,
      "grad_norm": 0.10067889094352722,
      "learning_rate": 3.691253263707572e-05,
      "loss": 0.0166,
      "step": 5800
    },
    {
      "epoch": 0.6364056678179989,
      "grad_norm": 0.09269619733095169,
      "learning_rate": 3.636858137510879e-05,
      "loss": 0.0155,
      "step": 5850
    },
    {
      "epoch": 0.641845032500204,
      "grad_norm": 0.22883062064647675,
      "learning_rate": 3.582463011314187e-05,
      "loss": 0.0119,
      "step": 5900
    },
    {
      "epoch": 0.647284397182409,
      "grad_norm": 0.3405645787715912,
      "learning_rate": 3.5280678851174934e-05,
      "loss": 0.0134,
      "step": 5950
    },
    {
      "epoch": 0.6527237618646142,
      "grad_norm": 0.10464873909950256,
      "learning_rate": 3.4736727589208007e-05,
      "loss": 0.0115,
      "step": 6000
    },
    {
      "epoch": 0.6527237618646142,
      "eval_loss": 0.013636166229844093,
      "eval_runtime": 612.7881,
      "eval_samples_per_second": 26.668,
      "eval_steps_per_second": 3.334,
      "step": 6000
    },
    {
      "epoch": 0.6581631265468193,
      "grad_norm": 0.15743671357631683,
      "learning_rate": 3.4192776327241086e-05,
      "loss": 0.0127,
      "step": 6050
    },
    {
      "epoch": 0.6636024912290245,
      "grad_norm": 0.28343307971954346,
      "learning_rate": 3.364882506527415e-05,
      "loss": 0.013,
      "step": 6100
    },
    {
      "epoch": 0.6690418559112296,
      "grad_norm": 0.0802890881896019,
      "learning_rate": 3.3104873803307224e-05,
      "loss": 0.0156,
      "step": 6150
    },
    {
      "epoch": 0.6744812205934347,
      "grad_norm": 0.2691664397716522,
      "learning_rate": 3.25609225413403e-05,
      "loss": 0.0131,
      "step": 6200
    },
    {
      "epoch": 0.6799205852756398,
      "grad_norm": 0.08483448624610901,
      "learning_rate": 3.201697127937337e-05,
      "loss": 0.0127,
      "step": 6250
    },
    {
      "epoch": 0.6853599499578449,
      "grad_norm": 0.2310999631881714,
      "learning_rate": 3.147302001740644e-05,
      "loss": 0.01,
      "step": 6300
    },
    {
      "epoch": 0.6907993146400501,
      "grad_norm": 0.10749073326587677,
      "learning_rate": 3.0929068755439515e-05,
      "loss": 0.0128,
      "step": 6350
    },
    {
      "epoch": 0.6962386793222551,
      "grad_norm": 0.14852339029312134,
      "learning_rate": 3.0385117493472587e-05,
      "loss": 0.0097,
      "step": 6400
    },
    {
      "epoch": 0.7016780440044603,
      "grad_norm": 0.09708331525325775,
      "learning_rate": 2.9841166231505656e-05,
      "loss": 0.0137,
      "step": 6450
    },
    {
      "epoch": 0.7071174086866654,
      "grad_norm": 0.13305231928825378,
      "learning_rate": 2.9297214969538732e-05,
      "loss": 0.0106,
      "step": 6500
    },
    {
      "epoch": 0.7125567733688705,
      "grad_norm": 0.2321113497018814,
      "learning_rate": 2.8753263707571805e-05,
      "loss": 0.0161,
      "step": 6550
    },
    {
      "epoch": 0.7179961380510756,
      "grad_norm": 0.1432623565196991,
      "learning_rate": 2.8209312445604874e-05,
      "loss": 0.0096,
      "step": 6600
    },
    {
      "epoch": 0.7234355027332807,
      "grad_norm": 0.1964827924966812,
      "learning_rate": 2.766536118363795e-05,
      "loss": 0.0174,
      "step": 6650
    },
    {
      "epoch": 0.7288748674154859,
      "grad_norm": 0.21941740810871124,
      "learning_rate": 2.712140992167102e-05,
      "loss": 0.0125,
      "step": 6700
    },
    {
      "epoch": 0.734314232097691,
      "grad_norm": 0.06487424671649933,
      "learning_rate": 2.6577458659704092e-05,
      "loss": 0.0143,
      "step": 6750
    },
    {
      "epoch": 0.7397535967798962,
      "grad_norm": 0.08458438515663147,
      "learning_rate": 2.603350739773716e-05,
      "loss": 0.0122,
      "step": 6800
    },
    {
      "epoch": 0.7451929614621012,
      "grad_norm": 0.21706067025661469,
      "learning_rate": 2.5489556135770237e-05,
      "loss": 0.0136,
      "step": 6850
    },
    {
      "epoch": 0.7506323261443063,
      "grad_norm": 0.04576512426137924,
      "learning_rate": 2.4945604873803306e-05,
      "loss": 0.0127,
      "step": 6900
    },
    {
      "epoch": 0.7560716908265115,
      "grad_norm": 0.05280361324548721,
      "learning_rate": 2.4401653611836382e-05,
      "loss": 0.0107,
      "step": 6950
    },
    {
      "epoch": 0.7615110555087166,
      "grad_norm": 0.5558903217315674,
      "learning_rate": 2.3857702349869455e-05,
      "loss": 0.0088,
      "step": 7000
    },
    {
      "epoch": 0.7615110555087166,
      "eval_loss": 0.01208407897502184,
      "eval_runtime": 610.9461,
      "eval_samples_per_second": 26.749,
      "eval_steps_per_second": 3.344,
      "step": 7000
    },
    {
      "epoch": 0.7669504201909217,
      "grad_norm": 0.470074325799942,
      "learning_rate": 2.3313751087902524e-05,
      "loss": 0.0135,
      "step": 7050
    },
    {
      "epoch": 0.7723897848731268,
      "grad_norm": 0.2360675036907196,
      "learning_rate": 2.2769799825935597e-05,
      "loss": 0.0133,
      "step": 7100
    },
    {
      "epoch": 0.7778291495553319,
      "grad_norm": 0.11457215994596481,
      "learning_rate": 2.222584856396867e-05,
      "loss": 0.011,
      "step": 7150
    },
    {
      "epoch": 0.7832685142375371,
      "grad_norm": 0.010224751196801662,
      "learning_rate": 2.1681897302001742e-05,
      "loss": 0.0148,
      "step": 7200
    },
    {
      "epoch": 0.7887078789197421,
      "grad_norm": 0.2885963022708893,
      "learning_rate": 2.1137946040034815e-05,
      "loss": 0.0132,
      "step": 7250
    },
    {
      "epoch": 0.7941472436019473,
      "grad_norm": 0.3628193438053131,
      "learning_rate": 2.0593994778067884e-05,
      "loss": 0.0105,
      "step": 7300
    },
    {
      "epoch": 0.7995866082841524,
      "grad_norm": 0.07265301048755646,
      "learning_rate": 2.005004351610096e-05,
      "loss": 0.0104,
      "step": 7350
    },
    {
      "epoch": 0.8050259729663576,
      "grad_norm": 0.13889417052268982,
      "learning_rate": 1.9506092254134032e-05,
      "loss": 0.0129,
      "step": 7400
    },
    {
      "epoch": 0.8104653376485627,
      "grad_norm": 0.06533674895763397,
      "learning_rate": 1.89621409921671e-05,
      "loss": 0.0097,
      "step": 7450
    },
    {
      "epoch": 0.8159047023307677,
      "grad_norm": 0.23986752331256866,
      "learning_rate": 1.8418189730200174e-05,
      "loss": 0.0119,
      "step": 7500
    },
    {
      "epoch": 0.8213440670129729,
      "grad_norm": 0.09028589725494385,
      "learning_rate": 1.7874238468233247e-05,
      "loss": 0.0093,
      "step": 7550
    },
    {
      "epoch": 0.826783431695178,
      "grad_norm": 0.11503873020410538,
      "learning_rate": 1.733028720626632e-05,
      "loss": 0.0091,
      "step": 7600
    },
    {
      "epoch": 0.8322227963773832,
      "grad_norm": 0.5392634868621826,
      "learning_rate": 1.6786335944299392e-05,
      "loss": 0.0118,
      "step": 7650
    },
    {
      "epoch": 0.8376621610595882,
      "grad_norm": 0.14633004367351532,
      "learning_rate": 1.6242384682332464e-05,
      "loss": 0.0112,
      "step": 7700
    },
    {
      "epoch": 0.8431015257417933,
      "grad_norm": 0.03690435364842415,
      "learning_rate": 1.5698433420365534e-05,
      "loss": 0.0117,
      "step": 7750
    },
    {
      "epoch": 0.8485408904239985,
      "grad_norm": 0.1749696284532547,
      "learning_rate": 1.5154482158398608e-05,
      "loss": 0.0108,
      "step": 7800
    },
    {
      "epoch": 0.8539802551062036,
      "grad_norm": 0.1117832213640213,
      "learning_rate": 1.4610530896431682e-05,
      "loss": 0.0142,
      "step": 7850
    },
    {
      "epoch": 0.8594196197884088,
      "grad_norm": 0.06253615021705627,
      "learning_rate": 1.4066579634464751e-05,
      "loss": 0.0101,
      "step": 7900
    },
    {
      "epoch": 0.8648589844706138,
      "grad_norm": 0.11594853550195694,
      "learning_rate": 1.3522628372497826e-05,
      "loss": 0.0102,
      "step": 7950
    },
    {
      "epoch": 0.8702983491528189,
      "grad_norm": 0.2881285846233368,
      "learning_rate": 1.2978677110530895e-05,
      "loss": 0.0096,
      "step": 8000
    },
    {
      "epoch": 0.8702983491528189,
      "eval_loss": 0.010398774407804012,
      "eval_runtime": 612.7356,
      "eval_samples_per_second": 26.671,
      "eval_steps_per_second": 3.334,
      "step": 8000
    },
    {
      "epoch": 0.8757377138350241,
      "grad_norm": 0.10731537640094757,
      "learning_rate": 1.243472584856397e-05,
      "loss": 0.0102,
      "step": 8050
    },
    {
      "epoch": 0.8811770785172292,
      "grad_norm": 0.08623083680868149,
      "learning_rate": 1.1890774586597042e-05,
      "loss": 0.0095,
      "step": 8100
    },
    {
      "epoch": 0.8866164431994343,
      "grad_norm": 0.16180342435836792,
      "learning_rate": 1.1346823324630114e-05,
      "loss": 0.0094,
      "step": 8150
    },
    {
      "epoch": 0.8920558078816394,
      "grad_norm": 0.09304390847682953,
      "learning_rate": 1.0802872062663185e-05,
      "loss": 0.0105,
      "step": 8200
    },
    {
      "epoch": 0.8974951725638446,
      "grad_norm": 0.08745424449443817,
      "learning_rate": 1.0258920800696258e-05,
      "loss": 0.0077,
      "step": 8250
    },
    {
      "epoch": 0.9029345372460497,
      "grad_norm": 0.18484659492969513,
      "learning_rate": 9.71496953872933e-06,
      "loss": 0.0116,
      "step": 8300
    },
    {
      "epoch": 0.9083739019282547,
      "grad_norm": 0.18615852296352386,
      "learning_rate": 9.171018276762403e-06,
      "loss": 0.0087,
      "step": 8350
    },
    {
      "epoch": 0.9138132666104599,
      "grad_norm": 0.038191672414541245,
      "learning_rate": 8.627067014795474e-06,
      "loss": 0.0091,
      "step": 8400
    },
    {
      "epoch": 0.919252631292665,
      "grad_norm": 0.049192268401384354,
      "learning_rate": 8.083115752828548e-06,
      "loss": 0.0113,
      "step": 8450
    },
    {
      "epoch": 0.9246919959748702,
      "grad_norm": 0.12032605707645416,
      "learning_rate": 7.539164490861619e-06,
      "loss": 0.0116,
      "step": 8500
    },
    {
      "epoch": 0.9301313606570752,
      "grad_norm": 0.2927582561969757,
      "learning_rate": 7.006092254134029e-06,
      "loss": 0.0114,
      "step": 8550
    },
    {
      "epoch": 0.9355707253392803,
      "grad_norm": 0.04057031497359276,
      "learning_rate": 6.462140992167103e-06,
      "loss": 0.0093,
      "step": 8600
    },
    {
      "epoch": 0.9410100900214855,
      "grad_norm": 0.23389148712158203,
      "learning_rate": 5.918189730200174e-06,
      "loss": 0.0123,
      "step": 8650
    },
    {
      "epoch": 0.9464494547036906,
      "grad_norm": 0.14039556682109833,
      "learning_rate": 5.374238468233247e-06,
      "loss": 0.0088,
      "step": 8700
    },
    {
      "epoch": 0.9518888193858958,
      "grad_norm": 0.10834332555532455,
      "learning_rate": 4.830287206266319e-06,
      "loss": 0.0126,
      "step": 8750
    },
    {
      "epoch": 0.9573281840681008,
      "grad_norm": 0.20089313387870789,
      "learning_rate": 4.286335944299391e-06,
      "loss": 0.007,
      "step": 8800
    },
    {
      "epoch": 0.962767548750306,
      "grad_norm": 0.10719793289899826,
      "learning_rate": 3.742384682332463e-06,
      "loss": 0.0073,
      "step": 8850
    },
    {
      "epoch": 0.9682069134325111,
      "grad_norm": 0.051126107573509216,
      "learning_rate": 3.1984334203655352e-06,
      "loss": 0.0098,
      "step": 8900
    },
    {
      "epoch": 0.9736462781147162,
      "grad_norm": 0.2430637627840042,
      "learning_rate": 2.6544821583986074e-06,
      "loss": 0.0078,
      "step": 8950
    },
    {
      "epoch": 0.9790856427969213,
      "grad_norm": 0.06034141406416893,
      "learning_rate": 2.1105308964316795e-06,
      "loss": 0.0087,
      "step": 9000
    },
    {
      "epoch": 0.9790856427969213,
      "eval_loss": 0.009567675180733204,
      "eval_runtime": 610.7757,
      "eval_samples_per_second": 26.756,
      "eval_steps_per_second": 3.345,
      "step": 9000
    },
    {
      "epoch": 0.9845250074791264,
      "grad_norm": 0.23722563683986664,
      "learning_rate": 1.566579634464752e-06,
      "loss": 0.0094,
      "step": 9050
    },
    {
      "epoch": 0.9899643721613316,
      "grad_norm": 0.1395604908466339,
      "learning_rate": 1.0226283724978243e-06,
      "loss": 0.01,
      "step": 9100
    },
    {
      "epoch": 0.9954037368435367,
      "grad_norm": 0.12211757898330688,
      "learning_rate": 4.786771105308965e-07,
      "loss": 0.0091,
      "step": 9150
    }
  ],
  "logging_steps": 50,
  "max_steps": 9192,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.365723691894702e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}