{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999972803176589, "eval_steps": 1000, "global_step": 9192, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005439364682205118, "grad_norm": 0.7295230031013489, "learning_rate": 9.946692776327242e-05, "loss": 0.2937, "step": 50 }, { "epoch": 0.010878729364410237, "grad_norm": 0.6098010540008545, "learning_rate": 9.892297650130549e-05, "loss": 0.0771, "step": 100 }, { "epoch": 0.016318094046615356, "grad_norm": 0.7312902808189392, "learning_rate": 9.837902523933856e-05, "loss": 0.0662, "step": 150 }, { "epoch": 0.021757458728820473, "grad_norm": 0.531105637550354, "learning_rate": 9.783507397737163e-05, "loss": 0.0591, "step": 200 }, { "epoch": 0.02719682341102559, "grad_norm": 0.3680162727832794, "learning_rate": 9.72911227154047e-05, "loss": 0.0593, "step": 250 }, { "epoch": 0.03263618809323071, "grad_norm": 0.5968620777130127, "learning_rate": 9.674717145343778e-05, "loss": 0.0534, "step": 300 }, { "epoch": 0.03807555277543583, "grad_norm": 0.31027188897132874, "learning_rate": 9.620322019147084e-05, "loss": 0.0471, "step": 350 }, { "epoch": 0.043514917457640946, "grad_norm": 0.3172386884689331, "learning_rate": 9.565926892950392e-05, "loss": 0.0486, "step": 400 }, { "epoch": 0.04895428213984607, "grad_norm": 0.3238416314125061, "learning_rate": 9.5115317667537e-05, "loss": 0.0483, "step": 450 }, { "epoch": 0.05439364682205118, "grad_norm": 0.5560753345489502, "learning_rate": 9.457136640557006e-05, "loss": 0.0392, "step": 500 }, { "epoch": 0.0598330115042563, "grad_norm": 0.4322672188282013, "learning_rate": 9.402741514360314e-05, "loss": 0.0395, "step": 550 }, { "epoch": 0.06527237618646142, "grad_norm": 0.29789817333221436, "learning_rate": 9.348346388163621e-05, "loss": 0.0396, "step": 600 }, { "epoch": 0.07071174086866654, "grad_norm": 0.2505125403404236, "learning_rate": 9.293951261966927e-05, "loss": 0.0312, "step": 650 }, { "epoch": 0.07615110555087166, "grad_norm": 0.4463261663913727, "learning_rate": 9.239556135770236e-05, "loss": 0.0382, "step": 700 }, { "epoch": 0.08159047023307678, "grad_norm": 0.30546480417251587, "learning_rate": 9.185161009573543e-05, "loss": 0.0373, "step": 750 }, { "epoch": 0.08702983491528189, "grad_norm": 0.1717098206281662, "learning_rate": 9.130765883376849e-05, "loss": 0.0405, "step": 800 }, { "epoch": 0.09246919959748702, "grad_norm": 0.2477688491344452, "learning_rate": 9.076370757180158e-05, "loss": 0.0364, "step": 850 }, { "epoch": 0.09790856427969213, "grad_norm": 0.4148896038532257, "learning_rate": 9.021975630983465e-05, "loss": 0.0327, "step": 900 }, { "epoch": 0.10334792896189725, "grad_norm": 0.36908024549484253, "learning_rate": 8.967580504786771e-05, "loss": 0.0307, "step": 950 }, { "epoch": 0.10878729364410236, "grad_norm": 0.612694263458252, "learning_rate": 8.913185378590078e-05, "loss": 0.0291, "step": 1000 }, { "epoch": 0.10878729364410236, "eval_loss": 0.031227048486471176, "eval_runtime": 623.3239, "eval_samples_per_second": 26.218, "eval_steps_per_second": 3.278, "step": 1000 }, { "epoch": 0.11422665832630749, "grad_norm": 0.13878417015075684, "learning_rate": 8.858790252393387e-05, "loss": 0.0314, "step": 1050 }, { "epoch": 0.1196660230085126, "grad_norm": 0.3287246823310852, "learning_rate": 8.804395126196693e-05, "loss": 0.0289, "step": 1100 }, { "epoch": 0.12510538769071772, "grad_norm": 0.2922157943248749, "learning_rate": 8.75e-05, "loss": 0.0327, "step": 1150 }, { "epoch": 0.13054475237292285, "grad_norm": 0.2656305432319641, "learning_rate": 8.695604873803309e-05, "loss": 0.0306, "step": 1200 }, { "epoch": 0.13598411705512797, "grad_norm": 0.1364014893770218, "learning_rate": 8.641209747606614e-05, "loss": 0.0296, "step": 1250 }, { "epoch": 0.14142348173733307, "grad_norm": 0.19203521311283112, "learning_rate": 8.586814621409922e-05, "loss": 0.0305, "step": 1300 }, { "epoch": 0.1468628464195382, "grad_norm": 0.1480610966682434, "learning_rate": 8.53241949521323e-05, "loss": 0.0217, "step": 1350 }, { "epoch": 0.15230221110174333, "grad_norm": 0.04526514559984207, "learning_rate": 8.478024369016536e-05, "loss": 0.0216, "step": 1400 }, { "epoch": 0.15774157578394843, "grad_norm": 0.7239785194396973, "learning_rate": 8.423629242819843e-05, "loss": 0.0293, "step": 1450 }, { "epoch": 0.16318094046615356, "grad_norm": 0.28628188371658325, "learning_rate": 8.369234116623151e-05, "loss": 0.0266, "step": 1500 }, { "epoch": 0.16862030514835868, "grad_norm": 0.3383677005767822, "learning_rate": 8.314838990426458e-05, "loss": 0.0279, "step": 1550 }, { "epoch": 0.17405966983056378, "grad_norm": 0.1695743054151535, "learning_rate": 8.260443864229765e-05, "loss": 0.0273, "step": 1600 }, { "epoch": 0.1794990345127689, "grad_norm": 0.08721613138914108, "learning_rate": 8.206048738033073e-05, "loss": 0.0281, "step": 1650 }, { "epoch": 0.18493839919497404, "grad_norm": 0.28628501296043396, "learning_rate": 8.15165361183638e-05, "loss": 0.0286, "step": 1700 }, { "epoch": 0.19037776387717914, "grad_norm": 0.11635430157184601, "learning_rate": 8.097258485639687e-05, "loss": 0.0295, "step": 1750 }, { "epoch": 0.19581712855938427, "grad_norm": 0.17274004220962524, "learning_rate": 8.042863359442994e-05, "loss": 0.0276, "step": 1800 }, { "epoch": 0.2012564932415894, "grad_norm": 0.3431914150714874, "learning_rate": 7.988468233246302e-05, "loss": 0.0236, "step": 1850 }, { "epoch": 0.2066958579237945, "grad_norm": 0.29347464442253113, "learning_rate": 7.934073107049609e-05, "loss": 0.0259, "step": 1900 }, { "epoch": 0.21213522260599962, "grad_norm": 0.2548673450946808, "learning_rate": 7.879677980852916e-05, "loss": 0.0247, "step": 1950 }, { "epoch": 0.21757458728820472, "grad_norm": 0.1950218230485916, "learning_rate": 7.825282854656223e-05, "loss": 0.0245, "step": 2000 }, { "epoch": 0.21757458728820472, "eval_loss": 0.024380268529057503, "eval_runtime": 612.7819, "eval_samples_per_second": 26.669, "eval_steps_per_second": 3.334, "step": 2000 }, { "epoch": 0.22301395197040985, "grad_norm": 0.14043785631656647, "learning_rate": 7.77088772845953e-05, "loss": 0.0258, "step": 2050 }, { "epoch": 0.22845331665261498, "grad_norm": 0.2565617263317108, "learning_rate": 7.716492602262838e-05, "loss": 0.026, "step": 2100 }, { "epoch": 0.23389268133482008, "grad_norm": 0.19237670302391052, "learning_rate": 7.662097476066145e-05, "loss": 0.0219, "step": 2150 }, { "epoch": 0.2393320460170252, "grad_norm": 0.17301031947135925, "learning_rate": 7.607702349869452e-05, "loss": 0.0273, "step": 2200 }, { "epoch": 0.24477141069923034, "grad_norm": 0.4380134344100952, "learning_rate": 7.55330722367276e-05, "loss": 0.0206, "step": 2250 }, { "epoch": 0.25021077538143544, "grad_norm": 0.5898825526237488, "learning_rate": 7.498912097476067e-05, "loss": 0.0221, "step": 2300 }, { "epoch": 0.25565014006364056, "grad_norm": 0.38086146116256714, "learning_rate": 7.444516971279374e-05, "loss": 0.0219, "step": 2350 }, { "epoch": 0.2610895047458457, "grad_norm": 0.4513295888900757, "learning_rate": 7.390121845082681e-05, "loss": 0.0232, "step": 2400 }, { "epoch": 0.2665288694280508, "grad_norm": 0.09053909033536911, "learning_rate": 7.335726718885987e-05, "loss": 0.0221, "step": 2450 }, { "epoch": 0.27196823411025595, "grad_norm": 0.20339776575565338, "learning_rate": 7.281331592689296e-05, "loss": 0.0214, "step": 2500 }, { "epoch": 0.277407598792461, "grad_norm": 0.09539163112640381, "learning_rate": 7.226936466492603e-05, "loss": 0.0186, "step": 2550 }, { "epoch": 0.28284696347466615, "grad_norm": 0.8186866044998169, "learning_rate": 7.172541340295909e-05, "loss": 0.0216, "step": 2600 }, { "epoch": 0.2882863281568713, "grad_norm": 0.2770097553730011, "learning_rate": 7.118146214099218e-05, "loss": 0.0203, "step": 2650 }, { "epoch": 0.2937256928390764, "grad_norm": 0.2176240086555481, "learning_rate": 7.063751087902525e-05, "loss": 0.0284, "step": 2700 }, { "epoch": 0.29916505752128153, "grad_norm": 0.3172820210456848, "learning_rate": 7.009355961705831e-05, "loss": 0.0196, "step": 2750 }, { "epoch": 0.30460442220348666, "grad_norm": 0.5219048261642456, "learning_rate": 6.95496083550914e-05, "loss": 0.0209, "step": 2800 }, { "epoch": 0.31004378688569173, "grad_norm": 0.16620703041553497, "learning_rate": 6.900565709312445e-05, "loss": 0.0253, "step": 2850 }, { "epoch": 0.31548315156789686, "grad_norm": 0.33126798272132874, "learning_rate": 6.846170583115753e-05, "loss": 0.0217, "step": 2900 }, { "epoch": 0.320922516250102, "grad_norm": 0.07635556906461716, "learning_rate": 6.791775456919061e-05, "loss": 0.0232, "step": 2950 }, { "epoch": 0.3263618809323071, "grad_norm": 0.3889711797237396, "learning_rate": 6.737380330722367e-05, "loss": 0.0212, "step": 3000 }, { "epoch": 0.3263618809323071, "eval_loss": 0.021349932998418808, "eval_runtime": 613.1791, "eval_samples_per_second": 26.651, "eval_steps_per_second": 3.332, "step": 3000 }, { "epoch": 0.33180124561451224, "grad_norm": 0.3016066551208496, "learning_rate": 6.682985204525674e-05, "loss": 0.0211, "step": 3050 }, { "epoch": 0.33724061029671737, "grad_norm": 0.07768701761960983, "learning_rate": 6.628590078328982e-05, "loss": 0.0178, "step": 3100 }, { "epoch": 0.34267997497892244, "grad_norm": 0.39988207817077637, "learning_rate": 6.574194952132289e-05, "loss": 0.0183, "step": 3150 }, { "epoch": 0.34811933966112757, "grad_norm": 0.17848381400108337, "learning_rate": 6.519799825935596e-05, "loss": 0.0177, "step": 3200 }, { "epoch": 0.3535587043433327, "grad_norm": 0.3475474417209625, "learning_rate": 6.465404699738903e-05, "loss": 0.0202, "step": 3250 }, { "epoch": 0.3589980690255378, "grad_norm": 0.13056176900863647, "learning_rate": 6.411009573542211e-05, "loss": 0.0191, "step": 3300 }, { "epoch": 0.36443743370774295, "grad_norm": 0.1627858579158783, "learning_rate": 6.356614447345518e-05, "loss": 0.0189, "step": 3350 }, { "epoch": 0.3698767983899481, "grad_norm": 0.3630661070346832, "learning_rate": 6.302219321148825e-05, "loss": 0.0223, "step": 3400 }, { "epoch": 0.37531616307215315, "grad_norm": 0.18658381700515747, "learning_rate": 6.247824194952132e-05, "loss": 0.0208, "step": 3450 }, { "epoch": 0.3807555277543583, "grad_norm": 0.255504310131073, "learning_rate": 6.19342906875544e-05, "loss": 0.0177, "step": 3500 }, { "epoch": 0.3861948924365634, "grad_norm": 0.28671759366989136, "learning_rate": 6.139033942558747e-05, "loss": 0.0204, "step": 3550 }, { "epoch": 0.39163425711876854, "grad_norm": 0.10439465939998627, "learning_rate": 6.0846388163620536e-05, "loss": 0.0148, "step": 3600 }, { "epoch": 0.39707362180097366, "grad_norm": 0.1532258689403534, "learning_rate": 6.0302436901653615e-05, "loss": 0.0173, "step": 3650 }, { "epoch": 0.4025129864831788, "grad_norm": 0.45614758133888245, "learning_rate": 5.975848563968669e-05, "loss": 0.015, "step": 3700 }, { "epoch": 0.40795235116538386, "grad_norm": 0.19495117664337158, "learning_rate": 5.9214534377719754e-05, "loss": 0.0186, "step": 3750 }, { "epoch": 0.413391715847589, "grad_norm": 0.1256154626607895, "learning_rate": 5.867058311575283e-05, "loss": 0.0156, "step": 3800 }, { "epoch": 0.4188310805297941, "grad_norm": 0.10156747698783875, "learning_rate": 5.8126631853785905e-05, "loss": 0.0188, "step": 3850 }, { "epoch": 0.42427044521199925, "grad_norm": 0.2215280681848526, "learning_rate": 5.758268059181897e-05, "loss": 0.0191, "step": 3900 }, { "epoch": 0.4297098098942044, "grad_norm": 0.5749198198318481, "learning_rate": 5.703872932985205e-05, "loss": 0.0161, "step": 3950 }, { "epoch": 0.43514917457640945, "grad_norm": 0.2301347702741623, "learning_rate": 5.649477806788512e-05, "loss": 0.018, "step": 4000 }, { "epoch": 0.43514917457640945, "eval_loss": 0.017653847113251686, "eval_runtime": 611.9222, "eval_samples_per_second": 26.706, "eval_steps_per_second": 3.339, "step": 4000 }, { "epoch": 0.4405885392586146, "grad_norm": 0.18594112992286682, "learning_rate": 5.595082680591819e-05, "loss": 0.0177, "step": 4050 }, { "epoch": 0.4460279039408197, "grad_norm": 0.18527820706367493, "learning_rate": 5.540687554395126e-05, "loss": 0.0167, "step": 4100 }, { "epoch": 0.45146726862302483, "grad_norm": 0.6579691171646118, "learning_rate": 5.486292428198434e-05, "loss": 0.0162, "step": 4150 }, { "epoch": 0.45690663330522996, "grad_norm": 0.03042120486497879, "learning_rate": 5.431897302001741e-05, "loss": 0.0144, "step": 4200 }, { "epoch": 0.4623459979874351, "grad_norm": 0.2600632309913635, "learning_rate": 5.377502175805048e-05, "loss": 0.0154, "step": 4250 }, { "epoch": 0.46778536266964016, "grad_norm": 0.20093311369419098, "learning_rate": 5.323107049608356e-05, "loss": 0.0168, "step": 4300 }, { "epoch": 0.4732247273518453, "grad_norm": 0.307822048664093, "learning_rate": 5.2687119234116625e-05, "loss": 0.0148, "step": 4350 }, { "epoch": 0.4786640920340504, "grad_norm": 0.23011909425258636, "learning_rate": 5.21431679721497e-05, "loss": 0.0153, "step": 4400 }, { "epoch": 0.48410345671625554, "grad_norm": 0.10574093461036682, "learning_rate": 5.1599216710182777e-05, "loss": 0.0171, "step": 4450 }, { "epoch": 0.48954282139846067, "grad_norm": 0.2588224709033966, "learning_rate": 5.1055265448215836e-05, "loss": 0.0181, "step": 4500 }, { "epoch": 0.4949821860806658, "grad_norm": 0.10799703747034073, "learning_rate": 5.0511314186248915e-05, "loss": 0.0169, "step": 4550 }, { "epoch": 0.5004215507628709, "grad_norm": 0.10647980868816376, "learning_rate": 4.996736292428199e-05, "loss": 0.0134, "step": 4600 }, { "epoch": 0.505860915445076, "grad_norm": 0.026128219440579414, "learning_rate": 4.942341166231506e-05, "loss": 0.0164, "step": 4650 }, { "epoch": 0.5113002801272811, "grad_norm": 0.22313055396080017, "learning_rate": 4.887946040034813e-05, "loss": 0.0139, "step": 4700 }, { "epoch": 0.5167396448094862, "grad_norm": 0.41442468762397766, "learning_rate": 4.8335509138381205e-05, "loss": 0.0135, "step": 4750 }, { "epoch": 0.5221790094916914, "grad_norm": 0.13141965866088867, "learning_rate": 4.779155787641428e-05, "loss": 0.0138, "step": 4800 }, { "epoch": 0.5276183741738965, "grad_norm": 0.23391123116016388, "learning_rate": 4.724760661444735e-05, "loss": 0.0148, "step": 4850 }, { "epoch": 0.5330577388561016, "grad_norm": 0.2290557622909546, "learning_rate": 4.6703655352480416e-05, "loss": 0.014, "step": 4900 }, { "epoch": 0.5384971035383067, "grad_norm": 0.263883113861084, "learning_rate": 4.6159704090513496e-05, "loss": 0.0129, "step": 4950 }, { "epoch": 0.5439364682205119, "grad_norm": 0.4367043673992157, "learning_rate": 4.561575282854657e-05, "loss": 0.0138, "step": 5000 }, { "epoch": 0.5439364682205119, "eval_loss": 0.015788141638040543, "eval_runtime": 612.4211, "eval_samples_per_second": 26.684, "eval_steps_per_second": 3.336, "step": 5000 }, { "epoch": 0.549375832902717, "grad_norm": 0.17054913938045502, "learning_rate": 4.5071801566579634e-05, "loss": 0.0122, "step": 5050 }, { "epoch": 0.554815197584922, "grad_norm": 0.47613638639450073, "learning_rate": 4.4527850304612713e-05, "loss": 0.0148, "step": 5100 }, { "epoch": 0.5602545622671272, "grad_norm": 0.1685837209224701, "learning_rate": 4.398389904264578e-05, "loss": 0.0166, "step": 5150 }, { "epoch": 0.5656939269493323, "grad_norm": 0.23199672996997833, "learning_rate": 4.343994778067885e-05, "loss": 0.0139, "step": 5200 }, { "epoch": 0.5711332916315375, "grad_norm": 0.5401563048362732, "learning_rate": 4.2895996518711924e-05, "loss": 0.0134, "step": 5250 }, { "epoch": 0.5765726563137425, "grad_norm": 0.1670864224433899, "learning_rate": 4.2352045256745e-05, "loss": 0.0143, "step": 5300 }, { "epoch": 0.5820120209959476, "grad_norm": 0.08487512916326523, "learning_rate": 4.180809399477807e-05, "loss": 0.0152, "step": 5350 }, { "epoch": 0.5874513856781528, "grad_norm": 0.07271425426006317, "learning_rate": 4.126414273281114e-05, "loss": 0.0166, "step": 5400 }, { "epoch": 0.5928907503603579, "grad_norm": 0.27096590399742126, "learning_rate": 4.0720191470844215e-05, "loss": 0.0134, "step": 5450 }, { "epoch": 0.5983301150425631, "grad_norm": 0.1035747304558754, "learning_rate": 4.017624020887729e-05, "loss": 0.0144, "step": 5500 }, { "epoch": 0.6037694797247681, "grad_norm": 0.34544578194618225, "learning_rate": 3.963228894691035e-05, "loss": 0.0138, "step": 5550 }, { "epoch": 0.6092088444069733, "grad_norm": 0.4660258889198303, "learning_rate": 3.908833768494343e-05, "loss": 0.0166, "step": 5600 }, { "epoch": 0.6146482090891784, "grad_norm": 0.12394747883081436, "learning_rate": 3.8544386422976505e-05, "loss": 0.0124, "step": 5650 }, { "epoch": 0.6200875737713835, "grad_norm": 0.13275495171546936, "learning_rate": 3.800043516100957e-05, "loss": 0.0144, "step": 5700 }, { "epoch": 0.6255269384535886, "grad_norm": 0.23382355272769928, "learning_rate": 3.745648389904265e-05, "loss": 0.0163, "step": 5750 }, { "epoch": 0.6309663031357937, "grad_norm": 0.10067889094352722, "learning_rate": 3.691253263707572e-05, "loss": 0.0166, "step": 5800 }, { "epoch": 0.6364056678179989, "grad_norm": 0.09269619733095169, "learning_rate": 3.636858137510879e-05, "loss": 0.0155, "step": 5850 }, { "epoch": 0.641845032500204, "grad_norm": 0.22883062064647675, "learning_rate": 3.582463011314187e-05, "loss": 0.0119, "step": 5900 }, { "epoch": 0.647284397182409, "grad_norm": 0.3405645787715912, "learning_rate": 3.5280678851174934e-05, "loss": 0.0134, "step": 5950 }, { "epoch": 0.6527237618646142, "grad_norm": 0.10464873909950256, "learning_rate": 3.4736727589208007e-05, "loss": 0.0115, "step": 6000 }, { "epoch": 0.6527237618646142, "eval_loss": 0.013636166229844093, "eval_runtime": 612.7881, "eval_samples_per_second": 26.668, "eval_steps_per_second": 3.334, "step": 6000 }, { "epoch": 0.6581631265468193, "grad_norm": 0.15743671357631683, "learning_rate": 3.4192776327241086e-05, "loss": 0.0127, "step": 6050 }, { "epoch": 0.6636024912290245, "grad_norm": 0.28343307971954346, "learning_rate": 3.364882506527415e-05, "loss": 0.013, "step": 6100 }, { "epoch": 0.6690418559112296, "grad_norm": 0.0802890881896019, "learning_rate": 3.3104873803307224e-05, "loss": 0.0156, "step": 6150 }, { "epoch": 0.6744812205934347, "grad_norm": 0.2691664397716522, "learning_rate": 3.25609225413403e-05, "loss": 0.0131, "step": 6200 }, { "epoch": 0.6799205852756398, "grad_norm": 0.08483448624610901, "learning_rate": 3.201697127937337e-05, "loss": 0.0127, "step": 6250 }, { "epoch": 0.6853599499578449, "grad_norm": 0.2310999631881714, "learning_rate": 3.147302001740644e-05, "loss": 0.01, "step": 6300 }, { "epoch": 0.6907993146400501, "grad_norm": 0.10749073326587677, "learning_rate": 3.0929068755439515e-05, "loss": 0.0128, "step": 6350 }, { "epoch": 0.6962386793222551, "grad_norm": 0.14852339029312134, "learning_rate": 3.0385117493472587e-05, "loss": 0.0097, "step": 6400 }, { "epoch": 0.7016780440044603, "grad_norm": 0.09708331525325775, "learning_rate": 2.9841166231505656e-05, "loss": 0.0137, "step": 6450 }, { "epoch": 0.7071174086866654, "grad_norm": 0.13305231928825378, "learning_rate": 2.9297214969538732e-05, "loss": 0.0106, "step": 6500 }, { "epoch": 0.7125567733688705, "grad_norm": 0.2321113497018814, "learning_rate": 2.8753263707571805e-05, "loss": 0.0161, "step": 6550 }, { "epoch": 0.7179961380510756, "grad_norm": 0.1432623565196991, "learning_rate": 2.8209312445604874e-05, "loss": 0.0096, "step": 6600 }, { "epoch": 0.7234355027332807, "grad_norm": 0.1964827924966812, "learning_rate": 2.766536118363795e-05, "loss": 0.0174, "step": 6650 }, { "epoch": 0.7288748674154859, "grad_norm": 0.21941740810871124, "learning_rate": 2.712140992167102e-05, "loss": 0.0125, "step": 6700 }, { "epoch": 0.734314232097691, "grad_norm": 0.06487424671649933, "learning_rate": 2.6577458659704092e-05, "loss": 0.0143, "step": 6750 }, { "epoch": 0.7397535967798962, "grad_norm": 0.08458438515663147, "learning_rate": 2.603350739773716e-05, "loss": 0.0122, "step": 6800 }, { "epoch": 0.7451929614621012, "grad_norm": 0.21706067025661469, "learning_rate": 2.5489556135770237e-05, "loss": 0.0136, "step": 6850 }, { "epoch": 0.7506323261443063, "grad_norm": 0.04576512426137924, "learning_rate": 2.4945604873803306e-05, "loss": 0.0127, "step": 6900 }, { "epoch": 0.7560716908265115, "grad_norm": 0.05280361324548721, "learning_rate": 2.4401653611836382e-05, "loss": 0.0107, "step": 6950 }, { "epoch": 0.7615110555087166, "grad_norm": 0.5558903217315674, "learning_rate": 2.3857702349869455e-05, "loss": 0.0088, "step": 7000 }, { "epoch": 0.7615110555087166, "eval_loss": 0.01208407897502184, "eval_runtime": 610.9461, "eval_samples_per_second": 26.749, "eval_steps_per_second": 3.344, "step": 7000 }, { "epoch": 0.7669504201909217, "grad_norm": 0.470074325799942, "learning_rate": 2.3313751087902524e-05, "loss": 0.0135, "step": 7050 }, { "epoch": 0.7723897848731268, "grad_norm": 0.2360675036907196, "learning_rate": 2.2769799825935597e-05, "loss": 0.0133, "step": 7100 }, { "epoch": 0.7778291495553319, "grad_norm": 0.11457215994596481, "learning_rate": 2.222584856396867e-05, "loss": 0.011, "step": 7150 }, { "epoch": 0.7832685142375371, "grad_norm": 0.010224751196801662, "learning_rate": 2.1681897302001742e-05, "loss": 0.0148, "step": 7200 }, { "epoch": 0.7887078789197421, "grad_norm": 0.2885963022708893, "learning_rate": 2.1137946040034815e-05, "loss": 0.0132, "step": 7250 }, { "epoch": 0.7941472436019473, "grad_norm": 0.3628193438053131, "learning_rate": 2.0593994778067884e-05, "loss": 0.0105, "step": 7300 }, { "epoch": 0.7995866082841524, "grad_norm": 0.07265301048755646, "learning_rate": 2.005004351610096e-05, "loss": 0.0104, "step": 7350 }, { "epoch": 0.8050259729663576, "grad_norm": 0.13889417052268982, "learning_rate": 1.9506092254134032e-05, "loss": 0.0129, "step": 7400 }, { "epoch": 0.8104653376485627, "grad_norm": 0.06533674895763397, "learning_rate": 1.89621409921671e-05, "loss": 0.0097, "step": 7450 }, { "epoch": 0.8159047023307677, "grad_norm": 0.23986752331256866, "learning_rate": 1.8418189730200174e-05, "loss": 0.0119, "step": 7500 }, { "epoch": 0.8213440670129729, "grad_norm": 0.09028589725494385, "learning_rate": 1.7874238468233247e-05, "loss": 0.0093, "step": 7550 }, { "epoch": 0.826783431695178, "grad_norm": 0.11503873020410538, "learning_rate": 1.733028720626632e-05, "loss": 0.0091, "step": 7600 }, { "epoch": 0.8322227963773832, "grad_norm": 0.5392634868621826, "learning_rate": 1.6786335944299392e-05, "loss": 0.0118, "step": 7650 }, { "epoch": 0.8376621610595882, "grad_norm": 0.14633004367351532, "learning_rate": 1.6242384682332464e-05, "loss": 0.0112, "step": 7700 }, { "epoch": 0.8431015257417933, "grad_norm": 0.03690435364842415, "learning_rate": 1.5698433420365534e-05, "loss": 0.0117, "step": 7750 }, { "epoch": 0.8485408904239985, "grad_norm": 0.1749696284532547, "learning_rate": 1.5154482158398608e-05, "loss": 0.0108, "step": 7800 }, { "epoch": 0.8539802551062036, "grad_norm": 0.1117832213640213, "learning_rate": 1.4610530896431682e-05, "loss": 0.0142, "step": 7850 }, { "epoch": 0.8594196197884088, "grad_norm": 0.06253615021705627, "learning_rate": 1.4066579634464751e-05, "loss": 0.0101, "step": 7900 }, { "epoch": 0.8648589844706138, "grad_norm": 0.11594853550195694, "learning_rate": 1.3522628372497826e-05, "loss": 0.0102, "step": 7950 }, { "epoch": 0.8702983491528189, "grad_norm": 0.2881285846233368, "learning_rate": 1.2978677110530895e-05, "loss": 0.0096, "step": 8000 }, { "epoch": 0.8702983491528189, "eval_loss": 0.010398774407804012, "eval_runtime": 612.7356, "eval_samples_per_second": 26.671, "eval_steps_per_second": 3.334, "step": 8000 }, { "epoch": 0.8757377138350241, "grad_norm": 0.10731537640094757, "learning_rate": 1.243472584856397e-05, "loss": 0.0102, "step": 8050 }, { "epoch": 0.8811770785172292, "grad_norm": 0.08623083680868149, "learning_rate": 1.1890774586597042e-05, "loss": 0.0095, "step": 8100 }, { "epoch": 0.8866164431994343, "grad_norm": 0.16180342435836792, "learning_rate": 1.1346823324630114e-05, "loss": 0.0094, "step": 8150 }, { "epoch": 0.8920558078816394, "grad_norm": 0.09304390847682953, "learning_rate": 1.0802872062663185e-05, "loss": 0.0105, "step": 8200 }, { "epoch": 0.8974951725638446, "grad_norm": 0.08745424449443817, "learning_rate": 1.0258920800696258e-05, "loss": 0.0077, "step": 8250 }, { "epoch": 0.9029345372460497, "grad_norm": 0.18484659492969513, "learning_rate": 9.71496953872933e-06, "loss": 0.0116, "step": 8300 }, { "epoch": 0.9083739019282547, "grad_norm": 0.18615852296352386, "learning_rate": 9.171018276762403e-06, "loss": 0.0087, "step": 8350 }, { "epoch": 0.9138132666104599, "grad_norm": 0.038191672414541245, "learning_rate": 8.627067014795474e-06, "loss": 0.0091, "step": 8400 }, { "epoch": 0.919252631292665, "grad_norm": 0.049192268401384354, "learning_rate": 8.083115752828548e-06, "loss": 0.0113, "step": 8450 }, { "epoch": 0.9246919959748702, "grad_norm": 0.12032605707645416, "learning_rate": 7.539164490861619e-06, "loss": 0.0116, "step": 8500 }, { "epoch": 0.9301313606570752, "grad_norm": 0.2927582561969757, "learning_rate": 7.006092254134029e-06, "loss": 0.0114, "step": 8550 }, { "epoch": 0.9355707253392803, "grad_norm": 0.04057031497359276, "learning_rate": 6.462140992167103e-06, "loss": 0.0093, "step": 8600 }, { "epoch": 0.9410100900214855, "grad_norm": 0.23389148712158203, "learning_rate": 5.918189730200174e-06, "loss": 0.0123, "step": 8650 }, { "epoch": 0.9464494547036906, "grad_norm": 0.14039556682109833, "learning_rate": 5.374238468233247e-06, "loss": 0.0088, "step": 8700 }, { "epoch": 0.9518888193858958, "grad_norm": 0.10834332555532455, "learning_rate": 4.830287206266319e-06, "loss": 0.0126, "step": 8750 }, { "epoch": 0.9573281840681008, "grad_norm": 0.20089313387870789, "learning_rate": 4.286335944299391e-06, "loss": 0.007, "step": 8800 }, { "epoch": 0.962767548750306, "grad_norm": 0.10719793289899826, "learning_rate": 3.742384682332463e-06, "loss": 0.0073, "step": 8850 }, { "epoch": 0.9682069134325111, "grad_norm": 0.051126107573509216, "learning_rate": 3.1984334203655352e-06, "loss": 0.0098, "step": 8900 }, { "epoch": 0.9736462781147162, "grad_norm": 0.2430637627840042, "learning_rate": 2.6544821583986074e-06, "loss": 0.0078, "step": 8950 }, { "epoch": 0.9790856427969213, "grad_norm": 0.06034141406416893, "learning_rate": 2.1105308964316795e-06, "loss": 0.0087, "step": 9000 }, { "epoch": 0.9790856427969213, "eval_loss": 0.009567675180733204, "eval_runtime": 610.7757, "eval_samples_per_second": 26.756, "eval_steps_per_second": 3.345, "step": 9000 }, { "epoch": 0.9845250074791264, "grad_norm": 0.23722563683986664, "learning_rate": 1.566579634464752e-06, "loss": 0.0094, "step": 9050 }, { "epoch": 0.9899643721613316, "grad_norm": 0.1395604908466339, "learning_rate": 1.0226283724978243e-06, "loss": 0.01, "step": 9100 }, { "epoch": 0.9954037368435367, "grad_norm": 0.12211757898330688, "learning_rate": 4.786771105308965e-07, "loss": 0.0091, "step": 9150 } ], "logging_steps": 50, "max_steps": 9192, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.365723691894702e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }