{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.09976057462090981,
  "eval_steps": 500,
  "global_step": 5000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0009976057462090981,
      "grad_norm": 0.0038604736328125,
      "learning_rate": 0.00019800000000000002,
      "loss": 0.6617,
      "step": 50
    },
    {
      "epoch": 0.0019952114924181963,
      "grad_norm": 6.3125,
      "learning_rate": 0.000196,
      "loss": 0.4755,
      "step": 100
    },
    {
      "epoch": 0.0029928172386272946,
      "grad_norm": 8.5625,
      "learning_rate": 0.000194,
      "loss": 0.7675,
      "step": 150
    },
    {
      "epoch": 0.0039904229848363925,
      "grad_norm": 2.375,
      "learning_rate": 0.000192,
      "loss": 0.3436,
      "step": 200
    },
    {
      "epoch": 0.004988028731045491,
      "grad_norm": 0.008544921875,
      "learning_rate": 0.00019,
      "loss": 0.3634,
      "step": 250
    },
    {
      "epoch": 0.005985634477254589,
      "grad_norm": 0.09130859375,
      "learning_rate": 0.000188,
      "loss": 0.2994,
      "step": 300
    },
    {
      "epoch": 0.006983240223463687,
      "grad_norm": 0.003570556640625,
      "learning_rate": 0.00018600000000000002,
      "loss": 0.3632,
      "step": 350
    },
    {
      "epoch": 0.007980845969672785,
      "grad_norm": 0.6875,
      "learning_rate": 0.00018400000000000003,
      "loss": 0.4774,
      "step": 400
    },
    {
      "epoch": 0.008978451715881883,
      "grad_norm": 22.875,
      "learning_rate": 0.000182,
      "loss": 0.4284,
      "step": 450
    },
    {
      "epoch": 0.009976057462090982,
      "grad_norm": 1.0703125,
      "learning_rate": 0.00018,
      "loss": 0.2701,
      "step": 500
    },
    {
      "epoch": 0.01097366320830008,
      "grad_norm": 2.640625,
      "learning_rate": 0.00017800000000000002,
      "loss": 0.3871,
      "step": 550
    },
    {
      "epoch": 0.011971268954509178,
      "grad_norm": 0.031982421875,
      "learning_rate": 0.00017600000000000002,
      "loss": 0.4374,
      "step": 600
    },
    {
      "epoch": 0.012968874700718277,
      "grad_norm": 0.369140625,
      "learning_rate": 0.000174,
      "loss": 0.5315,
      "step": 650
    },
    {
      "epoch": 0.013966480446927373,
      "grad_norm": 1.046875,
      "learning_rate": 0.000172,
      "loss": 0.2943,
      "step": 700
    },
    {
      "epoch": 0.014964086193136472,
      "grad_norm": 0.10986328125,
      "learning_rate": 0.00017,
      "loss": 0.4425,
      "step": 750
    },
    {
      "epoch": 0.01596169193934557,
      "grad_norm": 0.0390625,
      "learning_rate": 0.000168,
      "loss": 0.5047,
      "step": 800
    },
    {
      "epoch": 0.01695929768555467,
      "grad_norm": 0.0003719329833984375,
      "learning_rate": 0.000166,
      "loss": 0.5276,
      "step": 850
    },
    {
      "epoch": 0.017956903431763767,
      "grad_norm": 4.71875,
      "learning_rate": 0.000164,
      "loss": 0.4847,
      "step": 900
    },
    {
      "epoch": 0.018954509177972863,
      "grad_norm": 0.0517578125,
      "learning_rate": 0.000162,
      "loss": 0.3319,
      "step": 950
    },
    {
      "epoch": 0.019952114924181964,
      "grad_norm": 2.40625,
      "learning_rate": 0.00016,
      "loss": 0.5723,
      "step": 1000
    },
    {
      "epoch": 0.02094972067039106,
      "grad_norm": 16.25,
      "learning_rate": 0.00015800000000000002,
      "loss": 0.2602,
      "step": 1050
    },
    {
      "epoch": 0.02194732641660016,
      "grad_norm": 8.0625,
      "learning_rate": 0.00015600000000000002,
      "loss": 0.4477,
      "step": 1100
    },
    {
      "epoch": 0.022944932162809257,
      "grad_norm": 23.75,
      "learning_rate": 0.000154,
      "loss": 0.3337,
      "step": 1150
    },
    {
      "epoch": 0.023942537909018357,
      "grad_norm": 0.00189208984375,
      "learning_rate": 0.000152,
      "loss": 0.3281,
      "step": 1200
    },
    {
      "epoch": 0.024940143655227454,
      "grad_norm": 4.96875,
      "learning_rate": 0.00015000000000000001,
      "loss": 0.586,
      "step": 1250
    },
    {
      "epoch": 0.025937749401436554,
      "grad_norm": 0.10595703125,
      "learning_rate": 0.000148,
      "loss": 0.4816,
      "step": 1300
    },
    {
      "epoch": 0.02693535514764565,
      "grad_norm": 0.49609375,
      "learning_rate": 0.000146,
      "loss": 0.2697,
      "step": 1350
    },
    {
      "epoch": 0.027932960893854747,
      "grad_norm": 3.9375,
      "learning_rate": 0.000144,
      "loss": 0.3875,
      "step": 1400
    },
    {
      "epoch": 0.028930566640063847,
      "grad_norm": 0.25390625,
      "learning_rate": 0.000142,
      "loss": 0.4165,
      "step": 1450
    },
    {
      "epoch": 0.029928172386272944,
      "grad_norm": 0.62109375,
      "learning_rate": 0.00014,
      "loss": 0.369,
      "step": 1500
    },
    {
      "epoch": 0.030925778132482044,
      "grad_norm": 4.1875,
      "learning_rate": 0.000138,
      "loss": 0.5041,
      "step": 1550
    },
    {
      "epoch": 0.03192338387869114,
      "grad_norm": 41.75,
      "learning_rate": 0.00013600000000000003,
      "loss": 0.217,
      "step": 1600
    },
    {
      "epoch": 0.03292098962490024,
      "grad_norm": 0.01171875,
      "learning_rate": 0.000134,
      "loss": 0.2277,
      "step": 1650
    },
    {
      "epoch": 0.03391859537110934,
      "grad_norm": 42.25,
      "learning_rate": 0.000132,
      "loss": 0.404,
      "step": 1700
    },
    {
      "epoch": 0.034916201117318434,
      "grad_norm": 3.09375,
      "learning_rate": 0.00013000000000000002,
      "loss": 0.4747,
      "step": 1750
    },
    {
      "epoch": 0.035913806863527534,
      "grad_norm": 0.10498046875,
      "learning_rate": 0.00012800000000000002,
      "loss": 0.2959,
      "step": 1800
    },
    {
      "epoch": 0.036911412609736634,
      "grad_norm": 0.0250244140625,
      "learning_rate": 0.000126,
      "loss": 0.3211,
      "step": 1850
    },
    {
      "epoch": 0.03790901835594573,
      "grad_norm": 0.043212890625,
      "learning_rate": 0.000124,
      "loss": 0.5209,
      "step": 1900
    },
    {
      "epoch": 0.03890662410215483,
      "grad_norm": 0.02734375,
      "learning_rate": 0.000122,
      "loss": 0.36,
      "step": 1950
    },
    {
      "epoch": 0.03990422984836393,
      "grad_norm": 0.01025390625,
      "learning_rate": 0.00012,
      "loss": 0.4134,
      "step": 2000
    },
    {
      "epoch": 0.04090183559457303,
      "grad_norm": 13.3125,
      "learning_rate": 0.000118,
      "loss": 0.5532,
      "step": 2050
    },
    {
      "epoch": 0.04189944134078212,
      "grad_norm": 0.0072021484375,
      "learning_rate": 0.000116,
      "loss": 0.383,
      "step": 2100
    },
    {
      "epoch": 0.04289704708699122,
      "grad_norm": 0.453125,
      "learning_rate": 0.00011399999999999999,
      "loss": 0.2125,
      "step": 2150
    },
    {
      "epoch": 0.04389465283320032,
      "grad_norm": 0.0068359375,
      "learning_rate": 0.00011200000000000001,
      "loss": 0.4954,
      "step": 2200
    },
    {
      "epoch": 0.04489225857940942,
      "grad_norm": 0.0162353515625,
      "learning_rate": 0.00011000000000000002,
      "loss": 0.505,
      "step": 2250
    },
    {
      "epoch": 0.045889864325618514,
      "grad_norm": 0.0024261474609375,
      "learning_rate": 0.00010800000000000001,
      "loss": 0.6058,
      "step": 2300
    },
    {
      "epoch": 0.046887470071827614,
      "grad_norm": 0.0030059814453125,
      "learning_rate": 0.00010600000000000002,
      "loss": 0.3943,
      "step": 2350
    },
    {
      "epoch": 0.047885075818036714,
      "grad_norm": 10.875,
      "learning_rate": 0.00010400000000000001,
      "loss": 0.3382,
      "step": 2400
    },
    {
      "epoch": 0.04888268156424581,
      "grad_norm": 0.0181884765625,
      "learning_rate": 0.00010200000000000001,
      "loss": 0.3642,
      "step": 2450
    },
    {
      "epoch": 0.04988028731045491,
      "grad_norm": 3.265625,
      "learning_rate": 0.0001,
      "loss": 0.4616,
      "step": 2500
    },
    {
      "epoch": 0.05087789305666401,
      "grad_norm": 7.0,
      "learning_rate": 9.8e-05,
      "loss": 0.3914,
      "step": 2550
    },
    {
      "epoch": 0.05187549880287311,
      "grad_norm": 15.0,
      "learning_rate": 9.6e-05,
      "loss": 0.3353,
      "step": 2600
    },
    {
      "epoch": 0.0528731045490822,
      "grad_norm": 0.0050048828125,
      "learning_rate": 9.4e-05,
      "loss": 0.4456,
      "step": 2650
    },
    {
      "epoch": 0.0538707102952913,
      "grad_norm": 0.006927490234375,
      "learning_rate": 9.200000000000001e-05,
      "loss": 0.2634,
      "step": 2700
    },
    {
      "epoch": 0.0548683160415004,
      "grad_norm": 0.002685546875,
      "learning_rate": 9e-05,
      "loss": 0.3685,
      "step": 2750
    },
    {
      "epoch": 0.055865921787709494,
      "grad_norm": 9.0625,
      "learning_rate": 8.800000000000001e-05,
      "loss": 0.352,
      "step": 2800
    },
    {
      "epoch": 0.056863527533918594,
      "grad_norm": 0.0137939453125,
      "learning_rate": 8.6e-05,
      "loss": 0.4359,
      "step": 2850
    },
    {
      "epoch": 0.057861133280127694,
      "grad_norm": 0.010009765625,
      "learning_rate": 8.4e-05,
      "loss": 0.3103,
      "step": 2900
    },
    {
      "epoch": 0.058858739026336794,
      "grad_norm": 0.01141357421875,
      "learning_rate": 8.2e-05,
      "loss": 0.4468,
      "step": 2950
    },
    {
      "epoch": 0.05985634477254589,
      "grad_norm": 4.15625,
      "learning_rate": 8e-05,
      "loss": 0.299,
      "step": 3000
    },
    {
      "epoch": 0.06085395051875499,
      "grad_norm": 3.984375,
      "learning_rate": 7.800000000000001e-05,
      "loss": 0.3858,
      "step": 3050
    },
    {
      "epoch": 0.06185155626496409,
      "grad_norm": 0.07763671875,
      "learning_rate": 7.6e-05,
      "loss": 0.5305,
      "step": 3100
    },
    {
      "epoch": 0.06284916201117319,
      "grad_norm": 0.018310546875,
      "learning_rate": 7.4e-05,
      "loss": 0.4533,
      "step": 3150
    },
    {
      "epoch": 0.06384676775738228,
      "grad_norm": 36.5,
      "learning_rate": 7.2e-05,
      "loss": 0.4423,
      "step": 3200
    },
    {
      "epoch": 0.06484437350359137,
      "grad_norm": 0.0289306640625,
      "learning_rate": 7e-05,
      "loss": 0.5003,
      "step": 3250
    },
    {
      "epoch": 0.06584197924980048,
      "grad_norm": 4.625,
      "learning_rate": 6.800000000000001e-05,
      "loss": 0.527,
      "step": 3300
    },
    {
      "epoch": 0.06683958499600957,
      "grad_norm": 0.20703125,
      "learning_rate": 6.6e-05,
      "loss": 0.2865,
      "step": 3350
    },
    {
      "epoch": 0.06783719074221868,
      "grad_norm": 0.04296875,
      "learning_rate": 6.400000000000001e-05,
      "loss": 0.3976,
      "step": 3400
    },
    {
      "epoch": 0.06883479648842777,
      "grad_norm": 0.01275634765625,
      "learning_rate": 6.2e-05,
      "loss": 0.2756,
      "step": 3450
    },
    {
      "epoch": 0.06983240223463687,
      "grad_norm": 8.25,
      "learning_rate": 6e-05,
      "loss": 0.2762,
      "step": 3500
    },
    {
      "epoch": 0.07083000798084597,
      "grad_norm": 6.125,
      "learning_rate": 5.8e-05,
      "loss": 0.4,
      "step": 3550
    },
    {
      "epoch": 0.07182761372705507,
      "grad_norm": 0.00193023681640625,
      "learning_rate": 5.6000000000000006e-05,
      "loss": 0.4484,
      "step": 3600
    },
    {
      "epoch": 0.07282521947326416,
      "grad_norm": 0.0228271484375,
      "learning_rate": 5.4000000000000005e-05,
      "loss": 0.1631,
      "step": 3650
    },
    {
      "epoch": 0.07382282521947327,
      "grad_norm": 0.0634765625,
      "learning_rate": 5.2000000000000004e-05,
      "loss": 0.3579,
      "step": 3700
    },
    {
      "epoch": 0.07482043096568236,
      "grad_norm": 11.4375,
      "learning_rate": 5e-05,
      "loss": 0.2258,
      "step": 3750
    },
    {
      "epoch": 0.07581803671189145,
      "grad_norm": 20.875,
      "learning_rate": 4.8e-05,
      "loss": 0.4282,
      "step": 3800
    },
    {
      "epoch": 0.07681564245810056,
      "grad_norm": 0.046875,
      "learning_rate": 4.600000000000001e-05,
      "loss": 0.3967,
      "step": 3850
    },
    {
      "epoch": 0.07781324820430965,
      "grad_norm": 0.0035552978515625,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 0.3414,
      "step": 3900
    },
    {
      "epoch": 0.07881085395051876,
      "grad_norm": 0.00653076171875,
      "learning_rate": 4.2e-05,
      "loss": 0.3421,
      "step": 3950
    },
    {
      "epoch": 0.07980845969672785,
      "grad_norm": 0.0201416015625,
      "learning_rate": 4e-05,
      "loss": 0.2607,
      "step": 4000
    },
    {
      "epoch": 0.08080606544293695,
      "grad_norm": 0.0027313232421875,
      "learning_rate": 3.8e-05,
      "loss": 0.3414,
      "step": 4050
    },
    {
      "epoch": 0.08180367118914605,
      "grad_norm": 0.010986328125,
      "learning_rate": 3.6e-05,
      "loss": 0.1826,
      "step": 4100
    },
    {
      "epoch": 0.08280127693535515,
      "grad_norm": 34.25,
      "learning_rate": 3.4000000000000007e-05,
      "loss": 0.374,
      "step": 4150
    },
    {
      "epoch": 0.08379888268156424,
      "grad_norm": 11.75,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 0.3928,
      "step": 4200
    },
    {
      "epoch": 0.08479648842777335,
      "grad_norm": 2.203125,
      "learning_rate": 3e-05,
      "loss": 0.2718,
      "step": 4250
    },
    {
      "epoch": 0.08579409417398244,
      "grad_norm": 0.004302978515625,
      "learning_rate": 2.8000000000000003e-05,
      "loss": 0.4629,
      "step": 4300
    },
    {
      "epoch": 0.08679169992019153,
      "grad_norm": 0.04931640625,
      "learning_rate": 2.6000000000000002e-05,
      "loss": 0.6785,
      "step": 4350
    },
    {
      "epoch": 0.08778930566640064,
      "grad_norm": 2.90625,
      "learning_rate": 2.4e-05,
      "loss": 0.3149,
      "step": 4400
    },
    {
      "epoch": 0.08878691141260973,
      "grad_norm": 15.0,
      "learning_rate": 2.2000000000000003e-05,
      "loss": 0.1649,
      "step": 4450
    },
    {
      "epoch": 0.08978451715881884,
      "grad_norm": 13.5625,
      "learning_rate": 2e-05,
      "loss": 0.3696,
      "step": 4500
    },
    {
      "epoch": 0.09078212290502793,
      "grad_norm": 4.71875,
      "learning_rate": 1.8e-05,
      "loss": 0.2958,
      "step": 4550
    },
    {
      "epoch": 0.09177972865123703,
      "grad_norm": 0.0091552734375,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 0.4197,
      "step": 4600
    },
    {
      "epoch": 0.09277733439744613,
      "grad_norm": 1.2578125,
      "learning_rate": 1.4000000000000001e-05,
      "loss": 0.3516,
      "step": 4650
    },
    {
      "epoch": 0.09377494014365523,
      "grad_norm": 0.0252685546875,
      "learning_rate": 1.2e-05,
      "loss": 0.3903,
      "step": 4700
    },
    {
      "epoch": 0.09477254588986432,
      "grad_norm": 0.001434326171875,
      "learning_rate": 1e-05,
      "loss": 0.4062,
      "step": 4750
    },
    {
      "epoch": 0.09577015163607343,
      "grad_norm": 3.625,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.5118,
      "step": 4800
    },
    {
      "epoch": 0.09676775738228252,
      "grad_norm": 0.001373291015625,
      "learning_rate": 6e-06,
      "loss": 0.3564,
      "step": 4850
    },
    {
      "epoch": 0.09776536312849161,
      "grad_norm": 0.017578125,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.2415,
      "step": 4900
    },
    {
      "epoch": 0.09876296887470072,
      "grad_norm": 5.9375,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 0.4096,
      "step": 4950
    },
    {
      "epoch": 0.09976057462090981,
      "grad_norm": 0.0230712890625,
      "learning_rate": 0.0,
      "loss": 0.3159,
      "step": 5000
    }
  ],
  "logging_steps": 50,
  "max_steps": 5000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.937397973909504e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}