{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 533, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001876172607879925, "grad_norm": 0.4121309108650883, "learning_rate": 3.7037037037037037e-06, "loss": 1.6355, "step": 1 }, { "epoch": 0.009380863039399626, "grad_norm": 0.4145535276322867, "learning_rate": 1.8518518518518518e-05, "loss": 1.6509, "step": 5 }, { "epoch": 0.01876172607879925, "grad_norm": 0.499090421739371, "learning_rate": 3.7037037037037037e-05, "loss": 1.6194, "step": 10 }, { "epoch": 0.028142589118198873, "grad_norm": 0.2168002082363555, "learning_rate": 5.555555555555556e-05, "loss": 1.6102, "step": 15 }, { "epoch": 0.0375234521575985, "grad_norm": 0.19182026954015316, "learning_rate": 7.407407407407407e-05, "loss": 1.5826, "step": 20 }, { "epoch": 0.04690431519699812, "grad_norm": 0.17104214986412603, "learning_rate": 9.25925925925926e-05, "loss": 1.4983, "step": 25 }, { "epoch": 0.05628517823639775, "grad_norm": 0.1314352337398472, "learning_rate": 0.00011111111111111112, "loss": 1.4705, "step": 30 }, { "epoch": 0.06566604127579738, "grad_norm": 0.13683731835169527, "learning_rate": 0.00012962962962962963, "loss": 1.4351, "step": 35 }, { "epoch": 0.075046904315197, "grad_norm": 0.12592565344109571, "learning_rate": 0.00014814814814814815, "loss": 1.4154, "step": 40 }, { "epoch": 0.08442776735459662, "grad_norm": 0.10747815841874438, "learning_rate": 0.0001666666666666667, "loss": 1.3925, "step": 45 }, { "epoch": 0.09380863039399624, "grad_norm": 0.09736495320865902, "learning_rate": 0.0001851851851851852, "loss": 1.3452, "step": 50 }, { "epoch": 0.10318949343339587, "grad_norm": 0.09026377977279064, "learning_rate": 0.00019999784921417228, "loss": 1.3658, "step": 55 }, { "epoch": 0.1125703564727955, "grad_norm": 0.08191150814769302, "learning_rate": 0.00019992258142410334, "loss": 1.3526, "step": 60 }, { "epoch": 0.12195121951219512, "grad_norm": 0.0987761994127843, "learning_rate": 0.00019973986684235418, "loss": 1.3457, "step": 65 }, { "epoch": 0.13133208255159476, "grad_norm": 0.08470882333862399, "learning_rate": 0.00019944990194198758, "loss": 1.3237, "step": 70 }, { "epoch": 0.14071294559099437, "grad_norm": 0.08215471773448521, "learning_rate": 0.00019905299852237654, "loss": 1.3171, "step": 75 }, { "epoch": 0.150093808630394, "grad_norm": 0.07920641876308229, "learning_rate": 0.00019854958337392654, "loss": 1.2913, "step": 80 }, { "epoch": 0.15947467166979362, "grad_norm": 0.09467087341552041, "learning_rate": 0.00019794019781914766, "loss": 1.3156, "step": 85 }, { "epoch": 0.16885553470919323, "grad_norm": 0.08313358090040693, "learning_rate": 0.0001972254971305701, "loss": 1.3032, "step": 90 }, { "epoch": 0.17823639774859287, "grad_norm": 0.08181396678268484, "learning_rate": 0.00019640624982612942, "loss": 1.3047, "step": 95 }, { "epoch": 0.18761726078799248, "grad_norm": 0.08648614872822097, "learning_rate": 0.00019548333684277919, "loss": 1.2938, "step": 100 }, { "epoch": 0.19699812382739212, "grad_norm": 0.08114011033820992, "learning_rate": 0.00019445775058921853, "loss": 1.2919, "step": 105 }, { "epoch": 0.20637898686679174, "grad_norm": 0.08069454171446046, "learning_rate": 0.00019333059387875525, "loss": 1.3007, "step": 110 }, { "epoch": 0.21575984990619138, "grad_norm": 0.1055930521893841, "learning_rate": 0.0001921030787434499, "loss": 1.3022, "step": 115 }, { "epoch": 0.225140712945591, "grad_norm": 0.1039770749041834, "learning_rate": 0.0001907765251308173, "loss": 1.2945, "step": 120 }, { "epoch": 0.23452157598499063, "grad_norm": 0.08612620245766023, "learning_rate": 0.0001893523594844865, "loss": 1.2734, "step": 125 }, { "epoch": 0.24390243902439024, "grad_norm": 0.09453075060390269, "learning_rate": 0.00018783211321034534, "loss": 1.2903, "step": 130 }, { "epoch": 0.25328330206378985, "grad_norm": 0.09115708333629854, "learning_rate": 0.00018621742102981905, "loss": 1.3021, "step": 135 }, { "epoch": 0.2626641651031895, "grad_norm": 0.09246301111274602, "learning_rate": 0.0001845100192220537, "loss": 1.2994, "step": 140 }, { "epoch": 0.27204502814258913, "grad_norm": 0.08863879008659571, "learning_rate": 0.00018271174375689454, "loss": 1.3049, "step": 145 }, { "epoch": 0.28142589118198874, "grad_norm": 0.09121089281314407, "learning_rate": 0.00018082452832066687, "loss": 1.2836, "step": 150 }, { "epoch": 0.29080675422138835, "grad_norm": 0.08883133842744517, "learning_rate": 0.00017885040223688235, "loss": 1.2643, "step": 155 }, { "epoch": 0.300187617260788, "grad_norm": 0.0889208798307562, "learning_rate": 0.0001767914882841067, "loss": 1.2902, "step": 160 }, { "epoch": 0.30956848030018763, "grad_norm": 0.10746121860126627, "learning_rate": 0.00017465000041333494, "loss": 1.2968, "step": 165 }, { "epoch": 0.31894934333958724, "grad_norm": 0.0832629700388369, "learning_rate": 0.0001724282413673291, "loss": 1.3031, "step": 170 }, { "epoch": 0.32833020637898686, "grad_norm": 0.09626519128659516, "learning_rate": 0.00017012860020447796, "loss": 1.3223, "step": 175 }, { "epoch": 0.33771106941838647, "grad_norm": 0.08434970867620112, "learning_rate": 0.0001677535497298416, "loss": 1.2684, "step": 180 }, { "epoch": 0.34709193245778613, "grad_norm": 0.08136406881743141, "learning_rate": 0.0001653056438361432, "loss": 1.265, "step": 185 }, { "epoch": 0.35647279549718575, "grad_norm": 0.0897096956349179, "learning_rate": 0.0001627875147575671, "loss": 1.2936, "step": 190 }, { "epoch": 0.36585365853658536, "grad_norm": 0.08876653720245482, "learning_rate": 0.00016020187023931638, "loss": 1.2961, "step": 195 }, { "epoch": 0.37523452157598497, "grad_norm": 0.08532928831285669, "learning_rate": 0.00015755149062597333, "loss": 1.2634, "step": 200 }, { "epoch": 0.38461538461538464, "grad_norm": 0.09494232614320906, "learning_rate": 0.00015483922587179386, "loss": 1.2794, "step": 205 }, { "epoch": 0.39399624765478425, "grad_norm": 0.0938403596147772, "learning_rate": 0.00015206799247615037, "loss": 1.2668, "step": 210 }, { "epoch": 0.40337711069418386, "grad_norm": 0.09298814066383618, "learning_rate": 0.00014924077034741924, "loss": 1.2704, "step": 215 }, { "epoch": 0.41275797373358347, "grad_norm": 0.09381021867831099, "learning_rate": 0.0001463605995986836, "loss": 1.2743, "step": 220 }, { "epoch": 0.42213883677298314, "grad_norm": 0.10478061496717873, "learning_rate": 0.000143430577278699, "loss": 1.3064, "step": 225 }, { "epoch": 0.43151969981238275, "grad_norm": 0.0967412755428019, "learning_rate": 0.0001404538540416353, "loss": 1.2749, "step": 230 }, { "epoch": 0.44090056285178236, "grad_norm": 0.10245748178767897, "learning_rate": 0.00013743363075917724, "loss": 1.2879, "step": 235 }, { "epoch": 0.450281425891182, "grad_norm": 0.09494585437974697, "learning_rate": 0.00013437315507862566, "loss": 1.268, "step": 240 }, { "epoch": 0.4596622889305816, "grad_norm": 0.10070892164674464, "learning_rate": 0.0001312757179307012, "loss": 1.2741, "step": 245 }, { "epoch": 0.46904315196998125, "grad_norm": 0.08590715074864536, "learning_rate": 0.000128144649990805, "loss": 1.28, "step": 250 }, { "epoch": 0.47842401500938087, "grad_norm": 0.0921370657647381, "learning_rate": 0.00012498331809754243, "loss": 1.3099, "step": 255 }, { "epoch": 0.4878048780487805, "grad_norm": 0.09411198985391388, "learning_rate": 0.00012179512163235974, "loss": 1.2402, "step": 260 }, { "epoch": 0.4971857410881801, "grad_norm": 0.0955041592156138, "learning_rate": 0.0001185834888641883, "loss": 1.2637, "step": 265 }, { "epoch": 0.5065666041275797, "grad_norm": 0.08843739914811306, "learning_rate": 0.0001153518732630253, "loss": 1.2524, "step": 270 }, { "epoch": 0.5159474671669794, "grad_norm": 0.08894269736301577, "learning_rate": 0.00011210374978641631, "loss": 1.3058, "step": 275 }, { "epoch": 0.525328330206379, "grad_norm": 0.09583708909351138, "learning_rate": 0.0001088426111428319, "loss": 1.2462, "step": 280 }, { "epoch": 0.5347091932457786, "grad_norm": 0.0934542252776399, "learning_rate": 0.00010557196403595688, "loss": 1.2641, "step": 285 }, { "epoch": 0.5440900562851783, "grad_norm": 0.08688918325528941, "learning_rate": 0.00010229532539393049, "loss": 1.2593, "step": 290 }, { "epoch": 0.5534709193245778, "grad_norm": 0.08796474911729359, "learning_rate": 9.901621858759203e-05, "loss": 1.2639, "step": 295 }, { "epoch": 0.5628517823639775, "grad_norm": 0.09427974834689387, "learning_rate": 9.57381696417989e-05, "loss": 1.257, "step": 300 }, { "epoch": 0.5722326454033771, "grad_norm": 0.08656751800419889, "learning_rate": 9.246470344389065e-05, "loss": 1.2545, "step": 305 }, { "epoch": 0.5816135084427767, "grad_norm": 0.09133977950050422, "learning_rate": 8.919933995337624e-05, "loss": 1.2629, "step": 310 }, { "epoch": 0.5909943714821764, "grad_norm": 0.0881803403964572, "learning_rate": 8.594559041692003e-05, "loss": 1.2668, "step": 315 }, { "epoch": 0.600375234521576, "grad_norm": 0.08194022763623, "learning_rate": 8.270695359269698e-05, "loss": 1.2426, "step": 320 }, { "epoch": 0.6097560975609756, "grad_norm": 0.08404336869271818, "learning_rate": 7.948691198817666e-05, "loss": 1.2608, "step": 325 }, { "epoch": 0.6191369606003753, "grad_norm": 0.0911226278784459, "learning_rate": 7.628892811538137e-05, "loss": 1.2446, "step": 330 }, { "epoch": 0.6285178236397748, "grad_norm": 0.0866722940291979, "learning_rate": 7.311644076764564e-05, "loss": 1.2661, "step": 335 }, { "epoch": 0.6378986866791745, "grad_norm": 0.08492545007005736, "learning_rate": 6.997286132188057e-05, "loss": 1.2476, "step": 340 }, { "epoch": 0.6472795497185742, "grad_norm": 0.08729514750397552, "learning_rate": 6.68615700703186e-05, "loss": 1.2471, "step": 345 }, { "epoch": 0.6566604127579737, "grad_norm": 0.08416435736232936, "learning_rate": 6.37859125856842e-05, "loss": 1.2773, "step": 350 }, { "epoch": 0.6660412757973734, "grad_norm": 0.09054925125258262, "learning_rate": 6.074919612369787e-05, "loss": 1.2854, "step": 355 }, { "epoch": 0.6754221388367729, "grad_norm": 0.0840484253491192, "learning_rate": 5.7754686066783045e-05, "loss": 1.2321, "step": 360 }, { "epoch": 0.6848030018761726, "grad_norm": 0.09091692153105359, "learning_rate": 5.4805602412798906e-05, "loss": 1.2519, "step": 365 }, { "epoch": 0.6941838649155723, "grad_norm": 0.0996295825165971, "learning_rate": 5.1905116312575475e-05, "loss": 1.2422, "step": 370 }, { "epoch": 0.7035647279549718, "grad_norm": 0.09167775721865827, "learning_rate": 4.905634665997371e-05, "loss": 1.274, "step": 375 }, { "epoch": 0.7129455909943715, "grad_norm": 0.09342003802996314, "learning_rate": 4.6262356738137937e-05, "loss": 1.2727, "step": 380 }, { "epoch": 0.7223264540337712, "grad_norm": 0.08770871963126985, "learning_rate": 4.3526150925546e-05, "loss": 1.2558, "step": 385 }, { "epoch": 0.7317073170731707, "grad_norm": 0.09069696254933435, "learning_rate": 4.0850671465400144e-05, "loss": 1.2593, "step": 390 }, { "epoch": 0.7410881801125704, "grad_norm": 0.08665209061432325, "learning_rate": 3.823879530183154e-05, "loss": 1.273, "step": 395 }, { "epoch": 0.7504690431519699, "grad_norm": 0.08862136722443326, "learning_rate": 3.569333098632109e-05, "loss": 1.2613, "step": 400 }, { "epoch": 0.7598499061913696, "grad_norm": 0.08931630016078942, "learning_rate": 3.3217015657663145e-05, "loss": 1.255, "step": 405 }, { "epoch": 0.7692307692307693, "grad_norm": 0.08603421106367509, "learning_rate": 3.081251209871872e-05, "loss": 1.2956, "step": 410 }, { "epoch": 0.7786116322701688, "grad_norm": 0.08449087840242356, "learning_rate": 2.848240587312433e-05, "loss": 1.2554, "step": 415 }, { "epoch": 0.7879924953095685, "grad_norm": 0.09050119999554507, "learning_rate": 2.622920254503416e-05, "loss": 1.2555, "step": 420 }, { "epoch": 0.797373358348968, "grad_norm": 0.08603303013311027, "learning_rate": 2.405532498488612e-05, "loss": 1.2445, "step": 425 }, { "epoch": 0.8067542213883677, "grad_norm": 0.0917770059937159, "learning_rate": 2.196311076408808e-05, "loss": 1.2499, "step": 430 }, { "epoch": 0.8161350844277674, "grad_norm": 0.09260597797294362, "learning_rate": 1.995480964142663e-05, "loss": 1.2643, "step": 435 }, { "epoch": 0.8255159474671669, "grad_norm": 0.08638741822571576, "learning_rate": 1.803258114390034e-05, "loss": 1.2381, "step": 440 }, { "epoch": 0.8348968105065666, "grad_norm": 0.08438081141056018, "learning_rate": 1.6198492244579722e-05, "loss": 1.2422, "step": 445 }, { "epoch": 0.8442776735459663, "grad_norm": 0.08400750233223092, "learning_rate": 1.445451513999012e-05, "loss": 1.2178, "step": 450 }, { "epoch": 0.8536585365853658, "grad_norm": 0.08554689021587018, "learning_rate": 1.2802525129408038e-05, "loss": 1.2406, "step": 455 }, { "epoch": 0.8630393996247655, "grad_norm": 0.08737158991700927, "learning_rate": 1.1244298598351077e-05, "loss": 1.2434, "step": 460 }, { "epoch": 0.8724202626641651, "grad_norm": 0.08730413757640722, "learning_rate": 9.781511108429909e-06, "loss": 1.2653, "step": 465 }, { "epoch": 0.8818011257035647, "grad_norm": 0.0881674821245138, "learning_rate": 8.415735595616203e-06, "loss": 1.2524, "step": 470 }, { "epoch": 0.8911819887429644, "grad_norm": 0.09353121525052323, "learning_rate": 7.1484406788639215e-06, "loss": 1.2523, "step": 475 }, { "epoch": 0.900562851782364, "grad_norm": 0.08427560236702278, "learning_rate": 5.980989080902721e-06, "loss": 1.2812, "step": 480 }, { "epoch": 0.9099437148217636, "grad_norm": 0.08684819671006813, "learning_rate": 4.914636162901798e-06, "loss": 1.2549, "step": 485 }, { "epoch": 0.9193245778611632, "grad_norm": 0.08436176928753658, "learning_rate": 3.950528574579415e-06, "loss": 1.2724, "step": 490 }, { "epoch": 0.9287054409005628, "grad_norm": 0.0881012431213454, "learning_rate": 3.089703021210033e-06, "loss": 1.2375, "step": 495 }, { "epoch": 0.9380863039399625, "grad_norm": 0.10177275096404065, "learning_rate": 2.333085148854708e-06, "loss": 1.2587, "step": 500 }, { "epoch": 0.9474671669793621, "grad_norm": 0.08489096282767854, "learning_rate": 1.6814885490135102e-06, "loss": 1.2745, "step": 505 }, { "epoch": 0.9568480300187617, "grad_norm": 0.08633339614238927, "learning_rate": 1.1356138837702702e-06, "loss": 1.2495, "step": 510 }, { "epoch": 0.9662288930581614, "grad_norm": 0.0865458432922912, "learning_rate": 6.960481323703638e-07, "loss": 1.2604, "step": 515 }, { "epoch": 0.975609756097561, "grad_norm": 0.0850043491504774, "learning_rate": 3.632639600416932e-07, "loss": 1.2568, "step": 520 }, { "epoch": 0.9849906191369606, "grad_norm": 0.08694509913799078, "learning_rate": 1.376192097375495e-07, "loss": 1.2397, "step": 525 }, { "epoch": 0.9943714821763602, "grad_norm": 0.08603902256571394, "learning_rate": 1.9356517347990378e-08, "loss": 1.2735, "step": 530 }, { "epoch": 1.0, "eval_loss": 1.1848469972610474, "eval_runtime": 1561.8445, "eval_samples_per_second": 8.566, "eval_steps_per_second": 0.536, "step": 533 }, { "epoch": 1.0, "step": 533, "total_flos": 6701415728676864.0, "train_loss": 1.2950908849208038, "train_runtime": 12657.6418, "train_samples_per_second": 2.694, "train_steps_per_second": 0.042 } ], "logging_steps": 5, "max_steps": 533, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6701415728676864.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }