{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 4942, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020234722784297856, "grad_norm": 8.72037124633789, "learning_rate": 1.25e-06, "loss": 1.209, "step": 50 }, { "epoch": 0.04046944556859571, "grad_norm": 5.450883865356445, "learning_rate": 2.5e-06, "loss": 0.9725, "step": 100 }, { "epoch": 0.060704168352893564, "grad_norm": 9.1498441696167, "learning_rate": 3.7500000000000005e-06, "loss": 0.6988, "step": 150 }, { "epoch": 0.08093889113719142, "grad_norm": 7.334131717681885, "learning_rate": 5e-06, "loss": 0.5294, "step": 200 }, { "epoch": 0.10117361392148927, "grad_norm": 9.698792457580566, "learning_rate": 4.9472796288485875e-06, "loss": 0.4225, "step": 250 }, { "epoch": 0.12140833670578713, "grad_norm": 4.219217777252197, "learning_rate": 4.894559257697175e-06, "loss": 0.3813, "step": 300 }, { "epoch": 0.141643059490085, "grad_norm": 12.876835823059082, "learning_rate": 4.841838886545762e-06, "loss": 0.333, "step": 350 }, { "epoch": 0.16187778227438285, "grad_norm": 8.669962882995605, "learning_rate": 4.789118515394349e-06, "loss": 0.3039, "step": 400 }, { "epoch": 0.1821125050586807, "grad_norm": 13.13197135925293, "learning_rate": 4.736398144242936e-06, "loss": 0.3216, "step": 450 }, { "epoch": 0.20234722784297854, "grad_norm": 10.586642265319824, "learning_rate": 4.683677773091523e-06, "loss": 0.2824, "step": 500 }, { "epoch": 0.20234722784297854, "eval_loss": 0.2994783818721771, "eval_runtime": 99.3532, "eval_samples_per_second": 41.851, "eval_steps_per_second": 0.654, "eval_tag_accuracy": 0.9016354016354017, "eval_tag_f1": 0.9666317639529918, "eval_tag_f1-CONTINUE": 0.9075288265882885, "eval_tag_f1-FINISH": 0.9075288265882885, "eval_tag_f1-TERMINATE": 0.8949396352427433, "eval_token_accuracy": 0.9666317639529918, "eval_token_f1": 0.9016354016354017, "eval_token_f1_negative": 0.9075288265882885, "eval_token_f1_positive": 0.8949396352427433, "step": 500 }, { "epoch": 0.2225819506272764, "grad_norm": 5.3532562255859375, "learning_rate": 4.63095740194011e-06, "loss": 0.2726, "step": 550 }, { "epoch": 0.24281667341157426, "grad_norm": 12.720158576965332, "learning_rate": 4.578237030788697e-06, "loss": 0.255, "step": 600 }, { "epoch": 0.2630513961958721, "grad_norm": 6.4184088706970215, "learning_rate": 4.525516659637284e-06, "loss": 0.254, "step": 650 }, { "epoch": 0.28328611898017, "grad_norm": 7.8461151123046875, "learning_rate": 4.4727962884858715e-06, "loss": 0.2647, "step": 700 }, { "epoch": 0.3035208417644678, "grad_norm": 3.1113088130950928, "learning_rate": 4.420075917334458e-06, "loss": 0.2442, "step": 750 }, { "epoch": 0.3237555645487657, "grad_norm": 8.432883262634277, "learning_rate": 4.367355546183045e-06, "loss": 0.247, "step": 800 }, { "epoch": 0.3439902873330635, "grad_norm": 4.828508377075195, "learning_rate": 4.314635175031632e-06, "loss": 0.2359, "step": 850 }, { "epoch": 0.3642250101173614, "grad_norm": 4.320587635040283, "learning_rate": 4.26191480388022e-06, "loss": 0.2549, "step": 900 }, { "epoch": 0.38445973290165925, "grad_norm": 8.660967826843262, "learning_rate": 4.209194432728807e-06, "loss": 0.2453, "step": 950 }, { "epoch": 0.4046944556859571, "grad_norm": 6.111636161804199, "learning_rate": 4.156474061577394e-06, "loss": 0.2447, "step": 1000 }, { "epoch": 0.4046944556859571, "eval_loss": 0.26576703786849976, "eval_runtime": 89.7698, "eval_samples_per_second": 46.318, "eval_steps_per_second": 0.724, "eval_tag_accuracy": 0.9086099086099086, "eval_tag_f1": 0.9704089737670849, "eval_tag_f1-CONTINUE": 0.9146067415730337, "eval_tag_f1-FINISH": 0.9146067415730337, "eval_tag_f1-TERMINATE": 0.9017071908949819, "eval_token_accuracy": 0.9704089737670849, "eval_token_f1": 0.9086099086099086, "eval_token_f1_negative": 0.9146067415730337, "eval_token_f1_positive": 0.9017071908949819, "step": 1000 }, { "epoch": 0.42492917847025496, "grad_norm": 6.170385837554932, "learning_rate": 4.103753690425981e-06, "loss": 0.2341, "step": 1050 }, { "epoch": 0.4451639012545528, "grad_norm": 9.265089988708496, "learning_rate": 4.051033319274568e-06, "loss": 0.2335, "step": 1100 }, { "epoch": 0.4653986240388507, "grad_norm": 8.301984786987305, "learning_rate": 3.9983129481231555e-06, "loss": 0.2176, "step": 1150 }, { "epoch": 0.4856333468231485, "grad_norm": 2.7331886291503906, "learning_rate": 3.945592576971743e-06, "loss": 0.2537, "step": 1200 }, { "epoch": 0.5058680696074463, "grad_norm": 3.475696325302124, "learning_rate": 3.89287220582033e-06, "loss": 0.2384, "step": 1250 }, { "epoch": 0.5261027923917442, "grad_norm": 10.670371055603027, "learning_rate": 3.840151834668917e-06, "loss": 0.2076, "step": 1300 }, { "epoch": 0.5463375151760421, "grad_norm": 7.286683559417725, "learning_rate": 3.7874314635175035e-06, "loss": 0.2209, "step": 1350 }, { "epoch": 0.56657223796034, "grad_norm": 5.6653876304626465, "learning_rate": 3.7347110923660906e-06, "loss": 0.2277, "step": 1400 }, { "epoch": 0.5868069607446378, "grad_norm": 8.548470497131348, "learning_rate": 3.6819907212146777e-06, "loss": 0.2178, "step": 1450 }, { "epoch": 0.6070416835289356, "grad_norm": 1.2667831182479858, "learning_rate": 3.629270350063265e-06, "loss": 0.2116, "step": 1500 }, { "epoch": 0.6070416835289356, "eval_loss": 0.21864494681358337, "eval_runtime": 87.99, "eval_samples_per_second": 47.255, "eval_steps_per_second": 0.739, "eval_tag_accuracy": 0.9336219336219336, "eval_tag_f1": 0.9717647246654861, "eval_tag_f1-CONTINUE": 0.9345351043643264, "eval_tag_f1-FINISH": 0.9345351043643264, "eval_tag_f1-TERMINATE": 0.9326829268292683, "eval_token_accuracy": 0.9717647246654861, "eval_token_f1": 0.9336219336219336, "eval_token_f1_negative": 0.9345351043643264, "eval_token_f1_positive": 0.9326829268292683, "step": 1500 }, { "epoch": 0.6272764063132336, "grad_norm": 12.82320785522461, "learning_rate": 3.576549978911852e-06, "loss": 0.2244, "step": 1550 }, { "epoch": 0.6475111290975314, "grad_norm": 5.0135955810546875, "learning_rate": 3.523829607760439e-06, "loss": 0.1991, "step": 1600 }, { "epoch": 0.6677458518818292, "grad_norm": 5.661144733428955, "learning_rate": 3.471109236609026e-06, "loss": 0.2301, "step": 1650 }, { "epoch": 0.687980574666127, "grad_norm": 4.150299549102783, "learning_rate": 3.4183888654576133e-06, "loss": 0.215, "step": 1700 }, { "epoch": 0.7082152974504249, "grad_norm": 1.3773038387298584, "learning_rate": 3.3656684943062e-06, "loss": 0.2415, "step": 1750 }, { "epoch": 0.7284500202347228, "grad_norm": 7.56788969039917, "learning_rate": 3.312948123154787e-06, "loss": 0.215, "step": 1800 }, { "epoch": 0.7486847430190207, "grad_norm": 9.647685050964355, "learning_rate": 3.260227752003374e-06, "loss": 0.2187, "step": 1850 }, { "epoch": 0.7689194658033185, "grad_norm": 3.352022409439087, "learning_rate": 3.2075073808519613e-06, "loss": 0.232, "step": 1900 }, { "epoch": 0.7891541885876163, "grad_norm": 6.138416290283203, "learning_rate": 3.1547870097005484e-06, "loss": 0.2067, "step": 1950 }, { "epoch": 0.8093889113719142, "grad_norm": 5.341176509857178, "learning_rate": 3.1020666385491355e-06, "loss": 0.2068, "step": 2000 }, { "epoch": 0.8093889113719142, "eval_loss": 0.202407106757164, "eval_runtime": 99.5295, "eval_samples_per_second": 41.777, "eval_steps_per_second": 0.653, "eval_tag_accuracy": 0.9377104377104377, "eval_tag_f1": 0.973316054937847, "eval_tag_f1-CONTINUE": 0.93849441937782, "eval_tag_f1-FINISH": 0.93849441937782, "eval_tag_f1-TERMINATE": 0.9369062119366626, "eval_token_accuracy": 0.973316054937847, "eval_token_f1": 0.9377104377104377, "eval_token_f1_negative": 0.93849441937782, "eval_token_f1_positive": 0.9369062119366626, "step": 2000 }, { "epoch": 0.8296236341562121, "grad_norm": 3.7065298557281494, "learning_rate": 3.0493462673977226e-06, "loss": 0.2186, "step": 2050 }, { "epoch": 0.8498583569405099, "grad_norm": 11.173847198486328, "learning_rate": 2.9966258962463097e-06, "loss": 0.2298, "step": 2100 }, { "epoch": 0.8700930797248078, "grad_norm": 7.461451053619385, "learning_rate": 2.943905525094897e-06, "loss": 0.2171, "step": 2150 }, { "epoch": 0.8903278025091056, "grad_norm": 5.774527549743652, "learning_rate": 2.891185153943484e-06, "loss": 0.2008, "step": 2200 }, { "epoch": 0.9105625252934035, "grad_norm": 13.355608940124512, "learning_rate": 2.8384647827920706e-06, "loss": 0.1928, "step": 2250 }, { "epoch": 0.9307972480777014, "grad_norm": 5.512803554534912, "learning_rate": 2.7857444116406586e-06, "loss": 0.2151, "step": 2300 }, { "epoch": 0.9510319708619992, "grad_norm": 7.355721473693848, "learning_rate": 2.7330240404892457e-06, "loss": 0.2313, "step": 2350 }, { "epoch": 0.971266693646297, "grad_norm": 8.145475387573242, "learning_rate": 2.680303669337833e-06, "loss": 0.206, "step": 2400 }, { "epoch": 0.9915014164305949, "grad_norm": 3.35742449760437, "learning_rate": 2.6275832981864195e-06, "loss": 0.2126, "step": 2450 }, { "epoch": 1.0117361392148927, "grad_norm": 4.693106651306152, "learning_rate": 2.5748629270350066e-06, "loss": 0.1958, "step": 2500 }, { "epoch": 1.0117361392148927, "eval_loss": 0.20388753712177277, "eval_runtime": 90.2103, "eval_samples_per_second": 46.092, "eval_steps_per_second": 0.721, "eval_tag_accuracy": 0.9321789321789322, "eval_tag_f1": 0.9744642522149032, "eval_tag_f1-CONTINUE": 0.9338338808071328, "eval_tag_f1-FINISH": 0.9338338808071328, "eval_tag_f1-TERMINATE": 0.9304390725209669, "eval_token_accuracy": 0.9744642522149032, "eval_token_f1": 0.9321789321789322, "eval_token_f1_negative": 0.9338338808071328, "eval_token_f1_positive": 0.9304390725209669, "step": 2500 }, { "epoch": 1.0319708619991905, "grad_norm": 4.066263675689697, "learning_rate": 2.5221425558835937e-06, "loss": 0.1705, "step": 2550 }, { "epoch": 1.0522055847834886, "grad_norm": 1.981619954109192, "learning_rate": 2.4694221847321804e-06, "loss": 0.2019, "step": 2600 }, { "epoch": 1.0724403075677864, "grad_norm": 17.264829635620117, "learning_rate": 2.416701813580768e-06, "loss": 0.1502, "step": 2650 }, { "epoch": 1.0926750303520842, "grad_norm": 8.771439552307129, "learning_rate": 2.363981442429355e-06, "loss": 0.1552, "step": 2700 }, { "epoch": 1.112909753136382, "grad_norm": 11.144837379455566, "learning_rate": 2.311261071277942e-06, "loss": 0.1537, "step": 2750 }, { "epoch": 1.13314447592068, "grad_norm": 3.879704475402832, "learning_rate": 2.2585407001265293e-06, "loss": 0.1816, "step": 2800 }, { "epoch": 1.1533791987049777, "grad_norm": 4.202072620391846, "learning_rate": 2.2058203289751164e-06, "loss": 0.1522, "step": 2850 }, { "epoch": 1.1736139214892756, "grad_norm": 19.130678176879883, "learning_rate": 2.1530999578237035e-06, "loss": 0.1646, "step": 2900 }, { "epoch": 1.1938486442735734, "grad_norm": 1.3073982000350952, "learning_rate": 2.10037958667229e-06, "loss": 0.1594, "step": 2950 }, { "epoch": 1.2140833670578712, "grad_norm": 6.3615851402282715, "learning_rate": 2.0476592155208773e-06, "loss": 0.1616, "step": 3000 }, { "epoch": 1.2140833670578712, "eval_loss": 0.21922506392002106, "eval_runtime": 90.428, "eval_samples_per_second": 45.981, "eval_steps_per_second": 0.719, "eval_tag_accuracy": 0.9391534391534392, "eval_tag_f1": 0.974827471052257, "eval_tag_f1-CONTINUE": 0.9403161122906346, "eval_tag_f1-FINISH": 0.9403161122906346, "eval_tag_f1-TERMINATE": 0.9379445670836399, "eval_token_accuracy": 0.974827471052257, "eval_token_f1": 0.9391534391534392, "eval_token_f1_negative": 0.9403161122906346, "eval_token_f1_positive": 0.9379445670836399, "step": 3000 }, { "epoch": 1.2343180898421693, "grad_norm": 5.983785152435303, "learning_rate": 1.9949388443694644e-06, "loss": 0.1884, "step": 3050 }, { "epoch": 1.254552812626467, "grad_norm": 7.194103717803955, "learning_rate": 1.9422184732180515e-06, "loss": 0.1682, "step": 3100 }, { "epoch": 1.274787535410765, "grad_norm": 8.178751945495605, "learning_rate": 1.8894981020666386e-06, "loss": 0.1621, "step": 3150 }, { "epoch": 1.2950222581950628, "grad_norm": 9.569229125976562, "learning_rate": 1.8367777309152257e-06, "loss": 0.162, "step": 3200 }, { "epoch": 1.3152569809793606, "grad_norm": 10.708954811096191, "learning_rate": 1.7840573597638128e-06, "loss": 0.1437, "step": 3250 }, { "epoch": 1.3354917037636584, "grad_norm": 8.722265243530273, "learning_rate": 1.7313369886124e-06, "loss": 0.163, "step": 3300 }, { "epoch": 1.3557264265479563, "grad_norm": 1.2432576417922974, "learning_rate": 1.6786166174609872e-06, "loss": 0.1529, "step": 3350 }, { "epoch": 1.375961149332254, "grad_norm": 8.486040115356445, "learning_rate": 1.6258962463095744e-06, "loss": 0.1509, "step": 3400 }, { "epoch": 1.396195872116552, "grad_norm": 7.521843910217285, "learning_rate": 1.5731758751581612e-06, "loss": 0.176, "step": 3450 }, { "epoch": 1.41643059490085, "grad_norm": 5.913136959075928, "learning_rate": 1.5204555040067484e-06, "loss": 0.1635, "step": 3500 }, { "epoch": 1.41643059490085, "eval_loss": 0.21361203491687775, "eval_runtime": 99.2858, "eval_samples_per_second": 41.879, "eval_steps_per_second": 0.655, "eval_tag_accuracy": 0.9389129389129389, "eval_tag_f1": 0.9748354538838472, "eval_tag_f1-CONTINUE": 0.9399243140964996, "eval_tag_f1-FINISH": 0.9399243140964996, "eval_tag_f1-TERMINATE": 0.937866927592955, "eval_token_accuracy": 0.9748354538838472, "eval_token_f1": 0.9389129389129389, "eval_token_f1_negative": 0.9399243140964996, "eval_token_f1_positive": 0.937866927592955, "step": 3500 }, { "epoch": 1.4366653176851476, "grad_norm": 3.9816882610321045, "learning_rate": 1.4677351328553355e-06, "loss": 0.1598, "step": 3550 }, { "epoch": 1.4569000404694457, "grad_norm": 11.924986839294434, "learning_rate": 1.4150147617039226e-06, "loss": 0.1722, "step": 3600 }, { "epoch": 1.4771347632537435, "grad_norm": 4.420969009399414, "learning_rate": 1.3622943905525097e-06, "loss": 0.1408, "step": 3650 }, { "epoch": 1.4973694860380413, "grad_norm": 11.450356483459473, "learning_rate": 1.3095740194010966e-06, "loss": 0.167, "step": 3700 }, { "epoch": 1.5176042088223392, "grad_norm": 8.506190299987793, "learning_rate": 1.2568536482496837e-06, "loss": 0.1547, "step": 3750 }, { "epoch": 1.537838931606637, "grad_norm": 2.8181309700012207, "learning_rate": 1.2041332770982708e-06, "loss": 0.1425, "step": 3800 }, { "epoch": 1.5580736543909348, "grad_norm": 2.5935609340667725, "learning_rate": 1.151412905946858e-06, "loss": 0.1547, "step": 3850 }, { "epoch": 1.5783083771752326, "grad_norm": 10.59027099609375, "learning_rate": 1.098692534795445e-06, "loss": 0.1495, "step": 3900 }, { "epoch": 1.5985430999595307, "grad_norm": 2.5813493728637695, "learning_rate": 1.0459721636440321e-06, "loss": 0.1305, "step": 3950 }, { "epoch": 1.6187778227438283, "grad_norm": 1.844016671180725, "learning_rate": 9.932517924926192e-07, "loss": 0.1566, "step": 4000 }, { "epoch": 1.6187778227438283, "eval_loss": 0.22482524812221527, "eval_runtime": 87.0812, "eval_samples_per_second": 47.749, "eval_steps_per_second": 0.746, "eval_tag_accuracy": 0.9377104377104377, "eval_tag_f1": 0.9749232650313393, "eval_tag_f1-CONTINUE": 0.9394718392147698, "eval_tag_f1-FINISH": 0.9394718392147698, "eval_tag_f1-TERMINATE": 0.9358434481050285, "eval_token_accuracy": 0.9749232650313393, "eval_token_f1": 0.9377104377104377, "eval_token_f1_negative": 0.9394718392147698, "eval_token_f1_positive": 0.9358434481050285, "step": 4000 }, { "epoch": 1.6390125455281264, "grad_norm": 9.678681373596191, "learning_rate": 9.405314213412063e-07, "loss": 0.1682, "step": 4050 }, { "epoch": 1.659247268312424, "grad_norm": 11.279572486877441, "learning_rate": 8.878110501897934e-07, "loss": 0.1655, "step": 4100 }, { "epoch": 1.679481991096722, "grad_norm": 6.408283710479736, "learning_rate": 8.350906790383805e-07, "loss": 0.1375, "step": 4150 }, { "epoch": 1.6997167138810199, "grad_norm": 8.360751152038574, "learning_rate": 7.823703078869676e-07, "loss": 0.148, "step": 4200 }, { "epoch": 1.7199514366653177, "grad_norm": 9.093498229980469, "learning_rate": 7.296499367355547e-07, "loss": 0.1511, "step": 4250 }, { "epoch": 1.7401861594496155, "grad_norm": 8.47696304321289, "learning_rate": 6.769295655841418e-07, "loss": 0.183, "step": 4300 }, { "epoch": 1.7604208822339134, "grad_norm": 5.258668422698975, "learning_rate": 6.242091944327289e-07, "loss": 0.1583, "step": 4350 }, { "epoch": 1.7806556050182114, "grad_norm": 2.8200266361236572, "learning_rate": 5.714888232813159e-07, "loss": 0.1392, "step": 4400 }, { "epoch": 1.800890327802509, "grad_norm": 1.5096231698989868, "learning_rate": 5.18768452129903e-07, "loss": 0.1252, "step": 4450 }, { "epoch": 1.821125050586807, "grad_norm": 7.513113975524902, "learning_rate": 4.660480809784902e-07, "loss": 0.165, "step": 4500 }, { "epoch": 1.821125050586807, "eval_loss": 0.21320512890815735, "eval_runtime": 90.0868, "eval_samples_per_second": 46.156, "eval_steps_per_second": 0.722, "eval_tag_accuracy": 0.9403559403559404, "eval_tag_f1": 0.9753503465214146, "eval_tag_f1-CONTINUE": 0.9416470588235294, "eval_tag_f1-FINISH": 0.9416470588235294, "eval_tag_f1-TERMINATE": 0.9390063944909002, "eval_token_accuracy": 0.9753503465214146, "eval_token_f1": 0.9403559403559404, "eval_token_f1_negative": 0.9416470588235294, "eval_token_f1_positive": 0.9390063944909002, "step": 4500 }, { "epoch": 1.8413597733711047, "grad_norm": 3.344792604446411, "learning_rate": 4.1332770982707723e-07, "loss": 0.1645, "step": 4550 }, { "epoch": 1.8615944961554027, "grad_norm": 1.2586418390274048, "learning_rate": 3.606073386756643e-07, "loss": 0.1497, "step": 4600 }, { "epoch": 1.8818292189397006, "grad_norm": 5.412999629974365, "learning_rate": 3.078869675242514e-07, "loss": 0.1586, "step": 4650 }, { "epoch": 1.9020639417239984, "grad_norm": 4.021740436553955, "learning_rate": 2.551665963728385e-07, "loss": 0.1804, "step": 4700 }, { "epoch": 1.9222986645082962, "grad_norm": 5.421517372131348, "learning_rate": 2.0244622522142556e-07, "loss": 0.1426, "step": 4750 }, { "epoch": 1.942533387292594, "grad_norm": 13.003674507141113, "learning_rate": 1.4972585407001267e-07, "loss": 0.1798, "step": 4800 }, { "epoch": 1.962768110076892, "grad_norm": 2.8676435947418213, "learning_rate": 9.700548291859976e-08, "loss": 0.1675, "step": 4850 }, { "epoch": 1.9830028328611897, "grad_norm": 5.923835277557373, "learning_rate": 4.4285111767186845e-08, "loss": 0.1406, "step": 4900 } ], "logging_steps": 50, "max_steps": 4942, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 1.105108458176471e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }