diff --git "a/checkpoint-9000/trainer_state.json" "b/checkpoint-9000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-9000/trainer_state.json" @@ -0,0 +1,13408 @@ +{ + "best_global_step": 9000, + "best_metric": 2.6552714315745756, + "best_model_checkpoint": "/gpfs/scratch/guoh/DNAFM/output/gencode_human_12.8k_12800/HNet_Ori-BPT3/checkpoint-9000", + "epoch": 2.5525849230551025, + "eval_steps": 500, + "global_step": 9000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.0028366782497695198, + "grad_norm": 590.2120361328125, + "loss": 144.5784, + "loss_ce": 170.91241455078125, + "loss_region": 0.030412333086133003, + "loss_total": 170.9428253173828, + "lr": 2.20454076850486e-05, + "router/selected_tokens_s0": 1.0, + "step": 10, + "tokens_trained": 0.03276544 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.0056733564995390395, + "grad_norm": 565.2921142578125, + "loss": 52.047, + "loss_ce": 28.61202049255371, + "loss_region": 0.03181665763258934, + "loss_total": 28.643836975097656, + "lr": 4.654030511288038e-05, + "router/selected_tokens_s0": 1.0, + "step": 20, + "tokens_trained": 0.06553088 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.00851003474930856, + "grad_norm": 361.24432373046875, + "loss": 18.4265, + "loss_ce": 16.737817764282227, + "loss_region": 0.03595759719610214, + "loss_total": 16.773775100708008, + "lr": 7.103520254071216e-05, + "router/selected_tokens_s0": 1.0, + "step": 30, + "tokens_trained": 0.09829632 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.011346712999078079, + "grad_norm": 649.0695190429688, + "loss": 8.0445, + "loss_ce": 11.410881996154785, + "loss_region": 0.03821098059415817, + "loss_total": 11.449092864990234, + "lr": 9.553009996854394e-05, + "router/selected_tokens_s0": 1.0, + "step": 40, + "tokens_trained": 0.13106176 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.014183391248847599, + "grad_norm": 534.2383422851562, + "loss": 9.3219, + "loss_ce": 9.884474754333496, + "loss_region": 0.040100596845149994, + "loss_total": 9.924575805664062, + "lr": 0.00012002499739637572, + "router/selected_tokens_s0": 1.0, + "step": 50, + "tokens_trained": 0.1638272 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.01702006949861712, + "grad_norm": 273.8401184082031, + "loss": 14.1755, + "loss_ce": 12.677406311035156, + "loss_region": 0.041250791400671005, + "loss_total": 12.718657493591309, + "lr": 0.00014451989482420748, + "router/selected_tokens_s0": 1.0, + "step": 60, + "tokens_trained": 0.19659264 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.01985674774838664, + "grad_norm": 544.6290893554688, + "loss": 14.1136, + "loss_ce": 14.262775421142578, + "loss_region": 0.042144227772951126, + "loss_total": 14.304919242858887, + "lr": 0.00016901479225203927, + "router/selected_tokens_s0": 1.0, + "step": 70, + "tokens_trained": 0.22935808 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.022693425998156158, + "grad_norm": 527.1918334960938, + "loss": 15.2492, + "loss_ce": 11.932450294494629, + "loss_region": 0.04246167093515396, + "loss_total": 11.9749116897583, + "lr": 0.00019350968967987104, + "router/selected_tokens_s0": 1.0, + "step": 80, + "tokens_trained": 0.26212192 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.025530104247925678, + "grad_norm": 343.09454345703125, + "loss": 12.0101, + "loss_ce": 6.092933177947998, + "loss_region": 0.04214272275567055, + "loss_total": 6.13507604598999, + "lr": 0.0002180045871077028, + "router/selected_tokens_s0": 1.0, + "step": 90, + "tokens_trained": 0.29488736 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.028366782497695198, + "grad_norm": 309.79541015625, + "loss": 9.8843, + "loss_ce": 5.214886665344238, + "loss_region": 0.041769951581954956, + "loss_total": 5.256656646728516, + "lr": 0.00024249948453553463, + "router/selected_tokens_s0": 1.0, + "step": 100, + "tokens_trained": 0.3276528 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.031203460747464717, + "grad_norm": 251.26068115234375, + "loss": 8.5835, + "loss_ce": 12.269608497619629, + "loss_region": 0.04041137546300888, + "loss_total": 12.310019493103027, + "lr": 0.00026699438196336637, + "router/selected_tokens_s0": 1.0, + "step": 110, + "tokens_trained": 0.36041744 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.03404013899723424, + "grad_norm": 148.94601440429688, + "loss": 6.4366, + "loss_ce": 3.2050940990448, + "loss_region": 0.03642381727695465, + "loss_total": 3.241518020629883, + "lr": 0.00029148927939119814, + "router/selected_tokens_s0": 1.0, + "step": 120, + "tokens_trained": 0.39318128 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.03687681724700376, + "grad_norm": 187.2681427001953, + "loss": 10.4928, + "loss_ce": 6.001107215881348, + "loss_region": 0.030254848301410675, + "loss_total": 6.031362056732178, + "lr": 0.00031598417681902996, + "router/selected_tokens_s0": 4752.0, + "step": 130, + "tokens_trained": 0.42594672 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.03971349549677328, + "grad_norm": 218.34559631347656, + "loss": 8.5742, + "loss_ce": 3.848691701889038, + "loss_region": 0.03400004655122757, + "loss_total": 3.8826918601989746, + "lr": 0.00034047907424686173, + "router/selected_tokens_s0": 7042.125, + "step": 140, + "tokens_trained": 0.458709112 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.042550173746542796, + "grad_norm": 215.60699462890625, + "loss": 4.5762, + "loss_ce": 5.0876851081848145, + "loss_region": 0.03198177367448807, + "loss_total": 5.119667053222656, + "lr": 0.0003649739716746935, + "router/selected_tokens_s0": 424.5, + "step": 150, + "tokens_trained": 0.491469992 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.045386851996312316, + "grad_norm": 147.6339111328125, + "loss": 5.8047, + "loss_ce": 8.435795783996582, + "loss_region": 0.03364315256476402, + "loss_total": 8.469438552856445, + "lr": 0.00038946886910252526, + "router/selected_tokens_s0": 536.875, + "step": 160, + "tokens_trained": 0.524234632 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.048223530246081836, + "grad_norm": 218.0553741455078, + "loss": 5.7968, + "loss_ce": 6.644444942474365, + "loss_region": 0.031727153807878494, + "loss_total": 6.676172256469727, + "lr": 0.0004139637665303571, + "router/selected_tokens_s0": 1833.5, + "step": 170, + "tokens_trained": 0.556999272 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.051060208495851356, + "grad_norm": 100.56309509277344, + "loss": 6.7503, + "loss_ce": 8.332029342651367, + "loss_region": 0.03232778236269951, + "loss_total": 8.364356994628906, + "lr": 0.0004384586639581888, + "router/selected_tokens_s0": 1649.75, + "step": 180, + "tokens_trained": 0.589762952 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.053896886745620876, + "grad_norm": 157.10765075683594, + "loss": 6.4449, + "loss_ce": 4.925128936767578, + "loss_region": 0.031663134694099426, + "loss_total": 4.956791877746582, + "lr": 0.0004629535613860206, + "router/selected_tokens_s0": 1687.375, + "step": 190, + "tokens_trained": 0.622527592 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.056733564995390395, + "grad_norm": 83.81340026855469, + "loss": 3.7524, + "loss_ce": 5.0940961837768555, + "loss_region": 0.02894311398267746, + "loss_total": 5.123039245605469, + "lr": 0.00048744845881385244, + "router/selected_tokens_s0": 3074.125, + "step": 200, + "tokens_trained": 0.655293032 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.059570243245159915, + "grad_norm": 169.4013671875, + "loss": 5.9481, + "loss_ce": 9.220865249633789, + "loss_region": 0.02949724718928337, + "loss_total": 9.250362396240234, + "lr": 0.0005119433562416841, + "router/selected_tokens_s0": 3610.375, + "step": 210, + "tokens_trained": 0.688057672 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.062406921494929435, + "grad_norm": 80.7753677368164, + "loss": 5.1122, + "loss_ce": 3.287958860397339, + "loss_region": 0.029488109052181244, + "loss_total": 3.3174469470977783, + "lr": 0.0005364382536695159, + "router/selected_tokens_s0": 2584.75, + "step": 220, + "tokens_trained": 0.720823112 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.06524359974469895, + "grad_norm": 89.39635467529297, + "loss": 3.3047, + "loss_ce": 2.1086361408233643, + "loss_region": 0.029821382835507393, + "loss_total": 2.1384575366973877, + "lr": 0.0005609331510973477, + "router/selected_tokens_s0": 3991.5, + "step": 230, + "tokens_trained": 0.753588552 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.06808027799446847, + "grad_norm": 90.892333984375, + "loss": 4.2563, + "loss_ce": 2.7003867626190186, + "loss_region": 0.030828693881630898, + "loss_total": 2.731215476989746, + "lr": 0.0005854280485251795, + "router/selected_tokens_s0": 4964.125, + "step": 240, + "tokens_trained": 0.786353992 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.070916956244238, + "grad_norm": 86.70359802246094, + "loss": 2.8849, + "loss_ce": 3.55375599861145, + "loss_region": 0.029162542894482613, + "loss_total": 3.582918643951416, + "lr": 0.0006099229459530113, + "router/selected_tokens_s0": 2891.75, + "step": 250, + "tokens_trained": 0.819119432 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.07375363449400751, + "grad_norm": 106.11075592041016, + "loss": 4.4058, + "loss_ce": 5.333348751068115, + "loss_region": 0.029971925541758537, + "loss_total": 5.363320827484131, + "lr": 0.0006344178433808431, + "router/selected_tokens_s0": 4181.375, + "step": 260, + "tokens_trained": 0.851884072 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.07659031274377703, + "grad_norm": 75.7653579711914, + "loss": 3.6076, + "loss_ce": 2.3445212841033936, + "loss_region": 0.029431568458676338, + "loss_total": 2.373952865600586, + "lr": 0.0006589127408086749, + "router/selected_tokens_s0": 3440.0, + "step": 270, + "tokens_trained": 0.884649512 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.07942699099354655, + "grad_norm": 95.4271469116211, + "loss": 2.8447, + "loss_ce": 3.030097007751465, + "loss_region": 0.030556708574295044, + "loss_total": 3.0606536865234375, + "lr": 0.0006834076382365066, + "router/selected_tokens_s0": 4730.5, + "step": 280, + "tokens_trained": 0.917414936 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.08226366924331607, + "grad_norm": 74.673828125, + "loss": 2.0288, + "loss_ce": 2.1509435176849365, + "loss_region": 0.028712084516882896, + "loss_total": 2.1796555519104004, + "lr": 0.0007079025356643384, + "router/selected_tokens_s0": 2658.625, + "step": 290, + "tokens_trained": 0.950180376 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.08510034749308559, + "grad_norm": 77.71709442138672, + "loss": 2.0227, + "loss_ce": 2.286048650741577, + "loss_region": 0.03060404770076275, + "loss_total": 2.316652774810791, + "lr": 0.0007323974330921702, + "router/selected_tokens_s0": 4752.0, + "step": 300, + "tokens_trained": 0.982945816 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.08793702574285511, + "grad_norm": 55.31558609008789, + "loss": 2.1281, + "loss_ce": 2.0437748432159424, + "loss_region": 0.030610591173171997, + "loss_total": 2.074385404586792, + "lr": 0.000756892330520002, + "router/selected_tokens_s0": 4748.625, + "step": 310, + "tokens_trained": 1.015711256 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.09077370399262463, + "grad_norm": 77.07698059082031, + "loss": 2.5761, + "loss_ce": 2.7218589782714844, + "loss_region": 0.03093603625893593, + "loss_total": 2.7527949810028076, + "lr": 0.0007813872279478337, + "router/selected_tokens_s0": 4946.625, + "step": 320, + "tokens_trained": 1.048476696 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.09361038224239415, + "grad_norm": 47.57994842529297, + "loss": 2.239, + "loss_ce": 1.9163914918899536, + "loss_region": 0.029897142201662064, + "loss_total": 1.9462885856628418, + "lr": 0.0008058821253756655, + "router/selected_tokens_s0": 4135.875, + "step": 330, + "tokens_trained": 1.081242136 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.09644706049216367, + "grad_norm": 58.62579345703125, + "loss": 2.8423, + "loss_ce": 3.2828376293182373, + "loss_region": 0.03434763103723526, + "loss_total": 3.317185163497925, + "lr": 0.0008303770228034974, + "router/selected_tokens_s0": 6686.5, + "step": 340, + "tokens_trained": 1.114007576 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.09928373874193319, + "grad_norm": 34.5246696472168, + "loss": 2.5891, + "loss_ce": 1.537825345993042, + "loss_region": 0.02885586954653263, + "loss_total": 1.5666812658309937, + "lr": 0.0008548719202313291, + "router/selected_tokens_s0": 154.125, + "step": 350, + "tokens_trained": 1.146773016 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.10212041699170271, + "grad_norm": 37.228973388671875, + "loss": 2.7756, + "loss_ce": 1.9871504306793213, + "loss_region": 0.029301652684807777, + "loss_total": 2.0164520740509033, + "lr": 0.0008793668176591608, + "router/selected_tokens_s0": 3631.75, + "step": 360, + "tokens_trained": 1.179538456 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.10495709524147223, + "grad_norm": 30.546344757080078, + "loss": 2.4884, + "loss_ce": 1.4886701107025146, + "loss_region": 0.031588103622198105, + "loss_total": 1.5202581882476807, + "lr": 0.0009038617150869926, + "router/selected_tokens_s0": 5236.625, + "step": 370, + "tokens_trained": 1.212303896 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.10779377349124175, + "grad_norm": 45.68803405761719, + "loss": 2.8937, + "loss_ce": 2.285705804824829, + "loss_region": 0.030362222343683243, + "loss_total": 2.316067934036255, + "lr": 0.0009283566125148244, + "router/selected_tokens_s0": 4493.625, + "step": 380, + "tokens_trained": 1.245068536 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.11063045174101127, + "grad_norm": 32.428009033203125, + "loss": 1.9186, + "loss_ce": 1.5672893524169922, + "loss_region": 0.03746495023369789, + "loss_total": 1.6047543287277222, + "lr": 0.0009528515099426562, + "router/selected_tokens_s0": 8134.375, + "step": 390, + "tokens_trained": 1.277833176 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.11346712999078079, + "grad_norm": 35.54498291015625, + "loss": 1.6959, + "loss_ce": 1.6413251161575317, + "loss_region": 0.026098042726516724, + "loss_total": 1.667423129081726, + "lr": 0.000977346407370488, + "router/selected_tokens_s0": 625.5, + "step": 400, + "tokens_trained": 1.310598616 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.11630380824055031, + "grad_norm": 8.186758041381836, + "loss": 1.671, + "loss_ce": 1.324172019958496, + "loss_region": 0.03537043184041977, + "loss_total": 1.3595424890518188, + "lr": 0.0010018413047983197, + "router/selected_tokens_s0": 7117.75, + "step": 410, + "tokens_trained": 1.343364056 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.11914048649031983, + "grad_norm": 32.364845275878906, + "loss": 1.7487, + "loss_ce": 1.6946724653244019, + "loss_region": 0.030674295499920845, + "loss_total": 1.7253468036651611, + "lr": 0.0010263362022261515, + "router/selected_tokens_s0": 4591.75, + "step": 420, + "tokens_trained": 1.376129496 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.12197716474008935, + "grad_norm": 51.924861907958984, + "loss": 1.6652, + "loss_ce": 1.7081111669540405, + "loss_region": 0.029956262558698654, + "loss_total": 1.738067388534546, + "lr": 0.0010508310996539833, + "router/selected_tokens_s0": 4165.25, + "step": 430, + "tokens_trained": 1.408889864 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.12481384298985887, + "grad_norm": 31.08187484741211, + "loss": 1.6269, + "loss_ce": 1.688795804977417, + "loss_region": 0.030442532151937485, + "loss_total": 1.71923828125, + "lr": 0.0010753259970818151, + "router/selected_tokens_s0": 4528.875, + "step": 440, + "tokens_trained": 1.441655304 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.1276505212396284, + "grad_norm": 9.750688552856445, + "loss": 1.646, + "loss_ce": 1.342025637626648, + "loss_region": 0.0289932768791914, + "loss_total": 1.371018886566162, + "lr": 0.001099820894509647, + "router/selected_tokens_s0": 3472.375, + "step": 450, + "tokens_trained": 1.474420744 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.1304871994893979, + "grad_norm": 69.62458038330078, + "loss": 2.646, + "loss_ce": 2.835515022277832, + "loss_region": 0.03730851039290428, + "loss_total": 2.872823476791382, + "lr": 0.0011243157919374788, + "router/selected_tokens_s0": 7822.125, + "step": 460, + "tokens_trained": 1.507186184 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.13332387773916743, + "grad_norm": 62.241451263427734, + "loss": 2.2121, + "loss_ce": 1.9173500537872314, + "loss_region": 0.033008284866809845, + "loss_total": 1.9503583908081055, + "lr": 0.0011488106893653104, + "router/selected_tokens_s0": 5854.125, + "step": 470, + "tokens_trained": 1.539950832 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.13616055598893695, + "grad_norm": 36.45135498046875, + "loss": 1.8122, + "loss_ce": 1.579708456993103, + "loss_region": 0.030225276947021484, + "loss_total": 1.6099337339401245, + "lr": 0.0011733055867931422, + "router/selected_tokens_s0": 4330.5, + "step": 480, + "tokens_trained": 1.572715472 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.13899723423870647, + "grad_norm": 13.028325080871582, + "loss": 1.5027, + "loss_ce": 1.357754111289978, + "loss_region": 0.03526536747813225, + "loss_total": 1.393019437789917, + "lr": 0.001197800484220974, + "router/selected_tokens_s0": 7119.25, + "step": 490, + "tokens_trained": 1.605480912 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.141833912488476, + "grad_norm": 24.705984115600586, + "loss": 1.6096, + "loss_ce": 1.6097279787063599, + "loss_region": 0.02911153808236122, + "loss_total": 1.6388394832611084, + "lr": 0.0012222953816488059, + "router/selected_tokens_s0": 3648.75, + "step": 500, + "tokens_trained": 1.638244216 + }, + { + "epoch": 0.141833912488476, + "eval_ppl": 4.8348835473380465, + "eval_runtime": 2.9238, + "step": 500, + "tokens_trained": 1.638244216 + }, + { + "epoch": 0.141833912488476, + "eval_F": 0.3934690889573574, + "eval_F_cds": 0.29905151571508276, + "eval_F_dig": 0.4478214443836758, + "eval_F_exon": 0.39103450221457386, + "eval_F_intron": 0.40873021991492037, + "eval_F_nig": 0.4262229153142855, + "eval_F_promoter": 0.30306008909923465, + "eval_F_utr": 0.3906123042448191, + "eval_G": 0.49025372407568035, + "eval_G_cds": 0.48331595902636837, + "eval_G_dig": 0.49727705981261555, + "eval_G_exon": 0.4909996295084916, + "eval_G_intron": 0.4915825135015993, + "eval_G_nig": 0.49304083637658525, + "eval_G_promoter": 0.48200754687828323, + "eval_G_utr": 0.4901697268782234, + "eval_avg_bp_per_token": 2.541495705926663, + "eval_bp_per_token/cds": 3.343905472636816, + "eval_bp_per_token/dig": 2.2330328583890666, + "eval_bp_per_token/exon": 2.5573190967462667, + "eval_bp_per_token/intron": 2.4466015755041455, + "eval_bp_per_token/nig": 2.346190136826938, + "eval_bp_per_token/promoter": 3.299675661589863, + "eval_bp_per_token/utr": 2.56008320560543, + "eval_ppl_cds": 5.567015659246301, + "eval_ppl_dig": 4.898425899350941, + "eval_ppl_exon": 4.9662705320329295, + "eval_ppl_intron": 4.767518067357663, + "eval_ppl_nig": 4.6987085494689405, + "eval_ppl_promoter": 5.216405144788708, + "eval_ppl_utr": 4.913846632347962, + "step": 500, + "tokens_trained": 1.638244216 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.1446705907382455, + "grad_norm": 37.46432113647461, + "loss": 1.5582, + "loss_ce": 1.5548115968704224, + "loss_region": 0.02565930411219597, + "loss_total": 1.5804709196090698, + "lr": 0.0012243786686061229, + "router/selected_tokens_s0": 1004.25, + "step": 510, + "tokens_trained": 1.671005424 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.14750726898801503, + "grad_norm": 27.722349166870117, + "loss": 1.5672, + "loss_ce": 1.478359341621399, + "loss_region": 0.031882915645837784, + "loss_total": 1.510242223739624, + "lr": 0.0012239717766222718, + "router/selected_tokens_s0": 5380.75, + "step": 520, + "tokens_trained": 1.703770864 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.15034394723778455, + "grad_norm": 26.949983596801758, + "loss": 1.6157, + "loss_ce": 1.4986213445663452, + "loss_region": 0.03651271015405655, + "loss_total": 1.5351340770721436, + "lr": 0.001223564884638421, + "router/selected_tokens_s0": 7781.0, + "step": 530, + "tokens_trained": 1.736536304 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.15318062548755407, + "grad_norm": 28.232316970825195, + "loss": 1.6637, + "loss_ce": 1.4607714414596558, + "loss_region": 0.025137916207313538, + "loss_total": 1.485909342765808, + "lr": 0.00122315799265457, + "router/selected_tokens_s0": 612.875, + "step": 540, + "tokens_trained": 1.769301744 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.1560173037373236, + "grad_norm": 23.33485221862793, + "loss": 1.4993, + "loss_ce": 1.4412897825241089, + "loss_region": 0.035474810749292374, + "loss_total": 1.4767645597457886, + "lr": 0.001222751100670719, + "router/selected_tokens_s0": 7357.5, + "step": 550, + "tokens_trained": 1.802067184 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.1588539819870931, + "grad_norm": 21.005512237548828, + "loss": 1.4335, + "loss_ce": 1.3612841367721558, + "loss_region": 0.029854778200387955, + "loss_total": 1.3911389112472534, + "lr": 0.001222344208686868, + "router/selected_tokens_s0": 4172.125, + "step": 560, + "tokens_trained": 1.834832624 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.16169066023686263, + "grad_norm": 19.53492546081543, + "loss": 1.4383, + "loss_ce": 1.4045627117156982, + "loss_region": 0.02937491238117218, + "loss_total": 1.433937668800354, + "lr": 0.0012219373167030169, + "router/selected_tokens_s0": 3881.875, + "step": 570, + "tokens_trained": 1.867598064 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.16452733848663215, + "grad_norm": 25.31780242919922, + "loss": 1.7004, + "loss_ce": 1.591187834739685, + "loss_region": 0.03149839863181114, + "loss_total": 1.6226862668991089, + "lr": 0.0012215304247191658, + "router/selected_tokens_s0": 5153.875, + "step": 580, + "tokens_trained": 1.900363504 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.16736401673640167, + "grad_norm": 16.421045303344727, + "loss": 1.5092, + "loss_ce": 1.2439810037612915, + "loss_region": 0.02931862138211727, + "loss_total": 1.2732995748519897, + "lr": 0.0012211235327353148, + "router/selected_tokens_s0": 3840.5, + "step": 590, + "tokens_trained": 1.933128944 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.17020069498617119, + "grad_norm": 25.38547706604004, + "loss": 1.5893, + "loss_ce": 1.5482516288757324, + "loss_region": 0.025499241426587105, + "loss_total": 1.5737508535385132, + "lr": 0.0012207166407514638, + "router/selected_tokens_s0": 1237.25, + "step": 600, + "tokens_trained": 1.96589048 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.1730373732359407, + "grad_norm": 14.48205852508545, + "loss": 1.3098, + "loss_ce": 1.2969579696655273, + "loss_region": 0.03318855166435242, + "loss_total": 1.3301465511322021, + "lr": 0.0012203097487676127, + "router/selected_tokens_s0": 6087.625, + "step": 610, + "tokens_trained": 1.99865592 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.17587405148571023, + "grad_norm": 10.29987907409668, + "loss": 1.2844, + "loss_ce": 1.2728289365768433, + "loss_region": 0.03153729811310768, + "loss_total": 1.3043662309646606, + "lr": 0.0012199028567837617, + "router/selected_tokens_s0": 5177.0, + "step": 620, + "tokens_trained": 2.03142136 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.17871072973547975, + "grad_norm": 14.114507675170898, + "loss": 1.2792, + "loss_ce": 1.2729930877685547, + "loss_region": 0.03177153319120407, + "loss_total": 1.3047646284103394, + "lr": 0.0012194959647999107, + "router/selected_tokens_s0": 5318.5, + "step": 630, + "tokens_trained": 2.0641868 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.18154740798524927, + "grad_norm": 35.086570739746094, + "loss": 1.327, + "loss_ce": 1.4959396123886108, + "loss_region": 0.031267955899238586, + "loss_total": 1.527207612991333, + "lr": 0.0012190890728160596, + "router/selected_tokens_s0": 5018.625, + "step": 640, + "tokens_trained": 2.09695224 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.18438408623501878, + "grad_norm": 12.891855239868164, + "loss": 1.3231, + "loss_ce": 1.251932978630066, + "loss_region": 0.030069500207901, + "loss_total": 1.2820024490356445, + "lr": 0.0012186821808322086, + "router/selected_tokens_s0": 4308.125, + "step": 650, + "tokens_trained": 2.12971768 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.1872207644847883, + "grad_norm": 16.94170570373535, + "loss": 1.273, + "loss_ce": 1.303807258605957, + "loss_region": 0.030183279886841774, + "loss_total": 1.3339905738830566, + "lr": 0.0012182752888483576, + "router/selected_tokens_s0": 4374.375, + "step": 660, + "tokens_trained": 2.16248312 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.19005744273455782, + "grad_norm": 8.820389747619629, + "loss": 1.291, + "loss_ce": 1.2488102912902832, + "loss_region": 0.030493643134832382, + "loss_total": 1.2793039083480835, + "lr": 0.0012178683968645065, + "router/selected_tokens_s0": 4566.875, + "step": 670, + "tokens_trained": 2.19524856 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.19289412098432734, + "grad_norm": 12.072690963745117, + "loss": 1.2551, + "loss_ce": 1.257431149482727, + "loss_region": 0.02906171977519989, + "loss_total": 1.2864928245544434, + "lr": 0.0012174615048806555, + "router/selected_tokens_s0": 3676.75, + "step": 680, + "tokens_trained": 2.228014 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.19573079923409686, + "grad_norm": 3.4100522994995117, + "loss": 1.2685, + "loss_ce": 1.217279314994812, + "loss_region": 0.03290281072258949, + "loss_total": 1.2501821517944336, + "lr": 0.0012170546128968045, + "router/selected_tokens_s0": 5992.0, + "step": 690, + "tokens_trained": 2.26077944 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.19856747748386638, + "grad_norm": 6.675322532653809, + "loss": 1.2504, + "loss_ce": 1.1835153102874756, + "loss_region": 0.031250134110450745, + "loss_total": 1.2147654294967651, + "lr": 0.0012166477209129534, + "router/selected_tokens_s0": 5040.625, + "step": 700, + "tokens_trained": 2.29354488 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.2014041557336359, + "grad_norm": 21.388051986694336, + "loss": 1.267, + "loss_ce": 1.3746044635772705, + "loss_region": 0.027913136407732964, + "loss_total": 1.402517557144165, + "lr": 0.0012162408289291026, + "router/selected_tokens_s0": 2922.75, + "step": 710, + "tokens_trained": 2.32631032 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.20424083398340542, + "grad_norm": 12.917130470275879, + "loss": 1.3025, + "loss_ce": 1.2145620584487915, + "loss_region": 0.031132886186242104, + "loss_total": 1.2456949949264526, + "lr": 0.0012158339369452516, + "router/selected_tokens_s0": 4968.875, + "step": 720, + "tokens_trained": 2.35907576 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.20707751223317494, + "grad_norm": 8.888051986694336, + "loss": 1.2457, + "loss_ce": 1.185524821281433, + "loss_region": 0.03197301924228668, + "loss_total": 1.2174978256225586, + "lr": 0.0012154270449614005, + "router/selected_tokens_s0": 5463.0, + "step": 730, + "tokens_trained": 2.3918396 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.20991419048294446, + "grad_norm": 13.051305770874023, + "loss": 1.2446, + "loss_ce": 1.1078685522079468, + "loss_region": 0.0308807585388422, + "loss_total": 1.138749361038208, + "lr": 0.0012150201529775495, + "router/selected_tokens_s0": 4844.0, + "step": 740, + "tokens_trained": 2.424600048 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.21275086873271398, + "grad_norm": 6.593105316162109, + "loss": 1.2851, + "loss_ce": 1.255039930343628, + "loss_region": 0.029710784554481506, + "loss_total": 1.2847506999969482, + "lr": 0.0012146132609936982, + "router/selected_tokens_s0": 4083.875, + "step": 750, + "tokens_trained": 2.457364688 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.2155875469824835, + "grad_norm": 3.900451183319092, + "loss": 1.2291, + "loss_ce": 1.1926592588424683, + "loss_region": 0.030736476182937622, + "loss_total": 1.2233957052230835, + "lr": 0.0012142063690098472, + "router/selected_tokens_s0": 4719.25, + "step": 760, + "tokens_trained": 2.490130128 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.21842422523225302, + "grad_norm": 8.001019477844238, + "loss": 1.2285, + "loss_ce": 1.1942657232284546, + "loss_region": 0.03041156381368637, + "loss_total": 1.224677324295044, + "lr": 0.0012137994770259962, + "router/selected_tokens_s0": 4525.75, + "step": 770, + "tokens_trained": 2.522895568 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.22126090348202254, + "grad_norm": 5.169371128082275, + "loss": 1.2072, + "loss_ce": 1.2079213857650757, + "loss_region": 0.031087037175893784, + "loss_total": 1.2390084266662598, + "lr": 0.0012133925850421454, + "router/selected_tokens_s0": 4938.25, + "step": 780, + "tokens_trained": 2.555659392 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.22409758173179206, + "grad_norm": 8.434707641601562, + "loss": 1.2079, + "loss_ce": 1.2038490772247314, + "loss_region": 0.02821769006550312, + "loss_total": 1.2320667505264282, + "lr": 0.0012129856930582943, + "router/selected_tokens_s0": 3119.875, + "step": 790, + "tokens_trained": 2.588422136 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.22693425998156158, + "grad_norm": 8.451072692871094, + "loss": 1.2072, + "loss_ce": 1.2617510557174683, + "loss_region": 0.0316130593419075, + "loss_total": 1.29336416721344, + "lr": 0.0012125788010744433, + "router/selected_tokens_s0": 5238.75, + "step": 800, + "tokens_trained": 2.621187576 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.2297709382313311, + "grad_norm": 12.750673294067383, + "loss": 1.2283, + "loss_ce": 1.2528263330459595, + "loss_region": 0.03109751269221306, + "loss_total": 1.283923864364624, + "lr": 0.0012121719090905923, + "router/selected_tokens_s0": 4940.75, + "step": 810, + "tokens_trained": 2.653953016 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.23260761648110062, + "grad_norm": 10.307655334472656, + "loss": 1.2544, + "loss_ce": 1.2496147155761719, + "loss_region": 0.02913491614162922, + "loss_total": 1.2787495851516724, + "lr": 0.0012117650171067412, + "router/selected_tokens_s0": 3717.75, + "step": 820, + "tokens_trained": 2.686718456 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.23544429473087014, + "grad_norm": 0.6592714190483093, + "loss": 1.2022, + "loss_ce": 1.0889158248901367, + "loss_region": 0.031184613704681396, + "loss_total": 1.120100498199463, + "lr": 0.0012113581251228902, + "router/selected_tokens_s0": 5037.375, + "step": 830, + "tokens_trained": 2.71948036 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.23828097298063966, + "grad_norm": 3.0865817070007324, + "loss": 1.1958, + "loss_ce": 1.267112374305725, + "loss_region": 0.02916303649544716, + "loss_total": 1.2962753772735596, + "lr": 0.0012109512331390391, + "router/selected_tokens_s0": 3734.375, + "step": 840, + "tokens_trained": 2.7522458 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.24111765123040918, + "grad_norm": 2.513849973678589, + "loss": 1.2014, + "loss_ce": 1.108485221862793, + "loss_region": 0.0302209984511137, + "loss_total": 1.1387062072753906, + "lr": 0.0012105443411551881, + "router/selected_tokens_s0": 4417.125, + "step": 850, + "tokens_trained": 2.78501124 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.2439543294801787, + "grad_norm": 5.594594478607178, + "loss": 1.206, + "loss_ce": 1.1815146207809448, + "loss_region": 0.031508028507232666, + "loss_total": 1.2130227088928223, + "lr": 0.001210137449171337, + "router/selected_tokens_s0": 5212.875, + "step": 860, + "tokens_trained": 2.81777668 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.24679100772994822, + "grad_norm": 2.2655980587005615, + "loss": 1.1897, + "loss_ce": 1.2304372787475586, + "loss_region": 0.031548820436000824, + "loss_total": 1.2619861364364624, + "lr": 0.001209730557187486, + "router/selected_tokens_s0": 5213.25, + "step": 870, + "tokens_trained": 2.85054212 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.24962768597971774, + "grad_norm": 4.335860252380371, + "loss": 1.1897, + "loss_ce": 1.2337130308151245, + "loss_region": 0.02997858263552189, + "loss_total": 1.2636916637420654, + "lr": 0.001209323665203635, + "router/selected_tokens_s0": 4252.5, + "step": 880, + "tokens_trained": 2.88330756 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.25246436422948726, + "grad_norm": 12.377155303955078, + "loss": 1.1966, + "loss_ce": 1.1369762420654297, + "loss_region": 0.029613491147756577, + "loss_total": 1.1665897369384766, + "lr": 0.001208916773219784, + "router/selected_tokens_s0": 4027.75, + "step": 890, + "tokens_trained": 2.916073 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.2553010424792568, + "grad_norm": 7.238094806671143, + "loss": 1.2143, + "loss_ce": 1.1700671911239624, + "loss_region": 0.029774101451039314, + "loss_total": 1.1998412609100342, + "lr": 0.001208509881235933, + "router/selected_tokens_s0": 4116.875, + "step": 900, + "tokens_trained": 2.94883828 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.2581377207290263, + "grad_norm": 3.2694191932678223, + "loss": 1.1892, + "loss_ce": 1.1454379558563232, + "loss_region": 0.029824109748005867, + "loss_total": 1.1752620935440063, + "lr": 0.001208102989252082, + "router/selected_tokens_s0": 4152.625, + "step": 910, + "tokens_trained": 2.981597288 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.2609743989787958, + "grad_norm": 9.457625389099121, + "loss": 1.2038, + "loss_ce": 1.3160332441329956, + "loss_region": 0.030873605981469154, + "loss_total": 1.3469069004058838, + "lr": 0.0012076960972682309, + "router/selected_tokens_s0": 4797.5, + "step": 920, + "tokens_trained": 3.014362456 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.26381107722856534, + "grad_norm": 4.293655872344971, + "loss": 1.1978, + "loss_ce": 1.1440948247909546, + "loss_region": 0.02935035713016987, + "loss_total": 1.173445224761963, + "lr": 0.0012072892052843798, + "router/selected_tokens_s0": 3829.5, + "step": 930, + "tokens_trained": 3.047127096 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.26664775547833486, + "grad_norm": 1.7136532068252563, + "loss": 1.1906, + "loss_ce": 1.1432236433029175, + "loss_region": 0.028851088136434555, + "loss_total": 1.1720746755599976, + "lr": 0.0012068823133005288, + "router/selected_tokens_s0": 3479.125, + "step": 940, + "tokens_trained": 3.079892536 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.2694844337281044, + "grad_norm": 4.0433244705200195, + "loss": 1.1868, + "loss_ce": 1.168936014175415, + "loss_region": 0.02876598760485649, + "loss_total": 1.1977020502090454, + "lr": 0.0012064754213166778, + "router/selected_tokens_s0": 3396.25, + "step": 950, + "tokens_trained": 3.11265336 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.2723211119778739, + "grad_norm": 6.829047203063965, + "loss": 1.1828, + "loss_ce": 1.2480430603027344, + "loss_region": 0.02934931591153145, + "loss_total": 1.2773923873901367, + "lr": 0.001206068529332827, + "router/selected_tokens_s0": 3843.75, + "step": 960, + "tokens_trained": 3.1454188 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.2751577902276434, + "grad_norm": 5.5668439865112305, + "loss": 1.1882, + "loss_ce": 1.1349202394485474, + "loss_region": 0.0297370757907629, + "loss_total": 1.1646573543548584, + "lr": 0.001205661637348976, + "router/selected_tokens_s0": 4102.5, + "step": 970, + "tokens_trained": 3.17818424 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.27799446847741294, + "grad_norm": 3.729381561279297, + "loss": 1.1839, + "loss_ce": 1.1995916366577148, + "loss_region": 0.03041483648121357, + "loss_total": 1.230006456375122, + "lr": 0.0012052547453651249, + "router/selected_tokens_s0": 4537.125, + "step": 980, + "tokens_trained": 3.21094968 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.28083114672718246, + "grad_norm": 2.7978885173797607, + "loss": 1.1739, + "loss_ce": 1.1886447668075562, + "loss_region": 0.030223874375224113, + "loss_total": 1.218868613243103, + "lr": 0.0012048478533812738, + "router/selected_tokens_s0": 4418.875, + "step": 990, + "tokens_trained": 3.24371512 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.283667824976952, + "grad_norm": 2.7768421173095703, + "loss": 1.1695, + "loss_ce": 1.1791244745254517, + "loss_region": 0.03016069531440735, + "loss_total": 1.2092851400375366, + "lr": 0.0012044409613974226, + "router/selected_tokens_s0": 4373.0, + "step": 1000, + "tokens_trained": 3.27648056 + }, + { + "epoch": 0.283667824976952, + "eval_ppl": 3.1457362037176693, + "eval_runtime": 2.5704, + "step": 1000, + "tokens_trained": 3.27648056 + }, + { + "epoch": 0.283667824976952, + "eval_F": 0.35905403615092213, + "eval_F_cds": 0.3614752043728926, + "eval_F_dig": 0.36203349219991143, + "eval_F_exon": 0.3609332242502892, + "eval_F_intron": 0.3608845011093654, + "eval_F_nig": 0.36360427639485304, + "eval_F_promoter": 0.3446594753609168, + "eval_F_utr": 0.35993294503032014, + "eval_G": 0.4747950002316863, + "eval_G_cds": 0.4875693056072159, + "eval_G_dig": 0.4165539971384483, + "eval_G_exon": 0.4825983323253731, + "eval_G_intron": 0.4746974505122046, + "eval_G_nig": 0.4719204972271849, + "eval_G_promoter": 0.47860970096474814, + "eval_G_utr": 0.4806883865646302, + "eval_avg_bp_per_token": 2.785096111772066, + "eval_bp_per_token/cds": 2.7664414817466, + "eval_bp_per_token/dig": 2.7621753830659665, + "eval_bp_per_token/exon": 2.770595591683602, + "eval_bp_per_token/intron": 2.7709696507497057, + "eval_bp_per_token/nig": 2.7502426811780905, + "eval_bp_per_token/promoter": 2.90141450181467, + "eval_bp_per_token/utr": 2.77829527362593, + "eval_ppl_cds": 3.7937951600140427, + "eval_ppl_dig": 1.292568207392483, + "eval_ppl_exon": 3.5063285971819904, + "eval_ppl_intron": 3.1623742022954864, + "eval_ppl_nig": 3.03123217862896, + "eval_ppl_promoter": 3.420873133996253, + "eval_ppl_utr": 3.4079030610184535, + "step": 1000, + "tokens_trained": 3.27648056 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.2865045032267215, + "grad_norm": 1.750190258026123, + "loss": 1.1681, + "loss_ce": 1.1951100826263428, + "loss_region": 0.029561972245573997, + "loss_total": 1.2246720790863037, + "lr": 0.0012040340694135716, + "router/selected_tokens_s0": 3974.5, + "step": 1010, + "tokens_trained": 3.309246 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.289341181476491, + "grad_norm": 5.037286758422852, + "loss": 1.1855, + "loss_ce": 1.1606330871582031, + "loss_region": 0.030172061175107956, + "loss_total": 1.190805196762085, + "lr": 0.0012036271774297205, + "router/selected_tokens_s0": 4388.375, + "step": 1020, + "tokens_trained": 3.34201144 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.29217785972626054, + "grad_norm": 5.963747024536133, + "loss": 1.1794, + "loss_ce": 1.116599678993225, + "loss_region": 0.030543407425284386, + "loss_total": 1.1471431255340576, + "lr": 0.0012032202854458697, + "router/selected_tokens_s0": 4640.0, + "step": 1030, + "tokens_trained": 3.37477688 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.29501453797603006, + "grad_norm": 4.626336574554443, + "loss": 1.1934, + "loss_ce": 1.094927430152893, + "loss_region": 0.02999301068484783, + "loss_total": 1.1249204874038696, + "lr": 0.0012028133934620187, + "router/selected_tokens_s0": 4248.5, + "step": 1040, + "tokens_trained": 3.40754232 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.2978512162257996, + "grad_norm": 4.208251476287842, + "loss": 1.1843, + "loss_ce": 1.1818771362304688, + "loss_region": 0.030715491622686386, + "loss_total": 1.212592601776123, + "lr": 0.0012024065014781676, + "router/selected_tokens_s0": 4729.75, + "step": 1050, + "tokens_trained": 3.44030696 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.3006878944755691, + "grad_norm": 2.3673582077026367, + "loss": 1.1726, + "loss_ce": 1.1216882467269897, + "loss_region": 0.030366381630301476, + "loss_total": 1.1520546674728394, + "lr": 0.0012019996094943166, + "router/selected_tokens_s0": 4503.625, + "step": 1060, + "tokens_trained": 3.4730724 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.3035245727253386, + "grad_norm": 2.6513352394104004, + "loss": 1.1707, + "loss_ce": 1.1285063028335571, + "loss_region": 0.02974226139485836, + "loss_total": 1.1582485437393188, + "lr": 0.0012015927175104656, + "router/selected_tokens_s0": 4085.375, + "step": 1070, + "tokens_trained": 3.50583784 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.30636125097510813, + "grad_norm": 1.0276976823806763, + "loss": 1.165, + "loss_ce": 1.1330546140670776, + "loss_region": 0.029834387823939323, + "loss_total": 1.162889003753662, + "lr": 0.0012011858255266145, + "router/selected_tokens_s0": 4155.5, + "step": 1080, + "tokens_trained": 3.53860328 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.30919792922487765, + "grad_norm": 3.4352457523345947, + "loss": 1.1759, + "loss_ce": 1.153834581375122, + "loss_region": 0.030001970008015633, + "loss_total": 1.183836579322815, + "lr": 0.0012007789335427635, + "router/selected_tokens_s0": 4271.375, + "step": 1090, + "tokens_trained": 3.57136872 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.3120346074746472, + "grad_norm": 3.4334914684295654, + "loss": 1.1668, + "loss_ce": 1.0656555891036987, + "loss_region": 0.03014238551259041, + "loss_total": 1.0957980155944824, + "lr": 0.0012003720415589125, + "router/selected_tokens_s0": 4376.625, + "step": 1100, + "tokens_trained": 3.60413416 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.3148712857244167, + "grad_norm": 7.573620796203613, + "loss": 1.1737, + "loss_ce": 1.1206940412521362, + "loss_region": 0.030071774497628212, + "loss_total": 1.1507657766342163, + "lr": 0.0011999651495750614, + "router/selected_tokens_s0": 4325.0, + "step": 1110, + "tokens_trained": 3.6368996 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.3177079639741862, + "grad_norm": 4.200015544891357, + "loss": 1.1705, + "loss_ce": 1.1700469255447388, + "loss_region": 0.02990192547440529, + "loss_total": 1.1999489068984985, + "lr": 0.0011995582575912104, + "router/selected_tokens_s0": 4194.25, + "step": 1120, + "tokens_trained": 3.669661712 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.32054464222395573, + "grad_norm": 5.207011699676514, + "loss": 1.1668, + "loss_ce": 1.1708717346191406, + "loss_region": 0.029880443587899208, + "loss_total": 1.2007521390914917, + "lr": 0.0011991513656073594, + "router/selected_tokens_s0": 4177.25, + "step": 1130, + "tokens_trained": 3.702426352 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.32338132047372525, + "grad_norm": 4.160227298736572, + "loss": 1.1671, + "loss_ce": 1.1502091884613037, + "loss_region": 0.030087152495980263, + "loss_total": 1.1802963018417358, + "lr": 0.0011987444736235083, + "router/selected_tokens_s0": 4325.25, + "step": 1140, + "tokens_trained": 3.735191792 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.3262179987234948, + "grad_norm": 2.3496572971343994, + "loss": 1.1578, + "loss_ce": 1.0942906141281128, + "loss_region": 0.02960728108882904, + "loss_total": 1.123897910118103, + "lr": 0.0011983375816396573, + "router/selected_tokens_s0": 3976.25, + "step": 1150, + "tokens_trained": 3.767957232 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.3290546769732643, + "grad_norm": 3.0820891857147217, + "loss": 1.158, + "loss_ce": 1.2191810607910156, + "loss_region": 0.030029961839318275, + "loss_total": 1.249211072921753, + "lr": 0.0011979306896558062, + "router/selected_tokens_s0": 4285.125, + "step": 1160, + "tokens_trained": 3.800722672 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.3318913552230338, + "grad_norm": 1.7340823411941528, + "loss": 1.1537, + "loss_ce": 1.0748310089111328, + "loss_region": 0.030402792617678642, + "loss_total": 1.1052337884902954, + "lr": 0.0011975237976719552, + "router/selected_tokens_s0": 4566.375, + "step": 1170, + "tokens_trained": 3.833488112 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.33472803347280333, + "grad_norm": 1.6883597373962402, + "loss": 1.1524, + "loss_ce": 1.15337073802948, + "loss_region": 0.029628688469529152, + "loss_total": 1.1829993724822998, + "lr": 0.0011971169056881042, + "router/selected_tokens_s0": 3994.125, + "step": 1180, + "tokens_trained": 3.866252752 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.33756471172257285, + "grad_norm": 1.3079456090927124, + "loss": 1.155, + "loss_ce": 1.147839903831482, + "loss_region": 0.029972558841109276, + "loss_total": 1.1778124570846558, + "lr": 0.0011967100137042531, + "router/selected_tokens_s0": 4250.125, + "step": 1190, + "tokens_trained": 3.899018184 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.34040138997234237, + "grad_norm": 2.042187452316284, + "loss": 1.1551, + "loss_ce": 1.1045622825622559, + "loss_region": 0.030126892030239105, + "loss_total": 1.134689211845398, + "lr": 0.0011963031217204021, + "router/selected_tokens_s0": 4366.25, + "step": 1200, + "tokens_trained": 3.931783624 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.3432380682221119, + "grad_norm": 0.5720299482345581, + "loss": 1.1514, + "loss_ce": 1.1252881288528442, + "loss_region": 0.02972925268113613, + "loss_total": 1.155017375946045, + "lr": 0.0011958962297365513, + "router/selected_tokens_s0": 4055.0, + "step": 1210, + "tokens_trained": 3.964549064 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.3460747464718814, + "grad_norm": 2.726912498474121, + "loss": 1.1481, + "loss_ce": 1.0980409383773804, + "loss_region": 0.030369114130735397, + "loss_total": 1.1284101009368896, + "lr": 0.0011954893377527003, + "router/selected_tokens_s0": 4549.75, + "step": 1220, + "tokens_trained": 3.997311912 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.34891142472165093, + "grad_norm": 1.576530933380127, + "loss": 1.1547, + "loss_ce": 1.1488255262374878, + "loss_region": 0.03008064441382885, + "loss_total": 1.1789062023162842, + "lr": 0.0011950824457688492, + "router/selected_tokens_s0": 4327.125, + "step": 1230, + "tokens_trained": 4.030077352 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.35174810297142045, + "grad_norm": 1.7633917331695557, + "loss": 1.1491, + "loss_ce": 1.0437774658203125, + "loss_region": 0.03009728156030178, + "loss_total": 1.0738747119903564, + "lr": 0.0011946755537849982, + "router/selected_tokens_s0": 4352.5, + "step": 1240, + "tokens_trained": 4.062842792 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.35458478122118997, + "grad_norm": 0.8599131107330322, + "loss": 1.1502, + "loss_ce": 1.1635342836380005, + "loss_region": 0.030227093026041985, + "loss_total": 1.1937613487243652, + "lr": 0.001194268661801147, + "router/selected_tokens_s0": 4437.875, + "step": 1250, + "tokens_trained": 4.095608232 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.3574214594709595, + "grad_norm": 2.0207033157348633, + "loss": 1.1525, + "loss_ce": 1.161281943321228, + "loss_region": 0.02980414777994156, + "loss_total": 1.1910860538482666, + "lr": 0.001193861769817296, + "router/selected_tokens_s0": 4113.375, + "step": 1260, + "tokens_trained": 4.128373672 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.360258137720729, + "grad_norm": 1.6762081384658813, + "loss": 1.1549, + "loss_ce": 1.176638126373291, + "loss_region": 0.02979988045990467, + "loss_total": 1.2064380645751953, + "lr": 0.0011934548778334449, + "router/selected_tokens_s0": 4110.375, + "step": 1270, + "tokens_trained": 4.161136768 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.36309481597049853, + "grad_norm": 1.5674160718917847, + "loss": 1.1538, + "loss_ce": 1.1160061359405518, + "loss_region": 0.029819507151842117, + "loss_total": 1.1458256244659424, + "lr": 0.001193047985849594, + "router/selected_tokens_s0": 4122.75, + "step": 1280, + "tokens_trained": 4.193902208 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.36593149422026805, + "grad_norm": 1.232892394065857, + "loss": 1.1499, + "loss_ce": 1.192215085029602, + "loss_region": 0.030095556750893593, + "loss_total": 1.2223106622695923, + "lr": 0.001192641093865743, + "router/selected_tokens_s0": 4337.75, + "step": 1290, + "tokens_trained": 4.226667648 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.36876817247003757, + "grad_norm": 1.280081033706665, + "loss": 1.1625, + "loss_ce": 1.0769988298416138, + "loss_region": 0.030076846480369568, + "loss_total": 1.1070756912231445, + "lr": 0.001192234201881892, + "router/selected_tokens_s0": 4330.625, + "step": 1300, + "tokens_trained": 4.259424272 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.3716048507198071, + "grad_norm": 0.7819789052009583, + "loss": 1.1516, + "loss_ce": 1.0531295537948608, + "loss_region": 0.029812535271048546, + "loss_total": 1.0829421281814575, + "lr": 0.001191827309898041, + "router/selected_tokens_s0": 4107.75, + "step": 1310, + "tokens_trained": 4.292189712 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.3744415289695766, + "grad_norm": 4.3887505531311035, + "loss": 1.1524, + "loss_ce": 1.0992565155029297, + "loss_region": 0.030015140771865845, + "loss_total": 1.1292716264724731, + "lr": 0.00119142041791419, + "router/selected_tokens_s0": 4279.625, + "step": 1320, + "tokens_trained": 4.32495164 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.37727820721934613, + "grad_norm": 2.5429630279541016, + "loss": 1.1622, + "loss_ce": 0.9915607571601868, + "loss_region": 0.02960185892879963, + "loss_total": 1.0211626291275024, + "lr": 0.0011910135259303389, + "router/selected_tokens_s0": 3922.75, + "step": 1330, + "tokens_trained": 4.35771708 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.38011488546911565, + "grad_norm": 1.3790112733840942, + "loss": 1.1526, + "loss_ce": 1.2076722383499146, + "loss_region": 0.029480615630745888, + "loss_total": 1.2371528148651123, + "lr": 0.0011906066339464878, + "router/selected_tokens_s0": 3831.5, + "step": 1340, + "tokens_trained": 4.39048252 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.38295156371888517, + "grad_norm": 3.28352427482605, + "loss": 1.1523, + "loss_ce": 0.9999480247497559, + "loss_region": 0.02995798923075199, + "loss_total": 1.0299060344696045, + "lr": 0.0011901997419626368, + "router/selected_tokens_s0": 4236.0, + "step": 1350, + "tokens_trained": 4.42324796 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.3857882419686547, + "grad_norm": 2.173388719558716, + "loss": 1.1469, + "loss_ce": 1.1173208951950073, + "loss_region": 0.030063528567552567, + "loss_total": 1.1473844051361084, + "lr": 0.0011897928499787858, + "router/selected_tokens_s0": 4322.25, + "step": 1360, + "tokens_trained": 4.4560134 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.3886249202184242, + "grad_norm": 1.3337340354919434, + "loss": 1.1514, + "loss_ce": 1.097347617149353, + "loss_region": 0.030277268961071968, + "loss_total": 1.1276248693466187, + "lr": 0.0011893859579949347, + "router/selected_tokens_s0": 4490.375, + "step": 1370, + "tokens_trained": 4.48877884 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.3914615984681937, + "grad_norm": 1.5072178840637207, + "loss": 1.1454, + "loss_ce": 1.1354695558547974, + "loss_region": 0.0300710741430521, + "loss_total": 1.1655405759811401, + "lr": 0.0011889790660110837, + "router/selected_tokens_s0": 4323.125, + "step": 1380, + "tokens_trained": 4.52154428 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.39429827671796325, + "grad_norm": 1.4634846448898315, + "loss": 1.1434, + "loss_ce": 1.1472464799880981, + "loss_region": 0.029943954199552536, + "loss_total": 1.1771904230117798, + "lr": 0.0011885721740272327, + "router/selected_tokens_s0": 4222.625, + "step": 1390, + "tokens_trained": 4.55430972 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.39713495496773277, + "grad_norm": 1.1301681995391846, + "loss": 1.1491, + "loss_ce": 0.932141900062561, + "loss_region": 0.03013395331799984, + "loss_total": 0.9622758626937866, + "lr": 0.0011881652820433816, + "router/selected_tokens_s0": 4389.125, + "step": 1400, + "tokens_trained": 4.58707516 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.3999716332175023, + "grad_norm": 1.153057336807251, + "loss": 1.1483, + "loss_ce": 1.0930418968200684, + "loss_region": 0.029886895790696144, + "loss_total": 1.1229287385940552, + "lr": 0.0011877583900595306, + "router/selected_tokens_s0": 4177.875, + "step": 1410, + "tokens_trained": 4.6198406 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.4028083114672718, + "grad_norm": 2.0346107482910156, + "loss": 1.1355, + "loss_ce": 1.130191683769226, + "loss_region": 0.030217666178941727, + "loss_total": 1.1604093313217163, + "lr": 0.0011873514980756796, + "router/selected_tokens_s0": 4435.0, + "step": 1420, + "tokens_trained": 4.652606024 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.4056449897170413, + "grad_norm": 1.2362136840820312, + "loss": 1.1461, + "loss_ce": 1.1180355548858643, + "loss_region": 0.029944026842713356, + "loss_total": 1.1479796171188354, + "lr": 0.0011869446060918285, + "router/selected_tokens_s0": 4219.25, + "step": 1430, + "tokens_trained": 4.685371464 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.40848166796681085, + "grad_norm": 1.6414567232131958, + "loss": 1.1476, + "loss_ce": 1.1310675144195557, + "loss_region": 0.030178584158420563, + "loss_total": 1.1612460613250732, + "lr": 0.0011865377141079775, + "router/selected_tokens_s0": 4406.125, + "step": 1440, + "tokens_trained": 4.718136904 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.41131834621658037, + "grad_norm": 0.8733806014060974, + "loss": 1.1452, + "loss_ce": 1.1529111862182617, + "loss_region": 0.029908571392297745, + "loss_total": 1.1828197240829468, + "lr": 0.0011861308221241265, + "router/selected_tokens_s0": 4186.5, + "step": 1450, + "tokens_trained": 4.750902344 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.4141550244663499, + "grad_norm": 2.170149087905884, + "loss": 1.1364, + "loss_ce": 1.1446956396102905, + "loss_region": 0.030016543343663216, + "loss_total": 1.1747121810913086, + "lr": 0.0011857239301402756, + "router/selected_tokens_s0": 4279.125, + "step": 1460, + "tokens_trained": 4.783666984 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.4169917027161194, + "grad_norm": 1.5901942253112793, + "loss": 1.1418, + "loss_ce": 1.1736469268798828, + "loss_region": 0.02991572767496109, + "loss_total": 1.203562617301941, + "lr": 0.0011853170381564246, + "router/selected_tokens_s0": 4190.375, + "step": 1470, + "tokens_trained": 4.816432424 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.4198283809658889, + "grad_norm": 0.7801039814949036, + "loss": 1.1359, + "loss_ce": 1.0415936708450317, + "loss_region": 0.030063536018133163, + "loss_total": 1.0716571807861328, + "lr": 0.0011849101461725736, + "router/selected_tokens_s0": 4323.5, + "step": 1480, + "tokens_trained": 4.849197864 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.42266505921565845, + "grad_norm": 1.1225630044937134, + "loss": 1.1387, + "loss_ce": 1.1764026880264282, + "loss_region": 0.02989169955253601, + "loss_total": 1.2062944173812866, + "lr": 0.0011845032541887225, + "router/selected_tokens_s0": 4166.375, + "step": 1490, + "tokens_trained": 4.881963248 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.42550173746542796, + "grad_norm": 1.3516196012496948, + "loss": 1.1445, + "loss_ce": 1.1119225025177002, + "loss_region": 0.03007156029343605, + "loss_total": 1.1419941186904907, + "lr": 0.0011840963622048713, + "router/selected_tokens_s0": 4332.625, + "step": 1500, + "tokens_trained": 4.914728608 + }, + { + "epoch": 0.42550173746542796, + "eval_ppl": 3.0476700462359805, + "eval_runtime": 2.5167, + "step": 1500, + "tokens_trained": 4.914728608 + }, + { + "epoch": 0.42550173746542796, + "eval_F": 0.3395766737890528, + "eval_F_cds": 0.33560010026602843, + "eval_F_dig": 0.34591244107612573, + "eval_F_exon": 0.33732050667193275, + "eval_F_intron": 0.340589821591843, + "eval_F_nig": 0.3449097161371641, + "eval_F_promoter": 0.3287406377406758, + "eval_F_utr": 0.33810586816514, + "eval_G": 0.4388793285567115, + "eval_G_cds": 0.4465895620992391, + "eval_G_dig": 0.39567722372516084, + "eval_G_exon": 0.44327135296181625, + "eval_G_intron": 0.4386635275964277, + "eval_G_nig": 0.4373593879668909, + "eval_G_promoter": 0.44171817290159177, + "eval_G_utr": 0.44355779628952524, + "eval_avg_bp_per_token": 2.944843027178028, + "eval_bp_per_token/cds": 2.9797368928296066, + "eval_bp_per_token/dig": 2.8909049841891283, + "eval_bp_per_token/exon": 2.9645396002341724, + "eval_bp_per_token/intron": 2.93608304360423, + "eval_bp_per_token/nig": 2.8993094517590188, + "eval_bp_per_token/promoter": 3.0419117237000717, + "eval_bp_per_token/utr": 2.9576534871366778, + "eval_ppl_cds": 3.7328596405663, + "eval_ppl_dig": 1.1534605141350962, + "eval_ppl_exon": 3.4439528933373436, + "eval_ppl_intron": 3.0653985302604827, + "eval_ppl_nig": 2.904936687189015, + "eval_ppl_promoter": 3.3618258190318606, + "eval_ppl_utr": 3.3512748939063846, + "step": 1500, + "tokens_trained": 4.914728608 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.4283384157151975, + "grad_norm": 1.0354516506195068, + "loss": 1.1407, + "loss_ce": 1.2179700136184692, + "loss_region": 0.029973506927490234, + "loss_total": 1.2479435205459595, + "lr": 0.0011836894702210202, + "router/selected_tokens_s0": 4242.375, + "step": 1510, + "tokens_trained": 4.947494048 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.431175093964967, + "grad_norm": 0.9974690675735474, + "loss": 1.1361, + "loss_ce": 1.1464780569076538, + "loss_region": 0.03020160086452961, + "loss_total": 1.1766796112060547, + "lr": 0.0011832825782371692, + "router/selected_tokens_s0": 4443.875, + "step": 1520, + "tokens_trained": 4.980259488 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.4340117722147365, + "grad_norm": 1.61404550075531, + "loss": 1.1383, + "loss_ce": 1.1023921966552734, + "loss_region": 0.029910210520029068, + "loss_total": 1.1323024034500122, + "lr": 0.0011828756862533184, + "router/selected_tokens_s0": 4174.25, + "step": 1530, + "tokens_trained": 5.013024928 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.43684845046450604, + "grad_norm": 1.551711082458496, + "loss": 1.1369, + "loss_ce": 1.085469365119934, + "loss_region": 0.02990012802183628, + "loss_total": 1.115369439125061, + "lr": 0.0011824687942694674, + "router/selected_tokens_s0": 4162.25, + "step": 1540, + "tokens_trained": 5.04578704 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.43968512871427556, + "grad_norm": 1.3328109979629517, + "loss": 1.1358, + "loss_ce": 1.1522539854049683, + "loss_region": 0.02980169840157032, + "loss_total": 1.1820557117462158, + "lr": 0.0011820619022856163, + "router/selected_tokens_s0": 4050.75, + "step": 1550, + "tokens_trained": 5.078551904 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.4425218069640451, + "grad_norm": 2.2517945766448975, + "loss": 1.1398, + "loss_ce": 1.0304194688796997, + "loss_region": 0.030139248818159103, + "loss_total": 1.0605586767196655, + "lr": 0.0011816550103017653, + "router/selected_tokens_s0": 4399.625, + "step": 1560, + "tokens_trained": 5.111317344 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.4453584852138146, + "grad_norm": 1.0419440269470215, + "loss": 1.1423, + "loss_ce": 1.2029235363006592, + "loss_region": 0.029878782108426094, + "loss_total": 1.2328022718429565, + "lr": 0.0011812481183179143, + "router/selected_tokens_s0": 4131.75, + "step": 1570, + "tokens_trained": 5.144082784 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.4481951634635841, + "grad_norm": 0.8405026197433472, + "loss": 1.1357, + "loss_ce": 1.1085268259048462, + "loss_region": 0.02992934361100197, + "loss_total": 1.1384562253952026, + "lr": 0.0011808412263340632, + "router/selected_tokens_s0": 4185.75, + "step": 1580, + "tokens_trained": 5.176848224 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.45103184171335364, + "grad_norm": 1.8782676458358765, + "loss": 1.1447, + "loss_ce": 1.0933234691619873, + "loss_region": 0.030135583132505417, + "loss_total": 1.1234591007232666, + "lr": 0.0011804343343502122, + "router/selected_tokens_s0": 4400.5, + "step": 1590, + "tokens_trained": 5.209613664 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.45386851996312316, + "grad_norm": 1.116540551185608, + "loss": 1.1417, + "loss_ce": 1.1890523433685303, + "loss_region": 0.0303688682615757, + "loss_total": 1.2194212675094604, + "lr": 0.0011800274423663611, + "router/selected_tokens_s0": 4597.375, + "step": 1600, + "tokens_trained": 5.242378304 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.4567051982128927, + "grad_norm": 0.9224187135696411, + "loss": 1.1352, + "loss_ce": 1.0753121376037598, + "loss_region": 0.030113881453871727, + "loss_total": 1.1054260730743408, + "lr": 0.0011796205503825101, + "router/selected_tokens_s0": 4381.5, + "step": 1610, + "tokens_trained": 5.275142944 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.4595418764626622, + "grad_norm": 1.250409483909607, + "loss": 1.1423, + "loss_ce": 1.1405887603759766, + "loss_region": 0.030090278014540672, + "loss_total": 1.1706790924072266, + "lr": 0.001179213658398659, + "router/selected_tokens_s0": 4360.875, + "step": 1620, + "tokens_trained": 5.307906784 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.4623785547124317, + "grad_norm": 0.6683188080787659, + "loss": 1.1358, + "loss_ce": 1.0137219429016113, + "loss_region": 0.0301409512758255, + "loss_total": 1.0438629388809204, + "lr": 0.001178806766414808, + "router/selected_tokens_s0": 4420.25, + "step": 1630, + "tokens_trained": 5.340672224 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.46521523296220124, + "grad_norm": 1.3055206537246704, + "loss": 1.1378, + "loss_ce": 1.120367407798767, + "loss_region": 0.029992438852787018, + "loss_total": 1.150359869003296, + "lr": 0.001178399874430957, + "router/selected_tokens_s0": 4256.375, + "step": 1640, + "tokens_trained": 5.373436896 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.46805191121197076, + "grad_norm": 1.2817225456237793, + "loss": 1.1365, + "loss_ce": 1.159173607826233, + "loss_region": 0.030014952644705772, + "loss_total": 1.1891885995864868, + "lr": 0.001177992982447106, + "router/selected_tokens_s0": 4277.875, + "step": 1650, + "tokens_trained": 5.406202336 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.4708885894617403, + "grad_norm": 1.2652041912078857, + "loss": 1.1303, + "loss_ce": 1.1445159912109375, + "loss_region": 0.03000623546540737, + "loss_total": 1.1745222806930542, + "lr": 0.001177586090463255, + "router/selected_tokens_s0": 4274.375, + "step": 1660, + "tokens_trained": 5.438967776 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.4737252677115098, + "grad_norm": 1.7784186601638794, + "loss": 1.1334, + "loss_ce": 1.1069244146347046, + "loss_region": 0.030016450211405754, + "loss_total": 1.136940836906433, + "lr": 0.001177179198479404, + "router/selected_tokens_s0": 4287.625, + "step": 1670, + "tokens_trained": 5.471733216 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.4765619459612793, + "grad_norm": 1.0779353380203247, + "loss": 1.1315, + "loss_ce": 1.1237202882766724, + "loss_region": 0.029916411265730858, + "loss_total": 1.1536366939544678, + "lr": 0.0011767723064955529, + "router/selected_tokens_s0": 4156.75, + "step": 1680, + "tokens_trained": 5.504498656 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.47939862421104884, + "grad_norm": 0.7689351439476013, + "loss": 1.1324, + "loss_ce": 1.0980726480484009, + "loss_region": 0.030096061527729034, + "loss_total": 1.1281687021255493, + "lr": 0.0011763654145117018, + "router/selected_tokens_s0": 4377.5, + "step": 1690, + "tokens_trained": 5.537264096 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.48223530246081836, + "grad_norm": 0.6869276165962219, + "loss": 1.1332, + "loss_ce": 1.0792652368545532, + "loss_region": 0.030072998255491257, + "loss_total": 1.1093382835388184, + "lr": 0.0011759585225278508, + "router/selected_tokens_s0": 4349.625, + "step": 1700, + "tokens_trained": 5.570029536 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.4850719807105879, + "grad_norm": 0.9587815403938293, + "loss": 1.1361, + "loss_ce": 1.0378434658050537, + "loss_region": 0.03009817562997341, + "loss_total": 1.067941665649414, + "lr": 0.001175551630544, + "router/selected_tokens_s0": 4384.25, + "step": 1710, + "tokens_trained": 5.602794976 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.4879086589603574, + "grad_norm": 1.1542259454727173, + "loss": 1.1294, + "loss_ce": 1.074008584022522, + "loss_region": 0.030034121125936508, + "loss_total": 1.104042649269104, + "lr": 0.001175144738560149, + "router/selected_tokens_s0": 4306.0, + "step": 1720, + "tokens_trained": 5.635560416 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.4907453372101269, + "grad_norm": 1.0194206237792969, + "loss": 1.1296, + "loss_ce": 1.1548231840133667, + "loss_region": 0.03011094592511654, + "loss_total": 1.184934139251709, + "lr": 0.001174737846576298, + "router/selected_tokens_s0": 4395.625, + "step": 1730, + "tokens_trained": 5.668325856 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.49358201545989644, + "grad_norm": 1.108144998550415, + "loss": 1.1351, + "loss_ce": 1.0953419208526611, + "loss_region": 0.03002314455807209, + "loss_total": 1.1253650188446045, + "lr": 0.0011743309545924469, + "router/selected_tokens_s0": 4292.125, + "step": 1740, + "tokens_trained": 5.701091296 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.49641869370966596, + "grad_norm": 1.075562834739685, + "loss": 1.1347, + "loss_ce": 1.1154391765594482, + "loss_region": 0.029949212446808815, + "loss_total": 1.1453883647918701, + "lr": 0.0011739240626085956, + "router/selected_tokens_s0": 4188.625, + "step": 1750, + "tokens_trained": 5.733856736 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.4992553719594355, + "grad_norm": 1.3173739910125732, + "loss": 1.1325, + "loss_ce": 1.0855435132980347, + "loss_region": 0.02994917891919613, + "loss_total": 1.1154927015304565, + "lr": 0.0011735171706247446, + "router/selected_tokens_s0": 4183.625, + "step": 1760, + "tokens_trained": 5.766622176 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.502092050209205, + "grad_norm": 0.8734815716743469, + "loss": 1.1316, + "loss_ce": 1.190360188484192, + "loss_region": 0.03002040646970272, + "loss_total": 1.2203805446624756, + "lr": 0.0011731102786408936, + "router/selected_tokens_s0": 4294.5, + "step": 1770, + "tokens_trained": 5.799387616 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5049287284589745, + "grad_norm": 2.5296459197998047, + "loss": 1.1361, + "loss_ce": 0.9863566756248474, + "loss_region": 0.02998475357890129, + "loss_total": 1.0163414478302002, + "lr": 0.0011727033866570427, + "router/selected_tokens_s0": 4235.875, + "step": 1780, + "tokens_trained": 5.832153056 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.507765406708744, + "grad_norm": 0.7834669947624207, + "loss": 1.1297, + "loss_ce": 0.9555173516273499, + "loss_region": 0.0301660243421793, + "loss_total": 0.9856833815574646, + "lr": 0.0011722964946731917, + "router/selected_tokens_s0": 4416.375, + "step": 1790, + "tokens_trained": 5.864918496 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5106020849585136, + "grad_norm": 0.9466329216957092, + "loss": 1.1295, + "loss_ce": 1.0096023082733154, + "loss_region": 0.030076030641794205, + "loss_total": 1.0396783351898193, + "lr": 0.0011718896026893407, + "router/selected_tokens_s0": 4354.625, + "step": 1800, + "tokens_trained": 5.897683936 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5134387632082831, + "grad_norm": 1.151943325996399, + "loss": 1.1267, + "loss_ce": 1.0721287727355957, + "loss_region": 0.029984835535287857, + "loss_total": 1.1021136045455933, + "lr": 0.0011714827107054896, + "router/selected_tokens_s0": 4239.75, + "step": 1810, + "tokens_trained": 5.930449376 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5162754414580526, + "grad_norm": 0.5502280592918396, + "loss": 1.1249, + "loss_ce": 1.0287433862686157, + "loss_region": 0.029946208000183105, + "loss_total": 1.0586895942687988, + "lr": 0.0011710758187216386, + "router/selected_tokens_s0": 4179.375, + "step": 1820, + "tokens_trained": 5.96321104 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5191121197078221, + "grad_norm": 1.5447858572006226, + "loss": 1.1319, + "loss_ce": 1.1280238628387451, + "loss_region": 0.030087478458881378, + "loss_total": 1.158111333847046, + "lr": 0.0011706689267377876, + "router/selected_tokens_s0": 4389.75, + "step": 1830, + "tokens_trained": 5.99597648 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5219487979575916, + "grad_norm": 0.9524003863334656, + "loss": 1.1274, + "loss_ce": 1.0977569818496704, + "loss_region": 0.030062809586524963, + "loss_total": 1.1278197765350342, + "lr": 0.0011702620347539365, + "router/selected_tokens_s0": 4354.0, + "step": 1840, + "tokens_trained": 6.028741744 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5247854762073612, + "grad_norm": 0.6106662750244141, + "loss": 1.1264, + "loss_ce": 1.06783926486969, + "loss_region": 0.029942721128463745, + "loss_total": 1.097782015800476, + "lr": 0.0011698551427700855, + "router/selected_tokens_s0": 4162.625, + "step": 1850, + "tokens_trained": 6.061507184 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5276221544571307, + "grad_norm": 1.2853341102600098, + "loss": 1.1329, + "loss_ce": 1.0429413318634033, + "loss_region": 0.02999758906662464, + "loss_total": 1.0729389190673828, + "lr": 0.0011694482507862345, + "router/selected_tokens_s0": 4247.5, + "step": 1860, + "tokens_trained": 6.094268624 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5304588327069002, + "grad_norm": 2.993485927581787, + "loss": 1.1236, + "loss_ce": 1.0583568811416626, + "loss_region": 0.030023684725165367, + "loss_total": 1.0883805751800537, + "lr": 0.0011690413588023834, + "router/selected_tokens_s0": 4302.0, + "step": 1870, + "tokens_trained": 6.127034064 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5332955109566697, + "grad_norm": 0.7363700866699219, + "loss": 1.1308, + "loss_ce": 1.1353397369384766, + "loss_region": 0.029933562502264977, + "loss_total": 1.1652733087539673, + "lr": 0.0011686344668185324, + "router/selected_tokens_s0": 4149.375, + "step": 1880, + "tokens_trained": 6.159799504 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5361321892064392, + "grad_norm": 0.8693296313285828, + "loss": 1.1274, + "loss_ce": 1.0827381610870361, + "loss_region": 0.030024589970707893, + "loss_total": 1.1127628087997437, + "lr": 0.0011682275748346814, + "router/selected_tokens_s0": 4302.25, + "step": 1890, + "tokens_trained": 6.192561072 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5389688674562088, + "grad_norm": 0.4028984606266022, + "loss": 1.1162, + "loss_ce": 1.1056593656539917, + "loss_region": 0.030071411281824112, + "loss_total": 1.1357307434082031, + "lr": 0.0011678206828508303, + "router/selected_tokens_s0": 4372.625, + "step": 1900, + "tokens_trained": 6.225326512 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5418055457059783, + "grad_norm": 1.1904973983764648, + "loss": 1.1294, + "loss_ce": 1.0976545810699463, + "loss_region": 0.030053725466132164, + "loss_total": 1.1277083158493042, + "lr": 0.0011674137908669793, + "router/selected_tokens_s0": 4348.125, + "step": 1910, + "tokens_trained": 6.258091952 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5446422239557478, + "grad_norm": 1.018221378326416, + "loss": 1.1277, + "loss_ce": 1.1479384899139404, + "loss_region": 0.030054787173867226, + "loss_total": 1.1779932975769043, + "lr": 0.0011670068988831283, + "router/selected_tokens_s0": 4353.25, + "step": 1920, + "tokens_trained": 6.290857392 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5474789022055173, + "grad_norm": 0.4506734013557434, + "loss": 1.1235, + "loss_ce": 1.1137655973434448, + "loss_region": 0.03005811758339405, + "loss_total": 1.1438237428665161, + "lr": 0.0011666000068992772, + "router/selected_tokens_s0": 4341.5, + "step": 1930, + "tokens_trained": 6.323622832 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5503155804552868, + "grad_norm": 1.5671348571777344, + "loss": 1.1318, + "loss_ce": 1.1652703285217285, + "loss_region": 0.030141720548272133, + "loss_total": 1.195412039756775, + "lr": 0.0011661931149154262, + "router/selected_tokens_s0": 4458.125, + "step": 1940, + "tokens_trained": 6.356388272 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5531522587050564, + "grad_norm": 1.2511063814163208, + "loss": 1.1246, + "loss_ce": 1.2078148126602173, + "loss_region": 0.03000708669424057, + "loss_total": 1.2378219366073608, + "lr": 0.0011657862229315751, + "router/selected_tokens_s0": 4275.625, + "step": 1950, + "tokens_trained": 6.389153712 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5559889369548259, + "grad_norm": 1.1278033256530762, + "loss": 1.1253, + "loss_ce": 1.1528972387313843, + "loss_region": 0.029990505427122116, + "loss_total": 1.1828877925872803, + "lr": 0.0011653793309477243, + "router/selected_tokens_s0": 4247.625, + "step": 1960, + "tokens_trained": 6.421919152 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5588256152045954, + "grad_norm": 0.7347070574760437, + "loss": 1.1292, + "loss_ce": 1.1609221696853638, + "loss_region": 0.03007410652935505, + "loss_total": 1.1909962892532349, + "lr": 0.0011649724389638733, + "router/selected_tokens_s0": 4377.0, + "step": 1970, + "tokens_trained": 6.454684592 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5616622934543649, + "grad_norm": 0.8754347562789917, + "loss": 1.1321, + "loss_ce": 1.1314905881881714, + "loss_region": 0.030018918216228485, + "loss_total": 1.1615095138549805, + "lr": 0.0011645655469800223, + "router/selected_tokens_s0": 4292.375, + "step": 1980, + "tokens_trained": 6.487450032 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5644989717041344, + "grad_norm": 1.4375395774841309, + "loss": 1.1251, + "loss_ce": 1.15834641456604, + "loss_region": 0.030011450871825218, + "loss_total": 1.1883578300476074, + "lr": 0.0011641586549961712, + "router/selected_tokens_s0": 4281.875, + "step": 1990, + "tokens_trained": 6.520215472 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.567335649953904, + "grad_norm": 1.3012388944625854, + "loss": 1.1244, + "loss_ce": 1.1547801494598389, + "loss_region": 0.03002019412815571, + "loss_total": 1.184800386428833, + "lr": 0.00116375176301232, + "router/selected_tokens_s0": 4298.375, + "step": 2000, + "tokens_trained": 6.552980912 + }, + { + "epoch": 0.567335649953904, + "eval_ppl": 2.997264738752139, + "eval_runtime": 2.4974, + "step": 2000, + "tokens_trained": 6.552980912 + }, + { + "epoch": 0.567335649953904, + "eval_F": 0.33877094677913017, + "eval_F_cds": 0.3354545528054273, + "eval_F_dig": 0.3349740865171758, + "eval_F_exon": 0.33771546252151097, + "eval_F_intron": 0.3394511609404705, + "eval_F_nig": 0.33961248247030124, + "eval_F_promoter": 0.33587224314868064, + "eval_F_utr": 0.3390466904438115, + "eval_G": 0.3927095408069945, + "eval_G_cds": 0.38760326352277413, + "eval_G_dig": 0.38993240031773313, + "eval_G_exon": 0.3922000848097159, + "eval_G_intron": 0.39271919880055167, + "eval_G_nig": 0.3935918508753731, + "eval_G_promoter": 0.3926971556782782, + "eval_G_utr": 0.3912176578977754, + "eval_avg_bp_per_token": 2.9518469913300267, + "eval_bp_per_token/cds": 2.981029744974208, + "eval_bp_per_token/dig": 2.9853055512361997, + "eval_bp_per_token/exon": 2.9610725920975693, + "eval_bp_per_token/intron": 2.9459318896698954, + "eval_bp_per_token/nig": 2.9445325234400035, + "eval_bp_per_token/promoter": 2.977322539741189, + "eval_bp_per_token/utr": 2.9494462803662875, + "eval_ppl_cds": 3.6941119312579422, + "eval_ppl_dig": 1.1218375588220217, + "eval_ppl_exon": 3.4074634485917565, + "eval_ppl_intron": 3.014504389955456, + "eval_ppl_nig": 2.843623870937302, + "eval_ppl_promoter": 3.3305259507076883, + "eval_ppl_utr": 3.322006494837333, + "step": 2000, + "tokens_trained": 6.552980912 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5701723282036735, + "grad_norm": 1.7854270935058594, + "loss": 1.1275, + "loss_ce": 1.1118180751800537, + "loss_region": 0.030034875497221947, + "loss_total": 1.1418529748916626, + "lr": 0.001163344871028469, + "router/selected_tokens_s0": 4323.625, + "step": 2010, + "tokens_trained": 6.585746352 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.573009006453443, + "grad_norm": 1.2741203308105469, + "loss": 1.1297, + "loss_ce": 1.1596630811691284, + "loss_region": 0.030020276084542274, + "loss_total": 1.1896833181381226, + "lr": 0.001162937979044618, + "router/selected_tokens_s0": 4296.625, + "step": 2020, + "tokens_trained": 6.618511792 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5758456847032125, + "grad_norm": 1.3113727569580078, + "loss": 1.1274, + "loss_ce": 1.130359411239624, + "loss_region": 0.030052313581109047, + "loss_total": 1.1604117155075073, + "lr": 0.001162531087060767, + "router/selected_tokens_s0": 4347.25, + "step": 2030, + "tokens_trained": 6.651277232 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.578682362952982, + "grad_norm": 1.585740089416504, + "loss": 1.1242, + "loss_ce": 1.113228440284729, + "loss_region": 0.029946262016892433, + "loss_total": 1.143174648284912, + "lr": 0.001162124195076916, + "router/selected_tokens_s0": 4151.5, + "step": 2040, + "tokens_trained": 6.684041872 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5815190412027516, + "grad_norm": 1.4227651357650757, + "loss": 1.1227, + "loss_ce": 1.1707289218902588, + "loss_region": 0.03000037930905819, + "loss_total": 1.200729250907898, + "lr": 0.001161717303093065, + "router/selected_tokens_s0": 4264.25, + "step": 2050, + "tokens_trained": 6.716806512 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5843557194525211, + "grad_norm": 1.4349584579467773, + "loss": 1.126, + "loss_ce": 1.123897910118103, + "loss_region": 0.029999535530805588, + "loss_total": 1.1538974046707153, + "lr": 0.001161310411109214, + "router/selected_tokens_s0": 4258.5, + "step": 2060, + "tokens_trained": 6.749571952 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5871923977022906, + "grad_norm": 1.525637149810791, + "loss": 1.1223, + "loss_ce": 1.0622094869613647, + "loss_region": 0.03016025200486183, + "loss_total": 1.092369794845581, + "lr": 0.001160903519125363, + "router/selected_tokens_s0": 4409.25, + "step": 2070, + "tokens_trained": 6.782337392 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5900290759520601, + "grad_norm": 0.31481412053108215, + "loss": 1.1308, + "loss_ce": 1.1158243417739868, + "loss_region": 0.030056282877922058, + "loss_total": 1.1458805799484253, + "lr": 0.001160496627141512, + "router/selected_tokens_s0": 4358.875, + "step": 2080, + "tokens_trained": 6.815102832 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5928657542018296, + "grad_norm": 1.4279309511184692, + "loss": 1.1212, + "loss_ce": 1.1024186611175537, + "loss_region": 0.03000911884009838, + "loss_total": 1.1324278116226196, + "lr": 0.0011600897351576609, + "router/selected_tokens_s0": 4277.25, + "step": 2090, + "tokens_trained": 6.847868272 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5957024324515992, + "grad_norm": 1.3502033948898315, + "loss": 1.1243, + "loss_ce": 1.215091347694397, + "loss_region": 0.03004975989460945, + "loss_total": 1.2451411485671997, + "lr": 0.0011596828431738098, + "router/selected_tokens_s0": 4345.25, + "step": 2100, + "tokens_trained": 6.880633712 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.5985391107013687, + "grad_norm": 0.30469629168510437, + "loss": 1.1227, + "loss_ce": 1.0989904403686523, + "loss_region": 0.03004642389714718, + "loss_total": 1.1290369033813477, + "lr": 0.0011592759511899588, + "router/selected_tokens_s0": 4339.125, + "step": 2110, + "tokens_trained": 6.913397016 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6013757889511382, + "grad_norm": 3.0106451511383057, + "loss": 1.1271, + "loss_ce": 1.0580655336380005, + "loss_region": 0.03005184419453144, + "loss_total": 1.0881173610687256, + "lr": 0.0011588690592061078, + "router/selected_tokens_s0": 4347.75, + "step": 2120, + "tokens_trained": 6.946162296 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6042124672009077, + "grad_norm": 1.4084529876708984, + "loss": 1.1261, + "loss_ce": 0.9337919354438782, + "loss_region": 0.029956450685858727, + "loss_total": 0.9637483954429626, + "lr": 0.0011584621672222567, + "router/selected_tokens_s0": 4181.25, + "step": 2130, + "tokens_trained": 6.978927736 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6070491454506772, + "grad_norm": 0.7794283032417297, + "loss": 1.1287, + "loss_ce": 1.0321320295333862, + "loss_region": 0.030011894181370735, + "loss_total": 1.0621439218521118, + "lr": 0.0011580552752384057, + "router/selected_tokens_s0": 4285.875, + "step": 2140, + "tokens_trained": 7.011693176 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6098858237004467, + "grad_norm": 0.7242727279663086, + "loss": 1.1314, + "loss_ce": 1.1077067852020264, + "loss_region": 0.030075622722506523, + "loss_total": 1.1377824544906616, + "lr": 0.0011576483832545547, + "router/selected_tokens_s0": 4383.25, + "step": 2150, + "tokens_trained": 7.044458616 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6127225019502163, + "grad_norm": 0.8703320622444153, + "loss": 1.1255, + "loss_ce": 1.042706847190857, + "loss_region": 0.030024481937289238, + "loss_total": 1.072731375694275, + "lr": 0.0011572414912707036, + "router/selected_tokens_s0": 4306.0, + "step": 2160, + "tokens_trained": 7.077224056 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6155591801999858, + "grad_norm": 2.464707374572754, + "loss": 1.12, + "loss_ce": 1.0845450162887573, + "loss_region": 0.029988931491971016, + "loss_total": 1.1145339012145996, + "lr": 0.0011568345992868526, + "router/selected_tokens_s0": 4238.875, + "step": 2170, + "tokens_trained": 7.109989496 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6183958584497553, + "grad_norm": 2.0766637325286865, + "loss": 1.1266, + "loss_ce": 1.1240020990371704, + "loss_region": 0.030013838782906532, + "loss_total": 1.1540158987045288, + "lr": 0.0011564277073030016, + "router/selected_tokens_s0": 4291.875, + "step": 2180, + "tokens_trained": 7.142754936 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6212325366995248, + "grad_norm": 1.402709722518921, + "loss": 1.1265, + "loss_ce": 1.1370148658752441, + "loss_region": 0.03003770112991333, + "loss_total": 1.1670525074005127, + "lr": 0.0011560208153191505, + "router/selected_tokens_s0": 4328.625, + "step": 2190, + "tokens_trained": 7.175520376 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6240692149492943, + "grad_norm": 0.7657859325408936, + "loss": 1.1259, + "loss_ce": 1.116765022277832, + "loss_region": 0.030005475506186485, + "loss_total": 1.1467704772949219, + "lr": 0.0011556139233352995, + "router/selected_tokens_s0": 4272.125, + "step": 2200, + "tokens_trained": 7.208285816 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6269058931990639, + "grad_norm": 3.5244100093841553, + "loss": 1.1305, + "loss_ce": 1.1446946859359741, + "loss_region": 0.030087754130363464, + "loss_total": 1.174782395362854, + "lr": 0.0011552070313514487, + "router/selected_tokens_s0": 4414.25, + "step": 2210, + "tokens_trained": 7.241051256 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6297425714488334, + "grad_norm": 0.599822998046875, + "loss": 1.1324, + "loss_ce": 1.0551592111587524, + "loss_region": 0.030122289434075356, + "loss_total": 1.085281491279602, + "lr": 0.0011548001393675976, + "router/selected_tokens_s0": 4453.875, + "step": 2220, + "tokens_trained": 7.273816696 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6325792496986029, + "grad_norm": 2.314722776412964, + "loss": 1.1277, + "loss_ce": 1.1485532522201538, + "loss_region": 0.030024103820323944, + "loss_total": 1.1785773038864136, + "lr": 0.0011543932473837466, + "router/selected_tokens_s0": 4313.5, + "step": 2230, + "tokens_trained": 7.306582136 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6354159279483724, + "grad_norm": 2.072960615158081, + "loss": 1.131, + "loss_ce": 1.0349353551864624, + "loss_region": 0.030028166249394417, + "loss_total": 1.0649635791778564, + "lr": 0.0011539863553998956, + "router/selected_tokens_s0": 4319.625, + "step": 2240, + "tokens_trained": 7.339347576 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.638252606198142, + "grad_norm": 1.371410846710205, + "loss": 1.1226, + "loss_ce": 1.0738561153411865, + "loss_region": 0.030064314603805542, + "loss_total": 1.1039204597473145, + "lr": 0.0011535794634160443, + "router/selected_tokens_s0": 4378.375, + "step": 2250, + "tokens_trained": 7.372113016 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6410892844479115, + "grad_norm": 3.474445343017578, + "loss": 1.1284, + "loss_ce": 1.0069116353988647, + "loss_region": 0.030036170035600662, + "loss_total": 1.0369478464126587, + "lr": 0.0011531725714321933, + "router/selected_tokens_s0": 4332.625, + "step": 2260, + "tokens_trained": 7.404878456 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.643925962697681, + "grad_norm": 0.5796771049499512, + "loss": 1.1245, + "loss_ce": 1.138779640197754, + "loss_region": 0.030022747814655304, + "loss_total": 1.1688023805618286, + "lr": 0.0011527656794483422, + "router/selected_tokens_s0": 4308.875, + "step": 2270, + "tokens_trained": 7.437643896 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6467626409474505, + "grad_norm": 1.155604362487793, + "loss": 1.1216, + "loss_ce": 0.9782689809799194, + "loss_region": 0.030030813068151474, + "loss_total": 1.0082998275756836, + "lr": 0.0011523587874644914, + "router/selected_tokens_s0": 4321.625, + "step": 2280, + "tokens_trained": 7.470409336 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.64959931919722, + "grad_norm": 1.8259997367858887, + "loss": 1.1318, + "loss_ce": 1.055479884147644, + "loss_region": 0.030021535232663155, + "loss_total": 1.0855014324188232, + "lr": 0.0011519518954806404, + "router/selected_tokens_s0": 4307.375, + "step": 2290, + "tokens_trained": 7.503173472 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6524359974469895, + "grad_norm": 1.2909961938858032, + "loss": 1.1216, + "loss_ce": 1.1016438007354736, + "loss_region": 0.030030114576220512, + "loss_total": 1.1316739320755005, + "lr": 0.0011515450034967894, + "router/selected_tokens_s0": 4321.625, + "step": 2300, + "tokens_trained": 7.535938912 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6552726756967591, + "grad_norm": 3.855242967605591, + "loss": 1.1332, + "loss_ce": 1.1084688901901245, + "loss_region": 0.030001208186149597, + "loss_total": 1.1384700536727905, + "lr": 0.0011511381115129383, + "router/selected_tokens_s0": 4267.625, + "step": 2310, + "tokens_trained": 7.568704352 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6581093539465286, + "grad_norm": 0.6401855945587158, + "loss": 1.1235, + "loss_ce": 1.068629503250122, + "loss_region": 0.030046915635466576, + "loss_total": 1.0986764430999756, + "lr": 0.0011507312195290873, + "router/selected_tokens_s0": 4353.5, + "step": 2320, + "tokens_trained": 7.601469792 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6609460321962981, + "grad_norm": 2.758415460586548, + "loss": 1.1224, + "loss_ce": 1.1197397708892822, + "loss_region": 0.030033273622393608, + "loss_total": 1.1497730016708374, + "lr": 0.0011503243275452363, + "router/selected_tokens_s0": 4317.625, + "step": 2330, + "tokens_trained": 7.634233608 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6637827104460676, + "grad_norm": 3.6356966495513916, + "loss": 1.1258, + "loss_ce": 1.192346453666687, + "loss_region": 0.030019540339708328, + "loss_total": 1.2223659753799438, + "lr": 0.0011499174355613852, + "router/selected_tokens_s0": 4307.0, + "step": 2340, + "tokens_trained": 7.666998248 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6666193886958371, + "grad_norm": 0.5084363222122192, + "loss": 1.1211, + "loss_ce": 1.0241565704345703, + "loss_region": 0.030024418607354164, + "loss_total": 1.0541809797286987, + "lr": 0.0011495105435775342, + "router/selected_tokens_s0": 4311.75, + "step": 2350, + "tokens_trained": 7.699763688 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6694560669456067, + "grad_norm": 2.6118147373199463, + "loss": 1.1205, + "loss_ce": 1.094053864479065, + "loss_region": 0.030054572969675064, + "loss_total": 1.1241084337234497, + "lr": 0.0011491036515936831, + "router/selected_tokens_s0": 4375.625, + "step": 2360, + "tokens_trained": 7.732529128 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6722927451953762, + "grad_norm": 1.5716001987457275, + "loss": 1.1174, + "loss_ce": 1.0806825160980225, + "loss_region": 0.02999335154891014, + "loss_total": 1.1106758117675781, + "lr": 0.0011486967596098321, + "router/selected_tokens_s0": 4245.125, + "step": 2370, + "tokens_trained": 7.765294568 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6751294234451457, + "grad_norm": 1.6855603456497192, + "loss": 1.1248, + "loss_ce": 1.0957375764846802, + "loss_region": 0.030019070953130722, + "loss_total": 1.1257566213607788, + "lr": 0.001148289867625981, + "router/selected_tokens_s0": 4306.25, + "step": 2380, + "tokens_trained": 7.798060008 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6779661016949152, + "grad_norm": 1.7085551023483276, + "loss": 1.1219, + "loss_ce": 1.0849840641021729, + "loss_region": 0.029990842565894127, + "loss_total": 1.114974856376648, + "lr": 0.00114788297564213, + "router/selected_tokens_s0": 4250.875, + "step": 2390, + "tokens_trained": 7.830825448 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6808027799446847, + "grad_norm": 2.7529702186584473, + "loss": 1.1278, + "loss_ce": 1.1395268440246582, + "loss_region": 0.030015477910637856, + "loss_total": 1.1695423126220703, + "lr": 0.001147476083658279, + "router/selected_tokens_s0": 4305.125, + "step": 2400, + "tokens_trained": 7.863590888 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6836394581944543, + "grad_norm": 1.855435848236084, + "loss": 1.1225, + "loss_ce": 1.055867075920105, + "loss_region": 0.030039696022868156, + "loss_total": 1.085906744003296, + "lr": 0.001147069191674428, + "router/selected_tokens_s0": 4357.375, + "step": 2410, + "tokens_trained": 7.896356328 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6864761364442238, + "grad_norm": 1.9066152572631836, + "loss": 1.1243, + "loss_ce": 0.9804560542106628, + "loss_region": 0.03004065528512001, + "loss_total": 1.010496735572815, + "lr": 0.001146662299690577, + "router/selected_tokens_s0": 4339.375, + "step": 2420, + "tokens_trained": 7.929121768 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6893128146939933, + "grad_norm": 1.6631957292556763, + "loss": 1.1181, + "loss_ce": 1.1269235610961914, + "loss_region": 0.030016232281923294, + "loss_total": 1.1569397449493408, + "lr": 0.001146255407706726, + "router/selected_tokens_s0": 4304.375, + "step": 2430, + "tokens_trained": 7.961887208 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6921494929437628, + "grad_norm": 1.932186245918274, + "loss": 1.1318, + "loss_ce": 1.1084073781967163, + "loss_region": 0.030037561431527138, + "loss_total": 1.1384449005126953, + "lr": 0.0011458485157228749, + "router/selected_tokens_s0": 4342.375, + "step": 2440, + "tokens_trained": 7.994651848 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6949861711935323, + "grad_norm": 2.0729987621307373, + "loss": 1.1219, + "loss_ce": 1.0754549503326416, + "loss_region": 0.030010342597961426, + "loss_total": 1.105465292930603, + "lr": 0.0011454416237390238, + "router/selected_tokens_s0": 4284.25, + "step": 2450, + "tokens_trained": 8.027417288 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.6978228494433019, + "grad_norm": 2.743365526199341, + "loss": 1.1183, + "loss_ce": 1.1507514715194702, + "loss_region": 0.030012760311365128, + "loss_total": 1.1807641983032227, + "lr": 0.001145034731755173, + "router/selected_tokens_s0": 4299.125, + "step": 2460, + "tokens_trained": 8.060182704 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7006595276930714, + "grad_norm": 1.968074083328247, + "loss": 1.1248, + "loss_ce": 1.1554365158081055, + "loss_region": 0.03006228432059288, + "loss_total": 1.185498833656311, + "lr": 0.001144627839771322, + "router/selected_tokens_s0": 4397.5, + "step": 2470, + "tokens_trained": 8.092948144 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7034962059428409, + "grad_norm": 0.6022619605064392, + "loss": 1.1233, + "loss_ce": 1.0739916563034058, + "loss_region": 0.030015716329216957, + "loss_total": 1.104007363319397, + "lr": 0.001144220947787471, + "router/selected_tokens_s0": 4304.25, + "step": 2480, + "tokens_trained": 8.125713584 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7063328841926104, + "grad_norm": 2.9086802005767822, + "loss": 1.1155, + "loss_ce": 1.1227823495864868, + "loss_region": 0.030057305470108986, + "loss_total": 1.1528396606445312, + "lr": 0.00114381405580362, + "router/selected_tokens_s0": 4393.875, + "step": 2490, + "tokens_trained": 8.158479016 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7091695624423799, + "grad_norm": 1.8187512159347534, + "loss": 1.1248, + "loss_ce": 1.0580413341522217, + "loss_region": 0.030032671988010406, + "loss_total": 1.088073968887329, + "lr": 0.0011434071638197687, + "router/selected_tokens_s0": 4340.0, + "step": 2500, + "tokens_trained": 8.191244456 + }, + { + "epoch": 0.7091695624423799, + "eval_ppl": 2.9815305929864326, + "eval_runtime": 2.4796, + "step": 2500, + "tokens_trained": 8.191244456 + }, + { + "epoch": 0.7091695624423799, + "eval_F": 0.34048558481131336, + "eval_F_cds": 0.3413653968998391, + "eval_F_dig": 0.3326561970987317, + "eval_F_exon": 0.34301915535870453, + "eval_F_intron": 0.3409895477582185, + "eval_F_nig": 0.34018024599300895, + "eval_F_promoter": 0.3386885010090298, + "eval_F_utr": 0.34306656745268094, + "eval_G": 0.37360140820500265, + "eval_G_cds": 0.37391617995023085, + "eval_G_dig": 0.39410936238508215, + "eval_G_exon": 0.37318875715857475, + "eval_G_intron": 0.3727733807645177, + "eval_G_nig": 0.3734594960312147, + "eval_G_promoter": 0.37594098275253596, + "eval_G_utr": 0.3722500326080449, + "eval_avg_bp_per_token": 2.9369819005822793, + "eval_bp_per_token/cds": 2.929412322050359, + "eval_bp_per_token/dig": 3.006106631175135, + "eval_bp_per_token/exon": 2.915289086273542, + "eval_bp_per_token/intron": 2.932641210190579, + "eval_bp_per_token/nig": 2.9396180753557073, + "eval_bp_per_token/promoter": 2.952565549231147, + "eval_bp_per_token/utr": 2.9148861908204733, + "eval_ppl_cds": 3.7211953918524787, + "eval_ppl_dig": 1.1071312956552213, + "eval_ppl_exon": 3.408594147596357, + "eval_ppl_intron": 2.996762231969892, + "eval_ppl_nig": 2.8097869859130795, + "eval_ppl_promoter": 3.341004188366384, + "eval_ppl_utr": 3.3285188682998834, + "step": 2500, + "tokens_trained": 8.191244456 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7120062406921495, + "grad_norm": 1.3883668184280396, + "loss": 1.1168, + "loss_ce": 1.0345538854599, + "loss_region": 0.030011983588337898, + "loss_total": 1.064565896987915, + "lr": 0.0011430002718359176, + "router/selected_tokens_s0": 4293.25, + "step": 2510, + "tokens_trained": 8.224009896 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.714842918941919, + "grad_norm": 0.5920007228851318, + "loss": 1.1128, + "loss_ce": 1.1446270942687988, + "loss_region": 0.030029037967324257, + "loss_total": 1.1746561527252197, + "lr": 0.0011425933798520666, + "router/selected_tokens_s0": 4338.125, + "step": 2520, + "tokens_trained": 8.256775336 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7176795971916885, + "grad_norm": 2.293912410736084, + "loss": 1.119, + "loss_ce": 1.1278671026229858, + "loss_region": 0.030034860596060753, + "loss_total": 1.1579020023345947, + "lr": 0.0011421864878682158, + "router/selected_tokens_s0": 4356.25, + "step": 2530, + "tokens_trained": 8.289540776 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.720516275441458, + "grad_norm": 1.4504122734069824, + "loss": 1.1161, + "loss_ce": 0.9545093774795532, + "loss_region": 0.030026227235794067, + "loss_total": 0.9845355749130249, + "lr": 0.0011417795958843647, + "router/selected_tokens_s0": 4322.375, + "step": 2540, + "tokens_trained": 8.322306216 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7233529536912275, + "grad_norm": 1.777256727218628, + "loss": 1.1177, + "loss_ce": 1.0747570991516113, + "loss_region": 0.030009755864739418, + "loss_total": 1.104766845703125, + "lr": 0.0011413727039005137, + "router/selected_tokens_s0": 4293.375, + "step": 2550, + "tokens_trained": 8.355071656 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7261896319409971, + "grad_norm": 1.637231707572937, + "loss": 1.1121, + "loss_ce": 1.1526259183883667, + "loss_region": 0.030018767341971397, + "loss_total": 1.1826447248458862, + "lr": 0.0011409658119166627, + "router/selected_tokens_s0": 4318.5, + "step": 2560, + "tokens_trained": 8.387835072 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7290263101907666, + "grad_norm": 1.0746310949325562, + "loss": 1.1151, + "loss_ce": 1.1064670085906982, + "loss_region": 0.03001333586871624, + "loss_total": 1.1364803314208984, + "lr": 0.0011405589199328116, + "router/selected_tokens_s0": 4294.375, + "step": 2570, + "tokens_trained": 8.420600512 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7318629884405361, + "grad_norm": 1.3798960447311401, + "loss": 1.1198, + "loss_ce": 1.073905110359192, + "loss_region": 0.030032221227884293, + "loss_total": 1.1039373874664307, + "lr": 0.0011401520279489606, + "router/selected_tokens_s0": 4356.375, + "step": 2580, + "tokens_trained": 8.453365928 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7346996666903056, + "grad_norm": 1.8040990829467773, + "loss": 1.1175, + "loss_ce": 1.0255845785140991, + "loss_region": 0.03001689724624157, + "loss_total": 1.0556014776229858, + "lr": 0.0011397451359651096, + "router/selected_tokens_s0": 4312.5, + "step": 2590, + "tokens_trained": 8.486131368 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7375363449400751, + "grad_norm": 2.420259952545166, + "loss": 1.1193, + "loss_ce": 1.0581092834472656, + "loss_region": 0.030017009004950523, + "loss_total": 1.088126301765442, + "lr": 0.0011393382439812585, + "router/selected_tokens_s0": 4316.25, + "step": 2600, + "tokens_trained": 8.518896808 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7403730231898447, + "grad_norm": 2.068054437637329, + "loss": 1.1114, + "loss_ce": 1.0681673288345337, + "loss_region": 0.030040811747312546, + "loss_total": 1.0982081890106201, + "lr": 0.0011389313519974075, + "router/selected_tokens_s0": 4369.0, + "step": 2610, + "tokens_trained": 8.551662248 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7432097014396142, + "grad_norm": 1.7490754127502441, + "loss": 1.1182, + "loss_ce": 1.0639960765838623, + "loss_region": 0.030034611001610756, + "loss_total": 1.094030737876892, + "lr": 0.0011385244600135565, + "router/selected_tokens_s0": 4363.5, + "step": 2620, + "tokens_trained": 8.584426888 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7460463796893837, + "grad_norm": 1.4811182022094727, + "loss": 1.1131, + "loss_ce": 1.0907317399978638, + "loss_region": 0.03001326695084572, + "loss_total": 1.120745062828064, + "lr": 0.0011381175680297054, + "router/selected_tokens_s0": 4307.875, + "step": 2630, + "tokens_trained": 8.617192328 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7488830579391532, + "grad_norm": 2.1497602462768555, + "loss": 1.1096, + "loss_ce": 1.123599886894226, + "loss_region": 0.030037013813853264, + "loss_total": 1.1536369323730469, + "lr": 0.0011377106760458544, + "router/selected_tokens_s0": 4368.5, + "step": 2640, + "tokens_trained": 8.649951656 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7517197361889227, + "grad_norm": 2.179588556289673, + "loss": 1.1129, + "loss_ce": 0.9365400671958923, + "loss_region": 0.030036216601729393, + "loss_total": 0.9665762782096863, + "lr": 0.0011373037840620034, + "router/selected_tokens_s0": 4350.375, + "step": 2650, + "tokens_trained": 8.682717096 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7545564144386923, + "grad_norm": 1.6021926403045654, + "loss": 1.1095, + "loss_ce": 1.1449388265609741, + "loss_region": 0.030037803575396538, + "loss_total": 1.1749765872955322, + "lr": 0.0011368968920781523, + "router/selected_tokens_s0": 4373.0, + "step": 2660, + "tokens_trained": 8.715482536 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7573930926884618, + "grad_norm": 1.2494678497314453, + "loss": 1.1097, + "loss_ce": 1.0806819200515747, + "loss_region": 0.03000866435468197, + "loss_total": 1.1106905937194824, + "lr": 0.0011364900000943013, + "router/selected_tokens_s0": 4295.25, + "step": 2670, + "tokens_trained": 8.748247976 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7602297709382313, + "grad_norm": 1.3196409940719604, + "loss": 1.1136, + "loss_ce": 1.069360375404358, + "loss_region": 0.030009115114808083, + "loss_total": 1.0993695259094238, + "lr": 0.0011360831081104503, + "router/selected_tokens_s0": 4291.75, + "step": 2680, + "tokens_trained": 8.781013416 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7630664491880008, + "grad_norm": 2.674771308898926, + "loss": 1.1188, + "loss_ce": 1.1771190166473389, + "loss_region": 0.030019955709576607, + "loss_total": 1.207139015197754, + "lr": 0.0011356762161265992, + "router/selected_tokens_s0": 4330.125, + "step": 2690, + "tokens_trained": 8.813778696 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7659031274377703, + "grad_norm": 1.6932164430618286, + "loss": 1.1031, + "loss_ce": 1.0857900381088257, + "loss_region": 0.0300260242074728, + "loss_total": 1.1158161163330078, + "lr": 0.0011352693241427482, + "router/selected_tokens_s0": 4347.5, + "step": 2700, + "tokens_trained": 8.846544136 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7687398056875399, + "grad_norm": 1.5329583883285522, + "loss": 1.1098, + "loss_ce": 1.0980348587036133, + "loss_region": 0.030030502006411552, + "loss_total": 1.1280653476715088, + "lr": 0.0011348624321588974, + "router/selected_tokens_s0": 4366.375, + "step": 2710, + "tokens_trained": 8.879309576 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7715764839373094, + "grad_norm": 1.829464077949524, + "loss": 1.1093, + "loss_ce": 1.1128755807876587, + "loss_region": 0.03000422567129135, + "loss_total": 1.142879843711853, + "lr": 0.0011344555401750463, + "router/selected_tokens_s0": 4282.0, + "step": 2720, + "tokens_trained": 8.912075016 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7744131621870789, + "grad_norm": 2.8766870498657227, + "loss": 1.1187, + "loss_ce": 1.12075674533844, + "loss_region": 0.030019240453839302, + "loss_total": 1.1507760286331177, + "lr": 0.0011340486481911953, + "router/selected_tokens_s0": 4327.25, + "step": 2730, + "tokens_trained": 8.944840456 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7772498404368484, + "grad_norm": 2.2969014644622803, + "loss": 1.1166, + "loss_ce": 1.0795077085494995, + "loss_region": 0.030028002336621284, + "loss_total": 1.1095356941223145, + "lr": 0.001133641756207344, + "router/selected_tokens_s0": 4352.75, + "step": 2740, + "tokens_trained": 8.977605896 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7800865186866179, + "grad_norm": 1.7521798610687256, + "loss": 1.1139, + "loss_ce": 1.1274807453155518, + "loss_region": 0.030016858130693436, + "loss_total": 1.1574976444244385, + "lr": 0.001133234864223493, + "router/selected_tokens_s0": 4320.0, + "step": 2750, + "tokens_trained": 9.010371336 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7829231969363875, + "grad_norm": 2.6245367527008057, + "loss": 1.1075, + "loss_ce": 1.1328058242797852, + "loss_region": 0.03003484010696411, + "loss_total": 1.1628406047821045, + "lr": 0.001132827972239642, + "router/selected_tokens_s0": 4367.625, + "step": 2760, + "tokens_trained": 9.043136776 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.785759875186157, + "grad_norm": 1.162583351135254, + "loss": 1.1181, + "loss_ce": 1.151093602180481, + "loss_region": 0.030036624521017075, + "loss_total": 1.1811301708221436, + "lr": 0.001132421080255791, + "router/selected_tokens_s0": 4392.5, + "step": 2770, + "tokens_trained": 9.075902216 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7885965534359265, + "grad_norm": 1.4981096982955933, + "loss": 1.1104, + "loss_ce": 1.0844680070877075, + "loss_region": 0.030015481635928154, + "loss_total": 1.1144834756851196, + "lr": 0.0011320141882719401, + "router/selected_tokens_s0": 4314.625, + "step": 2780, + "tokens_trained": 9.108667656 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.791433231685696, + "grad_norm": 1.8612878322601318, + "loss": 1.1073, + "loss_ce": 1.0089409351348877, + "loss_region": 0.029995379969477654, + "loss_total": 1.0389362573623657, + "lr": 0.001131607296288089, + "router/selected_tokens_s0": 4257.875, + "step": 2790, + "tokens_trained": 9.14143004 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7942699099354655, + "grad_norm": 0.6861640810966492, + "loss": 1.1058, + "loss_ce": 0.9385975003242493, + "loss_region": 0.029996687546372414, + "loss_total": 0.9685941934585571, + "lr": 0.001131200404304238, + "router/selected_tokens_s0": 4290.625, + "step": 2800, + "tokens_trained": 9.17419548 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.797106588185235, + "grad_norm": 2.205390214920044, + "loss": 1.108, + "loss_ce": 1.0670945644378662, + "loss_region": 0.030025651678442955, + "loss_total": 1.0971201658248901, + "lr": 0.001130793512320387, + "router/selected_tokens_s0": 4342.875, + "step": 2810, + "tokens_trained": 9.20696092 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.7999432664350046, + "grad_norm": 2.068150520324707, + "loss": 1.106, + "loss_ce": 1.0238165855407715, + "loss_region": 0.03002651408314705, + "loss_total": 1.0538431406021118, + "lr": 0.001130386620336536, + "router/selected_tokens_s0": 4359.75, + "step": 2820, + "tokens_trained": 9.23972636 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8027799446847741, + "grad_norm": 1.1060576438903809, + "loss": 1.1065, + "loss_ce": 1.0474674701690674, + "loss_region": 0.03000919334590435, + "loss_total": 1.0774766206741333, + "lr": 0.001129979728352685, + "router/selected_tokens_s0": 4301.75, + "step": 2830, + "tokens_trained": 9.2724918 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8056166229345436, + "grad_norm": 1.369165301322937, + "loss": 1.1081, + "loss_ce": 1.0370676517486572, + "loss_region": 0.030027758330106735, + "loss_total": 1.067095398902893, + "lr": 0.001129572836368834, + "router/selected_tokens_s0": 4375.625, + "step": 2840, + "tokens_trained": 9.30525692 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8084533011843131, + "grad_norm": 2.285675525665283, + "loss": 1.109, + "loss_ce": 1.0967503786087036, + "loss_region": 0.0300269927829504, + "loss_total": 1.1267774105072021, + "lr": 0.0011291659443849829, + "router/selected_tokens_s0": 4370.0, + "step": 2850, + "tokens_trained": 9.33802236 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8112899794340827, + "grad_norm": 0.8950642943382263, + "loss": 1.1015, + "loss_ce": 1.091797947883606, + "loss_region": 0.03003113530576229, + "loss_total": 1.1218290328979492, + "lr": 0.0011287590524011318, + "router/selected_tokens_s0": 4384.125, + "step": 2860, + "tokens_trained": 9.3707878 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8141266576838522, + "grad_norm": 2.1465282440185547, + "loss": 1.1012, + "loss_ce": 0.9929934144020081, + "loss_region": 0.030015377327799797, + "loss_total": 1.0230088233947754, + "lr": 0.0011283521604172808, + "router/selected_tokens_s0": 4333.0, + "step": 2870, + "tokens_trained": 9.403548272 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8169633359336217, + "grad_norm": 2.1108782291412354, + "loss": 1.1029, + "loss_ce": 1.0729644298553467, + "loss_region": 0.03002534806728363, + "loss_total": 1.1029897928237915, + "lr": 0.0011279452684334298, + "router/selected_tokens_s0": 4357.25, + "step": 2880, + "tokens_trained": 9.436313712 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8198000141833912, + "grad_norm": 1.7104750871658325, + "loss": 1.1041, + "loss_ce": 1.116651177406311, + "loss_region": 0.030016740784049034, + "loss_total": 1.1466679573059082, + "lr": 0.0011275383764495787, + "router/selected_tokens_s0": 4340.875, + "step": 2890, + "tokens_trained": 9.469079152 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8226366924331607, + "grad_norm": 1.7549395561218262, + "loss": 1.1098, + "loss_ce": 0.977597713470459, + "loss_region": 0.030016236007213593, + "loss_total": 1.0076138973236084, + "lr": 0.0011271314844657277, + "router/selected_tokens_s0": 4328.875, + "step": 2900, + "tokens_trained": 9.50184356 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8254733706829303, + "grad_norm": 2.076667547225952, + "loss": 1.1041, + "loss_ce": 0.9882082343101501, + "loss_region": 0.03001856803894043, + "loss_total": 1.0182268619537354, + "lr": 0.0011267245924818767, + "router/selected_tokens_s0": 4341.75, + "step": 2910, + "tokens_trained": 9.534608992 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8283100489326998, + "grad_norm": 1.930834412574768, + "loss": 1.1031, + "loss_ce": 1.1864138841629028, + "loss_region": 0.03002503328025341, + "loss_total": 1.216438889503479, + "lr": 0.0011263177004980256, + "router/selected_tokens_s0": 4379.125, + "step": 2920, + "tokens_trained": 9.567373632 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8311467271824693, + "grad_norm": 0.7202333807945251, + "loss": 1.103, + "loss_ce": 1.0883651971817017, + "loss_region": 0.030032740905880928, + "loss_total": 1.1183979511260986, + "lr": 0.0011259108085141746, + "router/selected_tokens_s0": 4386.375, + "step": 2930, + "tokens_trained": 9.600139072 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8339834054322388, + "grad_norm": 1.0626195669174194, + "loss": 1.1043, + "loss_ce": 1.0197147130966187, + "loss_region": 0.03001200221478939, + "loss_total": 1.0497267246246338, + "lr": 0.0011255039165303236, + "router/selected_tokens_s0": 4317.625, + "step": 2940, + "tokens_trained": 9.632904512 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8368200836820083, + "grad_norm": 2.428861379623413, + "loss": 1.1036, + "loss_ce": 0.9022196531295776, + "loss_region": 0.030008511617779732, + "loss_total": 0.932228147983551, + "lr": 0.0011250970245464725, + "router/selected_tokens_s0": 4322.125, + "step": 2950, + "tokens_trained": 9.665669952 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8396567619317779, + "grad_norm": 0.9146430492401123, + "loss": 1.1015, + "loss_ce": 1.1206673383712769, + "loss_region": 0.030019836500287056, + "loss_total": 1.1506872177124023, + "lr": 0.0011246901325626217, + "router/selected_tokens_s0": 4355.75, + "step": 2960, + "tokens_trained": 9.698432616 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8424934401815474, + "grad_norm": 1.3183574676513672, + "loss": 1.0992, + "loss_ce": 1.088549256324768, + "loss_region": 0.030013682320713997, + "loss_total": 1.118562936782837, + "lr": 0.0011242832405787707, + "router/selected_tokens_s0": 4324.875, + "step": 2970, + "tokens_trained": 9.731196464 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8453301184313169, + "grad_norm": 1.7237669229507446, + "loss": 1.1016, + "loss_ce": 1.1303554773330688, + "loss_region": 0.030019812285900116, + "loss_total": 1.1603752374649048, + "lr": 0.0011238763485949196, + "router/selected_tokens_s0": 4352.75, + "step": 2980, + "tokens_trained": 9.763955848 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8481667966810864, + "grad_norm": 2.353868246078491, + "loss": 1.097, + "loss_ce": 1.1438567638397217, + "loss_region": 0.03001641482114792, + "loss_total": 1.1738731861114502, + "lr": 0.0011234694566110684, + "router/selected_tokens_s0": 4346.25, + "step": 2990, + "tokens_trained": 9.796721288 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8510034749308559, + "grad_norm": 2.239737033843994, + "loss": 1.099, + "loss_ce": 1.137770414352417, + "loss_region": 0.03001844510436058, + "loss_total": 1.1677888631820679, + "lr": 0.0011230625646272174, + "router/selected_tokens_s0": 4345.625, + "step": 3000, + "tokens_trained": 9.829486728 + }, + { + "epoch": 0.8510034749308559, + "eval_ppl": 2.91798250107805, + "eval_runtime": 2.489, + "step": 3000, + "tokens_trained": 9.829486728 + }, + { + "epoch": 0.8510034749308559, + "eval_F": 0.34119725725854944, + "eval_F_cds": 0.339909922293828, + "eval_F_dig": 0.3374221944422741, + "eval_F_exon": 0.3444720286625102, + "eval_F_intron": 0.3423051363848719, + "eval_F_nig": 0.3420074982635899, + "eval_F_promoter": 0.33568609090152685, + "eval_F_utr": 0.3433317082766702, + "eval_G": 0.35626090599344656, + "eval_G_cds": 0.3533774528284723, + "eval_G_dig": 0.39929882420827145, + "eval_G_exon": 0.35481589922102014, + "eval_G_intron": 0.3559872186522367, + "eval_G_nig": 0.35704285773301014, + "eval_G_promoter": 0.354736183175574, + "eval_G_utr": 0.3543053844969594, + "eval_avg_bp_per_token": 2.930855916119598, + "eval_bp_per_token/cds": 2.9419558959963843, + "eval_bp_per_token/dig": 2.9636461870947826, + "eval_bp_per_token/exon": 2.9029933254166496, + "eval_bp_per_token/intron": 2.921370127720335, + "eval_bp_per_token/nig": 2.923912502144284, + "eval_bp_per_token/promoter": 2.9789735920078644, + "eval_bp_per_token/utr": 2.9126351452344177, + "eval_ppl_cds": 3.5636364626812047, + "eval_ppl_dig": 1.0968188962634289, + "eval_ppl_exon": 3.3285669782872387, + "eval_ppl_intron": 2.935885553210843, + "eval_ppl_nig": 2.7347129604188645, + "eval_ppl_promoter": 3.292230226733986, + "eval_ppl_utr": 3.2942767869833, + "step": 3000, + "tokens_trained": 9.829486728 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8538401531806254, + "grad_norm": 1.0114418268203735, + "loss": 1.092, + "loss_ce": 1.0748389959335327, + "loss_region": 0.030020562931895256, + "loss_total": 1.1048595905303955, + "lr": 0.0011226556726433663, + "router/selected_tokens_s0": 4357.25, + "step": 3010, + "tokens_trained": 9.862252168 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.856676831430395, + "grad_norm": 2.267549753189087, + "loss": 1.0971, + "loss_ce": 1.088890552520752, + "loss_region": 0.03001115657389164, + "loss_total": 1.1189017295837402, + "lr": 0.0011222487806595153, + "router/selected_tokens_s0": 4332.125, + "step": 3020, + "tokens_trained": 9.895016808 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8595135096801645, + "grad_norm": 1.3197458982467651, + "loss": 1.0948, + "loss_ce": 0.9036920070648193, + "loss_region": 0.029985321685671806, + "loss_total": 0.9336773157119751, + "lr": 0.0011218418886756645, + "router/selected_tokens_s0": 4263.875, + "step": 3030, + "tokens_trained": 9.927782248 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.862350187929934, + "grad_norm": 1.6852810382843018, + "loss": 1.1014, + "loss_ce": 1.1086361408233643, + "loss_region": 0.030025212094187737, + "loss_total": 1.1386613845825195, + "lr": 0.0011214349966918134, + "router/selected_tokens_s0": 4387.0, + "step": 3040, + "tokens_trained": 9.960547688 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8651868661797035, + "grad_norm": 1.753929853439331, + "loss": 1.1021, + "loss_ce": 1.020461082458496, + "loss_region": 0.03000788949429989, + "loss_total": 1.050468921661377, + "lr": 0.0011210281047079624, + "router/selected_tokens_s0": 4309.625, + "step": 3050, + "tokens_trained": 9.993313128 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.868023544429473, + "grad_norm": 1.005724310874939, + "loss": 1.1004, + "loss_ce": 1.0221396684646606, + "loss_region": 0.030008656904101372, + "loss_total": 1.0521483421325684, + "lr": 0.0011206212127241114, + "router/selected_tokens_s0": 4308.25, + "step": 3060, + "tokens_trained": 10.026078568 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8708602226792426, + "grad_norm": 1.1538729667663574, + "loss": 1.0968, + "loss_ce": 1.045743465423584, + "loss_region": 0.03000555746257305, + "loss_total": 1.0757490396499634, + "lr": 0.0011202143207402603, + "router/selected_tokens_s0": 4306.375, + "step": 3070, + "tokens_trained": 10.058844008 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8736969009290121, + "grad_norm": 1.8730782270431519, + "loss": 1.1067, + "loss_ce": 1.0756598711013794, + "loss_region": 0.03001215122640133, + "loss_total": 1.105672001838684, + "lr": 0.0011198074287564093, + "router/selected_tokens_s0": 4321.25, + "step": 3080, + "tokens_trained": 10.091604144 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8765335791787816, + "grad_norm": 2.151102066040039, + "loss": 1.1026, + "loss_ce": 1.1392779350280762, + "loss_region": 0.030021771788597107, + "loss_total": 1.1692997217178345, + "lr": 0.0011194005367725583, + "router/selected_tokens_s0": 4390.625, + "step": 3090, + "tokens_trained": 10.124368784 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8793702574285511, + "grad_norm": 2.1336331367492676, + "loss": 1.0989, + "loss_ce": 1.0690431594848633, + "loss_region": 0.030014000833034515, + "loss_total": 1.0990571975708008, + "lr": 0.0011189936447887072, + "router/selected_tokens_s0": 4350.0, + "step": 3100, + "tokens_trained": 10.157134224 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8822069356783206, + "grad_norm": 2.0660271644592285, + "loss": 1.0952, + "loss_ce": 1.130868911743164, + "loss_region": 0.030013030394911766, + "loss_total": 1.1608819961547852, + "lr": 0.0011185867528048562, + "router/selected_tokens_s0": 4349.5, + "step": 3110, + "tokens_trained": 10.189899664 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8850436139280902, + "grad_norm": 0.5485074520111084, + "loss": 1.0989, + "loss_ce": 1.0510705709457397, + "loss_region": 0.030014289543032646, + "loss_total": 1.0810848474502563, + "lr": 0.0011181798608210052, + "router/selected_tokens_s0": 4352.875, + "step": 3120, + "tokens_trained": 10.222665104 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8878802921778597, + "grad_norm": 1.4256670475006104, + "loss": 1.098, + "loss_ce": 1.012781023979187, + "loss_region": 0.030008379369974136, + "loss_total": 1.0427894592285156, + "lr": 0.0011177729688371541, + "router/selected_tokens_s0": 4316.25, + "step": 3130, + "tokens_trained": 10.255429744 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8907169704276292, + "grad_norm": 1.5442920923233032, + "loss": 1.0946, + "loss_ce": 0.9727160334587097, + "loss_region": 0.030018145218491554, + "loss_total": 1.0027341842651367, + "lr": 0.001117366076853303, + "router/selected_tokens_s0": 4410.125, + "step": 3140, + "tokens_trained": 10.288194384 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8935536486773987, + "grad_norm": 1.4367228746414185, + "loss": 1.0967, + "loss_ce": 1.1256974935531616, + "loss_region": 0.03001110814511776, + "loss_total": 1.1557085514068604, + "lr": 0.001116959184869452, + "router/selected_tokens_s0": 4353.0, + "step": 3150, + "tokens_trained": 10.320959664 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8963903269271682, + "grad_norm": 1.1446796655654907, + "loss": 1.0908, + "loss_ce": 1.0701184272766113, + "loss_region": 0.03002384677529335, + "loss_total": 1.100142240524292, + "lr": 0.001116552292885601, + "router/selected_tokens_s0": 4393.625, + "step": 3160, + "tokens_trained": 10.353725104 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.8992270051769378, + "grad_norm": 1.2145036458969116, + "loss": 1.0973, + "loss_ce": 1.064907193183899, + "loss_region": 0.030015455558896065, + "loss_total": 1.094922661781311, + "lr": 0.00111614540090175, + "router/selected_tokens_s0": 4343.5, + "step": 3170, + "tokens_trained": 10.386490544 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9020636834267073, + "grad_norm": 1.4071613550186157, + "loss": 1.1022, + "loss_ce": 1.0602645874023438, + "loss_region": 0.030010098591446877, + "loss_total": 1.090274691581726, + "lr": 0.001115738508917899, + "router/selected_tokens_s0": 4317.875, + "step": 3180, + "tokens_trained": 10.419255984 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9049003616764768, + "grad_norm": 1.6725516319274902, + "loss": 1.0985, + "loss_ce": 1.0676279067993164, + "loss_region": 0.030014997348189354, + "loss_total": 1.0976428985595703, + "lr": 0.001115331616934048, + "router/selected_tokens_s0": 4344.375, + "step": 3190, + "tokens_trained": 10.452021424 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9077370399262463, + "grad_norm": 1.2930175065994263, + "loss": 1.0925, + "loss_ce": 1.0800068378448486, + "loss_region": 0.030022740364074707, + "loss_total": 1.1100295782089233, + "lr": 0.0011149247249501969, + "router/selected_tokens_s0": 4410.0, + "step": 3200, + "tokens_trained": 10.484786064 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9105737181760158, + "grad_norm": 1.8128279447555542, + "loss": 1.0904, + "loss_ce": 0.978069007396698, + "loss_region": 0.03001835197210312, + "loss_total": 1.008087396621704, + "lr": 0.001114517832966346, + "router/selected_tokens_s0": 4359.875, + "step": 3210, + "tokens_trained": 10.517551504 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9134103964257854, + "grad_norm": 2.7744452953338623, + "loss": 1.0922, + "loss_ce": 1.0043200254440308, + "loss_region": 0.030023187398910522, + "loss_total": 1.0343432426452637, + "lr": 0.001114110940982495, + "router/selected_tokens_s0": 4378.0, + "step": 3220, + "tokens_trained": 10.550316944 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9162470746755549, + "grad_norm": 1.3683607578277588, + "loss": 1.1049, + "loss_ce": 1.1094452142715454, + "loss_region": 0.03001292422413826, + "loss_total": 1.139458179473877, + "lr": 0.001113704048998644, + "router/selected_tokens_s0": 4363.5, + "step": 3230, + "tokens_trained": 10.583082384 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9190837529253244, + "grad_norm": 0.2690750062465668, + "loss": 1.0941, + "loss_ce": 1.1386804580688477, + "loss_region": 0.030020570382475853, + "loss_total": 1.1687010526657104, + "lr": 0.0011132971570147927, + "router/selected_tokens_s0": 4375.5, + "step": 3240, + "tokens_trained": 10.615847824 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9219204311750939, + "grad_norm": 3.4670774936676025, + "loss": 1.1104, + "loss_ce": 1.0814176797866821, + "loss_region": 0.030020495876669884, + "loss_total": 1.1114381551742554, + "lr": 0.0011128902650309417, + "router/selected_tokens_s0": 4377.625, + "step": 3250, + "tokens_trained": 10.648613248 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9247571094248634, + "grad_norm": 0.8661336302757263, + "loss": 1.0912, + "loss_ce": 0.8863070011138916, + "loss_region": 0.030015867203474045, + "loss_total": 0.9163228869438171, + "lr": 0.0011124833730470907, + "router/selected_tokens_s0": 4347.25, + "step": 3260, + "tokens_trained": 10.681378688 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.927593787674633, + "grad_norm": 1.5195131301879883, + "loss": 1.0884, + "loss_ce": 0.9159345626831055, + "loss_region": 0.03002041205763817, + "loss_total": 0.9459549784660339, + "lr": 0.0011120764810632396, + "router/selected_tokens_s0": 4345.625, + "step": 3270, + "tokens_trained": 10.714144128 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9304304659244025, + "grad_norm": 1.2169824838638306, + "loss": 1.0911, + "loss_ce": 1.0144977569580078, + "loss_region": 0.03001958690583706, + "loss_total": 1.0445173978805542, + "lr": 0.0011116695890793888, + "router/selected_tokens_s0": 4394.125, + "step": 3280, + "tokens_trained": 10.746909568 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.933267144174172, + "grad_norm": 0.5433168411254883, + "loss": 1.0815, + "loss_ce": 1.0072896480560303, + "loss_region": 0.03001921810209751, + "loss_total": 1.0373088121414185, + "lr": 0.0011112626970955378, + "router/selected_tokens_s0": 4374.375, + "step": 3290, + "tokens_trained": 10.779675008 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9361038224239415, + "grad_norm": 1.009089469909668, + "loss": 1.0887, + "loss_ce": 1.1322096586227417, + "loss_region": 0.030020419508218765, + "loss_total": 1.162230134010315, + "lr": 0.0011108558051116867, + "router/selected_tokens_s0": 4387.875, + "step": 3300, + "tokens_trained": 10.812440448 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.938940500673711, + "grad_norm": 1.1186658143997192, + "loss": 1.0913, + "loss_ce": 1.0417039394378662, + "loss_region": 0.030010957270860672, + "loss_total": 1.0717148780822754, + "lr": 0.0011104489131278357, + "router/selected_tokens_s0": 4330.5, + "step": 3310, + "tokens_trained": 10.845202832 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9417771789234806, + "grad_norm": 1.406549334526062, + "loss": 1.0884, + "loss_ce": 1.1175626516342163, + "loss_region": 0.030026914551854134, + "loss_total": 1.1475895643234253, + "lr": 0.0011100420211439847, + "router/selected_tokens_s0": 4413.375, + "step": 3320, + "tokens_trained": 10.877968272 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9446138571732501, + "grad_norm": 1.7244771718978882, + "loss": 1.0946, + "loss_ce": 1.044142246246338, + "loss_region": 0.03001406043767929, + "loss_total": 1.0741562843322754, + "lr": 0.0011096351291601336, + "router/selected_tokens_s0": 4364.875, + "step": 3330, + "tokens_trained": 10.910733712 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9474505354230196, + "grad_norm": 1.0954854488372803, + "loss": 1.0904, + "loss_ce": 1.0164037942886353, + "loss_region": 0.030010055750608444, + "loss_total": 1.0464138984680176, + "lr": 0.0011092282371762826, + "router/selected_tokens_s0": 4342.75, + "step": 3340, + "tokens_trained": 10.943499152 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9502872136727891, + "grad_norm": 2.0860025882720947, + "loss": 1.0943, + "loss_ce": 1.155411958694458, + "loss_region": 0.030009519308805466, + "loss_total": 1.1854214668273926, + "lr": 0.0011088213451924316, + "router/selected_tokens_s0": 4352.25, + "step": 3350, + "tokens_trained": 10.976264592 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9531238919225586, + "grad_norm": 1.1401242017745972, + "loss": 1.0872, + "loss_ce": 1.0502551794052124, + "loss_region": 0.030008839443325996, + "loss_total": 1.0802639722824097, + "lr": 0.0011084144532085805, + "router/selected_tokens_s0": 4341.5, + "step": 3360, + "tokens_trained": 11.009030032 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9559605701723282, + "grad_norm": 1.704352617263794, + "loss": 1.0874, + "loss_ce": 0.9294300079345703, + "loss_region": 0.030014311894774437, + "loss_total": 0.9594443440437317, + "lr": 0.0011080075612247295, + "router/selected_tokens_s0": 4352.125, + "step": 3370, + "tokens_trained": 11.041794728 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9587972484220977, + "grad_norm": 0.9261600971221924, + "loss": 1.0935, + "loss_ce": 1.0589406490325928, + "loss_region": 0.030018316581845284, + "loss_total": 1.088958978652954, + "lr": 0.0011076006692408785, + "router/selected_tokens_s0": 4379.125, + "step": 3380, + "tokens_trained": 11.074559368 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9616339266718672, + "grad_norm": 0.7537907958030701, + "loss": 1.0865, + "loss_ce": 1.0465339422225952, + "loss_region": 0.030020853504538536, + "loss_total": 1.076554775238037, + "lr": 0.0011071937772570274, + "router/selected_tokens_s0": 4404.75, + "step": 3390, + "tokens_trained": 11.107324808 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9644706049216367, + "grad_norm": 1.0815021991729736, + "loss": 1.0952, + "loss_ce": 1.0314733982086182, + "loss_region": 0.030017558485269547, + "loss_total": 1.0614910125732422, + "lr": 0.0011067868852731764, + "router/selected_tokens_s0": 4383.625, + "step": 3400, + "tokens_trained": 11.140090248 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9673072831714062, + "grad_norm": 1.633634090423584, + "loss": 1.088, + "loss_ce": 0.9958590269088745, + "loss_region": 0.030007656663656235, + "loss_total": 1.0258666276931763, + "lr": 0.0011063799932893254, + "router/selected_tokens_s0": 4323.5, + "step": 3410, + "tokens_trained": 11.172855688 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9701439614211758, + "grad_norm": 1.0511754751205444, + "loss": 1.0871, + "loss_ce": 1.0179994106292725, + "loss_region": 0.030009476467967033, + "loss_total": 1.048008918762207, + "lr": 0.0011059731013054743, + "router/selected_tokens_s0": 4384.875, + "step": 3420, + "tokens_trained": 11.205621096 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9729806396709453, + "grad_norm": 1.6246494054794312, + "loss": 1.0822, + "loss_ce": 0.9318454265594482, + "loss_region": 0.029994873329997063, + "loss_total": 0.9618402719497681, + "lr": 0.0011055662093216233, + "router/selected_tokens_s0": 4316.375, + "step": 3430, + "tokens_trained": 11.238384136 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9758173179207148, + "grad_norm": 1.043264627456665, + "loss": 1.0927, + "loss_ce": 1.0697022676467896, + "loss_region": 0.03001013770699501, + "loss_total": 1.0997123718261719, + "lr": 0.0011051593173377723, + "router/selected_tokens_s0": 4351.5, + "step": 3440, + "tokens_trained": 11.271149576 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9786539961704843, + "grad_norm": 0.5135401487350464, + "loss": 1.0883, + "loss_ce": 1.0403201580047607, + "loss_region": 0.03001248463988304, + "loss_total": 1.070332646369934, + "lr": 0.0011047524253539212, + "router/selected_tokens_s0": 4380.0, + "step": 3450, + "tokens_trained": 11.303915016 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9814906744202538, + "grad_norm": 0.6191660165786743, + "loss": 1.0821, + "loss_ce": 0.9997903108596802, + "loss_region": 0.030010463669896126, + "loss_total": 1.0298007726669312, + "lr": 0.0011043455333700704, + "router/selected_tokens_s0": 4325.0, + "step": 3460, + "tokens_trained": 11.336680456 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9843273526700234, + "grad_norm": 2.0896031856536865, + "loss": 1.0852, + "loss_ce": 0.9407132863998413, + "loss_region": 0.030007831752300262, + "loss_total": 0.9707211256027222, + "lr": 0.0011039386413862194, + "router/selected_tokens_s0": 4340.0, + "step": 3470, + "tokens_trained": 11.369444288 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9871640309197929, + "grad_norm": 1.5842080116271973, + "loss": 1.0773, + "loss_ce": 1.0703927278518677, + "loss_region": 0.030027827247977257, + "loss_total": 1.100420594215393, + "lr": 0.0011035317494023683, + "router/selected_tokens_s0": 4422.875, + "step": 3480, + "tokens_trained": 11.402209728 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9900007091695624, + "grad_norm": 1.6284222602844238, + "loss": 1.0867, + "loss_ce": 1.0688589811325073, + "loss_region": 0.03001173585653305, + "loss_total": 1.0988707542419434, + "lr": 0.001103124857418517, + "router/selected_tokens_s0": 4370.375, + "step": 3490, + "tokens_trained": 11.434975168 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9928373874193319, + "grad_norm": 1.9294172525405884, + "loss": 1.0776, + "loss_ce": 1.0885505676269531, + "loss_region": 0.03002048470079899, + "loss_total": 1.1185710430145264, + "lr": 0.001102717965434666, + "router/selected_tokens_s0": 4402.375, + "step": 3500, + "tokens_trained": 11.467740608 + }, + { + "epoch": 0.9928373874193319, + "eval_ppl": 2.8891544142739582, + "eval_runtime": 2.4909, + "step": 3500, + "tokens_trained": 11.467740608 + }, + { + "epoch": 0.9928373874193319, + "eval_F": 0.3418065949445779, + "eval_F_cds": 0.34451083924282977, + "eval_F_dig": 0.33807328697554495, + "eval_F_exon": 0.34726493074167064, + "eval_F_intron": 0.34224230575312725, + "eval_F_nig": 0.3415901920743997, + "eval_F_promoter": 0.339251188483381, + "eval_F_utr": 0.34464885946681034, + "eval_G": 0.34547800502934484, + "eval_G_cds": 0.3442593687333732, + "eval_G_dig": 0.3991668762370758, + "eval_G_exon": 0.3453487502027675, + "eval_G_intron": 0.34490806786026235, + "eval_G_nig": 0.3455515495124094, + "eval_G_promoter": 0.34497689050299185, + "eval_G_utr": 0.3442341927156835, + "eval_avg_bp_per_token": 2.9256310872589943, + "eval_bp_per_token/cds": 2.902666291132704, + "eval_bp_per_token/dig": 2.957938525537324, + "eval_bp_per_token/exon": 2.8796458020228277, + "eval_bp_per_token/intron": 2.921906448121405, + "eval_bp_per_token/nig": 2.9274845215175147, + "eval_bp_per_token/promoter": 2.947668376551575, + "eval_bp_per_token/utr": 2.9015038713519954, + "eval_ppl_cds": 3.5389763754938555, + "eval_ppl_dig": 1.091459889456152, + "eval_ppl_exon": 3.306826954534152, + "eval_ppl_intron": 2.9106190204474447, + "eval_ppl_nig": 2.694991382732784, + "eval_ppl_promoter": 3.283923741138257, + "eval_ppl_utr": 3.295742249982149, + "step": 3500, + "tokens_trained": 11.467740608 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.9956740656691014, + "grad_norm": 1.324675440788269, + "loss": 1.0868, + "loss_ce": 1.0776382684707642, + "loss_region": 0.03000813163816929, + "loss_total": 1.1076463460922241, + "lr": 0.001102311073450815, + "router/selected_tokens_s0": 4340.25, + "step": 3510, + "tokens_trained": 11.500506048 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 0.998510743918871, + "grad_norm": 1.6382735967636108, + "loss": 1.0901, + "loss_ce": 1.0050663948059082, + "loss_region": 0.030013620853424072, + "loss_total": 1.0350799560546875, + "lr": 0.001101904181466964, + "router/selected_tokens_s0": 4364.0, + "step": 3520, + "tokens_trained": 11.533271488 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.0011346712999079, + "grad_norm": 1.1215876340866089, + "loss": 1.0781, + "loss_ce": 1.0582538843154907, + "loss_region": 0.030011579394340515, + "loss_total": 1.0882654190063477, + "lr": 0.0011014972894831132, + "router/selected_tokens_s0": 4343.125, + "step": 3530, + "tokens_trained": 11.56357952 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.0039713495496774, + "grad_norm": 1.4025973081588745, + "loss": 1.0809, + "loss_ce": 1.037977695465088, + "loss_region": 0.02999720722436905, + "loss_total": 1.0679749250411987, + "lr": 0.0011010903974992621, + "router/selected_tokens_s0": 4358.5, + "step": 3540, + "tokens_trained": 11.59634496 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.006808027799447, + "grad_norm": 0.7676182985305786, + "loss": 1.0854, + "loss_ce": 0.96112459897995, + "loss_region": 0.030009040609002113, + "loss_total": 0.9911336302757263, + "lr": 0.001100683505515411, + "router/selected_tokens_s0": 4343.25, + "step": 3550, + "tokens_trained": 11.6291104 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.0096447060492164, + "grad_norm": 0.8700928688049316, + "loss": 1.0844, + "loss_ce": 1.12131667137146, + "loss_region": 0.030012287199497223, + "loss_total": 1.1513289213180542, + "lr": 0.00110027661353156, + "router/selected_tokens_s0": 4377.75, + "step": 3560, + "tokens_trained": 11.66187584 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.012481384298986, + "grad_norm": 0.3621160089969635, + "loss": 1.0866, + "loss_ce": 0.9774419665336609, + "loss_region": 0.030013367533683777, + "loss_total": 1.0074553489685059, + "lr": 0.001099869721547709, + "router/selected_tokens_s0": 4358.375, + "step": 3570, + "tokens_trained": 11.69464128 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.0153180625487555, + "grad_norm": 0.8118414878845215, + "loss": 1.089, + "loss_ce": 1.0349894762039185, + "loss_region": 0.03000425547361374, + "loss_total": 1.0649937391281128, + "lr": 0.001099462829563858, + "router/selected_tokens_s0": 4320.625, + "step": 3580, + "tokens_trained": 11.72740592 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.018154740798525, + "grad_norm": 1.3667856454849243, + "loss": 1.0864, + "loss_ce": 1.1101551055908203, + "loss_region": 0.030023517087101936, + "loss_total": 1.1401786804199219, + "lr": 0.001099055937580007, + "router/selected_tokens_s0": 4445.875, + "step": 3590, + "tokens_trained": 11.76017136 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.0209914190482945, + "grad_norm": 1.459873914718628, + "loss": 1.0875, + "loss_ce": 1.038351058959961, + "loss_region": 0.030004626139998436, + "loss_total": 1.068355679512024, + "lr": 0.001098649045596156, + "router/selected_tokens_s0": 4320.75, + "step": 3600, + "tokens_trained": 11.7929352 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.023828097298064, + "grad_norm": 0.9393401145935059, + "loss": 1.0852, + "loss_ce": 1.040799856185913, + "loss_region": 0.030013561248779297, + "loss_total": 1.0708134174346924, + "lr": 0.0010982421536123049, + "router/selected_tokens_s0": 4366.625, + "step": 3610, + "tokens_trained": 11.82570064 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.0266647755478335, + "grad_norm": 1.4277124404907227, + "loss": 1.0821, + "loss_ce": 0.9711215496063232, + "loss_region": 0.03001675009727478, + "loss_total": 1.0011383295059204, + "lr": 0.0010978352616284538, + "router/selected_tokens_s0": 4369.5, + "step": 3620, + "tokens_trained": 11.85846448 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.029501453797603, + "grad_norm": 0.8825812935829163, + "loss": 1.0782, + "loss_ce": 1.0676993131637573, + "loss_region": 0.030013611540198326, + "loss_total": 1.0977128744125366, + "lr": 0.0010974283696446028, + "router/selected_tokens_s0": 4372.375, + "step": 3630, + "tokens_trained": 11.89122992 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.0323381320473726, + "grad_norm": 0.9022896885871887, + "loss": 1.0733, + "loss_ce": 1.0407861471176147, + "loss_region": 0.030012723058462143, + "loss_total": 1.0707988739013672, + "lr": 0.0010970214776607518, + "router/selected_tokens_s0": 4355.75, + "step": 3640, + "tokens_trained": 11.92399536 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.035174810297142, + "grad_norm": 0.8870510458946228, + "loss": 1.0749, + "loss_ce": 1.1323661804199219, + "loss_region": 0.03002365306019783, + "loss_total": 1.162389874458313, + "lr": 0.0010966145856769007, + "router/selected_tokens_s0": 4435.75, + "step": 3650, + "tokens_trained": 11.9567608 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.0380114885469116, + "grad_norm": 1.4656965732574463, + "loss": 1.0832, + "loss_ce": 1.0585097074508667, + "loss_region": 0.030004626139998436, + "loss_total": 1.0885143280029297, + "lr": 0.0010962076936930497, + "router/selected_tokens_s0": 4329.25, + "step": 3660, + "tokens_trained": 11.98952624 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.0408481667966811, + "grad_norm": 0.9163527488708496, + "loss": 1.0781, + "loss_ce": 1.0935940742492676, + "loss_region": 0.030009469017386436, + "loss_total": 1.1236035823822021, + "lr": 0.0010958008017091987, + "router/selected_tokens_s0": 4359.375, + "step": 3670, + "tokens_trained": 12.02229168 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.0436848450464506, + "grad_norm": 1.3013805150985718, + "loss": 1.0876, + "loss_ce": 1.0595567226409912, + "loss_region": 0.030006933957338333, + "loss_total": 1.0895636081695557, + "lr": 0.0010953939097253476, + "router/selected_tokens_s0": 4343.625, + "step": 3680, + "tokens_trained": 12.05505712 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.0465215232962202, + "grad_norm": 0.39981648325920105, + "loss": 1.0707, + "loss_ce": 1.1020740270614624, + "loss_region": 0.030012022703886032, + "loss_total": 1.1320860385894775, + "lr": 0.0010949870177414966, + "router/selected_tokens_s0": 4384.75, + "step": 3690, + "tokens_trained": 12.08782256 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.0493582015459897, + "grad_norm": 0.9564698338508606, + "loss": 1.0787, + "loss_ce": 1.0650237798690796, + "loss_region": 0.03000781685113907, + "loss_total": 1.0950316190719604, + "lr": 0.0010945801257576456, + "router/selected_tokens_s0": 4363.75, + "step": 3700, + "tokens_trained": 12.1205864 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.0521948797957592, + "grad_norm": 0.9331677556037903, + "loss": 1.08, + "loss_ce": 1.0856927633285522, + "loss_region": 0.030012279748916626, + "loss_total": 1.1157050132751465, + "lr": 0.0010941732337737947, + "router/selected_tokens_s0": 4404.5, + "step": 3710, + "tokens_trained": 12.15335184 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.0550315580455287, + "grad_norm": 1.2002500295639038, + "loss": 1.0818, + "loss_ce": 1.058534026145935, + "loss_region": 0.030018793419003487, + "loss_total": 1.0885528326034546, + "lr": 0.0010937663417899437, + "router/selected_tokens_s0": 4392.125, + "step": 3720, + "tokens_trained": 12.18611712 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.0578682362952982, + "grad_norm": 1.4924200773239136, + "loss": 1.0778, + "loss_ce": 1.020750880241394, + "loss_region": 0.030010871589183807, + "loss_total": 1.0507616996765137, + "lr": 0.0010933594498060927, + "router/selected_tokens_s0": 4356.625, + "step": 3730, + "tokens_trained": 12.21888256 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.0607049145450678, + "grad_norm": 0.6684730648994446, + "loss": 1.0769, + "loss_ce": 1.0789055824279785, + "loss_region": 0.030011769384145737, + "loss_total": 1.1089173555374146, + "lr": 0.0010929525578222414, + "router/selected_tokens_s0": 4387.875, + "step": 3740, + "tokens_trained": 12.251648 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.0635415927948373, + "grad_norm": 0.6117927432060242, + "loss": 1.0797, + "loss_ce": 1.0758237838745117, + "loss_region": 0.030013950541615486, + "loss_total": 1.1058377027511597, + "lr": 0.0010925456658383904, + "router/selected_tokens_s0": 4407.75, + "step": 3750, + "tokens_trained": 12.284409608 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.0663782710446068, + "grad_norm": 0.6212737560272217, + "loss": 1.08, + "loss_ce": 1.074630618095398, + "loss_region": 0.03000750206410885, + "loss_total": 1.1046380996704102, + "lr": 0.0010921387738545394, + "router/selected_tokens_s0": 4371.375, + "step": 3760, + "tokens_trained": 12.317175048 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.0692149492943763, + "grad_norm": 1.4000393152236938, + "loss": 1.0721, + "loss_ce": 0.9761142134666443, + "loss_region": 0.030007638037204742, + "loss_total": 1.0061218738555908, + "lr": 0.0010917318818706883, + "router/selected_tokens_s0": 4287.125, + "step": 3770, + "tokens_trained": 12.349940488 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.0720516275441458, + "grad_norm": 0.7589385509490967, + "loss": 1.0796, + "loss_ce": 1.0433313846588135, + "loss_region": 0.030011288821697235, + "loss_total": 1.0733426809310913, + "lr": 0.0010913249898868375, + "router/selected_tokens_s0": 4376.375, + "step": 3780, + "tokens_trained": 12.382705928 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.0748883057939154, + "grad_norm": 0.9254264235496521, + "loss": 1.0757, + "loss_ce": 1.0672318935394287, + "loss_region": 0.030009465292096138, + "loss_total": 1.0972414016723633, + "lr": 0.0010909180979029865, + "router/selected_tokens_s0": 4369.625, + "step": 3790, + "tokens_trained": 12.415470568 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.0777249840436849, + "grad_norm": 0.8547407984733582, + "loss": 1.0803, + "loss_ce": 1.086848497390747, + "loss_region": 0.03000779263675213, + "loss_total": 1.116856336593628, + "lr": 0.0010905112059191354, + "router/selected_tokens_s0": 4377.5, + "step": 3800, + "tokens_trained": 12.448236008 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.0805616622934544, + "grad_norm": 2.435622215270996, + "loss": 1.0783, + "loss_ce": 1.0909473896026611, + "loss_region": 0.03001653589308262, + "loss_total": 1.1209639310836792, + "lr": 0.0010901043139352844, + "router/selected_tokens_s0": 4412.5, + "step": 3810, + "tokens_trained": 12.481001448 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.083398340543224, + "grad_norm": 0.8429534435272217, + "loss": 1.0679, + "loss_ce": 1.0455752611160278, + "loss_region": 0.030013732612133026, + "loss_total": 1.0755889415740967, + "lr": 0.0010896974219514334, + "router/selected_tokens_s0": 4386.125, + "step": 3820, + "tokens_trained": 12.513766888 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.0862350187929934, + "grad_norm": 1.3827040195465088, + "loss": 1.0802, + "loss_ce": 1.1371749639511108, + "loss_region": 0.030014697462320328, + "loss_total": 1.1671897172927856, + "lr": 0.0010892905299675823, + "router/selected_tokens_s0": 4394.5, + "step": 3830, + "tokens_trained": 12.546532328 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.089071697042763, + "grad_norm": 1.121778130531311, + "loss": 1.0882, + "loss_ce": 1.0060161352157593, + "loss_region": 0.030006043612957, + "loss_total": 1.0360221862792969, + "lr": 0.0010888836379837313, + "router/selected_tokens_s0": 4312.375, + "step": 3840, + "tokens_trained": 12.57929404 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.0919083752925325, + "grad_norm": 1.114713430404663, + "loss": 1.0791, + "loss_ce": 1.0474870204925537, + "loss_region": 0.03002369962632656, + "loss_total": 1.0775107145309448, + "lr": 0.0010884767459998803, + "router/selected_tokens_s0": 4427.0, + "step": 3850, + "tokens_trained": 12.61205788 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.094745053542302, + "grad_norm": 0.42109477519989014, + "loss": 1.0759, + "loss_ce": 1.0544071197509766, + "loss_region": 0.030009722337126732, + "loss_total": 1.0844168663024902, + "lr": 0.0010880698540160292, + "router/selected_tokens_s0": 4372.875, + "step": 3860, + "tokens_trained": 12.64482332 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.0975817317920715, + "grad_norm": 1.0385088920593262, + "loss": 1.0656, + "loss_ce": 1.1451784372329712, + "loss_region": 0.030008675530552864, + "loss_total": 1.175187110900879, + "lr": 0.0010876629620321782, + "router/selected_tokens_s0": 4386.0, + "step": 3870, + "tokens_trained": 12.677587992 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.100418410041841, + "grad_norm": 1.0264872312545776, + "loss": 1.0732, + "loss_ce": 1.0522844791412354, + "loss_region": 0.030011439695954323, + "loss_total": 1.0822958946228027, + "lr": 0.0010872560700483272, + "router/selected_tokens_s0": 4350.75, + "step": 3880, + "tokens_trained": 12.710352632 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1032550882916106, + "grad_norm": 0.6505580544471741, + "loss": 1.077, + "loss_ce": 1.0118770599365234, + "loss_region": 0.03001365438103676, + "loss_total": 1.0418907403945923, + "lr": 0.0010868491780644761, + "router/selected_tokens_s0": 4405.125, + "step": 3890, + "tokens_trained": 12.743118072 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.10609176654138, + "grad_norm": 1.2206717729568481, + "loss": 1.0648, + "loss_ce": 1.009666919708252, + "loss_region": 0.030009053647518158, + "loss_total": 1.0396759510040283, + "lr": 0.001086442286080625, + "router/selected_tokens_s0": 4362.0, + "step": 3900, + "tokens_trained": 12.775882712 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1089284447911496, + "grad_norm": 0.9762550592422485, + "loss": 1.0833, + "loss_ce": 1.1374930143356323, + "loss_region": 0.03000687249004841, + "loss_total": 1.1674998998641968, + "lr": 0.001086035394096774, + "router/selected_tokens_s0": 4368.875, + "step": 3910, + "tokens_trained": 12.808648152 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1117651230409191, + "grad_norm": 0.6860953569412231, + "loss": 1.0783, + "loss_ce": 1.1070737838745117, + "loss_region": 0.030013803392648697, + "loss_total": 1.1370875835418701, + "lr": 0.001085628502112923, + "router/selected_tokens_s0": 4440.25, + "step": 3920, + "tokens_trained": 12.841413592 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1146018012906886, + "grad_norm": 0.5641375184059143, + "loss": 1.0779, + "loss_ce": 0.9596564173698425, + "loss_region": 0.030009876936674118, + "loss_total": 0.9896662831306458, + "lr": 0.001085221610129072, + "router/selected_tokens_s0": 4372.25, + "step": 3930, + "tokens_trained": 12.874179032 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1174384795404582, + "grad_norm": 1.3612422943115234, + "loss": 1.0745, + "loss_ce": 1.0073390007019043, + "loss_region": 0.030006812885403633, + "loss_total": 1.0373457670211792, + "lr": 0.001084814718145221, + "router/selected_tokens_s0": 4349.25, + "step": 3940, + "tokens_trained": 12.906944472 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1202751577902277, + "grad_norm": 0.9040305614471436, + "loss": 1.0713, + "loss_ce": 0.9823886156082153, + "loss_region": 0.03001086413860321, + "loss_total": 1.012399435043335, + "lr": 0.00108440782616137, + "router/selected_tokens_s0": 4368.375, + "step": 3950, + "tokens_trained": 12.939709912 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1231118360399972, + "grad_norm": 0.6158255934715271, + "loss": 1.0631, + "loss_ce": 1.072802186012268, + "loss_region": 0.030010098591446877, + "loss_total": 1.1028122901916504, + "lr": 0.001084000934177519, + "router/selected_tokens_s0": 4384.5, + "step": 3960, + "tokens_trained": 12.972475352 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1259485142897667, + "grad_norm": 0.8492525815963745, + "loss": 1.0761, + "loss_ce": 1.1295671463012695, + "loss_region": 0.03000839613378048, + "loss_total": 1.1595755815505981, + "lr": 0.001083594042193668, + "router/selected_tokens_s0": 4394.5, + "step": 3970, + "tokens_trained": 13.005240792 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1287851925395362, + "grad_norm": 1.4454373121261597, + "loss": 1.0671, + "loss_ce": 1.059134840965271, + "loss_region": 0.030011136084794998, + "loss_total": 1.0891460180282593, + "lr": 0.001083187150209817, + "router/selected_tokens_s0": 4354.625, + "step": 3980, + "tokens_trained": 13.038005432 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1316218707893058, + "grad_norm": 1.1143423318862915, + "loss": 1.0746, + "loss_ce": 1.058432698249817, + "loss_region": 0.030015287920832634, + "loss_total": 1.08844792842865, + "lr": 0.0010827802582259658, + "router/selected_tokens_s0": 4413.25, + "step": 3990, + "tokens_trained": 13.070770872 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1344585490390753, + "grad_norm": 0.926952600479126, + "loss": 1.0726, + "loss_ce": 1.0535566806793213, + "loss_region": 0.03000652976334095, + "loss_total": 1.083563208580017, + "lr": 0.0010823733662421147, + "router/selected_tokens_s0": 4332.125, + "step": 4000, + "tokens_trained": 13.103536312 + }, + { + "epoch": 1.1344585490390753, + "eval_ppl": 2.858632587615727, + "eval_runtime": 2.4962, + "step": 4000, + "tokens_trained": 13.103536312 + }, + { + "epoch": 1.1344585490390753, + "eval_F": 0.34138791413731373, + "eval_F_cds": 0.3448908798343993, + "eval_F_dig": 0.3374221944422741, + "eval_F_exon": 0.34516320139927215, + "eval_F_intron": 0.34168107017140814, + "eval_F_nig": 0.3411899187908908, + "eval_F_promoter": 0.33952618612122176, + "eval_F_utr": 0.34327183776802744, + "eval_G": 0.34184569864652725, + "eval_G_cds": 0.34168474533742754, + "eval_G_dig": 0.39810788440503164, + "eval_G_exon": 0.34140616184021216, + "eval_G_intron": 0.3411866290260269, + "eval_G_nig": 0.34169810595567374, + "eval_G_promoter": 0.3417757543881514, + "eval_G_utr": 0.3401876713258752, + "eval_avg_bp_per_token": 2.9292191041003814, + "eval_bp_per_token/cds": 2.899467798279136, + "eval_bp_per_token/dig": 2.9636461870947826, + "eval_bp_per_token/exon": 2.897180220678382, + "eval_bp_per_token/intron": 2.9267058883254458, + "eval_bp_per_token/nig": 2.930918954299122, + "eval_bp_per_token/promoter": 2.945280926411278, + "eval_bp_per_token/utr": 2.9131431418961005, + "eval_ppl_cds": 3.4450720333639553, + "eval_ppl_dig": 1.088491176901866, + "eval_ppl_exon": 3.2953068260471907, + "eval_ppl_intron": 2.887916254694354, + "eval_ppl_nig": 2.65992247163589, + "eval_ppl_promoter": 3.249167345940797, + "eval_ppl_utr": 3.267379860704035, + "step": 4000, + "tokens_trained": 13.103536312 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1372952272888448, + "grad_norm": 1.135546088218689, + "loss": 1.0709, + "loss_ce": 1.05037522315979, + "loss_region": 0.030016878619790077, + "loss_total": 1.0803921222686768, + "lr": 0.0010819664742582637, + "router/selected_tokens_s0": 4442.875, + "step": 4010, + "tokens_trained": 13.136300952 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1401319055386143, + "grad_norm": 0.9313811659812927, + "loss": 1.0706, + "loss_ce": 1.0053969621658325, + "loss_region": 0.029991673305630684, + "loss_total": 1.0353885889053345, + "lr": 0.0010815595822744127, + "router/selected_tokens_s0": 4338.625, + "step": 4020, + "tokens_trained": 13.169065592 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1429685837883838, + "grad_norm": 1.150964617729187, + "loss": 1.0746, + "loss_ce": 1.0354498624801636, + "loss_region": 0.03000836819410324, + "loss_total": 1.0654581785202026, + "lr": 0.0010811526902905618, + "router/selected_tokens_s0": 4357.125, + "step": 4030, + "tokens_trained": 13.201831032 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1458052620381534, + "grad_norm": 0.3598765432834625, + "loss": 1.069, + "loss_ce": 0.9737571477890015, + "loss_region": 0.030010921880602837, + "loss_total": 1.0037680864334106, + "lr": 0.0010807457983067108, + "router/selected_tokens_s0": 4364.125, + "step": 4040, + "tokens_trained": 13.234596472 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1486419402879229, + "grad_norm": 1.6098700761795044, + "loss": 1.0724, + "loss_ce": 1.090259075164795, + "loss_region": 0.030013924464583397, + "loss_total": 1.1202729940414429, + "lr": 0.0010803389063228598, + "router/selected_tokens_s0": 4396.5, + "step": 4050, + "tokens_trained": 13.267361888 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1514786185376924, + "grad_norm": 1.2312268018722534, + "loss": 1.0755, + "loss_ce": 1.0816667079925537, + "loss_region": 0.0300059225410223, + "loss_total": 1.1116726398468018, + "lr": 0.0010799320143390087, + "router/selected_tokens_s0": 4383.875, + "step": 4060, + "tokens_trained": 13.300127328 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.154315296787462, + "grad_norm": 0.8974295854568481, + "loss": 1.074, + "loss_ce": 0.999101459980011, + "loss_region": 0.030007481575012207, + "loss_total": 1.029109001159668, + "lr": 0.0010795251223551577, + "router/selected_tokens_s0": 4339.875, + "step": 4070, + "tokens_trained": 13.332892768 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1571519750372314, + "grad_norm": 0.9909172654151917, + "loss": 1.0664, + "loss_ce": 1.1255804300308228, + "loss_region": 0.030011769384145737, + "loss_total": 1.1555922031402588, + "lr": 0.0010791182303713067, + "router/selected_tokens_s0": 4367.375, + "step": 4080, + "tokens_trained": 13.365658208 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.159988653287001, + "grad_norm": 2.2824649810791016, + "loss": 1.0724, + "loss_ce": 0.9207800626754761, + "loss_region": 0.030002696439623833, + "loss_total": 0.9507827758789062, + "lr": 0.0010787113383874556, + "router/selected_tokens_s0": 4278.5, + "step": 4090, + "tokens_trained": 13.398422568 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1628253315367705, + "grad_norm": 0.4956927001476288, + "loss": 1.0733, + "loss_ce": 1.1247563362121582, + "loss_region": 0.03000745177268982, + "loss_total": 1.1547638177871704, + "lr": 0.0010783044464036046, + "router/selected_tokens_s0": 4366.5, + "step": 4100, + "tokens_trained": 13.431185432 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.16566200978654, + "grad_norm": 0.6976671814918518, + "loss": 1.0697, + "loss_ce": 1.0819003582000732, + "loss_region": 0.03002307377755642, + "loss_total": 1.1119234561920166, + "lr": 0.0010778975544197536, + "router/selected_tokens_s0": 4439.5, + "step": 4110, + "tokens_trained": 13.463950872 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1684986880363095, + "grad_norm": 1.2541862726211548, + "loss": 1.0653, + "loss_ce": 0.7924370169639587, + "loss_region": 0.029992103576660156, + "loss_total": 0.8224291205406189, + "lr": 0.0010774906624359025, + "router/selected_tokens_s0": 4297.25, + "step": 4120, + "tokens_trained": 13.496713976 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.171335366286079, + "grad_norm": 1.4140042066574097, + "loss": 1.0685, + "loss_ce": 1.0910414457321167, + "loss_region": 0.03000573255121708, + "loss_total": 1.1210471391677856, + "lr": 0.0010770837704520515, + "router/selected_tokens_s0": 4412.5, + "step": 4130, + "tokens_trained": 13.529479416 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1741720445358486, + "grad_norm": 0.5698431134223938, + "loss": 1.0775, + "loss_ce": 0.8972399830818176, + "loss_region": 0.029999306425452232, + "loss_total": 0.9272392988204956, + "lr": 0.0010766768784682005, + "router/selected_tokens_s0": 4358.875, + "step": 4140, + "tokens_trained": 13.562244856 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.177008722785618, + "grad_norm": 1.3367623090744019, + "loss": 1.0704, + "loss_ce": 1.1016747951507568, + "loss_region": 0.03001498058438301, + "loss_total": 1.1316897869110107, + "lr": 0.0010762699864843494, + "router/selected_tokens_s0": 4404.625, + "step": 4150, + "tokens_trained": 13.595010296 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1798454010353876, + "grad_norm": 0.7532950043678284, + "loss": 1.0571, + "loss_ce": 0.9316068887710571, + "loss_region": 0.03000483848154545, + "loss_total": 0.9616117477416992, + "lr": 0.0010758630945004984, + "router/selected_tokens_s0": 4305.625, + "step": 4160, + "tokens_trained": 13.627775736 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1826820792851571, + "grad_norm": 0.8210463523864746, + "loss": 1.066, + "loss_ce": 1.0508811473846436, + "loss_region": 0.030011793598532677, + "loss_total": 1.0808929204940796, + "lr": 0.0010754562025166474, + "router/selected_tokens_s0": 4387.75, + "step": 4170, + "tokens_trained": 13.660541176 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1855187575349266, + "grad_norm": 1.2525079250335693, + "loss": 1.0643, + "loss_ce": 1.0516717433929443, + "loss_region": 0.030006369575858116, + "loss_total": 1.0816781520843506, + "lr": 0.0010750493105327963, + "router/selected_tokens_s0": 4361.0, + "step": 4180, + "tokens_trained": 13.693306616 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1883554357846962, + "grad_norm": 1.0099766254425049, + "loss": 1.0655, + "loss_ce": 0.9692405462265015, + "loss_region": 0.0300027746707201, + "loss_total": 0.9992433190345764, + "lr": 0.0010746424185489453, + "router/selected_tokens_s0": 4389.0, + "step": 4190, + "tokens_trained": 13.726072056 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1911921140344657, + "grad_norm": 0.7821201682090759, + "loss": 1.0697, + "loss_ce": 1.018595814704895, + "loss_region": 0.030003640800714493, + "loss_total": 1.0485994815826416, + "lr": 0.0010742355265650943, + "router/selected_tokens_s0": 4325.0, + "step": 4200, + "tokens_trained": 13.758837496 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1940287922842352, + "grad_norm": 1.1446911096572876, + "loss": 1.0679, + "loss_ce": 1.0561493635177612, + "loss_region": 0.030004194006323814, + "loss_total": 1.086153507232666, + "lr": 0.0010738286345812434, + "router/selected_tokens_s0": 4347.875, + "step": 4210, + "tokens_trained": 13.791602936 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1968654705340047, + "grad_norm": 0.8190633654594421, + "loss": 1.079, + "loss_ce": 1.0492353439331055, + "loss_region": 0.030007485300302505, + "loss_total": 1.0792428255081177, + "lr": 0.0010734217425973924, + "router/selected_tokens_s0": 4382.25, + "step": 4220, + "tokens_trained": 13.824368376 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.1997021487837742, + "grad_norm": 1.038085699081421, + "loss": 1.066, + "loss_ce": 0.9230837821960449, + "loss_region": 0.030007129535079002, + "loss_total": 0.9530909061431885, + "lr": 0.0010730148506135414, + "router/selected_tokens_s0": 4315.875, + "step": 4230, + "tokens_trained": 13.857133016 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2025388270335438, + "grad_norm": 1.4383383989334106, + "loss": 1.0598, + "loss_ce": 1.0923779010772705, + "loss_region": 0.03000866062939167, + "loss_total": 1.1223865747451782, + "lr": 0.0010726079586296901, + "router/selected_tokens_s0": 4404.625, + "step": 4240, + "tokens_trained": 13.889896856 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2053755052833133, + "grad_norm": 0.6213952898979187, + "loss": 1.0635, + "loss_ce": 1.0380363464355469, + "loss_region": 0.030008360743522644, + "loss_total": 1.068044662475586, + "lr": 0.001072201066645839, + "router/selected_tokens_s0": 4405.25, + "step": 4250, + "tokens_trained": 13.922662296 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2082121835330828, + "grad_norm": 0.4920593202114105, + "loss": 1.0628, + "loss_ce": 1.0617847442626953, + "loss_region": 0.030003665015101433, + "loss_total": 1.091788411140442, + "lr": 0.001071794174661988, + "router/selected_tokens_s0": 4345.0, + "step": 4260, + "tokens_trained": 13.955427736 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2110488617828523, + "grad_norm": 1.1883982419967651, + "loss": 1.0624, + "loss_ce": 1.0569326877593994, + "loss_region": 0.03000705875456333, + "loss_total": 1.0869396924972534, + "lr": 0.0010713872826781372, + "router/selected_tokens_s0": 4376.875, + "step": 4270, + "tokens_trained": 13.988193176 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2138855400326218, + "grad_norm": 0.7268418669700623, + "loss": 1.0621, + "loss_ce": 1.1230334043502808, + "loss_region": 0.030010607093572617, + "loss_total": 1.1530439853668213, + "lr": 0.0010709803906942862, + "router/selected_tokens_s0": 4399.0, + "step": 4280, + "tokens_trained": 14.020958616 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2167222182823914, + "grad_norm": 0.8508139848709106, + "loss": 1.0648, + "loss_ce": 1.012586236000061, + "loss_region": 0.030012402683496475, + "loss_total": 1.0425986051559448, + "lr": 0.0010705734987104352, + "router/selected_tokens_s0": 4399.25, + "step": 4290, + "tokens_trained": 14.053724056 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2195588965321609, + "grad_norm": 1.003320574760437, + "loss": 1.072, + "loss_ce": 1.092397928237915, + "loss_region": 0.030008139088749886, + "loss_total": 1.1224061250686646, + "lr": 0.0010701666067265841, + "router/selected_tokens_s0": 4384.5, + "step": 4300, + "tokens_trained": 14.086488728 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2223955747819304, + "grad_norm": 0.6508564949035645, + "loss": 1.0592, + "loss_ce": 0.9947494864463806, + "loss_region": 0.030006522312760353, + "loss_total": 1.0247559547424316, + "lr": 0.001069759714742733, + "router/selected_tokens_s0": 4323.125, + "step": 4310, + "tokens_trained": 14.119254168 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2252322530317, + "grad_norm": 0.6111332774162292, + "loss": 1.0612, + "loss_ce": 1.0148944854736328, + "loss_region": 0.030005091801285744, + "loss_total": 1.044899582862854, + "lr": 0.001069352822758882, + "router/selected_tokens_s0": 4324.25, + "step": 4320, + "tokens_trained": 14.152019608 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2280689312814694, + "grad_norm": 1.111618161201477, + "loss": 1.0653, + "loss_ce": 0.9681676030158997, + "loss_region": 0.03000504896044731, + "loss_total": 0.9981726408004761, + "lr": 0.001068945930775031, + "router/selected_tokens_s0": 4334.0, + "step": 4330, + "tokens_trained": 14.184785048 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.230905609531239, + "grad_norm": 0.6468565464019775, + "loss": 1.0612, + "loss_ce": 1.0180631875991821, + "loss_region": 0.030008560046553612, + "loss_total": 1.0480717420578003, + "lr": 0.00106853903879118, + "router/selected_tokens_s0": 4372.0, + "step": 4340, + "tokens_trained": 14.217550488 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2337422877810085, + "grad_norm": 0.4824322760105133, + "loss": 1.0606, + "loss_ce": 1.054766058921814, + "loss_region": 0.030008021742105484, + "loss_total": 1.084774136543274, + "lr": 0.001068132146807329, + "router/selected_tokens_s0": 4369.5, + "step": 4350, + "tokens_trained": 14.250315928 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.236578966030778, + "grad_norm": 0.8620288968086243, + "loss": 1.0604, + "loss_ce": 1.075032114982605, + "loss_region": 0.030014168471097946, + "loss_total": 1.105046272277832, + "lr": 0.001067725254823478, + "router/selected_tokens_s0": 4397.375, + "step": 4360, + "tokens_trained": 14.283081368 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2394156442805475, + "grad_norm": 1.4154425859451294, + "loss": 1.0614, + "loss_ce": 1.0848474502563477, + "loss_region": 0.030006200075149536, + "loss_total": 1.1148536205291748, + "lr": 0.0010673183628396269, + "router/selected_tokens_s0": 4343.25, + "step": 4370, + "tokens_trained": 14.315846808 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.242252322530317, + "grad_norm": 0.7003890872001648, + "loss": 1.0594, + "loss_ce": 1.1462643146514893, + "loss_region": 0.030004924163222313, + "loss_total": 1.176269292831421, + "lr": 0.0010669114708557758, + "router/selected_tokens_s0": 4383.25, + "step": 4380, + "tokens_trained": 14.348612232 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2450890007800866, + "grad_norm": 0.9333593845367432, + "loss": 1.0628, + "loss_ce": 1.1088054180145264, + "loss_region": 0.0300068948417902, + "loss_total": 1.1388123035430908, + "lr": 0.0010665045788719248, + "router/selected_tokens_s0": 4374.875, + "step": 4390, + "tokens_trained": 14.381377672 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.247925679029856, + "grad_norm": 0.7894501686096191, + "loss": 1.0632, + "loss_ce": 0.9245185256004333, + "loss_region": 0.030007462948560715, + "loss_total": 0.9545260071754456, + "lr": 0.0010660976868880738, + "router/selected_tokens_s0": 4363.625, + "step": 4400, + "tokens_trained": 14.414143112 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2507623572796256, + "grad_norm": 1.051579236984253, + "loss": 1.0676, + "loss_ce": 1.0265341997146606, + "loss_region": 0.03001078963279724, + "loss_total": 1.0565450191497803, + "lr": 0.0010656907949042227, + "router/selected_tokens_s0": 4416.375, + "step": 4410, + "tokens_trained": 14.446908552 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2535990355293951, + "grad_norm": 1.4166457653045654, + "loss": 1.0661, + "loss_ce": 1.1268078088760376, + "loss_region": 0.0300078634172678, + "loss_total": 1.1568156480789185, + "lr": 0.0010652839029203717, + "router/selected_tokens_s0": 4376.375, + "step": 4420, + "tokens_trained": 14.479673992 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2564357137791646, + "grad_norm": 0.9460220336914062, + "loss": 1.0627, + "loss_ce": 1.0467621088027954, + "loss_region": 0.030009562149643898, + "loss_total": 1.07677161693573, + "lr": 0.0010648770109365207, + "router/selected_tokens_s0": 4361.5, + "step": 4430, + "tokens_trained": 14.512439432 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2592723920289342, + "grad_norm": 0.726325273513794, + "loss": 1.0611, + "loss_ce": 1.060929298400879, + "loss_region": 0.030005550011992455, + "loss_total": 1.0909348726272583, + "lr": 0.0010644701189526696, + "router/selected_tokens_s0": 4371.625, + "step": 4440, + "tokens_trained": 14.545204872 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2621090702787037, + "grad_norm": 0.8391557931900024, + "loss": 1.0602, + "loss_ce": 0.9280992150306702, + "loss_region": 0.030011408030986786, + "loss_total": 0.9581106305122375, + "lr": 0.0010640632269688186, + "router/selected_tokens_s0": 4365.625, + "step": 4450, + "tokens_trained": 14.577970312 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2649457485284732, + "grad_norm": 0.5766838788986206, + "loss": 1.0606, + "loss_ce": 1.0834920406341553, + "loss_region": 0.03000706620514393, + "loss_total": 1.1134991645812988, + "lr": 0.0010636563349849678, + "router/selected_tokens_s0": 4382.625, + "step": 4460, + "tokens_trained": 14.610735752 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2677824267782427, + "grad_norm": 0.7650503516197205, + "loss": 1.0655, + "loss_ce": 1.0730332136154175, + "loss_region": 0.030004587024450302, + "loss_total": 1.1030378341674805, + "lr": 0.0010632494430011167, + "router/selected_tokens_s0": 4357.375, + "step": 4470, + "tokens_trained": 14.643501192 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2706191050280122, + "grad_norm": 0.9502279758453369, + "loss": 1.065, + "loss_ce": 0.9979308247566223, + "loss_region": 0.030005743727087975, + "loss_total": 1.027936577796936, + "lr": 0.0010628425510172657, + "router/selected_tokens_s0": 4355.125, + "step": 4480, + "tokens_trained": 14.676266632 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2734557832777817, + "grad_norm": 0.42353641986846924, + "loss": 1.0697, + "loss_ce": 0.9458868503570557, + "loss_region": 0.03001011349260807, + "loss_total": 0.975896954536438, + "lr": 0.0010624356590334145, + "router/selected_tokens_s0": 4327.25, + "step": 4490, + "tokens_trained": 14.709032072 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2762924615275513, + "grad_norm": 0.8221932649612427, + "loss": 1.0532, + "loss_ce": 1.0401586294174194, + "loss_region": 0.03000483103096485, + "loss_total": 1.0701634883880615, + "lr": 0.0010620287670495634, + "router/selected_tokens_s0": 4364.625, + "step": 4500, + "tokens_trained": 14.741797512 + }, + { + "epoch": 1.2762924615275513, + "eval_ppl": 2.8214683274593275, + "eval_runtime": 2.514, + "step": 4500, + "tokens_trained": 14.741797512 + }, + { + "epoch": 1.2762924615275513, + "eval_F": 0.3404783661767054, + "eval_F_cds": 0.34415505656136036, + "eval_F_dig": 0.34039117639398914, + "eval_F_exon": 0.3441758117753265, + "eval_F_intron": 0.3404660876483458, + "eval_F_nig": 0.34068460925568095, + "eval_F_promoter": 0.3389634986468706, + "eval_F_utr": 0.3419717924374995, + "eval_G": 0.33991637341184733, + "eval_G_cds": 0.33970658857472646, + "eval_G_dig": 0.40259630268966323, + "eval_G_exon": 0.3395000370271109, + "eval_G_intron": 0.33899745701570494, + "eval_G_nig": 0.3395800911177655, + "eval_G_promoter": 0.34070296612716594, + "eval_G_utr": 0.33846465688852967, + "eval_avg_bp_per_token": 2.9370441688533258, + "eval_bp_per_token/cds": 2.9056670269254266, + "eval_bp_per_token/dig": 2.937796480489671, + "eval_bp_per_token/exon": 2.9054918032786885, + "eval_bp_per_token/intron": 2.937150090063775, + "eval_bp_per_token/nig": 2.9352661459664247, + "eval_bp_per_token/promoter": 2.9501701628404295, + "eval_bp_per_token/utr": 2.924217792561839, + "eval_ppl_cds": 3.362117196422307, + "eval_ppl_dig": 1.084496515952716, + "eval_ppl_exon": 3.2638291915167845, + "eval_ppl_intron": 2.863014968465922, + "eval_ppl_nig": 2.628156788503626, + "eval_ppl_promoter": 3.186888990118295, + "eval_ppl_utr": 3.231163925316475, + "step": 4500, + "tokens_trained": 14.741797512 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2791291397773208, + "grad_norm": 0.574324905872345, + "loss": 1.0626, + "loss_ce": 1.1369348764419556, + "loss_region": 0.030004041269421577, + "loss_total": 1.1669389009475708, + "lr": 0.0010616218750657124, + "router/selected_tokens_s0": 4390.0, + "step": 4510, + "tokens_trained": 14.774562152 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2819658180270903, + "grad_norm": 1.4291346073150635, + "loss": 1.0542, + "loss_ce": 1.0772862434387207, + "loss_region": 0.030009262263774872, + "loss_total": 1.1072955131530762, + "lr": 0.0010612149830818616, + "router/selected_tokens_s0": 4374.125, + "step": 4520, + "tokens_trained": 14.807327592 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2848024962768598, + "grad_norm": 1.110178828239441, + "loss": 1.0581, + "loss_ce": 1.0101850032806396, + "loss_region": 0.030011240392923355, + "loss_total": 1.0401962995529175, + "lr": 0.0010608080910980105, + "router/selected_tokens_s0": 4400.875, + "step": 4530, + "tokens_trained": 14.840092232 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2876391745266293, + "grad_norm": 0.6965222358703613, + "loss": 1.0677, + "loss_ce": 1.0458329916000366, + "loss_region": 0.030003167688846588, + "loss_total": 1.075836181640625, + "lr": 0.0010604011991141595, + "router/selected_tokens_s0": 4285.875, + "step": 4540, + "tokens_trained": 14.872854344 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2904758527763989, + "grad_norm": 0.7366101145744324, + "loss": 1.0673, + "loss_ce": 1.127131700515747, + "loss_region": 0.030009111389517784, + "loss_total": 1.157140851020813, + "lr": 0.0010599943071303085, + "router/selected_tokens_s0": 4368.75, + "step": 4550, + "tokens_trained": 14.905619784 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2933125310261684, + "grad_norm": 0.5127747058868408, + "loss": 1.0583, + "loss_ce": 1.0789624452590942, + "loss_region": 0.030002892017364502, + "loss_total": 1.1089653968811035, + "lr": 0.0010595874151464574, + "router/selected_tokens_s0": 4366.5, + "step": 4560, + "tokens_trained": 14.938385224 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.296149209275938, + "grad_norm": 0.8208303451538086, + "loss": 1.0564, + "loss_ce": 1.0156883001327515, + "loss_region": 0.03000694513320923, + "loss_total": 1.0456953048706055, + "lr": 0.0010591805231626064, + "router/selected_tokens_s0": 4368.5, + "step": 4570, + "tokens_trained": 14.971149864 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.2989858875257074, + "grad_norm": 0.9243417978286743, + "loss": 1.0546, + "loss_ce": 1.1268659830093384, + "loss_region": 0.030008889734745026, + "loss_total": 1.1568748950958252, + "lr": 0.0010587736311787554, + "router/selected_tokens_s0": 4378.75, + "step": 4580, + "tokens_trained": 15.003915304 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.301822565775477, + "grad_norm": 1.2765685319900513, + "loss": 1.0589, + "loss_ce": 1.0031052827835083, + "loss_region": 0.03000444732606411, + "loss_total": 1.0331097841262817, + "lr": 0.0010583667391949043, + "router/selected_tokens_s0": 4328.75, + "step": 4590, + "tokens_trained": 15.036680744 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.3046592440252465, + "grad_norm": 0.7742276787757874, + "loss": 1.06, + "loss_ce": 1.079167127609253, + "loss_region": 0.030005935579538345, + "loss_total": 1.109173059463501, + "lr": 0.0010579598472110533, + "router/selected_tokens_s0": 4353.25, + "step": 4600, + "tokens_trained": 15.069442824 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.307495922275016, + "grad_norm": 0.793449878692627, + "loss": 1.0558, + "loss_ce": 1.1816250085830688, + "loss_region": 0.030013153329491615, + "loss_total": 1.2116382122039795, + "lr": 0.0010575529552272023, + "router/selected_tokens_s0": 4410.125, + "step": 4610, + "tokens_trained": 15.102208264 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.3103326005247855, + "grad_norm": 0.45884019136428833, + "loss": 1.0581, + "loss_ce": 1.03169584274292, + "loss_region": 0.030012015253305435, + "loss_total": 1.061707854270935, + "lr": 0.0010571460632433512, + "router/selected_tokens_s0": 4406.125, + "step": 4620, + "tokens_trained": 15.134972904 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.313169278774555, + "grad_norm": 0.297644704580307, + "loss": 1.0543, + "loss_ce": 0.9691150188446045, + "loss_region": 0.030010992661118507, + "loss_total": 0.9991260170936584, + "lr": 0.0010567391712595002, + "router/selected_tokens_s0": 4378.125, + "step": 4630, + "tokens_trained": 15.167735632 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.3160059570243245, + "grad_norm": 1.0193889141082764, + "loss": 1.0631, + "loss_ce": 0.9981553554534912, + "loss_region": 0.030008038505911827, + "loss_total": 1.0281634330749512, + "lr": 0.0010563322792756492, + "router/selected_tokens_s0": 4370.875, + "step": 4640, + "tokens_trained": 15.200501072 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.318842635274094, + "grad_norm": 0.5435932278633118, + "loss": 1.0567, + "loss_ce": 1.0280530452728271, + "loss_region": 0.030006207525730133, + "loss_total": 1.0580592155456543, + "lr": 0.0010559253872917981, + "router/selected_tokens_s0": 4404.625, + "step": 4650, + "tokens_trained": 15.233266512 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.3216793135238636, + "grad_norm": 1.5278434753417969, + "loss": 1.0548, + "loss_ce": 1.136357307434082, + "loss_region": 0.03000808134675026, + "loss_total": 1.166365385055542, + "lr": 0.001055518495307947, + "router/selected_tokens_s0": 4371.0, + "step": 4660, + "tokens_trained": 15.266028416 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.324515991773633, + "grad_norm": 1.0518913269042969, + "loss": 1.0663, + "loss_ce": 1.0637515783309937, + "loss_region": 0.030026618391275406, + "loss_total": 1.0937782526016235, + "lr": 0.001055111603324096, + "router/selected_tokens_s0": 4432.5, + "step": 4670, + "tokens_trained": 15.298793832 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.3273526700234026, + "grad_norm": 0.41353392601013184, + "loss": 1.0573, + "loss_ce": 1.1546579599380493, + "loss_region": 0.030011750757694244, + "loss_total": 1.1846697330474854, + "lr": 0.001054704711340245, + "router/selected_tokens_s0": 4418.375, + "step": 4680, + "tokens_trained": 15.331559256 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.3301893482731721, + "grad_norm": 1.145396113395691, + "loss": 1.0615, + "loss_ce": 0.9925062656402588, + "loss_region": 0.030006494373083115, + "loss_total": 1.0225127935409546, + "lr": 0.001054297819356394, + "router/selected_tokens_s0": 4337.125, + "step": 4690, + "tokens_trained": 15.364323896 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.3330260265229417, + "grad_norm": 0.48764264583587646, + "loss": 1.0592, + "loss_ce": 1.013685941696167, + "loss_region": 0.030003532767295837, + "loss_total": 1.043689489364624, + "lr": 0.0010538909273725432, + "router/selected_tokens_s0": 4342.125, + "step": 4700, + "tokens_trained": 15.397089336 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.3358627047727112, + "grad_norm": 0.8164799809455872, + "loss": 1.0516, + "loss_ce": 0.9586069583892822, + "loss_region": 0.030013196170330048, + "loss_total": 0.9886201620101929, + "lr": 0.0010534840353886921, + "router/selected_tokens_s0": 4396.625, + "step": 4710, + "tokens_trained": 15.42985476 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.3386993830224807, + "grad_norm": 1.13369619846344, + "loss": 1.0594, + "loss_ce": 1.0693820714950562, + "loss_region": 0.030003618448972702, + "loss_total": 1.0993857383728027, + "lr": 0.001053077143404841, + "router/selected_tokens_s0": 4341.625, + "step": 4720, + "tokens_trained": 15.4626202 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.3415360612722502, + "grad_norm": 0.8559716939926147, + "loss": 1.052, + "loss_ce": 0.9566583633422852, + "loss_region": 0.029995476827025414, + "loss_total": 0.9866538643836975, + "lr": 0.0010526702514209898, + "router/selected_tokens_s0": 4327.0, + "step": 4730, + "tokens_trained": 15.49538484 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.3443727395220197, + "grad_norm": 0.642441987991333, + "loss": 1.0486, + "loss_ce": 0.9989664554595947, + "loss_region": 0.030002903193235397, + "loss_total": 1.028969407081604, + "lr": 0.0010522633594371388, + "router/selected_tokens_s0": 4368.75, + "step": 4740, + "tokens_trained": 15.52815028 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.3472094177717893, + "grad_norm": 1.2081654071807861, + "loss": 1.0543, + "loss_ce": 1.0263653993606567, + "loss_region": 0.030002884566783905, + "loss_total": 1.0563682317733765, + "lr": 0.0010518564674532878, + "router/selected_tokens_s0": 4350.625, + "step": 4750, + "tokens_trained": 15.56091572 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.3500460960215588, + "grad_norm": 0.8961039781570435, + "loss": 1.0605, + "loss_ce": 0.8918865919113159, + "loss_region": 0.02998742088675499, + "loss_total": 0.9218739867210388, + "lr": 0.0010514495754694367, + "router/selected_tokens_s0": 4227.875, + "step": 4760, + "tokens_trained": 15.593678736 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.3528827742713283, + "grad_norm": 0.9029963612556458, + "loss": 1.054, + "loss_ce": 1.041927695274353, + "loss_region": 0.030009398236870766, + "loss_total": 1.071937084197998, + "lr": 0.001051042683485586, + "router/selected_tokens_s0": 4384.125, + "step": 4770, + "tokens_trained": 15.626444176 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.3557194525210978, + "grad_norm": 0.9543034434318542, + "loss": 1.0544, + "loss_ce": 1.0488133430480957, + "loss_region": 0.030006732791662216, + "loss_total": 1.0788201093673706, + "lr": 0.0010506357915017349, + "router/selected_tokens_s0": 4367.625, + "step": 4780, + "tokens_trained": 15.659209616 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.3585561307708673, + "grad_norm": 1.4998373985290527, + "loss": 1.0569, + "loss_ce": 1.1177537441253662, + "loss_region": 0.030007855966687202, + "loss_total": 1.147761583328247, + "lr": 0.0010502288995178838, + "router/selected_tokens_s0": 4367.375, + "step": 4790, + "tokens_trained": 15.691975056 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.3613928090206369, + "grad_norm": 0.9493989944458008, + "loss": 1.0632, + "loss_ce": 1.1187384128570557, + "loss_region": 0.030011937022209167, + "loss_total": 1.1487503051757812, + "lr": 0.0010498220075340328, + "router/selected_tokens_s0": 4409.0, + "step": 4800, + "tokens_trained": 15.724740496 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.3642294872704064, + "grad_norm": 0.8666090965270996, + "loss": 1.0504, + "loss_ce": 1.0977500677108765, + "loss_region": 0.030009282752871513, + "loss_total": 1.127759337425232, + "lr": 0.0010494151155501818, + "router/selected_tokens_s0": 4413.0, + "step": 4810, + "tokens_trained": 15.757505936 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.367066165520176, + "grad_norm": 1.7307463884353638, + "loss": 1.0497, + "loss_ce": 0.9964741468429565, + "loss_region": 0.03000612184405327, + "loss_total": 1.0264803171157837, + "lr": 0.0010490082235663307, + "router/selected_tokens_s0": 4340.625, + "step": 4820, + "tokens_trained": 15.790271376 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.3699028437699454, + "grad_norm": 1.00801420211792, + "loss": 1.0593, + "loss_ce": 1.056219458580017, + "loss_region": 0.030001841485500336, + "loss_total": 1.0862213373184204, + "lr": 0.0010486013315824797, + "router/selected_tokens_s0": 4315.75, + "step": 4830, + "tokens_trained": 15.823036016 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.372739522019715, + "grad_norm": 0.6336276531219482, + "loss": 1.0541, + "loss_ce": 0.939272940158844, + "loss_region": 0.030005156993865967, + "loss_total": 0.96927809715271, + "lr": 0.0010481944395986287, + "router/selected_tokens_s0": 4368.625, + "step": 4840, + "tokens_trained": 15.855801456 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.3755762002694845, + "grad_norm": 1.0217934846878052, + "loss": 1.052, + "loss_ce": 0.9233169555664062, + "loss_region": 0.02999945543706417, + "loss_total": 0.9533163905143738, + "lr": 0.0010477875476147776, + "router/selected_tokens_s0": 4278.125, + "step": 4850, + "tokens_trained": 15.888566096 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.378412878519254, + "grad_norm": 0.8734573125839233, + "loss": 1.0512, + "loss_ce": 1.0704128742218018, + "loss_region": 0.030010642483830452, + "loss_total": 1.1004235744476318, + "lr": 0.0010473806556309266, + "router/selected_tokens_s0": 4385.25, + "step": 4860, + "tokens_trained": 15.921331536 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.3812495567690235, + "grad_norm": 0.9410074949264526, + "loss": 1.0473, + "loss_ce": 1.0614757537841797, + "loss_region": 0.03000745177268982, + "loss_total": 1.091483235359192, + "lr": 0.0010469737636470756, + "router/selected_tokens_s0": 4373.375, + "step": 4870, + "tokens_trained": 15.954096176 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.384086235018793, + "grad_norm": 0.9506546854972839, + "loss": 1.0553, + "loss_ce": 1.0681748390197754, + "loss_region": 0.03000558167695999, + "loss_total": 1.0981804132461548, + "lr": 0.0010465668716632245, + "router/selected_tokens_s0": 4355.875, + "step": 4880, + "tokens_trained": 15.9868564 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.3869229132685625, + "grad_norm": 0.6107691526412964, + "loss": 1.0465, + "loss_ce": 1.0117448568344116, + "loss_region": 0.030008085072040558, + "loss_total": 1.0417529344558716, + "lr": 0.0010461599796793735, + "router/selected_tokens_s0": 4373.0, + "step": 4890, + "tokens_trained": 16.019621784 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.389759591518332, + "grad_norm": 0.8904739618301392, + "loss": 1.0524, + "loss_ce": 1.0931122303009033, + "loss_region": 0.030003776773810387, + "loss_total": 1.1231160163879395, + "lr": 0.0010457530876955225, + "router/selected_tokens_s0": 4370.75, + "step": 4900, + "tokens_trained": 16.052387224 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.3925962697681016, + "grad_norm": 0.8106483817100525, + "loss": 1.0501, + "loss_ce": 0.8826806545257568, + "loss_region": 0.030005743727087975, + "loss_total": 0.9126864075660706, + "lr": 0.0010453461957116714, + "router/selected_tokens_s0": 4362.25, + "step": 4910, + "tokens_trained": 16.085152664 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.395432948017871, + "grad_norm": 0.8432952761650085, + "loss": 1.0554, + "loss_ce": 0.9541028738021851, + "loss_region": 0.03000727668404579, + "loss_total": 0.9841101765632629, + "lr": 0.0010449393037278204, + "router/selected_tokens_s0": 4347.75, + "step": 4920, + "tokens_trained": 16.117918104 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.3982696262676406, + "grad_norm": 0.7111901640892029, + "loss": 1.0514, + "loss_ce": 0.9792753458023071, + "loss_region": 0.030003240332007408, + "loss_total": 1.0092785358428955, + "lr": 0.0010445324117439694, + "router/selected_tokens_s0": 4354.5, + "step": 4930, + "tokens_trained": 16.150683544 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4011063045174101, + "grad_norm": 0.623307466506958, + "loss": 1.0486, + "loss_ce": 0.8407849669456482, + "loss_region": 0.030006825923919678, + "loss_total": 0.8707917928695679, + "lr": 0.0010441255197601183, + "router/selected_tokens_s0": 4359.375, + "step": 4940, + "tokens_trained": 16.183448184 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4039429827671797, + "grad_norm": 0.7381167411804199, + "loss": 1.0515, + "loss_ce": 0.9730595350265503, + "loss_region": 0.030005378648638725, + "loss_total": 1.0030648708343506, + "lr": 0.0010437186277762675, + "router/selected_tokens_s0": 4344.625, + "step": 4950, + "tokens_trained": 16.216210576 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4067796610169492, + "grad_norm": 1.5306568145751953, + "loss": 1.0493, + "loss_ce": 1.1129964590072632, + "loss_region": 0.03000766597688198, + "loss_total": 1.1430041790008545, + "lr": 0.0010433117357924165, + "router/selected_tokens_s0": 4405.875, + "step": 4960, + "tokens_trained": 16.248975216 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4096163392667187, + "grad_norm": 1.0243196487426758, + "loss": 1.053, + "loss_ce": 1.0323237180709839, + "loss_region": 0.0300030205398798, + "loss_total": 1.0623267889022827, + "lr": 0.0010429048438085654, + "router/selected_tokens_s0": 4350.75, + "step": 4970, + "tokens_trained": 16.281739056 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4124530175164882, + "grad_norm": 0.41617700457572937, + "loss": 1.0515, + "loss_ce": 1.0560728311538696, + "loss_region": 0.030008800327777863, + "loss_total": 1.086081624031067, + "lr": 0.0010424979518247142, + "router/selected_tokens_s0": 4361.875, + "step": 4980, + "tokens_trained": 16.314504496 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4152896957662577, + "grad_norm": 1.0856021642684937, + "loss": 1.0525, + "loss_ce": 0.9173005223274231, + "loss_region": 0.030003489926457405, + "loss_total": 0.9473040103912354, + "lr": 0.0010420910598408631, + "router/selected_tokens_s0": 4334.375, + "step": 4990, + "tokens_trained": 16.347269936 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4181263740160273, + "grad_norm": 0.7011216878890991, + "loss": 1.0519, + "loss_ce": 1.0040265321731567, + "loss_region": 0.030011465772986412, + "loss_total": 1.0340379476547241, + "lr": 0.0010416841678570121, + "router/selected_tokens_s0": 4393.0, + "step": 5000, + "tokens_trained": 16.380034576 + }, + { + "epoch": 1.4181263740160273, + "eval_ppl": 2.788169093074203, + "eval_runtime": 2.4902, + "step": 5000, + "tokens_trained": 16.380034576 + }, + { + "epoch": 1.4181263740160273, + "eval_F": 0.3413229464258417, + "eval_F_cds": 0.34488279386436593, + "eval_F_dig": 0.3365887959996875, + "eval_F_exon": 0.34454255649279203, + "eval_F_intron": 0.34125204874115167, + "eval_F_nig": 0.34140188383936837, + "eval_F_promoter": 0.3405175878617453, + "eval_F_utr": 0.3429040617863649, + "eval_G": 0.3415226863108361, + "eval_G_cds": 0.34280297813856925, + "eval_G_dig": 0.39681383799515585, + "eval_G_exon": 0.3413263496822721, + "eval_G_intron": 0.3403742930108439, + "eval_G_nig": 0.34037531674994653, + "eval_G_promoter": 0.34437466936381766, + "eval_G_utr": 0.34068450968191655, + "eval_avg_bp_per_token": 2.929776654255114, + "eval_bp_per_token/cds": 2.8995357779236612, + "eval_bp_per_token/dig": 2.9709842154131847, + "eval_bp_per_token/exon": 2.9023990829444033, + "eval_bp_per_token/intron": 2.930385337432876, + "eval_bp_per_token/nig": 2.9290992444274444, + "eval_bp_per_token/promoter": 2.9367058726082993, + "eval_bp_per_token/utr": 2.916267584555522, + "eval_ppl_cds": 3.2638592963627073, + "eval_ppl_dig": 1.0816082686781683, + "eval_ppl_exon": 3.2486368312535303, + "eval_ppl_intron": 2.8401016704114306, + "eval_ppl_nig": 2.6066140907483235, + "eval_ppl_promoter": 3.109201262145088, + "eval_ppl_utr": 3.1537355648599172, + "step": 5000, + "tokens_trained": 16.380034576 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4209630522657968, + "grad_norm": 0.9108154773712158, + "loss": 1.0451, + "loss_ce": 1.066657543182373, + "loss_region": 0.03000144474208355, + "loss_total": 1.0966589450836182, + "lr": 0.001041277275873161, + "router/selected_tokens_s0": 4315.875, + "step": 5010, + "tokens_trained": 16.412800016 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4237997305155663, + "grad_norm": 0.8278728127479553, + "loss": 1.0582, + "loss_ce": 1.0547834634780884, + "loss_region": 0.03000921569764614, + "loss_total": 1.0847927331924438, + "lr": 0.0010408703838893103, + "router/selected_tokens_s0": 4401.25, + "step": 5020, + "tokens_trained": 16.445565456 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4266364087653358, + "grad_norm": 0.3643961250782013, + "loss": 1.0453, + "loss_ce": 1.0456041097640991, + "loss_region": 0.030006922781467438, + "loss_total": 1.0756109952926636, + "lr": 0.0010404634919054592, + "router/selected_tokens_s0": 4321.5, + "step": 5030, + "tokens_trained": 16.478329936 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4294730870151053, + "grad_norm": 0.8244170546531677, + "loss": 1.0413, + "loss_ce": 1.1286065578460693, + "loss_region": 0.03001227229833603, + "loss_total": 1.1586188077926636, + "lr": 0.0010400565999216082, + "router/selected_tokens_s0": 4414.625, + "step": 5040, + "tokens_trained": 16.511090344 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4323097652648749, + "grad_norm": 0.7646901607513428, + "loss": 1.0397, + "loss_ce": 0.9538711905479431, + "loss_region": 0.030008982867002487, + "loss_total": 0.9838801622390747, + "lr": 0.0010396497079377572, + "router/selected_tokens_s0": 4366.25, + "step": 5050, + "tokens_trained": 16.543855784 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4351464435146444, + "grad_norm": 0.5411663055419922, + "loss": 1.0546, + "loss_ce": 1.10159432888031, + "loss_region": 0.030010610818862915, + "loss_total": 1.1316049098968506, + "lr": 0.0010392428159539061, + "router/selected_tokens_s0": 4370.5, + "step": 5060, + "tokens_trained": 16.576621224 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.437983121764414, + "grad_norm": 0.4553099274635315, + "loss": 1.053, + "loss_ce": 0.924439549446106, + "loss_region": 0.030009716749191284, + "loss_total": 0.9544492959976196, + "lr": 0.001038835923970055, + "router/selected_tokens_s0": 4413.125, + "step": 5070, + "tokens_trained": 16.609386664 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4408198000141834, + "grad_norm": 0.7976288795471191, + "loss": 1.0524, + "loss_ce": 0.8370477557182312, + "loss_region": 0.03001226671040058, + "loss_total": 0.8670600056648254, + "lr": 0.001038429031986204, + "router/selected_tokens_s0": 4350.125, + "step": 5080, + "tokens_trained": 16.642152104 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.443656478263953, + "grad_norm": 0.7374417781829834, + "loss": 1.0481, + "loss_ce": 1.087965726852417, + "loss_region": 0.030008574947714806, + "loss_total": 1.1179742813110352, + "lr": 0.001038022140002353, + "router/selected_tokens_s0": 4363.5, + "step": 5090, + "tokens_trained": 16.674914848 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4464931565137225, + "grad_norm": 0.8701497912406921, + "loss": 1.0423, + "loss_ce": 1.102858066558838, + "loss_region": 0.030004970729351044, + "loss_total": 1.1328630447387695, + "lr": 0.001037615248018502, + "router/selected_tokens_s0": 4363.125, + "step": 5100, + "tokens_trained": 16.707680128 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.449329834763492, + "grad_norm": 0.7448126077651978, + "loss": 1.0436, + "loss_ce": 1.1202232837677002, + "loss_region": 0.030008699744939804, + "loss_total": 1.150231957435608, + "lr": 0.001037208356034651, + "router/selected_tokens_s0": 4389.375, + "step": 5110, + "tokens_trained": 16.740445568 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4521665130132615, + "grad_norm": 0.6302592754364014, + "loss": 1.0486, + "loss_ce": 1.0902258157730103, + "loss_region": 0.03000813163816929, + "loss_total": 1.1202338933944702, + "lr": 0.0010368014640508, + "router/selected_tokens_s0": 4353.375, + "step": 5120, + "tokens_trained": 16.773211008 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.455003191263031, + "grad_norm": 1.019002079963684, + "loss": 1.0513, + "loss_ce": 1.0098958015441895, + "loss_region": 0.02999117411673069, + "loss_total": 1.0398869514465332, + "lr": 0.0010363945720669489, + "router/selected_tokens_s0": 4293.875, + "step": 5130, + "tokens_trained": 16.805971784 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4578398695128005, + "grad_norm": 0.9714931845664978, + "loss": 1.0518, + "loss_ce": 0.9985664486885071, + "loss_region": 0.030006328597664833, + "loss_total": 1.0285727977752686, + "lr": 0.0010359876800830978, + "router/selected_tokens_s0": 4355.5, + "step": 5140, + "tokens_trained": 16.838737224 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.46067654776257, + "grad_norm": 0.8966345191001892, + "loss": 1.0456, + "loss_ce": 1.061452031135559, + "loss_region": 0.030002159997820854, + "loss_total": 1.0914541482925415, + "lr": 0.0010355807880992468, + "router/selected_tokens_s0": 4324.375, + "step": 5150, + "tokens_trained": 16.871502664 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4635132260123396, + "grad_norm": 0.6720635294914246, + "loss": 1.0403, + "loss_ce": 1.0887715816497803, + "loss_region": 0.030005764216184616, + "loss_total": 1.1187773942947388, + "lr": 0.0010351738961153958, + "router/selected_tokens_s0": 4345.375, + "step": 5160, + "tokens_trained": 16.904268104 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.466349904262109, + "grad_norm": 0.7350347638130188, + "loss": 1.0505, + "loss_ce": 1.0535961389541626, + "loss_region": 0.030005428940057755, + "loss_total": 1.0836015939712524, + "lr": 0.0010347670041315447, + "router/selected_tokens_s0": 4381.75, + "step": 5170, + "tokens_trained": 16.937030128 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4691865825118786, + "grad_norm": 0.44304972887039185, + "loss": 1.0432, + "loss_ce": 1.0507234334945679, + "loss_region": 0.030006472021341324, + "loss_total": 1.0807299613952637, + "lr": 0.0010343601121476937, + "router/selected_tokens_s0": 4387.25, + "step": 5180, + "tokens_trained": 16.969795184 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4720232607616481, + "grad_norm": 0.441457062959671, + "loss": 1.0491, + "loss_ce": 1.0120573043823242, + "loss_region": 0.030006378889083862, + "loss_total": 1.0420637130737305, + "lr": 0.0010339532201638427, + "router/selected_tokens_s0": 4363.625, + "step": 5190, + "tokens_trained": 17.002560624 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4748599390114177, + "grad_norm": 0.8241252303123474, + "loss": 1.0567, + "loss_ce": 1.0892800092697144, + "loss_region": 0.03000868298113346, + "loss_total": 1.119288682937622, + "lr": 0.0010335463281799918, + "router/selected_tokens_s0": 4367.125, + "step": 5200, + "tokens_trained": 17.035326064 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4776966172611872, + "grad_norm": 0.5182619690895081, + "loss": 1.0408, + "loss_ce": 0.9733250737190247, + "loss_region": 0.030004369094967842, + "loss_total": 1.0033293962478638, + "lr": 0.0010331394361961408, + "router/selected_tokens_s0": 4326.25, + "step": 5210, + "tokens_trained": 17.068091504 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4805332955109567, + "grad_norm": 0.7119126319885254, + "loss": 1.0389, + "loss_ce": 0.9868582487106323, + "loss_region": 0.03000757470726967, + "loss_total": 1.016865849494934, + "lr": 0.0010327325442122898, + "router/selected_tokens_s0": 4388.0, + "step": 5220, + "tokens_trained": 17.100856944 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4833699737607262, + "grad_norm": 1.0945305824279785, + "loss": 1.0462, + "loss_ce": 0.9747534990310669, + "loss_region": 0.030006079003214836, + "loss_total": 1.0047595500946045, + "lr": 0.0010323256522284385, + "router/selected_tokens_s0": 4341.25, + "step": 5230, + "tokens_trained": 17.133622384 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4862066520104957, + "grad_norm": 0.8298206329345703, + "loss": 1.0351, + "loss_ce": 1.05121648311615, + "loss_region": 0.030005764216184616, + "loss_total": 1.0812222957611084, + "lr": 0.0010319187602445875, + "router/selected_tokens_s0": 4326.375, + "step": 5240, + "tokens_trained": 17.166385424 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4890433302602653, + "grad_norm": 0.4219936728477478, + "loss": 1.0484, + "loss_ce": 0.8292503952980042, + "loss_region": 0.030046647414565086, + "loss_total": 0.8592970371246338, + "lr": 0.0010315118682607365, + "router/selected_tokens_s0": 4365.125, + "step": 5250, + "tokens_trained": 17.199150864 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4918800085100348, + "grad_norm": 0.19638904929161072, + "loss": 1.0493, + "loss_ce": 0.9696671962738037, + "loss_region": 0.030005378648638725, + "loss_total": 0.9996725916862488, + "lr": 0.0010311049762768854, + "router/selected_tokens_s0": 4325.25, + "step": 5260, + "tokens_trained": 17.231916304 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4947166867598043, + "grad_norm": 0.786198079586029, + "loss": 1.0451, + "loss_ce": 1.0354949235916138, + "loss_region": 0.03000979870557785, + "loss_total": 1.0655046701431274, + "lr": 0.0010306980842930346, + "router/selected_tokens_s0": 4347.125, + "step": 5270, + "tokens_trained": 17.264676432 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.4975533650095738, + "grad_norm": 0.7419178485870361, + "loss": 1.041, + "loss_ce": 0.928809404373169, + "loss_region": 0.030001329258084297, + "loss_total": 0.9588107466697693, + "lr": 0.0010302911923091836, + "router/selected_tokens_s0": 4279.625, + "step": 5280, + "tokens_trained": 17.297440272 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5003900432593433, + "grad_norm": 0.8321080803871155, + "loss": 1.0508, + "loss_ce": 1.090352177619934, + "loss_region": 0.030001569539308548, + "loss_total": 1.1203536987304688, + "lr": 0.0010298843003253325, + "router/selected_tokens_s0": 4361.625, + "step": 5290, + "tokens_trained": 17.330205712 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5032267215091129, + "grad_norm": 0.43943366408348083, + "loss": 1.0423, + "loss_ce": 1.054518461227417, + "loss_region": 0.03000991977751255, + "loss_total": 1.0845283269882202, + "lr": 0.0010294774083414815, + "router/selected_tokens_s0": 4375.625, + "step": 5300, + "tokens_trained": 17.362971152 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5060633997588824, + "grad_norm": 0.9542965888977051, + "loss": 1.0469, + "loss_ce": 1.113122820854187, + "loss_region": 0.030009111389517784, + "loss_total": 1.143131971359253, + "lr": 0.0010290705163576305, + "router/selected_tokens_s0": 4369.0, + "step": 5310, + "tokens_trained": 17.395736592 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.508900078008652, + "grad_norm": 0.6456644535064697, + "loss": 1.0438, + "loss_ce": 1.0606231689453125, + "loss_region": 0.030006930232048035, + "loss_total": 1.090630054473877, + "lr": 0.0010286636243737794, + "router/selected_tokens_s0": 4360.125, + "step": 5320, + "tokens_trained": 17.428502032 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5117367562584214, + "grad_norm": 1.505149006843567, + "loss": 1.0426, + "loss_ce": 1.065946340560913, + "loss_region": 0.030012134462594986, + "loss_total": 1.0959584712982178, + "lr": 0.0010282567323899284, + "router/selected_tokens_s0": 4342.875, + "step": 5330, + "tokens_trained": 17.461262816 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.514573434508191, + "grad_norm": 0.5819237232208252, + "loss": 1.0424, + "loss_ce": 0.9712111353874207, + "loss_region": 0.030006036162376404, + "loss_total": 1.0012171268463135, + "lr": 0.0010278498404060774, + "router/selected_tokens_s0": 4354.625, + "step": 5340, + "tokens_trained": 17.494028256 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5174101127579604, + "grad_norm": 0.5115887522697449, + "loss": 1.0468, + "loss_ce": 1.0570096969604492, + "loss_region": 0.0300076175481081, + "loss_total": 1.087017297744751, + "lr": 0.0010274429484222263, + "router/selected_tokens_s0": 4346.5, + "step": 5350, + "tokens_trained": 17.526793696 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.52024679100773, + "grad_norm": 0.5018046498298645, + "loss": 1.0437, + "loss_ce": 0.9743192195892334, + "loss_region": 0.030005570501089096, + "loss_total": 1.0043247938156128, + "lr": 0.0010270360564383753, + "router/selected_tokens_s0": 4360.75, + "step": 5360, + "tokens_trained": 17.559559136 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5230834692574995, + "grad_norm": 0.7800183892250061, + "loss": 1.0429, + "loss_ce": 1.0386371612548828, + "loss_region": 0.030008897185325623, + "loss_total": 1.0686460733413696, + "lr": 0.0010266291644545243, + "router/selected_tokens_s0": 4364.125, + "step": 5370, + "tokens_trained": 17.59232456 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.525920147507269, + "grad_norm": 0.6966549754142761, + "loss": 1.0489, + "loss_ce": 0.8995506167411804, + "loss_region": 0.03000815026462078, + "loss_total": 0.9295587539672852, + "lr": 0.0010262222724706732, + "router/selected_tokens_s0": 4385.75, + "step": 5380, + "tokens_trained": 17.6250892 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5287568257570385, + "grad_norm": 0.5771371722221375, + "loss": 1.0467, + "loss_ce": 0.9319908022880554, + "loss_region": 0.0300018098205328, + "loss_total": 0.961992621421814, + "lr": 0.0010258153804868222, + "router/selected_tokens_s0": 4344.5, + "step": 5390, + "tokens_trained": 17.65785464 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.531593504006808, + "grad_norm": 0.553460955619812, + "loss": 1.0509, + "loss_ce": 1.0958824157714844, + "loss_region": 0.030008256435394287, + "loss_total": 1.1258907318115234, + "lr": 0.0010254084885029712, + "router/selected_tokens_s0": 4395.875, + "step": 5400, + "tokens_trained": 17.69062008 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5344301822565776, + "grad_norm": 0.7295851111412048, + "loss": 1.0446, + "loss_ce": 1.0735763311386108, + "loss_region": 0.030002327635884285, + "loss_total": 1.1035786867141724, + "lr": 0.0010250015965191201, + "router/selected_tokens_s0": 4300.625, + "step": 5410, + "tokens_trained": 17.72338552 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.537266860506347, + "grad_norm": 0.4493541419506073, + "loss": 1.041, + "loss_ce": 1.1575278043746948, + "loss_region": 0.030008507892489433, + "loss_total": 1.187536358833313, + "lr": 0.001024594704535269, + "router/selected_tokens_s0": 4385.875, + "step": 5420, + "tokens_trained": 17.75615016 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5401035387561166, + "grad_norm": 1.0313796997070312, + "loss": 1.0498, + "loss_ce": 1.0423924922943115, + "loss_region": 0.030005216598510742, + "loss_total": 1.0723977088928223, + "lr": 0.001024187812551418, + "router/selected_tokens_s0": 4385.5, + "step": 5430, + "tokens_trained": 17.78891464 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5429402170058861, + "grad_norm": 0.6560305953025818, + "loss": 1.0418, + "loss_ce": 0.9702669978141785, + "loss_region": 0.030001483857631683, + "loss_total": 1.0002684593200684, + "lr": 0.001023780920567567, + "router/selected_tokens_s0": 4346.375, + "step": 5440, + "tokens_trained": 17.821677712 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5457768952556556, + "grad_norm": 0.2240542620420456, + "loss": 1.038, + "loss_ce": 1.117678165435791, + "loss_region": 0.03000263124704361, + "loss_total": 1.1476807594299316, + "lr": 0.0010233740285837162, + "router/selected_tokens_s0": 4357.25, + "step": 5450, + "tokens_trained": 17.854443152 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5486135735054252, + "grad_norm": 0.3824736475944519, + "loss": 1.0438, + "loss_ce": 1.013928771018982, + "loss_region": 0.030003132298588753, + "loss_total": 1.0439319610595703, + "lr": 0.0010229671365998652, + "router/selected_tokens_s0": 4288.0, + "step": 5460, + "tokens_trained": 17.88720588 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5514502517551947, + "grad_norm": 1.2875090837478638, + "loss": 1.0417, + "loss_ce": 1.0048933029174805, + "loss_region": 0.03000313974916935, + "loss_total": 1.0348964929580688, + "lr": 0.0010225602446160141, + "router/selected_tokens_s0": 4326.0, + "step": 5470, + "tokens_trained": 17.91997132 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5542869300049642, + "grad_norm": 0.32429569959640503, + "loss": 1.0345, + "loss_ce": 0.8651331067085266, + "loss_region": 0.029996881261467934, + "loss_total": 0.8951299786567688, + "lr": 0.0010221533526321629, + "router/selected_tokens_s0": 4298.625, + "step": 5480, + "tokens_trained": 17.952736472 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5571236082547337, + "grad_norm": 0.7797481417655945, + "loss": 1.0423, + "loss_ce": 1.0778536796569824, + "loss_region": 0.03001037798821926, + "loss_total": 1.1078640222549438, + "lr": 0.0010217464606483118, + "router/selected_tokens_s0": 4417.75, + "step": 5490, + "tokens_trained": 17.985498936 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5599602865045032, + "grad_norm": 0.6121678352355957, + "loss": 1.0487, + "loss_ce": 1.0614382028579712, + "loss_region": 0.030010351911187172, + "loss_total": 1.0914485454559326, + "lr": 0.0010213395686644608, + "router/selected_tokens_s0": 4413.5, + "step": 5500, + "tokens_trained": 18.018264376 + }, + { + "epoch": 1.5599602865045032, + "eval_ppl": 2.76765789476978, + "eval_runtime": 2.4835, + "step": 5500, + "tokens_trained": 18.018264376 + }, + { + "epoch": 1.5599602865045032, + "eval_F": 0.3403231655326333, + "eval_F_cds": 0.34277235568564984, + "eval_F_dig": 0.3267703205979634, + "eval_F_exon": 0.34366801139729736, + "eval_F_intron": 0.3408284238068733, + "eval_F_nig": 0.34099593292063235, + "eval_F_promoter": 0.33725428255936774, + "eval_F_utr": 0.34147572250874536, + "eval_G": 0.3415901383306685, + "eval_G_cds": 0.3420066364335616, + "eval_G_dig": 0.39036181619006693, + "eval_G_exon": 0.3413997151557254, + "eval_G_intron": 0.34085139204059245, + "eval_G_nig": 0.3408783641023706, + "eval_G_promoter": 0.34279264794696634, + "eval_G_utr": 0.34078024569894544, + "eval_avg_bp_per_token": 2.9383835756079635, + "eval_bp_per_token/cds": 2.9173881248378195, + "eval_bp_per_token/dig": 3.0602534470391327, + "eval_bp_per_token/exon": 2.9097849285831554, + "eval_bp_per_token/intron": 2.934027593211061, + "eval_bp_per_token/nig": 2.9325862963672136, + "eval_bp_per_token/promoter": 2.9651217248040944, + "eval_bp_per_token/utr": 2.9284658735128364, + "eval_ppl_cds": 3.18907175811622, + "eval_ppl_dig": 1.081375085654356, + "eval_ppl_exon": 3.2359904440638463, + "eval_ppl_intron": 2.8314949801099587, + "eval_ppl_nig": 2.600385066779234, + "eval_ppl_promoter": 3.03807385354442, + "eval_ppl_utr": 3.1444852350411727, + "step": 5500, + "tokens_trained": 18.018264376 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5627969647542728, + "grad_norm": 0.4803471267223358, + "loss": 1.0374, + "loss_ce": 0.9833582043647766, + "loss_region": 0.030004315078258514, + "loss_total": 1.0133625268936157, + "lr": 0.0010209326766806098, + "router/selected_tokens_s0": 4338.875, + "step": 5510, + "tokens_trained": 18.051029816 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5656336430040423, + "grad_norm": 1.0378037691116333, + "loss": 1.0417, + "loss_ce": 1.0130213499069214, + "loss_region": 0.030004722997546196, + "loss_total": 1.043026089668274, + "lr": 0.001020525784696759, + "router/selected_tokens_s0": 4365.25, + "step": 5520, + "tokens_trained": 18.083794456 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5684703212538118, + "grad_norm": 0.2039332240819931, + "loss": 1.0337, + "loss_ce": 0.972812831401825, + "loss_region": 0.030001500621438026, + "loss_total": 1.0028142929077148, + "lr": 0.001020118892712908, + "router/selected_tokens_s0": 4273.375, + "step": 5530, + "tokens_trained": 18.116559896 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5713069995035813, + "grad_norm": 0.9828659296035767, + "loss": 1.0435, + "loss_ce": 1.0882686376571655, + "loss_region": 0.030004315078258514, + "loss_total": 1.1182729005813599, + "lr": 0.0010197120007290569, + "router/selected_tokens_s0": 4338.375, + "step": 5540, + "tokens_trained": 18.149325336 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5741436777533508, + "grad_norm": 0.4906889498233795, + "loss": 1.0405, + "loss_ce": 0.9475066661834717, + "loss_region": 0.030002078041434288, + "loss_total": 0.9775087237358093, + "lr": 0.0010193051087452058, + "router/selected_tokens_s0": 4325.25, + "step": 5550, + "tokens_trained": 18.182090776 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5769803560031204, + "grad_norm": 0.6963337659835815, + "loss": 1.0379, + "loss_ce": 0.7684432864189148, + "loss_region": 0.029997603967785835, + "loss_total": 0.7984408736228943, + "lr": 0.0010188982167613548, + "router/selected_tokens_s0": 4256.625, + "step": 5560, + "tokens_trained": 18.214852704 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5798170342528899, + "grad_norm": 0.6726390719413757, + "loss": 1.0506, + "loss_ce": 1.0656613111495972, + "loss_region": 0.030005795881152153, + "loss_total": 1.0956671237945557, + "lr": 0.0010184913247775038, + "router/selected_tokens_s0": 4383.75, + "step": 5570, + "tokens_trained": 18.247618144 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5826537125026594, + "grad_norm": 0.5460783839225769, + "loss": 1.0391, + "loss_ce": 0.9652643203735352, + "loss_region": 0.030002180486917496, + "loss_total": 0.9952664971351624, + "lr": 0.0010180844327936527, + "router/selected_tokens_s0": 4303.875, + "step": 5580, + "tokens_trained": 18.280383584 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.585490390752429, + "grad_norm": 0.35639381408691406, + "loss": 1.0356, + "loss_ce": 0.88677579164505, + "loss_region": 0.030002212151885033, + "loss_total": 0.916778028011322, + "lr": 0.0010176775408098017, + "router/selected_tokens_s0": 4362.625, + "step": 5590, + "tokens_trained": 18.313149024 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5883270690021984, + "grad_norm": 0.5016544461250305, + "loss": 1.0311, + "loss_ce": 1.0120795965194702, + "loss_region": 0.030004823580384254, + "loss_total": 1.0420844554901123, + "lr": 0.0010172706488259507, + "router/selected_tokens_s0": 4349.75, + "step": 5600, + "tokens_trained": 18.345914464 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.591163747251968, + "grad_norm": 0.687404990196228, + "loss": 1.0409, + "loss_ce": 1.033144235610962, + "loss_region": 0.03000274859368801, + "loss_total": 1.063146948814392, + "lr": 0.0010168637568420996, + "router/selected_tokens_s0": 4313.5, + "step": 5610, + "tokens_trained": 18.378679104 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5940004255017375, + "grad_norm": 0.3557313084602356, + "loss": 1.0342, + "loss_ce": 0.9033691883087158, + "loss_region": 0.030007191002368927, + "loss_total": 0.9333763718605042, + "lr": 0.0010164568648582486, + "router/selected_tokens_s0": 4394.375, + "step": 5620, + "tokens_trained": 18.411444544 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.596837103751507, + "grad_norm": 0.6189426183700562, + "loss": 1.04, + "loss_ce": 0.8717077970504761, + "loss_region": 0.03000614605844021, + "loss_total": 0.9017139673233032, + "lr": 0.0010160499728743976, + "router/selected_tokens_s0": 4337.5, + "step": 5630, + "tokens_trained": 18.444209984 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.5996737820012765, + "grad_norm": 0.6988716721534729, + "loss": 1.0305, + "loss_ce": 0.975788950920105, + "loss_region": 0.029997356235980988, + "loss_total": 1.0057862997055054, + "lr": 0.0010156430808905465, + "router/selected_tokens_s0": 4335.875, + "step": 5640, + "tokens_trained": 18.476975424 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.602510460251046, + "grad_norm": 0.6643272042274475, + "loss": 1.0403, + "loss_ce": 1.1351817846298218, + "loss_region": 0.030008381232619286, + "loss_total": 1.1651902198791504, + "lr": 0.0010152361889066955, + "router/selected_tokens_s0": 4362.25, + "step": 5650, + "tokens_trained": 18.509739264 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6053471385008156, + "grad_norm": 0.8286615014076233, + "loss": 1.0373, + "loss_ce": 0.8297767043113708, + "loss_region": 0.03000292181968689, + "loss_total": 0.8597795963287354, + "lr": 0.0010148292969228445, + "router/selected_tokens_s0": 4327.25, + "step": 5660, + "tokens_trained": 18.542503904 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.608183816750585, + "grad_norm": 0.22749805450439453, + "loss": 1.0352, + "loss_ce": 0.9598128795623779, + "loss_region": 0.03000240959227085, + "loss_total": 0.9898152947425842, + "lr": 0.0010144224049389934, + "router/selected_tokens_s0": 4316.75, + "step": 5670, + "tokens_trained": 18.575269344 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6110204950003546, + "grad_norm": 0.25335147976875305, + "loss": 1.0285, + "loss_ce": 0.975443422794342, + "loss_region": 0.030008111149072647, + "loss_total": 1.0054515600204468, + "lr": 0.0010140155129551424, + "router/selected_tokens_s0": 4387.0, + "step": 5680, + "tokens_trained": 18.608031832 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6138571732501241, + "grad_norm": 0.8008378148078918, + "loss": 1.0393, + "loss_ce": 1.088114619255066, + "loss_region": 0.030002346262335777, + "loss_total": 1.1181169748306274, + "lr": 0.0010136086209712914, + "router/selected_tokens_s0": 4368.875, + "step": 5690, + "tokens_trained": 18.640797272 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6166938514998936, + "grad_norm": 0.6417054533958435, + "loss": 1.0381, + "loss_ce": 0.9379876255989075, + "loss_region": 0.03000989928841591, + "loss_total": 0.9679975509643555, + "lr": 0.0010132017289874405, + "router/selected_tokens_s0": 4368.875, + "step": 5700, + "tokens_trained": 18.673562712 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6195305297496632, + "grad_norm": 1.3401010036468506, + "loss": 1.0339, + "loss_ce": 1.040677785873413, + "loss_region": 0.030000442638993263, + "loss_total": 1.0706782341003418, + "lr": 0.0010127948370035895, + "router/selected_tokens_s0": 4332.25, + "step": 5710, + "tokens_trained": 18.706328152 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6223672079994327, + "grad_norm": 0.9140957593917847, + "loss": 1.0432, + "loss_ce": 1.1209052801132202, + "loss_region": 0.03001645766198635, + "loss_total": 1.1509217023849487, + "lr": 0.0010123879450197385, + "router/selected_tokens_s0": 4408.625, + "step": 5720, + "tokens_trained": 18.739093568 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6252038862492022, + "grad_norm": 0.38389793038368225, + "loss": 1.0393, + "loss_ce": 0.9129772782325745, + "loss_region": 0.02999955601990223, + "loss_total": 0.9429768323898315, + "lr": 0.0010119810530358872, + "router/selected_tokens_s0": 4306.5, + "step": 5730, + "tokens_trained": 18.771858208 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6280405644989717, + "grad_norm": 1.761724829673767, + "loss": 1.0455, + "loss_ce": 1.0788276195526123, + "loss_region": 0.030005717650055885, + "loss_total": 1.1088333129882812, + "lr": 0.0010115741610520362, + "router/selected_tokens_s0": 4344.625, + "step": 5740, + "tokens_trained": 18.804619848 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6308772427487412, + "grad_norm": 0.514764666557312, + "loss": 1.0331, + "loss_ce": 1.0706536769866943, + "loss_region": 0.030006183311343193, + "loss_total": 1.1006598472595215, + "lr": 0.0010111672690681851, + "router/selected_tokens_s0": 4372.25, + "step": 5750, + "tokens_trained": 18.837385288 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6337139209985108, + "grad_norm": 0.40612781047821045, + "loss": 1.036, + "loss_ce": 1.0906848907470703, + "loss_region": 0.030003517866134644, + "loss_total": 1.1206884384155273, + "lr": 0.0010107603770843341, + "router/selected_tokens_s0": 4322.25, + "step": 5760, + "tokens_trained": 18.870150648 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6365505992482803, + "grad_norm": 0.26753610372543335, + "loss": 1.0316, + "loss_ce": 1.00560462474823, + "loss_region": 0.030005289241671562, + "loss_total": 1.0356099605560303, + "lr": 0.0010103534851004833, + "router/selected_tokens_s0": 4323.5, + "step": 5770, + "tokens_trained": 18.902916088 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6393872774980498, + "grad_norm": 0.41570785641670227, + "loss": 1.0386, + "loss_ce": 0.9916934370994568, + "loss_region": 0.03000512719154358, + "loss_total": 1.0216985940933228, + "lr": 0.0010099465931166323, + "router/selected_tokens_s0": 4347.375, + "step": 5780, + "tokens_trained": 18.935681528 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6422239557478193, + "grad_norm": 0.5421174168586731, + "loss": 1.0318, + "loss_ce": 0.9400946497917175, + "loss_region": 0.030001504346728325, + "loss_total": 0.9700961709022522, + "lr": 0.0010095397011327812, + "router/selected_tokens_s0": 4331.0, + "step": 5790, + "tokens_trained": 18.968446968 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6450606339975888, + "grad_norm": 0.4832181930541992, + "loss": 1.0337, + "loss_ce": 0.9629077911376953, + "loss_region": 0.030001387000083923, + "loss_total": 0.9929091930389404, + "lr": 0.0010091328091489302, + "router/selected_tokens_s0": 4295.375, + "step": 5800, + "tokens_trained": 19.001210808 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6478973122473584, + "grad_norm": 1.0204321146011353, + "loss": 1.0304, + "loss_ce": 1.013646125793457, + "loss_region": 0.03000660054385662, + "loss_total": 1.0436527729034424, + "lr": 0.0010087259171650792, + "router/selected_tokens_s0": 4419.5, + "step": 5810, + "tokens_trained": 19.033976248 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6507339904971279, + "grad_norm": 0.5032446384429932, + "loss": 1.0334, + "loss_ce": 1.0649542808532715, + "loss_region": 0.030000925064086914, + "loss_total": 1.0949552059173584, + "lr": 0.0010083190251812281, + "router/selected_tokens_s0": 4287.125, + "step": 5820, + "tokens_trained": 19.066741688 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6535706687468974, + "grad_norm": 0.5891702175140381, + "loss": 1.0297, + "loss_ce": 1.0153957605361938, + "loss_region": 0.030004706233739853, + "loss_total": 1.0454005002975464, + "lr": 0.001007912133197377, + "router/selected_tokens_s0": 4354.75, + "step": 5830, + "tokens_trained": 19.099507128 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.656407346996667, + "grad_norm": 0.6594350934028625, + "loss": 1.0346, + "loss_ce": 1.0509767532348633, + "loss_region": 0.03000517748296261, + "loss_total": 1.080981969833374, + "lr": 0.001007505241213526, + "router/selected_tokens_s0": 4363.25, + "step": 5840, + "tokens_trained": 19.132272568 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6592440252464364, + "grad_norm": 0.5906273126602173, + "loss": 1.0337, + "loss_ce": 1.0429621934890747, + "loss_region": 0.030009938403964043, + "loss_total": 1.0729721784591675, + "lr": 0.001007098349229675, + "router/selected_tokens_s0": 4366.125, + "step": 5850, + "tokens_trained": 19.165038008 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.662080703496206, + "grad_norm": 0.47190093994140625, + "loss": 1.0335, + "loss_ce": 0.950088381767273, + "loss_region": 0.030002925544977188, + "loss_total": 0.9800913333892822, + "lr": 0.001006691457245824, + "router/selected_tokens_s0": 4327.5, + "step": 5860, + "tokens_trained": 19.197803448 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6649173817459755, + "grad_norm": 0.5748708844184875, + "loss": 1.0324, + "loss_ce": 1.0693488121032715, + "loss_region": 0.030013523995876312, + "loss_total": 1.0993623733520508, + "lr": 0.001006284565261973, + "router/selected_tokens_s0": 4386.25, + "step": 5870, + "tokens_trained": 19.230568888 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.667754059995745, + "grad_norm": 0.5576515793800354, + "loss": 1.0347, + "loss_ce": 1.090317964553833, + "loss_region": 0.030007001012563705, + "loss_total": 1.120324969291687, + "lr": 0.001005877673278122, + "router/selected_tokens_s0": 4399.0, + "step": 5880, + "tokens_trained": 19.263333528 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6705907382455145, + "grad_norm": 0.4692791998386383, + "loss": 1.0231, + "loss_ce": 0.9621900320053101, + "loss_region": 0.02999553643167019, + "loss_total": 0.9921855926513672, + "lr": 0.0010054707812942709, + "router/selected_tokens_s0": 4265.375, + "step": 5890, + "tokens_trained": 19.296098168 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.673427416495284, + "grad_norm": 0.5106430649757385, + "loss": 1.0364, + "loss_ce": 0.8931739330291748, + "loss_region": 0.030010921880602837, + "loss_total": 0.923184871673584, + "lr": 0.0010050638893104198, + "router/selected_tokens_s0": 4364.125, + "step": 5900, + "tokens_trained": 19.328862688 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6762640947450536, + "grad_norm": 0.5919066667556763, + "loss": 1.0405, + "loss_ce": 1.1164112091064453, + "loss_region": 0.03000679798424244, + "loss_total": 1.1464179754257202, + "lr": 0.0010046569973265688, + "router/selected_tokens_s0": 4368.0, + "step": 5910, + "tokens_trained": 19.361627328 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.679100772994823, + "grad_norm": 0.5985324382781982, + "loss": 1.029, + "loss_ce": 0.8035845160484314, + "loss_region": 0.030008496716618538, + "loss_total": 0.8335930109024048, + "lr": 0.0010042501053427178, + "router/selected_tokens_s0": 4344.0, + "step": 5920, + "tokens_trained": 19.394392768 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6819374512445926, + "grad_norm": 0.46029677987098694, + "loss": 1.0369, + "loss_ce": 1.1152499914169312, + "loss_region": 0.030002374202013016, + "loss_total": 1.1452523469924927, + "lr": 0.0010038432133588667, + "router/selected_tokens_s0": 4378.875, + "step": 5930, + "tokens_trained": 19.427158208 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6847741294943621, + "grad_norm": 0.5811964273452759, + "loss": 1.0304, + "loss_ce": 1.0038641691207886, + "loss_region": 0.03000630810856819, + "loss_total": 1.0338704586029053, + "lr": 0.0010034363213750157, + "router/selected_tokens_s0": 4316.0, + "step": 5940, + "tokens_trained": 19.459922848 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6876108077441316, + "grad_norm": 0.2964920401573181, + "loss": 1.0347, + "loss_ce": 1.0238761901855469, + "loss_region": 0.03000667691230774, + "loss_total": 1.0538828372955322, + "lr": 0.0010030294293911649, + "router/selected_tokens_s0": 4331.875, + "step": 5950, + "tokens_trained": 19.492688288 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6904474859939012, + "grad_norm": 0.20688390731811523, + "loss": 1.031, + "loss_ce": 1.0256707668304443, + "loss_region": 0.030003707855939865, + "loss_total": 1.055674433708191, + "lr": 0.0010026225374073139, + "router/selected_tokens_s0": 4327.875, + "step": 5960, + "tokens_trained": 19.525453728 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6932841642436707, + "grad_norm": 0.4191875755786896, + "loss": 1.0357, + "loss_ce": 0.9435751438140869, + "loss_region": 0.03000720962882042, + "loss_total": 0.9735823273658752, + "lr": 0.0010022156454234628, + "router/selected_tokens_s0": 4370.375, + "step": 5970, + "tokens_trained": 19.558219168 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6961208424934402, + "grad_norm": 0.6510814428329468, + "loss": 1.0246, + "loss_ce": 0.9768120050430298, + "loss_region": 0.030007587745785713, + "loss_total": 1.0068196058273315, + "lr": 0.0010018087534396116, + "router/selected_tokens_s0": 4360.875, + "step": 5980, + "tokens_trained": 19.590983808 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.6989575207432097, + "grad_norm": 0.477987676858902, + "loss": 1.0252, + "loss_ce": 1.0579345226287842, + "loss_region": 0.030001208186149597, + "loss_total": 1.0879356861114502, + "lr": 0.0010014018614557605, + "router/selected_tokens_s0": 4310.375, + "step": 5990, + "tokens_trained": 19.623749248 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7017941989929792, + "grad_norm": 0.5178934335708618, + "loss": 1.0325, + "loss_ce": 1.01963472366333, + "loss_region": 0.03001089207828045, + "loss_total": 1.0496456623077393, + "lr": 0.0010009949694719095, + "router/selected_tokens_s0": 4379.5, + "step": 6000, + "tokens_trained": 19.65651468 + }, + { + "epoch": 1.7017941989929792, + "eval_ppl": 2.7438295521464737, + "eval_runtime": 2.524, + "step": 6000, + "tokens_trained": 19.65651468 + }, + { + "epoch": 1.7017941989929792, + "eval_F": 0.34151615105799704, + "eval_F_cds": 0.3443491198421619, + "eval_F_dig": 0.3249212178034742, + "eval_F_exon": 0.3452196236634976, + "eval_F_intron": 0.3418051895789037, + "eval_F_nig": 0.3420160147164305, + "eval_F_promoter": 0.33950926318966235, + "eval_F_utr": 0.34296393229500766, + "eval_G": 0.3440260132521393, + "eval_G_cds": 0.34685659178384587, + "eval_G_dig": 0.3855714028765268, + "eval_G_exon": 0.3442092298449093, + "eval_G_intron": 0.3428874806376615, + "eval_G_nig": 0.34218012092239336, + "eval_G_promoter": 0.34825656510586706, + "eval_G_utr": 0.3435660953042277, + "eval_avg_bp_per_token": 2.9281192028607097, + "eval_bp_per_token/cds": 2.904029493260696, + "eval_bp_per_token/dig": 3.077669124719461, + "eval_bp_per_token/exon": 2.896706709160742, + "eval_bp_per_token/intron": 2.9256431162791223, + "eval_bp_per_token/nig": 2.9238396945508875, + "eval_bp_per_token/promoter": 2.9454277347400777, + "eval_bp_per_token/utr": 2.9157584977181474, + "eval_ppl_cds": 3.1473755860930703, + "eval_ppl_dig": 1.0781771784423138, + "eval_ppl_exon": 3.215945780217024, + "eval_ppl_intron": 2.8130272235017966, + "eval_ppl_nig": 2.58073245677091, + "eval_ppl_promoter": 2.9110875223569272, + "eval_ppl_utr": 3.090760374244644, + "step": 6000, + "tokens_trained": 19.65651468 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7046308772427488, + "grad_norm": 0.3065147399902344, + "loss": 1.0284, + "loss_ce": 1.0937912464141846, + "loss_region": 0.03000638633966446, + "loss_total": 1.1237976551055908, + "lr": 0.0010005880774880585, + "router/selected_tokens_s0": 4393.875, + "step": 6010, + "tokens_trained": 19.68928012 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7074675554925183, + "grad_norm": 0.462746798992157, + "loss": 1.0309, + "loss_ce": 1.0331625938415527, + "loss_region": 0.030001437291502953, + "loss_total": 1.0631639957427979, + "lr": 0.0010001811855042076, + "router/selected_tokens_s0": 4302.125, + "step": 6020, + "tokens_trained": 19.722045536 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7103042337422878, + "grad_norm": 0.47632360458374023, + "loss": 1.0327, + "loss_ce": 0.9992091655731201, + "loss_region": 0.030003167688846588, + "loss_total": 1.0292123556137085, + "lr": 0.0009997742935203566, + "router/selected_tokens_s0": 4340.25, + "step": 6030, + "tokens_trained": 19.754810976 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7131409119920573, + "grad_norm": 0.24395163357257843, + "loss": 1.0306, + "loss_ce": 1.0945944786071777, + "loss_region": 0.03001002036035061, + "loss_total": 1.1246044635772705, + "lr": 0.0009993674015365056, + "router/selected_tokens_s0": 4380.875, + "step": 6040, + "tokens_trained": 19.787576416 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7159775902418268, + "grad_norm": 0.5877518653869629, + "loss": 1.0278, + "loss_ce": 0.9862028956413269, + "loss_region": 0.030005421489477158, + "loss_total": 1.016208291053772, + "lr": 0.0009989605095526545, + "router/selected_tokens_s0": 4347.0, + "step": 6050, + "tokens_trained": 19.820338544 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7188142684915964, + "grad_norm": 0.48769211769104004, + "loss": 1.0304, + "loss_ce": 1.0761768817901611, + "loss_region": 0.030014390125870705, + "loss_total": 1.1061912775039673, + "lr": 0.0009985536175688035, + "router/selected_tokens_s0": 4388.5, + "step": 6060, + "tokens_trained": 19.853099816 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7216509467413659, + "grad_norm": 0.7390428185462952, + "loss": 1.033, + "loss_ce": 1.0156022310256958, + "loss_region": 0.0300312303006649, + "loss_total": 1.0456334352493286, + "lr": 0.0009981467255849525, + "router/selected_tokens_s0": 4449.375, + "step": 6070, + "tokens_trained": 19.885862632 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7244876249911354, + "grad_norm": 0.529603123664856, + "loss": 1.0352, + "loss_ce": 0.9190145134925842, + "loss_region": 0.030002398416399956, + "loss_total": 0.9490169286727905, + "lr": 0.0009977398336011014, + "router/selected_tokens_s0": 4333.75, + "step": 6080, + "tokens_trained": 19.918628072 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.727324303240905, + "grad_norm": 0.4798877239227295, + "loss": 1.0327, + "loss_ce": 1.0317190885543823, + "loss_region": 0.030008524656295776, + "loss_total": 1.0617276430130005, + "lr": 0.0009973329416172504, + "router/selected_tokens_s0": 4363.0, + "step": 6090, + "tokens_trained": 19.951393512 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7301609814906744, + "grad_norm": 0.38358357548713684, + "loss": 1.0253, + "loss_ce": 0.981336236000061, + "loss_region": 0.030010856688022614, + "loss_total": 1.0113470554351807, + "lr": 0.0009969260496333994, + "router/selected_tokens_s0": 4339.5, + "step": 6100, + "tokens_trained": 19.984154824 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.732997659740444, + "grad_norm": 0.6959689855575562, + "loss": 1.0329, + "loss_ce": 1.0202362537384033, + "loss_region": 0.0300093125551939, + "loss_total": 1.0502455234527588, + "lr": 0.0009965191576495483, + "router/selected_tokens_s0": 4384.0, + "step": 6110, + "tokens_trained": 20.016919464 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7358343379902135, + "grad_norm": 0.40080687403678894, + "loss": 1.0209, + "loss_ce": 1.011544108390808, + "loss_region": 0.030006486922502518, + "loss_total": 1.041550636291504, + "lr": 0.0009961122656656973, + "router/selected_tokens_s0": 4340.5, + "step": 6120, + "tokens_trained": 20.049684904 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.738671016239983, + "grad_norm": 0.6023288369178772, + "loss": 1.0308, + "loss_ce": 1.0501810312271118, + "loss_region": 0.03000890463590622, + "loss_total": 1.0801899433135986, + "lr": 0.0009957053736818463, + "router/selected_tokens_s0": 4388.625, + "step": 6130, + "tokens_trained": 20.082450344 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7415076944897525, + "grad_norm": 0.6402179598808289, + "loss": 1.0291, + "loss_ce": 0.9643874764442444, + "loss_region": 0.030004534870386124, + "loss_total": 0.9943920373916626, + "lr": 0.0009952984816979952, + "router/selected_tokens_s0": 4334.25, + "step": 6140, + "tokens_trained": 20.115215784 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.744344372739522, + "grad_norm": 0.3781474232673645, + "loss": 1.0335, + "loss_ce": 0.9693796038627625, + "loss_region": 0.029998643323779106, + "loss_total": 0.9993782639503479, + "lr": 0.0009948915897141442, + "router/selected_tokens_s0": 4263.625, + "step": 6150, + "tokens_trained": 20.147981224 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7471810509892916, + "grad_norm": 0.542966365814209, + "loss": 1.03, + "loss_ce": 0.8867242932319641, + "loss_region": 0.03000199794769287, + "loss_total": 0.916726291179657, + "lr": 0.0009944846977302932, + "router/selected_tokens_s0": 4323.375, + "step": 6160, + "tokens_trained": 20.180746664 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.750017729239061, + "grad_norm": 0.8848214149475098, + "loss": 1.0291, + "loss_ce": 0.8852431178092957, + "loss_region": 0.030019113793969154, + "loss_total": 0.9152622222900391, + "lr": 0.0009940778057464421, + "router/selected_tokens_s0": 4397.0, + "step": 6170, + "tokens_trained": 20.213511184 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7528544074888306, + "grad_norm": 0.46940702199935913, + "loss": 1.0313, + "loss_ce": 0.9889075756072998, + "loss_region": 0.03000248782336712, + "loss_total": 1.0189100503921509, + "lr": 0.000993670913762591, + "router/selected_tokens_s0": 4355.375, + "step": 6180, + "tokens_trained": 20.246275824 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7556910857386, + "grad_norm": 0.7385205626487732, + "loss": 1.0307, + "loss_ce": 1.0188992023468018, + "loss_region": 0.030005453154444695, + "loss_total": 1.0489046573638916, + "lr": 0.00099326402177874, + "router/selected_tokens_s0": 4358.875, + "step": 6190, + "tokens_trained": 20.279039664 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7585277639883696, + "grad_norm": 0.46529778838157654, + "loss": 1.0285, + "loss_ce": 0.7485909461975098, + "loss_region": 0.030002525076270103, + "loss_total": 0.7785934805870056, + "lr": 0.0009928571297948892, + "router/selected_tokens_s0": 4315.0, + "step": 6200, + "tokens_trained": 20.311804304 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7613644422381391, + "grad_norm": 0.9400434494018555, + "loss": 1.0333, + "loss_ce": 0.9820476770401001, + "loss_region": 0.030005764216184616, + "loss_total": 1.0120534896850586, + "lr": 0.0009924502378110382, + "router/selected_tokens_s0": 4343.875, + "step": 6210, + "tokens_trained": 20.344569744 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7642011204879087, + "grad_norm": 0.36110371351242065, + "loss": 1.0273, + "loss_ce": 0.9966421723365784, + "loss_region": 0.030001837760210037, + "loss_total": 1.026643991470337, + "lr": 0.0009920433458271872, + "router/selected_tokens_s0": 4324.25, + "step": 6220, + "tokens_trained": 20.377335184 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7670377987376782, + "grad_norm": 0.32094910740852356, + "loss": 1.0183, + "loss_ce": 0.9931308627128601, + "loss_region": 0.02998800203204155, + "loss_total": 1.0231188535690308, + "lr": 0.000991636453843336, + "router/selected_tokens_s0": 4319.375, + "step": 6230, + "tokens_trained": 20.410099824 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7698744769874477, + "grad_norm": 0.6356220245361328, + "loss": 1.0232, + "loss_ce": 0.9035879373550415, + "loss_region": 0.02999711036682129, + "loss_total": 0.9335850477218628, + "lr": 0.0009912295618594849, + "router/selected_tokens_s0": 4267.25, + "step": 6240, + "tokens_trained": 20.442865264 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7727111552372172, + "grad_norm": 0.6256352066993713, + "loss": 1.0244, + "loss_ce": 0.9663033485412598, + "loss_region": 0.030001938343048096, + "loss_total": 0.9963052868843079, + "lr": 0.0009908226698756338, + "router/selected_tokens_s0": 4327.0, + "step": 6250, + "tokens_trained": 20.475630704 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7755478334869867, + "grad_norm": 0.6321415305137634, + "loss": 1.0334, + "loss_ce": 0.8710272312164307, + "loss_region": 0.029989438131451607, + "loss_total": 0.9010166525840759, + "lr": 0.0009904157778917828, + "router/selected_tokens_s0": 4299.875, + "step": 6260, + "tokens_trained": 20.508396144 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7783845117367563, + "grad_norm": 0.5275737643241882, + "loss": 1.0215, + "loss_ce": 0.8997759222984314, + "loss_region": 0.030003786087036133, + "loss_total": 0.9297797083854675, + "lr": 0.000990008885907932, + "router/selected_tokens_s0": 4319.75, + "step": 6270, + "tokens_trained": 20.541161584 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7812211899865258, + "grad_norm": 0.3214111030101776, + "loss": 1.0308, + "loss_ce": 1.0357693433761597, + "loss_region": 0.030006133019924164, + "loss_total": 1.0657755136489868, + "lr": 0.000989601993924081, + "router/selected_tokens_s0": 4368.75, + "step": 6280, + "tokens_trained": 20.573927024 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7840578682362953, + "grad_norm": 0.5135270953178406, + "loss": 1.0276, + "loss_ce": 1.0426994562149048, + "loss_region": 0.030001819133758545, + "loss_total": 1.0727012157440186, + "lr": 0.00098919510194023, + "router/selected_tokens_s0": 4314.75, + "step": 6290, + "tokens_trained": 20.606692464 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7868945464860648, + "grad_norm": 0.552690327167511, + "loss": 1.0324, + "loss_ce": 0.9552174210548401, + "loss_region": 0.030004629865288734, + "loss_total": 0.9852220416069031, + "lr": 0.0009887882099563789, + "router/selected_tokens_s0": 4336.25, + "step": 6300, + "tokens_trained": 20.639457104 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7897312247358343, + "grad_norm": 0.3548526167869568, + "loss": 1.0292, + "loss_ce": 1.0080056190490723, + "loss_region": 0.030002424493432045, + "loss_total": 1.0380080938339233, + "lr": 0.0009883813179725278, + "router/selected_tokens_s0": 4354.375, + "step": 6310, + "tokens_trained": 20.672222544 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7925679029856039, + "grad_norm": 0.584200382232666, + "loss": 1.0328, + "loss_ce": 1.0079751014709473, + "loss_region": 0.030002448707818985, + "loss_total": 1.0379775762557983, + "lr": 0.0009879744259886768, + "router/selected_tokens_s0": 4336.125, + "step": 6320, + "tokens_trained": 20.704987984 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.7954045812353734, + "grad_norm": 0.4808727502822876, + "loss": 1.025, + "loss_ce": 1.007968783378601, + "loss_region": 0.030000345781445503, + "loss_total": 1.0379691123962402, + "lr": 0.0009875675340048258, + "router/selected_tokens_s0": 4286.0, + "step": 6330, + "tokens_trained": 20.737753424 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.798241259485143, + "grad_norm": 0.6309012770652771, + "loss": 1.0237, + "loss_ce": 1.107874870300293, + "loss_region": 0.030004162341356277, + "loss_total": 1.1378790140151978, + "lr": 0.0009871606420209747, + "router/selected_tokens_s0": 4346.375, + "step": 6340, + "tokens_trained": 20.770518864 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.8010779377349124, + "grad_norm": 0.5760451555252075, + "loss": 1.0323, + "loss_ce": 0.9537031650543213, + "loss_region": 0.030006103217601776, + "loss_total": 0.9837092757225037, + "lr": 0.0009867537500371237, + "router/selected_tokens_s0": 4342.75, + "step": 6350, + "tokens_trained": 20.803284304 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.803914615984682, + "grad_norm": 0.3284536600112915, + "loss": 1.0216, + "loss_ce": 0.9741318821907043, + "loss_region": 0.030000338330864906, + "loss_total": 1.0041322708129883, + "lr": 0.0009863468580532727, + "router/selected_tokens_s0": 4289.875, + "step": 6360, + "tokens_trained": 20.836048936 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.8067512942344515, + "grad_norm": 0.5585495829582214, + "loss": 1.0309, + "loss_ce": 1.01836097240448, + "loss_region": 0.03000062145292759, + "loss_total": 1.0483615398406982, + "lr": 0.0009859399660694216, + "router/selected_tokens_s0": 4364.375, + "step": 6370, + "tokens_trained": 20.868814376 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.809587972484221, + "grad_norm": 0.1517442911863327, + "loss": 1.0294, + "loss_ce": 1.0748825073242188, + "loss_region": 0.030010292306542397, + "loss_total": 1.1048928499221802, + "lr": 0.0009855330740855706, + "router/selected_tokens_s0": 4393.375, + "step": 6380, + "tokens_trained": 20.901579816 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.8124246507339905, + "grad_norm": 0.6283737421035767, + "loss": 1.0322, + "loss_ce": 1.0237987041473389, + "loss_region": 0.030001981183886528, + "loss_total": 1.0538007020950317, + "lr": 0.0009851261821017196, + "router/selected_tokens_s0": 4333.625, + "step": 6390, + "tokens_trained": 20.934345256 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.81526132898376, + "grad_norm": 0.4309033453464508, + "loss": 1.0349, + "loss_ce": 0.9419378042221069, + "loss_region": 0.030002743005752563, + "loss_total": 0.9719405174255371, + "lr": 0.0009847192901178685, + "router/selected_tokens_s0": 4317.375, + "step": 6400, + "tokens_trained": 20.967109088 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.8180980072335295, + "grad_norm": 0.20555104315280914, + "loss": 1.0214, + "loss_ce": 0.933826744556427, + "loss_region": 0.02999957650899887, + "loss_total": 0.9638262987136841, + "lr": 0.0009843123981340175, + "router/selected_tokens_s0": 4284.25, + "step": 6410, + "tokens_trained": 20.999874528 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.820934685483299, + "grad_norm": 0.5018749237060547, + "loss": 1.0248, + "loss_ce": 1.0134315490722656, + "loss_region": 0.030000217258930206, + "loss_total": 1.0434317588806152, + "lr": 0.0009839055061501665, + "router/selected_tokens_s0": 4271.375, + "step": 6420, + "tokens_trained": 21.032639968 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.8237713637330686, + "grad_norm": 0.5851964950561523, + "loss": 1.0338, + "loss_ce": 0.9692232608795166, + "loss_region": 0.03000546433031559, + "loss_total": 0.9992287158966064, + "lr": 0.0009834986141663154, + "router/selected_tokens_s0": 4347.5, + "step": 6430, + "tokens_trained": 21.065405408 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.826608041982838, + "grad_norm": 0.39419281482696533, + "loss": 1.0284, + "loss_ce": 1.0628924369812012, + "loss_region": 0.0300043523311615, + "loss_total": 1.092896819114685, + "lr": 0.0009830917221824644, + "router/selected_tokens_s0": 4324.875, + "step": 6440, + "tokens_trained": 21.098170848 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.8294447202326076, + "grad_norm": 0.5206553339958191, + "loss": 1.0197, + "loss_ce": 1.0841736793518066, + "loss_region": 0.0300045907497406, + "loss_total": 1.1141782999038696, + "lr": 0.0009826848301986136, + "router/selected_tokens_s0": 4364.25, + "step": 6450, + "tokens_trained": 21.13093628 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.8322813984823771, + "grad_norm": 0.2195570319890976, + "loss": 1.0281, + "loss_ce": 1.002855896949768, + "loss_region": 0.030004169791936874, + "loss_total": 1.0328600406646729, + "lr": 0.0009822779382147625, + "router/selected_tokens_s0": 4314.625, + "step": 6460, + "tokens_trained": 21.163697856 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.8351180767321467, + "grad_norm": 0.7602940797805786, + "loss": 1.0209, + "loss_ce": 0.9888380765914917, + "loss_region": 0.03000422567129135, + "loss_total": 1.018842339515686, + "lr": 0.0009818710462309115, + "router/selected_tokens_s0": 4364.75, + "step": 6470, + "tokens_trained": 21.196463296 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.8379547549819162, + "grad_norm": 0.2704218626022339, + "loss": 1.0326, + "loss_ce": 1.0629881620407104, + "loss_region": 0.030001401901245117, + "loss_total": 1.0929895639419556, + "lr": 0.0009814641542470603, + "router/selected_tokens_s0": 4301.75, + "step": 6480, + "tokens_trained": 21.229228736 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.8407914332316857, + "grad_norm": 0.4285254180431366, + "loss": 1.0214, + "loss_ce": 0.9565003514289856, + "loss_region": 0.03000580705702305, + "loss_total": 0.9865061640739441, + "lr": 0.0009810572622632092, + "router/selected_tokens_s0": 4312.375, + "step": 6490, + "tokens_trained": 21.261994176 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.8436281114814552, + "grad_norm": 0.2764984369277954, + "loss": 1.0249, + "loss_ce": 1.0562752485275269, + "loss_region": 0.030008725821971893, + "loss_total": 1.0862839221954346, + "lr": 0.0009806503702793582, + "router/selected_tokens_s0": 4395.625, + "step": 6500, + "tokens_trained": 21.294759616 + }, + { + "epoch": 1.8436281114814552, + "eval_ppl": 2.719240196787984, + "eval_runtime": 2.4852, + "step": 6500, + "tokens_trained": 21.294759616 + }, + { + "epoch": 1.8436281114814552, + "eval_F": 0.3393643610064645, + "eval_F_cds": 0.3420203604725441, + "eval_F_dig": 0.3413808370445608, + "eval_F_exon": 0.34287809969814087, + "eval_F_intron": 0.33961228489181566, + "eval_F_nig": 0.3401546966344871, + "eval_F_promoter": 0.3361994198254964, + "eval_F_utr": 0.3403125240551151, + "eval_G": 0.34161998320486464, + "eval_G_cds": 0.34332446003408235, + "eval_G_dig": 0.39934358681993387, + "eval_G_exon": 0.34179626890145853, + "eval_G_intron": 0.34055942263612277, + "eval_G_nig": 0.3402422656561974, + "eval_G_promoter": 0.3442145956186178, + "eval_G_utr": 0.34100238800472976, + "eval_avg_bp_per_token": 2.946685376844716, + "eval_bp_per_token/cds": 2.9238025438555013, + "eval_bp_per_token/dig": 2.9292798291119926, + "eval_bp_per_token/exon": 2.9164883988810266, + "eval_bp_per_token/intron": 2.944534236500168, + "eval_bp_per_token/nig": 2.939838873001213, + "eval_bp_per_token/promoter": 2.974425121016116, + "eval_bp_per_token/utr": 2.938475458041167, + "eval_ppl_cds": 3.058239244523598, + "eval_ppl_dig": 1.0800319377974048, + "eval_ppl_exon": 3.21105125207959, + "eval_ppl_intron": 2.7948166784618853, + "eval_ppl_nig": 2.570261786790299, + "eval_ppl_promoter": 2.8363258206308832, + "eval_ppl_utr": 3.038061029444503, + "step": 6500, + "tokens_trained": 21.294759616 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.8464647897312247, + "grad_norm": 0.4310530126094818, + "loss": 1.0161, + "loss_ce": 0.8663329482078552, + "loss_region": 0.030006980523467064, + "loss_total": 0.8963399529457092, + "lr": 0.0009802434782955071, + "router/selected_tokens_s0": 4334.5, + "step": 6510, + "tokens_trained": 21.327524256 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.8493014679809943, + "grad_norm": 0.2964246869087219, + "loss": 1.0236, + "loss_ce": 1.0441830158233643, + "loss_region": 0.03000643663108349, + "loss_total": 1.0741894245147705, + "lr": 0.0009798365863116563, + "router/selected_tokens_s0": 4382.625, + "step": 6520, + "tokens_trained": 21.360289696 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.8521381462307638, + "grad_norm": 0.3320431709289551, + "loss": 1.0227, + "loss_ce": 0.9191460609436035, + "loss_region": 0.030004648491740227, + "loss_total": 0.9491506814956665, + "lr": 0.0009794296943278053, + "router/selected_tokens_s0": 4335.0, + "step": 6530, + "tokens_trained": 21.393055136 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.8549748244805333, + "grad_norm": 0.5521734356880188, + "loss": 1.018, + "loss_ce": 1.074299693107605, + "loss_region": 0.03000738099217415, + "loss_total": 1.1043070554733276, + "lr": 0.0009790228023439543, + "router/selected_tokens_s0": 4351.5, + "step": 6540, + "tokens_trained": 21.425820576 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.8578115027303028, + "grad_norm": 0.3981088101863861, + "loss": 1.0187, + "loss_ce": 1.0034153461456299, + "loss_region": 0.03000948764383793, + "loss_total": 1.0334248542785645, + "lr": 0.0009786159103601032, + "router/selected_tokens_s0": 4381.375, + "step": 6550, + "tokens_trained": 21.458586016 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.8606481809800723, + "grad_norm": 0.3619522750377655, + "loss": 1.0296, + "loss_ce": 0.865348219871521, + "loss_region": 0.030003046616911888, + "loss_total": 0.8953512907028198, + "lr": 0.0009782090183762522, + "router/selected_tokens_s0": 4279.0, + "step": 6560, + "tokens_trained": 21.491351456 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.8634848592298419, + "grad_norm": 0.5320607423782349, + "loss": 1.0219, + "loss_ce": 1.0214167833328247, + "loss_region": 0.030002374202013016, + "loss_total": 1.0514191389083862, + "lr": 0.0009778021263924012, + "router/selected_tokens_s0": 4311.875, + "step": 6570, + "tokens_trained": 21.524116896 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.8663215374796114, + "grad_norm": 0.2956782877445221, + "loss": 1.0234, + "loss_ce": 0.8803601861000061, + "loss_region": 0.03000090830028057, + "loss_total": 0.910361111164093, + "lr": 0.0009773952344085501, + "router/selected_tokens_s0": 4292.375, + "step": 6580, + "tokens_trained": 21.556882336 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.869158215729381, + "grad_norm": 0.4348936080932617, + "loss": 1.0269, + "loss_ce": 1.0594830513000488, + "loss_region": 0.03000825271010399, + "loss_total": 1.0894912481307983, + "lr": 0.000976988342424699, + "router/selected_tokens_s0": 4375.625, + "step": 6590, + "tokens_trained": 21.589647776 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.8719948939791504, + "grad_norm": 0.6076143383979797, + "loss": 1.0177, + "loss_ce": 0.907101035118103, + "loss_region": 0.030000492930412292, + "loss_total": 0.9371015429496765, + "lr": 0.000976581450440848, + "router/selected_tokens_s0": 4359.0, + "step": 6600, + "tokens_trained": 21.622413216 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.87483157222892, + "grad_norm": 0.19470538198947906, + "loss": 1.0245, + "loss_ce": 0.9608269929885864, + "loss_region": 0.030007699504494667, + "loss_total": 0.9908347129821777, + "lr": 0.0009761745584569969, + "router/selected_tokens_s0": 4379.625, + "step": 6610, + "tokens_trained": 21.655178656 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.8776682504786895, + "grad_norm": 0.8752713203430176, + "loss": 1.0257, + "loss_ce": 0.9740114808082581, + "loss_region": 0.030000993981957436, + "loss_total": 1.0040124654769897, + "lr": 0.000975767666473146, + "router/selected_tokens_s0": 4279.0, + "step": 6620, + "tokens_trained": 21.687944096 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.880504928728459, + "grad_norm": 0.39157724380493164, + "loss": 1.0279, + "loss_ce": 1.0259708166122437, + "loss_region": 0.030008818954229355, + "loss_total": 1.055979609489441, + "lr": 0.000975360774489295, + "router/selected_tokens_s0": 4390.125, + "step": 6630, + "tokens_trained": 21.720707792 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.8833416069782285, + "grad_norm": 0.6138049960136414, + "loss": 1.0229, + "loss_ce": 0.9629898071289062, + "loss_region": 0.030006365850567818, + "loss_total": 0.9929961562156677, + "lr": 0.0009749538825054439, + "router/selected_tokens_s0": 4384.375, + "step": 6640, + "tokens_trained": 21.753473232 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.886178285227998, + "grad_norm": 0.43559062480926514, + "loss": 1.0204, + "loss_ce": 1.0367153882980347, + "loss_region": 0.029998227953910828, + "loss_total": 1.066713571548462, + "lr": 0.0009745469905215929, + "router/selected_tokens_s0": 4288.0, + "step": 6650, + "tokens_trained": 21.786238672 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.8890149634777675, + "grad_norm": 0.4014212489128113, + "loss": 1.0182, + "loss_ce": 1.0711160898208618, + "loss_region": 0.029999660328030586, + "loss_total": 1.1011157035827637, + "lr": 0.0009741400985377418, + "router/selected_tokens_s0": 4285.75, + "step": 6660, + "tokens_trained": 21.819004112 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.891851641727537, + "grad_norm": 0.5424520969390869, + "loss": 1.0179, + "loss_ce": 0.978352963924408, + "loss_region": 0.030008796602487564, + "loss_total": 1.00836181640625, + "lr": 0.0009737332065538909, + "router/selected_tokens_s0": 4390.5, + "step": 6670, + "tokens_trained": 21.851769552 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.8946883199773066, + "grad_norm": 0.30934035778045654, + "loss": 1.0153, + "loss_ce": 1.080556035041809, + "loss_region": 0.030008671805262566, + "loss_total": 1.1105647087097168, + "lr": 0.0009733263145700399, + "router/selected_tokens_s0": 4396.125, + "step": 6680, + "tokens_trained": 21.884534992 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.897524998227076, + "grad_norm": 0.2793532907962799, + "loss": 1.0172, + "loss_ce": 0.9416234493255615, + "loss_region": 0.03001220151782036, + "loss_total": 0.971635639667511, + "lr": 0.0009729194225861888, + "router/selected_tokens_s0": 4390.875, + "step": 6690, + "tokens_trained": 21.917300432 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9003616764768456, + "grad_norm": 0.3333662152290344, + "loss": 1.014, + "loss_ce": 1.1505012512207031, + "loss_region": 0.030008589848876, + "loss_total": 1.1805098056793213, + "lr": 0.0009725125306023378, + "router/selected_tokens_s0": 4385.5, + "step": 6700, + "tokens_trained": 21.950065872 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9031983547266151, + "grad_norm": 0.6804750561714172, + "loss": 1.0246, + "loss_ce": 0.9861887693405151, + "loss_region": 0.030010070651769638, + "loss_total": 1.0161988735198975, + "lr": 0.0009721056386184868, + "router/selected_tokens_s0": 4425.25, + "step": 6710, + "tokens_trained": 21.982831312 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9060350329763847, + "grad_norm": 0.41757962107658386, + "loss": 1.027, + "loss_ce": 0.9903926253318787, + "loss_region": 0.029996752738952637, + "loss_total": 1.0203893184661865, + "lr": 0.0009716987466346357, + "router/selected_tokens_s0": 4325.5, + "step": 6720, + "tokens_trained": 22.015596752 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9088717112261542, + "grad_norm": 0.3082675039768219, + "loss": 1.0152, + "loss_ce": 1.0476970672607422, + "loss_region": 0.030004357919096947, + "loss_total": 1.077701449394226, + "lr": 0.0009712918546507846, + "router/selected_tokens_s0": 4322.875, + "step": 6730, + "tokens_trained": 22.048362192 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9117083894759237, + "grad_norm": 0.33058109879493713, + "loss": 1.0084, + "loss_ce": 1.134926199913025, + "loss_region": 0.030010567978024483, + "loss_total": 1.1649367809295654, + "lr": 0.0009708849626669337, + "router/selected_tokens_s0": 4403.125, + "step": 6740, + "tokens_trained": 22.081127632 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9145450677256932, + "grad_norm": 0.244770810008049, + "loss": 1.0176, + "loss_ce": 1.0867021083831787, + "loss_region": 0.030003776773810387, + "loss_total": 1.1167058944702148, + "lr": 0.0009704780706830826, + "router/selected_tokens_s0": 4347.125, + "step": 6750, + "tokens_trained": 22.113893072 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9173817459754627, + "grad_norm": 0.6021504998207092, + "loss": 1.0237, + "loss_ce": 0.8549233675003052, + "loss_region": 0.029998784884810448, + "loss_total": 0.8849221467971802, + "lr": 0.0009700711786992316, + "router/selected_tokens_s0": 4271.875, + "step": 6760, + "tokens_trained": 22.146655448 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9202184242252323, + "grad_norm": 0.4463154673576355, + "loss": 1.0189, + "loss_ce": 0.9287962913513184, + "loss_region": 0.029997825622558594, + "loss_total": 0.958794116973877, + "lr": 0.0009696642867153806, + "router/selected_tokens_s0": 4255.5, + "step": 6770, + "tokens_trained": 22.179420888 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9230551024750018, + "grad_norm": 0.46010589599609375, + "loss": 1.0311, + "loss_ce": 0.9898989200592041, + "loss_region": 0.029999876394867897, + "loss_total": 1.019898772239685, + "lr": 0.0009692573947315295, + "router/selected_tokens_s0": 4311.125, + "step": 6780, + "tokens_trained": 22.212186328 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9258917807247713, + "grad_norm": 0.490548700094223, + "loss": 1.0194, + "loss_ce": 1.0876526832580566, + "loss_region": 0.030008064582943916, + "loss_total": 1.1176607608795166, + "lr": 0.0009688505027476785, + "router/selected_tokens_s0": 4357.0, + "step": 6790, + "tokens_trained": 22.244951768 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9287284589745408, + "grad_norm": 0.2691711485385895, + "loss": 1.0177, + "loss_ce": 0.9520583152770996, + "loss_region": 0.030003707855939865, + "loss_total": 0.982062041759491, + "lr": 0.0009684436107638276, + "router/selected_tokens_s0": 4321.125, + "step": 6800, + "tokens_trained": 22.277717208 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9315651372243103, + "grad_norm": 0.4872676730155945, + "loss": 1.0176, + "loss_ce": 0.9619548916816711, + "loss_region": 0.03000466339290142, + "loss_total": 0.9919595718383789, + "lr": 0.0009680367187799765, + "router/selected_tokens_s0": 4401.875, + "step": 6810, + "tokens_trained": 22.310482648 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9344018154740799, + "grad_norm": 0.3577340543270111, + "loss": 1.0207, + "loss_ce": 1.1256054639816284, + "loss_region": 0.03000939078629017, + "loss_total": 1.1556148529052734, + "lr": 0.0009676298267961255, + "router/selected_tokens_s0": 4371.5, + "step": 6820, + "tokens_trained": 22.343246488 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9372384937238494, + "grad_norm": 0.343302458524704, + "loss": 1.0216, + "loss_ce": 1.022916316986084, + "loss_region": 0.03001173585653305, + "loss_total": 1.05292809009552, + "lr": 0.0009672229348122745, + "router/selected_tokens_s0": 4398.0, + "step": 6830, + "tokens_trained": 22.376011928 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.940075171973619, + "grad_norm": 0.26703327894210815, + "loss": 1.023, + "loss_ce": 0.8495204448699951, + "loss_region": 0.03000800684094429, + "loss_total": 0.8795284628868103, + "lr": 0.0009668160428284234, + "router/selected_tokens_s0": 4344.25, + "step": 6840, + "tokens_trained": 22.408777368 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9429118502233884, + "grad_norm": 0.20303326845169067, + "loss": 1.0249, + "loss_ce": 1.0567682981491089, + "loss_region": 0.03000817447900772, + "loss_total": 1.0867764949798584, + "lr": 0.0009664091508445723, + "router/selected_tokens_s0": 4356.5, + "step": 6850, + "tokens_trained": 22.441542808 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.945748528473158, + "grad_norm": 0.32166531682014465, + "loss": 1.026, + "loss_ce": 0.8659758567810059, + "loss_region": 0.030001448467373848, + "loss_total": 0.8959773182868958, + "lr": 0.0009660022588607213, + "router/selected_tokens_s0": 4328.625, + "step": 6860, + "tokens_trained": 22.474308248 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9485852067229275, + "grad_norm": 0.33950793743133545, + "loss": 1.0135, + "loss_ce": 0.8396299481391907, + "loss_region": 0.02999655157327652, + "loss_total": 0.869626522064209, + "lr": 0.0009655953668768703, + "router/selected_tokens_s0": 4286.5, + "step": 6870, + "tokens_trained": 22.507073688 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.951421884972697, + "grad_norm": 0.5167945623397827, + "loss": 1.0174, + "loss_ce": 0.9886255264282227, + "loss_region": 0.029992789030075073, + "loss_total": 1.0186183452606201, + "lr": 0.0009651884748930193, + "router/selected_tokens_s0": 4255.75, + "step": 6880, + "tokens_trained": 22.539837536 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9542585632224665, + "grad_norm": 0.4705487787723541, + "loss": 1.0276, + "loss_ce": 0.9448988437652588, + "loss_region": 0.029985623434185982, + "loss_total": 0.9748844504356384, + "lr": 0.0009647815829091683, + "router/selected_tokens_s0": 4308.625, + "step": 6890, + "tokens_trained": 22.572602976 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.957095241472236, + "grad_norm": 0.305258184671402, + "loss": 1.0224, + "loss_ce": 1.0831265449523926, + "loss_region": 0.0300028957426548, + "loss_total": 1.1131294965744019, + "lr": 0.0009643746909253172, + "router/selected_tokens_s0": 4364.5, + "step": 6900, + "tokens_trained": 22.605367616 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9599319197220055, + "grad_norm": 0.21500487625598907, + "loss": 1.0138, + "loss_ce": 0.8985041975975037, + "loss_region": 0.030009577050805092, + "loss_total": 0.928513765335083, + "lr": 0.0009639677989414662, + "router/selected_tokens_s0": 4287.0, + "step": 6910, + "tokens_trained": 22.638133056 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.962768597971775, + "grad_norm": 0.3461516499519348, + "loss": 1.0155, + "loss_ce": 0.9311016798019409, + "loss_region": 0.030005477368831635, + "loss_total": 0.9611071348190308, + "lr": 0.0009635609069576153, + "router/selected_tokens_s0": 4329.875, + "step": 6920, + "tokens_trained": 22.670898496 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9656052762215446, + "grad_norm": 0.3964892029762268, + "loss": 1.0248, + "loss_ce": 1.0959316492080688, + "loss_region": 0.030009768903255463, + "loss_total": 1.1259413957595825, + "lr": 0.0009631540149737642, + "router/selected_tokens_s0": 4374.875, + "step": 6930, + "tokens_trained": 22.703663936 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.968441954471314, + "grad_norm": 0.3205887973308563, + "loss": 1.0163, + "loss_ce": 1.1087725162506104, + "loss_region": 0.030007241293787956, + "loss_total": 1.1387797594070435, + "lr": 0.0009627471229899132, + "router/selected_tokens_s0": 4352.75, + "step": 6940, + "tokens_trained": 22.736429376 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9712786327210836, + "grad_norm": 0.33910778164863586, + "loss": 1.0218, + "loss_ce": 1.038806438446045, + "loss_region": 0.030002577230334282, + "loss_total": 1.0688090324401855, + "lr": 0.0009623402310060622, + "router/selected_tokens_s0": 4360.625, + "step": 6950, + "tokens_trained": 22.769194816 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9741153109708531, + "grad_norm": 0.2701500952243805, + "loss": 1.0235, + "loss_ce": 1.1355714797973633, + "loss_region": 0.03000757098197937, + "loss_total": 1.165579080581665, + "lr": 0.0009619333390222111, + "router/selected_tokens_s0": 4395.5, + "step": 6960, + "tokens_trained": 22.801960256 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9769519892206227, + "grad_norm": 0.5333274006843567, + "loss": 1.0296, + "loss_ce": 0.912113606929779, + "loss_region": 0.03000345081090927, + "loss_total": 0.9421170353889465, + "lr": 0.00096152644703836, + "router/selected_tokens_s0": 4313.75, + "step": 6970, + "tokens_trained": 22.834725696 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9797886674703922, + "grad_norm": 0.26877132058143616, + "loss": 1.016, + "loss_ce": 1.008344292640686, + "loss_region": 0.030002953484654427, + "loss_total": 1.0383472442626953, + "lr": 0.0009611195550545089, + "router/selected_tokens_s0": 4327.25, + "step": 6980, + "tokens_trained": 22.867491136 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9826253457201617, + "grad_norm": 0.19557499885559082, + "loss": 1.0284, + "loss_ce": 0.8492376804351807, + "loss_region": 0.029998069629073143, + "loss_total": 0.8792357444763184, + "lr": 0.000960712663070658, + "router/selected_tokens_s0": 4274.0, + "step": 6990, + "tokens_trained": 22.900255448 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9854620239699312, + "grad_norm": 0.26989954710006714, + "loss": 1.0176, + "loss_ce": 1.0254831314086914, + "loss_region": 0.02999790757894516, + "loss_total": 1.0554810762405396, + "lr": 0.000960305771086807, + "router/selected_tokens_s0": 4297.0, + "step": 7000, + "tokens_trained": 22.933020888 + }, + { + "epoch": 1.9854620239699312, + "eval_ppl": 2.709992797695115, + "eval_runtime": 2.4894, + "step": 7000, + "tokens_trained": 22.933020888 + }, + { + "epoch": 1.9854620239699312, + "eval_F": 0.33996839087292824, + "eval_F_cds": 0.34227911151361273, + "eval_F_dig": 0.33067687579758837, + "eval_F_exon": 0.3433576889440573, + "eval_F_intron": 0.3404298540324931, + "eval_F_nig": 0.34082371131874434, + "eval_F_promoter": 0.3365449296781681, + "eval_F_utr": 0.3413132168424294, + "eval_G": 0.3426802844372926, + "eval_G_cds": 0.3440049449497053, + "eval_G_dig": 0.3962462684259187, + "eval_G_exon": 0.34246664143827404, + "eval_G_intron": 0.3416454998246621, + "eval_G_nig": 0.3411694445581286, + "eval_G_promoter": 0.34569645663780835, + "eval_G_utr": 0.3421408631456393, + "eval_avg_bp_per_token": 2.9414499313666345, + "eval_bp_per_token/cds": 2.9215922513583745, + "eval_bp_per_token/dig": 3.0241001811451524, + "eval_bp_per_token/exon": 2.9124147563881357, + "eval_bp_per_token/intron": 2.937462705325934, + "eval_bp_per_token/nig": 2.934068161310474, + "eval_bp_per_token/promoter": 2.971371462812653, + "eval_bp_per_token/utr": 2.9298601714027965, + "eval_ppl_cds": 3.081433580837444, + "eval_ppl_dig": 1.0774753829843062, + "eval_ppl_exon": 3.201283955843817, + "eval_ppl_intron": 2.7913113527725355, + "eval_ppl_nig": 2.56843067695636, + "eval_ppl_promoter": 2.8156228133154633, + "eval_ppl_utr": 3.0477104866485876, + "step": 7000, + "tokens_trained": 22.933020888 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9882987022197007, + "grad_norm": 0.2712688148021698, + "loss": 1.0252, + "loss_ce": 1.0077736377716064, + "loss_region": 0.030003046616911888, + "loss_total": 1.0377767086029053, + "lr": 0.000959898879102956, + "router/selected_tokens_s0": 4346.875, + "step": 7010, + "tokens_trained": 22.965785528 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9911353804694703, + "grad_norm": 0.20418202877044678, + "loss": 1.0258, + "loss_ce": 1.10317862033844, + "loss_region": 0.030004605650901794, + "loss_total": 1.133183240890503, + "lr": 0.0009594919871191049, + "router/selected_tokens_s0": 4334.375, + "step": 7020, + "tokens_trained": 22.998550968 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9939720587192398, + "grad_norm": 0.3529861271381378, + "loss": 1.0173, + "loss_ce": 0.897994339466095, + "loss_region": 0.030005095526576042, + "loss_total": 0.9279994368553162, + "lr": 0.0009590850951352539, + "router/selected_tokens_s0": 4333.375, + "step": 7030, + "tokens_trained": 23.031316408 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9968087369690093, + "grad_norm": 0.334820032119751, + "loss": 1.0226, + "loss_ce": 0.9357202053070068, + "loss_region": 0.029994290322065353, + "loss_total": 0.9657145142555237, + "lr": 0.0009586782031514028, + "router/selected_tokens_s0": 4254.25, + "step": 7040, + "tokens_trained": 23.064081848 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 1.9996454152187788, + "grad_norm": 0.41582390666007996, + "loss": 1.0209, + "loss_ce": 0.9339370727539062, + "loss_region": 0.0300084687769413, + "loss_total": 0.9639455676078796, + "lr": 0.0009582713111675519, + "router/selected_tokens_s0": 4347.25, + "step": 7050, + "tokens_trained": 23.096847288 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.0022693425998157, + "grad_norm": 0.5205407738685608, + "loss": 1.0096, + "loss_ce": 0.9696592688560486, + "loss_region": 0.03001180849969387, + "loss_total": 0.9996711015701294, + "lr": 0.0009578644191837009, + "router/selected_tokens_s0": 4385.25, + "step": 7060, + "tokens_trained": 23.12715532 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.0051060208495852, + "grad_norm": 0.280998170375824, + "loss": 1.0155, + "loss_ce": 0.9050310254096985, + "loss_region": 0.030005203559994698, + "loss_total": 0.9350362420082092, + "lr": 0.0009574575271998498, + "router/selected_tokens_s0": 4360.125, + "step": 7070, + "tokens_trained": 23.15992076 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.0079426990993547, + "grad_norm": 0.3218519389629364, + "loss": 1.0045, + "loss_ce": 1.0558598041534424, + "loss_region": 0.030001701787114143, + "loss_total": 1.0858615636825562, + "lr": 0.0009570506352159988, + "router/selected_tokens_s0": 4294.5, + "step": 7080, + "tokens_trained": 23.1926862 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.0107793773491243, + "grad_norm": 0.3248656392097473, + "loss": 1.0152, + "loss_ce": 0.960863471031189, + "loss_region": 0.030006583780050278, + "loss_total": 0.9908700585365295, + "lr": 0.0009566437432321478, + "router/selected_tokens_s0": 4341.125, + "step": 7090, + "tokens_trained": 23.22545164 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.013616055598894, + "grad_norm": 0.4756578505039215, + "loss": 1.0161, + "loss_ce": 0.9680695533752441, + "loss_region": 0.030004283413290977, + "loss_total": 0.9980738162994385, + "lr": 0.0009562368512482966, + "router/selected_tokens_s0": 4319.0, + "step": 7100, + "tokens_trained": 23.25821708 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.0164527338486633, + "grad_norm": 0.20847953855991364, + "loss": 1.0168, + "loss_ce": 0.9128162264823914, + "loss_region": 0.030003240332007408, + "loss_total": 0.9428194761276245, + "lr": 0.0009558299592644456, + "router/selected_tokens_s0": 4317.625, + "step": 7110, + "tokens_trained": 23.29098252 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.019289412098433, + "grad_norm": 0.0922790989279747, + "loss": 1.0155, + "loss_ce": 1.0814945697784424, + "loss_region": 0.030005749315023422, + "loss_total": 1.1115002632141113, + "lr": 0.0009554230672805947, + "router/selected_tokens_s0": 4327.5, + "step": 7120, + "tokens_trained": 23.32374796 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.0221260903482023, + "grad_norm": 0.21533632278442383, + "loss": 1.0068, + "loss_ce": 0.9197317957878113, + "loss_region": 0.030004169791936874, + "loss_total": 0.9497359395027161, + "lr": 0.0009550161752967436, + "router/selected_tokens_s0": 4351.25, + "step": 7130, + "tokens_trained": 23.3565134 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.024962768597972, + "grad_norm": 0.3035869300365448, + "loss": 1.0178, + "loss_ce": 0.9156540036201477, + "loss_region": 0.030003761872649193, + "loss_total": 0.9456577897071838, + "lr": 0.0009546092833128926, + "router/selected_tokens_s0": 4343.625, + "step": 7140, + "tokens_trained": 23.38927804 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.0277994468477414, + "grad_norm": 0.35370907187461853, + "loss": 1.0047, + "loss_ce": 0.9257456064224243, + "loss_region": 0.03000473417341709, + "loss_total": 0.9557503461837769, + "lr": 0.0009542023913290416, + "router/selected_tokens_s0": 4342.375, + "step": 7150, + "tokens_trained": 23.42204348 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.030636125097511, + "grad_norm": 0.6606413722038269, + "loss": 0.9998, + "loss_ce": 1.0529682636260986, + "loss_region": 0.030001679435372353, + "loss_total": 1.0829699039459229, + "lr": 0.0009537954993451905, + "router/selected_tokens_s0": 4324.75, + "step": 7160, + "tokens_trained": 23.45480892 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.0334728033472804, + "grad_norm": 0.37469908595085144, + "loss": 1.0071, + "loss_ce": 1.0678061246871948, + "loss_region": 0.030009904876351357, + "loss_total": 1.097815990447998, + "lr": 0.0009533886073613396, + "router/selected_tokens_s0": 4383.5, + "step": 7170, + "tokens_trained": 23.48757436 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.03630948159705, + "grad_norm": 0.16916786134243011, + "loss": 1.0064, + "loss_ce": 0.9398297071456909, + "loss_region": 0.030004480853676796, + "loss_total": 0.9698342084884644, + "lr": 0.0009529817153774886, + "router/selected_tokens_s0": 4354.125, + "step": 7180, + "tokens_trained": 23.520339 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.0391461598468195, + "grad_norm": 0.5720134973526001, + "loss": 1.0081, + "loss_ce": 0.9152596592903137, + "loss_region": 0.03000476211309433, + "loss_total": 0.9452643990516663, + "lr": 0.0009525748233936375, + "router/selected_tokens_s0": 4345.25, + "step": 7190, + "tokens_trained": 23.55310444 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.041982838096589, + "grad_norm": 0.15418080985546112, + "loss": 1.0203, + "loss_ce": 0.9774296283721924, + "loss_region": 0.02999977394938469, + "loss_total": 1.0074293613433838, + "lr": 0.0009521679314097865, + "router/selected_tokens_s0": 4293.375, + "step": 7200, + "tokens_trained": 23.58586988 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.0448195163463585, + "grad_norm": 0.27360469102859497, + "loss": 1.0172, + "loss_ce": 1.078895092010498, + "loss_region": 0.03000294603407383, + "loss_total": 1.1088980436325073, + "lr": 0.0009517610394259355, + "router/selected_tokens_s0": 4340.75, + "step": 7210, + "tokens_trained": 23.61863532 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.047656194596128, + "grad_norm": 0.18781284987926483, + "loss": 1.0159, + "loss_ce": 1.0002025365829468, + "loss_region": 0.0300070159137249, + "loss_total": 1.0302095413208008, + "lr": 0.0009513541474420843, + "router/selected_tokens_s0": 4363.625, + "step": 7220, + "tokens_trained": 23.65140076 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.0504928728458975, + "grad_norm": 0.2986643612384796, + "loss": 1.0099, + "loss_ce": 1.0988267660140991, + "loss_region": 0.030009916052222252, + "loss_total": 1.1288366317749023, + "lr": 0.0009509472554582333, + "router/selected_tokens_s0": 4384.25, + "step": 7230, + "tokens_trained": 23.6841662 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.053329551095667, + "grad_norm": 0.3949977159500122, + "loss": 1.0139, + "loss_ce": 0.8460358381271362, + "loss_region": 0.03000258095562458, + "loss_total": 0.8760384321212769, + "lr": 0.0009505403634743824, + "router/selected_tokens_s0": 4336.875, + "step": 7240, + "tokens_trained": 23.71693004 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.0561662293454366, + "grad_norm": 0.13388489186763763, + "loss": 1.0099, + "loss_ce": 1.0630372762680054, + "loss_region": 0.03001607023179531, + "loss_total": 1.0930533409118652, + "lr": 0.0009501334714905313, + "router/selected_tokens_s0": 4411.625, + "step": 7250, + "tokens_trained": 23.74969548 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.059002907595206, + "grad_norm": 0.3381938934326172, + "loss": 1.0098, + "loss_ce": 1.0139682292938232, + "loss_region": 0.030003046616911888, + "loss_total": 1.043971300125122, + "lr": 0.0009497265795066803, + "router/selected_tokens_s0": 4322.875, + "step": 7260, + "tokens_trained": 23.78246092 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.0618395858449756, + "grad_norm": 0.12383253872394562, + "loss": 1.0046, + "loss_ce": 1.0764480829238892, + "loss_region": 0.030012380331754684, + "loss_total": 1.106460452079773, + "lr": 0.0009493196875228293, + "router/selected_tokens_s0": 4350.75, + "step": 7270, + "tokens_trained": 23.81522636 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.064676264094745, + "grad_norm": 0.30957257747650146, + "loss": 1.0084, + "loss_ce": 1.047568678855896, + "loss_region": 0.030002739280462265, + "loss_total": 1.0775713920593262, + "lr": 0.0009489127955389782, + "router/selected_tokens_s0": 4290.125, + "step": 7280, + "tokens_trained": 23.8479918 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.0675129423445147, + "grad_norm": 0.33209699392318726, + "loss": 1.0139, + "loss_ce": 0.8992984294891357, + "loss_region": 0.030000081285834312, + "loss_total": 0.9292985200881958, + "lr": 0.0009485059035551272, + "router/selected_tokens_s0": 4354.5, + "step": 7290, + "tokens_trained": 23.88075724 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.070349620594284, + "grad_norm": 0.42607948184013367, + "loss": 1.02, + "loss_ce": 1.000959038734436, + "loss_region": 0.03000684455037117, + "loss_total": 1.0309659242630005, + "lr": 0.0009480990115712763, + "router/selected_tokens_s0": 4360.875, + "step": 7300, + "tokens_trained": 23.91352268 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.0731862988440537, + "grad_norm": 0.23813225328922272, + "loss": 1.0122, + "loss_ce": 0.9922530055046082, + "loss_region": 0.030007174238562584, + "loss_total": 1.0222601890563965, + "lr": 0.0009476921195874252, + "router/selected_tokens_s0": 4337.0, + "step": 7310, + "tokens_trained": 23.94628812 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.076022977093823, + "grad_norm": 0.26517459750175476, + "loss": 1.0079, + "loss_ce": 1.1117208003997803, + "loss_region": 0.030004991218447685, + "loss_total": 1.141725778579712, + "lr": 0.0009472852276035742, + "router/selected_tokens_s0": 4337.25, + "step": 7320, + "tokens_trained": 23.97905276 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.0788596553435927, + "grad_norm": 0.47756844758987427, + "loss": 1.0066, + "loss_ce": 0.8415612578392029, + "loss_region": 0.02998754195868969, + "loss_total": 0.8715487718582153, + "lr": 0.0009468783356197232, + "router/selected_tokens_s0": 4282.875, + "step": 7330, + "tokens_trained": 24.0118182 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.0816963335933623, + "grad_norm": 0.27931317687034607, + "loss": 1.0111, + "loss_ce": 1.1385244131088257, + "loss_region": 0.030008604750037193, + "loss_total": 1.1685329675674438, + "lr": 0.0009464714436358721, + "router/selected_tokens_s0": 4387.875, + "step": 7340, + "tokens_trained": 24.04458364 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.0845330118431318, + "grad_norm": 0.3101511001586914, + "loss": 0.9966, + "loss_ce": 0.8754054307937622, + "loss_region": 0.030013345181941986, + "loss_total": 0.9054187536239624, + "lr": 0.000946064551652021, + "router/selected_tokens_s0": 4363.5, + "step": 7350, + "tokens_trained": 24.07734908 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.0873696900929013, + "grad_norm": 0.3301701843738556, + "loss": 1.0119, + "loss_ce": 0.8650031089782715, + "loss_region": 0.029985841363668442, + "loss_total": 0.8949889540672302, + "lr": 0.0009456576596681699, + "router/selected_tokens_s0": 4256.625, + "step": 7360, + "tokens_trained": 24.110113704 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.090206368342671, + "grad_norm": 0.26068466901779175, + "loss": 1.0052, + "loss_ce": 0.9636062979698181, + "loss_region": 0.030003955587744713, + "loss_total": 0.9936102628707886, + "lr": 0.000945250767684319, + "router/selected_tokens_s0": 4325.5, + "step": 7370, + "tokens_trained": 24.142879144 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.0930430465924403, + "grad_norm": 0.3576655685901642, + "loss": 1.0131, + "loss_ce": 1.0395320653915405, + "loss_region": 0.030007902532815933, + "loss_total": 1.069540023803711, + "lr": 0.000944843875700468, + "router/selected_tokens_s0": 4370.25, + "step": 7380, + "tokens_trained": 24.175643784 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.09587972484221, + "grad_norm": 0.35709458589553833, + "loss": 1.0068, + "loss_ce": 1.0109641551971436, + "loss_region": 0.030006196349859238, + "loss_total": 1.0409703254699707, + "lr": 0.000944436983716617, + "router/selected_tokens_s0": 4318.125, + "step": 7390, + "tokens_trained": 24.208409224 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.0987164030919794, + "grad_norm": 0.24661777913570404, + "loss": 0.9994, + "loss_ce": 0.9633923768997192, + "loss_region": 0.030005071312189102, + "loss_total": 0.9933974742889404, + "lr": 0.0009440300917327659, + "router/selected_tokens_s0": 4311.25, + "step": 7400, + "tokens_trained": 24.241174664 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.101553081341749, + "grad_norm": 0.4736558496952057, + "loss": 1.0129, + "loss_ce": 1.0214873552322388, + "loss_region": 0.0300043486058712, + "loss_total": 1.0514917373657227, + "lr": 0.0009436231997489149, + "router/selected_tokens_s0": 4337.625, + "step": 7410, + "tokens_trained": 24.273940104 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.1043897595915184, + "grad_norm": 0.23777659237384796, + "loss": 0.9994, + "loss_ce": 1.057599425315857, + "loss_region": 0.030008653178811073, + "loss_total": 1.0876080989837646, + "lr": 0.000943216307765064, + "router/selected_tokens_s0": 4344.375, + "step": 7420, + "tokens_trained": 24.306705544 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.107226437841288, + "grad_norm": 0.20976249873638153, + "loss": 1.007, + "loss_ce": 0.9605597853660583, + "loss_region": 0.030000170692801476, + "loss_total": 0.9905599355697632, + "lr": 0.0009428094157812129, + "router/selected_tokens_s0": 4314.5, + "step": 7430, + "tokens_trained": 24.339470184 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.1100631160910575, + "grad_norm": 0.3941101133823395, + "loss": 1.0143, + "loss_ce": 0.961480438709259, + "loss_region": 0.030003497377038002, + "loss_total": 0.9914839267730713, + "lr": 0.0009424025237973619, + "router/selected_tokens_s0": 4289.875, + "step": 7440, + "tokens_trained": 24.372235624 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.112899794340827, + "grad_norm": 0.08056443929672241, + "loss": 1.0057, + "loss_ce": 0.9109418392181396, + "loss_region": 0.030004508793354034, + "loss_total": 0.9409463405609131, + "lr": 0.0009419956318135108, + "router/selected_tokens_s0": 4307.125, + "step": 7450, + "tokens_trained": 24.404997 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.1157364725905965, + "grad_norm": 0.17271296679973602, + "loss": 1.0069, + "loss_ce": 1.0220129489898682, + "loss_region": 0.030005160719156265, + "loss_total": 1.052018165588379, + "lr": 0.0009415887398296598, + "router/selected_tokens_s0": 4345.125, + "step": 7460, + "tokens_trained": 24.43776244 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.118573150840366, + "grad_norm": 0.25527146458625793, + "loss": 1.014, + "loss_ce": 0.9348462224006653, + "loss_region": 0.030002281069755554, + "loss_total": 0.964848518371582, + "lr": 0.0009411818478458087, + "router/selected_tokens_s0": 4343.875, + "step": 7470, + "tokens_trained": 24.47052788 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.1214098290901355, + "grad_norm": 0.17030328512191772, + "loss": 1.0024, + "loss_ce": 0.9565524458885193, + "loss_region": 0.030008699744939804, + "loss_total": 0.986561119556427, + "lr": 0.0009407749558619576, + "router/selected_tokens_s0": 4346.25, + "step": 7480, + "tokens_trained": 24.50329252 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.124246507339905, + "grad_norm": 0.7695507407188416, + "loss": 1.0138, + "loss_ce": 1.0727932453155518, + "loss_region": 0.030004819855093956, + "loss_total": 1.1027981042861938, + "lr": 0.0009403680638781067, + "router/selected_tokens_s0": 4348.5, + "step": 7490, + "tokens_trained": 24.53605796 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.1270831855896746, + "grad_norm": 0.22502440214157104, + "loss": 1.009, + "loss_ce": 0.9865615963935852, + "loss_region": 0.030008045956492424, + "loss_total": 1.0165696144104004, + "lr": 0.0009399611718942557, + "router/selected_tokens_s0": 4383.125, + "step": 7500, + "tokens_trained": 24.5688234 + }, + { + "epoch": 2.1270831855896746, + "eval_ppl": 2.6909390388741863, + "eval_runtime": 2.4761, + "step": 7500, + "tokens_trained": 24.5688234 + }, + { + "epoch": 2.1270831855896746, + "eval_F": 0.33846436712107264, + "eval_F_cds": 0.34180203928164243, + "eval_F_dig": 0.3347136495038675, + "eval_F_exon": 0.34256777724490084, + "eval_F_intron": 0.3388297929749957, + "eval_F_nig": 0.33910811920762923, + "eval_F_promoter": 0.3351262239157689, + "eval_F_utr": 0.3399960656522892, + "eval_G": 0.3431427257288273, + "eval_G_cds": 0.3455253442096369, + "eval_G_dig": 0.3956906016420554, + "eval_G_exon": 0.34362459270178014, + "eval_G_intron": 0.3421316768327175, + "eval_G_nig": 0.34148599859667783, + "eval_G_promoter": 0.34604106305272764, + "eval_G_utr": 0.34290347711342894, + "eval_avg_bp_per_token": 2.9545207624242713, + "eval_bp_per_token/cds": 2.9256700811431005, + "eval_bp_per_token/dig": 2.9876283846872083, + "eval_bp_per_token/exon": 2.919130363172198, + "eval_bp_per_token/intron": 2.9513343298999564, + "eval_bp_per_token/nig": 2.9489119940172226, + "eval_bp_per_token/promoter": 2.9839503107681042, + "eval_bp_per_token/utr": 2.941210505131817, + "eval_ppl_cds": 3.011458379011898, + "eval_ppl_dig": 1.077258019472255, + "eval_ppl_exon": 3.1817092111368908, + "eval_ppl_intron": 2.7765117404298523, + "eval_ppl_nig": 2.5592860334875858, + "eval_ppl_promoter": 2.760719843556623, + "eval_ppl_utr": 3.023992301412559, + "step": 7500, + "tokens_trained": 24.5688234 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.129919863839444, + "grad_norm": 0.37802258133888245, + "loss": 1.0073, + "loss_ce": 0.9940395355224609, + "loss_region": 0.030000092461705208, + "loss_total": 1.024039626121521, + "lr": 0.0009395542799104046, + "router/selected_tokens_s0": 4281.75, + "step": 7510, + "tokens_trained": 24.60158804 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.1327565420892136, + "grad_norm": 0.30092689394950867, + "loss": 1.0019, + "loss_ce": 1.0793100595474243, + "loss_region": 0.03000376932322979, + "loss_total": 1.1093138456344604, + "lr": 0.0009391473879265536, + "router/selected_tokens_s0": 4334.0, + "step": 7520, + "tokens_trained": 24.634352656 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.135593220338983, + "grad_norm": 0.16765359044075012, + "loss": 1.0072, + "loss_ce": 0.9504039287567139, + "loss_region": 0.030000116676092148, + "loss_total": 0.9804040193557739, + "lr": 0.0009387404959427026, + "router/selected_tokens_s0": 4295.75, + "step": 7530, + "tokens_trained": 24.66711596 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.1384298985887527, + "grad_norm": 0.22500154376029968, + "loss": 1.0115, + "loss_ce": 0.9912354350090027, + "loss_region": 0.030008718371391296, + "loss_total": 1.0212441682815552, + "lr": 0.0009383336039588515, + "router/selected_tokens_s0": 4341.0, + "step": 7540, + "tokens_trained": 24.6998814 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.141266576838522, + "grad_norm": 0.3195549547672272, + "loss": 1.0078, + "loss_ce": 0.9803912043571472, + "loss_region": 0.030008045956492424, + "loss_total": 1.0103992223739624, + "lr": 0.0009379267119750006, + "router/selected_tokens_s0": 4358.75, + "step": 7550, + "tokens_trained": 24.732646072 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.1441032550882917, + "grad_norm": 0.29788845777511597, + "loss": 1.0076, + "loss_ce": 1.0669872760772705, + "loss_region": 0.02999989315867424, + "loss_total": 1.0969871282577515, + "lr": 0.0009375198199911496, + "router/selected_tokens_s0": 4306.125, + "step": 7560, + "tokens_trained": 24.765403184 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.146939933338061, + "grad_norm": 0.4390762150287628, + "loss": 1.0125, + "loss_ce": 0.9829087853431702, + "loss_region": 0.03000224567949772, + "loss_total": 1.012911081314087, + "lr": 0.0009371129280072985, + "router/selected_tokens_s0": 4340.125, + "step": 7570, + "tokens_trained": 24.798168624 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.1497766115878307, + "grad_norm": 0.3829552233219147, + "loss": 1.0151, + "loss_ce": 0.9999470114707947, + "loss_region": 0.03000633604824543, + "loss_total": 1.0299533605575562, + "lr": 0.0009367060360234475, + "router/selected_tokens_s0": 4363.5, + "step": 7580, + "tokens_trained": 24.830934064 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.1526132898376003, + "grad_norm": 0.39794671535491943, + "loss": 0.9928, + "loss_ce": 1.1021723747253418, + "loss_region": 0.030006568878889084, + "loss_total": 1.1321789026260376, + "lr": 0.0009362991440395965, + "router/selected_tokens_s0": 4346.875, + "step": 7590, + "tokens_trained": 24.863699504 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.1554499680873698, + "grad_norm": 0.4373714029788971, + "loss": 1.0016, + "loss_ce": 0.9409556984901428, + "loss_region": 0.03000253438949585, + "loss_total": 0.9709582328796387, + "lr": 0.0009358922520557453, + "router/selected_tokens_s0": 4310.0, + "step": 7600, + "tokens_trained": 24.896464928 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.1582866463371393, + "grad_norm": 0.19574929773807526, + "loss": 1.0104, + "loss_ce": 1.1051008701324463, + "loss_region": 0.03000916913151741, + "loss_total": 1.1351100206375122, + "lr": 0.0009354853600718943, + "router/selected_tokens_s0": 4365.625, + "step": 7610, + "tokens_trained": 24.929227968 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.161123324586909, + "grad_norm": 0.26452067494392395, + "loss": 1.0039, + "loss_ce": 1.055397629737854, + "loss_region": 0.030004659667611122, + "loss_total": 1.085402250289917, + "lr": 0.0009350784680880434, + "router/selected_tokens_s0": 4319.625, + "step": 7620, + "tokens_trained": 24.961993408 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.1639600028366783, + "grad_norm": 0.3438540995121002, + "loss": 1.0102, + "loss_ce": 1.0382965803146362, + "loss_region": 0.030003128573298454, + "loss_total": 1.068299651145935, + "lr": 0.0009346715761041923, + "router/selected_tokens_s0": 4360.375, + "step": 7630, + "tokens_trained": 24.994758848 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.166796681086448, + "grad_norm": 0.21197521686553955, + "loss": 1.0024, + "loss_ce": 1.1299232244491577, + "loss_region": 0.030007576569914818, + "loss_total": 1.1599308252334595, + "lr": 0.0009342646841203413, + "router/selected_tokens_s0": 4386.25, + "step": 7640, + "tokens_trained": 25.027524288 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.1696333593362174, + "grad_norm": 0.5655157566070557, + "loss": 1.0144, + "loss_ce": 0.8700268268585205, + "loss_region": 0.029996460303664207, + "loss_total": 0.9000232815742493, + "lr": 0.0009338577921364903, + "router/selected_tokens_s0": 4267.625, + "step": 7650, + "tokens_trained": 25.06028972 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.172470037585987, + "grad_norm": 0.1706627458333969, + "loss": 1.0027, + "loss_ce": 0.6285883188247681, + "loss_region": 0.03003131039440632, + "loss_total": 0.6586196422576904, + "lr": 0.0009334509001526392, + "router/selected_tokens_s0": 4184.0, + "step": 7660, + "tokens_trained": 25.09305516 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.1753067158357564, + "grad_norm": 0.2852633595466614, + "loss": 1.0049, + "loss_ce": 1.0811607837677002, + "loss_region": 0.0300059225410223, + "loss_total": 1.1111667156219482, + "lr": 0.0009330440081687883, + "router/selected_tokens_s0": 4344.25, + "step": 7670, + "tokens_trained": 25.125817728 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.178143394085526, + "grad_norm": 0.5574907660484314, + "loss": 1.0103, + "loss_ce": 1.127759337425232, + "loss_region": 0.030007967725396156, + "loss_total": 1.1577672958374023, + "lr": 0.0009326371161849373, + "router/selected_tokens_s0": 4343.75, + "step": 7680, + "tokens_trained": 25.158583168 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.1809800723352954, + "grad_norm": 0.2042626291513443, + "loss": 1.004, + "loss_ce": 1.0829006433486938, + "loss_region": 0.030003361403942108, + "loss_total": 1.1129039525985718, + "lr": 0.0009322302242010862, + "router/selected_tokens_s0": 4307.5, + "step": 7690, + "tokens_trained": 25.191341672 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.183816750585065, + "grad_norm": 0.3326032757759094, + "loss": 1.0021, + "loss_ce": 0.9698677659034729, + "loss_region": 0.030009783804416656, + "loss_total": 0.9998775720596313, + "lr": 0.0009318233322172352, + "router/selected_tokens_s0": 4387.5, + "step": 7700, + "tokens_trained": 25.224107112 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.1866534288348345, + "grad_norm": 0.290920615196228, + "loss": 1.0097, + "loss_ce": 0.9935495257377625, + "loss_region": 0.030008604750037193, + "loss_total": 1.0235581398010254, + "lr": 0.0009314164402333842, + "router/selected_tokens_s0": 4356.0, + "step": 7710, + "tokens_trained": 25.256872552 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.189490107084604, + "grad_norm": 0.33112096786499023, + "loss": 1.0114, + "loss_ce": 1.0084608793258667, + "loss_region": 0.03000568598508835, + "loss_total": 1.0384665727615356, + "lr": 0.000931009548249533, + "router/selected_tokens_s0": 4343.875, + "step": 7720, + "tokens_trained": 25.289637992 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.1923267853343735, + "grad_norm": 0.2671538293361664, + "loss": 1.0032, + "loss_ce": 1.067348837852478, + "loss_region": 0.03000442311167717, + "loss_total": 1.097353219985962, + "lr": 0.000930602656265682, + "router/selected_tokens_s0": 4322.875, + "step": 7730, + "tokens_trained": 25.322402632 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.195163463584143, + "grad_norm": 0.22132344543933868, + "loss": 1.0067, + "loss_ce": 0.9750442504882812, + "loss_region": 0.02999834716320038, + "loss_total": 1.005042552947998, + "lr": 0.000930195764281831, + "router/selected_tokens_s0": 4315.625, + "step": 7740, + "tokens_trained": 25.355165584 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.1980001418339126, + "grad_norm": 0.207183837890625, + "loss": 1.0103, + "loss_ce": 0.9280068278312683, + "loss_region": 0.03000536374747753, + "loss_total": 0.9580121636390686, + "lr": 0.00092978887229798, + "router/selected_tokens_s0": 4340.625, + "step": 7750, + "tokens_trained": 25.387931024 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.200836820083682, + "grad_norm": 0.28947076201438904, + "loss": 1.003, + "loss_ce": 0.9716025590896606, + "loss_region": 0.030001068487763405, + "loss_total": 1.001603603363037, + "lr": 0.000929381980314129, + "router/selected_tokens_s0": 4326.0, + "step": 7760, + "tokens_trained": 25.420695664 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.2036734983334516, + "grad_norm": 0.15499567985534668, + "loss": 1.0068, + "loss_ce": 0.9203482270240784, + "loss_region": 0.029997482895851135, + "loss_total": 0.9503456950187683, + "lr": 0.000928975088330278, + "router/selected_tokens_s0": 4285.5, + "step": 7770, + "tokens_trained": 25.453461104 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.206510176583221, + "grad_norm": 0.2202332764863968, + "loss": 1.0066, + "loss_ce": 1.0267713069915771, + "loss_region": 0.030008550733327866, + "loss_total": 1.0567798614501953, + "lr": 0.0009285681963464269, + "router/selected_tokens_s0": 4333.875, + "step": 7780, + "tokens_trained": 25.486226544 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.2093468548329906, + "grad_norm": 0.15923179686069489, + "loss": 1.0111, + "loss_ce": 0.7928053140640259, + "loss_region": 0.030006855726242065, + "loss_total": 0.8228121995925903, + "lr": 0.0009281613043625759, + "router/selected_tokens_s0": 4286.0, + "step": 7790, + "tokens_trained": 25.518991984 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.21218353308276, + "grad_norm": 0.25226891040802, + "loss": 1.0055, + "loss_ce": 0.7962496280670166, + "loss_region": 0.02999335527420044, + "loss_total": 0.826242983341217, + "lr": 0.000927754412378725, + "router/selected_tokens_s0": 4231.0, + "step": 7800, + "tokens_trained": 25.551757424 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.2150202113325297, + "grad_norm": 0.3380224406719208, + "loss": 1.0072, + "loss_ce": 0.9047214984893799, + "loss_region": 0.03000718727707863, + "loss_total": 0.9347286820411682, + "lr": 0.0009273475203948739, + "router/selected_tokens_s0": 4362.875, + "step": 7810, + "tokens_trained": 25.584522064 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.217856889582299, + "grad_norm": 0.37128975987434387, + "loss": 1.0153, + "loss_ce": 1.0808064937591553, + "loss_region": 0.030004477128386497, + "loss_total": 1.1108109951019287, + "lr": 0.0009269406284110229, + "router/selected_tokens_s0": 4316.25, + "step": 7820, + "tokens_trained": 25.617287344 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.2206935678320687, + "grad_norm": 0.17406021058559418, + "loss": 1.0048, + "loss_ce": 0.9988422989845276, + "loss_region": 0.030010806396603584, + "loss_total": 1.0288530588150024, + "lr": 0.0009265337364271718, + "router/selected_tokens_s0": 4391.125, + "step": 7830, + "tokens_trained": 25.650052784 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.2235302460818382, + "grad_norm": 0.2049620896577835, + "loss": 1.0075, + "loss_ce": 0.7676858305931091, + "loss_region": 0.030001988634467125, + "loss_total": 0.797687828540802, + "lr": 0.0009261268444433207, + "router/selected_tokens_s0": 4302.75, + "step": 7840, + "tokens_trained": 25.682818224 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.2263669243316078, + "grad_norm": 0.18191517889499664, + "loss": 1.0085, + "loss_ce": 1.0509260892868042, + "loss_region": 0.030002878978848457, + "loss_total": 1.080928921699524, + "lr": 0.0009257199524594697, + "router/selected_tokens_s0": 4337.5, + "step": 7850, + "tokens_trained": 25.715583664 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.2292036025813773, + "grad_norm": 0.2727009057998657, + "loss": 1.0092, + "loss_ce": 0.9303435683250427, + "loss_region": 0.03000258468091488, + "loss_total": 0.9603461623191833, + "lr": 0.0009253130604756186, + "router/selected_tokens_s0": 4319.25, + "step": 7860, + "tokens_trained": 25.748349104 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.232040280831147, + "grad_norm": 0.19896797835826874, + "loss": 1.0009, + "loss_ce": 0.8852936029434204, + "loss_region": 0.029992498457431793, + "loss_total": 0.915286123752594, + "lr": 0.0009249061684917677, + "router/selected_tokens_s0": 4218.0, + "step": 7870, + "tokens_trained": 25.781114544 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.2348769590809163, + "grad_norm": 0.3492950201034546, + "loss": 0.9967, + "loss_ce": 1.0309711694717407, + "loss_region": 0.030007202178239822, + "loss_total": 1.0609784126281738, + "lr": 0.0009244992765079167, + "router/selected_tokens_s0": 4382.375, + "step": 7880, + "tokens_trained": 25.813879984 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.237713637330686, + "grad_norm": 0.35697612166404724, + "loss": 1.0036, + "loss_ce": 0.9015291333198547, + "loss_region": 0.03001173585653305, + "loss_total": 0.931540846824646, + "lr": 0.0009240923845240656, + "router/selected_tokens_s0": 4383.75, + "step": 7890, + "tokens_trained": 25.846645424 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.2405503155804554, + "grad_norm": 0.4583146572113037, + "loss": 1.0029, + "loss_ce": 0.9004272222518921, + "loss_region": 0.029994402080774307, + "loss_total": 0.9304216504096985, + "lr": 0.0009236854925402146, + "router/selected_tokens_s0": 4259.875, + "step": 7900, + "tokens_trained": 25.879410864 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.243386993830225, + "grad_norm": 0.2985894978046417, + "loss": 1.0041, + "loss_ce": 1.007926344871521, + "loss_region": 0.03000694513320923, + "loss_total": 1.037933349609375, + "lr": 0.0009232786005563636, + "router/selected_tokens_s0": 4331.625, + "step": 7910, + "tokens_trained": 25.912176304 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.2462236720799944, + "grad_norm": 0.10321372747421265, + "loss": 0.9989, + "loss_ce": 1.0649851560592651, + "loss_region": 0.03000921942293644, + "loss_total": 1.0949944257736206, + "lr": 0.0009228717085725126, + "router/selected_tokens_s0": 4366.125, + "step": 7920, + "tokens_trained": 25.944941744 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.249060350329764, + "grad_norm": 0.32021722197532654, + "loss": 0.9993, + "loss_ce": 0.9634207487106323, + "loss_region": 0.030000248923897743, + "loss_total": 0.9934210181236267, + "lr": 0.0009224648165886616, + "router/selected_tokens_s0": 4320.375, + "step": 7930, + "tokens_trained": 25.977707184 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.2518970285795334, + "grad_norm": 0.16878224909305573, + "loss": 1.0061, + "loss_ce": 1.0446586608886719, + "loss_region": 0.030006736516952515, + "loss_total": 1.0746654272079468, + "lr": 0.0009220579246048106, + "router/selected_tokens_s0": 4315.5, + "step": 7940, + "tokens_trained": 26.010471824 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.254733706829303, + "grad_norm": 0.31431037187576294, + "loss": 1.0049, + "loss_ce": 0.9202573299407959, + "loss_region": 0.030004659667611122, + "loss_total": 0.9502620100975037, + "lr": 0.0009216510326209595, + "router/selected_tokens_s0": 4333.625, + "step": 7950, + "tokens_trained": 26.043237264 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.2575703850790725, + "grad_norm": 0.23674289882183075, + "loss": 1.0011, + "loss_ce": 1.0571632385253906, + "loss_region": 0.030006958171725273, + "loss_total": 1.0871702432632446, + "lr": 0.0009212441406371085, + "router/selected_tokens_s0": 4360.125, + "step": 7960, + "tokens_trained": 26.076002704 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.260407063328842, + "grad_norm": 0.25749471783638, + "loss": 1.0089, + "loss_ce": 0.9559653401374817, + "loss_region": 0.02999885194003582, + "loss_total": 0.9859641790390015, + "lr": 0.0009208372486532574, + "router/selected_tokens_s0": 4289.5, + "step": 7970, + "tokens_trained": 26.108768144 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.2632437415786115, + "grad_norm": 0.2684595286846161, + "loss": 1.0032, + "loss_ce": 1.0311172008514404, + "loss_region": 0.03000614605844021, + "loss_total": 1.0611233711242676, + "lr": 0.0009204303566694063, + "router/selected_tokens_s0": 4364.625, + "step": 7980, + "tokens_trained": 26.141533584 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.266080419828381, + "grad_norm": 0.07345940172672272, + "loss": 1.0067, + "loss_ce": 0.9582409858703613, + "loss_region": 0.030004076659679413, + "loss_total": 0.9882450699806213, + "lr": 0.0009200234646855554, + "router/selected_tokens_s0": 4327.125, + "step": 7990, + "tokens_trained": 26.174299024 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.2689170980781506, + "grad_norm": 0.48699265718460083, + "loss": 0.9958, + "loss_ce": 0.9360334873199463, + "loss_region": 0.02999790757894516, + "loss_total": 0.9660313725471497, + "lr": 0.0009196165727017044, + "router/selected_tokens_s0": 4286.75, + "step": 8000, + "tokens_trained": 26.207064464 + }, + { + "epoch": 2.2689170980781506, + "eval_ppl": 2.675763473474689, + "eval_runtime": 2.52, + "step": 8000, + "tokens_trained": 26.207064464 + }, + { + "epoch": 2.2689170980781506, + "eval_F": 0.33788475322460654, + "eval_F_cds": 0.3406942613870673, + "eval_F_dig": 0.33807328697554495, + "eval_F_exon": 0.34177786554574435, + "eval_F_intron": 0.33823463656141445, + "eval_F_nig": 0.338685135383212, + "eval_F_promoter": 0.3342786670935006, + "eval_F_utr": 0.33936314884663743, + "eval_G": 0.34392113488427467, + "eval_G_cds": 0.345890950081264, + "eval_G_dig": 0.39644647937989946, + "eval_G_exon": 0.34398976707302453, + "eval_G_intron": 0.3428512733718868, + "eval_G_nig": 0.34238171948129126, + "eval_G_promoter": 0.34701431891546575, + "eval_G_utr": 0.3434394217364158, + "eval_avg_bp_per_token": 2.9595890032222227, + "eval_bp_per_token/cds": 2.9351829876109554, + "eval_bp_per_token/dig": 2.957938525537324, + "eval_bp_per_token/exon": 2.925877011968634, + "eval_bp_per_token/intron": 2.9565274868543114, + "eval_bp_per_token/nig": 2.9525948898481484, + "eval_bp_per_token/promoter": 2.9915160566327477, + "eval_bp_per_token/utr": 2.9466959020111902, + "eval_ppl_cds": 2.946380276551898, + "eval_ppl_dig": 1.074921998987061, + "eval_ppl_exon": 3.1784720601991023, + "eval_ppl_intron": 2.767541640268946, + "eval_ppl_nig": 2.556881473041184, + "eval_ppl_promoter": 2.7027128267686855, + "eval_ppl_utr": 3.001400942095713, + "step": 8000, + "tokens_trained": 26.207064464 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.27175377632792, + "grad_norm": 0.07347661256790161, + "loss": 0.9965, + "loss_ce": 0.9630281925201416, + "loss_region": 0.030000781640410423, + "loss_total": 0.993028998374939, + "lr": 0.0009192096807178533, + "router/selected_tokens_s0": 4299.125, + "step": 8010, + "tokens_trained": 26.239829904 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.2745904545776896, + "grad_norm": 0.24807631969451904, + "loss": 1.0008, + "loss_ce": 1.1123920679092407, + "loss_region": 0.030006518587470055, + "loss_total": 1.1423985958099365, + "lr": 0.0009188027887340023, + "router/selected_tokens_s0": 4390.5, + "step": 8020, + "tokens_trained": 26.272595344 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.277427132827459, + "grad_norm": 0.3007449209690094, + "loss": 0.9956, + "loss_ce": 0.9147019386291504, + "loss_region": 0.030003497377038002, + "loss_total": 0.9447054266929626, + "lr": 0.0009183958967501513, + "router/selected_tokens_s0": 4319.625, + "step": 8030, + "tokens_trained": 26.305360784 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.2802638110772286, + "grad_norm": 0.21032722294330597, + "loss": 1.0008, + "loss_ce": 1.049250841140747, + "loss_region": 0.030001293867826462, + "loss_total": 1.0792521238327026, + "lr": 0.0009179890047663002, + "router/selected_tokens_s0": 4292.0, + "step": 8040, + "tokens_trained": 26.338126208 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.283100489326998, + "grad_norm": 0.3483218848705292, + "loss": 1.0073, + "loss_ce": 0.8617912530899048, + "loss_region": 0.03000951185822487, + "loss_total": 0.8918007612228394, + "lr": 0.0009175821127824493, + "router/selected_tokens_s0": 4351.5, + "step": 8050, + "tokens_trained": 26.370891648 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.2859371675767677, + "grad_norm": 0.2576548159122467, + "loss": 1.0056, + "loss_ce": 0.9780833125114441, + "loss_region": 0.029995353892445564, + "loss_total": 1.008078694343567, + "lr": 0.0009171752207985983, + "router/selected_tokens_s0": 4229.0, + "step": 8060, + "tokens_trained": 26.403656288 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.288773845826537, + "grad_norm": 0.2615581154823303, + "loss": 1.0011, + "loss_ce": 0.9198725819587708, + "loss_region": 0.029999352991580963, + "loss_total": 0.9498719573020935, + "lr": 0.0009167683288147472, + "router/selected_tokens_s0": 4287.375, + "step": 8070, + "tokens_trained": 26.436421728 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.2916105240763067, + "grad_norm": 0.4514448940753937, + "loss": 1.0006, + "loss_ce": 1.0151890516281128, + "loss_region": 0.03000083938241005, + "loss_total": 1.0451898574829102, + "lr": 0.0009163614368308962, + "router/selected_tokens_s0": 4293.0, + "step": 8080, + "tokens_trained": 26.469187168 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.2944472023260762, + "grad_norm": 0.26465368270874023, + "loss": 0.9959, + "loss_ce": 0.9038240313529968, + "loss_region": 0.029997428879141808, + "loss_total": 0.933821439743042, + "lr": 0.000915954544847045, + "router/selected_tokens_s0": 4314.375, + "step": 8090, + "tokens_trained": 26.501951008 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.2972838805758458, + "grad_norm": 0.31580230593681335, + "loss": 1.0092, + "loss_ce": 1.0212898254394531, + "loss_region": 0.03001166507601738, + "loss_total": 1.0513014793395996, + "lr": 0.000915547652863194, + "router/selected_tokens_s0": 4372.625, + "step": 8100, + "tokens_trained": 26.534716448 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.3001205588256153, + "grad_norm": 0.3395834267139435, + "loss": 0.9982, + "loss_ce": 1.0258086919784546, + "loss_region": 0.030008181929588318, + "loss_total": 1.055816888809204, + "lr": 0.000915140760879343, + "router/selected_tokens_s0": 4322.75, + "step": 8110, + "tokens_trained": 26.5674814 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.302957237075385, + "grad_norm": 0.14858978986740112, + "loss": 1.0075, + "loss_ce": 1.0804212093353271, + "loss_region": 0.030006686225533485, + "loss_total": 1.1104278564453125, + "lr": 0.000914733868895492, + "router/selected_tokens_s0": 4336.375, + "step": 8120, + "tokens_trained": 26.60024684 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.3057939153251543, + "grad_norm": 0.3444601893424988, + "loss": 0.9978, + "loss_ce": 1.0786794424057007, + "loss_region": 0.03001074679195881, + "loss_total": 1.1086901426315308, + "lr": 0.000914326976911641, + "router/selected_tokens_s0": 4365.625, + "step": 8130, + "tokens_trained": 26.633011272 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.308630593574924, + "grad_norm": 0.1070174053311348, + "loss": 1.0016, + "loss_ce": 0.9199082851409912, + "loss_region": 0.030010441318154335, + "loss_total": 0.9499187469482422, + "lr": 0.00091392008492779, + "router/selected_tokens_s0": 4326.0, + "step": 8140, + "tokens_trained": 26.665774 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.3114672718246934, + "grad_norm": 0.15402337908744812, + "loss": 1.0032, + "loss_ce": 0.9957820773124695, + "loss_region": 0.029999444261193275, + "loss_total": 1.025781512260437, + "lr": 0.000913513192943939, + "router/selected_tokens_s0": 4297.125, + "step": 8150, + "tokens_trained": 26.69853944 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.314303950074463, + "grad_norm": 0.14355799555778503, + "loss": 1.0022, + "loss_ce": 1.0118954181671143, + "loss_region": 0.030009998008608818, + "loss_total": 1.041905403137207, + "lr": 0.0009131063009600879, + "router/selected_tokens_s0": 4360.75, + "step": 8160, + "tokens_trained": 26.73130488 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.3171406283242324, + "grad_norm": 0.3245370090007782, + "loss": 1.0084, + "loss_ce": 0.954037070274353, + "loss_region": 0.030010370537638664, + "loss_total": 0.9840474128723145, + "lr": 0.000912699408976237, + "router/selected_tokens_s0": 4336.625, + "step": 8170, + "tokens_trained": 26.76407032 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.319977306574002, + "grad_norm": 0.32974427938461304, + "loss": 0.9997, + "loss_ce": 1.0400961637496948, + "loss_region": 0.03002246282994747, + "loss_total": 1.0701186656951904, + "lr": 0.000912292516992386, + "router/selected_tokens_s0": 4380.0, + "step": 8180, + "tokens_trained": 26.79683576 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.3228139848237714, + "grad_norm": 0.11554761976003647, + "loss": 1.0055, + "loss_ce": 1.099100947380066, + "loss_region": 0.030006272718310356, + "loss_total": 1.1291072368621826, + "lr": 0.0009118856250085349, + "router/selected_tokens_s0": 4365.125, + "step": 8190, + "tokens_trained": 26.8296012 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.325650663073541, + "grad_norm": 0.18515661358833313, + "loss": 1.0049, + "loss_ce": 1.0066508054733276, + "loss_region": 0.030001958832144737, + "loss_total": 1.0366528034210205, + "lr": 0.0009114787330246839, + "router/selected_tokens_s0": 4317.875, + "step": 8200, + "tokens_trained": 26.86236568 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.3284873413233105, + "grad_norm": 0.30867400765419006, + "loss": 1.001, + "loss_ce": 0.8267403244972229, + "loss_region": 0.02999831922352314, + "loss_total": 0.8567386269569397, + "lr": 0.0009110718410408328, + "router/selected_tokens_s0": 4291.75, + "step": 8210, + "tokens_trained": 26.89513112 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.33132401957308, + "grad_norm": 0.2077079713344574, + "loss": 0.9965, + "loss_ce": 0.9515328407287598, + "loss_region": 0.030003594234585762, + "loss_total": 0.9815364480018616, + "lr": 0.0009106649490569817, + "router/selected_tokens_s0": 4320.875, + "step": 8220, + "tokens_trained": 26.92789576 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.3341606978228495, + "grad_norm": 0.39926815032958984, + "loss": 1.0021, + "loss_ce": 0.9237360954284668, + "loss_region": 0.03000444732606411, + "loss_total": 0.9537405371665955, + "lr": 0.0009102580570731307, + "router/selected_tokens_s0": 4343.625, + "step": 8230, + "tokens_trained": 26.9606612 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.336997376072619, + "grad_norm": 0.15397949516773224, + "loss": 1.0012, + "loss_ce": 1.0071156024932861, + "loss_region": 0.029999390244483948, + "loss_total": 1.0371149778366089, + "lr": 0.0009098511650892797, + "router/selected_tokens_s0": 4284.75, + "step": 8240, + "tokens_trained": 26.99342664 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.3398340543223886, + "grad_norm": 0.24903906881809235, + "loss": 1.0044, + "loss_ce": 0.8528275489807129, + "loss_region": 0.029994992539286613, + "loss_total": 0.8828225135803223, + "lr": 0.0009094442731054287, + "router/selected_tokens_s0": 4289.625, + "step": 8250, + "tokens_trained": 27.02619208 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.342670732572158, + "grad_norm": 0.31778421998023987, + "loss": 1.0053, + "loss_ce": 0.8575053811073303, + "loss_region": 0.03000861033797264, + "loss_total": 0.8875139951705933, + "lr": 0.0009090373811215777, + "router/selected_tokens_s0": 4308.0, + "step": 8260, + "tokens_trained": 27.05895752 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.3455074108219276, + "grad_norm": 0.3042631149291992, + "loss": 0.9969, + "loss_ce": 0.9608312249183655, + "loss_region": 0.029996080324053764, + "loss_total": 0.9908273220062256, + "lr": 0.0009086304891377266, + "router/selected_tokens_s0": 4263.75, + "step": 8270, + "tokens_trained": 27.09172296 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.348344089071697, + "grad_norm": 0.18761594593524933, + "loss": 1.0052, + "loss_ce": 0.9143087863922119, + "loss_region": 0.030006544664502144, + "loss_total": 0.9443153142929077, + "lr": 0.0009082235971538756, + "router/selected_tokens_s0": 4355.25, + "step": 8280, + "tokens_trained": 27.1244884 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.3511807673214666, + "grad_norm": 0.4719885587692261, + "loss": 0.9938, + "loss_ce": 0.9472616314888, + "loss_region": 0.030000489205121994, + "loss_total": 0.9772621393203735, + "lr": 0.0009078167051700246, + "router/selected_tokens_s0": 4292.875, + "step": 8290, + "tokens_trained": 27.15725384 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.354017445571236, + "grad_norm": 0.21311889588832855, + "loss": 0.9984, + "loss_ce": 1.0406630039215088, + "loss_region": 0.02999855950474739, + "loss_total": 1.0706615447998047, + "lr": 0.0009074098131861736, + "router/selected_tokens_s0": 4271.375, + "step": 8300, + "tokens_trained": 27.19001928 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.3568541238210057, + "grad_norm": 0.14241135120391846, + "loss": 1.0015, + "loss_ce": 1.0369809865951538, + "loss_region": 0.03000539541244507, + "loss_total": 1.066986322402954, + "lr": 0.0009070029212023226, + "router/selected_tokens_s0": 4329.875, + "step": 8310, + "tokens_trained": 27.22278472 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.359690802070775, + "grad_norm": 0.4316507875919342, + "loss": 1.0024, + "loss_ce": 0.971124529838562, + "loss_region": 0.030004283413290977, + "loss_total": 1.0011287927627563, + "lr": 0.0009065960292184716, + "router/selected_tokens_s0": 4316.125, + "step": 8320, + "tokens_trained": 27.25555016 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.3625274803205447, + "grad_norm": 0.10433993488550186, + "loss": 0.996, + "loss_ce": 1.0387852191925049, + "loss_region": 0.03000986948609352, + "loss_total": 1.068795084953308, + "lr": 0.0009061891372346205, + "router/selected_tokens_s0": 4392.875, + "step": 8330, + "tokens_trained": 27.2883156 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.3653641585703142, + "grad_norm": 0.20729485154151917, + "loss": 0.9958, + "loss_ce": 0.9542734622955322, + "loss_region": 0.030002444982528687, + "loss_total": 0.9842759370803833, + "lr": 0.0009057822452507694, + "router/selected_tokens_s0": 4299.5, + "step": 8340, + "tokens_trained": 27.32108104 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.3682008368200838, + "grad_norm": 0.23723992705345154, + "loss": 0.9994, + "loss_ce": 0.967811107635498, + "loss_region": 0.030007444322109222, + "loss_total": 0.9978185296058655, + "lr": 0.0009053753532669184, + "router/selected_tokens_s0": 4321.125, + "step": 8350, + "tokens_trained": 27.35384648 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.3710375150698533, + "grad_norm": 0.2902890145778656, + "loss": 1.0006, + "loss_ce": 0.9341420531272888, + "loss_region": 0.030005110427737236, + "loss_total": 0.96414715051651, + "lr": 0.0009049684612830673, + "router/selected_tokens_s0": 4363.25, + "step": 8360, + "tokens_trained": 27.386609784 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.373874193319623, + "grad_norm": 0.30313265323638916, + "loss": 1.0053, + "loss_ce": 0.950921893119812, + "loss_region": 0.03000725992023945, + "loss_total": 0.9809291362762451, + "lr": 0.0009045615692992164, + "router/selected_tokens_s0": 4315.5, + "step": 8370, + "tokens_trained": 27.419375224 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.3767108715693923, + "grad_norm": 0.3925122618675232, + "loss": 1.0112, + "loss_ce": 0.8709821105003357, + "loss_region": 0.029991142451763153, + "loss_total": 0.9009732604026794, + "lr": 0.0009041546773153654, + "router/selected_tokens_s0": 4294.875, + "step": 8380, + "tokens_trained": 27.452140664 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.379547549819162, + "grad_norm": 0.23642219603061676, + "loss": 1.0018, + "loss_ce": 0.9585750699043274, + "loss_region": 0.030005360022187233, + "loss_total": 0.9885804057121277, + "lr": 0.0009037477853315143, + "router/selected_tokens_s0": 4336.0, + "step": 8390, + "tokens_trained": 27.484904504 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.3823842280689314, + "grad_norm": 0.30860647559165955, + "loss": 1.0078, + "loss_ce": 1.0180624723434448, + "loss_region": 0.030008690431714058, + "loss_total": 1.0480711460113525, + "lr": 0.0009033408933476633, + "router/selected_tokens_s0": 4350.375, + "step": 8400, + "tokens_trained": 27.517669944 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.385220906318701, + "grad_norm": 0.4122979938983917, + "loss": 1.0007, + "loss_ce": 0.9838353991508484, + "loss_region": 0.029999272897839546, + "loss_total": 1.0138347148895264, + "lr": 0.0009029340013638123, + "router/selected_tokens_s0": 4272.0, + "step": 8410, + "tokens_trained": 27.550435384 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.3880575845684704, + "grad_norm": 0.3279622197151184, + "loss": 1.0042, + "loss_ce": 0.8460020422935486, + "loss_region": 0.029992690309882164, + "loss_total": 0.8759947419166565, + "lr": 0.0009025271093799613, + "router/selected_tokens_s0": 4262.75, + "step": 8420, + "tokens_trained": 27.583198336 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.39089426281824, + "grad_norm": 0.23770715296268463, + "loss": 0.9991, + "loss_ce": 0.9167661070823669, + "loss_region": 0.03001309558749199, + "loss_total": 0.946779191493988, + "lr": 0.0009021202173961103, + "router/selected_tokens_s0": 4374.75, + "step": 8430, + "tokens_trained": 27.615963616 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.3937309410680094, + "grad_norm": 0.3547171652317047, + "loss": 1.006, + "loss_ce": 0.9971125721931458, + "loss_region": 0.030004054307937622, + "loss_total": 1.0271166563034058, + "lr": 0.0009017133254122593, + "router/selected_tokens_s0": 4341.5, + "step": 8440, + "tokens_trained": 27.648725192 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.396567619317779, + "grad_norm": 0.11001409590244293, + "loss": 1.0033, + "loss_ce": 0.9495263695716858, + "loss_region": 0.030003296211361885, + "loss_total": 0.9795296788215637, + "lr": 0.0009013064334284082, + "router/selected_tokens_s0": 4290.5, + "step": 8450, + "tokens_trained": 27.681490632 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.3994042975675485, + "grad_norm": 0.1604621708393097, + "loss": 0.9933, + "loss_ce": 0.866722822189331, + "loss_region": 0.030003976076841354, + "loss_total": 0.8967267870903015, + "lr": 0.0009008995414445572, + "router/selected_tokens_s0": 4313.875, + "step": 8460, + "tokens_trained": 27.714256072 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.402240975817318, + "grad_norm": 0.2528783082962036, + "loss": 1.0023, + "loss_ce": 1.0685746669769287, + "loss_region": 0.03000004217028618, + "loss_total": 1.0985747575759888, + "lr": 0.000900492649460706, + "router/selected_tokens_s0": 4294.0, + "step": 8470, + "tokens_trained": 27.747021512 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.4050776540670875, + "grad_norm": 0.2676202654838562, + "loss": 0.9918, + "loss_ce": 1.0423674583435059, + "loss_region": 0.03000633604824543, + "loss_total": 1.0723737478256226, + "lr": 0.000900085757476855, + "router/selected_tokens_s0": 4343.375, + "step": 8480, + "tokens_trained": 27.779786952 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.407914332316857, + "grad_norm": 0.2899905741214752, + "loss": 1.0039, + "loss_ce": 1.1039665937423706, + "loss_region": 0.03000645898282528, + "loss_total": 1.1339730024337769, + "lr": 0.0008996788654930041, + "router/selected_tokens_s0": 4372.0, + "step": 8490, + "tokens_trained": 27.812552392 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.4107510105666266, + "grad_norm": 0.18642191588878632, + "loss": 0.9986, + "loss_ce": 0.924929141998291, + "loss_region": 0.02999548427760601, + "loss_total": 0.9549246430397034, + "lr": 0.0008992719735091531, + "router/selected_tokens_s0": 4286.0, + "step": 8500, + "tokens_trained": 27.845317832 + }, + { + "epoch": 2.4107510105666266, + "eval_ppl": 2.6650693852029543, + "eval_runtime": 2.5356, + "step": 8500, + "tokens_trained": 27.845317832 + }, + { + "epoch": 2.4107510105666266, + "eval_F": 0.33888007554938054, + "eval_F_cds": 0.3421659079331452, + "eval_F_dig": 0.3292444722243925, + "eval_F_exon": 0.3428357829999718, + "eval_F_intron": 0.3392565016212616, + "eval_F_nig": 0.34004114392994556, + "eval_F_promoter": 0.33509801902983655, + "eval_F_utr": 0.34026120647627844, + "eval_G": 0.34680558566268765, + "eval_G_cds": 0.3487144065706593, + "eval_G_dig": 0.39149339466625, + "eval_G_exon": 0.34678120900569864, + "eval_G_intron": 0.3458409517180805, + "eval_G_nig": 0.34534857959760706, + "eval_G_promoter": 0.34982811986582935, + "eval_G_utr": 0.34636744709478356, + "eval_avg_bp_per_token": 2.950896414841843, + "eval_bp_per_token/cds": 2.922558842990831, + "eval_bp_per_token/dig": 3.0372567631703844, + "eval_bp_per_token/exon": 2.9168483851059452, + "eval_bp_per_token/intron": 2.94762221275387, + "eval_bp_per_token/nig": 2.9408205973040062, + "eval_bp_per_token/promoter": 2.9842014670667503, + "eval_bp_per_token/utr": 2.938918633587211, + "eval_ppl_cds": 2.9201114219021784, + "eval_ppl_dig": 1.0733490724771484, + "eval_ppl_exon": 3.1792957415495278, + "eval_ppl_intron": 2.7594583529367775, + "eval_ppl_nig": 2.548446656994938, + "eval_ppl_promoter": 2.6536365126162527, + "eval_ppl_utr": 2.9488982180330483, + "step": 8500, + "tokens_trained": 27.845317832 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.413587688816396, + "grad_norm": 0.13449706137180328, + "loss": 1.0005, + "loss_ce": 0.8301422595977783, + "loss_region": 0.030001115053892136, + "loss_total": 0.8601433634757996, + "lr": 0.000898865081525302, + "router/selected_tokens_s0": 4313.375, + "step": 8510, + "tokens_trained": 27.878081672 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.4164243670661656, + "grad_norm": 0.309061735868454, + "loss": 0.9957, + "loss_ce": 1.0199123620986938, + "loss_region": 0.02999977581202984, + "loss_total": 1.0499120950698853, + "lr": 0.000898458189541451, + "router/selected_tokens_s0": 4288.375, + "step": 8520, + "tokens_trained": 27.910847112 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.419261045315935, + "grad_norm": 0.16196733713150024, + "loss": 1.0005, + "loss_ce": 0.8637193441390991, + "loss_region": 0.03000469133257866, + "loss_total": 0.8937240242958069, + "lr": 0.0008980512975576, + "router/selected_tokens_s0": 4308.0, + "step": 8530, + "tokens_trained": 27.943612552 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.4220977235657046, + "grad_norm": 0.09371009469032288, + "loss": 1.0006, + "loss_ce": 0.9934786558151245, + "loss_region": 0.03000270016491413, + "loss_total": 1.0234813690185547, + "lr": 0.0008976444055737489, + "router/selected_tokens_s0": 4304.0, + "step": 8540, + "tokens_trained": 27.976377992 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.424934401815474, + "grad_norm": 0.307698518037796, + "loss": 0.9983, + "loss_ce": 0.8561427593231201, + "loss_region": 0.029997562989592552, + "loss_total": 0.8861403465270996, + "lr": 0.000897237513589898, + "router/selected_tokens_s0": 4264.875, + "step": 8550, + "tokens_trained": 28.009141296 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.4277710800652437, + "grad_norm": 0.1772736757993698, + "loss": 1.0038, + "loss_ce": 1.0826610326766968, + "loss_region": 0.030012359842658043, + "loss_total": 1.1126734018325806, + "lr": 0.000896830621606047, + "router/selected_tokens_s0": 4370.75, + "step": 8560, + "tokens_trained": 28.041906736 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.430607758315013, + "grad_norm": 0.15302515029907227, + "loss": 1.0069, + "loss_ce": 1.0702699422836304, + "loss_region": 0.030007587745785713, + "loss_total": 1.1002775430679321, + "lr": 0.0008964237296221959, + "router/selected_tokens_s0": 4339.0, + "step": 8570, + "tokens_trained": 28.074672176 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.4334444365647827, + "grad_norm": 0.22621497511863708, + "loss": 0.9973, + "loss_ce": 1.0483866930007935, + "loss_region": 0.030001938343048096, + "loss_total": 1.0783886909484863, + "lr": 0.0008960168376383449, + "router/selected_tokens_s0": 4306.25, + "step": 8580, + "tokens_trained": 28.107436008 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.4362811148145522, + "grad_norm": 0.11164890974760056, + "loss": 1.0005, + "loss_ce": 0.972744882106781, + "loss_region": 0.030010616406798363, + "loss_total": 1.0027555227279663, + "lr": 0.0008956099456544937, + "router/selected_tokens_s0": 4377.75, + "step": 8590, + "tokens_trained": 28.140201448 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.4391177930643217, + "grad_norm": 0.20279476046562195, + "loss": 0.9963, + "loss_ce": 1.0406827926635742, + "loss_region": 0.030004046857357025, + "loss_total": 1.0706868171691895, + "lr": 0.0008952030536706427, + "router/selected_tokens_s0": 4347.5, + "step": 8600, + "tokens_trained": 28.172966888 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.4419544713140913, + "grad_norm": 0.24574866890907288, + "loss": 1.0029, + "loss_ce": 0.9247978925704956, + "loss_region": 0.030002659186720848, + "loss_total": 0.954800546169281, + "lr": 0.0008947961616867917, + "router/selected_tokens_s0": 4319.5, + "step": 8610, + "tokens_trained": 28.205730728 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.444791149563861, + "grad_norm": 0.06437017023563385, + "loss": 1.0028, + "loss_ce": 1.105587124824524, + "loss_region": 0.030001772567629814, + "loss_total": 1.1355888843536377, + "lr": 0.0008943892697029407, + "router/selected_tokens_s0": 4317.75, + "step": 8620, + "tokens_trained": 28.238496168 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.4476278278136303, + "grad_norm": 0.1782883256673813, + "loss": 0.999, + "loss_ce": 0.9905262589454651, + "loss_region": 0.03000597096979618, + "loss_total": 1.020532250404358, + "lr": 0.0008939823777190897, + "router/selected_tokens_s0": 4339.125, + "step": 8630, + "tokens_trained": 28.271258408 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.4504645060634, + "grad_norm": 0.19895051419734955, + "loss": 1.003, + "loss_ce": 1.0925418138504028, + "loss_region": 0.030004877597093582, + "loss_total": 1.122546672821045, + "lr": 0.0008935754857352387, + "router/selected_tokens_s0": 4349.25, + "step": 8640, + "tokens_trained": 28.304020264 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.4533011843131693, + "grad_norm": 0.36560705304145813, + "loss": 0.9997, + "loss_ce": 1.0445257425308228, + "loss_region": 0.030004719272255898, + "loss_total": 1.0745304822921753, + "lr": 0.0008931685937513876, + "router/selected_tokens_s0": 4336.375, + "step": 8650, + "tokens_trained": 28.336784904 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.456137862562939, + "grad_norm": 0.20657919347286224, + "loss": 0.9994, + "loss_ce": 0.9821572303771973, + "loss_region": 0.030005378648638725, + "loss_total": 1.0121625661849976, + "lr": 0.0008927617017675366, + "router/selected_tokens_s0": 4318.25, + "step": 8660, + "tokens_trained": 28.369550344 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.4589745408127084, + "grad_norm": 0.16892334818840027, + "loss": 0.9987, + "loss_ce": 1.0167680978775024, + "loss_region": 0.029993655160069466, + "loss_total": 1.0467617511749268, + "lr": 0.0008923548097836857, + "router/selected_tokens_s0": 4239.375, + "step": 8670, + "tokens_trained": 28.402315784 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.461811219062478, + "grad_norm": 0.1816655993461609, + "loss": 1.0039, + "loss_ce": 0.9022430777549744, + "loss_region": 0.029996415600180626, + "loss_total": 0.9322394728660583, + "lr": 0.0008919479177998346, + "router/selected_tokens_s0": 4260.75, + "step": 8680, + "tokens_trained": 28.435081224 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.4646478973122474, + "grad_norm": 0.086321622133255, + "loss": 0.9946, + "loss_ce": 1.0136384963989258, + "loss_region": 0.030008167028427124, + "loss_total": 1.0436466932296753, + "lr": 0.0008915410258159836, + "router/selected_tokens_s0": 4355.625, + "step": 8690, + "tokens_trained": 28.467846664 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.467484575562017, + "grad_norm": 0.15028896927833557, + "loss": 1.0012, + "loss_ce": 0.9969090819358826, + "loss_region": 0.03000759892165661, + "loss_total": 1.0269166231155396, + "lr": 0.0008911341338321326, + "router/selected_tokens_s0": 4352.0, + "step": 8700, + "tokens_trained": 28.500612104 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.4703212538117865, + "grad_norm": 0.09089077264070511, + "loss": 0.9924, + "loss_ce": 1.0716396570205688, + "loss_region": 0.030006859451532364, + "loss_total": 1.1016465425491333, + "lr": 0.0008907272418482815, + "router/selected_tokens_s0": 4361.875, + "step": 8710, + "tokens_trained": 28.533377544 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.473157932061556, + "grad_norm": 0.2638002932071686, + "loss": 1.0043, + "loss_ce": 1.0565223693847656, + "loss_region": 0.030004462227225304, + "loss_total": 1.086526870727539, + "lr": 0.0008903203498644304, + "router/selected_tokens_s0": 4336.625, + "step": 8720, + "tokens_trained": 28.566141984 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.4759946103113255, + "grad_norm": 0.15809841454029083, + "loss": 0.9925, + "loss_ce": 1.0733001232147217, + "loss_region": 0.030008846893906593, + "loss_total": 1.103308916091919, + "lr": 0.0008899134578805794, + "router/selected_tokens_s0": 4357.375, + "step": 8730, + "tokens_trained": 28.598907424 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.478831288561095, + "grad_norm": 0.18726810812950134, + "loss": 1.0042, + "loss_ce": 0.8972605466842651, + "loss_region": 0.029985716566443443, + "loss_total": 0.9272462725639343, + "lr": 0.0008895065658967284, + "router/selected_tokens_s0": 4263.625, + "step": 8740, + "tokens_trained": 28.631672864 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.4816679668108645, + "grad_norm": 0.24068403244018555, + "loss": 1.0095, + "loss_ce": 1.0139508247375488, + "loss_region": 0.03000541776418686, + "loss_total": 1.0439562797546387, + "lr": 0.0008890996739128774, + "router/selected_tokens_s0": 4343.625, + "step": 8750, + "tokens_trained": 28.664438304 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.484504645060634, + "grad_norm": 0.21258457005023956, + "loss": 0.9875, + "loss_ce": 1.0121517181396484, + "loss_region": 0.029999777674674988, + "loss_total": 1.0421514511108398, + "lr": 0.0008886927819290264, + "router/selected_tokens_s0": 4282.25, + "step": 8760, + "tokens_trained": 28.697203744 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.4873413233104036, + "grad_norm": 0.10942172259092331, + "loss": 0.9972, + "loss_ce": 0.9001731276512146, + "loss_region": 0.030006736516952515, + "loss_total": 0.9301798343658447, + "lr": 0.0008882858899451753, + "router/selected_tokens_s0": 4348.875, + "step": 8770, + "tokens_trained": 28.729969184 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.490178001560173, + "grad_norm": 0.13711389899253845, + "loss": 1.0009, + "loss_ce": 1.051249384880066, + "loss_region": 0.02999911643564701, + "loss_total": 1.0812485218048096, + "lr": 0.0008878789979613243, + "router/selected_tokens_s0": 4341.875, + "step": 8780, + "tokens_trained": 28.762734624 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.4930146798099426, + "grad_norm": 0.15089532732963562, + "loss": 0.9942, + "loss_ce": 0.9496617317199707, + "loss_region": 0.029995227232575417, + "loss_total": 0.9796569347381592, + "lr": 0.0008874721059774733, + "router/selected_tokens_s0": 4251.25, + "step": 8790, + "tokens_trained": 28.795500064 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.495851358059712, + "grad_norm": 0.21835339069366455, + "loss": 0.9976, + "loss_ce": 0.9721809029579163, + "loss_region": 0.03000730648636818, + "loss_total": 1.0021882057189941, + "lr": 0.0008870652139936223, + "router/selected_tokens_s0": 4327.875, + "step": 8800, + "tokens_trained": 28.828265504 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.4986880363094817, + "grad_norm": 0.14370621740818024, + "loss": 1.0031, + "loss_ce": 0.661625325679779, + "loss_region": 0.030003832653164864, + "loss_total": 0.69162917137146, + "lr": 0.0008866583220097713, + "router/selected_tokens_s0": 4298.125, + "step": 8810, + "tokens_trained": 28.861030944 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.501524714559251, + "grad_norm": 0.34035536646842957, + "loss": 1.0, + "loss_ce": 0.8154221773147583, + "loss_region": 0.029994478449225426, + "loss_total": 0.8454166650772095, + "lr": 0.0008862514300259203, + "router/selected_tokens_s0": 4243.375, + "step": 8820, + "tokens_trained": 28.893796384 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.5043613928090207, + "grad_norm": 0.2638392448425293, + "loss": 1.0036, + "loss_ce": 1.0097345113754272, + "loss_region": 0.030002137646079063, + "loss_total": 1.0397366285324097, + "lr": 0.0008858445380420692, + "router/selected_tokens_s0": 4334.25, + "step": 8830, + "tokens_trained": 28.926561824 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.5071980710587902, + "grad_norm": 0.12338658422231674, + "loss": 1.0063, + "loss_ce": 1.0209345817565918, + "loss_region": 0.03000703826546669, + "loss_total": 1.0509415864944458, + "lr": 0.0008854376460582181, + "router/selected_tokens_s0": 4338.375, + "step": 8840, + "tokens_trained": 28.959327264 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.5100347493085597, + "grad_norm": 0.31785932183265686, + "loss": 0.9897, + "loss_ce": 0.9506969451904297, + "loss_region": 0.029998788610100746, + "loss_total": 0.9806957244873047, + "lr": 0.000885030754074367, + "router/selected_tokens_s0": 4288.0, + "step": 8850, + "tokens_trained": 28.992092704 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.5128714275583293, + "grad_norm": 0.1892634928226471, + "loss": 1.0028, + "loss_ce": 1.0616570711135864, + "loss_region": 0.03000863641500473, + "loss_total": 1.0916657447814941, + "lr": 0.000884623862090516, + "router/selected_tokens_s0": 4351.25, + "step": 8860, + "tokens_trained": 29.024858144 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.515708105808099, + "grad_norm": 0.04792920872569084, + "loss": 1.0017, + "loss_ce": 1.0865520238876343, + "loss_region": 0.030003543943166733, + "loss_total": 1.1165555715560913, + "lr": 0.0008842169701066651, + "router/selected_tokens_s0": 4327.375, + "step": 8870, + "tokens_trained": 29.057623584 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.5185447840578683, + "grad_norm": 0.31960293650627136, + "loss": 0.9978, + "loss_ce": 0.8718461990356445, + "loss_region": 0.030006276443600655, + "loss_total": 0.9018524885177612, + "lr": 0.0008838100781228141, + "router/selected_tokens_s0": 4355.75, + "step": 8880, + "tokens_trained": 29.090389024 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.521381462307638, + "grad_norm": 0.18474900722503662, + "loss": 0.989, + "loss_ce": 1.1126625537872314, + "loss_region": 0.03000907599925995, + "loss_total": 1.1426715850830078, + "lr": 0.000883403186138963, + "router/selected_tokens_s0": 4373.625, + "step": 8890, + "tokens_trained": 29.123153664 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.5242181405574073, + "grad_norm": 0.1370496153831482, + "loss": 0.9925, + "loss_ce": 1.1119452714920044, + "loss_region": 0.029999537393450737, + "loss_total": 1.1419447660446167, + "lr": 0.000882996294155112, + "router/selected_tokens_s0": 4300.0, + "step": 8900, + "tokens_trained": 29.155917496 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.527054818807177, + "grad_norm": 0.22416529059410095, + "loss": 0.996, + "loss_ce": 1.1018669605255127, + "loss_region": 0.0300053171813488, + "loss_total": 1.131872296333313, + "lr": 0.000882589402171261, + "router/selected_tokens_s0": 4341.375, + "step": 8910, + "tokens_trained": 29.188682936 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.5298914970569464, + "grad_norm": 0.21939510107040405, + "loss": 0.994, + "loss_ce": 0.7690940499305725, + "loss_region": 0.030000334605574608, + "loss_total": 0.7990943789482117, + "lr": 0.00088218251018741, + "router/selected_tokens_s0": 4324.0, + "step": 8920, + "tokens_trained": 29.221448376 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.532728175306716, + "grad_norm": 0.21124182641506195, + "loss": 1.0, + "loss_ce": 0.9944325089454651, + "loss_region": 0.030004018917679787, + "loss_total": 1.0244364738464355, + "lr": 0.000881775618203559, + "router/selected_tokens_s0": 4294.75, + "step": 8930, + "tokens_trained": 29.254213816 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.5355648535564854, + "grad_norm": 0.306956946849823, + "loss": 0.9964, + "loss_ce": 1.0029845237731934, + "loss_region": 0.030008617788553238, + "loss_total": 1.032993197441101, + "lr": 0.000881368726219708, + "router/selected_tokens_s0": 4352.0, + "step": 8940, + "tokens_trained": 29.286979256 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.538401531806255, + "grad_norm": 0.25835534930229187, + "loss": 0.9845, + "loss_ce": 0.9159678220748901, + "loss_region": 0.03001241199672222, + "loss_total": 0.9459802508354187, + "lr": 0.0008809618342358569, + "router/selected_tokens_s0": 4392.375, + "step": 8950, + "tokens_trained": 29.319744696 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.5412382100560245, + "grad_norm": 0.18166546523571014, + "loss": 0.997, + "loss_ce": 0.9494093060493469, + "loss_region": 0.03000432625412941, + "loss_total": 0.979413628578186, + "lr": 0.0008805549422520058, + "router/selected_tokens_s0": 4338.0, + "step": 8960, + "tokens_trained": 29.352510136 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.544074888305794, + "grad_norm": 0.40455761551856995, + "loss": 1.0039, + "loss_ce": 0.9609086513519287, + "loss_region": 0.03000527061522007, + "loss_total": 0.9909139275550842, + "lr": 0.0008801480502681547, + "router/selected_tokens_s0": 4334.75, + "step": 8970, + "tokens_trained": 29.385275576 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.5469115665555635, + "grad_norm": 0.11374037712812424, + "loss": 0.9853, + "loss_ce": 1.0540075302124023, + "loss_region": 0.029997827485203743, + "loss_total": 1.084005355834961, + "lr": 0.0008797411582843037, + "router/selected_tokens_s0": 4318.375, + "step": 8980, + "tokens_trained": 29.418036064 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.549748244805333, + "grad_norm": 0.13507553935050964, + "loss": 0.9963, + "loss_ce": 0.8324553370475769, + "loss_region": 0.030012784525752068, + "loss_total": 0.8624681234359741, + "lr": 0.0008793342663004528, + "router/selected_tokens_s0": 4378.0, + "step": 8990, + "tokens_trained": 29.450801504 + }, + { + "comp/rl_weight": 0.03, + "comp/strictness": 0.0, + "epoch": 2.5525849230551025, + "grad_norm": 0.23446859419345856, + "loss": 0.9993, + "loss_ce": 0.9564284682273865, + "loss_region": 0.030007630586624146, + "loss_total": 0.986436128616333, + "lr": 0.0008789273743166017, + "router/selected_tokens_s0": 4340.25, + "step": 9000, + "tokens_trained": 29.483566944 + }, + { + "epoch": 2.5525849230551025, + "eval_ppl": 2.6552714315745756, + "eval_runtime": 2.4714, + "step": 9000, + "tokens_trained": 29.483566944 + }, + { + "epoch": 2.5525849230551025, + "eval_F": 0.33839005764716673, + "eval_F_cds": 0.341179419589071, + "eval_F_dig": 0.33510430502383, + "eval_F_exon": 0.3422997714898299, + "eval_F_intron": 0.3386960369675393, + "eval_F_nig": 0.33927277062921446, + "eval_F_promoter": 0.3350105838834461, + "eval_F_utr": 0.3398763246350037, + "eval_G": 0.3463624175616004, + "eval_G_cds": 0.3486885693695369, + "eval_G_dig": 0.3961704263191135, + "eval_G_exon": 0.34642735765720656, + "eval_G_intron": 0.345296902410104, + "eval_G_nig": 0.34468144082472857, + "eval_G_promoter": 0.3497449071891786, + "eval_G_utr": 0.3458268919080731, + "eval_avg_bp_per_token": 2.9551695666031716, + "eval_bp_per_token/cds": 2.931009148220126, + "eval_bp_per_token/dig": 2.9841454884588483, + "eval_bp_per_token/exon": 2.9214159146165573, + "eval_bp_per_token/intron": 2.9524998548944943, + "eval_bp_per_token/nig": 2.9474808666354284, + "eval_bp_per_token/promoter": 2.984980320346867, + "eval_bp_per_token/utr": 2.9422467159897328, + "eval_ppl_cds": 2.9075094700083897, + "eval_ppl_dig": 1.0744148503492457, + "eval_ppl_exon": 3.1644619202401714, + "eval_ppl_intron": 2.7495037615910887, + "eval_ppl_nig": 2.544889273211015, + "eval_ppl_promoter": 2.65953265677077, + "eval_ppl_utr": 2.95693266811597, + "step": 9000, + "tokens_trained": 29.483566944 + } + ], + "logging_steps": 10, + "max_steps": 30600, + "num_input_tokens_seen": 0, + "num_train_epochs": 9, + "save_steps": 3000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}