{
  "best_global_step": 3000,
  "best_metric": 2.91798250107805,
  "best_model_checkpoint": "/gpfs/scratch/guoh/DNAFM/output/gencode_human_12.8k_12800/HNet_Ori-BPT3/checkpoint-3000",
  "epoch": 0.8510034749308559,
  "eval_steps": 500,
  "global_step": 3000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.0028366782497695198,
      "grad_norm": 590.2120361328125,
      "loss": 144.5784,
      "loss_ce": 170.91241455078125,
      "loss_region": 0.030412333086133003,
      "loss_total": 170.9428253173828,
      "lr": 2.20454076850486e-05,
      "router/selected_tokens_s0": 1.0,
      "step": 10,
      "tokens_trained": 0.03276544
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.0056733564995390395,
      "grad_norm": 565.2921142578125,
      "loss": 52.047,
      "loss_ce": 28.61202049255371,
      "loss_region": 0.03181665763258934,
      "loss_total": 28.643836975097656,
      "lr": 4.654030511288038e-05,
      "router/selected_tokens_s0": 1.0,
      "step": 20,
      "tokens_trained": 0.06553088
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.00851003474930856,
      "grad_norm": 361.24432373046875,
      "loss": 18.4265,
      "loss_ce": 16.737817764282227,
      "loss_region": 0.03595759719610214,
      "loss_total": 16.773775100708008,
      "lr": 7.103520254071216e-05,
      "router/selected_tokens_s0": 1.0,
      "step": 30,
      "tokens_trained": 0.09829632
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.011346712999078079,
      "grad_norm": 649.0695190429688,
      "loss": 8.0445,
      "loss_ce": 11.410881996154785,
      "loss_region": 0.03821098059415817,
      "loss_total": 11.449092864990234,
      "lr": 9.553009996854394e-05,
      "router/selected_tokens_s0": 1.0,
      "step": 40,
      "tokens_trained": 0.13106176
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.014183391248847599,
      "grad_norm": 534.2383422851562,
      "loss": 9.3219,
      "loss_ce": 9.884474754333496,
      "loss_region": 0.040100596845149994,
      "loss_total": 9.924575805664062,
      "lr": 0.00012002499739637572,
      "router/selected_tokens_s0": 1.0,
      "step": 50,
      "tokens_trained": 0.1638272
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.01702006949861712,
      "grad_norm": 273.8401184082031,
      "loss": 14.1755,
      "loss_ce": 12.677406311035156,
      "loss_region": 0.041250791400671005,
      "loss_total": 12.718657493591309,
      "lr": 0.00014451989482420748,
      "router/selected_tokens_s0": 1.0,
      "step": 60,
      "tokens_trained": 0.19659264
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.01985674774838664,
      "grad_norm": 544.6290893554688,
      "loss": 14.1136,
      "loss_ce": 14.262775421142578,
      "loss_region": 0.042144227772951126,
      "loss_total": 14.304919242858887,
      "lr": 0.00016901479225203927,
      "router/selected_tokens_s0": 1.0,
      "step": 70,
      "tokens_trained": 0.22935808
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.022693425998156158,
      "grad_norm": 527.1918334960938,
      "loss": 15.2492,
      "loss_ce": 11.932450294494629,
      "loss_region": 0.04246167093515396,
      "loss_total": 11.9749116897583,
      "lr": 0.00019350968967987104,
      "router/selected_tokens_s0": 1.0,
      "step": 80,
      "tokens_trained": 0.26212192
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.025530104247925678,
      "grad_norm": 343.09454345703125,
      "loss": 12.0101,
      "loss_ce": 6.092933177947998,
      "loss_region": 0.04214272275567055,
      "loss_total": 6.13507604598999,
      "lr": 0.0002180045871077028,
      "router/selected_tokens_s0": 1.0,
      "step": 90,
      "tokens_trained": 0.29488736
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.028366782497695198,
      "grad_norm": 309.79541015625,
      "loss": 9.8843,
      "loss_ce": 5.214886665344238,
      "loss_region": 0.041769951581954956,
      "loss_total": 5.256656646728516,
      "lr": 0.00024249948453553463,
      "router/selected_tokens_s0": 1.0,
      "step": 100,
      "tokens_trained": 0.3276528
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.031203460747464717,
      "grad_norm": 251.26068115234375,
      "loss": 8.5835,
      "loss_ce": 12.269608497619629,
      "loss_region": 0.04041137546300888,
      "loss_total": 12.310019493103027,
      "lr": 0.00026699438196336637,
      "router/selected_tokens_s0": 1.0,
      "step": 110,
      "tokens_trained": 0.36041744
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.03404013899723424,
      "grad_norm": 148.94601440429688,
      "loss": 6.4366,
      "loss_ce": 3.2050940990448,
      "loss_region": 0.03642381727695465,
      "loss_total": 3.241518020629883,
      "lr": 0.00029148927939119814,
      "router/selected_tokens_s0": 1.0,
      "step": 120,
      "tokens_trained": 0.39318128
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.03687681724700376,
      "grad_norm": 187.2681427001953,
      "loss": 10.4928,
      "loss_ce": 6.001107215881348,
      "loss_region": 0.030254848301410675,
      "loss_total": 6.031362056732178,
      "lr": 0.00031598417681902996,
      "router/selected_tokens_s0": 4752.0,
      "step": 130,
      "tokens_trained": 0.42594672
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.03971349549677328,
      "grad_norm": 218.34559631347656,
      "loss": 8.5742,
      "loss_ce": 3.848691701889038,
      "loss_region": 0.03400004655122757,
      "loss_total": 3.8826918601989746,
      "lr": 0.00034047907424686173,
      "router/selected_tokens_s0": 7042.125,
      "step": 140,
      "tokens_trained": 0.458709112
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.042550173746542796,
      "grad_norm": 215.60699462890625,
      "loss": 4.5762,
      "loss_ce": 5.0876851081848145,
      "loss_region": 0.03198177367448807,
      "loss_total": 5.119667053222656,
      "lr": 0.0003649739716746935,
      "router/selected_tokens_s0": 424.5,
      "step": 150,
      "tokens_trained": 0.491469992
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.045386851996312316,
      "grad_norm": 147.6339111328125,
      "loss": 5.8047,
      "loss_ce": 8.435795783996582,
      "loss_region": 0.03364315256476402,
      "loss_total": 8.469438552856445,
      "lr": 0.00038946886910252526,
      "router/selected_tokens_s0": 536.875,
      "step": 160,
      "tokens_trained": 0.524234632
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.048223530246081836,
      "grad_norm": 218.0553741455078,
      "loss": 5.7968,
      "loss_ce": 6.644444942474365,
      "loss_region": 0.031727153807878494,
      "loss_total": 6.676172256469727,
      "lr": 0.0004139637665303571,
      "router/selected_tokens_s0": 1833.5,
      "step": 170,
      "tokens_trained": 0.556999272
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.051060208495851356,
      "grad_norm": 100.56309509277344,
      "loss": 6.7503,
      "loss_ce": 8.332029342651367,
      "loss_region": 0.03232778236269951,
      "loss_total": 8.364356994628906,
      "lr": 0.0004384586639581888,
      "router/selected_tokens_s0": 1649.75,
      "step": 180,
      "tokens_trained": 0.589762952
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.053896886745620876,
      "grad_norm": 157.10765075683594,
      "loss": 6.4449,
      "loss_ce": 4.925128936767578,
      "loss_region": 0.031663134694099426,
      "loss_total": 4.956791877746582,
      "lr": 0.0004629535613860206,
      "router/selected_tokens_s0": 1687.375,
      "step": 190,
      "tokens_trained": 0.622527592
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.056733564995390395,
      "grad_norm": 83.81340026855469,
      "loss": 3.7524,
      "loss_ce": 5.0940961837768555,
      "loss_region": 0.02894311398267746,
      "loss_total": 5.123039245605469,
      "lr": 0.00048744845881385244,
      "router/selected_tokens_s0": 3074.125,
      "step": 200,
      "tokens_trained": 0.655293032
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.059570243245159915,
      "grad_norm": 169.4013671875,
      "loss": 5.9481,
      "loss_ce": 9.220865249633789,
      "loss_region": 0.02949724718928337,
      "loss_total": 9.250362396240234,
      "lr": 0.0005119433562416841,
      "router/selected_tokens_s0": 3610.375,
      "step": 210,
      "tokens_trained": 0.688057672
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.062406921494929435,
      "grad_norm": 80.7753677368164,
      "loss": 5.1122,
      "loss_ce": 3.287958860397339,
      "loss_region": 0.029488109052181244,
      "loss_total": 3.3174469470977783,
      "lr": 0.0005364382536695159,
      "router/selected_tokens_s0": 2584.75,
      "step": 220,
      "tokens_trained": 0.720823112
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.06524359974469895,
      "grad_norm": 89.39635467529297,
      "loss": 3.3047,
      "loss_ce": 2.1086361408233643,
      "loss_region": 0.029821382835507393,
      "loss_total": 2.1384575366973877,
      "lr": 0.0005609331510973477,
      "router/selected_tokens_s0": 3991.5,
      "step": 230,
      "tokens_trained": 0.753588552
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.06808027799446847,
      "grad_norm": 90.892333984375,
      "loss": 4.2563,
      "loss_ce": 2.7003867626190186,
      "loss_region": 0.030828693881630898,
      "loss_total": 2.731215476989746,
      "lr": 0.0005854280485251795,
      "router/selected_tokens_s0": 4964.125,
      "step": 240,
      "tokens_trained": 0.786353992
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.070916956244238,
      "grad_norm": 86.70359802246094,
      "loss": 2.8849,
      "loss_ce": 3.55375599861145,
      "loss_region": 0.029162542894482613,
      "loss_total": 3.582918643951416,
      "lr": 0.0006099229459530113,
      "router/selected_tokens_s0": 2891.75,
      "step": 250,
      "tokens_trained": 0.819119432
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.07375363449400751,
      "grad_norm": 106.11075592041016,
      "loss": 4.4058,
      "loss_ce": 5.333348751068115,
      "loss_region": 0.029971925541758537,
      "loss_total": 5.363320827484131,
      "lr": 0.0006344178433808431,
      "router/selected_tokens_s0": 4181.375,
      "step": 260,
      "tokens_trained": 0.851884072
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.07659031274377703,
      "grad_norm": 75.7653579711914,
      "loss": 3.6076,
      "loss_ce": 2.3445212841033936,
      "loss_region": 0.029431568458676338,
      "loss_total": 2.373952865600586,
      "lr": 0.0006589127408086749,
      "router/selected_tokens_s0": 3440.0,
      "step": 270,
      "tokens_trained": 0.884649512
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.07942699099354655,
      "grad_norm": 95.4271469116211,
      "loss": 2.8447,
      "loss_ce": 3.030097007751465,
      "loss_region": 0.030556708574295044,
      "loss_total": 3.0606536865234375,
      "lr": 0.0006834076382365066,
      "router/selected_tokens_s0": 4730.5,
      "step": 280,
      "tokens_trained": 0.917414936
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.08226366924331607,
      "grad_norm": 74.673828125,
      "loss": 2.0288,
      "loss_ce": 2.1509435176849365,
      "loss_region": 0.028712084516882896,
      "loss_total": 2.1796555519104004,
      "lr": 0.0007079025356643384,
      "router/selected_tokens_s0": 2658.625,
      "step": 290,
      "tokens_trained": 0.950180376
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.08510034749308559,
      "grad_norm": 77.71709442138672,
      "loss": 2.0227,
      "loss_ce": 2.286048650741577,
      "loss_region": 0.03060404770076275,
      "loss_total": 2.316652774810791,
      "lr": 0.0007323974330921702,
      "router/selected_tokens_s0": 4752.0,
      "step": 300,
      "tokens_trained": 0.982945816
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.08793702574285511,
      "grad_norm": 55.31558609008789,
      "loss": 2.1281,
      "loss_ce": 2.0437748432159424,
      "loss_region": 0.030610591173171997,
      "loss_total": 2.074385404586792,
      "lr": 0.000756892330520002,
      "router/selected_tokens_s0": 4748.625,
      "step": 310,
      "tokens_trained": 1.015711256
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.09077370399262463,
      "grad_norm": 77.07698059082031,
      "loss": 2.5761,
      "loss_ce": 2.7218589782714844,
      "loss_region": 0.03093603625893593,
      "loss_total": 2.7527949810028076,
      "lr": 0.0007813872279478337,
      "router/selected_tokens_s0": 4946.625,
      "step": 320,
      "tokens_trained": 1.048476696
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.09361038224239415,
      "grad_norm": 47.57994842529297,
      "loss": 2.239,
      "loss_ce": 1.9163914918899536,
      "loss_region": 0.029897142201662064,
      "loss_total": 1.9462885856628418,
      "lr": 0.0008058821253756655,
      "router/selected_tokens_s0": 4135.875,
      "step": 330,
      "tokens_trained": 1.081242136
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.09644706049216367,
      "grad_norm": 58.62579345703125,
      "loss": 2.8423,
      "loss_ce": 3.2828376293182373,
      "loss_region": 0.03434763103723526,
      "loss_total": 3.317185163497925,
      "lr": 0.0008303770228034974,
      "router/selected_tokens_s0": 6686.5,
      "step": 340,
      "tokens_trained": 1.114007576
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.09928373874193319,
      "grad_norm": 34.5246696472168,
      "loss": 2.5891,
      "loss_ce": 1.537825345993042,
      "loss_region": 0.02885586954653263,
      "loss_total": 1.5666812658309937,
      "lr": 0.0008548719202313291,
      "router/selected_tokens_s0": 154.125,
      "step": 350,
      "tokens_trained": 1.146773016
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.10212041699170271,
      "grad_norm": 37.228973388671875,
      "loss": 2.7756,
      "loss_ce": 1.9871504306793213,
      "loss_region": 0.029301652684807777,
      "loss_total": 2.0164520740509033,
      "lr": 0.0008793668176591608,
      "router/selected_tokens_s0": 3631.75,
      "step": 360,
      "tokens_trained": 1.179538456
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.10495709524147223,
      "grad_norm": 30.546344757080078,
      "loss": 2.4884,
      "loss_ce": 1.4886701107025146,
      "loss_region": 0.031588103622198105,
      "loss_total": 1.5202581882476807,
      "lr": 0.0009038617150869926,
      "router/selected_tokens_s0": 5236.625,
      "step": 370,
      "tokens_trained": 1.212303896
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.10779377349124175,
      "grad_norm": 45.68803405761719,
      "loss": 2.8937,
      "loss_ce": 2.285705804824829,
      "loss_region": 0.030362222343683243,
      "loss_total": 2.316067934036255,
      "lr": 0.0009283566125148244,
      "router/selected_tokens_s0": 4493.625,
      "step": 380,
      "tokens_trained": 1.245068536
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.11063045174101127,
      "grad_norm": 32.428009033203125,
      "loss": 1.9186,
      "loss_ce": 1.5672893524169922,
      "loss_region": 0.03746495023369789,
      "loss_total": 1.6047543287277222,
      "lr": 0.0009528515099426562,
      "router/selected_tokens_s0": 8134.375,
      "step": 390,
      "tokens_trained": 1.277833176
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.11346712999078079,
      "grad_norm": 35.54498291015625,
      "loss": 1.6959,
      "loss_ce": 1.6413251161575317,
      "loss_region": 0.026098042726516724,
      "loss_total": 1.667423129081726,
      "lr": 0.000977346407370488,
      "router/selected_tokens_s0": 625.5,
      "step": 400,
      "tokens_trained": 1.310598616
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.11630380824055031,
      "grad_norm": 8.186758041381836,
      "loss": 1.671,
      "loss_ce": 1.324172019958496,
      "loss_region": 0.03537043184041977,
      "loss_total": 1.3595424890518188,
      "lr": 0.0010018413047983197,
      "router/selected_tokens_s0": 7117.75,
      "step": 410,
      "tokens_trained": 1.343364056
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.11914048649031983,
      "grad_norm": 32.364845275878906,
      "loss": 1.7487,
      "loss_ce": 1.6946724653244019,
      "loss_region": 0.030674295499920845,
      "loss_total": 1.7253468036651611,
      "lr": 0.0010263362022261515,
      "router/selected_tokens_s0": 4591.75,
      "step": 420,
      "tokens_trained": 1.376129496
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.12197716474008935,
      "grad_norm": 51.924861907958984,
      "loss": 1.6652,
      "loss_ce": 1.7081111669540405,
      "loss_region": 0.029956262558698654,
      "loss_total": 1.738067388534546,
      "lr": 0.0010508310996539833,
      "router/selected_tokens_s0": 4165.25,
      "step": 430,
      "tokens_trained": 1.408889864
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.12481384298985887,
      "grad_norm": 31.08187484741211,
      "loss": 1.6269,
      "loss_ce": 1.688795804977417,
      "loss_region": 0.030442532151937485,
      "loss_total": 1.71923828125,
      "lr": 0.0010753259970818151,
      "router/selected_tokens_s0": 4528.875,
      "step": 440,
      "tokens_trained": 1.441655304
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.1276505212396284,
      "grad_norm": 9.750688552856445,
      "loss": 1.646,
      "loss_ce": 1.342025637626648,
      "loss_region": 0.0289932768791914,
      "loss_total": 1.371018886566162,
      "lr": 0.001099820894509647,
      "router/selected_tokens_s0": 3472.375,
      "step": 450,
      "tokens_trained": 1.474420744
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.1304871994893979,
      "grad_norm": 69.62458038330078,
      "loss": 2.646,
      "loss_ce": 2.835515022277832,
      "loss_region": 0.03730851039290428,
      "loss_total": 2.872823476791382,
      "lr": 0.0011243157919374788,
      "router/selected_tokens_s0": 7822.125,
      "step": 460,
      "tokens_trained": 1.507186184
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.13332387773916743,
      "grad_norm": 62.241451263427734,
      "loss": 2.2121,
      "loss_ce": 1.9173500537872314,
      "loss_region": 0.033008284866809845,
      "loss_total": 1.9503583908081055,
      "lr": 0.0011488106893653104,
      "router/selected_tokens_s0": 5854.125,
      "step": 470,
      "tokens_trained": 1.539950832
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.13616055598893695,
      "grad_norm": 36.45135498046875,
      "loss": 1.8122,
      "loss_ce": 1.579708456993103,
      "loss_region": 0.030225276947021484,
      "loss_total": 1.6099337339401245,
      "lr": 0.0011733055867931422,
      "router/selected_tokens_s0": 4330.5,
      "step": 480,
      "tokens_trained": 1.572715472
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.13899723423870647,
      "grad_norm": 13.028325080871582,
      "loss": 1.5027,
      "loss_ce": 1.357754111289978,
      "loss_region": 0.03526536747813225,
      "loss_total": 1.393019437789917,
      "lr": 0.001197800484220974,
      "router/selected_tokens_s0": 7119.25,
      "step": 490,
      "tokens_trained": 1.605480912
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.141833912488476,
      "grad_norm": 24.705984115600586,
      "loss": 1.6096,
      "loss_ce": 1.6097279787063599,
      "loss_region": 0.02911153808236122,
      "loss_total": 1.6388394832611084,
      "lr": 0.0012222953816488059,
      "router/selected_tokens_s0": 3648.75,
      "step": 500,
      "tokens_trained": 1.638244216
    },
    {
      "epoch": 0.141833912488476,
      "eval_ppl": 4.8348835473380465,
      "eval_runtime": 2.9238,
      "step": 500,
      "tokens_trained": 1.638244216
    },
    {
      "epoch": 0.141833912488476,
      "eval_F": 0.3934690889573574,
      "eval_F_cds": 0.29905151571508276,
      "eval_F_dig": 0.4478214443836758,
      "eval_F_exon": 0.39103450221457386,
      "eval_F_intron": 0.40873021991492037,
      "eval_F_nig": 0.4262229153142855,
      "eval_F_promoter": 0.30306008909923465,
      "eval_F_utr": 0.3906123042448191,
      "eval_G": 0.49025372407568035,
      "eval_G_cds": 0.48331595902636837,
      "eval_G_dig": 0.49727705981261555,
      "eval_G_exon": 0.4909996295084916,
      "eval_G_intron": 0.4915825135015993,
      "eval_G_nig": 0.49304083637658525,
      "eval_G_promoter": 0.48200754687828323,
      "eval_G_utr": 0.4901697268782234,
      "eval_avg_bp_per_token": 2.541495705926663,
      "eval_bp_per_token/cds": 3.343905472636816,
      "eval_bp_per_token/dig": 2.2330328583890666,
      "eval_bp_per_token/exon": 2.5573190967462667,
      "eval_bp_per_token/intron": 2.4466015755041455,
      "eval_bp_per_token/nig": 2.346190136826938,
      "eval_bp_per_token/promoter": 3.299675661589863,
      "eval_bp_per_token/utr": 2.56008320560543,
      "eval_ppl_cds": 5.567015659246301,
      "eval_ppl_dig": 4.898425899350941,
      "eval_ppl_exon": 4.9662705320329295,
      "eval_ppl_intron": 4.767518067357663,
      "eval_ppl_nig": 4.6987085494689405,
      "eval_ppl_promoter": 5.216405144788708,
      "eval_ppl_utr": 4.913846632347962,
      "step": 500,
      "tokens_trained": 1.638244216
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.1446705907382455,
      "grad_norm": 37.46432113647461,
      "loss": 1.5582,
      "loss_ce": 1.5548115968704224,
      "loss_region": 0.02565930411219597,
      "loss_total": 1.5804709196090698,
      "lr": 0.0012243786686061229,
      "router/selected_tokens_s0": 1004.25,
      "step": 510,
      "tokens_trained": 1.671005424
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.14750726898801503,
      "grad_norm": 27.722349166870117,
      "loss": 1.5672,
      "loss_ce": 1.478359341621399,
      "loss_region": 0.031882915645837784,
      "loss_total": 1.510242223739624,
      "lr": 0.0012239717766222718,
      "router/selected_tokens_s0": 5380.75,
      "step": 520,
      "tokens_trained": 1.703770864
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.15034394723778455,
      "grad_norm": 26.949983596801758,
      "loss": 1.6157,
      "loss_ce": 1.4986213445663452,
      "loss_region": 0.03651271015405655,
      "loss_total": 1.5351340770721436,
      "lr": 0.001223564884638421,
      "router/selected_tokens_s0": 7781.0,
      "step": 530,
      "tokens_trained": 1.736536304
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.15318062548755407,
      "grad_norm": 28.232316970825195,
      "loss": 1.6637,
      "loss_ce": 1.4607714414596558,
      "loss_region": 0.025137916207313538,
      "loss_total": 1.485909342765808,
      "lr": 0.00122315799265457,
      "router/selected_tokens_s0": 612.875,
      "step": 540,
      "tokens_trained": 1.769301744
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.1560173037373236,
      "grad_norm": 23.33485221862793,
      "loss": 1.4993,
      "loss_ce": 1.4412897825241089,
      "loss_region": 0.035474810749292374,
      "loss_total": 1.4767645597457886,
      "lr": 0.001222751100670719,
      "router/selected_tokens_s0": 7357.5,
      "step": 550,
      "tokens_trained": 1.802067184
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.1588539819870931,
      "grad_norm": 21.005512237548828,
      "loss": 1.4335,
      "loss_ce": 1.3612841367721558,
      "loss_region": 0.029854778200387955,
      "loss_total": 1.3911389112472534,
      "lr": 0.001222344208686868,
      "router/selected_tokens_s0": 4172.125,
      "step": 560,
      "tokens_trained": 1.834832624
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.16169066023686263,
      "grad_norm": 19.53492546081543,
      "loss": 1.4383,
      "loss_ce": 1.4045627117156982,
      "loss_region": 0.02937491238117218,
      "loss_total": 1.433937668800354,
      "lr": 0.0012219373167030169,
      "router/selected_tokens_s0": 3881.875,
      "step": 570,
      "tokens_trained": 1.867598064
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.16452733848663215,
      "grad_norm": 25.31780242919922,
      "loss": 1.7004,
      "loss_ce": 1.591187834739685,
      "loss_region": 0.03149839863181114,
      "loss_total": 1.6226862668991089,
      "lr": 0.0012215304247191658,
      "router/selected_tokens_s0": 5153.875,
      "step": 580,
      "tokens_trained": 1.900363504
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.16736401673640167,
      "grad_norm": 16.421045303344727,
      "loss": 1.5092,
      "loss_ce": 1.2439810037612915,
      "loss_region": 0.02931862138211727,
      "loss_total": 1.2732995748519897,
      "lr": 0.0012211235327353148,
      "router/selected_tokens_s0": 3840.5,
      "step": 590,
      "tokens_trained": 1.933128944
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.17020069498617119,
      "grad_norm": 25.38547706604004,
      "loss": 1.5893,
      "loss_ce": 1.5482516288757324,
      "loss_region": 0.025499241426587105,
      "loss_total": 1.5737508535385132,
      "lr": 0.0012207166407514638,
      "router/selected_tokens_s0": 1237.25,
      "step": 600,
      "tokens_trained": 1.96589048
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.1730373732359407,
      "grad_norm": 14.48205852508545,
      "loss": 1.3098,
      "loss_ce": 1.2969579696655273,
      "loss_region": 0.03318855166435242,
      "loss_total": 1.3301465511322021,
      "lr": 0.0012203097487676127,
      "router/selected_tokens_s0": 6087.625,
      "step": 610,
      "tokens_trained": 1.99865592
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.17587405148571023,
      "grad_norm": 10.29987907409668,
      "loss": 1.2844,
      "loss_ce": 1.2728289365768433,
      "loss_region": 0.03153729811310768,
      "loss_total": 1.3043662309646606,
      "lr": 0.0012199028567837617,
      "router/selected_tokens_s0": 5177.0,
      "step": 620,
      "tokens_trained": 2.03142136
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.17871072973547975,
      "grad_norm": 14.114507675170898,
      "loss": 1.2792,
      "loss_ce": 1.2729930877685547,
      "loss_region": 0.03177153319120407,
      "loss_total": 1.3047646284103394,
      "lr": 0.0012194959647999107,
      "router/selected_tokens_s0": 5318.5,
      "step": 630,
      "tokens_trained": 2.0641868
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.18154740798524927,
      "grad_norm": 35.086570739746094,
      "loss": 1.327,
      "loss_ce": 1.4959396123886108,
      "loss_region": 0.031267955899238586,
      "loss_total": 1.527207612991333,
      "lr": 0.0012190890728160596,
      "router/selected_tokens_s0": 5018.625,
      "step": 640,
      "tokens_trained": 2.09695224
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.18438408623501878,
      "grad_norm": 12.891855239868164,
      "loss": 1.3231,
      "loss_ce": 1.251932978630066,
      "loss_region": 0.030069500207901,
      "loss_total": 1.2820024490356445,
      "lr": 0.0012186821808322086,
      "router/selected_tokens_s0": 4308.125,
      "step": 650,
      "tokens_trained": 2.12971768
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.1872207644847883,
      "grad_norm": 16.94170570373535,
      "loss": 1.273,
      "loss_ce": 1.303807258605957,
      "loss_region": 0.030183279886841774,
      "loss_total": 1.3339905738830566,
      "lr": 0.0012182752888483576,
      "router/selected_tokens_s0": 4374.375,
      "step": 660,
      "tokens_trained": 2.16248312
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.19005744273455782,
      "grad_norm": 8.820389747619629,
      "loss": 1.291,
      "loss_ce": 1.2488102912902832,
      "loss_region": 0.030493643134832382,
      "loss_total": 1.2793039083480835,
      "lr": 0.0012178683968645065,
      "router/selected_tokens_s0": 4566.875,
      "step": 670,
      "tokens_trained": 2.19524856
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.19289412098432734,
      "grad_norm": 12.072690963745117,
      "loss": 1.2551,
      "loss_ce": 1.257431149482727,
      "loss_region": 0.02906171977519989,
      "loss_total": 1.2864928245544434,
      "lr": 0.0012174615048806555,
      "router/selected_tokens_s0": 3676.75,
      "step": 680,
      "tokens_trained": 2.228014
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.19573079923409686,
      "grad_norm": 3.4100522994995117,
      "loss": 1.2685,
      "loss_ce": 1.217279314994812,
      "loss_region": 0.03290281072258949,
      "loss_total": 1.2501821517944336,
      "lr": 0.0012170546128968045,
      "router/selected_tokens_s0": 5992.0,
      "step": 690,
      "tokens_trained": 2.26077944
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.19856747748386638,
      "grad_norm": 6.675322532653809,
      "loss": 1.2504,
      "loss_ce": 1.1835153102874756,
      "loss_region": 0.031250134110450745,
      "loss_total": 1.2147654294967651,
      "lr": 0.0012166477209129534,
      "router/selected_tokens_s0": 5040.625,
      "step": 700,
      "tokens_trained": 2.29354488
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.2014041557336359,
      "grad_norm": 21.388051986694336,
      "loss": 1.267,
      "loss_ce": 1.3746044635772705,
      "loss_region": 0.027913136407732964,
      "loss_total": 1.402517557144165,
      "lr": 0.0012162408289291026,
      "router/selected_tokens_s0": 2922.75,
      "step": 710,
      "tokens_trained": 2.32631032
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.20424083398340542,
      "grad_norm": 12.917130470275879,
      "loss": 1.3025,
      "loss_ce": 1.2145620584487915,
      "loss_region": 0.031132886186242104,
      "loss_total": 1.2456949949264526,
      "lr": 0.0012158339369452516,
      "router/selected_tokens_s0": 4968.875,
      "step": 720,
      "tokens_trained": 2.35907576
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.20707751223317494,
      "grad_norm": 8.888051986694336,
      "loss": 1.2457,
      "loss_ce": 1.185524821281433,
      "loss_region": 0.03197301924228668,
      "loss_total": 1.2174978256225586,
      "lr": 0.0012154270449614005,
      "router/selected_tokens_s0": 5463.0,
      "step": 730,
      "tokens_trained": 2.3918396
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.20991419048294446,
      "grad_norm": 13.051305770874023,
      "loss": 1.2446,
      "loss_ce": 1.1078685522079468,
      "loss_region": 0.0308807585388422,
      "loss_total": 1.138749361038208,
      "lr": 0.0012150201529775495,
      "router/selected_tokens_s0": 4844.0,
      "step": 740,
      "tokens_trained": 2.424600048
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.21275086873271398,
      "grad_norm": 6.593105316162109,
      "loss": 1.2851,
      "loss_ce": 1.255039930343628,
      "loss_region": 0.029710784554481506,
      "loss_total": 1.2847506999969482,
      "lr": 0.0012146132609936982,
      "router/selected_tokens_s0": 4083.875,
      "step": 750,
      "tokens_trained": 2.457364688
    },
    {
      "comp/rl_weight": 0.03,
      "comp/strictness": 0.0,
      "epoch": 0.2155875469824835,
      "grad_norm": 3.900451183319092,
| "loss": 1.2291, | |
| "loss_ce": 1.1926592588424683, | |
| "loss_region": 0.030736476182937622, | |
| "loss_total": 1.2233957052230835, | |
| "lr": 0.0012142063690098472, | |
| "router/selected_tokens_s0": 4719.25, | |
| "step": 760, | |
| "tokens_trained": 2.490130128 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.21842422523225302, | |
| "grad_norm": 8.001019477844238, | |
| "loss": 1.2285, | |
| "loss_ce": 1.1942657232284546, | |
| "loss_region": 0.03041156381368637, | |
| "loss_total": 1.224677324295044, | |
| "lr": 0.0012137994770259962, | |
| "router/selected_tokens_s0": 4525.75, | |
| "step": 770, | |
| "tokens_trained": 2.522895568 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.22126090348202254, | |
| "grad_norm": 5.169371128082275, | |
| "loss": 1.2072, | |
| "loss_ce": 1.2079213857650757, | |
| "loss_region": 0.031087037175893784, | |
| "loss_total": 1.2390084266662598, | |
| "lr": 0.0012133925850421454, | |
| "router/selected_tokens_s0": 4938.25, | |
| "step": 780, | |
| "tokens_trained": 2.555659392 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.22409758173179206, | |
| "grad_norm": 8.434707641601562, | |
| "loss": 1.2079, | |
| "loss_ce": 1.2038490772247314, | |
| "loss_region": 0.02821769006550312, | |
| "loss_total": 1.2320667505264282, | |
| "lr": 0.0012129856930582943, | |
| "router/selected_tokens_s0": 3119.875, | |
| "step": 790, | |
| "tokens_trained": 2.588422136 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.22693425998156158, | |
| "grad_norm": 8.451072692871094, | |
| "loss": 1.2072, | |
| "loss_ce": 1.2617510557174683, | |
| "loss_region": 0.0316130593419075, | |
| "loss_total": 1.29336416721344, | |
| "lr": 0.0012125788010744433, | |
| "router/selected_tokens_s0": 5238.75, | |
| "step": 800, | |
| "tokens_trained": 2.621187576 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.2297709382313311, | |
| "grad_norm": 12.750673294067383, | |
| "loss": 1.2283, | |
| "loss_ce": 1.2528263330459595, | |
| "loss_region": 0.03109751269221306, | |
| "loss_total": 1.283923864364624, | |
| "lr": 0.0012121719090905923, | |
| "router/selected_tokens_s0": 4940.75, | |
| "step": 810, | |
| "tokens_trained": 2.653953016 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.23260761648110062, | |
| "grad_norm": 10.307655334472656, | |
| "loss": 1.2544, | |
| "loss_ce": 1.2496147155761719, | |
| "loss_region": 0.02913491614162922, | |
| "loss_total": 1.2787495851516724, | |
| "lr": 0.0012117650171067412, | |
| "router/selected_tokens_s0": 3717.75, | |
| "step": 820, | |
| "tokens_trained": 2.686718456 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.23544429473087014, | |
| "grad_norm": 0.6592714190483093, | |
| "loss": 1.2022, | |
| "loss_ce": 1.0889158248901367, | |
| "loss_region": 0.031184613704681396, | |
| "loss_total": 1.120100498199463, | |
| "lr": 0.0012113581251228902, | |
| "router/selected_tokens_s0": 5037.375, | |
| "step": 830, | |
| "tokens_trained": 2.71948036 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.23828097298063966, | |
| "grad_norm": 3.0865817070007324, | |
| "loss": 1.1958, | |
| "loss_ce": 1.267112374305725, | |
| "loss_region": 0.02916303649544716, | |
| "loss_total": 1.2962753772735596, | |
| "lr": 0.0012109512331390391, | |
| "router/selected_tokens_s0": 3734.375, | |
| "step": 840, | |
| "tokens_trained": 2.7522458 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.24111765123040918, | |
| "grad_norm": 2.513849973678589, | |
| "loss": 1.2014, | |
| "loss_ce": 1.108485221862793, | |
| "loss_region": 0.0302209984511137, | |
| "loss_total": 1.1387062072753906, | |
| "lr": 0.0012105443411551881, | |
| "router/selected_tokens_s0": 4417.125, | |
| "step": 850, | |
| "tokens_trained": 2.78501124 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.2439543294801787, | |
| "grad_norm": 5.594594478607178, | |
| "loss": 1.206, | |
| "loss_ce": 1.1815146207809448, | |
| "loss_region": 0.031508028507232666, | |
| "loss_total": 1.2130227088928223, | |
| "lr": 0.001210137449171337, | |
| "router/selected_tokens_s0": 5212.875, | |
| "step": 860, | |
| "tokens_trained": 2.81777668 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.24679100772994822, | |
| "grad_norm": 2.2655980587005615, | |
| "loss": 1.1897, | |
| "loss_ce": 1.2304372787475586, | |
| "loss_region": 0.031548820436000824, | |
| "loss_total": 1.2619861364364624, | |
| "lr": 0.001209730557187486, | |
| "router/selected_tokens_s0": 5213.25, | |
| "step": 870, | |
| "tokens_trained": 2.85054212 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.24962768597971774, | |
| "grad_norm": 4.335860252380371, | |
| "loss": 1.1897, | |
| "loss_ce": 1.2337130308151245, | |
| "loss_region": 0.02997858263552189, | |
| "loss_total": 1.2636916637420654, | |
| "lr": 0.001209323665203635, | |
| "router/selected_tokens_s0": 4252.5, | |
| "step": 880, | |
| "tokens_trained": 2.88330756 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.25246436422948726, | |
| "grad_norm": 12.377155303955078, | |
| "loss": 1.1966, | |
| "loss_ce": 1.1369762420654297, | |
| "loss_region": 0.029613491147756577, | |
| "loss_total": 1.1665897369384766, | |
| "lr": 0.001208916773219784, | |
| "router/selected_tokens_s0": 4027.75, | |
| "step": 890, | |
| "tokens_trained": 2.916073 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.2553010424792568, | |
| "grad_norm": 7.238094806671143, | |
| "loss": 1.2143, | |
| "loss_ce": 1.1700671911239624, | |
| "loss_region": 0.029774101451039314, | |
| "loss_total": 1.1998412609100342, | |
| "lr": 0.001208509881235933, | |
| "router/selected_tokens_s0": 4116.875, | |
| "step": 900, | |
| "tokens_trained": 2.94883828 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.2581377207290263, | |
| "grad_norm": 3.2694191932678223, | |
| "loss": 1.1892, | |
| "loss_ce": 1.1454379558563232, | |
| "loss_region": 0.029824109748005867, | |
| "loss_total": 1.1752620935440063, | |
| "lr": 0.001208102989252082, | |
| "router/selected_tokens_s0": 4152.625, | |
| "step": 910, | |
| "tokens_trained": 2.981597288 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.2609743989787958, | |
| "grad_norm": 9.457625389099121, | |
| "loss": 1.2038, | |
| "loss_ce": 1.3160332441329956, | |
| "loss_region": 0.030873605981469154, | |
| "loss_total": 1.3469069004058838, | |
| "lr": 0.0012076960972682309, | |
| "router/selected_tokens_s0": 4797.5, | |
| "step": 920, | |
| "tokens_trained": 3.014362456 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.26381107722856534, | |
| "grad_norm": 4.293655872344971, | |
| "loss": 1.1978, | |
| "loss_ce": 1.1440948247909546, | |
| "loss_region": 0.02935035713016987, | |
| "loss_total": 1.173445224761963, | |
| "lr": 0.0012072892052843798, | |
| "router/selected_tokens_s0": 3829.5, | |
| "step": 930, | |
| "tokens_trained": 3.047127096 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.26664775547833486, | |
| "grad_norm": 1.7136532068252563, | |
| "loss": 1.1906, | |
| "loss_ce": 1.1432236433029175, | |
| "loss_region": 0.028851088136434555, | |
| "loss_total": 1.1720746755599976, | |
| "lr": 0.0012068823133005288, | |
| "router/selected_tokens_s0": 3479.125, | |
| "step": 940, | |
| "tokens_trained": 3.079892536 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.2694844337281044, | |
| "grad_norm": 4.0433244705200195, | |
| "loss": 1.1868, | |
| "loss_ce": 1.168936014175415, | |
| "loss_region": 0.02876598760485649, | |
| "loss_total": 1.1977020502090454, | |
| "lr": 0.0012064754213166778, | |
| "router/selected_tokens_s0": 3396.25, | |
| "step": 950, | |
| "tokens_trained": 3.11265336 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.2723211119778739, | |
| "grad_norm": 6.829047203063965, | |
| "loss": 1.1828, | |
| "loss_ce": 1.2480430603027344, | |
| "loss_region": 0.02934931591153145, | |
| "loss_total": 1.2773923873901367, | |
| "lr": 0.001206068529332827, | |
| "router/selected_tokens_s0": 3843.75, | |
| "step": 960, | |
| "tokens_trained": 3.1454188 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.2751577902276434, | |
| "grad_norm": 5.5668439865112305, | |
| "loss": 1.1882, | |
| "loss_ce": 1.1349202394485474, | |
| "loss_region": 0.0297370757907629, | |
| "loss_total": 1.1646573543548584, | |
| "lr": 0.001205661637348976, | |
| "router/selected_tokens_s0": 4102.5, | |
| "step": 970, | |
| "tokens_trained": 3.17818424 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.27799446847741294, | |
| "grad_norm": 3.729381561279297, | |
| "loss": 1.1839, | |
| "loss_ce": 1.1995916366577148, | |
| "loss_region": 0.03041483648121357, | |
| "loss_total": 1.230006456375122, | |
| "lr": 0.0012052547453651249, | |
| "router/selected_tokens_s0": 4537.125, | |
| "step": 980, | |
| "tokens_trained": 3.21094968 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.28083114672718246, | |
| "grad_norm": 2.7978885173797607, | |
| "loss": 1.1739, | |
| "loss_ce": 1.1886447668075562, | |
| "loss_region": 0.030223874375224113, | |
| "loss_total": 1.218868613243103, | |
| "lr": 0.0012048478533812738, | |
| "router/selected_tokens_s0": 4418.875, | |
| "step": 990, | |
| "tokens_trained": 3.24371512 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.283667824976952, | |
| "grad_norm": 2.7768421173095703, | |
| "loss": 1.1695, | |
| "loss_ce": 1.1791244745254517, | |
| "loss_region": 0.03016069531440735, | |
| "loss_total": 1.2092851400375366, | |
| "lr": 0.0012044409613974226, | |
| "router/selected_tokens_s0": 4373.0, | |
| "step": 1000, | |
| "tokens_trained": 3.27648056 | |
| }, | |
| { | |
| "epoch": 0.283667824976952, | |
| "eval_ppl": 3.1457362037176693, | |
| "eval_runtime": 2.5704, | |
| "step": 1000, | |
| "tokens_trained": 3.27648056 | |
| }, | |
| { | |
| "epoch": 0.283667824976952, | |
| "eval_F": 0.35905403615092213, | |
| "eval_F_cds": 0.3614752043728926, | |
| "eval_F_dig": 0.36203349219991143, | |
| "eval_F_exon": 0.3609332242502892, | |
| "eval_F_intron": 0.3608845011093654, | |
| "eval_F_nig": 0.36360427639485304, | |
| "eval_F_promoter": 0.3446594753609168, | |
| "eval_F_utr": 0.35993294503032014, | |
| "eval_G": 0.4747950002316863, | |
| "eval_G_cds": 0.4875693056072159, | |
| "eval_G_dig": 0.4165539971384483, | |
| "eval_G_exon": 0.4825983323253731, | |
| "eval_G_intron": 0.4746974505122046, | |
| "eval_G_nig": 0.4719204972271849, | |
| "eval_G_promoter": 0.47860970096474814, | |
| "eval_G_utr": 0.4806883865646302, | |
| "eval_avg_bp_per_token": 2.785096111772066, | |
| "eval_bp_per_token/cds": 2.7664414817466, | |
| "eval_bp_per_token/dig": 2.7621753830659665, | |
| "eval_bp_per_token/exon": 2.770595591683602, | |
| "eval_bp_per_token/intron": 2.7709696507497057, | |
| "eval_bp_per_token/nig": 2.7502426811780905, | |
| "eval_bp_per_token/promoter": 2.90141450181467, | |
| "eval_bp_per_token/utr": 2.77829527362593, | |
| "eval_ppl_cds": 3.7937951600140427, | |
| "eval_ppl_dig": 1.292568207392483, | |
| "eval_ppl_exon": 3.5063285971819904, | |
| "eval_ppl_intron": 3.1623742022954864, | |
| "eval_ppl_nig": 3.03123217862896, | |
| "eval_ppl_promoter": 3.420873133996253, | |
| "eval_ppl_utr": 3.4079030610184535, | |
| "step": 1000, | |
| "tokens_trained": 3.27648056 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.2865045032267215, | |
| "grad_norm": 1.750190258026123, | |
| "loss": 1.1681, | |
| "loss_ce": 1.1951100826263428, | |
| "loss_region": 0.029561972245573997, | |
| "loss_total": 1.2246720790863037, | |
| "lr": 0.0012040340694135716, | |
| "router/selected_tokens_s0": 3974.5, | |
| "step": 1010, | |
| "tokens_trained": 3.309246 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.289341181476491, | |
| "grad_norm": 5.037286758422852, | |
| "loss": 1.1855, | |
| "loss_ce": 1.1606330871582031, | |
| "loss_region": 0.030172061175107956, | |
| "loss_total": 1.190805196762085, | |
| "lr": 0.0012036271774297205, | |
| "router/selected_tokens_s0": 4388.375, | |
| "step": 1020, | |
| "tokens_trained": 3.34201144 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.29217785972626054, | |
| "grad_norm": 5.963747024536133, | |
| "loss": 1.1794, | |
| "loss_ce": 1.116599678993225, | |
| "loss_region": 0.030543407425284386, | |
| "loss_total": 1.1471431255340576, | |
| "lr": 0.0012032202854458697, | |
| "router/selected_tokens_s0": 4640.0, | |
| "step": 1030, | |
| "tokens_trained": 3.37477688 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.29501453797603006, | |
| "grad_norm": 4.626336574554443, | |
| "loss": 1.1934, | |
| "loss_ce": 1.094927430152893, | |
| "loss_region": 0.02999301068484783, | |
| "loss_total": 1.1249204874038696, | |
| "lr": 0.0012028133934620187, | |
| "router/selected_tokens_s0": 4248.5, | |
| "step": 1040, | |
| "tokens_trained": 3.40754232 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.2978512162257996, | |
| "grad_norm": 4.208251476287842, | |
| "loss": 1.1843, | |
| "loss_ce": 1.1818771362304688, | |
| "loss_region": 0.030715491622686386, | |
| "loss_total": 1.212592601776123, | |
| "lr": 0.0012024065014781676, | |
| "router/selected_tokens_s0": 4729.75, | |
| "step": 1050, | |
| "tokens_trained": 3.44030696 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.3006878944755691, | |
| "grad_norm": 2.3673582077026367, | |
| "loss": 1.1726, | |
| "loss_ce": 1.1216882467269897, | |
| "loss_region": 0.030366381630301476, | |
| "loss_total": 1.1520546674728394, | |
| "lr": 0.0012019996094943166, | |
| "router/selected_tokens_s0": 4503.625, | |
| "step": 1060, | |
| "tokens_trained": 3.4730724 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.3035245727253386, | |
| "grad_norm": 2.6513352394104004, | |
| "loss": 1.1707, | |
| "loss_ce": 1.1285063028335571, | |
| "loss_region": 0.02974226139485836, | |
| "loss_total": 1.1582485437393188, | |
| "lr": 0.0012015927175104656, | |
| "router/selected_tokens_s0": 4085.375, | |
| "step": 1070, | |
| "tokens_trained": 3.50583784 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.30636125097510813, | |
| "grad_norm": 1.0276976823806763, | |
| "loss": 1.165, | |
| "loss_ce": 1.1330546140670776, | |
| "loss_region": 0.029834387823939323, | |
| "loss_total": 1.162889003753662, | |
| "lr": 0.0012011858255266145, | |
| "router/selected_tokens_s0": 4155.5, | |
| "step": 1080, | |
| "tokens_trained": 3.53860328 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.30919792922487765, | |
| "grad_norm": 3.4352457523345947, | |
| "loss": 1.1759, | |
| "loss_ce": 1.153834581375122, | |
| "loss_region": 0.030001970008015633, | |
| "loss_total": 1.183836579322815, | |
| "lr": 0.0012007789335427635, | |
| "router/selected_tokens_s0": 4271.375, | |
| "step": 1090, | |
| "tokens_trained": 3.57136872 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.3120346074746472, | |
| "grad_norm": 3.4334914684295654, | |
| "loss": 1.1668, | |
| "loss_ce": 1.0656555891036987, | |
| "loss_region": 0.03014238551259041, | |
| "loss_total": 1.0957980155944824, | |
| "lr": 0.0012003720415589125, | |
| "router/selected_tokens_s0": 4376.625, | |
| "step": 1100, | |
| "tokens_trained": 3.60413416 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.3148712857244167, | |
| "grad_norm": 7.573620796203613, | |
| "loss": 1.1737, | |
| "loss_ce": 1.1206940412521362, | |
| "loss_region": 0.030071774497628212, | |
| "loss_total": 1.1507657766342163, | |
| "lr": 0.0011999651495750614, | |
| "router/selected_tokens_s0": 4325.0, | |
| "step": 1110, | |
| "tokens_trained": 3.6368996 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.3177079639741862, | |
| "grad_norm": 4.200015544891357, | |
| "loss": 1.1705, | |
| "loss_ce": 1.1700469255447388, | |
| "loss_region": 0.02990192547440529, | |
| "loss_total": 1.1999489068984985, | |
| "lr": 0.0011995582575912104, | |
| "router/selected_tokens_s0": 4194.25, | |
| "step": 1120, | |
| "tokens_trained": 3.669661712 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.32054464222395573, | |
| "grad_norm": 5.207011699676514, | |
| "loss": 1.1668, | |
| "loss_ce": 1.1708717346191406, | |
| "loss_region": 0.029880443587899208, | |
| "loss_total": 1.2007521390914917, | |
| "lr": 0.0011991513656073594, | |
| "router/selected_tokens_s0": 4177.25, | |
| "step": 1130, | |
| "tokens_trained": 3.702426352 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.32338132047372525, | |
| "grad_norm": 4.160227298736572, | |
| "loss": 1.1671, | |
| "loss_ce": 1.1502091884613037, | |
| "loss_region": 0.030087152495980263, | |
| "loss_total": 1.1802963018417358, | |
| "lr": 0.0011987444736235083, | |
| "router/selected_tokens_s0": 4325.25, | |
| "step": 1140, | |
| "tokens_trained": 3.735191792 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.3262179987234948, | |
| "grad_norm": 2.3496572971343994, | |
| "loss": 1.1578, | |
| "loss_ce": 1.0942906141281128, | |
| "loss_region": 0.02960728108882904, | |
| "loss_total": 1.123897910118103, | |
| "lr": 0.0011983375816396573, | |
| "router/selected_tokens_s0": 3976.25, | |
| "step": 1150, | |
| "tokens_trained": 3.767957232 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.3290546769732643, | |
| "grad_norm": 3.0820891857147217, | |
| "loss": 1.158, | |
| "loss_ce": 1.2191810607910156, | |
| "loss_region": 0.030029961839318275, | |
| "loss_total": 1.249211072921753, | |
| "lr": 0.0011979306896558062, | |
| "router/selected_tokens_s0": 4285.125, | |
| "step": 1160, | |
| "tokens_trained": 3.800722672 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.3318913552230338, | |
| "grad_norm": 1.7340823411941528, | |
| "loss": 1.1537, | |
| "loss_ce": 1.0748310089111328, | |
| "loss_region": 0.030402792617678642, | |
| "loss_total": 1.1052337884902954, | |
| "lr": 0.0011975237976719552, | |
| "router/selected_tokens_s0": 4566.375, | |
| "step": 1170, | |
| "tokens_trained": 3.833488112 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.33472803347280333, | |
| "grad_norm": 1.6883597373962402, | |
| "loss": 1.1524, | |
| "loss_ce": 1.15337073802948, | |
| "loss_region": 0.029628688469529152, | |
| "loss_total": 1.1829993724822998, | |
| "lr": 0.0011971169056881042, | |
| "router/selected_tokens_s0": 3994.125, | |
| "step": 1180, | |
| "tokens_trained": 3.866252752 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.33756471172257285, | |
| "grad_norm": 1.3079456090927124, | |
| "loss": 1.155, | |
| "loss_ce": 1.147839903831482, | |
| "loss_region": 0.029972558841109276, | |
| "loss_total": 1.1778124570846558, | |
| "lr": 0.0011967100137042531, | |
| "router/selected_tokens_s0": 4250.125, | |
| "step": 1190, | |
| "tokens_trained": 3.899018184 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.34040138997234237, | |
| "grad_norm": 2.042187452316284, | |
| "loss": 1.1551, | |
| "loss_ce": 1.1045622825622559, | |
| "loss_region": 0.030126892030239105, | |
| "loss_total": 1.134689211845398, | |
| "lr": 0.0011963031217204021, | |
| "router/selected_tokens_s0": 4366.25, | |
| "step": 1200, | |
| "tokens_trained": 3.931783624 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.3432380682221119, | |
| "grad_norm": 0.5720299482345581, | |
| "loss": 1.1514, | |
| "loss_ce": 1.1252881288528442, | |
| "loss_region": 0.02972925268113613, | |
| "loss_total": 1.155017375946045, | |
| "lr": 0.0011958962297365513, | |
| "router/selected_tokens_s0": 4055.0, | |
| "step": 1210, | |
| "tokens_trained": 3.964549064 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.3460747464718814, | |
| "grad_norm": 2.726912498474121, | |
| "loss": 1.1481, | |
| "loss_ce": 1.0980409383773804, | |
| "loss_region": 0.030369114130735397, | |
| "loss_total": 1.1284101009368896, | |
| "lr": 0.0011954893377527003, | |
| "router/selected_tokens_s0": 4549.75, | |
| "step": 1220, | |
| "tokens_trained": 3.997311912 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.34891142472165093, | |
| "grad_norm": 1.576530933380127, | |
| "loss": 1.1547, | |
| "loss_ce": 1.1488255262374878, | |
| "loss_region": 0.03008064441382885, | |
| "loss_total": 1.1789062023162842, | |
| "lr": 0.0011950824457688492, | |
| "router/selected_tokens_s0": 4327.125, | |
| "step": 1230, | |
| "tokens_trained": 4.030077352 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.35174810297142045, | |
| "grad_norm": 1.7633917331695557, | |
| "loss": 1.1491, | |
| "loss_ce": 1.0437774658203125, | |
| "loss_region": 0.03009728156030178, | |
| "loss_total": 1.0738747119903564, | |
| "lr": 0.0011946755537849982, | |
| "router/selected_tokens_s0": 4352.5, | |
| "step": 1240, | |
| "tokens_trained": 4.062842792 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.35458478122118997, | |
| "grad_norm": 0.8599131107330322, | |
| "loss": 1.1502, | |
| "loss_ce": 1.1635342836380005, | |
| "loss_region": 0.030227093026041985, | |
| "loss_total": 1.1937613487243652, | |
| "lr": 0.001194268661801147, | |
| "router/selected_tokens_s0": 4437.875, | |
| "step": 1250, | |
| "tokens_trained": 4.095608232 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.3574214594709595, | |
| "grad_norm": 2.0207033157348633, | |
| "loss": 1.1525, | |
| "loss_ce": 1.161281943321228, | |
| "loss_region": 0.02980414777994156, | |
| "loss_total": 1.1910860538482666, | |
| "lr": 0.001193861769817296, | |
| "router/selected_tokens_s0": 4113.375, | |
| "step": 1260, | |
| "tokens_trained": 4.128373672 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.360258137720729, | |
| "grad_norm": 1.6762081384658813, | |
| "loss": 1.1549, | |
| "loss_ce": 1.176638126373291, | |
| "loss_region": 0.02979988045990467, | |
| "loss_total": 1.2064380645751953, | |
| "lr": 0.0011934548778334449, | |
| "router/selected_tokens_s0": 4110.375, | |
| "step": 1270, | |
| "tokens_trained": 4.161136768 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.36309481597049853, | |
| "grad_norm": 1.5674160718917847, | |
| "loss": 1.1538, | |
| "loss_ce": 1.1160061359405518, | |
| "loss_region": 0.029819507151842117, | |
| "loss_total": 1.1458256244659424, | |
| "lr": 0.001193047985849594, | |
| "router/selected_tokens_s0": 4122.75, | |
| "step": 1280, | |
| "tokens_trained": 4.193902208 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.36593149422026805, | |
| "grad_norm": 1.232892394065857, | |
| "loss": 1.1499, | |
| "loss_ce": 1.192215085029602, | |
| "loss_region": 0.030095556750893593, | |
| "loss_total": 1.2223106622695923, | |
| "lr": 0.001192641093865743, | |
| "router/selected_tokens_s0": 4337.75, | |
| "step": 1290, | |
| "tokens_trained": 4.226667648 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.36876817247003757, | |
| "grad_norm": 1.280081033706665, | |
| "loss": 1.1625, | |
| "loss_ce": 1.0769988298416138, | |
| "loss_region": 0.030076846480369568, | |
| "loss_total": 1.1070756912231445, | |
| "lr": 0.001192234201881892, | |
| "router/selected_tokens_s0": 4330.625, | |
| "step": 1300, | |
| "tokens_trained": 4.259424272 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.3716048507198071, | |
| "grad_norm": 0.7819789052009583, | |
| "loss": 1.1516, | |
| "loss_ce": 1.0531295537948608, | |
| "loss_region": 0.029812535271048546, | |
| "loss_total": 1.0829421281814575, | |
| "lr": 0.001191827309898041, | |
| "router/selected_tokens_s0": 4107.75, | |
| "step": 1310, | |
| "tokens_trained": 4.292189712 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.3744415289695766, | |
| "grad_norm": 4.3887505531311035, | |
| "loss": 1.1524, | |
| "loss_ce": 1.0992565155029297, | |
| "loss_region": 0.030015140771865845, | |
| "loss_total": 1.1292716264724731, | |
| "lr": 0.00119142041791419, | |
| "router/selected_tokens_s0": 4279.625, | |
| "step": 1320, | |
| "tokens_trained": 4.32495164 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.37727820721934613, | |
| "grad_norm": 2.5429630279541016, | |
| "loss": 1.1622, | |
| "loss_ce": 0.9915607571601868, | |
| "loss_region": 0.02960185892879963, | |
| "loss_total": 1.0211626291275024, | |
| "lr": 0.0011910135259303389, | |
| "router/selected_tokens_s0": 3922.75, | |
| "step": 1330, | |
| "tokens_trained": 4.35771708 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.38011488546911565, | |
| "grad_norm": 1.3790112733840942, | |
| "loss": 1.1526, | |
| "loss_ce": 1.2076722383499146, | |
| "loss_region": 0.029480615630745888, | |
| "loss_total": 1.2371528148651123, | |
| "lr": 0.0011906066339464878, | |
| "router/selected_tokens_s0": 3831.5, | |
| "step": 1340, | |
| "tokens_trained": 4.39048252 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.38295156371888517, | |
| "grad_norm": 3.28352427482605, | |
| "loss": 1.1523, | |
| "loss_ce": 0.9999480247497559, | |
| "loss_region": 0.02995798923075199, | |
| "loss_total": 1.0299060344696045, | |
| "lr": 0.0011901997419626368, | |
| "router/selected_tokens_s0": 4236.0, | |
| "step": 1350, | |
| "tokens_trained": 4.42324796 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.3857882419686547, | |
| "grad_norm": 2.173388719558716, | |
| "loss": 1.1469, | |
| "loss_ce": 1.1173208951950073, | |
| "loss_region": 0.030063528567552567, | |
| "loss_total": 1.1473844051361084, | |
| "lr": 0.0011897928499787858, | |
| "router/selected_tokens_s0": 4322.25, | |
| "step": 1360, | |
| "tokens_trained": 4.4560134 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.3886249202184242, | |
| "grad_norm": 1.3337340354919434, | |
| "loss": 1.1514, | |
| "loss_ce": 1.097347617149353, | |
| "loss_region": 0.030277268961071968, | |
| "loss_total": 1.1276248693466187, | |
| "lr": 0.0011893859579949347, | |
| "router/selected_tokens_s0": 4490.375, | |
| "step": 1370, | |
| "tokens_trained": 4.48877884 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.3914615984681937, | |
| "grad_norm": 1.5072178840637207, | |
| "loss": 1.1454, | |
| "loss_ce": 1.1354695558547974, | |
| "loss_region": 0.0300710741430521, | |
| "loss_total": 1.1655405759811401, | |
| "lr": 0.0011889790660110837, | |
| "router/selected_tokens_s0": 4323.125, | |
| "step": 1380, | |
| "tokens_trained": 4.52154428 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.39429827671796325, | |
| "grad_norm": 1.4634846448898315, | |
| "loss": 1.1434, | |
| "loss_ce": 1.1472464799880981, | |
| "loss_region": 0.029943954199552536, | |
| "loss_total": 1.1771904230117798, | |
| "lr": 0.0011885721740272327, | |
| "router/selected_tokens_s0": 4222.625, | |
| "step": 1390, | |
| "tokens_trained": 4.55430972 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.39713495496773277, | |
| "grad_norm": 1.1301681995391846, | |
| "loss": 1.1491, | |
| "loss_ce": 0.932141900062561, | |
| "loss_region": 0.03013395331799984, | |
| "loss_total": 0.9622758626937866, | |
| "lr": 0.0011881652820433816, | |
| "router/selected_tokens_s0": 4389.125, | |
| "step": 1400, | |
| "tokens_trained": 4.58707516 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.3999716332175023, | |
| "grad_norm": 1.153057336807251, | |
| "loss": 1.1483, | |
| "loss_ce": 1.0930418968200684, | |
| "loss_region": 0.029886895790696144, | |
| "loss_total": 1.1229287385940552, | |
| "lr": 0.0011877583900595306, | |
| "router/selected_tokens_s0": 4177.875, | |
| "step": 1410, | |
| "tokens_trained": 4.6198406 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.4028083114672718, | |
| "grad_norm": 2.0346107482910156, | |
| "loss": 1.1355, | |
| "loss_ce": 1.130191683769226, | |
| "loss_region": 0.030217666178941727, | |
| "loss_total": 1.1604093313217163, | |
| "lr": 0.0011873514980756796, | |
| "router/selected_tokens_s0": 4435.0, | |
| "step": 1420, | |
| "tokens_trained": 4.652606024 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.4056449897170413, | |
| "grad_norm": 1.2362136840820312, | |
| "loss": 1.1461, | |
| "loss_ce": 1.1180355548858643, | |
| "loss_region": 0.029944026842713356, | |
| "loss_total": 1.1479796171188354, | |
| "lr": 0.0011869446060918285, | |
| "router/selected_tokens_s0": 4219.25, | |
| "step": 1430, | |
| "tokens_trained": 4.685371464 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.40848166796681085, | |
| "grad_norm": 1.6414567232131958, | |
| "loss": 1.1476, | |
| "loss_ce": 1.1310675144195557, | |
| "loss_region": 0.030178584158420563, | |
| "loss_total": 1.1612460613250732, | |
| "lr": 0.0011865377141079775, | |
| "router/selected_tokens_s0": 4406.125, | |
| "step": 1440, | |
| "tokens_trained": 4.718136904 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.41131834621658037, | |
| "grad_norm": 0.8733806014060974, | |
| "loss": 1.1452, | |
| "loss_ce": 1.1529111862182617, | |
| "loss_region": 0.029908571392297745, | |
| "loss_total": 1.1828197240829468, | |
| "lr": 0.0011861308221241265, | |
| "router/selected_tokens_s0": 4186.5, | |
| "step": 1450, | |
| "tokens_trained": 4.750902344 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.4141550244663499, | |
| "grad_norm": 2.170149087905884, | |
| "loss": 1.1364, | |
| "loss_ce": 1.1446956396102905, | |
| "loss_region": 0.030016543343663216, | |
| "loss_total": 1.1747121810913086, | |
| "lr": 0.0011857239301402756, | |
| "router/selected_tokens_s0": 4279.125, | |
| "step": 1460, | |
| "tokens_trained": 4.783666984 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.4169917027161194, | |
| "grad_norm": 1.5901942253112793, | |
| "loss": 1.1418, | |
| "loss_ce": 1.1736469268798828, | |
| "loss_region": 0.02991572767496109, | |
| "loss_total": 1.203562617301941, | |
| "lr": 0.0011853170381564246, | |
| "router/selected_tokens_s0": 4190.375, | |
| "step": 1470, | |
| "tokens_trained": 4.816432424 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.4198283809658889, | |
| "grad_norm": 0.7801039814949036, | |
| "loss": 1.1359, | |
| "loss_ce": 1.0415936708450317, | |
| "loss_region": 0.030063536018133163, | |
| "loss_total": 1.0716571807861328, | |
| "lr": 0.0011849101461725736, | |
| "router/selected_tokens_s0": 4323.5, | |
| "step": 1480, | |
| "tokens_trained": 4.849197864 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.42266505921565845, | |
| "grad_norm": 1.1225630044937134, | |
| "loss": 1.1387, | |
| "loss_ce": 1.1764026880264282, | |
| "loss_region": 0.02989169955253601, | |
| "loss_total": 1.2062944173812866, | |
| "lr": 0.0011845032541887225, | |
| "router/selected_tokens_s0": 4166.375, | |
| "step": 1490, | |
| "tokens_trained": 4.881963248 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.42550173746542796, | |
| "grad_norm": 1.3516196012496948, | |
| "loss": 1.1445, | |
| "loss_ce": 1.1119225025177002, | |
| "loss_region": 0.03007156029343605, | |
| "loss_total": 1.1419941186904907, | |
| "lr": 0.0011840963622048713, | |
| "router/selected_tokens_s0": 4332.625, | |
| "step": 1500, | |
| "tokens_trained": 4.914728608 | |
| }, | |
| { | |
| "epoch": 0.42550173746542796, | |
| "eval_ppl": 3.0476700462359805, | |
| "eval_runtime": 2.5167, | |
| "step": 1500, | |
| "tokens_trained": 4.914728608 | |
| }, | |
| { | |
| "epoch": 0.42550173746542796, | |
| "eval_F": 0.3395766737890528, | |
| "eval_F_cds": 0.33560010026602843, | |
| "eval_F_dig": 0.34591244107612573, | |
| "eval_F_exon": 0.33732050667193275, | |
| "eval_F_intron": 0.340589821591843, | |
| "eval_F_nig": 0.3449097161371641, | |
| "eval_F_promoter": 0.3287406377406758, | |
| "eval_F_utr": 0.33810586816514, | |
| "eval_G": 0.4388793285567115, | |
| "eval_G_cds": 0.4465895620992391, | |
| "eval_G_dig": 0.39567722372516084, | |
| "eval_G_exon": 0.44327135296181625, | |
| "eval_G_intron": 0.4386635275964277, | |
| "eval_G_nig": 0.4373593879668909, | |
| "eval_G_promoter": 0.44171817290159177, | |
| "eval_G_utr": 0.44355779628952524, | |
| "eval_avg_bp_per_token": 2.944843027178028, | |
| "eval_bp_per_token/cds": 2.9797368928296066, | |
| "eval_bp_per_token/dig": 2.8909049841891283, | |
| "eval_bp_per_token/exon": 2.9645396002341724, | |
| "eval_bp_per_token/intron": 2.93608304360423, | |
| "eval_bp_per_token/nig": 2.8993094517590188, | |
| "eval_bp_per_token/promoter": 3.0419117237000717, | |
| "eval_bp_per_token/utr": 2.9576534871366778, | |
| "eval_ppl_cds": 3.7328596405663, | |
| "eval_ppl_dig": 1.1534605141350962, | |
| "eval_ppl_exon": 3.4439528933373436, | |
| "eval_ppl_intron": 3.0653985302604827, | |
| "eval_ppl_nig": 2.904936687189015, | |
| "eval_ppl_promoter": 3.3618258190318606, | |
| "eval_ppl_utr": 3.3512748939063846, | |
| "step": 1500, | |
| "tokens_trained": 4.914728608 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.4283384157151975, | |
| "grad_norm": 1.0354516506195068, | |
| "loss": 1.1407, | |
| "loss_ce": 1.2179700136184692, | |
| "loss_region": 0.029973506927490234, | |
| "loss_total": 1.2479435205459595, | |
| "lr": 0.0011836894702210202, | |
| "router/selected_tokens_s0": 4242.375, | |
| "step": 1510, | |
| "tokens_trained": 4.947494048 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.431175093964967, | |
| "grad_norm": 0.9974690675735474, | |
| "loss": 1.1361, | |
| "loss_ce": 1.1464780569076538, | |
| "loss_region": 0.03020160086452961, | |
| "loss_total": 1.1766796112060547, | |
| "lr": 0.0011832825782371692, | |
| "router/selected_tokens_s0": 4443.875, | |
| "step": 1520, | |
| "tokens_trained": 4.980259488 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.4340117722147365, | |
| "grad_norm": 1.61404550075531, | |
| "loss": 1.1383, | |
| "loss_ce": 1.1023921966552734, | |
| "loss_region": 0.029910210520029068, | |
| "loss_total": 1.1323024034500122, | |
| "lr": 0.0011828756862533184, | |
| "router/selected_tokens_s0": 4174.25, | |
| "step": 1530, | |
| "tokens_trained": 5.013024928 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.43684845046450604, | |
| "grad_norm": 1.551711082458496, | |
| "loss": 1.1369, | |
| "loss_ce": 1.085469365119934, | |
| "loss_region": 0.02990012802183628, | |
| "loss_total": 1.115369439125061, | |
| "lr": 0.0011824687942694674, | |
| "router/selected_tokens_s0": 4162.25, | |
| "step": 1540, | |
| "tokens_trained": 5.04578704 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.43968512871427556, | |
| "grad_norm": 1.3328109979629517, | |
| "loss": 1.1358, | |
| "loss_ce": 1.1522539854049683, | |
| "loss_region": 0.02980169840157032, | |
| "loss_total": 1.1820557117462158, | |
| "lr": 0.0011820619022856163, | |
| "router/selected_tokens_s0": 4050.75, | |
| "step": 1550, | |
| "tokens_trained": 5.078551904 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.4425218069640451, | |
| "grad_norm": 2.2517945766448975, | |
| "loss": 1.1398, | |
| "loss_ce": 1.0304194688796997, | |
| "loss_region": 0.030139248818159103, | |
| "loss_total": 1.0605586767196655, | |
| "lr": 0.0011816550103017653, | |
| "router/selected_tokens_s0": 4399.625, | |
| "step": 1560, | |
| "tokens_trained": 5.111317344 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.4453584852138146, | |
| "grad_norm": 1.0419440269470215, | |
| "loss": 1.1423, | |
| "loss_ce": 1.2029235363006592, | |
| "loss_region": 0.029878782108426094, | |
| "loss_total": 1.2328022718429565, | |
| "lr": 0.0011812481183179143, | |
| "router/selected_tokens_s0": 4131.75, | |
| "step": 1570, | |
| "tokens_trained": 5.144082784 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.4481951634635841, | |
| "grad_norm": 0.8405026197433472, | |
| "loss": 1.1357, | |
| "loss_ce": 1.1085268259048462, | |
| "loss_region": 0.02992934361100197, | |
| "loss_total": 1.1384562253952026, | |
| "lr": 0.0011808412263340632, | |
| "router/selected_tokens_s0": 4185.75, | |
| "step": 1580, | |
| "tokens_trained": 5.176848224 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.45103184171335364, | |
| "grad_norm": 1.8782676458358765, | |
| "loss": 1.1447, | |
| "loss_ce": 1.0933234691619873, | |
| "loss_region": 0.030135583132505417, | |
| "loss_total": 1.1234591007232666, | |
| "lr": 0.0011804343343502122, | |
| "router/selected_tokens_s0": 4400.5, | |
| "step": 1590, | |
| "tokens_trained": 5.209613664 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.45386851996312316, | |
| "grad_norm": 1.116540551185608, | |
| "loss": 1.1417, | |
| "loss_ce": 1.1890523433685303, | |
| "loss_region": 0.0303688682615757, | |
| "loss_total": 1.2194212675094604, | |
| "lr": 0.0011800274423663611, | |
| "router/selected_tokens_s0": 4597.375, | |
| "step": 1600, | |
| "tokens_trained": 5.242378304 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.4567051982128927, | |
| "grad_norm": 0.9224187135696411, | |
| "loss": 1.1352, | |
| "loss_ce": 1.0753121376037598, | |
| "loss_region": 0.030113881453871727, | |
| "loss_total": 1.1054260730743408, | |
| "lr": 0.0011796205503825101, | |
| "router/selected_tokens_s0": 4381.5, | |
| "step": 1610, | |
| "tokens_trained": 5.275142944 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.4595418764626622, | |
| "grad_norm": 1.250409483909607, | |
| "loss": 1.1423, | |
| "loss_ce": 1.1405887603759766, | |
| "loss_region": 0.030090278014540672, | |
| "loss_total": 1.1706790924072266, | |
| "lr": 0.001179213658398659, | |
| "router/selected_tokens_s0": 4360.875, | |
| "step": 1620, | |
| "tokens_trained": 5.307906784 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.4623785547124317, | |
| "grad_norm": 0.6683188080787659, | |
| "loss": 1.1358, | |
| "loss_ce": 1.0137219429016113, | |
| "loss_region": 0.0301409512758255, | |
| "loss_total": 1.0438629388809204, | |
| "lr": 0.001178806766414808, | |
| "router/selected_tokens_s0": 4420.25, | |
| "step": 1630, | |
| "tokens_trained": 5.340672224 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.46521523296220124, | |
| "grad_norm": 1.3055206537246704, | |
| "loss": 1.1378, | |
| "loss_ce": 1.120367407798767, | |
| "loss_region": 0.029992438852787018, | |
| "loss_total": 1.150359869003296, | |
| "lr": 0.001178399874430957, | |
| "router/selected_tokens_s0": 4256.375, | |
| "step": 1640, | |
| "tokens_trained": 5.373436896 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.46805191121197076, | |
| "grad_norm": 1.2817225456237793, | |
| "loss": 1.1365, | |
| "loss_ce": 1.159173607826233, | |
| "loss_region": 0.030014952644705772, | |
| "loss_total": 1.1891885995864868, | |
| "lr": 0.001177992982447106, | |
| "router/selected_tokens_s0": 4277.875, | |
| "step": 1650, | |
| "tokens_trained": 5.406202336 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.4708885894617403, | |
| "grad_norm": 1.2652041912078857, | |
| "loss": 1.1303, | |
| "loss_ce": 1.1445159912109375, | |
| "loss_region": 0.03000623546540737, | |
| "loss_total": 1.1745222806930542, | |
| "lr": 0.001177586090463255, | |
| "router/selected_tokens_s0": 4274.375, | |
| "step": 1660, | |
| "tokens_trained": 5.438967776 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.4737252677115098, | |
| "grad_norm": 1.7784186601638794, | |
| "loss": 1.1334, | |
| "loss_ce": 1.1069244146347046, | |
| "loss_region": 0.030016450211405754, | |
| "loss_total": 1.136940836906433, | |
| "lr": 0.001177179198479404, | |
| "router/selected_tokens_s0": 4287.625, | |
| "step": 1670, | |
| "tokens_trained": 5.471733216 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.4765619459612793, | |
| "grad_norm": 1.0779353380203247, | |
| "loss": 1.1315, | |
| "loss_ce": 1.1237202882766724, | |
| "loss_region": 0.029916411265730858, | |
| "loss_total": 1.1536366939544678, | |
| "lr": 0.0011767723064955529, | |
| "router/selected_tokens_s0": 4156.75, | |
| "step": 1680, | |
| "tokens_trained": 5.504498656 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.47939862421104884, | |
| "grad_norm": 0.7689351439476013, | |
| "loss": 1.1324, | |
| "loss_ce": 1.0980726480484009, | |
| "loss_region": 0.030096061527729034, | |
| "loss_total": 1.1281687021255493, | |
| "lr": 0.0011763654145117018, | |
| "router/selected_tokens_s0": 4377.5, | |
| "step": 1690, | |
| "tokens_trained": 5.537264096 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.48223530246081836, | |
| "grad_norm": 0.6869276165962219, | |
| "loss": 1.1332, | |
| "loss_ce": 1.0792652368545532, | |
| "loss_region": 0.030072998255491257, | |
| "loss_total": 1.1093382835388184, | |
| "lr": 0.0011759585225278508, | |
| "router/selected_tokens_s0": 4349.625, | |
| "step": 1700, | |
| "tokens_trained": 5.570029536 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.4850719807105879, | |
| "grad_norm": 0.9587815403938293, | |
| "loss": 1.1361, | |
| "loss_ce": 1.0378434658050537, | |
| "loss_region": 0.03009817562997341, | |
| "loss_total": 1.067941665649414, | |
| "lr": 0.001175551630544, | |
| "router/selected_tokens_s0": 4384.25, | |
| "step": 1710, | |
| "tokens_trained": 5.602794976 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.4879086589603574, | |
| "grad_norm": 1.1542259454727173, | |
| "loss": 1.1294, | |
| "loss_ce": 1.074008584022522, | |
| "loss_region": 0.030034121125936508, | |
| "loss_total": 1.104042649269104, | |
| "lr": 0.001175144738560149, | |
| "router/selected_tokens_s0": 4306.0, | |
| "step": 1720, | |
| "tokens_trained": 5.635560416 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.4907453372101269, | |
| "grad_norm": 1.0194206237792969, | |
| "loss": 1.1296, | |
| "loss_ce": 1.1548231840133667, | |
| "loss_region": 0.03011094592511654, | |
| "loss_total": 1.184934139251709, | |
| "lr": 0.001174737846576298, | |
| "router/selected_tokens_s0": 4395.625, | |
| "step": 1730, | |
| "tokens_trained": 5.668325856 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.49358201545989644, | |
| "grad_norm": 1.108144998550415, | |
| "loss": 1.1351, | |
| "loss_ce": 1.0953419208526611, | |
| "loss_region": 0.03002314455807209, | |
| "loss_total": 1.1253650188446045, | |
| "lr": 0.0011743309545924469, | |
| "router/selected_tokens_s0": 4292.125, | |
| "step": 1740, | |
| "tokens_trained": 5.701091296 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.49641869370966596, | |
| "grad_norm": 1.075562834739685, | |
| "loss": 1.1347, | |
| "loss_ce": 1.1154391765594482, | |
| "loss_region": 0.029949212446808815, | |
| "loss_total": 1.1453883647918701, | |
| "lr": 0.0011739240626085956, | |
| "router/selected_tokens_s0": 4188.625, | |
| "step": 1750, | |
| "tokens_trained": 5.733856736 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.4992553719594355, | |
| "grad_norm": 1.3173739910125732, | |
| "loss": 1.1325, | |
| "loss_ce": 1.0855435132980347, | |
| "loss_region": 0.02994917891919613, | |
| "loss_total": 1.1154927015304565, | |
| "lr": 0.0011735171706247446, | |
| "router/selected_tokens_s0": 4183.625, | |
| "step": 1760, | |
| "tokens_trained": 5.766622176 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.502092050209205, | |
| "grad_norm": 0.8734815716743469, | |
| "loss": 1.1316, | |
| "loss_ce": 1.190360188484192, | |
| "loss_region": 0.03002040646970272, | |
| "loss_total": 1.2203805446624756, | |
| "lr": 0.0011731102786408936, | |
| "router/selected_tokens_s0": 4294.5, | |
| "step": 1770, | |
| "tokens_trained": 5.799387616 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5049287284589745, | |
| "grad_norm": 2.5296459197998047, | |
| "loss": 1.1361, | |
| "loss_ce": 0.9863566756248474, | |
| "loss_region": 0.02998475357890129, | |
| "loss_total": 1.0163414478302002, | |
| "lr": 0.0011727033866570427, | |
| "router/selected_tokens_s0": 4235.875, | |
| "step": 1780, | |
| "tokens_trained": 5.832153056 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.507765406708744, | |
| "grad_norm": 0.7834669947624207, | |
| "loss": 1.1297, | |
| "loss_ce": 0.9555173516273499, | |
| "loss_region": 0.0301660243421793, | |
| "loss_total": 0.9856833815574646, | |
| "lr": 0.0011722964946731917, | |
| "router/selected_tokens_s0": 4416.375, | |
| "step": 1790, | |
| "tokens_trained": 5.864918496 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5106020849585136, | |
| "grad_norm": 0.9466329216957092, | |
| "loss": 1.1295, | |
| "loss_ce": 1.0096023082733154, | |
| "loss_region": 0.030076030641794205, | |
| "loss_total": 1.0396783351898193, | |
| "lr": 0.0011718896026893407, | |
| "router/selected_tokens_s0": 4354.625, | |
| "step": 1800, | |
| "tokens_trained": 5.897683936 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5134387632082831, | |
| "grad_norm": 1.151943325996399, | |
| "loss": 1.1267, | |
| "loss_ce": 1.0721287727355957, | |
| "loss_region": 0.029984835535287857, | |
| "loss_total": 1.1021136045455933, | |
| "lr": 0.0011714827107054896, | |
| "router/selected_tokens_s0": 4239.75, | |
| "step": 1810, | |
| "tokens_trained": 5.930449376 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5162754414580526, | |
| "grad_norm": 0.5502280592918396, | |
| "loss": 1.1249, | |
| "loss_ce": 1.0287433862686157, | |
| "loss_region": 0.029946208000183105, | |
| "loss_total": 1.0586895942687988, | |
| "lr": 0.0011710758187216386, | |
| "router/selected_tokens_s0": 4179.375, | |
| "step": 1820, | |
| "tokens_trained": 5.96321104 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5191121197078221, | |
| "grad_norm": 1.5447858572006226, | |
| "loss": 1.1319, | |
| "loss_ce": 1.1280238628387451, | |
| "loss_region": 0.030087478458881378, | |
| "loss_total": 1.158111333847046, | |
| "lr": 0.0011706689267377876, | |
| "router/selected_tokens_s0": 4389.75, | |
| "step": 1830, | |
| "tokens_trained": 5.99597648 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5219487979575916, | |
| "grad_norm": 0.9524003863334656, | |
| "loss": 1.1274, | |
| "loss_ce": 1.0977569818496704, | |
| "loss_region": 0.030062809586524963, | |
| "loss_total": 1.1278197765350342, | |
| "lr": 0.0011702620347539365, | |
| "router/selected_tokens_s0": 4354.0, | |
| "step": 1840, | |
| "tokens_trained": 6.028741744 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5247854762073612, | |
| "grad_norm": 0.6106662750244141, | |
| "loss": 1.1264, | |
| "loss_ce": 1.06783926486969, | |
| "loss_region": 0.029942721128463745, | |
| "loss_total": 1.097782015800476, | |
| "lr": 0.0011698551427700855, | |
| "router/selected_tokens_s0": 4162.625, | |
| "step": 1850, | |
| "tokens_trained": 6.061507184 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5276221544571307, | |
| "grad_norm": 1.2853341102600098, | |
| "loss": 1.1329, | |
| "loss_ce": 1.0429413318634033, | |
| "loss_region": 0.02999758906662464, | |
| "loss_total": 1.0729389190673828, | |
| "lr": 0.0011694482507862345, | |
| "router/selected_tokens_s0": 4247.5, | |
| "step": 1860, | |
| "tokens_trained": 6.094268624 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5304588327069002, | |
| "grad_norm": 2.993485927581787, | |
| "loss": 1.1236, | |
| "loss_ce": 1.0583568811416626, | |
| "loss_region": 0.030023684725165367, | |
| "loss_total": 1.0883805751800537, | |
| "lr": 0.0011690413588023834, | |
| "router/selected_tokens_s0": 4302.0, | |
| "step": 1870, | |
| "tokens_trained": 6.127034064 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5332955109566697, | |
| "grad_norm": 0.7363700866699219, | |
| "loss": 1.1308, | |
| "loss_ce": 1.1353397369384766, | |
| "loss_region": 0.029933562502264977, | |
| "loss_total": 1.1652733087539673, | |
| "lr": 0.0011686344668185324, | |
| "router/selected_tokens_s0": 4149.375, | |
| "step": 1880, | |
| "tokens_trained": 6.159799504 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5361321892064392, | |
| "grad_norm": 0.8693296313285828, | |
| "loss": 1.1274, | |
| "loss_ce": 1.0827381610870361, | |
| "loss_region": 0.030024589970707893, | |
| "loss_total": 1.1127628087997437, | |
| "lr": 0.0011682275748346814, | |
| "router/selected_tokens_s0": 4302.25, | |
| "step": 1890, | |
| "tokens_trained": 6.192561072 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5389688674562088, | |
| "grad_norm": 0.4028984606266022, | |
| "loss": 1.1162, | |
| "loss_ce": 1.1056593656539917, | |
| "loss_region": 0.030071411281824112, | |
| "loss_total": 1.1357307434082031, | |
| "lr": 0.0011678206828508303, | |
| "router/selected_tokens_s0": 4372.625, | |
| "step": 1900, | |
| "tokens_trained": 6.225326512 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5418055457059783, | |
| "grad_norm": 1.1904973983764648, | |
| "loss": 1.1294, | |
| "loss_ce": 1.0976545810699463, | |
| "loss_region": 0.030053725466132164, | |
| "loss_total": 1.1277083158493042, | |
| "lr": 0.0011674137908669793, | |
| "router/selected_tokens_s0": 4348.125, | |
| "step": 1910, | |
| "tokens_trained": 6.258091952 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5446422239557478, | |
| "grad_norm": 1.018221378326416, | |
| "loss": 1.1277, | |
| "loss_ce": 1.1479384899139404, | |
| "loss_region": 0.030054787173867226, | |
| "loss_total": 1.1779932975769043, | |
| "lr": 0.0011670068988831283, | |
| "router/selected_tokens_s0": 4353.25, | |
| "step": 1920, | |
| "tokens_trained": 6.290857392 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5474789022055173, | |
| "grad_norm": 0.4506734013557434, | |
| "loss": 1.1235, | |
| "loss_ce": 1.1137655973434448, | |
| "loss_region": 0.03005811758339405, | |
| "loss_total": 1.1438237428665161, | |
| "lr": 0.0011666000068992772, | |
| "router/selected_tokens_s0": 4341.5, | |
| "step": 1930, | |
| "tokens_trained": 6.323622832 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5503155804552868, | |
| "grad_norm": 1.5671348571777344, | |
| "loss": 1.1318, | |
| "loss_ce": 1.1652703285217285, | |
| "loss_region": 0.030141720548272133, | |
| "loss_total": 1.195412039756775, | |
| "lr": 0.0011661931149154262, | |
| "router/selected_tokens_s0": 4458.125, | |
| "step": 1940, | |
| "tokens_trained": 6.356388272 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5531522587050564, | |
| "grad_norm": 1.2511063814163208, | |
| "loss": 1.1246, | |
| "loss_ce": 1.2078148126602173, | |
| "loss_region": 0.03000708669424057, | |
| "loss_total": 1.2378219366073608, | |
| "lr": 0.0011657862229315751, | |
| "router/selected_tokens_s0": 4275.625, | |
| "step": 1950, | |
| "tokens_trained": 6.389153712 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5559889369548259, | |
| "grad_norm": 1.1278033256530762, | |
| "loss": 1.1253, | |
| "loss_ce": 1.1528972387313843, | |
| "loss_region": 0.029990505427122116, | |
| "loss_total": 1.1828877925872803, | |
| "lr": 0.0011653793309477243, | |
| "router/selected_tokens_s0": 4247.625, | |
| "step": 1960, | |
| "tokens_trained": 6.421919152 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5588256152045954, | |
| "grad_norm": 0.7347070574760437, | |
| "loss": 1.1292, | |
| "loss_ce": 1.1609221696853638, | |
| "loss_region": 0.03007410652935505, | |
| "loss_total": 1.1909962892532349, | |
| "lr": 0.0011649724389638733, | |
| "router/selected_tokens_s0": 4377.0, | |
| "step": 1970, | |
| "tokens_trained": 6.454684592 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5616622934543649, | |
| "grad_norm": 0.8754347562789917, | |
| "loss": 1.1321, | |
| "loss_ce": 1.1314905881881714, | |
| "loss_region": 0.030018918216228485, | |
| "loss_total": 1.1615095138549805, | |
| "lr": 0.0011645655469800223, | |
| "router/selected_tokens_s0": 4292.375, | |
| "step": 1980, | |
| "tokens_trained": 6.487450032 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5644989717041344, | |
| "grad_norm": 1.4375395774841309, | |
| "loss": 1.1251, | |
| "loss_ce": 1.15834641456604, | |
| "loss_region": 0.030011450871825218, | |
| "loss_total": 1.1883578300476074, | |
| "lr": 0.0011641586549961712, | |
| "router/selected_tokens_s0": 4281.875, | |
| "step": 1990, | |
| "tokens_trained": 6.520215472 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.567335649953904, | |
| "grad_norm": 1.3012388944625854, | |
| "loss": 1.1244, | |
| "loss_ce": 1.1547801494598389, | |
| "loss_region": 0.03002019412815571, | |
| "loss_total": 1.184800386428833, | |
| "lr": 0.00116375176301232, | |
| "router/selected_tokens_s0": 4298.375, | |
| "step": 2000, | |
| "tokens_trained": 6.552980912 | |
| }, | |
| { | |
| "epoch": 0.567335649953904, | |
| "eval_ppl": 2.997264738752139, | |
| "eval_runtime": 2.4974, | |
| "step": 2000, | |
| "tokens_trained": 6.552980912 | |
| }, | |
| { | |
| "epoch": 0.567335649953904, | |
| "eval_F": 0.33877094677913017, | |
| "eval_F_cds": 0.3354545528054273, | |
| "eval_F_dig": 0.3349740865171758, | |
| "eval_F_exon": 0.33771546252151097, | |
| "eval_F_intron": 0.3394511609404705, | |
| "eval_F_nig": 0.33961248247030124, | |
| "eval_F_promoter": 0.33587224314868064, | |
| "eval_F_utr": 0.3390466904438115, | |
| "eval_G": 0.3927095408069945, | |
| "eval_G_cds": 0.38760326352277413, | |
| "eval_G_dig": 0.38993240031773313, | |
| "eval_G_exon": 0.3922000848097159, | |
| "eval_G_intron": 0.39271919880055167, | |
| "eval_G_nig": 0.3935918508753731, | |
| "eval_G_promoter": 0.3926971556782782, | |
| "eval_G_utr": 0.3912176578977754, | |
| "eval_avg_bp_per_token": 2.9518469913300267, | |
| "eval_bp_per_token/cds": 2.981029744974208, | |
| "eval_bp_per_token/dig": 2.9853055512361997, | |
| "eval_bp_per_token/exon": 2.9610725920975693, | |
| "eval_bp_per_token/intron": 2.9459318896698954, | |
| "eval_bp_per_token/nig": 2.9445325234400035, | |
| "eval_bp_per_token/promoter": 2.977322539741189, | |
| "eval_bp_per_token/utr": 2.9494462803662875, | |
| "eval_ppl_cds": 3.6941119312579422, | |
| "eval_ppl_dig": 1.1218375588220217, | |
| "eval_ppl_exon": 3.4074634485917565, | |
| "eval_ppl_intron": 3.014504389955456, | |
| "eval_ppl_nig": 2.843623870937302, | |
| "eval_ppl_promoter": 3.3305259507076883, | |
| "eval_ppl_utr": 3.322006494837333, | |
| "step": 2000, | |
| "tokens_trained": 6.552980912 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5701723282036735, | |
| "grad_norm": 1.7854270935058594, | |
| "loss": 1.1275, | |
| "loss_ce": 1.1118180751800537, | |
| "loss_region": 0.030034875497221947, | |
| "loss_total": 1.1418529748916626, | |
| "lr": 0.001163344871028469, | |
| "router/selected_tokens_s0": 4323.625, | |
| "step": 2010, | |
| "tokens_trained": 6.585746352 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.573009006453443, | |
| "grad_norm": 1.2741203308105469, | |
| "loss": 1.1297, | |
| "loss_ce": 1.1596630811691284, | |
| "loss_region": 0.030020276084542274, | |
| "loss_total": 1.1896833181381226, | |
| "lr": 0.001162937979044618, | |
| "router/selected_tokens_s0": 4296.625, | |
| "step": 2020, | |
| "tokens_trained": 6.618511792 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5758456847032125, | |
| "grad_norm": 1.3113727569580078, | |
| "loss": 1.1274, | |
| "loss_ce": 1.130359411239624, | |
| "loss_region": 0.030052313581109047, | |
| "loss_total": 1.1604117155075073, | |
| "lr": 0.001162531087060767, | |
| "router/selected_tokens_s0": 4347.25, | |
| "step": 2030, | |
| "tokens_trained": 6.651277232 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.578682362952982, | |
| "grad_norm": 1.585740089416504, | |
| "loss": 1.1242, | |
| "loss_ce": 1.113228440284729, | |
| "loss_region": 0.029946262016892433, | |
| "loss_total": 1.143174648284912, | |
| "lr": 0.001162124195076916, | |
| "router/selected_tokens_s0": 4151.5, | |
| "step": 2040, | |
| "tokens_trained": 6.684041872 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5815190412027516, | |
| "grad_norm": 1.4227651357650757, | |
| "loss": 1.1227, | |
| "loss_ce": 1.1707289218902588, | |
| "loss_region": 0.03000037930905819, | |
| "loss_total": 1.200729250907898, | |
| "lr": 0.001161717303093065, | |
| "router/selected_tokens_s0": 4264.25, | |
| "step": 2050, | |
| "tokens_trained": 6.716806512 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5843557194525211, | |
| "grad_norm": 1.4349584579467773, | |
| "loss": 1.126, | |
| "loss_ce": 1.123897910118103, | |
| "loss_region": 0.029999535530805588, | |
| "loss_total": 1.1538974046707153, | |
| "lr": 0.001161310411109214, | |
| "router/selected_tokens_s0": 4258.5, | |
| "step": 2060, | |
| "tokens_trained": 6.749571952 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5871923977022906, | |
| "grad_norm": 1.525637149810791, | |
| "loss": 1.1223, | |
| "loss_ce": 1.0622094869613647, | |
| "loss_region": 0.03016025200486183, | |
| "loss_total": 1.092369794845581, | |
| "lr": 0.001160903519125363, | |
| "router/selected_tokens_s0": 4409.25, | |
| "step": 2070, | |
| "tokens_trained": 6.782337392 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5900290759520601, | |
| "grad_norm": 0.31481412053108215, | |
| "loss": 1.1308, | |
| "loss_ce": 1.1158243417739868, | |
| "loss_region": 0.030056282877922058, | |
| "loss_total": 1.1458805799484253, | |
| "lr": 0.001160496627141512, | |
| "router/selected_tokens_s0": 4358.875, | |
| "step": 2080, | |
| "tokens_trained": 6.815102832 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5928657542018296, | |
| "grad_norm": 1.4279309511184692, | |
| "loss": 1.1212, | |
| "loss_ce": 1.1024186611175537, | |
| "loss_region": 0.03000911884009838, | |
| "loss_total": 1.1324278116226196, | |
| "lr": 0.0011600897351576609, | |
| "router/selected_tokens_s0": 4277.25, | |
| "step": 2090, | |
| "tokens_trained": 6.847868272 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5957024324515992, | |
| "grad_norm": 1.3502033948898315, | |
| "loss": 1.1243, | |
| "loss_ce": 1.215091347694397, | |
| "loss_region": 0.03004975989460945, | |
| "loss_total": 1.2451411485671997, | |
| "lr": 0.0011596828431738098, | |
| "router/selected_tokens_s0": 4345.25, | |
| "step": 2100, | |
| "tokens_trained": 6.880633712 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.5985391107013687, | |
| "grad_norm": 0.30469629168510437, | |
| "loss": 1.1227, | |
| "loss_ce": 1.0989904403686523, | |
| "loss_region": 0.03004642389714718, | |
| "loss_total": 1.1290369033813477, | |
| "lr": 0.0011592759511899588, | |
| "router/selected_tokens_s0": 4339.125, | |
| "step": 2110, | |
| "tokens_trained": 6.913397016 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6013757889511382, | |
| "grad_norm": 3.0106451511383057, | |
| "loss": 1.1271, | |
| "loss_ce": 1.0580655336380005, | |
| "loss_region": 0.03005184419453144, | |
| "loss_total": 1.0881173610687256, | |
| "lr": 0.0011588690592061078, | |
| "router/selected_tokens_s0": 4347.75, | |
| "step": 2120, | |
| "tokens_trained": 6.946162296 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6042124672009077, | |
| "grad_norm": 1.4084529876708984, | |
| "loss": 1.1261, | |
| "loss_ce": 0.9337919354438782, | |
| "loss_region": 0.029956450685858727, | |
| "loss_total": 0.9637483954429626, | |
| "lr": 0.0011584621672222567, | |
| "router/selected_tokens_s0": 4181.25, | |
| "step": 2130, | |
| "tokens_trained": 6.978927736 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6070491454506772, | |
| "grad_norm": 0.7794283032417297, | |
| "loss": 1.1287, | |
| "loss_ce": 1.0321320295333862, | |
| "loss_region": 0.030011894181370735, | |
| "loss_total": 1.0621439218521118, | |
| "lr": 0.0011580552752384057, | |
| "router/selected_tokens_s0": 4285.875, | |
| "step": 2140, | |
| "tokens_trained": 7.011693176 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6098858237004467, | |
| "grad_norm": 0.7242727279663086, | |
| "loss": 1.1314, | |
| "loss_ce": 1.1077067852020264, | |
| "loss_region": 0.030075622722506523, | |
| "loss_total": 1.1377824544906616, | |
| "lr": 0.0011576483832545547, | |
| "router/selected_tokens_s0": 4383.25, | |
| "step": 2150, | |
| "tokens_trained": 7.044458616 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6127225019502163, | |
| "grad_norm": 0.8703320622444153, | |
| "loss": 1.1255, | |
| "loss_ce": 1.042706847190857, | |
| "loss_region": 0.030024481937289238, | |
| "loss_total": 1.072731375694275, | |
| "lr": 0.0011572414912707036, | |
| "router/selected_tokens_s0": 4306.0, | |
| "step": 2160, | |
| "tokens_trained": 7.077224056 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6155591801999858, | |
| "grad_norm": 2.464707374572754, | |
| "loss": 1.12, | |
| "loss_ce": 1.0845450162887573, | |
| "loss_region": 0.029988931491971016, | |
| "loss_total": 1.1145339012145996, | |
| "lr": 0.0011568345992868526, | |
| "router/selected_tokens_s0": 4238.875, | |
| "step": 2170, | |
| "tokens_trained": 7.109989496 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6183958584497553, | |
| "grad_norm": 2.0766637325286865, | |
| "loss": 1.1266, | |
| "loss_ce": 1.1240020990371704, | |
| "loss_region": 0.030013838782906532, | |
| "loss_total": 1.1540158987045288, | |
| "lr": 0.0011564277073030016, | |
| "router/selected_tokens_s0": 4291.875, | |
| "step": 2180, | |
| "tokens_trained": 7.142754936 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6212325366995248, | |
| "grad_norm": 1.402709722518921, | |
| "loss": 1.1265, | |
| "loss_ce": 1.1370148658752441, | |
| "loss_region": 0.03003770112991333, | |
| "loss_total": 1.1670525074005127, | |
| "lr": 0.0011560208153191505, | |
| "router/selected_tokens_s0": 4328.625, | |
| "step": 2190, | |
| "tokens_trained": 7.175520376 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6240692149492943, | |
| "grad_norm": 0.7657859325408936, | |
| "loss": 1.1259, | |
| "loss_ce": 1.116765022277832, | |
| "loss_region": 0.030005475506186485, | |
| "loss_total": 1.1467704772949219, | |
| "lr": 0.0011556139233352995, | |
| "router/selected_tokens_s0": 4272.125, | |
| "step": 2200, | |
| "tokens_trained": 7.208285816 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6269058931990639, | |
| "grad_norm": 3.5244100093841553, | |
| "loss": 1.1305, | |
| "loss_ce": 1.1446946859359741, | |
| "loss_region": 0.030087754130363464, | |
| "loss_total": 1.174782395362854, | |
| "lr": 0.0011552070313514487, | |
| "router/selected_tokens_s0": 4414.25, | |
| "step": 2210, | |
| "tokens_trained": 7.241051256 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6297425714488334, | |
| "grad_norm": 0.599822998046875, | |
| "loss": 1.1324, | |
| "loss_ce": 1.0551592111587524, | |
| "loss_region": 0.030122289434075356, | |
| "loss_total": 1.085281491279602, | |
| "lr": 0.0011548001393675976, | |
| "router/selected_tokens_s0": 4453.875, | |
| "step": 2220, | |
| "tokens_trained": 7.273816696 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6325792496986029, | |
| "grad_norm": 2.314722776412964, | |
| "loss": 1.1277, | |
| "loss_ce": 1.1485532522201538, | |
| "loss_region": 0.030024103820323944, | |
| "loss_total": 1.1785773038864136, | |
| "lr": 0.0011543932473837466, | |
| "router/selected_tokens_s0": 4313.5, | |
| "step": 2230, | |
| "tokens_trained": 7.306582136 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6354159279483724, | |
| "grad_norm": 2.072960615158081, | |
| "loss": 1.131, | |
| "loss_ce": 1.0349353551864624, | |
| "loss_region": 0.030028166249394417, | |
| "loss_total": 1.0649635791778564, | |
| "lr": 0.0011539863553998956, | |
| "router/selected_tokens_s0": 4319.625, | |
| "step": 2240, | |
| "tokens_trained": 7.339347576 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.638252606198142, | |
| "grad_norm": 1.371410846710205, | |
| "loss": 1.1226, | |
| "loss_ce": 1.0738561153411865, | |
| "loss_region": 0.030064314603805542, | |
| "loss_total": 1.1039204597473145, | |
| "lr": 0.0011535794634160443, | |
| "router/selected_tokens_s0": 4378.375, | |
| "step": 2250, | |
| "tokens_trained": 7.372113016 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6410892844479115, | |
| "grad_norm": 3.474445343017578, | |
| "loss": 1.1284, | |
| "loss_ce": 1.0069116353988647, | |
| "loss_region": 0.030036170035600662, | |
| "loss_total": 1.0369478464126587, | |
| "lr": 0.0011531725714321933, | |
| "router/selected_tokens_s0": 4332.625, | |
| "step": 2260, | |
| "tokens_trained": 7.404878456 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.643925962697681, | |
| "grad_norm": 0.5796771049499512, | |
| "loss": 1.1245, | |
| "loss_ce": 1.138779640197754, | |
| "loss_region": 0.030022747814655304, | |
| "loss_total": 1.1688023805618286, | |
| "lr": 0.0011527656794483422, | |
| "router/selected_tokens_s0": 4308.875, | |
| "step": 2270, | |
| "tokens_trained": 7.437643896 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6467626409474505, | |
| "grad_norm": 1.155604362487793, | |
| "loss": 1.1216, | |
| "loss_ce": 0.9782689809799194, | |
| "loss_region": 0.030030813068151474, | |
| "loss_total": 1.0082998275756836, | |
| "lr": 0.0011523587874644914, | |
| "router/selected_tokens_s0": 4321.625, | |
| "step": 2280, | |
| "tokens_trained": 7.470409336 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.64959931919722, | |
| "grad_norm": 1.8259997367858887, | |
| "loss": 1.1318, | |
| "loss_ce": 1.055479884147644, | |
| "loss_region": 0.030021535232663155, | |
| "loss_total": 1.0855014324188232, | |
| "lr": 0.0011519518954806404, | |
| "router/selected_tokens_s0": 4307.375, | |
| "step": 2290, | |
| "tokens_trained": 7.503173472 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6524359974469895, | |
| "grad_norm": 1.2909961938858032, | |
| "loss": 1.1216, | |
| "loss_ce": 1.1016438007354736, | |
| "loss_region": 0.030030114576220512, | |
| "loss_total": 1.1316739320755005, | |
| "lr": 0.0011515450034967894, | |
| "router/selected_tokens_s0": 4321.625, | |
| "step": 2300, | |
| "tokens_trained": 7.535938912 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6552726756967591, | |
| "grad_norm": 3.855242967605591, | |
| "loss": 1.1332, | |
| "loss_ce": 1.1084688901901245, | |
| "loss_region": 0.030001208186149597, | |
| "loss_total": 1.1384700536727905, | |
| "lr": 0.0011511381115129383, | |
| "router/selected_tokens_s0": 4267.625, | |
| "step": 2310, | |
| "tokens_trained": 7.568704352 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6581093539465286, | |
| "grad_norm": 0.6401855945587158, | |
| "loss": 1.1235, | |
| "loss_ce": 1.068629503250122, | |
| "loss_region": 0.030046915635466576, | |
| "loss_total": 1.0986764430999756, | |
| "lr": 0.0011507312195290873, | |
| "router/selected_tokens_s0": 4353.5, | |
| "step": 2320, | |
| "tokens_trained": 7.601469792 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6609460321962981, | |
| "grad_norm": 2.758415460586548, | |
| "loss": 1.1224, | |
| "loss_ce": 1.1197397708892822, | |
| "loss_region": 0.030033273622393608, | |
| "loss_total": 1.1497730016708374, | |
| "lr": 0.0011503243275452363, | |
| "router/selected_tokens_s0": 4317.625, | |
| "step": 2330, | |
| "tokens_trained": 7.634233608 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6637827104460676, | |
| "grad_norm": 3.6356966495513916, | |
| "loss": 1.1258, | |
| "loss_ce": 1.192346453666687, | |
| "loss_region": 0.030019540339708328, | |
| "loss_total": 1.2223659753799438, | |
| "lr": 0.0011499174355613852, | |
| "router/selected_tokens_s0": 4307.0, | |
| "step": 2340, | |
| "tokens_trained": 7.666998248 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6666193886958371, | |
| "grad_norm": 0.5084363222122192, | |
| "loss": 1.1211, | |
| "loss_ce": 1.0241565704345703, | |
| "loss_region": 0.030024418607354164, | |
| "loss_total": 1.0541809797286987, | |
| "lr": 0.0011495105435775342, | |
| "router/selected_tokens_s0": 4311.75, | |
| "step": 2350, | |
| "tokens_trained": 7.699763688 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6694560669456067, | |
| "grad_norm": 2.6118147373199463, | |
| "loss": 1.1205, | |
| "loss_ce": 1.094053864479065, | |
| "loss_region": 0.030054572969675064, | |
| "loss_total": 1.1241084337234497, | |
| "lr": 0.0011491036515936831, | |
| "router/selected_tokens_s0": 4375.625, | |
| "step": 2360, | |
| "tokens_trained": 7.732529128 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6722927451953762, | |
| "grad_norm": 1.5716001987457275, | |
| "loss": 1.1174, | |
| "loss_ce": 1.0806825160980225, | |
| "loss_region": 0.02999335154891014, | |
| "loss_total": 1.1106758117675781, | |
| "lr": 0.0011486967596098321, | |
| "router/selected_tokens_s0": 4245.125, | |
| "step": 2370, | |
| "tokens_trained": 7.765294568 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6751294234451457, | |
| "grad_norm": 1.6855603456497192, | |
| "loss": 1.1248, | |
| "loss_ce": 1.0957375764846802, | |
| "loss_region": 0.030019070953130722, | |
| "loss_total": 1.1257566213607788, | |
| "lr": 0.001148289867625981, | |
| "router/selected_tokens_s0": 4306.25, | |
| "step": 2380, | |
| "tokens_trained": 7.798060008 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6779661016949152, | |
| "grad_norm": 1.7085551023483276, | |
| "loss": 1.1219, | |
| "loss_ce": 1.0849840641021729, | |
| "loss_region": 0.029990842565894127, | |
| "loss_total": 1.114974856376648, | |
| "lr": 0.00114788297564213, | |
| "router/selected_tokens_s0": 4250.875, | |
| "step": 2390, | |
| "tokens_trained": 7.830825448 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6808027799446847, | |
| "grad_norm": 2.7529702186584473, | |
| "loss": 1.1278, | |
| "loss_ce": 1.1395268440246582, | |
| "loss_region": 0.030015477910637856, | |
| "loss_total": 1.1695423126220703, | |
| "lr": 0.001147476083658279, | |
| "router/selected_tokens_s0": 4305.125, | |
| "step": 2400, | |
| "tokens_trained": 7.863590888 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6836394581944543, | |
| "grad_norm": 1.855435848236084, | |
| "loss": 1.1225, | |
| "loss_ce": 1.055867075920105, | |
| "loss_region": 0.030039696022868156, | |
| "loss_total": 1.085906744003296, | |
| "lr": 0.001147069191674428, | |
| "router/selected_tokens_s0": 4357.375, | |
| "step": 2410, | |
| "tokens_trained": 7.896356328 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6864761364442238, | |
| "grad_norm": 1.9066152572631836, | |
| "loss": 1.1243, | |
| "loss_ce": 0.9804560542106628, | |
| "loss_region": 0.03004065528512001, | |
| "loss_total": 1.010496735572815, | |
| "lr": 0.001146662299690577, | |
| "router/selected_tokens_s0": 4339.375, | |
| "step": 2420, | |
| "tokens_trained": 7.929121768 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6893128146939933, | |
| "grad_norm": 1.6631957292556763, | |
| "loss": 1.1181, | |
| "loss_ce": 1.1269235610961914, | |
| "loss_region": 0.030016232281923294, | |
| "loss_total": 1.1569397449493408, | |
| "lr": 0.001146255407706726, | |
| "router/selected_tokens_s0": 4304.375, | |
| "step": 2430, | |
| "tokens_trained": 7.961887208 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6921494929437628, | |
| "grad_norm": 1.932186245918274, | |
| "loss": 1.1318, | |
| "loss_ce": 1.1084073781967163, | |
| "loss_region": 0.030037561431527138, | |
| "loss_total": 1.1384449005126953, | |
| "lr": 0.0011458485157228749, | |
| "router/selected_tokens_s0": 4342.375, | |
| "step": 2440, | |
| "tokens_trained": 7.994651848 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6949861711935323, | |
| "grad_norm": 2.0729987621307373, | |
| "loss": 1.1219, | |
| "loss_ce": 1.0754549503326416, | |
| "loss_region": 0.030010342597961426, | |
| "loss_total": 1.105465292930603, | |
| "lr": 0.0011454416237390238, | |
| "router/selected_tokens_s0": 4284.25, | |
| "step": 2450, | |
| "tokens_trained": 8.027417288 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.6978228494433019, | |
| "grad_norm": 2.743365526199341, | |
| "loss": 1.1183, | |
| "loss_ce": 1.1507514715194702, | |
| "loss_region": 0.030012760311365128, | |
| "loss_total": 1.1807641983032227, | |
| "lr": 0.001145034731755173, | |
| "router/selected_tokens_s0": 4299.125, | |
| "step": 2460, | |
| "tokens_trained": 8.060182704 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7006595276930714, | |
| "grad_norm": 1.968074083328247, | |
| "loss": 1.1248, | |
| "loss_ce": 1.1554365158081055, | |
| "loss_region": 0.03006228432059288, | |
| "loss_total": 1.185498833656311, | |
| "lr": 0.001144627839771322, | |
| "router/selected_tokens_s0": 4397.5, | |
| "step": 2470, | |
| "tokens_trained": 8.092948144 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7034962059428409, | |
| "grad_norm": 0.6022619605064392, | |
| "loss": 1.1233, | |
| "loss_ce": 1.0739916563034058, | |
| "loss_region": 0.030015716329216957, | |
| "loss_total": 1.104007363319397, | |
| "lr": 0.001144220947787471, | |
| "router/selected_tokens_s0": 4304.25, | |
| "step": 2480, | |
| "tokens_trained": 8.125713584 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7063328841926104, | |
| "grad_norm": 2.9086802005767822, | |
| "loss": 1.1155, | |
| "loss_ce": 1.1227823495864868, | |
| "loss_region": 0.030057305470108986, | |
| "loss_total": 1.1528396606445312, | |
| "lr": 0.00114381405580362, | |
| "router/selected_tokens_s0": 4393.875, | |
| "step": 2490, | |
| "tokens_trained": 8.158479016 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7091695624423799, | |
| "grad_norm": 1.8187512159347534, | |
| "loss": 1.1248, | |
| "loss_ce": 1.0580413341522217, | |
| "loss_region": 0.030032671988010406, | |
| "loss_total": 1.088073968887329, | |
| "lr": 0.0011434071638197687, | |
| "router/selected_tokens_s0": 4340.0, | |
| "step": 2500, | |
| "tokens_trained": 8.191244456 | |
| }, | |
| { | |
| "epoch": 0.7091695624423799, | |
| "eval_ppl": 2.9815305929864326, | |
| "eval_runtime": 2.4796, | |
| "step": 2500, | |
| "tokens_trained": 8.191244456 | |
| }, | |
| { | |
| "epoch": 0.7091695624423799, | |
| "eval_F": 0.34048558481131336, | |
| "eval_F_cds": 0.3413653968998391, | |
| "eval_F_dig": 0.3326561970987317, | |
| "eval_F_exon": 0.34301915535870453, | |
| "eval_F_intron": 0.3409895477582185, | |
| "eval_F_nig": 0.34018024599300895, | |
| "eval_F_promoter": 0.3386885010090298, | |
| "eval_F_utr": 0.34306656745268094, | |
| "eval_G": 0.37360140820500265, | |
| "eval_G_cds": 0.37391617995023085, | |
| "eval_G_dig": 0.39410936238508215, | |
| "eval_G_exon": 0.37318875715857475, | |
| "eval_G_intron": 0.3727733807645177, | |
| "eval_G_nig": 0.3734594960312147, | |
| "eval_G_promoter": 0.37594098275253596, | |
| "eval_G_utr": 0.3722500326080449, | |
| "eval_avg_bp_per_token": 2.9369819005822793, | |
| "eval_bp_per_token/cds": 2.929412322050359, | |
| "eval_bp_per_token/dig": 3.006106631175135, | |
| "eval_bp_per_token/exon": 2.915289086273542, | |
| "eval_bp_per_token/intron": 2.932641210190579, | |
| "eval_bp_per_token/nig": 2.9396180753557073, | |
| "eval_bp_per_token/promoter": 2.952565549231147, | |
| "eval_bp_per_token/utr": 2.9148861908204733, | |
| "eval_ppl_cds": 3.7211953918524787, | |
| "eval_ppl_dig": 1.1071312956552213, | |
| "eval_ppl_exon": 3.408594147596357, | |
| "eval_ppl_intron": 2.996762231969892, | |
| "eval_ppl_nig": 2.8097869859130795, | |
| "eval_ppl_promoter": 3.341004188366384, | |
| "eval_ppl_utr": 3.3285188682998834, | |
| "step": 2500, | |
| "tokens_trained": 8.191244456 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7120062406921495, | |
| "grad_norm": 1.3883668184280396, | |
| "loss": 1.1168, | |
| "loss_ce": 1.0345538854599, | |
| "loss_region": 0.030011983588337898, | |
| "loss_total": 1.064565896987915, | |
| "lr": 0.0011430002718359176, | |
| "router/selected_tokens_s0": 4293.25, | |
| "step": 2510, | |
| "tokens_trained": 8.224009896 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.714842918941919, | |
| "grad_norm": 0.5920007228851318, | |
| "loss": 1.1128, | |
| "loss_ce": 1.1446270942687988, | |
| "loss_region": 0.030029037967324257, | |
| "loss_total": 1.1746561527252197, | |
| "lr": 0.0011425933798520666, | |
| "router/selected_tokens_s0": 4338.125, | |
| "step": 2520, | |
| "tokens_trained": 8.256775336 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7176795971916885, | |
| "grad_norm": 2.293912410736084, | |
| "loss": 1.119, | |
| "loss_ce": 1.1278671026229858, | |
| "loss_region": 0.030034860596060753, | |
| "loss_total": 1.1579020023345947, | |
| "lr": 0.0011421864878682158, | |
| "router/selected_tokens_s0": 4356.25, | |
| "step": 2530, | |
| "tokens_trained": 8.289540776 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.720516275441458, | |
| "grad_norm": 1.4504122734069824, | |
| "loss": 1.1161, | |
| "loss_ce": 0.9545093774795532, | |
| "loss_region": 0.030026227235794067, | |
| "loss_total": 0.9845355749130249, | |
| "lr": 0.0011417795958843647, | |
| "router/selected_tokens_s0": 4322.375, | |
| "step": 2540, | |
| "tokens_trained": 8.322306216 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7233529536912275, | |
| "grad_norm": 1.777256727218628, | |
| "loss": 1.1177, | |
| "loss_ce": 1.0747570991516113, | |
| "loss_region": 0.030009755864739418, | |
| "loss_total": 1.104766845703125, | |
| "lr": 0.0011413727039005137, | |
| "router/selected_tokens_s0": 4293.375, | |
| "step": 2550, | |
| "tokens_trained": 8.355071656 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7261896319409971, | |
| "grad_norm": 1.637231707572937, | |
| "loss": 1.1121, | |
| "loss_ce": 1.1526259183883667, | |
| "loss_region": 0.030018767341971397, | |
| "loss_total": 1.1826447248458862, | |
| "lr": 0.0011409658119166627, | |
| "router/selected_tokens_s0": 4318.5, | |
| "step": 2560, | |
| "tokens_trained": 8.387835072 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7290263101907666, | |
| "grad_norm": 1.0746310949325562, | |
| "loss": 1.1151, | |
| "loss_ce": 1.1064670085906982, | |
| "loss_region": 0.03001333586871624, | |
| "loss_total": 1.1364803314208984, | |
| "lr": 0.0011405589199328116, | |
| "router/selected_tokens_s0": 4294.375, | |
| "step": 2570, | |
| "tokens_trained": 8.420600512 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7318629884405361, | |
| "grad_norm": 1.3798960447311401, | |
| "loss": 1.1198, | |
| "loss_ce": 1.073905110359192, | |
| "loss_region": 0.030032221227884293, | |
| "loss_total": 1.1039373874664307, | |
| "lr": 0.0011401520279489606, | |
| "router/selected_tokens_s0": 4356.375, | |
| "step": 2580, | |
| "tokens_trained": 8.453365928 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7346996666903056, | |
| "grad_norm": 1.8040990829467773, | |
| "loss": 1.1175, | |
| "loss_ce": 1.0255845785140991, | |
| "loss_region": 0.03001689724624157, | |
| "loss_total": 1.0556014776229858, | |
| "lr": 0.0011397451359651096, | |
| "router/selected_tokens_s0": 4312.5, | |
| "step": 2590, | |
| "tokens_trained": 8.486131368 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7375363449400751, | |
| "grad_norm": 2.420259952545166, | |
| "loss": 1.1193, | |
| "loss_ce": 1.0581092834472656, | |
| "loss_region": 0.030017009004950523, | |
| "loss_total": 1.088126301765442, | |
| "lr": 0.0011393382439812585, | |
| "router/selected_tokens_s0": 4316.25, | |
| "step": 2600, | |
| "tokens_trained": 8.518896808 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7403730231898447, | |
| "grad_norm": 2.068054437637329, | |
| "loss": 1.1114, | |
| "loss_ce": 1.0681673288345337, | |
| "loss_region": 0.030040811747312546, | |
| "loss_total": 1.0982081890106201, | |
| "lr": 0.0011389313519974075, | |
| "router/selected_tokens_s0": 4369.0, | |
| "step": 2610, | |
| "tokens_trained": 8.551662248 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7432097014396142, | |
| "grad_norm": 1.7490754127502441, | |
| "loss": 1.1182, | |
| "loss_ce": 1.0639960765838623, | |
| "loss_region": 0.030034611001610756, | |
| "loss_total": 1.094030737876892, | |
| "lr": 0.0011385244600135565, | |
| "router/selected_tokens_s0": 4363.5, | |
| "step": 2620, | |
| "tokens_trained": 8.584426888 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7460463796893837, | |
| "grad_norm": 1.4811182022094727, | |
| "loss": 1.1131, | |
| "loss_ce": 1.0907317399978638, | |
| "loss_region": 0.03001326695084572, | |
| "loss_total": 1.120745062828064, | |
| "lr": 0.0011381175680297054, | |
| "router/selected_tokens_s0": 4307.875, | |
| "step": 2630, | |
| "tokens_trained": 8.617192328 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7488830579391532, | |
| "grad_norm": 2.1497602462768555, | |
| "loss": 1.1096, | |
| "loss_ce": 1.123599886894226, | |
| "loss_region": 0.030037013813853264, | |
| "loss_total": 1.1536369323730469, | |
| "lr": 0.0011377106760458544, | |
| "router/selected_tokens_s0": 4368.5, | |
| "step": 2640, | |
| "tokens_trained": 8.649951656 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7517197361889227, | |
| "grad_norm": 2.179588556289673, | |
| "loss": 1.1129, | |
| "loss_ce": 0.9365400671958923, | |
| "loss_region": 0.030036216601729393, | |
| "loss_total": 0.9665762782096863, | |
| "lr": 0.0011373037840620034, | |
| "router/selected_tokens_s0": 4350.375, | |
| "step": 2650, | |
| "tokens_trained": 8.682717096 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7545564144386923, | |
| "grad_norm": 1.6021926403045654, | |
| "loss": 1.1095, | |
| "loss_ce": 1.1449388265609741, | |
| "loss_region": 0.030037803575396538, | |
| "loss_total": 1.1749765872955322, | |
| "lr": 0.0011368968920781523, | |
| "router/selected_tokens_s0": 4373.0, | |
| "step": 2660, | |
| "tokens_trained": 8.715482536 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7573930926884618, | |
| "grad_norm": 1.2494678497314453, | |
| "loss": 1.1097, | |
| "loss_ce": 1.0806819200515747, | |
| "loss_region": 0.03000866435468197, | |
| "loss_total": 1.1106905937194824, | |
| "lr": 0.0011364900000943013, | |
| "router/selected_tokens_s0": 4295.25, | |
| "step": 2670, | |
| "tokens_trained": 8.748247976 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7602297709382313, | |
| "grad_norm": 1.3196409940719604, | |
| "loss": 1.1136, | |
| "loss_ce": 1.069360375404358, | |
| "loss_region": 0.030009115114808083, | |
| "loss_total": 1.0993695259094238, | |
| "lr": 0.0011360831081104503, | |
| "router/selected_tokens_s0": 4291.75, | |
| "step": 2680, | |
| "tokens_trained": 8.781013416 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7630664491880008, | |
| "grad_norm": 2.674771308898926, | |
| "loss": 1.1188, | |
| "loss_ce": 1.1771190166473389, | |
| "loss_region": 0.030019955709576607, | |
| "loss_total": 1.207139015197754, | |
| "lr": 0.0011356762161265992, | |
| "router/selected_tokens_s0": 4330.125, | |
| "step": 2690, | |
| "tokens_trained": 8.813778696 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7659031274377703, | |
| "grad_norm": 1.6932164430618286, | |
| "loss": 1.1031, | |
| "loss_ce": 1.0857900381088257, | |
| "loss_region": 0.0300260242074728, | |
| "loss_total": 1.1158161163330078, | |
| "lr": 0.0011352693241427482, | |
| "router/selected_tokens_s0": 4347.5, | |
| "step": 2700, | |
| "tokens_trained": 8.846544136 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7687398056875399, | |
| "grad_norm": 1.5329583883285522, | |
| "loss": 1.1098, | |
| "loss_ce": 1.0980348587036133, | |
| "loss_region": 0.030030502006411552, | |
| "loss_total": 1.1280653476715088, | |
| "lr": 0.0011348624321588974, | |
| "router/selected_tokens_s0": 4366.375, | |
| "step": 2710, | |
| "tokens_trained": 8.879309576 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7715764839373094, | |
| "grad_norm": 1.829464077949524, | |
| "loss": 1.1093, | |
| "loss_ce": 1.1128755807876587, | |
| "loss_region": 0.03000422567129135, | |
| "loss_total": 1.142879843711853, | |
| "lr": 0.0011344555401750463, | |
| "router/selected_tokens_s0": 4282.0, | |
| "step": 2720, | |
| "tokens_trained": 8.912075016 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7744131621870789, | |
| "grad_norm": 2.8766870498657227, | |
| "loss": 1.1187, | |
| "loss_ce": 1.12075674533844, | |
| "loss_region": 0.030019240453839302, | |
| "loss_total": 1.1507760286331177, | |
| "lr": 0.0011340486481911953, | |
| "router/selected_tokens_s0": 4327.25, | |
| "step": 2730, | |
| "tokens_trained": 8.944840456 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7772498404368484, | |
| "grad_norm": 2.2969014644622803, | |
| "loss": 1.1166, | |
| "loss_ce": 1.0795077085494995, | |
| "loss_region": 0.030028002336621284, | |
| "loss_total": 1.1095356941223145, | |
| "lr": 0.001133641756207344, | |
| "router/selected_tokens_s0": 4352.75, | |
| "step": 2740, | |
| "tokens_trained": 8.977605896 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7800865186866179, | |
| "grad_norm": 1.7521798610687256, | |
| "loss": 1.1139, | |
| "loss_ce": 1.1274807453155518, | |
| "loss_region": 0.030016858130693436, | |
| "loss_total": 1.1574976444244385, | |
| "lr": 0.001133234864223493, | |
| "router/selected_tokens_s0": 4320.0, | |
| "step": 2750, | |
| "tokens_trained": 9.010371336 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7829231969363875, | |
| "grad_norm": 2.6245367527008057, | |
| "loss": 1.1075, | |
| "loss_ce": 1.1328058242797852, | |
| "loss_region": 0.03003484010696411, | |
| "loss_total": 1.1628406047821045, | |
| "lr": 0.001132827972239642, | |
| "router/selected_tokens_s0": 4367.625, | |
| "step": 2760, | |
| "tokens_trained": 9.043136776 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.785759875186157, | |
| "grad_norm": 1.162583351135254, | |
| "loss": 1.1181, | |
| "loss_ce": 1.151093602180481, | |
| "loss_region": 0.030036624521017075, | |
| "loss_total": 1.1811301708221436, | |
| "lr": 0.001132421080255791, | |
| "router/selected_tokens_s0": 4392.5, | |
| "step": 2770, | |
| "tokens_trained": 9.075902216 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7885965534359265, | |
| "grad_norm": 1.4981096982955933, | |
| "loss": 1.1104, | |
| "loss_ce": 1.0844680070877075, | |
| "loss_region": 0.030015481635928154, | |
| "loss_total": 1.1144834756851196, | |
| "lr": 0.0011320141882719401, | |
| "router/selected_tokens_s0": 4314.625, | |
| "step": 2780, | |
| "tokens_trained": 9.108667656 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.791433231685696, | |
| "grad_norm": 1.8612878322601318, | |
| "loss": 1.1073, | |
| "loss_ce": 1.0089409351348877, | |
| "loss_region": 0.029995379969477654, | |
| "loss_total": 1.0389362573623657, | |
| "lr": 0.001131607296288089, | |
| "router/selected_tokens_s0": 4257.875, | |
| "step": 2790, | |
| "tokens_trained": 9.14143004 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7942699099354655, | |
| "grad_norm": 0.6861640810966492, | |
| "loss": 1.1058, | |
| "loss_ce": 0.9385975003242493, | |
| "loss_region": 0.029996687546372414, | |
| "loss_total": 0.9685941934585571, | |
| "lr": 0.001131200404304238, | |
| "router/selected_tokens_s0": 4290.625, | |
| "step": 2800, | |
| "tokens_trained": 9.17419548 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.797106588185235, | |
| "grad_norm": 2.205390214920044, | |
| "loss": 1.108, | |
| "loss_ce": 1.0670945644378662, | |
| "loss_region": 0.030025651678442955, | |
| "loss_total": 1.0971201658248901, | |
| "lr": 0.001130793512320387, | |
| "router/selected_tokens_s0": 4342.875, | |
| "step": 2810, | |
| "tokens_trained": 9.20696092 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.7999432664350046, | |
| "grad_norm": 2.068150520324707, | |
| "loss": 1.106, | |
| "loss_ce": 1.0238165855407715, | |
| "loss_region": 0.03002651408314705, | |
| "loss_total": 1.0538431406021118, | |
| "lr": 0.001130386620336536, | |
| "router/selected_tokens_s0": 4359.75, | |
| "step": 2820, | |
| "tokens_trained": 9.23972636 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.8027799446847741, | |
| "grad_norm": 1.1060576438903809, | |
| "loss": 1.1065, | |
| "loss_ce": 1.0474674701690674, | |
| "loss_region": 0.03000919334590435, | |
| "loss_total": 1.0774766206741333, | |
| "lr": 0.001129979728352685, | |
| "router/selected_tokens_s0": 4301.75, | |
| "step": 2830, | |
| "tokens_trained": 9.2724918 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.8056166229345436, | |
| "grad_norm": 1.369165301322937, | |
| "loss": 1.1081, | |
| "loss_ce": 1.0370676517486572, | |
| "loss_region": 0.030027758330106735, | |
| "loss_total": 1.067095398902893, | |
| "lr": 0.001129572836368834, | |
| "router/selected_tokens_s0": 4375.625, | |
| "step": 2840, | |
| "tokens_trained": 9.30525692 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.8084533011843131, | |
| "grad_norm": 2.285675525665283, | |
| "loss": 1.109, | |
| "loss_ce": 1.0967503786087036, | |
| "loss_region": 0.0300269927829504, | |
| "loss_total": 1.1267774105072021, | |
| "lr": 0.0011291659443849829, | |
| "router/selected_tokens_s0": 4370.0, | |
| "step": 2850, | |
| "tokens_trained": 9.33802236 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.8112899794340827, | |
| "grad_norm": 0.8950642943382263, | |
| "loss": 1.1015, | |
| "loss_ce": 1.091797947883606, | |
| "loss_region": 0.03003113530576229, | |
| "loss_total": 1.1218290328979492, | |
| "lr": 0.0011287590524011318, | |
| "router/selected_tokens_s0": 4384.125, | |
| "step": 2860, | |
| "tokens_trained": 9.3707878 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.8141266576838522, | |
| "grad_norm": 2.1465282440185547, | |
| "loss": 1.1012, | |
| "loss_ce": 0.9929934144020081, | |
| "loss_region": 0.030015377327799797, | |
| "loss_total": 1.0230088233947754, | |
| "lr": 0.0011283521604172808, | |
| "router/selected_tokens_s0": 4333.0, | |
| "step": 2870, | |
| "tokens_trained": 9.403548272 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.8169633359336217, | |
| "grad_norm": 2.1108782291412354, | |
| "loss": 1.1029, | |
| "loss_ce": 1.0729644298553467, | |
| "loss_region": 0.03002534806728363, | |
| "loss_total": 1.1029897928237915, | |
| "lr": 0.0011279452684334298, | |
| "router/selected_tokens_s0": 4357.25, | |
| "step": 2880, | |
| "tokens_trained": 9.436313712 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.8198000141833912, | |
| "grad_norm": 1.7104750871658325, | |
| "loss": 1.1041, | |
| "loss_ce": 1.116651177406311, | |
| "loss_region": 0.030016740784049034, | |
| "loss_total": 1.1466679573059082, | |
| "lr": 0.0011275383764495787, | |
| "router/selected_tokens_s0": 4340.875, | |
| "step": 2890, | |
| "tokens_trained": 9.469079152 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.8226366924331607, | |
| "grad_norm": 1.7549395561218262, | |
| "loss": 1.1098, | |
| "loss_ce": 0.977597713470459, | |
| "loss_region": 0.030016236007213593, | |
| "loss_total": 1.0076138973236084, | |
| "lr": 0.0011271314844657277, | |
| "router/selected_tokens_s0": 4328.875, | |
| "step": 2900, | |
| "tokens_trained": 9.50184356 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.8254733706829303, | |
| "grad_norm": 2.076667547225952, | |
| "loss": 1.1041, | |
| "loss_ce": 0.9882082343101501, | |
| "loss_region": 0.03001856803894043, | |
| "loss_total": 1.0182268619537354, | |
| "lr": 0.0011267245924818767, | |
| "router/selected_tokens_s0": 4341.75, | |
| "step": 2910, | |
| "tokens_trained": 9.534608992 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.8283100489326998, | |
| "grad_norm": 1.930834412574768, | |
| "loss": 1.1031, | |
| "loss_ce": 1.1864138841629028, | |
| "loss_region": 0.03002503328025341, | |
| "loss_total": 1.216438889503479, | |
| "lr": 0.0011263177004980256, | |
| "router/selected_tokens_s0": 4379.125, | |
| "step": 2920, | |
| "tokens_trained": 9.567373632 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.8311467271824693, | |
| "grad_norm": 0.7202333807945251, | |
| "loss": 1.103, | |
| "loss_ce": 1.0883651971817017, | |
| "loss_region": 0.030032740905880928, | |
| "loss_total": 1.1183979511260986, | |
| "lr": 0.0011259108085141746, | |
| "router/selected_tokens_s0": 4386.375, | |
| "step": 2930, | |
| "tokens_trained": 9.600139072 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.8339834054322388, | |
| "grad_norm": 1.0626195669174194, | |
| "loss": 1.1043, | |
| "loss_ce": 1.0197147130966187, | |
| "loss_region": 0.03001200221478939, | |
| "loss_total": 1.0497267246246338, | |
| "lr": 0.0011255039165303236, | |
| "router/selected_tokens_s0": 4317.625, | |
| "step": 2940, | |
| "tokens_trained": 9.632904512 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.8368200836820083, | |
| "grad_norm": 2.428861379623413, | |
| "loss": 1.1036, | |
| "loss_ce": 0.9022196531295776, | |
| "loss_region": 0.030008511617779732, | |
| "loss_total": 0.932228147983551, | |
| "lr": 0.0011250970245464725, | |
| "router/selected_tokens_s0": 4322.125, | |
| "step": 2950, | |
| "tokens_trained": 9.665669952 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.8396567619317779, | |
| "grad_norm": 0.9146430492401123, | |
| "loss": 1.1015, | |
| "loss_ce": 1.1206673383712769, | |
| "loss_region": 0.030019836500287056, | |
| "loss_total": 1.1506872177124023, | |
| "lr": 0.0011246901325626217, | |
| "router/selected_tokens_s0": 4355.75, | |
| "step": 2960, | |
| "tokens_trained": 9.698432616 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.8424934401815474, | |
| "grad_norm": 1.3183574676513672, | |
| "loss": 1.0992, | |
| "loss_ce": 1.088549256324768, | |
| "loss_region": 0.030013682320713997, | |
| "loss_total": 1.118562936782837, | |
| "lr": 0.0011242832405787707, | |
| "router/selected_tokens_s0": 4324.875, | |
| "step": 2970, | |
| "tokens_trained": 9.731196464 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.8453301184313169, | |
| "grad_norm": 1.7237669229507446, | |
| "loss": 1.1016, | |
| "loss_ce": 1.1303554773330688, | |
| "loss_region": 0.030019812285900116, | |
| "loss_total": 1.1603752374649048, | |
| "lr": 0.0011238763485949196, | |
| "router/selected_tokens_s0": 4352.75, | |
| "step": 2980, | |
| "tokens_trained": 9.763955848 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.8481667966810864, | |
| "grad_norm": 2.353868246078491, | |
| "loss": 1.097, | |
| "loss_ce": 1.1438567638397217, | |
| "loss_region": 0.03001641482114792, | |
| "loss_total": 1.1738731861114502, | |
| "lr": 0.0011234694566110684, | |
| "router/selected_tokens_s0": 4346.25, | |
| "step": 2990, | |
| "tokens_trained": 9.796721288 | |
| }, | |
| { | |
| "comp/rl_weight": 0.03, | |
| "comp/strictness": 0.0, | |
| "epoch": 0.8510034749308559, | |
| "grad_norm": 2.239737033843994, | |
| "loss": 1.099, | |
| "loss_ce": 1.137770414352417, | |
| "loss_region": 0.03001844510436058, | |
| "loss_total": 1.1677888631820679, | |
| "lr": 0.0011230625646272174, | |
| "router/selected_tokens_s0": 4345.625, | |
| "step": 3000, | |
| "tokens_trained": 9.829486728 | |
| }, | |
| { | |
| "epoch": 0.8510034749308559, | |
| "eval_ppl": 2.91798250107805, | |
| "eval_runtime": 2.489, | |
| "step": 3000, | |
| "tokens_trained": 9.829486728 | |
| }, | |
| { | |
| "epoch": 0.8510034749308559, | |
| "eval_F": 0.34119725725854944, | |
| "eval_F_cds": 0.339909922293828, | |
| "eval_F_dig": 0.3374221944422741, | |
| "eval_F_exon": 0.3444720286625102, | |
| "eval_F_intron": 0.3423051363848719, | |
| "eval_F_nig": 0.3420074982635899, | |
| "eval_F_promoter": 0.33568609090152685, | |
| "eval_F_utr": 0.3433317082766702, | |
| "eval_G": 0.35626090599344656, | |
| "eval_G_cds": 0.3533774528284723, | |
| "eval_G_dig": 0.39929882420827145, | |
| "eval_G_exon": 0.35481589922102014, | |
| "eval_G_intron": 0.3559872186522367, | |
| "eval_G_nig": 0.35704285773301014, | |
| "eval_G_promoter": 0.354736183175574, | |
| "eval_G_utr": 0.3543053844969594, | |
| "eval_avg_bp_per_token": 2.930855916119598, | |
| "eval_bp_per_token/cds": 2.9419558959963843, | |
| "eval_bp_per_token/dig": 2.9636461870947826, | |
| "eval_bp_per_token/exon": 2.9029933254166496, | |
| "eval_bp_per_token/intron": 2.921370127720335, | |
| "eval_bp_per_token/nig": 2.923912502144284, | |
| "eval_bp_per_token/promoter": 2.9789735920078644, | |
| "eval_bp_per_token/utr": 2.9126351452344177, | |
| "eval_ppl_cds": 3.5636364626812047, | |
| "eval_ppl_dig": 1.0968188962634289, | |
| "eval_ppl_exon": 3.3285669782872387, | |
| "eval_ppl_intron": 2.935885553210843, | |
| "eval_ppl_nig": 2.7347129604188645, | |
| "eval_ppl_promoter": 3.292230226733986, | |
| "eval_ppl_utr": 3.2942767869833, | |
| "step": 3000, | |
| "tokens_trained": 9.829486728 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 30600, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 9, | |
| "save_steps": 3000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |