joelniklaus HF Staff commited on
Commit
a6df89a
Β·
1 Parent(s): d877609

made plot d3 component more general and added finephrase vs baseline comparison

Browse files
app/src/content/assets/data/finephrase_vs_baselines.csv ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ runname,seed,steps,agg_score_micro,lighteval|arc_cf:easy|3/prob_norm_token,lighteval|drop|3/prob_norm_token,lighteval|gsm8k|3/prob_norm_token,lighteval|hellaswag_cf|3/prob_norm_token,lighteval|openbookqa_cf|3/prob_norm_token,lighteval|piqa_cf|3/prob_norm_token,lighteval|squad_v2|3/prob_norm_token,lighteval|treb_qa|3/prob_norm_token,lighteval|wikitablequestions|3/prob_norm_token,lighteval|winogrande_cf|3/prob_norm_token,lighteval|xcsqa_cf|3/prob_norm_token,lighteval|mmlu_redux_cf:_average|3/prob_norm_token,agg_score_RC,agg_score_GK,agg_score_NLU,agg_score_MATH,agg_score_TABLE,agg_score_RES,agg_score_macro
2
+ cosmopedia,42,500,0.02913293526383949,0.01644980523353067,0.05708106828394926,0.07507547011560446,0.025204289035927866,0.004981102491946008,0.015223517258831067,0.059624836394866286,0.009275745169189628,0.050868520098123385,0.0011497275257879671,0.0023969106282150515,0.03226423093010228,0.05835295233940777,0.02435701808181647,0.013177008280857917,0.07507547011560446,0.030072132633656507,0.007533843459664041,0.034761404151834534
3
+ cosmopedia,42,1000,0.04124795323520204,0.03376632969431081,0.0716128388383384,0.08040159709443524,0.03560581270448807,0.0062957483423988275,0.02612757916237944,0.09776743961934783,0.009555730693441691,0.08185223438207805,0.001536835821548027,0.0049266465186134055,0.04552664595104469,0.08469013922884311,0.03964648782267775,0.01857132426301805,0.08040159709443524,0.04570398253775987,0.012449991341130557,0.04691058704797743
4
+ cosmopedia,42,1500,0.04963047226398357,0.0402981783360864,0.07913986423773972,0.09180952610013449,0.04059972968013296,0.007410394221063179,0.030842821946334014,0.12491390042085257,0.021110385005617776,0.1002281011619102,0.0019249414445667013,0.006217583147858968,0.05107024146550601,0.10202688232929615,0.045684209900796205,0.02126233556234983,0.09180952610013449,0.060669243083763987,0.014823599771752053,0.05604596612468212
5
+ cosmopedia,42,2000,0.055408550551699416,0.0568002954147212,0.0909358649801166,0.09635380830154233,0.04342689974729869,0.009485241437043596,0.03475827127170817,0.15457097289454855,0.004615149488716231,0.09751936721761113,0.0020059463567881027,0.012172360178745664,0.06225842933155281,0.12275341893733258,0.05952936237313701,0.022716423052043397,0.09635380830154233,0.05106725835316368,0.01880529096249914,0.061870926996619696
6
+ cosmopedia,42,2500,0.06144020140882423,0.06562859372528666,0.11021627928084318,0.09630655113051576,0.0468583343189605,0.009629600406227172,0.0368552236394807,0.16902945248764667,0.01050788263277363,0.11023229745426368,0.0022796756956278428,0.013052901110046075,0.06668562502421899,0.13962286588424494,0.06615710937475283,0.02456900500729417,0.09630655113051576,0.060370090043518655,0.019845908385251316,0.06781192163759628
7
+ cosmopedia,42,3000,0.06121668029315411,0.06969621409720483,0.10185953561700806,0.09874590044085123,0.05009863073138798,0.010188544711124098,0.039933801768543206,0.1716537704504427,0.01083332140482438,0.09492669454623737,0.002904571514808216,0.013753760564276728,0.07000541767114048,0.13675665303372536,0.06985081588417266,0.0265016011230981,0.09874590044085123,0.05288000797553087,0.021292035681314676,0.06767116902311548
8
+ cosmopedia,42,3500,0.0663916711095035,0.07541054538396873,0.10016331777214653,0.10381392344419293,0.05243001054507087,0.010455061992097035,0.04267662005835878,0.19460969592718852,0.010023798092275377,0.1133829366996711,0.0032535435652782125,0.01596619460094678,0.0745144052328472,0.14738650684966753,0.07496247530840797,0.02784177705517454,0.10381392344419293,0.06170336739597324,0.023032625550467534,0.07312344593398062
9
+ cosmopedia,42,4000,0.07180351600888919,0.07566954613967665,0.11273495410886125,0.09608445460195386,0.05442526749758481,0.01166117470176923,0.043781903998197,0.22876664811713013,0.026032774541086438,0.11887824245733007,0.003758279690394813,0.016374059802257045,0.07347488645042907,0.17075080111299568,0.07457221629505287,0.029091773593989814,0.09608445460195386,0.07245550849920826,0.02393904616740776,0.07781563337843471
10
+ cosmopedia,42,4500,0.0656027681218991,0.07107521915583201,0.09964708629801075,0.09723004289941957,0.055162473954239795,0.011634927328086455,0.04693290401977367,0.19028292652250303,0.022640654916820647,0.1021815961640159,0.003596286975441124,0.015527511272809375,0.0713215879558367,0.1449650064102569,0.07119840355583434,0.029379380464840458,0.09723004289941957,0.06241112554041827,0.02469844754022317,0.07164706773516545
11
+ cosmopedia,42,5000,0.07441916275873679,0.07772802928199113,0.11979591994484576,0.1054445123428739,0.0555141688550074,0.012375042435616844,0.04678685589565672,0.25030905667429465,0.012293637162199007,0.11558599574917801,0.0037032972020692,0.017017301322272728,0.07647613623883621,0.18505248830957022,0.07710208276041366,0.029608733028538302,0.1054445123428739,0.06393981645568851,0.025393066551182095,0.08109011657471112
12
+ cosmopedia,42,5500,0.07616889961526692,0.08586521784016052,0.10124758702743844,0.10674611045804462,0.058034833992440034,0.012319562347954182,0.04744391394212781,0.2546519548034475,0.018954785938345143,0.12347192103193484,0.0051610805765532584,0.01839009914731498,0.08173972827744186,0.17794977091544295,0.08380247305880119,0.031597957284496644,0.10674611045804462,0.07121335348513999,0.026051191812465662,0.08289347616906519
13
+ cosmopedia,42,6000,0.08099789724204517,0.08882521070108303,0.12735352719958415,0.10941075268970758,0.05812652072180636,0.012385956248201166,0.04765550787536823,0.2776440888809085,0.016711447928293665,0.13042478387629072,0.003921410885071205,0.018112324905211703,0.08140323499301569,0.20249880804024634,0.08511422284704936,0.03102396580343878,0.10941075268970758,0.07356811590229219,0.026051263009593695,0.08794452138205466
14
+ cosmopedia,42,6500,0.07801033294560415,0.08204899090790067,0.12286647679849201,0.08790568205765933,0.05984436116044507,0.01205167251574742,0.04851285111081279,0.2848961360831886,0.023770329934153595,0.11508294245412083,0.00420367110222168,0.018246525704118936,0.07669435551838873,0.2038813064408403,0.0793716732131447,0.03202401613133338,0.08790568205765933,0.06942663619413722,0.02627034977689305,0.08314661063566799
15
+ cosmopedia,42,7000,0.08548292823468344,0.08533814469577075,0.13544702269754663,0.10719485942364332,0.06019020289676362,0.013405605557102096,0.0496142996152204,0.30618047286508054,0.017818594804336126,0.13963953817422833,0.004348850038180937,0.0236620389015861,0.08295550914674236,0.22081374778131357,0.08414682692125655,0.032269526467472276,0.10719485942364332,0.07872906648928223,0.02889398135796954,0.0920080014068229
16
+ cosmopedia,42,7500,0.0865703162890722,0.08984361281399103,0.12025344931032177,0.10320614567039915,0.06122459770640719,0.012495559512436838,0.049643987849612194,0.331511354685079,0.019897403537450682,0.1431926647202153,0.006029502617134309,0.02085308298844795,0.08069243405737127,0.22588240199770038,0.08526802343568116,0.03362705016177075,0.10320614567039915,0.08154503412883299,0.027664210116832327,0.09286547758520279
17
+ cosmopedia,42,8000,0.08935496800395408,0.08787559443980666,0.15113832825406034,0.10177575236106819,0.06267944055666991,0.012328437894109482,0.05049906237803139,0.32612879296982994,0.023309744095949672,0.14577382630581717,0.004575005081071323,0.023176683151316647,0.08299894855971805,0.23863356061194513,0.08543727149976235,0.033627222818870615,0.10177575236106819,0.08454178520088343,0.028668061141152505,0.09544727560561371
18
+ cosmopedia,42,8500,0.08496482413844532,0.09032872308584992,0.14596326443088656,0.10521061598164272,0.06293376930262534,0.01179172491640762,0.050726946193502785,0.30740922848116525,0.02429351905327144,0.1121663499152876,0.0049556757944356,0.02224768208948586,0.08155039041678296,0.22668624645602592,0.08593955675131644,0.03394472254853047,0.10521061598164272,0.06822993448427951,0.02825545106646542,0.09137775454804342
19
+ cosmopedia,42,9000,0.08361274715260704,0.09254362908583605,0.10953523679799548,0.10380177690532401,0.06243000178511952,0.011652919822048803,0.05092647819510025,0.30653581218202514,0.017896957383616914,0.12946289888553303,0.003848008119362756,0.026501396702508038,0.08821784996681448,0.2080355244900103,0.09038073952632526,0.03313900495224114,0.10380177690532401,0.07367992813457497,0.029693598239885696,0.08978842870806024
20
+ cosmopedia,42,9500,0.09277612714731535,0.10514627047570546,0.13663089759800565,0.10320967226040505,0.06605400194318554,0.01298875894560893,0.053697929823851555,0.35209512779237045,0.021987363748635042,0.13337928761288226,0.006941631855732927,0.029765689477053946,0.09141689423434754,0.24436301269518806,0.0982815823550265,0.03649781689945924,0.10320967226040505,0.07768332568075866,0.03215079274883815,0.09869770043994595
21
+ cosmopedia,42,10000,0.09713070150622272,0.1099484461088254,0.1385219884835079,0.1077912252718024,0.06854509328349631,0.013037806175627246,0.05760761921210011,0.37396816691837215,0.02340183572970216,0.136869361648535,0.0063521372614830434,0.032283548759282396,0.09724118922193857,0.25624507770094,0.10359481766538198,0.037448615272489674,0.1077912252718024,0.08013559868911858,0.034309658049003246,0.10325416544145598
22
+ mix-fw_edu_hq-table_smollm2_1.7b_hq,42,500,0.05497202738487924,0.043888791627692214,0.07460081309682072,0.0909782586710973,0.027876488119456623,0.008072883577787215,0.02051420846060384,0.1617051955054037,0.06602295809899959,0.10268333935383694,0.00815657924316373,0.0033059830143184607,0.05185882984937056,0.1181530043011122,0.04787381073853139,0.018016533681310176,0.0909782586710973,0.08435314872641828,0.010631025017569838,0.06166763018933987
23
+ mix-fw_edu_hq-table_smollm2_1.7b_hq,42,1000,0.08339086204445532,0.10238764461840953,0.12673011117801516,0.09052971517217097,0.04346501296225431,0.014916245756766831,0.034019607127749436,0.2496537550077984,0.09630394687339318,0.13819540213570458,0.013993487690899496,0.008922625179076039,0.08157279083122602,0.18819193309290677,0.09198021772481778,0.028729250326576902,0.09052971517217097,0.11724967450454887,0.019286159354530766,0.08932782502925869
24
+ mix-fw_edu_hq-table_smollm2_1.7b_hq,42,1500,0.09615738084060316,0.12802980724125926,0.12775816534750992,0.10104346038604804,0.047949606190980876,0.01808526273352843,0.037748044488513696,0.30796469863451875,0.10921484534315283,0.15463125522120116,0.014607328677324173,0.01325062131471197,0.09360547450848884,0.21786143199101432,0.11081764087487406,0.031278467434152524,0.10104346038604804,0.131923050282177,0.023027976178918035,0.10265867119119733
25
+ mix-fw_edu_hq-table_smollm2_1.7b_hq,42,2000,0.1086427603353443,0.14954049130397948,0.14182266926274986,0.09585888107956692,0.05308496960119366,0.02031388308111189,0.04177542481892087,0.3670467200065427,0.12060223958208637,0.1721704883907508,0.020847733413292047,0.016052783415978204,0.10459684006795883,0.2544346946346463,0.12706866568596914,0.03696635150724285,0.09585888107956692,0.14638636398641858,0.026047363772003656,0.11446038677764125
26
+ mix-fw_edu_hq-table_smollm2_1.7b_hq,42,2500,0.11678573043499364,0.16499517381011572,0.1570696228480358,0.1018030630962526,0.05632240550395245,0.022942654273776987,0.04527304492919819,0.40037123618139686,0.1309033874369144,0.1652454706254375,0.023832756157793077,0.02096279806086712,0.11170715229618304,0.2787204295147163,0.13835116305314937,0.04007758083087276,0.1018030630962526,0.14807442903117596,0.029726165754614103,0.12279213854679684
27
+ mix-fw_edu_hq-table_smollm2_1.7b_hq,42,3000,0.1227298056760559,0.1705434193639903,0.13641915890766848,0.10409819680450168,0.058691497421398967,0.027163807386248698,0.04850198210945816,0.42305727309724306,0.14337923513086004,0.19030104091793812,0.030022640888477992,0.022736857564893343,0.11784255851999163,0.27973821600245574,0.14419298894199098,0.04435706915493848,0.10409819680450168,0.16684013802439907,0.032800882353533393,0.1286712485469699
28
+ mix-fw_edu_hq-table_smollm2_1.7b_hq,42,3500,0.1314334968225223,0.18423965688370678,0.15271030792608034,0.10109513940753845,0.061509397478021324,0.027635735341552784,0.05051220430460552,0.4588881783575565,0.1618549897950074,0.19050082146842992,0.034468238566735684,0.028825584897231887,0.12496170744380108,0.30579924314181844,0.15460068216375394,0.047988818022378504,0.10109513940753845,0.17617790563171865,0.0356578415144634,0.13688660498027858
29
+ mix-fw_edu_hq-table_smollm2_1.7b_hq,42,4000,0.1332019362216416,0.18507502454513677,0.1572850859214566,0.10419155470794772,0.06255835953403013,0.026967514379063716,0.05180308261411266,0.48022449664935574,0.15174584132025593,0.18943187355809618,0.03342770763839489,0.028484697186613422,0.12722799660523532,0.3187547912854062,0.15615151057518606,0.047993033586212513,0.10419155470794772,0.17058885743917607,0.0357517647265966,0.13890525205342086
30
+ mix-fw_edu_hq-table_smollm2_1.7b_hq,42,4500,0.13926821960208588,0.19403697967519826,0.1900001270576879,0.10370921833392541,0.06451143584709607,0.028245490069410798,0.05306335171055211,0.5047326573045379,0.16739842289589846,0.16832572032790716,0.03340946942237883,0.033063203858285165,0.13072255872215258,0.34736639218111287,0.16237976919867542,0.048960452634737445,0.10370921833392541,0.1678620716119028,0.03812401521274936,0.1447336531955172
31
+ mix-fw_edu_hq-table_smollm2_1.7b_hq,42,5000,0.14442127461323784,0.19665543753740394,0.14085530492645892,0.10684709596788833,0.0657321822351631,0.030879691559855493,0.0539732739946766,0.5382313844966209,0.17004663652172006,0.2226942357913624,0.03222495551252107,0.037558640515889564,0.1373564562992935,0.3395433447115399,0.16700594691834872,0.04897856887384208,0.10684709596788833,0.19637043615654123,0.04080386869014055,0.14992487688638348
32
+ mix-fw_edu_hq-table_smollm2_1.7b_hq,42,5500,0.13971015880682064,0.19605343311675158,0.1511972802969607,0.10557630149963114,0.0669723742730497,0.030218822633090305,0.05430898824221272,0.5021353527787574,0.16100417640605444,0.20616568279157108,0.035622785743038135,0.03367826370748683,0.1335884441932436,0.32666631653785905,0.16482093865499758,0.051297580008043915,0.10557630149963114,0.18358492959881276,0.03940202486092995,0.14522468186004575
33
+ mix-fw_edu_hq-table_smollm2_1.7b_hq,42,6000,0.14398073283339033,0.2022853724762129,0.15652976336605173,0.10722511152867062,0.06634623060591266,0.028803854752332634,0.055847650631034675,0.5193328249159094,0.17035290221546426,0.20868541415667066,0.037035771146383144,0.039972555100646324,0.13535134310539554,0.33793129414098055,0.16881835779080423,0.051691000876147905,0.10722511152867062,0.18951915818606746,0.04154135349467122,0.14945437933622366
34
+ mix-fw_edu_hq-table_smollm2_1.7b_hq,42,6500,0.1505931655024272,0.20959759419315002,0.17286796783518985,0.10915211312785428,0.0679797088727532,0.029815842021685482,0.05570872361648764,0.5484850210055906,0.18048638798990244,0.21785600020824938,0.03353868589734435,0.04035781445342067,0.14127212680749857,0.36067649442039024,0.1754348605003243,0.05075919738504878,0.10915211312785428,0.1991711940990759,0.0419607933638646,0.15619244214942637
35
+ mix-fw_edu_hq-table_smollm2_1.7b_hq,42,7000,0.1464742487252148,0.20341816996444992,0.14551407211819187,0.1057297626245669,0.06956275571849373,0.03240960808738502,0.05658341735298749,0.5381168910757818,0.1691133849347958,0.21521073712008235,0.03707498715625,0.04248064018647134,0.14247655836312156,0.3418154815969868,0.17294736416378576,0.053318871437371865,0.1057297626245669,0.19216206102743907,0.04382455520894796,0.1516330160098497
36
+ mix-fw_edu_hq-table_smollm2_1.7b_hq,42,7500,0.1517105578379645,0.20132129855879002,0.1527633022636308,0.10704359442804749,0.07022009847906449,0.03147189735161151,0.05654149495954995,0.570532862874293,0.18488848294861215,0.22447892983300183,0.037943160000325785,0.04132730784489554,0.14199426451375163,0.36164808256896186,0.17165778153627081,0.05408162923969514,0.10704359442804749,0.204683706390807,0.043113566718685666,0.157038060147078
37
+ mix-fw_edu_hq-table_smollm2_1.7b_hq,42,8000,0.153111585321741,0.20370600323462096,0.15559967087940202,0.10995400480081047,0.0703778830508127,0.03168853210271354,0.05685130056512766,0.5573852453552104,0.20006111411461655,0.23535280897044764,0.03544271514235402,0.04078750637679567,0.14013223926798019,0.35649245811730623,0.17191912125130057,0.05291029909658336,0.10995400480081047,0.2177069615425321,0.04310911301487896,0.15868199297056862
38
+ mix-fw_edu_hq-table_smollm2_1.7b_hq,42,8500,0.153474214811659,0.2156416888077374,0.14767760594012713,0.1086404631393254,0.0703495646729733,0.035481303387208424,0.057656227068679705,0.563167335678502,0.19176190889916359,0.22524712311352688,0.037423606381044354,0.04378362868833993,0.14486012196328,0.35542247080931455,0.1802509053855087,0.053886585527008826,0.1086404631393254,0.20850451600634523,0.045640386381409354,0.15872422120815202
39
+ mix-fw_edu_hq-table_smollm2_1.7b_hq,42,9000,0.15431434783397444,0.21332785298597565,0.18138948523020929,0.10920361971138605,0.07165906701155657,0.03237368284994406,0.059718017340528194,0.5700328205641509,0.17389265540119148,0.21668574221748044,0.03588189889073877,0.0425072941680442,0.1451000376364875,0.3757111528971801,0.17921394531123158,0.05377048295114767,0.10920361971138605,0.19528919880933596,0.04486633145283881,0.1596757885221867
40
+ mix-fw_edu_hq-table_smollm2_1.7b_hq,42,9500,0.16005748327065486,0.21738016620018716,0.1753445777547784,0.1118670648000796,0.07378068138362223,0.033658712377510745,0.06202369654688045,0.5824510751503132,0.19484951287508973,0.23499616874860976,0.038260197199537195,0.04559582780714385,0.150482118404106,0.3788978264525458,0.18393114230214658,0.05602043929157971,0.1118670648000796,0.21492284081184976,0.04709274557717835,0.16545534320589664
41
+ mix-fw_edu_hq-table_smollm2_1.7b_hq,42,10000,0.16643219455201952,0.22104265954029498,0.21328824188432033,0.11368725290137877,0.0766473487870297,0.035538152606588894,0.06356709975061516,0.6127574281795107,0.2130087186112811,0.2053092042656179,0.0386371297699402,0.04991314653034026,0.15378995179731605,0.4130228350319155,0.18741630566880552,0.05764223927848495,0.11368725290137877,0.2091589614384495,0.04967279962918144,0.17176673232470263
42
+ nemotron_hq_synth,42,500,0.039651040279900684,0.02521080354500527,0.08721113630836978,0.05662676366027639,0.031814615729775696,0.006743888401216779,0.02405165913161938,0.07518432561800878,0.0353232685179539,0.0832831589480137,0.0025603179472343247,0.0038350073918069906,0.04396753815952727,0.08119773096318927,0.03458917085226627,0.01718746683850501,0.05662676366027639,0.059303213732983806,0.011543518308214382,0.043407977392572517
43
+ nemotron_hq_synth,42,1000,0.05810237150109671,0.0557521277001006,0.09014895108979679,0.09624088068246797,0.04663541901386824,0.009979157708001441,0.03837330121857019,0.12378904501683104,0.04635777456911922,0.1157220273651087,0.004684014565035495,0.00639588700425435,0.0631498720800065,0.10696899805331392,0.05945099989005355,0.025659716789451868,0.09624088068246797,0.08103990096711396,0.018249448643608657,0.06460165750433498
44
+ nemotron_hq_synth,42,1500,0.07195689680181379,0.07913489236087619,0.12539193988020703,0.10047277888907442,0.053379673742216534,0.013153722355709014,0.0447560898586656,0.17189836266809538,0.06217527836391408,0.11914666513783351,0.00823924208199497,0.011196835684551864,0.07453728059862684,0.1486451512741512,0.07683608647975151,0.03080945791210575,0.10047277888907442,0.09066097175087379,0.02303554929964216,0.0784099992675998
45
+ nemotron_hq_synth,42,2000,0.07928569861732147,0.09393732383469464,0.10410439073944557,0.10282672182849376,0.05853735121383968,0.014509907583777317,0.04926649835018796,0.21742409707244026,0.061424601614937874,0.13188065357213857,0.012847561329417693,0.016855623157659313,0.08781365311082491,0.1607642439059429,0.09087548847275978,0.03569245627162869,0.10282672182849376,0.09665262759353822,0.02687734303054153,0.08561481351715082
46
+ nemotron_hq_synth,42,2500,0.0891604770455474,0.11464012531838454,0.13864367612403083,0.10316555962415479,0.06173150262670635,0.015651130257446712,0.05181886103820301,0.24177343020955966,0.0740838039662981,0.14573391323978432,0.013470914822123186,0.015466920506005604,0.09374588681387162,0.19020855316679525,0.10419300606612808,0.03760120872441477,0.10316555962415479,0.10990885860304121,0.027645637267218443,0.09545380390862541
47
+ nemotron_hq_synth,42,3000,0.09037626326918476,0.12623950632915582,0.11855261679744733,0.09678583517920873,0.0626349588611655,0.018345536309102427,0.053424336708777755,0.26236462212792905,0.07217109145641441,0.1337614057460242,0.022926523956926907,0.01702235584892099,0.100286369909144,0.1904586194626882,0.1132629381191499,0.042780741409046204,0.09678583517920873,0.1029662486012193,0.02959740962226706,0.09597529873226324
48
+ nemotron_hq_synth,42,3500,0.09795320268695086,0.13382261010354013,0.12547854646356435,0.10405079222906868,0.0661486602964717,0.019409284692449225,0.05416908711654374,0.3091239440020756,0.07689545480774088,0.13819714037034073,0.02220307637515695,0.022287988088026713,0.10365184769843165,0.21730124523281996,0.1187372289009859,0.04417586833581433,0.10405079222906868,0.1075462975890408,0.03195545329900656,0.10396114759778935
49
+ nemotron_hq_synth,42,4000,0.10276022435362109,0.14183301447543797,0.13105414710509541,0.10318167913865792,0.0669034257826245,0.02160038185279176,0.056667393754237814,0.3115664928779588,0.0840886294835673,0.15864345267403296,0.021705684316899336,0.02521391472559382,0.11066447605655556,0.22131031999152712,0.12624874526599678,0.04430455504976191,0.10318167913865792,0.12136604107880013,0.03449389677754113,0.1084842062170475
50
+ nemotron_hq_synth,42,4500,0.10589796102766387,0.14995689210296353,0.1300186029361498,0.1006565847256015,0.06847679516921591,0.02170730691707844,0.057402810772006006,0.33263917774047097,0.0879004934545943,0.15894128403493232,0.023406293690234,0.026482898374156214,0.1131863924145636,0.2313288903383104,0.13157164225876355,0.04594154442972496,0.1006565847256015,0.12342088874476331,0.03519767202108022,0.11135287041970733
51
+ nemotron_hq_synth,42,5000,0.11121488341912057,0.1534008443898165,0.14951222505049067,0.09848300488058413,0.07003801135800797,0.021958084416484416,0.05922456540434963,0.33748169546968565,0.10156976368099678,0.16092560983388768,0.029777554512878564,0.031205228129759163,0.12100201390250556,0.24349696026008816,0.137201429146161,0.04990778293544326,0.09848300488058413,0.13124768675744222,0.03746262598353107,0.11629991499387499
52
+ nemotron_hq_synth,42,5500,0.11311404511736926,0.15516654882933922,0.15209282365884583,0.10158771318104871,0.07123201424060252,0.023565275199956378,0.06114934177051213,0.33601066310629,0.09859562206099962,0.17149658277121704,0.033916364386397654,0.032227710595457267,0.12032788160776454,0.2440517433825679,0.13774721521855188,0.05257418931350009,0.10158771318104871,0.13504610241610832,0.03898077585530859,0.11833128989451426
53
+ nemotron_hq_synth,42,6000,0.1073559536046002,0.15586794207035007,0.1326965003090768,0.09926817477604612,0.07181212663172408,0.021472294824712485,0.06196634584942247,0.3238753541014552,0.08075065961013964,0.1619637283610994,0.02901137347268065,0.03003813846910205,0.11954880477939328,0.22828592720526597,0.1377083734248717,0.050411750052202366,0.09926817477604612,0.12135719398561952,0.037825593047745666,0.11247616874862522
54
+ nemotron_hq_synth,42,6500,0.11459190815940934,0.1598060184380004,0.16356373328177568,0.09893754808174242,0.07275127003678514,0.022982679039582052,0.06347790304677574,0.3552690759021704,0.10281926644847421,0.161481453393715,0.024991369360983777,0.028435012555457707,0.12058756832744968,0.259416404591973,0.14019679338272506,0.04887131969888446,0.09893754808174242,0.1321503599210946,0.03829853154727183,0.11964515953728189
55
+ nemotron_hq_synth,42,7000,0.11947696525346256,0.16957283545722018,0.15338508023648154,0.10267713811600235,0.07293863960611924,0.02472887058756763,0.06224060315892607,0.38086180872412706,0.10187995606002764,0.1743867962225419,0.030390230575725107,0.03568095724664217,0.12498066705017005,0.2671234444803043,0.1472767512536951,0.051664435090922174,0.10267713811600235,0.1381333761412848,0.040883476997711964,0.1246264370133201
56
+ nemotron_hq_synth,42,7500,0.11905367547190097,0.17197933150833233,0.1660823814839022,0.10266969031663754,0.07421277319881611,0.02525573257213466,0.06415429666150758,0.3786566306462613,0.098942639696671,0.15674347826188884,0.03151208194269936,0.030634898097217055,0.1278001712767435,0.27236950606508176,0.1498897513925379,0.052862427570757736,0.10266969031663754,0.12784305897927992,0.0400149757769531,0.12427490168354133
57
+ nemotron_hq_synth,42,8000,0.11928478920066558,0.16781076293868702,0.14325354294818182,0.10281305926728988,0.07461424489875063,0.02400955924566592,0.06398430296886827,0.3993208038457467,0.0965965258078817,0.16150562113600314,0.03524446129861041,0.033692990564351134,0.12857159548795058,0.2712871733969643,0.1481911792133188,0.05492935309868052,0.10281305926728988,0.12905107347194242,0.04056228425962844,0.1244723537846374
58
+ nemotron_hq_synth,42,8500,0.1208980293451732,0.17216781431725084,0.1505033593002199,0.10392738378376219,0.07481191719915099,0.023163846657303182,0.06325500872883888,0.38594607091948824,0.10286949818236577,0.17545317494968726,0.031019160609311505,0.038232247696591115,0.12942686979810872,0.2682247151098541,0.15079734205767978,0.05291553890423125,0.10392738378376219,0.13916133656602653,0.041550367694244396,0.12609611401929968
59
+ nemotron_hq_synth,42,9000,0.11712752533452979,0.16624200206904388,0.13965175499435037,0.10565824906080684,0.075698399375578,0.02444246488153262,0.06597617482488803,0.3721941532205233,0.08841529043764194,0.16570800138694497,0.04046622878986389,0.032597048223138136,0.12848053675004567,0.2559229541074368,0.14736126940954478,0.058082314082720944,0.10565824906080684,0.12706164591229346,0.04100522930985293,0.12251527698044264
60
+ nemotron_hq_synth,42,9500,0.12537208257873547,0.17578932938837427,0.15246040812265357,0.10471494674296045,0.07893239139464389,0.0234359338779227,0.0669753380813127,0.4039636902627306,0.10326598659320063,0.1823187366583999,0.0402253715887885,0.036461854028429226,0.1359210042054092,0.2782120491926921,0.15585516679689174,0.059578881491716196,0.10471494674296045,0.14279236162580028,0.04229104199588821,0.1305740746409915
61
+ nemotron_hq_synth,42,10000,0.13053033482303275,0.1833489003594342,0.16390774148358422,0.10532042652172288,0.08129337889204831,0.026658446953565253,0.06982583711386792,0.4256470896685524,0.1004011755011827,0.17992691265266672,0.04341169785398802,0.043647083609207764,0.14297532726657225,0.2947774155760683,0.16316211381300322,0.062352538373018164,0.10532042652172288,0.1401640440769247,0.04671045589221365,0.1354144990421585
62
+ rewire,42,500,0.04265056692256005,0.017686775761007422,0.07063000758921417,0.0903926210271821,0.030379799013744737,0.004539264178227114,0.023639981534482,0.1107927894015786,0.02662659943622075,0.09234833137637262,0.002912124349926271,0.002438425242832636,0.039420084159932096,0.09071139849539639,0.028553429960469758,0.016645961681835506,0.0903926210271821,0.05948746540629669,0.010205890318513917,0.049332794481615726
63
+ rewire,42,1000,0.06484311728144444,0.04049910217052242,0.0985333468707251,0.09272197310137888,0.045354188509845,0.009081214028769788,0.03966070087151555,0.18394079775544742,0.05592438212146007,0.13637468879651304,0.00535327323369042,0.007256215229922551,0.06341752468754301,0.14123707231308624,0.05195831342903272,0.02535373087176771,0.09272197310137888,0.09614953545898655,0.018666043376735962,0.07101444475849801
64
+ rewire,42,1500,0.07190741905633209,0.05892115075137047,0.10436893735488467,0.09795460382659905,0.053258325342698795,0.010844613060567948,0.046119534385095785,0.20469495625029605,0.0573370692498,0.14140054855229928,0.006609331087688163,0.011982514632247587,0.06939744418243737,0.15453194680259036,0.06415929746690392,0.02993382821519348,0.09795460382659905,0.09936880890104964,0.02298222069263711,0.07815511765082893
65
+ rewire,42,2000,0.08061218582376058,0.07078496771644291,0.09514233622292204,0.09790497768108983,0.058547907461149816,0.013215992847969859,0.050800339383308475,0.25919881013704127,0.06798204974806973,0.1474184488528701,0.013355009392517886,0.015730607162547847,0.07726478327919721,0.17717057317998164,0.07402487549782005,0.03595145842683385,0.09790497768108983,0.10770024930046991,0.026582313131275393,0.08655574120291178
66
+ rewire,42,2500,0.08420039972794953,0.0771603417427458,0.11276586955621451,0.09885428216223155,0.06084503993366509,0.015064331133729063,0.0550019303420429,0.26251550608419394,0.07393736409516435,0.14873268946463206,0.012001684289889788,0.014570200439499864,0.07895555749138555,0.1876406878202042,0.07805794961706568,0.03642336211177744,0.09885428216223155,0.1113350267798982,0.028212153971757276,0.09008724374382238
67
+ rewire,42,3000,0.09273231175284881,0.08688501540330522,0.12433475254180555,0.10091173801077505,0.06409726550823135,0.016415555460041233,0.057262931143389734,0.3083276276216714,0.08598627557166745,0.14804341765311896,0.013863234095875228,0.02077112347443672,0.08588880454986808,0.21633119008173846,0.08638690997658666,0.038980249802053286,0.10091173801077505,0.1170148466123932,0.031483203359289225,0.09851802297380598
68
+ rewire,42,3500,0.0943552071841897,0.09198815676417393,0.11226571517194292,0.10031888172027258,0.0658774178188419,0.017955618813174097,0.05958258224760938,0.3155799049237895,0.09068626120982909,0.15085156921568946,0.014212479354972775,0.024739580702601693,0.08820431826737882,0.21392281004786623,0.09009623751577638,0.04004494858690734,0.10031888172027258,0.12076891521275927,0.03409259392112839,0.09987406450078502
69
+ rewire,42,4000,0.10045323135126227,0.09808493132564441,0.12856773797683327,0.099866389535988,0.06706545294538216,0.01766860798312627,0.06150439737831871,0.347323918915057,0.09378039241838648,0.15873791108538501,0.015553513269482568,0.02547021553417667,0.09181530784736656,0.23794582844594514,0.09495011958650548,0.04130948310743236,0.099866389535988,0.12625915175188573,0.03488107363187388,0.10586867434327175
70
+ rewire,42,4500,0.10036503829627282,0.10088855549984291,0.10524296722099793,0.10039707474235997,0.06861016353525448,0.018662317184749654,0.0616205043433571,0.3687800451936108,0.08722579301191119,0.15474602705856588,0.01565302153107318,0.02681297594213349,0.09574101429141714,0.23701150620730438,0.09831478489563003,0.04213159253316383,0.10039707474235997,0.12098591003523854,0.035698599156746745,0.10575657792840726
71
+ rewire,42,5000,0.1065431939221554,0.10500375304236402,0.13821420108568633,0.09877602848879723,0.06953066797835643,0.02138180895412673,0.06225948319143421,0.3955091290222677,0.09096177483016488,0.1559163685145909,0.018203569223752367,0.026421954077943774,0.09633958865638043,0.266861665053977,0.10067167084937223,0.0438671186010544,0.09877602848879723,0.12343907167237789,0.03668774874116824,0.11171721723445782
72
+ rewire,42,5500,0.11073908066220405,0.12446739213241784,0.1084806017696248,0.10143789776164783,0.07183017925454765,0.022495811849453262,0.06449095218497226,0.4110964572347125,0.099710100614002,0.1692942587153078,0.020466902502368847,0.03514829729241852,0.09995011663497495,0.25978852950216863,0.1122087543836964,0.04614854087845825,0.10143789776164783,0.1345021796646549,0.040711687108948014,0.11579959821659568
73
+ rewire,42,6000,0.11152831236617355,0.11750599169433543,0.14825706614540907,0.1022314017734436,0.07221089194749322,0.021348525798453536,0.06617793824944421,0.3930460633066562,0.10122473219185088,0.16298423454426988,0.018511070747557017,0.0324315609899276,0.10241027100524196,0.27065156472603263,0.10995813134978868,0.04536098134752512,0.1022314017734436,0.13210448336806038,0.039986008345941786,0.11671542848513204
74
+ rewire,42,6500,0.11412083094338983,0.12162911343289091,0.1650194763853759,0.10601544979229971,0.07396817821031502,0.020949160901119274,0.06491409833261005,0.40895763295645876,0.10067482623176112,0.15433181279214245,0.020207349210975126,0.030707694092558288,0.10207517898217155,0.28698855467091733,0.11185214620753123,0.047087763710645075,0.10601544979229971,0.1275033195119518,0.03885698444209587,0.1197173697225735
75
+ rewire,42,7000,0.11769021814814488,0.12846422033457444,0.11661160174171585,0.10587341120741427,0.07465826745252561,0.023253111661056477,0.06763325171512326,0.43359791467250897,0.1085309313673308,0.18878273910784718,0.020298853683952837,0.038092810721751746,0.10648550411193741,0.2751047582071124,0.11747486222325593,0.04747856056823922,0.10587341120741427,0.148656835237589,0.042993058032643826,0.12293024757937578
76
+ rewire,42,7500,0.11324781455415638,0.1231688180627862,0.17325189264268517,0.10435263532986278,0.07574792152655453,0.0220681262203481,0.06536118238242487,0.37196582126342603,0.11229246745678682,0.14885301527841707,0.020996462408347655,0.03641334834768204,0.10450208373055489,0.2726088569530556,0.11383545089667055,0.04837219196745109,0.10435263532986278,0.13057274136760194,0.04128088565015167,0.11850379369413226
77
+ rewire,42,8000,0.11610866109281774,0.12489472561651646,0.13397585930550662,0.1083357404261493,0.07447462168085858,0.023275636260537216,0.06511400709272641,0.4135252656710172,0.1119966717037618,0.17600068081918738,0.02060571249693513,0.03485447428173822,0.10625053775887865,0.2737505624882619,0.11557263168769755,0.047540167088896856,0.1083357404261493,0.14399867626147458,0.04108137254500061,0.12171319174958013
78
+ rewire,42,8500,0.12418821812993867,0.12752883908313112,0.15973221534385698,0.10529986221370606,0.07490085154496992,0.024443683465405628,0.06645745627167637,0.4622830744813988,0.11273660538068928,0.18596069933011203,0.01877625860555775,0.0434168485670852,0.10872222327167504,0.3110076449126279,0.11812553117740307,0.046838555075263834,0.10529986221370606,0.14934865235540065,0.04477266276805573,0.1292321514170762
79
+ rewire,42,9000,0.12182497336654456,0.13477283976810148,0.13235983136767265,0.10591517580386675,0.07646531376950948,0.0235884290613947,0.07026033133275691,0.4448382930540099,0.10738434116477466,0.19295601153377168,0.024094904175663404,0.037908880112197424,0.11135532925481549,0.2885990622108413,0.12306408451145848,0.05028010897258644,0.10591517580386675,0.15017017634927315,0.04391921350211634,0.12699130355835705
80
+ rewire,42,9500,0.12262415185928742,0.13955049369725644,0.14004088525941785,0.11081013558780674,0.07905528152583136,0.02503137766985094,0.0722994432167796,0.4158526439997781,0.1086849735165769,0.19352094653171775,0.024047511056761988,0.047313038347591575,0.11528309190207964,0.277946764629598,0.12741679279966805,0.051551396291296674,0.11081013558780674,0.15110296002414733,0.0482146197447407,0.1278404448462096
81
+ rewire,42,10000,0.13030916726404293,0.14874509430718114,0.14036438947513047,0.10747394495014771,0.08199891611739997,0.027135042878077907,0.07317693802463807,0.46080809037958503,0.11604413742583163,0.20332275094186225,0.026873999480503428,0.05653854302944602,0.12122816015871139,0.3005862399273578,0.13498662723294627,0.0544364577989517,0.10747394495014771,0.15968344418384695,0.05228350797738734,0.1349083703451063
82
+ synth_query_reasoning_answer,42,500,0.035513442621675266,0.015073976357627522,0.06616865382567644,0.07943777770518975,0.015712348051208167,0.0036345522202578828,0.0074687987072234565,0.08671991428745399,0.021924742825609692,0.07642687317160561,0.002177722836887656,0.0013750996400650284,0.050040851831297924,0.07644428405656521,0.03255741409446272,0.008945035444047912,0.07943777770518975,0.04917580799860765,0.004159483522515456,0.04178663380356479
83
+ synth_query_reasoning_answer,42,1000,0.04731168262865255,0.02963602643238462,0.08401242682543265,0.08853832296243212,0.023272789397401985,0.005266764151407125,0.01399878961669274,0.1338316493197785,0.03923059801809547,0.07982344474728056,0.0031333777210326144,0.004153994109947552,0.06284200824194466,0.10892203807260559,0.046239017337164644,0.0132030835592173,0.08853832296243212,0.05952702138268802,0.007806515959349139,0.054039333212242795
84
+ synth_query_reasoning_answer,42,1500,0.05326013801281323,0.03808885682928811,0.08936979532304469,0.0766902279229714,0.027866157648731175,0.005029376758557722,0.015261072495772647,0.16544255745029704,0.03834411968107126,0.10458882587147843,0.0038884547309342033,0.004740934368511638,0.06981127707310031,0.12740617638667087,0.05395006695119421,0.01587730618983269,0.0766902279229714,0.07146647277627484,0.008343794540947337,0.058955674127981895
85
+ synth_query_reasoning_answer,42,2000,0.06166518643564844,0.04961021583096391,0.10625470387154326,0.09076826729289443,0.031745314349828725,0.008043488101508345,0.022009781204622263,0.19844060651286843,0.04270292698392914,0.1018733703911475,0.004259666708248702,0.006110232140269859,0.07816366383995649,0.15234765519220583,0.0638869398354602,0.018002490529038715,0.09076826729289443,0.07228814868753831,0.012054500482133489,0.06822466700321182
86
+ synth_query_reasoning_answer,42,2500,0.06494348077664903,0.047233030314284975,0.10876514424103224,0.09437273417223085,0.03325026359922438,0.0069129589351941845,0.021642486150815994,0.21946348712337033,0.04659959768968433,0.10841051647534317,0.006552786889617815,0.006712842299140895,0.07940592142984923,0.16411431568220128,0.0633194758720671,0.019901525244421098,0.09437273417223085,0.07750505708251375,0.011756095795050358,0.07182820064141408
87
+ synth_query_reasoning_answer,42,3000,0.07304361950166396,0.06083236119922607,0.11901715462501503,0.0999531502178266,0.035673127025091546,0.008346536225954462,0.022860459533233932,0.24497066084633537,0.05924195629702546,0.12445701236521202,0.005383094494243185,0.007701793812884167,0.08808612737791967,0.1819939077356752,0.07445924428857287,0.020528110759667366,0.0999531502178266,0.09184948433111874,0.012969596524024187,0.08029224897614749
88
+ synth_query_reasoning_answer,42,3500,0.0743534698623073,0.06825549337045748,0.10911664631483034,0.09826798710967534,0.03686338324645009,0.008439231458359274,0.025503229494916638,0.2638559962546639,0.04372016041555822,0.12995317821807187,0.006999667144742354,0.009700564658356342,0.09156610066160584,0.18648632128474713,0.07991079701603165,0.02193152519559622,0.09826798710967534,0.08683666931681505,0.014547675203877418,0.0813301625211238
89
+ synth_query_reasoning_answer,42,4000,0.07893612365206563,0.0713987127718813,0.1146332358116625,0.10222040200077114,0.03770173201386019,0.00900240926040066,0.02627325010065712,0.2678410594913625,0.06529957425472337,0.14216174075174542,0.010616490683400528,0.007885458157405677,0.09219941852691732,0.1912371476515125,0.08179906564939932,0.02415911134863036,0.10222040200077114,0.1037306575032344,0.014387039172821152,0.08625557055439481
90
+ synth_query_reasoning_answer,42,4500,0.08041983543652169,0.08267124249172617,0.12380602615403143,0.10226453271781745,0.03827475345692155,0.009061552413699396,0.028257569462315637,0.2830962540397703,0.05070825883332228,0.12940000090762332,0.009633175916570135,0.010987480935782877,0.09687717790867972,0.20345114009690085,0.08977421020020294,0.023953964686745842,0.10226453271781745,0.0900541298704728,0.01610220093726597,0.08760002975156765
91
+ synth_query_reasoning_answer,42,5000,0.08058153947617179,0.07820590852350819,0.11771589365717491,0.1072380006691629,0.04032198412470698,0.010187498004546013,0.028489492588401548,0.271192788075246,0.05957526414957225,0.13624831859836145,0.00856856858084561,0.010704492872642352,0.09853026386989325,0.19445434086621044,0.08836808619670072,0.024445276352776296,0.1072380006691629,0.09791179137396686,0.01646049448852997,0.0881463316578912
92
+ synth_query_reasoning_answer,42,5500,0.08256078527521785,0.08843155774339825,0.1238367045421236,0.10917550385546258,0.040026651966034874,0.010164217880368826,0.02846147305792971,0.2747265048098881,0.0548526352585163,0.14119036604766205,0.005647394496900201,0.011793746550222164,0.10242266709410754,0.19928160467600584,0.0954271124187529,0.022837023231467538,0.10917550385546258,0.09802150065308918,0.016806479162840237,0.09025820399960306
93
+ synth_query_reasoning_answer,42,6000,0.08430762089591148,0.07919327341870015,0.12997988477615194,0.10273758126604275,0.04105881577603814,0.009640204226333889,0.029382070642368006,0.29928957416462665,0.05664930881048516,0.14264607399419546,0.010960652360041213,0.010555598217259678,0.0995984130986949,0.2146347294703893,0.08939584325869752,0.026009734068039678,0.10273758126604275,0.09964769140234031,0.016525957695320528,0.09149192286013835
94
+ synth_query_reasoning_answer,42,6500,0.08444149557411484,0.08402537139308493,0.13369818218066098,0.10285850115936718,0.04226174091240128,0.00991331271722322,0.030764933813341114,0.2839103286624604,0.055473570771933677,0.1477476162715078,0.012452525112519533,0.009840064076537727,0.10035179981834028,0.2088042554215607,0.09218858560571261,0.027357133012460406,0.10285850115936718,0.10161059352172074,0.01683943686903402,0.0916097509316426
95
+ synth_query_reasoning_answer,42,7000,0.08733579108353406,0.09274496195438126,0.12783228645351125,0.10585483141968924,0.04140499246430086,0.00990579037971003,0.0306657403064225,0.2964769632109272,0.06872084237283703,0.15010635810946463,0.009139265585396674,0.011518769287097315,0.10365869145867089,0.21215462483221922,0.09820182670652608,0.02527212902484877,0.10585483141968924,0.10941360024115082,0.017363433324409948,0.09471007425814067
96
+ synth_query_reasoning_answer,42,7500,0.08863417347387555,0.09240356217393896,0.14881973353501576,0.10151619983486602,0.042210124440366474,0.010713110240814129,0.0306682229730257,0.30600578486416224,0.061672811030978075,0.14370012267637594,0.013431786018876084,0.010090156104852884,0.10237846779323392,0.227412759199589,0.09739101498358643,0.027820955229621278,0.10151619983486602,0.10268646685367701,0.017157163106230906,0.09566409320126179
97
+ synth_query_reasoning_answer,42,8000,0.08755442520503119,0.08860498877063167,0.14179078288669023,0.09875658289513833,0.044096292751690164,0.009698260497968582,0.03077663652283185,0.29920128931824164,0.06572481798221814,0.14042232845936162,0.020122112503608597,0.010098737020688477,0.1013602728513049,0.22049603610246593,0.09498263081096828,0.03210920262764938,0.09875658289513833,0.10307357322078989,0.016857878013829635,0.09437931727847358
98
+ synth_query_reasoning_answer,42,8500,0.08335227495246045,0.08685831332021673,0.1055250465909481,0.097455577254604,0.043092917852824174,0.010305418491369097,0.032120632212379305,0.2741034843086726,0.06988872186519213,0.1509872786423176,0.01684208503668596,0.01180287876076734,0.10124494509354863,0.18981426544981034,0.09405162920688268,0.029967501444755067,0.097455577254604,0.11043800025375486,0.018076309821505248,0.08996721390521868
99
+ synth_query_reasoning_answer,42,9000,0.08549164796513085,0.09677796067439566,0.1361574540866481,0.10210580354158325,0.04443856787313211,0.010751481772278741,0.03339322593064408,0.2810053666746844,0.04336078094212285,0.14646892336813522,0.012589819426654004,0.013875996896216993,0.10497439439507479,0.20858141038066624,0.10087617753473523,0.028514193649893056,0.10210580354158325,0.09491485215512903,0.019340234866379938,0.09238877868806446
100
+ synth_query_reasoning_answer,42,9500,0.08675381637504477,0.09810283018124974,0.13254892176210545,0.09893137136118485,0.04511541180369288,0.011030940187333069,0.033065885461781463,0.31406529548089707,0.030923310141800064,0.14217114585352564,0.01581980415749073,0.012219747303091026,0.10705113280638545,0.22330710862150127,0.1025769814938176,0.030467607980591803,0.09893137136118485,0.08654722799766285,0.01877219098406852,0.09343374807313783
101
+ synth_query_reasoning_answer,42,10000,0.09358976838409616,0.10867687436008114,0.1268415551737933,0.10053703091204497,0.04709002622209718,0.011280139669537836,0.035570948809070906,0.3227502712932084,0.06067078915360809,0.16932094307810486,0.01544328007619672,0.012554458280223642,0.11234090358118656,0.22479591323350084,0.11050888897063385,0.03126665314914695,0.10053703091204497,0.11499586611585647,0.019801848919610794,0.10031770021679898
app/src/content/assets/image/newplot_2f81384e-bcac-804d-b760-e8611cc0302b.png DELETED

Git LFS Details

  • SHA256: bdbae1a37fba92bb8d7bc1829c4a2fc72d4cba266f3c4e4c778eaaa62fdc5152
  • Pointer size: 131 Bytes
  • Size of remote file: 101 kB
app/src/content/chapters/experiments.mdx CHANGED
@@ -1,6 +1,5 @@
1
  import Image from "../../components/Image.astro";
2
  import HtmlEmbed from "../../components/HtmlEmbed.astro";
3
- import newplot_2f81384e_bcac_804d_b760_e8611cc0302b from "../assets/image/newplot_2f81384e-bcac-804d-b760-e8611cc0302b.png";
4
  import newplot_2c41384e_bcac_8073_9395_cf2d0e901187 from "../assets/image/newplot_2c41384e-bcac-8073-9395-cf2d0e901187.png";
5
  import newplot_2c31384e_bcac_800b_82e8_ff44228f7720 from "../assets/image/newplot_2c31384e-bcac-800b-82e8-ff44228f7720.png";
6
  import newplot_2e11384e_bcac_800a_abc6_d0690da3f955 from "../assets/image/newplot_2e11384e-bcac-800a-abc6-d0690da3f955.png";
@@ -56,9 +55,23 @@ TODO: Add appendix section of weird unexplainable results?
56
 
57
  We see that FinePhrase clearly outperforms the synthetic baselines.
58
 
59
- TODO: call the best one FinePhrase in the blog post and just show that one
60
-
61
- <Image src={newplot_2f81384e_bcac_804d_b760_e8611cc0302b} alt="Image" />
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
  ### Baselines
64
 
@@ -66,10 +79,22 @@ DCLM, REWIRE and Nemotron-HQ-Synth are the strongest baselines in our setup by a
66
 
67
  <HtmlEmbed
68
  id="baselines-comparison"
69
- src="d3-baselines.html"
70
  data="baselines.csv"
71
  title="Baseline Comparison"
72
  desc="Figure: Comparison of baseline datasets across different evaluation metrics. Use the dropdown to switch metrics."
 
 
 
 
 
 
 
 
 
 
 
 
73
  />
74
 
75
  #### Dissecting the synthetic baselines
 
1
  import Image from "../../components/Image.astro";
2
  import HtmlEmbed from "../../components/HtmlEmbed.astro";
 
3
  import newplot_2c41384e_bcac_8073_9395_cf2d0e901187 from "../assets/image/newplot_2c41384e-bcac-8073-9395-cf2d0e901187.png";
4
  import newplot_2c31384e_bcac_800b_82e8_ff44228f7720 from "../assets/image/newplot_2c31384e-bcac-800b-82e8-ff44228f7720.png";
5
  import newplot_2e11384e_bcac_800a_abc6_d0690da3f955 from "../assets/image/newplot_2e11384e-bcac-800a-abc6-d0690da3f955.png";
 
55
 
56
  We see that FinePhrase clearly outperforms the synthetic baselines.
57
 
58
+ <HtmlEmbed
59
+ id="finephrase-vs-baselines"
60
+ src="d3-benchmark-comparison.html"
61
+ data="finephrase_vs_baselines.csv"
62
+ title="FinePhrase vs Synthetic Baselines"
63
+ desc="Figure: FinePhrase compared against synthetic data baselines across evaluation metrics."
64
+ config={{
65
+ defaultView: "line",
66
+ datasetNames: {
67
+ cosmopedia: "Cosmopedia",
68
+ "mix-fw_edu_hq-table_smollm2_1.7b_hq": "FinePhrase",
69
+ nemotron_hq_synth: "Nemotron-HQ-Synth",
70
+ rewire: "REWIRE",
71
+ synth_query_reasoning_answer: "SYNTH"
72
+ }
73
+ }}
74
+ />
75
 
76
  ### Baselines
77
 
 
79
 
80
  <HtmlEmbed
81
  id="baselines-comparison"
82
+ src="d3-benchmark-comparison.html"
83
  data="baselines.csv"
84
  title="Baseline Comparison"
85
  desc="Figure: Comparison of baseline datasets across different evaluation metrics. Use the dropdown to switch metrics."
86
+ config={{
87
+ datasetNames: {
88
+ cosmopedia: "Cosmopedia",
89
+ dclm: "DCLM",
90
+ fw_edu_hq: "FineWeb-Edu (HQ)",
91
+ fw_edu_lq: "FineWeb-Edu (LQ)",
92
+ nemotron_hq_synth: "Nemotron-HQ-Synth",
93
+ rewire: "REWIRE",
94
+ synth_query_reasoning_answer: "SYNTH",
95
+ "ultra-fineweb": "Ultra-FineWeb"
96
+ }
97
+ }}
98
  />
99
 
100
  #### Dissecting the synthetic baselines
app/src/content/embeds/{d3-baselines.html → d3-benchmark-comparison.html} RENAMED
@@ -1,7 +1,39 @@
1
- <div class="d3-baselines"></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  <style>
3
- .d3-baselines { position: relative; }
4
- .d3-baselines .controls {
5
  display: flex;
6
  gap: 16px;
7
  align-items: center;
@@ -9,18 +41,18 @@
9
  flex-wrap: wrap;
10
  margin: 10px 0 0 0;
11
  }
12
- .d3-baselines .controls .control-group {
13
  display: flex;
14
  flex-direction: column;
15
  align-items: flex-start;
16
  gap: 6px;
17
  }
18
- .d3-baselines .controls label {
19
  font-size: 12px;
20
  font-weight: 700;
21
  color: var(--text-color);
22
  }
23
- .d3-baselines .controls select {
24
  appearance: none;
25
  -webkit-appearance: none;
26
  -moz-appearance: none;
@@ -35,11 +67,11 @@
35
  background-repeat: no-repeat;
36
  background-position: right 8px center;
37
  }
38
- .d3-baselines .controls select:focus-visible {
39
  outline: 2px solid var(--primary-color);
40
  outline-offset: 2px;
41
  }
42
- .d3-baselines .legend {
43
  display: flex;
44
  flex-direction: column;
45
  align-items: flex-start;
@@ -47,17 +79,17 @@
47
  margin: 8px 0 0 0;
48
  padding-bottom: 4px;
49
  }
50
- .d3-baselines .legend .legend-title {
51
  font-size: 12px;
52
  font-weight: 700;
53
  color: var(--text-color);
54
  }
55
- .d3-baselines .legend .items {
56
  display: flex;
57
  flex-wrap: wrap;
58
  gap: 8px 14px;
59
  }
60
- .d3-baselines .legend .item {
61
  display: inline-flex;
62
  align-items: center;
63
  gap: 6px;
@@ -66,29 +98,29 @@
66
  color: var(--text-color);
67
  cursor: pointer;
68
  }
69
- .d3-baselines .legend .item.ghost { opacity: .25; }
70
- .d3-baselines .legend .swatch {
71
  width: 14px;
72
  height: 14px;
73
  border-radius: 3px;
74
  border: 1px solid var(--border-color);
75
  }
76
- .d3-baselines .bar.ghost { opacity: .25; }
77
- .d3-baselines .value-label.ghost { opacity: .25; }
78
- .d3-baselines .line-path { fill: none; stroke-width: 2; opacity: 0.85; }
79
- .d3-baselines .line-path.ghost { opacity: .15; }
80
- .d3-baselines .line-dot.ghost { opacity: .15; }
81
- .d3-baselines .axes path { display: none; }
82
- .d3-baselines .axes line { stroke: var(--axis-color); }
83
- .d3-baselines .axes text { fill: var(--tick-color); }
84
- .d3-baselines .grid line { stroke: var(--grid-color); }
85
- .d3-baselines .hover-line {
86
  stroke: var(--text-color);
87
  stroke-opacity: 0.25;
88
  stroke-width: 1;
89
  pointer-events: none;
90
  }
91
- .d3-baselines .d3-tooltip {
92
  position: absolute;
93
  top: 0px;
94
  left: 0px;
@@ -107,7 +139,7 @@
107
  text-align: left;
108
  z-index: 10;
109
  }
110
- .d3-baselines .d3-tooltip .tip-dot {
111
  display: inline-block;
112
  width: 10px;
113
  height: 10px;
@@ -130,8 +162,8 @@
130
  const bootstrap = () => {
131
  const scriptEl = document.currentScript;
132
  let container = scriptEl ? scriptEl.previousElementSibling : null;
133
- if (!(container && container.classList && container.classList.contains('d3-baselines'))) {
134
- const cs = Array.from(document.querySelectorAll('.d3-baselines')).filter(el => !(el.dataset && el.dataset.mounted === 'true'));
135
  container = cs[cs.length - 1] || null;
136
  }
137
  if (!container) return;
@@ -139,26 +171,27 @@
139
 
140
  container.style.position = container.style.position || 'relative';
141
 
142
- // Tooltip
143
- let tip = container.querySelector('.d3-tooltip'), tipInner;
144
- if (!tip) {
145
- tip = document.createElement('div'); tip.className = 'd3-tooltip';
146
- tipInner = document.createElement('div'); tipInner.className = 'd3-tooltip__inner'; tip.appendChild(tipInner);
147
- container.appendChild(tip);
148
- } else { tipInner = tip.querySelector('.d3-tooltip__inner') || tip; }
 
149
 
150
- // Display name maps
151
- const DATASET_NAMES = {
152
- 'cosmopedia': 'Cosmopedia',
153
- 'dclm': 'DCLM',
154
- 'fw_edu_hq': 'FineWeb-Edu (HQ)',
155
- 'fw_edu_lq': 'FineWeb-Edu (LQ)',
156
- 'nemotron_hq_synth': 'Nemotron-HQ-Synth',
157
- 'rewire': 'REWIRE',
158
- 'synth_query_reasoning_answer': 'SYNTH',
159
- 'ultra-fineweb': 'Ultra-FineWeb'
160
- };
161
 
 
 
 
 
162
  const METRIC_NAMES = {
163
  'agg_score_macro': 'Aggregate Score (Macro)',
164
  'agg_score_micro': 'Aggregate Score (Micro)',
@@ -182,34 +215,13 @@
182
  'lighteval|mmlu_redux_cf:_average|3/prob_norm_token': 'MMLU Redux'
183
  };
184
 
185
- const METRIC_ORDER = [
186
- 'agg_score_macro', 'agg_score_micro',
187
- 'agg_score_RC', 'agg_score_GK', 'agg_score_NLU',
188
- 'agg_score_MATH', 'agg_score_TABLE', 'agg_score_RES',
189
- 'lighteval|arc_cf:easy|3/prob_norm_token',
190
- 'lighteval|drop|3/prob_norm_token',
191
- 'lighteval|gsm8k|3/prob_norm_token',
192
- 'lighteval|hellaswag_cf|3/prob_norm_token',
193
- 'lighteval|openbookqa_cf|3/prob_norm_token',
194
- 'lighteval|piqa_cf|3/prob_norm_token',
195
- 'lighteval|squad_v2|3/prob_norm_token',
196
- 'lighteval|treb_qa|3/prob_norm_token',
197
- 'lighteval|wikitablequestions|3/prob_norm_token',
198
- 'lighteval|winogrande_cf|3/prob_norm_token',
199
- 'lighteval|xcsqa_cf|3/prob_norm_token',
200
- 'lighteval|mmlu_redux_cf:_average|3/prob_norm_token'
201
- ];
202
-
203
- // Read optional config
204
- let mountEl = container;
205
- while (mountEl && !mountEl.getAttribute?.('data-config')) { mountEl = mountEl.parentElement; }
206
- let providedConfig = null;
207
- try {
208
- const cfg = mountEl && mountEl.getAttribute ? mountEl.getAttribute('data-config') : null;
209
- if (cfg && cfg.trim()) providedConfig = cfg.trim().startsWith('{') ? JSON.parse(cfg) : null;
210
- } catch (_) {}
211
-
212
- const defaultMetric = (providedConfig && providedConfig.defaultMetric) || 'agg_score_macro';
213
 
214
  // SVG
215
  const svg = d3.select(container).append('svg').attr('width', '100%').style('display', 'block');
@@ -217,13 +229,16 @@
217
 
218
  // State
219
  let allData = [];
 
220
  let currentMetric = defaultMetric;
221
- let currentView = 'bar'; // 'bar' or 'line'
222
  let colorMap = {};
223
  let highlight = null;
224
 
225
- // 1.05B tokens per 500 steps → 2.1M tokens per step
226
- const TOKENS_PER_STEP = 2.1e6;
 
 
227
  function stepsToTokens(step) { return step * TOKENS_PER_STEP; }
228
  function formatTokens(tokens) {
229
  if (tokens >= 1e9) return d3.format('.2f')(tokens / 1e9) + 'B';
@@ -234,19 +249,16 @@
234
  if (step >= 1000) return d3.format('.0f')(step / 1000) + 'K';
235
  return String(step);
236
  }
237
- // Compact: "12.6B (6K)" for axis ticks
238
  function stepLabelShort(step) { return `${formatTokens(stepsToTokens(step))} (${formatStep(step)})`; }
239
- // Verbose: "12.6B Tokens (6K Steps)" for tooltip
240
  function stepLabelLong(step) { return `${formatTokens(stepsToTokens(step))} Tokens (${formatStep(step)} Steps)`; }
241
 
242
- // Color helpers
243
  function getCategoricalColors(n) {
244
  try { if (window.ColorPalettes && typeof window.ColorPalettes.getColors === 'function') return window.ColorPalettes.getColors('categorical', n); } catch (_) {}
245
  return (d3.schemeTableau10 || ['#4e79a7','#f28e2b','#e15759','#76b7b2','#59a14f','#edc948','#b07aa1','#ff9da7','#9c755f','#bab0ac']).slice(0, n);
246
  }
247
 
248
  function initColors() {
249
- const allNames = Array.from(d3.group(allData, d => d.runname).keys()).sort();
250
  if (!Object.keys(colorMap).length) {
251
  const palette = getCategoricalColors(allNames.length);
252
  allNames.forEach((name, i) => { colorMap[name] = palette[i % palette.length]; });
@@ -267,29 +279,36 @@
267
  }
268
 
269
  function updateHighlight() {
270
- // Bar view
271
  gRoot.selectAll('rect.bar').classed('ghost', d => highlight && d.name !== highlight);
272
  gRoot.selectAll('text.value-label').classed('ghost', d => highlight && d.name !== highlight);
273
- // Line view
274
  gRoot.selectAll('.line-path').classed('ghost', d => highlight && d.name !== highlight);
275
  gRoot.selectAll('.line-dot').classed('ghost', d => highlight && d.name !== highlight);
276
- // Legend
277
  container.querySelectorAll('.legend .item').forEach(el => {
278
  el.classList.toggle('ghost', highlight && el.getAttribute('data-name') !== highlight);
279
  });
280
  }
281
 
 
 
 
 
 
 
 
 
 
 
282
  // ─── BAR CHART ───
283
  function renderBar() {
284
  const width = container.clientWidth || 800;
285
  const margin = { top: 12, right: 56, bottom: 32, left: 140 };
286
 
287
- const grouped = d3.group(allData, d => d.runname);
288
  const finalData = [];
289
- for (const [runname, rows] of grouped) {
290
- const maxStep = d3.max(rows, r => +r.steps);
291
- const row = rows.find(r => +r.steps === maxStep);
292
- if (row) finalData.push({ name: DATASET_NAMES[runname] || runname, rawName: runname, value: +row[currentMetric] });
293
  }
294
  finalData.sort((a, b) => b.value - a.value);
295
 
@@ -327,28 +346,23 @@
327
  g.selectAll('path, line').attr('stroke', 'var(--axis-color)');
328
  });
329
 
330
- // Remove line-specific elements
331
- gRoot.selectAll('.line-path, .line-dot, .hover-line, .hover-overlay, .x-label, .y-label').remove();
332
-
333
  // Bars
 
 
 
 
334
  gRoot.selectAll('rect.bar').data(finalData, d => d.name).join(
335
  enter => enter.append('rect').attr('class', 'bar')
336
  .attr('x', 0).attr('y', d => y(d.name)).attr('height', y.bandwidth()).attr('rx', 3)
337
  .attr('fill', d => colorMap[d.rawName] || 'var(--primary-color)')
338
  .attr('width', 0)
339
  .on('mouseenter', (ev, d) => { highlight = d.name; updateHighlight(); })
340
- .on('mousemove', (ev, d) => {
341
- const [mx, my] = d3.pointer(ev, container);
342
- showTip(`<strong>${d.name}</strong><br/>${METRIC_NAMES[currentMetric] || currentMetric}: <strong>${d.value.toFixed(4)}</strong>`, mx, my);
343
- })
344
  .on('mouseleave', () => { hideTip(); highlight = null; updateHighlight(); })
345
  .transition().duration(300).attr('width', d => Math.max(0, x(d.value))),
346
  update => update
347
  .on('mouseenter', (ev, d) => { highlight = d.name; updateHighlight(); })
348
- .on('mousemove', (ev, d) => {
349
- const [mx, my] = d3.pointer(ev, container);
350
- showTip(`<strong>${d.name}</strong><br/>${METRIC_NAMES[currentMetric] || currentMetric}: <strong>${d.value.toFixed(4)}</strong>`, mx, my);
351
- })
352
  .on('mouseleave', () => { hideTip(); highlight = null; updateHighlight(); })
353
  .transition().duration(300)
354
  .attr('y', d => y(d.name)).attr('height', y.bandwidth())
@@ -381,34 +395,28 @@
381
  const innerWidth = width - margin.left - margin.right;
382
  const innerHeight = height - margin.top - margin.bottom;
383
 
384
- // Remove bar-specific elements
385
- gRoot.selectAll('rect.bar, text.value-label').remove();
386
-
387
- // Build series: one line per dataset
388
- const grouped = d3.group(allData, d => d.runname);
389
  const series = [];
390
- for (const [runname, rows] of grouped) {
391
- const pts = rows.map(r => ({ step: +r.steps, value: +r[currentMetric] })).sort((a, b) => a.step - b.step);
392
- series.push({ name: DATASET_NAMES[runname] || runname, rawName: runname, values: pts });
393
  }
394
 
395
- const allSteps = Array.from(new Set(allData.map(r => +r.steps))).sort((a, b) => a - b);
396
  const allValues = series.flatMap(s => s.values.map(v => v.value));
397
 
398
  const x = d3.scaleLinear().domain(d3.extent(allSteps)).range([0, innerWidth]);
399
- const yMin = d3.min(allValues);
400
- const yMax = d3.max(allValues);
401
- const yPad = (yMax - yMin) * 0.08;
402
  const y = d3.scaleLinear().domain([yMin - yPad, yMax + yPad]).range([innerHeight, 0]).nice();
403
 
404
  // Grid
405
  gRoot.selectAll('.grid').data([0]).join('g').attr('class', 'grid').call(g => {
406
  g.selectAll('line').data(y.ticks(6)).join('line')
407
- .attr('x1', 0).attr('x2', innerWidth)
408
- .attr('y1', d => y(d)).attr('y2', d => y(d));
409
  });
410
 
411
- // X axis: tokens with steps in brackets
412
  gRoot.selectAll('.axis-x').data([0]).join('g').attr('class', 'axes axis-x')
413
  .attr('transform', `translate(0,${innerHeight})`)
414
  .call(d3.axisBottom(x).ticks(6).tickFormat(d => stepLabelShort(d)).tickSizeOuter(0))
@@ -434,7 +442,7 @@
434
  gRoot.selectAll('.y-label').data([0]).join('text').attr('class', 'y-label')
435
  .attr('transform', 'rotate(-90)').attr('x', -innerHeight / 2).attr('y', -44)
436
  .attr('text-anchor', 'middle').attr('fill', 'var(--text-color)').attr('font-size', 12)
437
- .text(METRIC_NAMES[currentMetric] || currentMetric);
438
 
439
  // Lines
440
  const line = d3.line().x(d => x(d.step)).y(d => y(d.value)).curve(d3.curveMonotoneX);
@@ -448,7 +456,7 @@
448
  exit => exit.remove()
449
  );
450
 
451
- // Dots at each data point
452
  const dotData = series.flatMap(s => s.values.map(v => ({ name: s.name, rawName: s.rawName, step: v.step, value: v.value })));
453
  gRoot.selectAll('.line-dot').data(dotData, d => d.name + '-' + d.step).join(
454
  enter => enter.append('circle').attr('class', 'line-dot')
@@ -461,7 +469,7 @@
461
  exit => exit.remove()
462
  );
463
 
464
- // Hover overlay for vertical line + tooltip
465
  gRoot.selectAll('.hover-line').data([0]).join('line').attr('class', 'hover-line')
466
  .attr('y1', 0).attr('y2', innerHeight).style('display', 'none');
467
 
@@ -470,13 +478,9 @@
470
  .attr('fill', 'none').attr('pointer-events', 'all')
471
  .on('mousemove', (ev) => {
472
  const [mx] = d3.pointer(ev, gRoot.node());
473
- const stepTarget = x.invert(mx);
474
- const nearest = allSteps.reduce((best, s) => Math.abs(s - stepTarget) < Math.abs(best - stepTarget) ? s : best, allSteps[0]);
475
- const px = x(nearest);
476
 
477
- gRoot.select('.hover-line').attr('x1', px).attr('x2', px).style('display', null);
478
-
479
- // Build tooltip sorted by value at this step
480
  const entries = series.map(s => {
481
  const pt = s.values.find(v => v.step === nearest);
482
  return pt ? { name: s.name, rawName: s.rawName, value: pt.value } : null;
@@ -486,7 +490,6 @@
486
  entries.forEach(e => {
487
  html += `<div><span class="tip-dot" style="background:${colorMap[e.rawName]}"></span>${e.name}: <strong>${e.value.toFixed(4)}</strong></div>`;
488
  });
489
-
490
  const [cx, cy] = d3.pointer(ev, container);
491
  showTip(html, cx, cy);
492
  })
@@ -496,23 +499,22 @@
496
  });
497
  }
498
 
499
- // ─── MAIN RENDER DISPATCHER ───
500
  function render() {
501
  if (!allData.length) return;
502
  initColors();
503
  gRoot.selectAll('*').remove();
504
- if (currentView === 'bar') renderBar();
505
- else renderLine();
506
  }
507
 
508
- // ─── BUILD UI ───
509
  function buildUI() {
510
  const controls = document.createElement('div'); controls.className = 'controls';
511
 
512
  // View toggle
513
  const viewGroup = document.createElement('div'); viewGroup.className = 'control-group';
514
- const viewLabel = document.createElement('label'); viewLabel.setAttribute('for', 'view-select-baselines'); viewLabel.textContent = 'View';
515
- const viewSelect = document.createElement('select'); viewSelect.id = 'view-select-baselines';
516
  [['bar', 'Final Score (Bar)'], ['line', 'Training Progression (Line)']].forEach(([val, text]) => {
517
  const opt = document.createElement('option'); opt.value = val; opt.textContent = text;
518
  if (val === currentView) opt.selected = true;
@@ -522,19 +524,10 @@
522
  viewGroup.appendChild(viewLabel); viewGroup.appendChild(viewSelect);
523
  controls.appendChild(viewGroup);
524
 
525
- // Metric select
526
  const metricGroup = document.createElement('div'); metricGroup.className = 'control-group';
527
- const metricLabel = document.createElement('label'); metricLabel.setAttribute('for', 'metric-select-baselines'); metricLabel.textContent = 'Metric';
528
- const metricSelect = document.createElement('select'); metricSelect.id = 'metric-select-baselines';
529
- const aggGroup = document.createElement('optgroup'); aggGroup.label = 'Aggregate Scores';
530
- const indGroup = document.createElement('optgroup'); indGroup.label = 'Individual Benchmarks';
531
- METRIC_ORDER.forEach(key => {
532
- const opt = document.createElement('option'); opt.value = key; opt.textContent = METRIC_NAMES[key] || key;
533
- if (key === defaultMetric) opt.selected = true;
534
- if (key.startsWith('lighteval|')) indGroup.appendChild(opt); else aggGroup.appendChild(opt);
535
- });
536
- metricSelect.appendChild(aggGroup); metricSelect.appendChild(indGroup);
537
- metricSelect.addEventListener('change', () => { currentMetric = metricSelect.value; render(); });
538
  metricGroup.appendChild(metricLabel); metricGroup.appendChild(metricSelect);
539
  controls.appendChild(metricGroup);
540
 
@@ -546,18 +539,43 @@
546
  container.appendChild(legend);
547
  }
548
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
549
  function buildLegend() {
550
  const items = container.querySelector('.legend .items');
551
  if (!items) return;
552
  items.innerHTML = '';
553
- const allNames = Array.from(d3.group(allData, d => d.runname).keys()).sort();
554
- allNames.forEach(rawName => {
555
- const displayName = DATASET_NAMES[rawName] || rawName;
556
- const el = document.createElement('span'); el.className = 'item'; el.setAttribute('data-name', displayName);
557
- const sw = document.createElement('span'); sw.className = 'swatch'; sw.style.background = colorMap[rawName] || '#999';
558
- const txt = document.createElement('span'); txt.textContent = displayName;
 
 
 
 
 
 
 
 
 
559
  el.appendChild(sw); el.appendChild(txt); items.appendChild(el);
560
- el.addEventListener('mouseenter', () => { highlight = displayName; updateHighlight(); });
561
  el.addEventListener('mouseleave', () => { highlight = null; updateHighlight(); });
562
  });
563
  }
@@ -589,6 +607,10 @@
589
  try {
590
  const text = await fetchFirstAvailable(csvPaths);
591
  allData = d3.csvParse(text);
 
 
 
 
592
  render();
593
  buildLegend();
594
  if (window.ResizeObserver) { new ResizeObserver(() => render()).observe(container); }
 
1
+ <!--
2
+ Reusable bar/line chart for benchmark comparisons.
3
+
4
+ Configuration via data-config attribute:
5
+ {
6
+ "datasetNames": { "raw_name": "Display Name", ... }, // required
7
+ "defaultMetric": "agg_score_macro", // optional, default: "agg_score_macro"
8
+ "defaultView": "bar", // optional, "bar" | "line", default: "bar"
9
+ "tokensPerStep": 2100000, // optional, default: 2.1e6
10
+ "runColumn": "runname", // optional, CSV column for series, default: "runname"
11
+ "stepColumn": "steps" // optional, CSV column for x-axis, default: "steps"
12
+ }
13
+
14
+ Example usage in MDX:
15
+ <HtmlEmbed
16
+ src="d3-benchmark-comparison.html"
17
+ data="baselines.csv"
18
+ title="Baseline Comparison"
19
+ config={{
20
+ datasetNames: {
21
+ cosmopedia: "Cosmopedia",
22
+ dclm: "DCLM",
23
+ fw_edu_hq: "FineWeb-Edu (HQ)",
24
+ fw_edu_lq: "FineWeb-Edu (LQ)",
25
+ nemotron_hq_synth: "Nemotron-HQ-Synth",
26
+ rewire: "REWIRE",
27
+ synth_query_reasoning_answer: "SYNTH",
28
+ "ultra-fineweb": "Ultra-FineWeb"
29
+ }
30
+ }}
31
+ />
32
+ -->
33
+ <div class="d3-benchmark-comparison"></div>
34
  <style>
35
+ .d3-benchmark-comparison { position: relative; }
36
+ .d3-benchmark-comparison .controls {
37
  display: flex;
38
  gap: 16px;
39
  align-items: center;
 
41
  flex-wrap: wrap;
42
  margin: 10px 0 0 0;
43
  }
44
+ .d3-benchmark-comparison .controls .control-group {
45
  display: flex;
46
  flex-direction: column;
47
  align-items: flex-start;
48
  gap: 6px;
49
  }
50
+ .d3-benchmark-comparison .controls label {
51
  font-size: 12px;
52
  font-weight: 700;
53
  color: var(--text-color);
54
  }
55
+ .d3-benchmark-comparison .controls select {
56
  appearance: none;
57
  -webkit-appearance: none;
58
  -moz-appearance: none;
 
67
  background-repeat: no-repeat;
68
  background-position: right 8px center;
69
  }
70
+ .d3-benchmark-comparison .controls select:focus-visible {
71
  outline: 2px solid var(--primary-color);
72
  outline-offset: 2px;
73
  }
74
+ .d3-benchmark-comparison .legend {
75
  display: flex;
76
  flex-direction: column;
77
  align-items: flex-start;
 
79
  margin: 8px 0 0 0;
80
  padding-bottom: 4px;
81
  }
82
+ .d3-benchmark-comparison .legend .legend-title {
83
  font-size: 12px;
84
  font-weight: 700;
85
  color: var(--text-color);
86
  }
87
+ .d3-benchmark-comparison .legend .items {
88
  display: flex;
89
  flex-wrap: wrap;
90
  gap: 8px 14px;
91
  }
92
+ .d3-benchmark-comparison .legend .item {
93
  display: inline-flex;
94
  align-items: center;
95
  gap: 6px;
 
98
  color: var(--text-color);
99
  cursor: pointer;
100
  }
101
+ .d3-benchmark-comparison .legend .item.ghost { opacity: .25; }
102
+ .d3-benchmark-comparison .legend .swatch {
103
  width: 14px;
104
  height: 14px;
105
  border-radius: 3px;
106
  border: 1px solid var(--border-color);
107
  }
108
+ .d3-benchmark-comparison .bar.ghost { opacity: .25; }
109
+ .d3-benchmark-comparison .value-label.ghost { opacity: .25; }
110
+ .d3-benchmark-comparison .line-path { fill: none; stroke-width: 2; opacity: 0.85; }
111
+ .d3-benchmark-comparison .line-path.ghost { opacity: .15; }
112
+ .d3-benchmark-comparison .line-dot.ghost { opacity: .15; }
113
+ .d3-benchmark-comparison .axes path { display: none; }
114
+ .d3-benchmark-comparison .axes line { stroke: var(--axis-color); }
115
+ .d3-benchmark-comparison .axes text { fill: var(--tick-color); }
116
+ .d3-benchmark-comparison .grid line { stroke: var(--grid-color); }
117
+ .d3-benchmark-comparison .hover-line {
118
  stroke: var(--text-color);
119
  stroke-opacity: 0.25;
120
  stroke-width: 1;
121
  pointer-events: none;
122
  }
123
+ .d3-benchmark-comparison .d3-tooltip {
124
  position: absolute;
125
  top: 0px;
126
  left: 0px;
 
139
  text-align: left;
140
  z-index: 10;
141
  }
142
+ .d3-benchmark-comparison .d3-tooltip .tip-dot {
143
  display: inline-block;
144
  width: 10px;
145
  height: 10px;
 
162
  const bootstrap = () => {
163
  const scriptEl = document.currentScript;
164
  let container = scriptEl ? scriptEl.previousElementSibling : null;
165
+ if (!(container && container.classList && container.classList.contains('d3-benchmark-comparison'))) {
166
+ const cs = Array.from(document.querySelectorAll('.d3-benchmark-comparison')).filter(el => !(el.dataset && el.dataset.mounted === 'true'));
167
  container = cs[cs.length - 1] || null;
168
  }
169
  if (!container) return;
 
171
 
172
  container.style.position = container.style.position || 'relative';
173
 
174
+ // ─── READ CONFIG ───
175
+ let mountEl = container;
176
+ while (mountEl && !mountEl.getAttribute?.('data-config')) { mountEl = mountEl.parentElement; }
177
+ let cfg = {};
178
+ try {
179
+ const raw = mountEl && mountEl.getAttribute ? mountEl.getAttribute('data-config') : null;
180
+ if (raw && raw.trim()) cfg = raw.trim().startsWith('{') ? JSON.parse(raw) : {};
181
+ } catch (_) {}
182
 
183
+ // Configurable settings with defaults
184
+ const DATASET_NAMES = cfg.datasetNames || {};
185
+ const RUN_COL = cfg.runColumn || 'runname';
186
+ const STEP_COL = cfg.stepColumn || 'steps';
187
+ const TOKENS_PER_STEP = cfg.tokensPerStep || 2.1e6;
188
+ const defaultMetric = cfg.defaultMetric || 'agg_score_macro';
189
+ const defaultView = cfg.defaultView || 'bar';
 
 
 
 
190
 
191
+ // Unique ID suffix for multiple instances on same page
192
+ const uid = Math.random().toString(36).slice(2, 8);
193
+
194
+ // Standard metric display names (shared across all CSVs from this benchmark suite)
195
  const METRIC_NAMES = {
196
  'agg_score_macro': 'Aggregate Score (Macro)',
197
  'agg_score_micro': 'Aggregate Score (Micro)',
 
215
  'lighteval|mmlu_redux_cf:_average|3/prob_norm_token': 'MMLU Redux'
216
  };
217
 
218
+ // Tooltip
219
+ let tip = container.querySelector('.d3-tooltip'), tipInner;
220
+ if (!tip) {
221
+ tip = document.createElement('div'); tip.className = 'd3-tooltip';
222
+ tipInner = document.createElement('div'); tipInner.className = 'd3-tooltip__inner'; tip.appendChild(tipInner);
223
+ container.appendChild(tip);
224
+ } else { tipInner = tip.querySelector('.d3-tooltip__inner') || tip; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
 
226
  // SVG
227
  const svg = d3.select(container).append('svg').attr('width', '100%').style('display', 'block');
 
229
 
230
  // State
231
  let allData = [];
232
+ let metricKeys = []; // auto-detected from CSV columns
233
  let currentMetric = defaultMetric;
234
+ let currentView = defaultView;
235
  let colorMap = {};
236
  let highlight = null;
237
 
238
+ // ─── HELPERS ───
239
+ function displayName(raw) { return DATASET_NAMES[raw] || raw; }
240
+ function metricName(key) { return METRIC_NAMES[key] || key; }
241
+
242
  function stepsToTokens(step) { return step * TOKENS_PER_STEP; }
243
  function formatTokens(tokens) {
244
  if (tokens >= 1e9) return d3.format('.2f')(tokens / 1e9) + 'B';
 
249
  if (step >= 1000) return d3.format('.0f')(step / 1000) + 'K';
250
  return String(step);
251
  }
 
252
  function stepLabelShort(step) { return `${formatTokens(stepsToTokens(step))} (${formatStep(step)})`; }
 
253
  function stepLabelLong(step) { return `${formatTokens(stepsToTokens(step))} Tokens (${formatStep(step)} Steps)`; }
254
 
 
255
  function getCategoricalColors(n) {
256
  try { if (window.ColorPalettes && typeof window.ColorPalettes.getColors === 'function') return window.ColorPalettes.getColors('categorical', n); } catch (_) {}
257
  return (d3.schemeTableau10 || ['#4e79a7','#f28e2b','#e15759','#76b7b2','#59a14f','#edc948','#b07aa1','#ff9da7','#9c755f','#bab0ac']).slice(0, n);
258
  }
259
 
260
  function initColors() {
261
+ const allNames = Array.from(d3.group(allData, d => d[RUN_COL]).keys()).sort();
262
  if (!Object.keys(colorMap).length) {
263
  const palette = getCategoricalColors(allNames.length);
264
  allNames.forEach((name, i) => { colorMap[name] = palette[i % palette.length]; });
 
279
  }
280
 
281
  function updateHighlight() {
 
282
  gRoot.selectAll('rect.bar').classed('ghost', d => highlight && d.name !== highlight);
283
  gRoot.selectAll('text.value-label').classed('ghost', d => highlight && d.name !== highlight);
 
284
  gRoot.selectAll('.line-path').classed('ghost', d => highlight && d.name !== highlight);
285
  gRoot.selectAll('.line-dot').classed('ghost', d => highlight && d.name !== highlight);
 
286
  container.querySelectorAll('.legend .item').forEach(el => {
287
  el.classList.toggle('ghost', highlight && el.getAttribute('data-name') !== highlight);
288
  });
289
  }
290
 
291
+ // ─── AUTO-DETECT METRICS from CSV columns ───
292
/**
 * Work out which CSV columns can be offered in the metric selector.
 *
 * Known aggregate columns come first, in a fixed display order, followed by
 * every other column that holds numeric data. The run column, the step
 * column and `seed` are never treated as metrics.
 *
 * A column counts as numeric when at least one row has a non-blank cell that
 * parses to a finite number. Checking only the first row was fragile: a
 * header-only CSV has no first row at all, and a blank cell coerces to 0, so
 * empty or free-text columns could slip into the selector.
 *
 * @param {string[]} columns - column names in CSV order (e.g. `allData.columns`)
 * @returns {string[]} metric keys: aggregates first, then individual benchmarks
 */
function detectMetrics(columns) {
  const skip = new Set([RUN_COL, STEP_COL, 'seed']);
  // Ordered: aggregate first, then individual
  const aggOrder = ['agg_score_macro', 'agg_score_micro', 'agg_score_RC', 'agg_score_GK', 'agg_score_NLU', 'agg_score_MATH', 'agg_score_TABLE', 'agg_score_RES'];
  const agg = aggOrder.filter(k => columns.includes(k));
  const aggSet = new Set(agg);

  // Blank cells are "missing", not zero, so they must not qualify a column.
  const isNumericCell = v => v != null && String(v).trim() !== '' && Number.isFinite(Number(v));
  const ind = columns.filter(k =>
    !skip.has(k) && !aggSet.has(k) && allData.some(row => isNumericCell(row[k])));

  return [...agg, ...ind];
}
300
+
301
  // ─── BAR CHART ───
302
  function renderBar() {
303
  const width = container.clientWidth || 800;
304
  const margin = { top: 12, right: 56, bottom: 32, left: 140 };
305
 
306
+ const grouped = d3.group(allData, d => d[RUN_COL]);
307
  const finalData = [];
308
+ for (const [raw, rows] of grouped) {
309
+ const maxStep = d3.max(rows, r => +r[STEP_COL]);
310
+ const row = rows.find(r => +r[STEP_COL] === maxStep);
311
+ if (row) finalData.push({ name: displayName(raw), rawName: raw, value: +row[currentMetric] });
312
  }
313
  finalData.sort((a, b) => b.value - a.value);
314
 
 
346
  g.selectAll('path, line').attr('stroke', 'var(--axis-color)');
347
  });
348
 
 
 
 
349
  // Bars
350
+ const barTip = (ev, d) => {
351
+ const [mx, my] = d3.pointer(ev, container);
352
+ showTip(`<strong>${d.name}</strong><br/>${metricName(currentMetric)}: <strong>${d.value.toFixed(4)}</strong>`, mx, my);
353
+ };
354
  gRoot.selectAll('rect.bar').data(finalData, d => d.name).join(
355
  enter => enter.append('rect').attr('class', 'bar')
356
  .attr('x', 0).attr('y', d => y(d.name)).attr('height', y.bandwidth()).attr('rx', 3)
357
  .attr('fill', d => colorMap[d.rawName] || 'var(--primary-color)')
358
  .attr('width', 0)
359
  .on('mouseenter', (ev, d) => { highlight = d.name; updateHighlight(); })
360
+ .on('mousemove', barTip)
 
 
 
361
  .on('mouseleave', () => { hideTip(); highlight = null; updateHighlight(); })
362
  .transition().duration(300).attr('width', d => Math.max(0, x(d.value))),
363
  update => update
364
  .on('mouseenter', (ev, d) => { highlight = d.name; updateHighlight(); })
365
+ .on('mousemove', barTip)
 
 
 
366
  .on('mouseleave', () => { hideTip(); highlight = null; updateHighlight(); })
367
  .transition().duration(300)
368
  .attr('y', d => y(d.name)).attr('height', y.bandwidth())
 
395
  const innerWidth = width - margin.left - margin.right;
396
  const innerHeight = height - margin.top - margin.bottom;
397
 
398
+ // Build series
399
+ const grouped = d3.group(allData, d => d[RUN_COL]);
 
 
 
400
  const series = [];
401
+ for (const [raw, rows] of grouped) {
402
+ const pts = rows.map(r => ({ step: +r[STEP_COL], value: +r[currentMetric] })).sort((a, b) => a.step - b.step);
403
+ series.push({ name: displayName(raw), rawName: raw, values: pts });
404
  }
405
 
406
+ const allSteps = Array.from(new Set(allData.map(r => +r[STEP_COL]))).sort((a, b) => a - b);
407
  const allValues = series.flatMap(s => s.values.map(v => v.value));
408
 
409
  const x = d3.scaleLinear().domain(d3.extent(allSteps)).range([0, innerWidth]);
410
+ const yMin = d3.min(allValues), yMax = d3.max(allValues), yPad = (yMax - yMin) * 0.08;
 
 
411
  const y = d3.scaleLinear().domain([yMin - yPad, yMax + yPad]).range([innerHeight, 0]).nice();
412
 
413
  // Grid
414
  gRoot.selectAll('.grid').data([0]).join('g').attr('class', 'grid').call(g => {
415
  g.selectAll('line').data(y.ticks(6)).join('line')
416
+ .attr('x1', 0).attr('x2', innerWidth).attr('y1', d => y(d)).attr('y2', d => y(d));
 
417
  });
418
 
419
+ // X axis
420
  gRoot.selectAll('.axis-x').data([0]).join('g').attr('class', 'axes axis-x')
421
  .attr('transform', `translate(0,${innerHeight})`)
422
  .call(d3.axisBottom(x).ticks(6).tickFormat(d => stepLabelShort(d)).tickSizeOuter(0))
 
442
  gRoot.selectAll('.y-label').data([0]).join('text').attr('class', 'y-label')
443
  .attr('transform', 'rotate(-90)').attr('x', -innerHeight / 2).attr('y', -44)
444
  .attr('text-anchor', 'middle').attr('fill', 'var(--text-color)').attr('font-size', 12)
445
+ .text(metricName(currentMetric));
446
 
447
  // Lines
448
  const line = d3.line().x(d => x(d.step)).y(d => y(d.value)).curve(d3.curveMonotoneX);
 
456
  exit => exit.remove()
457
  );
458
 
459
+ // Dots
460
  const dotData = series.flatMap(s => s.values.map(v => ({ name: s.name, rawName: s.rawName, step: v.step, value: v.value })));
461
  gRoot.selectAll('.line-dot').data(dotData, d => d.name + '-' + d.step).join(
462
  enter => enter.append('circle').attr('class', 'line-dot')
 
469
  exit => exit.remove()
470
  );
471
 
472
+ // Hover overlay
473
  gRoot.selectAll('.hover-line').data([0]).join('line').attr('class', 'hover-line')
474
  .attr('y1', 0).attr('y2', innerHeight).style('display', 'none');
475
 
 
478
  .attr('fill', 'none').attr('pointer-events', 'all')
479
  .on('mousemove', (ev) => {
480
  const [mx] = d3.pointer(ev, gRoot.node());
481
+ const nearest = allSteps.reduce((best, s) => Math.abs(s - x.invert(mx)) < Math.abs(best - x.invert(mx)) ? s : best, allSteps[0]);
482
+ gRoot.select('.hover-line').attr('x1', x(nearest)).attr('x2', x(nearest)).style('display', null);
 
483
 
 
 
 
484
  const entries = series.map(s => {
485
  const pt = s.values.find(v => v.step === nearest);
486
  return pt ? { name: s.name, rawName: s.rawName, value: pt.value } : null;
 
490
  entries.forEach(e => {
491
  html += `<div><span class="tip-dot" style="background:${colorMap[e.rawName]}"></span>${e.name}: <strong>${e.value.toFixed(4)}</strong></div>`;
492
  });
 
493
  const [cx, cy] = d3.pointer(ev, container);
494
  showTip(html, cx, cy);
495
  })
 
499
  });
500
  }
501
 
502
+ // ─── RENDER ───
503
  function render() {
504
  if (!allData.length) return;
505
  initColors();
506
  gRoot.selectAll('*').remove();
507
+ if (currentView === 'bar') renderBar(); else renderLine();
 
508
  }
509
 
510
+ // ─── UI ───
511
  function buildUI() {
512
  const controls = document.createElement('div'); controls.className = 'controls';
513
 
514
  // View toggle
515
  const viewGroup = document.createElement('div'); viewGroup.className = 'control-group';
516
+ const viewLabel = document.createElement('label'); viewLabel.setAttribute('for', 'view-' + uid); viewLabel.textContent = 'View';
517
+ const viewSelect = document.createElement('select'); viewSelect.id = 'view-' + uid;
518
  [['bar', 'Final Score (Bar)'], ['line', 'Training Progression (Line)']].forEach(([val, text]) => {
519
  const opt = document.createElement('option'); opt.value = val; opt.textContent = text;
520
  if (val === currentView) opt.selected = true;
 
524
  viewGroup.appendChild(viewLabel); viewGroup.appendChild(viewSelect);
525
  controls.appendChild(viewGroup);
526
 
527
+ // Metric select (populated after data load)
528
  const metricGroup = document.createElement('div'); metricGroup.className = 'control-group';
529
+ const metricLabel = document.createElement('label'); metricLabel.setAttribute('for', 'metric-' + uid); metricLabel.textContent = 'Metric';
530
+ const metricSelect = document.createElement('select'); metricSelect.id = 'metric-' + uid;
 
 
 
 
 
 
 
 
 
531
  metricGroup.appendChild(metricLabel); metricGroup.appendChild(metricSelect);
532
  controls.appendChild(metricGroup);
533
 
 
539
  container.appendChild(legend);
540
  }
541
 
542
/**
 * Fill this instance's metric <select> from `metricKeys`.
 *
 * Keys starting with `agg_score` go into an "Aggregate Scores" group, all
 * others into "Individual Benchmarks"; empty groups are omitted and the
 * option matching `currentMetric` is pre-selected. Choosing an option updates
 * `currentMetric` and re-renders the chart.
 *
 * The function clears and rebuilds the options, so it is safe to call again
 * after the data changes. The change handler is therefore assigned rather
 * than added: stacking a new listener on every call would run `render()`
 * once per accumulated listener.
 */
function populateMetricSelect() {
  const sel = container.querySelector('#metric-' + uid);
  if (!sel) return;
  sel.innerHTML = '';

  const aggGroup = document.createElement('optgroup');
  aggGroup.label = 'Aggregate Scores';
  const indGroup = document.createElement('optgroup');
  indGroup.label = 'Individual Benchmarks';

  metricKeys.forEach(key => {
    const opt = document.createElement('option');
    opt.value = key;
    opt.textContent = metricName(key);
    if (key === currentMetric) opt.selected = true;
    const target = key.startsWith('agg_score') ? aggGroup : indGroup;
    target.appendChild(opt);
  });

  if (aggGroup.children.length) sel.appendChild(aggGroup);
  if (indGroup.children.length) sel.appendChild(indGroup);

  // Single handler slot: re-population replaces the handler instead of duplicating it.
  sel.onchange = () => { currentMetric = sel.value; render(); };
}
557
+
558
  function buildLegend() {
559
  const items = container.querySelector('.legend .items');
560
  if (!items) return;
561
  items.innerHTML = '';
562
+ // Sort by final score (max step) on current default metric, descending
563
+ const grouped = d3.group(allData, d => d[RUN_COL]);
564
+ const sorted = Array.from(grouped.entries())
565
+ .map(([raw, rows]) => {
566
+ const maxStep = d3.max(rows, r => +r[STEP_COL]);
567
+ const row = rows.find(r => +r[STEP_COL] === maxStep);
568
+ return { raw, score: row ? +row[defaultMetric] : 0 };
569
+ })
570
+ .sort((a, b) => b.score - a.score)
571
+ .map(d => d.raw);
572
+ sorted.forEach(raw => {
573
+ const name = displayName(raw);
574
+ const el = document.createElement('span'); el.className = 'item'; el.setAttribute('data-name', name);
575
+ const sw = document.createElement('span'); sw.className = 'swatch'; sw.style.background = colorMap[raw] || '#999';
576
+ const txt = document.createElement('span'); txt.textContent = name;
577
  el.appendChild(sw); el.appendChild(txt); items.appendChild(el);
578
+ el.addEventListener('mouseenter', () => { highlight = name; updateHighlight(); });
579
  el.addEventListener('mouseleave', () => { highlight = null; updateHighlight(); });
580
  });
581
  }
 
607
  try {
608
  const text = await fetchFirstAvailable(csvPaths);
609
  allData = d3.csvParse(text);
610
+ metricKeys = detectMetrics(allData.columns);
611
+ // Ensure defaultMetric is valid; fall back to first available
612
+ if (!metricKeys.includes(currentMetric)) currentMetric = metricKeys[0];
613
+ populateMetricSelect();
614
  render();
615
  buildLegend();
616
  if (window.ResizeObserver) { new ResizeObserver(() => render()).observe(container); }