Spaces:
Running
Running
feat: update with real data points
Browse files- data/bed_dataset.csv +247 -13
- data/bigwig_dataset.csv +0 -0
- src/streamlit_app.py +115 -75
data/bed_dataset.csv
CHANGED
|
@@ -1,13 +1,247 @@
|
|
| 1 |
-
species,datasets
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MCC,model_name,species,datasets
|
| 2 |
+
0.334637850522995,NTv2 500M,cattle,intron
|
| 3 |
+
0.1238768473267555,BPNet arch. 6M,cattle,intron
|
| 4 |
+
0.383470207452774,Residual CNN 44M,cattle,intron
|
| 5 |
+
0.3828243613243103,HyenaDNA 7M,cattle,intron
|
| 6 |
+
0.4733810424804687,Caduceus 7M,cattle,intron
|
| 7 |
+
0.4315277338027954,Evo2 1B,cattle,intron
|
| 8 |
+
0.5455867648124695,NTv3 8M (pre),cattle,intron
|
| 9 |
+
0.5453664064407349,NTv3 100M (pre),cattle,intron
|
| 10 |
+
0.5628412365913391,NTv3 650M (pre),cattle,intron
|
| 11 |
+
0.5682631134986877,NTv3 650M (post),cattle,intron
|
| 12 |
+
0.3689357042312622,NTv2 500M,cattle,exon
|
| 13 |
+
0.3250860869884491,BPNet arch. 6M,cattle,exon
|
| 14 |
+
0.4674676060676574,Residual CNN 44M,cattle,exon
|
| 15 |
+
0.2207767516374588,HyenaDNA 7M,cattle,exon
|
| 16 |
+
0.4960922300815582,Caduceus 7M,cattle,exon
|
| 17 |
+
0.4969632029533386,Evo2 1B,cattle,exon
|
| 18 |
+
0.5432836413383484,NTv3 8M (pre),cattle,exon
|
| 19 |
+
0.5531933307647705,NTv3 100M (pre),cattle,exon
|
| 20 |
+
0.591151773929596,NTv3 650M (pre),cattle,exon
|
| 21 |
+
0.6253225207328796,NTv3 650M (post),cattle,exon
|
| 22 |
+
0.118808165192604,NTv2 500M,cattle,splice acceptor
|
| 23 |
+
0.4715546369552612,BPNet arch. 6M,cattle,splice acceptor
|
| 24 |
+
0.6620649099349976,Residual CNN 44M,cattle,splice acceptor
|
| 25 |
+
0.104436807334423,HyenaDNA 7M,cattle,splice acceptor
|
| 26 |
+
0.7064619660377502,Caduceus 7M,cattle,splice acceptor
|
| 27 |
+
0.2085049450397491,Evo2 1B,cattle,splice acceptor
|
| 28 |
+
0.7254849076271057,NTv3 8M (pre),cattle,splice acceptor
|
| 29 |
+
0.7404072880744934,NTv3 100M (pre),cattle,splice acceptor
|
| 30 |
+
0.7732946872711182,NTv3 650M (pre),cattle,splice acceptor
|
| 31 |
+
0.7679624557495117,NTv3 650M (post),cattle,splice acceptor
|
| 32 |
+
0.1412438601255417,NTv2 500M,cattle,start codon
|
| 33 |
+
0.1490814685821533,BPNet arch. 6M,cattle,start codon
|
| 34 |
+
0.3243320286273956,Residual CNN 44M,cattle,start codon
|
| 35 |
+
0.056509330868721,HyenaDNA 7M,cattle,start codon
|
| 36 |
+
0.3455557227134704,Caduceus 7M,cattle,start codon
|
| 37 |
+
0.1030694246292114,Evo2 1B,cattle,start codon
|
| 38 |
+
0.5275959968566895,NTv3 8M (pre),cattle,start codon
|
| 39 |
+
0.4962065815925598,NTv3 100M (pre),cattle,start codon
|
| 40 |
+
0.5591813921928406,NTv3 650M (pre),cattle,start codon
|
| 41 |
+
0.5492052435874939,NTv3 650M (post),cattle,start codon
|
| 42 |
+
0.5492052435874939,NTv2 500M,cattle,start codon
|
| 43 |
+
0.1015273928642273,BPNet arch. 6M,cattle,intron
|
| 44 |
+
0.3299930691719055,Residual CNN 44M,cattle,intron
|
| 45 |
+
0.3826011121273041,HyenaDNA 7M,cattle,intron
|
| 46 |
+
0.5564854741096497,Caduceus 7M,cattle,intron
|
| 47 |
+
0.5564854741096497,NTv2 500M,cattle,intron
|
| 48 |
+
0.323502242565155,BPNet arch. 6M,cattle,exon
|
| 49 |
+
0.519285261631012,Residual CNN 44M,cattle,exon
|
| 50 |
+
0.1038060635328292,HyenaDNA 7M,cattle,splice acceptor
|
| 51 |
+
0.1038060635328292,Caduceus 7M,cattle,splice acceptor
|
| 52 |
+
0.1038060635328292,NTv2 500M,cattle,splice acceptor
|
| 53 |
+
0.4435675740242004,BPNet arch. 6M,cattle,splice acceptor
|
| 54 |
+
0.6590774655342102,Residual CNN 44M,cattle,splice acceptor
|
| 55 |
+
0.1038060635328292,HyenaDNA 7M,cattle,splice acceptor
|
| 56 |
+
0.1038060635328292,Caduceus 7M,cattle,splice acceptor
|
| 57 |
+
0.1038060635328292,NTv2 500M,cattle,splice acceptor
|
| 58 |
+
0.0901669710874557,BPNet arch. 6M,cattle,start codon
|
| 59 |
+
0.3548502624034881,Residual CNN 44M,cattle,start codon
|
| 60 |
+
0.0545537285506725,HyenaDNA 7M,cattle,start codon
|
| 61 |
+
0.0545537285506725,Caduceus 7M,cattle,start codon
|
| 62 |
+
0.0639578104019165,BPNet arch. 6M,cattle,start codon
|
| 63 |
+
0.3266464471817016,Residual CNN 44M,cattle,intron
|
| 64 |
+
0.3266464471817016,HyenaDNA 7M,cattle,intron
|
| 65 |
+
0.3266464471817016,Caduceus 7M,cattle,intron
|
| 66 |
+
0.1383400112390518,BPNet arch. 6M,cattle,intron
|
| 67 |
+
0.4856111407279968,Residual CNN 44M,cattle,exon
|
| 68 |
+
0.4856111407279968,HyenaDNA 7M,cattle,exon
|
| 69 |
+
0.4856111407279968,Caduceus 7M,cattle,exon
|
| 70 |
+
0.4220209121704101,BPNet arch. 6M,cattle,splice acceptor
|
| 71 |
+
0.689546525478363,Residual CNN 44M,cattle,splice acceptor
|
| 72 |
+
0.689546525478363,HyenaDNA 7M,cattle,splice acceptor
|
| 73 |
+
0.689546525478363,Caduceus 7M,cattle,splice acceptor
|
| 74 |
+
0.0930091217160224,BPNet arch. 6M,cattle,start codon
|
| 75 |
+
0.423166275024414,Residual CNN 44M,cattle,start codon
|
| 76 |
+
0.423166275024414,HyenaDNA 7M,cattle,start codon
|
| 77 |
+
0.423166275024414,Caduceus 7M,cattle,start codon
|
| 78 |
+
0.4777896404266357,NTv2 500M,tomato,intron
|
| 79 |
+
0.3216900527477264,BPNet arch. 6M,tomato,intron
|
| 80 |
+
0.46840900182724,Residual CNN 44M,tomato,intron
|
| 81 |
+
0.5251263380050659,PlantCAD2 88M,tomato,intron
|
| 82 |
+
0.747674286365509,Evo2 1B,tomato,intron
|
| 83 |
+
0.6858112812042236,NTv3 8M (pre),tomato,intron
|
| 84 |
+
0.7038365006446838,NTv3 100M (pre),tomato,intron
|
| 85 |
+
0.7481895685195923,NTv3 650M (pre),tomato,intron
|
| 86 |
+
0.7458349466323853,NTv3 650M (post),tomato,intron
|
| 87 |
+
0.6147475838661194,NTv2 500M,tomato,exon
|
| 88 |
+
0.4551227986812591,BPNet arch. 6M,tomato,exon
|
| 89 |
+
0.5068296194076538,Residual CNN 44M,tomato,exon
|
| 90 |
+
0.7256030440330505,PlantCAD2 88M,tomato,exon
|
| 91 |
+
0.7006198763847351,Evo2 1B,tomato,exon
|
| 92 |
+
0.7537696361541748,NTv3 8M (pre),tomato,exon
|
| 93 |
+
0.7484462857246399,NTv3 100M (pre),tomato,exon
|
| 94 |
+
0.764011561870575,NTv3 650M (pre),tomato,exon
|
| 95 |
+
0.7750575542449951,NTv3 650M (post),tomato,exon
|
| 96 |
+
0.1691933125257492,NTv2 500M,tomato,splice acceptor
|
| 97 |
+
0.125656172633171,BPNet arch. 6M,tomato,splice acceptor
|
| 98 |
+
0.4359458982944488,Residual CNN 44M,tomato,splice acceptor
|
| 99 |
+
0.744257926940918,PlantCAD2 88M,tomato,splice acceptor
|
| 100 |
+
0.3791649639606476,Evo2 1B,tomato,splice acceptor
|
| 101 |
+
0.6623862385749817,NTv3 8M (pre),tomato,splice acceptor
|
| 102 |
+
0.6843105554580688,NTv3 100M (pre),tomato,splice acceptor
|
| 103 |
+
0.7641868591308594,NTv3 650M (pre),tomato,splice acceptor
|
| 104 |
+
0.7584431767463684,NTv3 650M (post),tomato,splice acceptor
|
| 105 |
+
0.132934883236885,NTv2 500M,tomato,start codon
|
| 106 |
+
0.0,BPNet arch. 6M,tomato,start codon
|
| 107 |
+
0.088478960096836,Residual CNN 44M,tomato,start codon
|
| 108 |
+
0.2019559442996978,PlantCAD2 88M,tomato,start codon
|
| 109 |
+
0.1622217148542404,Evo2 1B,tomato,start codon
|
| 110 |
+
0.2966536581516266,NTv3 8M (pre),tomato,start codon
|
| 111 |
+
0.3968957066535949,NTv3 100M (pre),tomato,start codon
|
| 112 |
+
0.4830105900764465,NTv3 650M (pre),tomato,start codon
|
| 113 |
+
0.5007501244544983,NTv3 650M (post),tomato,start codon
|
| 114 |
+
0.6770024299621582,NTv2 500M,tomato,intron
|
| 115 |
+
0.2927957773208618,BPNet arch. 6M,tomato,intron
|
| 116 |
+
0.1383400112390518,Residual CNN 44M,tomato,intron
|
| 117 |
+
0.1383400112390518,PlantCAD2 88M,tomato,intron
|
| 118 |
+
0.5751976370811462,NTv2 500M,tomato,exon
|
| 119 |
+
0.3057552278041839,BPNet arch. 6M,tomato,exon
|
| 120 |
+
0.168193981051445,NTv2 500M,tomato,splice acceptor
|
| 121 |
+
0.0,BPNet arch. 6M,tomato,splice acceptor
|
| 122 |
+
0.4833243191242218,Residual CNN 44M,tomato,splice acceptor
|
| 123 |
+
0.4833243191242218,PlantCAD2 88M,tomato,splice acceptor
|
| 124 |
+
0.1586925536394119,NTv2 500M,tomato,start codon
|
| 125 |
+
0.0,BPNet arch. 6M,tomato,start codon
|
| 126 |
+
0.1107296794652938,Residual CNN 44M,tomato,start codon
|
| 127 |
+
0.1107296794652938,PlantCAD2 88M,tomato,start codon
|
| 128 |
+
0.3502058088779449,BPNet arch. 6M,tomato,intron
|
| 129 |
+
0.5514466166496277,Residual CNN 44M,tomato,intron
|
| 130 |
+
0.5514466166496277,PlantCAD2 88M,tomato,intron
|
| 131 |
+
0.3020758032798767,BPNet arch. 6M,tomato,exon
|
| 132 |
+
0.4746756553649902,Residual CNN 44M,tomato,exon
|
| 133 |
+
0.4746756553649902,PlantCAD2 88M,tomato,exon
|
| 134 |
+
0.0,BPNet arch. 6M,tomato,splice acceptor
|
| 135 |
+
0.3391502797603607,Residual CNN 44M,tomato,splice acceptor
|
| 136 |
+
0.3391502797603607,PlantCAD2 88M,tomato,splice acceptor
|
| 137 |
+
0.0639578104019165,BPNet arch. 6M,tomato,start codon
|
| 138 |
+
0.0914037525653839,Residual CNN 44M,tomato,start codon
|
| 139 |
+
0.0914037525653839,PlantCAD2 88M,tomato,start codon
|
| 140 |
+
0.1995969861745834,NTv2 500M,human,intron
|
| 141 |
+
0.0296161584556102,BPNet arch. 6M,human,intron
|
| 142 |
+
0.2347834408283233,Residual CNN 44M,human,intron
|
| 143 |
+
0.33451908826828,HyenaDNA 7M,human,intron
|
| 144 |
+
0.4144788980484009,Caduceus 7M,human,intron
|
| 145 |
+
0.0,Evo2 1B,human,intron
|
| 146 |
+
0.4695742726325989,NTv3 8M (pre),human,intron
|
| 147 |
+
0.475054919719696,NTv3 100M (pre),human,intron
|
| 148 |
+
0.5504136681556702,NTv3 650M (pre),human,intron
|
| 149 |
+
0.5643875002861023,NTv3 650M (post),human,intron
|
| 150 |
+
0.1995969861745834,NTv2 500M,human,intron
|
| 151 |
+
0.2706590592861175,BPNet arch. 6M,human,exon
|
| 152 |
+
0.2678671479225158,Residual CNN 44M,human,exon
|
| 153 |
+
0.179698497056961,HyenaDNA 7M,human,exon
|
| 154 |
+
0.5098947286605835,Caduceus 7M,human,exon
|
| 155 |
+
0.4510694444179535,Evo2 1B,human,exon
|
| 156 |
+
0.6089931726455688,NTv3 8M (pre),human,exon
|
| 157 |
+
0.6492856740951538,NTv3 100M (pre),human,exon
|
| 158 |
+
0.6975767016410828,NTv3 650M (pre),human,exon
|
| 159 |
+
0.6822624206542969,NTv3 650M (post),human,exon
|
| 160 |
+
0.1493269056081771,NTv2 500M,human,splice acceptor
|
| 161 |
+
0.3807527124881744,BPNet arch. 6M,human,splice acceptor
|
| 162 |
+
0.6632664203643799,Residual CNN 44M,human,splice acceptor
|
| 163 |
+
0.1002769619226455,HyenaDNA 7M,human,splice acceptor
|
| 164 |
+
0.7357247471809387,Caduceus 7M,human,splice acceptor
|
| 165 |
+
0.1821079105138778,Evo2 1B,human,splice acceptor
|
| 166 |
+
0.7726271748542786,NTv3 8M (pre),human,splice acceptor
|
| 167 |
+
0.77947598695755,NTv3 100M (pre),human,splice acceptor
|
| 168 |
+
0.8028115034103394,NTv3 650M (pre),human,splice acceptor
|
| 169 |
+
0.7979229092597961,NTv3 650M (post),human,splice acceptor
|
| 170 |
+
0.139576569199562,NTv2 500M,human,start codon
|
| 171 |
+
0.1334401220083236,BPNet arch. 6M,human,start codon
|
| 172 |
+
0.3876807987689972,Residual CNN 44M,human,start codon
|
| 173 |
+
0.1003016158938407,HyenaDNA 7M,human,start codon
|
| 174 |
+
0.3958532512187958,Caduceus 7M,human,start codon
|
| 175 |
+
0.1399599611759185,Evo2 1B,human,start codon
|
| 176 |
+
0.540923535823822,NTv3 8M (pre),human,start codon
|
| 177 |
+
0.5464004278182983,NTv3 100M (pre),human,start codon
|
| 178 |
+
0.6803378462791443,NTv3 650M (pre),human,start codon
|
| 179 |
+
0.7310947179794312,NTv3 650M (post),human,start codon
|
| 180 |
+
0.7310947179794312,NTv2 500M,human,start codon
|
| 181 |
+
0.0172978900372982,BPNet arch. 6M,human,intron
|
| 182 |
+
0.2740728259086609,Residual CNN 44M,human,intron
|
| 183 |
+
0.3312098085880279,HyenaDNA 7M,human,intron
|
| 184 |
+
0.5108950138092041,Caduceus 7M,human,intron
|
| 185 |
+
0.5034915208816528,NTv3 8M (pre),human,intron
|
| 186 |
+
0.5154411792755127,NTv3 100M (pre),human,intron
|
| 187 |
+
0.5814740061759949,NTv3 650M (pre),human,intron
|
| 188 |
+
0.5920455455780029,NTv3 650M (post),human,intron
|
| 189 |
+
0.5920455455780029,NTv2 500M,human,intron
|
| 190 |
+
0.2252149283885955,BPNet arch. 6M,human,exon
|
| 191 |
+
0.4010578095912933,Residual CNN 44M,human,exon
|
| 192 |
+
0.1851459741592407,HyenaDNA 7M,human,exon
|
| 193 |
+
0.4599409103393554,Caduceus 7M,human,exon
|
| 194 |
+
0.5931490063667297,NTv3 8M (pre),human,exon
|
| 195 |
+
0.6058318018913269,NTv3 100M (pre),human,exon
|
| 196 |
+
0.6738048791885376,NTv3 650M (pre),human,exon
|
| 197 |
+
0.6738048791885376,NTv3 650M (post),human,exon
|
| 198 |
+
0.6738048791885376,NTv2 500M,human,exon
|
| 199 |
+
0.3751010596752167,BPNet arch. 6M,human,splice acceptor
|
| 200 |
+
0.681228756904602,Residual CNN 44M,human,splice acceptor
|
| 201 |
+
0.0252278540283441,HyenaDNA 7M,human,splice acceptor
|
| 202 |
+
0.7485092878341675,Caduceus 7M,human,splice acceptor
|
| 203 |
+
0.7772909998893738,NTv3 8M (pre),human,splice acceptor
|
| 204 |
+
0.794090747833252,NTv3 100M (pre),human,splice acceptor
|
| 205 |
+
0.8239933252334595,NTv3 650M (pre),human,splice acceptor
|
| 206 |
+
0.804115891456604,NTv3 650M (post),human,splice acceptor
|
| 207 |
+
0.804115891456604,NTv2 500M,human,splice acceptor
|
| 208 |
+
0.0,BPNet arch. 6M,human,start codon
|
| 209 |
+
0.3292546272277832,Residual CNN 44M,human,start codon
|
| 210 |
+
0.0647941380739212,HyenaDNA 7M,human,start codon
|
| 211 |
+
0.4505241215229034,Caduceus 7M,human,start codon
|
| 212 |
+
0.60422682762146,NTv3 8M (pre),human,start codon
|
| 213 |
+
0.6015576124191284,NTv3 100M (pre),human,start codon
|
| 214 |
+
0.6452956795692444,NTv3 650M (pre),human,start codon
|
| 215 |
+
0.6761345267295837,NTv3 650M (post),human,start codon
|
| 216 |
+
0.0185965970158576,BPNet arch. 6M,human,intron
|
| 217 |
+
0.2623045742511749,Residual CNN 44M,human,intron
|
| 218 |
+
0.2623045742511749,HyenaDNA 7M,human,intron
|
| 219 |
+
0.2623045742511749,Caduceus 7M,human,intron
|
| 220 |
+
0.4804849028587341,NTv3 8M (pre),human,intron
|
| 221 |
+
0.482195496559143,NTv3 100M (pre),human,intron
|
| 222 |
+
0.5425574779510498,NTv3 650M (pre),human,intron
|
| 223 |
+
0.5443048477172852,NTv3 650M (post),human,intron
|
| 224 |
+
0.2360571771860122,BPNet arch. 6M,human,exon
|
| 225 |
+
0.2360571771860122,Residual CNN 44M,human,exon
|
| 226 |
+
0.2360571771860122,HyenaDNA 7M,human,exon
|
| 227 |
+
0.2360571771860122,Caduceus 7M,human,exon
|
| 228 |
+
0.6339762210845947,NTv3 8M (pre),human,exon
|
| 229 |
+
0.6433913111686707,NTv3 100M (pre),human,exon
|
| 230 |
+
0.6518793702125549,NTv3 650M (pre),human,exon
|
| 231 |
+
0.6812491416931152,NTv3 650M (post),human,exon
|
| 232 |
+
0.3842235207557678,BPNet arch. 6M,human,splice acceptor
|
| 233 |
+
0.6810190081596375,Residual CNN 44M,human,splice acceptor
|
| 234 |
+
0.6810190081596375,HyenaDNA 7M,human,splice acceptor
|
| 235 |
+
0.6810190081596375,Caduceus 7M,human,splice acceptor
|
| 236 |
+
0.7796080708503723,NTv3 8M (pre),human,splice acceptor
|
| 237 |
+
0.7596970200538635,NTv3 100M (pre),human,splice acceptor
|
| 238 |
+
0.7915040850639343,NTv3 650M (pre),human,splice acceptor
|
| 239 |
+
0.7957100868225098,NTv3 650M (post),human,splice acceptor
|
| 240 |
+
0.1114460304379463,BPNet arch. 6M,human,start codon
|
| 241 |
+
0.3342535495758056,Residual CNN 44M,human,start codon
|
| 242 |
+
0.3342535495758056,HyenaDNA 7M,human,start codon
|
| 243 |
+
0.3342535495758056,Caduceus 7M,human,start codon
|
| 244 |
+
0.5167152881622314,NTv3 8M (pre),human,start codon
|
| 245 |
+
0.5340564250946045,NTv3 100M (pre),human,start codon
|
| 246 |
+
0.6148532032966614,NTv3 650M (pre),human,start codon
|
| 247 |
+
0.6582212448120117,NTv3 650M (post),human,start codon
|
data/bigwig_dataset.csv
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/streamlit_app.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
from typing import List
|
| 2 |
-
import ast
|
| 3 |
import os
|
| 4 |
|
| 5 |
import pandas as pd
|
|
@@ -10,47 +9,73 @@ import plotly.express as px
|
|
| 10 |
# Page config (must be the first Streamlit command)
|
| 11 |
# ---------------------------------------------------------------------
|
| 12 |
st.set_page_config(
|
| 13 |
-
page_title="
|
| 14 |
layout="wide",
|
| 15 |
)
|
| 16 |
|
| 17 |
# ---------------------------------------------------------------------
|
| 18 |
# Configuration
|
| 19 |
# ---------------------------------------------------------------------
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
MODEL_COLORS = {
|
| 34 |
-
"
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
"
|
| 40 |
-
"
|
| 41 |
-
"
|
| 42 |
-
"
|
|
|
|
| 43 |
}
|
| 44 |
|
|
|
|
|
|
|
| 45 |
_LAST_UPDATED = "Dec 10, 2025"
|
| 46 |
_INTRO = """
|
| 47 |
-
|
| 48 |
|
| 49 |
- **Pearson correlations (multi-assay)**: per-dataset scores across species and models.
|
| 50 |
- **MCC (bed tracks)**: per-track MCC values across species and models.
|
| 51 |
|
| 52 |
-
|
| 53 |
-
We expand this to (Model × Species × Dataset) and aggregate according to your filters.
|
| 54 |
"""
|
| 55 |
|
| 56 |
HERE = os.path.dirname(os.path.abspath(__file__)) # /app/src
|
|
@@ -59,6 +84,7 @@ DATA_DIR = os.path.join(PROJECT_ROOT, "data")
|
|
| 59 |
|
| 60 |
PEARSON_PATH = os.path.join(DATA_DIR, "bigwig_dataset.csv")
|
| 61 |
MCC_PATH = os.path.join(DATA_DIR, "bed_dataset.csv")
|
|
|
|
| 62 |
# ---------------------------------------------------------------------
|
| 63 |
# Data loading & preprocessing
|
| 64 |
# ---------------------------------------------------------------------
|
|
@@ -72,57 +98,71 @@ def load_raw_data():
|
|
| 72 |
pearson_df.columns = [c.strip() for c in pearson_df.columns]
|
| 73 |
mcc_df.columns = [c.strip() for c in mcc_df.columns]
|
| 74 |
|
| 75 |
-
# Optional: basic sanity check on required columns
|
| 76 |
-
# required_p = {"species", "datasets", "pearson correlation"}
|
| 77 |
-
# required_m = {"species", "datasets", "MCC"}
|
| 78 |
-
# missing_p = required_p - set(pearson_df.columns)
|
| 79 |
-
# missing_m = required_m - set(mcc_df.columns)
|
| 80 |
-
# if missing_p:
|
| 81 |
-
# st.error(f"Pearson CSV missing columns: {missing_p}")
|
| 82 |
-
# if missing_m:
|
| 83 |
-
# st.error(f"MCC CSV missing columns: {missing_m}")
|
| 84 |
-
|
| 85 |
return pearson_df, mcc_df
|
| 86 |
|
| 87 |
|
| 88 |
-
|
|
|
|
| 89 |
"""
|
| 90 |
-
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
"""
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
continue
|
| 104 |
-
|
| 105 |
-
n_models = min(len(MODEL_NAMES), len(values))
|
| 106 |
-
for i in range(n_models):
|
| 107 |
-
new_row = {
|
| 108 |
-
"species": row["species"],
|
| 109 |
-
"datasets": row["datasets"],
|
| 110 |
-
"Model": MODEL_NAMES[i],
|
| 111 |
-
"Score": float(values[i]),
|
| 112 |
-
}
|
| 113 |
-
if "assay_type" in row.index:
|
| 114 |
-
new_row["assay_type"] = row["assay_type"]
|
| 115 |
-
rows.append(new_row)
|
| 116 |
-
|
| 117 |
-
return pd.DataFrame(rows)
|
| 118 |
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
|
| 128 |
_PEARSON_DF, _MCC_DF = load_expanded_data()
|
|
@@ -259,7 +299,7 @@ def sidebar_toggle(label: str, value: bool = False, key: str | None = None) -> b
|
|
| 259 |
|
| 260 |
|
| 261 |
def main():
|
| 262 |
-
st.title("🧬
|
| 263 |
st.markdown(_INTRO)
|
| 264 |
st.markdown(f"_Last updated: **{_LAST_UPDATED}**_")
|
| 265 |
|
|
@@ -286,7 +326,7 @@ def main():
|
|
| 286 |
|
| 287 |
# Assay toggles (Pearson only), based on filtered species
|
| 288 |
if cfg.get("has_assay_type", False):
|
| 289 |
-
st.sidebar.subheader("Assay types
|
| 290 |
if selected_species:
|
| 291 |
df_for_assays = df_bench[df_bench["species"].isin(selected_species)]
|
| 292 |
else:
|
|
@@ -305,8 +345,8 @@ def main():
|
|
| 305 |
|
| 306 |
# Bed track / dataset toggles (MCC only), based on species selection
|
| 307 |
selected_datasets: List[str] = []
|
| 308 |
-
if benchmark_name == "MCC
|
| 309 |
-
st.sidebar.subheader("
|
| 310 |
if selected_species:
|
| 311 |
df_for_tracks = df_bench[df_bench["species"].isin(selected_species)]
|
| 312 |
else:
|
|
@@ -318,7 +358,7 @@ def main():
|
|
| 318 |
else:
|
| 319 |
selected_datasets = []
|
| 320 |
|
| 321 |
-
# Model toggles (we keep all models
|
| 322 |
st.sidebar.subheader("Models")
|
| 323 |
selected_models: List[str] = []
|
| 324 |
for model in _ALL_MODELS:
|
|
|
|
| 1 |
from typing import List
|
|
|
|
| 2 |
import os
|
| 3 |
|
| 4 |
import pandas as pd
|
|
|
|
| 9 |
# Page config (must be the first Streamlit command)
|
| 10 |
# ---------------------------------------------------------------------
|
| 11 |
st.set_page_config(
|
| 12 |
+
page_title="NTv3 Benchmark",
|
| 13 |
layout="wide",
|
| 14 |
)
|
| 15 |
|
| 16 |
# ---------------------------------------------------------------------
|
| 17 |
# Configuration
|
| 18 |
# ---------------------------------------------------------------------
|
| 19 |
+
COLORS = {
|
| 20 |
+
# Primary colors 1 (our models)
|
| 21 |
+
'blue_0': '#004697', # Darkest allowable blue
|
| 22 |
+
'blue_1': '#3973fc', # Main blue
|
| 23 |
+
'blue_2': '#7ea4fc', # Medium blue
|
| 24 |
+
'blue_3': '#c3d5fc', # Light blue (lightest allowable blue)
|
| 25 |
+
# Secondary colors 1
|
| 26 |
+
'red_1': '#ff554d', # Medium red
|
| 27 |
+
'red_2': '#ffe0de', # Light red
|
| 28 |
+
# Primary colors 2
|
| 29 |
+
'green_1': '#00b050', # Darkest green
|
| 30 |
+
'green_2': '#92d050', # Medium green
|
| 31 |
+
'green_3': '#c6e0b4', # Light green (lightest allowable green)
|
| 32 |
+
# Secondary colors 2
|
| 33 |
+
'gold_1': '#fdb932',
|
| 34 |
+
# Tertiary colors
|
| 35 |
+
'orange_1': '#ff975e',
|
| 36 |
+
'purple_1': '#9a6ce4',
|
| 37 |
+
'purple_2': '#bb9aef', # Medium purple
|
| 38 |
+
'purple_3': '#ceb5f5', # Light purple (lightest allowable purple)
|
| 39 |
+
# Grays (other models)
|
| 40 |
+
'gray_1': '#808080', # Darkest gray (use as a last resort)
|
| 41 |
+
'gray_2': '#b3b3b3', # Medium gray (start with this as the darkest when possible)
|
| 42 |
+
'gray_3': '#e6e6e6', # Lightest gray
|
| 43 |
+
'gray_4': '#ffffff', # It's actually just white (use as a last resort)
|
| 44 |
+
# If all other options are exhausted
|
| 45 |
+
'cyan_1': '#0096b4', # Darkest teal
|
| 46 |
+
'cyan_2': '#28bed2', # Medium cyan
|
| 47 |
+
'cyan_3': '#8cdceb', # Lightest cyan
|
| 48 |
+
'magenta_1': '#b428a0', # Darkest magenta
|
| 49 |
+
'magenta_2': '#dc50be', # Medium pink
|
| 50 |
+
'magenta_3': '#f5a0dc', # Lightest pink
|
| 51 |
+
'yellow_1': '#c8aa00', # Darkest yellow
|
| 52 |
+
'yellow_2': '#ffd200', # Medium yellow
|
| 53 |
+
'yellow_3': '#fff08c', # Lightest yellow
|
| 54 |
+
}
|
| 55 |
|
| 56 |
MODEL_COLORS = {
|
| 57 |
+
"NTv3 650M (post)": COLORS['blue_0'],
|
| 58 |
+
'NTv3 650M (pre)': COLORS['blue_1'], # #3973fc (Darkest blue)
|
| 59 |
+
'NTv3 100M (pre)': COLORS['blue_2'], # #7ea4fc (Medium blue)
|
| 60 |
+
'NTv3 8M (pre)': COLORS['blue_3'], # #c3d5fc (Light blue)
|
| 61 |
+
'Evo2 1B': COLORS['green_3'], # #b3b3b3 (Medium gray)
|
| 62 |
+
"NTv2 500M": COLORS['gray_1'],
|
| 63 |
+
"BPNet arch. 6M": COLORS['cyan_1'],
|
| 64 |
+
"Residual CNN 44M": COLORS['magenta_1'],
|
| 65 |
+
"PlantCAD2 88M": COLORS["purple_1"],
|
| 66 |
+
"Caduceus 7M": COLORS["purple_2"]
|
| 67 |
}
|
| 68 |
|
| 69 |
+
MODEL_NAMES = list(MODEL_COLORS.keys())
|
| 70 |
+
|
| 71 |
_LAST_UPDATED = "Dec 10, 2025"
|
| 72 |
_INTRO = """
|
| 73 |
+
Benchmark across gene annotation and functionnal tracks.
|
| 74 |
|
| 75 |
- **Pearson correlations (multi-assay)**: per-dataset scores across species and models.
|
| 76 |
- **MCC (bed tracks)**: per-track MCC values across species and models.
|
| 77 |
|
| 78 |
+
These tasks measure the model's ability the generalize to unseen tracks, species and assay types.
|
|
|
|
| 79 |
"""
|
| 80 |
|
| 81 |
HERE = os.path.dirname(os.path.abspath(__file__)) # /app/src
|
|
|
|
| 84 |
|
| 85 |
PEARSON_PATH = os.path.join(DATA_DIR, "bigwig_dataset.csv")
|
| 86 |
MCC_PATH = os.path.join(DATA_DIR, "bed_dataset.csv")
|
| 87 |
+
|
| 88 |
# ---------------------------------------------------------------------
|
| 89 |
# Data loading & preprocessing
|
| 90 |
# ---------------------------------------------------------------------
|
|
|
|
| 98 |
pearson_df.columns = [c.strip() for c in pearson_df.columns]
|
| 99 |
mcc_df.columns = [c.strip() for c in mcc_df.columns]
|
| 100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
return pearson_df, mcc_df
|
| 102 |
|
| 103 |
|
| 104 |
+
@st.cache_data
def load_expanded_data():
    """
    Load both benchmark tables and convert them into a unified schema.

    Input rows are expected to already be in long format:
        (species, [assay_type], datasets, model_name, metric)
    and are converted to:
        species, assay_type?, datasets, Model, Score

    For Pearson:
        If multiple rows share (species, assay_type, datasets, Model),
        we average their Score.

    For MCC:
        If multiple rows share (species, datasets, Model),
        we average their Score.

    Returns:
        A ``(pearson_df, mcc_df)`` tuple of aggregated dataframes.

    Raises:
        KeyError: if either table lacks one of the required columns
            (species, datasets, Model, Score) after renaming.
    """
    pearson_df, mcc_df = load_raw_data()

    # --- Pearson correlations ---
    # Expect columns: species, assay_type, datasets, model_name, pearson correlation
    pearson_df = pearson_df.rename(
        columns={
            "model_name": "Model",
            "pearson correlation": "Score",
        }
    )

    # --- MCC (bed tracks) ---
    # Expect columns: species, datasets, model_name, MCC
    mcc_df = mcc_df.rename(
        columns={
            "model_name": "Model",
            "MCC": "Score",
        }
    )

    # Validate BEFORE aggregating: a groupby on a missing column raises a
    # bare KeyError, so a check placed after it could never report anything.
    required = {"species", "datasets", "Model", "Score"}
    for df_name, df in [("pearson", pearson_df), ("mcc", mcc_df)]:
        missing = required - set(df.columns)
        if missing:
            message = f"{df_name} dataframe missing columns: {missing}"
            st.error(message)
            # Same exception type the groupby would have raised, but with a
            # message that names the offending table and columns.
            raise KeyError(message)

    # assay_type is optional; group on it only when the column is present.
    pearson_group_cols = ["species", "datasets", "Model"]
    if "assay_type" in pearson_df.columns:
        pearson_group_cols.append("assay_type")

    # dropna=False keeps rows whose grouping key is NaN instead of silently
    # dropping them.
    pearson_df = (
        pearson_df
        .groupby(pearson_group_cols, as_index=False, dropna=False)["Score"]
        .mean()
    )

    # Collapse duplicates with same (species, datasets, Model)
    mcc_group_cols = ["species", "datasets", "Model"]
    mcc_df = (
        mcc_df
        .groupby(mcc_group_cols, as_index=False, dropna=False)["Score"]
        .mean()
    )

    return pearson_df, mcc_df
|
| 166 |
|
| 167 |
|
| 168 |
_PEARSON_DF, _MCC_DF = load_expanded_data()
|
|
|
|
| 299 |
|
| 300 |
|
| 301 |
def main():
|
| 302 |
+
st.title("🧬 NTv3 Benchmark")
|
| 303 |
st.markdown(_INTRO)
|
| 304 |
st.markdown(f"_Last updated: **{_LAST_UPDATED}**_")
|
| 305 |
|
|
|
|
| 326 |
|
| 327 |
# Assay toggles (Pearson only), based on filtered species
|
| 328 |
if cfg.get("has_assay_type", False):
|
| 329 |
+
st.sidebar.subheader("Assay types")
|
| 330 |
if selected_species:
|
| 331 |
df_for_assays = df_bench[df_bench["species"].isin(selected_species)]
|
| 332 |
else:
|
|
|
|
| 345 |
|
| 346 |
# Bed track / dataset toggles (MCC only), based on species selection
|
| 347 |
selected_datasets: List[str] = []
|
| 348 |
+
if benchmark_name == "MCC":
|
| 349 |
+
st.sidebar.subheader("Genome annotations")
|
| 350 |
if selected_species:
|
| 351 |
df_for_tracks = df_bench[df_bench["species"].isin(selected_species)]
|
| 352 |
else:
|
|
|
|
| 358 |
else:
|
| 359 |
selected_datasets = []
|
| 360 |
|
| 361 |
+
# Model toggles (we keep all models in MODEL_NAMES; filters + data will prune)
|
| 362 |
st.sidebar.subheader("Models")
|
| 363 |
selected_models: List[str] = []
|
| 364 |
for model in _ALL_MODELS:
|