Spaces:

InstaDeepAI
/

ntv3_benchmark

Running

App Files Files Community

MidAtBest committed on Dec 11, 2025

Commit

9fc7bf0

1 Parent(s): b59506b

feat: update with real data points

Browse files

Files changed (3) hide show

data/bed_dataset.csv +247 -13
data/bigwig_dataset.csv +0 -0
src/streamlit_app.py +115 -75

data/bed_dataset.csv CHANGED Viewed

@@ -1,13 +1,247 @@
-species,datasets,MCC
-Human,Intron,"[0.893,0.170,0.006,0.828,0.235,0.114,0.707,0.904,0.829,0.474]"
-Human,Exon,"[0.294,0.007,0.428,0.341,0.757,0.442,0.512,0.860,0.884,0.873]"
-Human,Splice_acceptor,"[0.057,0.279,0.129,0.844,0.272,0.174,0.082,0.603,0.277,0.448]"
-Human,Start_codon,"[0.719,0.304,0.482,0.019,0.302,0.942,0.924,0.982,0.982,0.161]"
-Cattle,Intron,"[0.853,0.969,0.585,0.609,0.127,0.842,0.814,0.147,0.472,0.258]"
-Cattle,Exon,"[0.163,0.118,0.152,0.525,0.179,0.967,0.574,0.897,0.593,0.454]"
-Cattle,Splice_acceptor,"[0.977,0.332,0.505,0.069,0.928,0.780,0.618,0.525,0.787,0.741]"
-Cattle,Start_codon,"[0.499,0.858,0.155,0.246,0.494,0.853,0.439,0.853,0.882,0.295]"
-Tomato,Intron,"[0.171,0.995,0.512,0.446,0.816,0.344,0.637,0.492,0.992,0.526]"
-Tomato,Exon,"[0.751,0.186,0.778,0.341,0.853,0.711,0.161,0.559,0.204,0.153]"
-Tomato,Splice_acceptor,"[0.073,0.688,0.568,0.669,0.910,0.581,0.168,0.687,0.928,0.821]"
-Tomato,Start_codon,"[0.491,0.743,0.172,0.351,0.675,0.845,0.077,0.593,0.552,0.089]"

+MCC,model_name,species,datasets
+0.334637850522995,NTv2 500M,cattle,intron
+0.1238768473267555,BPNet arch. 6M,cattle,intron
+0.383470207452774,Residual CNN 44M,cattle,intron
+0.3828243613243103,HyenaDNA 7M,cattle,intron
+0.4733810424804687,Caduceus 7M,cattle,intron
+0.4315277338027954,Evo2 1B,cattle,intron
+0.5455867648124695,NTv3 8M (pre),cattle,intron
+0.5453664064407349,NTv3 100M (pre),cattle,intron
+0.5628412365913391,NTv3 650M (pre),cattle,intron
+0.5682631134986877,NTv3 650M (post),cattle,intron
+0.3689357042312622,NTv2 500M,cattle,exon
+0.3250860869884491,BPNet arch. 6M,cattle,exon
+0.4674676060676574,Residual CNN 44M,cattle,exon
+0.2207767516374588,HyenaDNA 7M,cattle,exon
+0.4960922300815582,Caduceus 7M,cattle,exon
+0.4969632029533386,Evo2 1B,cattle,exon
+0.5432836413383484,NTv3 8M (pre),cattle,exon
+0.5531933307647705,NTv3 100M (pre),cattle,exon
+0.591151773929596,NTv3 650M (pre),cattle,exon
+0.6253225207328796,NTv3 650M (post),cattle,exon
+0.118808165192604,NTv2 500M,cattle,splice acceptor
+0.4715546369552612,BPNet arch. 6M,cattle,splice acceptor
+0.6620649099349976,Residual CNN 44M,cattle,splice acceptor
+0.104436807334423,HyenaDNA 7M,cattle,splice acceptor
+0.7064619660377502,Caduceus 7M,cattle,splice acceptor
+0.2085049450397491,Evo2 1B,cattle,splice acceptor
+0.7254849076271057,NTv3 8M (pre),cattle,splice acceptor
+0.7404072880744934,NTv3 100M (pre),cattle,splice acceptor
+0.7732946872711182,NTv3 650M (pre),cattle,splice acceptor
+0.7679624557495117,NTv3 650M (post),cattle,splice acceptor
+0.1412438601255417,NTv2 500M,cattle,start codon
+0.1490814685821533,BPNet arch. 6M,cattle,start codon
+0.3243320286273956,Residual CNN 44M,cattle,start codon
+0.056509330868721,HyenaDNA 7M,cattle,start codon
+0.3455557227134704,Caduceus 7M,cattle,start codon
+0.1030694246292114,Evo2 1B,cattle,start codon
+0.5275959968566895,NTv3 8M (pre),cattle,start codon
+0.4962065815925598,NTv3 100M (pre),cattle,start codon
+0.5591813921928406,NTv3 650M (pre),cattle,start codon
+0.5492052435874939,NTv3 650M (post),cattle,start codon
+0.5492052435874939,NTv2 500M,cattle,start codon
+0.1015273928642273,BPNet arch. 6M,cattle,intron
+0.3299930691719055,Residual CNN 44M,cattle,intron
+0.3826011121273041,HyenaDNA 7M,cattle,intron
+0.5564854741096497,Caduceus 7M,cattle,intron
+0.5564854741096497,NTv2 500M,cattle,intron
+0.323502242565155,BPNet arch. 6M,cattle,exon
+0.519285261631012,Residual CNN 44M,cattle,exon
+0.1038060635328292,HyenaDNA 7M,cattle,splice acceptor
+0.1038060635328292,Caduceus 7M,cattle,splice acceptor
+0.1038060635328292,NTv2 500M,cattle,splice acceptor
+0.4435675740242004,BPNet arch. 6M,cattle,splice acceptor
+0.6590774655342102,Residual CNN 44M,cattle,splice acceptor
+0.1038060635328292,HyenaDNA 7M,cattle,splice acceptor
+0.1038060635328292,Caduceus 7M,cattle,splice acceptor
+0.1038060635328292,NTv2 500M,cattle,splice acceptor
+0.0901669710874557,BPNet arch. 6M,cattle,start codon
+0.3548502624034881,Residual CNN 44M,cattle,start codon
+0.0545537285506725,HyenaDNA 7M,cattle,start codon
+0.0545537285506725,Caduceus 7M,cattle,start codon
+0.0639578104019165,BPNet arch. 6M,cattle,start codon
+0.3266464471817016,Residual CNN 44M,cattle,intron
+0.3266464471817016,HyenaDNA 7M,cattle,intron
+0.3266464471817016,Caduceus 7M,cattle,intron
+0.1383400112390518,BPNet arch. 6M,cattle,intron
+0.4856111407279968,Residual CNN 44M,cattle,exon
+0.4856111407279968,HyenaDNA 7M,cattle,exon
+0.4856111407279968,Caduceus 7M,cattle,exon
+0.4220209121704101,BPNet arch. 6M,cattle,splice acceptor
+0.689546525478363,Residual CNN 44M,cattle,splice acceptor
+0.689546525478363,HyenaDNA 7M,cattle,splice acceptor
+0.689546525478363,Caduceus 7M,cattle,splice acceptor
+0.0930091217160224,BPNet arch. 6M,cattle,start codon
+0.423166275024414,Residual CNN 44M,cattle,start codon
+0.423166275024414,HyenaDNA 7M,cattle,start codon
+0.423166275024414,Caduceus 7M,cattle,start codon
+0.4777896404266357,NTv2 500M,tomato,intron
+0.3216900527477264,BPNet arch. 6M,tomato,intron
+0.46840900182724,Residual CNN 44M,tomato,intron
+0.5251263380050659,PlantCAD2 88M,tomato,intron
+0.747674286365509,Evo2 1B,tomato,intron
+0.6858112812042236,NTv3 8M (pre),tomato,intron
+0.7038365006446838,NTv3 100M (pre),tomato,intron
+0.7481895685195923,NTv3 650M (pre),tomato,intron
+0.7458349466323853,NTv3 650M (post),tomato,intron
+0.6147475838661194,NTv2 500M,tomato,exon
+0.4551227986812591,BPNet arch. 6M,tomato,exon
+0.5068296194076538,Residual CNN 44M,tomato,exon
+0.7256030440330505,PlantCAD2 88M,tomato,exon
+0.7006198763847351,Evo2 1B,tomato,exon
+0.7537696361541748,NTv3 8M (pre),tomato,exon
+0.7484462857246399,NTv3 100M (pre),tomato,exon
+0.764011561870575,NTv3 650M (pre),tomato,exon
+0.7750575542449951,NTv3 650M (post),tomato,exon
+0.1691933125257492,NTv2 500M,tomato,splice acceptor
+0.125656172633171,BPNet arch. 6M,tomato,splice acceptor
+0.4359458982944488,Residual CNN 44M,tomato,splice acceptor
+0.744257926940918,PlantCAD2 88M,tomato,splice acceptor
+0.3791649639606476,Evo2 1B,tomato,splice acceptor
+0.6623862385749817,NTv3 8M (pre),tomato,splice acceptor
+0.6843105554580688,NTv3 100M (pre),tomato,splice acceptor
+0.7641868591308594,NTv3 650M (pre),tomato,splice acceptor
+0.7584431767463684,NTv3 650M (post),tomato,splice acceptor
+0.132934883236885,NTv2 500M,tomato,start codon
+0.0,BPNet arch. 6M,tomato,start codon
+0.088478960096836,Residual CNN 44M,tomato,start codon
+0.2019559442996978,PlantCAD2 88M,tomato,start codon
+0.1622217148542404,Evo2 1B,tomato,start codon
+0.2966536581516266,NTv3 8M (pre),tomato,start codon
+0.3968957066535949,NTv3 100M (pre),tomato,start codon
+0.4830105900764465,NTv3 650M (pre),tomato,start codon
+0.5007501244544983,NTv3 650M (post),tomato,start codon
+0.6770024299621582,NTv2 500M,tomato,intron
+0.2927957773208618,BPNet arch. 6M,tomato,intron
+0.1383400112390518,Residual CNN 44M,tomato,intron
+0.1383400112390518,PlantCAD2 88M,tomato,intron
+0.5751976370811462,NTv2 500M,tomato,exon
+0.3057552278041839,BPNet arch. 6M,tomato,exon
+0.168193981051445,NTv2 500M,tomato,splice acceptor
+0.0,BPNet arch. 6M,tomato,splice acceptor
+0.4833243191242218,Residual CNN 44M,tomato,splice acceptor
+0.4833243191242218,PlantCAD2 88M,tomato,splice acceptor
+0.1586925536394119,NTv2 500M,tomato,start codon
+0.0,BPNet arch. 6M,tomato,start codon
+0.1107296794652938,Residual CNN 44M,tomato,start codon
+0.1107296794652938,PlantCAD2 88M,tomato,start codon
+0.3502058088779449,BPNet arch. 6M,tomato,intron
+0.5514466166496277,Residual CNN 44M,tomato,intron
+0.5514466166496277,PlantCAD2 88M,tomato,intron
+0.3020758032798767,BPNet arch. 6M,tomato,exon
+0.4746756553649902,Residual CNN 44M,tomato,exon
+0.4746756553649902,PlantCAD2 88M,tomato,exon
+0.0,BPNet arch. 6M,tomato,splice acceptor
+0.3391502797603607,Residual CNN 44M,tomato,splice acceptor
+0.3391502797603607,PlantCAD2 88M,tomato,splice acceptor
+0.0639578104019165,BPNet arch. 6M,tomato,start codon
+0.0914037525653839,Residual CNN 44M,tomato,start codon
+0.0914037525653839,PlantCAD2 88M,tomato,start codon
+0.1995969861745834,NTv2 500M,human,intron
+0.0296161584556102,BPNet arch. 6M,human,intron
+0.2347834408283233,Residual CNN 44M,human,intron
+0.33451908826828,HyenaDNA 7M,human,intron
+0.4144788980484009,Caduceus 7M,human,intron
+0.0,Evo2 1B,human,intron
+0.4695742726325989,NTv3 8M (pre),human,intron
+0.475054919719696,NTv3 100M (pre),human,intron
+0.5504136681556702,NTv3 650M (pre),human,intron
+0.5643875002861023,NTv3 650M (post),human,intron
+0.1995969861745834,NTv2 500M,human,intron
+0.2706590592861175,BPNet arch. 6M,human,exon
+0.2678671479225158,Residual CNN 44M,human,exon
+0.179698497056961,HyenaDNA 7M,human,exon
+0.5098947286605835,Caduceus 7M,human,exon
+0.4510694444179535,Evo2 1B,human,exon
+0.6089931726455688,NTv3 8M (pre),human,exon
+0.6492856740951538,NTv3 100M (pre),human,exon
+0.6975767016410828,NTv3 650M (pre),human,exon
+0.6822624206542969,NTv3 650M (post),human,exon
+0.1493269056081771,NTv2 500M,human,splice acceptor
+0.3807527124881744,BPNet arch. 6M,human,splice acceptor
+0.6632664203643799,Residual CNN 44M,human,splice acceptor
+0.1002769619226455,HyenaDNA 7M,human,splice acceptor
+0.7357247471809387,Caduceus 7M,human,splice acceptor
+0.1821079105138778,Evo2 1B,human,splice acceptor
+0.7726271748542786,NTv3 8M (pre),human,splice acceptor
+0.77947598695755,NTv3 100M (pre),human,splice acceptor
+0.8028115034103394,NTv3 650M (pre),human,splice acceptor
+0.7979229092597961,NTv3 650M (post),human,splice acceptor
+0.139576569199562,NTv2 500M,human,start codon
+0.1334401220083236,BPNet arch. 6M,human,start codon
+0.3876807987689972,Residual CNN 44M,human,start codon
+0.1003016158938407,HyenaDNA 7M,human,start codon
+0.3958532512187958,Caduceus 7M,human,start codon
+0.1399599611759185,Evo2 1B,human,start codon
+0.540923535823822,NTv3 8M (pre),human,start codon
+0.5464004278182983,NTv3 100M (pre),human,start codon
+0.6803378462791443,NTv3 650M (pre),human,start codon
+0.7310947179794312,NTv3 650M (post),human,start codon
+0.7310947179794312,NTv2 500M,human,start codon
+0.0172978900372982,BPNet arch. 6M,human,intron
+0.2740728259086609,Residual CNN 44M,human,intron
+0.3312098085880279,HyenaDNA 7M,human,intron
+0.5108950138092041,Caduceus 7M,human,intron
+0.5034915208816528,NTv3 8M (pre),human,intron
+0.5154411792755127,NTv3 100M (pre),human,intron
+0.5814740061759949,NTv3 650M (pre),human,intron
+0.5920455455780029,NTv3 650M (post),human,intron
+0.5920455455780029,NTv2 500M,human,intron
+0.2252149283885955,BPNet arch. 6M,human,exon
+0.4010578095912933,Residual CNN 44M,human,exon
+0.1851459741592407,HyenaDNA 7M,human,exon
+0.4599409103393554,Caduceus 7M,human,exon
+0.5931490063667297,NTv3 8M (pre),human,exon
+0.6058318018913269,NTv3 100M (pre),human,exon
+0.6738048791885376,NTv3 650M (pre),human,exon
+0.6738048791885376,NTv3 650M (post),human,exon
+0.6738048791885376,NTv2 500M,human,exon
+0.3751010596752167,BPNet arch. 6M,human,splice acceptor
+0.681228756904602,Residual CNN 44M,human,splice acceptor
+0.0252278540283441,HyenaDNA 7M,human,splice acceptor
+0.7485092878341675,Caduceus 7M,human,splice acceptor
+0.7772909998893738,NTv3 8M (pre),human,splice acceptor
+0.794090747833252,NTv3 100M (pre),human,splice acceptor
+0.8239933252334595,NTv3 650M (pre),human,splice acceptor
+0.804115891456604,NTv3 650M (post),human,splice acceptor
+0.804115891456604,NTv2 500M,human,splice acceptor
+0.0,BPNet arch. 6M,human,start codon
+0.3292546272277832,Residual CNN 44M,human,start codon
+0.0647941380739212,HyenaDNA 7M,human,start codon
+0.4505241215229034,Caduceus 7M,human,start codon
+0.60422682762146,NTv3 8M (pre),human,start codon
+0.6015576124191284,NTv3 100M (pre),human,start codon
+0.6452956795692444,NTv3 650M (pre),human,start codon
+0.6761345267295837,NTv3 650M (post),human,start codon
+0.0185965970158576,BPNet arch. 6M,human,intron
+0.2623045742511749,Residual CNN 44M,human,intron
+0.2623045742511749,HyenaDNA 7M,human,intron
+0.2623045742511749,Caduceus 7M,human,intron
+0.4804849028587341,NTv3 8M (pre),human,intron
+0.482195496559143,NTv3 100M (pre),human,intron
+0.5425574779510498,NTv3 650M (pre),human,intron
+0.5443048477172852,NTv3 650M (post),human,intron
+0.2360571771860122,BPNet arch. 6M,human,exon
+0.2360571771860122,Residual CNN 44M,human,exon
+0.2360571771860122,HyenaDNA 7M,human,exon
+0.2360571771860122,Caduceus 7M,human,exon
+0.6339762210845947,NTv3 8M (pre),human,exon
+0.6433913111686707,NTv3 100M (pre),human,exon
+0.6518793702125549,NTv3 650M (pre),human,exon
+0.6812491416931152,NTv3 650M (post),human,exon
+0.3842235207557678,BPNet arch. 6M,human,splice acceptor
+0.6810190081596375,Residual CNN 44M,human,splice acceptor
+0.6810190081596375,HyenaDNA 7M,human,splice acceptor
+0.6810190081596375,Caduceus 7M,human,splice acceptor
+0.7796080708503723,NTv3 8M (pre),human,splice acceptor
+0.7596970200538635,NTv3 100M (pre),human,splice acceptor
+0.7915040850639343,NTv3 650M (pre),human,splice acceptor
+0.7957100868225098,NTv3 650M (post),human,splice acceptor
+0.1114460304379463,BPNet arch. 6M,human,start codon
+0.3342535495758056,Residual CNN 44M,human,start codon
+0.3342535495758056,HyenaDNA 7M,human,start codon
+0.3342535495758056,Caduceus 7M,human,start codon
+0.5167152881622314,NTv3 8M (pre),human,start codon
+0.5340564250946045,NTv3 100M (pre),human,start codon
+0.6148532032966614,NTv3 650M (pre),human,start codon
+0.6582212448120117,NTv3 650M (post),human,start codon

data/bigwig_dataset.csv CHANGED Viewed

The diff for this file is too large to render. See raw diff

src/streamlit_app.py CHANGED Viewed

@@ -1,5 +1,4 @@
 from typing import List
-import ast
 import os
 import pandas as pd
@@ -10,47 +9,73 @@ import plotly.express as px
 # Page config (must be the first Streamlit command)
 # ---------------------------------------------------------------------
 st.set_page_config(
-    page_title="Custom Model Benchmarks",
     layout="wide",
 )
 # ---------------------------------------------------------------------
 # Configuration
 # ---------------------------------------------------------------------
-MODEL_NAMES = [
-    "NTv2 500M MS",
-    "BPNet 6M",
-    "SpliceAI 44M",
-    "PlantCAD2 - Small 88M",
-    "Evo2 1b BF16",
-    "NTv3 8M",
-    "NTv3 100M",
-    "NTv3 650M",
-    "NTv3 650M - post-trained",
-]
 MODEL_COLORS = {
-    "NTv2 500M MS": "#1f77b4",
-    "BPNet 6M": "#ff7f0e",
-    "SpliceAI 44M": "#2ca02c",
-    "PlantCAD2 - Small 88M": "#d62728",
-    "Evo2 1b BF16": "#9467bd",
-    "NTv3 8M": "#8c564b",
-    "NTv3 100M": "#e377c2",
-    "NTv3 650M": "#7f7f7f",
-    "NTv3 650M - post-trained": "#bcbd22",
 }
 _LAST_UPDATED = "Dec 10, 2025"
 _INTRO = """
-Simple leaderboard over custom benchmarks.
 - **Pearson correlations (multi-assay)**: per-dataset scores across species and models.
 - **MCC (bed tracks)**: per-track MCC values across species and models.
-Each metric cell in the CSVs is a list of scores (one per model).
-We expand this to (Model × Species × Dataset) and aggregate according to your filters.
 """
 HERE = os.path.dirname(os.path.abspath(__file__))  # /app/src
@@ -59,6 +84,7 @@ DATA_DIR = os.path.join(PROJECT_ROOT, "data")
 PEARSON_PATH = os.path.join(DATA_DIR, "bigwig_dataset.csv")
 MCC_PATH = os.path.join(DATA_DIR, "bed_dataset.csv")
 # ---------------------------------------------------------------------
 # Data loading & preprocessing
 # ---------------------------------------------------------------------
@@ -72,57 +98,71 @@ def load_raw_data():
     pearson_df.columns = [c.strip() for c in pearson_df.columns]
     mcc_df.columns = [c.strip() for c in mcc_df.columns]
-    # Optional: basic sanity check on required columns
-    # required_p = {"species", "datasets", "pearson correlation"}
-    # required_m = {"species", "datasets", "MCC"}
-    # missing_p = required_p - set(pearson_df.columns)
-    # missing_m = required_m - set(mcc_df.columns)
-    # if missing_p:
-    #     st.error(f"Pearson CSV missing columns: {missing_p}")
-    # if missing_m:
-    #     st.error(f"MCC CSV missing columns: {missing_m}")
     return pearson_df, mcc_df
-def expand_metric_lists(df: pd.DataFrame, metric_col: str) -> pd.DataFrame:
     """
-    Take a DataFrame where `metric_col` is a stringified list, and expand it
-    into rows per Model, with scalar 'Score' and 'Model' columns.
     """
-    rows = []
-    for _, row in df.iterrows():
-        raw = row[metric_col]
-        try:
-            values = ast.literal_eval(str(raw))
-        except Exception:
-            # Skip rows that don't parse correctly
-            continue
-        if not isinstance(values, (list, tuple)):
-            continue
-        n_models = min(len(MODEL_NAMES), len(values))
-        for i in range(n_models):
-            new_row = {
-                "species": row["species"],
-                "datasets": row["datasets"],
-                "Model": MODEL_NAMES[i],
-                "Score": float(values[i]),
-            }
-            if "assay_type" in row.index:
-                new_row["assay_type"] = row["assay_type"]
-            rows.append(new_row)
-    return pd.DataFrame(rows)
-@st.cache_data
-def load_expanded_data():
-    raw_pearson, raw_mcc = load_raw_data()
-    pearson_expanded = expand_metric_lists(raw_pearson, "pearson correlation")
-    mcc_expanded = expand_metric_lists(raw_mcc, "MCC")
-    return pearson_expanded, mcc_expanded
 _PEARSON_DF, _MCC_DF = load_expanded_data()
@@ -259,7 +299,7 @@ def sidebar_toggle(label: str, value: bool = False, key: str | None = None) -> b
 def main():
-    st.title("🧬 Custom Model Benchmarks")
     st.markdown(_INTRO)
     st.markdown(f"_Last updated: **{_LAST_UPDATED}**_")
@@ -286,7 +326,7 @@ def main():
     # Assay toggles (Pearson only), based on filtered species
     if cfg.get("has_assay_type", False):
-        st.sidebar.subheader("Assay types (Pearson only)")
         if selected_species:
             df_for_assays = df_bench[df_bench["species"].isin(selected_species)]
         else:
@@ -305,8 +345,8 @@ def main():
     # Bed track / dataset toggles (MCC only), based on species selection
     selected_datasets: List[str] = []
-    if benchmark_name == "MCC (bed tracks)":
-        st.sidebar.subheader("Bed tracks (datasets)")
         if selected_species:
             df_for_tracks = df_bench[df_bench["species"].isin(selected_species)]
         else:
@@ -318,7 +358,7 @@ def main():
     else:
         selected_datasets = []
-    # Model toggles (we keep all models, regardless of benchmark; filters will prune)
     st.sidebar.subheader("Models")
     selected_models: List[str] = []
     for model in _ALL_MODELS:

 from typing import List
 import os
 import pandas as pd
 # Page config (must be the first Streamlit command)
 # ---------------------------------------------------------------------
 st.set_page_config(
+    page_title="NTv3 Benchmark",
     layout="wide",
 )
 # ---------------------------------------------------------------------
 # Configuration
 # ---------------------------------------------------------------------
+COLORS = {
+    # Primary colors 1 (our models)
+    'blue_0': '#004697',     # Darkest allowable blue
+    'blue_1': '#3973fc',     # Main blue
+    'blue_2': '#7ea4fc',     # Medium blue
+    'blue_3': '#c3d5fc',     # Light blue (lightest allowable blue)
+    # Secondary colors 1
+    'red_1': '#ff554d',      # Medium red
+    'red_2': '#ffe0de',      # Light red
+    # Primary colors 2
+    'green_1': '#00b050',    # Darkest green
+    'green_2': '#92d050',    # Medium green
+    'green_3': '#c6e0b4',    # Light green (lightest allowable green)
+    # Secondary colors 2
+    'gold_1': '#fdb932',
+    # Tertiary colors
+    'orange_1': '#ff975e',
+    'purple_1': '#9a6ce4',
+    'purple_2': '#bb9aef',     # Medium purple
+    'purple_3': '#ceb5f5',     # Light purple (lightest allowable purple)
+    # Grays (other models)
+    'gray_1': '#808080',     # Darkest gray (use as a last resort)
+    'gray_2': '#b3b3b3',     # Medium gray (start with this as the darkest when possible)
+    'gray_3': '#e6e6e6',     # Lightest gray
+    'gray_4': '#ffffff',     # It's actually just white (use as a last resort)
+    # If all other options are exhausted
+    'cyan_1': '#0096b4',     # Darkest teal
+    'cyan_2': '#28bed2',     # Medium cyan
+    'cyan_3': '#8cdceb',     # Lightest cyan
+    'magenta_1': '#b428a0',  # Darkest magenta
+    'magenta_2': '#dc50be',  # Medium pink
+    'magenta_3': '#f5a0dc',  # Lightest pink
+    'yellow_1': '#c8aa00',   # Darkest yellow
+    'yellow_2': '#ffd200',   # Medium yellow
+    'yellow_3': '#fff08c',   # Lightest yellow
+}
 MODEL_COLORS = {
+    "NTv3 650M (post)": COLORS['blue_0'],
+    'NTv3 650M (pre)': COLORS['blue_1'],    # #3973fc (Main blue)
+    'NTv3 100M (pre)': COLORS['blue_2'],    # #7ea4fc (Medium blue)
+    'NTv3 8M (pre)': COLORS['blue_3'],      # #c3d5fc (Light blue)
+    'Evo2 1B': COLORS['green_3'],      # #c6e0b4 (Light green)
+    "NTv2 500M": COLORS['gray_1'],
+    "BPNet arch. 6M": COLORS['cyan_1'],
+    "Residual CNN 44M":  COLORS['magenta_1'],
+    "PlantCAD2 88M": COLORS["purple_1"],
+    "Caduceus 7M": COLORS["purple_2"]
 }
+MODEL_NAMES = list(MODEL_COLORS.keys())
 _LAST_UPDATED = "Dec 10, 2025"
 _INTRO = """
+Benchmark across gene annotation and functional tracks.
 - **Pearson correlations (multi-assay)**: per-dataset scores across species and models.
 - **MCC (bed tracks)**: per-track MCC values across species and models.
+These tasks measure the model's ability to generalize to unseen tracks, species and assay types.
 """
 HERE = os.path.dirname(os.path.abspath(__file__))  # /app/src
 PEARSON_PATH = os.path.join(DATA_DIR, "bigwig_dataset.csv")
 MCC_PATH = os.path.join(DATA_DIR, "bed_dataset.csv")
 # ---------------------------------------------------------------------
 # Data loading & preprocessing
 # ---------------------------------------------------------------------
     pearson_df.columns = [c.strip() for c in pearson_df.columns]
     mcc_df.columns = [c.strip() for c in mcc_df.columns]
     return pearson_df, mcc_df
+@st.cache_data
+def load_expanded_data():
     """
+    Load data in the new format where each row is already:
+      (species, [assay_type], datasets, model_name, metric)
+    and convert into a unified schema:
+      species, assay_type?, datasets, Model, Score
+    For Pearson:
+      If multiple rows share (species, assay_type, datasets, Model),
+      we average their Score.
+    For MCC:
+      If multiple rows share (species, datasets, Model),
+      we average their Score.
     """
+    pearson_df, mcc_df = load_raw_data()
+    # --- Pearson correlations ---
+    # Expect columns: species, assay_type, datasets, model_name, pearson correlation
+    pearson_df = pearson_df.rename(
+        columns={
+            "model_name": "Model",
+            "pearson correlation": "Score",
+        }
+    )
+    pearson_group_cols = ["species", "datasets", "Model"]
+    if "assay_type" in pearson_df.columns:
+        pearson_group_cols.append("assay_type")
+    pearson_df = (
+        pearson_df
+        .groupby(pearson_group_cols, as_index=False, dropna=False)["Score"]
+        .mean()
+    )
+    # --- MCC (bed tracks) ---
+    # Expect columns: species, datasets, model_name, MCC
+    mcc_df = mcc_df.rename(
+        columns={
+            "model_name": "Model",
+            "MCC": "Score",
+        }
+    )
+    # Collapse duplicates with same (species, datasets, Model)
+    mcc_group_cols = ["species", "datasets", "Model"]
+    mcc_df = (
+        mcc_df
+        .groupby(mcc_group_cols, as_index=False, dropna=False)["Score"]
+        .mean()
+    )
+    # Optional sanity checks
+    for df_name, df in [("pearson", pearson_df), ("mcc", mcc_df)]:
+        required = {"species", "datasets", "Model", "Score"}
+        missing = required - set(df.columns)
+        if missing:
+            st.error(f"{df_name} dataframe missing columns: {missing}")
+    return pearson_df, mcc_df
 _PEARSON_DF, _MCC_DF = load_expanded_data()
 def main():
+    st.title("🧬 NTv3 Benchmark")
     st.markdown(_INTRO)
     st.markdown(f"_Last updated: **{_LAST_UPDATED}**_")
     # Assay toggles (Pearson only), based on filtered species
     if cfg.get("has_assay_type", False):
+        st.sidebar.subheader("Assay types")
         if selected_species:
             df_for_assays = df_bench[df_bench["species"].isin(selected_species)]
         else:
     # Bed track / dataset toggles (MCC only), based on species selection
     selected_datasets: List[str] = []
+    if benchmark_name == "MCC":
+        st.sidebar.subheader("Genome annotations")
         if selected_species:
             df_for_tracks = df_bench[df_bench["species"].isin(selected_species)]
         else:
     else:
         selected_datasets = []
+    # Model toggles (we keep all models in MODEL_NAMES; filters + data will prune)
     st.sidebar.subheader("Models")
     selected_models: List[str] = []
     for model in _ALL_MODELS: