import os
from typing import List

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from plotly.subplots import make_subplots

# ---------------------------------------------------------------------
# Page config (must be the first Streamlit command)
# ---------------------------------------------------------------------
st.set_page_config(
    page_title="NTv3 Benchmark",
    layout="wide",
)
# ---------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------
COLORS = {
    # Primary colors 1 (our models)
    'blue_0': '#004697',  # Darkest allowable blue
    'blue_1': '#3973fc',  # Main blue
    'blue_2': '#7ea4fc',  # Medium blue
    'blue_3': '#c3d5fc',  # Light blue (lightest allowable blue)
    # Secondary colors 1
    'red_1': '#ff554d',  # Medium red
    'red_2': '#ffe0de',  # Light red
    # Primary colors 2
    'green_1': '#00b050',  # Darkest green
    'green_2': '#92d050',  # Medium green
    'green_3': '#c6e0b4',  # Light green (lightest allowable green)
    # Secondary colors 2
    'gold_1': '#fdb932',
    # Tertiary colors
    'orange_1': '#ff975e',
    'purple_1': '#9a6ce4',
    'purple_2': '#bb9aef',  # Medium purple
    'purple_3': '#ceb5f5',  # Light purple (lightest allowable purple)
    # Grays (other models)
    'gray_1': '#808080',  # Darkest gray (use as a last resort)
    'gray_2': '#b3b3b3',  # Medium gray (start with this as the darkest when possible)
    'gray_3': '#e6e6e6',  # Lightest gray
    'gray_4': '#ffffff',  # Plain white (use as a last resort)
    # If all other options are exhausted
    'cyan_1': '#0096b4',  # Darkest cyan
    'cyan_2': '#28bed2',  # Medium cyan
    'cyan_3': '#8cdceb',  # Lightest cyan
    'magenta_1': '#b428a0',  # Darkest magenta
    'magenta_2': '#dc50be',  # Medium magenta
    'magenta_3': '#f5a0dc',  # Lightest magenta
    'yellow_1': '#c8aa00',  # Darkest yellow
    'yellow_2': '#ffd200',  # Medium yellow
    'yellow_3': '#fff08c',  # Lightest yellow
}
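
# Note: plot_breakdown_facets_sorted_models() below falls back to a neutral gray
# ("#808080") for any model missing from MODEL_COLORS, so an explicit entry is
# optional for that plot but needed for consistent colors in the other figures.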
ASSAY_TYPE_MAPPING = {
    'ATAC-seq': 'chromatin accessibility',
    'DNase-seq': 'chromatin accessibility',
    'Histone ChIP-seq': 'histone modifications',
    'TF ChIP-seq': 'chromatin accessibility',  # grouped with the accessibility tracks in this benchmark
    'PRO-cap': 'transcription initiation',
    'eCLIP': 'RNA binding sites',
    'RNA-seq': 'gene expression',
    'ribo-seq': 'mRNA translation',
    'Annotation': 'genome annotation',
    "Exon": "exon",
    "Intron": "intron",
    "Splice acceptor": "splice acceptor",
    "Start codon": "start codon",
}
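
# Raw assay names map to display categories, e.g.
# ASSAY_TYPE_MAPPING["ATAC-seq"] -> "chromatin accessibility"; anything unmapped
# is bucketed as "Other" in load_expanded_data() and drawn with the fallback
# color registered below.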
ASSAY_COLORS = {
    'chromatin accessibility': '#004697',
    'histone modifications': '#cc0000',
    'transcription initiation': '#ff9900',
    'RNA binding sites': '#9933cc',
    'gene expression': '#009900',
    'mRNA translation': '#ff6699',
    'genome annotation': '#ffcc00',
    "intron": '#004697',
    "exon": '#cc0000',
    "splice acceptor": '#ff9900',
    "start codon": '#9933cc',
}
# Key must match the "Other" fill value used in load_expanded_data().
ASSAY_COLORS["Other"] = "#808080"
MODEL_COLORS = {
    "NTv3 650M (pos)": COLORS['blue_0'],  # #004697 (Darkest blue)
    'NTv3 650M (pre)': COLORS['blue_1'],  # #3973fc (Main blue)
    'NTv3 100M (pre)': COLORS['blue_2'],  # #7ea4fc (Medium blue)
    'NTv3 8M (pre)': COLORS['blue_3'],    # #c3d5fc (Light blue)
    'Evo2 1B': COLORS['green_3'],         # #c6e0b4 (Light green)
    "NTv2 500M": COLORS['gray_1'],
    "BPNet arch. 6M": COLORS['cyan_1'],
    "Residual CNN 44M": COLORS['magenta_1'],
    "PlantCAD2 88M": COLORS["purple_1"],
    "Caduceus 7M": COLORS["purple_2"],
    "HyenaDNA 7M": COLORS["yellow_2"],
}
MODEL_TRAINING_STATUS = {
    "NTv3 650M (pos)": "POS",
    "NTv3 650M (pre)": "PRE",
    "NTv3 100M (pre)": "PRE",
    "NTv3 8M (pre)": "PRE",
    "Residual CNN 44M": "SCRATCH",
    "Caduceus 7M": "PRE",
    "Evo2 1B": "PRE",
    "NTv2 500M": "PRE",
    "BPNet arch. 6M": "SCRATCH",
    "PlantCAD2 88M": "PRE",
    "HyenaDNA 7M": "PRE",
}

MODEL_GPU_MULTIPLIER = {
    "Evo2 1B": 8,  # trained on 8 GPUs
}
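
# build_convergence_df() multiplies mean wall-clock hours by this factor to get
# GPU hours, e.g. 10 h of wall-clock time on Evo2 1B's 8 GPUs -> 80 GPU hours.
# Models not listed default to a multiplier of 1.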
MODEL_NAMES = list(MODEL_COLORS.keys())

PLANT_SPECIES = ["tomato", "rice", "maize", "arabidopsis"]
ANIMAL_SPECIES = ["human", "chicken", "cattle"]
SPECIES_GROUPS = {
    "Plants": PLANT_SPECIES,
    "Animals": ANIMAL_SPECIES,
}
_LAST_UPDATED = "Dec 10, 2025"

_INTRO = """
The **NTv3 Benchmark** is a curated benchmark of 106 long-range genomic datasets
designed to evaluate models under realistic 32 kb input, single-base-pair output settings.
The dataset spans two complementary task families: genome annotation (exon, intron, splice acceptor, start codon)
and functional-regulatory prediction, which includes diverse experimental tracks such as chromatin accessibility,
histone modifications, transcription initiation (PRO-cap), RNA binding (eCLIP), gene expression (RNA-seq),
and translation (Ribo-seq).

Data are drawn from a phylogenetically diverse set of species, including organisms seen during post-training
(human, chicken, arabidopsis, rice, maize) and entirely unseen species (cattle, tomato), with careful curation
to avoid data leakage. This design allows the dataset to probe long-range sequence-to-function mapping,
cross-species generalization, and transfer across heterogeneous regulatory modalities,
including assays not present in prior multispecies training corpora. By standardizing sequence length,
resolution, and evaluation metrics across all tracks, the NTv3 Benchmark provides a controlled dataset
for comparing representation quality across genomic foundation models.

The metrics used are:
- **Pearson correlations (multi-assay)**: per-dataset scores across species and models for functional tracks.
- **MCC (bed tracks)**: per-track MCC values across species and models for gene annotation tracks.
"""
HERE = os.path.dirname(os.path.abspath(__file__))  # /app/src
PROJECT_ROOT = os.path.dirname(HERE)                # /app
DATA_DIR = os.path.join(PROJECT_ROOT, "data")
SINGLE_TABLE_PATH = os.path.join(DATA_DIR, "ntv3_benchmark_results.csv")

# ---------------------------------------------------------------------
# Data loading & preprocessing
# ---------------------------------------------------------------------
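# Hedged addition: cache the CSV read across Streamlit reruns. st.cache_data
# requires Streamlit >= 1.18; drop this decorator when targeting older runtimes.
@st.cache_data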
def load_raw_data():
    df = pd.read_csv(SINGLE_TABLE_PATH)
    df.columns = [c.strip() for c in df.columns]
    return df

def _normalize_training_time_to_gpu_hours(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize the `running_time` column to hours. Sample values look like
    seconds (e.g. 317034 ~= 88 hours), so large values are converted.
    """
    if "running_time" not in df.columns:
        return df
    rt = pd.to_numeric(df["running_time"], errors="coerce")
    # Heuristic: if the median is huge, the values are probably seconds -> hours
    if rt.dropna().median() > 10_000:
        df["GPU hours"] = rt / 3600.0
    else:
        df["GPU hours"] = rt.astype(float)
    return df
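
# Example: a running_time of 317034 sits far above the 10_000 cutoff, so it is
# treated as seconds and becomes 317034 / 3600 ~= 88.1 GPU hours.
# (Note: this helper is currently not called anywhere in this module.)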

def _best_step_time_to_hours(s: pd.Series) -> pd.Series:
    """
    Converts strings like '3 days 04:26:26.467000' to hours (float).
    Relies on pandas Timedelta parsing.
    """
    td = pd.to_timedelta(s, errors="coerce")
    return td.dt.total_seconds() / 3600.0
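
# Example: _best_step_time_to_hours(pd.Series(["3 days 04:26:26"]))
# -> 76.44 hours (72 h + 4 h 26 min 26 s); unparseable strings become NaN.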

def load_expanded_data():
    df = load_raw_data().copy()
    df = df.rename(columns={"Metric": "Score", "model_name": "Model"})
    df["Score"] = pd.to_numeric(df["Score"], errors="coerce")
    if "best_step" in df.columns:
        df["best_step"] = pd.to_numeric(df["best_step"], errors="coerce")
    if "best_step_time" in df.columns:
        df["best_step_time_hours"] = _best_step_time_to_hours(df["best_step_time"])
    else:
        df["best_step_time_hours"] = np.nan

    if "assay_type" in df.columns:
        is_annot = df["assay_type"].astype(str).eq("Annotation")
    else:
        is_annot = pd.Series(False, index=df.index)
    pearson_raw = df[~is_annot].copy()
    mcc_raw = df[is_annot].copy()

    # -------------------------
    # Functional Tracks (Pearson)
    # -------------------------
    pearson_group_cols = ["species", "datasets", "Model"]
    if "assay_type" in pearson_raw.columns:
        pearson_group_cols.append("assay_type")
    pearson_df = (
        pearson_raw
        .groupby(pearson_group_cols, as_index=False, dropna=False)
        .agg({
            "Score": "mean",
            "best_step": "mean",
            "best_step_time_hours": "mean",
        })
    )
    # ✅ merge track_name_clean WHILE assay_type is still raw
    if "track_name_clean" in pearson_raw.columns:
        map_keys = ["species", "datasets"]
        if "assay_type" in pearson_raw.columns:
            map_keys.append("assay_type")
        track_map = (
            pearson_raw[map_keys + ["track_name_clean"]]
            .dropna(subset=["track_name_clean"])
            .drop_duplicates()
        )
        pearson_df = pearson_df.merge(track_map, on=map_keys, how="left")
    # ✅ now it's safe to map assay_type to categories
    if "assay_type" in pearson_df.columns:
        pearson_df["assay_type"] = (
            pearson_df["assay_type"].astype(str).map(ASSAY_TYPE_MAPPING).fillna("Other")
        )

    # -------------------------
    # Genome Annotation (MCC)
    # -------------------------
    mcc_df = (
        mcc_raw
        .groupby(["species", "datasets", "Model"], as_index=False, dropna=False)
        .agg({
            "Score": "mean",
            "best_step": "mean",
            "best_step_time_hours": "mean",
        })
    )
    return pearson_df, mcc_df
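
# Resulting frames (one row per species/dataset/model combination):
#   pearson_df: species, datasets, Model, assay_type (mapped to display
#               categories), track_name_clean (when present in the CSV),
#               Score, best_step, best_step_time_hours
#   mcc_df:     species, datasets, Model, Score, best_step, best_step_time_hours
# Note: the aggregation assumes best_step / best_step_time exist in the CSV.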

_PEARSON_DF, _MCC_DF = load_expanded_data()

# Global sets (further filtered per-benchmark below)
_ALL_SPECIES = sorted(
    set(_PEARSON_DF["species"].unique()).union(_MCC_DF["species"].unique())
)
_ALL_ASSAYS = (
    sorted(_PEARSON_DF["assay_type"].dropna().unique())
    if "assay_type" in _PEARSON_DF.columns
    else []
)
_ALL_MODELS = MODEL_NAMES[:]

_BENCHMARKS = {
    "Functional Tracks": {
        "df": _PEARSON_DF,
        "metric_label": "Pearson correlation",
        "has_assay_type": True,
    },
    "Genome Annotation": {
        "df": _MCC_DF,
        "metric_label": "MCC",
        "has_assay_type": False,
    },
}

# ---------------------------------------------------------------------
# Computation helpers
# ---------------------------------------------------------------------
def filter_base_df(
    benchmark_name: str,
    selected_species: List[str],
    selected_assays: List[str],
    selected_models: List[str],
    selected_datasets: List[str],
) -> pd.DataFrame:
    cfg = _BENCHMARKS[benchmark_name]
    df = cfg["df"].copy()
    # Species filter
    if selected_species:
        df = df[df["species"].isin(selected_species)]
    # Assay type filter (Pearson only)
    if cfg.get("has_assay_type", False) and selected_assays and "assay_type" in df.columns:
        df = df[df["assay_type"].isin(selected_assays)]
    # Dataset / bed track filter (used for MCC, but safe to apply generally)
    if selected_datasets and "datasets" in df.columns:
        df = df[df["datasets"].isin(selected_datasets)]
    # Model filter
    if selected_models:
        df = df[df["Model"].isin(selected_models)]
    return df
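
# Usage sketch (hypothetical filter values): keep human gene-expression rows
# for a single model; empty lists mean "no filtering" on that dimension.
# filter_base_df("Functional Tracks", ["human"], ["gene expression"],
#                ["NTv3 650M (pre)"], [])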

def build_leaderboard(
    benchmark_name: str,
    selected_species: List[str],
    selected_assays: List[str],
    selected_models: List[str],
    selected_datasets: List[str],
) -> pd.DataFrame:
    df = filter_base_df(
        benchmark_name,
        selected_species,
        selected_assays,
        selected_models,
        selected_datasets,
    )
    if df.empty:
        return pd.DataFrame(columns=["Model", "Training", "Num entries", "Mean score"])
    agg = (
        df.groupby("Model")["Score"]
        .mean()
        .reset_index()
        .rename(columns={"Score": "Mean score"})
    )
    agg["Mean score"] = agg["Mean score"].round(3)
    agg["Num entries"] = (
        df.groupby("Model")["Score"].count().reindex(agg["Model"]).values
    )
    # 👇 Add training regime column
    agg["Training"] = agg["Model"].map(MODEL_TRAINING_STATUS).fillna("UNKNOWN")
    # Sort by performance
    agg = agg.sort_values("Mean score", ascending=False).reset_index(drop=True)
    # Column order
    agg = agg[["Model", "Training", "Num entries", "Mean score"]]
    # Ensure the index starts at 1 (rank)
    agg.index += 1
    return agg
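
# Shape of the result (placeholder values; actual rows depend on the CSV):
#        Model          Training  Num entries  Mean score
#    1   <best model>   PRE       ...          ...
#    2   <runner-up>    ...       ...          ...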

def build_bar_df(
    benchmark_name: str,
    selected_species: List[str],
    selected_assays: List[str],
    selected_models: List[str],
    selected_datasets: List[str],
) -> pd.DataFrame:
    """For now, one bar per model (same aggregation as the leaderboard)."""
    return build_leaderboard(
        benchmark_name, selected_species, selected_assays, selected_models, selected_datasets
    )

def build_category_model_df(
    benchmark_name: str,
    selected_species: List[str],
    selected_assays: List[str],
    selected_models: List[str],
    selected_datasets: List[str],
) -> pd.DataFrame:
    """
    Mean score per (Category, Model) after applying the same filters.
    Category = assay_type (Functional Tracks) or datasets (Genome Annotation).
    """
    cfg = _BENCHMARKS[benchmark_name]
    df = filter_base_df(
        benchmark_name,
        selected_species,
        selected_assays,
        selected_models,
        selected_datasets,
    )
    if df.empty:
        return pd.DataFrame(columns=["Category", "Model", "Mean score"])
    # Pick the right breakdown column
    if cfg.get("has_assay_type", False) and "assay_type" in df.columns:
        category_col = "assay_type"
        category_label = "Assay type"
    else:
        category_col = "datasets"
        category_label = "Dataset"
    if category_col not in df.columns:
        return pd.DataFrame(columns=["Category", "Model", "Mean score"])
    out = (
        df.groupby([category_col, "Model"], as_index=False)["Score"]
        .mean()
        .rename(columns={category_col: "Category", "Score": "Mean score"})
    )
    out["Mean score"] = out["Mean score"].round(3)
    out.attrs["category_label"] = category_label  # for a nicer axis title
    return out
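
# Caveat: DataFrame.attrs propagation is not guaranteed across subsequent
# pandas operations (copies, merges, groupbys), so read "category_label"
# straight after this call if it is needed.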

def plot_breakdown_facets_sorted_models(
    breakdown_df: pd.DataFrame,
    metric_label: str,
    height: int = 420,
):
    categories = sorted(breakdown_df["Category"].dropna().unique())
    n = len(categories)
    if n == 0:
        return None
    rows = 1
    cols = n  # 👈 everything in one row
    # Global y-range (consistent scale)
    y_min = breakdown_df["Mean score"].min()
    y_max = breakdown_df["Mean score"].max()
    pad = 0.05 * (y_max - y_min if y_max > y_min else 1.0)
    y_range = [y_min - pad, y_max + pad]
    fig = make_subplots(
        rows=rows,
        cols=cols,
        subplot_titles=categories,
        shared_yaxes=True,
        horizontal_spacing=0.04,  # tighter spacing
    )
    for i, cat in enumerate(categories):
        r = (i // cols) + 1
        c = (i % cols) + 1
        sub = (
            breakdown_df[breakdown_df["Category"] == cat]
            .sort_values("Mean score", ascending=True)
        )
        fig.add_trace(
            go.Bar(
                x=sub["Model"],
                y=sub["Mean score"],
                marker_color=[MODEL_COLORS.get(m, "#808080") for m in sub["Model"]],
                showlegend=False,
            ),
            row=r,
            col=c,
        )
        fig.update_xaxes(showticklabels=False, title_text="", row=r, col=c)
        fig.update_yaxes(range=y_range, title_text="", row=r, col=c)  # 👈 apply range
    fig.update_layout(
        height=height,
        plot_bgcolor="rgba(0,0,0,0)",
        paper_bgcolor="rgba(0,0,0,0)",
        margin=dict(t=60, l=10, r=10, b=10),
    )
    # Single y-axis label on the leftmost panel
    fig.update_yaxes(title_text=metric_label, row=1, col=1)
    return fig

def build_pairwise_scatter_df(
    benchmark_name: str,
    selected_species: List[str],
    selected_assays: List[str],
    selected_models: List[str],
    selected_datasets: List[str],
    model_a: str,
    model_b: str,
) -> pd.DataFrame:
    cfg = _BENCHMARKS[benchmark_name]
    models_for_filter = (
        list(set(selected_models + [model_a, model_b]))
        if selected_models else [model_a, model_b]
    )
    df = filter_base_df(
        benchmark_name,
        selected_species,
        selected_assays,
        models_for_filter,
        selected_datasets,
    )
    if df.empty:
        return pd.DataFrame()
    # ---- define "track identity" for head-to-head ----
    # Always use datasets for the identity (x/y points)
    track_cols = ["datasets"]
    if cfg.get("has_assay_type", False) and "assay_type" in df.columns:
        track_cols = ["assay_type", "datasets"]
    keep_species = "species" in df.columns and (selected_species is None or len(selected_species) != 1)
    id_cols = (["species"] if keep_species else []) + track_cols
    wide = (
        df[df["Model"].isin([model_a, model_b])]
        .pivot_table(index=id_cols, columns="Model", values="Score", aggfunc="mean")
        .reset_index()
    )
    if model_a not in wide.columns or model_b not in wide.columns:
        return pd.DataFrame()
    wide = wide.dropna(subset=[model_a, model_b])
    # Nice display label: use datasets (not track_name_clean)
    if "assay_type" in wide.columns:
        wide["Track"] = wide["assay_type"].astype(str) + " / " + wide["datasets"].astype(str)
    else:
        wide["Track"] = wide["datasets"].astype(str)
    wide = wide.rename(columns={model_a: "Model A", model_b: "Model B"})
    # ---- Pearson-only: merge track_name_clean for hover ----
    if benchmark_name == "Functional Tracks" and "track_name_clean" in df.columns:
        merge_keys = id_cols.copy()  # species? + assay_type? + datasets
        track_map = (
            df[merge_keys + ["track_name_clean"]]
            .dropna(subset=["track_name_clean"])
            .drop_duplicates()
        )
        wide = wide.merge(track_map, on=merge_keys, how="left")
    return wide
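
# The wide frame holds one row per track, with "Model A" / "Model B" columns
# carrying each model's mean Score, ready for the head-to-head scatter.
# Caveat: if one id key maps to several track_name_clean values, the hover
# merge above can duplicate scatter points.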

def build_violin_df(
    benchmark_name: str,
    selected_species: List[str],
    selected_assays: List[str],
    selected_models: List[str],
    selected_datasets: List[str],
) -> pd.DataFrame:
    # Use the same base filtering, but keep all per-track rows
    df = filter_base_df(
        benchmark_name,
        selected_species,
        selected_assays,
        selected_models,
        selected_datasets,
    )
    # Keep only the needed columns
    keep = ["Model", "Score"]
    for col in ["species", "assay_type", "datasets"]:
        if col in df.columns:
            keep.append(col)
    return df[keep].copy()

def build_convergence_df(
    benchmark_name: str,
    selected_species: List[str],
    selected_assays: List[str],
    selected_models: List[str],
    selected_datasets: List[str],
    x_mode: str = "GPU (hours)",  # "GPU (hours)" | "Steps (billions)" (UI labels)
) -> pd.DataFrame:
    df = filter_base_df(
        benchmark_name,
        selected_species,
        selected_assays,
        selected_models,
        selected_datasets,
    )
    if df.empty:
        return pd.DataFrame(columns=["Model", "X", "Performance"])
    # Mean performance per model
    out = (
        df.groupby("Model", as_index=False)
        .agg({"Score": "mean"})
        .rename(columns={"Score": "Performance"})
    )
    # -------------------------
    # X axis selection
    # -------------------------
    if x_mode == "Steps (billions)":
        if "best_step" not in df.columns:
            return pd.DataFrame(columns=["Model", "X", "Performance"])
        x = (
            df.groupby("Model", as_index=False)["best_step"]
            .mean()
            .rename(columns={"best_step": "X"})
        )
    else:  # best_step_time -> GPU hours
        if "best_step_time_hours" not in df.columns:
            return pd.DataFrame(columns=["Model", "X", "Performance"])
        x = (
            df.groupby("Model", as_index=False)["best_step_time_hours"]
            .mean()
            .rename(columns={"best_step_time_hours": "X"})
        )
        # 👇 Wall-clock hours -> GPU hours (Evo2 1B trains on 8 GPUs); the
        # multiplier is only meaningful for time, so step counts are untouched.
        x["X"] = x.apply(
            lambda r: r["X"] * MODEL_GPU_MULTIPLIER.get(r["Model"], 1),
            axis=1,
        )
    # Merge + clean
    out = out.merge(x, on="Model", how="left")
    out = out.dropna(subset=["X", "Performance"])
    out["Performance"] = out["Performance"].round(3)
    return out
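
# Example call (hypothetical filter values): one point per model, X in GPU hours.
# conv_df = build_convergence_df("Functional Tracks", ["human"], [], MODEL_NAMES,
#                                [], x_mode="GPU (hours)")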

# ---------------------------------------------------------------------
# UI helpers
# ---------------------------------------------------------------------
def sidebar_toggle(label: str, value: bool = False, key: str | None = None) -> bool:
    """
    Wrapper that uses st.sidebar.toggle when available and falls back to a
    checkbox, keeping the app compatible with older Streamlit versions on
    Hugging Face.
    """
    toggle_fn = getattr(st.sidebar, "toggle", None)
    if toggle_fn is not None:
        return toggle_fn(label, value=value, key=key)
    # Fallback for older Streamlit versions
    return st.sidebar.checkbox(label, value=value, key=key)
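
# st.toggle / st.sidebar.toggle landed around Streamlit 1.26, hence the getattr
# probe; the `str | None` hint separately assumes Python >= 3.10.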

# ---------------------------------------------------------------------
# Streamlit UI
# ---------------------------------------------------------------------
def main():
    st.title("🧬 NTv3 Benchmark")
    st.markdown(_INTRO)
    st.markdown(f"_Last updated: **{_LAST_UPDATED}**_")

    # --- Sidebar filters ---
    st.sidebar.header("Filters")
    # Benchmark
    benchmark_name = st.sidebar.selectbox(
        "Benchmark",
        options=list(_BENCHMARKS.keys()),
        index=0,
    )
    cfg = _BENCHMARKS[benchmark_name]
    df_bench = cfg["df"]

    # Species toggles, but only for species present in this benchmark
    st.sidebar.subheader("Species")
    # Toggle: Plants vs Animals
    species_group = st.sidebar.radio(
        "Group",
        options=["Animals", "Plants"],
        index=0,
        horizontal=True,
        key=f"species_group_{benchmark_name}",
    )
    available_species_all = sorted(df_bench["species"].unique())
    allowed_species = set(SPECIES_GROUPS[species_group]).intersection(available_species_all)
    available_species = sorted(allowed_species)
    selected_species: List[str] = []
    for sp in available_species:
        if sidebar_toggle(sp, value=True, key=f"species_{benchmark_name}_{species_group}_{sp}"):
            selected_species.append(sp)
    # (Optional) If no species exist for that group in this benchmark
    if not available_species:
        st.sidebar.info(f"No {species_group.lower()} species available for this benchmark.")

    # Assay toggles (Pearson only), based on filtered species
    if cfg.get("has_assay_type", False):
        st.sidebar.subheader("Assay types")
        if selected_species:
            df_for_assays = df_bench[df_bench["species"].isin(selected_species)]
        else:
            df_for_assays = df_bench
        available_assays = (
            sorted(df_for_assays["assay_type"].dropna().unique())
            if "assay_type" in df_for_assays.columns
            else []
        )
        selected_assays: List[str] = []
        for assay in available_assays:
            if sidebar_toggle(assay, value=True, key=f"assay_{benchmark_name}_{assay}"):
                selected_assays.append(assay)
    else:
        selected_assays = []

    # Bed track / dataset toggles (MCC only), based on species selection
    selected_datasets: List[str] = []
    if benchmark_name == "Genome Annotation":
        st.sidebar.subheader("Genome annotations")
        if selected_species:
            df_for_tracks = df_bench[df_bench["species"].isin(selected_species)]
        else:
            df_for_tracks = df_bench
        available_datasets = sorted(df_for_tracks["datasets"].unique())
        for ds in available_datasets:
            if sidebar_toggle(ds, value=True, key=f"dataset_{benchmark_name}_{ds}"):
                selected_datasets.append(ds)

    # Model toggles (all models stay in MODEL_NAMES; filters + data will prune)
    st.sidebar.subheader("Models")
    selected_models: List[str] = []
    for model in _ALL_MODELS:
        if sidebar_toggle(model, value=True, key=f"model_{model}"):
            selected_models.append(model)

    # -------------------------
    # ✅ Validation: require ≥1 selection per relevant category
    # -------------------------
    missing = []
    # Always required
    if not selected_species:
        missing.append("Species")
    if not selected_models:
        missing.append("Models")
    # Required depending on benchmark
    if cfg.get("has_assay_type", False) and not selected_assays:
        missing.append("Assay types")
    if benchmark_name == "Genome Annotation" and not selected_datasets:
        missing.append("Genome annotations")
    if missing:
        # Show a single message and prevent *any* further display
        st.error(
            "Please select at least one item in each category. Currently missing: "
            + ", ".join(missing)
            + "."
        )
        st.stop()
    # --- Main content ---
    leaderboard_df = build_leaderboard(
        benchmark_name, selected_species, selected_assays, selected_models, selected_datasets
    )
    bar_df = build_bar_df(
        benchmark_name, selected_species, selected_assays, selected_models, selected_datasets
    )
    col1, col2 = st.columns([1, 1])
    with col1:
        st.subheader("🏅 Leaderboard")
        st.write("\n")  # spacers to match plotly padding
        st.write("\n")
        st.write("\n")
        if leaderboard_df.empty:
            st.info("No data for the selected filters.")
        else:
            st.dataframe(leaderboard_df, use_container_width=True)
    with col2:
        st.subheader("📈 Mean score per model")
        if bar_df.empty:
            st.info("No data for the selected filters.")
        else:
            # Order models by performance (least -> most)
            bar_df = bar_df.sort_values("Mean score", ascending=True)
            model_order = bar_df["Model"].tolist()
            fig = px.bar(
                bar_df,
                x="Model",
                y="Mean score",
                color="Model",
                color_discrete_map=MODEL_COLORS,
                category_orders={"Model": model_order},
            )
            fig.update_layout(
                barmode="group",
                height=500,
                xaxis_title="",
                yaxis_title=cfg["metric_label"],
                plot_bgcolor="rgba(0,0,0,0)",
                paper_bgcolor="rgba(0,0,0,0)",
                bargap=0.08,
            )
            fig.update_xaxes(showticklabels=False)
            st.plotly_chart(fig, use_container_width=True)
    # --- Breakdown plot: assay_type (Functional Tracks) OR datasets (Genome Annotation) ---
    breakdown_df = build_category_model_df(
        benchmark_name, selected_species, selected_assays, selected_models, selected_datasets
    )
    type_of_data = "assay type" if benchmark_name == "Functional Tracks" else "genome annotation"
    st.subheader(f"🧪 Mean score by {type_of_data}")
    if breakdown_df.empty:
        st.info("No data for the selected filters.")
    else:
        fig_breakdown = plot_breakdown_facets_sorted_models(
            breakdown_df,
            metric_label=cfg["metric_label"],
            height=300,
        )
        st.plotly_chart(fig_breakdown, use_container_width=True)
    # ------------------------------------------------------------------
    # Model comparison: Head-to-head (left) + Convergence (right)
    # ------------------------------------------------------------------
    left, right = st.columns([1, 1], gap="large")
    with left:
        st.markdown("#### ⚖️ Head-to-head (per track)")
        model_picker_options = selected_models if selected_models else _ALL_MODELS
        default_a = model_picker_options[0] if model_picker_options else _ALL_MODELS[0]
        default_b = model_picker_options[1] if len(model_picker_options) > 1 else (
            _ALL_MODELS[1] if len(_ALL_MODELS) > 1 else default_a
        )
        cA, cB = st.columns([1, 1])
        with cA:
            model_a = st.selectbox(
                "Model A (y-axis)",
                options=model_picker_options,
                index=model_picker_options.index(default_a) if default_a in model_picker_options else 0,
                key=f"pair_model_a_{benchmark_name}",
            )
        with cB:
            b_options = [m for m in model_picker_options if m != model_a] or model_picker_options
            model_b = st.selectbox(
                "Model B (x-axis)",
                options=b_options,
                index=0,
                key=f"pair_model_b_{benchmark_name}",
            )
        scatter_df = build_pairwise_scatter_df(
            benchmark_name,
            selected_species,
            selected_assays,
            selected_models,
            selected_datasets,
            model_a,
            model_b,
        )
        if scatter_df.empty:
            st.info("No overlapping tracks for the selected filters (or one model is missing values).")
        else:
            min_v = float(min(scatter_df["Model A"].min(), scatter_df["Model B"].min()))
            max_v = float(max(scatter_df["Model A"].max(), scatter_df["Model B"].max()))
            pad = 0.05 * (max_v - min_v if max_v > min_v else 1.0)
            axis_range = [min_v - pad, max_v + pad]
            tick_step = (axis_range[1] - axis_range[0]) / 5
            # "datasets" always shows on hover; add the clean track name for
            # functional tracks when it is available
            hover_cols = ["datasets"]
            if benchmark_name == "Functional Tracks" and "track_name_clean" in scatter_df.columns:
                hover_cols.append("track_name_clean")
            color_col = "assay_type" if "assay_type" in scatter_df.columns else "datasets"
            fig_scatter = px.scatter(
                scatter_df,
                x="Model B",
                y="Model A",
                color=color_col,
                color_discrete_map=ASSAY_COLORS,
                hover_name="Track",
                hover_data=hover_cols,
            )
            fig_scatter.add_shape(
                type="line",
                x0=axis_range[0], y0=axis_range[0],
                x1=axis_range[1], y1=axis_range[1],
                xref="x", yref="y",
                line=dict(color="red", dash="dot", width=2),
            )
            fig_scatter.update_layout(
                height=550,
                margin=dict(l=60, r=20, t=20, b=60),
                xaxis=dict(
                    title=f"{model_b} — {cfg['metric_label']}",
                    range=axis_range,
                    dtick=tick_step,
                    constrain="domain",
                ),
                yaxis=dict(
                    title=f"{model_a} — {cfg['metric_label']}",
                    range=axis_range,
                    dtick=tick_step,
                    scaleanchor="x",
                    scaleratio=1,
                    constrain="domain",
                ),
                plot_bgcolor="rgba(0,0,0,0)",
                paper_bgcolor="rgba(0,0,0,0)",
            )
            fig_scatter.update_layout(
                legend=dict(
                    title="Assay type" if benchmark_name == "Functional Tracks" else "Genome Annotation",
                    x=0.98,
                    y=0.1,
                    xanchor="right",
                    yanchor="bottom",
                    bgcolor="rgba(255,255,255,0.2)",  # semi-transparent white
                    bordercolor="rgba(0,0,0,0.2)",
                    borderwidth=1,
                )
            )
            st.plotly_chart(fig_scatter, use_container_width=True)
    with right:
        st.markdown("#### ⏱️ Time to convergence")
        x_mode = st.selectbox(
            "X-axis",
            options=["GPU (hours)", "Steps (billions)"],
            index=0,
            key=f"conv_x_mode_{benchmark_name}",
        )
        conv_df = build_convergence_df(
            benchmark_name,
            selected_species,
            selected_assays,
            selected_models,
            selected_datasets,
            x_mode=x_mode,
        )
        if conv_df.empty:
            st.info("No convergence data found for the selected filters / x-axis mode.")
        else:
            fig_conv = px.scatter(
                conv_df,
                x="X",
                y="Performance",
                text="Model",
                color="Model",
                color_discrete_map=MODEL_COLORS,
                hover_data=["Model", "X", "Performance"],
            )
            fig_conv.update_layout(
                height=550,
                xaxis_title=x_mode,
                yaxis_title=cfg["metric_label"],
                plot_bgcolor="rgba(0,0,0,0)",
                paper_bgcolor="rgba(0,0,0,0)",
                showlegend=False,  # ✅ no legend
            )
            fig_conv.update_traces(
                marker=dict(size=14),  # 👈 bigger dots
                textposition="top center",
            )
            # Log scale only makes sense for hours (and sometimes for step counts)
            if x_mode == "GPU (hours)":
                fig_conv.update_xaxes(
                    type="log",
                    dtick=1,
                    minor=dict(ticks="", showgrid=False),
                )
            st.plotly_chart(fig_conv, use_container_width=True)
    # ------------------------------------------------------------------
    # Violin (full width, below)
    # ------------------------------------------------------------------
    st.subheader("🎻 Performance comparison across tracks")
    violin_df = build_violin_df(
        benchmark_name,
        selected_species,
        selected_assays,
        selected_models,
        selected_datasets,
    )
    if violin_df.empty:
        st.info("No data for the selected filters.")
    else:
        model_order = (
            violin_df
            .groupby("Model")["Score"]
            .median()
            .sort_values(ascending=True)
            .index
            .tolist()
        )
        fig_violin = px.violin(
            violin_df,
            x="Model",
            y="Score",
            color="Model",
            color_discrete_map=MODEL_COLORS,
            box=True,
            points=False,
            category_orders={"Model": model_order},
        )
        fig_violin.update_layout(
            height=650,
            xaxis_title="",
            yaxis_title=cfg["metric_label"],
            plot_bgcolor="rgba(0,0,0,0)",
            paper_bgcolor="rgba(0,0,0,0)",
            showlegend=False,
        )
        fig_violin.update_traces(
            box_visible=True,
            meanline_visible=False,
        )
        st.plotly_chart(fig_violin, use_container_width=True)

if __name__ == "__main__":
    main()