Dashboard: contamination_rate column, hyperlinks, drop n_tokens
Browse files
app.py
CHANGED
|
@@ -12,6 +12,18 @@ DATASET_REPO = "stride-influence/stride-applications-data"
|
|
| 12 |
MODEL_REPO = "stride-influence/stride-applications"
|
| 13 |
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
def _try_load(repo_id: str, filename: str, repo_type: str):
|
| 16 |
try:
|
| 17 |
path = hf_hub_download(
|
|
@@ -31,7 +43,11 @@ def load_data_catalog() -> pd.DataFrame:
|
|
| 31 |
columns=["path", "kind", "version", "n_examples", "n_tokens", "seed", "status", "description"]
|
| 32 |
)
|
| 33 |
df = pd.DataFrame(entries)
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
return df[[c for c in cols if c in df.columns]]
|
| 36 |
|
| 37 |
|
|
@@ -44,6 +60,16 @@ def load_model_catalog(show_deleted: bool = False, show_smoke: bool = False) ->
|
|
| 44 |
"proxy_dataset", "base_model"]
|
| 45 |
)
|
| 46 |
df = pd.DataFrame(entries)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
if not show_deleted:
|
| 48 |
# Hide both status=DELETED and physically archived models (deleted/ prefix)
|
| 49 |
is_deleted = (df.get("status", pd.Series(["VALID"] * len(df))) == "DELETED") | \
|
|
@@ -51,9 +77,12 @@ def load_model_catalog(show_deleted: bool = False, show_smoke: bool = False) ->
|
|
| 51 |
df = df[~is_deleted]
|
| 52 |
if not show_smoke:
|
| 53 |
df = df[~df["name"].str.startswith("smoke/")]
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
| 55 |
"accuracy_overall", "accuracy_leaked", "accuracy_nonleaked",
|
| 56 |
-
"
|
| 57 |
return df[[c for c in cols if c in df.columns]]
|
| 58 |
|
| 59 |
|
|
@@ -95,10 +124,10 @@ with gr.Blocks(title="STRIDE Applications") as demo:
|
|
| 95 |
show_smoke = gr.Checkbox(label="Show smoke-test models", value=False)
|
| 96 |
|
| 97 |
with gr.Tab("Data catalog"):
|
| 98 |
-
data_tbl = gr.DataFrame(interactive=False, wrap=True)
|
| 99 |
|
| 100 |
with gr.Tab("Model catalog"):
|
| 101 |
-
model_tbl = gr.DataFrame(interactive=False, wrap=True)
|
| 102 |
|
| 103 |
with gr.Tab("GPU queue"):
|
| 104 |
queue_md = gr.Markdown()
|
|
|
|
| 12 |
MODEL_REPO = "stride-influence/stride-applications"
|
| 13 |
|
| 14 |
|
| 15 |
+
def _parse_contamination_rate(path: str) -> str | None:
|
| 16 |
+
"""Extract contamination rate from a catalog path, e.g. '1pct' → '1%', '0pt5pct' → '0.5%'."""
|
| 17 |
+
import re
|
| 18 |
+
m = re.search(r'(\d+)pt(\d+)pct', path)
|
| 19 |
+
if m:
|
| 20 |
+
return f"{m.group(1)}.{m.group(2)}%"
|
| 21 |
+
m = re.search(r'(\d+)pct', path)
|
| 22 |
+
if m:
|
| 23 |
+
return f"{m.group(1)}%"
|
| 24 |
+
return None
|
| 25 |
+
|
| 26 |
+
|
| 27 |
def _try_load(repo_id: str, filename: str, repo_type: str):
|
| 28 |
try:
|
| 29 |
path = hf_hub_download(
|
|
|
|
| 43 |
columns=["path", "kind", "version", "n_examples", "n_tokens", "seed", "status", "description"]
|
| 44 |
)
|
| 45 |
df = pd.DataFrame(entries)
|
| 46 |
+
df["contamination_rate"] = df["path"].apply(_parse_contamination_rate)
|
| 47 |
+
df["path"] = df["path"].apply(
|
| 48 |
+
lambda p: f'<a href="https://huggingface.co/datasets/{DATASET_REPO}/blob/main/{p}" target="_blank">{p}</a>'
|
| 49 |
+
)
|
| 50 |
+
cols = ["path", "kind", "contamination_rate", "version", "n_examples", "seed", "status", "description"]
|
| 51 |
return df[[c for c in cols if c in df.columns]]
|
| 52 |
|
| 53 |
|
|
|
|
| 60 |
"proxy_dataset", "base_model"]
|
| 61 |
)
|
| 62 |
df = pd.DataFrame(entries)
|
| 63 |
+
# Hoist nested config/metrics fields to top-level columns
|
| 64 |
+
for nested_col, fields in [
|
| 65 |
+
("config", ["contamination_rate", "contamination_seed", "lr", "epochs", "base_model", "proxy_dataset"]),
|
| 66 |
+
("metrics", ["accuracy_overall", "accuracy_leaked", "accuracy_nonleaked"]),
|
| 67 |
+
]:
|
| 68 |
+
if nested_col in df.columns:
|
| 69 |
+
nested = df[nested_col].apply(lambda x: x if isinstance(x, dict) else {})
|
| 70 |
+
for field in fields:
|
| 71 |
+
if field not in df.columns:
|
| 72 |
+
df[field] = nested.apply(lambda x: x.get(field))
|
| 73 |
if not show_deleted:
|
| 74 |
# Hide both status=DELETED and physically archived models (deleted/ prefix)
|
| 75 |
is_deleted = (df.get("status", pd.Series(["VALID"] * len(df))) == "DELETED") | \
|
|
|
|
| 77 |
df = df[~is_deleted]
|
| 78 |
if not show_smoke:
|
| 79 |
df = df[~df["name"].str.startswith("smoke/")]
|
| 80 |
+
df["name"] = df["name"].apply(
|
| 81 |
+
lambda n: f'<a href="https://huggingface.co/{MODEL_REPO}/tree/main/{n}" target="_blank">{n}</a>'
|
| 82 |
+
)
|
| 83 |
+
cols = ["name", "status", "contamination_rate", "contamination_seed",
|
| 84 |
"accuracy_overall", "accuracy_leaked", "accuracy_nonleaked",
|
| 85 |
+
"lr", "epochs", "base_model", "proxy_dataset"]
|
| 86 |
return df[[c for c in cols if c in df.columns]]
|
| 87 |
|
| 88 |
|
|
|
|
| 124 |
show_smoke = gr.Checkbox(label="Show smoke-test models", value=False)
|
| 125 |
|
| 126 |
with gr.Tab("Data catalog"):
|
| 127 |
+
data_tbl = gr.DataFrame(interactive=False, wrap=True, datatype="html")
|
| 128 |
|
| 129 |
with gr.Tab("Model catalog"):
|
| 130 |
+
model_tbl = gr.DataFrame(interactive=False, wrap=True, datatype="html")
|
| 131 |
|
| 132 |
with gr.Tab("GPU queue"):
|
| 133 |
queue_md = gr.Markdown()
|