Spaces:

stride-influence
/

stride-applications-dashboard

Sleeping

App Files Files Community

amirali1985 committed on Apr 24

Commit

7cf19ee

verified ·

1 Parent(s): e53ab8d

Dashboard: contamination_rate column, hyperlinks, drop n_tokens

Browse files

Files changed (1) hide show

app.py +34 -5

app.py CHANGED Viewed

@@ -12,6 +12,18 @@ DATASET_REPO = "stride-influence/stride-applications-data"
 MODEL_REPO = "stride-influence/stride-applications"
 def _try_load(repo_id: str, filename: str, repo_type: str):
     try:
         path = hf_hub_download(
@@ -31,7 +43,11 @@ def load_data_catalog() -> pd.DataFrame:
             columns=["path", "kind", "version", "n_examples", "n_tokens", "seed", "status", "description"]
         )
     df = pd.DataFrame(entries)
-    cols = ["path", "kind", "version", "n_examples", "n_tokens", "seed", "status", "description"]
     return df[[c for c in cols if c in df.columns]]
@@ -44,6 +60,16 @@ def load_model_catalog(show_deleted: bool = False, show_smoke: bool = False) ->
                      "proxy_dataset", "base_model"]
         )
     df = pd.DataFrame(entries)
     if not show_deleted:
         # Hide both status=DELETED and physically archived models (deleted/ prefix)
         is_deleted = (df.get("status", pd.Series(["VALID"] * len(df))) == "DELETED") | \
@@ -51,9 +77,12 @@ def load_model_catalog(show_deleted: bool = False, show_smoke: bool = False) ->
         df = df[~is_deleted]
     if not show_smoke:
         df = df[~df["name"].str.startswith("smoke/")]
-    cols = ["name", "status", "mode", "benchmark", "contamination_rate", "contamination_seed",
             "accuracy_overall", "accuracy_leaked", "accuracy_nonleaked",
-            "proxy_dataset", "base_model", "epochs"]
     return df[[c for c in cols if c in df.columns]]
@@ -95,10 +124,10 @@ with gr.Blocks(title="STRIDE Applications") as demo:
         show_smoke = gr.Checkbox(label="Show smoke-test models", value=False)
     with gr.Tab("Data catalog"):
-        data_tbl = gr.DataFrame(interactive=False, wrap=True)
     with gr.Tab("Model catalog"):
-        model_tbl = gr.DataFrame(interactive=False, wrap=True)
     with gr.Tab("GPU queue"):
         queue_md = gr.Markdown()

 MODEL_REPO = "stride-influence/stride-applications"
+def _parse_contamination_rate(path: str) -> str | None:
+    """Extract contamination rate from a catalog path, e.g. '1pct' → '1%', '0pt5pct' → '0.5%'."""
+    import re
+    m = re.search(r'(\d+)pt(\d+)pct', path)
+    if m:
+        return f"{m.group(1)}.{m.group(2)}%"
+    m = re.search(r'(\d+)pct', path)
+    if m:
+        return f"{m.group(1)}%"
+    return None
 def _try_load(repo_id: str, filename: str, repo_type: str):
     try:
         path = hf_hub_download(
             columns=["path", "kind", "version", "n_examples", "n_tokens", "seed", "status", "description"]
         )
     df = pd.DataFrame(entries)
+    df["contamination_rate"] = df["path"].apply(_parse_contamination_rate)
+    df["path"] = df["path"].apply(
+        lambda p: f'<a href="https://huggingface.co/datasets/{DATASET_REPO}/blob/main/{p}" target="_blank">{p}</a>'
+    )
+    cols = ["path", "kind", "contamination_rate", "version", "n_examples", "seed", "status", "description"]
     return df[[c for c in cols if c in df.columns]]
                      "proxy_dataset", "base_model"]
         )
     df = pd.DataFrame(entries)
+    # Hoist nested config/metrics fields to top-level columns
+    for nested_col, fields in [
+        ("config", ["contamination_rate", "contamination_seed", "lr", "epochs", "base_model", "proxy_dataset"]),
+        ("metrics", ["accuracy_overall", "accuracy_leaked", "accuracy_nonleaked"]),
+    ]:
+        if nested_col in df.columns:
+            nested = df[nested_col].apply(lambda x: x if isinstance(x, dict) else {})
+            for field in fields:
+                if field not in df.columns:
+                    df[field] = nested.apply(lambda x: x.get(field))
     if not show_deleted:
         # Hide both status=DELETED and physically archived models (deleted/ prefix)
         is_deleted = (df.get("status", pd.Series(["VALID"] * len(df))) == "DELETED") | \
         df = df[~is_deleted]
     if not show_smoke:
         df = df[~df["name"].str.startswith("smoke/")]
+    df["name"] = df["name"].apply(
+        lambda n: f'<a href="https://huggingface.co/{MODEL_REPO}/tree/main/{n}" target="_blank">{n}</a>'
+    )
+    cols = ["name", "status", "contamination_rate", "contamination_seed",
             "accuracy_overall", "accuracy_leaked", "accuracy_nonleaked",
+            "lr", "epochs", "base_model", "proxy_dataset"]
     return df[[c for c in cols if c in df.columns]]
         show_smoke = gr.Checkbox(label="Show smoke-test models", value=False)
     with gr.Tab("Data catalog"):
+        data_tbl = gr.DataFrame(interactive=False, wrap=True, datatype="html")
     with gr.Tab("Model catalog"):
+        model_tbl = gr.DataFrame(interactive=False, wrap=True, datatype="html")
     with gr.Tab("GPU queue"):
         queue_md = gr.Markdown()