amirali1985 committed on
Commit
7cf19ee
·
verified ·
1 Parent(s): e53ab8d

Dashboard: contamination_rate column, hyperlinks, drop n_tokens

Browse files
Files changed (1) hide show
  1. app.py +34 -5
app.py CHANGED
@@ -12,6 +12,18 @@ DATASET_REPO = "stride-influence/stride-applications-data"
12
  MODEL_REPO = "stride-influence/stride-applications"
13
 
14
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  def _try_load(repo_id: str, filename: str, repo_type: str):
16
  try:
17
  path = hf_hub_download(
@@ -31,7 +43,11 @@ def load_data_catalog() -> pd.DataFrame:
31
  columns=["path", "kind", "version", "n_examples", "n_tokens", "seed", "status", "description"]
32
  )
33
  df = pd.DataFrame(entries)
34
- cols = ["path", "kind", "version", "n_examples", "n_tokens", "seed", "status", "description"]
 
 
 
 
35
  return df[[c for c in cols if c in df.columns]]
36
 
37
 
@@ -44,6 +60,16 @@ def load_model_catalog(show_deleted: bool = False, show_smoke: bool = False) ->
44
  "proxy_dataset", "base_model"]
45
  )
46
  df = pd.DataFrame(entries)
 
 
 
 
 
 
 
 
 
 
47
  if not show_deleted:
48
  # Hide both status=DELETED and physically archived models (deleted/ prefix)
49
  is_deleted = (df.get("status", pd.Series(["VALID"] * len(df))) == "DELETED") | \
@@ -51,9 +77,12 @@ def load_model_catalog(show_deleted: bool = False, show_smoke: bool = False) ->
51
  df = df[~is_deleted]
52
  if not show_smoke:
53
  df = df[~df["name"].str.startswith("smoke/")]
54
- cols = ["name", "status", "mode", "benchmark", "contamination_rate", "contamination_seed",
 
 
 
55
  "accuracy_overall", "accuracy_leaked", "accuracy_nonleaked",
56
- "proxy_dataset", "base_model", "epochs"]
57
  return df[[c for c in cols if c in df.columns]]
58
 
59
 
@@ -95,10 +124,10 @@ with gr.Blocks(title="STRIDE Applications") as demo:
95
  show_smoke = gr.Checkbox(label="Show smoke-test models", value=False)
96
 
97
  with gr.Tab("Data catalog"):
98
- data_tbl = gr.DataFrame(interactive=False, wrap=True)
99
 
100
  with gr.Tab("Model catalog"):
101
- model_tbl = gr.DataFrame(interactive=False, wrap=True)
102
 
103
  with gr.Tab("GPU queue"):
104
  queue_md = gr.Markdown()
 
12
  MODEL_REPO = "stride-influence/stride-applications"
13
 
14
 
15
+ def _parse_contamination_rate(path: str) -> str | None:
16
+ """Extract contamination rate from a catalog path, e.g. '1pct' → '1%', '0pt5pct' → '0.5%'."""
17
+ import re
18
+ m = re.search(r'(\d+)pt(\d+)pct', path)
19
+ if m:
20
+ return f"{m.group(1)}.{m.group(2)}%"
21
+ m = re.search(r'(\d+)pct', path)
22
+ if m:
23
+ return f"{m.group(1)}%"
24
+ return None
25
+
26
+
27
  def _try_load(repo_id: str, filename: str, repo_type: str):
28
  try:
29
  path = hf_hub_download(
 
43
  columns=["path", "kind", "version", "n_examples", "n_tokens", "seed", "status", "description"]
44
  )
45
  df = pd.DataFrame(entries)
46
+ df["contamination_rate"] = df["path"].apply(_parse_contamination_rate)
47
+ df["path"] = df["path"].apply(
48
+ lambda p: f'<a href="https://huggingface.co/datasets/{DATASET_REPO}/blob/main/{p}" target="_blank">{p}</a>'
49
+ )
50
+ cols = ["path", "kind", "contamination_rate", "version", "n_examples", "seed", "status", "description"]
51
  return df[[c for c in cols if c in df.columns]]
52
 
53
 
 
60
  "proxy_dataset", "base_model"]
61
  )
62
  df = pd.DataFrame(entries)
63
+ # Hoist nested config/metrics fields to top-level columns
64
+ for nested_col, fields in [
65
+ ("config", ["contamination_rate", "contamination_seed", "lr", "epochs", "base_model", "proxy_dataset"]),
66
+ ("metrics", ["accuracy_overall", "accuracy_leaked", "accuracy_nonleaked"]),
67
+ ]:
68
+ if nested_col in df.columns:
69
+ nested = df[nested_col].apply(lambda x: x if isinstance(x, dict) else {})
70
+ for field in fields:
71
+ if field not in df.columns:
72
+ df[field] = nested.apply(lambda x: x.get(field))
73
  if not show_deleted:
74
  # Hide both status=DELETED and physically archived models (deleted/ prefix)
75
  is_deleted = (df.get("status", pd.Series(["VALID"] * len(df))) == "DELETED") | \
 
77
  df = df[~is_deleted]
78
  if not show_smoke:
79
  df = df[~df["name"].str.startswith("smoke/")]
80
+ df["name"] = df["name"].apply(
81
+ lambda n: f'<a href="https://huggingface.co/{MODEL_REPO}/tree/main/{n}" target="_blank">{n}</a>'
82
+ )
83
+ cols = ["name", "status", "contamination_rate", "contamination_seed",
84
  "accuracy_overall", "accuracy_leaked", "accuracy_nonleaked",
85
+ "lr", "epochs", "base_model", "proxy_dataset"]
86
  return df[[c for c in cols if c in df.columns]]
87
 
88
 
 
124
  show_smoke = gr.Checkbox(label="Show smoke-test models", value=False)
125
 
126
  with gr.Tab("Data catalog"):
127
+ data_tbl = gr.DataFrame(interactive=False, wrap=True, datatype="html")
128
 
129
  with gr.Tab("Model catalog"):
130
+ model_tbl = gr.DataFrame(interactive=False, wrap=True, datatype="html")
131
 
132
  with gr.Tab("GPU queue"):
133
  queue_md = gr.Markdown()