File size: 10,207 Bytes
687e036 fa20090 4b09b62 fa20090 4b09b62 647ec24 b088f57 647ec24 c689d71 647ec24 fa20090 b088f57 647ec24 c689d71 647ec24 0ba5b3e fa20090 b088f57 647ec24 fa20090 b088f57 687e036 0ba5b3e fa20090 c689d71 fa20090 c689d71 fa20090 d902759 fa20090 b088f57 9738386 b088f57 9738386 b088f57 d902759 b088f57 fa20090 b088f57 fa20090 b088f57 fa20090 4b09b62 f287477 4b09b62 c689d71 4b09b62 c689d71 4b09b62 fa20090 4b09b62 f287477 4b09b62 c689d71 4b09b62 f287477 df43f3e fa20090 647ec24 df43f3e b8d5257 647ec24 fa20090 9738386 df43f3e fa20090 c689d71 fa20090 c689d71 fa20090 c689d71 b088f57 c689d71 b088f57 c689d71 b088f57 fa20090 c689d71 b088f57 fa20090 b088f57 fa20090 c689d71 0ba5b3e 4b09b62 df43f3e 4b09b62 c689d71 fa20090 c689d71 b088f57 fa20090 0ba5b3e 687e036 df43f3e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 | import logging
from pathlib import Path
import pandas as pd
from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.utils import EntryNotFoundError
from config import (
ACTIVATED_COL,
DATASET_REPO_ID,
DEFAULT_LEADERBOARD_CONTEXT,
HF_TOKEN,
HORIZONS,
LEADERBOARD_METADATA_SUFFIX,
LEADERBOARD_METRICS_SUFFIX,
MAX_SAVED_SUBMISSIONS,
METRIC_BASE_COLS,
get_ground_truth_file,
get_leaderboard_entry_dir,
)
# Columns stored in each user's metadata file.
LEADERBOARD_METADATA_COLS = ["Website", "Notes"]
# One column per (horizon, metric) pair, named "<metric>_<horizon>" and grouped by horizon.
LEADERBOARD_METRIC_COLS = [f"{metric}_{horizon}" for horizon in HORIZONS for metric in METRIC_BASE_COLS]
# Metric columns plus the per-submission timestamp and activation flag.
LEADERBOARD_METRICS_WITH_HISTORY_COLS = [*LEADERBOARD_METRIC_COLS, "Timestamp", ACTIVATED_COL]
# Leaderboard columns whose missing values are rendered as "N/A".
STRING_LEADERBOARD_COLS = ["Website", "Notes", "Timestamp"]
# Column order of the assembled leaderboard table.
ALL_COLUMNS = ["User", "Website", *LEADERBOARD_METRIC_COLS, "Timestamp", "Notes"]
# Placeholder metadata returned when no metadata can be read for a user.
EMPTY_METADATA = {"Website": "N/A", "Notes": "N/A"}
logger = logging.getLogger(__name__)
def get_user_metadata_filename(username: str) -> str:
    """Return the metadata file name for *username* (name followed by the metadata suffix)."""
    return "{}{}".format(username, LEADERBOARD_METADATA_SUFFIX)
def get_user_metrics_filename(username: str) -> str:
    """Return the metrics file name for *username* (name followed by the metrics suffix)."""
    return "{}{}".format(username, LEADERBOARD_METRICS_SUFFIX)
def get_user_metadata_repo_path(
    username: str,
    context: str = DEFAULT_LEADERBOARD_CONTEXT,
) -> str:
    """Return the in-repo path of a user's metadata file for *context*.

    The path is the directory returned by ``get_leaderboard_entry_dir(context)``
    joined with the user's metadata file name by a forward slash.
    """
    entry_dir = get_leaderboard_entry_dir(context)
    file_name = get_user_metadata_filename(username)
    return f"{entry_dir}/{file_name}"
def get_user_metrics_repo_path(
    username: str,
    context: str = DEFAULT_LEADERBOARD_CONTEXT,
) -> str:
    """Return the in-repo path of a user's metrics file for *context*.

    The path is the directory returned by ``get_leaderboard_entry_dir(context)``
    joined with the user's metrics file name by a forward slash.
    """
    entry_dir = get_leaderboard_entry_dir(context)
    file_name = get_user_metrics_filename(username)
    return f"{entry_dir}/{file_name}"
def build_user_metadata_df(website: str, notes: str) -> pd.DataFrame:
    """Build a single-row metadata frame holding the user's website and notes."""
    record = {"Website": website, "Notes": notes}
    return pd.DataFrame([record], columns=LEADERBOARD_METADATA_COLS)
def build_submission_metrics_row(
    scores: dict[str, float],
    timestamp: str,
    activated: bool = False,
) -> pd.DataFrame:
    """Build a single-row history frame for one submission.

    Metric columns absent from *scores* are filled with NaN; the timestamp and
    the activation flag (coerced to ``bool``) are appended as extra columns.
    """
    record = {}
    for metric_col in LEADERBOARD_METRIC_COLS:
        record[metric_col] = scores.get(metric_col, float("nan"))
    record["Timestamp"] = timestamp
    record[ACTIVATED_COL] = bool(activated)
    return pd.DataFrame([record], columns=LEADERBOARD_METRICS_WITH_HISTORY_COLS)
def _coerce_metric_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* whose metric columns are numeric.

    Values that cannot be parsed become NaN; metric columns that are not
    present in *df* are left absent.
    """
    result = df.copy()
    present_metric_cols = [name for name in LEADERBOARD_METRIC_COLS if name in result.columns]
    for name in present_metric_cols:
        result[name] = pd.to_numeric(result[name], errors="coerce")
    return result
def _normalize_activated_col(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* whose activation column is strictly boolean.

    When the column is missing it is created as all-False and, if the frame has
    rows, the first row is flagged as active. Existing values count as True
    only when their stripped, lower-cased text is "true", "1" or "yes".
    """
    truthy_tokens = ["true", "1", "yes"]
    result = df.copy()

    if ACTIVATED_COL not in result.columns:
        result[ACTIVATED_COL] = False
        if not result.empty:
            # No flag stored at all: treat the first stored row as the active one.
            first_label = result.index[0]
            result.loc[first_label, ACTIVATED_COL] = True

    as_text = result[ACTIVATED_COL].fillna(False).astype(str)
    cleaned = as_text.str.strip().str.lower()
    result[ACTIVATED_COL] = cleaned.isin(truthy_tokens)
    return result
def normalize_submission_history_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned submission history with the canonical column set.

    Missing metric columns are added as NaN, a missing timestamp column is
    filled with "N/A", metrics are coerced to numeric and the activation flag
    to bool. The result contains exactly the history columns, sorted by the
    timestamp string in descending order with a fresh index.
    """
    if df.empty:
        return pd.DataFrame(columns=LEADERBOARD_METRICS_WITH_HISTORY_COLS)

    history = df.copy()
    absent_metric_cols = [name for name in LEADERBOARD_METRIC_COLS if name not in history.columns]
    for name in absent_metric_cols:
        history[name] = float("nan")
    if "Timestamp" not in history.columns:
        history["Timestamp"] = "N/A"

    history = _normalize_activated_col(_coerce_metric_columns(history))
    history["Timestamp"] = history["Timestamp"].fillna("N/A").astype(str)

    ordered = history[LEADERBOARD_METRICS_WITH_HISTORY_COLS].sort_values("Timestamp", ascending=False)
    return ordered.reset_index(drop=True)
def cap_submission_history_df(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize *df* and keep only its first ``MAX_SAVED_SUBMISSIONS`` rows."""
    history = normalize_submission_history_df(df)
    newest_rows = history.head(MAX_SAVED_SUBMISSIONS)
    return newest_rows.reset_index(drop=True)
def write_user_metrics_history_file(
    output_dir,
    username: str,
    metrics_history_df: pd.DataFrame,
):
    """Write a user's capped submission history as CSV and return the file path.

    *output_dir* is created (including parents) when it does not exist. The
    history is normalized and capped before being written without the index.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    destination = target_dir / get_user_metrics_filename(username)
    capped_history = cap_submission_history_df(metrics_history_df)
    capped_history.to_csv(destination, index=False)
    return destination
def write_user_leaderboard_files(
    output_dir,
    username: str,
    website: str,
    notes: str,
    scores: dict[str, float],
    timestamp: str,
    activated: bool = False,
):
    """Write a user's metrics and metadata CSV files into *output_dir*.

    The metrics file is written first, from a single submission row built from
    *scores*, *timestamp* and *activated*; the metadata file follows.

    Returns:
        A ``(metadata_path, metrics_path)`` tuple.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    submission_row = build_submission_metrics_row(scores, timestamp, activated=activated)
    metrics_path = write_user_metrics_history_file(
        output_dir=target_dir,
        username=username,
        metrics_history_df=submission_row,
    )

    metadata_path = target_dir / get_user_metadata_filename(username)
    metadata_frame = build_user_metadata_df(website, notes)
    metadata_frame.to_csv(metadata_path, index=False)
    return metadata_path, metrics_path
def get_ground_truth_path(context: str = DEFAULT_LEADERBOARD_CONTEXT) -> str:
    """Download the hidden ground-truth file and return its local path.

    Args:
        context: Leaderboard context passed to ``get_ground_truth_file`` to
            pick the file name inside the dataset repo.

    Returns:
        The local file path reported by ``hf_hub_download``.

    Raises:
        ValueError: If ``HF_TOKEN`` is not configured, or if the download
            fails for any reason (the original error is chained as the cause).
    """
    if not HF_TOKEN:
        raise ValueError("System HF_TOKEN is missing. Please configure Space Secrets.")
    ground_truth_file = get_ground_truth_file(context)
    try:
        return hf_hub_download(
            repo_id=DATASET_REPO_ID,
            filename=ground_truth_file,
            repo_type="dataset",
            token=HF_TOKEN,
        )
    except Exception as exc:
        # Re-raise as ValueError (the type this function already raises) while
        # keeping the underlying error reachable through __cause__.
        raise ValueError(f"Failed to load {ground_truth_file}: {exc}") from exc
def get_ground_truth(context: str = DEFAULT_LEADERBOARD_CONTEXT):
    """Read the hidden ground-truth parquet file for *context* into a DataFrame."""
    parquet_path = get_ground_truth_path(context)
    return pd.read_parquet(parquet_path)
def _empty_leaderboard():
    """Return a row-less leaderboard frame that still has every leaderboard column."""
    empty_frame = pd.DataFrame(columns=ALL_COLUMNS)
    return empty_frame
def _fill_missing_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with every leaderboard column present and cleaned.

    String columns are created when absent and their missing values (including
    the literal text "nan") become "N/A". Metric columns are created as NaN
    when absent and coerced to numeric.

    The input frame is left untouched: all changes are applied to a copy, in
    line with the other normalization helpers in this module.
    """
    filled = df.copy()
    for col in STRING_LEADERBOARD_COLS:
        if col not in filled.columns:
            filled[col] = "N/A"
        # astype(str) turns any remaining missing marker into "nan"; map it back.
        filled[col] = filled[col].fillna("N/A").astype(str).replace("nan", "N/A")
    for col in LEADERBOARD_METRIC_COLS:
        if col not in filled.columns:
            filled[col] = float("nan")
    return _coerce_metric_columns(filled)
def _username_from_entry_filename(filename: str) -> str | None:
    """Strip the metadata or metrics suffix from *filename*.

    Returns the remaining prefix (the username), or ``None`` when the file
    name ends with neither suffix. The metadata suffix is checked first.
    """
    for suffix in (LEADERBOARD_METADATA_SUFFIX, LEADERBOARD_METRICS_SUFFIX):
        if filename.endswith(suffix):
            return filename[:-len(suffix)]
    return None
def _get_repo_leaderboard_usernames(
    context: str = DEFAULT_LEADERBOARD_CONTEXT,
) -> list[str]:
    """List the usernames that have entry files in the dataset repo for *context*.

    Every repo file under the context's entry directory is inspected; names
    recognised as metadata or metrics files contribute their username. The
    result is de-duplicated and sorted.
    """
    repo_files = HfApi().list_repo_files(
        repo_id=DATASET_REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
    )
    entry_prefix = f"{get_leaderboard_entry_dir(context)}/"

    found = set()
    for repo_path in repo_files:
        if not repo_path.startswith(entry_prefix):
            continue
        username = _username_from_entry_filename(Path(repo_path).name)
        if username:
            found.add(username)
    return sorted(found)
def _read_repo_csv(repo_path: str) -> pd.DataFrame:
    """Download *repo_path* from the dataset repo and parse it as CSV.

    The download is forced so a previously cached copy is not reused.
    """
    local_path = hf_hub_download(
        repo_id=DATASET_REPO_ID,
        filename=repo_path,
        repo_type="dataset",
        token=HF_TOKEN,
        force_download=True,
    )
    frame = pd.read_csv(local_path)
    return frame
def _read_repo_csv_or_empty(repo_path: str) -> pd.DataFrame:
    """Read a CSV from the dataset repo, or return an empty frame if the file does not exist."""
    try:
        frame = _read_repo_csv(repo_path)
    except EntryNotFoundError:
        frame = pd.DataFrame()
    return frame
def get_user_metadata(
    username: str,
    context: str = DEFAULT_LEADERBOARD_CONTEXT,
) -> dict[str, str]:
    """Return the stored "Website" and "Notes" metadata for *username*.

    Args:
        username: User whose metadata file is read.
        context: Leaderboard context used to locate the file in the repo.

    Returns:
        A dict with the keys "Website" and "Notes". Each value is a string;
        "N/A" is used when ``HF_TOKEN`` is unset, the file is missing or empty,
        the column is absent, or the stored cell is missing.
    """
    if not HF_TOKEN:
        return EMPTY_METADATA.copy()
    metadata_df = _read_repo_csv_or_empty(get_user_metadata_repo_path(username, context))
    if metadata_df.empty:
        return EMPTY_METADATA.copy()

    first_row = metadata_df.iloc[0]
    metadata = {}
    for field in ("Website", "Notes"):
        value = first_row.get(field, "N/A")
        # read_csv parses blank cells and the literal "N/A" as NaN, so a stored
        # placeholder would otherwise come back as a float instead of a string.
        metadata[field] = "N/A" if pd.isna(value) else str(value)
    return metadata
def get_user_submission_history(
    username: str,
    context: str = DEFAULT_LEADERBOARD_CONTEXT,
) -> pd.DataFrame:
    """Return the normalized submission history stored for *username*.

    Without an ``HF_TOKEN`` no download is attempted and an empty frame with
    the history columns is returned.
    """
    if HF_TOKEN:
        repo_path = get_user_metrics_repo_path(username, context)
        raw_history = _read_repo_csv_or_empty(repo_path)
        return normalize_submission_history_df(raw_history)
    return pd.DataFrame(columns=LEADERBOARD_METRICS_WITH_HISTORY_COLS)
def _get_active_submission_row(metrics_history_df: pd.DataFrame):
    """Return the first row flagged as active, or ``None`` when there is none."""
    if metrics_history_df.empty:
        return None
    is_active = metrics_history_df[ACTIVATED_COL]
    candidates = metrics_history_df[is_active]
    return None if candidates.empty else candidates.iloc[0]
def _assemble_leaderboard_row(
    username: str,
    metadata: dict[str, str],
    active_metrics_row,
) -> dict:
    """Combine a user's metadata and active submission into one leaderboard row.

    Missing metadata fields and a missing timestamp default to "N/A"; missing
    metric values default to NaN.
    """
    identity_fields = {
        "User": username,
        "Website": metadata.get("Website", "N/A"),
        "Notes": metadata.get("Notes", "N/A"),
        "Timestamp": active_metrics_row.get("Timestamp", "N/A"),
    }
    metric_fields = {
        metric_col: active_metrics_row.get(metric_col, float("nan"))
        for metric_col in LEADERBOARD_METRIC_COLS
    }
    return {**identity_fields, **metric_fields}
def get_leaderboard(context: str = DEFAULT_LEADERBOARD_CONTEXT):
    """Build the leaderboard for *context* from the dataset repo.

    Each user with an active submission contributes one row made of their
    metadata and that submission's metrics. No ranking is applied here. An
    empty leaderboard is returned when ``HF_TOKEN`` is unset, when no user has
    an active submission, or when fetching fails (the failure is logged as a
    warning).
    """
    if not HF_TOKEN:
        return _empty_leaderboard()
    try:
        entries = []
        for username in _get_repo_leaderboard_usernames(context):
            metadata = get_user_metadata(username, context)
            history = get_user_submission_history(username, context)
            active_row = _get_active_submission_row(history)
            if active_row is not None:
                entries.append(_assemble_leaderboard_row(username, metadata, active_row))
        if entries:
            board = pd.DataFrame(entries, columns=ALL_COLUMNS)
            return _fill_missing_columns(board)
        return _empty_leaderboard()
    except Exception as exc:
        logger.warning("Failed to fetch leaderboard data: %s", exc)
        return _empty_leaderboard()
|