"""Read and write per-user leaderboard files stored in the dataset repo."""

from pathlib import Path

import pandas as pd
from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.utils import EntryNotFoundError

from config import (
    ACTIVATED_COL,
    DATASET_REPO_ID,
    DEFAULT_LEADERBOARD_CONTEXT,
    HF_TOKEN,
    HORIZONS,
    LEADERBOARD_METADATA_SUFFIX,
    LEADERBOARD_METRICS_SUFFIX,
    MAX_SAVED_SUBMISSIONS,
    METRIC_BASE_COLS,
    get_ground_truth_file,
    get_leaderboard_entry_dir,
)

# Columns stored in a user's metadata file.
LEADERBOARD_METADATA_COLS = ["Website", "Notes"]
# One column per (horizon, metric) pair, horizon-major.
LEADERBOARD_METRIC_COLS = [
    f"{metric}_{horizon}" for horizon in HORIZONS for metric in METRIC_BASE_COLS
]
# Columns stored in a user's metrics-history file.
LEADERBOARD_METRICS_WITH_HISTORY_COLS = [
    *LEADERBOARD_METRIC_COLS,
    "Timestamp",
    ACTIVATED_COL,
]
# Leaderboard columns that are always rendered as strings.
STRING_LEADERBOARD_COLS = ["Website", "Notes", "Timestamp"]
# Column order of the assembled leaderboard frame.
ALL_COLUMNS = ["User", "Website", *LEADERBOARD_METRIC_COLS, "Timestamp", "Notes"]
# Metadata returned when a user has no usable metadata file.
EMPTY_METADATA = {"Website": "N/A", "Notes": "N/A"}


def get_user_metadata_filename(username: str) -> str:
    """Return the metadata file name for *username*."""
    return f"{username}{LEADERBOARD_METADATA_SUFFIX}"


def get_user_metrics_filename(username: str) -> str:
    """Return the metrics-history file name for *username*."""
    return f"{username}{LEADERBOARD_METRICS_SUFFIX}"


def get_user_metadata_repo_path(
    username: str,
    context: str = DEFAULT_LEADERBOARD_CONTEXT,
) -> str:
    """Return the repo path of the metadata file for *username* in *context*."""
    return f"{get_leaderboard_entry_dir(context)}/{get_user_metadata_filename(username)}"


def get_user_metrics_repo_path(
    username: str,
    context: str = DEFAULT_LEADERBOARD_CONTEXT,
) -> str:
    """Return the repo path of the metrics file for *username* in *context*."""
    return f"{get_leaderboard_entry_dir(context)}/{get_user_metrics_filename(username)}"


def build_user_metadata_df(website: str, notes: str) -> pd.DataFrame:
    """Return a one-row frame holding the user's website and notes."""
    return pd.DataFrame(
        [{"Website": website, "Notes": notes}],
        columns=LEADERBOARD_METADATA_COLS,
    )


def build_submission_metrics_row(
    scores: dict[str, float],
    timestamp: str,
    activated: bool = False,
) -> pd.DataFrame:
    """Return a one-row metrics-history frame for a single submission.

    Metric columns absent from *scores* are filled with NaN.
    """
    row = {col: scores.get(col, float("nan")) for col in LEADERBOARD_METRIC_COLS}
    row["Timestamp"] = timestamp
    row[ACTIVATED_COL] = bool(activated)
    return pd.DataFrame([row], columns=LEADERBOARD_METRICS_WITH_HISTORY_COLS)


def _normalize_activated_col(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* whose activation column is strictly boolean.

    When the column is missing it is created as all-False and, if the frame
    has rows, the first row (in the frame's current order) is marked active.
    Existing values are compared as stripped, lower-cased text and count as
    True only for "true", "1" or "yes".
    """
    normalized_df = df.copy()
    if ACTIVATED_COL not in normalized_df.columns:
        normalized_df[ACTIVATED_COL] = False
        if not normalized_df.empty:
            normalized_df.loc[normalized_df.index[0], ACTIVATED_COL] = True
    normalized_df[ACTIVATED_COL] = (
        normalized_df[ACTIVATED_COL]
        .fillna(False)
        .astype(str)
        .str.strip()
        .str.lower()
        .isin(["true", "1", "yes"])
    )
    return normalized_df


def normalize_submission_history_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* coerced to the metrics-history schema.

    Missing metric columns become NaN, a missing or null timestamp becomes
    "N/A", and the activation column is made boolean. The result holds
    exactly ``LEADERBOARD_METRICS_WITH_HISTORY_COLS``, sorted by the
    timestamp string in descending order with a fresh index.
    """
    if df.empty:
        return pd.DataFrame(columns=LEADERBOARD_METRICS_WITH_HISTORY_COLS)

    normalized_df = df.copy()
    for col in LEADERBOARD_METRIC_COLS:
        if col not in normalized_df.columns:
            normalized_df[col] = float("nan")
    if "Timestamp" not in normalized_df.columns:
        normalized_df["Timestamp"] = "N/A"

    # Activation is resolved before sorting, so a defaulted "first row active"
    # refers to the order in which the rows were supplied.
    normalized_df = _normalize_activated_col(normalized_df)
    normalized_df["Timestamp"] = normalized_df["Timestamp"].fillna("N/A").astype(str)
    normalized_df = normalized_df[LEADERBOARD_METRICS_WITH_HISTORY_COLS]
    return normalized_df.sort_values("Timestamp", ascending=False).reset_index(drop=True)


def cap_submission_history_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return the normalized history limited to ``MAX_SAVED_SUBMISSIONS`` rows."""
    normalized_df = normalize_submission_history_df(df)
    return normalized_df.head(MAX_SAVED_SUBMISSIONS).reset_index(drop=True)


def write_user_metrics_history_file(
    output_dir,
    username: str,
    metrics_history_df: pd.DataFrame,
) -> Path:
    """Write the capped, normalized metrics history as CSV and return its path.

    *output_dir* is created (including parents) if it does not exist.
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    metrics_path = output_path / get_user_metrics_filename(username)
    normalized_history = cap_submission_history_df(metrics_history_df)
    normalized_history.to_csv(metrics_path, index=False)
    return metrics_path


def write_user_leaderboard_files(
    output_dir,
    username: str,
    website: str,
    notes: str,
    scores: dict[str, float],
    timestamp: str,
    activated: bool = False,
) -> tuple[Path, Path]:
    """Write the metadata and single-submission metrics CSVs for *username*.

    Returns:
        ``(metadata_path, metrics_path)`` inside *output_dir*.
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    metadata_path = output_path / get_user_metadata_filename(username)
    metrics_path = write_user_metrics_history_file(
        output_dir=output_path,
        username=username,
        metrics_history_df=build_submission_metrics_row(
            scores, timestamp, activated=activated
        ),
    )
    build_user_metadata_df(website, notes).to_csv(metadata_path, index=False)
    return metadata_path, metrics_path


def get_ground_truth(context: str = DEFAULT_LEADERBOARD_CONTEXT) -> pd.DataFrame:
    """Securely loads the hidden ground truth from the Dataset repo.

    Raises:
        ValueError: if ``HF_TOKEN`` is unset, or if downloading or parsing the
            ground-truth file fails (the original error is chained as the
            cause).
    """
    if not HF_TOKEN:
        raise ValueError("System HF_TOKEN is missing. Please configure Space Secrets.")

    ground_truth_file = get_ground_truth_file(context)
    try:
        file_path = hf_hub_download(
            repo_id=DATASET_REPO_ID,
            filename=ground_truth_file,
            repo_type="dataset",
            token=HF_TOKEN,
        )
        return pd.read_parquet(file_path)
    except Exception as e:
        # Chain explicitly so the underlying download/parse error stays visible.
        raise ValueError(f"Failed to load {ground_truth_file}: {e}") from e


def _empty_leaderboard() -> pd.DataFrame:
    """Return an empty frame with the leaderboard columns."""
    return pd.DataFrame(columns=ALL_COLUMNS)


def _fill_missing_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Ensure string and metric columns exist; modifies *df* in place.

    String columns are created as "N/A" when absent, and nulls (or the text
    "nan") are replaced with "N/A". Absent metric columns are created as NaN.
    The same frame object is returned.
    """
    for col in STRING_LEADERBOARD_COLS:
        if col not in df.columns:
            df[col] = "N/A"
        df[col] = df[col].fillna("N/A").astype(str).replace("nan", "N/A")
    for col in LEADERBOARD_METRIC_COLS:
        if col not in df.columns:
            df[col] = float("nan")
    return df


def _username_from_entry_filename(filename: str) -> str | None:
    """Return the username encoded in an entry file name, or None.

    The name must end with the metadata or metrics suffix; the suffix is
    stripped to obtain the username.
    """
    if filename.endswith(LEADERBOARD_METADATA_SUFFIX):
        return filename[:-len(LEADERBOARD_METADATA_SUFFIX)]
    if filename.endswith(LEADERBOARD_METRICS_SUFFIX):
        return filename[:-len(LEADERBOARD_METRICS_SUFFIX)]
    return None


def _get_repo_leaderboard_usernames(
    context: str = DEFAULT_LEADERBOARD_CONTEXT,
) -> list[str]:
    """Return the sorted, de-duplicated usernames with entry files in *context*."""
    api = HfApi()
    repo_files = api.list_repo_files(
        repo_id=DATASET_REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
    )
    prefix = f"{get_leaderboard_entry_dir(context)}/"

    usernames = set()
    for repo_path in repo_files:
        if not repo_path.startswith(prefix):
            continue
        username = _username_from_entry_filename(Path(repo_path).name)
        if username:
            usernames.add(username)
    return sorted(usernames)


def _read_repo_csv(repo_path: str) -> pd.DataFrame:
    """Download *repo_path* from the dataset repo and parse it as CSV."""
    file_path = hf_hub_download(
        repo_id=DATASET_REPO_ID,
        filename=repo_path,
        repo_type="dataset",
        token=HF_TOKEN,
        force_download=True,
    )
    return pd.read_csv(file_path)


def _read_repo_csv_or_empty(repo_path: str) -> pd.DataFrame:
    """Like ``_read_repo_csv`` but return an empty frame if the file is absent."""
    try:
        return _read_repo_csv(repo_path)
    except EntryNotFoundError:
        return pd.DataFrame()


def _metadata_value_or_default(value) -> str:
    """Return *value* as text, or "N/A" when it is missing (NaN/None)."""
    if pd.isna(value):
        return "N/A"
    return str(value)


def get_user_metadata(
    username: str,
    context: str = DEFAULT_LEADERBOARD_CONTEXT,
) -> dict[str, str]:
    """Return the stored ``Website`` and ``Notes`` for *username*.

    Falls back to ``EMPTY_METADATA`` when no token is configured or the
    metadata file is missing or has no rows. Values are always strings.
    """
    if not HF_TOKEN:
        return EMPTY_METADATA.copy()

    metadata_df = _read_repo_csv_or_empty(get_user_metadata_repo_path(username, context))
    if metadata_df.empty:
        return EMPTY_METADATA.copy()

    metadata_row = metadata_df.iloc[0]
    # pandas' CSV reader turns blank cells and the literal "N/A" into NaN by
    # default, so stored placeholders would otherwise come back as floats.
    return {
        "Website": _metadata_value_or_default(metadata_row.get("Website", "N/A")),
        "Notes": _metadata_value_or_default(metadata_row.get("Notes", "N/A")),
    }


def get_user_submission_history(
    username: str,
    context: str = DEFAULT_LEADERBOARD_CONTEXT,
) -> pd.DataFrame:
    """Return the normalized submission history for *username*.

    An empty frame with the history columns is returned when no token is
    configured or the user has no metrics file.
    """
    if not HF_TOKEN:
        return pd.DataFrame(columns=LEADERBOARD_METRICS_WITH_HISTORY_COLS)

    metrics_df = _read_repo_csv_or_empty(get_user_metrics_repo_path(username, context))
    return normalize_submission_history_df(metrics_df)


def _get_active_submission_row(metrics_history_df: pd.DataFrame) -> pd.Series | None:
    """Return the first row flagged as activated, or None if there is none."""
    if metrics_history_df.empty:
        return None

    active_rows = metrics_history_df[metrics_history_df[ACTIVATED_COL]]
    if active_rows.empty:
        return None
    return active_rows.iloc[0]


def _assemble_leaderboard_row(
    username: str,
    metadata: dict[str, str],
    active_metrics_row,
) -> dict:
    """Merge user metadata and the active metrics row into one leaderboard row."""
    row = {
        "User": username,
        "Website": metadata.get("Website", "N/A"),
        "Notes": metadata.get("Notes", "N/A"),
        "Timestamp": active_metrics_row.get("Timestamp", "N/A"),
    }
    for col in LEADERBOARD_METRIC_COLS:
        row[col] = active_metrics_row.get(col, float("nan"))
    return row


def get_leaderboard(context: str = DEFAULT_LEADERBOARD_CONTEXT) -> pd.DataFrame:
    """Fetches the latest leaderboard from the dataset repo (unsorted).

    Users without an activated submission are skipped. Any failure while
    fetching is printed and results in an empty leaderboard rather than an
    exception.
    """
    if not HF_TOKEN:
        return _empty_leaderboard()

    try:
        usernames = _get_repo_leaderboard_usernames(context)
        if not usernames:
            return _empty_leaderboard()

        rows = []
        for username in usernames:
            metadata = get_user_metadata(username, context)
            metrics_history_df = get_user_submission_history(username, context)
            active_metrics_row = _get_active_submission_row(metrics_history_df)
            if active_metrics_row is None:
                continue
            rows.append(_assemble_leaderboard_row(username, metadata, active_metrics_row))

        if not rows:
            return _empty_leaderboard()

        df = pd.DataFrame(rows, columns=ALL_COLUMNS)
        return _fill_missing_columns(df)
    except Exception as e:
        # Deliberate best-effort boundary: callers get an empty board, not a crash.
        print(f"Fetch error: {e}")
        return _empty_leaderboard()