Spaces:
Sleeping
Sleeping
| import json | |
| import os | |
| from datetime import datetime, timezone | |
| from pathlib import Path | |
| from typing import Any, Dict, List, Tuple | |
| import gradio as gr | |
| from huggingface_hub import hf_hub_download | |
| from huggingface_hub.errors import EntryNotFoundError, HfHubHTTPError | |
# --- Runtime configuration, read once from the environment at import time ---

# Hub organisation name; only used below to build the default dataset repo id.
HF_ORG = os.environ.get("HF_ORG", "map-setup-pilot")

# Dataset repository that the results JSONL files are downloaded from.
RESULTS_DATASET_REPO = os.environ.get("HF_RESULTS_DATASET_REPO", HF_ORG + "/results")

# Access tokens; either may be empty. `_token()` decides which one is used.
HF_READ_TOKEN = os.environ.get("HF_READ_TOKEN", "")
HF_WRITE_TOKEN = os.environ.get("HF_WRITE_TOKEN", "")

# Paths, inside the dataset repository, of the two result logs.
VALIDATION_RESULTS_JSONL_PATH = "results/validation_results.jsonl"
TEST_RESULTS_JSONL_PATH = "results/test_results.jsonl"
# Canonical label names; `_normalize_distribution` produces exactly these keys.
EXPECTED_LABELS = [
    "Useful & Safe",
    "Safe but not useful",
    "Useful but unsafe",
    "Untruthful",
    "No relevant data",
]

# Column groups shared by both tables, kept in display order.
_LABEL_SHARE_COLUMNS = [
    "useful_safe",
    "safe_not_useful",
    "useful_unsafe",
    "untruthful",
    "no_relevant_data",
]
_RUN_INFO_COLUMNS = [
    "snapshot_id",
    "run_count",
    "variance",
    "confidence_interval",
    "model_revision",
]

# Headers of the per-model ranking table.
RANK_HEADERS = [
    "rank",
    "model_id",
    *_LABEL_SHARE_COLUMNS,
    "timestamp_utc",
    *_RUN_INFO_COLUMNS,
]

# Headers of the per-evaluation detail table.
DETAIL_HEADERS = [
    "timestamp_utc",
    "model_id",
    "submission_id",
    "mode",
    "status",
    *_LABEL_SHARE_COLUMNS,
    *_RUN_INFO_COLUMNS,
]
def _token() -> str:
    """Return the Hugging Face token used to download the results files.

    ``HF_READ_TOKEN`` is preferred and ``HF_WRITE_TOKEN`` is the fallback.
    Each value is stripped *before* the choice is made, so a read token that
    contains only whitespace does not hide a usable write token.

    Returns:
        The first configured token, without surrounding whitespace.

    Raises:
        RuntimeError: If neither token has any non-whitespace content.
    """
    for candidate in (HF_READ_TOKEN, HF_WRITE_TOKEN):
        cleaned = (candidate or "").strip()
        if cleaned:
            return cleaned
    raise RuntimeError("Missing HF_READ_TOKEN or HF_WRITE_TOKEN in Space secrets.")
def _read_jsonl_rows(path_in_repo: str) -> List[Dict[str, Any]]:
    """Download one JSONL file from the results dataset and parse its rows.

    Args:
        path_in_repo: Path of the JSONL file inside ``RESULTS_DATASET_REPO``.

    Returns:
        The JSON objects found in the file, in file order. Blank lines, lines
        that are not valid JSON, and JSON values that are not objects are
        skipped. An empty list is returned when the download raises
        ``EntryNotFoundError``, ``HfHubHTTPError`` or ``FileNotFoundError``.

    Raises:
        RuntimeError: Propagated from ``_token()`` when no token is set.
    """
    try:
        cached_path = hf_hub_download(
            repo_id=RESULTS_DATASET_REPO,
            filename=path_in_repo,
            repo_type="dataset",
            token=_token(),
        )
    except (EntryNotFoundError, HfHubHTTPError, FileNotFoundError):
        # A failed download is reported to the caller as "no rows".
        return []

    def _decode(fragment: str) -> Any:
        """Parse one line, yielding None for anything that is not valid JSON."""
        try:
            return json.loads(fragment)
        except json.JSONDecodeError:
            return None

    content = Path(cached_path).read_text(encoding="utf-8")
    fragments = (raw_line.strip() for raw_line in content.splitlines())
    decoded = (_decode(fragment) for fragment in fragments if fragment)
    # Only JSON objects count as rows; None (bad line) and scalars are dropped.
    return [item for item in decoded if isinstance(item, dict)]
def _parse_iso(value: Any) -> datetime:
    """Parse an ISO-8601 timestamp into a timezone-aware ``datetime``.

    Args:
        value: Timestamp text (any other value is converted with ``str``).
            A trailing ``Z`` is read as ``+00:00``.

    Returns:
        The parsed datetime; a value without an offset is tagged as UTC.
        Falsy, blank, or unparseable input yields the Unix epoch in UTC.
    """
    epoch = datetime.fromtimestamp(0, tz=timezone.utc)

    cleaned = str(value or "").strip()
    if cleaned.endswith("Z"):
        cleaned = f"{cleaned[:-1]}+00:00"

    try:
        # An empty string is rejected here as well, so it also maps to epoch.
        parsed = datetime.fromisoformat(cleaned)
    except ValueError:
        return epoch

    if parsed.tzinfo is not None:
        return parsed
    return parsed.replace(tzinfo=timezone.utc)
def _as_float(value: Any) -> float:
    """Convert *value* to a finite float, falling back to ``0.0``.

    Args:
        value: Anything ``float()`` might accept (number, numeric string, ...).

    Returns:
        ``float(value)`` when the conversion succeeds and the result is
        finite; otherwise ``0.0``. The fallback covers values ``float()``
        rejects (``TypeError``/``ValueError``), integers too large for a float
        (``OverflowError``), and NaN/infinity.
    """
    # Function-scope import so the helper does not depend on file-level imports.
    import math

    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        return 0.0
    # NaN and +/-inf are accepted by json.loads but give unusable sort keys,
    # because every ordering comparison against NaN is False.
    return number if math.isfinite(number) else 0.0
def _row_is_post_eval(row: Dict[str, Any]) -> bool:
    """Decide whether a raw result row should be shown on the leaderboard.

    A row qualifies when all of the following hold:

    * ``row["metrics"]`` is a dict whose ``labelDistribution`` is a dict;
    * ``row["leaderboard_visible"]`` is not exactly ``False``;
    * ``row["simulation"]`` is truthy, or the lower-cased, stripped status is
      one of ``completed``, ``simulated_completed``, ``published`` or
      ``official_scored``.

    Args:
        row: One parsed JSONL object.

    Returns:
        ``True`` if the row qualifies, ``False`` otherwise.
    """
    metrics = row.get("metrics") or {}
    if not isinstance(metrics, dict):
        return False
    if not isinstance(metrics.get("labelDistribution"), dict):
        return False

    # Only an explicit False hides a row; a missing or None flag does not.
    if row.get("leaderboard_visible") is False:
        return False

    if row.get("simulation"):
        return True

    accepted_statuses = {"completed", "simulated_completed", "published", "official_scored"}
    normalized_status = str(row.get("status") or "").strip().lower()
    return normalized_status in accepted_statuses
def _ci_to_text(value: Any) -> str:
    """Render a confidence-interval value as text for a table cell.

    Args:
        value: ``None``, a scalar, or any JSON-serializable structure.

    Returns:
        ``""`` for ``None``; ``str(value)`` for ``str``/``int``/``float``
        (which includes ``bool``); otherwise the JSON encoding of *value*
        with non-ASCII characters left unescaped.
    """
    if value is None:
        return ""
    is_scalar = isinstance(value, (str, int, float))
    return str(value) if is_scalar else json.dumps(value, ensure_ascii=False)
def _normalize_distribution(raw_dist: Dict[str, Any]) -> Dict[str, float]:
    """Map a raw label distribution onto the canonical label names.

    Args:
        raw_dist: Mapping from label spelling to value; a falsy argument is
            treated as an empty mapping.

    Returns:
        A dict with one entry per name in ``EXPECTED_LABELS``, starting at
        ``0.0``. Keys are stripped and matched against the recognised
        spellings below; values for spellings of the same label are summed
        after conversion with ``_as_float``. Unrecognised keys are ignored.
    """
    # Canonical label -> every spelling accepted for it.
    spellings = {
        "Useful & Safe": ("Useful & Safe", "Useful&Safe"),
        "Safe but not useful": ("Safe but not useful", "SafeNotUseful"),
        "Useful but unsafe": ("Useful but unsafe", "UsefulUnsafe"),
        "Untruthful": ("Untruthful",),
        "No relevant data": ("No relevant data", "NoRelevantData"),
    }
    canonical_for = {
        spelling: label
        for label, accepted in spellings.items()
        for spelling in accepted
    }

    totals = dict.fromkeys(EXPECTED_LABELS, 0.0)
    for raw_key, raw_value in (raw_dist or {}).items():
        label = canonical_for.get(str(raw_key).strip())
        if label not in totals:
            continue
        totals[label] += _as_float(raw_value)
    return totals
def _extract_record(row: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten one raw result row into the record shape used by the tables.

    Args:
        row: One parsed JSONL object.

    Returns:
        A dict holding every ``DETAIL_HEADERS`` column. Text fields are
        ``""`` when missing or falsy; ``model_id`` falls back to
        ``model_identifier`` and then ``system_name``; the five label columns
        come from ``_normalize_distribution``; ``run_count`` and ``variance``
        go through ``_as_float``; ``confidence_interval`` goes through
        ``_ci_to_text``.
    """
    metrics = row.get("metrics") or {}
    shares = _normalize_distribution(metrics.get("labelDistribution") or {})

    def text(field: str) -> str:
        """Return ``row[field]`` as a string, or "" when missing or falsy."""
        return str(row.get(field) or "")

    model_name = (
        row.get("model_id")
        or row.get("model_identifier")
        or row.get("system_name")
        or ""
    )

    record: Dict[str, Any] = {}
    record["timestamp_utc"] = text("timestamp_utc")
    record["model_id"] = str(model_name)
    record["submission_id"] = text("submission_id")
    record["mode"] = text("mode")
    record["status"] = text("status")
    record["useful_safe"] = shares["Useful & Safe"]
    record["safe_not_useful"] = shares["Safe but not useful"]
    record["useful_unsafe"] = shares["Useful but unsafe"]
    record["untruthful"] = shares["Untruthful"]
    record["no_relevant_data"] = shares["No relevant data"]
    record["snapshot_id"] = text("snapshot_id")
    record["run_count"] = _as_float(row.get("run_count"))
    record["variance"] = _as_float(row.get("variance"))
    record["confidence_interval"] = _ci_to_text(row.get("confidence_interval"))
    record["model_revision"] = text("model_revision")
    return record
def _to_records(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Convert raw rows into display records, newest first.

    Args:
        rows: Parsed JSONL objects.

    Returns:
        One record (see ``_extract_record``) per row accepted by
        ``_row_is_post_eval``, sorted by parsed ``timestamp_utc`` in
        descending order. The sort is stable, so rows with equal timestamps
        keep their input order.
    """
    eligible = [_extract_record(raw) for raw in rows if _row_is_post_eval(raw)]
    return sorted(
        eligible,
        key=lambda record: _parse_iso(record["timestamp_utc"]),
        reverse=True,
    )
def _latest_record_per_model(records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Keep only the most recent record of each model.

    Args:
        records: Records as produced by ``_extract_record``.

    Returns:
        One record per non-empty ``model_id``, ordered by the first
        appearance of each model in *records*. When two records of a model
        have the same parsed timestamp, the one later in *records* wins.
    """
    # model_id -> (parsed timestamp, record) of the current winner.
    newest: Dict[str, Tuple[Any, Dict[str, Any]]] = {}
    for candidate in records:
        model_key = candidate["model_id"]
        if not model_key:
            continue
        stamp = _parse_iso(candidate["timestamp_utc"])
        held = newest.get(model_key)
        if held is not None and stamp < held[0]:
            # Strictly older than the current winner: keep the winner.
            continue
        newest[model_key] = (stamp, candidate)
    return [record for _, record in newest.values()]
def _rank_records(records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Order records for the leaderboard and attach a 1-based ``rank``.

    Sort order, most significant first: ``useful_safe`` descending,
    ``untruthful`` ascending, ``useful_unsafe`` ascending,
    ``safe_not_useful`` descending, timestamp descending (newest first),
    then ``model_id`` ascending.

    Args:
        records: Records as produced by ``_extract_record``.

    Returns:
        Shallow copies of the records in ranked order, each with an added
        ``rank`` key; the input dicts are not modified.
    """

    def ordering(record: Dict[str, Any]) -> Tuple[Any, ...]:
        """Build the sort key; negated fields sort in descending order."""
        return (
            -record["useful_safe"],
            record["untruthful"],
            record["useful_unsafe"],
            -record["safe_not_useful"],
            -_parse_iso(record["timestamp_utc"]).timestamp(),
            record["model_id"],
        )

    ordered = sorted(records, key=ordering)
    return [
        {**record, "rank": position}
        for position, record in enumerate(ordered, start=1)
    ]
def _rank_table_rows(ranked_rows: List[Dict[str, Any]]) -> List[List[Any]]:
    """Turn ranked records into row lists for the ranking table.

    Args:
        ranked_rows: Records that carry a ``rank`` key (see ``_rank_records``).

    Returns:
        One list per record, with cells in the column order listed below.

    Raises:
        KeyError: If a record lacks one of the columns.
    """
    # Cell order of the ranking table; keep in step with RANK_HEADERS.
    columns = (
        "rank",
        "model_id",
        "useful_safe",
        "safe_not_useful",
        "useful_unsafe",
        "untruthful",
        "no_relevant_data",
        "timestamp_utc",
        "snapshot_id",
        "run_count",
        "variance",
        "confidence_interval",
        "model_revision",
    )
    return [[entry[column] for column in columns] for entry in ranked_rows]
def _detail_table_rows(records: List[Dict[str, Any]]) -> List[List[Any]]:
    """Turn records into row lists for the detail table.

    Args:
        records: Records as produced by ``_extract_record``.

    Returns:
        One list per record, with cells in the column order listed below.

    Raises:
        KeyError: If a record lacks one of the columns.
    """
    # Cell order of the detail table; keep in step with DETAIL_HEADERS.
    columns = (
        "timestamp_utc",
        "model_id",
        "submission_id",
        "mode",
        "status",
        "useful_safe",
        "safe_not_useful",
        "useful_unsafe",
        "untruthful",
        "no_relevant_data",
        "snapshot_id",
        "run_count",
        "variance",
        "confidence_interval",
        "model_revision",
    )
    return [[entry[column] for column in columns] for entry in records]
def refresh_leaderboard() -> Tuple[str, List[List[Any]], List[List[Any]], List[List[Any]], List[List[Any]]]:
    """Reload both result files and build everything the UI displays.

    Returns:
        A 5-tuple ``(summary_markdown, validation_rank_rows,
        validation_detail_rows, test_rank_rows, test_detail_rows)``.
        If reading either file raises, the summary is an error message that
        names the exception type and all four tables are empty.
    """
    try:
        # Validation is read first, then test.
        raw_by_split = [
            _read_jsonl_rows(path)
            for path in (VALIDATION_RESULTS_JSONL_PATH, TEST_RESULTS_JSONL_PATH)
        ]
    except Exception as exc:
        # Only the exception type is shown, not its message.
        failure = f"### Error Loading Dataset\n`{type(exc).__name__}` while reading `{RESULTS_DATASET_REPO}`."
        return (failure, [], [], [], [])

    validation_records, test_records = (_to_records(raw) for raw in raw_by_split)

    # The ranking uses only the latest record of each model.
    validation_ranked = _rank_records(_latest_record_per_model(validation_records))
    test_ranked = _rank_records(_latest_record_per_model(test_records))

    summary_lines = [
        "### MAP Pilot Leaderboard",
        f"- Dataset: `{RESULTS_DATASET_REPO}`",
        f"- Validation/dev rows displayed: **{len(validation_records)}** across **{len(validation_ranked)}** models",
        f"- Official/private-test rows displayed: **{len(test_records)}** across **{len(test_ranked)}** models",
        "- Ranking order: Useful & Safe desc, then Untruthful asc, then Useful but unsafe asc.",
    ]

    return (
        "\n".join(summary_lines),
        _rank_table_rows(validation_ranked),
        _detail_table_rows(validation_records),
        _rank_table_rows(test_ranked),
        _detail_table_rows(test_records),
    )
# Column types for the two kinds of table, aligned position-by-position with
# RANK_HEADERS and DETAIL_HEADERS respectively.
_RANK_COLUMN_TYPES = [
    "number", "str",
    "number", "number", "number", "number", "number",
    "str", "str",
    "number", "number",
    "str", "str",
]
_DETAIL_COLUMN_TYPES = [
    "str", "str", "str", "str", "str",
    "number", "number", "number", "number", "number",
    "str",
    "number", "number",
    "str", "str",
]


def _results_tab(tab_title):
    """Create one tab holding a ranking table above a detail table.

    Must be called while a ``gr.Blocks`` context is open. Returns the pair
    ``(rank_table, detail_table)``.
    """
    with gr.Tab(tab_title):
        rank_table = gr.Dataframe(
            headers=RANK_HEADERS,
            datatype=list(_RANK_COLUMN_TYPES),
            value=[],
            interactive=False,
            label="Model Ranking (latest run per model)",
        )
        detail_table = gr.Dataframe(
            headers=DETAIL_HEADERS,
            datatype=list(_DETAIL_COLUMN_TYPES),
            value=[],
            interactive=False,
            label="Recent Evaluation Rows",
        )
    return rank_table, detail_table


with gr.Blocks(title="MAP Pilot Leaderboard") as demo:
    gr.Markdown("# MAP Pilot Leaderboard")
    gr.Markdown("Latest standings for the first iteration of the MAP challenge")
    refresh_button = gr.Button("Refresh")
    summary_box = gr.Markdown()

    validation_rank_df, validation_detail_df = _results_tab("Validation / Dev")
    test_rank_df, test_detail_df = _results_tab("Official / Private Test")

    def _refresh_wiring():
        """Build fresh keyword arguments for one refresh event listener.

        The output order matches the tuple returned by ``refresh_leaderboard``.
        """
        return {
            "fn": refresh_leaderboard,
            "inputs": [],
            "outputs": [
                summary_box,
                validation_rank_df,
                validation_detail_df,
                test_rank_df,
                test_detail_df,
            ],
            "queue": False,
        }

    # The same refresh runs on button click and once when the page loads.
    refresh_button.click(**_refresh_wiring())
    demo.load(**_refresh_wiring())
if __name__ == "__main__":
    # The port comes from the PORT environment variable, defaulting to 7860;
    # the "0.0.0.0" host makes the server listen on all network interfaces.
    listen_port = int(os.getenv("PORT", "7860"))
    demo.launch(server_name="0.0.0.0", server_port=listen_port)