"""Gradio app that renders the MAP pilot leaderboard.

Evaluation results are read from two JSONL files stored in a Hugging Face
dataset repository (one for validation/dev runs, one for official/private
test runs).  Rows that represent finished evaluations are flattened into
display records, the latest record per model is ranked, and both the ranking
and the full list of recent rows are shown in Gradio dataframes.
"""

import json
import logging
import math
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Tuple

import gradio as gr
from huggingface_hub import hf_hub_download
from huggingface_hub.errors import EntryNotFoundError, HfHubHTTPError

logger = logging.getLogger(__name__)

HF_ORG = os.getenv("HF_ORG", "map-setup-pilot")
RESULTS_DATASET_REPO = os.getenv("HF_RESULTS_DATASET_REPO", f"{HF_ORG}/results")
HF_READ_TOKEN = os.getenv("HF_READ_TOKEN", "")
HF_WRITE_TOKEN = os.getenv("HF_WRITE_TOKEN", "")

VALIDATION_RESULTS_JSONL_PATH = "results/validation_results.jsonl"
TEST_RESULTS_JSONL_PATH = "results/test_results.jsonl"

# Canonical label names of the evaluation's label distribution.
EXPECTED_LABELS = [
    "Useful & Safe",
    "Safe but not useful",
    "Useful but unsafe",
    "Untruthful",
    "No relevant data",
]

# Column order of the ranking table; also the key order used to build rows.
RANK_HEADERS = [
    "rank",
    "model_id",
    "useful_safe",
    "safe_not_useful",
    "useful_unsafe",
    "untruthful",
    "no_relevant_data",
    "timestamp_utc",
    "snapshot_id",
    "run_count",
    "variance",
    "confidence_interval",
    "model_revision",
]

# Column order of the detail table; also the key order used to build rows.
DETAIL_HEADERS = [
    "timestamp_utc",
    "model_id",
    "submission_id",
    "mode",
    "status",
    "useful_safe",
    "safe_not_useful",
    "useful_unsafe",
    "untruthful",
    "no_relevant_data",
    "snapshot_id",
    "run_count",
    "variance",
    "confidence_interval",
    "model_revision",
]

# Columns rendered with the "number" datatype; every other column is "str".
_NUMERIC_COLUMNS = frozenset(
    {
        "rank",
        "useful_safe",
        "safe_not_useful",
        "useful_unsafe",
        "untruthful",
        "no_relevant_data",
        "run_count",
        "variance",
    }
)

# Accepted spellings of each label, mapped to the canonical EXPECTED_LABELS name.
_LABEL_ALIASES = {
    "Useful & Safe": "Useful & Safe",
    "Useful&Safe": "Useful & Safe",
    "Safe but not useful": "Safe but not useful",
    "SafeNotUseful": "Safe but not useful",
    "Useful but unsafe": "Useful but unsafe",
    "UsefulUnsafe": "Useful but unsafe",
    "Untruthful": "Untruthful",
    "No relevant data": "No relevant data",
    "NoRelevantData": "No relevant data",
}

# Row statuses (lower-cased) that count as a finished evaluation.
_POST_EVAL_STATUSES = frozenset(
    {"completed", "simulated_completed", "published", "official_scored"}
)

# Sentinel timestamp for missing or unparseable values; sorts before real data.
_EPOCH = datetime.fromtimestamp(0, tz=timezone.utc)


def _column_datatypes(headers: List[str]) -> List[str]:
    """Return the Gradio datatype ("number" or "str") for each header."""
    return ["number" if name in _NUMERIC_COLUMNS else "str" for name in headers]


def _token() -> str:
    """Return the first non-blank token, preferring the read token.

    Each candidate is stripped before it is tested, so a whitespace-only
    ``HF_READ_TOKEN`` no longer hides a valid ``HF_WRITE_TOKEN``.

    Raises:
        RuntimeError: If both tokens are empty or blank.
    """
    for candidate in (HF_READ_TOKEN, HF_WRITE_TOKEN):
        token = (candidate or "").strip()
        if token:
            return token
    raise RuntimeError("Missing HF_READ_TOKEN or HF_WRITE_TOKEN in Space secrets.")


def _read_jsonl_rows(path_in_repo: str) -> List[Dict[str, Any]]:
    """Download a JSONL file from the results dataset and parse it.

    Args:
        path_in_repo: Path of the file inside ``RESULTS_DATASET_REPO``.

    Returns:
        The JSON objects found in the file, in file order.  Blank lines,
        malformed lines and non-object values are skipped.  An empty list is
        returned when the file cannot be downloaded.

    Raises:
        RuntimeError: If no access token is configured (from ``_token``).
    """
    try:
        local_file = hf_hub_download(
            repo_id=RESULTS_DATASET_REPO,
            filename=path_in_repo,
            repo_type="dataset",
            token=_token(),
        )
    except (EntryNotFoundError, HfHubHTTPError, FileNotFoundError) as exc:
        # Deliberately best-effort: a missing file renders as an empty table.
        # NOTE(review): other hub HTTP errors also end up here and render as
        # an empty leaderboard; the log line is the only trace of them.
        logger.warning(
            "Could not download %s from %s: %s",
            path_in_repo,
            RESULTS_DATASET_REPO,
            type(exc).__name__,
        )
        return []

    rows: List[Dict[str, Any]] = []
    skipped = 0
    # Iterate the file instead of using str.splitlines(): splitlines() also
    # breaks on U+2028/U+2029/U+0085, which may appear unescaped inside JSON
    # strings and would cut a valid row in two.
    with Path(local_file).open(encoding="utf-8") as handle:
        for line in handle:
            text = line.strip()
            if not text:
                continue
            try:
                parsed = json.loads(text)
            except json.JSONDecodeError:
                skipped += 1
                continue
            if isinstance(parsed, dict):
                rows.append(parsed)
            else:
                skipped += 1
    if skipped:
        logger.warning("Skipped %d malformed line(s) in %s", skipped, path_in_repo)
    return rows


def _parse_iso(value: Any) -> datetime:
    """Parse an ISO-8601 timestamp into a timezone-aware ``datetime``.

    A trailing ``Z`` is accepted as UTC and naive timestamps are treated as
    UTC.  Empty or unparseable input yields the Unix epoch so that such rows
    sort last when ordering newest-first.
    """
    text = str(value or "").strip()
    if not text:
        return _EPOCH
    if text.endswith("Z"):
        # datetime.fromisoformat() only accepts "Z" from Python 3.11 onwards.
        text = text[:-1] + "+00:00"
    try:
        parsed = datetime.fromisoformat(text)
    except ValueError:
        return _EPOCH
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


def _as_float(value: Any) -> float:
    """Coerce ``value`` to a finite float, falling back to ``0.0``.

    NaN and infinities are mapped to ``0.0`` as well: they would make the
    ranking sort key inconsistent.
    """
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        return 0.0
    return number if math.isfinite(number) else 0.0


def _row_is_post_eval(row: Dict[str, Any]) -> bool:
    """Tell whether a raw row is a finished evaluation that may be displayed.

    A row qualifies when it carries a ``metrics.labelDistribution`` dict, is
    not explicitly hidden (``leaderboard_visible`` is not ``False``), and
    either has one of the finished statuses or a truthy ``simulation`` field.
    """
    metrics = row.get("metrics")
    if not isinstance(metrics, dict):
        return False
    if not isinstance(metrics.get("labelDistribution"), dict):
        return False
    if row.get("leaderboard_visible") is False:
        return False
    status = str(row.get("status") or "").strip().lower()
    return status in _POST_EVAL_STATUSES or bool(row.get("simulation"))


def _ci_to_text(value: Any) -> str:
    """Render a confidence-interval value as text for a string column.

    ``None`` becomes the empty string, scalars are passed through ``str``,
    and anything else (for example a list of bounds) is JSON-encoded.
    """
    if value is None:
        return ""
    if isinstance(value, (str, int, float)):
        return str(value)
    return json.dumps(value, ensure_ascii=False)


def _normalize_distribution(raw_dist: Dict[str, Any]) -> Dict[str, float]:
    """Map a raw label distribution onto the canonical label names.

    Known aliases are folded into their canonical label (values for several
    aliases of one label are summed); unknown keys are ignored.  Every label
    in ``EXPECTED_LABELS`` is present in the result, defaulting to ``0.0``.
    A value that is not a dict is treated as an empty distribution.
    """
    out = {label: 0.0 for label in EXPECTED_LABELS}
    if not isinstance(raw_dist, dict):
        return out
    for key, value in raw_dist.items():
        canonical = _LABEL_ALIASES.get(str(key).strip())
        if canonical in out:
            out[canonical] += _as_float(value)
    return out


def _extract_record(row: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten a raw result row into the record shape used by both tables.

    Missing text fields become empty strings and missing numeric fields
    become ``0.0``.  ``model_id`` falls back to ``model_identifier`` and then
    ``system_name``.
    """
    metrics = row.get("metrics")
    raw_dist = metrics.get("labelDistribution") if isinstance(metrics, dict) else None
    dist = _normalize_distribution(raw_dist)
    model_id = row.get("model_id") or row.get("model_identifier") or row.get("system_name")
    return {
        "timestamp_utc": str(row.get("timestamp_utc") or ""),
        "model_id": str(model_id or ""),
        "submission_id": str(row.get("submission_id") or ""),
        "mode": str(row.get("mode") or ""),
        "status": str(row.get("status") or ""),
        "useful_safe": dist["Useful & Safe"],
        "safe_not_useful": dist["Safe but not useful"],
        "useful_unsafe": dist["Useful but unsafe"],
        "untruthful": dist["Untruthful"],
        "no_relevant_data": dist["No relevant data"],
        "snapshot_id": str(row.get("snapshot_id") or ""),
        "run_count": _as_float(row.get("run_count")),
        "variance": _as_float(row.get("variance")),
        "confidence_interval": _ci_to_text(row.get("confidence_interval")),
        "model_revision": str(row.get("model_revision") or ""),
    }


def _to_records(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Convert displayable raw rows to records, ordered newest first."""
    records = [_extract_record(row) for row in rows if _row_is_post_eval(row)]
    records.sort(key=lambda rec: _parse_iso(rec["timestamp_utc"]), reverse=True)
    return records


def _latest_record_per_model(records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Keep only the most recent record of each model.

    Records without a ``model_id`` are dropped.  When two records of a model
    share a timestamp, the one appearing later in ``records`` wins.  Models
    keep the order in which they first appear in ``records``.
    """
    latest: Dict[str, Tuple[datetime, Dict[str, Any]]] = {}
    for rec in records:
        model_id = rec["model_id"]
        if not model_id:
            continue
        stamp = _parse_iso(rec["timestamp_utc"])
        incumbent = latest.get(model_id)
        # The parsed timestamp is stored next to the record so that the
        # incumbent does not have to be re-parsed on every comparison.
        if incumbent is None or stamp >= incumbent[0]:
            latest[model_id] = (stamp, rec)
    return [rec for _, rec in latest.values()]


def _rank_records(records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Order records for the leaderboard and attach a 1-based ``rank``.

    Sort keys, in order: ``useful_safe`` descending, ``untruthful``
    ascending, ``useful_unsafe`` ascending, ``safe_not_useful`` descending,
    newest timestamp first, then ``model_id`` ascending.  The input records
    are not mutated; ranked copies are returned.
    """
    ranked = sorted(
        records,
        key=lambda rec: (
            -rec["useful_safe"],
            rec["untruthful"],
            rec["useful_unsafe"],
            -rec["safe_not_useful"],
            -_parse_iso(rec["timestamp_utc"]).timestamp(),
            rec["model_id"],
        ),
    )
    return [dict(rec, rank=position) for position, rec in enumerate(ranked, start=1)]


def _rank_table_rows(ranked_rows: List[Dict[str, Any]]) -> List[List[Any]]:
    """Lay out ranked records as table rows in ``RANK_HEADERS`` column order."""
    return [[row[column] for column in RANK_HEADERS] for row in ranked_rows]


def _detail_table_rows(records: List[Dict[str, Any]]) -> List[List[Any]]:
    """Lay out records as table rows in ``DETAIL_HEADERS`` column order."""
    return [[row[column] for column in DETAIL_HEADERS] for row in records]


def refresh_leaderboard() -> Tuple[str, List[List[Any]], List[List[Any]], List[List[Any]], List[List[Any]]]:
    """Reload both result files and build everything the UI displays.

    Returns:
        A 5-tuple: summary markdown, validation ranking rows, validation
        detail rows, test ranking rows, test detail rows.  If loading fails,
        the summary holds an error message naming the exception type and all
        four tables are empty.
    """
    try:
        validation_raw_rows = _read_jsonl_rows(VALIDATION_RESULTS_JSONL_PATH)
        test_raw_rows = _read_jsonl_rows(TEST_RESULTS_JSONL_PATH)
    except Exception as exc:
        # UI boundary: never let a load failure break the page.  Only the
        # exception type is shown to users; the traceback goes to the log.
        logger.exception("Failed to load results from %s", RESULTS_DATASET_REPO)
        return (
            f"### Error Loading Dataset\n`{type(exc).__name__}` while reading `{RESULTS_DATASET_REPO}`.",
            [],
            [],
            [],
            [],
        )

    validation_records = _to_records(validation_raw_rows)
    test_records = _to_records(test_raw_rows)
    validation_ranked = _rank_records(_latest_record_per_model(validation_records))
    test_ranked = _rank_records(_latest_record_per_model(test_records))

    summary = (
        "### MAP Pilot Leaderboard\n"
        f"- Dataset: `{RESULTS_DATASET_REPO}`\n"
        f"- Validation/dev rows displayed: **{len(validation_records)}** across **{len(validation_ranked)}** models\n"
        f"- Official/private-test rows displayed: **{len(test_records)}** across **{len(test_ranked)}** models\n"
        "- Ranking order: Useful & Safe desc, then Untruthful asc, then Useful but unsafe asc."
    )
    return (
        summary,
        _rank_table_rows(validation_ranked),
        _detail_table_rows(validation_records),
        _rank_table_rows(test_ranked),
        _detail_table_rows(test_records),
    )


with gr.Blocks(title="MAP Pilot Leaderboard") as demo:
    gr.Markdown("# MAP Pilot Leaderboard")
    gr.Markdown("Latest standings for the first iteration of the MAP challenge")
    refresh_button = gr.Button("Refresh")
    summary_box = gr.Markdown()

    with gr.Tab("Validation / Dev"):
        validation_rank_df = gr.Dataframe(
            headers=RANK_HEADERS,
            datatype=_column_datatypes(RANK_HEADERS),
            value=[],
            interactive=False,
            label="Model Ranking (latest run per model)",
        )
        validation_detail_df = gr.Dataframe(
            headers=DETAIL_HEADERS,
            datatype=_column_datatypes(DETAIL_HEADERS),
            value=[],
            interactive=False,
            label="Recent Evaluation Rows",
        )

    with gr.Tab("Official / Private Test"):
        test_rank_df = gr.Dataframe(
            headers=RANK_HEADERS,
            datatype=_column_datatypes(RANK_HEADERS),
            value=[],
            interactive=False,
            label="Model Ranking (latest run per model)",
        )
        test_detail_df = gr.Dataframe(
            headers=DETAIL_HEADERS,
            datatype=_column_datatypes(DETAIL_HEADERS),
            value=[],
            interactive=False,
            label="Recent Evaluation Rows",
        )

    # Same output order as the tuple returned by refresh_leaderboard().
    leaderboard_outputs = [
        summary_box,
        validation_rank_df,
        validation_detail_df,
        test_rank_df,
        test_detail_df,
    ]

    refresh_button.click(
        fn=refresh_leaderboard,
        inputs=[],
        outputs=leaderboard_outputs,
        queue=False,
    )
    # Populate the tables on initial page load as well.
    demo.load(
        fn=refresh_leaderboard,
        inputs=[],
        outputs=leaderboard_outputs,
        queue=False,
    )


if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))