leaderboard / app.py
jason-res's picture
Update app.py
052cc75 verified
Raw
History Blame Contribute Delete
11.6 kB
import json
import logging
import math
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Tuple

import gradio as gr
from huggingface_hub import hf_hub_download
from huggingface_hub.errors import EntryNotFoundError, HfHubHTTPError
# Hugging Face organisation name; overridable through the HF_ORG environment variable.
HF_ORG = os.getenv("HF_ORG", "map-setup-pilot")
# Dataset repo the result files are downloaded from; defaults to "<HF_ORG>/results".
RESULTS_DATASET_REPO = os.getenv("HF_RESULTS_DATASET_REPO", f"{HF_ORG}/results")
# Access tokens taken from the environment; each is an empty string when unset.
HF_READ_TOKEN = os.getenv("HF_READ_TOKEN", "")
HF_WRITE_TOKEN = os.getenv("HF_WRITE_TOKEN", "")
# Paths of the two JSONL result files inside the dataset repo.
VALIDATION_RESULTS_JSONL_PATH = "results/validation_results.jsonl"
TEST_RESULTS_JSONL_PATH = "results/test_results.jsonl"
# Canonical label names of a label distribution; _normalize_distribution
# returns exactly one float per entry of this list.
EXPECTED_LABELS = [
"Useful & Safe",
"Safe but not useful",
"Useful but unsafe",
"Untruthful",
"No relevant data",
]
# Column headers of the ranking tables. The order must stay aligned with the
# row layout built by _rank_table_rows and with the `datatype` lists passed
# to the ranking Dataframes.
RANK_HEADERS = [
"rank",
"model_id",
"useful_safe",
"safe_not_useful",
"useful_unsafe",
"untruthful",
"no_relevant_data",
"timestamp_utc",
"snapshot_id",
"run_count",
"variance",
"confidence_interval",
"model_revision",
]
# Column headers of the per-evaluation detail tables. The order must stay
# aligned with the row layout built by _detail_table_rows and with the
# `datatype` lists passed to the detail Dataframes.
DETAIL_HEADERS = [
"timestamp_utc",
"model_id",
"submission_id",
"mode",
"status",
"useful_safe",
"safe_not_useful",
"useful_unsafe",
"untruthful",
"no_relevant_data",
"snapshot_id",
"run_count",
"variance",
"confidence_interval",
"model_revision",
]
def _token() -> str:
    """Return the Hugging Face token used to download the results dataset.

    ``HF_READ_TOKEN`` is preferred and ``HF_WRITE_TOKEN`` is the fallback.
    Each value is stripped *before* the choice is made, so a blank or
    whitespace-only read token does not hide a usable write token.

    Returns:
        The selected token without surrounding whitespace.

    Raises:
        RuntimeError: If neither variable holds a non-blank value.
    """
    token = HF_READ_TOKEN.strip() or HF_WRITE_TOKEN.strip()
    if not token:
        raise RuntimeError("Missing HF_READ_TOKEN or HF_WRITE_TOKEN in Space secrets.")
    return token
def _read_jsonl_rows(path_in_repo: str) -> List[Dict[str, Any]]:
    """Download a JSONL file from the results dataset and parse it into dicts.

    Args:
        path_in_repo: Path of the JSONL file inside ``RESULTS_DATASET_REPO``.

    Returns:
        One dict per line that holds a JSON object. Blank lines, lines that
        are not valid JSON and JSON values that are not objects are skipped.
        An empty list is returned when the file cannot be downloaded.

    Raises:
        RuntimeError: Propagated from ``_token`` when no token is configured.
    """
    try:
        local_file = hf_hub_download(
            repo_id=RESULTS_DATASET_REPO,
            filename=path_in_repo,
            repo_type="dataset",
            token=_token(),
        )
    except (EntryNotFoundError, HfHubHTTPError, FileNotFoundError) as exc:
        # Best effort: an unavailable file renders as an empty table. Log the
        # failure so an auth or network problem can be told apart from
        # "no results yet"; only the exception type is logged.
        logging.getLogger(__name__).warning(
            "Could not download %s from %s (%s); treating it as empty.",
            path_in_repo,
            RESULTS_DATASET_REPO,
            type(exc).__name__,
        )
        return []

    rows: List[Dict[str, Any]] = []
    # Iterate the file so that only real newlines end a record.
    # str.splitlines() would also split on U+0085, U+2028 and U+2029, which
    # JSON allows unescaped inside string values, tearing such rows apart.
    with Path(local_file).open(encoding="utf-8") as handle:
        for line in handle:
            text = line.strip()
            if not text:
                continue
            try:
                parsed = json.loads(text)
            except json.JSONDecodeError:
                # Tolerate a corrupt line instead of losing the whole file.
                continue
            if isinstance(parsed, dict):
                rows.append(parsed)
    return rows
def _parse_iso(value: Any) -> datetime:
    """Parse an ISO-8601 timestamp into a timezone-aware ``datetime``.

    A trailing ``Z`` is read as ``+00:00`` and a timestamp without offset is
    taken to be UTC. Empty, missing or unparseable input yields the Unix
    epoch in UTC.
    """
    epoch = datetime.fromtimestamp(0, tz=timezone.utc)
    raw = str(value or "").strip()
    if not raw:
        return epoch

    iso_text = f"{raw[:-1]}+00:00" if raw.endswith("Z") else raw
    try:
        parsed = datetime.fromisoformat(iso_text)
    except ValueError:
        return epoch

    if parsed.tzinfo is not None:
        return parsed
    return parsed.replace(tzinfo=timezone.utc)
def _as_float(value: Any) -> float:
    """Coerce ``value`` to a finite float, falling back to ``0.0``.

    The fallback applies when the value cannot be converted (wrong type,
    non-numeric text), when an integer is too large for a float
    (``OverflowError``), and when the conversion yields NaN or an infinity.
    Non-finite values are rejected because NaN does not order consistently
    when it ends up inside a sort key.
    """
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        return 0.0
    return number if math.isfinite(number) else 0.0
def _row_is_post_eval(row: Dict[str, Any]) -> bool:
    """Tell whether a raw result row should appear on the leaderboard.

    A row qualifies when its ``metrics`` is a dict holding a dict under
    ``labelDistribution``, ``leaderboard_visible`` is not explicitly
    ``False``, and either ``simulation`` is truthy or the status
    (case-insensitive, surrounding whitespace ignored) is a finished one.
    """
    metrics = row.get("metrics") or {}
    if not isinstance(metrics, dict):
        return False
    if not isinstance(metrics.get("labelDistribution"), dict):
        return False

    # Only an explicit False hides a row; a missing or None flag does not.
    if row.get("leaderboard_visible") is False:
        return False

    if row.get("simulation"):
        return True

    finished_statuses = ("completed", "simulated_completed", "published", "official_scored")
    normalized_status = str(row.get("status") or "").strip().lower()
    return normalized_status in finished_statuses
def _ci_to_text(value: Any) -> str:
    """Render a confidence-interval value as text for a table cell.

    ``None`` becomes an empty string, strings and numbers go through
    ``str``, and anything else is serialized as JSON without ASCII escaping.
    """
    if value is None:
        return ""
    if not isinstance(value, (str, int, float)):
        return json.dumps(value, ensure_ascii=False)
    return str(value)
def _normalize_distribution(raw_dist: Dict[str, Any]) -> Dict[str, float]:
    """Fold a raw label distribution onto the canonical label names.

    Keys are stripped and matched against the accepted spellings of each
    label; values of keys that resolve to the same label are summed after
    conversion with ``_as_float``. Unrecognised keys are ignored and every
    label of ``EXPECTED_LABELS`` is present in the result, ``0.0`` when
    absent from the input.
    """
    accepted_spellings = (
        ("Useful & Safe", ("Useful & Safe", "Useful&Safe")),
        ("Safe but not useful", ("Safe but not useful", "SafeNotUseful")),
        ("Useful but unsafe", ("Useful but unsafe", "UsefulUnsafe")),
        ("Untruthful", ("Untruthful",)),
        ("No relevant data", ("No relevant data", "NoRelevantData")),
    )
    canonical_for = {
        spelling: label
        for label, spellings in accepted_spellings
        for spelling in spellings
    }

    totals = dict.fromkeys(EXPECTED_LABELS, 0.0)
    for raw_key, raw_value in (raw_dist or {}).items():
        label = canonical_for.get(str(raw_key).strip())
        if label not in totals:
            continue
        totals[label] += _as_float(raw_value)
    return totals
def _extract_record(row: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten a raw result row into the record shape used by the tables.

    Text fields default to an empty string, the label distribution is
    normalized into one float per label, and ``run_count`` / ``variance``
    are coerced with ``_as_float``. The model name comes from the first
    non-empty of ``model_id``, ``model_identifier`` and ``system_name``.
    """

    def text(key: str) -> str:
        # Missing or falsy values are shown as an empty cell.
        return str(row.get(key) or "")

    metrics = row.get("metrics") or {}
    shares = _normalize_distribution(metrics.get("labelDistribution") or {})
    model_name = (
        row.get("model_id")
        or row.get("model_identifier")
        or row.get("system_name")
        or ""
    )

    return {
        "timestamp_utc": text("timestamp_utc"),
        "model_id": str(model_name),
        "submission_id": text("submission_id"),
        "mode": text("mode"),
        "status": text("status"),
        "useful_safe": shares["Useful & Safe"],
        "safe_not_useful": shares["Safe but not useful"],
        "useful_unsafe": shares["Useful but unsafe"],
        "untruthful": shares["Untruthful"],
        "no_relevant_data": shares["No relevant data"],
        "snapshot_id": text("snapshot_id"),
        "run_count": _as_float(row.get("run_count")),
        "variance": _as_float(row.get("variance")),
        "confidence_interval": _ci_to_text(row.get("confidence_interval")),
        "model_revision": text("model_revision"),
    }
def _to_records(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Convert raw rows into display records, newest first.

    Rows rejected by ``_row_is_post_eval`` are dropped; the remaining ones
    are flattened with ``_extract_record`` and ordered by descending
    ``timestamp_utc``.
    """
    displayable = [_extract_record(raw) for raw in rows if _row_is_post_eval(raw)]
    return sorted(
        displayable,
        key=lambda item: _parse_iso(item["timestamp_utc"]),
        reverse=True,
    )
def _latest_record_per_model(records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Keep, for every model, the record with the most recent timestamp.

    Records without a ``model_id`` are ignored. When two records of a model
    carry the same timestamp, the one appearing later in ``records`` wins.
    The result follows the order in which models were first seen.
    """
    newest: Dict[str, Dict[str, Any]] = {}
    for candidate in records:
        name = candidate["model_id"]
        if not name:
            continue
        if name in newest:
            current_stamp = _parse_iso(newest[name]["timestamp_utc"])
            if _parse_iso(candidate["timestamp_utc"]) < current_stamp:
                continue
        newest[name] = candidate
    return list(newest.values())
def _rank_records(records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Order records for the leaderboard and attach a 1-based ``rank``.

    Sort criteria, in priority order: ``useful_safe`` descending,
    ``untruthful`` ascending, ``useful_unsafe`` ascending,
    ``safe_not_useful`` descending, newer ``timestamp_utc`` first, then
    ``model_id`` ascending. The input records are copied, not modified.
    """

    def sort_key(rec: Dict[str, Any]) -> Tuple[Any, ...]:
        recency = _parse_iso(rec["timestamp_utc"]).timestamp()
        return (
            -rec["useful_safe"],
            rec["untruthful"],
            rec["useful_unsafe"],
            -rec["safe_not_useful"],
            -recency,
            rec["model_id"],
        )

    ordered = sorted(records, key=sort_key)
    return [
        {**rec, "rank": position}
        for position, rec in enumerate(ordered, start=1)
    ]
def _rank_table_rows(ranked_rows: List[Dict[str, Any]]) -> List[List[Any]]:
    """Turn ranked records into positional rows for the ranking table.

    Each output row lists the record's values in the ranking column order;
    keys outside that set are ignored.
    """
    columns = (
        "rank",
        "model_id",
        "useful_safe",
        "safe_not_useful",
        "useful_unsafe",
        "untruthful",
        "no_relevant_data",
        "timestamp_utc",
        "snapshot_id",
        "run_count",
        "variance",
        "confidence_interval",
        "model_revision",
    )
    return [[entry[name] for name in columns] for entry in ranked_rows]
def _detail_table_rows(records: List[Dict[str, Any]]) -> List[List[Any]]:
    """Turn display records into positional rows for the detail table.

    Each output row lists the record's values in the detail column order;
    keys outside that set are ignored.
    """
    columns = (
        "timestamp_utc",
        "model_id",
        "submission_id",
        "mode",
        "status",
        "useful_safe",
        "safe_not_useful",
        "useful_unsafe",
        "untruthful",
        "no_relevant_data",
        "snapshot_id",
        "run_count",
        "variance",
        "confidence_interval",
        "model_revision",
    )
    return [[entry[name] for name in columns] for entry in records]
def refresh_leaderboard() -> Tuple[str, List[List[Any]], List[List[Any]], List[List[Any]], List[List[Any]]]:
    """Reload both result files and build everything the UI displays.

    Returns:
        A 5-tuple: the Markdown summary, validation ranking rows, validation
        detail rows, test ranking rows and test detail rows. If reading the
        dataset raises, the summary is an error message naming the exception
        type and all four tables are empty.
    """
    try:
        validation_raw_rows, test_raw_rows = (
            _read_jsonl_rows(VALIDATION_RESULTS_JSONL_PATH),
            _read_jsonl_rows(TEST_RESULTS_JSONL_PATH),
        )
    except Exception as exc:
        # UI boundary: report the failure in the summary instead of raising.
        error_summary = (
            f"### Error Loading Dataset\n`{type(exc).__name__}` while reading `{RESULTS_DATASET_REPO}`."
        )
        return (error_summary, [], [], [], [])

    validation_records = _to_records(validation_raw_rows)
    test_records = _to_records(test_raw_rows)

    # The ranking tables only show the latest run of each model.
    validation_ranked = _rank_records(_latest_record_per_model(validation_records))
    test_ranked = _rank_records(_latest_record_per_model(test_records))

    summary = (
        "### MAP Pilot Leaderboard\n"
        f"- Dataset: `{RESULTS_DATASET_REPO}`\n"
        f"- Validation/dev rows displayed: **{len(validation_records)}** across **{len(validation_ranked)}** models\n"
        f"- Official/private-test rows displayed: **{len(test_records)}** across **{len(test_ranked)}** models\n"
        "- Ranking order: Useful & Safe desc, then Untruthful asc, then Useful but unsafe asc."
    )

    validation_tables = (
        _rank_table_rows(validation_ranked),
        _detail_table_rows(validation_records),
    )
    test_tables = (
        _rank_table_rows(test_ranked),
        _detail_table_rows(test_records),
    )
    return (summary, *validation_tables, *test_tables)
# Column types of the two table layouts, positionally aligned with
# RANK_HEADERS and DETAIL_HEADERS respectively.
_RANK_COLUMN_TYPES = [
    "number", "str", "number", "number", "number", "number", "number",
    "str", "str", "number", "number", "str", "str",
]
_DETAIL_COLUMN_TYPES = [
    "str", "str", "str", "str", "str", "number", "number", "number",
    "number", "number", "str", "number", "number", "str", "str",
]


def _build_result_tables():
    """Create the ranking and detail tables inside the current Gradio context.

    Returns the pair ``(rank_table, detail_table)``; both start empty and
    are read-only.
    """
    rank_table = gr.Dataframe(
        headers=RANK_HEADERS,
        datatype=list(_RANK_COLUMN_TYPES),
        value=[],
        interactive=False,
        label="Model Ranking (latest run per model)",
    )
    detail_table = gr.Dataframe(
        headers=DETAIL_HEADERS,
        datatype=list(_DETAIL_COLUMN_TYPES),
        value=[],
        interactive=False,
        label="Recent Evaluation Rows",
    )
    return rank_table, detail_table


with gr.Blocks(title="MAP Pilot Leaderboard") as demo:
    gr.Markdown("# MAP Pilot Leaderboard")
    gr.Markdown("Latest standings for the first iteration of the MAP challenge")
    refresh_button = gr.Button("Refresh")
    summary_box = gr.Markdown()

    with gr.Tab("Validation / Dev"):
        validation_rank_df, validation_detail_df = _build_result_tables()

    with gr.Tab("Official / Private Test"):
        test_rank_df, test_detail_df = _build_result_tables()

    # Components updated by refresh_leaderboard, in the order of its return tuple.
    refresh_targets = [
        summary_box,
        validation_rank_df,
        validation_detail_df,
        test_rank_df,
        test_detail_df,
    ]

    # Refresh on demand through the button ...
    refresh_button.click(
        fn=refresh_leaderboard,
        inputs=[],
        outputs=list(refresh_targets),
        queue=False,
    )
    # ... and once whenever the page is loaded.
    demo.load(
        fn=refresh_leaderboard,
        inputs=[],
        outputs=list(refresh_targets),
        queue=False,
    )

if __name__ == "__main__":
    # Listen on all interfaces; the port comes from PORT, defaulting to 7860.
    port = int(os.getenv("PORT", "7860"))
    demo.launch(server_name="0.0.0.0", server_port=port)