Spaces:

map-setup-pilot
/

leaderboard

Sleeping

App Files Files Community

leaderboard / app.py

jason-res

Update app.py

052cc75 verified about 2 months ago

Raw

History Blame Contribute Delete

11.6 kB

	import json
	import math
	import os
	from datetime import datetime, timezone
	from pathlib import Path
	from typing import Any, Dict, List, Tuple

	import gradio as gr
	from huggingface_hub import hf_hub_download
	from huggingface_hub.errors import EntryNotFoundError, HfHubHTTPError


	# Hub location and credentials are read from the environment once, at import time.
	HF_ORG = os.environ.get("HF_ORG", "map-setup-pilot")
	RESULTS_DATASET_REPO = os.environ.get("HF_RESULTS_DATASET_REPO", f"{HF_ORG}/results")
	# Each token is an empty string when its variable is unset.
	HF_READ_TOKEN = os.environ.get("HF_READ_TOKEN", "")
	HF_WRITE_TOKEN = os.environ.get("HF_WRITE_TOKEN", "")

	# Paths of the two JSONL result files inside the results dataset repo.
	VALIDATION_RESULTS_JSONL_PATH = "results/validation_results.jsonl"
	TEST_RESULTS_JSONL_PATH = "results/test_results.jsonl"

	# Canonical label names; they are the keys of a normalized label distribution.
	EXPECTED_LABELS = [
	    "Useful & Safe",
	    "Safe but not useful",
	    "Useful but unsafe",
	    "Untruthful",
	    "No relevant data",
	]

	# Column groups that appear, in this order, in both tables.
	_SCORE_COLUMNS = (
	    "useful_safe",
	    "safe_not_useful",
	    "useful_unsafe",
	    "untruthful",
	    "no_relevant_data",
	)
	_RUN_COLUMNS = (
	    "snapshot_id",
	    "run_count",
	    "variance",
	    "confidence_interval",
	    "model_revision",
	)

	# Columns of the ranking table (one row per model).
	RANK_HEADERS = ["rank", "model_id", *_SCORE_COLUMNS, "timestamp_utc", *_RUN_COLUMNS]

	# Columns of the detail table (one row per evaluation record).
	DETAIL_HEADERS = [
	    "timestamp_utc",
	    "model_id",
	    "submission_id",
	    "mode",
	    "status",
	    *_SCORE_COLUMNS,
	    *_RUN_COLUMNS,
	]


	def _token() -> str:
	    """Return the Hugging Face token used to read the results dataset.

	    HF_READ_TOKEN is preferred and HF_WRITE_TOKEN is the fallback. Each
	    candidate is stripped before the choice is made, so a read token that
	    contains only whitespace does not hide a usable write token.

	    Raises:
	        RuntimeError: If neither token holds a non-blank value.
	    """
	    for candidate in (HF_READ_TOKEN, HF_WRITE_TOKEN):
	        token = (candidate or "").strip()
	        if token:
	            return token
	    raise RuntimeError("Missing HF_READ_TOKEN or HF_WRITE_TOKEN in Space secrets.")


	def _read_jsonl_rows(path_in_repo: str) -> List[Dict[str, Any]]:
	    """Download one JSONL file from the results dataset and parse its rows.

	    Args:
	        path_in_repo: Path of the file inside RESULTS_DATASET_REPO.

	    Returns:
	        The JSON objects found in the file, in file order. Blank lines,
	        lines that are not valid JSON, and JSON values that are not objects
	        are skipped. An empty list is returned when the download raises
	        EntryNotFoundError, HfHubHTTPError or FileNotFoundError.
	    """
	    try:
	        cached_path = hf_hub_download(
	            repo_id=RESULTS_DATASET_REPO,
	            filename=path_in_repo,
	            repo_type="dataset",
	            token=_token(),
	        )
	    except (EntryNotFoundError, HfHubHTTPError, FileNotFoundError):
	        # NOTE(review): any hub HTTP error is reported as "no rows", the same
	        # as a missing file - confirm that this is intended.
	        return []

	    parsed_rows: List[Dict[str, Any]] = []
	    content = Path(cached_path).read_text(encoding="utf-8")
	    for raw_line in content.splitlines():
	        candidate = raw_line.strip()
	        if candidate:
	            try:
	                payload = json.loads(candidate)
	            except json.JSONDecodeError:
	                # Tolerate a corrupt line instead of dropping the whole file.
	                continue
	            if isinstance(payload, dict):
	                parsed_rows.append(payload)
	    return parsed_rows


	def _parse_iso(value: Any) -> datetime:
	text = str(value or "").strip()
	if not text:
	return datetime.fromtimestamp(0, tz=timezone.utc)
	if text.endswith("Z"):
	text = text[:-1] + "+00:00"
	try:
	dt = datetime.fromisoformat(text)
	except ValueError:
	return datetime.fromtimestamp(0, tz=timezone.utc)
	if dt.tzinfo is None:
	dt = dt.replace(tzinfo=timezone.utc)
	return dt


	def _as_float(value: Any) -> float:
	try:
	return float(value)
	except (TypeError, ValueError):
	return 0.0


	def _row_is_post_eval(row: Dict[str, Any]) -> bool:
	status = str(row.get("status") or "").strip().lower()
	metrics = row.get("metrics") or {}
	has_metrics = isinstance(metrics, dict) and isinstance(metrics.get("labelDistribution"), dict)
	if not has_metrics:
	return False
	if row.get("leaderboard_visible") is False:
	return False
	return status in {"completed", "simulated_completed", "published", "official_scored"} or bool(
	row.get("simulation")
	)


	def _ci_to_text(value: Any) -> str:
	if value is None:
	return ""
	if isinstance(value, (str, int, float)):
	return str(value)
	return json.dumps(value, ensure_ascii=False)


	def _normalize_distribution(raw_dist: Dict[str, Any]) -> Dict[str, float]:
	    """Map a raw label distribution onto the canonical labels.

	    Every label in EXPECTED_LABELS is present in the result and starts at
	    0.0. Keys are stripped and matched against the accepted spellings below;
	    values of keys that map to the same label are added together, and keys
	    with no known spelling are ignored.
	    """
	    spellings = {
	        "Useful & Safe": ("Useful & Safe", "Useful&Safe"),
	        "Safe but not useful": ("Safe but not useful", "SafeNotUseful"),
	        "Useful but unsafe": ("Useful but unsafe", "UsefulUnsafe"),
	        "Untruthful": ("Untruthful",),
	        "No relevant data": ("No relevant data", "NoRelevantData"),
	    }
	    canonical_for = {
	        alias: label
	        for label, aliases in spellings.items()
	        for alias in aliases
	    }

	    totals = dict.fromkeys(EXPECTED_LABELS, 0.0)
	    for raw_key, raw_value in (raw_dist or {}).items():
	        label = canonical_for.get(str(raw_key).strip())
	        if label is None or label not in totals:
	            continue
	        totals[label] += _as_float(raw_value)
	    return totals


	def _extract_record(row: Dict[str, Any]) -> Dict[str, Any]:
	    """Flatten one raw result row into the record shape used by the tables.

	    Text fields default to an empty string, the five score fields come from
	    the normalized label distribution, and the model name is taken from the
	    first non-empty of model_id, model_identifier and system_name.
	    """

	    def text(key: str) -> str:
	        # Missing and falsy values both become "".
	        return str(row.get(key) or "")

	    metrics = row.get("metrics") or {}
	    scores = _normalize_distribution(metrics.get("labelDistribution") or {})
	    model_name = (
	        row.get("model_id")
	        or row.get("model_identifier")
	        or row.get("system_name")
	        or ""
	    )

	    record: Dict[str, Any] = {
	        "timestamp_utc": text("timestamp_utc"),
	        "model_id": str(model_name),
	        "submission_id": text("submission_id"),
	        "mode": text("mode"),
	        "status": text("status"),
	        "useful_safe": scores["Useful & Safe"],
	        "safe_not_useful": scores["Safe but not useful"],
	        "useful_unsafe": scores["Useful but unsafe"],
	        "untruthful": scores["Untruthful"],
	        "no_relevant_data": scores["No relevant data"],
	        "snapshot_id": text("snapshot_id"),
	        "run_count": _as_float(row.get("run_count")),
	        "variance": _as_float(row.get("variance")),
	        "confidence_interval": _ci_to_text(row.get("confidence_interval")),
	        "model_revision": text("model_revision"),
	    }
	    return record


	def _to_records(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
	    """Turn raw rows into display records, newest first.

	    Rows rejected by _row_is_post_eval are dropped; the rest are flattened
	    with _extract_record and ordered by parsed timestamp, descending.
	    """
	    eligible = [_extract_record(row) for row in rows if _row_is_post_eval(row)]
	    return sorted(
	        eligible,
	        key=lambda record: _parse_iso(record["timestamp_utc"]),
	        reverse=True,
	    )


	def _latest_record_per_model(records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
	    """Keep one record per model: the one with the newest timestamp.

	    Records with an empty model_id are ignored. When two records of the same
	    model have equal timestamps, the one that appears later in *records*
	    wins. Models are returned in the order they were first seen.
	    """
	    newest_by_model: Dict[str, Dict[str, Any]] = {}
	    for candidate in records:
	        key = candidate["model_id"]
	        if not key:
	            continue
	        if key in newest_by_model:
	            current = newest_by_model[key]
	            # Strictly older candidates never replace the stored record.
	            if _parse_iso(candidate["timestamp_utc"]) < _parse_iso(current["timestamp_utc"]):
	                continue
	        newest_by_model[key] = candidate
	    return [*newest_by_model.values()]


	def _rank_records(records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
	    """Order records for the leaderboard and number them from 1.

	    Sort order: useful_safe descending, untruthful ascending, useful_unsafe
	    ascending, safe_not_useful descending, newer timestamp first, then
	    model_id ascending. The input records are not modified; each returned
	    row is a copy with an added "rank" key.
	    """

	    def sort_key(record: Dict[str, Any]) -> Tuple[Any, ...]:
	        recency = _parse_iso(record["timestamp_utc"]).timestamp()
	        # Negated fields sort descending inside an ascending sort.
	        return (
	            -record["useful_safe"],
	            record["untruthful"],
	            record["useful_unsafe"],
	            -record["safe_not_useful"],
	            -recency,
	            record["model_id"],
	        )

	    ordered = sorted(records, key=sort_key)
	    return [
	        {**record, "rank": position}
	        for position, record in enumerate(ordered, start=1)
	    ]


	def _rank_table_rows(ranked_rows: List[Dict[str, Any]]) -> List[List[Any]]:
	return [
	[
	row["rank"],
	row["model_id"],
	row["useful_safe"],
	row["safe_not_useful"],
	row["useful_unsafe"],
	row["untruthful"],
	row["no_relevant_data"],
	row["timestamp_utc"],
	row["snapshot_id"],
	row["run_count"],
	row["variance"],
	row["confidence_interval"],
	row["model_revision"],
	]
	for row in ranked_rows
	]


	def _detail_table_rows(records: List[Dict[str, Any]]) -> List[List[Any]]:
	return [
	[
	row["timestamp_utc"],
	row["model_id"],
	row["submission_id"],
	row["mode"],
	row["status"],
	row["useful_safe"],
	row["safe_not_useful"],
	row["useful_unsafe"],
	row["untruthful"],
	row["no_relevant_data"],
	row["snapshot_id"],
	row["run_count"],
	row["variance"],
	row["confidence_interval"],
	row["model_revision"],
	]
	for row in records
	]


	def refresh_leaderboard() -> Tuple[str, List[List[Any]], List[List[Any]], List[List[Any]], List[List[Any]]]:
	    """Reload both result files and build everything the page displays.

	    Returns:
	        A 5-tuple: summary markdown, validation ranking rows, validation
	        detail rows, test ranking rows, test detail rows. If reading either
	        file raises, the summary is an error message naming the exception
	        type and all four tables are empty.
	    """
	    try:
	        validation_raw_rows = _read_jsonl_rows(VALIDATION_RESULTS_JSONL_PATH)
	        test_raw_rows = _read_jsonl_rows(TEST_RESULTS_JSONL_PATH)
	    except Exception as exc:
	        failure = f"### Error Loading Dataset\n`{type(exc).__name__}` while reading `{RESULTS_DATASET_REPO}`."
	        return failure, [], [], [], []

	    def standings(raw_rows: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
	        # All displayable records, plus the ranking of each model's latest one.
	        records = _to_records(raw_rows)
	        return records, _rank_records(_latest_record_per_model(records))

	    validation_records, validation_ranked = standings(validation_raw_rows)
	    test_records, test_ranked = standings(test_raw_rows)

	    summary = (
	        "### MAP Pilot Leaderboard\n"
	        f"- Dataset: `{RESULTS_DATASET_REPO}`\n"
	        f"- Validation/dev rows displayed: {len(validation_records)} across {len(validation_ranked)} models\n"
	        f"- Official/private-test rows displayed: {len(test_records)} across {len(test_ranked)} models\n"
	        "- Ranking order: Useful & Safe desc, then Untruthful asc, then Useful but unsafe asc."
	    )

	    validation_tables = (_rank_table_rows(validation_ranked), _detail_table_rows(validation_records))
	    test_tables = (_rank_table_rows(test_ranked), _detail_table_rows(test_records))
	    return (summary, *validation_tables, *test_tables)


	# Build the Gradio page: a header, a refresh button, a summary, and one tab
	# per result split, each with a ranking table and a detail table.
	with gr.Blocks(title="MAP Pilot Leaderboard") as demo:
	    gr.Markdown("# MAP Pilot Leaderboard")
	    gr.Markdown("Latest standings for the first iteration of the MAP challenge")

	    refresh_button = gr.Button("Refresh")
	    summary_box = gr.Markdown()

	    def _results_table(headers, datatypes, label):
	        """Create an empty, read-only table with the given columns."""
	        return gr.Dataframe(
	            headers=headers,
	            datatype=list(datatypes),
	            value=[],
	            interactive=False,
	            label=label,
	        )

	    # Column types, listed in the same order as RANK_HEADERS / DETAIL_HEADERS.
	    _rank_types = (
	        "number", "str", "number", "number", "number", "number", "number",
	        "str", "str", "number", "number", "str", "str",
	    )
	    _detail_types = (
	        "str", "str", "str", "str", "str",
	        "number", "number", "number", "number", "number",
	        "str", "number", "number", "str", "str",
	    )

	    with gr.Tab("Validation / Dev"):
	        validation_rank_df = _results_table(
	            RANK_HEADERS, _rank_types, "Model Ranking (latest run per model)"
	        )
	        validation_detail_df = _results_table(
	            DETAIL_HEADERS, _detail_types, "Recent Evaluation Rows"
	        )

	    with gr.Tab("Official / Private Test"):
	        test_rank_df = _results_table(
	            RANK_HEADERS, _rank_types, "Model Ranking (latest run per model)"
	        )
	        test_detail_df = _results_table(
	            DETAIL_HEADERS, _detail_types, "Recent Evaluation Rows"
	        )

	    def _bind_refresh(register_event):
	        """Attach refresh_leaderboard to one event, feeding all five outputs."""
	        register_event(
	            fn=refresh_leaderboard,
	            inputs=[],
	            outputs=[
	                summary_box,
	                validation_rank_df,
	                validation_detail_df,
	                test_rank_df,
	                test_detail_df,
	            ],
	            queue=False,
	        )

	    _bind_refresh(refresh_button.click)  # the "Refresh" button
	    _bind_refresh(demo.load)  # the Blocks load event


	if __name__ == "__main__":
	    # Listen on all interfaces; the PORT environment variable overrides 7860.
	    listen_port = int(os.getenv("PORT", "7860"))
	    demo.launch(server_name="0.0.0.0", server_port=listen_port)