# Web-view header captured together with this file (not program code):
#   davidlf-hp's picture
#   Fix: update src/app.py to read from leaderboard.json
#   8c1361f verified
"""Streamlit app to display the NPU Arabic leaderboard."""
from __future__ import annotations

import json
import math
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Sequence

import streamlit as st
# Location of the aggregated leaderboard JSON, i.e. the variant that carries
# the `score` and `quality_overall` fields. On HuggingFace this file is
# uploaded as leaderboard.json.
# The path is relative, so it is resolved against the current working
# directory of the process running the app.
_DATA_PATH = Path("leaderboard.json")
# Keys kept from every leaderboard row, in the order they are listed here.
# `score` and `quality_overall` are placed right after the identifying columns
# so they are prominent. `_load_rows` copies exactly these keys out of each
# JSON record, and the page builds one column configuration per entry.
_COLUMNS: Sequence[str] = (
    "model_name",
    "status",
    "score",
    "quality_overall",
    "avg_tps",
    "mlqa_ar_ar_f1",
    "xquad_ar_f1",
    "iwslt2017-en-ar_sacrebleu",
    "xlsum_title_ar_rougeL",
    "xlsum_summary_ar_rougeLsum",
    "arabic_mmlu_acc",
    "timestamp",
)
# Numeric columns: every entry of _COLUMNS except the identifying and
# bookkeeping ones, in the same relative order.
_METRIC_COLUMNS: Sequence[str] = tuple(
    filter(
        lambda name: name not in ("model_name", "status", "timestamp"),
        _COLUMNS,
    )
)
def _score_sort_value(score: object) -> float:
    """Return *score* as a float that is safe to use in a sort key.

    Missing (``None``), non-numeric and NaN scores all map to negative
    infinity, so the affected rows sort after every row with a real score.
    """
    try:
        value = float(score)  # type: ignore[arg-type]
    except (TypeError, ValueError):
        return float("-inf")
    # NaN compares False against everything, which would leave the sort order
    # undefined. Rows whose status is not "Completed" carry a NaN score, so
    # this case is hit in normal operation.
    return float("-inf") if math.isnan(value) else value


def _timestamp_sort_value(stamp: object) -> datetime:
    """Return *stamp* parsed as an ISO-8601 string, as an aware UTC datetime.

    A naive timestamp is taken to be UTC; an aware one is converted to UTC.
    Anything that cannot be parsed (or converted) becomes the earliest
    representable datetime so it loses every tie-break.
    """
    try:
        parsed = datetime.fromisoformat(str(stamp))
        if parsed.tzinfo is None:
            return parsed.replace(tzinfo=timezone.utc)
        return parsed.astimezone(timezone.utc)
    except (ValueError, OverflowError):
        return datetime.min.replace(tzinfo=timezone.utc)


def _load_rows() -> List[dict]:
    """Load the leaderboard rows from ``_DATA_PATH`` ready for display.

    The file may contain a single JSON object or a list of objects; list
    entries that are not objects are skipped.

    Returns:
        One dict per row holding exactly the keys in ``_COLUMNS`` (keys absent
        from the record are ``None``). A missing status becomes
        ``"Completed"``; for any other status every metric column is replaced
        by NaN. Rows are ordered by score, highest first, with the newest
        timestamp winning ties; rows without a usable score come last.
        An empty list is returned when the file is missing, unreadable, not
        valid UTF-8, or not valid JSON.
    """
    try:
        raw = json.loads(_DATA_PATH.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: file missing or unreadable. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError. The leaderboard is
        # best-effort, so the page shows "no data" instead of crashing.
        return []

    if isinstance(raw, dict):
        data = [raw]
    elif isinstance(raw, list):
        data = [item for item in raw if isinstance(item, dict)]
    else:
        data = []

    # Keep only the displayed columns.
    filtered: List[dict] = []
    for row in data:
        compact = {key: row.get(key) for key in _COLUMNS}
        if compact.get("status") is None:
            compact["status"] = "Completed"
        if compact["status"] != "Completed":
            # Metrics of unfinished or failed runs are not meaningful; blank
            # them out with NaN.
            for metric_col in _METRIC_COLUMNS:
                compact[metric_col] = float("nan")
        filtered.append(compact)

    # Highest score first, newest timestamp first among equal scores.
    filtered.sort(
        key=lambda item: (
            _score_sort_value(item.get("score")),
            _timestamp_sort_value(item.get("timestamp")),
        ),
        reverse=True,
    )
    return filtered
# Human-readable header text for each data key. The page looks labels up with
# `_COLUMN_LABELS.get(col, col)`, so a key missing from this mapping is shown
# under its raw name.
_COLUMN_LABELS = {
    "model_name": "Model",
    "status": "Status",
    "score": "Score",
    "quality_overall": "Quality",
    "avg_tps": "Tokens/sec",
    "mlqa_ar_ar_f1": "MLQA F1",
    "xquad_ar_f1": "XQuAD F1",
    "iwslt2017-en-ar_sacrebleu": "IWSLT BLEU",
    "xlsum_title_ar_rougeL": "XLSum Title",
    "xlsum_summary_ar_rougeLsum": "XLSum Summary",
    "arabic_mmlu_acc": "MMLU Acc",
    "timestamp": "Last Updated",
}
# ---------------------------------------------------------------------------
# Page body: header, explanation of the two headline numbers, then either the
# leaderboard table or a placeholder message.
# NOTE(review): indentation was lost in the copy this was rebuilt from. The
# table is assumed to belong to the "rows present" branch and the caption to
# be rendered unconditionally -- confirm against the deployed app.
# ---------------------------------------------------------------------------
st.set_page_config(page_title="Intel NPU Arabic Leaderboard", layout="wide")
st.title("🏆 Intel® NPU Arabic Leaderboard")
st.markdown("""
**Score** = √(Quality × Speed) - balances model quality with inference speed on Intel NPU.
**Quality** = Average of all benchmark scores (0-100 scale).
""")

rows = _load_rows()
if rows:
    st.write(
        "Submit your model for evaluation by emailing: **model=your-hf-model-id**"
    )
    # Metric columns are shown as two-decimal numbers, everything else as text.
    table_columns = {
        col: (
            st.column_config.TextColumn(_COLUMN_LABELS.get(col, col))
            if col not in _METRIC_COLUMNS
            else st.column_config.NumberColumn(
                _COLUMN_LABELS.get(col, col), format="%.2f"
            )
        )
        for col in _COLUMNS
    }
    st.dataframe(rows, column_config=table_columns, hide_index=True)
else:
    st.info("No evaluations uploaded yet. Trigger a run to populate the leaderboard.")
st.caption("Data auto-synced from leaderboard.json produced by the evaluation pipeline.")