# HuggingFace Spaces status header (scrape artifact; Space state: Running)
| """Streamlit app to display the NPU Arabic leaderboard.""" | |
| from __future__ import annotations | |
| import json | |
| from datetime import datetime, timezone | |
| from pathlib import Path | |
| from typing import List, Sequence | |
| import streamlit as st | |
| # Use the aggregated space JSON which includes score and quality_overall | |
| # On HuggingFace, this is uploaded as leaderboard.json (aggregated version) | |
| _DATA_PATH = Path("leaderboard.json") | |
| # Column order for display - score and quality_overall are prominent | |
| _COLUMNS: Sequence[str] = ( | |
| "model_name", | |
| "status", | |
| "score", | |
| "quality_overall", | |
| "avg_tps", | |
| "mlqa_ar_ar_f1", | |
| "xquad_ar_f1", | |
| "iwslt2017-en-ar_sacrebleu", | |
| "xlsum_title_ar_rougeL", | |
| "xlsum_summary_ar_rougeLsum", | |
| "arabic_mmlu_acc", | |
| "timestamp", | |
| ) | |
| _METRIC_COLUMNS: Sequence[str] = tuple( | |
| col for col in _COLUMNS if col not in {"model_name", "status", "timestamp"} | |
| ) | |
def _load_rows() -> List[dict]:
    """Load, normalize, and rank leaderboard rows from ``_DATA_PATH``.

    Returns:
        Rows restricted to ``_COLUMNS``, sorted by score (highest first)
        with the row timestamp as a tie-breaker. A missing, unreadable,
        or malformed data file yields an empty list instead of raising.
    """
    if not _DATA_PATH.exists():
        return []
    try:
        raw = json.loads(_DATA_PATH.read_text(encoding="utf-8"))
    except (OSError, json.JSONDecodeError):
        # OSError covers the race where the file disappears or becomes
        # unreadable between exists() and the read; a half-written or
        # corrupt sync is treated the same as "no data yet".
        return []

    # Accept either a single row object or a list of row objects.
    if isinstance(raw, dict):
        data = [raw]
    elif isinstance(raw, list):
        data = [item for item in raw if isinstance(item, dict)]
    else:
        data = []

    # Restrict each row to the display columns.
    filtered: List[dict] = []
    for row in data:
        compact = {key: row.get(key) for key in _COLUMNS}
        # Legacy rows predate the status field; assume they completed.
        status = compact.get("status")
        if status is None:
            status = "Completed"
        compact["status"] = status
        if status != "Completed":
            # Blank out metrics for failed/pending runs so the table
            # shows empty cells rather than stale or missing numbers.
            for metric_col in _METRIC_COLUMNS:
                compact[metric_col] = float("nan")
        filtered.append(compact)

    # Sort by score (highest first), then by timestamp for ties.
    def _sort_key(item: dict) -> tuple:
        # A missing or non-numeric score ranks below any real score.
        # (The original code called float() unguarded and would crash
        # the whole page on a score like "N/A".)
        try:
            score_val = float(item["score"])
        except (KeyError, TypeError, ValueError):
            score_val = -1.0
        stamp = item.get("timestamp")
        try:
            parsed = datetime.fromisoformat(str(stamp))
            # Normalize to aware-UTC so naive and aware stamps compare.
            if parsed.tzinfo is None:
                parsed = parsed.replace(tzinfo=timezone.utc)
            else:
                parsed = parsed.astimezone(timezone.utc)
        except ValueError:
            # Unparseable timestamps sort last among equal scores.
            parsed = datetime.min.replace(tzinfo=timezone.utc)
        return (score_val, parsed)

    filtered.sort(key=_sort_key, reverse=True)
    return filtered
| # Column display names for better readability | |
| _COLUMN_LABELS = { | |
| "model_name": "Model", | |
| "status": "Status", | |
| "score": "Score", | |
| "quality_overall": "Quality", | |
| "avg_tps": "Tokens/sec", | |
| "mlqa_ar_ar_f1": "MLQA F1", | |
| "xquad_ar_f1": "XQuAD F1", | |
| "iwslt2017-en-ar_sacrebleu": "IWSLT BLEU", | |
| "xlsum_title_ar_rougeL": "XLSum Title", | |
| "xlsum_summary_ar_rougeLsum": "XLSum Summary", | |
| "arabic_mmlu_acc": "MMLU Acc", | |
| "timestamp": "Last Updated", | |
| } | |
# ---- Page chrome -----------------------------------------------------------
st.set_page_config(page_title="Intel NPU Arabic Leaderboard", layout="wide")
st.title("🏆 Intel® NPU Arabic Leaderboard")
st.markdown("""
**Score** = √(Quality × Speed) - balances model quality with inference speed on Intel NPU.
**Quality** = Average of all benchmark scores (0-100 scale).
""")

# ---- Leaderboard table -----------------------------------------------------
rows = _load_rows()
if not rows:
    st.info("No evaluations uploaded yet. Trigger a run to populate the leaderboard.")
else:
    st.write(
        "Submit your model for evaluation by emailing: **model=your-hf-model-id**"
    )
    # Numeric metric columns get two-decimal formatting; everything else
    # (model name, status, timestamp) renders as plain text.
    column_config = {}
    for col in _COLUMNS:
        label = _COLUMN_LABELS.get(col, col)
        if col in _METRIC_COLUMNS:
            column_config[col] = st.column_config.NumberColumn(label, format="%.2f")
        else:
            column_config[col] = st.column_config.TextColumn(label)
    st.dataframe(
        rows,
        column_config=column_config,
        hide_index=True,
    )
st.caption("Data auto-synced from leaderboard.json produced by the evaluation pipeline.")