davidlf-hp committed on
Commit
25f9bf5
·
verified ·
1 Parent(s): 4c6f2e6

Update leaderboard for ALLaM-7B-Instruct-preview-int4-ov (NPU)

Browse files
Files changed (3) hide show
  1. leaderboard.csv +1 -1
  2. leaderboard.json +11 -11
  3. src/app.py +117 -126
leaderboard.csv CHANGED
@@ -1,3 +1,3 @@
1
  model_name,status,score,quality_overall,avg_tps,mlqa_ar_ar_f1,xquad_ar_f1,iwslt2017-en-ar_sacrebleu,xlsum_title_ar_rougeL,xlsum_summary_ar_rougeLsum,arabic_mmlu_acc,timestamp
2
- KFUPM-JRCAI/ALLaM-7B-Instruct-preview-int4-ov,Evaluating,,,,,,,,,,2026-01-07T06:49:22.215441+00:00
3
  OpenVINO/Mistral-7B-Instruct-v0.3-int4-cw-ov,Completed,31.5,9.92,14.16533453817284,36.82539682539683,16.5158371040724,5.403567063472729,0.0,0.0,0.75,2026-01-06T13:09:59.432404+00:00
 
1
  model_name,status,score,quality_overall,avg_tps,mlqa_ar_ar_f1,xquad_ar_f1,iwslt2017-en-ar_sacrebleu,xlsum_title_ar_rougeL,xlsum_summary_ar_rougeLsum,arabic_mmlu_acc,timestamp
2
+ KFUPM-JRCAI/ALLaM-7B-Instruct-preview-int4-ov,Completed,35.35,19.65,9.00582273138704,33.611111111111114,75.59523809523809,8.170418210184781,0.0,0.0,0.5,2026-01-07T06:56:08.987834+00:00
3
  OpenVINO/Mistral-7B-Instruct-v0.3-int4-cw-ov,Completed,31.5,9.92,14.16533453817284,36.82539682539683,16.5158371040724,5.403567063472729,0.0,0.0,0.75,2026-01-06T13:09:59.432404+00:00
leaderboard.json CHANGED
@@ -1,17 +1,17 @@
1
  [
2
  {
3
  "model_name": "KFUPM-JRCAI/ALLaM-7B-Instruct-preview-int4-ov",
4
- "status": "Evaluating",
5
- "avg_tps": null,
6
- "quality_overall": null,
7
- "timestamp": "2026-01-07T06:49:22.215441+00:00",
8
- "mlqa_ar_ar_f1": null,
9
- "xquad_ar_f1": null,
10
- "iwslt2017-en-ar_sacrebleu": null,
11
- "xlsum_title_ar_rougeL": null,
12
- "xlsum_summary_ar_rougeLsum": null,
13
- "arabic_mmlu_acc": null,
14
- "score": null
15
  },
16
  {
17
  "model_name": "OpenVINO/Mistral-7B-Instruct-v0.3-int4-cw-ov",
 
1
  [
2
  {
3
  "model_name": "KFUPM-JRCAI/ALLaM-7B-Instruct-preview-int4-ov",
4
+ "status": "Completed",
5
+ "avg_tps": 9.00582273138704,
6
+ "quality_overall": 19.65,
7
+ "timestamp": "2026-01-07T06:56:08.987834+00:00",
8
+ "mlqa_ar_ar_f1": 33.611111111111114,
9
+ "xquad_ar_f1": 75.59523809523809,
10
+ "iwslt2017-en-ar_sacrebleu": 8.170418210184781,
11
+ "xlsum_title_ar_rougeL": 0.0,
12
+ "xlsum_summary_ar_rougeLsum": 0.0,
13
+ "arabic_mmlu_acc": 0.5,
14
+ "score": 35.35
15
  },
16
  {
17
  "model_name": "OpenVINO/Mistral-7B-Instruct-v0.3-int4-cw-ov",
src/app.py CHANGED
@@ -1,126 +1,117 @@
1
- """Streamlit app to display the NPU Arabic leaderboard."""
2
-
3
- from __future__ import annotations
4
-
5
- import json
6
- from datetime import datetime, timezone
7
- from pathlib import Path
8
- from typing import List, Sequence
9
-
10
- import streamlit as st
11
-
12
# Source of the table: the aggregated space JSON, which includes score and
# quality_overall. On HuggingFace it is uploaded as leaderboard.json
# (aggregated version).
_DATA_PATH = Path("leaderboard.json")

# Numeric columns in display order; score and quality_overall lead.
_METRIC_COLUMNS: Sequence[str] = (
    "score",
    "quality_overall",
    "avg_tps",
    "mlqa_ar_ar_f1",
    "xquad_ar_f1",
    "iwslt2017-en-ar_sacrebleu",
    "xlsum_title_ar_rougeL",
    "xlsum_summary_ar_rougeLsum",
    "arabic_mmlu_acc",
)

# Full display order: model and status first, then the metrics, timestamp last.
_COLUMNS: Sequence[str] = ("model_name", "status", *_METRIC_COLUMNS, "timestamp")
34
-
35
-
36
def _load_rows() -> List[dict]:
    """Load leaderboard rows from ``_DATA_PATH`` ready for display.

    The JSON payload may be a single object or a list of objects; any other
    shape, or a missing, unreadable or invalid file, yields an empty list.
    Each row is reduced to the keys in ``_COLUMNS`` (absent keys become
    ``None``). A missing status is treated as ``"Completed"``; rows with any
    other status have every metric column blanked to NaN.

    Returns:
        Rows sorted by score, highest first, with newer timestamps first
        among equal scores. Rows without a usable score (``None``, NaN or
        non-numeric) come after all scored rows.
    """
    import math  # local import: the module header does not import math

    try:
        raw = json.loads(_DATA_PATH.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: file missing or unreadable. ValueError is the common base
        # of json.JSONDecodeError and UnicodeDecodeError.
        return []

    if isinstance(raw, dict):
        data = [raw]
    elif isinstance(raw, list):
        data = [item for item in raw if isinstance(item, dict)]
    else:
        data = []

    # Keep only the displayed columns and normalise the status.
    filtered: List[dict] = []
    for row in data:
        compact = {key: row.get(key) for key in _COLUMNS}
        if compact["status"] is None:
            compact["status"] = "Completed"
        if compact["status"] != "Completed":
            # Metrics of unfinished runs are not meaningful yet.
            for metric_col in _METRIC_COLUMNS:
                compact[metric_col] = float("nan")
        filtered.append(compact)

    lowest_score = float("-inf")
    earliest = datetime.min.replace(tzinfo=timezone.utc)

    def _sort_key(item: dict) -> tuple:
        # NaN never compares as smaller or larger than anything, so letting
        # it into the key makes the sort order undefined; map every unusable
        # score to -inf instead so those rows sink to the bottom.
        try:
            score_val = float(item.get("score"))
        except (TypeError, ValueError):
            score_val = lowest_score
        else:
            if math.isnan(score_val):
                score_val = lowest_score

        try:
            parsed = datetime.fromisoformat(str(item.get("timestamp")))
            if parsed.tzinfo is None:
                parsed = parsed.replace(tzinfo=timezone.utc)
            else:
                parsed = parsed.astimezone(timezone.utc)
        except (ValueError, OverflowError):
            parsed = earliest
        return (score_val, parsed)

    # Highest score first; ties are broken by the most recent timestamp.
    filtered.sort(key=_sort_key, reverse=True)
    return filtered
81
-
82
-
83
# Human-friendly header text shown for each column key.
_COLUMN_LABELS = {
    "model_name": "Model",
    "status": "Status",
    "score": "Score",
    "quality_overall": "Quality",
    "avg_tps": "Tokens/sec",
    "mlqa_ar_ar_f1": "MLQA F1",
    "xquad_ar_f1": "XQuAD F1",
    "iwslt2017-en-ar_sacrebleu": "IWSLT BLEU",
    "xlsum_title_ar_rougeL": "XLSum Title",
    "xlsum_summary_ar_rougeLsum": "XLSum Summary",
    "arabic_mmlu_acc": "MMLU Acc",
    "timestamp": "Last Updated",
}

st.set_page_config(page_title="Intel NPU Arabic Leaderboard", layout="wide")
st.title("🏆 Intel® NPU Arabic Leaderboard")

st.markdown("""
**Score** = √(Quality × Speed) - balances model quality with inference speed on Intel NPU.

**Quality** = Average of all benchmark scores (0-100 scale).
""")

rows = _load_rows()
if rows:
    st.write(
        "Submit your model for evaluation by emailing: **model=your-hf-model-id**"
    )
    # Metric columns render as two-decimal numbers; everything else as text.
    st.dataframe(
        rows,
        column_config={
            col: (
                st.column_config.TextColumn(_COLUMN_LABELS.get(col, col))
                if col not in _METRIC_COLUMNS
                else st.column_config.NumberColumn(
                    _COLUMN_LABELS.get(col, col), format="%.2f"
                )
            )
            for col in _COLUMNS
        },
        hide_index=True,
    )
else:
    st.info("No evaluations uploaded yet. Trigger a run to populate the leaderboard.")

st.caption("Data auto-synced from leaderboard.json produced by the evaluation pipeline.")
 
1
+ """Streamlit app to display the NPU Arabic leaderboard."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from datetime import datetime, timezone
7
+ from pathlib import Path
8
+ from typing import List, Sequence
9
+
10
+ import streamlit as st
11
+
12
# Source of the table: the aggregated space JSON, which includes score and
# quality_overall. On HuggingFace it is uploaded as leaderboard.json
# (aggregated version).
_DATA_PATH = Path("leaderboard.json")

# Numeric columns in display order; score and quality_overall lead.
_METRIC_COLUMNS: Sequence[str] = (
    "score",
    "quality_overall",
    "avg_tps",
    "mlqa_ar_ar_f1",
    "xquad_ar_f1",
    "iwslt2017-en-ar_sacrebleu",
    "xlsum_title_ar_rougeL",
    "xlsum_summary_ar_rougeLsum",
    "arabic_mmlu_acc",
)

# Full display order: model and status first, then the metrics, timestamp last.
_COLUMNS: Sequence[str] = ("model_name", "status", *_METRIC_COLUMNS, "timestamp")
34
+
35
+
36
def _load_rows() -> List[dict]:
    """Load leaderboard rows from ``_DATA_PATH`` ready for display.

    The JSON payload may be a single object or a list of objects; any other
    shape, or a missing, unreadable or invalid file, yields an empty list.
    Each row is reduced to the keys in ``_COLUMNS`` (absent keys become
    ``None``). A missing status is treated as ``"Completed"``; rows with any
    other status have every metric column blanked to NaN.

    Returns:
        Rows sorted by score, highest first, with newer timestamps first
        among equal scores. Rows without a usable score (``None``, NaN or
        non-numeric) come after all scored rows.
    """
    import math  # local import: the module header does not import math

    try:
        raw = json.loads(_DATA_PATH.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: file missing or unreadable. ValueError is the common base
        # of json.JSONDecodeError and UnicodeDecodeError.
        return []

    if isinstance(raw, dict):
        data = [raw]
    elif isinstance(raw, list):
        data = [item for item in raw if isinstance(item, dict)]
    else:
        data = []

    # Keep only the displayed columns and normalise the status.
    filtered: List[dict] = []
    for row in data:
        compact = {key: row.get(key) for key in _COLUMNS}
        if compact["status"] is None:
            compact["status"] = "Completed"
        if compact["status"] != "Completed":
            # Metrics of unfinished runs are not meaningful yet.
            for metric_col in _METRIC_COLUMNS:
                compact[metric_col] = float("nan")
        filtered.append(compact)

    lowest_score = float("-inf")
    earliest = datetime.min.replace(tzinfo=timezone.utc)

    def _sort_key(item: dict) -> tuple:
        # NaN never compares as smaller or larger than anything, so letting
        # it into the key makes the sort order undefined; map every unusable
        # score to -inf instead so those rows sink to the bottom.
        try:
            score_val = float(item.get("score"))
        except (TypeError, ValueError):
            score_val = lowest_score
        else:
            if math.isnan(score_val):
                score_val = lowest_score

        try:
            parsed = datetime.fromisoformat(str(item.get("timestamp")))
            if parsed.tzinfo is None:
                parsed = parsed.replace(tzinfo=timezone.utc)
            else:
                parsed = parsed.astimezone(timezone.utc)
        except (ValueError, OverflowError):
            parsed = earliest
        return (score_val, parsed)

    # Highest score first; ties are broken by the most recent timestamp.
    filtered.sort(key=_sort_key, reverse=True)
    return filtered
81
+
82
+
83
# Human-friendly header text shown for each column key.
_COLUMN_LABELS = {
    "model_name": "Model",
    "status": "Status",
    "score": "Score",
    "quality_overall": "Quality",
    "avg_tps": "Tokens/sec",
    "mlqa_ar_ar_f1": "MLQA F1",
    "xquad_ar_f1": "XQuAD F1",
    "iwslt2017-en-ar_sacrebleu": "IWSLT BLEU",
    "xlsum_title_ar_rougeL": "XLSum Title",
    "xlsum_summary_ar_rougeLsum": "XLSum Summary",
    "arabic_mmlu_acc": "MMLU Acc",
    "timestamp": "Last Updated",
}

st.set_page_config(page_title="Intel NPU Arabic Leaderboard", layout="wide")
st.title("Intel NPU Arabic Leaderboard")

rows = _load_rows()
if rows:
    # Metric columns render as two-decimal numbers; everything else as text.
    st.dataframe(
        rows,
        column_config={
            col: (
                st.column_config.TextColumn(_COLUMN_LABELS.get(col, col))
                if col not in _METRIC_COLUMNS
                else st.column_config.NumberColumn(
                    _COLUMN_LABELS.get(col, col), format="%.2f"
                )
            )
            for col in _COLUMNS
        },
        hide_index=True,
    )
else:
    st.info("No evaluations uploaded yet.")

st.caption("Submit your model for evaluation by emailing: **model:your-hf/model-id**")