tjdmstj committed on
Commit
20e4ca3
·
1 Parent(s): 0e50698
__pycache__/app.cpython-312.pyc ADDED
Binary file (35.6 kB). View file
 
app.py CHANGED
@@ -2,7 +2,6 @@ from __future__ import annotations
2
 
3
  import base64
4
  import json
5
- import os
6
  from functools import cmp_to_key
7
  from html import escape
8
  from pathlib import Path
@@ -1094,7 +1093,4 @@ def build_app() -> gr.Blocks:
1094
 
1095
  if __name__ == "__main__":
1096
  app = build_app()
1097
- app.launch(
1098
- server_port=int(os.getenv("GRADIO_SERVER_PORT", "7860")),
1099
- css=CUSTOM_CSS,
1100
- )
 
2
 
3
  import base64
4
  import json
 
5
  from functools import cmp_to_key
6
  from html import escape
7
  from pathlib import Path
 
1093
 
1094
  if __name__ == "__main__":
1095
  app = build_app()
1096
+ app.launch(css=CUSTOM_CSS)
 
 
 
build_leaderboard_data.py DELETED
@@ -1,292 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import json
4
- from datetime import datetime, timezone
5
- from pathlib import Path
6
- from typing import Any
7
-
8
-
9
- ROOT = Path(__file__).parent
10
- RESULTS_ROOT = ROOT / "data" / "results_real"
11
- LEADERBOARD_JSON = ROOT / "data" / "leaderboard-data.json"
12
-
13
-
14
- CANONICAL_TASKS = [
15
- {
16
- "id": "K-disentQA",
17
- "label": "SCA-QA",
18
- "metricLabel": "Speech Context Faithfulness",
19
- "shortMetric": "Faithfulness",
20
- "lowerBetter": False,
21
- "datasets": [
22
- {"id": "history_after_chosun", "label": "History_after_chosun"},
23
- {"id": "history_after_chosun_other", "label": "History_after_chosun Other"},
24
- {"id": "history_before_chosun", "label": "History_before_chosun"},
25
- {"id": "history_before_chosun_other", "label": "History_before_chosun Other"},
26
- {"id": "k-sports", "label": "K-sports"},
27
- {"id": "k-sports_other", "label": "K-sports Other"},
28
- {"id": "kpop", "label": "K-pop"},
29
- {"id": "kpop_other", "label": "K-pop Other"},
30
- ],
31
- },
32
- {
33
- "id": "SQA",
34
- "label": "Speech QA",
35
- "metricLabel": "Accuracy (%)",
36
- "shortMetric": "Acc(%)",
37
- "lowerBetter": False,
38
- "datasets": [
39
- {"id": "click", "label": "CLICk"},
40
- {"id": "click_other", "label": "CLICk Other"},
41
- {"id": "kobest_boolq", "label": "KoBest BoolQ"},
42
- {"id": "kobest_boolq_other", "label": "KoBest BoolQ Other"},
43
- ],
44
- },
45
- {
46
- "id": "Instruct",
47
- "label": "Speech Instruction",
48
- "metricLabel": "Score (GPT-4o as Judge)",
49
- "shortMetric": "Score (GPT-4o as Judge)",
50
- "lowerBetter": False,
51
- "datasets": [
52
- {"id": "alpaca", "label": "Alpaca"},
53
- {"id": "alpaca_other", "label": "Alpaca Other"},
54
- {"id": "kudge", "label": "KUDGE"},
55
- {"id": "kudge_other", "label": "KUDGE Other"},
56
- {"id": "openhermes", "label": "OpenHermes"},
57
- {"id": "openhermes_other", "label": "OpenHermes Other"},
58
- {"id": "vicuna", "label": "Vicuna"},
59
- {"id": "vicuna_other", "label": "Vicuna Other"},
60
- ],
61
- },
62
- {
63
- "id": "ASR",
64
- "label": "ASR",
65
- "metricLabel": "CER (%)",
66
- "shortMetric": "CER",
67
- "lowerBetter": True,
68
- "datasets": [
69
- {"id": "common_voice_korea", "label": "CommonVoice-KO"},
70
- {"id": "common_voice_korea_other", "label": "CommonVoice-KO Other"},
71
- {"id": "ksponspeech_eval_clean", "label": "KsponSpeech Clean"},
72
- {"id": "ksponspeech_eval_other", "label": "KsponSpeech Other"},
73
- {"id": "zeroth_korean_test", "label": "Zeroth-Korean"},
74
- {"id": "zeroth_korean_test_other", "label": "Zeroth-Korean Other"},
75
- ],
76
- },
77
- {
78
- "id": "Translation",
79
- "label": "Translation",
80
- "metricLabel": "BLEU / METEOR",
81
- "shortMetric": "BLEU / METEOR",
82
- "lowerBetter": False,
83
- "datasets": [
84
- {"id": "etri_tst-COMMON", "label": "ETRI-TST-Common"},
85
- {"id": "etri_tst-HE", "label": "ETRI-TST-HE"},
86
- ],
87
- },
88
- {
89
- "id": "LSQA",
90
- "label": "Long Speech Understanding",
91
- "metricLabel": "Accuracy (%)",
92
- "shortMetric": "Acc(%)",
93
- "lowerBetter": False,
94
- "datasets": [
95
- {"id": "mctest", "label": "MCTest"},
96
- {"id": "mctest_other", "label": "MCTest Other"},
97
- ],
98
- },
99
- ]
100
-
101
-
102
- FOLDER_TO_DATASET_ID = {
103
- "K-disentQA": {
104
- "history_after_chosun": "history_after_chosun",
105
- "history_after_chosun_other": "history_after_chosun_other",
106
- "history_before_chosun": "history_before_chosun",
107
- "history_before_chosun_other": "history_before_chosun_other",
108
- "k-sports": "k-sports",
109
- "k-sports_other": "k-sports_other",
110
- "kpop": "kpop",
111
- "kpop_other": "kpop_other",
112
- },
113
- "SQA": {
114
- "click": "click",
115
- "click_other": "click_other",
116
- "kobest_boolq": "kobest_boolq",
117
- "kobest_boolq_other": "kobest_boolq_other",
118
- },
119
- "Instruct": {
120
- "alpaca": "alpaca",
121
- "alpaca_other": "alpaca_other",
122
- "kudge": "kudge",
123
- "kudge_other": "kudge_other",
124
- "openhermes": "openhermes",
125
- "openhermes_other": "openhermes_other",
126
- "vicuna": "vicuna",
127
- "vicuna_other": "vicuna_other",
128
- },
129
- "ASR": {
130
- "common_voice_korea": "common_voice_korea",
131
- "common_voice_korea_other": "common_voice_korea_other",
132
- "ksponspeech_eval_clean": "ksponspeech_eval_clean",
133
- "ksponspeech_eval_other": "ksponspeech_eval_other",
134
- "zeroth_korean_test": "zeroth_korean_test",
135
- "zeroth_korean_test_other": "zeroth_korean_test_other",
136
- },
137
- "Translation": {
138
- "etri_tst-COMMON": "etri_tst-COMMON",
139
- "etri_tst-HE": "etri_tst-HE",
140
- },
141
- "LSQA": {
142
- "mctest": "mctest",
143
- "mctest_other": "mctest_other",
144
- },
145
- }
146
-
147
-
148
- def load_existing_entry_meta() -> dict[str, dict[str, str]]:
149
- if not LEADERBOARD_JSON.exists():
150
- return {}
151
-
152
- payload = json.loads(LEADERBOARD_JSON.read_text(encoding="utf-8"))
153
- return {
154
- entry["id"]: {
155
- "rank_name": entry.get("rank_name", entry["id"]),
156
- "model": entry.get("model", ""),
157
- "url": entry.get("url", ""),
158
- }
159
- for entry in payload.get("entries", [])
160
- }
161
-
162
-
163
- def pick_summary(dataset_dir: Path) -> Path | None:
164
- direct = sorted(path for path in dataset_dir.glob("*_summary.json") if path.is_file())
165
- if direct:
166
- return direct[0]
167
-
168
- recursive = sorted(
169
- dataset_dir.rglob("*_summary.json"),
170
- key=lambda path: (len(path.relative_to(dataset_dir).parts), str(path)),
171
- )
172
- return recursive[0] if recursive else None
173
-
174
-
175
- def extract_metric(task_name: str, payload: dict[str, Any]) -> dict[str, Any] | None:
176
- if task_name == "K-disentQA":
177
- value = payload.get("accuracy_speech")
178
- if value is None:
179
- return None
180
- value *= 100
181
- return {"value": value, "display": f"{value:.2f}"}
182
-
183
- if task_name in {"SQA", "LSQA"}:
184
- value = payload.get("accuracy_logit")
185
- if value is None:
186
- value = payload.get("accuracy_generation")
187
- if value is None:
188
- return None
189
- value *= 100
190
- return {"value": value, "display": f"{value:.2f}"}
191
-
192
- if task_name == "Instruct":
193
- value = payload.get("avg_gpt_score")
194
- if value is None:
195
- return None
196
- value *= 100
197
- return {"value": value, "display": f"{value:.2f}"}
198
-
199
- if task_name == "ASR":
200
- value = payload.get("total_cer")
201
- if value is None:
202
- return None
203
- value *= 100
204
- return {"value": value, "display": f"{value:.2f}"}
205
-
206
- if task_name == "Translation":
207
- bleu = payload.get("avg_bleu")
208
- if bleu is None:
209
- bleu = payload.get("corpus_bleu")
210
- meteor = payload.get("avg_meteor")
211
- if bleu is None:
212
- return None
213
- if meteor is None:
214
- return {"value": bleu, "display": f"{bleu:.2f}"}
215
- return {"value": bleu, "display": f"{bleu:.2f} / {meteor:.2f}"}
216
-
217
- return None
218
-
219
-
220
- def build_leaderboard_payload() -> dict[str, Any]:
221
- if not RESULTS_ROOT.exists():
222
- raise SystemExit(f"Missing results directory: {RESULTS_ROOT}")
223
-
224
- existing_meta = load_existing_entry_meta()
225
- entries: dict[str, dict[str, Any]] = {}
226
-
227
- for task in CANONICAL_TASKS:
228
- task_id = task["id"]
229
- task_dir = RESULTS_ROOT / task_id
230
- if not task_dir.exists():
231
- continue
232
-
233
- folder_map = FOLDER_TO_DATASET_ID[task_id]
234
- for model_dir in sorted(path for path in task_dir.iterdir() if path.is_dir()):
235
- model_id = model_dir.name
236
- meta = existing_meta.get(model_id, {})
237
- entry = entries.setdefault(
238
- model_id,
239
- {
240
- "id": model_id,
241
- "rank_name": meta.get("rank_name", model_id),
242
- "model": meta.get("model", ""),
243
- "url": meta.get("url", ""),
244
- "tasks": {},
245
- },
246
- )
247
- entry["tasks"].setdefault(task_id, {})
248
-
249
- for dataset_dir in sorted(path for path in model_dir.iterdir() if path.is_dir()):
250
- dataset_id = folder_map.get(dataset_dir.name)
251
- if not dataset_id:
252
- continue
253
-
254
- summary_path = pick_summary(dataset_dir)
255
- if summary_path is None:
256
- continue
257
-
258
- payload = json.loads(summary_path.read_text(encoding="utf-8"))
259
- metric = extract_metric(task_id, payload)
260
- if metric is None:
261
- continue
262
-
263
- if not entry["model"] and payload.get("model"):
264
- entry["model"] = payload["model"]
265
- entry["tasks"][task_id][dataset_id] = metric
266
-
267
- for entry in entries.values():
268
- if not entry["model"]:
269
- entry["model"] = entry["id"]
270
- for task in CANONICAL_TASKS:
271
- entry["tasks"].setdefault(task["id"], {})
272
-
273
- return {
274
- "generatedAt": datetime.now(timezone.utc).isoformat(),
275
- "sourceRoot": "data/results_real",
276
- "tasks": CANONICAL_TASKS,
277
- "entries": [entries[entry_id] for entry_id in sorted(entries)],
278
- }
279
-
280
-
281
- def main() -> None:
282
- payload = build_leaderboard_payload()
283
- LEADERBOARD_JSON.write_text(
284
- json.dumps(payload, ensure_ascii=False, indent=2) + "\n",
285
- encoding="utf-8",
286
- )
287
- print(f"Wrote {LEADERBOARD_JSON}")
288
- print(f"Entries: {len(payload['entries'])}")
289
-
290
-
291
- if __name__ == "__main__":
292
- main()