Spaces:

scailaboratory
/

KoALa-Bench

Running

App Files Community

tjdmstj committed on Mar 30

Commit

20e4ca3

1 Parent(s): 0e50698

app.py

Browse files

Files changed (3) hide show

__pycache__/app.cpython-312.pyc +0 -0
app.py +1 -5
build_leaderboard_data.py +0 -292

__pycache__/app.cpython-312.pyc ADDED Viewed

Binary file (35.6 kB). View file

app.py CHANGED Viewed

@@ -2,7 +2,6 @@ from __future__ import annotations
 import base64
 import json
-import os
 from functools import cmp_to_key
 from html import escape
 from pathlib import Path
@@ -1094,7 +1093,4 @@ def build_app() -> gr.Blocks:
 if __name__ == "__main__":
     app = build_app()
-    app.launch(
-        server_port=int(os.getenv("GRADIO_SERVER_PORT", "7860")),
-        css=CUSTOM_CSS,
-    )

 import base64
 import json
 from functools import cmp_to_key
 from html import escape
 from pathlib import Path
 if __name__ == "__main__":
     app = build_app()
+    app.launch(css=CUSTOM_CSS)

build_leaderboard_data.py DELETED Viewed

@@ -1,292 +0,0 @@
-from __future__ import annotations
-import json
-from datetime import datetime, timezone
-from pathlib import Path
-from typing import Any
-ROOT = Path(__file__).parent
-RESULTS_ROOT = ROOT / "data" / "results_real"
-LEADERBOARD_JSON = ROOT / "data" / "leaderboard-data.json"
-CANONICAL_TASKS = [
-    {
-        "id": "K-disentQA",
-        "label": "SCA-QA",
-        "metricLabel": "Speech Context Faithfulness",
-        "shortMetric": "Faithfulness",
-        "lowerBetter": False,
-        "datasets": [
-            {"id": "history_after_chosun", "label": "History_after_chosun"},
-            {"id": "history_after_chosun_other", "label": "History_after_chosun Other"},
-            {"id": "history_before_chosun", "label": "History_before_chosun"},
-            {"id": "history_before_chosun_other", "label": "History_before_chosun Other"},
-            {"id": "k-sports", "label": "K-sports"},
-            {"id": "k-sports_other", "label": "K-sports Other"},
-            {"id": "kpop", "label": "K-pop"},
-            {"id": "kpop_other", "label": "K-pop Other"},
-        ],
-    },
-    {
-        "id": "SQA",
-        "label": "Speech QA",
-        "metricLabel": "Accuracy (%)",
-        "shortMetric": "Acc(%)",
-        "lowerBetter": False,
-        "datasets": [
-            {"id": "click", "label": "CLICk"},
-            {"id": "click_other", "label": "CLICk Other"},
-            {"id": "kobest_boolq", "label": "KoBest BoolQ"},
-            {"id": "kobest_boolq_other", "label": "KoBest BoolQ Other"},
-        ],
-    },
-    {
-        "id": "Instruct",
-        "label": "Speech Instruction",
-        "metricLabel": "Score (GPT-4o as Judge)",
-        "shortMetric": "Score (GPT-4o as Judge)",
-        "lowerBetter": False,
-        "datasets": [
-            {"id": "alpaca", "label": "Alpaca"},
-            {"id": "alpaca_other", "label": "Alpaca Other"},
-            {"id": "kudge", "label": "KUDGE"},
-            {"id": "kudge_other", "label": "KUDGE Other"},
-            {"id": "openhermes", "label": "OpenHermes"},
-            {"id": "openhermes_other", "label": "OpenHermes Other"},
-            {"id": "vicuna", "label": "Vicuna"},
-            {"id": "vicuna_other", "label": "Vicuna Other"},
-        ],
-    },
-    {
-        "id": "ASR",
-        "label": "ASR",
-        "metricLabel": "CER (%)",
-        "shortMetric": "CER",
-        "lowerBetter": True,
-        "datasets": [
-            {"id": "common_voice_korea", "label": "CommonVoice-KO"},
-            {"id": "common_voice_korea_other", "label": "CommonVoice-KO Other"},
-            {"id": "ksponspeech_eval_clean", "label": "KsponSpeech Clean"},
-            {"id": "ksponspeech_eval_other", "label": "KsponSpeech Other"},
-            {"id": "zeroth_korean_test", "label": "Zeroth-Korean"},
-            {"id": "zeroth_korean_test_other", "label": "Zeroth-Korean Other"},
-        ],
-    },
-    {
-        "id": "Translation",
-        "label": "Translation",
-        "metricLabel": "BLEU / METEOR",
-        "shortMetric": "BLEU / METEOR",
-        "lowerBetter": False,
-        "datasets": [
-            {"id": "etri_tst-COMMON", "label": "ETRI-TST-Common"},
-            {"id": "etri_tst-HE", "label": "ETRI-TST-HE"},
-        ],
-    },
-    {
-        "id": "LSQA",
-        "label": "Long Speech Understanding",
-        "metricLabel": "Accuracy (%)",
-        "shortMetric": "Acc(%)",
-        "lowerBetter": False,
-        "datasets": [
-            {"id": "mctest", "label": "MCTest"},
-            {"id": "mctest_other", "label": "MCTest Other"},
-        ],
-    },
-]
-FOLDER_TO_DATASET_ID = {
-    "K-disentQA": {
-        "history_after_chosun": "history_after_chosun",
-        "history_after_chosun_other": "history_after_chosun_other",
-        "history_before_chosun": "history_before_chosun",
-        "history_before_chosun_other": "history_before_chosun_other",
-        "k-sports": "k-sports",
-        "k-sports_other": "k-sports_other",
-        "kpop": "kpop",
-        "kpop_other": "kpop_other",
-    },
-    "SQA": {
-        "click": "click",
-        "click_other": "click_other",
-        "kobest_boolq": "kobest_boolq",
-        "kobest_boolq_other": "kobest_boolq_other",
-    },
-    "Instruct": {
-        "alpaca": "alpaca",
-        "alpaca_other": "alpaca_other",
-        "kudge": "kudge",
-        "kudge_other": "kudge_other",
-        "openhermes": "openhermes",
-        "openhermes_other": "openhermes_other",
-        "vicuna": "vicuna",
-        "vicuna_other": "vicuna_other",
-    },
-    "ASR": {
-        "common_voice_korea": "common_voice_korea",
-        "common_voice_korea_other": "common_voice_korea_other",
-        "ksponspeech_eval_clean": "ksponspeech_eval_clean",
-        "ksponspeech_eval_other": "ksponspeech_eval_other",
-        "zeroth_korean_test": "zeroth_korean_test",
-        "zeroth_korean_test_other": "zeroth_korean_test_other",
-    },
-    "Translation": {
-        "etri_tst-COMMON": "etri_tst-COMMON",
-        "etri_tst-HE": "etri_tst-HE",
-    },
-    "LSQA": {
-        "mctest": "mctest",
-        "mctest_other": "mctest_other",
-    },
-}
-def load_existing_entry_meta() -> dict[str, dict[str, str]]:
-    if not LEADERBOARD_JSON.exists():
-        return {}
-    payload = json.loads(LEADERBOARD_JSON.read_text(encoding="utf-8"))
-    return {
-        entry["id"]: {
-            "rank_name": entry.get("rank_name", entry["id"]),
-            "model": entry.get("model", ""),
-            "url": entry.get("url", ""),
-        }
-        for entry in payload.get("entries", [])
-    }
-def pick_summary(dataset_dir: Path) -> Path | None:
-    direct = sorted(path for path in dataset_dir.glob("*_summary.json") if path.is_file())
-    if direct:
-        return direct[0]
-    recursive = sorted(
-        dataset_dir.rglob("*_summary.json"),
-        key=lambda path: (len(path.relative_to(dataset_dir).parts), str(path)),
-    )
-    return recursive[0] if recursive else None
-def extract_metric(task_name: str, payload: dict[str, Any]) -> dict[str, Any] | None:
-    if task_name == "K-disentQA":
-        value = payload.get("accuracy_speech")
-        if value is None:
-            return None
-        value *= 100
-        return {"value": value, "display": f"{value:.2f}"}
-    if task_name in {"SQA", "LSQA"}:
-        value = payload.get("accuracy_logit")
-        if value is None:
-            value = payload.get("accuracy_generation")
-        if value is None:
-            return None
-        value *= 100
-        return {"value": value, "display": f"{value:.2f}"}
-    if task_name == "Instruct":
-        value = payload.get("avg_gpt_score")
-        if value is None:
-            return None
-        value *= 100
-        return {"value": value, "display": f"{value:.2f}"}
-    if task_name == "ASR":
-        value = payload.get("total_cer")
-        if value is None:
-            return None
-        value *= 100
-        return {"value": value, "display": f"{value:.2f}"}
-    if task_name == "Translation":
-        bleu = payload.get("avg_bleu")
-        if bleu is None:
-            bleu = payload.get("corpus_bleu")
-        meteor = payload.get("avg_meteor")
-        if bleu is None:
-            return None
-        if meteor is None:
-            return {"value": bleu, "display": f"{bleu:.2f}"}
-        return {"value": bleu, "display": f"{bleu:.2f} / {meteor:.2f}"}
-    return None
-def build_leaderboard_payload() -> dict[str, Any]:
-    if not RESULTS_ROOT.exists():
-        raise SystemExit(f"Missing results directory: {RESULTS_ROOT}")
-    existing_meta = load_existing_entry_meta()
-    entries: dict[str, dict[str, Any]] = {}
-    for task in CANONICAL_TASKS:
-        task_id = task["id"]
-        task_dir = RESULTS_ROOT / task_id
-        if not task_dir.exists():
-            continue
-        folder_map = FOLDER_TO_DATASET_ID[task_id]
-        for model_dir in sorted(path for path in task_dir.iterdir() if path.is_dir()):
-            model_id = model_dir.name
-            meta = existing_meta.get(model_id, {})
-            entry = entries.setdefault(
-                model_id,
-                {
-                    "id": model_id,
-                    "rank_name": meta.get("rank_name", model_id),
-                    "model": meta.get("model", ""),
-                    "url": meta.get("url", ""),
-                    "tasks": {},
-                },
-            )
-            entry["tasks"].setdefault(task_id, {})
-            for dataset_dir in sorted(path for path in model_dir.iterdir() if path.is_dir()):
-                dataset_id = folder_map.get(dataset_dir.name)
-                if not dataset_id:
-                    continue
-                summary_path = pick_summary(dataset_dir)
-                if summary_path is None:
-                    continue
-                payload = json.loads(summary_path.read_text(encoding="utf-8"))
-                metric = extract_metric(task_id, payload)
-                if metric is None:
-                    continue
-                if not entry["model"] and payload.get("model"):
-                    entry["model"] = payload["model"]
-                entry["tasks"][task_id][dataset_id] = metric
-    for entry in entries.values():
-        if not entry["model"]:
-            entry["model"] = entry["id"]
-        for task in CANONICAL_TASKS:
-            entry["tasks"].setdefault(task["id"], {})
-    return {
-        "generatedAt": datetime.now(timezone.utc).isoformat(),
-        "sourceRoot": "data/results_real",
-        "tasks": CANONICAL_TASKS,
-        "entries": [entries[entry_id] for entry_id in sorted(entries)],
-    }
-def main() -> None:
-    payload = build_leaderboard_payload()
-    LEADERBOARD_JSON.write_text(
-        json.dumps(payload, ensure_ascii=False, indent=2) + "\n",
-        encoding="utf-8",
-    )
-    print(f"Wrote {LEADERBOARD_JSON}")
-    print(f"Entries: {len(payload['entries'])}")
-if __name__ == "__main__":
-    main()