Commit ·
bed23b0
1
Parent(s): 5a34fee
Add RobloxQA_OpenEnded
Browse files
- src/about.py +2 -0
- src/leaderboard/populate.py +1 -1
- src/leaderboard/utils.py +5 -2
src/about.py
CHANGED
|
@@ -8,4 +8,6 @@ Tracking LLM capabilities regarding Roblox game development.
|
|
| 8 |
Benchmarks:
|
| 9 |
|
| 10 |
- [RobloxQA](https://huggingface.co/datasets/boatbomber/RobloxQA-v1.0): Multiple choice question answering about Roblox APIs and concepts.
|
|
|
|
|
|
|
| 11 |
"""
|
|
|
|
| 8 |
Benchmarks:
|
| 9 |
|
| 10 |
- [RobloxQA](https://huggingface.co/datasets/boatbomber/RobloxQA-v1.0): Multiple choice question answering about Roblox APIs and concepts.
|
| 11 |
+
- [RobloxQA_OpenEnded](https://huggingface.co/datasets/boatbomber/RobloxQA-v1.0): Question answering about Roblox APIs and concepts without giving the multiple choices. \
|
| 12 |
+
Correctness judged by an LLM by comparing the generated answer to the correct answer choice.
|
| 13 |
"""
|
src/leaderboard/populate.py
CHANGED
|
@@ -42,7 +42,7 @@ def load_results() -> pd.DataFrame:
|
|
| 42 |
|
| 43 |
for c in COLUMNS:
|
| 44 |
if c.name not in evaluation:
|
| 45 |
-
evaluation[c.name] =
|
| 46 |
|
| 47 |
data.append(evaluation)
|
| 48 |
|
|
|
|
| 42 |
|
| 43 |
for c in COLUMNS:
|
| 44 |
if c.name not in evaluation:
|
| 45 |
+
evaluation[c.name] = c.default
|
| 46 |
|
| 47 |
data.append(evaluation)
|
| 48 |
|
src/leaderboard/utils.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
from dataclasses import dataclass
|
|
|
|
| 2 |
|
| 3 |
|
| 4 |
@dataclass
|
|
@@ -9,6 +10,7 @@ class ColumnContent:
|
|
| 9 |
hidden: bool = False
|
| 10 |
never_hidden: bool = False
|
| 11 |
searchable: bool = False
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
## Leaderboard columns
|
|
@@ -16,6 +18,7 @@ COLUMNS = [
|
|
| 16 |
ColumnContent("Model", type="str", displayed_by_default=True, never_hidden=True, searchable=True),
|
| 17 |
ColumnContent("Precision", type="str", displayed_by_default=False),
|
| 18 |
ColumnContent("Params (B)", type="number", displayed_by_default=True),
|
| 19 |
-
ColumnContent("Average", type="number", displayed_by_default=True),
|
| 20 |
-
ColumnContent("RobloxQA", type="number", displayed_by_default=True),
|
|
|
|
| 21 |
]
|
|
|
|
| 1 |
from dataclasses import dataclass
|
| 2 |
+
from typing import Any
|
| 3 |
|
| 4 |
|
| 5 |
@dataclass
|
|
|
|
| 10 |
hidden: bool = False
|
| 11 |
never_hidden: bool = False
|
| 12 |
searchable: bool = False
|
| 13 |
+
default: Any = None
|
| 14 |
|
| 15 |
|
| 16 |
## Leaderboard columns
|
|
|
|
| 18 |
ColumnContent("Model", type="str", displayed_by_default=True, never_hidden=True, searchable=True),
|
| 19 |
ColumnContent("Precision", type="str", displayed_by_default=False),
|
| 20 |
ColumnContent("Params (B)", type="number", displayed_by_default=True),
|
| 21 |
+
ColumnContent("Average", type="number", displayed_by_default=True, default=0),
|
| 22 |
+
ColumnContent("RobloxQA", type="number", displayed_by_default=True, default=0),
|
| 23 |
+
ColumnContent("RobloxQA_OpenEnded", type="number", displayed_by_default=True, default=0),
|
| 24 |
]
|