Commit
·
376f461
1
Parent(s):
60d6a88
fix: Separate zero-shot performance from few-shot
Browse files
app.py
CHANGED
|
@@ -786,7 +786,8 @@ def fetch_results() -> dict[Language, pd.DataFrame]:
|
|
| 786 |
logger.info("Fetching results from EuroEval benchmark...")
|
| 787 |
|
| 788 |
response = requests.get(
|
| 789 |
-
"https://raw.githubusercontent.com/EuroEval/leaderboards/refs/heads/main/results/results.jsonl"
|
|
|
|
| 790 |
)
|
| 791 |
response.raise_for_status()
|
| 792 |
records = [
|
|
@@ -805,6 +806,8 @@ def fetch_results() -> dict[Language, pd.DataFrame]:
|
|
| 805 |
data_dict = defaultdict(dict)
|
| 806 |
for record in records:
|
| 807 |
model_name = record["model"]
|
|
|
|
|
|
|
| 808 |
raw_results = record["results"]["raw"]
|
| 809 |
if isinstance(raw_results, dict) and "test" in raw_results:
|
| 810 |
raw_results = raw_results.get("test", raw_results)
|
|
|
|
| 786 |
logger.info("Fetching results from EuroEval benchmark...")
|
| 787 |
|
| 788 |
response = requests.get(
|
| 789 |
+
"https://raw.githubusercontent.com/EuroEval/leaderboards/refs/heads/main"
|
| 790 |
+
"/results/results.jsonl"
|
| 791 |
)
|
| 792 |
response.raise_for_status()
|
| 793 |
records = [
|
|
|
|
| 806 |
data_dict = defaultdict(dict)
|
| 807 |
for record in records:
|
| 808 |
model_name = record["model"]
|
| 809 |
+
if not record["few_shot"]:
|
| 810 |
+
model_name += " (zero-shot)"
|
| 811 |
raw_results = record["results"]["raw"]
|
| 812 |
if isinstance(raw_results, dict) and "test" in raw_results:
|
| 813 |
raw_results = raw_results.get("test", raw_results)
|