Upload from GitHub Actions: Merge pull request #28 from datenlabor-bmz/jn-dev
Files changed:
- .github/workflows/nightly-evals.yml +6 -3
- README.md +2 -1
- evals/main.py +19 -5
- evals/models.py +2 -11
- evals/tasks.py +12 -0
- pyproject.toml +1 -1
.github/workflows/nightly-evals.yml
CHANGED
@@ -21,14 +21,17 @@ jobs:
       - name: Install dependencies
         run: |
           curl -LsSf https://astral.sh/uv/install.sh | sh
-
+          # Use the `dev` dependency group defined in pyproject.toml
+          uv sync --frozen --group dev

       - name: Run evaluations
         env:
           OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
           HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
-          N_SENTENCES:
-
+          N_SENTENCES: 10
+          # Keep these aligned with defaults in evals/main.py for comparability
+          N_LANGUAGES: 1000
+          N_MODELS: 40
         run: |
           uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
           uv run evals/download_data.py
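The workflow only exports these values as environment variables; a minimal sketch of how evals/main.py might consume them (the variable names mirror the env block above, but the parsing code and defaults are assumptions, not part of this diff):

```python
import os

# Hypothetical reader for the workflow's env block; the real
# handling in evals/main.py is not shown in this diff.
n_sentences = int(os.environ.get("N_SENTENCES", 10))
n_languages = int(os.environ.get("N_LANGUAGES", 1000))
n_models = int(os.environ.get("N_MODELS", 40))
```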
README.md
CHANGED
@@ -47,7 +47,8 @@ _AI model evaluations for every language in the world_

 ### Local Development
 ```bash
-uv
+uv sync --group dev
+uv run evals/main.py
 ```

 ## Explore
evals/main.py
CHANGED
@@ -39,7 +39,13 @@ async def evaluate():
     # Load cached results and filter out completed combinations
     old_results = load("results-detailed")
     if not old_results.empty:
-
+        # Only treat status == "ok" (or missing status) as completed.
+        if "status" in old_results.columns:
+            ok_mask = old_results["status"].isna() | (old_results["status"] == "ok")
+            completed_df = old_results.loc[ok_mask, ["task", "model", "bcp_47", "sentence_nr"]]
+        else:
+            completed_df = old_results[["task", "model", "bcp_47", "sentence_nr"]]
+        completed = set(completed_df.apply(tuple, axis=1))
         combis = combis[~combis.apply(lambda row: tuple(row) in completed, axis=1)]

     print(f"Running {len(combis)} evaluation tasks...")
@@ -57,16 +63,24 @@ async def evaluate():
     results = [r for batch in batch_results for result in batch for r in result]
     results = pd.DataFrame(results) if results else pd.DataFrame(columns=["task", "model", "bcp_47", "metric", "sentence_nr", "score", "origin"])

-    # Merge with cached results (immutable log)
+    # Merge with cached results (immutable log, prefer latest results on conflict)
     all_results = pd.concat([old_results, results]).drop_duplicates(
-        subset=["task", "model", "bcp_47", "metric", "sentence_nr"]
+        subset=["task", "model", "bcp_47", "metric", "sentence_nr"],
+        keep="last",
     ) if not old_results.empty else results

-    # Filter to current models × languages and aggregate
+    # Filter to current models × languages and aggregate.
+    # Only aggregate over successful evaluations (status == "ok" or missing).
     current_models = set(models.iloc[:n_models]["id"])
     current_languages = set(languages.head(n_languages)["bcp_47"])
+    if "status" in all_results.columns:
+        valid_mask = all_results["status"].isna() | (all_results["status"] == "ok")
+        valid_results = all_results[valid_mask]
+    else:
+        valid_results = all_results
+
     results_agg = (
-
+        valid_results[valid_results["model"].isin(current_models) & valid_results["bcp_47"].isin(current_languages)]
         .groupby(["model", "bcp_47", "task", "metric"])
         .agg({"score": "mean", "origin": "first"})
         .reset_index()
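A self-contained sketch of what the new merge-and-aggregate logic does, on toy data (hypothetical rows, not the project's real cache): `keep="last"` lets a fresh result supersede a cached failure for the same key, and the status mask keeps failed rows out of the score means.

```python
import pandas as pd

# Toy cache: a failed attempt and a later success for the same combination.
old = pd.DataFrame([{"task": "translation", "model": "m1", "bcp_47": "sw",
                     "metric": "bleu", "sentence_nr": 0, "score": 0.0,
                     "origin": "human", "status": "error"}])
new = pd.DataFrame([{"task": "translation", "model": "m1", "bcp_47": "sw",
                     "metric": "bleu", "sentence_nr": 0, "score": 0.41,
                     "origin": "human", "status": "ok"}])

# keep="last" makes the fresh run win over the cached failure.
merged = pd.concat([old, new]).drop_duplicates(
    subset=["task", "model", "bcp_47", "metric", "sentence_nr"], keep="last"
)

# Aggregate only over successful rows (status "ok" or missing).
valid = merged[merged["status"].isna() | (merged["status"] == "ok")]
agg = (
    valid.groupby(["model", "bcp_47", "task", "metric"])
    .agg({"score": "mean", "origin": "first"})
    .reset_index()
)
print(agg)  # one row, score 0.41
```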
evals/models.py
CHANGED
@@ -44,26 +44,17 @@ important_models = [
     "google/gemini-2.5-flash-lite", # 0.3$
     "google/gemma-3-27b-it", # 0.2$
     # "x-ai/grok-4", # $15
-    "x-ai/grok-4.1-fast:free", # free for now
     "x-ai/grok-4-fast",
-    # "x-ai/grok-3", # $15
     "cohere/command-a",
-    "qwen/qwen3-32b",
-    "qwen/qwen3-235b-a22b",
+    # "qwen/qwen3-32b",
+    # "qwen/qwen3-235b-a22b",
     "qwen/qwen3-30b-a3b", # 0.29$
-    # "qwen/qwen-turbo", # 0.2$; recognizes "inappropriate content"
-    # "qwen/qwq-32b", # 0.2$
-    # "qwen/qwen-2.5-72b-instruct", # 0.39$
-    # "qwen/qwen-2-72b-instruct", # 0.9$
     "deepseek/deepseek-v3.2-exp",
     "microsoft/phi-4", # 0.07$
     "amazon/nova-premier-v1", # 12.5$
     "amazon/nova-pro-v1", # 0.09$
     "moonshotai/kimi-k2", # 0.6$
-    # "moonshotai/kimi-k2-thinking", # 2.5$
     "baidu/ernie-4.5-300b-a47b",
-    # "baidu/ernie-4.5-21b-a3b-thinking",
-    "z-ai/glm-4.6", # 1.75$
 ]

 blocklist = [
evals/tasks.py
CHANGED
@@ -117,6 +117,7 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
         )
     else:
         prediction = await query(model, translation_prompt)
+    status = "ok" if prediction else "error"
     if prediction:
         bleu_score = bleu.compute(
             predictions=[prediction],
@@ -141,6 +142,7 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
             "sentence_nr": sentence_nr,
             "prompt": translation_prompt,
             "response": prediction,
+            "status": status,
         }
         for metric, score in (
             ("bleu", bleu_score["bleu"]),
@@ -171,6 +173,7 @@ Text:
 """
     response = await query(model, prompt)
     pred = response.lower().strip() if response else ""
+    status = "ok" if pred else "error"
     true = test_paragraph.topic.lower().strip()
     others = [t for t in top_topics if t != true]
     acc = (
@@ -193,6 +196,7 @@ Text:
             "sentence_nr": nr,
             "prompt": prompt,
             "response": pred,
+            "status": status,
         }
     ]

@@ -256,6 +260,7 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
     response = await query(model, prompt)
     final_response = extract_mc_response(response)
     acc = int(final_response == task["answer"]) if final_response else 0
+    status = "ok" if final_response else "error"

     return [
         {
@@ -268,6 +273,7 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
             "sentence_nr": nr,
             "prompt": prompt,
             "response": response,
+            "status": status,
         }
     ]

@@ -280,6 +286,7 @@ async def arc_and_evaluate(model, language_bcp_47, nr):
     response = await query(model, prompt)
     final_response = extract_mc_response(response)
     acc = int(final_response == task["answer"]) if final_response else 0
+    status = "ok" if final_response else "error"
     return [
         {
             "model": model,
@@ -291,6 +298,7 @@ async def arc_and_evaluate(model, language_bcp_47, nr):
             "sentence_nr": nr,
             "prompt": prompt,
             "response": response,
+            "status": status,
         }
     ]

@@ -323,6 +331,7 @@ async def truthfulqa_and_evaluate(model, language_bcp_47, nr):
     response = await query(model, prompt)
     final_response = extract_mc_response(response)
     acc = int(final_response.upper() == answer) if final_response else 0
+    status = "ok" if final_response else "error"
     return [
         {
             "model": model,
@@ -334,6 +343,7 @@ async def truthfulqa_and_evaluate(model, language_bcp_47, nr):
             "sentence_nr": nr,
             "prompt": prompt,
             "response": response,
+            "status": status,
         }
     ]

@@ -358,6 +368,7 @@ async def mgsm_and_evaluate(model, language_bcp_47, nr):
         if number
         else 0
     )
+    status = "ok" if number else "error"
     return [
         {
             "model": model,
@@ -369,6 +380,7 @@ async def mgsm_and_evaluate(model, language_bcp_47, nr):
             "sentence_nr": nr,
             "prompt": prompt,
             "response": response,
+            "status": status,
         }
     ]
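All twelve additions follow the same one-line pattern: derive a status from whether the model produced a usable answer, then attach it to every result row. A standalone sketch of the idea (the `query` stub and `run_task` wrapper are hypothetical; the project's real client and task functions live in evals/):

```python
import asyncio


async def query(model: str, prompt: str) -> str | None:
    # Hypothetical stub for the project's model client; an empty or
    # failed completion is represented as None here.
    return None


async def run_task(model: str, prompt: str) -> list[dict]:
    response = await query(model, prompt)
    # Tag the row so downstream aggregation can skip failures instead
    # of silently counting an empty response as a score of 0.
    status = "ok" if response else "error"
    return [{"model": model, "response": response, "status": status}]


print(asyncio.run(run_task("m1", "2+2=?")))
# [{'model': 'm1', 'response': None, 'status': 'error'}]
```

Because main.py now only treats status == "ok" rows as completed, combinations that errored are retried on the next nightly run rather than being cached forever.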
pyproject.toml
CHANGED
@@ -37,4 +37,4 @@ dev = [
     "tiktoken>=0.9.0",
     "tqdm>=4.67.1",
     "transformers>=4.51.3",
-]
+]