davidpomerenke committed · Commit 55b63ea · verified · 1 Parent(s): 44a2e08

Upload from GitHub Actions: Merge pull request #28 from datenlabor-bmz/jn-dev

.github/workflows/nightly-evals.yml CHANGED

```diff
@@ -21,14 +21,17 @@ jobs:
       - name: Install dependencies
         run: |
           curl -LsSf https://astral.sh/uv/install.sh | sh
-          uv sync --frozen --extra dev
+          # Use the `dev` dependency group defined in pyproject.toml
+          uv sync --frozen --group dev
 
       - name: Run evaluations
         env:
           OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
           HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
-          N_SENTENCES: 20
-          MAX_LANGUAGES: 150
+          N_SENTENCES: 10
+          # Keep these aligned with defaults in evals/main.py for comparability
+          N_LANGUAGES: 1000
+          N_MODELS: 40
         run: |
          uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
           uv run evals/download_data.py
```
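
The workflow now passes N_SENTENCES, N_LANGUAGES, and N_MODELS to the evaluation run. A minimal sketch of how evals/main.py presumably consumes them — the `os.getenv` pattern and fallback defaults are assumptions, only the variable names come from the workflow above:

```python
# Sketch only: env-var names come from the workflow; the getenv pattern
# and fallback defaults are assumptions, not the repo's actual code.
import os

n_sentences = int(os.getenv("N_SENTENCES", "10"))    # sentences per task/language
n_languages = int(os.getenv("N_LANGUAGES", "1000"))  # top-N languages to evaluate
n_models = int(os.getenv("N_MODELS", "40"))          # top-N models to evaluate
```
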
README.md CHANGED

````diff
@@ -47,7 +47,8 @@ _AI model evaluations for every language in the world_
 
 ### Local Development
 ```bash
-uv run --extra dev evals/main.py
+uv sync --group dev
+uv run evals/main.py
 ```
 
 ## Explore
````

evals/main.py CHANGED

```diff
@@ -39,7 +39,13 @@ async def evaluate():
     # Load cached results and filter out completed combinations
     old_results = load("results-detailed")
     if not old_results.empty:
-        completed = set(old_results[["task", "model", "bcp_47", "sentence_nr"]].apply(tuple, axis=1))
+        # Only treat status=="ok" (or missing status) as completed.
+        if "status" in old_results.columns:
+            ok_mask = old_results["status"].isna() | (old_results["status"] == "ok")
+            completed_df = old_results.loc[ok_mask, ["task", "model", "bcp_47", "sentence_nr"]]
+        else:
+            completed_df = old_results[["task", "model", "bcp_47", "sentence_nr"]]
+        completed = set(completed_df.apply(tuple, axis=1))
         combis = combis[~combis.apply(lambda row: tuple(row) in completed, axis=1)]
 
     print(f"Running {len(combis)} evaluation tasks...")
@@ -57,16 +63,24 @@ async def evaluate():
     results = [r for batch in batch_results for result in batch for r in result]
     results = pd.DataFrame(results) if results else pd.DataFrame(columns=["task", "model", "bcp_47", "metric", "sentence_nr", "score", "origin"])
 
-    # Merge with cached results (immutable log)
+    # Merge with cached results (immutable log, prefer latest results on conflict)
     all_results = pd.concat([old_results, results]).drop_duplicates(
-        subset=["task", "model", "bcp_47", "metric", "sentence_nr"]
+        subset=["task", "model", "bcp_47", "metric", "sentence_nr"],
+        keep="last",
     ) if not old_results.empty else results
 
-    # Filter to current models × languages and aggregate
+    # Filter to current models × languages and aggregate.
+    # Only aggregate over successful evaluations (status == "ok" or missing).
     current_models = set(models.iloc[:n_models]["id"])
     current_languages = set(languages.head(n_languages)["bcp_47"])
+    if "status" in all_results.columns:
+        valid_mask = all_results["status"].isna() | (all_results["status"] == "ok")
+        valid_results = all_results[valid_mask]
+    else:
+        valid_results = all_results
+
     results_agg = (
-        all_results[all_results["model"].isin(current_models) & all_results["bcp_47"].isin(current_languages)]
+        valid_results[valid_results["model"].isin(current_models) & valid_results["bcp_47"].isin(current_languages)]
         .groupby(["model", "bcp_47", "task", "metric"])
         .agg({"score": "mean", "origin": "first"})
         .reset_index()
```
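
The two changes work together: rows with status == "error" are no longer counted as completed, so those combinations are re-run, and `keep="last"` lets the re-run row replace the cached one on merge. A toy sketch of that dedup semantics (reduced column set, invented data for illustration):

```python
# Demonstrates the keep="last" merge semantics used above: the re-run row
# wins over the cached row for the same (task, model, bcp_47, metric,
# sentence_nr) key, so an earlier failure can be replaced by a later success.
import pandas as pd

old = pd.DataFrame([
    {"task": "mmlu", "model": "m", "bcp_47": "sw", "metric": "accuracy",
     "sentence_nr": 0, "score": 0.0, "status": "error"},
])
new = pd.DataFrame([
    {"task": "mmlu", "model": "m", "bcp_47": "sw", "metric": "accuracy",
     "sentence_nr": 0, "score": 1.0, "status": "ok"},
])
merged = pd.concat([old, new]).drop_duplicates(
    subset=["task", "model", "bcp_47", "metric", "sentence_nr"], keep="last"
)
print(merged["score"].tolist())  # [1.0] -- the re-run result wins
```
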
evals/models.py CHANGED

```diff
@@ -44,26 +44,17 @@ important_models = [
     "google/gemini-2.5-flash-lite", # 0.3$
     "google/gemma-3-27b-it", # 0.2$
     # "x-ai/grok-4", # $15
-    "x-ai/grok-4.1-fast:free", # free for now
     "x-ai/grok-4-fast",
-    # "x-ai/grok-3", # $15
     "cohere/command-a",
-    "qwen/qwen3-32b",
-    "qwen/qwen3-235b-a22b",
+    # "qwen/qwen3-32b",
+    # "qwen/qwen3-235b-a22b",
     "qwen/qwen3-30b-a3b", # 0.29$
-    # "qwen/qwen-turbo", # 0.2$; recognizes "inappropriate content"
-    # "qwen/qwq-32b", # 0.2$
-    # "qwen/qwen-2.5-72b-instruct", # 0.39$
-    # "qwen/qwen-2-72b-instruct", # 0.9$
     "deepseek/deepseek-v3.2-exp",
     "microsoft/phi-4", # 0.07$
     "amazon/nova-premier-v1", # 12.5$
     "amazon/nova-pro-v1", # 0.09$
     "moonshotai/kimi-k2", # 0.6$
-    # "moonshotai/kimi-k2-thinking", # 2.5$
     "baidu/ernie-4.5-300b-a47b",
-    # "baidu/ernie-4.5-21b-a3b-thinking",
-    "z-ai/glm-4.6", # 1.75$
 ]
 
 blocklist = [
```

evals/tasks.py CHANGED

```diff
@@ -117,6 +117,7 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
         )
     else:
         prediction = await query(model, translation_prompt)
+    status = "ok" if prediction else "error"
     if prediction:
         bleu_score = bleu.compute(
             predictions=[prediction],
@@ -141,6 +142,7 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
             "sentence_nr": sentence_nr,
             "prompt": translation_prompt,
             "response": prediction,
+            "status": status,
         }
         for metric, score in (
             ("bleu", bleu_score["bleu"]),
@@ -171,6 +173,7 @@ Text:
     """
     response = await query(model, prompt)
     pred = response.lower().strip() if response else ""
+    status = "ok" if pred else "error"
     true = test_paragraph.topic.lower().strip()
     others = [t for t in top_topics if t != true]
     acc = (
@@ -193,6 +196,7 @@ Text:
             "sentence_nr": nr,
             "prompt": prompt,
             "response": pred,
+            "status": status,
         }
     ]
 
@@ -256,6 +260,7 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
     response = await query(model, prompt)
     final_response = extract_mc_response(response)
     acc = int(final_response == task["answer"]) if final_response else 0
+    status = "ok" if final_response else "error"
 
     return [
         {
@@ -268,6 +273,7 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
             "sentence_nr": nr,
             "prompt": prompt,
             "response": response,
+            "status": status,
         }
     ]
 
@@ -280,6 +286,7 @@ async def arc_and_evaluate(model, language_bcp_47, nr):
     response = await query(model, prompt)
     final_response = extract_mc_response(response)
     acc = int(final_response == task["answer"]) if final_response else 0
+    status = "ok" if final_response else "error"
     return [
         {
             "model": model,
@@ -291,6 +298,7 @@ async def arc_and_evaluate(model, language_bcp_47, nr):
             "sentence_nr": nr,
             "prompt": prompt,
             "response": response,
+            "status": status,
         }
     ]
 
@@ -323,6 +331,7 @@ async def truthfulqa_and_evaluate(model, language_bcp_47, nr):
     response = await query(model, prompt)
     final_response = extract_mc_response(response)
     acc = int(final_response.upper() == answer) if final_response else 0
+    status = "ok" if final_response else "error"
     return [
         {
             "model": model,
@@ -334,6 +343,7 @@ async def truthfulqa_and_evaluate(model, language_bcp_47, nr):
             "sentence_nr": nr,
             "prompt": prompt,
             "response": response,
+            "status": status,
         }
     ]
 
@@ -358,6 +368,7 @@ async def mgsm_and_evaluate(model, language_bcp_47, nr):
         if number
         else 0
     )
+    status = "ok" if number else "error"
     return [
         {
             "model": model,
@@ -369,6 +380,7 @@ async def mgsm_and_evaluate(model, language_bcp_47, nr):
             "sentence_nr": nr,
             "prompt": prompt,
             "response": response,
+            "status": status,
         }
     ]
```
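
Every task function now records the same status field: a missing or unparseable response is marked "error" (and still scored 0 for that attempt), while anything parseable — right or wrong — is "ok". A runnable sketch of the convention (the helper name is invented for illustration):

```python
# Sketch of the status convention added in each task above: a falsy parsed
# response ("" or None) marks the row "error" so main.py retries it later;
# a parseable-but-wrong answer is still "ok" and simply scores 0.
def score_and_status(parsed, answer):
    acc = int(parsed == answer) if parsed else 0
    return acc, ("ok" if parsed else "error")

assert score_and_status("B", "B") == (1, "ok")
assert score_and_status("C", "B") == (0, "ok")      # wrong but parseable
assert score_and_status(None, "B") == (0, "error")  # no parse: retried next run
```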
 
pyproject.toml CHANGED

```diff
@@ -37,4 +37,4 @@ dev = [
     "tiktoken>=0.9.0",
     "tqdm>=4.67.1",
     "transformers>=4.51.3",
-]
+]
```
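
The rendering strips whitespace, so only the closing bracket is visibly changed in this hunk; the substantive switch is that the workflow and README now install dev dependencies with `uv sync --group dev` instead of `--extra dev`. That suggests the `dev` list sits under a PEP 735 `[dependency-groups]` table rather than `[project.optional-dependencies]` — a guess, since the full file isn't shown:

```toml
# Hypothetical layout, inferred from `uv sync --group dev`; the hunk above
# shows only the closing bracket, not the table header.
[dependency-groups]
dev = [
    "tiktoken>=0.9.0",
    "tqdm>=4.67.1",
    "transformers>=4.51.3",
]
```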