wheattoast11 commited on
Commit
09732e8
·
verified ·
1 Parent(s): 9abfaea

Upload eval_lfm_baseline.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. eval_lfm_baseline.py +44 -13
eval_lfm_baseline.py CHANGED
@@ -8,21 +8,52 @@
8
# ]
# ///

"""Evaluate baseline LiquidAI/LFM2.5-1.2B-Instruct on standard benchmarks."""

import subprocess
import sys

# Model under test; trust_remote_code is needed for the custom LFM model class.
model_args = "model_name=LiquidAI/LFM2.5-1.2B-Instruct,trust_remote_code=True"
# Comma-separated lighteval task specs: suite|task|num_fewshot|truncate_fewshot.
tasks = "leaderboard|mmlu|5|0,leaderboard|arc:challenge|0|0,leaderboard|truthfulqa:mc|0|0"

# Invoke lighteval through the current interpreter so the active environment is used.
cmd = [sys.executable, "-m", "lighteval", "accelerate", model_args, tasks, "--output-dir", "./eval_results_baseline"]

print(f"Running: {' '.join(cmd)}")
# capture_output=False streams lighteval's output straight to this console.
result = subprocess.run(cmd, capture_output=False)
sys.exit(result.returncode)
 
 
 
 
 
 
 
 
 
 
 
8
# ]
# ///

"""Evaluate baseline LiquidAI/LFM2.5-1.2B-Instruct: list available tasks first, then run."""

import subprocess
import sys
import json  # NOTE(review): appears unused in this revision — confirm before removing

# --- Step 1: discover available task names -------------------------------------
# lighteval's task registry varies across versions, so dump the full list and
# surface anything that looks like a leaderboard / MMLU / ARC / TruthfulQA task.
print("=== Listing available leaderboard tasks ===")
list_cmd = [sys.executable, "-m", "lighteval", "tasks", "list"]
result = subprocess.run(list_cmd, capture_output=True, text=True)
if result.returncode != 0:
    # Don't silently parse an empty listing — show why the listing failed.
    print(f"WARNING: task listing exited with code {result.returncode}")
    print(result.stderr[-500:])

# Filter for leaderboard tasks (case-insensitive keyword match).
_KEYWORDS = ("leaderboard", "mmlu", "arc", "truthful")
leaderboard_tasks = []
for line in result.stdout.split("\n"):
    lowered = line.lower()  # lowercase once per line, not once per keyword
    if any(keyword in lowered for keyword in _KEYWORDS):
        leaderboard_tasks.append(line.strip())
        print(line.strip())

print(f"\n=== Found {len(leaderboard_tasks)} matching tasks ===")

# Print first 50 of all tasks for debugging
print("\n=== First 50 tasks from full list ===")
for line in result.stdout.split("\n")[:50]:
    print(line)

# --- Step 2: try known task-name formats until one works -----------------------
model_args = "model_name=LiquidAI/LFM2.5-1.2B-Instruct,trust_remote_code=True"

# Try multiple suite prefixes; the right one depends on the installed lighteval.
for tasks in [
    "leaderboard|mmlu|5|0",
    "community|mmlu|5|0",
    "lighteval|mmlu|5|0",
    "original|mmlu|5|0",
]:
    print(f"\n=== Trying task format: {tasks} ===")
    cmd = [sys.executable, "-m", "lighteval", "accelerate", model_args, tasks, "--output-dir", "./eval_results_baseline"]
    r = subprocess.run(cmd, capture_output=True, text=True)
    if r.returncode == 0:
        print("SUCCESS!")
        # Tail slice never raises even when stdout is shorter than 2000 chars.
        print(r.stdout[-2000:])
        sys.exit(0)
    else:
        print(f"Failed: {r.stderr[-500:]}")

print("\nAll task formats failed. Dumping full task list to stdout.")
print(result.stdout)
sys.exit(1)