|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Evaluate baseline LiquidAI/LFM2.5-1.2B-Instruct — list tasks first, then run.""" |
|
|
|
|
|
import subprocess |
|
|
import sys |
|
|
import json |
|
|
|
|
|
|
|
|
print("=== Listing available leaderboard tasks ===")

# Ask lighteval for its full task registry, via the current interpreter so
# the same environment/venv is used.
list_cmd = [sys.executable, "-m", "lighteval", "tasks", "list"]
result = subprocess.run(list_cmd, capture_output=True, text=True)
if result.returncode != 0:
    # Don't abort: the filtering below still works on whatever (possibly
    # empty) output we got, and the stderr tail helps diagnose setup issues.
    print(f"WARNING: 'lighteval tasks list' exited with {result.returncode}: {result.stderr[-500:]}")

# Split once and reuse — the original recomputed this for every scan.
task_lines = result.stdout.split("\n")

# Keep any task whose name mentions one of the leaderboard-suite keywords.
_KEYWORDS = ("leaderboard", "mmlu", "arc", "truthful")
leaderboard_tasks = []
for line in task_lines:
    lowered = line.lower()  # lowercase once instead of once per keyword
    if any(kw in lowered for kw in _KEYWORDS):
        leaderboard_tasks.append(line.strip())
        print(line.strip())

print(f"\n=== Found {len(leaderboard_tasks)} matching tasks ===")

# Also show the head of the raw listing so the exact task-name syntax is
# visible even if the keyword filter missed it.
print("\n=== First 50 tasks from full list ===")
for line in task_lines[:50]:
    print(line)
|
|
|
|
|
|
|
|
# Model spec passed straight through to lighteval's `accelerate` backend.
model_args = "model_name=LiquidAI/LFM2.5-1.2B-Instruct,trust_remote_code=True"

# The suite prefix for MMLU has changed across lighteval releases, so probe
# the known variants until one runs; exit on the first success.
for tasks in [
    "leaderboard|mmlu|5|0",
    "community|mmlu|5|0",
    "lighteval|mmlu|5|0",
    "original|mmlu|5|0",
]:
    print(f"\n=== Trying task format: {tasks} ===")
    cmd = [sys.executable, "-m", "lighteval", "accelerate", model_args, tasks, "--output-dir", "./eval_results_baseline"]
    r = subprocess.run(cmd, capture_output=True, text=True)
    if r.returncode == 0:
        print("SUCCESS!")
        # The tail of stdout is enough to see the score table. A plain
        # [-2000:] slice already handles strings shorter than 2000 chars,
        # so the original length check was redundant.
        print(r.stdout[-2000:])
        sys.exit(0)
    else:
        # Same redundancy removed for the stderr snippet.
        print(f"Failed: {r.stderr[-500:]}")

# `result` holds the `lighteval tasks list` output captured earlier in this
# script; dump it so the correct task-spec syntax can be read off manually.
print("\nAll task formats failed. Dumping full task list to stdout.")
print(result.stdout)
sys.exit(1)
|
|
|