# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "lighteval>=0.6.0",
#     "torch>=2.0.0",
#     "transformers>=4.40.0",
#     "accelerate>=0.30.0",
# ]
# ///
"""Evaluate baseline LiquidAI/LFM2.5-1.2B-Instruct — list tasks first, then run.

The script first asks the ``lighteval`` CLI for its task list and prints the
lines that look relevant (plus the first 50 lines of the raw listing, for
debugging).  It then tries several candidate task specifications in order and
stops at the first one for which ``lighteval accelerate`` exits with code 0.

Exit status: 0 when one candidate succeeds, 1 when every candidate fails.
"""

import json
import subprocess
import sys

# Model arguments passed verbatim as the first positional argument of
# ``lighteval accelerate``.
MODEL_ARGS = "model_name=LiquidAI/LFM2.5-1.2B-Instruct,trust_remote_code=True"

# Directory handed to ``--output-dir`` for every evaluation attempt.
OUTPUT_DIR = "./eval_results_baseline"

# Case-insensitive substrings used to pick interesting lines out of the task
# listing.  These are plain substring matches, so short keywords such as
# "arc" also match unrelated names containing those letters.
TASK_KEYWORDS = ("leaderboard", "mmlu", "arc", "truthful")

# Candidate task specifications, tried in this order until one succeeds.
CANDIDATE_TASKS = (
    "leaderboard|mmlu|5|0",
    "community|mmlu|5|0",
    "lighteval|mmlu|5|0",
    "original|mmlu|5|0",
)

# Maximum number of trailing characters echoed from a child process.
SUCCESS_TAIL_CHARS = 2000
FAILURE_TAIL_CHARS = 500


def filter_task_lines(listing: str, keywords: tuple[str, ...] = TASK_KEYWORDS) -> list[str]:
    """Return the stripped lines of *listing* that contain any of *keywords*.

    Matching is a case-insensitive substring test on each line; the original
    line order is preserved.
    """
    return [
        line.strip()
        for line in listing.splitlines()
        if any(keyword in line.lower() for keyword in keywords)
    ]


def tail(text: str, limit: int) -> str:
    """Return at most the last *limit* characters of *text*.

    A non-positive *limit* yields an empty string (a bare ``text[-0:]`` would
    otherwise return the whole string).
    """
    return text[-limit:] if limit > 0 else ""


def build_eval_command(tasks: str) -> list[str]:
    """Build the ``lighteval accelerate`` argv for one task specification."""
    return [
        sys.executable,
        "-m",
        "lighteval",
        "accelerate",
        MODEL_ARGS,
        tasks,
        "--output-dir",
        OUTPUT_DIR,
    ]


def main() -> int:
    """List the available tasks, then try each candidate task specification.

    Returns:
        0 as soon as one evaluation run exits successfully, otherwise 1 after
        all candidates have failed.
    """
    # First, list available tasks to find the right names.
    print("=== Listing available leaderboard tasks ===")
    listing = subprocess.run(
        [sys.executable, "-m", "lighteval", "tasks", "list"],
        capture_output=True,
        text=True,
    )
    if listing.returncode != 0:
        # Report the failure instead of silently showing an empty listing,
        # but keep going: the evaluation attempts below do not depend on it.
        print(
            f"Task listing failed (exit code {listing.returncode}): "
            f"{tail(listing.stderr, FAILURE_TAIL_CHARS)}"
        )

    matching = filter_task_lines(listing.stdout)
    for line in matching:
        print(line)
    print(f"\n=== Found {len(matching)} matching tasks ===")

    # Print the first 50 lines of the raw listing for debugging.
    print("\n=== First 50 tasks from full list ===")
    for line in listing.stdout.splitlines()[:50]:
        print(line)

    # Try multiple task name formats; stop at the first one that works.
    for tasks in CANDIDATE_TASKS:
        print(f"\n=== Trying task format: {tasks} ===")
        attempt = subprocess.run(
            build_eval_command(tasks),
            capture_output=True,
            text=True,
        )
        if attempt.returncode == 0:
            print("SUCCESS!")
            print(tail(attempt.stdout, SUCCESS_TAIL_CHARS))
            return 0
        # Fall back to stdout when the child wrote nothing to stderr, so the
        # failure message is never empty when any output exists.
        err_snippet = tail(attempt.stderr or attempt.stdout, FAILURE_TAIL_CHARS)
        print(f"Failed: {err_snippet}")

    print("\nAll task formats failed. Dumping full task list to stdout.")
    print(listing.stdout)
    return 1


if __name__ == "__main__":
    sys.exit(main())