"""Evaluate baseline LiquidAI/LFM2.5-1.2B-Instruct — list tasks first, then run."""
| |
|
| | import subprocess |
| | import sys |
| | import json |
| |
|
| | |
# Ask lighteval for its full task catalogue; the output is parsed below.
print("=== Listing available leaderboard tasks ===")
list_cmd = [sys.executable, "-m", "lighteval", "tasks", "list"]
result = subprocess.run(list_cmd, capture_output=True, text=True)
# Warn loudly if the listing itself failed — otherwise an empty stdout
# would silently yield zero matching tasks in the filter step below.
if result.returncode != 0:
    print(f"WARNING: task listing exited with {result.returncode}: {result.stderr[-500:]}")
|
| | |
# Keywords that identify Open-LLM-Leaderboard-style tasks in the listing.
_TASK_KEYWORDS = ("leaderboard", "mmlu", "arc", "truthful")

# Collect (and echo) every listing line mentioning one of the keywords.
leaderboard_tasks = []
for line in result.stdout.split("\n"):
    lowered = line.lower()  # lowercase once instead of once per keyword
    if any(keyword in lowered for keyword in _TASK_KEYWORDS):
        leaderboard_tasks.append(line.strip())
        print(line.strip())

print(f"\n=== Found {len(leaderboard_tasks)} matching tasks ===")
| |
|
| | |
# Show the head of the raw task listing for manual inspection.
print("\n=== First 50 tasks from full list ===")
head = result.stdout.split("\n")[:50]
print("\n".join(head))
| |
|
| | |
model_args = "model_name=LiquidAI/LFM2.5-1.2B-Instruct,trust_remote_code=True"

# lighteval's task-spec syntax (suite|task|num_fewshot|truncate) varies
# across versions; try the known suite prefixes until one is accepted.
for task_spec in [
    "leaderboard|mmlu|5|0",
    "community|mmlu|5|0",
    "lighteval|mmlu|5|0",
    "original|mmlu|5|0",
]:
    print(f"\n=== Trying task format: {task_spec} ===")
    cmd = [
        sys.executable, "-m", "lighteval", "accelerate",
        model_args, task_spec,
        "--output-dir", "./eval_results_baseline",
    ]
    r = subprocess.run(cmd, capture_output=True, text=True)
    if r.returncode == 0:
        print("SUCCESS!")
        # Negative slicing already copes with short strings — the original
        # length check before slicing was redundant.
        print(r.stdout[-2000:])
        sys.exit(0)
    else:
        print(f"Failed: {r.stderr[-500:]}")

# Every known task-spec format was rejected; dump the raw listing for debugging.
print("\nAll task formats failed. Dumping full task list to stdout.")
print(result.stdout)
sys.exit(1)
| |
|