# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "lighteval>=0.6.0",
#     "torch>=2.0.0",
#     "transformers>=4.40.0",
#     "accelerate>=0.30.0",
# ]
# ///

"""Evaluate baseline LiquidAI/LFM2.5-1.2B-Instruct — list tasks first, then run."""

import subprocess
import sys
import json

# Substrings (matched case-insensitively) that mark a line of the task listing
# as relevant to this evaluation.
TASK_KEYWORDS = ("leaderboard", "mmlu", "arc", "truthful")

# Model specification passed to ``lighteval accelerate``.
MODEL_ARGS = "model_name=LiquidAI/LFM2.5-1.2B-Instruct,trust_remote_code=True"

# Directory handed to ``--output-dir`` for every evaluation attempt.
OUTPUT_DIR = "./eval_results_baseline"

# Candidate task specifications, tried in order until one of them succeeds.
TASK_FORMATS = (
    "leaderboard|mmlu|5|0",
    "community|mmlu|5|0",
    "lighteval|mmlu|5|0",
    "original|mmlu|5|0",
)


def filter_task_lines(lines, keywords=TASK_KEYWORDS):
    """Return the stripped lines that contain any of *keywords*.

    Matching is a case-insensitive substring test, so a keyword such as
    ``"arc"`` also matches longer words that contain it.

    Args:
        lines: Iterable of text lines (e.g. the task listing split by line).
        keywords: Lower-case substrings to look for.

    Returns:
        A list of matching lines with surrounding whitespace removed, in the
        order they were encountered.
    """
    matches = []
    for line in lines:
        lowered = line.lower()
        if any(keyword in lowered for keyword in keywords):
            matches.append(line.strip())
    return matches


def tail(text, limit):
    """Return at most the last *limit* characters of *text*."""
    # ``text[-0:]`` would return the whole string, so handle limit <= 0 here.
    if limit <= 0:
        return ""
    return text[-limit:]


def build_eval_command(tasks):
    """Return the argv list that runs ``lighteval accelerate`` for *tasks*."""
    return [
        sys.executable,
        "-m",
        "lighteval",
        "accelerate",
        MODEL_ARGS,
        tasks,
        "--output-dir",
        OUTPUT_DIR,
    ]


def list_tasks():
    """Run ``lighteval tasks list`` and return the completed process.

    Output is captured as text; the caller inspects ``returncode``,
    ``stdout`` and ``stderr``.
    """
    list_cmd = [sys.executable, "-m", "lighteval", "tasks", "list"]
    return subprocess.run(list_cmd, capture_output=True, text=True)


def main():
    """List the available tasks, then try each task format until one runs.

    Returns:
        0 if an evaluation attempt exits successfully, 1 if every candidate
        task format fails.
    """
    # First, list available tasks to find the right names.
    print("=== Listing available leaderboard tasks ===")
    listing = list_tasks()
    if listing.returncode != 0:
        # Without this, a failed listing looks like "0 matching tasks" and
        # gives no hint about what went wrong.
        print(
            f"Task listing failed (exit code {listing.returncode}): "
            f"{tail(listing.stderr, 500)}"
        )

    task_lines = listing.stdout.splitlines()

    leaderboard_tasks = filter_task_lines(task_lines)
    for line in leaderboard_tasks:
        print(line)

    print(f"\n=== Found {len(leaderboard_tasks)} matching tasks ===")

    # Print the first 50 entries of the full listing for debugging.
    print("\n=== First 50 tasks from full list ===")
    for line in task_lines[:50]:
        print(line)

    # The correct suite prefix is not known up front, so try each candidate.
    for tasks in TASK_FORMATS:
        print(f"\n=== Trying task format: {tasks} ===")
        attempt = subprocess.run(
            build_eval_command(tasks), capture_output=True, text=True
        )
        if attempt.returncode == 0:
            print("SUCCESS!")
            print(tail(attempt.stdout, 2000))
            return 0
        # Fall back to stdout when stderr is empty so the failure reason is
        # not lost if the tool reports errors on stdout.
        print(f"Failed: {tail(attempt.stderr or attempt.stdout, 500)}")

    print("\nAll task formats failed. Dumping full task list to stdout.")
    print(listing.stdout)
    return 1


if __name__ == "__main__":
    sys.exit(main())