#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "inspect-ai @ git+https://github.com/dvsrepo/inspect_ai.git@fallback-to-modified-for-hf-fs",
#     "datasets",
#     "openai",
#     "transformers",
#     "accelerate",
#     "huggingface_hub",
#     "inspect-evals",
#     "pandas",
#     "pyarrow",
# ]
# ///
"""Run an Inspect AI evaluation and export its logs to a Hugging Face dataset.

Takes an eval reference (a URL to a task script, or an inspect-evals task name
when --inspect-evals is passed), a model spec, and a target dataset repo;
writes logs to hf://datasets/<repo>/logs, then uploads summary parquet files.
"""

import os
import subprocess
import sys
import tempfile
import urllib.request
from pathlib import Path

from inspect_ai.analysis import evals_df, samples_df


def export_logs_to_parquet(log_dir: str, dataset_repo: str) -> None:
    """Flatten the eval logs into dataframes and upload them as parquet files."""
    from huggingface_hub import HfApi

    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        raise ValueError("HF_TOKEN environment variable not set")

    api = HfApi(token=hf_token)
    # HfApi expects a bare "<user>/<name>" repo id, not the "datasets/" prefix
    # used in hf:// URLs.
    repo_id = (
        dataset_repo.replace("datasets/", "")
        if dataset_repo.startswith("datasets/")
        else dataset_repo
    )

    evals = evals_df(logs=log_dir)
    samples = samples_df(logs=log_dir)

    with tempfile.TemporaryDirectory() as tmpdir:
        evals_path = Path(tmpdir) / "evals.parquet"
        samples_path = Path(tmpdir) / "samples.parquet"
        evals.to_parquet(evals_path, index=False, engine="pyarrow")
        samples.to_parquet(samples_path, index=False, engine="pyarrow")

        api.upload_file(
            path_or_fileobj=str(evals_path),
            path_in_repo="evals.parquet",
            repo_id=repo_id,
            repo_type="dataset",
            token=hf_token,
        )
        api.upload_file(
            path_or_fileobj=str(samples_path),
            path_in_repo="samples.parquet",
            repo_id=repo_id,
            repo_type="dataset",
            token=hf_token,
        )


if __name__ == "__main__":
    if len(sys.argv) < 4:
        print(
            "Usage: eval_runner.py <eval_ref> <model> <dataset_repo> "
            "[--inspect-evals] [extra_args...]"
        )
        sys.exit(1)

    eval_ref = sys.argv[1]
    model = sys.argv[2]
    dataset_repo = sys.argv[3]
    is_inspect_evals = "--inspect-evals" in sys.argv
    extra_args = [arg for arg in sys.argv[4:] if arg != "--inspect-evals"]

    # Normalize the repo reference and log into the dataset's logs/ directory
    # over the Hugging Face filesystem.
    if not dataset_repo.startswith("datasets/"):
        dataset_repo = f"datasets/{dataset_repo}"
    log_dir = f"hf://{dataset_repo}/logs"

    if is_inspect_evals:
        # A registered inspect-evals task name; pass it through unchanged.
        eval_target = eval_ref
        cleanup_file = None
    else:
        # Otherwise treat the reference as a URL and download the task script.
        print("Downloading eval script...")
        with urllib.request.urlopen(eval_ref) as response:
            eval_code = response.read().decode("utf-8")
        eval_filename = "downloaded_eval.py"
        with open(eval_filename, "w") as f:
            f.write(eval_code)
        eval_target = eval_filename
        cleanup_file = eval_filename

    # A comma-separated model spec means multiple models, run as an eval set.
    is_eval_set = "," in model

    try:
        print("Running evaluation set..." if is_eval_set else "Running evaluation...")
        cmd = [
            "inspect",
            "eval-set" if is_eval_set else "eval",
            eval_target,
            "--model", model,
            "--log-dir", log_dir,
            "--log-shared",
            "--log-buffer", "100",
        ]
        cmd.extend(extra_args)
        subprocess.run(cmd, check=True)

        print("Exporting logs to parquet...")
        try:
            export_logs_to_parquet(log_dir, dataset_repo)
        except Exception as e:
            print(f"Warning: Could not export to parquet: {e}")
    finally:
        # Remove the downloaded task script, if any.
        if cleanup_file and os.path.exists(cleanup_file):
            os.unlink(cleanup_file)
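
# ---------------------------------------------------------------------------
# Example invocations (a sketch: the eval URL, model names, and the
# "user/eval-results" dataset repo below are hypothetical placeholders).
# The PEP 723 metadata block at the top lets `uv run` resolve the
# dependencies automatically.
#
#   export HF_TOKEN=hf_xxx
#
#   # Run a task script fetched from a URL against one model:
#   uv run eval_runner.py https://example.com/my_eval.py \
#       openai/gpt-4o user/eval-results
#
#   # Run a registered inspect-evals task against two models; the
#   # comma-separated model spec routes through `inspect eval-set`:
#   uv run eval_runner.py inspect_evals/gsm8k \
#       openai/gpt-4o,anthropic/claude-3-5-sonnet user/eval-results \
#       --inspect-evals
# ---------------------------------------------------------------------------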