Spaces:
Sleeping
Sleeping
File size: 3,916 Bytes
895066e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "inspect-ai @ git+https://github.com/dvsrepo/inspect_ai.git@fallback-to-modified-for-hf-fs",
# "datasets",
# "openai",
# "transformers",
# "accelerate",
# "huggingface_hub",
# "inspect-evals",
# "pandas",
# "pyarrow",
# ]
# ///
import os
import sys
import subprocess
import tempfile
import urllib.request
from pathlib import Path
from inspect_ai.analysis import evals_df, samples_df
def export_logs_to_parquet(log_dir: str, dataset_repo: str) -> None:
    """Export eval logs to parquet files and upload them to a Hub dataset repo.

    Loads the eval-level and sample-level tables for ``log_dir`` via
    ``evals_df`` and ``samples_df``, writes them as ``evals.parquet`` and
    ``samples.parquet`` in a temporary directory, and uploads both files to
    the root of the dataset repository.

    Args:
        log_dir: Log location handed to ``evals_df`` / ``samples_df``.
        dataset_repo: Target dataset repository, with or without a leading
            ``datasets/`` prefix.

    Raises:
        ValueError: If the ``HF_TOKEN`` environment variable is unset or empty.
    """
    # Validate credentials first so a missing token fails fast, before the
    # Hub client is imported and before any logs are read.
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        raise ValueError("HF_TOKEN environment variable not set")

    from huggingface_hub import HfApi

    # Strip only a *leading* "datasets/". str.replace would also delete the
    # substring inside the repo id itself ("my-datasets/repo" -> "my-repo").
    repo_id = dataset_repo.removeprefix("datasets/")

    api = HfApi(token=hf_token)
    tables = {
        "evals.parquet": evals_df(logs=log_dir),
        "samples.parquet": samples_df(logs=log_dir),
    }

    with tempfile.TemporaryDirectory() as tmpdir:
        # Write every file before uploading anything, so a serialization
        # failure does not leave the repo with only one of the two files.
        local_paths = {}
        for filename, table in tables.items():
            local_path = Path(tmpdir) / filename
            table.to_parquet(local_path, index=False, engine="pyarrow")
            local_paths[filename] = local_path

        for filename, local_path in local_paths.items():
            api.upload_file(
                path_or_fileobj=str(local_path),
                path_in_repo=filename,
                repo_id=repo_id,
                repo_type="dataset",
                token=hf_token,
            )
if __name__ == "__main__":
    # Command-line entry point:
    #   eval_runner.py <eval_ref> <model> <dataset_repo> [--inspect-evals] [extra_args...]
    # Runs `inspect eval` (or `inspect eval-set`) with logs written to the
    # dataset repo, then exports those logs to parquet.
    if len(sys.argv) < 4:
        print("Usage: eval_runner.py <eval_ref> <model> <dataset_repo> [--inspect-evals] [extra_args...]")
        sys.exit(1)

    eval_ref = sys.argv[1]
    model = sys.argv[2]
    dataset_repo = sys.argv[3]
    is_inspect_evals = "--inspect-evals" in sys.argv
    # Everything after the three positionals is forwarded to `inspect`,
    # except our own --inspect-evals flag.
    extra_args = [arg for arg in sys.argv[4:] if arg != "--inspect-evals"]

    if not dataset_repo.startswith("datasets/"):
        dataset_repo = f"datasets/{dataset_repo}"
    log_dir = f"hf://{dataset_repo}/logs"

    # A comma in the model argument selects `inspect eval-set`
    # (presumably a comma-separated list of models — confirm with callers).
    is_eval_set = "," in model

    cleanup_file = None
    try:
        if is_inspect_evals:
            # eval_ref is passed to `inspect` unchanged; nothing to download.
            eval_target = eval_ref
        else:
            print("Downloading eval script...")
            with urllib.request.urlopen(eval_ref) as response:
                eval_code = response.read().decode("utf-8")
            eval_filename = "downloaded_eval.py"
            # Register the file for cleanup before writing so a failed or
            # partial write is still removed by the finally clause.
            cleanup_file = eval_filename
            # The payload was decoded as UTF-8; write it back the same way
            # instead of relying on the platform's locale encoding.
            with open(eval_filename, "w", encoding="utf-8") as f:
                f.write(eval_code)
            eval_target = eval_filename

        if is_eval_set:
            print("Running evaluation set...")
            subcommand = "eval-set"
        else:
            print("Running evaluation...")
            subcommand = "eval"

        cmd = [
            "inspect",
            subcommand,
            eval_target,
            "--model",
            model,
            "--log-dir",
            log_dir,
            "--log-shared",
            "--log-buffer",
            "100",
            *extra_args,
        ]
        # check=True: a non-zero exit from `inspect` raises CalledProcessError
        # and skips the parquet export.
        subprocess.run(cmd, check=True)

        print("Exporting logs to parquet...")
        try:
            export_logs_to_parquet(log_dir, dataset_repo)
        except Exception as e:
            # Deliberately best-effort: the evaluation already succeeded, so
            # an export failure is reported as a warning rather than a crash.
            print(f"Warning: Could not export to parquet: {e}")
    finally:
        if cleanup_file and os.path.exists(cleanup_file):
            os.unlink(cleanup_file)
|