neuralese_temp / src /eval_sweep_models.py

Export neuralese codebase (cache and .env excluded).

dbc69f3 about 1 month ago

13.4 kB

	from __future__ import annotations

	import csv
	import json
	import os
	import re
	from pathlib import Path

	import torch
	import torch.distributed as dist
	import yaml
	from transformers import AutoModelForCausalLM, AutoTokenizer

	import hackable # noqa: F401
	from hackable.data_plugins import GSM8KProvider
	from hackable.paths import resolve_storage_path, storage_layout
	from hackable.reward_plugins import gsm8k_correctness_reward
	from hackable.utils import resolve_repo_path

	# Captures the text between the first <think> ... </think> pair; DOTALL lets
	# the captured reasoning span multiple lines.
	THINK_RE = re.compile(r"<think>(.*?)</think>", flags=re.DOTALL)


	def _load_yaml(path: str) -> dict:
	    """Parse the UTF-8 YAML file at ``path`` with ``yaml.safe_load``.

	    The parsed document is returned as-is; the ``dict`` annotation reflects
	    how callers in this module use it (as a mapping of config sections).
	    """
	    with open(path, encoding="utf-8") as stream:
	        parsed = yaml.safe_load(stream)
	    return parsed


	def _cot_word_len(completion: str) -> int:
	    """Count whitespace-separated words inside the first ``<think>`` block.

	    Returns 0 when the completion has no ``<think>...</think>`` pair or the
	    block contains only whitespace.
	    """
	    found = THINK_RE.search(completion)
	    if found is None:
	        return 0
	    # split() without a separator ignores surrounding whitespace and yields
	    # an empty list for blank text, so no explicit strip/empty check is needed.
	    return len(found.group(1).split())


	def _model_dtype(cfg: dict):
	    """Pick the torch dtype used to load models from ``cfg["trainer"]["bf16"]``.

	    Returns ``torch.bfloat16`` when the flag is truthy or absent, otherwise
	    ``torch.float16``.
	    """
	    trainer_cfg = cfg.get("trainer", {})
	    if bool(trainer_cfg.get("bf16", True)):
	        return torch.bfloat16
	    return torch.float16


	def _get_cache_paths(base_cfg: dict) -> tuple[Path, Path]:
	    """Return the ``(datasets, models)`` cache locations for this config.

	    The cache root comes from ``storage.cache_dir`` (default ``"cache"``) and
	    is expanded by ``storage_layout``.
	    """
	    storage_cfg = base_cfg.get("storage", {})
	    cache_dir = storage_cfg.get("cache_dir", "cache")
	    layout = storage_layout(cache_dir)
	    return layout.datasets, layout.models


	def _dist_info() -> tuple[int, int, int]:
	    """Read ``(rank, world_size, local_rank)`` from the environment.

	    Uses the ``RANK``, ``WORLD_SIZE`` and ``LOCAL_RANK`` variables; when one
	    is unset the single-process values 0, 1 and 0 are used respectively.
	    """
	    env = os.environ
	    return (
	        int(env.get("RANK", "0")),
	        int(env.get("WORLD_SIZE", "1")),
	        int(env.get("LOCAL_RANK", "0")),
	    )


	def _init_distributed() -> tuple[int, int, int]:
	    """Initialize ``torch.distributed`` when launched with more than one process.

	    The process group is created only if ``WORLD_SIZE`` > 1 and no group is
	    initialized yet, using NCCL when CUDA is available and Gloo otherwise.

	    Returns:
	        The ``(rank, world_size, local_rank)`` triple from ``_dist_info``.
	    """
	    rank, world_size, local_rank = _dist_info()
	    needs_init = world_size > 1 and not dist.is_initialized()
	    if needs_init:
	        if torch.cuda.is_available():
	            backend = "nccl"
	        else:
	            backend = "gloo"
	        dist.init_process_group(backend=backend, init_method="env://")
	    return rank, world_size, local_rank


	def _resolve_local_model_dir(base_cfg: dict, model_dir: str) -> Path:
	    """Locate ``model_dir`` on the local filesystem.

	    Candidates are probed in order and the first existing one is returned:

	    1. ``model_dir`` as given (absolute, or relative to the working directory),
	    2. ``resolve_repo_path(model_dir)``,
	    3. ``model_dir`` under the resolved ``storage.cache_dir`` (default ``"cache"``).

	    Raises:
	        FileNotFoundError: If none of the candidates exists.
	    """
	    as_given = Path(model_dir)
	    # Absolute and relative inputs are handled identically at this step.
	    if as_given.exists():
	        return as_given.resolve()

	    in_repo = resolve_repo_path(model_dir)
	    if in_repo.exists():
	        return in_repo

	    cache_dir = base_cfg.get("storage", {}).get("cache_dir", "cache")
	    in_cache = (resolve_repo_path(cache_dir) / as_given).resolve()
	    if in_cache.exists():
	        return in_cache

	    raise FileNotFoundError(
	        f"Model directory not found locally: '{model_dir}'. "
	        f"Tried '{as_given}', '{in_repo}', and '{in_cache}'."
	    )


	def _resolve_sweep_root(base_cfg: dict, requested_sweep_root: Path) -> Path:
	    """Resolve the sweep root and check that it holds at least one run directory.

	    The requested path is passed through ``resolve_storage_path`` together
	    with ``storage.cache_dir`` (default ``"cache"``). The result is accepted
	    only if it is a directory with a sub-directory whose name starts with
	    ``run_``.

	    Raises:
	        FileNotFoundError: If the resolved path is not such a directory.
	    """
	    cache_dir = base_cfg.get("storage", {}).get("cache_dir", "cache")
	    resolved = resolve_storage_path(requested_sweep_root, cache_dir)
	    if resolved.is_dir():
	        for entry in resolved.iterdir():
	            if entry.is_dir() and entry.name.startswith("run_"):
	                return resolved
	    raise FileNotFoundError(
	        "Could not resolve SWEEP_ROOT with run directories: "
	        f"{resolved}"
	    )


	def _discover_model_dirs(sweep_root: Path) -> list[Path]:
	    """List the ``run_*`` sub-directories of ``sweep_root`` in sorted order.

	    Raises:
	        FileNotFoundError: If no directory starting with ``run_`` is present.
	    """
	    run_dirs = sorted(
	        entry
	        for entry in sweep_root.iterdir()
	        if entry.is_dir() and entry.name.startswith("run_")
	    )
	    if run_dirs:
	        return run_dirs
	    raise FileNotFoundError(
	        f"No run directories starting with 'run_' found in {sweep_root}"
	    )


	def _generate_completions(
	    model,
	    tokenizer,
	    prompts: list[str],
	    device: torch.device,
	    batch_size: int,
	    max_prompt_len: int,
	    max_completion_len: int,
	) -> list[str]:
	    """Greedy-decode one completion per prompt, ``batch_size`` prompts at a time.

	    The tokenizer must pad on the left: the completion of every row is taken
	    as the columns past the padded input width, which is only correct when
	    all prompts end at the same column.
	    """
	    completions: list[str] = []
	    for start in range(0, len(prompts), batch_size):
	        enc = tokenizer(
	            prompts[start : start + batch_size],
	            return_tensors="pt",
	            padding=True,
	            truncation=True,
	            max_length=max_prompt_len,
	        )
	        input_ids = enc["input_ids"].to(device)
	        attn = enc["attention_mask"].to(device)
	        out = model.generate(
	            input_ids=input_ids,
	            attention_mask=attn,
	            max_new_tokens=max_completion_len,
	            do_sample=False,
	            pad_token_id=tokenizer.pad_token_id,
	            eos_token_id=tokenizer.eos_token_id,
	        )
	        # generate() returns prompt + continuation for causal LMs; drop the
	        # (left-padded) prompt columns to keep only the new tokens.
	        new_tokens = out[:, input_ids.size(1) :]
	        completions.extend(
	            tokenizer.decode(row, skip_special_tokens=True) for row in new_tokens
	        )
	    return completions


	@torch.no_grad()
	def evaluate_one_model(
	    model_dir: Path,
	    base_cfg: dict,
	    eval_max_samples: int,
	    batch_size: int,
	) -> list[dict]:
	    """Generate and score GSM8K test completions for one model directory.

	    The test split is loaded through ``GSM8KProvider`` and sharded round-robin
	    across ranks (rank ``r`` takes samples ``r, r + world_size, ...``). Each
	    rank loads the model from ``model_dir``, greedy-decodes its shard and
	    scores it with ``gsm8k_correctness_reward``. When a process group is
	    initialized the per-rank records are exchanged with ``all_gather_object``,
	    so every rank returns the full list.

	    Args:
	        model_dir: Local directory passed to ``from_pretrained`` with
	            ``local_files_only=True``.
	        base_cfg: Parsed config. Reads ``generation.max_prompt_length``
	            (default 512), ``generation.max_completion_length`` (default 256),
	            ``model.trust_remote_code`` (default False) and, only if the
	            tokenizer cannot be loaded from ``model_dir``, ``model.name``.
	        eval_max_samples: Cap on the number of test samples; a negative value
	            is forwarded to the provider as ``None``.
	        batch_size: Number of prompts per ``generate`` call; must be >= 1.

	    Returns:
	        One dict per sample with keys ``sample_index``, ``prompt``,
	        ``reference``, ``completion``, ``correctness`` and ``cot_words``,
	        sorted by ``sample_index``.

	    Raises:
	        ValueError: If ``batch_size`` is smaller than 1.
	    """
	    # Fail before loading any weights: a zero step makes range() raise an
	    # opaque error and a negative step would silently generate nothing.
	    if batch_size < 1:
	        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

	    rank, world_size, local_rank = _dist_info()
	    generation = base_cfg.get("generation", {})
	    max_prompt_len = int(generation.get("max_prompt_length", 512))
	    max_completion_len = int(generation.get("max_completion_length", 256))
	    trust_remote_code = bool(base_cfg.get("model", {}).get("trust_remote_code", False))
	    dtype = _model_dtype(base_cfg)
	    datasets_cache, models_cache = _get_cache_paths(base_cfg)

	    provider = GSM8KProvider()
	    all_samples = provider.load(
	        split="test",
	        max_samples=None if eval_max_samples < 0 else eval_max_samples,
	        cache_dir=str(datasets_cache),
	    )
	    # Round-robin shard; the global index is kept so records can be re-ordered
	    # after gathering.
	    indices = list(range(rank, len(all_samples), world_size))
	    local_samples = [all_samples[idx] for idx in indices]
	    prompts = [sample.prompt for sample in local_samples]
	    refs = [sample.target for sample in local_samples]
	    metadata = [sample.metadata for sample in local_samples]

	    try:
	        tokenizer = AutoTokenizer.from_pretrained(
	            str(model_dir),
	            trust_remote_code=trust_remote_code,
	            cache_dir=str(models_cache),
	            local_files_only=True,
	        )
	    except Exception:
	        # Deliberate best-effort fallback to the base model's tokenizer when
	        # the run directory cannot provide one. ``model.name`` is looked up
	        # only here so configs without it still work for complete checkpoints.
	        tokenizer = AutoTokenizer.from_pretrained(
	            str(base_cfg["model"]["name"]),
	            trust_remote_code=trust_remote_code,
	            cache_dir=str(models_cache),
	            local_files_only=True,
	        )

	    model = AutoModelForCausalLM.from_pretrained(
	        str(model_dir),
	        trust_remote_code=trust_remote_code,
	        cache_dir=str(models_cache),
	        torch_dtype=dtype,
	        local_files_only=True,
	    )
	    if torch.cuda.is_available():
	        torch.cuda.set_device(local_rank)
	        device = torch.device(f"cuda:{local_rank}")
	    else:
	        device = torch.device("cpu")
	    model.to(device)
	    model.eval()

	    if tokenizer.pad_token_id is None:
	        tokenizer.pad_token = tokenizer.eos_token
	    # Causal LMs continue from the last column of the input, so batched
	    # prompts must be padded on the left; with right padding the shorter
	    # prompts in a batch would be continued from a pad token.
	    tokenizer.padding_side = "left"

	    completions = _generate_completions(
	        model=model,
	        tokenizer=tokenizer,
	        prompts=prompts,
	        device=device,
	        batch_size=batch_size,
	        max_prompt_len=max_prompt_len,
	        max_completion_len=max_completion_len,
	    )

	    scores = gsm8k_correctness_reward(
	        prompts=prompts,
	        completions=completions,
	        references=refs,
	        metadata=metadata,
	    )

	    local_records: list[dict] = [
	        {
	            "sample_index": int(sample_index),
	            "prompt": prompt,
	            "reference": reference,
	            "completion": completion,
	            "correctness": float(score),
	            "cot_words": int(_cot_word_len(completion)),
	        }
	        for sample_index, prompt, reference, completion, score in zip(
	            indices, prompts, refs, completions, scores, strict=True
	        )
	    ]

	    # Release the weights before the next model of the sweep is loaded.
	    del model
	    if torch.cuda.is_available():
	        torch.cuda.empty_cache()

	    if dist.is_initialized():
	        gathered: list[list[dict] | None] = [None for _ in range(world_size)]
	        dist.all_gather_object(gathered, local_records)
	        merged: list[dict] = []
	        for part in gathered:
	            if part:
	                merged.extend(part)
	    else:
	        merged = local_records

	    merged.sort(key=lambda row: row["sample_index"])
	    return merged


	def _summarize(records: list[dict], model_dir: str) -> dict:
	    """Aggregate per-sample records into one summary row for a model.

	    Args:
	        records: Rows carrying ``correctness`` and ``cot_words`` values.
	        model_dir: Model directory; its last path component becomes ``name``.

	    Returns:
	        A dict with ``name``, ``model_dir``, ``num_examples``, ``accuracy``
	        (mean correctness) and ``avg_cot_words`` (mean ``cot_words``). Both
	        means are 0.0 when there are no records.
	    """
	    count = len(records) if records else 0
	    if count:
	        mean_correct = sum(float(row["correctness"]) for row in records) / count
	        mean_cot = sum(float(row["cot_words"]) for row in records) / count
	    else:
	        mean_correct = 0.0
	        mean_cot = 0.0
	    return {
	        "name": Path(model_dir).name,
	        "model_dir": model_dir,
	        "num_examples": count,
	        "accuracy": float(mean_correct),
	        "avg_cot_words": float(mean_cot),
	    }


	def _write_accuracy_svg(summaries: list[dict], path: Path) -> None:
	width = 1000
	height = 460
	left_margin = 70
	right_margin = 30
	top_margin = 70
	bottom_margin = 90
	plot_w = width - left_margin - right_margin
	plot_h = height - top_margin - bottom_margin
	y_base = top_margin + plot_h

	runs = [row["name"] for row in summaries]
	acc_vals = [float(row["accuracy"]) for row in summaries]
	vmax = max(1.0, max(acc_vals) if acc_vals else 1.0)

	bar_count = max(1, len(runs))
	slot_w = plot_w / bar_count
	bar_w = min(120, max(30, int(slot_w * 0.55)))
	palette = ["#2563eb", "#dc2626", "#16a34a", "#ca8a04", "#7c3aed", "#0891b2"]

	parts: list[str] = []
	parts.append(f'<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}">')
	parts.append('<rect width="100%" height="100%" fill="#ffffff"/>')
	parts.append(
	'<text x="40" y="34" font-size="20" font-family="sans-serif">Sweep Evaluation: GSM8K Accuracy</text>'
	)
	parts.append(
	f'<line x1="{left_margin}" y1="{y_base}" x2="{left_margin + plot_w}" y2="{y_base}" stroke="#111" stroke-width="2" />'
	)
	parts.append(
	f'<line x1="{left_margin}" y1="{top_margin}" x2="{left_margin}" y2="{y_base}" stroke="#111" stroke-width="2" />'
	)

	# y-axis ticks
	for tick in [0.0, 0.25, 0.5, 0.75, 1.0]:
	y = y_base - int((tick / vmax) * plot_h) if vmax > 0 else y_base
	parts.append(
	f'<line x1="{left_margin - 6}" y1="{y}" x2="{left_margin}" y2="{y}" stroke="#111" stroke-width="1" />'
	)
	parts.append(
	f'<text x="{left_margin - 10}" y="{y + 4}" text-anchor="end" font-size="11" font-family="sans-serif">{tick:.2f}</text>'
	)

	for idx, (run_name, acc) in enumerate(zip(runs, acc_vals, strict=True)):
	center_x = left_margin + int((idx + 0.5) * slot_w)
	bar_h = int((acc / vmax) * plot_h) if vmax > 0 else 0
	x = center_x - bar_w // 2
	y = y_base - bar_h
	color = palette[idx % len(palette)]
	parts.append(f'<rect x="{x}" y="{y}" width="{bar_w}" height="{bar_h}" fill="{color}" />')
	parts.append(
	f'<text x="{center_x}" y="{y - 8}" text-anchor="middle" font-size="12" font-family="sans-serif">{acc:.3f}</text>'
	)
	parts.append(
	f'<text x="{center_x}" y="{y_base + 18}" text-anchor="middle" font-size="11" font-family="sans-serif">{run_name}</text>'
	)

	parts.append("</svg>")
	path.write_text("\n".join(parts), encoding="utf-8")


	def _write_sweep_reports(summaries: list[dict], out_root: Path) -> None:
	    """Write the JSON, CSV and SVG sweep summaries into ``out_root`` and print their paths."""
	    json_path = out_root / "sweep_eval_summary.json"
	    csv_path = out_root / "sweep_eval_summary.csv"
	    svg_path = out_root / "sweep_eval_accuracy.svg"
	    json_path.write_text(json.dumps(summaries, indent=2), encoding="utf-8")
	    with csv_path.open("w", encoding="utf-8", newline="") as handle:
	        writer = csv.DictWriter(
	            handle,
	            fieldnames=[
	                "name",
	                "model_dir",
	                "num_examples",
	                "accuracy",
	                "avg_cot_words",
	                "outputs_jsonl",
	            ],
	        )
	        writer.writeheader()
	        writer.writerows(summaries)
	    _write_accuracy_svg(summaries, svg_path)

	    print(f"Saved summary: {json_path}")
	    print(f"Saved summary: {csv_path}")
	    print(f"Saved plot: {svg_path}")
	    print(f"Saved outputs dir: {out_root / 'outputs'}")


	def main() -> None:
	    """Evaluate every ``run_*`` model under ``SWEEP_ROOT`` and write reports.

	    Environment variables:
	        BASE_CONFIG: Path to the YAML config (required).
	        SWEEP_ROOT: Directory holding the ``run_*`` model directories (required).
	        OUT_ROOT: Output directory; defaults to ``<sweep_root>/eval_results``.
	        EVAL_MAX_SAMPLES: Sample cap passed to ``evaluate_one_model`` (default 200).
	        EVAL_BATCH_SIZE: Generation batch size (default 4).

	    Every rank takes part in the evaluation; only rank 0 writes the per-model
	    JSONL outputs and the final JSON/CSV/SVG summaries.
	    """
	    # Remember whether the process group pre-existed so that only a group
	    # created by this call is torn down at the end.
	    owns_process_group = not dist.is_initialized()
	    rank, _, _ = _init_distributed()
	    base_cfg = _load_yaml(str(resolve_repo_path(os.environ["BASE_CONFIG"])))
	    requested_sweep_root = Path(os.environ["SWEEP_ROOT"])
	    sweep_root = _resolve_sweep_root(base_cfg, requested_sweep_root)
	    if "OUT_ROOT" in os.environ:
	        out_root = resolve_repo_path(os.environ["OUT_ROOT"])
	    else:
	        out_root = (sweep_root / "eval_results").resolve()
	    eval_max_samples = int(os.environ.get("EVAL_MAX_SAMPLES", "200"))
	    eval_batch_size = int(os.environ.get("EVAL_BATCH_SIZE", "4"))

	    model_dirs = _discover_model_dirs(sweep_root)
	    resolved_model_dirs = [_resolve_local_model_dir(base_cfg, str(path)) for path in model_dirs]

	    if rank == 0:
	        out_root.mkdir(parents=True, exist_ok=True)
	        (out_root / "outputs").mkdir(parents=True, exist_ok=True)

	    # Make sure the output directories exist before any rank proceeds.
	    if dist.is_initialized():
	        dist.barrier()

	    summaries: list[dict] = []
	    for model_dir in resolved_model_dirs:
	        records = evaluate_one_model(
	            model_dir=model_dir,
	            base_cfg=base_cfg,
	            eval_max_samples=eval_max_samples,
	            batch_size=eval_batch_size,
	        )
	        if rank == 0:
	            output_jsonl = out_root / "outputs" / f"{model_dir.name}_outputs.jsonl"
	            with output_jsonl.open("w", encoding="utf-8") as handle:
	                for row in records:
	                    handle.write(json.dumps(row, ensure_ascii=True) + "\n")
	            summary = _summarize(records, str(model_dir))
	            summary["outputs_jsonl"] = str(output_jsonl)
	            summaries.append(summary)

	        if dist.is_initialized():
	            dist.barrier()

	    # No collective is issued past this point, so every rank releases the
	    # process group here instead of leaking it at interpreter exit.
	    if owns_process_group and dist.is_initialized():
	        dist.destroy_process_group()

	    if rank != 0:
	        return

	    _write_sweep_reports(summaries, out_root)


	# Run the sweep evaluation only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()