qwen3-4b-thinking-microagent / scripts /eval_format_compliance.py

Upload folder using huggingface_hub

a15ae41 verified 11 days ago

6.39 kB

	"""Cheap eval: does the trained model emit MicroAgent-compliant output?

	This is the FIRST eval to run after training. It catches the most common
	failure mode (model emits malformed responses) without needing TB2 Docker setup.

	Procedure:
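	1. Sample a small slice of converted trajectories from --data (the default is the
	training file, so the prompts are not truly held out; this checks format only)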
	2. For each, take the prefix up through some turn, generate the next assistant turn
	3. Check: does the output contain a valid <think>...</think> block? A valid
	<bash>...</bash> or <finish>...</finish>? Anything outside the tags?

	Pass rate >=95% means the model has learned the format; 80-95% is borderline
	(consider more training); <80% means broken training.

	Usage:
	python scripts/eval_format_compliance.py \\
	--model runs/hunyuan-4b-microagent-v1/final \\
	--base-model tencent-hunyuan/Hunyuan-4B-Instruct \\
	--data data/microagent_train.jsonl \\
	--n 50
	"""
	from __future__ import annotations

	import argparse
	import json
	import os
	import random
	import re
	import sys
	from pathlib import Path


	# Matchers for the three MicroAgent tags. Each pattern is non-greedy and
	# compiled with DOTALL so a tag body may span several lines; group(1) is the
	# text between the opening and closing tag.
	_THINK_RE = re.compile(r"<think>(.*?)</think>", re.DOTALL)
	_BASH_RE = re.compile(r"<bash>(.*?)</bash>", re.DOTALL)
	_FINISH_RE = re.compile(r"<finish>(.*?)</finish>", re.DOTALL)


	def parse_args():
	p = argparse.ArgumentParser()
	p.add_argument("--model", required=True,
	help="Path to LoRA adapter dir, or full merged model")
	p.add_argument("--base-model", default=None,
	help="Base model id if --model points to an adapter")
	p.add_argument("--data", default="data/microagent_train.jsonl")
	p.add_argument("--n", type=int, default=50,
	help="Number of held-out prompts to test")
	p.add_argument("--max-new-tokens", type=int, default=512)
	p.add_argument("--temperature", type=float, default=0.1)
	p.add_argument("--seed", type=int, default=123)
	return p.parse_args()


	def classify(text: str) -> tuple[str, list[str]]:
	    """Classify one generated assistant turn against the MicroAgent format.

	    Args:
	        text: The decoded model output for a single assistant turn.

	    Returns:
	        ``(kind, problems)``. ``kind`` is ``"bash"`` or ``"finish"`` when the
	        corresponding action tag is present (``"bash"`` wins if both are),
	        otherwise ``"invalid"``. ``problems`` lists every format violation
	        found; it can be non-empty even when ``kind`` is not ``"invalid"``,
	        so callers that want strict compliance must check both.
	    """
	    problems: list[str] = []
	    if _THINK_RE.search(text) is None:
	        problems.append("missing <think>")

	    # Look for the action only outside the reasoning: a <bash> or <finish>
	    # that the model merely mentions while thinking is not an action.
	    outside_think = _THINK_RE.sub("", text)
	    bash_m = _BASH_RE.search(outside_think)
	    finish_m = _FINISH_RE.search(outside_think)

	    # Whatever survives once every well-formed block is removed is stray text
	    # (or an unclosed tag); whitespace between blocks is allowed.
	    leftover = _FINISH_RE.sub("", _BASH_RE.sub("", outside_think))
	    if leftover.strip():
	        problems.append("text outside tags")

	    if bash_m and finish_m:
	        problems.append("both <bash> and <finish>")
	    if bash_m:
	        if not bash_m.group(1).strip():
	            problems.append("empty <bash>")
	        return "bash", problems
	    if finish_m:
	        if not finish_m.group(1).strip():
	            problems.append("empty <finish>")
	        return "finish", problems

	    problems.append("no <bash> or <finish>")
	    return "invalid", problems


	def _load_model_and_tokenizer(args):
	    """Load the model under test (adapter-on-base or merged) and its tokenizer."""
	    import torch
	    from transformers import AutoModelForCausalLM, AutoTokenizer

	    load_kwargs = dict(
	        torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True,
	    )
	    if args.base_model:
	        # --model is a LoRA adapter: load the base weights, then attach it.
	        from peft import PeftModel
	        base = AutoModelForCausalLM.from_pretrained(args.base_model, **load_kwargs)
	        model = PeftModel.from_pretrained(base, args.model)
	        tokenizer_source = args.base_model
	    else:
	        model = AutoModelForCausalLM.from_pretrained(args.model, **load_kwargs)
	        tokenizer_source = args.model
	    tokenizer = AutoTokenizer.from_pretrained(tokenizer_source, trust_remote_code=True)
	    if tokenizer.pad_token is None:
	        tokenizer.pad_token = tokenizer.eos_token
	    # Over-long prompts must lose their oldest tokens: right-side truncation
	    # would cut off the generation prompt at the end of the rendered chat.
	    tokenizer.truncation_side = "left"
	    model.eval()
	    return model, tokenizer


	def _load_rows(path, n, rng):
	    """Read a JSONL file (blank lines ignored), shuffle with ``rng``, keep ``n`` rows."""
	    with open(path, "r", encoding="utf-8") as f:
	        rows = [json.loads(line) for line in f if line.strip()]
	    rng.shuffle(rows)
	    return rows[:n]


	def main():
	    """Run the format-compliance eval and print per-sample and summary results.

	    For each sampled conversation a random assistant turn is chosen, the turns
	    before it are rendered with the chat template, the model generates the next
	    turn, and the output is checked with ``classify``. A sample counts as
	    compliant only when it has a valid action tag AND ``classify`` reported no
	    problems.
	    """
	    args = parse_args()

	    # Heavy imports stay local so that argument errors and --help are instant.
	    import torch
	    from collections import Counter

	    print(f"[eval] loading {args.model}")
	    model, tokenizer = _load_model_and_tokenizer(args)

	    # Load held-out samples
	    rng = random.Random(args.seed)
	    rows = _load_rows(args.data, args.n, rng)
	    print(f"[eval] using {len(rows)} held-out prompts")

	    # Pass sampling parameters only when sampling; temperature is meaningless
	    # (and triggers generation-config warnings) under greedy decoding.
	    gen_kwargs = {
	        "max_new_tokens": args.max_new_tokens,
	        "pad_token_id": tokenizer.pad_token_id,
	    }
	    if args.temperature > 0:
	        gen_kwargs.update(do_sample=True, temperature=args.temperature)
	    else:
	        gen_kwargs["do_sample"] = False

	    kind_counts = Counter()
	    problem_tally = Counter()
	    clean_count = 0

	    for i, row in enumerate(rows):
	        conv = row["conversations"]
	        # Pick a random assistant turn position (must have at least one before it)
	        a_positions = [
	            j for j, t in enumerate(conv) if j > 0 and t["role"] == "assistant"
	        ]
	        if not a_positions:
	            continue
	        target_pos = rng.choice(a_positions)
	        prefix = conv[:target_pos]
	        gold = conv[target_pos]["content"]

	        # Render prefix via the chat template
	        prompt = tokenizer.apply_chat_template(
	            prefix, tokenize=False, add_generation_prompt=True
	        )
	        # The template already emitted any special tokens it needs, so do not
	        # let the tokenizer add them a second time.
	        inputs = tokenizer(
	            prompt, return_tensors="pt", truncation=True, max_length=8192,
	            add_special_tokens=False,
	        ).to(model.device)
	        with torch.no_grad():
	            out = model.generate(**inputs, **gen_kwargs)
	        prompt_len = inputs["input_ids"].shape[1]
	        gen = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)

	        # Some chat templates open the <think> block inside the generation
	        # prompt; the model then only emits the continuation. Re-attach the
	        # opening tag for scoring so such turns are not reported as missing it.
	        scored = gen
	        if prompt.rstrip().endswith("<think>"):
	            scored = "<think>" + gen
	        kind, problems = classify(scored)

	        kind_counts[kind] += 1
	        problem_tally.update(problems)
	        if kind != "invalid" and not problems:
	            clean_count += 1

	        if i < 5:
	            print(f"\n=== Sample {i} (kind={kind}) ===")
	            print(f"GOLD (first 200): {gold[:200]}")
	            print(f"GEN (first 200): {gen[:200]}")
	            if problems:
	                print(f"PROBLEMS: {problems}")

	    total = sum(kind_counts.values())
	    if total == 0:
	        print("WARN: no samples evaluated; check --data and --n")
	        return
	    print(f"\n========== Summary ({total} samples) ==========")
	    for kind in ("bash", "finish", "invalid"):
	        count = kind_counts[kind]
	        print(f" {kind} : {count} ({100*count/total:.1f}%)")
	    print(f" clean : {clean_count} ({100*clean_count/total:.1f}%)")
	    print(f"\nProblems:")
	    for k, v in problem_tally.most_common():
	        print(f" {k}: {v}")

	    # Strict rate: a turn with a valid action tag but a reported problem
	    # (e.g. missing <think>, both actions) is NOT compliant.
	    pass_rate = clean_count / total
	    print(f"\nFormat compliance: {100*pass_rate:.1f}%")
	    if pass_rate < 0.80:
	        print("WARN: low compliance, check training")
	    elif pass_rate < 0.95:
	        print("OK but not great; consider another epoch")
	    else:
	        print("Good. Ready for TB2 eval.")


	# Script entry point: run the eval only when executed directly, not on import.
	if __name__ == "__main__":
	    main()