Upload folder using huggingface_hub

a15ae41 verified 26 days ago

5.33 kB

	"""In-place rewriter for <give_up>...</give_up> tags in microagent_train_v2.jsonl.

	Replaces forward-looking analysis-derived reasons with retrospective summaries
	based on (a) the number of distinct command attempts and (b) the last observed
	real error in the trajectory.

	Operates on the already-converted JSONL — no need to re-run the pipeline.
	"""
	from __future__ import annotations

	import json
	import re
	import sys
	import shutil
	from pathlib import Path

	sys.path.insert(0, str(Path(__file__).parent))
	from convert_code_v2 import _REAL_ERR_RE, MAX_FINISH_SUMMARY_CHARS

	# One <give_up>...</give_up> tag. The body is matched non-greedily so adjacent
	# tags stay separate, and DOTALL lets the body span several lines.
	GIVEUP_RE = re.compile(r"<give_up>.*?</give_up>", flags=re.DOTALL)


	_EXCEPTION_LINE_RE = re.compile(
	r"^([A-Z][A-Za-z](Error\|Exception\|Warning\|Interrupt)\|KeyboardInterrupt):?.$",
	re.MULTILINE,
	)


	def _clean_snippet(line: str) -> str:
	line = re.sub(r"^\S+@[\w-]+:\S+[#$]\s*", "", line)
	line = re.sub(r"[\x00-\x08\x0b-\x1f\x7f]", "", line)
	line = line.strip("^").strip()
	if len(line) > 120:
	line = line[:117] + "..."
	return line


	def last_error_snippet(conversations: list[dict]) -> str | None:
	    """Return a one-line snippet of the most recent error seen by the user role.

	    Walks the turns from newest to oldest and looks only at turns whose
	    ``role`` is ``"user"``. Within a turn, a specific exception line (matched
	    by ``_EXCEPTION_LINE_RE``, e.g. ``NameError: ...``) is preferred; otherwise
	    the line containing the last ``_REAL_ERR_RE`` match is used. Snippets that
	    are just a ``Traceback`` header are rejected, and the search moves on to
	    an older turn.

	    Args:
	        conversations: turn dicts; ``role`` and ``content`` are the keys read.

	    Returns:
	        The cleaned snippet, or ``None`` when no user turn yields one.
	    """
	    for t in reversed(conversations):
	        if t.get("role") != "user":
	            continue
	        c = t.get("content", "") or ""

	        # First pass: look for a specific exception line; keep the last one
	        # in the turn, since that is the most recent output.
	        exc_match = None
	        for m in _EXCEPTION_LINE_RE.finditer(c):
	            exc_match = m
	        if exc_match:
	            cleaned = _clean_snippet(exc_match.group(0))
	            if cleaned and not re.match(r"^C?Traceback", cleaned, re.IGNORECASE):
	                return cleaned

	        # Fallback: any other real-error pattern, by line.
	        last_match = None
	        for m in _REAL_ERR_RE.finditer(c):
	            last_match = m
	        if last_match is None:
	            continue
	        # Expand the match to the full line that contains it.
	        start = c.rfind("\n", 0, last_match.start()) + 1
	        end = c.find("\n", last_match.end())
	        if end < 0:
	            end = len(c)
	        cleaned = _clean_snippet(c[start:end])
	        # "C?" also covers "^CTraceback": _clean_snippet strips the caret
	        # and leaves "CTraceback".
	        if cleaned and not re.match(r"^C?Traceback", cleaned, re.IGNORECASE):
	            return cleaned
	    return None


	def count_distinct_attempts(conversations: list[dict]) -> int:
	    """Count the distinct leading commands the assistant ran in <bash> blocks.

	    Every ``<bash>...</bash>`` block in an assistant turn is reduced to the
	    first whitespace-delimited token of its first line (cut to 40 characters),
	    and the number of distinct tokens is returned. So 'pytest' tried three
	    times with different arguments counts as one approach. Blocks whose first
	    line starts with ``echo '(reviewing`` are ignored.

	    Args:
	        conversations: turn dicts; ``role`` and ``content`` are the keys read.

	    Returns:
	        The number of distinct leading command tokens.
	    """
	    bash_re = re.compile(r"<bash>(.*?)</bash>", re.DOTALL)
	    distinct = set()
	    for t in conversations:
	        if t.get("role") != "assistant":
	            continue
	        for m in bash_re.finditer(t.get("content", "") or ""):
	            block = m.group(1).strip()
	            if not block:
	                continue
	            first_line = block.splitlines()[0].strip()
	            if first_line and not first_line.startswith("echo '(reviewing"):
	                # Normalize to the leading command token for dedup.
	                first_token = first_line.split(None, 1)[0][:40]
	                distinct.add(first_token)
	    return len(distinct)


	def compose_summary(conversations: list[dict]) -> str:
	    """Build the retrospective give-up reason for one trajectory.

	    Combines the attempt count from ``count_distinct_attempts`` with the
	    snippet from ``last_error_snippet``. When no snippet is found, a generic
	    turn-budget message is used instead. The result is capped at
	    ``MAX_FINISH_SUMMARY_CHARS`` characters, ending in ``...`` when cut.

	    Args:
	        conversations: the turn dicts of one record.

	    Returns:
	        The summary text, without the surrounding ``<give_up>`` tags.
	    """
	    n = count_distinct_attempts(conversations)
	    snippet = last_error_snippet(conversations)
	    if snippet:
	        summary = f"tried {n} distinct approaches; last failure: {snippet}"
	    else:
	        summary = f"exceeded turn budget after {n} distinct attempts"
	    if len(summary) > MAX_FINISH_SUMMARY_CHARS:
	        # Leave room for the three-character ellipsis within the limit.
	        summary = summary[: MAX_FINISH_SUMMARY_CHARS - 3].rstrip() + "..."
	    return summary


	def main(in_path: str, out_path: str) -> None:
	    """Rewrite the <give_up> reason of every give-up record in a JSONL file.

	    Reads ``in_path`` line by line. For each record whose ``ending_mode`` is
	    ``"give_up"`` and that has conversations, the first
	    ``<give_up>...</give_up>`` tag in the most recent assistant turn containing
	    one is replaced with the text from ``compose_summary``. Every record is
	    written to ``out_path``, and four counters are printed at the end.

	    Args:
	        in_path: path of the JSONL file to read.
	        out_path: path of the JSONL file to write; must not be ``in_path``.

	    Raises:
	        ValueError: if both paths resolve to the same file.
	    """
	    inp = Path(in_path)
	    outp = Path(out_path)
	    # Opening the output with "w" truncates it, so reading and writing the
	    # same file would wipe the input before a single record is read.
	    if inp.resolve() == outp.resolve():
	        raise ValueError(
	            f"in_path and out_path must differ (both resolve to {inp.resolve()})"
	        )
	    n_total = 0
	    n_giveup = 0
	    n_rewritten = 0
	    n_no_snippet = 0

	    with inp.open("r", encoding="utf-8") as fin, outp.open("w", encoding="utf-8") as fout:
	        for line in fin:
	            if not line.strip():
	                # Blank lines are not records; json.loads would raise on them.
	                continue
	            n_total += 1
	            d = json.loads(line)
	            if d.get("ending_mode") == "give_up":
	                n_giveup += 1
	                conv = d.get("conversations") or []
	                if not conv:
	                    # Nothing to summarize: pass the record through untouched.
	                    fout.write(line)
	                    continue
	                new_summary = compose_summary(conv)
	                if "last failure:" not in new_summary:
	                    n_no_snippet += 1
	                replacement = f"<give_up>{new_summary}</give_up>"
	                # Replace the first <give_up>...</give_up> tag in the most
	                # recent assistant turn that contains one.
	                for t in reversed(conv):
	                    content = t.get("content") or ""
	                    if t.get("role") == "assistant" and "<give_up>" in content:
	                        # A callable replacement is inserted verbatim. A plain
	                        # string would be parsed as a template, so a backslash
	                        # in an error snippet could raise re.error or be
	                        # rewritten.
	                        t["content"] = GIVEUP_RE.sub(
	                            lambda _m: replacement, content, count=1
	                        )
	                        n_rewritten += 1
	                        break
	            fout.write(json.dumps(d, ensure_ascii=False) + "\n")

	    print(f"Processed: {n_total}")
	    print(f"Give-ups: {n_giveup}")
	    print(f"Rewritten: {n_rewritten}")
	    print(f"No snippet (fallback to generic): {n_no_snippet}")


	if __name__ == "__main__":
	    # Optional positional arguments: input path, then output path. The default
	    # output is a separate .tmp file next to the default input.
	    in_p = sys.argv[1] if len(sys.argv) > 1 else "data/microagent_train_v2.jsonl"
	    out_p = sys.argv[2] if len(sys.argv) > 2 else "data/microagent_train_v2.tmp.jsonl"
	    main(in_p, out_p)