Upload folder using huggingface_hub

a15ae41 verified 11 days ago

12.9 kB

	"""Convert Nemotron-Terminal-Corpus (Terminus 2 format) to MicroAgent format.

	Strategy: multi-line <bash> block (Path D).

	Filters (strict):
	- First turn must be user (system+task)
	- All assistant JSONs must parse cleanly
	- No "parsing error" observations
	- Final assistant turn must have task_complete=true AND zero commands
	(clean "finish" semantics)
	- num_turns sanity: 2 <= assistant turns <= 20

	Conversion:
	- System prompt -> MicroAgent's (with multi-line bash allowed)
	- First user turn -> extract task between markers
	- Each non-final assistant turn -> <think>...</think>\n<bash>cmds joined with \n</bash>
	- Each final assistant turn -> <think>...</think>\n<finish>summary</finish>
	- Each observation -> strip "New Terminal Output:" prefix, head+tail truncate

	Output: JSONL with one trajectory per line.
	"""
	from __future__ import annotations

	import json
	import os
	import re
	import sys
	from collections import Counter
	from dataclasses import dataclass

	# Applied at import time, and only when the variable is not already set, so
	# an explicit value from the caller's environment always wins.
	# NOTE(review): presumably this silences huggingface_hub's symlink-cache
	# warning for the lazy `datasets` import in main() — confirm against the hub docs.
	os.environ.setdefault("HF_HUB_DISABLE_SYMLINKS_WARNING", "1")


	# --- Configuration ---

	# System message that convert_trajectory() places first in every converted
	# conversation. It describes the <think>/<bash>/<finish> reply format that
	# the converted assistant turns are written in.
	SYSTEM_PROMPT = """You are a terminal agent. You complete tasks by running bash commands in a Linux container.

	Respond in EXACTLY this format every turn:
	<think>brief reasoning, max 2 sentences</think>
	<bash>shell commands (one or more lines)</bash>

	When the task is fully complete and verified, respond instead with:
	<think>brief verification reasoning</think>
	<finish>one-line summary of what you did</finish>

	Rules:
	- One bash block per turn. Multiple lines run sequentially in the same shell.
	- Multi-line files: cat <<'EOF' > path/to/file ... EOF
	- Output is truncated (head + tail). Lines starting with [exit N] mean non-zero exit.
	- Analyze errors before retrying. Never repeat a failed command unchanged.
	- You have a hard turn limit. Use turns efficiently."""

	# Default number of characters truncate_output() keeps from the start and
	# from the end of an observation.
	OBS_HEAD_CHARS = 600
	OBS_TAIL_CHARS = 600
	# Inclusive bounds on the number of assistant turns convert_trajectory() accepts.
	MIN_ASSISTANT_TURNS = 2
	MAX_ASSISTANT_TURNS = 20
	# Upper bound on the length of the <finish> summary built by make_finish_summary().
	MAX_FINISH_SUMMARY_CHARS = 200

	# Literal markers searched for in the first user turn: the task text lies
	# between the first two, the initial terminal state follows the third.
	TASK_START_MARKER = "Task Description:"
	TASK_END_MARKER = "Current terminal state:"
	INITIAL_STATE_MARKER = "Current Terminal Screen:"
	# Prefix that clean_observation() removes from observation turns.
	OBS_PREFIX = "New Terminal Output:"


	# Non-greedy match of one <think>...</think> span; DOTALL lets it cross newlines.
	_THINK_RE = re.compile(r"<think>(.*?)</think>", re.DOTALL)
	# Greedy match from the first "{" to the last "}" in the text.
	_JSON_RE = re.compile(r"\{.*\}", re.DOTALL)
	# Despite the name, matches only runs of three or more newlines.
	_WHITESPACE_RE = re.compile(r"\n{3,}")


	# --- Reject reason tracking ---

	@dataclass
	class RejectStats:
	    """Running tally of converter outcomes: totals plus one counter per reject reason."""

	    # Every trajectory handed to convert_trajectory(), accepted or not.
	    total: int = 0
	    # Trajectories that passed every filter and were converted.
	    accepted: int = 0

	    # --- named reject reasons (at most one is bumped per trajectory) ---
	    # Fewer than 3 messages overall, or fewer than MIN_ASSISTANT_TURNS assistant turns.
	    too_few_turns: int = 0
	    # More than MAX_ASSISTANT_TURNS assistant turns.
	    too_many_turns: int = 0
	    # The first message does not have role "user".
	    first_not_user: int = 0
	    # No non-empty task text could be extracted from the first user message.
	    no_task_marker: int = 0
	    # Some observation reports a JSON parsing error.
	    parse_error_observation: int = 0
	    # Some assistant turn contains no JSON object that parses.
	    assistant_json_unparseable: int = 0
	    # The last assistant turn does not set task_complete to true.
	    final_not_complete: int = 0
	    # The last assistant turn still issues commands.
	    final_has_commands: int = 0
	    # A non-final assistant turn yields no usable command lines.
	    empty_commands_midtrajectory: int = 0
	    # Unexpected exception raised while converting a row (counted by main()).
	    other: int = 0


	# --- Extraction helpers ---

	def extract_think(content: str) -> str:
	    """Return the stripped body of the first <think>...</think> span, or "" if there is none."""
	    match = _THINK_RE.search(content)
	    if match is None:
	        return ""
	    return match.group(1).strip()


	def extract_json(content: str):
	    """Pull the JSON object out of an assistant turn, ignoring any <think> spans.

	    Two passes are made over the text that remains once <think> spans are
	    removed:

	    1. Strict: parse everything from the first "{" to the last "}".
	    2. Lenient: scan for top-level balanced "{...}" spans and return the
	       first one that parses.

	    Args:
	        content: Raw assistant message text.

	    Returns:
	        The decoded JSON object, or None when no candidate span parses.
	    """
	    after_think = _THINK_RE.sub("", content).strip()
	    m = _JSON_RE.search(after_think)
	    if not m:
	        return None
	    try:
	        return json.loads(m.group(0))
	    except (ValueError, RecursionError):
	        # json.JSONDecodeError is a ValueError; fall through to the lenient pass.
	        pass

	    # Lenient pass: pick the first top-level {...} span that parses.
	    # NOTE: braces inside JSON string values are still counted, so a span
	    # containing an unbalanced brace in a string may be mis-delimited.
	    depth = 0
	    start = None
	    for i, ch in enumerate(after_think):
	        if ch == "{":
	            if depth == 0:
	                start = i
	            depth += 1
	        elif ch == "}":
	            if depth == 0:
	                # Stray closer outside any object: ignore it, otherwise depth
	                # goes negative and no later "{" is ever seen as top-level.
	                continue
	            depth -= 1
	            if depth == 0 and start is not None:
	                try:
	                    return json.loads(after_think[start : i + 1])
	                except (ValueError, RecursionError):
	                    start = None
	    return None


	def extract_task(first_user_content: str) -> str | None:
	    """Return the task text found in the first user message.

	    The task is whatever lies between the first TASK_START_MARKER and the next
	    TASK_END_MARKER after it; when the end marker is missing, the task runs to
	    the end of the message.

	    Args:
	        first_user_content: Text of the first user turn.

	    Returns:
	        The stripped task text, or None when the start marker is absent or the
	        extracted text is empty.
	    """
	    _, start_marker, after_start = first_user_content.partition(TASK_START_MARKER)
	    if not start_marker:
	        return None
	    # partition() returns the whole remainder when the end marker is absent.
	    task = after_start.partition(TASK_END_MARKER)[0].strip()
	    return task or None


	def extract_initial_state(first_user_content: str) -> str:
	    """Return the text after INITIAL_STATE_MARKER, stripped and capped at 1500 characters.

	    Returns "" when the marker does not occur in the message.
	    """
	    _, marker, screen = first_user_content.partition(INITIAL_STATE_MARKER)
	    if not marker:
	        return ""
	    # Cap the size of the initial terminal screen.
	    return screen.strip()[:1500]


	def truncate_output(text: str, head: int = OBS_HEAD_CHARS, tail: int = OBS_TAIL_CHARS) -> str:
	    """Shorten long output to its first ``head`` and last ``tail`` characters.

	    Args:
	        text: Output to shorten.
	        head: Number of leading characters to keep.
	        tail: Number of trailing characters to keep.

	    Returns:
	        "(no output)" for empty text; the text unchanged when it is at most
	        ``head + tail + 50`` characters long; otherwise head, a marker line
	        stating how many characters were dropped, and tail.
	    """
	    if not text:
	        return "(no output)"
	    # Texts only slightly over budget (50 chars of slack) are left whole.
	    if len(text) <= head + tail + 50:
	        return text
	    cut = len(text) - head - tail
	    # Slice from an explicit start index: text[-tail:] would return the
	    # entire text when tail == 0.
	    kept_tail = text[len(text) - tail :]
	    return f"{text[:head]}\n[... truncated {cut} chars ...]\n{kept_tail}"


	def clean_observation(content: str) -> str:
	    """Normalize one observation turn.

	    Removes surrounding whitespace and a leading OBS_PREFIX, collapses runs of
	    three or more newlines into a single blank line, then applies head+tail
	    truncation via truncate_output().

	    Args:
	        content: Raw text of a user (observation) turn.

	    Returns:
	        The cleaned, possibly truncated observation text.
	    """
	    # Strip first so a prefix preceded by whitespace or newlines is still removed.
	    s = content.strip()
	    if s.startswith(OBS_PREFIX):
	        s = s[len(OBS_PREFIX) :].strip()
	    # Collapse runs of 3+ newlines
	    s = _WHITESPACE_RE.sub("\n\n", s)
	    return truncate_output(s)


	def keystrokes_to_command(keystrokes: str) -> str:
	    """Turn one Terminus 2 keystrokes string into a bash line.

	    Trailing carriage-return / line-feed characters are the Enter key presses;
	    they are dropped because the lines are later joined with newlines, which
	    bash already treats as command separators. Everything else, including
	    interior newlines and other whitespace, is preserved.
	    """
	    return keystrokes.rstrip("\r\n")


	def make_finish_summary(parsed_json: dict) -> str:
	    """Compose a one-line finish summary from the JSON's "analysis" field.

	    Takes the first sentence of the analysis (text up to the first ".", "!" or
	    "?" that is followed by whitespace), folds all whitespace runs into single
	    spaces so the result is one line, and shortens it to at most
	    MAX_FINISH_SUMMARY_CHARS characters with a trailing "...".

	    Args:
	        parsed_json: Decoded JSON object of the final assistant turn.

	    Returns:
	        The summary, or "task complete" when "analysis" is missing, empty, or
	        not a string.
	    """
	    analysis = parsed_json.get("analysis")
	    # A non-string analysis cannot be summarized; use the generic fallback
	    # instead of failing on .strip().
	    if not isinstance(analysis, str) or not analysis.strip():
	        return "task complete"
	    # Take first sentence-ish
	    first = re.split(r"(?<=[.!?])\s", analysis.strip(), maxsplit=1)[0]
	    # The sentence may span several lines; the summary must fit on one.
	    summary = " ".join(first.split())
	    if len(summary) > MAX_FINISH_SUMMARY_CHARS:
	        summary = summary[: MAX_FINISH_SUMMARY_CHARS - 3].rstrip() + "..."
	    return summary or "task complete"


	def build_bash_block(parsed_json: dict) -> str | None:
	    """Join the commands of one assistant turn into a multi-line bash script.

	    Args:
	        parsed_json: Decoded JSON object of an assistant turn; its "commands"
	            entry is read as a list of dicts carrying a "keystrokes" string.

	    Returns:
	        The command lines joined with newlines, or None when "commands" is
	        missing, empty or not a list, when any entry is not a dict, or when
	        no entry yields a non-blank command.
	    """
	    commands = parsed_json.get("commands")
	    if not isinstance(commands, list) or not commands:
	        return None
	    lines: list[str] = []
	    for cmd in commands:
	        if not isinstance(cmd, dict):
	            # A malformed entry invalidates the whole turn.
	            return None
	        ks = cmd.get("keystrokes", "")
	        if not isinstance(ks, str):
	            # Non-string keystrokes are skipped rather than rejected.
	            continue
	        clean = keystrokes_to_command(ks)
	        if clean.strip():
	            lines.append(clean)
	    if not lines:
	        return None
	    return "\n".join(lines)


	# --- Main converter ---

	def convert_trajectory(row: dict, source_config: str, stats: RejectStats) -> dict | None:
	    """Convert one Terminus 2 trajectory row into a MicroAgent conversation.

	    The strict filters are applied in a fixed order; the first one that fails
	    bumps the matching reject counter on ``stats`` and ends the conversion.

	    Args:
	        row: Source record. Reads "conversations" (a list of dicts with
	            "role" and "content") and passes "task", "episode" and "run_id"
	            through to the output.
	        source_config: Name of the dataset config the row came from; copied
	            into the output record.
	        stats: Counters mutated in place (total, accepted, reject reasons).

	    Returns:
	        A dict with the converted "conversations" plus metadata, or None when
	        the trajectory is rejected.
	    """
	    stats.total += 1
	    conv = row.get("conversations") or []
	    if len(conv) < 3:  # need at least user + assistant + something
	        stats.too_few_turns += 1
	        return None

	    if conv[0].get("role") != "user":
	        stats.first_not_user += 1
	        return None

	    first_content = conv[0]["content"]
	    task = extract_task(first_content)
	    if not task:
	        stats.no_task_marker += 1
	        return None
	    initial_state = extract_initial_state(first_content)

	    other = conv[1:]

	    # Quick parse-error check on observations
	    for t in other:
	        if t.get("role") == "user":
	            low = t.get("content", "").lower()
	            if "parsing error" in low or "no valid json found in response" in low:
	                stats.parse_error_observation += 1
	                return None

	    # Positions (within `other`) of the assistant turns
	    assistant_idxs = [i for i, t in enumerate(other) if t.get("role") == "assistant"]
	    if len(assistant_idxs) < MIN_ASSISTANT_TURNS:
	        stats.too_few_turns += 1
	        return None
	    if len(assistant_idxs) > MAX_ASSISTANT_TURNS:
	        stats.too_many_turns += 1
	        return None

	    # Parse all assistant JSONs up front so one bad turn rejects the trajectory
	    parsed_by_pos = {}
	    for i in assistant_idxs:
	        parsed = extract_json(other[i]["content"])
	        if parsed is None:
	            stats.assistant_json_unparseable += 1
	            return None
	        parsed_by_pos[i] = parsed

	    final_assistant_pos = assistant_idxs[-1]
	    final_parsed = parsed_by_pos[final_assistant_pos]
	    if final_parsed.get("task_complete") is not True:
	        stats.final_not_complete += 1
	        return None

	    final_cmds = final_parsed.get("commands") or []
	    if isinstance(final_cmds, list) and len(final_cmds) > 0:
	        # Cleanest semantics: final turn declares done with no commands. Drop the rest.
	        stats.final_has_commands += 1
	        return None

	    # Build new conversation: system prompt, then the pinned task message
	    new_conv = [
	        {"role": "system", "content": SYSTEM_PROMPT},
	    ]
	    pinned = f"TASK:\n{task}"
	    if initial_state:
	        pinned += f"\n\nInitial state:\n{initial_state}"
	    new_conv.append({"role": "user", "content": pinned})

	    # NOTE(review): user turns that come after the final assistant turn are
	    # still appended, so the output may end on an observation — confirm that
	    # this is intended.
	    for i, t in enumerate(other):
	        role = t.get("role")
	        if role == "assistant":
	            parsed = parsed_by_pos[i]
	            if i == final_assistant_pos:
	                action = f"<finish>{make_finish_summary(parsed)}</finish>"
	            else:
	                bash_block = build_bash_block(parsed)
	                if bash_block is None:
	                    # Mid-trajectory turn with no commands - reject the trajectory
	                    stats.empty_commands_midtrajectory += 1
	                    return None
	                action = f"<bash>{bash_block}</bash>"

	            # The <think> wrapper is omitted when the source turn had no reasoning.
	            think = extract_think(t.get("content", ""))
	            content = f"<think>{think}</think>\n{action}" if think else action
	            new_conv.append({"role": "assistant", "content": content})

	        elif role == "user":
	            obs = clean_observation(t.get("content", ""))
	            new_conv.append({"role": "user", "content": obs})
	        # Turns with any other role are dropped.

	    stats.accepted += 1
	    return {
	        "conversations": new_conv,
	        "task": row.get("task"),
	        "episode": row.get("episode"),
	        "run_id": row.get("run_id"),
	        "source_config": source_config,
	        "n_assistant_turns": len(assistant_idxs),
	    }


	# --- Driver ---

	def main(configs, sample_per_config, out_path):
	    """Stream the corpus configs, convert each row, and write accepted ones as JSONL.

	    Args:
	        configs: Iterable of dataset config names to load from the corpus repo.
	        sample_per_config: Maximum number of rows to scan per config; a falsy
	            value (None or 0) scans every row.
	        out_path: Destination JSONL file. It is overwritten, and its parent
	            directory is created when missing.

	    Prints progress per config, then the overall accept/reject statistics.
	    """
	    from datasets import load_dataset

	    REPO = "nvidia/Nemotron-Terminal-Corpus"
	    stats = RejectStats()
	    per_config_accepted = Counter()

	    # Unexpected per-row failures are counted in stats.other; only the first
	    # few are echoed so a systematic failure is visible without flooding stderr.
	    max_reported_errors = 5
	    reported_errors = 0

	    # The default output lives under "data/", which may not exist yet.
	    out_dir = os.path.dirname(out_path)
	    if out_dir:
	        os.makedirs(out_dir, exist_ok=True)

	    with open(out_path, "w", encoding="utf-8") as out:
	        for cfg in configs:
	            print(f"\n--- Loading {cfg} (streaming) ---", flush=True)
	            try:
	                ds = load_dataset(REPO, cfg, streaming=True)
	            except Exception as e:
	                # Best effort: an unavailable config is skipped, not fatal.
	                print(f" could not load {cfg}: {e}")
	                continue
	            # Use whichever split the config lists first.
	            split = next(iter(ds))
	            stream = ds[split]
	            cfg_count = 0
	            for row in stream:
	                if sample_per_config and cfg_count >= sample_per_config:
	                    break
	                cfg_count += 1
	                try:
	                    converted = convert_trajectory(row, cfg, stats)
	                except Exception as e:
	                    stats.other += 1
	                    converted = None
	                    if reported_errors < max_reported_errors:
	                        reported_errors += 1
	                        print(
	                            f" {cfg}: row {cfg_count} failed: {type(e).__name__}: {e}",
	                            file=sys.stderr,
	                            flush=True,
	                        )
	                if converted is not None:
	                    out.write(json.dumps(converted, ensure_ascii=False) + "\n")
	                    per_config_accepted[cfg] += 1
	                if cfg_count % 500 == 0:
	                    print(
	                        f" {cfg}: scanned={cfg_count} accepted={per_config_accepted[cfg]}",
	                        flush=True,
	                    )
	            print(
	                f" {cfg}: DONE scanned={cfg_count} accepted={per_config_accepted[cfg]}",
	                flush=True,
	            )

	    print("\n\n========== FINAL STATS ==========")
	    print(f"Total scanned : {stats.total}")
	    print(f"Total accepted : {stats.accepted}")
	    print(f"Output path : {out_path}")
	    print("\nReject breakdown:")
	    for k in (
	        "too_few_turns",
	        "too_many_turns",
	        "first_not_user",
	        "no_task_marker",
	        "parse_error_observation",
	        "assistant_json_unparseable",
	        "final_not_complete",
	        "final_has_commands",
	        "empty_commands_midtrajectory",
	        "other",
	    ):
	        v = getattr(stats, k)
	        # max(..., 1) avoids dividing by zero when nothing was scanned.
	        pct = 100.0 * v / max(stats.total, 1)
	        print(f" {k:35s} {v:6d} ({pct:5.1f}%)")
	    print("\nPer-config accepted:")
	    for k, v in per_config_accepted.most_common():
	        print(f" {k:25s} {v}")


	if __name__ == "__main__":
	    # Usage: script [SAMPLE_PER_CONFIG | "all"] [OUT_PATH]
	    # With no arguments, 200 rows are sampled from each of the working configs.
	    cli_args = sys.argv[1:]
	    configs = ["skill_based_easy", "skill_based_medium", "skill_based_mixed"]
	    sample = 200
	    if cli_args:
	        # "all" lifts the per-config cap; anything else must be an integer.
	        sample = None if cli_args[0] == "all" else int(cli_args[0])
	    out = cli_args[1] if len(cli_args) > 1 else "data/converted_sample.jsonl"
	    main(configs, sample, out)