# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "datasets>=3.0.0",
#     "huggingface_hub>=0.20.0",
# ]
# ///
"""Build Agent Zero SFT v2 mixed dataset.

Composition (~5K-8K examples):
    40% Agent tasks    -- agent-zero-sft-v1 (1,200) + agent-zero-training-data agentic split (~300)
    40% Math reasoning -- MetaMathQA chain-of-thought samples (~3,000)
    20% General        -- OpenHermes-2.5 high-quality instruction samples (~1,500)

All formatted as multi-turn conversations in HF messages format.
Pushed to: wheattoast11/agent-zero-sft-v2
"""

import json
import os
import random
from pathlib import Path

from datasets import Dataset, DatasetDict, load_dataset
from huggingface_hub import login

# Fixed seed so sampling/shuffling is reproducible across runs.
SEED = 42
random.seed(SEED)

AGENT_SYSTEM_PROMPT = (
    "You are Agent Zero, an intelligent MCP (Model Context Protocol) server that provides "
    "research, knowledge base, and tool orchestration capabilities. You understand:\n"
    "- MCP tool calling with parameter normalization and schema validation\n"
    "- Intent classification for routing queries to appropriate handlers\n"
    "- Signal protocol for multi-model consensus and crystallization detection\n"
    "- Async job management with status tracking\n"
    "- Rail protocol for inter-agent communication with backpressure\n"
    "- Sandbox security configuration and permission management\n\n"
    "Always respond with valid JSON tool calls when appropriate, classify user intents "
    "accurately, and maintain security boundaries."
)

MATH_SYSTEM_PROMPT = (
    "You are a helpful assistant skilled in mathematical reasoning. "
    "Show your work step-by-step before giving the final answer."
)

GENERAL_SYSTEM_PROMPT = (
    "You are a helpful, harmless, and honest assistant."
)


def load_agent_data():
    """Load agent-zero-sft-v1 train split + agent-zero-training-data agentic split.

    Returns:
        list[dict]: examples, each ``{"messages": [...]}`` in HF chat format.
    """
    print("Loading agent-zero-sft-v1...")
    sft_v1 = load_dataset(
        "wheattoast11/agent-zero-sft-v1",
        data_files="data/train.jsonl",
        split="train",
    )
    print(f"  sft-v1 train: {len(sft_v1)} examples")

    # These already have 'messages' field in correct format
    agent_examples = list(sft_v1)

    # Load training-data agentic split and convert to messages format
    print("Loading agent-zero-training-data (agentic split)...")
    try:
        training_data = load_dataset(
            "wheattoast11/agent-zero-training-data",
            split="agentic",
        )
        print(f"  training-data agentic: {len(training_data)} examples")
        for row in training_data:
            messages = [
                {"role": "system", "content": AGENT_SYSTEM_PROMPT},
                {"role": "user", "content": row["instruction"]},
                {"role": "assistant", "content": row["output"]},
            ]
            agent_examples.append({"messages": messages})
    except Exception as e:
        # Best-effort: the agentic split is optional; sft-v1 alone is still usable.
        print(f"  Warning: Could not load agentic split: {e}")
        print("  Continuing with sft-v1 only.")

    print(f"  Total agent examples: {len(agent_examples)}")
    return agent_examples


def load_math_data(n=3000):
    """Sample n chain-of-thought examples from MetaMathQA.

    Args:
        n: number of examples to sample (capped at dataset size).

    Returns:
        list[dict]: examples in ``{"messages": [...]}`` format.
    """
    print(f"Loading MetaMathQA (sampling {n})...")
    ds = load_dataset("meta-math/MetaMathQA", split="train")
    print(f"  Full dataset: {len(ds)} examples")

    indices = random.sample(range(len(ds)), min(n, len(ds)))
    samples = ds.select(indices)

    math_examples = []
    for row in samples:
        messages = [
            {"role": "system", "content": MATH_SYSTEM_PROMPT},
            {"role": "user", "content": row["query"]},
            {"role": "assistant", "content": row["response"]},
        ]
        math_examples.append({"messages": messages})

    print(f"  Sampled {len(math_examples)} math examples")
    return math_examples


def load_general_data(n=1500):
    """Sample n high-quality instruction examples from OpenHermes-2.5.

    Args:
        n: number of conversations to sample before filtering; the result may
           be slightly smaller because empty conversations and conversations
           not ending on an assistant turn are dropped.

    Returns:
        list[dict]: examples in ``{"messages": [...]}`` format.
    """
    print(f"Loading OpenHermes-2.5 (sampling {n})...")
    ds = load_dataset("teknium/OpenHermes-2.5", split="train")
    print(f"  Full dataset: {len(ds)} examples")

    indices = random.sample(range(len(ds)), min(n, len(ds)))
    samples = ds.select(indices)

    general_examples = []
    for row in samples:
        # OpenHermes uses ShareGPT-style 'conversations': a list of
        # {from, value} turns where 'from' is 'system', 'human', or 'gpt'.
        convos = row.get("conversations", [])
        if not convos:
            continue

        messages = [{"role": "system", "content": GENERAL_SYSTEM_PROMPT}]
        for turn in convos:
            src = turn["from"]
            if src == "system":
                # BUGFIX: previously 'system' turns fell through to the
                # assistant branch, injecting the dataset's system prompt as
                # an assistant message. Use it to replace our default instead.
                messages[0] = {"role": "system", "content": turn["value"]}
                continue
            role = "user" if src in ("human", "user") else "assistant"
            messages.append({"role": role, "content": turn["value"]})

        # Ensure conversation ends with assistant
        if messages[-1]["role"] == "assistant":
            general_examples.append({"messages": messages})

    print(f"  Sampled {len(general_examples)} general examples")
    return general_examples


def build_splits(agent, math, general, val_ratio=0.1):
    """Combine, shuffle, and split into train/validation.

    Args:
        agent, math, general: lists of ``{"messages": [...]}`` examples.
        val_ratio: fraction of the combined set held out for validation.

    Returns:
        tuple[list, list]: (train_data, val_data).

    Raises:
        ValueError: if all three source lists are empty.
    """
    all_examples = agent + math + general
    if not all_examples:
        # Fail loudly rather than ZeroDivisionError in the stats below.
        raise ValueError("No examples loaded; cannot build splits.")
    random.shuffle(all_examples)

    # Tag source for analysis (not included in final messages)
    print("\nDataset composition:")
    print(f"  Agent:   {len(agent):>5} ({100*len(agent)/len(all_examples):.1f}%)")
    print(f"  Math:    {len(math):>5} ({100*len(math)/len(all_examples):.1f}%)")
    print(f"  General: {len(general):>5} ({100*len(general)/len(all_examples):.1f}%)")
    print(f"  Total:   {len(all_examples):>5}")

    val_size = int(len(all_examples) * val_ratio)
    val_data = all_examples[:val_size]
    train_data = all_examples[val_size:]

    print("\nSplit sizes:")
    print(f"  Train:      {len(train_data)}")
    print(f"  Validation: {len(val_data)}")
    return train_data, val_data


def main():
    """Assemble the mixed dataset, write JSONL copies, and push to the Hub."""
    token = os.getenv("HF_TOKEN")
    if token:
        login(token=token)

    agent = load_agent_data()
    math = load_math_data(n=3000)
    general = load_general_data(n=1500)

    train_data, val_data = build_splits(agent, math, general)

    # Write JSONL files (local copies alongside the Hub push, for inspection)
    out_dir = Path("/tmp/agent-zero-sft-v2")
    data_dir = out_dir / "data"
    data_dir.mkdir(parents=True, exist_ok=True)

    for name, data in [("train", train_data), ("validation", val_data)]:
        path = data_dir / f"{name}.jsonl"
        # Explicit UTF-8: ensure_ascii=False emits raw non-ASCII characters,
        # which would break under a non-UTF-8 locale default encoding.
        with open(path, "w", encoding="utf-8") as f:
            for ex in data:
                f.write(json.dumps(ex, ensure_ascii=False) + "\n")
        print(f"Wrote {path} ({len(data)} examples)")

    # Push to Hub
    print("\nPushing to Hub as wheattoast11/agent-zero-sft-v2...")
    train_ds = Dataset.from_list(train_data)
    val_ds = Dataset.from_list(val_data)
    ds_dict = DatasetDict({"train": train_ds, "validation": val_ds})
    ds_dict.push_to_hub(
        "wheattoast11/agent-zero-sft-v2",
        private=True,
    )
    print("Done! Dataset at: https://huggingface.co/datasets/wheattoast11/agent-zero-sft-v2")


if __name__ == "__main__":
    main()