agent-zero-training-scripts / build_sft_v2.py
wheattoast11's picture
Upload build_sft_v2.py with huggingface_hub
fe2d96b verified
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "datasets>=3.0.0",
# "huggingface_hub>=0.20.0",
# ]
# ///
"""
Build Agent Zero SFT v2 mixed dataset.
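Composition (~6K examples with the default sample sizes; the original ~5K-8K
estimate allows for variation in the agent datasets). Percentages are the
nominal target mix; the counts listed below actually work out to roughly
25% / 50% / 25%:
    40% Agent tasks — agent-zero-sft-v1 (1,200) + agent-zero-training-data agentic split (~300)
    40% Math reasoning — MetaMathQA chain-of-thought samples (~3,000)
    20% General — OpenHermes-2.5 high-quality instruction samples (~1,500)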
All formatted as multi-turn conversations in HF messages format.
Pushed to: wheattoast11/agent-zero-sft-v2
"""
import json
import os
import random
from pathlib import Path
from datasets import Dataset, DatasetDict, load_dataset
from huggingface_hub import login
# Seed the module-level RNG once at import time; the random.sample and
# random.shuffle calls in this file all draw from it.
SEED = 42
random.seed(SEED)

# System prompt used when wrapping agentic instruction/output rows into
# conversations: an intro line, a bullet list, a blank line, then the
# closing instruction.
AGENT_SYSTEM_PROMPT = "\n".join(
    [
        (
            "You are Agent Zero, an intelligent MCP (Model Context Protocol) server "
            "that provides research, knowledge base, and tool orchestration "
            "capabilities. You understand:"
        ),
        "- MCP tool calling with parameter normalization and schema validation",
        "- Intent classification for routing queries to appropriate handlers",
        "- Signal protocol for multi-model consensus and crystallization detection",
        "- Async job management with status tracking",
        "- Rail protocol for inter-agent communication with backpressure",
        "- Sandbox security configuration and permission management",
        "",
        (
            "Always respond with valid JSON tool calls when appropriate, "
            "classify user intents accurately, and maintain security boundaries."
        ),
    ]
)

# System prompt attached to the math reasoning examples.
MATH_SYSTEM_PROMPT = " ".join(
    (
        "You are a helpful assistant skilled in mathematical reasoning.",
        "Show your work step-by-step before giving the final answer.",
    )
)

# System prompt attached to the general instruction examples.
GENERAL_SYSTEM_PROMPT = "You are a helpful, harmless, and honest assistant."
def load_agent_data():
    """Load the agent portion of the training mix.

    Combines two sources:

    * ``data/train.jsonl`` from ``wheattoast11/agent-zero-sft-v1``; its rows
      are used as-is.
    * the ``agentic`` split of ``wheattoast11/agent-zero-training-data``; each
      row's ``instruction`` / ``output`` fields are wrapped into a
      system/user/assistant conversation using ``AGENT_SYSTEM_PROMPT``.

    The agentic split is best-effort: if loading or converting it fails, a
    warning is printed and only the sft-v1 rows are returned. Conversion is
    all-or-nothing, so a failure part-way through never leaves a partial set
    of agentic rows in the result.

    Returns:
        list[dict]: the combined examples; converted rows have the shape
        ``{"messages": [...]}``.
    """
    print("Loading agent-zero-sft-v1...")
    sft_v1 = load_dataset(
        "wheattoast11/agent-zero-sft-v1",
        data_files="data/train.jsonl",
        split="train",
    )
    print(f" sft-v1 train: {len(sft_v1)} examples")
    # These rows are expected to already carry a 'messages' field.
    agent_examples = list(sft_v1)

    print("Loading agent-zero-training-data (agentic split)...")
    try:
        training_data = load_dataset(
            "wheattoast11/agent-zero-training-data",
            split="agentic",
        )
        print(f" training-data agentic: {len(training_data)} examples")
        # Build into a separate list so an error on any row (e.g. a missing
        # column) discards the whole split rather than half of it.
        converted = [
            {
                "messages": [
                    {"role": "system", "content": AGENT_SYSTEM_PROMPT},
                    {"role": "user", "content": row["instruction"]},
                    {"role": "assistant", "content": row["output"]},
                ]
            }
            for row in training_data
        ]
    except Exception as e:
        # Deliberately broad: this split is optional, so any failure to fetch
        # or convert it downgrades to a warning.
        print(f" Warning: Could not load agentic split: {e}")
        print(" Continuing with sft-v1 only.")
    else:
        agent_examples.extend(converted)

    print(f" Total agent examples: {len(agent_examples)}")
    return agent_examples
def load_math_data(n=3000):
    """Draw a random sample of MetaMathQA rows as chat conversations.

    Loads the ``train`` split of ``meta-math/MetaMathQA``, picks ``n`` distinct
    row indices with the module-level RNG (or every row if the split is
    smaller than ``n``), and wraps each row's ``query`` / ``response`` fields
    in a system/user/assistant conversation under ``MATH_SYSTEM_PROMPT``.

    Args:
        n: number of rows to sample.

    Returns:
        list[dict]: one ``{"messages": [...]}`` entry per sampled row.
    """
    print(f"Loading MetaMathQA (sampling {n})...")
    full_train = load_dataset("meta-math/MetaMathQA", split="train")
    total_rows = len(full_train)
    print(f" Full dataset: {total_rows} examples")

    # Never ask for more rows than exist.
    sample_size = min(n, total_rows)
    picked = random.sample(range(total_rows), sample_size)

    math_examples = [
        {
            "messages": [
                {"role": "system", "content": MATH_SYSTEM_PROMPT},
                {"role": "user", "content": record["query"]},
                {"role": "assistant", "content": record["response"]},
            ]
        }
        for record in full_train.select(picked)
    ]
    print(f" Sampled {len(math_examples)} math examples")
    return math_examples
def load_general_data(n=1500):
    """Sample instruction conversations from OpenHermes-2.5.

    Loads the ``train`` split of ``teknium/OpenHermes-2.5``, picks ``n``
    distinct row indices with the module-level RNG (or every row if the split
    is smaller), and converts each row's ``conversations`` list of
    ``{from, value}`` turns into the ``messages`` format.

    Speaker mapping:

    * ``system`` turns are not emitted as chat turns. A non-empty system turn
      replaces ``GENERAL_SYSTEM_PROMPT`` as the conversation's single system
      message, so the result never contains a system prompt mislabelled as an
      assistant reply.
    * ``human`` / ``user`` become ``user``.
    * every other speaker becomes ``assistant``.

    Rows with no conversation, with no non-system turns, or whose last turn is
    not from the assistant are dropped, so fewer than ``n`` examples may be
    returned.

    Args:
        n: number of rows to sample before filtering.

    Returns:
        list[dict]: one ``{"messages": [...]}`` entry per kept conversation.
    """
    print(f"Loading OpenHermes-2.5 (sampling {n})...")
    ds = load_dataset("teknium/OpenHermes-2.5", split="train")
    print(f" Full dataset: {len(ds)} examples")
    indices = random.sample(range(len(ds)), min(n, len(ds)))
    samples = ds.select(indices)

    general_examples = []
    for row in samples:
        # `or []` also covers a present-but-null conversations field.
        convos = row.get("conversations") or []
        if not convos:
            continue

        system_prompt = GENERAL_SYSTEM_PROMPT
        turns = []
        for turn in convos:
            speaker = turn["from"]
            if speaker == "system":
                # Keep the prompt the reply was written under, instead of
                # letting it fall through to the assistant catch-all below.
                if turn["value"]:
                    system_prompt = turn["value"]
                continue
            role = "user" if speaker in ("human", "user") else "assistant"
            turns.append({"role": role, "content": turn["value"]})

        # Only keep conversations that end on an assistant turn.
        if turns and turns[-1]["role"] == "assistant":
            messages = [{"role": "system", "content": system_prompt}, *turns]
            general_examples.append({"messages": messages})

    print(f" Sampled {len(general_examples)} general examples")
    return general_examples
def build_splits(agent, math, general, val_ratio=0.1):
    """Combine the three example lists, shuffle them, and split train/validation.

    The inputs are concatenated into a new list (the arguments themselves are
    not modified), shuffled with the module-level RNG, and the first
    ``int(total * val_ratio)`` examples become the validation set. A
    composition summary and the split sizes are printed.

    Empty input is supported: the percentages are reported as 0.0 and two
    empty lists are returned.

    Args:
        agent: agent-task examples.
        math: math reasoning examples.
        general: general instruction examples.
        val_ratio: fraction of the combined data reserved for validation.

    Returns:
        tuple[list, list]: ``(train_data, val_data)``.
    """
    all_examples = agent + math + general
    random.shuffle(all_examples)
    total = len(all_examples)

    def _percent(part):
        # Guard the division so an empty mix reports 0.0% instead of raising.
        return 100 * len(part) / total if total else 0.0

    print("\nDataset composition:")
    print(f" Agent: {len(agent):>5} ({_percent(agent):.1f}%)")
    print(f" Math: {len(math):>5} ({_percent(math):.1f}%)")
    print(f" General: {len(general):>5} ({_percent(general):.1f}%)")
    print(f" Total: {total:>5}")

    # int() truncates, so small datasets may get an empty validation set.
    val_size = int(total * val_ratio)
    val_data = all_examples[:val_size]
    train_data = all_examples[val_size:]

    print("\nSplit sizes:")
    print(f" Train: {len(train_data)}")
    print(f" Validation: {len(val_data)}")
    return train_data, val_data
def main():
    """Build the mixed SFT dataset, write it to disk, and push it to the Hub.

    Steps:

    1. Log in to the Hugging Face Hub if ``HF_TOKEN`` is set in the environment.
    2. Load the agent, math (n=3000) and general (n=1500) examples and split
       them into train/validation.
    3. Write ``train.jsonl`` and ``validation.jsonl`` under
       ``/tmp/agent-zero-sft-v2/data``, one JSON object per line.
    4. Push both splits to ``wheattoast11/agent-zero-sft-v2`` as a private
       dataset.
    """
    token = os.getenv("HF_TOKEN")
    if token:
        login(token=token)

    agent = load_agent_data()
    math = load_math_data(n=3000)
    general = load_general_data(n=1500)
    train_data, val_data = build_splits(agent, math, general)

    # Write JSONL files
    out_dir = Path("/tmp/agent-zero-sft-v2")
    data_dir = out_dir / "data"
    data_dir.mkdir(parents=True, exist_ok=True)
    for name, data in [("train", train_data), ("validation", val_data)]:
        path = data_dir / f"{name}.jsonl"
        # ensure_ascii=False emits raw non-ASCII characters, so the file
        # encoding must be pinned rather than left to the locale default.
        with open(path, "w", encoding="utf-8") as f:
            for ex in data:
                f.write(json.dumps(ex, ensure_ascii=False) + "\n")
        print(f"Wrote {path} ({len(data)} examples)")

    # Push to Hub
    print("\nPushing to Hub as wheattoast11/agent-zero-sft-v2...")
    train_ds = Dataset.from_list(train_data)
    val_ds = Dataset.from_list(val_data)
    ds_dict = DatasetDict({"train": train_ds, "validation": val_ds})
    ds_dict.push_to_hub(
        "wheattoast11/agent-zero-sft-v2",
        private=True,
    )
    print("Done! Dataset at: https://huggingface.co/datasets/wheattoast11/agent-zero-sft-v2")
# Run the build only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()