|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
|
Build Agent Zero SFT v2 mixed dataset. |
|
|
|
|
|
Composition (~5K-8K examples): |
|
|
40% Agent tasks — agent-zero-sft-v1 (1,200) + agent-zero-training-data agentic split (~300) |
|
|
40% Math reasoning — MetaMathQA chain-of-thought samples (~3,000) |
|
|
20% General — OpenHermes-2.5 high-quality instruction samples (~1,500) |
|
|
|
|
|
All formatted as multi-turn conversations in HF messages format. |
|
|
Pushed to: wheattoast11/agent-zero-sft-v2 |
|
|
""" |
|
|
|
|
|
import json |
|
|
import os |
|
|
import random |
|
|
from pathlib import Path |
|
|
|
|
|
from datasets import Dataset, DatasetDict, load_dataset |
|
|
from huggingface_hub import login |
|
|
|
|
|
# Fixed seed for the module-level `random` generator. The loaders below draw
# their samples (and the final shuffle) from that generator, so seeding it
# here, at import time, is what makes the sampling repeatable.
SEED = 42
random.seed(SEED)

# System prompt attached to agent-task examples converted from the
# agent-zero-training-data "agentic" split.
AGENT_SYSTEM_PROMPT = "\n".join(
    [
        "You are Agent Zero, an intelligent MCP (Model Context Protocol) server that provides "
        "research, knowledge base, and tool orchestration capabilities. You understand:",
        "- MCP tool calling with parameter normalization and schema validation",
        "- Intent classification for routing queries to appropriate handlers",
        "- Signal protocol for multi-model consensus and crystallization detection",
        "- Async job management with status tracking",
        "- Rail protocol for inter-agent communication with backpressure",
        "- Sandbox security configuration and permission management",
        # The empty entry yields the blank line between the list and the closing instruction.
        "",
        "Always respond with valid JSON tool calls when appropriate, classify user intents "
        "accurately, and maintain security boundaries.",
    ]
)

# System prompt attached to every MetaMathQA example.
MATH_SYSTEM_PROMPT = " ".join(
    [
        "You are a helpful assistant skilled in mathematical reasoning.",
        "Show your work step-by-step before giving the final answer.",
    ]
)

# System prompt attached to OpenHermes-2.5 examples.
GENERAL_SYSTEM_PROMPT = "You are a helpful, harmless, and honest assistant."
|
|
|
|
|
|
|
|
def load_agent_data():
    """Load the agent-task examples: sft-v1 train rows plus the agentic split.

    The rows of ``wheattoast11/agent-zero-sft-v1`` (``data/train.jsonl``) are
    taken as-is. Rows of the ``agentic`` split of
    ``wheattoast11/agent-zero-training-data`` are converted into
    system/user/assistant conversations using ``AGENT_SYSTEM_PROMPT``, the
    row's ``instruction`` and its ``output``.

    Loading the agentic split is best-effort: if it cannot be loaded or
    converted, a warning is printed and only the sft-v1 rows are returned.

    Returns:
        list[dict]: the collected examples. Converted rows have the shape
        ``{"messages": [...]}``; sft-v1 rows are passed through unchanged
        (presumably already in messages format — verify against the dataset).
    """
    print("Loading agent-zero-sft-v1...")
    sft_v1 = load_dataset(
        "wheattoast11/agent-zero-sft-v1",
        data_files="data/train.jsonl",
        split="train",
    )
    print(f" sft-v1 train: {len(sft_v1)} examples")

    agent_examples = list(sft_v1)

    print("Loading agent-zero-training-data (agentic split)...")
    try:
        training_data = load_dataset(
            "wheattoast11/agent-zero-training-data",
            split="agentic",
        )
        print(f" training-data agentic: {len(training_data)} examples")

        # Convert into a separate list first: if any row is malformed, nothing
        # from this split has been merged yet, so the "sft-v1 only" fallback
        # announced below is actually true.
        agentic_examples = [
            {
                "messages": [
                    {"role": "system", "content": AGENT_SYSTEM_PROMPT},
                    {"role": "user", "content": row["instruction"]},
                    {"role": "assistant", "content": row["output"]},
                ]
            }
            for row in training_data
        ]
    except Exception as e:
        # Deliberately broad: this split is optional and any failure
        # (download, missing split, unexpected schema) falls back to sft-v1.
        print(f" Warning: Could not load agentic split: {e}")
        print(" Continuing with sft-v1 only.")
    else:
        agent_examples.extend(agentic_examples)

    print(f" Total agent examples: {len(agent_examples)}")
    return agent_examples
|
|
|
|
|
|
|
|
def load_math_data(n=3000):
    """Draw up to ``n`` random MetaMathQA rows and wrap them as conversations.

    Each selected row becomes a three-message conversation: the math system
    prompt, the row's ``query`` as the user turn and its ``response`` as the
    assistant turn. Sampling uses the module-level ``random`` generator.

    Args:
        n: number of rows to sample; capped at the size of the train split.

    Returns:
        list[dict]: one ``{"messages": [...]}`` entry per sampled row.
    """
    print(f"Loading MetaMathQA (sampling {n})...")
    dataset = load_dataset("meta-math/MetaMathQA", split="train")
    print(f" Full dataset: {len(dataset)} examples")

    # Sample row positions without replacement, never asking for more rows
    # than the split contains.
    sample_size = min(n, len(dataset))
    chosen = random.sample(range(len(dataset)), sample_size)

    math_examples = [
        {
            "messages": [
                {"role": "system", "content": MATH_SYSTEM_PROMPT},
                {"role": "user", "content": row["query"]},
                {"role": "assistant", "content": row["response"]},
            ]
        }
        for row in dataset.select(chosen)
    ]

    print(f" Sampled {len(math_examples)} math examples")
    return math_examples
|
|
|
|
|
|
|
|
def load_general_data(n=1500):
    """Sample up to ``n`` OpenHermes-2.5 rows and convert them to messages format.

    Each row's ``conversations`` list is mapped turn by turn:

    * ``human`` / ``user``  -> ``user``
    * ``system``            -> not emitted as a turn; a non-empty system turn
      that precedes all other turns replaces ``GENERAL_SYSTEM_PROMPT`` as the
      conversation's single system message
    * anything else         -> ``assistant``

    Rows without conversations, and rows whose last turn is not an assistant
    turn, are dropped, so the result may contain fewer than ``n`` examples.
    Sampling uses the module-level ``random`` generator.

    Args:
        n: number of rows to sample; capped at the size of the train split.

    Returns:
        list[dict]: ``{"messages": [...]}`` entries, each starting with
        exactly one system message.
    """
    print(f"Loading OpenHermes-2.5 (sampling {n})...")
    ds = load_dataset("teknium/OpenHermes-2.5", split="train")
    print(f" Full dataset: {len(ds)} examples")

    indices = random.sample(range(len(ds)), min(n, len(ds)))
    samples = ds.select(indices)

    general_examples = []
    for row in samples:
        convos = row.get("conversations", [])
        if not convos:
            continue

        system_prompt = GENERAL_SYSTEM_PROMPT
        turns = []
        for turn in convos:
            speaker = turn["from"]
            if speaker == "system":
                # A system turn must not be relabelled as an assistant reply.
                # Keep the row's own leading system prompt (the answers were
                # written against it); ignore system turns anywhere else so
                # the conversation has a single system message.
                if not turns and turn["value"]:
                    system_prompt = turn["value"]
                continue
            role = "user" if speaker in ("human", "user") else "assistant"
            turns.append({"role": role, "content": turn["value"]})

        # Only keep conversations that end on an assistant turn.
        if turns and turns[-1]["role"] == "assistant":
            general_examples.append(
                {"messages": [{"role": "system", "content": system_prompt}, *turns]}
            )

    print(f" Sampled {len(general_examples)} general examples")
    return general_examples
|
|
|
|
|
|
|
|
def build_splits(agent, math, general, val_ratio=0.1):
    """Combine, shuffle, and split the examples into train/validation.

    The three input lists are concatenated into a new list (the inputs are
    not modified), shuffled with the module-level ``random`` generator, and
    the first ``int(total * val_ratio)`` examples become the validation set.
    A composition summary and the split sizes are printed.

    Args:
        agent: agent-task examples.
        math: math-reasoning examples.
        general: general-instruction examples.
        val_ratio: fraction of the combined examples used for validation.

    Returns:
        tuple[list, list]: ``(train_data, val_data)``. Both are empty when no
        examples are supplied.
    """
    all_examples = agent + math + general
    random.shuffle(all_examples)

    total = len(all_examples)

    print("\nDataset composition:")
    for label, group in (("Agent", agent), ("Math", math), ("General", general)):
        # Guard the percentage so an empty dataset reports 0.0% instead of
        # raising ZeroDivisionError.
        share = 100 * len(group) / total if total else 0.0
        print(f" {label}: {len(group):>5} ({share:.1f}%)")
    print(f" Total: {total:>5}")

    val_size = int(total * val_ratio)
    val_data = all_examples[:val_size]
    train_data = all_examples[val_size:]

    print("\nSplit sizes:")
    print(f" Train: {len(train_data)}")
    print(f" Validation: {len(val_data)}")

    return train_data, val_data
|
|
|
|
|
|
|
|
def main():
    """Build the mixed dataset, write it to disk, and push it to the Hub.

    Steps:
      1. Log in to the Hugging Face Hub when ``HF_TOKEN`` is set.
      2. Load the agent, math (n=3000) and general (n=1500) examples.
      3. Shuffle and split them into train/validation.
      4. Write ``train.jsonl`` and ``validation.jsonl`` under
         ``/tmp/agent-zero-sft-v2/data``.
      5. Push both splits to ``wheattoast11/agent-zero-sft-v2`` as a private
         dataset.
    """
    token = os.getenv("HF_TOKEN")
    if token:
        login(token=token)

    agent = load_agent_data()
    math_examples = load_math_data(n=3000)
    general = load_general_data(n=1500)

    train_data, val_data = build_splits(agent, math_examples, general)

    out_dir = Path("/tmp/agent-zero-sft-v2")
    data_dir = out_dir / "data"
    data_dir.mkdir(parents=True, exist_ok=True)

    for name, data in [("train", train_data), ("validation", val_data)]:
        path = data_dir / f"{name}.jsonl"
        # ensure_ascii=False emits raw non-ASCII characters, so the file must
        # be opened as UTF-8 explicitly rather than with the locale default.
        with open(path, "w", encoding="utf-8") as f:
            for ex in data:
                f.write(json.dumps(ex, ensure_ascii=False) + "\n")
        print(f"Wrote {path} ({len(data)} examples)")

    print("\nPushing to Hub as wheattoast11/agent-zero-sft-v2...")
    train_ds = Dataset.from_list(train_data)
    val_ds = Dataset.from_list(val_data)
    ds_dict = DatasetDict({"train": train_ds, "validation": val_ds})
    ds_dict.push_to_hub(
        "wheattoast11/agent-zero-sft-v2",
        private=True,
    )
    print("Done! Dataset at: https://huggingface.co/datasets/wheattoast11/agent-zero-sft-v2")
|
|
|
|
|
|
|
|
# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|