wheattoast11 committed on
Commit
fe2d96b
·
verified ·
1 Parent(s): bdb8ce4

Upload build_sft_v2.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. build_sft_v2.py +202 -0
build_sft_v2.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.10"
3
+ # dependencies = [
4
+ # "datasets>=3.0.0",
5
+ # "huggingface_hub>=0.20.0",
6
+ # ]
7
+ # ///
8
+
9
+ """
10
+ Build Agent Zero SFT v2 mixed dataset.
11
+
12
+ Composition (~5K-8K examples):
13
+ 40% Agent tasks — agent-zero-sft-v1 (1,200) + agent-zero-training-data agentic split (~300)
14
+ 40% Math reasoning — MetaMathQA chain-of-thought samples (~3,000)
15
+ 20% General — OpenHermes-2.5 high-quality instruction samples (~1,500)
16
+
17
+ All formatted as multi-turn conversations in HF messages format.
18
+ Pushed to: wheattoast11/agent-zero-sft-v2
19
+ """
20
+
21
+ import json
22
+ import os
23
+ import random
24
+ from pathlib import Path
25
+
26
+ from datasets import Dataset, DatasetDict, load_dataset
27
+ from huggingface_hub import login
28
+
29
+ SEED = 42
30
+ random.seed(SEED)
31
+
32
+ AGENT_SYSTEM_PROMPT = (
33
+ "You are Agent Zero, an intelligent MCP (Model Context Protocol) server that provides "
34
+ "research, knowledge base, and tool orchestration capabilities. You understand:\n"
35
+ "- MCP tool calling with parameter normalization and schema validation\n"
36
+ "- Intent classification for routing queries to appropriate handlers\n"
37
+ "- Signal protocol for multi-model consensus and crystallization detection\n"
38
+ "- Async job management with status tracking\n"
39
+ "- Rail protocol for inter-agent communication with backpressure\n"
40
+ "- Sandbox security configuration and permission management\n\n"
41
+ "Always respond with valid JSON tool calls when appropriate, classify user intents "
42
+ "accurately, and maintain security boundaries."
43
+ )
44
+
45
+ MATH_SYSTEM_PROMPT = (
46
+ "You are a helpful assistant skilled in mathematical reasoning. "
47
+ "Show your work step-by-step before giving the final answer."
48
+ )
49
+
50
+ GENERAL_SYSTEM_PROMPT = (
51
+ "You are a helpful, harmless, and honest assistant."
52
+ )
53
+
54
+
55
+ def load_agent_data():
56
+ """Load agent-zero-sft-v1 train split + agent-zero-training-data agentic split."""
57
+ print("Loading agent-zero-sft-v1...")
58
+ sft_v1 = load_dataset(
59
+ "wheattoast11/agent-zero-sft-v1",
60
+ data_files="data/train.jsonl",
61
+ split="train",
62
+ )
63
+ print(f" sft-v1 train: {len(sft_v1)} examples")
64
+
65
+ # These already have 'messages' field in correct format
66
+ agent_examples = list(sft_v1)
67
+
68
+ # Load training-data agentic split and convert to messages format
69
+ print("Loading agent-zero-training-data (agentic split)...")
70
+ try:
71
+ training_data = load_dataset(
72
+ "wheattoast11/agent-zero-training-data",
73
+ split="agentic",
74
+ )
75
+ print(f" training-data agentic: {len(training_data)} examples")
76
+
77
+ for row in training_data:
78
+ messages = [
79
+ {"role": "system", "content": AGENT_SYSTEM_PROMPT},
80
+ {"role": "user", "content": row["instruction"]},
81
+ {"role": "assistant", "content": row["output"]},
82
+ ]
83
+ agent_examples.append({"messages": messages})
84
+ except Exception as e:
85
+ print(f" Warning: Could not load agentic split: {e}")
86
+ print(" Continuing with sft-v1 only.")
87
+
88
+ print(f" Total agent examples: {len(agent_examples)}")
89
+ return agent_examples
90
+
91
+
92
+ def load_math_data(n=3000):
93
+ """Sample n chain-of-thought examples from MetaMathQA."""
94
+ print(f"Loading MetaMathQA (sampling {n})...")
95
+ ds = load_dataset("meta-math/MetaMathQA", split="train")
96
+ print(f" Full dataset: {len(ds)} examples")
97
+
98
+ indices = random.sample(range(len(ds)), min(n, len(ds)))
99
+ samples = ds.select(indices)
100
+
101
+ math_examples = []
102
+ for row in samples:
103
+ messages = [
104
+ {"role": "system", "content": MATH_SYSTEM_PROMPT},
105
+ {"role": "user", "content": row["query"]},
106
+ {"role": "assistant", "content": row["response"]},
107
+ ]
108
+ math_examples.append({"messages": messages})
109
+
110
+ print(f" Sampled {len(math_examples)} math examples")
111
+ return math_examples
112
+
113
+
114
+ def load_general_data(n=1500):
115
+ """Sample n high-quality instruction examples from OpenHermes-2.5."""
116
+ print(f"Loading OpenHermes-2.5 (sampling {n})...")
117
+ ds = load_dataset("teknium/OpenHermes-2.5", split="train")
118
+ print(f" Full dataset: {len(ds)} examples")
119
+
120
+ indices = random.sample(range(len(ds)), min(n, len(ds)))
121
+ samples = ds.select(indices)
122
+
123
+ general_examples = []
124
+ for row in samples:
125
+ # OpenHermes has 'conversations' field with list of {from, value}
126
+ convos = row.get("conversations", [])
127
+ if not convos:
128
+ continue
129
+
130
+ messages = [{"role": "system", "content": GENERAL_SYSTEM_PROMPT}]
131
+ for turn in convos:
132
+ role = "user" if turn["from"] in ("human", "user") else "assistant"
133
+ messages.append({"role": role, "content": turn["value"]})
134
+
135
+ # Ensure conversation ends with assistant
136
+ if messages[-1]["role"] == "assistant":
137
+ general_examples.append({"messages": messages})
138
+
139
+ print(f" Sampled {len(general_examples)} general examples")
140
+ return general_examples
141
+
142
+
143
+ def build_splits(agent, math, general, val_ratio=0.1):
144
+ """Combine, shuffle, and split into train/validation."""
145
+ all_examples = agent + math + general
146
+ random.shuffle(all_examples)
147
+
148
+ # Tag source for analysis (not included in final messages)
149
+ print(f"\nDataset composition:")
150
+ print(f" Agent: {len(agent):>5} ({100*len(agent)/len(all_examples):.1f}%)")
151
+ print(f" Math: {len(math):>5} ({100*len(math)/len(all_examples):.1f}%)")
152
+ print(f" General: {len(general):>5} ({100*len(general)/len(all_examples):.1f}%)")
153
+ print(f" Total: {len(all_examples):>5}")
154
+
155
+ val_size = int(len(all_examples) * val_ratio)
156
+ val_data = all_examples[:val_size]
157
+ train_data = all_examples[val_size:]
158
+
159
+ print(f"\nSplit sizes:")
160
+ print(f" Train: {len(train_data)}")
161
+ print(f" Validation: {len(val_data)}")
162
+
163
+ return train_data, val_data
164
+
165
+
166
+ def main():
167
+ token = os.getenv("HF_TOKEN")
168
+ if token:
169
+ login(token=token)
170
+
171
+ agent = load_agent_data()
172
+ math = load_math_data(n=3000)
173
+ general = load_general_data(n=1500)
174
+
175
+ train_data, val_data = build_splits(agent, math, general)
176
+
177
+ # Write JSONL files
178
+ out_dir = Path("/tmp/agent-zero-sft-v2")
179
+ data_dir = out_dir / "data"
180
+ data_dir.mkdir(parents=True, exist_ok=True)
181
+
182
+ for name, data in [("train", train_data), ("validation", val_data)]:
183
+ path = data_dir / f"{name}.jsonl"
184
+ with open(path, "w") as f:
185
+ for ex in data:
186
+ f.write(json.dumps(ex, ensure_ascii=False) + "\n")
187
+ print(f"Wrote {path} ({len(data)} examples)")
188
+
189
+ # Push to Hub
190
+ print("\nPushing to Hub as wheattoast11/agent-zero-sft-v2...")
191
+ train_ds = Dataset.from_list(train_data)
192
+ val_ds = Dataset.from_list(val_data)
193
+ ds_dict = DatasetDict({"train": train_ds, "validation": val_ds})
194
+ ds_dict.push_to_hub(
195
+ "wheattoast11/agent-zero-sft-v2",
196
+ private=True,
197
+ )
198
+ print("Done! Dataset at: https://huggingface.co/datasets/wheattoast11/agent-zero-sft-v2")
199
+
200
+
201
+ if __name__ == "__main__":
202
+ main()