# llm-agent-factory/script/build_sft_train.py
# bridges-optimal-55's picture
# Initial commit
# 505aa09
import json
from pathlib import Path
def main(
    base_dir: Path = Path("dataset/agent_tasks"),
    output_dir: Path = Path("train"),
) -> None:
    """Build one SFT training JSONL file per dataset directory.

    Every sub-directory of ``base_dir`` that contains both ``dataset.json``
    and ``tasks.json`` is converted into
    ``output_dir / "sft_train_<sub-directory name>.jsonl"``. Each line of
    that file is a JSON object ``{"input": <task>, "output": <agent>}``,
    where ``<agent>`` is the agent definition from ``dataset.json`` reduced
    to the keys ``agent_id``, ``display_name``, ``persona``, ``description``
    and ``tools``.

    Args:
        base_dir: Directory whose sub-directories hold the dataset files.
            Defaults to the original hard-coded ``dataset/agent_tasks``.
        output_dir: Directory that receives the JSONL files; it is created
            (including missing parents) when absent. Defaults to ``train``.

    Raises:
        FileNotFoundError: If ``base_dir`` does not exist.
        KeyError: If a JSON document lacks its ``"agents"`` list, an entry
            lacks ``"agent_id"``, or a matched agent lacks an exported field.
        json.JSONDecodeError: If a dataset or tasks file is not valid JSON.
    """
    # Accept plain strings as well as Path objects.
    base_dir = Path(base_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    fields = ("agent_id", "display_name", "persona", "description", "tools")

    # Sorted so the datasets are processed in a deterministic order.
    for dataset_dir in sorted(base_dir.iterdir()):
        if not dataset_dir.is_dir():
            continue

        dataset_path = dataset_dir / "dataset.json"
        tasks_path = dataset_dir / "tasks.json"
        # Directories that lack either file are skipped silently.
        if not (dataset_path.exists() and tasks_path.exists()):
            continue

        dataset_data = json.loads(dataset_path.read_text(encoding="utf-8"))
        tasks_data = json.loads(tasks_path.read_text(encoding="utf-8"))

        # Index the agent definitions before opening the output file, so a
        # malformed dataset.json fails without leaving an empty JSONL behind.
        agent_map = {agent["agent_id"]: agent for agent in dataset_data["agents"]}

        output_path = output_dir / f"sft_train_{dataset_dir.name}.jsonl"
        with output_path.open("w", encoding="utf-8") as output_file:
            output_file.writelines(
                # ensure_ascii=False keeps non-ASCII text readable in the file.
                json.dumps(record, ensure_ascii=False) + "\n"
                for record in _iter_sft_records(agent_map, tasks_data["agents"], fields)
            )


def _iter_sft_records(agent_map, agent_entries, fields):
    """Yield one ``{"input": task, "output": agent}`` record per listed task."""
    for agent_entry in agent_entries:
        agent = agent_map.get(agent_entry["agent_id"])
        # Entries whose agent is unknown (or defined as empty) are skipped.
        if not agent:
            continue
        agent_output = {key: agent[key] for key in fields}
        # An entry without a "tasks" key contributes no records.
        for task in agent_entry.get("tasks", []):
            yield {"input": task, "output": agent_output}
# Run the conversion only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()