# Data factory: scans Make.com blueprint JSON files and exports them as a
# JSONL chat-format dataset for fine-tuning.
| import json | |
| import glob | |
| import os | |
| import random | |
| import re | |
# CONFIG
# Locations to look for your valid blueprints
BLUEPRINT_DIRS = ["/content/drive/MyDrive/ProjectA_Backup/src/data/blueprints", "my_workflows"]
# SQLite database path — not referenced in the visible code; presumably used
# elsewhere in the project (TODO confirm before removing).
DB_PATH = "/content/drive/MyDrive/ProjectA_Backup/src/data/project_a.db"
# Destination JSONL file: one chat-format training sample per line.
OUTPUT_FILE = "/content/drive/MyDrive/ProjectA_Backup/src/data/training_dataset.jsonl"
# Identity
# System prompt prepended to every training sample.
SYSTEM_PROMPT = "You are Project A, the Lead Automation Engineer. You generate strict Make.com JSON blueprints."
def clean_json_string(data):
    """Serialize *data* to pretty-printed UTF-8 JSON.

    Note: despite the old name, this does NOT minify — ``indent=2`` keeps
    the structure readable, which is intentional for training targets.
    ``ensure_ascii=False`` preserves non-ASCII text (e.g. Vietnamese) as-is.

    Args:
        data: Any JSON-serializable object (typically a parsed blueprint dict).

    Returns:
        The pretty-printed JSON string.
    """
    return json.dumps(data, indent=2, ensure_ascii=False)
def generate_prompts_for_blueprint(filename, data):
    """Generate synthetic user prompts for a blueprint.

    This is 'reverse engineering' the prompt from the answer: the module
    flow inside the blueprint JSON is summarized into natural language.

    Args:
        filename: Basename of the blueprint file (used for one prompt variant).
        data: Parsed blueprint JSON; modules are read from ``data["flow"]``.

    Returns:
        A list of prompt strings; empty if no modules were found.
    """
    # 1. Analyze the flow: collect module identifiers such as
    #    "google-sheets:addRow". Skip non-dict nodes defensively.
    modules = [
        node["module"]
        for node in data.get("flow", [])
        if isinstance(node, dict) and "module" in node
    ]
    if not modules:
        return []

    # Simplify module names for natural language
    # (e.g. google-sheets:addRow -> Google Sheets).
    human_modules = [m.split(':')[0].replace('-', ' ').title() for m in modules]
    flow_summary = " -> ".join(human_modules)

    prompts = []
    # 2. Create Variations (English & Vietnamese)
    # Variation A: Direct Request (English)
    prompts.append(f"Create a Make.com automation that connects {flow_summary}.")
    # Variation B: "Build" Intent (English)
    prompts.append(f"Build a workflow: {flow_summary}.")
    # Variation C: Vietnamese Intent
    prompts.append(f"Tạo quy trình tự động hóa: {flow_summary}.")
    # Variation D: Filename context. Strip only the leading "WF_" and the
    # real extension — the old chained .replace() calls would also delete
    # "WF_"/".json" occurring anywhere mid-name.
    stem = os.path.splitext(filename)[0]
    if stem.startswith("WF_"):
        stem = stem[len("WF_"):]
    clean_name = stem.replace("_", " ")
    prompts.append(f"Design an automation for: {clean_name}")
    return prompts
def export_blueprints():
    """Scan BLUEPRINT_DIRS for *.json blueprints and build chat training samples.

    Returns:
        A list of ``{"messages": [...]}`` dicts — one per (prompt, blueprint)
        pair. Files that fail to parse, or lack both a "flow" and a
        "scenarios" key, are skipped.
    """
    print(f"🔍 Scanning for blueprints in {BLUEPRINT_DIRS}...")

    found_files = []
    for directory in BLUEPRINT_DIRS:
        found_files += glob.glob(os.path.join(directory, "*.json"))
    print(f" Found {len(found_files)} files.")

    samples = []
    for fpath in found_files:
        try:
            with open(fpath, 'r', encoding='utf-8') as fh:
                blueprint = json.load(fh)

            # Only keep files that look like real blueprints.
            if "flow" not in blueprint and "scenarios" not in blueprint:
                continue

            # Synthetic user inputs, reverse-engineered from the blueprint.
            user_prompts = generate_prompts_for_blueprint(os.path.basename(fpath), blueprint)

            # Target completion: the blueprint JSON in a fenced code block.
            assistant_response = f"```json\n{clean_json_string(blueprint)}\n```"

            # One training pair per synthetic prompt, all sharing the target.
            samples.extend(
                {
                    "messages": [
                        {"role": "system", "content": SYSTEM_PROMPT},
                        {"role": "user", "content": prompt},
                        {"role": "assistant", "content": assistant_response},
                    ]
                }
                for prompt in user_prompts
            )
        except Exception as e:
            print(f" ⚠️ Error reading {fpath}: {e}")

    print(f"✅ Generated {len(samples)} training samples from blueprints.")
    return samples
def main():
    """Entry point: build the dataset, shuffle it, and write it out as JSONL."""
    print("🏭 Starting Data Factory...")
    dataset = export_blueprints()

    # Shuffle to prevent overfitting to one type of task sequence.
    random.shuffle(dataset)

    # Persist as JSONL: one chat sample per line.
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        out.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in dataset)

    print(f"🎉 SUCCESS: Dataset saved to {OUTPUT_FILE}")
    print(f"📊 Total Training Rows: {len(dataset)}")
    print("👉 Next Step: Use this file to fine-tune Qwen using LoRA.")


if __name__ == "__main__":
    main()