# Multi_Agent_Model/src/tools/export_training_data.py
# Author: sonthaiha — commit 1804a7a ("Fresh Deployment with LFS")
import json
import glob
import os
import random
import re
# CONFIG
# Locations to look for your valid blueprints
BLUEPRINT_DIRS = ["/content/drive/MyDrive/ProjectA_Backup/src/data/blueprints", "my_workflows"]
# SQLite database path — declared here but not referenced anywhere in this module.
DB_PATH = "/content/drive/MyDrive/ProjectA_Backup/src/data/project_a.db"
# Destination JSONL file written by main(); one chat-format sample per line.
OUTPUT_FILE = "/content/drive/MyDrive/ProjectA_Backup/src/data/training_dataset.jsonl"
# Identity
# System message injected as the first turn of every generated training sample.
SYSTEM_PROMPT = "You are Project A, the Lead Automation Engineer. You generate strict Make.com JSON blueprints."
def clean_json_string(data):
    """Serialize *data* as pretty-printed UTF-8 JSON for the training target.

    Note: despite the name, this does not minify — it emits 2-space-indented
    JSON (non-ASCII kept literal) so the model learns readable structure.
    """
    return json.dumps(data, ensure_ascii=False, indent=2)
def generate_prompts_for_blueprint(filename, data):
    """Reverse-engineer synthetic user prompts from a blueprint's module list.

    This is 'Reverse Engineering' the prompt from the answer: we inspect the
    modules in the blueprint's flow and synthesize requests a user might have
    typed to produce it.

    Args:
        filename: Basename of the blueprint file (e.g. "WF_Invoice_Sync.json"),
            used to derive a human-readable task description.
        data: Parsed blueprint JSON; module names are read from data["flow"].

    Returns:
        A list of four prompt strings (English and Vietnamese variants), or
        an empty list when no modules can be extracted.
    """
    # 1. Analyze the flow, skipping malformed (non-dict) nodes so a single
    # bad entry cannot raise TypeError on node["module"].
    flow = data.get("flow", []) if isinstance(data, dict) else []
    modules = [node["module"] for node in flow
               if isinstance(node, dict) and "module" in node]
    if not modules:
        return []

    # Simplify module names for natural language (e.g. google-sheets:addRow -> Google Sheets)
    human_modules = [m.split(':')[0].replace('-', ' ').title() for m in modules]
    flow_summary = " -> ".join(human_modules)

    # 2. Create variations (English & Vietnamese)
    prompts = [
        # Variation A: Direct Request (English)
        f"Create a Make.com automation that connects {flow_summary}.",
        # Variation B: "Build" Intent (English)
        f"Build a workflow: {flow_summary}.",
        # Variation C: Vietnamese Intent
        f"Tạo quy trình tự động hóa: {flow_summary}.",
    ]

    # Variation D: Specific Filename Context (if filename is descriptive).
    # Anchored regexes fix the original bug where replace() stripped "WF_"
    # and ".json" anywhere in the name (e.g. "Sync_WF_Backup.json" lost its
    # inner "WF_"), instead of only the prefix/suffix.
    clean_name = re.sub(r"^WF_", "", filename)
    clean_name = re.sub(r"\.json$", "", clean_name).replace("_", " ")
    prompts.append(f"Design an automation for: {clean_name}")
    return prompts
def export_blueprints(blueprint_dirs=None):
    """Scan blueprint directories and build chat-format training samples.

    Each valid blueprint yields several (synthetic prompt, JSON answer)
    pairs in OpenAI-style "messages" format. Unreadable or invalid files
    are logged and skipped — this is deliberately best-effort.

    Args:
        blueprint_dirs: Optional list of directories to scan for *.json
            files; defaults to the module-level BLUEPRINT_DIRS.

    Returns:
        A list of {"messages": [...]} dicts, one per (prompt, blueprint) pair.
    """
    dirs = BLUEPRINT_DIRS if blueprint_dirs is None else blueprint_dirs
    print(f"🔍 Scanning for blueprints in {dirs}...")
    samples = []
    found_files = []
    for d in dirs:
        found_files.extend(glob.glob(os.path.join(d, "*.json")))
    print(f" Found {len(found_files)} files.")
    for fpath in found_files:
        try:
            with open(fpath, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Validate it's a real blueprint: must be a JSON object carrying
            # either a "flow" or "scenarios" key (isinstance guard also
            # prevents TypeError on non-container top-level JSON values).
            if not isinstance(data, dict):
                continue
            if "flow" not in data and "scenarios" not in data:
                continue
            # Generate synthetic user inputs
            user_prompts = generate_prompts_for_blueprint(os.path.basename(fpath), data)
            # The target output: the blueprint JSON in a fenced code block.
            assistant_response = f"```json\n{clean_json_string(data)}\n```"
            # Create one training pair per synthetic prompt.
            for p in user_prompts:
                samples.append({
                    "messages": [
                        {"role": "system", "content": SYSTEM_PROMPT},
                        {"role": "user", "content": p},
                        {"role": "assistant", "content": assistant_response},
                    ]
                })
        except Exception as e:
            # Broad on purpose: one corrupt file must not abort the export.
            print(f" ⚠️ Error reading {fpath}: {e}")
    print(f"✅ Generated {len(samples)} training samples from blueprints.")
    return samples
def main():
    """Entry point: build the dataset, shuffle it, and write it out as JSONL."""
    print("🏭 Starting Data Factory...")
    dataset = export_blueprints()

    # Shuffle so fine-tuning doesn't overfit to one type of task sequence.
    random.shuffle(dataset)

    # Persist as JSONL — one compact JSON object per line, UTF-8 kept literal.
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        out.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in dataset)

    print(f"🎉 SUCCESS: Dataset saved to {OUTPUT_FILE}")
    print(f"📊 Total Training Rows: {len(dataset)}")
    print("👉 Next Step: Use this file to fine-tune Qwen using LoRA.")


if __name__ == "__main__":
    main()