# Data factory: scans Make.com blueprint JSON files and exports them as a
# JSONL chat-format dataset for fine-tuning.
| import json | |
| import glob | |
| import os | |
| import random | |
| import re | |
# CONFIG
# Locations to look for your valid blueprints
BLUEPRINT_DIRS = ["/content/drive/MyDrive/ProjectA_Backup/src/data/blueprints", "my_workflows"]
# SQLite database path — not referenced in the visible code; presumably used
# elsewhere in the project (TODO confirm before removing).
DB_PATH = "/content/drive/MyDrive/ProjectA_Backup/src/data/project_a.db"
# Destination JSONL file: one chat-format training sample per line.
OUTPUT_FILE = "/content/drive/MyDrive/ProjectA_Backup/src/data/training_dataset.jsonl"
# Identity
# System prompt prepended to every training sample.
SYSTEM_PROMPT = "You are Project A, the Lead Automation Engineer. You generate strict Make.com JSON blueprints."
def clean_json_string(data):
    """Serialize *data* to pretty-printed UTF-8 JSON.

    Note: despite the old name, this does NOT minify — ``indent=2`` keeps
    the structure readable, which is intentional for training targets.
    ``ensure_ascii=False`` preserves non-ASCII text (e.g. Vietnamese) as-is.

    Args:
        data: Any JSON-serializable object (typically a parsed blueprint dict).

    Returns:
        The pretty-printed JSON string.
    """
    return json.dumps(data, indent=2, ensure_ascii=False)
def generate_prompts_for_blueprint(filename, data):
    """Generate synthetic user prompts for a blueprint.

    This is 'reverse engineering' the prompt from the answer: the module
    flow inside the blueprint JSON is summarized into natural language.

    Args:
        filename: Basename of the blueprint file (used for one prompt variant).
        data: Parsed blueprint JSON; modules are read from ``data["flow"]``.

    Returns:
        A list of prompt strings; empty if no modules were found.
    """
    # 1. Analyze the flow: collect module identifiers such as
    #    "google-sheets:addRow". Skip non-dict nodes defensively.
    modules = [
        node["module"]
        for node in data.get("flow", [])
        if isinstance(node, dict) and "module" in node
    ]
    if not modules:
        return []

    # Simplify module names for natural language
    # (e.g. google-sheets:addRow -> Google Sheets).
    human_modules = [m.split(':')[0].replace('-', ' ').title() for m in modules]
    flow_summary = " -> ".join(human_modules)

    prompts = []
    # 2. Create Variations (English & Vietnamese)
    # Variation A: Direct Request (English)
    prompts.append(f"Create a Make.com automation that connects {flow_summary}.")
    # Variation B: "Build" Intent (English)
    prompts.append(f"Build a workflow: {flow_summary}.")
    # Variation C: Vietnamese Intent
    prompts.append(f"Tạo quy trình tự động hóa: {flow_summary}.")
    # Variation D: Filename context. Strip only the leading "WF_" and the
    # real extension — the old chained .replace() calls would also delete
    # "WF_"/".json" occurring anywhere mid-name.
    stem = os.path.splitext(filename)[0]
    if stem.startswith("WF_"):
        stem = stem[len("WF_"):]
    clean_name = stem.replace("_", " ")
    prompts.append(f"Design an automation for: {clean_name}")
    return prompts
def export_blueprints():
    """Scan BLUEPRINT_DIRS for *.json blueprints and build chat training samples.

    Returns:
        A list of ``{"messages": [...]}`` dicts — one per (prompt, blueprint)
        pair. Files that fail to parse, or lack both a "flow" and a
        "scenarios" key, are skipped.
    """
    print(f"🔍 Scanning for blueprints in {BLUEPRINT_DIRS}...")

    found_files = []
    for directory in BLUEPRINT_DIRS:
        found_files += glob.glob(os.path.join(directory, "*.json"))
    print(f" Found {len(found_files)} files.")

    samples = []
    for fpath in found_files:
        try:
            with open(fpath, 'r', encoding='utf-8') as fh:
                blueprint = json.load(fh)

            # Only keep files that look like real blueprints.
            if "flow" not in blueprint and "scenarios" not in blueprint:
                continue

            # Synthetic user inputs, reverse-engineered from the blueprint.
            user_prompts = generate_prompts_for_blueprint(os.path.basename(fpath), blueprint)

            # Target completion: the blueprint JSON in a fenced code block.
            assistant_response = f"```json\n{clean_json_string(blueprint)}\n```"

            # One training pair per synthetic prompt, all sharing the target.
            samples.extend(
                {
                    "messages": [
                        {"role": "system", "content": SYSTEM_PROMPT},
                        {"role": "user", "content": prompt},
                        {"role": "assistant", "content": assistant_response},
                    ]
                }
                for prompt in user_prompts
            )
        except Exception as e:
            print(f" ⚠️ Error reading {fpath}: {e}")

    print(f"✅ Generated {len(samples)} training samples from blueprints.")
    return samples
def main():
    """Entry point: build the dataset, shuffle it, and write it out as JSONL."""
    print("🏭 Starting Data Factory...")
    dataset = export_blueprints()

    # Shuffle to prevent overfitting to one type of task sequence.
    random.shuffle(dataset)

    # Persist as JSONL: one chat sample per line.
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        out.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in dataset)

    print(f"🎉 SUCCESS: Dataset saved to {OUTPUT_FILE}")
    print(f"📊 Total Training Rows: {len(dataset)}")
    print("👉 Next Step: Use this file to fine-tune Qwen using LoRA.")


if __name__ == "__main__":
    main()