File size: 4,096 Bytes
1804a7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import json
import glob
import os
import random
import re

# CONFIG
# Locations to look for your valid blueprints
BLUEPRINT_DIRS: list[str] = ["/content/drive/MyDrive/ProjectA_Backup/src/data/blueprints", "my_workflows"]
# NOTE(review): DB_PATH is not referenced anywhere in this file — confirm it is
# used elsewhere before removing.
DB_PATH: str = "/content/drive/MyDrive/ProjectA_Backup/src/data/project_a.db"
# Destination for the generated JSONL training dataset (one sample per line).
OUTPUT_FILE: str = "/content/drive/MyDrive/ProjectA_Backup/src/data/training_dataset.jsonl"

# Identity
# System message injected as the first turn of every training sample.
SYSTEM_PROMPT: str = "You are Project A, the Lead Automation Engineer. You generate strict Make.com JSON blueprints."

def clean_json_string(data) -> str:
    """Serialize *data* to a pretty-printed JSON string for training targets.

    Uses ``indent=2`` (this pretty-prints rather than minifies — the previous
    docstring was wrong) and ``ensure_ascii=False`` so non-ASCII text such as
    Vietnamese stays human-readable in the dataset.
    """
    return json.dumps(data, indent=2, ensure_ascii=False)

def generate_prompts_for_blueprint(filename, data):
    """
    Reverse-engineer synthetic user prompts from a blueprint's module list.

    Given the blueprint JSON (the "answer"), derive several natural-language
    requests — English and Vietnamese — that could plausibly have produced it.
    Returns an empty list when no modules are present.
    """
    # Collect raw module identifiers from the flow, if one exists.
    module_ids = [node["module"] for node in data.get("flow", []) if "module" in node]
    if not module_ids:
        return []

    # Humanize identifiers: "google-sheets:addRow" -> "Google Sheets".
    readable = [mid.split(':')[0].replace('-', ' ').title() for mid in module_ids]
    flow_summary = " -> ".join(readable)

    # Derive a readable task name from the filename itself
    # (strip the "WF_" prefix, the ".json" suffix, and underscores).
    clean_name = filename.replace("WF_", "").replace(".json", "").replace("_", " ")

    return [
        # Variation A: direct request (English)
        f"Create a Make.com automation that connects {flow_summary}.",
        # Variation B: "build" intent (English)
        f"Build a workflow: {flow_summary}.",
        # Variation C: Vietnamese intent
        f"Tạo quy trình tự động hóa: {flow_summary}.",
        # Variation D: filename-derived context
        f"Design an automation for: {clean_name}",
    ]

def export_blueprints():
    """
    Scan BLUEPRINT_DIRS for ``*.json`` blueprints and build chat-format samples.

    Returns a list of ``{"messages": [...]}`` dicts — one per (synthetic
    prompt, blueprint) pair. Unreadable or non-blueprint files are skipped
    with a warning rather than aborting the whole export.
    """
    print(f"🔍 Scanning for blueprints in {BLUEPRINT_DIRS}...")
    samples = []

    found_files = []
    for d in BLUEPRINT_DIRS:
        found_files.extend(glob.glob(os.path.join(d, "*.json")))
    # De-duplicate (overlapping dirs can match the same file twice, which
    # would duplicate training rows) and sort for a deterministic export.
    found_files = sorted(set(found_files))

    print(f"   Found {len(found_files)} files.")

    for fpath in found_files:
        try:
            with open(fpath, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Validate it's a real blueprint: a JSON *object* carrying either
            # a "flow" or a "scenarios" section. The isinstance guard keeps
            # top-level arrays/strings from slipping past the key checks.
            if not isinstance(data, dict):
                continue
            if "flow" not in data and "scenarios" not in data:
                continue

            # Generate synthetic user inputs
            user_prompts = generate_prompts_for_blueprint(os.path.basename(fpath), data)

            # The target output: the blueprint JSON in a fenced code block.
            assistant_response = f"```json\n{clean_json_string(data)}\n```"

            # One training pair per synthetic prompt.
            for p in user_prompts:
                sample = {
                    "messages": [
                        {"role": "system", "content": SYSTEM_PROMPT},
                        {"role": "user", "content": p},
                        {"role": "assistant", "content": assistant_response}
                    ]
                }
                samples.append(sample)
        except Exception as e:
            # Deliberately broad: a single corrupt file must not kill the
            # whole export; we log and move on.
            print(f"   ⚠️ Error reading {fpath}: {e}")

    print(f"✅ Generated {len(samples)} training samples from blueprints.")
    return samples

def main():
    """Entry point: build the dataset, shuffle it, and write it out as JSONL."""
    print("🏭 Starting Data Factory...")

    dataset = export_blueprints()

    # Shuffle to prevent overfitting to one type of task sequence
    random.shuffle(dataset)

    # Persist as JSONL: one JSON object per line.
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        out.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in dataset)

    print(f"🎉 SUCCESS: Dataset saved to {OUTPUT_FILE}")
    print(f"📊 Total Training Rows: {len(dataset)}")
    print("👉 Next Step: Use this file to fine-tune Qwen using LoRA.")

if __name__ == "__main__":
    main()