File size: 1,764 Bytes
e65ef8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/env python3
"""
Step 1: Convert Parquet to JSONL

Converts the n8n_workflows_templates_dataset.parquet file
to JSONL format for consistency with other datasets.
"""

import pandas as pd
import json
from pathlib import Path

def _json_default(value):
    """Fallback serializer for objects json.dumps cannot handle natively.

    Parquet columns round-trip through pandas as numpy scalars
    (np.int64, np.float32, ...), which raise TypeError in json.dumps.
    ``.item()`` unwraps them to native Python values; anything else
    (e.g. pd.Timestamp) is stringified as a last resort.
    """
    item = getattr(value, 'item', None)
    if callable(item):
        return value.item()
    return str(value)


def convert_parquet_to_jsonl(parquet_file, output_file):
    """Convert a Parquet dataset to JSONL (one JSON object per line).

    Args:
        parquet_file: Path to the input .parquet file.
        output_file: Path of the JSONL file to create (overwritten if present).

    Returns:
        int: The number of rows converted.
    """
    print(f"Loading {parquet_file}...")
    df = pd.read_parquet(parquet_file)
    total = len(df)  # hoisted: was recomputed on every progress tick

    print(f"Loaded {total:,} examples")
    print(f"Columns: {list(df.columns)}")

    print(f"\nConverting to JSONL...")
    with open(output_file, 'w', encoding='utf-8') as f:
        # enumerate() gives a positional counter. The original used the
        # DataFrame index for progress, which miscounts -- or raises
        # TypeError on `% 5000` -- whenever the index is not a clean
        # integer RangeIndex (string index, filtered frame, etc.).
        for pos, (_, row) in enumerate(df.iterrows()):
            if pos % 5000 == 0 and pos > 0:
                print(f"  Converted {pos:,} / {total:,} examples...")

            # ensure_ascii=False keeps non-ASCII workflow text readable
            # (and the file smaller); default=_json_default prevents
            # TypeError on numpy scalar column values.
            f.write(json.dumps(row.to_dict(), ensure_ascii=False,
                               default=_json_default) + '\n')

    print(f"\n✅ Conversion complete!")
    print(f"   Input:  {parquet_file} ({total:,} examples)")
    print(f"   Output: {output_file}")

    # Report the output file size in MB as a sanity check
    output_size = Path(output_file).stat().st_size / (1024 * 1024)
    print(f"   Size:   {output_size:.2f} MB")

    # Validate the output by round-tripping the first line through json
    with open(output_file, 'r', encoding='utf-8') as f:
        sample = json.loads(f.readline())
        print(f"\n📝 Sample structure:")
        print(f"   Fields: {list(sample.keys())}")

    return total


if __name__ == '__main__':
    # Fixed dataset names for this pipeline step; the paths are passed
    # straight through to the converter.
    count = convert_parquet_to_jsonl(
        'n8n_workflows_templates_dataset.parquet',
        'n8n_workflows_templates.jsonl',
    )
    print(f"\n🎉 Successfully converted {count:,} examples to JSONL format!")