#!/usr/bin/env python3
"""
Step 1: Convert Parquet to JSONL

Converts the n8n_workflows_templates_dataset.parquet file to JSONL format
for consistency with other datasets.
"""

import json
from pathlib import Path

import pandas as pd

# A progress line is printed every this many rows.
_PROGRESS_INTERVAL = 5000


def _json_default(value):
    """Convert a value that ``json`` cannot encode natively.

    ``json.dumps`` calls this for any object it does not know how to
    serialize, at any nesting depth, and re-encodes whatever is returned.

    Args:
        value: The object ``json.dumps`` failed to encode.

    Returns:
        ``None`` for pandas missing-value markers (``NaT``/``NA``), the result
        of ``tolist()`` for array-like objects and NumPy scalars, or the result
        of ``isoformat()`` for date/time objects.

    Raises:
        TypeError: If the value is none of the above, so unsupported data is
            reported instead of being silently stringified.
    """
    # Missing-value singletons must be checked first: NaT also has isoformat().
    if value is pd.NaT or value is pd.NA:
        return None
    # NumPy arrays (e.g. Parquet list columns) and NumPy scalars.
    if hasattr(value, "tolist"):
        return value.tolist()
    # Timestamp / datetime / date / time objects.
    if hasattr(value, "isoformat"):
        return value.isoformat()
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


def _write_jsonl(df, output_file):
    """Write every row of *df* to *output_file*, one JSON object per line.

    Args:
        df: DataFrame whose rows are written.
        output_file: Path of the JSONL file to create (overwritten if present).

    Returns:
        The number of rows in *df*.
    """
    total = len(df)
    columns = list(df.columns)

    with open(output_file, 'w', encoding='utf-8') as f:
        # itertuples keeps each column's own dtype; iterrows would build one
        # Series per row and upcast e.g. integers to floats in numeric rows.
        rows = df.itertuples(index=False, name=None)
        # enumerate gives the row position, which is what the progress
        # message needs; the DataFrame index label may not be a counter.
        for position, values in enumerate(rows):
            if position and position % _PROGRESS_INTERVAL == 0:
                print(f" Converted {position:,} / {total:,} examples...")
            record = dict(zip(columns, values))
            f.write(json.dumps(record, default=_json_default) + '\n')

    return total


def convert_parquet_to_jsonl(parquet_file, output_file):
    """Convert Parquet dataset to JSONL format.

    Loads the Parquet file into memory, writes one JSON object per row to
    *output_file*, then prints a short summary (row count, output size and
    the field names of the first record).

    Args:
        parquet_file: Path of the Parquet file to read.
        output_file: Path of the JSONL file to write.

    Returns:
        The number of rows in the dataset.

    Raises:
        TypeError: If a cell holds a value that cannot be represented in JSON.
    """
    print(f"Loading {parquet_file}...")
    df = pd.read_parquet(parquet_file)
    total = len(df)

    print(f"Loaded {total:,} examples")
    print(f"Columns: {list(df.columns)}")

    print("\nConverting to JSONL...")
    _write_jsonl(df, output_file)

    print("\n✅ Conversion complete!")
    print(f" Input: {parquet_file} ({total:,} examples)")
    print(f" Output: {output_file}")

    # Verify file size
    output_size = Path(output_file).stat().st_size / (1024 * 1024)
    print(f" Size: {output_size:.2f} MB")

    # Validate by reading first line; an empty dataset has nothing to parse.
    with open(output_file, 'r', encoding='utf-8') as f:
        first_line = f.readline()
    if first_line.strip():
        sample = json.loads(first_line)
        print("\n📝 Sample structure:")
        print(f" Fields: {list(sample.keys())}")

    return total


if __name__ == '__main__':
    parquet_file = 'n8n_workflows_templates_dataset.parquet'
    output_file = 'n8n_workflows_templates.jsonl'

    count = convert_parquet_to_jsonl(parquet_file, output_file)
    print(f"\n🎉 Successfully converted {count:,} examples to JSONL format!")