#!/usr/bin/env python3
"""
Step 1: Convert Parquet to JSONL
Converts the n8n_workflows_templates_dataset.parquet file
to JSONL format for consistency with other datasets.
"""
import pandas as pd
import json
from pathlib import Path
def _json_safe(value):
    """Fallback serializer for json.dumps.

    pandas.read_parquet yields numpy scalars (int64/float64/bool_) and
    pandas.Timestamp values, none of which the stdlib json encoder accepts.
    Convert them to their native-Python / ISO-8601 equivalents; anything
    else falls back to str() so a single odd cell can't abort the run.
    """
    # numpy scalars expose .item() to unbox to the native Python type
    if hasattr(value, 'item'):
        return value.item()
    # pandas.Timestamp / datetime-like objects
    if hasattr(value, 'isoformat'):
        return value.isoformat()
    return str(value)


def convert_parquet_to_jsonl(parquet_file, output_file):
    """Convert a Parquet dataset to JSONL format.

    Reads `parquet_file` into a DataFrame and writes one JSON object per
    row to `output_file`, printing progress along the way.

    Args:
        parquet_file: Path to the input .parquet file.
        output_file: Path of the JSONL file to create (overwritten).

    Returns:
        int: Number of rows converted.
    """
    print(f"Loading {parquet_file}...")
    df = pd.read_parquet(parquet_file)
    total = len(df)
    print(f"Loaded {total:,} examples")
    print(f"Columns: {list(df.columns)}")
    print("\nConverting to JSONL...")
    with open(output_file, 'w', encoding='utf-8') as f:
        for idx, row in df.iterrows():
            if idx % 5000 == 0 and idx > 0:
                print(f"  Converted {idx:,} / {total:,} examples...")
            # ensure_ascii=False keeps non-ASCII text readable instead of
            # \uXXXX-escaping it; default=_json_safe handles numpy/pandas types
            f.write(json.dumps(row.to_dict(), ensure_ascii=False,
                               default=_json_safe) + '\n')
    print("\n✅ Conversion complete!")
    print(f"  Input: {parquet_file} ({total:,} examples)")
    print(f"  Output: {output_file}")
    # Report output size in MiB as a quick sanity check
    output_size = Path(output_file).stat().st_size / (1024 * 1024)
    print(f"  Size: {output_size:.2f} MB")
    # Validate by parsing the first line back
    with open(output_file, 'r', encoding='utf-8') as f:
        first_line = f.readline()
        sample = json.loads(first_line)
        print("\n📝 Sample structure:")
        print(f"  Fields: {list(sample.keys())}")
    return total
def main():
    """Script entry point: convert the n8n workflows dataset to JSONL."""
    source = 'n8n_workflows_templates_dataset.parquet'
    target = 'n8n_workflows_templates.jsonl'
    converted = convert_parquet_to_jsonl(source, target)
    print(f"\n🎉 Successfully converted {converted:,} examples to JSONL format!")


if __name__ == '__main__':
    main()
|