|
|
|
|
|
""" |
|
|
Step 1: Convert Parquet to JSONL

Converts the n8n_workflows_templates_dataset.parquet file
to JSONL format for consistency with other datasets.
|
|
""" |
|
|
|
|
|
import pandas as pd |
|
|
import json |
|
|
from pathlib import Path |
|
|
|
|
|
def _json_default(value):
    """Convert a value ``json.dumps`` cannot encode into a plain Python one.

    Passed as the ``default`` hook of ``json.dumps``, so it is only invoked
    for objects the encoder rejects; values that already serialize are
    written exactly as before.

    Raises:
        TypeError: If the value has no known plain-Python equivalent.
    """
    # NumPy arrays and NumPy scalars both expose ``tolist()``.
    if hasattr(value, 'tolist'):
        return value.tolist()
    # datetime/date objects (and pandas timestamps) expose ``isoformat()``.
    if hasattr(value, 'isoformat'):
        return value.isoformat()
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


def convert_parquet_to_jsonl(parquet_file, output_file):
    """Convert a Parquet dataset to JSONL, one JSON object per row.

    Loads ``parquet_file`` with pandas, writes every row to ``output_file``
    as a single JSON line (UTF-8), and prints progress plus a short summary
    (row count, output size, field names of the first record).

    Args:
        parquet_file: Path of the Parquet file to read.
        output_file: Path of the JSONL file to create; an existing file is
            overwritten.

    Returns:
        The number of rows in the dataset (``0`` for an empty dataset, in
        which case the output file is created empty).

    Raises:
        TypeError: If a cell holds a value that cannot be represented in
            JSON even after the ``_json_default`` conversion.
    """
    print(f"Loading {parquet_file}...")
    df = pd.read_parquet(parquet_file)
    total = len(df)

    print(f"Loaded {total:,} examples")
    print(f"Columns: {list(df.columns)}")

    print("\nConverting to JSONL...")
    with open(output_file, 'w', encoding='utf-8') as f:
        # Count rows by position: the index label yielded by iterrows() is
        # not guaranteed to be a 0-based integer (a Parquet file can carry a
        # stored index), so it must not drive the progress report.
        for position, (_, row) in enumerate(df.iterrows()):
            if position % 5000 == 0 and position > 0:
                print(f"  Converted {position:,} / {total:,} examples...")

            # ``default`` handles cells json cannot encode on its own, e.g.
            # list-valued columns that may be loaded as NumPy arrays.
            f.write(json.dumps(row.to_dict(), default=_json_default) + '\n')

    print("\n✅ Conversion complete!")
    print(f"   Input:  {parquet_file} ({total:,} examples)")
    print(f"   Output: {output_file}")

    output_size = Path(output_file).stat().st_size / (1024 * 1024)
    print(f"   Size:   {output_size:.2f} MB")

    # Read the first record back from disk as a sanity check of the output.
    with open(output_file, 'r', encoding='utf-8') as f:
        first_line = f.readline()

    # An empty dataset produces an empty file; there is no sample to parse.
    if first_line:
        sample = json.loads(first_line)
        print("\n📝 Sample structure:")
        print(f"   Fields: {list(sample.keys())}")

    return total
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: convert the fixed dataset file, resolved relative
    # to the current working directory, and report how many rows were written.
    source_path, target_path = (
        'n8n_workflows_templates_dataset.parquet',
        'n8n_workflows_templates.jsonl',
    )

    converted = convert_parquet_to_jsonl(source_path, target_path)
    print(f"\n🎉 Successfully converted {converted:,} examples to JSONL format!")
|
|
|