""" |
|
|
Comprehensive Dataset Analysis Tool |
|
|
|
|
|
Analyzes all n8n datasets including JSONL and Parquet formats. |
|
|
Provides detailed statistics, validation, and duplicate detection. |
|
|
""" |
|
|
|
|
|
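
# Usage: run this file directly with Python 3. It scans its own directory
# (Path(__file__).parent) for *.jsonl, dataset_*.json, and *.parquet files,
# so keep it next to the dataset files it should report on.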

import json
import os
from pathlib import Path
from typing import Dict, List, Any, Optional
from collections import defaultdict

try:
    import pandas as pd
    PANDAS_AVAILABLE = True
except ImportError:
    PANDAS_AVAILABLE = False
    print("⚠️ pandas not available - Parquet analysis will be skipped")
    print("   Install with: pip install pandas pyarrow")
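
# PANDAS_AVAILABLE is checked in analyze_parquet() so the JSONL/JSON analysis
# still runs when pandas is missing. Note that pd.read_parquet also needs a
# Parquet engine (pyarrow or fastparquet), hence the install hint above.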


def analyze_jsonl(filepath: Path) -> Dict[str, Any]:
    """Analyze JSONL format dataset."""
    print(f"\n📊 Analyzing: {filepath.name}")
    print(f"   Size: {filepath.stat().st_size / (1024*1024):.2f} MB")

    examples = []
    errors = []

    # Parse line by line so a single malformed record does not abort the file.
    with open(filepath, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                examples.append(json.loads(line))
            except json.JSONDecodeError as e:
                errors.append(f"Line {line_num}: {e}")
                if len(errors) < 5:
                    print(f"   ⚠️ Error on line {line_num}: {e}")

    # Sample the first 100 examples to discover the field schema.
    fields = set()
    if examples:
        for ex in examples[:100]:
            fields.update(ex.keys())

    print(f"   ✅ Valid: {len(examples):,} examples")
    print(f"   📋 Fields: {', '.join(sorted(fields))}")

    return {
        'filename': filepath.name,
        'format': 'JSONL',
        'size_mb': filepath.stat().st_size / (1024*1024),
        'example_count': len(examples),
        'fields': sorted(fields),
        'errors': errors,
        'sample': examples[0] if examples else None
    }


def analyze_json_array(filepath: Path) -> Optional[Dict[str, Any]]:
    """Analyze JSON array format dataset. Returns None if the file is invalid."""
    print(f"\n📊 Analyzing: {filepath.name}")
    print(f"   Size: {filepath.stat().st_size / (1024*1024):.2f} MB")

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if not isinstance(data, list):
            print("   ❌ Not a JSON array!")
            return None

        # Sample the first 100 elements to discover the field schema.
        fields = set()
        if data:
            for ex in data[:100]:
                if isinstance(ex, dict):
                    fields.update(ex.keys())

        print(f"   ✅ Valid: {len(data):,} examples")
        print(f"   📋 Fields: {', '.join(sorted(fields))}")

        return {
            'filename': filepath.name,
            'format': 'JSON Array',
            'size_mb': filepath.stat().st_size / (1024*1024),
            'example_count': len(data),
            'fields': sorted(fields),
            'errors': [],
            'sample': data[0] if data else None
        }
    except Exception as e:
        print(f"   ❌ Error: {e}")
        return None


def analyze_parquet(filepath: Path) -> Optional[Dict[str, Any]]:
    """Analyze Parquet format dataset. Returns None if pandas is unavailable or reading fails."""
    if not PANDAS_AVAILABLE:
        print(f"\n⚠️ Skipping {filepath.name} - pandas not installed")
        return None

    print(f"\n📊 Analyzing: {filepath.name}")
    print(f"   Size: {filepath.stat().st_size / (1024*1024):.2f} MB")

    try:
        df = pd.read_parquet(filepath)

        print(f"   ✅ Valid: {len(df):,} examples")
        print(f"   📋 Columns: {', '.join(df.columns.tolist())}")

        return {
            'filename': filepath.name,
            'format': 'Parquet',
            'size_mb': filepath.stat().st_size / (1024*1024),
            'example_count': len(df),
            'fields': df.columns.tolist(),
            'errors': [],
            'sample': df.iloc[0].to_dict() if len(df) > 0 else None
        }
    except Exception as e:
        print(f"   ❌ Error: {e}")
        return None


def main():
    """Main analysis function."""
    print("=" * 70)
    print("N8N DATASET COLLECTION ANALYSIS")
    print("=" * 70)

    datasets_dir = Path(__file__).parent
    results = []

    # Discover dataset files by extension in the script's own directory.
    jsonl_files = sorted(datasets_dir.glob('*.jsonl'))
    json_files = sorted(datasets_dir.glob('dataset_*.json'))
    parquet_files = sorted(datasets_dir.glob('*.parquet'))

    print("\n📁 Found:")
    print(f"   - {len(jsonl_files)} JSONL files")
    print(f"   - {len(json_files)} JSON files")
    print(f"   - {len(parquet_files)} Parquet files")
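
    # Run each analyzer; a None result means the file could not be read and is skipped.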
    for filepath in jsonl_files:
        result = analyze_jsonl(filepath)
        if result:
            results.append(result)

    for filepath in json_files:
        result = analyze_json_array(filepath)
        if result:
            results.append(result)

    for filepath in parquet_files:
        result = analyze_parquet(filepath)
        if result:
            results.append(result)

    print("\n" + "=" * 70)
    print("COLLECTION SUMMARY")
    print("=" * 70)

    total_examples = sum(r['example_count'] for r in results)
    total_size = sum(r['size_mb'] for r in results)

    print(f"\n📦 Total Datasets: {len(results)}")
    print(f"📊 Total Examples: {total_examples:,}")
    print(f"💾 Total Size: {total_size:.2f} MB ({total_size/1024:.2f} GB)")

    print("\n" + "-" * 70)
    print(f"{'Dataset':<45} {'Format':<12} {'Examples':>12}")
    print("-" * 70)

    for r in sorted(results, key=lambda x: x['example_count'], reverse=True):
        print(f"{r['filename']:<45} {r['format']:<12} {r['example_count']:>12,}")

    print("\n" + "=" * 70)
    print("FIELD ANALYSIS")
    print("=" * 70)
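
    # Count how many of the analyzed datasets expose each field name.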
    field_counts = defaultdict(int)
    for r in results:
        for field in r['fields']:
            field_counts[field] += 1

    print("\nCommon fields across datasets:")
    for field, count in sorted(field_counts.items(), key=lambda x: x[1], reverse=True):
        print(f"   {field:<30} (in {count}/{len(results)} datasets)")

    print("\n" + "=" * 70)
    print("SAMPLE STRUCTURE")
    print("=" * 70)

    # Show the first few fields of one sample record from the first two datasets.
    for r in results[:2]:
        if r['sample']:
            print(f"\n{r['filename']}:")
            print(f"   Fields: {list(r['sample'].keys())}")
            for key in list(r['sample'].keys())[:3]:
                value = str(r['sample'][key])[:100]
                print(f"   {key}: {value}...")

    print("\n" + "=" * 70)


if __name__ == '__main__':
    main()