File size: 2,490 Bytes
b339b93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
"""Command-line interface for DIME converter."""

import argparse
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional, Sequence

from .converter import convert_dime_file
from .exceptions import DIMEConversionError
from .schema import FileType


def detect_file_type(filename: str) -> FileType:
    """Guess the DIME file type from keywords in *filename*.

    Matching is case-insensitive. "recipient" is checked before
    "contributor"; a name containing neither is treated as a
    contributions file.
    """
    lowered = filename.lower()
    # Order matters: the first keyword found in the name decides the type.
    keyword_table = (
        ("recipient", FileType.RECIPIENTS),
        ("contributor", FileType.CONTRIBUTORS),
    )
    for keyword, matched_type in keyword_table:
        if keyword in lowered:
            return matched_type
    return FileType.CONTRIBUTIONS


def _build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the DIME converter command line."""
    parser = argparse.ArgumentParser(
        description="Convert DIME campaign finance CSV.gz files to Parquet format"
    )
    parser.add_argument(
        "source",
        type=Path,
        help="Source CSV.gz file path",
    )
    parser.add_argument(
        "output",
        type=Path,
        help="Output Parquet file path",
    )
    parser.add_argument(
        "-t",
        "--file-type",
        choices=["contributions", "recipients", "contributors"],
        default=None,
        help="Type of DIME file (auto-detected from filename if not specified)",
    )
    parser.add_argument(
        "--no-validate",
        action="store_true",
        help="Skip validation (not recommended)",
    )
    parser.add_argument(
        "--sample-size",
        type=int,
        default=1000,
        help="Number of rows to sample for validation (default: 1000)",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=100_000,
        help="Rows per batch for streaming conversion (default: 100000)",
    )
    return parser


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Run the DIME converter command line and return an exit status.

    Args:
        argv: Command-line arguments, excluding the program name. When
            ``None`` (the default) argparse reads ``sys.argv[1:]``, so
            existing ``main()`` callers behave exactly as before.

    Returns:
        ``0`` when the conversion succeeds, ``1`` when
        ``convert_dime_file`` raises ``DIMEConversionError``.

    Raises:
        SystemExit: Raised by argparse for ``--help`` (status 0) or for
            invalid/missing arguments (status 2).

    Any exception other than ``DIMEConversionError`` raised during the
    conversion is not handled here and propagates to the caller.
    """
    args = _build_parser().parse_args(argv)

    # An explicit --file-type wins; otherwise fall back to the filename.
    if args.file_type:
        file_type = FileType(args.file_type)
    else:
        file_type = detect_file_type(args.source.name)

    print(f"[{datetime.now().isoformat()}] Converting: {args.source.name}")
    print(f"  File type: {file_type.value}")

    try:
        result = convert_dime_file(
            args.source,
            args.output,
            file_type,
            validate=not args.no_validate,
            sample_size=args.sample_size,
            batch_size=args.batch_size,
        )
    except DIMEConversionError as e:
        # Conversion failures are reported on stderr and mapped to status 1.
        print(f"ERROR: {e}", file=sys.stderr)
        return 1

    print(f"[{datetime.now().isoformat()}] SUCCESS: {result.row_count:,} rows")
    return 0


def __main__():
    """Run ``main`` and exit the interpreter with its return value."""
    exit_status = main()
    sys.exit(exit_status)


# Entry point when the module is executed rather than imported: the value
# returned by main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())