File size: 3,175 Bytes
b339b93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
"""Command-line interface for distinct legislators extractor."""

from __future__ import annotations

import argparse
import random
import sys
from datetime import datetime
from pathlib import Path

from .exceptions import DistinctLegislatorsError
from .extractor import extract_distinct_legislators
from .schema import MIN_CONGRESS, VOTEVIEW_MEMBERS_URL


def _build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the distinct legislators CLI."""
    parser = argparse.ArgumentParser(
        description="Extract distinct legislators from Voteview data",
        epilog="""

Examples:

  %(prog)s legislators.parquet

  %(prog)s legislators.parquet --min-congress 100

  %(prog)s legislators.parquet --no-validate

        """,
        # Raw formatter keeps the line breaks of the examples in the epilog.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "output",
        type=Path,
        help="Output Parquet file path",
    )
    parser.add_argument(
        "--source-url",
        type=str,
        default=VOTEVIEW_MEMBERS_URL,
        help="Source parquet URL (default: HuggingFace Voteview)",
    )
    parser.add_argument(
        "--min-congress",
        type=int,
        default=MIN_CONGRESS,
        help=f"Minimum congress number (default: {MIN_CONGRESS} = 1979)",
    )
    parser.add_argument(
        "--no-validate",
        action="store_true",
        help="Skip validation (not recommended)",
    )
    parser.add_argument(
        "--aggregation-sample",
        type=int,
        default=100,
        help="Sample size for aggregation validation (default: 100)",
    )
    parser.add_argument(
        "--deep-sample",
        type=int,
        default=50,
        help="Sample size for deep validation (default: 50)",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Random seed for reproducible validation sampling",
    )
    return parser


def main(argv: list[str] | None = None) -> int:
    """Run the CLI and return a process exit status.

    Args:
        argv: Argument strings to parse, without the program name. When
            ``None`` (the default) argparse falls back to ``sys.argv[1:]``,
            so calling ``main()`` with no arguments behaves as before.

    Returns:
        ``0`` after the extraction and the summary output complete, ``1``
        when a ``DistinctLegislatorsError`` is raised along the way (the
        message is written to stderr).

    Note:
        argparse itself terminates via ``SystemExit`` for ``--help`` and for
        invalid arguments; that is not converted into a return value here.
    """
    args = _build_parser().parse_args(argv)

    # Seed the global stdlib ``random`` generator only when a seed was given.
    # NOTE(review): presumably the validation sampling draws from this
    # generator -- confirm in the extractor.
    seed_given = args.seed is not None
    if seed_given:
        random.seed(args.seed)

    print(f"[{datetime.now().isoformat()}] Extracting distinct legislators")
    print(f"  Min congress: {args.min_congress}")
    if seed_given:
        print(f"  Random seed: {args.seed}")

    try:
        result = extract_distinct_legislators(
            args.output,
            source_url=args.source_url,
            min_congress=args.min_congress,
            validate=not args.no_validate,
            aggregation_sample_size=args.aggregation_sample,
            deep_sample_size=args.deep_sample,
        )

        # The summary stays inside the ``try`` so that a
        # DistinctLegislatorsError raised while reading the result is still
        # reported as an ordinary CLI failure.
        size_kb = result.output_path.stat().st_size / 1024
        print(f"\n[{datetime.now().isoformat()}] SUCCESS")
        print(f"  Output: {result.output_path}")
        print(f"  Size: {size_kb:.1f} KB")
        print(f"  Legislators: {result.output_count:,}")

        if args.no_validate:
            print("  Validation: SKIPPED")
        else:
            print("  Validation: ALL PASSED")

        return 0

    except DistinctLegislatorsError as e:
        print(f"ERROR: {e}", file=sys.stderr)
        return 1


if __name__ == "__main__":
    # Script entry point: use the status returned by main() as the exit code.
    raise SystemExit(main())