Spaces:
Runtime error
Runtime error
| """Voteview schema definitions with explicit PyArrow types for all file types.""" | |
| from dataclasses import dataclass | |
| from enum import Enum | |
| import pyarrow as pa | |
| class FileType(Enum): | |
| """Supported Voteview file types.""" | |
| MEMBERS = "members" | |
| ROLLCALLS = "rollcalls" | |
| VOTES = "votes" | |
| # ============================================================================= | |
| # MEMBERS SCHEMA (22 columns) | |
| # ============================================================================= | |
| # ICPSR IDs are integers (no leading zeros), so int32 is appropriate | |
| # Many columns have float notation in CSV (e.g., "0.0") so use float64 | |
| VOTEVIEW_MEMBERS_SCHEMA: dict[str, pa.DataType] = { | |
| # Integer columns (never have float notation) | |
| "congress": pa.int16(), | |
| "icpsr": pa.int32(), | |
| # Columns with potential float notation in CSV (e.g., "0.0", "4.0") | |
| "state_icpsr": pa.float64(), | |
| "district_code": pa.float64(), | |
| "party_code": pa.float64(), | |
| # Nullable integer columns stored as float64 | |
| "occupancy": pa.float64(), # 0/1 but nullable | |
| "last_means": pa.float64(), # nullable | |
| "nominate_number_of_votes": pa.float64(), # integer counts but nullable | |
| "nominate_number_of_errors": pa.float64(), # integer counts but nullable | |
| # Float columns - NOMINATE scores and years | |
| "born": pa.float64(), # year as float (1754.0) | |
| "died": pa.float64(), # year as float | |
| "nominate_dim1": pa.float64(), | |
| "nominate_dim2": pa.float64(), | |
| "nominate_log_likelihood": pa.float64(), | |
| "nominate_geo_mean_probability": pa.float64(), | |
| "conditional": pa.float64(), # nullable | |
| "nokken_poole_dim1": pa.float64(), | |
| "nokken_poole_dim2": pa.float64(), | |
| # String columns | |
| "chamber": pa.string(), # House/Senate/President | |
| "state_abbrev": pa.string(), | |
| "bioname": pa.string(), # Contains commas: "WASHINGTON, George" | |
| "bioguide_id": pa.string(), # Alphanumeric ID: B000084 | |
| } | |
| VOTEVIEW_MEMBERS_COLUMNS = [ | |
| "congress", | |
| "chamber", | |
| "icpsr", | |
| "state_icpsr", | |
| "district_code", | |
| "state_abbrev", | |
| "party_code", | |
| "occupancy", | |
| "last_means", | |
| "bioname", | |
| "bioguide_id", | |
| "born", | |
| "died", | |
| "nominate_dim1", | |
| "nominate_dim2", | |
| "nominate_log_likelihood", | |
| "nominate_geo_mean_probability", | |
| "nominate_number_of_votes", | |
| "nominate_number_of_errors", | |
| "conditional", | |
| "nokken_poole_dim1", | |
| "nokken_poole_dim2", | |
| ] | |
| # ============================================================================= | |
| # ROLLCALLS SCHEMA (18 columns) | |
| # ============================================================================= | |
| VOTEVIEW_ROLLCALLS_SCHEMA: dict[str, pa.DataType] = { | |
| # Integer columns | |
| "congress": pa.int16(), | |
| "rollnumber": pa.int32(), | |
| "yea_count": pa.int32(), | |
| "nay_count": pa.int32(), | |
| # Nullable integer columns stored as float64 | |
| "session": pa.float64(), # nullable | |
| "clerk_rollnumber": pa.float64(), # nullable | |
| # Float columns - NOMINATE parameters | |
| "nominate_mid_1": pa.float64(), | |
| "nominate_mid_2": pa.float64(), | |
| "nominate_spread_1": pa.float64(), | |
| "nominate_spread_2": pa.float64(), | |
| "nominate_log_likelihood": pa.float64(), | |
| # String columns | |
| "chamber": pa.string(), | |
| "date": pa.string(), # ISO format: YYYY-MM-DD | |
| "bill_number": pa.string(), # HR2, S17, etc. | |
| "vote_result": pa.string(), # nullable | |
| "vote_desc": pa.string(), # nullable | |
| "vote_question": pa.string(), # nullable | |
| "dtl_desc": pa.string(), # long descriptions | |
| } | |
| VOTEVIEW_ROLLCALLS_COLUMNS = [ | |
| "congress", | |
| "chamber", | |
| "rollnumber", | |
| "date", | |
| "session", | |
| "clerk_rollnumber", | |
| "yea_count", | |
| "nay_count", | |
| "nominate_mid_1", | |
| "nominate_mid_2", | |
| "nominate_spread_1", | |
| "nominate_spread_2", | |
| "nominate_log_likelihood", | |
| "bill_number", | |
| "vote_result", | |
| "vote_desc", | |
| "vote_question", | |
| "dtl_desc", | |
| ] | |
| # ============================================================================= | |
| # VOTES SCHEMA (6 columns) | |
| # ============================================================================= | |
| # This is the large file (~26M rows) - keep schema minimal | |
| # Some columns have float notation in CSV (e.g., "10713.0") | |
| VOTEVIEW_VOTES_SCHEMA: dict[str, pa.DataType] = { | |
| # Integer columns (never have float notation) | |
| "congress": pa.int16(), | |
| # Columns with potential float notation in CSV | |
| "rollnumber": pa.float64(), | |
| "icpsr": pa.float64(), | |
| "cast_code": pa.float64(), # 1-9 values but may have ".0" suffix | |
| # Float columns | |
| "prob": pa.float64(), # nullable probability | |
| # String columns | |
| "chamber": pa.string(), | |
| } | |
| VOTEVIEW_VOTES_COLUMNS = [ | |
| "congress", | |
| "chamber", | |
| "rollnumber", | |
| "icpsr", | |
| "cast_code", | |
| "prob", | |
| ] | |
| # ============================================================================= | |
| # FILE TYPE CONFIGURATION | |
| # ============================================================================= | |
| class FileTypeConfig: | |
| """Configuration for a specific Voteview file type.""" | |
| schema: dict[str, pa.DataType] | |
| expected_columns: list[str] | |
| sum_column: str | None # Column to sum for checksum validation | |
| key_columns: list[str] # Columns to track non-null counts | |
| sample_size: int = 1000 # Default sample size for validation | |
| FILE_TYPE_CONFIGS: dict[FileType, FileTypeConfig] = { | |
| FileType.MEMBERS: FileTypeConfig( | |
| schema=VOTEVIEW_MEMBERS_SCHEMA, | |
| expected_columns=VOTEVIEW_MEMBERS_COLUMNS, | |
| sum_column="nominate_number_of_votes", # Integer sum | |
| key_columns=["icpsr", "congress", "chamber", "bioname"], | |
| sample_size=1000, | |
| ), | |
| FileType.ROLLCALLS: FileTypeConfig( | |
| schema=VOTEVIEW_ROLLCALLS_SCHEMA, | |
| expected_columns=VOTEVIEW_ROLLCALLS_COLUMNS, | |
| sum_column="yea_count", # Integer sum | |
| key_columns=["congress", "chamber", "rollnumber", "date"], | |
| sample_size=1000, | |
| ), | |
| FileType.VOTES: FileTypeConfig( | |
| schema=VOTEVIEW_VOTES_SCHEMA, | |
| expected_columns=VOTEVIEW_VOTES_COLUMNS, | |
| sum_column="cast_code", # Integer sum (1-9 values) | |
| key_columns=["congress", "chamber", "rollnumber", "icpsr", "cast_code"], | |
| sample_size=2000, # Larger sample for 26M row file | |
| ), | |
| } | |
| def get_config(file_type: FileType) -> FileTypeConfig: | |
| """Get configuration for a file type.""" | |
| return FILE_TYPE_CONFIGS[file_type] | |
| # Null markers in Voteview CSVs | |
| NULL_VALUES = ["", "N/A"] | |