# Hoe
# Deploying Backend API
# b339b93
"""Voteview schema definitions with explicit PyArrow types for all file types."""
from dataclasses import dataclass
from enum import Enum
import pyarrow as pa
class FileType(Enum):
    """Enumerates the Voteview file kinds handled by this module.

    Each member's value is the lowercase string name of the file kind.
    """

    MEMBERS = "members"
    ROLLCALLS = "rollcalls"
    VOTES = "votes"
# =============================================================================
# MEMBERS SCHEMA (22 columns)
# =============================================================================
# ICPSR IDs are integers (no leading zeros), so int32 is appropriate
# Many columns have float notation in CSV (e.g., "0.0") so use float64
VOTEVIEW_MEMBERS_SCHEMA: dict[str, pa.DataType] = {
    # -- True integers: the CSV never writes these with a decimal point --
    "congress": pa.int16(),
    "icpsr": pa.int32(),
    # -- Codes that may appear in float notation in the CSV ("0.0", "4.0") --
    "state_icpsr": pa.float64(),
    "district_code": pa.float64(),
    "party_code": pa.float64(),
    # -- Integer-valued but nullable, hence kept as float64 --
    # occupancy holds 0/1; the two nominate_number_* columns hold counts.
    "occupancy": pa.float64(),
    "last_means": pa.float64(),
    "nominate_number_of_votes": pa.float64(),
    "nominate_number_of_errors": pa.float64(),
    # -- Floats: birth/death years (written like 1754.0) and NOMINATE scores --
    "born": pa.float64(),
    "died": pa.float64(),
    "nominate_dim1": pa.float64(),
    "nominate_dim2": pa.float64(),
    "nominate_log_likelihood": pa.float64(),
    "nominate_geo_mean_probability": pa.float64(),
    "conditional": pa.float64(),  # nullable
    "nokken_poole_dim1": pa.float64(),
    "nokken_poole_dim2": pa.float64(),
    # -- Text --
    "chamber": pa.string(),  # House / Senate / President
    "state_abbrev": pa.string(),
    "bioname": pa.string(),  # values contain commas, e.g. "WASHINGTON, George"
    "bioguide_id": pa.string(),  # alphanumeric identifier, e.g. B000084
}
# The 22 member column names, in the order given here. The names are the
# same set as the keys of VOTEVIEW_MEMBERS_SCHEMA.
VOTEVIEW_MEMBERS_COLUMNS: list[str] = [
    # Identification and seat
    "congress",
    "chamber",
    "icpsr",
    "state_icpsr",
    "district_code",
    "state_abbrev",
    "party_code",
    "occupancy",
    "last_means",
    # Biography
    "bioname",
    "bioguide_id",
    "born",
    "died",
    # NOMINATE estimates
    "nominate_dim1",
    "nominate_dim2",
    "nominate_log_likelihood",
    "nominate_geo_mean_probability",
    "nominate_number_of_votes",
    "nominate_number_of_errors",
    "conditional",
    # Nokken-Poole estimates
    "nokken_poole_dim1",
    "nokken_poole_dim2",
]
# =============================================================================
# ROLLCALLS SCHEMA (18 columns)
# =============================================================================
VOTEVIEW_ROLLCALLS_SCHEMA: dict[str, pa.DataType] = {
    # -- Integers --
    "congress": pa.int16(),
    "rollnumber": pa.int32(),
    "yea_count": pa.int32(),
    "nay_count": pa.int32(),
    # -- Integer-valued but nullable, hence kept as float64 --
    "session": pa.float64(),
    "clerk_rollnumber": pa.float64(),
    # -- Floats: NOMINATE parameters --
    "nominate_mid_1": pa.float64(),
    "nominate_mid_2": pa.float64(),
    "nominate_spread_1": pa.float64(),
    "nominate_spread_2": pa.float64(),
    "nominate_log_likelihood": pa.float64(),
    # -- Text --
    "chamber": pa.string(),
    "date": pa.string(),  # kept as text in ISO form, YYYY-MM-DD
    "bill_number": pa.string(),  # e.g. HR2, S17
    "vote_result": pa.string(),  # nullable
    "vote_desc": pa.string(),  # nullable
    "vote_question": pa.string(),  # nullable
    "dtl_desc": pa.string(),  # long free-text description
}
# The 18 rollcall column names, in the order given here. The names are the
# same set as the keys of VOTEVIEW_ROLLCALLS_SCHEMA.
VOTEVIEW_ROLLCALLS_COLUMNS: list[str] = [
    # Identification
    "congress",
    "chamber",
    "rollnumber",
    "date",
    "session",
    "clerk_rollnumber",
    # Tallies
    "yea_count",
    "nay_count",
    # NOMINATE parameters
    "nominate_mid_1",
    "nominate_mid_2",
    "nominate_spread_1",
    "nominate_spread_2",
    "nominate_log_likelihood",
    # Descriptive text
    "bill_number",
    "vote_result",
    "vote_desc",
    "vote_question",
    "dtl_desc",
]
# =============================================================================
# VOTES SCHEMA (6 columns)
# =============================================================================
# This is the large file (~26M rows) - keep schema minimal
# Some columns have float notation in CSV (e.g., "10713.0")
VOTEVIEW_VOTES_SCHEMA: dict[str, pa.DataType] = {
    # -- True integer: never written with a decimal point --
    "congress": pa.int16(),
    # -- Integer-like values that may appear in float notation in the CSV --
    "rollnumber": pa.float64(),
    "icpsr": pa.float64(),
    "cast_code": pa.float64(),  # values 1-9, possibly with a ".0" suffix
    # -- Float --
    "prob": pa.float64(),  # probability; nullable
    # -- Text --
    "chamber": pa.string(),
}
# The 6 vote column names, in the order given here. The names are the same
# set as the keys of VOTEVIEW_VOTES_SCHEMA.
VOTEVIEW_VOTES_COLUMNS: list[str] = [
    # Which rollcall
    "congress",
    "chamber",
    "rollnumber",
    # Who voted, and how
    "icpsr",
    "cast_code",
    "prob",
]
# =============================================================================
# FILE TYPE CONFIGURATION
# =============================================================================
@dataclass
class FileTypeConfig:
    """Settings bundled with the schema of one Voteview file type.

    Attributes:
        schema: Mapping from column name to its PyArrow data type.
        expected_columns: Column names expected for this file type.
        sum_column: Name of the column to sum for checksum validation,
            or ``None``.
        key_columns: Columns whose non-null counts are tracked.
        sample_size: Sample size used for validation; defaults to 1000.
    """

    schema: dict[str, pa.DataType]
    expected_columns: list[str]
    sum_column: str | None
    key_columns: list[str]
    sample_size: int = 1000
# Registry holding one FileTypeConfig per FileType member.
FILE_TYPE_CONFIGS: dict[FileType, FileTypeConfig] = {
    FileType.MEMBERS: FileTypeConfig(
        schema=VOTEVIEW_MEMBERS_SCHEMA,
        expected_columns=VOTEVIEW_MEMBERS_COLUMNS,
        # Checksum column: vote counts, so the sum is integer-valued.
        sum_column="nominate_number_of_votes",
        key_columns=["icpsr", "congress", "chamber", "bioname"],
        sample_size=1000,
    ),
    FileType.ROLLCALLS: FileTypeConfig(
        schema=VOTEVIEW_ROLLCALLS_SCHEMA,
        expected_columns=VOTEVIEW_ROLLCALLS_COLUMNS,
        # Checksum column: integer yea tallies.
        sum_column="yea_count",
        key_columns=["congress", "chamber", "rollnumber", "date"],
        sample_size=1000,
    ),
    FileType.VOTES: FileTypeConfig(
        schema=VOTEVIEW_VOTES_SCHEMA,
        expected_columns=VOTEVIEW_VOTES_COLUMNS,
        # Checksum column: cast codes (values 1-9), so the sum is integer-valued.
        sum_column="cast_code",
        key_columns=["congress", "chamber", "rollnumber", "icpsr", "cast_code"],
        # The votes file is the large one (~26M rows), so sample more rows.
        sample_size=2000,
    ),
}
def get_config(file_type: FileType) -> FileTypeConfig:
    """Look up the configuration registered for ``file_type``.

    Args:
        file_type: The Voteview file type to look up.

    Returns:
        The ``FileTypeConfig`` stored in ``FILE_TYPE_CONFIGS`` under that key.

    Raises:
        KeyError: If ``file_type`` is not a key of ``FILE_TYPE_CONFIGS``.
    """
    config = FILE_TYPE_CONFIGS[file_type]
    return config
# Strings that mark a null value in Voteview CSVs: the empty string and "N/A".
NULL_VALUES: list[str] = ["", "N/A"]