Spaces:

Hoecat
/

paper-trail-api

Runtime error

paper-trail-api / scripts /voteview_converter /schema.py

Hoe

Deploying Backend API

b339b93 2 months ago

6.67 kB

	"""Voteview schema definitions with explicit PyArrow types for all file types."""

	from dataclasses import dataclass
	from enum import Enum

	import pyarrow as pa


	class FileType(Enum):
	"""Supported Voteview file types."""

	MEMBERS = "members"
	ROLLCALLS = "rollcalls"
	VOTES = "votes"


	# =============================================================================
	# MEMBERS SCHEMA (22 columns)
	# =============================================================================
	# ICPSR IDs are integers (no leading zeros), so int32 is appropriate
	# Many columns have float notation in CSV (e.g., "0.0") so use float64
	VOTEVIEW_MEMBERS_SCHEMA: dict[str, pa.DataType] = {
	# Integer columns (never have float notation)
	"congress": pa.int16(),
	"icpsr": pa.int32(),
	# Columns with potential float notation in CSV (e.g., "0.0", "4.0")
	"state_icpsr": pa.float64(),
	"district_code": pa.float64(),
	"party_code": pa.float64(),
	# Nullable integer columns stored as float64
	"occupancy": pa.float64(), # 0/1 but nullable
	"last_means": pa.float64(), # nullable
	"nominate_number_of_votes": pa.float64(), # integer counts but nullable
	"nominate_number_of_errors": pa.float64(), # integer counts but nullable
	# Float columns - NOMINATE scores and years
	"born": pa.float64(), # year as float (1754.0)
	"died": pa.float64(), # year as float
	"nominate_dim1": pa.float64(),
	"nominate_dim2": pa.float64(),
	"nominate_log_likelihood": pa.float64(),
	"nominate_geo_mean_probability": pa.float64(),
	"conditional": pa.float64(), # nullable
	"nokken_poole_dim1": pa.float64(),
	"nokken_poole_dim2": pa.float64(),
	# String columns
	"chamber": pa.string(), # House/Senate/President
	"state_abbrev": pa.string(),
	"bioname": pa.string(), # Contains commas: "WASHINGTON, George"
	"bioguide_id": pa.string(), # Alphanumeric ID: B000084
	}

	VOTEVIEW_MEMBERS_COLUMNS = [
	"congress",
	"chamber",
	"icpsr",
	"state_icpsr",
	"district_code",
	"state_abbrev",
	"party_code",
	"occupancy",
	"last_means",
	"bioname",
	"bioguide_id",
	"born",
	"died",
	"nominate_dim1",
	"nominate_dim2",
	"nominate_log_likelihood",
	"nominate_geo_mean_probability",
	"nominate_number_of_votes",
	"nominate_number_of_errors",
	"conditional",
	"nokken_poole_dim1",
	"nokken_poole_dim2",
	]


	# =============================================================================
	# ROLLCALLS SCHEMA (18 columns)
	# =============================================================================
	VOTEVIEW_ROLLCALLS_SCHEMA: dict[str, pa.DataType] = {
	# Integer columns
	"congress": pa.int16(),
	"rollnumber": pa.int32(),
	"yea_count": pa.int32(),
	"nay_count": pa.int32(),
	# Nullable integer columns stored as float64
	"session": pa.float64(), # nullable
	"clerk_rollnumber": pa.float64(), # nullable
	# Float columns - NOMINATE parameters
	"nominate_mid_1": pa.float64(),
	"nominate_mid_2": pa.float64(),
	"nominate_spread_1": pa.float64(),
	"nominate_spread_2": pa.float64(),
	"nominate_log_likelihood": pa.float64(),
	# String columns
	"chamber": pa.string(),
	"date": pa.string(), # ISO format: YYYY-MM-DD
	"bill_number": pa.string(), # HR2, S17, etc.
	"vote_result": pa.string(), # nullable
	"vote_desc": pa.string(), # nullable
	"vote_question": pa.string(), # nullable
	"dtl_desc": pa.string(), # long descriptions
	}

	VOTEVIEW_ROLLCALLS_COLUMNS = [
	"congress",
	"chamber",
	"rollnumber",
	"date",
	"session",
	"clerk_rollnumber",
	"yea_count",
	"nay_count",
	"nominate_mid_1",
	"nominate_mid_2",
	"nominate_spread_1",
	"nominate_spread_2",
	"nominate_log_likelihood",
	"bill_number",
	"vote_result",
	"vote_desc",
	"vote_question",
	"dtl_desc",
	]


	# =============================================================================
	# VOTES SCHEMA (6 columns)
	# =============================================================================
	# This is the large file (~26M rows) - keep schema minimal
	# Some columns have float notation in CSV (e.g., "10713.0")
	VOTEVIEW_VOTES_SCHEMA: dict[str, pa.DataType] = {
	# Integer columns (never have float notation)
	"congress": pa.int16(),
	# Columns with potential float notation in CSV
	"rollnumber": pa.float64(),
	"icpsr": pa.float64(),
	"cast_code": pa.float64(), # 1-9 values but may have ".0" suffix
	# Float columns
	"prob": pa.float64(), # nullable probability
	# String columns
	"chamber": pa.string(),
	}

	VOTEVIEW_VOTES_COLUMNS = [
	"congress",
	"chamber",
	"rollnumber",
	"icpsr",
	"cast_code",
	"prob",
	]


	# =============================================================================
	# FILE TYPE CONFIGURATION
	# =============================================================================
	@dataclass
	class FileTypeConfig:
	"""Configuration for a specific Voteview file type."""

	schema: dict[str, pa.DataType]
	expected_columns: list[str]
	sum_column: str \| None # Column to sum for checksum validation
	key_columns: list[str] # Columns to track non-null counts
	sample_size: int = 1000 # Default sample size for validation


	FILE_TYPE_CONFIGS: dict[FileType, FileTypeConfig] = {
	FileType.MEMBERS: FileTypeConfig(
	schema=VOTEVIEW_MEMBERS_SCHEMA,
	expected_columns=VOTEVIEW_MEMBERS_COLUMNS,
	sum_column="nominate_number_of_votes", # Integer sum
	key_columns=["icpsr", "congress", "chamber", "bioname"],
	sample_size=1000,
	),
	FileType.ROLLCALLS: FileTypeConfig(
	schema=VOTEVIEW_ROLLCALLS_SCHEMA,
	expected_columns=VOTEVIEW_ROLLCALLS_COLUMNS,
	sum_column="yea_count", # Integer sum
	key_columns=["congress", "chamber", "rollnumber", "date"],
	sample_size=1000,
	),
	FileType.VOTES: FileTypeConfig(
	schema=VOTEVIEW_VOTES_SCHEMA,
	expected_columns=VOTEVIEW_VOTES_COLUMNS,
	sum_column="cast_code", # Integer sum (1-9 values)
	key_columns=["congress", "chamber", "rollnumber", "icpsr", "cast_code"],
	sample_size=2000, # Larger sample for 26M row file
	),
	}


	def get_config(file_type: FileType) -> FileTypeConfig:
	"""Get configuration for a file type."""
	return FILE_TYPE_CONFIGS[file_type]


	# Null markers in Voteview CSVs
	NULL_VALUES = ["", "N/A"]