Hoe
Deploying Backend API
b339b93
"""DIME schema definitions with explicit PyArrow types for all file types."""
from dataclasses import dataclass
from enum import Enum
import pyarrow as pa
class FileType(Enum):
"""Supported DIME file types."""
CONTRIBUTIONS = "contributions"
RECIPIENTS = "recipients"
CONTRIBUTORS = "contributors"
# =============================================================================
# CONTRIBUTIONS SCHEMA (45 columns)
# =============================================================================
# Critical: IDs and codes as strings to preserve leading zeros and prevent float conversion
DIME_CONTRIBUTIONS_SCHEMA: dict[str, pa.DataType] = {
# Integer columns
"cycle": pa.int32(),
"excluded.from.scaling": pa.int32(), # 0/1 flag
# Float columns
"amount": pa.float64(),
"gis.confidence": pa.float64(),
"contributor.cfscore": pa.float64(),
"candidate.cfscore": pa.float64(),
# String columns (all IDs, codes, and text fields)
"transaction.id": pa.string(),
"transaction.type": pa.string(),
"date": pa.string(), # Keep as string - YYYY-MM-DD format
"bonica.cid": pa.string(), # ID - MUST be string
"contributor.name": pa.string(),
"contributor.lname": pa.string(),
"contributor.fname": pa.string(),
"contributor.mname": pa.string(),
"contributor.suffix": pa.string(),
"contributor.title": pa.string(),
"contributor.ffname": pa.string(),
"contributor.type": pa.string(),
"contributor.gender": pa.string(),
"contributor.address": pa.string(),
"contributor.city": pa.string(),
"contributor.state": pa.string(),
"contributor.zipcode": pa.string(), # MUST be string - leading zeros
"contributor.occupation": pa.string(),
"contributor.employer": pa.string(),
"occ.standardized": pa.string(),
"is.corp": pa.string(),
"recipient.name": pa.string(),
"bonica.rid": pa.string(), # ID - MUST be string
"recipient.party": pa.string(),
"recipient.type": pa.string(),
"recipient.state": pa.string(),
"seat": pa.string(),
"election.type": pa.string(),
"latitude": pa.string(), # Stored as string per source format
"longitude": pa.string(), # Stored as string per source format
"contributor.district": pa.string(),
"censustract": pa.string(), # Census tract codes - string is safer
"efec.memo": pa.string(),
"efec.memo2": pa.string(),
"efec.transaction.id.orig": pa.string(),
"bk.ref.transaction.id": pa.string(),
"efec.org.orig": pa.string(),
"efec.comid.orig": pa.string(),
"efec.form.type": pa.string(),
}
DIME_CONTRIBUTIONS_COLUMNS = list(DIME_CONTRIBUTIONS_SCHEMA.keys())
# Backwards compatibility alias
DIME_SCHEMA = DIME_CONTRIBUTIONS_SCHEMA
EXPECTED_COLUMNS = DIME_CONTRIBUTIONS_COLUMNS
# =============================================================================
# RECIPIENTS SCHEMA (66 columns)
# =============================================================================
DIME_RECIPIENTS_SCHEMA: dict[str, pa.DataType] = {
# String columns - IDs and codes
"election": pa.string(),
"bonica.rid": pa.string(), # Primary ID - MUST be string
"bonica.cid": pa.string(), # ID - MUST be string
"name": pa.string(),
"lname": pa.string(),
"ffname": pa.string(),
"fname": pa.string(),
"mname": pa.string(),
"title": pa.string(),
"suffix": pa.string(),
"party": pa.string(),
"state": pa.string(),
"seat": pa.string(),
"district": pa.string(),
"distcyc": pa.string(),
"ico.status": pa.string(),
"cand.gender": pa.string(),
"pwinner": pa.string(),
"gwinner": pa.string(),
"s.elec.stat": pa.string(),
"r.elec.stat": pa.string(),
"fec.cand.status": pa.string(),
"recipient.type": pa.string(),
"igcat": pa.string(),
"comtype": pa.string(),
"ICPSR": pa.string(),
"ICPSR2": pa.string(),
"Cand.ID": pa.string(),
"FEC.ID": pa.string(),
"NID": pa.string(),
"before.switch.ICPSR": pa.string(),
"after.switch.ICPSR": pa.string(),
"party.orig": pa.string(),
"nimsp.party": pa.string(),
"nimsp.candidate.ICO.code": pa.string(),
"nimsp.district": pa.string(),
"nimsp.office": pa.string(),
"nimsp.candidate.status": pa.string(),
"included_in_scaling": pa.string(),
# Integer columns (use float64 to handle nulls gracefully)
"cycle": pa.float64(),
"fecyear": pa.float64(),
"num.givers": pa.float64(),
"num.givers.total": pa.float64(),
# Float columns - scores
"recipient.cfscore": pa.float64(),
"recipient.cfscore.dyn": pa.float64(),
"contributor.cfscore": pa.float64(),
"dwdime": pa.float64(),
"dwnom1": pa.float64(),
"dwnom2": pa.float64(),
"ps.dwnom1": pa.float64(),
"ps.dwnom2": pa.float64(),
"irt.cfscore": pa.float64(),
"composite.score": pa.float64(),
# Float columns - financial
"total.receipts": pa.float64(),
"total.disbursements": pa.float64(),
"total.indiv.contribs": pa.float64(),
"total.unitemized": pa.float64(),
"total.pac.contribs": pa.float64(),
"total.party.contribs": pa.float64(),
"total.contribs.from.candidate": pa.float64(),
"ind.exp.support": pa.float64(),
"ind.exp.oppose": pa.float64(),
# Float columns - vote percentages
"prim.vote.pct": pa.float64(),
"gen.vote.pct": pa.float64(),
"district.pres.vs": pa.float64(),
}
DIME_RECIPIENTS_COLUMNS = list(DIME_RECIPIENTS_SCHEMA.keys())
# =============================================================================
# CONTRIBUTORS SCHEMA (43 columns)
# =============================================================================
# Generate amount columns for each election cycle (1980-2024, even years)
_AMOUNT_YEARS = list(range(1980, 2025, 2)) # 1980, 1982, ..., 2024
DIME_CONTRIBUTORS_SCHEMA: dict[str, pa.DataType] = {
# String columns - IDs
"bonica.cid": pa.string(), # Primary ID - MUST be string
"contributor.type": pa.string(),
"most.recent.contributor.name": pa.string(),
"most.recent.contributor.address": pa.string(),
"most.recent.contributor.city": pa.string(),
"most.recent.contributor.zipcode": pa.string(), # MUST be string - leading zeros
"most.recent.contributor.state": pa.string(),
"most.recent.contributor.occupation": pa.string(),
"most.recent.contributor.employer": pa.string(),
"most.recent.transaction.id": pa.string(),
"most.recent.transaction.date": pa.string(), # Keep as string - YYYY-MM-DD format
"contributor.gender": pa.string(),
"is.corp": pa.string(),
"is.projected": pa.string(),
# Float columns (use float64 to handle nulls)
"num.distinct": pa.float64(),
"most.recent.contributor.latitude": pa.float64(),
"most.recent.contributor.longitude": pa.float64(),
"contributor.cfscore": pa.float64(),
"first_cycle_active": pa.float64(),
"last_cycle_active": pa.float64(),
# Amount columns for each election cycle
**{f"amount.{year}": pa.float64() for year in _AMOUNT_YEARS},
}
DIME_CONTRIBUTORS_COLUMNS = list(DIME_CONTRIBUTORS_SCHEMA.keys())
# =============================================================================
# FILE TYPE CONFIGURATION
# =============================================================================
@dataclass
class FileTypeConfig:
"""Configuration for a specific DIME file type."""
schema: dict[str, pa.DataType]
expected_columns: list[str]
sum_column: str | None # Column to sum for checksum validation
key_columns: list[str] # Columns to track non-null counts
FILE_TYPE_CONFIGS: dict[FileType, FileTypeConfig] = {
FileType.CONTRIBUTIONS: FileTypeConfig(
schema=DIME_CONTRIBUTIONS_SCHEMA,
expected_columns=DIME_CONTRIBUTIONS_COLUMNS,
sum_column="amount",
key_columns=["transaction.id", "bonica.cid", "contributor.name", "amount"],
),
FileType.RECIPIENTS: FileTypeConfig(
schema=DIME_RECIPIENTS_SCHEMA,
expected_columns=DIME_RECIPIENTS_COLUMNS,
sum_column="recipient.cfscore",
key_columns=["bonica.rid", "bonica.cid", "name"],
),
FileType.CONTRIBUTORS: FileTypeConfig(
schema=DIME_CONTRIBUTORS_SCHEMA,
expected_columns=DIME_CONTRIBUTORS_COLUMNS,
sum_column="contributor.cfscore",
key_columns=["bonica.cid", "most.recent.contributor.name"],
),
}
def get_config(file_type: FileType) -> FileTypeConfig:
"""Get configuration for a file type."""
return FILE_TYPE_CONFIGS[file_type]
# MySQL export null marker and empty string
NULL_VALUES = ["\\N", ""]