Spaces:

Hoecat
/

paper-trail-api

Runtime error

paper-trail-api / scripts /dime_converter /schema.py

Hoe

Deploying Backend API

b339b93 2 months ago

8.62 kB

	"""DIME schema definitions with explicit PyArrow types for all file types."""

	from dataclasses import dataclass
	from enum import Enum

	import pyarrow as pa


	class FileType(Enum):
	    """Supported DIME file types.

	    Each member's value is the lowercase name of the file type, so a member
	    can be recovered from that string with ``FileType("contributions")``.
	    """

	    CONTRIBUTIONS = "contributions"
	    RECIPIENTS = "recipients"
	    CONTRIBUTORS = "contributors"


	# =============================================================================
	# CONTRIBUTIONS SCHEMA (45 columns)
	# =============================================================================
	# IDs and codes are typed as strings so leading zeros are preserved and values
	# are never converted to float. The column list below is derived from the
	# mapping's key order, so the grouping order here is significant.
	DIME_CONTRIBUTIONS_SCHEMA: dict[str, pa.DataType] = {
	    # Integer columns
	    **dict.fromkeys(
	        (
	            "cycle",
	            "excluded.from.scaling",  # 0/1 flag
	        ),
	        pa.int32(),
	    ),
	    # Float columns
	    **dict.fromkeys(
	        (
	            "amount",
	            "gis.confidence",
	            "contributor.cfscore",
	            "candidate.cfscore",
	        ),
	        pa.float64(),
	    ),
	    # String columns (every ID, code, and free-text field)
	    **dict.fromkeys(
	        (
	            "transaction.id",
	            "transaction.type",
	            "date",  # Kept as string - YYYY-MM-DD format
	            "bonica.cid",  # ID - MUST be string
	            "contributor.name",
	            "contributor.lname",
	            "contributor.fname",
	            "contributor.mname",
	            "contributor.suffix",
	            "contributor.title",
	            "contributor.ffname",
	            "contributor.type",
	            "contributor.gender",
	            "contributor.address",
	            "contributor.city",
	            "contributor.state",
	            "contributor.zipcode",  # MUST be string - leading zeros
	            "contributor.occupation",
	            "contributor.employer",
	            "occ.standardized",
	            "is.corp",
	            "recipient.name",
	            "bonica.rid",  # ID - MUST be string
	            "recipient.party",
	            "recipient.type",
	            "recipient.state",
	            "seat",
	            "election.type",
	            "latitude",  # Stored as string per source format
	            "longitude",  # Stored as string per source format
	            "contributor.district",
	            "censustract",  # Census tract codes - string is safer
	            "efec.memo",
	            "efec.memo2",
	            "efec.transaction.id.orig",
	            "bk.ref.transaction.id",
	            "efec.org.orig",
	            "efec.comid.orig",
	            "efec.form.type",
	        ),
	        pa.string(),
	    ),
	}

	# Column names in schema (insertion) order
	DIME_CONTRIBUTIONS_COLUMNS = [*DIME_CONTRIBUTIONS_SCHEMA]

	# Legacy names kept as aliases for backwards compatibility
	DIME_SCHEMA = DIME_CONTRIBUTIONS_SCHEMA
	EXPECTED_COLUMNS = DIME_CONTRIBUTIONS_COLUMNS


	# =============================================================================
	# RECIPIENTS SCHEMA (65 columns defined)
	# =============================================================================
	# NOTE(review): this section header previously said 66 columns, but the mapping
	# defines 65 keys - confirm against the source file whether a column is missing.
	DIME_RECIPIENTS_SCHEMA: dict[str, pa.DataType] = {
	    # String columns - IDs, codes, and text fields
	    **dict.fromkeys(
	        (
	            "election",
	            "bonica.rid",  # Primary ID - MUST be string
	            "bonica.cid",  # ID - MUST be string
	            "name",
	            "lname",
	            "ffname",
	            "fname",
	            "mname",
	            "title",
	            "suffix",
	            "party",
	            "state",
	            "seat",
	            "district",
	            "distcyc",
	            "ico.status",
	            "cand.gender",
	            "pwinner",
	            "gwinner",
	            "s.elec.stat",
	            "r.elec.stat",
	            "fec.cand.status",
	            "recipient.type",
	            "igcat",
	            "comtype",
	            "ICPSR",
	            "ICPSR2",
	            "Cand.ID",
	            "FEC.ID",
	            "NID",
	            "before.switch.ICPSR",
	            "after.switch.ICPSR",
	            "party.orig",
	            "nimsp.party",
	            "nimsp.candidate.ICO.code",
	            "nimsp.district",
	            "nimsp.office",
	            "nimsp.candidate.status",
	            "included_in_scaling",
	        ),
	        pa.string(),
	    ),
	    # Integer-valued columns (typed float64 so nulls are handled gracefully)
	    **dict.fromkeys(
	        (
	            "cycle",
	            "fecyear",
	            "num.givers",
	            "num.givers.total",
	        ),
	        pa.float64(),
	    ),
	    # Float columns - scores
	    **dict.fromkeys(
	        (
	            "recipient.cfscore",
	            "recipient.cfscore.dyn",
	            "contributor.cfscore",
	            "dwdime",
	            "dwnom1",
	            "dwnom2",
	            "ps.dwnom1",
	            "ps.dwnom2",
	            "irt.cfscore",
	            "composite.score",
	        ),
	        pa.float64(),
	    ),
	    # Float columns - financial
	    **dict.fromkeys(
	        (
	            "total.receipts",
	            "total.disbursements",
	            "total.indiv.contribs",
	            "total.unitemized",
	            "total.pac.contribs",
	            "total.party.contribs",
	            "total.contribs.from.candidate",
	            "ind.exp.support",
	            "ind.exp.oppose",
	        ),
	        pa.float64(),
	    ),
	    # Float columns - vote percentages
	    **dict.fromkeys(
	        (
	            "prim.vote.pct",
	            "gen.vote.pct",
	            "district.pres.vs",
	        ),
	        pa.float64(),
	    ),
	}

	# Column names in schema (insertion) order
	DIME_RECIPIENTS_COLUMNS = [*DIME_RECIPIENTS_SCHEMA]


	# =============================================================================
	# CONTRIBUTORS SCHEMA (43 columns)
	# =============================================================================
	# One amount column exists per election cycle: even years 1980 through 2024.
	_AMOUNT_YEARS = [*range(1980, 2025, 2)]  # 1980, 1982, ..., 2024

	DIME_CONTRIBUTORS_SCHEMA: dict[str, pa.DataType] = {
	    # String columns - IDs and text fields
	    **dict.fromkeys(
	        (
	            "bonica.cid",  # Primary ID - MUST be string
	            "contributor.type",
	            "most.recent.contributor.name",
	            "most.recent.contributor.address",
	            "most.recent.contributor.city",
	            "most.recent.contributor.zipcode",  # MUST be string - leading zeros
	            "most.recent.contributor.state",
	            "most.recent.contributor.occupation",
	            "most.recent.contributor.employer",
	            "most.recent.transaction.id",
	            "most.recent.transaction.date",  # Kept as string - YYYY-MM-DD format
	            "contributor.gender",
	            "is.corp",
	            "is.projected",
	        ),
	        pa.string(),
	    ),
	    # Float columns (float64 so nulls are handled)
	    **dict.fromkeys(
	        (
	            "num.distinct",
	            "most.recent.contributor.latitude",
	            "most.recent.contributor.longitude",
	            "contributor.cfscore",
	            "first_cycle_active",
	            "last_cycle_active",
	        ),
	        pa.float64(),
	    ),
	    # Amount columns, one per election cycle: "amount.1980" ... "amount.2024"
	    **dict.fromkeys(
	        (f"amount.{year}" for year in _AMOUNT_YEARS),
	        pa.float64(),
	    ),
	}

	# Column names in schema (insertion) order
	DIME_CONTRIBUTORS_COLUMNS = [*DIME_CONTRIBUTORS_SCHEMA]


	# =============================================================================
	# FILE TYPE CONFIGURATION
	# =============================================================================
	@dataclass
	class FileTypeConfig:
	    """Configuration for a specific DIME file type.

	    Attributes:
	        schema: Mapping of column name to its PyArrow data type.
	        expected_columns: Column names expected for this file type.
	        sum_column: Column to sum for checksum validation, or ``None``.
	        key_columns: Columns whose non-null counts are tracked.
	    """

	    schema: dict[str, pa.DataType]
	    expected_columns: list[str]
	    sum_column: str | None  # Column to sum for checksum validation
	    key_columns: list[str]  # Columns to track non-null counts


	# Registry of per-file-type settings, built from one row per file type:
	# (file type, schema, expected columns, checksum column, tracked key columns).
	FILE_TYPE_CONFIGS: dict[FileType, FileTypeConfig] = {
	    kind: FileTypeConfig(
	        schema=column_types,
	        expected_columns=column_names,
	        sum_column=checksum_column,
	        key_columns=tracked_columns,
	    )
	    for kind, column_types, column_names, checksum_column, tracked_columns in (
	        (
	            FileType.CONTRIBUTIONS,
	            DIME_CONTRIBUTIONS_SCHEMA,
	            DIME_CONTRIBUTIONS_COLUMNS,
	            "amount",
	            ["transaction.id", "bonica.cid", "contributor.name", "amount"],
	        ),
	        (
	            FileType.RECIPIENTS,
	            DIME_RECIPIENTS_SCHEMA,
	            DIME_RECIPIENTS_COLUMNS,
	            "recipient.cfscore",
	            ["bonica.rid", "bonica.cid", "name"],
	        ),
	        (
	            FileType.CONTRIBUTORS,
	            DIME_CONTRIBUTORS_SCHEMA,
	            DIME_CONTRIBUTORS_COLUMNS,
	            "contributor.cfscore",
	            ["bonica.cid", "most.recent.contributor.name"],
	        ),
	    )
	}


	def get_config(file_type: FileType) -> FileTypeConfig:
	    """Get configuration for a file type.

	    Args:
	        file_type: The DIME file type to look up.

	    Returns:
	        The ``FileTypeConfig`` stored in ``FILE_TYPE_CONFIGS`` for that type.

	    Raises:
	        KeyError: If ``file_type`` has no entry in ``FILE_TYPE_CONFIGS``.
	    """
	    return FILE_TYPE_CONFIGS[file_type]


	# Values treated as null: the MySQL export null marker (backslash + N)
	# and the empty string
	NULL_VALUES = [r"\N", ""]