"""Schema definitions for distinct legislators output.""" import pyarrow as pa # ============================================================================= # SOURCE CONFIGURATION # ============================================================================= # HuggingFace dataset URL HF_BASE_URL = "https://huggingface.co/datasets/Dustinhax/tyt/resolve/main" VOTEVIEW_MEMBERS_URL = f"{HF_BASE_URL}/voteview/HSall_members.parquet" # Allowed domains for source URLs (SQL injection mitigation) ALLOWED_SOURCE_DOMAINS = [ "huggingface.co", ] def validate_source_url(url: str) -> bool: """Check if source URL is from an allowed domain. Args: url: The URL to validate Returns: True if URL is from an allowed domain """ from urllib.parse import urlparse parsed = urlparse(url) return any(parsed.netloc.endswith(domain) for domain in ALLOWED_SOURCE_DOMAINS) # Congress filter: 96th congress (1979-1980) and later # Congress 96 started January 3, 1979 MIN_CONGRESS = 96 # Congress number to year mapping (approximate - congress starts in odd years) # Congress N covers years (1787 + 2*N) to (1788 + 2*N) # Example: Congress 96 = 1979-1980, Congress 119 = 2025-2026 def congress_to_years(congress: int) -> tuple[int, int]: """Convert congress number to start/end years.""" start_year = 1787 + 2 * congress end_year = start_year + 1 return (start_year, end_year) # ============================================================================= # OUTPUT SCHEMA # ============================================================================= DISTINCT_LEGISLATORS_SCHEMA = pa.schema( [ # Primary identifier pa.field("bioguide_id", pa.string(), nullable=False), # Cross-reference identifier for DIME linkage pa.field("icpsr", pa.int32()), # Links to DIME Recipients via ICPSR # Biographical info (most recent values) pa.field("bioname", pa.string()), pa.field("state_abbrev", pa.string()), pa.field("party_code", pa.float64()), # 100=Democrat, 200=Republican # Congress sessions served pa.field("congresses_served", pa.list_(pa.int16())), pa.field("first_congress", pa.int16()), pa.field("last_congress", pa.int16()), # Ideology scores (most recent values) pa.field("nominate_dim1", pa.float64()), # Economic left-right pa.field("nominate_dim2", pa.float64()), # Social conservatism ] ) DISTINCT_LEGISLATORS_COLUMNS = [ "bioguide_id", "icpsr", "bioname", "state_abbrev", "party_code", "congresses_served", "first_congress", "last_congress", "nominate_dim1", "nominate_dim2", ] # Columns used for validation checksums KEY_COLUMNS = ["bioguide_id", "first_congress", "last_congress"] # ============================================================================= # AGGREGATION SQL # ============================================================================= AGGREGATION_QUERY = """ SELECT bioguide_id, LAST(icpsr ORDER BY congress)::INTEGER as icpsr, LAST(bioname ORDER BY congress) as bioname, LAST(state_abbrev ORDER BY congress) as state_abbrev, LAST(party_code ORDER BY congress) as party_code, LIST(congress ORDER BY congress) as congresses_served, MIN(congress)::SMALLINT as first_congress, MAX(congress)::SMALLINT as last_congress, LAST(nominate_dim1 ORDER BY congress) as nominate_dim1, LAST(nominate_dim2 ORDER BY congress) as nominate_dim2 FROM read_parquet('{source_url}') WHERE congress >= {min_congress} AND bioguide_id IS NOT NULL GROUP BY bioguide_id ORDER BY bioguide_id """ # ============================================================================= # DATA INTERPRETATION NOTES # ============================================================================= DATA_INTERPRETATION = """ ## Data Interpretation Decisions ### Source Data - **Source:** Voteview HSall_members.parquet - **Original structure:** One row per legislator per congress session - **Output structure:** One row per legislator (aggregated) ### Filtering - **Congress filter:** >= 96 (1979-1980 onward) - **Null filter:** bioguide_id IS NOT NULL (excludes ~17 records without bioguide) - These are typically Presidents or historical members without bioguide IDs ### Aggregation Rules | Field | Aggregation | Rationale | |-------|-------------|-----------| | bioguide_id | GROUP BY | Primary key, unique per legislator | | icpsr | LAST by congress | ICPSR identifier for DIME cross-reference | | bioname | LAST by congress | Name format may change; use most recent | | state_abbrev | LAST by congress | Legislators may change states (rare) | | party_code | LAST by congress | Party affiliation may change over career | | congresses_served | LIST ordered | Complete history of all sessions served | | first_congress | MIN | Earliest congress served (career start) | | last_congress | MAX | Latest congress served (current or end) | | nominate_dim1 | LAST by congress | Ideology score from most recent session | | nominate_dim2 | LAST by congress | Ideology score from most recent session | ### Party Codes | Code | Party | |------|-------| | 100 | Democrat | | 200 | Republican | | 328 | Independent | | Other | Historical parties (Whig, Federalist, etc.) | ### NOMINATE Scores - **dim1:** Economic liberalism/conservatism (-1 to +1, negative=liberal) - **dim2:** Social issues/civil rights (-1 to +1, interpretation varies by era) - Scores are session-specific; we keep the most recent for simplicity ### Known Edge Cases 1. **Party switchers:** Uses most recent party (e.g., Arlen Specter shows Democrat) 2. **State changers:** Uses most recent state (rare, but possible) 3. **Gaps in service:** congresses_served array handles non-consecutive terms 4. **Presidents:** Excluded (no bioguide_id in Voteview data) """