Spaces:
Runtime error
Runtime error
| """Schema definitions for distinct legislators output.""" | |
| import pyarrow as pa | |
| # ============================================================================= | |
| # SOURCE CONFIGURATION | |
| # ============================================================================= | |
| # HuggingFace dataset URL | |
| HF_BASE_URL = "https://huggingface.co/datasets/Dustinhax/tyt/resolve/main" | |
| VOTEVIEW_MEMBERS_URL = f"{HF_BASE_URL}/voteview/HSall_members.parquet" | |
| # Allowed domains for source URLs (SQL injection mitigation) | |
| ALLOWED_SOURCE_DOMAINS = [ | |
| "huggingface.co", | |
| ] | |
| def validate_source_url(url: str) -> bool: | |
| """Check if source URL is from an allowed domain. | |
| Args: | |
| url: The URL to validate | |
| Returns: | |
| True if URL is from an allowed domain | |
| """ | |
| from urllib.parse import urlparse | |
| parsed = urlparse(url) | |
| return any(parsed.netloc.endswith(domain) for domain in ALLOWED_SOURCE_DOMAINS) | |
| # Congress filter: 96th congress (1979-1980) and later | |
| # Congress 96 started January 3, 1979 | |
| MIN_CONGRESS = 96 | |
| # Congress number to year mapping (approximate - congress starts in odd years) | |
| # Congress N covers years (1787 + 2*N) to (1788 + 2*N) | |
| # Example: Congress 96 = 1979-1980, Congress 119 = 2025-2026 | |
| def congress_to_years(congress: int) -> tuple[int, int]: | |
| """Convert congress number to start/end years.""" | |
| start_year = 1787 + 2 * congress | |
| end_year = start_year + 1 | |
| return (start_year, end_year) | |
| # ============================================================================= | |
| # OUTPUT SCHEMA | |
| # ============================================================================= | |
| DISTINCT_LEGISLATORS_SCHEMA = pa.schema( | |
| [ | |
| # Primary identifier | |
| pa.field("bioguide_id", pa.string(), nullable=False), | |
| # Cross-reference identifier for DIME linkage | |
| pa.field("icpsr", pa.int32()), # Links to DIME Recipients via ICPSR | |
| # Biographical info (most recent values) | |
| pa.field("bioname", pa.string()), | |
| pa.field("state_abbrev", pa.string()), | |
| pa.field("party_code", pa.float64()), # 100=Democrat, 200=Republican | |
| # Congress sessions served | |
| pa.field("congresses_served", pa.list_(pa.int16())), | |
| pa.field("first_congress", pa.int16()), | |
| pa.field("last_congress", pa.int16()), | |
| # Ideology scores (most recent values) | |
| pa.field("nominate_dim1", pa.float64()), # Economic left-right | |
| pa.field("nominate_dim2", pa.float64()), # Social conservatism | |
| ] | |
| ) | |
| DISTINCT_LEGISLATORS_COLUMNS = [ | |
| "bioguide_id", | |
| "icpsr", | |
| "bioname", | |
| "state_abbrev", | |
| "party_code", | |
| "congresses_served", | |
| "first_congress", | |
| "last_congress", | |
| "nominate_dim1", | |
| "nominate_dim2", | |
| ] | |
| # Columns used for validation checksums | |
| KEY_COLUMNS = ["bioguide_id", "first_congress", "last_congress"] | |
| # ============================================================================= | |
| # AGGREGATION SQL | |
| # ============================================================================= | |
| AGGREGATION_QUERY = """ | |
| SELECT | |
| bioguide_id, | |
| LAST(icpsr ORDER BY congress)::INTEGER as icpsr, | |
| LAST(bioname ORDER BY congress) as bioname, | |
| LAST(state_abbrev ORDER BY congress) as state_abbrev, | |
| LAST(party_code ORDER BY congress) as party_code, | |
| LIST(congress ORDER BY congress) as congresses_served, | |
| MIN(congress)::SMALLINT as first_congress, | |
| MAX(congress)::SMALLINT as last_congress, | |
| LAST(nominate_dim1 ORDER BY congress) as nominate_dim1, | |
| LAST(nominate_dim2 ORDER BY congress) as nominate_dim2 | |
| FROM read_parquet('{source_url}') | |
| WHERE congress >= {min_congress} | |
| AND bioguide_id IS NOT NULL | |
| GROUP BY bioguide_id | |
| ORDER BY bioguide_id | |
| """ | |
| # ============================================================================= | |
| # DATA INTERPRETATION NOTES | |
| # ============================================================================= | |
| DATA_INTERPRETATION = """ | |
| ## Data Interpretation Decisions | |
| ### Source Data | |
| - **Source:** Voteview HSall_members.parquet | |
| - **Original structure:** One row per legislator per congress session | |
| - **Output structure:** One row per legislator (aggregated) | |
| ### Filtering | |
| - **Congress filter:** >= 96 (1979-1980 onward) | |
| - **Null filter:** bioguide_id IS NOT NULL (excludes ~17 records without bioguide) | |
| - These are typically Presidents or historical members without bioguide IDs | |
| ### Aggregation Rules | |
| | Field | Aggregation | Rationale | | |
| |-------|-------------|-----------| | |
| | bioguide_id | GROUP BY | Primary key, unique per legislator | | |
| | icpsr | LAST by congress | ICPSR identifier for DIME cross-reference | | |
| | bioname | LAST by congress | Name format may change; use most recent | | |
| | state_abbrev | LAST by congress | Legislators may change states (rare) | | |
| | party_code | LAST by congress | Party affiliation may change over career | | |
| | congresses_served | LIST ordered | Complete history of all sessions served | | |
| | first_congress | MIN | Earliest congress served (career start) | | |
| | last_congress | MAX | Latest congress served (current or end) | | |
| | nominate_dim1 | LAST by congress | Ideology score from most recent session | | |
| | nominate_dim2 | LAST by congress | Ideology score from most recent session | | |
| ### Party Codes | |
| | Code | Party | | |
| |------|-------| | |
| | 100 | Democrat | | |
| | 200 | Republican | | |
| | 328 | Independent | | |
| | Other | Historical parties (Whig, Federalist, etc.) | | |
| ### NOMINATE Scores | |
| - **dim1:** Economic liberalism/conservatism (-1 to +1, negative=liberal) | |
| - **dim2:** Social issues/civil rights (-1 to +1, interpretation varies by era) | |
| - Scores are session-specific; we keep the most recent for simplicity | |
| ### Known Edge Cases | |
| 1. **Party switchers:** Uses most recent party (e.g., Arlen Specter shows Democrat) | |
| 2. **State changers:** Uses most recent state (rare, but possible) | |
| 3. **Gaps in service:** congresses_served array handles non-consecutive terms | |
| 4. **Presidents:** Excluded (no bioguide_id in Voteview data) | |
| """ | |