Spaces:
Runtime error
Runtime error
File size: 6,065 Bytes
b339b93 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 | """Schema definitions for distinct legislators output."""
import pyarrow as pa
# =============================================================================
# SOURCE CONFIGURATION
# =============================================================================
# HuggingFace dataset URL
HF_BASE_URL = "https://huggingface.co/datasets/Dustinhax/tyt/resolve/main"
VOTEVIEW_MEMBERS_URL = f"{HF_BASE_URL}/voteview/HSall_members.parquet"
# Allowed domains for source URLs (SQL injection mitigation)
ALLOWED_SOURCE_DOMAINS = [
"huggingface.co",
]
def validate_source_url(url: str) -> bool:
"""Check if source URL is from an allowed domain.
Args:
url: The URL to validate
Returns:
True if URL is from an allowed domain
"""
from urllib.parse import urlparse
parsed = urlparse(url)
return any(parsed.netloc.endswith(domain) for domain in ALLOWED_SOURCE_DOMAINS)
# Congress filter: 96th congress (1979-1980) and later
# Congress 96 started January 3, 1979
MIN_CONGRESS = 96
# Congress number to year mapping (approximate - congress starts in odd years)
# Congress N covers years (1787 + 2*N) to (1788 + 2*N)
# Example: Congress 96 = 1979-1980, Congress 119 = 2025-2026
def congress_to_years(congress: int) -> tuple[int, int]:
"""Convert congress number to start/end years."""
start_year = 1787 + 2 * congress
end_year = start_year + 1
return (start_year, end_year)
# =============================================================================
# OUTPUT SCHEMA
# =============================================================================
DISTINCT_LEGISLATORS_SCHEMA = pa.schema(
[
# Primary identifier
pa.field("bioguide_id", pa.string(), nullable=False),
# Cross-reference identifier for DIME linkage
pa.field("icpsr", pa.int32()), # Links to DIME Recipients via ICPSR
# Biographical info (most recent values)
pa.field("bioname", pa.string()),
pa.field("state_abbrev", pa.string()),
pa.field("party_code", pa.float64()), # 100=Democrat, 200=Republican
# Congress sessions served
pa.field("congresses_served", pa.list_(pa.int16())),
pa.field("first_congress", pa.int16()),
pa.field("last_congress", pa.int16()),
# Ideology scores (most recent values)
pa.field("nominate_dim1", pa.float64()), # Economic left-right
pa.field("nominate_dim2", pa.float64()), # Social conservatism
]
)
DISTINCT_LEGISLATORS_COLUMNS = [
"bioguide_id",
"icpsr",
"bioname",
"state_abbrev",
"party_code",
"congresses_served",
"first_congress",
"last_congress",
"nominate_dim1",
"nominate_dim2",
]
# Columns used for validation checksums
KEY_COLUMNS = ["bioguide_id", "first_congress", "last_congress"]
# =============================================================================
# AGGREGATION SQL
# =============================================================================
AGGREGATION_QUERY = """
SELECT
bioguide_id,
LAST(icpsr ORDER BY congress)::INTEGER as icpsr,
LAST(bioname ORDER BY congress) as bioname,
LAST(state_abbrev ORDER BY congress) as state_abbrev,
LAST(party_code ORDER BY congress) as party_code,
LIST(congress ORDER BY congress) as congresses_served,
MIN(congress)::SMALLINT as first_congress,
MAX(congress)::SMALLINT as last_congress,
LAST(nominate_dim1 ORDER BY congress) as nominate_dim1,
LAST(nominate_dim2 ORDER BY congress) as nominate_dim2
FROM read_parquet('{source_url}')
WHERE congress >= {min_congress}
AND bioguide_id IS NOT NULL
GROUP BY bioguide_id
ORDER BY bioguide_id
"""
# =============================================================================
# DATA INTERPRETATION NOTES
# =============================================================================
DATA_INTERPRETATION = """
## Data Interpretation Decisions
### Source Data
- **Source:** Voteview HSall_members.parquet
- **Original structure:** One row per legislator per congress session
- **Output structure:** One row per legislator (aggregated)
### Filtering
- **Congress filter:** >= 96 (1979-1980 onward)
- **Null filter:** bioguide_id IS NOT NULL (excludes ~17 records without bioguide)
- These are typically Presidents or historical members without bioguide IDs
### Aggregation Rules
| Field | Aggregation | Rationale |
|-------|-------------|-----------|
| bioguide_id | GROUP BY | Primary key, unique per legislator |
| icpsr | LAST by congress | ICPSR identifier for DIME cross-reference |
| bioname | LAST by congress | Name format may change; use most recent |
| state_abbrev | LAST by congress | Legislators may change states (rare) |
| party_code | LAST by congress | Party affiliation may change over career |
| congresses_served | LIST ordered | Complete history of all sessions served |
| first_congress | MIN | Earliest congress served (career start) |
| last_congress | MAX | Latest congress served (current or end) |
| nominate_dim1 | LAST by congress | Ideology score from most recent session |
| nominate_dim2 | LAST by congress | Ideology score from most recent session |
### Party Codes
| Code | Party |
|------|-------|
| 100 | Democrat |
| 200 | Republican |
| 328 | Independent |
| Other | Historical parties (Whig, Federalist, etc.) |
### NOMINATE Scores
- **dim1:** Economic liberalism/conservatism (-1 to +1, negative=liberal)
- **dim2:** Social issues/civil rights (-1 to +1, interpretation varies by era)
- Scores are session-specific; we keep the most recent for simplicity
### Known Edge Cases
1. **Party switchers:** Uses most recent party (e.g., Arlen Specter shows Democrat)
2. **State changers:** Uses most recent state (rare, but possible)
3. **Gaps in service:** congresses_served array handles non-consecutive terms
4. **Presidents:** Excluded (no bioguide_id in Voteview data)
"""
|