Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
File size: 4,151 Bytes
61d29fc | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 | #!/usr/bin/env python3
"""
Load bills from existing parquet files into DuckDB
Demonstrates querying parquet files directly without copying
"""
import sys
from pathlib import Path
# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
from scripts.legislative_analysis_intel import DuckDBLegislativeAnalyzer
from loguru import logger
logger.info("π Loading bills from parquet files into DuckDB")
logger.info("=" * 60)
with DuckDBLegislativeAnalyzer() as analyzer:
# Find what parquet files exist
logger.info("π Searching for bill parquet files...")
# Try different patterns
patterns = [
'data/gold/**/*bills*.parquet',
'data/gold/**/*bill*.parquet',
'data/gold/national/bills_*.parquet',
]
found_files = False
for pattern in patterns:
try:
logger.info(f" Trying pattern: {pattern}")
# Test if pattern matches any files
test_query = f"""
SELECT COUNT(*) as file_count
FROM read_parquet('{pattern}', filename=true)
LIMIT 1
"""
result = analyzer.conn.execute(test_query).fetchone()
if result and result[0] > 0:
logger.info(f" β
Found files matching: {pattern}")
# Create bills table from this pattern
analyzer.conn.execute(f"""
CREATE OR REPLACE TABLE bills AS
SELECT * FROM read_parquet('{pattern}')
""")
# Count total bills loaded
count = analyzer.conn.execute("SELECT COUNT(*) FROM bills").fetchone()[0]
logger.info(f" π Loaded {count:,} bills into DuckDB")
found_files = True
break
except Exception as e:
logger.debug(f" β οΈ Pattern {pattern} didn't match: {e}")
continue
if not found_files:
logger.warning("β οΈ No bill parquet files found")
logger.info(" Creating demo bills table instead...")
analyzer.conn.execute("""
CREATE TABLE IF NOT EXISTS bills (
identifier VARCHAR,
title TEXT,
abstract TEXT,
classification VARCHAR,
subject VARCHAR,
from_organization_name VARCHAR,
from_organization_state VARCHAR(2),
updated_at TIMESTAMP
)
""")
demo_bills = [
('HB1234', 'Water Fluoridation Act', 'Requires community water fluoridation', 'bill', 'Health', 'Alabama House', 'AL', '2026-04-01'),
('SB5678', 'Dental Care Access', 'Expands dental coverage for children', 'bill', 'Health', 'Georgia Senate', 'GA', '2026-04-15'),
('HB9012', 'School Health Programs', 'Funds oral health screenings in schools', 'bill', 'Education', 'Massachusetts House', 'MA', '2026-03-20'),
]
analyzer.conn.executemany("INSERT INTO bills VALUES (?, ?, ?, ?, ?, ?, ?, ?)", demo_bills)
logger.info(" β
Created 3 demo bills")
# Now run stats
logger.info("\nπ Analyzing bill statistics...")
stats = analyzer.analyze_bill_statistics()
logger.info("\nπ Results:")
logger.info(f"\nπΊοΈ Top States by Bill Count:")
for state_stat in stats.get('top_states', [])[:10]:
logger.info(f" {state_stat['state']}: {state_stat['count']:,} bills")
if 'top_topics' in stats:
logger.info(f"\nπ Top Topics by Bill Count:")
for topic_stat in stats.get('top_topics', [])[:10]:
logger.info(f" {topic_stat['topic']}: {topic_stat['count']:,} bills")
elif 'top_subjects' in stats:
logger.info(f"\nπ Top Subjects by Bill Count:")
for subject_stat in stats.get('top_subjects', [])[:10]:
logger.info(f" {subject_stat['subject']}: {subject_stat['count']:,} bills")
logger.info("\nβ
Analysis complete!")
|