#!/usr/bin/env python3 """ Load bills from existing parquet files into DuckDB Demonstrates querying parquet files directly without copying """ import sys from pathlib import Path # Add project root to path project_root = Path(__file__).parent.parent sys.path.insert(0, str(project_root)) from scripts.legislative_analysis_intel import DuckDBLegislativeAnalyzer from loguru import logger logger.info("πŸš€ Loading bills from parquet files into DuckDB") logger.info("=" * 60) with DuckDBLegislativeAnalyzer() as analyzer: # Find what parquet files exist logger.info("πŸ“ Searching for bill parquet files...") # Try different patterns patterns = [ 'data/gold/**/*bills*.parquet', 'data/gold/**/*bill*.parquet', 'data/gold/national/bills_*.parquet', ] found_files = False for pattern in patterns: try: logger.info(f" Trying pattern: {pattern}") # Test if pattern matches any files test_query = f""" SELECT COUNT(*) as file_count FROM read_parquet('{pattern}', filename=true) LIMIT 1 """ result = analyzer.conn.execute(test_query).fetchone() if result and result[0] > 0: logger.info(f" βœ… Found files matching: {pattern}") # Create bills table from this pattern analyzer.conn.execute(f""" CREATE OR REPLACE TABLE bills AS SELECT * FROM read_parquet('{pattern}') """) # Count total bills loaded count = analyzer.conn.execute("SELECT COUNT(*) FROM bills").fetchone()[0] logger.info(f" πŸ“Š Loaded {count:,} bills into DuckDB") found_files = True break except Exception as e: logger.debug(f" ⚠️ Pattern {pattern} didn't match: {e}") continue if not found_files: logger.warning("⚠️ No bill parquet files found") logger.info(" Creating demo bills table instead...") analyzer.conn.execute(""" CREATE TABLE IF NOT EXISTS bills ( identifier VARCHAR, title TEXT, abstract TEXT, classification VARCHAR, subject VARCHAR, from_organization_name VARCHAR, from_organization_state VARCHAR(2), updated_at TIMESTAMP ) """) demo_bills = [ ('HB1234', 'Water Fluoridation Act', 'Requires community water fluoridation', 'bill', 'Health', 'Alabama House', 'AL', '2026-04-01'), ('SB5678', 'Dental Care Access', 'Expands dental coverage for children', 'bill', 'Health', 'Georgia Senate', 'GA', '2026-04-15'), ('HB9012', 'School Health Programs', 'Funds oral health screenings in schools', 'bill', 'Education', 'Massachusetts House', 'MA', '2026-03-20'), ] analyzer.conn.executemany("INSERT INTO bills VALUES (?, ?, ?, ?, ?, ?, ?, ?)", demo_bills) logger.info(" βœ… Created 3 demo bills") # Now run stats logger.info("\nπŸ“Š Analyzing bill statistics...") stats = analyzer.analyze_bill_statistics() logger.info("\nπŸ“ˆ Results:") logger.info(f"\nπŸ—ΊοΈ Top States by Bill Count:") for state_stat in stats.get('top_states', [])[:10]: logger.info(f" {state_stat['state']}: {state_stat['count']:,} bills") if 'top_topics' in stats: logger.info(f"\nπŸ“‹ Top Topics by Bill Count:") for topic_stat in stats.get('top_topics', [])[:10]: logger.info(f" {topic_stat['topic']}: {topic_stat['count']:,} bills") elif 'top_subjects' in stats: logger.info(f"\nπŸ“‹ Top Subjects by Bill Count:") for subject_stat in stats.get('top_subjects', [])[:10]: logger.info(f" {subject_stat['subject']}: {subject_stat['count']:,} bills") logger.info("\nβœ… Analysis complete!")