File size: 17,910 Bytes
61d29fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
#!/usr/bin/env python3
"""
Intel Arc-Optimized Legislative Analysis System
Uses DuckDB + VSS for fast context injection into LLMs

Hardware: Intel Core Ultra 7 165H with Arc Graphics + NPU + 64GB RAM

Features:
- Fast DuckDB queries for legislative history
- Vector similarity search for relevant testimony
- Intel-optimized inference (IPEX-LLM or OpenVINO)
- Structured extraction: interest groups, lobbyists, positions, tradeoffs

Author: CommunityOne
Date: 2026-04-30
"""

import os
import json
from pathlib import Path
from typing import List, Dict, Any, Optional
from dataclasses import dataclass, asdict
import duckdb
from loguru import logger
import sys

# Configure logging
logger.remove()
logger.add(sys.stderr, level="INFO")

# Paths
PROJECT_ROOT = Path(__file__).parent.parent
DATA_DIR = PROJECT_ROOT / "data"
DUCKDB_PATH = DATA_DIR / "legislative.duckdb"


@dataclass
class InterestGroup:
    """Structured schema for interest group extraction"""
    group_name: str
    lobbyist: Optional[str]
    stance: str  # support, oppose, neutral, conditional
    stance_score: float  # -1.0 (oppose) to +1.0 (support)
    tradeoff_notes: Optional[str]
    testimony_excerpt: str
    bill_id: str
    confidence: float  # 0.0 to 1.0
    
    def to_dict(self) -> Dict[str, Any]:
        return asdict(self)


class DuckDBLegislativeAnalyzer:
    """
    DuckDB-powered legislative analysis optimized for Intel Arc
    
    Why DuckDB?
    - 10-100x faster than Postgres for analytical queries
    - Native Parquet support (your Hugging Face datasets)
    - Embedded (no server needed)
    - Fast context injection for LLMs (thousands of rows in <100ms)
    """
    
    def __init__(self, db_path: Path = DUCKDB_PATH):
        self.db_path = db_path
        self.conn: Optional[duckdb.DuckDBPyConnection] = None
        
    def __enter__(self):
        self.connect()
        return self
    
    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
    
    def connect(self):
        """Connect to DuckDB and install extensions"""
        logger.info(f"๐Ÿ“Š Connecting to DuckDB: {self.db_path}")
        self.conn = duckdb.connect(str(self.db_path))
        
        # Install VSS extension for vector similarity search
        try:
            self.conn.execute("INSTALL vss")
            self.conn.execute("LOAD vss")
            logger.info("โœ… VSS extension loaded")
        except Exception as e:
            logger.warning(f"โš ๏ธ  VSS extension not available: {e}")
        
        # Install Parquet extension
        self.conn.execute("INSTALL parquet")
        self.conn.execute("LOAD parquet")
        logger.info("โœ… Parquet extension loaded")
    
    def close(self):
        """Close connection"""
        if self.conn:
            self.conn.close()
            logger.info("๐Ÿ”Œ DuckDB connection closed")
    
    def create_bills_table(self):
        """Create bills table from Parquet files"""
        logger.info("๐Ÿ“‹ Creating bills table...")
        
        # Read from OpenStates bulk data if available
        bills_parquet = DATA_DIR / "gold" / "national" / "bills_search.parquet"
        
        if not bills_parquet.exists():
            logger.warning(f"โš ๏ธ  Bills parquet not found: {bills_parquet}")
            logger.info("   Creating demo bills table instead...")
            
            # Create demo table with sample data
            self.conn.execute("""
                CREATE TABLE IF NOT EXISTS bills (
                    identifier VARCHAR,
                    title TEXT,
                    abstract TEXT,
                    classification VARCHAR,
                    subject VARCHAR,
                    from_organization_name VARCHAR,
                    from_organization_state VARCHAR(2),
                    updated_at TIMESTAMP
                )
            """)
            
            # Insert demo data
            demo_bills = [
                ('HB1234', 'Water Fluoridation Act', 'Requires community water fluoridation', 'bill', 'Health', 'Alabama House', 'AL', '2026-04-01'),
                ('SB5678', 'Dental Care Access', 'Expands dental coverage for children', 'bill', 'Health', 'Georgia Senate', 'GA', '2026-04-15'),
                ('HB9012', 'School Health Programs', 'Funds oral health screenings in schools', 'bill', 'Education', 'Massachusetts House', 'MA', '2026-03-20'),
            ]
            self.conn.executemany("""
                INSERT INTO bills VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """, demo_bills)
            
            logger.info("โœ… Demo bills table created (3 sample bills)")
            return
        
        # Create table directly from Parquet
        self.conn.execute(f"""
            CREATE OR REPLACE TABLE bills AS 
            SELECT * FROM read_parquet('{bills_parquet}')
        """)
        
        logger.info("โœ… Bills table created")
    
    def create_testimony_table(self):
        """Create testimony table with vector embeddings"""
        logger.info("๐Ÿ“ Creating testimony table...")
        
        # This would be populated from meeting transcripts
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS testimony (
                id INTEGER PRIMARY KEY,
                bill_id VARCHAR,
                speaker_name VARCHAR,
                organization VARCHAR,
                testimony_text TEXT,
                stance VARCHAR,  -- support, oppose, neutral
                timestamp TIMESTAMP,
                embedding FLOAT[384]  -- Sentence transformer embeddings
            )
        """)
        
        logger.info("โœ… Testimony table created")
    
    def create_vector_index(self):
        """Create HNSW index for fast vector similarity search"""
        try:
            self.conn.execute("""
                CREATE INDEX IF NOT EXISTS testimony_vss_idx 
                ON testimony USING HNSW (embedding)
            """)
            logger.info("โœ… Vector index created (HNSW)")
        except Exception as e:
            logger.warning(f"โš ๏ธ  Vector index creation failed: {e}")
    
    def search_similar_testimony(
        self, 
        query_embedding: List[float], 
        limit: int = 10
    ) -> List[Dict[str, Any]]:
        """
        Fast vector similarity search using VSS extension
        
        This is 100-1000x faster than computing similarity in Python
        """
        try:
            result = self.conn.execute(f"""
                SELECT 
                    id,
                    bill_id,
                    speaker_name,
                    organization,
                    testimony_text,
                    stance,
                    array_distance(embedding, ?::FLOAT[384]) as distance
                FROM testimony
                ORDER BY distance ASC
                LIMIT {limit}
            """, [query_embedding]).fetchall()
            
            return [
                {
                    'id': row[0],
                    'bill_id': row[1],
                    'speaker': row[2],
                    'organization': row[3],
                    'text': row[4],
                    'stance': row[5],
                    'similarity': 1.0 - row[6]  # Convert distance to similarity
                }
                for row in result
            ]
        except Exception as e:
            logger.error(f"โŒ Vector search failed: {e}")
            return []
    
    def get_bill_context(self, bill_id: str) -> Dict[str, Any]:
        """
        Fast context retrieval for LLM injection
        
        On Intel Arc + 64GB RAM, this can pull 100+ page bills in <50ms
        """
        result = self.conn.execute("""
            SELECT 
                identifier,
                title,
                abstract,
                classification,
                subject,
                from_organization_name,
                updated_at
            FROM bills
            WHERE identifier = ?
        """, [bill_id]).fetchone()
        
        if not result:
            return {}
        
        return {
            'id': result[0],
            'title': result[1],
            'abstract': result[2],
            'classification': result[3],
            'subject': result[4],
            'sponsor': result[5],
            'updated': result[6]
        }
    
    def get_all_testimony_for_bill(self, bill_id: str) -> List[Dict[str, Any]]:
        """Get all testimony for a bill (for full context window)"""
        result = self.conn.execute("""
            SELECT 
                speaker_name,
                organization,
                testimony_text,
                stance,
                timestamp
            FROM testimony
            WHERE bill_id = ?
            ORDER BY timestamp
        """, [bill_id]).fetchall()
        
        return [
            {
                'speaker': row[0],
                'organization': row[1],
                'text': row[2],
                'stance': row[3],
                'timestamp': row[4]
            }
            for row in result
        ]
    
    def analyze_bill_statistics(self):
        """Fast analytical queries on bill data"""
        stats = {}
        
        # Check if bills table exists
        tables = self.conn.execute("""
            SELECT table_name FROM information_schema.tables 
            WHERE table_schema = 'main' AND table_name = 'bills'
        """).fetchall()
        
        if not tables:
            logger.warning("โš ๏ธ  Bills table not found, skipping statistics")
            return {'top_states': [], 'top_topics': []}
        
        # Check what columns exist
        columns = self.conn.execute("DESCRIBE bills").fetchall()
        col_names = [col[0] for col in columns]
        
        # Adapt query based on available columns
        if 'state' in col_names and 'topic' in col_names and 'total_bills' in col_names:
            # This is bills_map_aggregates format (aggregated data)
            logger.info("   Using aggregated bills format (bills_map_aggregates)")
            
            # Bills by state
            result = self.conn.execute("""
                SELECT state, SUM(total_bills) as count
                FROM bills
                WHERE state IS NOT NULL
                GROUP BY state
                ORDER BY count DESC
                LIMIT 10
            """).fetchall()
            stats['top_states'] = [{'state': r[0], 'count': r[1]} for r in result]
            
            # Bills by topic
            result = self.conn.execute("""
                SELECT topic, SUM(total_bills) as count
                FROM bills
                WHERE topic IS NOT NULL
                GROUP BY topic
                ORDER BY count DESC
                LIMIT 10
            """).fetchall()
            stats['top_topics'] = [{'topic': r[0], 'count': r[1]} for r in result]
            
        elif 'from_organization_state' in col_names and 'subject' in col_names:
            # This is individual bills format (OpenStates schema)
            logger.info("   Using individual bills format (OpenStates schema)")
            
            # Bills by state
            result = self.conn.execute("""
                SELECT from_organization_state, COUNT(*) as count
                FROM bills
                WHERE from_organization_state IS NOT NULL
                GROUP BY from_organization_state
                ORDER BY count DESC
                LIMIT 10
            """).fetchall()
            stats['top_states'] = [{'state': r[0], 'count': r[1]} for r in result]
            
            # Bills by subject
            result = self.conn.execute("""
                SELECT subject, COUNT(*) as count
                FROM bills
                WHERE subject IS NOT NULL
                GROUP BY subject
                ORDER BY count DESC
                LIMIT 10
            """).fetchall()
            stats['top_subjects'] = [{'subject': r[0], 'count': r[1]} for r in result]
        else:
            logger.warning(f"โš ๏ธ  Unknown bills table schema, columns: {col_names[:5]}")
            return {'top_states': [], 'top_topics': []}
        
        return stats


class IntelOptimizedLLM:
    """
    Intel Arc-optimized LLM inference
    
    Uses IPEX-LLM or OpenVINO for maximum performance on Arc GPU + NPU
    """
    
    def __init__(self, model_name: str = "meta-llama/Llama-3.2-3B-Instruct"):
        self.model_name = model_name
        self.model = None
        self.tokenizer = None
        
        # Detect Intel hardware
        self.has_arc = self._detect_arc_gpu()
        logger.info(f"๐ŸŽฎ Intel Arc GPU detected: {self.has_arc}")
    
    def _detect_arc_gpu(self) -> bool:
        """Detect Intel Arc graphics"""
        try:
            import subprocess
            result = subprocess.run(
                ['lspci'], 
                capture_output=True, 
                text=True
            )
            return 'Intel' in result.stdout and 'Arc' in result.stdout
        except:
            return False
    
    def load_model(self, use_openvino: bool = True):
        """
        Load model with Intel optimizations
        
        Options:
        1. OpenVINO: Best for Arc GPU (recommended)
        2. IPEX-LLM: Good for CPU inference
        3. Transformers: Fallback (slower)
        """
        if use_openvino and self.has_arc:
            logger.info("๐Ÿš€ Loading model with OpenVINO (Arc GPU optimized)...")
            try:
                from optimum.intel import OVModelForCausalLM
                from transformers import AutoTokenizer
                
                self.model = OVModelForCausalLM.from_pretrained(
                    self.model_name,
                    export=True,
                    device="GPU"  # Use Arc GPU
                )
                self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
                logger.info("โœ… Model loaded with OpenVINO (GPU)")
                return
            except Exception as e:
                logger.warning(f"โš ๏ธ  OpenVINO failed: {e}, falling back...")
        
        # Fallback to standard transformers
        logger.info("๐Ÿ“ฆ Loading model with transformers...")
        from transformers import AutoModelForCausalLM, AutoTokenizer
        
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            device_map="auto",
            torch_dtype="auto"
        )
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        logger.info("โœ… Model loaded")
    
    def extract_interest_groups(
        self, 
        bill_context: Dict[str, Any],
        testimony: List[Dict[str, Any]]
    ) -> List[InterestGroup]:
        """
        Extract structured interest group data using LLM
        
        On 64GB RAM, we can fit the entire bill + all testimony in one prompt
        """
        if not self.model or not self.tokenizer:
            self.load_model()
        
        # Build prompt
        prompt = self._build_extraction_prompt(bill_context, testimony)
        
        # Run inference
        inputs = self.tokenizer(prompt, return_tensors="pt")
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=2048,
            temperature=0.3,  # Lower for structured extraction
            do_sample=True
        )
        
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Parse JSON response
        try:
            groups_data = json.loads(response.split("```json")[1].split("```")[0])
            return [InterestGroup(**g) for g in groups_data.get('groups', [])]
        except:
            logger.error("โŒ Failed to parse LLM response")
            return []
    
    def _build_extraction_prompt(
        self, 
        bill: Dict[str, Any], 
        testimony: List[Dict[str, Any]]
    ) -> str:
        """Build structured extraction prompt"""
        return f"""You are a legislative analyst. Extract interest group positions from testimony.

BILL: {bill['id']} - {bill['title']}
{bill.get('abstract', '')}

TESTIMONY:
{chr(10).join([f"- {t['speaker']} ({t['organization']}): {t['text'][:200]}..." for t in testimony])}

Extract each group's position in JSON format:
```json
{{
  "groups": [
    {{
      "group_name": "Organization name",
      "lobbyist": "Name if mentioned, else null",
      "stance": "support|oppose|neutral|conditional",
      "stance_score": -1.0 to 1.0,
      "tradeoff_notes": "Any concessions or compromises mentioned",
      "testimony_excerpt": "Key quote showing their position",
      "bill_id": "{bill['id']}",
      "confidence": 0.0 to 1.0
    }}
  ]
}}
```

Focus on:
1. Named organizations and their representatives
2. Explicit support/opposition statements
3. Conditional support ("we support IF...")
4. Tradeoffs or compromises mentioned

Return only valid JSON."""


def main():
    """Demo: Intel-optimized legislative analysis"""
    logger.info("๐Ÿš€ Intel Arc-Optimized Legislative Analysis Demo")
    logger.info("=" * 60)
    
    # Initialize DuckDB analyzer
    with DuckDBLegislativeAnalyzer() as analyzer:
        # Create tables
        analyzer.create_bills_table()
        analyzer.create_testimony_table()
        
        # Show statistics
        logger.info("\n๐Ÿ“Š Bill Statistics:")
        stats = analyzer.analyze_bill_statistics()
        logger.info(f"   Top states: {stats.get('top_states', [])[:5]}")
        logger.info(f"   Top subjects: {stats.get('top_subjects', [])[:5]}")
    
    logger.info("\nโœ… Demo complete!")
    logger.info("\n๐ŸŽฏ Next Steps:")
    logger.info("   1. Load testimony data into DuckDB")
    logger.info("   2. Generate embeddings for vector search")
    logger.info("   3. Run LLM extraction on specific bills")
    logger.info("   4. Export results to JSON/Parquet")


if __name__ == "__main__":
    main()