File size: 7,783 Bytes
61d29fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
"""
Example: Using platform detection with discovered URLs.

This script demonstrates how to:
1. Load discovered URLs from Silver layer
2. Detect which platform each URL uses
3. Prepare optimized scraping strategies
"""
import asyncio
from pathlib import Path

# Imports from your project
from discovery.platform_detector import detect_platform_async, get_platform_capabilities
from models.meeting_event import MeetingEvent, Classification
from config.settings import settings


async def analyze_discovered_urls(limit: int = 20):
    """
    Analyze discovered URLs and detect which platform each one uses.

    Loads the discovered-URL dataset from the Silver layer, runs platform
    detection on the first ``limit`` rows, prints a per-URL report and then
    a platform distribution summary.

    Args:
        limit: Number of rows, taken from the top of the dataset, to
            analyze. Defaults to 20 (the demo sample size). With 0 the
            report and the summary are simply empty.

    Returns:
        A ``(platform_counts, platform_examples)`` tuple:
        ``platform_counts`` maps platform name -> number of analyzed URLs,
        ``platform_examples`` maps platform name -> dict holding the first
        ``url`` / ``jurisdiction`` / ``state`` seen for that platform.
        Returns ``None`` when the Silver-layer dataset does not exist.
    """
    print("🔍 Analyzing discovered URLs for platform detection...\n")

    # Check if we have discovered URLs
    silver_path = Path(f"{settings.delta_lake_path}/silver/discovered_urls")
    if not silver_path.exists():
        print("❌ No discovered URLs found. Run: python main.py discover-jurisdictions --limit 500")
        return None

    # Load URLs using pandas (simpler than PySpark for small data).
    # Imported here so that merely importing this module does not need pandas.
    import pandas as pd
    df = pd.read_parquet(silver_path, engine='pyarrow')

    rule = "=" * 80
    print(f"📊 Found {len(df)} discovered URLs\n")
    print(rule)

    # Take the sample once: the loop and the percentage denominator must
    # agree on exactly which rows were analyzed.
    sample = df.head(limit)
    analyzed = len(sample)

    platform_counts = {}
    platform_examples = {}

    for _, row in sample.iterrows():
        url = row['url']
        jurisdiction = row['jurisdiction_name']
        state = row['state_code']

        # NOTE(review): fetch_html=False presumably limits detection to the
        # URL itself (no page download) — confirm in platform_detector.
        result = await detect_platform_async(url, fetch_html=False)

        # A falsy platform (None / empty) is reported as 'unknown'.
        platform = result['platform'] or 'unknown'
        confidence = result['confidence']

        # Track counts
        platform_counts[platform] = platform_counts.get(platform, 0) + 1

        # Keep only the first URL seen per platform as its example.
        platform_examples.setdefault(platform, {
            'url': url,
            'jurisdiction': jurisdiction,
            'state': state
        })

        # Display
        status = "✅" if result['scraper_available'] else "⚠️"
        print(f"{status} {jurisdiction}, {state}")
        print(f"   Platform: {platform} (confidence: {confidence:.1%})")
        print(f"   URL: {url}")

        # Show capabilities (nothing to look up for an unrecognized platform)
        if platform != 'unknown':
            caps = get_platform_capabilities(platform)
            if caps.get('has_api'):
                print(f"   🚀 API Available: {caps.get('api_docs', 'Yes')}")
            if caps.get('scraper_class'):
                print(f"   🤖 Scraper: {caps['scraper_class']}")

        print()

    # Summary: most common platform first, one bar character per 5%.
    print(rule)
    print("\n📈 Platform Distribution:\n")
    for platform, count in sorted(platform_counts.items(), key=lambda x: x[1], reverse=True):
        # `analyzed` cannot be 0 here: an empty sample leaves
        # platform_counts empty, so this loop body never runs.
        percentage = (count / analyzed) * 100
        bar = "█" * int(percentage / 5)
        print(f"  {platform:15s} {count:3d} ({percentage:5.1f}%) {bar}")

    print("\n💡 Next Steps:")
    print("  1. Implement scrapers for top platforms")
    print("  2. Test scraping on example URLs")
    print("  3. Add platform info to Silver layer")

    return platform_counts, platform_examples


async def demo_event_creation() -> None:
    """
    Demonstrate creating standardized MeetingEvent objects.

    Builds one example event, attaches agenda/video links, flags it as
    oral-health relevant, and prints a summary plus the first five fields
    of its ``to_dict()`` representation.
    """
    # Imported locally so the function also works when this module is
    # imported: relying on names bound only under the ``__main__`` guard
    # raises NameError for any caller other than the script itself.
    from datetime import datetime
    from models.meeting_event import Location

    print("\n" + "=" * 80)
    print("📅 Demo: Creating Standardized Meeting Events\n")

    # Example 1: From Birmingham
    event1 = MeetingEvent(
        title="Birmingham City Council Regular Meeting",
        description="Regular meeting with discussion of water system improvements",
        classification=Classification.COUNCIL,
        start=datetime(2026, 4, 21, 18, 0),
        location=Location(
            name="City Hall Council Chambers",
            address="710 N 20th Street",
            city="Birmingham",
            state="AL"
        ),
        source="https://birminghamal.gov/meetings",
        jurisdiction_name="Birmingham city",
        state_code="AL"
    )

    # Add documents
    event1.add_link("Agenda", "https://birminghamal.gov/agenda-20260421.pdf")
    event1.add_link("Video Recording", "https://birminghamal.gov/video/20260421")

    # Mark as oral health relevant
    event1.oral_health_relevant = True
    event1.keywords_found = ["water", "fluoridation", "public health"]
    event1.confidence_score = 0.85

    print(f"Event 1: {event1.title}")
    print(f"  Location: {event1.location}")
    print(f"  Has agenda: {event1.has_agenda()}")
    print(f"  Has video: {event1.has_video()}")
    print(f"  Oral health relevant: {event1.oral_health_relevant}")
    print(f"  Keywords: {', '.join(event1.keywords_found)}")
    print(f"  Confidence: {event1.confidence_score:.1%}")

    # Show how it converts to Delta Lake format
    print("\n📝 Delta Lake format (first 5 fields):")
    data_dict = event1.to_dict()
    for key in list(data_dict)[:5]:
        print(f"  {key}: {data_dict[key]}")

    print(f"\n✅ Event ID: {event1.id}")
    # NOTE(review): the message implies the id is derived from source +
    # start time; that derivation lives in MeetingEvent — confirm there.
    print(f"   (Generated from: {event1.source} + {event1.start.isoformat()})")


async def demo_matter_tracking() -> None:
    """
    Demonstrate tracking a policy matter across meetings.

    Builds one example ``Matter`` (a fluoridation ordinance), attaches
    related meeting ids and document links, flags it as health policy, and
    prints a summary of its fields.
    """
    # All local imports in one place, at the top of the function.
    from datetime import datetime
    from models.meeting_event import Link, Matter

    print("\n" + "=" * 80)
    print("📋 Demo: Matter Tracking (Legislative Item Evolution)\n")

    # Track a fluoridation ordinance
    matter = Matter(
        matter_id="BHM-2024-FL001",
        matter_number="Ordinance 2024-045",
        title="Community Water Fluoridation Program Implementation",
        type="Ordinance",
        first_introduced=datetime(2024, 1, 15),
        status="Committee Review"
    )

    # Add related meetings
    matter.related_meetings = [
        "mtg-20240115-council",
        "mtg-20240205-health-committee",
        "mtg-20240220-public-hearing"
    ]

    # Add documents
    matter.related_documents = [
        Link("Original Ordinance", "https://example.gov/ord-2024-045.pdf"),
        Link("Committee Report", "https://example.gov/committee-report.pdf"),
        Link("Public Comments", "https://example.gov/comments.pdf")
    ]

    # Mark as health policy
    matter.is_health_policy = True
    matter.policy_keywords = ["fluoridation", "oral health", "CDC guidelines"]

    print(f"Matter: {matter.title}")
    print(f"  Number: {matter.matter_number}")
    print(f"  Type: {matter.type}")
    print(f"  Status: {matter.status}")
    print(f"  First introduced: {matter.first_introduced.strftime('%B %d, %Y')}")
    print(f"  Related meetings: {len(matter.related_meetings)}")
    print(f"  Documents: {len(matter.related_documents)}")
    print(f"  Health policy: {matter.is_health_policy}")
    print(f"  Keywords: {', '.join(matter.policy_keywords)}")

    print("\n💡 This allows you to:")
    print("  - Track how a policy evolves across multiple meetings")
    print("  - See all related documents in one place")
    print("  - Identify windows of opportunity for advocacy")
    print("  - Monitor voting patterns on health issues")


async def _run_demos() -> None:
    """Run the three demos in order inside a single event loop."""
    await analyze_discovered_urls()
    await demo_event_creation()
    await demo_matter_tracking()


if __name__ == "__main__":
    # These land in the module's global namespace; any demo function that
    # references `datetime` or `Location` without importing them itself
    # resolves them from here when the file is run as a script.
    from datetime import datetime
    from models.meeting_event import Location

    print("🦷 Oral Health Policy Pulse - Platform Integration Demo")
    print("=" * 80)

    # One asyncio.run() call: a single event loop for the whole demo rather
    # than creating and tearing down a fresh loop per demo.
    asyncio.run(_run_demos())

    print("\n" + "=" * 80)
    print("✅ Demo complete!")
    print("\n📖 For more details, see:")
    print("   - docs/INTEGRATION_GUIDE.md")
    print("   - discovery/platform_detector.py")
    print("   - models/meeting_event.py")