"""
Example: Using platform detection with discovered URLs.

This script demonstrates how to:
1. Load discovered URLs from the Silver layer
2. Detect which platform each URL uses
3. Prepare optimized scraping strategies
"""
import asyncio
from pathlib import Path
# Imports from your project
from discovery.platform_detector import detect_platform_async, get_platform_capabilities
from models.meeting_event import MeetingEvent, Classification
from config.settings import settings
async def analyze_discovered_urls(limit: int = 20):
    """
    Analyze discovered URLs and detect their platforms.

    Reads the dataset at ``{settings.delta_lake_path}/silver/discovered_urls``
    with pandas, runs ``detect_platform_async`` on the first ``limit`` rows,
    prints a per-URL report and then a platform distribution summary.

    Args:
        limit: Maximum number of rows to analyze. Defaults to 20, the size
            the demo originally hard-coded.

    Returns:
        ``None`` when the dataset path does not exist. Otherwise a tuple
        ``(platform_counts, platform_examples)``: ``platform_counts`` maps a
        platform name to the number of analyzed URLs detected as that
        platform, and ``platform_examples`` maps a platform name to the first
        row seen for it (keys ``url``, ``jurisdiction``, ``state``).
    """
    # NOTE(review): the leading "π" / "β" glyphs in several messages below look
    # like mis-decoded emoji. They are left verbatim because the intended
    # characters cannot be confirmed from this file. Only the status markers,
    # whose literal had been split across two lines, were restored.
    print("π Analyzing discovered URLs for platform detection...\n")

    # Check if we have discovered URLs
    silver_path = Path(f"{settings.delta_lake_path}/silver/discovered_urls")
    if not silver_path.exists():
        print("β No discovered URLs found. Run: python main.py discover-jurisdictions --limit 500")
        return

    # Load URLs using pandas (simpler than PySpark for small data)
    import pandas as pd

    df = pd.read_parquet(silver_path, engine='pyarrow')
    print(f"π Found {len(df)} discovered URLs\n")
    print("=" * 80)

    # Slice once so the loop and the percentage maths share the same sample.
    sample = df.head(limit)
    analyzed = len(sample)

    platform_counts = {}
    platform_examples = {}

    for _, row in sample.iterrows():
        url = row['url']
        jurisdiction = row['jurisdiction_name']
        state = row['state_code']

        # Detect platform (async for thorough detection)
        result = await detect_platform_async(url, fetch_html=False)
        platform = result['platform'] or 'unknown'
        confidence = result['confidence']

        # Track counts
        platform_counts[platform] = platform_counts.get(platform, 0) + 1

        # Keep only the first example seen for each platform
        if platform not in platform_examples:
            platform_examples[platform] = {
                'url': url,
                'jurisdiction': jurisdiction,
                'state': state
            }

        # Display
        status = "✅" if result['scraper_available'] else "⚠️"
        print(f"{status} {jurisdiction}, {state}")
        print(f" Platform: {platform} (confidence: {confidence:.1%})")
        print(f" URL: {url}")

        # Show capabilities
        if platform != 'unknown':
            caps = get_platform_capabilities(platform)
            if caps.get('has_api'):
                print(f" π API Available: {caps.get('api_docs', 'Yes')}")
            if caps.get('scraper_class'):
                print(f" π€ Scraper: {caps['scraper_class']}")
        print()

    # Summary
    print("=" * 80)
    print("\nπ Platform Distribution:\n")
    # platform_counts is empty when no rows were analyzed, so the division
    # below is never reached with analyzed == 0.
    for platform, count in sorted(platform_counts.items(), key=lambda x: x[1], reverse=True):
        percentage = (count / analyzed) * 100
        bar = "β" * int(percentage / 5)  # one bar cell per 5 percentage points
        print(f" {platform:15s} {count:3d} ({percentage:5.1f}%) {bar}")

    print("\nπ‘ Next Steps:")
    print(" 1. Implement scrapers for top platforms")
    print(" 2. Test scraping on example URLs")
    print(" 3. Add platform info to Silver layer")

    return platform_counts, platform_examples
async def demo_event_creation():
    """
    Demonstrate creating standardized MeetingEvent objects.

    Builds one example ``MeetingEvent``, attaches two links, sets the oral
    health fields, and prints a summary plus the first five entries of
    ``to_dict()``. Returns nothing.
    """
    # Imported locally so the function also works when this module is
    # imported, not only when it is run as a script: these names were
    # previously bound only inside the ``__main__`` guard.
    from datetime import datetime
    from models.meeting_event import Location

    print("\n" + "=" * 80)
    print("📅 Demo: Creating Standardized Meeting Events\n")

    # Example 1: From Birmingham
    event1 = MeetingEvent(
        title="Birmingham City Council Regular Meeting",
        description="Regular meeting with discussion of water system improvements",
        classification=Classification.COUNCIL,
        start=datetime(2026, 4, 21, 18, 0),
        location=Location(
            name="City Hall Council Chambers",
            address="710 N 20th Street",
            city="Birmingham",
            state="AL"
        ),
        source="https://birminghamal.gov/meetings",
        jurisdiction_name="Birmingham city",
        state_code="AL"
    )

    # Add documents
    event1.add_link("Agenda", "https://birminghamal.gov/agenda-20260421.pdf")
    event1.add_link("Video Recording", "https://birminghamal.gov/video/20260421")

    # Mark as oral health relevant
    event1.oral_health_relevant = True
    event1.keywords_found = ["water", "fluoridation", "public health"]
    event1.confidence_score = 0.85

    print(f"Event 1: {event1.title}")
    print(f" Location: {event1.location}")
    print(f" Has agenda: {event1.has_agenda()}")
    print(f" Has video: {event1.has_video()}")
    print(f" Oral health relevant: {event1.oral_health_relevant}")
    print(f" Keywords: {', '.join(event1.keywords_found)}")
    print(f" Confidence: {event1.confidence_score:.1%}")

    # Show how it converts to Delta Lake format
    # NOTE(review): the "π" glyph below looks like a mis-decoded emoji; left
    # verbatim because the intended character cannot be confirmed.
    print("\nπ Delta Lake format (first 5 fields):")
    data_dict = event1.to_dict()
    for key in list(data_dict)[:5]:
        print(f" {key}: {data_dict[key]}")

    print(f"\n✅ Event ID: {event1.id}")
    print(f" (Generated from: {event1.source} + {event1.start.isoformat()})")
async def demo_matter_tracking():
    """
    Show how a single policy matter is followed across several meetings.

    Creates an example ``Matter`` for a fluoridation ordinance, attaches
    related meeting ids and document links, flags it as health policy, and
    prints a summary. Returns nothing.
    """
    from models.meeting_event import Matter
    from datetime import datetime
    from models.meeting_event import Link

    print(f"\n{'=' * 80}")
    print("π Demo: Matter Tracking (Legislative Item Evolution)\n")

    # The ordinance being followed
    ordinance = Matter(
        matter_id="BHM-2024-FL001",
        matter_number="Ordinance 2024-045",
        title="Community Water Fluoridation Program Implementation",
        type="Ordinance",
        first_introduced=datetime(2024, 1, 15),
        status="Committee Review",
    )

    # Meetings at which the ordinance came up
    ordinance.related_meetings = [
        "mtg-20240115-council",
        "mtg-20240205-health-committee",
        "mtg-20240220-public-hearing",
    ]

    # Supporting documents, given as (label, url) pairs
    document_sources = (
        ("Original Ordinance", "https://example.gov/ord-2024-045.pdf"),
        ("Committee Report", "https://example.gov/committee-report.pdf"),
        ("Public Comments", "https://example.gov/comments.pdf"),
    )
    ordinance.related_documents = [
        Link(label, href) for label, href in document_sources
    ]

    # Health-policy tagging
    ordinance.is_health_policy = True
    ordinance.policy_keywords = ["fluoridation", "oral health", "CDC guidelines"]

    introduced_on = ordinance.first_introduced.strftime('%B %d, %Y')
    print(f"Matter: {ordinance.title}")
    print(f" Number: {ordinance.matter_number}")
    print(f" Type: {ordinance.type}")
    print(f" Status: {ordinance.status}")
    print(f" First introduced: {introduced_on}")
    print(f" Related meetings: {len(ordinance.related_meetings)}")
    print(f" Documents: {len(ordinance.related_documents)}")
    print(f" Health policy: {ordinance.is_health_policy}")
    print(f" Keywords: {', '.join(ordinance.policy_keywords)}")

    print("\nπ‘ This allows you to:")
    benefits = (
        " - Track how a policy evolves across multiple meetings",
        " - See all related documents in one place",
        " - Identify windows of opportunity for advocacy",
        " - Monitor voting patterns on health issues",
    )
    for benefit in benefits:
        print(benefit)
if __name__ == "__main__":
    # Script entry point: runs the three demos in order, each on its own
    # event loop, then prints pointers to further documentation.

    # Bind `datetime` and `Location` at module level; demo_event_creation()
    # uses both names.
    from datetime import datetime
    from models.meeting_event import Location

    # NOTE(review): the "π¦·" and "π" glyphs below look like mis-decoded
    # emoji; left verbatim because the intended characters cannot be
    # confirmed. Only the literal that had been split across two lines
    # ("Demo complete!") was restored.
    print("π¦· Oral Health Policy Pulse - Platform Integration Demo")
    print("=" * 80)

    # Run async demos
    asyncio.run(analyze_discovered_urls())
    asyncio.run(demo_event_creation())
    asyncio.run(demo_matter_tracking())

    print("\n" + "=" * 80)
    print("✅ Demo complete!")
    print("\nπ For more details, see:")
    print(" - docs/INTEGRATION_GUIDE.md")
    print(" - discovery/platform_detector.py")
    print(" - models/meeting_event.py")