open-navigator / examples /integration_demo.py
jcbowyer's picture
Clean HuggingFace deployment without binary files
61d29fc
"""
Example: Using platform detection with discovered URLs.
This script demonstrates how to:
1. Load discovered URLs from Silver layer
2. Detect which platform each URL uses
3. Prepare optimized scraping strategies
"""
import asyncio
from pathlib import Path
# Imports from your project
from discovery.platform_detector import detect_platform_async, get_platform_capabilities
from models.meeting_event import MeetingEvent, Classification
from config.settings import settings
async def analyze_discovered_urls():
"""
Analyze all discovered URLs and detect their platforms.
"""
print("πŸ” Analyzing discovered URLs for platform detection...\n")
# Check if we have discovered URLs
silver_path = Path(f"{settings.delta_lake_path}/silver/discovered_urls")
if not silver_path.exists():
print("❌ No discovered URLs found. Run: python main.py discover-jurisdictions --limit 500")
return
# Load URLs using pandas (simpler than PySpark for small data)
import pandas as pd
df = pd.read_parquet(silver_path, engine='pyarrow')
print(f"πŸ“Š Found {len(df)} discovered URLs\n")
print("=" * 80)
# Analyze each URL
platform_counts = {}
platform_examples = {}
for idx, row in df.head(20).iterrows(): # Analyze first 20 for demo
url = row['url']
jurisdiction = row['jurisdiction_name']
state = row['state_code']
# Detect platform (async for thorough detection)
result = await detect_platform_async(url, fetch_html=False)
platform = result['platform'] or 'unknown'
confidence = result['confidence']
# Track counts
platform_counts[platform] = platform_counts.get(platform, 0) + 1
# Store example
if platform not in platform_examples:
platform_examples[platform] = {
'url': url,
'jurisdiction': jurisdiction,
'state': state
}
# Display
status = "βœ…" if result['scraper_available'] else "⚠️"
print(f"{status} {jurisdiction}, {state}")
print(f" Platform: {platform} (confidence: {confidence:.1%})")
print(f" URL: {url}")
# Show capabilities
if platform != 'unknown':
caps = get_platform_capabilities(platform)
if caps.get('has_api'):
print(f" πŸš€ API Available: {caps.get('api_docs', 'Yes')}")
if caps.get('scraper_class'):
print(f" πŸ€– Scraper: {caps['scraper_class']}")
print()
# Summary
print("=" * 80)
print("\nπŸ“ˆ Platform Distribution:\n")
for platform, count in sorted(platform_counts.items(), key=lambda x: x[1], reverse=True):
percentage = (count / len(df.head(20))) * 100
bar = "β–ˆ" * int(percentage / 5)
print(f" {platform:15s} {count:3d} ({percentage:5.1f}%) {bar}")
print("\nπŸ’‘ Next Steps:")
print(" 1. Implement scrapers for top platforms")
print(" 2. Test scraping on example URLs")
print(" 3. Add platform info to Silver layer")
return platform_counts, platform_examples
async def demo_event_creation():
"""
Demonstrate creating standardized MeetingEvent objects.
"""
print("\n" + "=" * 80)
print("πŸ“… Demo: Creating Standardized Meeting Events\n")
# Example 1: From Birmingham
event1 = MeetingEvent(
title="Birmingham City Council Regular Meeting",
description="Regular meeting with discussion of water system improvements",
classification=Classification.COUNCIL,
start=datetime(2026, 4, 21, 18, 0),
location=Location(
name="City Hall Council Chambers",
address="710 N 20th Street",
city="Birmingham",
state="AL"
),
source="https://birminghamal.gov/meetings",
jurisdiction_name="Birmingham city",
state_code="AL"
)
# Add documents
event1.add_link("Agenda", "https://birminghamal.gov/agenda-20260421.pdf")
event1.add_link("Video Recording", "https://birminghamal.gov/video/20260421")
# Mark as oral health relevant
event1.oral_health_relevant = True
event1.keywords_found = ["water", "fluoridation", "public health"]
event1.confidence_score = 0.85
print(f"Event 1: {event1.title}")
print(f" Location: {event1.location}")
print(f" Has agenda: {event1.has_agenda()}")
print(f" Has video: {event1.has_video()}")
print(f" Oral health relevant: {event1.oral_health_relevant}")
print(f" Keywords: {', '.join(event1.keywords_found)}")
print(f" Confidence: {event1.confidence_score:.1%}")
# Show how it converts to Delta Lake format
print(f"\nπŸ“ Delta Lake format (first 5 fields):")
data_dict = event1.to_dict()
for key in list(data_dict.keys())[:5]:
print(f" {key}: {data_dict[key]}")
print(f"\nβœ… Event ID: {event1.id}")
print(f" (Generated from: {event1.source} + {event1.start.isoformat()})")
async def demo_matter_tracking():
"""
Demonstrate tracking a policy matter across meetings.
"""
from models.meeting_event import Matter
from datetime import datetime
print("\n" + "=" * 80)
print("πŸ“‹ Demo: Matter Tracking (Legislative Item Evolution)\n")
# Track a fluoridation ordinance
matter = Matter(
matter_id="BHM-2024-FL001",
matter_number="Ordinance 2024-045",
title="Community Water Fluoridation Program Implementation",
type="Ordinance",
first_introduced=datetime(2024, 1, 15),
status="Committee Review"
)
# Add related meetings
matter.related_meetings = [
"mtg-20240115-council",
"mtg-20240205-health-committee",
"mtg-20240220-public-hearing"
]
# Add documents
from models.meeting_event import Link
matter.related_documents = [
Link("Original Ordinance", "https://example.gov/ord-2024-045.pdf"),
Link("Committee Report", "https://example.gov/committee-report.pdf"),
Link("Public Comments", "https://example.gov/comments.pdf")
]
# Mark as health policy
matter.is_health_policy = True
matter.policy_keywords = ["fluoridation", "oral health", "CDC guidelines"]
print(f"Matter: {matter.title}")
print(f" Number: {matter.matter_number}")
print(f" Type: {matter.type}")
print(f" Status: {matter.status}")
print(f" First introduced: {matter.first_introduced.strftime('%B %d, %Y')}")
print(f" Related meetings: {len(matter.related_meetings)}")
print(f" Documents: {len(matter.related_documents)}")
print(f" Health policy: {matter.is_health_policy}")
print(f" Keywords: {', '.join(matter.policy_keywords)}")
print("\nπŸ’‘ This allows you to:")
print(" - Track how a policy evolves across multiple meetings")
print(" - See all related documents in one place")
print(" - Identify windows of opportunity for advocacy")
print(" - Monitor voting patterns on health issues")
if __name__ == "__main__":
from datetime import datetime
from models.meeting_event import Location
print("🦷 Oral Health Policy Pulse - Platform Integration Demo")
print("=" * 80)
# Run async demos
asyncio.run(analyze_discovered_urls())
asyncio.run(demo_event_creation())
asyncio.run(demo_matter_tracking())
print("\n" + "=" * 80)
print("βœ… Demo complete!")
print("\nπŸ“– For more details, see:")
print(" - docs/INTEGRATION_GUIDE.md")
print(" - discovery/platform_detector.py")
print(" - models/meeting_event.py")