# open-navigator / discovery / platform_detector.py
# jcbowyer's picture
# Clean HuggingFace deployment without binary files
# 61d29fc
"""
Platform detection for municipal websites.
Based on patterns from:
- biglocalnews/civic-scraper (Apache 2.0)
- city-scrapers/city-scrapers (MIT)
Detects which content management system or meeting platform a municipality uses,
enabling optimized scraping strategies.
"""
import importlib
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse

import httpx
from bs4 import BeautifulSoup
from loguru import logger
# Substrings that identify a hosted meeting/agenda platform, keyed by platform
# name. Matching code in this module compares them case-insensitively and
# walks the dict in insertion order, returning the first platform that has a
# matching fragment -- so entries are meant to be listed most specific first.
#
# NOTE(review): both 'calagenda' fragments contain a 'civicplus' fragment
# ('civicplus.com' and '/AgendaCenter/'), and 'civicplus' is listed earlier,
# so first-match lookups never return 'calagenda'. Confirm whether that is
# intended before reordering.
PLATFORM_PATTERNS: Dict[str, List[str]] = {
    "legistar": [
        "legistar.com",
        "/Legistar/",
        "/LegislationDetail.aspx",
        "/Calendar.aspx",
        "/MeetingDetail.aspx",
        "WebApi/odata",
    ],
    "granicus": [
        "granicus.com",
        "/Mediasite/",
        "/ViewPublisher.php",
        "/MetaViewer.php",
        "granicus-cdn.com",
    ],
    "municode": [
        "municode.com",
        "/meeting_minutes",
        "/MuniCode/",
    ],
    "civicplus": [
        "civicplus.com",
        "/AgendaCenter/",
        "/DocumentCenter/",
        "/CivicSend/",
    ],
    "primegov": [
        "primegov.com",
        "/Portal/",
        "/Public/0/",
    ],
    "calagenda": [
        "ca-ilg.civicplus.com",
        "/AgendaCenter/ViewFile/",
    ],
    "swagit": [
        "swagit.com",
        "/play/",
        "/videos/",
    ],
    "zoomgov": [
        "zoom.us/rec/",
        "zoomgov.com",
    ],
}
# Keywords looked for (case-insensitively) in a page's
# <meta name="generator"> content, keyed by the platform they indicate.
META_PATTERNS: Dict[str, List[str]] = {
    "legistar": [
        "Legistar",
        "InSite",
        "Granicus",  # Granicus owns Legistar
    ],
    "civicplus": [
        "CivicPlus",
        "CivicEngage",
    ],
}
# Fingerprints of general-purpose content management systems (WordPress,
# Drupal, Joomla). A hit on any of these is reported as the 'generic'
# platform rather than under the CMS's own name.
CMS_PATTERNS: Dict[str, List[str]] = {
    "wordpress": [
        "wp-content",
        "wp-includes",
        "wordpress",
    ],
    "drupal": [
        "/sites/default/",
        "drupal.js",
        "Drupal.settings",
    ],
    "joomla": [
        "/components/com_",
        "/modules/mod_",
    ],
}
def detect_platform(url: str, html_content: Optional[str] = None) -> Optional[str]:
    """
    Detect which platform a municipality website uses.

    Detection runs in up to three stages and stops at the first hit:

    1. URL pattern matching against ``PLATFORM_PATTERNS`` (fast, works
       without fetching).
    2. HTML content analysis via ``detect_from_html`` (only when
       ``html_content`` is provided).
    3. Generic CMS fingerprints from ``CMS_PATTERNS``, matched against the
       URL and, when provided, the HTML content.

    All matching is case-insensitive substring matching.

    Args:
        url: Municipality website URL. ``None`` or an empty string is treated
            as "no URL information" instead of raising.
        html_content: Optional HTML content for deeper analysis.

    Returns:
        The matching platform key, ``'generic'`` when only a generic CMS
        fingerprint matched, or ``None`` if nothing matched.

    Examples:
        >>> detect_platform("https://chicago.legistar.com/Calendar.aspx")
        'legistar'
        >>> detect_platform("https://example.gov/meetings") is None
        True
    """
    url_lower = (url or "").lower()

    # Stage 1: URL pattern matching
    for platform, patterns in PLATFORM_PATTERNS.items():
        if any(pattern.lower() in url_lower for pattern in patterns):
            logger.debug(f"Detected {platform} from URL pattern: {url}")
            return platform

    # Stage 2: HTML content analysis (if provided)
    if html_content:
        platform = detect_from_html(html_content)
        if platform:
            logger.debug(f"Detected {platform} from HTML content: {url}")
            return platform

    # Stage 3: Check for generic CMS.
    # Several CMS fingerprints (e.g. 'Drupal.settings', 'drupal.js') are
    # page-source markers rather than URL fragments, so the supplied HTML is
    # searched as well as the URL.
    haystacks = [url_lower]
    if isinstance(html_content, str) and html_content:
        haystacks.append(html_content.lower())

    for cms, patterns in CMS_PATTERNS.items():
        if any(
            pattern.lower() in haystack
            for pattern in patterns
            for haystack in haystacks
        ):
            logger.debug(f"Detected generic CMS {cms}: {url}")
            return 'generic'

    logger.debug(f"No platform detected for: {url}")
    return None
def detect_from_html(html_content: str) -> Optional[str]:
    """
    Detect platform from HTML content analysis.

    Checks, in order:

    1. The ``<meta name="generator">`` tag: its ``content`` attribute is
       compared case-insensitively against the keywords in ``META_PATTERNS``.
    2. The raw HTML text: every ``PLATFORM_PATTERNS`` fragment is searched
       for case-insensitively anywhere in the document (this is a plain
       substring scan, so it also sees script sources and link hrefs).

    Any exception raised while parsing or scanning is logged as a warning
    and results in ``None``.

    Args:
        html_content: Raw HTML content

    Returns:
        Platform name or None
    """
    try:
        soup = BeautifulSoup(html_content, 'html.parser')

        # The generator tag is the same for every platform being tested, so
        # look it up once rather than once per META_PATTERNS entry.
        meta_generator = soup.find('meta', attrs={'name': 'generator'})
        if meta_generator:
            generator = meta_generator.get('content', '').lower()
            for platform, keywords in META_PATTERNS.items():
                if any(keyword.lower() in generator for keyword in keywords):
                    return platform

        # Fall back to scanning the whole document text for URL fragments.
        all_text = html_content.lower()
        for platform, patterns in PLATFORM_PATTERNS.items():
            if any(pattern.lower() in all_text for pattern in patterns):
                return platform
    except Exception as e:
        # Best-effort detection: a malformed document must not break callers.
        logger.warning(f"Error parsing HTML for platform detection: {e}")

    return None
async def detect_platform_async(url: str, fetch_html: bool = True) -> Dict[str, Any]:
    """
    Async version that can fetch HTML content for deeper detection.

    Runs the URL-only check first. When ``fetch_html`` is true the page is
    then downloaded (10 second timeout, redirects followed) and an HTML-based
    match overrides the URL-based result. Any error while fetching or
    analysing the page is logged as a warning and leaves the URL-based result
    in place.

    Args:
        url: Municipality website URL
        fetch_html: Whether to fetch HTML for content analysis

    Returns:
        Dictionary with detection results:
        {
            'url': str,                 # the URL that was passed in
            'platform': Optional[str],  # None when nothing was detected
            'confidence': float,        # 0.0 none, 0.7 URL match, 0.9 HTML match
            'features': List[str],      # 'url_pattern' and/or 'html_content'
            'scraper_available': bool
        }
    """
    result: Dict[str, Any] = {
        'url': url,
        'platform': None,
        'confidence': 0.0,
        'features': [],
        'scraper_available': False
    }

    # Quick URL check
    platform = detect_platform(url)
    if platform:
        result['platform'] = platform
        result['confidence'] = 0.7
        result['features'].append('url_pattern')

    # Fetch and analyze HTML if requested
    if fetch_html:
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                response = await client.get(url, follow_redirects=True)
                response.raise_for_status()

                platform_from_html = detect_from_html(response.text)
                if platform_from_html:
                    # An HTML-based match takes precedence over the URL guess.
                    result['platform'] = platform_from_html
                    result['confidence'] = 0.9
                    result['features'].append('html_content')
        except Exception as e:
            # Network/HTTP failures are non-fatal; keep the URL-only result.
            logger.warning(f"Could not fetch {url} for platform detection: {e}")

    # Check if we have a scraper for this platform
    if result['platform'] in ('legistar', 'granicus', 'civicplus'):
        result['scraper_available'] = True

    return result
def get_platform_capabilities(platform: Optional[str]) -> Dict[str, Any]:
    """
    Get capabilities and scraping strategies for a platform.

    Args:
        platform: Platform name. Unknown names (and ``None``) fall back to
            the ``'generic'`` capabilities.

    Returns:
        Dictionary describing platform capabilities. A fresh dictionary is
        built on every call, so callers may mutate the result freely.
    """
    # Single definition of the fallback so the 'generic' entry and the
    # default for unknown platforms cannot drift apart.
    generic = {
        'has_api': False,
        'supports_bulk_download': False,
        'requires_html_parsing': True,
        'scraper_class': 'scrapers.generic.GenericScraper'
    }

    capabilities = {
        'legistar': {
            'has_api': True,
            'api_docs': 'https://webapi.legistar.com/Help',
            'supports_bulk_download': True,
            'common_endpoints': [
                '/events',
                '/matters',
                '/bodies'
            ],
            'rate_limit': 'Unknown',
            'scraper_class': 'scrapers.legistar.LegistarScraper'
        },
        'granicus': {
            'has_api': True,
            'supports_bulk_download': True,
            'common_endpoints': [
                '/ViewPublisher.php',
                '/MetaViewer.php'
            ],
            'rate_limit': 'Unknown',
            'scraper_class': 'scrapers.granicus.GranicusScraper'
        },
        'civicplus': {
            'has_api': False,
            'supports_bulk_download': False,
            'requires_html_parsing': True,
            'scraper_class': 'scrapers.civicplus.CivicPlusScraper'
        },
        'generic': generic
    }

    return capabilities.get(platform, generic)
def get_scraper_class(platform: str) -> Optional[type]:
    """
    Get appropriate scraper class for a platform.

    The platform is mapped to a dotted ``module.ClassName`` path (unknown
    platforms map to the generic scraper) and the class is imported
    dynamically. If the module or the class cannot be found, a warning is
    logged and ``None`` is returned, so callers must handle a missing
    scraper.

    Args:
        platform: Platform name

    Returns:
        Scraper class (dynamically imported), or ``None`` when the scraper
        module/class is not available.
    """
    scraper_map = {
        'legistar': 'scrapers.legistar.LegistarScraper',
        'granicus': 'scrapers.granicus.GranicusScraper',
        'civicplus': 'scrapers.civicplus.CivicPlusScraper',
        'generic': 'scrapers.generic.GenericScraper'
    }

    scraper_path = scraper_map.get(platform, 'scrapers.generic.GenericScraper')
    module_path, class_name = scraper_path.rsplit('.', 1)

    try:
        module = importlib.import_module(module_path)
        return getattr(module, class_name)
    except (ImportError, AttributeError):
        # The scraper packages may not exist yet; degrade to "no scraper"
        # instead of propagating the import failure.
        logger.warning(f"Scraper class not yet implemented: {scraper_path}")
        return None
# Manual smoke test: run this module directly to print the URL-only detection
# result for a few sample municipality URLs.
if __name__ == "__main__":
    sample_urls = (
        "https://chicago.legistar.com/Calendar.aspx",
        "https://birminghamal.gov/meetings",
        "https://example.civicplus.com/AgendaCenter",
        "https://unknown-city.gov/council",
    )

    for sample_url in sample_urls:
        detected = detect_platform(sample_url)
        print(f"{sample_url}\n → Platform: {detected}\n")