Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
| """ | |
| Platform detection for municipal websites. | |
| Based on patterns from: | |
| - biglocalnews/civic-scraper (Apache 2.0) | |
| - city-scrapers/city-scrapers (MIT) | |
| Detects which content management system or meeting platform a municipality uses, | |
| enabling optimized scraping strategies. | |
| """ | |
| from typing import Optional, Dict, List | |
| from urllib.parse import urlparse | |
| import httpx | |
| from bs4 import BeautifulSoup | |
| from loguru import logger | |
| # Platform URL patterns (most specific first) | |
| PLATFORM_PATTERNS = { | |
| 'legistar': [ | |
| 'legistar.com', | |
| '/Legistar/', | |
| '/LegislationDetail.aspx', | |
| '/Calendar.aspx', | |
| '/MeetingDetail.aspx', | |
| 'WebApi/odata' | |
| ], | |
| 'granicus': [ | |
| 'granicus.com', | |
| '/Mediasite/', | |
| '/ViewPublisher.php', | |
| '/MetaViewer.php', | |
| 'granicus-cdn.com' | |
| ], | |
| 'municode': [ | |
| 'municode.com', | |
| '/meeting_minutes', | |
| '/MuniCode/' | |
| ], | |
| 'civicplus': [ | |
| 'civicplus.com', | |
| '/AgendaCenter/', | |
| '/DocumentCenter/', | |
| '/CivicSend/' | |
| ], | |
| 'primegov': [ | |
| 'primegov.com', | |
| '/Portal/', | |
| '/Public/0/' | |
| ], | |
| 'calagenda': [ | |
| 'ca-ilg.civicplus.com', | |
| '/AgendaCenter/ViewFile/' | |
| ], | |
| 'swagit': [ | |
| 'swagit.com', | |
| '/play/', | |
| '/videos/' | |
| ], | |
| 'zoomgov': [ | |
| 'zoom.us/rec/', | |
| 'zoomgov.com' | |
| ] | |
| } | |
| # HTML meta tag patterns that indicate platforms | |
| META_PATTERNS = { | |
| 'legistar': [ | |
| 'Legistar', | |
| 'InSite', | |
| 'Granicus' # Granicus owns Legistar | |
| ], | |
| 'civicplus': [ | |
| 'CivicPlus', | |
| 'CivicEngage' | |
| ] | |
| } | |
| # Common CMS patterns (WordPress, Drupal, etc.) | |
| CMS_PATTERNS = { | |
| 'wordpress': [ | |
| 'wp-content', | |
| 'wp-includes', | |
| 'wordpress' | |
| ], | |
| 'drupal': [ | |
| '/sites/default/', | |
| 'drupal.js', | |
| 'Drupal.settings' | |
| ], | |
| 'joomla': [ | |
| '/components/com_', | |
| '/modules/mod_' | |
| ] | |
| } | |
| def detect_platform(url: str, html_content: Optional[str] = None) -> Optional[str]: | |
| """ | |
| Detect which platform a municipality website uses. | |
| Performs two-stage detection: | |
| 1. URL pattern matching (fast, works without fetching) | |
| 2. HTML content analysis (slower, more accurate) | |
| Args: | |
| url: Municipality website URL | |
| html_content: Optional HTML content for deeper analysis | |
| Returns: | |
| Platform name or None if unknown | |
| Examples: | |
| >>> detect_platform("https://chicago.legistar.com/Calendar.aspx") | |
| 'legistar' | |
| >>> detect_platform("https://example.gov/meetings") | |
| None | |
| """ | |
| url_lower = url.lower() | |
| # Stage 1: URL pattern matching | |
| for platform, patterns in PLATFORM_PATTERNS.items(): | |
| if any(pattern.lower() in url_lower for pattern in patterns): | |
| logger.debug(f"Detected {platform} from URL pattern: {url}") | |
| return platform | |
| # Stage 2: HTML content analysis (if provided) | |
| if html_content: | |
| platform = detect_from_html(html_content) | |
| if platform: | |
| logger.debug(f"Detected {platform} from HTML content: {url}") | |
| return platform | |
| # Stage 3: Check for generic CMS | |
| for cms, patterns in CMS_PATTERNS.items(): | |
| if any(pattern.lower() in url_lower for pattern in patterns): | |
| logger.debug(f"Detected generic CMS {cms}: {url}") | |
| return 'generic' | |
| logger.debug(f"No platform detected for: {url}") | |
| return None | |
| def detect_from_html(html_content: str) -> Optional[str]: | |
| """ | |
| Detect platform from HTML content analysis. | |
| Checks: | |
| - Meta tags (generator, description) | |
| - Script sources | |
| - Link hrefs | |
| - Specific HTML structures | |
| Args: | |
| html_content: Raw HTML content | |
| Returns: | |
| Platform name or None | |
| """ | |
| try: | |
| soup = BeautifulSoup(html_content, 'html.parser') | |
| # Check meta tags | |
| for platform, keywords in META_PATTERNS.items(): | |
| meta_generator = soup.find('meta', attrs={'name': 'generator'}) | |
| if meta_generator: | |
| content = meta_generator.get('content', '').lower() | |
| if any(keyword.lower() in content for keyword in keywords): | |
| return platform | |
| # Check scripts and links | |
| all_text = html_content.lower() | |
| for platform, patterns in PLATFORM_PATTERNS.items(): | |
| if any(pattern.lower() in all_text for pattern in patterns): | |
| return platform | |
| except Exception as e: | |
| logger.warning(f"Error parsing HTML for platform detection: {e}") | |
| return None | |
| async def detect_platform_async(url: str, fetch_html: bool = True) -> Dict[str, any]: | |
| """ | |
| Async version that can fetch HTML content for deeper detection. | |
| Args: | |
| url: Municipality website URL | |
| fetch_html: Whether to fetch HTML for content analysis | |
| Returns: | |
| Dictionary with detection results: | |
| { | |
| 'platform': str, | |
| 'confidence': float, | |
| 'features': List[str], | |
| 'scraper_available': bool | |
| } | |
| """ | |
| result = { | |
| 'url': url, | |
| 'platform': None, | |
| 'confidence': 0.0, | |
| 'features': [], | |
| 'scraper_available': False | |
| } | |
| # Quick URL check | |
| platform = detect_platform(url) | |
| if platform: | |
| result['platform'] = platform | |
| result['confidence'] = 0.7 | |
| result['features'].append('url_pattern') | |
| # Fetch and analyze HTML if requested | |
| if fetch_html: | |
| try: | |
| async with httpx.AsyncClient(timeout=10.0) as client: | |
| response = await client.get(url, follow_redirects=True) | |
| response.raise_for_status() | |
| platform_from_html = detect_from_html(response.text) | |
| if platform_from_html: | |
| result['platform'] = platform_from_html | |
| result['confidence'] = 0.9 | |
| result['features'].append('html_content') | |
| except Exception as e: | |
| logger.warning(f"Could not fetch {url} for platform detection: {e}") | |
| # Check if we have a scraper for this platform | |
| if result['platform'] in ['legistar', 'granicus', 'civicplus']: | |
| result['scraper_available'] = True | |
| return result | |
| def get_platform_capabilities(platform: str) -> Dict[str, any]: | |
| """ | |
| Get capabilities and scraping strategies for a platform. | |
| Args: | |
| platform: Platform name | |
| Returns: | |
| Dictionary describing platform capabilities | |
| """ | |
| capabilities = { | |
| 'legistar': { | |
| 'has_api': True, | |
| 'api_docs': 'https://webapi.legistar.com/Help', | |
| 'supports_bulk_download': True, | |
| 'common_endpoints': [ | |
| '/events', | |
| '/matters', | |
| '/bodies' | |
| ], | |
| 'rate_limit': 'Unknown', | |
| 'scraper_class': 'scrapers.legistar.LegistarScraper' | |
| }, | |
| 'granicus': { | |
| 'has_api': True, | |
| 'supports_bulk_download': True, | |
| 'common_endpoints': [ | |
| '/ViewPublisher.php', | |
| '/MetaViewer.php' | |
| ], | |
| 'rate_limit': 'Unknown', | |
| 'scraper_class': 'scrapers.granicus.GranicusScraper' | |
| }, | |
| 'civicplus': { | |
| 'has_api': False, | |
| 'supports_bulk_download': False, | |
| 'requires_html_parsing': True, | |
| 'scraper_class': 'scrapers.civicplus.CivicPlusScraper' | |
| }, | |
| 'generic': { | |
| 'has_api': False, | |
| 'supports_bulk_download': False, | |
| 'requires_html_parsing': True, | |
| 'scraper_class': 'scrapers.generic.GenericScraper' | |
| } | |
| } | |
| return capabilities.get(platform, { | |
| 'has_api': False, | |
| 'supports_bulk_download': False, | |
| 'requires_html_parsing': True, | |
| 'scraper_class': 'scrapers.generic.GenericScraper' | |
| }) | |
| def get_scraper_class(platform: str): | |
| """ | |
| Get appropriate scraper class for a platform. | |
| Args: | |
| platform: Platform name | |
| Returns: | |
| Scraper class (dynamically imported) | |
| """ | |
| # Note: This assumes you'll create these scraper classes | |
| # For now, returns None to avoid import errors | |
| scraper_map = { | |
| 'legistar': 'scrapers.legistar.LegistarScraper', | |
| 'granicus': 'scrapers.granicus.GranicusScraper', | |
| 'civicplus': 'scrapers.civicplus.CivicPlusScraper', | |
| 'generic': 'scrapers.generic.GenericScraper' | |
| } | |
| scraper_path = scraper_map.get(platform, 'scrapers.generic.GenericScraper') | |
| # TODO: Dynamic import when scrapers are implemented | |
| # module_path, class_name = scraper_path.rsplit('.', 1) | |
| # module = importlib.import_module(module_path) | |
| # return getattr(module, class_name) | |
| logger.warning(f"Scraper class not yet implemented: {scraper_path}") | |
| return None | |
| # Example usage | |
| if __name__ == "__main__": | |
| # Test URL detection | |
| test_urls = [ | |
| "https://chicago.legistar.com/Calendar.aspx", | |
| "https://birminghamal.gov/meetings", | |
| "https://example.civicplus.com/AgendaCenter", | |
| "https://unknown-city.gov/council" | |
| ] | |
| for url in test_urls: | |
| platform = detect_platform(url) | |
| print(f"{url}\n → Platform: {platform}\n") | |