Spaces:

CommunityOne
/

open-navigator

Running on CPU Upgrade

File size: 9,505 Bytes

61d29fc

"""
Platform detection for municipal websites.

Based on patterns from:
- biglocalnews/civic-scraper (Apache 2.0)
- city-scrapers/city-scrapers (MIT)

Detects which content management system or meeting platform a municipality uses,
enabling optimized scraping strategies.
"""
from typing import Optional, Dict, List
from urllib.parse import urlparse
import httpx
from bs4 import BeautifulSoup
from loguru import logger


# Platform URL patterns (most specific first)
PLATFORM_PATTERNS = {
    'legistar': [
        'legistar.com',
        '/Legistar/',
        '/LegislationDetail.aspx',
        '/Calendar.aspx',
        '/MeetingDetail.aspx',
        'WebApi/odata'
    ],
    'granicus': [
        'granicus.com',
        '/Mediasite/',
        '/ViewPublisher.php',
        '/MetaViewer.php',
        'granicus-cdn.com'
    ],
    'municode': [
        'municode.com',
        '/meeting_minutes',
        '/MuniCode/'
    ],
    'civicplus': [
        'civicplus.com',
        '/AgendaCenter/',
        '/DocumentCenter/',
        '/CivicSend/'
    ],
    'primegov': [
        'primegov.com',
        '/Portal/',
        '/Public/0/'
    ],
    'calagenda': [
        'ca-ilg.civicplus.com',
        '/AgendaCenter/ViewFile/'
    ],
    'swagit': [
        'swagit.com',
        '/play/',
        '/videos/'
    ],
    'zoomgov': [
        'zoom.us/rec/',
        'zoomgov.com'
    ]
}

# HTML meta tag patterns that indicate platforms
META_PATTERNS = {
    'legistar': [
        'Legistar',
        'InSite',
        'Granicus'  # Granicus owns Legistar
    ],
    'civicplus': [
        'CivicPlus',
        'CivicEngage'
    ]
}

# Common CMS patterns (WordPress, Drupal, etc.)
CMS_PATTERNS = {
    'wordpress': [
        'wp-content',
        'wp-includes',
        'wordpress'
    ],
    'drupal': [
        '/sites/default/',
        'drupal.js',
        'Drupal.settings'
    ],
    'joomla': [
        '/components/com_',
        '/modules/mod_'
    ]
}


def detect_platform(url: str, html_content: Optional[str] = None) -> Optional[str]:
    """
    Detect which platform a municipality website uses.
    
    Performs two-stage detection:
    1. URL pattern matching (fast, works without fetching)
    2. HTML content analysis (slower, more accurate)
    
    Args:
        url: Municipality website URL
        html_content: Optional HTML content for deeper analysis
        
    Returns:
        Platform name or None if unknown
        
    Examples:
        >>> detect_platform("https://chicago.legistar.com/Calendar.aspx")
        'legistar'
        >>> detect_platform("https://example.gov/meetings")
        None
    """
    url_lower = url.lower()
    
    # Stage 1: URL pattern matching
    for platform, patterns in PLATFORM_PATTERNS.items():
        if any(pattern.lower() in url_lower for pattern in patterns):
            logger.debug(f"Detected {platform} from URL pattern: {url}")
            return platform
    
    # Stage 2: HTML content analysis (if provided)
    if html_content:
        platform = detect_from_html(html_content)
        if platform:
            logger.debug(f"Detected {platform} from HTML content: {url}")
            return platform
    
    # Stage 3: Check for generic CMS
    for cms, patterns in CMS_PATTERNS.items():
        if any(pattern.lower() in url_lower for pattern in patterns):
            logger.debug(f"Detected generic CMS {cms}: {url}")
            return 'generic'
    
    logger.debug(f"No platform detected for: {url}")
    return None


def detect_from_html(html_content: str) -> Optional[str]:
    """
    Detect platform from HTML content analysis.
    
    Checks:
    - Meta tags (generator, description)
    - Script sources
    - Link hrefs
    - Specific HTML structures
    
    Args:
        html_content: Raw HTML content
        
    Returns:
        Platform name or None
    """
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        
        # Check meta tags
        for platform, keywords in META_PATTERNS.items():
            meta_generator = soup.find('meta', attrs={'name': 'generator'})
            if meta_generator:
                content = meta_generator.get('content', '').lower()
                if any(keyword.lower() in content for keyword in keywords):
                    return platform
        
        # Check scripts and links
        all_text = html_content.lower()
        for platform, patterns in PLATFORM_PATTERNS.items():
            if any(pattern.lower() in all_text for pattern in patterns):
                return platform
        
    except Exception as e:
        logger.warning(f"Error parsing HTML for platform detection: {e}")
    
    return None


async def detect_platform_async(url: str, fetch_html: bool = True) -> Dict[str, any]:
    """
    Async version that can fetch HTML content for deeper detection.
    
    Args:
        url: Municipality website URL
        fetch_html: Whether to fetch HTML for content analysis
        
    Returns:
        Dictionary with detection results:
        {
            'platform': str,
            'confidence': float,
            'features': List[str],
            'scraper_available': bool
        }
    """
    result = {
        'url': url,
        'platform': None,
        'confidence': 0.0,
        'features': [],
        'scraper_available': False
    }
    
    # Quick URL check
    platform = detect_platform(url)
    if platform:
        result['platform'] = platform
        result['confidence'] = 0.7
        result['features'].append('url_pattern')
    
    # Fetch and analyze HTML if requested
    if fetch_html:
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                response = await client.get(url, follow_redirects=True)
                response.raise_for_status()
                
                platform_from_html = detect_from_html(response.text)
                if platform_from_html:
                    result['platform'] = platform_from_html
                    result['confidence'] = 0.9
                    result['features'].append('html_content')
                    
        except Exception as e:
            logger.warning(f"Could not fetch {url} for platform detection: {e}")
    
    # Check if we have a scraper for this platform
    if result['platform'] in ['legistar', 'granicus', 'civicplus']:
        result['scraper_available'] = True
    
    return result


def get_platform_capabilities(platform: str) -> Dict[str, any]:
    """
    Get capabilities and scraping strategies for a platform.
    
    Args:
        platform: Platform name
        
    Returns:
        Dictionary describing platform capabilities
    """
    capabilities = {
        'legistar': {
            'has_api': True,
            'api_docs': 'https://webapi.legistar.com/Help',
            'supports_bulk_download': True,
            'common_endpoints': [
                '/events',
                '/matters',
                '/bodies'
            ],
            'rate_limit': 'Unknown',
            'scraper_class': 'scrapers.legistar.LegistarScraper'
        },
        'granicus': {
            'has_api': True,
            'supports_bulk_download': True,
            'common_endpoints': [
                '/ViewPublisher.php',
                '/MetaViewer.php'
            ],
            'rate_limit': 'Unknown',
            'scraper_class': 'scrapers.granicus.GranicusScraper'
        },
        'civicplus': {
            'has_api': False,
            'supports_bulk_download': False,
            'requires_html_parsing': True,
            'scraper_class': 'scrapers.civicplus.CivicPlusScraper'
        },
        'generic': {
            'has_api': False,
            'supports_bulk_download': False,
            'requires_html_parsing': True,
            'scraper_class': 'scrapers.generic.GenericScraper'
        }
    }
    
    return capabilities.get(platform, {
        'has_api': False,
        'supports_bulk_download': False,
        'requires_html_parsing': True,
        'scraper_class': 'scrapers.generic.GenericScraper'
    })


def get_scraper_class(platform: str):
    """
    Get appropriate scraper class for a platform.
    
    Args:
        platform: Platform name
        
    Returns:
        Scraper class (dynamically imported)
    """
    # Note: This assumes you'll create these scraper classes
    # For now, returns None to avoid import errors
    
    scraper_map = {
        'legistar': 'scrapers.legistar.LegistarScraper',
        'granicus': 'scrapers.granicus.GranicusScraper',
        'civicplus': 'scrapers.civicplus.CivicPlusScraper',
        'generic': 'scrapers.generic.GenericScraper'
    }
    
    scraper_path = scraper_map.get(platform, 'scrapers.generic.GenericScraper')
    
    # TODO: Dynamic import when scrapers are implemented
    # module_path, class_name = scraper_path.rsplit('.', 1)
    # module = importlib.import_module(module_path)
    # return getattr(module, class_name)
    
    logger.warning(f"Scraper class not yet implemented: {scraper_path}")
    return None


# Example usage
if __name__ == "__main__":
    # Test URL detection
    test_urls = [
        "https://chicago.legistar.com/Calendar.aspx",
        "https://birminghamal.gov/meetings",
        "https://example.civicplus.com/AgendaCenter",
        "https://unknown-city.gov/council"
    ]
    
    for url in test_urls:
        platform = detect_platform(url)
        print(f"{url}\n  → Platform: {platform}\n")