"""
Standardized models for government meeting data.

Based on City Scrapers schema (MIT License):
https://github.com/city-scrapers/city-scrapers

These models provide a consistent format regardless of the source platform
(Legistar, Granicus, generic websites, etc.).
"""

from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from typing import Optional, List, Dict, Any
from enum import Enum
from urllib.parse import urlsplit
import hashlib
import json


def _utcnow() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Stand-in for ``datetime.utcnow()``, which is deprecated since
    Python 3.12. The tzinfo is stripped so the value stays naive, exactly
    like the value ``utcnow()`` produced.
    """
    return datetime.now(timezone.utc).replace(tzinfo=None)


def _parse_datetime(value: Any) -> Any:
    """Parse an ISO 8601 string into a ``datetime``; return other values as-is."""
    if isinstance(value, str):
        return datetime.fromisoformat(value)
    return value


class Classification(str, Enum):
    """Meeting classification types (from City Scrapers)"""

    BOARD = "Board"
    COMMISSION = "Commission"
    COMMITTEE = "Committee"
    COUNCIL = "Council"
    TOWN_HALL = "Town Hall"
    PUBLIC_HEARING = "Public Hearing"
    NOT_CLASSIFIED = "Not classified"


class EventStatus(str, Enum):
    """Meeting status"""

    TENTATIVE = "tentative"
    CONFIRMED = "confirmed"
    CANCELLED = "cancelled"
    PASSED = "passed"  # Meeting has already occurred


@dataclass
class Location:
    """Meeting location information"""

    name: str
    address: Optional[str] = None
    city: Optional[str] = None
    state: Optional[str] = None

    def __str__(self):
        """Return "name, address, city, state", skipping the parts that are unset.

        The state is only shown together with a city; a state without a
        city is omitted.
        """
        parts = [self.name]
        if self.address:
            parts.append(self.address)
        if self.city and self.state:
            parts.append(f"{self.city}, {self.state}")
        elif self.city:
            parts.append(self.city)
        return ", ".join(parts)


# Ordered (suffixes, content type) pairs used by Link to sniff a URL.
# NOTE(review): ".docx" is reported as "application/msword" to keep existing
# stored values stable; confirm whether consumers want the OOXML type instead.
_SUFFIX_CONTENT_TYPES = (
    (('.pdf',), 'application/pdf'),
    (('.html', '.htm'), 'text/html'),
    (('.doc', '.docx'), 'application/msword'),
)


@dataclass
class Link:
    """Document or resource link"""

    title: str  # "Agenda", "Minutes", "Video Recording", "Packet"
    href: str
    content_type: Optional[str] = None  # "application/pdf", "text/html", "video/mp4"

    def __post_init__(self):
        """Infer content type from URL if not provided.

        Matching is case-insensitive and looks at both the full URL and its
        path component, so "Agenda.PDF" and "agenda.pdf?id=3" are both
        recognized. ``content_type`` stays ``None`` when nothing matches.
        """
        if self.content_type:
            return

        href = self.href.lower()
        try:
            path = urlsplit(href).path
        except ValueError:
            # urlsplit rejects some malformed URLs; fall back to the raw text.
            path = href

        for suffixes, content_type in _SUFFIX_CONTENT_TYPES:
            if href.endswith(suffixes) or path.endswith(suffixes):
                self.content_type = content_type
                return

        if 'video' in href or 'youtube' in href:
            self.content_type = 'video/mp4'


@dataclass
class MeetingEvent:
    """
    Standardized government meeting event.

    Compatible with City Scrapers Event schema.
    Extended with oral health policy tracking fields.

    ``id`` is derived from ``source``, ``start`` and ``title`` once, right
    after construction; changing those attributes later does not update it.
    """

    # === Core Identification ===
    title: str
    description: str
    classification: Classification

    # === Temporal ===
    start: datetime
    end: Optional[datetime] = None
    all_day: bool = False
    status: EventStatus = EventStatus.CONFIRMED

    # === Spatial ===
    location: Location = field(default_factory=lambda: Location(name="TBD"))

    # === Content ===
    links: List[Link] = field(default_factory=list)
    source: str = ""  # Original URL where event was found

    # === Jurisdiction ===
    jurisdiction_name: str = ""
    state_code: str = ""
    fips_code: Optional[str] = None

    # === Metadata ===
    scraped_at: datetime = field(default_factory=_utcnow)  # naive UTC

    # === Oral Health Policy Tracking ===
    oral_health_relevant: bool = False
    keywords_found: List[str] = field(default_factory=list)
    confidence_score: float = 0.0

    # Generated fields
    id: str = field(init=False)

    def __post_init__(self):
        """Generate unique ID after initialization"""
        self.id = self._generate_id()

    def _generate_id(self) -> str:
        """Return the first 16 hex chars of SHA-256 over source, start and title."""
        unique_string = f"{self.source}_{self.start.isoformat()}_{self.title}"
        return hashlib.sha256(unique_string.encode()).hexdigest()[:16]

    def add_link(self, title: str, href: str, content_type: Optional[str] = None):
        """Convenience method to add a document link"""
        self.links.append(Link(title=title, href=href, content_type=content_type))

    def has_agenda(self) -> bool:
        """Check if any link title contains "agenda" (case-insensitive)"""
        return any('agenda' in link.title.lower() for link in self.links)

    def has_minutes(self) -> bool:
        """Check if any link title contains "minute" (case-insensitive)"""
        return any('minute' in link.title.lower() for link in self.links)

    def has_video(self) -> bool:
        """Check if any link is titled as video or has a video/mp4 content type"""
        return any(
            'video' in link.title.lower() or link.content_type == 'video/mp4'
            for link in self.links
        )

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert to dictionary for Delta Lake storage.

        Datetimes become ISO 8601 strings, enums become their values, the
        location is flattened into ``location_*`` keys and links become a
        list of plain dicts.
        """
        return {
            'id': self.id,
            'title': self.title,
            'description': self.description,
            'classification': self.classification.value,
            'status': self.status.value,

            # Temporal (ISO 8601 format)
            'start': self.start.isoformat(),
            'end': self.end.isoformat() if self.end else None,
            'all_day': self.all_day,

            # Spatial (flattened)
            'location_name': self.location.name,
            'location_address': self.location.address,
            'location_city': self.location.city,
            'location_state': self.location.state,

            # Links (as JSON array)
            'links': [
                {
                    'title': link.title,
                    'href': link.href,
                    'content_type': link.content_type
                }
                for link in self.links
            ],

            # Source tracking
            'source': self.source,
            'jurisdiction_name': self.jurisdiction_name,
            'state_code': self.state_code,
            'fips_code': self.fips_code,
            'scraped_at': self.scraped_at.isoformat(),

            # Oral health relevance
            'oral_health_relevant': self.oral_health_relevant,
            # Copied so callers mutating the dict cannot alter this event.
            'keywords_found': list(self.keywords_found),
            'confidence_score': self.confidence_score,

            # Convenience flags
            'has_agenda': self.has_agenda(),
            'has_minutes': self.has_minutes(),
            'has_video': self.has_video()
        }

    def to_json(self) -> str:
        """Convert to JSON string"""
        return json.dumps(self.to_dict(), indent=2)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'MeetingEvent':
        """
        Create MeetingEvent from dictionary.

        Accepts the layout produced by ``to_dict``. ``start``, ``end`` and
        ``scraped_at`` may each be an ISO 8601 string or a ``datetime``.
        A missing or null ``scraped_at`` falls back to the current naive UTC
        time; missing or null ``links`` / ``keywords_found`` become empty
        lists. The ``id`` and ``has_*`` keys are ignored because they are
        recomputed.

        Raises:
            KeyError: if ``title``, ``description``, ``classification`` or
                ``start`` is missing.
            ValueError: if a datetime string is not valid ISO 8601 or an
                enum value is unknown.
        """
        # Parse datetimes (empty/None values are treated as "not provided")
        start = _parse_datetime(data['start'])
        end = _parse_datetime(data.get('end') or None)
        scraped_at = _parse_datetime(data.get('scraped_at') or None) or _utcnow()

        # Reconstruct location
        location = Location(
            name=data.get('location_name', 'TBD'),
            address=data.get('location_address'),
            city=data.get('location_city'),
            state=data.get('location_state')
        )

        # Reconstruct links; "or []" tolerates an explicit null column
        links = [
            Link(
                title=link['title'],
                href=link['href'],
                content_type=link.get('content_type')
            )
            for link in data.get('links') or []
        ]

        return cls(
            title=data['title'],
            description=data['description'],
            classification=Classification(data['classification']),
            status=EventStatus(data.get('status', 'confirmed')),
            start=start,
            end=end,
            all_day=data.get('all_day', False),
            location=location,
            links=links,
            source=data.get('source', ''),
            jurisdiction_name=data.get('jurisdiction_name', ''),
            state_code=data.get('state_code', ''),
            fips_code=data.get('fips_code'),
            scraped_at=scraped_at,
            oral_health_relevant=data.get('oral_health_relevant', False),
            # Copy so the event never shares a list with the input dict.
            keywords_found=list(data.get('keywords_found') or []),
            confidence_score=data.get('confidence_score', 0.0)
        )


@dataclass
class Matter:
    """
    Legislative matter/item tracking across meetings.

    Based on Engagic's "Matter" model for tracking policy evolution.
    Perfect for tracking fluoridation ordinances, health board decisions, etc.
    """

    matter_id: str
    matter_number: Optional[str] = None  # "Bill 2024-001", "Resolution 45"
    title: str = ""
    type: str = "Unknown"  # "Ordinance", "Resolution", "Motion", "Discussion"

    # Lifecycle
    first_introduced: Optional[datetime] = None
    status: str = "Introduced"  # "Introduced", "Committee", "Hearing", "Passed", "Failed"

    # Related content
    related_meetings: List[str] = field(default_factory=list)  # Meeting IDs
    related_documents: List[Link] = field(default_factory=list)

    # Votes (if applicable)
    votes_for: int = 0
    votes_against: int = 0
    votes_abstain: int = 0

    # Oral health specific
    is_health_policy: bool = False
    policy_keywords: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary.

        ``first_introduced`` becomes an ISO 8601 string (or ``None``) and
        each related document becomes a dict with ``title``, ``href`` and
        ``content_type``, matching the link layout of
        ``MeetingEvent.to_dict``.
        """
        return {
            'matter_id': self.matter_id,
            'matter_number': self.matter_number,
            'title': self.title,
            'type': self.type,
            'first_introduced': self.first_introduced.isoformat() if self.first_introduced else None,
            'status': self.status,
            'related_meetings': self.related_meetings,
            'related_documents': [
                {
                    'title': doc.title,
                    'href': doc.href,
                    'content_type': doc.content_type
                }
                for doc in self.related_documents
            ],
            'votes_for': self.votes_for,
            'votes_against': self.votes_against,
            'votes_abstain': self.votes_abstain,
            'is_health_policy': self.is_health_policy,
            'policy_keywords': self.policy_keywords
        }


# Example usage
if __name__ == "__main__":
    # Create a sample meeting event
    event = MeetingEvent(
        title="City Council Regular Meeting",
        description="Regular meeting of the Birmingham City Council",
        classification=Classification.COUNCIL,
        start=datetime(2026, 4, 21, 18, 0),
        end=datetime(2026, 4, 21, 20, 0),
        location=Location(
            name="City Hall Council Chambers",
            address="710 N 20th Street",
            city="Birmingham",
            state="AL"
        ),
        source="https://birminghamal.gov/meetings",
        jurisdiction_name="Birmingham",
        state_code="AL"
    )

    # Add documents
    event.add_link("Agenda", "https://birminghamal.gov/agenda.pdf", "application/pdf")
    event.add_link("Previous Minutes", "https://birminghamal.gov/minutes.pdf")

    # Mark as oral health relevant
    event.oral_health_relevant = True
    event.keywords_found = ["fluoridation", "water", "public health"]
    event.confidence_score = 0.85

    # Print as JSON
    print(event.to_json())

    # Show what's available
    print(f"\nHas agenda: {event.has_agenda()}")
    print(f"Has minutes: {event.has_minutes()}")
    print(f"Has video: {event.has_video()}")