#!/usr/bin/env python3 """ Organize meetings and contacts data by state for HuggingFace. Structure: - data/gold/states/{STATE}/meetings.parquet - data/gold/states/{STATE}/contacts_local_officials.parquet - data/gold/states/{STATE}/contacts_meeting_attendance.parquet """ import shutil from pathlib import Path import pandas as pd from loguru import logger # State name to abbreviation mapping STATE_ABBREV = { 'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY', 'District of Columbia': 'DC', 'Puerto Rico': 'PR' } def load_all_meetings() -> pd.DataFrame: """Load and consolidate all meeting data from cache.""" logger.info("πŸ“‚ Loading all meeting data from cache...") cache_dir = Path("data/cache/localview") meeting_files = sorted(cache_dir.glob("meetings.*.parquet")) if not meeting_files: logger.error(f"No meeting files found in {cache_dir}") return pd.DataFrame() logger.info(f" Found {len(meeting_files)} year files (2006-2023)") dfs = [] total_rows = 0 for file in meeting_files: year = file.stem.split('.')[1] df = pd.read_parquet(file) dfs.append(df) total_rows += len(df) logger.info(f" {year}: {len(df):,} meetings") combined_df = pd.concat(dfs, ignore_index=True) logger.success(f" βœ… Loaded {total_rows:,} total meetings") return combined_df def add_state_code(df: pd.DataFrame) -> pd.DataFrame: """Add state abbreviation column.""" logger.info("πŸ—ΊοΈ Mapping state names to abbreviations...") if 'state_name' not in df.columns: logger.error("No 'state_name' column found!") return df df['state'] = df['state_name'].map(STATE_ABBREV) # Check for unmapped states unmapped = df[df['state'].isna()]['state_name'].unique() if len(unmapped) > 0: logger.warning(f" ⚠️ Unmapped states: {unmapped}") # Drop rows with no state mapping before = len(df) df = df.dropna(subset=['state']) after = len(df) if before > after: logger.warning(f" Dropped {before - after:,} rows with unmapped states") logger.success(f" βœ… Mapped {len(df):,} meetings to state codes") return df def split_meetings_by_state(df: pd.DataFrame, output_dir: Path): """Split meetings into state directories.""" logger.info("\nπŸ“Š Splitting meetings by state...") states = sorted(df['state'].unique()) logger.info(f" Found {len(states)} states with meeting data") total_size = 0 state_counts = {} for state in states: state_df = df[df['state'] == state] # Create state directory state_dir = output_dir / state state_dir.mkdir(parents=True, exist_ok=True) # Save meetings output_file = state_dir / "meetings.parquet" state_df.to_parquet(output_file, index=False, compression='snappy') size = output_file.stat().st_size total_size += size state_counts[state] = len(state_df) logger.info(f" βœ… {state}: {len(state_df):,} meetings β†’ {output_file.relative_to(output_dir.parent)} ({size / 1024 / 1024:.1f} MB)") logger.success(f" πŸ“¦ Total: {len(states)} states, {total_size / 1024 / 1024:.1f} MB") return state_counts def create_national_meetings(df: pd.DataFrame, output_dir: Path): """Create consolidated national meetings file.""" logger.info("\n🌎 Creating national meetings file...") national_dir = output_dir / "national" national_dir.mkdir(parents=True, exist_ok=True) output_file = national_dir / "meetings.parquet" df.to_parquet(output_file, index=False, compression='snappy') size = output_file.stat().st_size logger.success(f" βœ… Created {output_file.relative_to(output_dir.parent)} ({size / 1024 / 1024:.1f} MB)") logger.info(f" {len(df):,} total meetings from {len(df['state'].unique())} states") def update_readmes(output_dir: Path, state_counts: dict): """Update README files with meetings information.""" logger.info("\nπŸ“ Updating README files...") # Update national README national_readme = output_dir / "national" / "README.md" if national_readme.exists(): content = national_readme.read_text() # Add meetings section if not present if "meetings.parquet" not in content: addition = """ ## Meetings Data - **meetings.parquet** - 153K+ government meeting transcripts (2006-2023) - City council meetings, county board meetings, etc. - Columns: state, jurisdiction, meeting_date, transcript, demographics, etc. - Source: LocalView project """ # Insert before the Example section if "## Example" in content: content = content.replace("## Example", addition + "\n## Example") else: content += addition national_readme.write_text(content) logger.success(f" βœ… Updated {national_readme.relative_to(output_dir.parent)}") # Update states README states_readme = output_dir / "states" / "README.md" if states_readme.exists(): content = states_readme.read_text() # Add meetings to structure if not present if "meetings.parquet" not in content: # Find the structure section and add meetings content = content.replace( "β”‚ └── nonprofits_programs.parquet", """β”‚ β”œβ”€β”€ nonprofits_programs.parquet β”‚ └── meetings.parquet""" ) # Add to datasets section datasets_addition = """ 5. **meetings.parquet** - Government meeting transcripts (where available) """ if "## πŸ“Š Datasets" in content: content = content.replace( "4. **nonprofits_programs.parquet**", "4. **nonprofits_programs.parquet**" + datasets_addition ) states_readme.write_text(content) logger.success(f" βœ… Updated {states_readme.relative_to(output_dir.parent)}") # Create meetings-specific README meetings_readme = output_dir / "states" / "MEETINGS_README.md" # Build state coverage table top_states = sorted(state_counts.items(), key=lambda x: x[1], reverse=True)[:20] state_table = "\n".join([f"| {state} | {count:,} |" for state, count in top_states]) meetings_readme.write_text(f"""# Government Meetings Data by State Local government meeting transcripts from the LocalView project (2006-2023). ## Coverage **Total:** 153,000+ meetings across {len(state_counts)} states **Top 20 States by Meeting Count:** | State | Meetings | |-------|----------| {state_table} ## Data Structure Each state directory contains: - `meetings.parquet` - Meeting transcripts for that state ## Columns - **state** - State abbreviation - **state_name** - Full state name - **place_name** - City/jurisdiction name - **meeting_date** - Date of meeting - **caption_text** - Full meeting transcript - **channel_title** - Government channel (e.g., "City Council") - **vid_upload_date** - When video was uploaded - **Demographics** - Census data (acs_18_* columns) - And more... ## Usage Examples ### Load meetings for a single state ```python import pandas as pd # California meetings ca_meetings = pd.read_parquet('states/CA/meetings.parquet') print(f"California meetings: {{len(ca_meetings):,}}") print(f"Jurisdictions: {{ca_meetings['place_name'].nunique()}}") ``` ### Search across multiple states ```python import pandas as pd import glob # Load West Coast meetings states = ['CA', 'OR', 'WA'] dfs = [pd.read_parquet(f'states/{{s}}/meetings.parquet') for s in states] west_coast = pd.concat(dfs) # Search for topic dental_meetings = west_coast[ west_coast['caption_text'].str.contains('dental|oral health', case=False, na=False) ] print(f"Found {{len(dental_meetings)}} meetings mentioning oral health") ``` ### Load national dataset ```python import pandas as pd # All meetings in one file all_meetings = pd.read_parquet('national/meetings.parquet') print(f"Total meetings: {{len(all_meetings):,}}") print(f"Date range: {{all_meetings['meeting_date'].min()}} to {{all_meetings['meeting_date'].max()}}") ``` ## Data Source LocalView project - automated scraping of government meeting videos and transcripts from municipal YouTube channels. **Years:** 2006-2023 **Coverage:** 153K+ meetings **States:** {len(state_counts)} states with data ## Notes - Not all states have equal coverage (depends on jurisdictions publishing to YouTube) - Transcript quality varies by jurisdiction's captioning practices - Some meetings may have incomplete transcripts - Demographics linked via Census tract data """) logger.success(f" βœ… Created {meetings_readme.relative_to(output_dir.parent)}") def main(): """Main execution.""" logger.info("=" * 70) logger.info("πŸš€ Organizing meetings data by state for HuggingFace") logger.info("=" * 70) output_dir = Path("data/gold") # Load all meetings df = load_all_meetings() if df.empty: logger.error("No meeting data found. Exiting.") return # Add state codes df = add_state_code(df) # Split by state state_counts = split_meetings_by_state(df, output_dir / "states") # Create national file create_national_meetings(df, output_dir) # Update READMEs update_readmes(output_dir, state_counts) # Summary logger.info("\n" + "=" * 70) logger.success("βœ… COMPLETE: Meetings data organized by state") logger.info("=" * 70) logger.info(f"\nπŸ“ Structure:") logger.info(f" data/gold/national/meetings.parquet - All {len(df):,} meetings") logger.info(f" data/gold/states/{{STATE}}/meetings.parquet - State-specific") logger.info(f"\nπŸ“Š Coverage: {len(state_counts)} states") logger.info(f" Top states: {', '.join(sorted(state_counts.keys())[:10])}") if __name__ == "__main__": main()