diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..af3893485aa4093b9adaac626ef9be8729e04931 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,87 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +.venv/ +venv/ +ENV/ +env/ + +# Node +node_modules/ +npm-debug.log* +yarn-debug.log* +yarn-error.log* +.pnpm-debug.log* +frontend/dist/ +frontend/node_modules/ +website/build/ +website/node_modules/ +website/.docusaurus/ + +# Data and cache +data/ +cache/ +logs/ +output/ +*.db +*.sqlite + +# Git +.git/ +.gitignore +.github/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# Documentation (we copy specific dirs) +docs/ +*.md +!README.md + +# Test and development +tests/ +examples/ +notebooks/ +*.ipynb + +# Large files +*.zip +*.tar.gz +*.mp4 +*.avi +*.mov + +# Environment +.env +.env.local +*.key +*.pem + +# OS +.DS_Store +Thumbs.db + +# Build artifacts +build/ +dist/ +*.egg-info/ + +# Temporary +tmp/ +temp/ +*.tmp + +# Scripts we don't need in container +start-all.sh +stop-all.sh +deploy-huggingface.sh +test-huggingface-build.sh +migrate-docs.sh +install.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..92fc1b2b9dc11d5b78584404b82438853e691d7a --- /dev/null +++ b/.gitignore @@ -0,0 +1,100 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +!frontend/src/lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Node.js +node_modules/ + +# Virtual environments +venv/ +env/ +ENV/ +.venv +.venv-intel + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ +.DS_Store + +# Environment variables +.env +.env.local +.env.*.local +.env.production +.env.development +.env.test + +# Credentials and secrets +credentials.json +service-account.json +*_credentials.json +auth.json +token.json +*_token.txt +*-token.txt +gcp-*.json +bigquery-*.json +.gcp/ + +# Logs +logs/ +*.log 
+ +# Jupyter Notebook +.ipynb_checkpoints +*.ipynb + +# Data files +data/ +.migration_backup/ +*.csv +*.parquet +*.delta + +# Delta Lake +_delta_log/ + +# Testing +.coverage +htmlcov/ +.pytest_cache/ +.tox/ + +# Documentation +docs/_build/ + +# Databricks +.databricks/ + +# Secrets +secrets/ +*.key +*.pem + +# OS +Thumbs.db +# Binary files for Docker build only (not in git) +website/static/img/communityone_card.png diff --git a/.huggingface/README.md b/.huggingface/README.md new file mode 100644 index 0000000000000000000000000000000000000000..87f182b6345f51d2a69b8c316e7028b2aa37dc1d --- /dev/null +++ b/.huggingface/README.md @@ -0,0 +1,101 @@ +--- +title: CommunityOne - Open Navigator +emoji: 🏛️ +colorFrom: blue +colorTo: green +sdk: docker +app_port: 7860 +pinned: false +license: apache-2.0 +tags: + - civic-engagement + - policy-tracking + - government-transparency + - nonprofit-discovery + - open-data +--- + +# 🏛️ CommunityOne - Open Navigator + +**Track 90,000+ jurisdictions. Monitor 1.8M nonprofits. Amplify your voice.** + +CommunityOne is a civic engagement platform that helps you discover advocacy opportunities, track policy changes, and connect with organizations working on the causes you care about. + +## ✨ Features + +- **🔍 Unified Search**: Find contacts, meetings, organizations, and causes across the entire United States +- **📊 Real-time Stats**: Track policy activity across 90,000+ cities, counties, and states +- **🏢 Nonprofit Discovery**: Explore 1.8M organizations from IRS data enriched with Every.org +- **📅 Meeting Minutes**: Search 250,000+ government meeting transcripts and agendas +- **🎯 Geographic Filtering**: Browse by state, county, or city to find local opportunities +- **🔐 OAuth Login**: Sign in with HuggingFace, GitHub, or Google to save your preferences + +## 🚀 Three Services Architecture + +This deployment runs three integrated services: + +1. **📚 Documentation** (Docusaurus) - `/docs/` +2. **🖥️ Main Application** (React + Vite) - `/` +3. 
**⚡ API Backend** (FastAPI) - `/api/` + +All services are reverse-proxied through nginx on port 7860. + +## 📖 Quick Start + +### Browse Without Login +- Click "Browse All" to explore data by state +- Use the search bar to find organizations, contacts, or causes +- Filter by location using the state/county/city selectors + +### Sign In for Personalization +- Click "Login" in the top right +- Choose your OAuth provider (HuggingFace, GitHub, or Google) +- Follow organizations, leaders, and causes you care about +- Get personalized recommendations + +### Explore the API +- Visit `/redoc` for interactive API documentation +- Try the search endpoints with state filters +- Export data in JSON format for your own projects + +## 🛠️ Technology Stack + +- **Frontend**: React 18 + TypeScript + Vite + TailwindCSS + shadcn/ui +- **Backend**: Python 3.11 + FastAPI + Pydantic +- **Data**: Delta Lake + Parquet (90GB+ of civic data) +- **Docs**: Docusaurus v3 +- **Infrastructure**: nginx + supervisor + Docker + +## 📊 Data Sources + +- **IRS BMF**: 1.8M tax-exempt organizations +- **Every.org**: Nonprofit enrichment (logos, causes, revenue) +- **Open States**: State legislators and bills (7,300+ officials) +- **Census**: Jurisdictions and boundaries (90,000+) +- **CityScrapers**: Local government meetings +- **OpenCivicData**: Standardized government data + +## 🔗 Links + +- **Repository**: [github.com/getcommunityone/open-navigator](https://github.com/getcommunityone/open-navigator) +- **Documentation**: Click "📚 Browse Documentation" on the homepage +- **API Docs**: `/redoc` endpoint +- **Website**: [www.communityone.com](https://www.communityone.com) + +## 📝 License + +Apache License 2.0 - Free for commercial and non-commercial use + +## 🤝 Contributing + +We welcome contributions! See CONTRIBUTING.md in the repository for guidelines. 
+
+## 💬 Support
+
+- **Issues**: [GitHub Issues](https://github.com/getcommunityone/open-navigator/issues)
+- **Discussions**: [GitHub Discussions](https://github.com/getcommunityone/open-navigator/discussions)
+- **Email**: hello@communityone.com
+
+---
+
+Built with ❤️ for civic engagement and government transparency.
diff --git a/.huggingface/nginx.conf b/.huggingface/nginx.conf
new file mode 100644
index 0000000000000000000000000000000000000000..258d74441e574d71b7d9387c60b00e787fc0445c
--- /dev/null
+++ b/.huggingface/nginx.conf
@@ -0,0 +1,151 @@
+worker_processes auto;
+pid /tmp/nginx.pid;
+
+events {
+    worker_connections 1024;
+}
+
+http {
+    include /etc/nginx/mime.types;
+    default_type application/octet-stream;
+
+    # Logging
+    access_log /app/logs/nginx-access.log;
+    error_log /app/logs/nginx-error.log;
+
+    # Performance
+    sendfile on;
+    tcp_nopush on;
+    tcp_nodelay on;
+    keepalive_timeout 65;
+    types_hash_max_size 2048;
+    client_max_body_size 50M;
+
+    # Compression
+    gzip on;
+    gzip_vary on;
+    gzip_types text/plain text/css application/json application/javascript text/xml application/xml application/xml+rss text/javascript;
+
+    # Send "Connection: upgrade" upstream only when the client asked for an
+    # upgrade (WebSocket); plain requests get "Connection: close".
+    map $http_upgrade $connection_upgrade {
+        default upgrade;
+        ''      close;
+    }
+
+    # This server listens on plain HTTP, so $scheme is always "http" here.
+    # Pass through the scheme reported by a fronting proxy when there is one.
+    # NOTE(review): assumes the platform proxy sets X-Forwarded-Proto - confirm.
+    map $http_x_forwarded_proto $forwarded_proto {
+        default $http_x_forwarded_proto;
+        ''      $scheme;
+    }
+
+    upstream fastapi_backend {
+        server 127.0.0.1:8000;
+    }
+
+    server {
+        listen 7860;
+        server_name _;
+
+        # Force HTTPS - HSTS header tells browsers to ALWAYS use HTTPS
+        add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always;
+
+        # Additional security headers
+        add_header X-Content-Type-Options "nosniff" always;
+        add_header X-Frame-Options "SAMEORIGIN" always;
+        add_header X-XSS-Protection "1; mode=block" always;
+
+        # add_header is inherited by a location only if that location declares
+        # no add_header itself. Static locations below therefore use `expires`
+        # alone (it emits Cache-Control: max-age) so these headers still apply.
+
+        # Documentation - serve static files built by Docusaurus.
+        # `root` is used because the location equals the tail of the path;
+        # `alias` combined with try_files resolves paths unreliably.
+        location /docs {
+            root /app/static;
+            try_files $uri $uri/ /docs/index.html;
+
+            # Cache static assets - shorter for easier updates
+            location ~* \.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2|ttf|eot)$ {
+                expires 1d;
+            }
+        }
+
+        # API backend at /api/
+        location /api/ {
+            proxy_pass http://fastapi_backend/api/;
+            proxy_http_version 1.1;
+            proxy_set_header Upgrade $http_upgrade;
+            proxy_set_header Connection $connection_upgrade;
+            proxy_set_header Host $host;
+            proxy_cache_bypass $http_upgrade;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $forwarded_proto;
+            proxy_read_timeout 300s;
+            proxy_connect_timeout 75s;
+        }
+
+        # API docs - route /api/docs to backend /docs
+        location = /api/docs {
+            proxy_pass http://fastapi_backend/docs;
+            proxy_http_version 1.1;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $forwarded_proto;
+        }
+
+        # API OpenAPI schema and ReDoc - with or without the /api/ prefix.
+        # Anchored at the end so look-alike paths such as /redoc-foo fall through.
+        location ~ ^/(api/)?(openapi\.json|redoc)/?$ {
+            proxy_pass http://fastapi_backend/$2;
+            proxy_http_version 1.1;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $forwarded_proto;
+        }
+
+        # Frontend assets - shorter cache for easier updates
+        location /assets/ {
+            alias /app/static/frontend/assets/;
+            expires 1d;
+        }
+
+        # Main frontend app at root
+        location / {
+            root /app/static/frontend;
+            try_files $uri $uri/ /index.html;
+
+            # NEVER cache index.html - force browser to check for new version.
+            # The security headers are repeated because the add_header lines
+            # here would otherwise replace the server-level ones.
+            location = /index.html {
+                add_header Cache-Control "no-cache, no-store, must-revalidate";
+                add_header Pragma "no-cache";
+                add_header Expires "0";
+                add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always;
+                add_header X-Content-Type-Options "nosniff" always;
+                add_header X-Frame-Options "SAMEORIGIN" always;
+                add_header X-XSS-Protection "1; mode=block" always;
+            }
+
+            # Cache static assets for one day - short enough for easy updates
+            location ~* \.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2|ttf|eot)$ {
+                expires 1d;
+            }
+        }
+
+        # Health check endpoint (default_type sets the Content-Type of the
+        # body produced by `return`)
+        location /health {
+            access_log off;
+            default_type text/plain;
+            return 200 "OK";
+        }
+    }
+}
diff --git a/.huggingface/start.sh b/.huggingface/start.sh
new file mode 100755
index 0000000000000000000000000000000000000000..189211a3ef2096da386bd93c93ddb87329d27428
--- /dev/null
+++ b/.huggingface/start.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+set -e
+
+echo "🚀 Starting CommunityOne on Hugging Face Spaces..."
+echo "📊 Three services behind one nginx reverse proxy:"
+echo " 1. Documentation (Docusaurus) - static files served by nginx at /docs/"
+echo " 2. Main Application (React + Vite) - static files served by nginx at /"
+echo " 3. API Backend (FastAPI) - uvicorn on port 8000, proxied at /api/"
+echo " Nginx Reverse Proxy - Port 7860 (HF Spaces public port)"
+echo ""
+
+# DEBUG: Check environment variable
+echo "🔍 Environment Check:"
+echo " HF_SPACES = ${HF_SPACES:-NOT SET}"
+if [ "$HF_SPACES" = "1" ]; then
+    echo " ✅ HF_SPACES is correctly set to 1"
+else
+    echo " ❌ WARNING: HF_SPACES is not set to 1"
+    echo " Setting HF_SPACES=1 now..."
+    export HF_SPACES=1
+fi
+echo ""
+
+# Create required directories
+mkdir -p /app/logs /app/data /var/log/supervisor
+
+# Verify static files exist
+echo "📁 Verifying static files..."
+if [ -d "/app/static/docs" ]; then
+    echo "✅ Documentation static files found"
+    ls -lh /app/static/docs/ | head -5
+else
+    echo "❌ ERROR: Documentation static files missing at /app/static/docs"
+    exit 1
+fi
+
+if [ -d "/app/static/frontend" ]; then
+    echo "✅ Frontend static files found"
+    ls -lh /app/static/frontend/ | head -5
+else
+    echo "❌ ERROR: Frontend static files missing at /app/static/frontend"
+    exit 1
+fi
+
+# Test nginx configuration
+echo "🔧 Testing nginx configuration..."
+nginx -t
+
+# Initialize database if needed
+echo "💾 Initializing database..."
+(cd /app && python -c "from api.database import init_db; init_db()") || echo "⚠️ Database init failed - continuing without it" >&2
+
+# Start all services with supervisor
+echo "🎬 Starting all services with supervisor..."
+exec /usr/bin/supervisord -c /etc/supervisor/conf.d/supervisord.conf
diff --git a/.huggingface/supervisord.conf b/.huggingface/supervisord.conf
new file mode 100644
index 0000000000000000000000000000000000000000..d90e1c942749a0d2ea3c29a17197b70a7e316260
--- /dev/null
+++ b/.huggingface/supervisord.conf
@@ -0,0 +1,30 @@
+[supervisord]
+nodaemon=true
+logfile=/dev/stdout
+logfile_maxbytes=0
+pidfile=/tmp/supervisord.pid
+; No "user=" setting: supervisord refuses to start when asked to switch users
+; while not running as root, so it runs as whatever user the container uses.
+
+[program:nginx]
+command=/usr/sbin/nginx -g "daemon off;"
+autostart=true
+autorestart=true
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
+priority=10
+
+[program:fastapi]
+; Bound to loopback: nginx (upstream 127.0.0.1:8000) is the only intended client.
+command=uvicorn api.main:app --host 127.0.0.1 --port 8000 --log-level info --proxy-headers
+directory=/app
+autostart=true
+autorestart=true
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
+environment=PYTHONUNBUFFERED="1",HF_SPACES="1"
+priority=20
diff --git a/CITATIONS.md b/CITATIONS.md
new file mode 100644
index 0000000000000000000000000000000000000000..b25edae392d46813a5b891010c65bff5dbf06b53
--- /dev/null
+++ b/CITATIONS.md
@@ -0,0 +1,1474 @@
+# Citations and Acknowledgments
+
+This project uses several open datasets and research contributions. Please cite the following works when using or referencing this project.
+
+## 📚 **Datasets**
+
+### **MeetingBank Dataset**
+
+We use the MeetingBank benchmark dataset for meeting summarization and analysis.
+
+**Citation:**
+```
+Yebowen Hu, Tim Ganter, Hanieh Deilamsalehy, Franck Dernoncourt, Hassan Foroosh, Fei Liu.
+"MeetingBank: A Benchmark Dataset for Meeting Summarization"
+In Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (ACL),
+July 2023, Toronto, Canada.
+``` + +**BibTeX:** +```bibtex +@inproceedings{hu-etal-2023-meetingbank, + title = "MeetingBank: A Benchmark Dataset for Meeting Summarization", + author = "Yebowen Hu and Tim Ganter and Hanieh Deilamsalehy and Franck Dernoncourt and Hassan Foroosh and Fei Liu", + booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (ACL)", + month = jul, + year = "2023", + address = "Toronto, Canada", + publisher = "Association for Computational Linguistics", +} +``` + +**Resources:** +- Paper: https://arxiv.org/abs/2305.17529 +- Dataset: https://huggingface.co/datasets/huuuyeah/meetingbank +- Zenodo: https://zenodo.org/record/7989108 + +**What we use:** +- 1,366 city council meetings from 6 U.S. cities +- Meeting transcripts and summaries +- Used for: Meeting discovery, transcript analysis, summarization benchmarking + +--- + +## 🗂️ **Other Data Sources** + +### **U.S. Census Bureau** +- Geographic boundaries and demographic data +- Source: https://www.census.gov/ +- License: Public Domain (U.S. 
Government) + +### **Open States / Plural Policy** ⭐ +- Comprehensive state and local legislative information +- Organization: Plural Policy (formerly Open States Foundation) +- Source: https://openstates.org/ +- API: https://openstates.org/api/ +- Data Downloads: https://open.pluralpolicy.com/data/ +- License: Various (check per state) +- API Key: Required for access (free tier: 50,000 requests/month) + +**Coverage:** +- All 50 states + DC + Puerto Rico +- 7,300+ state legislators +- Millions of bills, votes, and legislative sessions +- Monthly PostgreSQL database dumps (9.8GB+) + +**What we use:** +- Bulk legislative session downloads (CSV/JSON/PostgreSQL) +- State legislator data with committee assignments +- Bill tracking and voting records +- Legislative video sources (YouTube channels, Granicus portals) + +**Resources:** +- Open Data: https://open.pluralpolicy.com/data/ +- Scrapers Repository: https://github.com/openstates/openstates-scrapers +- Local Database Setup: https://docs.openstates.org/contributing/local-database/ +- Code of Conduct: https://docs.openstates.org/code-of-conduct/ +- Schema Documentation: https://github.com/openstates/people/blob/master/schema.md + +**BibTeX:** +```bibtex +@software{openstates, + title = {Open States}, + author = {{Plural Policy}}, + year = {2024}, + url = {https://openstates.org/}, + note = {Comprehensive state legislative data for all 50 U.S. states} +} +``` + +**Potential Contributions:** +- Our scraper patterns could be contributed to openstates-scrapers +- Video source discovery could enhance their data +- We follow their Code of Conduct for all contributions + +### **LegiScan** ⭐ +- Comprehensive legislative tracking and bill text database +- Organization: LegiScan LLC +- Source: https://legiscan.com/ +- API: https://legiscan.com/legiscan +- License: API access requires subscription (free tier available with limitations) +- Coverage: All 50 states + U.S. Congress + Washington D.C. 
+- API Key: Required for access (free tier: limited requests) + +**Coverage:** +- Real-time legislative tracking for all U.S. states and Congress +- Full bill text, amendments, and legislative documents +- Roll call votes and voting records +- Committee assignments and hearings +- Bill status tracking and history +- Sponsor and co-sponsor information +- Bill text in PDF, HTML, and plain text formats + +**What we use:** +- Bill text downloads and full-text search +- Legislative document archives +- Bill status and tracking data +- Voting records and roll calls +- Supplement to Open States data for missing jurisdictions +- Historical legislative data back to 2011 + +**API Features:** +- GetBillText: Retrieve full bill text in multiple formats +- GetBill: Detailed bill metadata and status +- GetRollCall: Voting records with legislator positions +- GetSponsor: Sponsor and co-sponsor information +- Search: Full-text search across all bills +- GetDatasetList: Bulk dataset downloads + +**Resources:** +- API Documentation: https://legiscan.com/legiscan +- Dataset Downloads: https://legiscan.com/datasets +- Search Interface: https://legiscan.com/gaits/search +- State Coverage: https://legiscan.com/legiscan/states + +**BibTeX:** +```bibtex +@software{legiscan, + title = {LegiScan}, + author = {{LegiScan LLC}}, + year = {2024}, + url = {https://legiscan.com/}, + note = {Comprehensive legislative tracking and bill text database covering all 50 U.S. 
states and Congress} +} +``` + +**Complementary to Open States:** +- LegiScan provides bill text in multiple formats (PDF, HTML, plain text) +- Historical data back to 2011 for all states +- Real-time updates and notifications +- More comprehensive document archives +- Paid API provides higher rate limits and bulk downloads +- Use LegiScan for bill text analysis, Open States for structured legislative data + +### **Harvard Dataverse** +- Meeting datasets and civic engagement research +- Source: https://dataverse.harvard.edu/ +- License: Varies by dataset + +### **City Scrapers** ⭐ +- Open source civic tech project for scraping local government meetings +- Organization: Documenters.org / City Bureau +- Source: https://cityscrapers.org/ +- GitHub: https://github.com/city-scrapers +- License: MIT License (open source) +- Coverage: Chicago, Pittsburgh, Detroit, Cleveland, Los Angeles (250+ government agencies) +- What we use: Validated meeting URLs, Legistar/Granicus platform endpoints, spider code for scraper patterns +- Used for: Meeting discovery, URL extraction, platform detection, scraper validation + +**City Scrapers Repositories:** +- Chicago: https://github.com/city-scrapers/city-scrapers (~100 agencies) +- Pittsburgh: https://github.com/city-scrapers/city-scrapers-pitt (~30 agencies) +- Detroit: https://github.com/city-scrapers/city-scrapers-detroit (~40 agencies) +- Cleveland: https://github.com/city-scrapers/city-scrapers-cle (~30 agencies) +- Los Angeles: https://github.com/city-scrapers/city-scrapers-la (~50 agencies) + +**BibTeX:** +```bibtex +@software{city_scrapers, + title = {City Scrapers}, + author = {{Documenters.org}}, + year = {2024}, + url = {https://cityscrapers.org/}, + note = {Open source civic tech project providing validated scrapers for local government meetings across major U.S. 
cities} +} +``` + +### **Google Civic Information API** ⭐ +- Government officials, polling locations, and election data +- Organization: Google LLC +- API Documentation: https://developers.google.com/civic-information +- License: Free (with quota limits) +- Rate Limit: 25,000 requests/day (free tier) +- Coverage: U.S. federal, state, and local government officials; polling locations; election data +- What we use: Elected officials by address, representative contact info, voting districts +- Used for: Contact discovery, official verification, civic engagement tools + +**API Endpoints Used:** +- Representatives by Address: Get all elected officials for a given address +- Elections: Voter information, polling locations, ballot information +- Divisions: Geographic/political divisions (OCD-IDs) + +**BibTeX:** +```bibtex +@misc{google_civic_api, + title = {Google Civic Information API}, + author = {{Google LLC}}, + year = {2024}, + url = {https://developers.google.com/civic-information}, + note = {API providing government official contact information, election data, and polling locations} +} +``` + +**Terms of Service:** +- Attribution required when displaying official data +- Caching limited to 30 days +- Must comply with Google API Terms of Service + +### **YouTube Data API v3** ⭐ +- Video metadata, channel information, and search for government meetings +- Organization: Google LLC +- API Documentation: https://developers.google.com/youtube/v3 +- License: Free (with quota limits) +- Rate Limit: 10,000 units/day (free tier), search costs 100 units per request +- Coverage: Global video platform with millions of government channels +- What we use: Government channel discovery, meeting video metadata, transcript availability +- Used for: Video discovery, channel statistics, meeting video archival + +**API Features Used:** +- Search: Find government channels by jurisdiction name +- Channels: Get channel metadata, subscriber counts, video counts +- Videos: Metadata including 
title, description, upload date, duration +- Captions: Check for closed caption/transcript availability + +**BibTeX:** +```bibtex +@misc{youtube_data_api, + title = {YouTube Data API v3}, + author = {{Google LLC}}, + year = {2024}, + url = {https://developers.google.com/youtube/v3}, + note = {API for accessing YouTube video metadata, channel information, and search functionality} +} +``` + +**Terms of Service:** +- YouTube API Services Terms: https://developers.google.com/youtube/terms/api-services-terms-of-service +- Attribution required with YouTube logo +- Quota limits enforced (10,000 units/day free) +- Video embeds must use official YouTube player + +### **Ballotpedia** ⭐ +- Ballot measures, referendums, and propositions +- Organization: Lucy Burns Institute +- Source: https://ballotpedia.org/ +- API: https://ballotpedia.org/API-documentation +- License: API access is limited at scale (paid tier available) +- Coverage: All 50 states, historical measures back to 1990s +- Used for: Tracking fluoridation votes, school bond measures, health policy propositions + +### **MIT Election Data + Science Lab** +- Presidential, Congressional, and gubernatorial election results +- Organization: Massachusetts Institute of Technology +- Source: https://electionlab.mit.edu/data +- Repository: https://github.com/MEDSL/official-returns +- License: Free for research and commercial use +- Coverage: 1976-present, county-level results +- Used for: Political composition analysis, jurisdiction context + +### **OpenElections** +- State-by-state certified election results in standardized CSV format +- Source: https://openelections.net/ +- GitHub: https://github.com/openelections +- License: Open source (various by state) +- Coverage: All 50 states (various completion levels), precinct-level data +- Used for: Detailed election results, local race outcomes, advocacy targeting + +### **Open Civic Data (OCD) Standards** +- Division identifiers and civic data standards +- Specification: 
https://open-civic-data.readthedocs.io/en/latest/proposals/0002.html +- Repository: https://github.com/opencivicdata/ocd-division-ids +- License: Open source +- Used for: Standardized jurisdiction identifiers, cross-platform compatibility + +### **Popolo Project** +- International open government data specification for people, organizations, and elected positions +- Specification: https://www.popoloproject.com/ +- GitHub: https://github.com/popolo-project/popolo-spec +- Documentation: http://www.popoloproject.com/specs/ +- License: Creative Commons Attribution 4.0 International + +### **BillMap** ⭐ +- Tracks bill text similarity across all 50 U.S. states to identify copy-paste legislation and model bill influence +- Organization: Sunlight Foundation / @unitedstates community +- Repository: https://github.com/unitedstates/BillMap +- Research: Anderson et al., "Detecting Policy Influence in Legislatures" (2019) +- Paper: https://arxiv.org/abs/1906.03699 +- Live Demo: https://billmap.cs.princeton.edu/ +- License: Open source +- Coverage: All 50 states, tracks legislative text diffusion across jurisdictions +- Used for: Identifying model legislation, tracking policy influence, finding similar bills across states +- Method: Text similarity analysis, n-gram matching, bill text alignment + +**What we use:** +- Bill similarity detection algorithms +- Model legislation tracking methodology +- Cross-state policy diffusion analysis +- Legislative text comparison techniques + +**BibTeX:** +```bibtex +@article{anderson2019billmap, + title = {Detecting Policy Influence in Legislatures}, + author = {Anderson, Evan and Fowler, Anthony and Grossmann, Matt and Sahn, Alexander and Shiraito, Yuki}, + journal = {arXiv preprint arXiv:1906.03699}, + year = {2019}, + url = {https://arxiv.org/abs/1906.03699} +} +``` + +### **@unitedstates Images Repository** ⭐ +- High-resolution photos of all U.S. 
Congress members (past and present) +- Organization: @unitedstates community (Sunlight Foundation legacy project) +- Repository: https://github.com/unitedstates/images +- CDN: https://theunitedstates.io/images/congress/ +- License: Public domain (government photos) +- Coverage: All U.S. Senators and Representatives (1789-present), updated regularly +- Image Format: JPEG, multiple resolutions (original, 450x550, 225x275) +- Used for: Legislator profile photos, visual identification, representative directories + +**Image URL Format:** +``` +https://theunitedstates.io/images/congress/original/[bioguide_id].jpg +https://theunitedstates.io/images/congress/450x550/[bioguide_id].jpg +https://theunitedstates.io/images/congress/225x275/[bioguide_id].jpg +``` + +**Example:** +``` +https://theunitedstates.io/images/congress/original/P000197.jpg +(Nancy Pelosi, bioguide_id: P000197) +``` + +**What we use:** +- Legislator profile photos for federal representatives +- Visual identification in advocacy tools +- Representative directories and contact pages +- Cross-referenced with Open States data using bioguide IDs + +**Related Projects:** +- **congress-legislators**: https://github.com/unitedstates/congress-legislators (YAML data files) +- **congress**: https://github.com/unitedstates/congress (scraping tools) +- **districts**: https://github.com/unitedstates/districts (GeoJSON boundaries) + +--- + +## 💰 **Nonprofit Financial Data** + +### **GivingTuesday 990 Data Infrastructure** ⭐ + +We use the GivingTuesday 990 Data Lake for detailed nonprofit financial data from IRS Form 990 XML filings. 
+ +**Organization:** GivingTuesday +**Website:** https://990data.givingtuesday.org/ +**Data Lake:** `s3://gt990datalake-rawdata` (AWS S3, us-east-1 Virginia, Public Access) +**Console:** https://us-east-1.console.aws.amazon.com/s3/buckets/gt990datalake-rawdata +**License:** Public domain (IRS data) + Open source tools +**Access:** Free, no AWS credentials required (anonymous access via `--no-sign-request`) + +**What we use:** +- **Raw 990 XMLs**: Individual e-filed Form 990 returns in XML format (1-2 MB each) +- **Indices**: CSV/Parquet files listing all available 990s with metadata +- **Coverage**: 5.4M+ e-filed Form 990s (2011-present, ~300K new filings/year) +- **Scale**: ~10 TB of raw XML data +- **Data extracted**: Revenue, expenses, assets, liabilities, grants, programs, officer compensation, mission statements, website URLs + +**Data Lake Structure:** +``` +s3://gt990datalake-rawdata/ +├── EfileData/ +│ ├── XmlFiles/ # Individual 990 XMLs (~5.4M files, ~10 TB) +│ │ └── [OBJECT_ID]_public.xml (e.g., 202233259349300703_public.xml) +│ └── XmlZips/ # ZIP archives (97 files, ~38 GB → ~95 GB uncompressed) +│ └── YYYY_TEOS_XML_*.zip (e.g., 2023_TEOS_XML_01A.zip ~400 MB) +└── Indices/ + └── 990xmls/ # CSV indices with metadata + └── index_all_years_efiledata_xmls_created_on_2023-10-29.csv (~925 MB) +``` + +**Download Strategies:** + +| Approach | Best For | Time | Bandwidth | Storage | +|----------|----------|------|-----------|---------| +| **Individual XMLs** | Single state or targeted download | ~2 hrs (22K orgs) | 32 GB | 32 GB | +| **ZIP Archives** | All states / nationwide | ~6 hrs total | 38 GB | 95 GB | + +**Choose Individual XMLs when:** +- You need data for 1-5 states only +- You want to download only specific EINs +- Storage space is limited +- You want incremental caching (download as needed) + +**Choose ZIP Archives when:** +- You need all 50 states +- You're building a comprehensive nonprofit database +- You have 100+ GB storage +- You want offline 
access to all filings + +**S3 Access Examples:** + +**Individual XMLs (for single state or targeted download):** +```bash +# List index files (no credentials needed) +aws s3 ls s3://gt990datalake-rawdata/Indices/990xmls/ --no-sign-request + +# Download index (~925 MB) +aws s3 cp s3://gt990datalake-rawdata/Indices/990xmls/index_all_years_efiledata_xmls_created_on_2023-10-29.csv . --no-sign-request + +# Download specific XML +aws s3 cp s3://gt990datalake-rawdata/EfileData/XmlFiles/202233259349300703_public.xml . --no-sign-request + +# Batch download for single state (using our script) +python scripts/batch_download_990s.py --state MA --health-only --concurrent 1000 +``` + +**ZIP Archives (for all states / nationwide):** +```bash +# Download all 97 ZIPs (~38 GB) to local directory +./scripts/download_990_zips.sh + +# Extract all ZIPs to get ~384K XMLs (~95 GB) +./scripts/extract_990_zips.sh + +# Build local index for fast lookup +python scripts/build_990_local_index.py + +# Now enrich from local files (no network needed!) +python scripts/enrich_all_states_990.py +``` + +**Index Schema:** +The CSV index contains columns: `EIN`, `TaxPeriod`, `ObjectId`, `URL`, `FormType`, `OrganizationName`, `DLN`, `SubmittedOn` + +**Python Access:** +```python +import boto3 +from botocore import UNSIGNED +from botocore.config import Config + +# Configure anonymous S3 client +s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED)) + +# Download XML +xml_obj = s3.get_object( + Bucket='gt990datalake-rawdata', + Key='EfileData/XmlFiles/202233259349300703_public.xml' +) +xml_content = xml_obj['Body'].read() +``` + +**BibTeX:** +```bibtex +@misc{givingtuesday990data, + title = {GivingTuesday 990 Data Infrastructure}, + author = {{GivingTuesday}}, + year = {2023}, + url = {https://990data.givingtuesday.org/}, + note = {Collaborative data lake of standardized IRS Form 990 XML filings} +} +``` + +**Attribution:** When publishing analyses using this data, please cite both: +1. 
GivingTuesday 990 Data Infrastructure: https://990data.givingtuesday.org/ +2. Our enrichment tools: https://github.com/getcommunityone/open-navigator-for-engagement + +--- + +### **Google Cloud Public Datasets: IRS 990** ⭐ + +Google hosts the complete IRS Form 990 dataset in BigQuery for fast SQL-based querying. + +**Platform:** Google Cloud BigQuery +**Dataset:** `bigquery-public-data.irs_990` +**Table:** `bigquery-public-data.irs_990.irs_990_xml` +**Documentation:** https://console.cloud.google.com/marketplace/product/internal-revenue-service/irs-990 +**Cost:** First 1 TB of queries per month is **FREE** +**Coverage:** All e-filed Form 990s (2011-present, 5M+ records) + +**What we use:** +- **Mission statements**: Extracted from `return_header` or `part_i_mission_desc` fields +- **Website URLs**: Found in `website_address_txt` field +- **Financial data**: All Form 990 fields accessible via SQL +- **Fast bulk queries**: Extract data for 1M+ orgs in seconds (vs hours downloading XMLs) + +**Advantages:** +- ✅ No local XML downloads needed +- ✅ Single SQL query to bulk-extract fields +- ✅ Serverless (no infrastructure to manage) +- ✅ Fast (queries complete in seconds) +- ✅ Free tier covers most research use cases + +**Example Query:** +```sql +SELECT + ein, + org_name, + website_address_txt, + part_i_mission_desc, + total_revenue_current_year, + total_expenses_current_year +FROM `bigquery-public-data.irs_990.irs_990_2023` +WHERE state = 'AL' + AND ntee_code LIKE 'E%' +LIMIT 1000; +``` + +**BibTeX:** +```bibtex +@misc{googlecloud_irs990, + title = {IRS 990 Public Dataset}, + author = {{Google Cloud Public Datasets}}, + year = {2024}, + publisher = {Google Cloud Platform}, + url = {https://console.cloud.google.com/marketplace/product/internal-revenue-service/irs-990}, + note = {BigQuery public dataset of IRS Form 990 e-file data} +} +``` + +**Attribution:** When using BigQuery 990 data, cite: +1. IRS 990 Public Dataset (Google Cloud) +2. 
Internal Revenue Service (original data source) + +--- + +### **National Center for Charitable Statistics (NCCS) Unified BMF** ⭐ + +The NCCS Unified BMF is a longitudinal nonprofit dataset specifically designed for AI and "Lakehouse" projects with pre-geocoded locations and Census integration. + +**Organization:** National Center for Charitable Statistics (NCCS), Urban Institute +**Website:** https://nccs.urban.org/ +**Dataset:** Unified BMF (Business Master File) +**Documentation:** https://nccs.urban.org/project/irs-exempt-organizations-business-master-file +**License:** Public domain (IRS data) + Urban Institute terms +**Released:** Late 2025/Early 2026 +**Coverage:** 1989 through mid-2025 (update pending) + +**What we use:** +- **Longitudinal tracking**: Single file with one row per organization that has ever held tax-exempt status +- **Pre-geocoded addresses**: Most recent address geocoded to Census block level +- **Geographic codes**: FIPS codes at block, tract, county, and state levels +- **Metropolitan area codes**: Current CBSA (Core Based Statistical Area) definitions +- **Temporal tracking**: `ORG_YEAR_FIRST` and `ORG_YEAR_LAST` variables for organization lifecycle +- **Census integration**: Ready for merging with Census demographic and economic data + +**Key Features:** +- ✅ **Eliminates annual file merging**: Consolidates all historical BMF releases into single file +- ✅ **AI/Lakehouse optimized**: Designed for modern data infrastructure +- ✅ **Census-ready**: FIPS codes enable direct joins with Census data +- ✅ **Metropolitan vs Rural**: CBSA codes identify urban/rural areas +- ✅ **Historical analysis**: Track organizations over time without complex ETL +- ✅ **Geographic analysis**: Pre-geocoded to Census block granularity + +**Use Cases:** +- Longitudinal analysis of nonprofit sector +- Building historical sampling frames +- Linking nonprofit data to Census demographics +- Metropolitan vs rural nonprofit analysis +- Policy research requiring 
geographic precision +- Time-series analysis of organizational entry/exit + +**Geographic Levels Available:** +- Census Block (finest granularity) +- Census Tract +- County (FIPS codes) +- State (FIPS codes) +- CBSA (Core Based Statistical Area) + +**Related Resources:** +- NCCS Census Crosswalk: For aggregating to additional geographic levels +- BMF Overview: https://nccs.urban.org/project/irs-exempt-organizations-business-master-file +- NCCS Data Archive: https://nccs.urban.org/nccs-data-archive + +**BibTeX:** +```bibtex +@dataset{nccs_unified_bmf, + title = {Unified Business Master File (BMF)}, + author = {{National Center for Charitable Statistics}}, + year = {2026}, + publisher = {Urban Institute}, + url = {https://nccs.urban.org/}, + note = {Longitudinal nonprofit dataset with pre-geocoded Census integration, 1989-2025} +} +``` + +**Attribution:** When using NCCS Unified BMF data, cite: +1. National Center for Charitable Statistics, Urban Institute +2. IRS Business Master File (original data source) +3. Specify the data vintage/update date used + +--- + +### **Charity Navigator** ⭐ + +**Powered by Charity Navigator** + +We use the Charity Navigator GraphQL API to enrich nonprofit profiles with star ratings, mission statements, and organizational metrics. + +**Organization:** Charity Navigator, Inc. 
+**Website:** https://www.charitynavigator.org +**API Documentation:** https://www.charitynavigator.org/partner/api +**Principal Office:** 299 Market Street, Suite 250, Saddle Brook, NJ 07663 +**License:** API Terms of Use (Last updated March 2025) +**Rate Limit:** 1,000 API calls per day + +**What we use:** +- **Charity Ratings**: Encompass Star Rating (0-4 stars) +- **Mission Statements**: Organization mission and purpose +- **Website URLs**: Official organization websites +- **Organizational Data**: EIN, name, address, category, cause +- **Active Advisories**: Alerts about organization status +- **Encompass Score**: Overall rating score +- **Rating Publication Date**: When the rating was last updated + +**Data Fields Accessed:** +``` +- Employer Identification Number (EIN) +- Charity Name +- Mission +- Organization Website URL +- Charity Navigator URL +- Category & Cause +- Street Address, City, State, Zip, Country +- Active Advisories +- Encompass Score & Star Rating +- Encompass Rating Publication Date & ID +``` + +**Attribution Requirements:** +- **Text Credit:** "Powered by Charity Navigator" (displayed on pages using their data) +- **Source Citation:** Charity Navigator cited as source on all pages displaying their data +- **Linkbacks:** All charity data links back to corresponding Charity Navigator profile pages +- **Trademark Notice:** CHARITY NAVIGATOR and the CHARITY NAVIGATOR logo are registered trademarks of Charity Navigator. All rights reserved. Used with permission. 
+ +**BibTeX:** +```bibtex +@misc{charitynavigator_api, + title = {Charity Navigator API}, + author = {{Charity Navigator, Inc.}}, + year = {2025}, + url = {https://www.charitynavigator.org}, + note = {GraphQL API providing nonprofit ratings, mission statements, and organizational data} +} +``` + +**Compliance:** +This project complies with Charity Navigator's API Terms of Use, including: +- Rate limit compliance (max 1,000 calls/day) +- Proper attribution and branding +- Linkbacks to Charity Navigator profile pages +- Trademark acknowledgment +- Data caching for performance only (not for redistribution) + +**Example Profile Link Format:** +```html +<a href="https://www.charitynavigator.org/ein/134141945"> + Michael J. Fox Foundation for Parkinson's Research +</a> +``` + +**Related Tools:** +- [Nonprofit enrichment script](scripts/enrich_nonprofits_charitynavigator.py) (if created) +- [API integration documentation](website/docs/data-sources/charity-navigator.md) (if created) + +--- + +### **OpenSecrets.org (Center for Responsive Politics)** ⭐ + +**Organization:** OpenSecrets, a nonpartisan research organization tracking money in U.S. 
politics +**Website:** https://www.opensecrets.org +**Bulk Data:** https://www.opensecrets.org/open-data/bulk-data +**API Documentation:** https://www.opensecrets.org/open-data/api +**Status:** Bulk data access pending approval + +**What they offer:** +- **Campaign Finance Data**: Federal campaign contributions, expenditures, and fundraising +- **Lobbying Data**: Federal lobbying spending by organizations and industries +- **Political Action Committees (PACs)**: PAC contributions and expenditures +- **Personal Finance Disclosures**: Wealth and financial interests of federal lawmakers +- **501(c) Organizations**: Political spending by nonprofits and dark money groups +- **Foreign Lobby Influence**: Foreign agents registered under FARA + +**Data Access:** +- **Bulk Data Downloads**: Available to nonprofits upon approval (application pending) +- **Public API**: Available with rate limits for smaller queries +- **Data Format**: CSV files with detailed transaction-level records +- **Update Frequency**: Regular updates as new filings are processed +- **Coverage**: Federal-level political finance data (1990-present) + +**What we plan to use:** +- Nonprofit political spending and advocacy activity +- Lobbying expenditures by healthcare and oral health organizations +- Campaign contributions from dental associations and health policy groups +- 501(c)(4) "dark money" spending on ballot measures +- Cross-reference EINs with IRS nonprofit data for comprehensive profiles + +**BibTeX:** +```bibtex +@misc{opensecrets, + title = {OpenSecrets.org: Money in Politics Database}, + author = {{Center for Responsive Politics}}, + year = {2024}, + url = {https://www.opensecrets.org}, + note = {Comprehensive database of campaign finance, lobbying, and political spending in U.S. 
politics} +} +``` + +**License & Attribution:** +- Data collected from Federal Election Commission (FEC) and other public sources +- Attribution required: "Data from OpenSecrets.org, a project of the Center for Responsive Politics" +- Nonprofit bulk data access subject to approval and terms of use + +**Application Status:** +- ⏳ Bulk data access application pending approval +- Will enable comprehensive analysis of nonprofit political activity +- Integration planned upon approval + +--- + +### **IRS Exempt Organizations Business Master File (EO-BMF)** + +Basic nonprofit registration data (name, EIN, address, NTEE code). + +**Details:** +- Complete database of 1.9M+ U.S. tax-exempt organizations +- Organization: Internal Revenue Service (IRS) +- Source: https://www.irs.gov/charities-non-profits/exempt-organizations-business-master-file-extract-eo-bmf +- Download: https://www.irs.gov/pub/irs-soi/ (4 regional CSV files) +- Format: CSV (basic organizational data: name, EIN, address, NTEE code, etc.) +- Update frequency: Monthly +- License: Public Domain (U.S. Government data) +- Coverage: All registered tax-exempt organizations under sections 501(c)(3), 501(c)(4), etc. +- Used for: Nonprofit discovery, organization matching, NTEE categorization + +**Note:** This is the **Business Master File** (basic info). For detailed financial data, see IRS Form 990 XML below. 
+ +### **IRS Form 990 XML Filings** ⭐ +- Detailed financial filings from nonprofit tax returns +- Organization: Internal Revenue Service (IRS) +- Source: https://www.irs.gov/charities-non-profits/form-990-series-downloads +- Format: XML (highly detailed financial and operational data) +- Parser Tools: **Giving Tuesday** open source libraries + - XML Parser: https://github.com/Giving-Tuesday/form-990-xml-parser + - XML Mapper: https://github.com/Giving-Tuesday/form-990-xml-mapper +- AWS S3 Index: https://registry.opendata.aws/irs990/ +- License: Public Domain (U.S. Government data) +- Coverage: Annual filings from organizations with >$50K revenue +- Data includes: Detailed revenue, expenses, program services, officer compensation, grants, donors +- Used for: Financial analysis, transparency, grant research, program evaluation + +**Giving Tuesday Attribution:** +The Giving Tuesday Data Commons provides essential tools for parsing IRS Form 990 XML data: +```bibtex +@software{giving_tuesday_form990_parser, + title = {Form 990 XML Parser}, + author = {{Giving Tuesday}}, + year = {2024}, + url = {https://github.com/Giving-Tuesday/form-990-xml-parser}, + note = {Open source Python library for parsing IRS Form 990 XML filings} +} + +@software{giving_tuesday_form990_mapper, + title = {Form 990 XML Mapper}, + author = {{Giving Tuesday}}, + year = {2024}, + url = {https://github.com/Giving-Tuesday/form-990-xml-mapper}, + note = {Maps Form 990 XML to standardized data structures} +} +``` + +**More Giving Tuesday Resources:** +- GitHub Organization: https://github.com/Giving-Tuesday +- Data Commons: https://www.givingtuesday.org/data-commons +- Research & Insights: https://www.givingtuesday.org/research + +### **Popolo Project** +- Open government data specifications +- Source: https://www.popoloproject.com/ +- Coverage: Standardized schemas for Person, Organization, Membership, Post, Area, Motion, VoteEvent, Count +- Used for: Leader/official data modeling, organization structure, membership tracking, voting records +- Adoption: Used by Civic Commons, OpenNorth, mySociety, Sunlight 
Foundation, and 30+ civic tech organizations worldwide +- Citation: "Popolo Project. Open government data specifications. https://www.popoloproject.com/" +- **Key Features:** + - **Person**: Names, contact details, identifiers, links to images/sources + - **Organization**: Names, classification, founding/dissolution dates, contact information + - **Membership**: Relationship between persons and organizations (with roles and time periods) + - **Post**: Positions within organizations (e.g., "Mayor", "City Council Member District 3") + - **VoteEvent**: Votes on motions/bills with individual voter positions +- **Our Implementation**: LEADER and ORGANIZATION entities follow Popolo schema for maximum interoperability with civic tech platforms + +**Popolo Dependencies & Standards:** +The Popolo specification builds upon and references the following W3C, IETF, and open data standards: + +| Publisher | Specification | Prefix | Use in Popolo | URL | +|-----------|---------------|--------|---------------|-----| +| Bibliographic Framework Initiative | BIBFRAME Vocabulary | `bf` | Bibliographic references | https://www.loc.gov/bibframe/ | +| Ian Davis | BIO: Biographical Information | `bio` | Life events, relationships | http://purl.org/vocab/bio/0.1/ | +| W3C | Contact: Utility concepts | `con` | Contact information | http://www.w3.org/2000/10/swap/pim/contact# | +| DCMI | DCMI Metadata Terms | `dcterms` | Metadata, provenance | https://www.dublincore.org/specifications/dublin-core/dcmi-terms/ | +| FOAF Project | FOAF Vocabulary | `foaf` | People, social networks | http://xmlns.com/foaf/0.1/ | +| GeoNames | GeoNames Ontology | `gn` | Geographic names | http://www.geonames.org/ontology/ | +| ISA Programme | Location Core Vocabulary | `locn` | Addresses, locations | https://www.w3.org/ns/locn | +| OSCA Foundation | NEPOMUK Calendar Ontology | `ncal` | Events, meetings | http://www.semanticdesktop.org/ontologies/ncal/ | +| Open Data Institute | Open Data Rights Statement | `odrs` 
| Data licensing | http://schema.theodi.org/odrs | +| W3C | The Organization Ontology | `org` | Organizational structures | https://www.w3.org/TR/vocab-org/ | +| ISA Programme | Person Core Vocabulary | `person` | Person attributes | http://www.w3.org/ns/person | +| W3C | RDF Schema | `rdfs` | Semantic web foundation | https://www.w3.org/TR/rdf-schema/ | +| W3C | Schema.org | `schema` | Structured data | https://schema.org/ | +| W3C | SKOS | `skos` | Taxonomies, classification | https://www.w3.org/2004/02/skos/ | +| IETF | vCard Format | `vcard` | Contact information | https://www.rfc-editor.org/rfc/rfc6350.html | + +**Popolo Classes Implemented:** +- ✅ **Person** → LEADER entity (elected officials, appointees) +- ✅ **Organization** → ORGANIZATION entity (nonprofits, government agencies) +- ✅ **Membership** → Implicit through leader_id/organization relationships +- ✅ **Post** → position_type, office fields in LEADER +- ✅ **Contact Detail** → email, phone, website fields +- ✅ **Motion** → AGENDA items, LEGISLATION entities +- ✅ **Vote Event** → VOTE entity +- ✅ **Count** → vote_yes, vote_no in VOTE and LEGISLATION +- ✅ **Area** → JURISDICTION entity (geographic/political boundaries) +- ✅ **Event** → MEETING entity +- ✅ **Speech** → Extracted from MINUTES, VIDEO transcripts + +### **Roper Center for Public Opinion Research** +- Scientifically validated survey questions and public opinion data +- Organization: Cornell University +- Source: https://ropercenter.cornell.edu/ +- iPoll Database: https://ropercenter.cornell.edu/ipoll/ +- License: Free public search (metadata and question wording), full data requires institutional membership +- Coverage: 500,000+ survey questions from 1930s-present, all major polling organizations +- Used for: Topic definitions, validated question wording, national opinion baselines, messaging optimization +- Citation: "Roper Center for Public Opinion Research, Cornell University. iPoll Databank. 
https://ropercenter.cornell.edu/ipoll/" + +### **Google Fact Check Tools API** +- Aggregated fact-checking data with ClaimReview structured data +- Organization: Google LLC +- Source: https://toolbox.google.com/factcheck/explorer +- API: https://developers.google.com/fact-check/tools/api +- Schema: https://developers.google.com/search/docs/appearance/structured-data/factcheck +- License: Free API with quota (10,000 queries/day) +- Coverage: 100+ fact-checking organizations worldwide, all claim types +- Used for: Verifying claims from meetings/legislation, tracking misinformation, accountability scoring +- Citation: "Google Fact Check Tools API. Google LLC. https://developers.google.com/fact-check/tools/api" + +### **FactCheck.org** +- Nonpartisan fact-checking of political claims and viral misinformation +- Organization: Annenberg Public Policy Center, University of Pennsylvania +- Source: https://www.factcheck.org/ +- License: Free (web scraping allowed with rate limiting) +- Coverage: National politics, health claims, science, viral content (2003-present) +- Used for: Verifying political claims, health policy fact-checking, scientific claim verification +- Citation: "FactCheck.org. Annenberg Public Policy Center, University of Pennsylvania. https://www.factcheck.org/" + +### **PolitiFact** +- Pulitzer Prize-winning fact-checking with Truth-O-Meter ratings +- Organization: Poynter Institute +- Source: https://www.politifact.com/ +- License: Free (web scraping allowed with rate limiting) +- Coverage: All 50 states, federal politics, ballot measures (2007-present) +- Rating Scale: 6-point (True, Mostly True, Half True, Mostly False, False, Pants on Fire) +- Used for: State-level fact-checking, tracking politician claims, ballot measure verification +- Citation: "PolitiFact. Poynter Institute. 
https://www.politifact.com/" + +### **Schema.org** +- Structured data vocabulary for semantic web markup +- Organization: W3C Community Group (sponsors: Google, Microsoft, Yahoo, Yandex) +- Source: https://schema.org/ +- Documentation: https://schema.org/docs/schemas.html +- License: Creative Commons Attribution-ShareAlike License (CC BY-SA 3.0) +- Coverage: 800+ types, 1,400+ properties for describing web content +- Used for: SEO-optimized structured data, JSON-LD exports, API documentation, search engine compatibility +- Citation: "Schema.org. W3C Community Group. https://schema.org/" + +**Our Schema.org Type Mappings:** + +| Our Entity | Schema.org Type | Properties Used | Use Case | +|------------|----------------|-----------------|----------| +| JURISDICTION | [AdministrativeArea](https://schema.org/AdministrativeArea) | name, address, geo, telephone, url | City/county geographic data | +| MEETING | [Event](https://schema.org/Event) + [GovernmentService](https://schema.org/GovernmentService) | name, startDate, location, organizer, description | Public meetings, hearings | +| LEADER | [Person](https://schema.org/Person) + [GovernmentOfficial](https://schema.org/GovernmentOfficial) | name, email, telephone, jobTitle, worksFor | Elected officials | +| ORGANIZATION | [Organization](https://schema.org/Organization) + [NGO](https://schema.org/NGO) | name, address, telephone, url, foundingDate | Nonprofits, agencies | +| LEGISLATION | [Legislation](https://schema.org/Legislation) | name, legislationDate, legislationPassedBy, legislationType | Bills, ordinances | +| BALLOT_MEASURE | [Legislation](https://schema.org/Legislation) + referendumProposal | name, datePosted, legislationChanges | Referendums, propositions | +| VOTE | [VoteAction](https://schema.org/VoteAction) | agent (Person), candidate (Legislation), actionOption | Roll call votes | +| FACT_CHECK | [ClaimReview](https://schema.org/ClaimReview) | claimReviewed, reviewRating, author, datePublished | Verified 
fact-checks | +| SCHOOL_DISTRICT | [EducationalOrganization](https://schema.org/EducationalOrganization) | name, address, telephone, numberOfStudents | K-12 school districts | +| NONPROFIT_FINANCES | [MonetaryGrant](https://schema.org/MonetaryGrant) | funder, amount, fundedItem | IRS Form 990 data | +| VIDEO | [VideoObject](https://schema.org/VideoObject) | name, description, uploadDate, duration, thumbnailUrl | Meeting recordings | +| DOCUMENT | [DigitalDocument](https://schema.org/DigitalDocument) | name, fileFormat, datePublished, url | PDFs, agendas, minutes | + +**Benefits:** +- ✅ **SEO Enhancement**: Google Search rich results for meetings, officials, organizations +- ✅ **Voice Assistant Ready**: Alexa, Google Assistant can parse our structured data +- ✅ **Knowledge Graph**: Data appears in Google Knowledge Panels +- ✅ **API Discoverability**: Standards-compliant REST/GraphQL responses +- ✅ **Cross-platform**: Compatible with Apple Podcasts, Microsoft Bing, Yandex + +### **Common Education Data Standards (CEDS)** +- Comprehensive education data standards for K-12, postsecondary, and workforce +- Organization: U.S. Department of Education, National Center for Education Statistics (NCES) +- Source: https://ceds.ed.gov/ +- GitHub: https://github.com/CEDStandards +- Specification Repository: https://github.com/CEDStandards/CEDS-Elements +- License: Public Domain (U.S. Government) +- Coverage: 2,300+ data elements, 500+ option sets, alignment with NCES surveys +- Used for: School district data modeling, NCES interoperability, education finance tracking +- Citation: "Common Education Data Standards (CEDS). National Center for Education Statistics. 
https://ceds.ed.gov/" + +**CEDS Alignment for School Districts:** + +| Our Field | CEDS Element ID | CEDS Element Name | Description | +|-----------|----------------|-------------------|-------------| +| `nces_id` | 000827 | LEA Identifier (NCES) | National Center for Education Statistics LEA ID | +| `district_name` | 000168 | Name of Institution | Legal name of the school district | +| `district_type` | 000108 | LEA Type | Local, State, Federal, or Other | +| `total_students` | 001475 | Student Count | Total number of students enrolled | +| `total_schools` | 000856 | Number of Schools | Count of schools in district | +| `total_revenue` | 000612 | Total Revenue | Sum of all revenue sources | +| `total_expenditures` | 000611 | Total Expenditures | Sum of all spending categories | +| `per_pupil_spending` | 000613 | Expenditure per Student | Total expenditures / student count | +| `federal_revenue` | 000614 | Federal Revenue | Revenue from federal government | +| `state_revenue` | 000615 | State Revenue | Revenue from state sources | +| `local_revenue` | 000616 | Local Revenue | Revenue from property taxes, bonds | +| `superintendent` | 000240 | Chief Administrator Name | District superintendent name | +| `school_year` | 000243 | School Year | Academic year (e.g., 2023-2024) | + +**CEDS Option Sets Used:** +- **LEA Type** (CEDS 000108): Regular, Specialized, Supervisory Union, Service Agency, State Agency, Federal Agency +- **Grade Level** (CEDS 000100): PK, KG, 01-12, UG (ungraded) +- **Operational Status** (CEDS 000533): Open, Closed, New, Added, Changed Agency, Temporarily Closed +- **Locale Type** (CEDS 001315): City, Suburb, Town, Rural (NCES Urban-centric locale codes) + +**Benefits of CEDS Compliance:** +- ✅ **NCES Compatibility**: Direct mapping to Common Core of Data (CCD) and F-33 Finance Survey +- ✅ **State Reporting**: Aligns with state education department data systems +- ✅ **Federal Grants**: Standardized reporting for ESSA, Title I, IDEA compliance +- ✅ 
**Longitudinal Tracking**: Consistent identifiers for multi-year analysis +- ✅ **Interoperability**: Works with Ed-Fi Alliance, IMS Global, SIF Association standards + +### **Microsoft Common Data Model for Nonprofits** +- Industry-standard data model for nonprofit organizations built on Microsoft Dataverse +- Organization: Microsoft Corporation +- Repository: https://github.com/microsoft/Nonprofits/tree/master/CommonDataModelforNonprofits +- ERD Documentation: https://github.com/microsoft/Nonprofits/blob/master/CommonDataModelforNonprofits/Documents/common-data-model-for-nonprofits-erds.pdf +- License: MIT License +- Coverage: Donor management, fundraising, program delivery, volunteer management, impact measurement, award/grant tracking +- Used for: Nonprofit data standardization, Dynamics 365 integration, constituent relationship management, outcome tracking +- Citation: "Microsoft Common Data Model for Nonprofits. Microsoft Corporation. https://github.com/microsoft/Nonprofits/" + +**Microsoft CDM Nonprofit Core Entities:** + +| Entity | Description | Our Implementation | +|--------|-------------|--------------------| +| **Constituent** | Individuals who interact with nonprofit (donors, volunteers, members, beneficiaries) | CONSTITUENT entity | +| **Donation** | Financial contributions and in-kind gifts | DONATION entity | +| **Designation** | How donations are allocated (programs, funds, campaigns) | designation_id in DONATION | +| **Campaign** | Fundraising campaigns and appeals | CAMPAIGN entity | +| **Membership** | Member enrollment and renewal tracking | MEMBERSHIP entity | +| **Volunteer** | Volunteer activities, hours, and preferences | VOLUNTEER_ACTIVITY entity | +| **Award** | Grants received by the nonprofit | Awards captured in NONPROFIT_FINANCES | +| **Disbursement** | Spending of grant/award funds | Expenditures in GOVERNMENT_BUDGET | +| **Objective** | Measurable program outcomes and impact | PROGRAM_OUTCOME entity | +| **DeliveryFramework** | 
Programs and services delivered | PROGRAM_DELIVERY entity | +| **Budget** | Organizational budgets and allocations | GOVERNMENT_BUDGET, SCHOOL_DISTRICT budgets | +| **Indicator** | Key performance indicators for impact | Metrics in PROGRAM_OUTCOME | + +**Key Entity Relationships (Microsoft CDM Pattern):** +- Constituent → Donation (one-to-many): A constituent makes many donations +- Donation → Designation (many-to-one): Multiple donations to one fund/program +- Campaign → Donation (one-to-many): A campaign receives many donations +- Constituent → Membership (one-to-many): A constituent can have multiple memberships over time +- Constituent → Volunteer (one-to-many): A constituent volunteers for multiple activities +- Organization → DeliveryFramework (one-to-many): An organization delivers multiple programs +- DeliveryFramework → Objective (one-to-many): A program has multiple outcome objectives + +**Benefits of Microsoft CDM Alignment:** +- ✅ **Dynamics 365 Integration**: Native compatibility with Microsoft Cloud for Nonprofits +- ✅ **Power Platform**: Direct use in Power BI, Power Apps, Power Automate +- ✅ **Azure Synapse**: Seamless analytics with Azure data services +- ✅ **Industry Standard**: Adopted by large nonprofits using Microsoft ecosystem +- ✅ **Grant Compliance**: Built-in support for grant reporting and outcome measurement +- ✅ **Constituent 360**: Unified view of donor, volunteer, member activities + +--- + +## 🎯 **Grant Research and Fundraising Platforms** + +These platforms are built on open-source principles or community-funded models to keep grant and fundraising data accessible. + +### **Grantmakers.io** ⭐ + +**"Free as in Freedom" Grant Research** + +Grantmakers.io is the gold standard for open, community-supported foundation research. It provides lightning-fast search through IRS 990-PF data with no login required. 
+ +**Organization:** Community-supported open-source project +**Website:** https://www.grantmakers.io/ +**Data Source:** IRS Form 990-PF (Private Foundation tax returns) +**License:** Open source, community-funded +**Access:** 100% free, no account or API key required +**Coverage:** All U.S. private foundations filing Form 990-PF (75,000+ grantmaking foundations) + +**What we use:** +- **Foundation Giving Histories**: Search foundations by who they've funded in the past +- **Grantee Databases**: Find all grants made to specific organizations +- **Geographic Targeting**: Search by state, city, or region +- **Funding Amounts**: Filter by grant size ranges +- **NTEE Categories**: Search by nonprofit sector (health, education, environment, etc.) +- **Year-over-Year Trends**: Track foundation giving patterns over time + +**Key Features:** +- ⚡ **Lightning-Fast Search**: Instant results across millions of grant records +- 🔓 **No Login Required**: Completely open access, no barriers +- 📊 **Detailed 990-PF Data**: Full foundation financials, officers, assets +- 🎯 **Relationship Mapping**: Discover foundation-grantee connections +- 📈 **Trend Analysis**: Multi-year giving patterns and focus areas +- 🆓 **Always Free**: Community-funded to remain accessible + +**Use Cases:** +- **Grant Prospecting**: Find foundations that fund similar organizations in your area +- **Relationship Research**: Identify foundations that have supported oral health, public health, or civic engagement +- **Competitive Analysis**: See which organizations are receiving grants in your field +- **Foundation Vetting**: Review foundation assets, giving patterns, and leadership before applying + +**Example Searches:** +- Foundations that funded "fluoridation" or "oral health" projects +- Grantmakers in Massachusetts supporting health policy advocacy +- Foundations with >$10M assets funding civic engagement +- All grants made by Robert Wood Johnson Foundation to nonprofits in Alabama + +**BibTeX:** 
+```bibtex +@misc{grantmakersio, + title = {Grantmakers.io: Open Foundation Research Platform}, + author = {{Grantmakers.io}}, + year = {2026}, + url = {https://www.grantmakers.io/}, + note = {Community-supported open-source platform for searching IRS 990-PF private foundation data} +} +``` + +**Citation:** "Grantmakers.io. Community-supported open foundation research. https://www.grantmakers.io/" + +--- + +### **Zeffy** ⭐ + +**100% Free Fundraising with AI-Powered Grant Matching** + +Zeffy is unique for being a completely free fundraising platform that also offers an AI-powered grant search tool to help match nonprofit missions with potential grant opportunities. + +**Organization:** Zeffy, Inc. +**Website:** https://www.zeffy.com/ +**Platform:** Fundraising + Grant Discovery +**Cost:** 100% free for nonprofits (donor-covered fees model) +**Grant Tool:** AI-powered grant opportunity matching +**Coverage:** U.S. and Canadian grant opportunities + +**What we use:** +- **AI Grant Matching**: Automated matching of nonprofit missions to relevant grant opportunities +- **Fundraising Infrastructure**: Donation processing, event ticketing, membership management +- **Donor Management**: CRM for tracking constituent relationships +- **Grant Alerts**: Notifications when new matching opportunities are posted + +**Key Features:** +- 💰 **100% Free**: No platform fees, monthly charges, or hidden costs +- 🤖 **AI-Powered Matching**: Machine learning matches your mission to grant opportunities +- 📧 **Grant Alerts**: Email notifications for new matching grants +- 🎟️ **All-in-One Platform**: Donations, events, memberships, grants in one system +- 🇺🇸 🇨🇦 **North America Coverage**: U.S. 
and Canadian grant databases +- 📊 **Impact Reporting**: Built-in analytics for grant reporting requirements + +**Grant Discovery Capabilities:** +- **Mission-Based Matching**: Upload your mission statement, get matched grants +- **Federal Grants**: Monitors Grants.gov for federal opportunities +- **Foundation Grants**: Tracks private foundation RFPs and announcements +- **Corporate Giving**: Alerts for corporate philanthropy programs +- **Local Grants**: Community foundation and regional funder opportunities + +**Use Cases for This Project:** +- **Nonprofit Fundraising**: Organizations can use Zeffy for zero-cost donation processing +- **Grant Prospecting**: AI helps match oral health nonprofits to relevant grant opportunities +- **Event Fundraising**: Free ticketing for fundraising galas, community events +- **Membership Management**: Track supporters, volunteers, members at no cost +- **Sustainability**: Recommend to small nonprofits to reduce overhead costs + +**Why It's Important:** +Traditional fundraising platforms charge 3-5% fees on donations, which drains resources from small nonprofits. Zeffy's donor-covered model means 100% of donations go to the organization, making it especially valuable for grassroots oral health advocacy groups. + +**BibTeX:** +```bibtex +@misc{zeffy_platform, + title = {Zeffy: 100% Free Fundraising Platform with AI Grant Matching}, + author = {{Zeffy, Inc.}}, + year = {2026}, + url = {https://www.zeffy.com/}, + note = {Free fundraising platform with AI-powered grant discovery for U.S. and Canadian nonprofits} +} +``` + +**Citation:** "Zeffy. 100% Free Fundraising Platform with AI Grant Matching. https://www.zeffy.com/" + +--- + +### **Community Foundations** ⭐ + +**Local Grant Opportunities Often Overlooked** + +Community foundations are often the most accessible grant sources for local nonprofits, yet they're frequently overlooked because they don't appear in major federal databases. 
Most maintain their own open listings for regional grants. + +**What Community Foundations Are:** +Community foundations are public charities that pool donations from individuals, families, and businesses to support local nonprofits through competitive grants, scholarship programs, and donor-advised funds. + +**Why They Matter:** +- 🏘️ **Local Focus**: Prioritize organizations serving their specific geographic region +- 💵 **Smaller, Accessible Grants**: $500-$50,000 range, ideal for grassroots groups +- 🤝 **Relationship-Based**: Local foundations know local issues and local leaders +- 📋 **Simpler Applications**: Less bureaucratic than federal or national foundations +- ⚡ **Faster Decisions**: Many have quarterly or rolling deadlines +- 🎯 **Mission Alignment**: Support for community health, civic engagement, education + +**Examples of Community Foundations:** + +| Foundation | Region | Website | Grant Focus Areas | +|------------|--------|---------|-------------------| +| **Central Alabama Community Foundation** | Birmingham, AL metro | https://www.cacfbirmingham.org/ | Health, education, civic engagement, arts | +| **Community Foundation for Greater Atlanta** | Atlanta, GA metro | https://cfgreateratlanta.org/ | Health equity, education, economic mobility | +| **Boston Foundation** | Boston, MA metro | https://www.tbf.org/ | Health, housing, education, civic participation | +| **Community Foundation of Greater Memphis** | Memphis, TN metro | https://cfgm.org/ | Health, youth development, community engagement | +| **Silicon Valley Community Foundation** | San Francisco Bay Area | https://www.siliconvalleycf.org/ | Health, education, immigration, environment | +| **Greater Kansas City Community Foundation** | Kansas City, MO/KS | https://www.growyourgiving.org/ | Health, education, civic infrastructure | +| **Seattle Foundation** | Seattle, WA metro | https://www.seattlefoundation.org/ | Racial equity, community health, economic opportunity | + +**How to Find Your 
Local Community Foundation:** +1. **Council on Foundations Directory**: https://www.cof.org/community-foundation-locator +2. **Candid (formerly Foundation Center)**: https://candid.org/find-us/foundation-finder +3. **State Associations**: Most states have a community foundation association +4. **Google Search**: "[Your City] Community Foundation" or "[Your County] Community Foundation" + +**Grant Opportunities:** +- **Competitive Grants**: Open RFPs for nonprofits in specific focus areas +- **Capacity Building Grants**: Support for operations, staffing, strategic planning +- **Donor-Advised Funds**: Individuals/families make grants through the foundation +- **Fiscal Sponsorship**: Some foundations sponsor projects for groups without 501(c)(3) status +- **Scholarship Programs**: Education grants for students (often administered by community foundations) + +**For Oral Health Advocacy:** +Many community foundations have health equity or preventive health focus areas that align perfectly with fluoridation advocacy, dental access programs, and oral health education. They're often the best first step for local grassroots campaigns. + +**How We Use Community Foundation Data:** +- **Local Grant Mapping**: Identify which community foundations serve each jurisdiction +- **Nonprofit Funding Sources**: Link organizations to local foundation grants received +- **Geographic Targeting**: Recommend local funders when users search by city/county +- **Grant Prospecting**: Alert nonprofits to community foundation RFPs in their area + +**BibTeX:** +```bibtex +@misc{community_foundations, + title = {Community Foundations: Local Grant Opportunities}, + author = {{Council on Foundations}}, + year = {2026}, + url = {https://www.cof.org/community-foundation-locator}, + note = {Network of 700+ community foundations providing local grants across the United States} +} +``` + +**Citation:** "Community Foundations. Council on Foundations. 
https://www.cof.org/community-foundation-locator" + +--- + +## 🏛️ **Civic Tech Organizations & Resources** + +### **Code for America** ⭐ + +The flagship U.S. civic technology nonprofit organization, convening government leaders and technologists to transform public services. + +**Organization:** Code for America +**Website:** https://codeforamerica.org/ +**About:** National nonprofit working with government to build digital services that are simple, effective, and accessible to all +**Founded:** 2009 +**Coverage:** National (50 states), with focus on state-level government transformation + +**What we use:** +- **Summit Insights**: Annual Summit (most recently Summit 2026) where state-level AI leads and municipal CIOs set the civic tech agenda for the year +- **Brigade Network**: Community chapters across the U.S. working on local civic tech projects +- **Best Practices**: Government service design patterns, digital service standards +- **Technology Landscape**: Trends in state and local government digital transformation + +**Key Programs:** +- **Code for America Summit**: Annual conference bringing together 1,500+ government leaders, technologists, and advocates +- **Brigade Network**: 80+ volunteer chapters in cities across America building civic tech solutions +- **Get CalFresh**: Flagship product helping millions access food benefits through simplified digital applications +- **Clear My Record**: Automated criminal record clearance to help people move forward +- **Government Services Portfolio**: Digital tools for social safety net programs + +**Resources:** +- Summit: https://codeforamerica.org/summit/ +- Brigade Network: https://brigade.codeforamerica.org/ +- Blog: https://codeforamerica.org/news/ +- GitHub: https://github.com/codeforamerica +- Annual Reports: https://codeforamerica.org/news/category/annual-reports/ + +**Why Code for America:** +- **Agenda Setting**: The Summit is where state-level AI leads and municipal CIOs define priorities for the year +- 
**Network Effect**: Connects civic technologists across the country +- **Proven Impact**: Products serving millions of Americans annually +- **Open Source**: Many tools available as open source for other governments to adopt + +**BibTeX:** +```bibtex +@misc{code_for_america, + title = {Code for America}, + author = {{Code for America}}, + year = {2026}, + url = {https://codeforamerica.org/}, + note = {National nonprofit transforming government digital services and convening state and local government technology leaders} +} +``` + +**Citation:** "Code for America. https://codeforamerica.org/" + +--- + +### **GovTech.com (Government Technology)** ⭐ + +The primary news and ranking source for the government technology industry, providing market intelligence and trend analysis. + +**Organization:** Government Technology (e.Republic) +**Website:** https://www.govtech.com/ +**About:** Leading publication covering technology trends, policy, and innovation in state and local government +**Founded:** 1987 +**Coverage:** State and local government across all 50 states + +**What we use:** +- **GovTech 100**: Annual definitive directory of the top 100 trending companies in the U.S. 
public sector technology market +- **Industry Trends**: Analysis of emerging technologies, procurement trends, and digital transformation initiatives +- **Vendor Landscape**: Tracking government technology companies, products, and solutions +- **Policy Coverage**: Legislative and regulatory developments affecting civic technology + +**Key Resources:** +- **GovTech 100 List**: https://www.govtech.com/100/ - The definitive annual ranking of companies shaping the future of state and local government +- **Navigator Awards**: https://www.govtech.com/navigator - Recognition of state and local government IT leaders +- **Digital Cities Survey**: Annual ranking of America's most digitally advanced cities and counties +- **Research Center**: https://www.govtech.com/research/ - White papers, surveys, and industry reports +- **Webinars & Events**: https://www.govtech.com/events/ + +**GovTech 100 Categories:** +- Cloud & Infrastructure +- Cybersecurity +- Data & Analytics +- Digital Government Services +- Education Technology +- Emergency Management +- Financial Management +- GIS & Mapping +- Health & Human Services +- Public Safety +- Transportation + +**Why GovTech.com:** +- **Market Intelligence**: The GovTech 100 is the authoritative list of trending companies in government technology +- **Vendor Discovery**: Comprehensive directory of solutions available to state and local government +- **Industry Standards**: Defines what's considered "trending" and "emerging" in the civic tech marketplace +- **Procurement Insights**: Helps identify which technologies governments are actively adopting + +**Resources:** +- Main Site: https://www.govtech.com/ +- GovTech 100: https://www.govtech.com/100/ +- Newsletter: https://www.govtech.com/newsletters/ +- Podcasts: https://www.govtech.com/podcasts/ +- Magazine Archive: https://www.govtech.com/magazines/ + +**BibTeX:** +```bibtex +@misc{govtech_magazine, + title = {Government Technology Magazine}, + author = {{e.Republic Inc.}}, + year = 
{2026}, + url = {https://www.govtech.com/}, + note = {Leading publication and market intelligence source for state and local government technology, publisher of the annual GovTech 100 list} +} +``` + +**GovTech 100 BibTeX:** +```bibtex +@misc{govtech_100, + title = {GovTech 100: Companies Trending in State and Local Government}, + author = {{Government Technology}}, + year = {2026}, + url = {https://www.govtech.com/100/}, + note = {Annual directory of the top 100 trending companies serving the U.S. public sector} +} +``` + +**Citation:** "Government Technology. e.Republic Inc. https://www.govtech.com/" + +--- + +### **Civic Tech Guide** ⭐ + +Comprehensive, curated directory of civic technology projects, organizations, and tools worldwide, maintained by the civic tech community. + +**Organization:** Civic Tech Field Guide (Community-maintained) +**Website:** https://app.civictech.guide/ +**About:** Open directory and knowledge base of civic tech projects, tools, and organizations with detailed project profiles and categorization +**Founded:** 2018 +**Coverage:** Global civic technology ecosystem (1,000+ projects) + +**What we use:** +- **Project Directory**: Discovery of related civic tech tools and platforms +- **Categorization**: Understanding how civic tech projects are classified and tagged +- **Community Connections**: Network of civic technologists and organizations +- **Best Practices**: Learning from similar projects and their approaches + +**CommunityOne Profile:** +- **Listed as**: https://app.civictech.guide/p/communityone/r/recN0BG4gvjXT7WLf +- **Categories**: Open Government, Civic Engagement, AI/Machine Learning +- **Description**: AI-powered civic engagement platform tracking local government meetings, legislation, and advocacy opportunities + +**Key Features:** +- **Search & Filter**: Discover projects by topic, geography, technology, and impact area +- **Project Profiles**: Detailed information about civic tech initiatives including status, team, 
and technology +- **Tagging System**: Categorization by civic tech domains (transparency, participation, accountability, etc.) +- **API Access**: Programmatic access to the project database +- **Community Contributions**: Open for civic tech projects to self-list and update profiles + +**Categories in Civic Tech Guide:** +- Open Government & Transparency +- Civic Participation & Engagement +- Community Organizing +- Democracy & Voting +- Public Service Delivery +- Data & Research +- Advocacy & Policy +- Urban Planning & Development + +**Resources:** +- Main Site: https://app.civictech.guide/ +- About: https://civictech.guide/ +- Submit Project: https://app.civictech.guide/submit +- GitHub: https://github.com/compilerla/civic-tech-taxonomy + +**Why Civic Tech Guide:** +- **Discovery**: Find related projects and potential collaborators +- **Context**: Understand where your project fits in the broader civic tech ecosystem +- **Community**: Connect with civic technologists working on similar problems +- **Legitimacy**: Being listed establishes credibility in the civic tech community + +**BibTeX:** +```bibtex +@misc{civic_tech_guide, + title = {Civic Tech Field Guide}, + author = {{Civic Tech Field Guide Community}}, + year = {2026}, + url = {https://civictech.guide/}, + note = {Community-maintained directory of civic technology projects and organizations worldwide} +} +``` + +**Citation:** "Civic Tech Field Guide. https://civictech.guide/" + +--- + +## 🛠️ **Technology Platforms & Support Programs** + +### **Databricks for Good Program** ⭐ + +A philanthropic initiative providing cloud data platform credits and technical support to nonprofits, academic institutions, and social impact organizations. + +**Organization:** Databricks, Inc. 
+**Website:** https://www.databricks.com/product/databricks-for-good +**Application:** https://www.databricks.com/product/databricks-for-good + +**Eligibility:** +- ✅ **Nonprofits** - 501(c)(3) status required +- ✅ **Academic institutions** - Universities, colleges, research organizations +- ✅ **Social impact organizations** - Civic engagement, public good projects +- ✅ **Government agencies** - Case-by-case evaluation for civic data initiatives + +**CommunityOne/Open Navigator Alignment:** +This project appears well-suited for the program as a civic engagement and social good initiative. Eligibility would depend on establishing formal nonprofit status or academic partnership. + +**Program Benefits:** +- **$10,000-50,000** in annual Databricks credits +- Access to **Unity Catalog** (normally $0.20 per million metadata operations) +- Access to **Databricks Marketplace** for data sharing and distribution +- **Standard tier** platform features included +- **Technical support** from Databricks team +- **Delta Sharing** protocol for secure data distribution +- **MLflow** for AI/ML experiment tracking +- **Databricks SQL** for analytics and dashboards + +**What This Enables for Open Navigator:** +- **Data Publishing**: Share 1.8M nonprofit profiles, 4.5M+ legislative documents via Databricks Marketplace +- **Unity Catalog**: Organize data assets with enterprise-grade governance +- **Delta Sharing**: Distribute datasets to enterprise/research consumers without data copying +- **Lakehouse Architecture**: Unified analytics on legislative, nonprofit, and civic data +- **Collaborative Notebooks**: Reproducible research and analysis +- **Scheduled Pipelines**: Automated data updates and quality checks + +**Alternative Path (Hybrid Approach):** +- **HuggingFace Hub**: Continue using for open-source community distribution (free) +- **Databricks Marketplace**: Add enterprise/research distribution channel (if approved) +- **Data stays in one place**: External tables in Unity 
Catalog point to existing Parquet files +- **No data duplication**: Delta Sharing streams data from your storage on-demand + +**BibTeX:** +```bibtex +@misc{databricks_for_good, + title = {Databricks for Good Program}, + author = {{Databricks, Inc.}}, + year = {2024}, + url = {https://www.databricks.com/product/databricks-for-good}, + note = {Cloud data platform credits and support for nonprofits, academic institutions, and social impact organizations} +} +``` + +**Application Process:** +1. Visit https://www.databricks.com/product/databricks-for-good +2. Submit organization details and project description +3. Describe social impact and data use case +4. Provide 501(c)(3) documentation (for nonprofits) or academic affiliation +5. Review process typically takes 2-4 weeks +6. Upon approval, receive credits and onboarding support + +**Compliance:** +Credits must be used for the approved social impact project and cannot be resold or transferred. Annual renewal required with impact reporting. + +--- + +## 🙏 **Acknowledgments** + +We are grateful to the authors of MeetingBank for making their dataset publicly available for research purposes. Their work on meeting summarization has been instrumental in developing civic engagement tools. 
+ +Special thanks to: +- The Association for Computational Linguistics (ACL) +- HuggingFace for hosting datasets +- Open States for legislative data +- All municipal governments providing open access to meeting records + +--- + +## 📖 **How to Cite This Project** + +If you use Open Navigator in your research, please cite: + +``` +Open Navigator +GitHub: https://github.com/getcommunityone/open-navigator-for-engagement +License: Apache-2.0 +``` + +**BibTeX:** +```bibtex +@software{open-navigator-2026, + title = {Open Navigator}, + author = {Community One}, + year = {2026}, + url = {https://github.com/getcommunityone/open-navigator-for-engagement}, + license = {Apache-2.0} +} +``` + +--- + +## 📝 **License Compliance** + +This project respects all dataset licenses and terms of use. See [LICENSE](LICENSE) for this project's Apache 2.0 license. + +For dataset-specific licenses, please refer to the original sources listed above. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000000000000000000000000000000000000..96e273554b57c20b9a91ce86c511b924f987c4f7 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,99 @@ +# Contributing to Oral Health Policy Pulse + +Thank you for your interest in contributing to the Oral Health Policy Pulse project! + +## How to Contribute + +### Reporting Bugs + +If you find a bug, please open an issue with: +- A clear description of the problem +- Steps to reproduce +- Expected vs actual behavior +- Your environment (OS, Python version, etc.) + +### Suggesting Features + +Feature requests are welcome! Please: +- Check if the feature has already been requested +- Clearly describe the feature and its use case +- Explain how it would benefit advocacy groups + +### Code Contributions + +1. **Fork the repository** +2. **Create a feature branch** + ```bash + git checkout -b feature/your-feature-name + ``` + +3. **Make your changes** + - Follow the existing code style + - Add tests for new functionality + - Update documentation as needed + +4. 
**Run tests** + ```bash + pytest + black . + ruff check . + ``` + +5. **Commit your changes** + ```bash + git commit -m "Add feature: description" + ``` + +6. **Push and create a pull request** + ```bash + git push origin feature/your-feature-name + ``` + +## Code Style + +- Follow PEP 8 guidelines +- Use type hints +- Write docstrings for all public functions +- Keep functions focused and single-purpose +- Use meaningful variable names + +## Code of Conduct + +This project values respectful, inclusive collaboration. We align with the principles of: +- **Open States Code of Conduct**: https://docs.openstates.org/code-of-conduct/ +- Be respectful and professional +- Welcome diverse perspectives +- Focus on what's best for the community +- Show empathy towards other contributors + +## Contributing to Upstream Projects + +We use data and patterns from several open source civic tech projects. When contributing scraper patterns or improvements back to upstream projects like **OpenStates**, please: + +1. **Follow their standards**: https://github.com/openstates/openstates-scrapers +2. **Reference their documentation**: https://docs.openstates.org/contributing/local-database/ +3. **Respect their Code of Conduct**: https://docs.openstates.org/code-of-conduct/ +4. **Test locally** before submitting pull requests +5. **Document data sources** used in scraper development + +## Testing + +All new features should include tests. Run the test suite with: + +```bash +pytest tests/ -v +``` + +## Documentation + +Update relevant documentation when: +- Adding new features +- Changing API endpoints +- Modifying configuration options +- Adding new dependencies + +## Questions? + +Open an issue or reach out to the maintainers. + +Thank you for helping improve oral health advocacy! 
🦷 diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..352fc29ef8242f8c5ed927e5a7b5f1005aae19e6 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,90 @@ +# Multi-stage build for Hugging Face Spaces +# Runs all three apps: Docusaurus docs, React frontend, FastAPI backend + +FROM node:20-slim AS docs-builder +WORKDIR /build + +# Set baseUrl to /docs/ for HuggingFace deployment # Docs are served at nginx /docs/ location +# routeBasePath: '/' in docusaurus.config.ts prevents /docs/docs/ nesting +ENV DOCUSAURUS_BASE_URL=/docs/ + +COPY website/package*.json ./ +RUN npm config set fetch-retry-mintimeout 20000 && \ + npm config set fetch-retry-maxtimeout 120000 && \ + npm ci --prefer-offline --no-audit || npm install --prefer-offline --no-audit + +# Add cache-busting argument to force rebuild when needed +ARG CACHE_BUST=2026-04-27-12-00-fix-double-docs-prefix + +COPY website/ ./ + +# Verify environment variable is set and build +RUN echo "Building Docusaurus with DOCUSAURUS_BASE_URL=$DOCUSAURUS_BASE_URL" && \ + echo "Cache bust: 2026-04-27-12-00-fix-double-docs-prefix" && \ + npm run build && \ + echo "Verifying baseUrl in build output..." && \ + grep -r "baseUrl" build/ | head -5 || true + +FROM python:3.11-slim + +# Install system dependencies, nginx, and Node.js for frontend build +RUN apt-get update && apt-get install -y \ + build-essential \ + curl \ + git \ + tesseract-ocr \ + nginx \ + supervisor \ + && curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \ + && apt-get install -y nodejs \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Copy Python requirements and install +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# OPTIMIZATION: Copy frontend package files first for better caching +COPY frontend/package*.json /app/frontend/ +RUN cd /app/frontend && npm ci + +# Copy application code (now npm ci layer is cached) +COPY . . 
+ +# Copy built static files from docs stage +COPY --from=docs-builder /build/build /app/static/docs + +# Build frontend (npm_modules already cached from above) +# Set production environment variables for Vite +ENV VITE_CANONICAL_DOMAIN=www.communityone.com +ENV VITE_API_URL=/api +# Cache bust: 2026-04-29-remove-axios +ARG CACHE_BUST_FRONTEND=2026-04-29-remove-axios +RUN cd /app/frontend && echo "Frontend build cache bust: $CACHE_BUST_FRONTEND" && npm run build + +# Frontend is already built to /app/api/static/ via vite.config.ts +# Create frontend directory in /app/static for nginx +RUN mkdir -p /app/static/frontend && \ + ls -la /app/api/static/ && \ + cp -r /app/api/static/* /app/static/frontend/ + +# Create necessary directories +RUN mkdir -p /app/logs /app/data /var/log/supervisor + +# Copy Hugging Face specific configs +COPY .huggingface/nginx.conf /etc/nginx/nginx.conf +COPY .huggingface/supervisord.conf /etc/supervisor/conf.d/supervisord.conf +COPY .huggingface/start.sh /app/start.sh +RUN chmod +x /app/start.sh + +# Expose port 7860 (Hugging Face Spaces default) +EXPOSE 7860 + +# Set environment variables +ENV PYTHONUNBUFFERED=1 +ENV LOG_LEVEL=INFO +ENV HF_SPACES=1 + +# Use supervisor to run all services +CMD ["/app/start.sh"] diff --git a/INTEL_ARC_QUICKSTART.md b/INTEL_ARC_QUICKSTART.md new file mode 100644 index 0000000000000000000000000000000000000000..0a2b5a1c16f01246a16613c19569024bb6c283e7 --- /dev/null +++ b/INTEL_ARC_QUICKSTART.md @@ -0,0 +1,281 @@ +# 🚀 Intel Arc + DuckDB Quick Reference + +**Get started with local AI legislative analysis in 5 minutes** + +## ⚡ Performance at a Glance + +| Task | Standard (Postgres + CPU) | Optimized (DuckDB + Arc GPU) | Speedup | +|------|--------------------------|------------------------------|---------| +| Context injection (100 bills) | 500ms | 20ms | **25x** | +| Vector search (10K records) | 800ms | 18ms | **44x** | +| LLM inference (3B model) | 350 tok/s | 1,200 tok/s | **3.4x** | +| Full testimony analysis 
| 2,000ms | 80ms | **25x** | + +## 🎯 Three-Step Setup + +### 1. Install (5 minutes) + +```bash +cd /path/to/open-navigator +./scripts/intel_llm_setup.sh +source .venv-intel/bin/activate +``` + +### 2. Test DuckDB VSS (30 seconds) + +```bash +python scripts/duckdb_vss_demo.py +``` + +Expected output: +``` +📊 Creating demo DuckDB database with VSS... +✅ Demo database created! +📈 Results (searching 1,000 bills): + Average: 18.45ms +🎯 Top 3 most similar bills: ... +``` + +### 3. Run Analysis (1 minute) + +```bash +python scripts/legislative_analysis_intel.py +``` + +## 🧠 Code Examples + +### Example 1: Fast Bill Search + +```python +from scripts.legislative_analysis_intel import DuckDBLegislativeAnalyzer + +with DuckDBLegislativeAnalyzer() as analyzer: + # Get bill context in < 50ms + bill = analyzer.get_bill_context("HB1234") + testimony = analyzer.get_all_testimony_for_bill("HB1234") + + print(f"Bill: {bill['title']}") + print(f"Testimony records: {len(testimony)}") +``` + +### Example 2: Vector Similarity Search + +```python +import numpy as np + +# Your query embedding (384 dimensions from sentence-transformers) +query_embedding = model.encode("water fluoridation policy") + +# Fast vector search (< 20ms for 10K bills) +similar_bills = analyzer.search_similar_testimony( + query_embedding.tolist(), + limit=10 +) + +for bill in similar_bills: + print(f"{bill['bill_id']}: {bill['text'][:100]}... 
(similarity: {bill['similarity']:.2f})") +``` + +### Example 3: Extract Interest Groups + +```python +from scripts.legislative_analysis_intel import IntelOptimizedLLM, InterestGroup + +# Initialize Intel-optimized LLM (uses Arc GPU) +llm = IntelOptimizedLLM(model_name="meta-llama/Llama-3.2-3B-Instruct") +llm.load_model(use_openvino=True) # OpenVINO = best Arc GPU performance + +# Extract structured data +groups = llm.extract_interest_groups(bill_context, testimony) + +# Results +for group in groups: + print(f""" + Group: {group.group_name} + Lobbyist: {group.lobbyist} + Stance: {group.stance} (score: {group.stance_score}) + Tradeoffs: {group.tradeoff_notes} + Confidence: {group.confidence} + """) +``` + +### Example 4: Query Hugging Face Datasets Directly + +```python +import duckdb + +conn = duckdb.connect() + +# No download needed - streams from HF! +df = conn.execute(""" + SELECT * + FROM read_parquet( + 'hf://datasets/CommunityOne/states-al-nonprofits-locations/data/train-*.parquet' + ) + WHERE city = 'Birmingham' + LIMIT 100 +""").fetchdf() + +print(f"Found {len(df)} organizations in Birmingham, AL") +``` + +## 🎨 Output Schema + +**Interest Group Extraction:** + +```json +{ + "groups": [ + { + "group_name": "Alabama Dental Association", + "lobbyist": "John Smith, DDS", + "stance": "conditional", + "stance_score": 0.6, + "tradeoff_notes": "Support if Section 4 amended to include rural exemption and phased implementation timeline", + "testimony_excerpt": "While we have concerns about Section 4's implementation timeline, we support the overall goals if rural communities receive proper resources...", + "bill_id": "HB1234", + "confidence": 0.85 + }, + { + "group_name": "Sierra Club Alabama Chapter", + "lobbyist": null, + "stance": "oppose", + "stance_score": -0.9, + "tradeoff_notes": null, + "testimony_excerpt": "We strongly oppose this bill due to environmental concerns...", + "bill_id": "HB1234", + "confidence": 0.92 + } + ] +} +``` + +## 🔧 Environment Variables 
+ +```bash +# Enable Intel GPU +export ZES_ENABLE_SYSMAN=1 + +# Ollama GPU usage (if using Ollama) +export OLLAMA_NUM_GPU=999 + +# IPEX-LLM optimizations +export IPEX_LLM_NUM_GPU=1 +export ONEAPI_DEVICE_SELECTOR=level_zero:0 +``` + +## 💡 Best Practices + +### 1. Cache Embeddings + +**DON'T** recompute every time: +```python +# Slow - recomputes embeddings every run +for bill in bills: + embedding = model.encode(bill['text']) + analyze(embedding) +``` + +**DO** cache in DuckDB: +```python +# Fast - compute once, reuse forever +conn.execute(""" + CREATE TABLE bill_embeddings AS + SELECT bill_id, embedding + FROM ... -- computed once +""") + +# Then just query +similar = conn.execute(""" + SELECT * FROM bill_embeddings + ORDER BY array_distance(embedding, ?) + LIMIT 10 +""", [query]).fetchall() +``` + +### 2. Batch Processing + +**DON'T** process one at a time: +```python +for bill_id in bill_ids: # Slow! + result = analyze_single_bill(bill_id) +``` + +**DO** batch efficiently: +```python +# Fast - processes 100 bills in parallel +results = llm.extract_interest_groups_batch( + bill_contexts=bills, + testimony_batches=all_testimony, + batch_size=32 # Fits in Arc GPU memory +) +``` + +### 3. 
Monitor GPU Usage + +```bash +# Linux: intel_gpu_top +sudo apt install intel-gpu-tools +intel_gpu_top + +# Windows: Task Manager → Performance → GPU +# Look for "GPU 0 - Intel Arc Graphics" +``` + +## 🐛 Troubleshooting + +### Issue: "ModuleNotFoundError: optimum" + +```bash +pip install optimum[openvino] +``` + +### Issue: Slow inference (still using CPU) + +Check device: +```python +import torch +print(f"Device: {torch.cuda.get_device_name(0)}") # Should show Arc GPU + +# Force GPU +model = OVModelForCausalLM.from_pretrained( + model_name, + device="GPU" # Explicitly set +) +``` + +### Issue: Out of memory + +Use smaller model or reduce batch size: +```python +# Use 3B instead of 8B +model_name = "meta-llama/Llama-3.2-3B-Instruct" + +# Reduce context +testimony = testimony[:10] # Top 10 only +``` + +## 📚 Resources + +- **Full Guide**: [website/docs/guides/intel-arc-optimization.md](website/docs/guides/intel-arc-optimization.md) +- **DuckDB Docs**: https://duckdb.org/docs/ +- **Intel IPEX**: https://github.com/intel/intel-extension-for-pytorch +- **OpenVINO**: https://docs.openvino.ai/ + +## 🎯 Next Steps + +1. ✅ Run the demo: `python scripts/duckdb_vss_demo.py` +2. ✅ Test analysis: `python scripts/legislative_analysis_intel.py` +3. 📚 Read full guide: [Intel Arc Optimization Guide](website/docs/guides/intel-arc-optimization.md) +4. 🚀 Build your own: Use the `DuckDBLegislativeAnalyzer` class +5. 🤝 Share results: Open an issue with your findings! + +## 💬 Questions? 
+ +- **GitHub Issues**: https://github.com/getcommunityone/open-navigator/issues +- **Documentation**: https://www.communityone.com/docs +- **Intel AI Forums**: https://community.intel.com/t5/Intel-AI-Analytics-and/bd-p/software-ai + +--- + +**Built with ❤️ for Data Engineering Managers who want local, private, fast legislative intelligence.** diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..7442a2a1966bf89af46bbf471b4f6f87c481c184 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Support. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2026 Community One + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..f441ab9fdcdaf727e1a4f75f2ac0af35232afa38 --- /dev/null +++ b/Makefile @@ -0,0 +1,169 @@ +.PHONY: help install install-frontend install-docs build-frontend build-docs clean test run dev dev-frontend dev-docs start-all stop-all dev-full docker-up docker-down deploy-databricks example heatmap init status format lint + +help: + @echo "🦷 Open Navigator - Makefile Commands" + @echo "====================================================" + @echo "" + @echo "🚀 Quick Start:" + @echo " make start-all - Start ALL services (API + Dashboard + Docs) with tmux" + @echo " make stop-all - Stop all running services" + @echo "" + @echo "🐍 Python Backend:" + @echo " make install - Install Python dependencies in venv" + @echo " make dev - Start backend with auto-reload" + @echo " make run - Start backend (production)" + @echo "" + @echo "⚛️ React Dashboard:" + @echo " make install-frontend - Install dashboard npm dependencies" + @echo " make build-frontend - Build React dashboard for production" + @echo " make dev-frontend - Start dashboard dev server" + @echo "" + @echo "📚 Documentation Site:" + @echo " make install-docs - Install Docusaurus dependencies" + @echo " make build-docs - Build documentation for production" + @echo " make dev-docs - Start documentation dev server" + @echo "" + @echo "☁️ Deployment:" + @echo " make deploy-databricks - Deploy to 
Databricks Apps" + @echo "" + @echo "🐳 Docker:" + @echo " make docker-up - Start Docker containers" + @echo " make docker-down - Stop Docker containers" + @echo "" + @echo "🧪 Testing:" + @echo " make test - Run test suite" + @echo " make clean - Remove build artifacts" + @echo "" + +install: + @echo "📦 Creating virtual environment and installing dependencies..." + @chmod +x install.sh + @./install.sh + +install-frontend: + @echo "📦 Installing dashboard dependencies..." + @cd frontend && npm install + @echo "✅ Dashboard dependencies installed!" + +install-docs: + @echo "📦 Installing documentation dependencies..." + @cd website && npm install + @echo "✅ Documentation dependencies installed!" + +build-frontend: + @echo "🔨 Building React dashboard..." + @cd frontend && npm run build + @echo "✅ Dashboard built to api/static/" + +build-docs: + @echo "🔨 Building documentation site..." + @cd website && npm run build + @echo "✅ Documentation built to website/build/" + +clean: + @echo "🧹 Cleaning up..." + @rm -rf .venv venv + @rm -rf frontend/node_modules frontend/dist + @rm -rf website/node_modules website/build website/.docusaurus + @rm -rf api/static + @rm -rf __pycache__ + @find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true + @find . -type f -name "*.pyc" -delete + @find . -type f -name "*.pyo" -delete + @rm -rf .pytest_cache + @rm -rf .coverage + @rm -rf htmlcov + @rm -rf dist + @rm -rf build + @rm -rf *.egg-info + @rm -rf logs/*.pid logs/*.log + @echo "✅ Cleanup complete" + +test: + @echo "🧪 Running tests..." + @. venv/bin/activate && pytest tests/ -v + +run: build-frontend + @echo "🚀 Starting application (production mode)..." + @. venv/bin/activate && uvicorn api.app:app --host 0.0.0.0 --port 8000 + +dev: + @echo "🔧 Starting backend with auto-reload..." + @echo "📡 Backend running at http://localhost:8000" + @. venv/bin/activate && uvicorn api.app:app --reload + +dev-frontend: + @echo "⚛️ Starting dashboard dev server..." 
+ @echo "📡 Dashboard running at http://localhost:5173" + @cd frontend && npm run dev + +dev-docs: + @echo "📚 Starting documentation dev server..." + @echo "📡 Documentation running at http://localhost:3000" + @cd website && npm start + +start-all: + @echo "🚀 Starting all services with tmux..." + @chmod +x start-all.sh + @./start-all.sh + +stop-all: + @echo "🛑 Stopping all services..." + @chmod +x stop-all.sh + @./stop-all.sh + +dev-full: + @echo "🚀 Use 'make start-all' for better experience with tmux!" + @echo "" + @echo "Starting backend and frontend (manual)..." + @echo "📡 Backend: http://localhost:8000" + @echo "📡 Dashboard: http://localhost:5173" + @echo "📡 Docs: http://localhost:3000 (run 'make dev-docs' in another terminal)" + @echo "" + @. venv/bin/activate && uvicorn api.app:app --reload & \ + cd frontend && npm run dev + +deploy-databricks: + @echo "☁️ Deploying to Databricks Apps..." + @chmod +x scripts/deploy-databricks-app.sh + @./scripts/deploy-databricks-app.sh + +docker-up: + @echo "Starting Docker containers..." + @docker-compose up -d + @echo "✓ Containers started" + @echo " API: http://localhost:8000" + @echo " Docs: http://localhost:8000/docs" + +docker-down: + @echo "Stopping Docker containers..." + @docker-compose down + @echo "✓ Containers stopped" + +example: + @echo "Running example workflow..." + @. venv/bin/activate && python examples/example_workflow.py + +heatmap: + @echo "Generating example heatmap..." + @. venv/bin/activate && python main.py generate-heatmap --output example_heatmap.html + @echo "✓ Heatmap saved to example_heatmap.html" + +init: + @echo "Initializing system..." + @. venv/bin/activate && python main.py init + +status: + @echo "Checking system status..." + @. venv/bin/activate && python main.py status + +format: + @echo "Formatting code..." + @. venv/bin/activate && black . + @. venv/bin/activate && ruff check . --fix + @echo "✓ Code formatted" + +lint: + @echo "Linting code..." + @. venv/bin/activate && ruff check . + @. 
venv/bin/activate && mypy agents/ pipeline/ visualization/ api/ diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..06a923ee55d14c52d2bfd2a11f06789cb4015325 --- /dev/null +++ b/README.md @@ -0,0 +1,534 @@ +--- +title: Open Navigator +emoji: 🏛️ +colorFrom: blue +colorTo: green +sdk: docker +app_port: 7860 +pinned: false +license: apache-2.0 +--- + +# 🏛️ Open Navigator + +> **CommunityOne: The open path to everything local** +> +> AI-powered civic engagement platform with React + FastAPI web interface + +[License: Apache 2.0](https://opensource.org/licenses/Apache-2.0) +[Python 3.11+](https://www.python.org/downloads/) +[React](https://reactjs.org) +[FastAPI](https://fastapi.tiangolo.com) + +## 🔗 Quick Links + +**[⚛️ Open Navigator →](https://www.communityone.com)** - **LIVE APPLICATION** (search, filters, heatmap, data exploration) + +**[📖 Documentation →](https://www.communityone.com/docs)** - Complete guides, architecture, and feature details + +The documentation site includes: +- Features and capabilities +- Data sources and integrations +- Architecture and deployment options +- Policy topics and advocacy tools +- API reference and examples + +--- + +## Quick Start + +### Three Services + +This project runs three separate services: + +| Service | Port (Local) | Live URL | Description | +|---------|------|----------|-------------| +| **⚛️ Open Navigator** 🚀 | 5173 | [www.communityone.com](https://www.communityone.com) | **MAIN APPLICATION** - Search, filters, heatmap, data exploration | +| **📚 Documentation** | 3000 | [www.communityone.com/docs](https://www.communityone.com/docs) | Docusaurus site with complete guides and tutorials | +| **🔥 API Backend** | 8000 | [www.communityone.com/api](https://www.communityone.com/api) | FastAPI server with AI agents | + +> **💡 LIVE DEMO:** Visit **[www.communityone.com](https://www.communityone.com)** to use the application! 
+> +> **💻 LOCAL DEV:** After running `./start-all.sh`, visit **http://localhost:5173** + +## 🚀 Deployment + +**Deploy to Hugging Face Spaces** in 3 commands: + +```bash +echo "HF_USERNAME=your_username" >> .env +./deploy-huggingface.sh +# Configure hardware and secrets at https://huggingface.co/spaces/YOUR_USERNAME/www.communityone.com +``` + +**Full deployment guides:** +- **[Hugging Face Spaces](website/docs/deployment/huggingface-spaces.md)** - Docker deployment (~$22/month) +- **[Databricks Apps](website/docs/deployment/databricks-apps.md)** - Enterprise deployment +- **[Local Development](website/docs/deployment/)** - Complete deployment documentation + +The `deploy-huggingface.sh` script automatically: +- ✅ Tests builds locally (catches errors before pushing) +- ✅ Creates the Space on Hugging Face +- ✅ Pushes code and triggers automatic build (~10-15 min) + + +### Prerequisites + +- Python 3.11+ +- Node.js 18+ +- Docker (optional) +- OpenAI API key + +### Installation + +**Option 1: Start Everything at Once (Recommended)** + +```bash +# Clone repository +git clone https://github.com/getcommunityone/open-navigator.git +cd open-navigator + +# Install dependencies +./install.sh # Python backend +cd frontend && npm install && cd .. # React app +cd website && npm install && cd .. # Documentation + +# Setup git hooks for build protection (one-time) +./setup-git-hooks.sh + +# Start all services in tmux +./start-all.sh +``` + +**Option 2: Using Makefile** + +```bash +# Install +make install +make install-frontend +make install-docs + +# Start all services +make start-all + +# Or individually: +make dev # API only +make dev-frontend # React app only +make dev-docs # Docs only +``` + +**Option 3: Manual Setup** + +```bash +# Python backend +python3 -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt + +# React app +cd frontend && npm install && cd .. + +# Documentation +cd website && npm install && cd .. 
+ +# Configure environment +cp .env.example .env +# Edit .env with your API keys + +# Start services (separate terminals) +source .venv/bin/activate && python main.py serve # Terminal 1 +cd frontend && npm run dev # Terminal 2 +cd website && npm start # Terminal 3 +``` + +### Access Points + +**🌐 LIVE APPLICATION:** +- **🚀 Open Navigator:** https://www.communityone.com - Main application +- 📚 **Documentation:** https://www.communityone.com/docs - Guides and API reference +- 🔥 **API Docs:** https://www.communityone.com/api/docs - FastAPI interactive documentation + +**💻 LOCAL DEVELOPMENT:** +- **🚀 Main App:** http://localhost:5173 +- 📚 **Documentation:** http://localhost:3000 +- 🔥 **API Docs:** http://localhost:8000/docs + +### Stop Services + +```bash +./stop-all.sh +# or +make stop-all +``` + +--- + +## Usage + +### Command Line Interface + +Always activate the virtual environment first: + +```bash +source .venv/bin/activate +``` + +**API Server** + +```bash +python main.py serve --host 0.0.0.0 --port 8000 +``` + +**Jurisdiction Discovery** + +```bash +# Test run +python main.py discover-jurisdictions --limit 100 + +# Single state +python main.py discover-jurisdictions --state CA + +# Full discovery (~30k jurisdictions) +python main.py discover-jurisdictions + +# View statistics +python main.py discovery-stats +``` + +**Data Ingestion** + +```bash +# Census data (90,000+ jurisdictions) +python -m discovery.census_ingestion + +# NCES school districts (13,000+) +python -m discovery.nces_ingestion + +# Pre-built meeting datasets +python discovery/meetingbank_ingestion.py +python discovery/city_scrapers_urls.py +python discovery/openstates_sources.py + +# LocalView (requires Dataverse API key) +python discovery/localview_ingestion.py +``` + +**Scraping & Analysis** + +```bash +# Scrape batch from discovered sites +python main.py scrape-batch --source discovered --limit 50 + +# Scrape single source +python main.py scrape --url "https://city.legistar.com" \ + --state 
"CA" \ + --municipality "San Francisco" + +# Run analysis pipeline +python main.py analyze --targets-file examples/targets.json + +# Generate heatmap +python main.py generate-heatmap --output heatmap.html +``` + +**Publishing Datasets** + +```bash +# Publish to HuggingFace (requires HUGGINGFACE_TOKEN in .env) +python main.py publish-to-hf --dataset all +python main.py publish-to-hf --dataset discovered-urls +python main.py publish-to-hf --dataset census --sample +``` + +### API Usage + +**Start a workflow:** + +```bash +curl -X POST "http://localhost:8000/workflow/start" \ + -H "Content-Type: application/json" \ + -d '{ + "scrape_targets": [ + { + "url": "https://example-city.legistar.com", + "municipality": "Example City", + "state": "CA", + "platform": "legistar" + } + ] + }' +``` + +**Query opportunities:** + +```bash +curl "http://localhost:8000/opportunities?state=CA&urgency=critical" +``` + +**Get heatmap:** + +```bash +curl "http://localhost:8000/heatmap" > heatmap.html +``` + +### Python API + +```python +import asyncio +from agents.orchestrator import OrchestratorAgent +from agents.scraper import ScraperAgent +from agents.parser import ParserAgent +from agents.classifier import ClassifierAgent + +# Initialize orchestrator +orchestrator = OrchestratorAgent() + +# Register agents +orchestrator.register_agent(ScraperAgent()) +orchestrator.register_agent(ParserAgent()) +orchestrator.register_agent(ClassifierAgent()) + +# Execute pipeline +targets = [ + { + "url": "https://city.legistar.com", + "municipality": "Example City", + "state": "CA", + "platform": "legistar" + } +] + +results = await orchestrator.execute_pipeline(targets) +``` + +--- + +## Project Structure + +``` +open-navigator/ +├── agents/ # Multi-agent AI system +├── api/ # FastAPI application +├── frontend/ # React application (Open Navigator) +├── website/ # Docusaurus documentation +├── discovery/ # Data discovery modules +├── extraction/ # Document extraction +├── pipeline/ # Data pipeline 
components +├── visualization/ # Heatmap and charts +├── config/ # Configuration +├── tests/ # Test suite +├── main.py # CLI entry point +└── requirements.txt # Python dependencies +``` + +--- + +## Deployment Options + +### 1. Databricks Apps (Production) + +```bash +export DATABRICKS_HOST=https://your-workspace.cloud.databricks.com +export DATABRICKS_TOKEN=dapi... +export OPENAI_API_KEY=sk-... + +./scripts/deploy-databricks-app.sh +``` + +See [DATABRICKS_APP_GUIDE.md](DATABRICKS_APP_GUIDE.md) for details. + +### 2. Docker + +```bash +docker-compose up -d +``` + +Starts: +- API server (port 8000) +- Qdrant vector database (port 6333) +- Jupyter notebook (port 8888) + +### 3. Local Development + +See [Quick Start](#quick-start) above. + +--- + +## ⚡ Intel Arc GPU Optimization + +**Run Llama 4 at NVIDIA-like speeds on Intel Arc integrated graphics!** + +If you have **Intel Core Ultra 7** (or similar) with Arc Graphics + NPU, you can use **DuckDB + VSS** for 10-50x faster legislative analysis: + +```bash +# Setup Intel-optimized environment +./scripts/intel_llm_setup.sh +source .venv-intel/bin/activate + +# Run DuckDB vector search demo +python scripts/duckdb_vss_demo.py + +# Run legislative analysis with LLM +python scripts/legislative_analysis_intel.py +``` + +**Why DuckDB for Local AI?** +- ⚡ **10-50x faster** than Postgres for context injection +- 🎯 **< 20ms** vector similarity search across 10K bills +- 🧠 **Embedded** - no server needed, runs locally +- 🤗 **Hugging Face Integration** - query HF datasets directly + +**Performance:** +- **Context Injection**: 20ms vs 500ms (Postgres) = **25x faster** +- **LLM Inference**: 1,200 tok/s (Arc GPU) vs 350 tok/s (CPU) = **3.4x faster** +- **Vector Search**: 18ms vs 800ms = **44x faster** + +**Features:** +- Extract interest groups from legislative testimony +- Identify lobbyists and their positions +- Analyze support/oppose scores with confidence +- Detect tradeoffs and compromises + +**See full guide:** [Intel Arc 
Optimization Guide](website/docs/guides/intel-arc-optimization.md) + +--- + +## 🤖 AI Integration (MCP Server) + +**Connect your civic data to Claude and other AI assistants!** + +Open Navigator includes a **Model Context Protocol (MCP)** server that lets AI assistants directly access your data: + +```bash +# Install MCP dependencies +pip install mcp anthropic-mcp-sdk + +# Run the server +python scripts/mcp/open_navigator_server.py +``` + +**What AI assistants can do:** +- 🏛️ Search 90,000+ jurisdictions by name or location +- 🏢 Query 1.8M nonprofits with Form 990 data +- 📜 Semantic search across 4.5M+ legislative documents +- 📊 Get real-time statistics and analytics +- 🔍 Vector search meetings and bills with natural language + +**Example queries to Claude:** +> "Find all cities named Springfield in the database" + +> "Show me 501c3 nonprofits in San Francisco focused on education" + +> "What bills related to oral health were introduced in California?" + +**Configure Claude Desktop:** + +Add to `~/.config/Claude/claude_desktop_config.json`: + +```json +{ + "mcpServers": { + "open-navigator": { + "command": "python", + "args": ["/path/to/open-navigator/scripts/mcp/open_navigator_server.py"], + "env": { + "DATABASE_URL": "postgresql://postgres:password@localhost:5433/open_navigator" + } + } + } +} +``` + +**See full guide:** [MCP Server Documentation](website/docs/integrations/mcp-server.md) + +--- + +## Testing + +```bash +# Run all tests +pytest + +# With coverage +pytest --cov=agents --cov=pipeline --cov=visualization + +# Specific test file +pytest tests/test_agents.py +``` + +--- + +## Configuration + +Create `.env` file: + +```bash +# OpenAI +OPENAI_API_KEY=sk-... + +# Databricks (optional) +DATABRICKS_HOST=https://your-workspace.cloud.databricks.com +DATABRICKS_TOKEN=dapi... + +# HuggingFace (optional) +HUGGINGFACE_TOKEN=hf_... + +# Dataverse (optional) +DATAVERSE_API_KEY=... +``` + +--- + +## Contributing + +Contributions are welcome! Please: + +1. 
Fork the repository +2. Create a feature branch +3. Make your changes +4. Add tests +5. Submit a pull request + +See [CONTRIBUTING.md](CONTRIBUTING.md) for details. + +--- + +## Documentation + +- **[Full Documentation](http://localhost:3000)** - Complete guides and API reference (local docs server; the hosted copy is at https://www.communityone.com/docs) +- **[Architecture](http://localhost:3000/docs/architecture)** - System architecture overview +- **[Quick Start](http://localhost:3000/docs/quickstart)** - Detailed setup instructions +- **[Quick Reference](http://localhost:3000/docs/quick-reference)** - Command reference card +- **[MCP Server](http://localhost:3000/docs/integrations/mcp-server)** - AI assistant integration guide +- **[Deployment](http://localhost:3000/docs/deployment/databricks-apps)** - Production deployment guides +- **[Case Studies](http://localhost:3000/docs/case-studies/tuscaloosa-complete)** - Real-world examples +- [CONTRIBUTING.md](CONTRIBUTING.md) - How to contribute + +--- + +## Citations + +This project uses several open datasets and research contributions. **Please see [CITATIONS.md](CITATIONS.md) for complete citation information.** + +**Key Dataset:** +- **MeetingBank**: Hu et al., "MeetingBank: A Benchmark Dataset for Meeting Summarization", ACL 2023 + - Used for meeting discovery and analysis + - 1,366 city council meetings from 6 U.S. cities + - See [CITATIONS.md](CITATIONS.md) for full citation and BibTeX + +--- + +## License + +Apache License 2.0 - see [LICENSE](LICENSE) file for details. + +--- + +## Support + +- GitHub Issues: [github.com/getcommunityone/open-navigator/issues](https://github.com/getcommunityone/open-navigator/issues) +- Email: johnbowyer@communityone.com + +--- + +**Note**: This system is designed to support advocacy efforts. All generated content should be reviewed by humans before use. 
diff --git a/README_HF.md b/README_HF.md new file mode 100644 index 0000000000000000000000000000000000000000..87f182b6345f51d2a69b8c316e7028b2aa37dc1d --- /dev/null +++ b/README_HF.md @@ -0,0 +1,101 @@ +--- +title: CommunityOne - Open Navigator +emoji: 🏛️ +colorFrom: blue +colorTo: green +sdk: docker +app_port: 7860 +pinned: false +license: apache-2.0 +tags: + - civic-engagement + - policy-tracking + - government-transparency + - nonprofit-discovery + - open-data +--- + +# 🏛️ CommunityOne - Open Navigator + +**Track 90,000+ jurisdictions. Monitor 1.8M nonprofits. Amplify your voice.** + +CommunityOne is a civic engagement platform that helps you discover advocacy opportunities, track policy changes, and connect with organizations working on the causes you care about. + +## ✨ Features + +- **🔍 Unified Search**: Find contacts, meetings, organizations, and causes across the entire United States +- **📊 Real-time Stats**: Track policy activity across 90,000+ cities, counties, and states +- **🏢 Nonprofit Discovery**: Explore 1.8M organizations from IRS data enriched with Every.org +- **📅 Meeting Minutes**: Search 250,000+ government meeting transcripts and agendas +- **🎯 Geographic Filtering**: Browse by state, county, or city to find local opportunities +- **🔐 OAuth Login**: Sign in with HuggingFace, GitHub, or Google to save your preferences + +## 🚀 Three Services Architecture + +This deployment runs three integrated services: + +1. **📚 Documentation** (Docusaurus) - `/docs/` +2. **🖥️ Main Application** (React + Vite) - `/` +3. **⚡ API Backend** (FastAPI) - `/api/` + +All services are reverse-proxied through nginx on port 7860. 
+ +## 📖 Quick Start + +### Browse Without Login +- Click "Browse All" to explore data by state +- Use the search bar to find organizations, contacts, or causes +- Filter by location using the state/county/city selectors + +### Sign In for Personalization +- Click "Login" in the top right +- Choose your OAuth provider (HuggingFace, GitHub, or Google) +- Follow organizations, leaders, and causes you care about +- Get personalized recommendations + +### Explore the API +- Visit `/redoc` for interactive API documentation +- Try the search endpoints with state filters +- Export data in JSON format for your own projects + +## 🛠️ Technology Stack + +- **Frontend**: React 18 + TypeScript + Vite + TailwindCSS + shadcn/ui +- **Backend**: Python 3.11 + FastAPI + Pydantic +- **Data**: Delta Lake + Parquet (90GB+ of civic data) +- **Docs**: Docusaurus v3 +- **Infrastructure**: nginx + supervisor + Docker + +## 📊 Data Sources + +- **IRS BMF**: 1.8M tax-exempt organizations +- **Every.org**: Nonprofit enrichment (logos, causes, revenue) +- **Open States**: State legislators and bills (7,300+ officials) +- **Census**: Jurisdictions and boundaries (90,000+) +- **CityScrapers**: Local government meetings +- **OpenCivicData**: Standardized government data + +## 🔗 Links + +- **Repository**: [github.com/getcommunityone/open-navigator](https://github.com/getcommunityone/open-navigator) +- **Documentation**: Click "📚 Browse Documentation" on the homepage +- **API Docs**: `/redoc` endpoint +- **Website**: [www.communityone.com](https://www.communityone.com) + +## 📝 License + +Apache License 2.0 - Free for commercial and non-commercial use + +## 🤝 Contributing + +We welcome contributions! See CONTRIBUTING.md in the repository for guidelines. 
+ +## 💬 Support + +- **Issues**: [GitHub Issues](https://github.com/getcommunityone/open-navigator/issues) +- **Discussions**: [GitHub Discussions](https://github.com/getcommunityone/open-navigator/discussions) +- **Email**: hello@communityone.com + +--- + +Built with ❤️ for civic engagement and government transparency. diff --git a/agents/__init__.py b/agents/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6f3393f84677587c2a450a0fc5066a4ab9a1756c --- /dev/null +++ b/agents/__init__.py @@ -0,0 +1,16 @@ +"""Agents module for the Oral Health Policy Pulse system.""" +from agents.base import BaseAgent, AgentRole, AgentMessage, MessageType, AgentStatus +from agents.orchestrator import OrchestratorAgent +from agents.debate_grader import DebateGraderAgent, DebateDimension, DebateScore + +__all__ = [ + "BaseAgent", + "AgentRole", + "AgentMessage", + "MessageType", + "AgentStatus", + "OrchestratorAgent", + "DebateGraderAgent", + "DebateDimension", + "DebateScore" +] diff --git a/agents/advocacy.py b/agents/advocacy.py new file mode 100644 index 0000000000000000000000000000000000000000..ff4e6cdb0e41e94428aabd0a0673a89a61a15136 --- /dev/null +++ b/agents/advocacy.py @@ -0,0 +1,408 @@ +""" +Advocacy Writer Agent for generating personalized outreach materials. +""" +import asyncio +from typing import List, Dict, Any, Optional +from datetime import datetime +from loguru import logger + +from agents.base import BaseAgent, AgentRole, AgentMessage, MessageType, AgentStatus + + +class AdvocacyWriterAgent(BaseAgent): + """ + Agent responsible for generating advocacy materials. 
+ + Creates: + - Personalized emails to local officials + - Talking points for public testimony + - Social media content + - Policy briefs + - Community outreach materials + """ + + def __init__(self, agent_id: str = "advocacy-001"): + """Initialize the advocacy writer agent.""" + super().__init__(agent_id, AgentRole.ADVOCACY_WRITER) + self._initialize_templates() + + def _initialize_templates(self): + """Initialize email and content templates.""" + self.email_templates = { + "critical_vote": { + "subject": "Urgent: Support Oral Health Policy - Vote Upcoming in {municipality}", + "opening": ( + "I am writing to urge your support for the upcoming vote on " + "{policy_topic} in {municipality}." + ), + "urgency": "This matter requires immediate attention as a vote is scheduled for {meeting_date}." + }, + "introduce_topic": { + "subject": "Opportunity to Improve Community Oral Health in {municipality}", + "opening": ( + "I am writing to bring to your attention an important opportunity " + "to enhance oral health services in {municipality}." + ), + "urgency": None + }, + "address_opposition": { + "subject": "Addressing Concerns About {policy_topic} in {municipality}", + "opening": ( + "I understand there are concerns about {policy_topic}. " + "I would like to share evidence-based information that may help inform the discussion." + ), + "urgency": None + }, + "support_existing": { + "subject": "Thank You for Supporting Oral Health in {municipality}", + "opening": ( + "Thank you for your support of {policy_topic}. " + "I am writing to express my appreciation and offer additional support." 
+ ), + "urgency": None + } + } + + self.policy_benefits = { + "water_fluoridation": [ + "Reduces tooth decay by 25% in children and adults", + "Costs approximately $1 per person per year", + "Recognized by CDC as one of 10 great public health achievements", + "Reduces dental treatment costs by $38 per $1 invested", + "Particularly benefits low-income families with limited access to dental care" + ], + "school_dental_screening": [ + "Early detection prevents costly emergency dental procedures", + "Identifies children who need care before problems become severe", + "Reduces school absences due to dental pain", + "Connects families to dental resources and services", + "Supported by American Academy of Pediatrics" + ], + "medicaid_dental": [ + "Improves health outcomes for vulnerable populations", + "Reduces emergency room visits for dental problems", + "Prevents progression of oral disease to systemic health issues", + "Supports working families and children", + "Generates economic returns through improved productivity" + ], + "dental_clinic_funding": [ + "Provides essential services to underserved communities", + "Reduces health disparities", + "Creates local jobs and economic activity", + "Prevents costly emergency care", + "Serves as safety net for uninsured and underinsured residents" + ] + } + + async def process(self, message: AgentMessage) -> List[AgentMessage]: + """ + Process advocacy generation commands. 
+ + Args: + message: Message containing analyzed documents and opportunities + + Returns: + List of messages with generated advocacy materials + """ + self.update_status(AgentStatus.PROCESSING, "Generating advocacy materials") + + try: + documents = message.payload.get("documents", []) + opportunities = message.payload.get("opportunities", []) + + # Generate advocacy materials for each opportunity + advocacy_materials = [] + + for opp in opportunities: + materials = await self._generate_advocacy_materials(opp, documents) + advocacy_materials.append(materials) + + # Send results back to orchestrator + response = await self.send_message( + AgentRole.ORCHESTRATOR, + MessageType.RESPONSE, + { + "workflow_id": message.payload.get("workflow_id"), + "advocacy_materials": advocacy_materials, + "opportunities_count": len(opportunities), + "materials_generated": len(advocacy_materials) + } + ) + + self.log_success() + logger.info(f"Generated advocacy materials for {len(opportunities)} opportunities") + + return [response] + + except Exception as e: + self.log_failure(str(e)) + error_msg = await self.send_message( + AgentRole.ORCHESTRATOR, + MessageType.ERROR, + {"error": str(e), "agent": self.agent_id} + ) + return [error_msg] + + async def _generate_advocacy_materials( + self, + opportunity: Dict[str, Any], + all_documents: List[Dict[str, Any]] + ) -> Dict[str, Any]: + """ + Generate complete advocacy materials for an opportunity. 
+ + Args: + opportunity: Advocacy opportunity details + all_documents: All analyzed documents for context + + Returns: + Dictionary containing all generated materials + """ + # Find the source document + doc = next( + (d for d in all_documents if d["document_id"] == opportunity["document_id"]), + None + ) + + if not doc: + logger.error(f"Document not found: {opportunity['document_id']}") + return {} + + # Determine template based on situation + template_type = self._select_template(opportunity) + + # Generate email + email = await self._generate_email(opportunity, doc, template_type) + + # Generate talking points + talking_points = self._generate_talking_points(opportunity, doc) + + # Generate social media content + social_media = self._generate_social_media(opportunity) + + # Generate policy brief + policy_brief = self._generate_policy_brief(opportunity, doc) + + materials = { + "opportunity_id": opportunity["document_id"], + "municipality": opportunity["municipality"], + "state": opportunity["state"], + "topic": opportunity["topic"], + "urgency": opportunity["urgency"], + "materials": { + "email": email, + "talking_points": talking_points, + "social_media": social_media, + "policy_brief": policy_brief + }, + "generated_at": datetime.utcnow().isoformat(), + "metadata": { + "source_url": opportunity["source_url"], + "meeting_date": opportunity["meeting_date"] + } + } + + return materials + + def _select_template(self, opportunity: Dict[str, Any]) -> str: + """Select appropriate email template based on situation.""" + urgency = opportunity.get("urgency") + stance = opportunity.get("stance") + + if urgency == "critical": + return "critical_vote" + elif stance in ["opposed", "strongly_opposed"]: + return "address_opposition" + elif stance in ["supportive", "strongly_supportive"]: + return "support_existing" + else: + return "introduce_topic" + + async def _generate_email( + self, + opportunity: Dict[str, Any], + doc: Dict[str, Any], + template_type: str + ) -> 
Dict[str, Any]: + """Generate personalized email content.""" + template = self.email_templates[template_type] + + # Format template variables + variables = { + "municipality": opportunity["municipality"], + "policy_topic": self._format_topic_name(opportunity["topic"]), + "meeting_date": opportunity["meeting_date"] + } + + subject = template["subject"].format(**variables) + opening = template["opening"].format(**variables) + + # Build email body + body_parts = [opening] + + # Add urgency if applicable + if template["urgency"]: + body_parts.append("\n\n" + template["urgency"].format(**variables)) + + # Add policy benefits + body_parts.append("\n\n**Key Benefits:**") + benefits = self.policy_benefits.get( + opportunity["topic"], + ["Improves community health outcomes"] + ) + for benefit in benefits[:3]: # Top 3 benefits + body_parts.append(f"• {benefit}") + + # Add call to action + body_parts.append(self._generate_call_to_action(opportunity)) + + # Add closing + body_parts.append( + "\n\nThank you for your time and consideration. " + "I would welcome the opportunity to discuss this further." + ) + body_parts.append("\n\nSincerely,") + body_parts.append("[Your Name]") + body_parts.append("[Your Organization]") + + email = { + "subject": subject, + "body": "\n".join(body_parts), + "template_type": template_type, + "personalization_variables": variables + } + + return email + + def _generate_call_to_action(self, opportunity: Dict[str, Any]) -> str: + """Generate appropriate call to action based on urgency.""" + urgency = opportunity.get("urgency") + stance = opportunity.get("stance") + + if urgency == "critical": + return ( + "\n\n**Action Needed:**\n" + f"Please vote in favor of this important measure at the upcoming meeting. " + f"Your constituents' oral health depends on this decision." 
+ ) + elif stance in ["opposed", "strongly_opposed"]: + return ( + "\n\n**Requested Action:**\n" + "I respectfully request a meeting to discuss the evidence supporting this policy " + "and address any concerns you may have." + ) + else: + return ( + "\n\n**Requested Action:**\n" + "I encourage you to support this initiative and would be happy to provide " + "additional information or connect you with subject matter experts." + ) + + def _generate_talking_points( + self, + opportunity: Dict[str, Any], + doc: Dict[str, Any] + ) -> List[str]: + """Generate talking points for public testimony or meetings.""" + topic = opportunity["topic"] + + talking_points = [ + f"Introduction: Community member concerned about oral health in {opportunity['municipality']}" + ] + + # Add topic-specific points + benefits = self.policy_benefits.get(topic, []) + for i, benefit in enumerate(benefits[:5], 1): + talking_points.append(f"Point {i}: {benefit}") + + # Add local context + talking_points.append( + f"Local relevance: This policy addresses needs identified in " + f"recent community discussions" + ) + + # Add closing point + talking_points.append( + "Closing: Urge decision-makers to prioritize community oral health" + ) + + return talking_points + + def _generate_social_media( + self, + opportunity: Dict[str, Any] + ) -> Dict[str, str]: + """Generate social media content.""" + municipality = opportunity["municipality"] + topic = self._format_topic_name(opportunity["topic"]) + + twitter = ( + f"🦷 {municipality} is considering {topic}! " + f"This could improve oral health for thousands. " + f"Contact your local officials to show support. " + f"#OralHealth #PublicHealth" + ) + + facebook = ( + f"Important news for {municipality} residents!\n\n" + f"Our local government is discussing {topic}. 
" + f"This policy could significantly improve access to dental care " + f"for families in our community.\n\n" + f"Learn more and contact your representatives to voice your support: " + f"{opportunity.get('source_url', '')}" + ) + + return { + "twitter": twitter, + "facebook": facebook, + "instagram": twitter, # Similar to Twitter + "hashtags": ["OralHealth", "PublicHealth", municipality.replace(" ", "")] + } + + def _generate_policy_brief( + self, + opportunity: Dict[str, Any], + doc: Dict[str, Any] + ) -> Dict[str, Any]: + """Generate a concise policy brief.""" + topic = opportunity["topic"] + + brief = { + "title": f"Policy Brief: {self._format_topic_name(topic)} in {opportunity['municipality']}", + "summary": ( + f"This brief outlines the benefits and implementation considerations " + f"for {self._format_topic_name(topic)}." + ), + "background": ( + f"Current discussion in {opportunity['municipality']} presents " + f"an opportunity to improve community oral health." + ), + "key_benefits": self.policy_benefits.get(topic, []), + "recommendations": [ + "Approve the proposed policy", + "Allocate necessary funding", + "Establish implementation timeline", + "Monitor outcomes and adjust as needed" + ], + "evidence_sources": [ + "Centers for Disease Control and Prevention", + "American Dental Association", + "Community Preventive Services Task Force" + ] + } + + return brief + + def _format_topic_name(self, topic: str) -> str: + """Format topic identifier into readable name.""" + topic_names = { + "water_fluoridation": "community water fluoridation", + "school_dental_screening": "school-based dental screening", + "medicaid_dental": "Medicaid dental coverage expansion", + "dental_clinic_funding": "community dental clinic funding", + "community_dental_program": "community dental programs", + "children_dental_health": "children's dental health initiatives", + "dental_care_access": "dental care access improvements" + } + + return topic_names.get(topic, topic.replace("_", " 
")) diff --git a/agents/base.py b/agents/base.py new file mode 100644 index 0000000000000000000000000000000000000000..f9315f18a77b6c9496905946a4da6a411d1d79dc --- /dev/null +++ b/agents/base.py @@ -0,0 +1,171 @@ +""" +Core agent base classes and protocols for the multi-agent system. +""" +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional, Union +from datetime import datetime +from enum import Enum +from pydantic import BaseModel, Field +from loguru import logger + + +class AgentRole(str, Enum): + """Enumeration of agent roles in the system.""" + SCRAPER = "scraper" + PARSER = "parser" + CLASSIFIER = "classifier" + SENTIMENT_ANALYZER = "sentiment_analyzer" + DEBATE_GRADER = "debate_grader" + ADVOCACY_WRITER = "advocacy_writer" + ORCHESTRATOR = "orchestrator" + + +class MessageType(str, Enum): + """Types of messages exchanged between agents.""" + DATA = "data" + COMMAND = "command" + QUERY = "query" + RESPONSE = "response" + ERROR = "error" + STATUS = "status" + + +class AgentMessage(BaseModel): + """Message structure for inter-agent communication.""" + message_id: str = Field(..., description="Unique message identifier") + sender: AgentRole = Field(..., description="Sending agent role") + recipient: AgentRole = Field(..., description="Receiving agent role") + message_type: MessageType = Field(..., description="Type of message") + timestamp: datetime = Field(default_factory=datetime.utcnow) + payload: Dict[str, Any] = Field(default_factory=dict, description="Message payload") + metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata") + + class Config: + json_encoders = { + datetime: lambda v: v.isoformat() + } + + +class AgentStatus(str, Enum): + """Agent operational status.""" + IDLE = "idle" + PROCESSING = "processing" + WAITING = "waiting" + ERROR = "error" + COMPLETED = "completed" + + +class AgentState(BaseModel): + """Current state of an agent.""" + agent_id: str + role: AgentRole + status: 
AgentStatus = AgentStatus.IDLE + current_task: Optional[str] = None + tasks_completed: int = 0 + tasks_failed: int = 0 + last_activity: datetime = Field(default_factory=datetime.utcnow) + error_message: Optional[str] = None + + +class BaseAgent(ABC): + """ + Abstract base class for all agents in the system. + + Each agent must implement the process method to handle incoming messages + and perform its specific role in the pipeline. + """ + + def __init__(self, agent_id: str, role: AgentRole): + """ + Initialize the base agent. + + Args: + agent_id: Unique identifier for this agent instance + role: The role this agent plays in the system + """ + self.agent_id = agent_id + self.role = role + self.state = AgentState(agent_id=agent_id, role=role) + self.message_queue: List[AgentMessage] = [] + logger.info(f"Initialized {role.value} agent: {agent_id}") + + @abstractmethod + async def process(self, message: AgentMessage) -> Union[AgentMessage, List[AgentMessage]]: + """ + Process an incoming message and return response(s). 
+ + Args: + message: The message to process + + Returns: + One or more response messages + """ + pass + + def update_status(self, status: AgentStatus, task: Optional[str] = None): + """Update the agent's current status.""" + self.state.status = status + self.state.current_task = task + self.state.last_activity = datetime.utcnow() + logger.debug(f"{self.role.value} agent {self.agent_id} status: {status.value}") + + def log_success(self): + """Log a successful task completion.""" + self.state.tasks_completed += 1 + self.update_status(AgentStatus.IDLE) + + def log_failure(self, error: str): + """Log a task failure.""" + self.state.tasks_failed += 1 + self.state.error_message = error + self.update_status(AgentStatus.ERROR) + logger.error(f"{self.role.value} agent {self.agent_id} error: {error}") + + async def send_message( + self, + recipient: AgentRole, + message_type: MessageType, + payload: Dict[str, Any], + metadata: Optional[Dict[str, Any]] = None + ) -> AgentMessage: + """ + Create and send a message to another agent. 
+ + Args: + recipient: The receiving agent's role + message_type: Type of message to send + payload: Message content + metadata: Optional metadata + + Returns: + The created message + """ + import uuid + + message = AgentMessage( + message_id=str(uuid.uuid4()), + sender=self.role, + recipient=recipient, + message_type=message_type, + payload=payload, + metadata=metadata or {} + ) + + return message + + def get_state(self) -> AgentState: + """Get the current state of the agent.""" + return self.state + + +class AgentMetrics(BaseModel): + """Metrics for monitoring agent performance.""" + agent_id: str + role: AgentRole + total_messages_processed: int = 0 + total_processing_time_seconds: float = 0.0 + average_processing_time_seconds: float = 0.0 + success_rate: float = 0.0 + error_count: int = 0 + last_error: Optional[str] = None + uptime_seconds: float = 0.0 diff --git a/agents/classifier.py b/agents/classifier.py new file mode 100644 index 0000000000000000000000000000000000000000..d246e494751197b1c377003c41c5143e2eeb8c78 --- /dev/null +++ b/agents/classifier.py @@ -0,0 +1,295 @@ +""" +Classifier Agent for identifying oral health policy topics in meeting minutes. 
+""" +import asyncio +from typing import List, Dict, Any, Optional, Set +from datetime import datetime +from loguru import logger + +from agents.base import BaseAgent, AgentRole, AgentMessage, MessageType, AgentStatus +from config import settings + + +class PolicyTopic: + """Enumeration of oral health policy topics.""" + WATER_FLUORIDATION = "water_fluoridation" + SCHOOL_DENTAL_SCREENING = "school_dental_screening" + MEDICAID_DENTAL = "medicaid_dental" + DENTAL_CLINIC_FUNDING = "dental_clinic_funding" + COMMUNITY_DENTAL_PROGRAM = "community_dental_program" + CHILDREN_DENTAL_HEALTH = "children_dental_health" + DENTAL_CARE_ACCESS = "dental_care_access" + OTHER_ORAL_HEALTH = "other_oral_health" + NOT_RELEVANT = "not_relevant" + + +class ClassifierAgent(BaseAgent): + """ + Agent responsible for classifying documents by oral health policy topics. + + Uses a combination of: + - Keyword matching for high-precision identification + - LLM-based classification for nuanced topics + - Topic modeling for discovering new themes + """ + + def __init__(self, agent_id: str = "classifier-001"): + """Initialize the classifier agent.""" + super().__init__(agent_id, AgentRole.CLASSIFIER) + self._initialize_keywords() + self.llm_client = None # Will be initialized when needed + + def _initialize_keywords(self): + """Initialize keyword patterns for each topic.""" + self.topic_keywords = { + PolicyTopic.WATER_FLUORIDATION: [ + "fluoridation", "fluoride", "water fluoridation", + "fluoridated water", "fluoride level", "fluoride treatment", + "community water fluoridation" + ], + PolicyTopic.SCHOOL_DENTAL_SCREENING: [ + "school dental", "dental screening", "school screening", + "school health screening", "dental exam", "school nurse", + "student dental" + ], + PolicyTopic.MEDICAID_DENTAL: [ + "medicaid dental", "medicaid", "medicare dental", + "public assistance dental", "low-income dental", + "dental benefits", "dental coverage" + ], + PolicyTopic.DENTAL_CLINIC_FUNDING: [ + "dental clinic", 
"community dental clinic", + "dental center", "dental facility", "clinic funding", + "dental services funding" + ], + PolicyTopic.COMMUNITY_DENTAL_PROGRAM: [ + "community dental", "dental program", "oral health program", + "dental outreach", "mobile dental", "dental van" + ], + PolicyTopic.CHILDREN_DENTAL_HEALTH: [ + "children's dental", "pediatric dental", "child dental", + "kids dental", "youth dental", "infant oral health" + ], + PolicyTopic.DENTAL_CARE_ACCESS: [ + "dental access", "access to dental", "dental care", + "oral health access", "dental services", "dental disparities" + ] + } + + async def process(self, message: AgentMessage) -> List[AgentMessage]: + """ + Process classification commands. + + Args: + message: Message containing parsed documents to classify + + Returns: + List of messages with classification results + """ + self.update_status(AgentStatus.PROCESSING, "Classifying policy documents") + + try: + documents = message.payload.get("documents", []) + + # Classify documents in batches + batch_size = settings.classifier_batch_size + classified_documents = [] + + for i in range(0, len(documents), batch_size): + batch = documents[i:i + batch_size] + batch_results = await self._classify_batch(batch) + classified_documents.extend(batch_results) + + # Filter to only relevant documents + relevant_documents = [ + doc for doc in classified_documents + if doc["classification"]["primary_topic"] != PolicyTopic.NOT_RELEVANT + ] + + # Send classified documents to sentiment analyzer + response = await self.send_message( + AgentRole.SENTIMENT_ANALYZER, + MessageType.DATA, + { + "workflow_id": message.payload.get("workflow_id"), + "documents": relevant_documents, + "count": len(relevant_documents), + "filtered_count": len(documents) - len(relevant_documents) + } + ) + + self.log_success() + logger.info( + f"Classified {len(documents)} documents, " + f"{len(relevant_documents)} relevant to oral health policy" + ) + + return [response] + + except Exception as e: + 
self.log_failure(str(e)) + error_msg = await self.send_message( + AgentRole.ORCHESTRATOR, + MessageType.ERROR, + {"error": str(e), "agent": self.agent_id} + ) + return [error_msg] + + async def _classify_batch( + self, + documents: List[Dict[str, Any]] + ) -> List[Dict[str, Any]]: + """ + Classify a batch of documents. + + Args: + documents: Batch of documents to classify + + Returns: + Documents with classification results + """ + tasks = [self._classify_document(doc) for doc in documents] + results = await asyncio.gather(*tasks, return_exceptions=True) + + classified = [] + for doc, result in zip(documents, results): + if isinstance(result, Exception): + logger.error(f"Classification error for {doc['document_id']}: {result}") + doc["classification"] = { + "primary_topic": PolicyTopic.NOT_RELEVANT, + "error": str(result) + } + else: + doc["classification"] = result + + classified.append(doc) + + return classified + + async def _classify_document( + self, + doc: Dict[str, Any] + ) -> Dict[str, Any]: + """ + Classify a single document. 
+ + Args: + doc: Document to classify + + Returns: + Classification results + """ + text = self._get_searchable_text(doc) + text_lower = text.lower() + + # Keyword-based classification + topic_scores = {} + for topic, keywords in self.topic_keywords.items(): + score = sum(1 for keyword in keywords if keyword in text_lower) + if score > 0: + topic_scores[topic] = score + + # Determine primary topic + if topic_scores: + primary_topic = max(topic_scores, key=topic_scores.get) + confidence = "high" if topic_scores[primary_topic] >= 3 else "medium" + + # Get all topics mentioned + all_topics = list(topic_scores.keys()) + else: + primary_topic = PolicyTopic.NOT_RELEVANT + confidence = "high" + all_topics = [] + + # Extract relevant excerpts + excerpts = self._extract_relevant_excerpts(doc, primary_topic) + + classification = { + "primary_topic": primary_topic, + "all_topics": all_topics, + "topic_scores": topic_scores, + "confidence": confidence, + "relevant_excerpts": excerpts, + "classified_at": datetime.utcnow().isoformat() + } + + return classification + + def _get_searchable_text(self, doc: Dict[str, Any]) -> str: + """Extract searchable text from document.""" + parts = [ + doc.get("raw_title", ""), + doc.get("full_text", "") + ] + + # Add agenda items + for item in doc.get("agenda_items", []): + parts.append(item.get("description", "")) + + # Add discussion sections + for section in doc.get("discussion_sections", []): + parts.append(section.get("text", "")) + + return " ".join(parts) + + def _extract_relevant_excerpts( + self, + doc: Dict[str, Any], + topic: str + ) -> List[Dict[str, str]]: + """Extract text excerpts relevant to the topic.""" + if topic == PolicyTopic.NOT_RELEVANT: + return [] + + keywords = self.topic_keywords.get(topic, []) + excerpts = [] + + # Check discussion sections + for section in doc.get("discussion_sections", []): + text = section.get("text", "") + text_lower = text.lower() + + # Check if any keywords present + if any(keyword in 
text_lower for keyword in keywords): + excerpts.append({ + "source": "discussion", + "text": text[:500], # First 500 chars + "section_id": section.get("section_id") + }) + + # Check agenda items + for item in doc.get("agenda_items", []): + desc = item.get("description", "") + desc_lower = desc.lower() + + if any(keyword in desc_lower for keyword in keywords): + excerpts.append({ + "source": "agenda", + "text": desc, + "item_number": item.get("number") + }) + + return excerpts[:5] # Return top 5 excerpts + + async def _llm_classify( + self, + text: str, + preliminary_topics: List[str] + ) -> Dict[str, Any]: + """ + Use LLM for nuanced classification when keywords are ambiguous. + + Args: + text: Text to classify + preliminary_topics: Topics identified by keyword matching + + Returns: + LLM classification results + """ + # This would use OpenAI API or similar + # Placeholder for now + return { + "llm_topic": preliminary_topics[0] if preliminary_topics else PolicyTopic.NOT_RELEVANT, + "llm_confidence": 0.8, + "llm_reasoning": "Based on keyword analysis" + } diff --git a/agents/debate_grader.py b/agents/debate_grader.py new file mode 100644 index 0000000000000000000000000000000000000000..1a138bb106cf61945d74ed733b03277c5f3cc177 --- /dev/null +++ b/agents/debate_grader.py @@ -0,0 +1,424 @@ +""" +Debate Grader Agent for evaluating government decisions using debate framework. 
+ +Evaluates decisions across three dimensions: +- Harms: The problem/crisis identified +- Solvency: How the proposed solution addresses the problem +- Topicality: Whether the solution fits within jurisdiction's authority +""" +import asyncio +from typing import List, Dict, Any, Optional +from datetime import datetime +from loguru import logger + +from agents.base import BaseAgent, AgentRole, AgentMessage, MessageType, AgentStatus + + +class DebateDimension: + """Enumeration of debate evaluation dimensions.""" + HARMS = "harms" # The problem + SOLVENCY = "solvency" # The fix + TOPICALITY = "topicality" # The scope + + +class DebateScore: + """Score levels for each debate dimension.""" + EXCELLENT = "excellent" # 4-5/5 + GOOD = "good" # 3-4/5 + FAIR = "fair" # 2-3/5 + WEAK = "weak" # 1-2/5 + MISSING = "missing" # 0-1/5 + + +class DebateGraderAgent(BaseAgent): + """ + Agent responsible for grading government decisions using debate framework. + + Translates debate concepts for laypeople: + - Harms → "The Problem": Why is this a crisis in our community? + - Solvency → "The Fix": How does this solution actually work? + - Topicality → "The Scope": Does the government have authority to do this? 
+ """ + + def __init__(self, agent_id: str = "debate-grader-001"): + """Initialize the debate grader agent.""" + super().__init__(agent_id, AgentRole.SENTIMENT_ANALYZER) + self._initialize_criteria() + + def _initialize_criteria(self): + """Initialize evaluation criteria for each dimension.""" + + # Harms evaluation keywords + self.harms_indicators = { + "problem_identification": [ + "crisis", "emergency", "critical", "urgent need", + "widespread problem", "affecting", "impacting", + "suffering", "lack of", "shortage", "gap in services" + ], + "data_evidence": [ + "statistics", "data shows", "research indicates", + "study found", "percent", "%", "number of people", + "cases", "instances", "reports" + ], + "affected_population": [ + "children", "families", "residents", "citizens", + "low-income", "vulnerable", "underserved", + "community members", "students", "seniors" + ] + } + + # Solvency evaluation keywords + self.solvency_indicators = { + "solution_clarity": [ + "will", "would", "proposes to", "plans to", + "implement", "establish", "create", "provide", + "offer", "deliver", "fund", "allocate" + ], + "mechanism": [ + "through", "by", "using", "via", "process", + "program", "initiative", "partnership", + "collaboration", "service", "system" + ], + "evidence_of_effectiveness": [ + "proven", "successful in", "works in", + "demonstrated", "track record", "best practice", + "evidence-based", "research-backed" + ], + "implementation_plan": [ + "timeline", "budget", "staff", "resources", + "phase", "rollout", "launch", "start date", + "completion", "milestones" + ] + } + + # Topicality evaluation keywords + self.topicality_indicators = { + "legal_authority": [ + "authority", "jurisdiction", "mandate", + "chartered to", "empowered to", "authorized", + "within our purview", "responsibility" + ], + "precedent": [ + "previously", "historically", "past practice", + "similar actions", "other cities", "state law", + "federal law", "code", "ordinance" + ], + 
"scope_appropriateness": [ + "city council", "county commission", "board", + "department", "local government", "municipal", + "within scope", "appropriate for" + ] + } + + async def process(self, message: AgentMessage) -> List[AgentMessage]: + """ + Process debate grading commands. + + Args: + message: Message containing decisions/documents to grade + + Returns: + List of messages with debate grades + """ + self.update_status(AgentStatus.PROCESSING, "Grading decisions with debate framework") + + try: + documents = message.payload.get("documents", []) + + graded_documents = [] + + for doc in documents: + grade = await self._grade_document(doc) + doc["debate_grade"] = grade + graded_documents.append(doc) + + # Calculate aggregate insights + insights = self._generate_insights(graded_documents) + + # Send results + response = await self.send_message( + recipient=AgentRole.ORCHESTRATOR, + message_type=MessageType.RESPONSE, + payload={ + "documents": graded_documents, + "insights": insights, + "graded_count": len(graded_documents) + } + ) + + self.update_status(AgentStatus.COMPLETED, f"Graded {len(graded_documents)} decisions") + return [response] + + except Exception as e: + logger.error(f"Debate grading failed: {e}") + self.update_status(AgentStatus.ERROR, str(e)) + raise + + async def _grade_document(self, document: Dict[str, Any]) -> Dict[str, Any]: + """ + Grade a single document across all debate dimensions. 
+ + Args: + document: Document to grade + + Returns: + Dictionary with grades for each dimension + """ + text = document.get("content", "").lower() + title = document.get("title", "").lower() + combined_text = f"{title} {text}" + + # Grade each dimension + harms_score = self._grade_harms(combined_text) + solvency_score = self._grade_solvency(combined_text) + topicality_score = self._grade_topicality(combined_text) + + # Calculate overall score + overall_score = self._calculate_overall_score( + harms_score, solvency_score, topicality_score + ) + + return { + "dimensions": { + "harms": { + "score": harms_score["score"], + "grade": harms_score["grade"], + "explanation": harms_score["explanation"], + "layperson_label": "The Problem", + "layperson_question": "Why is this a crisis in our community?" + }, + "solvency": { + "score": solvency_score["score"], + "grade": solvency_score["grade"], + "explanation": solvency_score["explanation"], + "layperson_label": "The Fix", + "layperson_question": "How does this solution actually work?" + }, + "topicality": { + "score": topicality_score["score"], + "grade": topicality_score["grade"], + "explanation": topicality_score["explanation"], + "layperson_label": "The Scope", + "layperson_question": "Does the government have authority to do this?" 
+ } + }, + "overall": { + "score": overall_score, + "grade": self._score_to_grade(overall_score), + "summary": self._generate_summary(harms_score, solvency_score, topicality_score) + }, + "timestamp": datetime.utcnow().isoformat() + } + + def _grade_harms(self, text: str) -> Dict[str, Any]: + """Grade the 'harms' dimension - problem identification.""" + score = 0 + max_score = 5 + details = [] + + # Check for problem identification (0-2 points) + problem_count = sum(1 for keyword in self.harms_indicators["problem_identification"] if keyword in text) + if problem_count >= 3: + score += 2 + details.append("Strong problem identification") + elif problem_count >= 1: + score += 1 + details.append("Problem mentioned but not detailed") + + # Check for data/evidence (0-2 points) + data_count = sum(1 for keyword in self.harms_indicators["data_evidence"] if keyword in text) + if data_count >= 2: + score += 2 + details.append("Data-driven evidence provided") + elif data_count >= 1: + score += 1 + details.append("Some evidence mentioned") + + # Check for affected population (0-1 point) + population_count = sum(1 for keyword in self.harms_indicators["affected_population"] if keyword in text) + if population_count >= 1: + score += 1 + details.append("Affected population identified") + + return { + "score": score, + "max_score": max_score, + "grade": self._score_to_grade(score / max_score * 5), + "explanation": "; ".join(details) if details else "No clear problem statement" + } + + def _grade_solvency(self, text: str) -> Dict[str, Any]: + """Grade the 'solvency' dimension - solution effectiveness.""" + score = 0 + max_score = 5 + details = [] + + # Check for solution clarity (0-1 point) + solution_count = sum(1 for keyword in self.solvency_indicators["solution_clarity"] if keyword in text) + if solution_count >= 2: + score += 1 + details.append("Clear solution proposed") + + # Check for mechanism (0-2 points) + mechanism_count = sum(1 for keyword in 
self.solvency_indicators["mechanism"] if keyword in text) + if mechanism_count >= 3: + score += 2 + details.append("Implementation mechanism described") + elif mechanism_count >= 1: + score += 1 + details.append("Basic approach outlined") + + # Check for evidence of effectiveness (0-1 point) + evidence_count = sum(1 for keyword in self.solvency_indicators["evidence_of_effectiveness"] if keyword in text) + if evidence_count >= 1: + score += 1 + details.append("Evidence-based approach") + + # Check for implementation plan (0-1 point) + plan_count = sum(1 for keyword in self.solvency_indicators["implementation_plan"] if keyword in text) + if plan_count >= 2: + score += 1 + details.append("Implementation plan included") + + return { + "score": score, + "max_score": max_score, + "grade": self._score_to_grade(score / max_score * 5), + "explanation": "; ".join(details) if details else "No clear solution mechanism" + } + + def _grade_topicality(self, text: str) -> Dict[str, Any]: + """Grade the 'topicality' dimension - scope appropriateness.""" + score = 0 + max_score = 5 + details = [] + + # Check for legal authority (0-2 points) + authority_count = sum(1 for keyword in self.topicality_indicators["legal_authority"] if keyword in text) + if authority_count >= 2: + score += 2 + details.append("Legal authority cited") + elif authority_count >= 1: + score += 1 + details.append("Authority mentioned") + + # Check for precedent (0-2 points) + precedent_count = sum(1 for keyword in self.topicality_indicators["precedent"] if keyword in text) + if precedent_count >= 2: + score += 2 + details.append("Precedent established") + elif precedent_count >= 1: + score += 1 + details.append("Some precedent referenced") + + # Check for scope appropriateness (0-1 point) + scope_count = sum(1 for keyword in self.topicality_indicators["scope_appropriateness"] if keyword in text) + if scope_count >= 1: + score += 1 + details.append("Within appropriate scope") + + return { + "score": score, + 
"max_score": max_score, + "grade": self._score_to_grade(score / max_score * 5), + "explanation": "; ".join(details) if details else "Unclear jurisdictional authority" + } + + def _score_to_grade(self, normalized_score: float) -> str: + """Convert numerical score to grade.""" + if normalized_score >= 4.0: + return DebateScore.EXCELLENT + elif normalized_score >= 3.0: + return DebateScore.GOOD + elif normalized_score >= 2.0: + return DebateScore.FAIR + elif normalized_score >= 1.0: + return DebateScore.WEAK + else: + return DebateScore.MISSING + + def _calculate_overall_score( + self, + harms: Dict[str, Any], + solvency: Dict[str, Any], + topicality: Dict[str, Any] + ) -> float: + """Calculate weighted overall score.""" + # Weight: Harms 40%, Solvency 40%, Topicality 20% + harms_normalized = (harms["score"] / harms["max_score"]) * 5 + solvency_normalized = (solvency["score"] / solvency["max_score"]) * 5 + topicality_normalized = (topicality["score"] / topicality["max_score"]) * 5 + + overall = (harms_normalized * 0.4) + (solvency_normalized * 0.4) + (topicality_normalized * 0.2) + return round(overall, 2) + + def _generate_summary( + self, + harms: Dict[str, Any], + solvency: Dict[str, Any], + topicality: Dict[str, Any] + ) -> str: + """Generate human-readable summary.""" + parts = [] + + if harms["grade"] in [DebateScore.EXCELLENT, DebateScore.GOOD]: + parts.append("Strong problem identification") + else: + parts.append("Weak problem statement") + + if solvency["grade"] in [DebateScore.EXCELLENT, DebateScore.GOOD]: + parts.append("clear solution") + else: + parts.append("unclear fix") + + if topicality["grade"] in [DebateScore.EXCELLENT, DebateScore.GOOD]: + parts.append("within authority") + else: + parts.append("questionable scope") + + return "; ".join(parts).capitalize() + + def _generate_insights(self, documents: List[Dict[str, Any]]) -> Dict[str, Any]: + """Generate aggregate insights from all graded documents.""" + if not documents: + return {} + + total = 
len(documents) + dimension_scores = { + "harms": [], + "solvency": [], + "topicality": [] + } + overall_scores = [] + + for doc in documents: + grade = doc.get("debate_grade", {}) + dimensions = grade.get("dimensions", {}) + + for dim in ["harms", "solvency", "topicality"]: + if dim in dimensions: + dimension_scores[dim].append(dimensions[dim]["score"]) + + if "overall" in grade: + overall_scores.append(grade["overall"]["score"]) + + # Calculate averages + insights = { + "total_documents": total, + "average_scores": { + "harms": round(sum(dimension_scores["harms"]) / len(dimension_scores["harms"]), 2) if dimension_scores["harms"] else 0, + "solvency": round(sum(dimension_scores["solvency"]) / len(dimension_scores["solvency"]), 2) if dimension_scores["solvency"] else 0, + "topicality": round(sum(dimension_scores["topicality"]) / len(dimension_scores["topicality"]), 2) if dimension_scores["topicality"] else 0, + "overall": round(sum(overall_scores) / len(overall_scores), 2) if overall_scores else 0 + }, + "strongest_dimension": max( + dimension_scores.items(), + key=lambda x: sum(x[1]) / len(x[1]) if x[1] else 0 + )[0] if any(dimension_scores.values()) else None, + "weakest_dimension": min( + dimension_scores.items(), + key=lambda x: sum(x[1]) / len(x[1]) if x[1] else 0 + )[0] if any(dimension_scores.values()) else None + } + + return insights diff --git a/agents/mlflow_base.py b/agents/mlflow_base.py new file mode 100644 index 0000000000000000000000000000000000000000..619489ade33b2039ef88e5e20f61eb67cedba8c2 --- /dev/null +++ b/agents/mlflow_base.py @@ -0,0 +1,307 @@ +""" +MLflow-based agent foundation for Databricks Agent Bricks. 
"""
MLflow-based agent foundation for Databricks Agent Bricks.

Provides:
- MLflow Pyfunc model wrappers for agents
- Unity Catalog integration
- Automatic tracing and observability
- Model serving compatibility
"""
from typing import Any, Dict, List, Optional, Union
from abc import ABC, abstractmethod
import mlflow
from mlflow.pyfunc import PythonModel
from mlflow.models import infer_signature
from mlflow.tracking import MlflowClient
import pandas as pd
from datetime import datetime
from loguru import logger

from agents.base import AgentRole, AgentMessage, AgentStatus
from config import settings


class MLflowAgentBase(PythonModel, ABC):
    """
    Base class for agents that can be deployed via MLflow Model Serving.

    Integrates with:
    - Unity Catalog for governance
    - MLflow Tracking for experimentation
    - Databricks Model Serving for deployment
    - Mosaic AI Agent Framework for evaluation

    Subclasses implement ``_process_request`` (one request in, one response
    dict out) and ``_get_example_input`` (used for signature inference).
    """

    def __init__(self, agent_id: str, role: AgentRole):
        """
        Initialize MLflow agent.

        Args:
            agent_id: Unique identifier for this agent
            role: Agent role in the pipeline
        """
        super().__init__()
        self.agent_id = agent_id
        self.role = role
        self.status = AgentStatus.IDLE
        self.client = MlflowClient()

    def load_context(self, context):
        """
        Load agent context from MLflow (called during model loading).

        Currently only logs; no artifacts are read from ``context``.

        Args:
            context: MLflow context with model artifacts
        """
        logger.info(f"Loading {self.role.value} agent from MLflow context")

    @abstractmethod
    def _process_request(self, request: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a single agent request.

        Args:
            request: Input request dictionary

        Returns:
            Response dictionary
        """
        pass

    def _run_single_request(self, idx: int, request: Any) -> Dict[str, Any]:
        """Run one request inside its own span and never raise.

        Failures are converted into an ``{"status": "error", ...}`` record so
        that one bad request does not abort the rest of the batch.
        """
        with mlflow.start_span(name=f"process_request_{idx}") as req_span:
            default_id = f"req_{idx}"
            # A malformed (non-dict) request must not escape the per-request
            # error handling, so only call .get() on real dicts.
            request_id = request.get("request_id", default_id) if isinstance(request, dict) else default_id
            req_span.set_attribute("request_id", request_id)

            try:
                result = self._process_request(request)
                if isinstance(result, dict):
                    # Copy so the bookkeeping keys are not written into a dict
                    # the subclass may still hold a reference to.
                    result = dict(result)
                result["status"] = "success"
                result["agent_id"] = self.agent_id
                result["timestamp"] = datetime.utcnow().isoformat()
                req_span.set_attribute("status", "success")
                return result

            except Exception as e:
                req_span.set_attribute("status", "error")
                req_span.set_attribute("error", str(e))
                logger.error(f"Error processing request {idx}: {e}")
                return {
                    "status": "error",
                    "error": str(e),
                    "agent_id": self.agent_id,
                    "timestamp": datetime.utcnow().isoformat()
                }

    def predict(
        self,
        context,
        model_input: Union[pd.DataFrame, Dict[str, Any], List[Dict[str, Any]]]
    ) -> Union[pd.DataFrame, Dict[str, Any], List[Dict[str, Any]]]:
        """
        MLflow Pyfunc predict interface.

        This is the main entry point when the agent is deployed as a Model Serving endpoint.

        Args:
            context: MLflow context (unused here)
            model_input: Input data (DataFrame, dict, or list of dicts)

        Returns:
            A DataFrame for DataFrame input, a single result dict for a
            single dict input, otherwise a list of result dicts. Each result
            carries ``status``, ``agent_id`` and ``timestamp``.

        Raises:
            Exception: Re-raised when the input itself cannot be iterated or
                converted; per-request failures are reported in the results.
        """
        # Enable MLflow tracing for observability
        with mlflow.start_span(name=f"{self.role.value}_agent") as span:
            span.set_attribute("agent_id", self.agent_id)
            span.set_attribute("agent_role", self.role.value)

            try:
                # Convert input to a list of request dicts
                return_df = isinstance(model_input, pd.DataFrame)
                if return_df:
                    requests = model_input.to_dict('records')
                elif isinstance(model_input, dict):
                    requests = [model_input]
                else:
                    requests = model_input

                results = [
                    self._run_single_request(idx, request)
                    for idx, request in enumerate(requests)
                ]

                # Return in the same shape the caller used
                if return_df:
                    return pd.DataFrame(results)
                elif len(results) == 1 and not isinstance(model_input, list):
                    return results[0]
                else:
                    return results

            except Exception as e:
                span.set_attribute("status", "error")
                span.set_attribute("error", str(e))
                logger.error(f"Error in {self.role.value} agent: {e}")
                raise

    def log_to_mlflow(
        self,
        model_name: str,
        artifact_path: str = "agent",
        registered_model_name: Optional[str] = None,
        **kwargs
    ):
        """
        Log this agent to MLflow.

        Runs ``predict`` once on the example input to infer the signature.

        Args:
            model_name: Name for the MLflow run
            artifact_path: Path within the run to store the model
            registered_model_name: Unity Catalog model name (e.g., "main.agents.scraper")
            **kwargs: Additional MLflow logging parameters

        Returns:
            The MLflow run ID.
        """
        with mlflow.start_run(run_name=model_name) as run:
            # Log agent metadata
            mlflow.log_param("agent_id", self.agent_id)
            mlflow.log_param("agent_role", self.role.value)
            mlflow.log_param("framework", "databricks-agent-bricks")

            # Create example input/output for signature
            example_input = self._get_example_input()
            example_output = self.predict(None, example_input)
            signature = infer_signature(example_input, example_output)

            # Log the model
            mlflow.pyfunc.log_model(
                artifact_path=artifact_path,
                python_model=self,
                signature=signature,
                registered_model_name=registered_model_name,
                **kwargs
            )

            logger.info(f"Logged {self.role.value} agent to MLflow run {run.info.run_id}")

            if registered_model_name:
                logger.info(f"Registered model as {registered_model_name}")

            return run.info.run_id

    @abstractmethod
    def _get_example_input(self) -> Union[pd.DataFrame, Dict[str, Any]]:
        """
        Get example input for MLflow signature inference.

        Returns:
            Example input data
        """
        pass

    def deploy_to_model_serving(
        self,
        model_name: str,
        endpoint_name: str,
        workload_size: str = "Small",
        scale_to_zero: bool = True
    ) -> str:
        """
        Deploy this agent to Databricks Model Serving.

        Args:
            model_name: Registered model name in Unity Catalog
            endpoint_name: Name for the serving endpoint
            workload_size: Endpoint size (Small, Medium, Large)
            scale_to_zero: Whether to scale to zero when idle

        Returns:
            Endpoint URL

        Raises:
            ValueError: If the registered model has no versions.
        """
        from databricks.sdk import WorkspaceClient
        from databricks.sdk.service.serving import ServedEntityInput, EndpointCoreConfigInput

        w = WorkspaceClient(
            host=settings.databricks_host,
            token=settings.databricks_token
        )

        # Pick the highest version number. Stage-based lookups
        # (get_latest_versions) are not available for Unity Catalog models,
        # so search by name instead.
        versions = self.client.search_model_versions(f"name='{model_name}'")
        if not versions:
            raise ValueError(f"No versions found for registered model '{model_name}'")
        latest_version = str(max(int(v.version) for v in versions))

        # Create or update endpoint
        endpoint_config = EndpointCoreConfigInput(
            name=endpoint_name,
            served_entities=[
                ServedEntityInput(
                    entity_name=model_name,
                    entity_version=latest_version,
                    workload_size=workload_size,
                    scale_to_zero_enabled=scale_to_zero
                )
            ]
        )

        try:
            w.serving_endpoints.create_and_wait(
                name=endpoint_name,
                config=endpoint_config
            )
            logger.info(f"Created endpoint: {endpoint_name}")
        except Exception as e:
            # NOTE(review): relies on the error text; confirm whether the SDK
            # exposes a dedicated "already exists" exception type.
            if "already exists" in str(e).lower():
                w.serving_endpoints.update_config_and_wait(
                    name=endpoint_name,
                    served_entities=endpoint_config.served_entities
                )
                logger.info(f"Updated endpoint: {endpoint_name}")
            else:
                raise

        # Avoid a double slash when the configured host ends with "/".
        host = str(settings.databricks_host).rstrip("/")
        endpoint_url = f"{host}/serving-endpoints/{endpoint_name}/invocations"
        return endpoint_url


class MLflowChainAgent(MLflowAgentBase):
    """
    Agent that uses LangChain with MLflow tracing.

    Provides integration with:
    - LangChain agents and chains
    - Automatic prompt logging
    - LLM call tracing
    - Tool usage tracking

    The chain is built lazily on the first request via ``_build_chain``.
    """

    def __init__(self, agent_id: str, role: AgentRole):
        """Initialize LangChain-based agent."""
        super().__init__(agent_id, role)
        self.chain = None

    def _setup_langchain_tracing(self):
        """Enable MLflow tracing for LangChain."""
        mlflow.langchain.autolog()

    @abstractmethod
    def _build_chain(self):
        """
        Build the LangChain chain for this agent.

        Returns:
            LangChain chain or agent
        """
        pass

    def _process_request(self, request: Dict[str, Any]) -> Dict[str, Any]:
        """Process request through LangChain.

        Returns whatever ``self.chain.invoke(request)`` returns; the concrete
        type depends on the chain built by the subclass.
        """
        if self.chain is None:
            self.chain = self._build_chain()

        with mlflow.start_span(name="langchain_invoke") as span:
            result = self.chain.invoke(request)

            # Log token usage when the result exposes it
            if hasattr(result, "llm_output"):
                span.set_attribute("tokens_used", result.llm_output.get("token_usage", {}).get("total_tokens", 0))

            return result
"""
Policy Classifier Agent - MLflow version for Databricks Agent Bricks.

Classifies meeting documents for oral health policy topics using:
- Keyword matching and NLP
- LLM-based classification for ambiguous cases
- Unity Catalog for model governance
- MLflow tracing for observability
"""
from typing import Any, Dict, List, Optional
import pandas as pd
from enum import Enum
import mlflow
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field

from agents.mlflow_base import MLflowChainAgent
from agents.base import AgentRole
from config import settings


class PolicyTopic(str, Enum):
    """Oral health policy topics to classify."""
    WATER_FLUORIDATION = "water_fluoridation"
    SCHOOL_DENTAL_SCREENING = "school_dental_screening"
    MEDICAID_DENTAL = "medicaid_dental_expansion"
    LOW_INCOME_DENTAL_FUNDING = "low_income_dental_funding"
    DENTAL_INSURANCE_MANDATE = "dental_insurance_mandate"
    DENTAL_WORKFORCE = "dental_workforce_development"
    COMMUNITY_HEALTH_CENTER = "community_health_center_dental"
    OTHER_ORAL_HEALTH = "other_oral_health"
    NOT_ORAL_HEALTH = "not_oral_health_related"


class ClassificationResult(BaseModel):
    """Structured classification output."""
    primary_topic: PolicyTopic = Field(description="Primary policy topic")
    secondary_topics: List[PolicyTopic] = Field(default_factory=list, description="Additional relevant topics")
    confidence: float = Field(ge=0.0, le=1.0, description="Classification confidence")
    relevant_excerpts: List[str] = Field(default_factory=list, description="Key text excerpts")
    reasoning: str = Field(description="Brief explanation of classification")


class PolicyClassifierAgent(MLflowChainAgent):
    """
    Agent that classifies documents for oral health policy topics.

    Can be deployed to Databricks Model Serving and integrated with
    Unity Catalog for governance.

    Classification is keyword-first; the LLM chain is only invoked when the
    keyword confidence is below 0.8.
    """

    # Keywords for each topic (fallback classification). Matching is
    # case-insensitive: both keyword and text are lower-cased before comparing.
    TOPIC_KEYWORDS = {
        PolicyTopic.WATER_FLUORIDATION: {
            "fluoride", "fluoridation", "water supply", "dental fluorosis",
            "community water", "fluoride levels", "fluoridated water"
        },
        PolicyTopic.SCHOOL_DENTAL_SCREENING: {
            "school dental", "screening program", "student dental", "school health",
            "dental exam", "school nurse", "oral health screening"
        },
        PolicyTopic.MEDICAID_DENTAL: {
            "medicaid dental", "adult dental coverage", "medicaid expansion",
            "dental benefits", "state medicaid", "covered dental services"
        },
        PolicyTopic.LOW_INCOME_DENTAL_FUNDING: {
            "low-income dental", "dental safety net", "free dental clinic",
            "dental voucher", "sliding scale dental", "charity care"
        },
        PolicyTopic.DENTAL_INSURANCE_MANDATE: {
            "dental insurance", "insurance mandate", "coverage requirement",
            "pediatric dental", "essential health benefits"
        },
        PolicyTopic.DENTAL_WORKFORCE: {
            "dental hygienist", "dental therapist", "scope of practice",
            "workforce shortage", "dental provider", "loan repayment"
        },
        PolicyTopic.COMMUNITY_HEALTH_CENTER: {
            "community health center", "FQHC", "health center dental",
            "federally qualified", "CHC dental"
        }
    }

    def __init__(self, agent_id: str = "classifier-mlflow-001"):
        """Initialize classifier agent."""
        super().__init__(agent_id, AgentRole.CLASSIFIER)
        self._setup_langchain_tracing()

    def _build_chain(self):
        """Build LangChain classification chain (prompt | llm | parser)."""
        # Initialize LLM (will use AI Gateway if configured)
        llm = ChatOpenAI(
            model=settings.classifier_model,
            temperature=0.1,
            openai_api_key=settings.openai_api_key
        )

        # Create output parser
        parser = PydanticOutputParser(pydantic_object=ClassificationResult)

        # Create prompt template
        prompt = ChatPromptTemplate.from_messages([
            ("system", """You are an expert policy analyst specializing in oral health policy.

Classify the following government meeting document for oral health policy topics.

Available topics:
- water_fluoridation: Fluoride in public water systems
- school_dental_screening: School-based dental programs
- medicaid_dental_expansion: Medicaid dental coverage
- low_income_dental_funding: Funding for low-income dental care
- dental_insurance_mandate: Insurance coverage requirements
- dental_workforce_development: Training, scope of practice
- community_health_center_dental: CHC/FQHC dental services
- other_oral_health: Other oral health topics
- not_oral_health_related: Not related to oral health

{format_instructions}"""),
            ("user", """Document Title: {title}

Document Content:
{content}

Classify this document and provide relevant excerpts.""")
        ])

        # Build chain
        chain = prompt | llm | parser
        return chain

    def _process_request(self, request: Dict[str, Any]) -> Dict[str, Any]:
        """
        Classify a document for oral health policy topics.

        Args:
            request: Dict with 'document_id', 'title', 'content'

        Returns:
            Classification results with topics and confidence. Every path
            (keywords, llm, keywords_fallback) includes ``document_id`` and
            ``method``.
        """
        document_id = request.get("document_id")
        # `or ""` also covers keys that are present but None.
        title = request.get("title") or ""
        content = request.get("content") or ""

        with mlflow.start_span(name="classify_document") as span:
            span.set_attribute("document_id", document_id)

            # Try keyword-based classification first (faster, cheaper)
            keyword_result = {
                "document_id": document_id,
                **self._classify_by_keywords(title + " " + content)
            }

            if keyword_result["confidence"] >= 0.8:
                # High confidence from keywords, no LLM needed
                span.set_attribute("classification_method", "keywords")
                return keyword_result

            # Use LLM for ambiguous cases
            span.set_attribute("classification_method", "llm")

            try:
                llm_result = super()._process_request({
                    "title": title,
                    "content": content[:4000],  # Limit context length
                    "format_instructions": self._get_format_instructions()
                })

                return {
                    "document_id": document_id,
                    "primary_topic": llm_result.primary_topic.value,
                    "secondary_topics": [t.value for t in llm_result.secondary_topics],
                    "confidence": llm_result.confidence,
                    "relevant_excerpts": llm_result.relevant_excerpts,
                    "reasoning": llm_result.reasoning,
                    "method": "llm"
                }

            except Exception as e:
                # Deliberate best-effort: fall back to keywords if the LLM
                # call or output parsing fails, recording the error on the span.
                span.set_attribute("llm_error", str(e))
                return {**keyword_result, "method": "keywords_fallback"}

    def _classify_by_keywords(self, text: str) -> Dict[str, Any]:
        """
        Fast keyword-based classification.

        Args:
            text: Document text

        Returns:
            Classification result. With no keyword hit the document is
            reported as not oral-health related with confidence 0.9.
        """
        text_lower = text.lower()
        scores = {}

        # Score each topic by the number of distinct keywords present.
        # Keywords are lower-cased too, otherwise entries such as "FQHC"
        # could never match the lower-cased text.
        for topic, keywords in self.TOPIC_KEYWORDS.items():
            score = sum(1 for keyword in keywords if keyword.lower() in text_lower)
            if score > 0:
                scores[topic] = score

        if not scores:
            return {
                "primary_topic": PolicyTopic.NOT_ORAL_HEALTH.value,
                "secondary_topics": [],
                "confidence": 0.9,
                "relevant_excerpts": [],
                "reasoning": "No oral health keywords found",
                "method": "keywords"
            }

        # Get top topics
        sorted_topics = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        primary_topic = sorted_topics[0][0]
        secondary_topics = [t for t, s in sorted_topics[1:3] if s >= 2]

        # Confidence grows with the number of hits for the top topic, capped at 0.95
        max_score = sorted_topics[0][1]
        confidence = min(0.95, 0.5 + (max_score / 10))

        # Extract relevant excerpts
        excerpts = self._extract_excerpts(text, primary_topic)

        return {
            "primary_topic": primary_topic.value,
            "secondary_topics": [t.value for t in secondary_topics],
            "confidence": confidence,
            "relevant_excerpts": excerpts,
            "reasoning": f"Found {max_score} keyword matches for {primary_topic.value}",
            "method": "keywords"
        }

    def _extract_excerpts(self, text: str, topic: PolicyTopic, max_excerpts: int = 3) -> List[str]:
        """Extract up to ``max_excerpts`` sentences that mention a topic keyword.

        Sentences are split naively on ". "; matching is case-insensitive.
        """
        keywords = [keyword.lower() for keyword in self.TOPIC_KEYWORDS.get(topic, set())]
        sentences = text.split('. ')

        relevant = []
        for sentence in sentences:
            sentence_lower = sentence.lower()
            if any(keyword in sentence_lower for keyword in keywords):
                relevant.append(sentence.strip())
                if len(relevant) >= max_excerpts:
                    break

        return relevant

    def _get_format_instructions(self) -> str:
        """Get format instructions for LLM output parsing."""
        parser = PydanticOutputParser(pydantic_object=ClassificationResult)
        return parser.get_format_instructions()

    def _get_example_input(self) -> Dict[str, Any]:
        """Get example input for MLflow signature."""
        return {
            "document_id": "doc_12345",
            "title": "City Council Meeting - Water Quality Discussion",
            "content": "The council discussed adding fluoride to the municipal water supply..."
        }


def register_classifier_to_unity_catalog():
    """
    Register the classifier agent to Unity Catalog.

    Usage:
        python -c "from agents.mlflow_classifier import register_classifier_to_unity_catalog; register_classifier_to_unity_catalog()"

    Returns:
        The MLflow run ID of the logged model.
    """
    agent = PolicyClassifierAgent()

    # Log and register to Unity Catalog
    run_id = agent.log_to_mlflow(
        model_name="policy_classifier_agent",
        registered_model_name=f"{settings.catalog_name}.{settings.schema_name}.policy_classifier",
        pip_requirements=[
            "mlflow>=2.10.0",
            "langchain>=0.1.0",
            "openai>=1.6.0",
            "pydantic>=2.5.0"
        ]
    )

    print(f"✅ Registered policy classifier agent to Unity Catalog")
    print(f"   Model: {settings.catalog_name}.{settings.schema_name}.policy_classifier")
    print(f"   Run ID: {run_id}")

    return run_id


if __name__ == "__main__":
    # Test the agent locally
    agent = PolicyClassifierAgent()

    test_input = {
        "document_id": "test_001",
        "title": "School Board Meeting Minutes",
        "content": """
        The school board discussed implementing a new dental screening program
        for elementary students. The program would provide free dental exams
        and referrals to local dentists for students in need.
        """
    }

    result = agent.predict(None, test_input)
    print("Classification Result:", result)


# ---- agents/orchestrator.py ----
"""
Multi-Agent Orchestrator for coordinating the policy analysis pipeline.
"""
import asyncio
from typing import Dict, List, Optional, Any
from datetime import datetime
from collections import defaultdict
from loguru import logger

from agents.base import (
    BaseAgent,
    AgentRole,
    AgentMessage,
    MessageType,
    AgentStatus
)


class WorkflowStage(str):
    """Workflow stage identifiers (plain string constants)."""
    SCRAPE = "scrape"
    PARSE = "parse"
    CLASSIFY = "classify"
    ANALYZE = "analyze"
    GENERATE = "generate"


class OrchestratorAgent(BaseAgent):
    """
    Orchestrator agent that coordinates the multi-agent workflow.

    The orchestrator manages the flow of data through the pipeline:
    1. Scraper Agent -> Collects meeting minutes
    2. Parser Agent -> Extracts structured data
    3. Classifier Agent -> Identifies oral health topics
    4. Sentiment Agent -> Analyzes policy positions
    5. Advocacy Agent -> Generates outreach materials
    """

    def __init__(self, agent_id: str = "orchestrator-001"):
        """Initialize the orchestrator agent."""
        super().__init__(agent_id, AgentRole.ORCHESTRATOR)
        self.agents: Dict[AgentRole, BaseAgent] = {}
        self.workflow_state: Dict[str, Any] = defaultdict(dict)
        self.active_workflows: Dict[str, Dict[str, Any]] = {}

    def register_agent(self, agent: BaseAgent):
        """
        Register an agent with the orchestrator.

        A later registration for the same role replaces the earlier one.

        Args:
            agent: The agent to register
        """
        self.agents[agent.role] = agent
        logger.info(f"Registered {agent.role.value} agent: {agent.agent_id}")

    async def process(self, message: AgentMessage) -> List[AgentMessage]:
        """
        Process orchestrator commands and route messages.

        Supported commands (``payload["command"]``): ``start_workflow``,
        ``check_status``, ``stop_workflow``. Anything else yields no messages.

        Args:
            message: The incoming message

        Returns:
            List of response messages; on failure a single ERROR message
            addressed to the sender.
        """
        self.update_status(AgentStatus.PROCESSING, "Processing orchestrator command")

        try:
            responses: List[AgentMessage] = []

            if message.message_type == MessageType.COMMAND:
                command = message.payload.get("command")
                handlers = {
                    "start_workflow": self._start_workflow,
                    "check_status": self._check_workflow_status,
                    "stop_workflow": self._stop_workflow,
                }
                handler = handlers.get(command)
                if handler is not None:
                    responses = await handler(message.payload)
                else:
                    logger.warning(f"Ignoring unknown orchestrator command: {command!r}")

            # Record success for handled commands as well; returning straight
            # from the handler skipped this and left the PROCESSING status set.
            self.log_success()
            return responses

        except Exception as e:
            self.log_failure(str(e))
            return [await self.send_message(
                message.sender,
                MessageType.ERROR,
                {"error": str(e)}
            )]

    async def _start_workflow(self, payload: Dict[str, Any]) -> List[AgentMessage]:
        """
        Start a new policy analysis workflow.

        Registers the workflow in ``self.active_workflows`` under a fresh
        UUID and emits the initial scrape command.

        Args:
            payload: Workflow configuration

        Returns:
            List of messages to initiate the workflow
        """
        import uuid

        workflow_id = str(uuid.uuid4())
        workflow_config = payload.get("config") or {}

        # Initialize workflow state
        self.active_workflows[workflow_id] = {
            "id": workflow_id,
            "started_at": datetime.utcnow(),
            "stage": WorkflowStage.SCRAPE,
            "config": workflow_config,
            "status": "running"
        }

        logger.info(f"Starting workflow {workflow_id}")

        # Create initial scraping task
        scraper_message = await self.send_message(
            AgentRole.SCRAPER,
            MessageType.COMMAND,
            {
                "workflow_id": workflow_id,
                "command": "scrape",
                "targets": workflow_config.get("scrape_targets", []),
                "date_range": workflow_config.get("date_range", {})
            }
        )

        return [scraper_message]

    async def _check_workflow_status(self, payload: Dict[str, Any]) -> List[AgentMessage]:
        """
        Check the status of active workflows.

        With a known ``workflow_id`` the response describes that workflow;
        otherwise it summarises every tracked workflow.

        Args:
            payload: Status check request

        Returns:
            List containing status response
        """
        workflow_id = payload.get("workflow_id")

        if workflow_id and workflow_id in self.active_workflows:
            workflow = self.active_workflows[workflow_id]
            status_payload = {
                "workflow_id": workflow_id,
                "status": workflow.get("status"),
                "stage": workflow.get("stage"),
                "started_at": workflow.get("started_at").isoformat()
            }
        else:
            # Return status of all workflows
            status_payload = {
                "active_workflows": len(self.active_workflows),
                "workflows": [
                    {
                        "id": wf_id,
                        "status": wf["status"],
                        "stage": wf["stage"]
                    }
                    for wf_id, wf in self.active_workflows.items()
                ]
            }

        response = await self.send_message(
            AgentRole.ORCHESTRATOR,
            MessageType.RESPONSE,
            status_payload
        )

        return [response]

    async def _stop_workflow(self, payload: Dict[str, Any]) -> List[AgentMessage]:
        """
        Stop a running workflow.

        Only flips the tracked status to "stopped"; the entry stays in
        ``self.active_workflows``.

        Args:
            payload: Stop request with workflow_id

        Returns:
            List containing confirmation message
        """
        workflow_id = payload.get("workflow_id")

        if workflow_id in self.active_workflows:
            self.active_workflows[workflow_id]["status"] = "stopped"
            logger.info(f"Stopped workflow {workflow_id}")

            response = await self.send_message(
                AgentRole.ORCHESTRATOR,
                MessageType.RESPONSE,
                {"workflow_id": workflow_id, "status": "stopped"}
            )
        else:
            response = await self.send_message(
                AgentRole.ORCHESTRATOR,
                MessageType.ERROR,
                {"error": f"Workflow {workflow_id} not found"}
            )

        return [response]

    async def route_message(self, message: AgentMessage) -> Optional[List[AgentMessage]]:
        """
        Route a message to the appropriate agent.

        Args:
            message: The message to route

        Returns:
            Whatever the target agent's ``process`` returns, or ``None`` when
            no agent is registered for the recipient role or processing fails.
        """
        target_agent = self.agents.get(message.recipient)

        if not target_agent:
            logger.error(f"No agent found for role: {message.recipient}")
            return None

        try:
            response = await target_agent.process(message)
            return response
        except Exception as e:
            logger.error(f"Error routing message to {message.recipient}: {e}")
            return None

    async def execute_pipeline(
        self,
        scrape_targets: List[Dict[str, Any]],
        date_range: Optional[Dict[str, str]] = None
    ) -> Dict[str, Any]:
        """
        Execute the complete policy analysis pipeline.

        Only initiates the workflow: it returns the messages produced by the
        ``start_workflow`` command rather than waiting for later stages.

        Args:
            scrape_targets: List of government entities to scrape
            date_range: Optional date range for historical data

        Returns:
            Dictionary containing pipeline results
        """
        workflow_config = {
            "scrape_targets": scrape_targets,
            "date_range": date_range or {}
        }

        # Start the workflow
        start_message = await self.send_message(
            AgentRole.ORCHESTRATOR,
            MessageType.COMMAND,
            {
                "command": "start_workflow",
                "config": workflow_config
            }
        )

        results = await self.process(start_message)

        return {
            "success": True,
            "workflow_initiated": True,
            "messages": [msg.dict() for msg in results]
        }

    def get_all_agent_states(self) -> Dict[str, Any]:
        """Get the current state of all registered agents, keyed by role value."""
        return {
            role.value: agent.get_state().dict()
            for role, agent in self.agents.items()
        }
"""
Parser Agent for extracting and structuring data from raw meeting minutes.
"""
import re
from typing import List, Dict, Any, Optional
from datetime import datetime
from loguru import logger

from agents.base import BaseAgent, AgentRole, AgentMessage, MessageType, AgentStatus


class ParserAgent(BaseAgent):
    """
    Agent responsible for parsing raw meeting documents into structured data.

    Extracts:
    - Meeting metadata (date, type, location)
    - Attendees and participants
    - Agenda items
    - Discussion topics
    - Votes and decisions
    - Action items
    """

    def __init__(self, agent_id: str = "parser-001"):
        """Initialize the parser agent."""
        super().__init__(agent_id, AgentRole.PARSER)
        self._compile_patterns()

    def _compile_patterns(self):
        """Compile the regex patterns used by the ``_extract_*`` helpers."""
        self.patterns = {
            # "January 5, 2024" style dates (comma optional)
            "date": re.compile(
                r"(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}",
                re.IGNORECASE
            ),
            # "7:00", "7:00 PM"; the optional whitespace may be captured even
            # when no AM/PM follows, so callers should strip the match.
            "time": re.compile(r"\d{1,2}:\d{2}\s*(?:AM|PM|am|pm)?"),
            # Text after a "Present:" style label up to the next blank line
            "attendees": re.compile(r"(?:Present|Attending|Members Present):(.+?)(?:\n\n|\Z)", re.DOTALL | re.IGNORECASE),
            "motion": re.compile(r"(?:MOTION|Motion|MOVED)(.+?)(?:CARRIED|PASSED|FAILED|$)", re.DOTALL | re.IGNORECASE),
            "vote": re.compile(r"(?:Vote|VOTE):\s*(.+)", re.IGNORECASE),
            "agenda_item": re.compile(r"(?:Item|ITEM)\s+#?(\d+|[A-Z])[\.:]\s*(.+?)(?=\n(?:Item|ITEM)|$)", re.DOTALL | re.IGNORECASE)
        }

    async def process(self, message: AgentMessage) -> List[AgentMessage]:
        """
        Process parsing commands.

        Documents that fail to parse are skipped (and logged), not fatal.

        Args:
            message: Message containing raw documents to parse

        Returns:
            List with one DATA message for the classifier, or one ERROR
            message for the orchestrator if the batch as a whole fails.
        """
        self.update_status(AgentStatus.PROCESSING, "Parsing meeting documents")

        try:
            documents = message.payload.get("documents", [])

            parsed_documents = []

            for doc in documents:
                parsed = await self._parse_document(doc)
                if parsed:
                    parsed_documents.append(parsed)

            # Send parsed documents to classifier
            response = await self.send_message(
                AgentRole.CLASSIFIER,
                MessageType.DATA,
                {
                    "workflow_id": message.payload.get("workflow_id"),
                    "documents": parsed_documents,
                    "count": len(parsed_documents)
                }
            )

            self.log_success()
            logger.info(f"Parsed {len(parsed_documents)} documents")

            return [response]

        except Exception as e:
            self.log_failure(str(e))
            error_msg = await self.send_message(
                AgentRole.ORCHESTRATOR,
                MessageType.ERROR,
                {"error": str(e), "agent": self.agent_id}
            )
            return [error_msg]

    async def _parse_document(self, doc: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Parse a single meeting document.

        Requires ``document_id``, ``source_url``, ``municipality``, ``state``
        and ``title`` keys; a missing key is logged and yields ``None``.

        Args:
            doc: Raw document data

        Returns:
            Parsed document with structured fields, or ``None`` on any error
        """
        try:
            content = doc.get("content", "")

            parsed = {
                "document_id": doc["document_id"],
                "source_url": doc["source_url"],
                "municipality": doc["municipality"],
                "state": doc["state"],
                "raw_title": doc["title"],
                "parsed_at": datetime.utcnow().isoformat(),

                # Extracted structured data
                "meeting_date": self._extract_date(content, doc.get("meeting_date")),
                "meeting_time": self._extract_time(content),
                "meeting_type": doc.get("meeting_type", "Unknown"),
                "attendees": self._extract_attendees(content),
                "agenda_items": self._extract_agenda_items(content),
                "motions": self._extract_motions(content),
                "votes": self._extract_votes(content),
                "discussion_sections": self._extract_discussion_sections(content),

                # Full text for semantic search
                "full_text": content,

                # Metadata
                "metadata": doc.get("metadata", {})
            }

            return parsed

        except Exception as e:
            logger.error(f"Error parsing document {doc.get('document_id')}: {e}")
            return None

    def _extract_date(self, content: str, fallback_date: Optional[str]) -> str:
        """Extract meeting date from content.

        Returns the first "Month D, YYYY" match as written, else
        ``fallback_date``, else the current UTC time in ISO format.
        """
        match = self.patterns["date"].search(content)
        if match:
            return match.group(0)
        return fallback_date or datetime.utcnow().isoformat()

    def _extract_time(self, content: str) -> Optional[str]:
        """Extract meeting time from content.

        Returns the first time-like match without surrounding whitespace,
        or ``None`` when nothing matches.
        """
        match = self.patterns["time"].search(content)
        # The pattern can swallow the whitespace after "7:00" even when no
        # AM/PM follows; strip it so callers get a clean value.
        return match.group(0).strip() if match else None

    def _extract_attendees(self, content: str) -> List[str]:
        """Extract list of meeting attendees.

        Splits the text following the attendee label on commas and newlines
        and drops empty entries.
        """
        match = self.patterns["attendees"].search(content)
        if match:
            attendees_text = match.group(1)
            # Split by comma or newline
            attendees = re.split(r'[,\n]', attendees_text)
            return [a.strip() for a in attendees if a.strip()]
        return []

    def _extract_agenda_items(self, content: str) -> List[Dict[str, str]]:
        """Extract agenda items from content."""
items = [] + for match in self.patterns["agenda_item"].finditer(content): + items.append({ + "number": match.group(1).strip(), + "description": match.group(2).strip() + }) + return items + + def _extract_motions(self, content: str) -> List[Dict[str, str]]: + """Extract motions from content.""" + motions = [] + for match in self.patterns["motion"].finditer(content): + motions.append({ + "text": match.group(1).strip(), + "full_match": match.group(0).strip() + }) + return motions + + def _extract_votes(self, content: str) -> List[Dict[str, str]]: + """Extract voting records from content.""" + votes = [] + for match in self.patterns["vote"].finditer(content): + votes.append({ + "result": match.group(1).strip() + }) + return votes + + def _extract_discussion_sections(self, content: str) -> List[Dict[str, str]]: + """Extract discussion sections from content.""" + # Split content into paragraphs + paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()] + + sections = [] + for i, para in enumerate(paragraphs): + if len(para) > 100: # Only substantial paragraphs + sections.append({ + "section_id": i, + "text": para + }) + + return sections diff --git a/agents/scraper.py b/agents/scraper.py new file mode 100644 index 0000000000000000000000000000000000000000..68831e1cc53d1a5ee01f8f144dea9563b6521e5c --- /dev/null +++ b/agents/scraper.py @@ -0,0 +1,2113 @@ +""" +Scraper Agent for collecting government meeting minutes from various sources. 
+""" +import asyncio +import hashlib +import io +import json +import re +from typing import List, Dict, Any, Optional +from datetime import datetime, timedelta +from urllib.parse import urljoin, urlparse +import httpx +from bs4 import BeautifulSoup +from loguru import logger + +try: + from PyPDF2 import PdfReader +except Exception: + PdfReader = None + +try: + import pdfplumber +except Exception: + pdfplumber = None + +try: + import pytesseract + from pytesseract import TesseractNotFoundError +except Exception: + pytesseract = None + TesseractNotFoundError = Exception + +try: + from PIL import Image +except Exception: + Image = None + +try: + from youtube_transcript_api import YouTubeTranscriptApi +except Exception: + YouTubeTranscriptApi = None + +from agents.base import BaseAgent, AgentRole, AgentMessage, MessageType, AgentStatus + + +class MeetingDocument(dict): + """Structured representation of a meeting document.""" + + def __init__( + self, + document_id: str, + source_url: str, + municipality: str, + state: str, + meeting_date: datetime, + meeting_type: str, + title: str, + content: str, + metadata: Optional[Dict[str, Any]] = None + ): + super().__init__( + document_id=document_id, + source_url=source_url, + municipality=municipality, + state=state, + meeting_date=meeting_date.isoformat() if isinstance(meeting_date, datetime) else meeting_date, + meeting_type=meeting_type, + title=title, + content=content, + scraped_at=datetime.utcnow().isoformat(), + metadata=metadata or {} + ) + + +class ScraperAgent(BaseAgent): + """ + Agent responsible for scraping government meeting minutes from various sources. 
+ + Supports multiple platforms: + - Legistar (widely used by city councils) + - Granicus (meeting management platform) + - Generic municipal websites + - PDF documents + """ + + def __init__(self, agent_id: str = "scraper-001"): + """Initialize the scraper agent.""" + super().__init__(agent_id, AgentRole.SCRAPER) + self.http_client: Optional[httpx.AsyncClient] = None + self.scraped_urls: set = set() + self.document_extensions = (".pdf", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx") + self.meeting_keywords = ("minutes", "agenda", "meeting", "council", "commission", "board") + self.document_route_keywords = ( + "getagendafile", + "getminutesfile", + "download", + "agendafile", + "minutesfile", + ) + self.ocr_max_pages = 10 + self._ocr_missing_tesseract_warned = False + self.social_source_limit = 8 + + # Policy and meeting-focused keywords for social media filtering + self.policy_meeting_keywords = ( + # Meetings + "council meeting", "city council", "town council", "board meeting", + "commission meeting", "public meeting", "town hall", "session", + "special meeting", "regular meeting", "work session", "workshop", + # Documents + "agenda", "minutes", "ordinance", "resolution", "public hearing", + "hearing", "vote", "voting", "motion", "legislation", + # Policy topics + "policy", "budget", "zoning", "planning", "development", + "public comment", "community meeting", "civic", "government", + # Video/meeting specific + "live stream", "livestream", "recorded meeting", "meeting video", + "council session", "board session", "official meeting" + ) + + async def __aenter__(self): + """Async context manager entry.""" + self.http_client = httpx.AsyncClient( + timeout=30.0, + follow_redirects=True, + headers={ + "User-Agent": "OpenNavigator/1.0 (+https://github.com/getcommunityone/open-navigator)" + } + ) + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + """Async context manager exit.""" + if self.http_client: + await self.http_client.aclose() + + 
async def process(self, message: AgentMessage) -> List[AgentMessage]: + """ + Process scraping commands. + + Args: + message: Command message with scraping targets + + Returns: + List of messages containing scraped data + """ + self.update_status(AgentStatus.PROCESSING, "Scraping government meeting minutes") + + try: + command = message.payload.get("command") + + if command == "scrape": + targets = message.payload.get("targets", []) + date_range = message.payload.get("date_range", {}) + + # Initialize HTTP client if not already done + if not self.http_client: + async with self: + documents = await self._scrape_targets(targets, date_range) + else: + documents = await self._scrape_targets(targets, date_range) + + # Send scraped documents to parser + response = await self.send_message( + AgentRole.PARSER, + MessageType.DATA, + { + "workflow_id": message.payload.get("workflow_id"), + "documents": documents, + "count": len(documents) + } + ) + + self.log_success() + logger.info(f"Scraped {len(documents)} documents") + + return [response] + + return [] + + except Exception as e: + self.log_failure(str(e)) + error_msg = await self.send_message( + AgentRole.ORCHESTRATOR, + MessageType.ERROR, + {"error": str(e), "agent": self.agent_id} + ) + return [error_msg] + + async def _scrape_targets( + self, + targets: List[Dict[str, Any]], + date_range: Dict[str, str] + ) -> List[Dict[str, Any]]: + """ + Scrape multiple targets concurrently. 
+ + Args: + targets: List of scraping targets + date_range: Date range for filtering meetings + + Returns: + List of scraped documents + """ + tasks = [] + + for target in targets: + platform = target.get("platform", "generic") + url = target.get("url", "") + + if platform == "legistar": + tasks.append(self._scrape_legistar(target, date_range)) + elif platform == "granicus": + tasks.append(self._scrape_granicus(target, date_range)) + elif platform == "suiteonemedia" or "suiteonemedia" in url.lower(): + tasks.append(self._scrape_suiteonemedia(target, date_range)) + elif platform == "eboard" or "eboardsolutions.com" in url.lower() or "simbli.eboardsolutions" in url.lower(): + tasks.append(self._scrape_eboard(target, date_range)) + elif platform == "youtube": + tasks.append(self._scrape_youtube_source(target)) + elif platform == "facebook": + tasks.append(self._scrape_facebook_source(target)) + else: + tasks.append(self._scrape_generic(target, date_range)) + + # Execute all scraping tasks concurrently + results = await asyncio.gather(*tasks, return_exceptions=True) + + # Flatten results and filter out errors + documents = [] + for result in results: + if isinstance(result, Exception): + logger.error(f"Scraping error: {result}") + elif isinstance(result, list): + documents.extend(result) + + return documents + + async def scrape_social_sources( + self, + municipality: str, + state: str, + seed_url: str, + max_sources: int = 8 + ) -> List[Dict[str, Any]]: + """Discover and scrape YouTube/Facebook sources for a jurisdiction.""" + social_documents: List[Dict[str, Any]] = [] + + homepage_url = await self._resolve_homepage_url(municipality, state, seed_url) + if not homepage_url: + logger.warning(f"Could not resolve homepage URL for social scraping: {municipality}, {state}") + return social_documents + + logger.info(f"Discovering social sources from homepage: {homepage_url}") + social_urls = await self._discover_social_urls(homepage_url, municipality, state) + + 
youtube_urls = list(dict.fromkeys(social_urls.get("youtube", [])))[:max_sources] + facebook_urls = list(dict.fromkeys(social_urls.get("facebook", [])))[:max_sources] + + logger.info( + f"Social discovery for {municipality}: " + f"{len(youtube_urls)} YouTube, {len(facebook_urls)} Facebook" + ) + + tasks = [] + for y_url in youtube_urls: + tasks.append(self._scrape_youtube_source({ + "url": y_url, + "municipality": municipality, + "state": state, + })) + for f_url in facebook_urls: + tasks.append(self._scrape_facebook_source({ + "url": f_url, + "municipality": municipality, + "state": state, + })) + + if not tasks: + return social_documents + + results = await asyncio.gather(*tasks, return_exceptions=True) + for result in results: + if isinstance(result, Exception): + logger.warning(f"Social scraping error: {result}") + continue + if isinstance(result, list): + social_documents.extend(result) + + return social_documents + + async def _resolve_homepage_url(self, municipality: str, state: str, seed_url: str) -> str: + """Resolve an official website homepage used for social discovery.""" + if seed_url and "suiteonemedia" not in seed_url.lower(): + parsed = urlparse(seed_url) + return f"{parsed.scheme}://{parsed.netloc}" if parsed.scheme and parsed.netloc else seed_url + + city = (municipality or "").lower().replace(" ", "").replace("'", "") + st = (state or "").lower() + candidates = [ + f"https://www.{city}{st}.gov", + f"https://{city}{st}.gov", + f"https://www.cityof{city}.com", + f"https://www.{city}.gov", + f"https://www.{city}.com", + f"https://{city}.com", + ] + + for candidate in candidates: + try: + resp = await self.http_client.get(candidate, timeout=8) + if resp.status_code < 400: + parsed = urlparse(str(resp.url)) + return f"{parsed.scheme}://{parsed.netloc}" + except Exception: + continue + + return "" + + async def _discover_social_urls(self, homepage_url: str, municipality: str, state: str) -> Dict[str, List[str]]: + """Discover social media URLs from 
homepage and YouTube pattern matching.""" + discovered = {"youtube": [], "facebook": []} + + try: + from discovery.social_media_discovery import SocialMediaDiscovery + + async with SocialMediaDiscovery() as discovery: + social = await discovery.discover_from_website( + homepage_url=homepage_url, + jurisdiction_name=municipality, + state=state, + ) + discovered["youtube"] = social.get("youtube", []) + discovered["facebook"] = social.get("facebook", []) + except Exception as err: + logger.debug(f"SocialMediaDiscovery unavailable/failed: {err}") + + # Augment YouTube discovery using handle pattern search for better recall. + try: + from discovery.youtube_channel_discovery import YouTubeChannelDiscovery + + async with YouTubeChannelDiscovery() as ydisc: + channels = await ydisc.discover_channels( + city_name=municipality, + state_code=state, + homepage_url=homepage_url, + ) + for channel in channels: + url = channel.get("channel_url") + if url: + discovered["youtube"].append(url) + except Exception as err: + logger.debug(f"YouTubeChannelDiscovery unavailable/failed: {err}") + + discovered["youtube"] = list(dict.fromkeys(discovered["youtube"])) + discovered["facebook"] = list(dict.fromkeys(discovered["facebook"])) + return discovered + + def _is_policy_meeting_content(self, text: str) -> bool: + """Check if text content is related to policy or meetings.""" + if not text: + return False + text_lower = text.lower() + return any(keyword in text_lower for keyword in self.policy_meeting_keywords) + + def _extract_youtube_video_metadata(self, html: str, video_id: str) -> Dict[str, str]: + """Extract title and description from YouTube video page HTML.""" + metadata = {"title": "", "description": ""} + + try: + # Extract title from various possible patterns + title_match = re.search(r'"title":"([^"]+)"', html) + if title_match: + metadata["title"] = title_match.group(1) + else: + # Fallback to meta tags + title_match = re.search(r'
CommunityOne: The open path to everything local
+1&&(D=e(D,h),z=e(z,h),C=D.length,P=z.length),E=C,x=z.slice(0,C),y=x.length;y 16)throw Error(W2+Ut(e));if(!e.s)return new h(fr);for(yt=!1,c=m,s=new h(.03125);e.abs().gte(.1);)e=e.times(s),d+=5;for(r=Math.log(Xo(2,d))/Math.LN10*2+5|0,c+=r,n=i=a=new h(fr),h.precision=c;;){if(i=it(i.times(e),c),n=n.times(++u),s=a.plus(ba(i,n,c)),$i(s.d).slice(0,c)===$i(a.d).slice(0,c)){for(;d--;)a=it(a.times(a),c);return h.precision=m,t==null?(yt=!0,it(a,m)):a}a=s}}function Ut(e){for(var t=e.e*mt,n=e.d[0];n>=10;n/=10)t++;return t}function Ry(e,t,n){if(t>e.LN10.sd())throw yt=!0,n&&(e.precision=n),Error(Wr+"LN10 precision limit exceeded");return it(new e(e.LN10),t)}function Za(e){for(var t="";e--;)t+="0";return t}function rf(e,t){var n,r,i,a,s,c,u,d,h,m=1,p=10,v=e,_=v.d,x=v.constructor,y=x.precision;if(v.s<1)throw Error(Wr+(v.s?"NaN":"-Infinity"));if(v.eq(fr))return new x(0);if(t==null?(yt=!1,d=y):d=t,v.eq(10))return t==null&&(yt=!0),Ry(x,d);if(d+=p,x.precision=d,n=$i(_),r=n.charAt(0),a=Ut(v),Math.abs(a)<15e14){for(;r<7&&r!=1||r==1&&n.charAt(1)>3;)v=v.times(e),n=$i(v.d),r=n.charAt(0),m++;a=Ut(v),r>1?(v=new x("0."+n),a++):v=new x(r+"."+n.slice(1))}else return u=Ry(x,d+2,y).times(a+""),v=rf(new x(r+"."+n.slice(1)),d-p).plus(u),x.precision=y,t==null?(yt=!0,it(v,y)):v;for(c=s=v=ba(v.minus(fr),v.plus(fr),d),h=it(v.times(v),d),i=3;;){if(s=it(s.times(h),d),u=c.plus(ba(s,new x(i),d)),$i(u.d).slice(0,d)===$i(c.d).slice(0,d))return c=c.times(2),a!==0&&(c=c.plus(Ry(x,d+2,y).times(a+""))),c=ba(c,new x(m),d),x.precision=y,t==null?(yt=!0,it(c,y)):c;c=u,i+=2}}function I3(e,t){var n,r,i;for((n=t.indexOf("."))>-1&&(t=t.replace(".","")),(r=t.search(/e/i))>0?(n<0&&(n=r),n+=+t.slice(r+1),t=t.substring(0,r)):n<0&&(n=t.length),r=0;t.charCodeAt(r)===48;)++r;for(i=t.length;t.charCodeAt(i-1)===48;)--i;if(t=t.slice(r,i),t){if(i-=r,n=n-r-1,e.e=au(n/mt),e.d=[],r=(n+1)%mt,n<0&&(r+=mt),r$p||e.e<-$p))throw Error(W2+n)}else e.s=0,e.e=0,e.d=[0];return e}function it(e,t,n){var 
r,i,a,s,c,u,d,h,m=e.d;for(s=1,a=m[0];a>=10;a/=10)s++;if(r=t-s,r<0)r+=mt,i=t,d=m[h=0];else{if(h=Math.ceil((r+1)/mt),a=m.length,h>=a)return e;for(d=a=m[h],s=1;a>=10;a/=10)s++;r%=mt,i=r-mt+s}if(n!==void 0&&(a=Xo(10,s-i-1),c=d/a%10|0,u=t<0||m[h+1]!==void 0||d%a,u=n<4?(c||u)&&(n==0||n==(e.s<0?3:2)):c>5||c==5&&(n==4||u||n==6&&(r>0?i>0?d/Xo(10,s-i):0:m[h-1])%10&1||n==(e.s<0?8:7))),t<1||!m[0])return u?(a=Ut(e),m.length=1,t=t-a-1,m[0]=Xo(10,(mt-t%mt)%mt),e.e=au(-t/mt)||0):(m.length=1,m[0]=e.e=e.s=0),e;if(r==0?(m.length=h,a=1,h--):(m.length=h+1,a=Xo(10,mt-r),m[h]=i>0?(d/Xo(10,s-i)%Xo(10,i)|0)*a:0),u)for(;;)if(h==0){(m[0]+=a)==Qt&&(m[0]=1,++e.e);break}else{if(m[h]+=a,m[h]!=Qt)break;m[h--]=0,a=1}for(r=m.length;m[--r]===0;)m.pop();if(yt&&(e.e>$p||e.e<-$p))throw Error(W2+Ut(e));return e}function D6(e,t){var n,r,i,a,s,c,u,d,h,m,p=e.constructor,v=p.precision;if(!e.s||!t.s)return t.s?t.s=-t.s:t=new p(e),yt?it(t,v):t;if(u=e.d,m=t.d,r=t.e,d=e.e,u=u.slice(),s=d-r,s){for(h=s<0,h?(n=u,s=-s,c=m.length):(n=m,r=d,c=u.length),i=Math.max(Math.ceil(v/mt),c)+2,s>i&&(s=i,n.length=1),n.reverse(),i=s;i--;)n.push(0);n.reverse()}else{for(i=u.length,c=m.length,h=i =p&&v<=m}return _?ta(ta({},n),{},{radius:s,angle:tse(v,n)}):null},cM=function(t){return!N.isValidElement(t)&&!Ce(t)&&typeof t!="boolean"?t.className:""};function cf(e){"@babel/helpers - typeof";return cf=typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?function(t){return typeof t}:function(t){return t&&typeof Symbol=="function"&&t.constructor===Symbol&&t!==Symbol.prototype?"symbol":typeof t},cf(e)}var nse=["offset"];function rse(e){return sse(e)||ose(e)||ase(e)||ise()}function ise(){throw new TypeError(`Invalid attempt to spread non-iterable instance.
+In order to be iterable, non-array objects must have a [Symbol.iterator]() method.`)}function ase(e,t){if(e){if(typeof e=="string")return v1(e,t);var n=Object.prototype.toString.call(e).slice(8,-1);if(n==="Object"&&e.constructor&&(n=e.constructor.name),n==="Map"||n==="Set")return Array.from(e);if(n==="Arguments"||/^(?:Ui|I)nt(?:8|16|32)(?:Clamped)?Array$/.test(n))return v1(e,t)}}function ose(e){if(typeof Symbol<"u"&&e[Symbol.iterator]!=null||e["@@iterator"]!=null)return Array.from(e)}function sse(e){if(Array.isArray(e))return v1(e)}function v1(e,t){(t==null||t>e.length)&&(t=e.length);for(var n=0,r=new Array(t);n0&&(i+1===s&&(a+="."),a+=Za(r))),e.s<0?"-"+a:a}function R3(e,t){if(e.length>t)return e.length=t,!0}function B6(e){var t,n,r;function i(a){var s=this;if(!(s instanceof i))return new i(a);if(s.constructor=i,a instanceof i){s.s=a.s,s.e=a.e,s.d=(a=a.d)?a.slice():a;return}if(typeof a=="number"){if(a*0!==0)throw Error(ws+a);if(a>0)s.s=1;else if(a<0)a=-a,s.s=-1;else{s.s=0,s.e=0,s.d=[0];return}if(a===~~a&&a<1e7){s.e=0,s.d=[a];return}return I3(s,a.toString())}else if(typeof a!="string")throw Error(ws+a);if(a.charCodeAt(0)===45?(a=a.slice(1),s.s=-1):s.s=1,Oae.test(a))I3(s,a);else throw Error(ws+a)}if(i.prototype=ge,i.ROUND_UP=0,i.ROUND_DOWN=1,i.ROUND_CEIL=2,i.ROUND_FLOOR=3,i.ROUND_HALF_UP=4,i.ROUND_HALF_DOWN=5,i.ROUND_HALF_EVEN=6,i.ROUND_HALF_CEIL=7,i.ROUND_HALF_FLOOR=8,i.clone=B6,i.config=i.set=kae,e===void 0&&(e={}),e)for(r=["precision","rounding","toExpNeg","toExpPos","LN10"],t=0;t"u"||!L||!L.Mixin)){l=j(l)?l:[l];for(var f=0;f1,this._baseLayersList.style.display=l?"":"none"),this._separator.style.display=f&&l?"":"none",this},_onLayerChange:function(l){this._handlingClick||this._update();var f=this._getLayer(u(l.target)),g=f.overlay?l.type==="add"?"overlayadd":"overlayremove":l.type==="add"?"baselayerchange":null;g&&this._map.fire(g,f)},_createRadioElement:function(l,f){var g='",S=document.createElement("div");return 
S.innerHTML=g,S.firstChild},_addItem:function(l){var f=document.createElement("label"),g=this._map.hasLayer(l.layer),S;l.overlay?(S=document.createElement("input"),S.type="checkbox",S.className="leaflet-control-layers-selector",S.defaultChecked=g):S=this._createRadioElement("leaflet-base-layers_"+u(this),g),this._layerControlInputs.push(S),S.layerId=u(l.layer),Te(S,"click",this._onInputClick,this);var k=document.createElement("span");k.innerHTML=" "+l.name;var M=document.createElement("span");f.appendChild(M),M.appendChild(S),M.appendChild(k);var V=l.overlay?this._overlaysList:this._baseLayersList;return V.appendChild(f),this._checkDisabledLayers(),f},_onInputClick:function(){if(!this._preventClick){var l=this._layerControlInputs,f,g,S=[],k=[];this._handlingClick=!0;for(var M=l.length-1;M>=0;M--)f=l[M],g=this._getLayer(f.layerId).layer,f.checked?S.push(g):f.checked||k.push(g);for(M=0;M0&&k.push(k[0].slice()),k}function nl(l,f){return l.feature?i({},l.feature,{geometry:f}):bh(f)}function bh(l){return l.type==="Feature"||l.type==="FeatureCollection"?l:{type:"Feature",properties:{},geometry:l}}var Iv={toGeoJSON:function(l){return nl(this,{type:"Point",coordinates:$v(this.getLatLng(),l)})}};ph.include(Iv),Mv.include(Iv),gh.include(Iv),Ki.include({toGeoJSON:function(l){var f=!_r(this._latlngs),g=xh(this._latlngs,f?1:0,!1,l);return nl(this,{type:(f?"Multi":"")+"LineString",coordinates:g})}}),tl.include({toGeoJSON:function(l){var f=!_r(this._latlngs),g=f&&!_r(this._latlngs[0]),S=xh(this._latlngs,g?2:f?1:0,!0,l);return f||(S=[S]),nl(this,{type:(g?"Multi":"")+"Polygon",coordinates:S})}}),Js.include({toMultiPoint:function(l){var f=[];return this.eachLayer(function(g){f.push(g.toGeoJSON(l).geometry.coordinates)}),nl(this,{type:"MultiPoint",coordinates:f})},toGeoJSON:function(l){var f=this.feature&&this.feature.geometry&&this.feature.geometry.type;if(f==="MultiPoint")return this.toMultiPoint(l);var g=f==="GeometryCollection",S=[];return 
this.eachLayer(function(k){if(k.toGeoJSON){var M=k.toGeoJSON(l);if(g)S.push(M.geometry);else{var V=bh(M);V.type==="FeatureCollection"?S.push.apply(S,V.features):S.push(V)}}}),g?nl(this,{geometries:S,type:"GeometryCollection"}):{type:"FeatureCollection",features:S}}});function vN(l,f){return new Yi(l,f)}var II=vN,wh=Yr.extend({options:{opacity:1,alt:"",interactive:!1,crossOrigin:!1,errorOverlayUrl:"",zIndex:1,className:""},initialize:function(l,f,g){this._url=l,this._bounds=ae(f),x(this,g)},onAdd:function(){this._image||(this._initImage(),this.options.opacity<1&&this._updateOpacity()),this.options.interactive&&(Le(this._image,"leaflet-interactive"),this.addInteractiveTarget(this._image)),this.getPane().appendChild(this._image),this._reset()},onRemove:function(){xt(this._image),this.options.interactive&&this.removeInteractiveTarget(this._image)},setOpacity:function(l){return this.options.opacity=l,this._image&&this._updateOpacity(),this},setStyle:function(l){return l.opacity&&this.setOpacity(l.opacity),this},bringToFront:function(){return this._map&&Xs(this._image),this},bringToBack:function(){return this._map&&Qs(this._image),this},setUrl:function(l){return this._url=l,this._image&&(this._image.src=l),this},setBounds:function(l){return this._bounds=ae(l),this._map&&this._reset(),this},getEvents:function(){var l={zoom:this._reset,viewreset:this._reset};return this._zoomAnimated&&(l.zoomanim=this._animateZoom),l},setZIndex:function(l){return this.options.zIndex=l,this._updateZIndex(),this},getBounds:function(){return this._bounds},getElement:function(){return this._image},_initImage:function(){var 
l=this._url.tagName==="IMG",f=this._image=l?this._url:qe("img");if(Le(f,"leaflet-image-layer"),this._zoomAnimated&&Le(f,"leaflet-zoom-animated"),this.options.className&&Le(f,this.options.className),f.onselectstart=m,f.onmousemove=m,f.onload=s(this.fire,this,"load"),f.onerror=s(this._overlayOnError,this,"error"),(this.options.crossOrigin||this.options.crossOrigin==="")&&(f.crossOrigin=this.options.crossOrigin===!0?"":this.options.crossOrigin),this.options.zIndex&&this._updateZIndex(),l){this._url=f.src;return}f.src=this._url,f.alt=this.options.alt},_animateZoom:function(l){var f=this._map.getZoomScale(l.zoom),g=this._map._latLngBoundsToNewLayerBounds(this._bounds,l.zoom,l.center).min;Io(this._image,g,f)},_reset:function(){var l=this._image,f=new U(this._map.latLngToLayerPoint(this._bounds.getNorthWest()),this._map.latLngToLayerPoint(this._bounds.getSouthEast())),g=f.getSize();It(l,f.min),l.style.width=g.x+"px",l.style.height=g.y+"px"},_updateOpacity:function(){wr(this._image,this.options.opacity)},_updateZIndex:function(){this._image&&this.options.zIndex!==void 0&&this.options.zIndex!==null&&(this._image.style.zIndex=this.options.zIndex)},_overlayOnError:function(){this.fire("error");var l=this.options.errorOverlayUrl;l&&this._url!==l&&(this._url=l,this._image.src=l)},getCenter:function(){return this._bounds.getCenter()}}),RI=function(l,f,g){return new wh(l,f,g)},yN=wh.extend({options:{autoplay:!0,loop:!0,keepAspectRatio:!0,muted:!1,playsInline:!0},_initImage:function(){var l=this._url.tagName==="VIDEO",f=this._image=l?this._url:qe("video");if(Le(f,"leaflet-image-layer"),this._zoomAnimated&&Le(f,"leaflet-zoom-animated"),this.options.className&&Le(f,this.options.className),f.onselectstart=m,f.onmousemove=m,f.onloadeddata=s(this.fire,this,"load"),l){for(var g=f.getElementsByTagName("source"),S=[],k=0;k