Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
Commit ·
61d29fc
0
Parent(s):
Clean HuggingFace deployment without binary files
Browse files. This view is limited to 50 files because it contains too many changes. See raw diff
- .dockerignore +87 -0
- .gitignore +100 -0
- .huggingface/README.md +101 -0
- .huggingface/nginx.conf +125 -0
- .huggingface/start.sh +61 -0
- .huggingface/supervisord.conf +28 -0
- CITATIONS.md +1474 -0
- CONTRIBUTING.md +99 -0
- Dockerfile +90 -0
- INTEL_ARC_QUICKSTART.md +281 -0
- LICENSE +201 -0
- Makefile +169 -0
- README.md +534 -0
- README_HF.md +101 -0
- agents/__init__.py +16 -0
- agents/advocacy.py +408 -0
- agents/base.py +171 -0
- agents/classifier.py +295 -0
- agents/debate_grader.py +424 -0
- agents/mlflow_base.py +307 -0
- agents/mlflow_classifier.py +308 -0
- agents/orchestrator.py +269 -0
- agents/parser.py +199 -0
- agents/scraper.py +2113 -0
- agents/scraper_undetected.py +261 -0
- agents/sentiment.py +381 -0
- api/__init__.py +4 -0
- api/app.py +711 -0
- api/auth.py +153 -0
- api/database.py +62 -0
- api/errors.py +154 -0
- api/main.py +1288 -0
- api/models.py +268 -0
- api/routes/__init__.py +3 -0
- api/routes/auth.py +436 -0
- api/routes/bills.py +841 -0
- api/routes/bills_neon.py +481 -0
- api/routes/contact.py +118 -0
- api/routes/hf_search.py +182 -0
- api/routes/search.py +1685 -0
- api/routes/search_postgres.py +535 -0
- api/routes/social.py +544 -0
- api/routes/stats.py +453 -0
- api/routes/stats_neon.py +322 -0
- api/static/assets/index-BIH9Tona.css +1 -0
- api/static/assets/index-DoIJncqg.js +0 -0
- api/static/communityone_logo.jpg +0 -0
- api/static/communityone_logo.svg +22 -0
- api/static/communityone_logo_64.png +0 -0
- api/static/favicon.ico +0 -0
.dockerignore
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
.venv/
|
| 8 |
+
venv/
|
| 9 |
+
ENV/
|
| 10 |
+
env/
|
| 11 |
+
|
| 12 |
+
# Node
|
| 13 |
+
node_modules/
|
| 14 |
+
npm-debug.log*
|
| 15 |
+
yarn-debug.log*
|
| 16 |
+
yarn-error.log*
|
| 17 |
+
.pnpm-debug.log*
|
| 18 |
+
frontend/dist/
|
| 19 |
+
frontend/node_modules/
|
| 20 |
+
website/build/
|
| 21 |
+
website/node_modules/
|
| 22 |
+
website/.docusaurus/
|
| 23 |
+
|
| 24 |
+
# Data and cache
|
| 25 |
+
data/
|
| 26 |
+
cache/
|
| 27 |
+
logs/
|
| 28 |
+
output/
|
| 29 |
+
*.db
|
| 30 |
+
*.sqlite
|
| 31 |
+
|
| 32 |
+
# Git
|
| 33 |
+
.git/
|
| 34 |
+
.gitignore
|
| 35 |
+
.github/
|
| 36 |
+
|
| 37 |
+
# IDE
|
| 38 |
+
.vscode/
|
| 39 |
+
.idea/
|
| 40 |
+
*.swp
|
| 41 |
+
*.swo
|
| 42 |
+
|
| 43 |
+
# Documentation (we copy specific dirs)
|
| 44 |
+
docs/
|
| 45 |
+
*.md
|
| 46 |
+
!README.md
|
| 47 |
+
|
| 48 |
+
# Test and development
|
| 49 |
+
tests/
|
| 50 |
+
examples/
|
| 51 |
+
notebooks/
|
| 52 |
+
*.ipynb
|
| 53 |
+
|
| 54 |
+
# Large files
|
| 55 |
+
*.zip
|
| 56 |
+
*.tar.gz
|
| 57 |
+
*.mp4
|
| 58 |
+
*.avi
|
| 59 |
+
*.mov
|
| 60 |
+
|
| 61 |
+
# Environment
|
| 62 |
+
.env
|
| 63 |
+
.env.local
|
| 64 |
+
*.key
|
| 65 |
+
*.pem
|
| 66 |
+
|
| 67 |
+
# OS
|
| 68 |
+
.DS_Store
|
| 69 |
+
Thumbs.db
|
| 70 |
+
|
| 71 |
+
# Build artifacts
|
| 72 |
+
build/
|
| 73 |
+
dist/
|
| 74 |
+
*.egg-info/
|
| 75 |
+
|
| 76 |
+
# Temporary
|
| 77 |
+
tmp/
|
| 78 |
+
temp/
|
| 79 |
+
*.tmp
|
| 80 |
+
|
| 81 |
+
# Scripts we don't need in container
|
| 82 |
+
start-all.sh
|
| 83 |
+
stop-all.sh
|
| 84 |
+
deploy-huggingface.sh
|
| 85 |
+
test-huggingface-build.sh
|
| 86 |
+
migrate-docs.sh
|
| 87 |
+
install.sh
|
.gitignore
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
build/
|
| 8 |
+
develop-eggs/
|
| 9 |
+
dist/
|
| 10 |
+
downloads/
|
| 11 |
+
eggs/
|
| 12 |
+
.eggs/
|
| 13 |
+
lib/
|
| 14 |
+
!frontend/src/lib/
|
| 15 |
+
lib64/
|
| 16 |
+
parts/
|
| 17 |
+
sdist/
|
| 18 |
+
var/
|
| 19 |
+
wheels/
|
| 20 |
+
*.egg-info/
|
| 21 |
+
.installed.cfg
|
| 22 |
+
*.egg
|
| 23 |
+
|
| 24 |
+
# Node.js
|
| 25 |
+
node_modules/
|
| 26 |
+
|
| 27 |
+
# Virtual environments
|
| 28 |
+
venv/
|
| 29 |
+
env/
|
| 30 |
+
ENV/
|
| 31 |
+
.venv
|
| 32 |
+
.venv-intel
|
| 33 |
+
|
| 34 |
+
# IDE
|
| 35 |
+
.vscode/
|
| 36 |
+
.idea/
|
| 37 |
+
*.swp
|
| 38 |
+
*.swo
|
| 39 |
+
*~
|
| 40 |
+
.DS_Store
|
| 41 |
+
|
| 42 |
+
# Environment variables
|
| 43 |
+
.env
|
| 44 |
+
.env.local
|
| 45 |
+
.env.*.local
|
| 46 |
+
.env.production
|
| 47 |
+
.env.development
|
| 48 |
+
.env.test
|
| 49 |
+
|
| 50 |
+
# Credentials and secrets
|
| 51 |
+
credentials.json
|
| 52 |
+
service-account.json
|
| 53 |
+
*_credentials.json
|
| 54 |
+
auth.json
|
| 55 |
+
token.json
|
| 56 |
+
*_token.txt
|
| 57 |
+
*-token.txt
|
| 58 |
+
gcp-*.json
|
| 59 |
+
bigquery-*.json
|
| 60 |
+
.gcp/
|
| 61 |
+
|
| 62 |
+
# Logs
|
| 63 |
+
logs/
|
| 64 |
+
*.log
|
| 65 |
+
|
| 66 |
+
# Jupyter Notebook
|
| 67 |
+
.ipynb_checkpoints
|
| 68 |
+
*.ipynb
|
| 69 |
+
|
| 70 |
+
# Data files
|
| 71 |
+
data/
|
| 72 |
+
.migration_backup/
|
| 73 |
+
*.csv
|
| 74 |
+
*.parquet
|
| 75 |
+
*.delta
|
| 76 |
+
|
| 77 |
+
# Delta Lake
|
| 78 |
+
_delta_log/
|
| 79 |
+
|
| 80 |
+
# Testing
|
| 81 |
+
.coverage
|
| 82 |
+
htmlcov/
|
| 83 |
+
.pytest_cache/
|
| 84 |
+
.tox/
|
| 85 |
+
|
| 86 |
+
# Documentation
|
| 87 |
+
docs/_build/
|
| 88 |
+
|
| 89 |
+
# Databricks
|
| 90 |
+
.databricks/
|
| 91 |
+
|
| 92 |
+
# Secrets
|
| 93 |
+
secrets/
|
| 94 |
+
*.key
|
| 95 |
+
*.pem
|
| 96 |
+
|
| 97 |
+
# OS
|
| 98 |
+
Thumbs.db
|
| 99 |
+
# Binary files for Docker build only (not in git)
|
| 100 |
+
website/static/img/communityone_card.png
|
.huggingface/README.md
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: CommunityOne - Open Navigator
|
| 3 |
+
emoji: 🏛️
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
pinned: false
|
| 9 |
+
license: apache-2.0
|
| 10 |
+
tags:
|
| 11 |
+
- civic-engagement
|
| 12 |
+
- policy-tracking
|
| 13 |
+
- government-transparency
|
| 14 |
+
- nonprofit-discovery
|
| 15 |
+
- open-data
|
| 16 |
+
---
|
| 17 |
+
|
| 18 |
+
# 🏛️ CommunityOne - Open Navigator
|
| 19 |
+
|
| 20 |
+
**Track 90,000+ jurisdictions. Monitor 1.8M nonprofits. Amplify your voice.**
|
| 21 |
+
|
| 22 |
+
CommunityOne is a civic engagement platform that helps you discover advocacy opportunities, track policy changes, and connect with organizations working on the causes you care about.
|
| 23 |
+
|
| 24 |
+
## ✨ Features
|
| 25 |
+
|
| 26 |
+
- **🔍 Unified Search**: Find contacts, meetings, organizations, and causes across the entire United States
|
| 27 |
+
- **📊 Real-time Stats**: Track policy activity across 90,000+ cities, counties, and states
|
| 28 |
+
- **🏢 Nonprofit Discovery**: Explore 1.8M organizations from IRS data enriched with Every.org
|
| 29 |
+
- **📅 Meeting Minutes**: Search 250,000+ government meeting transcripts and agendas
|
| 30 |
+
- **🎯 Geographic Filtering**: Browse by state, county, or city to find local opportunities
|
| 31 |
+
- **🔐 OAuth Login**: Sign in with HuggingFace, GitHub, or Google to save your preferences
|
| 32 |
+
|
| 33 |
+
## 🚀 Three Services Architecture
|
| 34 |
+
|
| 35 |
+
This deployment runs three integrated services:
|
| 36 |
+
|
| 37 |
+
1. **📚 Documentation** (Docusaurus) - `/docs/`
|
| 38 |
+
2. **🖥️ Main Application** (React + Vite) - `/`
|
| 39 |
+
3. **⚡ API Backend** (FastAPI) - `/api/`
|
| 40 |
+
|
| 41 |
+
All services are reverse-proxied through nginx on port 7860.
|
| 42 |
+
|
| 43 |
+
## 📖 Quick Start
|
| 44 |
+
|
| 45 |
+
### Browse Without Login
|
| 46 |
+
- Click "Browse All" to explore data by state
|
| 47 |
+
- Use the search bar to find organizations, contacts, or causes
|
| 48 |
+
- Filter by location using the state/county/city selectors
|
| 49 |
+
|
| 50 |
+
### Sign In for Personalization
|
| 51 |
+
- Click "Login" in the top right
|
| 52 |
+
- Choose your OAuth provider (HuggingFace, GitHub, or Google)
|
| 53 |
+
- Follow organizations, leaders, and causes you care about
|
| 54 |
+
- Get personalized recommendations
|
| 55 |
+
|
| 56 |
+
### Explore the API
|
| 57 |
+
- Visit `/redoc` for interactive API documentation
|
| 58 |
+
- Try the search endpoints with state filters
|
| 59 |
+
- Export data in JSON format for your own projects
|
| 60 |
+
|
| 61 |
+
## 🛠️ Technology Stack
|
| 62 |
+
|
| 63 |
+
- **Frontend**: React 18 + TypeScript + Vite + TailwindCSS + shadcn/ui
|
| 64 |
+
- **Backend**: Python 3.11 + FastAPI + Pydantic
|
| 65 |
+
- **Data**: Delta Lake + Parquet (90GB+ of civic data)
|
| 66 |
+
- **Docs**: Docusaurus v3
|
| 67 |
+
- **Infrastructure**: nginx + supervisor + Docker
|
| 68 |
+
|
| 69 |
+
## 📊 Data Sources
|
| 70 |
+
|
| 71 |
+
- **IRS BMF**: 1.8M tax-exempt organizations
|
| 72 |
+
- **Every.org**: Nonprofit enrichment (logos, causes, revenue)
|
| 73 |
+
- **Open States**: State legislators and bills (7,300+ officials)
|
| 74 |
+
- **Census**: Jurisdictions and boundaries (90,000+)
|
| 75 |
+
- **CityScrapers**: Local government meetings
|
| 76 |
+
- **OpenCivicData**: Standardized government data
|
| 77 |
+
|
| 78 |
+
## 🔗 Links
|
| 79 |
+
|
| 80 |
+
- **Repository**: [github.com/getcommunityone/open-navigator](https://github.com/getcommunityone/open-navigator)
|
| 81 |
+
- **Documentation**: Click "📚 Browse Documentation" on the homepage
|
| 82 |
+
- **API Docs**: `/redoc` endpoint
|
| 83 |
+
- **Website**: [www.communityone.com](https://www.communityone.com)
|
| 84 |
+
|
| 85 |
+
## 📝 License
|
| 86 |
+
|
| 87 |
+
Apache License 2.0 - Free for commercial and non-commercial use
|
| 88 |
+
|
| 89 |
+
## 🤝 Contributing
|
| 90 |
+
|
| 91 |
+
We welcome contributions! See CONTRIBUTING.md in the repository for guidelines.
|
| 92 |
+
|
| 93 |
+
## 💬 Support
|
| 94 |
+
|
| 95 |
+
- **Issues**: [GitHub Issues](https://github.com/getcommunityone/open-navigator/issues)
|
| 96 |
+
- **Discussions**: [GitHub Discussions](https://github.com/getcommunityone/open-navigator/discussions)
|
| 97 |
+
- **Email**: hello@communityone.com
|
| 98 |
+
|
| 99 |
+
---
|
| 100 |
+
|
| 101 |
+
Built with ❤️ for civic engagement and government transparency.
|
.huggingface/nginx.conf
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
worker_processes auto;
|
| 2 |
+
pid /tmp/nginx.pid;
|
| 3 |
+
|
| 4 |
+
events {
|
| 5 |
+
worker_connections 1024;
|
| 6 |
+
}
|
| 7 |
+
|
| 8 |
+
http {
|
| 9 |
+
include /etc/nginx/mime.types;
|
| 10 |
+
default_type application/octet-stream;
|
| 11 |
+
|
| 12 |
+
# Logging
|
| 13 |
+
access_log /app/logs/nginx-access.log;
|
| 14 |
+
error_log /app/logs/nginx-error.log;
|
| 15 |
+
|
| 16 |
+
# Performance
|
| 17 |
+
sendfile on;
|
| 18 |
+
tcp_nopush on;
|
| 19 |
+
tcp_nodelay on;
|
| 20 |
+
keepalive_timeout 65;
|
| 21 |
+
types_hash_max_size 2048;
|
| 22 |
+
client_max_body_size 50M;
|
| 23 |
+
|
| 24 |
+
# Compression
|
| 25 |
+
gzip on;
|
| 26 |
+
gzip_vary on;
|
| 27 |
+
gzip_types text/plain text/css application/json application/javascript text/xml application/xml application/xml+rss text/javascript;
|
| 28 |
+
|
| 29 |
+
upstream fastapi_backend {
|
| 30 |
+
server 127.0.0.1:8000;
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
server {
|
| 34 |
+
listen 7860;
|
| 35 |
+
server_name _;
|
| 36 |
+
|
| 37 |
+
# Force HTTPS - HSTS header tells browsers to ALWAYS use HTTPS
|
| 38 |
+
add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always;
|
| 39 |
+
|
| 40 |
+
# Additional security headers
|
| 41 |
+
add_header X-Content-Type-Options "nosniff" always;
|
| 42 |
+
add_header X-Frame-Options "SAMEORIGIN" always;
|
| 43 |
+
add_header X-XSS-Protection "1; mode=block" always;
|
| 44 |
+
|
| 45 |
+
# Documentation - serve static files built by Docusaurus
|
| 46 |
+
location /docs {
|
| 47 |
+
alias /app/static/docs;
|
| 48 |
+
try_files $uri $uri/ /docs/index.html;
|
| 49 |
+
|
| 50 |
+
# Cache static assets - shorter for easier updates
|
| 51 |
+
location ~* \.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2|ttf|eot)$ {
|
| 52 |
+
expires 1d;
|
| 53 |
+
add_header Cache-Control "public, max-age=86400";
|
| 54 |
+
}
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
# API backend at /api/
|
| 58 |
+
location /api/ {
|
| 59 |
+
proxy_pass http://fastapi_backend/api/;
|
| 60 |
+
proxy_http_version 1.1;
|
| 61 |
+
proxy_set_header Upgrade $http_upgrade;
|
| 62 |
+
proxy_set_header Connection 'upgrade';
|
| 63 |
+
proxy_set_header Host $host;
|
| 64 |
+
proxy_cache_bypass $http_upgrade;
|
| 65 |
+
proxy_set_header X-Real-IP $remote_addr;
|
| 66 |
+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
| 67 |
+
proxy_set_header X-Forwarded-Proto $scheme;
|
| 68 |
+
proxy_read_timeout 300s;
|
| 69 |
+
proxy_connect_timeout 75s;
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
# API docs - route /api/docs to backend /docs
|
| 73 |
+
location = /api/docs {
|
| 74 |
+
proxy_pass http://fastapi_backend/docs;
|
| 75 |
+
proxy_http_version 1.1;
|
| 76 |
+
proxy_set_header Host $host;
|
| 77 |
+
proxy_set_header X-Real-IP $remote_addr;
|
| 78 |
+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
| 79 |
+
proxy_set_header X-Forwarded-Proto $scheme;
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
# OpenAPI schema and ReDoc - serve openapi.json and redoc both with and without the /api/ prefix
|
| 83 |
+
location ~ ^/(api/)?(openapi\.json|redoc) {
|
| 84 |
+
proxy_pass http://fastapi_backend/$2;
|
| 85 |
+
proxy_http_version 1.1;
|
| 86 |
+
proxy_set_header Host $host;
|
| 87 |
+
proxy_set_header X-Real-IP $remote_addr;
|
| 88 |
+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
| 89 |
+
proxy_set_header X-Forwarded-Proto $scheme;
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
# Frontend assets - shorter cache for easier updates
|
| 93 |
+
location /assets/ {
|
| 94 |
+
alias /app/static/frontend/assets/;
|
| 95 |
+
expires 1d;
|
| 96 |
+
add_header Cache-Control "public, max-age=86400";
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
# Main frontend app at root
|
| 100 |
+
location / {
|
| 101 |
+
root /app/static/frontend;
|
| 102 |
+
try_files $uri $uri/ /index.html;
|
| 103 |
+
|
| 104 |
+
# NEVER cache index.html - force browser to check for new version
|
| 105 |
+
location = /index.html {
|
| 106 |
+
add_header Cache-Control "no-cache, no-store, must-revalidate";
|
| 107 |
+
add_header Pragma "no-cache";
|
| 108 |
+
add_header Expires "0";
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
# Cache static assets for one day - deliberately short, even for hashed (immutable) files, so updates propagate quickly
|
| 112 |
+
location ~* \.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2|ttf|eot)$ {
|
| 113 |
+
expires 1d;
|
| 114 |
+
add_header Cache-Control "public, max-age=86400";
|
| 115 |
+
}
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
# Health check endpoint
|
| 119 |
+
location /health {
|
| 120 |
+
access_log off;
|
| 121 |
+
return 200 "OK";
|
| 122 |
+
add_header Content-Type text/plain;
|
| 123 |
+
}
|
| 124 |
+
}
|
| 125 |
+
}
|
.huggingface/start.sh
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
set -e
|
| 3 |
+
|
| 4 |
+
echo "🚀 Starting CommunityOne on Hugging Face Spaces..."
|
| 5 |
+
echo "📊 Three services architecture:"
|
| 6 |
+
echo " 1. Documentation (Docusaurus) - Port 3000"
|
| 7 |
+
echo " 2. Main Application (React + Vite) - Port 5173"
|
| 8 |
+
echo " 3. API Backend (FastAPI) - Port 8000"
|
| 9 |
+
echo " 4. Nginx Reverse Proxy - Port 7860 (HF Spaces public port)"
|
| 10 |
+
echo ""
|
| 11 |
+
|
| 12 |
+
# DEBUG: Check environment variable
|
| 13 |
+
echo "🔍 Environment Check:"
|
| 14 |
+
echo " HF_SPACES = ${HF_SPACES:-NOT SET}"
|
| 15 |
+
if [ "$HF_SPACES" = "1" ]; then
|
| 16 |
+
echo " ✅ HF_SPACES is correctly set to 1"
|
| 17 |
+
else
|
| 18 |
+
echo " ❌ WARNING: HF_SPACES is not set to 1"
|
| 19 |
+
echo " Setting HF_SPACES=1 now..."
|
| 20 |
+
export HF_SPACES=1
|
| 21 |
+
fi
|
| 22 |
+
echo ""
|
| 23 |
+
|
| 24 |
+
# Create required directories
|
| 25 |
+
mkdir -p /app/logs /app/data /var/log/supervisor
|
| 26 |
+
|
| 27 |
+
# Verify static files exist
|
| 28 |
+
echo "📁 Verifying static files..."
|
| 29 |
+
if [ -d "/app/static/docs" ]; then
|
| 30 |
+
echo "✅ Documentation static files found"
|
| 31 |
+
ls -lh /app/static/docs/ | head -5
|
| 32 |
+
else
|
| 33 |
+
echo "❌ ERROR: Documentation static files missing at /app/static/docs"
|
| 34 |
+
exit 1
|
| 35 |
+
fi
|
| 36 |
+
|
| 37 |
+
if [ -d "/app/static/frontend" ]; then
|
| 38 |
+
echo "✅ Frontend static files found"
|
| 39 |
+
ls -lh /app/static/frontend/ | head -5
|
| 40 |
+
else
|
| 41 |
+
echo "❌ ERROR: Frontend static files missing at /app/static/frontend"
|
| 42 |
+
exit 1
|
| 43 |
+
fi
|
| 44 |
+
|
| 45 |
+
# Install serve for static file hosting (if not already installed)
|
| 46 |
+
if ! command -v serve &> /dev/null; then
|
| 47 |
+
echo "📦 Installing serve for static file hosting..."
|
| 48 |
+
npm install -g serve
|
| 49 |
+
fi
|
| 50 |
+
|
| 51 |
+
# Test nginx configuration
|
| 52 |
+
echo "🔧 Testing nginx configuration..."
|
| 53 |
+
nginx -t
|
| 54 |
+
|
| 55 |
+
# Initialize database if needed
|
| 56 |
+
echo "💾 Initializing database..."
|
| 57 |
+
python -c "from api.database import init_db; init_db()" || echo "⚠️ Database init skipped"
|
| 58 |
+
|
| 59 |
+
# Start all services with supervisor
|
| 60 |
+
echo "🎬 Starting all services with supervisor..."
|
| 61 |
+
exec /usr/bin/supervisord -c /etc/supervisor/conf.d/supervisord.conf
|
.huggingface/supervisord.conf
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[supervisord]
|
| 2 |
+
nodaemon=true
|
| 3 |
+
logfile=/dev/stdout
|
| 4 |
+
logfile_maxbytes=0
|
| 5 |
+
pidfile=/tmp/supervisord.pid
|
| 6 |
+
user=root
|
| 7 |
+
|
| 8 |
+
[program:nginx]
|
| 9 |
+
command=/usr/sbin/nginx -g "daemon off;"
|
| 10 |
+
autostart=true
|
| 11 |
+
autorestart=true
|
| 12 |
+
stdout_logfile=/dev/stdout
|
| 13 |
+
stdout_logfile_maxbytes=0
|
| 14 |
+
stderr_logfile=/dev/stderr
|
| 15 |
+
stderr_logfile_maxbytes=0
|
| 16 |
+
priority=10
|
| 17 |
+
|
| 18 |
+
[program:fastapi]
|
| 19 |
+
command=uvicorn api.main:app --host 0.0.0.0 --port 8000 --log-level info --proxy-headers
|
| 20 |
+
directory=/app
|
| 21 |
+
autostart=true
|
| 22 |
+
autorestart=true
|
| 23 |
+
stdout_logfile=/dev/stdout
|
| 24 |
+
stdout_logfile_maxbytes=0
|
| 25 |
+
stderr_logfile=/dev/stderr
|
| 26 |
+
stderr_logfile_maxbytes=0
|
| 27 |
+
environment=PYTHONUNBUFFERED="1",HF_SPACES="1"
|
| 28 |
+
priority=20
|
CITATIONS.md
ADDED
|
@@ -0,0 +1,1474 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Citations and Acknowledgments
|
| 2 |
+
|
| 3 |
+
This project uses several open datasets and research contributions. Please cite the following works when using or referencing this project.
|
| 4 |
+
|
| 5 |
+
## 📚 **Datasets**
|
| 6 |
+
|
| 7 |
+
### **MeetingBank Dataset**
|
| 8 |
+
|
| 9 |
+
We use the MeetingBank benchmark dataset for meeting summarization and analysis.
|
| 10 |
+
|
| 11 |
+
**Citation:**
|
| 12 |
+
```
|
| 13 |
+
Yebowen Hu, Tim Ganter, Hanieh Deilamsalehy, Franck Dernoncourt, Hassan Foroosh, Fei Liu.
|
| 14 |
+
"MeetingBank: A Benchmark Dataset for Meeting Summarization"
|
| 15 |
+
In Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (ACL),
|
| 16 |
+
July 2023, Toronto, Canada.
|
| 17 |
+
```
|
| 18 |
+
|
| 19 |
+
**BibTeX:**
|
| 20 |
+
```bibtex
|
| 21 |
+
@inproceedings{hu-etal-2023-meetingbank,
|
| 22 |
+
title = "MeetingBank: A Benchmark Dataset for Meeting Summarization",
|
| 23 |
+
author = "Yebowen Hu and Tim Ganter and Hanieh Deilamsalehy and Franck Dernoncourt and Hassan Foroosh and Fei Liu",
|
| 24 |
+
booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (ACL)",
|
| 25 |
+
month = jul,
|
| 26 |
+
year = "2023",
|
| 27 |
+
address = "Toronto, Canada",
|
| 28 |
+
publisher = "Association for Computational Linguistics",
|
| 29 |
+
}
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
**Resources:**
|
| 33 |
+
- Paper: https://arxiv.org/abs/2305.17529
|
| 34 |
+
- Dataset: https://huggingface.co/datasets/huuuyeah/meetingbank
|
| 35 |
+
- Zenodo: https://zenodo.org/record/7989108
|
| 36 |
+
|
| 37 |
+
**What we use:**
|
| 38 |
+
- 1,366 city council meetings from 6 U.S. cities
|
| 39 |
+
- Meeting transcripts and summaries
|
| 40 |
+
- Used for: Meeting discovery, transcript analysis, summarization benchmarking
|
| 41 |
+
|
| 42 |
+
---
|
| 43 |
+
|
| 44 |
+
## 🗂️ **Other Data Sources**
|
| 45 |
+
|
| 46 |
+
### **U.S. Census Bureau**
|
| 47 |
+
- Geographic boundaries and demographic data
|
| 48 |
+
- Source: https://www.census.gov/
|
| 49 |
+
- License: Public Domain (U.S. Government)
|
| 50 |
+
|
| 51 |
+
### **Open States / Plural Policy** ⭐
|
| 52 |
+
- Comprehensive state and local legislative information
|
| 53 |
+
- Organization: Plural Policy (formerly Open States Foundation)
|
| 54 |
+
- Source: https://openstates.org/
|
| 55 |
+
- API: https://openstates.org/api/
|
| 56 |
+
- Data Downloads: https://open.pluralpolicy.com/data/
|
| 57 |
+
- License: Various (check per state)
|
| 58 |
+
- API Key: Required for access (free tier: 50,000 requests/month)
|
| 59 |
+
|
| 60 |
+
**Coverage:**
|
| 61 |
+
- All 50 states + DC + Puerto Rico
|
| 62 |
+
- 7,300+ state legislators
|
| 63 |
+
- Millions of bills, votes, and legislative sessions
|
| 64 |
+
- Monthly PostgreSQL database dumps (9.8GB+)
|
| 65 |
+
|
| 66 |
+
**What we use:**
|
| 67 |
+
- Bulk legislative session downloads (CSV/JSON/PostgreSQL)
|
| 68 |
+
- State legislator data with committee assignments
|
| 69 |
+
- Bill tracking and voting records
|
| 70 |
+
- Legislative video sources (YouTube channels, Granicus portals)
|
| 71 |
+
|
| 72 |
+
**Resources:**
|
| 73 |
+
- Open Data: https://open.pluralpolicy.com/data/
|
| 74 |
+
- Scrapers Repository: https://github.com/openstates/openstates-scrapers
|
| 75 |
+
- Local Database Setup: https://docs.openstates.org/contributing/local-database/
|
| 76 |
+
- Code of Conduct: https://docs.openstates.org/code-of-conduct/
|
| 77 |
+
- Schema Documentation: https://github.com/openstates/people/blob/master/schema.md
|
| 78 |
+
|
| 79 |
+
**BibTeX:**
|
| 80 |
+
```bibtex
|
| 81 |
+
@software{openstates,
|
| 82 |
+
title = {Open States},
|
| 83 |
+
author = {{Plural Policy}},
|
| 84 |
+
year = {2024},
|
| 85 |
+
url = {https://openstates.org/},
|
| 86 |
+
note = {Comprehensive state legislative data for all 50 U.S. states}
|
| 87 |
+
}
|
| 88 |
+
```
|
| 89 |
+
|
| 90 |
+
**Potential Contributions:**
|
| 91 |
+
- Our scraper patterns could be contributed to openstates-scrapers
|
| 92 |
+
- Video source discovery could enhance their data
|
| 93 |
+
- We follow their Code of Conduct for all contributions
|
| 94 |
+
|
| 95 |
+
### **LegiScan** ⭐
|
| 96 |
+
- Comprehensive legislative tracking and bill text database
|
| 97 |
+
- Organization: LegiScan LLC
|
| 98 |
+
- Source: https://legiscan.com/
|
| 99 |
+
- API: https://legiscan.com/legiscan
|
| 100 |
+
- License: API access requires subscription (free tier available with limitations)
|
| 101 |
+
- Coverage: All 50 states + U.S. Congress + Washington D.C.
|
| 102 |
+
- API Key: Required for access (free tier: limited requests)
|
| 103 |
+
|
| 104 |
+
**Coverage:**
|
| 105 |
+
- Real-time legislative tracking for all U.S. states and Congress
|
| 106 |
+
- Full bill text, amendments, and legislative documents
|
| 107 |
+
- Roll call votes and voting records
|
| 108 |
+
- Committee assignments and hearings
|
| 109 |
+
- Bill status tracking and history
|
| 110 |
+
- Sponsor and co-sponsor information
|
| 111 |
+
- Bill text in PDF, HTML, and plain text formats
|
| 112 |
+
|
| 113 |
+
**What we use:**
|
| 114 |
+
- Bill text downloads and full-text search
|
| 115 |
+
- Legislative document archives
|
| 116 |
+
- Bill status and tracking data
|
| 117 |
+
- Voting records and roll calls
|
| 118 |
+
- Supplement to Open States data for missing jurisdictions
|
| 119 |
+
- Historical legislative data back to 2011
|
| 120 |
+
|
| 121 |
+
**API Features:**
|
| 122 |
+
- GetBillText: Retrieve full bill text in multiple formats
|
| 123 |
+
- GetBill: Detailed bill metadata and status
|
| 124 |
+
- GetRollCall: Voting records with legislator positions
|
| 125 |
+
- GetSponsor: Sponsor and co-sponsor information
|
| 126 |
+
- Search: Full-text search across all bills
|
| 127 |
+
- GetDatasetList: Bulk dataset downloads
|
| 128 |
+
|
| 129 |
+
**Resources:**
|
| 130 |
+
- API Documentation: https://legiscan.com/legiscan
|
| 131 |
+
- Dataset Downloads: https://legiscan.com/datasets
|
| 132 |
+
- Search Interface: https://legiscan.com/gaits/search
|
| 133 |
+
- State Coverage: https://legiscan.com/legiscan/states
|
| 134 |
+
|
| 135 |
+
**BibTeX:**
|
| 136 |
+
```bibtex
|
| 137 |
+
@software{legiscan,
|
| 138 |
+
title = {LegiScan},
|
| 139 |
+
author = {{LegiScan LLC}},
|
| 140 |
+
year = {2024},
|
| 141 |
+
url = {https://legiscan.com/},
|
| 142 |
+
note = {Comprehensive legislative tracking and bill text database covering all 50 U.S. states and Congress}
|
| 143 |
+
}
|
| 144 |
+
```
|
| 145 |
+
|
| 146 |
+
**Complementary to Open States:**
|
| 147 |
+
- LegiScan provides bill text in multiple formats (PDF, HTML, plain text)
|
| 148 |
+
- Historical data back to 2011 for all states
|
| 149 |
+
- Real-time updates and notifications
|
| 150 |
+
- More comprehensive document archives
|
| 151 |
+
- Paid API provides higher rate limits and bulk downloads
|
| 152 |
+
- Use LegiScan for bill text analysis, Open States for structured legislative data
|
| 153 |
+
|
| 154 |
+
### **Harvard Dataverse**
|
| 155 |
+
- Meeting datasets and civic engagement research
|
| 156 |
+
- Source: https://dataverse.harvard.edu/
|
| 157 |
+
- License: Varies by dataset
|
| 158 |
+
|
| 159 |
+
### **City Scrapers** ⭐
|
| 160 |
+
- Open source civic tech project for scraping local government meetings
|
| 161 |
+
- Organization: Documenters.org / City Bureau
|
| 162 |
+
- Source: https://cityscrapers.org/
|
| 163 |
+
- GitHub: https://github.com/city-scrapers
|
| 164 |
+
- License: MIT License (open source)
|
| 165 |
+
- Coverage: Chicago, Pittsburgh, Detroit, Cleveland, Los Angeles (250+ government agencies)
|
| 166 |
+
- What we use: Validated meeting URLs, Legistar/Granicus platform endpoints, spider code for scraper patterns
|
| 167 |
+
- Used for: Meeting discovery, URL extraction, platform detection, scraper validation
|
| 168 |
+
|
| 169 |
+
**City Scrapers Repositories:**
|
| 170 |
+
- Chicago: https://github.com/city-scrapers/city-scrapers (~100 agencies)
|
| 171 |
+
- Pittsburgh: https://github.com/city-scrapers/city-scrapers-pitt (~30 agencies)
|
| 172 |
+
- Detroit: https://github.com/city-scrapers/city-scrapers-detroit (~40 agencies)
|
| 173 |
+
- Cleveland: https://github.com/city-scrapers/city-scrapers-cle (~30 agencies)
|
| 174 |
+
- Los Angeles: https://github.com/city-scrapers/city-scrapers-la (~50 agencies)
|
| 175 |
+
|
| 176 |
+
**BibTeX:**
|
| 177 |
+
```bibtex
|
| 178 |
+
@software{city_scrapers,
|
| 179 |
+
title = {City Scrapers},
|
| 180 |
+
author = {{Documenters.org}},
|
| 181 |
+
year = {2024},
|
| 182 |
+
url = {https://cityscrapers.org/},
|
| 183 |
+
note = {Open source civic tech project providing validated scrapers for local government meetings across major U.S. cities}
|
| 184 |
+
}
|
| 185 |
+
```
|
| 186 |
+
|
| 187 |
+
### **Google Civic Information API** ⭐
|
| 188 |
+
- Government officials, polling locations, and election data
|
| 189 |
+
- Organization: Google LLC
|
| 190 |
+
- API Documentation: https://developers.google.com/civic-information
|
| 191 |
+
- License: Free (with quota limits)
|
| 192 |
+
- Rate Limit: 25,000 requests/day (free tier)
|
| 193 |
+
- Coverage: U.S. federal, state, and local government officials; polling locations; election data
|
| 194 |
+
- What we use: Elected officials by address, representative contact info, voting districts
|
| 195 |
+
- Used for: Contact discovery, official verification, civic engagement tools
|
| 196 |
+
|
| 197 |
+
**API Endpoints Used:**
|
| 198 |
+
- Representatives by Address: Get all elected officials for a given address
|
| 199 |
+
- Elections: Voter information, polling locations, ballot information
|
| 200 |
+
- Divisions: Geographic/political divisions (OCD-IDs)
|
| 201 |
+
|
| 202 |
+
**BibTeX:**
|
| 203 |
+
```bibtex
|
| 204 |
+
@misc{google_civic_api,
|
| 205 |
+
title = {Google Civic Information API},
|
| 206 |
+
author = {{Google LLC}},
|
| 207 |
+
year = {2024},
|
| 208 |
+
url = {https://developers.google.com/civic-information},
|
| 209 |
+
note = {API providing government official contact information, election data, and polling locations}
|
| 210 |
+
}
|
| 211 |
+
```
|
| 212 |
+
|
| 213 |
+
**Terms of Service:**
|
| 214 |
+
- Attribution required when displaying official data
|
| 215 |
+
- Caching limited to 30 days
|
| 216 |
+
- Must comply with Google API Terms of Service
|
| 217 |
+
|
| 218 |
+
### **YouTube Data API v3** ⭐
|
| 219 |
+
- Video metadata, channel information, and search for government meetings
|
| 220 |
+
- Organization: Google LLC
|
| 221 |
+
- API Documentation: https://developers.google.com/youtube/v3
|
| 222 |
+
- License: Free (with quota limits)
|
| 223 |
+
- Rate Limit: 10,000 units/day (free tier), search costs 100 units per request
|
| 224 |
+
- Coverage: Global video platform with millions of government channels
|
| 225 |
+
- What we use: Government channel discovery, meeting video metadata, transcript availability
|
| 226 |
+
- Used for: Video discovery, channel statistics, meeting video archival
|
| 227 |
+
|
| 228 |
+
**API Features Used:**
|
| 229 |
+
- Search: Find government channels by jurisdiction name
|
| 230 |
+
- Channels: Get channel metadata, subscriber counts, video counts
|
| 231 |
+
- Videos: Metadata including title, description, upload date, duration
|
| 232 |
+
- Captions: Check for closed caption/transcript availability
|
| 233 |
+
|
| 234 |
+
**BibTeX:**
|
| 235 |
+
```bibtex
|
| 236 |
+
@misc{youtube_data_api,
|
| 237 |
+
title = {YouTube Data API v3},
|
| 238 |
+
author = {{Google LLC}},
|
| 239 |
+
year = {2024},
|
| 240 |
+
url = {https://developers.google.com/youtube/v3},
|
| 241 |
+
note = {API for accessing YouTube video metadata, channel information, and search functionality}
|
| 242 |
+
}
|
| 243 |
+
```
|
| 244 |
+
|
| 245 |
+
**Terms of Service:**
|
| 246 |
+
- YouTube API Services Terms: https://developers.google.com/youtube/terms/api-services-terms-of-service
|
| 247 |
+
- Attribution required with YouTube logo
|
| 248 |
+
- Quota limits enforced (10,000 units/day free)
|
| 249 |
+
- Video embeds must use official YouTube player
|
| 250 |
+
|
| 251 |
+
### **Ballotpedia** ⭐
|
| 252 |
+
- Ballot measures, referendums, and propositions
|
| 253 |
+
- Organization: Lucy Burns Institute
|
| 254 |
+
- Source: https://ballotpedia.org/
|
| 255 |
+
- API: https://ballotpedia.org/API-documentation
|
| 256 |
+
- License: API access is limited at scale (paid tier available)
|
| 257 |
+
- Coverage: All 50 states, historical measures back to 1990s
|
| 258 |
+
- Used for: Tracking fluoridation votes, school bond measures, health policy propositions
|
| 259 |
+
|
| 260 |
+
### **MIT Election Data + Science Lab**
|
| 261 |
+
- Presidential, Congressional, and gubernatorial election results
|
| 262 |
+
- Organization: Massachusetts Institute of Technology
|
| 263 |
+
- Source: https://electionlab.mit.edu/data
|
| 264 |
+
- Repository: https://github.com/MEDSL/official-returns
|
| 265 |
+
- License: Free for research and commercial use
|
| 266 |
+
- Coverage: 1976-present, county-level results
|
| 267 |
+
- Used for: Political composition analysis, jurisdiction context
|
| 268 |
+
|
| 269 |
+
### **OpenElections**
|
| 270 |
+
- State-by-state certified election results in standardized CSV format
|
| 271 |
+
- Source: https://openelections.net/
|
| 272 |
+
- GitHub: https://github.com/openelections
|
| 273 |
+
- License: Open source (various by state)
|
| 274 |
+
- Coverage: All 50 states (various completion levels), precinct-level data
|
| 275 |
+
- Used for: Detailed election results, local race outcomes, advocacy targeting
|
| 276 |
+
|
| 277 |
+
### **Open Civic Data (OCD) Standards**
|
| 278 |
+
- Division identifiers and civic data standards
|
| 279 |
+
- Specification: https://open-civic-data.readthedocs.io/en/latest/proposals/0002.html
|
| 280 |
+
- Repository: https://github.com/opencivicdata/ocd-division-ids
|
| 281 |
+
- License: Open source
|
| 282 |
+
- Used for: Standardized jurisdiction identifiers, cross-platform compatibility
|
| 283 |
+
|
| 284 |
+
### **Popolo Project**
|
| 285 |
+
- International open government data specification for people, organizations, and elected positions
|
| 286 |
+
- Specification: https://www.popoloproject.com/
|
| 287 |
+
- GitHub: https://github.com/popolo-project/popolo-spec
|
| 288 |
+
- Documentation: https://www.popoloproject.com/specs/
|
| 289 |
+
- License: Creative Commons Attribution 4.0 International
|
| 290 |
+
|
| 291 |
+
### **BillMap** ⭐
|
| 292 |
+
- Tracks bill text similarity across all 50 U.S. states to identify copy-paste legislation and model bill influence
|
| 293 |
+
- Organization: Sunlight Foundation / @unitedstates community
|
| 294 |
+
- Repository: https://github.com/unitedstates/BillMap
|
| 295 |
+
- Research: Anderson et al., "Detecting Policy Influence in Legislatures" (2019)
|
| 296 |
+
- Paper: https://arxiv.org/abs/1906.03699
|
| 297 |
+
- Live Demo: https://billmap.cs.princeton.edu/
|
| 298 |
+
- License: Open source
|
| 299 |
+
- Coverage: All 50 states, tracks legislative text diffusion across jurisdictions
|
| 300 |
+
- Used for: Identifying model legislation, tracking policy influence, finding similar bills across states
|
| 301 |
+
- Method: Text similarity analysis, n-gram matching, bill text alignment
|
| 302 |
+
|
| 303 |
+
**What we use:**
|
| 304 |
+
- Bill similarity detection algorithms
|
| 305 |
+
- Model legislation tracking methodology
|
| 306 |
+
- Cross-state policy diffusion analysis
|
| 307 |
+
- Legislative text comparison techniques
|
| 308 |
+
|
| 309 |
+
**BibTeX:**
|
| 310 |
+
```bibtex
|
| 311 |
+
@article{anderson2019billmap,
|
| 312 |
+
title = {Detecting Policy Influence in Legislatures},
|
| 313 |
+
author = {Anderson, Evan and Fowler, Anthony and Grossmann, Matt and Sahn, Alexander and Shiraito, Yuki},
|
| 314 |
+
journal = {arXiv preprint arXiv:1906.03699},
|
| 315 |
+
year = {2019},
|
| 316 |
+
url = {https://arxiv.org/abs/1906.03699}
|
| 317 |
+
}
|
| 318 |
+
```
|
| 319 |
+
|
| 320 |
+
### **@unitedstates Images Repository** ⭐
|
| 321 |
+
- High-resolution photos of all U.S. Congress members (past and present)
|
| 322 |
+
- Organization: @unitedstates community (Sunlight Foundation legacy project)
|
| 323 |
+
- Repository: https://github.com/unitedstates/images
|
| 324 |
+
- CDN: https://theunitedstates.io/images/congress/
|
| 325 |
+
- License: Public domain (government photos)
|
| 326 |
+
- Coverage: All U.S. Senators and Representatives (1789-present), updated regularly
|
| 327 |
+
- Image Format: JPEG, multiple resolutions (original, 450x550, 225x275)
|
| 328 |
+
- Used for: Legislator profile photos, visual identification, representative directories
|
| 329 |
+
|
| 330 |
+
**Image URL Format:**
|
| 331 |
+
```
|
| 332 |
+
https://theunitedstates.io/images/congress/original/[bioguide_id].jpg
|
| 333 |
+
https://theunitedstates.io/images/congress/450x550/[bioguide_id].jpg
|
| 334 |
+
https://theunitedstates.io/images/congress/225x275/[bioguide_id].jpg
|
| 335 |
+
```
|
| 336 |
+
|
| 337 |
+
**Example:**
|
| 338 |
+
```
|
| 339 |
+
https://theunitedstates.io/images/congress/original/P000197.jpg
|
| 340 |
+
(Nancy Pelosi, bioguide_id: P000197)
|
| 341 |
+
```
|
| 342 |
+
|
| 343 |
+
**What we use:**
|
| 344 |
+
- Legislator profile photos for federal representatives
|
| 345 |
+
- Visual identification in advocacy tools
|
| 346 |
+
- Representative directories and contact pages
|
| 347 |
+
- Cross-referenced with Open States data using bioguide IDs
|
| 348 |
+
|
| 349 |
+
**Related Projects:**
|
| 350 |
+
- **congress-legislators**: https://github.com/unitedstates/congress-legislators (YAML data files)
|
| 351 |
+
- **congress**: https://github.com/unitedstates/congress (scraping tools)
|
| 352 |
+
- **districts**: https://github.com/unitedstates/districts (GeoJSON boundaries)
|
| 353 |
+
|
| 354 |
+
---
|
| 355 |
+
|
| 356 |
+
## 💰 **Nonprofit Financial Data**
|
| 357 |
+
|
| 358 |
+
### **GivingTuesday 990 Data Infrastructure** ⭐
|
| 359 |
+
|
| 360 |
+
We use the GivingTuesday 990 Data Lake for detailed nonprofit financial data from IRS Form 990 XML filings.
|
| 361 |
+
|
| 362 |
+
**Organization:** GivingTuesday
|
| 363 |
+
**Website:** https://990data.givingtuesday.org/
|
| 364 |
+
**Data Lake:** `s3://gt990datalake-rawdata` (AWS S3, region `us-east-1` / N. Virginia, public access)
|
| 365 |
+
**Console:** https://us-east-1.console.aws.amazon.com/s3/buckets/gt990datalake-rawdata
|
| 366 |
+
**License:** Public domain (IRS data) + Open source tools
|
| 367 |
+
**Access:** Free, no AWS credentials required (anonymous access via `--no-sign-request`)
|
| 368 |
+
|
| 369 |
+
**What we use:**
|
| 370 |
+
- **Raw 990 XMLs**: Individual e-filed Form 990 returns in XML format (1-2 MB each)
|
| 371 |
+
- **Indices**: CSV/Parquet files listing all available 990s with metadata
|
| 372 |
+
- **Coverage**: 5.4M+ e-filed Form 990s (2011-present, ~300K new filings/year)
|
| 373 |
+
- **Scale**: ~10 TB of raw XML data
|
| 374 |
+
- **Data extracted**: Revenue, expenses, assets, liabilities, grants, programs, officer compensation, mission statements, website URLs
|
| 375 |
+
|
| 376 |
+
**Data Lake Structure:**
|
| 377 |
+
```
|
| 378 |
+
s3://gt990datalake-rawdata/
|
| 379 |
+
├── EfileData/
|
| 380 |
+
│ ├── XmlFiles/ # Individual 990 XMLs (~5.4M files, ~10 TB)
|
| 381 |
+
│ │ └── [OBJECT_ID]_public.xml (e.g., 202233259349300703_public.xml)
|
| 382 |
+
│ └── XmlZips/ # ZIP archives (97 files, ~38 GB → ~95 GB uncompressed)
|
| 383 |
+
│ └── YYYY_TEOS_XML_*.zip (e.g., 2023_TEOS_XML_01A.zip ~400 MB)
|
| 384 |
+
└── Indices/
|
| 385 |
+
└── 990xmls/ # CSV indices with metadata
|
| 386 |
+
└── index_all_years_efiledata_xmls_created_on_2023-10-29.csv (~925 MB)
|
| 387 |
+
```
|
| 388 |
+
|
| 389 |
+
**Download Strategies:**
|
| 390 |
+
|
| 391 |
+
| Approach | Best For | Time | Bandwidth | Storage |
|
| 392 |
+
|----------|----------|------|-----------|---------|
|
| 393 |
+
| **Individual XMLs** | Single state or targeted download | ~2 hrs (22K orgs) | 32 GB | 32 GB |
|
| 394 |
+
| **ZIP Archives** | All states / nationwide | ~6 hrs total | 38 GB | 95 GB |
|
| 395 |
+
|
| 396 |
+
**Choose Individual XMLs when:**
|
| 397 |
+
- You need data for 1-5 states only
|
| 398 |
+
- You want to download only specific EINs
|
| 399 |
+
- Storage space is limited
|
| 400 |
+
- You want incremental caching (download as needed)
|
| 401 |
+
|
| 402 |
+
**Choose ZIP Archives when:**
|
| 403 |
+
- You need all 50 states
|
| 404 |
+
- You're building a comprehensive nonprofit database
|
| 405 |
+
- You have 100+ GB storage
|
| 406 |
+
- You want offline access to all filings
|
| 407 |
+
|
| 408 |
+
**S3 Access Examples:**
|
| 409 |
+
|
| 410 |
+
**Individual XMLs (for single state or targeted download):**
|
| 411 |
+
```bash
|
| 412 |
+
# List index files (no credentials needed)
|
| 413 |
+
aws s3 ls s3://gt990datalake-rawdata/Indices/990xmls/ --no-sign-request
|
| 414 |
+
|
| 415 |
+
# Download index (~925 MB)
|
| 416 |
+
aws s3 cp s3://gt990datalake-rawdata/Indices/990xmls/index_all_years_efiledata_xmls_created_on_2023-10-29.csv . --no-sign-request
|
| 417 |
+
|
| 418 |
+
# Download specific XML
|
| 419 |
+
aws s3 cp s3://gt990datalake-rawdata/EfileData/XmlFiles/202233259349300703_public.xml . --no-sign-request
|
| 420 |
+
|
| 421 |
+
# Batch download for single state (using our script)
|
| 422 |
+
python scripts/batch_download_990s.py --state MA --health-only --concurrent 1000
|
| 423 |
+
```
|
| 424 |
+
|
| 425 |
+
**ZIP Archives (for all states / nationwide):**
|
| 426 |
+
```bash
|
| 427 |
+
# Download all 97 ZIPs (~38 GB) to local directory
|
| 428 |
+
./scripts/download_990_zips.sh
|
| 429 |
+
|
| 430 |
+
# Extract all ZIPs to get ~384K XMLs (~95 GB)
|
| 431 |
+
./scripts/extract_990_zips.sh
|
| 432 |
+
|
| 433 |
+
# Build local index for fast lookup
|
| 434 |
+
python scripts/build_990_local_index.py
|
| 435 |
+
|
| 436 |
+
# Now enrich from local files (no network needed!)
|
| 437 |
+
python scripts/enrich_all_states_990.py
|
| 438 |
+
```
|
| 439 |
+
|
| 440 |
+
**Index Schema:**
|
| 441 |
+
The CSV index contains columns: `EIN`, `TaxPeriod`, `ObjectId`, `URL`, `FormType`, `OrganizationName`, `DLN`, `SubmittedOn`
|
| 442 |
+
|
| 443 |
+
**Python Access:**
|
| 444 |
+
```python
|
| 445 |
+
import boto3
|
| 446 |
+
from botocore import UNSIGNED
|
| 447 |
+
from botocore.config import Config
|
| 448 |
+
|
| 449 |
+
# Configure anonymous S3 client
|
| 450 |
+
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
|
| 451 |
+
|
| 452 |
+
# Download XML
|
| 453 |
+
xml_obj = s3.get_object(
|
| 454 |
+
Bucket='gt990datalake-rawdata',
|
| 455 |
+
Key='EfileData/XmlFiles/202233259349300703_public.xml'
|
| 456 |
+
)
|
| 457 |
+
xml_content = xml_obj['Body'].read()
|
| 458 |
+
```
|
| 459 |
+
|
| 460 |
+
**BibTeX:**
|
| 461 |
+
```bibtex
|
| 462 |
+
@misc{givingtuesday990data,
|
| 463 |
+
title = {GivingTuesday 990 Data Infrastructure},
|
| 464 |
+
author = {{GivingTuesday}},
|
| 465 |
+
year = {2023},
|
| 466 |
+
url = {https://990data.givingtuesday.org/},
|
| 467 |
+
note = {Collaborative data lake of standardized IRS Form 990 XML filings}
|
| 468 |
+
}
|
| 469 |
+
```
|
| 470 |
+
|
| 471 |
+
**Attribution:** When publishing analyses using this data, please cite both:
|
| 472 |
+
1. GivingTuesday 990 Data Infrastructure: https://990data.givingtuesday.org/
|
| 473 |
+
2. Our enrichment tools: https://github.com/getcommunityone/open-navigator-for-engagement
|
| 474 |
+
|
| 475 |
+
---
|
| 476 |
+
|
| 477 |
+
### **Google Cloud Public Datasets: IRS 990** ⭐
|
| 478 |
+
|
| 479 |
+
Google hosts the complete IRS Form 990 dataset in BigQuery for fast SQL-based querying.
|
| 480 |
+
|
| 481 |
+
**Platform:** Google Cloud BigQuery
|
| 482 |
+
**Dataset:** `bigquery-public-data.irs_990`
|
| 483 |
+
**Table:** `bigquery-public-data.irs_990.irs_990_xml`
|
| 484 |
+
**Documentation:** https://console.cloud.google.com/marketplace/product/internal-revenue-service/irs-990
|
| 485 |
+
**Cost:** First 1 TB of queries per month is **FREE**
|
| 486 |
+
**Coverage:** All e-filed Form 990s (2011-present, 5M+ records)
|
| 487 |
+
|
| 488 |
+
**What we use:**
|
| 489 |
+
- **Mission statements**: Extracted from `return_header` or `part_i_mission_desc` fields
|
| 490 |
+
- **Website URLs**: Found in `website_address_txt` field
|
| 491 |
+
- **Financial data**: All Form 990 fields accessible via SQL
|
| 492 |
+
- **Fast bulk queries**: Extract data for 1M+ orgs in seconds (vs hours downloading XMLs)
|
| 493 |
+
|
| 494 |
+
**Advantages:**
|
| 495 |
+
- ✅ No local XML downloads needed
|
| 496 |
+
- ✅ Single SQL query to bulk-extract fields
|
| 497 |
+
- ✅ Serverless (no infrastructure to manage)
|
| 498 |
+
- ✅ Fast (queries complete in seconds)
|
| 499 |
+
- ✅ Free tier covers most research use cases
|
| 500 |
+
|
| 501 |
+
**Example Query:**
|
| 502 |
+
```sql
|
| 503 |
+
SELECT
|
| 504 |
+
ein,
|
| 505 |
+
org_name,
|
| 506 |
+
website_address_txt,
|
| 507 |
+
part_i_mission_desc,
|
| 508 |
+
total_revenue_current_year,
|
| 509 |
+
total_expenses_current_year
|
| 510 |
+
FROM `bigquery-public-data.irs_990.irs_990_2023`
|
| 511 |
+
WHERE state = 'AL'
|
| 512 |
+
AND ntee_code LIKE 'E%'
|
| 513 |
+
LIMIT 1000;
|
| 514 |
+
```
|
| 515 |
+
|
| 516 |
+
**BibTeX:**
|
| 517 |
+
```bibtex
|
| 518 |
+
@misc{googlecloud_irs990,
|
| 519 |
+
title = {IRS 990 Public Dataset},
|
| 520 |
+
author = {{Google Cloud Public Datasets}},
|
| 521 |
+
year = {2024},
|
| 522 |
+
publisher = {Google Cloud Platform},
|
| 523 |
+
url = {https://console.cloud.google.com/marketplace/product/internal-revenue-service/irs-990},
|
| 524 |
+
note = {BigQuery public dataset of IRS Form 990 e-file data}
|
| 525 |
+
}
|
| 526 |
+
```
|
| 527 |
+
|
| 528 |
+
**Attribution:** When using BigQuery 990 data, cite:
|
| 529 |
+
1. IRS 990 Public Dataset (Google Cloud)
|
| 530 |
+
2. Internal Revenue Service (original data source)
|
| 531 |
+
|
| 532 |
+
---
|
| 533 |
+
|
| 534 |
+
### **National Center for Charitable Statistics (NCCS) Unified BMF** ⭐
|
| 535 |
+
|
| 536 |
+
The NCCS Unified BMF is a longitudinal nonprofit dataset specifically designed for AI and "Lakehouse" projects with pre-geocoded locations and Census integration.
|
| 537 |
+
|
| 538 |
+
**Organization:** National Center for Charitable Statistics (NCCS), Urban Institute
|
| 539 |
+
**Website:** https://nccs.urban.org/
|
| 540 |
+
**Dataset:** Unified BMF (Business Master File)
|
| 541 |
+
**Documentation:** https://nccs.urban.org/project/irs-exempt-organizations-business-master-file
|
| 542 |
+
**License:** Public domain (IRS data) + Urban Institute terms
|
| 543 |
+
**Released:** Late 2025/Early 2026
|
| 544 |
+
**Coverage:** 1989 through mid-2025 (update pending)
|
| 545 |
+
|
| 546 |
+
**What we use:**
|
| 547 |
+
- **Longitudinal tracking**: Single file with one row per organization that has ever held tax-exempt status
|
| 548 |
+
- **Pre-geocoded addresses**: Most recent address geocoded to Census block level
|
| 549 |
+
- **Geographic codes**: FIPS codes at block, tract, county, and state levels
|
| 550 |
+
- **Metropolitan area codes**: Current CBSA (Core Based Statistical Area) definitions
|
| 551 |
+
- **Temporal tracking**: `ORG_YEAR_FIRST` and `ORG_YEAR_LAST` variables for organization lifecycle
|
| 552 |
+
- **Census integration**: Ready for merging with Census demographic and economic data
|
| 553 |
+
|
| 554 |
+
**Key Features:**
|
| 555 |
+
- ✅ **Eliminates annual file merging**: Consolidates all historical BMF releases into single file
|
| 556 |
+
- ✅ **AI/Lakehouse optimized**: Designed for modern data infrastructure
|
| 557 |
+
- ✅ **Census-ready**: FIPS codes enable direct joins with Census data
|
| 558 |
+
- ✅ **Metropolitan vs Rural**: CBSA codes identify urban/rural areas
|
| 559 |
+
- ✅ **Historical analysis**: Track organizations over time without complex ETL
|
| 560 |
+
- ✅ **Geographic analysis**: Pre-geocoded to Census block granularity
|
| 561 |
+
|
| 562 |
+
**Use Cases:**
|
| 563 |
+
- Longitudinal analysis of nonprofit sector
|
| 564 |
+
- Building historical sampling frames
|
| 565 |
+
- Linking nonprofit data to Census demographics
|
| 566 |
+
- Metropolitan vs rural nonprofit analysis
|
| 567 |
+
- Policy research requiring geographic precision
|
| 568 |
+
- Time-series analysis of organizational entry/exit
|
| 569 |
+
|
| 570 |
+
**Geographic Levels Available:**
|
| 571 |
+
- Census Block (finest granularity)
|
| 572 |
+
- Census Tract
|
| 573 |
+
- County (FIPS codes)
|
| 574 |
+
- State (FIPS codes)
|
| 575 |
+
- CBSA (Core Based Statistical Area)
|
| 576 |
+
|
| 577 |
+
**Related Resources:**
|
| 578 |
+
- NCCS Census Crosswalk: For aggregating to additional geographic levels
|
| 579 |
+
- BMF Overview: https://nccs.urban.org/project/irs-exempt-organizations-business-master-file
|
| 580 |
+
- NCCS Data Archive: https://nccs.urban.org/nccs-data-archive
|
| 581 |
+
|
| 582 |
+
**BibTeX:**
|
| 583 |
+
```bibtex
|
| 584 |
+
@dataset{nccs_unified_bmf,
|
| 585 |
+
title = {Unified Business Master File (BMF)},
|
| 586 |
+
author = {{National Center for Charitable Statistics}},
|
| 587 |
+
year = {2026},
|
| 588 |
+
publisher = {Urban Institute},
|
| 589 |
+
url = {https://nccs.urban.org/},
|
| 590 |
+
note = {Longitudinal nonprofit dataset with pre-geocoded Census integration, 1989-2025}
|
| 591 |
+
}
|
| 592 |
+
```
|
| 593 |
+
|
| 594 |
+
**Attribution:** When using NCCS Unified BMF data, cite:
|
| 595 |
+
1. National Center for Charitable Statistics, Urban Institute
|
| 596 |
+
2. IRS Business Master File (original data source)
|
| 597 |
+
3. Specify the data vintage/update date used
|
| 598 |
+
|
| 599 |
+
---
|
| 600 |
+
|
| 601 |
+
### **Charity Navigator** ⭐
|
| 602 |
+
|
| 603 |
+
**Powered by Charity Navigator**
|
| 604 |
+
|
| 605 |
+
We use the Charity Navigator GraphQL API to enrich nonprofit profiles with star ratings, mission statements, and organizational metrics.
|
| 606 |
+
|
| 607 |
+
**Organization:** Charity Navigator, Inc.
|
| 608 |
+
**Website:** https://www.charitynavigator.org
|
| 609 |
+
**API Documentation:** https://www.charitynavigator.org/partner/api
|
| 610 |
+
**Principal Office:** 299 Market Street, Suite 250, Saddle Brook, NJ 07663
|
| 611 |
+
**License:** API Terms of Use (Last updated March 2025)
|
| 612 |
+
**Rate Limit:** 1,000 API calls per day
|
| 613 |
+
|
| 614 |
+
**What we use:**
|
| 615 |
+
- **Charity Ratings**: Encompass Star Rating (0-4 stars)
|
| 616 |
+
- **Mission Statements**: Organization mission and purpose
|
| 617 |
+
- **Website URLs**: Official organization websites
|
| 618 |
+
- **Organizational Data**: EIN, name, address, category, cause
|
| 619 |
+
- **Active Advisories**: Alerts about organization status
|
| 620 |
+
- **Encompass Score**: Overall rating score
|
| 621 |
+
- **Rating Publication Date**: When the rating was last updated
|
| 622 |
+
|
| 623 |
+
**Data Fields Accessed:**
|
| 624 |
+
```
|
| 625 |
+
- Employer Identification Number (EIN)
|
| 626 |
+
- Charity Name
|
| 627 |
+
- Mission
|
| 628 |
+
- Organization Website URL
|
| 629 |
+
- Charity Navigator URL
|
| 630 |
+
- Category & Cause
|
| 631 |
+
- Street Address, City, State, Zip, Country
|
| 632 |
+
- Active Advisories
|
| 633 |
+
- Encompass Score & Star Rating
|
| 634 |
+
- Encompass Rating Publication Date & ID
|
| 635 |
+
```
|
| 636 |
+
|
| 637 |
+
**Attribution Requirements:**
|
| 638 |
+
- **Text Credit:** "Powered by Charity Navigator" (displayed on pages using their data)
|
| 639 |
+
- **Source Citation:** Charity Navigator cited as source on all pages displaying their data
|
| 640 |
+
- **Linkbacks:** All charity data links back to corresponding Charity Navigator profile pages
|
| 641 |
+
- **Trademark Notice:** CHARITY NAVIGATOR and the CHARITY NAVIGATOR logo are registered trademarks of Charity Navigator. All rights reserved. Used with permission.
|
| 642 |
+
|
| 643 |
+
**BibTeX:**
|
| 644 |
+
```bibtex
|
| 645 |
+
@misc{charitynavigator_api,
|
| 646 |
+
title = {Charity Navigator API},
|
| 647 |
+
author = {{Charity Navigator, Inc.}},
|
| 648 |
+
year = {2025},
|
| 649 |
+
url = {https://www.charitynavigator.org},
|
| 650 |
+
note = {GraphQL API providing nonprofit ratings, mission statements, and organizational data}
|
| 651 |
+
}
|
| 652 |
+
```
|
| 653 |
+
|
| 654 |
+
**Compliance:**
|
| 655 |
+
This project complies with Charity Navigator's API Terms of Use, including:
|
| 656 |
+
- Rate limit compliance (max 1,000 calls/day)
|
| 657 |
+
- Proper attribution and branding
|
| 658 |
+
- Linkbacks to Charity Navigator profile pages
|
| 659 |
+
- Trademark acknowledgment
|
| 660 |
+
- Data caching for performance only (not for redistribution)
|
| 661 |
+
|
| 662 |
+
**Example Profile Link Format:**
|
| 663 |
+
```html
|
| 664 |
+
<a href="https://www.charitynavigator.org/ein/134141945">
|
| 665 |
+
Michael J. Fox Foundation for Parkinson's Research
|
| 666 |
+
</a>
|
| 667 |
+
```
|
| 668 |
+
|
| 669 |
+
**Related Tools:**
|
| 670 |
+
- [Nonprofit enrichment script](scripts/enrich_nonprofits_charitynavigator.py) (if created)
|
| 671 |
+
- [API integration documentation](website/docs/data-sources/charity-navigator.md) (if created)
|
| 672 |
+
|
| 673 |
+
---
|
| 674 |
+
|
| 675 |
+
### **OpenSecrets.org (Center for Responsive Politics)** ⭐
|
| 676 |
+
|
| 677 |
+
**Organization:** OpenSecrets, a nonpartisan research organization tracking money in U.S. politics
|
| 678 |
+
**Website:** https://www.opensecrets.org
|
| 679 |
+
**Bulk Data:** https://www.opensecrets.org/open-data/bulk-data
|
| 680 |
+
**API Documentation:** https://www.opensecrets.org/open-data/api
|
| 681 |
+
**Status:** Bulk data access pending approval
|
| 682 |
+
|
| 683 |
+
**What they offer:**
|
| 684 |
+
- **Campaign Finance Data**: Federal campaign contributions, expenditures, and fundraising
|
| 685 |
+
- **Lobbying Data**: Federal lobbying spending by organizations and industries
|
| 686 |
+
- **Political Action Committees (PACs)**: PAC contributions and expenditures
|
| 687 |
+
- **Personal Finance Disclosures**: Wealth and financial interests of federal lawmakers
|
| 688 |
+
- **501(c) Organizations**: Political spending by nonprofits and dark money groups
|
| 689 |
+
- **Foreign Lobby Influence**: Foreign agents registered under the Foreign Agents Registration Act (FARA)
|
| 690 |
+
|
| 691 |
+
**Data Access:**
|
| 692 |
+
- **Bulk Data Downloads**: Available to nonprofits upon approval (application pending)
|
| 693 |
+
- **Public API**: Available with rate limits for smaller queries
|
| 694 |
+
- **Data Format**: CSV files with detailed transaction-level records
|
| 695 |
+
- **Update Frequency**: Regular updates as new filings are processed
|
| 696 |
+
- **Coverage**: Federal-level political finance data (1990-present)
|
| 697 |
+
|
| 698 |
+
**What we plan to use:**
|
| 699 |
+
- Nonprofit political spending and advocacy activity
|
| 700 |
+
- Lobbying expenditures by healthcare and oral health organizations
|
| 701 |
+
- Campaign contributions from dental associations and health policy groups
|
| 702 |
+
- 501(c)(4) "dark money" spending on ballot measures
|
| 703 |
+
- Cross-reference EINs with IRS nonprofit data for comprehensive profiles
|
| 704 |
+
|
| 705 |
+
**BibTeX:**
|
| 706 |
+
```bibtex
|
| 707 |
+
@misc{opensecrets,
|
| 708 |
+
title = {OpenSecrets.org: Money in Politics Database},
|
| 709 |
+
author = {{Center for Responsive Politics}},
|
| 710 |
+
year = {2024},
|
| 711 |
+
url = {https://www.opensecrets.org},
|
| 712 |
+
note = {Comprehensive database of campaign finance, lobbying, and political spending in U.S. politics}
|
| 713 |
+
}
|
| 714 |
+
```
|
| 715 |
+
|
| 716 |
+
**License & Attribution:**
|
| 717 |
+
- Data collected from Federal Election Commission (FEC) and other public sources
|
| 718 |
+
- Attribution required: "Data from OpenSecrets.org, a project of the Center for Responsive Politics"
|
| 719 |
+
- Nonprofit bulk data access subject to approval and terms of use
|
| 720 |
+
|
| 721 |
+
**Application Status:**
|
| 722 |
+
- ⏳ Bulk data access application pending approval
|
| 723 |
+
- Will enable comprehensive analysis of nonprofit political activity
|
| 724 |
+
- Integration planned upon approval
|
| 725 |
+
|
| 726 |
+
---
|
| 727 |
+
|
| 728 |
+
### **IRS Exempt Organizations Business Master File (EO-BMF)**
|
| 729 |
+
|
| 730 |
+
Basic nonprofit registration data (name, EIN, address, NTEE code).
|
| 731 |
+
|
| 732 |
+
**Dataset details:**
|
| 733 |
+
- Complete database of 1.9M+ U.S. tax-exempt organizations
|
| 734 |
+
- Organization: Internal Revenue Service (IRS)
|
| 735 |
+
- Source: https://www.irs.gov/charities-non-profits/exempt-organizations-business-master-file-extract-eo-bmf
|
| 736 |
+
- Download: https://www.irs.gov/pub/irs-soi/ (4 regional CSV files)
|
| 737 |
+
- Format: CSV (basic organizational data: name, EIN, address, NTEE code, etc.)
|
| 738 |
+
- Update frequency: Monthly
|
| 739 |
+
- License: Public Domain (U.S. Government data)
|
| 740 |
+
- Coverage: All registered tax-exempt organizations under sections 501(c)(3), 501(c)(4), etc.
|
| 741 |
+
- Used for: Nonprofit discovery, organization matching, NTEE categorization
|
| 742 |
+
|
| 743 |
+
**Note:** This is the **Business Master File** (basic info). For detailed financial data, see IRS Form 990 XML below.
|
| 744 |
+
|
| 745 |
+
### **IRS Form 990 XML Filings** ⭐
|
| 746 |
+
- Detailed financial filings from nonprofit tax returns
|
| 747 |
+
- Organization: Internal Revenue Service (IRS)
|
| 748 |
+
- Source: https://www.irs.gov/charities-non-profits/form-990-series-downloads
|
| 749 |
+
- Format: XML (highly detailed financial and operational data)
|
| 750 |
+
- Parser Tools: **Giving Tuesday** open source libraries
|
| 751 |
+
- XML Parser: https://github.com/Giving-Tuesday/form-990-xml-parser
|
| 752 |
+
- XML Mapper: https://github.com/Giving-Tuesday/form-990-xml-mapper
|
| 753 |
+
- AWS S3 Index: https://registry.opendata.aws/irs990/
|
| 754 |
+
- License: Public Domain (U.S. Government data)
|
| 755 |
+
- Coverage: Annual filings from organizations with >$50K revenue
|
| 756 |
+
- Data includes: Detailed revenue, expenses, program services, officer compensation, grants, donors
|
| 757 |
+
- Used for: Financial analysis, transparency, grant research, program evaluation
|
| 758 |
+
|
| 759 |
+
**Giving Tuesday Attribution:**
|
| 760 |
+
The Giving Tuesday Data Commons provides essential tools for parsing IRS Form 990 XML data:
|
| 761 |
+
```bibtex
|
| 762 |
+
@software{giving_tuesday_form990_parser,
|
| 763 |
+
title = {Form 990 XML Parser},
|
| 764 |
+
author = {{Giving Tuesday}},
|
| 765 |
+
year = {2024},
|
| 766 |
+
url = {https://github.com/Giving-Tuesday/form-990-xml-parser},
|
| 767 |
+
note = {Open source Python library for parsing IRS Form 990 XML filings}
|
| 768 |
+
}
|
| 769 |
+
|
| 770 |
+
@software{giving_tuesday_form990_mapper,
|
| 771 |
+
title = {Form 990 XML Mapper},
|
| 772 |
+
author = {{Giving Tuesday}},
|
| 773 |
+
year = {2024},
|
| 774 |
+
url = {https://github.com/Giving-Tuesday/form-990-xml-mapper},
|
| 775 |
+
note = {Maps Form 990 XML to standardized data structures}
|
| 776 |
+
}
|
| 777 |
+
```
|
| 778 |
+
|
| 779 |
+
**More Giving Tuesday Resources:**
|
| 780 |
+
- GitHub Organization: https://github.com/Giving-Tuesday
|
| 781 |
+
- Data Commons: https://www.givingtuesday.org/data-commons
|
| 782 |
+
- Research & Insights: https://www.givingtuesday.org/research
|
| 783 |
+
### **Popolo Project**
- Coverage: Standardized schemas for Person, Organization, Membership, Post, Area, Motion, VoteEvent, Count
|
| 784 |
+
- Used for: Leader/official data modeling, organization structure, membership tracking, voting records
|
| 785 |
+
- Adoption: Used by Civic Commons, OpenNorth, mySociety, Sunlight Foundation, and 30+ civic tech organizations worldwide
|
| 786 |
+
- Citation: "Popolo Project. Open government data specifications. https://www.popoloproject.com/"
|
| 787 |
+
- **Key Features:**
|
| 788 |
+
- **Person**: Names, contact details, identifiers, links to images/sources
|
| 789 |
+
- **Organization**: Names, classification, founding/dissolution dates, contact information
|
| 790 |
+
- **Membership**: Relationship between persons and organizations (with roles and time periods)
|
| 791 |
+
- **Post**: Positions within organizations (e.g., "Mayor", "City Council Member District 3")
|
| 792 |
+
- **VoteEvent**: Votes on motions/bills with individual voter positions
|
| 793 |
+
- **Our Implementation**: LEADER and ORGANIZATION entities follow Popolo schema for maximum interoperability with civic tech platforms
|
| 794 |
+
|
| 795 |
+
**Popolo Dependencies & Standards:**
|
| 796 |
+
The Popolo specification builds upon and references the following W3C, IETF, and open data standards:
|
| 797 |
+
|
| 798 |
+
| Publisher | Specification | Prefix | Use in Popolo | URL |
|
| 799 |
+
|-----------|---------------|--------|---------------|-----|
|
| 800 |
+
| Bibliographic Framework Initiative | BIBFRAME Vocabulary | `bf` | Bibliographic references | https://www.loc.gov/bibframe/ |
|
| 801 |
+
| Ian Davis | BIO: Biographical Information | `bio` | Life events, relationships | http://purl.org/vocab/bio/0.1/ |
|
| 802 |
+
| W3C | Contact: Utility concepts | `con` | Contact information | http://www.w3.org/2000/10/swap/pim/contact# |
|
| 803 |
+
| DCMI | DCMI Metadata Terms | `dcterms` | Metadata, provenance | https://www.dublincore.org/specifications/dublin-core/dcmi-terms/ |
|
| 804 |
+
| FOAF Project | FOAF Vocabulary | `foaf` | People, social networks | http://xmlns.com/foaf/0.1/ |
|
| 805 |
+
| GeoNames | GeoNames Ontology | `gn` | Geographic names | http://www.geonames.org/ontology/ |
|
| 806 |
+
| ISA Programme | Location Core Vocabulary | `locn` | Addresses, locations | https://www.w3.org/ns/locn |
|
| 807 |
+
| OSCA Foundation | NEPOMUK Calendar Ontology | `ncal` | Events, meetings | http://www.semanticdesktop.org/ontologies/ncal/ |
|
| 808 |
+
| Open Data Institute | Open Data Rights Statement | `odrs` | Data licensing | http://schema.theodi.org/odrs |
|
| 809 |
+
| W3C | The Organization Ontology | `org` | Organizational structures | https://www.w3.org/TR/vocab-org/ |
|
| 810 |
+
| ISA Programme | Person Core Vocabulary | `person` | Person attributes | http://www.w3.org/ns/person |
|
| 811 |
+
| W3C | RDF Schema | `rdfs` | Semantic web foundation | https://www.w3.org/TR/rdf-schema/ |
|
| 812 |
+
| W3C | Schema.org | `schema` | Structured data | https://schema.org/ |
|
| 813 |
+
| W3C | SKOS | `skos` | Taxonomies, classification | https://www.w3.org/2004/02/skos/ |
|
| 814 |
+
| IETF | vCard Format | `vcard` | Contact information | https://www.rfc-editor.org/rfc/rfc6350.html |
|
| 815 |
+
|
| 816 |
+
**Popolo Classes Implemented:**
|
| 817 |
+
- ✅ **Person** → LEADER entity (elected officials, appointees)
|
| 818 |
+
- ✅ **Organization** → ORGANIZATION entity (nonprofits, government agencies)
|
| 819 |
+
- ✅ **Membership** → Implicit through leader_id/organization relationships
|
| 820 |
+
- ✅ **Post** → position_type, office fields in LEADER
|
| 821 |
+
- ✅ **Contact Detail** → email, phone, website fields
|
| 822 |
+
- ✅ **Motion** → AGENDA items, LEGISLATION entities
|
| 823 |
+
- ✅ **Vote Event** → VOTE entity
|
| 824 |
+
- ✅ **Count** → vote_yes, vote_no in VOTE and LEGISLATION
|
| 825 |
+
- ✅ **Area** → JURISDICTION entity (geographic/political boundaries)
|
| 826 |
+
- ✅ **Event** → MEETING entity
|
| 827 |
+
- ✅ **Speech** → Extracted from MINUTES, VIDEO transcripts
|
| 828 |
+
|
| 829 |
+
### **Roper Center for Public Opinion Research**
|
| 830 |
+
- Scientifically validated survey questions and public opinion data
|
| 831 |
+
- Organization: Cornell University
|
| 832 |
+
- Source: https://ropercenter.cornell.edu/
|
| 833 |
+
- iPoll Database: https://ropercenter.cornell.edu/ipoll/
|
| 834 |
+
- License: Free public search (metadata and question wording), full data requires institutional membership
|
| 835 |
+
- Coverage: 500,000+ survey questions from 1930s-present, all major polling organizations
|
| 836 |
+
- Used for: Topic definitions, validated question wording, national opinion baselines, messaging optimization
|
| 837 |
+
- Citation: "Roper Center for Public Opinion Research, Cornell University. iPoll Databank. https://ropercenter.cornell.edu/ipoll/"
|
| 838 |
+
|
| 839 |
+
### **Google Fact Check Tools API**
|
| 840 |
+
- Aggregated fact-checking data with ClaimReview structured data
|
| 841 |
+
- Organization: Google LLC
|
| 842 |
+
- Source: https://toolbox.google.com/factcheck/explorer
|
| 843 |
+
- API: https://developers.google.com/fact-check/tools/api
|
| 844 |
+
- Schema: https://developers.google.com/search/docs/appearance/structured-data/factcheck
|
| 845 |
+
- License: Free API with quota (10,000 queries/day)
|
| 846 |
+
- Coverage: 100+ fact-checking organizations worldwide, all claim types
|
| 847 |
+
- Used for: Verifying claims from meetings/legislation, tracking misinformation, accountability scoring
|
| 848 |
+
- Citation: "Google Fact Check Tools API. Google LLC. https://developers.google.com/fact-check/tools/api"
|
| 849 |
+
|
| 850 |
+
### **FactCheck.org**
|
| 851 |
+
- Nonpartisan fact-checking of political claims and viral misinformation
|
| 852 |
+
- Organization: Annenberg Public Policy Center, University of Pennsylvania
|
| 853 |
+
- Source: https://www.factcheck.org/
|
| 854 |
+
- License: Free (web scraping allowed with rate limiting)
|
| 855 |
+
- Coverage: National politics, health claims, science, viral content (2003-present)
|
| 856 |
+
- Used for: Verifying political claims, health policy fact-checking, scientific claim verification
|
| 857 |
+
- Citation: "FactCheck.org. Annenberg Public Policy Center, University of Pennsylvania. https://www.factcheck.org/"
|
| 858 |
+
|
| 859 |
+
### **PolitiFact**
|
| 860 |
+
- Pulitzer Prize-winning fact-checking with Truth-O-Meter ratings
|
| 861 |
+
- Organization: Poynter Institute
|
| 862 |
+
- Source: https://www.politifact.com/
|
| 863 |
+
- License: Free (web scraping allowed with rate limiting)
|
| 864 |
+
- Coverage: All 50 states, federal politics, ballot measures (2007-present)
|
| 865 |
+
- Rating Scale: 6-point (True, Mostly True, Half True, Mostly False, False, Pants on Fire)
|
| 866 |
+
- Used for: State-level fact-checking, tracking politician claims, ballot measure verification
|
| 867 |
+
- Citation: "PolitiFact. Poynter Institute. https://www.politifact.com/"
|
| 868 |
+
|
| 869 |
+
### **Schema.org**
|
| 870 |
+
- Structured data vocabulary for semantic web markup
|
| 871 |
+
- Organization: W3C Community Group (sponsors: Google, Microsoft, Yahoo, Yandex)
|
| 872 |
+
- Source: https://schema.org/
|
| 873 |
+
- Documentation: https://schema.org/docs/schemas.html
|
| 874 |
+
- License: Creative Commons Attribution-ShareAlike License (CC BY-SA 3.0)
|
| 875 |
+
- Coverage: 800+ types, 1,400+ properties for describing web content
|
| 876 |
+
- Used for: SEO-optimized structured data, JSON-LD exports, API documentation, search engine compatibility
|
| 877 |
+
- Citation: "Schema.org. W3C Community Group. https://schema.org/"
|
| 878 |
+
|
| 879 |
+
**Our Schema.org Type Mappings:**
|
| 880 |
+
|
| 881 |
+
| Our Entity | Schema.org Type | Properties Used | Use Case |
|
| 882 |
+
|------------|----------------|-----------------|----------|
|
| 883 |
+
| JURISDICTION | [AdministrativeArea](https://schema.org/AdministrativeArea) | name, address, geo, telephone, url | City/county geographic data |
|
| 884 |
+
| MEETING | [Event](https://schema.org/Event) + [GovernmentService](https://schema.org/GovernmentService) | name, startDate, location, organizer, description | Public meetings, hearings |
|
| 885 |
+
| LEADER | [Person](https://schema.org/Person) + [GovernmentOfficial](https://schema.org/GovernmentOfficial) | name, email, telephone, jobTitle, worksFor | Elected officials |
|
| 886 |
+
| ORGANIZATION | [Organization](https://schema.org/Organization) + [NGO](https://schema.org/NGO) | name, address, telephone, url, foundingDate | Nonprofits, agencies |
|
| 887 |
+
| LEGISLATION | [Legislation](https://schema.org/Legislation) | name, legislationDate, legislationPassedBy, legislationType | Bills, ordinances |
|
| 888 |
+
| BALLOT_MEASURE | [Legislation](https://schema.org/Legislation) + referendumProposal | name, datePosted, legislationChanges | Referendums, propositions |
|
| 889 |
+
| VOTE | [VoteAction](https://schema.org/VoteAction) | agent (Person), candidate (Legislation), actionOption | Roll call votes |
|
| 890 |
+
| FACT_CHECK | [ClaimReview](https://schema.org/ClaimReview) | claimReviewed, reviewRating, author, datePublished | Verified fact-checks |
|
| 891 |
+
| SCHOOL_DISTRICT | [EducationalOrganization](https://schema.org/EducationalOrganization) | name, address, telephone, numberOfStudents | K-12 school districts |
|
| 892 |
+
| NONPROFIT_FINANCES | [MonetaryGrant](https://schema.org/MonetaryGrant) | funder, amount, fundedItem | IRS Form 990 data |
|
| 893 |
+
| VIDEO | [VideoObject](https://schema.org/VideoObject) | name, description, uploadDate, duration, thumbnailUrl | Meeting recordings |
|
| 894 |
+
| DOCUMENT | [DigitalDocument](https://schema.org/DigitalDocument) | name, fileFormat, datePublished, url | PDFs, agendas, minutes |
|
| 895 |
+
|
| 896 |
+
**Benefits:**
|
| 897 |
+
- ✅ **SEO Enhancement**: Google Search rich results for meetings, officials, organizations
|
| 898 |
+
- ✅ **Voice Assistant Ready**: Alexa, Google Assistant can parse our structured data
|
| 899 |
+
- ✅ **Knowledge Graph**: Data appears in Google Knowledge Panels
|
| 900 |
+
- ✅ **API Discoverability**: Standards-compliant REST/GraphQL responses
|
| 901 |
+
- ✅ **Cross-platform**: Compatible with Apple Podcasts, Microsoft Bing, Yandex
|
| 902 |
+
|
| 903 |
+
### **Common Education Data Standards (CEDS)**
|
| 904 |
+
- Comprehensive education data standards for K-12, postsecondary, and workforce
|
| 905 |
+
- Organization: U.S. Department of Education, National Center for Education Statistics (NCES)
|
| 906 |
+
- Source: https://ceds.ed.gov/
|
| 907 |
+
- GitHub: https://github.com/CEDStandards
|
| 908 |
+
- Specification Repository: https://github.com/CEDStandards/CEDS-Elements
|
| 909 |
+
- License: Public Domain (U.S. Government)
|
| 910 |
+
- Coverage: 2,300+ data elements, 500+ option sets, alignment with NCES surveys
|
| 911 |
+
- Used for: School district data modeling, NCES interoperability, education finance tracking
|
| 912 |
+
- Citation: "Common Education Data Standards (CEDS). National Center for Education Statistics. https://ceds.ed.gov/"
|
| 913 |
+
|
| 914 |
+
**CEDS Alignment for School Districts:**
|
| 915 |
+
|
| 916 |
+
| Our Field | CEDS Element ID | CEDS Element Name | Description |
|
| 917 |
+
|-----------|----------------|-------------------|-------------|
|
| 918 |
+
| `nces_id` | 000827 | LEA Identifier (NCES) | National Center for Education Statistics LEA ID |
|
| 919 |
+
| `district_name` | 000168 | Name of Institution | Legal name of the school district |
|
| 920 |
+
| `district_type` | 000108 | LEA Type | Local, State, Federal, or Other |
|
| 921 |
+
| `total_students` | 001475 | Student Count | Total number of students enrolled |
|
| 922 |
+
| `total_schools` | 000856 | Number of Schools | Count of schools in district |
|
| 923 |
+
| `total_revenue` | 000612 | Total Revenue | Sum of all revenue sources |
|
| 924 |
+
| `total_expenditures` | 000611 | Total Expenditures | Sum of all spending categories |
|
| 925 |
+
| `per_pupil_spending` | 000613 | Expenditure per Student | Total expenditures / student count |
|
| 926 |
+
| `federal_revenue` | 000614 | Federal Revenue | Revenue from federal government |
|
| 927 |
+
| `state_revenue` | 000615 | State Revenue | Revenue from state sources |
|
| 928 |
+
| `local_revenue` | 000616 | Local Revenue | Revenue from property taxes, bonds |
|
| 929 |
+
| `superintendent` | 000240 | Chief Administrator Name | District superintendent name |
|
| 930 |
+
| `school_year` | 000243 | School Year | Academic year (e.g., 2023-2024) |
|
| 931 |
+
|
| 932 |
+
**CEDS Option Sets Used:**
|
| 933 |
+
- **LEA Type** (CEDS 000108): Regular, Specialized, Supervisory Union, Service Agency, State Agency, Federal Agency
|
| 934 |
+
- **Grade Level** (CEDS 000100): PK, KG, 01-12, UG (ungraded)
|
| 935 |
+
- **Operational Status** (CEDS 000533): Open, Closed, New, Added, Changed Agency, Temporarily Closed
|
| 936 |
+
- **Locale Type** (CEDS 001315): City, Suburb, Town, Rural (NCES Urban-centric locale codes)
|
| 937 |
+
|
| 938 |
+
**Benefits of CEDS Compliance:**
|
| 939 |
+
- ✅ **NCES Compatibility**: Direct mapping to Common Core of Data (CCD) and F-33 Finance Survey
|
| 940 |
+
- ✅ **State Reporting**: Aligns with state education department data systems
|
| 941 |
+
- ✅ **Federal Grants**: Standardized reporting for ESSA, Title I, IDEA compliance
|
| 942 |
+
- ✅ **Longitudinal Tracking**: Consistent identifiers for multi-year analysis
|
| 943 |
+
- ✅ **Interoperability**: Works with Ed-Fi Alliance, IMS Global, SIF Association standards
|
| 944 |
+
|
| 945 |
+
### **Microsoft Common Data Model for Nonprofits**
|
| 946 |
+
- Industry-standard data model for nonprofit organizations built on Microsoft Dataverse
|
| 947 |
+
- Organization: Microsoft Corporation
|
| 948 |
+
- Repository: https://github.com/microsoft/Nonprofits/tree/master/CommonDataModelforNonprofits
|
| 949 |
+
- ERD Documentation: https://github.com/microsoft/Nonprofits/blob/master/CommonDataModelforNonprofits/Documents/common-data-model-for-nonprofits-erds.pdf
|
| 950 |
+
- License: MIT License
|
| 951 |
+
- Coverage: Donor management, fundraising, program delivery, volunteer management, impact measurement, award/grant tracking
|
| 952 |
+
- Used for: Nonprofit data standardization, Dynamics 365 integration, constituent relationship management, outcome tracking
|
| 953 |
+
- Citation: "Microsoft Common Data Model for Nonprofits. Microsoft Corporation. https://github.com/microsoft/Nonprofits/"
|
| 954 |
+
|
| 955 |
+
**Microsoft CDM Nonprofit Core Entities:**
|
| 956 |
+
|
| 957 |
+
| Entity | Description | Our Implementation |
|
| 958 |
+
|--------|-------------|--------------------|
|
| 959 |
+
| **Constituent** | Individuals who interact with nonprofit (donors, volunteers, members, beneficiaries) | CONSTITUENT entity |
|
| 960 |
+
| **Donation** | Financial contributions and in-kind gifts | DONATION entity |
|
| 961 |
+
| **Designation** | How donations are allocated (programs, funds, campaigns) | designation_id in DONATION |
|
| 962 |
+
| **Campaign** | Fundraising campaigns and appeals | CAMPAIGN entity |
|
| 963 |
+
| **Membership** | Member enrollment and renewal tracking | MEMBERSHIP entity |
|
| 964 |
+
| **Volunteer** | Volunteer activities, hours, and preferences | VOLUNTEER_ACTIVITY entity |
|
| 965 |
+
| **Award** | Grants received by the nonprofit | Awards captured in NONPROFIT_FINANCES |
|
| 966 |
+
| **Disbursement** | Spending of grant/award funds | Expenditures in GOVERNMENT_BUDGET |
|
| 967 |
+
| **Objective** | Measurable program outcomes and impact | PROGRAM_OUTCOME entity |
|
| 968 |
+
| **DeliveryFramework** | Programs and services delivered | PROGRAM_DELIVERY entity |
|
| 969 |
+
| **Budget** | Organizational budgets and allocations | GOVERNMENT_BUDGET, SCHOOL_DISTRICT budgets |
|
| 970 |
+
| **Indicator** | Key performance indicators for impact | Metrics in PROGRAM_OUTCOME |
|
| 971 |
+
|
| 972 |
+
**Key Entity Relationships (Microsoft CDM Pattern):**
|
| 973 |
+
- Constituent → Donation (one-to-many): A constituent makes many donations
|
| 974 |
+
- Donation → Designation (many-to-one): Multiple donations to one fund/program
|
| 975 |
+
- Campaign → Donation (one-to-many): A campaign receives many donations
|
| 976 |
+
- Constituent → Membership (one-to-many): A constituent can have multiple memberships over time
|
| 977 |
+
- Constituent → Volunteer (one-to-many): A constituent volunteers for multiple activities
|
| 978 |
+
- Organization → DeliveryFramework (one-to-many): An organization delivers multiple programs
|
| 979 |
+
- DeliveryFramework → Objective (one-to-many): A program has multiple outcome objectives
|
| 980 |
+
|
| 981 |
+
**Benefits of Microsoft CDM Alignment:**
|
| 982 |
+
- ✅ **Dynamics 365 Integration**: Native compatibility with Microsoft Cloud for Nonprofits
|
| 983 |
+
- ✅ **Power Platform**: Direct use in Power BI, Power Apps, Power Automate
|
| 984 |
+
- ✅ **Azure Synapse**: Seamless analytics with Azure data services
|
| 985 |
+
- ✅ **Industry Standard**: Adopted by large nonprofits using Microsoft ecosystem
|
| 986 |
+
- ✅ **Grant Compliance**: Built-in support for grant reporting and outcome measurement
|
| 987 |
+
- ✅ **Constituent 360**: Unified view of donor, volunteer, member activities
|
| 988 |
+
|
| 989 |
+
---
|
| 990 |
+
|
| 991 |
+
## 🎯 **Grant Research and Fundraising Platforms**
|
| 992 |
+
|
| 993 |
+
These platforms are built on open-source principles or community-funded models to keep grant and fundraising data accessible.
|
| 994 |
+
|
| 995 |
+
### **Grantmakers.io** ⭐
|
| 996 |
+
|
| 997 |
+
**"Free as in Freedom" Grant Research**
|
| 998 |
+
|
| 999 |
+
Grantmakers.io is the gold standard for open, community-supported foundation research. It provides lightning-fast search through IRS 990-PF data with no login required.
|
| 1000 |
+
|
| 1001 |
+
**Organization:** Community-supported open-source project
|
| 1002 |
+
**Website:** https://www.grantmakers.io/
|
| 1003 |
+
**Data Source:** IRS Form 990-PF (Private Foundation tax returns)
|
| 1004 |
+
**License:** Open source, community-funded
|
| 1005 |
+
**Access:** 100% free, no account or API key required
|
| 1006 |
+
**Coverage:** All U.S. private foundations filing Form 990-PF (75,000+ grantmaking foundations)
|
| 1007 |
+
|
| 1008 |
+
**What we use:**
|
| 1009 |
+
- **Foundation Giving Histories**: Search foundations by who they've funded in the past
|
| 1010 |
+
- **Grantee Databases**: Find all grants made to specific organizations
|
| 1011 |
+
- **Geographic Targeting**: Search by state, city, or region
|
| 1012 |
+
- **Funding Amounts**: Filter by grant size ranges
|
| 1013 |
+
- **NTEE Categories**: Search by nonprofit sector (health, education, environment, etc.)
|
| 1014 |
+
- **Year-over-Year Trends**: Track foundation giving patterns over time
|
| 1015 |
+
|
| 1016 |
+
**Key Features:**
|
| 1017 |
+
- ⚡ **Lightning-Fast Search**: Instant results across millions of grant records
|
| 1018 |
+
- 🔓 **No Login Required**: Completely open access, no barriers
|
| 1019 |
+
- 📊 **Detailed 990-PF Data**: Full foundation financials, officers, assets
|
| 1020 |
+
- 🎯 **Relationship Mapping**: Discover foundation-grantee connections
|
| 1021 |
+
- 📈 **Trend Analysis**: Multi-year giving patterns and focus areas
|
| 1022 |
+
- 🆓 **Always Free**: Community-funded to remain accessible
|
| 1023 |
+
|
| 1024 |
+
**Use Cases:**
|
| 1025 |
+
- **Grant Prospecting**: Find foundations that fund similar organizations in your area
|
| 1026 |
+
- **Relationship Research**: Identify foundations that have supported oral health, public health, or civic engagement
|
| 1027 |
+
- **Competitive Analysis**: See which organizations are receiving grants in your field
|
| 1028 |
+
- **Foundation Vetting**: Review foundation assets, giving patterns, and leadership before applying
|
| 1029 |
+
|
| 1030 |
+
**Example Searches:**
|
| 1031 |
+
- Foundations that funded "fluoridation" or "oral health" projects
|
| 1032 |
+
- Grantmakers in Massachusetts supporting health policy advocacy
|
| 1033 |
+
- Foundations with >$10M assets funding civic engagement
|
| 1034 |
+
- All grants made by Robert Wood Johnson Foundation to nonprofits in Alabama
|
| 1035 |
+
|
| 1036 |
+
**BibTeX:**
|
| 1037 |
+
```bibtex
|
| 1038 |
+
@misc{grantmakersio,
|
| 1039 |
+
title = {Grantmakers.io: Open Foundation Research Platform},
|
| 1040 |
+
year = {2026},
|
| 1041 |
+
url = {https://www.grantmakers.io/},
|
| 1042 |
+
note = {Community-supported open-source platform for searching IRS 990-PF private foundation data}
|
| 1043 |
+
}
|
| 1044 |
+
```
|
| 1045 |
+
|
| 1046 |
+
**Citation:** "Grantmakers.io. Community-supported open foundation research. https://www.grantmakers.io/"
|
| 1047 |
+
|
| 1048 |
+
---
|
| 1049 |
+
|
| 1050 |
+
### **Zeffy** ⭐
|
| 1051 |
+
|
| 1052 |
+
**100% Free Fundraising with AI-Powered Grant Matching**
|
| 1053 |
+
|
| 1054 |
+
Zeffy is unique for being a completely free fundraising platform that also offers an AI-powered grant search tool to help match nonprofit missions with potential grant opportunities.
|
| 1055 |
+
|
| 1056 |
+
**Organization:** Zeffy, Inc.
|
| 1057 |
+
**Website:** https://www.zeffy.com/
|
| 1058 |
+
**Platform:** Fundraising + Grant Discovery
|
| 1059 |
+
**Cost:** 100% free for nonprofits (donor-covered fees model)
|
| 1060 |
+
**Grant Tool:** AI-powered grant opportunity matching
|
| 1061 |
+
**Coverage:** U.S. and Canadian grant opportunities
|
| 1062 |
+
|
| 1063 |
+
**What we use:**
|
| 1064 |
+
- **AI Grant Matching**: Automated matching of nonprofit missions to relevant grant opportunities
|
| 1065 |
+
- **Fundraising Infrastructure**: Donation processing, event ticketing, membership management
|
| 1066 |
+
- **Donor Management**: CRM for tracking constituent relationships
|
| 1067 |
+
- **Grant Alerts**: Notifications when new matching opportunities are posted
|
| 1068 |
+
|
| 1069 |
+
**Key Features:**
|
| 1070 |
+
- 💰 **100% Free**: No platform fees, monthly charges, or hidden costs
|
| 1071 |
+
- 🤖 **AI-Powered Matching**: Machine learning matches your mission to grant opportunities
|
| 1072 |
+
- 📧 **Grant Alerts**: Email notifications for new matching grants
|
| 1073 |
+
- 🎟️ **All-in-One Platform**: Donations, events, memberships, grants in one system
|
| 1074 |
+
- 🇺🇸 🇨🇦 **North America Coverage**: U.S. and Canadian grant databases
|
| 1075 |
+
- 📊 **Impact Reporting**: Built-in analytics for grant reporting requirements
|
| 1076 |
+
|
| 1077 |
+
**Grant Discovery Capabilities:**
|
| 1078 |
+
- **Mission-Based Matching**: Upload your mission statement, get matched grants
|
| 1079 |
+
- **Federal Grants**: Monitors Grants.gov for federal opportunities
|
| 1080 |
+
- **Foundation Grants**: Tracks private foundation RFPs and announcements
|
| 1081 |
+
- **Corporate Giving**: Alerts for corporate philanthropy programs
|
| 1082 |
+
- **Local Grants**: Community foundation and regional funder opportunities
|
| 1083 |
+
|
| 1084 |
+
**Use Cases for This Project:**
|
| 1085 |
+
- **Nonprofit Fundraising**: Organizations can use Zeffy for zero-cost donation processing
|
| 1086 |
+
- **Grant Prospecting**: AI helps match oral health nonprofits to relevant grant opportunities
|
| 1087 |
+
- **Event Fundraising**: Free ticketing for fundraising galas, community events
|
| 1088 |
+
- **Membership Management**: Track supporters, volunteers, members at no cost
|
| 1089 |
+
- **Sustainability**: Recommend to small nonprofits to reduce overhead costs
|
| 1090 |
+
|
| 1091 |
+
**Why It's Important:**
|
| 1092 |
+
Traditional fundraising platforms charge 3-5% fees on donations, draining resources from small nonprofits. Zeffy's donor-covered model means 100% of each donation goes to the organization, making it especially valuable for grassroots oral health advocacy groups.
|
| 1093 |
+
|
| 1094 |
+
**BibTeX:**
|
| 1095 |
+
```bibtex
|
| 1096 |
+
@misc{zeffy_platform,
|
| 1097 |
+
title = {Zeffy: 100% Free Fundraising Platform with AI Grant Matching},
|
| 1098 |
+
author = {{Zeffy, Inc.}},
|
| 1099 |
+
year = {2026},
|
| 1100 |
+
url = {https://www.zeffy.com/},
|
| 1101 |
+
note = {Free fundraising platform with AI-powered grant discovery for U.S. and Canadian nonprofits}
|
| 1102 |
+
}
|
| 1103 |
+
```
|
| 1104 |
+
|
| 1105 |
+
**Citation:** "Zeffy. 100% Free Fundraising Platform with AI Grant Matching. https://www.zeffy.com/"
|
| 1106 |
+
|
| 1107 |
+
---
|
| 1108 |
+
|
| 1109 |
+
### **Community Foundations** ⭐
|
| 1110 |
+
|
| 1111 |
+
**Local Grant Opportunities Often Overlooked**
|
| 1112 |
+
|
| 1113 |
+
Community foundations are often the most accessible grant sources for local nonprofits, yet they're frequently overlooked because they don't appear in major federal databases. Most maintain their own open listings for regional grants.
|
| 1114 |
+
|
| 1115 |
+
**What Community Foundations Are:**
|
| 1116 |
+
Community foundations are public charities that pool donations from individuals, families, and businesses to support local nonprofits through competitive grants, scholarship programs, and donor-advised funds.
|
| 1117 |
+
|
| 1118 |
+
**Why They Matter:**
|
| 1119 |
+
- 🏘️ **Local Focus**: Prioritize organizations serving their specific geographic region
|
| 1120 |
+
- 💵 **Smaller, Accessible Grants**: $500-$50,000 range, ideal for grassroots groups
|
| 1121 |
+
- 🤝 **Relationship-Based**: Local foundations know local issues and local leaders
|
| 1122 |
+
- 📋 **Simpler Applications**: Less bureaucratic than federal or national foundations
|
| 1123 |
+
- ⚡ **Faster Decisions**: Many have quarterly or rolling deadlines
|
| 1124 |
+
- 🎯 **Mission Alignment**: Support for community health, civic engagement, education
|
| 1125 |
+
|
| 1126 |
+
**Examples of Community Foundations:**
|
| 1127 |
+
|
| 1128 |
+
| Foundation | Region | Website | Grant Focus Areas |
|
| 1129 |
+
|------------|--------|---------|-------------------|
|
| 1130 |
+
| **Central Alabama Community Foundation** | Birmingham, AL metro | https://www.cacfbirmingham.org/ | Health, education, civic engagement, arts |
|
| 1131 |
+
| **Community Foundation for Greater Atlanta** | Atlanta, GA metro | https://cfgreateratlanta.org/ | Health equity, education, economic mobility |
|
| 1132 |
+
| **Boston Foundation** | Boston, MA metro | https://www.tbf.org/ | Health, housing, education, civic participation |
|
| 1133 |
+
| **Community Foundation of Greater Memphis** | Memphis, TN metro | https://cfgm.org/ | Health, youth development, community engagement |
|
| 1134 |
+
| **Silicon Valley Community Foundation** | San Francisco Bay Area | https://www.siliconvalleycf.org/ | Health, education, immigration, environment |
|
| 1135 |
+
| **Greater Kansas City Community Foundation** | Kansas City, MO/KS | https://www.growyourgiving.org/ | Health, education, civic infrastructure |
|
| 1136 |
+
| **Seattle Foundation** | Seattle, WA metro | https://www.seattlefoundation.org/ | Racial equity, community health, economic opportunity |
|
| 1137 |
+
|
| 1138 |
+
**How to Find Your Local Community Foundation:**
|
| 1139 |
+
1. **Council on Foundations Directory**: https://www.cof.org/community-foundation-locator
|
| 1140 |
+
2. **Candid (formerly Foundation Center)**: https://candid.org/find-us/foundation-finder
|
| 1141 |
+
3. **State Associations**: Most states have a community foundation association
|
| 1142 |
+
4. **Google Search**: "[Your City] Community Foundation" or "[Your County] Community Foundation"
|
| 1143 |
+
|
| 1144 |
+
**Grant Opportunities:**
|
| 1145 |
+
- **Competitive Grants**: Open RFPs for nonprofits in specific focus areas
|
| 1146 |
+
- **Capacity Building Grants**: Support for operations, staffing, strategic planning
|
| 1147 |
+
- **Donor-Advised Funds**: Individuals/families make grants through the foundation
|
| 1148 |
+
- **Fiscal Sponsorship**: Some foundations sponsor projects for groups without 501(c)(3) status
|
| 1149 |
+
- **Scholarship Programs**: Education grants for students (often administered by community foundations)
|
| 1150 |
+
|
| 1151 |
+
**For Oral Health Advocacy:**
|
| 1152 |
+
Many community foundations have health equity or preventive health focus areas that align perfectly with fluoridation advocacy, dental access programs, and oral health education. They're often the best first step for local grassroots campaigns.
|
| 1153 |
+
|
| 1154 |
+
**How We Use Community Foundation Data:**
|
| 1155 |
+
- **Local Grant Mapping**: Identify which community foundations serve each jurisdiction
|
| 1156 |
+
- **Nonprofit Funding Sources**: Link organizations to local foundation grants received
|
| 1157 |
+
- **Geographic Targeting**: Recommend local funders when users search by city/county
|
| 1158 |
+
- **Grant Prospecting**: Alert nonprofits to community foundation RFPs in their area
|
| 1159 |
+
|
| 1160 |
+
**BibTeX:**
|
| 1161 |
+
```bibtex
|
| 1162 |
+
@misc{community_foundations,
|
| 1163 |
+
title = {Community Foundations: Local Grant Opportunities},
|
| 1164 |
+
author = {{Council on Foundations}},
|
| 1165 |
+
year = {2026},
|
| 1166 |
+
url = {https://www.cof.org/community-foundation-locator},
|
| 1167 |
+
note = {Network of 700+ community foundations providing local grants across the United States}
|
| 1168 |
+
}
|
| 1169 |
+
```
|
| 1170 |
+
|
| 1171 |
+
**Citation:** "Community Foundations. Council on Foundations. https://www.cof.org/community-foundation-locator"
|
| 1172 |
+
|
| 1173 |
+
---
|
| 1174 |
+
|
| 1175 |
+
## 🏛️ **Civic Tech Organizations & Resources**
|
| 1176 |
+
|
| 1177 |
+
### **Code for America** ⭐
|
| 1178 |
+
|
| 1179 |
+
The flagship U.S. civic technology nonprofit organization, convening government leaders and technologists to transform public services.
|
| 1180 |
+
|
| 1181 |
+
**Organization:** Code for America
|
| 1182 |
+
**Website:** https://codeforamerica.org/
|
| 1183 |
+
**About:** National nonprofit working with government to build digital services that are simple, effective, and accessible to all
|
| 1184 |
+
**Founded:** 2009
|
| 1185 |
+
**Coverage:** National (50 states), with focus on state-level government transformation
|
| 1186 |
+
|
| 1187 |
+
**What we use:**
|
| 1188 |
+
- **Summit Insights**: Annual Summit (most recently Summit 2026) where state-level AI leads and municipal CIOs set the civic tech agenda for the year
|
| 1189 |
+
- **Brigade Network**: Community chapters across the U.S. working on local civic tech projects
|
| 1190 |
+
- **Best Practices**: Government service design patterns, digital service standards
|
| 1191 |
+
- **Technology Landscape**: Trends in state and local government digital transformation
|
| 1192 |
+
|
| 1193 |
+
**Key Programs:**
|
| 1194 |
+
- **Code for America Summit**: Annual conference bringing together 1,500+ government leaders, technologists, and advocates
|
| 1195 |
+
- **Brigade Network**: 80+ volunteer chapters in cities across America building civic tech solutions
|
| 1196 |
+
- **Get CalFresh**: Flagship product helping millions access food benefits through simplified digital applications
|
| 1197 |
+
- **Clear My Record**: Automated criminal record clearance to help people move forward
|
| 1198 |
+
- **Government Services Portfolio**: Digital tools for social safety net programs
|
| 1199 |
+
|
| 1200 |
+
**Resources:**
|
| 1201 |
+
- Summit: https://codeforamerica.org/summit/
|
| 1202 |
+
- Brigade Network: https://brigade.codeforamerica.org/
|
| 1203 |
+
- Blog: https://codeforamerica.org/news/
|
| 1204 |
+
- GitHub: https://github.com/codeforamerica
|
| 1205 |
+
- Annual Reports: https://codeforamerica.org/news/category/annual-reports/
|
| 1206 |
+
|
| 1207 |
+
**Why Code for America:**
|
| 1208 |
+
- **Agenda Setting**: The Summit is where state-level AI leads and municipal CIOs define priorities for the year
|
| 1209 |
+
- **Network Effect**: Connects civic technologists across the country
|
| 1210 |
+
- **Proven Impact**: Products serving millions of Americans annually
|
| 1211 |
+
- **Open Source**: Many tools available as open source for other governments to adopt
|
| 1212 |
+
|
| 1213 |
+
**BibTeX:**
|
| 1214 |
+
```bibtex
|
| 1215 |
+
@misc{code_for_america,
|
| 1216 |
+
title = {Code for America},
|
| 1217 |
+
author = {{Code for America}},
|
| 1218 |
+
year = {2026},
|
| 1219 |
+
url = {https://codeforamerica.org/},
|
| 1220 |
+
note = {National nonprofit transforming government digital services and convening state and local government technology leaders}
|
| 1221 |
+
}
|
| 1222 |
+
```
|
| 1223 |
+
|
| 1224 |
+
**Citation:** "Code for America. https://codeforamerica.org/"
|
| 1225 |
+
|
| 1226 |
+
---
|
| 1227 |
+
|
| 1228 |
+
### **GovTech.com (Government Technology)** ⭐
|
| 1229 |
+
|
| 1230 |
+
The primary news and ranking source for the government technology industry, providing market intelligence and trend analysis.
|
| 1231 |
+
|
| 1232 |
+
**Organization:** Government Technology (e.Republic)
|
| 1233 |
+
**Website:** https://www.govtech.com/
|
| 1234 |
+
**About:** Leading publication covering technology trends, policy, and innovation in state and local government
|
| 1235 |
+
**Founded:** 1987
|
| 1236 |
+
**Coverage:** State and local government across all 50 states
|
| 1237 |
+
|
| 1238 |
+
**What we use:**
|
| 1239 |
+
- **GovTech 100**: Annual definitive directory of the top 100 trending companies in the U.S. public sector technology market
|
| 1240 |
+
- **Industry Trends**: Analysis of emerging technologies, procurement trends, and digital transformation initiatives
|
| 1241 |
+
- **Vendor Landscape**: Tracking government technology companies, products, and solutions
|
| 1242 |
+
- **Policy Coverage**: Legislative and regulatory developments affecting civic technology
|
| 1243 |
+
|
| 1244 |
+
**Key Resources:**
|
| 1245 |
+
- **GovTech 100 List**: https://www.govtech.com/100/ - The definitive annual ranking of companies shaping the future of state and local government
|
| 1246 |
+
- **Navigator Awards**: https://www.govtech.com/navigator - Recognition of state and local government IT leaders
|
| 1247 |
+
- **Digital Cities Survey**: Annual ranking of America's most digitally advanced cities and counties
|
| 1248 |
+
- **Research Center**: https://www.govtech.com/research/ - White papers, surveys, and industry reports
|
| 1249 |
+
- **Webinars & Events**: https://www.govtech.com/events/
|
| 1250 |
+
|
| 1251 |
+
**GovTech 100 Categories:**
|
| 1252 |
+
- Cloud & Infrastructure
|
| 1253 |
+
- Cybersecurity
|
| 1254 |
+
- Data & Analytics
|
| 1255 |
+
- Digital Government Services
|
| 1256 |
+
- Education Technology
|
| 1257 |
+
- Emergency Management
|
| 1258 |
+
- Financial Management
|
| 1259 |
+
- GIS & Mapping
|
| 1260 |
+
- Health & Human Services
|
| 1261 |
+
- Public Safety
|
| 1262 |
+
- Transportation
|
| 1263 |
+
|
| 1264 |
+
**Why GovTech.com:**
|
| 1265 |
+
- **Market Intelligence**: The GovTech 100 is the authoritative list of trending companies in government technology
|
| 1266 |
+
- **Vendor Discovery**: Comprehensive directory of solutions available to state and local government
|
| 1267 |
+
- **Industry Standards**: Defines what's considered "trending" and "emerging" in the civic tech marketplace
|
| 1268 |
+
- **Procurement Insights**: Helps identify which technologies governments are actively adopting
|
| 1269 |
+
|
| 1270 |
+
**Resources:**
|
| 1271 |
+
- Main Site: https://www.govtech.com/
|
| 1272 |
+
- GovTech 100: https://www.govtech.com/100/
|
| 1273 |
+
- Newsletter: https://www.govtech.com/newsletters/
|
| 1274 |
+
- Podcasts: https://www.govtech.com/podcasts/
|
| 1275 |
+
- Magazine Archive: https://www.govtech.com/magazines/
|
| 1276 |
+
|
| 1277 |
+
**BibTeX:**
|
| 1278 |
+
```bibtex
|
| 1279 |
+
@misc{govtech_magazine,
|
| 1280 |
+
title = {Government Technology Magazine},
|
| 1281 |
+
author = {{e.Republic Inc.}},
|
| 1282 |
+
year = {2026},
|
| 1283 |
+
url = {https://www.govtech.com/},
|
| 1284 |
+
note = {Leading publication and market intelligence source for state and local government technology, publisher of the annual GovTech 100 list}
|
| 1285 |
+
}
|
| 1286 |
+
```
|
| 1287 |
+
|
| 1288 |
+
**GovTech 100 BibTeX:**
|
| 1289 |
+
```bibtex
|
| 1290 |
+
@misc{govtech_100,
|
| 1291 |
+
title = {GovTech 100: Companies Trending in State and Local Government},
|
| 1292 |
+
author = {{Government Technology}},
|
| 1293 |
+
year = {2026},
|
| 1294 |
+
url = {https://www.govtech.com/100/},
|
| 1295 |
+
note = {Annual directory of the top 100 trending companies serving the U.S. public sector}
|
| 1296 |
+
}
|
| 1297 |
+
```
|
| 1298 |
+
|
| 1299 |
+
**Citation:** "Government Technology. e.Republic Inc. https://www.govtech.com/"
|
| 1300 |
+
|
| 1301 |
+
---
|
| 1302 |
+
|
| 1303 |
+
### **Civic Tech Guide** ⭐
|
| 1304 |
+
|
| 1305 |
+
Comprehensive, curated directory of civic technology projects, organizations, and tools worldwide, maintained by the civic tech community.
|
| 1306 |
+
|
| 1307 |
+
**Organization:** Civic Tech Field Guide (Community-maintained)
|
| 1308 |
+
**Website:** https://app.civictech.guide/
|
| 1309 |
+
**About:** Open directory and knowledge base of civic tech projects, tools, and organizations with detailed project profiles and categorization
|
| 1310 |
+
**Founded:** 2018
|
| 1311 |
+
**Coverage:** Global civic technology ecosystem (1,000+ projects)
|
| 1312 |
+
|
| 1313 |
+
**What we use:**
|
| 1314 |
+
- **Project Directory**: Discovery of related civic tech tools and platforms
|
| 1315 |
+
- **Categorization**: Understanding how civic tech projects are classified and tagged
|
| 1316 |
+
- **Community Connections**: Network of civic technologists and organizations
|
| 1317 |
+
- **Best Practices**: Learning from similar projects and their approaches
|
| 1318 |
+
|
| 1319 |
+
**CommunityOne Profile:**
|
| 1320 |
+
- **Listed as**: https://app.civictech.guide/p/communityone/r/recN0BG4gvjXT7WLf
|
| 1321 |
+
- **Categories**: Open Government, Civic Engagement, AI/Machine Learning
|
| 1322 |
+
- **Description**: AI-powered civic engagement platform tracking local government meetings, legislation, and advocacy opportunities
|
| 1323 |
+
|
| 1324 |
+
**Key Features:**
|
| 1325 |
+
- **Search & Filter**: Discover projects by topic, geography, technology, and impact area
|
| 1326 |
+
- **Project Profiles**: Detailed information about civic tech initiatives including status, team, and technology
|
| 1327 |
+
- **Tagging System**: Categorization by civic tech domains (transparency, participation, accountability, etc.)
|
| 1328 |
+
- **API Access**: Programmatic access to the project database
|
| 1329 |
+
- **Community Contributions**: Open for civic tech projects to self-list and update profiles
|
| 1330 |
+
|
| 1331 |
+
**Categories in Civic Tech Guide:**
|
| 1332 |
+
- Open Government & Transparency
|
| 1333 |
+
- Civic Participation & Engagement
|
| 1334 |
+
- Community Organizing
|
| 1335 |
+
- Democracy & Voting
|
| 1336 |
+
- Public Service Delivery
|
| 1337 |
+
- Data & Research
|
| 1338 |
+
- Advocacy & Policy
|
| 1339 |
+
- Urban Planning & Development
|
| 1340 |
+
|
| 1341 |
+
**Resources:**
|
| 1342 |
+
- Main Site: https://app.civictech.guide/
|
| 1343 |
+
- About: https://civictech.guide/
|
| 1344 |
+
- Submit Project: https://app.civictech.guide/submit
|
| 1345 |
+
- GitHub: https://github.com/compilerla/civic-tech-taxonomy
|
| 1346 |
+
|
| 1347 |
+
**Why Civic Tech Guide:**
|
| 1348 |
+
- **Discovery**: Find related projects and potential collaborators
|
| 1349 |
+
- **Context**: Understand where your project fits in the broader civic tech ecosystem
|
| 1350 |
+
- **Community**: Connect with civic technologists working on similar problems
|
| 1351 |
+
- **Legitimacy**: Being listed establishes credibility in the civic tech community
|
| 1352 |
+
|
| 1353 |
+
**BibTeX:**
|
| 1354 |
+
```bibtex
|
| 1355 |
+
@misc{civic_tech_guide,
|
| 1356 |
+
title = {Civic Tech Field Guide},
|
| 1357 |
+
author = {{Civic Tech Field Guide Community}},
|
| 1358 |
+
year = {2026},
|
| 1359 |
+
url = {https://civictech.guide/},
|
| 1360 |
+
note = {Community-maintained directory of civic technology projects and organizations worldwide}
|
| 1361 |
+
}
|
| 1362 |
+
```
|
| 1363 |
+
|
| 1364 |
+
**Citation:** "Civic Tech Field Guide. https://civictech.guide/"
|
| 1365 |
+
|
| 1366 |
+
---
|
| 1367 |
+
|
| 1368 |
+
## 🛠️ **Technology Platforms & Support Programs**
|
| 1369 |
+
|
| 1370 |
+
### **Databricks for Good Program** ⭐
|
| 1371 |
+
|
| 1372 |
+
A philanthropic initiative providing cloud data platform credits and technical support to nonprofits, academic institutions, and social impact organizations.
|
| 1373 |
+
|
| 1374 |
+
**Organization:** Databricks, Inc.
|
| 1375 |
+
**Website:** https://www.databricks.com/product/databricks-for-good
|
| 1376 |
+
**Application:** https://www.databricks.com/product/databricks-for-good
|
| 1377 |
+
|
| 1378 |
+
**Eligibility:**
|
| 1379 |
+
- ✅ **Nonprofits** - 501(c)(3) status required
|
| 1380 |
+
- ✅ **Academic institutions** - Universities, colleges, research organizations
|
| 1381 |
+
- ✅ **Social impact organizations** - Civic engagement, public good projects
|
| 1382 |
+
- ✅ **Government agencies** - Case-by-case evaluation for civic data initiatives
|
| 1383 |
+
|
| 1384 |
+
**CommunityOne/Open Navigator Alignment:**
|
| 1385 |
+
This project appears well-suited for the program as a civic engagement and social good initiative. Eligibility would depend on establishing formal nonprofit status or academic partnership.
|
| 1386 |
+
|
| 1387 |
+
**Program Benefits:**
|
| 1388 |
+
- **$10,000-50,000** in annual Databricks credits
|
| 1389 |
+
- Access to **Unity Catalog** (normally $0.20 per million metadata operations)
|
| 1390 |
+
- Access to **Databricks Marketplace** for data sharing and distribution
|
| 1391 |
+
- **Standard tier** platform features included
|
| 1392 |
+
- **Technical support** from Databricks team
|
| 1393 |
+
- **Delta Sharing** protocol for secure data distribution
|
| 1394 |
+
- **MLflow** for AI/ML experiment tracking
|
| 1395 |
+
- **Databricks SQL** for analytics and dashboards
|
| 1396 |
+
|
| 1397 |
+
**What This Enables for Open Navigator:**
|
| 1398 |
+
- **Data Publishing**: Share 1.8M nonprofit profiles, 4.5M+ legislative documents via Databricks Marketplace
|
| 1399 |
+
- **Unity Catalog**: Organize data assets with enterprise-grade governance
|
| 1400 |
+
- **Delta Sharing**: Distribute datasets to enterprise/research consumers without data copying
|
| 1401 |
+
- **Lakehouse Architecture**: Unified analytics on legislative, nonprofit, and civic data
|
| 1402 |
+
- **Collaborative Notebooks**: Reproducible research and analysis
|
| 1403 |
+
- **Scheduled Pipelines**: Automated data updates and quality checks
|
| 1404 |
+
|
| 1405 |
+
**Alternative Path (Hybrid Approach):**
|
| 1406 |
+
- **HuggingFace Hub**: Continue using for open-source community distribution (free)
|
| 1407 |
+
- **Databricks Marketplace**: Add enterprise/research distribution channel (if approved)
|
| 1408 |
+
- **Data stays in one place**: External tables in Unity Catalog point to existing Parquet files
|
| 1409 |
+
- **No data duplication**: Delta Sharing streams data from your storage on-demand
|
| 1410 |
+
|
| 1411 |
+
**BibTeX:**
|
| 1412 |
+
```bibtex
|
| 1413 |
+
@misc{databricks_for_good,
|
| 1414 |
+
title = {Databricks for Good Program},
|
| 1415 |
+
author = {{Databricks, Inc.}},
|
| 1416 |
+
year = {2024},
|
| 1417 |
+
url = {https://www.databricks.com/product/databricks-for-good},
|
| 1418 |
+
note = {Cloud data platform credits and support for nonprofits, academic institutions, and social impact organizations}
|
| 1419 |
+
}
|
| 1420 |
+
```
|
| 1421 |
+
|
| 1422 |
+
**Application Process:**
|
| 1423 |
+
1. Visit https://www.databricks.com/product/databricks-for-good
|
| 1424 |
+
2. Submit organization details and project description
|
| 1425 |
+
3. Describe social impact and data use case
|
| 1426 |
+
4. Provide 501(c)(3) documentation (for nonprofits) or academic affiliation
|
| 1427 |
+
5. Review process typically takes 2-4 weeks
|
| 1428 |
+
6. Upon approval, receive credits and onboarding support
|
| 1429 |
+
|
| 1430 |
+
**Compliance:**
|
| 1431 |
+
Credits must be used for the approved social impact project and cannot be resold or transferred. Annual renewal required with impact reporting.
|
| 1432 |
+
|
| 1433 |
+
---
|
| 1434 |
+
|
| 1435 |
+
## 🙏 **Acknowledgments**
|
| 1436 |
+
|
| 1437 |
+
We are grateful to the authors of MeetingBank for making their dataset publicly available for research purposes. Their work on meeting summarization has been instrumental in developing civic engagement tools.
|
| 1438 |
+
|
| 1439 |
+
Special thanks to:
|
| 1440 |
+
- The Association for Computational Linguistics (ACL)
|
| 1441 |
+
- HuggingFace for hosting datasets
|
| 1442 |
+
- Open States for legislative data
|
| 1443 |
+
- All municipal governments providing open access to meeting records
|
| 1444 |
+
|
| 1445 |
+
---
|
| 1446 |
+
|
| 1447 |
+
## 📖 **How to Cite This Project**
|
| 1448 |
+
|
| 1449 |
+
If you use Open Navigator in your research, please cite:
|
| 1450 |
+
|
| 1451 |
+
```
|
| 1452 |
+
Open Navigator
|
| 1453 |
+
GitHub: https://github.com/getcommunityone/open-navigator-for-engagement
|
| 1454 |
+
License: MIT
|
| 1455 |
+
```
|
| 1456 |
+
|
| 1457 |
+
**BibTeX:**
|
| 1458 |
+
```bibtex
|
| 1459 |
+
@software{open-navigator-2026,
|
| 1460 |
+
title = {Open Navigator},
|
| 1461 |
+
author = {{Community One}},
|
| 1462 |
+
year = {2026},
|
| 1463 |
+
url = {https://github.com/getcommunityone/open-navigator-for-engagement},
|
| 1464 |
+
license = {MIT}
|
| 1465 |
+
}
|
| 1466 |
+
```
|
| 1467 |
+
|
| 1468 |
+
---
|
| 1469 |
+
|
| 1470 |
+
## 📝 **License Compliance**
|
| 1471 |
+
|
| 1472 |
+
This project respects all dataset licenses and terms of use. See [LICENSE](LICENSE) for this project's MIT license.
|
| 1473 |
+
|
| 1474 |
+
For dataset-specific licenses, please refer to the original sources listed above.
|
CONTRIBUTING.md
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Contributing to Oral Health Policy Pulse
|
| 2 |
+
|
| 3 |
+
Thank you for your interest in contributing to the Oral Health Policy Pulse project!
|
| 4 |
+
|
| 5 |
+
## How to Contribute
|
| 6 |
+
|
| 7 |
+
### Reporting Bugs
|
| 8 |
+
|
| 9 |
+
If you find a bug, please open an issue with:
|
| 10 |
+
- A clear description of the problem
|
| 11 |
+
- Steps to reproduce
|
| 12 |
+
- Expected vs actual behavior
|
| 13 |
+
- Your environment (OS, Python version, etc.)
|
| 14 |
+
|
| 15 |
+
### Suggesting Features
|
| 16 |
+
|
| 17 |
+
Feature requests are welcome! Please:
|
| 18 |
+
- Check if the feature has already been requested
|
| 19 |
+
- Clearly describe the feature and its use case
|
| 20 |
+
- Explain how it would benefit advocacy groups
|
| 21 |
+
|
| 22 |
+
### Code Contributions
|
| 23 |
+
|
| 24 |
+
1. **Fork the repository**
|
| 25 |
+
2. **Create a feature branch**
|
| 26 |
+
```bash
|
| 27 |
+
git checkout -b feature/your-feature-name
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
3. **Make your changes**
|
| 31 |
+
- Follow the existing code style
|
| 32 |
+
- Add tests for new functionality
|
| 33 |
+
- Update documentation as needed
|
| 34 |
+
|
| 35 |
+
4. **Run tests**
|
| 36 |
+
```bash
|
| 37 |
+
pytest
|
| 38 |
+
black .
|
| 39 |
+
ruff check .
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
5. **Commit your changes**
|
| 43 |
+
```bash
|
| 44 |
+
git commit -m "Add feature: description"
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
6. **Push and create a pull request**
|
| 48 |
+
```bash
|
| 49 |
+
git push origin feature/your-feature-name
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
## Code Style
|
| 53 |
+
|
| 54 |
+
- Follow PEP 8 guidelines
|
| 55 |
+
- Use type hints
|
| 56 |
+
- Write docstrings for all public functions
|
| 57 |
+
- Keep functions focused and single-purpose
|
| 58 |
+
- Use meaningful variable names
|
| 59 |
+
|
| 60 |
+
## Code of Conduct
|
| 61 |
+
|
| 62 |
+
This project values respectful, inclusive collaboration. We align with the principles of:
|
| 63 |
+
- **Open States Code of Conduct**: https://docs.openstates.org/code-of-conduct/
|
| 64 |
+
- Be respectful and professional
|
| 65 |
+
- Welcome diverse perspectives
|
| 66 |
+
- Focus on what's best for the community
|
| 67 |
+
- Show empathy towards other contributors
|
| 68 |
+
|
| 69 |
+
## Contributing to Upstream Projects
|
| 70 |
+
|
| 71 |
+
We use data and patterns from several open source civic tech projects. When contributing scraper patterns or improvements back to upstream projects like **OpenStates**, please:
|
| 72 |
+
|
| 73 |
+
1. **Follow their standards**: https://github.com/openstates/openstates-scrapers
|
| 74 |
+
2. **Reference their documentation**: https://docs.openstates.org/contributing/local-database/
|
| 75 |
+
3. **Respect their Code of Conduct**: https://docs.openstates.org/code-of-conduct/
|
| 76 |
+
4. **Test locally** before submitting pull requests
|
| 77 |
+
5. **Document data sources** used in scraper development
|
| 78 |
+
|
| 79 |
+
## Testing
|
| 80 |
+
|
| 81 |
+
All new features should include tests. Run the test suite with:
|
| 82 |
+
|
| 83 |
+
```bash
|
| 84 |
+
pytest tests/ -v
|
| 85 |
+
```
|
| 86 |
+
|
| 87 |
+
## Documentation
|
| 88 |
+
|
| 89 |
+
Update relevant documentation when:
|
| 90 |
+
- Adding new features
|
| 91 |
+
- Changing API endpoints
|
| 92 |
+
- Modifying configuration options
|
| 93 |
+
- Adding new dependencies
|
| 94 |
+
|
| 95 |
+
## Questions?
|
| 96 |
+
|
| 97 |
+
Open an issue or reach out to the maintainers.
|
| 98 |
+
|
| 99 |
+
Thank you for helping improve oral health advocacy! 🦷
|
Dockerfile
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Multi-stage build for Hugging Face Spaces
|
| 2 |
+
# Runs all three apps: Docusaurus docs, React frontend, FastAPI backend
|
| 3 |
+
|
| 4 |
+
FROM node:20-slim AS docs-builder
|
| 5 |
+
WORKDIR /build
|
| 6 |
+
|
| 7 |
+
# Set baseUrl to /docs/ for HuggingFace deployment; docs are served at the nginx /docs/ location
|
| 8 |
+
# routeBasePath: '/' in docusaurus.config.ts prevents /docs/docs/ nesting
|
| 9 |
+
ENV DOCUSAURUS_BASE_URL=/docs/
|
| 10 |
+
|
| 11 |
+
COPY website/package*.json ./
|
| 12 |
+
RUN npm config set fetch-retry-mintimeout 20000 && \
|
| 13 |
+
npm config set fetch-retry-maxtimeout 120000 && \
|
| 14 |
+
npm ci --prefer-offline --no-audit || npm install --prefer-offline --no-audit
|
| 15 |
+
|
| 16 |
+
# Add cache-busting argument to force rebuild when needed
|
| 17 |
+
ARG CACHE_BUST=2026-04-27-12-00-fix-double-docs-prefix
|
| 18 |
+
|
| 19 |
+
COPY website/ ./
|
| 20 |
+
|
| 21 |
+
# Verify environment variable is set and build
|
| 22 |
+
RUN echo "Building Docusaurus with DOCUSAURUS_BASE_URL=$DOCUSAURUS_BASE_URL" && \
|
| 23 |
+
echo "Cache bust: 2026-04-27-12-00-fix-double-docs-prefix" && \
|
| 24 |
+
npm run build && \
|
| 25 |
+
echo "Verifying baseUrl in build output..." && \
|
| 26 |
+
grep -r "baseUrl" build/ | head -5 || true
|
| 27 |
+
|
| 28 |
+
FROM python:3.11-slim
|
| 29 |
+
|
| 30 |
+
# Install system dependencies, nginx, and Node.js for frontend build
|
| 31 |
+
RUN apt-get update && apt-get install -y \
|
| 32 |
+
build-essential \
|
| 33 |
+
curl \
|
| 34 |
+
git \
|
| 35 |
+
tesseract-ocr \
|
| 36 |
+
nginx \
|
| 37 |
+
supervisor \
|
| 38 |
+
&& curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
|
| 39 |
+
&& apt-get install -y nodejs \
|
| 40 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 41 |
+
|
| 42 |
+
WORKDIR /app
|
| 43 |
+
|
| 44 |
+
# Copy Python requirements and install
|
| 45 |
+
COPY requirements.txt .
|
| 46 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 47 |
+
|
| 48 |
+
# OPTIMIZATION: Copy frontend package files first for better caching
|
| 49 |
+
COPY frontend/package*.json /app/frontend/
|
| 50 |
+
RUN cd /app/frontend && npm ci
|
| 51 |
+
|
| 52 |
+
# Copy application code (now npm ci layer is cached)
|
| 53 |
+
COPY . .
|
| 54 |
+
|
| 55 |
+
# Copy built static files from docs stage
|
| 56 |
+
COPY --from=docs-builder /build/build /app/static/docs
|
| 57 |
+
|
| 58 |
+
# Build frontend (node_modules already cached from the npm ci layer above)
|
| 59 |
+
# Set production environment variables for Vite
|
| 60 |
+
ENV VITE_CANONICAL_DOMAIN=www.communityone.com
|
| 61 |
+
ENV VITE_API_URL=/api
|
| 62 |
+
# Cache bust: 2026-04-29-remove-axios
|
| 63 |
+
ARG CACHE_BUST_FRONTEND=2026-04-29-remove-axios
|
| 64 |
+
RUN cd /app/frontend && echo "Frontend build cache bust: $CACHE_BUST_FRONTEND" && npm run build
|
| 65 |
+
|
| 66 |
+
# Frontend is already built to /app/api/static/ via vite.config.ts
|
| 67 |
+
# Create frontend directory in /app/static for nginx
|
| 68 |
+
RUN mkdir -p /app/static/frontend && \
|
| 69 |
+
ls -la /app/api/static/ && \
|
| 70 |
+
cp -r /app/api/static/* /app/static/frontend/
|
| 71 |
+
|
| 72 |
+
# Create necessary directories
|
| 73 |
+
RUN mkdir -p /app/logs /app/data /var/log/supervisor
|
| 74 |
+
|
| 75 |
+
# Copy Hugging Face specific configs
|
| 76 |
+
COPY .huggingface/nginx.conf /etc/nginx/nginx.conf
|
| 77 |
+
COPY .huggingface/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
|
| 78 |
+
COPY .huggingface/start.sh /app/start.sh
|
| 79 |
+
RUN chmod +x /app/start.sh
|
| 80 |
+
|
| 81 |
+
# Expose port 7860 (Hugging Face Spaces default)
|
| 82 |
+
EXPOSE 7860
|
| 83 |
+
|
| 84 |
+
# Set environment variables
|
| 85 |
+
ENV PYTHONUNBUFFERED=1
|
| 86 |
+
ENV LOG_LEVEL=INFO
|
| 87 |
+
ENV HF_SPACES=1
|
| 88 |
+
|
| 89 |
+
# Use supervisor to run all services
|
| 90 |
+
CMD ["/app/start.sh"]
|
INTEL_ARC_QUICKSTART.md
ADDED
|
@@ -0,0 +1,281 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🚀 Intel Arc + DuckDB Quick Reference
|
| 2 |
+
|
| 3 |
+
**Get started with local AI legislative analysis in 5 minutes**
|
| 4 |
+
|
| 5 |
+
## ⚡ Performance at a Glance
|
| 6 |
+
|
| 7 |
+
| Task | Standard (Postgres + CPU) | Optimized (DuckDB + Arc GPU) | Speedup |
|
| 8 |
+
|------|--------------------------|------------------------------|---------|
|
| 9 |
+
| Context injection (100 bills) | 500ms | 20ms | **25x** |
|
| 10 |
+
| Vector search (10K records) | 800ms | 18ms | **44x** |
|
| 11 |
+
| LLM inference (3B model) | 350 tok/s | 1,200 tok/s | **3.4x** |
|
| 12 |
+
| Full testimony analysis | 2,000ms | 80ms | **25x** |
|
| 13 |
+
|
| 14 |
+
## 🎯 Three-Step Setup
|
| 15 |
+
|
| 16 |
+
### 1. Install (5 minutes)
|
| 17 |
+
|
| 18 |
+
```bash
|
| 19 |
+
cd /path/to/open-navigator
|
| 20 |
+
./scripts/intel_llm_setup.sh
|
| 21 |
+
source .venv-intel/bin/activate
|
| 22 |
+
```
|
| 23 |
+
|
| 24 |
+
### 2. Test DuckDB VSS (30 seconds)
|
| 25 |
+
|
| 26 |
+
```bash
|
| 27 |
+
python scripts/duckdb_vss_demo.py
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
Expected output:
|
| 31 |
+
```
|
| 32 |
+
📊 Creating demo DuckDB database with VSS...
|
| 33 |
+
✅ Demo database created!
|
| 34 |
+
📈 Results (searching 1,000 bills):
|
| 35 |
+
Average: 18.45ms
|
| 36 |
+
🎯 Top 3 most similar bills: ...
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
### 3. Run Analysis (1 minute)
|
| 40 |
+
|
| 41 |
+
```bash
|
| 42 |
+
python scripts/legislative_analysis_intel.py
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
## 🧠 Code Examples
|
| 46 |
+
|
| 47 |
+
### Example 1: Fast Bill Search
|
| 48 |
+
|
| 49 |
+
```python
|
| 50 |
+
from scripts.legislative_analysis_intel import DuckDBLegislativeAnalyzer
|
| 51 |
+
|
| 52 |
+
with DuckDBLegislativeAnalyzer() as analyzer:
|
| 53 |
+
# Get bill context in < 50ms
|
| 54 |
+
bill = analyzer.get_bill_context("HB1234")
|
| 55 |
+
testimony = analyzer.get_all_testimony_for_bill("HB1234")
|
| 56 |
+
|
| 57 |
+
print(f"Bill: {bill['title']}")
|
| 58 |
+
print(f"Testimony records: {len(testimony)}")
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
### Example 2: Vector Similarity Search
|
| 62 |
+
|
| 63 |
+
```python
|
| 64 |
+
import numpy as np
|
| 65 |
+
|
| 66 |
+
# Your query embedding (384 dimensions from sentence-transformers)
|
| 67 |
+
query_embedding = model.encode("water fluoridation policy")
|
| 68 |
+
|
| 69 |
+
# Fast vector search (< 20ms for 10K bills)
|
| 70 |
+
similar_bills = analyzer.search_similar_testimony(
|
| 71 |
+
query_embedding.tolist(),
|
| 72 |
+
limit=10
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
for bill in similar_bills:
|
| 76 |
+
print(f"{bill['bill_id']}: {bill['text'][:100]}... (similarity: {bill['similarity']:.2f})")
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
### Example 3: Extract Interest Groups
|
| 80 |
+
|
| 81 |
+
```python
|
| 82 |
+
from scripts.legislative_analysis_intel import IntelOptimizedLLM, InterestGroup
|
| 83 |
+
|
| 84 |
+
# Initialize Intel-optimized LLM (uses Arc GPU)
|
| 85 |
+
llm = IntelOptimizedLLM(model_name="meta-llama/Llama-3.2-3B-Instruct")
|
| 86 |
+
llm.load_model(use_openvino=True) # OpenVINO = best Arc GPU performance
|
| 87 |
+
|
| 88 |
+
# Extract structured data
|
| 89 |
+
groups = llm.extract_interest_groups(bill_context, testimony)
|
| 90 |
+
|
| 91 |
+
# Results
|
| 92 |
+
for group in groups:
|
| 93 |
+
print(f"""
|
| 94 |
+
Group: {group.group_name}
|
| 95 |
+
Lobbyist: {group.lobbyist}
|
| 96 |
+
Stance: {group.stance} (score: {group.stance_score})
|
| 97 |
+
Tradeoffs: {group.tradeoff_notes}
|
| 98 |
+
Confidence: {group.confidence}
|
| 99 |
+
""")
|
| 100 |
+
```
|
| 101 |
+
|
| 102 |
+
### Example 4: Query Hugging Face Datasets Directly
|
| 103 |
+
|
| 104 |
+
```python
|
| 105 |
+
import duckdb
|
| 106 |
+
|
| 107 |
+
conn = duckdb.connect()
|
| 108 |
+
|
| 109 |
+
# No download needed - streams from HF!
|
| 110 |
+
df = conn.execute("""
|
| 111 |
+
SELECT *
|
| 112 |
+
FROM read_parquet(
|
| 113 |
+
'hf://datasets/CommunityOne/states-al-nonprofits-locations/data/train-*.parquet'
|
| 114 |
+
)
|
| 115 |
+
WHERE city = 'Birmingham'
|
| 116 |
+
LIMIT 100
|
| 117 |
+
""").fetchdf()
|
| 118 |
+
|
| 119 |
+
print(f"Found {len(df)} organizations in Birmingham, AL")
|
| 120 |
+
```
|
| 121 |
+
|
| 122 |
+
## 🎨 Output Schema
|
| 123 |
+
|
| 124 |
+
**Interest Group Extraction:**
|
| 125 |
+
|
| 126 |
+
```json
|
| 127 |
+
{
|
| 128 |
+
"groups": [
|
| 129 |
+
{
|
| 130 |
+
"group_name": "Alabama Dental Association",
|
| 131 |
+
"lobbyist": "John Smith, DDS",
|
| 132 |
+
"stance": "conditional",
|
| 133 |
+
"stance_score": 0.6,
|
| 134 |
+
"tradeoff_notes": "Support if Section 4 amended to include rural exemption and phased implementation timeline",
|
| 135 |
+
"testimony_excerpt": "While we have concerns about Section 4's implementation timeline, we support the overall goals if rural communities receive proper resources...",
|
| 136 |
+
"bill_id": "HB1234",
|
| 137 |
+
"confidence": 0.85
|
| 138 |
+
},
|
| 139 |
+
{
|
| 140 |
+
"group_name": "Sierra Club Alabama Chapter",
|
| 141 |
+
"lobbyist": null,
|
| 142 |
+
"stance": "oppose",
|
| 143 |
+
"stance_score": -0.9,
|
| 144 |
+
"tradeoff_notes": null,
|
| 145 |
+
"testimony_excerpt": "We strongly oppose this bill due to environmental concerns...",
|
| 146 |
+
"bill_id": "HB1234",
|
| 147 |
+
"confidence": 0.92
|
| 148 |
+
}
|
| 149 |
+
]
|
| 150 |
+
}
|
| 151 |
+
```
|
| 152 |
+
|
| 153 |
+
## 🔧 Environment Variables
|
| 154 |
+
|
| 155 |
+
```bash
|
| 156 |
+
# Enable Level Zero Sysman so the runtime can query Intel GPU memory/telemetry
|
| 157 |
+
export ZES_ENABLE_SYSMAN=1
|
| 158 |
+
|
| 159 |
+
# Ollama GPU usage (if using Ollama)
|
| 160 |
+
export OLLAMA_NUM_GPU=999
|
| 161 |
+
|
| 162 |
+
# IPEX-LLM optimizations
|
| 163 |
+
export IPEX_LLM_NUM_GPU=1
|
| 164 |
+
export ONEAPI_DEVICE_SELECTOR=level_zero:0
|
| 165 |
+
```
|
| 166 |
+
|
| 167 |
+
## 💡 Best Practices
|
| 168 |
+
|
| 169 |
+
### 1. Cache Embeddings
|
| 170 |
+
|
| 171 |
+
**DON'T** recompute every time:
|
| 172 |
+
```python
|
| 173 |
+
# Slow - recomputes embeddings every run
|
| 174 |
+
for bill in bills:
|
| 175 |
+
embedding = model.encode(bill['text'])
|
| 176 |
+
analyze(embedding)
|
| 177 |
+
```
|
| 178 |
+
|
| 179 |
+
**DO** cache in DuckDB:
|
| 180 |
+
```python
|
| 181 |
+
# Fast - compute once, reuse forever
|
| 182 |
+
conn.execute("""
|
| 183 |
+
CREATE TABLE bill_embeddings AS
|
| 184 |
+
SELECT bill_id, embedding
|
| 185 |
+
FROM ... -- computed once
|
| 186 |
+
""")
|
| 187 |
+
|
| 188 |
+
# Then just query
|
| 189 |
+
similar = conn.execute("""
|
| 190 |
+
SELECT * FROM bill_embeddings
|
| 191 |
+
ORDER BY array_distance(embedding, ?::FLOAT[384])
|
| 192 |
+
LIMIT 10
|
| 193 |
+
""", [query]).fetchall()
|
| 194 |
+
```
|
| 195 |
+
|
| 196 |
+
### 2. Batch Processing
|
| 197 |
+
|
| 198 |
+
**DON'T** process one at a time:
|
| 199 |
+
```python
|
| 200 |
+
for bill_id in bill_ids: # Slow!
|
| 201 |
+
result = analyze_single_bill(bill_id)
|
| 202 |
+
```
|
| 203 |
+
|
| 204 |
+
**DO** batch efficiently:
|
| 205 |
+
```python
|
| 206 |
+
# Fast - processes bills in GPU-sized batches instead of one call per bill
|
| 207 |
+
results = llm.extract_interest_groups_batch(
|
| 208 |
+
bill_contexts=bills,
|
| 209 |
+
testimony_batches=all_testimony,
|
| 210 |
+
batch_size=32 # Fits in Arc GPU memory
|
| 211 |
+
)
|
| 212 |
+
```
|
| 213 |
+
|
| 214 |
+
### 3. Monitor GPU Usage
|
| 215 |
+
|
| 216 |
+
```bash
|
| 217 |
+
# Linux: intel_gpu_top
|
| 218 |
+
sudo apt install intel-gpu-tools
|
| 219 |
+
intel_gpu_top
|
| 220 |
+
|
| 221 |
+
# Windows: Task Manager → Performance → GPU
|
| 222 |
+
# Look for "GPU 0 - Intel Arc Graphics"
|
| 223 |
+
```
|
| 224 |
+
|
| 225 |
+
## 🐛 Troubleshooting
|
| 226 |
+
|
| 227 |
+
### Issue: "ModuleNotFoundError: optimum"
|
| 228 |
+
|
| 229 |
+
```bash
|
| 230 |
+
pip install "optimum[openvino]"
|
| 231 |
+
```
|
| 232 |
+
|
| 233 |
+
### Issue: Slow inference (still using CPU)
|
| 234 |
+
|
| 235 |
+
Check device:
|
| 236 |
+
```python
|
| 237 |
+
import torch
|
| 238 |
+
print(f"Device: {torch.xpu.get_device_name(0)}")  # Should show Arc GPU (Intel GPUs use the XPU backend, not CUDA)
|
| 239 |
+
|
| 240 |
+
# Force GPU
|
| 241 |
+
model = OVModelForCausalLM.from_pretrained(
|
| 242 |
+
model_name,
|
| 243 |
+
device="GPU" # Explicitly set
|
| 244 |
+
)
|
| 245 |
+
```
|
| 246 |
+
|
| 247 |
+
### Issue: Out of memory
|
| 248 |
+
|
| 249 |
+
Use smaller model or reduce batch size:
|
| 250 |
+
```python
|
| 251 |
+
# Use 3B instead of 8B
|
| 252 |
+
model_name = "meta-llama/Llama-3.2-3B-Instruct"
|
| 253 |
+
|
| 254 |
+
# Reduce context
|
| 255 |
+
testimony = testimony[:10] # Top 10 only
|
| 256 |
+
```
|
| 257 |
+
|
| 258 |
+
## 📚 Resources
|
| 259 |
+
|
| 260 |
+
- **Full Guide**: [website/docs/guides/intel-arc-optimization.md](website/docs/guides/intel-arc-optimization.md)
|
| 261 |
+
- **DuckDB Docs**: https://duckdb.org/docs/
|
| 262 |
+
- **Intel IPEX**: https://github.com/intel/intel-extension-for-pytorch
|
| 263 |
+
- **OpenVINO**: https://docs.openvino.ai/
|
| 264 |
+
|
| 265 |
+
## 🎯 Next Steps
|
| 266 |
+
|
| 267 |
+
1. ✅ Run the demo: `python scripts/duckdb_vss_demo.py`
|
| 268 |
+
2. ✅ Test analysis: `python scripts/legislative_analysis_intel.py`
|
| 269 |
+
3. 📚 Read full guide: [Intel Arc Optimization Guide](website/docs/guides/intel-arc-optimization.md)
|
| 270 |
+
4. 🚀 Build your own: Use the `DuckDBLegislativeAnalyzer` class
|
| 271 |
+
5. 🤝 Share results: Open an issue with your findings!
|
| 272 |
+
|
| 273 |
+
## 💬 Questions?
|
| 274 |
+
|
| 275 |
+
- **GitHub Issues**: https://github.com/getcommunityone/open-navigator/issues
|
| 276 |
+
- **Documentation**: https://www.communityone.com/docs
|
| 277 |
+
- **Intel AI Forums**: https://community.intel.com/t5/Intel-AI-Analytics-and/bd-p/software-ai
|
| 278 |
+
|
| 279 |
+
---
|
| 280 |
+
|
| 281 |
+
**Built with ❤️ for Data Engineering Managers who want local, private, fast legislative intelligence.**
|
LICENSE
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Apache License
|
| 2 |
+
Version 2.0, January 2004
|
| 3 |
+
http://www.apache.org/licenses/
|
| 4 |
+
|
| 5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
| 6 |
+
|
| 7 |
+
1. Definitions.
|
| 8 |
+
|
| 9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
| 10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
| 11 |
+
|
| 12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
| 13 |
+
the copyright owner that is granting the License.
|
| 14 |
+
|
| 15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
| 16 |
+
other entities that control, are controlled by, or are under common
|
| 17 |
+
control with that entity. For the purposes of this definition,
|
| 18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
| 19 |
+
direction or management of such entity, whether by contract or
|
| 20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
| 21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
| 22 |
+
|
| 23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
| 24 |
+
exercising permissions granted by this License.
|
| 25 |
+
|
| 26 |
+
"Source" form shall mean the preferred form for making modifications,
|
| 27 |
+
including but not limited to software source code, documentation
|
| 28 |
+
source, and configuration files.
|
| 29 |
+
|
| 30 |
+
"Object" form shall mean any form resulting from mechanical
|
| 31 |
+
transformation or translation of a Source form, including but
|
| 32 |
+
not limited to compiled object code, generated documentation,
|
| 33 |
+
and conversions to other media types.
|
| 34 |
+
|
| 35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
| 36 |
+
Object form, made available under the License, as indicated by a
|
| 37 |
+
copyright notice that is included in or attached to the work
|
| 38 |
+
(an example is provided in the Appendix below).
|
| 39 |
+
|
| 40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
| 41 |
+
form, that is based on (or derived from) the Work and for which the
|
| 42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
| 43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
| 44 |
+
of this License, Derivative Works shall not include works that remain
|
| 45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
| 46 |
+
the Work and Derivative Works thereof.
|
| 47 |
+
|
| 48 |
+
"Contribution" shall mean any work of authorship, including
|
| 49 |
+
the original version of the Work and any modifications or additions
|
| 50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
| 51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
| 52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
| 53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
| 54 |
+
means any form of electronic, verbal, or written communication sent
|
| 55 |
+
to the Licensor or its representatives, including but not limited to
|
| 56 |
+
communication on electronic mailing lists, source code control systems,
|
| 57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
| 58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
| 59 |
+
excluding communication that is conspicuously marked or otherwise
|
| 60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
| 61 |
+
|
| 62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
| 63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
| 64 |
+
subsequently incorporated within the Work.
|
| 65 |
+
|
| 66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
| 67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
| 70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
| 71 |
+
Work and such Derivative Works in Source or Object form.
|
| 72 |
+
|
| 73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
| 74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 76 |
+
(except as stated in this section) patent license to make, have made,
|
| 77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
| 78 |
+
where such license applies only to those patent claims licensable
|
| 79 |
+
by such Contributor that are necessarily infringed by their
|
| 80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
| 81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
| 82 |
+
institute patent litigation against any entity (including a
|
| 83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
| 84 |
+
or a Contribution incorporated within the Work constitutes direct
|
| 85 |
+
or contributory patent infringement, then any patent licenses
|
| 86 |
+
granted to You under this License for that Work shall terminate
|
| 87 |
+
as of the date such litigation is filed.
|
| 88 |
+
|
| 89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
| 90 |
+
Work or Derivative Works thereof in any medium, with or without
|
| 91 |
+
modifications, and in Source or Object form, provided that You
|
| 92 |
+
meet the following conditions:
|
| 93 |
+
|
| 94 |
+
(a) You must give any other recipients of the Work or
|
| 95 |
+
Derivative Works a copy of this License; and
|
| 96 |
+
|
| 97 |
+
(b) You must cause any modified files to carry prominent notices
|
| 98 |
+
stating that You changed the files; and
|
| 99 |
+
|
| 100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
| 101 |
+
that You distribute, all copyright, patent, trademark, and
|
| 102 |
+
attribution notices from the Source form of the Work,
|
| 103 |
+
excluding those notices that do not pertain to any part of
|
| 104 |
+
the Derivative Works; and
|
| 105 |
+
|
| 106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
| 107 |
+
distribution, then any Derivative Works that You distribute must
|
| 108 |
+
include a readable copy of the attribution notices contained
|
| 109 |
+
within such NOTICE file, excluding those notices that do not
|
| 110 |
+
pertain to any part of the Derivative Works, in at least one
|
| 111 |
+
of the following places: within a NOTICE text file distributed
|
| 112 |
+
as part of the Derivative Works; within the Source form or
|
| 113 |
+
documentation, if provided along with the Derivative Works; or,
|
| 114 |
+
within a display generated by the Derivative Works, if and
|
| 115 |
+
wherever such third-party notices normally appear. The contents
|
| 116 |
+
of the NOTICE file are for informational purposes only and
|
| 117 |
+
do not modify the License. You may add Your own attribution
|
| 118 |
+
notices within Derivative Works that You distribute, alongside
|
| 119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
| 120 |
+
that such additional attribution notices cannot be construed
|
| 121 |
+
as modifying the License.
|
| 122 |
+
|
| 123 |
+
You may add Your own copyright statement to Your modifications and
|
| 124 |
+
may provide additional or different license terms and conditions
|
| 125 |
+
for use, reproduction, or distribution of Your modifications, or
|
| 126 |
+
for any such Derivative Works as a whole, provided Your use,
|
| 127 |
+
reproduction, and distribution of the Work otherwise complies with
|
| 128 |
+
the conditions stated in this License.
|
| 129 |
+
|
| 130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
| 131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
| 132 |
+
by You to the Licensor shall be under the terms and conditions of
|
| 133 |
+
this License, without any additional terms or conditions.
|
| 134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
| 135 |
+
the terms of any separate license agreement you may have executed
|
| 136 |
+
with Licensor regarding such Contributions.
|
| 137 |
+
|
| 138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
| 139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
| 140 |
+
except as required for reasonable and customary use in describing the
|
| 141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
| 142 |
+
|
| 143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
| 144 |
+
agreed to in writing, Licensor provides the Work (and each
|
| 145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
| 146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 147 |
+
implied, including, without limitation, any warranties or conditions
|
| 148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
| 149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
| 150 |
+
appropriateness of using or redistributing the Work and assume any
|
| 151 |
+
risks associated with Your exercise of permissions under this License.
|
| 152 |
+
|
| 153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
| 154 |
+
whether in tort (including negligence), contract, or otherwise,
|
| 155 |
+
unless required by applicable law (such as deliberate and grossly
|
| 156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
| 157 |
+
liable to You for damages, including any direct, indirect, special,
|
| 158 |
+
incidental, or consequential damages of any character arising as a
|
| 159 |
+
result of this License or out of the use or inability to use the
|
| 160 |
+
Work (including but not limited to damages for loss of goodwill,
|
| 161 |
+
work stoppage, computer failure or malfunction, or any and all
|
| 162 |
+
other commercial damages or losses), even if such Contributor
|
| 163 |
+
has been advised of the possibility of such damages.
|
| 164 |
+
|
| 165 |
+
9. Accepting Warranty or Additional Support. While redistributing
|
| 166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
| 167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
| 168 |
+
or other liability obligations and/or rights consistent with this
|
| 169 |
+
License. However, in accepting such obligations, You may act only
|
| 170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
| 171 |
+
of any other Contributor, and only if You agree to indemnify,
|
| 172 |
+
defend, and hold each Contributor harmless for any liability
|
| 173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
| 174 |
+
of your accepting any such warranty or additional liability.
|
| 175 |
+
|
| 176 |
+
END OF TERMS AND CONDITIONS
|
| 177 |
+
|
| 178 |
+
APPENDIX: How to apply the Apache License to your work.
|
| 179 |
+
|
| 180 |
+
To apply the Apache License to your work, attach the following
|
| 181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
| 182 |
+
replaced with your own identifying information. (Don't include
|
| 183 |
+
the brackets!) The text should be enclosed in the appropriate
|
| 184 |
+
comment syntax for the file format. We also recommend that a
|
| 185 |
+
file or class name and description of purpose be included on the
|
| 186 |
+
same "printed page" as the copyright notice for easier
|
| 187 |
+
identification within third-party archives.
|
| 188 |
+
|
| 189 |
+
Copyright 2026 Community One
|
| 190 |
+
|
| 191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
| 192 |
+
you may not use this file except in compliance with the License.
|
| 193 |
+
You may obtain a copy of the License at
|
| 194 |
+
|
| 195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 196 |
+
|
| 197 |
+
Unless required by applicable law or agreed to in writing, software
|
| 198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
| 199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 200 |
+
See the License for the specific language governing permissions and
|
| 201 |
+
limitations under the License.
|
Makefile
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.PHONY: help install install-frontend install-docs build-frontend build-docs clean test run dev dev-frontend dev-docs start-all stop-all dev-full docker-up docker-down deploy-databricks
|
| 2 |
+
|
| 3 |
+
help:
|
| 4 |
+
@echo "🦷 Open Navigator - Makefile Commands"
|
| 5 |
+
@echo "===================================================="
|
| 6 |
+
@echo ""
|
| 7 |
+
@echo "🚀 Quick Start:"
|
| 8 |
+
@echo " make start-all - Start ALL services (API + Dashboard + Docs) with tmux"
|
| 9 |
+
@echo " make stop-all - Stop all running services"
|
| 10 |
+
@echo ""
|
| 11 |
+
@echo "🐍 Python Backend:"
|
| 12 |
+
@echo " make install - Install Python dependencies in venv"
|
| 13 |
+
@echo " make dev - Start backend with auto-reload"
|
| 14 |
+
@echo " make run - Start backend (production)"
|
| 15 |
+
@echo ""
|
| 16 |
+
@echo "⚛️ React Dashboard:"
|
| 17 |
+
@echo " make install-frontend - Install dashboard npm dependencies"
|
| 18 |
+
@echo " make build-frontend - Build React dashboard for production"
|
| 19 |
+
@echo " make dev-frontend - Start dashboard dev server"
|
| 20 |
+
@echo ""
|
| 21 |
+
@echo "📚 Documentation Site:"
|
| 22 |
+
@echo " make install-docs - Install Docusaurus dependencies"
|
| 23 |
+
@echo " make build-docs - Build documentation for production"
|
| 24 |
+
@echo " make dev-docs - Start documentation dev server"
|
| 25 |
+
@echo ""
|
| 26 |
+
@echo "☁️ Deployment:"
|
| 27 |
+
@echo " make deploy-databricks - Deploy to Databricks Apps"
|
| 28 |
+
@echo ""
|
| 29 |
+
@echo "🐳 Docker:"
|
| 30 |
+
@echo " make docker-up - Start Docker containers"
|
| 31 |
+
@echo " make docker-down - Stop Docker containers"
|
| 32 |
+
@echo ""
|
| 33 |
+
@echo "🧪 Testing:"
|
| 34 |
+
@echo " make test - Run test suite"
|
| 35 |
+
@echo " make clean - Remove build artifacts"
|
| 36 |
+
@echo ""
|
| 37 |
+
|
| 38 |
+
install:
|
| 39 |
+
@echo "📦 Creating virtual environment and installing dependencies..."
|
| 40 |
+
@chmod +x install.sh
|
| 41 |
+
@./install.sh
|
| 42 |
+
|
| 43 |
+
install-frontend:
|
| 44 |
+
@echo "📦 Installing dashboard dependencies..."
|
| 45 |
+
@cd frontend && npm install
|
| 46 |
+
@echo "✅ Dashboard dependencies installed!"
|
| 47 |
+
|
| 48 |
+
install-docs:
|
| 49 |
+
@echo "📦 Installing documentation dependencies..."
|
| 50 |
+
@cd website && npm install
|
| 51 |
+
@echo "✅ Documentation dependencies installed!"
|
| 52 |
+
|
| 53 |
+
build-frontend:
|
| 54 |
+
@echo "🔨 Building React dashboard..."
|
| 55 |
+
@cd frontend && npm run build
|
| 56 |
+
@echo "✅ Dashboard built to api/static/"
|
| 57 |
+
|
| 58 |
+
build-docs:
|
| 59 |
+
@echo "🔨 Building documentation site..."
|
| 60 |
+
@cd website && npm run build
|
| 61 |
+
@echo "✅ Documentation built to website/build/"
|
| 62 |
+
|
| 63 |
+
clean:
|
| 64 |
+
@echo "🧹 Cleaning up..."
|
| 65 |
+
@rm -rf .venv venv
|
| 66 |
+
@rm -rf frontend/node_modules frontend/dist
|
| 67 |
+
@rm -rf website/node_modules website/build website/.docusaurus
|
| 68 |
+
@rm -rf api/static
|
| 69 |
+
@rm -rf __pycache__
|
| 70 |
+
@find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
|
| 71 |
+
@find . -type f -name "*.pyc" -delete
|
| 72 |
+
@find . -type f -name "*.pyo" -delete
|
| 73 |
+
@rm -rf .pytest_cache
|
| 74 |
+
@rm -rf .coverage
|
| 75 |
+
@rm -rf htmlcov
|
| 76 |
+
@rm -rf dist
|
| 77 |
+
@rm -rf build
|
| 78 |
+
@rm -rf *.egg-info
|
| 79 |
+
@rm -rf logs/*.pid logs/*.log
|
| 80 |
+
@echo "✅ Cleanup complete"
|
| 81 |
+
|
| 82 |
+
test:
|
| 83 |
+
@echo "🧪 Running tests..."
|
| 84 |
+
@. venv/bin/activate && pytest tests/ -v
|
| 85 |
+
|
| 86 |
+
run: build-frontend
|
| 87 |
+
@echo "🚀 Starting application (production mode)..."
|
| 88 |
+
@. venv/bin/activate && uvicorn api.app:app --host 0.0.0.0 --port 8000
|
| 89 |
+
|
| 90 |
+
dev:
|
| 91 |
+
@echo "🔧 Starting backend with auto-reload..."
|
| 92 |
+
@echo "📡 Backend running at http://localhost:8000"
|
| 93 |
+
@. venv/bin/activate && uvicorn api.app:app --reload
|
| 94 |
+
|
| 95 |
+
dev-frontend:
|
| 96 |
+
@echo "⚛️ Starting dashboard dev server..."
|
| 97 |
+
@echo "📡 Dashboard running at http://localhost:5173"
|
| 98 |
+
@cd frontend && npm run dev
|
| 99 |
+
|
| 100 |
+
dev-docs:
|
| 101 |
+
@echo "📚 Starting documentation dev server..."
|
| 102 |
+
@echo "📡 Documentation running at http://localhost:3000"
|
| 103 |
+
@cd website && npm start
|
| 104 |
+
|
| 105 |
+
start-all:
|
| 106 |
+
@echo "🚀 Starting all services with tmux..."
|
| 107 |
+
@chmod +x start-all.sh
|
| 108 |
+
@./start-all.sh
|
| 109 |
+
|
| 110 |
+
stop-all:
|
| 111 |
+
@echo "🛑 Stopping all services..."
|
| 112 |
+
@chmod +x stop-all.sh
|
| 113 |
+
@./stop-all.sh
|
| 114 |
+
|
| 115 |
+
dev-full:
|
| 116 |
+
@echo "🚀 Use 'make start-all' for better experience with tmux!"
|
| 117 |
+
@echo ""
|
| 118 |
+
@echo "Starting backend and frontend (manual)..."
|
| 119 |
+
@echo "📡 Backend: http://localhost:8000"
|
| 120 |
+
@echo "📡 Dashboard: http://localhost:5173"
|
| 121 |
+
@echo "📡 Docs: http://localhost:3000 (run 'make dev-docs' in another terminal)"
|
| 122 |
+
@echo ""
|
| 123 |
+
@. venv/bin/activate && uvicorn api.app:app --reload & \
|
| 124 |
+
cd frontend && npm run dev
|
| 125 |
+
|
| 126 |
+
deploy-databricks:
|
| 127 |
+
@echo "☁️ Deploying to Databricks Apps..."
|
| 128 |
+
@chmod +x scripts/deploy-databricks-app.sh
|
| 129 |
+
@./scripts/deploy-databricks-app.sh
|
| 130 |
+
|
| 131 |
+
docker-up:
|
| 132 |
+
@echo "Starting Docker containers..."
|
| 133 |
+
@docker-compose up -d
|
| 134 |
+
@echo "✓ Containers started"
|
| 135 |
+
@echo " API: http://localhost:8000"
|
| 136 |
+
@echo " Docs: http://localhost:8000/docs"
|
| 137 |
+
|
| 138 |
+
docker-down:
|
| 139 |
+
@echo "Stopping Docker containers..."
|
| 140 |
+
@docker-compose down
|
| 141 |
+
@echo "✓ Containers stopped"
|
| 142 |
+
|
| 143 |
+
example:
|
| 144 |
+
@echo "Running example workflow..."
|
| 145 |
+
@. venv/bin/activate && python examples/example_workflow.py
|
| 146 |
+
|
| 147 |
+
heatmap:
|
| 148 |
+
@echo "Generating example heatmap..."
|
| 149 |
+
@. venv/bin/activate && python main.py generate-heatmap --output example_heatmap.html
|
| 150 |
+
@echo "✓ Heatmap saved to example_heatmap.html"
|
| 151 |
+
|
| 152 |
+
init:
|
| 153 |
+
@echo "Initializing system..."
|
| 154 |
+
@. venv/bin/activate && python main.py init
|
| 155 |
+
|
| 156 |
+
status:
|
| 157 |
+
@echo "Checking system status..."
|
| 158 |
+
@. venv/bin/activate && python main.py status
|
| 159 |
+
|
| 160 |
+
format:
|
| 161 |
+
@echo "Formatting code..."
|
| 162 |
+
@. venv/bin/activate && black .
|
| 163 |
+
@. venv/bin/activate && ruff check . --fix
|
| 164 |
+
@echo "✓ Code formatted"
|
| 165 |
+
|
| 166 |
+
lint:
|
| 167 |
+
@echo "Linting code..."
|
| 168 |
+
@. venv/bin/activate && ruff check .
|
| 169 |
+
@. venv/bin/activate && mypy agents/ pipeline/ visualization/ api/
|
README.md
ADDED
|
@@ -0,0 +1,534 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Open Navigator
|
| 3 |
+
emoji: 🏛️
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
pinned: false
|
| 9 |
+
license: apache-2.0
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# 🏛️ Open Navigator
|
| 13 |
+
|
| 14 |
+
> **CommunityOne: The open path to everything local**
|
| 15 |
+
>
|
| 16 |
+
> AI-powered civic engagement platform with React + FastAPI web interface
|
| 17 |
+
|
| 18 |
+
[License: Apache 2.0](https://opensource.org/licenses/Apache-2.0)
|
| 19 |
+
[Python 3.11+](https://www.python.org/downloads/)
|
| 20 |
+
[React](https://reactjs.org)
|
| 21 |
+
[FastAPI](https://fastapi.tiangolo.com)
|
| 22 |
+
|
| 23 |
+
## 🔗 Quick Links
|
| 24 |
+
|
| 25 |
+
**[⚛️ Open Navigator →](https://www.communityone.com)** - **LIVE APPLICATION** (search, filters, heatmap, data exploration)
|
| 26 |
+
|
| 27 |
+
**[📖 Documentation →](https://www.communityone.com/docs)** - Complete guides, architecture, and feature details
|
| 28 |
+
|
| 29 |
+
The documentation site includes:
|
| 30 |
+
- Features and capabilities
|
| 31 |
+
- Data sources and integrations
|
| 32 |
+
- Architecture and deployment options
|
| 33 |
+
- Policy topics and advocacy tools
|
| 34 |
+
- API reference and examples
|
| 35 |
+
|
| 36 |
+
---
|
| 37 |
+
|
| 38 |
+
## Quick Start
|
| 39 |
+
|
| 40 |
+
### Three Services
|
| 41 |
+
|
| 42 |
+
This project runs three separate services:
|
| 43 |
+
|
| 44 |
+
| Service | Port (Local) | Live URL | Description |
|
| 45 |
+
|---------|------|----------|-------------|
|
| 46 |
+
| **⚛️ Open Navigator** 🚀 | 5173 | [www.communityone.com](https://www.communityone.com) | **MAIN APPLICATION** - Search, filters, heatmap, data exploration |
|
| 47 |
+
| **📚 Documentation** | 3000 | [www.communityone.com/docs](https://www.communityone.com/docs) | Docusaurus site with complete guides and tutorials |
|
| 48 |
+
| **🔥 API Backend** | 8000 | [www.communityone.com/api](https://www.communityone.com/api) | FastAPI server with AI agents |
|
| 49 |
+
|
| 50 |
+
> **💡 LIVE DEMO:** Visit **[www.communityone.com](https://www.communityone.com)** to use the application!
|
| 51 |
+
>
|
| 52 |
+
> **💻 LOCAL DEV:** After running `./start-all.sh`, visit **http://localhost:5173**
|
| 53 |
+
|
| 54 |
+
## 🚀 Deployment
|
| 55 |
+
|
| 56 |
+
**Deploy to Hugging Face Spaces** in 3 steps:
|
| 57 |
+
|
| 58 |
+
```bash
|
| 59 |
+
echo "HF_USERNAME=your_username" >> .env
|
| 60 |
+
./deploy-huggingface.sh
|
| 61 |
+
# Configure hardware and secrets at https://huggingface.co/spaces/YOUR_USERNAME/www.communityone.com
|
| 62 |
+
```
|
| 63 |
+
|
| 64 |
+
**Full deployment guides:**
|
| 65 |
+
- **[Hugging Face Spaces](website/docs/deployment/huggingface-spaces.md)** - Docker deployment (~$22/month)
|
| 66 |
+
- **[Databricks Apps](website/docs/deployment/databricks-apps.md)** - Enterprise deployment
|
| 67 |
+
- **[Local Development](website/docs/deployment/)** - Complete deployment documentation
|
| 68 |
+
|
| 69 |
+
The `deploy-huggingface.sh` script automatically:
|
| 70 |
+
- ✅ Tests builds locally (catches errors before pushing)
|
| 71 |
+
- ✅ Creates the Space on Hugging Face
|
| 72 |
+
- ✅ Pushes code and triggers automatic build (~10-15 min)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
### Prerequisites
|
| 76 |
+
|
| 77 |
+
- Python 3.11+
|
| 78 |
+
- Node.js 18+
|
| 79 |
+
- Docker (optional)
|
| 80 |
+
- OpenAI API key
|
| 81 |
+
|
| 82 |
+
### Installation
|
| 83 |
+
|
| 84 |
+
**Option 1: Start Everything at Once (Recommended)**
|
| 85 |
+
|
| 86 |
+
```bash
|
| 87 |
+
# Clone repository
|
| 88 |
+
git clone https://github.com/getcommunityone/open-navigator.git
|
| 89 |
+
cd open-navigator
|
| 90 |
+
|
| 91 |
+
# Install dependencies
|
| 92 |
+
./install.sh # Python backend
|
| 93 |
+
cd frontend && npm install && cd .. # React app
|
| 94 |
+
cd website && npm install && cd .. # Documentation
|
| 95 |
+
|
| 96 |
+
# Setup git hooks for build protection (one-time)
|
| 97 |
+
./setup-git-hooks.sh
|
| 98 |
+
|
| 99 |
+
# Start all services in tmux
|
| 100 |
+
./start-all.sh
|
| 101 |
+
```
|
| 102 |
+
|
| 103 |
+
**Option 2: Using Makefile**
|
| 104 |
+
|
| 105 |
+
```bash
|
| 106 |
+
# Install
|
| 107 |
+
make install
|
| 108 |
+
make install-frontend
|
| 109 |
+
make install-docs
|
| 110 |
+
|
| 111 |
+
# Start all services
|
| 112 |
+
make start-all
|
| 113 |
+
|
| 114 |
+
# Or individually:
|
| 115 |
+
make dev # API only
|
| 116 |
+
make dev-frontend # React app only
|
| 117 |
+
make dev-docs # Docs only
|
| 118 |
+
```
|
| 119 |
+
|
| 120 |
+
**Option 3: Manual Setup**
|
| 121 |
+
|
| 122 |
+
```bash
|
| 123 |
+
# Python backend
|
| 124 |
+
python3 -m venv .venv
|
| 125 |
+
source .venv/bin/activate
|
| 126 |
+
pip install -r requirements.txt
|
| 127 |
+
|
| 128 |
+
# React app
|
| 129 |
+
cd frontend && npm install && cd ..
|
| 130 |
+
|
| 131 |
+
# Documentation
|
| 132 |
+
cd website && npm install && cd ..
|
| 133 |
+
|
| 134 |
+
# Configure environment
|
| 135 |
+
cp .env.example .env
|
| 136 |
+
# Edit .env with your API keys
|
| 137 |
+
|
| 138 |
+
# Start services (separate terminals)
|
| 139 |
+
source .venv/bin/activate && python main.py serve # Terminal 1
|
| 140 |
+
cd frontend && npm run dev # Terminal 2
|
| 141 |
+
cd website && npm start # Terminal 3
|
| 142 |
+
```
|
| 143 |
+
|
| 144 |
+
### Access Points
|
| 145 |
+
|
| 146 |
+
**🌐 LIVE APPLICATION:**
|
| 147 |
+
- **🚀 Open Navigator:** https://www.communityone.com - Main application
|
| 148 |
+
- 📚 **Documentation:** https://www.communityone.com/docs - Guides and API reference
|
| 149 |
+
- 🔥 **API Docs:** https://www.communityone.com/api/docs - FastAPI interactive documentation
|
| 150 |
+
|
| 151 |
+
**💻 LOCAL DEVELOPMENT:**
|
| 152 |
+
- **🚀 Main App:** http://localhost:5173
|
| 153 |
+
- 📚 **Documentation:** http://localhost:3000
|
| 154 |
+
- 🔥 **API Docs:** http://localhost:8000/docs
|
| 155 |
+
|
| 156 |
+
### Stop Services
|
| 157 |
+
|
| 158 |
+
```bash
|
| 159 |
+
./stop-all.sh
|
| 160 |
+
# or
|
| 161 |
+
make stop-all
|
| 162 |
+
```
|
| 163 |
+
|
| 164 |
+
---
|
| 165 |
+
|
| 166 |
+
## Usage
|
| 167 |
+
|
| 168 |
+
### Command Line Interface
|
| 169 |
+
|
| 170 |
+
Always activate the virtual environment first:
|
| 171 |
+
|
| 172 |
+
```bash
|
| 173 |
+
source .venv/bin/activate
|
| 174 |
+
```
|
| 175 |
+
|
| 176 |
+
**API Server**
|
| 177 |
+
|
| 178 |
+
```bash
|
| 179 |
+
python main.py serve --host 0.0.0.0 --port 8000
|
| 180 |
+
```
|
| 181 |
+
|
| 182 |
+
**Jurisdiction Discovery**
|
| 183 |
+
|
| 184 |
+
```bash
|
| 185 |
+
# Test run
|
| 186 |
+
python main.py discover-jurisdictions --limit 100
|
| 187 |
+
|
| 188 |
+
# Single state
|
| 189 |
+
python main.py discover-jurisdictions --state CA
|
| 190 |
+
|
| 191 |
+
# Full discovery (~30k jurisdictions)
|
| 192 |
+
python main.py discover-jurisdictions
|
| 193 |
+
|
| 194 |
+
# View statistics
|
| 195 |
+
python main.py discovery-stats
|
| 196 |
+
```
|
| 197 |
+
|
| 198 |
+
**Data Ingestion**
|
| 199 |
+
|
| 200 |
+
```bash
|
| 201 |
+
# Census data (90,000+ jurisdictions)
|
| 202 |
+
python -m discovery.census_ingestion
|
| 203 |
+
|
| 204 |
+
# NCES school districts (13,000+)
|
| 205 |
+
python -m discovery.nces_ingestion
|
| 206 |
+
|
| 207 |
+
# Pre-built meeting datasets
|
| 208 |
+
python discovery/meetingbank_ingestion.py
|
| 209 |
+
python discovery/city_scrapers_urls.py
|
| 210 |
+
python discovery/openstates_sources.py
|
| 211 |
+
|
| 212 |
+
# LocalView (requires Dataverse API key)
|
| 213 |
+
python discovery/localview_ingestion.py
|
| 214 |
+
```
|
| 215 |
+
|
| 216 |
+
**Scraping & Analysis**
|
| 217 |
+
|
| 218 |
+
```bash
|
| 219 |
+
# Scrape batch from discovered sites
|
| 220 |
+
python main.py scrape-batch --source discovered --limit 50
|
| 221 |
+
|
| 222 |
+
# Scrape single source
|
| 223 |
+
python main.py scrape --url "https://city.legistar.com" \
|
| 224 |
+
--state "CA" \
|
| 225 |
+
--municipality "San Francisco"
|
| 226 |
+
|
| 227 |
+
# Run analysis pipeline
|
| 228 |
+
python main.py analyze --targets-file examples/targets.json
|
| 229 |
+
|
| 230 |
+
# Generate heatmap
|
| 231 |
+
python main.py generate-heatmap --output heatmap.html
|
| 232 |
+
```
|
| 233 |
+
|
| 234 |
+
**Publishing Datasets**
|
| 235 |
+
|
| 236 |
+
```bash
|
| 237 |
+
# Publish to HuggingFace (requires HUGGINGFACE_TOKEN in .env)
|
| 238 |
+
python main.py publish-to-hf --dataset all
|
| 239 |
+
python main.py publish-to-hf --dataset discovered-urls
|
| 240 |
+
python main.py publish-to-hf --dataset census --sample
|
| 241 |
+
```
|
| 242 |
+
|
| 243 |
+
### API Usage
|
| 244 |
+
|
| 245 |
+
**Start a workflow:**
|
| 246 |
+
|
| 247 |
+
```bash
|
| 248 |
+
curl -X POST "http://localhost:8000/workflow/start" \
|
| 249 |
+
-H "Content-Type: application/json" \
|
| 250 |
+
-d '{
|
| 251 |
+
"scrape_targets": [
|
| 252 |
+
{
|
| 253 |
+
"url": "https://example-city.legistar.com",
|
| 254 |
+
"municipality": "Example City",
|
| 255 |
+
"state": "CA",
|
| 256 |
+
"platform": "legistar"
|
| 257 |
+
}
|
| 258 |
+
]
|
| 259 |
+
}'
|
| 260 |
+
```
|
| 261 |
+
|
| 262 |
+
**Query opportunities:**
|
| 263 |
+
|
| 264 |
+
```bash
|
| 265 |
+
curl "http://localhost:8000/opportunities?state=CA&urgency=critical"
|
| 266 |
+
```
|
| 267 |
+
|
| 268 |
+
**Get heatmap:**
|
| 269 |
+
|
| 270 |
+
```bash
|
| 271 |
+
curl "http://localhost:8000/heatmap" > heatmap.html
|
| 272 |
+
```
|
| 273 |
+
|
| 274 |
+
### Python API
|
| 275 |
+
|
| 276 |
+
```python
|
| 277 |
+
import asyncio
|
| 278 |
+
from agents.orchestrator import OrchestratorAgent
|
| 279 |
+
from agents.scraper import ScraperAgent
|
| 280 |
+
from agents.parser import ParserAgent
|
| 281 |
+
from agents.classifier import ClassifierAgent
|
| 282 |
+
|
| 283 |
+
# Initialize orchestrator
|
| 284 |
+
orchestrator = OrchestratorAgent()
|
| 285 |
+
|
| 286 |
+
# Register agents
|
| 287 |
+
orchestrator.register_agent(ScraperAgent())
|
| 288 |
+
orchestrator.register_agent(ParserAgent())
|
| 289 |
+
orchestrator.register_agent(ClassifierAgent())
|
| 290 |
+
|
| 291 |
+
# Execute pipeline
|
| 292 |
+
targets = [
|
| 293 |
+
{
|
| 294 |
+
"url": "https://city.legistar.com",
|
| 295 |
+
"municipality": "Example City",
|
| 296 |
+
"state": "CA",
|
| 297 |
+
"platform": "legistar"
|
| 298 |
+
}
|
| 299 |
+
]
|
| 300 |
+
|
| 301 |
+
results = asyncio.run(orchestrator.execute_pipeline(targets))
|
| 302 |
+
```
|
| 303 |
+
|
| 304 |
+
---
|
| 305 |
+
|
| 306 |
+
## Project Structure
|
| 307 |
+
|
| 308 |
+
```
|
| 309 |
+
open-navigator/
|
| 310 |
+
├── agents/ # Multi-agent AI system
|
| 311 |
+
├── api/ # FastAPI application
|
| 312 |
+
├── frontend/ # React application (Open Navigator)
|
| 313 |
+
├── website/ # Docusaurus documentation
|
| 314 |
+
├── discovery/ # Data discovery modules
|
| 315 |
+
├── extraction/ # Document extraction
|
| 316 |
+
├── pipeline/ # Data pipeline components
|
| 317 |
+
├── visualization/ # Heatmap and charts
|
| 318 |
+
├── config/ # Configuration
|
| 319 |
+
├── tests/ # Test suite
|
| 320 |
+
├── main.py # CLI entry point
|
| 321 |
+
└── requirements.txt # Python dependencies
|
| 322 |
+
```
|
| 323 |
+
|
| 324 |
+
---
|
| 325 |
+
|
| 326 |
+
## Deployment Options
|
| 327 |
+
|
| 328 |
+
### 1. Databricks Apps (Production)
|
| 329 |
+
|
| 330 |
+
```bash
|
| 331 |
+
export DATABRICKS_HOST=https://your-workspace.cloud.databricks.com
|
| 332 |
+
export DATABRICKS_TOKEN=dapi...
|
| 333 |
+
export OPENAI_API_KEY=sk-...
|
| 334 |
+
|
| 335 |
+
./scripts/deploy-databricks-app.sh
|
| 336 |
+
```
|
| 337 |
+
|
| 338 |
+
See [DATABRICKS_APP_GUIDE.md](DATABRICKS_APP_GUIDE.md) for details.
|
| 339 |
+
|
| 340 |
+
### 2. Docker
|
| 341 |
+
|
| 342 |
+
```bash
|
| 343 |
+
docker-compose up -d
|
| 344 |
+
```
|
| 345 |
+
|
| 346 |
+
Starts:
|
| 347 |
+
- API server (port 8000)
|
| 348 |
+
- Qdrant vector database (port 6333)
|
| 349 |
+
- Jupyter notebook (port 8888)
|
| 350 |
+
|
| 351 |
+
### 3. Local Development
|
| 352 |
+
|
| 353 |
+
See [Quick Start](#quick-start) above.
|
| 354 |
+
|
| 355 |
+
---
|
| 356 |
+
|
| 357 |
+
## ⚡ Intel Arc GPU Optimization
|
| 358 |
+
|
| 359 |
+
**Run Llama 4 at NVIDIA-like speeds on Intel Arc integrated graphics!**
|
| 360 |
+
|
| 361 |
+
If you have **Intel Core Ultra 7** (or similar) with Arc Graphics + NPU, you can use **DuckDB + VSS** for 10-50x faster legislative analysis:
|
| 362 |
+
|
| 363 |
+
```bash
|
| 364 |
+
# Setup Intel-optimized environment
|
| 365 |
+
./scripts/intel_llm_setup.sh
|
| 366 |
+
source .venv-intel/bin/activate
|
| 367 |
+
|
| 368 |
+
# Run DuckDB vector search demo
|
| 369 |
+
python scripts/duckdb_vss_demo.py
|
| 370 |
+
|
| 371 |
+
# Run legislative analysis with LLM
|
| 372 |
+
python scripts/legislative_analysis_intel.py
|
| 373 |
+
```
|
| 374 |
+
|
| 375 |
+
**Why DuckDB for Local AI?**
|
| 376 |
+
- ⚡ **10-50x faster** than Postgres for context injection
|
| 377 |
+
- 🎯 **< 20ms** vector similarity search across 10K bills
|
| 378 |
+
- 🧠 **Embedded** - no server needed, runs locally
|
| 379 |
+
- 🤗 **Hugging Face Integration** - query HF datasets directly
|
| 380 |
+
|
| 381 |
+
**Performance:**
|
| 382 |
+
- **Context Injection**: 20ms vs 500ms (Postgres) = **25x faster**
|
| 383 |
+
- **LLM Inference**: 1,200 tok/s (Arc GPU) vs 350 tok/s (CPU) = **3.4x faster**
|
| 384 |
+
- **Vector Search**: 18ms vs 800ms = **44x faster**
|
| 385 |
+
|
| 386 |
+
**Features:**
|
| 387 |
+
- Extract interest groups from legislative testimony
|
| 388 |
+
- Identify lobbyists and their positions
|
| 389 |
+
- Analyze support/oppose scores with confidence
|
| 390 |
+
- Detect tradeoffs and compromises
|
| 391 |
+
|
| 392 |
+
**See full guide:** [Intel Arc Optimization Guide](website/docs/guides/intel-arc-optimization.md)
|
| 393 |
+
|
| 394 |
+
---
|
| 395 |
+
|
| 396 |
+
## 🤖 AI Integration (MCP Server)
|
| 397 |
+
|
| 398 |
+
**Connect your civic data to Claude and other AI assistants!**
|
| 399 |
+
|
| 400 |
+
Open Navigator includes a **Model Context Protocol (MCP)** server that lets AI assistants directly access your data:
|
| 401 |
+
|
| 402 |
+
```bash
|
| 403 |
+
# Install MCP dependencies
|
| 404 |
+
pip install mcp anthropic-mcp-sdk
|
| 405 |
+
|
| 406 |
+
# Run the server
|
| 407 |
+
python scripts/mcp/open_navigator_server.py
|
| 408 |
+
```
|
| 409 |
+
|
| 410 |
+
**What AI assistants can do:**
|
| 411 |
+
- 🏛️ Search 90,000+ jurisdictions by name or location
|
| 412 |
+
- 🏢 Query 1.8M nonprofits with Form 990 data
|
| 413 |
+
- 📜 Semantic search across 4.5M+ legislative documents
|
| 414 |
+
- 📊 Get real-time statistics and analytics
|
| 415 |
+
- 🔍 Vector search meetings and bills with natural language
|
| 416 |
+
|
| 417 |
+
**Example queries to Claude:**
|
| 418 |
+
> "Find all cities named Springfield in the database"
|
| 419 |
+
|
| 420 |
+
> "Show me 501c3 nonprofits in San Francisco focused on education"
|
| 421 |
+
|
| 422 |
+
> "What bills related to oral health were introduced in California?"
|
| 423 |
+
|
| 424 |
+
**Configure Claude Desktop:**
|
| 425 |
+
|
| 426 |
+
Add to `~/.config/Claude/claude_desktop_config.json`:
|
| 427 |
+
|
| 428 |
+
```json
|
| 429 |
+
{
|
| 430 |
+
"mcpServers": {
|
| 431 |
+
"open-navigator": {
|
| 432 |
+
"command": "python",
|
| 433 |
+
"args": ["/path/to/open-navigator/scripts/mcp/open_navigator_server.py"],
|
| 434 |
+
"env": {
|
| 435 |
+
"DATABASE_URL": "postgresql://postgres:password@localhost:5433/open_navigator"
|
| 436 |
+
}
|
| 437 |
+
}
|
| 438 |
+
}
|
| 439 |
+
}
|
| 440 |
+
```
|
| 441 |
+
|
| 442 |
+
**See full guide:** [MCP Server Documentation](website/docs/integrations/mcp-server.md)
|
| 443 |
+
|
| 444 |
+
---
|
| 445 |
+
|
| 446 |
+
## Testing
|
| 447 |
+
|
| 448 |
+
```bash
|
| 449 |
+
# Run all tests
|
| 450 |
+
pytest
|
| 451 |
+
|
| 452 |
+
# With coverage
|
| 453 |
+
pytest --cov=agents --cov=pipeline --cov=visualization
|
| 454 |
+
|
| 455 |
+
# Specific test file
|
| 456 |
+
pytest tests/test_agents.py
|
| 457 |
+
```
|
| 458 |
+
|
| 459 |
+
---
|
| 460 |
+
|
| 461 |
+
## Configuration
|
| 462 |
+
|
| 463 |
+
Create `.env` file:
|
| 464 |
+
|
| 465 |
+
```bash
|
| 466 |
+
# OpenAI
|
| 467 |
+
OPENAI_API_KEY=sk-...
|
| 468 |
+
|
| 469 |
+
# Databricks (optional)
|
| 470 |
+
DATABRICKS_HOST=https://your-workspace.cloud.databricks.com
|
| 471 |
+
DATABRICKS_TOKEN=dapi...
|
| 472 |
+
|
| 473 |
+
# HuggingFace (optional)
|
| 474 |
+
HUGGINGFACE_TOKEN=hf_...
|
| 475 |
+
|
| 476 |
+
# Dataverse (optional)
|
| 477 |
+
DATAVERSE_API_KEY=...
|
| 478 |
+
```
|
| 479 |
+
|
| 480 |
+
---
|
| 481 |
+
|
| 482 |
+
## Contributing
|
| 483 |
+
|
| 484 |
+
Contributions are welcome! Please:
|
| 485 |
+
|
| 486 |
+
1. Fork the repository
|
| 487 |
+
2. Create a feature branch
|
| 488 |
+
3. Make your changes
|
| 489 |
+
4. Add tests
|
| 490 |
+
5. Submit a pull request
|
| 491 |
+
|
| 492 |
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for details.
|
| 493 |
+
|
| 494 |
+
---
|
| 495 |
+
|
| 496 |
+
## Documentation
|
| 497 |
+
|
| 498 |
+
- **[Full Documentation](http://localhost:3000)** - Complete guides and API reference
|
| 499 |
+
- **[Architecture](http://localhost:3000/docs/architecture)** - System architecture overview
|
| 500 |
+
- **[Quick Start](http://localhost:3000/docs/quickstart)** - Detailed setup instructions
|
| 501 |
+
- **[Quick Reference](http://localhost:3000/docs/quick-reference)** - Command reference card
|
| 502 |
+
- **[MCP Server](http://localhost:3000/docs/integrations/mcp-server)** - AI assistant integration guide
|
| 503 |
+
- **[Deployment](http://localhost:3000/docs/deployment/databricks-apps)** - Production deployment guides
|
| 504 |
+
- **[Case Studies](http://localhost:3000/docs/case-studies/tuscaloosa-complete)** - Real-world examples
|
| 505 |
+
- [CONTRIBUTING.md](CONTRIBUTING.md) - How to contribute
|
| 506 |
+
|
| 507 |
+
---
|
| 508 |
+
|
| 509 |
+
## Citations
|
| 510 |
+
|
| 511 |
+
This project uses several open datasets and research contributions. **Please see [CITATIONS.md](CITATIONS.md) for complete citation information.**
|
| 512 |
+
|
| 513 |
+
**Key Dataset:**
|
| 514 |
+
- **MeetingBank**: Hu et al., "MeetingBank: A Benchmark Dataset for Meeting Summarization", ACL 2023
|
| 515 |
+
- Used for meeting discovery and analysis
|
| 516 |
+
- 1,366 city council meetings from 6 U.S. cities
|
| 517 |
+
- See [CITATIONS.md](CITATIONS.md) for full citation and BibTeX
|
| 518 |
+
|
| 519 |
+
---
|
| 520 |
+
|
| 521 |
+
## License
|
| 522 |
+
|
| 523 |
+
Apache License 2.0 - see [LICENSE](LICENSE) file for details.
|
| 524 |
+
|
| 525 |
+
---
|
| 526 |
+
|
| 527 |
+
## Support
|
| 528 |
+
|
| 529 |
+
- GitHub Issues: [github.com/getcommunityone/open-navigator/issues](https://github.com/getcommunityone/open-navigator/issues)
|
| 530 |
+
- Email: johnbowyer@communityone.com
|
| 531 |
+
|
| 532 |
+
---
|
| 533 |
+
|
| 534 |
+
**Note**: This system is designed to support advocacy efforts. All generated content should be reviewed by humans before use.
|
README_HF.md
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: CommunityOne - Open Navigator
|
| 3 |
+
emoji: 🏛️
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
pinned: false
|
| 9 |
+
license: apache-2.0
|
| 10 |
+
tags:
|
| 11 |
+
- civic-engagement
|
| 12 |
+
- policy-tracking
|
| 13 |
+
- government-transparency
|
| 14 |
+
- nonprofit-discovery
|
| 15 |
+
- open-data
|
| 16 |
+
---
|
| 17 |
+
|
| 18 |
+
# 🏛️ CommunityOne - Open Navigator
|
| 19 |
+
|
| 20 |
+
**Track 90,000+ jurisdictions. Monitor 1.8M nonprofits. Amplify your voice.**
|
| 21 |
+
|
| 22 |
+
CommunityOne is a civic engagement platform that helps you discover advocacy opportunities, track policy changes, and connect with organizations working on the causes you care about.
|
| 23 |
+
|
| 24 |
+
## ✨ Features
|
| 25 |
+
|
| 26 |
+
- **🔍 Unified Search**: Find contacts, meetings, organizations, and causes across the entire United States
|
| 27 |
+
- **📊 Real-time Stats**: Track policy activity across 90,000+ cities, counties, and states
|
| 28 |
+
- **🏢 Nonprofit Discovery**: Explore 1.8M organizations from IRS data enriched with Every.org
|
| 29 |
+
- **📅 Meeting Minutes**: Search 250,000+ government meeting transcripts and agendas
|
| 30 |
+
- **🎯 Geographic Filtering**: Browse by state, county, or city to find local opportunities
|
| 31 |
+
- **🔐 OAuth Login**: Sign in with HuggingFace, GitHub, or Google to save your preferences
|
| 32 |
+
|
| 33 |
+
## 🚀 Three Services Architecture
|
| 34 |
+
|
| 35 |
+
This deployment runs three integrated services:
|
| 36 |
+
|
| 37 |
+
1. **📚 Documentation** (Docusaurus) - `/docs/`
|
| 38 |
+
2. **🖥️ Main Application** (React + Vite) - `/`
|
| 39 |
+
3. **⚡ API Backend** (FastAPI) - `/api/`
|
| 40 |
+
|
| 41 |
+
All services are reverse-proxied through nginx on port 7860.
|
| 42 |
+
|
| 43 |
+
## 📖 Quick Start
|
| 44 |
+
|
| 45 |
+
### Browse Without Login
|
| 46 |
+
- Click "Browse All" to explore data by state
|
| 47 |
+
- Use the search bar to find organizations, contacts, or causes
|
| 48 |
+
- Filter by location using the state/county/city selectors
|
| 49 |
+
|
| 50 |
+
### Sign In for Personalization
|
| 51 |
+
- Click "Login" in the top right
|
| 52 |
+
- Choose your OAuth provider (HuggingFace, GitHub, or Google)
|
| 53 |
+
- Follow organizations, leaders, and causes you care about
|
| 54 |
+
- Get personalized recommendations
|
| 55 |
+
|
| 56 |
+
### Explore the API
|
| 57 |
+
- Visit `/redoc` for interactive API documentation
|
| 58 |
+
- Try the search endpoints with state filters
|
| 59 |
+
- Export data in JSON format for your own projects
|
| 60 |
+
|
| 61 |
+
## 🛠️ Technology Stack
|
| 62 |
+
|
| 63 |
+
- **Frontend**: React 18 + TypeScript + Vite + TailwindCSS + shadcn/ui
|
| 64 |
+
- **Backend**: Python 3.11 + FastAPI + Pydantic
|
| 65 |
+
- **Data**: Delta Lake + Parquet (90GB+ of civic data)
|
| 66 |
+
- **Docs**: Docusaurus v3
|
| 67 |
+
- **Infrastructure**: nginx + supervisor + Docker
|
| 68 |
+
|
| 69 |
+
## 📊 Data Sources
|
| 70 |
+
|
| 71 |
+
- **IRS BMF**: 1.8M tax-exempt organizations
|
| 72 |
+
- **Every.org**: Nonprofit enrichment (logos, causes, revenue)
|
| 73 |
+
- **Open States**: State legislators and bills (7,300+ officials)
|
| 74 |
+
- **Census**: Jurisdictions and boundaries (90,000+)
|
| 75 |
+
- **CityScrapers**: Local government meetings
|
| 76 |
+
- **OpenCivicData**: Standardized government data
|
| 77 |
+
|
| 78 |
+
## 🔗 Links
|
| 79 |
+
|
| 80 |
+
- **Repository**: [github.com/getcommunityone/open-navigator](https://github.com/getcommunityone/open-navigator)
|
| 81 |
+
- **Documentation**: Click "📚 Browse Documentation" on the homepage
|
| 82 |
+
- **API Docs**: `/redoc` endpoint
|
| 83 |
+
- **Website**: [www.communityone.com](https://www.communityone.com)
|
| 84 |
+
|
| 85 |
+
## 📝 License
|
| 86 |
+
|
| 87 |
+
Apache License 2.0 - Free for commercial and non-commercial use
|
| 88 |
+
|
| 89 |
+
## 🤝 Contributing
|
| 90 |
+
|
| 91 |
+
We welcome contributions! See CONTRIBUTING.md in the repository for guidelines.
|
| 92 |
+
|
| 93 |
+
## 💬 Support
|
| 94 |
+
|
| 95 |
+
- **Issues**: [GitHub Issues](https://github.com/getcommunityone/open-navigator/issues)
|
| 96 |
+
- **Discussions**: [GitHub Discussions](https://github.com/getcommunityone/open-navigator/discussions)
|
| 97 |
+
- **Email**: hello@communityone.com
|
| 98 |
+
|
| 99 |
+
---
|
| 100 |
+
|
| 101 |
+
Built with ❤️ for civic engagement and government transparency.
|
agents/__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Agents module for the Oral Health Policy Pulse system."""
|
| 2 |
+
from agents.base import BaseAgent, AgentRole, AgentMessage, MessageType, AgentStatus
|
| 3 |
+
from agents.orchestrator import OrchestratorAgent
|
| 4 |
+
from agents.debate_grader import DebateGraderAgent, DebateDimension, DebateScore
|
| 5 |
+
|
| 6 |
+
__all__ = [
|
| 7 |
+
"BaseAgent",
|
| 8 |
+
"AgentRole",
|
| 9 |
+
"AgentMessage",
|
| 10 |
+
"MessageType",
|
| 11 |
+
"AgentStatus",
|
| 12 |
+
"OrchestratorAgent",
|
| 13 |
+
"DebateGraderAgent",
|
| 14 |
+
"DebateDimension",
|
| 15 |
+
"DebateScore"
|
| 16 |
+
]
|
agents/advocacy.py
ADDED
|
@@ -0,0 +1,408 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Advocacy Writer Agent for generating personalized outreach materials.
|
| 3 |
+
"""
|
| 4 |
+
import asyncio
|
| 5 |
+
from typing import List, Dict, Any, Optional
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
from loguru import logger
|
| 8 |
+
|
| 9 |
+
from agents.base import BaseAgent, AgentRole, AgentMessage, MessageType, AgentStatus
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class AdvocacyWriterAgent(BaseAgent):
|
| 13 |
+
"""
|
| 14 |
+
Agent responsible for generating advocacy materials.
|
| 15 |
+
|
| 16 |
+
Creates:
|
| 17 |
+
- Personalized emails to local officials
|
| 18 |
+
- Talking points for public testimony
|
| 19 |
+
- Social media content
|
| 20 |
+
- Policy briefs
|
| 21 |
+
- Community outreach materials
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
def __init__(self, agent_id: str = "advocacy-001"):
|
| 25 |
+
"""Initialize the advocacy writer agent."""
|
| 26 |
+
super().__init__(agent_id, AgentRole.ADVOCACY_WRITER)
|
| 27 |
+
self._initialize_templates()
|
| 28 |
+
|
| 29 |
+
def _initialize_templates(self):
|
| 30 |
+
"""Initialize email and content templates."""
|
| 31 |
+
self.email_templates = {
|
| 32 |
+
"critical_vote": {
|
| 33 |
+
"subject": "Urgent: Support Oral Health Policy - Vote Upcoming in {municipality}",
|
| 34 |
+
"opening": (
|
| 35 |
+
"I am writing to urge your support for the upcoming vote on "
|
| 36 |
+
"{policy_topic} in {municipality}."
|
| 37 |
+
),
|
| 38 |
+
"urgency": "This matter requires immediate attention as a vote is scheduled for {meeting_date}."
|
| 39 |
+
},
|
| 40 |
+
"introduce_topic": {
|
| 41 |
+
"subject": "Opportunity to Improve Community Oral Health in {municipality}",
|
| 42 |
+
"opening": (
|
| 43 |
+
"I am writing to bring to your attention an important opportunity "
|
| 44 |
+
"to enhance oral health services in {municipality}."
|
| 45 |
+
),
|
| 46 |
+
"urgency": None
|
| 47 |
+
},
|
| 48 |
+
"address_opposition": {
|
| 49 |
+
"subject": "Addressing Concerns About {policy_topic} in {municipality}",
|
| 50 |
+
"opening": (
|
| 51 |
+
"I understand there are concerns about {policy_topic}. "
|
| 52 |
+
"I would like to share evidence-based information that may help inform the discussion."
|
| 53 |
+
),
|
| 54 |
+
"urgency": None
|
| 55 |
+
},
|
| 56 |
+
"support_existing": {
|
| 57 |
+
"subject": "Thank You for Supporting Oral Health in {municipality}",
|
| 58 |
+
"opening": (
|
| 59 |
+
"Thank you for your support of {policy_topic}. "
|
| 60 |
+
"I am writing to express my appreciation and offer additional support."
|
| 61 |
+
),
|
| 62 |
+
"urgency": None
|
| 63 |
+
}
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
self.policy_benefits = {
|
| 67 |
+
"water_fluoridation": [
|
| 68 |
+
"Reduces tooth decay by 25% in children and adults",
|
| 69 |
+
"Costs approximately $1 per person per year",
|
| 70 |
+
"Recognized by CDC as one of 10 great public health achievements",
|
| 71 |
+
"Reduces dental treatment costs by $38 per $1 invested",
|
| 72 |
+
"Particularly benefits low-income families with limited access to dental care"
|
| 73 |
+
],
|
| 74 |
+
"school_dental_screening": [
|
| 75 |
+
"Early detection prevents costly emergency dental procedures",
|
| 76 |
+
"Identifies children who need care before problems become severe",
|
| 77 |
+
"Reduces school absences due to dental pain",
|
| 78 |
+
"Connects families to dental resources and services",
|
| 79 |
+
"Supported by American Academy of Pediatrics"
|
| 80 |
+
],
|
| 81 |
+
"medicaid_dental": [
|
| 82 |
+
"Improves health outcomes for vulnerable populations",
|
| 83 |
+
"Reduces emergency room visits for dental problems",
|
| 84 |
+
"Prevents progression of oral disease to systemic health issues",
|
| 85 |
+
"Supports working families and children",
|
| 86 |
+
"Generates economic returns through improved productivity"
|
| 87 |
+
],
|
| 88 |
+
"dental_clinic_funding": [
|
| 89 |
+
"Provides essential services to underserved communities",
|
| 90 |
+
"Reduces health disparities",
|
| 91 |
+
"Creates local jobs and economic activity",
|
| 92 |
+
"Prevents costly emergency care",
|
| 93 |
+
"Serves as safety net for uninsured and underinsured residents"
|
| 94 |
+
]
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
async def process(self, message: AgentMessage) -> List[AgentMessage]:
    """
    Generate advocacy materials for every opportunity in the message payload.

    Reads ``documents`` and ``opportunities`` from ``message.payload``, builds
    one materials bundle per opportunity and reports the outcome to the
    orchestrator.

    Args:
        message: Message containing analyzed documents and opportunities

    Returns:
        A one-element list: a RESPONSE message on success, or an ERROR message
        carrying ``str(exception)`` and this agent's id when anything raises.
    """
    self.update_status(AgentStatus.PROCESSING, "Generating advocacy materials")

    try:
        payload = message.payload
        documents = payload.get("documents", [])
        opportunities = payload.get("opportunities", [])

        advocacy_materials = []
        for opp in opportunities:
            materials = await self._generate_advocacy_materials(opp, documents)
            # An empty dict means the source document could not be found;
            # do not report it as generated material.
            if materials:
                advocacy_materials.append(materials)

        # Send results back to orchestrator
        response = await self.send_message(
            AgentRole.ORCHESTRATOR,
            MessageType.RESPONSE,
            {
                "workflow_id": payload.get("workflow_id"),
                "advocacy_materials": advocacy_materials,
                "opportunities_count": len(opportunities),
                "materials_generated": len(advocacy_materials),
            },
        )

        self.log_success()
        logger.info(f"Generated advocacy materials for {len(advocacy_materials)} opportunities")

        return [response]

    except Exception as e:
        # Top-level boundary for this agent: record the failure and hand an
        # ERROR message to the orchestrator instead of propagating.
        self.log_failure(str(e))
        error_msg = await self.send_message(
            AgentRole.ORCHESTRATOR,
            MessageType.ERROR,
            {"error": str(e), "agent": self.agent_id},
        )
        return [error_msg]
|
| 145 |
+
|
| 146 |
+
async def _generate_advocacy_materials(
|
| 147 |
+
self,
|
| 148 |
+
opportunity: Dict[str, Any],
|
| 149 |
+
all_documents: List[Dict[str, Any]]
|
| 150 |
+
) -> Dict[str, Any]:
|
| 151 |
+
"""
|
| 152 |
+
Generate complete advocacy materials for an opportunity.
|
| 153 |
+
|
| 154 |
+
Args:
|
| 155 |
+
opportunity: Advocacy opportunity details
|
| 156 |
+
all_documents: All analyzed documents for context
|
| 157 |
+
|
| 158 |
+
Returns:
|
| 159 |
+
Dictionary containing all generated materials
|
| 160 |
+
"""
|
| 161 |
+
# Find the source document
|
| 162 |
+
doc = next(
|
| 163 |
+
(d for d in all_documents if d["document_id"] == opportunity["document_id"]),
|
| 164 |
+
None
|
| 165 |
+
)
|
| 166 |
+
|
| 167 |
+
if not doc:
|
| 168 |
+
logger.error(f"Document not found: {opportunity['document_id']}")
|
| 169 |
+
return {}
|
| 170 |
+
|
| 171 |
+
# Determine template based on situation
|
| 172 |
+
template_type = self._select_template(opportunity)
|
| 173 |
+
|
| 174 |
+
# Generate email
|
| 175 |
+
email = await self._generate_email(opportunity, doc, template_type)
|
| 176 |
+
|
| 177 |
+
# Generate talking points
|
| 178 |
+
talking_points = self._generate_talking_points(opportunity, doc)
|
| 179 |
+
|
| 180 |
+
# Generate social media content
|
| 181 |
+
social_media = self._generate_social_media(opportunity)
|
| 182 |
+
|
| 183 |
+
# Generate policy brief
|
| 184 |
+
policy_brief = self._generate_policy_brief(opportunity, doc)
|
| 185 |
+
|
| 186 |
+
materials = {
|
| 187 |
+
"opportunity_id": opportunity["document_id"],
|
| 188 |
+
"municipality": opportunity["municipality"],
|
| 189 |
+
"state": opportunity["state"],
|
| 190 |
+
"topic": opportunity["topic"],
|
| 191 |
+
"urgency": opportunity["urgency"],
|
| 192 |
+
"materials": {
|
| 193 |
+
"email": email,
|
| 194 |
+
"talking_points": talking_points,
|
| 195 |
+
"social_media": social_media,
|
| 196 |
+
"policy_brief": policy_brief
|
| 197 |
+
},
|
| 198 |
+
"generated_at": datetime.utcnow().isoformat(),
|
| 199 |
+
"metadata": {
|
| 200 |
+
"source_url": opportunity["source_url"],
|
| 201 |
+
"meeting_date": opportunity["meeting_date"]
|
| 202 |
+
}
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
return materials
|
| 206 |
+
|
| 207 |
+
def _select_template(self, opportunity: Dict[str, Any]) -> str:
|
| 208 |
+
"""Select appropriate email template based on situation."""
|
| 209 |
+
urgency = opportunity.get("urgency")
|
| 210 |
+
stance = opportunity.get("stance")
|
| 211 |
+
|
| 212 |
+
if urgency == "critical":
|
| 213 |
+
return "critical_vote"
|
| 214 |
+
elif stance in ["opposed", "strongly_opposed"]:
|
| 215 |
+
return "address_opposition"
|
| 216 |
+
elif stance in ["supportive", "strongly_supportive"]:
|
| 217 |
+
return "support_existing"
|
| 218 |
+
else:
|
| 219 |
+
return "introduce_topic"
|
| 220 |
+
|
| 221 |
+
async def _generate_email(
|
| 222 |
+
self,
|
| 223 |
+
opportunity: Dict[str, Any],
|
| 224 |
+
doc: Dict[str, Any],
|
| 225 |
+
template_type: str
|
| 226 |
+
) -> Dict[str, Any]:
|
| 227 |
+
"""Generate personalized email content."""
|
| 228 |
+
template = self.email_templates[template_type]
|
| 229 |
+
|
| 230 |
+
# Format template variables
|
| 231 |
+
variables = {
|
| 232 |
+
"municipality": opportunity["municipality"],
|
| 233 |
+
"policy_topic": self._format_topic_name(opportunity["topic"]),
|
| 234 |
+
"meeting_date": opportunity["meeting_date"]
|
| 235 |
+
}
|
| 236 |
+
|
| 237 |
+
subject = template["subject"].format(**variables)
|
| 238 |
+
opening = template["opening"].format(**variables)
|
| 239 |
+
|
| 240 |
+
# Build email body
|
| 241 |
+
body_parts = [opening]
|
| 242 |
+
|
| 243 |
+
# Add urgency if applicable
|
| 244 |
+
if template["urgency"]:
|
| 245 |
+
body_parts.append("\n\n" + template["urgency"].format(**variables))
|
| 246 |
+
|
| 247 |
+
# Add policy benefits
|
| 248 |
+
body_parts.append("\n\n**Key Benefits:**")
|
| 249 |
+
benefits = self.policy_benefits.get(
|
| 250 |
+
opportunity["topic"],
|
| 251 |
+
["Improves community health outcomes"]
|
| 252 |
+
)
|
| 253 |
+
for benefit in benefits[:3]: # Top 3 benefits
|
| 254 |
+
body_parts.append(f"• {benefit}")
|
| 255 |
+
|
| 256 |
+
# Add call to action
|
| 257 |
+
body_parts.append(self._generate_call_to_action(opportunity))
|
| 258 |
+
|
| 259 |
+
# Add closing
|
| 260 |
+
body_parts.append(
|
| 261 |
+
"\n\nThank you for your time and consideration. "
|
| 262 |
+
"I would welcome the opportunity to discuss this further."
|
| 263 |
+
)
|
| 264 |
+
body_parts.append("\n\nSincerely,")
|
| 265 |
+
body_parts.append("[Your Name]")
|
| 266 |
+
body_parts.append("[Your Organization]")
|
| 267 |
+
|
| 268 |
+
email = {
|
| 269 |
+
"subject": subject,
|
| 270 |
+
"body": "\n".join(body_parts),
|
| 271 |
+
"template_type": template_type,
|
| 272 |
+
"personalization_variables": variables
|
| 273 |
+
}
|
| 274 |
+
|
| 275 |
+
return email
|
| 276 |
+
|
| 277 |
+
def _generate_call_to_action(self, opportunity: Dict[str, Any]) -> str:
|
| 278 |
+
"""Generate appropriate call to action based on urgency."""
|
| 279 |
+
urgency = opportunity.get("urgency")
|
| 280 |
+
stance = opportunity.get("stance")
|
| 281 |
+
|
| 282 |
+
if urgency == "critical":
|
| 283 |
+
return (
|
| 284 |
+
"\n\n**Action Needed:**\n"
|
| 285 |
+
f"Please vote in favor of this important measure at the upcoming meeting. "
|
| 286 |
+
f"Your constituents' oral health depends on this decision."
|
| 287 |
+
)
|
| 288 |
+
elif stance in ["opposed", "strongly_opposed"]:
|
| 289 |
+
return (
|
| 290 |
+
"\n\n**Requested Action:**\n"
|
| 291 |
+
"I respectfully request a meeting to discuss the evidence supporting this policy "
|
| 292 |
+
"and address any concerns you may have."
|
| 293 |
+
)
|
| 294 |
+
else:
|
| 295 |
+
return (
|
| 296 |
+
"\n\n**Requested Action:**\n"
|
| 297 |
+
"I encourage you to support this initiative and would be happy to provide "
|
| 298 |
+
"additional information or connect you with subject matter experts."
|
| 299 |
+
)
|
| 300 |
+
|
| 301 |
+
def _generate_talking_points(
|
| 302 |
+
self,
|
| 303 |
+
opportunity: Dict[str, Any],
|
| 304 |
+
doc: Dict[str, Any]
|
| 305 |
+
) -> List[str]:
|
| 306 |
+
"""Generate talking points for public testimony or meetings."""
|
| 307 |
+
topic = opportunity["topic"]
|
| 308 |
+
|
| 309 |
+
talking_points = [
|
| 310 |
+
f"Introduction: Community member concerned about oral health in {opportunity['municipality']}"
|
| 311 |
+
]
|
| 312 |
+
|
| 313 |
+
# Add topic-specific points
|
| 314 |
+
benefits = self.policy_benefits.get(topic, [])
|
| 315 |
+
for i, benefit in enumerate(benefits[:5], 1):
|
| 316 |
+
talking_points.append(f"Point {i}: {benefit}")
|
| 317 |
+
|
| 318 |
+
# Add local context
|
| 319 |
+
talking_points.append(
|
| 320 |
+
f"Local relevance: This policy addresses needs identified in "
|
| 321 |
+
f"recent community discussions"
|
| 322 |
+
)
|
| 323 |
+
|
| 324 |
+
# Add closing point
|
| 325 |
+
talking_points.append(
|
| 326 |
+
"Closing: Urge decision-makers to prioritize community oral health"
|
| 327 |
+
)
|
| 328 |
+
|
| 329 |
+
return talking_points
|
| 330 |
+
|
| 331 |
+
def _generate_social_media(
|
| 332 |
+
self,
|
| 333 |
+
opportunity: Dict[str, Any]
|
| 334 |
+
) -> Dict[str, str]:
|
| 335 |
+
"""Generate social media content."""
|
| 336 |
+
municipality = opportunity["municipality"]
|
| 337 |
+
topic = self._format_topic_name(opportunity["topic"])
|
| 338 |
+
|
| 339 |
+
twitter = (
|
| 340 |
+
f"🦷 {municipality} is considering {topic}! "
|
| 341 |
+
f"This could improve oral health for thousands. "
|
| 342 |
+
f"Contact your local officials to show support. "
|
| 343 |
+
f"#OralHealth #PublicHealth"
|
| 344 |
+
)
|
| 345 |
+
|
| 346 |
+
facebook = (
|
| 347 |
+
f"Important news for {municipality} residents!\n\n"
|
| 348 |
+
f"Our local government is discussing {topic}. "
|
| 349 |
+
f"This policy could significantly improve access to dental care "
|
| 350 |
+
f"for families in our community.\n\n"
|
| 351 |
+
f"Learn more and contact your representatives to voice your support: "
|
| 352 |
+
f"{opportunity.get('source_url', '')}"
|
| 353 |
+
)
|
| 354 |
+
|
| 355 |
+
return {
|
| 356 |
+
"twitter": twitter,
|
| 357 |
+
"facebook": facebook,
|
| 358 |
+
"instagram": twitter, # Similar to Twitter
|
| 359 |
+
"hashtags": ["OralHealth", "PublicHealth", municipality.replace(" ", "")]
|
| 360 |
+
}
|
| 361 |
+
|
| 362 |
+
def _generate_policy_brief(
|
| 363 |
+
self,
|
| 364 |
+
opportunity: Dict[str, Any],
|
| 365 |
+
doc: Dict[str, Any]
|
| 366 |
+
) -> Dict[str, Any]:
|
| 367 |
+
"""Generate a concise policy brief."""
|
| 368 |
+
topic = opportunity["topic"]
|
| 369 |
+
|
| 370 |
+
brief = {
|
| 371 |
+
"title": f"Policy Brief: {self._format_topic_name(topic)} in {opportunity['municipality']}",
|
| 372 |
+
"summary": (
|
| 373 |
+
f"This brief outlines the benefits and implementation considerations "
|
| 374 |
+
f"for {self._format_topic_name(topic)}."
|
| 375 |
+
),
|
| 376 |
+
"background": (
|
| 377 |
+
f"Current discussion in {opportunity['municipality']} presents "
|
| 378 |
+
f"an opportunity to improve community oral health."
|
| 379 |
+
),
|
| 380 |
+
"key_benefits": self.policy_benefits.get(topic, []),
|
| 381 |
+
"recommendations": [
|
| 382 |
+
"Approve the proposed policy",
|
| 383 |
+
"Allocate necessary funding",
|
| 384 |
+
"Establish implementation timeline",
|
| 385 |
+
"Monitor outcomes and adjust as needed"
|
| 386 |
+
],
|
| 387 |
+
"evidence_sources": [
|
| 388 |
+
"Centers for Disease Control and Prevention",
|
| 389 |
+
"American Dental Association",
|
| 390 |
+
"Community Preventive Services Task Force"
|
| 391 |
+
]
|
| 392 |
+
}
|
| 393 |
+
|
| 394 |
+
return brief
|
| 395 |
+
|
| 396 |
+
def _format_topic_name(self, topic: str) -> str:
|
| 397 |
+
"""Format topic identifier into readable name."""
|
| 398 |
+
topic_names = {
|
| 399 |
+
"water_fluoridation": "community water fluoridation",
|
| 400 |
+
"school_dental_screening": "school-based dental screening",
|
| 401 |
+
"medicaid_dental": "Medicaid dental coverage expansion",
|
| 402 |
+
"dental_clinic_funding": "community dental clinic funding",
|
| 403 |
+
"community_dental_program": "community dental programs",
|
| 404 |
+
"children_dental_health": "children's dental health initiatives",
|
| 405 |
+
"dental_care_access": "dental care access improvements"
|
| 406 |
+
}
|
| 407 |
+
|
| 408 |
+
return topic_names.get(topic, topic.replace("_", " "))
|
agents/base.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Core agent base classes and protocols for the multi-agent system.
|
| 3 |
+
"""
|
| 4 |
+
from abc import ABC, abstractmethod
|
| 5 |
+
from typing import Any, Dict, List, Optional, Union
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
from enum import Enum
|
| 8 |
+
from pydantic import BaseModel, Field
|
| 9 |
+
from loguru import logger
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class AgentRole(str, Enum):
    """Enumeration of agent roles in the system.

    Members are also ``str`` instances, so they compare equal to their
    lowercase string values.
    """

    SCRAPER = "scraper"
    PARSER = "parser"
    CLASSIFIER = "classifier"
    SENTIMENT_ANALYZER = "sentiment_analyzer"
    DEBATE_GRADER = "debate_grader"
    ADVOCACY_WRITER = "advocacy_writer"
    ORCHESTRATOR = "orchestrator"
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class MessageType(str, Enum):
    """Types of messages exchanged between agents.

    Members are also ``str`` instances, so they compare equal to their
    lowercase string values.
    """

    DATA = "data"
    COMMAND = "command"
    QUERY = "query"
    RESPONSE = "response"
    ERROR = "error"
    STATUS = "status"
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class AgentMessage(BaseModel):
    """Message structure for inter-agent communication."""

    # Required on construction
    message_id: str = Field(..., description="Unique message identifier")
    sender: AgentRole = Field(..., description="Sending agent role")
    recipient: AgentRole = Field(..., description="Receiving agent role")
    message_type: MessageType = Field(..., description="Type of message")

    # Filled in per instance when omitted; utcnow() yields a naive datetime
    timestamp: datetime = Field(default_factory=datetime.utcnow)
    payload: Dict[str, Any] = Field(default_factory=dict, description="Message payload")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata")

    class Config:
        # Emit datetimes as ISO-8601 strings when the model is dumped to JSON
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class AgentStatus(str, Enum):
    """Agent operational status.

    Members are also ``str`` instances, so they compare equal to their
    lowercase string values.
    """

    IDLE = "idle"
    PROCESSING = "processing"
    WAITING = "waiting"
    ERROR = "error"
    COMPLETED = "completed"
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class AgentState(BaseModel):
    """Current state of an agent."""

    # Identity (required)
    agent_id: str
    role: AgentRole

    # What the agent is doing right now
    status: AgentStatus = AgentStatus.IDLE
    current_task: Optional[str] = None

    # Running counters
    tasks_completed: int = 0
    tasks_failed: int = 0

    # Defaults to the construction time (naive datetime from utcnow())
    last_activity: datetime = Field(default_factory=datetime.utcnow)
    error_message: Optional[str] = None
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
class BaseAgent(ABC):
    """
    Abstract base class for all agents in the system.

    Each agent must implement the process method to handle incoming messages
    and perform its specific role in the pipeline. The base class supplies
    state bookkeeping (status, success/failure counters) and a helper for
    building outgoing messages.
    """

    def __init__(self, agent_id: str, role: AgentRole):
        """
        Initialize the base agent.

        Args:
            agent_id: Unique identifier for this agent instance
            role: The role this agent plays in the system
        """
        self.agent_id = agent_id
        self.role = role
        self.state = AgentState(agent_id=agent_id, role=role)
        self.message_queue: List[AgentMessage] = []
        logger.info(f"Initialized {role.value} agent: {agent_id}")

    @abstractmethod
    async def process(self, message: AgentMessage) -> Union[AgentMessage, List[AgentMessage]]:
        """
        Process an incoming message and return response(s).

        Args:
            message: The message to process

        Returns:
            One or more response messages
        """
        pass

    def update_status(self, status: AgentStatus, task: Optional[str] = None):
        """
        Update the agent's current status.

        Also overwrites ``current_task`` (with None when ``task`` is omitted)
        and refreshes ``last_activity``.

        Args:
            status: New operational status
            task: Optional description of the task in progress
        """
        self.state.status = status
        self.state.current_task = task
        self.state.last_activity = datetime.utcnow()
        logger.debug(f"{self.role.value} agent {self.agent_id} status: {status.value}")

    def log_success(self):
        """Log a successful task completion and return the agent to IDLE."""
        self.state.tasks_completed += 1
        self.update_status(AgentStatus.IDLE)

    def log_failure(self, error: str):
        """
        Log a task failure.

        Increments the failure counter, stores the error text on the state
        and switches the agent to ERROR status.

        Args:
            error: Human-readable description of what went wrong
        """
        self.state.tasks_failed += 1
        self.state.error_message = error
        self.update_status(AgentStatus.ERROR)
        logger.error(f"{self.role.value} agent {self.agent_id} error: {error}")

    async def send_message(
        self,
        recipient: AgentRole,
        message_type: MessageType,
        payload: Dict[str, Any],
        metadata: Optional[Dict[str, Any]] = None
    ) -> AgentMessage:
        """
        Create a message addressed to another agent.

        This method only builds the message, with this agent's role as the
        sender and a fresh UUID4 as the id, and returns it; it performs no
        delivery itself.

        Args:
            recipient: The receiving agent's role
            message_type: Type of message to send
            payload: Message content
            metadata: Optional metadata

        Returns:
            The created message
        """
        import uuid

        return AgentMessage(
            message_id=str(uuid.uuid4()),
            sender=self.role,
            recipient=recipient,
            message_type=message_type,
            payload=payload,
            metadata=metadata or {}
        )

    def get_state(self) -> AgentState:
        """Get the current state of the agent."""
        return self.state
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
class AgentMetrics(BaseModel):
    """Metrics for monitoring agent performance."""

    # Identity (required)
    agent_id: str
    role: AgentRole

    # Throughput and timing
    total_messages_processed: int = 0
    total_processing_time_seconds: float = 0.0
    average_processing_time_seconds: float = 0.0

    # Reliability
    success_rate: float = 0.0
    error_count: int = 0
    last_error: Optional[str] = None

    uptime_seconds: float = 0.0
|
agents/classifier.py
ADDED
|
@@ -0,0 +1,295 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Classifier Agent for identifying oral health policy topics in meeting minutes.
|
| 3 |
+
"""
|
| 4 |
+
import asyncio
|
| 5 |
+
from typing import List, Dict, Any, Optional, Set
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
from loguru import logger
|
| 8 |
+
|
| 9 |
+
from agents.base import BaseAgent, AgentRole, AgentMessage, MessageType, AgentStatus
|
| 10 |
+
from config import settings
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class PolicyTopic:
    """Enumeration of oral health policy topics.

    This is a plain namespace of string constants (not an ``enum.Enum``), so
    each attribute is an ordinary ``str``.
    """

    WATER_FLUORIDATION = "water_fluoridation"
    SCHOOL_DENTAL_SCREENING = "school_dental_screening"
    MEDICAID_DENTAL = "medicaid_dental"
    DENTAL_CLINIC_FUNDING = "dental_clinic_funding"
    COMMUNITY_DENTAL_PROGRAM = "community_dental_program"
    CHILDREN_DENTAL_HEALTH = "children_dental_health"
    DENTAL_CARE_ACCESS = "dental_care_access"
    OTHER_ORAL_HEALTH = "other_oral_health"
    NOT_RELEVANT = "not_relevant"
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class ClassifierAgent(BaseAgent):
    """
    Agent responsible for classifying documents by oral health policy topics.

    Classification is currently keyword-based: each topic has a list of
    phrases, and a document's score for a topic is the number of those
    phrases found in its text. ``_llm_classify`` is a placeholder for a
    future LLM-based pass and is not called by the pipeline.
    """

    def __init__(self, agent_id: str = "classifier-001"):
        """Initialize the classifier agent."""
        super().__init__(agent_id, AgentRole.CLASSIFIER)
        self._initialize_keywords()
        self.llm_client = None  # Will be initialized when needed

    def _initialize_keywords(self):
        """Initialize keyword patterns for each topic.

        All phrases are lowercase because they are matched as substrings of
        lowercased document text.
        """
        self.topic_keywords = {
            PolicyTopic.WATER_FLUORIDATION: [
                "fluoridation", "fluoride", "water fluoridation",
                "fluoridated water", "fluoride level", "fluoride treatment",
                "community water fluoridation"
            ],
            PolicyTopic.SCHOOL_DENTAL_SCREENING: [
                "school dental", "dental screening", "school screening",
                "school health screening", "dental exam", "school nurse",
                "student dental"
            ],
            PolicyTopic.MEDICAID_DENTAL: [
                "medicaid dental", "medicaid", "medicare dental",
                "public assistance dental", "low-income dental",
                "dental benefits", "dental coverage"
            ],
            PolicyTopic.DENTAL_CLINIC_FUNDING: [
                "dental clinic", "community dental clinic",
                "dental center", "dental facility", "clinic funding",
                "dental services funding"
            ],
            PolicyTopic.COMMUNITY_DENTAL_PROGRAM: [
                "community dental", "dental program", "oral health program",
                "dental outreach", "mobile dental", "dental van"
            ],
            PolicyTopic.CHILDREN_DENTAL_HEALTH: [
                "children's dental", "pediatric dental", "child dental",
                "kids dental", "youth dental", "infant oral health"
            ],
            PolicyTopic.DENTAL_CARE_ACCESS: [
                "dental access", "access to dental", "dental care",
                "oral health access", "dental services", "dental disparities"
            ]
        }

    async def process(self, message: AgentMessage) -> List[AgentMessage]:
        """
        Process classification commands.

        Classifies ``message.payload["documents"]`` in batches, drops the
        documents whose primary topic is NOT_RELEVANT and forwards the rest
        to the sentiment analyzer.

        Args:
            message: Message containing parsed documents to classify

        Returns:
            A one-element list: a DATA message for the sentiment analyzer on
            success, or an ERROR message for the orchestrator on failure.
        """
        self.update_status(AgentStatus.PROCESSING, "Classifying policy documents")

        try:
            documents = message.payload.get("documents", [])

            # Classify documents in batches
            batch_size = settings.classifier_batch_size
            classified_documents = []

            for i in range(0, len(documents), batch_size):
                batch = documents[i:i + batch_size]
                batch_results = await self._classify_batch(batch)
                classified_documents.extend(batch_results)

            # Filter to only relevant documents
            relevant_documents = [
                doc for doc in classified_documents
                if doc["classification"]["primary_topic"] != PolicyTopic.NOT_RELEVANT
            ]

            # Send classified documents to sentiment analyzer
            response = await self.send_message(
                AgentRole.SENTIMENT_ANALYZER,
                MessageType.DATA,
                {
                    "workflow_id": message.payload.get("workflow_id"),
                    "documents": relevant_documents,
                    "count": len(relevant_documents),
                    "filtered_count": len(documents) - len(relevant_documents)
                }
            )

            self.log_success()
            logger.info(
                f"Classified {len(documents)} documents, "
                f"{len(relevant_documents)} relevant to oral health policy"
            )

            return [response]

        except Exception as e:
            # Top-level boundary for this agent: record the failure and
            # report it to the orchestrator instead of propagating.
            self.log_failure(str(e))
            error_msg = await self.send_message(
                AgentRole.ORCHESTRATOR,
                MessageType.ERROR,
                {"error": str(e), "agent": self.agent_id}
            )
            return [error_msg]

    async def _classify_batch(
        self,
        documents: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Classify a batch of documents.

        Each document dict is mutated in place: a ``classification`` key is
        added. A document whose classification raised is marked NOT_RELEVANT
        with the error text attached, so one bad document does not fail the
        batch.

        Args:
            documents: Batch of documents to classify

        Returns:
            Documents with classification results
        """
        tasks = [self._classify_document(doc) for doc in documents]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        classified = []
        for doc, result in zip(documents, results):
            if isinstance(result, Exception):
                # .get: the error path must not raise on a document that
                # happens to lack an id.
                logger.error(f"Classification error for {doc.get('document_id')}: {result}")
                doc["classification"] = {
                    "primary_topic": PolicyTopic.NOT_RELEVANT,
                    "error": str(result)
                }
            else:
                doc["classification"] = result

            classified.append(doc)

        return classified

    async def _classify_document(
        self,
        doc: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Classify a single document.

        Args:
            doc: Document to classify

        Returns:
            Classification results: ``primary_topic``, ``all_topics``,
            ``topic_scores``, ``confidence``, ``relevant_excerpts`` and an
            ISO-formatted ``classified_at`` timestamp.
        """
        text_lower = self._get_searchable_text(doc).lower()

        # Keyword-based classification: score = number of distinct keyword
        # phrases present; topics with no hits are left out entirely.
        topic_scores = {}
        for topic, keywords in self.topic_keywords.items():
            score = sum(1 for keyword in keywords if keyword in text_lower)
            if score > 0:
                topic_scores[topic] = score

        # Determine primary topic
        if topic_scores:
            primary_topic = max(topic_scores, key=topic_scores.get)
            confidence = "high" if topic_scores[primary_topic] >= 3 else "medium"
            all_topics = list(topic_scores.keys())
        else:
            primary_topic = PolicyTopic.NOT_RELEVANT
            confidence = "high"
            all_topics = []

        return {
            "primary_topic": primary_topic,
            "all_topics": all_topics,
            "topic_scores": topic_scores,
            "confidence": confidence,
            "relevant_excerpts": self._extract_relevant_excerpts(doc, primary_topic),
            "classified_at": datetime.utcnow().isoformat()
        }

    def _get_searchable_text(self, doc: Dict[str, Any]) -> str:
        """
        Extract searchable text from document.

        Joins the title, full text, agenda item descriptions and discussion
        section texts with single spaces. Fields that are missing or None
        contribute an empty string.
        """
        parts = [
            doc.get("raw_title") or "",
            doc.get("full_text") or ""
        ]

        # Add agenda items
        for item in doc.get("agenda_items") or []:
            parts.append(item.get("description") or "")

        # Add discussion sections
        for section in doc.get("discussion_sections") or []:
            parts.append(section.get("text") or "")

        return " ".join(parts)

    def _extract_relevant_excerpts(
        self,
        doc: Dict[str, Any],
        topic: str
    ) -> List[Dict[str, str]]:
        """
        Extract text excerpts relevant to the topic.

        Discussion sections are scanned before agenda items, and at most five
        excerpts are returned. Discussion text is truncated to 500
        characters; agenda descriptions are kept whole.
        """
        if topic == PolicyTopic.NOT_RELEVANT:
            return []

        keywords = self.topic_keywords.get(topic, [])
        excerpts = []

        # Check discussion sections
        for section in doc.get("discussion_sections") or []:
            text = section.get("text") or ""
            text_lower = text.lower()

            if any(keyword in text_lower for keyword in keywords):
                excerpts.append({
                    "source": "discussion",
                    "text": text[:500],  # First 500 chars
                    "section_id": section.get("section_id")
                })

        # Check agenda items
        for item in doc.get("agenda_items") or []:
            desc = item.get("description") or ""
            desc_lower = desc.lower()

            if any(keyword in desc_lower for keyword in keywords):
                excerpts.append({
                    "source": "agenda",
                    "text": desc,
                    "item_number": item.get("number")
                })

        return excerpts[:5]  # Return top 5 excerpts

    async def _llm_classify(
        self,
        text: str,
        preliminary_topics: List[str]
    ) -> Dict[str, Any]:
        """
        Use LLM for nuanced classification when keywords are ambiguous.

        Placeholder: no model is called. The first preliminary topic (or
        NOT_RELEVANT when there is none) is echoed back with a fixed
        confidence.

        Args:
            text: Text to classify
            preliminary_topics: Topics identified by keyword matching

        Returns:
            LLM classification results
        """
        # This would use OpenAI API or similar
        # Placeholder for now
        return {
            "llm_topic": preliminary_topics[0] if preliminary_topics else PolicyTopic.NOT_RELEVANT,
            "llm_confidence": 0.8,
            "llm_reasoning": "Based on keyword analysis"
        }
|
agents/debate_grader.py
ADDED
|
@@ -0,0 +1,424 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Debate Grader Agent for evaluating government decisions using debate framework.
|
| 3 |
+
|
| 4 |
+
Evaluates decisions across three dimensions:
|
| 5 |
+
- Harms: The problem/crisis identified
|
| 6 |
+
- Solvency: How the proposed solution addresses the problem
|
| 7 |
+
- Topicality: Whether the solution fits within jurisdiction's authority
|
| 8 |
+
"""
|
| 9 |
+
import asyncio
|
| 10 |
+
from typing import List, Dict, Any, Optional
|
| 11 |
+
from datetime import datetime
|
| 12 |
+
from loguru import logger
|
| 13 |
+
|
| 14 |
+
from agents.base import BaseAgent, AgentRole, AgentMessage, MessageType, AgentStatus
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class DebateDimension:
    """Enumeration of debate evaluation dimensions.

    Plain string constants rather than an ``enum.Enum``. The same three
    strings appear as the dimension keys of the grade dictionaries built
    by ``DebateGraderAgent``.
    """
    HARMS = "harms"  # The problem
    SOLVENCY = "solvency"  # The fix
    TOPICALITY = "topicality"  # The scope
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class DebateScore:
    """Score levels for each debate dimension.

    Plain string constants. The ranges in the comments are the thresholds
    applied by ``DebateGraderAgent._score_to_grade`` to a score on the
    0-5 scale.
    """
    EXCELLENT = "excellent"  # score >= 4.0
    GOOD = "good"  # 3.0 <= score < 4.0
    FAIR = "fair"  # 2.0 <= score < 3.0
    WEAK = "weak"  # 1.0 <= score < 2.0
    MISSING = "missing"  # score < 1.0
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class DebateGraderAgent(BaseAgent):
|
| 34 |
+
"""
|
| 35 |
+
Agent responsible for grading government decisions using debate framework.
|
| 36 |
+
|
| 37 |
+
Translates debate concepts for laypeople:
|
| 38 |
+
- Harms → "The Problem": Why is this a crisis in our community?
|
| 39 |
+
- Solvency → "The Fix": How does this solution actually work?
|
| 40 |
+
- Topicality → "The Scope": Does the government have authority to do this?
|
| 41 |
+
"""
|
| 42 |
+
|
| 43 |
+
def __init__(self, agent_id: str = "debate-grader-001") -> None:
    """Initialize the debate grader agent.

    Args:
        agent_id: Identifier forwarded to ``BaseAgent.__init__``.
    """
    # NOTE(review): the grader registers under AgentRole.SENTIMENT_ANALYZER
    # rather than a grader-specific role — confirm this is intentional.
    super().__init__(agent_id, AgentRole.SENTIMENT_ANALYZER)
    # Populate the keyword tables read by the _grade_* methods.
    self._initialize_criteria()
|
| 47 |
+
|
| 48 |
+
def _initialize_criteria(self):
|
| 49 |
+
"""Initialize evaluation criteria for each dimension."""
|
| 50 |
+
|
| 51 |
+
# Harms evaluation keywords
|
| 52 |
+
self.harms_indicators = {
|
| 53 |
+
"problem_identification": [
|
| 54 |
+
"crisis", "emergency", "critical", "urgent need",
|
| 55 |
+
"widespread problem", "affecting", "impacting",
|
| 56 |
+
"suffering", "lack of", "shortage", "gap in services"
|
| 57 |
+
],
|
| 58 |
+
"data_evidence": [
|
| 59 |
+
"statistics", "data shows", "research indicates",
|
| 60 |
+
"study found", "percent", "%", "number of people",
|
| 61 |
+
"cases", "instances", "reports"
|
| 62 |
+
],
|
| 63 |
+
"affected_population": [
|
| 64 |
+
"children", "families", "residents", "citizens",
|
| 65 |
+
"low-income", "vulnerable", "underserved",
|
| 66 |
+
"community members", "students", "seniors"
|
| 67 |
+
]
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
# Solvency evaluation keywords
|
| 71 |
+
self.solvency_indicators = {
|
| 72 |
+
"solution_clarity": [
|
| 73 |
+
"will", "would", "proposes to", "plans to",
|
| 74 |
+
"implement", "establish", "create", "provide",
|
| 75 |
+
"offer", "deliver", "fund", "allocate"
|
| 76 |
+
],
|
| 77 |
+
"mechanism": [
|
| 78 |
+
"through", "by", "using", "via", "process",
|
| 79 |
+
"program", "initiative", "partnership",
|
| 80 |
+
"collaboration", "service", "system"
|
| 81 |
+
],
|
| 82 |
+
"evidence_of_effectiveness": [
|
| 83 |
+
"proven", "successful in", "works in",
|
| 84 |
+
"demonstrated", "track record", "best practice",
|
| 85 |
+
"evidence-based", "research-backed"
|
| 86 |
+
],
|
| 87 |
+
"implementation_plan": [
|
| 88 |
+
"timeline", "budget", "staff", "resources",
|
| 89 |
+
"phase", "rollout", "launch", "start date",
|
| 90 |
+
"completion", "milestones"
|
| 91 |
+
]
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
# Topicality evaluation keywords
|
| 95 |
+
self.topicality_indicators = {
|
| 96 |
+
"legal_authority": [
|
| 97 |
+
"authority", "jurisdiction", "mandate",
|
| 98 |
+
"chartered to", "empowered to", "authorized",
|
| 99 |
+
"within our purview", "responsibility"
|
| 100 |
+
],
|
| 101 |
+
"precedent": [
|
| 102 |
+
"previously", "historically", "past practice",
|
| 103 |
+
"similar actions", "other cities", "state law",
|
| 104 |
+
"federal law", "code", "ordinance"
|
| 105 |
+
],
|
| 106 |
+
"scope_appropriateness": [
|
| 107 |
+
"city council", "county commission", "board",
|
| 108 |
+
"department", "local government", "municipal",
|
| 109 |
+
"within scope", "appropriate for"
|
| 110 |
+
]
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
async def process(self, message: AgentMessage) -> List[AgentMessage]:
    """
    Grade every document in the message and report the results.

    Each document in ``message.payload["documents"]`` is graded and gets
    the grade stored under its ``debate_grade`` key (the document
    dictionaries are updated in place). The graded documents and their
    aggregate insights are then sent to the orchestrator.

    Args:
        message: Message containing decisions/documents to grade

    Returns:
        Single-element list holding the message returned by ``send_message``

    Raises:
        Exception: Any failure is logged, recorded as the agent's error
            status, and re-raised unchanged.
    """
    self.update_status(AgentStatus.PROCESSING, "Grading decisions with debate framework")

    try:
        graded_documents = []
        for document in message.payload.get("documents", []):
            document["debate_grade"] = await self._grade_document(document)
            graded_documents.append(document)

        # Aggregate view across everything graded in this batch
        result_payload = {
            "documents": graded_documents,
            "insights": self._generate_insights(graded_documents),
            "graded_count": len(graded_documents),
        }

        response = await self.send_message(
            recipient=AgentRole.ORCHESTRATOR,
            message_type=MessageType.RESPONSE,
            payload=result_payload,
        )

        self.update_status(AgentStatus.COMPLETED, f"Graded {len(graded_documents)} decisions")
        return [response]

    except Exception as e:
        logger.error(f"Debate grading failed: {e}")
        self.update_status(AgentStatus.ERROR, str(e))
        raise
|
| 156 |
+
|
| 157 |
+
async def _grade_document(self, document: Dict[str, Any]) -> Dict[str, Any]:
|
| 158 |
+
"""
|
| 159 |
+
Grade a single document across all debate dimensions.
|
| 160 |
+
|
| 161 |
+
Args:
|
| 162 |
+
document: Document to grade
|
| 163 |
+
|
| 164 |
+
Returns:
|
| 165 |
+
Dictionary with grades for each dimension
|
| 166 |
+
"""
|
| 167 |
+
text = document.get("content", "").lower()
|
| 168 |
+
title = document.get("title", "").lower()
|
| 169 |
+
combined_text = f"{title} {text}"
|
| 170 |
+
|
| 171 |
+
# Grade each dimension
|
| 172 |
+
harms_score = self._grade_harms(combined_text)
|
| 173 |
+
solvency_score = self._grade_solvency(combined_text)
|
| 174 |
+
topicality_score = self._grade_topicality(combined_text)
|
| 175 |
+
|
| 176 |
+
# Calculate overall score
|
| 177 |
+
overall_score = self._calculate_overall_score(
|
| 178 |
+
harms_score, solvency_score, topicality_score
|
| 179 |
+
)
|
| 180 |
+
|
| 181 |
+
return {
|
| 182 |
+
"dimensions": {
|
| 183 |
+
"harms": {
|
| 184 |
+
"score": harms_score["score"],
|
| 185 |
+
"grade": harms_score["grade"],
|
| 186 |
+
"explanation": harms_score["explanation"],
|
| 187 |
+
"layperson_label": "The Problem",
|
| 188 |
+
"layperson_question": "Why is this a crisis in our community?"
|
| 189 |
+
},
|
| 190 |
+
"solvency": {
|
| 191 |
+
"score": solvency_score["score"],
|
| 192 |
+
"grade": solvency_score["grade"],
|
| 193 |
+
"explanation": solvency_score["explanation"],
|
| 194 |
+
"layperson_label": "The Fix",
|
| 195 |
+
"layperson_question": "How does this solution actually work?"
|
| 196 |
+
},
|
| 197 |
+
"topicality": {
|
| 198 |
+
"score": topicality_score["score"],
|
| 199 |
+
"grade": topicality_score["grade"],
|
| 200 |
+
"explanation": topicality_score["explanation"],
|
| 201 |
+
"layperson_label": "The Scope",
|
| 202 |
+
"layperson_question": "Does the government have authority to do this?"
|
| 203 |
+
}
|
| 204 |
+
},
|
| 205 |
+
"overall": {
|
| 206 |
+
"score": overall_score,
|
| 207 |
+
"grade": self._score_to_grade(overall_score),
|
| 208 |
+
"summary": self._generate_summary(harms_score, solvency_score, topicality_score)
|
| 209 |
+
},
|
| 210 |
+
"timestamp": datetime.utcnow().isoformat()
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
def _grade_harms(self, text: str) -> Dict[str, Any]:
|
| 214 |
+
"""Grade the 'harms' dimension - problem identification."""
|
| 215 |
+
score = 0
|
| 216 |
+
max_score = 5
|
| 217 |
+
details = []
|
| 218 |
+
|
| 219 |
+
# Check for problem identification (0-2 points)
|
| 220 |
+
problem_count = sum(1 for keyword in self.harms_indicators["problem_identification"] if keyword in text)
|
| 221 |
+
if problem_count >= 3:
|
| 222 |
+
score += 2
|
| 223 |
+
details.append("Strong problem identification")
|
| 224 |
+
elif problem_count >= 1:
|
| 225 |
+
score += 1
|
| 226 |
+
details.append("Problem mentioned but not detailed")
|
| 227 |
+
|
| 228 |
+
# Check for data/evidence (0-2 points)
|
| 229 |
+
data_count = sum(1 for keyword in self.harms_indicators["data_evidence"] if keyword in text)
|
| 230 |
+
if data_count >= 2:
|
| 231 |
+
score += 2
|
| 232 |
+
details.append("Data-driven evidence provided")
|
| 233 |
+
elif data_count >= 1:
|
| 234 |
+
score += 1
|
| 235 |
+
details.append("Some evidence mentioned")
|
| 236 |
+
|
| 237 |
+
# Check for affected population (0-1 point)
|
| 238 |
+
population_count = sum(1 for keyword in self.harms_indicators["affected_population"] if keyword in text)
|
| 239 |
+
if population_count >= 1:
|
| 240 |
+
score += 1
|
| 241 |
+
details.append("Affected population identified")
|
| 242 |
+
|
| 243 |
+
return {
|
| 244 |
+
"score": score,
|
| 245 |
+
"max_score": max_score,
|
| 246 |
+
"grade": self._score_to_grade(score / max_score * 5),
|
| 247 |
+
"explanation": "; ".join(details) if details else "No clear problem statement"
|
| 248 |
+
}
|
| 249 |
+
|
| 250 |
+
def _grade_solvency(self, text: str) -> Dict[str, Any]:
|
| 251 |
+
"""Grade the 'solvency' dimension - solution effectiveness."""
|
| 252 |
+
score = 0
|
| 253 |
+
max_score = 5
|
| 254 |
+
details = []
|
| 255 |
+
|
| 256 |
+
# Check for solution clarity (0-1 point)
|
| 257 |
+
solution_count = sum(1 for keyword in self.solvency_indicators["solution_clarity"] if keyword in text)
|
| 258 |
+
if solution_count >= 2:
|
| 259 |
+
score += 1
|
| 260 |
+
details.append("Clear solution proposed")
|
| 261 |
+
|
| 262 |
+
# Check for mechanism (0-2 points)
|
| 263 |
+
mechanism_count = sum(1 for keyword in self.solvency_indicators["mechanism"] if keyword in text)
|
| 264 |
+
if mechanism_count >= 3:
|
| 265 |
+
score += 2
|
| 266 |
+
details.append("Implementation mechanism described")
|
| 267 |
+
elif mechanism_count >= 1:
|
| 268 |
+
score += 1
|
| 269 |
+
details.append("Basic approach outlined")
|
| 270 |
+
|
| 271 |
+
# Check for evidence of effectiveness (0-1 point)
|
| 272 |
+
evidence_count = sum(1 for keyword in self.solvency_indicators["evidence_of_effectiveness"] if keyword in text)
|
| 273 |
+
if evidence_count >= 1:
|
| 274 |
+
score += 1
|
| 275 |
+
details.append("Evidence-based approach")
|
| 276 |
+
|
| 277 |
+
# Check for implementation plan (0-1 point)
|
| 278 |
+
plan_count = sum(1 for keyword in self.solvency_indicators["implementation_plan"] if keyword in text)
|
| 279 |
+
if plan_count >= 2:
|
| 280 |
+
score += 1
|
| 281 |
+
details.append("Implementation plan included")
|
| 282 |
+
|
| 283 |
+
return {
|
| 284 |
+
"score": score,
|
| 285 |
+
"max_score": max_score,
|
| 286 |
+
"grade": self._score_to_grade(score / max_score * 5),
|
| 287 |
+
"explanation": "; ".join(details) if details else "No clear solution mechanism"
|
| 288 |
+
}
|
| 289 |
+
|
| 290 |
+
def _grade_topicality(self, text: str) -> Dict[str, Any]:
|
| 291 |
+
"""Grade the 'topicality' dimension - scope appropriateness."""
|
| 292 |
+
score = 0
|
| 293 |
+
max_score = 5
|
| 294 |
+
details = []
|
| 295 |
+
|
| 296 |
+
# Check for legal authority (0-2 points)
|
| 297 |
+
authority_count = sum(1 for keyword in self.topicality_indicators["legal_authority"] if keyword in text)
|
| 298 |
+
if authority_count >= 2:
|
| 299 |
+
score += 2
|
| 300 |
+
details.append("Legal authority cited")
|
| 301 |
+
elif authority_count >= 1:
|
| 302 |
+
score += 1
|
| 303 |
+
details.append("Authority mentioned")
|
| 304 |
+
|
| 305 |
+
# Check for precedent (0-2 points)
|
| 306 |
+
precedent_count = sum(1 for keyword in self.topicality_indicators["precedent"] if keyword in text)
|
| 307 |
+
if precedent_count >= 2:
|
| 308 |
+
score += 2
|
| 309 |
+
details.append("Precedent established")
|
| 310 |
+
elif precedent_count >= 1:
|
| 311 |
+
score += 1
|
| 312 |
+
details.append("Some precedent referenced")
|
| 313 |
+
|
| 314 |
+
# Check for scope appropriateness (0-1 point)
|
| 315 |
+
scope_count = sum(1 for keyword in self.topicality_indicators["scope_appropriateness"] if keyword in text)
|
| 316 |
+
if scope_count >= 1:
|
| 317 |
+
score += 1
|
| 318 |
+
details.append("Within appropriate scope")
|
| 319 |
+
|
| 320 |
+
return {
|
| 321 |
+
"score": score,
|
| 322 |
+
"max_score": max_score,
|
| 323 |
+
"grade": self._score_to_grade(score / max_score * 5),
|
| 324 |
+
"explanation": "; ".join(details) if details else "Unclear jurisdictional authority"
|
| 325 |
+
}
|
| 326 |
+
|
| 327 |
+
def _score_to_grade(self, normalized_score: float) -> str:
    """Map a score on the 0-5 scale to a ``DebateScore`` label.

    Args:
        normalized_score: Score already scaled to the 0-5 range.

    Returns:
        The label of the highest threshold the score reaches, or
        ``DebateScore.MISSING`` when it is below 1.0.
    """
    # Inclusive lower bounds, highest first; the first one reached wins.
    thresholds = (
        (4.0, DebateScore.EXCELLENT),
        (3.0, DebateScore.GOOD),
        (2.0, DebateScore.FAIR),
        (1.0, DebateScore.WEAK),
    )
    for lower_bound, label in thresholds:
        if normalized_score >= lower_bound:
            return label
    return DebateScore.MISSING
|
| 339 |
+
|
| 340 |
+
def _calculate_overall_score(
|
| 341 |
+
self,
|
| 342 |
+
harms: Dict[str, Any],
|
| 343 |
+
solvency: Dict[str, Any],
|
| 344 |
+
topicality: Dict[str, Any]
|
| 345 |
+
) -> float:
|
| 346 |
+
"""Calculate weighted overall score."""
|
| 347 |
+
# Weight: Harms 40%, Solvency 40%, Topicality 20%
|
| 348 |
+
harms_normalized = (harms["score"] / harms["max_score"]) * 5
|
| 349 |
+
solvency_normalized = (solvency["score"] / solvency["max_score"]) * 5
|
| 350 |
+
topicality_normalized = (topicality["score"] / topicality["max_score"]) * 5
|
| 351 |
+
|
| 352 |
+
overall = (harms_normalized * 0.4) + (solvency_normalized * 0.4) + (topicality_normalized * 0.2)
|
| 353 |
+
return round(overall, 2)
|
| 354 |
+
|
| 355 |
+
def _generate_summary(
    self,
    harms: Dict[str, Any],
    solvency: Dict[str, Any],
    topicality: Dict[str, Any]
) -> str:
    """Build a one-line plain-language verdict from the three grades.

    A dimension counts as strong when its ``grade`` is
    ``DebateScore.EXCELLENT`` or ``DebateScore.GOOD``.

    Args:
        harms: Harms result with a ``grade`` entry.
        solvency: Solvency result with a ``grade`` entry.
        topicality: Topicality result with a ``grade`` entry.

    Returns:
        Three phrases joined by ``"; "``, passed through
        ``str.capitalize()``.
    """
    strong_grades = [DebateScore.EXCELLENT, DebateScore.GOOD]

    # (graded result, phrase when strong, phrase otherwise)
    verdicts = (
        (harms, "Strong problem identification", "Weak problem statement"),
        (solvency, "clear solution", "unclear fix"),
        (topicality, "within authority", "questionable scope"),
    )
    parts = [
        strong_phrase if result["grade"] in strong_grades else weak_phrase
        for result, strong_phrase, weak_phrase in verdicts
    ]

    return "; ".join(parts).capitalize()
|
| 380 |
+
|
| 381 |
+
def _generate_insights(self, documents: List[Dict[str, Any]]) -> Dict[str, Any]:
|
| 382 |
+
"""Generate aggregate insights from all graded documents."""
|
| 383 |
+
if not documents:
|
| 384 |
+
return {}
|
| 385 |
+
|
| 386 |
+
total = len(documents)
|
| 387 |
+
dimension_scores = {
|
| 388 |
+
"harms": [],
|
| 389 |
+
"solvency": [],
|
| 390 |
+
"topicality": []
|
| 391 |
+
}
|
| 392 |
+
overall_scores = []
|
| 393 |
+
|
| 394 |
+
for doc in documents:
|
| 395 |
+
grade = doc.get("debate_grade", {})
|
| 396 |
+
dimensions = grade.get("dimensions", {})
|
| 397 |
+
|
| 398 |
+
for dim in ["harms", "solvency", "topicality"]:
|
| 399 |
+
if dim in dimensions:
|
| 400 |
+
dimension_scores[dim].append(dimensions[dim]["score"])
|
| 401 |
+
|
| 402 |
+
if "overall" in grade:
|
| 403 |
+
overall_scores.append(grade["overall"]["score"])
|
| 404 |
+
|
| 405 |
+
# Calculate averages
|
| 406 |
+
insights = {
|
| 407 |
+
"total_documents": total,
|
| 408 |
+
"average_scores": {
|
| 409 |
+
"harms": round(sum(dimension_scores["harms"]) / len(dimension_scores["harms"]), 2) if dimension_scores["harms"] else 0,
|
| 410 |
+
"solvency": round(sum(dimension_scores["solvency"]) / len(dimension_scores["solvency"]), 2) if dimension_scores["solvency"] else 0,
|
| 411 |
+
"topicality": round(sum(dimension_scores["topicality"]) / len(dimension_scores["topicality"]), 2) if dimension_scores["topicality"] else 0,
|
| 412 |
+
"overall": round(sum(overall_scores) / len(overall_scores), 2) if overall_scores else 0
|
| 413 |
+
},
|
| 414 |
+
"strongest_dimension": max(
|
| 415 |
+
dimension_scores.items(),
|
| 416 |
+
key=lambda x: sum(x[1]) / len(x[1]) if x[1] else 0
|
| 417 |
+
)[0] if any(dimension_scores.values()) else None,
|
| 418 |
+
"weakest_dimension": min(
|
| 419 |
+
dimension_scores.items(),
|
| 420 |
+
key=lambda x: sum(x[1]) / len(x[1]) if x[1] else 0
|
| 421 |
+
)[0] if any(dimension_scores.values()) else None
|
| 422 |
+
}
|
| 423 |
+
|
| 424 |
+
return insights
|
agents/mlflow_base.py
ADDED
|
@@ -0,0 +1,307 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MLflow-based agent foundation for Databricks Agent Bricks.
|
| 3 |
+
|
| 4 |
+
Provides:
|
| 5 |
+
- MLflow Pyfunc model wrappers for agents
|
| 6 |
+
- Unity Catalog integration
|
| 7 |
+
- Automatic tracing and observability
|
| 8 |
+
- Model serving compatibility
|
| 9 |
+
"""
|
| 10 |
+
from typing import Any, Dict, List, Optional, Union
|
| 11 |
+
from abc import ABC, abstractmethod
|
| 12 |
+
import mlflow
|
| 13 |
+
from mlflow.pyfunc import PythonModel
|
| 14 |
+
from mlflow.models import infer_signature
|
| 15 |
+
from mlflow.tracking import MlflowClient
|
| 16 |
+
import pandas as pd
|
| 17 |
+
from datetime import datetime
|
| 18 |
+
from loguru import logger
|
| 19 |
+
|
| 20 |
+
from agents.base import AgentRole, AgentMessage, AgentStatus
|
| 21 |
+
from config import settings
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class MLflowAgentBase(PythonModel, ABC):
|
| 25 |
+
"""
|
| 26 |
+
Base class for agents that can be deployed via MLflow Model Serving.
|
| 27 |
+
|
| 28 |
+
Integrates with:
|
| 29 |
+
- Unity Catalog for governance
|
| 30 |
+
- MLflow Tracking for experimentation
|
| 31 |
+
- Databricks Model Serving for deployment
|
| 32 |
+
- Mosaic AI Agent Framework for evaluation
|
| 33 |
+
"""
|
| 34 |
+
|
| 35 |
+
def __init__(self, agent_id: str, role: AgentRole) -> None:
    """
    Initialize MLflow agent.

    Stores the identity of the agent, marks it idle, and creates the
    MLflow client used for model-registry lookups.

    Args:
        agent_id: Unique identifier for this agent
        role: Agent role in the pipeline
    """
    super().__init__()
    self.agent_id = agent_id
    self.role = role
    self.status = AgentStatus.IDLE
    # Used by deploy_to_model_serving() to look up the model version.
    self.client = MlflowClient()
|
| 48 |
+
|
| 49 |
+
def load_context(self, context):
    """
    Hook called during model loading.

    The base implementation only logs which agent role is being loaded;
    nothing is read from ``context``.

    Args:
        context: MLflow context with model artifacts
    """
    role_name = self.role.value
    logger.info(f"Loading {role_name} agent from MLflow context")
|
| 59 |
+
|
| 60 |
+
@abstractmethod
def _process_request(self, request: Dict[str, Any]) -> Dict[str, Any]:
    """
    Process a single agent request.

    Subclasses must override this. ``predict`` calls it once per request
    and then adds ``status``, ``agent_id`` and ``timestamp`` entries to
    the returned dictionary.

    Args:
        request: Input request dictionary

    Returns:
        Response dictionary
    """
    pass
|
| 72 |
+
|
| 73 |
+
def predict(
    self,
    context,
    model_input: Union[pd.DataFrame, Dict[str, Any], List[Dict[str, Any]]]
) -> Union[pd.DataFrame, Dict[str, Any], List[Dict[str, Any]]]:
    """
    MLflow Pyfunc predict interface.

    This is the main entry point when the agent is deployed as a Model
    Serving endpoint. Every request is handled by ``_process_request``
    inside its own tracing span. A request that raises produces an error
    result instead of aborting the batch.

    Args:
        context: MLflow context (not used here)
        model_input: Input data (DataFrame, dict, or list of dicts)

    Returns:
        A DataFrame for DataFrame input, a single result dictionary for
        dict input, otherwise a list of result dictionaries. Each result
        carries ``status``, ``agent_id`` and ``timestamp``; error results
        also carry ``error``.

    Raises:
        Exception: Failures outside per-request processing (for example
            while normalizing the input) are recorded on the span, logged,
            and re-raised.
    """
    # Enable MLflow tracing for observability
    with mlflow.start_span(name=f"{self.role.value}_agent") as span:
        span.set_attribute("agent_id", self.agent_id)
        span.set_attribute("agent_role", self.role.value)

        try:
            # Convert input to standard format
            return_df = isinstance(model_input, pd.DataFrame)
            if return_df:
                requests = model_input.to_dict('records')
            elif isinstance(model_input, dict):
                requests = [model_input]
            else:
                requests = model_input

            # Process each request with tracing
            results = []
            for idx, request in enumerate(requests):
                with mlflow.start_span(name=f"process_request_{idx}") as req_span:
                    req_span.set_attribute("request_id", request.get("request_id", f"req_{idx}"))

                    try:
                        outcome = self._process_request(request)
                        if isinstance(outcome, dict):
                            # Copy so the metadata added below does not
                            # mutate an object the subclass may still hold.
                            result = dict(outcome)
                        else:
                            # A subclass may hand back a non-dict value
                            # (e.g. a raw chain output). Wrap it rather
                            # than failing on the item assignments below
                            # and misreporting the request as an error.
                            result = {"output": outcome}
                        result["status"] = "success"
                        result["agent_id"] = self.agent_id
                        result["timestamp"] = datetime.utcnow().isoformat()
                        results.append(result)

                        req_span.set_attribute("status", "success")

                    except Exception as e:
                        # Keep going: one bad request must not fail the batch.
                        error_result = {
                            "status": "error",
                            "error": str(e),
                            "agent_id": self.agent_id,
                            "timestamp": datetime.utcnow().isoformat()
                        }
                        results.append(error_result)

                        req_span.set_attribute("status", "error")
                        req_span.set_attribute("error", str(e))
                        logger.error(f"Error processing request {idx}: {e}")

            # Return in requested format
            if return_df:
                return pd.DataFrame(results)
            elif len(results) == 1 and not isinstance(model_input, list):
                return results[0]
            else:
                return results

        except Exception as e:
            span.set_attribute("status", "error")
            span.set_attribute("error", str(e))
            logger.error(f"Error in {self.role.value} agent: {e}")
            raise
|
| 148 |
+
|
| 149 |
+
def log_to_mlflow(
    self,
    model_name: str,
    artifact_path: str = "agent",
    registered_model_name: Optional[str] = None,
    **kwargs
) -> str:
    """
    Log this agent to MLflow.

    Opens a new MLflow run, records the agent metadata as parameters,
    infers the model signature by running ``predict`` on the example
    input, and logs the agent as a pyfunc model.

    Args:
        model_name: Name for the MLflow run
        artifact_path: Path within the run to store the model
        registered_model_name: Unity Catalog model name (e.g., "main.agents.scraper")
        **kwargs: Additional MLflow logging parameters

    Returns:
        ID of the MLflow run that holds the logged model
    """
    with mlflow.start_run(run_name=model_name) as run:
        # Log agent metadata
        mlflow.log_param("agent_id", self.agent_id)
        mlflow.log_param("agent_role", self.role.value)
        mlflow.log_param("framework", "databricks-agent-bricks")

        # Create example input/output for signature
        # (this actually executes the agent once on the example input)
        example_input = self._get_example_input()
        example_output = self.predict(None, example_input)
        signature = infer_signature(example_input, example_output)

        # Log the model
        mlflow.pyfunc.log_model(
            artifact_path=artifact_path,
            python_model=self,
            signature=signature,
            registered_model_name=registered_model_name,
            **kwargs
        )

        logger.info(f"Logged {self.role.value} agent to MLflow run {run.info.run_id}")

        if registered_model_name:
            logger.info(f"Registered model as {registered_model_name}")

        return run.info.run_id
|
| 191 |
+
|
| 192 |
+
@abstractmethod
def _get_example_input(self) -> Union[pd.DataFrame, Dict[str, Any]]:
    """
    Get example input for MLflow signature inference.

    Subclasses must override this. ``log_to_mlflow`` passes the returned
    value to ``predict`` and to ``infer_signature``.

    Returns:
        Example input data
    """
    pass
|
| 201 |
+
|
| 202 |
+
def deploy_to_model_serving(
    self,
    model_name: str,
    endpoint_name: str,
    workload_size: str = "Small",
    scale_to_zero: bool = True
) -> str:
    """
    Deploy this agent to Databricks Model Serving.

    Serves the highest registered version of ``model_name``. The endpoint
    is created, or its configuration is updated when an endpoint with
    that name already exists. Both calls block until the operation
    finishes.

    Args:
        model_name: Registered model name in Unity Catalog
        endpoint_name: Name for the serving endpoint
        workload_size: Endpoint size (Small, Medium, Large)
        scale_to_zero: Whether to scale to zero when idle

    Returns:
        Endpoint URL

    Raises:
        ValueError: If the model has no registered versions.
    """
    from databricks.sdk import WorkspaceClient
    from databricks.sdk.service.serving import ServedEntityInput, EndpointCoreConfigInput

    w = WorkspaceClient(
        host=settings.databricks_host,
        token=settings.databricks_token
    )

    # Get latest model version. Stage-based lookup (get_latest_versions)
    # is not available for Unity Catalog models, so list the versions and
    # take the numerically highest one.
    versions = self.client.search_model_versions(f"name='{model_name}'")
    if not versions:
        raise ValueError(f"No registered versions found for model '{model_name}'")
    latest_version = str(max(int(mv.version) for mv in versions))

    # Create or update endpoint
    endpoint_config = EndpointCoreConfigInput(
        name=endpoint_name,
        served_entities=[
            ServedEntityInput(
                entity_name=model_name,
                entity_version=latest_version,
                workload_size=workload_size,
                scale_to_zero_enabled=scale_to_zero
            )
        ]
    )

    try:
        w.serving_endpoints.create_and_wait(
            name=endpoint_name,
            config=endpoint_config
        )
        logger.info(f"Created endpoint: {endpoint_name}")
    except Exception as e:
        # NOTE(review): matching on the message text is fragile; consider
        # catching the SDK's dedicated "already exists" error type instead.
        if "already exists" in str(e):
            w.serving_endpoints.update_config_and_wait(
                name=endpoint_name,
                served_entities=endpoint_config.served_entities
            )
            logger.info(f"Updated endpoint: {endpoint_name}")
        else:
            raise

    # Strip a trailing slash so the URL never contains "//serving-endpoints".
    host = str(settings.databricks_host).rstrip("/")
    endpoint_url = f"{host}/serving-endpoints/{endpoint_name}/invocations"
    return endpoint_url
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
class MLflowChainAgent(MLflowAgentBase):
    """
    Agent that uses LangChain with MLflow tracing.

    Provides integration with:
    - LangChain agents and chains
    - Automatic prompt logging
    - LLM call tracing
    - Tool usage tracking

    Subclasses implement ``_build_chain``; the chain is built lazily on the
    first call to ``_process_request`` and then reused.
    """

    def __init__(self, agent_id: str, role: AgentRole):
        """
        Initialize LangChain-based agent.

        Args:
            agent_id: Identifier forwarded to the base class
            role: Agent role forwarded to the base class
        """
        super().__init__(agent_id, role)
        # Populated on first use by _process_request().
        self.chain = None

    def _setup_langchain_tracing(self):
        """Enable MLflow tracing for LangChain."""
        mlflow.langchain.autolog()

    @abstractmethod
    def _build_chain(self):
        """
        Build the LangChain chain for this agent.

        Returns:
            LangChain chain or agent
        """
        pass

    def _process_request(self, request: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process request through LangChain.

        Args:
            request: Input passed unchanged to ``self.chain.invoke``

        Returns:
            Whatever the chain's ``invoke`` returns
        """
        if self.chain is None:
            self.chain = self._build_chain()

        with mlflow.start_span(name="langchain_invoke") as span:
            result = self.chain.invoke(request)

            # Log relevant metrics. ``llm_output`` (and the nested
            # ``token_usage``) may be present but None, so normalise both to
            # empty dicts instead of calling .get() on None.
            if hasattr(result, "llm_output"):
                llm_output = result.llm_output or {}
                token_usage = llm_output.get("token_usage") or {}
                span.set_attribute("tokens_used", token_usage.get("total_tokens", 0))

        return result
|
agents/mlflow_classifier.py
ADDED
|
@@ -0,0 +1,308 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Policy Classifier Agent - MLflow version for Databricks Agent Bricks.
|
| 3 |
+
|
| 4 |
+
Classifies meeting documents for oral health policy topics using:
|
| 5 |
+
- Keyword matching and NLP
|
| 6 |
+
- LLM-based classification for ambiguous cases
|
| 7 |
+
- Unity Catalog for model governance
|
| 8 |
+
- MLflow tracing for observability
|
| 9 |
+
"""
|
| 10 |
+
from typing import Any, Dict, List, Optional
|
| 11 |
+
import pandas as pd
|
| 12 |
+
from enum import Enum
|
| 13 |
+
import mlflow
|
| 14 |
+
from langchain.chat_models import ChatOpenAI
|
| 15 |
+
from langchain.prompts import ChatPromptTemplate
|
| 16 |
+
from langchain.output_parsers import PydanticOutputParser
|
| 17 |
+
from pydantic import BaseModel, Field
|
| 18 |
+
|
| 19 |
+
from agents.mlflow_base import MLflowChainAgent
|
| 20 |
+
from agents.base import AgentRole
|
| 21 |
+
from config import settings
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class PolicyTopic(str, Enum):
    """
    Labels a document can be assigned by the policy classifier.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value.
    """

    # Specific oral health policy areas
    WATER_FLUORIDATION = "water_fluoridation"
    SCHOOL_DENTAL_SCREENING = "school_dental_screening"
    MEDICAID_DENTAL = "medicaid_dental_expansion"
    LOW_INCOME_DENTAL_FUNDING = "low_income_dental_funding"
    DENTAL_INSURANCE_MANDATE = "dental_insurance_mandate"
    DENTAL_WORKFORCE = "dental_workforce_development"
    COMMUNITY_HEALTH_CENTER = "community_health_center_dental"

    # Catch-all labels
    OTHER_ORAL_HEALTH = "other_oral_health"
    NOT_ORAL_HEALTH = "not_oral_health_related"
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class ClassificationResult(BaseModel):
    """
    Schema for the structured output the LLM classification chain must emit.

    ``primary_topic``, ``confidence`` and ``reasoning`` are required; the two
    list fields default to empty lists.
    """

    primary_topic: PolicyTopic = Field(
        description="Primary policy topic",
    )
    secondary_topics: List[PolicyTopic] = Field(
        default_factory=list,
        description="Additional relevant topics",
    )
    # Constrained to the closed interval [0.0, 1.0].
    confidence: float = Field(
        ge=0.0,
        le=1.0,
        description="Classification confidence",
    )
    relevant_excerpts: List[str] = Field(
        default_factory=list,
        description="Key text excerpts",
    )
    reasoning: str = Field(
        description="Brief explanation of classification",
    )
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class PolicyClassifierAgent(MLflowChainAgent):
    """
    Agent that classifies documents for oral health policy topics.

    Can be deployed to Databricks Model Serving and integrated with
    Unity Catalog for governance.

    Classification is two-tiered: a keyword pass over title + content runs
    first, and the LLM chain is only invoked when the keyword confidence is
    below 0.8. If the LLM call raises, the keyword result is returned with
    ``method`` set to ``"keywords_fallback"``.
    """

    # Keywords for each topic (fallback classification).
    # Matching is case-insensitive: keywords are lower-cased at comparison
    # time, so mixed-case entries such as "FQHC" are fine here.
    TOPIC_KEYWORDS = {
        PolicyTopic.WATER_FLUORIDATION: {
            "fluoride", "fluoridation", "water supply", "dental fluorosis",
            "community water", "fluoride levels", "fluoridated water"
        },
        PolicyTopic.SCHOOL_DENTAL_SCREENING: {
            "school dental", "screening program", "student dental", "school health",
            "dental exam", "school nurse", "oral health screening"
        },
        PolicyTopic.MEDICAID_DENTAL: {
            "medicaid dental", "adult dental coverage", "medicaid expansion",
            "dental benefits", "state medicaid", "covered dental services"
        },
        PolicyTopic.LOW_INCOME_DENTAL_FUNDING: {
            "low-income dental", "dental safety net", "free dental clinic",
            "dental voucher", "sliding scale dental", "charity care"
        },
        PolicyTopic.DENTAL_INSURANCE_MANDATE: {
            "dental insurance", "insurance mandate", "coverage requirement",
            "pediatric dental", "essential health benefits"
        },
        PolicyTopic.DENTAL_WORKFORCE: {
            "dental hygienist", "dental therapist", "scope of practice",
            "workforce shortage", "dental provider", "loan repayment"
        },
        PolicyTopic.COMMUNITY_HEALTH_CENTER: {
            "community health center", "FQHC", "health center dental",
            "federally qualified", "CHC dental"
        }
    }

    def __init__(self, agent_id: str = "classifier-mlflow-001"):
        """Initialize classifier agent and enable LangChain tracing."""
        super().__init__(agent_id, AgentRole.CLASSIFIER)
        self._setup_langchain_tracing()

    def _build_chain(self):
        """
        Build LangChain classification chain.

        Returns:
            A ``prompt | llm | parser`` pipeline whose output is parsed into
            a ``ClassificationResult``.
        """
        # Initialize LLM (will use AI Gateway if configured)
        llm = ChatOpenAI(
            model=settings.classifier_model,
            temperature=0.1,
            openai_api_key=settings.openai_api_key
        )

        # Create output parser
        parser = PydanticOutputParser(pydantic_object=ClassificationResult)

        # Create prompt template
        prompt = ChatPromptTemplate.from_messages([
            ("system", """You are an expert policy analyst specializing in oral health policy.

Classify the following government meeting document for oral health policy topics.

Available topics:
- water_fluoridation: Fluoride in public water systems
- school_dental_screening: School-based dental programs
- medicaid_dental_expansion: Medicaid dental coverage
- low_income_dental_funding: Funding for low-income dental care
- dental_insurance_mandate: Insurance coverage requirements
- dental_workforce_development: Training, scope of practice
- community_health_center_dental: CHC/FQHC dental services
- other_oral_health: Other oral health topics
- not_oral_health_related: Not related to oral health

{format_instructions}"""),
            ("user", """Document Title: {title}

Document Content:
{content}

Classify this document and provide relevant excerpts.""")
        ])

        # Build chain
        chain = prompt | llm | parser
        return chain

    def _process_request(self, request: Dict[str, Any]) -> Dict[str, Any]:
        """
        Classify a document for oral health policy topics.

        Args:
            request: Dict with 'document_id', 'title', 'content'

        Returns:
            Classification results with topics and confidence. Every result,
            regardless of method, carries the request's ``document_id``.
        """
        document_id = request.get("document_id")
        # ``or ""`` also covers keys that are present but explicitly None,
        # which would otherwise break the string concatenation below.
        title = request.get("title") or ""
        content = request.get("content") or ""

        with mlflow.start_span(name="classify_document") as span:
            span.set_attribute("document_id", document_id)

            # Try keyword-based classification first (faster, cheaper).
            # Attach the document id so the keyword path returns the same
            # identifying field as the LLM path.
            keyword_result = {
                "document_id": document_id,
                **self._classify_by_keywords(title + " " + content),
            }

            if keyword_result["confidence"] >= 0.8:
                # High confidence from keywords, no LLM needed
                span.set_attribute("classification_method", "keywords")
                result = keyword_result
            else:
                # Use LLM for ambiguous cases
                span.set_attribute("classification_method", "llm")

                try:
                    llm_result = super()._process_request({
                        "title": title,
                        "content": content[:4000],  # Limit context length
                        "format_instructions": self._get_format_instructions()
                    })

                    result = {
                        "document_id": document_id,
                        "primary_topic": llm_result.primary_topic.value,
                        "secondary_topics": [t.value for t in llm_result.secondary_topics],
                        "confidence": llm_result.confidence,
                        "relevant_excerpts": llm_result.relevant_excerpts,
                        "reasoning": llm_result.reasoning,
                        "method": "llm"
                    }

                except Exception as e:
                    # Deliberate best-effort: fall back to keywords if the
                    # LLM call or output parsing fails, recording the error
                    # on the span.
                    span.set_attribute("llm_error", str(e))
                    result = keyword_result
                    result["method"] = "keywords_fallback"

        return result

    def _classify_by_keywords(self, text: str) -> Dict[str, Any]:
        """
        Fast keyword-based classification.

        Args:
            text: Document text

        Returns:
            Classification result dict with ``primary_topic``,
            ``secondary_topics``, ``confidence``, ``relevant_excerpts``,
            ``reasoning`` and ``method``.
        """
        text_lower = text.lower()
        scores = {}

        # Score each topic by how many of its keywords occur in the text.
        # Keywords are lower-cased too; the text is already lower-case, so an
        # upper-case keyword could otherwise never match.
        for topic, keywords in self.TOPIC_KEYWORDS.items():
            score = sum(1 for keyword in keywords if keyword.lower() in text_lower)
            if score > 0:
                scores[topic] = score

        if not scores:
            return {
                "primary_topic": PolicyTopic.NOT_ORAL_HEALTH.value,
                "secondary_topics": [],
                "confidence": 0.9,
                "relevant_excerpts": [],
                "reasoning": "No oral health keywords found",
                "method": "keywords"
            }

        # Get top topics (stable sort: ties keep TOPIC_KEYWORDS order)
        sorted_topics = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        primary_topic = sorted_topics[0][0]
        # Up to two runners-up, each needing at least two keyword hits.
        secondary_topics = [t for t, s in sorted_topics[1:3] if s >= 2]

        # Confidence grows with the top topic's hit count: 0.5 + 0.1 per
        # hit, capped at 0.95.
        max_score = sorted_topics[0][1]
        confidence = min(0.95, 0.5 + (max_score / 10))

        # Extract relevant excerpts
        excerpts = self._extract_excerpts(text, primary_topic)

        return {
            "primary_topic": primary_topic.value,
            "secondary_topics": [t.value for t in secondary_topics],
            "confidence": confidence,
            "relevant_excerpts": excerpts,
            "reasoning": f"Found {max_score} keyword matches for {primary_topic.value}",
            "method": "keywords"
        }

    def _extract_excerpts(self, text: str, topic: PolicyTopic, max_excerpts: int = 3) -> List[str]:
        """
        Extract relevant text excerpts for a topic.

        Args:
            text: Document text, split into sentences on ``'. '``
            topic: Topic whose keywords select the sentences
            max_excerpts: Stop after collecting this many sentences

        Returns:
            Stripped sentences containing at least one topic keyword
            (case-insensitive), in document order.
        """
        # Lower-case once up front so mixed-case keywords match.
        keywords = [keyword.lower() for keyword in self.TOPIC_KEYWORDS.get(topic, set())]
        sentences = text.split('. ')

        relevant = []
        for sentence in sentences:
            sentence_lower = sentence.lower()
            if any(keyword in sentence_lower for keyword in keywords):
                relevant.append(sentence.strip())
                if len(relevant) >= max_excerpts:
                    break

        return relevant

    def _get_format_instructions(self) -> str:
        """Get format instructions for LLM output parsing."""
        parser = PydanticOutputParser(pydantic_object=ClassificationResult)
        return parser.get_format_instructions()

    def _get_example_input(self) -> Dict[str, Any]:
        """Get example input for MLflow signature."""
        return {
            "document_id": "doc_12345",
            "title": "City Council Meeting - Water Quality Discussion",
            "content": "The council discussed adding fluoride to the municipal water supply..."
        }
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
def register_classifier_to_unity_catalog():
    """
    Log the policy classifier agent to MLflow and register it in Unity Catalog.

    The registered name is built from ``settings.catalog_name`` and
    ``settings.schema_name`` with the model name ``policy_classifier``.

    Usage:
        python -c "from agents.mlflow_classifier import register_classifier_to_unity_catalog; register_classifier_to_unity_catalog()"

    Returns:
        The run ID returned by ``log_to_mlflow``.
    """
    classifier = PolicyClassifierAgent()

    # Three-level Unity Catalog name: <catalog>.<schema>.<model>
    uc_model_name = f"{settings.catalog_name}.{settings.schema_name}.policy_classifier"
    requirements = [
        "mlflow>=2.10.0",
        "langchain>=0.1.0",
        "openai>=1.6.0",
        "pydantic>=2.5.0",
    ]

    # Log and register to Unity Catalog
    run_id = classifier.log_to_mlflow(
        model_name="policy_classifier_agent",
        registered_model_name=uc_model_name,
        pip_requirements=requirements,
    )

    print("✅ Registered policy classifier agent to Unity Catalog")
    print(f"   Model: {settings.catalog_name}.{settings.schema_name}.policy_classifier")
    print(f"   Run ID: {run_id}")

    return run_id
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
if __name__ == "__main__":
    # Local smoke test: classify one hand-written sample document and print
    # the result.
    agent = PolicyClassifierAgent()

    # NOTE(review): the leading whitespace of the sample text could not be
    # recovered from the scraped diff; the lines are kept as shown there.
    sample_content = """
The school board discussed implementing a new dental screening program
for elementary students. The program would provide free dental exams
and referrals to local dentists for students in need.
"""
    test_input = {
        "document_id": "test_001",
        "title": "School Board Meeting Minutes",
        "content": sample_content,
    }

    result = agent.predict(None, test_input)
    print("Classification Result:", result)
|
agents/orchestrator.py
ADDED
|
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Multi-Agent Orchestrator for coordinating the policy analysis pipeline.
|
| 3 |
+
"""
|
| 4 |
+
import asyncio
|
| 5 |
+
from typing import Dict, List, Optional, Any
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
from collections import defaultdict
|
| 8 |
+
from loguru import logger
|
| 9 |
+
|
| 10 |
+
from agents.base import (
|
| 11 |
+
BaseAgent,
|
| 12 |
+
AgentRole,
|
| 13 |
+
AgentMessage,
|
| 14 |
+
MessageType,
|
| 15 |
+
AgentStatus
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class WorkflowStage(str):
    """
    Names of the pipeline stages, held as plain string class attributes.

    This is a ``str`` subclass used as a constants namespace, not an
    ``Enum``: each attribute is an ordinary ``str`` value.
    """

    # Listed in pipeline order.
    SCRAPE = "scrape"
    PARSE = "parse"
    CLASSIFY = "classify"
    ANALYZE = "analyze"
    GENERATE = "generate"
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class OrchestratorAgent(BaseAgent):
    """
    Orchestrator agent that coordinates the multi-agent workflow.

    The orchestrator manages the flow of data through the pipeline:
    1. Scraper Agent -> Collects meeting minutes
    2. Parser Agent -> Extracts structured data
    3. Classifier Agent -> Identifies oral health topics
    4. Sentiment Agent -> Analyzes policy positions
    5. Advocacy Agent -> Generates outreach materials
    """

    def __init__(self, agent_id: str = "orchestrator-001"):
        """Initialize the orchestrator agent."""
        super().__init__(agent_id, AgentRole.ORCHESTRATOR)
        # One registered agent per role; a later registration for the same
        # role replaces the earlier one.
        self.agents: Dict[AgentRole, BaseAgent] = {}
        self.workflow_state: Dict[str, Any] = defaultdict(dict)
        # Keyed by workflow id; entries are created in _start_workflow().
        self.active_workflows: Dict[str, Dict[str, Any]] = {}

    def register_agent(self, agent: BaseAgent):
        """
        Register an agent with the orchestrator.

        Args:
            agent: The agent to register
        """
        self.agents[agent.role] = agent
        logger.info(f"Registered {agent.role.value} agent: {agent.agent_id}")

    async def process(self, message: AgentMessage) -> List[AgentMessage]:
        """
        Process orchestrator commands and route messages.

        Recognised commands (``message.payload["command"]``) are
        ``start_workflow``, ``check_status`` and ``stop_workflow``. Any other
        message produces no responses.

        Args:
            message: The incoming message

        Returns:
            List of response messages; on an exception, a single ERROR
            message addressed to the sender
        """
        self.update_status(AgentStatus.PROCESSING, "Processing orchestrator command")

        try:
            responses: List[AgentMessage] = []

            if message.message_type == MessageType.COMMAND:
                command = message.payload.get("command")

                if command == "start_workflow":
                    responses = await self._start_workflow(message.payload)
                elif command == "check_status":
                    responses = await self._check_workflow_status(message.payload)
                elif command == "stop_workflow":
                    responses = await self._stop_workflow(message.payload)
                else:
                    logger.warning(f"Ignoring unknown orchestrator command: {command}")

            # Record success for handled commands too; previously the early
            # returns above skipped this call whenever a command was handled.
            self.log_success()
            return responses

        except Exception as e:
            self.log_failure(str(e))
            return [await self.send_message(
                message.sender,
                MessageType.ERROR,
                {"error": str(e)}
            )]

    async def _start_workflow(self, payload: Dict[str, Any]) -> List[AgentMessage]:
        """
        Start a new policy analysis workflow.

        Args:
            payload: Workflow configuration (read from ``payload["config"]``)

        Returns:
            List of messages to initiate the workflow
        """
        import uuid

        workflow_id = str(uuid.uuid4())
        workflow_config = payload.get("config", {})

        # Initialize workflow state
        self.active_workflows[workflow_id] = {
            "id": workflow_id,
            "started_at": datetime.utcnow(),
            "stage": WorkflowStage.SCRAPE,
            "config": workflow_config,
            "status": "running"
        }

        logger.info(f"Starting workflow {workflow_id}")

        # Create initial scraping task
        scraper_message = await self.send_message(
            AgentRole.SCRAPER,
            MessageType.COMMAND,
            {
                "workflow_id": workflow_id,
                "command": "scrape",
                "targets": workflow_config.get("scrape_targets", []),
                "date_range": workflow_config.get("date_range", {})
            }
        )

        return [scraper_message]

    async def _check_workflow_status(self, payload: Dict[str, Any]) -> List[AgentMessage]:
        """
        Check the status of active workflows.

        Args:
            payload: Status check request; an optional ``workflow_id``
                selects one workflow, otherwise a summary of all is returned

        Returns:
            List containing status response
        """
        workflow_id = payload.get("workflow_id")

        if workflow_id and workflow_id in self.active_workflows:
            workflow = self.active_workflows[workflow_id]
            status_payload = {
                "workflow_id": workflow_id,
                "status": workflow.get("status"),
                "stage": workflow.get("stage"),
                "started_at": workflow.get("started_at").isoformat()
            }
        else:
            # Return status of all workflows (also used when the requested
            # id is unknown)
            status_payload = {
                "active_workflows": len(self.active_workflows),
                "workflows": [
                    {
                        "id": wf_id,
                        "status": wf["status"],
                        "stage": wf["stage"]
                    }
                    for wf_id, wf in self.active_workflows.items()
                ]
            }

        response = await self.send_message(
            AgentRole.ORCHESTRATOR,
            MessageType.RESPONSE,
            status_payload
        )

        return [response]

    async def _stop_workflow(self, payload: Dict[str, Any]) -> List[AgentMessage]:
        """
        Stop a running workflow.

        Only the stored status is set to ``"stopped"``; the entry stays in
        ``active_workflows``.

        Args:
            payload: Stop request with workflow_id

        Returns:
            List containing confirmation message, or an ERROR message when
            the workflow id is unknown
        """
        workflow_id = payload.get("workflow_id")

        if workflow_id in self.active_workflows:
            self.active_workflows[workflow_id]["status"] = "stopped"
            logger.info(f"Stopped workflow {workflow_id}")

            response = await self.send_message(
                AgentRole.ORCHESTRATOR,
                MessageType.RESPONSE,
                {"workflow_id": workflow_id, "status": "stopped"}
            )
        else:
            response = await self.send_message(
                AgentRole.ORCHESTRATOR,
                MessageType.ERROR,
                {"error": f"Workflow {workflow_id} not found"}
            )

        return [response]

    async def route_message(self, message: AgentMessage) -> Optional[AgentMessage]:
        """
        Route a message to the appropriate agent.

        Args:
            message: The message to route

        Returns:
            Response from the target agent, or None when no agent is
            registered for the recipient role or the agent raises.
            NOTE(review): the ``process`` methods visible in this codebase
            return a list of messages, so the annotation may be narrower
            than the actual value - confirm against BaseAgent.
        """
        target_agent = self.agents.get(message.recipient)

        if not target_agent:
            logger.error(f"No agent found for role: {message.recipient}")
            return None

        try:
            response = await target_agent.process(message)
            return response
        except Exception as e:
            logger.error(f"Error routing message to {message.recipient}: {e}")
            return None

    async def execute_pipeline(
        self,
        scrape_targets: List[Dict[str, Any]],
        date_range: Optional[Dict[str, str]] = None
    ) -> Dict[str, Any]:
        """
        Execute the complete policy analysis pipeline.

        Args:
            scrape_targets: List of government entities to scrape
            date_range: Optional date range for historical data

        Returns:
            Dictionary containing pipeline results. ``success`` and
            ``workflow_initiated`` are False when starting the workflow
            produced an ERROR message.
        """
        workflow_config = {
            "scrape_targets": scrape_targets,
            "date_range": date_range or {}
        }

        # Start the workflow
        start_message = await self.send_message(
            AgentRole.ORCHESTRATOR,
            MessageType.COMMAND,
            {
                "command": "start_workflow",
                "config": workflow_config
            }
        )

        results = await self.process(start_message)

        # process() reports failures as ERROR messages rather than raising,
        # so inspect the responses instead of assuming success.
        succeeded = all(msg.message_type != MessageType.ERROR for msg in results)

        return {
            "success": succeeded,
            "workflow_initiated": succeeded,
            "messages": [msg.dict() for msg in results]
        }

    def get_all_agent_states(self) -> Dict[str, Any]:
        """Get the current state of all registered agents, keyed by role value."""
        return {
            role.value: agent.get_state().dict()
            for role, agent in self.agents.items()
        }
|
agents/parser.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Parser Agent for extracting and structuring data from raw meeting minutes.
|
| 3 |
+
"""
|
| 4 |
+
import re
|
| 5 |
+
from typing import List, Dict, Any, Optional
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
from loguru import logger
|
| 8 |
+
|
| 9 |
+
from agents.base import BaseAgent, AgentRole, AgentMessage, MessageType, AgentStatus
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class ParserAgent(BaseAgent):
|
| 13 |
+
"""
|
| 14 |
+
Agent responsible for parsing raw meeting documents into structured data.
|
| 15 |
+
|
| 16 |
+
Extracts:
|
| 17 |
+
- Meeting metadata (date, type, location)
|
| 18 |
+
- Attendees and participants
|
| 19 |
+
- Agenda items
|
| 20 |
+
- Discussion topics
|
| 21 |
+
- Votes and decisions
|
| 22 |
+
- Action items
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
def __init__(self, agent_id: str = "parser-001"):
    """
    Set up the parser agent.

    Registers the agent with the PARSER role and compiles the regex
    patterns used by the extraction helpers.
    """
    super().__init__(agent_id, AgentRole.PARSER)
    # Must run before any extraction helper reads self.patterns.
    self._compile_patterns()
|
| 29 |
+
|
| 30 |
+
def _compile_patterns(self):
|
| 31 |
+
"""Compile regex patterns for parsing."""
|
| 32 |
+
self.patterns = {
|
| 33 |
+
"date": re.compile(
|
| 34 |
+
r"(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}",
|
| 35 |
+
re.IGNORECASE
|
| 36 |
+
),
|
| 37 |
+
"time": re.compile(r"\d{1,2}:\d{2}\s*(?:AM|PM|am|pm)?"),
|
| 38 |
+
"attendees": re.compile(r"(?:Present|Attending|Members Present):(.+?)(?:\n\n|\Z)", re.DOTALL | re.IGNORECASE),
|
| 39 |
+
"motion": re.compile(r"(?:MOTION|Motion|MOVED)(.+?)(?:CARRIED|PASSED|FAILED|$)", re.DOTALL | re.IGNORECASE),
|
| 40 |
+
"vote": re.compile(r"(?:Vote|VOTE):\s*(.+)", re.IGNORECASE),
|
| 41 |
+
"agenda_item": re.compile(r"(?:Item|ITEM)\s+#?(\d+|[A-Z])[\.:]\s*(.+?)(?=\n(?:Item|ITEM)|$)", re.DOTALL | re.IGNORECASE)
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
async def process(self, message: AgentMessage) -> List[AgentMessage]:
    """
    Parse every raw document in the message and forward the results.

    Args:
        message: Message whose payload carries a ``documents`` list (and
            optionally a ``workflow_id``)

    Returns:
        A single-element list: a DATA message for the classifier on
        success, or an ERROR message for the orchestrator on failure
    """
    self.update_status(AgentStatus.PROCESSING, "Parsing meeting documents")

    try:
        raw_documents = message.payload.get("documents", [])

        # Documents that fail to parse come back as None and are dropped.
        structured = []
        for raw in raw_documents:
            outcome = await self._parse_document(raw)
            if outcome:
                structured.append(outcome)

        # Send parsed documents to classifier
        outgoing_payload = {
            "workflow_id": message.payload.get("workflow_id"),
            "documents": structured,
            "count": len(structured)
        }
        reply = await self.send_message(
            AgentRole.CLASSIFIER,
            MessageType.DATA,
            outgoing_payload
        )

        self.log_success()
        logger.info(f"Parsed {len(structured)} documents")

        return [reply]

    except Exception as e:
        self.log_failure(str(e))
        failure_notice = await self.send_message(
            AgentRole.ORCHESTRATOR,
            MessageType.ERROR,
            {"error": str(e), "agent": self.agent_id}
        )
        return [failure_notice]
|
| 90 |
+
|
| 91 |
+
async def _parse_document(self, doc: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Turn one raw meeting document into a structured record.

    Args:
        doc: Raw document data. "document_id", "source_url",
            "municipality", "state" and "title" are read with direct
            indexing; the remaining keys are optional.

    Returns:
        The structured record, or None when any step raised (the error
        is logged together with the document id).
    """
    try:
        body = doc.get("content", "")

        # Identity fields: a missing key raises KeyError, handled below.
        record = {
            "document_id": doc["document_id"],
            "source_url": doc["source_url"],
            "municipality": doc["municipality"],
            "state": doc["state"],
            "raw_title": doc["title"],
            "parsed_at": datetime.utcnow().isoformat(),
        }

        # Structured data pulled out of the text.
        record["meeting_date"] = self._extract_date(body, doc.get("meeting_date"))
        record["meeting_time"] = self._extract_time(body)
        record["meeting_type"] = doc.get("meeting_type", "Unknown")
        record["attendees"] = self._extract_attendees(body)
        record["agenda_items"] = self._extract_agenda_items(body)
        record["motions"] = self._extract_motions(body)
        record["votes"] = self._extract_votes(body)
        record["discussion_sections"] = self._extract_discussion_sections(body)

        # Unmodified text (kept for semantic search) and pass-through metadata.
        record["full_text"] = body
        record["metadata"] = doc.get("metadata", {})

        return record

    except Exception as exc:
        logger.error(f"Error parsing document {doc.get('document_id')}: {exc}")
        return None
|
| 134 |
+
|
| 135 |
+
def _extract_date(self, content: str, fallback_date: Optional[str]) -> str:
    """Return the first date found in content, else the fallback, else the current UTC time."""
    found = self.patterns["date"].search(content)
    if found is not None:
        return found.group(0)
    if fallback_date:
        return fallback_date
    # Nothing in the text and no usable fallback: stamp with the current time.
    return datetime.utcnow().isoformat()
|
| 141 |
+
|
| 142 |
+
def _extract_time(self, content: str) -> Optional[str]:
    """Return the first time-of-day match in content, or None when there is none."""
    hit = self.patterns["time"].search(content)
    if hit is None:
        return None
    return hit.group(0)
|
| 146 |
+
|
| 147 |
+
def _extract_attendees(self, content: str) -> List[str]:
    """Return the names listed in the first attendee section, or an empty list."""
    located = self.patterns["attendees"].search(content)
    if located is None:
        return []

    names = []
    # Names may be separated by commas or by line breaks.
    for piece in re.split(r'[,\n]', located.group(1)):
        cleaned = piece.strip()
        if cleaned:
            names.append(cleaned)
    return names
|
| 156 |
+
|
| 157 |
+
def _extract_agenda_items(self, content: str) -> List[Dict[str, str]]:
    """Return one {"number", "description"} dict per agenda-item match, in document order."""
    return [
        {
            "number": found.group(1).strip(),
            "description": found.group(2).strip(),
        }
        for found in self.patterns["agenda_item"].finditer(content)
    ]
|
| 166 |
+
|
| 167 |
+
def _extract_motions(self, content: str) -> List[Dict[str, str]]:
    """Return one dict per motion match: the captured text plus the whole matched span."""
    return [
        {
            "text": found.group(1).strip(),
            "full_match": found.group(0).strip(),
        }
        for found in self.patterns["motion"].finditer(content)
    ]
|
| 176 |
+
|
| 177 |
+
def _extract_votes(self, content: str) -> List[Dict[str, str]]:
    """Return one {"result": ...} dict per vote match, in document order."""
    return [
        {"result": found.group(1).strip()}
        for found in self.patterns["vote"].finditer(content)
    ]
|
| 185 |
+
|
| 186 |
+
def _extract_discussion_sections(self, content: str) -> List[Dict[str, str]]:
    """
    Return the substantial paragraphs of content.

    Paragraphs are separated by blank lines. "section_id" is the
    paragraph's position among all non-empty paragraphs, so ids are not
    contiguous when shorter paragraphs are left out.
    """
    sections = []
    position = 0
    for chunk in content.split("\n\n"):
        body = chunk.strip()
        if not body:
            # Whitespace-only chunks are not paragraphs and consume no index.
            continue
        if len(body) > 100:  # keep only paragraphs longer than 100 characters
            sections.append({"section_id": position, "text": body})
        position += 1
    return sections
|
agents/scraper.py
ADDED
|
@@ -0,0 +1,2113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Scraper Agent for collecting government meeting minutes from various sources.
|
| 3 |
+
"""
|
| 4 |
+
import asyncio
|
| 5 |
+
import hashlib
|
| 6 |
+
import io
|
| 7 |
+
import json
|
| 8 |
+
import re
|
| 9 |
+
from typing import List, Dict, Any, Optional
|
| 10 |
+
from datetime import datetime, timedelta
|
| 11 |
+
from urllib.parse import urljoin, urlparse
|
| 12 |
+
import httpx
|
| 13 |
+
from bs4 import BeautifulSoup
|
| 14 |
+
from loguru import logger
|
| 15 |
+
|
| 16 |
+
try:
|
| 17 |
+
from PyPDF2 import PdfReader
|
| 18 |
+
except Exception:
|
| 19 |
+
PdfReader = None
|
| 20 |
+
|
| 21 |
+
try:
|
| 22 |
+
import pdfplumber
|
| 23 |
+
except Exception:
|
| 24 |
+
pdfplumber = None
|
| 25 |
+
|
| 26 |
+
try:
|
| 27 |
+
import pytesseract
|
| 28 |
+
from pytesseract import TesseractNotFoundError
|
| 29 |
+
except Exception:
|
| 30 |
+
pytesseract = None
|
| 31 |
+
TesseractNotFoundError = Exception
|
| 32 |
+
|
| 33 |
+
try:
|
| 34 |
+
from PIL import Image
|
| 35 |
+
except Exception:
|
| 36 |
+
Image = None
|
| 37 |
+
|
| 38 |
+
try:
|
| 39 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
| 40 |
+
except Exception:
|
| 41 |
+
YouTubeTranscriptApi = None
|
| 42 |
+
|
| 43 |
+
from agents.base import BaseAgent, AgentRole, AgentMessage, MessageType, AgentStatus
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class MeetingDocument(dict):
    """Dictionary holding the fields of one scraped meeting document."""

    def __init__(
        self,
        document_id: str,
        source_url: str,
        municipality: str,
        state: str,
        meeting_date: datetime,
        meeting_type: str,
        title: str,
        content: str,
        metadata: Optional[Dict[str, Any]] = None
    ):
        # datetime values are stored as ISO-8601 text; anything else is stored as given.
        if isinstance(meeting_date, datetime):
            stored_date = meeting_date.isoformat()
        else:
            stored_date = meeting_date

        fields = {
            "document_id": document_id,
            "source_url": source_url,
            "municipality": municipality,
            "state": state,
            "meeting_date": stored_date,
            "meeting_type": meeting_type,
            "title": title,
            "content": content,
            # Creation time of this record, taken from the UTC clock.
            "scraped_at": datetime.utcnow().isoformat(),
            "metadata": metadata or {},
        }
        super().__init__(fields)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
class ScraperAgent(BaseAgent):
|
| 76 |
+
"""
|
| 77 |
+
Agent responsible for scraping government meeting minutes from various sources.
|
| 78 |
+
|
| 79 |
+
Supports multiple platforms:
|
| 80 |
+
- Legistar (widely used by city councils)
|
| 81 |
+
- Granicus (meeting management platform)
|
| 82 |
+
- Generic municipal websites
|
| 83 |
+
- PDF documents
|
| 84 |
+
"""
|
| 85 |
+
|
| 86 |
+
def __init__(self, agent_id: str = "scraper-001"):
    """Set up the scraper agent's defaults; no HTTP client is created here."""
    super().__init__(agent_id, AgentRole.SCRAPER)

    # Populated by __aenter__.
    self.http_client: Optional[httpx.AsyncClient] = None
    self.scraped_urls: set = set()

    # Link classification vocabulary.
    self.document_extensions = (".pdf", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx")
    self.meeting_keywords = ("minutes", "agenda", "meeting", "council", "commission", "board")
    self.document_route_keywords = (
        "getagendafile",
        "getminutesfile",
        "download",
        "agendafile",
        "minutesfile",
    )

    # OCR and social-source limits.
    self.ocr_max_pages = 10
    self._ocr_missing_tesseract_warned = False
    self.social_source_limit = 8

    # Policy and meeting-focused keywords for social media filtering,
    # assembled from four themed groups in a fixed order.
    meeting_terms = (
        "council meeting", "city council", "town council", "board meeting",
        "commission meeting", "public meeting", "town hall", "session",
        "special meeting", "regular meeting", "work session", "workshop",
    )
    document_terms = (
        "agenda", "minutes", "ordinance", "resolution", "public hearing",
        "hearing", "vote", "voting", "motion", "legislation",
    )
    topic_terms = (
        "policy", "budget", "zoning", "planning", "development",
        "public comment", "community meeting", "civic", "government",
    )
    video_terms = (
        "live stream", "livestream", "recorded meeting", "meeting video",
        "council session", "board session", "official meeting",
    )
    self.policy_meeting_keywords = (
        meeting_terms + document_terms + topic_terms + video_terms
    )
|
| 120 |
+
|
| 121 |
+
async def __aenter__(self):
    """Create the shared HTTP client and return the agent itself."""
    request_headers = {
        "User-Agent": "OpenNavigator/1.0 (+https://github.com/getcommunityone/open-navigator)"
    }
    # 30-second timeout; redirects are followed.
    self.http_client = httpx.AsyncClient(
        timeout=30.0,
        follow_redirects=True,
        headers=request_headers,
    )
    return self
|
| 131 |
+
|
| 132 |
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """
    Close the HTTP client, if any, and forget it.

    The attribute is reset to None so that code checking
    `self.http_client` for truthiness never picks up a client that has
    already been closed. Returns None, so an exception raised inside
    the `async with` body is not suppressed.
    """
    client = self.http_client
    if not client:
        return
    # Drop the reference first: even if aclose() raises, the agent must
    # not keep pointing at a client that is being torn down.
    self.http_client = None
    await client.aclose()
|
| 136 |
+
|
| 137 |
+
async def process(self, message: AgentMessage) -> List[AgentMessage]:
    """
    Handle a command message addressed to the scraper.

    Args:
        message: Message whose payload carries "command" and, for the
            "scrape" command, "targets", "date_range" and "workflow_id".

    Returns:
        For "scrape": a one-element list with the message sent to the
        parser. For any other command: an empty list. If anything
        raised: a one-element list with the error message sent to the
        orchestrator.
    """
    self.update_status(AgentStatus.PROCESSING, "Scraping government meeting minutes")

    try:
        if message.payload.get("command") != "scrape":
            return []

        targets = message.payload.get("targets", [])
        date_range = message.payload.get("date_range", {})

        if self.http_client:
            documents = await self._scrape_targets(targets, date_range)
        else:
            # No client yet: open one just for the duration of this run.
            async with self:
                documents = await self._scrape_targets(targets, date_range)

        # Hand the scraped documents to the parser.
        outgoing = {
            "workflow_id": message.payload.get("workflow_id"),
            "documents": documents,
            "count": len(documents),
        }
        reply = await self.send_message(
            AgentRole.PARSER,
            MessageType.DATA,
            outgoing,
        )

        self.log_success()
        logger.info(f"Scraped {len(documents)} documents")
        return [reply]

    except Exception as exc:
        reason = str(exc)
        self.log_failure(reason)
        failure = await self.send_message(
            AgentRole.ORCHESTRATOR,
            MessageType.ERROR,
            {"error": reason, "agent": self.agent_id},
        )
        return [failure]
|
| 189 |
+
|
| 190 |
+
async def _scrape_targets(
    self,
    targets: List[Dict[str, Any]],
    date_range: Dict[str, str]
) -> List[Dict[str, Any]]:
    """
    Scrape multiple targets concurrently.

    Each target is routed to a platform-specific scraper, chosen by its
    "platform" value or, for SuiteOne Media and eBoard, by a marker in
    its URL. Anything unrecognised goes to the generic scraper.

    Args:
        targets: List of scraping targets (dicts with optional
            "platform" and "url" keys).
        date_range: Date range for filtering meetings; passed through
            to the scrapers that accept it.

    Returns:
        The documents from every scraper that returned a list,
        concatenated in target order. A scraper that raised is logged
        and contributes nothing.
    """
    tasks = []

    for target in targets:
        platform = target.get("platform", "generic")
        # A target may carry "url": None; normalise once so the substring
        # checks below cannot raise and abort the whole batch.
        url = (target.get("url") or "").lower()

        if platform == "legistar":
            tasks.append(self._scrape_legistar(target, date_range))
        elif platform == "granicus":
            tasks.append(self._scrape_granicus(target, date_range))
        elif platform == "suiteonemedia" or "suiteonemedia" in url:
            tasks.append(self._scrape_suiteonemedia(target, date_range))
        elif platform == "eboard" or "eboardsolutions.com" in url or "simbli.eboardsolutions" in url:
            tasks.append(self._scrape_eboard(target, date_range))
        elif platform == "youtube":
            tasks.append(self._scrape_youtube_source(target))
        elif platform == "facebook":
            tasks.append(self._scrape_facebook_source(target))
        else:
            tasks.append(self._scrape_generic(target, date_range))

    # Execute all scraping tasks concurrently; failures come back as values.
    results = await asyncio.gather(*tasks, return_exceptions=True)

    # Flatten results and filter out errors.
    documents = []
    for result in results:
        # BaseException so that a cancelled scraper is logged too
        # instead of vanishing silently.
        if isinstance(result, BaseException):
            logger.error(f"Scraping error: {result}")
        elif isinstance(result, list):
            documents.extend(result)

    return documents
|
| 238 |
+
|
| 239 |
+
async def scrape_social_sources(
    self,
    municipality: str,
    state: str,
    seed_url: str,
    max_sources: int = 8
) -> List[Dict[str, Any]]:
    """
    Discover and scrape YouTube/Facebook sources for a jurisdiction.

    At most max_sources distinct URLs are scraped per platform. Returns
    the combined documents, or an empty list when no homepage could be
    resolved or no social URL was found. A scraper that raised is
    logged as a warning and skipped.
    """
    collected: List[Dict[str, Any]] = []

    homepage_url = await self._resolve_homepage_url(municipality, state, seed_url)
    if not homepage_url:
        logger.warning(f"Could not resolve homepage URL for social scraping: {municipality}, {state}")
        return collected

    logger.info(f"Discovering social sources from homepage: {homepage_url}")
    social_urls = await self._discover_social_urls(homepage_url, municipality, state)

    # De-duplicate while keeping first-seen order, then cap per platform.
    youtube_urls = list(dict.fromkeys(social_urls.get("youtube", [])))[:max_sources]
    facebook_urls = list(dict.fromkeys(social_urls.get("facebook", [])))[:max_sources]

    logger.info(
        f"Social discovery for {municipality}: "
        f"{len(youtube_urls)} YouTube, {len(facebook_urls)} Facebook"
    )

    def as_target(link):
        return {
            "url": link,
            "municipality": municipality,
            "state": state,
        }

    # YouTube jobs first, then Facebook.
    jobs = [self._scrape_youtube_source(as_target(link)) for link in youtube_urls]
    jobs += [self._scrape_facebook_source(as_target(link)) for link in facebook_urls]

    if not jobs:
        return collected

    outcomes = await asyncio.gather(*jobs, return_exceptions=True)
    for outcome in outcomes:
        if isinstance(outcome, Exception):
            logger.warning(f"Social scraping error: {outcome}")
        elif isinstance(outcome, list):
            collected.extend(outcome)

    return collected
|
| 291 |
+
|
| 292 |
+
async def _resolve_homepage_url(self, municipality: str, state: str, seed_url: str) -> str:
|
| 293 |
+
"""Resolve an official website homepage used for social discovery."""
|
| 294 |
+
if seed_url and "suiteonemedia" not in seed_url.lower():
|
| 295 |
+
parsed = urlparse(seed_url)
|
| 296 |
+
return f"{parsed.scheme}://{parsed.netloc}" if parsed.scheme and parsed.netloc else seed_url
|
| 297 |
+
|
| 298 |
+
city = (municipality or "").lower().replace(" ", "").replace("'", "")
|
| 299 |
+
st = (state or "").lower()
|
| 300 |
+
candidates = [
|
| 301 |
+
f"https://www.{city}{st}.gov",
|
| 302 |
+
f"https://{city}{st}.gov",
|
| 303 |
+
f"https://www.cityof{city}.com",
|
| 304 |
+
f"https://www.{city}.gov",
|
| 305 |
+
f"https://www.{city}.com",
|
| 306 |
+
f"https://{city}.com",
|
| 307 |
+
]
|
| 308 |
+
|
| 309 |
+
for candidate in candidates:
|
| 310 |
+
try:
|
| 311 |
+
resp = await self.http_client.get(candidate, timeout=8)
|
| 312 |
+
if resp.status_code < 400:
|
| 313 |
+
parsed = urlparse(str(resp.url))
|
| 314 |
+
return f"{parsed.scheme}://{parsed.netloc}"
|
| 315 |
+
except Exception:
|
| 316 |
+
continue
|
| 317 |
+
|
| 318 |
+
return ""
|
| 319 |
+
|
| 320 |
+
async def _discover_social_urls(self, homepage_url: str, municipality: str, state: str) -> Dict[str, List[str]]:
    """
    Collect YouTube and Facebook URLs for a jurisdiction.

    Two discovery helpers are tried in turn, each imported lazily. A
    helper that cannot be imported or that raises is logged at debug
    level and skipped. The returned lists are de-duplicated with
    first-seen order preserved.
    """
    found = {"youtube": [], "facebook": []}

    # Pass 1: links discovered from the homepage.
    try:
        from discovery.social_media_discovery import SocialMediaDiscovery

        async with SocialMediaDiscovery() as finder:
            profile = await finder.discover_from_website(
                homepage_url=homepage_url,
                jurisdiction_name=municipality,
                state=state,
            )
            found["youtube"] = profile.get("youtube", [])
            found["facebook"] = profile.get("facebook", [])
    except Exception as err:
        logger.debug(f"SocialMediaDiscovery unavailable/failed: {err}")

    # Pass 2: augment YouTube discovery using handle pattern search for better recall.
    try:
        from discovery.youtube_channel_discovery import YouTubeChannelDiscovery

        async with YouTubeChannelDiscovery() as channel_finder:
            channels = await channel_finder.discover_channels(
                city_name=municipality,
                state_code=state,
                homepage_url=homepage_url,
            )
            for entry in channels:
                link = entry.get("channel_url")
                if not link:
                    continue
                found["youtube"].append(link)
    except Exception as err:
        logger.debug(f"YouTubeChannelDiscovery unavailable/failed: {err}")

    for platform in ("youtube", "facebook"):
        found[platform] = list(dict.fromkeys(found[platform]))
    return found
|
| 358 |
+
|
| 359 |
+
def _is_policy_meeting_content(self, text: str) -> bool:
    """Return True when text contains any policy/meeting keyword, ignoring case."""
    if not text:
        return False

    haystack = text.lower()
    for phrase in self.policy_meeting_keywords:
        if phrase in haystack:
            return True
    return False
|
| 365 |
+
|
| 366 |
+
def _extract_youtube_video_metadata(self, html: str, video_id: str) -> Dict[str, str]:
    """
    Extract title and description from YouTube video page HTML.

    The values are taken from the first `"title":"..."` and
    `"description":"..."` JSON fragments in the page. The captured text
    is JSON-decoded so that escapes such as `\\"`, `\\n` and `\\u0026`
    come back as real characters. When no JSON title is present the
    HTML <title> element is used, with the " - YouTube" suffix removed.

    Args:
        html: Raw page source.
        video_id: Video identifier, used only in the debug log message.

    Returns:
        Dict with "title" and "description"; a field that could not be
        found is "". The description is limited to 500 characters.
    """
    metadata = {"title": "", "description": ""}

    def _decode(raw: str) -> str:
        # The capture is the inside of a JSON string literal; re-wrap it in
        # quotes and let the JSON parser resolve the escapes. strict=False
        # tolerates literal control characters. Fall back to the raw text
        # if the fragment is not valid JSON after all.
        try:
            return json.loads(f'"{raw}"', strict=False)
        except ValueError:
            return raw

    try:
        # (?:[^"\\]|\\.)+ walks over escaped characters, so an escaped quote
        # inside the value no longer ends the match early.
        title_match = re.search(r'"title":"((?:[^"\\]|\\.)+)"', html)
        if title_match:
            metadata["title"] = _decode(title_match.group(1))
        else:
            # Fallback to the <title> element.
            title_match = re.search(r'<title>([^<]+)</title>', html)
            if title_match:
                metadata["title"] = title_match.group(1).replace(" - YouTube", "")

        desc_match = re.search(r'"description":"((?:[^"\\]|\\.)+)"', html)
        if desc_match:
            # Unescape first, then limit the length, so the cut never lands
            # in the middle of an escape sequence.
            metadata["description"] = _decode(desc_match.group(1))[:500]

    except Exception as err:
        logger.debug(f"Error extracting metadata for video {video_id}: {err}")

    return metadata
|
| 391 |
+
|
| 392 |
+
async def _scrape_youtube_source(self, target: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Scrape policy/meeting videos and their transcripts from a YouTube channel.

    The channel's ``/videos`` page is fetched (falling back to the channel
    URL on an HTTP error status), video IDs are collected from
    ``watch?v=`` links, and up to ``3 * self.social_source_limit``
    candidates are inspected. A candidate is kept only when its title or
    description matches a policy/meeting keyword, an English transcript is
    available, and the transcript itself matches a keyword. When the
    video page cannot be fetched, the title/description filter is skipped
    and only the transcript check applies.

    Args:
        target: Dict read for ``"url"`` (channel URL), ``"municipality"``
            and ``"state"``; all default to an empty string.

    Returns:
        At most ``self.social_source_limit`` ``MeetingDocument`` objects.
        ``meeting_date`` is the scrape time (UTC), not the upload date.
        Empty when the URL is missing or the channel page cannot be
        fetched.
    """
    url = target.get("url", "")
    municipality = target.get("municipality", "")
    state = target.get("state", "")

    if not url:
        return []

    videos_url = url.rstrip("/") + "/videos"
    try:
        resp = await self.http_client.get(videos_url)
        if resp.status_code >= 400:
            resp = await self.http_client.get(url)
        text = resp.text
    except Exception as err:
        logger.debug(f"Could not fetch YouTube page {url}: {err}")
        return []

    # dict.fromkeys de-duplicates while preserving first-seen order.
    video_ids = list(dict.fromkeys(re.findall(r'watch\?v=([A-Za-z0-9_-]{11})', text)))

    # Process more videos initially to filter for relevant content
    video_ids = video_ids[: self.social_source_limit * 3]

    policy_videos = []
    loop = asyncio.get_running_loop()

    for index, vid in enumerate(video_ids):
        # Rate limiting: pause before every request after the first, so
        # videos that are filtered out still respect the delay.
        if index:
            await asyncio.sleep(0.5)

        video_url = f"https://www.youtube.com/watch?v={vid}"

        # Fetch video page to extract metadata
        try:
            video_resp = await self.http_client.get(video_url)
            video_metadata = self._extract_youtube_video_metadata(video_resp.text, vid)

            # Filter: only keep videos with policy/meeting-related titles or descriptions
            relevant = (
                self._is_policy_meeting_content(video_metadata["title"])
                or self._is_policy_meeting_content(video_metadata["description"])
            )
            if not relevant:
                logger.debug(f"Skipping non-policy video: {video_metadata['title']}")
                continue

            logger.info(f"Found policy/meeting video: {video_metadata['title']}")

        except Exception as err:
            # Metadata is optional; the transcript check below still applies.
            logger.debug(f"Could not fetch metadata for video {vid}: {err}")
            video_metadata = {"title": f"Video {vid}", "description": ""}

        # The transcript download is a blocking call; run it in the default
        # executor so the event loop stays responsive.
        transcript_text = await loop.run_in_executor(
            None, self._fetch_youtube_transcript, vid
        )
        if not transcript_text:
            logger.debug(f"No transcript available for video {vid}")
            continue

        # Double-check transcript content for policy/meeting keywords
        if not self._is_policy_meeting_content(transcript_text):
            logger.debug(f"Transcript doesn't contain policy/meeting content: {vid}")
            continue

        doc_id = hashlib.md5(f"youtube-{municipality}-{vid}".encode()).hexdigest()
        policy_videos.append(MeetingDocument(
            document_id=doc_id,
            source_url=video_url,
            municipality=municipality,
            state=state,
            meeting_date=datetime.utcnow().isoformat(),
            meeting_type="YouTube Video - Policy/Meeting",
            title=video_metadata["title"] or f"YouTube Video {vid}",
            content=transcript_text,
            metadata={
                "platform": "youtube",
                "channel_url": url,
                "video_id": vid,
                "has_transcript": True,
                "description": video_metadata["description"],
                "filtered_for_policy": True,
            }
        ))

        # Limit to configured number of policy videos
        if len(policy_videos) >= self.social_source_limit:
            break

    logger.info(f"Found {len(policy_videos)} policy/meeting videos from {url}")
    return policy_videos
|
| 482 |
+
|
| 483 |
+
async def _scrape_facebook_source(self, target: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Scrape publicly accessible policy/meeting text from a Facebook page.

    The page is requested through ``m.facebook.com``. Links containing
    ``/posts/`` or ``/videos/`` are followed (up to
    ``2 * self.social_source_limit`` of them) and each post whose text is
    at least 120 characters long and matches a policy/meeting keyword
    becomes a document. When the page exposes no such links, the page text
    itself is used as a single fallback document, provided it is longer
    than 200 characters and matches a keyword.

    Args:
        target: Dict read for ``"url"``, ``"municipality"`` and
            ``"state"``; all default to an empty string.

    Returns:
        ``MeetingDocument`` objects (content capped at 8000 characters):
        at most ``self.social_source_limit`` posts, or zero/one fallback
        page document. Empty when the URL is missing or the page cannot be
        fetched.
    """
    url = target.get("url", "")
    municipality = target.get("municipality", "")
    state = target.get("state", "")

    documents: List[Dict[str, Any]] = []
    if not url:
        return documents

    normalized = url.replace("www.facebook.com", "m.facebook.com")
    try:
        resp = await self.http_client.get(normalized)
        if resp.status_code >= 400:
            return documents
        soup = BeautifulSoup(resp.content, "html.parser")
    except Exception as err:
        logger.debug(f"Could not fetch Facebook page {url}: {err}")
        return documents

    # Try to extract links to individual post pages first.
    # dict.fromkeys de-duplicates while preserving first-seen order.
    post_links: List[str] = list(dict.fromkeys(
        urljoin(normalized, anchor["href"])
        for anchor in soup.find_all("a", href=True)
        if "/posts/" in anchor["href"] or "/videos/" in anchor["href"]
    ))
    post_links = post_links[: self.social_source_limit * 2]  # Check more posts to filter

    # If direct post links are unavailable, use page text as fallback content.
    if not post_links:
        page_text = soup.get_text(" ", strip=True)
        # Filter: only use page content if it contains policy/meeting keywords
        if len(page_text) > 200 and self._is_policy_meeting_content(page_text):
            doc_id = hashlib.md5(f"facebook-page-{municipality}-{url}".encode()).hexdigest()
            documents.append(MeetingDocument(
                document_id=doc_id,
                source_url=url,
                municipality=municipality,
                state=state,
                meeting_date=datetime.utcnow().isoformat(),
                meeting_type="Facebook Page - Policy/Meeting",
                title="Facebook Page Content (Policy-Related)",
                content=page_text[:8000],
                metadata={
                    "platform": "facebook",
                    "content_source": "page_fallback",
                    "filtered_for_policy": True,
                }
            ))
        else:
            logger.debug(f"Facebook page content doesn't contain policy/meeting keywords: {url}")
        return documents

    policy_posts = []
    for index, post_url in enumerate(post_links):
        # Rate limiting: pause before every request after the first, so
        # skipped or failed posts still respect the delay.
        if index:
            await asyncio.sleep(0.5)

        try:
            p_resp = await self.http_client.get(post_url)
            if p_resp.status_code >= 400:
                continue
            p_soup = BeautifulSoup(p_resp.content, "html.parser")
            post_text = p_soup.get_text(" ", strip=True)
            if len(post_text) < 120:
                continue

            # Filter: only keep posts that mention policy/meeting keywords
            if not self._is_policy_meeting_content(post_text):
                logger.debug(f"Skipping non-policy Facebook post: {post_url[:80]}...")
                continue

            logger.info(f"Found policy/meeting Facebook post: {post_url[:80]}...")

            doc_id = hashlib.md5(f"facebook-post-{municipality}-{post_url}".encode()).hexdigest()
            policy_posts.append(MeetingDocument(
                document_id=doc_id,
                source_url=post_url,
                municipality=municipality,
                state=state,
                meeting_date=datetime.utcnow().isoformat(),
                meeting_type="Facebook Post - Policy/Meeting",
                title="Facebook Post (Policy-Related)",
                content=post_text[:8000],
                metadata={
                    "platform": "facebook",
                    "content_source": "post",
                    "filtered_for_policy": True,
                }
            ))

            # Limit to configured number of policy posts
            if len(policy_posts) >= self.social_source_limit:
                break

        except Exception as err:
            # One unreadable post must not abort the remaining ones.
            logger.debug(f"Could not parse Facebook post {post_url}: {err}")

    logger.info(f"Found {len(policy_posts)} policy/meeting Facebook posts from {url}")
    return policy_posts
|
| 585 |
+
|
| 586 |
+
def _fetch_youtube_transcript(self, video_id: str) -> str:
    """Return the concatenated English transcript text for a video.

    Works with both youtube-transcript-api interfaces: the legacy static
    ``YouTubeTranscriptApi.get_transcript(...)`` and, when that attribute
    is absent, the instance-based ``YouTubeTranscriptApi().fetch(...)``
    whose result is converted with ``to_raw_data()``.

    Args:
        video_id: YouTube video identifier.

    Returns:
        Transcript chunks joined by single spaces, or an empty string
        when the library is not installed (``YouTubeTranscriptApi`` is
        ``None``) or the transcript cannot be retrieved.
    """
    if YouTubeTranscriptApi is None:
        return ""

    try:
        if hasattr(YouTubeTranscriptApi, "get_transcript"):
            chunks = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])
        else:
            # Newer releases dropped the static helper in favour of
            # instance methods returning a transcript object.
            chunks = YouTubeTranscriptApi().fetch(video_id, languages=["en"]).to_raw_data()
        return " ".join(chunk.get("text", "") for chunk in chunks if chunk.get("text")).strip()
    except Exception as err:
        # Missing/disabled transcripts are expected; log instead of hiding
        # the reason so library/API breakage remains diagnosable.
        logger.debug(f"Could not fetch transcript for video {video_id}: {err}")
        return ""
|
| 596 |
+
|
| 597 |
+
async def _scrape_legistar(
    self,
    target: Dict[str, Any],
    date_range: Dict[str, str]
) -> List[Dict[str, Any]]:
    """
    Scrape meeting agendas from the Legistar platform using its web API.

    Events are requested from https://webapi.legistar.com/v1/{city}/events
    (most recent first) and, for each event, its agenda items are fetched
    from ``/events/{id}/EventItems`` and combined into one document by
    ``_create_legistar_document``.

    Args:
        target: Target configuration; ``'url'``, ``'municipality'`` and
            ``'state'`` are required keys (``KeyError`` otherwise).
        date_range: Optional date range; only its ``'start'`` key is used,
            as an ``EventDate ge`` filter.

    Returns:
        List of meeting documents (at most 50). Empty when the events
        request fails; failures for individual events are logged and
        skipped.
    """
    base_url = target["url"]
    municipality = target["municipality"]
    state = target["state"]

    # Extract city slug from URL (e.g., "chicago" from "chicago.legistar.com")
    parsed = urlparse(base_url)
    hostname = parsed.hostname or ""
    city_slug = hostname.split('.')[0] if '.' in hostname else municipality.lower().replace(' ', '')

    # Use the official Legistar API
    api_base = f"https://webapi.legistar.com/v1/{city_slug}"

    # Only this many events are processed, so do not request more than that.
    max_meetings = 50

    documents = []

    try:
        # Build OData query: newest events first, optionally filtered by start date
        params = {
            "$top": max_meetings,
            "$orderby": "EventDate desc"
        }

        if date_range and "start" in date_range:
            params["$filter"] = f"EventDate ge datetime'{date_range['start']}'"

        # Get events (meetings)
        events_url = f"{api_base}/events"
        logger.info(f"Fetching Legistar events from {events_url}")

        response = await self.http_client.get(events_url, params=params)
        response.raise_for_status()
        events = response.json()

        logger.info(f"Found {len(events)} events for {municipality}")

        # Process each event
        for event in events[:max_meetings]:
            event_id = event.get("EventId")
            if not event_id:
                continue

            # Get agenda items for this event
            try:
                items_url = f"{api_base}/events/{event_id}/EventItems"
                items_response = await self.http_client.get(items_url, timeout=10)

                if items_response.status_code == 200:
                    # Create document from event and items
                    doc = self._create_legistar_document(
                        event,
                        items_response.json(),
                        municipality,
                        state,
                        base_url
                    )
                    if doc:
                        documents.append(doc)
                else:
                    logger.debug(
                        f"Skipping event {event_id}: EventItems returned "
                        f"HTTP {items_response.status_code}"
                    )

            except Exception as item_error:
                logger.warning(f"Error fetching items for event {event_id}: {item_error}")

            # Rate limiting - be respectful, also after a failed request
            await asyncio.sleep(0.3)

    except Exception as e:
        logger.error(f"Error scraping Legistar API for {municipality}: {e}")

    return documents
|
| 688 |
+
|
| 689 |
+
def _create_legistar_document(
    self,
    event: Dict[str, Any],
    items: List[Dict[str, Any]],
    municipality: str,
    state: str,
    base_url: str
) -> Optional[Dict[str, Any]]:
    """
    Create a meeting document from Legistar API data.

    The content is a plain-text agenda: a header with the body name and
    date followed by one line per titled agenda item, formatted as
    "<agenda number>. <title> (File: <matter file>)" with the number and
    file parts omitted when the API supplies none.

    Args:
        event: Event data from API
        items: Agenda items from API (``None`` is treated as no items)
        municipality: Municipality name
        state: State code
        base_url: Base URL whose hostname is used for the meeting link

    Returns:
        ``MeetingDocument`` for the event, or None when building it fails
    """
    try:
        agenda_items = items or []

        # ``dict.get`` defaults do not apply to keys that are present with
        # a JSON null, so fall back explicitly with ``or``.
        event_id = event.get("EventId")
        event_date = event.get("EventDate") or ""
        event_body = event.get("EventBodyName") or "Unknown Body"

        # Combine agenda items into content
        content_parts = [f"Meeting: {event_body}", f"Date: {event_date}", "\n=== AGENDA ===\n"]

        for item in agenda_items:
            title = item.get("EventItemTitle") or ""
            if not title:
                continue

            agenda_num = item.get("EventItemAgendaNumber")
            matter_file = item.get("EventItemMatterFile") or ""

            # Avoid "None. Title" / ". Title" when no agenda number exists.
            prefix = f"{agenda_num}. " if agenda_num not in (None, "") else ""
            item_text = f"\n{prefix}{title}"
            if matter_file:
                item_text += f" (File: {matter_file})"
            content_parts.append(item_text)

        content = "\n".join(content_parts)

        # Generate document ID
        doc_id = hashlib.md5(
            f"{municipality}-{state}-{event_id}".encode()
        ).hexdigest()

        # Create meeting detail URL
        parsed = urlparse(base_url)
        hostname = parsed.hostname or base_url
        meeting_url = f"https://{hostname}/MeetingDetail.aspx?ID={event_id}"

        return MeetingDocument(
            document_id=doc_id,
            source_url=meeting_url,
            municipality=municipality,
            state=state,
            meeting_date=event_date,
            meeting_type=event_body,
            title=f"{event_body} - {event_date}",
            content=content,
            metadata={
                "event_id": event_id,
                "event_guid": event.get("EventGuid"),
                "event_time": event.get("EventTime"),
                "event_location": event.get("EventLocation"),
                "video_status": event.get("EventVideoStatus"),
                "agenda_status": event.get("EventAgendaStatusName"),
                "minutes_status": event.get("EventMinutesStatusName"),
                "item_count": len(agenda_items),
                "platform": "legistar_api"
            }
        )

    except Exception as e:
        logger.error(f"Error creating document from Legistar data: {e}")
        return None
|
| 766 |
+
|
| 767 |
+
async def _scrape_granicus(
    self,
    target: Dict[str, Any],
    date_range: Dict[str, str]
) -> List[Dict[str, Any]]:
    """
    Scrape meeting minutes from Granicus platform.

    Meetings are listed via ``api/v1/meetings`` (resolved against the
    target URL) and the first 50 entries are scraped through
    ``_scrape_meeting_page``. URLs already in ``self.scraped_urls`` are
    skipped, and successfully scraped URLs are added to it.

    Args:
        target: Target configuration; ``'url'``, ``'municipality'`` and
            ``'state'`` are required keys (``KeyError`` otherwise).
        date_range: Date range for filtering (not read by this method)

    Returns:
        List of meeting documents; empty when the listing request fails.
    """
    base_url = target["url"]
    municipality = target["municipality"]
    state = target["state"]

    documents = []

    try:
        # Granicus often has an API endpoint
        api_url = urljoin(base_url, "api/v1/meetings")

        response = await self.http_client.get(api_url)
        response.raise_for_status()

        meetings_data = response.json()

        for meeting in meetings_data.get("meetings", [])[:50]:
            meeting_id = meeting.get("id")
            if meeting_id is None:
                # Without an id the URL would be ".../meeting/None".
                logger.debug(f"Skipping Granicus meeting without id from {base_url}")
                continue

            meeting_url = urljoin(base_url, f"meeting/{meeting_id}")
            if meeting_url in self.scraped_urls:
                continue

            try:
                doc = await self._scrape_meeting_page(
                    meeting_url,
                    municipality,
                    state,
                    meeting_data=meeting
                )
            except Exception as page_error:
                # One broken page must not abort the remaining meetings.
                logger.warning(f"Error scraping Granicus meeting {meeting_url}: {page_error}")
                doc = None

            if doc:
                documents.append(doc)
                self.scraped_urls.add(meeting_url)

            await asyncio.sleep(0.5)

    except Exception as e:
        logger.error(f"Error scraping Granicus {base_url}: {e}")

    return documents
|
| 821 |
+
|
| 822 |
+
async def _scrape_suiteonemedia(
    self,
    target: Dict[str, Any],
    date_range: Dict[str, str]
) -> List[Dict[str, Any]]:
    """
    Scrape meeting events from a SuiteOne Media portal.

    Strategy:
    1. Fetch the portal homepage (``/Web/Home.aspx``) and parse each <tr>
       of the eventTable(s) to get: event ID (from /event/?id=XXXX links),
       title, date, agenda/minutes PDF links, and whether a media link
       exists.
    2. When max_events exceeds the homepage count (or is 0), add
       placeholder events for up to 5000 IDs below the lowest homepage ID;
       these are resolved by fetching their event pages.
    3. For events with media (or missing title/date), fetch the event page
       to fill in title/date, extract the video URL from the page's
       ``var src = '...'`` assignment, and pick up agenda/minutes links.
    4. Download agenda/minutes documents through ``_scrape_document``.
       Events that yield no document but have a title are recorded as a
       content-less ``MeetingDocument`` pointing at the event page.

    Parameters via target dict:
    url          - portal URL (required); only scheme and host are used
    municipality - municipality name (default "")
    state        - state code (default "")
    max_events   - maximum events to process (default 500, 0 = unlimited)
    start_year   - only include events on/after this year (0 = all)
    fetch_videos - whether to extract video URLs from event pages (default True)

    Args:
        target: Target configuration as described above.
        date_range: Not read by this method.

    Returns:
        List of meeting documents; whatever was collected before an
        unexpected error is still returned.
    """
    url = target["url"]
    municipality = target.get("municipality", "")
    state = target.get("state", "")
    max_events: int = int(target.get("max_events", 500))
    start_year: int = int(target.get("start_year", 0))
    fetch_videos: bool = bool(target.get("fetch_videos", True))

    parsed = urlparse(url)
    base_url = f"{parsed.scheme}://{parsed.netloc}"

    documents: List[Dict[str, Any]] = []

    try:
        # ---- Step 1: fetch homepage and parse event table rows ----
        logger.info(f"Fetching SuiteOne homepage: {base_url}/Web/Home.aspx")
        home_resp = await self.http_client.get(f"{base_url}/Web/Home.aspx")
        home_resp.raise_for_status()
        home_soup = BeautifulSoup(home_resp.content, "html.parser")

        # Each <tr> in an eventTable contains: event link, agenda/minutes PDF links, date text
        events: list[dict] = []
        seen_event_ids: set[int] = set()

        for table in home_soup.find_all("table", class_=re.compile("eventTable", re.I)):
            for tr in table.find_all("tr"):
                row_links = [(a["href"], a.get_text(" ", strip=True)) for a in tr.find_all("a", href=True)]
                row_text = tr.get_text(" ", strip=True)

                event_id = None
                event_title = ""
                agenda_url = ""
                minutes_url = ""
                has_media = False

                for href, text in row_links:
                    m = re.match(r'/event/\?id=(\d+)', href)
                    if m:
                        eid = int(m.group(1))
                        # The first event link in the row defines the event.
                        if event_id is None:
                            event_id = eid
                            event_title = re.sub(r'\(opens in new window\)', '', text).strip()
                    elif "getagendafile" in href.lower():
                        agenda_url = self._normalize_document_url(urljoin(base_url, href))
                    elif "getminutesfile" in href.lower():
                        minutes_url = self._normalize_document_url(urljoin(base_url, href))
                    if "media" in text.lower():
                        has_media = True

                if event_id is None or event_id in seen_event_ids:
                    continue
                seen_event_ids.add(event_id)

                date_m = re.search(
                    r'((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*'
                    r'\s+\d{1,2},?\s*\d{4}(?:\s*\|\s*\d{2}:\d{2}\s*[AP]M)?)',
                    row_text, re.I
                )
                meeting_date = date_m.group(1).strip() if date_m else ""

                year_m = re.search(r'\b(20\d{2})\b', meeting_date)
                if start_year and year_m and int(year_m.group(1)) < start_year:
                    continue

                events.append({
                    "id": event_id,
                    "title": event_title,
                    "date": meeting_date,
                    "agenda_url": agenda_url,
                    "minutes_url": minutes_url,
                    "has_media": has_media,
                })

        logger.info(
            f"Parsed {len(events)} events from SuiteOne homepage table "
            f"({sum(1 for e in events if e['agenda_url'])} with agenda, "
            f"{sum(1 for e in events if e['minutes_url'])} with minutes, "
            f"{sum(1 for e in events if e['has_media'])} with media)"
        )

        # ---- Step 2: extend with historical events if needed ----
        if events and (max_events == 0 or max_events > len(events)):
            lowest_id = min(e["id"] for e in events)
            logger.info(f"Probing historical events below ID {lowest_id}")
            for eid in range(lowest_id - 1, max(1, lowest_id - 5000), -1):
                if eid not in seen_event_ids:
                    seen_event_ids.add(eid)
                    # has_media=True forces the event page fetch in step 3,
                    # which is the only source of data for these IDs.
                    events.append({
                        "id": eid, "title": "", "date": "", "agenda_url": "",
                        "minutes_url": "", "has_media": True,
                    })
            logger.info(f"Expanded to {len(events)} total events (including historical)")

        events.sort(key=lambda e: e["id"], reverse=True)
        if max_events > 0:
            events = events[:max_events]

        logger.info(f"Processing {len(events)} SuiteOne events for {municipality}")

        # ---- Step 3 & 4: for each event, fetch video URL + download PDFs ----
        for i, ev in enumerate(events):
            # Rate limiting: pause before every event after the first so
            # skipped events (404s, filtered pages) still respect the
            # delay; this matters most while probing historical IDs.
            if i:
                await asyncio.sleep(0.3)

            if i and i % 50 == 0:
                logger.info(
                    f"  SuiteOne: {i}/{len(events)} events processed, "
                    f"{len(documents)} docs so far"
                )

            eid = ev["id"]
            event_url = f"{base_url}/event/?id={eid}"

            meeting_date = ev["date"]
            meeting_title = ev["title"]
            meeting_type = re.sub(r'^\d+:\d+\s*[ap]\.m\.\s*', '', meeting_title, flags=re.I).strip() or "Meeting"
            video_url = ""

            # Fetch event page when: has media flag, or missing title/date
            if ev["has_media"] or not meeting_title or not meeting_date:
                try:
                    ev_resp = await self.http_client.get(event_url)
                    if ev_resp.status_code == 404:
                        continue
                    ev_resp.raise_for_status()
                    ev_text = ev_resp.text
                    ev_soup = BeautifulSoup(ev_resp.content, "html.parser")

                    title_tag = ev_soup.find("title")
                    if title_tag:
                        page_title = title_tag.get_text(strip=True).replace("Meeting:", "").strip()
                        # Such a title is treated as "not a real event page".
                        if "upcoming meetings" in page_title.lower():
                            continue
                        if not meeting_title:
                            meeting_title = page_title
                            meeting_type = re.sub(r'^\d+:\d+\s*[ap]\.m\.\s*', '', meeting_title, flags=re.I).strip() or "Meeting"

                    if not meeting_date:
                        dm = re.search(
                            r'((?:January|February|March|April|May|June|July|August|'
                            r'September|October|November|December)\s+\d{1,2},?\s*\d{4})',
                            ev_text
                        )
                        meeting_date = dm.group(1) if dm else ""

                    year_m = re.search(r'\b(20\d{2})\b', meeting_date)
                    if start_year and year_m and int(year_m.group(1)) < start_year:
                        continue

                    if fetch_videos:
                        src_m = re.search(r"var src\s*=\s*'([^']+)';", ev_text)
                        if src_m and src_m.group(1):
                            video_url = src_m.group(1)

                    # Homepage links take precedence; the event page only
                    # fills in agenda/minutes URLs that are still missing.
                    for a in ev_soup.find_all("a", href=True):
                        href = a["href"]
                        full = self._normalize_document_url(urljoin(base_url, href))
                        if "getagendafile" in full.lower() and not ev["agenda_url"]:
                            ev["agenda_url"] = full
                        elif "getminutesfile" in full.lower() and not ev["minutes_url"]:
                            ev["minutes_url"] = full

                    await asyncio.sleep(0.2)

                except Exception as fetch_err:
                    logger.debug(f"Could not fetch event page {eid}: {fetch_err}")

            doc_urls = [(ev["agenda_url"], "Agenda"), (ev["minutes_url"], "Minutes")]
            produced = 0
            for doc_url, doc_type in doc_urls:
                if not doc_url or doc_url in self.scraped_urls:
                    continue
                doc = await self._scrape_document(
                    url=doc_url,
                    municipality=municipality,
                    state=state,
                    title=f"{meeting_title} — {doc_type}",
                )
                if doc:
                    doc["meeting_date"] = meeting_date
                    doc["meeting_type"] = meeting_type
                    meta = doc.setdefault("metadata", {})
                    meta["platform"] = "suiteonemedia"
                    meta["event_id"] = eid
                    meta["doc_type"] = doc_type.lower()
                    if video_url:
                        meta["video_url"] = video_url
                    documents.append(doc)
                    self.scraped_urls.add(doc_url)
                    produced += 1

            # No document text: still record the event (e.g. video-only meetings).
            if produced == 0 and meeting_title and "upcoming meetings" not in meeting_title.lower():
                doc_id = hashlib.md5(event_url.encode()).hexdigest()
                documents.append(MeetingDocument(
                    document_id=doc_id,
                    source_url=event_url,
                    municipality=municipality,
                    state=state,
                    meeting_date=meeting_date or datetime.utcnow().isoformat(),
                    meeting_type=meeting_type,
                    title=meeting_title,
                    content="",
                    metadata={
                        "platform": "suiteonemedia",
                        "event_id": eid,
                        "video_url": video_url,
                        "has_pdf": False,
                    }
                ))

    except Exception as e:
        logger.error(f"Error scraping SuiteOne portal {url}: {e}")

    logger.info(f"SuiteOne scrape complete: {len(documents)} documents from {municipality}")
    return documents
|
| 1058 |
+
|
| 1059 |
+
async def _scrape_eboard_undetected(
|
| 1060 |
+
self,
|
| 1061 |
+
target: Dict[str, Any],
|
| 1062 |
+
date_range: Dict[str, str]
|
| 1063 |
+
) -> List[Dict[str, Any]]:
|
| 1064 |
+
"""
|
| 1065 |
+
Scrape eBoard using undetected-chromedriver to bypass Incapsula automatically.
|
| 1066 |
+
|
| 1067 |
+
This method uses undetected-chromedriver which patches Selenium to avoid detection:
|
| 1068 |
+
- Removes navigator.webdriver flag
|
| 1069 |
+
- Uses real Chrome binary
|
| 1070 |
+
- Randomizes browser fingerprints
|
| 1071 |
+
|
| 1072 |
+
Args:
|
| 1073 |
+
target: Scraping target with URL, municipality, state
|
| 1074 |
+
date_range: Date range for filtering meetings
|
| 1075 |
+
|
| 1076 |
+
Returns:
|
| 1077 |
+
List of meeting documents
|
| 1078 |
+
"""
|
| 1079 |
+
url = target.get("url", "")
|
| 1080 |
+
municipality = target.get("municipality", "Unknown")
|
| 1081 |
+
state = target.get("state", "")
|
| 1082 |
+
|
| 1083 |
+
import undetected_chromedriver as uc
|
| 1084 |
+
from selenium.webdriver.common.by import By
|
| 1085 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 1086 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 1087 |
+
import time
|
| 1088 |
+
import random
|
| 1089 |
+
import re
|
| 1090 |
+
from pathlib import Path
|
| 1091 |
+
|
| 1092 |
+
# Extract school ID from URL
|
| 1093 |
+
school_id_match = re.search(r'[?&]s=(\d+)', url, re.IGNORECASE)
|
| 1094 |
+
school_id = school_id_match.group(1) if school_id_match else None
|
| 1095 |
+
|
| 1096 |
+
if not school_id:
|
| 1097 |
+
logger.error(f"Could not extract school ID from URL: {url}")
|
| 1098 |
+
return []
|
| 1099 |
+
|
| 1100 |
+
base_url = "https://simbli.eboardsolutions.com"
|
| 1101 |
+
meetings_url = f"{base_url}/SB_Meetings/SB_MeetingListing.aspx?S={school_id}"
|
| 1102 |
+
|
| 1103 |
+
logger.info(f"Using undetected-chromedriver for: {meetings_url}")
|
| 1104 |
+
|
| 1105 |
+
documents = []
|
| 1106 |
+
driver = None
|
| 1107 |
+
|
| 1108 |
+
try:
|
| 1109 |
+
# Create undetected Chrome instance
|
| 1110 |
+
options = uc.ChromeOptions()
|
| 1111 |
+
# Try headless mode (may work better with newer Chrome)
|
| 1112 |
+
options.add_argument('--headless=new') # New headless mode
|
| 1113 |
+
options.add_argument('--no-sandbox')
|
| 1114 |
+
options.add_argument('--disable-dev-shm-usage')
|
| 1115 |
+
options.add_argument('--disable-blink-features=AutomationControlled')
|
| 1116 |
+
options.add_argument('--window-size=1920,1080')
|
| 1117 |
+
|
| 1118 |
+
logger.info("Launching Chrome with anti-detection patches...")
|
| 1119 |
+
|
| 1120 |
+
# Let undetected-chromedriver auto-download matching ChromeDriver
|
| 1121 |
+
# Specify version_main to match system Chrome (147)
|
| 1122 |
+
try:
|
| 1123 |
+
driver = uc.Chrome(
|
| 1124 |
+
options=options,
|
| 1125 |
+
version_main=147, # Match Chromium version
|
| 1126 |
+
use_subprocess=True
|
| 1127 |
+
)
|
| 1128 |
+
except Exception as e:
|
| 1129 |
+
logger.warning(f"Headless mode failed: {e}, trying with visible browser...")
|
| 1130 |
+
# Try without headless as fallback
|
| 1131 |
+
options = uc.ChromeOptions()
|
| 1132 |
+
options.add_argument('--no-sandbox')
|
| 1133 |
+
options.add_argument('--disable-dev-shm-usage')
|
| 1134 |
+
driver = uc.Chrome(options=options, version_main=147, use_subprocess=True)
|
| 1135 |
+
|
| 1136 |
+
# Navigate to meetings page
|
| 1137 |
+
logger.info("Loading meeting listing page...")
|
| 1138 |
+
driver.get(meetings_url)
|
| 1139 |
+
|
| 1140 |
+
# Wait for Incapsula challenge to complete
|
| 1141 |
+
wait_time = random.uniform(6.0, 9.0)
|
| 1142 |
+
logger.info(f"Waiting {wait_time:.1f}s for Incapsula challenge...")
|
| 1143 |
+
time.sleep(wait_time)
|
| 1144 |
+
|
| 1145 |
+
# Check if we bypassed Incapsula
|
| 1146 |
+
page_source = driver.page_source
|
| 1147 |
+
|
| 1148 |
+
if 'Incapsula incident ID' in page_source or ('Incapsula' in page_source and len(page_source) < 10000):
|
| 1149 |
+
logger.error(f"Still blocked by Incapsula ({len(page_source)} bytes)")
|
| 1150 |
+
logger.error("Incapsula detection triggered despite undetected-chromedriver")
|
| 1151 |
+
raise Exception("Incapsula block detected")
|
| 1152 |
+
|
| 1153 |
+
logger.success(f"✓ Bypassed Incapsula! Page size: {len(page_source)} bytes")
|
| 1154 |
+
|
| 1155 |
+
# Parse the page
|
| 1156 |
+
soup = BeautifulSoup(page_source, 'html.parser')
|
| 1157 |
+
|
| 1158 |
+
# Extract meeting links
|
| 1159 |
+
meeting_links = []
|
| 1160 |
+
|
| 1161 |
+
# Look for links with MID parameter or PDFs
|
| 1162 |
+
for link in soup.find_all('a', href=True):
|
| 1163 |
+
href = link.get('href', '')
|
| 1164 |
+
text = link.get_text().strip()
|
| 1165 |
+
|
| 1166 |
+
if 'MID=' in href.upper() or 'meetingdetail' in href.lower():
|
| 1167 |
+
full_url = urljoin(base_url, href)
|
| 1168 |
+
meeting_links.append({
|
| 1169 |
+
'url': full_url,
|
| 1170 |
+
'text': text,
|
| 1171 |
+
'type': 'meeting'
|
| 1172 |
+
})
|
| 1173 |
+
elif href.lower().endswith('.pdf') and any(word in text.lower() for word in ['agenda', 'minutes', 'packet', 'meeting']):
|
| 1174 |
+
full_url = urljoin(base_url, href)
|
| 1175 |
+
meeting_links.append({
|
| 1176 |
+
'url': full_url,
|
| 1177 |
+
'text': text,
|
| 1178 |
+
'type': 'pdf'
|
| 1179 |
+
})
|
| 1180 |
+
|
| 1181 |
+
logger.info(f"Found {len(meeting_links)} meeting/document links")
|
| 1182 |
+
|
| 1183 |
+
# If no links found, try waiting for JavaScript-rendered content
|
| 1184 |
+
if len(meeting_links) == 0:
|
| 1185 |
+
logger.warning("No links found initially, waiting for JavaScript...")
|
| 1186 |
+
try:
|
| 1187 |
+
WebDriverWait(driver, 10).until(
|
| 1188 |
+
EC.presence_of_element_located((By.TAG_NAME, "a"))
|
| 1189 |
+
)
|
| 1190 |
+
time.sleep(3)
|
| 1191 |
+
|
| 1192 |
+
# Re-parse
|
| 1193 |
+
page_source = driver.page_source
|
| 1194 |
+
soup = BeautifulSoup(page_source, 'html.parser')
|
| 1195 |
+
|
| 1196 |
+
for link in soup.find_all('a', href=True):
|
| 1197 |
+
href = link.get('href', '')
|
| 1198 |
+
text = link.get_text().strip()
|
| 1199 |
+
|
| 1200 |
+
if 'MID=' in href.upper() or href.lower().endswith('.pdf'):
|
| 1201 |
+
full_url = urljoin(base_url, href)
|
| 1202 |
+
meeting_links.append({
|
| 1203 |
+
'url': full_url,
|
| 1204 |
+
'text': text,
|
| 1205 |
+
'type': 'pdf' if href.lower().endswith('.pdf') else 'meeting'
|
| 1206 |
+
})
|
| 1207 |
+
|
| 1208 |
+
logger.info(f"After JS wait: {len(meeting_links)} links")
|
| 1209 |
+
except Exception as e:
|
| 1210 |
+
logger.warning(f"JS content wait failed: {e}")
|
| 1211 |
+
|
| 1212 |
+
# Save page HTML for debugging
|
| 1213 |
+
debug_file = Path("/tmp/eboard_meeting_list_undetected.html")
|
| 1214 |
+
with open(debug_file, 'w', encoding='utf-8') as f:
|
| 1215 |
+
f.write(page_source)
|
| 1216 |
+
logger.info(f"Saved page HTML to {debug_file} for debugging")
|
| 1217 |
+
|
| 1218 |
+
# Process meeting links (limit to prevent overwhelming)
|
| 1219 |
+
for idx, meeting_info in enumerate(meeting_links[:50]):
|
| 1220 |
+
if idx > 0 and idx % 10 == 0:
|
| 1221 |
+
logger.info(f"Progress: {idx}/{min(50, len(meeting_links))}")
|
| 1222 |
+
|
| 1223 |
+
# Human-like delay
|
| 1224 |
+
time.sleep(random.uniform(2.0, 4.0))
|
| 1225 |
+
|
| 1226 |
+
try:
|
| 1227 |
+
meeting_url = meeting_info['url']
|
| 1228 |
+
meeting_title = meeting_info['text']
|
| 1229 |
+
|
| 1230 |
+
if meeting_info['type'] == 'pdf':
|
| 1231 |
+
# Record PDF link for later download
|
| 1232 |
+
logger.debug(f" Found PDF: {meeting_title[:50]}")
|
| 1233 |
+
|
| 1234 |
+
# Try to extract date from title
|
| 1235 |
+
meeting_date = datetime.now()
|
| 1236 |
+
try:
|
| 1237 |
+
from dateutil import parser as date_parser
|
| 1238 |
+
date_match = re.search(r'(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})', meeting_title)
|
| 1239 |
+
if date_match:
|
| 1240 |
+
meeting_date = date_parser.parse(date_match.group(0))
|
| 1241 |
+
except:
|
| 1242 |
+
pass
|
| 1243 |
+
|
| 1244 |
+
# Download and extract PDF content
|
| 1245 |
+
try:
|
| 1246 |
+
pdf_content = await self._scrape_pdf_document(meeting_url)
|
| 1247 |
+
if pdf_content and len(pdf_content.strip()) > 100:
|
| 1248 |
+
doc = MeetingDocument(
|
| 1249 |
+
document_id=hashlib.md5(f"{meeting_url}{municipality}".encode()).hexdigest(),
|
| 1250 |
+
source_url=meeting_url,
|
| 1251 |
+
municipality=municipality,
|
| 1252 |
+
state=state,
|
| 1253 |
+
meeting_date=meeting_date,
|
| 1254 |
+
meeting_type='Board Meeting',
|
| 1255 |
+
title=meeting_title,
|
| 1256 |
+
content=pdf_content[:50000],
|
| 1257 |
+
metadata={
|
| 1258 |
+
'platform': 'eboard',
|
| 1259 |
+
'school_id': school_id,
|
| 1260 |
+
'scraped_with': 'undetected_chromedriver'
|
| 1261 |
+
}
|
| 1262 |
+
)
|
| 1263 |
+
documents.append(doc)
|
| 1264 |
+
logger.success(f" ✓ Scraped PDF: {meeting_title[:50]}")
|
| 1265 |
+
except Exception as e:
|
| 1266 |
+
logger.error(f" Error downloading PDF: {e}")
|
| 1267 |
+
|
| 1268 |
+
else:
|
| 1269 |
+
# Navigate to meeting detail page
|
| 1270 |
+
logger.debug(f" Loading meeting: {meeting_title[:50]}")
|
| 1271 |
+
driver.get(meeting_url)
|
| 1272 |
+
time.sleep(random.uniform(2.0, 4.0))
|
| 1273 |
+
|
| 1274 |
+
meeting_soup = BeautifulSoup(driver.page_source, 'html.parser')
|
| 1275 |
+
|
| 1276 |
+
# Extract meeting date
|
| 1277 |
+
meeting_date = datetime.now()
|
| 1278 |
+
try:
|
| 1279 |
+
from dateutil import parser as date_parser
|
| 1280 |
+
for elem in meeting_soup.find_all(['h1', 'h2', 'h3', 'div', 'span']):
|
| 1281 |
+
text = elem.get_text().strip()
|
| 1282 |
+
date_match = re.search(r'(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})', text)
|
| 1283 |
+
if date_match:
|
| 1284 |
+
meeting_date = date_parser.parse(date_match.group(0))
|
| 1285 |
+
break
|
| 1286 |
+
except:
|
| 1287 |
+
pass
|
| 1288 |
+
|
| 1289 |
+
# Find document links (PDFs)
|
| 1290 |
+
doc_links = []
|
| 1291 |
+
for link in meeting_soup.find_all('a', href=True):
|
| 1292 |
+
href = link.get('href', '')
|
| 1293 |
+
link_text = link.get_text().strip()
|
| 1294 |
+
|
| 1295 |
+
if href.lower().endswith('.pdf') or any(word in link_text.lower() for word in ['agenda', 'minutes', 'packet']):
|
| 1296 |
+
doc_url = urljoin(base_url, href)
|
| 1297 |
+
doc_links.append({
|
| 1298 |
+
'url': doc_url,
|
| 1299 |
+
'text': link_text
|
| 1300 |
+
})
|
| 1301 |
+
|
| 1302 |
+
logger.info(f" Found {len(doc_links)} documents")
|
| 1303 |
+
|
| 1304 |
+
# Download each document
|
| 1305 |
+
for doc_info in doc_links[:5]:
|
| 1306 |
+
try:
|
| 1307 |
+
doc_url = doc_info['url']
|
| 1308 |
+
doc_title = doc_info['text']
|
| 1309 |
+
|
| 1310 |
+
if doc_url.lower().endswith('.pdf'):
|
| 1311 |
+
doc_content = await self._scrape_pdf_document(doc_url)
|
| 1312 |
+
|
| 1313 |
+
if doc_content and len(doc_content.strip()) > 100:
|
| 1314 |
+
doc = MeetingDocument(
|
| 1315 |
+
document_id=hashlib.md5(f"{doc_url}{municipality}".encode()).hexdigest(),
|
| 1316 |
+
source_url=doc_url,
|
| 1317 |
+
municipality=municipality,
|
| 1318 |
+
state=state,
|
| 1319 |
+
meeting_date=meeting_date,
|
| 1320 |
+
meeting_type='Board Meeting',
|
| 1321 |
+
title=doc_title or meeting_title,
|
| 1322 |
+
content=doc_content[:50000],
|
| 1323 |
+
metadata={
|
| 1324 |
+
'platform': 'eboard',
|
| 1325 |
+
'meeting_page': meeting_url,
|
| 1326 |
+
'school_id': school_id,
|
| 1327 |
+
'scraped_with': 'undetected_chromedriver'
|
| 1328 |
+
}
|
| 1329 |
+
)
|
| 1330 |
+
documents.append(doc)
|
| 1331 |
+
logger.success(f" ✓ Scraped: {doc_title[:50]}")
|
| 1332 |
+
|
| 1333 |
+
except Exception as e:
|
| 1334 |
+
logger.error(f" Error scraping document: {e}")
|
| 1335 |
+
|
| 1336 |
+
except Exception as e:
|
| 1337 |
+
logger.error(f"Error processing {meeting_info.get('text', 'unknown')}: {e}")
|
| 1338 |
+
continue
|
| 1339 |
+
|
| 1340 |
+
except Exception as e:
|
| 1341 |
+
logger.error(f"Error in undetected scraper: {e}")
|
| 1342 |
+
import traceback
|
| 1343 |
+
logger.error(traceback.format_exc())
|
| 1344 |
+
raise # Re-raise to trigger fallback
|
| 1345 |
+
|
| 1346 |
+
finally:
|
| 1347 |
+
if driver:
|
| 1348 |
+
try:
|
| 1349 |
+
driver.quit()
|
| 1350 |
+
except:
|
| 1351 |
+
pass
|
| 1352 |
+
|
| 1353 |
+
logger.success(f"Undetected scraper complete: {len(documents)} documents")
|
| 1354 |
+
return documents
|
| 1355 |
+
|
| 1356 |
+
async def _scrape_eboard(
    self,
    target: Dict[str, Any],
    date_range: Dict[str, str]
) -> List[Dict[str, Any]]:
    """
    Scrape eBoard Solutions platform (used by many school districts).

    eBoard uses ASP.NET with JavaScript and Incapsula bot protection.

    Bypass methods (in order):
    1. Undetected ChromeDriver - delegated to ``_scrape_eboard_undetected``
    2. Playwright + playwright-stealth, optionally seeded with session
       cookies read from ``eboard_cookies.json`` in the working directory

    Args:
        target: Scraping target; the keys ``url``, ``municipality`` and
            ``state`` are read (all optional, with defaults).
        date_range: Date range for filtering meetings. It is forwarded to the
            undetected-chromedriver scraper; the Playwright fallback below
            does not filter by it.

    Returns:
        List of ``MeetingDocument`` objects built from the PDFs that yielded
        more than 100 characters of text. An empty list is returned when the
        school ID cannot be extracted, a dependency is missing, or the page
        is still blocked by Incapsula.
    """
    # Function-scope stdlib imports: the original block imported these
    # locally, so module-level availability is not assumed.
    import json
    import random
    import re
    from pathlib import Path

    url = target.get("url", "")
    municipality = target.get("municipality", "Unknown")
    state = target.get("state", "")

    logger.info(f"Scraping eBoard platform: {url} for {municipality}")

    # Try undetected-chromedriver first (automatic bypass)
    try:
        import undetected_chromedriver  # noqa: F401 - availability probe only
        logger.info("Attempting with undetected-chromedriver (automatic bot evasion)")
        return await self._scrape_eboard_undetected(target, date_range)
    except ImportError:
        logger.warning("undetected-chromedriver not available, falling back to Playwright")
    except Exception as e:
        logger.warning(f"Undetected ChromeDriver failed: {e}, falling back to Playwright")

    # Fallback to Playwright with cookies
    logger.info("Using Playwright with manual cookies (if available)")

    # Patterns are compiled once instead of on every link / element.
    numeric_date_re = re.compile(r'(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})')
    month_date_re = re.compile(
        r'(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}',
        re.IGNORECASE,
    )
    mid_re = re.compile(r'MID=(\d+)', re.IGNORECASE)
    doc_words = ('agenda', 'minutes', 'packet')

    def parse_date(text):
        """Return the first date found in *text* as a datetime, else None."""
        match = numeric_date_re.search(text) or month_date_re.search(text)
        if match is None:
            return None
        try:
            from dateutil import parser as date_parser
            return date_parser.parse(match.group(0))
        except (ImportError, ValueError, OverflowError):
            # dateutil missing, or the matched text is not a real date.
            return None

    def block_reason(html):
        """Return a short reason if *html* looks like an Incapsula block page."""
        if len(html) < 5000:
            return f"page too small: {len(html)} bytes"
        if 'Incapsula incident ID' in html:
            return "incident ID found"
        if 'Request unsuccessful. Incapsula' in html:
            return "request unsuccessful message"
        if '<title>Access Denied</title>' in html or '<title>Blocked</title>' in html:
            return "access denied title"
        return None

    def make_document(source_url, title, text, meeting_date, metadata):
        """Build a MeetingDocument with the ID scheme used across the scraper."""
        return MeetingDocument(
            document_id=hashlib.md5(f"{source_url}{municipality}".encode()).hexdigest(),
            source_url=source_url,
            municipality=municipality,
            state=state,
            meeting_date=meeting_date,
            meeting_type='Board Meeting',
            title=title,
            content=text[:50000],
            metadata=metadata,
        )

    documents = []

    try:
        from playwright.async_api import async_playwright
        from playwright_stealth import Stealth

        # Extract school ID from URL (S=xxxx parameter)
        school_id_match = re.search(r'[?&]s=(\d+)', url, re.IGNORECASE)
        school_id = school_id_match.group(1) if school_id_match else None

        if not school_id:
            logger.error(f"Could not extract school ID from URL: {url}")
            return []

        # Target the Meeting Listing page directly (bypasses some Incapsula triggers)
        base_url = "https://simbli.eboardsolutions.com"
        meetings_url = f"{base_url}/SB_Meetings/SB_MeetingListing.aspx?S={school_id}"

        # Check for manual cookies file
        cookie_file = Path("eboard_cookies.json")
        cookies = None
        if cookie_file.exists():
            try:
                with open(cookie_file, 'r', encoding='utf-8') as f:
                    cookies = json.load(f)
                logger.success(f"✓ Loaded {len(cookies)} cookies from eboard_cookies.json")
                logger.info("Using manual session cookies to bypass Incapsula")
            except Exception as e:
                logger.warning(f"Could not load cookies: {e}")
        else:
            logger.info("No cookie file found. Will attempt without cookies (may be blocked)")
            logger.info(f"To bypass Incapsula: Create {cookie_file.absolute()}")
            logger.info("See docs/EBOARD_MANUAL_DOWNLOAD.md for instructions")

        logger.info(f"Targeting Meeting Listing: {meetings_url}")

        async with async_playwright() as p:
            # Launch browser with anti-detection settings
            logger.info("Launching browser with stealth settings to bypass Incapsula")
            browser = await p.chromium.launch(
                headless=True,  # Stealth makes headless work
                args=[
                    '--disable-blink-features=AutomationControlled',
                    '--disable-dev-shm-usage',
                    '--no-sandbox'
                ]
            )

            # try/finally guarantees the browser is closed on every exit
            # path (blocked page, unexpected error, normal completion).
            try:
                # CRITICAL: User-Agent must match the browser used to generate cookies
                # If cookies were from Chrome 123, use Chrome 123 UA
                user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'

                context = await browser.new_context(
                    viewport={'width': 1920, 'height': 1080},
                    user_agent=user_agent,
                    locale='en-US',
                    timezone_id='America/Chicago',
                    # Additional fingerprinting evasion
                    geolocation={'latitude': 33.2098, 'longitude': -87.5692},  # Tuscaloosa, AL
                    permissions=['geolocation']
                )

                page = await context.new_page()

                # Apply stealth to bypass Incapsula fingerprinting
                stealth = Stealth()
                await stealth.apply_stealth_async(page)
                logger.info("Stealth mode activated")

                # Inject cookies if available (CRITICAL for bypassing Incapsula)
                if cookies:
                    await context.add_cookies(cookies)
                    logger.success("✓ Cookies injected into browser session")

                # Navigate to Meeting Listing
                logger.info("Loading Meeting Listing page...")
                try:
                    # Simulate human behavior - move mouse before navigation
                    await page.mouse.move(random.randint(100, 500), random.randint(100, 500))

                    response = await page.goto(meetings_url, wait_until='networkidle', timeout=60000)
                    logger.info(f"Response status: {response.status if response else 'No response'}")
                except Exception as e:
                    logger.warning(f"Navigation timeout/error: {e}, continuing anyway...")

                # Wait for Incapsula JavaScript challenge to complete
                # CRITICAL: Use randomized delay (not flat sleep) to avoid pattern detection
                wait_time = random.uniform(5.0, 7.0)
                logger.info(f"Waiting {wait_time:.1f}s for Incapsula JavaScript challenge...")
                await page.wait_for_timeout(int(wait_time * 1000))

                # Check if we got through
                content = await page.content()

                reason = block_reason(content)
                if reason:
                    logger.error(f"Still blocked by Incapsula ({reason})")
                    logger.warning("Try running with headless=False or use manual session cookies")
                    logger.info("See docs/EBOARD_MANUAL_DOWNLOAD.md for manual download guide")
                    return []

                logger.success(f"✓ Bypassed Incapsula! Got {len(content)} bytes")

                # Save HTML for debugging. This is best-effort: a missing or
                # read-only /tmp must not discard a successful page load.
                debug_file = Path("/tmp/eboard_success.html")
                try:
                    debug_file.write_text(content, encoding='utf-8')
                    logger.info(f"Saved successful page to {debug_file}")
                except (OSError, UnicodeError) as e:
                    logger.warning(f"Could not save debug HTML to {debug_file}: {e}")

                # Parse the page
                soup = BeautifulSoup(content, 'html.parser')

                # Debug: Log all links found on page
                all_links = soup.find_all('a', href=True)
                logger.info(f"Total <a> tags with href found: {len(all_links)}")
                if all_links:
                    logger.info("Sample links (first 10):")
                    for i, link in enumerate(all_links[:10], start=1):
                        href = link.get('href', '')[:100]
                        text = link.get_text().strip()[:50]
                        logger.info(f" {i}. {text} -> {href}")

                # Meeting detail links (MID parameter) come first, followed by
                # direct agenda/minutes/packet PDFs -- same order as before.
                detail_links = []
                pdf_links = []
                for link in all_links:
                    href = link.get('href', '')
                    text = link.get_text().strip()
                    # Resolve against the listing page itself so that
                    # document-relative hrefs keep the /SB_Meetings/ path;
                    # absolute and root-relative hrefs are unaffected.
                    full_url = urljoin(meetings_url, href)

                    if 'MID=' in href.upper() or 'meetingdetail' in href.lower():
                        mid_match = mid_re.search(href)
                        detail_links.append({
                            'url': full_url,
                            'text': text,
                            'mid': mid_match.group(1) if mid_match else None
                        })

                    if href.lower().endswith('.pdf') and any(word in text.lower() for word in doc_words):
                        pdf_links.append({
                            'url': full_url,
                            'text': text,
                            'type': 'pdf'
                        })

                meeting_links = detail_links + pdf_links
                logger.info(f"Found {len(meeting_links)} meeting/document links")

                # Process each meeting (limit to prevent overwhelming)
                batch = meeting_links[:50]
                for idx, meeting_info in enumerate(batch):
                    try:
                        meeting_url = meeting_info['url']
                        meeting_title = meeting_info['text']

                        if idx > 0 and idx % 10 == 0:
                            logger.info(f" Progress: {idx}/{len(batch)} meetings processed")

                        # CRITICAL: Randomized rate limiting to prevent Advanced Mode trigger
                        # Never use flat sleep - Incapsula detects patterns
                        await asyncio.sleep(random.uniform(3.0, 7.0))

                        # Simulate human mouse movement before each action
                        await page.mouse.move(random.randint(200, 800), random.randint(200, 600))

                        # Handle PDF links directly
                        if meeting_info.get('type') == 'pdf':
                            try:
                                pdf_content = await self._scrape_pdf_document(meeting_url)

                                if pdf_content and len(pdf_content.strip()) > 100:
                                    # Fall back to "now" when the title carries no parseable date.
                                    meeting_date = parse_date(meeting_title) or datetime.now()

                                    documents.append(make_document(
                                        meeting_url,
                                        meeting_title,
                                        pdf_content,
                                        meeting_date,
                                        {
                                            'platform': 'eboard',
                                            'school_id': school_id,
                                            'scraped_with': 'playwright_stealth'
                                        },
                                    ))
                                    logger.success(f" ✓ Scraped PDF: {meeting_title[:50]}")

                            except Exception as e:
                                logger.error(f" Error downloading PDF: {e}")
                                continue

                        # Handle meeting detail pages
                        else:
                            logger.debug(f" Loading meeting detail: {meeting_title[:50]}")

                            try:
                                # Simulate clicking on link (human-like behavior)
                                await page.mouse.move(random.randint(300, 700), random.randint(200, 500))
                                await page.goto(meeting_url, wait_until='domcontentloaded', timeout=30000)

                                # Random wait to appear human
                                await page.wait_for_timeout(random.randint(1500, 3000))

                                meeting_content = await page.content()
                                meeting_soup = BeautifulSoup(meeting_content, 'html.parser')

                                # Extract meeting date: first heading/div/span whose
                                # text contains a parseable date wins.
                                meeting_date = None
                                for elem in meeting_soup.find_all(['h1', 'h2', 'h3', 'div', 'span']):
                                    meeting_date = parse_date(elem.get_text().strip())
                                    if meeting_date is not None:
                                        break
                                if meeting_date is None:
                                    meeting_date = datetime.now()

                                # Find document links (PDFs)
                                doc_links = []
                                for link in meeting_soup.find_all('a', href=True):
                                    href = link.get('href', '')
                                    link_text = link.get_text().strip()
                                    lowered_text = link_text.lower()

                                    if href.lower().endswith('.pdf') or any(word in lowered_text for word in doc_words):
                                        doc_links.append({
                                            # Relative hrefs are relative to the detail page.
                                            'url': urljoin(meeting_url, href),
                                            'text': link_text
                                        })

                                logger.info(f" Found {len(doc_links)} documents for {meeting_title[:40]}")

                                # Download each document
                                for doc_info in doc_links[:5]:  # Limit per meeting
                                    try:
                                        doc_url = doc_info['url']
                                        doc_title = doc_info['text']

                                        # Only links that actually end in .pdf are downloaded.
                                        if not doc_url.lower().endswith('.pdf'):
                                            continue

                                        doc_content = await self._scrape_pdf_document(doc_url)

                                        if doc_content and len(doc_content.strip()) > 100:
                                            documents.append(make_document(
                                                doc_url,
                                                doc_title or meeting_title,
                                                doc_content,
                                                meeting_date,
                                                {
                                                    'platform': 'eboard',
                                                    'meeting_page': meeting_url,
                                                    'school_id': school_id,
                                                    'meeting_id': meeting_info.get('mid'),
                                                    'scraped_with': 'playwright_stealth'
                                                },
                                            ))
                                            logger.success(f" ✓ Scraped: {doc_title[:50]}")

                                    except Exception as e:
                                        logger.error(f" Error scraping document: {e}")
                                        continue

                            except Exception as e:
                                logger.error(f" Error processing meeting {meeting_title[:40]}: {e}")
                                continue

                    except Exception as e:
                        logger.error(f"Error processing meeting link: {e}")
                        continue

            finally:
                # Close browser
                await browser.close()

    except ImportError as e:
        logger.error(f"Missing dependency: {e}")
        logger.error("Install with: pip install playwright-stealth && playwright install chromium")
        return []
    except Exception as e:
        logger.error(f"Error scraping eBoard {url}: {e}")
        import traceback
        logger.error(traceback.format_exc())

    logger.success(f"eBoard scrape complete: {len(documents)} documents from {municipality}")
    return documents
|
| 1746 |
+
|
| 1747 |
+
async def _scrape_generic(
    self,
    target: Dict[str, Any],
    date_range: Dict[str, str]
) -> List[Dict[str, Any]]:
    """
    Collect meeting documents from a municipal site with no known platform.

    The landing page is fetched, document links are harvested from it, and up
    to eight meeting-looking subpages are crawled for additional links. At
    most fifty unique candidates are then passed to ``_scrape_document``.

    Args:
        target: Target configuration; ``url``, ``municipality`` and ``state``
            are required keys.
        date_range: Date range for filtering (not consulted by this method).

    Returns:
        The documents produced by ``_scrape_document``; any failure after the
        target keys are read is logged and whatever was collected so far is
        returned.
    """
    url = target["url"]
    municipality = target["municipality"]
    state = target["state"]

    def mentions_meeting(haystack: str) -> bool:
        # True when any configured meeting keyword occurs in the text.
        return any(keyword in haystack for keyword in self.meeting_keywords)

    documents = []

    try:
        landing = await self.http_client.get(url)
        landing.raise_for_status()
        landing_soup = BeautifulSoup(landing.content, "html.parser")

        candidate_documents = self._extract_document_candidates(
            page_url=url,
            html=landing.text,
            soup=landing_soup
        )

        # Many sites (JS-heavy portals included) keep the actual document
        # links one level down, so follow a handful of likely meeting pages.
        subpages = self._extract_meeting_pages(page_url=url, soup=landing_soup)
        for subpage_url in subpages[:8]:
            try:
                subpage = await self.http_client.get(subpage_url)
                if subpage.status_code >= 400:
                    continue

                candidate_documents.extend(
                    self._extract_document_candidates(
                        page_url=subpage_url,
                        html=subpage.text,
                        soup=BeautifulSoup(subpage.content, "html.parser")
                    )
                )
                await asyncio.sleep(0.2)
            except Exception as page_err:
                logger.debug(f"Could not scrape meeting page {subpage_url}: {page_err}")

        # Order-preserving de-duplication: the first label seen for a URL wins.
        first_label_by_url = {}
        for doc_url, doc_label in candidate_documents:
            first_label_by_url.setdefault(doc_url, doc_label)

        for doc_url, doc_label in list(first_label_by_url.items())[:50]:
            if doc_url in self.scraped_urls:
                continue

            # A labelled link must mention a meeting keyword in either its
            # label or its URL; unlabelled links are always attempted.
            label = (doc_label or "").lower()
            if label and not mentions_meeting(label) and not mentions_meeting(doc_url.lower()):
                continue

            doc = await self._scrape_document(
                url=doc_url,
                municipality=municipality,
                state=state,
                title=doc_label or "meeting document"
            )
            if doc:
                documents.append(doc)
                self.scraped_urls.add(doc_url)

            await asyncio.sleep(0.2)

    except Exception as e:
        logger.error(f"Error scraping generic site {url}: {e}")

    return documents
|
| 1835 |
+
|
| 1836 |
+
def _extract_document_candidates(
    self,
    page_url: str,
    html: str,
    soup: BeautifulSoup
) -> List[tuple[str, str]]:
    """
    Gather ``(url, label)`` pairs for documents referenced by a page.

    Sources, in output order: ``<a href>`` anchors, absolute document URLs
    found anywhere in the raw HTML, quoted ``/event/Get{Agenda,Minutes}File/``
    routes, and finally relative document paths. Duplicates are not removed.
    """
    normalize = self._normalize_document_url
    found: List[tuple[str, str]] = []

    # 1) Anchors: keep those whose resolved URL contains a known document
    #    extension or a known document route keyword.
    for anchor in soup.find_all("a", href=True):
        href = anchor.get("href", "")
        if not href:
            continue
        resolved = normalize(urljoin(page_url, href))
        haystack = resolved.lower()
        looks_like_file = any(ext in haystack for ext in self.document_extensions)
        if looks_like_file or any(k in haystack for k in self.document_route_keywords):
            label = anchor.get_text(" ", strip=True) or anchor.get("title", "") or "document"
            found.append((resolved, label))

    # 2) Raw-HTML scan, for JS-driven portals that embed links in JSON blobs.
    url_pattern = r'(https?://[^"\'\s)]+\.(?:pdf|docx?|pptx?|xlsx?)(?:\?[^"\'\s)]*)?)'
    rel_pattern = r'([\w/\-.]+\.(?:pdf|docx?|pptx?|xlsx?)(?:\?[^"\'\s)]*)?)'
    route_pattern = r'(["\'](?:/event/Get(?:Agenda|Minutes)File/[^"\']+)["\'])'

    found.extend(
        (normalize(absolute), "document")
        for absolute in re.findall(url_pattern, html, flags=re.IGNORECASE)
    )

    found.extend(
        (normalize(urljoin(page_url, quoted.strip("\"'"))), "document")
        for quoted in re.findall(route_pattern, html, flags=re.IGNORECASE)
    )

    # Relative paths: skip anything starting with "http", and bare file names
    # with no "/" at all.
    found.extend(
        (normalize(urljoin(page_url, relative)), "document")
        for relative in re.findall(rel_pattern, html, flags=re.IGNORECASE)
        if not relative.startswith("http") and "/" in relative
    )

    return found
|
| 1876 |
+
|
| 1877 |
+
def _normalize_document_url(self, url: str) -> str:
|
| 1878 |
+
"""Clean common malformed URL artifacts found in embedded portal markup."""
|
| 1879 |
+
normalized = url.strip()
|
| 1880 |
+
normalized = normalized.replace(" %20?", "?")
|
| 1881 |
+
normalized = normalized.replace("%20?", "?")
|
| 1882 |
+
normalized = normalized.replace(" ?", "?")
|
| 1883 |
+
return normalized
|
| 1884 |
+
|
| 1885 |
+
def _extract_meeting_pages(self, page_url: str, soup: BeautifulSoup) -> List[str]:
    """
    Return unique, absolute URLs of subpages that look meeting-related.

    An anchor qualifies when its resolved URL contains ``/event/?id=`` or when
    any of ``self.meeting_keywords`` occurs in its lower-cased text or URL.
    First-seen order is preserved.
    """
    # A dict is used as an insertion-ordered set.
    ordered_pages = {}
    for anchor in soup.find_all("a", href=True):
        href = anchor.get("href", "")
        if not href:
            continue

        absolute = urljoin(page_url, href)
        absolute_lower = absolute.lower()
        anchor_text = anchor.get_text(" ", strip=True).lower()
        searchable = f"{anchor_text} {absolute_lower}"

        is_event_route = "/event/?id=" in absolute_lower
        if is_event_route or any(keyword in searchable for keyword in self.meeting_keywords):
            ordered_pages.setdefault(absolute, None)

    return list(ordered_pages)
|
| 1906 |
+
|
| 1907 |
+
async def _scrape_meeting_page(
    self,
    url: str,
    municipality: str,
    state: str,
    meeting_data: Optional[Dict[str, Any]] = None
) -> Optional[Dict[str, Any]]:
    """
    Fetch one meeting page and wrap its text in a MeetingDocument.

    Args:
        url: Meeting page URL
        municipality: Municipality name
        state: State code
        meeting_data: Optional pre-fetched meeting data, stored verbatim under
            ``metadata["raw_data"]``

    Returns:
        The constructed MeetingDocument, or None when the request or parsing
        raises (the error is logged).
    """
    try:
        response = await self.http_client.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        # Title: prefer the first <h1>, then the <title> element.
        heading = soup.find("h1") or soup.find("title")
        if heading:
            title_text = heading.get_text().strip()
        else:
            title_text = "Untitled Meeting"

        # Body: a div.meeting-content container, otherwise <main>.
        body = soup.find("div", class_="meeting-content") or soup.find("main")
        if body:
            content = body.get_text(separator="\n").strip()
        else:
            content = ""

        # Date and meeting type are placeholders; nothing is parsed from the
        # page for them here.
        return MeetingDocument(
            document_id=hashlib.md5(f"{url}{municipality}".encode()).hexdigest(),
            source_url=url,
            municipality=municipality,
            state=state,
            meeting_date=datetime.utcnow(),
            meeting_type="City Council",
            title=title_text,
            content=content,
            metadata={"platform": "web", "raw_data": meeting_data}
        )

    except Exception as e:
        logger.error(f"Error scraping meeting page {url}: {e}")
        return None
|
| 1960 |
+
|
| 1961 |
+
async def _scrape_document(
    self,
    url: str,
    municipality: str,
    state: str,
    title: str
) -> Optional[Dict[str, Any]]:
    """
    Download a document and extract its text, using OCR as a fallback.

    PDFs are parsed with ``PdfReader`` when it is available (first 30 pages).
    If that yields no text, fails, or the reader is not installed, the PDF is
    passed to ``_ocr_pdf_bytes``. Image documents are passed to
    ``_ocr_image_bytes``. When nothing can be extracted, the content is one of
    the bracketed placeholder strings and ``metadata["text_extracted"]`` is
    False.

    Args:
        url: Document URL (PDF or image)
        municipality: Municipality name
        state: State code
        title: Document title

    Returns:
        Meeting document, or None if the download or processing failed
    """
    # Placeholder contents recorded when no text could be extracted.
    unavailable = "[Document content extraction unavailable]"
    pdf_no_text = "[PDF has no extractable text]"
    pdf_parse_failed = "[PDF parsing failed]"
    placeholders = (unavailable, pdf_no_text, pdf_parse_failed)

    try:
        response = await self.http_client.get(url)
        response.raise_for_status()

        content_type = (response.headers.get("content-type") or "").lower()
        url_lower = url.lower()
        # Substring checks so URLs with query strings after the extension match.
        is_pdf = ".pdf" in url_lower or "application/pdf" in content_type
        is_image = (
            any(ext in url_lower for ext in (".png", ".jpg", ".jpeg", ".tif", ".tiff"))
            or content_type.startswith("image/")
        )

        content = unavailable
        ocr_used = False
        ocr_pages = 0

        if is_pdf and PdfReader is not None:
            try:
                reader = PdfReader(io.BytesIO(response.content))
                pages = [page.extract_text() or "" for page in reader.pages[:30]]
                extracted = "\n".join(pages).strip()
                content = extracted if extracted else pdf_no_text
            except Exception as parse_error:
                logger.debug(f"PDF parse failed for {url}: {parse_error}")
                content = pdf_parse_failed

        # OCR fallback for scanned/image-based PDFs. Checking every placeholder
        # (not only the two parser outcomes) means OCR is also attempted when
        # the PDF text parser is not installed at all.
        if is_pdf and content in placeholders:
            ocr_text, ocr_pages = self._ocr_pdf_bytes(response.content)
            if ocr_text:
                content = ocr_text
                ocr_used = True

        # OCR for image documents.
        if is_image and content == unavailable:
            image_text = self._ocr_image_bytes(response.content)
            if image_text:
                content = image_text
                ocr_used = True
                ocr_pages = 1

        # The ID is a stable fingerprint of URL + municipality, not a security hash.
        doc_id = hashlib.md5(f"{url}{municipality}".encode()).hexdigest()

        document = MeetingDocument(
            document_id=doc_id,
            source_url=url,
            municipality=municipality,
            state=state,
            meeting_date=datetime.utcnow(),
            meeting_type="Unknown",
            title=title,
            content=content,
            metadata={
                "platform": "document",
                "file_size": len(response.content),
                "content_type": response.headers.get("content-type"),
                "is_pdf": is_pdf,
                "is_image": is_image,
                "ocr_used": ocr_used,
                "ocr_pages": ocr_pages,
                "text_extracted": content not in placeholders
            }
        )

        return document

    except Exception as e:
        # A missing document is routine during crawling; keep it out of the error log.
        if isinstance(e, httpx.HTTPStatusError) and e.response.status_code == 404:
            logger.debug(f"Document not found (404): {url}")
        else:
            logger.error(f"Error downloading document {url}: {e}")
        return None
|
| 2058 |
+
|
| 2059 |
+
def _ocr_pdf_bytes(self, pdf_bytes: bytes) -> tuple[str, int]:
    """
    Run OCR over the pages of a PDF held in memory.

    At most ``self.ocr_max_pages`` pages are rendered at 200 dpi and passed to
    Tesseract. Pages that fail individually are skipped.

    Args:
        pdf_bytes: Raw PDF file contents

    Returns:
        Tuple of (recognized text joined by blank lines, number of pages that
        produced text); ``("", 0)`` when OCR is unavailable or found nothing
    """
    nothing = ("", 0)

    if pytesseract is None or pdfplumber is None:
        return nothing

    page_texts = []
    try:
        with pdfplumber.open(io.BytesIO(pdf_bytes)) as document:
            for pdf_page in document.pages[:self.ocr_max_pages]:
                try:
                    rendered = pdf_page.to_image(resolution=200).original
                    recognized = pytesseract.image_to_string(rendered).strip()
                except TesseractNotFoundError:
                    # Without the binary no page can succeed; warn once and stop.
                    if not self._ocr_missing_tesseract_warned:
                        logger.warning("Tesseract binary not found. Install 'tesseract-ocr' to enable OCR.")
                        self._ocr_missing_tesseract_warned = True
                    return nothing
                except Exception as page_err:
                    logger.debug(f"OCR page failed: {page_err}")
                    continue

                if recognized:
                    page_texts.append(recognized)
    except Exception as err:
        logger.debug(f"OCR PDF fallback failed: {err}")
        return nothing

    if page_texts:
        return "\n\n".join(page_texts), len(page_texts)
    return nothing
|
| 2087 |
+
|
| 2088 |
+
def _ocr_image_bytes(self, image_bytes: bytes) -> str:
    """
    Run OCR over a single image held in memory.

    Args:
        image_bytes: Raw image file contents

    Returns:
        Recognized text with surrounding whitespace removed, or an empty
        string when OCR is unavailable or fails
    """
    if Image is None or pytesseract is None:
        return ""

    try:
        picture = Image.open(io.BytesIO(image_bytes))
        recognized = pytesseract.image_to_string(picture)
        return recognized.strip()
    except TesseractNotFoundError:
        # Warn only once per scraper instance about the missing binary.
        already_warned = self._ocr_missing_tesseract_warned
        if not already_warned:
            logger.warning("Tesseract binary not found. Install 'tesseract-ocr' to enable OCR.")
            self._ocr_missing_tesseract_warned = True
    except Exception as err:
        logger.debug(f"Image OCR failed: {err}")

    return ""
|
| 2104 |
+
|
| 2105 |
+
async def _scrape_pdf_document(
    self,
    url: str,
    municipality: str,
    state: str,
    title: str
) -> Optional[Dict[str, Any]]:
    """
    Backward-compatible wrapper for existing call sites.

    Forwards all arguments unchanged to ``_scrape_document`` and returns
    its result.

    Args:
        url: Document URL
        municipality: Municipality name
        state: State code
        title: Document title

    Returns:
        Whatever ``_scrape_document`` returns (a document, or None on failure)
    """
    return await self._scrape_document(url, municipality, state, title)
|
agents/scraper_undetected.py
ADDED
|
@@ -0,0 +1,261 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Alternative eBoard scraper using undetected-chromedriver
|
| 3 |
+
This bypasses Incapsula without manual cookies
|
| 4 |
+
"""
|
| 5 |
+
import asyncio
|
| 6 |
+
import re
|
| 7 |
+
from typing import Dict, Any, List
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from bs4 import BeautifulSoup
|
| 10 |
+
from urllib.parse import urljoin
|
| 11 |
+
from datetime import datetime
|
| 12 |
+
import hashlib
|
| 13 |
+
|
| 14 |
+
from loguru import logger
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class UndetectedEboardScraper:
    """
    Scrape eBoard using undetected-chromedriver to bypass Incapsula.

    This library patches Selenium ChromeDriver to avoid detection by:
    - Removing Selenium markers from navigator.webdriver
    - Randomizing browser fingerprints
    - Using real Chrome instead of ChromeDriver
    """

    async def scrape_eboard(
        self,
        url: str,
        municipality: str,
        state: str,
        school_id: str = None
    ) -> List[Dict[str, Any]]:
        """
        Scrape eBoard platform without manual cookies.

        The meeting listing is loaded in Chrome, meeting and PDF links are
        collected, and up to 50 of them are processed. PDF links are recorded
        directly; meeting pages are opened and their PDF links recorded.
        Document contents are not downloaded here, so ``content`` is empty.

        Args:
            url: eBoard URL
            municipality: School district name
            state: State code
            school_id: Optional school ID (extracted from URL if not provided)

        Returns:
            List of meeting documents; empty if dependencies are missing, the
            school ID cannot be determined, the page stays blocked, or an
            unexpected error occurs
        """
        try:
            import undetected_chromedriver as uc
            from selenium.webdriver.common.by import By
            from selenium.webdriver.support.ui import WebDriverWait
            from selenium.webdriver.support import expected_conditions as EC
            import random
        except ImportError:
            logger.error("Missing undetected-chromedriver. Install: pip install undetected-chromedriver")
            return []

        # Extract school ID
        if not school_id:
            match = re.search(r'[?&]s=(\d+)', url, re.IGNORECASE)
            school_id = match.group(1) if match else None

        if not school_id:
            logger.error(f"Could not extract school ID from URL: {url}")
            return []

        base_url = "https://simbli.eboardsolutions.com"
        meetings_url = f"{base_url}/SB_Meetings/SB_MeetingListing.aspx?S={school_id}"

        logger.info("Using undetected-chromedriver to bypass Incapsula")
        logger.info(f"Target: {meetings_url}")

        documents = []
        driver = None

        try:
            # Create undetected Chrome instance.
            # Headless mode is left off because it may still be detected.
            options = uc.ChromeOptions()
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-dev-shm-usage')
            options.add_argument('--disable-blink-features=AutomationControlled')

            # Create driver with version management
            driver = uc.Chrome(options=options, version_main=None)
            logger.info("Chrome launched with anti-detection patches")

            # Navigate to meetings page
            driver.get(meetings_url)
            logger.info(f"Loaded page: {driver.title[:100]}")

            # Wait for the Incapsula challenge to complete (usually 3-5 seconds).
            # asyncio.sleep is used so the pause does not block the event loop.
            wait_time = random.uniform(5.0, 8.0)
            logger.info(f"Waiting {wait_time:.1f}s for Incapsula challenge...")
            await asyncio.sleep(wait_time)

            # A short page that mentions Incapsula is the challenge page itself.
            page_source = driver.page_source
            if 'Incapsula' in page_source and len(page_source) < 10000:
                logger.error("Still blocked by Incapsula")
                logger.warning("Try running with headless=False or use Option 2 (Residential Proxies)")
                return []

            logger.success(f"✓ Bypassed Incapsula! Page size: {len(page_source)} bytes")

            meeting_links = self._collect_links(
                BeautifulSoup(page_source, 'html.parser'), base_url
            )
            logger.info(f"Found {len(meeting_links)} meeting/document links")

            # If no links were found, give JavaScript-rendered content time to appear.
            if not meeting_links:
                logger.warning("No links found in HTML, checking for JavaScript-rendered content...")
                try:
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.TAG_NAME, "a"))
                    )
                    await asyncio.sleep(3)  # Additional wait for JS

                    # Re-parse with the same link rules as the first pass.
                    meeting_links = self._collect_links(
                        BeautifulSoup(driver.page_source, 'html.parser'), base_url
                    )
                    logger.info(f"After JS wait: Found {len(meeting_links)} links")
                except Exception as e:
                    logger.warning(f"JS content wait failed: {e}")

            # Process meeting links (limit to prevent overwhelming)
            selected_links = meeting_links[:50]
            for idx, meeting_info in enumerate(selected_links):
                if idx > 0 and idx % 10 == 0:
                    logger.info(f"Progress: {idx}/{len(selected_links)}")

                # Human-like delay
                await asyncio.sleep(random.uniform(2.0, 5.0))

                try:
                    meeting_url = meeting_info['url']
                    meeting_title = meeting_info['text']

                    if meeting_info['type'] == 'pdf':
                        logger.debug(f" Downloading PDF: {meeting_title[:50]}")
                        # TODO: Implement PDF download. For now, just record the URL.
                        documents.append(self._build_document(
                            meeting_url, meeting_title, municipality, state, school_id
                        ))
                        continue

                    # Navigate to meeting detail page
                    logger.debug(f" Loading meeting: {meeting_title[:50]}")
                    driver.get(meeting_url)
                    await asyncio.sleep(random.uniform(2.0, 4.0))

                    meeting_soup = BeautifulSoup(driver.page_source, 'html.parser')

                    # Extract PDFs from meeting page
                    for link in meeting_soup.find_all('a', href=True):
                        href = link.get('href', '')
                        if not href.lower().endswith('.pdf'):
                            continue

                        doc_url = urljoin(base_url, href)
                        doc_title = link.get_text().strip()
                        documents.append(self._build_document(
                            doc_url,
                            doc_title or meeting_title,
                            municipality,
                            state,
                            school_id,
                            meeting_page=meeting_url
                        ))
                        logger.success(f" ✓ Found: {doc_title[:50]}")

                except Exception as e:
                    # One bad link must not abort the whole run.
                    logger.error(f"Error processing {meeting_info.get('text', 'unknown')}: {e}")
                    continue

            logger.success(f"Scraping complete: {len(documents)} documents")
            return documents

        except Exception as e:
            logger.error(f"Error in undetected scraper: {e}")
            import traceback
            logger.error(traceback.format_exc())
            return []

        finally:
            # Always close Chrome, including on errors and early returns, so
            # browser processes are not leaked.
            if driver is not None:
                try:
                    driver.quit()
                except Exception as quit_error:
                    logger.debug(f"Failed to close Chrome cleanly: {quit_error}")

    @staticmethod
    def _collect_links(soup, base_url: str) -> List[Dict[str, str]]:
        """Return meeting and PDF links found in a parsed page as url/text/type dicts."""
        links = []
        for anchor in soup.find_all('a', href=True):
            href = anchor.get('href', '')

            if 'MID=' in href.upper() or 'meetingdetail' in href.lower():
                link_type = 'meeting'
            elif href.lower().endswith('.pdf'):
                link_type = 'pdf'
            else:
                continue

            links.append({
                'url': urljoin(base_url, href),
                'text': anchor.get_text().strip(),
                'type': link_type
            })
        return links

    @staticmethod
    def _build_document(
        doc_url: str,
        title: str,
        municipality: str,
        state: str,
        school_id: str,
        meeting_page: str = None
    ) -> Dict[str, Any]:
        """Build the document record for one PDF link; content is left empty."""
        metadata = {'platform': 'eboard'}
        if meeting_page is not None:
            metadata['meeting_page'] = meeting_page
        metadata['school_id'] = school_id
        metadata['scraped_with'] = 'undetected_chromedriver'

        return {
            'document_id': hashlib.md5(f"{doc_url}{municipality}".encode()).hexdigest(),
            'source_url': doc_url,
            'municipality': municipality,
            'state': state,
            'meeting_date': datetime.now(),
            'meeting_type': 'Board Meeting',
            'title': title,
            'content': '',  # Would need PDF extraction
            'metadata': metadata
        }
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
# Example usage
|
| 250 |
+
async def main():
    """Run the scraper once against a sample district and print the document count."""
    sample_target = {
        "url": "http://simbli.eboardsolutions.com/index.aspx?s=2088",
        "municipality": "Tuscaloosa City Schools",
        "state": "AL",
    }
    docs = await UndetectedEboardScraper().scrape_eboard(**sample_target)
    print(f"Scraped {len(docs)} documents")


if __name__ == "__main__":
    asyncio.run(main())
|
agents/sentiment.py
ADDED
|
@@ -0,0 +1,381 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Sentiment Analyzer Agent for determining policy stance and debate intensity.
|
| 3 |
+
"""
|
| 4 |
+
import asyncio
|
| 5 |
+
from typing import List, Dict, Any, Optional
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
from loguru import logger
|
| 8 |
+
|
| 9 |
+
from agents.base import BaseAgent, AgentRole, AgentMessage, MessageType, AgentStatus
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class PolicyStance:
    """
    String constants naming the possible policy stances.

    The values are plain strings, so they can be compared with and stored
    as ordinary text.
    """

    # Support side
    STRONGLY_SUPPORTIVE = "strongly_supportive"
    SUPPORTIVE = "supportive"

    # No clear lean either way
    NEUTRAL = "neutral"

    # Opposition side
    OPPOSED = "opposed"
    STRONGLY_OPPOSED = "strongly_opposed"

    # Active debate rather than a settled position
    DEBATED = "debated"
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class DebateIntensity:
    """
    String constants naming debate intensity levels, from weakest to strongest.
    """

    NONE = "none"          # Passing mention
    LOW = "low"            # Brief discussion
    MODERATE = "moderate"  # Extended discussion
    HIGH = "high"          # Heated debate with multiple viewpoints
    CRITICAL = "critical"  # Vote imminent or major decision pending
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class SentimentAnalyzerAgent(BaseAgent):
|
| 32 |
+
"""
|
| 33 |
+
Agent responsible for analyzing sentiment and policy stance.
|
| 34 |
+
|
| 35 |
+
Determines:
|
| 36 |
+
- Overall stance toward oral health policies
|
| 37 |
+
- Intensity of debate/discussion
|
| 38 |
+
- Key arguments for and against
|
| 39 |
+
- Likelihood of policy action
|
| 40 |
+
- Advocacy opportunities
|
| 41 |
+
"""
|
| 42 |
+
|
| 43 |
+
def __init__(self, agent_id: str = "sentiment-001"):
    """
    Initialize the sentiment analyzer agent.

    Args:
        agent_id: Identifier passed to the base agent constructor
    """
    # Register with the base class under the sentiment analyzer role.
    super().__init__(agent_id, AgentRole.SENTIMENT_ANALYZER)
    # Populate the keyword lists that the scoring methods read.
    self._initialize_indicators()
|
| 47 |
+
|
| 48 |
+
def _initialize_indicators(self):
|
| 49 |
+
"""Initialize sentiment and debate indicators."""
|
| 50 |
+
self.supportive_indicators = [
|
| 51 |
+
"approve", "support", "favor", "endorse", "recommend",
|
| 52 |
+
"beneficial", "important", "necessary", "implement",
|
| 53 |
+
"move forward", "proceed with"
|
| 54 |
+
]
|
| 55 |
+
|
| 56 |
+
self.opposition_indicators = [
|
| 57 |
+
"oppose", "against", "reject", "deny", "concerns about",
|
| 58 |
+
"problems with", "issues with", "delay", "postpone",
|
| 59 |
+
"table the motion", "reconsider"
|
| 60 |
+
]
|
| 61 |
+
|
| 62 |
+
self.debate_indicators = [
|
| 63 |
+
"discussion", "debate", "motion", "vote", "amendment",
|
| 64 |
+
"public comment", "testimony", "hearing", "concerns",
|
| 65 |
+
"questions about", "divided"
|
| 66 |
+
]
|
| 67 |
+
|
| 68 |
+
self.urgency_indicators = [
|
| 69 |
+
"urgent", "immediate", "deadline", "vote", "decision",
|
| 70 |
+
"approval needed", "time-sensitive", "pressing",
|
| 71 |
+
"second reading", "final vote"
|
| 72 |
+
]
|
| 73 |
+
|
| 74 |
+
async def process(self, message: AgentMessage) -> List[AgentMessage]:
    """
    Analyze every document in the message and forward the results.

    Each document gains a ``sentiment_analysis`` entry (the input dicts are
    updated in place). The analyzed documents and the advocacy opportunities
    derived from them are sent to the advocacy writer. On any failure an
    error message is sent to the orchestrator instead.

    Args:
        message: Message containing classified documents

    Returns:
        Single-element list holding the message that was sent
    """
    self.update_status(AgentStatus.PROCESSING, "Analyzing policy sentiment and debate")

    try:
        analyzed_documents = []
        for document in message.payload.get("documents", []):
            document["sentiment_analysis"] = await self._analyze_document(document)
            analyzed_documents.append(document)

        # Identify advocacy opportunities
        opportunities = self._identify_advocacy_opportunities(analyzed_documents)

        # Hand the results to the advocacy writer agent
        outgoing_payload = {
            "workflow_id": message.payload.get("workflow_id"),
            "documents": analyzed_documents,
            "opportunities": opportunities,
            "count": len(analyzed_documents)
        }
        reply = await self.send_message(
            AgentRole.ADVOCACY_WRITER,
            MessageType.DATA,
            outgoing_payload
        )

        self.log_success()
        logger.info(
            f"Analyzed sentiment for {len(analyzed_documents)} documents, "
            f"found {len(opportunities)} advocacy opportunities"
        )
        return [reply]

    except Exception as e:
        failure_text = str(e)
        self.log_failure(failure_text)
        error_msg = await self.send_message(
            AgentRole.ORCHESTRATOR,
            MessageType.ERROR,
            {"error": failure_text, "agent": self.agent_id}
        )
        return [error_msg]
|
| 127 |
+
|
| 128 |
+
async def _analyze_document(
|
| 129 |
+
self,
|
| 130 |
+
doc: Dict[str, Any]
|
| 131 |
+
) -> Dict[str, Any]:
|
| 132 |
+
"""
|
| 133 |
+
Analyze sentiment and policy stance for a document.
|
| 134 |
+
|
| 135 |
+
Args:
|
| 136 |
+
doc: Document to analyze
|
| 137 |
+
|
| 138 |
+
Returns:
|
| 139 |
+
Sentiment analysis results
|
| 140 |
+
"""
|
| 141 |
+
text = self._get_analyzable_text(doc)
|
| 142 |
+
text_lower = text.lower()
|
| 143 |
+
|
| 144 |
+
# Count sentiment indicators
|
| 145 |
+
support_score = sum(
|
| 146 |
+
1 for indicator in self.supportive_indicators
|
| 147 |
+
if indicator in text_lower
|
| 148 |
+
)
|
| 149 |
+
|
| 150 |
+
opposition_score = sum(
|
| 151 |
+
1 for indicator in self.opposition_indicators
|
| 152 |
+
if indicator in text_lower
|
| 153 |
+
)
|
| 154 |
+
|
| 155 |
+
debate_score = sum(
|
| 156 |
+
1 for indicator in self.debate_indicators
|
| 157 |
+
if indicator in text_lower
|
| 158 |
+
)
|
| 159 |
+
|
| 160 |
+
urgency_score = sum(
|
| 161 |
+
1 for indicator in self.urgency_indicators
|
| 162 |
+
if indicator in text_lower
|
| 163 |
+
)
|
| 164 |
+
|
| 165 |
+
# Determine policy stance
|
| 166 |
+
stance = self._determine_stance(support_score, opposition_score, debate_score)
|
| 167 |
+
|
| 168 |
+
# Determine debate intensity
|
| 169 |
+
intensity = self._determine_intensity(debate_score, urgency_score, doc)
|
| 170 |
+
|
| 171 |
+
# Extract key arguments
|
| 172 |
+
arguments = self._extract_arguments(doc, text_lower)
|
| 173 |
+
|
| 174 |
+
# Calculate advocacy urgency
|
| 175 |
+
advocacy_urgency = self._calculate_advocacy_urgency(
|
| 176 |
+
stance, intensity, urgency_score
|
| 177 |
+
)
|
| 178 |
+
|
| 179 |
+
analysis = {
|
| 180 |
+
"stance": stance,
|
| 181 |
+
"debate_intensity": intensity,
|
| 182 |
+
"support_score": support_score,
|
| 183 |
+
"opposition_score": opposition_score,
|
| 184 |
+
"debate_score": debate_score,
|
| 185 |
+
"urgency_score": urgency_score,
|
| 186 |
+
"advocacy_urgency": advocacy_urgency,
|
| 187 |
+
"key_arguments": arguments,
|
| 188 |
+
"analyzed_at": datetime.utcnow().isoformat()
|
| 189 |
+
}
|
| 190 |
+
|
| 191 |
+
return analysis
|
| 192 |
+
|
| 193 |
+
def _get_analyzable_text(self, doc: Dict[str, Any]) -> str:
|
| 194 |
+
"""Extract text for sentiment analysis."""
|
| 195 |
+
parts = []
|
| 196 |
+
|
| 197 |
+
# Prioritize excerpts from classification
|
| 198 |
+
for excerpt in doc.get("classification", {}).get("relevant_excerpts", []):
|
| 199 |
+
parts.append(excerpt.get("text", ""))
|
| 200 |
+
|
| 201 |
+
# Add motions (highly relevant)
|
| 202 |
+
for motion in doc.get("motions", []):
|
| 203 |
+
parts.append(motion.get("text", ""))
|
| 204 |
+
|
| 205 |
+
# Add votes
|
| 206 |
+
for vote in doc.get("votes", []):
|
| 207 |
+
parts.append(vote.get("result", ""))
|
| 208 |
+
|
| 209 |
+
# Fallback to full text if needed
|
| 210 |
+
if not parts:
|
| 211 |
+
parts.append(doc.get("full_text", ""))
|
| 212 |
+
|
| 213 |
+
return " ".join(parts)
|
| 214 |
+
|
| 215 |
+
def _determine_stance(
    self,
    support_score: int,
    opposition_score: int,
    debate_score: int
) -> str:
    """
    Pick the policy stance from the three indicator scores.

    A debate score of at least 3 with support and opposition within one
    point of each other is reported as debated. Otherwise the larger side
    wins, and a winning score of 3 or more makes the stance "strongly".
    Equal scores are neutral.
    """
    margin = support_score - opposition_score

    if debate_score >= 3 and -1 <= margin <= 1:
        return PolicyStance.DEBATED

    if margin == 0:
        return PolicyStance.NEUTRAL

    if margin > 0:
        if support_score >= 3:
            return PolicyStance.STRONGLY_SUPPORTIVE
        return PolicyStance.SUPPORTIVE

    if opposition_score >= 3:
        return PolicyStance.STRONGLY_OPPOSED
    return PolicyStance.OPPOSED
|
| 237 |
+
|
| 238 |
+
def _determine_intensity(
    self,
    debate_score: int,
    urgency_score: int,
    doc: Dict[str, Any]
) -> str:
    """
    Pick the debate intensity from the scores and the recorded votes/motions.

    Critical needs an urgency score of 2 or more, or both a vote and a
    motion. High needs a debate score of 5 or more, or either a vote or a
    motion. Below that, the debate score alone decides (3+ moderate,
    1+ low, otherwise none).
    """
    vote_recorded = len(doc.get("votes", [])) > 0
    motion_recorded = len(doc.get("motions", [])) > 0

    if urgency_score >= 2 or (vote_recorded and motion_recorded):
        return DebateIntensity.CRITICAL

    if debate_score >= 5 or vote_recorded or motion_recorded:
        return DebateIntensity.HIGH

    # Remaining levels depend on the debate score only, highest floor first.
    for floor, level in ((3, DebateIntensity.MODERATE), (1, DebateIntensity.LOW)):
        if debate_score >= floor:
            return level

    return DebateIntensity.NONE
|
| 259 |
+
|
| 260 |
+
def _extract_arguments(
|
| 261 |
+
self,
|
| 262 |
+
doc: Dict[str, Any],
|
| 263 |
+
text_lower: str
|
| 264 |
+
) -> Dict[str, List[str]]:
|
| 265 |
+
"""Extract key arguments for and against."""
|
| 266 |
+
arguments = {
|
| 267 |
+
"supporting": [],
|
| 268 |
+
"opposing": []
|
| 269 |
+
}
|
| 270 |
+
|
| 271 |
+
# Extract from motions and discussion
|
| 272 |
+
for motion in doc.get("motions", []):
|
| 273 |
+
motion_text = motion.get("text", "").lower()
|
| 274 |
+
|
| 275 |
+
if any(ind in motion_text for ind in self.supportive_indicators):
|
| 276 |
+
arguments["supporting"].append(motion.get("text", ""))
|
| 277 |
+
elif any(ind in motion_text for ind in self.opposition_indicators):
|
| 278 |
+
arguments["opposing"].append(motion.get("text", ""))
|
| 279 |
+
|
| 280 |
+
return arguments
|
| 281 |
+
|
| 282 |
+
def _calculate_advocacy_urgency(
    self,
    stance: str,
    intensity: str,
    urgency_score: int
) -> str:
    """
    Calculate how urgently advocacy action is needed.

    The result depends on ``stance`` and ``intensity`` only;
    ``urgency_score`` is accepted but not consulted.

    Returns: "critical", "high", "medium", "low", or "none"
    """
    # Stances that signal the outcome is contested or unfavourable.
    contested = (
        PolicyStance.DEBATED,
        PolicyStance.OPPOSED,
        PolicyStance.STRONGLY_OPPOSED,
    )

    # Critical intensity: a contested stance is critical, otherwise high.
    if intensity == DebateIntensity.CRITICAL:
        return "critical" if stance in contested else "high"

    # High intensity: a contested stance is high, otherwise medium.
    if intensity == DebateIntensity.HIGH:
        return "high" if stance in contested else "medium"

    # Lower intensities map straight to an urgency level.
    for level, urgency in (
        (DebateIntensity.MODERATE, "medium"),
        (DebateIntensity.LOW, "low"),
    ):
        if intensity == level:
            return urgency

    return "none"
|
| 316 |
+
|
| 317 |
+
def _identify_advocacy_opportunities(
|
| 318 |
+
self,
|
| 319 |
+
documents: List[Dict[str, Any]]
|
| 320 |
+
) -> List[Dict[str, Any]]:
|
| 321 |
+
"""
|
| 322 |
+
Identify advocacy opportunities across all analyzed documents.
|
| 323 |
+
|
| 324 |
+
Args:
|
| 325 |
+
documents: All analyzed documents
|
| 326 |
+
|
| 327 |
+
Returns:
|
| 328 |
+
List of advocacy opportunities
|
| 329 |
+
"""
|
| 330 |
+
opportunities = []
|
| 331 |
+
|
| 332 |
+
for doc in documents:
|
| 333 |
+
sentiment = doc.get("sentiment_analysis", {})
|
| 334 |
+
urgency = sentiment.get("advocacy_urgency")
|
| 335 |
+
|
| 336 |
+
# Only flag high and critical urgency items
|
| 337 |
+
if urgency in ["critical", "high"]:
|
| 338 |
+
opportunity = {
|
| 339 |
+
"document_id": doc["document_id"],
|
| 340 |
+
"municipality": doc["municipality"],
|
| 341 |
+
"state": doc["state"],
|
| 342 |
+
"meeting_date": doc["meeting_date"],
|
| 343 |
+
"source_url": doc["source_url"],
|
| 344 |
+
"topic": doc["classification"]["primary_topic"],
|
| 345 |
+
"stance": sentiment["stance"],
|
| 346 |
+
"intensity": sentiment["debate_intensity"],
|
| 347 |
+
"urgency": urgency,
|
| 348 |
+
"key_excerpts": doc["classification"].get("relevant_excerpts", []),
|
| 349 |
+
"recommended_action": self._recommend_action(sentiment, doc)
|
| 350 |
+
}
|
| 351 |
+
opportunities.append(opportunity)
|
| 352 |
+
|
| 353 |
+
# Sort by urgency
|
| 354 |
+
urgency_order = {"critical": 0, "high": 1, "medium": 2, "low": 3}
|
| 355 |
+
opportunities.sort(key=lambda x: urgency_order.get(x["urgency"], 4))
|
| 356 |
+
|
| 357 |
+
return opportunities
|
| 358 |
+
|
| 359 |
+
def _recommend_action(
    self,
    sentiment: Dict[str, Any],
    doc: Dict[str, Any]
) -> str:
    """
    Pick an advocacy recommendation from the analyzed stance and intensity.

    Args:
        sentiment: Analysis dict; "stance" and "debate_intensity" are read.
        doc: The analyzed document (not consulted by the current rules).

    Returns:
        A recommendation sentence.
    """
    position = sentiment.get("stance")
    heat = sentiment.get("debate_intensity")

    is_opposed = position in [PolicyStance.OPPOSED, PolicyStance.STRONGLY_OPPOSED]
    is_debated = position == PolicyStance.DEBATED

    # Critical intensity gets the urgent wording for opposed/debated stances;
    # every other stance falls through to the general rules below.
    if heat == DebateIntensity.CRITICAL:
        if is_opposed:
            return "URGENT: Contact officials immediately. Vote imminent."
        if is_debated:
            return "URGENT: Provide supporting testimony. Decision pending."

    if is_debated:
        return "Engage with stakeholders. Provide educational materials."
    if is_opposed:
        return "Initiate dialogue with decision-makers. Address concerns."
    if position == PolicyStance.NEUTRAL:
        return "Introduce topic to agenda. Build awareness."

    return "Monitor situation. Prepare support materials."
|
api/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""API module for the Oral Health Policy Pulse system."""
|
| 2 |
+
from api.main import app
|
| 3 |
+
|
| 4 |
+
__all__ = ["app"]
|
api/app.py
ADDED
|
@@ -0,0 +1,711 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI application optimized for Databricks Apps deployment.
|
| 3 |
+
Serves React frontend and provides REST API for agent interactions.
|
| 4 |
+
"""
|
| 5 |
+
from typing import List, Dict, Any, Optional
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from fastapi import FastAPI, HTTPException, Query, BackgroundTasks
|
| 9 |
+
from fastapi.responses import HTMLResponse, JSONResponse, FileResponse
|
| 10 |
+
from fastapi.staticfiles import StaticFiles
|
| 11 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 12 |
+
from pydantic import BaseModel, Field
|
| 13 |
+
from loguru import logger
|
| 14 |
+
import os
|
| 15 |
+
|
| 16 |
+
from agents.orchestrator import OrchestratorAgent
|
| 17 |
+
from pipeline.delta_lake import DeltaLakePipeline
|
| 18 |
+
from config import settings
|
| 19 |
+
|
| 20 |
+
# Initialize FastAPI app
|
| 21 |
+
app = FastAPI(
    title="Open Navigator",
    description="AI-powered advocacy opportunity finder",
    version="2.0.0",
    docs_url="/api/docs",
    redoc_url="/api/redoc",
    openapi_url="/api/openapi.json"
)

# Add CORS middleware
# Allowed origins are read from the comma-separated CORS_ALLOW_ORIGINS
# environment variable; when it is unset or empty, any origin is allowed.
_cors_origins = [
    origin.strip()
    for origin in os.getenv("CORS_ALLOW_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    # A wildcard origin combined with credentials would let any website make
    # credentialed requests to this API, so credentials are only enabled once
    # an explicit origin list has been configured.
    allow_credentials="*" not in _cors_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize components
orchestrator = OrchestratorAgent()
pipeline = DeltaLakePipeline()
|
| 42 |
+
|
| 43 |
+
# Pydantic models
|
| 44 |
+
class WorkflowRequest(BaseModel):
    """Request body accepted by the workflow-start endpoint."""

    # One string-to-string mapping per target to analyze.
    targets: List[Dict[str, str]]
    # Optional topic list; omitted means None.
    topics: Optional[List[str]] = None
|
| 48 |
+
|
| 49 |
+
class OpportunityFilter(BaseModel):
    """Optional criteria for narrowing advocacy opportunities."""

    # Each filter is optional; None means "do not filter on this field".
    state: Optional[str] = None
    topic: Optional[str] = None
    urgency: Optional[str] = None
    # Confidence threshold, defaulting to 0.7.
    min_confidence: Optional[float] = 0.7
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
# API Routes
|
| 58 |
+
@app.get("/api/health")
async def health_check():
    """Report that the service is up, with the current UTC time (ISO 8601)."""
    checked_at = datetime.utcnow().isoformat()
    return {"status": "healthy", "timestamp": checked_at}
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
@app.get("/api/dashboard")
async def get_dashboard_stats():
    """
    Return whatever the pipeline reports as dashboard statistics.

    Raises:
        HTTPException: 500 with the error text if the pipeline call fails.
    """
    try:
        # Stats are served straight from the pipeline (Delta Lake backed).
        return await pipeline.get_dashboard_stats()
    except Exception as exc:
        logger.error(f"Dashboard stats error: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
@app.get("/api/opportunities")
async def get_opportunities(
    state: Optional[str] = Query(None),
    topic: Optional[str] = Query(None),
    urgency: Optional[str] = Query(None),
    limit: int = Query(100, le=1000)
):
    """
    List advocacy opportunities, optionally filtered.

    The filters and the limit are forwarded unchanged to the pipeline.

    Returns:
        Dict with the "opportunities" list and its "count".

    Raises:
        HTTPException: 500 with the error text if the query fails.
    """
    query_args = {
        "state": state,
        "topic": topic,
        "urgency": urgency,
        "limit": limit,
    }
    try:
        found = await pipeline.query_opportunities(**query_args)
        return {"opportunities": found, "count": len(found)}
    except Exception as exc:
        logger.error(f"Query opportunities error: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
@app.get("/api/documents")
async def get_documents(
    search: Optional[str] = Query(None),
    page: int = Query(1, ge=1),
    # ge=1: a zero limit would divide by zero when computing total_pages,
    # and a negative one would yield a nonsensical page count.
    limit: int = Query(20, ge=1, le=100)
):
    """
    Get analyzed documents with pagination.

    Args:
        search: Optional search text forwarded to the pipeline.
        page: 1-based page number.
        limit: Page size (1-100).

    Returns:
        Dict with the page of "documents", the echoed "page" and "limit",
        the "total" number of matching documents and "total_pages".

    Raises:
        HTTPException: 500 with the error text if either pipeline call fails.
    """
    try:
        offset = (page - 1) * limit
        documents = await pipeline.query_documents(
            search=search,
            limit=limit,
            offset=offset
        )
        total = await pipeline.count_documents(search=search)
        return {
            "documents": documents,
            "page": page,
            "limit": limit,
            "total": total,
            # Ceiling division: a partially filled last page still counts.
            "total_pages": (total + limit - 1) // limit
        }
    except Exception as e:
        logger.error(f"Query documents error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
@app.get("/api/search/")
async def search_all(
    q: str = Query(..., min_length=1, description="Search query"),
    types: Optional[str] = Query("contacts,organizations,causes", description="Comma-separated types to search"),
    state: Optional[str] = Query(None, description="Filter by state code (e.g. MA, AL)"),
    limit: int = Query(10, ge=1, le=100),
    page: int = Query(1, ge=1),
    ntee_code: Optional[str] = Query(None, description="NTEE category code")
):
    """
    Unified search endpoint returning results grouped by type.

    The search itself is still a placeholder: for each requested type among
    contacts, organizations and causes, one sample entry echoing the query is
    returned. The "meetings" and "jurisdictions" groups are always empty.

    Returns:
        Dict with the query, total result count, grouped results, pagination
        details and the filters that were applied.

    Raises:
        HTTPException: 500 with the error text on any failure.
    """
    try:
        offset = (page - 1) * limit
        stripped = (part.strip() for part in types.split(','))
        search_types = [name for name in stripped if name]

        # Sample entries fall back to "MA" when no state filter is given.
        sample_state = state or "MA"

        # Placeholder results - one mock hit per requested type.
        contact_hits = []
        if "contacts" in search_types:
            contact_hits = [{
                "type": "contact",
                "title": f"Sample Contact matching '{q}'",
                "subtitle": "Government Official",
                "description": "Contact information for local official",
                "url": "/contact/1",
                "score": 0.9,
                "metadata": {"state": sample_state},
            }]

        organization_hits = []
        if "organizations" in search_types:
            organization_hits = [{
                "type": "organization",
                "title": f"Sample Organization matching '{q}'",
                "subtitle": "Nonprofit Organization",
                "description": "Community health organization",
                "url": "/org/1",
                "score": 0.85,
                "metadata": {"state": sample_state, "ntee": ntee_code},
            }]

        cause_hits = []
        if "causes" in search_types:
            cause_hits = [{
                "type": "cause",
                "title": f"Sample Cause matching '{q}'",
                "subtitle": "Health & Wellness",
                "description": "Advocacy for community health",
                "url": "/cause/1",
                "score": 0.8,
                "metadata": {},
            }]

        results = {
            "contacts": contact_hits,
            "organizations": organization_hits,
            "causes": cause_hits,
            "meetings": [],
            "jurisdictions": [],
        }

        # Totals across every group; there is always at least one page.
        total_results = sum(len(group) for group in results.values())
        total_pages = max(1, (total_results + limit - 1) // limit)

        pagination = {
            "page": page,
            "limit": limit,
            "offset": offset,
            "total_pages": total_pages,
            "has_next": page < total_pages,
            "has_prev": page > 1,
        }
        applied_filters = {
            "state": state,
            "ntee_code": ntee_code,
            "types": search_types,
        }

        return {
            "query": q,
            "total_results": total_results,
            "results": results,
            "pagination": pagination,
            "filters": applied_filters,
        }
    except Exception as exc:
        logger.error(f"Search error: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
@app.post("/api/workflow/start")
async def start_workflow(request: WorkflowRequest, background_tasks: BackgroundTasks):
    """
    Queue an analysis workflow and return its identifier immediately.

    The orchestrator pipeline is registered as a background task, so the
    response is sent before the workflow itself runs.

    Raises:
        HTTPException: 500 with the error text if scheduling fails.
    """
    try:
        # The identifier is derived from the current timestamp.
        run_id = f"workflow_{datetime.utcnow().timestamp()}"

        pipeline_args = {
            "workflow_id": run_id,
            "targets": request.targets,
            "topics": request.topics,
        }
        background_tasks.add_task(orchestrator.execute_pipeline, **pipeline_args)

        return dict(
            workflow_id=run_id,
            status="started",
            message="Workflow started successfully",
        )
    except Exception as exc:
        logger.error(f"Workflow start error: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
@app.get("/api/workflow/{workflow_id}/status")
async def get_workflow_status(workflow_id: str):
    """
    Look up a workflow's status through the orchestrator.

    Raises:
        HTTPException: 404 whenever the orchestrator lookup raises, whatever
            the underlying error was (it is logged first).
    """
    try:
        return await orchestrator.get_workflow_status(workflow_id)
    except Exception as exc:
        logger.error(f"Workflow status error: {exc}")
        raise HTTPException(status_code=404, detail="Workflow not found")
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
@app.post("/api/advocacy/email/{opportunity_id}")
async def generate_advocacy_email(opportunity_id: str):
    """
    Generate advocacy email for an opportunity.

    Args:
        opportunity_id: Identifier passed to the pipeline lookup.

    Returns:
        Dict with the generated email under "content".

    Raises:
        HTTPException: 404 when the pipeline returns no opportunity for the
            id; 500 with the error text for any other failure.
    """
    try:
        opportunity = await pipeline.get_opportunity(opportunity_id)
        if not opportunity:
            raise HTTPException(status_code=404, detail="Opportunity not found")

        email_content = await orchestrator.generate_advocacy_email(opportunity)
        return {"content": email_content}
    except HTTPException:
        # Let deliberate HTTP errors (the 404 above) through unchanged; the
        # generic handler below would otherwise turn them into a 500.
        raise
    except Exception as e:
        logger.error(f"Generate email error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
@app.get("/api/settings")
async def get_settings():
    """
    Return the settings payload.

    Target states and policy topics are read from the ``settings`` object;
    the remaining fields are fixed values.
    """
    # A falsy target_states value is reported as an empty list.
    configured_states = settings.target_states or []
    configured_topics = settings.policy_topics

    return {
        "target_states": configured_states,
        "policy_topics": configured_topics,
        "min_confidence": 0.7,
        "email_notifications": False,
        "notification_email": "",
    }
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
@app.put("/api/settings")
async def update_settings(new_settings: Dict[str, Any]):
    """
    Acknowledge a settings update request.

    The submitted settings are only logged; nothing is persisted here.

    Raises:
        HTTPException: 500 with the error text if logging the request fails.
    """
    try:
        # In production, this would update configuration in Unity Catalog
        logger.info(f"Settings update requested: {new_settings}")
    except Exception as exc:
        logger.error(f"Settings update error: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
    else:
        return {"message": "Settings updated successfully"}
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
@app.get("/api/agents/status")
async def get_agents_status():
    """
    Return the status list for the four agents.

    The values are fixed: every agent is reported as "active" with a "24h"
    uptime.
    """
    try:
        agent_names = (
            "Scraper",
            "Classifier",
            "Sentiment Analyzer",
            "Advocacy Writer",
        )
        return {
            "agents": [
                {"name": agent, "status": "active", "uptime": "24h"}
                for agent in agent_names
            ]
        }
    except Exception as exc:
        logger.error(f"Agent status error: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
@app.get("/api/nonprofits")
async def search_nonprofits(
    location: str = Query("Tuscaloosa, AL", description="City, State format"),
    keyword: Optional[str] = Query(None, description="Service keyword (e.g., 'dental', 'health')"),
    state: Optional[str] = Query(None, description="2-letter state code (e.g., 'AL')"),
    ntee_code: Optional[str] = Query(None, description="NTEE code (e.g., 'E' for health)"),
    source: Optional[str] = Query(None, description="Data source: 'propublica', 'everyorg', 'all'")
):
    """
    Search for nonprofits through the NonprofitDiscovery helper.

    Queries ProPublica and/or Every.org depending on ``source`` (both when it
    is omitted or 'all'), then keeps only organizations whose name,
    description, mission or NTEE description contains ``keyword``. A failure
    of one source is logged as a warning and does not fail the request.

    Example: /api/nonprofits?location=Tuscaloosa,AL&keyword=dental&ntee_code=E

    Raises:
        HTTPException: 500 with the error text on any unexpected failure.
    """
    try:
        from discovery.nonprofit_discovery import NonprofitDiscovery

        finder = NonprofitDiscovery()
        matches = []

        # "City, ST" -> city plus an optional state taken from the location.
        pieces = [piece.strip() for piece in location.split(',')]
        city = pieces[0]
        location_state = pieces[1] if len(pieces) > 1 else None
        # Explicit state wins, then the location's state, then "AL".
        state_code = state or location_state or "AL"

        # No source (or 'all') means both providers.
        if not source or source == 'all':
            selected_sources = ['propublica', 'everyorg']
        else:
            selected_sources = [source]

        if 'propublica' in selected_sources:
            try:
                from_propublica = finder.search_propublica(
                    state=state_code,
                    city=city,
                    ntee_code=ntee_code,
                )
                matches.extend(from_propublica)
                logger.info(f"ProPublica: Found {len(from_propublica)} organizations")
            except Exception as exc:
                logger.warning(f"ProPublica search failed: {exc}")

        if 'everyorg' in selected_sources:
            try:
                # Translate the free-text keyword into Every.org cause tags.
                causes = []
                if keyword:
                    needle = keyword.lower()
                    if any(term in needle for term in ('health', 'dental', 'medical')):
                        causes.append('health')
                    if any(term in needle for term in ('education', 'school')):
                        causes.append('education')

                from_everyorg = finder.search_everyorg(
                    location=location,
                    causes=causes or None,
                )
                matches.extend(from_everyorg)
                logger.info(f"Every.org: Found {len(from_everyorg)} organizations")
            except Exception as exc:
                logger.warning(f"Every.org search failed: {exc}")

        # Case-insensitive keyword filter over the descriptive text fields.
        if keyword and matches:
            needle = keyword.lower()
            text_fields = ('name', 'description', 'mission', 'ntee_description')
            matches = [
                org for org in matches
                if needle in ' '.join(
                    str(org.get(field, '')) for field in text_fields
                ).lower()
            ]

        provider_links = {
            "propublica": "https://projects.propublica.org/nonprofits/api",
            "everyorg": "https://www.every.org/nonprofit-api",
            "irs_teos": "https://www.irs.gov/charities-non-profits/tax-exempt-organization-search-bulk-data-downloads",
        }

        return {
            "location": location,
            "keyword": keyword,
            "state": state_code,
            "ntee_code": ntee_code,
            "count": len(matches),
            "nonprofits": matches,
            "data_sources": provider_links,
        }

    except Exception as exc:
        logger.error(f"Nonprofit search error: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
|
| 411 |
+
|
| 412 |
+
|
| 413 |
+
@app.get("/api/data/status")
async def get_data_status():
    """
    Get status of all reference data ingestions.

    For the census, NCES and nonprofit directories, the directory tree is
    scanned and the entry reports how many files it holds and the
    modification time of the newest one. Directories themselves are not
    counted, so a tree that contains only empty folders is reported as
    "empty". A directory that does not exist keeps its default entry. The
    meeting dataset figures are fixed values.

    Returns:
        Dict keyed by data set with "path", "status", "count" and
        "last_updated" (ISO timestamp or None), plus "meeting_datasets".

    Raises:
        HTTPException: 500 with the error text if scanning fails.
    """
    try:
        status = {
            "census_jurisdictions": {
                "path": "data/bronze/census_jurisdictions",
                "status": "not_ingested",
                "count": 0,
                "last_updated": None
            },
            "nces_school_districts": {
                "path": "data/bronze/nces_school_districts",
                "status": "not_ingested",
                "count": 0,
                "last_updated": None
            },
            "nonprofits": {
                "path": "data/cache/nonprofits",
                "status": "cached",
                "count": 0,
                "last_updated": None
            },
            "meeting_datasets": {
                "meetingbank": {"status": "available", "count": 1366},
                "city_scrapers": {"status": "available", "count": "100-500"},
                "open_states": {"status": "available", "count": "50+"}
            }
        }

        # Check each data directory
        for key in ["census_jurisdictions", "nces_school_districts", "nonprofits"]:
            entry = status[key]
            data_path = Path(entry["path"])
            if not data_path.exists():
                continue

            # Only regular files count as ingested data; glob("**/*") also
            # yields the sub-directories, which must not inflate the count.
            files = [item for item in data_path.glob("**/*") if item.is_file()]
            entry["count"] = len(files)
            entry["status"] = "ingested" if files else "empty"
            if files:
                newest_mtime = max(item.stat().st_mtime for item in files)
                entry["last_updated"] = datetime.fromtimestamp(newest_mtime).isoformat()

        return status

    except Exception as e:
        logger.error(f"Data status error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 473 |
+
|
| 474 |
+
|
| 475 |
+
@app.post("/api/data/ingest/census")
async def ingest_census_data(background_tasks: BackgroundTasks):
    """
    Trigger Census Bureau jurisdiction data ingestion.

    Downloads and processes:
    - 3,144 counties
    - 19,500+ municipalities
    - 36,000+ townships
    - 13,000+ school districts

    This is a long-running operation that runs in the background; the
    response is returned as soon as the task has been scheduled.

    Raises:
        HTTPException: 500 with the error text if scheduling fails.
    """
    try:
        def run_census_ingestion():
            """Run the async census ingestion to completion on a private event loop."""
            from discovery.census_ingestion import CensusGovernmentIngestion
            import asyncio

            logger.info("Starting Census data ingestion...")
            ingestor = CensusGovernmentIngestion()

            # Run async ingestion on a dedicated loop.
            loop = asyncio.new_event_loop()
            try:
                asyncio.set_event_loop(loop)
                result = loop.run_until_complete(ingestor.ingest_all_jurisdictions())
            except Exception:
                # Record the failure with its traceback, then let it propagate.
                logger.exception("Census data ingestion failed")
                raise
            finally:
                # Always release the loop, even when ingestion raises.
                loop.close()

            logger.success(f"Census ingestion complete: {result}")

        background_tasks.add_task(run_census_ingestion)

        return {
            "message": "Census data ingestion started",
            "status": "processing",
            "check_status": "/api/data/status"
        }

    except Exception as e:
        logger.error(f"Census ingestion error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 515 |
+
|
| 516 |
+
|
| 517 |
+
@app.post("/api/data/ingest/nces")
async def ingest_nces_data(background_tasks: BackgroundTasks):
    """
    Trigger NCES school district data ingestion.

    Downloads and processes 13,000+ school districts with:
    - District names and addresses
    - Contact information
    - NCES IDs
    - Enrollment data

    The ingestion runs as a background task; the response is returned as
    soon as the task has been scheduled.

    Raises:
        HTTPException: 500 with the error text if scheduling fails.
    """
    try:
        def run_nces_ingestion():
            """Run the async NCES ingestion to completion on a private event loop."""
            from discovery.nces_ingestion import NCESSchoolDistrictIngestion
            import asyncio

            logger.info("Starting NCES data ingestion...")
            ingestor = NCESSchoolDistrictIngestion()

            # Run async ingestion on a dedicated loop.
            loop = asyncio.new_event_loop()
            try:
                asyncio.set_event_loop(loop)
                result = loop.run_until_complete(ingestor.download_and_process())
            except Exception:
                # Record the failure with its traceback, then let it propagate.
                logger.exception("NCES data ingestion failed")
                raise
            finally:
                # Always release the loop, even when ingestion raises.
                loop.close()

            logger.success(f"NCES ingestion complete: {result}")

        background_tasks.add_task(run_nces_ingestion)

        return {
            "message": "NCES data ingestion started",
            "status": "processing",
            "check_status": "/api/data/status"
        }

    except Exception as e:
        logger.error(f"NCES ingestion error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 555 |
+
|
| 556 |
+
|
| 557 |
+
@app.post("/api/data/ingest/nonprofits")
async def ingest_nonprofits(
    state: str = Query(..., description="2-letter state code"),
    ntee_codes: Optional[List[str]] = Query(None, description="NTEE codes to ingest"),
    background_tasks: BackgroundTasks = None
):
    """
    Ingest nonprofit data for one state.

    Runs one ProPublica search per NTEE code (defaulting to "E" when none
    are given) inside the request itself and reports how many organizations
    were returned in total. ``background_tasks`` is accepted but not used.

    Example: POST /api/data/ingest/nonprofits?state=AL&ntee_codes=E&ntee_codes=E20

    Raises:
        HTTPException: 500 with the error text if any search fails.
    """
    try:
        from discovery.nonprofit_discovery import NonprofitDiscovery

        finder = NonprofitDiscovery()
        # Default to health
        codes = ntee_codes or ["E"]

        cached_total = 0
        for code in codes:
            found = finder.search_propublica(state=state, ntee_code=code)
            found_count = len(found)
            cached_total += found_count
            logger.info(f"Cached {found_count} nonprofits for {state}/{code}")

        return dict(
            message=f"Nonprofit data ingestion complete for {state}",
            state=state,
            ntee_codes=codes,
            organizations_cached=cached_total,
            cache_location="data/cache/nonprofits",
        )

    except Exception as exc:
        logger.error(f"Nonprofit ingestion error: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
|
| 593 |
+
|
| 594 |
+
|
| 595 |
+
@app.get("/api/jurisdictions")
async def get_jurisdictions(
    state: Optional[str] = Query(None, description="2-letter state code"),
    type: Optional[str] = Query(None, description="Type: county, municipality, township"),
    limit: int = Query(100, le=1000)
):
    """
    Placeholder endpoint for Census jurisdiction queries.

    No table is queried yet: the response echoes the requested filters and
    limit and carries a single hard-coded example record.
    """
    try:
        # This would query the Delta Lake census tables
        # For now, return sample data
        sample_record = {
            "name": "Tuscaloosa County",
            "state": "AL",
            "type": "county",
            "fips": "01125",
            "population": "209355",
        }
        requested_filters = {"state": state, "type": type}

        return {
            "message": "Query census jurisdiction data from Delta Lake",
            "filters": requested_filters,
            "limit": limit,
            "note": "Requires Census data ingestion first (POST /api/data/ingest/census)",
            "example_data": [sample_record],
        }

    except Exception as exc:
        logger.error(f"Jurisdiction query error: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
|
| 628 |
+
|
| 629 |
+
|
| 630 |
+
@app.get("/api/school-districts")
async def get_school_districts(
    state: Optional[str] = Query(None, description="2-letter state code"),
    limit: int = Query(100, le=1000)
):
    """
    Placeholder endpoint for NCES school district queries.

    No table is queried yet: the response echoes the requested state filter
    and limit and carries a single hard-coded example record.
    """
    try:
        # This would query the Delta Lake NCES tables
        sample_record = {
            "name": "Tuscaloosa City Schools",
            "state": "AL",
            "nces_id": "0100123",
            "phone": "(205) 759-3500",
            "website": "https://www.tusc.k12.al.us/",
        }

        return {
            "message": "Query NCES school district data from Delta Lake",
            "filters": {"state": state},
            "limit": limit,
            "note": "Requires NCES data ingestion first (POST /api/data/ingest/nces)",
            "example_data": [sample_record],
        }

    except Exception as exc:
        logger.error(f"School district query error: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
|
| 661 |
+
|
| 662 |
+
|
| 663 |
+
# Serve React frontend
static_dir = Path(__file__).parent / "static"
if static_dir.exists():
    # Mount static files (JS, CSS, images). Only mount when the bundle
    # directory is really there, so a partially built frontend cannot make
    # the mount fail while the module is being imported.
    assets_dir = static_dir / "assets"
    if assets_dir.is_dir():
        app.mount("/assets", StaticFiles(directory=assets_dir), name="assets")

    # Serve index.html for all non-API routes (SPA routing)
    @app.get("/{full_path:path}")
    async def serve_react_app(full_path: str):
        """
        Serve React app for all non-API routes.

        Args:
            full_path: The request path captured by the catch-all route.

        Returns:
            A FileResponse for ``static/index.html``.

        Raises:
            HTTPException: 404 for paths under the API prefix, or when
                ``index.html`` does not exist.
        """
        # Unknown API paths must stay 404s rather than returning HTML;
        # bare "api" is treated the same as "api/...".
        if full_path == "api" or full_path.startswith("api/"):
            raise HTTPException(status_code=404, detail="API endpoint not found")

        index_file = static_dir / "index.html"
        if not index_file.exists():
            raise HTTPException(status_code=404, detail="Frontend not built")
        return FileResponse(index_file)
|
| 681 |
+
|
| 682 |
+
|
| 683 |
+
@app.on_event("startup")
async def startup_event():
    """Run startup tasks: announce the boot and try to initialize the pipeline tables."""
    logger.info("Starting Oral Health Policy Pulse application...")

    # Table initialization is best-effort: a failure is logged as a warning
    # and does not stop startup.
    try:
        await pipeline.initialize_tables()
    except Exception as init_error:
        logger.warning(f"Delta Lake initialization skipped: {init_error}")
    else:
        logger.info("Delta Lake tables initialized")

    logger.info("Application started successfully")
|
| 696 |
+
|
| 697 |
+
|
| 698 |
+
@app.on_event("shutdown")
async def shutdown_event():
    """Log that the application is shutting down; no other cleanup is performed here."""
    logger.info("Shutting down application...")
|
| 702 |
+
|
| 703 |
+
|
| 704 |
+
if __name__ == "__main__":
    # Direct-execution entry point: start uvicorn on all interfaces,
    # port 8000, with auto-reload enabled.
    import uvicorn

    uvicorn.run("api.app:app", host="0.0.0.0", port=8000, reload=True)
|
api/auth.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Authentication utilities - JWT tokens, password hashing, OAuth helpers
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
import secrets
|
| 6 |
+
from datetime import datetime, timedelta
|
| 7 |
+
from typing import Optional, Dict, Any
|
| 8 |
+
from dotenv import load_dotenv
|
| 9 |
+
|
| 10 |
+
from jose import JWTError, jwt
|
| 11 |
+
from passlib.context import CryptContext
|
| 12 |
+
from fastapi import HTTPException, status, Depends
|
| 13 |
+
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
|
| 14 |
+
from sqlalchemy.orm import Session
|
| 15 |
+
|
| 16 |
+
from api.database import get_db
|
| 17 |
+
from api.models import User
|
| 18 |
+
|
| 19 |
+
# Load environment variables
|
| 20 |
+
load_dotenv()
|
| 21 |
+
|
| 22 |
+
# Security configuration
# The signing key comes from JWT_SECRET_KEY. An unset OR empty value falls
# back to a random key generated for this process; tokens signed with that
# key stop validating after a restart and are not shared between worker
# processes, so the fallback is announced with a warning.
SECRET_KEY = os.getenv("JWT_SECRET_KEY")
if not SECRET_KEY:
    import warnings

    warnings.warn(
        "JWT_SECRET_KEY is not set; using a random per-process signing key. "
        "Issued tokens will become invalid when the process restarts.",
        RuntimeWarning,
    )
    SECRET_KEY = secrets.token_urlsafe(32)
ALGORITHM = "HS256"
ACCESS_TOKEN_EXPIRE_MINUTES = 60 * 24 * 7  # 7 days
|
| 26 |
+
|
| 27 |
+
# Password hashing context (bcrypt is the only configured scheme)
pwd_context = CryptContext(
    schemes=["bcrypt"],
    deprecated="auto",
)

# HTTP Bearer token scheme. auto_error is disabled; get_current_user handles
# the no-credentials case itself by returning None.
security = HTTPBearer(auto_error=False)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def hash_password(password: str) -> str:
    """Return the hash of ``password`` produced by the module's password context."""
    hashed = pwd_context.hash(password)
    return hashed
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def verify_password(plain_password: str, hashed_password: str) -> bool:
    """Check ``plain_password`` against ``hashed_password`` using the password context."""
    is_match = pwd_context.verify(plain_password, hashed_password)
    return is_match
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def create_access_token(data: Dict[str, Any], expires_delta: Optional[timedelta] = None) -> str:
    """
    Create a JWT access token

    Args:
        data: Payload to encode (usually {"sub": user_id}). It is copied,
            so the caller's dict is not modified.
        expires_delta: Optional custom lifetime. When None, the default of
            ACCESS_TOKEN_EXPIRE_MINUTES is used. An explicit zero or negative
            delta is honoured.

    Returns:
        JWT token string
    """
    # Local import: the module-level import only brings in datetime/timedelta.
    from datetime import timezone

    # Compare against None rather than truthiness: timedelta(0) is falsy and
    # would otherwise silently be replaced by the 7-day default.
    if expires_delta is None:
        expires_delta = timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)

    to_encode = data.copy()
    # Timezone-aware UTC "now" replaces the deprecated datetime.utcnow().
    to_encode["exp"] = datetime.now(timezone.utc) + expires_delta

    return jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def decode_access_token(token: str) -> Dict[str, Any]:
    """
    Decode and validate a JWT token

    Args:
        token: JWT token string

    Returns:
        Decoded payload

    Raises:
        HTTPException: 401 with a ``WWW-Authenticate: Bearer`` header if
            decoding raises JWTError.
    """
    try:
        return jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
    except JWTError as exc:
        # Chain the original error so the real cause stays visible in
        # tracebacks; the client-facing detail is unchanged.
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Could not validate credentials",
            headers={"WWW-Authenticate": "Bearer"},
        ) from exc
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def get_current_user(
    credentials: Optional[HTTPAuthorizationCredentials] = Depends(security),
    db: Session = Depends(get_db)
) -> Optional[User]:
    """
    Get current authenticated user from JWT token

    Usage:
        @app.get("/protected")
        def protected_route(user: User = Depends(get_current_user)):
            return {"message": f"Hello {user.email}"}

    Args:
        credentials: Bearer credentials from the request, or None when the
            request carries none.
        db: Database session.

    Returns:
        User object if authenticated, None if no credentials were supplied
        (optional auth).

    Raises:
        HTTPException: 401 if the token is invalid, has no "sub" claim, or
            refers to a user that does not exist.
    """
    if not credentials:
        return None

    payload = decode_access_token(credentials.credentials)

    user_id = payload.get("sub")
    if user_id is None:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid authentication credentials",
            # Same challenge header the other 401 responses in this module send.
            headers={"WWW-Authenticate": "Bearer"},
        )

    user = db.query(User).filter(User.id == user_id).first()
    if user is None:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="User not found",
            headers={"WWW-Authenticate": "Bearer"},
        )

    # Update last login.
    # NOTE(review): this writes and commits on every authenticated request,
    # not only at login - confirm that is intended.
    user.last_login = datetime.utcnow()
    db.commit()

    return user
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def require_auth(user: User = Depends(get_current_user)) -> User:
    """
    Dependency that enforces authentication.

    Passes through the user resolved by get_current_user and rejects the
    request with a 401 when that dependency returned None.

    Usage:
        @app.get("/protected")
        def protected_route(user: User = Depends(require_auth)):
            return {"message": f"Hello {user.email}"}
    """
    if user is not None:
        return user

    raise HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Authentication required",
        headers={"WWW-Authenticate": "Bearer"},
    )
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def generate_state_token() -> str:
    """Return a fresh URL-safe random token for use as the OAuth ``state`` (CSRF) value."""
    state_token = secrets.token_urlsafe(32)  # 32 random bytes, URL-safe text
    return state_token
|
api/database.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Database connection and session management
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
from sqlalchemy import create_engine
|
| 6 |
+
from sqlalchemy.orm import sessionmaker, Session
|
| 7 |
+
from sqlalchemy.pool import StaticPool
|
| 8 |
+
from typing import Generator
|
| 9 |
+
|
| 10 |
+
from api.models import Base
|
| 11 |
+
|
| 12 |
+
# Database URL from environment or default to SQLite for development.
# `or` (rather than a getenv default) also covers DATABASE_URL being set to
# an empty string, which would otherwise be handed to create_engine as-is.
DATABASE_URL = os.getenv("DATABASE_URL") or "sqlite:///./data/users.db"

# Handle PostgreSQL URL format for SQLAlchemy 2.0+
if DATABASE_URL.startswith("postgres://"):
    DATABASE_URL = DATABASE_URL.replace("postgres://", "postgresql://", 1)

# Create engine
# Decide on the URL scheme prefix, not a substring match: a non-SQLite URL
# may contain "sqlite" elsewhere (for example in a database name).
if DATABASE_URL.startswith("sqlite"):
    # SQLite needs special handling for concurrent access
    # NOTE(review): StaticPool keeps a single shared connection - confirm
    # this is intended for a file-based database.
    engine = create_engine(
        DATABASE_URL,
        connect_args={"check_same_thread": False},
        poolclass=StaticPool,
    )
else:
    # PostgreSQL configuration
    engine = create_engine(
        DATABASE_URL,
        pool_pre_ping=True,
        pool_size=10,
        max_overflow=20,
    )

# Create session factory
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def init_db():
    """Create all tables declared on ``Base`` and print where the database lives."""
    Base.metadata.create_all(bind=engine)
    # Print the URL with the password masked: a PostgreSQL DATABASE_URL
    # normally embeds credentials, which must not reach stdout or logs.
    safe_url = engine.url.render_as_string(hide_password=True)
    print(f"✅ Database initialized at: {safe_url}")
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def get_db() -> Generator[Session, None, None]:
    """
    FastAPI dependency that yields a database session.

    A new session is created from SessionLocal for each use and is closed
    once the consumer is finished, whether or not an error occurred.

    Usage:
        @app.get("/users")
        def get_users(db: Session = Depends(get_db)):
            return db.query(User).all()
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
|
api/errors.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Structured error models for API responses.
|
| 3 |
+
|
| 4 |
+
Provides user-friendly error messages with expandable technical details.
|
| 5 |
+
"""
|
| 6 |
+
from pydantic import BaseModel, Field
|
| 7 |
+
from typing import Optional, Dict, Any
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class ErrorDetail(BaseModel):
    """API error payload pairing a user-facing message with optional technical context."""

    # Required: text that can be shown directly to the end user.
    message: str = Field(..., description="User-friendly error message")

    # Required: category string for the error.
    error_type: str = Field(
        ...,
        description="Type of error (e.g., 'data_not_found', 'network_error', 'validation_error')",
    )

    # Optional: raw error text for debugging.
    technical_details: Optional[str] = Field(
        None,
        description="Technical error details for debugging (expandable in UI)",
    )

    # Optional: hints for the user.
    suggestions: Optional[list[str]] = Field(None, description="Helpful suggestions for the user")

    # Optional: free-form context about the failing request.
    metadata: Optional[Dict[str, Any]] = Field(
        None,
        description="Additional context (state, dataset name, etc.)",
    )
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def parse_error(exception: Exception, context: Optional[Dict[str, Any]] = None) -> ErrorDetail:
    """
    Parse an exception into a structured ErrorDetail with user-friendly message.

    Classification is based on the exception text and, for file and network
    failures, also on the exception type. Checks run in a fixed order and the
    first match wins; anything unmatched becomes a generic ``server_error``.

    Args:
        exception: The exception to parse
        context: Additional context (state, dataset, etc.). May be None.
            A missing, None or empty ``state`` is reported as "Unknown".

    Returns:
        ErrorDetail with user-friendly message and technical details
    """
    import re

    error_str = str(exception)
    error_lower = error_str.lower()
    context = context or {}

    # `state` can be absent, None or a non-string value; normalise it once so
    # building the message can never raise while an error is being reported.
    state = context.get('state') or 'Unknown'
    state_label = str(state).upper()

    # Parse HuggingFace dataset not found errors
    if "HTTP 404 Not Found" in error_str and "huggingface.co/datasets" in error_str:
        # Extract dataset name from URL
        match = re.search(r'datasets/([^/]+/[^/]+)/', error_str)
        dataset_name = match.group(1) if match else "unknown"
        data_type = 'bills' if 'bills' in dataset_name else 'data'

        return ErrorDetail(
            message=f"No {data_type} data available for {state_label}",
            error_type="data_not_found",
            technical_details=f"Dataset '{dataset_name}' not found on HuggingFace.\n\nFull error: {error_str}",
            suggestions=[
                "Try a different state - we have data for 50+ states",
                f"Check /api/bills/map to see which states have {data_type} data",
                "Contact support if you believe this data should be available"
            ],
            metadata={
                "dataset": dataset_name,
                "state": state,
                "data_type": data_type
            }
        )

    # Parse file not found errors (local environment).
    # The type check catches FileNotFoundError instances whose message does
    # not contain either of the phrases searched for below.
    if (
        isinstance(exception, FileNotFoundError)
        or "No such file or directory" in error_str
        or "FileNotFoundError" in error_str
    ):
        data_type = context.get('data_type', 'data')

        return ErrorDetail(
            message=f"No {data_type} available for {state_label}",
            error_type="data_not_found",
            technical_details=error_str,
            suggestions=[
                f"This state may not have {data_type} in our database yet",
                "Try a different state or check which states have data",
                "Data is being continuously added - check back later"
            ],
            metadata={
                "state": state,
                "data_type": data_type
            }
        )

    # Parse DuckDB/SQL errors.
    # Match a "LINE <n>:" marker rather than the bare word, so unrelated
    # text such as "PIPELINE" or "OFFLINE" is not classified as a SQL error.
    if (
        "DuckDB" in error_str
        or "SYNTAX ERROR" in error_str
        or re.search(r"\bLINE \d+:", error_str)
    ):
        return ErrorDetail(
            message="Database query error - please check your search parameters",
            error_type="query_error",
            technical_details=error_str,
            suggestions=[
                "Try simplifying your search query",
                "Check that all parameters are valid",
                "Contact support if the issue persists"
            ],
            metadata=context
        )

    # Parse network/timeout errors (by type as well as by message, since
    # these exceptions can carry an empty message)
    if (
        isinstance(exception, (TimeoutError, ConnectionError))
        or "timeout" in error_lower
        or "connection" in error_lower
    ):
        return ErrorDetail(
            message="Network request timed out - please try again",
            error_type="network_error",
            technical_details=error_str,
            suggestions=[
                "Try again in a few seconds",
                "Check your internet connection",
                "The server may be temporarily busy"
            ],
            metadata=context
        )

    # Parse validation errors
    if "validation" in error_lower or "invalid" in error_lower:
        return ErrorDetail(
            message="Invalid request parameters",
            error_type="validation_error",
            technical_details=error_str,
            suggestions=[
                "Check that all required parameters are provided",
                "Verify parameter formats (e.g., state codes should be 2 letters)",
                "See API documentation for valid parameter values"
            ],
            metadata=context
        )

    # Generic error fallback
    return ErrorDetail(
        message="An unexpected error occurred",
        error_type="server_error",
        technical_details=error_str,
        suggestions=[
            "Try again in a few moments",
            "Contact support if the issue persists",
            "Check the technical details for more information"
        ],
        metadata=context
    )
|
api/main.py
ADDED
|
@@ -0,0 +1,1288 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI application for the Oral Health Policy Pulse system.
|
| 3 |
+
|
| 4 |
+
Provides REST API endpoints for:
|
| 5 |
+
- Initiating policy analysis workflows
|
| 6 |
+
- Querying advocacy opportunities
|
| 7 |
+
- Retrieving generated materials
|
| 8 |
+
- Accessing visualizations
|
| 9 |
+
- System status and monitoring
|
| 10 |
+
"""
|
| 11 |
+
from typing import List, Dict, Any, Optional
|
| 12 |
+
from datetime import datetime, date
|
| 13 |
+
import sys
|
| 14 |
+
import time
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
from fastapi import FastAPI, HTTPException, Query, BackgroundTasks, Request
|
| 17 |
+
from fastapi.responses import HTMLResponse, JSONResponse
|
| 18 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 19 |
+
from fastapi.staticfiles import StaticFiles
|
| 20 |
+
from fastapi.openapi.docs import get_swagger_ui_html
|
| 21 |
+
from pydantic import BaseModel, Field
|
| 22 |
+
from loguru import logger
|
| 23 |
+
import os
|
| 24 |
+
|
| 25 |
+
from agents.orchestrator import OrchestratorAgent
|
| 26 |
+
from agents.scraper import ScraperAgent
|
| 27 |
+
from agents.parser import ParserAgent
|
| 28 |
+
from agents.classifier import ClassifierAgent
|
| 29 |
+
from agents.sentiment import SentimentAnalyzerAgent
|
| 30 |
+
from agents.advocacy import AdvocacyWriterAgent
|
| 31 |
+
from pipeline.delta_lake import DeltaLakePipeline
|
| 32 |
+
from visualization.heatmap import AdvocacyHeatmap
|
| 33 |
+
from config import settings
|
| 34 |
+
|
| 35 |
+
# Logging setup: one colourised stderr sink (visible in HuggingFace container
# logs) and one rotating file sink, both at the configured level.
_CONSOLE_LOG_FORMAT = (
    "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
    "<level>{level: <8}</level> | "
    "<cyan>{name}</cyan>:<cyan>{function}</cyan> - "
    "<level>{message}</level>"
)
_FILE_LOG_FORMAT = "{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function} - {message}"

# Drop the default handler so records are not emitted twice.
logger.remove()

# Console sink
logger.add(sys.stderr, format=_CONSOLE_LOG_FORMAT, level=settings.log_level)

# File sink: start a new file past 500 MB, delete files older than 10 days
logger.add(
    settings.log_file,
    rotation="500 MB",
    retention="10 days",
    level=settings.log_level,
    format=_FILE_LOG_FORMAT,
)
|
| 54 |
+
|
| 55 |
+
# OpenAPI tag groups as (name, description) pairs, in display order
_OPENAPI_TAGS = [
    ("auth", "Authentication and user management"),
    ("social", "Social features - follow users, leaders, organizations, and causes"),
    ("workflows", "Policy analysis workflows"),
    ("opportunities", "Advocacy opportunities"),
]

# Build the FastAPI application. The default docs page is disabled
# (docs_url=None) so a custom one can be used; ReDoc stays at /redoc.
app = FastAPI(
    title="Open Navigator API",
    description="Multi-agent system for analyzing local government oral health policy discussions",
    version="1.0.0",
    docs_url=None,
    redoc_url="/redoc",
    openapi_tags=[
        {"name": tag_name, "description": tag_description}
        for tag_name, tag_description in _OPENAPI_TAGS
    ],
)
|
| 81 |
+
|
| 82 |
+
# OpenAPI schema generator that adds the project logo
def custom_openapi():
    """Return the app's OpenAPI schema, building and caching it on first use.

    The generated schema is extended with an ``x-logo`` entry under ``info``
    and stored on ``app.openapi_schema``; later calls return the stored copy.
    """
    if not app.openapi_schema:
        from fastapi.openapi.utils import get_openapi

        schema = get_openapi(
            title=app.title,
            version=app.version,
            description=app.description,
            routes=app.routes,
        )
        # Attach the logo metadata
        schema["info"]["x-logo"] = {
            "url": "/static/communityone_logo.svg",
            "altText": "CommunityOne Logo"
        }
        app.openapi_schema = schema

    return app.openapi_schema


# Install the custom generator in place of the default one
app.openapi = custom_openapi
|
| 106 |
+
|
| 107 |
+
# Add CORS middleware
# Allowed origins can be listed, comma-separated, in CORS_ALLOW_ORIGINS.
# With no list configured every origin is allowed, but then credentialed
# requests are NOT enabled: a wildcard origin together with credentials lets
# any website make authenticated cross-origin calls on a user's behalf.
_cors_origins = [
    origin.strip()
    for origin in os.getenv("CORS_ALLOW_ORIGINS", "").split(",")
    if origin.strip()
]
app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins or ["*"],
    allow_credentials=bool(_cors_origins),
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 115 |
+
|
| 116 |
+
# Request logging middleware
@app.middleware("http")
async def log_requests(request: Request, call_next):
    """
    Log all API requests with timing and response info

    One line is logged when the request arrives and one when it completes.
    The completion line's level depends on the status code: info below 400,
    warning for 4xx, error for 5xx. An exception raised further down the
    stack is logged and re-raised unchanged.

    Args:
        request: The incoming request.
        call_next: Callable that passes the request on and returns the response.

    Returns:
        The response produced by ``call_next``, unmodified.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    started = time.perf_counter()

    # Get client info
    client_host = request.client.host if request.client else "unknown"
    route = f"{request.method} {request.url.path}"

    # Log incoming request
    logger.info(f"➡️ {route} - Client: {client_host}")

    # Process request
    try:
        response = await call_next(request)
    except Exception as e:
        duration_ms = (time.perf_counter() - started) * 1000
        logger.error(
            f"💥 {route} - "
            f"Error: {str(e)} - "
            f"Duration: {duration_ms:.2f}ms"
        )
        raise

    duration_ms = (time.perf_counter() - started) * 1000
    outcome = f"Status: {response.status_code} - Duration: {duration_ms:.2f}ms"

    # Log response with appropriate emoji based on status
    if response.status_code < 400:
        # Get response size if available
        response_size = response.headers.get("content-length", "unknown")
        logger.info(f"✅ {route} - {outcome} - Size: {response_size} bytes")
    elif response.status_code < 500:
        logger.warning(f"⚠️ {route} - {outcome}")
    else:
        logger.error(f"❌ {route} - {outcome}")

    return response
|
| 169 |
+
|
| 170 |
+
# Mount static files for logo
# Assets are expected in "frontend/public" under the parent of this file's directory.
static_dir = os.path.join(
    os.path.dirname(os.path.dirname(__file__)),
    "frontend",
    "public",
)
if not os.path.exists(static_dir):
    # No mount is registered in this case, so /static/* URLs will not resolve.
    logger.warning(f"Static directory not found: {static_dir}")
else:
    app.mount("/static", StaticFiles(directory=static_dir), name="static")
|
| 176 |
+
|
| 177 |
+
# Import the API route modules (auth, social, search, stats, contact, bills)
|
| 178 |
+
from api.routes import auth as auth_routes
|
| 179 |
+
from api.routes import social as social_routes
|
| 180 |
+
from api.routes import search as search_routes
|
| 181 |
+
# Use Neon database for fast stats queries (500x faster than parquet)
|
| 182 |
+
from api.routes import stats_neon as stats_routes # Was: stats
|
| 183 |
+
from api.routes import contact as contact_routes
|
| 184 |
+
# Use hybrid approach for bills: Neon for map, parquet for drill-down (saves space)
|
| 185 |
+
from api.routes import bills_neon as bills_routes # Was: bills
|
| 186 |
+
from api.database import init_db
|
| 187 |
+
|
| 188 |
+
# Register the routers in the same order as before; only the stats router is
# given an explicit prefix and tag here.
for _route_module in (auth_routes, social_routes, search_routes):
    app.include_router(_route_module.router)
app.include_router(stats_routes.router, prefix="/api", tags=["stats"])
for _route_module in (contact_routes, bills_routes):
    app.include_router(_route_module.router)
|
| 194 |
+
|
| 195 |
+
# Custom Swagger UI with logo
@app.get("/docs", include_in_schema=False)
async def custom_swagger_ui_html():
    """Serve the Swagger UI page with the CommunityOne favicon and UI options."""
    ui_options = {
        "defaultModelsExpandDepth": -1,
        "docExpansion": "list",
        "filter": True,
        "syntaxHighlight.theme": "monokai",
    }
    page_title = f"{app.title} - API Documentation"
    return get_swagger_ui_html(
        openapi_url=app.openapi_url,
        title=page_title,
        swagger_favicon_url="/static/communityone_logo_64.png",
        swagger_ui_parameters=ui_options,
    )
|
| 210 |
+
|
| 211 |
+
# Initialize database on startup
@app.on_event("startup")
async def init_database():
    """Call ``init_db()`` when the application starts.

    Best effort: any exception is logged as a warning and swallowed, so a
    database problem does not prevent the API from starting.
    """
    try:
        init_db()
        logger.info("✅ Authentication database initialized")
    except Exception as exc:
        logger.warning(f"⚠️ Database initialization skipped: {exc}")
|
| 220 |
+
|
| 221 |
+
# Initialize components
# Module-level singletons shared by the endpoint handlers in this module.
# Used by the workflow and status endpoints (agent registration, pipeline execution).
orchestrator = OrchestratorAgent()
# Used by the opportunity, document, heatmap and dashboard endpoints for queries.
pipeline = DeltaLakePipeline()
# Used by the heatmap and dashboard endpoints to build visualizations.
heatmap_generator = AdvocacyHeatmap()
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
# Pydantic models for API
|
| 228 |
+
class ScrapeTarget(BaseModel):
    """One source to scrape as part of a workflow request."""

    # Address of the source to scrape.
    url: str
    # Municipality the source belongs to.
    municipality: str
    # State of the municipality (presumably a state code - confirm with callers).
    state: str
    # Platform identifier; "generic" when the caller does not specify one.
    platform: str = "generic"
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
class WorkflowRequest(BaseModel):
    """Payload accepted by the endpoint that starts an analysis workflow."""

    # Sources to scrape; each entry is converted to a dict before being handed on.
    scrape_targets: List[ScrapeTarget]
    # Optional string-to-string mapping forwarded unchanged to the pipeline
    # (key names are not defined here - confirm against the orchestrator).
    date_range: Optional[Dict[str, str]] = None
    # Optional free-text description of the workflow.
    description: Optional[str] = None
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
class WorkflowResponse(BaseModel):
    """Result returned by workflow operations."""

    # Identifier assigned to the workflow.
    workflow_id: str
    # Short state label, e.g. "started".
    status: str
    # Human-readable summary of what happened.
    message: str
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
class OpportunityFilter(BaseModel):
    """Optional criteria for narrowing an opportunity query.

    Every field defaults to ``None``, meaning "do not filter on this".
    """

    state: Optional[str] = None
    municipality: Optional[str] = None
    topic: Optional[str] = None
    urgency: Optional[str] = None
    # Date bounds (presumably inclusive - confirm where this model is consumed).
    min_date: Optional[date] = None
    max_date: Optional[date] = None
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
class OpportunityResponse(BaseModel):
    """Envelope for a list of advocacy opportunities."""

    # Opportunity records as plain dicts.
    opportunities: List[Dict[str, Any]]
    # Number of opportunities reported for the query.
    total_count: int
    # Echo of the filter values the query was run with.
    filters_applied: Dict[str, Any]
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
class SystemStatus(BaseModel):
    """Snapshot of the system's health."""

    # Overall state label, e.g. "operational" or "error".
    status: str
    # Count of workflows currently tracked as active.
    active_workflows: int
    # Per-agent state as reported by the orchestrator.
    agent_status: Dict[str, Any]
    # When this snapshot was produced.
    last_update: datetime
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
# API Endpoints
|
| 276 |
+
|
| 277 |
+
# Static landing page returned by the root endpoint. This is a plain string
# (not an f-string), so the CSS braces and "{provider}" are emitted literally.
_ROOT_PAGE_HTML = """
    <!DOCTYPE html>
    <html>
    <head>
        <title>Open Navigator API</title>
        <style>
            body {
                font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
                max-width: 900px;
                margin: 50px auto;
                padding: 20px;
                line-height: 1.6;
            }
            .header {
                display: flex;
                align-items: center;
                gap: 20px;
                margin-bottom: 30px;
                border-bottom: 2px solid #1976d2;
                padding-bottom: 20px;
            }
            .logo {
                width: 80px;
                height: 80px;
            }
            h1 {
                color: #1976d2;
                margin: 0;
            }
            .tagline {
                color: #666;
                font-size: 1.1em;
            }
            .endpoint {
                background: linear-gradient(135deg, #f5f5f5 0%, #e8f4f8 100%);
                padding: 15px;
                margin: 10px 0;
                border-radius: 8px;
                border-left: 4px solid #1976d2;
            }
            .endpoint strong {
                color: #1976d2;
            }
            code {
                background: #e0e0e0;
                padding: 3px 8px;
                border-radius: 4px;
                font-family: 'Courier New', monospace;
            }
            .docs-link {
                display: inline-block;
                background: #1976d2;
                color: white;
                padding: 12px 24px;
                text-decoration: none;
                border-radius: 6px;
                margin-top: 20px;
                transition: background 0.3s;
            }
            .docs-link:hover {
                background: #1565c0;
            }
        </style>
    </head>
    <body>
        <div class="header">
            <img src="/static/communityone_logo.svg" alt="CommunityOne Logo" class="logo">
            <div>
                <h1>Open Navigator API</h1>
                <p class="tagline">CommunityOne: The open path to everything local</p>
            </div>
        </div>

        <h2>🔑 Key Endpoints:</h2>
        <div class="endpoint">
            <strong>POST /workflow/start</strong> - Start a new analysis workflow
        </div>
        <div class="endpoint">
            <strong>GET /opportunities</strong> - Query advocacy opportunities
        </div>
        <div class="endpoint">
            <strong>GET /heatmap</strong> - Get advocacy heatmap visualization
        </div>
        <div class="endpoint">
            <strong>GET /status</strong> - System status and health
        </div>
        <div class="endpoint">
            <strong>POST /auth/login/{provider}</strong> - OAuth login (HuggingFace, Google, Facebook, GitHub)
        </div>

        <a href="/docs" class="docs-link">📚 View Full API Documentation</a>
    </body>
    </html>
    """


@app.get("/", response_class=HTMLResponse)
async def root():
    """Return the static HTML landing page that lists the key endpoints."""
    return HTMLResponse(content=_ROOT_PAGE_HTML)
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
@app.post("/workflow/start", response_model=WorkflowResponse)
async def start_workflow(
    request: WorkflowRequest,
    background_tasks: BackgroundTasks
):
    """
    Start a new policy analysis workflow.

    This initiates the full multi-agent pipeline:
    1. Scrape meeting minutes from specified sources
    2. Parse and structure the data
    3. Classify by oral health topics
    4. Analyze sentiment and policy stance
    5. Generate advocacy materials

    The pipeline runs as a background task; the response is returned as soon
    as the task has been queued. Any failure while queuing is reported as an
    HTTP 500 whose detail is the exception text.
    """
    import uuid

    try:
        # Register agents with orchestrator
        # NOTE(review): fresh agent instances are registered on every request;
        # confirm that register_agent() tolerates repeated registration.
        for agent_cls in (
            ScraperAgent,
            ParserAgent,
            ClassifierAgent,
            SentimentAnalyzerAgent,
            AdvocacyWriterAgent,
        ):
            orchestrator.register_agent(agent_cls())

        # Convert targets to dict format
        targets = [target.dict() for target in request.scrape_targets]

        # Start workflow in background
        background_tasks.add_task(
            orchestrator.execute_pipeline,
            targets,
            request.date_range
        )

        # The timestamp alone has one-second resolution, so two workflows
        # started within the same second would share an ID; the random suffix
        # keeps IDs unique while preserving the "wf-<date>-<time>" prefix.
        workflow_id = (
            "wf-"
            + datetime.utcnow().strftime("%Y%m%d-%H%M%S")
            + "-"
            + uuid.uuid4().hex[:8]
        )

        return WorkflowResponse(
            workflow_id=workflow_id,
            status="started",
            message=f"Workflow started with {len(targets)} targets"
        )

    except Exception as e:
        logger.error(f"Error starting workflow: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 419 |
+
|
| 420 |
+
|
| 421 |
+
@app.get("/workflow/{workflow_id}/status")
async def get_workflow_status(workflow_id: str):
    """Report the status of a workflow.

    Placeholder: the same fixed status is returned for any ``workflow_id``;
    no workflow state is looked up.
    """
    placeholder_state = {
        "status": "running",
        "stage": "classification",
        "progress": 0.6,
    }
    return {"workflow_id": workflow_id, **placeholder_state}
|
| 432 |
+
|
| 433 |
+
|
| 434 |
+
@app.get("/opportunities", response_model=OpportunityResponse)
async def get_opportunities(
    state: Optional[str] = Query(None, description="Filter by state code"),
    municipality: Optional[str] = Query(None, description="Filter by municipality"),
    topic: Optional[str] = Query(None, description="Filter by policy topic"),
    urgency: Optional[str] = Query(None, description="Filter by urgency level"),
    limit: int = Query(100, ge=1, le=1000, description="Maximum results to return")
):
    """
    Query advocacy opportunities.

    Returns a list of identified opportunities for advocacy action
    based on the specified filters.

    ``state`` and ``urgency`` are applied by the pipeline query;
    ``municipality`` (case-insensitive) and ``topic`` (exact match) are
    applied afterwards in memory. Any failure is reported as an HTTP 500.
    """
    try:
        # Query from Delta Lake
        opportunities = pipeline.query_opportunities_by_state(state, urgency)

        # Apply additional filters
        if municipality:
            wanted = municipality.lower()  # hoisted: invariant across records
            opportunities = [
                opp for opp in opportunities
                # "or ''" also covers records that store municipality as None,
                # which previously raised AttributeError and produced a 500.
                if (opp.get("municipality") or "").lower() == wanted
            ]

        if topic:
            opportunities = [
                opp for opp in opportunities
                if opp.get("topic") == topic
            ]

        # Limit results
        opportunities = opportunities[:limit]

        return OpportunityResponse(
            opportunities=opportunities,
            # NOTE(review): counted after truncation, so this is the number of
            # records returned rather than the number that matched.
            total_count=len(opportunities),
            filters_applied={
                "state": state,
                "municipality": municipality,
                "topic": topic,
                "urgency": urgency
            }
        )

    except Exception as e:
        logger.error(f"Error querying opportunities: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 482 |
+
|
| 483 |
+
|
| 484 |
+
# React frontend endpoints with /api/ prefix
|
| 485 |
+
# State center coordinates for mapping. Also serves as the whitelist of states
# whose parquet data get_api_opportunities() will read.
_OPPORTUNITY_STATE_COORDS = {
    'AL': (32.806671, -86.791130),
    'GA': (33.040619, -83.643074),
    'IN': (39.849426, -86.258278),
    'MA': (42.230171, -71.530106),
    'WA': (47.400902, -121.490494),
    'WI': (44.268543, -89.616508)
}


def _classify_bill_title(title):
    """Map a bill title to ``(urgency, confidence, topic)`` by keyword."""
    title_lower = title.lower() if title else ""
    # Both pro- and anti-fluoride bills are treated as critical. 'fluorid'
    # covers "fluoride" and "fluoridation" (the stem the SQL filter selects
    # on); 'fluorin' is kept so previously matching titles still match.
    if 'fluorid' in title_lower or 'fluorin' in title_lower or 'water' in title_lower:
        return 'critical', 0.9, 'water_fluoridation'
    if 'dental' in title_lower:
        return 'high', 0.75, 'school_dental_screening'
    return 'medium', 0.6, 'medicaid_dental_expansion'


def _bill_date_iso(updated_at, created_at):
    """Return the bill's update (else creation) timestamp as text, or None."""
    stamp = updated_at or created_at
    if stamp is None:
        return None
    # Values without isoformat() (e.g. already-formatted strings) pass through as text.
    return stamp.isoformat() if hasattr(stamp, "isoformat") else str(stamp)


@app.get("/api/opportunities")
async def get_api_opportunities(
    state: Optional[str] = Query(None),
    topic: Optional[str] = Query(None),
    urgency: Optional[str] = Query(None),
    limit: int = Query(100)
):
    """API endpoint for React frontend opportunities page - returns fluoridation bills as advocacy opportunities.

    Bills are read from per-state parquet files, classified by title keywords,
    optionally filtered by ``topic`` and ``urgency``, and placed near the
    state's center with a small random offset. Best effort: any error is
    logged and an empty list is returned.
    """
    try:
        import duckdb
        from pathlib import Path
        import random

        # A non-positive limit can never yield results; answer without touching SQL.
        if limit <= 0:
            return {"opportunities": []}

        if state:
            state_key = state.strip().upper()
            # The state value ends up inside the SQL text and the file path,
            # so only known codes are accepted. Unknown codes have no
            # coordinates and could not produce results anyway.
            if state_key not in _OPPORTUNITY_STATE_COORDS:
                return {"opportunities": []}
            states = [state_key]
        else:
            states = list(_OPPORTUNITY_STATE_COORDS)

        opportunities = []

        for st in states:
            # Only the first `limit` entries are returned, so stop reading
            # further states once enough have been collected.
            if len(opportunities) >= limit:
                break

            parquet_path = Path(f"data/gold/states/{st}/bills_bills.parquet")
            if not parquet_path.exists():
                continue

            # Query for fluoridation-related bills. Interpolation is safe
            # here: `st` is whitelisted above and `limit` is an int.
            query = f"""
                SELECT
                    '{st}' as state,
                    title,
                    identifier,
                    session,
                    latest_action,
                    created_at,
                    updated_at
                FROM read_parquet('{parquet_path}')
                WHERE LOWER(title) LIKE '%fluorid%'
                    OR LOWER(title) LIKE '%dental%'
                    OR LOWER(title) LIKE '%oral health%'
                    OR LOWER(title) LIKE '%water treat%'
                LIMIT {limit}
            """

            result = duckdb.query(query).fetchall()
            base_lat, base_lon = _OPPORTUNITY_STATE_COORDS[st]

            # Convert to opportunities format
            for row in result:
                state_code, title, identifier, session, latest_action, created_at, updated_at = row

                urgency_level, confidence, topic_type = _classify_bill_title(title)

                # Filter by topic if specified
                if topic and topic_type != topic:
                    continue

                # Filter by urgency if specified
                if urgency and urgency_level != urgency:
                    continue

                # Slight random offset so multiple bills in one state do not overlap
                lat_offset = random.uniform(-0.5, 0.5)
                lon_offset = random.uniform(-0.5, 0.5)

                opportunities.append({
                    'state': state_code,
                    'municipality': f'{state_code} Legislature',
                    'latitude': base_lat + lat_offset,
                    'longitude': base_lon + lon_offset,
                    'topic': topic_type,
                    'urgency': urgency_level,
                    'confidence': confidence,
                    'meeting_date': _bill_date_iso(updated_at, created_at),
                    'title': title,
                    'bill_id': identifier,
                    'session': session,
                    'latest_action': latest_action
                })

        return {"opportunities": opportunities[:limit]}
    except Exception as e:
        logger.error(f"Error: {e}")
        return {"opportunities": []}
|
| 589 |
+
|
| 590 |
+
|
| 591 |
+
@app.get("/api/documents")
async def get_api_documents(
    search: Optional[str] = Query(None),
    page: int = Query(1),
    limit: int = Query(20)
):
    """API endpoint for React frontend documents page.

    Returns one page of documents, optionally restricted to those whose
    title, municipality or content contains ``search`` (case-insensitive).
    Best effort: on any error an empty page is returned.
    """
    # Out-of-range paging values would otherwise become negative slice bounds
    # and silently return the wrong window.
    page = max(page, 1)
    limit = max(limit, 0)

    try:
        # Get all opportunities (documents)
        documents = pipeline.query_opportunities_by_state(None, None)

        # Apply search filter
        if search:
            needle = search.lower()
            searched_fields = ("title", "municipality", "content")
            documents = [
                d for d in documents
                # str(... or "") tolerates missing, None and non-string values
                if any(
                    needle in str(d.get(field) or "").lower()
                    for field in searched_fields
                )
            ]

        # Paginate
        start = (page - 1) * limit
        end = start + limit

        return {
            "documents": documents[start:end],
            "total": len(documents),
            "page": page,
            "limit": limit
        }
    except Exception as e:
        logger.error(f"Error: {e}")
        # Same keys as the success payload so clients can rely on the shape.
        return {"documents": [], "total": 0, "page": page, "limit": limit}
|
| 625 |
+
|
| 626 |
+
|
| 627 |
+
@app.get("/opportunities/{opportunity_id}")
async def get_opportunity_detail(opportunity_id: str):
    """Return the stored document for one opportunity, or HTTP 404 if none is found."""
    document = pipeline.get_document_by_id(opportunity_id)
    if document:
        return document
    raise HTTPException(status_code=404, detail="Opportunity not found")
|
| 637 |
+
|
| 638 |
+
|
| 639 |
+
@app.get("/opportunities/{opportunity_id}/materials")
async def get_advocacy_materials(opportunity_id: str):
    """Return advocacy materials for an opportunity.

    Placeholder: a fixed template is returned for any ``opportunity_id``;
    nothing is queried.
    """
    placeholder_materials = {
        "email": {
            "subject": "Support Oral Health Policy",
            "body": "...",
        },
        "talking_points": [],
        "social_media": {},
    }
    return {
        "opportunity_id": opportunity_id,
        "materials": placeholder_materials,
    }
|
| 655 |
+
|
| 656 |
+
|
| 657 |
+
@app.get("/heatmap", response_class=HTMLResponse)
async def get_heatmap(
    urgency: Optional[str] = Query(None, description="Filter by urgency level")
):
    """
    Get interactive heatmap visualization.

    Queries opportunities (optionally restricted by urgency), renders them
    on a map and returns the map's HTML. Any failure is reported as HTTP 500.
    """
    try:
        matching = pipeline.query_opportunities_by_state(None, urgency)
        rendered_map = heatmap_generator.create_folium_map(
            matching,
            title="Open Navigator - Advocacy Heatmap",
        )
        map_html = rendered_map._repr_html_()
        return HTMLResponse(content=map_html)
    except Exception as e:
        logger.error(f"Error generating heatmap: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 683 |
+
|
| 684 |
+
|
| 685 |
+
@app.get("/dashboard")
async def get_dashboard():
    """
    Get complete dashboard data including statistics and visualizations.

    The two chart entries are returned as the output of their ``to_json()``
    methods. Any failure is reported as HTTP 500.
    """
    try:
        all_opportunities = pipeline.query_opportunities_by_state(None, None)
        dashboard = heatmap_generator.create_dashboard(all_opportunities)

        # Statistics pass through as-is; the chart objects are serialized.
        payload = {"statistics": dashboard["statistics"]}
        for chart_key in ("topic_distribution", "timeline"):
            payload[chart_key] = dashboard[chart_key].to_json()
        return payload
    except Exception as e:
        logger.error(f"Error generating dashboard: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 707 |
+
|
| 708 |
+
|
| 709 |
+
@app.get("/api/dashboard")
async def get_api_dashboard():
    """
    Get dashboard statistics for React frontend.

    Returns data in format expected by frontend Dashboard component:
    document/opportunity totals, number of distinct states, per-topic counts
    and the ten opportunities with the latest ``meeting_date``. Best effort:
    on any error a zeroed payload with the same keys is returned.
    """
    from collections import Counter

    try:
        # Query all opportunities
        opportunities = pipeline.query_opportunities_by_state(None, None)

        # Count topics
        topics_count = dict(
            Counter(opp.get("topic", "unknown") for opp in opportunities)
        )

        # Get unique states
        states = {opp.get("state") for opp in opportunities if opp.get("state")}

        # Get recent opportunities (last 10). The key is normalised to text so
        # records with a null or non-string meeting_date cannot raise
        # TypeError during comparison and blank the whole dashboard.
        recent = sorted(
            opportunities,
            key=lambda opp: str(opp.get("meeting_date") or ""),
            reverse=True
        )[:10]

        return {
            "total_documents": len(opportunities),
            "total_opportunities": len(opportunities),
            "states_monitored": len(states),
            "topics": topics_count,
            "recent_opportunities": recent
        }

    except Exception as e:
        logger.error(f"Error generating API dashboard: {e}")
        # Return an empty (zeroed) payload if there's an error
        return {
            "total_documents": 0,
            "total_opportunities": 0,
            "states_monitored": 0,
            "topics": {},
            "recent_opportunities": []
        }
|
| 753 |
+
|
| 754 |
+
|
| 755 |
+
@app.get("/topics")
async def get_topics():
    """Return the configured policy topics together with how many there are."""
    return dict(
        topics=settings.policy_topics,
        count=len(settings.policy_topics),
    )
|
| 762 |
+
|
| 763 |
+
|
| 764 |
+
@app.get("/states")
async def get_states():
    """Return the list of states with active opportunities.

    Placeholder: a fixed list is returned; no data source is queried.
    """
    placeholder_states = ["CA", "NY", "TX", "FL", "IL"]
    return {
        "states": placeholder_states,
        "count": len(placeholder_states),
    }
|
| 774 |
+
|
| 775 |
+
|
| 776 |
+
@app.get("/status", response_model=SystemStatus)
async def get_system_status():
    """Report overall system status.

    Returns "operational" with the orchestrator's agent states and active
    workflow count, or an "error" snapshot with empty values if gathering
    them fails.
    """
    try:
        agent_states = orchestrator.get_all_agent_states()
        workflow_count = len(orchestrator.active_workflows)
        return SystemStatus(
            status="operational",
            active_workflows=workflow_count,
            agent_status=agent_states,
            last_update=datetime.utcnow(),
        )
    except Exception as e:
        logger.error(f"Error getting system status: {e}")
        return SystemStatus(
            status="error",
            active_workflows=0,
            agent_status={},
            last_update=datetime.utcnow(),
        )
|
| 797 |
+
|
| 798 |
+
|
| 799 |
+
@app.get("/nonprofits")
async def search_nonprofits(
    location: str = Query("Tuscaloosa, AL", description="City, State format"),
    keyword: Optional[str] = Query(None, description="Service keyword (e.g., 'dental', 'health')"),
    state: Optional[str] = Query(None, description="2-letter state code (e.g., 'AL')"),
    ntee_code: Optional[str] = Query(None, description="NTEE code (e.g., 'E' for health)"),
    source: Optional[str] = Query(None, description="Data source: 'propublica', 'everyorg', 'all'")
):
    """
    Search for nonprofits using free open data APIs.

    Integrates data from:
    - ProPublica Nonprofit Explorer (financial data, NTEE codes)
    - Every.org (mission statements, logos)
    - IRS TEOS (official tax-exempt status)

    Example: /nonprofits?location=Tuscaloosa,AL&keyword=dental&ntee_code=E
    """
    try:
        from discovery.nonprofit_discovery import NonprofitDiscovery

        discovery = NonprofitDiscovery()
        results = []

        # "City, State" -> city and (optionally) state; an explicit `state`
        # argument wins, and "AL" is the final fallback.
        pieces = [piece.strip() for piece in location.split(',')]
        city = pieces[0]
        state_from_location = pieces[1] if len(pieces) > 1 else None
        state_code = state or state_from_location or "AL"

        # No source, or 'all', means both providers; otherwise only the named one.
        if not source or source == 'all':
            sources_to_query = ['propublica', 'everyorg']
        else:
            sources_to_query = [source]

        # ProPublica lookup; a failure is logged and does not abort the search.
        if 'propublica' in sources_to_query:
            try:
                propublica_results = discovery.search_propublica(
                    state=state_code,
                    city=city,
                    ntee_code=ntee_code
                )
                results.extend(propublica_results)
                logger.info(f"ProPublica: Found {len(propublica_results)} organizations")
            except Exception as e:
                logger.warning(f"ProPublica search failed: {e}")

        # Every.org lookup; a failure is logged and does not abort the search.
        if 'everyorg' in sources_to_query:
            try:
                # Translate the free-text keyword into Every.org cause names.
                causes = []
                if keyword:
                    keyword_lower = keyword.lower()
                    if any(term in keyword_lower for term in ('health', 'dental', 'medical')):
                        causes.append('health')
                    if any(term in keyword_lower for term in ('education', 'school')):
                        causes.append('education')

                everyorg_results = discovery.search_everyorg(
                    location=location,
                    causes=causes if causes else None
                )
                results.extend(everyorg_results)
                logger.info(f"Every.org: Found {len(everyorg_results)} organizations")
            except Exception as e:
                logger.warning(f"Every.org search failed: {e}")

        # Keep only organizations whose name, description, mission or NTEE
        # description mentions the keyword (case-insensitive).
        if keyword and results:
            needle = keyword.lower()
            text_fields = ('name', 'description', 'mission', 'ntee_description')
            results = [
                org for org in results
                if needle in ' '.join(str(org.get(field, '')) for field in text_fields).lower()
            ]

        return {
            "location": location,
            "keyword": keyword,
            "state": state_code,
            "ntee_code": ntee_code,
            "count": len(results),
            "nonprofits": results,
            "data_sources": {
                "propublica": "https://projects.propublica.org/nonprofits/api",
                "everyorg": "https://www.every.org/nonprofit-api",
                "irs_teos": "https://www.irs.gov/charities-non-profits/tax-exempt-organization-search-bulk-data-downloads"
            }
        }

    except Exception as e:
        logger.error(f"Nonprofit search error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 901 |
+
|
| 902 |
+
|
| 903 |
+
@app.get("/api/nonprofits")
async def search_nonprofits_api(
    location: str = Query("Tuscaloosa, AL", description="City, State format"),
    keyword: Optional[str] = Query(None, description="Service keyword (e.g., 'dental', 'health')"),
    state: Optional[str] = Query(None, description="2-letter state code (e.g., 'AL')"),
    ntee_code: Optional[str] = Query(None, description="NTEE code (e.g., 'E' for health)"),
    source: Optional[str] = Query(None, description="Data source: 'propublica', 'everyorg', 'all'")
):
    """
    Search for nonprofits using free open data APIs (API-prefixed endpoint for frontend).

    Same behaviour as /nonprofits; every argument is forwarded unchanged to
    ``search_nonprofits``.
    """
    return await search_nonprofits(
        location=location,
        keyword=keyword,
        state=state,
        ntee_code=ntee_code,
        source=source,
    )
|
| 917 |
+
|
| 918 |
+
|
| 919 |
+
@app.get("/data/status")
async def get_data_ingestion_status():
    """
    Get status of reference data ingestions.

    Shows Census jurisdictions, NCES school districts, and nonprofit cache status.

    All figures are fixed values written in this function except
    ``nonprofits.cached_searches``, which is the number of ``*.json`` files
    found in the nonprofit cache directory. On error a dict with a single
    ``error`` key is returned.
    """
    try:
        from pathlib import Path

        status = {
            "census": {
                "jurisdictions": 90000,
                "counties": 3144,
                "municipalities": 19500,
                "status": "Check data/bronze/census_jurisdictions"
            },
            "nces": {
                "school_districts": 13000,
                "status": "Check data/bronze/nces_school_districts"
            },
            "nonprofits": {
                "total_available": 3000000,
                "cached_searches": 0,
                "cache_path": "data/cache/nonprofits"
            },
            "meetings": {
                "meetingbank": 1366,
                "city_scrapers": "100-500",
                "open_states": "50+"
            }
        }

        # Check cache directories
        cache_dir = Path("data/cache/nonprofits")
        if cache_dir.exists():
            # Count lazily instead of materialising the file list.
            status["nonprofits"]["cached_searches"] = sum(
                1 for _ in cache_dir.glob("*.json")
            )

        return status

    except Exception as e:
        logger.error(f"Data status error: {e}")
        return {"error": str(e)}
|
| 964 |
+
|
| 965 |
+
|
| 966 |
+
@app.post("/data/ingest/nonprofits")
async def bulk_ingest_nonprofits(
    state: str = Query(..., description="State code (e.g., AL)"),
    ntee_code: Optional[str] = Query("E", description="NTEE code (default: E for Health)")
):
    """
    Bulk ingest nonprofit data for a state.

    Runs a ProPublica search through ``NonprofitDiscovery.search_propublica``
    for the given state and NTEE code and reports how many organizations
    came back.

    Args:
        state: State code (required).
        ntee_code: NTEE code to filter by; defaults to "E".

    Returns:
        dict: ``message``, ``state``, ``ntee_code``, ``count`` and
        ``cache_location``.

    Raises:
        HTTPException: 500 with the error text when the search fails.
    """
    # NOTE(review): the response advertises data/cache/nonprofits as the cache
    # location; the caching itself is presumably done inside
    # search_propublica - confirm against discovery/nonprofit_discovery.py.
    try:
        from discovery.nonprofit_discovery import NonprofitDiscovery

        client = NonprofitDiscovery()
        found = client.search_propublica(state=state, ntee_code=ntee_code)
        total = len(found)

        summary = {
            "message": f"Ingested {total} nonprofits for {state}",
            "state": state,
            "ntee_code": ntee_code,
            "count": total,
            "cache_location": "data/cache/nonprofits"
        }
        return summary

    except Exception as e:
        logger.error(f"Nonprofit ingestion error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 993 |
+
|
| 994 |
+
|
| 995 |
+
@app.get("/health")
async def health_check():
    """
    Health check endpoint for monitoring.

    Returns:
        dict: A fixed ``status`` of "healthy", the current UTC time as an
        ISO-8601 string (naive, no offset suffix) and a hard-coded version.
    """
    checked_at = datetime.utcnow().isoformat()
    payload = {
        "status": "healthy",
        "timestamp": checked_at,
        "version": "1.0.0"
    }
    return payload
|
| 1003 |
+
|
| 1004 |
+
|
| 1005 |
+
@app.get("/api/health")
async def api_health_check():
    """
    Health check endpoint for monitoring (API path).

    Alias of ``/health`` under the ``/api`` prefix. It delegates to
    ``health_check`` so both paths always return the same payload shape,
    instead of keeping a second copy of the response body.

    Returns:
        dict: Whatever ``health_check`` returns.
    """
    return await health_check()
|
| 1013 |
+
|
| 1014 |
+
|
| 1015 |
+
@app.post("/admin/initialize")
async def initialize_system():
    """
    Initialize Delta Lake tables and system components.

    Calls ``pipeline.initialize_tables()`` and reports success.

    Returns:
        dict: ``status`` "success" plus a confirmation ``message``.

    Raises:
        HTTPException: 500 with the error text when initialization fails.
    """
    # NOTE(review): no authentication is visible on this admin route in this
    # block - confirm it is protected elsewhere (middleware / proxy).
    try:
        pipeline.initialize_tables()
    except Exception as e:
        logger.error(f"Error initializing system: {e}")
        raise HTTPException(status_code=500, detail=str(e))
    else:
        return {
            "status": "success",
            "message": "System initialized successfully"
        }
|
| 1029 |
+
|
| 1030 |
+
|
| 1031 |
+
# Debate-framework grading endpoints
|
| 1032 |
+
@app.post("/api/debate-grade")
async def grade_decision_with_debate_framework(
    document_id: Optional[str] = Query(None, description="Document ID to grade"),
    text: Optional[str] = Query(None, description="Text to grade directly"),
    title: Optional[str] = Query("", description="Document title")
):
    """
    Grade a government decision using debate framework (Harms/Solvency/Topicality).

    Translates debate concepts for laypeople:
    - Harms → "The Problem": Why is this a crisis?
    - Solvency → "The Fix": How does this solution work?
    - Topicality → "The Scope": Does the government have authority?

    Example: /api/debate-grade?text=The city council approved funding for dental screening...

    Args:
        document_id: ID of a stored document, looked up through
            ``pipeline.get_document_by_id``. Takes precedence over ``text``
            when both are supplied.
        text: Raw text to grade when no ``document_id`` is given.
        title: Title attached to ad-hoc ``text`` input.

    Returns:
        dict: ``document_id``, ``title``, the grader's ``debate_grade`` and a
        plain-language ``explanation`` of the three dimensions.

    Raises:
        HTTPException: 400 when neither input is given, 404 when the
            document ID is unknown, 500 for any other failure.
    """
    try:
        from agents.debate_grader import DebateGraderAgent

        grader = DebateGraderAgent()

        # Nothing to grade at all.
        if not document_id and not text:
            raise HTTPException(status_code=400, detail="Provide either document_id or text")

        if document_id:
            # Stored document: resolve it through the pipeline.
            document = pipeline.get_document_by_id(document_id)
            if not document:
                raise HTTPException(status_code=404, detail="Document not found")
        else:
            # Ad-hoc text: wrap it in the dict shape the grader is given.
            document = {
                "content": text,
                "title": title,
                "id": "custom_text"
            }

        # Grade the document
        grade = await grader._grade_document(document)

        dimension_help = {
            "harms": "This measures how well the decision identifies and documents the problem using data and evidence",
            "solvency": "This measures how clearly the solution is defined and whether it will actually fix the problem",
            "topicality": "This measures whether the government body has the legal authority to take this action"
        }

        return {
            "document_id": document.get("id"),
            "title": document.get("title", ""),
            "debate_grade": grade,
            "explanation": dimension_help
        }

    except HTTPException:
        # Deliberate 4xx responses must not be rewrapped as 500s.
        raise
    except Exception as e:
        logger.error(f"Debate grading error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 1086 |
+
|
| 1087 |
+
|
| 1088 |
+
@app.post("/api/debate-grade/batch")
async def grade_decisions_batch(
    state: Optional[str] = Query(None, description="Filter by state"),
    topic: Optional[str] = Query(None, description="Filter by topic"),
    limit: int = Query(50, description="Number of documents to grade")
):
    """
    Grade multiple government decisions using debate framework.

    Returns aggregate insights about decision quality across dimensions.

    Args:
        state: State passed to ``pipeline.query_opportunities_by_state``.
        topic: When given, only documents whose ``topic`` equals this value
            are graded.
        limit: Maximum number of documents to grade. Negative values are
            treated as 0 so they cannot slice from the end of the list.

    Returns:
        dict: ``graded_count``, the graded ``documents``, aggregate
        ``insights`` and a plain-language ``explanation`` of the insight
        fields. When the grader returns no messages the batch is reported
        as empty instead of failing.

    Raises:
        HTTPException: 500 with the error text on unexpected failures.
    """
    try:
        from agents.debate_grader import DebateGraderAgent
        from agents.base import AgentMessage, MessageType, AgentRole

        grader = DebateGraderAgent()

        # Get documents to grade
        documents = pipeline.query_opportunities_by_state(state, None)

        if topic:
            documents = [d for d in documents if d.get("topic") == topic]

        # Clamp so that e.g. limit=-1 does not mean "all but the last one".
        documents = documents[:max(limit, 0)]

        # Create message and process
        message = AgentMessage(
            message_id=f"batch_grade_{datetime.utcnow().timestamp()}",
            sender=AgentRole.ORCHESTRATOR,
            recipient=AgentRole.DEBATE_GRADER,
            message_type=MessageType.COMMAND,
            payload={"documents": documents}
        )

        result = await grader.process(message)

        # An empty reply would make result[0] raise IndexError and surface
        # as an opaque 500; treat it as an empty batch instead.
        payload = result[0].payload if result else {}
        graded_documents = payload.get("documents", [])
        insights = payload.get("insights", {})

        return {
            "graded_count": len(graded_documents),
            "documents": graded_documents,
            "insights": insights,
            "explanation": {
                "average_scores": "Average scores across all three debate dimensions (out of 5)",
                "strongest_dimension": "Which dimension governments perform best on",
                "weakest_dimension": "Which dimension needs the most improvement"
            }
        }

    except HTTPException:
        # Same handling as the single-document endpoint: never rewrap a
        # deliberate HTTP error as a 500.
        raise
    except Exception as e:
        logger.error(f"Batch debate grading error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 1140 |
+
|
| 1141 |
+
|
| 1142 |
+
@app.on_event("startup")
async def startup_event():
    """
    Initialize system on startup with data validation.

    Logs the active configuration, then reports (log output only, nothing is
    raised or returned):

    1. Which reference parquet files under ``data/gold/reference`` exist and
       how many records each holds.
    2. Which state directories under ``data/gold/states`` contain any of the
       key parquet files. Details are printed for the first 10 directories
       only, but the summary count covers every directory.
    3. When the ``HF_SPACES`` environment variable is "1": whether a sample
       of HuggingFace datasets answers an HTTP HEAD request with 200.
    """
    banner = "=" * 80
    logger.info(banner)
    logger.info("🚀 STARTING Open Navigator API")
    logger.info(banner)
    logger.info(f"Configuration: {settings.catalog_name}.{settings.schema_name}")
    logger.info(f"Log Level: {settings.log_level}")
    logger.info(f"Log File: {settings.log_file}")

    # Check if running on HuggingFace Spaces
    is_hf_spaces = os.getenv("HF_SPACES") == "1"
    if is_hf_spaces:
        logger.info("🤗 Running on HuggingFace Spaces")
    else:
        logger.info("💻 Running in local/standard environment")

    # Validate critical data files
    logger.info("")
    logger.info("📊 VALIDATING DATA AVAILABILITY...")
    logger.info("-" * 80)

    data_dir = Path("data/gold")
    critical_files = []

    # Check reference data (critical)
    reference_checks = [
        "reference/jurisdictions_cities.parquet",
        "reference/jurisdictions_counties.parquet",
        "reference/causes_ntee_codes.parquet",
    ]

    for file_pattern in reference_checks:
        file_path = data_dir / file_pattern
        if not file_path.exists():
            logger.warning(f" ⚠️ {file_pattern}: NOT FOUND")
            continue

        size_mb = file_path.stat().st_size / (1024 * 1024)
        try:
            # Imported inside the try so a missing pandas install is logged
            # per file instead of aborting startup.
            import pandas as pd
            df = pd.read_parquet(file_path)
            logger.info(f" ✅ {file_pattern}: {len(df):,} records ({size_mb:.2f} MB)")
            critical_files.append(file_pattern)
        except Exception as e:
            logger.error(f" ❌ {file_pattern}: ERROR - {e}")

    # Check state data (optional - shows what's available)
    logger.info("")
    logger.info("📍 STATE DATA AVAILABILITY:")

    # Loop-invariant: the files whose presence marks a state as having data.
    key_files = [
        "nonprofits_organizations.parquet",
        "contacts_officials.parquet",
        "events.parquet",
    ]
    max_states_shown = 10

    states_dir = data_dir / "states"
    if states_dir.exists():
        state_dirs = sorted([d for d in states_dir.iterdir() if d.is_dir()])
        states_with_data = []

        for index, state_dir in enumerate(state_dirs):
            state = state_dir.name
            # "nonprofits_organizations.parquet" -> "organizations", etc.
            files_found = [
                filename.split('.')[0].split('_')[-1]
                for filename in key_files
                if (state_dir / filename).exists()
            ]
            if not files_found:
                continue

            states_with_data.append(state)
            # Keep the log short: only the first few directories are listed.
            if index < max_states_shown:
                logger.info(f" ✅ {state}: {', '.join(files_found)}")

        total_states = len(state_dirs)
        if total_states > max_states_shown:
            logger.info(f" ... and {total_states - max_states_shown} more states")

        logger.info("")
        # Report the states that actually contain key files, not merely the
        # number of directories.
        logger.info(
            f" 📊 Total states with data: {len(states_with_data)} "
            f"of {total_states} state directories"
        )
    else:
        logger.warning(" ⚠️ No state data directory found")

    # Validate HuggingFace datasets if running on HF Spaces
    hf_datasets_ok = 0
    test_datasets = []
    if is_hf_spaces:
        logger.info("")
        logger.info("🤗 VALIDATING HUGGINGFACE DATASETS...")
        logger.info("-" * 80)

        # Check a sample of critical datasets
        import requests
        from api.routes.bills import get_hf_dataset_url

        test_datasets = [
            ("states-ma-bills-bills", "Massachusetts Bills"),
            ("states-al-bills-bills", "Alabama Bills"),
            ("states-ma-contacts-local-officials", "Massachusetts Local Officials"),
        ]

        for dataset_name, display_name in test_datasets:
            url = get_hf_dataset_url(dataset_name)
            try:
                # NOTE(review): synchronous HTTP call inside an async handler;
                # acceptable at startup, bounded by the 10 s timeout.
                response = requests.head(url, timeout=10, allow_redirects=True)
                if response.status_code == 200:
                    logger.info(f" ✅ {display_name}: Accessible")
                    hf_datasets_ok += 1
                else:
                    logger.error(f" ❌ {display_name}: HTTP {response.status_code}")
                    logger.error(f" URL: {url}")
            except Exception as e:
                logger.error(f" ❌ {display_name}: {type(e).__name__} - {e}")
                logger.error(f" URL: {url}")

        logger.info("")
        logger.info(f" 📊 HuggingFace datasets validated: {hf_datasets_ok}/{len(test_datasets)}")

        if hf_datasets_ok < len(test_datasets):
            logger.warning(" ⚠️ Some datasets are not accessible - API may have limited functionality")

    logger.info("")
    logger.info(banner)
    logger.info(f"✅ API READY - {len(critical_files)}/{len(reference_checks)} critical files available")
    if is_hf_spaces:
        # Only claim success when every sampled dataset was reachable.
        if hf_datasets_ok == len(test_datasets):
            logger.info("✅ HuggingFace datasets validated")
        else:
            logger.warning(
                f"⚠️ HuggingFace datasets only partially validated: "
                f"{hf_datasets_ok}/{len(test_datasets)} accessible"
            )
    logger.info(banner)
    logger.info("")
|
| 1271 |
+
|
| 1272 |
+
|
| 1273 |
+
# Shutdown event
|
| 1274 |
+
@app.on_event("shutdown")
async def shutdown_event():
    """
    Cleanup on shutdown.

    Only logs that the API is stopping; no resources are released here.
    """
    # Use the same product name as the startup banner ("Open Navigator API")
    # so start and stop lines match in the logs.
    logger.info("Shutting down Open Navigator API")
|
| 1278 |
+
|
| 1279 |
+
|
| 1280 |
+
if __name__ == "__main__":
    import uvicorn

    # uvicorn refuses to start multiple workers from an app *object*: it
    # needs an import string so each worker process can import the app
    # itself. Hand over the object only in the single-worker case.
    # NOTE(review): assumes this module is importable as "api.main" - confirm.
    worker_count = settings.api_workers
    multi_worker = isinstance(worker_count, int) and worker_count > 1

    uvicorn.run(
        "api.main:app" if multi_worker else app,
        host=settings.api_host,
        port=settings.api_port,
        workers=worker_count
    )
|
api/models.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Database models for authentication, user management, and social features
|
| 3 |
+
"""
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
from sqlalchemy import Column, Integer, String, DateTime, Boolean, Text, ForeignKey, UniqueConstraint, Float
|
| 6 |
+
from sqlalchemy.ext.declarative import declarative_base
|
| 7 |
+
from sqlalchemy.orm import relationship
|
| 8 |
+
|
| 9 |
+
# Shared declarative base: every model class in this module inherits from it.
Base = declarative_base()
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class User(Base):
    """
    User account model (table ``users``).

    An account is identified by a unique e-mail address. It may be linked to
    an OAuth provider and may optionally carry a password hash.
    """
    __tablename__ = "users"

    # --- Identity -----------------------------------------------------------
    id = Column(Integer, primary_key=True, index=True)
    email = Column(String(255), nullable=False, unique=True, index=True)
    username = Column(String(100), nullable=True, unique=True, index=True)
    full_name = Column(String(255), nullable=True)
    avatar_url = Column(String(500), nullable=True)

    # --- OAuth provider info ------------------------------------------------
    # Provider name: 'huggingface', 'google', 'facebook', 'github'
    oauth_provider = Column(String(50), nullable=True)
    # User ID as issued by that provider
    oauth_id = Column(String(255), nullable=True)

    # --- Authentication -----------------------------------------------------
    # Only set for email/password accounts (optional)
    hashed_password = Column(String(255), nullable=True)
    is_active = Column(Boolean, default=True)
    is_verified = Column(Boolean, default=False)

    # --- Location preferences -----------------------------------------------
    state = Column(String(100), nullable=True)         # US state
    county = Column(String(100), nullable=True)        # County
    city = Column(String(100), nullable=True)          # City
    school_board = Column(String(255), nullable=True)  # School board/district

    # --- Profile completion -------------------------------------------------
    profile_completed = Column(Boolean, default=False)

    # --- Timestamps (naive UTC from datetime.utcnow) ------------------------
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
    last_login = Column(DateTime, nullable=True)

    # --- User preferences: JSON serialized into a text column ---------------
    preferences = Column(Text, nullable=True)

    def __repr__(self):
        """Return a short debug label containing the e-mail address."""
        address = self.email
        return f"<User {address}>"
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class OAuthState(Base):
    """
    Temporary storage for OAuth state tokens (CSRF protection).

    Each row (table ``oauth_states``) holds one state token together with
    the provider it was issued for, an optional redirect URI and an
    expiry time.
    """
    __tablename__ = "oauth_states"

    id = Column(Integer, primary_key=True, index=True)
    state_token = Column(String(255), unique=True, index=True, nullable=False)
    provider = Column(String(50), nullable=False)
    redirect_uri = Column(String(500), nullable=True)
    created_at = Column(DateTime, default=datetime.utcnow)
    expires_at = Column(DateTime, nullable=False)

    def __repr__(self):
        """
        Return a debug label with the provider and a truncated token.

        Only the first 8 characters of the token are shown. An instance whose
        ``state_token`` has not been set yet (``None``) renders with an empty
        prefix instead of raising ``TypeError``, so ``repr()`` is always safe
        to call from logging and error reporting.
        """
        token_preview = (self.state_token or "")[:8]
        return f"<OAuthState {self.provider} - {token_preview}...>"
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# ============================================================================
|
| 68 |
+
# SOCIAL FEATURES MODELS
|
| 69 |
+
# ============================================================================
|
| 70 |
+
|
| 71 |
+
class Organization(Base):
    """
    Organizations (nonprofits, charities, government agencies, advocacy groups).

    Stored in the ``organizations`` table and addressed by a unique,
    URL-friendly ``slug``.
    """
    __tablename__ = "organizations"

    # --- Identity and presentation -------------------------------------------
    id = Column(Integer, primary_key=True, index=True)
    name = Column(String(255), index=True, nullable=False)
    # URL-friendly identifier
    slug = Column(String(255), nullable=False, unique=True, index=True)
    description = Column(Text, nullable=True)
    logo_url = Column(String(500), nullable=True)
    website = Column(String(500), nullable=True)

    # --- Organization type: 'nonprofit', 'government', 'advocacy', 'charity' --
    org_type = Column(String(50), nullable=True)

    # --- Location -------------------------------------------------------------
    state = Column(String(100), nullable=True)
    county = Column(String(100), nullable=True)
    city = Column(String(100), nullable=True)
    address = Column(Text, nullable=True)

    # --- Contact --------------------------------------------------------------
    email = Column(String(255), nullable=True)
    phone = Column(String(50), nullable=True)

    # --- Nonprofit-specific (from IRS/ProPublica) ------------------------------
    # Employer Identification Number
    ein = Column(String(20), index=True, nullable=True)
    # National Taxonomy of Exempt Entities code
    ntee_code = Column(String(10), nullable=True)
    revenue = Column(Float, nullable=True)

    # --- Social stats ---------------------------------------------------------
    follower_count = Column(Integer, default=0)

    # --- Verification ---------------------------------------------------------
    is_verified = Column(Boolean, default=False)
    verified_at = Column(DateTime, nullable=True)

    # --- Timestamps (naive UTC from datetime.utcnow) ---------------------------
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    def __repr__(self):
        """Return a short debug label containing the organization name."""
        label = self.name
        return f"<Organization {label}>"
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
class Cause(Base):
    """
    Causes/Topics/Issues (oral health, housing, education, climate, etc.).

    Stored in the ``causes`` table and addressed by a unique ``slug``.
    """
    __tablename__ = "causes"

    # --- Identity and presentation -------------------------------------------
    id = Column(Integer, primary_key=True, index=True)
    name = Column(String(255), index=True, nullable=False)
    slug = Column(String(255), nullable=False, unique=True, index=True)
    description = Column(Text, nullable=True)
    icon_url = Column(String(500), nullable=True)
    # Hex color code (7 characters)
    color = Column(String(7), nullable=True)

    # --- Category: 'health', 'education', 'housing', 'environment', etc. ------
    category = Column(String(100), nullable=True)

    # --- Social stats ---------------------------------------------------------
    follower_count = Column(Integer, default=0)

    # --- Timestamps (naive UTC from datetime.utcnow) ---------------------------
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    def __repr__(self):
        """Return a short debug label containing the cause name."""
        label = self.name
        return f"<Cause {label}>"
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
class Official(Base):
    """
    Public officials (elected, appointed) - renamed from Leader to match OpenStates.

    Stored in the ``officials`` table. A row describes the person, their
    current primary role, jurisdiction, contact details and term dates.
    """
    __tablename__ = "officials"

    # --- Identity ---------------------------------------------------------------
    id = Column(Integer, primary_key=True, index=True)
    # OpenCivicData ID
    ocd_person_id = Column(String(255), nullable=True, unique=True, index=True)
    name = Column(String(255), index=True, nullable=False)
    slug = Column(String(255), nullable=False, unique=True, index=True)
    family_name = Column(String(100), nullable=True)
    given_name = Column(String(100), nullable=True)
    sort_name = Column(String(255), nullable=True)

    # --- Bio and presentation ---------------------------------------------------
    # e.g. 'Mayor', 'State Senator', 'City Council Member'
    title = Column(String(255), nullable=True)
    bio = Column(Text, nullable=True)
    photo_url = Column(String(500), nullable=True)
    gender = Column(String(20), nullable=True)
    birth_date = Column(DateTime, nullable=True)

    # --- Current role (primary position) ----------------------------------------
    # 'elected' or 'appointed'
    position_type = Column(String(100), nullable=True)
    # e.g. 'Office of the Mayor', 'State Senate District 12'
    office = Column(String(255), nullable=True)
    # e.g. 'Democratic', 'Republican', 'Independent'
    party = Column(String(100), nullable=True)
    # 'upper', 'lower', 'executive'
    chamber = Column(String(50), nullable=True)
    # District number or name
    district = Column(String(50), nullable=True)

    # --- Location/Jurisdiction --------------------------------------------------
    state = Column(String(100), nullable=True)
    county = Column(String(100), nullable=True)
    city = Column(String(100), nullable=True)
    jurisdiction = Column(String(255), nullable=True)

    # --- Contact ----------------------------------------------------------------
    email = Column(String(255), nullable=True)
    phone = Column(String(50), nullable=True)
    website = Column(String(500), nullable=True)

    # --- Social media -----------------------------------------------------------
    twitter = Column(String(255), nullable=True)
    linkedin = Column(String(255), nullable=True)
    facebook = Column(String(255), nullable=True)

    # --- Social stats -----------------------------------------------------------
    follower_count = Column(Integer, default=0)

    # --- Verification -----------------------------------------------------------
    is_verified = Column(Boolean, default=False)
    verified_at = Column(DateTime, nullable=True)

    # --- Term dates -------------------------------------------------------------
    term_start_date = Column(DateTime, nullable=True)
    term_end_date = Column(DateTime, nullable=True)
    is_current = Column(Boolean, default=True)

    # --- Timestamps (naive UTC from datetime.utcnow) -----------------------------
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    def __repr__(self):
        """Return a short debug label containing the official's name."""
        label = self.name
        return f"<Official {label}>"
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
# ============================================================================
|
| 203 |
+
# FOLLOW RELATIONSHIPS (Many-to-Many)
|
| 204 |
+
# ============================================================================
|
| 205 |
+
|
| 206 |
+
class UserFollow(Base):
    """
    User following another user (table ``user_follows``).

    Both sides reference ``users.id`` and are removed with the user
    (``ON DELETE CASCADE``). A follower/following pair can exist only once.
    """
    __tablename__ = "user_follows"
    __table_args__ = (UniqueConstraint('follower_id', 'following_id', name='unique_user_follow'),)

    id = Column(Integer, primary_key=True, index=True)
    # The user who follows
    follower_id = Column(
        Integer,
        ForeignKey('users.id', ondelete='CASCADE'),
        nullable=False,
        index=True,
    )
    # The user being followed
    following_id = Column(
        Integer,
        ForeignKey('users.id', ondelete='CASCADE'),
        nullable=False,
        index=True,
    )
    created_at = Column(DateTime, default=datetime.utcnow)

    def __repr__(self):
        """Return a debug label of the form ``follower -> following``."""
        source, target = self.follower_id, self.following_id
        return f"<UserFollow {source} -> {target}>"
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
class OfficialFollow(Base):
    """
    User following an official (renamed from LeaderFollow).

    Rows live in ``official_follows``; a user/official pair can exist only
    once, and rows are removed when either side is deleted.
    """
    __tablename__ = "official_follows"
    __table_args__ = (UniqueConstraint('user_id', 'official_id', name='unique_official_follow'),)

    id = Column(Integer, primary_key=True, index=True)
    user_id = Column(
        Integer,
        ForeignKey('users.id', ondelete='CASCADE'),
        nullable=False,
        index=True,
    )
    official_id = Column(
        Integer,
        ForeignKey('officials.id', ondelete='CASCADE'),
        nullable=False,
        index=True,
    )
    created_at = Column(DateTime, default=datetime.utcnow)

    def __repr__(self):
        """Return a debug label naming the user and the followed official."""
        follower, target = self.user_id, self.official_id
        return f"<OfficialFollow user:{follower} -> official:{target}>"
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
class OrganizationFollow(Base):
    """
    User following an organization.

    Rows live in ``organization_follows``; a user/organization pair can exist
    only once, and rows are removed when either side is deleted.
    """
    __tablename__ = "organization_follows"
    __table_args__ = (UniqueConstraint('user_id', 'organization_id', name='unique_org_follow'),)

    id = Column(Integer, primary_key=True, index=True)
    user_id = Column(
        Integer,
        ForeignKey('users.id', ondelete='CASCADE'),
        nullable=False,
        index=True,
    )
    organization_id = Column(
        Integer,
        ForeignKey('organizations.id', ondelete='CASCADE'),
        nullable=False,
        index=True,
    )
    created_at = Column(DateTime, default=datetime.utcnow)

    def __repr__(self):
        """Return a debug label naming the user and the followed organization."""
        follower, target = self.user_id, self.organization_id
        return f"<OrganizationFollow user:{follower} -> org:{target}>"
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
class CauseFollow(Base):
    """
    User following a cause/topic.

    Rows live in ``cause_follows``; a user/cause pair can exist only once,
    and rows are removed when either side is deleted.
    """
    __tablename__ = "cause_follows"
    __table_args__ = (UniqueConstraint('user_id', 'cause_id', name='unique_cause_follow'),)

    id = Column(Integer, primary_key=True, index=True)
    user_id = Column(
        Integer,
        ForeignKey('users.id', ondelete='CASCADE'),
        nullable=False,
        index=True,
    )
    cause_id = Column(
        Integer,
        ForeignKey('causes.id', ondelete='CASCADE'),
        nullable=False,
        index=True,
    )
    created_at = Column(DateTime, default=datetime.utcnow)

    def __repr__(self):
        """Return a debug label naming the user and the followed cause."""
        follower, target = self.user_id, self.cause_id
        return f"<CauseFollow user:{follower} -> cause:{target}>"
|
| 268 |
+
|
api/routes/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
API routes package
|
| 3 |
+
"""
|
api/routes/auth.py
ADDED
|
@@ -0,0 +1,436 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
OAuth authentication routes - HuggingFace, Google, Facebook, GitHub
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
import httpx
|
| 6 |
+
from datetime import datetime, timedelta
|
| 7 |
+
from typing import Optional
|
| 8 |
+
from urllib.parse import urlencode
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
|
| 11 |
+
from fastapi import APIRouter, Depends, HTTPException, Request, status
|
| 12 |
+
from fastapi.responses import RedirectResponse
|
| 13 |
+
from sqlalchemy.orm import Session
|
| 14 |
+
from pydantic import BaseModel
|
| 15 |
+
|
| 16 |
+
from api.database import get_db
|
| 17 |
+
from api.models import User, OAuthState
|
| 18 |
+
from api.auth import create_access_token, generate_state_token
|
| 19 |
+
|
| 20 |
+
# Load environment variables from .env file
|
| 21 |
+
load_dotenv()
|
| 22 |
+
|
| 23 |
+
router = APIRouter(prefix="/auth", tags=["authentication"])
|
| 24 |
+
|
| 25 |
+
# OAuth provider configurations
|
| 26 |
+
# Static per-provider OAuth settings, keyed by the provider name used in the
# /auth/login/{provider} and /auth/callback/{provider} route paths.
#
# Every entry carries the same keys:
#   authorize_url     - where oauth_login redirects the browser
#   token_url         - where oauth_callback exchanges the code for a token
#   userinfo_url      - profile endpoint queried by get_user_info
#   scope             - scope string sent with the authorization request
#   client_id_env     - NAME of the environment variable holding the client id
#   client_secret_env - NAME of the environment variable holding the secret
# Credentials themselves are never stored here; they are read with os.getenv
# at request time.
OAUTH_PROVIDERS = {
    'huggingface': {
        'authorize_url': 'https://huggingface.co/oauth/authorize',
        'token_url': 'https://huggingface.co/oauth/token',
        'userinfo_url': 'https://huggingface.co/api/whoami-v2',
        'scope': 'openid profile email',
        'client_id_env': 'HUGGINGFACE_CLIENT_ID',
        'client_secret_env': 'HUGGINGFACE_CLIENT_SECRET',
    },
    'google': {
        'authorize_url': 'https://accounts.google.com/o/oauth2/v2/auth',
        'token_url': 'https://oauth2.googleapis.com/token',
        'userinfo_url': 'https://www.googleapis.com/oauth2/v2/userinfo',
        'scope': 'openid email profile',
        'client_id_env': 'GOOGLE_CLIENT_ID',
        'client_secret_env': 'GOOGLE_CLIENT_SECRET',
    },
    'facebook': {
        'authorize_url': 'https://www.facebook.com/v18.0/dialog/oauth',
        'token_url': 'https://graph.facebook.com/v18.0/oauth/access_token',
        # The field list is baked into the URL, so get_user_info needs no extra params.
        'userinfo_url': 'https://graph.facebook.com/me?fields=id,name,email,picture',
        'scope': 'email public_profile',
        'client_id_env': 'FACEBOOK_APP_ID',
        'client_secret_env': 'FACEBOOK_APP_SECRET',
    },
    'github': {
        'authorize_url': 'https://github.com/login/oauth/authorize',
        'token_url': 'https://github.com/login/oauth/access_token',
        'userinfo_url': 'https://api.github.com/user',
        # get_user_info falls back to the /user/emails endpoint when the
        # profile carries no email, hence the email scope.
        'scope': 'user:email',
        'client_id_env': 'GITHUB_CLIENT_ID',
        'client_secret_env': 'GITHUB_CLIENT_SECRET',
    },
}
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# Response models
|
| 63 |
+
class TokenResponse(BaseModel):
    """Response shape pairing an access token with the user it was issued for."""

    # Encoded access token string.
    access_token: str
    # Token scheme; defaults to "bearer".
    token_type: str = "bearer"
    # Free-form user payload (no schema is enforced here).
    user: dict
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class UserResponse(BaseModel):
    """Public view of a user, used as the response model of /auth/me and /auth/profile."""

    id: int
    email: str
    # Profile fields populated from the OAuth provider (see get_or_create_user).
    username: Optional[str]
    full_name: Optional[str]
    avatar_url: Optional[str]
    oauth_provider: Optional[str]
    # Location preferences; these are the fields update_user_profile may change.
    state: Optional[str]
    county: Optional[str]
    city: Optional[str]
    school_board: Optional[str]
    profile_completed: Optional[bool]
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
# Helper functions
|
| 84 |
+
def get_or_create_user(
    db: Session,
    email: str,
    provider: str,
    oauth_id: str,
    full_name: Optional[str] = None,
    avatar_url: Optional[str] = None,
    username: Optional[str] = None,
) -> User:
    """Resolve an OAuth identity to a User row, linking or creating as needed.

    Lookup order:
      1. a user already bound to this (provider, oauth_id) pair;
      2. a user with the same email, who is then bound to this identity
         and marked verified;
      3. otherwise a brand-new, active and verified user.

    In every case ``last_login`` is set to the current UTC time and the
    session is committed before returning.
    """

    def _refresh_profile(account: User) -> None:
        # Provider-supplied values win when present; stored values are kept otherwise.
        account.full_name = full_name or account.full_name
        account.avatar_url = avatar_url or account.avatar_url
        account.username = username or account.username
        account.last_login = datetime.utcnow()

    # 1. Exact match on the provider identity.
    known_identity = db.query(User).filter(
        User.oauth_provider == provider,
        User.oauth_id == oauth_id
    ).first()
    if known_identity:
        _refresh_profile(known_identity)
        db.commit()
        return known_identity

    # 2. Same email: attach this OAuth identity to the existing account.
    same_email = db.query(User).filter(User.email == email).first()
    if same_email:
        same_email.oauth_provider = provider
        same_email.oauth_id = oauth_id
        _refresh_profile(same_email)
        same_email.is_verified = True  # OAuth emails are verified
        db.commit()
        return same_email

    # 3. Nobody matches: register a new account.
    newcomer = User(
        email=email,
        username=username,
        full_name=full_name,
        avatar_url=avatar_url,
        oauth_provider=provider,
        oauth_id=oauth_id,
        is_verified=True,
        is_active=True,
        last_login=datetime.utcnow(),
    )
    db.add(newcomer)
    db.commit()
    db.refresh(newcomer)
    return newcomer
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
# OAuth routes
|
| 144 |
+
@router.get("/login/{provider}")
async def oauth_login(
    provider: str,
    request: Request,
    db: Session = Depends(get_db),
    redirect_uri: Optional[str] = None
):
    """
    Start the OAuth authorization-code flow for ``provider``.

    Supported providers: huggingface, google, facebook, github

    A one-time state token (valid for 10 minutes) is persisted together with
    the optional ``redirect_uri`` and the browser is redirected to the
    provider's authorization page.

    Raises:
        HTTPException 400: unknown provider.
        HTTPException 500: the provider's client id is not configured.
    """
    config = OAUTH_PROVIDERS.get(provider)
    if config is None:
        raise HTTPException(status_code=400, detail=f"Unsupported provider: {provider}")

    client_id = os.getenv(config['client_id_env'])
    if not client_id:
        raise HTTPException(
            status_code=500,
            detail=f"OAuth not configured for {provider}. Missing {config['client_id_env']}"
        )

    # CSRF protection: the token is stored now and checked again in the callback.
    state = generate_state_token()
    db.add(
        OAuthState(
            state_token=state,
            provider=provider,
            redirect_uri=redirect_uri,
            expires_at=datetime.utcnow() + timedelta(minutes=10),
        )
    )
    db.commit()

    # The callback URL is derived from API_BASE_URL so the scheme (http vs
    # https) is the configured one rather than whatever the request used.
    api_root = os.getenv('API_BASE_URL', 'http://localhost:8000')
    callback_url = f"{api_root}/auth/callback/{provider}"

    query = urlencode([
        ('client_id', client_id),
        ('redirect_uri', callback_url),
        ('scope', config['scope']),
        ('state', state),
        ('response_type', 'code'),
    ])
    return RedirectResponse(url=config['authorize_url'] + '?' + query)
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
@router.get("/callback/{provider}", name="oauth_callback")
async def oauth_callback(
    provider: str,
    code: Optional[str] = None,
    state: Optional[str] = None,
    error: Optional[str] = None,
    db: Session = Depends(get_db)
):
    """Finish the OAuth flow: verify state, exchange the code, sign the user in.

    Steps: validate the query parameters and the stored state token, exchange
    ``code`` for a provider access token, fetch the provider profile, get or
    create the local user, delete the used state token, then redirect to the
    frontend with a JWT in the ``token`` query parameter.

    Raises:
        HTTPException 400: provider error, missing parameters, unknown
            provider, bad/expired state, failed token exchange, or a profile
            without email or id.
        HTTPException 500: provider credentials are not configured.
        HTTPException 502: the provider could not be reached.
    """
    if error:
        raise HTTPException(status_code=400, detail=f"OAuth error: {error}")

    if not code or not state:
        raise HTTPException(status_code=400, detail="Missing code or state parameter")

    if provider not in OAUTH_PROVIDERS:
        raise HTTPException(status_code=400, detail=f"Unsupported provider: {provider}")

    # Verify state token
    oauth_state = db.query(OAuthState).filter(
        OAuthState.state_token == state,
        OAuthState.provider == provider
    ).first()

    if not oauth_state or oauth_state.expires_at < datetime.utcnow():
        raise HTTPException(status_code=400, detail="Invalid or expired state token")

    # Copy the value now: the row is deleted and committed further down, and
    # reading attributes from a deleted ORM instance afterwards is fragile.
    stored_redirect = oauth_state.redirect_uri

    config = OAUTH_PROVIDERS[provider]
    client_id = os.getenv(config['client_id_env'])
    client_secret = os.getenv(config['client_secret_env'])

    if not client_id or not client_secret:
        # Mirrors the configuration check in oauth_login instead of sending
        # empty credentials to the provider.
        raise HTTPException(
            status_code=500,
            detail=f"OAuth not configured for {provider}"
        )

    # Must be identical to the redirect_uri sent in the authorize request.
    base_url = os.getenv('API_BASE_URL', 'http://localhost:8000')
    callback_url = f"{base_url}/auth/callback/{provider}"

    # Exchange code for access token
    try:
        async with httpx.AsyncClient() as client:
            token_response = await client.post(
                config['token_url'],
                data={
                    'client_id': client_id,
                    'client_secret': client_secret,
                    'code': code,
                    'redirect_uri': callback_url,
                    'grant_type': 'authorization_code',
                },
                headers={'Accept': 'application/json'}
            )
    except httpx.HTTPError as exc:
        raise HTTPException(
            status_code=502,
            detail=f"Could not reach {provider} token endpoint"
        ) from exc

    if token_response.status_code != 200:
        raise HTTPException(
            status_code=400,
            detail=f"Token exchange failed: {token_response.text}"
        )

    try:
        token_data = token_response.json()
    except ValueError:
        token_data = None
    access_token = token_data.get('access_token') if isinstance(token_data, dict) else None

    if not access_token:
        raise HTTPException(status_code=400, detail="No access token received")

    # Get user info from provider
    try:
        user_info = await get_user_info(provider, access_token, config)
    except httpx.HTTPError as exc:
        raise HTTPException(
            status_code=502,
            detail=f"Could not reach {provider} user info endpoint"
        ) from exc

    if not user_info or not user_info.get('email'):
        raise HTTPException(status_code=400, detail="Could not retrieve user email from provider")

    # A missing provider id must never be used as an identity key; 'None' is
    # what str() yields when the provider response carried no id.
    oauth_id = user_info.get('oauth_id')
    if not oauth_id or oauth_id == 'None':
        raise HTTPException(status_code=400, detail="Could not retrieve user id from provider")

    # Get or create user
    user = get_or_create_user(
        db=db,
        email=user_info['email'],
        provider=provider,
        oauth_id=oauth_id,
        full_name=user_info.get('full_name'),
        avatar_url=user_info.get('avatar_url'),
        username=user_info.get('username'),
    )

    # Clean up state token (single use)
    db.delete(oauth_state)
    db.commit()

    # Create JWT token (sub must be string, not int)
    jwt_token = create_access_token(data={"sub": str(user.id)})

    # Redirect to frontend with token.
    # If FRONTEND_URL is localhost or not set, assume a same-domain deployment
    # (HuggingFace) and default to a relative path; otherwise default to the
    # separately hosted frontend.
    # NOTE(review): the stored redirect_uri comes straight from the login
    # request and is not validated, so a crafted login link can send the JWT
    # to an arbitrary site. Restrict it to trusted origins once the set of
    # deployed frontends is known.
    frontend_url = os.getenv('FRONTEND_URL', '')

    if not frontend_url or 'localhost' in frontend_url:
        redirect_url = stored_redirect or '/'
    else:
        redirect_url = stored_redirect or frontend_url

    # Append token as URL parameter, respecting an existing query string
    params = urlencode({'token': jwt_token})
    separator = '&' if '?' in redirect_url else '?'
    full_redirect_url = f"{redirect_url}{separator}{params}"

    return RedirectResponse(url=full_redirect_url)
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
async def get_user_info(provider: str, access_token: str, config: dict) -> dict:
    """Fetch the provider profile and normalise it to a common shape.

    Args:
        provider: one of 'huggingface', 'google', 'facebook', 'github'.
        access_token: provider access token, sent as a Bearer header.
        config: the provider's entry from OAUTH_PROVIDERS ('userinfo_url' is read).

    Returns:
        A dict with keys 'email', 'oauth_id', 'full_name', 'avatar_url' and
        'username'. An empty dict is returned for an unknown provider, a
        non-200 or non-JSON profile response, or a profile without an id, so
        callers that test ``not user_info`` reject the login instead of
        keying an account on a placeholder id.
    """
    if provider not in ('huggingface', 'google', 'facebook', 'github'):
        return {}

    headers = {'Authorization': f'Bearer {access_token}'}

    async with httpx.AsyncClient() as client:
        resp = await client.get(config['userinfo_url'], headers=headers)
        if resp.status_code != 200:
            return {}
        try:
            data = resp.json()
        except ValueError:
            return {}

        # str(None) would give every id-less response the same id 'None'.
        if not isinstance(data, dict) or data.get('id') is None:
            return {}
        oauth_id = str(data['id'])

        if provider == 'huggingface':
            return {
                'email': data.get('email'),
                'oauth_id': oauth_id,
                'full_name': data.get('fullname') or data.get('name'),
                'avatar_url': data.get('avatarUrl'),
                'username': data.get('name'),
            }

        if provider == 'google':
            return {
                'email': data.get('email'),
                'oauth_id': oauth_id,
                'full_name': data.get('name'),
                'avatar_url': data.get('picture'),
                # `or ''` also covers an explicit null email in the payload.
                'username': (data.get('email') or '').split('@')[0],
            }

        if provider == 'facebook':
            picture = data.get('picture')
            picture_data = picture.get('data') if isinstance(picture, dict) else None
            return {
                'email': data.get('email'),
                'oauth_id': oauth_id,
                'full_name': data.get('name'),
                'avatar_url': picture_data.get('url') if isinstance(picture_data, dict) else None,
                'username': (data.get('name') or '').replace(' ', '_').lower(),
            }

        # provider == 'github'
        email = data.get('email')
        if not email:
            # The profile email is empty when the user keeps it private.
            resp_emails = await client.get('https://api.github.com/user/emails', headers=headers)
            try:
                entries = resp_emails.json() if resp_emails.status_code == 200 else []
            except ValueError:
                entries = []
            # Error responses are JSON objects, not lists.
            if not isinstance(entries, list):
                entries = []
            # Only verified addresses are accepted: callers mark the account
            # as verified and link existing accounts by email.
            verified = [
                entry['email'] for entry in entries
                if isinstance(entry, dict) and entry.get('verified') and entry.get('email')
            ]
            primary = [
                entry['email'] for entry in entries
                if isinstance(entry, dict) and entry.get('verified')
                and entry.get('primary') and entry.get('email')
            ]
            email = (primary or verified or [None])[0]

        return {
            'email': email,
            'oauth_id': oauth_id,
            'full_name': data.get('name'),
            'avatar_url': data.get('avatar_url'),
            'username': data.get('login'),
        }
|
| 368 |
+
|
| 369 |
+
|
| 370 |
+
@router.get("/me", response_model=UserResponse)
def get_current_user_info(
    request: Request,
    db: Session = Depends(get_db)
):
    """Return the user identified by the Bearer token of the request.

    Raises:
        HTTPException 401: missing/malformed Authorization header, or a token
            whose payload has no usable numeric ``sub`` claim.
        HTTPException 404: the token is valid but the user no longer exists.
    """
    # Get token from Authorization header
    auth_header = request.headers.get('Authorization')
    if not auth_header or not auth_header.startswith('Bearer '):
        raise HTTPException(status_code=401, detail="Not authenticated")

    token = auth_header.split(' ')[1]
    if not token:
        raise HTTPException(status_code=401, detail="Not authenticated")

    # Decode token and get user
    from api.auth import decode_access_token
    payload = decode_access_token(token)

    try:
        # `sub` is stored as a string; convert back to int for the DB query.
        user_id = int(payload.get('sub'))
    except (AttributeError, TypeError, ValueError):
        # No payload, no `sub`, or a non-numeric `sub`: an authentication
        # failure, not a server error.
        raise HTTPException(status_code=401, detail="Invalid authentication token") from None

    user = db.query(User).filter(User.id == user_id).first()
    if not user:
        raise HTTPException(status_code=404, detail="User not found")

    return user
|
| 394 |
+
|
| 395 |
+
|
| 396 |
+
@router.patch("/profile", response_model=UserResponse)
def update_user_profile(
    profile_data: dict,
    request: Request,
    db: Session = Depends(get_db)
):
    """Update the location preferences of the authenticated user.

    Only 'state', 'county', 'city', 'school_board' and 'profile_completed'
    are applied; any other key in ``profile_data`` is ignored.

    Raises:
        HTTPException 401: missing/malformed Authorization header, or a token
            whose payload has no usable numeric ``sub`` claim.
        HTTPException 404: the token is valid but the user no longer exists.
    """
    # Get token from Authorization header
    auth_header = request.headers.get('Authorization')
    if not auth_header or not auth_header.startswith('Bearer '):
        raise HTTPException(status_code=401, detail="Not authenticated")

    token = auth_header.split(' ')[1]
    if not token:
        raise HTTPException(status_code=401, detail="Not authenticated")

    # Decode token and get user
    from api.auth import decode_access_token
    payload = decode_access_token(token)

    try:
        user_id = int(payload.get('sub'))
    except (AttributeError, TypeError, ValueError):
        # No payload, no `sub`, or a non-numeric `sub`: an authentication
        # failure, not a server error.
        raise HTTPException(status_code=401, detail="Invalid authentication token") from None

    user = db.query(User).filter(User.id == user_id).first()
    if not user:
        raise HTTPException(status_code=404, detail="User not found")

    # Update allowed fields only; identity fields such as email stay untouched.
    allowed_fields = ('state', 'county', 'city', 'school_board', 'profile_completed')
    for field, value in profile_data.items():
        if field in allowed_fields and hasattr(user, field):
            setattr(user, field, value)

    user.updated_at = datetime.utcnow()
    db.commit()
    db.refresh(user)

    return user
|
| 431 |
+
|
| 432 |
+
|
| 433 |
+
@router.post("/logout")
def logout():
    """Acknowledge a logout request.

    Nothing is changed on the server; the returned message asks the client
    to remove its token.
    """
    acknowledgement = {
        "message": "Logged out successfully. Please remove the token from client."
    }
    return acknowledgement
|
api/routes/bills.py
ADDED
|
@@ -0,0 +1,841 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Bills API Routes - Legislative bill data from OpenStates
|
| 3 |
+
"""
|
| 4 |
+
from fastapi import APIRouter, Query, HTTPException
|
| 5 |
+
from fastapi.responses import JSONResponse
|
| 6 |
+
from typing import Optional, List, Dict
|
| 7 |
+
import duckdb
|
| 8 |
+
import pandas as pd
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from loguru import logger
|
| 11 |
+
import re
|
| 12 |
+
import os
|
| 13 |
+
import sys
|
| 14 |
+
import traceback
|
| 15 |
+
|
| 16 |
+
from api.errors import ErrorDetail, parse_error
|
| 17 |
+
from api.routes.search import load_parquet_cached
|
| 18 |
+
|
| 19 |
+
router = APIRouter(prefix="/api/bills", tags=["bills"])
|
| 20 |
+
|
| 21 |
+
GOLD_DIR = Path("data/gold")
|
| 22 |
+
IS_HF_SPACES = os.getenv("HF_SPACES") == "1"
|
| 23 |
+
HF_ORGANIZATION = "CommunityOne"
|
| 24 |
+
|
| 25 |
+
def get_hf_dataset_url(dataset_name: str) -> str:
    """
    Build the HuggingFace download URL for a dataset's parquet file.

    The URL points at the single-shard file
    data/train-00000-of-00001.parquet on the main branch of the dataset
    repository owned by HF_ORGANIZATION.

    Args:
        dataset_name: Dataset name (e.g., 'states-ma-bills-bills')

    Returns:
        Full URL to the parquet file
    """
    url_template = (
        "https://huggingface.co/datasets/{org}/{name}"
        "/resolve/main/data/train-00000-of-00001.parquet"
    )
    return url_template.format(org=HF_ORGANIZATION, name=dataset_name)
|
| 39 |
+
|
| 40 |
+
def get_data_source(file_path: Path, use_remote: bool = False) -> str:
    """Return where to read ``file_path`` from: the local path or a HuggingFace URL.

    The local path is used unless running on HF Spaces or ``use_remote`` is
    set. Remote resolution only understands paths containing a 'states'
    directory; the component after it is the state and the file name
    (without '.parquet', underscores turned into dashes) completes the
    dataset name. Any other path falls back to the local path.
    """
    wants_remote = IS_HF_SPACES or use_remote
    if not wants_remote:
        return str(file_path)

    segments = file_path.parts
    if 'states' not in segments:
        # No known mapping to a dataset name: fall back to local.
        return str(file_path)

    state_code = segments[segments.index('states') + 1].lower()
    dataset_suffix = segments[-1].replace('.parquet', '').replace('_', '-')
    return get_hf_dataset_url(f"states-{state_code}-{dataset_suffix}")
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def classify_bill_type(title: str, classification: list, topic: Optional[str] = None) -> str:
|
| 60 |
+
"""
|
| 61 |
+
Classify bill based on topic-specific categories.
|
| 62 |
+
|
| 63 |
+
Different topics use different classification schemes:
|
| 64 |
+
- Fluoridation: mandate, removal, funding, study
|
| 65 |
+
- Dental/Oral Health: coverage_expansion, screening, provider_access, funding
|
| 66 |
+
- Medicaid: expansion, coverage, reimbursement, eligibility
|
| 67 |
+
- Health (general): protection, restriction, funding, reform
|
| 68 |
+
- Education: requirement, funding, curriculum, reform
|
| 69 |
+
- Default: support, oppose, regulate, other
|
| 70 |
+
"""
|
| 71 |
+
title_lower = title.lower()
|
| 72 |
+
topic_lower = topic.lower() if topic else ""
|
| 73 |
+
|
| 74 |
+
# EXCEPTION: Fluoride varnish/dental coverage bills (not water fluoridation)
|
| 75 |
+
# Check this BEFORE water fluoridation classification
|
| 76 |
+
if any(word in title_lower for word in ['varnish', 'sealant', 'dental', 'medicaid', 'medical assistance']) and 'fluoride' in title_lower:
|
| 77 |
+
if any(word in title_lower for word in ['coverage', 'expand', 'expansion', 'benefit']):
|
| 78 |
+
return 'coverage_expansion'
|
| 79 |
+
elif any(word in title_lower for word in ['screening', 'examination', 'check']):
|
| 80 |
+
return 'screening'
|
| 81 |
+
# If it mentions dental/varnish but unclear type, it's dental "other" not fluoridation
|
| 82 |
+
return 'other'
|
| 83 |
+
|
| 84 |
+
# Fluoridation-specific classifications (WATER fluoridation only)
|
| 85 |
+
if 'fluoride' in topic_lower or 'fluoride' in title_lower:
|
| 86 |
+
# FIRST: Check for REMOVAL/BAN/PROHIBITION (negative sentiment)
|
| 87 |
+
# CRITICAL: Must check these BEFORE "mandate"/"require" to avoid misclassification
|
| 88 |
+
# e.g., "prohibit fluoride" should be "removal", not "mandate"
|
| 89 |
+
if any(word in title_lower for word in [
|
| 90 |
+
'prohibit', 'prohibition', 'prohibited', 'prohibiting',
|
| 91 |
+
'ban', 'banning', 'banned',
|
| 92 |
+
'discontinue', 'discontinuation',
|
| 93 |
+
'cease', 'ceasing',
|
| 94 |
+
'eliminate', 'elimination',
|
| 95 |
+
'removal', 'remove', 'removing',
|
| 96 |
+
'prevent', 'preventing',
|
| 97 |
+
'repeal', 'repealing', 'repealed',
|
| 98 |
+
'optional', 'opt-out', 'opt out',
|
| 99 |
+
'fluoride-free', 'fluoride free'
|
| 100 |
+
]):
|
| 101 |
+
# But check if it's "prohibit removal" (double negative = pro-fluoride)
|
| 102 |
+
if any(phrase in title_lower for phrase in ['prohibit removal', 'prevent removal', 'ban removal']):
|
| 103 |
+
return 'mandate' # Prohibiting removal = mandate to keep
|
| 104 |
+
return 'removal'
|
| 105 |
+
|
| 106 |
+
# SECOND: Check for notification/monitoring (before "require" check)
|
| 107 |
+
# Bills like "notification required" are about monitoring, not mandating fluoridation
|
| 108 |
+
elif any(phrase in title_lower for phrase in [
|
| 109 |
+
'notification', 'notify', 'notifying',
|
| 110 |
+
'report to', 'reporting', 'report when',
|
| 111 |
+
'monitor', 'monitoring'
|
| 112 |
+
]):
|
| 113 |
+
return 'study'
|
| 114 |
+
|
| 115 |
+
# THIRD: Check for MANDATE/REQUIRE (positive sentiment)
|
| 116 |
+
# Be specific - just "require" alone isn't enough, need context
|
| 117 |
+
elif any(phrase in title_lower for phrase in [
|
| 118 |
+
'mandate', 'mandating', 'shall fluoridate', 'shall add fluoride',
|
| 119 |
+
'must fluoridate', 'must add fluoride',
|
| 120 |
+
'require fluoridation', 'require water system to fluoridate',
|
| 121 |
+
'require addition of fluoride'
|
| 122 |
+
]):
|
| 123 |
+
return 'mandate'
|
| 124 |
+
|
| 125 |
+
# FOURTH: Check for funding
|
| 126 |
+
elif any(word in title_lower for word in ['fund', 'funding', 'appropriation', 'grant', 'reimburse', 'subsidy']):
|
| 127 |
+
return 'funding'
|
| 128 |
+
elif any(word in title_lower for word in ['study', 'research', 'analysis', 'assess', 'evaluate']):
|
| 129 |
+
return 'study'
|
| 130 |
+
else:
|
| 131 |
+
return 'other'
|
| 132 |
+
|
| 133 |
+
# Dental/Oral Health-specific classifications
|
| 134 |
+
elif 'dental' in topic_lower or 'oral health' in topic_lower or 'dental' in title_lower:
|
| 135 |
+
if any(word in title_lower for word in ['expand', 'increase coverage', 'extend coverage', 'add coverage']):
|
| 136 |
+
return 'coverage_expansion'
|
| 137 |
+
elif any(word in title_lower for word in ['screen', 'examination', 'checkup', 'assessment']):
|
| 138 |
+
return 'screening'
|
| 139 |
+
elif any(word in title_lower for word in ['provider', 'dentist', 'hygienist', 'workforce', 'professional']):
|
| 140 |
+
return 'provider_access'
|
| 141 |
+
elif any(word in title_lower for word in ['fund', 'appropriation', 'grant', 'budget', 'reimburse']):
|
| 142 |
+
return 'funding'
|
| 143 |
+
else:
|
| 144 |
+
return 'other'
|
| 145 |
+
|
| 146 |
+
# Medicaid-specific classifications
|
| 147 |
+
elif 'medicaid' in topic_lower or 'medicaid' in title_lower:
|
| 148 |
+
if any(word in title_lower for word in ['expand', 'expansion', 'extend', 'broaden']):
|
| 149 |
+
return 'expansion'
|
| 150 |
+
elif any(word in title_lower for word in ['coverage', 'benefit', 'service']):
|
| 151 |
+
return 'coverage'
|
| 152 |
+
elif any(word in title_lower for word in ['reimburse', 'payment', 'rate', 'compensation']):
|
| 153 |
+
return 'reimbursement'
|
| 154 |
+
elif any(word in title_lower for word in ['eligib', 'qualify', 'enroll']):
|
| 155 |
+
return 'eligibility'
|
| 156 |
+
else:
|
| 157 |
+
return 'other'
|
| 158 |
+
|
| 159 |
+
# Education-specific classifications
|
| 160 |
+
elif 'education' in topic_lower or 'school' in topic_lower:
|
| 161 |
+
if any(word in title_lower for word in ['require', 'mandate', 'shall provide', 'must offer']):
|
| 162 |
+
return 'requirement'
|
| 163 |
+
elif any(word in title_lower for word in ['fund', 'appropriation', 'grant', 'budget']):
|
| 164 |
+
return 'funding'
|
| 165 |
+
elif any(word in title_lower for word in ['curriculum', 'course', 'instruction', 'program']):
|
| 166 |
+
return 'curriculum'
|
| 167 |
+
elif any(word in title_lower for word in ['reform', 'restructure', 'modernize', 'improve']):
|
| 168 |
+
return 'reform'
|
| 169 |
+
else:
|
| 170 |
+
return 'other'
|
| 171 |
+
|
| 172 |
+
# General health classifications
|
| 173 |
+
elif 'health' in topic_lower or 'health' in title_lower:
|
| 174 |
+
if any(word in title_lower for word in ['protect', 'preserve', 'safeguard', 'ensure', 'guarantee', 'expand', 'increase', 'enhance', 'support']):
|
| 175 |
+
return 'protection'
|
| 176 |
+
elif any(word in title_lower for word in ['restrict', 'limit', 'regulate', 'control', 'impose', 'prohibit', 'ban']):
|
| 177 |
+
return 'restriction'
|
| 178 |
+
elif any(word in title_lower for word in ['fund', 'appropriation', 'grant', 'budget']):
|
| 179 |
+
return 'funding'
|
| 180 |
+
elif any(word in title_lower for word in ['reform', 'restructure', 'modernize', 'improve']):
|
| 181 |
+
return 'reform'
|
| 182 |
+
else:
|
| 183 |
+
return 'other'
|
| 184 |
+
|
| 185 |
+
# Default general classifications
|
| 186 |
+
else:
|
| 187 |
+
if any(word in title_lower for word in ['support', 'promote', 'encourage', 'expand', 'increase', 'enhance', 'fund']):
|
| 188 |
+
return 'support'
|
| 189 |
+
elif any(word in title_lower for word in ['oppose', 'prohibit', 'ban', 'restrict', 'limit', 'prevent']):
|
| 190 |
+
return 'oppose'
|
| 191 |
+
elif any(word in title_lower for word in ['regulate', 'oversee', 'control', 'require', 'mandate']):
|
| 192 |
+
return 'regulate'
|
| 193 |
+
else:
|
| 194 |
+
return 'other'
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def get_legend_for_topic(topic: Optional[str]) -> dict:
    """
    Return the legend labels (category key -> display label) for a topic.

    The topic is matched case-insensitively by substring. Rules are tried in
    order and the first match wins; a topic matching no rule (including
    ``None`` or an empty string) gets the generic legend.
    """
    needle = topic.lower() if topic else ""

    # Ordered rules: earlier entries take precedence over later ones
    # (e.g. "dental health" resolves to the dental legend, not the health one).
    legend_rules = (
        (
            ('fluoride',),
            {
                "mandate": "Mandate Fluoridation",
                "removal": "Remove Fluoridation",
                "funding": "Funding/Grants",
                "study": "Study/Research",
                "other": "Other",
            },
        ),
        (
            ('dental', 'oral health'),
            {
                "coverage_expansion": "Coverage Expansion",
                "screening": "Screening Programs",
                "provider_access": "Provider Access",
                "funding": "Funding/Grants",
                "other": "Other",
            },
        ),
        (
            ('medicaid',),
            {
                "expansion": "Program Expansion",
                "coverage": "Coverage/Benefits",
                "reimbursement": "Reimbursement",
                "eligibility": "Eligibility",
                "other": "Other",
            },
        ),
        (
            ('education', 'school'),
            {
                "requirement": "Requirements",
                "funding": "Funding",
                "curriculum": "Curriculum",
                "reform": "Reform",
                "other": "Other",
            },
        ),
        (
            ('health',),
            {
                "protection": "Protection/Expansion",
                "restriction": "Restriction",
                "funding": "Funding",
                "reform": "Reform",
                "other": "Other",
            },
        ),
    )

    for keywords, labels in legend_rules:
        if any(keyword in needle for keyword in keywords):
            return labels

    # Generic legend for every other topic
    return {
        "support": "Support/Promote",
        "oppose": "Oppose/Restrict",
        "regulate": "Regulate",
        "other": "Other",
    }
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def determine_bill_status(latest_action: str, latest_date: str) -> str:
    """
    Determine if bill was enacted, failed, or is pending.

    Args:
        latest_action: Description of the bill's most recent action. May be
            empty or None, in which case the bill is treated as pending.
        latest_date: Date of the latest action. Not used by the current
            heuristic; kept so existing callers keep working.

    Returns:
        'enacted', 'failed', or 'pending' (the default when no keyword matches).
    """
    # Local import: the module's top-level import block is not known to provide ``re``.
    import re

    if not latest_action:
        return 'pending'

    action_lower = latest_action.lower()

    def mentions(phrases) -> bool:
        # Match whole words/phrases only. Plain substring tests misfire on
        # common action text: "assigned to committee" contains "signed" and
        # "disapproved" contains "approved".
        return any(
            re.search(rf"\b{re.escape(phrase)}\b", action_lower)
            for phrase in phrases
        )

    # Enacted/Passed (checked first, as in the original ordering)
    if mentions(('signed', 'enacted', 'approved', 'passed both', 'became law')):
        return 'enacted'

    # Failed
    if mentions(('failed', 'defeated', 'rejected', 'died', 'withdrawn', 'vetoed')):
        return 'failed'

    # Pending (default)
    return 'pending'
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
@router.get("")
async def search_bills(
    q: Optional[str] = Query(None, description="Search query for bill title or number"),
    state: Optional[str] = Query("AL", description="State code (e.g., AL, GA, MA)"),
    session: Optional[str] = Query(None, description="Legislative session (e.g., 2024rs)"),
    limit: int = Query(20, ge=1, le=100, description="Maximum results"),
    offset: int = Query(0, ge=0, description="Number of results to skip")
):
    """
    Search legislative bills from OpenStates data.

    **Examples:**
    - `/api/bills?state=AL&q=dental` - Search Alabama bills for "dental"
    - `/api/bills?state=AL&session=2024rs` - Get all 2024 regular session bills
    - `/api/bills?state=AL&limit=50` - Browse recent Alabama bills
    """
    # Returns a dict with "total", "bills", "pagination" and "filters" on
    # success, or a 500 JSONResponse carrying the structured parse_error()
    # payload when the query fails (e.g. no data for the requested state).
    try:
        # Resolve the per-state bills table (local file or remote HuggingFace URL)
        bills_file = GOLD_DIR / "states" / state / "bills_bills.parquet"
        data_source = get_data_source(bills_file, use_remote=IS_HF_SPACES)

        # Only fixed SQL fragments are interpolated into the WHERE text;
        # every user-supplied value is bound as a query parameter.
        where_clauses = []
        params = []

        if q:
            where_clauses.append("(LOWER(title) LIKE LOWER(?) OR LOWER(bill_number) LIKE LOWER(?))")
            pattern = f'%{q}%'
            params.extend([pattern, pattern])

        if session:
            where_clauses.append("session = ?")
            params.append(session)

        where_clause = " AND ".join(where_clauses) if where_clauses else "1=1"

        count_sql = f"""
            SELECT COUNT(*) as total
            FROM read_parquet(?)
            WHERE {where_clause}
        """

        sql = f"""
            SELECT
                bill_id,
                bill_number,
                title,
                classification,
                session,
                session_name,
                first_action_date,
                latest_action_date,
                latest_action_description,
                jurisdiction_name
            FROM read_parquet(?)
            WHERE {where_clause}
            ORDER BY latest_action_date DESC NULLS LAST, bill_number DESC
            LIMIT ? OFFSET ?
        """

        conn = duckdb.connect()
        try:
            # Total match count (ignores pagination) followed by the requested page
            total = conn.execute(count_sql, [data_source] + params).fetchone()[0]
            rows = conn.execute(sql, [data_source] + params + [limit, offset]).fetchall()
        finally:
            # Release the connection even when a query raises; previously a
            # failing query skipped conn.close() and leaked the connection.
            conn.close()

        bills = [
            {
                "bill_id": row[0],
                "bill_number": row[1],
                "title": row[2],
                "classification": row[3],
                "session": row[4],
                "session_name": row[5],
                "first_action_date": row[6],
                "latest_action_date": row[7],
                "latest_action": row[8],
                "jurisdiction": row[9]
            }
            for row in rows
        ]

        return {
            "total": total,
            "bills": bills,
            "pagination": {
                "limit": limit,
                "offset": offset,
                "has_more": offset + len(bills) < total
            },
            "filters": {
                "state": state,
                "query": q,
                "session": session
            }
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Bills search error for state={state}: {e}")

        # Parse error into structured response
        error_detail = parse_error(e, context={
            "state": state,
            "data_type": "bills",
            "query": q,
            "session": session
        })

        # Return structured error response
        return JSONResponse(
            status_code=500,
            content=error_detail.model_dump()
        )
|
| 395 |
+
|
| 396 |
+
|
| 397 |
+
@router.get("/sessions")
async def get_sessions(
    state: str = Query("AL", description="State code")
):
    """Get available legislative sessions for a state."""
    # Returns {"state", "sessions", "total_sessions"}; raises a 404
    # HTTPException when a local deployment has no bills file for the state,
    # and returns a structured 500 JSONResponse for any other failure.
    try:
        bills_file = GOLD_DIR / "states" / state / "bills_bills.parquet"

        # The on-disk check only makes sense for local deployments. On HF
        # Spaces the data is read remotely, so requiring a local file would
        # turn every request into a 404.
        if not IS_HF_SPACES and not bills_file.exists():
            raise HTTPException(
                status_code=404,
                detail=f"No bills data found for state: {state}"
            )

        # Same source resolution as the other bills endpoints.
        # NOTE(review): assumes get_data_source returns the local path when
        # not running on HF Spaces - confirm against its definition.
        data_source = get_data_source(bills_file, use_remote=IS_HF_SPACES)

        sql = """
            SELECT DISTINCT
                session,
                session_name,
                MIN(first_action_date) as start_date,
                MAX(latest_action_date) as end_date,
                COUNT(*) as bill_count
            FROM read_parquet(?)
            GROUP BY session, session_name
            ORDER BY session DESC
        """

        conn = duckdb.connect()
        try:
            rows = conn.execute(sql, [data_source]).fetchall()
        finally:
            # Always release the connection, including when the query fails
            conn.close()

        sessions = [
            {
                "session": row[0],
                "session_name": row[1],
                "start_date": row[2],
                "end_date": row[3],
                "bill_count": row[4]
            }
            for row in rows
        ]

        return {
            "state": state,
            "sessions": sessions,
            "total_sessions": len(sessions)
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Sessions query error for state={state}: {e}")

        # Parse error into structured response
        error_detail = parse_error(e, context={
            "state": state,
            "data_type": "sessions"
        })

        return JSONResponse(
            status_code=500,
            content=error_detail.model_dump()
        )
|
| 460 |
+
|
| 461 |
+
|
| 462 |
+
def _extract_sample_bills(bills_data) -> list:
    """Best-effort conversion of a stored ``sample_bills`` cell into Python objects."""
    # Local import, as in the original code path that parsed JSON strings
    import json

    if isinstance(bills_data, str):
        # Cell stored as a JSON string
        try:
            return json.loads(bills_data)
        except ValueError:
            # json.JSONDecodeError is a ValueError; malformed JSON -> no samples
            return []

    if hasattr(bills_data, '__iter__'):
        # Pandas stores list columns as numpy arrays; convert to plain dicts
        try:
            return [dict(bill) for bill in bills_data if bill]
        except Exception:
            # Deliberately best-effort: an unexpected element shape must not
            # fail the whole map response. Unlike a bare ``except:`` this does
            # not swallow KeyboardInterrupt/SystemExit/task cancellation.
            return []

    # Missing / scalar value (e.g. None or NaN)
    return []


@router.get("/map")
async def get_bill_map_data(
    topic: Optional[str] = Query(None, description="Topic to filter (e.g., dental, health, education)"),
    session: Optional[str] = Query(None, description="Legislative session")
):
    """
    Get aggregated bill data for choropleth map visualization.

    Uses pre-computed national aggregates for instant loading.
    Returns counts of bills by type and status for each state.

    **Examples:**
    - `/api/bills/map?topic=fluorid` - Map fluoridation legislation
    - `/api/bills/map?topic=dental` - Map dental legislation
    """
    # NOTE: on the pre-aggregated path `session` is only echoed back in the
    # response; it is applied as a filter only by the on-demand fallback.
    try:
        # Use pre-aggregated national dataset
        agg_file = GOLD_DIR / "national" / "bills_map_aggregates.parquet"

        # Fallback to on-demand aggregation if pre-computed file doesn't exist
        if not agg_file.exists():
            logger.warning("Pre-aggregated bill data not found, using on-demand aggregation (slower)")
            return await get_bill_map_data_on_demand(topic, session)

        # Load from cached aggregates (fast!)
        df = load_parquet_cached(str(agg_file))

        # Filter by topic (exact match against the lower-cased topic column)
        if topic:
            df = df[df['topic'] == topic.lower()]

        # Column layout is the same for every row, so compute it once
        # (exclude type_status_counts which is already a dict)
        type_cols = [c for c in df.columns if c.startswith('type_') and c != 'type_status_counts']
        status_cols = [c for c in df.columns if c.startswith('status_')]
        has_sample_bills = 'sample_bills' in df.columns
        has_last_updated = 'last_updated' in df.columns

        # Convert to state_data dict
        state_data = {}

        for _, row in df.iterrows():
            state_code = row['state']

            # Reconstruct nested dicts; NaN counts are reported as 0
            type_counts = {c.replace('type_', ''): int(row[c]) if not pd.isna(row[c]) else 0 for c in type_cols}
            status_counts = {c.replace('status_', ''): int(row[c]) if not pd.isna(row[c]) else 0 for c in status_cols}

            sample_bills = _extract_sample_bills(row['sample_bills']) if has_sample_bills else []

            state_data[state_code] = {
                "state": state_code,
                "total_bills": int(row['total_bills']),
                "type_counts": type_counts,
                "status_counts": status_counts,
                "primary_type": row['primary_type'],
                "primary_status": row['primary_status'],
                "map_category": row['map_category'],
                "sample_bills": sample_bills,
                "last_updated": str(row['last_updated']) if has_last_updated else ''
            }

        return {
            "topic": topic,
            "session": session,
            "states": state_data,
            "total_states": len(state_data),
            "legend": {
                "types": get_legend_for_topic(topic),
                "statuses": {
                    "enacted": "Enacted",
                    "failed": "Failed",
                    "pending": "Pending"
                }
            },
            "cached": True
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Map data error: {e}")

        error_detail = parse_error(e, context={
            "data_type": "bill map",
            "topic": topic,
            "session": session
        })

        return JSONResponse(
            status_code=500,
            content=error_detail.model_dump()
        )
|
| 568 |
+
|
| 569 |
+
|
| 570 |
+
async def get_bill_map_data_on_demand(
    topic: Optional[str] = None,
    session: Optional[str] = None
):
    """
    LEGACY: On-demand aggregation (slow - loads 50 state files).
    Only used as fallback if pre-aggregated data doesn't exist.

    Args:
        topic: Optional substring matched case-insensitively against bill titles.
        session: Optional legislative session identifier (exact match).

    Returns:
        Dict with per-state counts under "states" plus the legend, or a 500
        JSONResponse with a structured error if aggregation fails as a whole.
        States whose data cannot be read are skipped, not reported as errors.
    """
    try:
        # List of all US state codes to check
        ALL_STATES = [
            "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
            "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
            "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
            "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
            "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"
        ]

        # In local environment, check available directories
        # In HF Spaces, try all states (will skip missing datasets)
        states_to_check = ALL_STATES
        if not IS_HF_SPACES:
            states_dir = GOLD_DIR / "states"
            if states_dir.exists():
                states_to_check = [d.name for d in states_dir.iterdir() if d.is_dir()]

        # The filter and legend depend only on the request, not on the state,
        # so build them once instead of once per state.
        where_clauses = ["1=1"]
        filter_params = []

        if topic:
            where_clauses.append("LOWER(title) LIKE LOWER(?)")
            filter_params.append(f'%{topic}%')

        if session:
            where_clauses.append("session = ?")
            filter_params.append(session)

        where_clause = " AND ".join(where_clauses)

        sql = f"""
            SELECT
                title,
                classification,
                latest_action_description
            FROM read_parquet(?)
            WHERE {where_clause}
        """

        # Topic-aware categories used to seed every state's type_counts
        legend_categories = get_legend_for_topic(topic)

        state_data = {}

        # Iterate through states
        for state_code in states_to_check:
            try:
                bills_file = GOLD_DIR / "states" / state_code / "bills_bills.parquet"

                # Get data source (local or remote HuggingFace URL)
                data_source = get_data_source(bills_file, use_remote=IS_HF_SPACES)

                conn = duckdb.connect()
                try:
                    rows = conn.execute(sql, [data_source] + filter_params).fetchall()
                finally:
                    # Close even when the read fails; missing datasets are the
                    # expected failure here and previously leaked one
                    # connection per skipped state.
                    conn.close()

                if not rows:
                    continue

                # Initialize type_counts with all possible categories for this topic
                type_counts = {cat: 0 for cat in legend_categories}
                status_counts = {'enacted': 0, 'failed': 0, 'pending': 0}
                type_status_counts = {}

                for row in rows:
                    title = row[0]
                    classification = row[1] if row[1] else []
                    latest_action = row[2] if row[2] else ''

                    bill_type = classify_bill_type(title, classification, topic)
                    bill_status = determine_bill_status(latest_action, '')

                    # Ensure bill_type exists in type_counts (fallback to 'other')
                    if bill_type not in type_counts:
                        bill_type = 'other'

                    type_counts[bill_type] += 1
                    status_counts[bill_status] += 1

                    # Track type+status combinations
                    key = f"{bill_type}_{bill_status}"
                    type_status_counts[key] = type_status_counts.get(key, 0) + 1

                # Determine primary legislation type and status for map visualization
                primary_type = max(type_counts, key=type_counts.get)
                primary_status = max(status_counts, key=status_counts.get)

                state_data[state_code] = {
                    "state": state_code,
                    "total_bills": len(rows),
                    "type_counts": type_counts,
                    "status_counts": status_counts,
                    "type_status_counts": type_status_counts,
                    "primary_type": primary_type,
                    "primary_status": primary_status,
                    # For map visualization
                    "map_category": f"{primary_type}_{primary_status}" if type_counts[primary_type] > 0 else "none"
                }

            except Exception as e:
                # Skip states with missing or inaccessible data
                logger.debug(f"Skipping state {state_code}: {str(e)}")
                continue

        return {
            "topic": topic,
            "session": session,
            "states": state_data,
            "total_states": len(state_data),
            "legend": {
                "types": get_legend_for_topic(topic),
                "statuses": {
                    "enacted": "Enacted",
                    "failed": "Failed",
                    "pending": "Pending"
                }
            }
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Map data error: {e}")

        # Parse error into structured response
        error_detail = parse_error(e, context={
            "data_type": "bill map",
            "topic": topic,
            "session": session
        })

        return JSONResponse(
            status_code=500,
            content=error_detail.model_dump()
        )
|
| 717 |
+
|
| 718 |
+
|
| 719 |
+
@router.get("/{bill_id}")
async def get_bill_details(bill_id: str):
    """
    Get detailed information about a specific bill from gold parquet files.

    Args:
        bill_id: Bill identifier in format {state}-{bill_number} (e.g., "LA-SB 4")

    Returns:
        Detailed bill information including actions, sponsors, sources

    Raises:
        HTTPException: 400 for a malformed bill ID, 404 when the bill is not
            found, 500 for any other failure.
    """
    try:
        # Parse bill_id to extract state and bill number; split on the first
        # dash only, so the bill number itself may contain dashes.
        state, dash, bill_number = bill_id.partition('-')
        if not dash or not state or not bill_number:
            # An empty state or bill number cannot identify a bill; reject it
            # up front instead of querying with a malformed path or value.
            raise HTTPException(status_code=400, detail="Invalid bill ID format. Expected: STATE-BILLNUMBER")

        state = state.upper()

        # Build file paths for bills data from gold layer
        state_dir = GOLD_DIR / "states" / state
        bills_file = state_dir / "bills_bills.parquet"
        actions_file = state_dir / "bills_bill_actions.parquet"
        sponsors_file = state_dir / "bills_bill_sponsorships.parquet"

        # Get data sources (local or remote HuggingFace URL)
        bills_source = get_data_source(bills_file, use_remote=IS_HF_SPACES)
        actions_source = get_data_source(actions_file, use_remote=IS_HF_SPACES)
        sponsors_source = get_data_source(sponsors_file, use_remote=IS_HF_SPACES)

        # Connect to DuckDB for querying parquet files
        conn = duckdb.connect()

        try:
            # Query for the specific bill
            bill_query = """
                SELECT
                    bill_id,
                    bill_number,
                    title,
                    classification,
                    latest_action_description,
                    latest_action_date,
                    first_action_date,
                    session,
                    session_name,
                    jurisdiction_name
                FROM read_parquet(?)
                WHERE bill_number = ?
                LIMIT 1
            """

            result = conn.execute(bill_query, [bills_source, bill_number]).fetchone()

            if not result:
                raise HTTPException(status_code=404, detail=f"Bill {bill_number} not found in {state}")

            # Parse bill data
            bill_data = {
                "bill_id": result[0] if result[0] else bill_id,
                "bill_number": result[1],
                "title": result[2],
                "classification": result[3] if result[3] else [],
                "latest_action": result[4],
                "latest_action_date": result[5],
                "first_action_date": result[6],
                "session": result[7],
                "session_name": result[8],
                "jurisdiction": result[9],
                "state": state,
            }

            # Get sponsors if available (best-effort: the bill is still
            # returned when the sponsorship table cannot be read)
            try:
                sponsor_query = """
                    SELECT name, primary_sponsor, classification
                    FROM read_parquet(?)
                    WHERE bill_id = ?
                    ORDER BY primary_sponsor DESC
                """
                sponsor_rows = conn.execute(sponsor_query, [sponsors_source, bill_data["bill_id"]]).fetchall()

                bill_data["sponsors"] = [
                    {"name": s[0], "primary": bool(s[1]), "classification": s[2]}
                    for s in sponsor_rows
                ]
            except Exception as e:
                logger.warning(f"Could not load sponsors for {bill_id}: {e}")
                bill_data["sponsors"] = []

            # Get actions if available (best-effort, ten most recent)
            try:
                actions_query = """
                    SELECT description, date, classification
                    FROM read_parquet(?)
                    WHERE bill_id = ?
                    ORDER BY date DESC
                    LIMIT 10
                """
                action_rows = conn.execute(actions_query, [actions_source, bill_data["bill_id"]]).fetchall()

                bill_data["actions"] = [
                    {"description": a[0], "date": a[1], "classification": a[2]}
                    for a in action_rows
                ]
            except Exception as e:
                logger.warning(f"Could not load actions for {bill_id}: {e}")
                bill_data["actions"] = []

            return bill_data

        finally:
            # Single close point for the success, 404 and error paths
            # (the 404 path used to close the connection twice).
            conn.close()

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Bill details error: {e}")
        logger.error(traceback.format_exc())
        raise HTTPException(status_code=500, detail=str(e))
|
api/routes/bills_neon.py
ADDED
|
@@ -0,0 +1,481 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Bills API Routes - Hybrid approach using Neon + Parquet
|
| 3 |
+
- Map aggregates: Neon PostgreSQL (fast, lightweight)
|
| 4 |
+
- Detailed bills & sessions: Parquet files (saves Neon space)
|
| 5 |
+
"""
|
| 6 |
+
from fastapi import APIRouter, Query, HTTPException
|
| 7 |
+
from fastapi.responses import JSONResponse
|
| 8 |
+
from typing import Optional, List, Dict, Any
|
| 9 |
+
import asyncpg
|
| 10 |
+
import duckdb
|
| 11 |
+
import pandas as pd
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from loguru import logger
|
| 14 |
+
import os
|
| 15 |
+
from datetime import datetime, timedelta
|
| 16 |
+
|
| 17 |
+
from api.errors import ErrorDetail, parse_error
|
| 18 |
+
|
| 19 |
+
# All routes in this module are mounted under /api/bills
router = APIRouter(prefix="/api/bills", tags=["bills"])

# --- Neon PostgreSQL (map aggregates only) ---------------------------------
# The DEV connection string takes precedence over the production one when set.
NEON_DATABASE_URL_DEV = os.environ.get("NEON_DATABASE_URL_DEV")
NEON_DATABASE_URL = os.environ.get("NEON_DATABASE_URL")
DATABASE_URL = NEON_DATABASE_URL_DEV or NEON_DATABASE_URL

# --- Parquet (detailed bills and sessions) ---------------------------------
GOLD_DIR = Path("data/gold")
IS_HF_SPACES = os.environ.get("HF_SPACES") == "1"
HF_ORGANIZATION = os.environ.get('HF_ORGANIZATION', 'CommunityOne')

# Report the effective configuration once at import time
if not DATABASE_URL:
    logger.warning("⚠️  No database URL configured. Map endpoint will not work.")
else:
    logger.info(f"🗄️  Bills map using: {'DEV' if NEON_DATABASE_URL_DEV else 'PROD'} database")
    logger.info(f"📁 Bills details using: {'HuggingFace' if IS_HF_SPACES else 'local'} parquet")

# Lazily created asyncpg connection pool (see get_pool)
_pool = None

# In-memory cache for map data (TTL: 5 minutes)
_map_cache = {}
_map_cache_time = None
MAP_CACHE_DURATION = timedelta(minutes=5)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def get_hf_dataset_url(dataset_name: str) -> str:
    """Build the HuggingFace download URL of a dataset's single parquet shard.

    The dataset is looked up under the organization named by HF_ORGANIZATION.
    """
    repo_id = f"{HF_ORGANIZATION}/{dataset_name}"
    return f"https://huggingface.co/datasets/{repo_id}/resolve/main/data/train-00000-of-00001.parquet"
|
| 49 |
+
|
| 50 |
+
def get_data_source(file_path: Path, use_remote: bool = False) -> str:
    """Resolve a gold-layer parquet path to the location DuckDB should read.

    Returns the local path as a string unless remote access is requested
    (``use_remote``) or the app runs on HF Spaces. In remote mode, paths that
    contain a ``states`` component are mapped to the matching HuggingFace
    dataset URL; any other path is still returned as a local path string.
    """
    # Local mode: nothing to translate
    if not (IS_HF_SPACES or use_remote):
        return str(file_path)

    segments = file_path.parts
    if 'states' not in segments:
        # No remote mapping is defined for non-state files
        return str(file_path)

    # .../states/<STATE>/<table>.parquet -> dataset "states-<state>-<table>"
    # with underscores in the table name turned into dashes
    state_code = segments[segments.index('states') + 1].lower()
    table_slug = segments[-1].replace('.parquet', '').replace('_', '-')
    return get_hf_dataset_url(f"states-{state_code}-{table_slug}")
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
async def get_pool():
    """Return the shared asyncpg pool, creating it on first use.

    Returns None when no pool exists and DATABASE_URL is not configured.
    """
    global _pool

    # Reuse the existing pool; without a database URL there is nothing to create
    if _pool is not None or not DATABASE_URL:
        return _pool

    _pool = await asyncpg.create_pool(
        DATABASE_URL,
        min_size=1,
        max_size=10,
        command_timeout=60,
    )
    return _pool
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
async def fetch_bills_from_parquet(
    state: str,
    q: Optional[str] = None,
    session: Optional[str] = None,
    limit: int = 50,
    offset: int = 0
) -> Dict[str, Any]:
    """Fetch bills from parquet files using DuckDB (detailed drill-down).

    Args:
        state: State code used to locate the state's bills parquet.
        q: Optional text matched case-insensitively against title and bill number.
        session: Optional legislative session identifier (exact match).
        limit: Maximum number of bills to return.
        offset: Number of matching bills to skip.

    Returns:
        Dict with the request echo ("state", "query", "session", "limit",
        "offset"), the page of "bills", the "total" match count and
        "source": "parquet".

    Raises:
        Exception: any error from resolving or querying the data is logged
            and re-raised unchanged.
    """
    try:
        # Resolve the per-state bills table (local file or remote HuggingFace URL)
        bills_file = GOLD_DIR / "states" / state / "bills_bills.parquet"
        data_source = get_data_source(bills_file, use_remote=IS_HF_SPACES)

        # Only fixed SQL fragments are interpolated into the WHERE text;
        # every caller-supplied value is bound as a query parameter.
        where_clauses = []
        params = []

        if q:
            where_clauses.append("(LOWER(title) LIKE LOWER(?) OR LOWER(bill_number) LIKE LOWER(?))")
            pattern = f'%{q}%'
            params.extend([pattern, pattern])

        if session:
            where_clauses.append("session = ?")
            params.append(session)

        where_clause = " AND ".join(where_clauses) if where_clauses else "1=1"

        count_sql = f"""
            SELECT COUNT(*) as total
            FROM read_parquet(?)
            WHERE {where_clause}
        """

        sql = f"""
            SELECT
                bill_id,
                bill_number,
                title,
                classification,
                session,
                session_name,
                first_action_date,
                latest_action_date,
                latest_action_description,
                jurisdiction_name
            FROM read_parquet(?)
            WHERE {where_clause}
            ORDER BY latest_action_date DESC NULLS LAST, bill_number DESC
            LIMIT ? OFFSET ?
        """

        conn = duckdb.connect()
        try:
            # Total match count (ignores pagination) followed by the requested page
            total = conn.execute(count_sql, [data_source] + params).fetchone()[0]
            rows = conn.execute(sql, [data_source] + params + [limit, offset]).fetchall()
        finally:
            # Release the connection even when a query raises; previously a
            # failing query skipped conn.close() and leaked the connection.
            conn.close()

        bills = [
            {
                "bill_id": row[0],
                "bill_number": row[1],
                "title": row[2],
                "classification": list(row[3]) if row[3] else [],
                "session": row[4],
                "session_name": row[5],
                # Dates are stringified; missing dates are reported as None
                "first_action_date": str(row[6]) if row[6] else None,
                "latest_action_date": str(row[7]) if row[7] else None,
                "latest_action_description": row[8],
                "jurisdiction_name": row[9]
            }
            for row in rows
        ]

        return {
            "state": state,
            "query": q,
            "session": session,
            "bills": bills,
            "total": total,
            "limit": limit,
            "offset": offset,
            "source": "parquet"
        }

    except Exception as e:
        logger.error(f"Error fetching bills from parquet: {e}")
        raise
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
async def fetch_sessions_from_parquet(state: str) -> Dict[str, Any]:
    """Fetch a state's legislative sessions from its bills parquet file via DuckDB.

    Args:
        state: State identifier used as the directory name under
            ``GOLD_DIR / "states"``.

    Returns:
        A dict with the requested ``state``, a ``sessions`` list (one entry
        per ``session``/``session_name`` group, ordered by session
        descending), ``total_sessions`` and ``source`` set to ``"parquet"``.

    Raises:
        Exception: Any failure while resolving the data source or running
            the query is logged and re-raised unchanged.
    """
    try:
        # Build file path and resolve it to a local or remote data source
        bills_file = GOLD_DIR / "states" / state / "bills_bills.parquet"
        data_source = get_data_source(bills_file, use_remote=IS_HF_SPACES)

        # Aggregate sessions
        # NOTE(review): grouping by both session and session_name yields one
        # row per distinct pair, so a session with several names appears more
        # than once -- confirm this is intended.
        sql = """
            SELECT
                session,
                MAX(session_name) as session_name,
                MIN(first_action_date) as start_date,
                MAX(latest_action_date) as end_date,
                COUNT(*) as bill_count
            FROM read_parquet(?)
            GROUP BY session, session_name
            ORDER BY session DESC
        """

        conn = duckdb.connect()
        try:
            rows = conn.execute(sql, [data_source]).fetchall()
        finally:
            # Always release the connection; previously it was closed only on
            # the success path and leaked whenever the query raised.
            conn.close()

        sessions = [
            {
                "session": row[0],
                "session_name": row[1],
                "start_date": str(row[2]) if row[2] else None,
                "end_date": str(row[3]) if row[3] else None,
                "bill_count": row[4]
            }
            for row in rows
        ]

        return {
            "state": state,
            "sessions": sessions,
            "total_sessions": len(sessions),
            "source": "parquet"
        }

    except Exception as e:
        logger.error(f"Error fetching sessions from parquet: {e}")
        raise
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
async def fetch_map_data_from_neon(
    topic: Optional[str] = None,
    session: Optional[str] = None
) -> Dict[str, Any]:
    """Fetch map aggregates from Neon PostgreSQL.

    Results are memoised in the module-level ``_map_cache`` keyed by
    ``"<topic>_<session>"``. All entries share one cache window that starts
    at ``_map_cache_time`` and lasts ``MAP_CACHE_DURATION``; when the window
    expires the whole cache is dropped so no entry outlives it.

    Args:
        topic: Topic to look up in ``bills_map_aggregates`` (lower-cased
            before querying). ``None`` selects the ``'all'`` rows.
        session: Echoed back in the response and part of the cache key; it
            is not applied to the SQL query.

    Returns:
        A dict with ``topic``, ``session``, per-state data under ``states``,
        ``total_states``, a ``legend`` and ``source`` set to ``"neon"``. When
        no rows exist for the topic, an empty ``states`` mapping and an
        explanatory ``message`` are returned instead (this is not cached).
    """
    import json  # used only to decode sample_bills delivered as a JSON string

    global _map_cache, _map_cache_time
    cache_key = f"{topic or 'all'}_{session or 'all'}"

    now = datetime.now()
    cache_is_fresh = (
        _map_cache_time is not None
        and (now - _map_cache_time) < MAP_CACHE_DURATION
    )
    if cache_is_fresh:
        if cache_key in _map_cache:
            logger.debug(f"🚀 Map cache hit for {cache_key}")
            return _map_cache[cache_key]
    else:
        # The window has expired (or never started): discard every entry.
        # Previously stale entries for other keys survived and were treated
        # as fresh again as soon as any key refreshed the shared timestamp.
        _map_cache.clear()
        _map_cache_time = None

    # Acquire the pool only on a cache miss so hits need no database access.
    pool = await get_pool()

    async with pool.acquire() as conn:
        # For now, we only support pre-computed topics (no on-the-fly topic
        # filtering yet). Session filtering would require aggregating bills
        # on-the-fly.
        sql = """
            SELECT
                state_code,
                topic,
                total_bills,
                type_bill,
                type_resolution,
                type_concurrent_resolution,
                type_joint_resolution,
                type_constitutional_amendment,
                status_enacted,
                status_failed,
                status_pending,
                primary_type,
                primary_status,
                map_category,
                sample_bills,
                last_updated
            FROM bills_map_aggregates
            WHERE topic = $1
        """

        requested_topic = topic.lower() if topic else 'all'
        rows = await conn.fetch(sql, requested_topic)

        # If topic-specific data not found, return empty (don't fallback)
        if not rows:
            logger.warning(f"📊 No pre-computed data for topic '{requested_topic}'")
            return {
                "topic": requested_topic,
                "session": session,
                "states": {},
                "total_states": 0,
                "message": f"No data available for topic '{requested_topic}'. Try 'all' or pre-compute aggregates for this topic.",
                "source": "neon"
            }

        state_data = {}
        for row in rows:
            state_code = row['state_code']

            # sample_bills may arrive already decoded or as a JSON string
            sample_bills = row['sample_bills'] or []
            if isinstance(sample_bills, str):
                sample_bills = json.loads(sample_bills)

            state_data[state_code] = {
                "state": state_code,
                "total_bills": row['total_bills'],
                "type_counts": {
                    "bill": row['type_bill'],
                    "resolution": row['type_resolution'],
                    "concurrent_resolution": row['type_concurrent_resolution'],
                    "joint_resolution": row['type_joint_resolution'],
                    "constitutional_amendment": row['type_constitutional_amendment']
                },
                "status_counts": {
                    "enacted": row['status_enacted'] or 0,
                    "failed": row['status_failed'] or 0,
                    "pending": row['status_pending'] or 0
                },
                "primary_type": row['primary_type'],
                "primary_status": row['primary_status'],
                "map_category": row['map_category'],
                "sample_bills": sample_bills,
                "last_updated": row['last_updated'].isoformat() if row['last_updated'] else None
            }

        # Build dynamic legend based on actual data
        unique_types = {
            entry['primary_type']
            for entry in state_data.values()
            if entry['primary_type']
        }

        # Map types to user-friendly names
        type_labels = {
            'mandate': 'Mandate',
            'removal': 'Removal',
            'study': 'Study',
            'funding': 'Funding',
            'coverage_expansion': 'Coverage Expansion',
            'screening': 'Screening',
            'provider_access': 'Provider Access',
            'expansion': 'Expansion',
            'coverage': 'Coverage',
            'reimbursement': 'Reimbursement',
            'eligibility': 'Eligibility',
            'requirement': 'Requirement',
            'curriculum': 'Curriculum',
            'reform': 'Reform',
            'protection': 'Protection',
            'restriction': 'Restriction',
            'other': 'Other'
        }

        # Types without a predefined label fall back to a title-cased name
        legend_types = {t: type_labels.get(t, t.replace('_', ' ').title()) for t in unique_types}

        result = {
            "topic": requested_topic,
            "session": session,
            "states": state_data,
            "total_states": len(state_data),
            "legend": {
                "types": legend_types,
                "statuses": {
                    "enacted": "Enacted",
                    "failed": "Failed",
                    "pending": "Pending"
                }
            },
            "cached": True,
            "source": "neon"
        }

        # Update cache. The window start is set only when a new window
        # begins, so adding further keys cannot extend older entries' life.
        if _map_cache_time is None:
            _map_cache_time = now
        _map_cache[cache_key] = result

        return result
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
@router.get("")
async def get_bills(
    state: str = Query(..., description="State abbreviation (e.g., MA, AL)"),
    q: Optional[str] = Query(None, description="Search query (bill number or title)"),
    session: Optional[str] = Query(None, description="Legislative session"),
    limit: int = Query(50, ge=1, le=500),
    offset: int = Query(0, ge=0)
):
    """
    Search legislative bills using parquet files (detailed drill-down).

    **Examples:**
    - `/api/bills?state=AL&q=dental` - Search Alabama bills for "dental"
    - `/api/bills?state=AL&session=2024rs` - Get all 2024 regular session bills
    - `/api/bills?state=AL&limit=50` - Browse recent Alabama bills
    """
    # The state code is upper-cased before it is handed to the parquet lookup.
    try:
        return await fetch_bills_from_parquet(
            state=state.upper(),
            q=q,
            session=session,
            limit=limit,
            offset=offset,
        )
    except HTTPException:
        # Deliberate HTTP errors pass through untouched.
        raise
    except Exception as exc:
        logger.error(f"Bills query error for state={state}: {exc}")

        # Any other failure becomes a structured 500 payload.
        failure_context = {
            "state": state,
            "query": q,
            "session": session,
        }
        error_detail = parse_error(exc, context=failure_context)
        return JSONResponse(status_code=500, content=error_detail.model_dump())
|
| 414 |
+
|
| 415 |
+
|
| 416 |
+
@router.get("/sessions")
async def get_sessions(
    state: str = Query(..., description="State abbreviation (e.g., MA, AL)")
):
    """
    Get legislative sessions for a state using parquet files.

    **Examples:**
    - `/api/bills/sessions?state=MA` - Get all Massachusetts sessions
    """
    # The state code is upper-cased before it is handed to the parquet lookup.
    try:
        return await fetch_sessions_from_parquet(state=state.upper())
    except HTTPException:
        # Deliberate HTTP errors pass through untouched.
        raise
    except Exception as exc:
        logger.error(f"Sessions query error for state={state}: {exc}")

        # Any other failure becomes a structured 500 payload.
        failure_context = {"state": state}
        error_detail = parse_error(exc, context=failure_context)
        return JSONResponse(status_code=500, content=error_detail.model_dump())
|
| 444 |
+
|
| 445 |
+
|
| 446 |
+
@router.get("/map")
async def get_bill_map_data(
    topic: Optional[str] = Query(None, description="Topic to filter (e.g., dental, health, education)"),
    session: Optional[str] = Query(None, description="Legislative session")
):
    """
    Get aggregated bill data for choropleth map using Neon PostgreSQL.

    Returns pre-computed state-level aggregates for instant visualization.

    **Examples:**
    - `/api/bills/map` - Get national bill map data
    - `/api/bills/map?topic=dental` - Map dental legislation (not yet implemented)
    """
    try:
        # Without a configured database there is nothing to query: answer 503.
        if not DATABASE_URL:
            raise HTTPException(status_code=503, detail="Database not configured")

        return await fetch_map_data_from_neon(topic=topic, session=session)
    except HTTPException:
        # Includes the 503 raised above; passed through untouched.
        raise
    except Exception as exc:
        logger.error(f"Map data query error: {exc}")

        # Any other failure becomes a structured 500 payload.
        failure_context = {
            "topic": topic,
            "session": session,
        }
        error_detail = parse_error(exc, context=failure_context)
        return JSONResponse(status_code=500, content=error_detail.model_dump())
|
api/routes/contact.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Contact routes for submitting feedback and issues via GitHub.
|
| 3 |
+
"""
|
| 4 |
+
from fastapi import APIRouter, HTTPException, status
|
| 5 |
+
from pydantic import BaseModel, EmailStr, Field
|
| 6 |
+
from typing import Optional
|
| 7 |
+
import httpx
|
| 8 |
+
import os
|
| 9 |
+
from loguru import logger
|
| 10 |
+
|
| 11 |
+
# Router for the contact endpoints; every route below is served under
# the /api/contact prefix and tagged "contact".
router = APIRouter(
    prefix="/api/contact",
    tags=["contact"],
)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ContactRequest(BaseModel):
    """Contact form submission"""
    # Submitter's name, 1-100 characters.
    name: str = Field(
        ...,
        min_length=1,
        max_length=100,
        description="Name of the person contacting",
    )
    # Validated e-mail address.
    email: EmailStr = Field(
        ...,
        description="Email address for follow-up",
    )
    # Short subject line, 1-200 characters.
    subject: str = Field(
        ...,
        min_length=1,
        max_length=200,
        description="Subject of the message",
    )
    # Free-text message, 10-5000 characters.
    message: str = Field(
        ...,
        min_length=10,
        max_length=5000,
        description="Detailed message",
    )
    # Optional category; defaults to "feedback" when omitted.
    category: Optional[str] = Field(
        default="feedback",
        description="Type of contact: feedback, bug, feature, question",
    )
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class ContactResponse(BaseModel):
    """Response after submitting contact form"""
    # Whether the submission was accepted.
    success: bool
    # Human-readable confirmation shown to the submitter.
    message: str
    # URL of the created GitHub issue, when one is available.
    issue_url: Optional[str] = None
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@router.post("/submit", response_model=ContactResponse, status_code=status.HTTP_201_CREATED)
async def submit_contact(request: ContactRequest):
    """
    Submit a contact form message as a GitHub issue.

    Creates an issue in the GitHub repository with the contact information.
    Requires GITHUB_TOKEN and GITHUB_REPO environment variables.
    """
    github_token = os.getenv("GITHUB_TOKEN")
    github_repo = os.getenv("GITHUB_REPO", "getcommunityone/open-navigator")

    # Without a token no issue can be created: answer 503 up front.
    if not github_token:
        logger.warning("GitHub token not configured - contact form submission failed")
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Contact form is not configured. Please email us directly or submit an issue on GitHub."
        )

    # Map the form category onto an issue label; unknown values fall back to
    # "feedback".
    label = {
        "bug": "bug",
        "feature": "enhancement",
        "question": "question",
        "feedback": "feedback",
    }.get(request.category, "feedback")

    # Issue body in Markdown: sender details, the message, then a footer.
    issue_body = f"""**From:** {request.name} ({request.email})
**Category:** {request.category}

---

{request.message}

---
*This issue was automatically created from the contact form.*
"""

    # Request pieces for the GitHub "create issue" call
    github_api_url = f"https://api.github.com/repos/{github_repo}/issues"
    payload = {
        "title": f"[Contact Form] {request.subject}",
        "body": issue_body,
        "labels": [label, "contact-form"],
    }
    headers = {
        "Authorization": f"token {github_token}",
        "Accept": "application/vnd.github.v3+json",
    }

    try:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                github_api_url,
                json=payload,
                headers=headers,
                timeout=10.0
            )

        # Anything other than 201 Created counts as a failed submission.
        if response.status_code != 201:
            logger.error(f"GitHub API error: {response.status_code} - {response.text}")
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail="Failed to submit contact form. Please try again later."
            )

        issue_url = response.json().get("html_url")
        logger.info(f"Contact form submitted successfully: {issue_url}")
        return ContactResponse(
            success=True,
            message="Thank you for contacting us! We've received your message and will get back to you soon.",
            issue_url=issue_url
        )

    except httpx.TimeoutException:
        logger.error("GitHub API timeout")
        raise HTTPException(
            status_code=status.HTTP_504_GATEWAY_TIMEOUT,
            detail="Request timed out. Please try again."
        )
    except httpx.RequestError as exc:
        logger.error(f"GitHub API request error: {exc}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Failed to submit contact form. Please try again later."
        )
|
api/routes/hf_search.py
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
HuggingFace Datasets Search API Integration
|
| 3 |
+
|
| 4 |
+
Fast server-side text search using HuggingFace's indexed datasets.
|
| 5 |
+
Falls back to DuckDB if dataset not indexed or search unavailable.
|
| 6 |
+
"""
|
| 7 |
+
import httpx
|
| 8 |
+
from typing import Optional, List, Dict, Any
|
| 9 |
+
from loguru import logger
|
| 10 |
+
import os
|
| 11 |
+
|
| 12 |
+
# Endpoint used for server-side text search over a dataset.
HF_SEARCH_API = "https://datasets-server.huggingface.co/search"

# Organization prefix for dataset repo IDs; overridable via the environment.
HF_ORGANIZATION = os.environ.get('HF_ORGANIZATION', 'CommunityOne')

# Timeout, in seconds, applied to each HTTP request in this module.
REQUEST_TIMEOUT = 5
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def is_dataset_indexed(dataset_name: str) -> bool:
    """
    Report whether a dataset supports server-side search.

    Queries the ``is-valid`` endpoint and reads the ``search`` field of its
    JSON reply. Any failure (network error, non-200 status, bad payload) is
    treated as "not searchable".

    Args:
        dataset_name: Full repo ID (e.g., 'CommunityOne/states-ma-contacts-local-officials')

    Returns:
        The reply's ``search`` value on a 200 response, otherwise False
    """
    validity_url = "https://datasets-server.huggingface.co/is-valid"
    try:
        reply = httpx.get(
            validity_url,
            params={"dataset": dataset_name},
            timeout=REQUEST_TIMEOUT,
        )
        if reply.status_code != 200:
            return False
        return reply.json().get("search", False)
    except Exception as err:
        # Best-effort probe: log quietly and report "not indexed".
        logger.debug(f"Could not check if {dataset_name} is indexed: {err}")
        return False
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def search_hf_dataset(
    dataset_name: str,
    query: str,
    config: str = "default",
    split: str = "train",
    limit: int = 100
) -> Optional[List[Dict[str, Any]]]:
    """
    Run a text search against one dataset through the search endpoint.

    Args:
        dataset_name: Full repo ID (e.g., 'CommunityOne/states-ma-contacts')
        query: Search text
        config: Dataset configuration name
        split: Dataset split to search
        limit: Maximum results to return (sent as the ``length`` parameter)

    Returns:
        The ``row`` payload of every hit, or None when the search could not
        be served (timeout, non-200 status, error payload, other exception)
    """
    search_params = {
        "dataset": dataset_name,
        "config": config,
        "split": split,
        "query": query,
        "offset": 0,
        "length": limit,
    }

    try:
        response = httpx.get(
            HF_SEARCH_API,
            params=search_params,
            timeout=REQUEST_TIMEOUT,
        )

        # Handle the failure statuses first so the success path reads flat.
        if response.status_code == 404:
            logger.debug(f"Dataset {dataset_name} not found on HuggingFace")
            return None
        if response.status_code != 200:
            logger.debug(f"HF Search API returned status {response.status_code}")
            return None

        data = response.json()

        # A 200 reply can still carry an error description.
        if "error" in data:
            logger.debug(f"HF Search API error for {dataset_name}: {data['error']}")
            return None

        hits = data.get("rows", [])
        logger.info(f"✅ HF Search API: Found {len(hits)} results for '{query}' in {dataset_name}")

        # Each hit wraps the actual record under its "row" key.
        return [hit.get("row", {}) for hit in hits]

    except httpx.TimeoutException:
        logger.debug(f"HF Search API timeout for {dataset_name}")
        return None
    except Exception as err:
        logger.debug(f"HF Search API error: {err}")
        return None
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def search_contacts_hf(query: str, state: Optional[str] = None, limit: int = 10) -> Optional[List[Dict[str, Any]]]:
    """
    Search a state's contact datasets (local officials, then nonprofit officers).

    Local officials are searched first; nonprofit officers only fill the
    slots left under ``limit``. Every returned row is tagged with a
    ``source`` key naming the dataset it came from.

    Args:
        query: Search text (name, title, jurisdiction, etc.)
        state: 2-letter state code (e.g., 'MA'); without it nothing is searched
        limit: Maximum results

    Returns:
        List of contact dicts, or None if no state was given or nothing matched
    """
    # Both datasets are per-state, so there is nothing to query without one.
    if not state:
        return None

    state_slug = state.lower()
    matches = []

    # Search local officials
    officials = search_hf_dataset(
        f"{HF_ORGANIZATION}/states-{state_slug}-contacts-local-officials",
        query,
        limit=limit,
    )
    for row in officials or []:
        row['source'] = 'local_officials'
        matches.append(row)

    # Search nonprofit officers, but only for the slots still open
    remaining = limit - len(matches)
    if remaining > 0:
        officers = search_hf_dataset(
            f"{HF_ORGANIZATION}/states-{state_slug}-contacts-nonprofit-officers",
            query,
            limit=remaining,
        )
        for row in officers or []:
            row['source'] = 'nonprofit_officers'
            matches.append(row)

    return matches or None
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def search_organizations_hf(query: str, state: Optional[str] = None, limit: int = 10) -> Optional[List[Dict[str, Any]]]:
    """
    Search nonprofit organizations, per state or nationally.

    Args:
        query: Search text (organization name, EIN, etc.)
        state: 2-letter state code; when omitted the national dataset is used
        limit: Maximum results

    Returns:
        List of organization dicts, or None if search unavailable
    """
    # Pick the state-scoped dataset when a state is given, else the national one.
    scope = f"states-{state.lower()}" if state else "national"
    dataset = f"{HF_ORGANIZATION}/{scope}-nonprofits-organizations"
    return search_hf_dataset(dataset, query, limit=limit)
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def search_jurisdictions_hf(query: str, jurisdiction_type: Optional[str] = None, limit: int = 10) -> Optional[List[Dict[str, Any]]]:
    """
    Search one jurisdiction reference dataset.

    Args:
        query: Search text (city name, county name, etc.)
        jurisdiction_type: 'cities', 'counties', 'townships', 'school_districts';
            underscores become hyphens in the dataset name
        limit: Maximum results

    Returns:
        List of jurisdiction dicts, or None if search unavailable
    """
    # With no type given, search cities (most common); other types are not tried.
    kind_slug = jurisdiction_type.replace('_', '-') if jurisdiction_type else "cities"
    dataset = f"{HF_ORGANIZATION}/reference-jurisdictions-{kind_slug}"
    return search_hf_dataset(dataset, query, limit=limit)
|
api/routes/search.py
ADDED
|
@@ -0,0 +1,1685 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Unified Search API
|
| 3 |
+
LinkedIn-style search across contacts, meetings, organizations, and causes
|
| 4 |
+
Uses hybrid approach: PostgreSQL (primary, fast) + HuggingFace Search API + DuckDB (fallback)
|
| 5 |
+
"""
|
| 6 |
+
from fastapi import APIRouter, Query, HTTPException
|
| 7 |
+
from fastapi.responses import JSONResponse
|
| 8 |
+
from typing import Optional, List, Dict, Any
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
import pandas as pd
|
| 11 |
+
import duckdb
|
| 12 |
+
from loguru import logger
|
| 13 |
+
import re
|
| 14 |
+
import os
|
| 15 |
+
import sys
|
| 16 |
+
import requests
|
| 17 |
+
from functools import lru_cache
|
| 18 |
+
from datetime import datetime, timedelta
|
| 19 |
+
|
| 20 |
+
from api.errors import ErrorDetail, parse_error
|
| 21 |
+
|
| 22 |
+
# Import PostgreSQL search functions (primary)
|
| 23 |
+
from api.routes import search_postgres
|
| 24 |
+
|
| 25 |
+
# Import HuggingFace Search helpers
|
| 26 |
+
from api.routes.hf_search import (
|
| 27 |
+
search_contacts_hf,
|
| 28 |
+
search_organizations_hf,
|
| 29 |
+
search_jurisdictions_hf,
|
| 30 |
+
is_dataset_indexed
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
router = APIRouter(tags=["search"])
|
| 34 |
+
|
| 35 |
+
# Paths to gold datasets
|
| 36 |
+
GOLD_DIR = Path("data/gold")
|
| 37 |
+
|
| 38 |
+
# Detect deployment environment
|
| 39 |
+
IS_HF_SPACES = os.getenv("HF_SPACES") == "1"
|
| 40 |
+
HF_ORGANIZATION = os.getenv('HF_ORGANIZATION', 'CommunityOne')
|
| 41 |
+
|
| 42 |
+
# Cache for count queries (TTL: 1 hour)
|
| 43 |
+
_count_cache = {}
|
| 44 |
+
_count_cache_ttl = {}
|
| 45 |
+
|
| 46 |
+
# In-memory DataFrame cache for HuggingFace datasets (TTL: 5 minutes)
|
| 47 |
+
# Cuts per-search latency from 2-3s (remote HTTP fetch) to <10ms on a cache hit
|
| 48 |
+
_dataframe_cache: Dict[str, pd.DataFrame] = {}
|
| 49 |
+
_dataframe_cache_ttl: Dict[str, datetime] = {}
|
| 50 |
+
DATAFRAME_CACHE_TTL = timedelta(minutes=5)
|
| 51 |
+
|
| 52 |
+
# Every.org API config (fallback only)
|
| 53 |
+
EVERYORG_API_KEY = os.getenv('EVERYORG_API_KEY', '')
|
| 54 |
+
EVERYORG_API_BASE = "https://partners.every.org/v0.2"
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def load_parquet_cached(url: str) -> pd.DataFrame:
    """
    Return the parquet file at ``url`` as a DataFrame, memoised in process memory.

    A previously loaded frame is reused while it is younger than
    ``DATAFRAME_CACHE_TTL``; otherwise the file is read again with
    ``pd.read_parquet`` and the cache entry is replaced.

    Args:
        url: Location handed straight to ``pd.read_parquet`` (local path or URL)

    Returns:
        pandas DataFrame (the cached object itself, not a copy)
    """
    requested_at = datetime.now()

    # Only consult the timestamp table when a frame is actually stored for this URL.
    stored_at = _dataframe_cache_ttl.get(url) if url in _dataframe_cache else None
    if stored_at and (requested_at - stored_at) < DATAFRAME_CACHE_TTL:
        logger.debug(f"🚀 Cache hit for {url}")
        return _dataframe_cache[url]

    # Missing or expired entry: read from the source and refresh both tables.
    logger.info(f"📥 Loading parquet from {url}")
    frame = pd.read_parquet(url)

    _dataframe_cache[url] = frame
    _dataframe_cache_ttl[url] = requested_at
    logger.debug(f"💾 Cached {len(frame)} rows from {url}")

    return frame
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def get_hf_dataset_url(dataset_name: str) -> str:
    """
    Build the HuggingFace download URL for a dataset's parquet file.

    The URL points at ``data/train-00000-of-00001.parquet`` on the ``main``
    branch of ``{HF_ORGANIZATION}/{dataset_name}``.

    Example (with HF_ORGANIZATION = "CommunityOne"):
        states-ma-contacts-local-officials ->
        https://huggingface.co/datasets/CommunityOne/states-ma-contacts-local-officials/resolve/main/data/train-00000-of-00001.parquet
    """
    repo_id = f"{HF_ORGANIZATION}/{dataset_name}"
    return f"https://huggingface.co/datasets/{repo_id}/resolve/main/data/train-00000-of-00001.parquet"
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def get_data_source(file_path: Path, use_remote: bool = False) -> str:
    """
    Resolve a gold-layer parquet path to the location that should be read.

    Args:
        file_path: Local file path (e.g., data/gold/states/MA/contacts_local_officials.parquet)
        use_remote: Return the HuggingFace URL even when not running on HF Spaces

    Returns:
        ``str(file_path)`` when a local read is wanted or the path has no
        ``states`` / ``national`` / ``reference`` component; otherwise the
        HuggingFace parquet URL of the matching dataset.
    """
    wants_remote = IS_HF_SPACES or use_remote
    if not wants_remote:
        return str(file_path)

    segments = file_path.parts

    def slug() -> str:
        # contacts_local_officials.parquet -> contacts-local-officials
        return segments[-1].replace('.parquet', '').replace('_', '-')

    # data/gold/states/MA/<file> -> states-ma-<file>
    if 'states' in segments:
        state_code = segments[segments.index('states') + 1].lower()
        return get_hf_dataset_url(f"states-{state_code}-{slug()}")

    # data/gold/national/<file> -> national-<file>, likewise for reference
    for category in ('national', 'reference'):
        if category in segments:
            return get_hf_dataset_url(f"{category}-{slug()}")

    # Unrecognised layout: fall back to the local path
    return str(file_path)
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
@lru_cache(maxsize=5000)
def fetch_form990_data(ein: str) -> Optional[Dict[str, Any]]:
    """
    Look up an organization in the ProPublica Nonprofit Explorer API.

    Only the tax year of the first entry in ``filings_with_data`` is extracted;
    ``website`` and ``mission`` are always returned as ``None`` because this
    function does not read them from the API response.

    Args:
        ein: Employer Identification Number; dashes are stripped and the value
            is left-padded with zeros to 9 characters.

    Returns:
        Dict with ``website``, ``mission``, ``source``, ``last_updated`` and
        ``tax_year`` on an HTTP 200 response, otherwise ``None`` (empty EIN,
        non-200 status, or any error during the request/parse).

    Note:
        Results - including ``None`` from a failed lookup - are memoised by
        ``lru_cache``, and the same dict object is returned to every caller
        for a given EIN, so callers should not mutate it.
    """
    if not ein:
        return None

    try:
        clean_ein = str(ein).replace('-', '').zfill(9)
        url = f"https://projects.propublica.org/nonprofits/api/v2/organizations/{clean_ein}.json"

        response = requests.get(url, timeout=3)

        if response.status_code == 200:
            filings = response.json().get('filings_with_data', [])

            return {
                'website': None,  # not read from the ProPublica response
                'mission': None,  # Would need to parse PDF
                'source': 'propublica',
                'last_updated': datetime.now().isoformat(),
                'tax_year': filings[0].get('tax_prd_yr') if filings else None
            }
    except Exception as e:
        # Best-effort enrichment: a failed lookup must never break the search.
        logger.debug(f"ProPublica lookup failed for EIN {ein}: {e}")

    return None
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
@lru_cache(maxsize=5000)
def fetch_everyorg_data(ein: str) -> Optional[Dict[str, Any]]:
    """
    Fetch enrichment data from the Every.org API (memoised) - FALLBACK ONLY.

    Returns a dict with mission, website, logo_url, profile_url, causes
    (at most five tag names), source and last_updated, or ``None`` when no
    API key/EIN is available, the response is not HTTP 200, the payload has
    no ``data.nonprofit`` entry, or the lookup raises.
    """
    if not (EVERYORG_API_KEY and ein):
        return None

    try:
        # Format EIN (remove dashes, ensure 9 digits)
        normalized_ein = str(ein).replace('-', '').zfill(9)

        response = requests.get(
            f"{EVERYORG_API_BASE}/nonprofit/{normalized_ein}",
            headers={
                "Authorization": f"Bearer {EVERYORG_API_KEY}",
                "Accept": "application/json",
            },
            timeout=3,
        )
        if response.status_code != 200:
            return None

        payload = response.json()
        if not (payload and 'data' in payload and 'nonprofit' in payload['data']):
            return None

        body = payload['data']
        nonprofit = body['nonprofit']

        # Keep only tags that actually carry a name
        cause_names = []
        for tag in body.get('nonprofitTags', []):
            tag_name = tag.get('tagName')
            if tag_name:
                cause_names.append(tag_name)

        return {
            'mission': nonprofit.get('description') or nonprofit.get('descriptionLong'),
            'website': nonprofit.get('websiteUrl'),
            'logo_url': nonprofit.get('logoUrl'),
            'profile_url': nonprofit.get('profileUrl'),
            'causes': cause_names[:5],  # Limit to top 5 causes
            'source': 'everyorg',
            'last_updated': datetime.now().isoformat(),
        }
    except Exception as e:
        logger.debug(f"Every.org lookup failed for EIN {ein}: {e}")

    return None
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
def get_enrichment_data(ein: str, existing_data: Optional[Dict] = None) -> Dict[str, Any]:
    """
    Assemble website/mission/logo/causes for a nonprofit, preferring data already on hand.

    Resolution order as implemented here:
    1. ``existing_data``: ``enrichment_website`` is used while
       ``enrichment_last_updated`` is missing or less than 30 days old;
       ``enrichment_mission`` is used regardless of age.
    2. Every.org (``fetch_everyorg_data``): consulted only when the website or
       mission is still missing; it also supplies logo, profile URL and causes.

    ProPublica (``fetch_form990_data``) is not consulted by this function.

    Args:
        ein: Employer Identification Number passed to the Every.org lookup.
        existing_data: Optional mapping that may carry ``enrichment_website``,
            ``enrichment_mission`` and ``enrichment_last_updated`` (ISO string
            or datetime).

    Returns:
        Dict with ``website``, ``mission``, ``logo_url``, ``profile_url``,
        ``causes`` and ``data_sources`` (each contributing source listed once:
        ``'cached'`` and/or ``'everyorg'``).
    """
    result = {
        'website': None,
        'mission': None,
        'logo_url': None,
        'profile_url': None,
        'causes': [],
        'data_sources': []
    }

    def add_source(name: str) -> None:
        # Record each contributing source once, in first-use order.
        if name not in result['data_sources']:
            result['data_sources'].append(name)

    def is_fresh(last_updated: Any) -> bool:
        # No timestamp at all means the value is accepted as-is.
        if not last_updated:
            return True
        if isinstance(last_updated, str):
            try:
                last_updated = datetime.fromisoformat(last_updated)
            except ValueError:
                # An unparseable timestamp is treated as stale instead of raising.
                return False
        if not isinstance(last_updated, datetime):
            return False
        # Compare aware timestamps against an aware "now" to avoid a TypeError.
        now = datetime.now(last_updated.tzinfo) if last_updated.tzinfo else datetime.now()
        return last_updated > now - timedelta(days=30)

    # Check existing data first (website is skipped if older than 30 days)
    if existing_data:
        if existing_data.get('enrichment_website') and is_fresh(existing_data.get('enrichment_last_updated')):
            result['website'] = existing_data['enrichment_website']
            add_source('cached')

        if existing_data.get('enrichment_mission'):
            result['mission'] = existing_data['enrichment_mission']
            add_source('cached')

    # Try Every.org for missing fields (it also carries logo and causes)
    if not result['website'] or not result['mission']:
        everyorg_data = fetch_everyorg_data(ein)
        if everyorg_data:
            if not result['website'] and everyorg_data.get('website'):
                result['website'] = everyorg_data['website']
                add_source('everyorg')

            if not result['mission'] and everyorg_data.get('mission'):
                result['mission'] = everyorg_data['mission']
                add_source('everyorg')

            # Whenever Every.org was consulted, take its logo, profile and causes
            result['logo_url'] = everyorg_data.get('logo_url')
            result['profile_url'] = everyorg_data.get('profile_url')
            # Copy so callers cannot mutate the list held by the lru_cached dict
            result['causes'] = list(everyorg_data.get('causes') or [])
            if result['logo_url'] or result['causes']:
                add_source('everyorg')

    return result
|
| 283 |
+
|
| 284 |
+
class SearchResult:
    """Unified search result shared by every search backend in this module."""

    def __init__(self,
                 result_type: str,
                 title: str,
                 subtitle: str,
                 description: str,
                 url: str,
                 score: float,
                 metadata: Dict[str, Any]):
        # Kind of hit (this module uses "contact", among others)
        self.result_type = result_type
        # Display fields
        self.title, self.subtitle, self.description = title, subtitle, description
        # Link target and ranking score
        self.url, self.score = url, score
        # Free-form extra fields; stored by reference, not copied
        self.metadata = metadata

    def to_dict(self):
        """Return the result as a plain dict; ``result_type`` is exposed as ``"type"``."""
        payload = {"type": self.result_type}
        for field_name in ("title", "subtitle", "description", "url", "score", "metadata"):
            payload[field_name] = getattr(self, field_name)
        return payload
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
def convert_pg_result(pg_result: search_postgres.SearchResult) -> 'SearchResult':
    """Copy the fields of a PostgreSQL-backend SearchResult into this module's SearchResult."""
    field_names = (
        "result_type",
        "title",
        "subtitle",
        "description",
        "url",
        "score",
        "metadata",
    )
    # Both classes use the same attribute names, so the copy is field-for-field.
    return SearchResult(**{name: getattr(pg_result, name) for name in field_names})
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
def calculate_relevance_score(text: str, query: str) -> float:
    """
    Score how well ``text`` matches ``query`` (case-insensitive).

    Returns 1.5 when ``text`` starts with the whole query, 1.0 when the query
    occurs elsewhere as a substring, otherwise the fraction of query words
    that appear inside some word of ``text``; 0.0 for empty inputs.
    """
    if not (text and query):
        return 0.0

    haystack, needle = text.lower(), query.lower()

    # Whole-query substring hit, with a bonus for a prefix hit
    if needle in haystack:
        bonus = 0.5 if haystack.startswith(needle) else 0.0
        return min(1.0 + bonus, 2.0)

    # Otherwise count query words found inside any text word
    terms = needle.split()
    if not terms:
        return 0.0

    tokens = haystack.split()
    hit_count = 0
    for term in terms:
        if any(term in token for token in tokens):
            hit_count += 1
    return hit_count / len(terms)
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
def _append_state_officials(conn, results: list, search_text: str, state, limit: int, is_browse_mode: bool) -> None:
    """Append state-official hits (OpenStates files), or a browse listing, to ``results``."""
    if state:
        officials_file_paths = [GOLD_DIR / "states" / state / "contacts_officials.parquet"]
    else:
        officials_file_paths = list(GOLD_DIR.glob("states/*/contacts_officials.parquet"))[:5]

    logger.info(f"Searching {len(officials_file_paths)} state official contact files (OpenStates) - browse_mode={is_browse_mode}")

    query_pattern = f'%{search_text}%'
    query_start = f'{search_text}%'

    for file_path in officials_file_paths:
        if not file_path.exists():
            continue

        # Get data source (local or remote URL)
        data_source = get_data_source(file_path, use_remote=IS_HF_SPACES)

        try:
            if is_browse_mode:
                # Browse mode: return all officials, prioritize mayors
                sql = """
                    SELECT
                        full_name as name,
                        role_type as title,
                        city_jurisdiction as jurisdiction,
                        state,
                        email,
                        phone,
                        CASE
                            WHEN LOWER(role_type) = 'mayor' THEN 2.0
                            WHEN LOWER(role_type) LIKE '%council%' THEN 1.8
                            WHEN LOWER(role_type) LIKE '%commission%' THEN 1.7
                            ELSE 1.5
                        END as score
                    FROM read_parquet(?)
                    ORDER BY score DESC, full_name ASC
                    LIMIT ?
                """

                rows = conn.execute(sql, [data_source, limit]).fetchall()
            else:
                # Search mode: prefix hit scores 1.5, substring hit 1.0, best column wins
                sql = """
                    SELECT
                        full_name as name,
                        role_type as title,
                        city_jurisdiction as jurisdiction,
                        state,
                        email,
                        phone,
                        GREATEST(
                            CASE
                                WHEN LOWER(full_name) LIKE LOWER(?) THEN 1.5
                                WHEN LOWER(full_name) LIKE LOWER(?) THEN 1.0
                                ELSE 0.0
                            END,
                            CASE
                                WHEN LOWER(role_type) LIKE LOWER(?) THEN 1.5
                                WHEN LOWER(role_type) LIKE LOWER(?) THEN 1.0
                                ELSE 0.0
                            END,
                            CASE
                                WHEN LOWER(city_jurisdiction) LIKE LOWER(?) THEN 1.5
                                WHEN LOWER(city_jurisdiction) LIKE LOWER(?) THEN 1.0
                                ELSE 0.0
                            END,
                            CASE
                                WHEN LOWER(jurisdiction_name) LIKE LOWER(?) THEN 1.5
                                WHEN LOWER(jurisdiction_name) LIKE LOWER(?) THEN 1.0
                                ELSE 0.0
                            END
                        ) as score
                    FROM read_parquet(?)
                    WHERE LOWER(full_name) LIKE LOWER(?)
                        OR LOWER(role_type) LIKE LOWER(?)
                        OR LOWER(city_jurisdiction) LIKE LOWER(?)
                        OR LOWER(jurisdiction_name) LIKE LOWER(?)
                    ORDER BY score DESC
                    LIMIT ?
                """

                rows = conn.execute(sql, [
                    query_start, query_pattern,  # name scoring
                    query_start, query_pattern,  # role_type scoring
                    query_start, query_pattern,  # city_jurisdiction scoring
                    query_start, query_pattern,  # jurisdiction_name scoring
                    data_source,  # file path or URL
                    query_pattern, query_pattern, query_pattern, query_pattern,  # WHERE clause
                    limit
                ]).fetchall()

            # Convert to SearchResult objects
            for name, title, jurisdiction, state_code, email, phone, score in rows:
                if score > 0.3:  # Relevance threshold
                    contact_info = []
                    if email:
                        contact_info.append(f"📧 {email}")
                    if phone:
                        contact_info.append(f"📞 {phone}")

                    description = f"State official in {jurisdiction}" if jurisdiction else f"State official in {state_code}"
                    if contact_info:
                        description += f" • {' • '.join(contact_info)}"

                    results.append(SearchResult(
                        result_type="contact",
                        title=name if name else "Unknown",
                        subtitle=f"{title.title() if title else 'Official'} - {jurisdiction or state_code}",
                        description=description,
                        url=f"/people/{name.replace(' ', '-') if name else 'unknown'}",
                        score=score,
                        metadata={
                            "title": title,
                            "jurisdiction": jurisdiction,
                            "state": state_code,
                            "name": name,
                            "email": email,
                            "phone": phone,
                            "contact_type": "state_official",
                            "data_source": "OpenStates"
                        }
                    ))

        except Exception as e:
            # One unreadable file must not abort the other files
            logger.debug(f"Error searching state officials {file_path}: {e}")


def _append_local_officials(conn, results: list, search_text: str, state, limit: int) -> None:
    """Append local-official hits (contacts_local_officials files) to ``results``."""
    if state:
        local_file_paths = [GOLD_DIR / "states" / state / "contacts_local_officials.parquet"]
    else:
        local_file_paths = list(GOLD_DIR.glob("states/*/contacts_local_officials.parquet"))[:5]

    logger.info(f"Searching {len(local_file_paths)} local official contact files (meeting transcripts)")

    query_pattern = f'%{search_text}%'
    query_start = f'{search_text}%'

    for file_path in local_file_paths:
        # Get data source (local or remote URL)
        data_source = get_data_source(file_path, use_remote=IS_HF_SPACES)

        try:
            # SQL query with relevance scoring across name, title, jurisdiction
            sql = """
                SELECT
                    name,
                    title,
                    jurisdiction,
                    state,
                    GREATEST(
                        CASE
                            WHEN LOWER(name) LIKE LOWER(?) THEN 1.5
                            WHEN LOWER(name) LIKE LOWER(?) THEN 1.0
                            ELSE 0.0
                        END,
                        CASE
                            WHEN LOWER(title) LIKE LOWER(?) THEN 1.5
                            WHEN LOWER(title) LIKE LOWER(?) THEN 1.0
                            ELSE 0.0
                        END,
                        CASE
                            WHEN LOWER(jurisdiction) LIKE LOWER(?) THEN 1.5
                            WHEN LOWER(jurisdiction) LIKE LOWER(?) THEN 1.0
                            ELSE 0.0
                        END
                    ) as score
                FROM read_parquet(?)
                WHERE LOWER(name) LIKE LOWER(?)
                    OR LOWER(title) LIKE LOWER(?)
                    OR LOWER(jurisdiction) LIKE LOWER(?)
                ORDER BY score DESC
                LIMIT ?
            """

            rows = conn.execute(sql, [
                query_start, query_pattern,  # name scoring
                query_start, query_pattern,  # title scoring
                query_start, query_pattern,  # jurisdiction scoring
                data_source,  # file path or URL
                query_pattern, query_pattern, query_pattern,  # WHERE clause
                limit
            ]).fetchall()

            # Convert to SearchResult objects
            for name, title, jurisdiction, state_code, score in rows:
                if score > 0.3:  # Relevance threshold
                    results.append(SearchResult(
                        result_type="contact",
                        title=name if name else "Unknown",
                        subtitle=f"{title} - {jurisdiction}, {state_code}",
                        description=f"Local official in {jurisdiction}",
                        url=f"/people/{name.replace(' ', '-') if name else 'unknown'}",
                        score=score,
                        metadata={
                            "title": title,
                            "jurisdiction": jurisdiction,
                            "state": state_code,
                            "name": name
                        }
                    ))

        except Exception as e:
            logger.debug(f"Error searching {file_path}: {e}")


def _append_nonprofit_officers(conn, results: list, search_text: str, state, limit: int) -> None:
    """Append nonprofit-officer hits (contacts_nonprofit_officers files) to ``results``."""
    nonprofit_files = []

    if state:
        # If state specified, search that state's directory
        nonprofit_files.append(GOLD_DIR / "states" / state / "contacts_nonprofit_officers.parquet")
    else:
        # Search all state directories
        for state_dir in (GOLD_DIR / "states").glob("*/"):
            nonprofit_files.append(state_dir / "contacts_nonprofit_officers.parquet")

    query_pattern = f'%{search_text}%'
    query_start = f'{search_text}%'

    for nonprofit_file in nonprofit_files:
        # Get data source (local or remote URL)
        nonprofit_source = get_data_source(nonprofit_file, use_remote=IS_HF_SPACES)

        try:
            logger.info(f"Searching nonprofit officers: {nonprofit_source}")

            # Name hits outrank title/organization hits (1.5/1.0 vs 1.0/0.5)
            officer_sql = """
                SELECT
                    name,
                    title,
                    organization_name,
                    city,
                    state,
                    compensation,
                    GREATEST(
                        CASE
                            WHEN LOWER(name) LIKE LOWER(?) THEN 1.5
                            WHEN LOWER(name) LIKE LOWER(?) THEN 1.0
                            ELSE 0.0
                        END,
                        CASE
                            WHEN LOWER(title) LIKE LOWER(?) THEN 1.0
                            WHEN LOWER(title) LIKE LOWER(?) THEN 0.5
                            ELSE 0.0
                        END,
                        CASE
                            WHEN LOWER(organization_name) LIKE LOWER(?) THEN 1.0
                            WHEN LOWER(organization_name) LIKE LOWER(?) THEN 0.5
                            ELSE 0.0
                        END
                    ) AS score
                FROM read_parquet(?)
                WHERE (LOWER(name) LIKE LOWER(?)
                    OR LOWER(title) LIKE LOWER(?)
                    OR LOWER(organization_name) LIKE LOWER(?))
            """

            params = [
                query_start, query_pattern,  # name scoring
                query_start, query_pattern,  # title scoring
                query_start, query_pattern,  # organization scoring
                nonprofit_source,  # file path or URL
                query_pattern, query_pattern, query_pattern  # WHERE clause
            ]

            if state:
                officer_sql += " AND LOWER(state) = LOWER(?)"
                params.append(state)

            officer_sql += " ORDER BY score DESC, compensation DESC NULLS LAST LIMIT ?"
            params.append(limit)

            officer_rows = conn.execute(officer_sql, params).fetchall()

            for name, title, org_name, city, state_code, compensation, score in officer_rows:
                if score > 0.3:
                    comp_text = f" (${compensation:,.0f})" if compensation else ""

                    results.append(SearchResult(
                        result_type="contact",
                        title=name if name else "Unknown",
                        subtitle=f"{title} - {org_name}{comp_text}",
                        description=f"Nonprofit officer in {city}, {state_code}",
                        url=f"/nonprofits?name={org_name.replace(' ', '-') if org_name else 'unknown'}",
                        score=score,
                        metadata={
                            "title": title,
                            "organization": org_name,
                            "city": city,
                            "state": state_code,
                            "compensation": compensation,
                            "contact_type": "nonprofit_officer"
                        }
                    ))

            officer_count = sum(1 for r in results if r.metadata.get('contact_type') == 'nonprofit_officer')
            logger.info(f"Found {officer_count} nonprofit officer results")

        except Exception as e:
            logger.debug(f"Error searching nonprofit officers: {e}")


def search_contacts_duckdb(query: str, state: Optional[str] = None, limit: int = 10) -> List[SearchResult]:
    """
    Search contacts using DuckDB (supports local files or remote HTTP parquet).

    This is the fallback when HF Search API is unavailable. Three sources are
    queried in turn: state officials, local officials and nonprofit officers.
    An empty, whitespace-only or ``None`` query selects browse mode: state
    officials are listed by role priority and the other two sources are
    matched with an empty search text (LIKE '%').

    Args:
        query: Search text; empty/None means browse mode
        state: Optional state directory name under ``GOLD_DIR/states``
        limit: Per-file row limit and maximum number of results returned

    Returns:
        Up to ``limit`` SearchResult objects sorted by descending score. If an
        unexpected error interrupts the search, whatever was collected so far
        is returned as-is (unsorted).
    """
    # NOTE(review): `state` is joined into a filesystem path unvalidated; if it
    # comes straight from a request parameter, confirm the caller restricts it
    # to known state codes (a value like "../.." would escape GOLD_DIR).
    results: List[SearchResult] = []

    # Determine if this is browse mode (no query) or search mode
    is_browse_mode = not query or query.strip() == ''
    # Never let None / whitespace leak into the LIKE patterns ("%None%")
    search_text = '' if is_browse_mode else query

    try:
        conn = duckdb.connect()
        try:
            # Search 1: State Officials (OpenStates - state legislators, mayors, etc.)
            _append_state_officials(conn, results, search_text, state, limit, is_browse_mode)
            # Search 2: Local Officials (from meeting transcripts)
            _append_local_officials(conn, results, search_text, state, limit)
            # Search 3: Nonprofit Officers from state directories
            _append_nonprofit_officers(conn, results, search_text, state, limit)
        finally:
            # Release the connection even when a search step raises
            conn.close()

        # Sort all results by score and limit
        results.sort(key=lambda x: x.score, reverse=True)
        logger.info(f"DuckDB search found {len(results)} contacts for query '{query}'")
        return results[:limit]

    except Exception as e:
        logger.error(f"Contact search error: {e}")
        return results
|
| 686 |
+
|
| 687 |
+
|
| 688 |
+
def search_contacts(query: str, state: Optional[str] = None, limit: int = 10) -> List[SearchResult]:
    """
    HYBRID SEARCH: Search local officials AND nonprofit officers.

    Order of attempts:
    1. HuggingFace Search API - only when a query is given and the app runs
       on HF Spaces; used if it returns at least one row
    2. DuckDB (local files or remote HTTP parquet) - in every other case,
       including when the HF call raises or returns nothing

    Args:
        query: Search text (name, title, organization, etc.)
        state: Optional 2-letter state code filter
        limit: Maximum results to return

    Returns:
        List of SearchResult objects (HF hits all carry score 1.0; DuckDB hits
        are sorted by relevance)
    """
    logger.info(f"🔎 search_contacts() called - query={query!r}, state={state!r}, limit={limit}, IS_HF_SPACES={IS_HF_SPACES}")

    def to_result(row) -> SearchResult:
        # Map one HF search row onto the unified result shape
        source_type = row.get('source', 'contact')
        name = row.get('name', 'Unknown')
        title = row.get('title', '')
        jurisdiction = row.get('jurisdiction', row.get('organization_name', ''))
        state_code = row.get('state', state or '')

        role_label = 'Local official' if source_type == 'local_officials' else 'Nonprofit officer'
        return SearchResult(
            result_type="contact",
            title=name,
            subtitle=f"{title} - {jurisdiction}, {state_code}",
            description=f"{role_label} in {jurisdiction}",
            url=f"/people/{name.replace(' ', '-')}",
            score=1.0,
            metadata={
                "title": title,
                "jurisdiction": jurisdiction,
                "state": state_code,
                "name": name,
                "source": source_type,
            },
        )

    # STRATEGY 1: Try HuggingFace Search API (fast text search)
    if query and IS_HF_SPACES:
        logger.info(f"🔍 Trying HF Search API for '{query}' (state={state})")
        try:
            hf_rows = search_contacts_hf(query, state, limit=limit)
            if hf_rows:
                logger.info(f"✅ HF Search API returned {len(hf_rows)} results")
                converted = [to_result(row) for row in hf_rows]
                return converted[:limit]
        except Exception as e:
            logger.warning(f"HF Search API failed, falling back to DuckDB: {e}")

    # STRATEGY 2: Fall back to DuckDB (works with local or remote parquet)
    logger.info(f"🔍 Using DuckDB {'remote' if IS_HF_SPACES else 'local'} search for '{query}'")
    return search_contacts_duckdb(query, state, limit)
|
| 745 |
+
|
| 746 |
+
|
| 747 |
+
def search_meetings(query: str, state: Optional[str] = None, limit: int = 10) -> List[SearchResult]:
    """Search meeting/event parquet files for rows relevant to ``query``.

    Files are looked up under ``GOLD_DIR/states/<state>/`` (every state
    directory when ``state`` is falsy). The first of ``events.parquet``,
    ``events_events.parquet`` and ``meetings.parquet`` that matches any file
    wins, and at most five of the matched files are scanned. Each row is
    scored on its title, the first 500 characters of its body (weighted 0.8)
    and its jurisdiction name (weighted 0.6); rows scoring above 0.3 are kept.

    Args:
        query: Text handed to ``calculate_relevance_score`` for every field.
        state: Optional state directory name restricting the file lookup.
        limit: Maximum number of results returned.

    Returns:
        Up to ``limit`` ``SearchResult`` objects, highest score first.
        Errors are logged and yield whatever was collected so far.
    """
    found = []

    def pick(available, preferred, fallback):
        # First preferred column name present in this file, else the fallback.
        for candidate in preferred:
            if candidate in available:
                return candidate
        return fallback

    try:
        # Try the new file naming first, then fall back to the older names.
        scope = state if state else "*"
        meeting_files = []
        for filename in ("events.parquet", "events_events.parquet", "meetings.parquet"):
            meeting_files = list(GOLD_DIR.glob(f"states/{scope}/{filename}"))
            if meeting_files:
                break

        # Only the first five files are scanned, to bound the work per request.
        for parquet_path in meeting_files[:5]:
            try:
                frame = pd.read_parquet(parquet_path)
                state_code = parquet_path.parent.name

                # Different sources (LocalView, CityScrapers, ...) use different
                # column names, so resolve the schema per file.
                present = set(frame.columns)
                title_col = pick(present, ('vid_title', 'event_title'), 'title')
                body_col = pick(present, ('caption_text_clean', 'caption_text', 'full_text'), 'body')
                jurisdiction_col = pick(present, ('place_name', 'jurisdiction_name'), 'jurisdiction')
                date_col = pick(present, ('meeting_date',), 'date')
                id_col = pick(present, ('vid_id', 'meeting_id'), 'id')

                for _, record in frame.iterrows():
                    title = str(record.get(title_col, ''))
                    body = str(record.get(body_col, ''))[:500]  # only the first 500 chars are searched
                    jurisdiction = str(record.get(jurisdiction_col, ''))
                    meeting_date = str(record.get(date_col, ''))
                    meeting_id = str(record.get(id_col, ''))

                    relevance = max(
                        calculate_relevance_score(title, query),
                        calculate_relevance_score(body, query) * 0.8,
                        calculate_relevance_score(jurisdiction, query) * 0.6,
                    )
                    if not relevance > 0.3:
                        continue

                    # The snippet is the leading 200 characters of the body.
                    if len(body) > 200:
                        snippet = body[:200] + "..."
                    else:
                        snippet = body

                    found.append(SearchResult(
                        result_type="meeting",
                        title=title,
                        subtitle=f"{jurisdiction}, {state_code} - {meeting_date}",
                        description=snippet,
                        url=f"/documents?meeting_id={meeting_id}",
                        score=relevance,
                        metadata={
                            "jurisdiction": jurisdiction,
                            "state": state_code,
                            "date": meeting_date,
                            "meeting_id": meeting_id,
                        },
                    ))
            except Exception as e:
                # A single unreadable file must not abort the whole search.
                logger.debug(f"Error searching {parquet_path}: {e}")

    except Exception as e:
        logger.error(f"Meeting search error: {e}")

    ranked = sorted(found, key=lambda item: item.score, reverse=True)
    return ranked[:limit]
|
| 821 |
+
|
| 822 |
+
|
| 823 |
+
def count_organizations(state: Optional[str] = None, ntee_code: Optional[str] = None, query: Optional[str] = None) -> int:
    """Count organizations matching the given filters (used for pagination).

    Results are memoized in the module-level ``_count_cache`` /
    ``_count_cache_ttl`` dictionaries for one hour per filter combination.

    Args:
        state: When set, count within that state's parquet file; otherwise
            the national file is used.
        ntee_code: Optional NTEE code prefix filter (applied only when the
            file has an NTEE column).
        query: Optional case-insensitive substring match on the name column.

    Returns:
        The number of matching rows, or 0 when the parquet file is missing
        or the query fails (the error is logged).
    """
    cache_key = f"count_{state}_{ntee_code}_{query}"

    # Serve from cache while the entry is younger than one hour.
    now = datetime.now()
    if cache_key in _count_cache:
        cached_time = _count_cache_ttl.get(cache_key)
        if cached_time and (now - cached_time).total_seconds() < 3600:
            return _count_cache[cache_key]

    conn = None
    try:
        if state:
            file_path = Path(f"{GOLD_DIR}/states/{state}/nonprofits_organizations.parquet")
        else:
            file_path = Path(f"{GOLD_DIR}/national/nonprofits_organizations.parquet")

        if not file_path.exists():
            return 0

        # The path is embedded as a SQL string literal, so double any single
        # quote to keep the statement well-formed.
        source_sql = str(file_path).replace("'", "''")

        conn = duckdb.connect()

        # Detect which naming scheme this file uses.
        columns_query = f"DESCRIBE SELECT * FROM '{source_sql}' LIMIT 0"
        available_columns = {row[0] for row in conn.execute(columns_query).fetchall()}
        name_col = 'organization_name' if 'organization_name' in available_columns else 'name'
        ntee_col = 'ntee_code' if 'ntee_code' in available_columns else 'ntee_cd'

        where_clauses = []
        params = []

        if query and query.strip():
            where_clauses.append(f"LOWER({name_col}) LIKE LOWER(?)")
            params.append(f'%{query}%')

        if ntee_code and ntee_col in available_columns:
            where_clauses.append(f"{ntee_col} LIKE ?")
            params.append(f'{ntee_code}%')

        where_sql = " AND ".join(where_clauses) if where_clauses else "TRUE"

        # Count against the same file the schema was read from. (The previous
        # code referenced an undefined ``data_source`` here, so every call
        # fell into the except branch and reported 0.)
        count_sql = f"SELECT COUNT(*) FROM '{source_sql}' WHERE {where_sql}"
        result = conn.execute(count_sql, params).fetchone()
        count = result[0] if result else 0

        # Only successful counts are cached.
        _count_cache[cache_key] = count
        _count_cache_ttl[cache_key] = now

        return count
    except Exception as e:
        logger.error(f"Count error: {e}")
        return 0
    finally:
        # Release the connection on every path, including failures.
        if conn is not None:
            conn.close()
|
| 883 |
+
|
| 884 |
+
|
| 885 |
+
def search_organizations(query: str, state: Optional[str] = None, ntee_code: Optional[str] = None, limit: int = 10, offset: int = 0, enrich: bool = False, sort: str = 'relevance', ein: Optional[str] = None) -> List[SearchResult]:
    """Search nonprofit organizations using DuckDB over a cached DataFrame.

    With a non-blank ``query`` the function runs in search mode (rows are
    scored by how the name matches); without one it runs in browse mode
    (every row gets score 1.0 and rows are ordered by size unless ``sort``
    says otherwise).

    Args:
        query: Case-insensitive substring to match against the name column.
            Ignored for filtering when ``ein`` is given.
        state: When set, the state-specific parquet file is used instead of
            the national one.
        ntee_code: Optional NTEE code prefix filter.
        limit: Maximum number of rows returned.
        offset: Number of rows skipped (pagination).
        enrich: If True, fetch additional data via ``get_enrichment_data``
            for each row (slower).
        sort: Sort order - 'relevance', 'name-asc', 'name-desc',
            'revenue-asc', 'revenue-desc', 'assets-asc', 'assets-desc'.
        ein: If provided, filter to an exact EIN match (for direct
            organization links).

    Returns:
        A list of ``SearchResult`` objects. On any error the exception is
        logged and the results collected so far (possibly none) are returned.
    """
    results = []
    conn = None

    try:
        if state:
            file_pattern = f"{GOLD_DIR}/states/{state}/nonprofits_organizations.parquet"
        else:
            file_pattern = f"{GOLD_DIR}/national/nonprofits_organizations.parquet"

        # Resolve to a local path or a remote HuggingFace URL, then load it
        # through the parquet cache.
        file_path = Path(file_pattern)
        data_source = get_data_source(file_path, use_remote=IS_HF_SPACES)

        # NOTE: the SQL below says ``FROM df``; DuckDB resolves that name to
        # this local DataFrame, so the variable must stay named ``df`` and
        # live in the same function that calls ``conn.execute``.
        df = load_parquet_cached(data_source)

        conn = duckdb.connect()

        available_columns = set(df.columns)
        has_query = bool(query and query.strip())

        # Column names differ between schemas; pick whichever is present.
        name_col = 'organization_name' if 'organization_name' in available_columns else 'name'
        ntee_col = 'ntee_code' if 'ntee_code' in available_columns else 'ntee_cd'
        revenue_col = 'form_990_total_revenue' if 'form_990_total_revenue' in available_columns else 'revenue_amt'
        asset_col = 'form_990_total_assets' if 'form_990_total_assets' in available_columns else 'asset_amt'
        income_col = 'form_990_net_income' if 'form_990_net_income' in available_columns else 'income_amt'

        # ---- WHERE clause -------------------------------------------------
        where_clauses = []
        params = []

        # Exact EIN match takes priority over the text query.
        if ein and ein.strip():
            where_clauses.append("ein = ?")
            params.append(ein.strip())

        if has_query and not ein:
            where_clauses.append(f"LOWER({name_col}) LIKE LOWER(?)")
            params.append(f'%{query}%')

        # State filter for the national file.
        # NOTE(review): when ``state`` is set, ``file_pattern`` always starts
        # with the states/ prefix, so this branch looks unreachable as
        # written - confirm the intent before relying on it.
        if state and not file_pattern.startswith(f"{GOLD_DIR}/states/"):
            where_clauses.append("state = ?")
            params.append(state)

        if ntee_code and ntee_col in available_columns:
            where_clauses.append(f"{ntee_col} LIKE ?")
            params.append(f'{ntee_code}%')

        # No filters at all means "browse everything".
        where_sql = " AND ".join(where_clauses) if where_clauses else "TRUE"

        # ---- SELECT list --------------------------------------------------
        # Nine base columns, aliased to stable names; columns missing from
        # the file are selected as NULL so row positions never shift.
        select_columns = [
            f'{name_col} as name',
            'city',
            'state',
            f'{ntee_col} as ntee_cd' if ntee_col in available_columns else 'NULL as ntee_cd',
            'ein',
            f'{revenue_col} as revenue_amt' if revenue_col in available_columns else 'NULL as revenue_amt',
            f'{asset_col} as asset_amt' if asset_col in available_columns else 'NULL as asset_amt',
            f'{income_col} as income_amt' if income_col in available_columns else 'NULL as income_amt',
            'tax_period' if 'tax_period' in available_columns else 'NULL as tax_period',
        ]

        # Optional enrichment columns: for each alias, the first candidate
        # column present in the file is selected. ``enrichment_cols`` records
        # the aliases in SELECT order, which is the order they appear in
        # each result row after the nine base columns.
        enrichment_candidates = (
            ('enrichment_website', 'website',
             ('bigquery_website', 'form_990_website', 'website', 'everyorg_website')),
            ('enrichment_mission', 'mission',
             ('bigquery_mission', 'form_990_mission', 'mission', 'everyorg_mission')),
            ('enrichment_logo', 'logo',
             ('logodev_logo_url', 'everyorg_logo_url', 'logo_url')),
            ('enrichment_last_updated', 'timestamp',
             ('bigquery_updated_date', 'form_990_last_updated', 'everyorg_last_updated')),
        )
        enrichment_cols = []
        for alias, label, candidates in enrichment_candidates:
            for col_name in candidates:
                if col_name in available_columns:
                    select_columns.append(f'{col_name} as {alias}')
                    enrichment_cols.append(alias)
                    logger.debug(f"Added {label} column: {col_name}")
                    break

        columns_sql = ', '.join(select_columns)

        logger.info(f"🔍 Enrichment columns to select: {enrichment_cols}")
        logger.info(f"📋 Full SQL columns: {columns_sql}")

        # ---- ORDER BY -----------------------------------------------------
        def high_to_low(column):
            return f"COALESCE(TRY_CAST({column} AS BIGINT), 0) DESC"

        def low_to_high(column):
            # Positive values first (smallest to largest), then zero or
            # negative values, then values that cannot be cast.
            return f"""
                CASE
                    WHEN TRY_CAST({column} AS BIGINT) IS NULL THEN 3
                    WHEN TRY_CAST({column} AS BIGINT) <= 0 THEN 2
                    ELSE 1
                END ASC,
                ABS(COALESCE(TRY_CAST({column} AS BIGINT), 0)) ASC
            """

        order_by_clauses = []
        if sort == 'name-asc':
            order_by_clauses.append(f"{name_col} ASC")
        elif sort == 'name-desc':
            order_by_clauses.append(f"{name_col} DESC")
        elif sort == 'revenue-desc':
            order_by_clauses.append(high_to_low(revenue_col))
        elif sort == 'revenue-asc':
            order_by_clauses.append(low_to_high(revenue_col))
        elif sort == 'assets-desc':
            order_by_clauses.append(high_to_low(asset_col))
        elif sort == 'assets-asc':
            order_by_clauses.append(low_to_high(asset_col))
        elif has_query:
            # Relevance sort (search mode only), revenue as tie-breaker.
            order_by_clauses.append("score DESC")
            order_by_clauses.append(high_to_low(revenue_col))
        else:
            # Default browse mode: largest organizations first.
            order_by_clauses.append(high_to_low(revenue_col))
            order_by_clauses.append(high_to_low(asset_col))

        # Name is the final tie-breaker unless it is already the sort key.
        if 'name' not in sort:
            order_by_clauses.append(f"{name_col}")

        order_by_sql = ', '.join(order_by_clauses)

        # ---- Final statement ----------------------------------------------
        if has_query:
            # Prefix match scores highest, substring match next.
            score_sql = f"""CASE
                    WHEN LOWER({name_col}) LIKE LOWER(?) THEN 1.5
                    WHEN LOWER({name_col}) LIKE LOWER(?) THEN 1.0
                    ELSE 0.5
                END"""
            score_params = [f'{query}%', f'%{query}%']
        else:
            score_sql = "1.0"
            score_params = []

        sql = f"""
            SELECT
                {columns_sql},
                {score_sql} as score
            FROM df
            WHERE {where_sql}
            ORDER BY {order_by_sql}
            LIMIT ? OFFSET ?
        """
        # Positional parameters are bound in textual order: score
        # placeholders (SELECT), filters (WHERE), then LIMIT/OFFSET.
        query_params = score_params + params + [limit, offset]

        rows = conn.execute(sql, query_params).fetchall()

        # Human-readable labels for a handful of NTEE codes.
        ntee_descriptions = {
            'E': 'Health Services',
            'E60': 'Health Support Services',
            'E61': 'Blood Supply',
            'E62': 'Emergency Medical Services',
            'E65': 'Organ & Tissue Banks',
            'E70': 'Public Health',
            'E80': 'Health Treatment - Primary Care',
            'E90': 'Nursing Services',
            'E20': 'Hospitals & Primary Medical Care',
            'E30': 'Ambulatory & Primary Health Care',
            'E32': 'Clinics & Community Health Centers',
            'P': 'Human Services',
            'B': 'Education',
            'X': 'Religion-Related',
            'A': 'Arts, Culture & Humanities',
        }
        filler_words = {'the', 'a', 'an', 'of', 'for', 'and', 'inc', 'llc'}

        for row in rows:
            # ``org_ein`` (not ``ein``) so the function parameter is not
            # shadowed by the per-row value.
            org_name, city, state_code, ntee, org_ein, revenue, assets, income, tax_period = row[:9]

            # Enrichment values follow the base columns in SELECT order.
            existing_data = dict(zip(enrichment_cols, row[9:9 + len(enrichment_cols)]))
            website = existing_data.get('enrichment_website')
            if website and str(website) != 'nan':
                logger.debug(f"✅ Website: {website}")

            score = row[-1]  # score is always the last selected column

            # tax_period is expected as YYYYMM; keep only the year.
            tax_year = None
            if tax_period and str(tax_period).isdigit() and len(str(tax_period)) >= 4:
                tax_year = int(str(tax_period)[:4])

            # API enrichment only when explicitly requested.
            enrichment = get_enrichment_data(org_ein, existing_data) if (org_ein and enrich) else {}

            # Exact NTEE code first, then its major category letter.
            ntee_desc = None
            if ntee:
                ntee_desc = ntee_descriptions.get(ntee) or ntee_descriptions.get(ntee[0])

            # Prefer the enriched mission as the description.
            description = enrichment.get('mission') or None

            # Guard against stale missions that describe a different
            # organization, e.g. "Catalyst Institute is a nonprofit ..."
            # attached to "CAREQUEST INSTITUTE".
            if description and org_name:
                first_sentence = description.split('.')[0].lower()
                if ' is a nonprofit' in first_sentence or ' is an nonprofit' in first_sentence:
                    subject = first_sentence.split(' is a')[0].strip()
                    significant_words = set(subject.split()) - filler_words
                    name_significant = set(org_name.lower().split()) - (filler_words | {'institute'})

                    # No shared significant word: treat the mission as stale.
                    if significant_words and not (significant_words & name_significant):
                        logger.warning(f"Stale mission data for {org_name}: '{subject}' != '{org_name}'")
                        description = None

            if not description:
                # Fallback description: NTEE label plus one financial figure.
                description_parts = []
                if ntee_desc:
                    description_parts.append(ntee_desc)

                try:
                    revenue_num = float(revenue) if revenue else 0
                    assets_num = float(assets) if assets else 0
                except (ValueError, TypeError):
                    revenue_num = 0
                    assets_num = 0

                if revenue_num > 0:
                    description_parts.append(f"Revenue: ${revenue_num:,.0f}")
                elif assets_num > 0:
                    description_parts.append(f"Assets: ${assets_num:,.0f}")

                if not description_parts:
                    description_parts.append(f"Nonprofit serving {city}")

                description = " • ".join(description_parts)

            metadata = {
                "ein": org_ein,
                "city": city,
                "state": state_code,
                "ntee_code": ntee,
                "revenue": revenue,
                "assets": assets,
                "income": income,
                "tax_year": tax_year,
                "data_sources": []
            }

            # Values already stored in the parquet file are always exposed,
            # independent of the ``enrich`` flag.
            for alias, meta_key in (
                ('enrichment_website', 'website'),
                ('enrichment_mission', 'mission'),
                ('enrichment_logo', 'logo_url'),
            ):
                cached_value = existing_data.get(alias)
                if cached_value:
                    metadata[meta_key] = cached_value
                    if 'cached' not in metadata['data_sources']:
                        metadata['data_sources'].append('cached')

            # API enrichment (only populated when enrich=True).
            if enrichment:
                if enrichment.get('website') and not metadata.get('website'):
                    metadata['website'] = enrichment['website']
                if enrichment.get('logo_url'):
                    metadata['logo_url'] = enrichment['logo_url']
                if enrichment.get('profile_url'):
                    metadata['profile_url'] = enrichment['profile_url']
                if enrichment.get('causes'):
                    metadata['causes'] = enrichment['causes']
                for source in enrichment.get('data_sources', []):
                    if source not in metadata['data_sources']:
                        metadata['data_sources'].append(source)

            results.append(SearchResult(
                result_type="organization",
                title=org_name if org_name else "Unknown",
                subtitle=f"{city}, {state_code}" + (f" - NTEE: {ntee}" if ntee else ""),
                description=description,
                url=f"/search?types=organizations&state={state_code}&ein={org_ein}",
                score=score,
                metadata=metadata
            ))

        logger.info(f"DuckDB search found {len(results)} organizations for query '{query}'")

    except Exception as e:
        logger.error(f"Organization search error: {e}")
    finally:
        # Close the connection on every path; previously it leaked whenever
        # the query or result conversion raised.
        if conn is not None:
            conn.close()

    return results
|
| 1268 |
+
|
| 1269 |
+
|
| 1270 |
+
def search_causes(query: str, limit: int = 10) -> List[SearchResult]:
    """Search (or browse) NTEE cause categories.

    The NTEE reference table is loaded through ``load_parquet_cached``. With
    a non-blank ``query`` each row is scored on its description and code and
    rows scoring 0.3 or less are dropped; with no query every row is
    returned with a score of 1.0.

    Args:
        query: Search text; falsy or blank selects browse mode.
        limit: Maximum number of results returned.

    Returns:
        Up to ``limit`` ``SearchResult`` objects, highest score first.
        Errors are logged and yield whatever was collected so far.
    """
    matches = []

    try:
        # Resolve to a local path or remote HuggingFace URL, then load it
        # through the parquet cache.
        source = get_data_source(
            GOLD_DIR / "reference" / "causes_ntee_codes.parquet",
            use_remote=IS_HF_SPACES,
        )
        ntee_frame = load_parquet_cached(source)
        logger.debug(f"Loaded {len(ntee_frame)} NTEE codes from cache")

        for _, record in ntee_frame.iterrows():
            code = str(record.get('ntee_code', ''))
            label = str(record.get('description', ''))
            category = str(record.get('ntee_type', ''))

            relevance = 1.0  # browse mode default
            if query and query.strip():
                relevance = max(
                    calculate_relevance_score(label, query),
                    calculate_relevance_score(code, query),
                )
                if relevance <= 0.3:
                    continue  # not relevant enough to show

            matches.append(SearchResult(
                result_type="cause",
                title=label,
                subtitle=f"NTEE Code: {code}",
                description=f"Category type: {category}",
                url=f"/nonprofits?ntee_code={code}",
                score=relevance,
                metadata={
                    "ntee_code": code,
                    "ntee_type": category,
                },
            ))

        logger.info(f"Found {len(matches)} cause results for query '{query}'")

    except Exception as e:
        logger.error(f"Cause search error: {e}")

    ranked = sorted(matches, key=lambda item: item.score, reverse=True)
    return ranked[:limit]
|
| 1320 |
+
|
| 1321 |
+
|
| 1322 |
+
def search_jurisdictions(query: str, state: Optional[str] = None, city: Optional[str] = None, jurisdiction_levels: Optional[List[str]] = None, limit: int = 10, offset: int = 0) -> List[SearchResult]:
    """Search cities, counties, townships, and school districts using DuckDB.

    Each jurisdiction type lives in its own reference parquet file and has a
    base score; an exact name match doubles it and a prefix match multiplies
    it by 1.5. Results from all selected types are merged, ordered by score
    (highest first) and then by name, and finally paginated.

    Args:
        query: Name text to match (case-insensitive substring). When falsy,
            rows are not filtered by name.
        state: Optional value matched exactly against the ``state`` column.
        city: When given together with ``query``, the name filter uses the
            city text instead of the query text.
        jurisdiction_levels: Optional frontend level IDs ('city', 'county',
            'town', 'village', 'school_district', 'special_district',
            'state') restricting which files are searched.
        limit: Page size.
        offset: Number of merged results to skip.

    Returns:
        The requested page of ``SearchResult`` objects. Failures are logged;
        a failing type is skipped rather than aborting the search.
    """
    all_results = []
    conn = None

    try:
        conn = duckdb.connect()

        # Frontend level IDs -> keys of ``jurisdiction_files``.
        level_mapping = {
            'city': 'city',
            'county': 'county',
            'town': 'township',
            'village': 'township',
            'school_district': 'school district',
            'special_district': 'school district',  # school districts used as a proxy
            'state': None  # no file-backed search for states
        }

        # (parquet path, base score) per jurisdiction type.
        jurisdiction_files = {
            'county': (f"{GOLD_DIR}/reference/jurisdictions_counties.parquet", 1.3),  # boosted
            'city': (f"{GOLD_DIR}/reference/jurisdictions_cities.parquet", 1.0),
            'school district': (f"{GOLD_DIR}/reference/jurisdictions_school_districts.parquet", 1.1),  # boosted
            'township': (f"{GOLD_DIR}/reference/jurisdictions_townships.parquet", 0.9)
        }

        # Restrict to the requested levels; unknown levels are ignored, and
        # if nothing maps to a file all types are searched.
        if jurisdiction_levels:
            mapped = (level_mapping.get(level) for level in jurisdiction_levels)
            selected_file_keys = {file_key for file_key in mapped if file_key}
            if selected_file_keys:
                jurisdiction_files = {
                    k: v for k, v in jurisdiction_files.items()
                    if k in selected_file_keys
                }

        # Fetch at least 15 rows per type for diversity, and always enough
        # to cover the requested page: the merged top (offset + limit) rows
        # can only come from each type's own top (offset + limit) rows.
        per_type_limit = max(limit + offset, 15)

        for jtype, (file_path, type_score) in jurisdiction_files.items():
            file_path_obj = Path(file_path)
            if not file_path_obj.exists():
                continue

            try:
                where_clauses = []
                where_params = []

                if state:
                    where_clauses.append("state = ?")
                    where_params.append(state)

                # With both city and query, names are filtered by the city
                # text; with only a query, by the query text.
                name_filter = city if (city and query) else query
                if name_filter:
                    where_clauses.append("LOWER(NAME) LIKE LOWER(?)")
                    where_params.append(f"%{name_filter}%")

                where_clause = " AND ".join(where_clauses) if where_clauses else "1=1"

                # The query text is bound as a parameter rather than
                # interpolated, so quotes in user input cannot alter the SQL.
                score_expr = f"{type_score}"
                score_params = []
                if query:
                    score_expr = f"""CASE
                        WHEN LOWER(NAME) = LOWER(?) THEN {type_score} * 2.0
                        WHEN LOWER(NAME) LIKE LOWER(?) THEN {type_score} * 1.5
                        ELSE {type_score}
                    END"""
                    score_params = [query, f"{query}%"]

                sql = f"""
                    SELECT
                        NAME as name,
                        state,
                        GEOID as geoid,
                        jurisdiction_type,
                        {score_expr} as score
                    FROM read_parquet(?)
                    WHERE {where_clause}
                    ORDER BY score DESC, NAME ASC
                    LIMIT ?
                """

                # Positional parameters are bound in textual order: score
                # placeholders, file path, filters, then the limit.
                query_params = score_params + [str(file_path_obj)] + where_params + [per_type_limit]
                df = conn.execute(sql, query_params).fetchdf()

                for _, row in df.iterrows():
                    jurisdiction_label = row['jurisdiction_type'].replace('_', ' ').title()
                    all_results.append(SearchResult(
                        result_type='jurisdiction',
                        title=f"{row['name']}",
                        subtitle=f"{jurisdiction_label}",
                        description=f"{jurisdiction_label} in {row['state']}",
                        url=f"/jurisdictions/{row['geoid']}",
                        score=float(row['score']),
                        metadata={
                            'state': row['state'],
                            'geoid': row['geoid'],
                            'type': row['jurisdiction_type']
                        }
                    ))

            except Exception as e:
                logger.error(f"Error searching {jtype} jurisdictions: {e}")
                continue

    except Exception as e:
        logger.error(f"Jurisdiction search error: {e}")
    finally:
        # The connection was previously never closed.
        if conn is not None:
            conn.close()

    # Highest score first; ties broken by name ascending, matching the
    # per-type SQL ordering (a plain reverse sort would flip names to Z-A).
    all_results.sort(key=lambda x: (-x.score, x.title))
    return all_results[offset:offset + limit]
|
| 1444 |
+
|
| 1445 |
+
|
| 1446 |
+
@router.get("/api/search")
@router.get("/api/search/", include_in_schema=False)
async def unified_search(
    q: Optional[str] = Query(None, description="Search query (optional - browse by filters if omitted)"),
    types: Optional[str] = Query(None, description="Comma-separated result types: contacts,meetings,organizations,causes,jurisdictions"),
    state: Optional[str] = Query(None, description="Filter by state (2-letter code)"),
    city: Optional[str] = Query(None, description="Filter by city name"),
    jurisdiction_levels: Optional[str] = Query(None, description="Comma-separated jurisdiction levels: city,county,town,village,school_district,special_district,state"),
    ntee_code: Optional[str] = Query(None, description="Filter organizations by NTEE code"),
    ein: Optional[str] = Query(None, description="Filter organizations by exact EIN (for direct organization links)"),
    limit: int = Query(20, ge=1, le=100, description="Maximum results per type"),
    offset: int = Query(0, ge=0, description="Number of results to skip (for pagination)"),
    page: int = Query(1, ge=1, description="Page number (alternative to offset)"),
    enrich: bool = Query(False, description="Enable API enrichment (slower - fetches logos, causes from Every.org)"),
    sort: str = Query('relevance', description="Sort order: relevance, name-asc, name-desc, revenue-asc, revenue-desc, assets-asc, assets-desc")
):
    """
    Unified search across all data types

    Search for contacts, meetings, organizations, and causes in one query.
    **NEW:** Query is now optional - you can browse by state/type without searching!

    **Pagination:**
    - Use `offset` to skip results: `offset=20` skips first 20 results
    - Or use `page` with `limit`: `page=2&limit=20` gets results 21-40
    - `offset` takes precedence if both are provided

    **Examples:**
    - `/api/search?q=dental` - Search everything for "dental"
    - `/api/search?types=organizations&state=GA` - Browse all orgs in Georgia
    - `/api/search?q=budget&types=meetings` - Search only meetings
    - `/api/search?q=health&state=AL` - Search in Alabama only
    - `/api/search?q=education&types=organizations,causes` - Search orgs and causes
    - `/api/search?q=health&state=MA&page=2&limit=20` - Page 2 of MA health results
    """
    # Implementation notes:
    #   * Each requested type is fetched from its own backend, the hits are
    #     merged, sorted by score and grouped by type in the response.
    #   * On any exception a structured 500 JSONResponse is returned instead
    #     of propagating the error.

    # 🔍 DEBUG LOGGING - Log all incoming request parameters
    logger.info(f"🔍 SEARCH REQUEST: q={q!r}, types={types!r}, state={state!r}, city={city!r}, jurisdiction_levels={jurisdiction_levels!r}, ntee_code={ntee_code!r}, ein={ein!r}, limit={limit}, offset={offset}, page={page}, enrich={enrich}, sort={sort!r}")

    try:
        # Calculate offset from page if offset not explicitly provided
        if offset == 0 and page > 1:
            offset = (page - 1) * limit

        # Parse requested types; empty tokens (e.g. a trailing comma) are
        # dropped so "organizations," still counts as a single-type request.
        if types:
            requested_types = [t.strip() for t in types.split(',') if t.strip()]
        else:
            requested_types = ['contacts', 'meetings', 'organizations', 'causes', 'jurisdictions']

        # Parse jurisdiction levels if provided
        jurisdiction_levels_list = None
        if jurisdiction_levels:
            jurisdiction_levels_list = [level.strip() for level in jurisdiction_levels.split(',')]

        logger.info(f"📋 Requested types: {requested_types}, calculated offset: {offset}")

        all_results = []

        # Optimize for single-type browse mode (no query): let the database
        # handle pagination. Only the organization and jurisdiction backends
        # are called with an ``offset`` below; for every other type the offset
        # must be applied in memory, otherwise each page would repeat page 1.
        db_paginated_types = {'organizations', 'jurisdictions'}
        use_db_pagination = (
            not q
            and len(requested_types) == 1
            and requested_types[0] in db_paginated_types
        )

        if use_db_pagination:
            # Single-type browse: pass offset to DB for efficient pagination
            search_limit = limit
            search_offset = offset
        else:
            # Multi-type or search mode: fetch extra for mixing/sorting
            search_limit = offset + limit + 100
            search_offset = 0

        if 'contacts' in requested_types:
            # Use PostgreSQL for fast indexed search
            contact_results_pg = await search_postgres.search_contacts_pg(q, state, limit=search_limit)
            contact_results = [convert_pg_result(r) for r in contact_results_pg]
            logger.info(f"👤 Contacts search returned {len(contact_results)} results")
            all_results.extend(contact_results)

        if 'meetings' in requested_types:
            # Use PostgreSQL for fast indexed search
            meeting_results_pg = await search_postgres.search_events_pg(q, state, limit=search_limit)
            meeting_results = [convert_pg_result(r) for r in meeting_results_pg]
            logger.info(f"📅 Meetings search returned {len(meeting_results)} results")
            all_results.extend(meeting_results)

        if 'organizations' in requested_types:
            # Use PostgreSQL for fast indexed search
            org_results_pg = await search_postgres.search_organizations_pg(q, state, ntee_code, ein, limit=search_limit, offset=search_offset, sort=sort)
            org_results = [convert_pg_result(r) for r in org_results_pg]
            logger.info(f"🏢 Organizations search returned {len(org_results)} results")
            all_results.extend(org_results)

        if 'causes' in requested_types:
            cause_results = search_causes(q or "", limit=search_limit)
            logger.info(f"🎯 Causes search returned {len(cause_results)} results")
            all_results.extend(cause_results)

        if 'jurisdictions' in requested_types:
            # Use PostgreSQL for fast indexed search
            jurisdiction_results_pg = await search_postgres.search_jurisdictions_pg(q, state, city, jurisdiction_levels_list, limit=search_limit, offset=search_offset)
            jurisdiction_results = [convert_pg_result(r) for r in jurisdiction_results_pg]
            logger.info(f"🏛️ Jurisdictions search returned {len(jurisdiction_results)} results")
            all_results.extend(jurisdiction_results)

        # Sort all results by score (stable: ties keep backend order)
        all_results.sort(key=lambda x: x.score, reverse=True)

        logger.info(f"📊 Total combined results: {len(all_results)}, applying pagination (offset={offset}, limit={limit})")

        # Apply pagination
        if use_db_pagination:
            # DB already paginated - use all results
            paginated_results = all_results
        else:
            # Paginate in-memory from combined results
            paginated_results = all_results[offset:offset + limit]

        logger.info(f"✂️ Paginated results: {len(paginated_results)} items")

        # Response key -> result_type tag carried by each hit
        type_tags = {
            'contacts': 'contact',
            'meetings': 'meeting',
            'organizations': 'organization',
            'causes': 'cause',
            'jurisdictions': 'jurisdiction',
        }

        # Group by type for response
        grouped_results = {
            key: [r.to_dict() for r in paginated_results if r.result_type == tag]
            for key, tag in type_tags.items()
        }

        logger.info(f"📦 Grouped results - contacts:{len(grouped_results['contacts'])}, meetings:{len(grouped_results['meetings'])}, organizations:{len(grouped_results['organizations'])}, causes:{len(grouped_results['causes'])}, jurisdictions:{len(grouped_results['jurisdictions'])}")

        # Calculate total results per type (from all_results before pagination)
        type_totals = {
            key: sum(1 for r in all_results if r.result_type == tag)
            for key, tag in type_tags.items()
        }

        # Calculate total results
        # For single-type browse mode, get accurate count from database
        if not q and len(requested_types) == 1:
            # Browse mode: count total matching records in DB
            if 'organizations' in requested_types:
                total_results = count_organizations(state=state, ntee_code=ntee_code, query=q)
                type_totals['organizations'] = total_results  # Use accurate DB count
            else:
                # Fallback to fetched results for other types.
                # NOTE: for DB-paginated jurisdiction browsing this is only the
                # size of the current page, so total_pages/has_next are
                # underestimated until a DB count is available.
                total_results = len(all_results)
        else:
            # Search/multi-type mode: use fetched results
            total_results = len(all_results)

        total_pages = (total_results + limit - 1) // limit  # Ceiling division

        response_data = {
            "query": q or "",
            "total_results": total_results,
            "type_totals": type_totals,  # Add per-type totals
            "results": grouped_results,
            "pagination": {
                # Report the caller's page unless an explicit offset disagrees with it
                "page": page if offset == 0 or offset == (page - 1) * limit else (offset // limit) + 1,
                "limit": limit,
                "offset": offset,
                "total_pages": total_pages,
                "has_next": offset + limit < total_results,
                "has_prev": offset > 0
            },
            "filters": {
                "state": state,
                "ntee_code": ntee_code,
                "types": requested_types,
                "sort": sort
            }
        }

        logger.info(f"✅ Search complete - returning {total_results} total results, {len(paginated_results)} on this page")
        return response_data

    except Exception as e:
        logger.error(f"❌ Search error: {type(e).__name__}: {e}")
        logger.exception("Full traceback:")

        # Parse error into structured response
        error_detail = parse_error(e, context={
            "query": q,
            "state": state,
            "types": types,
            "data_type": "search"
        })

        return JSONResponse(
            status_code=500,
            content=error_detail.model_dump()
        )
|
| 1641 |
+
|
| 1642 |
+
|
| 1643 |
+
@router.get("/api/search/suggest")
async def search_suggestions(
    q: str = Query(..., min_length=1, description="Partial search query"),
    limit: int = Query(5, ge=1, le=20, description="Maximum suggestions")
):
    """
    Get search suggestions/autocomplete

    Returns quick suggestions as user types
    """
    # Suggestions come from a fixed vocabulary; a term is offered when the
    # lower-cased query occurs anywhere inside it. At most ``limit`` terms are
    # returned, in vocabulary order.
    try:
        needle = q.lower()

        # Common search terms
        vocabulary = (
            "dental health", "oral health", "affordable housing", "public transit",
            "school funding", "budget", "water quality", "parks", "zoning",
            "police", "fire department", "mental health", "food assistance",
            "senior services", "youth programs", "employment", "job training",
        )

        matches = []
        for phrase in vocabulary:
            if needle in phrase.lower():
                matches.append(phrase)

        return {"query": q, "suggestions": matches[:limit]}

    except Exception as e:
        logger.error(f"Suggestion error: {e}")

        # Turn the exception into the structured error payload
        context = {"query": q, "data_type": "suggestions"}
        error_detail = parse_error(e, context=context)

        return JSONResponse(status_code=500, content=error_detail.model_dump())
|
api/routes/search_postgres.py
ADDED
|
@@ -0,0 +1,535 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PostgreSQL-based search functions
|
| 3 |
+
Uses indexed search tables for fast queries (10-100x faster than parquet)
|
| 4 |
+
"""
|
| 5 |
+
from typing import Optional, List
|
| 6 |
+
from loguru import logger
|
| 7 |
+
import asyncpg
|
| 8 |
+
import os
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
from dataclasses import dataclass
|
| 11 |
+
|
| 12 |
+
# Database configuration
# Two environment variables may carry a connection string; the development
# one (NEON_DATABASE_URL_DEV) wins over the production one (NEON_DATABASE_URL).
NEON_DATABASE_URL_DEV = os.environ.get('NEON_DATABASE_URL_DEV')
NEON_DATABASE_URL = os.environ.get('NEON_DATABASE_URL')

# Effective connection string: dev when set (and non-empty), otherwise production
DATABASE_URL = NEON_DATABASE_URL if not NEON_DATABASE_URL_DEV else NEON_DATABASE_URL_DEV

# Shared connection pool; stays None until get_db_pool() creates it on first use
_db_pool = None
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@dataclass
class SearchResult:
    """One hit returned by the PostgreSQL search functions in this module."""

    # Kind of hit, e.g. 'jurisdiction', 'contact' or 'organization'
    result_type: str
    # Primary display text (the matched record's name)
    title: str
    # Secondary display line shown with the title
    subtitle: str
    # Longer human-readable summary of the record
    description: str
    # Relative link to the record's detail page
    url: str
    # Relevance used when merging hits; 1.0 when no ranking was computed
    score: float
    # Raw column values of the record, keyed by field name
    metadata: dict
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
async def get_db_pool():
    """
    Return the module-wide asyncpg pool, creating it on the first call.

    Returns:
        The pool stored in the module-level ``_db_pool``.

    Raises:
        ValueError: if no database URL is configured when the pool must be created.
    """
    global _db_pool

    # Fast path: the pool already exists
    if _db_pool is not None:
        return _db_pool

    if not DATABASE_URL:
        raise ValueError("DATABASE_URL not configured")

    # Label is for the log line only; it mirrors which variable supplied the URL
    if NEON_DATABASE_URL_DEV:
        db_type = "Development (Local PostgreSQL)"
    else:
        db_type = "Production (Neon)"
    logger.info(f"🗄️ Creating connection pool to {db_type}")

    _db_pool = await asyncpg.create_pool(DATABASE_URL, min_size=2, max_size=20)
    return _db_pool
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
async def search_jurisdictions_pg(
    query: Optional[str] = None,
    state: Optional[str] = None,
    city: Optional[str] = None,
    jurisdiction_levels: Optional[List[str]] = None,
    limit: int = 10,
    offset: int = 0
) -> List[SearchResult]:
    """
    Search jurisdictions using PostgreSQL full-text search

    Args:
        query: Search text (jurisdiction name); blank or None means browse mode
        state: Filter by state code (e.g., 'MA'); upper-cased before matching
        city: Filter by city name (case-insensitive substring of the name)
        jurisdiction_levels: Filter by types (city, county, town, school_district, etc.);
            unknown level names are ignored
        limit: Max results
        offset: Pagination offset

    Returns:
        List of SearchResult objects, ranked by ts_rank when a query is given
        and by name otherwise. Any error is logged and yields an empty list.
    """
    try:
        pool = await get_db_pool()

        # Map frontend level IDs to database types
        level_mapping = {
            'city': 'city',
            'county': 'county',
            'town': 'town',
            'village': 'village',
            'school_district': 'school_district',
            'special_district': 'special_district',
            'state': 'state'
        }

        # Build SQL query
        where_clauses = []
        params = []
        param_idx = 1
        has_query = bool(query and query.strip())

        # Text search filter (if present); remember its placeholder index so
        # the score expression can reuse the same bound value.
        score_param_idx = None
        if has_query:
            where_clauses.append(f"to_tsvector('english', name) @@ plainto_tsquery('english', ${param_idx})")
            params.append(query)
            score_param_idx = param_idx
            param_idx += 1

        # State filter
        if state:
            where_clauses.append(f"state = ${param_idx}")
            params.append(state.upper())
            param_idx += 1

        # City filter
        if city:
            where_clauses.append(f"LOWER(name) LIKE LOWER(${param_idx})")
            params.append(f"%{city}%")
            param_idx += 1

        # Jurisdiction level filter
        if jurisdiction_levels:
            db_types = [level_mapping[level] for level in jurisdiction_levels if level in level_mapping]
            if db_types:
                placeholders = ','.join(f"${param_idx + i}" for i in range(len(db_types)))
                where_clauses.append(f"type IN ({placeholders})")
                params.extend(db_types)
                param_idx += len(db_types)

        # Build final WHERE clause
        where_sql = " AND ".join(where_clauses) if where_clauses else "TRUE"

        # Select clause and order by
        if has_query:
            select_score = f"ts_rank(to_tsvector('english', name), plainto_tsquery('english', ${score_param_idx})) as score"
            order_by = "score DESC, name ASC"
        else:
            select_score = "1.0 as score"
            order_by = "name ASC"

        # Build complete query
        sql = f"""
            SELECT
                name,
                type,
                state,
                county,
                geoid,
                population,
                {select_score}
            FROM jurisdictions_search
            WHERE {where_sql}
            ORDER BY {order_by}
            LIMIT ${param_idx}
            OFFSET ${param_idx + 1}
        """

        # Add limit and offset
        params.append(limit)
        params.append(offset)

        async with pool.acquire() as conn:
            rows = await conn.fetch(sql, *params)

        results = []
        for row in rows:
            # A NULL type must not abort the whole search (the except below
            # would discard every row), so fall back to a generic label.
            jurisdiction_label = (row['type'] or 'jurisdiction').replace('_', ' ').title()

            results.append(SearchResult(
                result_type='jurisdiction',
                title=row['name'],
                subtitle=f"{jurisdiction_label}",
                description=f"{jurisdiction_label} in {row['state']}" + (f" • Pop: {row['population']:,}" if row['population'] else ""),
                # Prefer the geoid link; fall back to the name when geoid is missing
                url=f"/jurisdictions/{row['geoid']}" if row['geoid'] else f"/jurisdictions/{row['name']}",
                score=float(row['score']) if has_query else 1.0,
                metadata={
                    'state': row['state'],
                    'geoid': row['geoid'],
                    'type': row['type'],
                    'county': row['county'],
                    'population': row['population']
                }
            ))

        logger.info(f"🏛️ PostgreSQL jurisdictions search: {len(results)} results")
        return results

    except Exception as e:
        # Best-effort: callers get an empty list rather than an exception
        logger.error(f"PostgreSQL jurisdictions search error: {e}")
        return []
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
async def search_contacts_pg(
    query: Optional[str] = None,
    state: Optional[str] = None,
    limit: int = 10
) -> List[SearchResult]:
    """
    Search contacts (nonprofit officers, local officials) using PostgreSQL

    Args:
        query: Search text (name, title, organization); blank or None means browse mode
        state: Filter by state code; upper-cased before matching
        limit: Max results

    Returns:
        List of SearchResult objects (score is always 1.0). Any error is
        logged and yields an empty list.
    """
    try:
        pool = await get_db_pool()

        # Build WHERE clauses
        where_clauses = []
        params = []
        param_idx = 1

        if state:
            where_clauses.append(f"state = ${param_idx}")
            params.append(state.upper())
            param_idx += 1

        # Text search across name and organization, plus substring match on title
        if query and query.strip():
            # Two bound values: the raw query for full-text search and a
            # %...% pattern for the LIKE on title.
            text_idx = param_idx
            like_idx = param_idx + 1
            where_clauses.append(f"""(
                to_tsvector('english', name) @@ plainto_tsquery('english', ${text_idx})
                OR to_tsvector('english', COALESCE(organization_name, '')) @@ plainto_tsquery('english', ${text_idx})
                OR LOWER(title) LIKE LOWER(${like_idx})
            )""")
            params.append(query)
            params.append(f"%{query}%")
            param_idx += 2

            # Rank by relevance: best of the name rank and the organization rank
            order_by = f"""
                GREATEST(
                    ts_rank(to_tsvector('english', name), plainto_tsquery('english', ${text_idx})),
                    ts_rank(to_tsvector('english', COALESCE(organization_name, '')), plainto_tsquery('english', ${text_idx}))
                ) DESC, name ASC
            """
        else:
            order_by = "name ASC"

        where_sql = " AND ".join(where_clauses) if where_clauses else "TRUE"

        sql = f"""
            SELECT
                name,
                title,
                organization_name,
                organization_ein,
                city,
                state,
                role_type,
                compensation,
                source
            FROM contacts_search
            WHERE {where_sql}
            ORDER BY {order_by}
            LIMIT ${param_idx}
        """
        params.append(limit)

        async with pool.acquire() as conn:
            rows = await conn.fetch(sql, *params)

        results = []
        for row in rows:
            org_display = row['organization_name'] or 'Unknown Organization'
            location = f"{row['city']}, {row['state']}" if row['city'] and row['state'] else (row['state'] or '')

            # Avoid a dangling "Contact in " when no location is known
            role = row['role_type'] or 'Contact'
            description = f"{role} in {location}" if location else f"{role}"

            results.append(SearchResult(
                result_type='contact',
                title=row['name'],
                subtitle=f"{row['title'] or 'Officer'} - {org_display}",
                description=description,
                url=f"/people/{row['name'].replace(' ', '-')}",
                score=1.0,
                metadata={
                    'title': row['title'],
                    'organization': org_display,
                    'organization_ein': row['organization_ein'],
                    'state': row['state'],
                    'city': row['city'],
                    'role_type': row['role_type'],
                    'compensation': row['compensation'],
                    'source': row['source']
                }
            ))

        logger.info(f"👤 PostgreSQL contacts search: {len(results)} results")
        return results

    except Exception as e:
        # Best-effort: callers get an empty list rather than an exception
        logger.error(f"PostgreSQL contacts search error: {e}")
        return []
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
async def search_organizations_pg(
    query: Optional[str] = None,
    state: Optional[str] = None,
    ntee_code: Optional[str] = None,
    ein: Optional[str] = None,
    limit: int = 10,
    offset: int = 0,
    sort: str = 'relevance'
) -> List[SearchResult]:
    """
    Look up nonprofit organizations in the ``nonprofits_search`` table.

    Args:
        query: Text matched against the organization name (ignored when ``ein`` is given)
        state: State code filter; upper-cased before matching
        ntee_code: NTEE code prefix filter
        ein: Exact EIN to match; stripped of surrounding whitespace
        limit: Maximum number of rows
        offset: Number of rows to skip
        sort: One of relevance, name-asc, name-desc, revenue-asc, revenue-desc,
            assets-asc, assets-desc; any other value behaves like relevance

    Returns:
        List of SearchResult objects (score is always 1.0). Any error is
        logged and yields an empty list.
    """
    try:
        pool = await get_db_pool()

        conditions = []
        args = []

        def bind(value):
            # Register a bound value and hand back its positional placeholder
            args.append(value)
            return f"${len(args)}"

        # EIN exact match (highest priority)
        if ein:
            conditions.append(f"ein = {bind(ein.strip())}")

        # State filter
        if state:
            conditions.append(f"state = {bind(state.upper())}")

        # NTEE code filter (prefix match)
        if ntee_code:
            conditions.append(f"ntee_code LIKE {bind(f'{ntee_code}%')}")

        # Text search only applies when no EIN was specified
        text_search = bool(query and query.strip() and not ein)
        text_placeholder = None
        if text_search:
            text_placeholder = bind(query)
            conditions.append(f"to_tsvector('english', name) @@ plainto_tsquery('english', {text_placeholder})")

        where_sql = " AND ".join(conditions) if conditions else "TRUE"

        # Explicit sort orders; anything else falls through to relevance/name
        explicit_orders = {
            'name-asc': "name ASC",
            'name-desc': "name DESC",
            'revenue-asc': "revenue ASC NULLS LAST",
            'revenue-desc': "revenue DESC NULLS LAST",
            'assets-asc': "assets ASC NULLS LAST",
            'assets-desc': "assets DESC NULLS LAST",
        }
        order_by = explicit_orders.get(sort)
        if order_by is None:
            if text_search:
                # Relevance ranking for text search
                order_by = f"ts_rank(to_tsvector('english', name), plainto_tsquery('english', {text_placeholder})) DESC, name ASC"
            else:
                order_by = "name ASC"

        limit_placeholder = bind(limit)
        offset_placeholder = bind(offset)

        sql = f"""
            SELECT
                ein,
                name,
                city,
                state,
                county,
                ntee_code,
                ntee_description,
                revenue,
                assets,
                income,
                tax_period
            FROM nonprofits_search
            WHERE {where_sql}
            ORDER BY {order_by}
            LIMIT {limit_placeholder}
            OFFSET {offset_placeholder}
        """

        async with pool.acquire() as conn:
            rows = await conn.fetch(sql, *args)

        metadata_columns = (
            'ein', 'state', 'city', 'county', 'ntee_code',
            'ntee_description', 'revenue', 'assets', 'income', 'tax_period',
        )

        results = []
        for row in rows:
            if row['city'] and row['state']:
                location = f"{row['city']}, {row['state']}"
            else:
                location = row['state'] or ''

            # Format financials; falsy amounts (NULL or 0) are left out
            money_parts = [
                f"{label}: ${row[column]:,}"
                for label, column in (("Revenue", 'revenue'), ("Assets", 'assets'))
                if row[column]
            ]

            description = f"{row['ntee_description'] or 'Nonprofit organization'}"
            if money_parts:
                description = description + " • " + " • ".join(money_parts)

            results.append(SearchResult(
                result_type='organization',
                title=row['name'],
                subtitle=location,
                description=description,
                url=f"/organizations/{row['ein']}",
                score=1.0,
                metadata={column: row[column] for column in metadata_columns},
            ))

        logger.info(f"🏢 PostgreSQL organizations search: {len(results)} results")
        return results

    except Exception as e:
        # Best-effort: callers get an empty list rather than an exception
        logger.error(f"PostgreSQL organizations search error: {e}")
        return []
|
| 433 |
+
|
| 434 |
+
|
| 435 |
+
async def search_events_pg(
    query: Optional[str] = None,
    state: Optional[str] = None,
    limit: int = 10
) -> List[SearchResult]:
    """
    Search meetings/events using PostgreSQL

    Args:
        query: Search text. Matched against the event title (full-text
            search) and the jurisdiction name (case-insensitive substring).
            Surrounding whitespace is ignored; blank text disables the
            text filter.
        state: Filter by state code (upper-cased before comparison)
        limit: Max results

    Returns:
        List of SearchResult objects. Any exception raised while querying
        or formatting is logged and an empty list is returned instead.
    """
    try:
        pool = await get_db_pool()

        # Build WHERE clauses using positional placeholders ($1, $2, ...);
        # param_idx always holds the number of the next free placeholder.
        where_clauses = []
        params = []
        param_idx = 1

        if state:
            where_clauses.append(f"state = ${param_idx}")
            params.append(state.upper())
            param_idx += 1

        # Text search. Strip first so that padding around the user's text
        # does not end up inside the LIKE pattern and prevent matches.
        search_text = query.strip() if query else ''
        if search_text:
            title_param = param_idx
            jurisdiction_param = param_idx + 1
            where_clauses.append(f"""(
                to_tsvector('english', title) @@ plainto_tsquery('english', ${title_param})
                OR LOWER(jurisdiction_name) LIKE LOWER(${jurisdiction_param})
            )""")
            params.append(search_text)
            params.append(f"%{search_text}%")
            param_idx += 2

            # Rank by title relevance first, newest events breaking ties
            order_by = (
                f"ts_rank(to_tsvector('english', title), "
                f"plainto_tsquery('english', ${title_param})) DESC, event_date DESC"
            )
        else:
            order_by = "event_date DESC"

        where_sql = " AND ".join(where_clauses) if where_clauses else "TRUE"

        sql = f"""
            SELECT
                id,
                title,
                description,
                event_date,
                jurisdiction_name,
                jurisdiction_type,
                state,
                city,
                video_url,
                agenda_url
            FROM events_search
            WHERE {where_sql}
            ORDER BY {order_by}
            LIMIT ${param_idx}
        """
        params.append(limit)

        async with pool.acquire() as conn:
            rows = await conn.fetch(sql, *params)

        results = []
        for row in rows:
            location = f"{row['jurisdiction_name']}, {row['state']}" if row['jurisdiction_name'] and row['state'] else ''
            date_str = row['event_date'].strftime('%Y-%m-%d') if row['event_date'] else ''

            # Only mark the text as shortened when something was actually cut
            # off; a description of exactly 200 characters is shown unchanged.
            full_description = row['description'] or ''
            description = full_description[:200]
            if len(full_description) > 200:
                description += "..."

            # Join only the parts that exist so a missing location or date
            # does not leave a dangling " - " separator in the subtitle.
            subtitle = " - ".join(part for part in (location, date_str) if part)

            results.append(SearchResult(
                result_type='meeting',
                title=row['title'],
                subtitle=subtitle,
                description=description,
                url=f"/documents?meeting_id={row['id']}",
                score=1.0,
                metadata={
                    'jurisdiction': row['jurisdiction_name'],
                    'jurisdiction_type': row['jurisdiction_type'],
                    'state': row['state'],
                    'city': row['city'],
                    'date': date_str,
                    'meeting_id': row['id'],
                    'video_url': row['video_url'],
                    'agenda_url': row['agenda_url']
                }
            ))

        logger.info(f"📅 PostgreSQL events search: {len(results)} results")
        return results

    except Exception as e:
        # Best-effort search: failures are logged and reported as "no results"
        logger.error(f"PostgreSQL events search error: {e}")
        return []
|
api/routes/social.py
ADDED
|
@@ -0,0 +1,544 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Social features API routes - following, followers, feeds
|
| 3 |
+
"""
|
| 4 |
+
from fastapi import APIRouter, Depends, HTTPException, status
|
| 5 |
+
from sqlalchemy.orm import Session
|
| 6 |
+
from sqlalchemy import func, or_, and_
|
| 7 |
+
from typing import List, Optional
|
| 8 |
+
from pydantic import BaseModel
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
|
| 11 |
+
from api.database import get_db
|
| 12 |
+
from api.auth import get_current_user
|
| 13 |
+
from api.models import (
|
| 14 |
+
User, Official, Organization, Cause,
|
| 15 |
+
UserFollow, OfficialFollow, OrganizationFollow, CauseFollow
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
# Router for the social endpoints; its routes are registered under the /api/social prefix
router = APIRouter(prefix="/api/social", tags=["social"])
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# ============================================================================
|
| 22 |
+
# PYDANTIC MODELS
|
| 23 |
+
# ============================================================================
|
| 24 |
+
|
| 25 |
+
class FollowResponse(BaseModel):
    """Response after follow/unfollow action

    Returned by all follow and unfollow endpoints in this module.
    """
    # The endpoints in this module always set this to True
    success: bool
    # Whether the current user follows the target once the request is handled
    following: bool
    # Follower count of the target as reported after the action
    follower_count: int
    # Human-readable outcome, e.g. "Successfully followed user"
    message: str
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class FollowerStats(BaseModel):
    """Follower/following statistics

    Populated by get_follower_stats from row counts of the follow tables.
    """
    # Number of users following the target user
    followers: int
    # Sum of the four following_* counts below
    following: int
    # Number of users the target user follows
    following_users: int
    # Number of officials the target user follows
    following_officials: int
    # Number of organizations the target user follows
    following_organizations: int
    # Number of causes the target user follows
    following_causes: int
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class UserSummary(BaseModel):
    """Brief user info for lists"""
    id: int
    # username, full_name and avatar_url accept None
    username: Optional[str]
    full_name: Optional[str]
    avatar_url: Optional[str]
    created_at: datetime

    class Config:
        # Pydantic setting that lets the model be populated from object attributes
        from_attributes = True
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
class OfficialSummary(BaseModel):
    """Brief official info for lists

    Built from Official rows by get_following_officials.
    """
    id: int
    name: str
    slug: str
    # The Optional fields below accept None
    title: Optional[str]
    photo_url: Optional[str]
    office: Optional[str]
    city: Optional[str]
    state: Optional[str]
    # Counter stored on the Official row; the follow/unfollow official
    # endpoints adjust it by one
    follower_count: int
    is_verified: bool

    class Config:
        # Pydantic setting that lets the model be populated from object attributes
        from_attributes = True
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class OrganizationSummary(BaseModel):
    """Brief organization info for lists

    Built from Organization rows by get_following_organizations.
    """
    id: int
    name: str
    slug: str
    # The Optional fields below accept None
    description: Optional[str]
    logo_url: Optional[str]
    org_type: Optional[str]
    city: Optional[str]
    state: Optional[str]
    # Counter stored on the Organization row; the follow/unfollow organization
    # endpoints adjust it by one
    follower_count: int
    is_verified: bool

    class Config:
        # Pydantic setting that lets the model be populated from object attributes
        from_attributes = True
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
class CauseSummary(BaseModel):
    """Brief cause info for lists

    Built from Cause rows by get_following_causes.
    """
    id: int
    name: str
    slug: str
    # The Optional fields below accept None
    description: Optional[str]
    icon_url: Optional[str]
    color: Optional[str]
    category: Optional[str]
    # Counter stored on the Cause row; the follow/unfollow cause endpoints
    # adjust it by one
    follower_count: int

    class Config:
        # Pydantic setting that lets the model be populated from object attributes
        from_attributes = True
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
# ============================================================================
|
| 105 |
+
# FOLLOW/UNFOLLOW ACTIONS
|
| 106 |
+
# ============================================================================
|
| 107 |
+
|
| 108 |
+
@router.post("/follow/user/{user_id}")
async def follow_user(
    user_id: int,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
) -> FollowResponse:
    """
    Make the authenticated user follow the user identified by ``user_id``.

    Raises:
        HTTPException: 400 when the caller targets their own id, 404 when no
            user with that id exists.

    Following a user who is already followed is not an error; the response
    simply reports the existing state.
    """
    if user_id == current_user.id:
        raise HTTPException(status_code=400, detail="Cannot follow yourself")

    # The target must exist before a follow row is created for it
    if not db.query(User).filter(User.id == user_id).first():
        raise HTTPException(status_code=404, detail="User not found")

    existing_follow = db.query(UserFollow).filter(
        UserFollow.follower_id == current_user.id,
        UserFollow.following_id == user_id
    ).first()

    if existing_follow:
        outcome = "Already following this user"
    else:
        db.add(UserFollow(follower_id=current_user.id, following_id=user_id))
        db.commit()
        outcome = "Successfully followed user"

    # User follower totals are counted from the follow rows on every request
    return FollowResponse(
        success=True,
        following=True,
        follower_count=db.query(UserFollow).filter(UserFollow.following_id == user_id).count(),
        message=outcome
    )
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
@router.delete("/follow/user/{user_id}")
async def unfollow_user(
    user_id: int,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
) -> FollowResponse:
    """
    Remove the authenticated user's follow of the user ``user_id``.

    The existence of the target user is not checked here. If no follow row
    is found, nothing is deleted and the response reports that the caller
    was not following.
    """
    follow_row = db.query(UserFollow).filter(
        UserFollow.follower_id == current_user.id,
        UserFollow.following_id == user_id
    ).first()

    if follow_row:
        db.delete(follow_row)
        db.commit()
        outcome = "Successfully unfollowed user"
    else:
        outcome = "Not following this user"

    # Counted after any deletion so the number reflects the final state
    return FollowResponse(
        success=True,
        following=False,
        follower_count=db.query(UserFollow).filter(UserFollow.following_id == user_id).count(),
        message=outcome
    )
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
@router.post("/follow/official/{official_id}")
async def follow_official(
    official_id: int,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
) -> FollowResponse:
    """
    Make the authenticated user follow the official ``official_id``.

    Raises:
        HTTPException: 404 when no official with that id exists.

    A new follow also bumps the counter stored on the Official row; a repeat
    request leaves everything untouched and reports the existing state.
    """
    official = db.query(Official).filter(Official.id == official_id).first()
    if not official:
        raise HTTPException(status_code=404, detail="Official not found")

    existing_follow = db.query(OfficialFollow).filter(
        OfficialFollow.user_id == current_user.id,
        OfficialFollow.official_id == official_id
    ).first()

    if existing_follow:
        outcome = "Already following this official"
    else:
        db.add(OfficialFollow(user_id=current_user.id, official_id=official_id))
        # The follow row and the counter change are committed together
        official.follower_count = official.follower_count + 1
        db.commit()
        outcome = "Successfully followed official"

    return FollowResponse(
        success=True,
        following=True,
        follower_count=official.follower_count,
        message=outcome
    )
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
@router.delete("/follow/official/{official_id}")
async def unfollow_official(
    official_id: int,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
) -> FollowResponse:
    """
    Remove the authenticated user's follow of the official ``official_id``.

    Raises:
        HTTPException: 404 when no official with that id exists.

    If the caller was not following, nothing changes and the stored counter
    is returned as is.
    """
    official = db.query(Official).filter(Official.id == official_id).first()
    if not official:
        raise HTTPException(status_code=404, detail="Official not found")

    follow_row = db.query(OfficialFollow).filter(
        OfficialFollow.user_id == current_user.id,
        OfficialFollow.official_id == official_id
    ).first()

    if follow_row:
        db.delete(follow_row)
        # Clamp at zero so the stored counter never goes negative
        official.follower_count = max(official.follower_count - 1, 0)
        db.commit()
        outcome = "Successfully unfollowed official"
    else:
        outcome = "Not following this official"

    return FollowResponse(
        success=True,
        following=False,
        follower_count=official.follower_count,
        message=outcome
    )
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
@router.post("/follow/organization/{org_id}")
async def follow_organization(
    org_id: int,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
) -> FollowResponse:
    """
    Make the authenticated user follow the organization ``org_id``.

    Raises:
        HTTPException: 404 when no organization with that id exists.

    A new follow also bumps the counter stored on the Organization row; a
    repeat request leaves everything untouched.
    """
    organization = db.query(Organization).filter(Organization.id == org_id).first()
    if not organization:
        raise HTTPException(status_code=404, detail="Organization not found")

    existing_follow = db.query(OrganizationFollow).filter(
        OrganizationFollow.user_id == current_user.id,
        OrganizationFollow.organization_id == org_id
    ).first()

    if existing_follow:
        outcome = "Already following this organization"
    else:
        db.add(OrganizationFollow(user_id=current_user.id, organization_id=org_id))
        # The follow row and the counter change are committed together
        organization.follower_count = organization.follower_count + 1
        db.commit()
        outcome = "Successfully followed organization"

    return FollowResponse(
        success=True,
        following=True,
        follower_count=organization.follower_count,
        message=outcome
    )
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
@router.delete("/follow/organization/{org_id}")
async def unfollow_organization(
    org_id: int,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
) -> FollowResponse:
    """
    Remove the authenticated user's follow of the organization ``org_id``.

    Raises:
        HTTPException: 404 when no organization with that id exists.

    If the caller was not following, nothing changes and the stored counter
    is returned as is.
    """
    organization = db.query(Organization).filter(Organization.id == org_id).first()
    if not organization:
        raise HTTPException(status_code=404, detail="Organization not found")

    follow_row = db.query(OrganizationFollow).filter(
        OrganizationFollow.user_id == current_user.id,
        OrganizationFollow.organization_id == org_id
    ).first()

    if follow_row:
        db.delete(follow_row)
        # Clamp at zero so the stored counter never goes negative
        organization.follower_count = max(organization.follower_count - 1, 0)
        db.commit()
        outcome = "Successfully unfollowed organization"
    else:
        outcome = "Not following this organization"

    return FollowResponse(
        success=True,
        following=False,
        follower_count=organization.follower_count,
        message=outcome
    )
|
| 340 |
+
|
| 341 |
+
|
| 342 |
+
@router.post("/follow/cause/{cause_id}")
async def follow_cause(
    cause_id: int,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
) -> FollowResponse:
    """
    Make the authenticated user follow the cause/topic ``cause_id``.

    Raises:
        HTTPException: 404 when no cause with that id exists.

    A new follow also bumps the counter stored on the Cause row; a repeat
    request leaves everything untouched.
    """
    cause = db.query(Cause).filter(Cause.id == cause_id).first()
    if not cause:
        raise HTTPException(status_code=404, detail="Cause not found")

    existing_follow = db.query(CauseFollow).filter(
        CauseFollow.user_id == current_user.id,
        CauseFollow.cause_id == cause_id
    ).first()

    if existing_follow:
        outcome = "Already following this cause"
    else:
        db.add(CauseFollow(user_id=current_user.id, cause_id=cause_id))
        # The follow row and the counter change are committed together
        cause.follower_count = cause.follower_count + 1
        db.commit()
        outcome = "Successfully followed cause"

    return FollowResponse(
        success=True,
        following=True,
        follower_count=cause.follower_count,
        message=outcome
    )
|
| 378 |
+
|
| 379 |
+
|
| 380 |
+
@router.delete("/follow/cause/{cause_id}")
async def unfollow_cause(
    cause_id: int,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
) -> FollowResponse:
    """
    Remove the authenticated user's follow of the cause/topic ``cause_id``.

    Raises:
        HTTPException: 404 when no cause with that id exists.

    If the caller was not following, nothing changes and the stored counter
    is returned as is.
    """
    cause = db.query(Cause).filter(Cause.id == cause_id).first()
    if not cause:
        raise HTTPException(status_code=404, detail="Cause not found")

    follow_row = db.query(CauseFollow).filter(
        CauseFollow.user_id == current_user.id,
        CauseFollow.cause_id == cause_id
    ).first()

    if follow_row:
        db.delete(follow_row)
        # Clamp at zero so the stored counter never goes negative
        cause.follower_count = max(cause.follower_count - 1, 0)
        db.commit()
        outcome = "Successfully unfollowed cause"
    else:
        outcome = "Not following this cause"

    return FollowResponse(
        success=True,
        following=False,
        follower_count=cause.follower_count,
        message=outcome
    )
|
| 415 |
+
|
| 416 |
+
|
| 417 |
+
# ============================================================================
|
| 418 |
+
# CHECK FOLLOW STATUS
|
| 419 |
+
# ============================================================================
|
| 420 |
+
|
| 421 |
+
@router.get("/following/status")
async def check_following_status(
    user_id: Optional[int] = None,
    leader_id: Optional[int] = None,
    org_id: Optional[int] = None,
    cause_id: Optional[int] = None,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
) -> dict:
    """
    Report which of the given entities the current user follows.

    Each id parameter is optional and independent. Only ids that are
    supplied (and truthy) produce an entry in the result, keyed 'user',
    'official' (from ``leader_id``), 'organization' and 'cause', each
    mapped to a boolean.
    """

    def _follow_exists(model, *criteria) -> bool:
        # True when at least one follow row matches all criteria
        return db.query(model).filter(*criteria).first() is not None

    status_by_kind = {}

    if user_id:
        status_by_kind['user'] = _follow_exists(
            UserFollow,
            UserFollow.follower_id == current_user.id,
            UserFollow.following_id == user_id,
        )

    if leader_id:
        status_by_kind['official'] = _follow_exists(
            OfficialFollow,
            OfficialFollow.user_id == current_user.id,
            OfficialFollow.official_id == leader_id,
        )

    if org_id:
        status_by_kind['organization'] = _follow_exists(
            OrganizationFollow,
            OrganizationFollow.user_id == current_user.id,
            OrganizationFollow.organization_id == org_id,
        )

    if cause_id:
        status_by_kind['cause'] = _follow_exists(
            CauseFollow,
            CauseFollow.user_id == current_user.id,
            CauseFollow.cause_id == cause_id,
        )

    return status_by_kind
|
| 459 |
+
|
| 460 |
+
|
| 461 |
+
# ============================================================================
|
| 462 |
+
# FOLLOWER/FOLLOWING LISTS
|
| 463 |
+
# ============================================================================
|
| 464 |
+
|
| 465 |
+
@router.get("/stats")
async def get_follower_stats(
    user_id: Optional[int] = None,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
) -> FollowerStats:
    """
    Return follower/following counts for a user.

    Args:
        user_id: User to report on; when omitted (or falsy) the
            authenticated user is used.

    Returns:
        FollowerStats whose ``following`` is the sum of the per-kind counts.
    """
    target_id = user_id or current_user.id

    # People following the target user
    follower_total = db.query(UserFollow).filter(UserFollow.following_id == target_id).count()

    # Everything the target user follows, one count per kind of entity
    followed_users = db.query(UserFollow).filter(UserFollow.follower_id == target_id).count()
    followed_officials = db.query(OfficialFollow).filter(OfficialFollow.user_id == target_id).count()
    followed_orgs = db.query(OrganizationFollow).filter(OrganizationFollow.user_id == target_id).count()
    followed_causes = db.query(CauseFollow).filter(CauseFollow.user_id == target_id).count()

    return FollowerStats(
        followers=follower_total,
        following=followed_users + followed_officials + followed_orgs + followed_causes,
        following_users=followed_users,
        following_officials=followed_officials,
        following_organizations=followed_orgs,
        following_causes=followed_causes
    )
|
| 494 |
+
|
| 495 |
+
|
| 496 |
+
@router.get("/following/officials")
async def get_following_officials(
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
) -> List[OfficialSummary]:
    """
    Get list of officials the current user is following

    Returns:
        One OfficialSummary per Official row joined to the current user
        through OfficialFollow. The query applies no explicit ordering.
    """
    officials = db.query(Official).join(
        OfficialFollow,
        OfficialFollow.official_id == Official.id
    ).filter(
        OfficialFollow.user_id == current_user.id
    ).all()

    # model_validate is the Pydantic v2 replacement for the deprecated
    # from_orm; it can read ORM attributes because OfficialSummary sets
    # from_attributes = True in its Config.
    return [OfficialSummary.model_validate(official) for official in officials]
|
| 511 |
+
|
| 512 |
+
|
| 513 |
+
@router.get("/following/organizations")
async def get_following_organizations(
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
) -> List[OrganizationSummary]:
    """
    Get list of organizations the current user is following

    Returns:
        One OrganizationSummary per Organization row joined to the current
        user through OrganizationFollow. The query applies no explicit
        ordering.
    """
    orgs = db.query(Organization).join(
        OrganizationFollow,
        OrganizationFollow.organization_id == Organization.id
    ).filter(
        OrganizationFollow.user_id == current_user.id
    ).all()

    # model_validate is the Pydantic v2 replacement for the deprecated
    # from_orm; it can read ORM attributes because OrganizationSummary sets
    # from_attributes = True in its Config.
    return [OrganizationSummary.model_validate(org) for org in orgs]
|
| 528 |
+
|
| 529 |
+
|
| 530 |
+
@router.get("/following/causes")
async def get_following_causes(
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
) -> List[CauseSummary]:
    """
    Get list of causes the current user is following

    Returns:
        One CauseSummary per Cause row joined to the current user through
        CauseFollow. The query applies no explicit ordering.
    """
    causes = db.query(Cause).join(
        CauseFollow,
        CauseFollow.cause_id == Cause.id
    ).filter(
        CauseFollow.user_id == current_user.id
    ).all()

    # model_validate is the Pydantic v2 replacement for the deprecated
    # from_orm; it can read ORM attributes because CauseSummary sets
    # from_attributes = True in its Config.
    return [CauseSummary.model_validate(cause) for cause in causes]
|
api/routes/stats.py
ADDED
|
@@ -0,0 +1,453 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Statistics endpoint with cached metrics from real data at multiple geographic levels
|
| 3 |
+
"""
|
| 4 |
+
from fastapi import APIRouter, HTTPException, Query
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from datetime import datetime, timedelta
|
| 8 |
+
from typing import Dict, Any, Optional
|
| 9 |
+
from loguru import logger
|
| 10 |
+
|
| 11 |
+
# Router for this module's statistics endpoint(s); no prefix or tags are set here
router = APIRouter()
|
| 12 |
+
|
| 13 |
+
# Multi-level cache held in this module's memory: {cache_key: {stats_data, timestamp}}
# Cache key format: "national" or "state:MA" or "county:MA:Suffolk" or "city:MA:Boston"
STATS_CACHE: Dict[str, Dict[str, Any]] = {}
# Intended lifetime of a cache entry (one hour)
# NOTE(review): the code that checks expiry is not in this part of the file — confirm how it is applied
CACHE_DURATION = timedelta(hours=1)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def count_parquet_records(pattern: str, filter_func=None) -> int:
    """
    Count total records across matching parquet files

    Args:
        pattern: Glob pattern for files, resolved relative to ``data/gold``
        filter_func: Optional function that receives each DataFrame and
            returns a boolean mask; only rows selected by the mask are
            counted

    Returns:
        Total number of (optionally filtered) rows. Files that cannot be
        read or filtered are skipped with a logged warning, so the result
        can be lower than the real total.
    """
    total = 0
    for file in Path('data/gold').glob(pattern):
        try:
            df = pd.read_parquet(file)
            if filter_func:
                df = df[filter_func(df)]
            total += len(df)
        except Exception as e:
            # Best-effort count: one bad file must not break the statistics.
            # Use the module logger so the problem reaches the application
            # log, not just stdout.
            logger.warning(f"Could not read {file}: {e}")
    return total
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _literal_contains(column, text):
    """Return a case-insensitive mask of rows whose value contains ``text`` literally."""
    # regex=False: the text comes from request parameters, so characters such
    # as "(", "." or "*" must be matched as plain characters, not as a regex.
    return column.str.contains(text, case=False, na=False, regex=False)


def _state_rows_in_file(parquet_file, state):
    """Count the rows of one reference parquet file that belong to ``state``."""
    if not parquet_file.exists():
        return 0
    df = pd.read_parquet(parquet_file)
    state_col = 'state' if 'state' in df.columns else 'STATE'
    if state_col in df.columns:
        df = df[df[state_col].str.upper() == state.upper()]
    return len(df)


def _find_event_pattern(state):
    """Return the events file pattern for a state, preferring the newest naming."""
    for stem in ('events', 'events_events'):
        pattern = f'states/{state}/{stem}.parquet'
        if Path(f'data/gold/{pattern}').exists():
            return pattern
    # Original naming is the last resort, whether or not the file exists.
    return f'states/{state}/meetings.parquet'


def calculate_stats(state: Optional[str] = None,
                    county: Optional[str] = None,
                    city: Optional[str] = None) -> Dict[str, Any]:
    """
    Calculate statistics from parquet files with optional geographic filtering

    Args:
        state: Two-letter state code (e.g., 'MA')
        county: County name (e.g., 'Suffolk County')
        city: City name (e.g., 'Boston')

    Returns:
        Dictionary with the resolved ``level`` and ``location``, the raw
        counts (jurisdictions, school districts, nonprofits, meetings,
        contacts, causes, domains, states with data), their pre-formatted
        ``*_display`` strings, and placeholder values for metrics that are
        not computed from data.

    Raises:
        Any exception raised while reading the per-state nonprofit or event
        files, or the reference files used for the county-level count, is
        propagated to the caller.
    """

    # Determine geographic level
    if city and state:
        level = 'city'
        if county:
            location_display = f"{city}, {county}, {state}"
        else:
            location_display = f"{city}, {state}"
    elif county and state:
        level = 'county'
        location_display = f"{county}, {state}"
    elif state:
        level = 'state'
        location_display = state
    else:
        level = 'national'
        location_display = 'United States'

    # Count jurisdictions (cities, counties, townships, school districts)
    if state:
        # Filter to specific state's jurisdictions
        def filter_state(df):
            state_col = 'state' if 'state' in df.columns else 'STATE'
            if state_col not in df.columns:
                # Build the all-False mask on the frame's own index so that
                # boolean indexing lines up whatever index the file carries.
                return pd.Series(False, index=df.index, dtype=bool)
            return df[state_col].str.upper() == state.upper()

        if city:
            # When a city is selected, show 4 jurisdictions:
            # 1. City, 2. County, 3. State, 4. School District
            jurisdictions = 4
        elif county:
            # Count cities/townships in this county.
            # For now this counts all in the state - proper county filtering
            # needs geocoding.
            count = sum(
                _state_rows_in_file(Path(f'data/gold/reference/{name}'), state)
                for name in ('jurisdictions_cities.parquet',
                             'jurisdictions_townships.parquet')
            )
            jurisdictions = count if count > 0 else 1  # At least the county itself
        else:
            # State level - count all jurisdictions
            jurisdictions = count_parquet_records('reference/jurisdictions_*.parquet', filter_state)

        school_districts = count_parquet_records('reference/jurisdictions_school_districts.parquet', filter_state)
    else:
        jurisdictions = count_parquet_records('reference/jurisdictions_*.parquet')
        school_districts = count_parquet_records('reference/jurisdictions_school_districts.parquet')

    # Count nonprofits
    if state:
        # Read specific state's nonprofit file.
        # NOTE(review): `state` is interpolated into the path unvalidated -
        # confirm callers only pass two-letter codes.
        state_file = Path(f'data/gold/states/{state}/nonprofits_organizations.parquet')
        if state_file.exists():
            df = pd.read_parquet(state_file)

            # Filter by county if specified
            if county:
                county_col = 'COUNTY' if 'COUNTY' in df.columns else 'county'
                if county_col in df.columns:
                    df = df[_literal_contains(df[county_col], county)]

            # Filter by city if specified
            if city:
                city_col = 'CITY' if 'CITY' in df.columns else 'city'
                if city_col in df.columns:
                    df = df[_literal_contains(df[city_col], city)]

            nonprofits = len(df)
        else:
            nonprofits = 0
    else:
        nonprofits = count_parquet_records('states/*/nonprofits_organizations.parquet')

    # Count events/meetings (try new naming first, fallback to old)
    if state:
        event_pattern = _find_event_pattern(state)
        event_file = Path(f'data/gold/{event_pattern}')

        if city and event_file.exists():
            # Filter by city
            df = pd.read_parquet(event_file)
            if 'place_name' in df.columns:
                place_col = 'place_name'
            elif 'jurisdiction_name' in df.columns:
                place_col = 'jurisdiction_name'
            else:
                place_col = 'jurisdiction'
            if place_col in df.columns:
                # Match city name (case-insensitive)
                df = df[_literal_contains(df[place_col], city)]
            meetings = len(df)
        else:
            meetings = count_parquet_records(event_pattern)
    else:
        # Newest naming first; fall through to older names while the count is 0
        meetings = 0
        for pattern in ('states/*/events.parquet',
                        'states/*/events_events.parquet',
                        'states/*/meetings.parquet'):
            meetings = count_parquet_records(pattern)
            if meetings:
                break

    # Count contacts
    if state:
        contact_files = list(Path('data/gold/states').glob(f'{state}/contacts_*.parquet'))

        if city and contact_files:
            # Filter by city across all contact files
            contacts = 0
            for contact_file in contact_files:
                try:
                    df = pd.read_parquet(contact_file)
                    jurisdiction_col = 'jurisdiction' if 'jurisdiction' in df.columns else 'city'
                    if jurisdiction_col in df.columns:
                        df = df[_literal_contains(df[jurisdiction_col], city)]
                    contacts += len(df)
                except Exception as e:
                    logger.error(f"Error filtering contacts by city in {contact_file}: {e}")
                    continue
        else:
            contacts = count_parquet_records(f'states/{state}/contacts_*.parquet')
    else:
        contacts = count_parquet_records('states/*/contacts_*.parquet')

    # Count causes (NTEE codes - always national)
    causes = count_parquet_records('reference/causes_ntee_codes.parquet')

    # Count states with data: one directory per state. Checking is_dir()
    # explicitly keeps stray files out of the count regardless of how the
    # running Python treats a trailing slash in glob patterns.
    states_with_data = sum(
        1 for entry in Path('data/gold/states').glob('*') if entry.is_dir()
    )

    # Count domains
    domains = count_parquet_records('reference/domains_*.parquet')

    # Format display values - use ACTUAL counts only, no extrapolation
    nonprofits_display = f'{nonprofits:,}'
    meetings_display = f'{meetings:,}'
    contacts_display = f'{contacts:,}'

    # Build jurisdictions breakdown for city-level views
    jurisdictions_breakdown = None
    if city and state:
        jurisdictions_breakdown = [
            {'type': 'City', 'name': city},
            {'type': 'County', 'name': county if county else 'County (TBD)'},
            {'type': 'State', 'name': state},
            {'type': 'School District', 'name': f'{city} School District'}
        ]

    return {
        'level': level,
        'location': location_display,
        'state': state,
        'county': county,
        'city': city,

        # Core counts
        'jurisdictions': jurisdictions,
        'jurisdictions_display': f'{jurisdictions:,}',
        'jurisdictions_breakdown': jurisdictions_breakdown,  # List of jurisdiction types for city-level
        'school_districts': school_districts,
        'school_districts_display': f'{school_districts:,}',

        # Nonprofits (actual counts only)
        'nonprofits_current': nonprofits,
        'nonprofits_display': nonprofits_display,

        # Meetings (actual counts only)
        'meetings_current': meetings,
        'meetings_display': meetings_display,

        # Contacts (actual counts only)
        'contacts_current': contacts,
        'contacts_display': contacts_display,

        # Other metrics
        'causes': causes,
        'causes_display': f'{causes}',
        'states_with_data': states_with_data,
        'domains': domains,
        'last_updated': datetime.now().isoformat(),

        # Calculated metrics (use N/A for unavailable data)
        'budget_tracked': 'N/A',
        'fact_checks': 'N/A',
        'grant_opportunities': '1,000s',
        # NOTE(review): this is a flat 10% estimate of the nonprofit count
        # (or a hard-coded figure when there are none), not a measured value -
        # confirm it is intended given the "actual counts only" policy above.
        'churches': f'{int(nonprofits * 0.1):,}' if nonprofits > 0 else '4,372',
        'policy_decisions': 'N/A',
        'states_total': states_with_data,
    }
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
def get_cached_stats(state: Optional[str] = None,
                     county: Optional[str] = None,
                     city: Optional[str] = None) -> Dict[str, Any]:
    """
    Get stats with multi-level caching

    Looks up the statistics for the requested geographic level in
    ``STATS_CACHE`` and recomputes them with ``calculate_stats`` when the
    entry is missing or older than ``CACHE_DURATION``.

    Args:
        state: Two-letter state code (e.g., 'MA')
        county: County name (e.g., 'Suffolk County')
        city: City name (e.g., 'Boston')

    Returns:
        The statistics dictionary. If the calculation raises, a static
        fallback dictionary is returned instead; it carries an ``error`` key
        with the exception text and is not cached.
    """
    # Resolve cache key, level and display name in one place so the fallback
    # payload below always agrees with the level that was actually requested.
    if city and state:
        # City level (county is optional)
        level = 'city'
        if county:
            cache_key = f"city:{state}:{county}:{city}"
            location = f"{city}, {county}, {state}"
        else:
            cache_key = f"city:{state}:{city}"
            location = f"{city}, {state}"
    elif county and state:
        level = 'county'
        cache_key = f"county:{state}:{county}"
        location = f"{county}, {state}"
    elif state:
        level = 'state'
        cache_key = f"state:{state}"
        location = state
    else:
        level = 'national'
        cache_key = "national"
        location = 'United States'

    now = datetime.now()

    # Serve from cache while the entry is still fresh
    cached_entry = STATS_CACHE.get(cache_key)
    if cached_entry is not None:
        cache_timestamp = cached_entry.get('_cache_timestamp')
        if cache_timestamp and (now - cache_timestamp) < CACHE_DURATION:
            # Return a copy without the internal timestamp
            stats = cached_entry.copy()
            stats.pop('_cache_timestamp', None)
            return stats

    # Calculate fresh stats
    try:
        stats = calculate_stats(state=state, county=county, city=city)
    except Exception as e:
        print(f"Error calculating stats for {cache_key}: {e}")

        # Static fallback payload, returned only when the calculation fails
        return {
            'level': level,
            'location': location,
            'jurisdictions_display': '925',
            'nonprofits_display': '43,726',
            'meetings_display': '6,913',
            'contacts_display': '362',
            'school_districts_display': '306',
            'causes_display': '196',
            'churches': '4,372',
            'budget_tracked': 'N/A',
            'fact_checks': 'N/A',
            'grant_opportunities': '1,000s',
            'policy_decisions': 'N/A',
            'states_with_data': 5,
            'states_total': 5,
            'last_updated': now.isoformat(),
            'error': str(e)
        }

    # Store a timestamped copy; the caller gets the dictionary without it
    cache_entry = stats.copy()
    cache_entry['_cache_timestamp'] = now
    STATS_CACHE[cache_key] = cache_entry

    return stats
|
| 333 |
+
|
| 334 |
+
|
| 335 |
+
@router.get("/stats")
async def get_stats(
    state: Optional[str] = Query(None, description="Two-letter state code (e.g., 'MA')"),
    county: Optional[str] = Query(None, description="County name (e.g., 'Suffolk County')"),
    city: Optional[str] = Query(None, description="City name (e.g., 'Boston')")
):
    """
    Get platform statistics from real data with optional geographic filtering

    **Examples:**
    - `/api/stats` - National statistics
    - `/api/stats?state=MA` - Massachusetts statistics
    - `/api/stats?state=MA&county=Suffolk` - Suffolk County, MA statistics
    - `/api/stats?state=MA&county=Suffolk&city=Boston` - Boston, MA statistics

    **Returns:** Cached metrics calculated from parquet files:
    - Jurisdictions tracked (cities, counties, townships, school districts)
    - Nonprofits monitored
    - Meetings analyzed
    - Officials and contacts tracked
    - Causes and NTEE codes

    **Cache duration:** 1 hour per geographic level
    """
    # Delegate to the cached lookup; any failure is surfaced as an HTTP 500.
    try:
        payload = get_cached_stats(state=state, county=county, city=city)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error fetching stats: {str(e)}")

    # Standard response envelope used by the stats endpoints
    return {'success': True, 'data': payload}
|
| 367 |
+
|
| 368 |
+
|
| 369 |
+
@router.get("/stats/detailed")
async def get_detailed_stats(
    state: Optional[str] = Query(None, description="Two-letter state code (e.g., 'MA')")
):
    """
    Get detailed statistics including breakdowns by state

    Returns:
    - Overall totals
    - Per-state breakdowns (if no state specified)
    - Data quality metrics
    """
    # Breakdown key -> candidate file stems, newest naming first. The
    # 'meetings' entry follows the same events -> events_events -> meetings
    # order that the main statistics calculation uses, so states stored under
    # the newer file names still show up in the breakdown.
    breakdown_sources = {
        'nonprofits_organizations': ('nonprofits_organizations',),
        'meetings': ('events', 'events_events', 'meetings'),
        'contacts_nonprofit_officers': ('contacts_nonprofit_officers',),
    }

    try:
        stats = get_cached_stats(state=state)

        # A single state was requested: no per-state breakdown
        if state:
            return {
                'success': True,
                'data': stats
            }

        # Add state-by-state breakdown (only for national view)
        states = {}
        for state_dir in Path('data/gold/states').glob('*/'):
            state_stats = {}

            # Count each data type for this state
            for data_type, stems in breakdown_sources.items():
                candidates = (state_dir / f'{stem}.parquet' for stem in stems)
                file = next((path for path in candidates if path.exists()), None)
                if file is None:
                    continue
                try:
                    state_stats[data_type] = len(pd.read_parquet(file))
                except Exception as e:
                    # Skip the unreadable file but leave a trace of why the
                    # number is missing instead of swallowing the error.
                    print(f"Warning: Could not read {file}: {e}")

            if state_stats:
                states[state_dir.name] = state_stats

        return {
            'success': True,
            'data': {
                **stats,
                'state_breakdown': states
            }
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error fetching detailed stats: {str(e)}")
|
| 418 |
+
|
| 419 |
+
|
| 420 |
+
@router.post("/stats/refresh")
async def refresh_stats(
    state: Optional[str] = Query(None, description="State to refresh (or all if not specified)")
):
    """
    Force refresh of statistics cache

    Useful after data updates or imports.
    Can refresh a specific state or all levels.
    """
    try:
        if state:
            # Clear the state entry plus its county/city entries. Keys look
            # like "state:MA", "county:MA:..." and "city:MA:...", so the ":"
            # delimiter is part of the match to avoid touching another state
            # whose code merely starts with the same characters.
            derived_prefixes = (f'county:{state}:', f'city:{state}:')
            keys_to_remove = [
                k for k in STATS_CACHE
                if k == f'state:{state}' or k.startswith(derived_prefixes)
            ]
            for key in keys_to_remove:
                STATS_CACHE.pop(key, None)
            message = f'Statistics cache refreshed for {state}'
        else:
            # Clear all cache in place so every holder of the dict sees it
            STATS_CACHE.clear()
            message = 'All statistics cache refreshed'

        # Recalculate to warm cache
        stats = get_cached_stats(state=state)

        return {
            'success': True,
            'message': message,
            'data': stats
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error refreshing stats: {str(e)}")
|
api/routes/stats_neon.py
ADDED
|
@@ -0,0 +1,322 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Statistics endpoint using Neon Postgres (fast!)
|
| 3 |
+
Replaces parquet file scanning with indexed database queries
|
| 4 |
+
"""
|
| 5 |
+
from fastapi import APIRouter, HTTPException, Query
|
| 6 |
+
from typing import Dict, Any, Optional
|
| 7 |
+
from loguru import logger
|
| 8 |
+
import os
|
| 9 |
+
import asyncpg
|
| 10 |
+
from datetime import datetime, timedelta
|
| 11 |
+
|
| 12 |
+
router = APIRouter()
|
| 13 |
+
|
| 14 |
+
# Response cache, keyed by geographic cache key. Entries are served for
# five minutes (data in Neon changes infrequently).
STATS_CACHE: Dict[str, Dict[str, Any]] = {}
CACHE_DURATION = timedelta(minutes=5)

# Database URLs come from the environment.
# NEON_DATABASE_URL_DEV (local development) wins over NEON_DATABASE_URL
# (production / deployed environments) when both are set.
NEON_DATABASE_URL_DEV = os.environ.get('NEON_DATABASE_URL_DEV')
NEON_DATABASE_URL = os.environ.get('NEON_DATABASE_URL')
DATABASE_URL = NEON_DATABASE_URL_DEV if NEON_DATABASE_URL_DEV else NEON_DATABASE_URL

# Connection pool, created on the first request
_db_pool = None
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
async def get_db_pool():
    """
    Get or create database connection pool

    The pool is created on the first call and stored in the module-level
    ``_db_pool``; later calls return the same pool.

    Returns:
        The asyncpg connection pool.

    Raises:
        ValueError: If neither NEON_DATABASE_URL_DEV nor NEON_DATABASE_URL
            is configured.
    """
    global _db_pool
    if _db_pool is None:
        if not DATABASE_URL:
            raise ValueError("DATABASE_URL not configured (set NEON_DATABASE_URL_DEV or NEON_DATABASE_URL)")

        from urllib.parse import urlsplit

        # Log which database we're using. Only the host (and port) is
        # logged: the start of a connection URL holds the user name and
        # password, which must not end up in the logs.
        db_type = "Development (Local PostgreSQL)" if NEON_DATABASE_URL_DEV else "Production (Neon)"
        try:
            url_parts = urlsplit(DATABASE_URL)
            target = url_parts.hostname or "unknown host"
            if url_parts.port:
                target = f"{target}:{url_parts.port}"
        except ValueError:
            target = "unparseable URL"
        logger.info(f"🗄️ [Stats] Connecting to {db_type}: {target}")

        # NOTE(review): there is an await between the None check and the
        # assignment, so concurrent first requests may each create a pool -
        # confirm whether that matters for this deployment.
        _db_pool = await asyncpg.create_pool(DATABASE_URL, min_size=1, max_size=10)
    return _db_pool
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
@router.get("/stats")
async def get_stats(
    state: Optional[str] = Query(None, description="Two-letter state code (e.g., MA)"),
    county: Optional[str] = Query(None, description="County name (e.g., Suffolk County)"),
    city: Optional[str] = Query(None, description="City name (e.g., Boston)")
):
    """
    Get statistics from Neon Postgres database

    **Performance**: ~10-50ms (vs 3-10 seconds with parquet files)

    - **National**: GET /api/stats
    - **State**: GET /api/stats?state=MA
    - **County**: GET /api/stats?state=MA&county=Suffolk%20County
    - **City**: GET /api/stats?state=MA&city=Boston

    Returns comprehensive statistics including:
    - Jurisdiction counts (cities, counties, school districts)
    - Nonprofit counts and financials
    - Event/meeting counts
    - Contact/officer counts
    """
    # Response field -> column of the row returned by fetch_stats_from_neon
    counters = (
        ('jurisdictions', 'jurisdictions_count'),
        ('school_districts', 'school_districts_count'),
        ('nonprofits', 'nonprofits_count'),
        ('events', 'events_count'),
        ('bills', 'bills_count'),
        ('contacts', 'contacts_count'),
        ('total_revenue', 'total_revenue'),
        ('total_assets', 'total_assets'),
    )

    try:
        # Most specific level wins: city, then county, then state, then national
        if city and state:
            level, cache_key = 'city', f"city:{state}:{city}"
            location_display = f"{city}, {state}"
        elif county and state:
            level, cache_key = 'county', f"county:{state}:{county}"
            location_display = f"{county}, {state}"
        elif state:
            level, cache_key = 'state', f"state:{state}"
            location_display = state
        else:
            level, cache_key = 'national', "national"
            location_display = 'United States'

        # Serve a fresh-enough cached response if there is one
        entry = STATS_CACHE.get(cache_key)
        if entry is not None and datetime.now() - entry['timestamp'] < CACHE_DURATION:
            logger.debug(f"🚀 Cache hit for {cache_key}")
            return entry['stats']

        # Query Neon database
        logger.info(f"📊 Fetching stats from Neon: {cache_key}")
        row = await fetch_stats_from_neon(level, state, county, city)

        # Fields shared by the "found" and "not found" responses
        stats = {
            'location': location_display,
            'level': level,
            'state': state,
            'county': county,
            'city': city,
        }
        if not row:
            # No data found - every counter is zero and a note explains why
            stats.update((field, 0) for field, _ in counters)
            stats['last_updated'] = None
            stats['source'] = 'neon'
            stats['note'] = 'No data available for this location'
        else:
            stats.update((field, row.get(column, 0)) for field, column in counters)
            stats['last_updated'] = row.get('last_updated')
            stats['source'] = 'neon'

        # Cache result (including the "no data" response)
        STATS_CACHE[cache_key] = {
            'stats': stats,
            'timestamp': datetime.now()
        }
        return stats

    except Exception as e:
        logger.error(f"❌ Error fetching stats: {e}")
        raise HTTPException(status_code=500, detail=f"Database error: {str(e)}")
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
_NATIONAL_STATS_SQL = """
    SELECT * FROM stats_aggregates
    WHERE level = 'national'
    LIMIT 1
"""

_STATE_STATS_SQL = """
    SELECT * FROM stats_aggregates
    WHERE level = 'state' AND UPPER(state) = UPPER($1)
    LIMIT 1
"""

# Substring match on the place name ($2), with an exact (case-insensitive)
# match on $3 preferred and shorter names next, so the row picked by LIMIT 1
# is deterministic when several places contain the search text.
_PLACE_STATS_SQL = {
    'county': """
        SELECT * FROM stats_aggregates
        WHERE level = 'county'
          AND UPPER(state) = UPPER($1)
          AND county ILIKE $2
        ORDER BY (LOWER(county) = LOWER($3)) DESC, LENGTH(county), county
        LIMIT 1
    """,
    'city': """
        SELECT * FROM stats_aggregates
        WHERE level = 'city'
          AND UPPER(state) = UPPER($1)
          AND city ILIKE $2
        ORDER BY (LOWER(city) = LOWER($3)) DESC, LENGTH(city), city
        LIMIT 1
    """,
}


def _like_contains(text) -> str:
    """Build a "contains" LIKE pattern with LIKE wildcards in ``text`` escaped."""
    escaped = str(text).replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
    return f"%{escaped}%"


async def fetch_stats_from_neon(
    level: str,
    state: Optional[str] = None,
    county: Optional[str] = None,
    city: Optional[str] = None
) -> Optional[Dict[str, Any]]:
    """
    Fetch statistics from Neon database

    County and city lookups fall back to the state-level row when no
    matching county/city row exists. City lookups never fall back to
    county-level statistics.

    Args:
        level: 'national', 'state', 'county', or 'city'
        state: State code (if applicable)
        county: County name (if applicable)
        city: City name (if applicable)

    Returns:
        Dictionary with stats or None if not found (also None for an
        unknown ``level``)

    Raises:
        Any exception from acquiring the pool or running the query is
        logged and re-raised.
    """
    try:
        pool = await get_db_pool()

        async with pool.acquire() as conn:
            if level == 'national':
                result = await conn.fetchrow(_NATIONAL_STATS_SQL)

            elif level == 'state':
                result = await conn.fetchrow(_STATE_STATS_SQL, state)

            elif level in _PLACE_STATS_SQL:
                place = county if level == 'county' else city
                result = await conn.fetchrow(
                    _PLACE_STATS_SQL[level], state, _like_contains(place), place
                )

                # Fall back to state-level if the county/city is not found
                if not result and state:
                    if level == 'county':
                        logger.info(f"County '{county}' not found in stats, falling back to state '{state}'")
                    else:
                        logger.info(f"City '{city}' not found in stats, falling back to state '{state}' (skipping county)")
                    result = await conn.fetchrow(_STATE_STATS_SQL, state)

            else:
                return None

            return dict(result) if result else None

    except Exception as e:
        logger.error(f"Database query error: {e}")
        raise
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
@router.get("/stats/search")
async def search_stats(
    query: str = Query(..., description="Search query"),
    limit: int = Query(10, ge=1, le=100, description="Max results")
):
    """
    Search for locations (cities, counties, states) with statistics

    Example: GET /api/stats/search?query=boston&limit=5

    Returns matching locations with their statistics
    """
    try:
        pool = await get_db_pool()

        async with pool.acquire() as conn:
            # Search across all geographic levels; cities are listed before
            # counties and states, larger nonprofit counts first within a level
            rows = await conn.fetch("""
                SELECT
                    level,
                    state,
                    county,
                    city,
                    jurisdictions_count,
                    nonprofits_count,
                    events_count,
                    total_revenue
                FROM stats_aggregates
                WHERE
                    (city ILIKE $1 OR county ILIKE $1 OR state ILIKE $1)
                    AND level != 'national'
                ORDER BY
                    CASE level
                        WHEN 'city' THEN 1
                        WHEN 'county' THEN 2
                        WHEN 'state' THEN 3
                    END,
                    nonprofits_count DESC
                LIMIT $2
            """, f"%{query}%", limit)

            # Reshape each database row into the public response format
            matches = []
            for record in rows:
                matches.append({
                    'level': record['level'],
                    'location': format_location(record),
                    'state': record['state'],
                    'county': record['county'],
                    'city': record['city'],
                    'jurisdictions': record['jurisdictions_count'],
                    'nonprofits': record['nonprofits_count'],
                    'events': record['events_count'],
                    'total_revenue': record['total_revenue']
                })
            return matches

    except Exception as e:
        logger.error(f"Search error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
def format_location(row) -> str:
    """
    Build a human-readable location label from a stats row.

    The most specific non-empty field decides the shape: "city, county, state",
    "city, state", "county, state", or the bare state. 'Unknown' is returned
    when city, county and state are all empty.
    """
    city_name = row['city']
    county_name = row['county']
    state_code = row['state']

    if city_name and county_name:
        return f"{city_name}, {county_name}, {state_code}"
    if city_name:
        return f"{city_name}, {state_code}"
    if county_name:
        return f"{county_name}, {state_code}"
    if state_code:
        return state_code
    return 'Unknown'
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
@router.on_event("shutdown")
async def shutdown_db_pool():
    """Close the shared connection pool, if one was created, and forget it."""
    global _db_pool
    if not _db_pool:
        # Nothing to release: the pool was never created (or already closed)
        return
    await _db_pool.close()
    _db_pool = None
|
api/static/assets/index-BIH9Tona.css
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
.leaflet-pane,.leaflet-tile,.leaflet-marker-icon,.leaflet-marker-shadow,.leaflet-tile-container,.leaflet-pane>svg,.leaflet-pane>canvas,.leaflet-zoom-box,.leaflet-image-layer,.leaflet-layer{position:absolute;left:0;top:0}.leaflet-container{overflow:hidden}.leaflet-tile,.leaflet-marker-icon,.leaflet-marker-shadow{-webkit-user-select:none;-moz-user-select:none;user-select:none;-webkit-user-drag:none}.leaflet-tile::-moz-selection{background:transparent}.leaflet-tile::selection{background:transparent}.leaflet-safari .leaflet-tile{image-rendering:-webkit-optimize-contrast}.leaflet-safari .leaflet-tile-container{width:1600px;height:1600px;-webkit-transform-origin:0 0}.leaflet-marker-icon,.leaflet-marker-shadow{display:block}.leaflet-container .leaflet-overlay-pane svg{max-width:none!important;max-height:none!important}.leaflet-container .leaflet-marker-pane img,.leaflet-container .leaflet-shadow-pane img,.leaflet-container .leaflet-tile-pane img,.leaflet-container img.leaflet-image-layer,.leaflet-container .leaflet-tile{max-width:none!important;max-height:none!important;width:auto;padding:0}.leaflet-container img.leaflet-tile{mix-blend-mode:plus-lighter}.leaflet-container.leaflet-touch-zoom{touch-action:pan-x pan-y}.leaflet-container.leaflet-touch-drag{touch-action:none;touch-action:pinch-zoom}.leaflet-container.leaflet-touch-drag.leaflet-touch-zoom{touch-action:none}.leaflet-container{-webkit-tap-highlight-color:transparent}.leaflet-container a{-webkit-tap-highlight-color:rgba(51,181,229,.4)}.leaflet-tile{filter:inherit;visibility:hidden}.leaflet-tile-loaded{visibility:inherit}.leaflet-zoom-box{width:0;height:0;box-sizing:border-box;z-index:800}.leaflet-overlay-pane svg{-moz-user-select:none}.leaflet-pane{z-index:400}.leaflet-tile-pane{z-index:200}.leaflet-overlay-pane{z-index:400}.leaflet-shadow-pane{z-index:500}.leaflet-marker-pane{z-index:600}.leaflet-tooltip-pane{z-index:650}.leaflet-popup-pane{z-index:700}.leaflet-map-pane canvas{z-index:100}.leaflet-map-pane 
svg{z-index:200}.leaflet-vml-shape{width:1px;height:1px}.lvml{behavior:url(#default#VML);display:inline-block;position:absolute}.leaflet-control{position:relative;z-index:800;pointer-events:visiblePainted;pointer-events:auto}.leaflet-top,.leaflet-bottom{position:absolute;z-index:1000;pointer-events:none}.leaflet-top{top:0}.leaflet-right{right:0}.leaflet-bottom{bottom:0}.leaflet-left{left:0}.leaflet-control{float:left;clear:both}.leaflet-right .leaflet-control{float:right}.leaflet-top .leaflet-control{margin-top:10px}.leaflet-bottom .leaflet-control{margin-bottom:10px}.leaflet-left .leaflet-control{margin-left:10px}.leaflet-right .leaflet-control{margin-right:10px}.leaflet-fade-anim .leaflet-popup{opacity:0;transition:opacity .2s linear}.leaflet-fade-anim .leaflet-map-pane .leaflet-popup{opacity:1}.leaflet-zoom-animated{transform-origin:0 0}svg.leaflet-zoom-animated{will-change:transform}.leaflet-zoom-anim .leaflet-zoom-animated{transition:transform .25s cubic-bezier(0,0,.25,1)}.leaflet-zoom-anim .leaflet-tile,.leaflet-pan-anim .leaflet-tile{transition:none}.leaflet-zoom-anim .leaflet-zoom-hide{visibility:hidden}.leaflet-interactive{cursor:pointer}.leaflet-grab{cursor:grab}.leaflet-crosshair,.leaflet-crosshair .leaflet-interactive{cursor:crosshair}.leaflet-popup-pane,.leaflet-control{cursor:auto}.leaflet-dragging .leaflet-grab,.leaflet-dragging .leaflet-grab .leaflet-interactive,.leaflet-dragging .leaflet-marker-draggable{cursor:move;cursor:grabbing}.leaflet-marker-icon,.leaflet-marker-shadow,.leaflet-image-layer,.leaflet-pane>svg path,.leaflet-tile-container{pointer-events:none}.leaflet-marker-icon.leaflet-interactive,.leaflet-image-layer.leaflet-interactive,.leaflet-pane>svg path.leaflet-interactive,svg.leaflet-image-layer.leaflet-interactive path{pointer-events:visiblePainted;pointer-events:auto}.leaflet-container{background:#ddd;outline-offset:1px}.leaflet-container a{color:#0078a8}.leaflet-zoom-box{border:2px dotted 
#38f;background:#ffffff80}.leaflet-container{font-family:Helvetica Neue,Arial,Helvetica,sans-serif;font-size:12px;font-size:.75rem;line-height:1.5}.leaflet-bar{box-shadow:0 1px 5px #000000a6;border-radius:4px}.leaflet-bar a{background-color:#fff;border-bottom:1px solid #ccc;width:26px;height:26px;line-height:26px;display:block;text-align:center;text-decoration:none;color:#000}.leaflet-bar a,.leaflet-control-layers-toggle{background-position:50% 50%;background-repeat:no-repeat;display:block}.leaflet-bar a:hover,.leaflet-bar a:focus{background-color:#f4f4f4}.leaflet-bar a:first-child{border-top-left-radius:4px;border-top-right-radius:4px}.leaflet-bar a:last-child{border-bottom-left-radius:4px;border-bottom-right-radius:4px;border-bottom:none}.leaflet-bar a.leaflet-disabled{cursor:default;background-color:#f4f4f4;color:#bbb}.leaflet-touch .leaflet-bar a{width:30px;height:30px;line-height:30px}.leaflet-touch .leaflet-bar a:first-child{border-top-left-radius:2px;border-top-right-radius:2px}.leaflet-touch .leaflet-bar a:last-child{border-bottom-left-radius:2px;border-bottom-right-radius:2px}.leaflet-control-zoom-in,.leaflet-control-zoom-out{font:700 18px Lucida Console,Monaco,monospace;text-indent:1px}.leaflet-touch .leaflet-control-zoom-in,.leaflet-touch .leaflet-control-zoom-out{font-size:22px}.leaflet-control-layers{box-shadow:0 1px 5px 
#0006;background:#fff;border-radius:5px}.leaflet-control-layers-toggle{background-image:url(data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABoAAAAaCAQAAAADQ4RFAAACf0lEQVR4AY1UM3gkARTePdvdoTxXKc+qTl3aU5U6b2Kbkz3Gtq3Zw6ziLGNPzrYx7946Tr6/ee/XeCQ4D3ykPtL5tHno4n0d/h3+xfuWHGLX81cn7r0iTNzjr7LrlxCqPtkbTQEHeqOrTy4Yyt3VCi/IOB0v7rVC7q45Q3Gr5K6jt+3Gl5nCoDD4MtO+j96Wu8atmhGqcNGHObuf8OM/x3AMx38+4Z2sPqzCxRFK2aF2e5Jol56XTLyggAMTL56XOMoS1W4pOyjUcGGQdZxU6qRh7B9Zp+PfpOFlqt0zyDZckPi1ttmIp03jX8gyJ8a/PG2yutpS/Vol7peZIbZcKBAEEheEIAgFbDkz5H6Zrkm2hVWGiXKiF4Ycw0RWKdtC16Q7qe3X4iOMxruonzegJzWaXFrU9utOSsLUmrc0YjeWYjCW4PDMADElpJSSQ0vQvA1Tm6/JlKnqFs1EGyZiFCqnRZTEJJJiKRYzVYzJck2Rm6P4iH+cmSY0YzimYa8l0EtTODFWhcMIMVqdsI2uiTvKmTisIDHJ3od5GILVhBCarCfVRmo4uTjkhrhzkiBV7SsaqS+TzrzM1qpGGUFt28pIySQHR6h7F6KSwGWm97ay+Z+ZqMcEjEWebE7wxCSQwpkhJqoZA5ivCdZDjJepuJ9IQjGGUmuXJdBFUygxVqVsxFsLMbDe8ZbDYVCGKxs+W080max1hFCarCfV+C1KATwcnvE9gRRuMP2prdbWGowm1KB1y+zwMMENkM755cJ2yPDtqhTI6ED1M/82yIDtC/4j4BijjeObflpO9I9MwXTCsSX8jWAFeHr05WoLTJ5G8IQVS/7vwR6ohirYM7f6HzYpogfS3R2OAAAAAElFTkSuQmCC);width:36px;height:36px}.leaflet-retina 
.leaflet-control-layers-toggle{background-image:url(data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAADQAAAA0CAQAAABvcdNgAAAEsklEQVR4AWL4TydIhpZK1kpWOlg0w3ZXP6D2soBtG42jeI6ZmQTHzAxiTbSJsYLjO9HhP+WOmcuhciVnmHVQcJnp7DFvScowZorad/+V/fVzMdMT2g9Cv9guXGv/7pYOrXh2U+RRR3dSd9JRx6bIFc/ekqHI29JC6pJ5ZEh1yWkhkbcFeSjxgx3L2m1cb1C7bceyxA+CNjT/Ifff+/kDk2u/w/33/IeCMOSaWZ4glosqT3DNnNZQ7Cs58/3Ce5HL78iZH/vKVIaYlqzfdLu8Vi7dnvUbEza5Idt36tquZFldl6N5Z/POLof0XLK61mZCmJSWjVF9tEjUluu74IUXvgttuVIHE7YxSkaYhJZam7yiM9Pv82JYfl9nptxZaxMJE4YSPty+vF0+Y2up9d3wwijfjZbabqm/3bZ9ecKHsiGmRflnn1MW4pjHf9oLufyn2z3y1D6n8g8TZhxyzipLNPnAUpsOiuWimg52psrTZYnOWYNDTMuWBWa0tJb4rgq1UvmutpaYEbZlwU3CLJm/ayYjHW5/h7xWLn9Hh1vepDkyf7dE7MtT5LR4e7yYpHrkhOUpEfssBLq2pPhAqoSWKUkk7EDqkmK6RrCEzqDjhNDWNE+XSMvkJRDWlZTmCW0l0PHQGRZY5t1L83kT0Y3l2SItk5JAWHl2dCOBm+fPu3fo5/3v61RMCO9Jx2EEYYhb0rmNQMX/vm7gqOEJLcXTGw3CAuRNeyaPWwjR8PRqKQ1PDA/dpv+on9Shox52WFnx0KY8onHayrJzm87i5h9xGw/tfkev0jGsQizqezUKjk12hBMKJ4kbCqGPVNXudyyrShovGw5CgxsRICxF6aRmSjlBnHRzg7Gx8fKqEubI2rahQYdR1YgDIRQO7JvQyD52hoIQx0mxa0ODtW2Iozn1le2iIRdzwWewedyZzewidueOGqlsn1MvcnQpuVwLGG3/IR1hIKxCjelIDZ8ldqWz25jWAsnldEnK0Zxro19TGVb2ffIZEsIO89EIEDvKMPrzmBOQcKQ+rroye6NgRRxqR4U8EAkz0CL6uSGOm6KQCdWjvjRiSP1BPalCRS5iQYiEIvxuBMJEWgzSoHADcVMuN7IuqqTeyUPq22qFimFtxDyBBJEwNyt6TM88blFHao/6tWWhuuOM4SAK4EI4QmFHA+SEyWlp4EQoJ13cYGzMu7yszEIBOm2rVmHUNqwAIQabISNMRstmdhNWcFLsSm+0tjJH1MdRxO5Nx0WDMhCtgD6OKgZeljJqJKc9po8juskR9XN0Y1lZ3mWjLR9JCO1jRDMd0fpYC2VnvjBSEFg7wBENc0R9HFlb0xvF1+TBEpF68d+DHR6IOWVv2BECtxo46hOFUBd/APU57WIoEwJhIi2CdpyZX0m93BZicktMj1AS9dClteUFAUNUIEygRZCtik5zSxI9MubTBH1GOiHsiLJ3OCoSZkILa9PxiN0EbvhsAo8tdAf9Seepd36lGWHmtNANTv5Jd0z4QYyeo/UEJqxKRpg5LZx6btLPsOaEmdMyxYdlc8LMaJnikDlhclqmPiQnTEpLUIZEwkRagjYkEibQErwhkTAKCLQEbUgkzJQWc/0PstHHcfEdQ+UAAAAASUVORK5CYII=);background-size:26px 26px}.leaflet-touch .leaflet-control-layers-toggle{width:44px;height:44px}.leaflet-control-layers .leaflet-control-layers-list,.leaflet-control-layers-expanded 
.leaflet-control-layers-toggle{display:none}.leaflet-control-layers-expanded .leaflet-control-layers-list{display:block;position:relative}.leaflet-control-layers-expanded{padding:6px 10px 6px 6px;color:#333;background:#fff}.leaflet-control-layers-scrollbar{overflow-y:scroll;overflow-x:hidden;padding-right:5px}.leaflet-control-layers-selector{margin-top:2px;position:relative;top:1px}.leaflet-control-layers label{display:block;font-size:13px;font-size:1.08333em}.leaflet-control-layers-separator{height:0;border-top:1px solid #ddd;margin:5px -10px 5px -6px}.leaflet-default-icon-path{background-image:url(data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABkAAAApCAYAAADAk4LOAAAFgUlEQVR4Aa1XA5BjWRTN2oW17d3YaZtr2962HUzbDNpjszW24mRt28p47v7zq/bXZtrp/lWnXr337j3nPCe85NcypgSFdugCpW5YoDAMRaIMqRi6aKq5E3YqDQO3qAwjVWrD8Ncq/RBpykd8oZUb/kaJutow8r1aP9II0WmLKLIsJyv1w/kqw9Ch2MYdB++12Onxee/QMwvf4/Dk/Lfp/i4nxTXtOoQ4pW5Aj7wpici1A9erdAN2OH64x8OSP9j3Ft3b7aWkTg/Fm91siTra0f9on5sQr9INejH6CUUUpavjFNq1B+Oadhxmnfa8RfEmN8VNAsQhPqF55xHkMzz3jSmChWU6f7/XZKNH+9+hBLOHYozuKQPxyMPUKkrX/K0uWnfFaJGS1QPRtZsOPtr3NsW0uyh6NNCOkU3Yz+bXbT3I8G3xE5EXLXtCXbbqwCO9zPQYPRTZ5vIDXD7U+w7rFDEoUUf7ibHIR4y6bLVPXrz8JVZEql13trxwue/uDivd3fkWRbS6/IA2bID4uk0UpF1N8qLlbBlXs4Ee7HLTfV1j54APvODnSfOWBqtKVvjgLKzF5YdEk5ewRkGlK0i33Eofffc7HT56jD7/6U+qH3Cx7SBLNntH5YIPvODnyfIXZYRVDPqgHtLs5ABHD3YzLuespb7t79FY34DjMwrVrcTuwlT55YMPvOBnRrJ4VXTdNnYug5ucHLBjEpt30701A3Ts+HEa73u6dT3FNWwflY86eMHPk+Yu+i6pzUpRrW7SNDg5JHR4KapmM5Wv2E8Tfcb1HoqqHMHU+uWDD7zg54mz5/2BSnizi9T1Dg4QQXLToGNCkb6tb1NU+QAlGr1++eADrzhn/u8Q2YZhQVlZ5+CAOtqfbhmaUCS1ezNFVm2imDbPmPng5wmz+gwh+oHDce0eUtQ6OGDIyR0uUhUsoO3vfDmmgOezH0mZN59x7MBi++WDL1g/eEiU3avlidO671bkLfwbw5XV2P8Pzo0ydy4t2/0eu33xYSOMOD8hTf4CrBtGMSoXfPLchX+J0ruSePw3LZeK0juPJbYzrhkH0io7B3k164hiGvawhOKMLkrQLyVpZg8rHFW7E2uHOL888IBPlNZ1FPzstSJM694fWr6RwpvcJK60+0HCILTBzZLFNdtAzJaohze60T8qBzyh5ZuOg5e7uwQppofEmf2++DYvmySqGBuKaicF1blQjhuHdvCIMvp8whTTfZzI7RldpwtSzL+F1+wkdZ2TBOW2gIF88PBTzD/gpeREAMEbxnJcaJHNHrpzji0gQCS6hdkEeYt9DF/2qPcEC8RM28Hwmr3
sdNyht00byAut2k3gufWNtgtOEOFGUwcXWNDbdNbpgBGxEvKkOQsxivJx33iow0Vw5S6SVTrpVq11ysA2Rp7gTfPfktc6zhtXBBC+adRLshf6sG2RfHPZ5EAc4sVZ83yCN00Fk/4kggu40ZTvIEm5g24qtU4KjBrx/BTTH8ifVASAG7gKrnWxJDcU7x8X6Ecczhm3o6YicvsLXWfh3Ch1W0k8x0nXF+0fFxgt4phz8QvypiwCCFKMqXCnqXExjq10beH+UUA7+nG6mdG/Pu0f3LgFcGrl2s0kNNjpmoJ9o4B29CMO8dMT4Q5ox8uitF6fqsrJOr8qnwNbRzv6hSnG5wP+64C7h9lp30hKNtKdWjtdkbuPA19nJ7Tz3zR/ibgARbhb4AlhavcBebmTHcFl2fvYEnW0ox9xMxKBS8btJ+KiEbq9zA4RthQXDhPa0T9TEe69gWupwc6uBUphquXgf+/FrIjweHQS4/pduMe5ERUMHUd9xv8ZR98CxkS4F2n3EUrUZ10EYNw7BWm9x1GiPssi3GgiGRDKWRYZfXlON+dfNbM+GgIwYdwAAAAASUVORK5CYII=)}.leaflet-container .leaflet-control-attribution{background:#fff;background:#fffc;margin:0}.leaflet-control-attribution,.leaflet-control-scale-line{padding:0 5px;color:#333;line-height:1.4}.leaflet-control-attribution a{text-decoration:none}.leaflet-control-attribution a:hover,.leaflet-control-attribution a:focus{text-decoration:underline}.leaflet-attribution-flag{display:inline!important;vertical-align:baseline!important;width:1em;height:.6669em}.leaflet-left .leaflet-control-scale{margin-left:5px}.leaflet-bottom .leaflet-control-scale{margin-bottom:5px}.leaflet-control-scale-line{border:2px solid #777;border-top:none;line-height:1.1;padding:2px 5px 1px;white-space:nowrap;box-sizing:border-box;background:#fffc;text-shadow:1px 1px #fff}.leaflet-control-scale-line:not(:first-child){border-top:2px solid #777;border-bottom:none;margin-top:-2px}.leaflet-control-scale-line:not(:first-child):not(:last-child){border-bottom:2px solid #777}.leaflet-touch .leaflet-control-attribution,.leaflet-touch .leaflet-control-layers,.leaflet-touch .leaflet-bar{box-shadow:none}.leaflet-touch .leaflet-control-layers,.leaflet-touch .leaflet-bar{border:2px solid rgba(0,0,0,.2);background-clip:padding-box}.leaflet-popup{position:absolute;text-align:center;margin-bottom:20px}.leaflet-popup-content-wrapper{padding:1px;text-align:left;border-radius:12px}.leaflet-popup-content{margin:13px 24px 13px 
20px;line-height:1.3;font-size:13px;font-size:1.08333em;min-height:1px}.leaflet-popup-content p{margin:1.3em 0}.leaflet-popup-tip-container{width:40px;height:20px;position:absolute;left:50%;margin-top:-1px;margin-left:-20px;overflow:hidden;pointer-events:none}.leaflet-popup-tip{width:17px;height:17px;padding:1px;margin:-10px auto 0;pointer-events:auto;transform:rotate(45deg)}.leaflet-popup-content-wrapper,.leaflet-popup-tip{background:#fff;color:#333;box-shadow:0 3px 14px #0006}.leaflet-container a.leaflet-popup-close-button{position:absolute;top:0;right:0;border:none;text-align:center;width:24px;height:24px;font:16px/24px Tahoma,Verdana,sans-serif;color:#757575;text-decoration:none;background:transparent}.leaflet-container a.leaflet-popup-close-button:hover,.leaflet-container a.leaflet-popup-close-button:focus{color:#585858}.leaflet-popup-scrolled{overflow:auto}.leaflet-oldie .leaflet-popup-content-wrapper{-ms-zoom:1}.leaflet-oldie .leaflet-popup-tip{width:24px;margin:0 auto;-ms-filter:"progid:DXImageTransform.Microsoft.Matrix(M11=0.70710678, M12=0.70710678, M21=-0.70710678, M22=0.70710678)";filter:progid:DXImageTransform.Microsoft.Matrix(M11=.70710678,M12=.70710678,M21=-.70710678,M22=.70710678)}.leaflet-oldie .leaflet-control-zoom,.leaflet-oldie .leaflet-control-layers,.leaflet-oldie .leaflet-popup-content-wrapper,.leaflet-oldie .leaflet-popup-tip{border:1px solid #999}.leaflet-div-icon{background:#fff;border:1px solid #666}.leaflet-tooltip{position:absolute;padding:6px;background-color:#fff;border:1px solid #fff;border-radius:3px;color:#222;white-space:nowrap;-webkit-user-select:none;-moz-user-select:none;user-select:none;pointer-events:none;box-shadow:0 1px 3px #0006}.leaflet-tooltip.leaflet-interactive{cursor:pointer;pointer-events:auto}.leaflet-tooltip-top:before,.leaflet-tooltip-bottom:before,.leaflet-tooltip-left:before,.leaflet-tooltip-right:before{position:absolute;pointer-events:none;border:6px solid 
transparent;background:transparent;content:""}.leaflet-tooltip-bottom{margin-top:6px}.leaflet-tooltip-top{margin-top:-6px}.leaflet-tooltip-bottom:before,.leaflet-tooltip-top:before{left:50%;margin-left:-6px}.leaflet-tooltip-top:before{bottom:0;margin-bottom:-12px;border-top-color:#fff}.leaflet-tooltip-bottom:before{top:0;margin-top:-12px;margin-left:-6px;border-bottom-color:#fff}.leaflet-tooltip-left{margin-left:-6px}.leaflet-tooltip-right{margin-left:6px}.leaflet-tooltip-left:before,.leaflet-tooltip-right:before{top:50%;margin-top:-6px}.leaflet-tooltip-left:before{right:0;margin-right:-12px;border-left-color:#fff}.leaflet-tooltip-right:before{left:0;margin-left:-12px;border-right-color:#fff}@media print{.leaflet-control{-webkit-print-color-adjust:exact;print-color-adjust:exact}}*,:before,:after{--tw-border-spacing-x: 0;--tw-border-spacing-y: 0;--tw-translate-x: 0;--tw-translate-y: 0;--tw-rotate: 0;--tw-skew-x: 0;--tw-skew-y: 0;--tw-scale-x: 1;--tw-scale-y: 1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness: proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width: 0px;--tw-ring-offset-color: #fff;--tw-ring-color: rgb(59 130 246 / .5);--tw-ring-offset-shadow: 0 0 #0000;--tw-ring-shadow: 0 0 #0000;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: ;--tw-contain-size: ;--tw-contain-layout: ;--tw-contain-paint: ;--tw-contain-style: }::backdrop{--tw-border-spacing-x: 0;--tw-border-spacing-y: 
0;--tw-translate-x: 0;--tw-translate-y: 0;--tw-rotate: 0;--tw-skew-x: 0;--tw-skew-y: 0;--tw-scale-x: 1;--tw-scale-y: 1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness: proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width: 0px;--tw-ring-offset-color: #fff;--tw-ring-color: rgb(59 130 246 / .5);--tw-ring-offset-shadow: 0 0 #0000;--tw-ring-shadow: 0 0 #0000;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: ;--tw-contain-size: ;--tw-contain-layout: ;--tw-contain-paint: ;--tw-contain-style: }*,:before,:after{box-sizing:border-box;border-width:0;border-style:solid;border-color:#e5e7eb}:before,:after{--tw-content: ""}html,:host{line-height:1.5;-webkit-text-size-adjust:100%;-moz-tab-size:4;-o-tab-size:4;tab-size:4;font-family:ui-sans-serif,system-ui,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol,"Noto Color Emoji";font-feature-settings:normal;font-variation-settings:normal;-webkit-tap-highlight-color:transparent}body{margin:0;line-height:inherit}hr{height:0;color:inherit;border-top-width:1px}abbr:where([title]){-webkit-text-decoration:underline dotted;text-decoration:underline dotted}h1,h2,h3,h4,h5,h6{font-size:inherit;font-weight:inherit}a{color:inherit;text-decoration:inherit}b,strong{font-weight:bolder}code,kbd,samp,pre{font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier 
New,monospace;font-feature-settings:normal;font-variation-settings:normal;font-size:1em}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}table{text-indent:0;border-color:inherit;border-collapse:collapse}button,input,optgroup,select,textarea{font-family:inherit;font-feature-settings:inherit;font-variation-settings:inherit;font-size:100%;font-weight:inherit;line-height:inherit;letter-spacing:inherit;color:inherit;margin:0;padding:0}button,select{text-transform:none}button,input:where([type=button]),input:where([type=reset]),input:where([type=submit]){-webkit-appearance:button;background-color:transparent;background-image:none}:-moz-focusring{outline:auto}:-moz-ui-invalid{box-shadow:none}progress{vertical-align:baseline}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}summary{display:list-item}blockquote,dl,dd,h1,h2,h3,h4,h5,h6,hr,figure,p,pre{margin:0}fieldset{margin:0;padding:0}legend{padding:0}ol,ul,menu{list-style:none;margin:0;padding:0}dialog{padding:0}textarea{resize:vertical}input::-moz-placeholder,textarea::-moz-placeholder{opacity:1;color:#9ca3af}input::placeholder,textarea::placeholder{opacity:1;color:#9ca3af}button,[role=button]{cursor:pointer}:disabled{cursor:default}img,svg,video,canvas,audio,iframe,embed,object{display:block;vertical-align:middle}img,video{max-width:100%;height:auto}[hidden]:where(:not([hidden=until-found])){display:none}h1{font-size:2.25rem;line-height:2.5rem;font-weight:700}h2{font-size:1.875rem;line-height:2.25rem;font-weight:600}h3{font-size:1.5rem;line-height:2rem;font-weight:600}.container{width:100%}@media (min-width: 640px){.container{max-width:640px}}@media (min-width: 768px){.container{max-width:768px}}@media (min-width: 
1024px){.container{max-width:1024px}}@media (min-width: 1280px){.container{max-width:1280px}}@media (min-width: 1536px){.container{max-width:1536px}}.card{border-radius:.5rem;--tw-bg-opacity: 1;background-color:rgb(255 255 255 / var(--tw-bg-opacity, 1));padding:1.5rem;--tw-shadow: 0 4px 6px -1px rgb(0 0 0 / .1), 0 2px 4px -2px rgb(0 0 0 / .1);--tw-shadow-colored: 0 4px 6px -1px var(--tw-shadow-color), 0 2px 4px -2px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.btn-primary{border-radius:.5rem;--tw-bg-opacity: 1;background-color:rgb(53 79 82 / var(--tw-bg-opacity, 1));padding:.5rem 1rem;font-weight:500;--tw-text-opacity: 1;color:rgb(255 255 255 / var(--tw-text-opacity, 1));transition-property:color,background-color,border-color,text-decoration-color,fill,stroke;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.btn-primary:hover{--tw-bg-opacity: 1;background-color:rgb(46 67 70 / var(--tw-bg-opacity, 1))}.btn-secondary{border-radius:.5rem;--tw-bg-opacity: 1;background-color:rgb(229 231 235 / var(--tw-bg-opacity, 1));padding:.5rem 1rem;font-weight:500;--tw-text-opacity: 1;color:rgb(31 41 55 / var(--tw-text-opacity, 1));transition-property:color,background-color,border-color,text-decoration-color,fill,stroke;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.btn-secondary:hover{--tw-bg-opacity: 1;background-color:rgb(209 213 219 / var(--tw-bg-opacity, 
1))}.pointer-events-auto{pointer-events:auto}.visible{visibility:visible}.invisible{visibility:hidden}.fixed{position:fixed}.absolute{position:absolute}.relative{position:relative}.sticky{position:sticky}.inset-0{top:0;right:0;bottom:0;left:0}.inset-y-0{top:0;bottom:0}.-bottom-6{bottom:-1.5rem}.bottom-0{bottom:0}.bottom-4{bottom:1rem}.left-0{left:0}.left-1\/2{left:50%}.left-3{left:.75rem}.left-4{left:1rem}.right-0{right:0}.right-2{right:.5rem}.right-4{right:1rem}.top-0{top:0}.top-1\.5{top:.375rem}.top-1\/2{top:50%}.top-16{top:4rem}.top-2{top:.5rem}.top-2\.5{top:.625rem}.top-3\.5{top:.875rem}.z-10{z-index:10}.z-20{z-index:20}.z-30{z-index:30}.z-40{z-index:40}.z-50{z-index:50}.z-\[1\]{z-index:1}.mx-8{margin-left:2rem;margin-right:2rem}.mx-auto{margin-left:auto;margin-right:auto}.my-1{margin-top:.25rem;margin-bottom:.25rem}.my-4{margin-top:1rem;margin-bottom:1rem}.mb-1{margin-bottom:.25rem}.mb-1\.5{margin-bottom:.375rem}.mb-12{margin-bottom:3rem}.mb-16{margin-bottom:4rem}.mb-2{margin-bottom:.5rem}.mb-3{margin-bottom:.75rem}.mb-4{margin-bottom:1rem}.mb-6{margin-bottom:1.5rem}.mb-8{margin-bottom:2rem}.ml-1{margin-left:.25rem}.ml-2{margin-left:.5rem}.ml-4{margin-left:1rem}.mr-2{margin-right:.5rem}.mt-0\.5{margin-top:.125rem}.mt-1{margin-top:.25rem}.mt-12{margin-top:3rem}.mt-16{margin-top:4rem}.mt-2{margin-top:.5rem}.mt-3{margin-top:.75rem}.mt-4{margin-top:1rem}.mt-6{margin-top:1.5rem}.mt-8{margin-top:2rem}.line-clamp-2{overflow:hidden;display:-webkit-box;-webkit-box-orient:vertical;-webkit-line-clamp:2}.block{display:block}.inline-block{display:inline-block}.flex{display:flex}.inline-flex{display:inline-flex}.table{display:table}.grid{display:grid}.hidden{display:none}.h-1{height:.25rem}.h-10{height:2.5rem}.h-12{height:3rem}.h-14{height:3.5rem}.h-16{height:4rem}.h-2{height:.5rem}.h-20{height:5rem}.h-24{height:6rem}.h-3{height:.75rem}.h-4{height:1rem}.h-5{height:1.25rem}.h-6{height:1.5rem}.h-64{height:16rem}.h-7{height:1.75rem}.h-8{height:2rem}.h-9{height:2.25rem}.h-96{hei
ght:24rem}.h-\[600px\]{height:600px}.h-\[calc\(100vh-10rem\)\]{height:calc(100vh - 10rem)}.h-auto{height:auto}.h-screen{height:100vh}.max-h-48{max-height:12rem}.max-h-60{max-height:15rem}.max-h-96{max-height:24rem}.min-h-full{min-height:100%}.min-h-screen{min-height:100vh}.w-10{width:2.5rem}.w-12{width:3rem}.w-14{width:3.5rem}.w-16{width:4rem}.w-2{width:.5rem}.w-20{width:5rem}.w-24{width:6rem}.w-3{width:.75rem}.w-32{width:8rem}.w-4{width:1rem}.w-5{width:1.25rem}.w-6{width:1.5rem}.w-64{width:16rem}.w-7{width:1.75rem}.w-8{width:2rem}.w-9{width:2.25rem}.w-full{width:100%}.min-w-0{min-width:0px}.min-w-\[150px\]{min-width:150px}.min-w-\[200px\]{min-width:200px}.min-w-\[250px\]{min-width:250px}.min-w-full{min-width:100%}.max-w-2xl{max-width:42rem}.max-w-3xl{max-width:48rem}.max-w-4xl{max-width:56rem}.max-w-5xl{max-width:64rem}.max-w-6xl{max-width:72rem}.max-w-7xl{max-width:80rem}.max-w-md{max-width:28rem}.max-w-sm{max-width:24rem}.max-w-xs{max-width:20rem}.flex-1{flex:1 1 0%}.flex-shrink-0{flex-shrink:0}.grow{flex-grow:1}.origin-left{transform-origin:left}.-translate-x-1\/2{--tw-translate-x: -50%;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.-translate-x-full{--tw-translate-x: -100%;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.-translate-y-1\/2{--tw-translate-y: -50%;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.translate-x-0{--tw-translate-x: 0px;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) 
scaleY(var(--tw-scale-y))}.translate-y-full{--tw-translate-y: 100%;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.scale-100{--tw-scale-x: 1;--tw-scale-y: 1;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.scale-95{--tw-scale-x: .95;--tw-scale-y: .95;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.scale-x-0{--tw-scale-x: 0;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.transform{transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.animate-\[slideUp_0\.6s_ease-out\]{animation:slideUp .6s ease-out}.animate-\[slideUp_0\.8s_ease-out_0\.2s_both\]{animation:slideUp .8s ease-out .2s both}.animate-\[slideUp_0\.8s_ease-out_0\.4s_both\]{animation:slideUp .8s ease-out .4s both}.animate-\[slideUp_0\.8s_ease-out_0\.6s_both\]{animation:slideUp .8s ease-out .6s both}.animate-\[slideUp_0\.8s_ease-out_0\.8s_both\]{animation:slideUp .8s ease-out .8s both}@keyframes pulse{50%{opacity:.5}}.animate-pulse{animation:pulse 2s cubic-bezier(.4,0,.6,1) infinite}@keyframes spin{to{transform:rotate(360deg)}}.animate-spin{animation:spin 1s linear 
infinite}.cursor-not-allowed{cursor:not-allowed}.cursor-pointer{cursor:pointer}.list-inside{list-style-position:inside}.list-decimal{list-style-type:decimal}.list-disc{list-style-type:disc}.grid-cols-1{grid-template-columns:repeat(1,minmax(0,1fr))}.grid-cols-2{grid-template-columns:repeat(2,minmax(0,1fr))}.grid-cols-3{grid-template-columns:repeat(3,minmax(0,1fr))}.flex-col{flex-direction:column}.flex-wrap{flex-wrap:wrap}.items-start{align-items:flex-start}.items-end{align-items:flex-end}.items-center{align-items:center}.justify-center{justify-content:center}.justify-between{justify-content:space-between}.gap-1{gap:.25rem}.gap-1\.5{gap:.375rem}.gap-12{gap:3rem}.gap-2{gap:.5rem}.gap-3{gap:.75rem}.gap-4{gap:1rem}.gap-6{gap:1.5rem}.gap-8{gap:2rem}.space-x-2>:not([hidden])~:not([hidden]){--tw-space-x-reverse: 0;margin-right:calc(.5rem * var(--tw-space-x-reverse));margin-left:calc(.5rem * calc(1 - var(--tw-space-x-reverse)))}.space-y-1>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(.25rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(.25rem * var(--tw-space-y-reverse))}.space-y-2>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(.5rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(.5rem * var(--tw-space-y-reverse))}.space-y-3>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(.75rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(.75rem * var(--tw-space-y-reverse))}.space-y-4>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(1rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(1rem * var(--tw-space-y-reverse))}.space-y-6>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(1.5rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(1.5rem * var(--tw-space-y-reverse))}.space-y-8>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(2rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(2rem * 
var(--tw-space-y-reverse))}.divide-y>:not([hidden])~:not([hidden]){--tw-divide-y-reverse: 0;border-top-width:calc(1px * calc(1 - var(--tw-divide-y-reverse)));border-bottom-width:calc(1px * var(--tw-divide-y-reverse))}.divide-gray-200>:not([hidden])~:not([hidden]){--tw-divide-opacity: 1;border-color:rgb(229 231 235 / var(--tw-divide-opacity, 1))}.overflow-hidden{overflow:hidden}.overflow-x-auto{overflow-x:auto}.overflow-y-auto{overflow-y:auto}.truncate{overflow:hidden;text-overflow:ellipsis;white-space:nowrap}.whitespace-nowrap{white-space:nowrap}.rounded{border-radius:.25rem}.rounded-2xl{border-radius:1rem}.rounded-full{border-radius:9999px}.rounded-lg{border-radius:.5rem}.rounded-md{border-radius:.375rem}.rounded-xl{border-radius:.75rem}.border{border-width:1px}.border-2{border-width:2px}.border-4{border-width:4px}.border-b{border-bottom-width:1px}.border-b-2{border-bottom-width:2px}.border-l-2{border-left-width:2px}.border-l-4{border-left-width:4px}.border-r{border-right-width:1px}.border-t{border-top-width:1px}.border-dashed{border-style:dashed}.border-\[\#354F52\]{--tw-border-opacity: 1;border-color:rgb(53 79 82 / var(--tw-border-opacity, 1))}.border-amber-200{--tw-border-opacity: 1;border-color:rgb(253 230 138 / var(--tw-border-opacity, 1))}.border-amber-500{--tw-border-opacity: 1;border-color:rgb(245 158 11 / var(--tw-border-opacity, 1))}.border-blue-200{--tw-border-opacity: 1;border-color:rgb(191 219 254 / var(--tw-border-opacity, 1))}.border-blue-400{--tw-border-opacity: 1;border-color:rgb(96 165 250 / var(--tw-border-opacity, 1))}.border-blue-500{--tw-border-opacity: 1;border-color:rgb(59 130 246 / var(--tw-border-opacity, 1))}.border-blue-600{--tw-border-opacity: 1;border-color:rgb(37 99 235 / var(--tw-border-opacity, 1))}.border-current{border-color:currentColor}.border-emerald-500{--tw-border-opacity: 1;border-color:rgb(16 185 129 / var(--tw-border-opacity, 1))}.border-gray-100{--tw-border-opacity: 1;border-color:rgb(243 244 246 / 
var(--tw-border-opacity, 1))}.border-gray-200{--tw-border-opacity: 1;border-color:rgb(229 231 235 / var(--tw-border-opacity, 1))}.border-gray-300{--tw-border-opacity: 1;border-color:rgb(209 213 219 / var(--tw-border-opacity, 1))}.border-gray-700{--tw-border-opacity: 1;border-color:rgb(55 65 81 / var(--tw-border-opacity, 1))}.border-gray-900{--tw-border-opacity: 1;border-color:rgb(17 24 39 / var(--tw-border-opacity, 1))}.border-green-200{--tw-border-opacity: 1;border-color:rgb(187 247 208 / var(--tw-border-opacity, 1))}.border-green-500{--tw-border-opacity: 1;border-color:rgb(34 197 94 / var(--tw-border-opacity, 1))}.border-green-500\/30{border-color:#22c55e4d}.border-green-500\/50{border-color:#22c55e80}.border-orange-200{--tw-border-opacity: 1;border-color:rgb(254 215 170 / var(--tw-border-opacity, 1))}.border-pink-200{--tw-border-opacity: 1;border-color:rgb(251 207 232 / var(--tw-border-opacity, 1))}.border-primary-500{--tw-border-opacity: 1;border-color:rgb(53 79 82 / var(--tw-border-opacity, 1))}.border-primary-600{--tw-border-opacity: 1;border-color:rgb(46 67 70 / var(--tw-border-opacity, 1))}.border-purple-200{--tw-border-opacity: 1;border-color:rgb(233 213 255 / var(--tw-border-opacity, 1))}.border-purple-500{--tw-border-opacity: 1;border-color:rgb(168 85 247 / var(--tw-border-opacity, 1))}.border-red-200{--tw-border-opacity: 1;border-color:rgb(254 202 202 / var(--tw-border-opacity, 1))}.border-red-300{--tw-border-opacity: 1;border-color:rgb(252 165 165 / var(--tw-border-opacity, 1))}.border-red-400{--tw-border-opacity: 1;border-color:rgb(248 113 113 / var(--tw-border-opacity, 1))}.border-red-500{--tw-border-opacity: 1;border-color:rgb(239 68 68 / var(--tw-border-opacity, 1))}.border-red-500\/30{border-color:#ef44444d}.border-red-500\/50{border-color:#ef444480}.border-teal-200{--tw-border-opacity: 1;border-color:rgb(153 246 228 / var(--tw-border-opacity, 1))}.border-transparent{border-color:transparent}.border-white{--tw-border-opacity: 
1;border-color:rgb(255 255 255 / var(--tw-border-opacity, 1))}.border-yellow-200{--tw-border-opacity: 1;border-color:rgb(254 240 138 / var(--tw-border-opacity, 1))}.border-yellow-500\/30{border-color:#eab3084d}.border-yellow-500\/50{border-color:#eab30880}.border-t-primary-600{--tw-border-opacity: 1;border-top-color:rgb(46 67 70 / var(--tw-border-opacity, 1))}.border-t-transparent{border-top-color:transparent}.bg-\[\#354F52\]{--tw-bg-opacity: 1;background-color:rgb(53 79 82 / var(--tw-bg-opacity, 1))}.bg-\[\#E8EFEA\]{--tw-bg-opacity: 1;background-color:rgb(232 239 234 / var(--tw-bg-opacity, 1))}.bg-amber-100{--tw-bg-opacity: 1;background-color:rgb(254 243 199 / var(--tw-bg-opacity, 1))}.bg-amber-50{--tw-bg-opacity: 1;background-color:rgb(255 251 235 / var(--tw-bg-opacity, 1))}.bg-amber-600{--tw-bg-opacity: 1;background-color:rgb(217 119 6 / var(--tw-bg-opacity, 1))}.bg-black{--tw-bg-opacity: 1;background-color:rgb(0 0 0 / var(--tw-bg-opacity, 1))}.bg-blue-100{--tw-bg-opacity: 1;background-color:rgb(219 234 254 / var(--tw-bg-opacity, 1))}.bg-blue-50{--tw-bg-opacity: 1;background-color:rgb(239 246 255 / var(--tw-bg-opacity, 1))}.bg-blue-500{--tw-bg-opacity: 1;background-color:rgb(59 130 246 / var(--tw-bg-opacity, 1))}.bg-blue-600{--tw-bg-opacity: 1;background-color:rgb(37 99 235 / var(--tw-bg-opacity, 1))}.bg-emerald-100{--tw-bg-opacity: 1;background-color:rgb(209 250 229 / var(--tw-bg-opacity, 1))}.bg-gray-100{--tw-bg-opacity: 1;background-color:rgb(243 244 246 / var(--tw-bg-opacity, 1))}.bg-gray-200{--tw-bg-opacity: 1;background-color:rgb(229 231 235 / var(--tw-bg-opacity, 1))}.bg-gray-50{--tw-bg-opacity: 1;background-color:rgb(249 250 251 / var(--tw-bg-opacity, 1))}.bg-gray-500{--tw-bg-opacity: 1;background-color:rgb(107 114 128 / var(--tw-bg-opacity, 1))}.bg-gray-800{--tw-bg-opacity: 1;background-color:rgb(31 41 55 / var(--tw-bg-opacity, 1))}.bg-gray-800\/50{background-color:#1f293780}.bg-gray-900{--tw-bg-opacity: 1;background-color:rgb(17 24 39 / 
var(--tw-bg-opacity, 1))}.bg-green-100{--tw-bg-opacity: 1;background-color:rgb(220 252 231 / var(--tw-bg-opacity, 1))}.bg-green-50{--tw-bg-opacity: 1;background-color:rgb(240 253 244 / var(--tw-bg-opacity, 1))}.bg-green-500{--tw-bg-opacity: 1;background-color:rgb(34 197 94 / var(--tw-bg-opacity, 1))}.bg-green-500\/10{background-color:#22c55e1a}.bg-green-500\/20{background-color:#22c55e33}.bg-green-500\/30{background-color:#22c55e4d}.bg-green-600{--tw-bg-opacity: 1;background-color:rgb(22 163 74 / var(--tw-bg-opacity, 1))}.bg-neutral-600{--tw-bg-opacity: 1;background-color:rgb(53 79 82 / var(--tw-bg-opacity, 1))}.bg-orange-100{--tw-bg-opacity: 1;background-color:rgb(255 237 213 / var(--tw-bg-opacity, 1))}.bg-orange-50{--tw-bg-opacity: 1;background-color:rgb(255 247 237 / var(--tw-bg-opacity, 1))}.bg-pink-100{--tw-bg-opacity: 1;background-color:rgb(252 231 243 / var(--tw-bg-opacity, 1))}.bg-primary-100{--tw-bg-opacity: 1;background-color:rgb(197 202 206 / var(--tw-bg-opacity, 1))}.bg-primary-50{--tw-bg-opacity: 1;background-color:rgb(232 234 235 / var(--tw-bg-opacity, 1))}.bg-primary-500{--tw-bg-opacity: 1;background-color:rgb(53 79 82 / var(--tw-bg-opacity, 1))}.bg-primary-600{--tw-bg-opacity: 1;background-color:rgb(46 67 70 / var(--tw-bg-opacity, 1))}.bg-purple-100{--tw-bg-opacity: 1;background-color:rgb(243 232 255 / var(--tw-bg-opacity, 1))}.bg-purple-50{--tw-bg-opacity: 1;background-color:rgb(250 245 255 / var(--tw-bg-opacity, 1))}.bg-red-100{--tw-bg-opacity: 1;background-color:rgb(254 226 226 / var(--tw-bg-opacity, 1))}.bg-red-50{--tw-bg-opacity: 1;background-color:rgb(254 242 242 / var(--tw-bg-opacity, 1))}.bg-red-500{--tw-bg-opacity: 1;background-color:rgb(239 68 68 / var(--tw-bg-opacity, 1))}.bg-red-500\/10{background-color:#ef44441a}.bg-red-500\/20{background-color:#ef444433}.bg-red-500\/30{background-color:#ef44444d}.bg-red-600{--tw-bg-opacity: 1;background-color:rgb(220 38 38 / var(--tw-bg-opacity, 1))}.bg-teal-100{--tw-bg-opacity: 
1;background-color:rgb(204 251 241 / var(--tw-bg-opacity, 1))}.bg-white{--tw-bg-opacity: 1;background-color:rgb(255 255 255 / var(--tw-bg-opacity, 1))}.bg-white\/10{background-color:#ffffff1a}.bg-white\/95{background-color:#fffffff2}.bg-yellow-100{--tw-bg-opacity: 1;background-color:rgb(254 249 195 / var(--tw-bg-opacity, 1))}.bg-yellow-50{--tw-bg-opacity: 1;background-color:rgb(254 252 232 / var(--tw-bg-opacity, 1))}.bg-yellow-500{--tw-bg-opacity: 1;background-color:rgb(234 179 8 / var(--tw-bg-opacity, 1))}.bg-yellow-500\/10{background-color:#eab3081a}.bg-yellow-500\/20{background-color:#eab30833}.bg-yellow-500\/30{background-color:#eab3084d}.bg-opacity-10{--tw-bg-opacity: .1}.bg-opacity-25{--tw-bg-opacity: .25}.bg-opacity-50{--tw-bg-opacity: .5}.bg-gradient-to-br{background-image:linear-gradient(to bottom right,var(--tw-gradient-stops))}.bg-gradient-to-r{background-image:linear-gradient(to right,var(--tw-gradient-stops))}.from-\[\#52796F\]{--tw-gradient-from: #52796F var(--tw-gradient-from-position);--tw-gradient-to: rgb(82 121 111 / 0) var(--tw-gradient-to-position);--tw-gradient-stops: var(--tw-gradient-from), var(--tw-gradient-to)}.from-blue-50{--tw-gradient-from: #eff6ff var(--tw-gradient-from-position);--tw-gradient-to: rgb(239 246 255 / 0) var(--tw-gradient-to-position);--tw-gradient-stops: var(--tw-gradient-from), var(--tw-gradient-to)}.from-gray-50{--tw-gradient-from: #f9fafb var(--tw-gradient-from-position);--tw-gradient-to: rgb(249 250 251 / 0) var(--tw-gradient-to-position);--tw-gradient-stops: var(--tw-gradient-from), var(--tw-gradient-to)}.from-primary-50{--tw-gradient-from: #e8eaeb var(--tw-gradient-from-position);--tw-gradient-to: rgb(232 234 235 / 0) var(--tw-gradient-to-position);--tw-gradient-stops: var(--tw-gradient-from), var(--tw-gradient-to)}.from-primary-500{--tw-gradient-from: #354F52 var(--tw-gradient-from-position);--tw-gradient-to: rgb(53 79 82 / 0) var(--tw-gradient-to-position);--tw-gradient-stops: var(--tw-gradient-from), 
var(--tw-gradient-to)}.from-primary-600{--tw-gradient-from: #2e4346 var(--tw-gradient-from-position);--tw-gradient-to: rgb(46 67 70 / 0) var(--tw-gradient-to-position);--tw-gradient-stops: var(--tw-gradient-from), var(--tw-gradient-to)}.from-purple-600{--tw-gradient-from: #9333ea var(--tw-gradient-from-position);--tw-gradient-to: rgb(147 51 234 / 0) var(--tw-gradient-to-position);--tw-gradient-stops: var(--tw-gradient-from), var(--tw-gradient-to)}.from-red-500{--tw-gradient-from: #ef4444 var(--tw-gradient-from-position);--tw-gradient-to: rgb(239 68 68 / 0) var(--tw-gradient-to-position);--tw-gradient-stops: var(--tw-gradient-from), var(--tw-gradient-to)}.from-teal-50{--tw-gradient-from: #f0fdfa var(--tw-gradient-from-position);--tw-gradient-to: rgb(240 253 250 / 0) var(--tw-gradient-to-position);--tw-gradient-stops: var(--tw-gradient-from), var(--tw-gradient-to)}.to-\[\#84A98C\]{--tw-gradient-to: #84A98C var(--tw-gradient-to-position)}.to-blue-50{--tw-gradient-to: #eff6ff var(--tw-gradient-to-position)}.to-gray-100{--tw-gradient-to: #f3f4f6 var(--tw-gradient-to-position)}.to-indigo-50{--tw-gradient-to: #eef2ff var(--tw-gradient-to-position)}.to-pink-600{--tw-gradient-to: #db2777 var(--tw-gradient-to-position)}.to-primary-100{--tw-gradient-to: #c5cace var(--tw-gradient-to-position)}.to-primary-600{--tw-gradient-to: #2e4346 var(--tw-gradient-to-position)}.to-primary-700{--tw-gradient-to: #27383a var(--tw-gradient-to-position)}.to-red-600{--tw-gradient-to: #dc2626 var(--tw-gradient-to-position)}.to-white{--tw-gradient-to: #fff 
var(--tw-gradient-to-position)}.bg-clip-text{-webkit-background-clip:text;background-clip:text}.object-contain{-o-object-fit:contain;object-fit:contain}.object-cover{-o-object-fit:cover;object-fit:cover}.p-0\.5{padding:.125rem}.p-12{padding:3rem}.p-2{padding:.5rem}.p-3{padding:.75rem}.p-4{padding:1rem}.p-6{padding:1.5rem}.p-8{padding:2rem}.px-1\.5{padding-left:.375rem;padding-right:.375rem}.px-12{padding-left:3rem;padding-right:3rem}.px-2{padding-left:.5rem;padding-right:.5rem}.px-2\.5{padding-left:.625rem;padding-right:.625rem}.px-3{padding-left:.75rem;padding-right:.75rem}.px-4{padding-left:1rem;padding-right:1rem}.px-5{padding-left:1.25rem;padding-right:1.25rem}.px-6{padding-left:1.5rem;padding-right:1.5rem}.px-8{padding-left:2rem;padding-right:2rem}.py-0\.5{padding-top:.125rem;padding-bottom:.125rem}.py-1{padding-top:.25rem;padding-bottom:.25rem}.py-1\.5{padding-top:.375rem;padding-bottom:.375rem}.py-12{padding-top:3rem;padding-bottom:3rem}.py-16{padding-top:4rem;padding-bottom:4rem}.py-2{padding-top:.5rem;padding-bottom:.5rem}.py-2\.5{padding-top:.625rem;padding-bottom:.625rem}.py-20{padding-top:5rem;padding-bottom:5rem}.py-3{padding-top:.75rem;padding-bottom:.75rem}.py-4{padding-top:1rem;padding-bottom:1rem}.py-6{padding-top:1.5rem;padding-bottom:1.5rem}.py-8{padding-top:2rem;padding-bottom:2rem}.pb-1{padding-bottom:.25rem}.pb-12{padding-bottom:3rem}.pb-16{padding-bottom:4rem}.pb-20{padding-bottom:5rem}.pb-4{padding-bottom:1rem}.pb-6{padding-bottom:1.5rem}.pl-10{padding-left:2.5rem}.pl-12{padding-left:3rem}.pl-4{padding-left:1rem}.pt-16{padding-top:4rem}.pt-2{padding-top:.5rem}.pt-20{padding-top:5rem}.pt-3{padding-top:.75rem}.pt-32{padding-top:8rem}.pt-4{padding-top:1rem}.text-left{text-align:left}.text-center{text-align:center}.align-middle{vertical-align:middle}.font-mono{font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier 
New,monospace}.text-2xl{font-size:1.5rem;line-height:2rem}.text-3xl{font-size:1.875rem;line-height:2.25rem}.text-4xl{font-size:2.25rem;line-height:2.5rem}.text-5xl{font-size:3rem;line-height:1}.text-6xl{font-size:3.75rem;line-height:1}.text-\[10px\]{font-size:10px}.text-base{font-size:1rem;line-height:1.5rem}.text-lg{font-size:1.125rem;line-height:1.75rem}.text-sm{font-size:.875rem;line-height:1.25rem}.text-xl{font-size:1.25rem;line-height:1.75rem}.text-xs{font-size:.75rem;line-height:1rem}.font-bold{font-weight:700}.font-medium{font-weight:500}.font-normal{font-weight:400}.font-semibold{font-weight:600}.uppercase{text-transform:uppercase}.capitalize{text-transform:capitalize}.italic{font-style:italic}.leading-5{line-height:1.25rem}.leading-6{line-height:1.5rem}.leading-relaxed{line-height:1.625}.tracking-wide{letter-spacing:.025em}.tracking-wider{letter-spacing:.05em}.text-\[\#354F52\]{--tw-text-opacity: 1;color:rgb(53 79 82 / var(--tw-text-opacity, 1))}.text-\[\#52796F\]{--tw-text-opacity: 1;color:rgb(82 121 111 / var(--tw-text-opacity, 1))}.text-amber-600{--tw-text-opacity: 1;color:rgb(217 119 6 / var(--tw-text-opacity, 1))}.text-amber-700{--tw-text-opacity: 1;color:rgb(180 83 9 / var(--tw-text-opacity, 1))}.text-amber-800{--tw-text-opacity: 1;color:rgb(146 64 14 / var(--tw-text-opacity, 1))}.text-blue-300{--tw-text-opacity: 1;color:rgb(147 197 253 / var(--tw-text-opacity, 1))}.text-blue-500{--tw-text-opacity: 1;color:rgb(59 130 246 / var(--tw-text-opacity, 1))}.text-blue-600{--tw-text-opacity: 1;color:rgb(37 99 235 / var(--tw-text-opacity, 1))}.text-blue-700{--tw-text-opacity: 1;color:rgb(29 78 216 / var(--tw-text-opacity, 1))}.text-blue-800{--tw-text-opacity: 1;color:rgb(30 64 175 / var(--tw-text-opacity, 1))}.text-blue-900{--tw-text-opacity: 1;color:rgb(30 58 138 / var(--tw-text-opacity, 1))}.text-cyan-600{--tw-text-opacity: 1;color:rgb(8 145 178 / var(--tw-text-opacity, 1))}.text-emerald-600{--tw-text-opacity: 1;color:rgb(5 150 105 / var(--tw-text-opacity, 
1))}.text-gray-200{--tw-text-opacity: 1;color:rgb(229 231 235 / var(--tw-text-opacity, 1))}.text-gray-300{--tw-text-opacity: 1;color:rgb(209 213 219 / var(--tw-text-opacity, 1))}.text-gray-400{--tw-text-opacity: 1;color:rgb(156 163 175 / var(--tw-text-opacity, 1))}.text-gray-500{--tw-text-opacity: 1;color:rgb(107 114 128 / var(--tw-text-opacity, 1))}.text-gray-600{--tw-text-opacity: 1;color:rgb(75 85 99 / var(--tw-text-opacity, 1))}.text-gray-700{--tw-text-opacity: 1;color:rgb(55 65 81 / var(--tw-text-opacity, 1))}.text-gray-800{--tw-text-opacity: 1;color:rgb(31 41 55 / var(--tw-text-opacity, 1))}.text-gray-900{--tw-text-opacity: 1;color:rgb(17 24 39 / var(--tw-text-opacity, 1))}.text-green-300{--tw-text-opacity: 1;color:rgb(134 239 172 / var(--tw-text-opacity, 1))}.text-green-300\/70{color:#86efacb3}.text-green-400{--tw-text-opacity: 1;color:rgb(74 222 128 / var(--tw-text-opacity, 1))}.text-green-500{--tw-text-opacity: 1;color:rgb(34 197 94 / var(--tw-text-opacity, 1))}.text-green-600{--tw-text-opacity: 1;color:rgb(22 163 74 / var(--tw-text-opacity, 1))}.text-green-700{--tw-text-opacity: 1;color:rgb(21 128 61 / var(--tw-text-opacity, 1))}.text-green-800{--tw-text-opacity: 1;color:rgb(22 101 52 / var(--tw-text-opacity, 1))}.text-green-900{--tw-text-opacity: 1;color:rgb(20 83 45 / var(--tw-text-opacity, 1))}.text-orange-600{--tw-text-opacity: 1;color:rgb(234 88 12 / var(--tw-text-opacity, 1))}.text-orange-700{--tw-text-opacity: 1;color:rgb(194 65 12 / var(--tw-text-opacity, 1))}.text-orange-800{--tw-text-opacity: 1;color:rgb(154 52 18 / var(--tw-text-opacity, 1))}.text-orange-900{--tw-text-opacity: 1;color:rgb(124 45 18 / var(--tw-text-opacity, 1))}.text-pink-600{--tw-text-opacity: 1;color:rgb(219 39 119 / var(--tw-text-opacity, 1))}.text-pink-700{--tw-text-opacity: 1;color:rgb(190 24 93 / var(--tw-text-opacity, 1))}.text-primary-50{--tw-text-opacity: 1;color:rgb(232 234 235 / var(--tw-text-opacity, 1))}.text-primary-600{--tw-text-opacity: 1;color:rgb(46 67 70 / 
var(--tw-text-opacity, 1))}.text-primary-700{--tw-text-opacity: 1;color:rgb(39 56 58 / var(--tw-text-opacity, 1))}.text-purple-50{--tw-text-opacity: 1;color:rgb(250 245 255 / var(--tw-text-opacity, 1))}.text-purple-600{--tw-text-opacity: 1;color:rgb(147 51 234 / var(--tw-text-opacity, 1))}.text-purple-700{--tw-text-opacity: 1;color:rgb(126 34 206 / var(--tw-text-opacity, 1))}.text-purple-800{--tw-text-opacity: 1;color:rgb(107 33 168 / var(--tw-text-opacity, 1))}.text-purple-900{--tw-text-opacity: 1;color:rgb(88 28 135 / var(--tw-text-opacity, 1))}.text-red-300{--tw-text-opacity: 1;color:rgb(252 165 165 / var(--tw-text-opacity, 1))}.text-red-300\/70{color:#fca5a5b3}.text-red-400{--tw-text-opacity: 1;color:rgb(248 113 113 / var(--tw-text-opacity, 1))}.text-red-500{--tw-text-opacity: 1;color:rgb(239 68 68 / var(--tw-text-opacity, 1))}.text-red-600{--tw-text-opacity: 1;color:rgb(220 38 38 / var(--tw-text-opacity, 1))}.text-red-700{--tw-text-opacity: 1;color:rgb(185 28 28 / var(--tw-text-opacity, 1))}.text-red-800{--tw-text-opacity: 1;color:rgb(153 27 27 / var(--tw-text-opacity, 1))}.text-red-900{--tw-text-opacity: 1;color:rgb(127 29 29 / var(--tw-text-opacity, 1))}.text-teal-600{--tw-text-opacity: 1;color:rgb(13 148 136 / var(--tw-text-opacity, 1))}.text-teal-700{--tw-text-opacity: 1;color:rgb(15 118 110 / var(--tw-text-opacity, 1))}.text-teal-800{--tw-text-opacity: 1;color:rgb(17 94 89 / var(--tw-text-opacity, 1))}.text-transparent{color:transparent}.text-white{--tw-text-opacity: 1;color:rgb(255 255 255 / var(--tw-text-opacity, 1))}.text-white\/90{color:#ffffffe6}.text-yellow-300{--tw-text-opacity: 1;color:rgb(253 224 71 / var(--tw-text-opacity, 1))}.text-yellow-300\/70{color:#fde047b3}.text-yellow-400{--tw-text-opacity: 1;color:rgb(250 204 21 / var(--tw-text-opacity, 1))}.text-yellow-600{--tw-text-opacity: 1;color:rgb(202 138 4 / var(--tw-text-opacity, 1))}.text-yellow-700{--tw-text-opacity: 1;color:rgb(161 98 7 / var(--tw-text-opacity, 
1))}.text-yellow-800{--tw-text-opacity: 1;color:rgb(133 77 14 / var(--tw-text-opacity, 1))}.underline{text-decoration-line:underline}.no-underline{text-decoration-line:none}.opacity-0{opacity:0}.opacity-100{opacity:1}.opacity-60{opacity:.6}.opacity-90{opacity:.9}.shadow{--tw-shadow: 0 1px 3px 0 rgb(0 0 0 / .1), 0 1px 2px -1px rgb(0 0 0 / .1);--tw-shadow-colored: 0 1px 3px 0 var(--tw-shadow-color), 0 1px 2px -1px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.shadow-2xl{--tw-shadow: 0 25px 50px -12px rgb(0 0 0 / .25);--tw-shadow-colored: 0 25px 50px -12px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.shadow-lg{--tw-shadow: 0 10px 15px -3px rgb(0 0 0 / .1), 0 4px 6px -4px rgb(0 0 0 / .1);--tw-shadow-colored: 0 10px 15px -3px var(--tw-shadow-color), 0 4px 6px -4px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.shadow-md{--tw-shadow: 0 4px 6px -1px rgb(0 0 0 / .1), 0 2px 4px -2px rgb(0 0 0 / .1);--tw-shadow-colored: 0 4px 6px -1px var(--tw-shadow-color), 0 2px 4px -2px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.shadow-sm{--tw-shadow: 0 1px 2px 0 rgb(0 0 0 / .05);--tw-shadow-colored: 0 1px 2px 0 var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.shadow-xl{--tw-shadow: 0 20px 25px -5px rgb(0 0 0 / .1), 0 8px 10px -6px rgb(0 0 0 / .1);--tw-shadow-colored: 0 20px 25px -5px var(--tw-shadow-color), 0 8px 10px -6px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.outline{outline-style:solid}.filter{filter:var(--tw-blur) var(--tw-brightness) var(--tw-contrast) var(--tw-grayscale) var(--tw-hue-rotate) var(--tw-invert) 
var(--tw-saturate) var(--tw-sepia) var(--tw-drop-shadow)}.backdrop-blur-sm{--tw-backdrop-blur: blur(4px);-webkit-backdrop-filter:var(--tw-backdrop-blur) var(--tw-backdrop-brightness) var(--tw-backdrop-contrast) var(--tw-backdrop-grayscale) var(--tw-backdrop-hue-rotate) var(--tw-backdrop-invert) var(--tw-backdrop-opacity) var(--tw-backdrop-saturate) var(--tw-backdrop-sepia);backdrop-filter:var(--tw-backdrop-blur) var(--tw-backdrop-brightness) var(--tw-backdrop-contrast) var(--tw-backdrop-grayscale) var(--tw-backdrop-hue-rotate) var(--tw-backdrop-invert) var(--tw-backdrop-opacity) var(--tw-backdrop-saturate) var(--tw-backdrop-sepia)}.transition{transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.transition-all{transition-property:all;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.transition-colors{transition-property:color,background-color,border-color,text-decoration-color,fill,stroke;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.transition-opacity{transition-property:opacity;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.transition-shadow{transition-property:box-shadow;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.transition-transform{transition-property:transform;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.duration-100{transition-duration:.1s}.duration-200{transition-duration:.2s}.duration-300{transition-duration:.3s}.duration-75{transition-duration:75ms}.ease-in{transition-timing-function:cubic-bezier(.4,0,1,1)}.ease-in-out{transition-timing-function:cubic-bezier(.4,0,.2,1)}.ease-out{transition-timing-function:cubic-bezier(0,0,.2,1)}:root{font-family:Inter,system-ui,Avenir,Helvetica,Arial,sans-serif;line-height:1.5;font-weight:400;color-scheme:ligh
t dark;color:#ffffffde;background-color:#f1f5f9}html{scroll-behavior:smooth}body{margin:0;min-height:100vh}@keyframes slideUp{0%{opacity:0;transform:translateY(30px)}to{opacity:1;transform:translateY(0)}}.last\:rounded-b-lg:last-child{border-bottom-right-radius:.5rem;border-bottom-left-radius:.5rem}.last\:border-b-0:last-child{border-bottom-width:0px}.hover\:-translate-y-1:hover{--tw-translate-y: -.25rem;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.hover\:scale-105:hover{--tw-scale-x: 1.05;--tw-scale-y: 1.05;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.hover\:border-2:hover{border-width:2px}.hover\:border-\[\#354F52\]:hover{--tw-border-opacity: 1;border-color:rgb(53 79 82 / var(--tw-border-opacity, 1))}.hover\:border-amber-500:hover{--tw-border-opacity: 1;border-color:rgb(245 158 11 / var(--tw-border-opacity, 1))}.hover\:border-blue-500:hover{--tw-border-opacity: 1;border-color:rgb(59 130 246 / var(--tw-border-opacity, 1))}.hover\:border-gray-200:hover{--tw-border-opacity: 1;border-color:rgb(229 231 235 / var(--tw-border-opacity, 1))}.hover\:border-gray-300:hover{--tw-border-opacity: 1;border-color:rgb(209 213 219 / var(--tw-border-opacity, 1))}.hover\:border-gray-400:hover{--tw-border-opacity: 1;border-color:rgb(156 163 175 / var(--tw-border-opacity, 1))}.hover\:border-green-500:hover{--tw-border-opacity: 1;border-color:rgb(34 197 94 / var(--tw-border-opacity, 1))}.hover\:border-primary-500:hover{--tw-border-opacity: 1;border-color:rgb(53 79 82 / var(--tw-border-opacity, 1))}.hover\:border-purple-500:hover{--tw-border-opacity: 1;border-color:rgb(168 85 247 / var(--tw-border-opacity, 1))}.hover\:bg-\[\#d9e5db\]:hover{--tw-bg-opacity: 1;background-color:rgb(217 229 219 / 
var(--tw-bg-opacity, 1))}.hover\:bg-amber-200:hover{--tw-bg-opacity: 1;background-color:rgb(253 230 138 / var(--tw-bg-opacity, 1))}.hover\:bg-blue-200:hover{--tw-bg-opacity: 1;background-color:rgb(191 219 254 / var(--tw-bg-opacity, 1))}.hover\:bg-blue-700:hover{--tw-bg-opacity: 1;background-color:rgb(29 78 216 / var(--tw-bg-opacity, 1))}.hover\:bg-gray-100:hover{--tw-bg-opacity: 1;background-color:rgb(243 244 246 / var(--tw-bg-opacity, 1))}.hover\:bg-gray-200:hover{--tw-bg-opacity: 1;background-color:rgb(229 231 235 / var(--tw-bg-opacity, 1))}.hover\:bg-gray-300:hover{--tw-bg-opacity: 1;background-color:rgb(209 213 219 / var(--tw-bg-opacity, 1))}.hover\:bg-gray-50:hover{--tw-bg-opacity: 1;background-color:rgb(249 250 251 / var(--tw-bg-opacity, 1))}.hover\:bg-gray-700\/50:hover{background-color:#37415180}.hover\:bg-green-200:hover{--tw-bg-opacity: 1;background-color:rgb(187 247 208 / var(--tw-bg-opacity, 1))}.hover\:bg-green-700:hover{--tw-bg-opacity: 1;background-color:rgb(21 128 61 / var(--tw-bg-opacity, 1))}.hover\:bg-neutral-700:hover{--tw-bg-opacity: 1;background-color:rgb(46 67 70 / var(--tw-bg-opacity, 1))}.hover\:bg-primary-100:hover{--tw-bg-opacity: 1;background-color:rgb(197 202 206 / var(--tw-bg-opacity, 1))}.hover\:bg-primary-50:hover{--tw-bg-opacity: 1;background-color:rgb(232 234 235 / var(--tw-bg-opacity, 1))}.hover\:bg-primary-700:hover{--tw-bg-opacity: 1;background-color:rgb(39 56 58 / var(--tw-bg-opacity, 1))}.hover\:bg-purple-200:hover{--tw-bg-opacity: 1;background-color:rgb(233 213 255 / var(--tw-bg-opacity, 1))}.hover\:bg-red-100:hover{--tw-bg-opacity: 1;background-color:rgb(254 226 226 / var(--tw-bg-opacity, 1))}.hover\:bg-red-700:hover{--tw-bg-opacity: 1;background-color:rgb(185 28 28 / var(--tw-bg-opacity, 1))}.hover\:bg-teal-200:hover{--tw-bg-opacity: 1;background-color:rgb(153 246 228 / var(--tw-bg-opacity, 1))}.hover\:bg-white\/20:hover{background-color:#fff3}.hover\:text-\[\#354F52\]:hover{--tw-text-opacity: 1;color:rgb(53 79 82 / 
var(--tw-text-opacity, 1))}.hover\:text-amber-900:hover{--tw-text-opacity: 1;color:rgb(120 53 15 / var(--tw-text-opacity, 1))}.hover\:text-blue-600:hover{--tw-text-opacity: 1;color:rgb(37 99 235 / var(--tw-text-opacity, 1))}.hover\:text-blue-700:hover{--tw-text-opacity: 1;color:rgb(29 78 216 / var(--tw-text-opacity, 1))}.hover\:text-blue-800:hover{--tw-text-opacity: 1;color:rgb(30 64 175 / var(--tw-text-opacity, 1))}.hover\:text-gray-500:hover{--tw-text-opacity: 1;color:rgb(107 114 128 / var(--tw-text-opacity, 1))}.hover\:text-gray-600:hover{--tw-text-opacity: 1;color:rgb(75 85 99 / var(--tw-text-opacity, 1))}.hover\:text-gray-700:hover{--tw-text-opacity: 1;color:rgb(55 65 81 / var(--tw-text-opacity, 1))}.hover\:text-gray-900:hover{--tw-text-opacity: 1;color:rgb(17 24 39 / var(--tw-text-opacity, 1))}.hover\:text-primary-100:hover{--tw-text-opacity: 1;color:rgb(197 202 206 / var(--tw-text-opacity, 1))}.hover\:text-primary-600:hover{--tw-text-opacity: 1;color:rgb(46 67 70 / var(--tw-text-opacity, 1))}.hover\:text-primary-700:hover{--tw-text-opacity: 1;color:rgb(39 56 58 / var(--tw-text-opacity, 1))}.hover\:text-red-700:hover{--tw-text-opacity: 1;color:rgb(185 28 28 / var(--tw-text-opacity, 1))}.hover\:text-white:hover{--tw-text-opacity: 1;color:rgb(255 255 255 / var(--tw-text-opacity, 1))}.hover\:underline:hover{text-decoration-line:underline}.hover\:decoration-2:hover{text-decoration-thickness:2px}.hover\:opacity-75:hover{opacity:.75}.hover\:opacity-90:hover{opacity:.9}.hover\:shadow-2xl:hover{--tw-shadow: 0 25px 50px -12px rgb(0 0 0 / .25);--tw-shadow-colored: 0 25px 50px -12px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.hover\:shadow-lg:hover{--tw-shadow: 0 10px 15px -3px rgb(0 0 0 / .1), 0 4px 6px -4px rgb(0 0 0 / .1);--tw-shadow-colored: 0 10px 15px -3px var(--tw-shadow-color), 0 4px 6px -4px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 
#0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.hover\:shadow-md:hover{--tw-shadow: 0 4px 6px -1px rgb(0 0 0 / .1), 0 2px 4px -2px rgb(0 0 0 / .1);--tw-shadow-colored: 0 4px 6px -1px var(--tw-shadow-color), 0 2px 4px -2px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.hover\:shadow-xl:hover{--tw-shadow: 0 20px 25px -5px rgb(0 0 0 / .1), 0 8px 10px -6px rgb(0 0 0 / .1);--tw-shadow-colored: 0 20px 25px -5px var(--tw-shadow-color), 0 8px 10px -6px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.focus\:border-blue-500:focus{--tw-border-opacity: 1;border-color:rgb(59 130 246 / var(--tw-border-opacity, 1))}.focus\:border-primary-500:focus{--tw-border-opacity: 1;border-color:rgb(53 79 82 / var(--tw-border-opacity, 1))}.focus\:border-transparent:focus{border-color:transparent}.focus\:outline-none:focus{outline:2px solid transparent;outline-offset:2px}.focus\:ring-2:focus{--tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);--tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(2px + var(--tw-ring-offset-width)) var(--tw-ring-color);box-shadow:var(--tw-ring-offset-shadow),var(--tw-ring-shadow),var(--tw-shadow, 0 0 #0000)}.focus\:ring-\[\#354F52\]:focus{--tw-ring-opacity: 1;--tw-ring-color: rgb(53 79 82 / var(--tw-ring-opacity, 1))}.focus\:ring-blue-500:focus{--tw-ring-opacity: 1;--tw-ring-color: rgb(59 130 246 / var(--tw-ring-opacity, 1))}.focus\:ring-primary-500:focus{--tw-ring-opacity: 1;--tw-ring-color: rgb(53 79 82 / var(--tw-ring-opacity, 1))}.focus\:ring-offset-2:focus{--tw-ring-offset-width: 2px}.disabled\:cursor-not-allowed:disabled{cursor:not-allowed}.disabled\:bg-gray-400:disabled{--tw-bg-opacity: 1;background-color:rgb(156 163 175 / var(--tw-bg-opacity, 1))}.disabled\:opacity-50:disabled{opacity:.5}.group:hover 
.group-hover\:translate-x-1{--tw-translate-x: .25rem;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.group:hover .group-hover\:scale-110{--tw-scale-x: 1.1;--tw-scale-y: 1.1;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.group:hover .group-hover\:scale-x-100{--tw-scale-x: 1;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.group:hover .group-hover\:bg-amber-200{--tw-bg-opacity: 1;background-color:rgb(253 230 138 / var(--tw-bg-opacity, 1))}.group:hover .group-hover\:bg-blue-200{--tw-bg-opacity: 1;background-color:rgb(191 219 254 / var(--tw-bg-opacity, 1))}.group:hover .group-hover\:bg-green-200{--tw-bg-opacity: 1;background-color:rgb(187 247 208 / var(--tw-bg-opacity, 1))}.group:hover .group-hover\:bg-purple-200{--tw-bg-opacity: 1;background-color:rgb(233 213 255 / var(--tw-bg-opacity, 1))}.group:hover .group-hover\:text-\[\#354F52\]{--tw-text-opacity: 1;color:rgb(53 79 82 / var(--tw-text-opacity, 1))}.group:hover .group-hover\:text-amber-600{--tw-text-opacity: 1;color:rgb(217 119 6 / var(--tw-text-opacity, 1))}.group:hover .group-hover\:text-blue-200{--tw-text-opacity: 1;color:rgb(191 219 254 / var(--tw-text-opacity, 1))}.group:hover .group-hover\:text-blue-600{--tw-text-opacity: 1;color:rgb(37 99 235 / var(--tw-text-opacity, 1))}.group:hover .group-hover\:text-gray-600{--tw-text-opacity: 1;color:rgb(75 85 99 / var(--tw-text-opacity, 1))}.group:hover .group-hover\:text-green-600{--tw-text-opacity: 1;color:rgb(22 163 74 / var(--tw-text-opacity, 1))}.group:hover .group-hover\:text-primary-600{--tw-text-opacity: 1;color:rgb(46 67 70 / 
var(--tw-text-opacity, 1))}.group:hover .group-hover\:text-purple-600{--tw-text-opacity: 1;color:rgb(147 51 234 / var(--tw-text-opacity, 1))}.group:hover .group-hover\:opacity-100{opacity:1}@media (min-width: 640px){.sm\:w-auto{width:auto}.sm\:flex-row{flex-direction:row}.sm\:px-6{padding-left:1.5rem;padding-right:1.5rem}}@media (min-width: 768px){.md\:block{display:block}.md\:inline{display:inline}.md\:flex{display:flex}.md\:hidden{display:none}.md\:h-12{height:3rem}.md\:translate-x-0{--tw-translate-x: 0px;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.md\:grid-cols-2{grid-template-columns:repeat(2,minmax(0,1fr))}.md\:grid-cols-3{grid-template-columns:repeat(3,minmax(0,1fr))}.md\:grid-cols-4{grid-template-columns:repeat(4,minmax(0,1fr))}.md\:gap-2{gap:.5rem}.md\:gap-3{gap:.75rem}.md\:gap-4{gap:1rem}.md\:px-4{padding-left:1rem;padding-right:1rem}.md\:px-6{padding-left:1.5rem;padding-right:1.5rem}.md\:pl-64{padding-left:16rem}.md\:text-2xl{font-size:1.5rem;line-height:2rem}.md\:text-5xl{font-size:3rem;line-height:1}.md\:text-7xl{font-size:4.5rem;line-height:1}.md\:text-base{font-size:1rem;line-height:1.5rem}}@media (min-width: 1024px){.lg\:col-span-2{grid-column:span 2 / span 2}.lg\:col-span-3{grid-column:span 3 / span 3}.lg\:col-span-7{grid-column:span 7 / span 7}.lg\:flex{display:flex}.lg\:grid-cols-12{grid-template-columns:repeat(12,minmax(0,1fr))}.lg\:grid-cols-2{grid-template-columns:repeat(2,minmax(0,1fr))}.lg\:grid-cols-3{grid-template-columns:repeat(3,minmax(0,1fr))}.lg\:grid-cols-4{grid-template-columns:repeat(4,minmax(0,1fr))}.lg\:grid-cols-5{grid-template-columns:repeat(5,minmax(0,1fr))}.lg\:px-8{padding-left:2rem;padding-right:2rem}}@media (min-width: 1280px){.xl\:grid-cols-5{grid-template-columns:repeat(5,minmax(0,1fr))}}
|
api/static/assets/index-DoIJncqg.js
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
api/static/communityone_logo.jpg
ADDED
|
api/static/communityone_logo.svg
ADDED
|
|
api/static/communityone_logo_64.png
ADDED
|
api/static/favicon.ico
ADDED
|
|