-- ============================================================================ -- NEON DATABASE SCHEMA FOR OPEN NAVIGATOR -- Optimized for fast API queries on HuggingFace Spaces deployment -- ============================================================================ -- Drop existing tables if rerunning (careful in production!) DROP TABLE IF EXISTS stats_aggregates CASCADE; DROP TABLE IF EXISTS nonprofits_search CASCADE; DROP TABLE IF EXISTS jurisdictions_search CASCADE; DROP TABLE IF EXISTS contacts_search CASCADE; DROP TABLE IF EXISTS events_search CASCADE; DROP TABLE IF EXISTS reference_causes CASCADE; DROP TABLE IF EXISTS reference_ntee_codes CASCADE; DROP TABLE IF EXISTS last_sync CASCADE; -- ============================================================================ -- AGGREGATE STATISTICS TABLES -- Pre-computed stats for fast dashboard loading -- ============================================================================ CREATE TABLE stats_aggregates ( id SERIAL PRIMARY KEY, level VARCHAR(20) NOT NULL, -- 'national', 'state', 'county', 'city' state VARCHAR(2), -- Two-letter state code (e.g., 'MA') county VARCHAR(100), -- County name city VARCHAR(100), -- City name -- Core metrics jurisdictions_count INTEGER DEFAULT 0, school_districts_count INTEGER DEFAULT 0, nonprofits_count INTEGER DEFAULT 0, events_count INTEGER DEFAULT 0, bills_count INTEGER DEFAULT 0, contacts_count INTEGER DEFAULT 0, -- Financial aggregates (from nonprofit data) total_revenue BIGINT DEFAULT 0, total_assets BIGINT DEFAULT 0, -- Metadata last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP, -- Unique constraint on geographic level UNIQUE (level, state, county, city) ); -- Indexes for fast lookups by geography CREATE INDEX idx_stats_level ON stats_aggregates(level); CREATE INDEX idx_stats_state ON stats_aggregates(state) WHERE state IS NOT NULL; CREATE INDEX idx_stats_state_county ON stats_aggregates(state, county) WHERE county IS NOT NULL; CREATE INDEX idx_stats_state_city ON stats_aggregates(state, city) WHERE city IS NOT NULL; -- ============================================================================ -- SEARCH-OPTIMIZED TABLES -- Denormalized for fast full-text search -- ============================================================================ -- Nonprofits search table (most frequently searched) CREATE TABLE nonprofits_search ( ein VARCHAR(20) PRIMARY KEY, name TEXT NOT NULL, street_address TEXT, city VARCHAR(100), state VARCHAR(2), zip_code VARCHAR(10), county VARCHAR(100), -- Classification ntee_code VARCHAR(10), ntee_description TEXT, subsection_code VARCHAR(10), affiliation_code VARCHAR(10), classification_code VARCHAR(20), -- Financial (most recent year) revenue BIGINT, assets BIGINT, income BIGINT, -- Status ruling_date DATE, foundation_code VARCHAR(10), pf_filing_requirement_code VARCHAR(10), accounting_period VARCHAR(10), asset_code VARCHAR(10), income_code VARCHAR(10), filing_requirement_code VARCHAR(10), exempt_organization_status_code VARCHAR(10), tax_period VARCHAR(10), asset_amount BIGINT, income_amount BIGINT, form_990_revenue_amount BIGINT, -- Metadata source VARCHAR(50) DEFAULT 'irs_bmf', last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP ); -- Full-text search indexes CREATE INDEX idx_nonprofits_name_search ON nonprofits_search USING GIN (to_tsvector('english', name)); CREATE INDEX idx_nonprofits_state ON nonprofits_search(state); CREATE INDEX idx_nonprofits_city_state ON nonprofits_search(city, state); CREATE INDEX idx_nonprofits_county ON nonprofits_search(county); CREATE INDEX idx_nonprofits_ntee ON nonprofits_search(ntee_code); CREATE INDEX idx_nonprofits_zip ON nonprofits_search(zip_code); -- Jurisdictions search table (cities, counties, townships) CREATE TABLE jurisdictions_search ( id SERIAL PRIMARY KEY, name TEXT NOT NULL, type VARCHAR(50) NOT NULL, -- 'city', 'county', 'township', 'school_district' state VARCHAR(2) NOT NULL, county VARCHAR(100), -- Geographic identifiers geoid VARCHAR(20), fips_code VARCHAR(20), -- Population/size population INTEGER, area_sq_miles DECIMAL(12, 2), -- Metadata source VARCHAR(50) DEFAULT 'census', last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP, -- Unique constraint UNIQUE (name, type, state, county) ); CREATE INDEX idx_jurisdictions_name_search ON jurisdictions_search USING GIN (to_tsvector('english', name)); CREATE INDEX idx_jurisdictions_state ON jurisdictions_search(state); CREATE INDEX idx_jurisdictions_type ON jurisdictions_search(type); CREATE INDEX idx_jurisdictions_state_type ON jurisdictions_search(state, type); -- Contacts search table (nonprofit officers, legislators, etc.) CREATE TABLE contacts_search ( id SERIAL PRIMARY KEY, name TEXT NOT NULL, title VARCHAR(200), organization_name TEXT, organization_ein VARCHAR(20), -- Contact info email VARCHAR(255), phone VARCHAR(50), -- Address street_address TEXT, city VARCHAR(100), state VARCHAR(2), zip_code VARCHAR(10), -- Role/classification role_type VARCHAR(50), -- 'officer', 'legislator', 'board_member', etc. compensation BIGINT, hours_per_week DECIMAL(5, 1), -- Metadata source VARCHAR(50) DEFAULT 'irs_990', tax_year INTEGER, last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP ); CREATE INDEX idx_contacts_name_search ON contacts_search USING GIN (to_tsvector('english', name)); CREATE INDEX idx_contacts_org_name_search ON contacts_search USING GIN (to_tsvector('english', organization_name)); CREATE INDEX idx_contacts_state ON contacts_search(state); CREATE INDEX idx_contacts_ein ON contacts_search(organization_ein); CREATE INDEX idx_contacts_role ON contacts_search(role_type); -- Events search table (meetings, hearings, events) CREATE TABLE events_search ( id SERIAL PRIMARY KEY, title TEXT NOT NULL, description TEXT, event_date DATE, event_time TIME, -- Organization jurisdiction_name VARCHAR(200), jurisdiction_type VARCHAR(50), state VARCHAR(2), city VARCHAR(100), -- Meeting details location TEXT, meeting_type VARCHAR(100), status VARCHAR(50), -- Documents/links agenda_url TEXT, minutes_url TEXT, video_url TEXT, -- Metadata source VARCHAR(50) DEFAULT 'legistar', last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP ); CREATE INDEX idx_events_title_search ON events_search USING GIN (to_tsvector('english', title)); CREATE INDEX idx_events_date ON events_search(event_date DESC); CREATE INDEX idx_events_state ON events_search(state); CREATE INDEX idx_events_jurisdiction ON events_search(jurisdiction_name); CREATE INDEX idx_events_date_state ON events_search(event_date, state); -- ============================================================================ -- REFERENCE DATA TABLES -- Lookup tables for causes, NTEE codes, etc. -- ============================================================================ CREATE TABLE reference_causes ( id SERIAL PRIMARY KEY, cause_slug VARCHAR(100) UNIQUE NOT NULL, cause_name TEXT NOT NULL, description TEXT, parent_category VARCHAR(100), -- Metadata source VARCHAR(50) DEFAULT 'everyorg', last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP ); CREATE INDEX idx_causes_slug ON reference_causes(cause_slug); CREATE INDEX idx_causes_name_search ON reference_causes USING GIN (to_tsvector('english', cause_name)); CREATE TABLE reference_ntee_codes ( code VARCHAR(10) PRIMARY KEY, description TEXT NOT NULL, category VARCHAR(50), subcategory VARCHAR(100), -- Metadata source VARCHAR(50) DEFAULT 'irs', last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP ); CREATE INDEX idx_ntee_description_search ON reference_ntee_codes USING GIN (to_tsvector('english', description)); -- ============================================================================ -- SYNC TRACKING TABLE -- Track when data was last synced from parquet files -- ============================================================================ CREATE TABLE last_sync ( table_name VARCHAR(100) PRIMARY KEY, last_sync_time TIMESTAMP NOT NULL, records_synced INTEGER DEFAULT 0, sync_status VARCHAR(50) DEFAULT 'success', error_message TEXT ); -- ============================================================================ -- HELPER VIEWS FOR COMMON QUERIES -- ============================================================================ -- Active nonprofits with basic info CREATE VIEW nonprofits_active AS SELECT ein, name, city, state, ntee_code, ntee_description, revenue, assets FROM nonprofits_search WHERE exempt_organization_status_code IS NULL OR exempt_organization_status_code NOT IN ('T', 'X') -- Exclude terminated ORDER BY revenue DESC NULLS LAST; -- Recent events (last 30 days + upcoming) CREATE VIEW events_recent AS SELECT id, title, description, event_date, jurisdiction_name, state, city, status, agenda_url FROM events_search WHERE event_date >= CURRENT_DATE - INTERVAL '30 days' ORDER BY event_date DESC; -- Top nonprofits by state CREATE VIEW nonprofits_top_by_state AS SELECT state, ein, name, city, revenue, ROW_NUMBER() OVER (PARTITION BY state ORDER BY revenue DESC NULLS LAST) as rank FROM nonprofits_search WHERE revenue IS NOT NULL ORDER BY state, rank; -- ============================================================================ -- GRANT PERMISSIONS (adjust as needed for your user) -- ============================================================================ -- Grant SELECT to your application user if needed -- GRANT SELECT ON ALL TABLES IN SCHEMA public TO your_app_user; -- GRANT SELECT ON ALL SEQUENCES IN SCHEMA public TO your_app_user;