Spaces:

CommunityOne
/

open-navigator

Running on CPU Upgrade

App Files Files Community

open-navigator / api /routes /search.py

jcbowyer

Clean HuggingFace deployment without binary files

61d29fc 28 days ago

raw

history blame contribute delete

74.6 kB

	"""
	Unified Search API
	LinkedIn-style search across contacts, meetings, organizations, and causes
	Uses hybrid approach: PostgreSQL (primary, fast) + HuggingFace Search API + DuckDB (fallback)
	"""
	from fastapi import APIRouter, Query, HTTPException
	from fastapi.responses import JSONResponse
	from typing import Optional, List, Dict, Any
	from pathlib import Path
	import pandas as pd
	import duckdb
	from loguru import logger
	import re
	import os
	import sys
	import requests
	from functools import lru_cache
	from datetime import datetime, timedelta

	from api.errors import ErrorDetail, parse_error

	# Import PostgreSQL search functions (primary)
	from api.routes import search_postgres

	# Import HuggingFace Search helpers
	from api.routes.hf_search import (
	search_contacts_hf,
	search_organizations_hf,
	search_jurisdictions_hf,
	is_dataset_indexed
	)

	# Router object for this module; everything registered on it is tagged "search".
	router = APIRouter(tags=["search"])

	# Paths to gold datasets (relative, so it resolves against the process working directory)
	GOLD_DIR = Path("data/gold")

	# Detect deployment environment
	# IS_HF_SPACES is True only when the HF_SPACES environment variable is exactly "1".
	IS_HF_SPACES = os.getenv("HF_SPACES") == "1"
	# Organization segment used when building HuggingFace dataset URLs.
	HF_ORGANIZATION = os.getenv('HF_ORGANIZATION', 'CommunityOne')

	# Cache for count queries (TTL: 1 hour)
	# _count_cache maps a cache key to the count; _count_cache_ttl maps the same key
	# to the time the count was stored.
	_count_cache: Dict[str, int] = {}
	_count_cache_ttl: Dict[str, datetime] = {}

	# In-memory DataFrame cache for HuggingFace datasets (TTL: 5 minutes)
	# Reduces remote HTTP requests from 2-3s to <10ms per search
	# Both dicts are keyed by the path/URL handed to load_parquet_cached().
	_dataframe_cache: Dict[str, pd.DataFrame] = {}
	_dataframe_cache_ttl: Dict[str, datetime] = {}
	DATAFRAME_CACHE_TTL = timedelta(minutes=5)

	# Every.org API config (fallback only)
	# An empty key (the default) makes fetch_everyorg_data() return None without a request.
	EVERYORG_API_KEY = os.getenv('EVERYORG_API_KEY', '')
	EVERYORG_API_BASE = "https://partners.every.org/v0.2"


	def load_parquet_cached(url: str) -> pd.DataFrame:
	    """
	    Return the parquet data at ``url``, reusing an in-memory copy while it is fresh.

	    A cached DataFrame is served as long as it is younger than
	    DATAFRAME_CACHE_TTL; otherwise the file is read again with pandas and the
	    cache entry is overwritten. The cached object itself is returned (not a
	    copy), so callers share one DataFrame per URL.

	    Args:
	        url: Local path or URL of the parquet file; also used as the cache key.

	    Returns:
	        pandas DataFrame
	    """
	    now = datetime.now()

	    # Only trust the timestamp when the DataFrame itself is still in the cache
	    stored_at = _dataframe_cache_ttl.get(url) if url in _dataframe_cache else None
	    if stored_at and (now - stored_at) < DATAFRAME_CACHE_TTL:
	        logger.debug(f"🚀 Cache hit for {url}")
	        return _dataframe_cache[url]

	    # Missing or expired entry: read from the source and refresh the cache
	    logger.info(f"📥 Loading parquet from {url}")
	    frame = pd.read_parquet(url)

	    _dataframe_cache_ttl[url] = now
	    _dataframe_cache[url] = frame
	    logger.debug(f"💾 Cached {len(frame)} rows from {url}")

	    return frame


	def get_hf_dataset_url(dataset_name: str) -> str:
	    """
	    Build the HuggingFace download URL for a dataset's parquet file.

	    The URL points at data/train-00000-of-00001.parquet on the ``main``
	    revision of the dataset under HF_ORGANIZATION, i.e. it assumes the dataset
	    is published as a single "train" shard.

	    Example (with the default organization):
	        states-ma-contacts-local-officials ->
	        https://huggingface.co/datasets/CommunityOne/states-ma-contacts-local-officials/resolve/main/data/train-00000-of-00001.parquet
	    """
	    repo_url = f"https://huggingface.co/datasets/{HF_ORGANIZATION}/{dataset_name}"
	    return f"{repo_url}/resolve/main/data/train-00000-of-00001.parquet"


	def get_data_source(file_path: Path, use_remote: bool = False) -> str:
	    """
	    Resolve a gold-dataset path to either the local file or its HuggingFace URL.

	    The local path is returned unless running on HF Spaces or ``use_remote`` is
	    set. For remote resolution the dataset name is derived from the path:
	    data/gold/states/MA/contacts_local_officials.parquet
	        -> states-ma-contacts-local-officials
	    Paths under ``national`` or ``reference`` get that word as prefix instead.
	    Paths matching none of these fall back to the local path.

	    Args:
	        file_path: Local file path (e.g., data/gold/states/MA/contacts_local_officials.parquet)
	        use_remote: Force remote URL even in local environment

	    Returns:
	        File path string (local) or HuggingFace URL (remote)
	    """
	    if not (IS_HF_SPACES or use_remote):
	        return str(file_path)

	    segments = file_path.parts

	    # Pick the dataset-name prefix from the directory the file lives under
	    if 'states' in segments:
	        state_code = segments[segments.index('states') + 1].lower()
	        prefix = f"states-{state_code}"
	    elif 'national' in segments:
	        prefix = "national"
	    elif 'reference' in segments:
	        prefix = "reference"
	    else:
	        # Fallback to local
	        return str(file_path)

	    # File name without extension, underscores turned into dashes
	    slug = segments[-1].replace('.parquet', '').replace('_', '-')
	    return get_hf_dataset_url(f"{prefix}-{slug}")


	@lru_cache(maxsize=5000)
	def fetch_form990_data(ein: str) -> Optional[Dict[str, Any]]:
	    """
	    Look up an organization in the ProPublica Nonprofit Explorer API.

	    Only the tax year of the first entry in ``filings_with_data`` is extracted
	    (presumably the most recent filing - confirm against the API docs).
	    ``website`` and ``mission`` are always None: this function does not read
	    them from the response, the keys exist only so the result has the same
	    shape as other enrichment sources.

	    Results, including None for failed lookups, are memoized per ``ein`` by
	    lru_cache, so a failed lookup is not retried until its entry is evicted.

	    Args:
	        ein: Employer Identification Number, with or without dashes.

	    Returns:
	        Dict with 'website', 'mission', 'source', 'last_updated' and
	        'tax_year', or None when ``ein`` is empty, the response status is not
	        200, or the request/parsing fails.
	    """
	    if not ein:
	        return None

	    try:
	        # Normalize to the 9-digit, dash-free form used in the URL
	        clean_ein = str(ein).replace('-', '').zfill(9)
	        url = f"https://projects.propublica.org/nonprofits/api/v2/organizations/{clean_ein}.json"

	        response = requests.get(url, timeout=3)

	        if response.status_code == 200:
	            filings = response.json().get('filings_with_data', [])

	            return {
	                'website': None,  # not read from the ProPublica response
	                'mission': None,  # Would need to parse PDF
	                'source': 'propublica',
	                'last_updated': datetime.now().isoformat(),
	                'tax_year': filings[0].get('tax_prd_yr') if filings else None
	            }
	    except Exception as e:
	        # Best-effort enrichment: any failure is logged and reported as "no data"
	        logger.debug(f"ProPublica lookup failed for EIN {ein}: {e}")

	    return None


	@lru_cache(maxsize=5000)
	def fetch_everyorg_data(ein: str) -> Optional[Dict[str, Any]]:
	    """
	    Fetch enrichment data for one nonprofit from the Every.org API - FALLBACK ONLY.

	    Returns None without a request when no API key is configured or ``ein`` is
	    empty, and also when the status is not 200, the payload lacks
	    data.nonprofit, or anything raises. Results (including None) are memoized
	    per ``ein`` by lru_cache.
	    """
	    if not (EVERYORG_API_KEY and ein):
	        return None

	    try:
	        # Format EIN (remove dashes, ensure 9 digits)
	        normalized_ein = str(ein).replace('-', '').zfill(9)

	        response = requests.get(
	            f"{EVERYORG_API_BASE}/nonprofit/{normalized_ein}",
	            headers={
	                "Authorization": f"Bearer {EVERYORG_API_KEY}",
	                "Accept": "application/json"
	            },
	            timeout=3,
	        )
	        if response.status_code != 200:
	            return None

	        payload = response.json()
	        if not (payload and 'data' in payload and 'nonprofit' in payload['data']):
	            return None

	        body = payload['data']
	        profile = body['nonprofit']
	        cause_names = [
	            tag.get('tagName')
	            for tag in body.get('nonprofitTags', [])
	            if tag.get('tagName')
	        ]

	        return {
	            'mission': profile.get('description') or profile.get('descriptionLong'),
	            'website': profile.get('websiteUrl'),
	            'logo_url': profile.get('logoUrl'),
	            'profile_url': profile.get('profileUrl'),
	            'causes': cause_names[:5],  # Limit to top 5 causes
	            'source': 'everyorg',
	            'last_updated': datetime.now().isoformat()
	        }
	    except Exception as e:
	        logger.debug(f"Every.org lookup failed for EIN {ein}: {e}")

	    return None


	def _enrichment_is_fresh(last_updated: Any, max_age: timedelta) -> bool:
	    """Return True when an enrichment timestamp is missing or newer than ``max_age``.

	    Accepts ISO-8601 strings and datetime objects, naive or timezone-aware.
	    A missing timestamp counts as fresh; values that cannot be parsed or
	    compared count as stale.
	    """
	    if not last_updated:
	        return True

	    if isinstance(last_updated, str):
	        try:
	            stamp = datetime.fromisoformat(last_updated)
	        except ValueError:
	            return False
	    elif isinstance(last_updated, datetime):
	        stamp = last_updated
	    else:
	        return False

	    try:
	        # Use "now" in the timestamp's own timezone so naive and aware values
	        # are each compared against a reference of the same kind.
	        return bool(stamp > datetime.now(stamp.tzinfo) - max_age)
	    except (TypeError, ValueError, OverflowError):
	        return False


	def get_enrichment_data(ein: str, existing_data: Optional[Dict] = None) -> Dict[str, Any]:
	    """
	    Get enrichment data, preferring values already present on the record.

	    Sources actually consulted, in order:
	    1. ``existing_data``: 'enrichment_website' is used when
	       'enrichment_last_updated' is missing or within the last 30 days;
	       'enrichment_mission' is used whenever present.
	    2. Every.org API, only when website or mission is still missing. In that
	       case logo, profile URL and causes are taken from Every.org as well.

	    Args:
	        ein: Employer Identification Number passed to the Every.org lookup.
	        existing_data: Optional record with enrichment_* keys.

	    Returns:
	        Dict with 'website', 'mission', 'logo_url', 'profile_url', 'causes'
	        and 'data_sources' (each contributing source listed once, in the
	        order it was used: 'cached', 'everyorg').
	    """
	    result = {
	        'website': None,
	        'mission': None,
	        'logo_url': None,
	        'profile_url': None,
	        'causes': [],
	        'data_sources': []
	    }

	    def add_source(name: str) -> None:
	        # Record each contributing source only once
	        if name not in result['data_sources']:
	            result['data_sources'].append(name)

	    # Check existing data first (website is skipped if older than 30 days)
	    if existing_data:
	        if existing_data.get('enrichment_website'):
	            last_updated = existing_data.get('enrichment_last_updated')
	            if _enrichment_is_fresh(last_updated, timedelta(days=30)):
	                result['website'] = existing_data['enrichment_website']
	                add_source('cached')

	        if existing_data.get('enrichment_mission'):
	            result['mission'] = existing_data['enrichment_mission']
	            add_source('cached')

	    # Try Every.org for missing fields (keeps logo and causes which 990 doesn't have)
	    if not result['website'] or not result['mission']:
	        everyorg_data = fetch_everyorg_data(ein)
	        if everyorg_data:
	            if not result['website'] and everyorg_data.get('website'):
	                result['website'] = everyorg_data['website']
	                add_source('everyorg')

	            if not result['mission'] and everyorg_data.get('mission'):
	                result['mission'] = everyorg_data['mission']
	                add_source('everyorg')

	            # Logo, profile URL and causes come from Every.org whenever it was consulted
	            result['logo_url'] = everyorg_data.get('logo_url')
	            result['profile_url'] = everyorg_data.get('profile_url')
	            result['causes'] = everyorg_data.get('causes', [])
	            if result['logo_url'] or result['causes']:
	                add_source('everyorg')

	    return result

	class SearchResult:
	    """One search hit in the common shape shared by every result type."""

	    def __init__(self,
	                 result_type: str,
	                 title: str,
	                 subtitle: str,
	                 description: str,
	                 url: str,
	                 score: float,
	                 metadata: Dict[str, Any]):
	        # Display fields
	        self.result_type, self.title, self.subtitle = result_type, title, subtitle
	        self.description, self.url = description, url
	        # Ranking value and type-specific extras
	        self.score, self.metadata = score, metadata

	    def to_dict(self):
	        """Return the result as a plain dict; ``result_type`` is exported under the key "type"."""
	        exported = (
	            ("type", self.result_type),
	            ("title", self.title),
	            ("subtitle", self.subtitle),
	            ("description", self.description),
	            ("url", self.url),
	            ("score", self.score),
	            ("metadata", self.metadata),
	        )
	        return dict(exported)


	def convert_pg_result(pg_result: search_postgres.SearchResult) -> 'SearchResult':
	    """Copy the fields of a PostgreSQL search result into this module's SearchResult."""
	    # Both classes use the same attribute names, so the fields map one-to-one
	    shared_fields = ("result_type", "title", "subtitle", "description", "url", "score", "metadata")
	    return SearchResult(**{field: getattr(pg_result, field) for field in shared_fields})


	def calculate_relevance_score(text: str, query: str) -> float:
	    """
	    Score how well ``text`` matches ``query`` (case-insensitive).

	    Returns 1.5 when the text starts with the whole query, 1.0 when it merely
	    contains it, otherwise the fraction of query words that occur inside some
	    word of the text. Empty text or query scores 0.0.
	    """
	    if not (text and query):
	        return 0.0

	    haystack, needle = text.lower(), query.lower()

	    # Whole-query substring match; a match at the very start ranks higher
	    if needle in haystack:
	        return 1.5 if haystack.startswith(needle) else 1.0

	    # Otherwise count the query words found inside any word of the text
	    terms = needle.split()
	    if not terms:
	        return 0.0

	    tokens = haystack.split()
	    hits = 0
	    for term in terms:
	        if any(term in token for token in tokens):
	            hits += 1
	    return hits / len(terms)


	def _contact_like_params(search_text: str, column_count: int, data_source: str) -> List[Any]:
	    """Build the positional parameters shared by the contact search queries.

	    The order matches the SQL templates: a (prefix, contains) pattern pair for
	    each scored column, the parquet source for read_parquet(?), then one
	    contains pattern per column in the WHERE clause.
	    """
	    prefix_pattern = f'{search_text}%'
	    contains_pattern = f'%{search_text}%'
	    return (
	        [prefix_pattern, contains_pattern] * column_count
	        + [data_source]
	        + [contains_pattern] * column_count
	    )


	def _search_state_officials(conn, results: List[SearchResult], search_text: str,
	                            is_browse_mode: bool, state: Optional[str], limit: int) -> None:
	    """Append matches from contacts_officials.parquet files (labelled OpenStates) to ``results``."""
	    if state:
	        file_paths = [GOLD_DIR / "states" / state / "contacts_officials.parquet"]
	    else:
	        file_paths = list(GOLD_DIR.glob("states/*/contacts_officials.parquet"))[:5]

	    logger.info(f"Searching {len(file_paths)} state official contact files (OpenStates) - browse_mode={is_browse_mode}")

	    # Browse mode: return all officials, prioritize mayors
	    browse_sql = """
	        SELECT
	            full_name as name,
	            role_type as title,
	            city_jurisdiction as jurisdiction,
	            state,
	            email,
	            phone,
	            CASE
	                WHEN LOWER(role_type) = 'mayor' THEN 2.0
	                WHEN LOWER(role_type) LIKE '%council%' THEN 1.8
	                WHEN LOWER(role_type) LIKE '%commission%' THEN 1.7
	                ELSE 1.5
	            END as score
	        FROM read_parquet(?)
	        ORDER BY score DESC, full_name ASC
	        LIMIT ?
	    """

	    # Search mode: best score over four columns (prefix match 1.5, contains 1.0)
	    search_sql = """
	        SELECT
	            full_name as name,
	            role_type as title,
	            city_jurisdiction as jurisdiction,
	            state,
	            email,
	            phone,
	            GREATEST(
	                CASE
	                    WHEN LOWER(full_name) LIKE LOWER(?) THEN 1.5
	                    WHEN LOWER(full_name) LIKE LOWER(?) THEN 1.0
	                    ELSE 0.0
	                END,
	                CASE
	                    WHEN LOWER(role_type) LIKE LOWER(?) THEN 1.5
	                    WHEN LOWER(role_type) LIKE LOWER(?) THEN 1.0
	                    ELSE 0.0
	                END,
	                CASE
	                    WHEN LOWER(city_jurisdiction) LIKE LOWER(?) THEN 1.5
	                    WHEN LOWER(city_jurisdiction) LIKE LOWER(?) THEN 1.0
	                    ELSE 0.0
	                END,
	                CASE
	                    WHEN LOWER(jurisdiction_name) LIKE LOWER(?) THEN 1.5
	                    WHEN LOWER(jurisdiction_name) LIKE LOWER(?) THEN 1.0
	                    ELSE 0.0
	                END
	            ) as score
	        FROM read_parquet(?)
	        WHERE LOWER(full_name) LIKE LOWER(?)
	            OR LOWER(role_type) LIKE LOWER(?)
	            OR LOWER(city_jurisdiction) LIKE LOWER(?)
	            OR LOWER(jurisdiction_name) LIKE LOWER(?)
	        ORDER BY score DESC
	        LIMIT ?
	    """

	    for file_path in file_paths:
	        # NOTE(review): this local existence check also skips the file when
	        # data_source would be a remote URL (HF Spaces); the other two contact
	        # searches have no such check - confirm which behavior is intended.
	        if not file_path.exists():
	            continue

	        # Get data source (local or remote URL)
	        data_source = get_data_source(file_path, use_remote=IS_HF_SPACES)

	        try:
	            if is_browse_mode:
	                rows = conn.execute(browse_sql, [data_source, limit]).fetchall()
	            else:
	                params = _contact_like_params(search_text, 4, data_source) + [limit]
	                rows = conn.execute(search_sql, params).fetchall()

	            for name, title, jurisdiction, state_code, email, phone, score in rows:
	                if score > 0.3:  # Relevance threshold
	                    contact_info = []
	                    if email:
	                        contact_info.append(f"📧 {email}")
	                    if phone:
	                        contact_info.append(f"📞 {phone}")

	                    description = f"State official in {jurisdiction}" if jurisdiction else f"State official in {state_code}"
	                    if contact_info:
	                        description += f" • {' • '.join(contact_info)}"

	                    results.append(SearchResult(
	                        result_type="contact",
	                        title=name if name else "Unknown",
	                        subtitle=f"{title.title() if title else 'Official'} - {jurisdiction or state_code}",
	                        description=description,
	                        url=f"/people/{name.replace(' ', '-') if name else 'unknown'}",
	                        score=score,
	                        metadata={
	                            "title": title,
	                            "jurisdiction": jurisdiction,
	                            "state": state_code,
	                            "name": name,
	                            "email": email,
	                            "phone": phone,
	                            "contact_type": "state_official",
	                            "data_source": "OpenStates"
	                        }
	                    ))

	        except Exception as e:
	            # One unreadable file must not abort the remaining files
	            logger.debug(f"Error searching state officials {file_path}: {e}")


	def _search_local_officials(conn, results: List[SearchResult], search_text: str,
	                            state: Optional[str], limit: int) -> None:
	    """Append matches from contacts_local_officials.parquet files to ``results``."""
	    if state:
	        file_paths = [GOLD_DIR / "states" / state / "contacts_local_officials.parquet"]
	    else:
	        file_paths = list(GOLD_DIR.glob("states/*/contacts_local_officials.parquet"))[:5]

	    logger.info(f"Searching {len(file_paths)} local official contact files (meeting transcripts)")

	    # Relevance scoring across name, title, jurisdiction
	    sql = """
	        SELECT
	            name,
	            title,
	            jurisdiction,
	            state,
	            GREATEST(
	                CASE
	                    WHEN LOWER(name) LIKE LOWER(?) THEN 1.5
	                    WHEN LOWER(name) LIKE LOWER(?) THEN 1.0
	                    ELSE 0.0
	                END,
	                CASE
	                    WHEN LOWER(title) LIKE LOWER(?) THEN 1.5
	                    WHEN LOWER(title) LIKE LOWER(?) THEN 1.0
	                    ELSE 0.0
	                END,
	                CASE
	                    WHEN LOWER(jurisdiction) LIKE LOWER(?) THEN 1.5
	                    WHEN LOWER(jurisdiction) LIKE LOWER(?) THEN 1.0
	                    ELSE 0.0
	                END
	            ) as score
	        FROM read_parquet(?)
	        WHERE LOWER(name) LIKE LOWER(?)
	            OR LOWER(title) LIKE LOWER(?)
	            OR LOWER(jurisdiction) LIKE LOWER(?)
	        ORDER BY score DESC
	        LIMIT ?
	    """

	    for file_path in file_paths:
	        # Get data source (local or remote URL)
	        data_source = get_data_source(file_path, use_remote=IS_HF_SPACES)

	        try:
	            params = _contact_like_params(search_text, 3, data_source) + [limit]
	            rows = conn.execute(sql, params).fetchall()

	            for name, title, jurisdiction, state_code, score in rows:
	                if score > 0.3:  # Relevance threshold
	                    results.append(SearchResult(
	                        result_type="contact",
	                        title=name if name else "Unknown",
	                        subtitle=f"{title} - {jurisdiction}, {state_code}",
	                        description=f"Local official in {jurisdiction}",
	                        url=f"/people/{name.replace(' ', '-') if name else 'unknown'}",
	                        score=score,
	                        metadata={
	                            "title": title,
	                            "jurisdiction": jurisdiction,
	                            "state": state_code,
	                            "name": name
	                        }
	                    ))

	        except Exception as e:
	            logger.debug(f"Error searching {file_path}: {e}")


	def _search_nonprofit_officers(conn, results: List[SearchResult], search_text: str,
	                               state: Optional[str], limit: int) -> None:
	    """Append matches from contacts_nonprofit_officers.parquet files to ``results``."""
	    if state:
	        # If state specified, search that state's directory
	        nonprofit_files = [GOLD_DIR / "states" / state / "contacts_nonprofit_officers.parquet"]
	    else:
	        # Search all state directories
	        nonprofit_files = [
	            state_dir / "contacts_nonprofit_officers.parquet"
	            for state_dir in (GOLD_DIR / "states").glob("*/")
	        ]

	    # Name matches score higher (1.5 / 1.0) than title or organization matches (1.0 / 0.5)
	    base_sql = """
	        SELECT
	            name,
	            title,
	            organization_name,
	            city,
	            state,
	            compensation,
	            GREATEST(
	                CASE
	                    WHEN LOWER(name) LIKE LOWER(?) THEN 1.5
	                    WHEN LOWER(name) LIKE LOWER(?) THEN 1.0
	                    ELSE 0.0
	                END,
	                CASE
	                    WHEN LOWER(title) LIKE LOWER(?) THEN 1.0
	                    WHEN LOWER(title) LIKE LOWER(?) THEN 0.5
	                    ELSE 0.0
	                END,
	                CASE
	                    WHEN LOWER(organization_name) LIKE LOWER(?) THEN 1.0
	                    WHEN LOWER(organization_name) LIKE LOWER(?) THEN 0.5
	                    ELSE 0.0
	                END
	            ) AS score
	        FROM read_parquet(?)
	        WHERE (LOWER(name) LIKE LOWER(?)
	            OR LOWER(title) LIKE LOWER(?)
	            OR LOWER(organization_name) LIKE LOWER(?))
	    """

	    for nonprofit_file in nonprofit_files:
	        # Get data source (local or remote URL)
	        nonprofit_source = get_data_source(nonprofit_file, use_remote=IS_HF_SPACES)

	        try:
	            logger.info(f"Searching nonprofit officers: {nonprofit_source}")

	            officer_sql = base_sql
	            params = _contact_like_params(search_text, 3, nonprofit_source)

	            if state:
	                officer_sql += " AND LOWER(state) = LOWER(?)"
	                params.append(state)

	            officer_sql += " ORDER BY score DESC, compensation DESC NULLS LAST LIMIT ?"
	            params.append(limit)

	            officer_rows = conn.execute(officer_sql, params).fetchall()

	            for name, title, org_name, city, state_code, compensation, score in officer_rows:
	                if score > 0.3:
	                    comp_text = f" (${compensation:,.0f})" if compensation else ""

	                    results.append(SearchResult(
	                        result_type="contact",
	                        title=name if name else "Unknown",
	                        subtitle=f"{title} - {org_name}{comp_text}",
	                        description=f"Nonprofit officer in {city}, {state_code}",
	                        url=f"/nonprofits?name={org_name.replace(' ', '-') if org_name else 'unknown'}",
	                        score=score,
	                        metadata={
	                            "title": title,
	                            "organization": org_name,
	                            "city": city,
	                            "state": state_code,
	                            "compensation": compensation,
	                            "contact_type": "nonprofit_officer"
	                        }
	                    ))

	            officer_count = sum(1 for r in results if r.metadata.get('contact_type') == 'nonprofit_officer')
	            logger.info(f"Found {officer_count} nonprofit officer results")

	        except Exception as e:
	            logger.debug(f"Error searching nonprofit officers: {e}")


	def search_contacts_duckdb(query: str, state: Optional[str] = None, limit: int = 10) -> List[SearchResult]:
	    """
	    Search contacts using DuckDB (supports local files or remote HTTP parquet).
	    This is the fallback when HF Search API is unavailable.
	    Supports browse mode when query is empty.

	    Three sources are searched in turn - state officials, local officials and
	    nonprofit officers - and the combined hits are sorted by score. Errors
	    for an individual file are logged at debug level and skipped.

	    Args:
	        query: Search text; empty, None or whitespace-only means browse mode.
	        state: Optional state code; used both as directory name and as filter.
	        limit: Maximum rows per file and maximum results returned.

	    Returns:
	        Up to ``limit`` SearchResult objects, best score first. If an
	        unexpected error interrupts the search, whatever was collected so far
	        is returned unsorted.
	    """
	    results: List[SearchResult] = []

	    # Determine if this is browse mode (no query) or search mode
	    is_browse_mode = not query or query.strip() == ''
	    # In browse mode the LIKE patterns must be '%'/'%%' (match everything);
	    # interpolating the raw query would yield '%None%' or a whitespace pattern.
	    search_text = '' if is_browse_mode else query

	    conn = None
	    try:
	        conn = duckdb.connect()

	        # Search 1: State Officials (OpenStates - state legislators, mayors, etc.)
	        _search_state_officials(conn, results, search_text, is_browse_mode, state, limit)
	        # Search 2: Local Officials (from meeting transcripts)
	        _search_local_officials(conn, results, search_text, state, limit)
	        # Search 3: Nonprofit Officers from state directories
	        _search_nonprofit_officers(conn, results, search_text, state, limit)

	        # Sort all results by score and limit
	        results.sort(key=lambda x: x.score, reverse=True)
	        logger.info(f"DuckDB search found {len(results)} contacts for query '{query}'")
	        return results[:limit]

	    except Exception as e:
	        logger.error(f"Contact search error: {e}")
	        return results

	    finally:
	        # Release the connection on every path, including the error path
	        if conn is not None:
	            try:
	                conn.close()
	            except Exception as e:
	                logger.debug(f"Error closing DuckDB connection: {e}")


	def search_contacts(query: str, state: Optional[str] = None, limit: int = 10) -> List[SearchResult]:
	    """
	    HYBRID SEARCH over contacts (local officials and nonprofit officers).

	    Order of attempts:
	    1. HuggingFace Search API - only when a query is given and IS_HF_SPACES is set.
	       Its hits are returned directly, each with a fixed score of 1.0.
	    2. DuckDB search - used when step 1 is skipped, returns nothing, or raises.

	    Args:
	        query: Search text (name, title, organization, etc.)
	        state: Optional 2-letter state code filter
	        limit: Maximum results to return

	    Returns:
	        List of SearchResult objects
	    """
	    logger.info(f"🔎 search_contacts() called - query={query!r}, state={state!r}, limit={limit}, IS_HF_SPACES={IS_HF_SPACES}")

	    def hf_row_to_result(row) -> SearchResult:
	        # Map one HF Search API row onto the unified result shape
	        source_type = row.get('source', 'contact')
	        name = row.get('name', 'Unknown')
	        title = row.get('title', '')
	        jurisdiction = row.get('jurisdiction', row.get('organization_name', ''))
	        state_code = row.get('state', state or '')
	        role_label = 'Local official' if source_type == 'local_officials' else 'Nonprofit officer'

	        return SearchResult(
	            result_type="contact",
	            title=name,
	            subtitle=f"{title} - {jurisdiction}, {state_code}",
	            description=f"{role_label} in {jurisdiction}",
	            url=f"/people/{name.replace(' ', '-')}",
	            score=1.0,
	            metadata={
	                "title": title,
	                "jurisdiction": jurisdiction,
	                "state": state_code,
	                "name": name,
	                "source": source_type
	            }
	        )

	    # STRATEGY 1: Try HuggingFace Search API (fast text search)
	    if query and IS_HF_SPACES:
	        logger.info(f"🔍 Trying HF Search API for '{query}' (state={state})")
	        try:
	            hf_rows = search_contacts_hf(query, state, limit=limit)

	            if hf_rows:
	                logger.info(f"✅ HF Search API returned {len(hf_rows)} results")
	                converted = [hf_row_to_result(row) for row in hf_rows]
	                return converted[:limit]
	        except Exception as e:
	            logger.warning(f"HF Search API failed, falling back to DuckDB: {e}")

	    # STRATEGY 2: Fall back to DuckDB (works with local or remote parquet)
	    logger.info(f"🔍 Using DuckDB {'remote' if IS_HF_SPACES else 'local'} search for '{query}'")
	    return search_contacts_duckdb(query, state, limit)


	def _meeting_cell_text(value: Any) -> str:
	    """Render a DataFrame cell as text, mapping None/NaN/NaT to an empty string.

	    Plain ``str(value)`` would produce the literal words 'None' or 'nan',
	    which then show up in titles and can match queries.
	    """
	    if value is None:
	        return ''
	    try:
	        if pd.isna(value):
	            return ''
	    except (TypeError, ValueError):
	        # Array-like cells: pd.isna returns an array whose truth value is ambiguous
	        pass
	    return str(value)


	def search_meetings(query: str, state: Optional[str] = None, limit: int = 10) -> List[SearchResult]:
	    """
	    Search meeting transcripts and agendas stored as per-state parquet files.

	    File names are tried in order (events.parquet, events_events.parquet,
	    meetings.parquet) and the first name that matches any file is used. At
	    most five files are read. Each row is scored on title, the first 500
	    characters of the body (weight 0.8) and jurisdiction (weight 0.6); rows
	    scoring above 0.3 are returned.

	    Args:
	        query: Search text. An empty query yields no results.
	        state: Optional state directory name; all states are searched when omitted.
	        limit: Maximum results to return.

	    Returns:
	        Up to ``limit`` SearchResult objects, best score first.
	    """
	    results: List[SearchResult] = []

	    # calculate_relevance_score() is 0.0 for an empty query, so nothing could
	    # pass the threshold - skip reading the files altogether.
	    if not query:
	        return results

	    def pick_column(columns, candidates, default):
	        # First candidate present in the file's schema, else the default name
	        return next((name for name in candidates if name in columns), default)

	    try:
	        # Search state event/meeting files (try new naming first, fallback to old)
	        scope = state if state else "*"
	        meeting_files = []
	        for filename in ("events.parquet", "events_events.parquet", "meetings.parquet"):
	            meeting_files = list(GOLD_DIR.glob(f"states/{scope}/{filename}"))
	            if meeting_files:
	                break

	        for file_path in meeting_files[:5]:  # Limit for performance
	            try:
	                df = pd.read_parquet(file_path)
	                state_code = file_path.parent.name

	                # Detect schema - different files have different column names
	                columns = set(df.columns)

	                # Map column names (handle LocalView vs CityScrapers vs other formats)
	                title_col = pick_column(columns, ('vid_title', 'event_title'), 'title')
	                body_col = pick_column(columns, ('caption_text_clean', 'caption_text', 'full_text'), 'body')
	                jurisdiction_col = pick_column(columns, ('place_name', 'jurisdiction_name'), 'jurisdiction')
	                date_col = pick_column(columns, ('meeting_date',), 'date')
	                id_col = pick_column(columns, ('vid_id', 'meeting_id'), 'id')

	                # Search in title, body, jurisdiction
	                for _, row in df.iterrows():
	                    title = _meeting_cell_text(row.get(title_col, ''))
	                    body = _meeting_cell_text(row.get(body_col, ''))[:500]  # First 500 chars
	                    jurisdiction = _meeting_cell_text(row.get(jurisdiction_col, ''))
	                    meeting_date = _meeting_cell_text(row.get(date_col, ''))
	                    meeting_id = _meeting_cell_text(row.get(id_col, ''))

	                    score = max(
	                        calculate_relevance_score(title, query),
	                        calculate_relevance_score(body, query) * 0.8,  # Body matches slightly lower
	                        calculate_relevance_score(jurisdiction, query) * 0.6
	                    )

	                    if score > 0.3:
	                        # Snippet is the start of the body, truncated to 200 chars
	                        snippet = body[:200] + "..." if len(body) > 200 else body

	                        results.append(SearchResult(
	                            result_type="meeting",
	                            title=title,
	                            subtitle=f"{jurisdiction}, {state_code} - {meeting_date}",
	                            description=snippet,
	                            url=f"/documents?meeting_id={meeting_id}",
	                            score=score,
	                            metadata={
	                                "jurisdiction": jurisdiction,
	                                "state": state_code,
	                                "date": meeting_date,
	                                "meeting_id": meeting_id
	                            }
	                        ))
	            except Exception as e:
	                # One unreadable file must not abort the remaining files
	                logger.debug(f"Error searching {file_path}: {e}")

	    except Exception as e:
	        logger.error(f"Meeting search error: {e}")

	    results.sort(key=lambda x: x.score, reverse=True)
	    return results[:limit]


	def count_organizations(state: Optional[str] = None, ntee_code: Optional[str] = None, query: Optional[str] = None) -> int:
	    """
	    Count total organizations matching criteria (for pagination) - cached.

	    Counts rows of the state file (when ``state`` is given) or the national
	    nonprofits_organizations.parquet file under GOLD_DIR. Successful counts
	    are cached for one hour per (state, ntee_code, query) combination.

	    Args:
	        state: Optional state directory name selecting the per-state file.
	        ntee_code: Optional NTEE code prefix; ignored if the file has no NTEE column.
	        query: Optional case-insensitive substring of the organization name.

	    Returns:
	        Number of matching rows; 0 when the file is missing or the query fails.
	    """
	    # Create cache key
	    cache_key = f"count_{state}_{ntee_code}_{query}"

	    # Check cache (1 hour TTL)
	    now = datetime.now()
	    if cache_key in _count_cache:
	        cached_time = _count_cache_ttl.get(cache_key)
	        if cached_time and (now - cached_time).total_seconds() < 3600:
	            return _count_cache[cache_key]

	    conn = None
	    try:
	        # Determine file path
	        if state:
	            file_path = Path(f"{GOLD_DIR}/states/{state}/nonprofits_organizations.parquet")
	        else:
	            file_path = Path(f"{GOLD_DIR}/national/nonprofits_organizations.parquet")

	        if not file_path.exists():
	            return 0

	        conn = duckdb.connect()

	        # Detect schema
	        columns_query = f"DESCRIBE SELECT * FROM '{file_path}' LIMIT 0"
	        available_columns = {row[0] for row in conn.execute(columns_query).fetchall()}
	        name_col = 'organization_name' if 'organization_name' in available_columns else 'name'
	        ntee_col = 'ntee_code' if 'ntee_code' in available_columns else 'ntee_cd'

	        # Build WHERE clause
	        where_clauses = []
	        params = []

	        if query and query.strip():
	            where_clauses.append(f"LOWER({name_col}) LIKE LOWER(?)")
	            params.append(f'%{query}%')

	        if ntee_code and ntee_col in available_columns:
	            where_clauses.append(f"{ntee_col} LIKE ?")
	            params.append(f'{ntee_code}%')

	        where_sql = " AND ".join(where_clauses) if where_clauses else "TRUE"

	        # Count the rows of the file that was just inspected. The path is bound
	        # as a parameter and therefore comes first in the parameter list.
	        count_sql = f"SELECT COUNT(*) FROM read_parquet(?) WHERE {where_sql}"
	        result = conn.execute(count_sql, [str(file_path)] + params).fetchone()

	        count = result[0] if result else 0

	        # Cache the result
	        _count_cache[cache_key] = count
	        _count_cache_ttl[cache_key] = now

	        return count
	    except Exception as e:
	        logger.error(f"Count error: {e}")
	        return 0
	    finally:
	        # Release the connection on every path, including errors
	        if conn is not None:
	            try:
	                conn.close()
	            except Exception as e:
	                logger.debug(f"Error closing DuckDB connection: {e}")


	def search_organizations(query: str, state: Optional[str] = None, ntee_code: Optional[str] = None, limit: int = 10, offset: int = 0, enrich: bool = False, sort: str = 'relevance', ein: Optional[str] = None) -> List[SearchResult]:
	"""Search nonprofit organizations using DuckDB for fast Parquet queries

	Args:
	enrich: If True, fetch additional data from Every.org API (slower)
	sort: Sort order - 'relevance', 'name-asc', 'name-desc', 'revenue-asc', 'revenue-desc', 'assets-asc', 'assets-desc'
	ein: If provided, filter to exact EIN match (for direct organization links)
	"""
	results = []

	try:
	# Determine file path
	if state:
	file_pattern = f"{GOLD_DIR}/states/{state}/nonprofits_organizations.parquet"
	else:
	file_pattern = f"{GOLD_DIR}/national/nonprofits_organizations.parquet"

	# Get data source (local or remote HuggingFace URL)
	file_path = Path(file_pattern)
	data_source = get_data_source(file_path, use_remote=IS_HF_SPACES)

	# Load parquet with caching (speeds up from 2-3s to <10ms)
	df = load_parquet_cached(data_source)

	# Initialize DuckDB connection
	conn = duckdb.connect()

	# Query the DataFrame directly (DuckDB can query pandas DataFrames)
	available_columns = set(df.columns)

	# Detect column name variations (handle different schemas)
	name_col = 'organization_name' if 'organization_name' in available_columns else 'name'
	ntee_col = 'ntee_code' if 'ntee_code' in available_columns else 'ntee_cd'
	revenue_col = 'form_990_total_revenue' if 'form_990_total_revenue' in available_columns else 'revenue_amt'
	asset_col = 'form_990_total_assets' if 'form_990_total_assets' in available_columns else 'asset_amt'
	income_col = 'form_990_net_income' if 'form_990_net_income' in available_columns else 'income_amt'

	# Build WHERE clauses using detected column names
	where_clauses = []
	params = []

	# EIN filter (exact match - highest priority for direct organization links)
	if ein and ein.strip():
	where_clauses.append("ein = ?")
	params.append(ein.strip())

	# Search query (case-insensitive LIKE) - only if query provided and no EIN
	if query and query.strip() and not ein:
	where_clauses.append(f"LOWER({name_col}) LIKE LOWER(?)")
	params.append(f'%{query}%')

	# State filter (if using national file)
	if state and not file_pattern.startswith(f"{GOLD_DIR}/states/"):
	where_clauses.append("state = ?")
	params.append(state)

	# NTEE code filter
	if ntee_code and ntee_col in available_columns:
	where_clauses.append(f"{ntee_col} LIKE ?")
	params.append(f'{ntee_code}%')

	# Default to TRUE if no filters (browse all)
	where_sql = " AND ".join(where_clauses) if where_clauses else "TRUE"

	# Build column list with proper NULL handling for missing columns
	select_columns = []

	# Add core columns (with aliases for consistency)
	select_columns.append(f'{name_col} as name')
	select_columns.append('city')
	select_columns.append('state')
	select_columns.append(f'{ntee_col} as ntee_cd' if ntee_col in available_columns else 'NULL as ntee_cd')
	select_columns.append('ein')
	select_columns.append(f'{revenue_col} as revenue_amt' if revenue_col in available_columns else 'NULL as revenue_amt')
	select_columns.append(f'{asset_col} as asset_amt' if asset_col in available_columns else 'NULL as asset_amt')
	select_columns.append(f'{income_col} as income_amt' if income_col in available_columns else 'NULL as income_amt')
	select_columns.append('tax_period' if 'tax_period' in available_columns else 'NULL as tax_period')

	# Track enrichment columns (form_990 and bigquery)
	enrichment_cols = []
	enrichment_col_map = {}

	# Check for website columns (multiple possible names) - ALWAYS add if exists
	website_col_added = False
	for col_name in ['bigquery_website', 'form_990_website', 'website', 'everyorg_website']:
	if col_name in available_columns:
	select_columns.append(f'{col_name} as enrichment_website')
	enrichment_cols.append('enrichment_website')
	enrichment_col_map['enrichment_website'] = col_name
	website_col_added = True
	logger.debug(f"Added website column: {col_name}")
	break

	# Check for mission columns
	mission_col_added = False
	for col_name in ['bigquery_mission', 'form_990_mission', 'mission', 'everyorg_mission']:
	if col_name in available_columns:
	select_columns.append(f'{col_name} as enrichment_mission')
	enrichment_cols.append('enrichment_mission')
	enrichment_col_map['enrichment_mission'] = col_name
	mission_col_added = True
	logger.debug(f"Added mission column: {col_name}")
	break

	# Check for logo columns
	logo_col_added = False
	for col_name in ['logodev_logo_url', 'everyorg_logo_url', 'logo_url']:
	if col_name in available_columns:
	select_columns.append(f'{col_name} as enrichment_logo')
	enrichment_cols.append('enrichment_logo')
	enrichment_col_map['enrichment_logo'] = col_name
	logo_col_added = True
	logger.debug(f"Added logo column: {col_name}")
	break

	# Last updated timestamp
	for col_name in ['bigquery_updated_date', 'form_990_last_updated', 'everyorg_last_updated']:
	if col_name in available_columns:
	select_columns.append(f'{col_name} as enrichment_last_updated')
	enrichment_cols.append('enrichment_last_updated')
	enrichment_col_map['enrichment_last_updated'] = col_name
	logger.debug(f"Added timestamp column: {col_name}")
	break

	columns_sql = ', '.join(select_columns)

	# Log what we're selecting
	logger.info(f"🔍 Enrichment columns to select: {enrichment_cols}")
	logger.info(f"📋 Full SQL columns: {columns_sql}")

	# Build ORDER BY clause based on sort parameter
	order_by_clauses = []

	if sort == 'name-asc':
	order_by_clauses.append(f"{name_col} ASC")
	elif sort == 'name-desc':
	order_by_clauses.append(f"{name_col} DESC")
	elif sort == 'revenue-desc':
	order_by_clauses.append(f"COALESCE(TRY_CAST({revenue_col} AS BIGINT), 0) DESC")
	elif sort == 'revenue-asc':
	# Low to high: Show positive values first (smallest to largest), then zeros, then negatives
	order_by_clauses.append(f"""
	CASE
	WHEN TRY_CAST({revenue_col} AS BIGINT) IS NULL THEN 3
	WHEN TRY_CAST({revenue_col} AS BIGINT) <= 0 THEN 2
	ELSE 1
	END ASC,
	ABS(COALESCE(TRY_CAST({revenue_col} AS BIGINT), 0)) ASC
	""")
	elif sort == 'assets-desc':
	order_by_clauses.append(f"COALESCE(TRY_CAST({asset_col} AS BIGINT), 0) DESC")
	elif sort == 'assets-asc':
	# Low to high: Show positive values first (smallest to largest), then zeros, then negatives
	order_by_clauses.append(f"""
	CASE
	WHEN TRY_CAST({asset_col} AS BIGINT) IS NULL THEN 3
	WHEN TRY_CAST({asset_col} AS BIGINT) <= 0 THEN 2
	ELSE 1
	END ASC,
	ABS(COALESCE(TRY_CAST({asset_col} AS BIGINT), 0)) ASC
	""")
	elif query and query.strip():
	# Relevance sort (only for search mode)
	order_by_clauses.append("score DESC")
	order_by_clauses.append(f"COALESCE(TRY_CAST({revenue_col} AS BIGINT), 0) DESC")
	else:
	# Default browse mode: sort by revenue/assets
	order_by_clauses.append(f"COALESCE(TRY_CAST({revenue_col} AS BIGINT), 0) DESC")
	order_by_clauses.append(f"COALESCE(TRY_CAST({asset_col} AS BIGINT), 0) DESC")

	# Always add name as final sort for consistency
	if 'name' not in sort:
	order_by_clauses.append(f"{name_col}")

	order_by_sql = ', '.join(order_by_clauses)

	# SQL query with relevance scoring (browse mode if no query)
	if query and query.strip():
	# Search mode: score by text match
	sql = f"""
	SELECT
	{columns_sql},
	CASE
	WHEN LOWER({name_col}) LIKE LOWER(?) THEN 1.5
	WHEN LOWER({name_col}) LIKE LOWER(?) THEN 1.0
	ELSE 0.5
	END as score
	FROM df
	WHERE {where_sql}
	ORDER BY {order_by_sql}
	LIMIT ? OFFSET ?
	"""
	# Execute query with scoring parameters
	query_params = [f'{query}%', f'%{query}%'] + params + [limit, offset]
	else:
	# Browse mode: sort by size/activity
	sql = f"""
	SELECT
	{columns_sql},
	1.0 as score
	FROM df
	WHERE {where_sql}
	ORDER BY {order_by_sql}
	LIMIT ? OFFSET ?
	"""
	# Execute query without scoring parameters
	query_params = params + [limit, offset]

	rows = conn.execute(sql, query_params).fetchall()

	# NTEE code descriptions for better context
	ntee_descriptions = {
	'E': 'Health Services',
	'E60': 'Health Support Services',
	'E61': 'Blood Supply',
	'E62': 'Emergency Medical Services',
	'E65': 'Organ & Tissue Banks',
	'E70': 'Public Health',
	'E80': 'Health Treatment - Primary Care',
	'E90': 'Nursing Services',
	'E20': 'Hospitals & Primary Medical Care',
	'E30': 'Ambulatory & Primary Health Care',
	'E32': 'Clinics & Community Health Centers',
	'P': 'Human Services',
	'B': 'Education',
	'X': 'Religion-Related',
	'A': 'Arts, Culture & Humanities',
	}

	# Convert to SearchResult objects with intelligent enrichment
	for row in rows:
	# Unpack base columns (now includes tax_period)
	org_name, city, state_code, ntee, ein, revenue, assets, income, tax_period = row[:9]

	# Unpack optional enrichment columns if present
	existing_data = {}
	idx = 9

	if 'enrichment_website' in enrichment_cols:
	existing_data['enrichment_website'] = row[idx]
	# Only log non-null websites to reduce spam
	if row[idx] and str(row[idx]) != 'nan':
	logger.debug(f"✅ Website: {row[idx]}")
	idx += 1
	if 'enrichment_mission' in enrichment_cols:
	existing_data['enrichment_mission'] = row[idx]
	idx += 1
	if 'enrichment_logo' in enrichment_cols:
	existing_data['enrichment_logo'] = row[idx]
	idx += 1
	if 'enrichment_last_updated' in enrichment_cols:
	existing_data['enrichment_last_updated'] = row[idx]
	idx += 1

	score = row[-1] # Score is always last

	# Parse tax year from tax_period (format: YYYYMM)
	tax_year = None
	if tax_period and str(tax_period).isdigit() and len(str(tax_period)) >= 4:
	tax_year = int(str(tax_period)[:4])

	# Get enriched data with intelligent backfill (only if requested)
	enrichment = get_enrichment_data(ein, existing_data) if (ein and enrich) else {}

	# Build a more informative description
	ntee_desc = None
	if ntee:
	# Try exact match first, then prefix match
	ntee_desc = ntee_descriptions.get(ntee)
	if not ntee_desc:
	# Try first character (major category)
	ntee_desc = ntee_descriptions.get(ntee[0]) if ntee else None

	# Use enriched mission as primary description, fallback to NTEE + financial
	description = enrichment.get('mission') if enrichment.get('mission') else None

	# Validate mission: if it contains a different org name, it's stale data
	if description and org_name:
	# Check if mission mentions a completely different org name
	# (e.g., "Catalyst Institute" when org name is "CAREQUEST INSTITUTE")
	mission_lower = description.lower()
	name_words = set(org_name.lower().split())

	# If mission starts with an org name that's not in our actual org name, skip it
	first_sentence = description.split('.')[0].lower()
	if ' is a nonprofit' in first_sentence or ' is an nonprofit' in first_sentence:
	# Extract the subject (organization name before "is a nonprofit")
	subject = first_sentence.split(' is a')[0].strip()
	subject_words = set(subject.split())

	# If the subject shares NO significant words with our org name, it's stale
	# (e.g., "catalyst institute" vs "carequest institute")
	significant_words = subject_words - {'the', 'a', 'an', 'of', 'for', 'and', 'inc', 'llc'}
	name_significant = name_words - {'the', 'a', 'an', 'of', 'for', 'and', 'inc', 'llc', 'institute'}

	if significant_words and not (significant_words & name_significant):
	# Stale data - mission talks about a different org
	logger.warning(f"Stale mission data for {org_name}: '{subject}' != '{org_name}'")
	description = None

	if not description:
	description_parts = []
	if ntee_desc:
	description_parts.append(ntee_desc)

	# Convert financial data to numbers (handle None and string types)
	try:
	revenue_num = float(revenue) if revenue else 0
	assets_num = float(assets) if assets else 0
	except (ValueError, TypeError):
	revenue_num = 0
	assets_num = 0

	if revenue_num > 0:
	description_parts.append(f"Revenue: ${revenue_num:,.0f}")
	elif assets_num > 0:
	description_parts.append(f"Assets: ${assets_num:,.0f}")

	if not description_parts:
	description_parts.append(f"Nonprofit serving {city}")

	description = " • ".join(description_parts)

	# Build metadata with enriched fields
	metadata = {
	"ein": ein,
	"city": city,
	"state": state_code,
	"ntee_code": ntee,
	"revenue": revenue,
	"assets": assets,
	"income": income,
	"tax_year": tax_year,
	"data_sources": []
	}

	# ALWAYS add enrichment from parquet columns (existing_data) - no enrich flag needed
	if existing_data.get('enrichment_website'):
	metadata['website'] = existing_data['enrichment_website']
	metadata['data_sources'].append('cached')

	if existing_data.get('enrichment_mission'):
	metadata['mission'] = existing_data['enrichment_mission']
	if 'cached' not in metadata['data_sources']:
	metadata['data_sources'].append('cached')

	if existing_data.get('enrichment_logo'):
	metadata['logo_url'] = existing_data['enrichment_logo']
	if 'cached' not in metadata['data_sources']:
	metadata['data_sources'].append('cached')

	# Add API enrichment if requested (enrich=true)
	if enrichment:
	if enrichment.get('website') and not metadata.get('website'):
	metadata['website'] = enrichment['website']
	if enrichment.get('logo_url'):
	metadata['logo_url'] = enrichment['logo_url']
	if enrichment.get('profile_url'):
	metadata['profile_url'] = enrichment['profile_url']
	if enrichment.get('causes'):
	metadata['causes'] = enrichment['causes']
	# Add API data sources
	for source in enrichment.get('data_sources', []):
	if source not in metadata['data_sources']:
	metadata['data_sources'].append(source)

	results.append(SearchResult(
	result_type="organization",
	title=org_name if org_name else "Unknown",
	subtitle=f"{city}, {state_code}" + (f" - NTEE: {ntee}" if ntee else ""),
	description=description,
	url=f"/search?types=organizations&state={state_code}&ein={ein}",
	score=score,
	metadata=metadata
	))

	conn.close()
	logger.info(f"DuckDB search found {len(results)} organizations for query '{query}'")

	except Exception as e:
	logger.error(f"Organization search error: {e}")

	return results


	def search_causes(query: str, limit: int = 10) -> List[SearchResult]:
	    """
	    Look up causes / NTEE categories from the reference parquet file.

	    Search mode (non-blank ``query``): every row is scored against both its
	    description and its NTEE code with ``calculate_relevance_score``; the
	    higher of the two is kept and rows scoring 0.3 or lower are discarded.

	    Browse mode (blank ``query``): every row is returned with a score of 1.0.

	    Any exception is logged and the results gathered so far are returned.

	    Returns:
	        At most ``limit`` results, ordered by descending score.
	    """
	    matches = []

	    try:
	        # Resolve the data source (local file or remote HuggingFace URL)
	        # and read it through the in-memory parquet cache.
	        parquet_path = GOLD_DIR / "reference" / "causes_ntee_codes.parquet"
	        source = get_data_source(parquet_path, use_remote=IS_HF_SPACES)
	        ntee_df = load_parquet_cached(source)
	        logger.debug(f"Loaded {len(ntee_df)} NTEE codes from cache")

	        for _, record in ntee_df.iterrows():
	            ntee_code = str(record.get('ntee_code', ''))
	            label = str(record.get('description', ''))
	            category = str(record.get('ntee_type', ''))

	            # Browse mode keeps the default score; search mode replaces it
	            # with the best text match and filters weak matches out.
	            relevance = 1.0
	            if query and query.strip():
	                by_label = calculate_relevance_score(label, query)
	                by_code = calculate_relevance_score(ntee_code, query)
	                relevance = max(by_label, by_code)
	                if relevance <= 0.3:
	                    continue

	            cause_metadata = {
	                "ntee_code": ntee_code,
	                "ntee_type": category
	            }
	            matches.append(SearchResult(
	                result_type="cause",
	                title=label,
	                subtitle=f"NTEE Code: {ntee_code}",
	                description=f"Category type: {category}",
	                url=f"/nonprofits?ntee_code={ntee_code}",
	                score=relevance,
	                metadata=cause_metadata
	            ))

	        logger.info(f"Found {len(matches)} cause results for query '{query}'")

	    except Exception as e:
	        logger.error(f"Cause search error: {e}")

	    # Highest score first, then trim to the requested size
	    matches.sort(key=lambda item: item.score, reverse=True)
	    return matches[:limit]


	def search_jurisdictions(query: str, state: Optional[str] = None, city: Optional[str] = None, jurisdiction_levels: Optional[List[str]] = None, limit: int = 10, offset: int = 0) -> List[SearchResult]:
	    """
	    Search cities, counties, townships, and school districts using DuckDB.

	    Each jurisdiction type lives in its own parquet file under
	    ``GOLD_DIR/reference`` and carries a base score used to boost some types
	    over others. Files that do not exist locally are skipped. When ``query``
	    is given, an exact name match doubles the base score and a prefix match
	    multiplies it by 1.5.

	    Args:
	        query: Text matched (case-insensitively, substring) against NAME.
	        state: Optional value compared for equality with the ``state`` column.
	        city: When given together with ``query``, the name filter uses the
	            city text instead of the query text.
	        jurisdiction_levels: Optional frontend level ids (city, county, town,
	            village, school_district, special_district, state) restricting
	            which files are searched. Ids that map to no file are ignored;
	            if none map to a file, all files are searched.
	        limit: Maximum number of results returned.
	        offset: Number of merged results to skip.

	    Returns:
	        The requested page of results, best score first with ties ordered
	        by title A-Z. Errors are logged and yield a partial/empty list.
	    """
	    all_results = []
	    conn = None

	    try:
	        conn = duckdb.connect()

	        # Map frontend level IDs to file keys
	        level_mapping = {
	            'city': 'city',
	            'county': 'county',
	            'town': 'township',
	            'village': 'township',
	            'school_district': 'school district',
	            'special_district': 'school district',  # Use school district as proxy
	            'state': None  # States handled separately if needed
	        }

	        # Define jurisdiction files with priority scores
	        jurisdiction_files = {
	            'county': (f"{GOLD_DIR}/reference/jurisdictions_counties.parquet", 1.3),  # Boost counties
	            'city': (f"{GOLD_DIR}/reference/jurisdictions_cities.parquet", 1.0),
	            'school district': (f"{GOLD_DIR}/reference/jurisdictions_school_districts.parquet", 1.1),  # Boost school districts
	            'township': (f"{GOLD_DIR}/reference/jurisdictions_townships.parquet", 0.9)
	        }

	        # Filter jurisdiction files based on selected levels
	        if jurisdiction_levels:
	            selected_file_keys = {
	                level_mapping.get(level) for level in jurisdiction_levels
	            }
	            selected_file_keys.discard(None)

	            # Only narrow the search when at least one level mapped to a file
	            if selected_file_keys:
	                jurisdiction_files = {
	                    k: v for k, v in jurisdiction_files.items()
	                    if k in selected_file_keys
	                }

	        # Every type must contribute enough rows to fill the requested page:
	        # the merged list is sliced with [offset:offset + limit] below, so
	        # fetching only `limit` rows per type would truncate later pages.
	        # The floor of 15 keeps some diversity across types for small limits.
	        per_type_limit = max(offset + limit, 15)

	        for jtype, (file_path, type_score) in jurisdiction_files.items():
	            file_path_obj = Path(file_path)
	            if not file_path_obj.exists():
	                continue

	            try:
	                # Build SQL query - use state column (lowercase)
	                where_clauses = []
	                params = []

	                if state:
	                    where_clauses.append("state = ?")
	                    params.append(state)

	                if city and query:
	                    # If city is specified, search for jurisdictions matching the city name
	                    where_clauses.append("LOWER(NAME) LIKE LOWER(?)")
	                    params.append(f"%{city}%")
	                elif query:
	                    # General search across jurisdiction names
	                    where_clauses.append("LOWER(NAME) LIKE LOWER(?)")
	                    params.append(f"%{query}%")

	                where_clause = " AND ".join(where_clauses) if where_clauses else "1=1"

	                # Score expression. The user-supplied query is bound as a
	                # parameter rather than interpolated into the SQL text, so
	                # quotes in the query (e.g. "O'Brien") neither break the
	                # statement nor allow SQL injection. `type_score` is a
	                # constant from the table above, so inlining it is safe.
	                score_expr = f"{type_score}"
	                score_params = []
	                if query:
	                    score_expr = f"""CASE
	                        WHEN LOWER(NAME) = LOWER(?) THEN {type_score} * 2.0
	                        WHEN LOWER(NAME) LIKE LOWER(?) THEN {type_score} * 1.5
	                        ELSE {type_score}
	                    END"""
	                    score_params = [query, f"{query}%"]

	                sql = f"""
	                    SELECT
	                        NAME as name,
	                        state,
	                        GEOID as geoid,
	                        jurisdiction_type,
	                        {score_expr} as score
	                    FROM read_parquet(?)
	                    WHERE {where_clause}
	                    ORDER BY score DESC, NAME ASC
	                    LIMIT ?
	                """

	                # Positional parameters must follow the order of the `?`
	                # placeholders in the statement: score expression first,
	                # then the parquet path, the WHERE filters, and the LIMIT.
	                query_params = score_params + [str(file_path_obj)] + params + [per_type_limit]
	                df = conn.execute(sql, query_params).fetchdf()

	                for _, row in df.iterrows():
	                    jurisdiction_label = row['jurisdiction_type'].replace('_', ' ').title()
	                    all_results.append(SearchResult(
	                        result_type='jurisdiction',
	                        title=f"{row['name']}",
	                        subtitle=f"{jurisdiction_label}",
	                        description=f"{jurisdiction_label} in {row['state']}",
	                        url=f"/jurisdictions/{row['geoid']}",
	                        score=float(row['score']),
	                        metadata={
	                            'state': row['state'],
	                            'geoid': row['geoid'],
	                            'type': row['jurisdiction_type']
	                        }
	                    ))

	            except Exception as e:
	                # One broken file must not prevent the other types from loading
	                logger.error(f"Error searching {jtype} jurisdictions: {e}")
	                continue

	    except Exception as e:
	        logger.error(f"Jurisdiction search error: {e}")
	    finally:
	        # Always release the DuckDB connection, including on error paths
	        if conn is not None:
	            conn.close()

	    # Merge order: best score first, ties broken by title A-Z so the merged
	    # list agrees with the per-file "ORDER BY score DESC, NAME ASC".
	    all_results.sort(key=lambda x: (-x.score, x.title))
	    return all_results[offset:offset + limit]


	@router.get("/api/search")
	@router.get("/api/search/", include_in_schema=False)
	async def unified_search(
	    q: Optional[str] = Query(None, description="Search query (optional - browse by filters if omitted)"),
	    types: Optional[str] = Query(None, description="Comma-separated result types: contacts,meetings,organizations,causes,jurisdictions"),
	    state: Optional[str] = Query(None, description="Filter by state (2-letter code)"),
	    city: Optional[str] = Query(None, description="Filter by city name"),
	    jurisdiction_levels: Optional[str] = Query(None, description="Comma-separated jurisdiction levels: city,county,town,village,school_district,special_district,state"),
	    ntee_code: Optional[str] = Query(None, description="Filter organizations by NTEE code"),
	    ein: Optional[str] = Query(None, description="Filter organizations by exact EIN (for direct organization links)"),
	    limit: int = Query(20, ge=1, le=100, description="Maximum results per type"),
	    offset: int = Query(0, ge=0, description="Number of results to skip (for pagination)"),
	    page: int = Query(1, ge=1, description="Page number (alternative to offset)"),
	    enrich: bool = Query(False, description="Enable API enrichment (slower - fetches logos, causes from Every.org)"),
	    sort: str = Query('relevance', description="Sort order: relevance, name-asc, name-desc, revenue-asc, revenue-desc, assets-asc, assets-desc")
	):
	    """
	    Unified search across all data types

	    Search for contacts, meetings, organizations, causes, and jurisdictions in one query.
	    The query is optional - you can browse by state/type without searching.

	    Pagination:
	    - Use `offset` to skip results: `offset=20` skips first 20 results
	    - Or use `page` with `limit`: `page=2&limit=20` gets results 21-40
	    - `offset` takes precedence if both are provided

	    Sorting:
	    - Combined results are ranked by score
	    - When a single type is requested with an explicit non-relevance `sort`,
	      the order returned by the backend is kept

	    Examples:
	    - `/api/search?q=dental` - Search everything for "dental"
	    - `/api/search?types=organizations&state=GA` - Browse all orgs in Georgia
	    - `/api/search?q=budget&types=meetings` - Search only meetings
	    - `/api/search?q=health&state=AL` - Search in Alabama only
	    - `/api/search?q=education&types=organizations,causes` - Search orgs and causes
	    - `/api/search?q=health&state=MA&page=2&limit=20` - Page 2 of MA health results

	    Returns a dict with the query, totals, results grouped by type, pagination
	    info and the applied filters. On failure a 500 JSONResponse with a
	    structured error body is returned instead.
	    """
	    # 🔍 DEBUG LOGGING - Log all incoming request parameters
	    logger.info(f"🔍 SEARCH REQUEST: q={q!r}, types={types!r}, state={state!r}, city={city!r}, jurisdiction_levels={jurisdiction_levels!r}, ntee_code={ntee_code!r}, ein={ein!r}, limit={limit}, offset={offset}, page={page}, enrich={enrich}, sort={sort!r}")

	    try:
	        # Calculate offset from page if offset not explicitly provided
	        if offset == 0 and page > 1:
	            offset = (page - 1) * limit

	        # Parse requested types. Blank tokens ("organizations," or "a,,b") and
	        # duplicates are dropped so they cannot distort the single-type check
	        # below; an effectively empty list means "all types".
	        requested_types = []
	        if types:
	            requested_types = list(dict.fromkeys(
	                t.strip() for t in types.split(',') if t.strip()
	            ))
	        if not requested_types:
	            requested_types = ['contacts', 'meetings', 'organizations', 'causes', 'jurisdictions']

	        # Parse jurisdiction levels if provided (blank tokens ignored)
	        jurisdiction_levels_list = None
	        if jurisdiction_levels:
	            jurisdiction_levels_list = [
	                level.strip() for level in jurisdiction_levels.split(',') if level.strip()
	            ] or None

	        logger.info(f"📋 Requested types: {requested_types}, calculated offset: {offset}")

	        all_results = []

	        single_type = requested_types[0] if len(requested_types) == 1 else None

	        # Optimize for single-type browse mode (no query): let the database
	        # handle pagination. This is only valid for the searches that are
	        # actually given an offset below (organizations, jurisdictions). The
	        # contacts, meetings and causes searches are called without one, so
	        # treating their output as "already paginated" would return the first
	        # page for every page number.
	        db_paginated_types = ('organizations', 'jurisdictions')
	        use_db_pagination = not q and single_type in db_paginated_types

	        if use_db_pagination:
	            # Single-type browse: pass offset to DB for efficient pagination
	            search_limit = limit
	            search_offset = offset
	        else:
	            # Multi-type or search mode: fetch extra for mixing/sorting
	            search_limit = offset + limit + 100
	            search_offset = 0

	        if 'contacts' in requested_types:
	            # Use PostgreSQL for fast indexed search
	            contact_results_pg = await search_postgres.search_contacts_pg(q, state, limit=search_limit)
	            contact_results = [convert_pg_result(r) for r in contact_results_pg]
	            logger.info(f"👤 Contacts search returned {len(contact_results)} results")
	            all_results.extend(contact_results)

	        if 'meetings' in requested_types:
	            # Use PostgreSQL for fast indexed search
	            meeting_results_pg = await search_postgres.search_events_pg(q, state, limit=search_limit)
	            meeting_results = [convert_pg_result(r) for r in meeting_results_pg]
	            logger.info(f"📅 Meetings search returned {len(meeting_results)} results")
	            all_results.extend(meeting_results)

	        if 'organizations' in requested_types:
	            # Use PostgreSQL for fast indexed search
	            org_results_pg = await search_postgres.search_organizations_pg(q, state, ntee_code, ein, limit=search_limit, offset=search_offset, sort=sort)
	            org_results = [convert_pg_result(r) for r in org_results_pg]
	            logger.info(f"🏢 Organizations search returned {len(org_results)} results")
	            all_results.extend(org_results)

	        if 'causes' in requested_types:
	            cause_results = search_causes(q or "", limit=search_limit)
	            logger.info(f"🎯 Causes search returned {len(cause_results)} results")
	            all_results.extend(cause_results)

	        if 'jurisdictions' in requested_types:
	            # Use PostgreSQL for fast indexed search
	            jurisdiction_results_pg = await search_postgres.search_jurisdictions_pg(q, state, city, jurisdiction_levels_list, limit=search_limit, offset=search_offset)
	            jurisdiction_results = [convert_pg_result(r) for r in jurisdiction_results_pg]
	            logger.info(f"🏛️ Jurisdictions search returned {len(jurisdiction_results)} results")
	            all_results.extend(jurisdiction_results)

	        # Rank the combined results by score. For a single type with an
	        # explicit sort (name/revenue/assets) the backend was asked for that
	        # order already; re-sorting by score here would silently override it.
	        if sort == 'relevance' or single_type is None:
	            all_results.sort(key=lambda x: x.score, reverse=True)

	        logger.info(f"📊 Total combined results: {len(all_results)}, applying pagination (offset={offset}, limit={limit})")

	        # Apply pagination
	        if use_db_pagination:
	            # DB already paginated - use all results
	            paginated_results = all_results
	        else:
	            # Paginate in-memory from combined results
	            paginated_results = all_results[offset:offset + limit]

	        logger.info(f"✂️ Paginated results: {len(paginated_results)} items")

	        # result_type value on a SearchResult -> key used in the response
	        response_keys = {
	            'contact': 'contacts',
	            'meeting': 'meetings',
	            'organization': 'organizations',
	            'cause': 'causes',
	            'jurisdiction': 'jurisdictions',
	        }

	        # Group the current page by type (single pass)
	        grouped_results = {key: [] for key in response_keys.values()}
	        for r in paginated_results:
	            key = response_keys.get(r.result_type)
	            if key is not None:
	                grouped_results[key].append(r.to_dict())

	        logger.info(f"📦 Grouped results - contacts:{len(grouped_results['contacts'])}, meetings:{len(grouped_results['meetings'])}, organizations:{len(grouped_results['organizations'])}, causes:{len(grouped_results['causes'])}, jurisdictions:{len(grouped_results['jurisdictions'])}")

	        # Per-type totals over everything fetched (before pagination)
	        type_totals = {key: 0 for key in response_keys.values()}
	        for r in all_results:
	            key = response_keys.get(r.result_type)
	            if key is not None:
	                type_totals[key] += 1

	        # Calculate total results
	        if not q and single_type == 'organizations':
	            # Browse mode: count total matching records in DB
	            total_results = count_organizations(state=state, ntee_code=ntee_code, query=q)
	            type_totals['organizations'] = total_results  # Use accurate DB count
	        else:
	            # Search/multi-type mode (and other single types): only the
	            # fetched window is known, so totals are bounded by it.
	            total_results = len(all_results)

	        total_pages = (total_results + limit - 1) // limit  # Ceiling division

	        response_data = {
	            "query": q or "",
	            "total_results": total_results,
	            "type_totals": type_totals,  # Add per-type totals
	            "results": grouped_results,
	            "pagination": {
	                # Report the caller's page when it agrees with the offset,
	                # otherwise derive the page from the explicit offset.
	                "page": page if offset == 0 or offset == (page - 1) * limit else (offset // limit) + 1,
	                "limit": limit,
	                "offset": offset,
	                "total_pages": total_pages,
	                "has_next": offset + limit < total_results,
	                "has_prev": offset > 0
	            },
	            "filters": {
	                "state": state,
	                "ntee_code": ntee_code,
	                "types": requested_types,
	                "sort": sort
	            }
	        }

	        logger.info(f"✅ Search complete - returning {total_results} total results, {len(paginated_results)} on this page")
	        return response_data

	    except Exception as e:
	        logger.error(f"❌ Search error: {type(e).__name__}: {e}")
	        logger.exception("Full traceback:")

	        # Parse error into structured response
	        error_detail = parse_error(e, context={
	            "query": q,
	            "state": state,
	            "types": types,
	            "data_type": "search"
	        })

	        return JSONResponse(
	            status_code=500,
	            content=error_detail.model_dump()
	        )


	@router.get("/api/search/suggest")
	async def search_suggestions(
	    q: str = Query(..., min_length=1, description="Partial search query"),
	    limit: int = Query(5, ge=1, le=20, description="Maximum suggestions")
	):
	    """
	    Autocomplete suggestions for the search box

	    Matches the partial query, case-insensitively, as a substring of a fixed
	    list of common search terms and returns the first `limit` matches in
	    list order. On failure a 500 JSONResponse with a structured error body
	    is returned.
	    """
	    try:
	        # Fixed vocabulary of common search terms
	        vocabulary = [
	            "dental health", "oral health", "affordable housing", "public transit",
	            "school funding", "budget", "water quality", "parks", "zoning",
	            "police", "fire department", "mental health", "food assistance",
	            "senior services", "youth programs", "employment", "job training"
	        ]

	        # Case-insensitive substring match, vocabulary order preserved
	        needle = q.lower()
	        matching = [phrase for phrase in vocabulary if needle in phrase.lower()]
	        payload = {
	            "query": q,
	            "suggestions": matching[:limit]
	        }
	        return payload

	    except Exception as e:
	        logger.error(f"Suggestion error: {e}")

	        # Convert the exception into the structured error format
	        failure_context = {
	            "query": q,
	            "data_type": "suggestions"
	        }
	        error_detail = parse_error(e, context=failure_context)

	        return JSONResponse(
	            status_code=500,
	            content=error_detail.model_dump()
	        )