open-navigator / scripts /deployment /setup_openstates_db.sh
jcbowyer's picture
Clean HuggingFace deployment without binary files
61d29fc
#!/bin/bash
set -e # Exit on error
# Setup OpenStates PostgreSQL Database
# Loads the 9.8GB legislative data dump from Open States
# Contains: 50+ tables with bills, legislators, votes, committees for all 50 states
# βœ… REQUIRES TWO FILES:
# 1. Schema file: Creates tables, indexes, constraints (~50 MB)
# 2. Data file: Contains all the actual data (~10 GB)
#
# Download both with:
# python scripts/bulk_legislative_download.py --postgres --month 2026-04
#
# This script restores schema first, then data.
echo "πŸ›οΈ OpenStates Database Setup"
echo "===================================="
echo ""
# Configuration
DB_NAME="openstates"
DB_USER="postgres"
DB_PASSWORD="password"
DB_HOST="localhost"
DB_PORT="5432" # PostgreSQL container (charming_mestorf)
SCHEMA_FILE="data/cache/legislation_bulk/postgres/2026-04-schema.pgdump"
DATA_FILE="data/cache/legislation_bulk/postgres/2026-04-public.pgdump"
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
# Check if schema file exists
if [ ! -f "$SCHEMA_FILE" ]; then
echo -e "${RED}❌ Error: Schema file not found at $SCHEMA_FILE${NC}"
echo ""
echo "Download both schema and data files with:"
echo " python scripts/bulk_legislative_download.py --postgres --month 2026-04"
exit 1
fi
echo -e "${GREEN}βœ“${NC} Found schema file: $SCHEMA_FILE"
SCHEMA_SIZE=$(du -h "$SCHEMA_FILE" | cut -f1)
echo -e " Size: $SCHEMA_SIZE"
echo ""
# Check if data file exists
if [ ! -f "$DATA_FILE" ]; then
echo -e "${RED}❌ Error: Data file not found at $DATA_FILE${NC}"
echo ""
echo "Download both schema and data files with:"
echo " python scripts/bulk_legislative_download.py --postgres --month 2026-04"
exit 1
fi
echo -e "${GREEN}βœ“${NC} Found data file: $DATA_FILE"
DATA_SIZE=$(du -h "$DATA_FILE" | cut -f1)
echo -e " Size: $DATA_SIZE"
echo ""
# Check if PostgreSQL is running
if ! pg_isready -h $DB_HOST -p $DB_PORT -U $DB_USER > /dev/null 2>&1; then
echo -e "${RED}❌ Error: PostgreSQL 17 is not running on port $DB_PORT${NC}"
echo ""
echo "Start PostgreSQL 17 container:"
echo " docker start openstates-db"
exit 1
fi
echo -e "${GREEN}βœ“${NC} PostgreSQL is running"
echo ""
# Check if database already exists
if PGPASSWORD=$DB_PASSWORD psql -h $DB_HOST -p $DB_PORT -U $DB_USER -lqt | cut -d \| -f 1 | grep -qw $DB_NAME; then
echo -e "${YELLOW}⚠️ Database '$DB_NAME' already exists${NC}"
echo ""
read -p "Drop and recreate? (y/N): " -n 1 -r
echo
if [[ $REPLY =~ ^[Yy]$ ]]; then
echo "Dropping existing database..."
PGPASSWORD=$DB_PASSWORD psql -h $DB_HOST -p $DB_PORT -U $DB_USER -c "DROP DATABASE $DB_NAME;"
echo -e "${GREEN}βœ“${NC} Database dropped"
else
echo "Skipping database creation. Will attempt restore into existing database."
echo ""
fi
fi
# Create database if it doesn't exist
if ! PGPASSWORD=$DB_PASSWORD psql -h $DB_HOST -p $DB_PORT -U $DB_USER -lqt | cut -d \| -f 1 | grep -qw $DB_NAME; then
echo "Creating database '$DB_NAME'..."
PGPASSWORD=$DB_PASSWORD psql -h $DB_HOST -p $DB_PORT -U $DB_USER -c "CREATE DATABASE $DB_NAME;"
echo -e "${GREEN}βœ“${NC} Database created"
echo ""
fi
# Restore the schema and data
echo "πŸ”„ Restoring OpenStates database (2-step process)..."
echo ""
# Use pg_restore with verbose output
# --clean: Drop existing objects before recreating
# --if-exists: Don't error if objects don't exist
# --no-owner: Don't set ownership
# --no-acl: Don't set access privileges
export PGPASSWORD=$DB_PASSWORD
# Step 1: Restore schema (creates tables, indexes, constraints)
echo "πŸ“‹ Step 1/2: Restoring schema (creates all tables)..."
echo "Started at $(date)..."
pg_restore \
-h $DB_HOST \
-p $DB_PORT \
-U $DB_USER \
-d $DB_NAME \
--clean \
--if-exists \
--no-owner \
--no-acl \
--verbose \
"$SCHEMA_FILE" 2>&1 | while read line; do
# Only show important messages
if echo "$line" | grep -qE "^processing|^creating|^ERROR|^WARNING"; then
echo " $line"
fi
done
echo ""
echo -e "${GREEN}βœ“${NC} Schema restored at $(date)"
echo ""
# Step 2: Restore data (this takes longer, ~10-15 minutes)
echo "πŸ“Š Step 2/2: Restoring data (this will take 10-15 minutes for 9.8GB)..."
echo "Started at $(date)..."
pg_restore \
-h $DB_HOST \
-p $DB_PORT \
-U $DB_USER \
-d $DB_NAME \
--data-only \
--no-owner \
--no-acl \
--disable-triggers \
--verbose \
"$DATA_FILE" 2>&1 | while read line; do
# Only show important messages
if echo "$line" | grep -qE "^processing|^restoring data|^ERROR|^WARNING"; then
echo " $line"
fi
done
echo ""
echo "Data restore completed at $(date)"
echo ""
# Verify the restore
echo "πŸ“Š Verifying database contents..."
echo ""
TABLE_COUNT=$(PGPASSWORD=$DB_PASSWORD psql -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME -t -c "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = 'public';")
echo -e "${GREEN}βœ“${NC} Database restored successfully"
echo -e " Tables created: $TABLE_COUNT"
echo ""
# Show sample table counts
echo "Sample table contents:"
PGPASSWORD=$DB_PASSWORD psql -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME << 'SQL'
SELECT
tablename,
(xpath('//row/cnt/text()', query_to_xml(format('SELECT COUNT(*) as cnt FROM %I', tablename), false, true, '')))[1]::text::int AS row_count
FROM pg_tables
WHERE schemaname = 'public'
AND tablename IN ('opencivicdata_person', 'opencivicdata_bill', 'opencivicdata_voteevent', 'opencivicdata_organization')
ORDER BY row_count DESC;
SQL
echo ""
echo "======================================"
echo -e "${GREEN}βœ… OpenStates database setup complete!${NC}"
echo "======================================"
echo ""
echo "Database: $DB_NAME"
echo "Host: $DB_HOST:$DB_PORT (PostgreSQL 17)"
echo ""
echo "Connect with:"
echo " PGPASSWORD=$DB_PASSWORD psql -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME"
echo ""
echo "Or from Python:"
echo " import psycopg2"
echo " conn = psycopg2.connect(host='$DB_HOST', port=$DB_PORT, database='$DB_NAME', user='$DB_USER', password='$DB_PASSWORD')"
echo ""