widgettdc-api / data /dIV /schema-validation.sql
Kraft102's picture
fix: sql.js Docker/Alpine compatibility layer for PatternMemory and FailureMemory
5a81b95
-- schema-validation.sql
-- Database Schema Validation and Statistics Report
-- Citizen Intelligence Agency - Open Source Intelligence Platform
-- Generated for PostgreSQL 16
--
-- Purpose: Validates database schema by counting objects, extracting samples,
-- and generating statistics reports for documentation validation
--
-- NOTE: For comprehensive validation of ALL 177 objects (93 tables + 56 views + 28 mviews),
-- use schema-validation-v2.sql which provides 100% coverage validation.
-- This script focuses on statistics and sampling.
--
-- Usage:
-- psql -U postgres -d cia_dev -f schema-validation.sql > schema_report.txt 2>&1
--
-- Output formats available:
-- 1. Text report (stdout)
-- 2. JSON report (see JSON output section)
-- 3. CSV inventory (see CSV output section)
--
-- Related: Issue Hack23/cia#7865 - Schema validation script foundation
-- See also: schema-validation-v2.sql (Issue Hack23/cia#7872 - 100% coverage)
\set ECHO queries
\timing on
-- ===========================================================================
-- PART 1: DATABASE OBJECT COUNTS
-- ===========================================================================
\echo ''
\echo '=========================================='
\echo '=== DATABASE OBJECT COUNTS ==='
\echo '=========================================='
\echo ''
-- Count base tables
SELECT 'Base Tables' AS object_type, COUNT(*) AS count
FROM information_schema.tables
WHERE table_schema = 'public' AND table_type = 'BASE TABLE';
-- Count regular views
SELECT 'Regular Views' AS object_type, COUNT(*) AS count
FROM information_schema.views
WHERE table_schema = 'public';
-- Count materialized views
SELECT 'Materialized Views' AS object_type, COUNT(*) AS count
FROM pg_matviews
WHERE schemaname = 'public';
-- Count indexes
SELECT 'Indexes' AS object_type, COUNT(*) AS count
FROM pg_indexes
WHERE schemaname = 'public';
-- Count sequences
SELECT 'Sequences' AS object_type, COUNT(*) AS count
FROM information_schema.sequences
WHERE sequence_schema = 'public';
-- Count functions
SELECT 'Functions' AS object_type, COUNT(*) AS count
FROM pg_proc p
JOIN pg_namespace n ON p.pronamespace = n.oid
WHERE n.nspname = 'public';
-- Summary
\echo ''
\echo 'Summary of database objects:'
SELECT
(SELECT COUNT(*) FROM information_schema.tables
WHERE table_schema = 'public' AND table_type = 'BASE TABLE') AS base_tables,
(SELECT COUNT(*) FROM information_schema.views
WHERE table_schema = 'public') AS regular_views,
(SELECT COUNT(*) FROM pg_matviews
WHERE schemaname = 'public') AS materialized_views,
(SELECT COUNT(*) FROM pg_indexes
WHERE schemaname = 'public') AS indexes,
(SELECT COUNT(*) FROM information_schema.sequences
WHERE sequence_schema = 'public') AS sequences,
(SELECT COUNT(*) FROM pg_proc p JOIN pg_namespace n ON p.pronamespace = n.oid
WHERE n.nspname = 'public') AS functions;
-- ===========================================================================
-- PART 2: TABLE INVENTORY WITH ROW COUNTS AND METADATA
-- ===========================================================================
\echo ''
\echo '=========================================='
\echo '=== TABLE INVENTORY ==='
\echo '=========================================='
\echo ''
DO $$
DECLARE
r RECORD;
row_count BIGINT;
col_count INTEGER;
table_size TEXT;
BEGIN
RAISE NOTICE 'TABLE INVENTORY WITH ROW COUNTS, COLUMNS, AND SIZE';
RAISE NOTICE '-----------------------------------------------------';
RAISE NOTICE 'Format: Schema.Table | Rows | Columns | Size';
RAISE NOTICE '';
FOR r IN
SELECT schemaname, tablename
FROM pg_tables
WHERE schemaname = 'public'
ORDER BY tablename
LOOP
BEGIN
-- Get row count
EXECUTE format('SELECT COUNT(*) FROM %I.%I', r.schemaname, r.tablename) INTO row_count;
-- Get column count
SELECT COUNT(*) INTO col_count
FROM information_schema.columns
WHERE table_schema = r.schemaname AND table_name = r.tablename;
-- Get table size
EXECUTE format('SELECT pg_size_pretty(pg_total_relation_size(%L))',
r.schemaname || '.' || r.tablename) INTO table_size;
RAISE NOTICE 'Table: %.% | Rows: % | Columns: % | Size: %',
r.schemaname, r.tablename, row_count, col_count, table_size;
EXCEPTION WHEN OTHERS THEN
RAISE NOTICE 'Table: %.% | ERROR: %', r.schemaname, r.tablename, SQLERRM;
END;
END LOOP;
END $$;
-- ===========================================================================
-- PART 3: VIEW INVENTORY WITH ROW COUNTS AND METADATA
-- ===========================================================================
\echo ''
\echo '=========================================='
\echo '=== VIEW INVENTORY ==='
\echo '=========================================='
\echo ''
DO $$
DECLARE
r RECORD;
row_count BIGINT;
col_count INTEGER;
BEGIN
RAISE NOTICE 'REGULAR VIEWS INVENTORY';
RAISE NOTICE '-----------------------';
RAISE NOTICE 'Format: Schema.View | Rows | Columns';
RAISE NOTICE '';
-- Regular views
FOR r IN
SELECT schemaname, viewname AS name
FROM pg_views
WHERE schemaname = 'public'
ORDER BY viewname
LOOP
BEGIN
-- Get row count
EXECUTE format('SELECT COUNT(*) FROM %I.%I', r.schemaname, r.name) INTO row_count;
-- Get column count
SELECT COUNT(*) INTO col_count
FROM information_schema.columns
WHERE table_schema = r.schemaname AND table_name = r.name;
RAISE NOTICE 'View: %.% | Rows: % | Columns: %',
r.schemaname, r.name, row_count, col_count;
EXCEPTION WHEN OTHERS THEN
RAISE NOTICE 'View: %.% | ERROR: %', r.schemaname, r.name, SQLERRM;
END;
END LOOP;
RAISE NOTICE '';
RAISE NOTICE 'MATERIALIZED VIEWS INVENTORY';
RAISE NOTICE '----------------------------';
RAISE NOTICE 'Format: Schema.View | Rows | Columns | Size';
RAISE NOTICE '';
-- Materialized views
FOR r IN
SELECT schemaname, matviewname AS name
FROM pg_matviews
WHERE schemaname = 'public'
ORDER BY matviewname
LOOP
DECLARE
view_size TEXT;
BEGIN
-- Get row count
EXECUTE format('SELECT COUNT(*) FROM %I.%I', r.schemaname, r.name) INTO row_count;
-- Get column count
SELECT COUNT(*) INTO col_count
FROM information_schema.columns
WHERE table_schema = r.schemaname AND table_name = r.name;
-- Get materialized view size
EXECUTE format('SELECT pg_size_pretty(pg_total_relation_size(%L))',
r.schemaname || '.' || r.name) INTO view_size;
RAISE NOTICE 'Materialized View: %.% | Rows: % | Columns: % | Size: %',
r.schemaname, r.name, row_count, col_count, view_size;
EXCEPTION WHEN OTHERS THEN
RAISE NOTICE 'Materialized View: %.% | ERROR: %', r.schemaname, r.name, SQLERRM;
END;
END LOOP;
END $$;
-- ===========================================================================
-- PART 4: SAMPLE DATA FROM KEY VIEWS
-- ===========================================================================
\echo ''
\echo '=========================================='
\echo '=== SAMPLE DATA FROM KEY VIEWS ==='
\echo '=========================================='
\echo ''
-- Sample from view_riksdagen_politician (if exists)
\echo '--- Sample from view_riksdagen_politician (5 rows) ---'
DO $$
BEGIN
IF EXISTS (SELECT 1 FROM information_schema.views WHERE table_name = 'view_riksdagen_politician') THEN
RAISE NOTICE 'Executing: SELECT * FROM view_riksdagen_politician LIMIT 5';
ELSE
RAISE NOTICE 'View view_riksdagen_politician does not exist';
END IF;
END $$;
SELECT * FROM view_riksdagen_politician LIMIT 5;
\echo ''
\echo '--- Sample from view_riksdagen_party (5 rows) ---'
DO $$
BEGIN
IF EXISTS (SELECT 1 FROM information_schema.views WHERE table_name = 'view_riksdagen_party') THEN
RAISE NOTICE 'Executing: SELECT * FROM view_riksdagen_party LIMIT 5';
ELSE
RAISE NOTICE 'View view_riksdagen_party does not exist';
END IF;
END $$;
SELECT * FROM view_riksdagen_party LIMIT 5;
\echo ''
\echo '--- Sample from view_riksdagen_vote_data_ballot_politician_summary_daily (recent 5 rows) ---'
DO $$
BEGIN
IF EXISTS (SELECT 1 FROM pg_matviews WHERE matviewname = 'view_riksdagen_vote_data_ballot_politician_summary_daily') THEN
RAISE NOTICE 'Executing: SELECT * FROM view_riksdagen_vote_data_ballot_politician_summary_daily ORDER BY vote_date DESC LIMIT 5';
ELSE
RAISE NOTICE 'Materialized view view_riksdagen_vote_data_ballot_politician_summary_daily does not exist';
END IF;
END $$;
-- Only select if table has vote_date column
DO $$
DECLARE
has_column BOOLEAN;
BEGIN
SELECT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = 'view_riksdagen_vote_data_ballot_politician_summary_daily'
AND column_name = 'vote_date'
) INTO has_column;
IF has_column THEN
EXECUTE format('SELECT * FROM %I ORDER BY vote_date DESC LIMIT 5', 'view_riksdagen_vote_data_ballot_politician_summary_daily');
ELSE
EXECUTE format('SELECT * FROM %I LIMIT 5', 'view_riksdagen_vote_data_ballot_politician_summary_daily');
END IF;
EXCEPTION WHEN OTHERS THEN
RAISE NOTICE 'Could not query view: %', SQLERRM;
END $$;
-- ===========================================================================
-- PART 5: SCHEMA STATISTICS
-- ===========================================================================
\echo ''
\echo '=========================================='
\echo '=== SCHEMA STATISTICS ==='
\echo '=========================================='
\echo ''
-- Largest tables by row count
\echo '--- Top 20 Largest Tables by Row Count ---'
SELECT
schemaname,
tablename,
n_live_tup AS row_count,
pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS total_size,
pg_size_pretty(pg_relation_size(schemaname||'.'||tablename)) AS table_size,
pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename) - pg_relation_size(schemaname||'.'||tablename)) AS indexes_size
FROM pg_stat_user_tables
WHERE schemaname = 'public'
ORDER BY n_live_tup DESC
LIMIT 20;
\echo ''
\echo '--- Top 20 Largest Tables by Total Size ---'
SELECT
schemaname,
tablename,
n_live_tup AS row_count,
pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS total_size
FROM pg_stat_user_tables
WHERE schemaname = 'public'
ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC
LIMIT 20;
\echo ''
\echo '--- Tables with No Data ---'
SELECT
t.tablename,
COALESCE(s.n_live_tup, 0) AS row_count
FROM pg_tables t
LEFT JOIN pg_stat_user_tables s ON t.tablename = s.relname AND t.schemaname = s.schemaname
WHERE t.schemaname = 'public'
AND (s.n_live_tup = 0 OR s.n_live_tup IS NULL)
ORDER BY t.tablename;
\echo ''
\echo '--- Top 20 Views with Most Dependencies ---'
SELECT
dependent_view.relname AS view_name,
dependent_view.relkind AS view_type,
COUNT(DISTINCT source_table.relname) AS dependency_count
FROM pg_depend
JOIN pg_rewrite ON pg_depend.objid = pg_rewrite.oid
JOIN pg_class AS dependent_view ON pg_rewrite.ev_class = dependent_view.oid
JOIN pg_class AS source_table ON pg_depend.refobjid = source_table.oid
WHERE dependent_view.relkind IN ('v', 'm')
AND dependent_view.relnamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'public')
GROUP BY dependent_view.relname, dependent_view.relkind
ORDER BY dependency_count DESC
LIMIT 20;
\echo ''
\echo '--- Database Size Summary ---'
SELECT
pg_size_pretty(pg_database_size(current_database())) AS database_size,
(SELECT COUNT(*) FROM pg_stat_user_tables WHERE schemaname = 'public') AS table_count,
(SELECT COUNT(*) FROM pg_views WHERE schemaname = 'public') AS view_count,
(SELECT COUNT(*) FROM pg_matviews WHERE schemaname = 'public') AS materialized_view_count,
(SELECT SUM(n_live_tup) FROM pg_stat_user_tables WHERE schemaname = 'public') AS total_rows;
-- ===========================================================================
-- PART 6: COLUMN STATISTICS
-- ===========================================================================
\echo ''
\echo '=========================================='
\echo '=== COLUMN STATISTICS ==='
\echo '=========================================='
\echo ''
\echo '--- Tables with Most Columns ---'
SELECT
table_name,
COUNT(*) AS column_count
FROM information_schema.columns
WHERE table_schema = 'public'
GROUP BY table_name
ORDER BY column_count DESC
LIMIT 20;
\echo ''
\echo '--- Most Common Column Names ---'
SELECT
column_name,
COUNT(*) AS usage_count
FROM information_schema.columns
WHERE table_schema = 'public'
GROUP BY column_name
ORDER BY usage_count DESC
LIMIT 20;
\echo ''
\echo '--- Most Common Data Types ---'
SELECT
data_type,
COUNT(*) AS usage_count
FROM information_schema.columns
WHERE table_schema = 'public'
GROUP BY data_type
ORDER BY usage_count DESC;
-- ===========================================================================
-- PART 7: INDEX STATISTICS
-- ===========================================================================
\echo ''
\echo '=========================================='
\echo '=== INDEX STATISTICS ==='
\echo '=========================================='
\echo ''
\echo '--- Index Usage Statistics (Top 20 by Size) ---'
SELECT
schemaname,
tablename,
indexname,
pg_size_pretty(pg_relation_size(indexrelid)) AS index_size,
idx_scan AS number_of_scans,
idx_tup_read AS tuples_read,
idx_tup_fetch AS tuples_fetched
FROM pg_stat_user_indexes
WHERE schemaname = 'public'
ORDER BY pg_relation_size(indexrelid) DESC
LIMIT 20;
\echo ''
\echo '--- Unused Indexes (No Scans) ---'
SELECT
schemaname,
tablename,
indexname,
pg_size_pretty(pg_relation_size(indexrelid)) AS index_size
FROM pg_stat_user_indexes
WHERE schemaname = 'public'
AND idx_scan = 0
ORDER BY pg_relation_size(indexrelid) DESC;
-- ===========================================================================
-- PART 8: JSON OUTPUT (Optional)
-- ===========================================================================
\echo ''
\echo '=========================================='
\echo '=== JSON OUTPUT AVAILABLE ==='
\echo '=========================================='
\echo ''
\echo 'To generate JSON report, run:'
\echo 'psql -U postgres -d cia_dev -t -A -F"," -c "SELECT json_build_object(...)" > schema_report.json'
\echo ''
-- Commented out: Uncomment and modify path as needed
/*
COPY (
SELECT json_build_object(
'generated_at', NOW(),
'database', current_database(),
'summary', json_build_object(
'base_tables', (SELECT COUNT(*) FROM information_schema.tables
WHERE table_schema = 'public' AND table_type = 'BASE TABLE'),
'regular_views', (SELECT COUNT(*) FROM information_schema.views
WHERE table_schema = 'public'),
'materialized_views', (SELECT COUNT(*) FROM pg_matviews
WHERE schemaname = 'public'),
'indexes', (SELECT COUNT(*) FROM pg_indexes WHERE schemaname = 'public'),
'total_rows', (SELECT SUM(n_live_tup) FROM pg_stat_user_tables WHERE schemaname = 'public')
),
'tables', (
SELECT json_agg(json_build_object(
'name', tablename,
'rows', n_live_tup,
'size', pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename))
))
FROM pg_stat_user_tables
WHERE schemaname = 'public'
ORDER BY tablename
),
'views', (
SELECT json_agg(json_build_object(
'name', viewname,
'type', 'regular'
))
FROM pg_views
WHERE schemaname = 'public'
ORDER BY viewname
),
'materialized_views', (
SELECT json_agg(json_build_object(
'name', matviewname,
'type', 'materialized'
))
FROM pg_matviews
WHERE schemaname = 'public'
ORDER BY matviewname
)
)::text
) TO '/path/to/schema_report.json';
*/
-- ===========================================================================
-- PART 9: CSV OUTPUT (Optional)
-- ===========================================================================
\echo ''
\echo '=========================================='
\echo '=== CSV OUTPUT AVAILABLE ==='
\echo '=========================================='
\echo ''
\echo 'To generate CSV inventory, run:'
\echo 'psql -U postgres -d cia_dev -c "COPY (...) TO STDOUT WITH CSV HEADER" > schema_inventory.csv'
\echo ''
-- Commented out: Uncomment and modify path as needed
/*
COPY (
SELECT
'TABLE' AS object_type,
tablename AS object_name,
n_live_tup AS row_count,
pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS total_size,
(SELECT COUNT(*) FROM information_schema.columns
WHERE table_schema = schemaname AND table_name = tablename) AS column_count
FROM pg_stat_user_tables
WHERE schemaname = 'public'
UNION ALL
SELECT
'VIEW' AS object_type,
viewname AS object_name,
NULL AS row_count,
NULL AS total_size,
(SELECT COUNT(*) FROM information_schema.columns
WHERE table_schema = schemaname AND table_name = viewname) AS column_count
FROM pg_views
WHERE schemaname = 'public'
UNION ALL
SELECT
'MATERIALIZED_VIEW' AS object_type,
matviewname AS object_name,
NULL AS row_count,
pg_size_pretty(pg_total_relation_size(schemaname||'.'||matviewname)) AS total_size,
(SELECT COUNT(*) FROM information_schema.columns
WHERE table_schema = schemaname AND table_name = matviewname) AS column_count
FROM pg_matviews
WHERE schemaname = 'public'
ORDER BY object_type, object_name
) TO '/path/to/schema_inventory.csv' WITH (FORMAT CSV, HEADER);
*/
-- ===========================================================================
-- VALIDATION COMPLETE
-- ===========================================================================
\echo ''
\echo '=========================================='
\echo '=== VALIDATION COMPLETE ==='
\echo '=========================================='
\echo ''
\echo 'Schema validation report generated successfully.'
\echo 'Review the output above for schema health and statistics.'
\echo ''
\timing off