-- Crisis Correlation Schema Extensions -- Adds historical events and document-event linking -- Existing tables: entities, document_topics, document_keywords, document_features, page_features BEGIN; -- Historical events reference table CREATE TABLE IF NOT EXISTS historical_events ( id SERIAL PRIMARY KEY, event_name TEXT NOT NULL UNIQUE, start_date DATE NOT NULL, end_date DATE, category VARCHAR(50), description TEXT, keywords JSONB DEFAULT '[]' ); -- Many-to-many: documents <-> events with relevance scoring CREATE TABLE IF NOT EXISTS document_events ( document_id INT NOT NULL REFERENCES documents(id) ON DELETE CASCADE, event_id INT NOT NULL REFERENCES historical_events(id) ON DELETE CASCADE, relevance_score REAL NOT NULL DEFAULT 0, match_methods JSONB DEFAULT '[]', details JSONB DEFAULT '{}', created_at TIMESTAMP DEFAULT NOW(), PRIMARY KEY (document_id, event_id) ); CREATE INDEX IF NOT EXISTS idx_document_events_event ON document_events(event_id); CREATE INDEX IF NOT EXISTS idx_document_events_score ON document_events(relevance_score DESC); -- Document-level date estimates (extracted from filename, OCR, entities) CREATE TABLE IF NOT EXISTS document_dates ( document_id INT PRIMARY KEY REFERENCES documents(id) ON DELETE CASCADE, estimated_date DATE, date_source VARCHAR(30), date_confidence REAL, date_range_start DATE, date_range_end DATE, congress_session INT, created_at TIMESTAMP DEFAULT NOW() ); CREATE INDEX IF NOT EXISTS idx_document_dates_date ON document_dates(estimated_date); CREATE INDEX IF NOT EXISTS idx_document_dates_congress ON document_dates(congress_session); -- Track pipeline runs CREATE TABLE IF NOT EXISTS ml_pipeline_log ( id SERIAL PRIMARY KEY, pipeline TEXT NOT NULL, started_at TIMESTAMP DEFAULT NOW(), finished_at TIMESTAMP, docs_processed INT DEFAULT 0, status TEXT DEFAULT 'running' ); COMMIT;