File size: 2,046 Bytes
73e1d5b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
-- Crisis Correlation Schema Extensions
-- Adds historical events and document-event linking
-- Existing tables: entities, document_topics, document_keywords, document_features, page_features

BEGIN;

-- Historical events reference table
CREATE TABLE IF NOT EXISTS historical_events (
    id          SERIAL PRIMARY KEY,
    event_name  TEXT NOT NULL UNIQUE,
    start_date  DATE NOT NULL,
    end_date    DATE,
    category    VARCHAR(50),
    description TEXT,
    keywords    JSONB DEFAULT '[]'
);

-- Many-to-many: documents <-> events with relevance scoring
CREATE TABLE IF NOT EXISTS document_events (
    document_id     INT NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
    event_id        INT NOT NULL REFERENCES historical_events(id) ON DELETE CASCADE,
    relevance_score REAL NOT NULL DEFAULT 0,
    match_methods   JSONB DEFAULT '[]',
    details         JSONB DEFAULT '{}',
    created_at      TIMESTAMP DEFAULT NOW(),
    PRIMARY KEY (document_id, event_id)
);

CREATE INDEX IF NOT EXISTS idx_document_events_event ON document_events(event_id);
CREATE INDEX IF NOT EXISTS idx_document_events_score ON document_events(relevance_score DESC);

-- Document-level date estimates (extracted from filename, OCR, entities)
CREATE TABLE IF NOT EXISTS document_dates (
    document_id     INT PRIMARY KEY REFERENCES documents(id) ON DELETE CASCADE,
    estimated_date  DATE,
    date_source     VARCHAR(30),
    date_confidence REAL,
    date_range_start DATE,
    date_range_end   DATE,
    congress_session INT,
    created_at      TIMESTAMP DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_document_dates_date ON document_dates(estimated_date);
CREATE INDEX IF NOT EXISTS idx_document_dates_congress ON document_dates(congress_session);

-- Track pipeline runs
CREATE TABLE IF NOT EXISTS ml_pipeline_log (
    id           SERIAL PRIMARY KEY,
    pipeline     TEXT NOT NULL,
    started_at   TIMESTAMP DEFAULT NOW(),
    finished_at  TIMESTAMP,
    docs_processed INT DEFAULT 0,
    status       TEXT DEFAULT 'running'
);

COMMIT;