datamatters24 committed on
Commit
73e1d5b
·
verified ·
1 Parent(s): 57ffff6

Upload ml/schema.sql with huggingface_hub

Browse files
Files changed (1) hide show
  1. ml/schema.sql +57 -0
ml/schema.sql ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -- Crisis Correlation Schema Extensions
2
+ -- Adds historical events and document-event linking
3
+ -- Existing tables: entities, document_topics, document_keywords, document_features, page_features
4
+
5
-- All statements up to the matching COMMIT run in a single transaction.
BEGIN;

-- historical_events: reference list of events that documents can be linked to.
-- Each event has a surrogate integer key and a unique, mandatory name.
CREATE TABLE IF NOT EXISTS historical_events (
    id          SERIAL,
    event_name  TEXT        NOT NULL,
    start_date  DATE        NOT NULL,
    end_date    DATE,                      -- nullable; presumably open-ended or single-day events (confirm)
    category    VARCHAR(50),
    description TEXT,
    keywords    JSONB       DEFAULT '[]',  -- defaults to an empty JSON array; column itself is nullable
    PRIMARY KEY (id),
    UNIQUE (event_name)
);
17
+
18
-- document_events: junction table for the many-to-many link between
-- documents and historical events. A (document, event) pair can appear at
-- most once and carries a relevance score for that match.
-- Deleting either the document or the event removes the link row (CASCADE).
-- The referenced documents table is not created in this script and must
-- already exist.
CREATE TABLE IF NOT EXISTS document_events (
    document_id     INTEGER   NOT NULL,
    event_id        INTEGER   NOT NULL,
    relevance_score REAL      NOT NULL DEFAULT 0,
    match_methods   JSONB     DEFAULT '[]',   -- defaults to an empty JSON array
    details         JSONB     DEFAULT '{}',   -- defaults to an empty JSON object
    -- NOTE(review): TIMESTAMP here is "without time zone"; confirm writers agree on a zone.
    created_at      TIMESTAMP DEFAULT now(),
    PRIMARY KEY (document_id, event_id),
    FOREIGN KEY (document_id) REFERENCES documents (id) ON DELETE CASCADE,
    FOREIGN KEY (event_id) REFERENCES historical_events (id) ON DELETE CASCADE
);

-- The primary key leads with document_id, so event_id gets its own index.
CREATE INDEX IF NOT EXISTS idx_document_events_event
    ON document_events (event_id);

-- Index on relevance_score in descending order.
CREATE INDEX IF NOT EXISTS idx_document_events_score
    ON document_events (relevance_score DESC);
31
+
32
-- document_dates: at most one date estimate per document (document_id is the
-- primary key). Per the original schema note, estimates are extracted from
-- the filename, OCR, or entities. The row is removed when its document is
-- deleted (CASCADE). The referenced documents table is not created in this
-- script and must already exist.
CREATE TABLE IF NOT EXISTS document_dates (
    document_id      INTEGER,
    estimated_date   DATE,
    date_source      VARCHAR(30),
    date_confidence  REAL,                     -- NOTE(review): value range is not constrained here; confirm expected scale
    date_range_start DATE,
    date_range_end   DATE,
    congress_session INTEGER,
    -- NOTE(review): TIMESTAMP here is "without time zone"; confirm writers agree on a zone.
    created_at       TIMESTAMP DEFAULT now(),
    PRIMARY KEY (document_id),
    FOREIGN KEY (document_id) REFERENCES documents (id) ON DELETE CASCADE
);

-- Secondary indexes on the estimated date and on the congress session.
CREATE INDEX IF NOT EXISTS idx_document_dates_date
    ON document_dates (estimated_date);

CREATE INDEX IF NOT EXISTS idx_document_dates_congress
    ON document_dates (congress_session);
46
+
47
-- ml_pipeline_log: one row per pipeline run. A new row defaults to status
-- 'running', started_at = now(), and zero documents processed; finished_at
-- has no default and stays NULL until a writer sets it.
CREATE TABLE IF NOT EXISTS ml_pipeline_log (
    id             SERIAL,
    pipeline       TEXT      NOT NULL,
    -- NOTE(review): both timestamps are "without time zone"; confirm writers agree on a zone.
    started_at     TIMESTAMP DEFAULT now(),
    finished_at    TIMESTAMP,
    docs_processed INTEGER   DEFAULT 0,
    status         TEXT      DEFAULT 'running',
    PRIMARY KEY (id)
);

-- Closes the transaction opened by BEGIN at the top of the script.
COMMIT;