File size: 4,776 Bytes
a3ae00a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
"""
database/schema.py

Creates all pgVector tables from scratch.
Run once:  python -m database.schema
"""

import psycopg2
from psycopg2.extras import RealDictCursor
from contextlib import contextmanager
from config import PG_DSN


@contextmanager
def get_conn():
    conn = psycopg2.connect(PG_DSN)
    try:
        yield conn
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()


@contextmanager
def get_cursor(conn):
    cur = conn.cursor(cursor_factory=RealDictCursor)
    try:
        yield cur
    finally:
        cur.close()


DDL = """
CREATE EXTENSION IF NOT EXISTS vector;

-- ── Table 1: documents ───────────────────────────────────────────────────────
-- One row per source record.
-- Holds all metadata, dense metadata embedding, and sparse metadata embedding.

CREATE TABLE IF NOT EXISTS documents (
    id              BIGSERIAL PRIMARY KEY,

    -- Identity
    ark_id          TEXT UNIQUE NOT NULL,
    record_id       TEXT,
    source_url      TEXT,
    iiif_manifest   TEXT,

    -- Provenance
    newspaper       TEXT,
    collection      TEXT,
    institution     TEXT,

    -- Bibliographic
    title           TEXT,
    issue_date      TEXT,
    date_iso        TEXT[],
    date_start      TIMESTAMPTZ,
    year            INT[],
    publisher       TEXT[],
    place           TEXT[],
    language        TEXT[],

    -- Content signals
    page_count      INT,
    pages           TEXT[],
    topics          TEXT[],
    geography       TEXT[],
    char_count      INT,
    genre               TEXT[],
    abstract            TEXT,
    exemplary_image_id  TEXT,

    -- Dense metadata embedding (BGE-M3 β†’ 1024 dims)
    metadata_embedding  VECTOR(1024),

    -- Sparse metadata embedding (two-array format, more efficient than JSONB)
    sparse_token_ids    INTEGER[],
    sparse_weights      FLOAT4[],

    -- Tracking
    ingested_at     TIMESTAMPTZ
);

-- HNSW index on metadata embedding (faster + smaller than ivfflat)
CREATE INDEX IF NOT EXISTS idx_documents_meta_emb
    ON documents
    USING hnsw (metadata_embedding vector_cosine_ops)
    WITH (m = 16, ef_construction = 64);

CREATE INDEX IF NOT EXISTS idx_documents_ark      ON documents (ark_id);
CREATE INDEX IF NOT EXISTS idx_documents_year     ON documents USING GIN (year);
CREATE INDEX IF NOT EXISTS idx_documents_language ON documents USING GIN (language);
CREATE INDEX IF NOT EXISTS idx_documents_topics   ON documents USING GIN (topics);
CREATE INDEX IF NOT EXISTS idx_documents_geo      ON documents USING GIN (geography);
CREATE INDEX IF NOT EXISTS idx_documents_genre ON documents USING GIN (genre);


-- ── Table 2: chunks ──────────────────────────────────────────────────────────
-- One row per text chunk (512 tokens, 100 overlap).
-- No tsvector column β€” lexical search handled by BGE-M3 sparse instead.

CREATE TABLE IF NOT EXISTS chunks (
    id              BIGSERIAL PRIMARY KEY,
    document_id     BIGINT NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
    ark_id          TEXT NOT NULL,

    chunk_index     INT NOT NULL,
    chunk_text      TEXT NOT NULL,

    -- Dense full-text embedding (BGE-M3 β†’ 1024 dims)
    text_embedding  VECTOR(1024),

    -- Sparse full-text embedding (two-array format)
    sparse_token_ids    INTEGER[],
    sparse_weights      FLOAT4[]
);

-- HNSW index on chunk text embedding
CREATE INDEX IF NOT EXISTS idx_chunks_text_emb
    ON chunks
    USING hnsw (text_embedding vector_cosine_ops)
    WITH (m = 16, ef_construction = 64);

CREATE INDEX IF NOT EXISTS idx_chunks_document_id
    ON chunks (document_id);


-- ── Table 3: query_logs ──────────────────────────────────────────────────────
-- Structured log of every query for evaluation and dashboarding.

CREATE TABLE IF NOT EXISTS query_logs (
    id                  BIGSERIAL PRIMARY KEY,
    queried_at          TIMESTAMPTZ DEFAULT NOW(),

    raw_query           TEXT,
    rewritten_query     TEXT,

    filters             JSONB,
    retrieved_ark_ids   TEXT[],
    response            TEXT,

    relevancy_score     FLOAT,
    faithfulness_score  FLOAT,
    latency_ms          INT
);
"""


def create_schema():
    print("Creating schema ...")
    with get_conn() as conn:
        with get_cursor(conn) as cur:
            cur.execute(DDL)
    print("Done. Tables: documents, chunks, query_logs")


if __name__ == "__main__":
    create_schema()