Spaces:
Sleeping
Sleeping
File size: 4,776 Bytes
a3ae00a | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 | """
database/schema.py
Creates all pgVector tables from scratch.
Run once: python -m database.schema
"""
import psycopg2
from psycopg2.extras import RealDictCursor
from contextlib import contextmanager
from config import PG_DSN
@contextmanager
def get_conn():
conn = psycopg2.connect(PG_DSN)
try:
yield conn
conn.commit()
except Exception:
conn.rollback()
raise
finally:
conn.close()
@contextmanager
def get_cursor(conn):
cur = conn.cursor(cursor_factory=RealDictCursor)
try:
yield cur
finally:
cur.close()
DDL = """
CREATE EXTENSION IF NOT EXISTS vector;
-- ββ Table 1: documents βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
-- One row per source record.
-- Holds all metadata, dense metadata embedding, and sparse metadata embedding.
CREATE TABLE IF NOT EXISTS documents (
id BIGSERIAL PRIMARY KEY,
-- Identity
ark_id TEXT UNIQUE NOT NULL,
record_id TEXT,
source_url TEXT,
iiif_manifest TEXT,
-- Provenance
newspaper TEXT,
collection TEXT,
institution TEXT,
-- Bibliographic
title TEXT,
issue_date TEXT,
date_iso TEXT[],
date_start TIMESTAMPTZ,
year INT[],
publisher TEXT[],
place TEXT[],
language TEXT[],
-- Content signals
page_count INT,
pages TEXT[],
topics TEXT[],
geography TEXT[],
char_count INT,
genre TEXT[],
abstract TEXT,
exemplary_image_id TEXT,
-- Dense metadata embedding (BGE-M3 β 1024 dims)
metadata_embedding VECTOR(1024),
-- Sparse metadata embedding (two-array format, more efficient than JSONB)
sparse_token_ids INTEGER[],
sparse_weights FLOAT4[],
-- Tracking
ingested_at TIMESTAMPTZ
);
-- HNSW index on metadata embedding (faster + smaller than ivfflat)
CREATE INDEX IF NOT EXISTS idx_documents_meta_emb
ON documents
USING hnsw (metadata_embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 64);
CREATE INDEX IF NOT EXISTS idx_documents_ark ON documents (ark_id);
CREATE INDEX IF NOT EXISTS idx_documents_year ON documents USING GIN (year);
CREATE INDEX IF NOT EXISTS idx_documents_language ON documents USING GIN (language);
CREATE INDEX IF NOT EXISTS idx_documents_topics ON documents USING GIN (topics);
CREATE INDEX IF NOT EXISTS idx_documents_geo ON documents USING GIN (geography);
CREATE INDEX IF NOT EXISTS idx_documents_genre ON documents USING GIN (genre);
-- ββ Table 2: chunks ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
-- One row per text chunk (512 tokens, 100 overlap).
-- No tsvector column β lexical search handled by BGE-M3 sparse instead.
CREATE TABLE IF NOT EXISTS chunks (
id BIGSERIAL PRIMARY KEY,
document_id BIGINT NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
ark_id TEXT NOT NULL,
chunk_index INT NOT NULL,
chunk_text TEXT NOT NULL,
-- Dense full-text embedding (BGE-M3 β 1024 dims)
text_embedding VECTOR(1024),
-- Sparse full-text embedding (two-array format)
sparse_token_ids INTEGER[],
sparse_weights FLOAT4[]
);
-- HNSW index on chunk text embedding
CREATE INDEX IF NOT EXISTS idx_chunks_text_emb
ON chunks
USING hnsw (text_embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 64);
CREATE INDEX IF NOT EXISTS idx_chunks_document_id
ON chunks (document_id);
-- ββ Table 3: query_logs ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
-- Structured log of every query for evaluation and dashboarding.
CREATE TABLE IF NOT EXISTS query_logs (
id BIGSERIAL PRIMARY KEY,
queried_at TIMESTAMPTZ DEFAULT NOW(),
raw_query TEXT,
rewritten_query TEXT,
filters JSONB,
retrieved_ark_ids TEXT[],
response TEXT,
relevancy_score FLOAT,
faithfulness_score FLOAT,
latency_ms INT
);
"""
def create_schema():
print("Creating schema ...")
with get_conn() as conn:
with get_cursor(conn) as cur:
cur.execute(DDL)
print("Done. Tables: documents, chunks, query_logs")
if __name__ == "__main__":
create_schema()
|