File size: 1,119 Bytes
dd6d0d3
 
 
 
 
 
 
 
 
 
 
d7850ea
dd6d0d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
-- Enable the pg_trgm extension. It supplies the gin_trgm_ops operator class
-- that the trigram index on inverted_index.term (idx_inverted_term_trgm) is
-- built with, so this statement has to run before that index is created.
-- IF NOT EXISTS keeps the script safe to re-run.
CREATE EXTENSION IF NOT EXISTS pg_trgm;

-- Table: documents
-- One row per URL: id is the surrogate primary key and url is UNIQUE NOT NULL.
-- The content columns (title, raw_html, plain_text, image_url) are optional.
CREATE TABLE IF NOT EXISTS documents (
  id          BIGSERIAL PRIMARY KEY,
  url         TEXT UNIQUE NOT NULL,
  title       TEXT,
  raw_html    TEXT,
  plain_text  TEXT,
  -- Falls back to 'en' when the writer does not supply a value.
  language    TEXT DEFAULT 'en',
  image_url   TEXT,
  -- Falls back to the inserting transaction's start time.
  -- NOTE(review): left nullable on purpose in case NULL is used to mean
  -- "not crawled yet" -- confirm with the crawler code before tightening.
  crawled_at  TIMESTAMPTZ DEFAULT NOW(),
  -- NOT NULL keeps this flag strictly two-valued. A NULL here would be
  -- skipped by both "WHERE indexed" and "WHERE NOT indexed", so such a row
  -- would never be picked up by a query looking for unindexed documents.
  indexed     BOOLEAN NOT NULL DEFAULT FALSE
);

-- Table: inverted_index
-- One row per (term, document) pair; the UNIQUE constraint below enforces
-- that a term is recorded at most once per document.
CREATE TABLE IF NOT EXISTS inverted_index (
  id        BIGSERIAL PRIMARY KEY,
  term      TEXT NOT NULL,
  -- NOT NULL: a row without a document is meaningless, and because NULLs are
  -- treated as distinct by UNIQUE, a nullable doc_id would let duplicate
  -- (term, NULL) rows slip past the constraint below.
  -- Rows are removed automatically when the referenced document is deleted.
  doc_id    BIGINT NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
  frequency INTEGER NOT NULL DEFAULT 1,
  -- NOTE(review): presumably the offsets of the term within the document;
  -- whether they are 0- or 1-based is not visible here -- confirm with the indexer.
  positions INTEGER[] NOT NULL DEFAULT '{}',
  -- NOTE(review): presumably the normalised term frequency; nullable, so it
  -- can be filled in after the row is first written -- confirm.
  tf        FLOAT8,
  UNIQUE (term, doc_id)
);

-- Fast exact-match lookup by term.
-- NOTE(review): the index backing UNIQUE (term, doc_id) already has term as
-- its leading column, so this index may be redundant -- check with EXPLAIN
-- before dropping it.
CREATE INDEX IF NOT EXISTS idx_inverted_term ON inverted_index (term);
-- Wildcard support: a GIN trigram index (operator class from pg_trgm) lets
-- LIKE / ILIKE / regex predicates on term use an index.
CREATE INDEX IF NOT EXISTS idx_inverted_term_trgm ON inverted_index USING gin (term gin_trgm_ops);
-- Lookup by document. doc_id references documents(id) ON DELETE CASCADE, and
-- no other index leads with doc_id, so without this every document delete
-- has to scan inverted_index to find the rows to cascade to.
CREATE INDEX IF NOT EXISTS idx_inverted_doc_id ON inverted_index (doc_id);

-- Table: term_stats
-- Per-term statistics, one row per distinct term (term is the primary key).
CREATE TABLE IF NOT EXISTS term_stats (
  term TEXT,
  -- NOTE(review): presumably the number of documents containing the term -- confirm.
  doc_freq INT DEFAULT 1 NOT NULL,
  -- NOTE(review): presumably the inverse document frequency; nullable -- confirm.
  idf DOUBLE PRECISION,
  PRIMARY KEY (term)
);