File size: 4,053 Bytes
25b35f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f775c99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25b35f6
 
 
 
f775c99
 
 
25b35f6
 
f775c99
 
 
 
25b35f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f775c99
25b35f6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
-- ==============================================================================
-- AXIOM V4.6 DATA ENGINEERING UPGRADE: THE SOVEREIGN MULTILINGUAL SCHEMA
-- ==============================================================================

BEGIN;

-- ------------------------------------------------------------------------------
-- 1. MULTILINGUAL HYBRID SEARCH FIX
-- Drop the English-hardcoded column and replace it with a globally agnostic one.
-- ------------------------------------------------------------------------------
ALTER TABLE document_chunks DROP COLUMN IF EXISTS fts_content CASCADE;

ALTER TABLE document_chunks 
ADD COLUMN fts_content tsvector 
GENERATED ALWAYS AS (to_tsvector('simple', content)) STORED;

-- Recreate the Keyword Index
CREATE INDEX IF NOT EXISTS idx_fts_content ON document_chunks USING gin (fts_content);

-- ------------------------------------------------------------------------------
-- 2. VECTOR MATH OPTIMIZATION (Inner Product / L2 Norm Speedup)
-- ------------------------------------------------------------------------------
-- Drop the slow Cosine index
DROP INDEX IF EXISTS document_chunks_embedding_idx;

-- Create the blazing fast Inner Product HNSW index
CREATE INDEX idx_vector_ip ON document_chunks USING hnsw (embedding vector_ip_ops);

-- ------------------------------------------------------------------------------
-- 3. THE "SEQUENTIAL SCAN" KILLERS (B-Tree Indexes)
-- ------------------------------------------------------------------------------
CREATE INDEX IF NOT EXISTS idx_chunks_user_doc ON document_chunks(user_id, document_id);
CREATE INDEX IF NOT EXISTS idx_docs_user ON documents(user_id);
CREATE INDEX IF NOT EXISTS idx_audit_logs_user_time ON audit_logs(user_id, created_at DESC);

-- ------------------------------------------------------------------------------
-- 4. JSONB TELEMETRY INDEXING
-- ------------------------------------------------------------------------------
CREATE INDEX IF NOT EXISTS idx_chat_metrics_gin ON chat_messages USING gin (metrics);

-- ------------------------------------------------------------------------------
-- 5. UPGRADED RPC: MULTILINGUAL & INNER PRODUCT HYBRID SEARCH
-- ------------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION hybrid_vault_search(
  query_text TEXT,
  query_embedding VECTOR(1024),
  match_count INT,
  target_user_id TEXT
) RETURNS TABLE (
  id BIGINT,
  document_id BIGINT,
  filename TEXT,
  content TEXT,
  similarity FLOAT,
  fts_rank REAL
) LANGUAGE plpgsql AS $$
BEGIN
  RETURN QUERY
  SELECT 
    c.id,
    c.document_id,
    d.filename,
    c.content,
    -- SOTA MATH: pgvector <#> returns negative inner product, so we multiply by -1
    (c.embedding <#> query_embedding) * -1 AS similarity,
    -- MULTILINGUAL FIX: use 'simple' dictionary
    ts_rank_cd(c.fts_content, websearch_to_tsquery('simple', query_text)) AS fts_rank
  FROM document_chunks c
  JOIN documents d ON c.document_id = d.id
  WHERE c.user_id = target_user_id
  -- SOTA: 0.7 Semantic (Inner Product) + 0.3 Keyword (Simple)
  ORDER BY (0.7 * ((c.embedding <#> query_embedding) * -1) + 0.3 * ts_rank_cd(c.fts_content, websearch_to_tsquery('simple', query_text))) DESC
  LIMIT match_count;
END;
$$;

-- ------------------------------------------------------------------------------
-- 6. UPGRADED RPC: SINGLE-DOCUMENT INNER PRODUCT MATCHING
-- ------------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION match_document_chunks(
  query_embedding VECTOR(1024),
  match_limit INT,
  target_document_id BIGINT,
  target_user_id TEXT
) RETURNS TABLE (
  content TEXT,
  similarity FLOAT
) LANGUAGE plpgsql AS $$
BEGIN
  RETURN QUERY
  SELECT
    c.content,
    (c.embedding <#> query_embedding) * -1 AS similarity
  FROM document_chunks c
  WHERE c.document_id = target_document_id AND c.user_id = target_user_id
  ORDER BY c.embedding <#> query_embedding -- Ascending because <#> returns negative inner product
  LIMIT match_limit;
END;
$$;

COMMIT;