axiom-engine-api / migrations /001_init_vault.sql
EATosin's picture
fix: editor node
25b35f6
-- ==============================================================================
-- AXIOM V4.6 DATA ENGINEERING UPGRADE: THE SOVEREIGN MULTILINGUAL SCHEMA
-- ==============================================================================
BEGIN;
-- ------------------------------------------------------------------------------
-- 1. MULTILINGUAL HYBRID SEARCH FIX
-- Drop the English-hardcoded column and replace it with a globally agnostic one.
-- ------------------------------------------------------------------------------
ALTER TABLE document_chunks DROP COLUMN IF EXISTS fts_content CASCADE;
ALTER TABLE document_chunks
ADD COLUMN fts_content tsvector
GENERATED ALWAYS AS (to_tsvector('simple', content)) STORED;
-- Recreate the Keyword Index
CREATE INDEX IF NOT EXISTS idx_fts_content ON document_chunks USING gin (fts_content);
-- ------------------------------------------------------------------------------
-- 2. VECTOR MATH OPTIMIZATION (Inner Product / L2 Norm Speedup)
-- ------------------------------------------------------------------------------
-- Drop the slow Cosine index
DROP INDEX IF EXISTS document_chunks_embedding_idx;
-- Create the blazing fast Inner Product HNSW index
CREATE INDEX idx_vector_ip ON document_chunks USING hnsw (embedding vector_ip_ops);
-- ------------------------------------------------------------------------------
-- 3. THE "SEQUENTIAL SCAN" KILLERS (B-Tree Indexes)
-- ------------------------------------------------------------------------------
CREATE INDEX IF NOT EXISTS idx_chunks_user_doc ON document_chunks(user_id, document_id);
CREATE INDEX IF NOT EXISTS idx_docs_user ON documents(user_id);
CREATE INDEX IF NOT EXISTS idx_audit_logs_user_time ON audit_logs(user_id, created_at DESC);
-- ------------------------------------------------------------------------------
-- 4. JSONB TELEMETRY INDEXING
-- ------------------------------------------------------------------------------
CREATE INDEX IF NOT EXISTS idx_chat_metrics_gin ON chat_messages USING gin (metrics);
-- ------------------------------------------------------------------------------
-- 5. UPGRADED RPC: MULTILINGUAL & INNER PRODUCT HYBRID SEARCH
-- ------------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION hybrid_vault_search(
query_text TEXT,
query_embedding VECTOR(1024),
match_count INT,
target_user_id TEXT
) RETURNS TABLE (
id BIGINT,
document_id BIGINT,
filename TEXT,
content TEXT,
similarity FLOAT,
fts_rank REAL
) LANGUAGE plpgsql AS $$
BEGIN
RETURN QUERY
SELECT
c.id,
c.document_id,
d.filename,
c.content,
-- SOTA MATH: pgvector <#> returns negative inner product, so we multiply by -1
(c.embedding <#> query_embedding) * -1 AS similarity,
-- MULTILINGUAL FIX: use 'simple' dictionary
ts_rank_cd(c.fts_content, websearch_to_tsquery('simple', query_text)) AS fts_rank
FROM document_chunks c
JOIN documents d ON c.document_id = d.id
WHERE c.user_id = target_user_id
-- SOTA: 0.7 Semantic (Inner Product) + 0.3 Keyword (Simple)
ORDER BY (0.7 * ((c.embedding <#> query_embedding) * -1) + 0.3 * ts_rank_cd(c.fts_content, websearch_to_tsquery('simple', query_text))) DESC
LIMIT match_count;
END;
$$;
-- ------------------------------------------------------------------------------
-- 6. UPGRADED RPC: SINGLE-DOCUMENT INNER PRODUCT MATCHING
-- ------------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION match_document_chunks(
query_embedding VECTOR(1024),
match_limit INT,
target_document_id BIGINT,
target_user_id TEXT
) RETURNS TABLE (
content TEXT,
similarity FLOAT
) LANGUAGE plpgsql AS $$
BEGIN
RETURN QUERY
SELECT
c.content,
(c.embedding <#> query_embedding) * -1 AS similarity
FROM document_chunks c
WHERE c.document_id = target_document_id AND c.user_id = target_user_id
ORDER BY c.embedding <#> query_embedding -- Ascending because <#> returns negative inner product
LIMIT match_limit;
END;
$$;
COMMIT;