Spaces:
Sleeping
Sleeping
| -- ============================================================================== | |
| -- AXIOM V4.6 DATA ENGINEERING UPGRADE: THE SOVEREIGN MULTILINGUAL SCHEMA | |
| -- ============================================================================== | |
| BEGIN; | |
| -- ------------------------------------------------------------------------------ | |
| -- 1. MULTILINGUAL HYBRID SEARCH FIX | |
| -- Drop the English-hardcoded column and replace it with a globally agnostic one. | |
| -- ------------------------------------------------------------------------------ | |
| ALTER TABLE document_chunks DROP COLUMN IF EXISTS fts_content CASCADE; | |
| ALTER TABLE document_chunks | |
| ADD COLUMN fts_content tsvector | |
| GENERATED ALWAYS AS (to_tsvector('simple', content)) STORED; | |
| -- Recreate the Keyword Index | |
| CREATE INDEX IF NOT EXISTS idx_fts_content ON document_chunks USING gin (fts_content); | |
| -- ------------------------------------------------------------------------------ | |
| -- 2. VECTOR MATH OPTIMIZATION (Inner Product / L2 Norm Speedup) | |
| -- ------------------------------------------------------------------------------ | |
| -- Drop the slow Cosine index | |
| DROP INDEX IF EXISTS document_chunks_embedding_idx; | |
| -- Create the blazing fast Inner Product HNSW index | |
| CREATE INDEX idx_vector_ip ON document_chunks USING hnsw (embedding vector_ip_ops); | |
| -- ------------------------------------------------------------------------------ | |
| -- 3. THE "SEQUENTIAL SCAN" KILLERS (B-Tree Indexes) | |
| -- ------------------------------------------------------------------------------ | |
| CREATE INDEX IF NOT EXISTS idx_chunks_user_doc ON document_chunks(user_id, document_id); | |
| CREATE INDEX IF NOT EXISTS idx_docs_user ON documents(user_id); | |
| CREATE INDEX IF NOT EXISTS idx_audit_logs_user_time ON audit_logs(user_id, created_at DESC); | |
| -- ------------------------------------------------------------------------------ | |
| -- 4. JSONB TELEMETRY INDEXING | |
| -- ------------------------------------------------------------------------------ | |
| CREATE INDEX IF NOT EXISTS idx_chat_metrics_gin ON chat_messages USING gin (metrics); | |
| -- ------------------------------------------------------------------------------ | |
| -- 5. UPGRADED RPC: MULTILINGUAL & INNER PRODUCT HYBRID SEARCH | |
| -- ------------------------------------------------------------------------------ | |
| CREATE OR REPLACE FUNCTION hybrid_vault_search( | |
| query_text TEXT, | |
| query_embedding VECTOR(1024), | |
| match_count INT, | |
| target_user_id TEXT | |
| ) RETURNS TABLE ( | |
| id BIGINT, | |
| document_id BIGINT, | |
| filename TEXT, | |
| content TEXT, | |
| similarity FLOAT, | |
| fts_rank REAL | |
| ) LANGUAGE plpgsql AS $$ | |
| BEGIN | |
| RETURN QUERY | |
| SELECT | |
| c.id, | |
| c.document_id, | |
| d.filename, | |
| c.content, | |
| -- SOTA MATH: pgvector <#> returns negative inner product, so we multiply by -1 | |
| (c.embedding <#> query_embedding) * -1 AS similarity, | |
| -- MULTILINGUAL FIX: use 'simple' dictionary | |
| ts_rank_cd(c.fts_content, websearch_to_tsquery('simple', query_text)) AS fts_rank | |
| FROM document_chunks c | |
| JOIN documents d ON c.document_id = d.id | |
| WHERE c.user_id = target_user_id | |
| -- SOTA: 0.7 Semantic (Inner Product) + 0.3 Keyword (Simple) | |
| ORDER BY (0.7 * ((c.embedding <#> query_embedding) * -1) + 0.3 * ts_rank_cd(c.fts_content, websearch_to_tsquery('simple', query_text))) DESC | |
| LIMIT match_count; | |
| END; | |
| $$; | |
| -- ------------------------------------------------------------------------------ | |
| -- 6. UPGRADED RPC: SINGLE-DOCUMENT INNER PRODUCT MATCHING | |
| -- ------------------------------------------------------------------------------ | |
| CREATE OR REPLACE FUNCTION match_document_chunks( | |
| query_embedding VECTOR(1024), | |
| match_limit INT, | |
| target_document_id BIGINT, | |
| target_user_id TEXT | |
| ) RETURNS TABLE ( | |
| content TEXT, | |
| similarity FLOAT | |
| ) LANGUAGE plpgsql AS $$ | |
| BEGIN | |
| RETURN QUERY | |
| SELECT | |
| c.content, | |
| (c.embedding <#> query_embedding) * -1 AS similarity | |
| FROM document_chunks c | |
| WHERE c.document_id = target_document_id AND c.user_id = target_user_id | |
| ORDER BY c.embedding <#> query_embedding -- Ascending because <#> returns negative inner product | |
| LIMIT match_limit; | |
| END; | |
| $$; | |
| COMMIT; | |