Spaces:
Sleeping
Sleeping
File size: 3,375 Bytes
b0b150b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
-- MEXAR - Hybrid Search Function for Supabase
-- Combines semantic (vector) and keyword (full-text) search using Reciprocal Rank Fusion (RRF)
CREATE OR REPLACE FUNCTION hybrid_search(
query_embedding vector(384),
query_text text,
match_agent_id integer,
match_count integer
)
RETURNS TABLE (
id integer,
agent_id integer,
content text,
source text,
chunk_index integer,
section_title text,
created_at timestamp with time zone,
rrf_score real
)
LANGUAGE plpgsql
AS $$
DECLARE
semantic_weight real := 0.6;
keyword_weight real := 0.4;
k_constant real := 60.0;
BEGIN
RETURN QUERY
WITH semantic_search AS (
SELECT
dc.id,
dc.agent_id,
dc.content,
dc.source,
dc.chunk_index,
dc.section_title,
dc.created_at,
ROW_NUMBER() OVER (ORDER BY dc.embedding <=> query_embedding) AS rank_num
FROM document_chunks dc
WHERE dc.agent_id = match_agent_id
ORDER BY dc.embedding <=> query_embedding
LIMIT match_count * 2
),
keyword_search AS (
SELECT
dc.id,
dc.agent_id,
dc.content,
dc.source,
dc.chunk_index,
dc.section_title,
dc.created_at,
ROW_NUMBER() OVER (ORDER BY ts_rank_cd(dc.content_tsvector, plainto_tsquery('english', query_text)) DESC) AS rank_num
FROM document_chunks dc
WHERE dc.agent_id = match_agent_id
AND dc.content_tsvector @@ plainto_tsquery('english', query_text)
ORDER BY ts_rank_cd(dc.content_tsvector, plainto_tsquery('english', query_text)) DESC
LIMIT match_count * 2
),
combined AS (
SELECT
COALESCE(s.id, k.id) AS id,
COALESCE(s.agent_id, k.agent_id) AS agent_id,
COALESCE(s.content, k.content) AS content,
COALESCE(s.source, k.source) AS source,
COALESCE(s.chunk_index, k.chunk_index) AS chunk_index,
COALESCE(s.section_title, k.section_title) AS section_title,
COALESCE(s.created_at, k.created_at) AS created_at,
(
COALESCE(semantic_weight / (k_constant + s.rank_num::real), 0.0) +
COALESCE(keyword_weight / (k_constant + k.rank_num::real), 0.0)
) AS rrf_score
FROM semantic_search s
FULL OUTER JOIN keyword_search k ON s.id = k.id
)
SELECT
c.id,
c.agent_id,
c.content,
c.source,
c.chunk_index,
c.section_title,
c.created_at,
c.rrf_score::real
FROM combined c
ORDER BY c.rrf_score DESC
LIMIT match_count;
END;
$$;
-- Add index on content_tsvector for better keyword search performance
CREATE INDEX IF NOT EXISTS idx_document_chunks_content_tsvector
ON document_chunks USING GIN(content_tsvector);
-- Add index on agent_id for filtering
CREATE INDEX IF NOT EXISTS idx_document_chunks_agent_id
ON document_chunks(agent_id);
-- Add index on embedding for vector similarity search
CREATE INDEX IF NOT EXISTS idx_document_chunks_embedding
ON document_chunks USING ivfflat(embedding vector_cosine_ops)
WITH (lists = 100);
COMMENT ON FUNCTION hybrid_search IS 'Combines semantic (vector) and keyword (full-text) search using Reciprocal Rank Fusion';
|