Mexar / backend /migrations /hybrid_search_function.sql
Devrajsinh bharatsinh gohil
Initial commit of MEXAR Ultimate - Phase 2 cleanup complete
b0b150b
-- MEXAR - Hybrid Search Function for Supabase
-- Combines semantic (vector) and keyword (full-text) search using Reciprocal Rank Fusion (RRF)
CREATE OR REPLACE FUNCTION hybrid_search(
query_embedding vector(384),
query_text text,
match_agent_id integer,
match_count integer
)
RETURNS TABLE (
id integer,
agent_id integer,
content text,
source text,
chunk_index integer,
section_title text,
created_at timestamp with time zone,
rrf_score real
)
LANGUAGE plpgsql
AS $$
DECLARE
semantic_weight real := 0.6;
keyword_weight real := 0.4;
k_constant real := 60.0;
BEGIN
RETURN QUERY
WITH semantic_search AS (
SELECT
dc.id,
dc.agent_id,
dc.content,
dc.source,
dc.chunk_index,
dc.section_title,
dc.created_at,
ROW_NUMBER() OVER (ORDER BY dc.embedding <=> query_embedding) AS rank_num
FROM document_chunks dc
WHERE dc.agent_id = match_agent_id
ORDER BY dc.embedding <=> query_embedding
LIMIT match_count * 2
),
keyword_search AS (
SELECT
dc.id,
dc.agent_id,
dc.content,
dc.source,
dc.chunk_index,
dc.section_title,
dc.created_at,
ROW_NUMBER() OVER (ORDER BY ts_rank_cd(dc.content_tsvector, plainto_tsquery('english', query_text)) DESC) AS rank_num
FROM document_chunks dc
WHERE dc.agent_id = match_agent_id
AND dc.content_tsvector @@ plainto_tsquery('english', query_text)
ORDER BY ts_rank_cd(dc.content_tsvector, plainto_tsquery('english', query_text)) DESC
LIMIT match_count * 2
),
combined AS (
SELECT
COALESCE(s.id, k.id) AS id,
COALESCE(s.agent_id, k.agent_id) AS agent_id,
COALESCE(s.content, k.content) AS content,
COALESCE(s.source, k.source) AS source,
COALESCE(s.chunk_index, k.chunk_index) AS chunk_index,
COALESCE(s.section_title, k.section_title) AS section_title,
COALESCE(s.created_at, k.created_at) AS created_at,
(
COALESCE(semantic_weight / (k_constant + s.rank_num::real), 0.0) +
COALESCE(keyword_weight / (k_constant + k.rank_num::real), 0.0)
) AS rrf_score
FROM semantic_search s
FULL OUTER JOIN keyword_search k ON s.id = k.id
)
SELECT
c.id,
c.agent_id,
c.content,
c.source,
c.chunk_index,
c.section_title,
c.created_at,
c.rrf_score::real
FROM combined c
ORDER BY c.rrf_score DESC
LIMIT match_count;
END;
$$;
-- Add index on content_tsvector for better keyword search performance
CREATE INDEX IF NOT EXISTS idx_document_chunks_content_tsvector
ON document_chunks USING GIN(content_tsvector);
-- Add index on agent_id for filtering
CREATE INDEX IF NOT EXISTS idx_document_chunks_agent_id
ON document_chunks(agent_id);
-- Add index on embedding for vector similarity search
CREATE INDEX IF NOT EXISTS idx_document_chunks_embedding
ON document_chunks USING ivfflat(embedding vector_cosine_ops)
WITH (lists = 100);
COMMENT ON FUNCTION hybrid_search IS 'Combines semantic (vector) and keyword (full-text) search using Reciprocal Rank Fusion';