Spaces:
Sleeping
Sleeping
| -- MEXAR - Hybrid Search Function for Supabase | |
| -- Combines semantic (vector) and keyword (full-text) search using Reciprocal Rank Fusion (RRF) | |
| CREATE OR REPLACE FUNCTION hybrid_search( | |
| query_embedding vector(384), | |
| query_text text, | |
| match_agent_id integer, | |
| match_count integer | |
| ) | |
| RETURNS TABLE ( | |
| id integer, | |
| agent_id integer, | |
| content text, | |
| source text, | |
| chunk_index integer, | |
| section_title text, | |
| created_at timestamp with time zone, | |
| rrf_score real | |
| ) | |
| LANGUAGE plpgsql | |
| AS $$ | |
| DECLARE | |
| semantic_weight real := 0.6; | |
| keyword_weight real := 0.4; | |
| k_constant real := 60.0; | |
| BEGIN | |
| RETURN QUERY | |
| WITH semantic_search AS ( | |
| SELECT | |
| dc.id, | |
| dc.agent_id, | |
| dc.content, | |
| dc.source, | |
| dc.chunk_index, | |
| dc.section_title, | |
| dc.created_at, | |
| ROW_NUMBER() OVER (ORDER BY dc.embedding <=> query_embedding) AS rank_num | |
| FROM document_chunks dc | |
| WHERE dc.agent_id = match_agent_id | |
| ORDER BY dc.embedding <=> query_embedding | |
| LIMIT match_count * 2 | |
| ), | |
| keyword_search AS ( | |
| SELECT | |
| dc.id, | |
| dc.agent_id, | |
| dc.content, | |
| dc.source, | |
| dc.chunk_index, | |
| dc.section_title, | |
| dc.created_at, | |
| ROW_NUMBER() OVER (ORDER BY ts_rank_cd(dc.content_tsvector, plainto_tsquery('english', query_text)) DESC) AS rank_num | |
| FROM document_chunks dc | |
| WHERE dc.agent_id = match_agent_id | |
| AND dc.content_tsvector @@ plainto_tsquery('english', query_text) | |
| ORDER BY ts_rank_cd(dc.content_tsvector, plainto_tsquery('english', query_text)) DESC | |
| LIMIT match_count * 2 | |
| ), | |
| combined AS ( | |
| SELECT | |
| COALESCE(s.id, k.id) AS id, | |
| COALESCE(s.agent_id, k.agent_id) AS agent_id, | |
| COALESCE(s.content, k.content) AS content, | |
| COALESCE(s.source, k.source) AS source, | |
| COALESCE(s.chunk_index, k.chunk_index) AS chunk_index, | |
| COALESCE(s.section_title, k.section_title) AS section_title, | |
| COALESCE(s.created_at, k.created_at) AS created_at, | |
| ( | |
| COALESCE(semantic_weight / (k_constant + s.rank_num::real), 0.0) + | |
| COALESCE(keyword_weight / (k_constant + k.rank_num::real), 0.0) | |
| ) AS rrf_score | |
| FROM semantic_search s | |
| FULL OUTER JOIN keyword_search k ON s.id = k.id | |
| ) | |
| SELECT | |
| c.id, | |
| c.agent_id, | |
| c.content, | |
| c.source, | |
| c.chunk_index, | |
| c.section_title, | |
| c.created_at, | |
| c.rrf_score::real | |
| FROM combined c | |
| ORDER BY c.rrf_score DESC | |
| LIMIT match_count; | |
| END; | |
| $$; | |
| -- Add index on content_tsvector for better keyword search performance | |
| CREATE INDEX IF NOT EXISTS idx_document_chunks_content_tsvector | |
| ON document_chunks USING GIN(content_tsvector); | |
| -- Add index on agent_id for filtering | |
| CREATE INDEX IF NOT EXISTS idx_document_chunks_agent_id | |
| ON document_chunks(agent_id); | |
| -- Add index on embedding for vector similarity search | |
| CREATE INDEX IF NOT EXISTS idx_document_chunks_embedding | |
| ON document_chunks USING ivfflat(embedding vector_cosine_ops) | |
| WITH (lists = 100); | |
| COMMENT ON FUNCTION hybrid_search IS 'Combines semantic (vector) and keyword (full-text) search using Reciprocal Rank Fusion'; | |