File size: 3,375 Bytes
b0b150b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
-- MEXAR - Hybrid Search Function for Supabase
-- Combines semantic (vector) and keyword (full-text) search using Reciprocal Rank Fusion (RRF)

CREATE OR REPLACE FUNCTION hybrid_search(
    query_embedding vector(384),
    query_text text,
    match_agent_id integer,
    match_count integer
)
RETURNS TABLE (
    id integer,
    agent_id integer,
    content text,
    source text,
    chunk_index integer,
    section_title text,
    created_at timestamp with time zone,
    rrf_score real
)
LANGUAGE plpgsql
AS $$
DECLARE
    semantic_weight real := 0.6;
    keyword_weight real := 0.4;
    k_constant real := 60.0;
BEGIN
    RETURN QUERY
    WITH semantic_search AS (
        SELECT
            dc.id,
            dc.agent_id,
            dc.content,
            dc.source,
            dc.chunk_index,
            dc.section_title,
            dc.created_at,
            ROW_NUMBER() OVER (ORDER BY dc.embedding <=> query_embedding) AS rank_num
        FROM document_chunks dc
        WHERE dc.agent_id = match_agent_id
        ORDER BY dc.embedding <=> query_embedding
        LIMIT match_count * 2
    ),
    keyword_search AS (
        SELECT
            dc.id,
            dc.agent_id,
            dc.content,
            dc.source,
            dc.chunk_index,
            dc.section_title,
            dc.created_at,
            ROW_NUMBER() OVER (ORDER BY ts_rank_cd(dc.content_tsvector, plainto_tsquery('english', query_text)) DESC) AS rank_num
        FROM document_chunks dc
        WHERE dc.agent_id = match_agent_id
          AND dc.content_tsvector @@ plainto_tsquery('english', query_text)
        ORDER BY ts_rank_cd(dc.content_tsvector, plainto_tsquery('english', query_text)) DESC
        LIMIT match_count * 2
    ),
    combined AS (
        SELECT 
            COALESCE(s.id, k.id) AS id,
            COALESCE(s.agent_id, k.agent_id) AS agent_id,
            COALESCE(s.content, k.content) AS content,
            COALESCE(s.source, k.source) AS source,
            COALESCE(s.chunk_index, k.chunk_index) AS chunk_index,
            COALESCE(s.section_title, k.section_title) AS section_title,
            COALESCE(s.created_at, k.created_at) AS created_at,
            (
                COALESCE(semantic_weight / (k_constant + s.rank_num::real), 0.0) +
                COALESCE(keyword_weight / (k_constant + k.rank_num::real), 0.0)
            ) AS rrf_score
        FROM semantic_search s
        FULL OUTER JOIN keyword_search k ON s.id = k.id
    )
    SELECT 
        c.id,
        c.agent_id,
        c.content,
        c.source,
        c.chunk_index,
        c.section_title,
        c.created_at,
        c.rrf_score::real
    FROM combined c
    ORDER BY c.rrf_score DESC
    LIMIT match_count;
END;
$$;

-- Add index on content_tsvector for better keyword search performance
CREATE INDEX IF NOT EXISTS idx_document_chunks_content_tsvector 
ON document_chunks USING GIN(content_tsvector);

-- Add index on agent_id for filtering
CREATE INDEX IF NOT EXISTS idx_document_chunks_agent_id 
ON document_chunks(agent_id);

-- Add index on embedding for vector similarity search
CREATE INDEX IF NOT EXISTS idx_document_chunks_embedding 
ON document_chunks USING ivfflat(embedding vector_cosine_ops)
WITH (lists = 100);

COMMENT ON FUNCTION hybrid_search IS 'Combines semantic (vector) and keyword (full-text) search using Reciprocal Rank Fusion';