Spaces:
Sleeping
Sleeping
File size: 1,831 Bytes
5b7955a | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 | -- Supabase pgvector setup for PolicyDecoder
-- run this ONCE in supabase SQL editor before any ingestion
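-- Embedding dimension: 768 (matches BAAI/bge-base-en-v1.5 via sentence-transformers)
-- NOTE: BAAI/bge-large-en-v1.5 produces 1024-dim vectors; if that is the model in use,
-- every vector(768) below must become vector(1024) -- confirm against the ingestion code.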
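-- To switch to Jina v3 (1024-dim) later:
-- ALTER TABLE policy_chunks ALTER COLUMN embedding TYPE vector(1024);
-- To switch to OpenAI (1536-dim) later:
-- ALTER TABLE policy_chunks ALTER COLUMN embedding TYPE vector(1536);
-- Note: stored vectors are not converted to the new dimension, so existing rows must be
-- removed before running either ALTER and re-ingested with the new model afterwards.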
-- pgvector supplies the vector column type, the hnsw index method and the
-- <=> cosine-distance operator that the rest of this script relies on.
CREATE EXTENSION IF NOT EXISTS vector;

-- Stores each text chunk together with its jsonb metadata and its embedding.
CREATE TABLE IF NOT EXISTS policy_chunks (
    id         bigserial,
    content    text                     NOT NULL,
    metadata   jsonb                    NOT NULL DEFAULT '{}'::jsonb,
    embedding  vector(768)              NOT NULL,  -- dimension must match the embedding model
    created_at timestamp with time zone NOT NULL DEFAULT now(),
    CONSTRAINT policy_chunks_pkey PRIMARY KEY (id)
);
-- Approximate-nearest-neighbour (HNSW) index over the embeddings, built with the
-- cosine operator class so it can serve ORDER BY embedding <=> ... queries.
-- m = max connections per graph layer; ef_construction = candidate list size at build time.
CREATE INDEX IF NOT EXISTS policy_chunks_embedding_hnsw_idx
    ON policy_chunks USING hnsw (embedding vector_cosine_ops)
    WITH (ef_construction = 64, m = 16);
-- GIN index so jsonb containment filters on metadata (policy_id etc) can use an index.
-- jsonb_ops is the default GIN operator class for jsonb, spelled out here for clarity.
CREATE INDEX IF NOT EXISTS policy_chunks_metadata_gin_idx
    ON policy_chunks USING gin (metadata jsonb_ops);
-- RPC function that LangChain calls for similarity search.
-- Filters rows by metadata containment (@>), then orders them by cosine distance.
--
-- Parameters:
--   query_embedding  embedding of the search query, compared with <=> (cosine distance)
--   match_count      maximum number of rows returned; NULL applies no limit (LIMIT NULL)
--   filter           jsonb that each row's metadata must contain;
--                    '{}' (the default) or NULL matches every row
-- Returns: id, content, metadata and similarity (1 - cosine distance), nearest rows first.
--
-- NOTE(review): when the planner uses the hnsw index, the metadata filter is applied to
-- the candidates the index scan returns, so a selective filter may yield fewer than
-- match_count rows -- confirm whether hnsw.ef_search or iterative scans need tuning.
CREATE OR REPLACE FUNCTION match_policy_chunks(
    query_embedding vector(768),
    match_count int,
    filter jsonb DEFAULT '{}'::jsonb
)
RETURNS TABLE (
    id bigint,
    content text,
    metadata jsonb,
    similarity float
)
LANGUAGE plpgsql
STABLE  -- read-only: the body is a single SELECT and never writes
AS $$
BEGIN
    RETURN QUERY
    SELECT
        pc.id,
        pc.content,
        pc.metadata,
        (1 - (pc.embedding <=> query_embedding))::float AS similarity
    FROM policy_chunks AS pc
    -- A NULL filter would make @> evaluate to NULL and silently drop every row,
    -- so treat it the same as the empty-object default.
    WHERE pc.metadata @> COALESCE(filter, '{}'::jsonb)
    ORDER BY pc.embedding <=> query_embedding
    LIMIT match_count;
END;
$$;
|