Spaces:

Jacooo
/

PGC-AI-Chatbot

Sleeping

PGC-AI-Chatbot / scripts /sql /2026-05-03-knowledge-fts.sql

Deploy from GitHub: f9a0a7a

d68b9da verified 21 days ago

1.48 kB

	-- Materialize a full-text search vector for every chunk. The column is a
	-- stored generated column, so it is derived from "content" by the database
	-- itself; NULL content is treated as an empty string.
	alter table public.knowledge_chunks
	    add column if not exists content_tsv tsvector
	        generated always as (
	            to_tsvector('simple', coalesce(content, ''))
	        ) stored;

	-- GIN index over the generated tsvector column, so that "@@" matches
	-- against content_tsv can be answered from the index.
	create index if not exists knowledge_chunks_content_tsv_gin
	    on public.knowledge_chunks using gin (content_tsv);

	-- Lexical (full-text) retrieval over public.knowledge_chunks.
	--
	-- The query is matched with OR logic so that a chunk containing any query
	-- token scores a hit. websearch_to_tsquery combines terms with AND, which
	-- would require ALL words of the query (including stop words like "what",
	-- "apa", "saja") to appear in the chunk. Instead, the query text is
	-- tokenized via to_tsvector and its lexemes are joined with | (OR).
	--
	-- Parameters:
	--   query_text   free-form user query, tokenized with the 'simple' config
	--                (the same config used to build content_tsv).
	--   match_count  maximum number of rows to return (default 7).
	--
	-- Returns the matching chunks with their ts_rank_cd score, highest score
	-- first; ties are broken by chunk id so the ordering is deterministic.
	-- A NULL or empty query produces no lexemes, hence a NULL tsquery and no
	-- rows.
	create or replace function public.match_knowledge_fts(
	    query_text text,
	    match_count int default 7
	)
	returns table (
	    content text,
	    source text,
	    filename text,
	    page_number integer,
	    lexical_score float
	)
	language sql
	stable
	as $$
	    with query_terms as (
	        -- Each lexeme is emitted as a quoted tsquery operand (embedded
	        -- quotes and backslashes doubled) so that characters that are
	        -- tsquery syntax, e.g. ( ) ! : & found in pasted URLs, cannot
	        -- cause a syntax error. The lexemes are already normalized by
	        -- to_tsvector, so the text is cast straight to tsquery rather
	        -- than being normalized a second time by to_tsquery.
	        -- tsvector lexemes are unique, so no DISTINCT is needed.
	        select
	            string_agg(
	                '''' || replace(replace(tokens.lexeme, E'\\', E'\\\\'), '''', '''''') || '''',
	                ' | '
	                order by tokens.lexeme
	            )::tsquery as tsq
	        from unnest(
	            tsvector_to_array(to_tsvector('simple', query_text))
	        ) as tokens (lexeme)
	    )
	    select
	        kc.content,
	        kc.source,
	        kc.filename,
	        kc.page_number,
	        -- ts_rank_cd yields real; cast explicitly to the declared float.
	        ts_rank_cd(kc.content_tsv, query_terms.tsq)::float as lexical_score
	    from public.knowledge_chunks as kc
	    cross join query_terms
	    where kc.content_tsv @@ query_terms.tsq
	    order by lexical_score desc, kc.id asc
	    limit match_count;
	$$;