Hybrid RAG: BM25+Dense (sqlite-vec/BGE-M3) + cross-encoder reranker (bge-reranker-v2-m3)
Browse files
- src/kpaa/cli.py +68 -0
src/kpaa/cli.py
CHANGED
|
@@ -106,6 +106,31 @@ def _build_parser() -> argparse.ArgumentParser:
|
|
| 106 |
)
|
| 107 |
p_build_guides.add_argument("--rebuild", action="store_true", default=True)
|
| 108 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
sub.add_parser(
|
| 110 |
"refresh-related-laws",
|
| 111 |
help="privacy.go.kr 개인정보 관련 법령·행정규칙 목록 재스크래이프 (data/related_laws.yaml)",
|
|
@@ -407,6 +432,49 @@ def main(argv: list[str] | None = None) -> int:
|
|
| 407 |
build()
|
| 408 |
return 0
|
| 409 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 410 |
if args.cmd == "refresh-related-laws":
|
| 411 |
from kpaa.related_laws import refresh as refresh_rl
|
| 412 |
|
|
|
|
| 106 |
)
|
| 107 |
p_build_guides.add_argument("--rebuild", action="store_true", default=True)
|
| 108 |
|
| 109 |
+
p_build_embed = sub.add_parser(
|
| 110 |
+
"build-embeddings",
|
| 111 |
+
help="data/guide/chunks/*.jsonl + cases.sqlite → embeddings.sqlite (sqlite-vec, BGE-M3)",
|
| 112 |
+
)
|
| 113 |
+
p_build_embed.add_argument(
|
| 114 |
+
"--source", choices=["guide", "case", "all"], default="all",
|
| 115 |
+
help="인덱싱 대상 (기본: all)",
|
| 116 |
+
)
|
| 117 |
+
p_build_embed.add_argument(
|
| 118 |
+
"--force", action="store_true",
|
| 119 |
+
help="기존 embeddings.sqlite 삭제 후 재빌드",
|
| 120 |
+
)
|
| 121 |
+
p_build_embed.add_argument(
|
| 122 |
+
"--batch", type=int, default=32,
|
| 123 |
+
help="임베딩 batch size (기본 32)",
|
| 124 |
+
)
|
| 125 |
+
|
| 126 |
+
p_embed_search = sub.add_parser(
|
| 127 |
+
"embed-search",
|
| 128 |
+
help="dense 검색만 단독 테스트 (BM25/RRF/reranker 미적용)",
|
| 129 |
+
)
|
| 130 |
+
p_embed_search.add_argument("query", help="검색 질의")
|
| 131 |
+
p_embed_search.add_argument("--source", choices=["guide", "case"], default="guide")
|
| 132 |
+
p_embed_search.add_argument("-k", type=int, default=10)
|
| 133 |
+
|
| 134 |
sub.add_parser(
|
| 135 |
"refresh-related-laws",
|
| 136 |
help="privacy.go.kr 개인정보 관련 법령·행정규칙 목록 재스크래이프 (data/related_laws.yaml)",
|
|
|
|
| 432 |
build()
|
| 433 |
return 0
|
| 434 |
|
| 435 |
+
if args.cmd == "build-embeddings":
|
| 436 |
+
import logging as _logging
|
| 437 |
+
_logging.basicConfig(level=_logging.INFO, format="%(message)s")
|
| 438 |
+
from kpaa.embeddings.index import build_embed_index, stats
|
| 439 |
+
|
| 440 |
+
n = build_embed_index(source=args.source, force=args.force, batch=args.batch)
|
| 441 |
+
s = stats()
|
| 442 |
+
print(f"새로 인덱싱: {n}건 / 합계: {s.get('by_source', {})}", file=sys.stderr)
|
| 443 |
+
return 0
|
| 444 |
+
|
| 445 |
+
if args.cmd == "embed-search":
|
| 446 |
+
from kpaa.embeddings.index import search_embed
|
| 447 |
+
from kpaa.guides.index import list_all_chunks
|
| 448 |
+
|
| 449 |
+
# guide chunks 로 chunk_id → section/body 미리 로드 (case 는 cases.sqlite 직접)
|
| 450 |
+
hits = search_embed(args.query, source_type=args.source, k=args.k)
|
| 451 |
+
if args.source == "guide":
|
| 452 |
+
chunks_by_id = {c.chunk_id: c for c in list_all_chunks()}
|
| 453 |
+
for h in hits:
|
| 454 |
+
c = chunks_by_id.get(h.chunk_id)
|
| 455 |
+
if c is None:
|
| 456 |
+
print(f" {h.distance:.4f} {h.chunk_id} (chunks 에 없음)", file=sys.stderr)
|
| 457 |
+
continue
|
| 458 |
+
print(
|
| 459 |
+
f" {h.distance:.4f} [{c.section[:60]}] {c.body[:80]}...",
|
| 460 |
+
file=sys.stderr,
|
| 461 |
+
)
|
| 462 |
+
else:
|
| 463 |
+
import sqlite3
|
| 464 |
+
from kpaa.cases.index import default_db_path as case_db_path
|
| 465 |
+
con = sqlite3.connect(case_db_path()); con.row_factory = sqlite3.Row
|
| 466 |
+
for h in hits:
|
| 467 |
+
ntt_id = h.chunk_id.removeprefix("case_")
|
| 468 |
+
row = con.execute("SELECT title, body, summary FROM cases WHERE ntt_id=?", (ntt_id,)).fetchone()
|
| 469 |
+
if row is None:
|
| 470 |
+
print(f" {h.distance:.4f} {h.chunk_id} (cases 에 없음)", file=sys.stderr)
|
| 471 |
+
continue
|
| 472 |
+
title = row["title"] or ""
|
| 473 |
+
body = (row["body"] or row["summary"] or "")[:80]
|
| 474 |
+
print(f" {h.distance:.4f} [{title[:50]}] {body}...", file=sys.stderr)
|
| 475 |
+
con.close()
|
| 476 |
+
return 0
|
| 477 |
+
|
| 478 |
if args.cmd == "refresh-related-laws":
|
| 479 |
from kpaa.related_laws import refresh as refresh_rl
|
| 480 |
|