scvcoder committed on
Commit
ec0742c
·
verified ·
1 Parent(s): ce557af

Hybrid RAG: BM25+Dense (sqlite-vec/BGE-M3) + cross-encoder reranker (bge-reranker-v2-m3)

Browse files
Files changed (1) hide show
  1. src/kpaa/cli.py +68 -0
src/kpaa/cli.py CHANGED
@@ -106,6 +106,31 @@ def _build_parser() -> argparse.ArgumentParser:
106
  )
107
  p_build_guides.add_argument("--rebuild", action="store_true", default=True)
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  sub.add_parser(
110
  "refresh-related-laws",
111
  help="privacy.go.kr 개인정보 관련 법령·행정규칙 목록 재스크래이프 (data/related_laws.yaml)",
@@ -407,6 +432,49 @@ def main(argv: list[str] | None = None) -> int:
407
  build()
408
  return 0
409
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
410
  if args.cmd == "refresh-related-laws":
411
  from kpaa.related_laws import refresh as refresh_rl
412
 
 
106
  )
107
  p_build_guides.add_argument("--rebuild", action="store_true", default=True)
108
 
109
+ p_build_embed = sub.add_parser(
110
+ "build-embeddings",
111
+ help="data/guide/chunks/*.jsonl + cases.sqlite → embeddings.sqlite (sqlite-vec, BGE-M3)",
112
+ )
113
+ p_build_embed.add_argument(
114
+ "--source", choices=["guide", "case", "all"], default="all",
115
+ help="인덱싱 대상 (기본: all)",
116
+ )
117
+ p_build_embed.add_argument(
118
+ "--force", action="store_true",
119
+ help="기존 embeddings.sqlite 삭제 후 재빌드",
120
+ )
121
+ p_build_embed.add_argument(
122
+ "--batch", type=int, default=32,
123
+ help="임베딩 batch size (기본 32)",
124
+ )
125
+
126
+ p_embed_search = sub.add_parser(
127
+ "embed-search",
128
+ help="dense 검색만 단독 테스트 (BM25/RRF/reranker 미적용)",
129
+ )
130
+ p_embed_search.add_argument("query", help="검색 질의")
131
+ p_embed_search.add_argument("--source", choices=["guide", "case"], default="guide")
132
+ p_embed_search.add_argument("-k", type=int, default=10)
133
+
134
  sub.add_parser(
135
  "refresh-related-laws",
136
  help="privacy.go.kr 개인정보 관련 법령·행정규칙 목록 재스크래이프 (data/related_laws.yaml)",
 
432
  build()
433
  return 0
434
 
435
+ if args.cmd == "build-embeddings":
436
+ import logging as _logging
437
+ _logging.basicConfig(level=_logging.INFO, format="%(message)s")
438
+ from kpaa.embeddings.index import build_embed_index, stats
439
+
440
+ n = build_embed_index(source=args.source, force=args.force, batch=args.batch)
441
+ s = stats()
442
+ print(f"새로 인덱싱: {n}건 / 합계: {s.get('by_source', {})}", file=sys.stderr)
443
+ return 0
444
+
445
+ if args.cmd == "embed-search":
446
+ from kpaa.embeddings.index import search_embed
447
+ from kpaa.guides.index import list_all_chunks
448
+
449
+ # guide chunks 로 chunk_id → section/body 미리 로드 (case 는 cases.sqlite 직접)
450
+ hits = search_embed(args.query, source_type=args.source, k=args.k)
451
+ if args.source == "guide":
452
+ chunks_by_id = {c.chunk_id: c for c in list_all_chunks()}
453
+ for h in hits:
454
+ c = chunks_by_id.get(h.chunk_id)
455
+ if c is None:
456
+ print(f" {h.distance:.4f} {h.chunk_id} (chunks 에 없음)", file=sys.stderr)
457
+ continue
458
+ print(
459
+ f" {h.distance:.4f} [{c.section[:60]}] {c.body[:80]}...",
460
+ file=sys.stderr,
461
+ )
462
+ else:
463
+ import sqlite3
464
+ from kpaa.cases.index import default_db_path as case_db_path
465
+ con = sqlite3.connect(case_db_path()); con.row_factory = sqlite3.Row
466
+ for h in hits:
467
+ ntt_id = h.chunk_id.removeprefix("case_")
468
+ row = con.execute("SELECT title, body, summary FROM cases WHERE ntt_id=?", (ntt_id,)).fetchone()
469
+ if row is None:
470
+ print(f" {h.distance:.4f} {h.chunk_id} (cases 에 없음)", file=sys.stderr)
471
+ continue
472
+ title = row["title"] or ""
473
+ body = (row["body"] or row["summary"] or "")[:80]
474
+ print(f" {h.distance:.4f} [{title[:50]}] {body}...", file=sys.stderr)
475
+ con.close()
476
+ return 0
477
+
478
  if args.cmd == "refresh-related-laws":
479
  from kpaa.related_laws import refresh as refresh_rl
480