Hybrid RAG: BM25+Dense (sqlite-vec/BGE-M3) + cross-encoder reranker (bge-reranker-v2-m3)
Browse files- src/kpaa/guides/index.py +34 -0
src/kpaa/guides/index.py
CHANGED
|
@@ -159,6 +159,40 @@ def search(
|
|
| 159 |
]
|
| 160 |
|
| 161 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
def count(db_path: Path | None = None) -> int:
|
| 163 |
path = db_path or default_db_path()
|
| 164 |
if not path.exists():
|
|
|
|
| 159 |
]
|
| 160 |
|
| 161 |
|
| 162 |
+
def get_chunks(
    chunk_ids: Iterable[str], *, db_path: Path | None = None
) -> dict[str, GuideChunk]:
    """Look up guide chunks by id and return them keyed by ``chunk_id``.

    Used by hybrid retrieval to restore dense-search results (which carry
    only chunk ids) into full ``GuideChunk`` objects.

    Args:
        chunk_ids: Ids to fetch. Each one is coerced with ``str()``;
            duplicates are queried only once.
        db_path: Database file to read. Falls back to ``default_db_path()``
            when omitted.

    Returns:
        ``{chunk_id: GuideChunk}`` for every id found in the ``guides``
        table. Ids with no matching row are absent from the dict. An empty
        dict is returned when no ids are given or the database file does
        not exist.
    """
    # Deduplicate (keeping first-seen order) so repeated ids do not consume
    # extra bound parameters; the result is keyed by id anyway.
    ids = list(dict.fromkeys(str(i) for i in chunk_ids))
    if not ids:
        return {}
    path = db_path or default_db_path()
    if not path.exists():
        return {}

    # SQLite limits the number of host parameters per statement (999 by
    # default before 3.32.0). Querying in batches keeps large id lists from
    # failing with "too many SQL variables".
    batch_size = 900
    rows = []
    conn = _connect(path)
    try:
        for start in range(0, len(ids), batch_size):
            batch = ids[start:start + batch_size]
            placeholders = ",".join("?" * len(batch))
            rows.extend(
                conn.execute(
                    f"SELECT * FROM guides WHERE chunk_id IN ({placeholders})",
                    batch,
                ).fetchall()
            )
    finally:
        conn.close()

    return {
        r["chunk_id"]: GuideChunk(
            chunk_id=r["chunk_id"],
            doc_id=r["doc_id"],
            doc_title=r["doc_title"],
            doc_date=r["doc_date"],
            section=r["section"],
            chunk_no=r["chunk_no"],
            body=r["body"],
            pages=r["pages"] or "",
            source_pdf=r["source_pdf"],
            # Tolerate a guides table that has no chunk_context column, and
            # normalise a NULL value to an empty string.
            chunk_context=(
                r["chunk_context"] if "chunk_context" in r.keys() else ""
            ) or "",
        )
        for r in rows
    }
|
| 194 |
+
|
| 195 |
+
|
| 196 |
def count(db_path: Path | None = None) -> int:
|
| 197 |
path = db_path or default_db_path()
|
| 198 |
if not path.exists():
|