scvcoder committed on
Commit
49b0a74
·
verified ·
1 Parent(s): f64a4c2

Hybrid RAG: BM25+Dense (sqlite-vec/BGE-M3) + cross-encoder reranker (bge-reranker-v2-m3)

Browse files
Files changed (1) hide show
  1. src/kpaa/guides/index.py +34 -0
src/kpaa/guides/index.py CHANGED
@@ -159,6 +159,40 @@ def search(
159
  ]
160
 
161
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  def count(db_path: Path | None = None) -> int:
163
  path = db_path or default_db_path()
164
  if not path.exists():
 
159
  ]
160
 
161
 
162
def get_chunks(
    chunk_ids: Iterable[str], *, db_path: Path | None = None
) -> dict[str, GuideChunk]:
    """Map chunk ids to their ``GuideChunk`` records.

    Used to restore full ``GuideChunk`` objects from the dense-search side of
    hybrid retrieval, which returns chunk ids only.

    Args:
        chunk_ids: Ids to look up. Each is coerced with ``str()``; duplicate
            ids are queried only once.
        db_path: Database file to read. Falls back to ``default_db_path()``
            when omitted.

    Returns:
        ``{chunk_id: GuideChunk}`` for every id found in the ``guides`` table.
        Ids with no matching row are absent from the dict. An empty dict is
        returned when no ids are given or the database file does not exist.
    """
    # Drop duplicates (keeping first-seen order) so repeated ids do not
    # consume bound-parameter slots.
    ids = list(dict.fromkeys(str(i) for i in chunk_ids))
    if not ids:
        return {}
    path = db_path or default_db_path()
    if not path.exists():
        return {}

    # SQLite caps the number of "?" parameters per statement (999 by default
    # in older builds), so query in batches rather than one unbounded IN list.
    batch_size = 900
    rows = []
    conn = _connect(path)
    try:
        for start in range(0, len(ids), batch_size):
            batch = ids[start:start + batch_size]
            # Only the placeholder list is interpolated; the ids themselves
            # are always passed as bound parameters.
            placeholders = ",".join("?" * len(batch))
            rows.extend(
                conn.execute(
                    f"SELECT * FROM guides WHERE chunk_id IN ({placeholders})",
                    batch,
                ).fetchall()
            )
    finally:
        conn.close()

    return {
        r["chunk_id"]: GuideChunk(
            chunk_id=r["chunk_id"],
            doc_id=r["doc_id"],
            doc_title=r["doc_title"],
            doc_date=r["doc_date"],
            section=r["section"],
            chunk_no=r["chunk_no"],
            body=r["body"],
            pages=r["pages"] or "",
            source_pdf=r["source_pdf"],
            # The chunk_context column may be missing from the result row
            # (presumably an older schema - confirm); fall back to "" then,
            # and also when the stored value is NULL/empty.
            chunk_context=(
                r["chunk_context"] if "chunk_context" in r.keys() else ""
            ) or "",
        )
        for r in rows
    }
194
+
195
+
196
  def count(db_path: Path | None = None) -> int:
197
  path = db_path or default_db_path()
198
  if not path.exists():