scvcoder committed on
Commit
cd6cb3c
·
verified ·
1 Parent(s): ca5c4c2

Hybrid RAG: BM25+Dense (sqlite-vec/BGE-M3) + cross-encoder reranker (bge-reranker-v2-m3)

Browse files
Files changed (1) hide show
  1. src/kpaa/embeddings/embedder.py +81 -0
src/kpaa/embeddings/embedder.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """sentence-transformers 기반 임베더 wrapper (lazy singleton).
2
+
3
+ 기본 모델: BAAI/bge-m3 (1024 dim, multilingual). 한국어 SOTA급.
4
+
5
+ 디바이스 자동 감지 (CUDA → MPS → CPU). KPAA_EMBED_DEVICE 로 강제 가능.
6
+ 모델 로드는 첫 호출 시 lazy — import 만으로는 다운로드 발생 X.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ import os
12
+ from functools import cached_property
13
+ from typing import ClassVar, TYPE_CHECKING
14
+
15
+ if TYPE_CHECKING:
16
+ import numpy as np
17
+ from sentence_transformers import SentenceTransformer
18
+
19
# Module-level logger. The name is spelled out explicitly instead of being
# derived from __name__.
logger = logging.getLogger("kpaa.embeddings.embedder")

# Model used when neither the caller nor the KPAA_EMBEDDER environment
# variable selects one.
_DEFAULT_MODEL = "BAAI/bge-m3"

# Known embedding widths keyed by model name. Lets the dimension be reported
# without loading the model; models missing here are asked directly.
_DIM_BY_MODEL: dict[str, int] = {
    "BAAI/bge-m3": 1024,
}
25
+
26
+
27
def _detect_device() -> str:
    """Choose the device string the embedding model is loaded on.

    The ``KPAA_EMBED_DEVICE`` environment variable forces a device when it is
    set to anything other than ``auto``. The value is stripped of surrounding
    whitespace and lowercased before use; an unset, empty or whitespace-only
    value is treated as ``auto``.

    In ``auto`` mode the first available backend wins, in the order
    CUDA, MPS, CPU.

    Returns:
        The forced device string, or one of ``"cuda"``, ``"mps"``, ``"cpu"``.
    """
    forced = os.environ.get("KPAA_EMBED_DEVICE", "").strip().lower()
    if forced and forced != "auto":
        return forced

    # Imported here so a forced device never requires importing torch.
    import torch

    if torch.cuda.is_available():
        return "cuda"

    # Not every torch build exposes ``torch.backends.mps``; a missing backend
    # is treated the same as an unavailable one.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"

    return "cpu"
37
+
38
+
39
+ class Embedder:
40
+ """BGE-M3 (또는 KPAA_EMBEDDER 지정 모델) singleton."""
41
+
42
+ _instance: ClassVar["Embedder | None"] = None
43
+
44
+ def __init__(self, model_name: str | None = None, device: str | None = None) -> None:
45
+ self.model_name = model_name or os.environ.get("KPAA_EMBEDDER", _DEFAULT_MODEL)
46
+ self.device = device or _detect_device()
47
+
48
+ @classmethod
49
+ def default(cls) -> "Embedder":
50
+ if cls._instance is None:
51
+ cls._instance = cls()
52
+ return cls._instance
53
+
54
+ @cached_property
55
+ def model(self) -> "SentenceTransformer":
56
+ from sentence_transformers import SentenceTransformer
57
+ logger.info("Loading embedding model %s on %s ...", self.model_name, self.device)
58
+ return SentenceTransformer(self.model_name, device=self.device)
59
+
60
+ @property
61
+ def dim(self) -> int:
62
+ return _DIM_BY_MODEL.get(self.model_name) or self.model.get_sentence_embedding_dimension()
63
+
64
+ def encode_chunks(self, texts: list[str], *, batch: int = 32, show_progress: bool = True) -> "np.ndarray":
65
+ """문서 측 임베딩. cosine 검색 위해 정규화."""
66
+ return self.model.encode(
67
+ texts,
68
+ batch_size=batch,
69
+ normalize_embeddings=True,
70
+ show_progress_bar=show_progress,
71
+ convert_to_numpy=True,
72
+ )
73
+
74
+ def encode_query(self, text: str) -> "np.ndarray":
75
+ """쿼리 측 임베딩."""
76
+ return self.model.encode(
77
+ text,
78
+ normalize_embeddings=True,
79
+ convert_to_numpy=True,
80
+ show_progress_bar=False,
81
+ )