| from __future__ import annotations | |
| from pathlib import Path | |
| from threading import Lock | |
| from sentence_transformers import SentenceTransformer | |
class KBEmbeddingService:
    """Singleton wrapper around a lazily loaded SentenceTransformer model.

    Every call to ``KBEmbeddingService()`` returns the same object. The
    underlying model is only constructed on the first call to
    ``load_model`` (directly, or indirectly through ``embed``,
    ``embed_batch`` or ``embedding_dimension``). All texts are stripped and
    prefixed with ``_medical_prefix`` before being encoded.
    """

    # The single shared instance; populated by the first construction.
    _instance: "KBEmbeddingService | None" = None
    # Serializes creation of the shared instance.
    _instance_lock = Lock()
    # Serializes the one-time model load.
    _model_lock = Lock()
    # Model identifier used when no local copy of the model is present.
    _model_name = "BAAI/bge-base-en-v1.5"
    # Local model directory, resolved relative to this file; preferred over
    # ``_model_name`` when it exists.
    _local_model_dir = Path(__file__).resolve().parents[2] / "models" / "BAAI__bge-base-en-v1.5"
    # Prefix prepended to every text before encoding.
    _medical_prefix = "Medical text: "

    def __new__(cls) -> "KBEmbeddingService":
        """Return the shared instance, creating it on first use."""
        if cls._instance is None:
            with cls._instance_lock:
                # Re-check under the lock: another thread may have created
                # the instance while this one was waiting.
                if cls._instance is None:
                    # Finish initializing the object BEFORE publishing it.
                    # The outer check runs without the lock, so publishing
                    # first would let another thread obtain an instance
                    # that has no ``_model`` attribute yet.
                    instance = super().__new__(cls)
                    instance._model = None
                    cls._instance = instance
        return cls._instance

    def load_model(self) -> "SentenceTransformer":
        """Return the embedding model, constructing it on the first call.

        The local model directory is used as the source when it exists;
        otherwise the model name is passed to ``SentenceTransformer``.
        """
        if self._model is None:
            with self._model_lock:
                # Re-check under the lock so the model is built only once.
                if self._model is None:
                    model_source = (
                        str(self._local_model_dir)
                        if self._local_model_dir.exists()
                        else self._model_name
                    )
                    self._model = SentenceTransformer(model_source)
                    print(f"KB Embedding model loaded: {self._model_name}")
        return self._model

    def embed(self, text: str) -> list[float]:
        """Encode a single text and return the embedding as a list.

        The text is passed through ``_prepare_text`` first, and the encoder
        is called with ``normalize_embeddings=True``.
        """
        model = self.load_model()
        embedding = model.encode(
            self._prepare_text(text),
            normalize_embeddings=True,
            show_progress_bar=False,
        )
        return embedding.tolist()

    def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """Encode several texts in one call and return one list per text.

        An empty ``texts`` returns ``[]`` without loading the model. The
        batch size passed to the encoder is capped at 64.
        """
        if not texts:
            return []
        model = self.load_model()
        embeddings = model.encode(
            [self._prepare_text(text) for text in texts],
            batch_size=min(64, len(texts)),
            normalize_embeddings=True,
            show_progress_bar=False,
        )
        return embeddings.tolist()

    def embedding_dimension(self) -> int:
        """Return the model's sentence embedding dimension as an ``int``."""
        model = self.load_model()
        return int(model.get_sentence_embedding_dimension())

    def _prepare_text(self, text: str) -> str:
        """Strip ``text`` and prepend the medical prefix.

        ``None`` or whitespace-only input yields the bare prefix.
        """
        normalized_text = (text or "").strip()
        # An empty ``normalized_text`` leaves just the prefix, so no
        # separate branch is needed for the empty case.
        return f"{self._medical_prefix}{normalized_text}"
Xet Storage Details
- Size:
- 2.3 kB
- Xet hash:
- 8d507a38a4d2d8ffeb20a102cddd1851d45d45a77b7c4999183d8901b6531890
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.