Spaces:
Running
Running
| import hashlib | |
| import json | |
| import numpy as np | |
| from pathlib import Path | |
| from typing import List, Dict | |
| from sentence_transformers import SentenceTransformer | |
| import torch | |
class EmbeddingEngine:
    """Sentence-embedding wrapper with a bounded in-memory cache.

    Texts are embedded with a ``SentenceTransformer`` model running on CPU.
    Embeddings produced through the cached path are stored in ``self.cache``
    keyed by the MD5 hex digest of the text, so repeated texts are not sent
    to the model again.

    The cache has no eviction: once it holds ``cache_max_size`` entries, new
    embeddings are still returned to the caller but are no longer stored.
    """

    def __init__(self, model_name: str = "BAAI/bge-small-zh-v1.5"):
        """Load the model and initialise an empty cache.

        Args:
            model_name: Model identifier handed to ``SentenceTransformer``.
        """
        print(f"Loading model: {model_name}")
        self.model = SentenceTransformer(model_name, device="cpu")
        if hasattr(torch, 'set_num_threads'):
            # Process-wide torch setting: cap intra-op CPU threads at 2.
            torch.set_num_threads(2)
        # Ask the loaded model for its embedding width instead of assuming
        # the width of the default model; keep 512 as the fallback when the
        # model does not report one.
        reported_dimension = self.model.get_sentence_embedding_dimension()
        self.dimension = int(reported_dimension) if reported_dimension else 512
        # MD5 hex digest of the text -> float32 embedding vector.
        self.cache: Dict[str, np.ndarray] = {}
        self.cache_max_size = 50000
        print(f"Model loaded. Dimension: {self.dimension}")

    def _get_cache_key(self, text: str) -> str:
        """Return the cache key for *text*: the MD5 hex digest of its UTF-8 bytes.

        MD5 is used only to derive a fixed-length dictionary key, not for
        any security purpose.
        """
        return hashlib.md5(text.encode()).hexdigest()

    def encode(self, texts: List[str], use_cache: bool = True) -> np.ndarray:
        """Embed *texts* and return one float32 row per input, in input order.

        Args:
            texts: Strings to embed.
            use_cache: When true, reuse cached embeddings and only send
                texts that are not cached yet to the model (each distinct
                text at most once per call). When false, the cache is
                neither read nor written.

        Returns:
            A float32 array with ``len(texts)`` rows. Empty input yields an
            array of shape ``(0, self.dimension)``.
        """
        if len(texts) == 0:
            # Keep the result 2-D so callers can rely on the column count
            # without invoking the model on an empty batch.
            return np.empty((0, self.dimension), dtype=np.float32)

        if not use_cache:
            return self.model.encode(
                texts,
                normalize_embeddings=True,
                show_progress_bar=False
            ).astype(np.float32)

        # Hash every text exactly once; the keys drive lookup, insertion and
        # the final ordered assembly.
        keys = [self._get_cache_key(text) for text in texts]

        # Distinct texts that still need the model, in first-seen order.
        pending: Dict[str, str] = {}
        for key, text in zip(keys, texts):
            if key not in self.cache and key not in pending:
                pending[key] = text

        # Embeddings computed during this call. Kept separately from the
        # cache because a full cache does not accept new entries.
        fresh: Dict[str, np.ndarray] = {}
        if pending:
            new_embeddings = self.model.encode(
                list(pending.values()),
                normalize_embeddings=True,
                show_progress_bar=False
            )
            for key, embedding in zip(pending, new_embeddings):
                # Independent float32 copy: a row view would keep the whole
                # batch array alive for as long as the entry is cached.
                vector = np.array(embedding, dtype=np.float32)
                fresh[key] = vector
                if len(self.cache) < self.cache_max_size:
                    self.cache[key] = vector

        return np.array(
            [fresh[key] if key in fresh else self.cache[key] for key in keys],
            dtype=np.float32,
        )

    def encode_single(self, text: str) -> np.ndarray:
        """Embed a single *text* (via the cached path) and return its vector."""
        return self.encode([text])[0]