"""
Embedding Manager - Centralized Embedding Management

Fixes the critical embedding consistency issue where different components
were using different embedding methods (random vs real embeddings).
"""

import os
import logging
from typing import Optional, List
import numpy as np

logger = logging.getLogger(__name__)


class EmbeddingManager:
    """Centralized embedding management to ensure consistency across all components.

    The class is a process-wide singleton: every ``EmbeddingManager()`` call
    returns the same instance, which selects its backend once:

    * ``"openai"`` (dimension 1536) when ``OPENAI_API_KEY`` is set and the
      ``openai`` package can be imported;
    * ``"local"`` (dimension 384) when ``sentence_transformers`` can be
      imported;
    * ``"none"`` (dimension 0) otherwise.

    Embedding calls never raise on backend failure; they log the error and
    return zero vectors instead.
    """

    _instance: Optional['EmbeddingManager'] = None

    # Length of the zero vector handed out when no backend is available
    # (``dimension`` is 0 in that case).
    _FALLBACK_DIMENSION = 384

    # Model requested from the OpenAI embeddings endpoint.
    _OPENAI_MODEL = "text-embedding-3-small"

    def __new__(cls):
        if cls._instance is None:
            instance = super().__new__(cls)
            instance._initialize()
            # Publish the singleton only after initialization succeeded, so a
            # failure here cannot leave a half-built instance behind for every
            # later caller.
            cls._instance = instance
        return cls._instance

    def _initialize(self):
        """Initialize the embedding backend once.

        Sets ``method`` and ``dimension`` and creates either the OpenAI client
        or the local model.
        """
        self._client = None
        self._model = None

        if not os.getenv("OPENAI_API_KEY"):
            self._setup_local_embeddings()
            return

        try:
            from openai import OpenAI
            self._client = OpenAI()
            self.method = "openai"
            self.dimension = 1536
            logger.info("Using OpenAI embeddings")
        except ImportError:
            logger.warning("OpenAI not available, falling back to local embeddings")
            self._setup_local_embeddings()

    def _setup_local_embeddings(self):
        """Set up local sentence transformer embeddings.

        When ``sentence_transformers`` cannot be imported the manager is put
        into the ``"none"`` state (dimension 0).
        """
        try:
            from sentence_transformers import SentenceTransformer
            self._model = SentenceTransformer('all-MiniLM-L6-v2')
            self.method = "local"
            self.dimension = 384
            logger.info("Using local sentence transformer embeddings")
        except ImportError:
            logger.error("No embedding models available!")
            self.method = "none"
            self.dimension = 0

    def _zero_vector(self) -> List[float]:
        """Return the zero vector used as fallback for one text.

        Its length is ``dimension``, or ``_FALLBACK_DIMENSION`` when no backend
        is available and ``dimension`` is 0.
        """
        return [0.0] * (self.dimension or self._FALLBACK_DIMENSION)

    def embed(self, text: str) -> List[float]:
        """Get the embedding for a single text.

        Args:
            text: Text to embed.

        Returns:
            The embedding as a list of floats. A zero vector is returned when
            ``text`` is empty, when no backend is available, or when the
            backend call fails (the failure is logged).
        """
        if not text or self.method == "none":
            # Return zero vector as fallback
            return self._zero_vector()

        if self.method == "openai":
            try:
                response = self._client.embeddings.create(
                    model=self._OPENAI_MODEL,
                    input=text,
                )
                return response.data[0].embedding
            except Exception as e:
                # Deliberate best-effort: callers get a zero vector, not an error.
                logger.error("OpenAI embedding failed: %s", e)
                return self._zero_vector()

        try:
            return self._model.encode(text).tolist()
        except Exception as e:
            logger.error("Local embedding failed: %s", e)
            return self._zero_vector()

    def _embed_non_empty(self, texts: List[str]) -> Optional[List[List[float]]]:
        """Embed non-empty texts with the active backend.

        Returns one vector per text, or ``None`` when the backend call fails
        (the failure is logged).
        """
        if self.method == "openai":
            try:
                response = self._client.embeddings.create(
                    model=self._OPENAI_MODEL,
                    input=texts,
                )
                return [item.embedding for item in response.data]
            except Exception as e:
                logger.error("OpenAI batch embedding failed: %s", e)
                return None

        try:
            return self._model.encode(texts).tolist()
        except Exception as e:
            logger.error("Local batch embedding failed: %s", e)
            return None

    def embed_batch(self, texts: List[str]) -> List[List[float]]:
        """Get embeddings for multiple texts (more efficient).

        Each item is treated as ``embed`` would treat it: empty items and the
        no-backend state yield zero vectors, and only non-empty items are sent
        to the backend in a single call.

        Args:
            texts: Texts to embed.

        Returns:
            One vector per input text, in input order. All vectors are zero
            vectors when the backend call fails (the failure is logged).
        """
        if not texts:
            return []

        # Start from the fallback and overwrite the slots we can really embed.
        vectors = [self._zero_vector() for _ in texts]
        if self.method == "none":
            return vectors

        # Empty strings are kept away from the backend: embed() maps them to
        # a zero vector, and one empty item would fail a whole OpenAI request.
        positions = [index for index, text in enumerate(texts) if text]
        if not positions:
            return vectors

        embedded = self._embed_non_empty([texts[index] for index in positions])
        if embedded is None:
            return vectors

        for index, vector in zip(positions, embedded):
            vectors[index] = vector
        return vectors

    def get_dimension(self) -> int:
        """Get embedding dimension (0 when no backend is available)."""
        return self.dimension

    def get_method(self) -> str:
        """Get embedding method being used: "openai", "local" or "none"."""
        return self.method


# Global embedding manager instance
embedding_manager = EmbeddingManager()


def get_embedding_manager() -> EmbeddingManager:
    """Get the global embedding manager instance."""
    return embedding_manager