Spaces:
Sleeping
Sleeping
| """ | |
| Embedding Manager Module | |
| Handles text embeddings and vector database operations using sentence-transformers and FAISS | |
| """ | |
| import logging | |
| import os | |
| from typing import List, Optional | |
| from sentence_transformers import SentenceTransformer | |
| from langchain.schema import Document | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
class EmbeddingManager:
    """Manage a text-embedding model and a FAISS vector store built from it.

    The manager owns one LangChain ``HuggingFaceEmbeddings`` instance and at
    most one FAISS vector store. It offers helpers to build the store from
    documents, query it, persist it to disk, reload it and inspect it.
    """

    # Hub organisation prepended to bare model names such as "all-MiniLM-L6-v2".
    _DEFAULT_MODEL_ORG = "sentence-transformers"

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize embedding manager

        Args:
            model_name: Name of the sentence transformer model to use. A bare
                name is looked up under ``sentence-transformers/``; a name
                that already contains a path separator is used unchanged.

        Raises:
            Exception: Whatever the embedding backend raises while loading
                the model (logged, then re-raised).
        """
        self.model_name = model_name
        # LangChain embeddings wrapper; set by _initialize_embeddings().
        self.embeddings = None
        # FAISS store; stays None until a knowledge base is created or loaded.
        self.vector_store = None

        # Configure logging (kept in the constructor so existing callers keep
        # getting INFO output without configuring logging themselves).
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        # Initialize embeddings
        self._initialize_embeddings()

    @classmethod
    def _resolve_model_id(cls, model_name: str) -> str:
        """Return the model identifier handed to the embedding backend.

        Bare names get the default organisation prefix. Names that already
        contain a path separator (an ``org/model`` id or a local path) are
        returned unchanged so they are not turned into an invalid id.
        """
        if "/" in model_name or os.sep in model_name:
            return model_name
        return f"{cls._DEFAULT_MODEL_ORG}/{model_name}"

    def _require_vector_store(self):
        """Return the current vector store or raise if none has been set.

        Uses an explicit ``is None`` test so that a store object that happens
        to evaluate as falsy is still treated as initialized.

        Raises:
            ValueError: If no knowledge base has been created or loaded.
        """
        if self.vector_store is None:
            raise ValueError("Knowledge base not initialized")
        return self.vector_store

    def _initialize_embeddings(self):
        """Initialize the embedding model

        Raises:
            Exception: Any error raised while constructing the embeddings
                wrapper (logged, then re-raised).
        """
        try:
            self.logger.info("Loading embedding model: %s", self.model_name)
            # Use HuggingFaceEmbeddings wrapper for better LangChain integration
            self.embeddings = HuggingFaceEmbeddings(
                model_name=self._resolve_model_id(self.model_name),
                model_kwargs={'device': 'cpu'},  # Use CPU for compatibility
                encode_kwargs={'normalize_embeddings': True},
            )
            self.logger.info("Embedding model loaded successfully")
        except Exception as e:
            self.logger.error("Error loading embedding model: %s", e)
            raise

    def create_knowledge_base(self, documents: List[Document]) -> FAISS:
        """
        Create FAISS knowledge base from documents

        Replaces any vector store previously held by this manager.

        Args:
            documents: List of Document objects

        Returns:
            FAISS vector store

        Raises:
            ValueError: If ``documents`` is empty.
        """
        try:
            if not documents:
                raise ValueError("No documents provided")
            self.logger.info(
                "Creating knowledge base with %s documents", len(documents)
            )
            # Extract texts and metadata (kept index-aligned)
            texts = [doc.page_content for doc in documents]
            metadatas = [doc.metadata for doc in documents]
            # Create FAISS vector store
            self.vector_store = FAISS.from_texts(
                texts=texts,
                embedding=self.embeddings,
                metadatas=metadatas,
            )
            self.logger.info("Knowledge base created successfully")
            return self.vector_store
        except Exception as e:
            self.logger.error("Error creating knowledge base: %s", e)
            raise

    def similarity_search(self, query: str, k: int = 4) -> List[Document]:
        """
        Perform similarity search on the knowledge base

        Args:
            query: Search query
            k: Number of similar documents to return

        Returns:
            List of similar documents

        Raises:
            ValueError: If the knowledge base has not been initialized.
        """
        try:
            store = self._require_vector_store()
            # Only the first 50 characters of the query are logged.
            self.logger.info("Performing similarity search for: %s...", query[:50])
            # Perform search
            similar_docs = store.similarity_search(query, k=k)
            self.logger.info("Found %s similar documents", len(similar_docs))
            return similar_docs
        except Exception as e:
            self.logger.error("Error in similarity search: %s", e)
            raise

    def similarity_search_with_score(self, query: str, k: int = 4) -> List[tuple]:
        """
        Perform similarity search with scores

        Args:
            query: Search query
            k: Number of similar documents to return

        Returns:
            List of (document, score) tuples

        Raises:
            ValueError: If the knowledge base has not been initialized.
        """
        try:
            store = self._require_vector_store()
            self.logger.info(
                "Performing similarity search with scores for: %s...", query[:50]
            )
            # Perform search with scores
            similar_docs_with_scores = store.similarity_search_with_score(query, k=k)
            self.logger.info(
                "Found %s similar documents", len(similar_docs_with_scores)
            )
            return similar_docs_with_scores
        except Exception as e:
            self.logger.error("Error in similarity search with scores: %s", e)
            raise

    def save_knowledge_base(self, path: str = "knowledge_base"):
        """
        Save the knowledge base to disk

        Args:
            path: Directory path to save the knowledge base

        Raises:
            ValueError: If the knowledge base has not been initialized.
        """
        try:
            store = self._require_vector_store()
            self.logger.info("Saving knowledge base to: %s", path)
            # Create directory if it doesn't exist
            os.makedirs(path, exist_ok=True)
            # Save vector store
            store.save_local(path)
            self.logger.info("Knowledge base saved successfully")
        except Exception as e:
            self.logger.error("Error saving knowledge base: %s", e)
            raise

    def load_knowledge_base(
        self,
        path: str = "knowledge_base",
        *,
        allow_dangerous_deserialization: bool = False,
    ) -> FAISS:
        """
        Load the knowledge base from disk

        Args:
            path: Directory path to load the knowledge base from
            allow_dangerous_deserialization: Forwarded to ``FAISS.load_local``
                only when True. Recent langchain-community releases refuse to
                load a saved store without it because part of the store is
                pickled. Enable it only for files you created yourself.

        Returns:
            FAISS vector store

        Raises:
            FileNotFoundError: If ``path`` does not exist.
        """
        try:
            self.logger.info("Loading knowledge base from: %s", path)
            if not os.path.exists(path):
                raise FileNotFoundError(f"Knowledge base not found at: {path}")
            # NOTE(security): loading unpickles data from `path`; never opt in
            # for directories that come from an untrusted source.
            load_kwargs = {}
            if allow_dangerous_deserialization:
                # Passed only on opt-in so the default call stays identical
                # to the two-argument form used before.
                load_kwargs["allow_dangerous_deserialization"] = True
            # Load vector store
            self.vector_store = FAISS.load_local(path, self.embeddings, **load_kwargs)
            self.logger.info("Knowledge base loaded successfully")
            return self.vector_store
        except Exception as e:
            self.logger.error("Error loading knowledge base: %s", e)
            raise

    def get_knowledge_base_info(self) -> dict:
        """
        Get information about the knowledge base

        Never raises: failures while inspecting the index are logged and
        reported in the returned dictionary.

        Returns:
            Dictionary with knowledge base information. ``status`` is one of
            ``"not_initialized"``, ``"initialized"`` or ``"error"``.
        """
        if self.vector_store is None:
            return {"status": "not_initialized", "documents": 0}
        try:
            # Get index info
            index = self.vector_store.index
            # Fall back to "unknown" when the index exposes no `ntotal`.
            num_docs = getattr(index, "ntotal", "unknown")
            return {
                "status": "initialized",
                "documents": num_docs,
                "embedding_model": self.model_name,
                "index_type": type(index).__name__,
            }
        except Exception as e:
            self.logger.error("Error getting knowledge base info: %s", e)
            return {"status": "error", "error": str(e)}