# Source: PDFPal-PDF-chatbot/modules/embedding_manager.py
# Repository page header: mahmoudalrefaey - "Upload 6 files" (commit b35e487, verified)
"""
Embedding Manager Module
Handles text embeddings and vector database operations using sentence-transformers and FAISS
"""
import logging
import os
from typing import List, Optional
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
class EmbeddingManager:
    """Manages text embeddings and vector database operations.

    Holds a LangChain ``HuggingFaceEmbeddings`` model (``self.embeddings``) and
    an optional FAISS vector store (``self.vector_store``). The store is
    ``None`` until ``create_knowledge_base`` or ``load_knowledge_base``
    succeeds.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize embedding manager

        Args:
            model_name: Name of the sentence transformer model to use. A bare
                name (no path separator) is looked up under the
                ``sentence-transformers/`` namespace; a name that already
                contains a separator (``org/model`` or a local path) is used
                as given.

        Raises:
            Exception: Whatever the embedding backend raises if the model
                cannot be loaded (logged, then re-raised).
        """
        self.model_name = model_name
        self.embeddings = None
        self.vector_store = None

        # Configure logging. basicConfig does nothing when the root logger
        # already has handlers, so an application-level config is preserved.
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        # Initialize embeddings
        self._initialize_embeddings()

    @staticmethod
    def _resolve_model_id(model_name: str) -> str:
        """Return the model identifier to hand to the embedding backend.

        Bare names such as ``all-MiniLM-L6-v2`` get the
        ``sentence-transformers/`` prefix. Names that already contain a path
        separator are returned unchanged so the prefix is never doubled.
        """
        if "/" in model_name or os.sep in model_name:
            return model_name
        return f"sentence-transformers/{model_name}"

    def _require_vector_store(self) -> FAISS:
        """Return the active vector store or raise if none has been set.

        Raises:
            ValueError: If no knowledge base has been created or loaded.
        """
        if self.vector_store is None:
            raise ValueError("Knowledge base not initialized")
        return self.vector_store

    def _initialize_embeddings(self):
        """Initialize the embedding model and store it on ``self.embeddings``."""
        try:
            self.logger.info("Loading embedding model: %s", self.model_name)
            # Use HuggingFaceEmbeddings wrapper for better LangChain integration
            self.embeddings = HuggingFaceEmbeddings(
                model_name=self._resolve_model_id(self.model_name),
                model_kwargs={'device': 'cpu'},  # Use CPU for compatibility
                encode_kwargs={'normalize_embeddings': True},
            )
            self.logger.info("Embedding model loaded successfully")
        except Exception as e:
            self.logger.error("Error loading embedding model: %s", e)
            raise

    def create_knowledge_base(self, documents: List[Document]) -> FAISS:
        """
        Create FAISS knowledge base from documents

        Replaces any vector store previously held by this manager.

        Args:
            documents: List of Document objects

        Returns:
            FAISS vector store

        Raises:
            ValueError: If ``documents`` is empty.
        """
        try:
            if not documents:
                raise ValueError("No documents provided")
            self.logger.info(
                "Creating knowledge base with %s documents", len(documents)
            )

            # Extract texts and metadata
            texts = [doc.page_content for doc in documents]
            metadatas = [doc.metadata for doc in documents]

            # Create FAISS vector store
            self.vector_store = FAISS.from_texts(
                texts=texts,
                embedding=self.embeddings,
                metadatas=metadatas,
            )
            self.logger.info("Knowledge base created successfully")
            return self.vector_store
        except Exception as e:
            self.logger.error("Error creating knowledge base: %s", e)
            raise

    def similarity_search(self, query: str, k: int = 4) -> List[Document]:
        """
        Perform similarity search on the knowledge base

        Args:
            query: Search query
            k: Number of similar documents to return

        Returns:
            List of similar documents

        Raises:
            ValueError: If the knowledge base has not been initialized.
        """
        try:
            store = self._require_vector_store()
            # Only the first 50 characters of the query are logged.
            self.logger.info("Performing similarity search for: %s...", query[:50])

            # Perform search
            similar_docs = store.similarity_search(query, k=k)
            self.logger.info("Found %s similar documents", len(similar_docs))
            return similar_docs
        except Exception as e:
            self.logger.error("Error in similarity search: %s", e)
            raise

    def similarity_search_with_score(self, query: str, k: int = 4) -> List[tuple]:
        """
        Perform similarity search with scores

        Args:
            query: Search query
            k: Number of similar documents to return

        Returns:
            List of (document, score) tuples

        Raises:
            ValueError: If the knowledge base has not been initialized.
        """
        try:
            store = self._require_vector_store()
            self.logger.info(
                "Performing similarity search with scores for: %s...", query[:50]
            )

            # Perform search with scores
            similar_docs_with_scores = store.similarity_search_with_score(query, k=k)
            self.logger.info(
                "Found %s similar documents", len(similar_docs_with_scores)
            )
            return similar_docs_with_scores
        except Exception as e:
            self.logger.error("Error in similarity search with scores: %s", e)
            raise

    def save_knowledge_base(self, path: str = "knowledge_base"):
        """
        Save the knowledge base to disk

        Args:
            path: Directory path to save the knowledge base

        Raises:
            ValueError: If the knowledge base has not been initialized.
        """
        try:
            store = self._require_vector_store()
            self.logger.info("Saving knowledge base to: %s", path)

            # Create directory if it doesn't exist
            os.makedirs(path, exist_ok=True)

            # Save vector store
            store.save_local(path)
            self.logger.info("Knowledge base saved successfully")
        except Exception as e:
            self.logger.error("Error saving knowledge base: %s", e)
            raise

    def load_knowledge_base(
        self,
        path: str = "knowledge_base",
        *,
        allow_dangerous_deserialization: bool = False,
    ) -> FAISS:
        """
        Load the knowledge base from disk

        Args:
            path: Directory path to load the knowledge base from
            allow_dangerous_deserialization: Forwarded to ``FAISS.load_local``
                when True. Recent ``langchain_community`` releases refuse to
                load a saved store without this opt-in because the docstore
                is unpickled. SECURITY: unpickling can execute arbitrary
                code, so enable this only for directories this application
                wrote itself (e.g. via ``save_knowledge_base``). When False
                the argument is not passed at all, which keeps older
                releases that do not know the keyword working.

        Returns:
            FAISS vector store

        Raises:
            FileNotFoundError: If ``path`` does not exist.
        """
        try:
            self.logger.info("Loading knowledge base from: %s", path)
            if not os.path.exists(path):
                raise FileNotFoundError(f"Knowledge base not found at: {path}")

            # Only forward the opt-in when requested (see docstring).
            load_kwargs = {}
            if allow_dangerous_deserialization:
                load_kwargs["allow_dangerous_deserialization"] = True

            # Load vector store
            self.vector_store = FAISS.load_local(
                path, self.embeddings, **load_kwargs
            )
            self.logger.info("Knowledge base loaded successfully")
            return self.vector_store
        except Exception as e:
            self.logger.error("Error loading knowledge base: %s", e)
            raise

    def get_knowledge_base_info(self) -> dict:
        """
        Get information about the knowledge base

        Returns:
            Dictionary with knowledge base information. ``status`` is one of
            ``"not_initialized"``, ``"initialized"`` or ``"error"``. When
            initialized, ``documents`` is the index's ``ntotal`` (or
            ``"unknown"`` if the index has no such attribute).
        """
        if self.vector_store is None:
            return {"status": "not_initialized", "documents": 0}
        try:
            # Get index info
            index = self.vector_store.index
            return {
                "status": "initialized",
                "documents": getattr(index, "ntotal", "unknown"),
                "embedding_model": self.model_name,
                "index_type": type(index).__name__,
            }
        except Exception as e:
            # Best-effort introspection: report the failure instead of raising.
            self.logger.error("Error getting knowledge base info: %s", e)
            return {"status": "error", "error": str(e)}