# File-viewer header captured with this file — Spaces: Sleeping | File size: 7,632 Bytes | b35e487
"""
Embedding Manager Module
Handles text embeddings and vector database operations using sentence-transformers and FAISS
"""
import logging
import os
from typing import List, Optional
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
class EmbeddingManager:
    """Manage a text-embedding model and a FAISS vector store built with it.

    The manager loads a HuggingFace embedding model on construction, and can
    then build a FAISS index from documents, query it, and persist/restore it
    on disk.

    Attributes are documented inline in ``__init__``.
    """

    # Hub namespace used when ``model_name`` is a bare name with no "/" in it.
    _DEFAULT_NAMESPACE = "sentence-transformers"

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager and load the embedding model.

        Args:
            model_name: Name of the sentence-transformer model to use. A bare
                name (no "/") is looked up under ``sentence-transformers/``;
                a name that already contains "/" is used unchanged.

        Raises:
            Exception: Whatever the embedding backend raises while loading the
                model is logged and re-raised.
        """
        self.model_name = model_name
        # Embedding wrapper; set by _initialize_embeddings().
        self.embeddings: Optional[HuggingFaceEmbeddings] = None
        # FAISS store; set by create_knowledge_base() / load_knowledge_base().
        self.vector_store: Optional[FAISS] = None

        # Configure logging (basicConfig does nothing if the root logger
        # already has handlers, so repeated construction is harmless).
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        self._initialize_embeddings()

    @staticmethod
    def _resolve_model_id(model_name: str) -> str:
        """Return the model identifier handed to the embedding backend.

        Bare names get the default namespace prefix; identifiers that already
        contain a "/" (an ``org/model`` id or a filesystem path) are returned
        unchanged so they are not double-prefixed.
        """
        if "/" in model_name:
            return model_name
        return f"{EmbeddingManager._DEFAULT_NAMESPACE}/{model_name}"

    def _require_vector_store(self) -> "FAISS":
        """Return the current vector store or raise if none has been set.

        Raises:
            ValueError: If no knowledge base has been created or loaded yet.
        """
        if self.vector_store is None:
            raise ValueError("Knowledge base not initialized")
        return self.vector_store

    @staticmethod
    def _validate_k(k: int) -> None:
        """Reject result counts that cannot produce a meaningful search.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k}")

    def _initialize_embeddings(self):
        """Load the embedding model into ``self.embeddings``.

        Raises:
            Exception: Any loading error is logged and re-raised.
        """
        try:
            self.logger.info("Loading embedding model: %s", self.model_name)
            # Use the HuggingFaceEmbeddings wrapper for LangChain integration.
            self.embeddings = HuggingFaceEmbeddings(
                model_name=self._resolve_model_id(self.model_name),
                model_kwargs={'device': 'cpu'},  # Use CPU for compatibility
                encode_kwargs={'normalize_embeddings': True},
            )
            self.logger.info("Embedding model loaded successfully")
        except Exception as e:
            self.logger.error("Error loading embedding model: %s", e)
            raise

    def create_knowledge_base(self, documents: "List[Document]") -> "FAISS":
        """
        Create a FAISS knowledge base from documents.

        The new store replaces any store previously held by this manager.

        Args:
            documents: List of Document objects; their ``page_content`` is
                embedded and their ``metadata`` is stored alongside.

        Returns:
            The newly created FAISS vector store.

        Raises:
            ValueError: If ``documents`` is empty.
            Exception: Any error raised while building the index is logged
                and re-raised.
        """
        try:
            if not documents:
                raise ValueError("No documents provided")

            self.logger.info(
                "Creating knowledge base with %d documents", len(documents)
            )

            texts = [doc.page_content for doc in documents]
            metadatas = [doc.metadata for doc in documents]

            self.vector_store = FAISS.from_texts(
                texts=texts,
                embedding=self.embeddings,
                metadatas=metadatas,
            )
            self.logger.info("Knowledge base created successfully")
            return self.vector_store
        except Exception as e:
            self.logger.error("Error creating knowledge base: %s", e)
            raise

    def similarity_search(self, query: str, k: int = 4) -> "List[Document]":
        """
        Perform a similarity search on the knowledge base.

        Args:
            query: Search query.
            k: Number of similar documents to return; must be at least 1.

        Returns:
            List of similar documents as returned by the vector store.

        Raises:
            ValueError: If the knowledge base is not initialized or ``k`` is
                smaller than 1.
            Exception: Any error raised by the vector store is logged and
                re-raised.
        """
        try:
            store = self._require_vector_store()
            self._validate_k(k)

            # Only the first 50 characters of the query are logged.
            self.logger.info("Performing similarity search for: %s...", query[:50])
            similar_docs = store.similarity_search(query, k=k)
            self.logger.info("Found %d similar documents", len(similar_docs))
            return similar_docs
        except Exception as e:
            self.logger.error("Error in similarity search: %s", e)
            raise

    def similarity_search_with_score(self, query: str, k: int = 4) -> List[tuple]:
        """
        Perform a similarity search and return each hit with its score.

        Args:
            query: Search query.
            k: Number of similar documents to return; must be at least 1.

        Returns:
            List of (document, score) tuples as returned by the vector store.

        Raises:
            ValueError: If the knowledge base is not initialized or ``k`` is
                smaller than 1.
            Exception: Any error raised by the vector store is logged and
                re-raised.
        """
        try:
            store = self._require_vector_store()
            self._validate_k(k)

            # Only the first 50 characters of the query are logged.
            self.logger.info(
                "Performing similarity search with scores for: %s...", query[:50]
            )
            similar_docs_with_scores = store.similarity_search_with_score(query, k=k)
            self.logger.info(
                "Found %d similar documents", len(similar_docs_with_scores)
            )
            return similar_docs_with_scores
        except Exception as e:
            self.logger.error("Error in similarity search with scores: %s", e)
            raise

    def save_knowledge_base(self, path: str = "knowledge_base"):
        """
        Save the knowledge base to disk.

        Args:
            path: Directory path to save the knowledge base to; it is created
                if it does not exist.

        Raises:
            ValueError: If the knowledge base is not initialized.
            Exception: Any error raised while writing is logged and re-raised.
        """
        try:
            store = self._require_vector_store()

            self.logger.info("Saving knowledge base to: %s", path)
            os.makedirs(path, exist_ok=True)
            store.save_local(path)
            self.logger.info("Knowledge base saved successfully")
        except Exception as e:
            self.logger.error("Error saving knowledge base: %s", e)
            raise

    def load_knowledge_base(
        self,
        path: str = "knowledge_base",
        *,
        allow_dangerous_deserialization: bool = False,
    ) -> "FAISS":
        """
        Load the knowledge base from disk.

        Args:
            path: Directory path to load the knowledge base from.
            allow_dangerous_deserialization: Forwarded to ``FAISS.load_local``
                only when True. Per the LangChain FAISS reference, current
                releases refuse to load unless this is set, because the saved
                store contains a pickle file. Leave it False (the previous
                behaviour of this method) unless the files at ``path`` were
                written by a trusted source.

        Returns:
            The loaded FAISS vector store.

        Raises:
            FileNotFoundError: If ``path`` does not exist.
            Exception: Any error raised while loading is logged and re-raised.
        """
        try:
            self.logger.info("Loading knowledge base from: %s", path)

            if not os.path.exists(path):
                raise FileNotFoundError(f"Knowledge base not found at: {path}")

            # SECURITY: loading unpickles data from disk; unpickling untrusted
            # files can execute arbitrary code. The flag is passed only on an
            # explicit opt-in so the default call is unchanged.
            load_kwargs = {}
            if allow_dangerous_deserialization:
                load_kwargs["allow_dangerous_deserialization"] = True

            self.vector_store = FAISS.load_local(path, self.embeddings, **load_kwargs)
            self.logger.info("Knowledge base loaded successfully")
            return self.vector_store
        except Exception as e:
            self.logger.error("Error loading knowledge base: %s", e)
            raise

    def get_knowledge_base_info(self) -> dict:
        """
        Get information about the knowledge base.

        Returns:
            Dictionary with knowledge base information:
            ``{"status": "not_initialized", "documents": 0}`` when no store is
            set; ``status``/``documents``/``embedding_model``/``index_type``
            when a store is set; ``{"status": "error", "error": ...}`` if
            inspecting the store fails (the error is logged, not raised).
        """
        if self.vector_store is None:
            return {"status": "not_initialized", "documents": 0}

        try:
            index = self.vector_store.index
            # Fall back to "unknown" for index objects without an ntotal attribute.
            num_docs = getattr(index, 'ntotal', "unknown")
            return {
                "status": "initialized",
                "documents": num_docs,
                "embedding_model": self.model_name,
                "index_type": type(index).__name__,
            }
        except Exception as e:
            self.logger.error("Error getting knowledge base info: %s", e)
            return {"status": "error", "error": str(e)}