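"""Knowledge base construction: load, chunk, cache, and index company documents
into a FAISS vector store for retrieval."""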
import pickle
import logging
from pathlib import Path
from typing import List, Optional

from langchain.schema import Document
from langchain_community.vectorstores import FAISS

from config import EMBEDDING_MODEL, VECTOR_STORE_DIR, CHUNKS_PATH
from data_loaders import load_company_info, load_faq_documents
from text_processor import markdown_splitter, recursive_500

logger = logging.getLogger(__name__)
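# The constants imported from config are assumed to look roughly like this
# (illustrative only; the real definitions live in config.py and may differ):
#   EMBEDDING_MODEL  - a LangChain embeddings instance (e.g. OpenAIEmbeddings
#                      or HuggingFaceEmbeddings), passed to FAISS as-is
#   VECTOR_STORE_DIR - directory path for the saved FAISS index
#   CHUNKS_PATH      - pickle file path for cached document chunks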

def load_company_vector_store() -> Optional[FAISS]:
    """Load existing vector store with proper error handling"""
    try:
        if Path(VECTOR_STORE_DIR).exists():
            vector_store = FAISS.load_local(
                str(VECTOR_STORE_DIR),
                EMBEDDING_MODEL,
                allow_dangerous_deserialization=True,
            )
            logger.info("Successfully loaded existing vector store")
            return vector_store
        else:
            logger.info("No existing vector store found")
            return None
    except Exception as e:
        logger.error(f"Failed to load vector store: {e}")
        return None

def create_company_vector_store(documents: List[Document]) -> Optional[FAISS]:
    """Create and save vector store with error handling"""
    if not documents:
        logger.error("No documents provided to create vector store")
        return None
    try:
        # Ensure directory exists
        Path(VECTOR_STORE_DIR).mkdir(parents=True, exist_ok=True)
        vector_store = FAISS.from_documents(documents, EMBEDDING_MODEL)
        vector_store.save_local(str(VECTOR_STORE_DIR))
        logger.info(f"Successfully created and saved vector store with {len(documents)} documents")
        return vector_store
    except Exception as e:
        logger.error(f"Failed to create vector store: {e}")
        return None

def create_company_documents() -> List[Document]:
    """Create company documents with error handling"""
    try:
        company_documents = []
        # Load FAQ documents
        try:
            faq_docs = load_faq_documents()
            company_documents.extend(faq_docs)
            logger.info(f"Loaded {len(faq_docs)} FAQ documents")
        except Exception as e:
            logger.error(f"Failed to load FAQ documents: {e}")
        # Load company info
        try:
            company_info = load_company_info()
            if company_info:
                company_documents.append(company_info)
                logger.info("Loaded company info document")
        except Exception as e:
            logger.error(f"Failed to load company info: {e}")
        logger.info(f"Total documents loaded: {len(company_documents)}")
        return company_documents
    except Exception as e:
        logger.error(f"Failed to create company documents: {e}")
        return []

def split_documents(company_documents: List[Document]) -> List[Document]:
    """Split documents into chunks with error handling"""
    if not company_documents:
        logger.warning("No documents provided for splitting")
        return []
    company_chunks = []
    try:
        for i, doc in enumerate(company_documents):
            try:
                if doc.metadata.get("type") == "general_info":
                    # Use markdown splitter for info.md
                    split_docs = markdown_splitter.split_text(doc.page_content)
                    for d in split_docs:
                        d.metadata.update(doc.metadata)
                    company_chunks.extend(split_docs)
                    logger.debug(f"Split document {i} using markdown splitter")
                else:
                    # Use recursive splitter for FAQs
                    split_docs = recursive_500.split_documents([doc])
                    company_chunks.extend(split_docs)
                    logger.debug(f"Split document {i} using recursive splitter")
            except Exception as e:
                logger.error(f"Failed to split document {i}: {e}")
                continue
        logger.info(f"Successfully split {len(company_documents)} documents into {len(company_chunks)} chunks")
        return company_chunks
    except Exception as e:
        logger.error(f"Failed to split documents: {e}")
        return []

def load_chunks() -> Optional[List[Document]]:
    """Load pre-processed chunks with error handling"""
    try:
        if Path(CHUNKS_PATH).exists():
            with open(CHUNKS_PATH, 'rb') as f:
                company_chunks = pickle.load(f)
            logger.info(f"Successfully loaded {len(company_chunks)} chunks from cache")
            return company_chunks
        else:
            logger.info("No cached chunks found")
            return None
    except Exception as e:
        logger.error(f"Failed to load chunks: {e}")
        return None

def save_chunks(chunks: List[Document]) -> bool:
    """Save processed chunks to file"""
    try:
        # Ensure directory exists
        Path(CHUNKS_PATH).parent.mkdir(parents=True, exist_ok=True)
        with open(CHUNKS_PATH, 'wb') as f:
            pickle.dump(chunks, f)
        logger.info(f"Successfully saved {len(chunks)} chunks to {CHUNKS_PATH}")
        return True
    except Exception as e:
        logger.error(f"Failed to save chunks: {e}")
        return False

def initialize_knowledge_base() -> Optional[FAISS]:
    """Initialize the complete knowledge base"""
    try:
        # Try to load existing vector store
        vector_store = load_company_vector_store()
        if vector_store:
            return vector_store
        # If no existing store, create new one
        logger.info("Creating new knowledge base...")
        # Load or create chunks
        chunks = load_chunks()
        if not chunks:
            logger.info("No cached chunks found, processing documents...")
            documents = create_company_documents()
            if documents:
                chunks = split_documents(documents)
                if chunks:
                    save_chunks(chunks)
        if chunks:
            vector_store = create_company_vector_store(chunks)
            return vector_store
        else:
            logger.error("No chunks available to create vector store")
            return None
    except Exception as e:
        logger.error(f"Failed to initialize knowledge base: {e}")
        return None
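
# Minimal usage sketch (illustrative, not part of the module's API): build or
# load the knowledge base and run a similarity search against it. The sample
# query string and k=3 are assumptions made for demonstration only.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    store = initialize_knowledge_base()
    if store is None:
        raise SystemExit("Knowledge base could not be initialized")

    # Print the top matching chunks for a hypothetical query.
    for doc in store.similarity_search("What services does the company offer?", k=3):
        print(doc.metadata.get("type"), "->", doc.page_content[:120])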