Spaces:
Sleeping
Sleeping
File size: 6,445 Bytes
c5e1945 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 |
import pickle
import logging
from pathlib import Path
from typing import List, Optional
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from config import EMBEDDING_MODEL, VECTOR_STORE_DIR, CHUNKS_PATH
from data_loaders import load_company_info, load_faq_documents
from text_processor import markdown_splitter, recursive_500
logger = logging.getLogger(__name__)
def load_company_vector_store() -> Optional[FAISS]:
    """Restore a previously persisted FAISS index from VECTOR_STORE_DIR.

    Returns:
        The loaded vector store, or None when no store exists on disk or
        loading fails (failures are logged, never raised).
    """
    try:
        # Guard clause: nothing to load if the directory was never created.
        if not Path(VECTOR_STORE_DIR).exists():
            logger.info("No existing vector store found")
            return None
        # allow_dangerous_deserialization is needed because FAISS persistence
        # uses pickle; acceptable only because we wrote this store ourselves.
        store = FAISS.load_local(
            str(VECTOR_STORE_DIR),
            EMBEDDING_MODEL,
            allow_dangerous_deserialization=True
        )
        logger.info("Successfully loaded existing vector store")
        return store
    except Exception as e:
        logger.error(f"Failed to load vector store: {e}")
        return None
def create_company_vector_store(documents: List[Document]) -> Optional[FAISS]:
    """Build a FAISS index from *documents* and persist it to VECTOR_STORE_DIR.

    Args:
        documents: Documents to embed and index; must be non-empty.

    Returns:
        The new vector store, or None when input is empty or creation fails.
    """
    if not documents:
        logger.error("No documents provided to create vector store")
        return None
    try:
        # Make sure the target directory exists before save_local writes to it.
        store_dir = Path(VECTOR_STORE_DIR)
        store_dir.mkdir(parents=True, exist_ok=True)
        store = FAISS.from_documents(documents, EMBEDDING_MODEL)
        store.save_local(str(VECTOR_STORE_DIR))
        logger.info(f"Successfully created and saved vector store with {len(documents)} documents")
        return store
    except Exception as e:
        logger.error(f"Failed to create vector store: {e}")
        return None
def create_company_documents() -> List[Document]:
    """Collect all source documents for the knowledge base.

    Loads FAQ documents and the company-info document independently, so a
    failure in one source does not prevent loading the other.

    Returns:
        All successfully loaded documents ([] when everything fails).
    """
    try:
        collected: List[Document] = []
        # FAQ source — best effort, failure is logged and skipped.
        try:
            faq_docs = load_faq_documents()
            collected.extend(faq_docs)
            logger.info(f"Loaded {len(faq_docs)} FAQ documents")
        except Exception as e:
            logger.error(f"Failed to load FAQ documents: {e}")
        # Company-info source — loader may return a falsy value when absent.
        try:
            info_doc = load_company_info()
            if info_doc:
                collected.append(info_doc)
                logger.info("Loaded company info document")
        except Exception as e:
            logger.error(f"Failed to load company info: {e}")
        logger.info(f"Total documents loaded: {len(collected)}")
        return collected
    except Exception as e:
        logger.error(f"Failed to create company documents: {e}")
        return []
def split_documents(company_documents: List[Document]) -> List[Document]:
    """Split documents into retrieval-sized chunks.

    Documents whose metadata has ``type == "general_info"`` (the info.md
    document) are split on markdown headers; all other documents (FAQs) go
    through the recursive character splitter. A document that fails to split
    is logged and skipped without aborting the rest.

    Args:
        company_documents: Source documents to chunk.

    Returns:
        The resulting chunk documents ([] when input is empty or splitting
        fails entirely).
    """
    if not company_documents:
        logger.warning("No documents provided for splitting")
        return []
    company_chunks: List[Document] = []
    split_count = 0  # documents actually split; failed ones are skipped
    try:
        for i, doc in enumerate(company_documents):
            try:
                if doc.metadata.get("type") == "general_info":
                    # Markdown-aware splitting keeps section structure of info.md.
                    split_docs = markdown_splitter.split_text(doc.page_content)
                    for d in split_docs:
                        # Propagate the source document's metadata onto each
                        # chunk; source keys win over header keys on conflict.
                        d.metadata.update(doc.metadata)
                    company_chunks.extend(split_docs)
                    logger.debug(f"Split document {i} using markdown splitter")
                else:
                    # Plain size-based splitting for FAQ documents.
                    split_docs = recursive_500.split_documents([doc])
                    company_chunks.extend(split_docs)
                    logger.debug(f"Split document {i} using recursive splitter")
                split_count += 1
            except Exception as e:
                logger.error(f"Failed to split document {i}: {e}")
                continue
        # Bug fix: report only documents that were actually split, not the raw
        # input count (the old message over-reported when some documents failed).
        logger.info(f"Successfully split {split_count} documents into {len(company_chunks)} chunks")
        return company_chunks
    except Exception as e:
        logger.error(f"Failed to split documents: {e}")
        return []
def load_chunks() -> Optional[List[Document]]:
    """Restore cached document chunks from the pickle file at CHUNKS_PATH.

    Returns:
        The cached chunk list, or None when no cache exists or loading fails.
    """
    try:
        # No cache file means a cold start — not an error.
        if not Path(CHUNKS_PATH).exists():
            logger.info("No cached chunks found")
            return None
        # NOTE: pickle is only safe here because we produced this file ourselves.
        with open(CHUNKS_PATH, 'rb') as f:
            cached_chunks = pickle.load(f)
        logger.info(f"Successfully loaded {len(cached_chunks)} chunks from cache")
        return cached_chunks
    except Exception as e:
        logger.error(f"Failed to load chunks: {e}")
        return None
def save_chunks(chunks: List[Document]) -> bool:
    """Persist processed chunks to CHUNKS_PATH as a pickle file.

    Args:
        chunks: Chunk documents to cache.

    Returns:
        True on success, False when writing fails (the failure is logged).
    """
    try:
        # Create the parent directory on first run so open() cannot fail on it.
        Path(CHUNKS_PATH).parent.mkdir(parents=True, exist_ok=True)
        with open(CHUNKS_PATH, 'wb') as f:
            pickle.dump(chunks, f)
        logger.info(f"Successfully saved {len(chunks)} chunks to {CHUNKS_PATH}")
        return True
    except Exception as e:
        logger.error(f"Failed to save chunks: {e}")
        return False
def initialize_knowledge_base() -> Optional[FAISS]:
    """Load or build the complete knowledge base and return its vector store.

    Resolution order: existing on-disk vector store, then cached chunks, then
    fresh document loading and splitting (caching the new chunks on the way).

    Returns:
        A ready vector store, or None when every path fails.
    """
    try:
        # Fastest path: a vector store persisted by a previous run.
        store = load_company_vector_store()
        if store:
            return store
        logger.info("Creating new knowledge base...")
        # Second-fastest path: previously pickled chunks.
        chunks = load_chunks()
        if not chunks:
            logger.info("No cached chunks found, processing documents...")
            documents = create_company_documents()
            if documents:
                chunks = split_documents(documents)
                if chunks:
                    # Cache for next startup; a save failure is non-fatal.
                    save_chunks(chunks)
        if not chunks:
            logger.error("No chunks available to create vector store")
            return None
        return create_company_vector_store(chunks)
    except Exception as e:
        logger.error(f"Failed to initialize knowledge base: {e}")
        return None