File size: 6,445 Bytes
c5e1945
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import pickle
import logging
from pathlib import Path
from typing import List, Optional
from langchain.schema import Document
from langchain_community.vectorstores import FAISS

from config import EMBEDDING_MODEL, VECTOR_STORE_DIR, CHUNKS_PATH
from data_loaders import load_company_info, load_faq_documents
from text_processor import markdown_splitter, recursive_500

logger = logging.getLogger(__name__)

def load_company_vector_store() -> Optional[FAISS]:
    """Load a previously saved FAISS vector store from disk.

    Returns:
        The deserialized store, or None when no store directory exists
        or deserialization fails. Errors are logged, never raised.
    """
    try:
        store_dir = Path(VECTOR_STORE_DIR)
        if not store_dir.exists():
            logger.info("No existing vector store found")
            return None
        # allow_dangerous_deserialization is needed because load_local
        # unpickles index metadata; acceptable here since the store is
        # written by this application itself (create_company_vector_store).
        store = FAISS.load_local(
            str(VECTOR_STORE_DIR),
            EMBEDDING_MODEL,
            allow_dangerous_deserialization=True,
        )
        logger.info("Successfully loaded existing vector store")
        return store
    except Exception as e:
        logger.error(f"Failed to load vector store: {e}")
        return None

def create_company_vector_store(documents: List[Document]) -> Optional[FAISS]:
    """Build a FAISS store from *documents* and persist it to disk.

    Args:
        documents: Chunks to embed and index; must be non-empty.

    Returns:
        The newly created store, or None when the input is empty or
        embedding/saving fails. Failures are logged, never raised.
    """
    if not documents:
        logger.error("No documents provided to create vector store")
        return None

    try:
        # Make sure the target directory exists before save_local writes.
        Path(VECTOR_STORE_DIR).mkdir(parents=True, exist_ok=True)

        store = FAISS.from_documents(documents, EMBEDDING_MODEL)
        store.save_local(str(VECTOR_STORE_DIR))
        logger.info(f"Successfully created and saved vector store with {len(documents)} documents")
        return store
    except Exception as e:
        logger.error(f"Failed to create vector store: {e}")
        return None

def create_company_documents() -> List[Document]:
    """Collect FAQ and company-info documents into one list.

    Each source loads independently: a failure in one is logged and
    does not prevent the other from contributing.

    Returns:
        All successfully loaded documents (possibly empty).
    """
    try:
        docs: List[Document] = []

        # FAQ documents — best-effort.
        try:
            faq_docs = load_faq_documents()
        except Exception as e:
            logger.error(f"Failed to load FAQ documents: {e}")
        else:
            docs.extend(faq_docs)
            logger.info(f"Loaded {len(faq_docs)} FAQ documents")

        # Company info — a single optional document.
        try:
            info_doc = load_company_info()
        except Exception as e:
            logger.error(f"Failed to load company info: {e}")
        else:
            if info_doc:
                docs.append(info_doc)
                logger.info("Loaded company info document")

        logger.info(f"Total documents loaded: {len(docs)}")
        return docs

    except Exception as e:
        logger.error(f"Failed to create company documents: {e}")
        return []

def split_documents(company_documents: List[Document]) -> List[Document]:
    """Split documents into embedding-sized chunks.

    Documents whose metadata marks them as "general_info" (info.md) are
    split on markdown headers; all others (FAQs) go through the 500-char
    recursive splitter. A failure on one document is logged and skipped
    so a single bad document never aborts the whole batch.

    Args:
        company_documents: Source documents to chunk.

    Returns:
        The resulting chunks (empty list on empty input or total failure).
    """
    if not company_documents:
        logger.warning("No documents provided for splitting")
        return []

    chunks: List[Document] = []

    try:
        for idx, document in enumerate(company_documents):
            try:
                if document.metadata.get("type") == "general_info":
                    # info.md: header-aware markdown split.
                    pieces = markdown_splitter.split_text(document.page_content)
                    # Propagate the source document's metadata to each chunk.
                    for piece in pieces:
                        piece.metadata.update(document.metadata)
                    chunks.extend(pieces)
                    logger.debug(f"Split document {idx} using markdown splitter")
                else:
                    # FAQs: plain recursive character split.
                    pieces = recursive_500.split_documents([document])
                    chunks.extend(pieces)
                    logger.debug(f"Split document {idx} using recursive splitter")

            except Exception as e:
                logger.error(f"Failed to split document {idx}: {e}")
                continue

        logger.info(f"Successfully split {len(company_documents)} documents into {len(chunks)} chunks")
        return chunks

    except Exception as e:
        logger.error(f"Failed to split documents: {e}")
        return []

def load_chunks() -> Optional[List[Document]]:
    """Read previously cached chunks from CHUNKS_PATH, if any.

    NOTE: pickle.load is only acceptable here because the cache file is
    produced by this application (save_chunks), not untrusted input.

    Returns:
        The cached chunk list, or None when no cache exists or it
        cannot be read. Errors are logged, never raised.
    """
    try:
        cache = Path(CHUNKS_PATH)
        if not cache.exists():
            logger.info("No cached chunks found")
            return None
        with open(CHUNKS_PATH, 'rb') as f:
            cached_chunks = pickle.load(f)
        logger.info(f"Successfully loaded {len(cached_chunks)} chunks from cache")
        return cached_chunks
    except Exception as e:
        logger.error(f"Failed to load chunks: {e}")
        return None

def save_chunks(chunks: List[Document]) -> bool:
    """Pickle *chunks* to CHUNKS_PATH for reuse on the next startup.

    Creates the parent directory when missing.

    Args:
        chunks: Processed document chunks to cache.

    Returns:
        True on success, False on any failure (logged, never raised).
    """
    try:
        # Create the cache directory if it doesn't exist yet.
        Path(CHUNKS_PATH).parent.mkdir(parents=True, exist_ok=True)

        with open(CHUNKS_PATH, 'wb') as f:
            pickle.dump(chunks, f)

        logger.info(f"Successfully saved {len(chunks)} chunks to {CHUNKS_PATH}")
        return True
    except Exception as e:
        logger.error(f"Failed to save chunks: {e}")
        return False

def initialize_knowledge_base() -> Optional[FAISS]:
    """Return a ready-to-query FAISS store, building one if necessary.

    Resolution order:
      1. a vector store already saved on disk,
      2. cached chunks, re-embedded into a fresh store,
      3. raw documents — loaded, split, cached, then embedded.

    Returns:
        The vector store, or None when nothing could be built.
        All errors are logged rather than raised.
    """
    try:
        # Fast path: reuse a store persisted by a previous run.
        existing = load_company_vector_store()
        if existing:
            return existing

        logger.info("Creating new knowledge base...")

        # Prefer cached chunks; otherwise process documents from scratch.
        chunks = load_chunks()
        if not chunks:
            logger.info("No cached chunks found, processing documents...")
            docs = create_company_documents()
            if docs:
                chunks = split_documents(docs)
                if chunks:
                    # Cache for faster startup next time.
                    save_chunks(chunks)

        if not chunks:
            logger.error("No chunks available to create vector store")
            return None

        return create_company_vector_store(chunks)

    except Exception as e:
        logger.error(f"Failed to initialize knowledge base: {e}")
        return None