"""Build, update, and load the FAISS vector store backing the knowledge base."""

# NOTE(review): os, tempfile, shutil and VECTOR_STORE_PATH are not referenced in
# this module — confirm no other consumer relies on them before removing.
import os
import tempfile
import shutil

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings  # new import (replaces community variant)

from src.knowledge_base.loader import load_documents
from config.settings import VECTOR_STORE_PATH, EMBEDDING_MODEL, HF_TOKEN
from config.constants import CHUNK_SIZE, CHUNK_OVERLAP


def get_embeddings():
    """Return the HuggingFace embeddings model, configured for CPU inference."""
    return HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL,
        model_kwargs={'device': 'cpu'},
    )


def create_vector_store(mode: str = "rebuild"):
    """Create or update the vector store and upload it to the dataset.

    Args:
        mode: "rebuild" builds a fresh FAISS index from all documents;
            any other value (e.g. "update") downloads the existing store
            and appends the new chunks to it.

    Returns:
        Tuple of (success, message) — success is a bool, message a
        human-readable status string.
    """
    # Load the raw documents; bail out early if there is nothing to index.
    documents = load_documents()
    if not documents:
        return False, "Error: documents not loaded"
    print(f"Loaded {len(documents)} documents")

    # Split into overlapping chunks sized for the embedding model.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Created {len(chunks)} chunks")

    embeddings = get_embeddings()

    try:
        # Local import kept deliberately (matches the module's existing
        # convention — presumably avoids a circular import; verify).
        # Previously this import/instantiation was duplicated for the
        # download and upload steps; one shared instance suffices.
        from src.knowledge_base.dataset import DatasetManager
        dataset = DatasetManager(token=HF_TOKEN)

        if mode == "rebuild":
            # Always create a new vector store in rebuild mode.
            print("Creating new vector store...")
            vector_store = FAISS.from_documents(chunks, embeddings)
        else:
            # Try to load and extend the existing store.
            success, result = dataset.download_vector_store()
            if not success:
                return False, "Failed to load existing vector store for update"
            print("Updating existing vector store...")
            vector_store = result
            vector_store.add_documents(chunks)

        # Publish the (re)built store back to the dataset.
        success, message = dataset.upload_vector_store(vector_store)
        if not success:
            return False, f"Error uploading to dataset: {message}"

        action = "updated" if mode == "update" else "created"
        # The original message string was split by a raw newline inside the
        # f-string (a syntax error); rebuilt here with an explicit "\n".
        return True, (
            f"Knowledge base {action} successfully!\n"
            f"Processed {len(documents)} documents, {len(chunks)} chunks."
        )
    except Exception as e:
        # Boundary handler: surface any failure as a (False, message) result
        # rather than propagating to the caller.
        return False, f"Error {mode}ing knowledge base: {str(e)}"


def load_vector_store():
    """Download and return the vector store, or None on any failure."""
    try:
        from src.knowledge_base.dataset import DatasetManager
        dataset = DatasetManager(token=HF_TOKEN)
        success, result = dataset.download_vector_store()
        if not success:
            print(f"Failed to download vector store: {result}")
            return None
        return result
    except Exception as e:
        print(f"Error loading vector store: {str(e)}")
        return None