# Rulga's picture
# Update langchain-huggingface version and adjust imports in dataset and vector_store modules
# 9b48f6f
import os
import tempfile
import shutil
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings  # new import
from src.knowledge_base.loader import load_documents
from config.settings import VECTOR_STORE_PATH, EMBEDDING_MODEL, HF_TOKEN
from config.constants import CHUNK_SIZE, CHUNK_OVERLAP
def get_embeddings():
    """Build the embedding model used to index and query the vector store.

    Returns:
        HuggingFaceEmbeddings: CPU-bound embedding model configured with the
        project-wide ``EMBEDDING_MODEL`` name from settings.
    """
    model_kwargs = {'device': 'cpu'}
    embeddings = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL,
        model_kwargs=model_kwargs,
    )
    return embeddings
def create_vector_store(mode: str = "rebuild"):
    """Create or update the FAISS vector store and upload it to the dataset.

    Args:
        mode: ``"rebuild"`` builds a fresh index from all loaded documents;
            any other value (e.g. ``"update"``) downloads the existing store
            and appends the new chunks to it.

    Returns:
        tuple[bool, str]: (success flag, human-readable status message).
    """
    # Load source documents; abort early if nothing could be loaded.
    documents = load_documents()
    if not documents:
        return False, "Error: documents not loaded"
    print(f"Loaded {len(documents)} documents")

    # Split documents into overlapping chunks suitable for embedding.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Created {len(chunks)} chunks")

    # Initialize embeddings
    embeddings = get_embeddings()

    try:
        # Imported lazily — presumably to avoid a circular import with
        # src.knowledge_base.dataset at module load time (TODO confirm).
        # Fix: the original imported and instantiated DatasetManager twice
        # (once for download, once for upload); one instance suffices.
        from src.knowledge_base.dataset import DatasetManager
        dataset = DatasetManager(token=HF_TOKEN)

        if mode == "rebuild":
            # Always create new vector store in rebuild mode.
            print("Creating new vector store...")
            vector_store = FAISS.from_documents(chunks, embeddings)
        else:
            # Update mode: extend the previously uploaded store.
            success, result = dataset.download_vector_store()
            if not success:
                return False, "Failed to load existing vector store for update"
            print("Updating existing vector store...")
            vector_store = result
            vector_store.add_documents(chunks)

        # Push the (re)built index back to the dataset repository.
        success, message = dataset.upload_vector_store(vector_store)
        if not success:
            return False, f"Error uploading to dataset: {message}"

        action = "updated" if mode == "update" else "created"
        return True, f"Knowledge base {action} successfully! Processed {len(documents)} documents, {len(chunks)} chunks."
    except Exception as e:
        # Fix: the original f"Error {mode}ing ..." produced the ungrammatical
        # "Error updateing ..." for mode="update"; use the proper verb form.
        verb = "updating" if mode == "update" else "rebuilding"
        return False, f"Error {verb} knowledge base: {str(e)}"
def load_vector_store():
    """Download the vector store from the HF dataset.

    Returns:
        The downloaded vector store object, or ``None`` when the download
        fails or an exception is raised.
    """
    try:
        # Imported lazily, matching the pattern used in create_vector_store.
        from src.knowledge_base.dataset import DatasetManager

        manager = DatasetManager(token=HF_TOKEN)
        success, result = manager.download_vector_store()
        if success:
            return result
        print(f"Failed to download vector store: {result}")
        return None
    except Exception as e:
        print(f"Error loading vector store: {str(e)}")
        return None