Spaces:
Running
Running
File size: 3,087 Bytes
0f93e9d c05e788 7ac0faf 0f93e9d 9b48f6f 0f93e9d 8ef4162 0f93e9d 0bb77b3 0f93e9d f58bd9b f44c4e4 0bb77b3 0f93e9d 0bb77b3 0f93e9d f44c4e4 0bb77b3 0f93e9d f44c4e4 0f93e9d 0bb77b3 0f93e9d f58bd9b f44c4e4 f58bd9b f44c4e4 a55b18e f58bd9b 7ac0faf 8100019 a55b18e 8100019 a55b18e 7ac0faf f58bd9b c05e788 f58bd9b 0f93e9d 0bb77b3 0f93e9d 3f8e971 a55b18e 3f8e971 0f93e9d 0bb77b3 c05e788 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
import os
import tempfile
import shutil
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings  # new import (replaces deprecated langchain_community embeddings)
from src.knowledge_base.loader import load_documents
from config.settings import VECTOR_STORE_PATH, EMBEDDING_MODEL, HF_TOKEN
from config.constants import CHUNK_SIZE, CHUNK_OVERLAP
def get_embeddings():
    """Build the HuggingFace embeddings model for the vector store.

    Returns:
        HuggingFaceEmbeddings: embedding model named by ``EMBEDDING_MODEL``,
        pinned to CPU execution.
    """
    config = {
        "model_name": EMBEDDING_MODEL,
        "model_kwargs": {"device": "cpu"},
    }
    return HuggingFaceEmbeddings(**config)
def create_vector_store(mode: str = "rebuild"):
    """Create or update the FAISS vector store and upload it to the dataset.

    Args:
        mode: "rebuild" builds a fresh store from all loaded documents;
            any other value (the caller uses "update") downloads the
            existing store and appends the new chunks to it.

    Returns:
        tuple[bool, str]: success flag and a human-readable status message.
    """
    # Load documents
    documents = load_documents()
    if not documents:
        return False, "Error: documents not loaded"
    print(f"Loaded {len(documents)} documents")
    # Split into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Created {len(chunks)} chunks")
    # Initialize embeddings
    embeddings = get_embeddings()
    try:
        # One DatasetManager serves both the download (update path) and the
        # upload below; the original created two identical instances.
        from src.knowledge_base.dataset import DatasetManager
        dataset = DatasetManager(token=HF_TOKEN)
        # Always create new vector store in rebuild mode
        if mode == "rebuild":
            print("Creating new vector store...")
            vector_store = FAISS.from_documents(chunks, embeddings)
        else:
            # Try to load and update existing store
            success, result = dataset.download_vector_store()
            if not success:
                return False, "Failed to load existing vector store for update"
            print("Updating existing vector store...")
            vector_store = result
            vector_store.add_documents(chunks)
        # Upload to dataset
        success, message = dataset.upload_vector_store(vector_store)
        if not success:
            return False, f"Error uploading to dataset: {message}"
        action = "updated" if mode == "update" else "created"
        return True, f"Knowledge base {action} successfully! Processed {len(documents)} documents, {len(chunks)} chunks."
    except Exception as e:
        # Bug fix: the original f"Error {mode}ing ..." produced the
        # misspelling "updateing" when mode == "update".
        verb = "updating" if mode == "update" else "rebuilding"
        return False, f"Error {verb} knowledge base: {str(e)}"
def load_vector_store():
    """Download and return the vector store from the dataset.

    Returns:
        The vector store object on success, or ``None`` if the download
        fails or raises.
    """
    try:
        from src.knowledge_base.dataset import DatasetManager
        manager = DatasetManager(token=HF_TOKEN)
        ok, payload = manager.download_vector_store()
        if ok:
            return payload
        print(f"Failed to download vector store: {payload}")
        return None
    except Exception as e:
        print(f"Error loading vector store: {str(e)}")
        return None
|