Refactor create_vector_store function to streamline vector store creation and update logic, enhance logging, and enforce force update during dataset upload
src/knowledge_base/vector_store.py (CHANGED)
@@ -16,52 +16,48 @@ def get_embeddings():
     )
 
 def create_vector_store(mode: str = "rebuild"):
-    """
-    Create or update vector store and upload to dataset
-
-    Args:
-        mode: Either "rebuild" (create from scratch) or "update" (add new documents)
-
-    Returns:
-        (success, message)
-    """
+    """Create or update vector store and upload to dataset"""
     # Load documents
     documents = load_documents()
 
     if not documents:
         return False, "Error: documents not loaded"
 
+    print(f"Loaded {len(documents)} documents")
+
     # Split into chunks
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=CHUNK_SIZE,
         chunk_overlap=CHUNK_OVERLAP
     )
     chunks = text_splitter.split_documents(documents)
+    print(f"Created {len(chunks)} chunks")
 
     # Initialize embeddings
     embeddings = get_embeddings()
 
     try:
-        if mode == "update":
-            # Try to load existing vector store
+        # Always create new vector store in rebuild mode
+        if mode == "rebuild":
+            print("Creating new vector store...")
+            vector_store = FAISS.from_documents(chunks, embeddings)
+        else:
+            # Try to load and update existing store
             from src.knowledge_base.dataset import DatasetManager
             dataset = DatasetManager(token=HF_TOKEN)
             success, result = dataset.download_vector_store()
 
             if success:
-
+                print("Updating existing vector store...")
                 vector_store = result
                 vector_store.add_documents(chunks)
             else:
                 return False, "Failed to load existing vector store for update"
-        else:
-            # Create new vector store
-            vector_store = FAISS.from_documents(chunks, embeddings)
 
-        # Upload to dataset
+        # Upload to dataset with force flag
         from src.knowledge_base.dataset import DatasetManager
         dataset = DatasetManager(token=HF_TOKEN)
-        success, message = dataset.upload_vector_store(vector_store)
+        success, message = dataset.upload_vector_store(vector_store, force_update=True)
 
         if not success:
            return False, f"Error uploading to dataset: {message}"
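DatasetManager itself is not part of this diff, so the semantics of the new force_update flag are not visible above. As a rough sketch only, assuming the store is a LangChain FAISS index pushed to a Hugging Face dataset repo via huggingface_hub (the repo id, file names, and skip logic below are illustrative assumptions, not the project's actual code), upload_vector_store might look like:

import tempfile
from huggingface_hub import HfApi


class DatasetManager:
    """Hypothetical sketch; the real class lives in src/knowledge_base/dataset.py."""

    def __init__(self, token: str, repo_id: str = "your-org/kb-vector-store"):
        self.api = HfApi(token=token)
        self.repo_id = repo_id  # assumed dataset repo id

    def upload_vector_store(self, vector_store, force_update: bool = False):
        """Serialize the FAISS index and push it to the dataset repo.

        Returns (success, message), matching how create_vector_store consumes it.
        """
        try:
            # Without force_update, skip the upload when a remote index already
            # exists; force_update=True (as create_vector_store now passes)
            # always overwrites.
            if not force_update and self.api.file_exists(
                self.repo_id, "index.faiss", repo_type="dataset"
            ):
                return True, "Skipped upload: remote vector store already exists"

            with tempfile.TemporaryDirectory() as tmp:
                vector_store.save_local(tmp)  # writes index.faiss and index.pkl
                self.api.upload_folder(
                    folder_path=tmp, repo_id=self.repo_id, repo_type="dataset"
                )
            return True, "Vector store uploaded"
        except Exception as exc:
            return False, str(exc)

Under a contract like this, create_vector_store(mode="rebuild") regenerates the index from scratch and force-overwrites the remote copy, while mode="update" downloads the existing store, appends the new chunks, and re-uploads it.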