Rulga committed on
Commit
f44c4e4
·
1 Parent(s): b67f5d5

Refactor create_vector_store function to streamline vector store creation and update logic, enhance logging, and force updates during dataset upload

Browse files
Files changed (1) hide show
  1. src/knowledge_base/vector_store.py +13 -17
src/knowledge_base/vector_store.py CHANGED
@@ -16,52 +16,48 @@ def get_embeddings():
16
  )
17
 
18
  def create_vector_store(mode: str = "rebuild"):
19
- """
20
- Create or update vector store and upload to dataset
21
-
22
- Args:
23
- mode: Either "rebuild" (create from scratch) or "update" (add new documents)
24
-
25
- Returns:
26
- (success, message)
27
- """
28
  # Load documents
29
  documents = load_documents()
30
 
31
  if not documents:
32
  return False, "Error: documents not loaded"
33
 
 
 
34
  # Split into chunks
35
  text_splitter = RecursiveCharacterTextSplitter(
36
  chunk_size=CHUNK_SIZE,
37
  chunk_overlap=CHUNK_OVERLAP
38
  )
39
  chunks = text_splitter.split_documents(documents)
 
40
 
41
  # Initialize embeddings
42
  embeddings = get_embeddings()
43
 
44
  try:
45
- if mode == "update":
46
- # Try to load existing vector store
 
 
 
 
47
  from src.knowledge_base.dataset import DatasetManager
48
  dataset = DatasetManager(token=HF_TOKEN)
49
  success, result = dataset.download_vector_store()
50
 
51
  if success:
52
- # Add new documents to existing store
53
  vector_store = result
54
  vector_store.add_documents(chunks)
55
  else:
56
  return False, "Failed to load existing vector store for update"
57
- else:
58
- # Create new vector store
59
- vector_store = FAISS.from_documents(chunks, embeddings)
60
 
61
- # Upload to dataset
62
  from src.knowledge_base.dataset import DatasetManager
63
  dataset = DatasetManager(token=HF_TOKEN)
64
- success, message = dataset.upload_vector_store(vector_store)
65
 
66
  if not success:
67
  return False, f"Error uploading to dataset: {message}"
 
16
  )
17
 
18
  def create_vector_store(mode: str = "rebuild"):
19
+ """Create or update vector store and upload to dataset"""
 
 
 
 
 
 
 
 
20
  # Load documents
21
  documents = load_documents()
22
 
23
  if not documents:
24
  return False, "Error: documents not loaded"
25
 
26
+ print(f"Loaded {len(documents)} documents")
27
+
28
  # Split into chunks
29
  text_splitter = RecursiveCharacterTextSplitter(
30
  chunk_size=CHUNK_SIZE,
31
  chunk_overlap=CHUNK_OVERLAP
32
  )
33
  chunks = text_splitter.split_documents(documents)
34
+ print(f"Created {len(chunks)} chunks")
35
 
36
  # Initialize embeddings
37
  embeddings = get_embeddings()
38
 
39
  try:
40
+ # Always create new vector store in rebuild mode
41
+ if mode == "rebuild":
42
+ print("Creating new vector store...")
43
+ vector_store = FAISS.from_documents(chunks, embeddings)
44
+ else:
45
+ # Try to load and update existing store
46
  from src.knowledge_base.dataset import DatasetManager
47
  dataset = DatasetManager(token=HF_TOKEN)
48
  success, result = dataset.download_vector_store()
49
 
50
  if success:
51
+ print("Updating existing vector store...")
52
  vector_store = result
53
  vector_store.add_documents(chunks)
54
  else:
55
  return False, "Failed to load existing vector store for update"
 
 
 
56
 
57
+ # Upload to dataset with force flag
58
  from src.knowledge_base.dataset import DatasetManager
59
  dataset = DatasetManager(token=HF_TOKEN)
60
+ success, message = dataset.upload_vector_store(vector_store, force_update=True)
61
 
62
  if not success:
63
  return False, f"Error uploading to dataset: {message}"