Rulga committed
Commit a55b18e · 1 Parent(s): db04008

used only dataset

src/knowledge_base/dataset.py CHANGED
@@ -77,166 +77,148 @@ class DatasetManager:
         except Exception as e:
             return False, f"Error initializing dataset structure: {str(e)}"
 
-    def upload_vector_store(self) -> Tuple[bool, str]:
+    def upload_vector_store(self, vector_store: FAISS) -> Tuple[bool, str]:
         """
         Upload vector store to dataset
 
+        Args:
+            vector_store: FAISS vector store to upload
+
         Returns:
             (success, message)
         """
         try:
-            if not os.path.exists(VECTOR_STORE_PATH):
-                return False, "Vector store directory not found"
-
-            index_path = os.path.join(VECTOR_STORE_PATH, "index.faiss")
-            config_path = os.path.join(VECTOR_STORE_PATH, "index.pkl")
-
-            if not (os.path.exists(index_path) and os.path.exists(config_path)):
-                return False, "Vector store files not found"
-
-            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-
-            # First save old files to archive if they exist
-            try:
-                # Check for existing files
-                self.api.hf_hub_download(
-                    repo_id=self.dataset_name,
-                    filename="vector_store/index.faiss",
-                    repo_type="dataset"
-                )
-
-                # If file exists, create archive copy
-                self.api.upload_file(
-                    path_or_fileobj=index_path,
-                    path_in_repo=f"vector_store/archive/index_{timestamp}.faiss",
-                    repo_id=self.dataset_name,
-                    repo_type="dataset"
-                )
-
-                self.api.upload_file(
-                    path_or_fileobj=config_path,
-                    path_in_repo=f"vector_store/archive/index_{timestamp}.pkl",
-                    repo_id=self.dataset_name,
-                    repo_type="dataset"
-                )
-            except Exception:
-                # If no files exist, create archive directory
-                with tempfile.NamedTemporaryFile(delete=False) as temp:
-                    temp_path = temp.name
-
-                try:
-                    self.api.upload_file(
-                        path_or_fileobj=temp_path,
-                        path_in_repo="vector_store/archive/.gitkeep",
-                        repo_id=self.dataset_name,
-                        repo_type="dataset"
-                    )
-                finally:
-                    if os.path.exists(temp_path):
-                        os.remove(temp_path)
-
-            # Upload current files
-            self.api.upload_file(
-                path_or_fileobj=index_path,
-                path_in_repo="vector_store/index.faiss",
-                repo_id=self.dataset_name,
-                repo_type="dataset"
-            )
-
-            self.api.upload_file(
-                path_or_fileobj=config_path,
-                path_in_repo="vector_store/index.pkl",
-                repo_id=self.dataset_name,
-                repo_type="dataset"
-            )
-
-            # Update metadata about last update
-            metadata = {
-                "last_update": timestamp,
-                "version": "1.0"
-            }
-
-            with tempfile.NamedTemporaryFile(mode="w+", suffix=".json", delete=False) as temp:
-                json.dump(metadata, temp, ensure_ascii=False, indent=2)
-                temp_name = temp.name
-
-            try:
-                self.api.upload_file(
-                    path_or_fileobj=temp_name,
-                    path_in_repo="vector_store/metadata.json",
-                    repo_id=self.dataset_name,
-                    repo_type="dataset"
-                )
-            finally:
-                if os.path.exists(temp_name):
-                    os.remove(temp_name)
-
-            return True, "Vector store uploaded successfully"
+            with tempfile.TemporaryDirectory() as temp_dir:
+                # Save vector store to temporary directory
+                vector_store.save_local(folder_path=temp_dir)
+
+                index_path = os.path.join(temp_dir, "index.faiss")
+                config_path = os.path.join(temp_dir, "index.pkl")
+
+                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+                # First save old files to archive if they exist
+                try:
+                    # Check for existing files
+                    self.api.hf_hub_download(
+                        repo_id=self.dataset_name,
+                        filename="vector_store/index.faiss",
+                        repo_type="dataset"
+                    )
+
+                    # If file exists, create archive copy
+                    self.api.upload_file(
+                        path_or_fileobj=index_path,
+                        path_in_repo=f"vector_store/archive/index_{timestamp}.faiss",
+                        repo_id=self.dataset_name,
+                        repo_type="dataset"
+                    )
+
+                    self.api.upload_file(
+                        path_or_fileobj=config_path,
+                        path_in_repo=f"vector_store/archive/index_{timestamp}.pkl",
+                        repo_id=self.dataset_name,
+                        repo_type="dataset"
+                    )
+                except Exception:
+                    # If no files exist, create archive directory
+                    with tempfile.NamedTemporaryFile(delete=False) as temp:
+                        temp_path = temp.name
+
+                    try:
+                        self.api.upload_file(
+                            path_or_fileobj=temp_path,
+                            path_in_repo="vector_store/archive/.gitkeep",
+                            repo_id=self.dataset_name,
+                            repo_type="dataset"
+                        )
+                    finally:
+                        if os.path.exists(temp_path):
+                            os.remove(temp_path)
+
+                # Upload current files
+                self.api.upload_file(
+                    path_or_fileobj=index_path,
+                    path_in_repo="vector_store/index.faiss",
+                    repo_id=self.dataset_name,
+                    repo_type="dataset"
+                )
+
+                self.api.upload_file(
+                    path_or_fileobj=config_path,
+                    path_in_repo="vector_store/index.pkl",
+                    repo_id=self.dataset_name,
+                    repo_type="dataset"
+                )
+
+                # Update metadata about last update
+                metadata = {
+                    "last_update": timestamp,
+                    "version": "1.0"
+                }
+
+                with tempfile.NamedTemporaryFile(mode="w+", suffix=".json", delete=False) as temp:
+                    json.dump(metadata, temp, ensure_ascii=False, indent=2)
+                    temp_name = temp.name
+
+                try:
+                    self.api.upload_file(
+                        path_or_fileobj=temp_name,
+                        path_in_repo="vector_store/metadata.json",
+                        repo_id=self.dataset_name,
+                        repo_type="dataset"
+                    )
+                finally:
+                    if os.path.exists(temp_name):
+                        os.remove(temp_name)
+
+                return True, "Vector store uploaded successfully"
 
         except Exception as e:
             return False, f"Error uploading vector store: {str(e)}"
 
-    def download_vector_store(self, force: bool = False) -> Tuple[bool, Union[FAISS, str]]:
+    def download_vector_store(self) -> Tuple[bool, Union[FAISS, str]]:
         """
         Download vector store from dataset
 
-        Args:
-            force: Force download even if local files exist
-
         Returns:
             (success, vector_store or error message)
         """
         try:
-            # Check if local files exist and force is False
-            if not force and os.path.exists(os.path.join(VECTOR_STORE_PATH, "index.faiss")):
-                # Instead of returning string, load and return the vector store
-                embeddings = HuggingFaceEmbeddings(
-                    model_name=EMBEDDING_MODEL,
-                    model_kwargs={'device': 'cpu'}
-                )
-                vector_store = FAISS.load_local(
-                    VECTOR_STORE_PATH,
-                    embeddings,
-                    allow_dangerous_deserialization=True
-                )
-                return True, vector_store
-
-            # Ensure vector store directory exists
-            os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
-
-            # Download files
-            try:
-                self.api.hf_hub_download(
-                    repo_id=self.dataset_name,
-                    filename="vector_store/index.faiss",
-                    repo_type="dataset",
-                    local_dir=VECTOR_STORE_PATH
-                )
-
-                self.api.hf_hub_download(
-                    repo_id=self.dataset_name,
-                    filename="vector_store/index.pkl",
-                    repo_type="dataset",
-                    local_dir=VECTOR_STORE_PATH
-                )
-
-                # After successful download, load and return the vector store
-                embeddings = HuggingFaceEmbeddings(
-                    model_name=EMBEDDING_MODEL,
-                    model_kwargs={'device': 'cpu'}
-                )
-                vector_store = FAISS.load_local(
-                    VECTOR_STORE_PATH,
-                    embeddings,
-                    allow_dangerous_deserialization=True
-                )
-                return True, vector_store
-
-            except Exception as e:
-                return False, f"Failed to download vector store: {str(e)}"
+            with tempfile.TemporaryDirectory() as temp_dir:
+                # Download files to temporary directory
+                try:
+                    self.api.hf_hub_download(
+                        repo_id=self.dataset_name,
+                        filename="vector_store/index.faiss",
+                        repo_type="dataset",
+                        local_dir=temp_dir
+                    )
+
+                    self.api.hf_hub_download(
+                        repo_id=self.dataset_name,
+                        filename="vector_store/index.pkl",
+                        repo_type="dataset",
+                        local_dir=temp_dir
+                    )
+
+                    # Load vector store from temporary directory
+                    embeddings = HuggingFaceEmbeddings(
+                        model_name=EMBEDDING_MODEL,
+                        model_kwargs={'device': 'cpu'}
+                    )
+                    vector_store = FAISS.load_local(
+                        temp_dir,
+                        embeddings,
+                        allow_dangerous_deserialization=True
+                    )
+                    return True, vector_store
+
+                except Exception as e:
+                    return False, f"Failed to download vector store: {str(e)}"
 
         except Exception as e:
-            return False, f"Error in download_vector_store: {str(e)}"
+            return False, f"Error downloading vector store: {str(e)}"
 
     def save_chat_history(self, conversation_id: str, messages: List[Dict[str, str]]) -> Tuple[bool, str]:
         """
src/knowledge_base/vector_store.py CHANGED
@@ -50,11 +50,7 @@ def create_vector_store(mode: str = "rebuild"):
 
         if success:
             # Add new documents to existing store
-            vector_store = FAISS.load_local(
-                VECTOR_STORE_PATH,
-                embeddings,
-                allow_dangerous_deserialization=True
-            )
+            vector_store = result
             vector_store.add_documents(chunks)
         else:
             return False, "Failed to load existing vector store for update"
@@ -62,28 +58,13 @@ def create_vector_store(mode: str = "rebuild"):
         # Create new vector store
         vector_store = FAISS.from_documents(chunks, embeddings)
 
-        # Save and upload
-        with tempfile.TemporaryDirectory() as temp_dir:
-            vector_store.save_local(folder_path=temp_dir)
-
-            # Copy files to VECTOR_STORE_PATH for subsequent loading
-            os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
-            for file in ["index.faiss", "index.pkl"]:
-                shutil.copy2(
-                    os.path.join(temp_dir, file),
-                    os.path.join(VECTOR_STORE_PATH, file)
-                )
-
-            # Upload to dataset
-            from src.knowledge_base.dataset import DatasetManager
-            dataset = DatasetManager(token=HF_TOKEN)
-            success, message = dataset.upload_vector_store()
-
-            # Clean up local files
-            shutil.rmtree(VECTOR_STORE_PATH)
-
-            if not success:
-                return False, f"Error uploading to dataset: {message}"
+        # Upload to dataset
+        from src.knowledge_base.dataset import DatasetManager
+        dataset = DatasetManager(token=HF_TOKEN)
+        success, message = dataset.upload_vector_store(vector_store)
+
+        if not success:
+            return False, f"Error uploading to dataset: {message}"
 
         action = "updated" if mode == "update" else "created"
         return True, f"Knowledge base {action} successfully! Processed {len(documents)} documents, {len(chunks)} chunks."
@@ -94,7 +75,6 @@ def create_vector_store(mode: str = "rebuild"):
 def load_vector_store():
     """Load vector store"""
     try:
-        # First check if we need to download from dataset
         from src.knowledge_base.dataset import DatasetManager
         dataset = DatasetManager(token=HF_TOKEN)
         success, result = dataset.download_vector_store()
@@ -103,19 +83,7 @@ def load_vector_store():
             print(f"Failed to download vector store: {result}")
             return None
 
-        # Now try to load the local vector store
-        embeddings = get_embeddings()
-
-        if not os.path.exists(os.path.join(VECTOR_STORE_PATH, "index.faiss")):
-            print("Vector store files not found locally")
-            return None
-
-        vector_store = FAISS.load_local(
-            VECTOR_STORE_PATH,
-            embeddings,
-            allow_dangerous_deserialization=True
-        )
-        return vector_store
+        return result
 
     except Exception as e:
         print(f"Error loading vector store: {str(e)}")