Rulga committed on
Commit ce6f5b5 · 1 Parent(s): 9b2029e

Enhance knowledge base rebuilding functionality: Add method to rebuild knowledge base from selected URLs, improve error handling, and log actions during vector store download process.

Files changed (2)
  1. app.py +49 -4
  2. src/knowledge_base/dataset.py +10 -11
app.py CHANGED
@@ -72,6 +72,51 @@ def update_kb_with_selected(sources_df) -> str:
         logger.error(f"Error updating knowledge base: {str(e)}")
         return f"Error updating knowledge base: {str(e)}"
 
+def rebuild_kb_with_selected(sources_df):
+    """Rebuild knowledge base from scratch using only selected URLs"""
+    try:
+        selected_urls = get_selected_urls(sources_df)
+
+        if not selected_urls:
+            return "Error: No URLs selected for inclusion"
+
+        # Temporarily replace URLS with the selected URLs
+        original_urls = constants.URLS.copy()
+        constants.URLS = selected_urls
+
+        try:
+            # Rebuild the knowledge base
+            success, message = create_vector_store(mode="rebuild")
+
+            # Save metadata if successful
+            if success:
+                metadata = {
+                    "last_updated": datetime.datetime.now().isoformat(),
+                    "source_count": len(selected_urls),
+                    "sources": selected_urls
+                }
+
+                # Save to the dataset
+                json_content = json.dumps(metadata, indent=2).encode('utf-8')
+                api = HfApi(token=HF_TOKEN)
+
+                api.upload_file(
+                    path_or_fileobj=json_content,
+                    path_in_repo="vector_store/metadata.json",
+                    repo_id=DATASET_ID,
+                    repo_type="dataset"
+                )
+
+            return message
+
+        finally:
+            # Restore the original URLs
+            constants.URLS = original_urls
+
+    except Exception as e:
+        logger.error(f"Error rebuilding knowledge base: {str(e)}")
+        return f"Error rebuilding knowledge base: {str(e)}"
+
 # Set seed for consistent results
 langdetect.DetectorFactory.seed = 0
 
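Note on the metadata upload in the new function above: HfApi.upload_file accepts raw bytes (as well as a local path or file object) for path_or_fileobj, so the JSON can be pushed straight from memory without a temporary file. A minimal standalone sketch of that step; the helper name, repo id, and token argument are placeholders rather than this app's actual config:

import json
import datetime
from huggingface_hub import HfApi

def upload_kb_metadata(selected_urls, dataset_id, token):
    """Serialize rebuild metadata in memory and push it to a dataset repo."""
    metadata = {
        "last_updated": datetime.datetime.now().isoformat(),
        "source_count": len(selected_urls),
        "sources": selected_urls,
    }
    api = HfApi(token=token)
    api.upload_file(
        path_or_fileobj=json.dumps(metadata, indent=2).encode("utf-8"),  # bytes payload
        path_in_repo="vector_store/metadata.json",
        repo_id=dataset_id,   # e.g. "user/kb-dataset" (placeholder)
        repo_type="dataset",
    )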
 
@@ -1734,17 +1779,15 @@ def rebuild_kb_with_selected(sources_df):
             return "Error: No URLs selected for inclusion"
 
         # Temporarily replace URLS with the selected URLs
-        from config import constants
-        original_urls = constants.URLS
+        original_urls = constants.URLS.copy()
         constants.URLS = selected_urls
 
         try:
             # Rebuild the knowledge base
             success, message = create_vector_store(mode="rebuild")
 
-            # Save metadata with information about the selected URLs
+            # Save metadata if successful
             if success:
-                # Create metadata with the current date and the selected URLs
                 metadata = {
                     "last_updated": datetime.datetime.now().isoformat(),
                     "source_count": len(selected_urls),
@@ -1763,11 +1806,13 @@ def rebuild_kb_with_selected(sources_df):
                 )
 
             return message
+
         finally:
             # Restore the original URLs
             constants.URLS = original_urls
 
     except Exception as e:
+        logger.error(f"Error rebuilding knowledge base: {str(e)}")
         return f"Error rebuilding knowledge base: {str(e)}"
 
 def save_kb_metadata():
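The second hunk also changes how the original URL list is saved: keeping only a reference (original_urls = constants.URLS) means the backup aliases the live list object, so an in-place mutation through any other reference would corrupt the value restored in the finally block; .copy() snapshots the contents instead. A minimal sketch of the override-and-restore pattern, assuming the repo's config.constants module; the helper name is hypothetical:

from config import constants  # assumed: the module-level constants this app already uses

def with_temporary_urls(urls, action):
    """Run `action` with constants.URLS temporarily replaced, then restore it."""
    original_urls = constants.URLS.copy()  # snapshot the list, not just a reference
    constants.URLS = urls
    try:
        return action()
    finally:
        constants.URLS = original_urls     # restored even if `action` raises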
src/knowledge_base/dataset.py CHANGED
@@ -45,17 +45,17 @@ def download_vector_store(self) -> Tuple[bool, Union[FAISS, str]]:
         """Download vector store from dataset"""
         try:
             with tempfile.TemporaryDirectory() as temp_dir:
-                print(f"Downloading to temporary directory: {temp_dir}")
+                logger.debug(f"Downloading to temporary directory: {temp_dir}")
 
-                # Download files to temporary directory
                 try:
+                    # Download vector store files
                     index_path = self.api.hf_hub_download(
                         repo_id=self.dataset_name,
                         filename="vector_store/index.faiss",
                         repo_type="dataset",
                         local_dir=temp_dir
                     )
-                    print(f"Downloaded index.faiss to: {index_path}")
+                    logger.debug(f"Downloaded index.faiss to: {index_path}")
 
                     config_path = self.api.hf_hub_download(
                         repo_id=self.dataset_name,
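A note on the paths above: in recent huggingface_hub releases, passing local_dir makes hf_hub_download keep the file's in-repo subpath under that directory, so "vector_store/index.faiss" lands at <temp_dir>/vector_store/index.faiss, which is the folder load_local is pointed at later. A small sketch using the module-level function and a placeholder repo id:

import os
import tempfile
from huggingface_hub import hf_hub_download

with tempfile.TemporaryDirectory() as temp_dir:
    index_path = hf_hub_download(
        repo_id="user/kb-dataset",           # placeholder dataset id
        filename="vector_store/index.faiss",
        repo_type="dataset",
        local_dir=temp_dir,
    )
    # The in-repo path is preserved under local_dir.
    print(os.path.relpath(index_path, temp_dir))  # vector_store/index.faiss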
 
@@ -63,27 +63,26 @@ def download_vector_store(self) -> Tuple[bool, Union[FAISS, str]]:
                         repo_type="dataset",
                         local_dir=temp_dir
                     )
-                    print(f"Downloaded index.pkl to: {config_path}")
+                    logger.debug(f"Downloaded index.pkl to: {config_path}")
 
-                    # Verify files exist
-                    if not os.path.exists(index_path) or not os.path.exists(config_path):
-                        return False, f"Downloaded files not found at {temp_dir}"
-
-                    # Load vector store from temporary directory
+                    # Initialize embeddings
                     embeddings = HuggingFaceEmbeddings(
                         model_name=EMBEDDING_MODEL,
                         model_kwargs={'device': 'cpu'}
                     )
 
+                    # Load vector store
                     vector_store = FAISS.load_local(
-                        os.path.join(temp_dir, "vector_store"),
-                        embeddings
+                        folder_path=os.path.join(temp_dir, "vector_store"),
+                        embeddings=embeddings
                     )
 
                     return True, vector_store
 
                 except Exception as e:
+                    logger.error(f"Error downloading vector store: {str(e)}")
                     return False, f"Error downloading vector store: {str(e)}"
+
         except Exception as e:
             logger.error(f"Error in download_vector_store: {str(e)}")
             return False, str(e)
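Loading the downloaded index mirrors the keyword-argument form adopted above. A minimal sketch, assuming the index.faiss / index.pkl pair sits in a local folder; the import paths and EMBEDDING_MODEL value are placeholders (they vary with the installed LangChain version), and newer LangChain releases also require an explicit opt-in because index.pkl is unpickled:

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # placeholder model name

def load_vector_store(folder: str) -> FAISS:
    """Load a FAISS index saved as index.faiss / index.pkl inside `folder`."""
    embeddings = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL,
        model_kwargs={"device": "cpu"},
    )
    return FAISS.load_local(
        folder_path=folder,
        embeddings=embeddings,
        # Uncomment on newer LangChain versions, which refuse to unpickle by default:
        # allow_dangerous_deserialization=True,
    )

# Example usage: store = load_vector_store("vector_store")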