Rulga committed on
Commit
6c3f830
·
1 Parent(s): f7e43c2

Refactor app.py and dataset.py: Update comments for clarity, enhance download_vector_store method with improved error handling, and streamline vector store download process.

Browse files
Files changed (2) hide show
  1. app.py +24 -15
  2. src/knowledge_base/dataset.py +29 -88
app.py CHANGED
@@ -995,7 +995,7 @@ def initialize_app():
995
  token=HF_TOKEN
996
  )
997
 
998
- # Загружаем сохраненный системный промпт из предпочтений или используем DEFAULT_SYSTEM_MESSAGE
999
  system_prompt_text = DEFAULT_SYSTEM_MESSAGE
1000
  if "system_prompt" in preferences and "current" in preferences["system_prompt"]:
1001
  system_prompt_text = preferences["system_prompt"]["current"]
@@ -1012,7 +1012,7 @@ def initialize_chat_evaluator():
1012
  dataset_id=DATASET_ID
1013
  )
1014
 
1015
- # Проверим наличие директорий
1016
  os.makedirs(DATASET_CHAT_HISTORY_PATH, exist_ok=True)
1017
  os.makedirs(os.path.join(DATASET_ANNOTATIONS_PATH), exist_ok=True)
1018
 
@@ -1180,7 +1180,7 @@ with gr.Blocks(css="""
1180
  Please create a knowledge base using the buttons on the left.
1181
  """
1182
 
1183
- # Получаем информацию о векторном хранилище
1184
  doc_count = len(vector_store.docstore._dict)
1185
  sources = set()
1186
 
@@ -1651,33 +1651,42 @@ def get_selected_urls(sources_df):
1651
  logger.error(f"Error getting selected URLs: {str(e)}")
1652
  return []
1653
 
1654
- def update_kb_with_selected(sources_df):
1655
- """Update knowledge base using only selected URLs"""
 
 
 
 
 
 
 
 
1656
  try:
1657
- selected_urls = get_selected_urls(sources_df)
 
1658
 
1659
  if not selected_urls:
1660
- return "Error: No URLs selected for inclusion"
 
 
 
1661
 
1662
- # Временно заменяем URLS на выбранные URL
1663
- from config import constants
1664
- original_urls = constants.URLS
1665
  constants.URLS = selected_urls
1666
 
1667
  try:
1668
- # Обновляем базу знаний
1669
  success, message = create_vector_store(mode="update")
1670
 
1671
- # Сохраняем метаданные с информацией о выбранных URL
1672
  if success:
1673
- # Создаем метаданные с текущей датой и выбранными URL
1674
  metadata = {
1675
  "last_updated": datetime.datetime.now().isoformat(),
1676
  "source_count": len(selected_urls),
1677
  "sources": selected_urls
1678
  }
1679
 
1680
- # Сохраняем в датасет
1681
  json_content = json.dumps(metadata, indent=2).encode('utf-8')
1682
  api = HfApi(token=HF_TOKEN)
1683
 
@@ -1690,7 +1699,7 @@ def update_kb_with_selected(sources_df):
1690
 
1691
  return message
1692
  finally:
1693
- # Восстанавливаем оригинальные URL
1694
  constants.URLS = original_urls
1695
 
1696
  except Exception as e:
 
995
  token=HF_TOKEN
996
  )
997
 
998
+ # Load saved system prompt from preferences or use DEFAULT_SYSTEM_MESSAGE
999
  system_prompt_text = DEFAULT_SYSTEM_MESSAGE
1000
  if "system_prompt" in preferences and "current" in preferences["system_prompt"]:
1001
  system_prompt_text = preferences["system_prompt"]["current"]
 
1012
  dataset_id=DATASET_ID
1013
  )
1014
 
1015
+ # Check if directories exist
1016
  os.makedirs(DATASET_CHAT_HISTORY_PATH, exist_ok=True)
1017
  os.makedirs(os.path.join(DATASET_ANNOTATIONS_PATH), exist_ok=True)
1018
 
 
1180
  Please create a knowledge base using the buttons on the left.
1181
  """
1182
 
1183
+ # Get information about vector store
1184
  doc_count = len(vector_store.docstore._dict)
1185
  sources = set()
1186
 
 
1651
  logger.error(f"Error getting selected URLs: {str(e)}")
1652
  return []
1653
 
1654
+ def update_kb_with_selected(sources_df) -> str:
1655
+ """
1656
+ Updates knowledge base with selected sources.
1657
+
1658
+ Args:
1659
+ sources_df: Dataframe containing sources and their selection status
1660
+
1661
+ Returns:
1662
+ str: Status message
1663
+ """
1664
  try:
1665
+ # Filter selected URLs
1666
+ selected_urls = sources_df[sources_df['Include']]['URL'].tolist()
1667
 
1668
  if not selected_urls:
1669
+ return "Error: No sources selected"
1670
+
1671
+ # Store original URLs
1672
+ original_urls = URLS.copy()
1673
 
1674
+ # Update URLS with selected ones
 
 
1675
  constants.URLS = selected_urls
1676
 
1677
  try:
1678
+ # Update knowledge base
1679
  success, message = create_vector_store(mode="update")
1680
 
 
1681
  if success:
1682
+ # Create metadata with current date and selected URLs
1683
  metadata = {
1684
  "last_updated": datetime.datetime.now().isoformat(),
1685
  "source_count": len(selected_urls),
1686
  "sources": selected_urls
1687
  }
1688
 
1689
+ # Save to dataset
1690
  json_content = json.dumps(metadata, indent=2).encode('utf-8')
1691
  api = HfApi(token=HF_TOKEN)
1692
 
 
1699
 
1700
  return message
1701
  finally:
1702
+ # Restore original URLs
1703
  constants.URLS = original_urls
1704
 
1705
  except Exception as e:
src/knowledge_base/dataset.py CHANGED
@@ -40,106 +40,47 @@ class DatasetManager:
40
 
41
  # Добавьте этот метод в класс DatasetManager в файле src/knowledge_base/dataset.py
42
 
43
- def download_vector_store(self):
44
  """
45
- Загружает векторное хранилище из датасета.
46
 
47
  Returns:
48
- tuple: (success, result), где result - это объект FAISS или сообщение об ошибке
49
  """
50
  try:
51
- import tempfile
52
- import shutil
53
- from langchain.vectorstores import FAISS
54
- from langchain.embeddings import HuggingFaceEmbeddings
55
- from config.settings import EMBEDDING_MODEL, DATASET_VECTOR_STORE_PATH
56
-
57
- logger.info(f"Attempting to download vector store from dataset {self.dataset_id}")
58
-
59
- # Создаем временную директорию для скачивания
60
  temp_dir = tempfile.mkdtemp()
61
  logger.debug(f"Created temporary directory at {temp_dir}")
62
 
63
  try:
64
- # Инициализируем API
65
- api = HfApi(token=self.hf_token)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
- # Проверяем наличие файлов индекса в датасете
68
- try:
69
- files = api.list_repo_files(
70
- repo_id=self.dataset_id,
71
- repo_type="dataset"
72
- )
73
-
74
- # Ищем файлы векторного хранилища
75
- vector_store_files = [f for f in files if f.startswith(f"{DATASET_VECTOR_STORE_PATH}/")]
76
-
77
- if not vector_store_files:
78
- logger.warning(f"No vector store files found in dataset {self.dataset_id}")
79
- return False, "Vector store not found in dataset"
80
-
81
- # Создаем папку для скачивания
82
- vector_store_dir = os.path.join(temp_dir, DATASET_VECTOR_STORE_PATH)
83
- os.makedirs(vector_store_dir, exist_ok=True)
84
-
85
- # Скачиваем все файлы
86
- for file in vector_store_files:
87
- # Получаем имя файла без пути
88
- filename = os.path.basename(file)
89
- # Скачиваем файл
90
- api.hf_hub_download(
91
- repo_id=self.dataset_id,
92
- repo_type="dataset",
93
- filename=file,
94
- local_dir=temp_dir,
95
- local_dir_use_symlinks=False
96
- )
97
- logger.debug(f"Downloaded {file}")
98
-
99
- # Инициализируем embeddings
100
- embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
101
-
102
- # Загружаем FAISS из скачанных файлов
103
- try:
104
- # Путь к директории с файлами FAISS
105
- faiss_path = os.path.join(temp_dir, DATASET_VECTOR_STORE_PATH)
106
-
107
- # Проверяем наличие необходимых файлов
108
- if not os.path.exists(os.path.join(faiss_path, "index.faiss")):
109
- logger.error(f"Missing FAISS index file at {faiss_path}")
110
- return False, "Missing FAISS index file"
111
-
112
- if not os.path.exists(os.path.join(faiss_path, "index.pkl")):
113
- logger.error(f"Missing FAISS pickle file at {faiss_path}")
114
- return False, "Missing FAISS pickle file"
115
-
116
- # Загружаем FAISS из директории
117
- faiss_index = FAISS.load_local(faiss_path, embeddings)
118
- logger.info(f"Successfully loaded FAISS index with {len(faiss_index.docstore._dict)} documents")
119
-
120
- return True, faiss_index
121
-
122
- except Exception as e:
123
- logger.error(f"Error loading FAISS index: {str(e)}")
124
- return False, f"Error loading FAISS index: {str(e)}"
125
-
126
- except Exception as e:
127
- logger.error(f"Error listing files in dataset {self.dataset_id}: {str(e)}")
128
- return False, f"Error accessing dataset: {str(e)}"
129
-
130
  finally:
131
- # Очищаем временную директорию
132
- try:
133
- shutil.rmtree(temp_dir)
134
- logger.debug(f"Cleaned up temporary directory {temp_dir}")
135
- except Exception as e:
136
- logger.warning(f"Error cleaning up temporary directory {temp_dir}: {str(e)}")
137
-
138
  except Exception as e:
139
- logger.error(f"Exception in download_vector_store: {str(e)}")
140
- import traceback
141
- logger.error(traceback.format_exc())
142
- return False, f"Error downloading vector store: {str(e)}"
143
 
144
  def get_last_update_date(self):
145
  """
 
40
 
41
  # Добавьте этот метод в класс DatasetManager в файле src/knowledge_base/dataset.py
42
 
43
+ def download_vector_store(self) -> Tuple[bool, Union[FAISS, str]]:
44
  """
45
+ Downloads vector store from dataset.
46
 
47
  Returns:
48
+ tuple: (success, result) where result is either FAISS object or error message
49
  """
50
  try:
51
+ # Create temporary directory for download
 
 
 
 
 
 
 
 
52
  temp_dir = tempfile.mkdtemp()
53
  logger.debug(f"Created temporary directory at {temp_dir}")
54
 
55
  try:
56
+ # Download vector store files
57
+ self.api.snapshot_download(
58
+ repo_id=self.dataset_name,
59
+ repo_type="dataset",
60
+ local_dir=temp_dir,
61
+ allow_patterns=["vector_store/*"]
62
+ )
63
+
64
+ # Load vector store
65
+ embeddings = HuggingFaceEmbeddings(
66
+ model_name=EMBEDDING_MODEL,
67
+ model_kwargs={'device': 'cpu'}
68
+ )
69
+
70
+ vector_store = FAISS.load_local(
71
+ os.path.join(temp_dir, "vector_store"),
72
+ embeddings
73
+ )
74
+
75
+ return True, vector_store
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  finally:
78
+ # Clean up temp directory
79
+ shutil.rmtree(temp_dir)
80
+
 
 
 
 
81
  except Exception as e:
82
+ logger.error(f"Error downloading vector store: {str(e)}")
83
+ return False, f"Error downloading vector store: {str(e)}"
 
 
84
 
85
  def get_last_update_date(self):
86
  """