Rulga committed on
Commit
6c3f830
·
1 Parent(s): f7e43c2

Refactor app.py and dataset.py: Update comments for clarity, enhance download_vector_store method with improved error handling, and streamline vector store download process.

Browse files
Files changed (2) hide show
  1. app.py +24 -15
  2. src/knowledge_base/dataset.py +29 -88
app.py CHANGED
@@ -995,7 +995,7 @@ def initialize_app():
995
  token=HF_TOKEN
996
  )
997
 
998
- # Загружаем сохраненный системный промпт из предпочтений или используем DEFAULT_SYSTEM_MESSAGE
999
  system_prompt_text = DEFAULT_SYSTEM_MESSAGE
1000
  if "system_prompt" in preferences and "current" in preferences["system_prompt"]:
1001
  system_prompt_text = preferences["system_prompt"]["current"]
@@ -1012,7 +1012,7 @@ def initialize_chat_evaluator():
1012
  dataset_id=DATASET_ID
1013
  )
1014
 
1015
- # Проверим наличие директорий
1016
  os.makedirs(DATASET_CHAT_HISTORY_PATH, exist_ok=True)
1017
  os.makedirs(os.path.join(DATASET_ANNOTATIONS_PATH), exist_ok=True)
1018
 
@@ -1180,7 +1180,7 @@ with gr.Blocks(css="""
1180
  Please create a knowledge base using the buttons on the left.
1181
  """
1182
 
1183
- # Получаем информацию о векторном хранилище
1184
  doc_count = len(vector_store.docstore._dict)
1185
  sources = set()
1186
 
@@ -1651,33 +1651,42 @@ def get_selected_urls(sources_df):
1651
  logger.error(f"Error getting selected URLs: {str(e)}")
1652
  return []
1653
 
1654
- def update_kb_with_selected(sources_df):
1655
- """Update knowledge base using only selected URLs"""
 
 
 
 
 
 
 
 
1656
  try:
1657
- selected_urls = get_selected_urls(sources_df)
 
1658
 
1659
  if not selected_urls:
1660
- return "Error: No URLs selected for inclusion"
 
 
 
1661
 
1662
- # Временно заменяем URLS на выбранные URL
1663
- from config import constants
1664
- original_urls = constants.URLS
1665
  constants.URLS = selected_urls
1666
 
1667
  try:
1668
- # Обновляем базу знаний
1669
  success, message = create_vector_store(mode="update")
1670
 
1671
- # Сохраняем метаданные с информацией о выбранных URL
1672
  if success:
1673
- # Создаем метаданные с текущей датой и выбранными URL
1674
  metadata = {
1675
  "last_updated": datetime.datetime.now().isoformat(),
1676
  "source_count": len(selected_urls),
1677
  "sources": selected_urls
1678
  }
1679
 
1680
- # Сохраняем в датасет
1681
  json_content = json.dumps(metadata, indent=2).encode('utf-8')
1682
  api = HfApi(token=HF_TOKEN)
1683
 
@@ -1690,7 +1699,7 @@ def update_kb_with_selected(sources_df):
1690
 
1691
  return message
1692
  finally:
1693
- # Восстанавливаем оригинальные URL
1694
  constants.URLS = original_urls
1695
 
1696
  except Exception as e:
 
995
  token=HF_TOKEN
996
  )
997
 
998
+ # Load saved system prompt from preferences or use DEFAULT_SYSTEM_MESSAGE
999
  system_prompt_text = DEFAULT_SYSTEM_MESSAGE
1000
  if "system_prompt" in preferences and "current" in preferences["system_prompt"]:
1001
  system_prompt_text = preferences["system_prompt"]["current"]
 
1012
  dataset_id=DATASET_ID
1013
  )
1014
 
1015
+ # Check if directories exist
1016
  os.makedirs(DATASET_CHAT_HISTORY_PATH, exist_ok=True)
1017
  os.makedirs(os.path.join(DATASET_ANNOTATIONS_PATH), exist_ok=True)
1018
 
 
1180
  Please create a knowledge base using the buttons on the left.
1181
  """
1182
 
1183
+ # Get information about vector store
1184
  doc_count = len(vector_store.docstore._dict)
1185
  sources = set()
1186
 
 
1651
  logger.error(f"Error getting selected URLs: {str(e)}")
1652
  return []
1653
 
1654
+ def update_kb_with_selected(sources_df) -> str:
1655
+ """
1656
+ Updates knowledge base with selected sources.
1657
+
1658
+ Args:
1659
+ sources_df: Dataframe containing sources and their selection status
1660
+
1661
+ Returns:
1662
+ str: Status message
1663
+ """
1664
  try:
1665
+ # Filter selected URLs
1666
+ selected_urls = sources_df[sources_df['Include']]['URL'].tolist()
1667
 
1668
  if not selected_urls:
1669
+ return "Error: No sources selected"
1670
+
1671
+ # Store original URLs
1672
+ original_urls = URLS.copy()
1673
 
1674
+ # Update URLS with selected ones
 
 
1675
  constants.URLS = selected_urls
1676
 
1677
  try:
1678
+ # Update knowledge base
1679
  success, message = create_vector_store(mode="update")
1680
 
 
1681
  if success:
1682
+ # Create metadata with current date and selected URLs
1683
  metadata = {
1684
  "last_updated": datetime.datetime.now().isoformat(),
1685
  "source_count": len(selected_urls),
1686
  "sources": selected_urls
1687
  }
1688
 
1689
+ # Save to dataset
1690
  json_content = json.dumps(metadata, indent=2).encode('utf-8')
1691
  api = HfApi(token=HF_TOKEN)
1692
 
 
1699
 
1700
  return message
1701
  finally:
1702
+ # Restore original URLs
1703
  constants.URLS = original_urls
1704
 
1705
  except Exception as e:
src/knowledge_base/dataset.py CHANGED
@@ -40,106 +40,47 @@ class DatasetManager:
40
 
41
  # Добавьте этот метод в класс DatasetManager в файле src/knowledge_base/dataset.py
42
 
43
- def download_vector_store(self):
44
  """
45
- Загружает векторное хранилище из датасета.
46
 
47
  Returns:
48
- tuple: (success, result), где result - это объект FAISS или сообщение об ошибке
49
  """
50
  try:
51
- import tempfile
52
- import shutil
53
- from langchain.vectorstores import FAISS
54
- from langchain.embeddings import HuggingFaceEmbeddings
55
- from config.settings import EMBEDDING_MODEL, DATASET_VECTOR_STORE_PATH
56
-
57
- logger.info(f"Attempting to download vector store from dataset {self.dataset_id}")
58
-
59
- # Создаем временную директорию для скачивания
60
  temp_dir = tempfile.mkdtemp()
61
  logger.debug(f"Created temporary directory at {temp_dir}")
62
 
63
  try:
64
- # Инициализируем API
65
- api = HfApi(token=self.hf_token)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
- # Проверяем наличие файлов индекса в датасете
68
- try:
69
- files = api.list_repo_files(
70
- repo_id=self.dataset_id,
71
- repo_type="dataset"
72
- )
73
-
74
- # Ищем файлы векторного хранилища
75
- vector_store_files = [f for f in files if f.startswith(f"{DATASET_VECTOR_STORE_PATH}/")]
76
-
77
- if not vector_store_files:
78
- logger.warning(f"No vector store files found in dataset {self.dataset_id}")
79
- return False, "Vector store not found in dataset"
80
-
81
- # Создаем папку для скачивания
82
- vector_store_dir = os.path.join(temp_dir, DATASET_VECTOR_STORE_PATH)
83
- os.makedirs(vector_store_dir, exist_ok=True)
84
-
85
- # Скачиваем все файлы
86
- for file in vector_store_files:
87
- # Получаем имя файла без пути
88
- filename = os.path.basename(file)
89
- # Скачиваем файл
90
- api.hf_hub_download(
91
- repo_id=self.dataset_id,
92
- repo_type="dataset",
93
- filename=file,
94
- local_dir=temp_dir,
95
- local_dir_use_symlinks=False
96
- )
97
- logger.debug(f"Downloaded {file}")
98
-
99
- # Инициализируем embeddings
100
- embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
101
-
102
- # Загружаем FAISS из скачанных файлов
103
- try:
104
- # Путь к директории с файлами FAISS
105
- faiss_path = os.path.join(temp_dir, DATASET_VECTOR_STORE_PATH)
106
-
107
- # Проверяем наличие необходимых файлов
108
- if not os.path.exists(os.path.join(faiss_path, "index.faiss")):
109
- logger.error(f"Missing FAISS index file at {faiss_path}")
110
- return False, "Missing FAISS index file"
111
-
112
- if not os.path.exists(os.path.join(faiss_path, "index.pkl")):
113
- logger.error(f"Missing FAISS pickle file at {faiss_path}")
114
- return False, "Missing FAISS pickle file"
115
-
116
- # Загружаем FAISS из директории
117
- faiss_index = FAISS.load_local(faiss_path, embeddings)
118
- logger.info(f"Successfully loaded FAISS index with {len(faiss_index.docstore._dict)} documents")
119
-
120
- return True, faiss_index
121
-
122
- except Exception as e:
123
- logger.error(f"Error loading FAISS index: {str(e)}")
124
- return False, f"Error loading FAISS index: {str(e)}"
125
-
126
- except Exception as e:
127
- logger.error(f"Error listing files in dataset {self.dataset_id}: {str(e)}")
128
- return False, f"Error accessing dataset: {str(e)}"
129
-
130
  finally:
131
- # Очищаем временную директорию
132
- try:
133
- shutil.rmtree(temp_dir)
134
- logger.debug(f"Cleaned up temporary directory {temp_dir}")
135
- except Exception as e:
136
- logger.warning(f"Error cleaning up temporary directory {temp_dir}: {str(e)}")
137
-
138
  except Exception as e:
139
- logger.error(f"Exception in download_vector_store: {str(e)}")
140
- import traceback
141
- logger.error(traceback.format_exc())
142
- return False, f"Error downloading vector store: {str(e)}"
143
 
144
  def get_last_update_date(self):
145
  """
 
40
 
41
  # Добавьте этот метод в класс DatasetManager в файле src/knowledge_base/dataset.py
42
 
43
+ def download_vector_store(self) -> Tuple[bool, Union[FAISS, str]]:
44
  """
45
+ Downloads vector store from dataset.
46
 
47
  Returns:
48
+ tuple: (success, result) where result is either FAISS object or error message
49
  """
50
  try:
51
+ # Create temporary directory for download
 
 
 
 
 
 
 
 
52
  temp_dir = tempfile.mkdtemp()
53
  logger.debug(f"Created temporary directory at {temp_dir}")
54
 
55
  try:
56
+ # Download vector store files
57
+ self.api.snapshot_download(
58
+ repo_id=self.dataset_name,
59
+ repo_type="dataset",
60
+ local_dir=temp_dir,
61
+ allow_patterns=["vector_store/*"]
62
+ )
63
+
64
+ # Load vector store
65
+ embeddings = HuggingFaceEmbeddings(
66
+ model_name=EMBEDDING_MODEL,
67
+ model_kwargs={'device': 'cpu'}
68
+ )
69
+
70
+ vector_store = FAISS.load_local(
71
+ os.path.join(temp_dir, "vector_store"),
72
+ embeddings
73
+ )
74
+
75
+ return True, vector_store
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  finally:
78
+ # Clean up temp directory
79
+ shutil.rmtree(temp_dir)
80
+
 
 
 
 
81
  except Exception as e:
82
+ logger.error(f"Error downloading vector store: {str(e)}")
83
+ return False, f"Error downloading vector store: {str(e)}"
 
 
84
 
85
  def get_last_update_date(self):
86
  """