Spaces:

Rulga
/

status-law-gbot

Running

App Files Files Community

Rulga committed on Apr 14

Commit

f7e43c2

1 Parent(s): 0dd9926

Enhance knowledge base management: Add functions to retrieve and save knowledge base metadata, improve error handling, and update constants for better clarity and functionality.

Browse files

Files changed (3) hide show

app.py +75 -4
config/constants.py +31 -20
src/knowledge_base/dataset.py +101 -0

app.py CHANGED Viewed

@@ -1160,7 +1160,13 @@ with gr.Blocks(css="""
                 gr.Markdown("#### Knowledge Base Information")
                 # Function for retrieving information about the knowledge base
-                def get_kb_info():
                     try:
                         vector_store = load_vector_store()
                         if vector_store is None or isinstance(vector_store, str):
@@ -1628,14 +1634,18 @@ if __name__ == "__main__":
     demo.launch(share=True)
-# Add helper functions for URL selection:
 def get_selected_urls(sources_df):
     """Get list of URLs selected for inclusion"""
     try:
         if not isinstance(sources_df, pd.DataFrame):
             sources_df = pd.DataFrame(sources_df)
         selected_urls = sources_df[sources_df["Include"] == True]["URL"].tolist()
         return selected_urls
     except Exception as e:
         logger.error(f"Error getting selected URLs: {str(e)}")
@@ -1649,20 +1659,25 @@ def update_kb_with_selected(sources_df):
         if not selected_urls:
             return "Error: No URLs selected for inclusion"
         from config import constants
         original_urls = constants.URLS
         constants.URLS = selected_urls
         try:
             success, message = create_vector_store(mode="update")
             if success:
                 metadata = {
                     "last_updated": datetime.datetime.now().isoformat(),
                     "source_count": len(selected_urls),
                     "sources": selected_urls
                 }
                 json_content = json.dumps(metadata, indent=2).encode('utf-8')
                 api = HfApi(token=HF_TOKEN)
@@ -1675,6 +1690,7 @@ def update_kb_with_selected(sources_df):
             return message
         finally:
             constants.URLS = original_urls
     except Exception as e:
@@ -1688,20 +1704,25 @@ def rebuild_kb_with_selected(sources_df):
         if not selected_urls:
             return "Error: No URLs selected for inclusion"
         from config import constants
         original_urls = constants.URLS
         constants.URLS = selected_urls
         try:
             success, message = create_vector_store(mode="rebuild")
             if success:
                 metadata = {
                     "last_updated": datetime.datetime.now().isoformat(),
                     "source_count": len(selected_urls),
                     "sources": selected_urls
                 }
                 json_content = json.dumps(metadata, indent=2).encode('utf-8')
                 api = HfApi(token=HF_TOKEN)
@@ -1714,25 +1735,75 @@ def rebuild_kb_with_selected(sources_df):
             return message
         finally:
             constants.URLS = original_urls
     except Exception as e:
         return f"Error rebuilding knowledge base: {str(e)}"
-# Add new function for source status updates
 def update_source_status(df):
     """Refresh the "Status" column so that it mirrors the "Include" flags.
     Anything that is not already a DataFrame is converted to one first.
     Rows with a truthy "Include" value are labelled "Selected", all other
     rows "Excluded".
     Returns:
         tuple: (df, message). On success the message reports how many rows
         are included; on any exception the message describes the error and
         df is returned in whatever state it had reached.
     """
     def _label(flag):
         # Map a single Include value to its display label
         return "Selected" if flag else "Excluded"
     try:
         if not isinstance(df, pd.DataFrame):
             df = pd.DataFrame(df)
         include_flags = df["Include"]
         df["Status"] = include_flags.apply(_label)
         summary = f"{include_flags.sum()} URLs selected for inclusion"
     except Exception as e:
         return df, f"Error updating status: {str(e)}"
     return df, summary
 # Update event handlers in the Knowledge Base tab section
 with gr.Tab("Knowledge Base"):
     gr.Markdown("### Knowledge Base Management")

                 gr.Markdown("#### Knowledge Base Information")
                 # Function for retrieving information about the knowledge base
+                def get_kb_info() -> str:
+                    """
+                    Get information about the current state of the knowledge base.
+                    Returns:
+                        str: Formatted markdown string containing knowledge base statistics
+                    """
                     try:
                         vector_store = load_vector_store()
                         if vector_store is None or isinstance(vector_store, str):
     demo.launch(share=True)
+# These functions need to be added to app.py after the existing update_kb and rebuild_kb functions
 def get_selected_urls(sources_df):
     """Get list of URLs selected for inclusion"""
     try:
+        # Преобразуем в DataFrame, если это еще не DataFrame
         if not isinstance(sources_df, pd.DataFrame):
             sources_df = pd.DataFrame(sources_df)
+        # Получаем только те URL, у которых Include=True
         selected_urls = sources_df[sources_df["Include"] == True]["URL"].tolist()
         return selected_urls
     except Exception as e:
         logger.error(f"Error getting selected URLs: {str(e)}")
         if not selected_urls:
             return "Error: No URLs selected for inclusion"
+        # Временно заменяем URLS на выбранные URL
         from config import constants
         original_urls = constants.URLS
         constants.URLS = selected_urls
         try:
+            # Обновляем базу знаний
             success, message = create_vector_store(mode="update")
+            # Сохраняем метаданные с информацией о выбранных URL
             if success:
+                # Создаем метаданные с текущей датой и выбранными URL
                 metadata = {
                     "last_updated": datetime.datetime.now().isoformat(),
                     "source_count": len(selected_urls),
                     "sources": selected_urls
                 }
+                # Сохраняем в датасет
                 json_content = json.dumps(metadata, indent=2).encode('utf-8')
                 api = HfApi(token=HF_TOKEN)
             return message
         finally:
+            # Восстанавливаем оригинальные URL
             constants.URLS = original_urls
     except Exception as e:
         if not selected_urls:
             return "Error: No URLs selected for inclusion"
+        # Временно заменяем URLS на выбранные URL
         from config import constants
         original_urls = constants.URLS
         constants.URLS = selected_urls
         try:
+            # Пересоздаем базу знаний
             success, message = create_vector_store(mode="rebuild")
+            # Сохраняем метаданные с информацией о выбранных URL
             if success:
+                # Создаем метаданные с текущей датой и выбранными URL
                 metadata = {
                     "last_updated": datetime.datetime.now().isoformat(),
                     "source_count": len(selected_urls),
                     "sources": selected_urls
                 }
+                # Сохраняем в датасет
                 json_content = json.dumps(metadata, indent=2).encode('utf-8')
                 api = HfApi(token=HF_TOKEN)
             return message
         finally:
+            # Восстанавливаем оригинальные URL
             constants.URLS = original_urls
     except Exception as e:
         return f"Error rebuilding knowledge base: {str(e)}"
+def save_kb_metadata():
+    """Save knowledge base metadata to dataset"""
+    try:
+        # Создаем метаданные с текущей датой
+        metadata = {
+            "last_updated": datetime.datetime.now().isoformat(),
+            "source_count": len(URLS),
+            "sources": URLS
+        }
+        # Сохраняем в датасет
+        json_content = json.dumps(metadata, indent=2).encode('utf-8')
+        api = HfApi(token=HF_TOKEN)
+        # Убедимся, что директория существует
+        try:
+            files = api.list_repo_files(
+                repo_id=DATASET_ID,
+                repo_type="dataset"
+            )
+            if "vector_store" not in files:
+                # Создаем пустой файл, чтобы создать директорию
+                api.upload_file(
+                    path_or_fileobj=b"",
+                    path_in_repo="vector_store/.gitkeep",
+                    repo_id=DATASET_ID,
+                    repo_type="dataset"
+                )
+        except Exception as e:
+            logger.warning(f"Error checking vector_store directory: {str(e)}")
+        # Загружаем метаданные
+        api.upload_file(
+            path_or_fileobj=json_content,
+            path_in_repo="vector_store/metadata.json",
+            repo_id=DATASET_ID,
+            repo_type="dataset"
+        )
+        logger.info("Knowledge base metadata saved successfully")
+        return True
+    except Exception as e:
+        logger.error(f"Error saving knowledge base metadata: {str(e)}")
+        return False
 def update_source_status(df):
     """Update status column based on Include selection"""
     try:
+        # If df is not a DataFrame, convert it to one
         if not isinstance(df, pd.DataFrame):
             df = pd.DataFrame(df)
+        # Update the Status column based on Include
         df["Status"] = df["Include"].apply(lambda x: "Selected" if x else "Excluded")
+        # Count the selected URLs
         selected_count = df["Include"].sum()
+        # Return the updated table together with a message reporting how many URLs are selected
         return df, f"{selected_count} URLs selected for inclusion"
     except Exception as e:
         return df, f"Error updating status: {str(e)}"
 # Update event handlers in the Knowledge Base tab section
 with gr.Tab("Knowledge Base"):
     gr.Markdown("### Knowledge Base Management")

config/constants.py CHANGED Viewed

@@ -21,34 +21,45 @@ CHUNK_OVERLAP = 100
 # System message template
 DEFAULT_SYSTEM_MESSAGE = """
-You are a multilingual legal assistant at Status Law.
-CRITICAL LANGUAGE INSTRUCTION:
-You MUST ALWAYS respond in the EXACT SAME LANGUAGE that the user's question was asked in. This is your highest priority.
-If the question is in Russian, your answer MUST be in Russian.
-If the question is in Arabic, your answer MUST be in Arabic.
-Never switch to English unless the user asks a question in English.
-Your role:
-- Answer legal questions based on provided context
-- Be professional yet approachable
-- Focus on Status Law's expertise: extradition defense, Interpol notices, sanctions, banking issues
-If you cannot answer based on the context:
-1. Acknowledge this politely in the user's language
-2. Suggest contacting Status Law:
-   - All languages: +32465594521
-   - English/Swedish only: +46728495129 (WhatsApp, Telegram, Signal, IMO)
-   - Contact form: [Contact Form](https://status.law/law-firm-contact-legal-protection/)
-For services and pricing questions:
-- Refer to: https://status.law/tariffs-for-services-of-protection-against-extradition-and-international-prosecution/
-- Ask clarifying questions to provide better service recommendations
 Context: {context}
 Question: {question}
-FINAL REMINDER: Your response MUST be in the exact same language as the question. This is non-negotiable.
 """
 # DEFAULT_SYSTEM_MESSAGE = """

 # System message template (contains {context} and {question} placeholders)
 DEFAULT_SYSTEM_MESSAGE = """
+You are Status Law's AI Legal Assistant, representing a prestigious international law firm.
+CORE LANGUAGE RULE:
+You MUST respond in the EXACT SAME language as the user's question. This is your highest priority instruction.
+YOUR ROLE:
+- Provide accurate legal information based on the given context
+- Be professional, empathetic, and courteous
+- Focus on Status Law's key services:
+  • Extradition defense
+  • Interpol notice removal
+  • Sanctions challenges
+  • Banking restrictions
+  • Reputation protection
+COMMUNICATION STYLE:
+- Use respectful and professional language
+- Be clear and concise
+- Show understanding of the client's concerns
+- Avoid overly technical legal jargon unless necessary
+- Always maintain a helpful and supportive tone
+- Recommend a personal consultation through:
+   - Phone: +32465594521 (all languages)
+   - Phone: +46728495129 (English/Swedish only)
+   - Contact Form: https://status.law/law-firm-contact-legal-protection/
+FOR PRICING AND SERVICES:
+1. Direct to: https://status.law/tariffs-for-services-of-protection-against-extradition-and-international-prosecution/
+2. Encourage filling out the contact form for personalized quotes
+3. Mention that each case is unique and requires individual assessment
+PRIVACY NOTE:
+- Remind users not to share sensitive personal information in chat
+- Encourage using the secure contact form for confidential details
 Context: {context}
 Question: {question}
+CRITICAL REMINDER: Always respond in the user's language. Never switch languages unless explicitly requested.
 """
 # DEFAULT_SYSTEM_MESSAGE = """

src/knowledge_base/dataset.py CHANGED Viewed

@@ -39,6 +39,107 @@ class DatasetManager:
         self.annotations_path = DATASET_ANNOTATIONS_PATH
     # Add this method to the DatasetManager class in src/knowledge_base/dataset.py
 def get_last_update_date(self):
     """

         self.annotations_path = DATASET_ANNOTATIONS_PATH
     # Add this method to the DatasetManager class in src/knowledge_base/dataset.py
+def download_vector_store(self):
+    """
+    Загружает векторное хранилище из датасета.
+    Returns:
+        tuple: (success, result), где result - это объект FAISS или сообщение об ошибке
+    """
+    try:
+        import tempfile
+        import shutil
+        from langchain.vectorstores import FAISS
+        from langchain.embeddings import HuggingFaceEmbeddings
+        from config.settings import EMBEDDING_MODEL, DATASET_VECTOR_STORE_PATH
+        logger.info(f"Attempting to download vector store from dataset {self.dataset_id}")
+        # Создаем временную директорию для скачивания
+        temp_dir = tempfile.mkdtemp()
+        logger.debug(f"Created temporary directory at {temp_dir}")
+        try:
+            # Инициализируем API
+            api = HfApi(token=self.hf_token)
+            # Проверяем наличие файлов индекса в датасете
+            try:
+                files = api.list_repo_files(
+                    repo_id=self.dataset_id,
+                    repo_type="dataset"
+                )
+                # Ищем файлы векторного хранилища
+                vector_store_files = [f for f in files if f.startswith(f"{DATASET_VECTOR_STORE_PATH}/")]
+                if not vector_store_files:
+                    logger.warning(f"No vector store files found in dataset {self.dataset_id}")
+                    return False, "Vector store not found in dataset"
+                # Создаем папку для скачивания
+                vector_store_dir = os.path.join(temp_dir, DATASET_VECTOR_STORE_PATH)
+                os.makedirs(vector_store_dir, exist_ok=True)
+                # Скачиваем все файлы
+                for file in vector_store_files:
+                    # Получаем имя файла без пути
+                    filename = os.path.basename(file)
+                    # Скачиваем файл
+                    api.hf_hub_download(
+                        repo_id=self.dataset_id,
+                        repo_type="dataset",
+                        filename=file,
+                        local_dir=temp_dir,
+                        local_dir_use_symlinks=False
+                    )
+                    logger.debug(f"Downloaded {file}")
+                # Инициализируем embeddings
+                embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
+                # Загружаем FAISS из скачанных файлов
+                try:
+                    # Путь к директории с файлами FAISS
+                    faiss_path = os.path.join(temp_dir, DATASET_VECTOR_STORE_PATH)
+                    # Проверяем наличие необходимых файлов
+                    if not os.path.exists(os.path.join(faiss_path, "index.faiss")):
+                        logger.error(f"Missing FAISS index file at {faiss_path}")
+                        return False, "Missing FAISS index file"
+                    if not os.path.exists(os.path.join(faiss_path, "index.pkl")):
+                        logger.error(f"Missing FAISS pickle file at {faiss_path}")
+                        return False, "Missing FAISS pickle file"
+                    # Загружаем FAISS из директории
+                    faiss_index = FAISS.load_local(faiss_path, embeddings)
+                    logger.info(f"Successfully loaded FAISS index with {len(faiss_index.docstore._dict)} documents")
+                    return True, faiss_index
+                except Exception as e:
+                    logger.error(f"Error loading FAISS index: {str(e)}")
+                    return False, f"Error loading FAISS index: {str(e)}"
+            except Exception as e:
+                logger.error(f"Error listing files in dataset {self.dataset_id}: {str(e)}")
+                return False, f"Error accessing dataset: {str(e)}"
+        finally:
+            # Очищаем временную директорию
+            try:
+                shutil.rmtree(temp_dir)
+                logger.debug(f"Cleaned up temporary directory {temp_dir}")
+            except Exception as e:
+                logger.warning(f"Error cleaning up temporary directory {temp_dir}: {str(e)}")
+    except Exception as e:
+        logger.error(f"Exception in download_vector_store: {str(e)}")
+        import traceback
+        logger.error(traceback.format_exc())
+        return False, f"Error downloading vector store: {str(e)}"
 def get_last_update_date(self):
     """