Rulga committed on
Commit
f7e43c2
·
1 Parent(s): 0dd9926

Enhance knowledge base management: Add functions to retrieve and save knowledge base metadata, improve error handling, and update constants for better clarity and functionality.

Browse files
Files changed (3) hide show
  1. app.py +75 -4
  2. config/constants.py +31 -20
  3. src/knowledge_base/dataset.py +101 -0
app.py CHANGED
@@ -1160,7 +1160,13 @@ with gr.Blocks(css="""
1160
  gr.Markdown("#### Knowledge Base Information")
1161
 
1162
  # Функция для получения информации о базе знаний
1163
- def get_kb_info():
 
 
 
 
 
 
1164
  try:
1165
  vector_store = load_vector_store()
1166
  if vector_store is None or isinstance(vector_store, str):
@@ -1628,14 +1634,18 @@ if __name__ == "__main__":
1628
 
1629
  demo.launch(share=True)
1630
 
1631
- # Add helper functions for URL selection:
 
1632
  def get_selected_urls(sources_df):
1633
  """Get list of URLs selected for inclusion"""
1634
  try:
 
1635
  if not isinstance(sources_df, pd.DataFrame):
1636
  sources_df = pd.DataFrame(sources_df)
1637
 
 
1638
  selected_urls = sources_df[sources_df["Include"] == True]["URL"].tolist()
 
1639
  return selected_urls
1640
  except Exception as e:
1641
  logger.error(f"Error getting selected URLs: {str(e)}")
@@ -1649,20 +1659,25 @@ def update_kb_with_selected(sources_df):
1649
  if not selected_urls:
1650
  return "Error: No URLs selected for inclusion"
1651
 
 
1652
  from config import constants
1653
  original_urls = constants.URLS
1654
  constants.URLS = selected_urls
1655
 
1656
  try:
 
1657
  success, message = create_vector_store(mode="update")
1658
 
 
1659
  if success:
 
1660
  metadata = {
1661
  "last_updated": datetime.datetime.now().isoformat(),
1662
  "source_count": len(selected_urls),
1663
  "sources": selected_urls
1664
  }
1665
 
 
1666
  json_content = json.dumps(metadata, indent=2).encode('utf-8')
1667
  api = HfApi(token=HF_TOKEN)
1668
 
@@ -1675,6 +1690,7 @@ def update_kb_with_selected(sources_df):
1675
 
1676
  return message
1677
  finally:
 
1678
  constants.URLS = original_urls
1679
 
1680
  except Exception as e:
@@ -1688,20 +1704,25 @@ def rebuild_kb_with_selected(sources_df):
1688
  if not selected_urls:
1689
  return "Error: No URLs selected for inclusion"
1690
 
 
1691
  from config import constants
1692
  original_urls = constants.URLS
1693
  constants.URLS = selected_urls
1694
 
1695
  try:
 
1696
  success, message = create_vector_store(mode="rebuild")
1697
 
 
1698
  if success:
 
1699
  metadata = {
1700
  "last_updated": datetime.datetime.now().isoformat(),
1701
  "source_count": len(selected_urls),
1702
  "sources": selected_urls
1703
  }
1704
 
 
1705
  json_content = json.dumps(metadata, indent=2).encode('utf-8')
1706
  api = HfApi(token=HF_TOKEN)
1707
 
@@ -1714,25 +1735,75 @@ def rebuild_kb_with_selected(sources_df):
1714
 
1715
  return message
1716
  finally:
 
1717
  constants.URLS = original_urls
1718
 
1719
  except Exception as e:
1720
  return f"Error rebuilding knowledge base: {str(e)}"
1721
 
1722
- # Add new function for source status updates
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1723
  def update_source_status(df):
1724
  """Update status column based on Include selection"""
1725
  try:
 
1726
  if not isinstance(df, pd.DataFrame):
1727
  df = pd.DataFrame(df)
1728
 
 
1729
  df["Status"] = df["Include"].apply(lambda x: "Selected" if x else "Excluded")
 
 
1730
  selected_count = df["Include"].sum()
1731
 
 
1732
  return df, f"{selected_count} URLs selected for inclusion"
1733
  except Exception as e:
1734
  return df, f"Error updating status: {str(e)}"
1735
-
1736
  # Update event handlers in the Knowledge Base tab section
1737
  with gr.Tab("Knowledge Base"):
1738
  gr.Markdown("### Knowledge Base Management")
 
1160
  gr.Markdown("#### Knowledge Base Information")
1161
 
1162
  # Функция для получения информации о базе знаний
1163
+ def get_kb_info() -> str:
1164
+ """
1165
+ Get information about the current state of the knowledge base.
1166
+
1167
+ Returns:
1168
+ str: Formatted markdown string containing knowledge base statistics
1169
+ """
1170
  try:
1171
  vector_store = load_vector_store()
1172
  if vector_store is None or isinstance(vector_store, str):
 
1634
 
1635
  demo.launch(share=True)
1636
 
1637
+ # Эти функции нужно добавить в app.py после существующих функций update_kb и rebuild_kb
1638
+
1639
  def get_selected_urls(sources_df):
1640
  """Get list of URLs selected for inclusion"""
1641
  try:
1642
+ # Преобразуем в DataFrame, если это еще не DataFrame
1643
  if not isinstance(sources_df, pd.DataFrame):
1644
  sources_df = pd.DataFrame(sources_df)
1645
 
1646
+ # Получаем только те URL, у которых Include=True
1647
  selected_urls = sources_df[sources_df["Include"] == True]["URL"].tolist()
1648
+
1649
  return selected_urls
1650
  except Exception as e:
1651
  logger.error(f"Error getting selected URLs: {str(e)}")
 
1659
  if not selected_urls:
1660
  return "Error: No URLs selected for inclusion"
1661
 
1662
+ # Временно заменяем URLS на выбранные URL
1663
  from config import constants
1664
  original_urls = constants.URLS
1665
  constants.URLS = selected_urls
1666
 
1667
  try:
1668
+ # Обновляем базу знаний
1669
  success, message = create_vector_store(mode="update")
1670
 
1671
+ # Сохраняем метаданные с информацией о выбранных URL
1672
  if success:
1673
+ # Создаем метаданные с текущей датой и выбранными URL
1674
  metadata = {
1675
  "last_updated": datetime.datetime.now().isoformat(),
1676
  "source_count": len(selected_urls),
1677
  "sources": selected_urls
1678
  }
1679
 
1680
+ # Сохраняем в датасет
1681
  json_content = json.dumps(metadata, indent=2).encode('utf-8')
1682
  api = HfApi(token=HF_TOKEN)
1683
 
 
1690
 
1691
  return message
1692
  finally:
1693
+ # Восстанавливаем оригинальные URL
1694
  constants.URLS = original_urls
1695
 
1696
  except Exception as e:
 
1704
  if not selected_urls:
1705
  return "Error: No URLs selected for inclusion"
1706
 
1707
+ # Временно заменяем URLS на выбранные URL
1708
  from config import constants
1709
  original_urls = constants.URLS
1710
  constants.URLS = selected_urls
1711
 
1712
  try:
1713
+ # Пересоздаем базу знаний
1714
  success, message = create_vector_store(mode="rebuild")
1715
 
1716
+ # Сохраняем метаданные с информацией о выбранных URL
1717
  if success:
1718
+ # Создаем метаданные с текущей датой и выбранными URL
1719
  metadata = {
1720
  "last_updated": datetime.datetime.now().isoformat(),
1721
  "source_count": len(selected_urls),
1722
  "sources": selected_urls
1723
  }
1724
 
1725
+ # Сохраняем в датасет
1726
  json_content = json.dumps(metadata, indent=2).encode('utf-8')
1727
  api = HfApi(token=HF_TOKEN)
1728
 
 
1735
 
1736
  return message
1737
  finally:
1738
+ # Восстанавливаем оригинальные URL
1739
  constants.URLS = original_urls
1740
 
1741
  except Exception as e:
1742
  return f"Error rebuilding knowledge base: {str(e)}"
1743
 
1744
def save_kb_metadata():
    """Save knowledge base metadata to the dataset repository.

    Builds a JSON document containing the current timestamp plus the number
    and list of ``URLS`` and uploads it to ``vector_store/metadata.json`` in
    the ``DATASET_ID`` dataset repo.

    Returns:
        bool: True when the metadata upload succeeded, False when any step
        raised (the error is logged, not re-raised).
    """
    try:
        # Build the metadata stamped with the current date
        metadata = {
            "last_updated": datetime.datetime.now().isoformat(),
            "source_count": len(URLS),
            "sources": URLS
        }

        # Serialise for upload to the dataset
        json_content = json.dumps(metadata, indent=2).encode('utf-8')
        api = HfApi(token=HF_TOKEN)

        # Best-effort: seed the vector_store/ folder with a placeholder file.
        # A failure here is only logged; the metadata upload is still tried.
        try:
            files = api.list_repo_files(
                repo_id=DATASET_ID,
                repo_type="dataset"
            )

            # The listing holds file paths, not directory names, so a bare
            # "vector_store" entry never appears; test for the path prefix.
            has_vector_store = any(
                path.startswith("vector_store/") for path in files
            )
            if not has_vector_store:
                # Upload an empty file so the folder exists
                api.upload_file(
                    path_or_fileobj=b"",
                    path_in_repo="vector_store/.gitkeep",
                    repo_id=DATASET_ID,
                    repo_type="dataset"
                )
        except Exception as e:
            logger.warning(f"Error checking vector_store directory: {str(e)}")

        # Upload the metadata itself
        api.upload_file(
            path_or_fileobj=json_content,
            path_in_repo="vector_store/metadata.json",
            repo_id=DATASET_ID,
            repo_type="dataset"
        )

        logger.info("Knowledge base metadata saved successfully")
        return True
    except Exception as e:
        logger.error(f"Error saving knowledge base metadata: {str(e)}")
        return False
1789
+
1790
  def update_source_status(df):
1791
  """Update status column based on Include selection"""
1792
  try:
1793
+ # Если df не является DataFrame, преобразуем его
1794
  if not isinstance(df, pd.DataFrame):
1795
  df = pd.DataFrame(df)
1796
 
1797
+ # Обновляем колонку Status на основе Include
1798
  df["Status"] = df["Include"].apply(lambda x: "Selected" if x else "Excluded")
1799
+
1800
+ # Подсчитываем количество выбранных URL
1801
  selected_count = df["Include"].sum()
1802
 
1803
+ # Обновляем таблицу и возвращаем сообщение о количестве выбранных URL
1804
  return df, f"{selected_count} URLs selected for inclusion"
1805
  except Exception as e:
1806
  return df, f"Error updating status: {str(e)}"
 
1807
  # Update event handlers in the Knowledge Base tab section
1808
  with gr.Tab("Knowledge Base"):
1809
  gr.Markdown("### Knowledge Base Management")
config/constants.py CHANGED
@@ -21,34 +21,45 @@ CHUNK_OVERLAP = 100
21
 
22
  # System message template
23
  DEFAULT_SYSTEM_MESSAGE = """
24
- You are a multilingual legal assistant at Status Law.
25
 
26
- CRITICAL LANGUAGE INSTRUCTION:
27
- You MUST ALWAYS respond in the EXACT SAME LANGUAGE that the user's question was asked in. This is your highest priority.
28
- If the question is in Russian, your answer MUST be in Russian.
29
- If the question is in Arabic, your answer MUST be in Arabic.
30
- Never switch to English unless the user asks a question in English.
31
 
32
- Your role:
33
- - Answer legal questions based on provided context
34
- - Be professional yet approachable
35
- - Focus on Status Law's expertise: extradition defense, Interpol notices, sanctions, banking issues
 
 
 
 
 
36
 
37
- If you cannot answer based on the context:
38
- 1. Acknowledge this politely in the user's language
39
- 2. Suggest contacting Status Law:
40
- - All languages: +32465594521
41
- - English/Swedish only: +46728495129 (WhatsApp, Telegram, Signal, IMO)
42
- - Contact form: [Contact Form](https://status.law/law-firm-contact-legal-protection/)
 
 
 
 
43
 
44
- For services and pricing questions:
45
- - Refer to: https://status.law/tariffs-for-services-of-protection-against-extradition-and-international-prosecution/
46
- - Ask clarifying questions to provide better service recommendations
 
 
 
 
 
47
 
48
  Context: {context}
49
  Question: {question}
50
 
51
- FINAL REMINDER: Your response MUST be in the exact same language as the question. This is non-negotiable.
52
  """
53
 
54
  # DEFAULT_SYSTEM_MESSAGE = """
 
21
 
22
  # System message template
23
  DEFAULT_SYSTEM_MESSAGE = """
24
+ You are Status Law's AI Legal Assistant, representing a prestigious international law firm.
25
 
26
+ CORE LANGUAGE RULE:
27
+ You MUST respond in the EXACT SAME language as the user's question. This is your highest priority instruction.
 
 
 
28
 
29
+ YOUR ROLE:
30
+ - Provide accurate legal information based on the given context
31
+ - Be professional, empathetic, and courteous
32
+ - Focus on Status Law's key services:
33
+ • Extradition defense
34
+ • Interpol notice removal
35
+ • Sanctions challenges
36
+ • Banking restrictions
37
+ • Reputation protection
38
 
39
+ COMMUNICATION STYLE:
40
+ - Use respectful and professional language
41
+ - Be clear and concise
42
+ - Show understanding of the client's concerns
43
+ - Avoid overly technical legal jargon unless necessary
44
+ - Always maintain a helpful and supportive tone
45
+ - Recommend a personal consultation through:
46
+ - Phone: +32465594521 (all languages)
47
+ - Phone: +46728495129 (English/Swedish only)
48
+ - Contact Form: https://status.law/law-firm-contact-legal-protection/
49
 
50
+ FOR PRICING AND SERVICES:
51
+ 1. Direct to: https://status.law/tariffs-for-services-of-protection-against-extradition-and-international-prosecution/
52
+ 2. Encourage filling out the contact form for personalized quotes
53
+ 3. Mention that each case is unique and requires individual assessment
54
+
55
+ PRIVACY NOTE:
56
+ - Remind users not to share sensitive personal information in chat
57
+ - Encourage using the secure contact form for confidential details
58
 
59
  Context: {context}
60
  Question: {question}
61
 
62
+ CRITICAL REMINDER: Always respond in the user's language. Never switch languages unless explicitly requested.
63
  """
64
 
65
  # DEFAULT_SYSTEM_MESSAGE = """
src/knowledge_base/dataset.py CHANGED
@@ -39,6 +39,107 @@ class DatasetManager:
39
  self.annotations_path = DATASET_ANNOTATIONS_PATH
40
 
41
  # Добавьте этот метод в класс DatasetManager в файле src/knowledge_base/dataset.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  def get_last_update_date(self):
44
  """
 
39
  self.annotations_path = DATASET_ANNOTATIONS_PATH
40
 
41
  # Добавьте этот метод в класс DatasetManager в файле src/knowledge_base/dataset.py
42
+
43
def download_vector_store(self):
    """
    Download the vector store files from the dataset and load them as FAISS.

    Files whose repo path starts with ``DATASET_VECTOR_STORE_PATH/`` are
    downloaded into a temporary directory, loaded with ``FAISS.load_local``,
    and the temporary directory is removed before returning.

    Returns:
        tuple: (success, result), where result is the loaded FAISS object on
        success or an error message string on failure.
    """
    try:
        import tempfile
        import shutil
        from langchain.vectorstores import FAISS
        from langchain.embeddings import HuggingFaceEmbeddings
        from config.settings import EMBEDDING_MODEL, DATASET_VECTOR_STORE_PATH

        logger.info(f"Attempting to download vector store from dataset {self.dataset_id}")

        # Create a temporary directory for the download
        temp_dir = tempfile.mkdtemp()
        logger.debug(f"Created temporary directory at {temp_dir}")

        try:
            # Initialise the Hub API client
            api = HfApi(token=self.hf_token)

            # Any failure in this section (listing, downloading, building the
            # embeddings) is reported through the handler at the bottom.
            try:
                files = api.list_repo_files(
                    repo_id=self.dataset_id,
                    repo_type="dataset"
                )

                # Keep only the files that belong to the vector store
                vector_store_files = [f for f in files if f.startswith(f"{DATASET_VECTOR_STORE_PATH}/")]

                if not vector_store_files:
                    logger.warning(f"No vector store files found in dataset {self.dataset_id}")
                    return False, "Vector store not found in dataset"

                # Local directory the FAISS files are expected to land in;
                # computed once and reused for the load below.
                faiss_path = os.path.join(temp_dir, DATASET_VECTOR_STORE_PATH)
                os.makedirs(faiss_path, exist_ok=True)

                # Download every vector store file by its repo-relative path
                for file in vector_store_files:
                    api.hf_hub_download(
                        repo_id=self.dataset_id,
                        repo_type="dataset",
                        filename=file,
                        local_dir=temp_dir,
                        local_dir_use_symlinks=False
                    )
                    logger.debug(f"Downloaded {file}")

                # Initialise the embeddings
                embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

                # Load FAISS from the downloaded files
                try:
                    # Both index files must be present before loading
                    if not os.path.exists(os.path.join(faiss_path, "index.faiss")):
                        logger.error(f"Missing FAISS index file at {faiss_path}")
                        return False, "Missing FAISS index file"

                    if not os.path.exists(os.path.join(faiss_path, "index.pkl")):
                        logger.error(f"Missing FAISS pickle file at {faiss_path}")
                        return False, "Missing FAISS pickle file"

                    # NOTE(review): index.pkl is presumably unpickled by
                    # load_local, so only point this at a trusted dataset
                    # repo. Newer LangChain releases may also require an
                    # explicit allow_dangerous_deserialization=True here;
                    # confirm against the pinned version.
                    faiss_index = FAISS.load_local(faiss_path, embeddings)
                    logger.info(f"Successfully loaded FAISS index with {len(faiss_index.docstore._dict)} documents")

                    return True, faiss_index

                except Exception as e:
                    logger.error(f"Error loading FAISS index: {str(e)}")
                    return False, f"Error loading FAISS index: {str(e)}"

            except Exception as e:
                logger.error(f"Error listing files in dataset {self.dataset_id}: {str(e)}")
                return False, f"Error accessing dataset: {str(e)}"

        finally:
            # Always remove the temporary directory; the returned index has
            # already been loaded by this point.
            try:
                shutil.rmtree(temp_dir)
                logger.debug(f"Cleaned up temporary directory {temp_dir}")
            except Exception as e:
                logger.warning(f"Error cleaning up temporary directory {temp_dir}: {str(e)}")

    except Exception as e:
        logger.error(f"Exception in download_vector_store: {str(e)}")
        import traceback
        logger.error(traceback.format_exc())
        return False, f"Error downloading vector store: {str(e)}"
143
 
144
  def get_last_update_date(self):
145
  """