Rulga committed on
Commit 0dd9926 · 1 Parent(s): 3b59cc2

Refactor README and app.py: update the dataset structure in README, add knowledge base management features in app.py, and implement last-update-date retrieval in DatasetManager.

Files changed (3)
  1. README.md +16 -18
  2. app.py +409 -3
  3. src/knowledge_base/dataset.py +55 -0
README.md CHANGED
@@ -64,28 +64,26 @@ status-law-gbot/
 │   └── training/           # Training module
 │       ├── fine_tuner.py
 │       └── model_manager.py
-└── data/                   # Data storage
-    ├── vector_store/       # FAISS vector storage
-    │   ├── index.faiss
-    │   └── index.pkl
-    ├── chat_history/       # Conversation logs
-    │   └── logs.json
-    └── fine_tuned_models/  # Fine-tuned model storage
-        └── model_registry.json
+└── dataset/                # HuggingFace dataset structure
+    ├── annotations/        # Conversation annotations
+    ├── chat_history/       # Chat logs and conversations
+    ├── fine_tuned_models/  # Fine-tuned model storage
+    ├── preferences/        # User preferences
+    ├── training_data/      # Processed training data
+    ├── training_logs/      # Training process logs
+    └── vector_store/       # FAISS vector storage
 ```
 
 ## 💾 Data Storage
 
-### Vector Store
-- `data/vector_store/index.faiss`: FAISS vector store for document embeddings
-- `data/vector_store/index.pkl`: Metadata and configuration for vector store
-
-### Chat History
-- `data/chat_history/logs.json`: JSON file containing chat history and metadata
-
-### Models
-- `src/models/fine_tuned/`: Directory for storing fine-tuned models
-- `src/models/registry.json`: Model registry and configuration
+### Dataset Organization
+- `annotations/`: Conversation quality metrics and annotations
+- `chat_history/`: JSON files containing chat conversations
+- `fine_tuned_models/`: Storage for LoRA adapters and model checkpoints
+- `preferences/`: User preferences and settings
+- `training_data/`: Processed data ready for model training
+- `training_logs/`: Detailed training process logs
+- `vector_store/`: FAISS indexes for semantic search
 
 ## 🛠️ Setup
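Because all of these directories live in a single Hugging Face dataset repo, the new layout can be inspected directly from Python. A minimal sketch, assuming a hypothetical dataset id (`your-org/status-law-dataset` stands in for the real repo):

```python
from collections import Counter
from huggingface_hub import HfApi

# List every file in the dataset repo and count files per top-level
# directory (annotations/, chat_history/, vector_store/, ...).
api = HfApi()
files = api.list_repo_files(
    repo_id="your-org/status-law-dataset",  # hypothetical dataset id
    repo_type="dataset",
)

counts = Counter(path.split("/")[0] for path in files if "/" in path)
for directory, n in sorted(counts.items()):
    print(f"{directory}/: {n} file(s)")
```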
app.py CHANGED
@@ -9,7 +9,7 @@ import os
 
 # Third-party imports
 import gradio as gr
-import pandas as pd  # Add this import
+import pandas as pd
 
 
 from huggingface_hub import HfApi, InferenceClient
@@ -18,6 +18,7 @@ import langdetect
 from dotenv import load_dotenv
 import requests
 from datasets import load_dataset
+from config.constants import URLS
 
 # Set seed for consistent results
 langdetect.DetectorFactory.seed = 0
 
@@ -479,11 +480,17 @@ def log_api_error(user_message, error_message, model_id, is_fallback=False):
         logger.info(f"API error logged to {log_path}")
     except Exception as e:
         logger.error(f"Failed to log API error: {str(e)}")
-
+
 def update_kb():
     """Function to update existing knowledge base with new documents"""
     try:
+        # Call the vector store builder in update mode
         success, message = create_vector_store(mode="update")
+
+        # If the update succeeded, store metadata with the update date
+        if success:
+            save_kb_metadata()
+
         return message
     except Exception as e:
        return f"Error updating knowledge base: {str(e)}"
 
@@ -491,11 +498,63 @@ def update_kb():
 def rebuild_kb():
     """Function to create knowledge base from scratch"""
     try:
+        # Call the vector store builder in rebuild mode
         success, message = create_vector_store(mode="rebuild")
+
+        # If the rebuild succeeded, store metadata with the update date
+        if success:
+            save_kb_metadata()
+
         return message
     except Exception as e:
         return f"Error creating knowledge base: {str(e)}"
 
+def save_kb_metadata():
+    """Save knowledge base metadata to the dataset"""
+    try:
+        # Build metadata stamped with the current date
+        metadata = {
+            "last_updated": datetime.datetime.now().isoformat(),
+            "source_count": len(URLS),
+            "sources": URLS
+        }
+
+        # Serialize for upload to the dataset
+        json_content = json.dumps(metadata, indent=2).encode('utf-8')
+        api = HfApi(token=HF_TOKEN)
+
+        # Make sure the vector_store directory exists in the dataset repo
+        try:
+            files = api.list_repo_files(
+                repo_id=DATASET_ID,
+                repo_type="dataset"
+            )
+
+            if not any(f.startswith("vector_store/") for f in files):
+                # Upload an empty placeholder file to create the directory
+                api.upload_file(
+                    path_or_fileobj=b"",
+                    path_in_repo="vector_store/.gitkeep",
+                    repo_id=DATASET_ID,
+                    repo_type="dataset"
+                )
+        except Exception as e:
+            logger.warning(f"Error checking vector_store directory: {str(e)}")
+
+        # Upload the metadata file
+        api.upload_file(
+            path_or_fileobj=json_content,
+            path_in_repo="vector_store/metadata.json",
+            repo_id=DATASET_ID,
+            repo_type="dataset"
+        )
+
+        logger.info("Knowledge base metadata saved successfully")
+        return True
+    except Exception as e:
+        logger.error(f"Error saving knowledge base metadata: {str(e)}")
+        return False
+
 def save_chat_history(history, conversation_id):
     """Save chat history to a file and to HuggingFace dataset"""
     try:
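For reference, `save_kb_metadata` leaves a small JSON document at `vector_store/metadata.json` in the dataset repo. A sketch of reading it back, again with a hypothetical dataset id:

```python
import json
from huggingface_hub import hf_hub_download

# Fetch the metadata file written by save_kb_metadata(). The repo id
# is a placeholder for the DATASET_ID constant used in app.py.
path = hf_hub_download(
    repo_id="your-org/status-law-dataset",  # hypothetical dataset id
    repo_type="dataset",
    filename="vector_store/metadata.json",
)

with open(path, "r", encoding="utf-8") as f:
    metadata = json.load(f)

# Expected shape: {"last_updated": "<ISO date>",
#                  "source_count": <int>, "sources": [<urls>]}
print(metadata["last_updated"], metadata["source_count"])
```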
 
@@ -1051,6 +1110,158 @@ with gr.Blocks(css="""
     )
 
     clear_btn.click(clear_conversation, None, [chatbot, conversation_id])
+
+    with gr.Tab("Knowledge Base"):
+        gr.Markdown("### Knowledge Base Management")
+
+        with gr.Row():
+            with gr.Column(scale=2):
+                # Sources table
+                gr.Markdown("#### Information Sources")
+                sources_list = gr.Dataframe(
+                    value=pd.DataFrame({
+                        "URL": URLS,
+                        "Include": [True for _ in URLS],
+                        "Status": ["Ready" for _ in URLS]
+                    }),
+                    interactive=True,
+                    wrap=True,
+                    row_count=15,
+                    show_label=False
+                )
+
+                # Knowledge base operation status
+                kb_status = gr.Textbox(
+                    label="Operation Status",
+                    interactive=False,
+                    placeholder="Ready",
+                    value="Ready"
+                )
+
+                # Buttons for managing the knowledge base
+                with gr.Row():
+                    update_kb_btn = gr.Button("Update Knowledge Base", variant="primary")
+                    rebuild_kb_btn = gr.Button("Rebuild Knowledge Base from Scratch", variant="secondary")
+
+                gr.Markdown("""
+                <small>
+                **Update Knowledge Base**: Adds new information to the existing knowledge base.
+
+                **Rebuild Knowledge Base**: Recreates the entire knowledge base from scratch. Use this if there are inconsistencies.
+
+                All changes are saved to the Hugging Face dataset.
+                </small>
+                """)
+
+            with gr.Column(scale=1):
+                # Information about the current knowledge base
+                gr.Markdown("#### Knowledge Base Information")
+
+                # Helper that summarizes the knowledge base state
+                def get_kb_info():
+                    try:
+                        vector_store = load_vector_store()
+                        if vector_store is None or isinstance(vector_store, str):
+                            return """
+                            **Status**: Not found or error
+
+                            **Documents**: 0
+
+                            **Last updated**: Never
+
+                            Please create a knowledge base using the buttons on the left.
+                            """
+
+                        # Collect vector store statistics
+                        doc_count = len(vector_store.docstore._dict)
+                        sources = set()
+
+                        for doc_id, doc in vector_store.docstore._dict.items():
+                            if hasattr(doc, 'metadata') and 'source' in doc.metadata:
+                                sources.add(doc.metadata['source'])
+
+                        source_count = len(sources)
+
+                        # The store exists but holds no sources
+                        if source_count == 0:
+                            return """
+                            **Status**: Created but empty
+
+                            **Documents**: 0
+
+                            **Last updated**: Unknown
+
+                            Please rebuild the knowledge base using the button on the left.
+                            """
+
+                        # Fetch the last update date from the dataset
+                        last_updated = "Unknown"
+                        try:
+                            from src.knowledge_base.dataset import DatasetManager
+                            dataset = DatasetManager()
+                            last_updated = dataset.get_last_update_date() or "Unknown"
+                        except Exception as e:
+                            logger.error(f"Error getting last update date: {str(e)}")
+
+                        return f"""
+                        **Status**: Active
+
+                        **Documents**: {doc_count}
+
+                        **Sources**: {source_count}
+
+                        **Last updated**: {last_updated}
+                        """
+
+                    except Exception as e:
+                        return f"""
+                        **Status**: Error
+
+                        **Details**: {str(e)}
+
+                        Please try rebuilding the knowledge base.
+                        """
+
+                kb_info = gr.Markdown(value=get_kb_info())
+                refresh_kb_info_btn = gr.Button("Refresh Information")
+
+        # Event handlers for the Knowledge Base buttons
+        update_kb_btn.click(
+            fn=update_kb_with_selected,
+            inputs=[sources_list],
+            outputs=[kb_status]
+        )
+
+        rebuild_kb_btn.click(
+            fn=rebuild_kb_with_selected,
+            inputs=[sources_list],
+            outputs=[kb_status]
+        )
+
+        # Refresh knowledge base information on demand
+        refresh_kb_info_btn.click(
+            fn=get_kb_info,
+            inputs=[],
+            outputs=[kb_info]
+        )
+
+        # Automatically refresh the info after knowledge base operations
+        update_kb_btn.click(
+            fn=get_kb_info,
+            inputs=[],
+            outputs=[kb_info]
+        )
+
+        rebuild_kb_btn.click(
+            fn=get_kb_info,
+            inputs=[],
+            outputs=[kb_info]
+        )
 
     with gr.Tab("Model Settings"):
         gr.Markdown("### Model Configuration")
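The tab above uses a `gr.Dataframe` with an editable boolean column as a lightweight URL selector. A self-contained sketch of that pattern, with illustrative names that are not part of the commit:

```python
import gradio as gr
import pandas as pd

# Hypothetical stand-ins for the app's URL list and callback.
urls = ["https://example.com/a", "https://example.com/b"]

def count_selected(df):
    # Gradio passes the edited table back; normalize to a DataFrame.
    df = pd.DataFrame(df)
    return f"{int(df['Include'].sum())} of {len(df)} URLs selected"

with gr.Blocks() as demo:
    table = gr.Dataframe(
        value=pd.DataFrame({"URL": urls, "Include": [True] * len(urls)}),
        interactive=True,
    )
    status = gr.Textbox(label="Status", interactive=False)
    gr.Button("Apply").click(fn=count_selected, inputs=[table], outputs=[status])

demo.launch()
```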
 
@@ -1387,7 +1598,7 @@ with gr.Blocks(css="""
         inputs=[],
         outputs=[evaluation_status, qa_table, refresh_data_status]
     )
-
+
     # Model change handler - outside of Tabs but inside Blocks
     model_selector.change(
         fn=change_model,
 
@@ -1416,3 +1627,198 @@ if __name__ == "__main__":
         logger.warning("Knowledge base not found. Please create it through the interface.")
 
     demo.launch(share=True)
+
+# Add helper functions for URL selection:
+def get_selected_urls(sources_df):
+    """Get list of URLs selected for inclusion"""
+    try:
+        if not isinstance(sources_df, pd.DataFrame):
+            sources_df = pd.DataFrame(sources_df)
+
+        selected_urls = sources_df[sources_df["Include"] == True]["URL"].tolist()
+        return selected_urls
+    except Exception as e:
+        logger.error(f"Error getting selected URLs: {str(e)}")
+        return []
+
+def update_kb_with_selected(sources_df):
+    """Update knowledge base using only selected URLs"""
+    try:
+        selected_urls = get_selected_urls(sources_df)
+
+        if not selected_urls:
+            return "Error: No URLs selected for inclusion"
+
+        # Temporarily override the global URL list, restoring it afterwards
+        from config import constants
+        original_urls = constants.URLS
+        constants.URLS = selected_urls
+
+        try:
+            success, message = create_vector_store(mode="update")
+
+            if success:
+                metadata = {
+                    "last_updated": datetime.datetime.now().isoformat(),
+                    "source_count": len(selected_urls),
+                    "sources": selected_urls
+                }
+
+                json_content = json.dumps(metadata, indent=2).encode('utf-8')
+                api = HfApi(token=HF_TOKEN)
+
+                api.upload_file(
+                    path_or_fileobj=json_content,
+                    path_in_repo="vector_store/metadata.json",
+                    repo_id=DATASET_ID,
+                    repo_type="dataset"
+                )
+
+            return message
+        finally:
+            constants.URLS = original_urls
+
+    except Exception as e:
+        return f"Error updating knowledge base: {str(e)}"
+
+def rebuild_kb_with_selected(sources_df):
+    """Rebuild knowledge base from scratch using only selected URLs"""
+    try:
+        selected_urls = get_selected_urls(sources_df)
+
+        if not selected_urls:
+            return "Error: No URLs selected for inclusion"
+
+        # Temporarily override the global URL list, restoring it afterwards
+        from config import constants
+        original_urls = constants.URLS
+        constants.URLS = selected_urls
+
+        try:
+            success, message = create_vector_store(mode="rebuild")
+
+            if success:
+                metadata = {
+                    "last_updated": datetime.datetime.now().isoformat(),
+                    "source_count": len(selected_urls),
+                    "sources": selected_urls
+                }
+
+                json_content = json.dumps(metadata, indent=2).encode('utf-8')
+                api = HfApi(token=HF_TOKEN)
+
+                api.upload_file(
+                    path_or_fileobj=json_content,
+                    path_in_repo="vector_store/metadata.json",
+                    repo_id=DATASET_ID,
+                    repo_type="dataset"
+                )
+
+            return message
+        finally:
+            constants.URLS = original_urls
+
+    except Exception as e:
+        return f"Error rebuilding knowledge base: {str(e)}"
+
+# Add new function for source status updates
+def update_source_status(df):
+    """Update status column based on Include selection"""
+    try:
+        if not isinstance(df, pd.DataFrame):
+            df = pd.DataFrame(df)
+
+        df["Status"] = df["Include"].apply(lambda x: "Selected" if x else "Excluded")
+        selected_count = df["Include"].sum()
+
+        return df, f"{selected_count} URLs selected for inclusion"
+    except Exception as e:
+        return df, f"Error updating status: {str(e)}"
+
+# Update event handlers in the Knowledge Base tab section
+with gr.Tab("Knowledge Base"):
+    gr.Markdown("### Knowledge Base Management")
+
+    with gr.Row():
+        with gr.Column(scale=2):
+            # Sources list with selection
+            gr.Markdown("#### Information Sources")
+            sources_list = gr.Dataframe(
+                value=pd.DataFrame({
+                    "URL": URLS,
+                    "Include": [True for _ in URLS],
+                    "Status": ["Ready" for _ in URLS]
+                }),
+                interactive=True,
+                wrap=True,
+                row_count=15,
+                show_label=False
+            )
+
+            # Status display
+            kb_status = gr.Textbox(
+                label="Operation Status",
+                interactive=False,
+                placeholder="Ready",
+                value="Ready"
+            )
+
+            # Control buttons
+            with gr.Row():
+                update_kb_btn = gr.Button("Update Knowledge Base", variant="primary")
+                rebuild_kb_btn = gr.Button("Rebuild Knowledge Base from Scratch", variant="secondary")
+
+            # Help text
+            gr.Markdown("""
+            <small>
+            **Update Knowledge Base**: Adds new information to the existing knowledge base.
+
+            **Rebuild Knowledge Base**: Recreates the entire knowledge base from scratch. Use this if there are inconsistencies.
+
+            All changes are saved to the Hugging Face dataset.
+            </small>
+            """)
+
+        with gr.Column(scale=1):
+            # Knowledge base info display
+            gr.Markdown("#### Knowledge Base Information")
+            kb_info = gr.Markdown(value=get_kb_info())
+            refresh_kb_info_btn = gr.Button("Refresh Information")
+
+    # Event handlers for Knowledge Base operations
+    update_kb_btn.click(
+        fn=update_kb_with_selected,
+        inputs=[sources_list],
+        outputs=[kb_status]
+    )
+
+    rebuild_kb_btn.click(
+        fn=rebuild_kb_with_selected,
+        inputs=[sources_list],
+        outputs=[kb_status]
+    )
+
+    # Auto-refresh knowledge base info after operations
+    update_kb_btn.click(
+        fn=get_kb_info,
+        inputs=[],
+        outputs=[kb_info]
+    )
+
+    rebuild_kb_btn.click(
+        fn=get_kb_info,
+        inputs=[],
+        outputs=[kb_info]
+    )
+
+    # Refresh button handler
+    refresh_kb_info_btn.click(
+        fn=get_kb_info,
+        inputs=[],
+        outputs=[kb_info]
+    )
+
+    # Source selection status update handler
+    sources_list.change(
+        fn=update_source_status,
+        inputs=[sources_list],
+        outputs=[sources_list, kb_status]
+    )
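The `update_kb_with_selected` and `rebuild_kb_with_selected` helpers temporarily monkey-patch `config.constants.URLS` and restore it in a `finally` block. The same idea can be factored into a single context manager; a sketch, assuming `URLS` is a plain module attribute as used above (illustrative, not part of the commit):

```python
from contextlib import contextmanager

from config import constants  # module whose URLS attribute is swapped

@contextmanager
def override_urls(urls):
    """Temporarily replace constants.URLS, restoring it on exit."""
    original = constants.URLS
    constants.URLS = urls
    try:
        yield
    finally:
        constants.URLS = original

# Usage: both helpers could then share one code path, e.g.
# with override_urls(selected_urls):
#     success, message = create_vector_store(mode="update")
```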
src/knowledge_base/dataset.py CHANGED
@@ -37,6 +37,61 @@ class DatasetManager:
         self.chat_history_path = DATASET_CHAT_HISTORY_PATH
         self.fine_tuned_path = DATASET_FINE_TUNED_PATH
         self.annotations_path = DATASET_ANNOTATIONS_PATH
+
+    def get_last_update_date(self):
+        """
+        Get the date of the last knowledge base update.
+
+        Returns:
+            str: Last update date in ISO format, or None if unavailable
+        """
+        try:
+            # Try to read the metadata stored in the dataset
+            api = HfApi(token=self.hf_token)
+
+            # First, check whether the dedicated metadata file exists
+            files = api.list_repo_files(
+                repo_id=self.dataset_id,
+                repo_type="dataset"
+            )
+
+            metadata_file = "vector_store/metadata.json"
+
+            if metadata_file in files:
+                # Download the metadata file
+                temp_dir = tempfile.mkdtemp()
+
+                metadata_path = api.hf_hub_download(
+                    repo_id=self.dataset_id,
+                    repo_type="dataset",
+                    filename=metadata_file,
+                    local_dir=temp_dir,
+                    local_dir_use_symlinks=False
+                )
+
+                # Read the update date out of the metadata
+                with open(metadata_path, 'r') as f:
+                    metadata = json.load(f)
+                    return metadata.get("last_updated", None)
+
+            # If no dedicated metadata file exists, fall back to the
+            # date of the repository's last commit
+            info = api.repo_info(
+                repo_id=self.dataset_id,
+                repo_type="dataset"
+            )
+
+            # Use the last commit date when available
+            if getattr(info, "last_modified", None):
+                return info.last_modified.isoformat()
+
+            return None
+        except Exception as e:
+            logger.error(f"Error getting last update date: {str(e)}")
+            return None
 
     def init_dataset_structure(self) -> Tuple[bool, str]:
         """