Spaces:
Running
Running
Refactor README and app.py: Update dataset structure in README, add knowledge base management features in app.py, and implement last update date retrieval in DatasetManager.
Browse files- README.md +16 -18
- app.py +409 -3
- src/knowledge_base/dataset.py +55 -0
README.md
CHANGED
|
@@ -64,28 +64,26 @@ status-law-gbot/
|
|
| 64 |
│ └── training/ # Training module
|
| 65 |
│ ├── fine_tuner.py
|
| 66 |
│ └── model_manager.py
|
| 67 |
-
└──
|
| 68 |
-
├──
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
├──
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
```
|
| 76 |
|
| 77 |
## 💾 Data Storage
|
| 78 |
|
| 79 |
-
###
|
| 80 |
-
- `
|
| 81 |
-
- `
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
- `
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
- `src/models/fine_tuned/`: Directory for storing fine-tuned models
|
| 88 |
-
- `src/models/registry.json`: Model registry and configuration
|
| 89 |
|
| 90 |
## 🛠️ Setup
|
| 91 |
|
|
|
|
| 64 |
│ └── training/ # Training module
|
| 65 |
│ ├── fine_tuner.py
|
| 66 |
│ └── model_manager.py
|
| 67 |
+
└── dataset/ # HuggingFace dataset structure
|
| 68 |
+
├── annotations/ # Conversation annotations
|
| 69 |
+
├── chat_history/ # Chat logs and conversations
|
| 70 |
+
├── fine_tuned_models/ # Fine-tuned model storage
|
| 71 |
+
├── preferences/ # User preferences
|
| 72 |
+
├── training_data/ # Processed training data
|
| 73 |
+
├── training_logs/ # Training process logs
|
| 74 |
+
└── vector_store/ # FAISS vector storage
|
| 75 |
```
|
| 76 |
|
| 77 |
## 💾 Data Storage
|
| 78 |
|
| 79 |
+
### Dataset Organization
|
| 80 |
+
- `annotations/`: Conversation quality metrics and annotations
|
| 81 |
+
- `chat_history/`: JSON files containing chat conversations
|
| 82 |
+
- `fine_tuned_models/`: Storage for LoRA adapters and model checkpoints
|
| 83 |
+
- `preferences/`: User preferences and settings
|
| 84 |
+
- `training_data/`: Processed data ready for model training
|
| 85 |
+
- `training_logs/`: Detailed training process logs
|
| 86 |
+
- `vector_store/`: FAISS indexes for semantic search
|
|
|
|
|
|
|
| 87 |
|
| 88 |
## 🛠️ Setup
|
| 89 |
|
app.py
CHANGED
|
@@ -9,7 +9,7 @@ import os
|
|
| 9 |
|
| 10 |
# Third-party imports
|
| 11 |
import gradio as gr
|
| 12 |
-
import pandas as pd
|
| 13 |
|
| 14 |
|
| 15 |
from huggingface_hub import HfApi, InferenceClient
|
|
@@ -18,6 +18,7 @@ import langdetect
|
|
| 18 |
from dotenv import load_dotenv
|
| 19 |
import requests
|
| 20 |
from datasets import load_dataset
|
|
|
|
| 21 |
|
| 22 |
# Set seed for consistent results
|
| 23 |
langdetect.DetectorFactory.seed = 0
|
|
@@ -479,11 +480,17 @@ def log_api_error(user_message, error_message, model_id, is_fallback=False):
|
|
| 479 |
logger.info(f"API error logged to {log_path}")
|
| 480 |
except Exception as e:
|
| 481 |
logger.error(f"Failed to log API error: {str(e)}")
|
| 482 |
-
|
| 483 |
def update_kb():
|
| 484 |
"""Function to update existing knowledge base with new documents"""
|
| 485 |
try:
|
|
|
|
| 486 |
success, message = create_vector_store(mode="update")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 487 |
return message
|
| 488 |
except Exception as e:
|
| 489 |
return f"Error updating knowledge base: {str(e)}"
|
|
@@ -491,11 +498,63 @@ def update_kb():
|
|
| 491 |
def rebuild_kb():
|
| 492 |
"""Function to create knowledge base from scratch"""
|
| 493 |
try:
|
|
|
|
| 494 |
success, message = create_vector_store(mode="rebuild")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
return message
|
| 496 |
except Exception as e:
|
| 497 |
return f"Error creating knowledge base: {str(e)}"
|
| 498 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 499 |
def save_chat_history(history, conversation_id):
|
| 500 |
"""Save chat history to a file and to HuggingFace dataset"""
|
| 501 |
try:
|
|
@@ -1051,6 +1110,158 @@ with gr.Blocks(css="""
|
|
| 1051 |
)
|
| 1052 |
|
| 1053 |
clear_btn.click(clear_conversation, None, [chatbot, conversation_id])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1054 |
|
| 1055 |
with gr.Tab("Model Settings"):
|
| 1056 |
gr.Markdown("### Model Configuration")
|
|
@@ -1387,7 +1598,7 @@ with gr.Blocks(css="""
|
|
| 1387 |
inputs=[],
|
| 1388 |
outputs=[evaluation_status, qa_table, refresh_data_status]
|
| 1389 |
)
|
| 1390 |
-
|
| 1391 |
# Model change handler - outside of Tabs but inside Blocks
|
| 1392 |
model_selector.change(
|
| 1393 |
fn=change_model,
|
|
@@ -1416,3 +1627,198 @@ if __name__ == "__main__":
|
|
| 1416 |
logger.warning("Knowledge base not found. Please create it through the interface.")
|
| 1417 |
|
| 1418 |
demo.launch(share=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
# Third-party imports
|
| 11 |
import gradio as gr
|
| 12 |
+
import pandas as pd
|
| 13 |
|
| 14 |
|
| 15 |
from huggingface_hub import HfApi, InferenceClient
|
|
|
|
| 18 |
from dotenv import load_dotenv
|
| 19 |
import requests
|
| 20 |
from datasets import load_dataset
|
| 21 |
+
from config.constants import URLS
|
| 22 |
|
| 23 |
# Set seed for consistent results
|
| 24 |
langdetect.DetectorFactory.seed = 0
|
|
|
|
| 480 |
logger.info(f"API error logged to {log_path}")
|
| 481 |
except Exception as e:
|
| 482 |
logger.error(f"Failed to log API error: {str(e)}")
|
| 483 |
+
|
| 484 |
def update_kb():
    """Update the existing knowledge base with new documents.

    Runs ``create_vector_store`` in ``"update"`` mode and, when it reports
    success, stores fresh knowledge base metadata via ``save_kb_metadata``.

    Returns:
        str: The status message produced by ``create_vector_store``, or an
        error description if any step raised.
    """
    try:
        # Ask the vector store builder to extend the existing knowledge base
        ok, status_text = create_vector_store(mode="update")

        # Record the update date only after a successful update
        if ok:
            save_kb_metadata()
    except Exception as e:
        return f"Error updating knowledge base: {str(e)}"

    return status_text
|
|
|
|
| 498 |
def rebuild_kb():
    """Create the knowledge base from scratch.

    Runs ``create_vector_store`` in ``"rebuild"`` mode and, when it reports
    success, stores fresh knowledge base metadata via ``save_kb_metadata``.

    Returns:
        str: The status message produced by ``create_vector_store``, or an
        error description if any step raised.
    """
    try:
        # Ask the vector store builder to recreate the knowledge base
        ok, status_text = create_vector_store(mode="rebuild")

        # Record the update date only after a successful rebuild
        if ok:
            save_kb_metadata()
    except Exception as e:
        return f"Error creating knowledge base: {str(e)}"

    return status_text
|
| 511 |
|
| 512 |
+
def save_kb_metadata():
    """Save knowledge base metadata to the dataset repository.

    Uploads ``vector_store/metadata.json`` containing the current timestamp
    (``last_updated``), the number of configured sources (``source_count``)
    and the source list itself (``sources``, taken from ``URLS``).

    Returns:
        bool: True when the metadata file was uploaded, False if any step
        failed (the error is logged, never raised).
    """
    try:
        # Build the metadata payload stamped with the current date
        metadata = {
            "last_updated": datetime.datetime.now().isoformat(),
            "source_count": len(URLS),
            "sources": URLS
        }

        json_content = json.dumps(metadata, indent=2).encode('utf-8')
        api = HfApi(token=HF_TOKEN)

        # Make sure the vector_store/ folder exists in the dataset (best effort)
        try:
            files = api.list_repo_files(
                repo_id=DATASET_ID,
                repo_type="dataset"
            )

            # list_repo_files() yields file paths, not directory names, so the
            # folder exists only if some path lives under "vector_store/".
            # Comparing against the bare name "vector_store" never matched and
            # re-uploaded the placeholder on every call.
            folder_exists = any(path.startswith("vector_store/") for path in files)
            if not folder_exists:
                # Upload an empty placeholder file so the folder gets created
                api.upload_file(
                    path_or_fileobj=b"",
                    path_in_repo="vector_store/.gitkeep",
                    repo_id=DATASET_ID,
                    repo_type="dataset"
                )
        except Exception as e:
            # A failed folder check must not prevent the metadata upload below
            logger.warning(f"Error checking vector_store directory: {str(e)}")

        # Upload the metadata file itself
        api.upload_file(
            path_or_fileobj=json_content,
            path_in_repo="vector_store/metadata.json",
            repo_id=DATASET_ID,
            repo_type="dataset"
        )

        logger.info("Knowledge base metadata saved successfully")
        return True
    except Exception as e:
        logger.error(f"Error saving knowledge base metadata: {str(e)}")
        return False
|
| 557 |
+
|
| 558 |
def save_chat_history(history, conversation_id):
|
| 559 |
"""Save chat history to a file and to HuggingFace dataset"""
|
| 560 |
try:
|
|
|
|
| 1110 |
)
|
| 1111 |
|
| 1112 |
clear_btn.click(clear_conversation, None, [chatbot, conversation_id])
|
| 1113 |
+
|
| 1114 |
+
|
| 1115 |
+
|
| 1116 |
+
with gr.Tab("Knowledge Base"):
|
| 1117 |
+
gr.Markdown("### Knowledge Base Management")
|
| 1118 |
+
|
| 1119 |
+
with gr.Row():
|
| 1120 |
+
with gr.Column(scale=2):
|
| 1121 |
+
# Отображение источников
|
| 1122 |
+
gr.Markdown("#### Information Sources")
|
| 1123 |
+
sources_list = gr.Dataframe(
|
| 1124 |
+
value=pd.DataFrame({
|
| 1125 |
+
"URL": URLS,
|
| 1126 |
+
"Include": [True for _ in URLS],
|
| 1127 |
+
"Status": ["Ready" for _ in URLS]
|
| 1128 |
+
}),
|
| 1129 |
+
interactive=True,
|
| 1130 |
+
wrap=True,
|
| 1131 |
+
row_count=15,
|
| 1132 |
+
show_label=False
|
| 1133 |
+
)
|
| 1134 |
+
|
| 1135 |
+
# Статус операций с базой знаний
|
| 1136 |
+
kb_status = gr.Textbox(
|
| 1137 |
+
label="Operation Status",
|
| 1138 |
+
interactive=False,
|
| 1139 |
+
placeholder="Ready",
|
| 1140 |
+
value="Ready"
|
| 1141 |
+
)
|
| 1142 |
+
|
| 1143 |
+
# Кнопки для управления базой знаний
|
| 1144 |
+
with gr.Row():
|
| 1145 |
+
update_kb_btn = gr.Button("Update Knowledge Base", variant="primary")
|
| 1146 |
+
rebuild_kb_btn = gr.Button("Rebuild Knowledge Base from Scratch", variant="secondary")
|
| 1147 |
+
|
| 1148 |
+
gr.Markdown("""
|
| 1149 |
+
<small>
|
| 1150 |
+
**Update Knowledge Base**: Adds new information to the existing knowledge base.
|
| 1151 |
+
|
| 1152 |
+
**Rebuild Knowledge Base**: Recreates the entire knowledge base from scratch. Use this if there are inconsistencies.
|
| 1153 |
+
|
| 1154 |
+
All changes are saved to the Hugging Face dataset.
|
| 1155 |
+
</small>
|
| 1156 |
+
""")
|
| 1157 |
+
|
| 1158 |
+
with gr.Column(scale=1):
|
| 1159 |
+
# Информация о текущей базе знаний
|
| 1160 |
+
gr.Markdown("#### Knowledge Base Information")
|
| 1161 |
+
|
| 1162 |
+
# Функция для получения информации о базе знаний
|
| 1163 |
+
def get_kb_info():
    """Build the Markdown summary shown in the knowledge base info panel.

    Loads the vector store, counts its documents and the distinct ``source``
    values found in document metadata, and looks up the last update date
    through ``DatasetManager.get_last_update_date``.

    Returns:
        str: Markdown text describing the knowledge base status. Errors are
        reported inside the returned text instead of being raised.
    """
    try:
        vector_store = load_vector_store()
        # A missing store or a string result is treated as "not available"
        # (presumably the string is an error message - confirm against
        # load_vector_store).
        if vector_store is None or isinstance(vector_store, str):
            return """
            **Status**: Not found or error

            **Documents**: 0

            **Last updated**: Never

            Please create a knowledge base using the buttons on the left.
            """

        # NOTE(review): relies on the docstore's private `_dict` mapping.
        documents = vector_store.docstore._dict
        doc_count = len(documents)

        # Distinct sources referenced by the stored documents
        sources = {
            doc.metadata['source']
            for doc in documents.values()
            if hasattr(doc, 'metadata') and 'source' in doc.metadata
        }
        source_count = len(sources)

        # The store exists but none of its documents names a source.
        # Report the real document count instead of a hard-coded 0, which was
        # misleading when documents exist without `source` metadata.
        if source_count == 0:
            return f"""
            **Status**: Created but empty

            **Documents**: {doc_count}

            **Last updated**: Unknown

            Please rebuild the knowledge base using the button on the left.
            """

        # Read the last update date from the dataset metadata (best effort)
        last_updated = "Unknown"
        try:
            from src.knowledge_base.dataset import DatasetManager
            dataset = DatasetManager()
            last_updated = dataset.get_last_update_date() or "Unknown"
        except Exception as e:
            logger.error(f"Error getting last update date: {str(e)}")

        return f"""
        **Status**: Active

        **Documents**: {doc_count}

        **Sources**: {source_count}

        **Last updated**: {last_updated}
        """

    except Exception as e:
        # Keep a trace in the logs; the panel only shows the short message
        logger.error(f"Error collecting knowledge base info: {str(e)}")
        return f"""
        **Status**: Error

        **Details**: {str(e)}

        Please try rebuilding the knowledge base.
        """
|
| 1226 |
+
|
| 1227 |
+
kb_info = gr.Markdown(value=get_kb_info())
|
| 1228 |
+
refresh_kb_info_btn = gr.Button("Refresh Information")
|
| 1229 |
+
|
| 1230 |
+
# 3. Добавим обработчики событий для кнопок в конце файла
|
| 1231 |
+
# Добавьте эти обработчики перед строкой "if __name__ == "__main__":"
|
| 1232 |
+
|
| 1233 |
+
# Обработчики для Knowledge Base
|
| 1234 |
+
update_kb_btn.click(
|
| 1235 |
+
fn=update_kb_with_selected,
|
| 1236 |
+
inputs=[sources_list],
|
| 1237 |
+
outputs=[kb_status]
|
| 1238 |
+
)
|
| 1239 |
+
|
| 1240 |
+
rebuild_kb_btn.click(
|
| 1241 |
+
fn=rebuild_kb_with_selected,
|
| 1242 |
+
inputs=[sources_list],
|
| 1243 |
+
outputs=[kb_status]
|
| 1244 |
+
)
|
| 1245 |
+
|
| 1246 |
+
# Обновление информации о базе знаний
|
| 1247 |
+
refresh_kb_info_btn.click(
|
| 1248 |
+
fn=get_kb_info,
|
| 1249 |
+
inputs=[],
|
| 1250 |
+
outputs=[kb_info]
|
| 1251 |
+
)
|
| 1252 |
+
|
| 1253 |
+
# Automatically refresh the knowledge base information after knowledge base operations
|
| 1254 |
+
update_kb_btn.click(
|
| 1255 |
+
fn=get_kb_info,
|
| 1256 |
+
inputs=[],
|
| 1257 |
+
outputs=[kb_info]
|
| 1258 |
+
)
|
| 1259 |
+
|
| 1260 |
+
rebuild_kb_btn.click(
|
| 1261 |
+
fn=get_kb_info,
|
| 1262 |
+
inputs=[],
|
| 1263 |
+
outputs=[kb_info]
|
| 1264 |
+
)
|
| 1265 |
|
| 1266 |
with gr.Tab("Model Settings"):
|
| 1267 |
gr.Markdown("### Model Configuration")
|
|
|
|
| 1598 |
inputs=[],
|
| 1599 |
outputs=[evaluation_status, qa_table, refresh_data_status]
|
| 1600 |
)
|
| 1601 |
+
|
| 1602 |
# Model change handler - outside of Tabs but inside Blocks
|
| 1603 |
model_selector.change(
|
| 1604 |
fn=change_model,
|
|
|
|
| 1627 |
logger.warning("Knowledge base not found. Please create it through the interface.")
|
| 1628 |
|
| 1629 |
demo.launch(share=True)
|
| 1630 |
+
|
| 1631 |
+
# Add helper functions for URL selection:
|
| 1632 |
+
def get_selected_urls(sources_df):
    """Get the list of URLs selected for inclusion.

    Args:
        sources_df: A ``pandas.DataFrame`` (or anything ``pandas.DataFrame``
            can be built from) with at least the columns ``"URL"`` and
            ``"Include"``.

    Returns:
        list: The ``"URL"`` values of every row whose ``"Include"`` flag is
        set. An empty list is returned (and the error logged) when the input
        cannot be processed.
    """
    def _is_checked(value):
        # Flags may arrive as text (e.g. "true"/"false") rather than as real
        # booleans; a plain `== True` comparison would silently drop those.
        if isinstance(value, str):
            return value.strip().lower() in ("true", "1", "yes")
        # Same result as the former `value == True` for bools and numbers
        return bool(value == 1)

    try:
        if not isinstance(sources_df, pd.DataFrame):
            sources_df = pd.DataFrame(sources_df)

        include_mask = sources_df["Include"].map(_is_checked).astype(bool)
        return sources_df.loc[include_mask, "URL"].tolist()
    except Exception as e:
        logger.error(f"Error getting selected URLs: {str(e)}")
        return []
|
| 1643 |
+
|
| 1644 |
+
def _upload_selected_sources_metadata(selected_urls):
    """Upload vector_store/metadata.json describing the sources just used."""
    metadata = {
        "last_updated": datetime.datetime.now().isoformat(),
        "source_count": len(selected_urls),
        "sources": selected_urls
    }

    json_content = json.dumps(metadata, indent=2).encode('utf-8')
    api = HfApi(token=HF_TOKEN)

    api.upload_file(
        path_or_fileobj=json_content,
        path_in_repo="vector_store/metadata.json",
        repo_id=DATASET_ID,
        repo_type="dataset"
    )


def _run_kb_operation_with_selected(sources_df, mode):
    """Run create_vector_store(mode=...) with URLS limited to the selected rows."""
    selected_urls = get_selected_urls(sources_df)

    if not selected_urls:
        return "Error: No URLs selected for inclusion"

    from config import constants
    original_urls = constants.URLS
    # NOTE(review): swapping the module attribute only affects code that reads
    # `constants.URLS` at call time - confirm create_vector_store does so.
    constants.URLS = selected_urls

    try:
        success, message = create_vector_store(mode=mode)

        if success:
            try:
                _upload_selected_sources_metadata(selected_urls)
            except Exception as e:
                # The knowledge base itself was built; do not report the whole
                # operation as failed just because the metadata upload did.
                logger.error(f"Error saving knowledge base metadata: {str(e)}")
                return f"{message} (warning: metadata was not saved: {str(e)})"

        return message
    finally:
        # Always restore the full source list, even if the build raised
        constants.URLS = original_urls


def update_kb_with_selected(sources_df):
    """Update the knowledge base using only the selected URLs.

    NOTE(review): the UI wires this function to a button; make sure this
    definition is executed before that wiring runs.

    Args:
        sources_df: Table of sources with ``"URL"`` and ``"Include"`` columns,
            as accepted by ``get_selected_urls``.

    Returns:
        str: The status message from ``create_vector_store``, or an error
        description when nothing is selected or a step raised.
    """
    try:
        return _run_kb_operation_with_selected(sources_df, "update")
    except Exception as e:
        return f"Error updating knowledge base: {str(e)}"


def rebuild_kb_with_selected(sources_df):
    """Rebuild the knowledge base from scratch using only the selected URLs.

    NOTE(review): the UI wires this function to a button; make sure this
    definition is executed before that wiring runs.

    Args:
        sources_df: Table of sources with ``"URL"`` and ``"Include"`` columns,
            as accepted by ``get_selected_urls``.

    Returns:
        str: The status message from ``create_vector_store``, or an error
        description when nothing is selected or a step raised.
    """
    try:
        return _run_kb_operation_with_selected(sources_df, "rebuild")
    except Exception as e:
        return f"Error rebuilding knowledge base: {str(e)}"
|
| 1721 |
+
|
| 1722 |
+
# Add new function for source status updates
|
| 1723 |
+
def update_source_status(df):
    """Update the ``Status`` column based on the ``Include`` selection.

    Args:
        df: A ``pandas.DataFrame`` (or anything ``pandas.DataFrame`` can be
            built from) with an ``"Include"`` column.

    Returns:
        tuple: ``(df, message)`` where ``df`` has its ``"Status"`` column set
        to ``"Selected"`` or ``"Excluded"`` per row and ``message`` states how
        many rows are selected. On failure the table is returned as received
        so far, together with an error message.
    """
    def _is_checked(value):
        # Flags may arrive as text; the string "false" is truthy, so plain
        # truthiness would mark it as selected.
        if isinstance(value, str):
            return value.strip().lower() in ("true", "1", "yes")
        return bool(value)

    try:
        if not isinstance(df, pd.DataFrame):
            df = pd.DataFrame(df)

        included = df["Include"].map(_is_checked)
        df["Status"] = included.map(lambda flag: "Selected" if flag else "Excluded")
        # Count normalised flags; summing raw text values would concatenate them
        selected_count = int(included.sum())

        return df, f"{selected_count} URLs selected for inclusion"
    except Exception as e:
        return df, f"Error updating status: {str(e)}"
|
| 1735 |
+
|
| 1736 |
+
# Update event handlers in the Knowledge Base tab section
|
| 1737 |
+
with gr.Tab("Knowledge Base"):
|
| 1738 |
+
gr.Markdown("### Knowledge Base Management")
|
| 1739 |
+
|
| 1740 |
+
with gr.Row():
|
| 1741 |
+
with gr.Column(scale=2):
|
| 1742 |
+
# Sources list with selection
|
| 1743 |
+
gr.Markdown("#### Information Sources")
|
| 1744 |
+
sources_list = gr.Dataframe(
|
| 1745 |
+
value=pd.DataFrame({
|
| 1746 |
+
"URL": URLS,
|
| 1747 |
+
"Include": [True for _ in URLS],
|
| 1748 |
+
"Status": ["Ready" for _ in URLS]
|
| 1749 |
+
}),
|
| 1750 |
+
interactive=True,
|
| 1751 |
+
wrap=True,
|
| 1752 |
+
row_count=15,
|
| 1753 |
+
show_label=False
|
| 1754 |
+
)
|
| 1755 |
+
|
| 1756 |
+
# Status display
|
| 1757 |
+
kb_status = gr.Textbox(
|
| 1758 |
+
label="Operation Status",
|
| 1759 |
+
interactive=False,
|
| 1760 |
+
placeholder="Ready",
|
| 1761 |
+
value="Ready"
|
| 1762 |
+
)
|
| 1763 |
+
|
| 1764 |
+
# Control buttons
|
| 1765 |
+
with gr.Row():
|
| 1766 |
+
update_kb_btn = gr.Button("Update Knowledge Base", variant="primary")
|
| 1767 |
+
rebuild_kb_btn = gr.Button("Rebuild Knowledge Base from Scratch", variant="secondary")
|
| 1768 |
+
|
| 1769 |
+
# Help text
|
| 1770 |
+
gr.Markdown("""
|
| 1771 |
+
<small>
|
| 1772 |
+
**Update Knowledge Base**: Adds new information to the existing knowledge base.
|
| 1773 |
+
|
| 1774 |
+
**Rebuild Knowledge Base**: Recreates the entire knowledge base from scratch. Use this if there are inconsistencies.
|
| 1775 |
+
|
| 1776 |
+
All changes are saved to the Hugging Face dataset.
|
| 1777 |
+
</small>
|
| 1778 |
+
""")
|
| 1779 |
+
|
| 1780 |
+
with gr.Column(scale=1):
|
| 1781 |
+
# Knowledge base info display
|
| 1782 |
+
gr.Markdown("#### Knowledge Base Information")
|
| 1783 |
+
kb_info = gr.Markdown(value=get_kb_info())
|
| 1784 |
+
refresh_kb_info_btn = gr.Button("Refresh Information")
|
| 1785 |
+
|
| 1786 |
+
# Event handlers for Knowledge Base operations
|
| 1787 |
+
update_kb_btn.click(
|
| 1788 |
+
fn=update_kb_with_selected,
|
| 1789 |
+
inputs=[sources_list],
|
| 1790 |
+
outputs=[kb_status]
|
| 1791 |
+
)
|
| 1792 |
+
|
| 1793 |
+
rebuild_kb_btn.click(
|
| 1794 |
+
fn=rebuild_kb_with_selected,
|
| 1795 |
+
inputs=[sources_list],
|
| 1796 |
+
outputs=[kb_status]
|
| 1797 |
+
)
|
| 1798 |
+
|
| 1799 |
+
# Auto-refresh knowledge base info after operations
|
| 1800 |
+
update_kb_btn.click(
|
| 1801 |
+
fn=get_kb_info,
|
| 1802 |
+
inputs=[],
|
| 1803 |
+
outputs=[kb_info]
|
| 1804 |
+
)
|
| 1805 |
+
|
| 1806 |
+
rebuild_kb_btn.click(
|
| 1807 |
+
fn=get_kb_info,
|
| 1808 |
+
inputs=[],
|
| 1809 |
+
outputs=[kb_info]
|
| 1810 |
+
)
|
| 1811 |
+
|
| 1812 |
+
# Refresh button handler
|
| 1813 |
+
refresh_kb_info_btn.click(
|
| 1814 |
+
fn=get_kb_info,
|
| 1815 |
+
inputs=[],
|
| 1816 |
+
outputs=[kb_info]
|
| 1817 |
+
)
|
| 1818 |
+
|
| 1819 |
+
# Source selection status update handler
|
| 1820 |
+
sources_list.change(
|
| 1821 |
+
fn=update_source_status,
|
| 1822 |
+
inputs=[sources_list],
|
| 1823 |
+
outputs=[sources_list, kb_status]
|
| 1824 |
+
)
|
src/knowledge_base/dataset.py
CHANGED
|
@@ -37,6 +37,61 @@ class DatasetManager:
|
|
| 37 |
self.chat_history_path = DATASET_CHAT_HISTORY_PATH
|
| 38 |
self.fine_tuned_path = DATASET_FINE_TUNED_PATH
|
| 39 |
self.annotations_path = DATASET_ANNOTATIONS_PATH
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
def init_dataset_structure(self) -> Tuple[bool, str]:
|
| 42 |
"""
|
|
|
|
| 37 |
self.chat_history_path = DATASET_CHAT_HISTORY_PATH
|
| 38 |
self.fine_tuned_path = DATASET_FINE_TUNED_PATH
|
| 39 |
self.annotations_path = DATASET_ANNOTATIONS_PATH
|
| 40 |
+
|
| 41 |
+
# Добавьте этот метод в класс DatasetManager в файле src/knowledge_base/dataset.py
|
| 42 |
+
|
| 43 |
+
def get_last_update_date(self):
    """
    Return the date of the last knowledge base update.

    Reads the ``last_updated`` field of ``vector_store/metadata.json`` in the
    dataset repository. When that file is absent, falls back to the
    repository's last-modified timestamp.

    Returns:
        str: The last update date (ISO format), or None if the information
        is unavailable or an error occurred (the error is logged).
    """
    metadata_file = "vector_store/metadata.json"

    try:
        api = HfApi(token=self.hf_token)

        # First check whether the dedicated metadata file exists
        files = api.list_repo_files(
            repo_id=self.dataset_id,
            repo_type="dataset"
        )

        if metadata_file in files:
            # Download into a temporary directory that is removed afterwards
            # (the previous mkdtemp() directory was never cleaned up).
            with tempfile.TemporaryDirectory() as temp_dir:
                # Use the path returned by the download: the file is stored
                # under its repo-relative path (temp_dir/vector_store/...),
                # not directly in temp_dir.
                metadata_path = api.hf_hub_download(
                    repo_id=self.dataset_id,
                    repo_type="dataset",
                    filename=metadata_file,
                    local_dir=temp_dir
                )

                with open(metadata_path, 'r', encoding='utf-8') as f:
                    metadata = json.load(f)

            return metadata.get("last_updated", None)

        # No metadata file: fall back to the repository's last modification
        # date. HfApi exposes this through repo_info().
        repo_details = api.repo_info(
            repo_id=self.dataset_id,
            repo_type="dataset"
        )

        # Newer huggingface_hub releases name the attribute `last_modified`;
        # older ones used `lastModified`.
        last_modified = getattr(repo_details, "last_modified", None)
        if last_modified is None:
            last_modified = getattr(repo_details, "lastModified", None)

        if last_modified is None:
            return None

        # Normalise datetime objects to the documented ISO string
        if hasattr(last_modified, "isoformat"):
            return last_modified.isoformat()
        return last_modified
    except Exception as e:
        logger.error(f"Error getting last update date: {str(e)}")
        return None
|
| 95 |
|
| 96 |
def init_dataset_structure(self) -> Tuple[bool, str]:
|
| 97 |
"""
|