Spaces:
Sleeping
Sleeping
Commit
·
e38f51d
1
Parent(s):
1c69559
added the file stats info to the UI
Browse files- app.py +30 -14
- main_utils.py +41 -0
app.py
CHANGED
|
@@ -20,7 +20,6 @@ def restart_system():
|
|
| 20 |
log_message("Начало перезапуска системы...")
|
| 21 |
log_message("Очистка кэша HuggingFace...")
|
| 22 |
|
| 23 |
-
# Clear HuggingFace cache to force fresh download
|
| 24 |
import shutil
|
| 25 |
cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
|
| 26 |
if os.path.exists(cache_dir):
|
|
@@ -41,15 +40,20 @@ def restart_system():
|
|
| 41 |
)
|
| 42 |
|
| 43 |
if query_engine:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
log_message("Система успешно перезапущена")
|
| 45 |
-
return "✅ Система успешно перезапущена! Новые документы загружены."
|
| 46 |
else:
|
| 47 |
-
return "❌ Ошибка при перезапуске системы"
|
| 48 |
|
| 49 |
except Exception as e:
|
| 50 |
error_msg = f"Ошибка перезапуска: {str(e)}"
|
| 51 |
log_message(error_msg)
|
| 52 |
-
return f"❌ {error_msg}"
|
| 53 |
|
| 54 |
def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
|
| 55 |
json_files_dir=None, table_data_dir=None, image_data_dir=None,
|
|
@@ -458,6 +462,17 @@ Rerank Top K: {retrieval_params['rerank_top_k']}"""
|
|
| 458 |
Выберите тип документа и загрузите файл. Система автоматически обработает и добавит его в базу знаний.
|
| 459 |
""")
|
| 460 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 461 |
with gr.Row():
|
| 462 |
with gr.Column(scale=2):
|
| 463 |
file_type_radio = gr.Radio(
|
|
@@ -518,17 +533,18 @@ Rerank Top K: {retrieval_params['rerank_top_k']}"""
|
|
| 518 |
4. Нажмите "Перезапустить систему"
|
| 519 |
""")
|
| 520 |
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 526 |
|
| 527 |
-
restart_btn.click(
|
| 528 |
-
fn=restart_system,
|
| 529 |
-
inputs=[],
|
| 530 |
-
outputs=[restart_status]
|
| 531 |
-
)
|
| 532 |
switch_btn.click(
|
| 533 |
fn=switch_model_func,
|
| 534 |
inputs=[model_dropdown],
|
|
|
|
| 20 |
log_message("Начало перезапуска системы...")
|
| 21 |
log_message("Очистка кэша HuggingFace...")
|
| 22 |
|
|
|
|
| 23 |
import shutil
|
| 24 |
cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
|
| 25 |
if os.path.exists(cache_dir):
|
|
|
|
| 40 |
)
|
| 41 |
|
| 42 |
if query_engine:
|
| 43 |
+
# Get updated stats
|
| 44 |
+
stats = get_repository_stats(HF_REPO_ID, HF_TOKEN, JSON_FILES_DIR,
|
| 45 |
+
TABLE_DATA_DIR, IMAGE_DATA_DIR)
|
| 46 |
+
stats_display = format_stats_display(stats)
|
| 47 |
+
|
| 48 |
log_message("Система успешно перезапущена")
|
| 49 |
+
return "✅ Система успешно перезапущена! Новые документы загружены.", stats_display
|
| 50 |
else:
|
| 51 |
+
return "❌ Ошибка при перезапуске системы", "Статистика недоступна"
|
| 52 |
|
| 53 |
except Exception as e:
|
| 54 |
error_msg = f"Ошибка перезапуска: {str(e)}"
|
| 55 |
log_message(error_msg)
|
| 56 |
+
return f"❌ {error_msg}", "Статистика недоступна"
|
| 57 |
|
| 58 |
def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
|
| 59 |
json_files_dir=None, table_data_dir=None, image_data_dir=None,
|
|
|
|
| 462 |
Выберите тип документа и загрузите файл. Система автоматически обработает и добавит его в базу знаний.
|
| 463 |
""")
|
| 464 |
|
| 465 |
+
# Add stats display at the top
|
| 466 |
+
stats_display = gr.Markdown(
|
| 467 |
+
value=format_stats_display(
|
| 468 |
+
get_repository_stats(HF_REPO_ID, HF_TOKEN, JSON_FILES_DIR,
|
| 469 |
+
TABLE_DATA_DIR, IMAGE_DATA_DIR)
|
| 470 |
+
),
|
| 471 |
+
label=""
|
| 472 |
+
)
|
| 473 |
+
|
| 474 |
+
gr.Markdown("---") # Separator
|
| 475 |
+
|
| 476 |
with gr.Row():
|
| 477 |
with gr.Column(scale=2):
|
| 478 |
file_type_radio = gr.Radio(
|
|
|
|
| 533 |
4. Нажмите "Перезапустить систему"
|
| 534 |
""")
|
| 535 |
|
| 536 |
+
upload_btn.click(
|
| 537 |
+
fn=process_uploaded_file,
|
| 538 |
+
inputs=[file_upload, file_type_radio],
|
| 539 |
+
outputs=[upload_status]
|
| 540 |
+
)
|
| 541 |
+
|
| 542 |
+
restart_btn.click(
|
| 543 |
+
fn=restart_system,
|
| 544 |
+
inputs=[],
|
| 545 |
+
outputs=[restart_status, stats_display]
|
| 546 |
+
)
|
| 547 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 548 |
switch_btn.click(
|
| 549 |
fn=switch_model_func,
|
| 550 |
inputs=[model_dropdown],
|
main_utils.py
CHANGED
|
@@ -209,7 +209,48 @@ def enhance_query_with_keywords(query):
|
|
| 209 |
return enhanced
|
| 210 |
return f"{query}"
|
| 211 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
|
| 214 |
def merge_table_chunks(chunk_info):
|
| 215 |
merged = {}
|
|
|
|
| 209 |
return enhanced
|
| 210 |
return f"{query}"
|
| 211 |
|
| 212 |
+
def _count_repo_files(files, directory, extensions):
    """Count the paths in *files* under *directory* ending with one of *extensions*.

    Args:
        files: Iterable of repository-relative file paths.
        directory: Directory prefix to look under; ``None`` counts nothing,
            an empty string matches every path.
        extensions: Tuple of accepted file suffixes (case-sensitive).

    Returns:
        The number of matching paths.
    """
    # An unset directory contributes zero files instead of raising inside
    # str.startswith and discarding the counts of every other category.
    if directory is None:
        return 0
    # Anchor the prefix on a path separator so a sibling directory that
    # merely shares the leading characters (e.g. "tables" vs "tables_old")
    # is not counted. An empty directory still matches the whole repository.
    prefix = directory.rstrip("/")
    if prefix:
        prefix += "/"
    return sum(
        1 for path in files
        if path.startswith(prefix) and path.endswith(extensions)
    )


def get_repository_stats(repo_id, hf_token, json_dir, table_dir, image_dir):
    """Get statistics about documents in the repository.

    Lists the files of the dataset repository ``repo_id`` and counts the
    ones found under each data directory, filtered by file extension.

    Args:
        repo_id: Identifier of the dataset repository to inspect.
        hf_token: Access token passed to ``list_repo_files``.
        json_dir: Directory holding text documents (``.json`` / ``.zip``).
        table_dir: Directory holding table data
            (``.json`` / ``.xlsx`` / ``.xls``).
        image_dir: Directory holding image data
            (``.csv`` / ``.xlsx`` / ``.xls``).

    Returns:
        A dict with the keys ``text_files``, ``table_files``,
        ``image_files`` and ``total_files``. If anything fails (including
        the import of ``huggingface_hub``), the error is logged and every
        count is 0.
    """
    try:
        # Imported inside the try block so that an import failure is
        # handled by the same zero-stats fallback as a listing failure.
        from huggingface_hub import list_repo_files

        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)

        text_count = _count_repo_files(files, json_dir, ('.json', '.zip'))
        table_count = _count_repo_files(files, table_dir, ('.json', '.xlsx', '.xls'))
        image_count = _count_repo_files(files, image_dir, ('.csv', '.xlsx', '.xls'))

        stats = {
            'text_files': text_count,
            'table_files': table_count,
            'image_files': image_count,
            'total_files': text_count + table_count + image_count
        }

        log_message(f"Repository stats: {stats}")
        return stats
    except Exception as e:
        # Best-effort: the stats are informational, so a failure degrades
        # to all-zero counts rather than propagating to the caller.
        log_message(f"Error getting repository stats: {e}")
        return {'text_files': 0, 'table_files': 0, 'image_files': 0, 'total_files': 0}
|
| 243 |
+
|
| 244 |
+
def format_stats_display(stats):
    """Render the repository file counts as a Markdown summary.

    Args:
        stats: Mapping providing the values under the keys ``text_files``,
            ``table_files``, ``image_files`` and ``total_files``.

    Returns:
        A multi-line Markdown string: a bold heading, a blank line, one
        line per category, a separator rule and the overall total,
        terminated by a newline.

    Raises:
        KeyError: If one of the four keys is missing from ``stats``.
    """
    text_count = stats['text_files']
    table_count = stats['table_files']
    image_count = stats['image_files']
    total_count = stats['total_files']

    rows = [
        "📊 **Статистика базы данных:**",
        "",
        f"📝 Текстовые документы (JSON): **{text_count}**",
        f"📊 Табличные данные: **{table_count}**",
        f"🖼️ Изображения: **{image_count}**",
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
        f"📦 Всего файлов: **{total_count}**",
    ]
    # The trailing newline mirrors the closing line of the original
    # triple-quoted template.
    return "\n".join(rows) + "\n"
|
| 254 |
|
| 255 |
def merge_table_chunks(chunk_info):
|
| 256 |
merged = {}
|