Spaces:
Running
Running
Commit ·
17d0013
1
Parent(s): 0067c9d
Sort tables correctly by
Browse files- __pycache__/config.cpython-311.pyc +0 -0
- __pycache__/documents_prep.cpython-311.pyc +0 -0
- __pycache__/index_retriever.cpython-311.pyc +0 -0
- __pycache__/table_prep.cpython-311.pyc +0 -0
- __pycache__/utils.cpython-311.pyc +0 -0
- index_retriever.py +21 -2
- table_prep.py +29 -3
- utils.py +41 -13
__pycache__/config.cpython-311.pyc
CHANGED
|
Binary files a/__pycache__/config.cpython-311.pyc and b/__pycache__/config.cpython-311.pyc differ
|
|
|
__pycache__/documents_prep.cpython-311.pyc
ADDED
|
Binary file (23.5 kB). View file
|
|
|
__pycache__/index_retriever.cpython-311.pyc
CHANGED
|
Binary files a/__pycache__/index_retriever.cpython-311.pyc and b/__pycache__/index_retriever.cpython-311.pyc differ
|
|
|
__pycache__/table_prep.cpython-311.pyc
ADDED
|
Binary file (9.79 kB). View file
|
|
|
__pycache__/utils.cpython-311.pyc
ADDED
|
Binary file (23.7 kB). View file
|
|
|
index_retriever.py
CHANGED
|
@@ -80,9 +80,28 @@ def rerank_nodes(query, nodes, reranker, top_k=10):
|
|
| 80 |
final_nodes = priority_nodes + reranked_text_nodes
|
| 81 |
result = final_nodes[:top_k]
|
| 82 |
|
| 83 |
-
log_message(
|
| 84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
except Exception as e:
|
| 87 |
log_message(f"Ошибка переранжировки: {str(e)}")
|
| 88 |
return nodes[:top_k]
|
|
|
|
| 80 |
final_nodes = priority_nodes + reranked_text_nodes
|
| 81 |
result = final_nodes[:top_k]
|
| 82 |
|
| 83 |
+
log_message("=" * 80)
|
| 84 |
+
log_message(f"РЕЗУЛЬТАТЫ ПЕРЕРАНЖИРОВКИ")
|
| 85 |
+
log_message("=" * 80)
|
| 86 |
+
log_message(f"Приоритетных узлов (таблицы/изображения): {len(priority_nodes)}")
|
| 87 |
+
log_message(f"Текстовых узлов: {len(text_nodes)}")
|
| 88 |
+
log_message(f"Всего возвращается узлов: {len(final_nodes)}")
|
| 89 |
|
| 90 |
+
for i, node in enumerate(final_nodes, 1):
|
| 91 |
+
node_type = node.metadata.get('type', 'unknown')
|
| 92 |
+
doc_id = node.metadata.get('document_id', 'unknown')
|
| 93 |
+
if node_type == 'table':
|
| 94 |
+
identifier = f"таблица {node.metadata.get('table_number', 'unknown')}"
|
| 95 |
+
elif node_type == 'image':
|
| 96 |
+
identifier = f"изображение {node.metadata.get('image_number', 'unknown')}"
|
| 97 |
+
else:
|
| 98 |
+
identifier = f"раздел {node.metadata.get('section_id', 'unknown')}"
|
| 99 |
+
log_message(f" {i}. [{node_type}] {doc_id} - {identifier}")
|
| 100 |
+
|
| 101 |
+
log_message("=" * 80)
|
| 102 |
+
|
| 103 |
+
return final_nodes
|
| 104 |
+
|
| 105 |
except Exception as e:
|
| 106 |
log_message(f"Ошибка переранжировки: {str(e)}")
|
| 107 |
return nodes[:top_k]
|
table_prep.py
CHANGED
|
@@ -98,7 +98,13 @@ def load_table_data(repo_id, hf_token, table_data_dir):
|
|
| 98 |
document_id = table_data.get('document', 'unknown')
|
| 99 |
|
| 100 |
if 'sheets' in table_data:
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
sheet['document'] = document_id
|
| 103 |
docs_list = table_to_document(sheet, document_id)
|
| 104 |
table_documents.extend(docs_list)
|
|
@@ -121,7 +127,13 @@ def load_table_data(repo_id, hf_token, table_data_dir):
|
|
| 121 |
stats['by_document'][document_id]['size'] += size
|
| 122 |
|
| 123 |
elif isinstance(table_data, list):
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
docs_list = table_to_document(table_json)
|
| 126 |
table_documents.extend(docs_list)
|
| 127 |
|
|
@@ -156,4 +168,18 @@ def load_table_data(repo_id, hf_token, table_data_dir):
|
|
| 156 |
|
| 157 |
except Exception as e:
|
| 158 |
log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
|
| 159 |
-
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
document_id = table_data.get('document', 'unknown')
|
| 99 |
|
| 100 |
if 'sheets' in table_data:
|
| 101 |
+
# Sort sheets by table_number to ensure correct order
|
| 102 |
+
sorted_sheets = sorted(
|
| 103 |
+
table_data['sheets'],
|
| 104 |
+
key=lambda x: extract_table_number(x.get('table_number', ''))
|
| 105 |
+
)
|
| 106 |
+
|
| 107 |
+
for sheet in sorted_sheets:
|
| 108 |
sheet['document'] = document_id
|
| 109 |
docs_list = table_to_document(sheet, document_id)
|
| 110 |
table_documents.extend(docs_list)
|
|
|
|
| 127 |
stats['by_document'][document_id]['size'] += size
|
| 128 |
|
| 129 |
elif isinstance(table_data, list):
|
| 130 |
+
# Sort list by table_number
|
| 131 |
+
sorted_tables = sorted(
|
| 132 |
+
table_data,
|
| 133 |
+
key=lambda x: extract_table_number(x.get('table_number', ''))
|
| 134 |
+
)
|
| 135 |
+
|
| 136 |
+
for table_json in sorted_tables:
|
| 137 |
docs_list = table_to_document(table_json)
|
| 138 |
table_documents.extend(docs_list)
|
| 139 |
|
|
|
|
| 168 |
|
| 169 |
except Exception as e:
|
| 170 |
log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
|
| 171 |
+
return []
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def extract_table_number(table_number_str):
    """Return a numeric sort key parsed from a table-number label.

    The first numeric token in the label (digits with an optional single
    decimal part, e.g. "9", "9.1", ".5") is converted to ``float``. Any
    surrounding text such as "№" or a word prefix is ignored.

    Args:
        table_number_str: The table number as found in the table metadata.
            Any value is accepted; it is converted with ``str()`` before
            parsing.

    Returns:
        The parsed number as a ``float``, or ``0`` when the value is falsy
        (``None``, ``""``, ``0``) or contains no digits.

    NOTE(review): a float key orders "9.10" before "9.9". If table numbers
    are hierarchical (section.index) rather than decimal, a tuple-of-ints
    key would be needed; that would change the return type, so it is not
    done here.
    """
    import re  # imported locally, as in the original implementation

    if not table_number_str:
        return 0

    # Take only the FIRST numeric token. Stripping every non-numeric
    # character instead would glue unrelated digits together
    # ("3 (продолжение 2)" -> "32") and would turn multi-dot labels such
    # as "9.1.2" into an unparsable string.
    match = re.search(r'\d+(?:\.\d+)?|\.\d+', str(table_number_str))
    return float(match.group()) if match else 0
|
utils.py
CHANGED
|
@@ -374,25 +374,53 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
|
|
| 374 |
return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
|
| 375 |
|
| 376 |
try:
|
|
|
|
| 377 |
start_time = time.time()
|
| 378 |
|
| 379 |
retrieved_nodes = query_engine.retriever.retrieve(question)
|
|
|
|
|
|
|
| 380 |
reranked_nodes = rerank_nodes(question, retrieved_nodes, reranker, top_k=10)
|
| 381 |
|
| 382 |
-
#
|
| 383 |
-
log_message(
|
| 384 |
-
log_message(f"Всего
|
|
|
|
|
|
|
| 385 |
for i, node in enumerate(reranked_nodes, 1):
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
log_message(f"
|
| 393 |
-
log_message(f"
|
| 394 |
-
log_message("
|
| 395 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 396 |
formatted_context = format_context_for_llm(reranked_nodes)
|
| 397 |
|
| 398 |
enhanced_question = f"""
|
|
|
|
| 374 |
return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
|
| 375 |
|
| 376 |
try:
|
| 377 |
+
log_message(f"Получен вопрос: {question}")
|
| 378 |
start_time = time.time()
|
| 379 |
|
| 380 |
retrieved_nodes = query_engine.retriever.retrieve(question)
|
| 381 |
+
log_message(f"Извлечено {len(retrieved_nodes)} узлов")
|
| 382 |
+
|
| 383 |
reranked_nodes = rerank_nodes(question, retrieved_nodes, reranker, top_k=10)
|
| 384 |
|
| 385 |
+
# ADD THIS DETAILED LOGGING SECTION
|
| 386 |
+
log_message("=" * 80)
|
| 387 |
+
log_message(f"ДЕТАЛЬНАЯ ИНФОРМАЦИЯ О ВОЗВРАЩАЕМЫХ УЗЛАХ (всего: {len(reranked_nodes)})")
|
| 388 |
+
log_message("=" * 80)
|
| 389 |
+
|
| 390 |
for i, node in enumerate(reranked_nodes, 1):
|
| 391 |
+
metadata = node.metadata if hasattr(node, 'metadata') else {}
|
| 392 |
+
doc_type = metadata.get('type', 'unknown')
|
| 393 |
+
doc_id = metadata.get('document_id', 'unknown')
|
| 394 |
+
|
| 395 |
+
log_message(f"\n{'='*60}")
|
| 396 |
+
log_message(f"УЗЕЛ #{i}")
|
| 397 |
+
log_message(f"{'='*60}")
|
| 398 |
+
log_message(f"Тип документа: {doc_type}")
|
| 399 |
+
log_message(f"ID документа: {doc_id}")
|
| 400 |
+
|
| 401 |
+
if doc_type == 'table':
|
| 402 |
+
log_message(f"Номер таблицы: {metadata.get('table_number', 'unknown')}")
|
| 403 |
+
log_message(f"Название таблицы: {metadata.get('table_title', 'unknown')}")
|
| 404 |
+
log_message(f"Раздел: {metadata.get('section', 'unknown')}")
|
| 405 |
+
log_message(f"Количество строк: {metadata.get('total_rows', 'unknown')}")
|
| 406 |
+
elif doc_type == 'image':
|
| 407 |
+
log_message(f"Номер изображения: {metadata.get('image_number', 'unknown')}")
|
| 408 |
+
log_message(f"Название: {metadata.get('image_title', 'unknown')}")
|
| 409 |
+
log_message(f"Раздел: {metadata.get('section', 'unknown')}")
|
| 410 |
+
else: # text
|
| 411 |
+
log_message(f"Раздел ID: {metadata.get('section_id', 'unknown')}")
|
| 412 |
+
log_message(f"Путь раздела: {metadata.get('section_path', 'unknown')}")
|
| 413 |
+
log_message(f"Текст раздела: {metadata.get('section_text', 'unknown')[:100]}...")
|
| 414 |
+
log_message(f"Уровень: {metadata.get('level', 'unknown')}")
|
| 415 |
+
|
| 416 |
+
log_message(f"Размер текста: {len(node.text)} символов")
|
| 417 |
+
log_message(f"ПРЕВЬЮ СОДЕРЖИМОГО (первые 300 символов):")
|
| 418 |
+
log_message(f"{node.text[:300]}...")
|
| 419 |
+
log_message(f"{'='*60}\n")
|
| 420 |
+
|
| 421 |
+
log_message("=" * 80)
|
| 422 |
+
|
| 423 |
+
# Continue with rest of the function...
|
| 424 |
formatted_context = format_context_for_llm(reranked_nodes)
|
| 425 |
|
| 426 |
enhanced_question = f"""
|