Spaces:
Sleeping
Sleeping
Commit
·
afcac41
1
Parent(s):
451cdc6
added sheet_name
Browse files
- documents_prep.py +26 -19
documents_prep.py
CHANGED
|
@@ -407,7 +407,7 @@ def load_image_data(repo_id, hf_token, image_data_dir):
|
|
| 407 |
return []
|
| 408 |
|
| 409 |
def load_table_data(repo_id, hf_token, table_data_dir):
|
| 410 |
-
"""Load and process table data with
|
| 411 |
log_message("=" * 60)
|
| 412 |
log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
|
| 413 |
log_message("=" * 60)
|
|
@@ -426,7 +426,8 @@ def load_table_data(repo_id, hf_token, table_data_dir):
|
|
| 426 |
stats = {
|
| 427 |
'total_tables': 0,
|
| 428 |
'total_size': 0,
|
| 429 |
-
'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
|
|
|
|
| 430 |
}
|
| 431 |
|
| 432 |
for file_path in table_files:
|
|
@@ -439,42 +440,43 @@ def load_table_data(repo_id, hf_token, table_data_dir):
|
|
| 439 |
token=hf_token
|
| 440 |
)
|
| 441 |
|
| 442 |
-
log_message(f"\nОбработка файла: {file_path}")
|
| 443 |
|
| 444 |
with open(local_path, 'r', encoding='utf-8') as f:
|
| 445 |
table_data = json.load(f)
|
| 446 |
|
| 447 |
if isinstance(table_data, dict):
|
| 448 |
-
# Extract file-level document_id
|
| 449 |
file_level_doc_id = (
|
| 450 |
table_data.get('document_id') or
|
| 451 |
table_data.get('document') or
|
| 452 |
-
table_data.get('Обозначение документа') or
|
| 453 |
'unknown'
|
| 454 |
)
|
| 455 |
|
| 456 |
-
# Handle multiple sheets
|
| 457 |
if 'sheets' in table_data:
|
| 458 |
sorted_sheets = sorted(
|
| 459 |
table_data['sheets'],
|
| 460 |
key=lambda sheet: sheet.get('table_number', '')
|
| 461 |
)
|
| 462 |
|
|
|
|
|
|
|
| 463 |
for sheet in sorted_sheets:
|
| 464 |
-
# CRITICAL
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
)
|
| 471 |
|
| 472 |
-
log_message(f"
|
| 473 |
|
| 474 |
-
# Pass sheet
|
| 475 |
docs_list = table_to_document(sheet, document_id=sheet_doc_id)
|
| 476 |
table_documents.extend(docs_list)
|
| 477 |
|
|
|
|
|
|
|
| 478 |
for doc in docs_list:
|
| 479 |
stats['total_tables'] += 1
|
| 480 |
size = doc.metadata.get('content_size', 0)
|
|
@@ -482,7 +484,7 @@ def load_table_data(repo_id, hf_token, table_data_dir):
|
|
| 482 |
stats['by_document'][sheet_doc_id]['count'] += 1
|
| 483 |
stats['by_document'][sheet_doc_id]['size'] += size
|
| 484 |
else:
|
| 485 |
-
# Single table
|
| 486 |
docs_list = table_to_document(table_data, document_id=file_level_doc_id)
|
| 487 |
table_documents.extend(docs_list)
|
| 488 |
|
|
@@ -499,18 +501,23 @@ def load_table_data(repo_id, hf_token, table_data_dir):
|
|
| 499 |
log_message(f"Traceback: {traceback.format_exc()}")
|
| 500 |
continue
|
| 501 |
|
| 502 |
-
#
|
| 503 |
log_message("\n" + "=" * 60)
|
| 504 |
log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
|
| 505 |
log_message("=" * 60)
|
| 506 |
-
log_message(f"Всего
|
| 507 |
log_message(f"Общий размер: {stats['total_size']:,} символов")
|
| 508 |
if stats['total_tables'] > 0:
|
| 509 |
log_message(f"Средний размер: {stats['total_size'] // stats['total_tables']:,} символов")
|
| 510 |
|
| 511 |
log_message("\nПо документам:")
|
| 512 |
for doc_id, doc_stats in sorted(stats['by_document'].items()):
|
| 513 |
-
log_message(f" • {doc_id}: {doc_stats['count']}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 514 |
|
| 515 |
log_message("=" * 60)
|
| 516 |
|
|
|
|
| 407 |
return []
|
| 408 |
|
| 409 |
def load_table_data(repo_id, hf_token, table_data_dir):
|
| 410 |
+
"""Load and process table data with complete metadata preservation"""
|
| 411 |
log_message("=" * 60)
|
| 412 |
log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
|
| 413 |
log_message("=" * 60)
|
|
|
|
| 426 |
stats = {
|
| 427 |
'total_tables': 0,
|
| 428 |
'total_size': 0,
|
| 429 |
+
'by_document': defaultdict(lambda: {'count': 0, 'size': 0}),
|
| 430 |
+
'by_sheet': defaultdict(int)
|
| 431 |
}
|
| 432 |
|
| 433 |
for file_path in table_files:
|
|
|
|
| 440 |
token=hf_token
|
| 441 |
)
|
| 442 |
|
| 443 |
+
log_message(f"\n📂 Обработка файла: {file_path}")
|
| 444 |
|
| 445 |
with open(local_path, 'r', encoding='utf-8') as f:
|
| 446 |
table_data = json.load(f)
|
| 447 |
|
| 448 |
if isinstance(table_data, dict):
|
|
|
|
| 449 |
file_level_doc_id = (
|
| 450 |
table_data.get('document_id') or
|
| 451 |
table_data.get('document') or
|
|
|
|
| 452 |
'unknown'
|
| 453 |
)
|
| 454 |
|
|
|
|
| 455 |
if 'sheets' in table_data:
|
| 456 |
sorted_sheets = sorted(
|
| 457 |
table_data['sheets'],
|
| 458 |
key=lambda sheet: sheet.get('table_number', '')
|
| 459 |
)
|
| 460 |
|
| 461 |
+
log_message(f" Найдено листов: {len(sorted_sheets)}")
|
| 462 |
+
|
| 463 |
for sheet in sorted_sheets:
|
| 464 |
+
# CRITICAL: sheet_name MUST be present
|
| 465 |
+
if 'sheet_name' not in sheet:
|
| 466 |
+
log_message(f" ⚠️ Пропущен лист без sheet_name")
|
| 467 |
+
continue
|
| 468 |
+
|
| 469 |
+
sheet_name = sheet['sheet_name']
|
| 470 |
+
sheet_doc_id = sheet.get('document_id', file_level_doc_id)
|
| 471 |
|
| 472 |
+
log_message(f" → Лист: {sheet_name} | doc_id: {sheet_doc_id}")
|
| 473 |
|
| 474 |
+
# Pass complete sheet data to table_to_document
|
| 475 |
docs_list = table_to_document(sheet, document_id=sheet_doc_id)
|
| 476 |
table_documents.extend(docs_list)
|
| 477 |
|
| 478 |
+
stats['by_sheet'][sheet_name] += len(docs_list)
|
| 479 |
+
|
| 480 |
for doc in docs_list:
|
| 481 |
stats['total_tables'] += 1
|
| 482 |
size = doc.metadata.get('content_size', 0)
|
|
|
|
| 484 |
stats['by_document'][sheet_doc_id]['count'] += 1
|
| 485 |
stats['by_document'][sheet_doc_id]['size'] += size
|
| 486 |
else:
|
| 487 |
+
# Single table (no sheets structure)
|
| 488 |
docs_list = table_to_document(table_data, document_id=file_level_doc_id)
|
| 489 |
table_documents.extend(docs_list)
|
| 490 |
|
|
|
|
| 501 |
log_message(f"Traceback: {traceback.format_exc()}")
|
| 502 |
continue
|
| 503 |
|
| 504 |
+
# Enhanced logging with sheet breakdown
|
| 505 |
log_message("\n" + "=" * 60)
|
| 506 |
log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
|
| 507 |
log_message("=" * 60)
|
| 508 |
+
log_message(f"Всего таблиц/чанков: {stats['total_tables']}")
|
| 509 |
log_message(f"Общий размер: {stats['total_size']:,} символов")
|
| 510 |
if stats['total_tables'] > 0:
|
| 511 |
log_message(f"Средний размер: {stats['total_size'] // stats['total_tables']:,} символов")
|
| 512 |
|
| 513 |
log_message("\nПо документам:")
|
| 514 |
for doc_id, doc_stats in sorted(stats['by_document'].items()):
|
| 515 |
+
log_message(f" • {doc_id}: {doc_stats['count']} элементов, {doc_stats['size']:,} символов")
|
| 516 |
+
|
| 517 |
+
log_message("\nПо листам (топ-20):")
|
| 518 |
+
top_sheets = sorted(stats['by_sheet'].items(), key=lambda x: x[1], reverse=True)[:20]
|
| 519 |
+
for sheet_name, count in top_sheets:
|
| 520 |
+
log_message(f" • {sheet_name}: {count} чанков")
|
| 521 |
|
| 522 |
log_message("=" * 60)
|
| 523 |
|