MrSimple07 committed on
Commit
afcac41
·
1 Parent(s): 451cdc6

added sheet_name

Browse files
Files changed (1) hide show
  1. documents_prep.py +26 -19
documents_prep.py CHANGED
@@ -407,7 +407,7 @@ def load_image_data(repo_id, hf_token, image_data_dir):
407
  return []
408
 
409
  def load_table_data(repo_id, hf_token, table_data_dir):
410
- """Load and process table data with sheet-level document_id extraction"""
411
  log_message("=" * 60)
412
  log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
413
  log_message("=" * 60)
@@ -426,7 +426,8 @@ def load_table_data(repo_id, hf_token, table_data_dir):
426
  stats = {
427
  'total_tables': 0,
428
  'total_size': 0,
429
- 'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
 
430
  }
431
 
432
  for file_path in table_files:
@@ -439,42 +440,43 @@ def load_table_data(repo_id, hf_token, table_data_dir):
439
  token=hf_token
440
  )
441
 
442
- log_message(f"\nОбработка файла: {file_path}")
443
 
444
  with open(local_path, 'r', encoding='utf-8') as f:
445
  table_data = json.load(f)
446
 
447
  if isinstance(table_data, dict):
448
- # Extract file-level document_id
449
  file_level_doc_id = (
450
  table_data.get('document_id') or
451
  table_data.get('document') or
452
- table_data.get('Обозначение документа') or
453
  'unknown'
454
  )
455
 
456
- # Handle multiple sheets
457
  if 'sheets' in table_data:
458
  sorted_sheets = sorted(
459
  table_data['sheets'],
460
  key=lambda sheet: sheet.get('table_number', '')
461
  )
462
 
 
 
463
  for sheet in sorted_sheets:
464
- # CRITICAL FIX: Use sheet-level document_id if available
465
- sheet_doc_id = (
466
- sheet.get('document_id') or
467
- sheet.get('document') or
468
- sheet.get('Обозначение документа') or
469
- file_level_doc_id
470
- )
471
 
472
- log_message(f" Sheet doc_id: {sheet_doc_id} (file: {file_level_doc_id})")
473
 
474
- # Pass sheet's own document_id
475
  docs_list = table_to_document(sheet, document_id=sheet_doc_id)
476
  table_documents.extend(docs_list)
477
 
 
 
478
  for doc in docs_list:
479
  stats['total_tables'] += 1
480
  size = doc.metadata.get('content_size', 0)
@@ -482,7 +484,7 @@ def load_table_data(repo_id, hf_token, table_data_dir):
482
  stats['by_document'][sheet_doc_id]['count'] += 1
483
  stats['by_document'][sheet_doc_id]['size'] += size
484
  else:
485
- # Single table
486
  docs_list = table_to_document(table_data, document_id=file_level_doc_id)
487
  table_documents.extend(docs_list)
488
 
@@ -499,18 +501,23 @@ def load_table_data(repo_id, hf_token, table_data_dir):
499
  log_message(f"Traceback: {traceback.format_exc()}")
500
  continue
501
 
502
- # Log summary
503
  log_message("\n" + "=" * 60)
504
  log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
505
  log_message("=" * 60)
506
- log_message(f"Всего таблиц: {stats['total_tables']}")
507
  log_message(f"Общий размер: {stats['total_size']:,} символов")
508
  if stats['total_tables'] > 0:
509
  log_message(f"Средний размер: {stats['total_size'] // stats['total_tables']:,} символов")
510
 
511
  log_message("\nПо документам:")
512
  for doc_id, doc_stats in sorted(stats['by_document'].items()):
513
- log_message(f" • {doc_id}: {doc_stats['count']} таблиц, {doc_stats['size']:,} символов")
 
 
 
 
 
514
 
515
  log_message("=" * 60)
516
 
 
407
  return []
408
 
409
  def load_table_data(repo_id, hf_token, table_data_dir):
410
+ """Load and process table data with complete metadata preservation"""
411
  log_message("=" * 60)
412
  log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
413
  log_message("=" * 60)
 
426
  stats = {
427
  'total_tables': 0,
428
  'total_size': 0,
429
+ 'by_document': defaultdict(lambda: {'count': 0, 'size': 0}),
430
+ 'by_sheet': defaultdict(int)
431
  }
432
 
433
  for file_path in table_files:
 
440
  token=hf_token
441
  )
442
 
443
+ log_message(f"\n📂 Обработка файла: {file_path}")
444
 
445
  with open(local_path, 'r', encoding='utf-8') as f:
446
  table_data = json.load(f)
447
 
448
  if isinstance(table_data, dict):
 
449
  file_level_doc_id = (
450
  table_data.get('document_id') or
451
  table_data.get('document') or
 
452
  'unknown'
453
  )
454
 
 
455
  if 'sheets' in table_data:
456
  sorted_sheets = sorted(
457
  table_data['sheets'],
458
  key=lambda sheet: sheet.get('table_number', '')
459
  )
460
 
461
+ log_message(f" Найдено листов: {len(sorted_sheets)}")
462
+
463
  for sheet in sorted_sheets:
464
+ # CRITICAL: sheet_name MUST be present
465
+ if 'sheet_name' not in sheet:
466
+ log_message(f" ⚠️ Пропущен лист без sheet_name")
467
+ continue
468
+
469
+ sheet_name = sheet['sheet_name']
470
+ sheet_doc_id = sheet.get('document_id', file_level_doc_id)
471
 
472
+ log_message(f" Лист: {sheet_name} | doc_id: {sheet_doc_id}")
473
 
474
+ # Pass complete sheet data to table_to_document
475
  docs_list = table_to_document(sheet, document_id=sheet_doc_id)
476
  table_documents.extend(docs_list)
477
 
478
+ stats['by_sheet'][sheet_name] += len(docs_list)
479
+
480
  for doc in docs_list:
481
  stats['total_tables'] += 1
482
  size = doc.metadata.get('content_size', 0)
 
484
  stats['by_document'][sheet_doc_id]['count'] += 1
485
  stats['by_document'][sheet_doc_id]['size'] += size
486
  else:
487
+ # Single table (no sheets structure)
488
  docs_list = table_to_document(table_data, document_id=file_level_doc_id)
489
  table_documents.extend(docs_list)
490
 
 
501
  log_message(f"Traceback: {traceback.format_exc()}")
502
  continue
503
 
504
+ # Enhanced logging with sheet breakdown
505
  log_message("\n" + "=" * 60)
506
  log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
507
  log_message("=" * 60)
508
+ log_message(f"Всего таблиц/чанков: {stats['total_tables']}")
509
  log_message(f"Общий размер: {stats['total_size']:,} символов")
510
  if stats['total_tables'] > 0:
511
  log_message(f"Средний размер: {stats['total_size'] // stats['total_tables']:,} символов")
512
 
513
  log_message("\nПо документам:")
514
  for doc_id, doc_stats in sorted(stats['by_document'].items()):
515
+ log_message(f" • {doc_id}: {doc_stats['count']} элементов, {doc_stats['size']:,} символов")
516
+
517
+ log_message("\nПо листам (топ-20):")
518
+ top_sheets = sorted(stats['by_sheet'].items(), key=lambda x: x[1], reverse=True)[:20]
519
+ for sheet_name, count in top_sheets:
520
+ log_message(f" • {sheet_name}: {count} чанков")
521
 
522
  log_message("=" * 60)
523