MrSimple07 commited on
Commit
ff92caa
·
1 Parent(s): abca2ac

added the load_table_data function

Browse files
Files changed (3) hide show
  1. app.py +2 -2
  2. documents_prep.py +96 -5
  3. table_prep.py +0 -5
app.py CHANGED
@@ -1,8 +1,8 @@
1
  import gradio as gr
2
  import os
3
  from llama_index.core import Settings
4
- from documents_prep import load_json_documents, load_table_data, load_image_data, load_csv_chunks
5
- from utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
6
  from my_logging import log_message
7
  from index_retriever import create_vector_index, create_query_engine
8
  import sys
 
1
  import gradio as gr
2
  import os
3
  from llama_index.core import Settings
4
+ from documents_prep import *
5
+ from utils import *
6
  from my_logging import log_message
7
  from index_retriever import create_vector_index, create_query_engine
8
  import sys
documents_prep.py CHANGED
@@ -48,11 +48,6 @@ def chunk_document(doc, chunk_size=None, chunk_overlap=None):
48
 
49
 
50
  def process_documents_with_chunking(documents):
51
- """
52
- Process all document types with appropriate chunking.
53
- Tables: row-block chunking (handled in table_prep.py)
54
- Text/Images: sentence-aware chunking
55
- """
56
  all_chunked_docs = []
57
  stats = {
58
  'table_whole': 0,
@@ -397,6 +392,102 @@ def load_image_data(repo_id, hf_token, image_data_dir):
397
  return []
398
 
399
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
400
  def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
401
  log_message("Загружаю данные чанков из CSV")
402
 
 
48
 
49
 
50
  def process_documents_with_chunking(documents):
 
 
 
 
 
51
  all_chunked_docs = []
52
  stats = {
53
  'table_whole': 0,
 
392
  return []
393
 
394
 
395
def load_table_data(repo_id, hf_token, table_data_dir):
    """Load and process table data from a HuggingFace dataset repo.

    Lists every ``*.json`` file whose path starts with ``table_data_dir``,
    downloads each one, and converts its content into documents via
    ``table_to_document``. A file may hold either a single table dict or a
    dict with a ``'sheets'`` list (multi-sheet workbook); sheets are
    processed in ``table_number`` order. Per-table size statistics are
    collected and logged.

    Parameters:
        repo_id: HuggingFace dataset repository id.
        hf_token: access token passed through to the hub API.
        table_data_dir: path prefix inside the repo containing table JSON files.

    Returns:
        list: all documents produced by ``table_to_document`` across files;
        an empty list if listing the repository fails entirely. Individual
        broken files are logged and skipped, not fatal.
    """
    log_message("=" * 60)
    log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
    log_message("=" * 60)

    try:
        from huggingface_hub import hf_hub_download, list_repo_files
        import json
        from collections import defaultdict

        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
        table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]

        log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")

        table_documents = []
        stats = {
            'total_tables': 0,
            'total_size': 0,
            'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
        }

        for file_path in table_files:
            try:
                # NOTE(review): local_dir='' resolves to the current working
                # directory — confirm this is intended rather than a temp dir.
                local_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=file_path,
                    local_dir='',
                    repo_type="dataset",
                    token=hf_token
                )

                log_message(f"\nОбработка файла: {file_path}")

                with open(local_path, 'r', encoding='utf-8') as f:
                    table_data = json.load(f)

                # Non-dict payloads (e.g. a bare list) are silently skipped.
                if isinstance(table_data, dict):
                    document_id = table_data.get('document', 'unknown')

                    if 'sheets' in table_data:
                        # Multi-sheet workbook: handle sheets in table_number order.
                        sorted_sheets = sorted(
                            table_data['sheets'],
                            key=lambda sheet: sheet.get('table_number', '')
                        )
                        for sheet in sorted_sheets:
                            # Propagate the parent document id onto each sheet
                            # so table_to_document can attach it as metadata.
                            sheet['document'] = document_id
                            docs_list = table_to_document(sheet, document_id)
                            table_documents.extend(docs_list)
                            _record_table_stats(stats, docs_list, document_id)
                    else:
                        # Single table in the file.
                        docs_list = table_to_document(table_data, document_id)
                        table_documents.extend(docs_list)
                        _record_table_stats(stats, docs_list, document_id)

            except Exception as e:
                # Best-effort load: one broken file must not abort the rest.
                log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
                continue

        _log_table_stats(stats)

        return table_documents

    except Exception as e:
        # Listing the repo itself failed — nothing to return.
        log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА: {str(e)}")
        return []


def _record_table_stats(stats, docs_list, document_id):
    """Fold the count and content_size of each produced document into *stats*.

    Mutates *stats* in place ('total_tables', 'total_size', 'by_document').
    """
    for doc in docs_list:
        stats['total_tables'] += 1
        size = doc.metadata.get('content_size', 0)
        stats['total_size'] += size
        stats['by_document'][document_id]['count'] += 1
        stats['by_document'][document_id]['size'] += size


def _log_table_stats(stats):
    """Log a human-readable summary of the collected table statistics."""
    log_message("\n" + "=" * 60)
    log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
    log_message("=" * 60)
    log_message(f"Всего таблиц: {stats['total_tables']}")
    log_message(f"Общий размер: {stats['total_size']:,} символов")
    if stats['total_tables'] > 0:
        log_message(f"Средний размер: {stats['total_size'] // stats['total_tables']:,} символов")

    log_message("\nПо документам:")
    for doc_id, doc_stats in sorted(stats['by_document'].items()):
        log_message(f" • {doc_id}: {doc_stats['count']} таблиц, {doc_stats['size']:,} символов")

    log_message("=" * 60)
+
491
  def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
492
  log_message("Загружаю данные чанков из CSV")
493
 
table_prep.py CHANGED
@@ -32,11 +32,6 @@ def create_table_content(table_data):
32
 
33
 
34
  def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
35
- """
36
- Smart table chunking:
37
- - Small tables: keep whole
38
- - Large tables: split by row-blocks, preserve headers in each chunk
39
- """
40
  if chunk_size is None:
41
  chunk_size = CHUNK_SIZE
42
  if chunk_overlap is None:
 
32
 
33
 
34
  def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
 
 
 
 
 
35
  if chunk_size is None:
36
  chunk_size = CHUNK_SIZE
37
  if chunk_overlap is None: