Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 6, 2025

Commit

4a17e89

1 Parent(s): d6920c6

Set max chunk size = 4000 and max rows per chunk = 5

Browse files

Files changed (1) hide show

table_prep.py +8 -5

table_prep.py CHANGED Viewed

@@ -4,6 +4,9 @@ from huggingface_hub import hf_hub_download, list_repo_files
 from llama_index.core import Document
 from my_logging import log_message
 def create_table_content(table_data):
     """Create formatted content from table data"""
     doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
@@ -32,7 +35,8 @@ def create_table_content(table_data):
 from llama_index.core.text_splitter import SentenceSplitter
 from config import CHUNK_SIZE, CHUNK_OVERLAP
-def chunk_table_document(doc, max_chunk_size=4000):
     lines = doc.text.strip().split('\n')
     # Separate header and data rows
@@ -60,8 +64,8 @@ def chunk_table_document(doc, max_chunk_size=4000):
     for row in data_rows:
         row_size = len(row) + 1
-        # If adding this row would exceed max_chunk_size, save current chunk
-        if current_size + row_size > max_chunk_size and current_rows:
             chunk_text = header + '\n'.join(current_rows)
             chunks.append(chunk_text)
             log_message(f"Создана часть таблицы размером {len(chunk_text)} символов с {len(current_rows)} строками")
@@ -71,6 +75,7 @@ def chunk_table_document(doc, max_chunk_size=4000):
         current_rows.append(row)
         current_size += row_size
         log_message(f"Добавлена строка к текущему чанку, текущий размер {current_size} символов")
     # Add final chunk
     if current_rows:
         chunk_text = header + '\n'.join(current_rows)
@@ -142,8 +147,6 @@ def table_to_document(table_data, document_id=None):
         chunks = chunk_table_document(base_doc)
         log_message(f"Таблица {table_num} разбита на {len(chunks)} частей")
         return chunk_table_document(base_doc)
     return [base_doc]

 from llama_index.core import Document
 from my_logging import log_message
+MAX_ROWS_PER_CHUNK = 5
+MAX_CHUNK_SIZE = 4000
 def create_table_content(table_data):
     """Create formatted content from table data"""
     doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
 from llama_index.core.text_splitter import SentenceSplitter
 from config import CHUNK_SIZE, CHUNK_OVERLAP
+def chunk_table_document(doc, max_chunk_size=MAX_CHUNK_SIZE, max_rows_per_chunk=MAX_ROWS_PER_CHUNK):
     lines = doc.text.strip().split('\n')
     # Separate header and data rows
     for row in data_rows:
         row_size = len(row) + 1
+        # Check both limits: chunk size and row count
+        if ((current_size + row_size > max_chunk_size or len(current_rows) >= max_rows_per_chunk) and current_rows):
             chunk_text = header + '\n'.join(current_rows)
             chunks.append(chunk_text)
             log_message(f"Создана часть таблицы размером {len(chunk_text)} символов с {len(current_rows)} строками")
         current_rows.append(row)
         current_size += row_size
         log_message(f"Добавлена строка к текущему чанку, текущий размер {current_size} символов")
     # Add final chunk
     if current_rows:
         chunk_text = header + '\n'.join(current_rows)
         chunks = chunk_table_document(base_doc)
         log_message(f"Таблица {table_num} разбита на {len(chunks)} частей")
         return chunk_table_document(base_doc)
     return [base_doc]