Spaces:
Sleeping
Sleeping
Commit
·
4a17e89
1
Parent(s):
d6920c6
max chunk size = 4000 + max rows per chunk = 5
Browse files - table_prep.py +8 -5
table_prep.py
CHANGED
|
@@ -4,6 +4,9 @@ from huggingface_hub import hf_hub_download, list_repo_files
|
|
| 4 |
from llama_index.core import Document
|
| 5 |
from my_logging import log_message
|
| 6 |
|
|
|
|
|
|
|
|
|
|
| 7 |
def create_table_content(table_data):
|
| 8 |
"""Create formatted content from table data"""
|
| 9 |
doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
|
|
@@ -32,7 +35,8 @@ def create_table_content(table_data):
|
|
| 32 |
from llama_index.core.text_splitter import SentenceSplitter
|
| 33 |
from config import CHUNK_SIZE, CHUNK_OVERLAP
|
| 34 |
|
| 35 |
-
|
|
|
|
| 36 |
lines = doc.text.strip().split('\n')
|
| 37 |
|
| 38 |
# Separate header and data rows
|
|
@@ -60,8 +64,8 @@ def chunk_table_document(doc, max_chunk_size=4000):
|
|
| 60 |
|
| 61 |
for row in data_rows:
|
| 62 |
row_size = len(row) + 1
|
| 63 |
-
#
|
| 64 |
-
if current_size + row_size > max_chunk_size and current_rows:
|
| 65 |
chunk_text = header + '\n'.join(current_rows)
|
| 66 |
chunks.append(chunk_text)
|
| 67 |
log_message(f"Создана часть таблицы размером {len(chunk_text)} символов с {len(current_rows)} строками")
|
|
@@ -71,6 +75,7 @@ def chunk_table_document(doc, max_chunk_size=4000):
|
|
| 71 |
current_rows.append(row)
|
| 72 |
current_size += row_size
|
| 73 |
log_message(f"Добавлена строка к текущему чанку, текущий размер {current_size} символов")
|
|
|
|
| 74 |
# Add final chunk
|
| 75 |
if current_rows:
|
| 76 |
chunk_text = header + '\n'.join(current_rows)
|
|
@@ -142,8 +147,6 @@ def table_to_document(table_data, document_id=None):
|
|
| 142 |
chunks = chunk_table_document(base_doc)
|
| 143 |
log_message(f"Таблица {table_num} разбита на {len(chunks)} частей")
|
| 144 |
return chunk_table_document(base_doc)
|
| 145 |
-
|
| 146 |
-
|
| 147 |
return [base_doc]
|
| 148 |
|
| 149 |
|
|
|
|
| 4 |
from llama_index.core import Document
|
| 5 |
from my_logging import log_message
|
| 6 |
|
| 7 |
+
MAX_ROWS_PER_CHUNK = 5
|
| 8 |
+
MAX_CHUNK_SIZE = 4000
|
| 9 |
+
|
| 10 |
def create_table_content(table_data):
|
| 11 |
"""Create formatted content from table data"""
|
| 12 |
doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
|
|
|
|
| 35 |
from llama_index.core.text_splitter import SentenceSplitter
|
| 36 |
from config import CHUNK_SIZE, CHUNK_OVERLAP
|
| 37 |
|
| 38 |
+
|
| 39 |
+
def chunk_table_document(doc, max_chunk_size=MAX_CHUNK_SIZE, max_rows_per_chunk=MAX_ROWS_PER_CHUNK):
|
| 40 |
lines = doc.text.strip().split('\n')
|
| 41 |
|
| 42 |
# Separate header and data rows
|
|
|
|
| 64 |
|
| 65 |
for row in data_rows:
|
| 66 |
row_size = len(row) + 1
|
| 67 |
+
# Check both limits: chunk size and row count
|
| 68 |
+
if ((current_size + row_size > max_chunk_size or len(current_rows) >= max_rows_per_chunk) and current_rows):
|
| 69 |
chunk_text = header + '\n'.join(current_rows)
|
| 70 |
chunks.append(chunk_text)
|
| 71 |
log_message(f"Создана часть таблицы размером {len(chunk_text)} символов с {len(current_rows)} строками")
|
|
|
|
| 75 |
current_rows.append(row)
|
| 76 |
current_size += row_size
|
| 77 |
log_message(f"Добавлена строка к текущему чанку, текущий размер {current_size} символов")
|
| 78 |
+
|
| 79 |
# Add final chunk
|
| 80 |
if current_rows:
|
| 81 |
chunk_text = header + '\n'.join(current_rows)
|
|
|
|
| 147 |
chunks = chunk_table_document(base_doc)
|
| 148 |
log_message(f"Таблица {table_num} разбита на {len(chunks)} частей")
|
| 149 |
return chunk_table_document(base_doc)
|
|
|
|
|
|
|
| 150 |
return [base_doc]
|
| 151 |
|
| 152 |
|