MrSimple07 committed on
Commit
4a17e89
·
1 Parent(s): d6920c6

Max chunk size = 4000; max rows per chunk = 5

Browse files
Files changed (1) hide show
  1. table_prep.py +8 -5
table_prep.py CHANGED
@@ -4,6 +4,9 @@ from huggingface_hub import hf_hub_download, list_repo_files
4
  from llama_index.core import Document
5
  from my_logging import log_message
6
 
 
 
 
7
  def create_table_content(table_data):
8
  """Create formatted content from table data"""
9
  doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
@@ -32,7 +35,8 @@ def create_table_content(table_data):
32
  from llama_index.core.text_splitter import SentenceSplitter
33
  from config import CHUNK_SIZE, CHUNK_OVERLAP
34
 
35
- def chunk_table_document(doc, max_chunk_size=4000):
 
36
  lines = doc.text.strip().split('\n')
37
 
38
  # Separate header and data rows
@@ -60,8 +64,8 @@ def chunk_table_document(doc, max_chunk_size=4000):
60
 
61
  for row in data_rows:
62
  row_size = len(row) + 1
63
- # If adding this row would exceed max_chunk_size, save current chunk
64
- if current_size + row_size > max_chunk_size and current_rows:
65
  chunk_text = header + '\n'.join(current_rows)
66
  chunks.append(chunk_text)
67
  log_message(f"Создана часть таблицы размером {len(chunk_text)} символов с {len(current_rows)} строками")
@@ -71,6 +75,7 @@ def chunk_table_document(doc, max_chunk_size=4000):
71
  current_rows.append(row)
72
  current_size += row_size
73
  log_message(f"Добавлена строка к текущему чанку, текущий размер {current_size} символов")
 
74
  # Add final chunk
75
  if current_rows:
76
  chunk_text = header + '\n'.join(current_rows)
@@ -142,8 +147,6 @@ def table_to_document(table_data, document_id=None):
142
  chunks = chunk_table_document(base_doc)
143
  log_message(f"Таблица {table_num} разбита на {len(chunks)} частей")
144
  return chunk_table_document(base_doc)
145
-
146
-
147
  return [base_doc]
148
 
149
 
 
4
  from llama_index.core import Document
5
  from my_logging import log_message
6
 
7
+ MAX_ROWS_PER_CHUNK = 5
8
+ MAX_CHUNK_SIZE = 4000
9
+
10
  def create_table_content(table_data):
11
  """Create formatted content from table data"""
12
  doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
 
35
  from llama_index.core.text_splitter import SentenceSplitter
36
  from config import CHUNK_SIZE, CHUNK_OVERLAP
37
 
38
+
39
+ def chunk_table_document(doc, max_chunk_size=MAX_CHUNK_SIZE, max_rows_per_chunk=MAX_ROWS_PER_CHUNK):
40
  lines = doc.text.strip().split('\n')
41
 
42
  # Separate header and data rows
 
64
 
65
  for row in data_rows:
66
  row_size = len(row) + 1
67
+ # Check both limits: chunk size and row count
68
+ if ((current_size + row_size > max_chunk_size or len(current_rows) >= max_rows_per_chunk) and current_rows):
69
  chunk_text = header + '\n'.join(current_rows)
70
  chunks.append(chunk_text)
71
  log_message(f"Создана часть таблицы размером {len(chunk_text)} символов с {len(current_rows)} строками")
 
75
  current_rows.append(row)
76
  current_size += row_size
77
  log_message(f"Добавлена строка к текущему чанку, текущий размер {current_size} символов")
78
+
79
  # Add final chunk
80
  if current_rows:
81
  chunk_text = header + '\n'.join(current_rows)
 
147
  chunks = chunk_table_document(base_doc)
148
  log_message(f"Таблица {table_num} разбита на {len(chunks)} частей")
149
  return chunk_table_document(base_doc)
 
 
150
  return [base_doc]
151
 
152