MrSimple07 committed on
Commit
f85ad1c
·
1 Parent(s): 822ef8c

new way of chunking

Browse files
Files changed (3) hide show
  1. config.py +1 -1
  2. documents_prep.py +127 -68
  3. table_prep.py +106 -60
config.py CHANGED
@@ -50,7 +50,7 @@ AVAILABLE_MODELS = {
50
 
51
  DEFAULT_MODEL = "Gemini 2.5 Flash"
52
 
53
- CHUNK_SIZE = 3000
54
  CHUNK_OVERLAP = 256
55
 
56
  CUSTOM_PROMPT = """
 
50
 
51
  DEFAULT_MODEL = "Gemini 2.5 Flash"
52
 
53
+ CHUNK_SIZE = 2000
54
  CHUNK_OVERLAP = 256
55
 
56
  CUSTOM_PROMPT = """
documents_prep.py CHANGED
@@ -14,147 +14,206 @@ def chunk_document(doc, chunk_size=None, chunk_overlap=None):
14
  chunk_size = CHUNK_SIZE
15
  if chunk_overlap is None:
16
  chunk_overlap = CHUNK_OVERLAP
17
- text_splitter = SentenceSplitter(
18
- chunk_size=chunk_size,
19
- chunk_overlap=chunk_overlap,
20
- separator=" "
21
- )
22
 
23
- text_chunks = text_splitter.split_text(doc.text)
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  chunked_docs = []
26
- for i, chunk_text in enumerate(text_chunks):
27
  chunk_metadata = doc.metadata.copy()
28
  chunk_metadata.update({
29
  "chunk_id": i,
30
- "total_chunks": len(text_chunks),
31
  "chunk_size": len(chunk_text),
32
- "original_doc_id": doc.id_ if hasattr(doc, 'id_') else None
33
  })
34
-
35
- chunked_doc = Document(
36
- text=chunk_text,
37
- metadata=chunk_metadata
38
- )
39
- chunked_docs.append(chunked_doc)
40
 
41
  return chunked_docs
42
 
43
  def process_documents_with_chunking(documents):
 
 
 
 
44
  all_chunked_docs = []
45
  chunk_info = []
46
- table_count = 0
47
- table_chunks_count = 0
48
- image_count = 0
49
- image_chunks_count = 0
50
- text_chunks_count = 0
51
 
52
- for doc in documents:
 
 
 
 
 
 
 
 
53
  doc_type = doc.metadata.get('type', 'text')
54
  is_already_chunked = doc.metadata.get('is_chunked', False)
 
 
 
 
 
 
55
 
56
  if doc_type == 'table':
57
  if is_already_chunked:
58
- table_chunks_count += 1
 
59
  all_chunked_docs.append(doc)
60
- chunk_info.append({
61
- 'document_id': doc.metadata.get('document_id', 'unknown'),
62
- 'section_id': doc.metadata.get('section_id', 'unknown'),
63
- 'chunk_id': doc.metadata.get('chunk_id', 0),
64
- 'total_chunks': doc.metadata.get('total_chunks', 1),
65
- 'chunk_size': len(doc.text),
66
- 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
67
- 'type': 'table',
68
- 'table_number': doc.metadata.get('table_number', 'unknown')
69
- })
70
  else:
71
- table_count += 1
 
72
  all_chunked_docs.append(doc)
73
- chunk_info.append({
74
- 'document_id': doc.metadata.get('document_id', 'unknown'),
75
- 'section_id': doc.metadata.get('section_id', 'unknown'),
76
- 'chunk_id': 0,
77
- 'chunk_size': len(doc.text),
78
- 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
79
- 'type': 'table',
80
- 'table_number': doc.metadata.get('table_number', 'unknown')
81
- })
 
 
 
 
 
82
 
83
  elif doc_type == 'image':
84
- image_count += 1
85
- doc_size = len(doc.text)
86
  if doc_size > CHUNK_SIZE:
87
- log_message(f"📷 CHUNKING: Изображение {doc.metadata.get('image_number', 'unknown')} | "
88
- f"Размер: {doc_size} > {CHUNK_SIZE}")
89
  chunked_docs = chunk_document(doc)
90
- image_chunks_count += len(chunked_docs)
91
  all_chunked_docs.extend(chunked_docs)
92
- log_message(f" ✂️ Разделено на {len(chunked_docs)} чанков")
93
 
94
- for i, chunk_doc in enumerate(chunked_docs):
95
  chunk_info.append({
96
  'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
97
  'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
98
- 'chunk_id': i,
 
99
  'chunk_size': len(chunk_doc.text),
100
  'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
101
  'type': 'image',
102
- 'image_number': chunk_doc.metadata.get('image_number', 'unknown')
 
103
  })
104
  else:
 
105
  all_chunked_docs.append(doc)
 
 
106
  chunk_info.append({
107
  'document_id': doc.metadata.get('document_id', 'unknown'),
108
  'section_id': doc.metadata.get('section_id', 'unknown'),
109
  'chunk_id': 0,
 
110
  'chunk_size': doc_size,
111
  'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
112
  'type': 'image',
113
- 'image_number': doc.metadata.get('image_number', 'unknown')
 
114
  })
115
 
116
- else:
117
- doc_size = len(doc.text)
118
  if doc_size > CHUNK_SIZE:
119
- log_message(f"📝 CHUNKING: Текст из '{doc.metadata.get('document_id', 'unknown')}' | "
 
 
120
  f"Размер: {doc_size} > {CHUNK_SIZE}")
 
121
  chunked_docs = chunk_document(doc)
122
- text_chunks_count += len(chunked_docs)
123
  all_chunked_docs.extend(chunked_docs)
124
- log_message(f" ✂️ Разделен на {len(chunked_docs)} чанков")
125
 
126
- for i, chunk_doc in enumerate(chunked_docs):
127
  chunk_info.append({
128
  'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
129
  'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
130
- 'chunk_id': i,
 
131
  'chunk_size': len(chunk_doc.text),
132
  'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
133
- 'type': 'text'
 
134
  })
135
  else:
 
136
  all_chunked_docs.append(doc)
 
 
137
  chunk_info.append({
138
  'document_id': doc.metadata.get('document_id', 'unknown'),
139
  'section_id': doc.metadata.get('section_id', 'unknown'),
140
  'chunk_id': 0,
 
141
  'chunk_size': doc_size,
142
  'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
143
- 'type': 'text'
 
144
  })
145
 
146
  log_message(f"\n{'='*60}")
147
- log_message(f"ИТОГО ОБРАБОТАНО ДОКУМЕНТОВ:")
148
- log_message(f" • Таблицы (целые): {table_count}")
149
- log_message(f" • Таблицы (чанки): {table_chunks_count}")
150
- log_message(f" Изображения (целые): {image_count - (image_chunks_count > 0)}")
151
- log_message(f" Изображения (чанки): {image_chunks_count}")
152
- log_message(f" • Текстовые чанки: {text_chunks_count}")
153
- log_message(f" Всего документов: {len(all_chunked_docs)}")
 
 
 
 
 
 
154
  log_message(f"{'='*60}\n")
155
 
156
  return all_chunked_docs, chunk_info
157
 
 
158
  def extract_text_from_json(data, document_id, document_name):
159
  documents = []
160
 
 
14
  chunk_size = CHUNK_SIZE
15
  if chunk_overlap is None:
16
  chunk_overlap = CHUNK_OVERLAP
 
 
 
 
 
17
 
18
+ text = doc.text
19
 
20
+ # Try to split by double newlines (paragraphs) first
21
+ paragraphs = text.split('\n\n')
22
+
23
+ chunks = []
24
+ current_chunk = ""
25
+
26
+ for para in paragraphs:
27
+ para = para.strip()
28
+ if not para:
29
+ continue
30
+
31
+ # If adding this paragraph exceeds limit, save current chunk
32
+ if len(current_chunk) + len(para) + 2 > chunk_size and current_chunk:
33
+ chunks.append(current_chunk.strip())
34
+ # Add overlap from end of previous chunk
35
+ overlap_text = current_chunk[-chunk_overlap:] if len(current_chunk) > chunk_overlap else current_chunk
36
+ current_chunk = overlap_text + "\n\n" + para
37
+ else:
38
+ if current_chunk:
39
+ current_chunk += "\n\n" + para
40
+ else:
41
+ current_chunk = para
42
+
43
+ # Add last chunk
44
+ if current_chunk:
45
+ chunks.append(current_chunk.strip())
46
+
47
+ # If single paragraph is too large, fall back to sentence splitting
48
+ final_chunks = []
49
+ for chunk_text in chunks:
50
+ if len(chunk_text) > chunk_size:
51
+ splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
52
+ final_chunks.extend(splitter.split_text(chunk_text))
53
+ else:
54
+ final_chunks.append(chunk_text)
55
+
56
+ log_message(f" ✂️ Текст разбит на {len(final_chunks)} семантических чанков")
57
+
58
+ # Create documents
59
  chunked_docs = []
60
+ for i, chunk_text in enumerate(final_chunks):
61
  chunk_metadata = doc.metadata.copy()
62
  chunk_metadata.update({
63
  "chunk_id": i,
64
+ "total_chunks": len(final_chunks),
65
  "chunk_size": len(chunk_text),
66
+ "is_chunked": True
67
  })
68
+ chunked_docs.append(Document(text=chunk_text, metadata=chunk_metadata))
 
 
 
 
 
69
 
70
  return chunked_docs
71
 
72
def process_documents_with_chunking(documents):
    """Route documents through chunking by type and collect chunk statistics.

    Tables are passed through as-is (they are chunked upstream in
    table_prep.py); images and text documents larger than ``CHUNK_SIZE``
    characters are split via ``chunk_document``. Progress and a final
    summary are emitted through ``log_message``.

    Args:
        documents: iterable of Documents; each must expose ``.text`` and a
            ``.metadata`` dict (keys 'type', 'is_chunked', ids, etc.).

    Returns:
        tuple[list, list]: (all processed Documents in input order,
        per-chunk info dicts for reporting/inspection).
    """
    log_message("\n" + "="*60)
    log_message("🔄 НАЧАЛО ПРОЦЕССА ЧАНКИНГА")
    log_message("="*60)

    all_chunked_docs = []
    chunk_info = []

    # Counters
    table_whole_count = 0    # whole tables (no chunking needed)
    table_chunked_count = 0  # table chunks already split upstream
    image_whole_count = 0    # whole images
    image_chunked_count = 0  # image chunks produced here
    text_whole_count = 0     # whole text documents
    text_chunked_count = 0   # text chunks produced here

    for idx, doc in enumerate(documents):
        doc_type = doc.metadata.get('type', 'text')
        is_already_chunked = doc.metadata.get('is_chunked', False)
        doc_size = len(doc.text)

        log_message(f"\n📄 Документ {idx+1}/{len(documents)} | "
                    f"Тип: {doc_type} | "
                    f"Размер: {doc_size} | "
                    f"Уже разбит: {is_already_chunked}")

        if doc_type == 'table':
            if is_already_chunked:
                # Table was already chunked in table_prep.py — keep as-is.
                table_chunked_count += 1
                all_chunked_docs.append(doc)
                log_message(f" ✓ Таблица (чанк {doc.metadata.get('chunk_id', 0) + 1}/"
                            f"{doc.metadata.get('total_chunks', 1)}) добавлена без изменений")
            else:
                # Whole (unchunked) table.
                table_whole_count += 1
                all_chunked_docs.append(doc)
                log_message(f" ✓ Целая таблица добавлена | "
                            f"Номер: {doc.metadata.get('table_number', 'unknown')}")

            # Recorded for both whole and pre-chunked tables.
            chunk_info.append({
                'document_id': doc.metadata.get('document_id', 'unknown'),
                'section_id': doc.metadata.get('section_id', 'unknown'),
                'chunk_id': doc.metadata.get('chunk_id', 0),
                'total_chunks': doc.metadata.get('total_chunks', 1),
                'chunk_size': doc_size,
                'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
                'type': 'table',
                'table_number': doc.metadata.get('table_number', 'unknown'),
                'is_chunked': is_already_chunked
            })

        elif doc_type == 'image':
            if doc_size > CHUNK_SIZE:
                log_message(f" 📷 Изображение требует чанкинга | Размер: {doc_size} > {CHUNK_SIZE}")
                chunked_docs = chunk_document(doc)
                image_chunked_count += len(chunked_docs)
                all_chunked_docs.extend(chunked_docs)

                for chunk_doc in chunked_docs:
                    chunk_info.append({
                        'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
                        'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
                        'chunk_id': chunk_doc.metadata.get('chunk_id', 0),
                        'total_chunks': chunk_doc.metadata.get('total_chunks', 1),
                        'chunk_size': len(chunk_doc.text),
                        'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
                        'type': 'image',
                        'image_number': chunk_doc.metadata.get('image_number', 'unknown'),
                        'is_chunked': True
                    })
            else:
                image_whole_count += 1
                all_chunked_docs.append(doc)
                log_message(f" ✓ Целое изображение добавлено | Размер: {doc_size}")

                chunk_info.append({
                    'document_id': doc.metadata.get('document_id', 'unknown'),
                    'section_id': doc.metadata.get('section_id', 'unknown'),
                    'chunk_id': 0,
                    'total_chunks': 1,
                    'chunk_size': doc_size,
                    'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
                    'type': 'image',
                    'image_number': doc.metadata.get('image_number', 'unknown'),
                    'is_chunked': False
                })

        else:  # text
            if doc_size > CHUNK_SIZE:
                log_message(f" 📝 Текст требует чанкинга | "
                            f"Документ: {doc.metadata.get('document_id', 'unknown')} | "
                            f"Раздел: {doc.metadata.get('section_id', 'unknown')} | "
                            f"Размер: {doc_size} > {CHUNK_SIZE}")

                chunked_docs = chunk_document(doc)
                text_chunked_count += len(chunked_docs)
                all_chunked_docs.extend(chunked_docs)

                for chunk_doc in chunked_docs:
                    chunk_info.append({
                        'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
                        'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
                        'chunk_id': chunk_doc.metadata.get('chunk_id', 0),
                        'total_chunks': chunk_doc.metadata.get('total_chunks', 1),
                        'chunk_size': len(chunk_doc.text),
                        'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
                        'type': 'text',
                        'is_chunked': True
                    })
            else:
                text_whole_count += 1
                all_chunked_docs.append(doc)
                log_message(f" ✓ Целый текстовый документ добавлен | Размер: {doc_size}")

                chunk_info.append({
                    'document_id': doc.metadata.get('document_id', 'unknown'),
                    'section_id': doc.metadata.get('section_id', 'unknown'),
                    'chunk_id': 0,
                    'total_chunks': 1,
                    'chunk_size': doc_size,
                    'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
                    'type': 'text',
                    'is_chunked': False
                })

    # Final summary block.
    log_message(f"\n{'='*60}")
    log_message(f"📊 ИТОГОВАЯ СТАТИСТИКА ЧАНКИНГА:")
    log_message(f"{'='*60}")
    log_message(f" ТАБЛИЦЫ:")
    log_message(f" Целые (не нуждались в чанкинге): {table_whole_count}")
    log_message(f" Чанки (разбиты в table_prep.py): {table_chunked_count}")
    log_message(f" ИЗОБРАЖЕНИЯ:")
    log_message(f" Целые: {image_whole_count}")
    log_message(f" • Чанки: {image_chunked_count}")
    log_message(f" ТЕКСТ:")
    log_message(f" • Целые документы: {text_whole_count}")
    log_message(f" • Чанки: {text_chunked_count}")
    log_message(f" {'─'*58}")
    log_message(f" ВСЕГО ДОКУМЕНТОВ В ИНДЕКСЕ: {len(all_chunked_docs)}")
    log_message(f"{'='*60}\n")

    return all_chunked_docs, chunk_info
215
 
216
+
217
  def extract_text_from_json(data, document_id, document_name):
218
  documents = []
219
 
table_prep.py CHANGED
@@ -32,39 +32,80 @@ def create_table_content(table_data):
32
  from llama_index.core.text_splitter import SentenceSplitter
33
  from config import CHUNK_SIZE, CHUNK_OVERLAP
34
 
35
- def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
36
- if chunk_size is None:
37
- chunk_size = CHUNK_SIZE
38
- if chunk_overlap is None:
39
- chunk_overlap = CHUNK_OVERLAP
40
-
41
- text_splitter = SentenceSplitter(
42
- chunk_size=chunk_size,
43
- chunk_overlap=chunk_overlap,
44
- separator="\n"
45
- )
46
-
47
- text_chunks = text_splitter.split_text(doc.text)
48
-
49
- chunked_docs = []
50
- for i, chunk_text in enumerate(text_chunks):
51
- chunk_metadata = doc.metadata.copy()
52
- chunk_metadata.update({
53
- "chunk_id": i,
54
- "total_chunks": len(text_chunks),
55
- "chunk_size": len(chunk_text),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  "is_chunked": True
57
- })
58
 
59
- chunked_doc = Document(
60
- text=chunk_text,
61
- metadata=chunk_metadata
62
- )
63
- chunked_docs.append(chunked_doc)
64
 
65
- return chunked_docs
 
66
 
67
  def table_to_document(table_data, document_id=None):
 
 
 
68
  if not isinstance(table_data, dict):
69
  log_message(f"⚠️ ПРОПУЩЕНА: table_data не является словарем")
70
  return []
@@ -75,41 +116,46 @@ def table_to_document(table_data, document_id=None):
75
  section = table_data.get('section', 'Неизвестно')
76
 
77
  table_rows = table_data.get('data', [])
78
- if not table_rows or len(table_rows) == 0:
79
- log_message(f"⚠️ ПРОПУЩЕНА: Таблица {table_num} из '{doc_id}' - нет данных в 'data'")
80
  return []
81
 
82
- content = create_table_content(table_data)
83
- content_size = len(content)
84
- row_count = len(table_rows)
85
-
86
- base_doc = Document(
87
- text=content,
88
- metadata={
89
- "type": "table",
90
- "table_number": table_num,
91
- "table_title": table_title,
92
- "document_id": doc_id,
93
- "section": section,
94
- "section_id": section,
95
- "total_rows": row_count,
96
- "content_size": content_size
97
- }
98
- )
99
-
100
- if content_size > CHUNK_SIZE:
101
- log_message(f"📊 CHUNKING: Таблица {table_num} из '{doc_id}' | "
102
- f"Размер: {content_size} > {CHUNK_SIZE} | Строк: {row_count}")
103
- chunked_docs = chunk_table_document(base_doc)
104
- log_message(f" ✂️ Разделена на {len(chunked_docs)} чанков")
105
- for i, chunk_doc in enumerate(chunked_docs):
106
- log_message(f" Чанк {i+1}: {chunk_doc.metadata['chunk_size']} символов")
107
- return chunked_docs
 
 
 
 
108
  else:
109
- log_message(f" ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
110
- f"Размер: {content_size} символов | Строк: {row_count}")
111
- return [base_doc]
112
-
 
113
 
114
  def load_table_data(repo_id, hf_token, table_data_dir):
115
  log_message("=" * 60)
 
32
  from llama_index.core.text_splitter import SentenceSplitter
33
  from config import CHUNK_SIZE, CHUNK_OVERLAP
34
 
35
def create_table_chunks_with_headers(table_data, rows_per_chunk=10):
    """Split a large table into row-group chunks, repeating header context in each.

    Every chunk carries the table title, source document, section and column
    headers, so each chunk stays self-describing when retrieved on its own.

    Args:
        table_data: dict with keys 'document_id'/'document', 'table_number',
            'table_title', 'section', 'headers' (list of column names) and
            'data' (list of rows, typically dicts) — assumed schema, confirm
            against callers.
        rows_per_chunk: NOTE(review): currently unused; the rows-per-chunk
            count is derived from CHUNK_SIZE instead. Kept for interface
            compatibility.

    Returns:
        list[Document]: one Document per row group with chunk metadata
        (chunk_id, total_chunks, row_range, ...); empty list if the table
        has no data rows.
    """
    doc_id = table_data.get('document_id') or table_data.get('document', 'Неизвестно')
    table_num = table_data.get('table_number', 'Неизвестно')
    table_title = table_data.get('table_title', 'Неизвестно')
    section = table_data.get('section', 'Неизвестно')
    headers = table_data.get('headers', [])
    table_rows = table_data.get('data', [])

    if not table_rows:
        return []

    # Header string that is included in EVERY chunk.
    header_context = f"Таблица {table_num}: {table_title}\n"
    header_context += f"Документ: {doc_id}\n"
    header_context += f"Раздел: {section}\n"
    if headers:
        header_context += f"Заголовки: {' | '.join(headers)}\n"
    header_context += f"Всего строк в таблице: {len(table_rows)}\n\n"

    # Estimate an optimal rows-per-chunk from the first few rows.
    # Fix: clamp the average to >= 1 — all-empty sample rows previously
    # caused a ZeroDivisionError below.
    avg_row_size = sum(len(str(row)) for row in table_rows[:5]) / min(5, len(table_rows))
    avg_row_size = max(avg_row_size, 1.0)
    max_chunk_size = CHUNK_SIZE - len(header_context) - 500  # safety margin
    optimal_rows = max(5, int(max_chunk_size / avg_row_size))

    log_message(f" 📐 Средний размер строки: {avg_row_size:.0f} символов")
    log_message(f" 📊 Оптимальное кол-во строк на чанк: {optimal_rows}")

    chunks = []
    total_rows = len(table_rows)

    for i in range(0, total_rows, optimal_rows):
        chunk_rows = table_rows[i:i + optimal_rows]

        # Build chunk content: shared header + explicit row range + rows.
        chunk_content = header_context
        chunk_content += f"[Строки {i+1}-{min(i+optimal_rows, total_rows)} из {total_rows}]\n"
        chunk_content += "Данные:\n"

        for row_idx, row in enumerate(chunk_rows, start=i+1):
            if isinstance(row, dict):
                row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
            else:
                # Fix: non-dict rows were silently dropped even though they
                # are counted in total_rows and in the row-range labels.
                row_text = str(row)
            chunk_content += f"Строка {row_idx}: {row_text}\n"

        chunk_metadata = {
            "type": "table",
            "table_number": table_num,
            "table_title": table_title,
            "document_id": doc_id,
            "section": section,
            "section_id": section,
            "headers": headers,
            "chunk_id": i // optimal_rows,
            "total_chunks": (total_rows + optimal_rows - 1) // optimal_rows,
            "row_range": f"{i+1}-{min(i+optimal_rows, total_rows)}",
            "total_table_rows": total_rows,
            "is_chunked": True
        }

        doc = Document(text=chunk_content, metadata=chunk_metadata)
        chunks.append(doc)

        log_message(f" Чанк {len(chunks)}: строки {i+1}-{min(i+optimal_rows, total_rows)} | "
                    f"{len(chunk_content)} символов")

    return chunks
103
+
104
 
105
  def table_to_document(table_data, document_id=None):
106
+ """
107
+ Convert table to Document(s) with intelligent chunking
108
+ """
109
  if not isinstance(table_data, dict):
110
  log_message(f"⚠️ ПРОПУЩЕНА: table_data не является словарем")
111
  return []
 
116
  section = table_data.get('section', 'Неизвестно')
117
 
118
  table_rows = table_data.get('data', [])
119
+ if not table_rows:
120
+ log_message(f"⚠️ ПРОПУЩЕНА: Таблица {table_num} из '{doc_id}' - нет данных")
121
  return []
122
 
123
+ log_message(f"\n📊 Обработка таблицы {table_num} из документа '{doc_id}'")
124
+ log_message(f" Название: {table_title}")
125
+ log_message(f" Раздел: {section}")
126
+ log_message(f" Строк данных: {len(table_rows)}")
127
+
128
+ # Estimate if table needs chunking
129
+ sample_content = create_table_content(table_data)
130
+ estimated_size = len(sample_content)
131
+
132
+ log_message(f" Оценочный размер: {estimated_size} символов")
133
+
134
+ # Threshold: if table is small enough, keep it whole
135
+ if estimated_size <= CHUNK_SIZE * 0.8: # 80% of limit for safety
136
+ log_message(f" ✅ Таблица достаточно мала, хранится целиком")
137
+ doc = Document(
138
+ text=sample_content,
139
+ metadata={
140
+ "type": "table",
141
+ "table_number": table_num,
142
+ "table_title": table_title,
143
+ "document_id": doc_id,
144
+ "section": section,
145
+ "section_id": section,
146
+ "headers": table_data.get('headers', []),
147
+ "total_rows": len(table_rows),
148
+ "content_size": estimated_size,
149
+ "is_chunked": False
150
+ }
151
+ )
152
+ return [doc]
153
  else:
154
+ log_message(f" ⚠️ Таблица слишком большая ({estimated_size} > {CHUNK_SIZE})")
155
+ log_message(f" 🔄 Применяется умный чанкинг с сохранением заголовков...")
156
+ chunks = create_table_chunks_with_headers(table_data)
157
+ log_message(f" ✅ Таблица разбита на {len(chunks)} чанков с сохранением структуры")
158
+ return chunks
159
 
160
  def load_table_data(repo_id, hf_token, table_data_dir):
161
  log_message("=" * 60)