MrSimple07 commited on
Commit
e04e66f
·
1 Parent(s): bb76787

adaptive table chunking

Browse files
Files changed (1) hide show
  1. documents_prep.py +194 -105
documents_prep.py CHANGED
@@ -38,76 +38,51 @@ def chunk_text_documents(documents):
38
  return chunked
39
 
40
 
41
- def chunk_table_by_content(table_data, doc_id, max_chars=1024):
42
- """Chunk tables by content size instead of rows"""
 
 
 
43
  headers = table_data.get('headers', [])
44
  rows = table_data.get('data', [])
45
- table_num = table_data.get('table_number', 'unknown')
46
  table_title = table_data.get('table_title', '')
47
  section = table_data.get('section', '')
48
-
49
- table_num_clean = str(table_num).strip()
50
 
51
- # Create section-aware identifier
52
  import re
53
  if 'приложени' in section.lower():
54
  appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
55
  if appendix_match:
56
  appendix_num = appendix_match.group(1).upper()
57
- table_identifier = f"{table_num_clean} Приложение {appendix_num}"
58
  else:
59
- table_identifier = table_num_clean
60
  else:
61
- table_identifier = table_num_clean
62
 
63
  if not rows:
64
  return []
65
 
66
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
67
 
68
- # SIMPLIFIED base content - remove redundant search keywords
69
- base_content = f"ДОКУМЕНТ: {doc_id}\n"
70
- base_content += f"ТАБЛИЦА: {table_identifier}\n"
71
  if table_title:
72
- base_content += f"НАЗВАНИЕ: {table_title}\n"
73
- if section:
74
- base_content += f"РАЗДЕЛ: {section}\n"
75
- base_content += f"{'='*70}\n\n"
76
 
77
  if headers:
78
- header_str = ' | '.join(str(h) for h in headers)
79
- base_content += f"ЗАГОЛОВКИ: {header_str}\n\n"
80
 
81
- base_content += "ДАННЫЕ:\n"
 
 
 
82
 
83
- base_size = len(base_content)
84
- available_space = max_chars - base_size - 100 # Reduced footer overhead
85
-
86
- # Rest of the function stays the same...
87
- full_rows_content = format_table_rows(rows)
88
- if base_size + len(full_rows_content) <= max_chars:
89
- content = base_content + full_rows_content
90
-
91
- metadata = {
92
- 'type': 'table',
93
- 'document_id': doc_id,
94
- 'table_number': table_num_clean,
95
- 'table_identifier': table_identifier,
96
- 'table_title': table_title,
97
- 'section': section,
98
- 'total_rows': len(rows),
99
- 'chunk_size': len(content),
100
- 'is_complete_table': True,
101
- 'row_start': 0,
102
- 'row_end': len(rows)
103
- }
104
-
105
- log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
106
- return [Document(text=content, metadata=metadata)]
107
-
108
- # Chunking logic with row indices...
109
  chunks = []
110
- current_rows = []
111
  current_size = 0
112
  chunk_num = 0
113
 
@@ -115,62 +90,187 @@ def chunk_table_by_content(table_data, doc_id, max_chars=1024):
115
  row_text = format_single_row(row, i + 1)
116
  row_size = len(row_text)
117
 
118
- if current_size + row_size > available_space and current_rows:
119
- content = base_content + format_table_rows(current_rows)
120
- content += f"\n[Строки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}]\n"
121
-
122
- metadata = {
123
- 'type': 'table',
124
- 'document_id': doc_id,
125
- 'table_number': table_num_clean,
126
- 'table_identifier': table_identifier,
127
- 'table_title': table_title,
128
- 'section': section,
129
- 'chunk_id': chunk_num,
130
- 'row_start': current_rows[0]['_idx'] - 1,
131
- 'row_end': current_rows[-1]['_idx'],
132
- 'total_rows': len(rows),
133
- 'chunk_size': len(content),
134
- 'is_complete_table': False
135
- }
136
-
137
- chunks.append(Document(text=content, metadata=metadata))
138
- log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, rows {current_rows[0]['_idx']}-{current_rows[-1]['_idx']}")
139
-
 
 
 
 
 
 
 
 
 
140
  chunk_num += 1
141
- current_rows = []
142
  current_size = 0
143
 
144
- row_copy = row.copy() if isinstance(row, dict) else {'data': row}
145
- row_copy['_idx'] = i + 1
146
- current_rows.append(row_copy)
147
  current_size += row_size
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
- if current_rows:
150
- content = base_content + format_table_rows(current_rows)
151
- content += f"\n[Строки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}]\n"
 
 
 
 
 
152
 
153
- metadata = {
154
- 'type': 'table',
155
- 'document_id': doc_id,
156
- 'table_number': table_num_clean,
157
- 'table_identifier': table_identifier,
158
- 'table_title': table_title,
159
- 'section': section,
160
- 'chunk_id': chunk_num,
161
- 'row_start': current_rows[0]['_idx'] - 1,
162
- 'row_end': current_rows[-1]['_idx'],
163
- 'total_rows': len(rows),
164
- 'chunk_size': len(content),
165
- 'is_complete_table': False
166
- }
 
 
167
 
168
- chunks.append(Document(text=content, metadata=metadata))
169
- log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, rows {current_rows[0]['_idx']}-{current_rows[-1]['_idx']}")
 
 
 
 
 
 
 
 
 
 
 
 
170
 
171
  return chunks
172
 
173
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  def format_single_row(row, idx):
175
  """Format a single row"""
176
  if isinstance(row, dict):
@@ -185,18 +285,6 @@ def format_single_row(row, idx):
185
  return ""
186
 
187
 
188
- def format_table_rows(rows):
189
- """Format multiple rows"""
190
- content = ""
191
- for row in rows:
192
- idx = row.get('_idx', 0)
193
- content += format_single_row(row, idx)
194
- return content
195
-
196
-
197
- def format_table_footer(table_identifier, doc_id):
198
- """Format table footer"""
199
- return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
200
 
201
  def load_table_documents(repo_id, hf_token, table_dir):
202
  log_message("Loading tables...")
@@ -222,9 +310,10 @@ def load_table_documents(repo_id, hf_token, table_dir):
222
  for sheet in data.get('sheets', []):
223
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
224
 
225
- chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=1024)
 
226
  all_chunks.extend(chunks)
227
-
228
  except Exception as e:
229
  log_message(f"Error loading {file_path}: {e}")
230
 
 
38
  return chunked
39
 
40
 
41
+ def chunk_table_by_rows(table_data, doc_id, rows_per_chunk=4, max_chars=3000):
42
+ """
43
+ Chunk tables by rows with fallback to character limit.
44
+ Keeps 3-4 rows together, but splits individual rows if they're too large.
45
+ """
46
  headers = table_data.get('headers', [])
47
  rows = table_data.get('data', [])
48
+ table_num = str(table_data.get('table_number', 'unknown')).strip()
49
  table_title = table_data.get('table_title', '')
50
  section = table_data.get('section', '')
 
 
51
 
52
+ # Section-aware identifier (tables inside appendices get the appendix number appended)
53
  import re
54
  if 'приложени' in section.lower():
55
  appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
56
  if appendix_match:
57
  appendix_num = appendix_match.group(1).upper()
58
+ table_identifier = f"{table_num} Приложение {appendix_num}"
59
  else:
60
+ table_identifier = table_num
61
  else:
62
+ table_identifier = table_num
63
 
64
  if not rows:
65
  return []
66
 
67
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
68
 
69
+ # Build base header (compact version)
70
+ base_header = f"ДОКУМЕНТ: {doc_id} | ТАБЛИЦА: {table_identifier}\n"
 
71
  if table_title:
72
+ base_header += f"НАЗВАНИЕ: {table_title}\n"
73
+ base_header += f"{'='*60}\n"
 
 
74
 
75
  if headers:
76
+ header_str = ' | '.join(str(h)[:30] for h in headers) # Truncate long headers
77
+ base_header += f"ЗАГОЛОВКИ: {header_str}\n\n"
78
 
79
+ # Calculate available space
80
+ base_size = len(base_header)
81
+ footer_size = 100
82
+ available_space = max_chars - base_size - footer_size
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  chunks = []
85
+ current_batch = []
86
  current_size = 0
87
  chunk_num = 0
88
 
 
90
  row_text = format_single_row(row, i + 1)
91
  row_size = len(row_text)
92
 
93
+ # Case 1: Single row exceeds max - split it internally
94
+ if row_size > available_space:
95
+ # Flush current batch first
96
+ if current_batch:
97
+ chunks.append(_create_chunk(
98
+ base_header, current_batch, table_identifier,
99
+ doc_id, table_num, table_title, section,
100
+ len(rows), chunk_num, False
101
+ ))
102
+ chunk_num += 1
103
+ current_batch = []
104
+ current_size = 0
105
+ log_message(f" ⚠ Row {i+1} too large ({row_size} chars), splitting...")
106
+ # Split the large row
107
+ split_chunks = _split_large_row(
108
+ row, i + 1, base_header, available_space,
109
+ table_identifier, doc_id, table_num, table_title,
110
+ section, len(rows), chunk_num
111
+ )
112
+ chunks.extend(split_chunks)
113
+ log_message(f" Created {len(split_chunks)} chunks from row {i+1}")
114
+ chunk_num += len(split_chunks)
115
+ continue
116
+
117
+ # Case 2: Adding this row would exceed limit - flush current batch
118
+ if current_size + row_size > available_space and current_batch:
119
+ chunks.append(_create_chunk(
120
+ base_header, current_batch, table_identifier,
121
+ doc_id, table_num, table_title, section,
122
+ len(rows), chunk_num, False
123
+ ))
124
  chunk_num += 1
125
+ current_batch = []
126
  current_size = 0
127
 
128
+ # Case 3: Add row to current batch
129
+ current_batch.append({'row': row, 'idx': i + 1, 'text': row_text})
130
+ log_message(f" + Row {i+1} ({row_size} chars) added to chunk {chunk_num}")
131
  current_size += row_size
132
+
133
+ # Flush if we hit target row count
134
+ if len(current_batch) >= rows_per_chunk:
135
+ chunks.append(_create_chunk(
136
+ base_header, current_batch, table_identifier,
137
+ doc_id, table_num, table_title, section,
138
+ len(rows), chunk_num, False
139
+ ))
140
+ chunk_num += 1
141
+ current_batch = []
142
+ current_size = 0
143
+
144
+ # Flush remaining rows
145
+ if current_batch:
146
+ chunks.append(_create_chunk(
147
+ base_header, current_batch, table_identifier,
148
+ doc_id, table_num, table_title, section,
149
+ len(rows), chunk_num, len(chunks) == 0
150
+ ))
151
+
152
+ log_message(f" Created {len(chunks)} chunks from {len(rows)} rows")
153
+ return chunks
154
+
155
+
156
+ def _create_chunk(base_header, batch, table_identifier, doc_id,
157
+ table_num, table_title, section, total_rows,
158
+ chunk_num, is_complete):
159
+ """Helper to create a chunk with full metadata"""
160
+ content = base_header + "ДАННЫЕ:\n"
161
+
162
+ for item in batch:
163
+ content += item['text']
164
+
165
+ row_start = batch[0]['idx']
166
+ row_end = batch[-1]['idx']
167
+
168
+ # Add footer with row info
169
+ if not is_complete:
170
+ content += f"\n[Строки {row_start}-{row_end} из {total_rows}]"
171
+
172
+ # EMBED ALL METADATA IN TEXT for better retrieval
173
+ content += f"\n\n--- МЕТАДАННЫЕ ---\n"
174
+ content += f"Документ: {doc_id}\n"
175
+ content += f"Таблица: {table_identifier}\n"
176
+ content += f"Название таблицы: {table_title}\n"
177
+ content += f"Раздел: {section}\n"
178
+ content += f"Строки: {row_start}-{row_end} из {total_rows}\n"
179
+
180
+ metadata = {
181
+ 'type': 'table',
182
+ 'document_id': doc_id,
183
+ 'table_number': table_num,
184
+ 'table_identifier': table_identifier,
185
+ 'table_title': table_title,
186
+ 'section': section,
187
+ 'chunk_id': chunk_num,
188
+ 'row_start': row_start - 1,
189
+ 'row_end': row_end,
190
+ 'total_rows': total_rows,
191
+ 'chunk_size': len(content),
192
+ 'is_complete_table': is_complete,
193
+ 'rows_in_chunk': len(batch)
194
+ }
195
+
196
+ return Document(text=content, metadata=metadata)
197
+
198
+
199
+ def _split_large_row(row, row_idx, base_header, max_size,
200
+ table_identifier, doc_id, table_num,
201
+ table_title, section, total_rows, base_chunk_num):
202
+ """Split a single large row into multiple chunks"""
203
+ if isinstance(row, dict):
204
+ items = list(row.items())
205
+ else:
206
+ items = [(f"col_{i}", v) for i, v in enumerate(row)]
207
 
208
+ chunks = []
209
+ current_items = []
210
+ current_size = 0
211
+ part_num = 0
212
+
213
+ for key, value in items:
214
+ item_text = f"{key}: {value}\n"
215
+ item_size = len(item_text)
216
 
217
+ if current_size + item_size > max_size and current_items:
218
+ # Create chunk for current items
219
+ content = base_header + "ДАННЫЕ:\n"
220
+ content += f"Строка {row_idx} (часть {part_num + 1}):\n"
221
+ content += "".join(current_items)
222
+ content += f"\n[Строка {row_idx} из {total_rows} - продолжается]"
223
+
224
+ chunks.append(_create_chunk_from_text(
225
+ content, doc_id, table_num, table_identifier,
226
+ table_title, section, row_idx, row_idx,
227
+ total_rows, base_chunk_num + part_num
228
+ ))
229
+
230
+ part_num += 1
231
+ current_items = []
232
+ current_size = 0
233
 
234
+ current_items.append(item_text)
235
+ current_size += item_size
236
+
237
+ # Flush remaining
238
+ if current_items:
239
+ content = base_header + "ДАННЫЕ:\n"
240
+ content += f"Строка {row_idx} (часть {part_num + 1}):\n"
241
+ content += "".join(current_items)
242
+
243
+ chunks.append(_create_chunk_from_text(
244
+ content, doc_id, table_num, table_identifier,
245
+ table_title, section, row_idx, row_idx,
246
+ total_rows, base_chunk_num + part_num
247
+ ))
248
 
249
  return chunks
250
 
251
 
252
+ def _create_chunk_from_text(content, doc_id, table_num, table_identifier,
253
+ table_title, section, row_start, row_end,
254
+ total_rows, chunk_num):
255
+ """Helper for creating chunk from pre-built text"""
256
+ metadata = {
257
+ 'type': 'table',
258
+ 'document_id': doc_id,
259
+ 'table_number': table_num,
260
+ 'table_identifier': table_identifier,
261
+ 'table_title': table_title,
262
+ 'section': section,
263
+ 'chunk_id': chunk_num,
264
+ 'row_start': row_start - 1,
265
+ 'row_end': row_end,
266
+ 'total_rows': total_rows,
267
+ 'chunk_size': len(content),
268
+ 'is_complete_table': False
269
+ }
270
+
271
+ return Document(text=content, metadata=metadata)
272
+
273
+
274
  def format_single_row(row, idx):
275
  """Format a single row"""
276
  if isinstance(row, dict):
 
285
  return ""
286
 
287
 
 
 
 
 
 
 
 
 
 
 
 
 
288
 
289
  def load_table_documents(repo_id, hf_token, table_dir):
290
  log_message("Loading tables...")
 
310
  for sheet in data.get('sheets', []):
311
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
312
 
313
+ # Use the new adaptive row-based chunking (replaces chunk_table_by_content)
314
+ chunks = chunk_table_by_rows(sheet, sheet_doc_id, max_chars=3072)
315
  all_chunks.extend(chunks)
316
+ log_message(f" 📄 {sheet_doc_id}: {len(chunks)} chunks")
317
  except Exception as e:
318
  log_message(f"Error loading {file_path}: {e}")
319