Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 6, 2025

Commit

e04e66f

1 Parent(s): bb76787

adaptive table chunking

Browse files

Files changed (1) hide show

documents_prep.py +194 -105

documents_prep.py CHANGED Viewed

@@ -38,76 +38,51 @@ def chunk_text_documents(documents):
     return chunked
-def chunk_table_by_content(table_data, doc_id, max_chars=1024):
-    """Chunk tables by content size instead of rows"""
     headers = table_data.get('headers', [])
     rows = table_data.get('data', [])
-    table_num = table_data.get('table_number', 'unknown')
     table_title = table_data.get('table_title', '')
     section = table_data.get('section', '')
-    table_num_clean = str(table_num).strip()
-    # Create section-aware identifier
     import re
     if 'приложени' in section.lower():
         appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
         if appendix_match:
             appendix_num = appendix_match.group(1).upper()
-            table_identifier = f"{table_num_clean} Приложение {appendix_num}"
         else:
-            table_identifier = table_num_clean
     else:
-        table_identifier = table_num_clean
     if not rows:
         return []
     log_message(f"  📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
-    # SIMPLIFIED base content - remove redundant search keywords
-    base_content = f"ДОКУМЕНТ: {doc_id}\n"
-    base_content += f"ТАБЛИЦА: {table_identifier}\n"
     if table_title:
-        base_content += f"НАЗВАНИЕ: {table_title}\n"
-    if section:
-        base_content += f"РАЗДЕЛ: {section}\n"
-    base_content += f"{'='*70}\n\n"
     if headers:
-        header_str = ' | '.join(str(h) for h in headers)
-        base_content += f"ЗАГОЛОВКИ: {header_str}\n\n"
-    base_content += "ДАННЫЕ:\n"
-    base_size = len(base_content)
-    available_space = max_chars - base_size - 100  # Reduced footer overhead
-    # Rest of the function stays the same...
-    full_rows_content = format_table_rows(rows)
-    if base_size + len(full_rows_content) <= max_chars:
-        content = base_content + full_rows_content
-        metadata = {
-            'type': 'table',
-            'document_id': doc_id,
-            'table_number': table_num_clean,
-            'table_identifier': table_identifier,
-            'table_title': table_title,
-            'section': section,
-            'total_rows': len(rows),
-            'chunk_size': len(content),
-            'is_complete_table': True,
-            'row_start': 0,
-            'row_end': len(rows)
-        }
-        log_message(f"    Single chunk: {len(content)} chars, {len(rows)} rows")
-        return [Document(text=content, metadata=metadata)]
-    # Chunking logic with row indices...
     chunks = []
-    current_rows = []
     current_size = 0
     chunk_num = 0
@@ -115,62 +90,187 @@ def chunk_table_by_content(table_data, doc_id, max_chars=1024):
         row_text = format_single_row(row, i + 1)
         row_size = len(row_text)
-        if current_size + row_size > available_space and current_rows:
-            content = base_content + format_table_rows(current_rows)
-            content += f"\n[Строки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}]\n"
-            metadata = {
-                'type': 'table',
-                'document_id': doc_id,
-                'table_number': table_num_clean,
-                'table_identifier': table_identifier,
-                'table_title': table_title,
-                'section': section,
-                'chunk_id': chunk_num,
-                'row_start': current_rows[0]['_idx'] - 1,
-                'row_end': current_rows[-1]['_idx'],
-                'total_rows': len(rows),
-                'chunk_size': len(content),
-                'is_complete_table': False
-            }
-            chunks.append(Document(text=content, metadata=metadata))
-            log_message(f"    Chunk {chunk_num + 1}: {len(content)} chars, rows {current_rows[0]['_idx']}-{current_rows[-1]['_idx']}")
             chunk_num += 1
-            current_rows = []
             current_size = 0
-        row_copy = row.copy() if isinstance(row, dict) else {'data': row}
-        row_copy['_idx'] = i + 1
-        current_rows.append(row_copy)
         current_size += row_size
-    if current_rows:
-        content = base_content + format_table_rows(current_rows)
-        content += f"\n[Строки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}]\n"
-        metadata = {
-            'type': 'table',
-            'document_id': doc_id,
-            'table_number': table_num_clean,
-            'table_identifier': table_identifier,
-            'table_title': table_title,
-            'section': section,
-            'chunk_id': chunk_num,
-            'row_start': current_rows[0]['_idx'] - 1,
-            'row_end': current_rows[-1]['_idx'],
-            'total_rows': len(rows),
-            'chunk_size': len(content),
-            'is_complete_table': False
-        }
-        chunks.append(Document(text=content, metadata=metadata))
-        log_message(f"    Chunk {chunk_num + 1}: {len(content)} chars, rows {current_rows[0]['_idx']}-{current_rows[-1]['_idx']}")
     return chunks
 def format_single_row(row, idx):
     """Format a single row"""
     if isinstance(row, dict):
@@ -185,18 +285,6 @@ def format_single_row(row, idx):
     return ""
-def format_table_rows(rows):
-    """Format multiple rows"""
-    content = ""
-    for row in rows:
-        idx = row.get('_idx', 0)
-        content += format_single_row(row, idx)
-    return content
-def format_table_footer(table_identifier, doc_id):
-    """Format table footer"""
-    return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
 def load_table_documents(repo_id, hf_token, table_dir):
     log_message("Loading tables...")
@@ -222,9 +310,10 @@ def load_table_documents(repo_id, hf_token, table_dir):
             for sheet in data.get('sheets', []):
                 sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
-                chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=1024)
                 all_chunks.extend(chunks)
         except Exception as e:
             log_message(f"Error loading {file_path}: {e}")

     return chunked
+def chunk_table_by_rows(table_data, doc_id, rows_per_chunk=4, max_chars=3000):
+    """
+    Chunk tables by rows with fallback to character limit.
+    Keeps 3-4 rows together, but splits individual rows if they're too large.
+    """
     headers = table_data.get('headers', [])
     rows = table_data.get('data', [])
+    table_num = str(table_data.get('table_number', 'unknown')).strip()
     table_title = table_data.get('table_title', '')
     section = table_data.get('section', '')
+    # Section-aware identifier (keep your existing logic)
     import re
     if 'приложени' in section.lower():
         appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
         if appendix_match:
             appendix_num = appendix_match.group(1).upper()
+            table_identifier = f"{table_num} Приложение {appendix_num}"
         else:
+            table_identifier = table_num
     else:
+        table_identifier = table_num
     if not rows:
         return []
     log_message(f"  📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
+    # Build base header (compact version)
+    base_header = f"ДОКУМЕНТ: {doc_id} | ТАБЛИЦА: {table_identifier}\n"
     if table_title:
+        base_header += f"НАЗВАНИЕ: {table_title}\n"
+    base_header += f"{'='*60}\n"
     if headers:
+        header_str = ' | '.join(str(h)[:30] for h in headers)  # Truncate long headers
+        base_header += f"ЗАГОЛОВКИ: {header_str}\n\n"
+    # Calculate available space
+    base_size = len(base_header)
+    footer_size = 100
+    available_space = max_chars - base_size - footer_size
     chunks = []
+    current_batch = []
     current_size = 0
     chunk_num = 0
         row_text = format_single_row(row, i + 1)
         row_size = len(row_text)
+        # Case 1: Single row exceeds max - split it internally
+        if row_size > available_space:
+            # Flush current batch first
+            if current_batch:
+                chunks.append(_create_chunk(
+                    base_header, current_batch, table_identifier,
+                    doc_id, table_num, table_title, section,
+                    len(rows), chunk_num, False
+                ))
+                chunk_num += 1
+                current_batch = []
+                current_size = 0
+            log_message(f"    ⚠ Row {i+1} too large ({row_size} chars), splitting...")
+            # Split the large row
+            split_chunks = _split_large_row(
+                row, i + 1, base_header, available_space,
+                table_identifier, doc_id, table_num, table_title,
+                section, len(rows), chunk_num
+            )
+            chunks.extend(split_chunks)
+            log_message(f"      → Created {len(split_chunks)} chunks from row {i+1}")
+            chunk_num += len(split_chunks)
+            continue
+        # Case 2: Adding this row would exceed limit - flush current batch
+        if current_size + row_size > available_space and current_batch:
+            chunks.append(_create_chunk(
+                base_header, current_batch, table_identifier,
+                doc_id, table_num, table_title, section,
+                len(rows), chunk_num, False
+            ))
             chunk_num += 1
+            current_batch = []
             current_size = 0
+        # Case 3: Add row to current batch
+        current_batch.append({'row': row, 'idx': i + 1, 'text': row_text})
+        log_message(f"    + Row {i+1} ({row_size} chars) added to chunk {chunk_num}")
         current_size += row_size
+        # Flush if we hit target row count
+        if len(current_batch) >= rows_per_chunk:
+            chunks.append(_create_chunk(
+                base_header, current_batch, table_identifier,
+                doc_id, table_num, table_title, section,
+                len(rows), chunk_num, False
+            ))
+            chunk_num += 1
+            current_batch = []
+            current_size = 0
+    # Flush remaining rows
+    if current_batch:
+        chunks.append(_create_chunk(
+            base_header, current_batch, table_identifier,
+            doc_id, table_num, table_title, section,
+            len(rows), chunk_num, len(chunks) == 0
+        ))
+    log_message(f"    Created {len(chunks)} chunks from {len(rows)} rows")
+    return chunks
+def _create_chunk(base_header, batch, table_identifier, doc_id,
+                  table_num, table_title, section, total_rows,
+                  chunk_num, is_complete):
+    """Helper to create a chunk with full metadata"""
+    content = base_header + "ДАННЫЕ:\n"
+    for item in batch:
+        content += item['text']
+    row_start = batch[0]['idx']
+    row_end = batch[-1]['idx']
+    # Add footer with row info
+    if not is_complete:
+        content += f"\n[Строки {row_start}-{row_end} из {total_rows}]"
+    # EMBED ALL METADATA IN TEXT for better retrieval
+    content += f"\n\n--- МЕТАДАННЫЕ ---\n"
+    content += f"Документ: {doc_id}\n"
+    content += f"Таблица: {table_identifier}\n"
+    content += f"Название таблицы: {table_title}\n"
+    content += f"Раздел: {section}\n"
+    content += f"Строки: {row_start}-{row_end} из {total_rows}\n"
+    metadata = {
+        'type': 'table',
+        'document_id': doc_id,
+        'table_number': table_num,
+        'table_identifier': table_identifier,
+        'table_title': table_title,
+        'section': section,
+        'chunk_id': chunk_num,
+        'row_start': row_start - 1,
+        'row_end': row_end,
+        'total_rows': total_rows,
+        'chunk_size': len(content),
+        'is_complete_table': is_complete,
+        'rows_in_chunk': len(batch)
+    }
+    return Document(text=content, metadata=metadata)
+def _split_large_row(row, row_idx, base_header, max_size,
+                     table_identifier, doc_id, table_num,
+                     table_title, section, total_rows, base_chunk_num):
+    """Split a single large row into multiple chunks"""
+    if isinstance(row, dict):
+        items = list(row.items())
+    else:
+        items = [(f"col_{i}", v) for i, v in enumerate(row)]
+    chunks = []
+    current_items = []
+    current_size = 0
+    part_num = 0
+    for key, value in items:
+        item_text = f"{key}: {value}\n"
+        item_size = len(item_text)
+        if current_size + item_size > max_size and current_items:
+            # Create chunk for current items
+            content = base_header + "ДАННЫЕ:\n"
+            content += f"Строка {row_idx} (часть {part_num + 1}):\n"
+            content += "".join(current_items)
+            content += f"\n[Строка {row_idx} из {total_rows} - продолжается]"
+            chunks.append(_create_chunk_from_text(
+                content, doc_id, table_num, table_identifier,
+                table_title, section, row_idx, row_idx,
+                total_rows, base_chunk_num + part_num
+            ))
+            part_num += 1
+            current_items = []
+            current_size = 0
+        current_items.append(item_text)
+        current_size += item_size
+    # Flush remaining
+    if current_items:
+        content = base_header + "ДАННЫЕ:\n"
+        content += f"Строка {row_idx} (часть {part_num + 1}):\n"
+        content += "".join(current_items)
+        chunks.append(_create_chunk_from_text(
+            content, doc_id, table_num, table_identifier,
+            table_title, section, row_idx, row_idx,
+            total_rows, base_chunk_num + part_num
+        ))
     return chunks
+def _create_chunk_from_text(content, doc_id, table_num, table_identifier,
+                            table_title, section, row_start, row_end,
+                            total_rows, chunk_num):
+    """Helper for creating chunk from pre-built text"""
+    metadata = {
+        'type': 'table',
+        'document_id': doc_id,
+        'table_number': table_num,
+        'table_identifier': table_identifier,
+        'table_title': table_title,
+        'section': section,
+        'chunk_id': chunk_num,
+        'row_start': row_start - 1,
+        'row_end': row_end,
+        'total_rows': total_rows,
+        'chunk_size': len(content),
+        'is_complete_table': False
+    }
+    return Document(text=content, metadata=metadata)
 def format_single_row(row, idx):
     """Format a single row"""
     if isinstance(row, dict):
     return ""
 def load_table_documents(repo_id, hf_token, table_dir):
     log_message("Loading tables...")
             for sheet in data.get('sheets', []):
                 sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
+                # USE NEW ADAPTIVE CHUNKING
+                chunks = chunk_table_by_rows(sheet, sheet_doc_id, max_chars=3072)
                 all_chunks.extend(chunks)
+                log_message(f"  📄 {sheet_doc_id}: {len(chunks)} chunks")
         except Exception as e:
             log_message(f"Error loading {file_path}: {e}")