Spaces:
Sleeping
Sleeping
Commit
·
38ed4e9
1
Parent(s):
d013631
new documents prep
Browse files
- table_prep.py +16 -23
- utils.py +3 -4
table_prep.py
CHANGED
|
@@ -32,15 +32,14 @@ def create_table_content(table_data):
|
|
| 32 |
from llama_index.core.text_splitter import SentenceSplitter
|
| 33 |
from config import CHUNK_SIZE, CHUNK_OVERLAP
|
| 34 |
|
| 35 |
-
def chunk_table_document(doc, max_rows_per_chunk=3, max_chunk_size=2000):
|
| 36 |
-
|
| 37 |
lines = doc.text.strip().split('\n')
|
| 38 |
-
|
| 39 |
# Separate header and data rows
|
| 40 |
header_lines = []
|
| 41 |
data_rows = []
|
| 42 |
in_data = False
|
| 43 |
-
|
| 44 |
for line in lines:
|
| 45 |
if line.startswith('Данные таблицы:'):
|
| 46 |
in_data = True
|
|
@@ -49,40 +48,34 @@ def chunk_table_document(doc, max_rows_per_chunk=3, max_chunk_size=2000):
|
|
| 49 |
data_rows.append(line)
|
| 50 |
elif not in_data:
|
| 51 |
header_lines.append(line)
|
| 52 |
-
|
| 53 |
header = '\n'.join(header_lines) + '\n'
|
| 54 |
-
|
| 55 |
-
# No rows to chunk
|
| 56 |
if not data_rows:
|
| 57 |
return [doc]
|
| 58 |
-
|
| 59 |
-
# Chunk the data rows
|
| 60 |
chunks = []
|
| 61 |
current_rows = []
|
| 62 |
current_size = len(header)
|
| 63 |
-
|
| 64 |
for row in data_rows:
|
| 65 |
-
row_size = len(row) + 1
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
# Save current chunk
|
| 70 |
chunk_text = header + '\n'.join(current_rows)
|
| 71 |
chunks.append(chunk_text)
|
| 72 |
log_message(f"Создана часть таблицы размером {len(chunk_text)} символов с {len(current_rows)} строками")
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
current_size = len(header) + len(current_rows[0]) + 1
|
| 77 |
-
|
| 78 |
current_rows.append(row)
|
| 79 |
current_size += row_size
|
| 80 |
-
|
| 81 |
# Add final chunk
|
| 82 |
if current_rows:
|
| 83 |
chunk_text = header + '\n'.join(current_rows)
|
| 84 |
chunks.append(chunk_text)
|
| 85 |
-
|
| 86 |
# Create Document objects
|
| 87 |
chunked_docs = []
|
| 88 |
for i, chunk_text in enumerate(chunks):
|
|
@@ -99,7 +92,7 @@ def chunk_table_document(doc, max_rows_per_chunk=3, max_chunk_size=2000):
|
|
| 99 |
}
|
| 100 |
)
|
| 101 |
chunked_docs.append(chunk_doc)
|
| 102 |
-
|
| 103 |
return chunked_docs
|
| 104 |
|
| 105 |
|
|
|
|
| 32 |
from llama_index.core.text_splitter import SentenceSplitter
|
| 33 |
from config import CHUNK_SIZE, CHUNK_OVERLAP
|
| 34 |
|
| 35 |
+
def chunk_table_document(doc, max_chunk_size=2000):
|
|
|
|
| 36 |
lines = doc.text.strip().split('\n')
|
| 37 |
+
|
| 38 |
# Separate header and data rows
|
| 39 |
header_lines = []
|
| 40 |
data_rows = []
|
| 41 |
in_data = False
|
| 42 |
+
|
| 43 |
for line in lines:
|
| 44 |
if line.startswith('Данные таблицы:'):
|
| 45 |
in_data = True
|
|
|
|
| 48 |
data_rows.append(line)
|
| 49 |
elif not in_data:
|
| 50 |
header_lines.append(line)
|
| 51 |
+
|
| 52 |
header = '\n'.join(header_lines) + '\n'
|
| 53 |
+
|
|
|
|
| 54 |
if not data_rows:
|
| 55 |
return [doc]
|
| 56 |
+
|
|
|
|
| 57 |
chunks = []
|
| 58 |
current_rows = []
|
| 59 |
current_size = len(header)
|
| 60 |
+
|
| 61 |
for row in data_rows:
|
| 62 |
+
row_size = len(row) + 1
|
| 63 |
+
# If adding this row would exceed max_chunk_size, save current chunk
|
| 64 |
+
if current_size + row_size > max_chunk_size and current_rows:
|
|
|
|
|
|
|
| 65 |
chunk_text = header + '\n'.join(current_rows)
|
| 66 |
chunks.append(chunk_text)
|
| 67 |
log_message(f"Создана часть таблицы размером {len(chunk_text)} символов с {len(current_rows)} строками")
|
| 68 |
+
current_rows = []
|
| 69 |
+
current_size = len(header)
|
| 70 |
+
|
|
|
|
|
|
|
| 71 |
current_rows.append(row)
|
| 72 |
current_size += row_size
|
| 73 |
+
|
| 74 |
# Add final chunk
|
| 75 |
if current_rows:
|
| 76 |
chunk_text = header + '\n'.join(current_rows)
|
| 77 |
chunks.append(chunk_text)
|
| 78 |
+
|
| 79 |
# Create Document objects
|
| 80 |
chunked_docs = []
|
| 81 |
for i, chunk_text in enumerate(chunks):
|
|
|
|
| 92 |
}
|
| 93 |
)
|
| 94 |
chunked_docs.append(chunk_doc)
|
| 95 |
+
|
| 96 |
return chunked_docs
|
| 97 |
|
| 98 |
|
utils.py
CHANGED
|
@@ -139,19 +139,18 @@ def deduplicate_nodes(nodes):
|
|
| 139 |
unique_nodes = []
|
| 140 |
|
| 141 |
for node in nodes:
|
| 142 |
-
# Create unique identifier from metadata
|
| 143 |
doc_id = node.metadata.get('document_id', '')
|
| 144 |
section_id = node.metadata.get('section_id', '')
|
| 145 |
-
chunk_id = node.metadata.get('chunk_id', 0)
|
| 146 |
node_type = node.metadata.get('type', 'text')
|
| 147 |
|
| 148 |
-
if node_type == 'table':
|
| 149 |
table_num = node.metadata.get('table_number', '')
|
| 150 |
-
identifier = f"{doc_id}|table|{table_num}"
|
| 151 |
elif node_type == 'image':
|
| 152 |
img_num = node.metadata.get('image_number', '')
|
| 153 |
identifier = f"{doc_id}|image|{img_num}"
|
| 154 |
else:
|
|
|
|
| 155 |
identifier = f"{doc_id}|{section_id}|{chunk_id}"
|
| 156 |
|
| 157 |
if identifier not in seen:
|
|
|
|
| 139 |
unique_nodes = []
|
| 140 |
|
| 141 |
for node in nodes:
|
|
|
|
| 142 |
doc_id = node.metadata.get('document_id', '')
|
| 143 |
section_id = node.metadata.get('section_id', '')
|
|
|
|
| 144 |
node_type = node.metadata.get('type', 'text')
|
| 145 |
|
| 146 |
+
if node_type == 'table' or node_type == 'table_row':
|
| 147 |
table_num = node.metadata.get('table_number', '')
|
| 148 |
+
identifier = f"{doc_id}|table|{table_num}"
|
| 149 |
elif node_type == 'image':
|
| 150 |
img_num = node.metadata.get('image_number', '')
|
| 151 |
identifier = f"{doc_id}|image|{img_num}"
|
| 152 |
else:
|
| 153 |
+
chunk_id = node.metadata.get('chunk_id', 0)
|
| 154 |
identifier = f"{doc_id}|{section_id}|{chunk_id}"
|
| 155 |
|
| 156 |
if identifier not in seen:
|