Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 6, 2025

Commit

8c371f8

1 Parent(s): 15a7dee

max chunk size = 4000 + max rows = 5

Browse files

Files changed (3) hide show

documents_prep.py +13 -11
index_retriever.py +1 -1
table_prep.py +1 -1

documents_prep.py CHANGED Viewed

@@ -7,7 +7,7 @@ from llama_index.core.text_splitter import SentenceSplitter
 from my_logging import log_message
 # Configuration
-CHUNK_SIZE = 1024
 CHUNK_OVERLAP = 128
 def chunk_text_documents(documents):
@@ -38,8 +38,8 @@ def chunk_text_documents(documents):
     return chunked
-def chunk_table_by_content(table_data, doc_id, max_chars=1200):
-    """Chunk tables by content size instead of rows"""
     headers = table_data.get('headers', [])
     rows = table_data.get('data', [])
     table_num = table_data.get('table_number', 'unknown')
@@ -65,14 +65,14 @@ def chunk_table_by_content(table_data, doc_id, max_chars=1200):
     log_message(f"  📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
-    # Calculate base metadata size (everything except row data)
     base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
     base_size = len(base_content)
     available_space = max_chars - base_size - 200
     # If entire table fits, return as one chunk
-    full_rows_content = format_table_rows(rows)
-    if base_size + len(full_rows_content) <= max_chars:
         content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
         metadata = {
@@ -90,7 +90,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=1200):
         log_message(f"    Single chunk: {len(content)} chars, {len(rows)} rows")
         return [Document(text=content, metadata=metadata)]
-    # Otherwise, chunk by content size
     chunks = []
     current_rows = []
     current_size = 0
@@ -100,8 +100,10 @@ def chunk_table_by_content(table_data, doc_id, max_chars=1200):
         row_text = format_single_row(row, i + 1)
         row_size = len(row_text)
-        # If adding this row exceeds limit, save current chunk
-        if current_size + row_size > available_space and current_rows:
             content = base_content + format_table_rows(current_rows)
             content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
             content += format_table_footer(table_identifier, doc_id)
@@ -128,13 +130,13 @@ def chunk_table_by_content(table_data, doc_id, max_chars=1200):
             current_rows = []
             current_size = 0
-        # Add row index for tracking
         row_copy = row.copy() if isinstance(row, dict) else {'data': row}
         row_copy['_idx'] = i + 1
         current_rows.append(row_copy)
         current_size += row_size
-    # Add final chunk if rows remain
     if current_rows:
         content = base_content + format_table_rows(current_rows)
         content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"

 from my_logging import log_message
 # Configuration
+CHUNK_SIZE = 1500
 CHUNK_OVERLAP = 128
 def chunk_text_documents(documents):
     return chunked
+def chunk_table_by_content(table_data, doc_id, max_chars=2000, max_rows=5):
+    """Chunk tables by content size AND row count"""
     headers = table_data.get('headers', [])
     rows = table_data.get('data', [])
     table_num = table_data.get('table_number', 'unknown')
     log_message(f"  📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
+    # Calculate base metadata size
     base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
     base_size = len(base_content)
     available_space = max_chars - base_size - 200
     # If entire table fits, return as one chunk
+    full_rows_content = format_table_rows([{**row, '_idx': i+1} for i, row in enumerate(rows)])
+    if base_size + len(full_rows_content) <= max_chars and len(rows) <= max_rows:
         content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
         metadata = {
         log_message(f"    Single chunk: {len(content)} chars, {len(rows)} rows")
         return [Document(text=content, metadata=metadata)]
+    # Otherwise, chunk by BOTH content size AND row count
     chunks = []
     current_rows = []
     current_size = 0
         row_text = format_single_row(row, i + 1)
         row_size = len(row_text)
+        # Check BOTH limits: size AND row count
+        should_split = (current_size + row_size > available_space or len(current_rows) >= max_rows) and current_rows
+        if should_split:
             content = base_content + format_table_rows(current_rows)
             content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
             content += format_table_footer(table_identifier, doc_id)
             current_rows = []
             current_size = 0
+        # Add row with index
         row_copy = row.copy() if isinstance(row, dict) else {'data': row}
         row_copy['_idx'] = i + 1
         current_rows.append(row_copy)
         current_size += row_size
+    # Add final chunk
     if current_rows:
         content = base_content + format_table_rows(current_rows)
         content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"

index_retriever.py CHANGED Viewed

@@ -57,7 +57,7 @@ def create_query_engine(vector_index):
         hybrid_retriever = QueryFusionRetriever(
             [vector_retriever, bm25_retriever],
-            similarity_top_k=50,
             num_queries=1
         )

         hybrid_retriever = QueryFusionRetriever(
             [vector_retriever, bm25_retriever],
+            similarity_top_k=70,
             num_queries=1
         )

table_prep.py CHANGED Viewed

@@ -4,7 +4,7 @@ from huggingface_hub import hf_hub_download, list_repo_files
 from llama_index.core import Document
 from my_logging import log_message
-MAX_ROWS_PER_CHUNK = 5
 MAX_CHUNK_SIZE = 4000
 def create_table_content(table_data):

 from llama_index.core import Document
 from my_logging import log_message
+MAX_ROWS_PER_CHUNK = 10
 MAX_CHUNK_SIZE = 4000
 def create_table_content(table_data):