Spaces:
Sleeping
Sleeping
Commit ·
8c371f8
1
Parent(s): 15a7dee
max chunk size= 4000 + max row = 5
Browse files
- documents_prep.py +13 -11
- index_retriever.py +1 -1
- table_prep.py +1 -1
documents_prep.py
CHANGED
|
@@ -7,7 +7,7 @@ from llama_index.core.text_splitter import SentenceSplitter
|
|
| 7 |
from my_logging import log_message
|
| 8 |
|
| 9 |
# Configuration
|
| 10 |
-
CHUNK_SIZE =
|
| 11 |
CHUNK_OVERLAP = 128
|
| 12 |
|
| 13 |
def chunk_text_documents(documents):
|
|
@@ -38,8 +38,8 @@ def chunk_text_documents(documents):
|
|
| 38 |
return chunked
|
| 39 |
|
| 40 |
|
| 41 |
-
def chunk_table_by_content(table_data, doc_id, max_chars=1200):
|
| 42 |
-
"""Chunk tables by content size
|
| 43 |
headers = table_data.get('headers', [])
|
| 44 |
rows = table_data.get('data', [])
|
| 45 |
table_num = table_data.get('table_number', 'unknown')
|
|
@@ -65,14 +65,14 @@ def chunk_table_by_content(table_data, doc_id, max_chars=1200):
|
|
| 65 |
|
| 66 |
log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
|
| 67 |
|
| 68 |
-
# Calculate base metadata size
|
| 69 |
base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
|
| 70 |
base_size = len(base_content)
|
| 71 |
available_space = max_chars - base_size - 200
|
| 72 |
|
| 73 |
# If entire table fits, return as one chunk
|
| 74 |
-
full_rows_content = format_table_rows(rows)
|
| 75 |
-
if base_size + len(full_rows_content) <= max_chars:
|
| 76 |
content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
|
| 77 |
|
| 78 |
metadata = {
|
|
@@ -90,7 +90,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=1200):
|
|
| 90 |
log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
|
| 91 |
return [Document(text=content, metadata=metadata)]
|
| 92 |
|
| 93 |
-
# Otherwise, chunk by content size
|
| 94 |
chunks = []
|
| 95 |
current_rows = []
|
| 96 |
current_size = 0
|
|
@@ -100,8 +100,10 @@ def chunk_table_by_content(table_data, doc_id, max_chars=1200):
|
|
| 100 |
row_text = format_single_row(row, i + 1)
|
| 101 |
row_size = len(row_text)
|
| 102 |
|
| 103 |
-
#
|
| 104 |
-
|
|
|
|
|
|
|
| 105 |
content = base_content + format_table_rows(current_rows)
|
| 106 |
content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
|
| 107 |
content += format_table_footer(table_identifier, doc_id)
|
|
@@ -128,13 +130,13 @@ def chunk_table_by_content(table_data, doc_id, max_chars=1200):
|
|
| 128 |
current_rows = []
|
| 129 |
current_size = 0
|
| 130 |
|
| 131 |
-
# Add row
|
| 132 |
row_copy = row.copy() if isinstance(row, dict) else {'data': row}
|
| 133 |
row_copy['_idx'] = i + 1
|
| 134 |
current_rows.append(row_copy)
|
| 135 |
current_size += row_size
|
| 136 |
|
| 137 |
-
# Add final chunk
|
| 138 |
if current_rows:
|
| 139 |
content = base_content + format_table_rows(current_rows)
|
| 140 |
content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
|
|
|
|
| 7 |
from my_logging import log_message
|
| 8 |
|
| 9 |
# Configuration
|
| 10 |
+
CHUNK_SIZE = 1500
|
| 11 |
CHUNK_OVERLAP = 128
|
| 12 |
|
| 13 |
def chunk_text_documents(documents):
|
|
|
|
| 38 |
return chunked
|
| 39 |
|
| 40 |
|
| 41 |
+
def chunk_table_by_content(table_data, doc_id, max_chars=2000, max_rows=5):
|
| 42 |
+
"""Chunk tables by content size AND row count"""
|
| 43 |
headers = table_data.get('headers', [])
|
| 44 |
rows = table_data.get('data', [])
|
| 45 |
table_num = table_data.get('table_number', 'unknown')
|
|
|
|
| 65 |
|
| 66 |
log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
|
| 67 |
|
| 68 |
+
# Calculate base metadata size
|
| 69 |
base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
|
| 70 |
base_size = len(base_content)
|
| 71 |
available_space = max_chars - base_size - 200
|
| 72 |
|
| 73 |
# If entire table fits, return as one chunk
|
| 74 |
+
full_rows_content = format_table_rows([{**row, '_idx': i+1} for i, row in enumerate(rows)])
|
| 75 |
+
if base_size + len(full_rows_content) <= max_chars and len(rows) <= max_rows:
|
| 76 |
content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
|
| 77 |
|
| 78 |
metadata = {
|
|
|
|
| 90 |
log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
|
| 91 |
return [Document(text=content, metadata=metadata)]
|
| 92 |
|
| 93 |
+
# Otherwise, chunk by BOTH content size AND row count
|
| 94 |
chunks = []
|
| 95 |
current_rows = []
|
| 96 |
current_size = 0
|
|
|
|
| 100 |
row_text = format_single_row(row, i + 1)
|
| 101 |
row_size = len(row_text)
|
| 102 |
|
| 103 |
+
# Check BOTH limits: size AND row count
|
| 104 |
+
should_split = (current_size + row_size > available_space or len(current_rows) >= max_rows) and current_rows
|
| 105 |
+
|
| 106 |
+
if should_split:
|
| 107 |
content = base_content + format_table_rows(current_rows)
|
| 108 |
content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
|
| 109 |
content += format_table_footer(table_identifier, doc_id)
|
|
|
|
| 130 |
current_rows = []
|
| 131 |
current_size = 0
|
| 132 |
|
| 133 |
+
# Add row with index
|
| 134 |
row_copy = row.copy() if isinstance(row, dict) else {'data': row}
|
| 135 |
row_copy['_idx'] = i + 1
|
| 136 |
current_rows.append(row_copy)
|
| 137 |
current_size += row_size
|
| 138 |
|
| 139 |
+
# Add final chunk
|
| 140 |
if current_rows:
|
| 141 |
content = base_content + format_table_rows(current_rows)
|
| 142 |
content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
|
index_retriever.py
CHANGED
|
@@ -57,7 +57,7 @@ def create_query_engine(vector_index):
|
|
| 57 |
|
| 58 |
hybrid_retriever = QueryFusionRetriever(
|
| 59 |
[vector_retriever, bm25_retriever],
|
| 60 |
-
similarity_top_k=
|
| 61 |
num_queries=1
|
| 62 |
)
|
| 63 |
|
|
|
|
| 57 |
|
| 58 |
hybrid_retriever = QueryFusionRetriever(
|
| 59 |
[vector_retriever, bm25_retriever],
|
| 60 |
+
similarity_top_k=70,
|
| 61 |
num_queries=1
|
| 62 |
)
|
| 63 |
|
table_prep.py
CHANGED
|
@@ -4,7 +4,7 @@ from huggingface_hub import hf_hub_download, list_repo_files
|
|
| 4 |
from llama_index.core import Document
|
| 5 |
from my_logging import log_message
|
| 6 |
|
| 7 |
-
MAX_ROWS_PER_CHUNK =
|
| 8 |
MAX_CHUNK_SIZE = 4000
|
| 9 |
|
| 10 |
def create_table_content(table_data):
|
|
|
|
| 4 |
from llama_index.core import Document
|
| 5 |
from my_logging import log_message
|
| 6 |
|
| 7 |
+
MAX_ROWS_PER_CHUNK = 10
|
| 8 |
MAX_CHUNK_SIZE = 4000
|
| 9 |
|
| 10 |
def create_table_content(table_data):
|