MrSimple07 committed on
Commit
8c371f8
·
1 Parent(s): 15a7dee

max chunk size = 4000 + max rows = 5

Browse files
Files changed (3) hide show
  1. documents_prep.py +13 -11
  2. index_retriever.py +1 -1
  3. table_prep.py +1 -1
documents_prep.py CHANGED
@@ -7,7 +7,7 @@ from llama_index.core.text_splitter import SentenceSplitter
7
  from my_logging import log_message
8
 
9
  # Configuration
10
- CHUNK_SIZE = 1024
11
  CHUNK_OVERLAP = 128
12
 
13
  def chunk_text_documents(documents):
@@ -38,8 +38,8 @@ def chunk_text_documents(documents):
38
  return chunked
39
 
40
 
41
- def chunk_table_by_content(table_data, doc_id, max_chars=1200):
42
- """Chunk tables by content size instead of rows"""
43
  headers = table_data.get('headers', [])
44
  rows = table_data.get('data', [])
45
  table_num = table_data.get('table_number', 'unknown')
@@ -65,14 +65,14 @@ def chunk_table_by_content(table_data, doc_id, max_chars=1200):
65
 
66
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
67
 
68
- # Calculate base metadata size (everything except row data)
69
  base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
70
  base_size = len(base_content)
71
  available_space = max_chars - base_size - 200
72
 
73
  # If entire table fits, return as one chunk
74
- full_rows_content = format_table_rows(rows)
75
- if base_size + len(full_rows_content) <= max_chars:
76
  content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
77
 
78
  metadata = {
@@ -90,7 +90,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=1200):
90
  log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
91
  return [Document(text=content, metadata=metadata)]
92
 
93
- # Otherwise, chunk by content size
94
  chunks = []
95
  current_rows = []
96
  current_size = 0
@@ -100,8 +100,10 @@ def chunk_table_by_content(table_data, doc_id, max_chars=1200):
100
  row_text = format_single_row(row, i + 1)
101
  row_size = len(row_text)
102
 
103
- # If adding this row exceeds limit, save current chunk
104
- if current_size + row_size > available_space and current_rows:
 
 
105
  content = base_content + format_table_rows(current_rows)
106
  content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
107
  content += format_table_footer(table_identifier, doc_id)
@@ -128,13 +130,13 @@ def chunk_table_by_content(table_data, doc_id, max_chars=1200):
128
  current_rows = []
129
  current_size = 0
130
 
131
- # Add row index for tracking
132
  row_copy = row.copy() if isinstance(row, dict) else {'data': row}
133
  row_copy['_idx'] = i + 1
134
  current_rows.append(row_copy)
135
  current_size += row_size
136
 
137
- # Add final chunk if rows remain
138
  if current_rows:
139
  content = base_content + format_table_rows(current_rows)
140
  content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
 
7
  from my_logging import log_message
8
 
9
  # Configuration
10
+ CHUNK_SIZE = 1500
11
  CHUNK_OVERLAP = 128
12
 
13
  def chunk_text_documents(documents):
 
38
  return chunked
39
 
40
 
41
+ def chunk_table_by_content(table_data, doc_id, max_chars=2000, max_rows=5):
42
+ """Chunk tables by content size AND row count"""
43
  headers = table_data.get('headers', [])
44
  rows = table_data.get('data', [])
45
  table_num = table_data.get('table_number', 'unknown')
 
65
 
66
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
67
 
68
+ # Calculate base metadata size
69
  base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
70
  base_size = len(base_content)
71
  available_space = max_chars - base_size - 200
72
 
73
  # If entire table fits, return as one chunk
74
+ full_rows_content = format_table_rows([{**row, '_idx': i+1} for i, row in enumerate(rows)])
75
+ if base_size + len(full_rows_content) <= max_chars and len(rows) <= max_rows:
76
  content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
77
 
78
  metadata = {
 
90
  log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
91
  return [Document(text=content, metadata=metadata)]
92
 
93
+ # Otherwise, chunk by BOTH content size AND row count
94
  chunks = []
95
  current_rows = []
96
  current_size = 0
 
100
  row_text = format_single_row(row, i + 1)
101
  row_size = len(row_text)
102
 
103
+ # Check BOTH limits: size AND row count
104
+ should_split = (current_size + row_size > available_space or len(current_rows) >= max_rows) and current_rows
105
+
106
+ if should_split:
107
  content = base_content + format_table_rows(current_rows)
108
  content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
109
  content += format_table_footer(table_identifier, doc_id)
 
130
  current_rows = []
131
  current_size = 0
132
 
133
+ # Add row with index
134
  row_copy = row.copy() if isinstance(row, dict) else {'data': row}
135
  row_copy['_idx'] = i + 1
136
  current_rows.append(row_copy)
137
  current_size += row_size
138
 
139
+ # Add final chunk
140
  if current_rows:
141
  content = base_content + format_table_rows(current_rows)
142
  content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
index_retriever.py CHANGED
@@ -57,7 +57,7 @@ def create_query_engine(vector_index):
57
 
58
  hybrid_retriever = QueryFusionRetriever(
59
  [vector_retriever, bm25_retriever],
60
- similarity_top_k=50,
61
  num_queries=1
62
  )
63
 
 
57
 
58
  hybrid_retriever = QueryFusionRetriever(
59
  [vector_retriever, bm25_retriever],
60
+ similarity_top_k=70,
61
  num_queries=1
62
  )
63
 
table_prep.py CHANGED
@@ -4,7 +4,7 @@ from huggingface_hub import hf_hub_download, list_repo_files
4
  from llama_index.core import Document
5
  from my_logging import log_message
6
 
7
- MAX_ROWS_PER_CHUNK = 5
8
  MAX_CHUNK_SIZE = 4000
9
 
10
  def create_table_content(table_data):
 
4
  from llama_index.core import Document
5
  from my_logging import log_message
6
 
7
+ MAX_ROWS_PER_CHUNK = 10
8
  MAX_CHUNK_SIZE = 4000
9
 
10
  def create_table_content(table_data):