MrSimple07 commited on
Commit
f79b229
·
1 Parent(s): 26c4970

chunk size = 1024 + max chars = 1200 + deduplication variant

Browse files
Files changed (3) hide show
  1. documents_prep.py +27 -51
  2. index_retriever.py +25 -20
  3. utils.py +19 -15
documents_prep.py CHANGED
@@ -7,7 +7,7 @@ from llama_index.core.text_splitter import SentenceSplitter
7
  from my_logging import log_message
8
 
9
  # Configuration
10
- CHUNK_SIZE = 512
11
  CHUNK_OVERLAP = 128
12
 
13
  def chunk_text_documents(documents):
@@ -65,15 +65,28 @@ def chunk_table_by_content(table_data, doc_id, max_chars=1200):
65
 
66
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
67
 
68
- # Calculate base metadata size (everything except row data)
69
- base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  base_size = len(base_content)
71
- available_space = max_chars - base_size - 200
72
 
73
- # If entire table fits, return as one chunk
74
  full_rows_content = format_table_rows(rows)
75
  if base_size + len(full_rows_content) <= max_chars:
76
- content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
77
 
78
  metadata = {
79
  'type': 'table',
@@ -84,13 +97,15 @@ def chunk_table_by_content(table_data, doc_id, max_chars=1200):
84
  'section': section,
85
  'total_rows': len(rows),
86
  'chunk_size': len(content),
87
- 'is_complete_table': True
 
 
88
  }
89
 
90
  log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
91
  return [Document(text=content, metadata=metadata)]
92
 
93
- # Otherwise, chunk by content size
94
  chunks = []
95
  current_rows = []
96
  current_size = 0
@@ -100,11 +115,9 @@ def chunk_table_by_content(table_data, doc_id, max_chars=1200):
100
  row_text = format_single_row(row, i + 1)
101
  row_size = len(row_text)
102
 
103
- # If adding this row exceeds limit, save current chunk
104
  if current_size + row_size > available_space and current_rows:
105
  content = base_content + format_table_rows(current_rows)
106
- content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
107
- content += format_table_footer(table_identifier, doc_id)
108
 
109
  metadata = {
110
  'type': 'table',
@@ -122,23 +135,20 @@ def chunk_table_by_content(table_data, doc_id, max_chars=1200):
122
  }
123
 
124
  chunks.append(Document(text=content, metadata=metadata))
125
- log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, {len(current_rows)} rows")
126
 
127
  chunk_num += 1
128
  current_rows = []
129
  current_size = 0
130
 
131
- # Add row index for tracking
132
  row_copy = row.copy() if isinstance(row, dict) else {'data': row}
133
  row_copy['_idx'] = i + 1
134
  current_rows.append(row_copy)
135
  current_size += row_size
136
 
137
- # Add final chunk if rows remain
138
  if current_rows:
139
  content = base_content + format_table_rows(current_rows)
140
- content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
141
- content += format_table_footer(table_identifier, doc_id)
142
 
143
  metadata = {
144
  'type': 'table',
@@ -156,45 +166,11 @@ def chunk_table_by_content(table_data, doc_id, max_chars=1200):
156
  }
157
 
158
  chunks.append(Document(text=content, metadata=metadata))
159
- log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, {len(current_rows)} rows")
160
 
161
  return chunks
162
 
163
 
164
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
    """Build the metadata preamble that precedes a table's row data.

    Emits, in order: a labelled header block, a 70-char '=' rule, a
    sentence-form keyword section (to boost lexical retrieval of the chunk),
    the table-content banner, an optional column-header line, and a trailing
    'ДАННЫЕ:' marker. Optional fields (title, section, headers) are skipped
    when falsy.
    """
    parts = [
        f"ДОКУМЕНТ: {doc_id}\n",
        f"ТАБЛИЦА: {table_identifier}\n",
        f"ПОЛНОЕ НАЗВАНИЕ: {table_identifier}\n",
        f"НОМЕР ТАБЛИЦЫ: {table_num}\n",
    ]
    if table_title:
        parts.append(f"НАЗВАНИЕ: {table_title}\n")
    if section:
        parts.append(f"РАЗДЕЛ: {section}\n")
    parts.append(f"{'='*70}\n\n")

    # Sentence-style restatement of the identifiers for keyword search.
    parts.append(f"Это таблица {table_identifier} из документа {doc_id}. ")
    parts.append(f"Идентификатор: {table_identifier}. Номер: {table_num}. Документ: {doc_id}. ")

    if section:
        parts.append(f"Раздел: {section}. ")
        if 'приложени' in section.lower():
            parts.append(f"Таблица из приложения. ")

    if table_title:
        parts.append(f"Название: {table_title}. ")

    parts.append(f"\n\nСОДЕРЖИМОЕ ТАБЛИЦЫ {table_identifier}:\n{'='*70}\n\n")

    if headers:
        parts.append(f"ЗАГОЛОВКИ: {' | '.join(str(h) for h in headers)}\n\n")

    parts.append("ДАННЫЕ:\n")
    return ''.join(parts)
196
-
197
-
198
  def format_single_row(row, idx):
199
  """Format a single row"""
200
  if isinstance(row, dict):
 
7
  from my_logging import log_message
8
 
9
  # Configuration
10
+ CHUNK_SIZE = 1024
11
  CHUNK_OVERLAP = 128
12
 
13
  def chunk_text_documents(documents):
 
65
 
66
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
67
 
68
+ # SIMPLIFIED base content - remove redundant search keywords
69
+ base_content = f"ДОКУМЕНТ: {doc_id}\n"
70
+ base_content += f"ТАБЛИЦА: {table_identifier}\n"
71
+ if table_title:
72
+ base_content += f"НАЗВАНИЕ: {table_title}\n"
73
+ if section:
74
+ base_content += f"РАЗДЕЛ: {section}\n"
75
+ base_content += f"{'='*70}\n\n"
76
+
77
+ if headers:
78
+ header_str = ' | '.join(str(h) for h in headers)
79
+ base_content += f"ЗАГОЛОВКИ: {header_str}\n\n"
80
+
81
+ base_content += "ДАННЫЕ:\n"
82
+
83
  base_size = len(base_content)
84
+ available_space = max_chars - base_size - 100 # Reduced footer overhead
85
 
86
+ # Rest of the function stays the same...
87
  full_rows_content = format_table_rows(rows)
88
  if base_size + len(full_rows_content) <= max_chars:
89
+ content = base_content + full_rows_content
90
 
91
  metadata = {
92
  'type': 'table',
 
97
  'section': section,
98
  'total_rows': len(rows),
99
  'chunk_size': len(content),
100
+ 'is_complete_table': True,
101
+ 'row_start': 0,
102
+ 'row_end': len(rows)
103
  }
104
 
105
  log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
106
  return [Document(text=content, metadata=metadata)]
107
 
108
+ # Chunking logic with row indices...
109
  chunks = []
110
  current_rows = []
111
  current_size = 0
 
115
  row_text = format_single_row(row, i + 1)
116
  row_size = len(row_text)
117
 
 
118
  if current_size + row_size > available_space and current_rows:
119
  content = base_content + format_table_rows(current_rows)
120
+ content += f"\n[Строки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}]\n"
 
121
 
122
  metadata = {
123
  'type': 'table',
 
135
  }
136
 
137
  chunks.append(Document(text=content, metadata=metadata))
138
+ log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, rows {current_rows[0]['_idx']}-{current_rows[-1]['_idx']}")
139
 
140
  chunk_num += 1
141
  current_rows = []
142
  current_size = 0
143
 
 
144
  row_copy = row.copy() if isinstance(row, dict) else {'data': row}
145
  row_copy['_idx'] = i + 1
146
  current_rows.append(row_copy)
147
  current_size += row_size
148
 
 
149
  if current_rows:
150
  content = base_content + format_table_rows(current_rows)
151
+ content += f"\n[Строки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}]\n"
 
152
 
153
  metadata = {
154
  'type': 'table',
 
166
  }
167
 
168
  chunks.append(Document(text=content, metadata=metadata))
169
+ log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, rows {current_rows[0]['_idx']}-{current_rows[-1]['_idx']}")
170
 
171
  return chunks
172
 
173
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  def format_single_row(row, idx):
175
  """Format a single row"""
176
  if isinstance(row, dict):
index_retriever.py CHANGED
@@ -39,43 +39,48 @@ def keyword_filter_nodes(query, nodes, min_keyword_matches=1):
39
  return filtered
40
 
41
  def create_query_engine(vector_index):
42
- """Create hybrid retrieval engine with keyword boost"""
43
  log_message("Creating query engine...")
44
 
45
  vector_retriever = VectorIndexRetriever(
46
  index=vector_index,
47
- similarity_top_k=50
48
  )
49
  bm25_retriever = BM25Retriever.from_defaults(
50
  docstore=vector_index.docstore,
51
- similarity_top_k=50
52
  )
53
  hybrid_retriever = QueryFusionRetriever(
54
  [vector_retriever, bm25_retriever],
55
- similarity_top_k=60,
56
  num_queries=1
57
  )
58
- response_synthesizer = get_response_synthesizer()
59
 
60
- class KeywordBoostQueryEngine(RetrieverQueryEngine):
61
  def retrieve(self, query):
62
- # Hybrid results
63
- hybrid_nodes = hybrid_retriever.retrieve(query)
64
- # Keyword filter from all indexed nodes
65
- all_nodes = list(vector_index.docstore.values())
66
- keyword_nodes = keyword_filter_nodes(query, all_nodes)
67
- # Combine and deduplicate
68
- all_candidates = {id(n): n for n in hybrid_nodes + keyword_nodes}
69
- log_message(f"Hybrid: {len(hybrid_nodes)}, Keyword: {len(keyword_nodes)}, Total: {len(all_candidates)}")
70
- return list(all_candidates.values())[:60]
71
- def query(self, prompt):
72
- nodes = self.retrieve(prompt)
73
- return response_synthesizer.synthesize(prompt, nodes)
 
 
 
 
74
 
75
- query_engine = KeywordBoostQueryEngine(
 
 
76
  retriever=hybrid_retriever,
77
  response_synthesizer=response_synthesizer
78
  )
79
 
80
- log_message("✓ Query engine created (with keyword boost)")
81
  return query_engine
 
39
  return filtered
40
 
41
def create_query_engine(vector_index):
    """Create a hybrid (vector + BM25) retrieval engine with deduplication.

    Parameters
    ----------
    vector_index : LlamaIndex vector index to retrieve from; its docstore
        also backs the BM25 retriever.

    Returns
    -------
    A ``RetrieverQueryEngine`` subclass instance whose ``retrieve`` removes
    duplicate nodes (by full-text hash) and returns at most 50 of them.
    """
    log_message("Creating query engine...")

    vector_retriever = VectorIndexRetriever(
        index=vector_index,
        similarity_top_k=40  # Reduced from 50
    )
    bm25_retriever = BM25Retriever.from_defaults(
        docstore=vector_index.docstore,
        similarity_top_k=40  # Reduced from 50
    )
    hybrid_retriever = QueryFusionRetriever(
        [vector_retriever, bm25_retriever],
        similarity_top_k=50,  # Reduced from 60
        num_queries=1
    )

    class DeduplicatedQueryEngine(RetrieverQueryEngine):
        def retrieve(self, query):
            nodes = hybrid_retriever.retrieve(query)

            # Deduplicate by FULL-text hash. Hashing only a prefix (e.g. the
            # first 200 chars) would conflate distinct chunks of the same
            # table: every chunk of a table begins with an identical base
            # header (labelled lines plus a 70-char '=' rule) that exceeds
            # 200 characters, so all chunks but the first would be dropped.
            seen_hashes = set()
            unique_nodes = []

            for node in nodes:
                text_hash = hash(node.text)

                if text_hash not in seen_hashes:
                    seen_hashes.add(text_hash)
                    unique_nodes.append(node)

            log_message(f"Retrieved: {len(nodes)} → Unique: {len(unique_nodes)}")
            return unique_nodes[:50]  # Top 50 unique, fusion order preserved

    response_synthesizer = get_response_synthesizer()

    query_engine = DeduplicatedQueryEngine(
        retriever=hybrid_retriever,
        response_synthesizer=response_synthesizer
    )

    log_message("✓ Query engine created (with deduplication)")
    return query_engine
utils.py CHANGED
@@ -42,18 +42,21 @@ def answer_question(question, query_engine, reranker):
42
  log_message(f"\n{'='*70}")
43
  log_message(f"QUERY: {question}")
44
 
45
- # Retrieve and rerank nodes
46
- retrieved = query_engine.retriever.retrieve(question)
47
- log_message(f"\nRETRIEVED: {len(retrieved)} nodes")
48
- reranked = rerank_nodes(question, retrieved, reranker, top_k=25, min_score=0.3)
49
- log_message(f"\nRERANKED: {len(reranked)} nodes")
 
 
50
 
51
- # Build context for prompt
52
  context_parts = []
53
  for n in reranked:
54
  meta = n.metadata
55
  doc_id = meta.get('document_id', 'unknown')
56
  doc_type = meta.get('type', 'text')
 
57
  if doc_type == 'table':
58
  table_id = meta.get('table_identifier', meta.get('table_number', 'unknown'))
59
  title = meta.get('table_title', '')
@@ -62,31 +65,32 @@ def answer_question(question, query_engine, reranker):
62
  source_label += f" {title}"
63
  else:
64
  source_label = f"[{doc_id}]"
65
- context_parts.append(f"{source_label}\n{n.text}") # Use FULL text, not [:500]
 
66
 
67
  context = "\n\n" + ("="*50 + "\n\n").join(context_parts)
68
 
69
- # Use CUSTOM_PROMPT from config
70
  from config import CUSTOM_PROMPT
71
  prompt = CUSTOM_PROMPT.format(context_str=context, query_str=question)
72
- log_message(f"\nPROMPT LENGTH: {len(prompt)} chars\n")
73
 
74
- # CRITICAL FIX: Call LLM directly instead of query_engine.query()
75
  from llama_index.core import Settings
76
  response = Settings.llm.complete(prompt)
77
 
78
  sources = format_sources(reranked)
79
 
80
- # Log retrieved chunks
81
  log_message(f"\n{'='*70}")
82
  log_message("RETRIEVED CHUNKS:")
83
  for i, node in enumerate(reranked, 1):
84
  log_message(f"\n--- Chunk {i} ---")
85
- log_message(f"Document: {node.metadata.get('document_id', 'unknown')}")
86
- log_message(f"Type: {node.metadata.get('type', 'unknown')}")
87
  if node.metadata.get('type') == 'table':
88
- log_message(f"Table: {node.metadata.get('table_identifier', 'unknown')}")
89
- log_message(f"Text preview: {node.text[:500]}...")
 
 
90
 
91
  return response.text, sources
92
 
 
42
  log_message(f"\n{'='*70}")
43
  log_message(f"QUERY: {question}")
44
 
45
+ # Retrieve nodes (already deduplicated)
46
+ retrieved = query_engine.retrieve(question)
47
+ log_message(f"RETRIEVED: {len(retrieved)} unique nodes")
48
+
49
+ # Rerank
50
+ reranked = rerank_nodes(question, retrieved, reranker, top_k=15, min_score=0.25) # Reduced top_k
51
+ log_message(f"RERANKED: {len(reranked)} nodes")
52
 
53
+ # Build context - NO TRUNCATION
54
  context_parts = []
55
  for n in reranked:
56
  meta = n.metadata
57
  doc_id = meta.get('document_id', 'unknown')
58
  doc_type = meta.get('type', 'text')
59
+
60
  if doc_type == 'table':
61
  table_id = meta.get('table_identifier', meta.get('table_number', 'unknown'))
62
  title = meta.get('table_title', '')
 
65
  source_label += f" {title}"
66
  else:
67
  source_label = f"[{doc_id}]"
68
+
69
+ context_parts.append(f"{source_label}\n{n.text}") # Full text
70
 
71
  context = "\n\n" + ("="*50 + "\n\n").join(context_parts)
72
 
 
73
  from config import CUSTOM_PROMPT
74
  prompt = CUSTOM_PROMPT.format(context_str=context, query_str=question)
75
+ log_message(f"PROMPT LENGTH: {len(prompt)} chars")
76
 
 
77
  from llama_index.core import Settings
78
  response = Settings.llm.complete(prompt)
79
 
80
  sources = format_sources(reranked)
81
 
82
+ # Log retrieved chunks WITHOUT duplicates
83
  log_message(f"\n{'='*70}")
84
  log_message("RETRIEVED CHUNKS:")
85
  for i, node in enumerate(reranked, 1):
86
  log_message(f"\n--- Chunk {i} ---")
87
+ log_message(f"Document: {node.metadata.get('document_id')}")
88
+ log_message(f"Type: {node.metadata.get('type')}")
89
  if node.metadata.get('type') == 'table':
90
+ table_id = node.metadata.get('table_identifier')
91
+ rows = f"{node.metadata.get('row_start', 0)}-{node.metadata.get('row_end', 0)}"
92
+ log_message(f"Table: {table_id} (rows {rows})")
93
+ log_message(f"Text: {node.text[:300]}...")
94
 
95
  return response.text, sources
96