MrSimple01 committed on
Commit
657a2ac
·
verified ·
1 Parent(s): 73ac4f0

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +35 -52
utils.py CHANGED
@@ -9,7 +9,6 @@ import time
9
  from index_retriever import rerank_nodes
10
  from my_logging import log_message
11
  from config import PROMPT_SIMPLE_POISK
12
- import re
13
 
14
  def get_llm_model(model_name):
15
  try:
@@ -173,14 +172,28 @@ def deduplicate_nodes(nodes):
173
 
174
  return unique_nodes
175
 
176
- def normalize_query(query):
177
- def repl(m):
178
- cyr_to_lat = {'С': 'C', 'с': 'C', 'Т': 'T', 'т': 'T', 'У': 'U', 'у': 'U'}
179
- letter = cyr_to_lat.get(m.group(1), m.group(1))
180
- return f"{letter}{m.group(2)}"
181
-
182
- return re.sub(r'\b([СсТтУуCTU])[-\s]?(\d+)\b', repl, query)
183
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
 
185
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
186
  if query_engine is None:
@@ -188,58 +201,28 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
188
 
189
  try:
190
  start_time = time.time()
191
-
192
- # NORMALIZE QUERY: Convert Cyrillic to Latin and remove hyphens
193
- normalized_question = normalize_query(question)
194
- log_message(f"Original query: {question}")
195
- log_message(f"Normalized query: {normalized_question}")
196
-
197
- # Use normalized query for retrieval
198
  retrieved_nodes = query_engine.retriever.retrieve(question)
199
  log_message(f"user query: {question}")
 
200
 
201
  log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
202
 
203
  unique_retrieved = deduplicate_nodes(retrieved_nodes)
 
 
 
 
 
 
 
 
204
  log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
205
 
206
- # Check for connection types
207
- conn_types_retrieved = {}
208
- for node in unique_retrieved:
209
- if node.metadata.get('type') == 'table':
210
- conn_type = node.metadata.get('connection_type', '')
211
- if conn_type:
212
- conn_types_retrieved[conn_type] = conn_types_retrieved.get(conn_type, 0) + 1
213
-
214
- if conn_types_retrieved:
215
- log_message("CONNECTION TYPES IN RETRIEVED:")
216
- for ct, cnt in sorted(conn_types_retrieved.items()):
217
- log_message(f" {ct}: {cnt} chunks")
218
-
219
- # Check if target type was retrieved
220
- # Normalize the check as well
221
- normalized_check = normalize_query('С-25') # Will become C25
222
- if normalized_check in question or 'С-25' in question or 'C-25' in question:
223
- if 'C25' in conn_types_retrieved:
224
- log_message(f"✓ C25 RETRIEVED: {conn_types_retrieved['C25']} chunks")
225
- else:
226
- log_message("✗ C25 NOT RETRIEVED despite being in query!")
227
-
228
- # Sample of retrieved tables
229
- log_message("SAMPLE OF RETRIEVED TABLES:")
230
- for i, node in enumerate(unique_retrieved[:10]):
231
- if node.metadata.get('type') == 'table':
232
- table_num = node.metadata.get('table_number', 'N/A')
233
- table_title = node.metadata.get('table_title', 'N/A')
234
- conn_type = node.metadata.get('connection_type', 'N/A')
235
- doc_id = node.metadata.get('document_id', 'N/A')
236
- log_message(f" [{i+1}] {doc_id} - Table {table_num} - Type: {conn_type}")
237
-
238
- # Rerank - use normalized query for consistency
239
- reranked_nodes = rerank_nodes(normalized_question, unique_retrieved, reranker, top_k=20)
240
 
241
- # CRITICAL FIX: Use normalized query for LLM as well
242
- response = query_engine.query(normalized_question)
243
 
244
  end_time = time.time()
245
  processing_time = end_time - start_time
 
9
  from index_retriever import rerank_nodes
10
  from my_logging import log_message
11
  from config import PROMPT_SIMPLE_POISK
 
12
 
13
  def get_llm_model(model_name):
14
  try:
 
172
 
173
  return unique_nodes
174
 
175
+ def debug_search_tables(vector_index, search_term="С-25"):
176
+ """Debug function to find all tables containing a specific term"""
177
+ all_nodes = list(vector_index.docstore.docs.values())
178
+
179
+ matching = []
180
+ for node in all_nodes:
181
+ if node.metadata.get('type') == 'table':
182
+ text = node.get_content()
183
+ if search_term in text or search_term in node.metadata.get('table_title', ''):
184
+ matching.append({
185
+ 'doc_id': node.metadata.get('document_id'),
186
+ 'table_num': node.metadata.get('table_number'),
187
+ 'title': node.metadata.get('table_title', '')[:100]
188
+ })
189
+
190
+ log_message(f"\n{'='*60}")
191
+ log_message(f"DEBUG: Found {len(matching)} tables containing '{search_term}'")
192
+ for m in matching:
193
+ log_message(f" • {m['doc_id']} - Table {m['table_num']}: {m['title']}")
194
+ log_message(f"{'='*60}\n")
195
+
196
+ return matching
197
 
198
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
199
  if query_engine is None:
 
201
 
202
  try:
203
  start_time = time.time()
 
 
 
 
 
 
 
204
  retrieved_nodes = query_engine.retriever.retrieve(question)
205
  log_message(f"user query: {question}")
206
+
207
 
208
  log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
209
 
210
  unique_retrieved = deduplicate_nodes(retrieved_nodes)
211
+
212
+ # DEBUG: Log what was retrieved
213
+ log_message(f"RETRIEVED: unique {len(unique_retrieved)} nodes")
214
+ for i, node in enumerate(unique_retrieved): # All debug
215
+ table_num = node.metadata.get('table_number', 'N/A')
216
+ table_title = node.metadata.get('table_title', 'N/A')
217
+ doc_id = node.metadata.get('document_id', 'N/A')
218
+ log_message(f" [{i+1}] {doc_id} - Table {table_num}: {table_title[:50]}")
219
  log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
220
 
221
+ # Simple reranking
222
+ reranked_nodes = rerank_nodes(question, unique_retrieved, reranker, top_k=20)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
 
224
+ # Direct query without formatting
225
+ response = query_engine.query(question)
226
 
227
  end_time = time.time()
228
  processing_time = end_time - start_time