MrSimple01 committed on
Commit
34459df
·
verified ·
1 Parent(s): 9448dfa

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +85 -44
utils.py CHANGED
@@ -195,73 +195,112 @@ def debug_search_tables(vector_index, search_term="С-25"):
195
 
196
  return matching
197
 
 
 
 
198
  from documents_prep import normalize_text, normalize_steel_designations
199
 
200
def expand_query_with_llm(query, llm_model):
    """Return ``query`` followed by up to five LLM-written reformulations.

    The prompt is built from ``QUERY_EXPANSION_PROMPT`` (imported from
    ``config``) and sent through ``llm_model.complete``.  Every non-blank line
    of the response text counts as one variation; only the first five are kept.

    Args:
        query: The query text to expand.
        llm_model: Object exposing ``complete(prompt)`` whose result has a
            ``text`` attribute.

    Returns:
        The original query joined with the variations by single spaces, or the
        original query unchanged when the model yields no variations or any
        exception is raised along the way (the error is logged, not re-raised).
    """
    try:
        from config import QUERY_EXPANSION_PROMPT

        prompt = QUERY_EXPANSION_PROMPT.format(original_query=query)

        log_message(f"Generating query variations for: {query}")
        completion = llm_model.complete(prompt)

        # One candidate per response line; blank lines are dropped, then the
        # list is capped at five entries.
        stripped_lines = (raw.strip() for raw in completion.text.split('\n'))
        variations = [text for text in stripped_lines if text][:5]

        if not variations:
            log_message("No variations generated, using original query")
            return query

        log_message(f"Generated {len(variations)} query variations:")
        for number, variation in enumerate(variations, start=1):
            log_message(f" {number}. {variation}")

        # Original query first, variations after it
        joined_variations = " ".join(variations)
        return query + " " + joined_variations

    except Exception as e:
        log_message(f"Error generating query variations: {e}")
        return query
229
 
230
 
231
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
232
 
233
- # Apply normalizations
234
  normalized_question = normalize_text(question)
235
- normalized_question_2, query_changes, change_list = normalize_steel_designations(normalized_question)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
  if change_list:
238
  log_message(f"Query changes: {', '.join(change_list)}")
239
-
 
240
  if query_engine is None:
241
  return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
242
 
243
  try:
244
  start_time = time.time()
245
-
246
- # EXPAND QUERY USING LLM
247
- from utils import get_llm_model
248
- llm = get_llm_model(current_model)
249
- expanded_query = expand_query_with_llm(normalized_question_2, llm)
250
-
251
- # Use expanded query for retrieval
252
- retrieved_nodes = query_engine.retriever.retrieve(expanded_query)
253
-
254
  log_message(f"user query: {question}")
255
  log_message(f"normalized query: {normalized_question}")
256
  log_message(f"after steel normalization: {normalized_question_2}")
257
- log_message(f"expanded query length: {len(expanded_query)} chars")
258
  log_message(f"Steel grades normalized in query: {query_changes}")
259
 
260
  log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
261
 
262
  unique_retrieved = deduplicate_nodes(retrieved_nodes)
 
 
263
  log_message(f"RETRIEVED: unique {len(unique_retrieved)} nodes")
264
-
265
  for i, node in enumerate(unique_retrieved):
266
  node_type = node.metadata.get('type', 'text')
267
  doc_id = node.metadata.get('document_id', 'N/A')
@@ -270,6 +309,7 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
270
  table_num = node.metadata.get('table_number', 'N/A')
271
  table_id = node.metadata.get('table_identifier', 'N/A')
272
  table_title = node.metadata.get('table_title', 'N/A')
 
273
  content_preview = node.text[:200].replace('\n', ' ')
274
  log_message(f" [{i+1}] {doc_id} - Table {table_num} | ID: {table_id}")
275
  log_message(f" Title: {table_title[:80]}")
@@ -280,11 +320,12 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
280
 
281
  log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
282
 
283
- reranked_nodes = rerank_nodes(normalized_question_2, unique_retrieved, reranker,
284
- top_k=rerank_top_k)
 
285
 
286
- # Use ORIGINAL normalized question for final answer generation
287
- response = query_engine.query(normalized_question_2)
288
 
289
  end_time = time.time()
290
  processing_time = end_time - start_time
 
195
 
196
  return matching
197
 
198
# Fallback context words added to a query for steel grades that have no
# dedicated entry of their own.
GENERIC_STEEL_CONTEXT = (
    "стандарт ГОСТ технические условия "
    "марка материал применение сварка"
)
199
+
200
+ from config import QUERY_EXPANSION_PROMPT
201
  from documents_prep import normalize_text, normalize_steel_designations
202
 
203
# Product forms associated with individual steel grades.  Keys are looked up
# with the upper-cased designation found in a query; the values are the words
# appended to that query.
STEEL_PRODUCT_EXPANSIONS = {
    "08X18H10T": [
        "Листы",
        "Трубы",
        "Поковки",
        "Крепежные изделия",
        "Сортовой прокат",
        "Отливки",
    ],
    "12X18H10T": [
        "Листы",
        "Поковки",
        "Сортовой прокат",
    ],
    "10X17H13M2T": [
        "Трубы",
        "Арматура",
        "Поковки",
        "Фланцы",
    ],
    "20X23H18": [
        "Листы",
        "Сортовой прокат",
        "Поковки",
    ],
    "03X17H14M3": [
        "Трубы",
        "Листы",
        "Проволока",
    ],
}
210
+
211
+
212
def enhance_query_for_steel_grades(query):
    """Append context keywords for every steel designation found in ``query``.

    Two regular expressions are applied case-insensitively: one for plain
    grades (e.g. ``08X18H10T``) and one for welding-wire designations
    (e.g. ``СВ-08X19H10``).  Each match is upper-cased and looked up in
    ``STEEL_PRODUCT_EXPANSIONS``; grades without an entry fall back to
    ``GENERIC_STEEL_CONTEXT``.

    Args:
        query: Query text to scan for steel designations.

    Returns:
        ``query`` unchanged when no designation is found; otherwise ``query``
        followed by one space and the context words, de-duplicated and kept in
        first-seen order so the same input always yields the same output.
    """
    import re

    # NOTE(review): the previous comment said these mirror the patterns used by
    # normalize_steel_designations; that function is not visible here - confirm.
    # Regular steel grades: 08X18H10T, 12X18H10T, etc.
    steel_pattern = r'\b\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
    # Welding wires: СВ-08X19H10, CB-08X19H10
    wire_pattern = r'\b[СC][ВB]-\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'

    found = re.findall(steel_pattern, query, re.IGNORECASE)
    found += re.findall(wire_pattern, query, re.IGNORECASE)
    if not found:
        return query

    # dict.fromkeys drops repeats while preserving first-seen order, so a grade
    # mentioned several times is processed and logged only once.
    grades_found = list(dict.fromkeys(match.upper() for match in found))

    context_words = []
    for grade in grades_found:
        products = STEEL_PRODUCT_EXPANSIONS.get(grade)
        if products is not None:
            context = ' '.join(products)
            log_message(f" Found specific context for {grade}: {context}")
        else:
            # Unknown grade: fall back to the generic context
            context = GENERIC_STEEL_CONTEXT
            log_message(f" Using generic context for {grade}")
        context_words.extend(context.split())

    # A set() here would make word order depend on string hash randomization,
    # so the retrieval query would differ from one process to the next.
    unique_context = ' '.join(dict.fromkeys(context_words))
    enhanced = f"{query} {unique_context}"

    log_message(f"Enhanced query for steel grades: {', '.join(grades_found)}")
    log_message(f"Added context: {unique_context[:100]}...")

    return enhanced
 
 
 
 
 
 
 
 
 
 
 
 
260
 
261
 
262
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
263
 
 
264
  normalized_question = normalize_text(question)
265
+ normalized_question_2, query_changes, change_list = normalize_steel_designations(question)
266
+
267
+ # Step 1: Keyword-based enhancement (existing)
268
+ enhanced_question = enhance_query_for_steel_grades(normalized_question_2)
269
+
270
+ # Step 2: LLM-based query expansion (NEW)
271
+ try:
272
+ llm = get_llm_model(current_model)
273
+ expansion_prompt = QUERY_EXPANSION_PROMPT.format(original_query=enhanced_question)
274
+ expanded_queries = llm.complete(expansion_prompt).text.strip()
275
+
276
+ # Combine original + expanded queries
277
+ enhanced_question = f"{enhanced_question} {expanded_queries}"
278
+ log_message(f"LLM expanded query: {expanded_queries[:200]}...")
279
+ except Exception as e:
280
+ log_message(f"Query expansion failed: {e}, using keyword-only enhancement")
281
 
282
  if change_list:
283
  log_message(f"Query changes: {', '.join(change_list)}")
284
+ if change_list:
285
+ log_message(f"Query changes: {', '.join(change_list)}")
286
  if query_engine is None:
287
  return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
288
 
289
  try:
290
  start_time = time.time()
291
+ retrieved_nodes = query_engine.retriever.retrieve(enhanced_question)
 
 
 
 
 
 
 
 
292
  log_message(f"user query: {question}")
293
  log_message(f"normalized query: {normalized_question}")
294
  log_message(f"after steel normalization: {normalized_question_2}")
295
+ log_message(f"enhanced query: {enhanced_question}")
296
  log_message(f"Steel grades normalized in query: {query_changes}")
297
 
298
  log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
299
 
300
  unique_retrieved = deduplicate_nodes(retrieved_nodes)
301
+
302
+ # IMPROVED DEBUG: Log what was actually retrieved with FULL metadata
303
  log_message(f"RETRIEVED: unique {len(unique_retrieved)} nodes")
 
304
  for i, node in enumerate(unique_retrieved):
305
  node_type = node.metadata.get('type', 'text')
306
  doc_id = node.metadata.get('document_id', 'N/A')
 
309
  table_num = node.metadata.get('table_number', 'N/A')
310
  table_id = node.metadata.get('table_identifier', 'N/A')
311
  table_title = node.metadata.get('table_title', 'N/A')
312
+ # Show first 200 chars of content to verify it's the right table
313
  content_preview = node.text[:200].replace('\n', ' ')
314
  log_message(f" [{i+1}] {doc_id} - Table {table_num} | ID: {table_id}")
315
  log_message(f" Title: {table_title[:80]}")
 
320
 
321
  log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
322
 
323
+ # Simple reranking with NORMALIZED question and PARAMETERIZED top_k
324
+ reranked_nodes = rerank_nodes(enhanced_question, unique_retrieved, reranker,
325
+ top_k=rerank_top_k) # NOW PARAMETERIZED
326
 
327
+ # Direct query without formatting - use normalized question
328
+ response = query_engine.query(enhanced_question)
329
 
330
  end_time = time.time()
331
  processing_time = end_time - start_time