MrSimple01 committed
Commit 2229afc (verified) · Parent(s): acf0a69

Update utils.py

Files changed (1):
  1. utils.py +40 -75
utils.py CHANGED
@@ -10,28 +10,6 @@ from index_retriever import rerank_nodes
 from my_logging import log_message
 from config import PROMPT_SIMPLE_POISK
 
-STEEL_GRADE_CONTEXT = {
-    # Austenitic stainless steels
-    "08X18H10T": "08X18H10T аустенитная нержавеющая сталь стандарт ГОСТ технические условия марка материал сварка применение",
-    "12X18H10T": "12X18H10T аустенитная нержавеющая сталь стандарт ГОСТ технические условия марка материал сварка применение",
-    "08X18H10": "08X18H10 аустенитная нержавеющая сталь стандарт ГОСТ технические условия марка материал сварка применение",
-    "12X18H9": "12X18H9 аустенитная нержавеющая сталь стандарт ГОСТ технические условия марка материал сварка применение",
-    "10X17H13M2T": "10X17H13M2T аустенитная нержавеющая сталь стандарт ГОСТ технические условия марка материал сварка применение",
-
-    # Welding wires
-    "CB-08X19H10": "CB-08X19H10 сварочная проволока стандарт ГОСТ технические условия марка материал сварка применение",
-    "CB-08X18H10T": "CB-08X18H10T сварочная проволока стандарт ГОСТ технические условия марка материал сварка применение",
-
-}
-
-
-# Generic context for any steel grade pattern
-GENERIC_STEEL_CONTEXT = "стандарт ГОСТ технические условия марка материал применение сварка"
-
-
 def get_llm_model(model_name):
     try:
         model_config = AVAILABLE_MODELS.get(model_name)
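
Note on the removed lookup table: its keys spell the grades with Latin letters (X, H, T, CB), while Russian source documents usually write the Cyrillic lookalikes (Х, Н, Т, Св), so any hit against these keys depends on a character fold — presumably what `normalize_steel_designations` does upstream. A minimal sketch of such a fold (hypothetical helper, not code from this repo):

```python
# Hypothetical lookalike fold (illustration only): map the Cyrillic letters
# that occur in GOST steel designations onto the Latin glyphs the dict keys use.
CYR_TO_LAT = str.maketrans("ХНТСВКМАРЕО", "XHTCBKMAPEO")

def fold_grade(designation: str) -> str:
    return designation.upper().translate(CYR_TO_LAT)

assert fold_grade("12х18н10т") == "12X18H10T"
```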
@@ -219,57 +197,43 @@ def debug_search_tables(vector_index, search_term="С-25"):
 
 from documents_prep import normalize_text, normalize_steel_designations
 
-def enhance_query_for_steel_grades(query):
-    import re
-
-    # Detect steel grades in query
-    steel_pattern = r'\b\d{1,3}[XHТCВKMAPХНТСВКМАР]\d*[XHТCВKMAPХНТСВКМАР\d]*\b'
-    matches = re.findall(steel_pattern, query, re.IGNORECASE)
-
-    if not matches:
-        return query
-
-    # Collect context expansions
-    added_context = []
-    grades_found = []
-
-    for match in matches:
-        match_upper = match.upper()
-        grades_found.append(match_upper)
-
-        # Check if we have specific context for this grade
-        if match_upper in STEEL_GRADE_CONTEXT:
-            context = STEEL_GRADE_CONTEXT[match_upper]
-            added_context.append(context)
-            log_message(f"  Found specific context for {match_upper}")
-        else:
-            # Use generic context for unknown grades
-            added_context.append(GENERIC_STEEL_CONTEXT)
-            log_message(f"  Using generic context for {match_upper}")
-
-    # Build enhanced query
-    if added_context:
-        # Remove duplicates from context
-        unique_context = ' '.join(set(' '.join(added_context).split()))
-        enhanced = f"{query} {unique_context}"
-
-        log_message(f"Enhanced query for steel grades: {', '.join(grades_found)}")
-        log_message(f"Added context: {unique_context[:100]}...")
-
-        return enhanced
-
-    return query
+def expand_query_with_llm(query, llm_model):
+    """Generate 5 alternative query formulations using LLM"""
+    try:
+        from config import QUERY_EXPANSION_PROMPT
+
+        expansion_prompt = QUERY_EXPANSION_PROMPT.format(original_query=query)
+
+        log_message(f"Generating query variations for: {query}")
+        response = llm_model.complete(expansion_prompt)
+
+        # Parse response - split by newlines and filter empty
+        variations = [line.strip() for line in response.text.split('\n') if line.strip()]
+        variations = variations[:5]  # Take only first 5
+
+        if variations:
+            log_message(f"Generated {len(variations)} query variations:")
+            for i, var in enumerate(variations, 1):
+                log_message(f"  {i}. {var}")
+
+            # Combine original + variations
+            combined_query = query + " " + " ".join(variations)
+            return combined_query
+        else:
+            log_message("No variations generated, using original query")
+            return query
+
+    except Exception as e:
+        log_message(f"Error generating query variations: {e}")
+        return query
 
 
 def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
 
-    # FIXED: Apply all normalizations in correct order
+    # Apply normalizations
     normalized_question = normalize_text(question)
     normalized_question_2, query_changes, change_list = normalize_steel_designations(normalized_question)
 
-    # FIX: Actually call enhance_query_for_steel_grades!
-    enhanced_query = enhance_query_for_steel_grades(normalized_question_2)
-
     if change_list:
         log_message(f"Query changes: {', '.join(change_list)}")
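
A concrete weakness of the removed regex, and a likely motivation for replacing it (the commit message doesn't say): the character class mixes Latin and Cyrillic letters but omits Latin T and B, so a fully Latin-spelled grade ending in T can never satisfy the trailing `\b` and is silently missed:

```python
import re

STEEL_PATTERN = r'\b\d{1,3}[XHТCВKMAPХНТСВКМАР]\d*[XHТCВKMAPХНТСВКМАР\d]*\b'

# Latin 'T' is not in the class, so the final \b can never be satisfied:
print(re.findall(STEEL_PATTERN, "сварка стали 12X18H10T", re.IGNORECASE))  # []

# The Cyrillic spelling matches, because Cyrillic 'Т' is in the class:
print(re.findall(STEEL_PATTERN, "сварка стали 12Х18Н10Т", re.IGNORECASE))  # ['12Х18Н10Т']
```

The LLM-based replacement sidesteps the alphabet problem entirely, at the cost of one model call per query and a dependency on the model returning its variations line by line, which the parser assumes.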
@@ -279,22 +243,25 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
     try:
         start_time = time.time()
 
-        # FIX: Use enhanced_query instead of normalized_question_2
-        retrieved_nodes = query_engine.retriever.retrieve(enhanced_query)
+        # EXPAND QUERY USING LLM
+        from utils import get_llm_model
+        llm = get_llm_model(current_model)
+        expanded_query = expand_query_with_llm(normalized_question_2, llm)
+
+        # Use expanded query for retrieval
+        retrieved_nodes = query_engine.retriever.retrieve(expanded_query)
 
         log_message(f"user query: {question}")
         log_message(f"normalized query: {normalized_question}")
         log_message(f"after steel normalization: {normalized_question_2}")
-        log_message(f"enhanced query: {enhanced_query}")  # NEW LOG
+        log_message(f"expanded query length: {len(expanded_query)} chars")
         log_message(f"Steel grades normalized in query: {query_changes}")
-
 
         log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
 
         unique_retrieved = deduplicate_nodes(retrieved_nodes)
-
-        # IMPROVED DEBUG: Log what was actually retrieved with FULL metadata
         log_message(f"RETRIEVED: unique {len(unique_retrieved)} nodes")
+
         for i, node in enumerate(unique_retrieved):
             node_type = node.metadata.get('type', 'text')
             doc_id = node.metadata.get('document_id', 'N/A')
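
`QUERY_EXPANSION_PROMPT` lives in `config.py` and is not shown in this commit. Given that the parser splits the completion on newlines and keeps the first five non-empty lines, the template presumably looks something like this (hypothetical reconstruction, not the real prompt):

```python
# Hypothetical template - the actual QUERY_EXPANSION_PROMPT in config.py may differ.
QUERY_EXPANSION_PROMPT = (
    "Rewrite the following search query in 5 different ways, "
    "one formulation per line, without numbering or commentary.\n"
    "Query: {original_query}"
)
```

Two side notes: `from utils import get_llm_model` is a self-import (this code lives in `utils.py`, where `get_llm_model` is defined), so a direct call would do; and logging only the length of the expanded query keeps the logs readable, since the combined string can easily run to several hundred characters.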
@@ -303,7 +270,6 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
             table_num = node.metadata.get('table_number', 'N/A')
             table_id = node.metadata.get('table_identifier', 'N/A')
             table_title = node.metadata.get('table_title', 'N/A')
-            # Show first 200 chars of content to verify it's the right table
             content_preview = node.text[:200].replace('\n', ' ')
             log_message(f"  [{i+1}] {doc_id} - Table {table_num} | ID: {table_id}")
             log_message(f"    Title: {table_title[:80]}")
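
`deduplicate_nodes` is called above but is not part of this diff. A minimal sketch of what such a helper typically does, assuming LlamaIndex-style nodes exposing `node_id` and `text` (the real implementation may key on other fields):

```python
# Sketch only - the repo's deduplicate_nodes may differ.
def deduplicate_nodes(nodes):
    seen, unique = set(), []
    for node in nodes:
        key = getattr(node, "node_id", None) or hash(node.text)
        if key not in seen:
            seen.add(key)
            unique.append(node)
    return unique
```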
@@ -314,12 +280,11 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
 
         log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
 
-        # Simple reranking with NORMALIZED question and PARAMETERIZED top_k
-        reranked_nodes = rerank_nodes(enhanced_query, unique_retrieved, reranker,
-                                      top_k=rerank_top_k)  # NOW PARAMETERIZED
+        reranked_nodes = rerank_nodes(normalized_question_2, unique_retrieved, reranker,
+                                      top_k=rerank_top_k)
 
-        # Direct query without formatting - use normalized question
-        response = query_engine.query(enhanced_query)
+        # Use ORIGINAL normalized question for final answer generation
+        response = query_engine.query(normalized_question_2)
 
         end_time = time.time()
         processing_time = end_time - start_time
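
The split here looks deliberate: the noisy expanded query drives first-stage retrieval (recall), while reranking and answer synthesis fall back to the clean normalized question (precision). Note, however, that `reranked_nodes` is never used afterwards: `query_engine.query()` performs its own retrieval internally, so the explicit retrieve/rerank above appears to feed only the logs unless the engine was built with the reranker attached. One way to wire it in, assuming a LlamaIndex `VectorStoreIndex` (the `vector_index` name and reranker model below are assumptions):

```python
# Sketch: attach the reranker as a node postprocessor so it actually shapes
# the synthesized answer (standard LlamaIndex API).
from llama_index.core.postprocessor import SentenceTransformerRerank

reranker = SentenceTransformerRerank(model="BAAI/bge-reranker-v2-m3", top_n=20)
query_engine = vector_index.as_query_engine(
    similarity_top_k=50,              # retrieve wide...
    node_postprocessors=[reranker],   # ...then rerank before synthesis
)
response = query_engine.query(normalized_question_2)
```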
 