MrSimple01 committed on
Commit
549f9a0
·
verified ·
1 Parent(s): 3374997

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +62 -7
utils.py CHANGED
@@ -197,23 +197,78 @@ def debug_search_tables(vector_index, search_term="С-25"):
197
 
198
  from documents_prep import normalize_text, normalize_steel_designations
199
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
201
 
 
202
  normalized_question = normalize_text(question)
203
- log_message(f"Normalized question: {normalized_question}")
204
- normalized_question_2, query_changes, change_list = normalize_steel_designations(question) # FIX: 3 values
205
- log_message(f"After steel normalization: {normalized_question_2}")
 
 
206
  if change_list:
207
- log_message(f"Query changes: {', '.join(change_list)}")
 
208
  if query_engine is None:
209
  return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
210
 
211
  try:
212
  start_time = time.time()
213
- retrieved_nodes = query_engine.retriever.retrieve(normalized_question_2)
 
 
 
214
  log_message(f"user query: {question}")
215
  log_message(f"normalized query: {normalized_question}")
216
  log_message(f"after steel normalization: {normalized_question_2}")
 
217
  log_message(f"Steel grades normalized in query: {query_changes}")
218
 
219
 
@@ -243,11 +298,11 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
243
  log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
244
 
245
  # Simple reranking with NORMALIZED question and PARAMETERIZED top_k
246
- reranked_nodes = rerank_nodes(normalized_question, unique_retrieved, reranker,
247
  top_k=rerank_top_k) # NOW PARAMETERIZED
248
 
249
  # Direct query without formatting - use normalized question
250
- response = query_engine.query(normalized_question)
251
 
252
  end_time = time.time()
253
  processing_time = end_time - start_time
 
197
 
198
  from documents_prep import normalize_text, normalize_steel_designations
199
 
200
def enhance_query_for_steel_grades(query):
    """Append steel-related context terms to a query that mentions a steel grade.

    A steel grade is detected with a regular expression: 1-3 digits followed
    by a letter from a fixed character class, optionally followed by more
    digits/letters from that class (e.g. ``12Х18Н10Т``). Matching is
    case-insensitive.
    NOTE(review): the character class appears to mix Latin and Cyrillic
    look-alike letters on purpose - confirm before simplifying it.

    Args:
        query: The (already normalized) user query. ``None`` or an empty
            string is returned unchanged.

    Returns:
        The query with the context terms appended when a steel grade is
        detected; otherwise the query exactly as it was passed in. A query
        that already ends with the context terms is returned unchanged, so
        calling this function twice does not duplicate them.
    """
    import re

    context_suffix = " стандарт материал марка стали применение"
    steel_pattern = r'\b\d{1,3}[XHТCВKMAPХНТСВКМАР]\d*[XHТCВKMAPХНТСВКМАР\d]*\b'

    # Nothing to analyse for None / empty input; hand it back untouched
    # instead of letting the regex call raise TypeError on None.
    if not query:
        return query

    # Keep the function idempotent: do not append the terms a second time.
    if query.endswith(context_suffix):
        return query

    # Only the presence of a grade matters, so search() is enough - there is
    # no need to collect every match with findall().
    if re.search(steel_pattern, query, re.IGNORECASE) is None:
        return query

    enhanced = query + context_suffix
    log_message(f"Enhanced query with steel context: {enhanced}")
    return enhanced
215
+
216
def generate_sub_questions(question, llm_model, max_sub_questions=5):
    """Ask the LLM for related sub-questions that broaden query coverage.

    The model is prompted (in Russian) to produce 3-5 short follow-up
    questions, one per line. The reply is split into lines, cleaned and
    truncated.

    Args:
        question: The user's main question. ``None`` or a blank string
            short-circuits to an empty list without calling the model.
        llm_model: Object exposing ``complete(prompt)`` whose result has a
            ``text`` attribute containing the model's reply.
        max_sub_questions: Upper bound on the number of sub-questions
            returned (default 5, the previous hard-coded limit).

    Returns:
        A list of unique, non-empty sub-question strings in the order the
        model produced them, at most ``max_sub_questions`` long. An empty
        list is returned when the question is blank or when anything goes
        wrong while calling the model or parsing its reply (the error is
        logged, not raised).
    """
    import re

    # A blank question cannot yield meaningful sub-questions; skip the LLM call.
    if question is None or not str(question).strip():
        return []

    expansion_prompt = f"""Ты эксперт по нормативной документации.
Пользователь задал вопрос: "{question}"

Сгенерируй 3-5 дополнительных вопросов, которые помогут найти полный ответ на основной вопрос.
Вопросы должны быть:
- Максимально близкими и релевантными к основному вопросу
- Покрывать разные аспекты темы (стандарты, материалы, методы, требования)
- Короткими и конкретными

Формат ответа - просто список вопросов, по одному на строку, без нумерации:"""

    # Leading bullet ("-", "*", "•") or numbering ("1.", "2)") followed by
    # whitespace. Models often add these even when asked not to.
    list_marker = re.compile(r'^\s*(?:[-*•]+|\d+[.)])\s+')

    try:
        response = llm_model.complete(expansion_prompt)

        sub_questions = []
        seen = set()
        for raw_line in response.text.strip().split('\n'):
            candidate = list_marker.sub('', raw_line).strip()
            # Skip blank lines and exact repeats while preserving order.
            if candidate and candidate not in seen:
                seen.add(candidate)
                sub_questions.append(candidate)

        # Cap the list in case the model returns more than requested.
        sub_questions = sub_questions[:max_sub_questions]

        log_message(f"Generated {len(sub_questions)} sub-questions:")
        for sq in sub_questions:
            log_message(f" - {sq}")

        return sub_questions
    except Exception as e:
        # Best-effort by design: query expansion is optional, so any failure
        # here is logged and the caller proceeds without sub-questions.
        log_message(f"Error generating sub-questions: {e}")
        return []
245
+
246
+
247
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
248
 
249
+ # FIXED: Apply all normalizations in correct order
250
  normalized_question = normalize_text(question)
251
+ normalized_question_2, query_changes, change_list = normalize_steel_designations(normalized_question)
252
+
253
+ # FIX: Actually call enhance_query_for_steel_grades!
254
+ enhanced_query = enhance_query_for_steel_grades(normalized_question_2)
255
+
256
  if change_list:
257
+ log_message(f"Query changes: {', '.join(change_list)}")
258
+
259
  if query_engine is None:
260
  return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
261
 
262
  try:
263
  start_time = time.time()
264
+
265
+ # FIX: Use enhanced_query instead of normalized_question_2
266
+ retrieved_nodes = query_engine.retriever.retrieve(enhanced_query)
267
+
268
  log_message(f"user query: {question}")
269
  log_message(f"normalized query: {normalized_question}")
270
  log_message(f"after steel normalization: {normalized_question_2}")
271
+ log_message(f"enhanced query: {enhanced_query}") # NEW LOG
272
  log_message(f"Steel grades normalized in query: {query_changes}")
273
 
274
 
 
298
  log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
299
 
300
  # Simple reranking with ENHANCED query and PARAMETERIZED top_k
301
+ reranked_nodes = rerank_nodes(enhanced_query, unique_retrieved, reranker,
302
  top_k=rerank_top_k) # NOW PARAMETERIZED
303
 
304
  # Direct query without formatting - use enhanced query
305
+ response = query_engine.query(enhanced_query)
306
 
307
  end_time = time.time()
308
  processing_time = end_time - start_time