MrSimple07 commited on
Commit
6db5f4f
·
1 Parent(s): 379f6e4

Added new LLM-based query expansion

Browse files
Files changed (3) hide show
  1. config.py +12 -0
  2. documents_prep.py +0 -6
  3. utils.py +49 -28
config.py CHANGED
@@ -54,6 +54,18 @@ CHUNK_OVERLAP = 128
54
  MAX_CHARS_TABLE = 2000
55
  MAX_ROWS_TABLE = 30
56
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  CUSTOM_PROMPT = """
58
  Вы являетесь высокоспециализированным Ассистентом для анализа нормативных документов (AIEXP). Ваша цель - предоставлять точные, корректные и контекстно релевантные ответы исключительно на основе предоставленного контекста из нормативной документации.
59
  СТРОГО ОТВЕТИТЬ ТОЛЬКО НА РУССКОМ!
 
54
  MAX_CHARS_TABLE = 2000
55
  MAX_ROWS_TABLE = 30
56
 
57
+
58
+ QUERY_EXPANSION_PROMPT = """Ты помощник для расширения поисковых запросов.
59
+
60
+ Пользователь задал вопрос: "{original_query}"
61
+
62
+ Сгенерируй 5 альтернативных формулировок этого же вопроса, которые помогут найти релевантную информацию в технической документации. Формулировки должны быть:
63
+ - Близкими по смыслу к оригинальному вопросу
64
+ - Использовать технические термины и синонимы
65
+ - Быть краткими (5-10 слов)
66
+
67
+ Верни ТОЛЬКО 5 вопросов, каждый с новой строки, без нумерации и объяснений."""
68
+
69
  CUSTOM_PROMPT = """
70
  Вы являетесь высокоспециализированным Ассистентом для анализа нормативных документов (AIEXP). Ваша цель - предоставлять точные, корректные и контекстно релевантные ответы исключительно на основе предоставленного контекста из нормативной документации.
71
  СТРОГО ОТВЕТИТЬ ТОЛЬКО НА РУССКОМ!
documents_prep.py CHANGED
@@ -25,12 +25,6 @@ def normalize_text(text):
25
  import re
26
 
27
  def normalize_steel_designations(text):
28
- """
29
- Normalize steel designations by converting Cyrillic letters to Latin.
30
- This improves search/retrieval since embedding models work better with Latin.
31
- Handles patterns like 08Х18Н10Т → 08X18H10T
32
- Returns: (normalized_text, changes_count, changes_list)
33
- """
34
  if not text:
35
  return text, 0, []
36
 
 
25
  import re
26
 
27
  def normalize_steel_designations(text):
 
 
 
 
 
 
28
  if not text:
29
  return text, 0, []
30
 
utils.py CHANGED
@@ -197,48 +197,71 @@ def debug_search_tables(vector_index, search_term="С-25"):
197
 
198
  from documents_prep import normalize_text, normalize_steel_designations
199
 
200
- def enhance_query_for_steel_grades(query):
201
- """Expand query with related terms for better steel grade retrieval"""
202
- import re
203
-
204
- # Detect if query contains steel grades
205
- steel_pattern = r'\b\d{1,3}[XHТCВKMAPХНТСВКМАР]\d*[XHТCВKMAPХНТСВКМАР\d]*\b'
206
- matches = re.findall(steel_pattern, query, re.IGNORECASE)
207
-
208
- if matches:
209
- # Add contextual terms
210
- enhanced = query + " стандарт материал марка стали применение"
211
- log_message(f"Enhanced query with steel context: {enhanced}")
212
- return enhanced
213
-
214
- return query
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
 
216
 
217
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
218
 
 
219
  normalized_question = normalize_text(question)
220
- normalized_question_2, query_changes, change_list = normalize_steel_designations(question) # FIX: 3 values
221
- normalized_question_2 = enhance_query_for_steel_grades(normalized_question_2)
222
  if change_list:
223
- log_message(f"Query changes: {', '.join(change_list)}")
 
224
  if query_engine is None:
225
  return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
226
 
227
  try:
228
  start_time = time.time()
229
- retrieved_nodes = query_engine.retriever.retrieve(normalized_question_2)
 
 
 
 
 
 
 
 
230
  log_message(f"user query: {question}")
231
  log_message(f"normalized query: {normalized_question}")
232
  log_message(f"after steel normalization: {normalized_question_2}")
 
233
  log_message(f"Steel grades normalized in query: {query_changes}")
234
-
235
 
236
  log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
237
 
238
  unique_retrieved = deduplicate_nodes(retrieved_nodes)
239
-
240
- # IMPROVED DEBUG: Log what was actually retrieved with FULL metadata
241
  log_message(f"RETRIEVED: unique {len(unique_retrieved)} nodes")
 
242
  for i, node in enumerate(unique_retrieved):
243
  node_type = node.metadata.get('type', 'text')
244
  doc_id = node.metadata.get('document_id', 'N/A')
@@ -247,7 +270,6 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
247
  table_num = node.metadata.get('table_number', 'N/A')
248
  table_id = node.metadata.get('table_identifier', 'N/A')
249
  table_title = node.metadata.get('table_title', 'N/A')
250
- # Show first 200 chars of content to verify it's the right table
251
  content_preview = node.text[:200].replace('\n', ' ')
252
  log_message(f" [{i+1}] {doc_id} - Table {table_num} | ID: {table_id}")
253
  log_message(f" Title: {table_title[:80]}")
@@ -258,12 +280,11 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
258
 
259
  log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
260
 
261
- # Simple reranking with NORMALIZED question and PARAMETERIZED top_k
262
- reranked_nodes = rerank_nodes(normalized_question, unique_retrieved, reranker,
263
- top_k=rerank_top_k) # NOW PARAMETERIZED
264
 
265
- # Direct query without formatting - use normalized question
266
- response = query_engine.query(normalized_question)
267
 
268
  end_time = time.time()
269
  processing_time = end_time - start_time
 
197
 
198
  from documents_prep import normalize_text, normalize_steel_designations
199
 
200
def expand_query_with_llm(query, llm_model):
    """Expand a search query with up to 5 LLM-generated paraphrases.

    Formats QUERY_EXPANSION_PROMPT with *query*, asks the LLM for
    alternative formulations, and returns the original query concatenated
    with the variations (space-separated) for broader retrieval.  On any
    failure, or when the model returns nothing usable, the original query
    is returned unchanged so retrieval always proceeds.

    Args:
        query: The (already normalized) user question.
        llm_model: Object exposing ``complete(prompt)`` whose result has a
            ``.text`` attribute (LlamaIndex-style LLM interface) — assumed,
            not visible here; confirm against get_llm_model.

    Returns:
        str: ``query`` plus the generated variations, or ``query`` alone.
    """
    try:
        import re
        from config import QUERY_EXPANSION_PROMPT

        expansion_prompt = QUERY_EXPANSION_PROMPT.format(original_query=query)

        log_message(f"Generating query variations for: {query}")
        response = llm_model.complete(expansion_prompt)

        # Parse response: one variation per line.  Strip leading list
        # markers ("1.", "2)", "-", "*", "•") defensively — the prompt
        # forbids numbering, but LLMs frequently add it anyway, and a
        # stray "1." would pollute the combined retrieval query.
        variations = []
        for line in response.text.split('\n'):
            cleaned = re.sub(r'^\s*(?:\d+[.)]|[-*•])\s*', '', line).strip()
            if cleaned:
                variations.append(cleaned)
        variations = variations[:5]  # Take only first 5

        if not variations:
            log_message("No variations generated, using original query")
            return query

        log_message(f"Generated {len(variations)} query variations:")
        for i, var in enumerate(variations, 1):
            log_message(f"  {i}. {var}")

        # Combine original + variations into a single retrieval string
        return query + " " + " ".join(variations)

    except Exception as e:
        log_message(f"Error generating query variations: {e}")
        return query
229
 
230
 
231
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
232
 
233
+ # Apply normalizations
234
  normalized_question = normalize_text(question)
235
+ normalized_question_2, query_changes, change_list = normalize_steel_designations(normalized_question)
236
+
237
  if change_list:
238
+ log_message(f"Query changes: {', '.join(change_list)}")
239
+
240
  if query_engine is None:
241
  return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
242
 
243
  try:
244
  start_time = time.time()
245
+
246
+ # EXPAND QUERY USING LLM
247
+ from utils import get_llm_model
248
+ llm = get_llm_model(current_model)
249
+ expanded_query = expand_query_with_llm(normalized_question_2, llm)
250
+
251
+ # Use expanded query for retrieval
252
+ retrieved_nodes = query_engine.retriever.retrieve(expanded_query)
253
+
254
  log_message(f"user query: {question}")
255
  log_message(f"normalized query: {normalized_question}")
256
  log_message(f"after steel normalization: {normalized_question_2}")
257
+ log_message(f"expanded query length: {len(expanded_query)} chars")
258
  log_message(f"Steel grades normalized in query: {query_changes}")
 
259
 
260
  log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
261
 
262
  unique_retrieved = deduplicate_nodes(retrieved_nodes)
 
 
263
  log_message(f"RETRIEVED: unique {len(unique_retrieved)} nodes")
264
+
265
  for i, node in enumerate(unique_retrieved):
266
  node_type = node.metadata.get('type', 'text')
267
  doc_id = node.metadata.get('document_id', 'N/A')
 
270
  table_num = node.metadata.get('table_number', 'N/A')
271
  table_id = node.metadata.get('table_identifier', 'N/A')
272
  table_title = node.metadata.get('table_title', 'N/A')
 
273
  content_preview = node.text[:200].replace('\n', ' ')
274
  log_message(f" [{i+1}] {doc_id} - Table {table_num} | ID: {table_id}")
275
  log_message(f" Title: {table_title[:80]}")
 
280
 
281
  log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
282
 
283
+ reranked_nodes = rerank_nodes(normalized_question_2, unique_retrieved, reranker,
284
+ top_k=rerank_top_k)
 
285
 
286
+ # Use ORIGINAL normalized question for final answer generation
287
+ response = query_engine.query(normalized_question_2)
288
 
289
  end_time = time.time()
290
  processing_time = end_time - start_time