MrSimple07 committed on
Commit
5e35433
·
1 Parent(s): 73dd9ce

only semantic search, top_k = 30, cutoff = 0.7

Browse files
Files changed (2) hide show
  1. index_retriever.py +9 -11
  2. utils.py +6 -26
index_retriever.py CHANGED
@@ -12,28 +12,26 @@ def create_vector_index(documents):
12
 
13
  def create_query_engine(vector_index):
14
  try:
15
- # --- Semantic-only retriever ---
16
  vector_retriever = VectorIndexRetriever(
17
- index=vector_index,
18
- similarity_top_k=30, # recommended default
19
- similarity_cutoff=0.78 # filter weak matches
20
  )
21
-
22
  custom_prompt_template = PromptTemplate(PROMPT_SIMPLE_POISK)
23
-
24
  response_synthesizer = get_response_synthesizer(
25
  response_mode=ResponseMode.TREE_SUMMARIZE,
26
  text_qa_template=custom_prompt_template
27
  )
28
-
29
  query_engine = RetrieverQueryEngine(
30
  retriever=vector_retriever,
31
  response_synthesizer=response_synthesizer
32
  )
33
-
34
- log_message("Semantic-only query engine успешно создан")
35
  return query_engine
36
-
37
  except Exception as e:
38
  log_message(f"Ошибка создания query engine: {str(e)}")
39
- raise
 
12
 
13
  def create_query_engine(vector_index):
14
  try:
 
15
  vector_retriever = VectorIndexRetriever(
16
+ index=vector_index,
17
+ similarity_top_k=30,
18
+ similarity_cutoff=0.7
19
  )
20
+
21
  custom_prompt_template = PromptTemplate(PROMPT_SIMPLE_POISK)
 
22
  response_synthesizer = get_response_synthesizer(
23
  response_mode=ResponseMode.TREE_SUMMARIZE,
24
  text_qa_template=custom_prompt_template
25
  )
26
+
27
  query_engine = RetrieverQueryEngine(
28
  retriever=vector_retriever,
29
  response_synthesizer=response_synthesizer
30
  )
31
+
32
+ log_message("Query engine успешно создан (только векторный поиск)")
33
  return query_engine
34
+
35
  except Exception as e:
36
  log_message(f"Ошибка создания query engine: {str(e)}")
37
+ raise
utils.py CHANGED
@@ -6,7 +6,7 @@ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
6
  from sentence_transformers import CrossEncoder
7
  from config import AVAILABLE_MODELS, DEFAULT_MODEL, GOOGLE_API_KEY
8
  import time
9
- from index_retriever import rerank_nodes
10
  from my_logging import log_message
11
  from config import PROMPT_SIMPLE_POISK
12
 
@@ -260,31 +260,11 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
260
 
261
  llm = get_llm_model(current_model)
262
 
263
- query_variations = expand_query(question, llm)
264
 
265
- all_nodes = []
266
- seen_node_ids = set()
267
 
268
- for query_var in query_variations:
269
- retrieved = query_engine.retriever.retrieve(query_var)
270
- for node in retrieved:
271
- node_id = f"{node.node_id if hasattr(node, 'node_id') else hash(node.text)}"
272
- if node_id not in seen_node_ids:
273
- all_nodes.append(node)
274
- seen_node_ids.add(node_id)
275
-
276
- log_message(f"Получено {len(all_nodes)} уникальных узлов из {len(query_variations)} запросов")
277
-
278
- reranked_nodes = rerank_nodes(
279
- question,
280
- all_nodes,
281
- reranker,
282
- top_k=20,
283
- min_score_threshold=0.5,
284
- diversity_penalty=0.3
285
- )
286
-
287
- formatted_context = format_context_for_llm(reranked_nodes)
288
 
289
  enhanced_question = f"""Контекст из базы данных:
290
  {formatted_context}
@@ -301,7 +281,7 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
301
 
302
  log_message(f"Обработка завершена за {processing_time:.2f}с")
303
 
304
- sources_html = generate_sources_html(reranked_nodes, chunks_df)
305
 
306
  answer_with_time = f"""<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; margin-bottom: 10px;'>
307
  <h3 style='color: #63b3ed; margin-top: 0;'>Ответ (Модель: {current_model}):</h3>
@@ -312,7 +292,7 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
312
  </div>"""
313
 
314
  chunk_info = []
315
- for node in reranked_nodes:
316
  metadata = node.metadata if hasattr(node, 'metadata') else {}
317
  chunk_info.append({
318
  'document_id': metadata.get('document_id', 'unknown'),
 
6
  from sentence_transformers import CrossEncoder
7
  from config import AVAILABLE_MODELS, DEFAULT_MODEL, GOOGLE_API_KEY
8
  import time
9
+ # from index_retriever import rerank_nodes
10
  from my_logging import log_message
11
  from config import PROMPT_SIMPLE_POISK
12
 
 
260
 
261
  llm = get_llm_model(current_model)
262
 
263
+ retrieved_nodes = query_engine.retriever.retrieve(question)
264
 
265
+ log_message(f"Получено {len(retrieved_nodes)} узлов")
 
266
 
267
+ formatted_context = format_context_for_llm(retrieved_nodes)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
 
269
  enhanced_question = f"""Контекст из базы данных:
270
  {formatted_context}
 
281
 
282
  log_message(f"Обработка завершена за {processing_time:.2f}с")
283
 
284
+ sources_html = generate_sources_html(retrieved_nodes, chunks_df)
285
 
286
  answer_with_time = f"""<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; margin-bottom: 10px;'>
287
  <h3 style='color: #63b3ed; margin-top: 0;'>Ответ (Модель: {current_model}):</h3>
 
292
  </div>"""
293
 
294
  chunk_info = []
295
+ for node in retrieved_nodes :
296
  metadata = node.metadata if hasattr(node, 'metadata') else {}
297
  chunk_info.append({
298
  'document_id': metadata.get('document_id', 'unknown'),