MrSimple07 committed on
Commit
1368f74
·
1 Parent(s): 5e35433

Use hybrid retrieval: BM25 combined with semantic (vector) search

Browse files
Files changed (2) hide show
  1. index_retriever.py +19 -6
  2. utils.py +9 -5
index_retriever.py CHANGED
@@ -3,19 +3,32 @@ from llama_index.core.query_engine import RetrieverQueryEngine
3
  from llama_index.core.retrievers import VectorIndexRetriever
4
  from llama_index.core.response_synthesizers import get_response_synthesizer, ResponseMode
5
  from llama_index.core.prompts import PromptTemplate
 
 
6
  from my_logging import log_message
7
- from config import PROMPT_SIMPLE_POISK
8
 
9
  def create_vector_index(documents):
10
  log_message("Строю векторный индекс")
11
  return VectorStoreIndex.from_documents(documents)
12
-
13
  def create_query_engine(vector_index):
14
  try:
 
 
 
 
 
15
  vector_retriever = VectorIndexRetriever(
16
  index=vector_index,
17
- similarity_top_k=30,
18
- similarity_cutoff=0.7
 
 
 
 
 
 
 
19
  )
20
 
21
  custom_prompt_template = PromptTemplate(PROMPT_SIMPLE_POISK)
@@ -25,11 +38,11 @@ def create_query_engine(vector_index):
25
  )
26
 
27
  query_engine = RetrieverQueryEngine(
28
- retriever=vector_retriever,
29
  response_synthesizer=response_synthesizer
30
  )
31
 
32
- log_message("Query engine успешно создан (только векторный поиск)")
33
  return query_engine
34
 
35
  except Exception as e:
 
3
  from llama_index.core.retrievers import VectorIndexRetriever
4
  from llama_index.core.response_synthesizers import get_response_synthesizer, ResponseMode
5
  from llama_index.core.prompts import PromptTemplate
6
+ from llama_index.retrievers.bm25 import BM25Retriever
7
+ from llama_index.core.retrievers import QueryFusionRetriever
8
  from my_logging import log_message
9
+ from config import CUSTOM_PROMPT, PROMPT_SIMPLE_POISK
10
 
11
  def create_vector_index(documents):
12
  log_message("Строю векторный индекс")
13
  return VectorStoreIndex.from_documents(documents)
 
14
  def create_query_engine(vector_index):
15
  try:
16
+ bm25_retriever = BM25Retriever.from_defaults(
17
+ docstore=vector_index.docstore,
18
+ similarity_top_k=15 # Lower since we're combining with semantic
19
+ )
20
+
21
  vector_retriever = VectorIndexRetriever(
22
  index=vector_index,
23
+ similarity_top_k=15, # Lower since we're combining with BM25
24
+ similarity_cutoff=0.6 # Slightly lower threshold
25
+ )
26
+
27
+ # Hybrid retriever combines both approaches
28
+ hybrid_retriever = QueryFusionRetriever(
29
+ [vector_retriever, bm25_retriever],
30
+ similarity_top_k=30, # Final top_k after fusion
31
+ num_queries=1
32
  )
33
 
34
  custom_prompt_template = PromptTemplate(PROMPT_SIMPLE_POISK)
 
38
  )
39
 
40
  query_engine = RetrieverQueryEngine(
41
+ retriever=hybrid_retriever,
42
  response_synthesizer=response_synthesizer
43
  )
44
 
45
+ log_message("Query engine создан (BM25 + Semantic, без reranking)")
46
  return query_engine
47
 
48
  except Exception as e:
utils.py CHANGED
@@ -260,11 +260,15 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
260
 
261
  llm = get_llm_model(current_model)
262
 
 
263
  retrieved_nodes = query_engine.retriever.retrieve(question)
264
 
265
- log_message(f"Получено {len(retrieved_nodes)} узлов")
266
 
267
- formatted_context = format_context_for_llm(retrieved_nodes)
 
 
 
268
 
269
  enhanced_question = f"""Контекст из базы данных:
270
  {formatted_context}
@@ -281,18 +285,18 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
281
 
282
  log_message(f"Обработка завершена за {processing_time:.2f}с")
283
 
284
- sources_html = generate_sources_html(retrieved_nodes, chunks_df)
285
 
286
  answer_with_time = f"""<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; margin-bottom: 10px;'>
287
  <h3 style='color: #63b3ed; margin-top: 0;'>Ответ (Модель: {current_model}):</h3>
288
  <div style='line-height: 1.6; font-size: 16px;'>{response.response}</div>
289
  <div style='margin-top: 15px; padding-top: 10px; border-top: 1px solid #4a5568; font-size: 14px; color: #a0aec0;'>
290
- Время обработки: {processing_time:.2f} секунд
291
  </div>
292
  </div>"""
293
 
294
  chunk_info = []
295
- for node in retrieved_nodes :
296
  metadata = node.metadata if hasattr(node, 'metadata') else {}
297
  chunk_info.append({
298
  'document_id': metadata.get('document_id', 'unknown'),
 
260
 
261
  llm = get_llm_model(current_model)
262
 
263
+ # Simple retrieval without query expansion
264
  retrieved_nodes = query_engine.retriever.retrieve(question)
265
 
266
+ log_message(f"Получено {len(retrieved_nodes)} узлов (BM25 + Semantic)")
267
 
268
+ # Use nodes directly without reranking
269
+ final_nodes = retrieved_nodes[:30] # Ensure we use top 30
270
+
271
+ formatted_context = format_context_for_llm(final_nodes)
272
 
273
  enhanced_question = f"""Контекст из базы данных:
274
  {formatted_context}
 
285
 
286
  log_message(f"Обработка завершена за {processing_time:.2f}с")
287
 
288
+ sources_html = generate_sources_html(final_nodes, chunks_df)
289
 
290
  answer_with_time = f"""<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; margin-bottom: 10px;'>
291
  <h3 style='color: #63b3ed; margin-top: 0;'>Ответ (Модель: {current_model}):</h3>
292
  <div style='line-height: 1.6; font-size: 16px;'>{response.response}</div>
293
  <div style='margin-top: 15px; padding-top: 10px; border-top: 1px solid #4a5568; font-size: 14px; color: #a0aec0;'>
294
+ Время обработки: {processing_time:.2f} секунд | Метод: BM25 + Semantic (без reranking)
295
  </div>
296
  </div>"""
297
 
298
  chunk_info = []
299
+ for node in final_nodes:
300
  metadata = node.metadata if hasattr(node, 'metadata') else {}
301
  chunk_info.append({
302
  'document_id': metadata.get('document_id', 'unknown'),