MrSimple07 committed on
Commit
73dd9ce
·
1 Parent(s): e875be5

Use semantic search only: top k = 30, similarity cutoff = 0.78

Browse files
Files changed (1) hide show
  1. index_retriever.py +13 -89
index_retriever.py CHANGED
@@ -3,10 +3,8 @@ from llama_index.core.query_engine import RetrieverQueryEngine
3
  from llama_index.core.retrievers import VectorIndexRetriever
4
  from llama_index.core.response_synthesizers import get_response_synthesizer, ResponseMode
5
  from llama_index.core.prompts import PromptTemplate
6
- from llama_index.retrievers.bm25 import BM25Retriever
7
- from llama_index.core.retrievers import QueryFusionRetriever
8
  from my_logging import log_message
9
- from config import CUSTOM_PROMPT, PROMPT_SIMPLE_POISK
10
 
11
  def create_vector_index(documents):
12
  log_message("Строю векторный индекс")
@@ -14,102 +12,28 @@ def create_vector_index(documents):
14
 
15
  def create_query_engine(vector_index):
16
  try:
17
- bm25_retriever = BM25Retriever.from_defaults(
18
- docstore=vector_index.docstore,
19
- similarity_top_k=25
20
- )
21
-
22
  vector_retriever = VectorIndexRetriever(
23
- index=vector_index,
24
- similarity_top_k=30,
25
- similarity_cutoff=0.7
26
- )
27
-
28
- hybrid_retriever = QueryFusionRetriever(
29
- [vector_retriever, bm25_retriever],
30
- similarity_top_k=30,
31
- num_queries=1
32
  )
33
-
34
  custom_prompt_template = PromptTemplate(PROMPT_SIMPLE_POISK)
 
35
  response_synthesizer = get_response_synthesizer(
36
- response_mode=ResponseMode.TREE_SUMMARIZE,
37
  text_qa_template=custom_prompt_template
38
  )
39
-
40
  query_engine = RetrieverQueryEngine(
41
- retriever=hybrid_retriever,
42
  response_synthesizer=response_synthesizer
43
  )
44
-
45
- log_message("Query engine успешно создан")
46
  return query_engine
47
-
48
  except Exception as e:
49
  log_message(f"Ошибка создания query engine: {str(e)}")
50
  raise
51
-
52
- def rerank_nodes(query, nodes, reranker, top_k=20, min_score_threshold=0.5, diversity_penalty=0.3):
53
- if not nodes or not reranker:
54
- return nodes[:top_k]
55
-
56
- try:
57
- log_message(f"Переранжирую {len(nodes)} узлов")
58
-
59
- pairs = [[query, node.text] for node in nodes]
60
- scores = reranker.predict(pairs)
61
- scored_nodes = list(zip(nodes, scores))
62
-
63
- scored_nodes.sort(key=lambda x: x[1], reverse=True)
64
-
65
- if min_score_threshold is not None:
66
- scored_nodes = [(node, score) for node, score in scored_nodes
67
- if score >= min_score_threshold]
68
- log_message(f"После фильтрации по порогу {min_score_threshold}: {len(scored_nodes)} узлов")
69
-
70
- if not scored_nodes:
71
- log_message("Нет узлов после фильтрации, снижаю порог")
72
- scored_nodes = list(zip(nodes, scores))
73
- scored_nodes.sort(key=lambda x: x[1], reverse=True)
74
- min_score_threshold = scored_nodes[0][1] * 0.6
75
- scored_nodes = [(node, score) for node, score in scored_nodes
76
- if score >= min_score_threshold]
77
-
78
- selected_nodes = []
79
- selected_docs = set()
80
- selected_sections = set()
81
-
82
- for node, score in scored_nodes:
83
- if len(selected_nodes) >= top_k:
84
- break
85
-
86
- metadata = node.metadata if hasattr(node, 'metadata') else {}
87
- doc_id = metadata.get('document_id', 'unknown')
88
- section_key = f"{doc_id}_{metadata.get('section_path', metadata.get('section_id', ''))}"
89
-
90
- # Apply diversity penalty
91
- penalty = 0
92
- if doc_id in selected_docs:
93
- penalty += diversity_penalty * 0.5
94
- if section_key in selected_sections:
95
- penalty += diversity_penalty
96
-
97
- adjusted_score = score * (1 - penalty)
98
-
99
- # Add if still competitive
100
- if not selected_nodes or adjusted_score >= selected_nodes[0][1] * 0.6:
101
- selected_nodes.append((node, score))
102
- selected_docs.add(doc_id)
103
- selected_sections.add(section_key)
104
-
105
- log_message(f"Выбрано {len(selected_nodes)} узлов с разнообразием")
106
- log_message(f"Уникальных документов: {len(selected_docs)}, секций: {len(selected_sections)}")
107
-
108
- if selected_nodes:
109
- log_message(f"Score range: {selected_nodes[0][1]:.3f} to {selected_nodes[-1][1]:.3f}")
110
-
111
- return [node for node, score in selected_nodes]
112
-
113
- except Exception as e:
114
- log_message(f"Ошибка переранжировки: {str(e)}")
115
- return nodes[:top_k]
 
3
  from llama_index.core.retrievers import VectorIndexRetriever
4
  from llama_index.core.response_synthesizers import get_response_synthesizer, ResponseMode
5
  from llama_index.core.prompts import PromptTemplate
 
 
6
  from my_logging import log_message
7
+ from config import PROMPT_SIMPLE_POISK
8
 
9
  def create_vector_index(documents):
10
  log_message("Строю векторный индекс")
 
12
 
13
  def create_query_engine(vector_index):
14
  try:
15
+ # --- Semantic-only retriever ---
 
 
 
 
16
  vector_retriever = VectorIndexRetriever(
17
+ index=vector_index,
18
+ similarity_top_k=30, # recommended default
19
+ similarity_cutoff=0.78 # filter weak matches
 
 
 
 
 
 
20
  )
21
+
22
  custom_prompt_template = PromptTemplate(PROMPT_SIMPLE_POISK)
23
+
24
  response_synthesizer = get_response_synthesizer(
25
+ response_mode=ResponseMode.TREE_SUMMARIZE,
26
  text_qa_template=custom_prompt_template
27
  )
28
+
29
  query_engine = RetrieverQueryEngine(
30
+ retriever=vector_retriever,
31
  response_synthesizer=response_synthesizer
32
  )
33
+
34
+ log_message("Semantic-only query engine успешно создан")
35
  return query_engine
36
+
37
  except Exception as e:
38
  log_message(f"Ошибка создания query engine: {str(e)}")
39
  raise