Spaces:
Sleeping
Sleeping
Commit ·
1f9fdb8
1
Parent(s): c63d524
added 35 semantic similarity
Browse files
- index_retriever.py +2 -6
- utils.py +3 -7
index_retriever.py
CHANGED
|
@@ -22,7 +22,7 @@ def create_query_engine(vector_index):
|
|
| 22 |
vector_retriever = VectorIndexRetriever(
|
| 23 |
index=vector_index,
|
| 24 |
similarity_top_k=30, # Increased
|
| 25 |
-
similarity_cutoff=0.
|
| 26 |
)
|
| 27 |
|
| 28 |
hybrid_retriever = QueryFusionRetriever(
|
|
@@ -50,9 +50,6 @@ def create_query_engine(vector_index):
|
|
| 50 |
raise
|
| 51 |
|
| 52 |
def rerank_nodes(query, nodes, reranker, top_k=20, min_score_threshold=0.5, diversity_penalty=0.3):
|
| 53 |
-
"""
|
| 54 |
-
Rerank nodes with diversity and adaptive scoring
|
| 55 |
-
"""
|
| 56 |
if not nodes or not reranker:
|
| 57 |
return nodes[:top_k]
|
| 58 |
|
|
@@ -74,11 +71,10 @@ def rerank_nodes(query, nodes, reranker, top_k=20, min_score_threshold=0.5, dive
|
|
| 74 |
log_message("Нет узлов после фильтрации, снижаю порог")
|
| 75 |
scored_nodes = list(zip(nodes, scores))
|
| 76 |
scored_nodes.sort(key=lambda x: x[1], reverse=True)
|
| 77 |
-
min_score_threshold = scored_nodes[0][1] * 0.6
|
| 78 |
scored_nodes = [(node, score) for node, score in scored_nodes
|
| 79 |
if score >= min_score_threshold]
|
| 80 |
|
| 81 |
-
# MMR-like diversity selection
|
| 82 |
selected_nodes = []
|
| 83 |
selected_docs = set()
|
| 84 |
selected_sections = set()
|
|
|
|
| 22 |
vector_retriever = VectorIndexRetriever(
|
| 23 |
index=vector_index,
|
| 24 |
similarity_top_k=30, # Increased
|
| 25 |
+
similarity_cutoff=0.5 # Slightly lower for recall
|
| 26 |
)
|
| 27 |
|
| 28 |
hybrid_retriever = QueryFusionRetriever(
|
|
|
|
| 50 |
raise
|
| 51 |
|
| 52 |
def rerank_nodes(query, nodes, reranker, top_k=20, min_score_threshold=0.5, diversity_penalty=0.3):
|
|
|
|
|
|
|
|
|
|
| 53 |
if not nodes or not reranker:
|
| 54 |
return nodes[:top_k]
|
| 55 |
|
|
|
|
| 71 |
log_message("Нет узлов после фильтрации, снижаю порог")
|
| 72 |
scored_nodes = list(zip(nodes, scores))
|
| 73 |
scored_nodes.sort(key=lambda x: x[1], reverse=True)
|
| 74 |
+
min_score_threshold = scored_nodes[0][1] * 0.6
|
| 75 |
scored_nodes = [(node, score) for node, score in scored_nodes
|
| 76 |
if score >= min_score_threshold]
|
| 77 |
|
|
|
|
| 78 |
selected_nodes = []
|
| 79 |
selected_docs = set()
|
| 80 |
selected_sections = set()
|
utils.py
CHANGED
|
@@ -258,13 +258,10 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
|
|
| 258 |
try:
|
| 259 |
start_time = time.time()
|
| 260 |
|
| 261 |
-
# Get LLM for query expansion
|
| 262 |
llm = get_llm_model(current_model)
|
| 263 |
|
| 264 |
-
# Expand query
|
| 265 |
query_variations = expand_query(question, llm)
|
| 266 |
|
| 267 |
-
# Retrieve with multiple queries and deduplicate
|
| 268 |
all_nodes = []
|
| 269 |
seen_node_ids = set()
|
| 270 |
|
|
@@ -278,13 +275,12 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
|
|
| 278 |
|
| 279 |
log_message(f"Получено {len(all_nodes)} уникальных узлов из {len(query_variations)} запросов")
|
| 280 |
|
| 281 |
-
# Rerank with stricter threshold and diversity
|
| 282 |
reranked_nodes = rerank_nodes(
|
| 283 |
-
question,
|
| 284 |
all_nodes,
|
| 285 |
reranker,
|
| 286 |
top_k=20,
|
| 287 |
-
min_score_threshold=0.5,
|
| 288 |
diversity_penalty=0.3
|
| 289 |
)
|
| 290 |
|
|
@@ -311,7 +307,7 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
|
|
| 311 |
<h3 style='color: #63b3ed; margin-top: 0;'>Ответ (Модель: {current_model}):</h3>
|
| 312 |
<div style='line-height: 1.6; font-size: 16px;'>{response.response}</div>
|
| 313 |
<div style='margin-top: 15px; padding-top: 10px; border-top: 1px solid #4a5568; font-size: 14px; color: #a0aec0;'>
|
| 314 |
-
Время обработки: {processing_time:.2f} секунд
|
| 315 |
</div>
|
| 316 |
</div>"""
|
| 317 |
|
|
|
|
| 258 |
try:
|
| 259 |
start_time = time.time()
|
| 260 |
|
|
|
|
| 261 |
llm = get_llm_model(current_model)
|
| 262 |
|
|
|
|
| 263 |
query_variations = expand_query(question, llm)
|
| 264 |
|
|
|
|
| 265 |
all_nodes = []
|
| 266 |
seen_node_ids = set()
|
| 267 |
|
|
|
|
| 275 |
|
| 276 |
log_message(f"Получено {len(all_nodes)} уникальных узлов из {len(query_variations)} запросов")
|
| 277 |
|
|
|
|
| 278 |
reranked_nodes = rerank_nodes(
|
| 279 |
+
question,
|
| 280 |
all_nodes,
|
| 281 |
reranker,
|
| 282 |
top_k=20,
|
| 283 |
+
min_score_threshold=0.5,
|
| 284 |
diversity_penalty=0.3
|
| 285 |
)
|
| 286 |
|
|
|
|
| 307 |
<h3 style='color: #63b3ed; margin-top: 0;'>Ответ (Модель: {current_model}):</h3>
|
| 308 |
<div style='line-height: 1.6; font-size: 16px;'>{response.response}</div>
|
| 309 |
<div style='margin-top: 15px; padding-top: 10px; border-top: 1px solid #4a5568; font-size: 14px; color: #a0aec0;'>
|
| 310 |
+
Время обработки: {processing_time:.2f} секунд
|
| 311 |
</div>
|
| 312 |
</div>"""
|
| 313 |
|