MrSimple07 commited on
Commit
39758a5
·
1 Parent(s): 3dea9cc

UI: make the top-k retrieval parameters configurable

Browse files
Files changed (3) hide show
  1. app.py +190 -22
  2. index_retriever.py +10 -6
  3. utils.py +6 -5
app.py CHANGED
@@ -233,16 +233,52 @@ def switch_model(model_name, vector_index):
233
  log_message(error_msg)
234
  return None, f"❌ {error_msg}"
235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
  def main_answer_question(question):
237
- global query_engine, reranker, current_model, chunks_df
238
  if not question.strip():
239
  return ("<div style='color: black;'>Пожалуйста, введите вопрос</div>",
240
  "<div style='color: black;'>Источники появятся после обработки запроса</div>",
241
  "<div style='color: black;'>Чанки появятся после обработки запроса</div>")
242
 
243
  try:
244
- # Call the answer_question function which returns 3 values
245
- answer_html, sources_html, chunks_html = answer_question(question, query_engine, reranker, current_model, chunks_df)
 
 
246
  return answer_html, sources_html, chunks_html
247
 
248
  except Exception as e:
@@ -251,6 +287,37 @@ def main_answer_question(question):
251
  "<div style='color: black;'>Источники недоступны из-за ошибки</div>",
252
  "<div style='color: black;'>Чанки недоступны из-за ошибки</div>")
253
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
  def retrieve_chunks(question: str, top_k: int = 20) -> list:
255
  from index_retriever import rerank_nodes
256
  global query_engine, reranker
@@ -362,24 +429,132 @@ def create_demo_interface(answer_question_func, switch_model_func, current_model
362
  label="Релевантные чанки",
363
  value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся релевантные чанки...</div>",
364
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
365
 
366
- switch_btn.click(
367
- fn=switch_model_func,
368
- inputs=[model_dropdown],
369
- outputs=[model_status]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
370
  )
371
 
372
- ask_btn.click(
373
- fn=answer_question_func,
374
- inputs=[question_input],
375
- outputs=[answer_output, sources_output, chunks_output]
 
 
 
376
  )
377
 
378
- question_input.submit(
379
- fn=answer_question_func,
380
- inputs=[question_input],
381
- outputs=[answer_output, sources_output, chunks_output]
 
 
 
 
 
 
 
 
 
 
 
 
382
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
383
  return demo
384
 
385
 
@@ -389,13 +564,6 @@ reranker = None
389
  vector_index = None
390
  current_model = DEFAULT_MODEL
391
 
392
- def main_answer_question(question):
393
- global query_engine, reranker, current_model, chunks_df
394
- answer_html, sources_html, chunks_html = answer_question(
395
- question, query_engine, reranker, current_model, chunks_df
396
- )
397
- return answer_html, sources_html, chunks_html
398
-
399
  def main_switch_model(model_name):
400
  global query_engine, vector_index, current_model
401
 
 
233
  log_message(error_msg)
234
  return None, f"❌ {error_msg}"
235
 
236
+ # Add these global variables near the top with other globals
237
+ retrieval_params = {
238
+ 'vector_top_k': 50,
239
+ 'bm25_top_k': 50,
240
+ 'similarity_cutoff': 0.55,
241
+ 'hybrid_top_k': 100,
242
+ 'rerank_top_k': 20
243
+ }
244
+
245
+ # MODIFIED: Update create_query_engine call signature
246
+ def create_query_engine(vector_index, vector_top_k=50, bm25_top_k=50,
247
+ similarity_cutoff=0.55, hybrid_top_k=100):
248
+ try:
249
+ from config import CUSTOM_PROMPT
250
+ from index_retriever import create_query_engine as create_index_query_engine
251
+
252
+ # Pass parameters to the index_retriever function
253
+ query_engine = create_index_query_engine(
254
+ vector_index=vector_index,
255
+ vector_top_k=vector_top_k,
256
+ bm25_top_k=bm25_top_k,
257
+ similarity_cutoff=similarity_cutoff,
258
+ hybrid_top_k=hybrid_top_k
259
+ )
260
+
261
+ log_message(f"Query engine created with params: vector_top_k={vector_top_k}, "
262
+ f"bm25_top_k={bm25_top_k}, cutoff={similarity_cutoff}, hybrid_top_k={hybrid_top_k}")
263
+ return query_engine
264
+
265
+ except Exception as e:
266
+ log_message(f"Ошибка создания query engine: {str(e)}")
267
+ raise
268
+
269
+ # MODIFIED: Update answer_question to use global retrieval_params
270
  def main_answer_question(question):
271
+ global query_engine, reranker, current_model, chunks_df, retrieval_params
272
  if not question.strip():
273
  return ("<div style='color: black;'>Пожалуйста, введите вопрос</div>",
274
  "<div style='color: black;'>Источники появятся после обработки запроса</div>",
275
  "<div style='color: black;'>Чанки появятся после обработки запроса</div>")
276
 
277
  try:
278
+ answer_html, sources_html, chunks_html = answer_question(
279
+ question, query_engine, reranker, current_model, chunks_df,
280
+ rerank_top_k=retrieval_params['rerank_top_k']
281
+ )
282
  return answer_html, sources_html, chunks_html
283
 
284
  except Exception as e:
 
287
  "<div style='color: black;'>Источники недоступны из-за ошибки</div>",
288
  "<div style='color: black;'>Чанки недоступны из-за ошибки</div>")
289
 
290
+ # NEW: Function to update retrieval parameters and recreate query engine
291
+ def update_retrieval_params(vector_top_k, bm25_top_k, similarity_cutoff, hybrid_top_k, rerank_top_k):
292
+ global query_engine, vector_index, retrieval_params
293
+
294
+ try:
295
+ retrieval_params['vector_top_k'] = vector_top_k
296
+ retrieval_params['bm25_top_k'] = bm25_top_k
297
+ retrieval_params['similarity_cutoff'] = similarity_cutoff
298
+ retrieval_params['hybrid_top_k'] = hybrid_top_k
299
+ retrieval_params['rerank_top_k'] = rerank_top_k
300
+
301
+ # Recreate query engine with new parameters
302
+ if vector_index is not None:
303
+ query_engine = create_query_engine(
304
+ vector_index=vector_index,
305
+ vector_top_k=vector_top_k,
306
+ bm25_top_k=bm25_top_k,
307
+ similarity_cutoff=similarity_cutoff,
308
+ hybrid_top_k=hybrid_top_k
309
+ )
310
+ log_message(f"Параметры поиска обновлены: vector_top_k={vector_top_k}, "
311
+ f"bm25_top_k={bm25_top_k}, cutoff={similarity_cutoff}, "
312
+ f"hybrid_top_k={hybrid_top_k}, rerank_top_k={rerank_top_k}")
313
+ return f"✅ Параметры обновлены"
314
+ else:
315
+ return "❌ Система не инициализирована"
316
+ except Exception as e:
317
+ error_msg = f"Ошибка обновления параметров: {str(e)}"
318
+ log_message(error_msg)
319
+ return f"❌ {error_msg}"
320
+
321
  def retrieve_chunks(question: str, top_k: int = 20) -> list:
322
  from index_retriever import rerank_nodes
323
  global query_engine, reranker
 
429
  label="Релевантные чанки",
430
  value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся релевантные чанки...</div>",
431
  )
432
+
433
+ # NEW TAB: Retrieval Parameters
434
+ with gr.Tab("⚙️ Параметры поиска"):
435
+ gr.Markdown("### Настройка параметров векторного поиска и переранжирования")
436
+
437
+ with gr.Row():
438
+ with gr.Column():
439
+ vector_top_k = gr.Slider(
440
+ minimum=10,
441
+ maximum=200,
442
+ value=50,
443
+ step=10,
444
+ label="Vector Top K",
445
+ info="Количество результатов из векторного поиска"
446
+ )
447
+
448
+ with gr.Column():
449
+ bm25_top_k = gr.Slider(
450
+ minimum=10,
451
+ maximum=200,
452
+ value=50,
453
+ step=10,
454
+ label="BM25 Top K",
455
+ info="Количество результатов из BM25 поиска"
456
+ )
457
 
458
+ with gr.Row():
459
+ with gr.Column():
460
+ similarity_cutoff = gr.Slider(
461
+ minimum=0.0,
462
+ maximum=1.0,
463
+ value=0.55,
464
+ step=0.05,
465
+ label="Similarity Cutoff",
466
+ info="Минимальный порог схожести для векторного поиска"
467
+ )
468
+
469
+ with gr.Column():
470
+ hybrid_top_k = gr.Slider(
471
+ minimum=10,
472
+ maximum=300,
473
+ value=100,
474
+ step=10,
475
+ label="Hybrid Top K",
476
+ info="Количество результатов из гибридного поиска"
477
+ )
478
+
479
+ with gr.Row():
480
+ with gr.Column():
481
+ rerank_top_k = gr.Slider(
482
+ minimum=5,
483
+ maximum=100,
484
+ value=20,
485
+ step=5,
486
+ label="Rerank Top K",
487
+ info="Количество результатов после переранжирования"
488
+ )
489
+
490
+ with gr.Column():
491
+ update_btn = gr.Button("Применить параметры", variant="primary")
492
+ update_status = gr.Textbox(
493
+ value="Параметры готовы к применению",
494
+ label="Статус",
495
+ interactive=False
496
+ )
497
+
498
+ gr.Markdown("""
499
+ ### Рекомендации:
500
+ - **Vector Top K**: Увеличьте для более полного поиска по семантике (50-100)
501
+ - **BM25 Top K**: Увеличьте для лучшего поиска по ключевым словам (30-80)
502
+ - **Similarity Cutoff**: Снизьте для более мягких критериев (0.3-0.6), повысьте для строгих (0.7-0.9)
503
+ - **Hybrid Top K**: Объединённые результаты (100-150)
504
+ - **Rerank Top K**: Финальные результаты (10-30)
505
+ """)
506
+
507
+ update_btn.click(
508
+ fn=update_retrieval_params,
509
+ inputs=[vector_top_k, bm25_top_k, similarity_cutoff, hybrid_top_k, rerank_top_k],
510
+ outputs=[update_status]
511
  )
512
 
513
+ # Display current parameters
514
+ gr.Markdown("### Текущие параметры:")
515
+ current_params_display = gr.Textbox(
516
+ value="Vector: 50 | BM25: 50 | Cutoff: 0.55 | Hybrid: 100 | Rerank: 20",
517
+ label="",
518
+ interactive=False,
519
+ lines=2
520
  )
521
 
522
+ def display_current_params():
523
+ return f"""Vector Top K: {retrieval_params['vector_top_k']}
524
+ BM25 Top K: {retrieval_params['bm25_top_k']}
525
+ Similarity Cutoff: {retrieval_params['similarity_cutoff']}
526
+ Hybrid Top K: {retrieval_params['hybrid_top_k']}
527
+ Rerank Top K: {retrieval_params['rerank_top_k']}"""
528
+
529
+ # Refresh params display on tab change
530
+ demo.load(
531
+ fn=display_current_params,
532
+ outputs=[current_params_display]
533
+ )
534
+
535
+ update_btn.click(
536
+ fn=display_current_params,
537
+ outputs=[current_params_display]
538
  )
539
+
540
+ # Original tab logic
541
+ switch_btn.click(
542
+ fn=switch_model_func,
543
+ inputs=[model_dropdown],
544
+ outputs=[model_status]
545
+ )
546
+
547
+ ask_btn.click(
548
+ fn=answer_question_func,
549
+ inputs=[question_input],
550
+ outputs=[answer_output, sources_output, chunks_output]
551
+ )
552
+
553
+ question_input.submit(
554
+ fn=answer_question_func,
555
+ inputs=[question_input],
556
+ outputs=[answer_output, sources_output, chunks_output]
557
+ )
558
  return demo
559
 
560
 
 
564
  vector_index = None
565
  current_model = DEFAULT_MODEL
566
 
 
 
 
 
 
 
 
567
  def main_switch_model(model_name):
568
  global query_engine, vector_index, current_model
569
 
index_retriever.py CHANGED
@@ -65,24 +65,26 @@ def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
65
  log_message(f"Ошибка переранжировки: {str(e)}")
66
  return nodes[:top_k]
67
 
68
- def create_query_engine(vector_index):
 
 
69
  try:
70
  from config import CUSTOM_PROMPT
71
 
72
  bm25_retriever = BM25Retriever.from_defaults(
73
  docstore=vector_index.docstore,
74
- similarity_top_k=60
75
  )
76
 
77
  vector_retriever = VectorIndexRetriever(
78
  index=vector_index,
79
- similarity_top_k=60,
80
- similarity_cutoff=0.45
81
  )
82
 
83
  hybrid_retriever = QueryFusionRetriever(
84
  [vector_retriever, bm25_retriever],
85
- similarity_top_k=120,
86
  num_queries=1
87
  )
88
 
@@ -97,7 +99,9 @@ def create_query_engine(vector_index):
97
  response_synthesizer=response_synthesizer
98
  )
99
 
100
- log_message("Query engine успешно создан")
 
 
101
  return query_engine
102
 
103
  except Exception as e:
 
65
  log_message(f"Ошибка переранжировки: {str(e)}")
66
  return nodes[:top_k]
67
 
68
+ # MODIFIED: Update create_query_engine function signature
69
+ def create_query_engine(vector_index, vector_top_k=50, bm25_top_k=50,
70
+ similarity_cutoff=0.55, hybrid_top_k=100):
71
  try:
72
  from config import CUSTOM_PROMPT
73
 
74
  bm25_retriever = BM25Retriever.from_defaults(
75
  docstore=vector_index.docstore,
76
+ similarity_top_k=bm25_top_k # NOW PARAMETERIZED
77
  )
78
 
79
  vector_retriever = VectorIndexRetriever(
80
  index=vector_index,
81
+ similarity_top_k=vector_top_k, # NOW PARAMETERIZED
82
+ similarity_cutoff=similarity_cutoff # NOW PARAMETERIZED
83
  )
84
 
85
  hybrid_retriever = QueryFusionRetriever(
86
  [vector_retriever, bm25_retriever],
87
+ similarity_top_k=hybrid_top_k, # NOW PARAMETERIZED
88
  num_queries=1
89
  )
90
 
 
99
  response_synthesizer=response_synthesizer
100
  )
101
 
102
+ log_message(f"Query engine created: vector_top_k={vector_top_k}, "
103
+ f"bm25_top_k={bm25_top_k}, similarity_cutoff={similarity_cutoff}, "
104
+ f"hybrid_top_k={hybrid_top_k}")
105
  return query_engine
106
 
107
  except Exception as e:
utils.py CHANGED
@@ -197,8 +197,8 @@ def debug_search_tables(vector_index, search_term="С-25"):
197
 
198
  from documents_prep import normalize_text
199
 
200
- # MODIFIED: Update answer_question function
201
- def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
202
  # NORMALIZE the question to convert C to С
203
  normalized_question = normalize_text(question)
204
 
@@ -226,8 +226,9 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
226
  log_message(f" [{i+1}] {doc_id} - Table {table_num}: {table_title[:50]}")
227
  log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
228
 
229
- # Simple reranking with NORMALIZED question
230
- reranked_nodes = rerank_nodes(normalized_question, unique_retrieved, reranker, top_k=20)
 
231
 
232
  # Direct query without formatting - use normalized question
233
  response = query_engine.query(normalized_question)
@@ -243,7 +244,7 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
243
  <h3 style='color: #63b3ed; margin-top: 0;'>Ответ (Модель: {current_model}):</h3>
244
  <div style='line-height: 1.6; font-size: 16px;'>{response.response}</div>
245
  <div style='margin-top: 15px; padding-top: 10px; border-top: 1px solid #4a5568; font-size: 14px; color: #a0aec0;'>
246
- Время обработки: {processing_time:.2f} секунд
247
  </div>
248
  </div>"""
249
  log_message(f"Model Answer: {response.response}")
 
197
 
198
  from documents_prep import normalize_text
199
 
200
+ # MODIFIED: Update answer_question function signature
201
+ def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
202
  # NORMALIZE the question to convert C to С
203
  normalized_question = normalize_text(question)
204
 
 
226
  log_message(f" [{i+1}] {doc_id} - Table {table_num}: {table_title[:50]}")
227
  log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
228
 
229
+ # Simple reranking with NORMALIZED question and PARAMETERIZED top_k
230
+ reranked_nodes = rerank_nodes(normalized_question, unique_retrieved, reranker,
231
+ top_k=rerank_top_k) # NOW PARAMETERIZED
232
 
233
  # Direct query without formatting - use normalized question
234
  response = query_engine.query(normalized_question)
 
244
  <h3 style='color: #63b3ed; margin-top: 0;'>Ответ (Модель: {current_model}):</h3>
245
  <div style='line-height: 1.6; font-size: 16px;'>{response.response}</div>
246
  <div style='margin-top: 15px; padding-top: 10px; border-top: 1px solid #4a5568; font-size: 14px; color: #a0aec0;'>
247
+ Время обработки: {processing_time:.2f} секунд | Переранжировано: {len(reranked_nodes)} документов
248
  </div>
249
  </div>"""
250
  log_message(f"Model Answer: {response.response}")