MrSimple07 committed on
Commit
af31260
·
1 Parent(s): 0970ce1

new final version of the RAG

Browse files
Files changed (7) hide show
  1. app.py +182 -35
  2. config.py +17 -4
  3. converters/converter.py +116 -0
  4. documents_prep.py +188 -76
  5. index_retriever.py +11 -24
  6. table_prep.py +0 -229
  7. utils.py +71 -41
app.py CHANGED
@@ -10,6 +10,7 @@ from config import (
10
  HF_REPO_ID, HF_TOKEN, DOWNLOAD_DIR, CHUNKS_FILENAME,
11
  JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
12
  )
 
13
 
14
 
15
  def merge_table_chunks(chunk_info):
@@ -39,7 +40,6 @@ def merge_table_chunks(chunk_info):
39
 
40
  return list(merged.values())
41
 
42
-
43
  def create_chunks_display_html(chunk_info):
44
  if not chunk_info:
45
  return "<div style='padding: 20px; text-align: center; color: black;'>Нет данных о чанках</div>"
@@ -142,19 +142,14 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
142
  separator=" ",
143
  backup_separators=["\n", ".", "!", "?"]
144
  )
145
-
146
- log_message(f"Configured chunk size: {CHUNK_SIZE} tokens")
147
- log_message(f"Configured chunk overlap: {CHUNK_OVERLAP} tokens")
148
 
149
  all_documents = []
150
  chunks_df = None
151
 
152
- # CHANGED: Use load_all_documents instead of loading separately
153
  if use_json_instead_csv and json_files_dir:
154
  log_message("Используем JSON файлы вместо CSV")
155
  from documents_prep import load_all_documents
156
 
157
- # This will handle text, tables, and images all together with proper logging
158
  all_documents = load_all_documents(
159
  repo_id=repo_id,
160
  hf_token=hf_token,
@@ -163,12 +158,10 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
163
  image_dir=image_data_dir if image_data_dir else ""
164
  )
165
  else:
166
- # OLD PATH: Loading separately (fallback)
167
  if chunks_filename:
168
  log_message("Загружаем данные из CSV")
169
 
170
  if table_data_dir:
171
- log_message("Добавляю табличные данные")
172
  from documents_prep import load_table_documents
173
 
174
  table_chunks = load_table_documents(repo_id, hf_token, table_data_dir)
@@ -176,7 +169,6 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
176
  all_documents.extend(table_chunks)
177
 
178
  if image_data_dir:
179
- log_message("Добавляю данные изображений")
180
  from documents_prep import load_image_documents
181
 
182
  image_documents = load_image_documents(repo_id, hf_token, image_data_dir)
@@ -188,7 +180,6 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
188
  vector_index = create_vector_index(all_documents)
189
  query_engine = create_query_engine(vector_index)
190
 
191
- # Create chunk_info for display (extract from documents metadata)
192
  chunk_info = []
193
  for doc in all_documents:
194
  chunk_info.append({
@@ -233,16 +224,48 @@ def switch_model(model_name, vector_index):
233
  log_message(error_msg)
234
  return None, f"❌ {error_msg}"
235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
  def main_answer_question(question):
237
- global query_engine, reranker, current_model, chunks_df
238
  if not question.strip():
239
  return ("<div style='color: black;'>Пожалуйста, введите вопрос</div>",
240
  "<div style='color: black;'>Источники появятся после обработки запроса</div>",
241
  "<div style='color: black;'>Чанки появятся после обработки запроса</div>")
242
 
243
  try:
244
- # Call the answer_question function which returns 3 values
245
- answer_html, sources_html, chunks_html = answer_question(question, query_engine, reranker, current_model, chunks_df)
 
 
246
  return answer_html, sources_html, chunks_html
247
 
248
  except Exception as e:
@@ -251,6 +274,36 @@ def main_answer_question(question):
251
  "<div style='color: black;'>Источники недоступны из-за ошибки</div>",
252
  "<div style='color: black;'>Чанки недоступны из-за ошибки</div>")
253
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
  def retrieve_chunks(question: str, top_k: int = 20) -> list:
255
  from index_retriever import rerank_nodes
256
  global query_engine, reranker
@@ -362,24 +415,128 @@ def create_demo_interface(answer_question_func, switch_model_func, current_model
362
  label="Релевантные чанки",
363
  value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся релевантные чанки...</div>",
364
  )
 
 
 
365
 
366
- switch_btn.click(
367
- fn=switch_model_func,
368
- inputs=[model_dropdown],
369
- outputs=[model_status]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
370
  )
371
 
372
- ask_btn.click(
373
- fn=answer_question_func,
374
- inputs=[question_input],
375
- outputs=[answer_output, sources_output, chunks_output]
 
 
 
 
 
 
376
  )
377
 
378
- question_input.submit(
379
- fn=answer_question_func,
380
- inputs=[question_input],
381
- outputs=[answer_output, sources_output, chunks_output]
382
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
383
  return demo
384
 
385
 
@@ -389,13 +546,6 @@ reranker = None
389
  vector_index = None
390
  current_model = DEFAULT_MODEL
391
 
392
- def main_answer_question(question):
393
- global query_engine, reranker, current_model, chunks_df
394
- answer_html, sources_html, chunks_html = answer_question(
395
- question, query_engine, reranker, current_model, chunks_df
396
- )
397
- return answer_html, sources_html, chunks_html
398
-
399
  def main_switch_model(model_name):
400
  global query_engine, vector_index, current_model
401
 
@@ -406,9 +556,6 @@ def main_switch_model(model_name):
406
 
407
  return status_message
408
 
409
-
410
-
411
-
412
  def main():
413
  global query_engine, chunks_df, reranker, vector_index, current_model
414
  GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
 
10
  HF_REPO_ID, HF_TOKEN, DOWNLOAD_DIR, CHUNKS_FILENAME,
11
  JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
12
  )
13
+ from converters.converter import convert_single_excel_to_json, convert_single_excel_to_csv
14
 
15
 
16
  def merge_table_chunks(chunk_info):
 
40
 
41
  return list(merged.values())
42
 
 
43
  def create_chunks_display_html(chunk_info):
44
  if not chunk_info:
45
  return "<div style='padding: 20px; text-align: center; color: black;'>Нет данных о чанках</div>"
 
142
  separator=" ",
143
  backup_separators=["\n", ".", "!", "?"]
144
  )
 
 
 
145
 
146
  all_documents = []
147
  chunks_df = None
148
 
 
149
  if use_json_instead_csv and json_files_dir:
150
  log_message("Используем JSON файлы вместо CSV")
151
  from documents_prep import load_all_documents
152
 
 
153
  all_documents = load_all_documents(
154
  repo_id=repo_id,
155
  hf_token=hf_token,
 
158
  image_dir=image_data_dir if image_data_dir else ""
159
  )
160
  else:
 
161
  if chunks_filename:
162
  log_message("Загружаем данные из CSV")
163
 
164
  if table_data_dir:
 
165
  from documents_prep import load_table_documents
166
 
167
  table_chunks = load_table_documents(repo_id, hf_token, table_data_dir)
 
169
  all_documents.extend(table_chunks)
170
 
171
  if image_data_dir:
 
172
  from documents_prep import load_image_documents
173
 
174
  image_documents = load_image_documents(repo_id, hf_token, image_data_dir)
 
180
  vector_index = create_vector_index(all_documents)
181
  query_engine = create_query_engine(vector_index)
182
 
 
183
  chunk_info = []
184
  for doc in all_documents:
185
  chunk_info.append({
 
224
  log_message(error_msg)
225
  return None, f"❌ {error_msg}"
226
 
227
+ retrieval_params = {
228
+ 'vector_top_k': 50,
229
+ 'bm25_top_k': 50,
230
+ 'similarity_cutoff': 0.55,
231
+ 'hybrid_top_k': 100,
232
+ 'rerank_top_k': 20
233
+ }
234
+
235
def create_query_engine(vector_index, vector_top_k=50, bm25_top_k=50,
                        similarity_cutoff=0.55, hybrid_top_k=100):
    """Build a hybrid (vector + BM25) query engine over *vector_index*.

    Args:
        vector_index: LlamaIndex vector index to search.
        vector_top_k: Number of candidates from dense retrieval.
        bm25_top_k: Number of candidates from BM25 retrieval.
        similarity_cutoff: Minimum similarity score for dense hits.
        hybrid_top_k: Size of the fused candidate list.

    Returns:
        The configured query engine from ``index_retriever``.

    Raises:
        Re-raises any error from the underlying factory after logging it.
    """
    try:
        # Imported inside the function, matching this module's lazy-import style.
        # BUG FIX: removed the unused `from config import CUSTOM_PROMPT`.
        from index_retriever import create_query_engine as create_index_query_engine

        query_engine = create_index_query_engine(
            vector_index=vector_index,
            vector_top_k=vector_top_k,
            bm25_top_k=bm25_top_k,
            similarity_cutoff=similarity_cutoff,
            hybrid_top_k=hybrid_top_k
        )

        log_message(f"Query engine created with params: vector_top_k={vector_top_k}, "
                    f"bm25_top_k={bm25_top_k}, cutoff={similarity_cutoff}, hybrid_top_k={hybrid_top_k}")
        return query_engine

    except Exception as e:
        log_message(f"Ошибка создания query engine: {str(e)}")
        raise
256
+
257
  def main_answer_question(question):
258
+ global query_engine, reranker, current_model, chunks_df, retrieval_params
259
  if not question.strip():
260
  return ("<div style='color: black;'>Пожалуйста, введите вопрос</div>",
261
  "<div style='color: black;'>Источники появятся после обработки запроса</div>",
262
  "<div style='color: black;'>Чанки появятся после обработки запроса</div>")
263
 
264
  try:
265
+ answer_html, sources_html, chunks_html = answer_question(
266
+ question, query_engine, reranker, current_model, chunks_df,
267
+ rerank_top_k=retrieval_params['rerank_top_k']
268
+ )
269
  return answer_html, sources_html, chunks_html
270
 
271
  except Exception as e:
 
274
  "<div style='color: black;'>Источники недоступны из-за ошибки</div>",
275
  "<div style='color: black;'>Чанки недоступны из-за ошибки</div>")
276
 
277
def update_retrieval_params(vector_top_k, bm25_top_k, similarity_cutoff, hybrid_top_k, rerank_top_k):
    """Store new retrieval settings and rebuild the query engine with them.

    Returns a user-facing status string ("✅ ..." on success, "❌ ..." otherwise).
    """
    global query_engine, vector_index, retrieval_params

    try:
        # Persist the new settings first so rerank_top_k is picked up even
        # though only the engine parameters are passed to the factory below.
        retrieval_params.update({
            'vector_top_k': vector_top_k,
            'bm25_top_k': bm25_top_k,
            'similarity_cutoff': similarity_cutoff,
            'hybrid_top_k': hybrid_top_k,
            'rerank_top_k': rerank_top_k,
        })

        if vector_index is None:
            return "❌ Система не инициализирована"

        # Rebuild the engine so the new parameters take effect immediately.
        query_engine = create_query_engine(
            vector_index=vector_index,
            vector_top_k=vector_top_k,
            bm25_top_k=bm25_top_k,
            similarity_cutoff=similarity_cutoff,
            hybrid_top_k=hybrid_top_k
        )
        log_message(f"Параметры поиска обновлены: vector_top_k={vector_top_k}, "
                    f"bm25_top_k={bm25_top_k}, cutoff={similarity_cutoff}, "
                    f"hybrid_top_k={hybrid_top_k}, rerank_top_k={rerank_top_k}")
        return f"✅ Параметры обновлены"
    except Exception as e:
        error_msg = f"Ошибка обновления параметров: {str(e)}"
        log_message(error_msg)
        return f"❌ {error_msg}"
306
+
307
  def retrieve_chunks(question: str, top_k: int = 20) -> list:
308
  from index_retriever import rerank_nodes
309
  global query_engine, reranker
 
415
  label="Релевантные чанки",
416
  value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся релевантные чанки...</div>",
417
  )
418
+
419
+ with gr.Tab("⚙️ Параметры поиска"):
420
+ gr.Markdown("### Настройка параметров векторного поиска и переранжирования")
421
 
422
+ with gr.Row():
423
+ with gr.Column():
424
+ vector_top_k = gr.Slider(
425
+ minimum=10,
426
+ maximum=200,
427
+ value=50,
428
+ step=10,
429
+ label="Vector Top K",
430
+ info="Количество результатов из векторного поиска"
431
+ )
432
+
433
+ with gr.Column():
434
+ bm25_top_k = gr.Slider(
435
+ minimum=10,
436
+ maximum=200,
437
+ value=50,
438
+ step=10,
439
+ label="BM25 Top K",
440
+ info="Количество результатов из BM25 поиска"
441
+ )
442
+
443
+ with gr.Row():
444
+ with gr.Column():
445
+ similarity_cutoff = gr.Slider(
446
+ minimum=0.0,
447
+ maximum=1.0,
448
+ value=0.55,
449
+ step=0.05,
450
+ label="Similarity Cutoff",
451
+ info="Минимальный порог схожести для векторного поиска"
452
+ )
453
+
454
+ with gr.Column():
455
+ hybrid_top_k = gr.Slider(
456
+ minimum=10,
457
+ maximum=300,
458
+ value=100,
459
+ step=10,
460
+ label="Hybrid Top K",
461
+ info="Количество результатов из гибридного поиска"
462
+ )
463
+
464
+ with gr.Row():
465
+ with gr.Column():
466
+ rerank_top_k = gr.Slider(
467
+ minimum=5,
468
+ maximum=100,
469
+ value=20,
470
+ step=5,
471
+ label="Rerank Top K",
472
+ info="Количество результатов после переранжирования"
473
+ )
474
+
475
+ with gr.Column():
476
+ update_btn = gr.Button("Применить параметры", variant="primary")
477
+ update_status = gr.Textbox(
478
+ value="Параметры готовы к применению",
479
+ label="Статус",
480
+ interactive=False
481
+ )
482
+
483
+ gr.Markdown("""
484
+ ### Рекомендации:
485
+ - **Vector Top K**: Увеличьте для более полного поиска по семантике (50-100)
486
+ - **BM25 Top K**: Увеличьте для лучшего поиска по ключевым словам (30-80)
487
+ - **Similarity Cutoff**: Снизьте для более мягких критериев (0.3-0.6), повысьте для строгих (0.7-0.9)
488
+ - **Hybrid Top K**: Объединённые результаты (100-150)
489
+ - **Rerank Top K**: Финальные результаты (10-30)
490
+ """)
491
+
492
+ update_btn.click(
493
+ fn=update_retrieval_params,
494
+ inputs=[vector_top_k, bm25_top_k, similarity_cutoff, hybrid_top_k, rerank_top_k],
495
+ outputs=[update_status]
496
+ )
497
+
498
+ gr.Markdown("### Текущие параметры:")
499
+ current_params_display = gr.Textbox(
500
+ value="Vector: 50 | BM25: 50 | Cutoff: 0.55 | Hybrid: 100 | Rerank: 20",
501
+ label="",
502
+ interactive=False,
503
+ lines=2
504
  )
505
 
506
+ def display_current_params():
507
+ return f"""Vector Top K: {retrieval_params['vector_top_k']}
508
+ BM25 Top K: {retrieval_params['bm25_top_k']}
509
+ Similarity Cutoff: {retrieval_params['similarity_cutoff']}
510
+ Hybrid Top K: {retrieval_params['hybrid_top_k']}
511
+ Rerank Top K: {retrieval_params['rerank_top_k']}"""
512
+
513
+ demo.load(
514
+ fn=display_current_params,
515
+ outputs=[current_params_display]
516
  )
517
 
518
+ update_btn.click(
519
+ fn=display_current_params,
520
+ outputs=[current_params_display]
 
521
  )
522
+
523
+ switch_btn.click(
524
+ fn=switch_model_func,
525
+ inputs=[model_dropdown],
526
+ outputs=[model_status]
527
+ )
528
+
529
+ ask_btn.click(
530
+ fn=answer_question_func,
531
+ inputs=[question_input],
532
+ outputs=[answer_output, sources_output, chunks_output]
533
+ )
534
+
535
+ question_input.submit(
536
+ fn=answer_question_func,
537
+ inputs=[question_input],
538
+ outputs=[answer_output, sources_output, chunks_output]
539
+ )
540
  return demo
541
 
542
 
 
546
  vector_index = None
547
  current_model = DEFAULT_MODEL
548
 
 
 
 
 
 
 
 
549
  def main_switch_model(model_name):
550
  global query_engine, vector_index, current_model
551
 
 
556
 
557
  return status_message
558
 
 
 
 
559
  def main():
560
  global query_engine, chunks_df, reranker, vector_index, current_model
561
  GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
config.py CHANGED
@@ -5,7 +5,7 @@ SIMILARITY_THRESHOLD = 0.7
5
  RAG_FILES_DIR = "rag_files"
6
  PROCESSED_DATA_FILE = "processed_chunks.csv"
7
 
8
- REPO_ID = "MrSimple01/AIEXP_RAG_FILES"
9
  faiss_index_filename = "cleaned_faiss_index.index"
10
  CHUNKS_FILENAME = "processed_chunks.csv"
11
  TABLE_DATA_DIR = "Табличные данные_JSON"
@@ -18,7 +18,6 @@ OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
18
  HF_REPO_ID = "MrSimple01/AIEXP_RAG_FILES"
19
  HF_TOKEN = os.getenv('HF_TOKEN')
20
 
21
- # Available models configuration
22
  AVAILABLE_MODELS = {
23
  "Gemini 2.5 Flash": {
24
  "provider": "google",
@@ -52,8 +51,22 @@ DEFAULT_MODEL = "Gemini 2.5 Flash"
52
  CHUNK_SIZE = 1500
53
  CHUNK_OVERLAP = 128
54
 
55
- MAX_CHARS_TABLE = 4500
56
- MAX_ROWS_TABLE = 50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
  CUSTOM_PROMPT = """
59
  Вы являетесь высокоспециализированным Ассистентом для анализа нормативных документов (AIEXP). Ваша цель - предоставлять точные, корректные и контекстно релевантные ответы исключительно на основе предоставленного контекста из нормативной документации.
 
5
  RAG_FILES_DIR = "rag_files"
6
  PROCESSED_DATA_FILE = "processed_chunks.csv"
7
 
8
+ REPO_ID = "RAG-AIEXP/ragfiles"
9
  faiss_index_filename = "cleaned_faiss_index.index"
10
  CHUNKS_FILENAME = "processed_chunks.csv"
11
  TABLE_DATA_DIR = "Табличные данные_JSON"
 
18
  HF_REPO_ID = "MrSimple01/AIEXP_RAG_FILES"
19
  HF_TOKEN = os.getenv('HF_TOKEN')
20
 
 
21
  AVAILABLE_MODELS = {
22
  "Gemini 2.5 Flash": {
23
  "provider": "google",
 
51
  CHUNK_SIZE = 1500
52
  CHUNK_OVERLAP = 128
53
 
54
+ MAX_CHARS_TABLE = 3000
55
+ MAX_ROWS_TABLE = 30
56
+
57
+
58
+ QUERY_EXPANSION_PROMPT = """Ты — интеллектуальный помощник для расширения поисковых запросов по стандартам ГОСТ, ТУ, ISO, EN и другой технической документации.
59
+ Твоя цель — помочь системе найти все возможные формулировки вопроса, включая те, где встречаются редкие или неочевидные термины.
60
+ Пользователь задал вопрос: "{original_query}"
61
+
62
+ Сгенерируй 5 вариантов запроса, которые:
63
+ Сохраняют смысл исходного вопроса
64
+ Используют синонимы и технические термины (например: "сталь" → "сплав", "марка", "материал")
65
+ Добавляют возможные контекстные уточнения (например: "ГОСТ", "ТУ", "марка", "лист", "труба", "прокат", "применение", "химический состав")
66
+ Могут охватывать как частотные, так и редкие термины
67
+ Краткие — не более 10 слов каждая
68
+
69
+ Верни только 5 запросов, каждый с новой строки, без нумерации и пояснений."""
70
 
71
  CUSTOM_PROMPT = """
72
  Вы являетесь высокоспециализированным Ассистентом для анализа нормативных документов (AIEXP). Ваша цель - предоставлять точные, корректные и контекстно релевантные ответы исключительно на основе предоставленного контекста из нормативной документации.
converters/converter.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from config import *
2
+ from utils import log_message
3
+ import json
4
+ import pandas as pd
5
+ import os
6
+
7
def process_uploaded_file(file, file_type):
    """Handle an uploaded file: convert Excel sources if needed and push to HuggingFace.

    Args:
        file: Uploaded file object (expects a ``.name`` attribute holding its path).
        file_type: "Таблица", "Изображение (метаданные)", or anything else
            (treated as a JSON document); selects the target repo directory
            and the Excel conversion applied.

    Returns:
        A user-facing status string ("✅ ..." on success, "❌ ..." on failure).
    """
    try:
        if file is None:
            return "❌ Файл не выбран"

        from huggingface_hub import HfApi
        import tempfile
        import shutil

        # Work inside a throwaway directory so converted artifacts are cleaned up.
        with tempfile.TemporaryDirectory() as temp_dir:
            # BUG FIX: ``file.name`` is typically an absolute temp path; joining
            # it directly discarded ``temp_dir`` and made shutil.copy try to
            # copy the file onto itself. Use only the basename.
            file_path = os.path.join(temp_dir, os.path.basename(file.name))
            shutil.copy(file.name, file_path)

            # Pick the destination directory on HuggingFace and convert Excel
            # inputs to the storage format expected for that data type.
            if file_type == "Таблица":
                target_dir = TABLE_DATA_DIR
                # Tables are stored as JSON on the hub.
                if file.name.endswith(('.xlsx', '.xls')):
                    upload_file = convert_single_excel_to_json(file_path, temp_dir)
                else:
                    upload_file = file_path
            elif file_type == "Изображение (метаданные)":
                target_dir = IMAGE_DATA_DIR
                # Image metadata is stored as CSV on the hub.
                if file.name.endswith(('.xlsx', '.xls')):
                    upload_file = convert_single_excel_to_csv(file_path, temp_dir)
                else:
                    upload_file = file_path
            else:  # JSON document
                target_dir = JSON_FILES_DIR
                upload_file = file_path

            # Upload to the dataset repo; requires HF_TOKEN with write access.
            api = HfApi()
            api.upload_file(
                path_or_fileobj=upload_file,
                path_in_repo=f"{target_dir}/{os.path.basename(upload_file)}",
                repo_id=HF_REPO_ID,
                token=HF_TOKEN,
                repo_type="dataset"
            )

            log_message(f"Файл {file.name} успешно загружен в {target_dir}")
            return f"✅ Файл успешно загружен и обработан: {os.path.basename(upload_file)}\n⚠️ Перезапустите систему для применения изменений"

    except Exception as e:
        error_msg = f"Ошибка обработки файла: {str(e)}"
        log_message(error_msg)
        return f"❌ {error_msg}"
61
+
62
def convert_single_excel_to_json(excel_path, output_dir):
    """Convert one Excel workbook into the table-JSON format used by the RAG loader.

    Sheets without a "Номер таблицы" column are skipped; rows are grouped by
    table number and metadata columns are split out from data columns.

    Args:
        excel_path: Path to the source ``.xlsx``/``.xls`` file.
        output_dir: Directory where the resulting JSON file is written.

    Returns:
        Path of the written JSON file.
    """
    df_dict = pd.read_excel(excel_path, sheet_name=None)

    result = {
        "document": os.path.basename(excel_path),
        "total_sheets": len(df_dict),
        "sheets": []
    }

    # Columns describing the table itself rather than carrying row data.
    meta_columns = {"Обозначение документа", "Раздел документа", "Номер таблицы",
                    "Название таблицы", "Примечание"}

    for sheet_name, df in df_dict.items():
        if df.empty or "Номер таблицы" not in df.columns:
            continue

        df = df.dropna(how='all').fillna("")
        grouped = df.groupby("Номер таблицы")

        for table_number, group in grouped:
            group = group.reset_index(drop=True)

            # Table-level metadata is taken from the first row of the group.
            sheet_data = {
                "sheet_name": sheet_name,
                "document_id": str(group.iloc[0].get("Обозначение документа", "")),
                "section": str(group.iloc[0].get("Раздел документа", "")),
                "table_number": str(table_number),
                "table_title": str(group.iloc[0].get("Название таблицы", "")),
                "table_description": str(group.iloc[0].get("Примечание", "")),
                "headers": [col for col in df.columns if col not in meta_columns],
                "data": []
            }

            for _, row in group.iterrows():
                row_dict = {col: str(row[col]) if pd.notna(row[col]) else ""
                            for col in sheet_data["headers"]}
                sheet_data["data"].append(row_dict)

            result["sheets"].append(sheet_data)

    # BUG FIX: the previous chained str.replace('.xlsx', ...).replace('.xls', ...)
    # rewrote '.xls' occurrences anywhere in the name; splitext swaps only the
    # real extension.
    json_filename = os.path.splitext(os.path.basename(excel_path))[0] + '.json'
    json_path = os.path.join(output_dir, json_filename)

    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    return json_path
109
+
110
def convert_single_excel_to_csv(excel_path, output_dir):
    """Convert one Excel file to CSV (used for image-metadata uploads).

    Args:
        excel_path: Path to the source ``.xlsx``/``.xls`` file.
        output_dir: Directory where the CSV is written.

    Returns:
        Path of the written CSV file.
    """
    df = pd.read_excel(excel_path)
    # BUG FIX: the previous chained str.replace could rewrite '.xls' occurrences
    # inside the stem; splitext swaps only the real extension.
    csv_filename = os.path.splitext(os.path.basename(excel_path))[0] + '.csv'
    csv_path = os.path.join(output_dir, csv_filename)
    df.to_csv(csv_path, index=False, encoding='utf-8')
    return csv_path
documents_prep.py CHANGED
@@ -6,21 +6,83 @@ from llama_index.core import Document
6
  from llama_index.core.text_splitter import SentenceSplitter
7
  from my_logging import log_message
8
  from config import CHUNK_SIZE, CHUNK_OVERLAP, MAX_CHARS_TABLE, MAX_ROWS_TABLE
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  def chunk_text_documents(documents):
11
  text_splitter = SentenceSplitter(
12
  chunk_size=CHUNK_SIZE,
13
  chunk_overlap=CHUNK_OVERLAP
14
  )
 
 
15
 
16
  chunked = []
17
  for doc in documents:
18
  chunks = text_splitter.get_nodes_from_documents([doc])
19
  for i, chunk in enumerate(chunks):
 
 
 
 
 
 
 
20
  chunk.metadata.update({
21
  'chunk_id': i,
22
  'total_chunks': len(chunks),
23
- 'chunk_size': len(chunk.text) # Add chunk size
24
  })
25
  chunked.append(chunk)
26
 
@@ -31,23 +93,14 @@ def chunk_text_documents(documents):
31
  max_size = max(len(c.text) for c in chunked)
32
  log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
33
  log_message(f" Size stats: avg={avg_size:.0f}, min={min_size}, max={max_size} chars")
 
 
 
 
34
 
35
- return chunked
36
-
37
- def normalize_text(text):
38
- if not text:
39
- return text
40
-
41
- # Replace Cyrillic 'C' with Latin 'С' (U+0421)
42
- # This is for welding types like C-25 -> С-25
43
- text = text.replace('С-', 'C')
44
-
45
- # Also handle cases like "Type C" or variations
46
- import re
47
- # Match "C" followed by digit or space in context of welding types
48
- text = re.sub(r'\bС(\d)', r'С\1', text)
49
 
50
- return text
51
 
52
  def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
53
  headers = table_data.get('headers', [])
@@ -55,49 +108,108 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
55
  table_num = table_data.get('table_number', 'unknown')
56
  table_title = table_data.get('table_title', '')
57
  section = table_data.get('section', '')
 
58
 
 
 
 
 
59
  table_num_clean = str(table_num).strip()
60
- table_title_normalized = normalize_text(str(table_title)) # NORMALIZE TITLE
61
 
62
  import re
63
- if 'приложени' in section.lower():
64
- appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
65
- if appendix_match:
66
- appendix_num = appendix_match.group(1).upper()
67
- table_identifier = f"{table_num_clean} Приложение {appendix_num}"
 
 
 
 
 
68
  else:
69
- table_identifier = table_num_clean
 
 
 
 
70
  else:
71
- table_identifier = table_num_clean
 
 
 
 
 
 
 
 
72
 
73
  if not rows:
74
  return []
75
 
76
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
77
 
78
- # Calculate base metadata size with NORMALIZED title
79
- base_content = format_table_header(doc_id, table_identifier, table_num, table_title_normalized, section, headers)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  base_size = len(base_content)
81
  available_space = max_chars - base_size - 200
82
 
83
  # If entire table fits, return as one chunk
84
- full_rows_content = format_table_rows([{**row, '_idx': i+1} for i, row in enumerate(rows)])
85
- if base_size + len(full_rows_content) <= max_chars and len(rows) <= max_rows:
 
 
86
  content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
87
 
88
  metadata = {
89
  'type': 'table',
90
  'document_id': doc_id,
91
- 'table_number': table_num_clean,
92
- 'table_identifier': normalize_text(table_identifier), # NORMALIZE identifier
93
- 'table_title': table_title_normalized, # NORMALIZED
94
  'section': section,
95
- 'total_rows': len(rows),
 
96
  'chunk_size': len(content),
97
- 'is_complete_table': True
 
98
  }
99
 
100
- log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
101
  return [Document(text=content, metadata=metadata)]
102
 
103
  chunks = []
@@ -105,30 +217,33 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
105
  current_size = 0
106
  chunk_num = 0
107
 
108
- for i, row in enumerate(rows):
109
  row_text = format_single_row(row, i + 1)
110
  row_size = len(row_text)
111
 
112
- should_split = (current_size + row_size > available_space or len(current_rows) >= max_rows) and current_rows
 
113
 
114
  if should_split:
115
  content = base_content + format_table_rows(current_rows)
116
- content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
117
  content += format_table_footer(table_identifier, doc_id)
118
 
119
  metadata = {
120
  'type': 'table',
121
  'document_id': doc_id,
122
- 'table_number': table_num_clean,
123
- 'table_identifier': normalize_text(table_identifier), # NORMALIZE
124
- 'table_title': table_title_normalized, # NORMALIZED
125
  'section': section,
 
126
  'chunk_id': chunk_num,
127
  'row_start': current_rows[0]['_idx'] - 1,
128
  'row_end': current_rows[-1]['_idx'],
129
- 'total_rows': len(rows),
130
  'chunk_size': len(content),
131
- 'is_complete_table': False
 
132
  }
133
 
134
  chunks.append(Document(text=content, metadata=metadata))
@@ -138,31 +253,31 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
138
  current_rows = []
139
  current_size = 0
140
 
141
- # Add row with index
142
  row_copy = row.copy() if isinstance(row, dict) else {'data': row}
143
  row_copy['_idx'] = i + 1
144
  current_rows.append(row_copy)
145
  current_size += row_size
146
-
147
- # Add final chunk
148
  if current_rows:
149
  content = base_content + format_table_rows(current_rows)
150
- content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
151
  content += format_table_footer(table_identifier, doc_id)
152
 
153
  metadata = {
154
  'type': 'table',
155
  'document_id': doc_id,
156
- 'table_number': table_num_clean,
157
- 'table_identifier': normalize_text(table_identifier), # NORMALIZE
158
- 'table_title': table_title_normalized, # NORMALIZED
159
  'section': section,
 
160
  'chunk_id': chunk_num,
161
  'row_start': current_rows[0]['_idx'] - 1,
162
  'row_end': current_rows[-1]['_idx'],
163
- 'total_rows': len(rows),
164
  'chunk_size': len(content),
165
- 'is_complete_table': False
 
166
  }
167
 
168
  chunks.append(Document(text=content, metadata=metadata))
@@ -170,33 +285,36 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
170
 
171
  return chunks
172
 
173
-
174
- # MODIFIED: Update format_table_header function
175
- def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
176
  content = f"ТАБЛИЦА {normalize_text(table_identifier)} из документа {doc_id}\n"
177
 
178
- # Add table type/number prominently for matching
179
- if table_num:
180
- content += f"ТИП: {normalize_text(table_num)}\n"
 
 
 
181
 
182
  if table_title:
183
  content += f"НАЗВАНИЕ: {normalize_text(table_title)}\n"
184
 
185
  if section:
186
  content += f"РАЗДЕЛ: {section}\n"
 
 
187
 
188
  content += f"{'='*70}\n"
189
 
190
  if headers:
191
- header_str = ' | '.join(str(h) for h in headers)
 
 
192
  content += f"ЗАГОЛОВКИ: {header_str}\n\n"
193
 
194
  content += "ДАННЫЕ:\n"
195
  return content
196
 
197
-
198
  def format_single_row(row, idx):
199
- """Format a single row"""
200
  if isinstance(row, dict):
201
  parts = [f"{k}: {v}" for k, v in row.items()
202
  if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
@@ -208,18 +326,14 @@ def format_single_row(row, idx):
208
  return f"{idx}. {' | '.join(parts)}\n"
209
  return ""
210
 
211
-
212
  def format_table_rows(rows):
213
- """Format multiple rows"""
214
  content = ""
215
  for row in rows:
216
  idx = row.get('_idx', 0)
217
  content += format_single_row(row, idx)
218
  return content
219
 
220
-
221
  def format_table_footer(table_identifier, doc_id):
222
- """Format table footer"""
223
  return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
224
 
225
  def load_json_documents(repo_id, hf_token, json_dir):
@@ -290,7 +404,6 @@ def load_json_documents(repo_id, hf_token, json_dir):
290
  stats['failed'] += 1
291
  continue
292
 
293
- # Try UTF-8 first (most common)
294
  try:
295
  text_content = file_content.decode('utf-8')
296
  except UnicodeDecodeError:
@@ -298,7 +411,6 @@ def load_json_documents(repo_id, hf_token, json_dir):
298
  text_content = file_content.decode('utf-8-sig')
299
  except UnicodeDecodeError:
300
  try:
301
- # Try UTF-16 (the issue you're seeing)
302
  text_content = file_content.decode('utf-16')
303
  except UnicodeDecodeError:
304
  try:
@@ -345,13 +457,11 @@ def load_json_documents(repo_id, hf_token, json_dir):
345
  log_message(f" Success: {stats['success']}")
346
  log_message(f" Empty: {stats['empty']}")
347
  log_message(f" Failed: {stats['failed']}")
348
- log_message(f" Total sections: {len(documents)}")
349
  log_message(f"="*60)
350
 
351
  return documents
352
 
353
  def extract_sections_from_json(json_path):
354
- """Extract sections from a single JSON file"""
355
  documents = []
356
 
357
  try:
@@ -401,14 +511,15 @@ def extract_sections_from_json(json_path):
401
 
402
  return documents
403
 
404
-
405
  def load_table_documents(repo_id, hf_token, table_dir):
406
  log_message("Loading tables...")
407
-
408
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
409
  table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
410
 
411
  all_chunks = []
 
 
412
  for file_path in table_files:
413
  try:
414
  local_path = hf_hub_download(
@@ -425,20 +536,22 @@ def load_table_documents(repo_id, hf_token, table_dir):
425
 
426
  for sheet in data.get('sheets', []):
427
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
 
428
 
429
- # Use the consistent MAX_CHARS_TABLE from config
430
- chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
 
431
  all_chunks.extend(chunks)
432
 
433
  except Exception as e:
434
  log_message(f"Error loading {file_path}: {e}")
435
 
436
- log_message(f"✓ Loaded {len(all_chunks)} table chunks")
 
 
437
  return all_chunks
438
 
439
-
440
  def load_image_documents(repo_id, hf_token, image_dir):
441
- """Load image descriptions"""
442
  log_message("Loading images...")
443
 
444
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
@@ -484,7 +597,6 @@ def load_image_documents(repo_id, hf_token, image_dir):
484
 
485
  return documents
486
 
487
-
488
  def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
489
  """Main loader - combines all document types"""
490
  log_message("="*60)
 
6
  from llama_index.core.text_splitter import SentenceSplitter
7
  from my_logging import log_message
8
  from config import CHUNK_SIZE, CHUNK_OVERLAP, MAX_CHARS_TABLE, MAX_ROWS_TABLE
9
+ import re
10
+
11
def normalize_text(text):
    """Normalize homoglyph welding-type designations to Latin letters.

    Converts the Cyrillic letter 'С' (U+0421) to the Latin 'C' when it
    prefixes a hyphenated number or a digit, so welding types such as
    'С-25' become 'C-25'.  This mirrors normalize_steel_designations(),
    which maps Cyrillic homoglyphs in steel grades to Latin, keeping
    user queries and indexed table content in a single alphabet.

    Falsy input (None, empty string) is returned unchanged.
    """
    if not text:
        return text

    # Cyrillic 'С-' -> Latin 'C-'.  The previous code replaced 'С-' with
    # a bare 'C', silently dropping the hyphen from the designation.
    text = text.replace('С-', 'C-')
    # Cyrillic 'С' directly before a digit -> Latin 'C' (e.g. 'С25' -> 'C25').
    # The previous pattern replaced Cyrillic 'С' with itself (homoglyph typo).
    text = re.sub(r'\bС(\d)', r'C\1', text)
    return text
20
+
21
def normalize_steel_designations(text):
    """Convert Cyrillic homoglyphs in steel-grade designations to Latin.

    Steel grades such as '08Х18Н10Т' are often typed with Cyrillic letters
    that are visually identical to Latin ones, which breaks retrieval
    against Latin spellings used elsewhere in the index.  Every
    grade-shaped token (1-3 leading digits followed by letter/digit
    groups, plus welding-wire forms like 'СВ-08Х19Н10') is rewritten
    through a Cyrillic->Latin homoglyph map.

    Args:
        text: Arbitrary string (may be None/empty).

    Returns:
        Tuple ``(normalized_text, changes_count, changes_list)`` where
        ``changes_list`` holds human-readable "original → converted"
        entries.  Falsy input yields ``(text, 0, [])``.
    """
    if not text:
        return text, 0, []

    # Note: relies on the module-level ``import re`` — the previous
    # version re-imported re locally on every call.
    changes_count = 0
    changes_list = []

    # Cyrillic -> Latin homoglyphs that occur in steel designations.
    replacements = {
        'Х': 'X',
        'Н': 'H',
        'Т': 'T',
        'С': 'C',
        'В': 'B',
        'К': 'K',
        'М': 'M',
        'А': 'A',
        'Р': 'P',
    }

    # Grade pattern: digits then letter(+digits) groups,
    # e.g. 08Х18Н10Т, 12Х18Н9, 10Н17Н13М2Т.
    pattern = r'\b\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
    # Welding-wire pattern, e.g. СВ-08Х19Н10 (prefix may already be Latin).
    pattern_wire = r'\b[СC][ВB]-\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'

    def replace_in_steel_grade(match):
        # Rewrites one matched token and records the change for logging.
        nonlocal changes_count, changes_list
        original = match.group(0)
        converted = ''.join(replacements.get(ch, ch) for ch in original)
        if converted != original:
            changes_count += 1
            changes_list.append(f"{original} → {converted}")
        return converted

    normalized_text = re.sub(pattern, replace_in_steel_grade, text)
    # Second pass catches the wire prefix (СВ-) left untouched by the
    # digit-first grade pattern.
    normalized_text = re.sub(pattern_wire, replace_in_steel_grade, normalized_text)

    return normalized_text, changes_count, changes_list
62
 
63
  def chunk_text_documents(documents):
64
  text_splitter = SentenceSplitter(
65
  chunk_size=CHUNK_SIZE,
66
  chunk_overlap=CHUNK_OVERLAP
67
  )
68
+ total_normalizations = 0
69
+ chunks_with_changes = 0
70
 
71
  chunked = []
72
  for doc in documents:
73
  chunks = text_splitter.get_nodes_from_documents([doc])
74
  for i, chunk in enumerate(chunks):
75
+ original_text = chunk.text
76
+ chunk.text, changes, change_list = normalize_steel_designations(chunk.text)
77
+
78
+ if changes > 0:
79
+ chunks_with_changes += 1
80
+ total_normalizations += changes
81
+
82
  chunk.metadata.update({
83
  'chunk_id': i,
84
  'total_chunks': len(chunks),
85
+ 'chunk_size': len(chunk.text)
86
  })
87
  chunked.append(chunk)
88
 
 
93
  max_size = max(len(c.text) for c in chunked)
94
  log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
95
  log_message(f" Size stats: avg={avg_size:.0f}, min={min_size}, max={max_size} chars")
96
+ log_message(f" Steel designation normalization:")
97
+ log_message(f" - Chunks with changes: {chunks_with_changes}/{len(chunked)}")
98
+ log_message(f" - Total steel grades normalized: {total_normalizations}")
99
+ log_message(f" - Avg per affected chunk: {total_normalizations/chunks_with_changes:.1f}" if chunks_with_changes > 0 else " - No normalizations needed")
100
 
101
+ log_message("="*60)
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
+ return chunked
104
 
105
  def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
106
  headers = table_data.get('headers', [])
 
108
  table_num = table_data.get('table_number', 'unknown')
109
  table_title = table_data.get('table_title', '')
110
  section = table_data.get('section', '')
111
+ sheet_name = table_data.get('sheet_name', '')
112
 
113
+ # Apply steel designation normalization to title and section
114
+ table_title, title_changes, title_list = normalize_steel_designations(str(table_title))
115
+ section, section_changes, section_list = normalize_steel_designations(section)
116
+
117
  table_num_clean = str(table_num).strip()
 
118
 
119
  import re
120
+
121
+ if table_num_clean in ['-', '', 'unknown', 'nan']:
122
+ if 'приложени' in sheet_name.lower() or 'приложени' in section.lower():
123
+ appendix_match = re.search(r'приложени[еия]\s*[№]?\s*(\d+)',
124
+ (sheet_name + ' ' + section).lower())
125
+ if appendix_match:
126
+ appendix_num = appendix_match.group(1)
127
+ table_identifier = f"Приложение {appendix_num}"
128
+ else:
129
+ table_identifier = "Приложение"
130
  else:
131
+ if table_title:
132
+ first_words = ' '.join(table_title.split()[:5])
133
+ table_identifier = f"{first_words}"
134
+ else:
135
+ table_identifier = section.split(',')[0] if section else "БезНомера"
136
  else:
137
+ if 'приложени' in section.lower():
138
+ appendix_match = re.search(r'приложени[еия]\s*[№]?\s*(\d+)', section.lower())
139
+ if appendix_match:
140
+ appendix_num = appendix_match.group(1)
141
+ table_identifier = f"{table_num_clean} Приложение {appendix_num}"
142
+ else:
143
+ table_identifier = table_num_clean
144
+ else:
145
+ table_identifier = table_num_clean
146
 
147
  if not rows:
148
  return []
149
 
150
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
151
 
152
+ # Normalize all row content (including steel designations)
153
+ normalized_rows = []
154
+ total_row_changes = 0
155
+ rows_with_changes = 0
156
+ all_row_changes = []
157
+
158
+ for row in rows:
159
+ if isinstance(row, dict):
160
+ normalized_row = {}
161
+ row_had_changes = False
162
+ for k, v in row.items():
163
+ normalized_val, changes, change_list = normalize_steel_designations(str(v))
164
+ normalized_row[k] = normalized_val
165
+ if changes > 0:
166
+ total_row_changes += changes
167
+ row_had_changes = True
168
+ all_row_changes.extend(change_list) # NEW
169
+ if row_had_changes:
170
+ rows_with_changes += 1
171
+ normalized_rows.append(normalized_row)
172
+ else:
173
+ normalized_rows.append(row)
174
+
175
+ # Log normalization stats with examples
176
+ if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
177
+ log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
178
+ f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
179
+ if title_list:
180
+ log_message(f" Title changes: {', '.join(title_list[:3])}")
181
+ if section_list:
182
+ log_message(f" Section changes: {', '.join(section_list[:3])}")
183
+ if all_row_changes:
184
+ log_message(f" Row examples: {', '.join(all_row_changes[:5])}")
185
+ base_content = format_table_header(doc_id, table_identifier, table_num,
186
+ table_title, section, headers,
187
+ sheet_name)
188
  base_size = len(base_content)
189
  available_space = max_chars - base_size - 200
190
 
191
  # If entire table fits, return as one chunk
192
+ full_rows_content = format_table_rows([{**row, '_idx': i+1}
193
+ for i, row in enumerate(normalized_rows)])
194
+
195
+ if base_size + len(full_rows_content) <= max_chars and len(normalized_rows) <= max_rows:
196
  content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
197
 
198
  metadata = {
199
  'type': 'table',
200
  'document_id': doc_id,
201
+ 'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
202
+ 'table_identifier': table_identifier,
203
+ 'table_title': table_title,
204
  'section': section,
205
+ 'sheet_name': sheet_name,
206
+ 'total_rows': len(normalized_rows),
207
  'chunk_size': len(content),
208
+ 'is_complete_table': True,
209
+ 'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
210
  }
211
 
212
+ log_message(f" Single chunk: {len(content)} chars, {len(normalized_rows)} rows")
213
  return [Document(text=content, metadata=metadata)]
214
 
215
  chunks = []
 
217
  current_size = 0
218
  chunk_num = 0
219
 
220
+ for i, row in enumerate(normalized_rows):
221
  row_text = format_single_row(row, i + 1)
222
  row_size = len(row_text)
223
 
224
+ should_split = (current_size + row_size > available_space or
225
+ len(current_rows) >= max_rows) and current_rows
226
 
227
  if should_split:
228
  content = base_content + format_table_rows(current_rows)
229
+ content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(normalized_rows)}\n"
230
  content += format_table_footer(table_identifier, doc_id)
231
 
232
  metadata = {
233
  'type': 'table',
234
  'document_id': doc_id,
235
+ 'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
236
+ 'table_identifier': table_identifier,
237
+ 'table_title': table_title,
238
  'section': section,
239
+ 'sheet_name': sheet_name,
240
  'chunk_id': chunk_num,
241
  'row_start': current_rows[0]['_idx'] - 1,
242
  'row_end': current_rows[-1]['_idx'],
243
+ 'total_rows': len(normalized_rows),
244
  'chunk_size': len(content),
245
+ 'is_complete_table': False,
246
+ 'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
247
  }
248
 
249
  chunks.append(Document(text=content, metadata=metadata))
 
253
  current_rows = []
254
  current_size = 0
255
 
 
256
  row_copy = row.copy() if isinstance(row, dict) else {'data': row}
257
  row_copy['_idx'] = i + 1
258
  current_rows.append(row_copy)
259
  current_size += row_size
260
+
 
261
  if current_rows:
262
  content = base_content + format_table_rows(current_rows)
263
+ content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(normalized_rows)}\n"
264
  content += format_table_footer(table_identifier, doc_id)
265
 
266
  metadata = {
267
  'type': 'table',
268
  'document_id': doc_id,
269
+ 'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
270
+ 'table_identifier': table_identifier,
271
+ 'table_title': table_title,
272
  'section': section,
273
+ 'sheet_name': sheet_name,
274
  'chunk_id': chunk_num,
275
  'row_start': current_rows[0]['_idx'] - 1,
276
  'row_end': current_rows[-1]['_idx'],
277
+ 'total_rows': len(normalized_rows),
278
  'chunk_size': len(content),
279
+ 'is_complete_table': False,
280
+ 'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
281
  }
282
 
283
  chunks.append(Document(text=content, metadata=metadata))
 
285
 
286
  return chunks
287
 
288
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers, sheet_name=''):
    """Build the searchable header text that precedes a table chunk.

    Emits the table identifier, number, sheet, title, section and column
    headers, running the designation-bearing fields through
    normalize_text() so homoglyph welding types match normalized
    queries.  Ends with a 'ДАННЫЕ:' marker after which the caller
    appends the formatted rows.
    """
    # str() guards: table_num / table_identifier / headers may arrive as
    # non-string JSON values (e.g. ints), and normalize_text calls
    # str.replace on its argument, so a bare int would raise.
    content = f"ТАБЛИЦА {normalize_text(str(table_identifier))} из документа {doc_id}\n"

    # Multiple searchable identifiers improve BM25/vector recall.
    if table_num and table_num not in ['-', 'unknown']:
        content += f"НОМЕР ТАБЛИЦЫ: {normalize_text(str(table_num))}\n"

    if sheet_name:
        content += f"ЛИСТ: {sheet_name}\n"

    if table_title:
        content += f"НАЗВАНИЕ: {normalize_text(str(table_title))}\n"

    if section:
        content += f"РАЗДЕЛ: {section}\n"

    content += f"КЛЮЧЕВЫЕ СЛОВА: материалы стали марки стандарты {doc_id}\n"

    content += f"{'='*70}\n"

    if headers:
        # Normalize headers too, so column names match normalized queries.
        normalized_headers = [normalize_text(str(h)) for h in headers]
        header_str = ' | '.join(normalized_headers)
        content += f"ЗАГОЛОВКИ: {header_str}\n\n"

    content += "ДАННЫЕ:\n"
    return content
316
 
 
317
  def format_single_row(row, idx):
 
318
  if isinstance(row, dict):
319
  parts = [f"{k}: {v}" for k, v in row.items()
320
  if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
 
326
  return f"{idx}. {' | '.join(parts)}\n"
327
  return ""
328
 
 
329
def format_table_rows(rows):
    """Render a list of row dicts into numbered text lines.

    Each row is expected to carry an '_idx' key (1-based row number set
    by the chunker); rows without it render with index 0.
    """
    rendered = [format_single_row(entry, entry.get('_idx', 0)) for entry in rows]
    return ''.join(rendered)
335
 
 
336
def format_table_footer(table_identifier, doc_id):
    """Render the delimiter line that closes a formatted table chunk."""
    rule = '=' * 70
    return f"\n{rule}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
338
 
339
  def load_json_documents(repo_id, hf_token, json_dir):
 
404
  stats['failed'] += 1
405
  continue
406
 
 
407
  try:
408
  text_content = file_content.decode('utf-8')
409
  except UnicodeDecodeError:
 
411
  text_content = file_content.decode('utf-8-sig')
412
  except UnicodeDecodeError:
413
  try:
 
414
  text_content = file_content.decode('utf-16')
415
  except UnicodeDecodeError:
416
  try:
 
457
  log_message(f" Success: {stats['success']}")
458
  log_message(f" Empty: {stats['empty']}")
459
  log_message(f" Failed: {stats['failed']}")
 
460
  log_message(f"="*60)
461
 
462
  return documents
463
 
464
  def extract_sections_from_json(json_path):
 
465
  documents = []
466
 
467
  try:
 
511
 
512
  return documents
513
 
 
514
  def load_table_documents(repo_id, hf_token, table_dir):
515
  log_message("Loading tables...")
516
+ log_message("="*60)
517
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
518
  table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
519
 
520
  all_chunks = []
521
+ tables_processed = 0
522
+
523
  for file_path in table_files:
524
  try:
525
  local_path = hf_hub_download(
 
536
 
537
  for sheet in data.get('sheets', []):
538
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
539
+ tables_processed += 1
540
 
541
+ chunks = chunk_table_by_content(sheet, sheet_doc_id,
542
+ max_chars=MAX_CHARS_TABLE,
543
+ max_rows=MAX_ROWS_TABLE)
544
  all_chunks.extend(chunks)
545
 
546
  except Exception as e:
547
  log_message(f"Error loading {file_path}: {e}")
548
 
549
+ log_message(f"✓ Loaded {len(all_chunks)} table chunks from {tables_processed} tables")
550
+ log_message("="*60)
551
+
552
  return all_chunks
553
 
 
554
  def load_image_documents(repo_id, hf_token, image_dir):
 
555
  log_message("Loading images...")
556
 
557
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
 
597
 
598
  return documents
599
 
 
600
  def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
601
  """Main loader - combines all document types"""
602
  log_message("="*60)
index_retriever.py CHANGED
@@ -10,7 +10,6 @@ from config import CUSTOM_PROMPT, PROMPT_SIMPLE_POISK
10
 
11
  def create_vector_index(documents):
12
  log_message("Строю векторный индекс")
13
-
14
  connection_type_sources = {}
15
  table_count = 0
16
 
@@ -22,21 +21,9 @@ def create_vector_index(documents):
22
  table_id = f"{doc.metadata.get('document_id', 'unknown')} Table {doc.metadata.get('table_number', 'N/A')}"
23
  if conn_type not in connection_type_sources:
24
  connection_type_sources[conn_type] = []
25
- connection_type_sources[conn_type].append(table_id)
26
-
27
- log_message("="*60)
28
- log_message(f"INDEXING {table_count} TABLE CHUNKS")
29
- log_message("CONNECTION TYPES IN INDEX WITH SOURCES:")
30
- for conn_type in sorted(connection_type_sources.keys()):
31
- sources = list(set(connection_type_sources[conn_type])) # Unique sources
32
- log_message(f" {conn_type}: {len(connection_type_sources[conn_type])} chunks from {len(sources)} tables")
33
- for src in sources:
34
- log_message(f" - {src}")
35
- log_message("="*60)
36
-
37
  return VectorStoreIndex.from_documents(documents)
38
 
39
-
40
  def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
41
  if not nodes or not reranker:
42
  return nodes[:top_k]
@@ -48,13 +35,10 @@ def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
48
  scores = reranker.predict(pairs)
49
  scored_nodes = list(zip(nodes, scores))
50
 
51
- scored_nodes.sort(key=lambda x: x[1], reverse=True)
52
-
53
- # Apply threshold
54
  filtered = [(node, score) for node, score in scored_nodes if score >= min_score_threshold]
55
 
56
  if not filtered:
57
- # Lower threshold if nothing passes
58
  filtered = scored_nodes[:top_k]
59
 
60
  log_message(f"Выбрано {min(len(filtered), top_k)} узлов")
@@ -65,24 +49,25 @@ def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
65
  log_message(f"Ошибка переранжировки: {str(e)}")
66
  return nodes[:top_k]
67
 
68
- def create_query_engine(vector_index):
 
69
  try:
70
  from config import CUSTOM_PROMPT
71
 
72
  bm25_retriever = BM25Retriever.from_defaults(
73
  docstore=vector_index.docstore,
74
- similarity_top_k=70
75
  )
76
 
77
  vector_retriever = VectorIndexRetriever(
78
  index=vector_index,
79
- similarity_top_k=70,
80
- similarity_cutoff=0.6
81
  )
82
 
83
  hybrid_retriever = QueryFusionRetriever(
84
  [vector_retriever, bm25_retriever],
85
- similarity_top_k=140,
86
  num_queries=1
87
  )
88
 
@@ -97,7 +82,9 @@ def create_query_engine(vector_index):
97
  response_synthesizer=response_synthesizer
98
  )
99
 
100
- log_message("Query engine успешно создан")
 
 
101
  return query_engine
102
 
103
  except Exception as e:
 
10
 
11
  def create_vector_index(documents):
12
  log_message("Строю векторный индекс")
 
13
  connection_type_sources = {}
14
  table_count = 0
15
 
 
21
  table_id = f"{doc.metadata.get('document_id', 'unknown')} Table {doc.metadata.get('table_number', 'N/A')}"
22
  if conn_type not in connection_type_sources:
23
  connection_type_sources[conn_type] = []
24
+ connection_type_sources[conn_type].append(table_id)
 
 
 
 
 
 
 
 
 
 
 
25
  return VectorStoreIndex.from_documents(documents)
26
 
 
27
  def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
28
  if not nodes or not reranker:
29
  return nodes[:top_k]
 
35
  scores = reranker.predict(pairs)
36
  scored_nodes = list(zip(nodes, scores))
37
 
38
+ scored_nodes.sort(key=lambda x: x[1], reverse=True)
 
 
39
  filtered = [(node, score) for node, score in scored_nodes if score >= min_score_threshold]
40
 
41
  if not filtered:
 
42
  filtered = scored_nodes[:top_k]
43
 
44
  log_message(f"Выбрано {min(len(filtered), top_k)} узлов")
 
49
  log_message(f"Ошибка переранжировки: {str(e)}")
50
  return nodes[:top_k]
51
 
52
+ def create_query_engine(vector_index, vector_top_k=50, bm25_top_k=50,
53
+ similarity_cutoff=0.55, hybrid_top_k=100):
54
  try:
55
  from config import CUSTOM_PROMPT
56
 
57
  bm25_retriever = BM25Retriever.from_defaults(
58
  docstore=vector_index.docstore,
59
+ similarity_top_k=bm25_top_k
60
  )
61
 
62
  vector_retriever = VectorIndexRetriever(
63
  index=vector_index,
64
+ similarity_top_k=vector_top_k,
65
+ similarity_cutoff=similarity_cutoff
66
  )
67
 
68
  hybrid_retriever = QueryFusionRetriever(
69
  [vector_retriever, bm25_retriever],
70
+ similarity_top_k=hybrid_top_k,
71
  num_queries=1
72
  )
73
 
 
82
  response_synthesizer=response_synthesizer
83
  )
84
 
85
+ log_message(f"Query engine created: vector_top_k={vector_top_k}, "
86
+ f"bm25_top_k={bm25_top_k}, similarity_cutoff={similarity_cutoff}, "
87
+ f"hybrid_top_k={hybrid_top_k}")
88
  return query_engine
89
 
90
  except Exception as e:
table_prep.py DELETED
@@ -1,229 +0,0 @@
1
- from collections import defaultdict
2
- import json
3
- from huggingface_hub import hf_hub_download, list_repo_files
4
- from llama_index.core import Document
5
- from my_logging import log_message
6
- from config import MAX_CHARS_TABLE, MAX_ROWS_TABLE
7
-
8
-
9
- def create_table_content(table_data):
10
- doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
11
- table_num = table_data.get('table_number', 'Неизвестно')
12
- table_title = table_data.get('table_title', 'Неизвестно')
13
- section = table_data.get('section', 'Неизвестно')
14
-
15
- content = f"Таблица: {table_num}\n"
16
- content += f"Название: {table_title}\n"
17
- content += f"Документ: {doc_id}\n"
18
- content += f"Раздел: {section}\n"
19
-
20
- headers = table_data.get('headers', [])
21
- if headers:
22
- content += f"\nЗаголовки: {' | '.join(headers)}\n"
23
-
24
- if 'data' in table_data and isinstance(table_data['data'], list):
25
- content += "\nДанные таблицы:\n"
26
- for row_idx, row in enumerate(table_data['data'], start=1):
27
- if isinstance(row, dict):
28
- row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
29
- content += f"Строка {row_idx}: {row_text}\n"
30
-
31
- return content
32
-
33
- def chunk_table_document(doc, max_chunk_size=MAX_CHARS_TABLE, max_rows_per_chunk=MAX_ROWS_TABLE):
34
- lines = doc.text.strip().split('\n')
35
-
36
- header_lines = []
37
- data_rows = []
38
- in_data = False
39
-
40
- for line in lines:
41
- if line.startswith('Данные таблицы:'):
42
- in_data = True
43
- header_lines.append(line)
44
- elif in_data and line.startswith('Строка'):
45
- data_rows.append(line)
46
- elif not in_data:
47
- header_lines.append(line)
48
-
49
- header = '\n'.join(header_lines) + '\n'
50
-
51
- if not data_rows:
52
- return [doc]
53
-
54
- chunks = []
55
- current_rows = []
56
- current_size = len(header)
57
-
58
- for row in data_rows:
59
- row_size = len(row) + 1
60
- # Check both limits: chunk size and row count
61
- if ((current_size + row_size > max_chunk_size or len(current_rows) >= max_rows_per_chunk) and current_rows):
62
- chunk_text = header + '\n'.join(current_rows)
63
- chunks.append(chunk_text)
64
- log_message(f"Создана часть таблицы размером {len(chunk_text)} символов с {len(current_rows)} строками")
65
- current_rows = []
66
- current_size = len(header)
67
-
68
- current_rows.append(row)
69
- current_size += row_size
70
- log_message(f"Добавлена строка к текущему чанку, текущий размер {current_size} символов")
71
-
72
- # Add final chunk
73
- if current_rows:
74
- chunk_text = header + '\n'.join(current_rows)
75
- chunks.append(chunk_text)
76
- log_message(f"Создана финальная часть таблицы размером {len(chunk_text)} символов с {len(current_rows)} строками")
77
-
78
- # Create Document objects
79
- chunked_docs = []
80
- for i, chunk_text in enumerate(chunks):
81
- chunk_doc = Document(
82
- text=chunk_text,
83
- metadata={
84
- "type": "table",
85
- "table_number": doc.metadata.get('table_number'),
86
- "document_id": doc.metadata.get('document_id'),
87
- "section": doc.metadata.get('section'),
88
- "chunk_id": i,
89
- "total_chunks": len(chunks),
90
- "is_chunked": True
91
- }
92
- )
93
- chunked_docs.append(chunk_doc)
94
-
95
- return chunked_docs
96
-
97
-
98
- # def table_to_document(table_data, document_id=None):
99
- # if not isinstance(table_data, dict):
100
- # return []
101
-
102
- # doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
103
- # table_num = table_data.get('table_number', 'Неизвестно')
104
- # table_title = table_data.get('table_title', 'Неизвестно')
105
- # section = table_data.get('section', 'Неизвестно')
106
- # table_rows = table_data.get('data', [])
107
-
108
- # if not table_rows:
109
- # return []
110
-
111
- # # Build table content
112
- # content = f"Таблица: {table_num}\n"
113
- # content += f"Название: {table_title}\n"
114
- # content += f"Документ: {doc_id}\n"
115
- # content += f"Раздел: {section}\n"
116
-
117
- # headers = table_data.get('headers', [])
118
- # if headers:
119
- # content += f"\nЗаголовки: {' | '.join(headers)}\n"
120
-
121
- # content += "\nДанные таблицы:\n"
122
- # for row_idx, row in enumerate(table_rows, start=1):
123
- # if isinstance(row, dict):
124
- # row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
125
- # content += f"Строка {row_idx}: {row_text}\n"
126
-
127
- # # Create base document
128
- # base_doc = Document(
129
- # text=content,
130
- # metadata={
131
- # "type": "table",
132
- # "table_number": table_num,
133
- # "document_id": doc_id,
134
- # "section": section
135
- # }
136
- # )
137
- # if len(content) > 4000:
138
- # chunks = chunk_table_document(base_doc)
139
- # log_message(f"Таблица {table_num} разбита на {len(chunks)} частей")
140
- # return chunk_table_document(base_doc)
141
- # return [base_doc]
142
-
143
-
144
- # def load_table_data(repo_id, hf_token, table_data_dir):
145
- # try:
146
- # files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
147
- # table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
148
-
149
- # log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
150
-
151
- # table_documents = []
152
- # stats = {
153
- # 'total_tables': 0,
154
- # 'total_size': 0,
155
- # 'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
156
- # }
157
-
158
- # for file_path in table_files:
159
- # try:
160
- # local_path = hf_hub_download(
161
- # repo_id=repo_id,
162
- # filename=file_path,
163
- # local_dir='',
164
- # repo_type="dataset",
165
- # token=hf_token
166
- # )
167
-
168
- # log_message(f"\nОбработка файла: {file_path}")
169
-
170
- # with open(local_path, 'r', encoding='utf-8') as f:
171
- # table_data = json.load(f)
172
-
173
- # if isinstance(table_data, dict):
174
- # document_id = table_data.get('document', 'unknown')
175
-
176
- # if 'sheets' in table_data:
177
- # sorted_sheets = sorted(
178
- # table_data['sheets'],
179
- # key=lambda sheet: sheet.get('table_number', '') # or use 'table_number'
180
- # )
181
-
182
- # for sheet in sorted_sheets:
183
- # sheet['document'] = document_id
184
- # docs_list = table_to_document(sheet, document_id)
185
- # table_documents.extend(docs_list)
186
-
187
- # for doc in docs_list:
188
- # stats['total_tables'] += 1
189
- # size = doc.metadata.get('content_size', 0)
190
- # stats['total_size'] += size
191
- # stats['by_document'][document_id]['count'] += 1
192
- # stats['by_document'][document_id]['size'] += size
193
- # log_message(f"Добавлена таблица {sheet.get('table_number', 'Неизвестно')} из документа {document_id}, размер {size} символов")
194
- # else:
195
- # docs_list = table_to_document(table_data, document_id)
196
- # table_documents.extend(docs_list)
197
-
198
- # for doc in docs_list:
199
- # stats['total_tables'] += 1
200
- # size = doc.metadata.get('content_size', 0)
201
- # stats['total_size'] += size
202
- # stats['by_document'][document_id]['count'] += 1
203
- # stats['by_document'][document_id]['size'] += size
204
-
205
-
206
- # except Exception as e:
207
- # log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
208
- # continue
209
-
210
- # # Log summary statistics
211
- # log_message("\n" + "=" * 60)
212
- # log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
213
- # log_message("=" * 60)
214
- # log_message(f"Всего таблиц добавлено: {stats['total_tables']}")
215
- # log_message(f"Общий размер: {stats['total_size']:,} символов")
216
- # log_message(f"Средний размер таблицы: {stats['total_size'] // stats['total_tables'] if stats['total_tables'] > 0 else 0:,} символов")
217
-
218
- # log_message("\nПо документам:")
219
- # for doc_id, doc_stats in sorted(stats['by_document'].items()):
220
- # log_message(f" • {doc_id}: {doc_stats['count']} таблиц, "
221
- # f"{doc_stats['size']:,} символов")
222
-
223
- # log_message("=" * 60)
224
-
225
- # return table_documents
226
-
227
- # except Exception as e:
228
- # log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
229
- # return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils.py CHANGED
@@ -9,6 +9,18 @@ import time
9
  from index_retriever import rerank_nodes
10
  from my_logging import log_message
11
  from config import PROMPT_SIMPLE_POISK
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  def get_llm_model(model_name):
14
  try:
@@ -172,65 +184,83 @@ def deduplicate_nodes(nodes):
172
 
173
  return unique_nodes
174
 
175
- def debug_search_tables(vector_index, search_term="С-25"):
176
- """Debug function to find all tables containing a specific term"""
177
- all_nodes = list(vector_index.docstore.docs.values())
178
 
179
- matching = []
180
- for node in all_nodes:
181
- if node.metadata.get('type') == 'table':
182
- text = node.get_content()
183
- if search_term in text or search_term in node.metadata.get('table_title', ''):
184
- matching.append({
185
- 'doc_id': node.metadata.get('document_id'),
186
- 'table_num': node.metadata.get('table_number'),
187
- 'title': node.metadata.get('table_title', '')[:100]
188
- })
189
 
190
- log_message(f"\n{'='*60}")
191
- log_message(f"DEBUG: Found {len(matching)} tables containing '{search_term}'")
192
- for m in matching:
193
- log_message(f" • {m['doc_id']} - Table {m['table_num']}: {m['title']}")
194
- log_message(f"{'='*60}\n")
 
 
 
195
 
196
- return matching
 
 
 
 
 
 
 
 
197
 
198
- from documents_prep import normalize_text
199
 
200
- # MODIFIED: Update answer_question function
201
- def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
202
- # NORMALIZE the question to convert C to С
203
  normalized_question = normalize_text(question)
 
 
 
 
 
 
 
 
 
 
 
204
 
 
 
 
 
205
  if query_engine is None:
206
  return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
207
 
208
  try:
209
  start_time = time.time()
210
- # Use NORMALIZED question for retrieval
211
- retrieved_nodes = query_engine.retriever.retrieve(normalized_question)
212
  log_message(f"user query: {question}")
213
- log_message(f"normalized query: {normalized_question}")
214
-
215
-
216
- log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
217
-
218
  unique_retrieved = deduplicate_nodes(retrieved_nodes)
219
-
220
- # DEBUG: Log what was retrieved
221
  log_message(f"RETRIEVED: unique {len(unique_retrieved)} nodes")
222
- for i, node in enumerate(unique_retrieved): # All debug
223
- table_num = node.metadata.get('table_number', 'N/A')
224
- table_title = node.metadata.get('table_title', 'N/A')
225
  doc_id = node.metadata.get('document_id', 'N/A')
226
- log_message(f" [{i+1}] {doc_id} - Table {table_num}: {table_title[:50]}")
 
 
 
 
 
 
 
 
 
 
 
 
227
  log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
228
 
229
- # Simple reranking with NORMALIZED question
230
- reranked_nodes = rerank_nodes(normalized_question, unique_retrieved, reranker, top_k=20)
231
 
232
- # Direct query without formatting - use normalized question
233
- response = query_engine.query(normalized_question)
234
 
235
  end_time = time.time()
236
  processing_time = end_time - start_time
@@ -243,7 +273,7 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
243
  <h3 style='color: #63b3ed; margin-top: 0;'>Ответ (Модель: {current_model}):</h3>
244
  <div style='line-height: 1.6; font-size: 16px;'>{response.response}</div>
245
  <div style='margin-top: 15px; padding-top: 10px; border-top: 1px solid #4a5568; font-size: 14px; color: #a0aec0;'>
246
- Время обработки: {processing_time:.2f} секунд
247
  </div>
248
  </div>"""
249
  log_message(f"Model Answer: {response.response}")
 
9
  from index_retriever import rerank_nodes
10
  from my_logging import log_message
11
  from config import PROMPT_SIMPLE_POISK
12
+ from config import QUERY_EXPANSION_PROMPT
13
+ from documents_prep import normalize_text, normalize_steel_designations
14
+
15
+
16
# Steel-designation -> product-category context words.
# When a user query mentions one of these grade designations, the listed
# Russian category terms are appended to the query before retrieval
# (see enhance_query_with_keywords) to bias the retriever toward the
# relevant product tables.
KEYWORD_EXPANSIONS = {
    "08X18H10T": ["Листы", "Трубы", "Поковки", "Крепежные изделия", "Сортовой прокат", "Отливки"],
    "12X18H10T": ["Листы", "Поковки", "Сортовой прокат"],
    "10X17H13M2T": ["Трубы", "Арматура", "Поковки", "Фланцы"],
    "20X23H18": ["Листы", "Сортовой прокат", "Поковки"],
    "03X17H14M3": ["Трубы", "Листы", "Проволока"],
    "СВ-08X19H10": ["Сварочная проволока", "Сварка", "Сварочные материалы"],
}
24
 
25
  def get_llm_model(model_name):
26
  try:
 
184
 
185
  return unique_nodes
186
 
187
def enhance_query_with_keywords(query, expansions=None):
    """Extend *query* with product-category context for known steel grades.

    Scans the query case-insensitively for every designation in
    *expansions* (defaults to the module-level KEYWORD_EXPANSIONS map) and,
    for each designation found, appends its context words to the query so
    the retriever is biased toward the relevant product documents.

    Fixes vs. the previous version:
      * de-duplication of the appended context words now uses
        ``dict.fromkeys`` instead of ``set``, so the word order is
        deterministic (first occurrence wins) and retrieval results are
        reproducible run-to-run;
      * the no-match path returns ``query`` directly instead of the
        pointless ``f"{query}"`` copy;
      * *expansions* is an optional parameter so alternative keyword maps
        can be supplied (backward compatible: existing one-argument calls
        behave exactly as before).

    Args:
        query: the user's search query string.
        expansions: optional mapping of designation -> list of context
            words; ``None`` means use ``KEYWORD_EXPANSIONS``.

    Returns:
        The query, optionally followed by a space-separated, de-duplicated
        block of context words. The query text itself is never modified.
    """
    if expansions is None:
        expansions = KEYWORD_EXPANSIONS

    query_upper = query.upper()

    added_context = []
    keywords_found = []

    for keyword, keyword_expansions in expansions.items():
        keyword_upper = keyword.upper()

        if keyword_upper in query_upper:
            context = ' '.join(keyword_expansions)
            added_context.append(context)
            keywords_found.append(keyword)
            log_message(f" Found keyword '{keyword}': added context '{context}'")

    if added_context:
        # dict.fromkeys preserves insertion order (guaranteed since
        # Python 3.7), so the enhanced query is deterministic; the old
        # set()-based dedup reordered words arbitrarily between runs.
        all_words = ' '.join(added_context).split()
        unique_context = ' '.join(dict.fromkeys(all_words))
        enhanced = f"{query} {unique_context}"

        log_message(f"Enhanced query with keywords: {', '.join(keywords_found)}")
        log_message(f"Added context: {unique_context[:100]}...")

        return enhanced
    return query
211
 
 
212
 
213
+ def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
 
 
214
  normalized_question = normalize_text(question)
215
+ normalized_question_2, query_changes, change_list = normalize_steel_designations(question)
216
+ enhanced_question = enhance_query_with_keywords(normalized_question_2)
217
+
218
+ try:
219
+ llm = get_llm_model(current_model)
220
+ expansion_prompt = QUERY_EXPANSION_PROMPT.format(original_query=enhanced_question)
221
+ expanded_queries = llm.complete(expansion_prompt).text.strip()
222
+ enhanced_question = f"{enhanced_question} {expanded_queries}"
223
+ log_message(f"LLM expanded query: {expanded_queries[:200]}...")
224
+ except Exception as e:
225
+ log_message(f"Query expansion failed: {e}, using keyword-only enhancement")
226
 
227
+ if change_list:
228
+ log_message(f"Query changes: {', '.join(change_list)}")
229
+ if change_list:
230
+ log_message(f"Query changes: {', '.join(change_list)}")
231
  if query_engine is None:
232
  return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
233
 
234
  try:
235
  start_time = time.time()
236
+ retrieved_nodes = query_engine.retriever.retrieve(enhanced_question)
 
237
  log_message(f"user query: {question}")
238
+ log_message(f"after steel normalization: {normalized_question_2}")
239
+ log_message(f"enhanced query: {enhanced_question}")
 
 
 
240
  unique_retrieved = deduplicate_nodes(retrieved_nodes)
 
 
241
  log_message(f"RETRIEVED: unique {len(unique_retrieved)} nodes")
242
+ for i, node in enumerate(unique_retrieved):
243
+ node_type = node.metadata.get('type', 'text')
 
244
  doc_id = node.metadata.get('document_id', 'N/A')
245
+
246
+ if node_type == 'table':
247
+ table_num = node.metadata.get('table_number', 'N/A')
248
+ table_id = node.metadata.get('table_identifier', 'N/A')
249
+ table_title = node.metadata.get('table_title', 'N/A')
250
+ content_preview = node.text[:200].replace('\n', ' ')
251
+ log_message(f" [{i+1}] {doc_id} - Table {table_num} | ID: {table_id}")
252
+ log_message(f" Title: {table_title[:80]}")
253
+ log_message(f" Content: {content_preview}...")
254
+ else:
255
+ section = node.metadata.get('section_id', 'N/A')
256
+ log_message(f" [{i+1}] {doc_id} - Text section {section}")
257
+
258
  log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
259
 
260
+ reranked_nodes = rerank_nodes(enhanced_question, unique_retrieved, reranker,
261
+ top_k=rerank_top_k)
262
 
263
+ response = query_engine.query(enhanced_question)
 
264
 
265
  end_time = time.time()
266
  processing_time = end_time - start_time
 
273
  <h3 style='color: #63b3ed; margin-top: 0;'>Ответ (Модель: {current_model}):</h3>
274
  <div style='line-height: 1.6; font-size: 16px;'>{response.response}</div>
275
  <div style='margin-top: 15px; padding-top: 10px; border-top: 1px solid #4a5568; font-size: 14px; color: #a0aec0;'>
276
+ Время обработки: {processing_time:.2f} секунд
277
  </div>
278
  </div>"""
279
  log_message(f"Model Answer: {response.response}")