MrSimple07 committed on
Commit
ab99142
·
1 Parent(s): 5099a0a

Add a restart button and detailed logging; rename utils.py to main_utils.py

Browse files
Files changed (5) hide show
  1. app.py +38 -116
  2. app_1.py +1 -1
  3. converters/converter.py +63 -3
  4. index_retriever.py +2 -2
  5. utils.py → main_utils.py +112 -0
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
2
  import os
3
  from llama_index.core import Settings
4
  from documents_prep import load_json_documents, load_table_documents, load_image_documents
5
- from utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
6
  from my_logging import log_message
7
  from index_retriever import create_vector_index, create_query_engine
8
  import sys
@@ -11,115 +11,37 @@ from config import (
11
  JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
12
  )
13
  from converters.converter import process_uploaded_file, convert_single_excel_to_json, convert_single_excel_to_csv
 
14
 
15
-
16
- def merge_table_chunks(chunk_info):
17
- merged = {}
18
 
19
- for chunk in chunk_info:
20
- doc_type = chunk.get('type', 'text')
21
- doc_id = chunk.get('document_id', 'unknown')
22
 
23
- if doc_type == 'table' or doc_type == 'table_row':
24
- table_num = chunk.get('table_number', '')
25
- key = f"{doc_id}_{table_num}"
26
-
27
- if key not in merged:
28
- merged[key] = {
29
- 'document_id': doc_id,
30
- 'type': 'table',
31
- 'table_number': table_num,
32
- 'section_id': chunk.get('section_id', 'unknown'),
33
- 'chunk_text': chunk.get('chunk_text', '')
34
- }
35
- else:
36
- merged[key]['chunk_text'] += '\n' + chunk.get('chunk_text', '')
37
- else:
38
- unique_key = f"{doc_id}_{chunk.get('section_id', '')}_{chunk.get('chunk_id', 0)}"
39
- merged[unique_key] = chunk
40
-
41
- return list(merged.values())
42
-
43
- def create_chunks_display_html(chunk_info):
44
- if not chunk_info:
45
- return "<div style='padding: 20px; text-align: center; color: black;'>Нет данных о чанках</div>"
46
-
47
- merged_chunks = merge_table_chunks(chunk_info)
48
-
49
- html = "<div style='max-height: 500px; overflow-y: auto; padding: 10px; color: black;'>"
50
- html += f"<h4 style='color: black;'>Найдено релевантных чанков: {len(merged_chunks)}</h4>"
51
-
52
- for i, chunk in enumerate(merged_chunks):
53
- bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
54
- section_display = get_section_display(chunk)
55
- formatted_content = get_formatted_content(chunk)
56
 
57
- html += f"""
58
- <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
59
- <strong style='color: black;'>Документ:</strong> <span style='color: black;'>{chunk['document_id']}</span><br>
60
- <strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{section_display}</span><br>
61
- <strong style='color: black;'>Содержание:</strong><br>
62
- <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
63
- {formatted_content}
64
- </div>
65
- </div>
66
- """
67
-
68
- html += "</div>"
69
- return html
70
-
71
- def get_section_display(chunk):
72
- section_path = chunk.get('section_path', '')
73
- section_id = chunk.get('section_id', 'unknown')
74
- doc_type = chunk.get('type', 'text')
75
-
76
- if doc_type == 'table' and chunk.get('table_number'):
77
- table_num = chunk.get('table_number')
78
- if not str(table_num).startswith('№'):
79
- table_num = f"№{table_num}"
80
- return f"таблица {table_num}"
81
-
82
- if doc_type == 'image' and chunk.get('image_number'):
83
- image_num = chunk.get('image_number')
84
- if not str(image_num).startswith('№'):
85
- image_num = f"№{image_num}"
86
- return f"рисунок {image_num}"
87
-
88
- if section_path:
89
- return section_path
90
- elif section_id and section_id != 'unknown':
91
- return section_id
92
-
93
- return section_id
94
-
95
- def get_formatted_content(chunk):
96
- document_id = chunk.get('document_id', 'unknown')
97
- section_path = chunk.get('section_path', '')
98
- section_id = chunk.get('section_id', 'unknown')
99
- section_text = chunk.get('section_text', '')
100
- parent_section = chunk.get('parent_section', '')
101
- parent_title = chunk.get('parent_title', '')
102
- level = chunk.get('level', '')
103
- chunk_text = chunk.get('chunk_text', '')
104
- doc_type = chunk.get('type', 'text')
105
-
106
- # For text documents
107
- if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section:
108
- current_section = section_path if section_path else section_id
109
- parent_info = f"{parent_section} ({parent_title})" if parent_title else parent_section
110
- return f"В разделе {parent_info} в документе {document_id}, пункт {current_section}: {chunk_text}"
111
- else:
112
- current_section = section_path if section_path else section_id
113
- clean_text = chunk_text
114
- if section_text and chunk_text.startswith(section_text):
115
- section_title = section_text
116
- elif chunk_text.startswith(f"{current_section} "):
117
- clean_text = chunk_text[len(f"{current_section} "):].strip()
118
- section_title = section_text if section_text else f"{current_section} {clean_text.split('.')[0] if '.' in clean_text else clean_text[:50]}"
119
  else:
120
- section_title = section_text if section_text else current_section
121
 
122
- return f"В разделе {current_section} в документе {document_id}, пункт {section_title}: {clean_text}"
 
 
 
 
123
 
124
  def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
125
  json_files_dir=None, table_data_dir=None, image_data_dir=None,
@@ -190,7 +112,7 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
190
  'table_number': doc.metadata.get('table_number', ''),
191
  'image_number': doc.metadata.get('image_number', ''),
192
  'section': doc.metadata.get('section', ''),
193
- 'connection_type': doc.metadata.get('connection_type', '') # ADD THIS
194
  })
195
 
196
  log_message(f"Система успешно инициализирована")
@@ -225,15 +147,15 @@ def switch_model(model_name, vector_index):
225
  return None, f"❌ {error_msg}"
226
 
227
  retrieval_params = {
228
- 'vector_top_k': 50,
229
- 'bm25_top_k': 50,
230
- 'similarity_cutoff': 0.55,
231
- 'hybrid_top_k': 100,
232
  'rerank_top_k': 20
233
  }
234
 
235
- def create_query_engine(vector_index, vector_top_k=50, bm25_top_k=50,
236
- similarity_cutoff=0.55, hybrid_top_k=100):
237
  try:
238
  from config import CUSTOM_PROMPT
239
  from index_retriever import create_query_engine as create_index_query_engine
@@ -424,7 +346,7 @@ def create_demo_interface(answer_question_func, switch_model_func, current_model
424
  vector_top_k = gr.Slider(
425
  minimum=10,
426
  maximum=200,
427
- value=50,
428
  step=10,
429
  label="Vector Top K",
430
  info="Количество результатов из векторного поиска"
@@ -434,7 +356,7 @@ def create_demo_interface(answer_question_func, switch_model_func, current_model
434
  bm25_top_k = gr.Slider(
435
  minimum=10,
436
  maximum=200,
437
- value=50,
438
  step=10,
439
  label="BM25 Top K",
440
  info="Количество результатов из BM25 поиска"
@@ -445,7 +367,7 @@ def create_demo_interface(answer_question_func, switch_model_func, current_model
445
  similarity_cutoff = gr.Slider(
446
  minimum=0.0,
447
  maximum=1.0,
448
- value=0.55,
449
  step=0.05,
450
  label="Similarity Cutoff",
451
  info="Минимальный порог схожести для векторного поиска"
@@ -455,7 +377,7 @@ def create_demo_interface(answer_question_func, switch_model_func, current_model
455
  hybrid_top_k = gr.Slider(
456
  minimum=10,
457
  maximum=300,
458
- value=100,
459
  step=10,
460
  label="Hybrid Top K",
461
  info="Количество результатов из гибридного поиска"
@@ -497,7 +419,7 @@ def create_demo_interface(answer_question_func, switch_model_func, current_model
497
 
498
  gr.Markdown("### Текущие параметры:")
499
  current_params_display = gr.Textbox(
500
- value="Vector: 50 | BM25: 50 | Cutoff: 0.55 | Hybrid: 100 | Rerank: 20",
501
  label="",
502
  interactive=False,
503
  lines=2
 
2
  import os
3
  from llama_index.core import Settings
4
  from documents_prep import load_json_documents, load_table_documents, load_image_documents
5
+ from main_utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
6
  from my_logging import log_message
7
  from index_retriever import create_vector_index, create_query_engine
8
  import sys
 
11
  JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
12
  )
13
  from converters.converter import process_uploaded_file, convert_single_excel_to_json, convert_single_excel_to_csv
14
+ from main_utils import *
15
 
16
def restart_system():
    """Re-run system initialisation so newly uploaded documents are applied.

    Calls ``initialize_system`` with the configured repository and data
    directories. The freshly built objects are staged in local variables and
    only written to the module-level globals once initialisation reports
    success (a truthy query engine), so a failed restart does not replace the
    currently working engine/index with empty values.

    Returns:
        tuple[str, str]: a user-facing status message and an HTML fragment
        (the chunk overview on success, a short error banner otherwise).
    """
    global query_engine, chunks_df, reranker, vector_index, current_model

    try:
        log_message("Начало перезапуска системы...")

        (new_query_engine, new_chunks_df, new_reranker,
         new_vector_index, chunk_info) = initialize_system(
            repo_id=HF_REPO_ID,
            hf_token=HF_TOKEN,
            download_dir=DOWNLOAD_DIR,
            json_files_dir=JSON_FILES_DIR,
            table_data_dir=TABLE_DATA_DIR,
            image_data_dir=IMAGE_DATA_DIR,
            use_json_instead_csv=True,
        )

        if not new_query_engine:
            # Keep the previous globals untouched: the old system stays usable.
            return "❌ Ошибка при перезапуске системы", "<div style='color: red;'>Ошибка загрузки</div>"

        # Commit the new state only after a successful initialisation.
        query_engine = new_query_engine
        chunks_df = new_chunks_df
        reranker = new_reranker
        vector_index = new_vector_index

        log_message("Система успешно перезапущена")
        chunks_html = create_chunks_display_html(chunk_info)
        return "✅ Система успешно перезапущена! Новые документы загружены.", chunks_html

    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing the app.
        error_msg = f"Ошибка перезапуска: {str(e)}"
        log_message(error_msg)
        return f"❌ {error_msg}", "<div style='color: red;'>Ошибка</div>"
44
+
45
 
46
  def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
47
  json_files_dir=None, table_data_dir=None, image_data_dir=None,
 
112
  'table_number': doc.metadata.get('table_number', ''),
113
  'image_number': doc.metadata.get('image_number', ''),
114
  'section': doc.metadata.get('section', ''),
115
+ 'connection_type': doc.metadata.get('connection_type', '')
116
  })
117
 
118
  log_message(f"Система успешно инициализирована")
 
147
  return None, f"❌ {error_msg}"
148
 
149
  retrieval_params = {
150
+ 'vector_top_k': 70,
151
+ 'bm25_top_k': 70,
152
+ 'similarity_cutoff': 0.45,
153
+ 'hybrid_top_k': 140,
154
  'rerank_top_k': 20
155
  }
156
 
157
+ def create_query_engine(vector_index, vector_top_k=70, bm25_top_k=70,
158
+ similarity_cutoff=0.45, hybrid_top_k=140):
159
  try:
160
  from config import CUSTOM_PROMPT
161
  from index_retriever import create_query_engine as create_index_query_engine
 
346
  vector_top_k = gr.Slider(
347
  minimum=10,
348
  maximum=200,
349
+ value=70,
350
  step=10,
351
  label="Vector Top K",
352
  info="Количество результатов из векторного поиска"
 
356
  bm25_top_k = gr.Slider(
357
  minimum=10,
358
  maximum=200,
359
+ value=70,
360
  step=10,
361
  label="BM25 Top K",
362
  info="Количество результатов из BM25 поиска"
 
367
  similarity_cutoff = gr.Slider(
368
  minimum=0.0,
369
  maximum=1.0,
370
+ value=0.45,
371
  step=0.05,
372
  label="Similarity Cutoff",
373
  info="Минимальный порог схожести для векторного поиска"
 
377
  hybrid_top_k = gr.Slider(
378
  minimum=10,
379
  maximum=300,
380
+ value=140,
381
  step=10,
382
  label="Hybrid Top K",
383
  info="Количество результатов из гибридного поиска"
 
419
 
420
  gr.Markdown("### Текущие параметры:")
421
  current_params_display = gr.Textbox(
422
+ value="Vector: 70 | BM25: 70 | Cutoff: 0.45 | Hybrid: 140 | Rerank: 20",
423
  label="",
424
  interactive=False,
425
  lines=2
app_1.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
2
  import os
3
  from llama_index.core import Settings
4
  from documents_prep import load_json_documents, load_table_data, load_image_data, load_csv_chunks
5
- from utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
6
  from my_logging import log_message
7
  from index_retriever import create_vector_index, create_query_engine
8
  import sys
 
2
  import os
3
  from llama_index.core import Settings
4
  from documents_prep import load_json_documents, load_table_data, load_image_data, load_csv_chunks
5
+ from main_utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
6
  from my_logging import log_message
7
  from index_retriever import create_vector_index, create_query_engine
8
  import sys
converters/converter.py CHANGED
@@ -19,31 +19,66 @@ def process_uploaded_file(file, file_type):
19
  filename = os.path.basename(source_path)
20
  file_path = os.path.join(temp_dir, filename)
21
 
 
 
 
22
  if os.path.abspath(source_path) != os.path.abspath(file_path):
23
  shutil.copy(source_path, file_path)
24
  else:
25
  file_path = source_path
26
 
 
 
27
  if file_type == "Таблица":
28
  target_dir = TABLE_DATA_DIR
29
  if filename.endswith(('.xlsx', '.xls')):
30
  json_path = convert_single_excel_to_json(file_path, temp_dir)
31
  upload_file = json_path
 
 
 
 
 
 
32
  else:
33
  upload_file = file_path
 
 
34
  elif file_type == "Изображение (метаданные)":
35
  target_dir = IMAGE_DATA_DIR
36
- # Конвертируем Excel в CSV
37
  if filename.endswith(('.xlsx', '.xls')):
38
  csv_path = convert_single_excel_to_csv(file_path, temp_dir)
39
  upload_file = csv_path
 
 
 
 
 
40
  else:
41
  upload_file = file_path
 
 
 
 
 
 
42
  else: # JSON документ
43
  target_dir = JSON_FILES_DIR
44
  upload_file = file_path
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  # Загружаем на HuggingFace
 
47
  api = HfApi()
48
  api.upload_file(
49
  path_or_fileobj=upload_file,
@@ -54,7 +89,12 @@ def process_uploaded_file(file, file_type):
54
  )
55
 
56
  log_message(f"Файл {filename} успешно загружен в {target_dir}")
57
- return f"✅ Файл успешно загружен и обработан: {os.path.basename(upload_file)}\n⚠️ Перезапустите систему для применения изменений"
 
 
 
 
 
58
 
59
  except Exception as e:
60
  error_msg = f"Ошибка обработки файла: {str(e)}"
@@ -71,12 +111,18 @@ def convert_single_excel_to_json(excel_path, output_dir):
71
  "sheets": []
72
  }
73
 
 
 
 
 
74
  for sheet_name, df in df_dict.items():
75
  if df.empty or "Номер таблицы" not in df.columns:
 
76
  continue
77
 
78
  df = df.dropna(how='all').fillna("")
79
  grouped = df.groupby("Номер таблицы")
 
80
 
81
  for table_number, group in grouped:
82
  group = group.reset_index(drop=True)
@@ -100,6 +146,10 @@ def convert_single_excel_to_json(excel_path, output_dir):
100
  sheet_data["data"].append(row_dict)
101
 
102
  result["sheets"].append(sheet_data)
 
 
 
 
103
 
104
  json_filename = os.path.basename(excel_path).replace('.xlsx', '.json').replace('.xls', '.json')
105
  json_path = os.path.join(output_dir, json_filename)
@@ -107,12 +157,22 @@ def convert_single_excel_to_json(excel_path, output_dir):
107
  with open(json_path, 'w', encoding='utf-8') as f:
108
  json.dump(result, f, ensure_ascii=False, indent=2)
109
 
 
 
 
110
  return json_path
111
 
112
  def convert_single_excel_to_csv(excel_path, output_dir):
113
  """Конвертация одного Excel файла в CSV для изображений"""
 
 
114
  df = pd.read_excel(excel_path)
115
  csv_filename = os.path.basename(excel_path).replace('.xlsx', '.csv').replace('.xls', '.csv')
116
  csv_path = os.path.join(output_dir, csv_filename)
117
  df.to_csv(csv_path, index=False, encoding='utf-8')
118
- return csv_path
 
 
 
 
 
 
19
  filename = os.path.basename(source_path)
20
  file_path = os.path.join(temp_dir, filename)
21
 
22
+ log_message(f"Начало обработки файла: {filename}")
23
+ log_message(f"Тип документа: {file_type}")
24
+
25
  if os.path.abspath(source_path) != os.path.abspath(file_path):
26
  shutil.copy(source_path, file_path)
27
  else:
28
  file_path = source_path
29
 
30
+ status_info = []
31
+
32
  if file_type == "Таблица":
33
  target_dir = TABLE_DATA_DIR
34
  if filename.endswith(('.xlsx', '.xls')):
35
  json_path = convert_single_excel_to_json(file_path, temp_dir)
36
  upload_file = json_path
37
+
38
+ # Read processed data for statistics
39
+ with open(json_path, 'r', encoding='utf-8') as f:
40
+ data = json.load(f)
41
+ status_info.append(f"📊 Обработано таблиц: {len(data['sheets'])}")
42
+ status_info.append(f"📄 Листов в документе: {data['total_sheets']}")
43
  else:
44
  upload_file = file_path
45
+ status_info.append(f"📄 Загружен файл: {filename}")
46
+
47
  elif file_type == "Изображение (метаданные)":
48
  target_dir = IMAGE_DATA_DIR
 
49
  if filename.endswith(('.xlsx', '.xls')):
50
  csv_path = convert_single_excel_to_csv(file_path, temp_dir)
51
  upload_file = csv_path
52
+
53
+ # Read CSV for statistics
54
+ df = pd.read_csv(csv_path)
55
+ status_info.append(f"🖼️ Записей изображений: {len(df)}")
56
+ status_info.append(f"📋 Колонок метаданных: {len(df.columns)}")
57
  else:
58
  upload_file = file_path
59
+ # Try to read CSV for stats
60
+ try:
61
+ df = pd.read_csv(upload_file)
62
+ status_info.append(f"🖼️ Записей изображений: {len(df)}")
63
+ except:
64
+ status_info.append(f"📄 Загружен файл: {filename}")
65
  else: # JSON документ
66
  target_dir = JSON_FILES_DIR
67
  upload_file = file_path
68
+
69
+ # Try to read JSON for statistics
70
+ try:
71
+ with open(upload_file, 'r', encoding='utf-8') as f:
72
+ json_data = json.load(f)
73
+ if isinstance(json_data, list):
74
+ status_info.append(f"📝 Документов в JSON: {len(json_data)}")
75
+ elif isinstance(json_data, dict):
76
+ status_info.append(f"📝 JSON объект загружен")
77
+ except:
78
+ status_info.append(f"📄 Загружен файл: {filename}")
79
 
80
  # Загружаем на HuggingFace
81
+ log_message(f"Загрузка на HuggingFace: {target_dir}/{os.path.basename(upload_file)}")
82
  api = HfApi()
83
  api.upload_file(
84
  path_or_fileobj=upload_file,
 
89
  )
90
 
91
  log_message(f"Файл {filename} успешно загружен в {target_dir}")
92
+
93
+ result_message = f"✅ Файл успешно загружен и обработан: {os.path.basename(upload_file)}\n\n"
94
+ result_message += "\n".join(status_info)
95
+ result_message += "\n\n⚠️ Нажмите кнопку 'Перезапустить систему' для применения изменений"
96
+
97
+ return result_message
98
 
99
  except Exception as e:
100
  error_msg = f"Ошибка обработки файла: {str(e)}"
 
111
  "sheets": []
112
  }
113
 
114
+ log_message(f"Обработка файла: {os.path.basename(excel_path)}")
115
+ log_message(f"Найдено листов: {len(df_dict)}")
116
+
117
+ total_tables = 0
118
  for sheet_name, df in df_dict.items():
119
  if df.empty or "Номер таблицы" not in df.columns:
120
+ log_message(f" Лист '{sheet_name}': пропущен (пустой или отсутствует колонка 'Номер таблицы')")
121
  continue
122
 
123
  df = df.dropna(how='all').fillna("")
124
  grouped = df.groupby("Номер таблицы")
125
+ sheet_tables = 0
126
 
127
  for table_number, group in grouped:
128
  group = group.reset_index(drop=True)
 
146
  sheet_data["data"].append(row_dict)
147
 
148
  result["sheets"].append(sheet_data)
149
+ sheet_tables += 1
150
+
151
+ total_tables += sheet_tables
152
+ log_message(f" Лист '{sheet_name}': обработано таблиц: {sheet_tables}")
153
 
154
  json_filename = os.path.basename(excel_path).replace('.xlsx', '.json').replace('.xls', '.json')
155
  json_path = os.path.join(output_dir, json_filename)
 
157
  with open(json_path, 'w', encoding='utf-8') as f:
158
  json.dump(result, f, ensure_ascii=False, indent=2)
159
 
160
+ log_message(f"Конвертация завершена. Всего таблиц обработано: {total_tables}")
161
+ log_message(f"Результат сохранен: {json_filename}")
162
+
163
  return json_path
164
 
165
def convert_single_excel_to_csv(excel_path, output_dir):
    """Convert a single Excel file to CSV (used for image metadata).

    The workbook is read with ``pd.read_excel`` using its defaults and written
    as UTF-8 CSV without the index column. The CSV keeps the Excel file's base
    name with the extension swapped for ``.csv``.

    Args:
        excel_path: path to the source ``.xlsx``/``.xls`` file.
        output_dir: directory in which the CSV file is created.

    Returns:
        str: path of the written CSV file.
    """
    source_name = os.path.basename(excel_path)
    log_message(f"Конвертация Excel в CSV: {source_name}")

    df = pd.read_excel(excel_path)

    # splitext swaps only the real extension; chained str.replace would also
    # rewrite '.xls'/'.xlsx' occurring in the middle of the file name.
    csv_filename = os.path.splitext(source_name)[0] + '.csv'
    csv_path = os.path.join(output_dir, csv_filename)
    df.to_csv(csv_path, index=False, encoding='utf-8')

    log_message(f"  Строк обработано: {len(df)}")
    log_message(f"  Колонок: {len(df.columns)}")
    log_message(f"  Результат сохранен: {csv_filename}")

    return csv_path
index_retriever.py CHANGED
@@ -49,8 +49,8 @@ def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
49
  log_message(f"Ошибка переранжировки: {str(e)}")
50
  return nodes[:top_k]
51
 
52
- def create_query_engine(vector_index, vector_top_k=50, bm25_top_k=50,
53
- similarity_cutoff=0.55, hybrid_top_k=100):
54
  try:
55
  from config import CUSTOM_PROMPT
56
 
 
49
  log_message(f"Ошибка переранжировки: {str(e)}")
50
  return nodes[:top_k]
51
 
52
+ def create_query_engine(vector_index, vector_top_k=70, bm25_top_k=70,
53
+ similarity_cutoff=0.45, hybrid_top_k=140):
54
  try:
55
  from config import CUSTOM_PROMPT
56
 
utils.py → main_utils.py RENAMED
@@ -210,6 +210,118 @@ def enhance_query_with_keywords(query):
210
  return f"{query}"
211
 
212
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
214
  normalized_question = normalize_text(question)
215
  normalized_question_2, query_changes, change_list = normalize_steel_designations(question)
 
210
  return f"{query}"
211
 
212
 
213
+
214
def merge_table_chunks(chunk_info):
    """Collapse chunks that belong to the same table into a single entry.

    Chunks whose ``type`` is ``'table'`` or ``'table_row'`` are grouped by
    document id and table number. Each group becomes one new dict with
    ``type='table'``, the ``section_id`` of the first chunk seen, and the
    chunk texts joined with newlines in encounter order.

    Every other chunk is passed through as the same dict object, keyed by
    document id, section id and chunk id; a later chunk with the same key
    replaces the earlier one.

    Args:
        chunk_info: iterable of chunk dicts.

    Returns:
        list[dict]: merged chunks in first-seen order.
    """
    merged = {}
    table_texts = {}

    for chunk in chunk_info:
        doc_id = chunk.get('document_id', 'unknown')

        if chunk.get('type', 'text') in ('table', 'table_row'):
            table_num = chunk.get('table_number', '')
            # Tagged tuple keys cannot collide the way "a_b" string keys can
            # (e.g. doc "A_1" + table "2" versus doc "A" + table "1_2").
            # str() keeps 1 and "1" in the same group, as string keys did.
            key = ('table', str(doc_id), str(table_num))

            if key not in merged:
                merged[key] = {
                    'document_id': doc_id,
                    'type': 'table',
                    'table_number': table_num,
                    'section_id': chunk.get('section_id', 'unknown'),
                    'chunk_text': '',
                }
                table_texts[key] = []
            table_texts[key].append(chunk.get('chunk_text') or '')
        else:
            key = (
                'chunk',
                str(doc_id),
                str(chunk.get('section_id', '')),
                str(chunk.get('chunk_id', 0)),
            )
            merged[key] = chunk

    # Join once per table instead of growing the string chunk by chunk.
    for key, parts in table_texts.items():
        merged[key]['chunk_text'] = '\n'.join(parts)

    return list(merged.values())
240
+
241
def create_chunks_display_html(chunk_info):
    """Render the retrieved chunks as a scrollable HTML fragment.

    Table chunks are first merged with ``merge_table_chunks``; each resulting
    chunk is shown as a card with its document id, the section label from
    ``get_section_display`` and the text from ``get_formatted_content``.

    Args:
        chunk_info: list of chunk dicts; may be empty or ``None``.

    Returns:
        str: HTML markup; a "no data" placeholder when there are no chunks.
    """
    # Local import keeps the block self-contained.
    from html import escape

    if not chunk_info:
        return "<div style='padding: 20px; text-align: center; color: black;'>Нет данных о чанках</div>"

    merged_chunks = merge_table_chunks(chunk_info)

    parts = [
        "<div style='max-height: 500px; overflow-y: auto; padding: 10px; color: black;'>",
        f"<h4 style='color: black;'>Найдено релевантных чанков: {len(merged_chunks)}</h4>",
    ]

    for i, chunk in enumerate(merged_chunks):
        bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
        # The values come from document content, so escape them: a stray
        # '<' or '&' in a chunk must not break or inject markup.
        document_id = escape(str(chunk.get('document_id', 'unknown')))
        section_display = escape(str(get_section_display(chunk)))
        formatted_content = escape(str(get_formatted_content(chunk)))

        parts.append(f"""
        <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
            <strong style='color: black;'>Документ:</strong> <span style='color: black;'>{document_id}</span><br>
            <strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{section_display}</span><br>
            <strong style='color: black;'>Содержание:</strong><br>
            <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
                {formatted_content}
            </div>
        </div>
        """)

    parts.append("</div>")
    return "".join(parts)
268
+
269
def _numbered_label(label, number):
    """Return ``"<label> №<number>"``, adding the "№" sign only if it is missing."""
    if not str(number).startswith('№'):
        number = f"№{number}"
    return f"{label} {number}"


def get_section_display(chunk):
    """Return the short location label shown for a chunk.

    Resolution order:
      1. ``type == 'table'`` with a table number  -> ``"таблица №N"``
      2. ``type == 'image'`` with an image number -> ``"рисунок №N"``
      3. a non-empty ``section_path``
      4. ``section_id`` (``'unknown'`` when the key is absent)

    Args:
        chunk: chunk dict.

    Returns:
        The label for the chunk (a string for the table/image cases, otherwise
        whatever is stored under ``section_path``/``section_id``).
    """
    doc_type = chunk.get('type', 'text')

    table_num = chunk.get('table_number')
    if doc_type == 'table' and table_num:
        return _numbered_label("таблица", table_num)

    image_num = chunk.get('image_number')
    if doc_type == 'image' and image_num:
        return _numbered_label("рисунок", image_num)

    # Both remaining fallbacks in the original returned section_id, so a
    # single expression covers them.
    return chunk.get('section_path', '') or chunk.get('section_id', 'unknown')
292
+
293
def get_formatted_content(chunk):
    """Build the descriptive sentence shown as a chunk's content.

    For nested levels (``subsection``, ``sub_subsection``,
    ``sub_sub_subsection``) that have a parent section, the text is introduced
    via the parent section (and its title, when present). Otherwise the text
    is introduced via the chunk's own section; if the chunk text starts with
    the section number, that prefix is removed from the displayed text.

    Args:
        chunk: chunk dict; a missing or ``None`` ``chunk_text``/``section_text``
            is treated as an empty string.

    Returns:
        str: the formatted description.
    """
    document_id = chunk.get('document_id', 'unknown')
    parent_section = chunk.get('parent_section', '')
    parent_title = chunk.get('parent_title', '')
    # `or ''` guards against explicit None values, which would otherwise
    # crash on .startswith() below.
    section_text = chunk.get('section_text') or ''
    chunk_text = chunk.get('chunk_text') or ''
    current_section = chunk.get('section_path', '') or chunk.get('section_id', 'unknown')

    nested_levels = ('subsection', 'sub_subsection', 'sub_sub_subsection')
    if chunk.get('level', '') in nested_levels and parent_section:
        parent_info = f"{parent_section} ({parent_title})" if parent_title else parent_section
        return f"В разделе {parent_info} в документе {document_id}, пункт {current_section}: {chunk_text}"

    clean_text = chunk_text
    section_prefix = f"{current_section} "

    if section_text and chunk_text.startswith(section_text):
        section_title = section_text
    elif chunk_text.startswith(section_prefix):
        # Drop the leading section number so it is not shown twice.
        clean_text = chunk_text[len(section_prefix):].strip()
        if section_text:
            section_title = section_text
        else:
            # No stored title: use the first sentence, or the first 50 chars.
            headline = clean_text.split('.')[0] if '.' in clean_text else clean_text[:50]
            section_title = f"{current_section} {headline}"
    else:
        section_title = section_text if section_text else current_section

    return f"В разделе {current_section} в документе {document_id}, пункт {section_title}: {clean_text}"
321
+
322
+
323
+
324
+
325
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
326
  normalized_question = normalize_text(question)
327
  normalized_question_2, query_changes, change_list = normalize_steel_designations(question)