MrSimple07 committed on
Commit
865746a
·
1 Parent(s): 1333a87

added new table prep process + some improvement in chunking

Browse files
Files changed (5) hide show
  1. app.py +86 -5
  2. config.py +1 -1
  3. documents_prep.py +152 -116
  4. table_prep.py +347 -0
  5. utils.py +258 -14
app.py CHANGED
@@ -20,13 +20,18 @@ def create_chunks_display_html(chunk_info):
20
 
21
  for i, chunk in enumerate(chunk_info):
22
  bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
 
 
 
 
 
23
  html += f"""
24
  <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
25
  <strong style='color: black;'>Документ:</strong> <span style='color: black;'>{chunk['document_id']}</span><br>
26
- <strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{chunk.get('section_id', 'unknown')}</span><br>
27
  <strong style='color: black;'>Содержание:</strong><br>
28
  <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
29
- {chunk['chunk_text']}
30
  </div>
31
  </div>
32
  """
@@ -34,12 +39,68 @@ def create_chunks_display_html(chunk_info):
34
  html += "</div>"
35
  return html
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
38
  json_files_dir=None, table_data_dir=None, image_data_dir=None,
39
  use_json_instead_csv=False):
40
  try:
 
41
  log_message("Инициализация системы")
42
  os.makedirs(download_dir, exist_ok=True)
 
 
43
 
44
  embed_model = get_embedding_model()
45
  llm = get_llm_model(DEFAULT_MODEL)
@@ -47,7 +108,16 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
47
 
48
  Settings.embed_model = embed_model
49
  Settings.llm = llm
 
 
 
 
 
 
50
 
 
 
 
51
  all_documents = []
52
  chunks_df = None
53
  chunk_info = []
@@ -66,14 +136,24 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
66
  if table_data_dir:
67
  log_message("Добавляю табличные данные")
68
  table_documents = load_table_data(repo_id, hf_token, table_data_dir)
69
- all_documents.extend(table_documents)
 
 
 
 
 
70
 
71
  if image_data_dir:
72
  log_message("Добавляю данные изображений")
73
  image_documents = load_image_data(repo_id, hf_token, image_data_dir)
74
- all_documents.extend(image_documents)
 
 
 
 
 
75
 
76
- log_message(f"Всего документов: {len(all_documents)}")
77
 
78
  vector_index = create_vector_index(all_documents)
79
  query_engine = create_query_engine(vector_index)
@@ -171,6 +251,7 @@ def create_demo_interface(answer_question_func, switch_model_func, current_model
171
  "Какой стандарт устанавливает порядок признания протоколов испытаний продукции в области использования атомной энергии?",
172
  "Кто несет ответственность за организацию и проведение признания протоколов испытаний продукции?",
173
  "В каких случаях могут быть признаны протоколы испытаний, проведенные лабораториями?",
 
174
  ],
175
  inputs=question_input
176
  )
 
20
 
21
  for i, chunk in enumerate(chunk_info):
22
  bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
23
+
24
+ # Get section display info
25
+ section_display = get_section_display(chunk)
26
+ formatted_content = get_formatted_content(chunk)
27
+
28
  html += f"""
29
  <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
30
  <strong style='color: black;'>Документ:</strong> <span style='color: black;'>{chunk['document_id']}</span><br>
31
+ <strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{section_display}</span><br>
32
  <strong style='color: black;'>Содержание:</strong><br>
33
  <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
34
+ {formatted_content}
35
  </div>
36
  </div>
37
  """
 
39
  html += "</div>"
40
  return html
41
 
42
+ def get_section_display(chunk):
43
+ section_path = chunk.get('section_path', '')
44
+ section_id = chunk.get('section_id', 'unknown')
45
+ doc_type = chunk.get('type', 'text')
46
+
47
+ if doc_type == 'table' and chunk.get('table_number'):
48
+ table_num = chunk.get('table_number')
49
+ if not str(table_num).startswith('№'):
50
+ table_num = f"№{table_num}"
51
+ return f"таблица {table_num}"
52
+
53
+ if doc_type == 'image' and chunk.get('image_number'):
54
+ image_num = chunk.get('image_number')
55
+ if not str(image_num).startswith('№'):
56
+ image_num = f"№{image_num}"
57
+ return f"рисунок {image_num}"
58
+
59
+ if section_path:
60
+ return section_path
61
+ elif section_id and section_id != 'unknown':
62
+ return section_id
63
+
64
+ return section_id
65
+
66
+ def get_formatted_content(chunk):
67
+ document_id = chunk.get('document_id', 'unknown')
68
+ section_path = chunk.get('section_path', '')
69
+ section_id = chunk.get('section_id', 'unknown')
70
+ section_text = chunk.get('section_text', '')
71
+ parent_section = chunk.get('parent_section', '')
72
+ parent_title = chunk.get('parent_title', '')
73
+ level = chunk.get('level', '')
74
+ chunk_text = chunk.get('chunk_text', '')
75
+ doc_type = chunk.get('type', 'text')
76
+
77
+ # For text documents
78
+ if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section:
79
+ current_section = section_path if section_path else section_id
80
+ parent_info = f"{parent_section} ({parent_title})" if parent_title else parent_section
81
+ return f"В разделе {parent_info} в документе {document_id}, пункт {current_section}: {chunk_text}"
82
+ else:
83
+ current_section = section_path if section_path else section_id
84
+ clean_text = chunk_text
85
+ if section_text and chunk_text.startswith(section_text):
86
+ section_title = section_text
87
+ elif chunk_text.startswith(f"{current_section} "):
88
+ clean_text = chunk_text[len(f"{current_section} "):].strip()
89
+ section_title = section_text if section_text else f"{current_section} {clean_text.split('.')[0] if '.' in clean_text else clean_text[:50]}"
90
+ else:
91
+ section_title = section_text if section_text else current_section
92
+
93
+ return f"В разделе {current_section} в документе {document_id}, пункт {section_title}: {clean_text}"
94
+
95
  def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
96
  json_files_dir=None, table_data_dir=None, image_data_dir=None,
97
  use_json_instead_csv=False):
98
  try:
99
+ from documents_prep import process_documents_with_chunking
100
  log_message("Инициализация системы")
101
  os.makedirs(download_dir, exist_ok=True)
102
+ from config import CHUNK_SIZE, CHUNK_OVERLAP
103
+ from llama_index.core.text_splitter import TokenTextSplitter
104
 
105
  embed_model = get_embedding_model()
106
  llm = get_llm_model(DEFAULT_MODEL)
 
108
 
109
  Settings.embed_model = embed_model
110
  Settings.llm = llm
111
+ Settings.text_splitter = TokenTextSplitter(
112
+ chunk_size=CHUNK_SIZE,
113
+ chunk_overlap=CHUNK_OVERLAP,
114
+ separator=" ",
115
+ backup_separators=["\n", ".", "!", "?"]
116
+ )
117
 
118
+ log_message(f"Configured chunk size: {CHUNK_SIZE} tokens")
119
+ log_message(f"Configured chunk overlap: {CHUNK_OVERLAP} tokens")
120
+
121
  all_documents = []
122
  chunks_df = None
123
  chunk_info = []
 
136
  if table_data_dir:
137
  log_message("Добавляю табличные данные")
138
  table_documents = load_table_data(repo_id, hf_token, table_data_dir)
139
+ log_message(f"Загружено {len(table_documents)} табличных документов")
140
+
141
+ # Process table documents through chunking
142
+ chunked_table_docs, table_chunk_info = process_documents_with_chunking(table_documents)
143
+ all_documents.extend(chunked_table_docs)
144
+ chunk_info.extend(table_chunk_info)
145
 
146
  if image_data_dir:
147
  log_message("Добавляю данные изображений")
148
  image_documents = load_image_data(repo_id, hf_token, image_data_dir)
149
+ log_message(f"Загружено {len(image_documents)} документов изображений")
150
+
151
+ # Process image documents through chunking
152
+ chunked_image_docs, image_chunk_info = process_documents_with_chunking(image_documents)
153
+ all_documents.extend(chunked_image_docs)
154
+ chunk_info.extend(image_chunk_info)
155
 
156
+ log_message(f"Всего документов после всей обработки: {len(all_documents)}")
157
 
158
  vector_index = create_vector_index(all_documents)
159
  query_engine = create_query_engine(vector_index)
 
251
  "Какой стандарт устанавливает порядок признания протоколов испытаний продукции в области использования атомной энергии?",
252
  "Кто несет ответственность за организацию и проведение признания протоколов испытаний продукции?",
253
  "В каких случаях могут быть признаны протоколы испытаний, проведенные лабораториями?",
254
+ "В какой таблице можно найти информацию о методы исследований при аттестационных испытаниях технологии термической обработки заготовок из легированных сталей? Какой документ и какой раздел?"
255
  ],
256
  inputs=question_input
257
  )
config.py CHANGED
@@ -52,7 +52,7 @@ AVAILABLE_MODELS = {
52
 
53
  DEFAULT_MODEL = "Gemini 2.5 Flash"
54
 
55
- CHUNK_SIZE = 2048
56
  CHUNK_OVERLAP = 256
57
 
58
  CUSTOM_PROMPT = """
 
52
 
53
  DEFAULT_MODEL = "Gemini 2.5 Flash"
54
 
55
+ CHUNK_SIZE = 25000
56
  CHUNK_OVERLAP = 256
57
 
58
  CUSTOM_PROMPT = """
documents_prep.py CHANGED
@@ -6,9 +6,14 @@ from llama_index.core import Document
6
  from my_logging import log_message
7
  from llama_index.core.text_splitter import SentenceSplitter
8
  from config import CHUNK_SIZE, CHUNK_OVERLAP
 
9
 
10
 
11
- def chunk_document(doc, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
 
 
 
 
12
  text_splitter = SentenceSplitter(
13
  chunk_size=chunk_size,
14
  chunk_overlap=chunk_overlap,
@@ -35,33 +40,145 @@ def chunk_document(doc, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
35
 
36
  return chunked_docs
37
 
38
-
39
  def process_documents_with_chunking(documents):
40
  all_chunked_docs = []
41
  chunk_info = []
 
 
 
 
 
 
42
 
43
  for doc in documents:
44
- if len(doc.text) > CHUNK_SIZE:
45
- chunked_docs = chunk_document(doc)
46
- all_chunked_docs.extend(chunked_docs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
- for i, chunk_doc in enumerate(chunked_docs):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  chunk_info.append({
50
- 'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
51
- 'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
52
- 'chunk_id': i,
53
- 'chunk_size': len(chunk_doc.text),
54
- 'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text
 
 
55
  })
56
- else:
57
- all_chunked_docs.append(doc)
58
- chunk_info.append({
59
- 'document_id': doc.metadata.get('document_id', 'unknown'),
60
- 'section_id': doc.metadata.get('section_id', 'unknown'),
61
- 'chunk_id': 0,
62
- 'chunk_size': len(doc.text),
63
- 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text
64
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
  return all_chunked_docs, chunk_info
67
 
@@ -189,6 +306,7 @@ def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
189
 
190
  documents = extract_zip_and_process_json(local_zip_path)
191
  all_documents.extend(documents)
 
192
 
193
  except Exception as e:
194
  log_message(f"Ошибка обработки ZIP файла {zip_file_path}: {str(e)}")
@@ -221,17 +339,18 @@ def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
221
  log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
222
  continue
223
 
 
 
 
224
  chunked_documents, chunk_info = process_documents_with_chunking(all_documents)
225
 
226
- log_message(f"Всего создано {len(all_documents)} исходных документов")
227
 - log_message(f"После chunking получено {len(chunked_documents)} чанков")
228
 
229
  return chunked_documents, chunk_info
230
 
231
  except Exception as e:
232
  log_message(f"Ошибка загрузки JSON документов: {str(e)}")
233
  return [], []
234
-
235
 
236
  def extract_section_title(section_text):
237
  if not section_text.strip():
@@ -285,92 +404,6 @@ def extract_zip_and_process_json(zip_path):
285
 
286
  return documents
287
 
288
- def table_to_document(table_data, document_id=None):
289
- content = ""
290
- if isinstance(table_data, dict):
291
- doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))
292
-
293
- table_num = table_data.get('table_number', 'Неизвестно')
294
- table_title = table_data.get('table_title', 'Неизвестно')
295
- section = table_data.get('section', 'Неизвестно')
296
-
297
- content += f"Таблица: {table_num}\n"
298
- content += f"Название: {table_title}\n"
299
- content += f"Документ: {doc_id}\n"
300
- content += f"Раздел: {section}\n"
301
-
302
- if 'data' in table_data and isinstance(table_data['data'], list):
303
- for row in table_data['data']:
304
- if isinstance(row, dict):
305
- row_text = " | ".join([f"{k}: {v}" for k, v in row.items()])
306
- content += f"{row_text}\n"
307
-
308
- return Document(
309
- text=content,
310
- metadata={
311
- "type": "table",
312
- "table_number": table_data.get('table_number', 'unknown'),
313
- "table_title": table_data.get('table_title', 'unknown'),
314
- "document_id": doc_id or table_data.get('document_id', table_data.get('document', 'unknown')),
315
- "section": table_data.get('section', 'unknown'),
316
- "section_id": table_data.get('section', 'unknown')
317
- }
318
- )
319
-
320
- def load_table_data(repo_id, hf_token, table_data_dir):
321
- log_message("Начинаю загрузку табличных данных")
322
-
323
- table_files = []
324
- try:
325
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
326
- for file in files:
327
- if file.startswith(table_data_dir) and file.endswith('.json'):
328
- table_files.append(file)
329
-
330
- log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
331
-
332
- table_documents = []
333
- for file_path in table_files:
334
- try:
335
- log_message(f"Обрабатываю файл: {file_path}")
336
- local_path = hf_hub_download(
337
- repo_id=repo_id,
338
- filename=file_path,
339
- local_dir='',
340
- repo_type="dataset",
341
- token=hf_token
342
- )
343
-
344
- with open(local_path, 'r', encoding='utf-8') as f:
345
- table_data = json.load(f)
346
-
347
- if isinstance(table_data, dict):
348
- document_id = table_data.get('document', 'unknown')
349
-
350
- if 'sheets' in table_data:
351
- for sheet in table_data['sheets']:
352
- sheet['document'] = document_id
353
- doc = table_to_document(sheet, document_id)
354
- table_documents.append(doc)
355
- else:
356
- doc = table_to_document(table_data, document_id)
357
- table_documents.append(doc)
358
- elif isinstance(table_data, list):
359
- for table_json in table_data:
360
- doc = table_to_document(table_json)
361
- table_documents.append(doc)
362
-
363
- except Exception as e:
364
- log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
365
- continue
366
-
367
- log_message(f"Создано {len(table_documents)} документов из таблиц")
368
- return table_documents
369
-
370
- except Exception as e:
371
- log_message(f"Ошибка загрузки табличных данных: {str(e)}")
372
- return []
373
-
374
  def load_image_data(repo_id, hf_token, image_data_dir):
375
  log_message("Начинаю загрузку данных изображений")
376
 
@@ -398,12 +431,13 @@ def load_image_data(repo_id, hf_token, image_data_dir):
398
  df = pd.read_csv(local_path)
399
  log_message(f"Загружено {len(df)} записей изображений из файла {file_path}")
400
 
 
401
  for _, row in df.iterrows():
402
- section_value = row.get('Раздел документа', row.get('section', 'Неизвестно'))
403
 
404
  content = f"Изображение: {row.get('№ Изображения', 'Неизвестно')}\n"
405
  content += f"Название: {row.get('Название изображения', 'Неизвестно')}\n"
406
- content += f"Описание: {row.get('Описание изображение', 'Неизвестно')}\n"
407
  content += f"Документ: {row.get('Обозначение документа', 'Неизвестно')}\n"
408
  content += f"Раздел: {section_value}\n"
409
  content += f"Файл: {row.get('Файл изображения', 'Неизвестно')}\n"
@@ -412,11 +446,13 @@ def load_image_data(repo_id, hf_token, image_data_dir):
412
  text=content,
413
  metadata={
414
  "type": "image",
415
- "image_number": row.get('№ Изображения', 'unknown'),
416
- "document_id": row.get('Обозначение документа', 'unknown'),
417
- "file_path": row.get('Файл изображения', 'unknown'),
418
- "section": section_value,
419
- "section_id": section_value
 
 
420
  }
421
  )
422
  image_documents.append(doc)
 
6
  from my_logging import log_message
7
  from llama_index.core.text_splitter import SentenceSplitter
8
  from config import CHUNK_SIZE, CHUNK_OVERLAP
9
+ from table_prep import table_to_document, load_table_data
10
 
11
 
12
+ def chunk_document(doc, chunk_size=None, chunk_overlap=None):
13
+ if chunk_size is None:
14
+ chunk_size = CHUNK_SIZE
15
+ if chunk_overlap is None:
16
+ chunk_overlap = CHUNK_OVERLAP
17
  text_splitter = SentenceSplitter(
18
  chunk_size=chunk_size,
19
  chunk_overlap=chunk_overlap,
 
40
 
41
  return chunked_docs
42
 
 
43
  def process_documents_with_chunking(documents):
44
  all_chunked_docs = []
45
  chunk_info = []
46
+ table_count = 0
47
+ image_count = 0
48
+ text_chunks_count = 0
49
+ large_tables_count = 0
50
+ large_images_count = 0
51
+ custom_processed_count = 0
52
 
53
  for doc in documents:
54
+ doc_type = doc.metadata.get('type', 'text')
55
+
56
+ if doc_type == 'table':
57
+ table_count += 1
58
+ doc_id = doc.metadata.get('document_id', 'unknown')
59
+ table_num = doc.metadata.get('table_number', 'unknown')
60
+ from table_prep import should_use_custom_processing
61
+ use_custom, doc_pattern, method_config = should_use_custom_processing(doc_id, table_num)
62
+
63
+ if use_custom:
64
+ custom_processed_count += 1
65
+ log_message(f"Table {table_num} in document {doc_id} was processed with custom method '{method_config.get('method')}', skipping standard chunking")
66
+ # Add the document as-is since it was already processed by custom method
67
+ all_chunked_docs.append(doc)
68
+ chunk_info.append({
69
+ 'document_id': doc_id,
70
+ 'section_id': doc.metadata.get('section_id', 'unknown'),
71
+ 'chunk_id': 0,
72
+ 'chunk_size': len(doc.text),
73
+ 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
74
+ 'type': 'table',
75
+ 'table_number': table_num,
76
+ 'processing_method': method_config.get('method')
77
+ })
78
+ continue
79
+
80
+ # Standard processing for non-custom tables
81
+ doc_size = len(doc.text)
82
+ if doc_size > CHUNK_SIZE:
83
+ large_tables_count += 1
84
+ log_message(f"Large table found: {table_num} in document {doc_id}, size: {doc_size} characters")
85
+
86
+ # Chunk large tables
87
+ chunked_docs = chunk_document(doc)
88
+ all_chunked_docs.extend(chunked_docs)
89
+
90
+ for i, chunk_doc in enumerate(chunked_docs):
91
+ chunk_info.append({
92
+ 'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
93
+ 'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
94
+ 'chunk_id': i,
95
+ 'chunk_size': len(chunk_doc.text),
96
+ 'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
97
+ 'type': 'table',
98
+ 'table_number': chunk_doc.metadata.get('table_number', 'unknown'),
99
+ 'processing_method': 'standard_chunked'
100
+ })
101
+ else:
102
+ all_chunked_docs.append(doc)
103
+ chunk_info.append({
104
+ 'document_id': doc.metadata.get('document_id', 'unknown'),
105
+ 'section_id': doc.metadata.get('section_id', 'unknown'),
106
+ 'chunk_id': 0,
107
+ 'chunk_size': doc_size,
108
+ 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
109
+ 'type': 'table',
110
+ 'table_number': doc.metadata.get('table_number', 'unknown'),
111
+ 'processing_method': 'standard'
112
+ })
113
 
114
+ elif doc_type == 'image':
115
+ image_count += 1
116
+ doc_size = len(doc.text)
117
+ if doc_size > CHUNK_SIZE:
118
+ large_images_count += 1
119
+ log_message(f"Large image description found: {doc.metadata.get('image_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_size} characters")
120
+
121
+ # Chunk large images
122
+ chunked_docs = chunk_document(doc)
123
+ all_chunked_docs.extend(chunked_docs)
124
+
125
+ for i, chunk_doc in enumerate(chunked_docs):
126
+ chunk_info.append({
127
+ 'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
128
+ 'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
129
+ 'chunk_id': i,
130
+ 'chunk_size': len(chunk_doc.text),
131
+ 'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
132
+ 'type': 'image',
133
+ 'image_number': chunk_doc.metadata.get('image_number', 'unknown')
134
+ })
135
+ else:
136
+ all_chunked_docs.append(doc)
137
  chunk_info.append({
138
+ 'document_id': doc.metadata.get('document_id', 'unknown'),
139
+ 'section_id': doc.metadata.get('section_id', 'unknown'),
140
+ 'chunk_id': 0,
141
+ 'chunk_size': doc_size,
142
+ 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
143
+ 'type': 'image',
144
+ 'image_number': doc.metadata.get('image_number', 'unknown')
145
  })
146
+
147
+ else: # text documents
148
+ doc_size = len(doc.text)
149
+ if doc_size > CHUNK_SIZE:
150
+ chunked_docs = chunk_document(doc)
151
+ all_chunked_docs.extend(chunked_docs)
152
+ text_chunks_count += len(chunked_docs)
153
+
154
+ for i, chunk_doc in enumerate(chunked_docs):
155
+ chunk_info.append({
156
+ 'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
157
+ 'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
158
+ 'chunk_id': i,
159
+ 'chunk_size': len(chunk_doc.text),
160
+ 'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
161
+ 'type': 'text'
162
+ })
163
+ else:
164
+ all_chunked_docs.append(doc)
165
+ chunk_info.append({
166
+ 'document_id': doc.metadata.get('document_id', 'unknown'),
167
+ 'section_id': doc.metadata.get('section_id', 'unknown'),
168
+ 'chunk_id': 0,
169
+ 'chunk_size': doc_size,
170
+ 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
171
+ 'type': 'text'
172
+ })
173
+
174
+ log_message(f"=== PROCESSING STATISTICS ===")
175
+ log_message(f"Total tables processed: {table_count}")
176
+ log_message(f"Custom processed tables: {custom_processed_count}")
177
+ log_message(f"Large tables (>{CHUNK_SIZE} chars): {large_tables_count}")
178
+ log_message(f"Total images processed: {image_count}")
179
+ log_message(f"Large images (>{CHUNK_SIZE} chars): {large_images_count}")
180
+ log_message(f"Total text chunks created: {text_chunks_count}")
181
+ log_message(f"Total documents after processing: {len(all_chunked_docs)}")
182
 
183
  return all_chunked_docs, chunk_info
184
 
 
306
 
307
  documents = extract_zip_and_process_json(local_zip_path)
308
  all_documents.extend(documents)
309
+ log_message(f"Извлечено {len(documents)} документов из ZIP архива {zip_file_path}")
310
 
311
  except Exception as e:
312
  log_message(f"Ошибка обработки ZIP файла {zip_file_path}: {str(e)}")
 
339
  log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
340
  continue
341
 
342
+ log_message(f"Всего создано {len(all_documents)} исходных документов из JSON файлов")
343
+
344
+ # Process documents through chunking function
345
  chunked_documents, chunk_info = process_documents_with_chunking(all_documents)
346
 
347
+ log_message(f"После chunking получено {len(chunked_documents)} чанков из JSON данных")
 
348
 
349
  return chunked_documents, chunk_info
350
 
351
  except Exception as e:
352
  log_message(f"Ошибка загрузки JSON документов: {str(e)}")
353
  return [], []
 
354
 
355
  def extract_section_title(section_text):
356
  if not section_text.strip():
 
404
 
405
  return documents
406
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
407
  def load_image_data(repo_id, hf_token, image_data_dir):
408
  log_message("Начинаю загрузку данных изображений")
409
 
 
431
  df = pd.read_csv(local_path)
432
  log_message(f"Загружено {len(df)} записей изображений из файла {file_path}")
433
 
434
+ # Обработка с правильными названиями колонок
435
  for _, row in df.iterrows():
436
+ section_value = row.get('Раздел документа', 'Неизвестно')
437
 
438
  content = f"Изображение: {row.get('№ Изображения', 'Неизвестно')}\n"
439
  content += f"Название: {row.get('Название изображения', 'Неизвестно')}\n"
440
+ content += f"Описание: {row.get('Описание изображение', 'Неизвестно')}\n" # Опечатка в названии колонки
441
  content += f"Документ: {row.get('Обозначение документа', 'Неизвестно')}\n"
442
  content += f"Раздел: {section_value}\n"
443
  content += f"Файл: {row.get('Файл изображения', 'Неизвестно')}\n"
 
446
  text=content,
447
  metadata={
448
  "type": "image",
449
+ "image_number": str(row.get('№ Изображения', 'unknown')),
450
+ "image_title": str(row.get('Название изображения', 'unknown')),
451
+ "image_description": str(row.get('Описание изображение', 'unknown')),
452
+ "document_id": str(row.get('Обозначение документа', 'unknown')),
453
+ "file_path": str(row.get('Файл изображения', 'unknown')),
454
+ "section": str(section_value),
455
+ "section_id": str(section_value)
456
  }
457
  )
458
  image_documents.append(doc)
table_prep.py ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from collections import defaultdict
3
+ import json
4
+ import zipfile
5
+ import pandas as pd
6
+ from huggingface_hub import hf_hub_download, list_repo_files
7
+ from llama_index.core import Document
8
+ from my_logging import log_message
9
+
10
+ CUSTOM_TABLE_CONFIGS = {
11
+ "ГОСТ Р 50.05.01-2018": {
12
+ "tables": {
13
+ "№3": {"method": "group_by_column", "group_column": "Класс герметичности и чувствительности"},
14
+ "№Б.1": {"method": "group_by_column", "group_column": "Класс чувствительности системы контроля"}
15
+ }
16
+ },
17
+ "ГОСТ Р 50.06.01-2017": {
18
+ "tables": {
19
+ "№ Б.2": {"method": "split_by_rows"}
20
+ }
21
+ },
22
+ "НП-104-18": {
23
+ "tables": {
24
+ "*": {"method": "group_entire_table"} # All tables
25
+ }
26
+ },
27
+ "НП-068-05": {
28
+ "tables": {
29
+ "Таблица 1": {"method": "group_by_column", "group_column": "Рабочее давление среды, МПа"},
30
+ "Таблица 2": {"method": "group_by_column", "group_column": "Рабочее давление среды, МПа"},
31
+ "Таблица Приложения 1": {"method": "group_by_column", "group_column": "Тип"}
32
+ }
33
+ },
34
+ "ГОСТ Р 59023.1-2020": {
35
+ "tables": {
36
+ "№ 1": {"method": "split_by_rows"},
37
+ "№ 2": {"method": "split_by_rows"},
38
+ "№ 3": {"method": "split_by_rows"}
39
+ }
40
+ },
41
+ "НП-089-15": {
42
+ "tables": {
43
+ "-": {"method": "split_by_rows"}
44
+ }
45
+ },
46
+ "НП-105-18": {
47
+ "tables": {
48
+ "№ 4.8": {"method": "group_entire_table"}
49
+ }
50
+ },
51
+ "ГОСТ Р 50.05.23-2020": {
52
+ "tables": {
53
+ "№8": {"method": "group_entire_table"}
54
+ }
55
+ },
56
+ "ГОСТ Р 50.03.01-2017": {
57
+ "tables": {
58
+ "А.8": {"method": "group_entire_table"}
59
+ }
60
+ }
61
+ }
62
+
63
+ def create_meta_info(document_name, section, table_number, table_title, extra_info=""):
64
+ base_info = f'Документ "{document_name}", Раздел: {section}, Таблица: {table_number}'
65
+ if table_title and table_title.strip():
66
+ base_info += f', Название: {table_title}'
67
+ if extra_info:
68
+ base_info += f', {extra_info}'
69
+ return base_info
70
+
71
+ def create_chunk_text(meta_info, headers, rows, add_row_numbers=False):
72
+ chunk_lines = [meta_info.rstrip()] # Remove trailing newline from meta_info
73
+
74
+ # Add headers only once
75
+ header_line = " | ".join(headers)
76
+ chunk_lines.append(f"Заголовки: {header_line}")
77
+
78
+ # Add rows without redundant formatting
79
+ for i, row in enumerate(rows, start=1):
80
+ row_parts = []
81
+ for h in headers:
82
+ value = row.get(h, '')
83
+ if value: # Only add non-empty values
84
+ row_parts.append(f"{h}: {value}")
85
+
86
+ if add_row_numbers:
87
+ chunk_lines.append(f"Строка {i}: {' | '.join(row_parts)}")
88
+ else:
89
+ chunk_lines.append(' | '.join(row_parts))
90
+
91
+ return "\n".join(chunk_lines)
92
+ def group_by_column_method(table_data, document_name, group_column):
93
+ """Group rows by specified column value"""
94
+ documents = []
95
+ headers = table_data.get("headers", [])
96
+ rows = table_data.get("data", [])
97
+ section = table_data.get("section", "")
98
+ table_number = table_data.get("table_number", "")
99
+ table_title = table_data.get("table_title", "")
100
+
101
+ grouped = defaultdict(list)
102
+ for row in rows:
103
+ key = row.get(group_column, "UNKNOWN")
104
+ grouped[key].append(row)
105
+
106
+ for group_value, group_rows in grouped.items():
107
+ meta_info = create_meta_info(document_name, section, table_number, table_title,
108
+ f'Группа по "{group_column}": {group_value}')
109
+
110
+ chunk_text = create_chunk_text(meta_info, headers, group_rows, add_row_numbers=True)
111
+
112
+ doc = Document(
113
+ text=chunk_text,
114
+ metadata={
115
+ "type": "table",
116
+ "table_number": table_number,
117
+ "table_title": table_title,
118
+ "document_id": document_name,
119
+ "section": section,
120
+ "section_id": section,
121
+ "group_column": group_column,
122
+ "group_value": group_value,
123
+ "total_rows": len(group_rows),
124
+ "processing_method": "group_by_column"
125
+ }
126
+ )
127
+ documents.append(doc)
128
+ log_message(f"Created grouped chunk for {group_column}={group_value}, rows: {len(group_rows)}, length: {len(chunk_text)}")
129
+
130
+ return documents
131
+
132
+ def split_by_rows_method(table_data, document_name):
133
+ """Split table into individual row chunks"""
134
+ documents = []
135
+ headers = table_data.get("headers", [])
136
+ rows = table_data.get("data", [])
137
+ section = table_data.get("section", "")
138
+ table_number = table_data.get("table_number", "")
139
+ table_title = table_data.get("table_title", "")
140
+
141
+ for i, row in enumerate(rows, start=1):
142
+ meta_info = create_meta_info(document_name, section, table_number, table_title, f'Строка: {i}')
143
+
144
+ chunk_text = create_chunk_text(meta_info, headers, [row])
145
+
146
+ doc = Document(
147
+ text=chunk_text,
148
+ metadata={
149
+ "type": "table",
150
+ "table_number": table_number,
151
+ "table_title": table_title,
152
+ "document_id": document_name,
153
+ "section": section,
154
+ "section_id": section,
155
+ "row_number": i,
156
+ "total_rows": len(rows),
157
+ "processing_method": "split_by_rows"
158
+ }
159
+ )
160
+ documents.append(doc)
161
+
162
+ log_message(f"Split table {table_number} into {len(rows)} row chunks")
163
+ return documents
164
+
165
def group_entire_table_method(table_data, document_name):
    """Emit the whole table (headers + all rows) as a single Document chunk."""
    headers = table_data.get("headers", [])
    rows = table_data.get("data", [])
    section = table_data.get("section", "")
    table_number = table_data.get("table_number", "")
    table_title = table_data.get("table_title", "")

    chunk_text = create_chunk_text(
        create_meta_info(document_name, section, table_number, table_title),
        headers,
        rows,
    )

    metadata = {
        "type": "table",
        "table_number": table_number,
        "table_title": table_title,
        "document_id": document_name,
        "section": section,
        "section_id": section,
        "total_rows": len(rows),
        "processing_method": "group_entire_table",
    }
    doc = Document(text=chunk_text, metadata=metadata)

    log_message(f"Grouped entire table {table_number}, rows: {len(rows)}, length: {len(chunk_text)}")
    return [doc]
192
+
193
def should_use_custom_processing(document_id, table_number):
    """Return (use_custom, doc_pattern, method_config) for a table.

    Scans CUSTOM_TABLE_CONFIGS for a document-name prefix whose "tables"
    mapping lists this table number explicitly or via the "*" wildcard.
    Patterns whose config does not cover the table are skipped, so a later
    pattern may still match.
    """
    for doc_pattern, config in CUSTOM_TABLE_CONFIGS.items():
        if not document_id.startswith(doc_pattern):
            continue
        tables_config = config.get("tables", {})
        if table_number not in tables_config and "*" not in tables_config:
            continue
        method_config = tables_config.get(table_number, tables_config.get("*"))
        return True, doc_pattern, method_config
    return False, None, None
201
+
202
def process_table_with_custom_method(table_data, document_name, method_config):
    """Dispatch a table to the configured custom chunking method.

    Returns a list of Documents, or None when the method name is unknown so
    the caller can fall back to default processing.
    """
    method = method_config.get("method")

    if method == "group_by_column":
        return group_by_column_method(
            table_data, document_name, method_config.get("group_column")
        )
    if method == "split_by_rows":
        return split_by_rows_method(table_data, document_name)
    if method == "group_entire_table":
        return group_entire_table_method(table_data, document_name)

    log_message(f"Unknown custom method: {method}, falling back to default processing")
    return None
216
+
217
def table_to_document(table_data, document_id=None):
    """Convert one table JSON object into a list of Documents.

    Custom per-document chunking (see should_use_custom_processing) takes
    precedence; otherwise the table is rendered as one chunk containing a
    header block and, when present, all data rows. Anything that is not a
    dict yields an empty list.
    """
    if not isinstance(table_data, dict):
        return []

    doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))
    table_num = table_data.get('table_number', 'Неизвестно')

    use_custom, doc_pattern, method_config = should_use_custom_processing(doc_id, table_num)
    if use_custom:
        log_message(f"Using custom processing for table {table_num} in document {doc_id}")
        custom_docs = process_table_with_custom_method(table_data, doc_id, method_config)
        if custom_docs:
            return custom_docs

    # Default processing (also the fallback when a custom method is unknown).
    table_title = table_data.get('table_title', 'Неизвестно')
    section = table_data.get('section', 'Неизвестно')

    header_content = f"Таблица: {table_num}\nНазвание: {table_title}\nДокумент: {doc_id}\nРаздел: {section}\n"

    metadata = {
        "type": "table",
        "table_number": table_num,
        "table_title": table_title,
        "document_id": doc_id,
        "section": section,
        "section_id": section,
        "processing_method": "default",
    }

    rows = table_data.get('data')
    if isinstance(rows, list):
        parts = [header_content, "\nДанные таблицы:\n"]
        for row_idx, row in enumerate(rows):
            # Non-dict rows are silently skipped but still counted in total_rows.
            if isinstance(row, dict):
                row_text = " | ".join([f"{k}: {v}" for k, v in row.items()])
                parts.append(f"Строка {row_idx + 1}: {row_text}\n")
        metadata["total_rows"] = len(rows)
        return [Document(text="".join(parts), metadata=metadata)]

    return [Document(text=header_content, metadata=metadata)]
272
+
273
def load_table_data(repo_id, hf_token, table_data_dir):
    """Download every table JSON under table_data_dir and build Documents.

    Supports three file layouts: a dict with a "sheets" list, a single
    table dict, or a list of table dicts. Tables matching
    CUSTOM_TABLE_CONFIGS are routed to their custom chunker inside
    table_to_document. Per-file errors are logged and skipped; a repo-level
    error yields an empty list.
    """
    log_message("Начинаю загрузку табличных данных")

    def _entry_to_docs(entry, owner_id, pass_owner):
        # Log when a table is handled by a custom method; the actual routing
        # happens inside table_to_document.
        table_num = entry.get('table_number', 'Неизвестно')
        use_custom, _, _ = should_use_custom_processing(owner_id, table_num)
        if use_custom:
            log_message(f"Skipping default processing for custom table {table_num} in {owner_id}")
        return table_to_document(entry, owner_id) if pass_owner else table_to_document(entry)

    try:
        repo_files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
        table_files = [
            f for f in repo_files
            if f.startswith(table_data_dir) and f.endswith('.json')
        ]
        log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")

        table_documents = []
        for file_path in table_files:
            try:
                log_message(f"Обрабатываю файл: {file_path}")
                local_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=file_path,
                    local_dir='',
                    repo_type="dataset",
                    token=hf_token
                )

                with open(local_path, 'r', encoding='utf-8') as f:
                    table_data = json.load(f)

                if isinstance(table_data, dict):
                    document_id = table_data.get('document', 'unknown')
                    if 'sheets' in table_data:
                        for sheet in table_data['sheets']:
                            sheet['document'] = document_id
                            table_documents.extend(_entry_to_docs(sheet, document_id, True))
                    else:
                        table_documents.extend(_entry_to_docs(table_data, document_id, True))
                elif isinstance(table_data, list):
                    for table_json in table_data:
                        document_id = table_json.get('document', 'unknown')
                        table_documents.extend(_entry_to_docs(table_json, document_id, False))

            except Exception as e:
                log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
                continue

        log_message(f"Создано {len(table_documents)} документов из таблиц")
        return table_documents

    except Exception as e:
        log_message(f"Ошибка загрузки табличных данных: {str(e)}")
        return []
utils.py CHANGED
@@ -10,6 +10,190 @@ from index_retriever import rerank_nodes
10
  from my_logging import log_message
11
  from config import PROMPT_SIMPLE_POISK
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  def get_llm_model(model_name):
14
  try:
15
  model_config = AVAILABLE_MODELS.get(model_name)
@@ -99,36 +283,81 @@ def generate_sources_html(nodes, chunks_df=None):
99
  html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
100
  html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
101
 
 
 
102
  for i, node in enumerate(nodes):
103
  metadata = node.metadata if hasattr(node, 'metadata') else {}
104
  doc_type = metadata.get('type', 'text')
105
  doc_id = metadata.get('document_id', 'unknown')
106
- section_id = metadata.get('section_id', '')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  html += f"<div style='margin-bottom: 15px; padding: 15px; border: 1px solid #4a5568; border-radius: 8px; background-color: #1a202c;'>"
109
 
110
  if doc_type == 'text':
111
  html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
112
- html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📌 {section_id}</h4>"
113
 
114
- elif doc_type == 'table':
115
  table_num = metadata.get('table_number', 'unknown')
 
116
  if table_num and table_num != 'unknown':
117
- if not table_num.startswith('№'):
118
  table_num = f"№{table_num}"
119
  html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица {table_num} - {doc_id}</h4>"
 
 
120
  else:
121
  html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc_id}</h4>"
 
122
  elif doc_type == 'image':
123
  image_num = metadata.get('image_number', 'unknown')
 
124
  section = metadata.get('section', '')
125
  if image_num and image_num != 'unknown':
126
  if not str(image_num).startswith('№'):
127
  image_num = f"№{image_num}"
128
- html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {image_num} - {doc_id} ({section})</h4>"
 
 
 
 
129
  else:
130
- html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение - {doc_id} ({section})</h4>"
131
 
 
132
  if chunks_df is not None and 'file_link' in chunks_df.columns and doc_type == 'text':
133
  doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
134
  if not doc_rows.empty:
@@ -146,20 +375,35 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
146
 
147
  try:
148
  log_message(f"Получен вопрос: {question}")
149
- log_message(f"Используется модель: {current_model}")
150
  start_time = time.time()
151
 
152
- log_message("Извлекаю релевантные узлы")
153
  retrieved_nodes = query_engine.retriever.retrieve(question)
154
  log_message(f"Извлечено {len(retrieved_nodes)} узлов")
155
- for i in range(min(3, len(retrieved_nodes))):
156
- log_message(f"Пример узла {i+1}: {retrieved_nodes[i].text[:200]}...")
157
 
158
- log_message("Применяю переранжировку")
 
 
 
 
 
 
 
 
 
 
159
  reranked_nodes = rerank_nodes(question, retrieved_nodes, reranker, top_k=10)
160
 
 
 
 
 
 
 
 
 
161
  formatted_context = format_context_for_llm(reranked_nodes)
162
- log_message(f"fорматированный контекст для LLM:\n{formatted_context[:500]}...")
163
 
164
  enhanced_question = f"""
165
  Контекст из базы данных:
@@ -167,10 +411,10 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
167
 
168
  Вопрос пользователя: {question}"""
169
 
170
- log_message(f"Отправляю запрос в LLM с {len(reranked_nodes)} узлами")
171
- log_message(f"Вопрос для LLM:\n{enhanced_question}...")
172
  response = query_engine.query(enhanced_question)
173
 
 
 
174
  end_time = time.time()
175
  processing_time = end_time - start_time
176
 
 
10
  from my_logging import log_message
11
  from config import PROMPT_SIMPLE_POISK
12
 
13
def get_llm_model(model_name):
    """Build an LLM client for the given model name.

    Falls back to the default model configuration when the name is unknown,
    and to a hard-coded Gemini client when construction fails entirely.
    """
    try:
        model_config = AVAILABLE_MODELS.get(model_name)
        if not model_config:
            log_message(f"Модель {model_name} не найдена, использую модель по умолчанию")
            model_config = AVAILABLE_MODELS[DEFAULT_MODEL]

        if not model_config.get("api_key"):
            raise Exception(f"API ключ не найден для модели {model_name}")

        provider = model_config["provider"]
        if provider == "google":
            client_cls = GoogleGenAI
        elif provider == "openai":
            client_cls = OpenAI
        else:
            raise Exception(f"Неподдерживаемый провайдер: {model_config['provider']}")

        return client_cls(
            model=model_config["model_name"],
            api_key=model_config["api_key"],
        )

    except Exception as e:
        log_message(f"Ошибка создания модели {model_name}: {str(e)}")
        return GoogleGenAI(model="gemini-2.0-flash", api_key=GOOGLE_API_KEY)
39
+
40
def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
    """Return a HuggingFace embedding model for the given checkpoint name."""
    embedder = HuggingFaceEmbedding(model_name=model_name)
    return embedder
42
+
43
def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
    """Return a CrossEncoder reranker for the given checkpoint name."""
    reranker = CrossEncoder(model_name)
    return reranker
45
+
46
def format_context_for_llm(nodes):
    """Render retrieved nodes as LLM context, each tagged with its source.

    Every node becomes a "[ИСТОЧНИК: ...]" header followed by the node text.
    Nodes without a metadata attribute fall back to a bare document label.
    """

    def _section_label(metadata):
        # Build the human-readable location of the chunk inside its document.
        label = ""
        section_text = metadata.get('section_text', '')
        level = metadata.get('level', '')
        parent_section = metadata.get('parent_section', '')
        parent_title = metadata.get('parent_title', '')
        sub_levels = ('subsection', 'sub_subsection', 'sub_sub_subsection')

        # section_path wins over section_id; the branch structure is identical.
        anchor = metadata.get('section_path') or metadata.get('section_id')
        if anchor:
            if level in sub_levels and parent_section and parent_title:
                # For subsections, show: пункт X.X в разделе X (Title)
                label = f"пункт {anchor} в разделе {parent_section} ({parent_title})"
            elif section_text:
                # For main sections, show: пункт X (Title)
                label = f"пункт {anchor} ({section_text})"
            else:
                label = f"пункт {anchor}"

        # Tables and images replace any section label entirely.
        if metadata.get('type') == 'table' and metadata.get('table_number'):
            table_num = metadata['table_number']
            if not str(table_num).startswith('№'):
                table_num = f"№{table_num}"
            label = f"таблица {table_num}"

        if metadata.get('type') == 'image' and metadata.get('image_number'):
            image_num = metadata['image_number']
            if not str(image_num).startswith('№'):
                image_num = f"№{image_num}"
            label = f"рисунок {image_num}"

        return label

    context_parts = []
    for node in nodes:
        metadata = node.metadata if hasattr(node, 'metadata') else {}
        doc_id = metadata.get('document_id', 'Неизвестный документ')
        section_info = _section_label(metadata)
        context_text = node.text if hasattr(node, 'text') else str(node)

        if section_info:
            context_parts.append(f"[ИСТОЧНИК: {section_info} документа {doc_id}]\n{context_text}\n")
        else:
            context_parts.append(f"[ИСТОЧНИК: документ {doc_id}]\n{context_text}\n")

    return "\n".join(context_parts)
107
+
108
def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
    """Answer a user question with the RAG pipeline.

    Retrieves candidate nodes, reranks them, builds a source-annotated
    context for the LLM and returns a 3-tuple of HTML strings:
    (answer, sources, chunks).

    Bug fix: the uninitialized-system path and the exception path previously
    returned 2-tuples while the success path returned a 3-tuple, which
    crashed callers unpacking three outputs. All paths now return 3 values.
    """
    if query_engine is None:
        return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""

    try:
        log_message(f"Получен вопрос: {question}")
        start_time = time.time()

        # Извлечение узлов
        retrieved_nodes = query_engine.retriever.retrieve(question)
        log_message(f"Извлечено {len(retrieved_nodes)} узлов")

        # ДЕТАЛЬНОЕ ЛОГИРОВАНИЕ ИСТОЧНИКОВ
        log_message("=== ДЕТАЛЬНАЯ ИНФОРМАЦИЯ О НАЙДЕННЫХ УЗЛАХ ===")
        for i, node in enumerate(retrieved_nodes):
            log_message(f"Узел {i+1}:")
            log_message(f"  Документ: {node.metadata.get('document_id', 'unknown')}")
            log_message(f"  Тип: {node.metadata.get('type', 'unknown')}")
            log_message(f"  Раздел: {node.metadata.get('section_id', 'unknown')}")
            log_message(f"  Текст (первые 400 символов): {node.text[:400]}...")
            log_message(f"  Метаданные: {node.metadata}")

        # Переранжировка
        reranked_nodes = rerank_nodes(question, retrieved_nodes, reranker, top_k=10)

        log_message("=== УЗЛЫ ПОСЛЕ ПЕРЕРАНЖИРОВКИ ===")
        for i, node in enumerate(reranked_nodes):
            log_message(f"Переранжированный узел {i+1}:")
            log_message(f"  Документ: {node.metadata.get('document_id', 'unknown')}")
            log_message(f"  Тип: {node.metadata.get('type', 'unknown')}")
            log_message(f"  Раздел: {node.metadata.get('section_id', 'unknown')}")
            log_message(f"  Полный текст: {node.text}")

        formatted_context = format_context_for_llm(reranked_nodes)
        log_message(f"ПОЛНЫЙ КОНТЕКСТ ДЛЯ LLM:\n{formatted_context}")

        enhanced_question = f"""
Контекст из базы данных:
{formatted_context}

Вопрос пользователя: {question}"""

        response = query_engine.query(enhanced_question)

        log_message(f"ОТВЕТ LLM: {response.response}")

        end_time = time.time()
        processing_time = end_time - start_time

        log_message(f"Обработка завершена за {processing_time:.2f} секунд")

        sources_html = generate_sources_html(reranked_nodes, chunks_df)

        answer_with_time = f"""<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; margin-bottom: 10px;'>
<h3 style='color: #63b3ed; margin-top: 0;'>Ответ (Модель: {current_model}):</h3>
<div style='line-height: 1.6; font-size: 16px;'>{response.response}</div>
<div style='margin-top: 15px; padding-top: 10px; border-top: 1px solid #4a5568; font-size: 14px; color: #a0aec0;'>
Время обработки: {processing_time:.2f} секунд
</div>
</div>"""

        chunk_info = []
        for node in reranked_nodes:
            metadata = node.metadata if hasattr(node, 'metadata') else {}
            chunk_info.append({
                'document_id': metadata.get('document_id', 'unknown'),
                'section_id': metadata.get('section_id', metadata.get('section', 'unknown')),
                'section_path': metadata.get('section_path', ''),
                'section_text': metadata.get('section_text', ''),
                'level': metadata.get('level', ''),
                'parent_section': metadata.get('parent_section', ''),
                'parent_title': metadata.get('parent_title', ''),
                'type': metadata.get('type', 'text'),
                'table_number': metadata.get('table_number', ''),
                'image_number': metadata.get('image_number', ''),
                'chunk_size': len(node.text),
                'chunk_text': node.text
            })
        # Imported locally to avoid a circular import with app.py.
        from app import create_chunks_display_html
        chunks_html = create_chunks_display_html(chunk_info)

        return answer_with_time, sources_html, chunks_html

    except Exception as e:
        log_message(f"Ошибка обработки вопроса: {str(e)}")
        error_msg = f"<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Ошибка обработки вопроса: {str(e)}</div>"
        # Keep the 3-tuple shape so Gradio output unpacking never fails.
        return error_msg, "", ""
194
+ return error_msg, ""
195
+
196
+
197
  def get_llm_model(model_name):
198
  try:
199
  model_config = AVAILABLE_MODELS.get(model_name)
 
283
  html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
284
  html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
285
 
286
+ sources_by_doc = {}
287
+
288
  for i, node in enumerate(nodes):
289
  metadata = node.metadata if hasattr(node, 'metadata') else {}
290
  doc_type = metadata.get('type', 'text')
291
  doc_id = metadata.get('document_id', 'unknown')
292
+ section_id = metadata.get('section_id', '')
293
+ section_text = metadata.get('section_text', '')
294
+ section_path = metadata.get('section_path', '')
295
+
296
+ # Create a unique key for grouping
297
+ if doc_type == 'table':
298
+ table_num = metadata.get('table_number', 'unknown')
299
+ key = f"{doc_id}_table_{table_num}"
300
+ elif doc_type == 'image':
301
+ image_num = metadata.get('image_number', 'unknown')
302
+ key = f"{doc_id}_image_{image_num}"
303
+ else:
304
+ # For text documents, group by section path or section id
305
+ section_key = section_path if section_path else section_id
306
+ key = f"{doc_id}_text_{section_key}"
307
+
308
+ if key not in sources_by_doc:
309
+ sources_by_doc[key] = {
310
+ 'doc_id': doc_id,
311
+ 'doc_type': doc_type,
312
+ 'metadata': metadata,
313
+ 'sections': set()
314
+ }
315
+
316
+ # Add section information
317
+ if section_path:
318
+ sources_by_doc[key]['sections'].add(f"пункт {section_path}")
319
+ elif section_id and section_id != 'unknown':
320
+ sources_by_doc[key]['sections'].add(f"пункт {section_id}")
321
+
322
+ # Generate HTML for each unique source
323
+ for source_info in sources_by_doc.values():
324
+ metadata = source_info['metadata']
325
+ doc_type = source_info['doc_type']
326
+ doc_id = source_info['doc_id']
327
 
328
  html += f"<div style='margin-bottom: 15px; padding: 15px; border: 1px solid #4a5568; border-radius: 8px; background-color: #1a202c;'>"
329
 
330
  if doc_type == 'text':
331
  html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
 
332
 
333
+ elif doc_type == 'table' or doc_type == 'table_row':
334
  table_num = metadata.get('table_number', 'unknown')
335
+ table_title = metadata.get('table_title', '')
336
  if table_num and table_num != 'unknown':
337
+ if not str(table_num).startswith('№'):
338
  table_num = f"№{table_num}"
339
  html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица {table_num} - {doc_id}</h4>"
340
+ if table_title and table_title != 'unknown':
341
+ html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{table_title}</p>"
342
  else:
343
  html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc_id}</h4>"
344
+
345
  elif doc_type == 'image':
346
  image_num = metadata.get('image_number', 'unknown')
347
+ image_title = metadata.get('image_title', '')
348
  section = metadata.get('section', '')
349
  if image_num and image_num != 'unknown':
350
  if not str(image_num).startswith('№'):
351
  image_num = f"№{image_num}"
352
+ html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {image_num} - {doc_id}</h4>"
353
+ if image_title and image_title != 'unknown':
354
+ html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{image_title}</p>"
355
+ if section and section != 'unknown':
356
+ html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 12px;'>Раздел: {section}</p>"
357
  else:
358
+ html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение - {doc_id}</h4>"
359
 
360
+ # Add file link if available
361
  if chunks_df is not None and 'file_link' in chunks_df.columns and doc_type == 'text':
362
  doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
363
  if not doc_rows.empty:
 
375
 
376
  try:
377
  log_message(f"Получен вопрос: {question}")
 
378
  start_time = time.time()
379
 
380
+ # Извлечение узлов
381
  retrieved_nodes = query_engine.retriever.retrieve(question)
382
  log_message(f"Извлечено {len(retrieved_nodes)} узлов")
 
 
383
 
384
+ # ДЕТАЛЬНОЕ ��ОГИРОВАНИЕ ИСТОЧНИКОВ
385
+ log_message("=== ДЕТАЛЬНАЯ ИНФОРМАЦИЯ О НАЙДЕННЫХ УЗЛАХ ===")
386
+ for i, node in enumerate(retrieved_nodes):
387
+ log_message(f"Узел {i+1}:")
388
+ log_message(f" Документ: {node.metadata.get('document_id', 'unknown')}")
389
+ log_message(f" Тип: {node.metadata.get('type', 'unknown')}")
390
+ log_message(f" Раздел: {node.metadata.get('section_id', 'unknown')}")
391
+ log_message(f" Текст (первые 400 символов): {node.text[:400]}...")
392
+ log_message(f" Метаданные: {node.metadata}")
393
+
394
+ # Переранжировка
395
  reranked_nodes = rerank_nodes(question, retrieved_nodes, reranker, top_k=10)
396
 
397
+ log_message("=== УЗЛЫ ПОСЛЕ ПЕРЕРАНЖИРОВКИ ===")
398
+ for i, node in enumerate(reranked_nodes):
399
+ log_message(f"Переранжированный узел {i+1}:")
400
+ log_message(f" Документ: {node.metadata.get('document_id', 'unknown')}")
401
+ log_message(f" Тип: {node.metadata.get('type', 'unknown')}")
402
+ log_message(f" Раздел: {node.metadata.get('section_id', 'unknown')}")
403
+ log_message(f" Полный текст: {node.text}")
404
+
405
  formatted_context = format_context_for_llm(reranked_nodes)
406
+ log_message(f"ПОЛНЫЙ КОНТЕКСТ ДЛЯ LLM:\n{formatted_context}")
407
 
408
  enhanced_question = f"""
409
  Контекст из базы данных:
 
411
 
412
  Вопрос пользователя: {question}"""
413
 
 
 
414
  response = query_engine.query(enhanced_question)
415
 
416
+ log_message(f"ОТВЕТ LLM: {response.response}")
417
+
418
  end_time = time.time()
419
  processing_time = end_time - start_time
420