MrSimple07 committed on
Commit
a4f228e
·
1 Parent(s): 0a99ba6

a new version

Browse files
Files changed (4) hide show
  1. documents_prep.py +19 -70
  2. index_retriever.py +54 -27
  3. table_prep.py +91 -286
  4. utils.py +109 -220
documents_prep.py CHANGED
@@ -46,79 +46,29 @@ def process_documents_with_chunking(documents):
46
  table_count = 0
47
  image_count = 0
48
  text_chunks_count = 0
49
- large_tables_count = 0
50
- large_images_count = 0
51
- custom_processed_count = 0
52
 
53
  for doc in documents:
54
  doc_type = doc.metadata.get('type', 'text')
55
 
56
  if doc_type == 'table':
 
57
  table_count += 1
58
- doc_id = doc.metadata.get('document_id', 'unknown')
59
- table_num = doc.metadata.get('table_number', 'unknown')
60
- from table_prep import should_use_custom_processing
61
- use_custom, doc_pattern, method_config = should_use_custom_processing(doc_id, table_num)
62
 
63
- if use_custom:
64
- custom_processed_count += 1
65
- log_message(f"Table {table_num} in document {doc_id} was processed with custom method '{method_config.get('method')}', skipping standard chunking")
66
- # Add the document as-is since it was already processed by custom method
67
- all_chunked_docs.append(doc)
68
- chunk_info.append({
69
- 'document_id': doc_id,
70
- 'section_id': doc.metadata.get('section_id', 'unknown'),
71
- 'chunk_id': 0,
72
- 'chunk_size': len(doc.text),
73
- 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
74
- 'type': 'table',
75
- 'table_number': table_num,
76
- 'processing_method': method_config.get('method')
77
- })
78
- continue
79
-
80
- # Standard processing for non-custom tables
81
- doc_size = len(doc.text)
82
- if doc_size > CHUNK_SIZE:
83
- large_tables_count += 1
84
- log_message(f"Large table found: {table_num} in document {doc_id}, size: {doc_size} characters")
85
-
86
- # Chunk large tables
87
- chunked_docs = chunk_document(doc)
88
- all_chunked_docs.extend(chunked_docs)
89
-
90
- for i, chunk_doc in enumerate(chunked_docs):
91
- chunk_info.append({
92
- 'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
93
- 'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
94
- 'chunk_id': i,
95
- 'chunk_size': len(chunk_doc.text),
96
- 'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
97
- 'type': 'table',
98
- 'table_number': chunk_doc.metadata.get('table_number', 'unknown'),
99
- 'processing_method': 'standard_chunked'
100
- })
101
- else:
102
- all_chunked_docs.append(doc)
103
- chunk_info.append({
104
- 'document_id': doc.metadata.get('document_id', 'unknown'),
105
- 'section_id': doc.metadata.get('section_id', 'unknown'),
106
- 'chunk_id': 0,
107
- 'chunk_size': doc_size,
108
- 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
109
- 'type': 'table',
110
- 'table_number': doc.metadata.get('table_number', 'unknown'),
111
- 'processing_method': 'standard'
112
- })
113
 
114
  elif doc_type == 'image':
115
  image_count += 1
116
  doc_size = len(doc.text)
117
  if doc_size > CHUNK_SIZE:
118
- large_images_count += 1
119
- log_message(f"Large image description found: {doc.metadata.get('image_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_size} characters")
120
-
121
- # Chunk large images
122
  chunked_docs = chunk_document(doc)
123
  all_chunked_docs.extend(chunked_docs)
124
 
@@ -144,7 +94,7 @@ def process_documents_with_chunking(documents):
144
  'image_number': doc.metadata.get('image_number', 'unknown')
145
  })
146
 
147
- else: # text documents
148
  doc_size = len(doc.text)
149
  if doc_size > CHUNK_SIZE:
150
  chunked_docs = chunk_document(doc)
@@ -171,14 +121,13 @@ def process_documents_with_chunking(documents):
171
  'type': 'text'
172
  })
173
 
174
- log_message(f"=== PROCESSING STATISTICS ===")
175
- log_message(f"Total tables processed: {table_count}")
176
- log_message(f"Custom processed tables: {custom_processed_count}")
177
- log_message(f"Large tables (>{CHUNK_SIZE} chars): {large_tables_count}")
178
- log_message(f"Total images processed: {image_count}")
179
- log_message(f"Large images (>{CHUNK_SIZE} chars): {large_images_count}")
180
- log_message(f"Total text chunks created: {text_chunks_count}")
181
- log_message(f"Total documents after processing: {len(all_chunked_docs)}")
182
 
183
  return all_chunked_docs, chunk_info
184
 
 
46
  table_count = 0
47
  image_count = 0
48
  text_chunks_count = 0
 
 
 
49
 
50
  for doc in documents:
51
  doc_type = doc.metadata.get('type', 'text')
52
 
53
  if doc_type == 'table':
54
+ # Add tables as-is, no chunking
55
  table_count += 1
56
+ all_chunked_docs.append(doc)
 
 
 
57
 
58
+ chunk_info.append({
59
+ 'document_id': doc.metadata.get('document_id', 'unknown'),
60
+ 'section_id': doc.metadata.get('section_id', 'unknown'),
61
+ 'chunk_id': 0,
62
+ 'chunk_size': len(doc.text),
63
+ 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
64
+ 'type': 'table',
65
+ 'table_number': doc.metadata.get('table_number', 'unknown')
66
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  elif doc_type == 'image':
69
  image_count += 1
70
  doc_size = len(doc.text)
71
  if doc_size > CHUNK_SIZE:
 
 
 
 
72
  chunked_docs = chunk_document(doc)
73
  all_chunked_docs.extend(chunked_docs)
74
 
 
94
  'image_number': doc.metadata.get('image_number', 'unknown')
95
  })
96
 
97
+ else:
98
  doc_size = len(doc.text)
99
  if doc_size > CHUNK_SIZE:
100
  chunked_docs = chunk_document(doc)
 
121
  'type': 'text'
122
  })
123
 
124
+ log_message(f"\n{'='*60}")
125
+ log_message(f"ИТОГО ОБРАБОТАНО ДОКУМЕНТОВ:")
126
+ log_message(f" Таблицы: {table_count} (добавлены целиком)")
127
+ log_message(f" Изображения: {image_count}")
128
+ log_message(f" Текстовые чанки: {text_chunks_count}")
129
+ log_message(f" Всего документов: {len(all_chunked_docs)}")
130
+ log_message(f"{'='*60}\n")
 
131
 
132
  return all_chunked_docs, chunk_info
133
 
index_retriever.py CHANGED
@@ -16,24 +16,24 @@ def create_query_engine(vector_index):
16
  try:
17
  bm25_retriever = BM25Retriever.from_defaults(
18
  docstore=vector_index.docstore,
19
- similarity_top_k=15
20
  )
21
 
22
  vector_retriever = VectorIndexRetriever(
23
  index=vector_index,
24
- similarity_top_k=30,
25
- similarity_cutoff=0.8
26
  )
27
 
28
  hybrid_retriever = QueryFusionRetriever(
29
  [vector_retriever, bm25_retriever],
30
- similarity_top_k=30,
31
  num_queries=1
32
  )
33
 
34
  custom_prompt_template = PromptTemplate(PROMPT_SIMPLE_POISK)
35
  response_synthesizer = get_response_synthesizer(
36
- response_mode=ResponseMode.TREE_SUMMARIZE,
37
  text_qa_template=custom_prompt_template
38
  )
39
 
@@ -49,39 +49,66 @@ def create_query_engine(vector_index):
49
  log_message(f"Ошибка создания query engine: {str(e)}")
50
  raise
51
 
52
- def rerank_nodes(query, nodes, reranker, top_k=10):
53
  if not nodes or not reranker:
54
  return nodes[:top_k]
55
 
56
  try:
57
  log_message(f"Переранжирую {len(nodes)} узлов")
58
 
59
- # Separate tables and images from text nodes
60
- table_nodes = [node for node in nodes if node.metadata.get('type') == 'table']
61
- image_nodes = [node for node in nodes if node.metadata.get('type') == 'image']
62
- text_nodes = [node for node in nodes if node.metadata.get('type', 'text') == 'text']
63
 
64
- priority_nodes = table_nodes + image_nodes
65
 
66
- # Rerank only text nodes
67
- if text_nodes:
68
- pairs = []
69
- for node in text_nodes:
70
- pairs.append([query, node.text])
71
-
72
- scores = reranker.predict(pairs)
73
- scored_nodes = list(zip(text_nodes, scores))
74
  scored_nodes.sort(key=lambda x: x[1], reverse=True)
75
- reranked_text_nodes = [node for node, score in scored_nodes]
76
- else:
77
- reranked_text_nodes = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
- # Combine: priority nodes first, then reranked text nodes
80
- final_nodes = priority_nodes + reranked_text_nodes
81
- result = final_nodes[:top_k]
82
 
83
- log_message(f"Возвращаю {len(priority_nodes)} приоритетных узлов и {len(result) - len(priority_nodes)} текстовых узлов")
84
- return result
85
 
86
  except Exception as e:
87
  log_message(f"Ошибка переранжировки: {str(e)}")
 
16
  try:
17
  bm25_retriever = BM25Retriever.from_defaults(
18
  docstore=vector_index.docstore,
19
+ similarity_top_k=20
20
  )
21
 
22
  vector_retriever = VectorIndexRetriever(
23
  index=vector_index,
24
+ similarity_top_k=30,
25
+ similarity_cutoff=0.65
26
  )
27
 
28
  hybrid_retriever = QueryFusionRetriever(
29
  [vector_retriever, bm25_retriever],
30
+ similarity_top_k=40,
31
  num_queries=1
32
  )
33
 
34
  custom_prompt_template = PromptTemplate(PROMPT_SIMPLE_POISK)
35
  response_synthesizer = get_response_synthesizer(
36
+ response_mode=ResponseMode.TREE_SUMMARIZE,
37
  text_qa_template=custom_prompt_template
38
  )
39
 
 
49
  log_message(f"Ошибка создания query engine: {str(e)}")
50
  raise
51
 
52
+ def rerank_nodes(query, nodes, reranker, top_k=20, min_score_threshold=0.5, diversity_penalty=0.3):
53
  if not nodes or not reranker:
54
  return nodes[:top_k]
55
 
56
  try:
57
  log_message(f"Переранжирую {len(nodes)} узлов")
58
 
59
+ pairs = [[query, node.text] for node in nodes]
60
+ scores = reranker.predict(pairs)
61
+ scored_nodes = list(zip(nodes, scores))
 
62
 
63
+ scored_nodes.sort(key=lambda x: x[1], reverse=True)
64
 
65
+ if min_score_threshold is not None:
66
+ scored_nodes = [(node, score) for node, score in scored_nodes
67
+ if score >= min_score_threshold]
68
+ log_message(f"После фильтрации по порогу {min_score_threshold}: {len(scored_nodes)} узлов")
69
+
70
+ if not scored_nodes:
71
+ log_message("Нет узлов после фильтрации, снижаю порог")
72
+ scored_nodes = list(zip(nodes, scores))
73
  scored_nodes.sort(key=lambda x: x[1], reverse=True)
74
+ min_score_threshold = scored_nodes[0][1] * 0.6
75
+ scored_nodes = [(node, score) for node, score in scored_nodes
76
+ if score >= min_score_threshold]
77
+
78
+ selected_nodes = []
79
+ selected_docs = set()
80
+ selected_sections = set()
81
+
82
+ for node, score in scored_nodes:
83
+ if len(selected_nodes) >= top_k:
84
+ break
85
+
86
+ metadata = node.metadata if hasattr(node, 'metadata') else {}
87
+ doc_id = metadata.get('document_id', 'unknown')
88
+ section_key = f"{doc_id}_{metadata.get('section_path', metadata.get('section_id', ''))}"
89
+
90
+ # Apply diversity penalty
91
+ penalty = 0
92
+ if doc_id in selected_docs:
93
+ penalty += diversity_penalty * 0.5
94
+ if section_key in selected_sections:
95
+ penalty += diversity_penalty
96
+
97
+ adjusted_score = score * (1 - penalty)
98
+
99
+ # Add if still competitive
100
+ if not selected_nodes or adjusted_score >= selected_nodes[0][1] * 0.6:
101
+ selected_nodes.append((node, score))
102
+ selected_docs.add(doc_id)
103
+ selected_sections.add(section_key)
104
+
105
+ log_message(f"Выбрано {len(selected_nodes)} узлов с разнообразием")
106
+ log_message(f"Уникальных документов: {len(selected_docs)}, секций: {len(selected_sections)}")
107
 
108
+ if selected_nodes:
109
+ log_message(f"Score range: {selected_nodes[0][1]:.3f} to {selected_nodes[-1][1]:.3f}")
 
110
 
111
+ return [node for node, score in selected_nodes]
 
112
 
113
  except Exception as e:
114
  log_message(f"Ошибка переранжировки: {str(e)}")
table_prep.py CHANGED
@@ -1,292 +1,86 @@
1
- import os
2
  from collections import defaultdict
3
  import json
4
- import zipfile
5
- import pandas as pd
6
  from huggingface_hub import hf_hub_download, list_repo_files
7
  from llama_index.core import Document
8
  from my_logging import log_message
9
 
10
- CUSTOM_TABLE_CONFIGS = {
11
- "ГОСТ Р 50.05.01-2018": {
12
- "tables": {
13
- "№3": {"method": "group_by_column", "group_column": "Класс герметичности и чувствительности"},
14
- "№Б.1": {"method": "group_by_column", "group_column": "Класс чувствительности системы контроля"}
15
- }
16
- },
17
- "ГОСТ Р 50.06.01-2017": {
18
- "tables": {
19
- "№ Б.2": {"method": "split_by_rows"}
20
- }
21
- },
22
- "НП-104-18": {
23
- "tables": {
24
- "*": {"method": "group_entire_table"} # All tables
25
- }
26
- },
27
- "НП-068-05": {
28
- "tables": {
29
- "Таблица 1": {"method": "group_by_column", "group_column": "Рабочее давление среды, МПа"},
30
- "Таблица 2": {"method": "group_by_column", "group_column": "Рабочее давление среды, МПа"},
31
- "Таблица Приложения 1": {"method": "group_by_column", "group_column": "Тип"}
32
- }
33
- },
34
- "ГОСТ Р 59023.1-2020": {
35
- "tables": {
36
- "№ 1": {"method": "split_by_rows"},
37
- "№ 2": {"method": "split_by_rows"},
38
- "№ 3": {"method": "split_by_rows"}
39
- }
40
- },
41
- "НП-089-15": {
42
- "tables": {
43
- "-": {"method": "split_by_rows"}
44
- }
45
- },
46
- "НП-105-18": {
47
- "tables": {
48
- "№ 4.8": {"method": "group_entire_table"}
49
- }
50
- },
51
- "ГОСТ Р 50.05.23-2020": {
52
- "tables": {
53
- "№8": {"method": "group_entire_table"}
54
- }
55
- },
56
- "ГОСТ Р 50.03.01-2017": {
57
- "tables": {
58
- "А.8": {"method": "group_entire_table"}
59
- }
60
- }
61
- }
62
-
63
- def create_meta_info(document_name, section, table_number, table_title, extra_info=""):
64
- base_info = f'Документ "{document_name}", Раздел: {section}, Таблица: {table_number}'
65
- if table_title and table_title.strip():
66
- base_info += f', Название: {table_title}'
67
- if extra_info:
68
- base_info += f', {extra_info}'
69
- return base_info
70
-
71
- def create_chunk_text(meta_info, headers, rows, add_row_numbers=False):
72
- chunk_lines = [meta_info.rstrip()] # Remove trailing newline from meta_info
73
 
74
- # Add headers only once
75
- header_line = " | ".join(headers)
76
- chunk_lines.append(f"Заголовки: {header_line}")
 
77
 
78
- # Add rows without redundant formatting
79
- for i, row in enumerate(rows, start=1):
80
- row_parts = []
81
- for h in headers:
82
- value = row.get(h, '')
83
- if value: # Only add non-empty values
84
- row_parts.append(f"{h}: {value}")
85
-
86
- if add_row_numbers:
87
- chunk_lines.append(f"Строка {i}: {' | '.join(row_parts)}")
88
- else:
89
- chunk_lines.append(' | '.join(row_parts))
90
 
91
- return "\n".join(chunk_lines)
92
- def group_by_column_method(table_data, document_name, group_column):
93
- """Group rows by specified column value"""
94
- documents = []
95
- headers = table_data.get("headers", [])
96
- rows = table_data.get("data", [])
97
- section = table_data.get("section", "")
98
- table_number = table_data.get("table_number", "")
99
- table_title = table_data.get("table_title", "")
100
 
101
- grouped = defaultdict(list)
102
- for row in rows:
103
- key = row.get(group_column, "UNKNOWN")
104
- grouped[key].append(row)
105
-
106
- for group_value, group_rows in grouped.items():
107
- meta_info = create_meta_info(document_name, section, table_number, table_title,
108
- f'Группа по "{group_column}": {group_value}')
109
-
110
- chunk_text = create_chunk_text(meta_info, headers, group_rows, add_row_numbers=True)
111
-
112
- doc = Document(
113
- text=chunk_text,
114
- metadata={
115
- "type": "table",
116
- "table_number": table_number,
117
- "table_title": table_title,
118
- "document_id": document_name,
119
- "section": section,
120
- "section_id": section,
121
- "group_column": group_column,
122
- "group_value": group_value,
123
- "total_rows": len(group_rows),
124
- "processing_method": "group_by_column"
125
- }
126
- )
127
- documents.append(doc)
128
- log_message(f"Created grouped chunk for {group_column}={group_value}, rows: {len(group_rows)}, length: {len(chunk_text)}")
129
-
130
- return documents
131
 
132
- def split_by_rows_method(table_data, document_name):
133
- """Split table into individual row chunks"""
134
- documents = []
135
- headers = table_data.get("headers", [])
136
- rows = table_data.get("data", [])
137
- section = table_data.get("section", "")
138
- table_number = table_data.get("table_number", "")
139
- table_title = table_data.get("table_title", "")
140
 
141
- for i, row in enumerate(rows, start=1):
142
- meta_info = create_meta_info(document_name, section, table_number, table_title, f'Строка: {i}')
143
-
144
- chunk_text = create_chunk_text(meta_info, headers, [row])
145
-
146
- doc = Document(
147
- text=chunk_text,
148
- metadata={
149
- "type": "table",
150
- "table_number": table_number,
151
- "table_title": table_title,
152
- "document_id": document_name,
153
- "section": section,
154
- "section_id": section,
155
- "row_number": i,
156
- "total_rows": len(rows),
157
- "processing_method": "split_by_rows"
158
- }
159
- )
160
- documents.append(doc)
161
 
162
- log_message(f"Split table {table_number} into {len(rows)} row chunks")
163
- return documents
164
-
165
- def group_entire_table_method(table_data, document_name):
166
- """Group entire table as one chunk"""
167
- headers = table_data.get("headers", [])
168
- rows = table_data.get("data", [])
169
- section = table_data.get("section", "")
170
- table_number = table_data.get("table_number", "")
171
- table_title = table_data.get("table_title", "")
172
 
173
- meta_info = create_meta_info(document_name, section, table_number, table_title)
174
- chunk_text = create_chunk_text(meta_info, headers, rows)
 
 
175
 
176
- doc = Document(
177
- text=chunk_text,
178
  metadata={
179
  "type": "table",
180
- "table_number": table_number,
181
  "table_title": table_title,
182
- "document_id": document_name,
183
  "section": section,
184
  "section_id": section,
185
- "total_rows": len(rows),
186
- "processing_method": "group_entire_table"
187
  }
188
- )
189
-
190
- log_message(f"Grouped entire table {table_number}, rows: {len(rows)}, length: {len(chunk_text)}")
191
- return [doc]
192
-
193
- def should_use_custom_processing(document_id, table_number):
194
- """Check if table should use custom processing"""
195
- for doc_pattern, config in CUSTOM_TABLE_CONFIGS.items():
196
- if document_id.startswith(doc_pattern):
197
- tables_config = config.get("tables", {})
198
- if table_number in tables_config or "*" in tables_config:
199
- return True, doc_pattern, tables_config.get(table_number, tables_config.get("*"))
200
- return False, None, None
201
-
202
- def process_table_with_custom_method(table_data, document_name, method_config):
203
- """Process table using custom method"""
204
- method = method_config.get("method")
205
-
206
- if method == "group_by_column":
207
- group_column = method_config.get("group_column")
208
- return group_by_column_method(table_data, document_name, group_column)
209
- elif method == "split_by_rows":
210
- return split_by_rows_method(table_data, document_name)
211
- elif method == "group_entire_table":
212
- return group_entire_table_method(table_data, document_name)
213
- else:
214
- log_message(f"Unknown custom method: {method}, falling back to default processing")
215
- return None
216
-
217
- def table_to_document(table_data, document_id=None):
218
- if isinstance(table_data, dict):
219
- doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))
220
- table_num = table_data.get('table_number', 'Неизвестно')
221
- use_custom, doc_pattern, method_config = should_use_custom_processing(doc_id, table_num)
222
-
223
- if use_custom:
224
- log_message(f"Using custom processing for table {table_num} in document {doc_id}")
225
- custom_docs = process_table_with_custom_method(table_data, doc_id, method_config)
226
- if custom_docs:
227
- return custom_docs
228
-
229
- # DEFAULT PROCESSING (only if NOT using custom)
230
- table_title = table_data.get('table_title', 'Неизвестно')
231
- section = table_data.get('section', 'Неизвестно')
232
-
233
- header_content = f"Таблица: {table_num}\nНазвание: {table_title}\nДокумент: {doc_id}\nРаздел: {section}\n"
234
-
235
- if 'data' in table_data and isinstance(table_data['data'], list):
236
- table_content = header_content + "\nДанные таблицы:\n"
237
- for row_idx, row in enumerate(table_data['data']):
238
- if isinstance(row, dict):
239
- row_text = " | ".join([f"{k}: {v}" for k, v in row.items()])
240
- table_content += f"Строка {row_idx + 1}: {row_text}\n"
241
-
242
- doc = Document(
243
- text=table_content,
244
- metadata={
245
- "type": "table",
246
- "table_number": table_num,
247
- "table_title": table_title,
248
- "document_id": doc_id,
249
- "section": section,
250
- "section_id": section,
251
- "total_rows": len(table_data['data']),
252
- "processing_method": "default"
253
- }
254
- )
255
- return [doc]
256
- else:
257
- doc = Document(
258
- text=header_content,
259
- metadata={
260
- "type": "table",
261
- "table_number": table_num,
262
- "table_title": table_title,
263
- "document_id": doc_id,
264
- "section": section,
265
- "section_id": section,
266
- "processing_method": "default"
267
- }
268
- )
269
- return [doc]
270
-
271
- return []
272
 
273
  def load_table_data(repo_id, hf_token, table_data_dir):
274
- """Modified function with custom table processing integration"""
275
- log_message("Начинаю загрузку табличных данных")
 
276
 
277
- table_files = []
278
  try:
279
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
280
- for file in files:
281
- if file.startswith(table_data_dir) and file.endswith('.json'):
282
- table_files.append(file)
283
 
284
  log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
285
 
286
  table_documents = []
 
 
 
 
 
 
287
  for file_path in table_files:
288
  try:
289
- log_message(f"Обрабатываю файл: {file_path}")
290
  local_path = hf_hub_download(
291
  repo_id=repo_id,
292
  filename=file_path,
@@ -295,6 +89,8 @@ def load_table_data(repo_id, hf_token, table_data_dir):
295
  token=hf_token
296
  )
297
 
 
 
298
  with open(local_path, 'r', encoding='utf-8') as f:
299
  table_data = json.load(f)
300
 
@@ -302,46 +98,55 @@ def load_table_data(repo_id, hf_token, table_data_dir):
302
  document_id = table_data.get('document', 'unknown')
303
 
304
  if 'sheets' in table_data:
305
- for sheet in table_data['sheets']:
 
 
 
 
 
306
  sheet['document'] = document_id
307
- # Check if this table uses custom processing
308
- table_num = sheet.get('table_number', 'Неизвестно')
309
- use_custom, _, _ = should_use_custom_processing(document_id, table_num)
310
-
311
- if use_custom:
312
- log_message(f"Skipping default processing for custom table {table_num} in {document_id}")
313
-
314
  docs_list = table_to_document(sheet, document_id)
315
  table_documents.extend(docs_list)
 
 
 
 
 
 
 
316
  else:
317
- # Check if this table uses custom processing
318
- table_num = table_data.get('table_number', 'Неизвестно')
319
- use_custom, _, _ = should_use_custom_processing(document_id, table_num)
320
-
321
- if use_custom:
322
- log_message(f"Skipping default processing for custom table {table_num} in {document_id}")
323
-
324
  docs_list = table_to_document(table_data, document_id)
325
  table_documents.extend(docs_list)
326
- elif isinstance(table_data, list):
327
- for table_json in table_data:
328
- document_id = table_json.get('document', 'unknown')
329
- table_num = table_json.get('table_number', 'Неизвестно')
330
- use_custom, _, _ = should_use_custom_processing(document_id, table_num)
331
-
332
- if use_custom:
333
- log_message(f"Skipping default processing for custom table {table_num} in {document_id}")
334
 
335
- docs_list = table_to_document(table_json)
336
- table_documents.extend(docs_list)
 
 
 
 
 
337
 
338
  except Exception as e:
339
- log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
340
  continue
341
 
342
- log_message(f"Создано {len(table_documents)} документов из таблиц")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
  return table_documents
344
 
345
  except Exception as e:
346
- log_message(f"Ошибка загрузки табличных данных: {str(e)}")
347
- return []
 
 
1
  from collections import defaultdict
2
  import json
 
 
3
  from huggingface_hub import hf_hub_download, list_repo_files
4
  from llama_index.core import Document
5
  from my_logging import log_message
6
 
7
def _cell_has_value(value):
    """Return True when a table cell holds something worth rendering.

    Only ``None`` and empty strings/containers count as blank. Numeric zero
    and ``False`` are real cell values and are kept.
    """
    if value is None:
        return False
    if isinstance(value, (str, list, tuple, dict, set)):
        return len(value) > 0
    return True


def create_table_content(table_data):
    """Render a table dict as the plain-text content of a single document.

    The text starts with four header lines (table number, title, document,
    section), optionally followed by the column headers and one numbered
    line per data row. Missing header fields fall back to 'Неизвестно'.

    Args:
        table_data: dict that may contain 'document_id' (or 'document'),
            'table_number', 'table_title', 'section', 'headers' and 'data'.
            'data' is rendered only when it is a list; rows that are not
            dicts are skipped but still consume a row number.

    Returns:
        The formatted text; every emitted line ends with a newline.
    """
    doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
    table_num = table_data.get('table_number', 'Неизвестно')
    table_title = table_data.get('table_title', 'Неизвестно')
    section = table_data.get('section', 'Неизвестно')

    # Collect lines and join once instead of growing a string with "+=".
    lines = [
        f"Таблица: {table_num}",
        f"Название: {table_title}",
        f"Документ: {doc_id}",
        f"Раздел: {section}",
    ]

    headers = table_data.get('headers', [])
    if headers:
        # str() guards against numeric or null header cells coming from JSON.
        header_line = ' | '.join(str(header) for header in headers)
        lines.append("")
        lines.append(f"Заголовки: {header_line}")

    rows = table_data.get('data')
    if isinstance(rows, list):
        lines.append("")
        lines.append("Данные таблицы:")
        for row_idx, row in enumerate(rows, start=1):
            if not isinstance(row, dict):
                continue
            # Drop only blank cells; a plain truthiness test would also
            # discard meaningful values such as 0.
            row_text = " | ".join(
                f"{key}: {value}" for key, value in row.items() if _cell_has_value(value)
            )
            lines.append(f"Строка {row_idx}: {row_text}")

    return "\n".join(lines) + "\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
def table_to_document(table_data, document_id=None):
    """Convert one table dict into a single-element list of Document.

    Args:
        table_data: table description; anything that is not a dict yields
            an empty list.
        document_id: optional identifier that overrides the 'document_id' /
            'document' fields of ``table_data`` in the metadata and the log.

    Returns:
        ``[]`` for non-dict input, otherwise a list with one Document whose
        text comes from ``create_table_content`` and whose metadata carries
        the table number, title, document id, section, row count and text
        size.
    """
    if not isinstance(table_data, dict):
        return []

    doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))
    table_num = table_data.get('table_number', 'Неизвестно')
    table_title = table_data.get('table_title', 'Неизвестно')
    section = table_data.get('section', 'Неизвестно')

    content = create_table_content(table_data)
    content_size = len(content)

    # Count rows only when 'data' is a list: a JSON null would make len()
    # raise, and other types are never rendered as rows by the content
    # builder, so they must not be reported as rows here either.
    rows = table_data.get('data')
    row_count = len(rows) if isinstance(rows, list) else 0

    log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
                f"Размер: {content_size} символов | Строк: {row_count}")

    return [Document(
        text=content,
        metadata={
            "type": "table",
            "table_number": table_num,
            "table_title": table_title,
            "document_id": doc_id,
            "section": section,
            "section_id": section,
            "total_rows": row_count,
            "content_size": content_size
        }
    )]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
  def load_table_data(repo_id, hf_token, table_data_dir):
65
+ log_message("=" * 60)
66
+ log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
67
+ log_message("=" * 60)
68
 
 
69
  try:
70
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
71
+ table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
 
 
72
 
73
  log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
74
 
75
  table_documents = []
76
+ stats = {
77
+ 'total_tables': 0,
78
+ 'total_size': 0,
79
+ 'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
80
+ }
81
+
82
  for file_path in table_files:
83
  try:
 
84
  local_path = hf_hub_download(
85
  repo_id=repo_id,
86
  filename=file_path,
 
89
  token=hf_token
90
  )
91
 
92
+ log_message(f"\nОбработка файла: {file_path}")
93
+
94
  with open(local_path, 'r', encoding='utf-8') as f:
95
  table_data = json.load(f)
96
 
 
98
  document_id = table_data.get('document', 'unknown')
99
 
100
  if 'sheets' in table_data:
101
+ sorted_sheets = sorted(
102
+ table_data['sheets'],
103
+ key=lambda sheet: sheet.get('table_number', '') # or use 'table_number'
104
+ )
105
+
106
+ for sheet in sorted_sheets:
107
  sheet['document'] = document_id
 
 
 
 
 
 
 
108
  docs_list = table_to_document(sheet, document_id)
109
  table_documents.extend(docs_list)
110
+
111
+ for doc in docs_list:
112
+ stats['total_tables'] += 1
113
+ size = doc.metadata.get('content_size', 0)
114
+ stats['total_size'] += size
115
+ stats['by_document'][document_id]['count'] += 1
116
+ stats['by_document'][document_id]['size'] += size
117
  else:
 
 
 
 
 
 
 
118
  docs_list = table_to_document(table_data, document_id)
119
  table_documents.extend(docs_list)
 
 
 
 
 
 
 
 
120
 
121
+ for doc in docs_list:
122
+ stats['total_tables'] += 1
123
+ size = doc.metadata.get('content_size', 0)
124
+ stats['total_size'] += size
125
+ stats['by_document'][document_id]['count'] += 1
126
+ stats['by_document'][document_id]['size'] += size
127
+
128
 
129
  except Exception as e:
130
+ log_message(f" ОШИБКА файла {file_path}: {str(e)}")
131
  continue
132
 
133
+ # Log summary statistics
134
+ log_message("\n" + "=" * 60)
135
+ log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
136
+ log_message("=" * 60)
137
+ log_message(f"Всего таблиц добавлено: {stats['total_tables']}")
138
+ log_message(f"Общий размер: {stats['total_size']:,} символов")
139
+ log_message(f"Средний размер таблицы: {stats['total_size'] // stats['total_tables'] if stats['total_tables'] > 0 else 0:,} символов")
140
+
141
+ log_message("\nПо документам:")
142
+ for doc_id, doc_stats in sorted(stats['by_document'].items()):
143
+ log_message(f" • {doc_id}: {doc_stats['count']} таблиц, "
144
+ f"{doc_stats['size']:,} символов")
145
+
146
+ log_message("=" * 60)
147
+
148
  return table_documents
149
 
150
  except Exception as e:
151
+ log_message(f" КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
152
+ return []
utils.py CHANGED
@@ -52,6 +52,7 @@ def format_context_for_llm(nodes):
52
 
53
  section_info = ""
54
 
 
55
  if metadata.get('section_path'):
56
  section_path = metadata['section_path']
57
  section_text = metadata.get('section_text', '')
@@ -60,13 +61,17 @@ def format_context_for_llm(nodes):
60
  level = metadata.get('level', '')
61
 
62
  if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
63
- # For subsections, show: пункт X.X в разделе X (Title)
64
- section_info = f"пункт {section_path} в разделе {parent_section} ({parent_title})"
 
 
 
65
  elif section_text:
66
- # For main sections, show: пункт X (Title)
67
- section_info = f"пункт {section_path} ({section_text})"
68
  else:
69
- section_info = f"пункт {section_path}"
 
70
  elif metadata.get('section_id'):
71
  section_id = metadata['section_id']
72
  section_text = metadata.get('section_text', '')
@@ -75,203 +80,54 @@ def format_context_for_llm(nodes):
75
  parent_title = metadata.get('parent_title', '')
76
 
77
  if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
78
- # For subsections without section_path, show: пункт X.X в разделе X (Title)
79
- section_info = f"пункт {section_id} в разделе {parent_section} ({parent_title})"
 
 
80
  elif section_text:
81
- section_info = f"пункт {section_id} ({section_text})"
82
  else:
83
- section_info = f"пункт {section_id}"
84
 
 
85
  if metadata.get('type') == 'table' and metadata.get('table_number'):
86
  table_num = metadata['table_number']
87
  if not str(table_num).startswith('№'):
88
  table_num = f"№{table_num}"
89
- section_info = f"таблица {table_num}"
90
-
91
- if metadata.get('type') == 'image' and metadata.get('image_number'):
92
- image_num = metadata['image_number']
93
- if not str(image_num).startswith(''):
94
- image_num = f"№{image_num}"
95
- section_info = f"рисунок {image_num}"
96
-
97
- context_text = node.text if hasattr(node, 'text') else str(node)
98
-
99
- if section_info:
100
- formatted_context = f"[ИСТОЧНИК: {section_info} документа {doc_id}]\n{context_text}\n"
101
- else:
102
- formatted_context = f"[ИСТОЧНИК: документ {doc_id}]\n{context_text}\n"
103
-
104
- context_parts.append(formatted_context)
105
-
106
- return "\n".join(context_parts)
107
-
108
- def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
109
- if query_engine is None:
110
- return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", ""
111
-
112
- try:
113
- log_message(f"Получен вопрос: {question}")
114
- start_time = time.time()
115
-
116
- # Извлечение узлов
117
- retrieved_nodes = query_engine.retriever.retrieve(question)
118
- log_message(f"Извлечено {len(retrieved_nodes)} узлов")
119
-
120
- # ДЕТАЛЬНОЕ ЛОГИРОВАНИЕ ИСТОЧНИКОВ
121
- log_message("=== ДЕТАЛЬНАЯ ИНФОРМАЦИЯ О НАЙДЕННЫХ УЗЛАХ ===")
122
- for i, node in enumerate(retrieved_nodes):
123
- log_message(f"Узел {i+1}:")
124
- log_message(f" Документ: {node.metadata.get('document_id', 'unknown')}")
125
- log_message(f" Тип: {node.metadata.get('type', 'unknown')}")
126
- log_message(f" Раздел: {node.metadata.get('section_id', 'unknown')}")
127
- log_message(f" Текст (первые 400 символов): {node.text[:400]}...")
128
- log_message(f" Метаданные: {node.metadata}")
129
-
130
- # Переранжировка
131
- reranked_nodes = rerank_nodes(question, retrieved_nodes, reranker, top_k=10)
132
-
133
- log_message("=== УЗЛЫ ПОСЛЕ ПЕРЕРАНЖИРОВКИ ===")
134
- for i, node in enumerate(reranked_nodes):
135
- log_message(f"Переранжированный узел {i+1}:")
136
- log_message(f" Документ: {node.metadata.get('document_id', 'unknown')}")
137
- log_message(f" Тип: {node.metadata.get('type', 'unknown')}")
138
- log_message(f" Раздел: {node.metadata.get('section_id', 'unknown')}")
139
- log_message(f" Полный текст: {node.text}")
140
-
141
- formatted_context = format_context_for_llm(reranked_nodes)
142
- log_message(f"ПОЛНЫЙ КОНТЕКСТ ДЛЯ LLM:\n{formatted_context}")
143
-
144
- enhanced_question = f"""
145
- Контекст из базы данных:
146
- {formatted_context}
147
-
148
- Вопрос пользователя: {question}"""
149
-
150
- response = query_engine.query(enhanced_question)
151
-
152
- log_message(f"ОТВЕТ LLM: {response.response}")
153
-
154
- end_time = time.time()
155
- processing_time = end_time - start_time
156
-
157
- log_message(f"Обработка завершена за {processing_time:.2f} секунд")
158
-
159
- sources_html = generate_sources_html(reranked_nodes, chunks_df)
160
-
161
- answer_with_time = f"""<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; margin-bottom: 10px;'>
162
- <h3 style='color: #63b3ed; margin-top: 0;'>Ответ (Модель: {current_model}):</h3>
163
- <div style='line-height: 1.6; font-size: 16px;'>{response.response}</div>
164
- <div style='margin-top: 15px; padding-top: 10px; border-top: 1px solid #4a5568; font-size: 14px; color: #a0aec0;'>
165
- Время обработки: {processing_time:.2f} секунд
166
- </div>
167
- </div>"""
168
-
169
- chunk_info = []
170
- for node in reranked_nodes:
171
- metadata = node.metadata if hasattr(node, 'metadata') else {}
172
- chunk_info.append({
173
- 'document_id': metadata.get('document_id', 'unknown'),
174
- 'section_id': metadata.get('section_id', metadata.get('section', 'unknown')),
175
- 'section_path': metadata.get('section_path', ''),
176
- 'section_text': metadata.get('section_text', ''),
177
- 'level': metadata.get('level', ''),
178
- 'parent_section': metadata.get('parent_section', ''),
179
- 'parent_title': metadata.get('parent_title', ''),
180
- 'type': metadata.get('type', 'text'),
181
- 'table_number': metadata.get('table_number', ''),
182
- 'image_number': metadata.get('image_number', ''),
183
- 'chunk_size': len(node.text),
184
- 'chunk_text': node.text
185
- })
186
- from app import create_chunks_display_html
187
- chunks_html = create_chunks_display_html(chunk_info)
188
-
189
- return answer_with_time, sources_html, chunks_html
190
-
191
- except Exception as e:
192
- log_message(f"Ошибка обработки вопроса: {str(e)}")
193
- error_msg = f"<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Ошибка обработки вопроса: {str(e)}</div>"
194
- return error_msg, ""
195
-
196
-
197
- def get_llm_model(model_name):
198
- try:
199
- model_config = AVAILABLE_MODELS.get(model_name)
200
- if not model_config:
201
- log_message(f"Модель {model_name} не найдена, использую модель по умолчанию")
202
- model_config = AVAILABLE_MODELS[DEFAULT_MODEL]
203
-
204
- if not model_config.get("api_key"):
205
- raise Exception(f"API ключ не найден для модели {model_name}")
206
-
207
- if model_config["provider"] == "google":
208
- return GoogleGenAI(
209
- model=model_config["model_name"],
210
- api_key=model_config["api_key"]
211
- )
212
- elif model_config["provider"] == "openai":
213
- return OpenAI(
214
- model=model_config["model_name"],
215
- api_key=model_config["api_key"]
216
- )
217
- else:
218
- raise Exception(f"Неподдерживаемый провайдер: {model_config['provider']}")
219
-
220
- except Exception as e:
221
- log_message(f"Ошибка создания модели {model_name}: {str(e)}")
222
- return GoogleGenAI(model="gemini-2.0-flash", api_key=GOOGLE_API_KEY)
223
-
224
- def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
225
- return HuggingFaceEmbedding(model_name=model_name)
226
-
227
- def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
228
- return CrossEncoder(model_name)
229
-
230
- def format_context_for_llm(nodes):
231
- context_parts = []
232
-
233
- for node in nodes:
234
- metadata = node.metadata if hasattr(node, 'metadata') else {}
235
- doc_id = metadata.get('document_id', 'Неизвестный документ')
236
-
237
- section_info = ""
238
-
239
- if metadata.get('section_path'):
240
- section_path = metadata['section_path']
241
- section_text = metadata.get('section_text', '')
242
- parent_section = metadata.get('parent_section', '')
243
- parent_title = metadata.get('parent_title', '')
244
 
245
- if metadata.get('level') in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
246
- section_info = f"пункт {section_path} ({section_text}) в разделе {parent_section} ({parent_title})"
247
- elif section_text:
248
- section_info = f"пункт {section_path} ({section_text})"
249
- else:
250
- section_info = f"пункт {section_path}"
251
- elif metadata.get('section_id'):
252
- section_id = metadata['section_id']
253
- section_text = metadata.get('section_text', '')
254
- if section_text:
255
- section_info = f"пункт {section_id} ({section_text})"
256
  else:
257
- section_info = f"пункт {section_id}"
258
-
259
- if metadata.get('type') == 'table' and metadata.get('table_number'):
260
- table_num = metadata['table_number']
261
- if not str(table_num).startswith('№'):
262
- table_num = f"№{table_num}"
263
- section_info = f"таблица {table_num}"
264
 
265
  if metadata.get('type') == 'image' and metadata.get('image_number'):
266
  image_num = metadata['image_number']
267
  if not str(image_num).startswith('№'):
268
  image_num = f"№{image_num}"
269
- section_info = f"рисунок {image_num}"
 
 
 
 
 
 
 
 
 
 
 
270
 
271
  context_text = node.text if hasattr(node, 'text') else str(node)
272
 
273
  if section_info:
274
- formatted_context = f"[ИСТОЧНИК: {section_info} документа {doc_id}]\n{context_text}\n"
275
  else:
276
  formatted_context = f"[ИСТОЧНИК: документ {doc_id}]\n{context_text}\n"
277
 
@@ -279,6 +135,7 @@ def format_context_for_llm(nodes):
279
 
280
  return "\n".join(context_parts)
281
 
 
282
  def generate_sources_html(nodes, chunks_df=None):
283
  html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
284
  html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
@@ -369,56 +226,80 @@ def generate_sources_html(nodes, chunks_df=None):
369
  html += "</div>"
370
  return html
371
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
373
  if query_engine is None:
374
- return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", ""
375
 
376
  try:
377
- log_message(f"Получен вопрос: {question}")
378
  start_time = time.time()
379
 
380
- # Извлечение узлов
381
- retrieved_nodes = query_engine.retriever.retrieve(question)
382
- log_message(f"Извлечено {len(retrieved_nodes)} узлов")
383
 
384
- # ДЕТАЛЬНОЕ ЛОГИРОВАНИЕ ИСТОЧНИКОВ
385
- log_message("=== ДЕТАЛЬНАЯ ИНФОРМАЦИЯ О НАЙДЕННЫХ УЗЛАХ ===")
386
- for i, node in enumerate(retrieved_nodes):
387
- log_message(f"Узел {i+1}:")
388
- log_message(f" Документ: {node.metadata.get('document_id', 'unknown')}")
389
- log_message(f" Тип: {node.metadata.get('type', 'unknown')}")
390
- log_message(f" Раздел: {node.metadata.get('section_id', 'unknown')}")
391
- log_message(f" Текст (первые 400 символов): {node.text[:400]}...")
392
- log_message(f" Метаданные: {node.metadata}")
393
 
394
- # Переранжировка
395
- reranked_nodes = rerank_nodes(question, retrieved_nodes, reranker, top_k=10)
 
 
 
 
 
396
 
397
- log_message("=== УЗЛЫ ПОСЛЕ ПЕРЕРАНЖИРОВКИ ===")
398
- for i, node in enumerate(reranked_nodes):
399
- log_message(f"Переранжированный узел {i+1}:")
400
- log_message(f" Документ: {node.metadata.get('document_id', 'unknown')}")
401
- log_message(f" Тип: {node.metadata.get('type', 'unknown')}")
402
- log_message(f" Раздел: {node.metadata.get('section_id', 'unknown')}")
403
- log_message(f" Полный текст: {node.text}")
 
 
 
404
 
405
  formatted_context = format_context_for_llm(reranked_nodes)
406
- log_message(f"ПОЛНЫЙ КОНТЕКСТ ДЛЯ LLM:\n{formatted_context}")
407
 
408
- enhanced_question = f"""
409
- Контекст из базы данных:
410
  {formatted_context}
411
 
412
- Вопрос пользователя: {question}"""
 
 
 
413
 
414
  response = query_engine.query(enhanced_question)
415
 
416
- log_message(f"ОТВЕТ LLM: {response.response}")
417
-
418
  end_time = time.time()
419
  processing_time = end_time - start_time
420
 
421
- log_message(f"Обработка завершена за {processing_time:.2f} секунд")
422
 
423
  sources_html = generate_sources_html(reranked_nodes, chunks_df)
424
 
@@ -432,10 +313,18 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
432
 
433
  chunk_info = []
434
  for node in reranked_nodes:
435
- section_id = node.metadata.get('section_id', node.metadata.get('section', 'unknown'))
436
  chunk_info.append({
437
- 'document_id': node.metadata.get('document_id', 'unknown'),
438
- 'section_id': section_id,
 
 
 
 
 
 
 
 
439
  'chunk_size': len(node.text),
440
  'chunk_text': node.text
441
  })
@@ -445,6 +334,6 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
445
  return answer_with_time, sources_html, chunks_html
446
 
447
  except Exception as e:
448
- log_message(f"Ошибка обработки вопроса: {str(e)}")
449
- error_msg = f"<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Ошибка обработки вопроса: {str(e)}</div>"
450
- return error_msg, ""
 
52
 
53
  section_info = ""
54
 
55
+ # Handle section information with proper hierarchy
56
  if metadata.get('section_path'):
57
  section_path = metadata['section_path']
58
  section_text = metadata.get('section_text', '')
 
61
  level = metadata.get('level', '')
62
 
63
  if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
64
+ # For subsections: раздел X (Title), пункт X.X
65
+ if section_text:
66
+ section_info = f"раздел {parent_section} ({parent_title}), пункт {section_path} ({section_text})"
67
+ else:
68
+ section_info = f"раздел {parent_section} ({parent_title}), пункт {section_path}"
69
  elif section_text:
70
+ # For main sections: раздел X (Title)
71
+ section_info = f"раздел {section_path} ({section_text})"
72
  else:
73
+ section_info = f"раздел {section_path}"
74
+
75
  elif metadata.get('section_id'):
76
  section_id = metadata['section_id']
77
  section_text = metadata.get('section_text', '')
 
80
  parent_title = metadata.get('parent_title', '')
81
 
82
  if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
83
+ if section_text:
84
+ section_info = f"раздел {parent_section} ({parent_title}), пункт {section_id} ({section_text})"
85
+ else:
86
+ section_info = f"раздел {parent_section} ({parent_title}), пункт {section_id}"
87
  elif section_text:
88
+ section_info = f"раздел {section_id} ({section_text})"
89
  else:
90
+ section_info = f"раздел {section_id}"
91
 
92
+ # Override with table/image info if applicable
93
  if metadata.get('type') == 'table' and metadata.get('table_number'):
94
  table_num = metadata['table_number']
95
  if not str(table_num).startswith('№'):
96
  table_num = f"№{table_num}"
97
+ table_title = metadata.get('table_title', '')
98
+ # Include section context for tables
99
+ base_section = ""
100
+ if metadata.get('section_path'):
101
+ base_section = f", раздел {metadata['section_path']}"
102
+ elif metadata.get('section_id'):
103
+ base_section = f", раздел {metadata['section_id']}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
+ if table_title:
106
+ section_info = f"Таблица {table_num} ({table_title}){base_section}"
 
 
 
 
 
 
 
 
 
107
  else:
108
+ section_info = f"Таблица {table_num}{base_section}"
 
 
 
 
 
 
109
 
110
  if metadata.get('type') == 'image' and metadata.get('image_number'):
111
  image_num = metadata['image_number']
112
  if not str(image_num).startswith('№'):
113
  image_num = f"№{image_num}"
114
+ image_title = metadata.get('image_title', '')
115
+ # Include section context for images
116
+ base_section = ""
117
+ if metadata.get('section_path'):
118
+ base_section = f", раздел {metadata['section_path']}"
119
+ elif metadata.get('section_id'):
120
+ base_section = f", раздел {metadata['section_id']}"
121
+
122
+ if image_title:
123
+ section_info = f"Рисунок {image_num} ({image_title}){base_section}"
124
+ else:
125
+ section_info = f"Рисунок {image_num}{base_section}"
126
 
127
  context_text = node.text if hasattr(node, 'text') else str(node)
128
 
129
  if section_info:
130
+ formatted_context = f"[ИСТОЧНИК: {section_info}, документ {doc_id}]\n{context_text}\n"
131
  else:
132
  formatted_context = f"[ИСТОЧНИК: документ {doc_id}]\n{context_text}\n"
133
 
 
135
 
136
  return "\n".join(context_parts)
137
 
138
+
139
  def generate_sources_html(nodes, chunks_df=None):
140
  html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
141
  html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
 
226
  html += "</div>"
227
  return html
228
 
229
def expand_query(question, llm_model):
    """Generate alternative phrasings of a question to broaden retrieval.

    The LLM is asked for two reformulations in a numbered list. Each
    non-empty line of the reply has its list marker ("1.", "2)", ...)
    removed and is kept as a candidate; very short lines and duplicates
    (including repeats of the original question) are dropped.

    Args:
        question: The user's original question.
        llm_model: Object exposing ``complete(prompt)`` whose result has a
            ``text`` attribute holding the raw reply.

    Returns:
        A list whose first element is always ``question``, followed by at
        most two reformulations. If anything fails, the list contains only
        ``question`` (expansion is best-effort and never raises).
    """
    # Local import: the file's top-level import block is outside this view.
    import re

    max_variations = 2
    # Lines this short are treated as noise rather than a usable query.
    min_variation_length = 10

    expansion_prompt = f"""Дан вопрос: "{question}"

Сгенерируй 2 альтернативные формулировки этого вопроса для поиска в базе данных.
Используй синонимы и разные формулировки, сохраняя смысл.

Формат ответа (только вопросы, по одному на строку):
1. [первая формулировка]
2. [вторая формулировка]"""

    # Matches a leading list marker only ("1.", "2)"), so digits that belong
    # to the question itself (e.g. "2024 ...") are left intact.
    list_marker = re.compile(r'^\s*\d+\s*[.)]\s*')

    try:
        response = llm_model.complete(expansion_prompt)

        # Seed with the original question so the LLM echoing it back does
        # not produce a redundant retrieval pass.
        seen = {str(question).strip().casefold()}
        expanded = []
        for raw_line in (response.text or "").splitlines():
            # The numbered lines ARE the answers requested by the prompt:
            # strip the marker instead of discarding the line.
            candidate = list_marker.sub('', raw_line).strip()
            # Drop the square brackets if the model copied the template.
            if candidate.startswith('[') and candidate.endswith(']'):
                candidate = candidate[1:-1].strip()
            if len(candidate) <= min_variation_length:
                continue
            key = candidate.casefold()
            if key in seen:
                continue
            seen.add(key)
            expanded.append(candidate)
            if len(expanded) == max_variations:
                break

        log_message(f"Query expansion: {len(expanded)} вариантов")
        return [question] + expanded
    except Exception as e:
        # Best-effort: fall back to the original question alone.
        log_message(f"Ошибка расширения запроса: {str(e)}")
        return [question]
252
+
253
+
254
  def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
255
  if query_engine is None:
256
+ return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
257
 
258
  try:
 
259
  start_time = time.time()
260
 
261
+ llm = get_llm_model(current_model)
262
+
263
+ query_variations = expand_query(question, llm)
264
 
265
+ all_nodes = []
266
+ seen_node_ids = set()
 
 
 
 
 
 
 
267
 
268
+ for query_var in query_variations:
269
+ retrieved = query_engine.retriever.retrieve(query_var)
270
+ for node in retrieved:
271
+ node_id = f"{node.node_id if hasattr(node, 'node_id') else hash(node.text)}"
272
+ if node_id not in seen_node_ids:
273
+ all_nodes.append(node)
274
+ seen_node_ids.add(node_id)
275
 
276
+ log_message(f"Получено {len(all_nodes)} уникальных узлов из {len(query_variations)} запросов")
277
+
278
+ reranked_nodes = rerank_nodes(
279
+ question,
280
+ all_nodes,
281
+ reranker,
282
+ top_k=20,
283
+ min_score_threshold=0.5,
284
+ diversity_penalty=0.3
285
+ )
286
 
287
  formatted_context = format_context_for_llm(reranked_nodes)
 
288
 
289
+ enhanced_question = f"""Контекст из базы данных:
 
290
  {formatted_context}
291
 
292
+ Вопрос пользователя: {question}
293
+
294
+ Инструкция: Ответь на вопрос, используя ТОЛЬКО информацию из контекста выше.
295
+ Если информации недостаточно, четко укажи это. Цитируй конкретные источники."""
296
 
297
  response = query_engine.query(enhanced_question)
298
 
 
 
299
  end_time = time.time()
300
  processing_time = end_time - start_time
301
 
302
+ log_message(f"Обработка завершена за {processing_time:.2f}с")
303
 
304
  sources_html = generate_sources_html(reranked_nodes, chunks_df)
305
 
 
313
 
314
  chunk_info = []
315
  for node in reranked_nodes:
316
+ metadata = node.metadata if hasattr(node, 'metadata') else {}
317
  chunk_info.append({
318
+ 'document_id': metadata.get('document_id', 'unknown'),
319
+ 'section_id': metadata.get('section_id', metadata.get('section', 'unknown')),
320
+ 'section_path': metadata.get('section_path', ''),
321
+ 'section_text': metadata.get('section_text', ''),
322
+ 'level': metadata.get('level', ''),
323
+ 'parent_section': metadata.get('parent_section', ''),
324
+ 'parent_title': metadata.get('parent_title', ''),
325
+ 'type': metadata.get('type', 'text'),
326
+ 'table_number': metadata.get('table_number', ''),
327
+ 'image_number': metadata.get('image_number', ''),
328
  'chunk_size': len(node.text),
329
  'chunk_text': node.text
330
  })
 
334
  return answer_with_time, sources_html, chunks_html
335
 
336
  except Exception as e:
337
+ log_message(f"Ошибка: {str(e)}")
338
+ error_msg = f"<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Ошибка: {str(e)}</div>"
339
+ return error_msg, "", ""