MrSimple01 commited on
Commit
0bc2e08
·
verified ·
1 Parent(s): ab29ce4

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +280 -280
utils.py CHANGED
@@ -1,281 +1,281 @@
1
- import logging
2
- import sys
3
- from llama_index.llms.google_genai import GoogleGenAI
4
- from llama_index.llms.openai import OpenAI
5
- from llama_index.embeddings.huggingface import HuggingFaceEmbedding
6
- from sentence_transformers import CrossEncoder
7
- from config import AVAILABLE_MODELS, DEFAULT_MODEL, GOOGLE_API_KEY
8
- import time
9
- from index_retriever import rerank_nodes
10
- from my_logging import log_message
11
- from config import PROMPT_SIMPLE_POISK
12
- import re
13
-
14
def get_llm_model(model_name):
    """Build an LLM client for *model_name* from AVAILABLE_MODELS.

    Unknown names fall back to the default model config; any failure
    (missing API key, unsupported provider, constructor error) is logged
    and a hard-coded Gemini client is returned so callers always receive
    a usable LLM instance.
    """
    try:
        cfg = AVAILABLE_MODELS.get(model_name)
        if not cfg:
            log_message(f"Модель {model_name} не найдена, использую модель по умолчанию")
            cfg = AVAILABLE_MODELS[DEFAULT_MODEL]

        if not cfg.get("api_key"):
            raise Exception(f"API ключ не найден для модели {model_name}")

        provider = cfg["provider"]
        if provider == "google":
            return GoogleGenAI(model=cfg["model_name"], api_key=cfg["api_key"])
        if provider == "openai":
            return OpenAI(model=cfg["model_name"], api_key=cfg["api_key"])
        raise Exception(f"Неподдерживаемый провайдер: {cfg['provider']}")

    except Exception as e:
        # Last-resort fallback keeps the app running even with a bad config.
        log_message(f"Ошибка создания модели {model_name}: {str(e)}")
        return GoogleGenAI(model="gemini-2.0-flash", api_key=GOOGLE_API_KEY)
40
-
41
def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
    """Return a HuggingFace embedding model (multilingual MiniLM by default)."""
    return HuggingFaceEmbedding(model_name=model_name)
43
-
44
def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
    """Return a CrossEncoder used to rerank retrieved nodes by relevance."""
    return CrossEncoder(model_name)
46
-
47
def generate_sources_html(nodes, chunks_df=None):
    """Render an HTML panel listing the sources behind the retrieved *nodes*.

    Nodes are grouped per document/table/image/section so each source is
    shown once; when *chunks_df* has a ``file_link`` column, text sources
    additionally get a link to the original document.
    """
    parts = [
        "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>",
        "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>",
    ]

    grouped = {}

    for node in nodes:
        meta = node.metadata if hasattr(node, 'metadata') else {}
        kind = meta.get('type', 'text')
        doc = meta.get('document_id', 'unknown')

        # Build a grouping key so duplicates of the same source collapse.
        if kind == 'table' or kind == 'table_row':
            group_key = f"{doc}_table_{meta.get('table_number', 'unknown')}"
        elif kind == 'image':
            group_key = f"{doc}_image_{meta.get('image_number', 'unknown')}"
        else:
            path = meta.get('section_path', '')
            sid = meta.get('section_id', '')
            group_key = f"{doc}_text_{path if path else sid}"

        # First node seen for a key supplies the displayed metadata.
        entry = grouped.setdefault(group_key, {
            'doc_id': doc,
            'doc_type': kind,
            'metadata': meta,
            'sections': set()
        })

        # NOTE(review): 'sections' is accumulated but never rendered below —
        # presumably intended for future display; kept for compatibility.
        if kind not in ['table', 'table_row', 'image']:
            path = meta.get('section_path', '')
            sid = meta.get('section_id', '')
            if path:
                entry['sections'].add(f"пункт {path}")
            elif sid and sid != 'unknown':
                entry['sections'].add(f"пункт {sid}")

    for entry in grouped.values():
        meta = entry['metadata']
        kind = entry['doc_type']
        doc = entry['doc_id']

        parts.append(f"<div style='margin-bottom: 15px; padding: 15px; border: 1px solid #4a5568; border-radius: 8px; background-color: #1a202c;'>")

        if kind == 'text':
            parts.append(f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc}</h4>")
        elif kind == 'table' or kind == 'table_row':
            num = meta.get('table_number', 'unknown')
            title = meta.get('table_title', '')
            if num and num != 'unknown':
                if not str(num).startswith('№'):
                    num = f"№{num}"
                parts.append(f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица {num} - {doc}</h4>")
                if title and title != 'unknown':
                    parts.append(f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{title}</p>")
            else:
                parts.append(f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc}</h4>")
        elif kind == 'image':
            num = meta.get('image_number', 'unknown')
            title = meta.get('image_title', '')
            if num and num != 'unknown':
                if not str(num).startswith('№'):
                    num = f"№{num}"
                parts.append(f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {num} - {doc}</h4>")
                if title and title != 'unknown':
                    parts.append(f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{title}</p>")

        # Only text sources get a document link, and only when available.
        if chunks_df is not None and 'file_link' in chunks_df.columns and kind == 'text':
            matches = chunks_df[chunks_df['document_id'] == doc]
            if not matches.empty:
                link = matches.iloc[0]['file_link']
                parts.append(f"<a href='{link}' target='_blank' style='color: #68d391; text-decoration: none; font-size: 14px; display: inline-block; margin-top: 10px;'>🔗 Ссылка на документ</a><br>")

        parts.append("</div>")

    parts.append("</div>")
    return "".join(parts)
126
-
127
def deduplicate_nodes(nodes):
    """Deduplicate retrieved nodes based on content and metadata.

    Builds a stable identifier per node (document + type + position info)
    and keeps only the first node seen for each identifier, preserving
    the original retrieval order.
    """
    import hashlib  # hoisted: was re-imported inside the loop per node

    seen = set()
    unique_nodes = []

    for node in nodes:
        doc_id = node.metadata.get('document_id', '')
        node_type = node.metadata.get('type', 'text')

        if node_type == 'table' or node_type == 'table_row':
            table_num = node.metadata.get('table_number', '')
            table_identifier = node.metadata.get('table_identifier', table_num)

            # Use row range to distinguish chunks of the same table
            row_start = node.metadata.get('row_start', '')
            row_end = node.metadata.get('row_end', '')
            is_complete = node.metadata.get('is_complete_table', False)

            if is_complete:
                identifier = f"{doc_id}|table|{table_identifier}|complete"
            elif row_start != '' and row_end != '':
                identifier = f"{doc_id}|table|{table_identifier}|rows_{row_start}_{row_end}"
            else:
                # Fallback: use chunk_id if available
                chunk_id = node.metadata.get('chunk_id', '')
                if chunk_id != '':
                    identifier = f"{doc_id}|table|{table_identifier}|chunk_{chunk_id}"
                else:
                    # Last resort: hash first 100 chars of content
                    content_hash = hashlib.md5(node.text[:100].encode()).hexdigest()[:8]
                    identifier = f"{doc_id}|table|{table_identifier}|{content_hash}"

        elif node_type == 'image':
            img_num = node.metadata.get('image_number', '')
            identifier = f"{doc_id}|image|{img_num}"

        else:  # text: section_id + chunk_id should be unique
            section_id = node.metadata.get('section_id', '')
            chunk_id = node.metadata.get('chunk_id', 0)
            identifier = f"{doc_id}|text|{section_id}|{chunk_id}"

        if identifier not in seen:
            seen.add(identifier)
            unique_nodes.append(node)

    return unique_nodes
175
-
176
def normalize_query(query):
    """Normalize connection-type codes in *query* (e.g. 'С-25' -> 'C25').

    Maps the Cyrillic letters С/Т/У (upper- and lowercase) to the visually
    identical Latin C/T/U, uppercases lowercase Latin c/t/u, and drops an
    optional hyphen/space between the letter and the digits, so queries
    match codes stored in Latin without separators.
    """
    to_latin_upper = {
        'С': 'C', 'с': 'C', 'Т': 'T', 'т': 'T', 'У': 'U', 'у': 'U',
        # BUGFIX: lowercase Latin was previously left untouched while
        # lowercase Cyrillic was uppercased; normalize it the same way so
        # 'c-25' matches the same chunks as 'C-25'.
        'c': 'C', 't': 'T', 'u': 'U',
    }

    def repl(m):
        letter = to_latin_upper.get(m.group(1), m.group(1))
        return f"{letter}{m.group(2)}"

    return re.sub(r'\b([СсТтУуCTUctu])[-\s]?(\d+)\b', repl, query)
183
-
184
-
185
def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
    """Answer *question* through the RAG pipeline and return display HTML.

    Returns a tuple ``(answer_html, sources_html, chunks_html)``; when the
    engine is missing or any step fails, the first element carries an error
    panel and the other two are empty strings.
    """
    if query_engine is None:
        return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""

    try:
        start_time = time.time()

        # NORMALIZE QUERY: Convert Cyrillic to Latin and remove hyphens
        normalized_question = normalize_query(question)
        log_message(f"Original query: {question}")
        log_message(f"Normalized query: {normalized_question}")

        # Use normalized query for retrieval
        retrieved_nodes = query_engine.retriever.retrieve(normalized_question)
        log_message(f"user query: {question}")

        log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")

        unique_retrieved = deduplicate_nodes(retrieved_nodes)
        log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")

        # Diagnostic: count connection types among retrieved table chunks
        conn_types_retrieved = {}
        for node in unique_retrieved:
            if node.metadata.get('type') == 'table':
                conn_type = node.metadata.get('connection_type', '')
                if conn_type:
                    conn_types_retrieved[conn_type] = conn_types_retrieved.get(conn_type, 0) + 1

        if conn_types_retrieved:
            log_message("CONNECTION TYPES IN RETRIEVED:")
            for ct, cnt in sorted(conn_types_retrieved.items()):
                log_message(f" {ct}: {cnt} chunks")

        # Diagnostic: check whether the C-25 target type was retrieved
        # (normalize the probe the same way as the query)
        normalized_check = normalize_query('С-25')  # Will become C25
        if normalized_check in question or 'С-25' in question or 'C-25' in question:
            if 'C25' in conn_types_retrieved:
                log_message(f"✓ C25 RETRIEVED: {conn_types_retrieved['C25']} chunks")
            else:
                log_message("✗ C25 NOT RETRIEVED despite being in query!")

        # Diagnostic: sample of retrieved tables (first 10 unique nodes)
        log_message("SAMPLE OF RETRIEVED TABLES:")
        for i, node in enumerate(unique_retrieved[:10]):
            if node.metadata.get('type') == 'table':
                table_num = node.metadata.get('table_number', 'N/A')
                table_title = node.metadata.get('table_title', 'N/A')
                conn_type = node.metadata.get('connection_type', 'N/A')
                doc_id = node.metadata.get('document_id', 'N/A')
                log_message(f" [{i+1}] {doc_id} - Table {table_num} - Type: {conn_type}")

        # Rerank - use normalized query for consistency
        reranked_nodes = rerank_nodes(normalized_question, unique_retrieved, reranker, top_k=20)

        # CRITICAL FIX: Use normalized query for LLM as well
        response = query_engine.query(normalized_question)

        end_time = time.time()
        processing_time = end_time - start_time

        log_message(f"Обработка завершена за {processing_time:.2f}с")

        sources_html = generate_sources_html(reranked_nodes, chunks_df)

        answer_with_time = f"""<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; margin-bottom: 10px;'>
<h3 style='color: #63b3ed; margin-top: 0;'>Ответ (Модель: {current_model}):</h3>
<div style='line-height: 1.6; font-size: 16px;'>{response.response}</div>
<div style='margin-top: 15px; padding-top: 10px; border-top: 1px solid #4a5568; font-size: 14px; color: #a0aec0;'>
Время обработки: {processing_time:.2f} секунд
</div>
</div>"""

        # Collect per-chunk details for the debug/chunks panel
        chunk_info = []
        for node in reranked_nodes:
            metadata = node.metadata if hasattr(node, 'metadata') else {}
            chunk_info.append({
                'document_id': metadata.get('document_id', 'unknown'),
                'section_id': metadata.get('section_id', 'unknown'),
                'section_path': metadata.get('section_path', ''),
                'section_text': metadata.get('section_text', ''),
                'type': metadata.get('type', 'text'),
                'table_number': metadata.get('table_number', ''),
                'image_number': metadata.get('image_number', ''),
                'chunk_size': len(node.text),
                'chunk_text': node.text
            })
        # Local import — presumably to avoid a circular dependency with app; confirm
        from app import create_chunks_display_html
        chunks_html = create_chunks_display_html(chunk_info)

        return answer_with_time, sources_html, chunks_html

    except Exception as e:
        log_message(f"Ошибка: {str(e)}")
        error_msg = f"<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Ошибка: {str(e)}</div>"
        return error_msg, "", ""
 
1
+ import logging
2
+ import sys
3
+ from llama_index.llms.google_genai import GoogleGenAI
4
+ from llama_index.llms.openai import OpenAI
5
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
6
+ from sentence_transformers import CrossEncoder
7
+ from config import AVAILABLE_MODELS, DEFAULT_MODEL, GOOGLE_API_KEY
8
+ import time
9
+ from index_retriever import rerank_nodes
10
+ from my_logging import log_message
11
+ from config import PROMPT_SIMPLE_POISK
12
+ import re
13
+
14
def get_llm_model(model_name):
    """Build an LLM client for *model_name* from AVAILABLE_MODELS.

    Unknown names fall back to the default model config; any failure
    (missing API key, unsupported provider, constructor error) is logged
    and a hard-coded Gemini client is returned so callers always receive
    a usable LLM instance.
    """
    try:
        cfg = AVAILABLE_MODELS.get(model_name)
        if not cfg:
            log_message(f"Модель {model_name} не найдена, использую модель по умолчанию")
            cfg = AVAILABLE_MODELS[DEFAULT_MODEL]

        if not cfg.get("api_key"):
            raise Exception(f"API ключ не найден для модели {model_name}")

        provider = cfg["provider"]
        if provider == "google":
            return GoogleGenAI(model=cfg["model_name"], api_key=cfg["api_key"])
        if provider == "openai":
            return OpenAI(model=cfg["model_name"], api_key=cfg["api_key"])
        raise Exception(f"Неподдерживаемый провайдер: {cfg['provider']}")

    except Exception as e:
        # Last-resort fallback keeps the app running even with a bad config.
        log_message(f"Ошибка создания модели {model_name}: {str(e)}")
        return GoogleGenAI(model="gemini-2.0-flash", api_key=GOOGLE_API_KEY)
40
+
41
def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
    """Return a HuggingFace embedding model (multilingual MiniLM by default)."""
    return HuggingFaceEmbedding(model_name=model_name)
43
+
44
def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
    """Return a CrossEncoder used to rerank retrieved nodes by relevance."""
    return CrossEncoder(model_name)
46
+
47
def generate_sources_html(nodes, chunks_df=None):
    """Render an HTML panel listing the sources behind the retrieved *nodes*.

    Nodes are grouped per document/table/image/section so each source is
    shown once; when *chunks_df* has a ``file_link`` column, text sources
    additionally get a link to the original document.
    """
    parts = [
        "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>",
        "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>",
    ]

    grouped = {}

    for node in nodes:
        meta = node.metadata if hasattr(node, 'metadata') else {}
        kind = meta.get('type', 'text')
        doc = meta.get('document_id', 'unknown')

        # Build a grouping key so duplicates of the same source collapse.
        if kind == 'table' or kind == 'table_row':
            group_key = f"{doc}_table_{meta.get('table_number', 'unknown')}"
        elif kind == 'image':
            group_key = f"{doc}_image_{meta.get('image_number', 'unknown')}"
        else:
            path = meta.get('section_path', '')
            sid = meta.get('section_id', '')
            group_key = f"{doc}_text_{path if path else sid}"

        # First node seen for a key supplies the displayed metadata.
        entry = grouped.setdefault(group_key, {
            'doc_id': doc,
            'doc_type': kind,
            'metadata': meta,
            'sections': set()
        })

        # NOTE(review): 'sections' is accumulated but never rendered below —
        # presumably intended for future display; kept for compatibility.
        if kind not in ['table', 'table_row', 'image']:
            path = meta.get('section_path', '')
            sid = meta.get('section_id', '')
            if path:
                entry['sections'].add(f"пункт {path}")
            elif sid and sid != 'unknown':
                entry['sections'].add(f"пункт {sid}")

    for entry in grouped.values():
        meta = entry['metadata']
        kind = entry['doc_type']
        doc = entry['doc_id']

        parts.append(f"<div style='margin-bottom: 15px; padding: 15px; border: 1px solid #4a5568; border-radius: 8px; background-color: #1a202c;'>")

        if kind == 'text':
            parts.append(f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc}</h4>")
        elif kind == 'table' or kind == 'table_row':
            num = meta.get('table_number', 'unknown')
            title = meta.get('table_title', '')
            if num and num != 'unknown':
                if not str(num).startswith('№'):
                    num = f"№{num}"
                parts.append(f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица {num} - {doc}</h4>")
                if title and title != 'unknown':
                    parts.append(f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{title}</p>")
            else:
                parts.append(f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc}</h4>")
        elif kind == 'image':
            num = meta.get('image_number', 'unknown')
            title = meta.get('image_title', '')
            if num and num != 'unknown':
                if not str(num).startswith('№'):
                    num = f"№{num}"
                parts.append(f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {num} - {doc}</h4>")
                if title and title != 'unknown':
                    parts.append(f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{title}</p>")

        # Only text sources get a document link, and only when available.
        if chunks_df is not None and 'file_link' in chunks_df.columns and kind == 'text':
            matches = chunks_df[chunks_df['document_id'] == doc]
            if not matches.empty:
                link = matches.iloc[0]['file_link']
                parts.append(f"<a href='{link}' target='_blank' style='color: #68d391; text-decoration: none; font-size: 14px; display: inline-block; margin-top: 10px;'>🔗 Ссылка на документ</a><br>")

        parts.append("</div>")

    parts.append("</div>")
    return "".join(parts)
126
+
127
def deduplicate_nodes(nodes):
    """Deduplicate retrieved nodes based on content and metadata.

    Builds a stable identifier per node (document + type + position info)
    and keeps only the first node seen for each identifier, preserving
    the original retrieval order.
    """
    import hashlib  # hoisted: was re-imported inside the loop per node

    seen = set()
    unique_nodes = []

    for node in nodes:
        doc_id = node.metadata.get('document_id', '')
        node_type = node.metadata.get('type', 'text')

        if node_type == 'table' or node_type == 'table_row':
            table_num = node.metadata.get('table_number', '')
            table_identifier = node.metadata.get('table_identifier', table_num)

            # Use row range to distinguish chunks of the same table
            row_start = node.metadata.get('row_start', '')
            row_end = node.metadata.get('row_end', '')
            is_complete = node.metadata.get('is_complete_table', False)

            if is_complete:
                identifier = f"{doc_id}|table|{table_identifier}|complete"
            elif row_start != '' and row_end != '':
                identifier = f"{doc_id}|table|{table_identifier}|rows_{row_start}_{row_end}"
            else:
                # Fallback: use chunk_id if available
                chunk_id = node.metadata.get('chunk_id', '')
                if chunk_id != '':
                    identifier = f"{doc_id}|table|{table_identifier}|chunk_{chunk_id}"
                else:
                    # Last resort: hash first 100 chars of content
                    content_hash = hashlib.md5(node.text[:100].encode()).hexdigest()[:8]
                    identifier = f"{doc_id}|table|{table_identifier}|{content_hash}"

        elif node_type == 'image':
            img_num = node.metadata.get('image_number', '')
            identifier = f"{doc_id}|image|{img_num}"

        else:  # text: section_id + chunk_id should be unique
            section_id = node.metadata.get('section_id', '')
            chunk_id = node.metadata.get('chunk_id', 0)
            identifier = f"{doc_id}|text|{section_id}|{chunk_id}"

        if identifier not in seen:
            seen.add(identifier)
            unique_nodes.append(node)

    return unique_nodes
175
+
176
def normalize_query(query):
    """Normalize connection-type codes in *query* (e.g. 'С-25' -> 'C25').

    Maps the Cyrillic letters С/Т/У (upper- and lowercase) to the visually
    identical Latin C/T/U, uppercases lowercase Latin c/t/u, and drops an
    optional hyphen/space between the letter and the digits, so queries
    match codes stored in Latin without separators.
    """
    to_latin_upper = {
        'С': 'C', 'с': 'C', 'Т': 'T', 'т': 'T', 'У': 'U', 'у': 'U',
        # BUGFIX: lowercase Latin was previously left untouched while
        # lowercase Cyrillic was uppercased; normalize it the same way so
        # 'c-25' matches the same chunks as 'C-25'.
        'c': 'C', 't': 'T', 'u': 'U',
    }

    def repl(m):
        letter = to_latin_upper.get(m.group(1), m.group(1))
        return f"{letter}{m.group(2)}"

    return re.sub(r'\b([СсТтУуCTUctu])[-\s]?(\d+)\b', repl, query)
183
+
184
+
185
def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
    """Answer *question* through the RAG pipeline and return display HTML.

    Returns a tuple ``(answer_html, sources_html, chunks_html)``; when the
    engine is missing or any step fails, the first element carries an error
    panel and the other two are empty strings.
    """
    if query_engine is None:
        return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""

    try:
        start_time = time.time()

        # NORMALIZE QUERY: Convert Cyrillic to Latin and remove hyphens
        normalized_question = normalize_query(question)
        log_message(f"Original query: {question}")
        log_message(f"Normalized query: {normalized_question}")

        # BUGFIX: retrieval used the raw question while reranking and the
        # LLM call below use the normalized one, so retrieval could miss
        # chunks the rest of the pipeline expects (e.g. C25 codes).
        # Retrieve with the normalized query to keep the pipeline consistent.
        retrieved_nodes = query_engine.retriever.retrieve(normalized_question)
        log_message(f"user query: {question}")

        log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")

        unique_retrieved = deduplicate_nodes(retrieved_nodes)
        log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")

        # Diagnostic: count connection types among retrieved table chunks
        conn_types_retrieved = {}
        for node in unique_retrieved:
            if node.metadata.get('type') == 'table':
                conn_type = node.metadata.get('connection_type', '')
                if conn_type:
                    conn_types_retrieved[conn_type] = conn_types_retrieved.get(conn_type, 0) + 1

        if conn_types_retrieved:
            log_message("CONNECTION TYPES IN RETRIEVED:")
            for ct, cnt in sorted(conn_types_retrieved.items()):
                log_message(f" {ct}: {cnt} chunks")

        # Diagnostic: check whether the C-25 target type was retrieved
        # (normalize the probe the same way as the query)
        normalized_check = normalize_query('С-25')  # Will become C25
        if normalized_check in question or 'С-25' in question or 'C-25' in question:
            if 'C25' in conn_types_retrieved:
                log_message(f"✓ C25 RETRIEVED: {conn_types_retrieved['C25']} chunks")
            else:
                log_message("✗ C25 NOT RETRIEVED despite being in query!")

        # Diagnostic: sample of retrieved tables (first 10 unique nodes)
        log_message("SAMPLE OF RETRIEVED TABLES:")
        for i, node in enumerate(unique_retrieved[:10]):
            if node.metadata.get('type') == 'table':
                table_num = node.metadata.get('table_number', 'N/A')
                table_title = node.metadata.get('table_title', 'N/A')
                conn_type = node.metadata.get('connection_type', 'N/A')
                doc_id = node.metadata.get('document_id', 'N/A')
                log_message(f" [{i+1}] {doc_id} - Table {table_num} - Type: {conn_type}")

        # Rerank - use normalized query for consistency
        reranked_nodes = rerank_nodes(normalized_question, unique_retrieved, reranker, top_k=20)

        # CRITICAL FIX: Use normalized query for LLM as well
        response = query_engine.query(normalized_question)

        end_time = time.time()
        processing_time = end_time - start_time

        log_message(f"Обработка завершена за {processing_time:.2f}с")

        sources_html = generate_sources_html(reranked_nodes, chunks_df)

        answer_with_time = f"""<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; margin-bottom: 10px;'>
<h3 style='color: #63b3ed; margin-top: 0;'>Ответ (Модель: {current_model}):</h3>
<div style='line-height: 1.6; font-size: 16px;'>{response.response}</div>
<div style='margin-top: 15px; padding-top: 10px; border-top: 1px solid #4a5568; font-size: 14px; color: #a0aec0;'>
Время обработки: {processing_time:.2f} секунд
</div>
</div>"""

        # Collect per-chunk details for the debug/chunks panel
        chunk_info = []
        for node in reranked_nodes:
            metadata = node.metadata if hasattr(node, 'metadata') else {}
            chunk_info.append({
                'document_id': metadata.get('document_id', 'unknown'),
                'section_id': metadata.get('section_id', 'unknown'),
                'section_path': metadata.get('section_path', ''),
                'section_text': metadata.get('section_text', ''),
                'type': metadata.get('type', 'text'),
                'table_number': metadata.get('table_number', ''),
                'image_number': metadata.get('image_number', ''),
                'chunk_size': len(node.text),
                'chunk_text': node.text
            })
        # Local import — presumably to avoid a circular dependency with app; confirm
        from app import create_chunks_display_html
        chunks_html = create_chunks_display_html(chunk_info)

        return answer_with_time, sources_html, chunks_html

    except Exception as e:
        log_message(f"Ошибка: {str(e)}")
        error_msg = f"<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Ошибка: {str(e)}</div>"
        return error_msg, "", ""