MrSimple01 committed on
Commit
1333a87
·
verified ·
1 Parent(s): 43fc13e

chunks html added

Browse files
Files changed (1) hide show
  1. utils.py +205 -193
utils.py CHANGED
@@ -1,194 +1,206 @@
1
- import logging
2
- import sys
3
- from llama_index.llms.google_genai import GoogleGenAI
4
- from llama_index.llms.openai import OpenAI
5
- from llama_index.embeddings.huggingface import HuggingFaceEmbedding
6
- from sentence_transformers import CrossEncoder
7
- from config import AVAILABLE_MODELS, DEFAULT_MODEL, GOOGLE_API_KEY
8
- import time
9
- from index_retriever import rerank_nodes
10
- from my_logging import log_message
11
- from config import PROMPT_SIMPLE_POISK
12
-
13
def get_llm_model(model_name):
    """Build the LLM client configured under ``model_name``.

    The name is looked up in ``AVAILABLE_MODELS``; an unknown name falls back
    to the ``DEFAULT_MODEL`` entry. Supported providers are ``"google"``
    (``GoogleGenAI``) and ``"openai"`` (``OpenAI``).

    Any failure (missing API key, unsupported provider, constructor error) is
    logged, and a ``gemini-2.0-flash`` ``GoogleGenAI`` client built with
    ``GOOGLE_API_KEY`` is returned instead, so this function does not raise
    for configuration problems.
    """
    try:
        model_config = AVAILABLE_MODELS.get(model_name)
        if not model_config:
            log_message(f"Модель {model_name} не найдена, использую модель по умолчанию")
            model_config = AVAILABLE_MODELS[DEFAULT_MODEL]

        if not model_config.get("api_key"):
            raise ValueError(f"API ключ не найден для модели {model_name}")

        provider = model_config["provider"]
        if provider == "google":
            return GoogleGenAI(
                model=model_config["model_name"],
                api_key=model_config["api_key"],
            )
        if provider == "openai":
            return OpenAI(
                model=model_config["model_name"],
                api_key=model_config["api_key"],
            )
        raise ValueError(f"Неподдерживаемый провайдер: {provider}")

    except Exception as e:
        # Deliberate best-effort: log the problem and hand back a known-good client.
        log_message(f"Ошибка создания модели {model_name}: {str(e)}")
        return GoogleGenAI(model="gemini-2.0-flash", api_key=GOOGLE_API_KEY)
39
-
40
def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
    """Return a ``HuggingFaceEmbedding`` instance for ``model_name``."""
    return HuggingFaceEmbedding(model_name=model_name)
42
-
43
def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
    """Return a ``CrossEncoder`` reranker instance for ``model_name``."""
    return CrossEncoder(model_name)
45
-
46
def format_context_for_llm(nodes):
    """Render nodes as a source-tagged text context for the LLM prompt.

    Each node becomes a ``[ИСТОЧНИК: ...]`` header line followed by the node
    text and a newline; the per-node entries are joined with a newline.

    Args:
        nodes: iterable of objects; ``metadata`` (a dict) and ``text`` are
            read when present. A node without ``text`` is rendered with
            ``str(node)``; missing or ``None`` metadata is treated as empty.

    Returns:
        The combined context string (empty string for no nodes).
    """
    context_parts = []

    for node in nodes:
        # ``or {}`` also covers a metadata attribute that is explicitly None.
        metadata = getattr(node, 'metadata', None) or {}
        doc_id = metadata.get('document_id', 'Неизвестный документ')
        section_info = _context_source_label(metadata)
        context_text = node.text if hasattr(node, 'text') else str(node)

        if section_info:
            header = f"[ИСТОЧНИК: {section_info} документа {doc_id}]"
        else:
            header = f"[ИСТОЧНИК: документ {doc_id}]"

        context_parts.append(f"{header}\n{context_text}\n")

    return "\n".join(context_parts)


def _context_number_label(value):
    """Return ``value`` as text, prefixed with '№' unless it already starts with it."""
    text = str(value)
    return text if text.startswith('№') else f"№{text}"


def _context_source_label(metadata):
    """Return the location label for one node's metadata ('' when nothing is known)."""
    node_type = metadata.get('type')

    # A numbered table or image is cited by its own number; this takes
    # priority over any section information on the same node.
    if node_type == 'table' and metadata.get('table_number'):
        return f"таблица {_context_number_label(metadata['table_number'])}"
    if node_type == 'image' and metadata.get('image_number'):
        return f"рисунок {_context_number_label(metadata['image_number'])}"

    section_text = metadata.get('section_text', '')

    if metadata.get('section_path'):
        label = f"пункт {metadata['section_path']}"
        # Only add the parenthesised title when there is one, so an untitled
        # nested section does not render an empty "()".
        if section_text:
            label += f" ({section_text})"

        parent_section = metadata.get('parent_section', '')
        parent_title = metadata.get('parent_title', '')
        nested_levels = ('subsection', 'sub_subsection', 'sub_sub_subsection')
        if metadata.get('level') in nested_levels and parent_section and parent_title:
            label += f" в разделе {parent_section} ({parent_title})"
        return label

    section_id = metadata.get('section_id')
    if section_id:
        if section_text:
            return f"пункт {section_id} ({section_text})"
        return f"пункт {section_id}"

    return ""
97
-
98
def generate_sources_html(nodes, chunks_df=None):
    """Build the dark-themed "Источники" HTML panel listing the given nodes.

    One card is emitted per node. Depending on ``metadata['type']`` (default
    ``'text'``) the card shows the document and section ids, a table heading,
    or an image heading. For text nodes a link is appended when ``chunks_df``
    has a ``file_link`` column and a row whose ``document_id`` matches.

    Args:
        nodes: iterable of objects whose ``metadata`` dict is read when present.
        chunks_df: optional DataFrame with ``document_id`` and ``file_link``
            columns used to look up document links.

    Returns:
        The HTML fragment as a single string.
    """
    parts = [
        "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>",
        "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>",
    ]

    for node in nodes:
        # ``or {}`` also covers a metadata attribute that is explicitly None.
        metadata = getattr(node, 'metadata', None) or {}
        doc_type = metadata.get('type', 'text')
        doc_id = metadata.get('document_id', 'unknown')

        parts.append("<div style='margin-bottom: 15px; padding: 15px; border: 1px solid #4a5568; border-radius: 8px; background-color: #1a202c;'>")

        if doc_type == 'text':
            section_id = metadata.get('section_id', '')
            parts.append(f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>")
            parts.append(f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📌 {section_id}</h4>")

            # Document links are only attached to text sources.
            if chunks_df is not None and 'file_link' in chunks_df.columns:
                doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
                if not doc_rows.empty:
                    file_link = doc_rows.iloc[0]['file_link']
                    parts.append(f"<a href='{file_link}' target='_blank' style='color: #68d391; text-decoration: none; font-size: 14px; display: inline-block; margin-top: 10px;'>🔗 Ссылка на документ</a><br>")

        elif doc_type == 'table':
            table_num = metadata.get('table_number', 'unknown')
            if table_num and table_num != 'unknown':
                # The number may be stored as an int; normalise through str()
                # exactly as the image branch does.
                parts.append(f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица {_html_number_label(table_num)} - {doc_id}</h4>")
            else:
                parts.append(f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc_id}</h4>")

        elif doc_type == 'image':
            image_num = metadata.get('image_number', 'unknown')
            section = metadata.get('section', '')
            if image_num and image_num != 'unknown':
                parts.append(f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {_html_number_label(image_num)} - {doc_id} ({section})</h4>")
            else:
                parts.append(f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение - {doc_id} ({section})</h4>")

        parts.append("</div>")

    parts.append("</div>")
    return "".join(parts)


def _html_number_label(value):
    """Return ``value`` as text, prefixed with '№' unless it already starts with it."""
    text = str(value)
    return text if text.startswith('№') else f"№{text}"
142
-
143
def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
    """Answer ``question`` and return HTML for the answer and for its sources.

    Steps: retrieve nodes with ``query_engine.retriever``, rerank them with
    ``rerank_nodes`` (top 10), render them with ``format_context_for_llm``,
    then send the rendered context plus the question to ``query_engine.query``.

    Args:
        question: the user's question text.
        query_engine: engine exposing ``retriever.retrieve`` and ``query``;
            ``None`` means the system is not initialised.
        reranker: reranker passed through to ``rerank_nodes``.
        current_model: model name, used for logging and the answer heading.
        chunks_df: optional DataFrame forwarded to ``generate_sources_html``.

    Returns:
        A 2-tuple ``(answer_html, sources_html)``. When the engine is missing
        or any step raises, the first item is a red error banner and the
        second is an empty string.
    """
    error_style = "background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;"

    if query_engine is None:
        return f"<div style='{error_style}'>Система не инициализирована</div>", ""

    try:
        log_message(f"Получен вопрос: {question}")
        log_message(f"Используется модель: {current_model}")
        start_time = time.time()

        log_message("Извлекаю релевантные узлы")
        retrieved_nodes = query_engine.retriever.retrieve(question)
        log_message(f"Извлечено {len(retrieved_nodes)} узлов")
        # Log a short preview of at most the first three nodes.
        for position, node in enumerate(retrieved_nodes[:3], start=1):
            log_message(f"Пример узла {position}: {node.text[:200]}...")

        log_message("Применяю переранжировку")
        reranked_nodes = rerank_nodes(question, retrieved_nodes, reranker, top_k=10)

        formatted_context = format_context_for_llm(reranked_nodes)
        log_message(f"fорматированный контекст для LLM:\n{formatted_context[:500]}...")

        # The prompt starts with a newline, matching the original literal.
        enhanced_question = (
            "\nКонтекст из базы данных:\n"
            f"{formatted_context}\n"
            "\n"
            f"Вопрос пользователя: {question}"
        )

        log_message(f"Отправляю запрос в LLM с {len(reranked_nodes)} узлами")
        log_message(f"Вопрос для LLM:\n{enhanced_question}...")
        response = query_engine.query(enhanced_question)

        processing_time = time.time() - start_time
        log_message(f"Обработка завершена за {processing_time:.2f} секунд")

        sources_html = generate_sources_html(reranked_nodes, chunks_df)

        answer_with_time = "\n".join([
            "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; margin-bottom: 10px;'>",
            f"<h3 style='color: #63b3ed; margin-top: 0;'>Ответ (Модель: {current_model}):</h3>",
            f"<div style='line-height: 1.6; font-size: 16px;'>{response.response}</div>",
            "<div style='margin-top: 15px; padding-top: 10px; border-top: 1px solid #4a5568; font-size: 14px; color: #a0aec0;'>",
            f"Время обработки: {processing_time:.2f} секунд",
            "</div>",
            "</div>",
        ])

        return answer_with_time, sources_html

    except Exception as e:
        # Top-level boundary for the UI: log and show the error instead of raising.
        log_message(f"Ошибка обработки вопроса: {str(e)}")
        error_msg = f"<div style='{error_style}'>Ошибка обработки вопроса: {str(e)}</div>"
        return error_msg, ""
 
1
+ import logging
2
+ import sys
3
+ from llama_index.llms.google_genai import GoogleGenAI
4
+ from llama_index.llms.openai import OpenAI
5
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
6
+ from sentence_transformers import CrossEncoder
7
+ from config import AVAILABLE_MODELS, DEFAULT_MODEL, GOOGLE_API_KEY
8
+ import time
9
+ from index_retriever import rerank_nodes
10
+ from my_logging import log_message
11
+ from config import PROMPT_SIMPLE_POISK
12
+
13
def get_llm_model(model_name):
    """Build the LLM client configured under ``model_name``.

    The name is looked up in ``AVAILABLE_MODELS``; an unknown name falls back
    to the ``DEFAULT_MODEL`` entry. Supported providers are ``"google"``
    (``GoogleGenAI``) and ``"openai"`` (``OpenAI``).

    Any failure (missing API key, unsupported provider, constructor error) is
    logged, and a ``gemini-2.0-flash`` ``GoogleGenAI`` client built with
    ``GOOGLE_API_KEY`` is returned instead, so this function does not raise
    for configuration problems.
    """
    try:
        model_config = AVAILABLE_MODELS.get(model_name)
        if not model_config:
            log_message(f"Модель {model_name} не найдена, использую модель по умолчанию")
            model_config = AVAILABLE_MODELS[DEFAULT_MODEL]

        if not model_config.get("api_key"):
            raise ValueError(f"API ключ не найден для модели {model_name}")

        provider = model_config["provider"]
        if provider == "google":
            return GoogleGenAI(
                model=model_config["model_name"],
                api_key=model_config["api_key"],
            )
        if provider == "openai":
            return OpenAI(
                model=model_config["model_name"],
                api_key=model_config["api_key"],
            )
        raise ValueError(f"Неподдерживаемый провайдер: {provider}")

    except Exception as e:
        # Deliberate best-effort: log the problem and hand back a known-good client.
        log_message(f"Ошибка создания модели {model_name}: {str(e)}")
        return GoogleGenAI(model="gemini-2.0-flash", api_key=GOOGLE_API_KEY)
39
+
40
def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
    """Return a ``HuggingFaceEmbedding`` instance for ``model_name``."""
    return HuggingFaceEmbedding(model_name=model_name)
42
+
43
def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
    """Return a ``CrossEncoder`` reranker instance for ``model_name``."""
    return CrossEncoder(model_name)
45
+
46
def format_context_for_llm(nodes):
    """Render nodes as a source-tagged text context for the LLM prompt.

    Each node becomes a ``[ИСТОЧНИК: ...]`` header line followed by the node
    text and a newline; the per-node entries are joined with a newline.

    Args:
        nodes: iterable of objects; ``metadata`` (a dict) and ``text`` are
            read when present. A node without ``text`` is rendered with
            ``str(node)``; missing or ``None`` metadata is treated as empty.

    Returns:
        The combined context string (empty string for no nodes).
    """
    context_parts = []

    for node in nodes:
        # ``or {}`` also covers a metadata attribute that is explicitly None.
        metadata = getattr(node, 'metadata', None) or {}
        doc_id = metadata.get('document_id', 'Неизвестный документ')
        section_info = _context_source_label(metadata)
        context_text = node.text if hasattr(node, 'text') else str(node)

        if section_info:
            header = f"[ИСТОЧНИК: {section_info} документа {doc_id}]"
        else:
            header = f"[ИСТОЧНИК: документ {doc_id}]"

        context_parts.append(f"{header}\n{context_text}\n")

    return "\n".join(context_parts)


def _context_number_label(value):
    """Return ``value`` as text, prefixed with '№' unless it already starts with it."""
    text = str(value)
    return text if text.startswith('№') else f"№{text}"


def _context_source_label(metadata):
    """Return the location label for one node's metadata ('' when nothing is known)."""
    node_type = metadata.get('type')

    # A numbered table or image is cited by its own number; this takes
    # priority over any section information on the same node.
    if node_type == 'table' and metadata.get('table_number'):
        return f"таблица {_context_number_label(metadata['table_number'])}"
    if node_type == 'image' and metadata.get('image_number'):
        return f"рисунок {_context_number_label(metadata['image_number'])}"

    section_text = metadata.get('section_text', '')

    if metadata.get('section_path'):
        label = f"пункт {metadata['section_path']}"
        # Only add the parenthesised title when there is one, so an untitled
        # nested section does not render an empty "()".
        if section_text:
            label += f" ({section_text})"

        parent_section = metadata.get('parent_section', '')
        parent_title = metadata.get('parent_title', '')
        nested_levels = ('subsection', 'sub_subsection', 'sub_sub_subsection')
        if metadata.get('level') in nested_levels and parent_section and parent_title:
            label += f" в разделе {parent_section} ({parent_title})"
        return label

    section_id = metadata.get('section_id')
    if section_id:
        if section_text:
            return f"пункт {section_id} ({section_text})"
        return f"пункт {section_id}"

    return ""
97
+
98
def generate_sources_html(nodes, chunks_df=None):
    """Build the dark-themed "Источники" HTML panel listing the given nodes.

    One card is emitted per node. Depending on ``metadata['type']`` (default
    ``'text'``) the card shows the document and section ids, a table heading,
    or an image heading. For text nodes a link is appended when ``chunks_df``
    has a ``file_link`` column and a row whose ``document_id`` matches.

    Args:
        nodes: iterable of objects whose ``metadata`` dict is read when present.
        chunks_df: optional DataFrame with ``document_id`` and ``file_link``
            columns used to look up document links.

    Returns:
        The HTML fragment as a single string.
    """
    parts = [
        "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>",
        "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>",
    ]

    for node in nodes:
        # ``or {}`` also covers a metadata attribute that is explicitly None.
        metadata = getattr(node, 'metadata', None) or {}
        doc_type = metadata.get('type', 'text')
        doc_id = metadata.get('document_id', 'unknown')

        parts.append("<div style='margin-bottom: 15px; padding: 15px; border: 1px solid #4a5568; border-radius: 8px; background-color: #1a202c;'>")

        if doc_type == 'text':
            section_id = metadata.get('section_id', '')
            parts.append(f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>")
            parts.append(f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📌 {section_id}</h4>")

            # Document links are only attached to text sources.
            if chunks_df is not None and 'file_link' in chunks_df.columns:
                doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
                if not doc_rows.empty:
                    file_link = doc_rows.iloc[0]['file_link']
                    parts.append(f"<a href='{file_link}' target='_blank' style='color: #68d391; text-decoration: none; font-size: 14px; display: inline-block; margin-top: 10px;'>🔗 Ссылка на документ</a><br>")

        elif doc_type == 'table':
            table_num = metadata.get('table_number', 'unknown')
            if table_num and table_num != 'unknown':
                # The number may be stored as an int; normalise through str()
                # exactly as the image branch does.
                parts.append(f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица {_html_number_label(table_num)} - {doc_id}</h4>")
            else:
                parts.append(f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc_id}</h4>")

        elif doc_type == 'image':
            image_num = metadata.get('image_number', 'unknown')
            section = metadata.get('section', '')
            if image_num and image_num != 'unknown':
                parts.append(f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {_html_number_label(image_num)} - {doc_id} ({section})</h4>")
            else:
                parts.append(f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение - {doc_id} ({section})</h4>")

        parts.append("</div>")

    parts.append("</div>")
    return "".join(parts)


def _html_number_label(value):
    """Return ``value`` as text, prefixed with '№' unless it already starts with it."""
    text = str(value)
    return text if text.startswith('№') else f"№{text}"
142
+
143
def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
    """Answer ``question`` and return HTML for the answer, its sources and its chunks.

    Steps: retrieve nodes with ``query_engine.retriever``, rerank them with
    ``rerank_nodes`` (top 10), render them with ``format_context_for_llm``,
    then send the rendered context plus the question to ``query_engine.query``.

    Args:
        question: the user's question text.
        query_engine: engine exposing ``retriever.retrieve`` and ``query``;
            ``None`` means the system is not initialised.
        reranker: reranker passed through to ``rerank_nodes``.
        current_model: model name, used for logging and the answer heading.
        chunks_df: optional DataFrame forwarded to ``generate_sources_html``.

    Returns:
        A 3-tuple ``(answer_html, sources_html, chunks_html)``. When the
        engine is missing or any step raises, the first item is a red error
        banner and the other two are empty strings, so callers can unpack
        three values on every path.
    """
    error_style = "background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;"

    if query_engine is None:
        return f"<div style='{error_style}'>Система не инициализирована</div>", "", ""

    try:
        log_message(f"Получен вопрос: {question}")
        log_message(f"Используется модель: {current_model}")
        start_time = time.time()

        log_message("Извлекаю релевантные узлы")
        retrieved_nodes = query_engine.retriever.retrieve(question)
        log_message(f"Извлечено {len(retrieved_nodes)} узлов")
        # Log a short preview of at most the first three nodes.
        for position, node in enumerate(retrieved_nodes[:3], start=1):
            log_message(f"Пример узла {position}: {node.text[:200]}...")

        log_message("Применяю переранжировку")
        reranked_nodes = rerank_nodes(question, retrieved_nodes, reranker, top_k=10)

        formatted_context = format_context_for_llm(reranked_nodes)
        log_message(f"fорматированный контекст для LLM:\n{formatted_context[:500]}...")

        # The prompt starts with a newline, matching the original literal.
        enhanced_question = (
            "\nКонтекст из базы данных:\n"
            f"{formatted_context}\n"
            "\n"
            f"Вопрос пользователя: {question}"
        )

        log_message(f"Отправляю запрос в LLM с {len(reranked_nodes)} узлами")
        log_message(f"Вопрос для LLM:\n{enhanced_question}...")
        response = query_engine.query(enhanced_question)

        processing_time = time.time() - start_time
        log_message(f"Обработка завершена за {processing_time:.2f} секунд")

        sources_html = generate_sources_html(reranked_nodes, chunks_df)

        answer_with_time = "\n".join([
            "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; margin-bottom: 10px;'>",
            f"<h3 style='color: #63b3ed; margin-top: 0;'>Ответ (Модель: {current_model}):</h3>",
            f"<div style='line-height: 1.6; font-size: 16px;'>{response.response}</div>",
            "<div style='margin-top: 15px; padding-top: 10px; border-top: 1px solid #4a5568; font-size: 14px; color: #a0aec0;'>",
            f"Время обработки: {processing_time:.2f} секунд",
            "</div>",
            "</div>",
        ])

        # One record per reranked node; ``section`` is the fallback key when
        # ``section_id`` is absent.
        chunk_info = [
            {
                'document_id': node.metadata.get('document_id', 'unknown'),
                'section_id': node.metadata.get('section_id', node.metadata.get('section', 'unknown')),
                'chunk_size': len(node.text),
                'chunk_text': node.text,
            }
            for node in reranked_nodes
        ]
        # Imported at call time rather than at module load.
        # NOTE(review): presumably to avoid a circular import with ``app`` — confirm.
        from app import create_chunks_display_html
        chunks_html = create_chunks_display_html(chunk_info)

        return answer_with_time, sources_html, chunks_html

    except Exception as e:
        # Top-level boundary for the UI: log and show the error instead of raising.
        log_message(f"Ошибка обработки вопроса: {str(e)}")
        error_msg = f"<div style='{error_style}'>Ошибка обработки вопроса: {str(e)}</div>"
        return error_msg, "", ""