MrSimple07 commited on
Commit
9985d37
·
1 Parent(s): 6370d73

simplest version

Browse files
Files changed (4) hide show
  1. app.py +102 -328
  2. documents_prep.py +220 -540
  3. index_retriever.py +54 -113
  4. utils.py +81 -277
app.py CHANGED
@@ -1,355 +1,129 @@
1
  import gradio as gr
2
- import os
3
  from llama_index.core import Settings
4
- from documents_prep import *
5
- from utils import *
6
- from my_logging import log_message
7
  from index_retriever import create_vector_index, create_query_engine
8
- import sys
9
- from config import (
10
- HF_REPO_ID, HF_TOKEN, DOWNLOAD_DIR, CHUNKS_FILENAME,
11
- JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
12
- )
 
13
 
14
def create_chunks_display_html(chunk_info):
    """Render retrieved chunks as an HTML panel for the Gradio UI.

    Args:
        chunk_info: list of chunk dicts; each must have a 'document_id'
            key (accessed with [] below, so a missing key raises KeyError)
            plus the optional section/type keys consumed by
            get_section_display() and get_formatted_content().

    Returns:
        str: a scrollable <div> with one card per chunk, or a placeholder
        message (in Russian) when chunk_info is empty/None.
    """
    if not chunk_info:
        return "<div style='padding: 20px; text-align: center; color: black;'>Нет данных о чанках</div>"

    html = "<div style='max-height: 500px; overflow-y: auto; padding: 10px; color: black;'>"
    html += f"<h4 style='color: black;'>Найдено релевантных чанков: {len(chunk_info)}</h4>"

    for i, chunk in enumerate(chunk_info):
        # Alternate row background for readability (zebra striping).
        bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"

        # Get section display info
        section_display = get_section_display(chunk)
        formatted_content = get_formatted_content(chunk)

        # NOTE(review): chunk values are interpolated into HTML without
        # escaping — assumes chunk text never contains markup; confirm.
        html += f"""
        <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
            <strong style='color: black;'>Документ:</strong> <span style='color: black;'>{chunk['document_id']}</span><br>
            <strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{section_display}</span><br>
            <strong style='color: black;'>Содержание:</strong><br>
            <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
                {formatted_content}
            </div>
        </div>
        """

    html += "</div>"
    return html
41
-
42
def get_section_display(chunk):
    """Return a human-readable section label for a retrieved chunk.

    Tables and images get dedicated labels ("таблица №N" / "рисунок №N");
    everything else falls back to the section path, then the section id.

    Args:
        chunk: dict with optional keys 'type', 'table_number',
            'image_number', 'section_path', 'section_id'.

    Returns:
        str: display label; "unknown" when no section info is present.
    """
    section_path = chunk.get('section_path', '')
    section_id = chunk.get('section_id', 'unknown')
    doc_type = chunk.get('type', 'text')

    if doc_type == 'table' and chunk.get('table_number'):
        table_num = chunk.get('table_number')
        # Normalize to "№N" so labels render uniformly.
        if not str(table_num).startswith('№'):
            table_num = f"№{table_num}"
        return f"таблица {table_num}"

    if doc_type == 'image' and chunk.get('image_number'):
        image_num = chunk.get('image_number')
        if not str(image_num).startswith('№'):
            image_num = f"№{image_num}"
        return f"рисунок {image_num}"

    # Fix: the original ended with
    #   if section_path: return section_path
    #   elif section_id and section_id != 'unknown': return section_id
    #   return section_id
    # — the elif and the fallback return the same value, so the branch was
    # dead. Collapsed to the equivalent single expression.
    return section_path or section_id
65
 
66
def get_formatted_content(chunk):
    """Build a Russian-language sentence describing a chunk in context.

    For nested levels (subsection and deeper) with a known parent, the
    sentence names the parent section; otherwise it names the chunk's own
    section and strips a leading "<section> " prefix from the chunk text
    when present.

    Args:
        chunk: dict with optional keys 'document_id', 'section_path',
            'section_id', 'section_text', 'parent_section', 'parent_title',
            'level', 'chunk_text'.

    Returns:
        str: "В разделе ... в документе ..., пункт ...: <text>".
    """
    document_id = chunk.get('document_id', 'unknown')
    section_path = chunk.get('section_path', '')
    section_id = chunk.get('section_id', 'unknown')
    section_text = chunk.get('section_text', '')
    parent_section = chunk.get('parent_section', '')
    parent_title = chunk.get('parent_title', '')
    level = chunk.get('level', '')
    chunk_text = chunk.get('chunk_text', '')
    # Fix: removed the unused local `doc_type` that the original read from
    # chunk['type'] and never used.

    if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section:
        current_section = section_path if section_path else section_id
        parent_info = f"{parent_section} ({parent_title})" if parent_title else parent_section
        return f"В разделе {parent_info} в документе {document_id}, пункт {current_section}: {chunk_text}"
    else:
        current_section = section_path if section_path else section_id
        clean_text = chunk_text
        if section_text and chunk_text.startswith(section_text):
            section_title = section_text
        elif chunk_text.startswith(f"{current_section} "):
            # Chunk text repeats its own section number; strip it and use
            # the first sentence (or first 50 chars) as the title.
            clean_text = chunk_text[len(f"{current_section} "):].strip()
            section_title = section_text if section_text else f"{current_section} {clean_text.split('.')[0] if '.' in clean_text else clean_text[:50]}"
        else:
            section_title = section_text if section_text else current_section

        return f"В разделе {current_section} в документе {document_id}, пункт {section_title}: {clean_text}"
94
 
95
- def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
96
- json_files_dir=None, table_data_dir=None, image_data_dir=None,
97
- use_json_instead_csv=False):
98
- try:
99
- from documents_prep import process_documents_with_chunking
100
- log_message("Инициализация системы")
101
- os.makedirs(download_dir, exist_ok=True)
102
- from config import CHUNK_SIZE, CHUNK_OVERLAP
103
- from llama_index.core.text_splitter import TokenTextSplitter
104
-
105
- embed_model = get_embedding_model()
106
- llm = get_llm_model(DEFAULT_MODEL)
107
- reranker = get_reranker_model()
108
 
109
- Settings.embed_model = embed_model
110
- Settings.llm = llm
111
- Settings.text_splitter = TokenTextSplitter(
112
- chunk_size=CHUNK_SIZE,
113
- chunk_overlap=CHUNK_OVERLAP,
114
- separator=" ",
115
- backup_separators=["\n", ".", "!", "?"]
116
- )
117
 
118
- log_message(f"Configured chunk size: {CHUNK_SIZE} tokens")
119
- log_message(f"Configured chunk overlap: {CHUNK_OVERLAP} tokens")
120
-
121
- all_documents = []
122
- chunks_df = None
123
- chunk_info = []
124
 
125
- if use_json_instead_csv and json_files_dir:
126
- log_message("Используем JSON файлы вместо CSV")
127
- json_documents, json_chunk_info = load_json_documents(repo_id, hf_token, json_files_dir, download_dir)
128
- all_documents.extend(json_documents)
129
- chunk_info.extend(json_chunk_info)
130
- else:
131
- if chunks_filename:
132
- log_message("Загружаем данные из CSV")
133
- csv_documents, chunks_df = load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir)
134
- all_documents.extend(csv_documents)
135
 
136
- if table_data_dir:
137
- log_message("Добавляю табличные данные")
138
- table_documents = load_table_data(repo_id, hf_token, table_data_dir)
139
- log_message(f"Загружено {len(table_documents)} табличных документов")
140
-
141
- # Process table documents through chunking
142
- chunked_table_docs, table_chunk_info = process_documents_with_chunking(table_documents)
143
- all_documents.extend(chunked_table_docs)
144
- chunk_info.extend(table_chunk_info)
145
 
146
- if image_data_dir:
147
- log_message("Добавляю данные изображений")
148
- image_documents = load_image_data(repo_id, hf_token, image_data_dir)
149
- log_message(f"Загружено {len(image_documents)} документов изображений")
150
-
151
- # Process image documents through chunking
152
- chunked_image_docs, image_chunk_info = process_documents_with_chunking(image_documents)
153
- all_documents.extend(chunked_image_docs)
154
- chunk_info.extend(image_chunk_info)
155
 
156
- log_message(f"Всего документов после всей обработки: {len(all_documents)}")
 
 
 
 
 
 
 
 
157
 
158
- vector_index = create_vector_index(all_documents)
159
- query_engine = create_query_engine(vector_index)
 
 
 
160
 
161
- log_message(f"Система успешно инициализирована")
162
- return query_engine, chunks_df, reranker, vector_index, chunk_info
 
 
 
163
 
164
- except Exception as e:
165
- log_message(f"Ошибка инициализации: {str(e)}")
166
- return None, None, None, None, []
167
-
168
- def switch_model(model_name, vector_index):
169
- from llama_index.core import Settings
170
- from index_retriever import create_query_engine
171
 
172
- try:
173
- log_message(f"Переключение на модель: {model_name}")
174
-
175
- new_llm = get_llm_model(model_name)
176
- Settings.llm = new_llm
177
-
178
- if vector_index is not None:
179
- new_query_engine = create_query_engine(vector_index)
180
- log_message(f"Модель успешно переключена на: {model_name}")
181
- return new_query_engine, f"✅ Модель переключена на: {model_name}"
182
- else:
183
- return None, "❌ Ошибка: система не инициализирована"
184
-
185
- except Exception as e:
186
- error_msg = f"Ошибка переключения модели: {str(e)}"
187
- log_message(error_msg)
188
- return None, f"❌ {error_msg}"
189
-
190
- def main_answer_question(question):
191
- global query_engine, reranker, current_model, chunks_df
192
- if not question.strip():
193
- return ("<div style='color: black;'>Пожалуйста, введите вопрос</div>",
194
- "<div style='color: black;'>Источники появятся после обработки запроса</div>",
195
- "<div style='color: black;'>Чанки появятся после обработки запроса</div>")
196
-
197
- try:
198
- # Call the answer_question function which returns 3 values
199
- answer_html, sources_html, chunks_html = answer_question(question, query_engine, reranker, current_model, chunks_df)
200
- return answer_html, sources_html, chunks_html
201
-
202
- except Exception as e:
203
- log_message(f"Ошибка при ответе на вопрос: {str(e)}")
204
- return (f"<div style='color: red;'>Ошибка: {str(e)}</div>",
205
- "<div style='color: black;'>Источники недоступны из-за ошибки</div>",
206
- "<div style='color: black;'>Чанки недоступны из-за ошибки</div>")
207
-
208
-
209
-
210
- def create_demo_interface(answer_question_func, switch_model_func, current_model, chunk_info=None):
211
- with gr.Blocks(title="AIEXP - AI Expert для нормативной документации", theme=gr.themes.Soft()) as demo:
212
-
213
- gr.Markdown("""
214
- # AIEXP - Artificial Intelligence Expert
215
-
216
- ## Инструмент для работы с нормативной документацией
217
- """)
218
-
219
- with gr.Tab("Поиск по нормативным документам"):
220
- gr.Markdown("### Задайте вопрос по нормативной документации")
221
-
222
- with gr.Row():
223
- with gr.Column(scale=2):
224
- model_dropdown = gr.Dropdown(
225
- choices=list(AVAILABLE_MODELS.keys()),
226
- value=current_model,
227
- label="Выберите языковую модель",
228
- info="Выберите модель для генерации ответов"
229
- )
230
- with gr.Column(scale=1):
231
- switch_btn = gr.Button("Переключить модель", variant="secondary")
232
- model_status = gr.Textbox(
233
- value=f"Текущая модель: {current_model}",
234
- label="Статус модели",
235
- interactive=False
236
- )
237
-
238
- with gr.Row():
239
- with gr.Column(scale=3):
240
- question_input = gr.Textbox(
241
- label="Ваш вопрос к базе знаний",
242
- placeholder="Введите вопрос по нормативным документам...",
243
- lines=3
244
- )
245
- ask_btn = gr.Button("Найти ответ", variant="primary", size="lg")
246
-
247
- gr.Examples(
248
- examples=[
249
- "О чем этот рисунок: ГОСТ Р 50.04.07-2022 Приложение Л. Л.1.5 Рисунок Л.2",
250
- "Л.9 Формула в ГОСТ Р 50.04.07 - 2022 что и о чем там?",
251
- "Какой стандарт устанавливает порядок признания протоколов испытаний продукции в области использования атомной энергии?",
252
- "Кто несет ответственность за организацию и проведение признания протоколов испытаний продукции?",
253
- "В каких случаях могут быть признаны протоколы испытаний, проведенные лабораториями?",
254
- "В какой таблице можно найти информацию о методы исследований при аттестационных испытаниях технологии термической обработки заготовок из легированных сталей? Какой документ и какой раздел?"
255
- ],
256
- inputs=question_input
257
- )
258
-
259
- with gr.Row():
260
- with gr.Column(scale=2):
261
- answer_output = gr.HTML(
262
- label="",
263
- value=f"<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появится ответ на ваш вопрос...<br><small>Текущая модель: {current_model}</small></div>",
264
- )
265
-
266
- with gr.Column(scale=1):
267
- sources_output = gr.HTML(
268
- label="",
269
- value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся релевантные чанки...</div>",
270
- )
271
-
272
- with gr.Column(scale=1):
273
- chunks_output = gr.HTML(
274
- label="Релевантные чанки",
275
- value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся релевантные чанки...</div>",
276
- )
277
-
278
- switch_btn.click(
279
- fn=switch_model_func,
280
- inputs=[model_dropdown],
281
- outputs=[model_status]
282
- )
283
-
284
- ask_btn.click(
285
- fn=answer_question_func,
286
- inputs=[question_input],
287
- outputs=[answer_output, sources_output, chunks_output]
288
- )
289
-
290
- question_input.submit(
291
- fn=answer_question_func,
292
- inputs=[question_input],
293
- outputs=[answer_output, sources_output, chunks_output]
294
- )
295
  return demo
296
 
297
-
298
- query_engine = None
299
- chunks_df = None
300
- reranker = None
301
- vector_index = None
302
- current_model = DEFAULT_MODEL
303
-
304
- def main_answer_question(question):
305
- global query_engine, reranker, current_model, chunks_df
306
- answer_html, sources_html, chunks_html = answer_question(
307
- question, query_engine, reranker, current_model, chunks_df
308
- )
309
- return answer_html, sources_html, chunks_html
310
-
311
- def main_switch_model(model_name):
312
- global query_engine, vector_index, current_model
313
-
314
- new_query_engine, status_message = switch_model(model_name, vector_index)
315
- if new_query_engine:
316
- query_engine = new_query_engine
317
- current_model = model_name
318
-
319
- return status_message
320
-
321
- def main():
322
- global query_engine, chunks_df, reranker, vector_index, current_model
323
-
324
- log_message("Запуск AIEXP - AI Expert для нормативной документации")
325
-
326
- query_engine, chunks_df, reranker, vector_index, chunk_info = initialize_system(
327
- repo_id=HF_REPO_ID,
328
- hf_token=HF_TOKEN,
329
- download_dir=DOWNLOAD_DIR,
330
- json_files_dir=JSON_FILES_DIR,
331
- table_data_dir=TABLE_DATA_DIR,
332
- image_data_dir=IMAGE_DATA_DIR,
333
- use_json_instead_csv=True,
334
- )
335
-
336
- if query_engine:
337
- log_message("Запуск веб-интерфейса")
338
- demo = create_demo_interface(
339
- answer_question_func=main_answer_question,
340
- switch_model_func=main_switch_model,
341
- current_model=current_model,
342
- chunk_info=chunk_info
343
- )
344
- demo.launch(
345
- server_name="0.0.0.0",
346
- server_port=7860,
347
- share=True,
348
- debug=False
349
- )
350
- else:
351
- log_message("Невозможно запустить приложение из-за ошибки инициализации")
352
- sys.exit(1)
353
-
354
  if __name__ == "__main__":
355
- main()
 
 
 
 
 
 
1
  import gradio as gr
 
2
  from llama_index.core import Settings
3
+ from documents_prep import load_all_documents
 
 
4
  from index_retriever import create_vector_index, create_query_engine
5
+ from utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
6
+ from my_logging import log_message
7
+ from config import *
8
# Global state shared between the Gradio callbacks below.
query_engine = None
reranker = None

def initialize_system():
    """Initialize the RAG system (models, documents, index).

    Populates the module-level `query_engine` and `reranker` globals so
    ask_question() can use them.

    Returns:
        str: a status message for the UI status textbox — success or a
        human-readable error description.
    """
    global query_engine, reranker

    log_message("=" * 60)
    log_message("INITIALIZING SYSTEM")
    log_message("=" * 60)

    # Fix: the previous revision caught initialization failures and the
    # simplified one let them escape into the Gradio callback as a raw
    # traceback; report them as a status string instead.
    try:
        # Setup models
        llm = get_llm_model(GOOGLE_API_KEY)
        embed_model = get_embedding_model()
        reranker = get_reranker_model()

        Settings.llm = llm
        Settings.embed_model = embed_model

        log_message("✓ Models loaded")

        # Load documents from the HF dataset repo (JSON + tables + images).
        documents = load_all_documents(
            repo_id=HF_REPO_ID,
            hf_token=HF_TOKEN,
            json_dir=JSON_FILES_DIR,
            table_dir=TABLE_DATA_DIR,
            image_dir=IMAGE_DATA_DIR
        )

        # Create index
        vector_index = create_vector_index(documents)
        query_engine = create_query_engine(vector_index)
    except Exception as e:
        log_message(f"Initialization failed: {e}")
        return f"❌ Ошибка инициализации: {e}"

    log_message("=" * 60)
    log_message("SYSTEM READY")
    log_message("=" * 60)

    return "✅ System initialized"
48
 
49
def ask_question(question):
    """Handle a question from the UI.

    Args:
        question: raw user input from the question textbox.

    Returns:
        tuple[str, str]: (answer, sources) for the two output textboxes;
        placeholder/error messages when input is blank, the system is not
        initialized, or answering fails.
    """
    if not question.strip():
        return "Пожалуйста, введите вопрос", ""

    if query_engine is None:
        return "❌ Система не инициализирована", ""

    # Fix: answer_question() failures previously propagated out of the
    # Gradio callback as a raw traceback; the prior revision caught them —
    # restored so the UI shows a readable error instead.
    try:
        answer, sources = answer_question(question, query_engine, reranker)
    except Exception as e:
        log_message(f"Ошибка при ответе на вопрос: {e}")
        return f"❌ Ошибка: {e}", ""

    return answer, sources
 
 
 
 
 
 
 
 
 
 
 
60
 
61
def create_interface():
    """Create the Gradio UI.

    Returns:
        gr.Blocks: the assembled demo, ready for .launch().
    """
    with gr.Blocks(title="AIEXP - RAG System", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # AIEXP - AI Expert для нормативной документации
        ## Упрощенная версия RAG системы
        """)

        with gr.Row():
            init_btn = gr.Button("Инициализировать систему", variant="primary")
            status = gr.Textbox(label="Статус", value="Не инициализирована")

        gr.Markdown("### Задайте вопрос")

        with gr.Row():
            question = gr.Textbox(
                label="Ваш вопрос",
                placeholder="Введите вопрос...",
                lines=3
            )

        ask_btn = gr.Button("Найти ответ", variant="primary")

        gr.Examples(
            examples=[
                # Fix: the first example was a truncated/invalid literal
                # (`чем таблица А.12 ..."` — missing opening quote and
                # leading word); reconstructed to match the sibling
                # "О чем / Что показано" example style.
                "О чем таблица А.12 в ГОСТ Р 59023.4-2020?",
                "Какая температура подогрева для стали 20 толщиной до 100 мм?",
                "Что показано на рисунке Л.2 в ГОСТ Р 50.04.07-2022?"
            ],
            inputs=question
        )

        with gr.Row():
            answer = gr.Textbox(
                label="Ответ",
                lines=10
            )
            sources = gr.Textbox(
                label="Источники",
                lines=10
            )

        # Event handlers
        init_btn.click(
            fn=initialize_system,
            outputs=status
        )

        ask_btn.click(
            fn=ask_question,
            inputs=question,
            outputs=[answer, sources]
        )

        # Pressing Enter in the textbox behaves like the button.
        question.submit(
            fn=ask_question,
            inputs=question,
            outputs=[answer, sources]
        )

    return demo
122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
if __name__ == "__main__":
    # Build the UI and serve it on all interfaces, as expected in a
    # container / HF Spaces deployment.
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True
    )
documents_prep.py CHANGED
@@ -1,575 +1,255 @@
1
  import json
2
- import zipfile
3
  import pandas as pd
4
  from huggingface_hub import hf_hub_download, list_repo_files
5
  from llama_index.core import Document
6
- from my_logging import log_message
7
  from llama_index.core.text_splitter import SentenceSplitter
8
- from config import CHUNK_SIZE, CHUNK_OVERLAP
9
- from table_prep import table_to_document
10
 
 
 
 
11
 
12
- def chunk_document(doc, chunk_size=None, chunk_overlap=None):
13
- """
14
- Universal chunking for text and images.
15
- Tables use their own row-block chunking.
16
- """
17
- if chunk_size is None:
18
- chunk_size = CHUNK_SIZE
19
- if chunk_overlap is None:
20
- chunk_overlap = CHUNK_OVERLAP
21
-
22
- # Use sentence-aware splitting
23
  text_splitter = SentenceSplitter(
24
- chunk_size=chunk_size,
25
- chunk_overlap=chunk_overlap,
26
- separator=" "
27
  )
28
 
29
- text_chunks = text_splitter.split_text(doc.text)
30
-
31
- chunked_docs = []
32
- for i, chunk_text in enumerate(text_chunks):
33
- chunk_metadata = doc.metadata.copy()
34
- chunk_metadata.update({
35
- "chunk_id": i,
36
- "total_chunks": len(text_chunks),
37
- "chunk_size": len(chunk_text),
38
- "original_doc_id": doc.id_ if hasattr(doc, 'id_') else None
39
- })
40
-
41
- chunked_doc = Document(
42
- text=chunk_text,
43
- metadata=chunk_metadata
44
- )
45
- chunked_docs.append(chunked_doc)
46
-
47
- return chunked_docs
48
 
49
 
50
- def process_documents_with_chunking(documents):
51
- all_chunked_docs = []
52
- stats = {
53
- 'table_whole': 0,
54
- 'table_chunks': 0,
55
- 'image_whole': 0,
56
- 'image_chunks': 0,
57
- 'text_chunks': 0
58
- }
59
-
60
- for doc in documents:
61
- doc_type = doc.metadata.get('type', 'text')
62
- is_already_chunked = doc.metadata.get('is_chunked', False)
63
-
64
- # Tables: already chunked in table_prep.py if needed
65
- if doc_type == 'table':
66
- if is_already_chunked:
67
- stats['table_chunks'] += 1
68
- else:
69
- stats['table_whole'] += 1
70
- all_chunked_docs.append(doc)
71
-
72
- # Images: chunk if too large
73
- elif doc_type == 'image':
74
- doc_size = len(doc.text)
75
- if doc_size > CHUNK_SIZE:
76
- log_message(f"📷 CHUNKING: Изображение {doc.metadata.get('image_number')} | {doc_size} > {CHUNK_SIZE}")
77
- chunked_docs = chunk_document(doc)
78
- stats['image_chunks'] += len(chunked_docs)
79
- all_chunked_docs.extend(chunked_docs)
80
- else:
81
- stats['image_whole'] += 1
82
- all_chunked_docs.append(doc)
83
-
84
- # Text: chunk if too large
85
- else:
86
- doc_size = len(doc.text)
87
- if doc_size > CHUNK_SIZE:
88
- log_message(f"📝 CHUNKING: Текст '{doc.metadata.get('document_id')}' | {doc_size} > {CHUNK_SIZE}")
89
- chunked_docs = chunk_document(doc)
90
- stats['text_chunks'] += len(chunked_docs)
91
- all_chunked_docs.extend(chunked_docs)
92
- else:
93
- all_chunked_docs.append(doc)
94
-
95
- log_message(f"\n{'='*60}")
96
- log_message(f"СТАТИСТИКА ОБРАБОТКИ:")
97
- log_message(f" • Таблицы (целые): {stats['table_whole']}")
98
- log_message(f" • Таблицы (чанки): {stats['table_chunks']}")
99
- log_message(f" • Изображения (целые): {stats['image_whole']}")
100
- log_message(f" • Изображения (чанки): {stats['image_chunks']}")
101
- log_message(f" • Текстовые чанки: {stats['text_chunks']}")
102
- log_message(f" • ВСЕГО: {len(all_chunked_docs)}")
103
- log_message(f"{'='*60}\n")
104
 
105
- return all_chunked_docs, [] # Second return value for backward compatibility
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
 
108
- def extract_text_from_json(data, document_id, document_name):
109
- documents = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
- if 'sections' in data:
112
- for section in data['sections']:
113
- section_id = section.get('section_id', 'Unknown')
114
- section_text = section.get('section_text', '')
 
 
 
 
 
115
 
116
- section_path = f"{section_id}"
117
- section_title = extract_section_title(section_text)
118
 
119
- if section_text.strip():
120
- doc = Document(
121
- text=section_text,
122
- metadata={
123
- "type": "text",
124
- "document_id": document_id,
125
- "document_name": document_name,
126
- "section_id": section_id,
127
- "section_text": section_title[:200],
128
- "section_path": section_path,
129
- "level": "section"
130
- }
131
- )
132
- documents.append(doc)
133
 
134
- if 'subsections' in section:
135
- for subsection in section['subsections']:
136
- subsection_id = subsection.get('subsection_id', 'Unknown')
137
- subsection_text = subsection.get('subsection_text', '')
138
- subsection_title = extract_section_title(subsection_text)
139
- subsection_path = f"{section_path}.{subsection_id}"
140
-
141
- if subsection_text.strip():
142
- doc = Document(
143
- text=subsection_text,
144
- metadata={
145
- "type": "text",
146
- "document_id": document_id,
147
- "document_name": document_name,
148
- "section_id": subsection_id,
149
- "section_text": subsection_title[:200],
150
- "section_path": subsection_path,
151
- "level": "subsection",
152
- "parent_section": section_id,
153
- "parent_title": section_title[:100]
154
- }
155
- )
156
- documents.append(doc)
157
-
158
- if 'sub_subsections' in subsection:
159
- for sub_subsection in subsection['sub_subsections']:
160
- sub_subsection_id = sub_subsection.get('sub_subsection_id', 'Unknown')
161
- sub_subsection_text = sub_subsection.get('sub_subsection_text', '')
162
- sub_subsection_title = extract_section_title(sub_subsection_text)
163
- sub_subsection_path = f"{subsection_path}.{sub_subsection_id}"
164
-
165
- if sub_subsection_text.strip():
166
- doc = Document(
167
- text=sub_subsection_text,
168
- metadata={
169
- "type": "text",
170
- "document_id": document_id,
171
- "document_name": document_name,
172
- "section_id": sub_subsection_id,
173
- "section_text": sub_subsection_title[:200],
174
- "section_path": sub_subsection_path,
175
- "level": "sub_subsection",
176
- "parent_section": subsection_id,
177
- "parent_title": subsection_title[:100]
178
- }
179
- )
180
- documents.append(doc)
181
-
182
- if 'sub_sub_subsections' in sub_subsection:
183
- for sub_sub_subsection in sub_subsection['sub_sub_subsections']:
184
- sub_sub_subsection_id = sub_sub_subsection.get('sub_sub_subsection_id', 'Unknown')
185
- sub_sub_subsection_text = sub_sub_subsection.get('sub_sub_subsection_text', '')
186
- sub_sub_subsection_title = extract_section_title(sub_sub_subsection_text)
187
-
188
- if sub_sub_subsection_text.strip():
189
- doc = Document(
190
- text=sub_sub_subsection_text,
191
- metadata={
192
- "type": "text",
193
- "document_id": document_id,
194
- "document_name": document_name,
195
- "section_id": sub_sub_subsection_id,
196
- "section_text": sub_sub_subsection_title[:200],
197
- "section_path": f"{sub_subsection_path}.{sub_sub_subsection_id}",
198
- "level": "sub_sub_subsection",
199
- "parent_section": sub_subsection_id,
200
- "parent_title": sub_subsection_title[:100]
201
- }
202
- )
203
- documents.append(doc)
204
 
 
205
  return documents
206
 
207
- def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
208
- log_message("Начинаю загрузку JSON документов")
209
-
210
- try:
211
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
212
- zip_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.zip')]
213
- json_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.json')]
214
-
215
- log_message(f"Найдено {len(zip_files)} ZIP файлов и {len(json_files)} прямых JSON файлов")
216
-
217
- all_documents = []
218
-
219
- for zip_file_path in zip_files:
220
- try:
221
- log_message(f"Загружаю ZIP архив: {zip_file_path}")
222
- local_zip_path = hf_hub_download(
223
- repo_id=repo_id,
224
- filename=zip_file_path,
225
- local_dir=download_dir,
226
- repo_type="dataset",
227
- token=hf_token
228
- )
229
-
230
- documents = extract_zip_and_process_json(local_zip_path)
231
- all_documents.extend(documents)
232
- log_message(f"Извлечено {len(documents)} документов из ZIP архива {zip_file_path}")
233
-
234
- except Exception as e:
235
- log_message(f"Ошибка обработки ZIP файла {zip_file_path}: {str(e)}")
236
- continue
237
-
238
- for file_path in json_files:
239
- try:
240
- log_message(f"Обрабатываю прямой JSON файл: {file_path}")
241
- local_path = hf_hub_download(
242
- repo_id=repo_id,
243
- filename=file_path,
244
- local_dir=download_dir,
245
- repo_type="dataset",
246
- token=hf_token
247
- )
248
-
249
- with open(local_path, 'r', encoding='utf-8') as f:
250
- json_data = json.load(f)
251
-
252
- document_metadata = json_data.get('document_metadata', {})
253
- document_id = document_metadata.get('document_id', 'unknown')
254
- document_name = document_metadata.get('document_name', 'unknown')
255
-
256
- documents = extract_text_from_json(json_data, document_id, document_name)
257
- all_documents.extend(documents)
258
-
259
- log_message(f"Извлечено {len(documents)} документов из {file_path}")
260
-
261
- except Exception as e:
262
- log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
263
- continue
264
-
265
- log_message(f"Всего создано {len(all_documents)} исходных документов из JSON файлов")
266
-
267
- # Process documents through chunking function
268
- chunked_documents, chunk_info = process_documents_with_chunking(all_documents)
269
-
270
- log_message(f"После chunking получено {len(chunked_documents)} чанков из JSON данных")
271
-
272
- return chunked_documents, chunk_info
273
-
274
- except Exception as e:
275
- log_message(f"Ошибка загрузки JSON документов: {str(e)}")
276
- return [], []
277
 
278
- def extract_section_title(section_text):
279
- if not section_text.strip():
280
- return ""
281
-
282
- lines = section_text.strip().split('\n')
283
- first_line = lines[0].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
 
285
- if len(first_line) < 200 and not first_line.endswith('.'):
286
- return first_line
 
 
 
 
 
287
 
288
- # Otherwise, extract first sentence
289
- sentences = first_line.split('.')
290
- if len(sentences) > 1:
291
- return sentences[0].strip()
292
 
293
- return first_line[:100] + "..." if len(first_line) > 100 else first_line
294
-
295
- def extract_zip_and_process_json(zip_path):
296
  documents = []
297
-
298
- try:
299
- with zipfile.ZipFile(zip_path, 'r') as zip_ref:
300
- zip_files = zip_ref.namelist()
301
- json_files = [f for f in zip_files if f.endswith('.json') and not f.startswith('__MACOSX')]
 
 
 
302
 
303
- log_message(f"Найдено {len(json_files)} JSON файлов в архиве")
304
 
305
- for json_file in json_files:
306
- try:
307
- log_message(f"Обрабатываю файл из архива: {json_file}")
308
-
309
- with zip_ref.open(json_file) as f:
310
- json_data = json.load(f)
311
-
312
- document_metadata = json_data.get('document_metadata', {})
313
- document_id = document_metadata.get('document_id', 'unknown')
314
- document_name = document_metadata.get('document_name', 'unknown')
315
-
316
- docs = extract_text_from_json(json_data, document_id, document_name)
317
- documents.extend(docs)
318
-
319
- log_message(f"Извлечено {len(docs)} документов из {json_file}")
320
-
321
- except Exception as e:
322
- log_message(f"Ошибка обработки файла {json_file}: {str(e)}")
323
- continue
324
-
325
- except Exception as e:
326
- log_message(f"Ошибка извлечения ZIP архива {zip_path}: {str(e)}")
327
 
 
328
  return documents
329
 
330
- def load_image_data(repo_id, hf_token, image_data_dir):
331
- log_message("Начинаю загрузку данных изображений")
332
-
333
- image_files = []
334
- try:
335
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
336
- for file in files:
337
- if file.startswith(image_data_dir) and file.endswith('.csv'):
338
- image_files.append(file)
339
-
340
- log_message(f"Найдено {len(image_files)} CSV файлов с изображениями")
341
-
342
- image_documents = []
343
- for file_path in image_files:
344
- try:
345
- log_message(f"Обрабатываю файл изображений: {file_path}")
346
- local_path = hf_hub_download(
347
- repo_id=repo_id,
348
- filename=file_path,
349
- local_dir='',
350
- repo_type="dataset",
351
- token=hf_token
352
- )
353
-
354
- df = pd.read_csv(local_path)
355
- log_message(f"Загружено {len(df)} записей изображений из файла {file_path}")
356
-
357
- for _, row in df.iterrows():
358
- section_value = row.get('Раздел документа', 'Неизвестно')
359
- image_num = str(row.get('№ Изображения', 'Неизвестно'))
360
- image_title = str(row.get('Название изображения', 'Неизвестно'))
361
- image_desc = str(row.get('Описание изображение', 'Неизвестно'))
362
- doc_id = str(row.get('Обозначение документа', 'Неизвестно'))
363
- file_name = str(row.get('Файл изображения', 'Неизвестно'))
364
-
365
- # FIXED: Create structured, searchable content
366
- content = f"=== ИЗОБРАЖЕНИЕ ===\n"
367
- content += f"Документ: {doc_id}\n"
368
- content += f"Стандарт: {doc_id}\n"
369
- content += f"Раздел: {section_value}\n"
370
- content += f"Изображение: {image_num}\n"
371
- content += f"Название: {image_title}\n"
372
- content += f"Описание: {image_desc}\n"
373
- content += f"Файл: {file_name}\n"
374
- content += f"Уникальный ID: {doc_id} | {section_value} | {image_num}\n"
375
- content += f"===================\n\n"
376
-
377
- # Add contextual information for better retrieval
378
- content += f"Это изображение {image_num} из документа {doc_id}, "
379
- content += f"расположенное в разделе '{section_value}'. "
380
- content += f"{image_title}. {image_desc}"
381
-
382
- doc = Document(
383
- text=content,
384
- metadata={
385
- "type": "image",
386
- "image_number": image_num,
387
- "image_title": image_title,
388
- "image_description": image_desc,
389
- "document_id": doc_id,
390
- "file_path": file_name,
391
- "section": section_value,
392
- "section_id": section_value,
393
- "full_image_id": f"{doc_id} | {section_value} | {image_num}"
394
- }
395
- )
396
- image_documents.append(doc)
397
-
398
- except Exception as e:
399
- log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
400
- continue
401
-
402
- log_message(f"Создано {len(image_documents)} документов из изображений")
403
- return image_documents
404
-
405
- except Exception as e:
406
- log_message(f"Ошибка загрузки данных изображений: {str(e)}")
407
- return []
408
 
409
- def load_table_data(repo_id, hf_token, table_data_dir):
410
- """Load and process table data with complete metadata preservation"""
411
- log_message("=" * 60)
412
- log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
413
- log_message("=" * 60)
414
 
415
- try:
416
- from huggingface_hub import hf_hub_download, list_repo_files
417
- import json
418
- from collections import defaultdict
419
-
420
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
421
- table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
422
-
423
- log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
424
-
425
- table_documents = []
426
- stats = {
427
- 'total_tables': 0,
428
- 'total_size': 0,
429
- 'by_document': defaultdict(lambda: {'count': 0, 'size': 0}),
430
- 'by_sheet': defaultdict(int)
431
- }
432
-
433
- for file_path in table_files:
434
- try:
435
- local_path = hf_hub_download(
436
- repo_id=repo_id,
437
- filename=file_path,
438
- local_dir='',
439
- repo_type="dataset",
440
- token=hf_token
441
- )
442
-
443
- log_message(f"\n📂 Обработка файла: {file_path}")
444
-
445
- with open(local_path, 'r', encoding='utf-8') as f:
446
- table_data = json.load(f)
447
-
448
- if isinstance(table_data, dict):
449
- file_level_doc_id = (
450
- table_data.get('document_id') or
451
- table_data.get('document') or
452
- 'unknown'
453
- )
454
-
455
- if 'sheets' in table_data:
456
- sorted_sheets = sorted(
457
- table_data['sheets'],
458
- key=lambda sheet: sheet.get('table_number', '')
459
- )
460
-
461
- log_message(f" Найдено листов: {len(sorted_sheets)}")
462
-
463
- for sheet in sorted_sheets:
464
- # CRITICAL: sheet_name MUST be present
465
- if 'sheet_name' not in sheet:
466
- log_message(f" ⚠️ Пропущен лист без sheet_name")
467
- continue
468
-
469
- sheet_name = sheet['sheet_name']
470
- sheet_doc_id = sheet.get('document_id', file_level_doc_id)
471
-
472
- log_message(f" → Лист: {sheet_name} | doc_id: {sheet_doc_id}")
473
-
474
- # Pass complete sheet data to table_to_document
475
- docs_list = table_to_document(sheet, document_id=sheet_doc_id)
476
- table_documents.extend(docs_list)
477
-
478
- stats['by_sheet'][sheet_name] += len(docs_list)
479
-
480
- for doc in docs_list:
481
- stats['total_tables'] += 1
482
- size = doc.metadata.get('content_size', 0)
483
- stats['total_size'] += size
484
- stats['by_document'][sheet_doc_id]['count'] += 1
485
- stats['by_document'][sheet_doc_id]['size'] += size
486
- else:
487
- # Single table (no sheets structure)
488
- docs_list = table_to_document(table_data, document_id=file_level_doc_id)
489
- table_documents.extend(docs_list)
490
-
491
- for doc in docs_list:
492
- stats['total_tables'] += 1
493
- size = doc.metadata.get('content_size', 0)
494
- stats['total_size'] += size
495
- stats['by_document'][file_level_doc_id]['count'] += 1
496
- stats['by_document'][file_level_doc_id]['size'] += size
497
-
498
- except Exception as e:
499
- log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
500
- import traceback
501
- log_message(f"Traceback: {traceback.format_exc()}")
502
- continue
503
-
504
- # Enhanced logging with sheet breakdown
505
- log_message("\n" + "=" * 60)
506
- log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
507
- log_message("=" * 60)
508
- log_message(f"Всего таблиц/чанков: {stats['total_tables']}")
509
- log_message(f"Общий размер: {stats['total_size']:,} символов")
510
- if stats['total_tables'] > 0:
511
- log_message(f"Средний размер: {stats['total_size'] // stats['total_tables']:,} символов")
512
-
513
- log_message("\nПо документам:")
514
- for doc_id, doc_stats in sorted(stats['by_document'].items()):
515
- log_message(f" • {doc_id}: {doc_stats['count']} элементов, {doc_stats['size']:,} символов")
516
-
517
- log_message("\nПо листам (топ-20):")
518
- top_sheets = sorted(stats['by_sheet'].items(), key=lambda x: x[1], reverse=True)[:20]
519
- for sheet_name, count in top_sheets:
520
- log_message(f" • {sheet_name}: {count} чанков")
521
-
522
- log_message("=" * 60)
523
-
524
- return table_documents
525
-
526
- except Exception as e:
527
- log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА: {str(e)}")
528
- import traceback
529
- log_message(f"Traceback: {traceback.format_exc()}")
530
- return []
531
-
532
- def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
533
- log_message("Загружаю данные чанков из CSV")
534
 
535
- try:
536
- chunks_csv_path = hf_hub_download(
537
- repo_id=repo_id,
538
- filename=chunks_filename,
539
- local_dir=download_dir,
540
- repo_type="dataset",
541
- token=hf_token
542
- )
543
-
544
- chunks_df = pd.read_csv(chunks_csv_path)
545
- log_message(f"Загружено {len(chunks_df)} чанков из CSV")
546
-
547
- text_column = None
548
- for col in chunks_df.columns:
549
- if 'text' in col.lower() or 'content' in col.lower() or 'chunk' in col.lower():
550
- text_column = col
551
- break
552
-
553
- if text_column is None:
554
- text_column = chunks_df.columns[0]
555
-
556
- log_message(f"Использую колонку: {text_column}")
557
-
558
- documents = []
559
- for i, (_, row) in enumerate(chunks_df.iterrows()):
560
- doc = Document(
561
- text=str(row[text_column]),
562
- metadata={
563
- "chunk_id": row.get('chunk_id', i),
564
- "document_id": row.get('document_id', 'unknown'),
565
- "type": "text"
566
- }
567
- )
568
- documents.append(doc)
569
-
570
- log_message(f"Создано {len(documents)} текстовых документов из CSV")
571
- return documents, chunks_df
572
-
573
- except Exception as e:
574
- log_message(f"Ошибка загрузки CSV данных: {str(e)}")
575
- return [], None
 
1
  import json
 
2
  import pandas as pd
3
  from huggingface_hub import hf_hub_download, list_repo_files
4
  from llama_index.core import Document
 
5
  from llama_index.core.text_splitter import SentenceSplitter
6
+ from my_logging import log_message
 
7
 
8
+ # Configuration
9
+ CHUNK_SIZE = 512
10
+ CHUNK_OVERLAP = 50
11
 
12
+ def chunk_text_documents(documents):
13
+ """Simple text chunking with sentence awareness"""
 
 
 
 
 
 
 
 
 
14
  text_splitter = SentenceSplitter(
15
+ chunk_size=CHUNK_SIZE,
16
+ chunk_overlap=CHUNK_OVERLAP
 
17
  )
18
 
19
+ chunked = []
20
+ for doc in documents:
21
+ chunks = text_splitter.get_nodes_from_documents([doc])
22
+ for i, chunk in enumerate(chunks):
23
+ chunk.metadata.update({
24
+ 'chunk_id': i,
25
+ 'total_chunks': len(chunks)
26
+ })
27
+ chunked.append(chunk)
28
+
29
+ log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
30
+ return chunked
 
 
 
 
 
 
 
31
 
32
 
33
+ def chunk_table_by_rows(table_data, doc_id, max_rows=30):
34
+ """Split large tables into row blocks"""
35
+ headers = table_data.get('headers', [])
36
+ rows = table_data.get('data', [])
37
+ table_num = table_data.get('table_number', 'unknown')
38
+ table_title = table_data.get('table_title', '')
39
+ section = table_data.get('section', '')
40
+
41
+ if not rows:
42
+ return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
+ # Small table: keep whole
45
+ if len(rows) <= max_rows:
46
+ content = format_table_content(table_data, headers, rows)
47
+ return [Document(
48
+ text=content,
49
+ metadata={
50
+ 'type': 'table',
51
+ 'document_id': doc_id,
52
+ 'table_number': table_num,
53
+ 'table_title': table_title,
54
+ 'section': section,
55
+ 'total_rows': len(rows)
56
+ }
57
+ )]
58
+
59
+ # Large table: split by row blocks
60
+ chunks = []
61
+ for i in range(0, len(rows), max_rows):
62
+ chunk_rows = rows[i:i+max_rows]
63
+ content = format_table_content(table_data, headers, chunk_rows,
64
+ chunk_info=f"Rows {i+1}-{i+len(chunk_rows)}")
65
+
66
+ chunks.append(Document(
67
+ text=content,
68
+ metadata={
69
+ 'type': 'table',
70
+ 'document_id': doc_id,
71
+ 'table_number': table_num,
72
+ 'table_title': table_title,
73
+ 'section': section,
74
+ 'chunk_id': i // max_rows,
75
+ 'row_start': i,
76
+ 'row_end': i + len(chunk_rows),
77
+ 'total_rows': len(rows)
78
+ }
79
+ ))
80
+
81
+ log_message(f" 📊 Table {table_num}: {len(rows)} rows → {len(chunks)} chunks")
82
+ return chunks
83
 
84
 
85
+ def format_table_content(table_data, headers, rows, chunk_info=""):
86
+ """Format table for semantic search"""
87
+ doc_id = table_data.get('document_id', 'unknown')
88
+ table_num = table_data.get('table_number', 'unknown')
89
+ table_title = table_data.get('table_title', '')
90
+ section = table_data.get('section', '')
91
+
92
+ content = f"Документ: {doc_id}\n"
93
+ content += f"Таблица: {table_num}\n"
94
+ if table_title:
95
+ content += f"Название: {table_title}\n"
96
+ if section:
97
+ content += f"Раздел: {section}\n"
98
+ if chunk_info:
99
+ content += f"{chunk_info}\n"
100
+ content += f"\nКолонки: {' | '.join(str(h) for h in headers)}\n\n"
101
+
102
+ # Add rows
103
+ for row in rows:
104
+ if isinstance(row, dict):
105
+ parts = [f"{k}: {v}" for k, v in row.items()
106
+ if v and str(v).strip() and str(v) != 'nan']
107
+ content += ' | '.join(parts) + "\n"
108
+ elif isinstance(row, list):
109
+ parts = [str(v) for v in row if v and str(v).strip() and str(v) != 'nan']
110
+ content += ' | '.join(parts) + "\n"
111
+
112
+ return content
113
+
114
+
115
+ def load_json_documents(repo_id, hf_token, json_dir):
116
+ """Load text sections from JSON"""
117
+ log_message("Loading JSON documents...")
118
+
119
+ files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
120
+ json_files = [f for f in files if f.startswith(json_dir) and f.endswith('.json')]
121
 
122
+ documents = []
123
+ for file_path in json_files:
124
+ try:
125
+ local_path = hf_hub_download(
126
+ repo_id=repo_id,
127
+ filename=file_path,
128
+ repo_type="dataset",
129
+ token=hf_token
130
+ )
131
 
132
+ with open(local_path, 'r', encoding='utf-8') as f:
133
+ data = json.load(f)
134
 
135
+ doc_id = data.get('document_metadata', {}).get('document_id', 'unknown')
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
+ # Extract sections
138
+ for section in data.get('sections', []):
139
+ if section.get('section_text', '').strip():
140
+ documents.append(Document(
141
+ text=section['section_text'],
142
+ metadata={
143
+ 'type': 'text',
144
+ 'document_id': doc_id,
145
+ 'section_id': section.get('section_id', '')
146
+ }
147
+ ))
148
+ except Exception as e:
149
+ log_message(f"Error loading {file_path}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
 
151
+ log_message(f"✓ Loaded {len(documents)} text sections")
152
  return documents
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
+ def load_table_documents(repo_id, hf_token, table_dir):
156
+ """Load and chunk tables"""
157
+ log_message("Loading tables...")
158
+
159
+ files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
160
+ table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
161
+
162
+ all_chunks = []
163
+ for file_path in table_files:
164
+ try:
165
+ local_path = hf_hub_download(
166
+ repo_id=repo_id,
167
+ filename=file_path,
168
+ repo_type="dataset",
169
+ token=hf_token
170
+ )
171
+
172
+ with open(local_path, 'r', encoding='utf-8') as f:
173
+ data = json.load(f)
174
+
175
+ doc_id = data.get('document_id', 'unknown')
176
+
177
+ for sheet in data.get('sheets', []):
178
+ chunks = chunk_table_by_rows(sheet, doc_id)
179
+ all_chunks.extend(chunks)
180
+
181
+ except Exception as e:
182
+ log_message(f"Error loading {file_path}: {e}")
183
 
184
+ log_message(f"✓ Loaded {len(all_chunks)} table chunks")
185
+ return all_chunks
186
+
187
+
188
+ def load_image_documents(repo_id, hf_token, image_dir):
189
+ """Load image descriptions"""
190
+ log_message("Loading images...")
191
 
192
+ files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
193
+ csv_files = [f for f in files if f.startswith(image_dir) and f.endswith('.csv')]
 
 
194
 
 
 
 
195
  documents = []
196
+ for file_path in csv_files:
197
+ try:
198
+ local_path = hf_hub_download(
199
+ repo_id=repo_id,
200
+ filename=file_path,
201
+ repo_type="dataset",
202
+ token=hf_token
203
+ )
204
 
205
+ df = pd.read_csv(local_path)
206
 
207
+ for _, row in df.iterrows():
208
+ content = f"Документ: {row.get('Обозначение документа', 'unknown')}\n"
209
+ content += f"Рисунок: {row.get('№ Изображения', 'unknown')}\n"
210
+ content += f"Название: {row.get('Название изображения', '')}\n"
211
+ content += f"Описание: {row.get('Описание изображение', '')}\n"
212
+ content += f"Раздел: {row.get('Раздел документа', '')}\n"
213
+
214
+ documents.append(Document(
215
+ text=content,
216
+ metadata={
217
+ 'type': 'image',
218
+ 'document_id': str(row.get('Обозначение документа', 'unknown')),
219
+ 'image_number': str(row.get('№ Изображения', 'unknown')),
220
+ 'section': str(row.get('Раздел документа', ''))
221
+ }
222
+ ))
223
+ except Exception as e:
224
+ log_message(f"Error loading {file_path}: {e}")
 
 
 
 
225
 
226
+ log_message(f"✓ Loaded {len(documents)} images")
227
  return documents
228
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
 
230
+ def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
231
+ """Main loader - combines all document types"""
232
+ log_message("="*60)
233
+ log_message("STARTING DOCUMENT LOADING")
234
+ log_message("="*60)
235
 
236
+ # Load text sections
237
+ text_docs = load_json_documents(repo_id, hf_token, json_dir)
238
+ text_chunks = chunk_text_documents(text_docs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
 
240
+ # Load tables (already chunked)
241
+ table_chunks = load_table_documents(repo_id, hf_token, table_dir)
242
+
243
+ # Load images (no chunking needed)
244
+ image_docs = load_image_documents(repo_id, hf_token, image_dir)
245
+
246
+ all_docs = text_chunks + table_chunks + image_docs
247
+
248
+ log_message("="*60)
249
+ log_message(f"TOTAL DOCUMENTS: {len(all_docs)}")
250
+ log_message(f" Text chunks: {len(text_chunks)}")
251
+ log_message(f" Table chunks: {len(table_chunks)}")
252
+ log_message(f" Images: {len(image_docs)}")
253
+ log_message("="*60)
254
+
255
+ return all_docs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
index_retriever.py CHANGED
@@ -1,123 +1,64 @@
1
- from llama_index.core import VectorStoreIndex, Settings
2
  from llama_index.core.query_engine import RetrieverQueryEngine
3
  from llama_index.core.retrievers import VectorIndexRetriever
4
- from llama_index.core.response_synthesizers import get_response_synthesizer, ResponseMode
5
- from llama_index.core.prompts import PromptTemplate
6
  from llama_index.retrievers.bm25 import BM25Retriever
7
  from llama_index.core.retrievers import QueryFusionRetriever
 
8
  from my_logging import log_message
9
- from config import CUSTOM_PROMPT, PROMPT_SIMPLE_POISK
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  def create_vector_index(documents):
12
- log_message("Строю векторный индекс")
13
- return VectorStoreIndex.from_documents(documents)
 
 
 
14
 
15
  def create_query_engine(vector_index):
16
- try:
17
- bm25_retriever = BM25Retriever.from_defaults(
18
- docstore=vector_index.docstore,
19
- similarity_top_k=30
20
- )
21
-
22
- vector_retriever = VectorIndexRetriever(
23
- index=vector_index,
24
- similarity_top_k=30,
25
- similarity_cutoff=0.65
26
- )
27
-
28
- hybrid_retriever = QueryFusionRetriever(
29
- [vector_retriever, bm25_retriever],
30
- similarity_top_k=40,
31
- num_queries=1
32
- )
33
-
34
- custom_prompt_template = PromptTemplate(PROMPT_SIMPLE_POISK)
35
- response_synthesizer = get_response_synthesizer(
36
- response_mode=ResponseMode.TREE_SUMMARIZE,
37
- text_qa_template=custom_prompt_template
38
- )
39
-
40
- query_engine = RetrieverQueryEngine(
41
- retriever=hybrid_retriever,
42
- response_synthesizer=response_synthesizer
43
- )
44
-
45
- log_message("Query engine успешно создан")
46
- return query_engine
47
-
48
- except Exception as e:
49
- log_message(f"Ошибка создания query engine: {str(e)}")
50
- raise
51
-
52
- def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.45, diversity_penalty=0.2):
53
- """Rerank with better handling of specific technical queries"""
54
- if not nodes or not reranker:
55
- return nodes[:top_k]
56
 
57
- try:
58
- log_message(f"Переранжирую {len(nodes)} узлов для запроса: {query[:50]}...")
59
-
60
- pairs = [[query, node.text] for node in nodes]
61
- scores = reranker.predict(pairs)
62
- scored_nodes = list(zip(nodes, scores))
63
-
64
- scored_nodes.sort(key=lambda x: x[1], reverse=True)
65
-
66
- # Lower threshold for technical queries
67
- if min_score_threshold is not None:
68
- scored_nodes = [(node, score) for node, score in scored_nodes
69
- if score >= min_score_threshold]
70
- log_message(f"После фильтрации (порог {min_score_threshold}): {len(scored_nodes)} узлов")
71
-
72
- if not scored_nodes:
73
- log_message("⚠️ Нет узлов после фильтрации, снижаю порог до 0.3")
74
- scored_nodes = list(zip(nodes, scores))
75
- scored_nodes.sort(key=lambda x: x[1], reverse=True)
76
- min_score_threshold = max(0.3, scored_nodes[0][1] * 0.5)
77
- scored_nodes = [(node, score) for node, score in scored_nodes
78
- if score >= min_score_threshold]
79
-
80
- selected_nodes = []
81
- selected_docs = {} # Track count per document
82
- selected_tables = set()
83
-
84
- for node, score in scored_nodes:
85
- if len(selected_nodes) >= top_k:
86
- break
87
-
88
- metadata = node.metadata if hasattr(node, 'metadata') else {}
89
- doc_id = metadata.get('document_id', 'unknown')
90
- node_type = metadata.get('type', 'text')
91
-
92
- # Track table uniqueness
93
- if node_type == 'table':
94
- table_id = metadata.get('full_table_id', '')
95
- if table_id in selected_tables:
96
- continue # Skip duplicate table chunks
97
- selected_tables.add(table_id)
98
-
99
- # Apply lighter diversity penalty
100
- penalty = 0
101
- doc_count = selected_docs.get(doc_id, 0)
102
- if doc_count > 0:
103
- penalty = min(diversity_penalty * doc_count, 0.5)
104
-
105
- adjusted_score = score * (1 - penalty)
106
-
107
- # Accept if competitive
108
- if not selected_nodes or adjusted_score >= selected_nodes[0][1] * 0.5:
109
- selected_nodes.append((node, score))
110
- selected_docs[doc_id] = doc_count + 1
111
-
112
- log_message(f"✓ Выбрано {len(selected_nodes)} узлов")
113
- log_message(f" Уникальных документов: {len(selected_docs)}")
114
- log_message(f" Уникальных таблиц: {len(selected_tables)}")
115
-
116
- if selected_nodes:
117
- log_message(f" Score: {selected_nodes[0][1]:.3f} → {selected_nodes[-1][1]:.3f}")
118
-
119
- return [node for node, score in selected_nodes]
120
-
121
- except Exception as e:
122
- log_message(f"❌ Ошибка переранжировки: {str(e)}")
123
- return nodes[:top_k]
 
1
+ from llama_index.core import VectorStoreIndex
2
  from llama_index.core.query_engine import RetrieverQueryEngine
3
  from llama_index.core.retrievers import VectorIndexRetriever
 
 
4
  from llama_index.retrievers.bm25 import BM25Retriever
5
  from llama_index.core.retrievers import QueryFusionRetriever
6
+ from llama_index.core.response_synthesizers import get_response_synthesizer
7
  from my_logging import log_message
8
+
9
+ SIMPLE_PROMPT = """Вы - эксперт по нормативной документации.
10
+
11
+ Контекст:
12
+ {context_str}
13
+
14
+ Вопрос: {query_str}
15
+
16
+ Инструкция:
17
+ 1. Отвечайте ТОЛЬКО на основе предоставленного контекста
18
+ 2. Цитируйте конкретные источники (документ, раздел, таблицу)
19
+ 3. Если информации недостаточно, четко укажите это
20
+ 4. Будьте точны и конкретны
21
+
22
+ Ответ:"""
23
 
24
  def create_vector_index(documents):
25
+ """Create vector index from documents"""
26
+ log_message(f"Building vector index from {len(documents)} documents...")
27
+ index = VectorStoreIndex.from_documents(documents)
28
+ log_message("✓ Index created")
29
+ return index
30
 
31
  def create_query_engine(vector_index):
32
+ """Create hybrid retrieval engine"""
33
+ log_message("Creating query engine...")
34
+
35
+ # Vector retriever
36
+ vector_retriever = VectorIndexRetriever(
37
+ index=vector_index,
38
+ similarity_top_k=30
39
+ )
40
+
41
+ # BM25 retriever
42
+ bm25_retriever = BM25Retriever.from_defaults(
43
+ docstore=vector_index.docstore,
44
+ similarity_top_k=30
45
+ )
46
+
47
+ # Hybrid fusion
48
+ hybrid_retriever = QueryFusionRetriever(
49
+ [vector_retriever, bm25_retriever],
50
+ similarity_top_k=40,
51
+ num_queries=1
52
+ )
53
+
54
+ # Response synthesizer
55
+ response_synthesizer = get_response_synthesizer()
56
+
57
+ # Query engine
58
+ query_engine = RetrieverQueryEngine(
59
+ retriever=hybrid_retriever,
60
+ response_synthesizer=response_synthesizer
61
+ )
 
 
 
 
 
 
 
 
 
 
62
 
63
+ log_message("✓ Query engine created")
64
+ return query_engine
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils.py CHANGED
@@ -1,309 +1,113 @@
1
- import logging
2
- import sys
3
  from llama_index.llms.google_genai import GoogleGenAI
4
- from llama_index.llms.openai import OpenAI
5
  from llama_index.embeddings.huggingface import HuggingFaceEmbedding
6
  from sentence_transformers import CrossEncoder
7
- from config import AVAILABLE_MODELS, DEFAULT_MODEL, GOOGLE_API_KEY
8
- import time
9
- from index_retriever import rerank_nodes
10
  from my_logging import log_message
11
- from config import PROMPT_SIMPLE_POISK
12
 
13
- def get_llm_model(model_name):
14
- try:
15
- model_config = AVAILABLE_MODELS.get(model_name)
16
- if not model_config:
17
- log_message(f"Модель {model_name} не найдена, использую модель по умолчанию")
18
- model_config = AVAILABLE_MODELS[DEFAULT_MODEL]
19
-
20
- if not model_config.get("api_key"):
21
- raise Exception(f"API ключ не найден для модели {model_name}")
22
-
23
- if model_config["provider"] == "google":
24
- # Fix: Remove image_config parameter or set it properly
25
- return GoogleGenAI(
26
- model=model_config["model_name"],
27
- api_key=model_config["api_key"],
28
- # Don't pass image_config=None
29
- )
30
- elif model_config["provider"] == "openai":
31
- return OpenAI(
32
- model=model_config["model_name"],
33
- api_key=model_config["api_key"]
34
- )
35
- else:
36
- raise Exception(f"Неподдерживаемый провайдер: {model_config['provider']}")
37
-
38
- except Exception as e:
39
- log_message(f"Ошибка создания модели {model_name}: {str(e)}")
40
- # Fix: Also apply to fallback model
41
- return GoogleGenAI(
42
- model="gemini-2.0-flash",
43
- api_key=GOOGLE_API_KEY
44
- )
45
 
46
  def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
 
47
  return HuggingFaceEmbedding(model_name=model_name)
48
 
49
  def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
 
50
  return CrossEncoder(model_name)
51
 
52
- def format_context_for_llm(nodes):
53
- context_parts = []
54
-
55
  for node in nodes:
56
- metadata = node.metadata if hasattr(node, 'metadata') else {}
57
- doc_id = metadata.get('document_id', 'Неизвестный документ')
58
-
59
- section_info = ""
60
-
61
- # Handle section information with proper hierarchy
62
- if metadata.get('section_path'):
63
- section_path = metadata['section_path']
64
- section_text = metadata.get('section_text', '')
65
- parent_section = metadata.get('parent_section', '')
66
- parent_title = metadata.get('parent_title', '')
67
- level = metadata.get('level', '')
68
-
69
- if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
70
- # For subsections: раздел X (Title), пункт X.X
71
- if section_text:
72
- section_info = f"раздел {parent_section} ({parent_title}), пункт {section_path} ({section_text})"
73
- else:
74
- section_info = f"раздел {parent_section} ({parent_title}), пункт {section_path}"
75
- elif section_text:
76
- # For main sections: раздел X (Title)
77
- section_info = f"раздел {section_path} ({section_text})"
78
- else:
79
- section_info = f"раздел {section_path}"
80
-
81
- elif metadata.get('section_id'):
82
- section_id = metadata['section_id']
83
- section_text = metadata.get('section_text', '')
84
- level = metadata.get('level', '')
85
- parent_section = metadata.get('parent_section', '')
86
- parent_title = metadata.get('parent_title', '')
87
-
88
- if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
89
- if section_text:
90
- section_info = f"раздел {parent_section} ({parent_title}), пункт {section_id} ({section_text})"
91
- else:
92
- section_info = f"раздел {parent_section} ({parent_title}), пункт {section_id}"
93
- elif section_text:
94
- section_info = f"раздел {section_id} ({section_text})"
95
- else:
96
- section_info = f"раздел {section_id}"
97
-
98
- # Override with table/image info if applicable
99
- if metadata.get('type') == 'table' and metadata.get('table_number'):
100
- table_num = metadata['table_number']
101
- if not str(table_num).startswith('№'):
102
- table_num = f"№{table_num}"
103
- table_title = metadata.get('table_title', '')
104
- # Include section context for tables
105
- base_section = ""
106
- if metadata.get('section_path'):
107
- base_section = f", раздел {metadata['section_path']}"
108
- elif metadata.get('section_id'):
109
- base_section = f", раздел {metadata['section_id']}"
110
-
111
- if table_title:
112
- section_info = f"Таблица {table_num} ({table_title}){base_section}"
113
- else:
114
- section_info = f"Таблица {table_num}{base_section}"
115
-
116
- if metadata.get('type') == 'image' and metadata.get('image_number'):
117
- image_num = metadata['image_number']
118
- if not str(image_num).startswith('№'):
119
- image_num = f"№{image_num}"
120
- image_title = metadata.get('image_title', '')
121
- # Include section context for images
122
- base_section = ""
123
- if metadata.get('section_path'):
124
- base_section = f", раздел {metadata['section_path']}"
125
- elif metadata.get('section_id'):
126
- base_section = f", раздел {metadata['section_id']}"
127
-
128
- if image_title:
129
- section_info = f"Рисунок {image_num} ({image_title}){base_section}"
130
- else:
131
- section_info = f"Рисунок {image_num}{base_section}"
132
-
133
- context_text = node.text if hasattr(node, 'text') else str(node)
134
 
135
- if section_info:
136
- formatted_context = f"[ИСТОЧНИК: {section_info}, документ {doc_id}]\n{context_text}\n"
137
- else:
138
- formatted_context = f"[ИСТОЧНИК: документ {doc_id}]\n{context_text}\n"
139
-
140
- context_parts.append(formatted_context)
141
-
142
- return "\n".join(context_parts)
143
-
144
-
145
- def generate_sources_html(nodes, chunks_df=None):
146
- html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
147
- html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
148
-
149
- sources_by_doc = {}
150
-
151
- for i, node in enumerate(nodes):
152
- metadata = node.metadata if hasattr(node, 'metadata') else {}
153
- doc_type = metadata.get('type', 'text')
154
- doc_id = metadata.get('document_id', 'unknown')
155
- section_id = metadata.get('section_id', '')
156
- section_text = metadata.get('section_text', '')
157
- section_path = metadata.get('section_path', '')
158
-
159
- # Create a unique key for grouping
160
  if doc_type == 'table':
161
- table_num = metadata.get('table_number', 'unknown')
162
- key = f"{doc_id}_table_{table_num}"
 
163
  elif doc_type == 'image':
164
- image_num = metadata.get('image_number', 'unknown')
165
- key = f"{doc_id}_image_{image_num}"
166
  else:
167
- # For text documents, group by section path or section id
168
- section_key = section_path if section_path else section_id
169
- key = f"{doc_id}_text_{section_key}"
170
-
171
- if key not in sources_by_doc:
172
- sources_by_doc[key] = {
173
- 'doc_id': doc_id,
174
- 'doc_type': doc_type,
175
- 'metadata': metadata,
176
- 'sections': set()
177
- }
178
-
179
- # Add section information
180
- if section_path:
181
- sources_by_doc[key]['sections'].add(f"пункт {section_path}")
182
- elif section_id and section_id != 'unknown':
183
- sources_by_doc[key]['sections'].add(f"пункт {section_id}")
184
 
185
- # Generate HTML for each unique source
186
- for source_info in sources_by_doc.values():
187
- metadata = source_info['metadata']
188
- doc_type = source_info['doc_type']
189
- doc_id = source_info['doc_id']
190
-
191
- html += f"<div style='margin-bottom: 15px; padding: 15px; border: 1px solid #4a5568; border-radius: 8px; background-color: #1a202c;'>"
192
-
193
- if doc_type == 'text':
194
- html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
195
 
196
- elif doc_type == 'table' or doc_type == 'table_row':
197
- table_num = metadata.get('table_number', 'unknown')
198
- table_title = metadata.get('table_title', '')
199
- if table_num and table_num != 'unknown':
200
- if not str(table_num).startswith('№'):
201
- table_num = f"№{table_num}"
202
- html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица {table_num} - {doc_id}</h4>"
203
- if table_title and table_title != 'unknown':
204
- html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{table_title}</p>"
205
- else:
206
- html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc_id}</h4>"
207
-
208
- elif doc_type == 'image':
209
- image_num = metadata.get('image_number', 'unknown')
210
- image_title = metadata.get('image_title', '')
211
- section = metadata.get('section', '')
212
- if image_num and image_num != 'unknown':
213
- if not str(image_num).startswith('№'):
214
- image_num = f"№{image_num}"
215
- html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {image_num} - {doc_id}</h4>"
216
- if image_title and image_title != 'unknown':
217
- html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{image_title}</p>"
218
- if section and section != 'unknown':
219
- html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 12px;'>Раздел: {section}</p>"
220
- else:
221
- html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение - {doc_id}</h4>"
222
-
223
- # Add file link if available
224
- if chunks_df is not None and 'file_link' in chunks_df.columns and doc_type == 'text':
225
- doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
226
- if not doc_rows.empty:
227
- file_link = doc_rows.iloc[0]['file_link']
228
- html += f"<a href='{file_link}' target='_blank' style='color: #68d391; text-decoration: none; font-size: 14px; display: inline-block; margin-top: 10px;'>🔗 Ссылка на документ</a><br>"
229
-
230
- html += "</div>"
231
-
232
- html += "</div>"
233
- return html
234
- def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
235
- if query_engine is None:
236
- return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
237
-
238
  try:
239
- start_time = time.time()
240
 
241
- llm = get_llm_model(current_model)
 
 
242
 
243
- # Direct retrieval without query expansion
244
- retrieved_nodes = query_engine.retriever.retrieve(question)
 
245
 
246
- log_message(f"Получено {len(retrieved_nodes)} узлов")
 
 
 
 
247
 
248
- reranked_nodes = rerank_nodes(
249
- question,
250
- retrieved_nodes,
251
- reranker,
252
- top_k=25,
253
- min_score_threshold=0.5,
254
- diversity_penalty=0.3
255
- )
256
-
257
- formatted_context = format_context_for_llm(reranked_nodes)
258
-
259
- enhanced_question = f"""Контекст из базы данных:
260
- {formatted_context}
261
 
262
- Вопрос пользователя: {question}
263
 
264
- Инструкция: Ответь на вопрос, используя ТОЛЬКО информацию из контекста выше.
265
- Если информации недостаточно, четко укажи это. Цитируй конкретные источники."""
266
-
267
- response = query_engine.query(enhanced_question)
268
 
269
- end_time = time.time()
270
- processing_time = end_time - start_time
271
 
272
- log_message(f"Обработка завершена за {processing_time:.2f}с")
273
 
274
- sources_html = generate_sources_html(reranked_nodes, chunks_df)
275
 
276
- answer_with_time = f"""<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; margin-bottom: 10px;'>
277
- <h3 style='color: #63b3ed; margin-top: 0;'>Ответ (Модель: {current_model}):</h3>
278
- <div style='line-height: 1.6; font-size: 16px;'>{response.response}</div>
279
- <div style='margin-top: 15px; padding-top: 10px; border-top: 1px solid #4a5568; font-size: 14px; color: #a0aec0;'>
280
- Время обработки: {processing_time:.2f} секунд
281
- </div>
282
- </div>"""
283
-
284
- chunk_info = []
285
- for node in reranked_nodes:
286
- metadata = node.metadata if hasattr(node, 'metadata') else {}
287
- chunk_info.append({
288
- 'document_id': metadata.get('document_id', 'unknown'),
289
- 'section_id': metadata.get('section_id', metadata.get('section', 'unknown')),
290
- 'section_path': metadata.get('section_path', ''),
291
- 'section_text': metadata.get('section_text', ''),
292
- 'level': metadata.get('level', ''),
293
- 'parent_section': metadata.get('parent_section', ''),
294
- 'parent_title': metadata.get('parent_title', ''),
295
- 'type': metadata.get('type', 'text'),
296
- 'table_number': metadata.get('table_number', ''),
297
- 'image_number': metadata.get('image_number', ''),
298
- 'chunk_size': len(node.text),
299
- 'chunk_text': node.text
300
- })
301
- from app import create_chunks_display_html
302
- chunks_html = create_chunks_display_html(chunk_info)
303
 
304
- return answer_with_time, sources_html, chunks_html
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
 
306
- except Exception as e:
307
- log_message(f"Ошибка: {str(e)}")
308
- error_msg = f"<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Ошибка: {str(e)}</div>"
309
- return error_msg, "", ""
 
 
 
 
 
 
 
 
 
1
  from llama_index.llms.google_genai import GoogleGenAI
 
2
  from llama_index.embeddings.huggingface import HuggingFaceEmbedding
3
  from sentence_transformers import CrossEncoder
 
 
 
4
  from my_logging import log_message
 
5
 
6
def get_llm_model(api_key, model_name="gemini-2.0-flash"):
    """Build a Google GenAI LLM client bound to the given API key and model."""
    client = GoogleGenAI(model=model_name, api_key=api_key)
    return client
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
    """Build the multilingual HuggingFace embedding model used for indexing and retrieval."""
    embedder = HuggingFaceEmbedding(model_name=model_name)
    return embedder
13
 
14
def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
    """Load the cross-encoder used to rerank retrieved chunks."""
    cross_encoder = CrossEncoder(model_name)
    return cross_encoder
17
 
18
def format_sources(nodes):
    """Format retrieved source nodes into a display-ready listing.

    Args:
        nodes: Retrieved nodes; each must expose ``.metadata`` (a dict with
            at least ``type`` and ``document_id`` keys).

    Returns:
        A newline-joined string with one deduplicated entry per source,
        in first-seen order.
    """
    sources = []
    for node in nodes:
        meta = node.metadata
        doc_type = meta.get('type', 'text')
        doc_id = meta.get('document_id', 'unknown')

        if doc_type == 'table':
            table_num = meta.get('table_number', 'unknown')
            title = meta.get('table_title', '')
            sources.append(f"📊 {doc_id} - Таблица {table_num}: {title}")
        elif doc_type == 'image':
            img_num = meta.get('image_number', 'unknown')
            sources.append(f"🖼️ {doc_id} - Рисунок {img_num}")
        else:
            section = meta.get('section_id', '')
            sources.append(f"📄 {doc_id} - Раздел {section}")

    # dict.fromkeys dedupes while keeping first-seen order; the previous
    # set() made the output ordering arbitrary between runs.
    return "\n".join(dict.fromkeys(sources))
 
 
 
 
 
 
 
 
 
38
 
39
def answer_question(question, query_engine, reranker):
    """Answer a user question with the full RAG pipeline.

    Retrieves candidate chunks, reranks them with the cross-encoder,
    builds a context-grounded prompt, and queries the LLM.

    Returns:
        A ``(answer_text, sources_text)`` tuple; on any failure the answer
        slot carries an error message and the sources string is empty.
    """
    try:
        log_message(f"Query: {question}")

        # Retrieval stage.
        candidates = query_engine.retriever.retrieve(question)
        log_message(f"Retrieved {len(candidates)} nodes")

        # Reranking stage.
        top_nodes = reranked = rerank_nodes(question, candidates, reranker, top_k=15)
        log_message(f"Reranked to {len(reranked)} nodes")

        # Assemble the grounding context, tagging each chunk with its document id.
        parts = []
        for node in top_nodes:
            doc_tag = node.metadata.get('document_id', 'unknown')
            parts.append(f"[{doc_tag}]\n{node.text}")
        context = "\n\n".join(parts)

        # Generation stage.
        prompt = f"""Контекст из базы данных:
{context}

Вопрос: {question}

Ответь на вопрос используя ТОЛЬКО информацию из контекста. Цитируй источники."""

        llm_response = query_engine.query(prompt)

        return llm_response.response, format_sources(top_nodes)

    except Exception as e:
        log_message(f"Error: {e}")
        return f"Ошибка: {e}", ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
def rerank_nodes(query, nodes, reranker, top_k=15, min_score=0.5):
    """Rerank retrieved nodes with a cross-encoder and pick a diverse subset.

    Args:
        query: User query string.
        nodes: Retrieved nodes; each must expose ``.text`` and ``.metadata``.
        reranker: Cross-encoder with a ``predict([[query, text], ...])`` method.
        top_k: Maximum number of nodes to return.
        min_score: Absolute score threshold for keeping a node.

    Returns:
        Up to ``top_k`` nodes in descending-score order, preferring nodes
        from documents not yet represented in the selection.
    """
    if not nodes:
        return []

    # Score every (query, chunk) pair in one batch.
    pairs = [[query, n.text] for n in nodes]
    scores = reranker.predict(pairs)

    # Sort by score, best first.
    scored = sorted(zip(nodes, scores), key=lambda x: x[1], reverse=True)

    # Filter by the absolute threshold.
    filtered = [(n, s) for n, s in scored if s >= min_score]

    if not filtered:
        # Fallback: relax the cutoff to 60% of the best score. Cross-encoder
        # scores can be negative (ms-marco emits raw logits); then
        # ``best * 0.6`` would EXCEED the best score and drop everything,
        # so clamp the cutoff to the best score to keep at least one node.
        best = max(scores)
        cutoff = best * 0.6 if best > 0 else best
        filtered = [(n, s) for n, s in scored if s >= cutoff]

    # Diversity selection: the first 5 are taken unconditionally; after that
    # nodes from already-seen documents are skipped.
    selected = []
    seen_docs = set()

    for node, score in filtered:
        if len(selected) >= top_k:
            break

        doc_id = node.metadata.get('document_id', 'unknown')

        if doc_id not in seen_docs or len(selected) < 5:
            selected.append(node)
            seen_docs.add(doc_id)

    log_message(f"Reranked: {len(filtered)} → {len(selected)} (from {len(seen_docs)} docs)")

    return selected