MrSimple07 committed on
Commit
f6a9f63
·
1 Parent(s): b91dfb0

eski holat

Browse files
Files changed (5) hide show
  1. app.py +320 -90
  2. documents_prep.py +434 -717
  3. index_retriever.py +65 -166
  4. table_prep.py +244 -177
  5. utils.py +145 -25
app.py CHANGED
@@ -1,86 +1,251 @@
1
  import gradio as gr
 
2
  from llama_index.core import Settings
3
- from documents_prep import load_all_documents
4
- from index_retriever import create_vector_index, create_query_engine
5
  from utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
6
  from my_logging import log_message
7
- from config import *
8
- # Global state
9
- query_engine = None
10
- reranker = None
 
 
11
 
12
def initialize_system():
    """Bootstrap the RAG pipeline: models, documents, index, query engine.

    Stores the query engine and reranker in module globals so the UI
    handlers can reach them; returns a human-readable status string.
    """
    global query_engine, reranker

    banner = "=" * 60
    log_message(banner)
    log_message("INITIALIZING SYSTEM")
    log_message(banner)

    # Wire the LLM and embedding model into llama_index's global Settings.
    llm = get_llm_model(GOOGLE_API_KEY)
    embed_model = get_embedding_model()
    reranker = get_reranker_model()
    Settings.llm = llm
    Settings.embed_model = embed_model

    log_message("✓ Models loaded")

    # Pull every document source (JSON, tables, images) from the HF dataset.
    documents = load_all_documents(
        repo_id=HF_REPO_ID,
        hf_token=HF_TOKEN,
        json_dir=JSON_FILES_DIR,
        table_dir=TABLE_DATA_DIR,
        image_dir=IMAGE_DATA_DIR,
    )

    query_engine = create_query_engine(create_vector_index(documents))

    log_message(banner)
    log_message("SYSTEM READY")
    log_message(banner)

    return "✅ System initialized"
48
 
49
def ask_question(question):
    """UI callback: answer *question* via the global query engine.

    Returns a pair of user-facing strings: (answer, sources).
    """
    # Guard clauses: blank input first, then an uninitialised engine.
    if not question.strip():
        return "Пожалуйста, введите вопрос", ""
    if query_engine is None:
        return "❌ Система не инициализирована", ""

    result, refs = answer_question(question, query_engine, reranker)
    return result, refs
60
- def create_interface():
61
- """Create Gradio UI"""
62
- # Auto-initialize system before UI starts
63
- status_msg = initialize_system()
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
- with gr.Blocks(title="AIEXP - RAG System", theme=gr.themes.Soft()) as demo:
66
- gr.Markdown("""
67
- # AIEXP - AI Expert для нормативной документации
68
- ## Упрощенная версия RAG системы
69
- """)
 
70
 
71
- gr.Markdown("### Задайте вопрос")
 
 
 
 
 
 
 
 
 
72
 
73
- with gr.Row():
74
- question = gr.Textbox(
75
- label="Ваш вопрос",
76
- placeholder="Введите вопрос...",
77
- lines=3
78
- )
79
 
80
- ask_btn = gr.Button("Найти ответ", variant="primary")
81
 
82
- gr.Examples(
83
- examples=[
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  "О чем этот рисунок: ГОСТ Р 50.04.07-2022 Приложение Л. Л.1.5 Рисунок Л.2",
85
  "Л.9 Формула в ГОСТ Р 50.04.07 - 2022 что и о чем там?",
86
  "Какой стандарт устанавливает порядок признания протоколов испытаний продукции в области использования атомной энергии?",
@@ -88,38 +253,103 @@ def create_interface():
88
  "В каких случаях могут быть признаны протоколы испытаний, проведенные лабораториями?",
89
  "В какой таблице можно найти информацию о методы исследований при аттестационных испытаниях технологии термической обработки заготовок из легированных сталей? Какой документ и какой раздел?"
90
  ],
91
- inputs=question
92
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
- with gr.Row():
95
- answer = gr.Textbox(
96
- label="Ответ",
97
- lines=10
 
 
 
 
 
 
 
 
 
 
 
 
98
  )
99
- sources = gr.Textbox(
100
- label="Источники",
101
- lines=10
 
 
102
  )
 
103
 
104
- # Event handlers
105
- ask_btn.click(
106
- fn=ask_question,
107
- inputs=question,
108
- outputs=[answer, sources]
109
- )
110
 
111
- question.submit(
112
- fn=ask_question,
113
- inputs=question,
114
- outputs=[answer, sources]
115
- )
116
 
117
- return demo
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
  if __name__ == "__main__":
120
- demo = create_interface()
121
- demo.launch(
122
- server_name="0.0.0.0",
123
- server_port=7860,
124
- share=True
125
- )
 
1
  import gradio as gr
2
+ import os
3
  from llama_index.core import Settings
4
+ from documents_prep import load_json_documents, load_table_data, load_image_data, load_csv_chunks
 
5
  from utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
6
  from my_logging import log_message
7
+ from index_retriever import create_vector_index, create_query_engine
8
+ import sys
9
+ from config import (
10
+ HF_REPO_ID, HF_TOKEN, DOWNLOAD_DIR, CHUNKS_FILENAME,
11
+ JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
12
+ )
13
 
14
def create_chunks_display_html(chunk_info):
    """Render the retrieved chunks as a scrollable HTML panel."""
    if not chunk_info:
        return "<div style='padding: 20px; text-align: center; color: black;'>Нет данных о чанках</div>"

    parts = [
        "<div style='max-height: 500px; overflow-y: auto; padding: 10px; color: black;'>",
        f"<h4 style='color: black;'>Найдено релевантных чанков: {len(chunk_info)}</h4>",
    ]

    zebra = ("#f8f9fa", "#e9ecef")  # alternating row backgrounds
    for idx, chunk in enumerate(chunk_info):
        # Section label and preview text come from the sibling helpers.
        section_display = get_section_display(chunk)
        formatted_content = get_formatted_content(chunk)

        parts.append(f"""
        <div style='background-color: {zebra[idx % 2]}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
            <strong style='color: black;'>Документ:</strong> <span style='color: black;'>{chunk['document_id']}</span><br>
            <strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{section_display}</span><br>
            <strong style='color: black;'>Содержание:</strong><br>
            <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
                {formatted_content}
            </div>
        </div>
        """)

    parts.append("</div>")
    return "".join(parts)
41
+
42
def get_section_display(chunk):
    """Human-readable location of a chunk inside its document.

    Tables and images are labelled "таблица №N" / "рисунок №N" (the number
    is normalised to carry a leading "№"); plain-text chunks fall back to
    the section path, then the section id.
    """
    doc_type = chunk.get('type', 'text')

    if doc_type == 'table' and chunk.get('table_number'):
        num = str(chunk.get('table_number'))
        if not num.startswith('№'):
            num = f"№{num}"
        return f"таблица {num}"

    if doc_type == 'image' and chunk.get('image_number'):
        num = str(chunk.get('image_number'))
        if not num.startswith('№'):
            num = f"№{num}"
        return f"рисунок {num}"

    # Fix: the original trailing `elif section_id != 'unknown'` branch was
    # redundant — both it and the fallback returned section_id. Collapsed
    # into a single expression with identical behaviour.
    return chunk.get('section_path', '') or chunk.get('section_id', 'unknown')
65
 
66
def get_formatted_content(chunk):
    """Build the "В разделе … в документе …, пункт …" sentence for a chunk."""
    doc_id = chunk.get('document_id', 'unknown')
    path = chunk.get('section_path', '')
    sec_id = chunk.get('section_id', 'unknown')
    sec_text = chunk.get('section_text', '')
    body = chunk.get('chunk_text', '')
    section = path or sec_id

    # Nested (sub)sections name their parent section in the preamble.
    parent = chunk.get('parent_section', '')
    if parent and chunk.get('level', '') in ('subsection', 'sub_subsection', 'sub_sub_subsection'):
        parent_title = chunk.get('parent_title', '')
        parent_label = f"{parent} ({parent_title})" if parent_title else parent
        return f"В разделе {parent_label} в документе {doc_id}, пункт {section}: {body}"

    # Top-level sections: derive a heading and strip a leading section
    # number from the chunk text when it repeats the section label.
    text = body
    prefix = f"{section} "
    if sec_text and body.startswith(sec_text):
        heading = sec_text
    elif body.startswith(prefix):
        text = body[len(prefix):].strip()
        if sec_text:
            heading = sec_text
        else:
            snippet = text.split('.')[0] if '.' in text else text[:50]
            heading = f"{section} {snippet}"
    else:
        heading = sec_text if sec_text else section

    return f"В разделе {section} в документе {doc_id}, пункт {heading}: {text}"
94
+
95
def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
                      json_files_dir=None, table_data_dir=None, image_data_dir=None,
                      use_json_instead_csv=False):
    """Build the full RAG stack from the HF dataset *repo_id*.

    Returns (query_engine, chunks_df, reranker, vector_index, chunk_info);
    on any failure the error is logged and (None, None, None, None, [])
    is returned so the caller can refuse to start the UI.
    """
    try:
        from documents_prep import process_documents_with_chunking
        log_message("Инициализация системы")
        os.makedirs(download_dir, exist_ok=True)
        from config import CHUNK_SIZE, CHUNK_OVERLAP
        from llama_index.core.text_splitter import TokenTextSplitter

        # Models: embeddings + default LLM + reranker, wired into Settings.
        embed_model = get_embedding_model()
        llm = get_llm_model(DEFAULT_MODEL)
        reranker = get_reranker_model()

        Settings.embed_model = embed_model
        Settings.llm = llm
        Settings.text_splitter = TokenTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
            separator=" ",
            backup_separators=["\n", ".", "!", "?"],
        )

        log_message(f"Configured chunk size: {CHUNK_SIZE} tokens")
        log_message(f"Configured chunk overlap: {CHUNK_OVERLAP} tokens")

        all_documents = []
        chunks_df = None
        chunk_info = []

        # Primary corpus: either per-document JSON files or a CSV of chunks.
        if use_json_instead_csv and json_files_dir:
            log_message("Используем JSON файлы вместо CSV")
            docs, info = load_json_documents(repo_id, hf_token, json_files_dir, download_dir)
            all_documents.extend(docs)
            chunk_info.extend(info)
        elif chunks_filename:
            log_message("Загружаем данные из CSV")
            docs, chunks_df = load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir)
            all_documents.extend(docs)

        # Optional extra corpora: tables and image descriptions, re-chunked.
        if table_data_dir:
            log_message("Добавляю табличные данные")
            table_docs = load_table_data(repo_id, hf_token, table_data_dir)
            log_message(f"Загружено {len(table_docs)} табличных документов")
            docs, info = process_documents_with_chunking(table_docs)
            all_documents.extend(docs)
            chunk_info.extend(info)

        if image_data_dir:
            log_message("Добавляю данные изображений")
            image_docs = load_image_data(repo_id, hf_token, image_data_dir)
            log_message(f"Загружено {len(image_docs)} документов изображений")
            docs, info = process_documents_with_chunking(image_docs)
            all_documents.extend(docs)
            chunk_info.extend(info)

        log_message(f"Всего документов после всей обработки: {len(all_documents)}")

        vector_index = create_vector_index(all_documents)
        query_engine = create_query_engine(vector_index)

        log_message(f"Система успешно инициализирована")
        return query_engine, chunks_df, reranker, vector_index, chunk_info

    except Exception as e:
        log_message(f"Ошибка инициализации: {str(e)}")
        return None, None, None, None, []
167
+
168
def switch_model(model_name, vector_index):
    """Swap the active LLM and rebuild the query engine over *vector_index*.

    Returns (query_engine_or_None, status message for the UI).
    """
    from llama_index.core import Settings
    from index_retriever import create_query_engine

    try:
        log_message(f"Переключение на модель: {model_name}")

        # Install the new LLM globally before rebuilding the engine.
        Settings.llm = get_llm_model(model_name)

        if vector_index is None:
            return None, "❌ Ошибка: система не инициализирована"

        engine = create_query_engine(vector_index)
        log_message(f"Модель успешно переключена на: {model_name}")
        return engine, f"✅ Модель переключена на: {model_name}"

    except Exception as e:
        error_msg = f"Ошибка переключения модели: {str(e)}"
        log_message(error_msg)
        return None, f"❌ {error_msg}"
189
 
190
def main_answer_question(question):
    """Gradio callback: route *question* through the RAG pipeline.

    Always returns three HTML fragments: (answer, sources, chunks).
    Errors are logged and rendered into the panels instead of raised.
    """
    global query_engine, reranker, current_model, chunks_df

    if not question.strip():
        return ("<div style='color: black;'>Пожалуйста, введите вопрос</div>",
                "<div style='color: black;'>Источники появятся после обработки запроса</div>",
                "<div style='color: black;'>Чанки появятся после обработки запроса</div>")

    try:
        # answer_question yields the three HTML panels directly.
        answer_html, sources_html, chunks_html = answer_question(
            question, query_engine, reranker, current_model, chunks_df
        )
        return answer_html, sources_html, chunks_html
    except Exception as e:
        log_message(f"Ошибка при ответе на вопрос: {str(e)}")
        return (f"<div style='color: red;'>Ошибка: {str(e)}</div>",
                "<div style='color: black;'>Источники недоступны из-за ошибки</div>",
                "<div style='color: black;'>Чанки недоступны из-за ошибки</div>")
207
 
 
 
 
 
 
 
208
 
 
209
 
210
+ def create_demo_interface(answer_question_func, switch_model_func, current_model, chunk_info=None):
211
+ with gr.Blocks(title="AIEXP - AI Expert для нормативной документации", theme=gr.themes.Soft()) as demo:
212
+
213
+ gr.Markdown("""
214
+ # AIEXP - Artificial Intelligence Expert
215
+
216
+ ## Инструмент для работы с нормативной документацией
217
+ """)
218
+
219
+ with gr.Tab("Поиск по нормативным документам"):
220
+ gr.Markdown("### Задайте вопрос по нормативной документации")
221
+
222
+ with gr.Row():
223
+ with gr.Column(scale=2):
224
+ model_dropdown = gr.Dropdown(
225
+ choices=list(AVAILABLE_MODELS.keys()),
226
+ value=current_model,
227
+ label="Выберите языковую модель",
228
+ info="Выберите модель для генерации ответов"
229
+ )
230
+ with gr.Column(scale=1):
231
+ switch_btn = gr.Button("Переключить модель", variant="secondary")
232
+ model_status = gr.Textbox(
233
+ value=f"Текущая модель: {current_model}",
234
+ label="Статус модели",
235
+ interactive=False
236
+ )
237
+
238
+ with gr.Row():
239
+ with gr.Column(scale=3):
240
+ question_input = gr.Textbox(
241
+ label="Ваш вопрос к базе знаний",
242
+ placeholder="Введите вопрос по нормативным документам...",
243
+ lines=3
244
+ )
245
+ ask_btn = gr.Button("Найти ответ", variant="primary", size="lg")
246
+
247
+ gr.Examples(
248
+ examples=[
249
  "О чем этот рисунок: ГОСТ Р 50.04.07-2022 Приложение Л. Л.1.5 Рисунок Л.2",
250
  "Л.9 Формула в ГОСТ Р 50.04.07 - 2022 что и о чем там?",
251
  "Какой стандарт устанавливает порядок признания протоколов испытаний продукции в области использования атомной энергии?",
 
253
  "В каких случаях могут быть признаны протоколы испытаний, проведенные лабораториями?",
254
  "В какой таблице можно найти информацию о методы исследований при аттестационных испытаниях технологии термической обработки заготовок из легированных сталей? Какой документ и какой раздел?"
255
  ],
256
+ inputs=question_input
257
+ )
258
+
259
+ with gr.Row():
260
+ with gr.Column(scale=2):
261
+ answer_output = gr.HTML(
262
+ label="",
263
+ value=f"<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появится ответ на ваш вопрос...<br><small>Текущая модель: {current_model}</small></div>",
264
+ )
265
+
266
+ with gr.Column(scale=1):
267
+ sources_output = gr.HTML(
268
+ label="",
269
+ value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся релевантные чанки...</div>",
270
+ )
271
 
272
+ with gr.Column(scale=1):
273
+ chunks_output = gr.HTML(
274
+ label="Релевантные чанки",
275
+ value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся релевантные чанки...</div>",
276
+ )
277
+
278
+ switch_btn.click(
279
+ fn=switch_model_func,
280
+ inputs=[model_dropdown],
281
+ outputs=[model_status]
282
+ )
283
+
284
+ ask_btn.click(
285
+ fn=answer_question_func,
286
+ inputs=[question_input],
287
+ outputs=[answer_output, sources_output, chunks_output]
288
  )
289
+
290
+ question_input.submit(
291
+ fn=answer_question_func,
292
+ inputs=[question_input],
293
+ outputs=[answer_output, sources_output, chunks_output]
294
  )
295
+ return demo
296
 
 
 
 
 
 
 
297
 
298
+ query_engine = None
299
+ chunks_df = None
300
+ reranker = None
301
+ vector_index = None
302
+ current_model = DEFAULT_MODEL
303
 
304
def main_answer_question(question):
    """Gradio callback: answer *question* using the global RAG state.

    NOTE(review): this is the module's SECOND definition of
    main_answer_question; it shadows the earlier one, which also validated
    input and caught backend errors. Those guards are reinstated here so
    blank input and pipeline failures don't surface as raw tracebacks in
    the UI.

    Returns three HTML fragments: (answer, sources, chunks).
    """
    global query_engine, reranker, current_model, chunks_df

    if not question.strip():
        return ("<div style='color: black;'>Пожалуйста, введите вопрос</div>",
                "<div style='color: black;'>Источники появятся после обработки запроса</div>",
                "<div style='color: black;'>Чанки появятся после обработки запроса</div>")

    try:
        answer_html, sources_html, chunks_html = answer_question(
            question, query_engine, reranker, current_model, chunks_df
        )
        return answer_html, sources_html, chunks_html
    except Exception as e:
        log_message(f"Ошибка при ответе на вопрос: {str(e)}")
        return (f"<div style='color: red;'>Ошибка: {str(e)}</div>",
                "<div style='color: black;'>Источники недоступны из-за ошибки</div>",
                "<div style='color: black;'>Чанки недоступны из-за ошибки</div>")
310
+
311
def main_switch_model(model_name):
    """Gradio callback: switch the LLM and refresh the global query engine."""
    global query_engine, vector_index, current_model

    engine, status = switch_model(model_name, vector_index)
    if engine:
        # Commit the new engine/model name only on success.
        query_engine = engine
        current_model = model_name
    return status
320
+
321
def main():
    """Entry point: initialise the RAG system, then serve the Gradio UI."""
    global query_engine, chunks_df, reranker, vector_index, current_model

    log_message("Запуск AIEXP - AI Expert для нормативной документации")

    query_engine, chunks_df, reranker, vector_index, chunk_info = initialize_system(
        repo_id=HF_REPO_ID,
        hf_token=HF_TOKEN,
        download_dir=DOWNLOAD_DIR,
        json_files_dir=JSON_FILES_DIR,
        table_data_dir=TABLE_DATA_DIR,
        image_data_dir=IMAGE_DATA_DIR,
        use_json_instead_csv=True,
    )

    # Refuse to serve a UI over a half-initialised backend.
    if not query_engine:
        log_message("Невозможно запустить приложение из-за ошибки инициализации")
        sys.exit(1)

    log_message("Запуск веб-интерфейса")
    demo = create_demo_interface(
        answer_question_func=main_answer_question,
        switch_model_func=main_switch_model,
        current_model=current_model,
        chunk_info=chunk_info,
    )
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        debug=False,
    )
353
 
354
  if __name__ == "__main__":
355
+ main()
 
 
 
 
 
documents_prep.py CHANGED
@@ -3,769 +3,486 @@ import zipfile
3
  import pandas as pd
4
  from huggingface_hub import hf_hub_download, list_repo_files
5
  from llama_index.core import Document
6
- from llama_index.core.text_splitter import SentenceSplitter
7
  from my_logging import log_message
8
- import re
 
 
9
 
10
- # Configuration
11
- CHUNK_SIZE = 1500
12
- CHUNK_OVERLAP = 128
13
 
14
def chunk_text_documents(documents):
    """Split text documents into sentence-based chunks with position metadata."""
    splitter = SentenceSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )

    chunked = []
    for doc in documents:
        nodes = splitter.get_nodes_from_documents([doc])
        total = len(nodes)
        for idx, node in enumerate(nodes):
            node.metadata.update({
                'chunk_id': idx,
                'total_chunks': total,
                'chunk_size': len(node.text)  # character count of this chunk
            })
            chunked.append(node)

    # Summary statistics for the log.
    if chunked:
        sizes = [len(node.text) for node in chunked]
        log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
        log_message(f" Size stats: avg={sum(sizes) / len(sizes):.0f}, min={min(sizes)}, max={max(sizes)} chars")

    return chunked
40
-
41
def should_keep_table_whole(doc_id):
    """True when *doc_id* names a document whose tables must not be split."""
    # Documents whose tables are only meaningful as a single chunk.
    special_patterns = (
        r'НП\s*068-05',
        r'НП-068-05',
        r'59023',
        r'ГОСТ\s*Р?\s*59023',
    )
    return any(
        re.search(pattern, doc_id, re.IGNORECASE)
        for pattern in special_patterns
    )
54
-
55
- def chunk_table_by_rows(table_data, doc_id, rows_per_chunk=3, max_chars=2000):
56
- headers = table_data.get('headers', [])
57
- rows = table_data.get('data', [])
58
- table_num = str(table_data.get('table_number', 'unknown')).strip()
59
- table_title = table_data.get('table_title', '')
60
- section = table_data.get('section', '')
61
-
62
- # CHECK FOR SPECIAL FILES - NO CHUNKING
63
- if should_keep_table_whole(doc_id):
64
- log_message(f" 📊 FULL TABLE (special file): {doc_id} - {table_num}")
65
- return create_full_table_chunk(table_data, doc_id)
66
-
67
- # Section-aware identifier (keep your existing logic)
68
- import re
69
- if 'приложени' in section.lower():
70
- appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
71
- if appendix_match:
72
- appendix_num = appendix_match.group(1).upper()
73
- table_identifier = f"{table_num} Приложение {appendix_num}"
74
- else:
75
- table_identifier = table_num
76
- else:
77
- table_identifier = table_num
78
-
79
- log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
80
-
81
- # Build base header (compact version)
82
- base_header = f"ДОКУМЕНТ: {doc_id} | ТАБЛИЦА: {table_identifier}\n"
83
- if table_title:
84
- base_header += f"НАЗВАНИЕ: {table_title}\n"
85
- base_header += f"{'='*60}\n"
86
-
87
- if headers:
88
- header_str = ' | '.join(str(h)[:30] for h in headers) # Truncate long headers
89
- base_header += f"ЗАГОЛОВКИ: {header_str}\n\n"
90
-
91
- # Calculate available space
92
- base_size = len(base_header)
93
- footer_size = 100
94
- available_space = max_chars - base_size - footer_size
95
-
96
- chunks = []
97
- current_batch = []
98
- current_size = 0
99
- chunk_num = 0
100
-
101
- for i, row in enumerate(rows):
102
- row_text = format_single_row(row, i + 1)
103
- row_size = len(row_text)
104
 
105
- # Case 1: Single row exceeds max - split it internally
106
- if row_size > available_space:
107
- # Flush current batch first
108
- if current_batch:
109
- chunks.append(_create_chunk(
110
- base_header, current_batch, table_identifier,
111
- doc_id, table_num, table_title, section,
112
- len(rows), chunk_num, False
113
- ))
114
- chunk_num += 1
115
- current_batch = []
116
- current_size = 0
117
- log_message(f" ⚠ Row {i+1} too large ({row_size} chars), splitting...")
118
- # Split the large row
119
- split_chunks = _split_large_row(
120
- row, i + 1, base_header, available_space,
121
- table_identifier, doc_id, table_num, table_title,
122
- section, len(rows), chunk_num
123
- )
124
- chunks.extend(split_chunks)
125
- log_message(f" → Created {len(split_chunks)} chunks from row {i+1}")
126
- chunk_num += len(split_chunks)
127
- continue
128
-
129
- # Case 2: Adding this row would exceed limit - flush current batch
130
- if current_size + row_size > available_space and current_batch:
131
- chunks.append(_create_chunk(
132
- base_header, current_batch, table_identifier,
133
- doc_id, table_num, table_title, section,
134
- len(rows), chunk_num, False
135
- ))
136
- chunk_num += 1
137
- current_batch = []
138
- current_size = 0
139
-
140
- # Case 3: Add row to current batch
141
- current_batch.append({'row': row, 'idx': i + 1, 'text': row_text})
142
- log_message(f" + Row {i+1} ({row_size} chars) added to chunk {chunk_num}")
143
- current_size += row_size
144
-
145
- # Flush if we hit target row count
146
- if len(current_batch) >= rows_per_chunk:
147
- chunks.append(_create_chunk(
148
- base_header, current_batch, table_identifier,
149
- doc_id, table_num, table_title, section,
150
- len(rows), chunk_num, False
151
- ))
152
- chunk_num += 1
153
- current_batch = []
154
- current_size = 0
155
-
156
- # Flush remaining rows
157
- if current_batch:
158
- chunks.append(_create_chunk(
159
- base_header, current_batch, table_identifier,
160
- doc_id, table_num, table_title, section,
161
- len(rows), chunk_num, len(chunks) == 0
162
- ))
163
-
164
- log_message(f" Created {len(chunks)} chunks from {len(rows)} rows")
165
- return chunks
166
-
167
-
168
- def create_full_table_chunk(table_data, doc_id):
169
- """Create a single chunk for entire table (no splitting)"""
170
- headers = table_data.get('headers', [])
171
- rows = table_data.get('data', [])
172
- table_num = str(table_data.get('table_number', 'unknown')).strip()
173
- table_title = table_data.get('table_title', '')
174
- section = table_data.get('section', '')
175
-
176
- # Section-aware identifier
177
- import re
178
- if 'приложени' in section.lower():
179
- appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
180
- if appendix_match:
181
- appendix_num = appendix_match.group(1).upper()
182
- table_identifier = f"{table_num} Приложение {appendix_num}"
183
- else:
184
- table_identifier = table_num
185
- else:
186
- table_identifier = table_num
187
-
188
- # Build full content
189
- content = f"ДОКУМЕНТ: {doc_id} | ТАБЛИЦА: {table_identifier}\n"
190
- if table_title:
191
- content += f"НАЗВАНИЕ: {table_title}\n"
192
- content += f"РАЗДЕЛ: {section}\n"
193
- content += f"{'='*60}\n"
194
-
195
- if headers:
196
- header_str = ' | '.join(str(h) for h in headers)
197
- content += f"ЗАГОЛОВКИ: {header_str}\n\n"
198
 
199
- content += "ДАННЫЕ (ПОЛНАЯ ТАБЛИЦА):\n"
200
-
201
- for i, row in enumerate(rows, 1):
202
- row_text = format_single_row(row, i)
203
- if row_text:
204
- content += row_text
205
-
206
- content += f"\n[Полная таблица: {len(rows)} строк]\n"
207
-
208
- # Embed metadata in text
209
- content += f"\n\n--- МЕТАДАННЫЕ ---\n"
210
- content += f"Документ: {doc_id}\n"
211
- content += f"Таблица: {table_identifier}\n"
212
- content += f"Название таблицы: {table_title}\n"
213
- content += f"Раздел: {section}\n"
214
- content += f"Всего строк: {len(rows)}\n"
215
-
216
- metadata = {
217
- 'type': 'table',
218
- 'document_id': doc_id,
219
- 'table_number': table_num,
220
- 'table_identifier': table_identifier,
221
- 'table_title': table_title,
222
- 'section': section,
223
- 'chunk_id': 0,
224
- 'row_start': 0,
225
- 'row_end': len(rows),
226
- 'total_rows': len(rows),
227
- 'chunk_size': len(content),
228
- 'is_complete_table': True,
229
- 'chunking_strategy': 'full_table',
230
- 'rows_in_chunk': len(rows)
231
- }
232
-
233
- return [Document(text=content, metadata=metadata)]
234
-
235
 
236
def _create_chunk(base_header, batch, table_identifier, doc_id,
                  table_num, table_title, section, total_rows,
                  chunk_num, is_complete):
    """Assemble one table-chunk Document from a batch of formatted rows.

    *batch* items are dicts with 'idx' (1-based row number) and 'text'
    (pre-formatted row line). The metadata is also duplicated into the
    text body so the retriever can match on it.
    """
    first_row = batch[0]['idx']
    last_row = batch[-1]['idx']

    pieces = [base_header, "ДАННЫЕ:\n"]
    pieces.extend(item['text'] for item in batch)

    # Row-range footer only for partial tables.
    if not is_complete:
        pieces.append(f"\n[Строки {first_row}-{last_row} из {total_rows}]")

    # Embed all metadata in the text for better retrieval.
    pieces.append(
        f"\n\n--- МЕТАДАННЫЕ ---\n"
        f"Документ: {doc_id}\n"
        f"Таблица: {table_identifier}\n"
        f"Название таблицы: {table_title}\n"
        f"Раздел: {section}\n"
        f"Строки: {first_row}-{last_row} из {total_rows}\n"
    )
    content = "".join(pieces)

    metadata = {
        'type': 'table',
        'document_id': doc_id,
        'table_number': table_num,
        'table_identifier': table_identifier,
        'table_title': table_title,
        'section': section,
        'chunk_id': chunk_num,
        'row_start': first_row - 1,  # stored 0-based
        'row_end': last_row,
        'total_rows': total_rows,
        'chunk_size': len(content),
        'is_complete_table': is_complete,
        'rows_in_chunk': len(batch),
    }
    return Document(text=content, metadata=metadata)
277
-
278
-
279
- def _split_large_row(row, row_idx, base_header, max_size,
280
- table_identifier, doc_id, table_num,
281
- table_title, section, total_rows, base_chunk_num):
282
- """Split a single large row into multiple chunks"""
283
- if isinstance(row, dict):
284
- items = list(row.items())
285
- else:
286
- items = [(f"col_{i}", v) for i, v in enumerate(row)]
287
-
288
- chunks = []
289
- current_items = []
290
- current_size = 0
291
- part_num = 0
292
-
293
- for key, value in items:
294
- item_text = f"{key}: {value}\n"
295
- item_size = len(item_text)
296
 
297
- if current_size + item_size > max_size and current_items:
298
- # Create chunk for current items
299
- content = base_header + "ДАННЫЕ:\n"
300
- content += f"Строка {row_idx} (часть {part_num + 1}):\n"
301
- content += "".join(current_items)
302
- content += f"\n[Строка {row_idx} из {total_rows} - продолжается]"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
 
304
- chunks.append(_create_chunk_from_text(
305
- content, doc_id, table_num, table_identifier,
306
- table_title, section, row_idx, row_idx,
307
- total_rows, base_chunk_num + part_num
308
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
 
310
- part_num += 1
311
- current_items = []
312
- current_size = 0
313
-
314
- current_items.append(item_text)
315
- current_size += item_size
316
-
317
- # Flush remaining
318
- if current_items:
319
- content = base_header + "ДАННЫЕ:\n"
320
- content += f"Строка {row_idx} (часть {part_num + 1}):\n"
321
- content += "".join(current_items)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
 
323
- chunks.append(_create_chunk_from_text(
324
- content, doc_id, table_num, table_identifier,
325
- table_title, section, row_idx, row_idx,
326
- total_rows, base_chunk_num + part_num
327
- ))
328
-
329
- return chunks
330
-
331
-
332
def _create_chunk_from_text(content, doc_id, table_num, table_identifier,
                            table_title, section, row_start, row_end,
                            total_rows, chunk_num):
    """Wrap pre-built chunk text in a Document carrying full table metadata."""
    return Document(
        text=content,
        metadata={
            'type': 'table',
            'document_id': doc_id,
            'table_number': table_num,
            'table_identifier': table_identifier,
            'table_title': table_title,
            'section': section,
            'chunk_id': chunk_num,
            'row_start': row_start - 1,  # stored 0-based
            'row_end': row_end,
            'total_rows': total_rows,
            'chunk_size': len(content),
            'is_complete_table': False,
        },
    )
352
-
353
-
354
def format_single_row(row, idx):
    """Render one table row as a numbered "a | b | c" line, skipping empty cells.

    Accepts a dict (cells rendered as "key: value") or a list (values only);
    any other type yields "". Cells that are falsy, whitespace-only, or the
    strings 'nan'/'none' are dropped.
    """
    def keep(value):
        text = str(value)
        return bool(value) and bool(text.strip()) and text.lower() not in ('nan', 'none', '')

    if isinstance(row, dict):
        cells = [f"{key}: {value}" for key, value in row.items() if keep(value)]
    elif isinstance(row, list):
        cells = [str(value) for value in row if keep(value)]
    else:
        return ""

    return f"{idx}. {' | '.join(cells)}\n" if cells else ""
366
 
367
-
368
-
369
- def load_table_documents(repo_id, hf_token, table_dir):
370
- log_message("Loading tables...")
371
-
372
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
373
- table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
374
-
375
- all_chunks = []
376
- stats = {
377
- 'full_tables': 0,
378
- 'split_tables': 0,
379
- 'total_chunks': 0,
380
- 'full_table_sizes': [],
381
- 'split_chunk_sizes': []
382
- }
383
 
384
- for file_path in table_files:
385
- try:
386
- local_path = hf_hub_download(
387
- repo_id=repo_id,
388
- filename=file_path,
389
- repo_type="dataset",
390
- token=hf_token
391
- )
392
 
393
- with open(local_path, 'r', encoding='utf-8') as f:
394
- data = json.load(f)
395
 
396
- file_doc_id = data.get('document_id', data.get('document', 'unknown'))
 
 
 
 
 
 
 
 
 
 
 
 
 
397
 
398
- for sheet in data.get('sheets', []):
399
- sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
400
-
401
- chunks = chunk_table_by_rows(sheet, sheet_doc_id, max_chars=3072)
402
-
403
- # Track statistics
404
- if chunks:
405
- is_full = chunks[0].metadata.get('is_complete_table', False)
406
- chunk_size = chunks[0].metadata.get('chunk_size', 0)
407
 
408
- if is_full:
409
- stats['full_tables'] += 1
410
- stats['full_table_sizes'].append(chunk_size)
411
- log_message(f" 📄 {sheet_doc_id}: FULL TABLE ({chunk_size} chars)")
412
- else:
413
- stats['split_tables'] += 1
414
- for c in chunks:
415
- stats['split_chunk_sizes'].append(c.metadata.get('chunk_size', 0))
416
- log_message(f" 📄 {sheet_doc_id}: {len(chunks)} chunks (split)")
 
 
 
 
 
 
 
417
 
418
- stats['total_chunks'] += len(chunks)
419
-
420
- all_chunks.extend(chunks)
421
-
422
- except Exception as e:
423
- log_message(f"Error loading {file_path}: {e}")
424
-
425
- # Print final statistics
426
- log_message(f"\n{'='*60}")
427
- log_message(f"TABLE LOADING STATISTICS:")
428
- log_message(f" Total chunks created: {stats['total_chunks']}")
429
- log_message(f" Full tables (no split): {stats['full_tables']}")
430
- log_message(f" Split tables: {stats['split_tables']}")
431
-
432
- if stats['full_table_sizes']:
433
- avg_full = sum(stats['full_table_sizes']) / len(stats['full_table_sizes'])
434
- log_message(f" Full table avg size: {avg_full:.0f} chars")
435
- log_message(f" Full table size range: {min(stats['full_table_sizes'])} - {max(stats['full_table_sizes'])} chars")
436
-
437
- if stats['split_chunk_sizes']:
438
- avg_split = sum(stats['split_chunk_sizes']) / len(stats['split_chunk_sizes'])
439
- log_message(f" Split chunk avg size: {avg_split:.0f} chars")
440
- log_message(f" Split chunk size range: {min(stats['split_chunk_sizes'])} - {max(stats['split_chunk_sizes'])} chars")
441
-
442
- log_message(f"{'='*60}\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
443
 
444
- return all_chunks
445
-
446
 
447
- def create_whole_table_chunk(table_data, doc_id):
448
- """Create a single chunk for the entire table (no splitting)"""
449
- headers = table_data.get('headers', [])
450
- rows = table_data.get('data', [])
451
- table_num = str(table_data.get('table_number', 'unknown')).strip()
452
- table_title = table_data.get('table_title', '')
453
- section = table_data.get('section', '')
454
-
455
- # Section-aware identifier
456
- import re
457
- if 'приложени' in section.lower():
458
- appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
459
- if appendix_match:
460
- appendix_num = appendix_match.group(1).upper()
461
- table_identifier = f"{table_num} Приложение {appendix_num}"
462
- else:
463
- table_identifier = table_num
464
- else:
465
- table_identifier = table_num
466
-
467
- if not rows:
468
- return []
469
-
470
- log_message(f" 📊 Creating WHOLE table: {doc_id} - {table_identifier} ({len(rows)} rows)")
471
-
472
- # Build complete table content
473
- content = f"ДОКУМЕНТ: {doc_id} | ТАБЛИЦА: {table_identifier}\n"
474
- if table_title:
475
- content += f"НАЗВАНИЕ: {table_title}\n"
476
- content += f"{'='*60}\n"
477
-
478
- if headers:
479
- header_str = ' | '.join(str(h) for h in headers)
480
- content += f"ЗАГОЛОВКИ: {header_str}\n\n"
481
-
482
- content += "ДАННЫЕ:\n"
483
-
484
- # Add ALL rows
485
- for i, row in enumerate(rows, 1):
486
- row_text = format_single_row(row, i)
487
- if row_text:
488
- content += row_text
489
 
490
- # Add metadata section
491
- content += f"\n\n--- МЕТАДАННЫЕ ---\n"
492
- content += f"Документ: {doc_id}\n"
493
- content += f"Таблица: {table_identifier}\n"
494
- content += f"Название таблицы: {table_title}\n"
495
- content += f"Раздел: {section}\n"
496
- content += f"Полная таблица: {len(rows)} строк\n"
497
-
498
- metadata = {
499
- 'type': 'table',
500
- 'document_id': doc_id,
501
- 'table_number': table_num,
502
- 'table_identifier': table_identifier,
503
- 'table_title': table_title,
504
- 'section': section,
505
- 'chunk_id': 0,
506
- 'row_start': 0,
507
- 'row_end': len(rows),
508
- 'total_rows': len(rows),
509
- 'chunk_size': len(content),
510
- 'is_complete_table': True,
511
- 'rows_in_chunk': len(rows)
512
- }
513
-
514
- log_message(f" Created 1 chunk with {len(rows)} rows ({len(content)} chars)")
515
-
516
- return [Document(text=content, metadata=metadata)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
517
 
518
- def load_json_documents(repo_id, hf_token, json_dir):
519
- import zipfile
520
- import tempfile
521
- import os
522
 
523
- log_message("Loading JSON documents...")
 
524
 
525
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
526
- json_files = [f for f in files if f.startswith(json_dir) and f.endswith('.json')]
527
- zip_files = [f for f in files if f.startswith(json_dir) and f.endswith('.zip')]
528
 
529
- log_message(f"Found {len(json_files)} JSON files and {len(zip_files)} ZIP files")
 
 
 
530
 
 
 
 
531
  documents = []
532
- stats = {'success': 0, 'failed': 0, 'empty': 0}
533
 
534
- for file_path in json_files:
535
- try:
536
- log_message(f" Loading: {file_path}")
537
- local_path = hf_hub_download(
538
- repo_id=repo_id,
539
- filename=file_path,
540
- repo_type="dataset",
541
- token=hf_token
542
- )
543
-
544
- docs = extract_sections_from_json(local_path)
545
- if docs:
546
- documents.extend(docs)
547
- stats['success'] += 1
548
- log_message(f" ✓ Extracted {len(docs)} sections")
549
- else:
550
- stats['empty'] += 1
551
- log_message(f" ⚠ No sections found")
552
 
553
- except Exception as e:
554
- stats['failed'] += 1
555
- log_message(f" ✗ Error: {e}")
556
-
557
- for zip_path in zip_files:
558
- try:
559
- log_message(f" Processing ZIP: {zip_path}")
560
- local_zip = hf_hub_download(
561
- repo_id=repo_id,
562
- filename=zip_path,
563
- repo_type="dataset",
564
- token=hf_token
565
- )
566
 
567
- with zipfile.ZipFile(local_zip, 'r') as zf:
568
- json_files_in_zip = [f for f in zf.namelist()
569
- if f.endswith('.json')
570
- and not f.startswith('__MACOSX')
571
- and not f.startswith('.')
572
- and not '._' in f]
573
-
574
- log_message(f" Found {len(json_files_in_zip)} JSON files in ZIP")
575
-
576
- for json_file in json_files_in_zip:
577
- try:
578
- file_content = zf.read(json_file)
579
-
580
- # Skip if file is too small
581
- if len(file_content) < 10:
582
- log_message(f" ✗ Skipping: {json_file} (file too small)")
583
- stats['failed'] += 1
584
- continue
585
-
586
- # Try UTF-8 first (most common)
587
- try:
588
- text_content = file_content.decode('utf-8')
589
- except UnicodeDecodeError:
590
- try:
591
- text_content = file_content.decode('utf-8-sig')
592
- except UnicodeDecodeError:
593
- try:
594
- # Try UTF-16 (the issue you're seeing)
595
- text_content = file_content.decode('utf-16')
596
- except UnicodeDecodeError:
597
- try:
598
- text_content = file_content.decode('windows-1251')
599
- except UnicodeDecodeError:
600
- log_message(f" ✗ Skipping: {json_file} (encoding failed)")
601
- stats['failed'] += 1
602
- continue
603
-
604
- # Validate JSON structure
605
- if not text_content.strip().startswith('{') and not text_content.strip().startswith('['):
606
- log_message(f" ✗ Skipping: {json_file} (not valid JSON)")
607
- stats['failed'] += 1
608
- continue
609
-
610
- with tempfile.NamedTemporaryFile(mode='w', delete=False,
611
- suffix='.json', encoding='utf-8') as tmp:
612
- tmp.write(text_content)
613
- tmp_path = tmp.name
614
-
615
- docs = extract_sections_from_json(tmp_path)
616
- if docs:
617
- documents.extend(docs)
618
- stats['success'] += 1
619
- log_message(f" ✓ {json_file}: {len(docs)} sections")
620
- else:
621
- stats['empty'] += 1
622
- log_message(f" ⚠ {json_file}: No sections")
623
-
624
- os.unlink(tmp_path)
625
-
626
- except json.JSONDecodeError as e:
627
- stats['failed'] += 1
628
- log_message(f" ✗ {json_file}: Invalid JSON")
629
- except Exception as e:
630
- stats['failed'] += 1
631
- log_message(f" ✗ {json_file}: {str(e)[:100]}")
632
-
633
- except Exception as e:
634
- log_message(f" ✗ Error with ZIP: {e}")
635
 
636
- log_message(f"="*60)
637
- log_message(f"JSON Loading Stats:")
638
- log_message(f" Success: {stats['success']}")
639
- log_message(f" Empty: {stats['empty']}")
640
- log_message(f" Failed: {stats['failed']}")
641
- log_message(f" Total sections: {len(documents)}")
642
- log_message(f"="*60)
643
 
644
  return documents
645
 
646
- def extract_sections_from_json(json_path):
647
- """Extract sections from a single JSON file"""
648
- documents = []
649
 
 
650
  try:
651
- with open(json_path, 'r', encoding='utf-8') as f:
652
- data = json.load(f)
 
 
653
 
654
- doc_id = data.get('document_metadata', {}).get('document_id', 'unknown')
655
 
656
- # Extract all section levels
657
- for section in data.get('sections', []):
658
- if section.get('section_text', '').strip():
659
- documents.append(Document(
660
- text=section['section_text'],
661
- metadata={
662
- 'type': 'text',
663
- 'document_id': doc_id,
664
- 'section_id': section.get('section_id', '')
665
- }
666
- ))
667
-
668
- # Subsections
669
- for subsection in section.get('subsections', []):
670
- if subsection.get('subsection_text', '').strip():
671
- documents.append(Document(
672
- text=subsection['subsection_text'],
 
 
 
 
 
 
 
 
 
 
 
673
  metadata={
674
- 'type': 'text',
675
- 'document_id': doc_id,
676
- 'section_id': subsection.get('subsection_id', '')
 
 
 
 
 
677
  }
678
- ))
679
-
680
- # Sub-subsections
681
- for sub_sub in subsection.get('sub_subsections', []):
682
- if sub_sub.get('sub_subsection_text', '').strip():
683
- documents.append(Document(
684
- text=sub_sub['sub_subsection_text'],
685
- metadata={
686
- 'type': 'text',
687
- 'document_id': doc_id,
688
- 'section_id': sub_sub.get('sub_subsection_id', '')
689
- }
690
- ))
691
-
692
  except Exception as e:
693
- log_message(f"Error extracting from {json_path}: {e}")
694
-
695
- return documents
696
 
697
 
698
- def load_image_documents(repo_id, hf_token, image_dir):
699
- """Load image descriptions"""
700
- log_message("Loading images...")
701
-
702
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
703
- csv_files = [f for f in files if f.startswith(image_dir) and f.endswith('.csv')]
704
 
705
- documents = []
706
- for file_path in csv_files:
707
- try:
708
- local_path = hf_hub_download(
709
- repo_id=repo_id,
710
- filename=file_path,
711
- repo_type="dataset",
712
- token=hf_token
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
713
  )
714
-
715
- df = pd.read_csv(local_path)
716
-
717
- for _, row in df.iterrows():
718
- content = f"Документ: {row.get('Обозначение документа', 'unknown')}\n"
719
- content += f"Рисунок: {row.get('№ Изображения', 'unknown')}\n"
720
- content += f"Название: {row.get('Название изображения', '')}\n"
721
- content += f"Описание: {row.get('Описание изображение', '')}\n"
722
- content += f"Раздел: {row.get('Раздел документа', '')}\n"
723
-
724
- chunk_size = len(content)
725
-
726
- documents.append(Document(
727
- text=content,
728
- metadata={
729
- 'type': 'image',
730
- 'document_id': str(row.get('Обозначение документа', 'unknown')),
731
- 'image_number': str(row.get('№ Изображения', 'unknown')),
732
- 'section': str(row.get('Раздел документа', '')),
733
- 'chunk_size': chunk_size
734
- }
735
- ))
736
- except Exception as e:
737
- log_message(f"Error loading {file_path}: {e}")
738
-
739
- if documents:
740
- avg_size = sum(d.metadata['chunk_size'] for d in documents) / len(documents)
741
- log_message(f"✓ Loaded {len(documents)} images (avg size: {avg_size:.0f} chars)")
742
-
743
- return documents
744
-
745
-
746
- def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
747
- """Main loader - combines all document types"""
748
- log_message("="*60)
749
- log_message("STARTING DOCUMENT LOADING")
750
- log_message("="*60)
751
-
752
- # Load text sections
753
- text_docs = load_json_documents(repo_id, hf_token, json_dir)
754
- text_chunks = chunk_text_documents(text_docs)
755
-
756
- # Load tables (already chunked)
757
- table_chunks = load_table_documents(repo_id, hf_token, table_dir)
758
-
759
- # Load images (no chunking needed)
760
- image_docs = load_image_documents(repo_id, hf_token, image_dir)
761
-
762
- all_docs = text_chunks + table_chunks + image_docs
763
-
764
- log_message("="*60)
765
- log_message(f"TOTAL DOCUMENTS: {len(all_docs)}")
766
- log_message(f" Text chunks: {len(text_chunks)}")
767
- log_message(f" Table chunks: {len(table_chunks)}")
768
- log_message(f" Images: {len(image_docs)}")
769
- log_message("="*60)
770
-
771
- return all_docs
 
3
  import pandas as pd
4
  from huggingface_hub import hf_hub_download, list_repo_files
5
  from llama_index.core import Document
 
6
  from my_logging import log_message
7
+ from llama_index.core.text_splitter import SentenceSplitter
8
+ from config import CHUNK_SIZE, CHUNK_OVERLAP
9
+ from table_prep import table_to_document, load_table_data
10
 
 
 
 
11
 
12
+ def chunk_document(doc, chunk_size=None, chunk_overlap=None):
13
+ if chunk_size is None:
14
+ chunk_size = CHUNK_SIZE
15
+ if chunk_overlap is None:
16
+ chunk_overlap = CHUNK_OVERLAP
17
  text_splitter = SentenceSplitter(
18
+ chunk_size=chunk_size,
19
+ chunk_overlap=chunk_overlap,
20
+ separator=" "
21
  )
22
 
23
+ text_chunks = text_splitter.split_text(doc.text)
24
+
25
+ chunked_docs = []
26
+ for i, chunk_text in enumerate(text_chunks):
27
+ chunk_metadata = doc.metadata.copy()
28
+ chunk_metadata.update({
29
+ "chunk_id": i,
30
+ "total_chunks": len(text_chunks),
31
+ "chunk_size": len(chunk_text),
32
+ "original_doc_id": doc.id_ if hasattr(doc, 'id_') else None
33
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
+ chunked_doc = Document(
36
+ text=chunk_text,
37
+ metadata=chunk_metadata
38
+ )
39
+ chunked_docs.append(chunked_doc)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
+ return chunked_docs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
+ def process_documents_with_chunking(documents):
44
+ all_chunked_docs = []
45
+ chunk_info = []
46
+ table_count = 0
47
+ table_chunks_count = 0
48
+ image_count = 0
49
+ image_chunks_count = 0
50
+ text_chunks_count = 0
51
 
52
+ for doc in documents:
53
+ doc_type = doc.metadata.get('type', 'text')
54
+ is_already_chunked = doc.metadata.get('is_chunked', False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
+ if doc_type == 'table':
57
+ if is_already_chunked:
58
+ table_chunks_count += 1
59
+ all_chunked_docs.append(doc)
60
+ chunk_info.append({
61
+ 'document_id': doc.metadata.get('document_id', 'unknown'),
62
+ 'section_id': doc.metadata.get('section_id', 'unknown'),
63
+ 'chunk_id': doc.metadata.get('chunk_id', 0),
64
+ 'total_chunks': doc.metadata.get('total_chunks', 1),
65
+ 'chunk_size': len(doc.text),
66
+ 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
67
+ 'type': 'table',
68
+ 'table_number': doc.metadata.get('table_number', 'unknown')
69
+ })
70
+ else:
71
+ table_count += 1
72
+ all_chunked_docs.append(doc)
73
+ chunk_info.append({
74
+ 'document_id': doc.metadata.get('document_id', 'unknown'),
75
+ 'section_id': doc.metadata.get('section_id', 'unknown'),
76
+ 'chunk_id': 0,
77
+ 'chunk_size': len(doc.text),
78
+ 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
79
+ 'type': 'table',
80
+ 'table_number': doc.metadata.get('table_number', 'unknown')
81
+ })
82
 
83
+ elif doc_type == 'image':
84
+ image_count += 1
85
+ doc_size = len(doc.text)
86
+ if doc_size > CHUNK_SIZE:
87
+ log_message(f"📷 CHUNKING: Изображение {doc.metadata.get('image_number', 'unknown')} | "
88
+ f"Размер: {doc_size} > {CHUNK_SIZE}")
89
+ chunked_docs = chunk_document(doc)
90
+ image_chunks_count += len(chunked_docs)
91
+ all_chunked_docs.extend(chunked_docs)
92
+ log_message(f" ✂️ Разделено на {len(chunked_docs)} чанков")
93
+
94
+ for i, chunk_doc in enumerate(chunked_docs):
95
+ chunk_info.append({
96
+ 'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
97
+ 'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
98
+ 'chunk_id': i,
99
+ 'chunk_size': len(chunk_doc.text),
100
+ 'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
101
+ 'type': 'image',
102
+ 'image_number': chunk_doc.metadata.get('image_number', 'unknown')
103
+ })
104
+ else:
105
+ all_chunked_docs.append(doc)
106
+ chunk_info.append({
107
+ 'document_id': doc.metadata.get('document_id', 'unknown'),
108
+ 'section_id': doc.metadata.get('section_id', 'unknown'),
109
+ 'chunk_id': 0,
110
+ 'chunk_size': doc_size,
111
+ 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
112
+ 'type': 'image',
113
+ 'image_number': doc.metadata.get('image_number', 'unknown')
114
+ })
115
 
116
+ else:
117
+ doc_size = len(doc.text)
118
+ if doc_size > CHUNK_SIZE:
119
+ log_message(f"📝 CHUNKING: Текст из '{doc.metadata.get('document_id', 'unknown')}' | "
120
+ f"Размер: {doc_size} > {CHUNK_SIZE}")
121
+ chunked_docs = chunk_document(doc)
122
+ text_chunks_count += len(chunked_docs)
123
+ all_chunked_docs.extend(chunked_docs)
124
+ log_message(f" ✂️ Разделен на {len(chunked_docs)} чанков")
125
+
126
+ for i, chunk_doc in enumerate(chunked_docs):
127
+ chunk_info.append({
128
+ 'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
129
+ 'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
130
+ 'chunk_id': i,
131
+ 'chunk_size': len(chunk_doc.text),
132
+ 'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
133
+ 'type': 'text'
134
+ })
135
+ else:
136
+ all_chunked_docs.append(doc)
137
+ chunk_info.append({
138
+ 'document_id': doc.metadata.get('document_id', 'unknown'),
139
+ 'section_id': doc.metadata.get('section_id', 'unknown'),
140
+ 'chunk_id': 0,
141
+ 'chunk_size': doc_size,
142
+ 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
143
+ 'type': 'text'
144
+ })
145
 
146
+ log_message(f"\n{'='*60}")
147
+ log_message(f"ИТОГО ОБРАБОТАНО ДОКУМЕНТОВ:")
148
+ log_message(f" • Таблицы (целые): {table_count}")
149
+ log_message(f" • Таблицы (чанки): {table_chunks_count}")
150
+ log_message(f" • Изображения (целые): {image_count - (image_chunks_count > 0)}")
151
+ log_message(f" • Изображения (чанки): {image_chunks_count}")
152
+ log_message(f" • Текстовые чанки: {text_chunks_count}")
153
+ log_message(f" • Всего документов: {len(all_chunked_docs)}")
154
+ log_message(f"{'='*60}\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
+ return all_chunked_docs, chunk_info
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
+ def extract_text_from_json(data, document_id, document_name):
159
+ documents = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
 
161
+ if 'sections' in data:
162
+ for section in data['sections']:
163
+ section_id = section.get('section_id', 'Unknown')
164
+ section_text = section.get('section_text', '')
 
 
 
 
165
 
166
+ section_path = f"{section_id}"
167
+ section_title = extract_section_title(section_text)
168
 
169
+ if section_text.strip():
170
+ doc = Document(
171
+ text=section_text,
172
+ metadata={
173
+ "type": "text",
174
+ "document_id": document_id,
175
+ "document_name": document_name,
176
+ "section_id": section_id,
177
+ "section_text": section_title[:200],
178
+ "section_path": section_path,
179
+ "level": "section"
180
+ }
181
+ )
182
+ documents.append(doc)
183
 
184
+ if 'subsections' in section:
185
+ for subsection in section['subsections']:
186
+ subsection_id = subsection.get('subsection_id', 'Unknown')
187
+ subsection_text = subsection.get('subsection_text', '')
188
+ subsection_title = extract_section_title(subsection_text)
189
+ subsection_path = f"{section_path}.{subsection_id}"
 
 
 
190
 
191
+ if subsection_text.strip():
192
+ doc = Document(
193
+ text=subsection_text,
194
+ metadata={
195
+ "type": "text",
196
+ "document_id": document_id,
197
+ "document_name": document_name,
198
+ "section_id": subsection_id,
199
+ "section_text": subsection_title[:200],
200
+ "section_path": subsection_path,
201
+ "level": "subsection",
202
+ "parent_section": section_id,
203
+ "parent_title": section_title[:100]
204
+ }
205
+ )
206
+ documents.append(doc)
207
 
208
+ if 'sub_subsections' in subsection:
209
+ for sub_subsection in subsection['sub_subsections']:
210
+ sub_subsection_id = sub_subsection.get('sub_subsection_id', 'Unknown')
211
+ sub_subsection_text = sub_subsection.get('sub_subsection_text', '')
212
+ sub_subsection_title = extract_section_title(sub_subsection_text)
213
+ sub_subsection_path = f"{subsection_path}.{sub_subsection_id}"
214
+
215
+ if sub_subsection_text.strip():
216
+ doc = Document(
217
+ text=sub_subsection_text,
218
+ metadata={
219
+ "type": "text",
220
+ "document_id": document_id,
221
+ "document_name": document_name,
222
+ "section_id": sub_subsection_id,
223
+ "section_text": sub_subsection_title[:200],
224
+ "section_path": sub_subsection_path,
225
+ "level": "sub_subsection",
226
+ "parent_section": subsection_id,
227
+ "parent_title": subsection_title[:100]
228
+ }
229
+ )
230
+ documents.append(doc)
231
+
232
+ if 'sub_sub_subsections' in sub_subsection:
233
+ for sub_sub_subsection in sub_subsection['sub_sub_subsections']:
234
+ sub_sub_subsection_id = sub_sub_subsection.get('sub_sub_subsection_id', 'Unknown')
235
+ sub_sub_subsection_text = sub_sub_subsection.get('sub_sub_subsection_text', '')
236
+ sub_sub_subsection_title = extract_section_title(sub_sub_subsection_text)
237
+
238
+ if sub_sub_subsection_text.strip():
239
+ doc = Document(
240
+ text=sub_sub_subsection_text,
241
+ metadata={
242
+ "type": "text",
243
+ "document_id": document_id,
244
+ "document_name": document_name,
245
+ "section_id": sub_sub_subsection_id,
246
+ "section_text": sub_sub_subsection_title[:200],
247
+ "section_path": f"{sub_subsection_path}.{sub_sub_subsection_id}",
248
+ "level": "sub_sub_subsection",
249
+ "parent_section": sub_subsection_id,
250
+ "parent_title": sub_subsection_title[:100]
251
+ }
252
+ )
253
+ documents.append(doc)
254
 
255
+ return documents
 
256
 
257
+ def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
258
+ log_message("Начинаю загрузку JSON документов")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
 
260
+ try:
261
+ files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
262
+ zip_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.zip')]
263
+ json_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.json')]
264
+
265
+ log_message(f"Найдено {len(zip_files)} ZIP файлов и {len(json_files)} прямых JSON файлов")
266
+
267
+ all_documents = []
268
+
269
+ for zip_file_path in zip_files:
270
+ try:
271
+ log_message(f"Загружаю ZIP архив: {zip_file_path}")
272
+ local_zip_path = hf_hub_download(
273
+ repo_id=repo_id,
274
+ filename=zip_file_path,
275
+ local_dir=download_dir,
276
+ repo_type="dataset",
277
+ token=hf_token
278
+ )
279
+
280
+ documents = extract_zip_and_process_json(local_zip_path)
281
+ all_documents.extend(documents)
282
+ log_message(f"Извлечено {len(documents)} документов из ZIP архива {zip_file_path}")
283
+
284
+ except Exception as e:
285
+ log_message(f"Ошибка обработки ZIP файла {zip_file_path}: {str(e)}")
286
+ continue
287
+
288
+ for file_path in json_files:
289
+ try:
290
+ log_message(f"Обрабатываю прямой JSON файл: {file_path}")
291
+ local_path = hf_hub_download(
292
+ repo_id=repo_id,
293
+ filename=file_path,
294
+ local_dir=download_dir,
295
+ repo_type="dataset",
296
+ token=hf_token
297
+ )
298
+
299
+ with open(local_path, 'r', encoding='utf-8') as f:
300
+ json_data = json.load(f)
301
+
302
+ document_metadata = json_data.get('document_metadata', {})
303
+ document_id = document_metadata.get('document_id', 'unknown')
304
+ document_name = document_metadata.get('document_name', 'unknown')
305
+
306
+ documents = extract_text_from_json(json_data, document_id, document_name)
307
+ all_documents.extend(documents)
308
+
309
+ log_message(f"Извлечено {len(documents)} документов из {file_path}")
310
+
311
+ except Exception as e:
312
+ log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
313
+ continue
314
+
315
+ log_message(f"Всего создано {len(all_documents)} исходных документов из JSON файлов")
316
+
317
+ # Process documents through chunking function
318
+ chunked_documents, chunk_info = process_documents_with_chunking(all_documents)
319
+
320
+ log_message(f"После chunking получено {len(chunked_documents)} чанков из JSON данных")
321
+
322
+ return chunked_documents, chunk_info
323
+
324
+ except Exception as e:
325
+ log_message(f"Ошибка загрузки JSON документов: {str(e)}")
326
+ return [], []
327
 
328
+ def extract_section_title(section_text):
329
+ if not section_text.strip():
330
+ return ""
 
331
 
332
+ lines = section_text.strip().split('\n')
333
+ first_line = lines[0].strip()
334
 
335
+ if len(first_line) < 200 and not first_line.endswith('.'):
336
+ return first_line
 
337
 
338
+ # Otherwise, extract first sentence
339
+ sentences = first_line.split('.')
340
+ if len(sentences) > 1:
341
+ return sentences[0].strip()
342
 
343
+ return first_line[:100] + "..." if len(first_line) > 100 else first_line
344
+
345
+ def extract_zip_and_process_json(zip_path):
346
  documents = []
 
347
 
348
+ try:
349
+ with zipfile.ZipFile(zip_path, 'r') as zip_ref:
350
+ zip_files = zip_ref.namelist()
351
+ json_files = [f for f in zip_files if f.endswith('.json') and not f.startswith('__MACOSX')]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
 
353
+ log_message(f"Найдено {len(json_files)} JSON файлов в архиве")
 
 
 
 
 
 
 
 
 
 
 
 
354
 
355
+ for json_file in json_files:
356
+ try:
357
+ log_message(f"Обрабатываю файл из архива: {json_file}")
358
+
359
+ with zip_ref.open(json_file) as f:
360
+ json_data = json.load(f)
361
+
362
+ document_metadata = json_data.get('document_metadata', {})
363
+ document_id = document_metadata.get('document_id', 'unknown')
364
+ document_name = document_metadata.get('document_name', 'unknown')
365
+
366
+ docs = extract_text_from_json(json_data, document_id, document_name)
367
+ documents.extend(docs)
368
+
369
+ log_message(f"Извлечено {len(docs)} документов из {json_file}")
370
+
371
+ except Exception as e:
372
+ log_message(f"Ошибка обработки файла {json_file}: {str(e)}")
373
+ continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
374
 
375
+ except Exception as e:
376
+ log_message(f"Ошибка извлечения ZIP архива {zip_path}: {str(e)}")
 
 
 
 
 
377
 
378
  return documents
379
 
380
+ def load_image_data(repo_id, hf_token, image_data_dir):
381
+ log_message("Начинаю загрузку данных изображений")
 
382
 
383
+ image_files = []
384
  try:
385
+ files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
386
+ for file in files:
387
+ if file.startswith(image_data_dir) and file.endswith('.csv'):
388
+ image_files.append(file)
389
 
390
+ log_message(f"Найдено {len(image_files)} CSV файлов с изображениями")
391
 
392
+ image_documents = []
393
+ for file_path in image_files:
394
+ try:
395
+ log_message(f"Обрабатываю файл изображений: {file_path}")
396
+ local_path = hf_hub_download(
397
+ repo_id=repo_id,
398
+ filename=file_path,
399
+ local_dir='',
400
+ repo_type="dataset",
401
+ token=hf_token
402
+ )
403
+
404
+ df = pd.read_csv(local_path)
405
+ log_message(f"Загружено {len(df)} записей изображений из файла {file_path}")
406
+
407
+ # Обработка с правильными названиями колонок
408
+ for _, row in df.iterrows():
409
+ section_value = row.get('Раздел документа', 'Неизвестно')
410
+
411
+ content = f"Изображение: {row.get('№ Изображения', 'Неизвестно')}\n"
412
+ content += f"Название: {row.get('Название изображения', 'Неизвестно')}\n"
413
+ content += f"Описание: {row.get('Описание изображение', 'Неизвестно')}\n" # Опечатка в названии колонки
414
+ content += f"Документ: {row.get('Обозначение документа', 'Неизвестно')}\n"
415
+ content += f"Раздел: {section_value}\n"
416
+ content += f"Файл: {row.get('Файл изображения', 'Неизвестно')}\n"
417
+
418
+ doc = Document(
419
+ text=content,
420
  metadata={
421
+ "type": "image",
422
+ "image_number": str(row.get('№ Изображения', 'unknown')),
423
+ "image_title": str(row.get('Название изображения', 'unknown')),
424
+ "image_description": str(row.get('Описание изображение', 'unknown')),
425
+ "document_id": str(row.get('Обозначение документа', 'unknown')),
426
+ "file_path": str(row.get('Файл изображения', 'unknown')),
427
+ "section": str(section_value),
428
+ "section_id": str(section_value)
429
  }
430
+ )
431
+ image_documents.append(doc)
432
+
433
+ except Exception as e:
434
+ log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
435
+ continue
436
+
437
+ log_message(f"Создано {len(image_documents)} документов из изображений")
438
+ return image_documents
439
+
 
 
 
 
440
  except Exception as e:
441
+ log_message(f"Ошибка загрузки данных изображений: {str(e)}")
442
+ return []
 
443
 
444
 
445
+ def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
446
+ log_message("Загружаю данные чанков из CSV")
 
 
 
 
447
 
448
+ try:
449
+ chunks_csv_path = hf_hub_download(
450
+ repo_id=repo_id,
451
+ filename=chunks_filename,
452
+ local_dir=download_dir,
453
+ repo_type="dataset",
454
+ token=hf_token
455
+ )
456
+
457
+ chunks_df = pd.read_csv(chunks_csv_path)
458
+ log_message(f"Загружено {len(chunks_df)} чанков из CSV")
459
+
460
+ text_column = None
461
+ for col in chunks_df.columns:
462
+ if 'text' in col.lower() or 'content' in col.lower() or 'chunk' in col.lower():
463
+ text_column = col
464
+ break
465
+
466
+ if text_column is None:
467
+ text_column = chunks_df.columns[0]
468
+
469
+ log_message(f"Использую колонку: {text_column}")
470
+
471
+ documents = []
472
+ for i, (_, row) in enumerate(chunks_df.iterrows()):
473
+ doc = Document(
474
+ text=str(row[text_column]),
475
+ metadata={
476
+ "chunk_id": row.get('chunk_id', i),
477
+ "document_id": row.get('document_id', 'unknown'),
478
+ "type": "text"
479
+ }
480
  )
481
+ documents.append(doc)
482
+
483
+ log_message(f"Создано {len(documents)} текстовых документов из CSV")
484
+ return documents, chunks_df
485
+
486
+ except Exception as e:
487
+ log_message(f"Ошибка загрузки CSV данных: {str(e)}")
488
+ return [], None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
index_retriever.py CHANGED
@@ -1,178 +1,77 @@
1
- from llama_index.core import VectorStoreIndex
2
  from llama_index.core.query_engine import RetrieverQueryEngine
3
  from llama_index.core.retrievers import VectorIndexRetriever
 
 
4
  from llama_index.retrievers.bm25 import BM25Retriever
5
  from llama_index.core.retrievers import QueryFusionRetriever
6
- from llama_index.core.response_synthesizers import get_response_synthesizer
7
  from my_logging import log_message
8
-
9
- import re
10
-
11
- import re
12
- from difflib import SequenceMatcher
13
-
14
 
15
  def create_vector_index(documents):
16
- """Create vector index from documents"""
17
- log_message(f"Building vector index from {len(documents)} documents...")
18
- index = VectorStoreIndex.from_documents(documents)
19
- log_message("✓ Index created")
20
- return index
21
-
22
- def keyword_filter_nodes(query, nodes, min_keyword_matches=1):
23
- """Return nodes that contain at least one keyword from the query."""
24
- keywords = [w.lower() for w in query.split() if len(w) > 2]
25
- filtered = []
26
- for node in nodes:
27
- text = node.text.lower()
28
- if any(k in text for k in keywords):
29
- filtered.append(node)
30
- return filtered
31
-
32
-
33
- def normalize_doc_id(doc_id: str) -> str:
34
- """Normalize document ID for consistent comparison."""
35
- doc_id = doc_id.upper().strip()
36
- doc_id = re.sub(r'[^\w\d\.]+', '', doc_id) # remove spaces, dashes, etc.
37
- doc_id = doc_id.replace("ГОСТР", "ГОСТ")
38
- doc_id = doc_id.replace("GOSTR", "ГОСТ")
39
- return doc_id
40
-
41
- def base_number(doc_id: str) -> str:
42
- """Extract base numeric pattern (e.g., '59023.4' from 'ГОСТ Р 59023.4-2020')."""
43
- m = re.search(r'(\d+(?:\.\d+)+)', doc_id)
44
- return m.group(1) if m else ""
45
-
46
- def filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.5):
47
- """Filter nodes by normalized document ID with fallback to fuzzy numeric match."""
48
- if not doc_ids:
49
- return nodes
50
-
51
- filtered = []
52
- doc_ids_norm = [normalize_doc_id(d) for d in doc_ids]
53
- doc_ids_base = [base_number(d) for d in doc_ids_norm]
54
-
55
- for node in nodes:
56
- node_doc_id = normalize_doc_id(node.metadata.get('document_id', ''))
57
- node_base = base_number(node_doc_id)
58
-
59
- for q_doc, q_base in zip(doc_ids_norm, doc_ids_base):
60
- # Strong match: same base number (e.g., 59023.4)
61
- if q_base and node_base and q_base == node_base:
62
- filtered.append(node)
63
- break
64
-
65
- # Medium match: similarity ratio > threshold
66
- if SequenceMatcher(None, node_doc_id, q_doc).ratio() >= threshold:
67
- filtered.append(node)
68
- break
69
-
70
- # Weak fallback: contains or partial substring
71
- if q_base in node_doc_id or q_doc in node_doc_id:
72
- filtered.append(node)
73
- break
74
 
75
- return filtered if filtered else nodes # Fallback: keep all if none matched
76
-
77
-
78
- def extract_doc_id_from_query(query):
79
- """Extract document IDs from query text with better pattern matching"""
80
- patterns = [
81
- r'ГОСТ\s*Р?\s*\d+(?:\.\d+)*(?:-\d{4})?', # ГОСТ 59023.4, ГОСТ Р 50.05.01-2018
82
- r'НП-\d+(?:-\d+)?', # НП-104-18
83
- r'МУ[_\s]\d+(?:\.\d+)+(?:\.\d+)*(?:-\d{4})?', # МУ 1.2.3.07.0057-2018
84
- ]
85
-
86
- found_ids = []
87
- for pattern in patterns:
88
- matches = re.findall(pattern, query, re.IGNORECASE)
89
- found_ids.extend(matches)
90
-
91
- # Normalize spacing and preserve dots
92
- normalized = [re.sub(r'\s+', ' ', id.strip().upper()) for id in found_ids]
93
- return normalized
94
- def russian_tokenizer(text):
95
- """Better tokenizer for Russian document IDs and technical terms"""
96
- import re
97
 
98
- # Keep document ID patterns intact
99
- text = re.sub(r'(ГОСТ\s*Р?\s*[\d\.]+(?:-\d{4})?)', r' \1 ', text)
100
- text = re.sub(r'(НП-\d+(?:-\d+)?)', r' \1 ', text)
101
- text = re.sub(r'(МУ[_\s][\d\.]+)', r' \1 ', text)
102
-
103
- # Split on whitespace and punctuation, but keep numbers with decimals
104
- tokens = re.findall(r'\d+\.\d+|\w+', text.lower())
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
- return tokens
107
-
108
 
109
  def create_query_engine(vector_index):
110
- """Create hybrid retrieval engine with document ID filtering"""
111
- log_message("Creating query engine...")
112
-
113
- vector_retriever = VectorIndexRetriever(
114
- index=vector_index,
115
- similarity_top_k=150
116
- )
117
- bm25_retriever = BM25Retriever.from_defaults(
118
- docstore=vector_index.docstore,
119
- similarity_top_k=150,
120
- tokenizer=russian_tokenizer # Add custom tokenizer
121
-
122
- )
123
- hybrid_retriever = QueryFusionRetriever(
124
- [vector_retriever, bm25_retriever],
125
- similarity_top_k=80,
126
- num_queries=1
127
- )
128
-
129
- class DeduplicatedQueryEngine(RetrieverQueryEngine):
130
- def retrieve(self, query):
131
- nodes = hybrid_retriever.retrieve(query)
132
- log_message(f"Hybrid retrieval returned: {len(nodes)} nodes")
133
-
134
- # Extract document IDs from query
135
- doc_ids = extract_doc_id_from_query(query)
136
- if doc_ids:
137
- log_message(f"Detected document IDs in query: {doc_ids}")
138
- before = len(nodes)
139
- nodes = filter_nodes_by_doc_id(nodes, doc_ids)
140
- after = len(nodes)
141
- log_message(f"Filtered by doc ID: {after}/{before} nodes kept (fallback safe)")
142
-
143
-
144
- # Deduplication
145
- seen_hashes = set()
146
- unique_nodes = []
147
- doc_type_counts = {'text': 0, 'table': 0, 'image': 0}
148
-
149
- for node in nodes:
150
- text_hash = hash(node.text[:500])
151
-
152
- if text_hash not in seen_hashes:
153
- seen_hashes.add(text_hash)
154
- unique_nodes.append(node)
155
-
156
- node_type = node.metadata.get('type', 'text')
157
- doc_type_counts[node_type] = doc_type_counts.get(node_type, 0) + 1
158
-
159
- log_message(f"After dedup: {len(unique_nodes)} unique nodes")
160
- log_message(f"Types: text={doc_type_counts.get('text', 0)}, "
161
- f"table={doc_type_counts.get('table', 0)}, "
162
- f"image={doc_type_counts.get('image', 0)}")
163
-
164
- # Log which documents we're returning
165
- returned_docs = set(n.metadata.get('document_id', 'unknown') for n in unique_nodes[:50])
166
- log_message(f"Returning nodes from: {sorted(returned_docs)}")
167
-
168
- return unique_nodes[:50]
169
-
170
- response_synthesizer = get_response_synthesizer()
171
-
172
- query_engine = DeduplicatedQueryEngine(
173
- retriever=hybrid_retriever,
174
- response_synthesizer=response_synthesizer
175
- )
176
-
177
- log_message("✓ Query engine created with doc ID filtering")
178
- return query_engine
 
1
+ from llama_index.core import VectorStoreIndex, Settings
2
  from llama_index.core.query_engine import RetrieverQueryEngine
3
  from llama_index.core.retrievers import VectorIndexRetriever
4
+ from llama_index.core.response_synthesizers import get_response_synthesizer, ResponseMode
5
+ from llama_index.core.prompts import PromptTemplate
6
  from llama_index.retrievers.bm25 import BM25Retriever
7
  from llama_index.core.retrievers import QueryFusionRetriever
 
8
  from my_logging import log_message
9
+ from config import CUSTOM_PROMPT, PROMPT_SIMPLE_POISK
 
 
 
 
 
10
 
11
  def create_vector_index(documents):
12
+ log_message("Строю векторный индекс")
13
+ return VectorStoreIndex.from_documents(documents)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
+ def deduplicate_nodes(nodes):
16
+ """Deduplicate retrieved nodes based on unique identifiers"""
17
+ seen = set()
18
+ unique_nodes = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
+ for node in nodes:
21
+ # Create unique identifier from metadata
22
+ doc_id = node.metadata.get('document_id', '')
23
+ section_id = node.metadata.get('section_id', '')
24
+ chunk_id = node.metadata.get('chunk_id', 0)
25
+ node_type = node.metadata.get('type', 'text')
26
+
27
+ if node_type == 'table':
28
+ table_num = node.metadata.get('table_number', '')
29
+ identifier = f"{doc_id}|table|{table_num}|{chunk_id}"
30
+ elif node_type == 'image':
31
+ img_num = node.metadata.get('image_number', '')
32
+ identifier = f"{doc_id}|image|{img_num}"
33
+ else:
34
+ identifier = f"{doc_id}|{section_id}|{chunk_id}"
35
+
36
+ if identifier not in seen:
37
+ seen.add(identifier)
38
+ unique_nodes.append(node)
39
 
40
+ return unique_nodes
 
41
 
42
  def create_query_engine(vector_index):
43
+ try:
44
+ bm25_retriever = BM25Retriever.from_defaults(
45
+ docstore=vector_index.docstore,
46
+ similarity_top_k=20
47
+ )
48
+
49
+ vector_retriever = VectorIndexRetriever(
50
+ index=vector_index,
51
+ similarity_top_k=30,
52
+ similarity_cutoff=0.65
53
+ )
54
+
55
+ hybrid_retriever = QueryFusionRetriever(
56
+ [vector_retriever, bm25_retriever],
57
+ similarity_top_k=40,
58
+ num_queries=1
59
+ )
60
+
61
+ custom_prompt_template = PromptTemplate(PROMPT_SIMPLE_POISK)
62
+ response_synthesizer = get_response_synthesizer(
63
+ response_mode=ResponseMode.TREE_SUMMARIZE,
64
+ text_qa_template=custom_prompt_template
65
+ )
66
+
67
+ query_engine = RetrieverQueryEngine(
68
+ retriever=hybrid_retriever,
69
+ response_synthesizer=response_synthesizer
70
+ )
71
+
72
+ log_message("Query engine успешно создан")
73
+ return query_engine
74
+
75
+ except Exception as e:
76
+ log_message(f"Ошибка создания query engine: {str(e)}")
77
+ raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
table_prep.py CHANGED
@@ -1,142 +1,163 @@
1
- from llama_index.core.text_splitter import SentenceSplitter
 
 
2
  from llama_index.core import Document
3
- from config import CHUNK_SIZE, CHUNK_OVERLAP
4
  from my_logging import log_message
5
 
6
- def normalize_table_number(table_num, section):
7
- """Normalize table numbers for consistent retrieval"""
8
- if not table_num or table_num == 'Неизвестно':
9
- return 'Неизвестно'
10
-
11
- # Clean up common prefixes
12
- tn = str(table_num).replace('Таблица', '').replace('№', '').strip()
13
-
14
- # Add section context for appendix tables
15
- if section and ('Приложение' in str(section) or 'приложение' in str(section).lower()):
16
- return f"№{tn} ({section})"
17
-
18
- return f"№{tn}"
19
-
20
  def create_table_content(table_data):
21
- """Create formatted content optimized for semantic search"""
22
- doc_id = (
23
- table_data.get('document_id') or
24
- table_data.get('document') or
25
- table_data.get('Обозначение документа') or
26
- 'Неизвестно'
27
- )
28
  table_num = table_data.get('table_number', 'Неизвестно')
29
  table_title = table_data.get('table_title', 'Неизвестно')
30
- section = (
31
- table_data.get('section') or
32
- table_data.get('Раздел документа') or
33
- 'Неизвестно'
34
- )
35
- sheet_name = table_data.get('sheet_name', '')
36
-
37
- # Enhanced table number with appendix context
38
- normalized_num = normalize_table_number(table_num, section)
39
- if 'Приложени' in str(section):
40
- # Extract appendix number
41
- import re
42
- appendix_match = re.search(r'Приложени[ея]\s*(\d+)', str(section))
43
- if appendix_match:
44
- appendix_num = appendix_match.group(1)
45
- normalized_num = f"{normalized_num} Приложения {appendix_num}"
46
 
47
- # Build searchable header
48
- content = f"Документ: {doc_id}\n"
49
- content += f"Раздел: {section}\n"
50
- content += f"Таблица: {normalized_num}\n"
51
  content += f"Название: {table_title}\n"
52
- if sheet_name:
53
- content += f"Лист: {sheet_name}\n"
54
- content += f"\n"
55
 
56
  headers = table_data.get('headers', [])
57
  if headers:
58
- header_str = ' | '.join(str(h) for h in headers)
59
- content += f"Колонки: {header_str}\n\n"
60
 
61
- # CRITICAL: Preserve searchable row identifiers
62
  if 'data' in table_data and isinstance(table_data['data'], list):
 
63
  for row_idx, row in enumerate(table_data['data'], start=1):
64
  if isinstance(row, dict):
65
- # Extract ALL key-value pairs naturally
66
- row_parts = []
67
- for k, v in row.items():
68
- if v and str(v).strip() and str(v) != 'nan':
69
- row_parts.append(f"{k}: {v}")
70
-
71
- if row_parts:
72
- content += ' | '.join(row_parts) + "\n"
73
- elif isinstance(row, list):
74
- row_str = ' | '.join([str(v) for v in row if v and str(v).strip() and str(v) != 'nan'])
75
- if row_str:
76
- content += row_str + "\n"
77
 
78
- return content, normalized_num
 
 
 
 
 
 
 
79
 
 
 
 
 
 
80
 
81
- def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
 
 
 
 
 
 
82
  if chunk_size is None:
83
  chunk_size = CHUNK_SIZE
84
  if chunk_overlap is None:
85
  chunk_overlap = CHUNK_OVERLAP
86
 
 
 
87
  table_num = doc.metadata.get('table_number', 'unknown')
 
88
  doc_id = doc.metadata.get('document_id', 'unknown')
89
- section = doc.metadata.get('section', 'Неизвестно')
90
-
91
- full_table_id = f"{doc_id} | {section} | {table_num}"
92
 
 
93
  lines = doc.text.strip().split('\n')
94
 
95
- # Find where data rows start
96
- data_start_idx = 0
97
- for i, line in enumerate(lines):
98
- if line.startswith('Колонки:'):
99
- data_start_idx = i + 2 # Skip header and blank line
100
- break
101
-
102
- table_header = '\n'.join(lines[:data_start_idx])
103
- data_rows = lines[data_start_idx:]
104
-
105
- if not data_rows or len(doc.text) < chunk_size * 1.5:
106
- log_message(f" 📊 {full_table_id}: малая таблица, без разбиения")
107
- return [doc]
108
-
109
- log_message(f" 📋 {full_table_id}: {len(data_rows)} строк → chunking")
110
-
111
- header_size = len(table_header)
112
- available_size = chunk_size - header_size - 100
113
-
114
- text_chunks = []
115
- current_chunk_rows = []
116
- current_size = 0
117
-
118
- for row in data_rows:
119
- row_size = len(row) + 1
 
120
 
121
- if current_size + row_size > available_size and current_chunk_rows:
122
- chunk_text = table_header + '\n' + '\n'.join(current_chunk_rows)
123
- text_chunks.append(chunk_text)
 
 
 
 
 
 
124
 
125
- # Keep last 2 rows for overlap
126
- overlap_count = min(2, len(current_chunk_rows))
127
- current_chunk_rows = current_chunk_rows[-overlap_count:]
128
- current_size = sum(len(r) + 1 for r in current_chunk_rows)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
- current_chunk_rows.append(row)
131
- current_size += row_size
132
-
133
- if current_chunk_rows:
134
- chunk_text = table_header + '\n' + '\n'.join(current_chunk_rows)
135
- text_chunks.append(chunk_text)
136
 
137
- log_message(f" ✂️ {full_table_id} {len(text_chunks)} чанков")
138
 
 
139
  chunked_docs = []
 
 
 
140
  for i, chunk_text in enumerate(text_chunks):
141
  chunk_metadata = doc.metadata.copy()
142
  chunk_metadata.update({
@@ -144,12 +165,22 @@ def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
144
  "total_chunks": len(text_chunks),
145
  "chunk_size": len(chunk_text),
146
  "is_chunked": True,
147
- "full_table_id": full_table_id,
148
- "table_number_normalized": doc.metadata.get('table_number_normalized')
 
149
  })
150
 
 
 
 
 
 
 
 
 
 
151
  chunked_doc = Document(
152
- text=chunk_text,
153
  metadata=chunk_metadata
154
  )
155
  chunked_docs.append(chunked_doc)
@@ -158,102 +189,138 @@ def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
158
 
159
 
160
  def table_to_document(table_data, document_id=None):
161
- """Convert table data to Document with complete metadata"""
162
  if not isinstance(table_data, dict):
 
163
  return []
164
 
165
- sheet_doc_id = (
166
- table_data.get('document_id') or
167
- table_data.get('document') or
168
- table_data.get('Обозначение документа')
169
- )
170
-
171
- doc_id = sheet_doc_id or document_id or 'Неизвестно'
172
-
173
  table_num = table_data.get('table_number', 'Неизвестно')
174
  table_title = table_data.get('table_title', 'Неизвестно')
175
- section = table_data.get('section', table_data.get('Раздел документа', 'Неизвестно'))
176
- sheet_name = table_data.get('sheet_name', '')
177
 
178
  table_rows = table_data.get('data', [])
179
- if not table_rows:
180
- log_message(f"⚠️ Таблица {table_num} ({doc_id}) пропущена: нет данных")
181
  return []
182
 
183
- content, normalized_num = create_table_content(table_data)
184
  content_size = len(content)
 
185
 
186
  base_doc = Document(
187
  text=content,
188
  metadata={
189
  "type": "table",
190
  "table_number": table_num,
191
- "table_number_normalized": normalized_num,
192
  "table_title": table_title,
193
  "document_id": doc_id,
194
  "section": section,
195
  "section_id": section,
196
- "sheet_name": sheet_name,
197
- "total_rows": len(table_rows),
198
- "content_size": content_size,
199
- "full_table_id": f"{doc_id} | {section} | {normalized_num}"
200
  }
201
  )
202
 
203
  if content_size > CHUNK_SIZE:
204
- log_message(f"📊 CHUNKING: {doc_id} | {normalized_num} | {content_size} > {CHUNK_SIZE}")
205
- return chunk_table_document(base_doc)
 
 
 
 
 
206
  else:
207
- log_message(f"✓ {doc_id} | {normalized_num} ({content_size} символов)")
 
208
  return [base_doc]
209
 
210
 
211
- def table_to_document(table_data, document_id=None):
212
- """Convert table data to Document with proper metadata"""
213
- if not isinstance(table_data, dict):
214
- return []
215
-
216
- # FIXED: Extract sheet-level document_id first
217
- sheet_doc_id = (
218
- table_data.get('document_id') or
219
- table_data.get('document') or
220
- table_data.get('Обозначение документа')
221
- )
222
 
223
- # Use sheet doc_id if available, otherwise use passed document_id
224
- doc_id = sheet_doc_id or document_id or 'Неизвестно'
225
-
226
- table_num = table_data.get('table_number', 'Неизвестно')
227
- table_title = table_data.get('table_title', 'Неизвестно')
228
- section = table_data.get('section', table_data.get('Раздел документа', 'Неизвестно'))
229
-
230
- table_rows = table_data.get('data', [])
231
- if not table_rows:
232
- log_message(f"⚠️ Таблица {table_num} ({doc_id}) пропущена: нет данных")
233
- return []
234
-
235
- content, normalized_num = create_table_content(table_data)
236
- content_size = len(content)
237
-
238
- base_doc = Document(
239
- text=content,
240
- metadata={
241
- "type": "table",
242
- "table_number": table_num,
243
- "table_number_normalized": normalized_num,
244
- "table_title": table_title,
245
- "document_id": doc_id,
246
- "section": section,
247
- "section_id": section,
248
- "total_rows": len(table_rows),
249
- "content_size": content_size,
250
- "full_table_id": f"{doc_id} | {section} | {normalized_num}"
251
  }
252
- )
253
-
254
- if content_size > CHUNK_SIZE:
255
- log_message(f"📊 CHUNKING: {doc_id} | {normalized_num} | {content_size} > {CHUNK_SIZE}")
256
- return chunk_table_document(base_doc)
257
- else:
258
- log_message(f"✓ {doc_id} | {normalized_num} ({content_size} символов)")
259
- return [base_doc]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+ import json
3
+ from huggingface_hub import hf_hub_download, list_repo_files
4
  from llama_index.core import Document
 
5
  from my_logging import log_message
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  def create_table_content(table_data):
8
+ """Create formatted content from table data"""
9
+ doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
 
 
 
 
 
10
  table_num = table_data.get('table_number', 'Неизвестно')
11
  table_title = table_data.get('table_title', 'Неизвестно')
12
+ section = table_data.get('section', 'Неизвестно')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
+ content = f"Таблица: {table_num}\n"
 
 
 
15
  content += f"Название: {table_title}\n"
16
+ content += f"Документ: {doc_id}\n"
17
+ content += f"Раздел: {section}\n"
 
18
 
19
  headers = table_data.get('headers', [])
20
  if headers:
21
+ content += f"\nЗаголовки: {' | '.join(headers)}\n"
 
22
 
 
23
  if 'data' in table_data and isinstance(table_data['data'], list):
24
+ content += "\nДанные таблицы:\n"
25
  for row_idx, row in enumerate(table_data['data'], start=1):
26
  if isinstance(row, dict):
27
+ row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
28
+ content += f"Строка {row_idx}: {row_text}\n"
 
 
 
 
 
 
 
 
 
 
29
 
30
+ return content
31
+
32
+ from llama_index.core.text_splitter import SentenceSplitter
33
+ from config import CHUNK_SIZE, CHUNK_OVERLAP
34
+
35
+ def extract_table_metadata(table_text: str) -> dict:
36
+ words = table_text.split()
37
+ unique_words = set(words)
38
 
39
+ from collections import Counter
40
+ stopwords = {"и", "в", "на", "по", "с", "для", "из", "при", "а", "как", "или", "но", "к", "от"}
41
+ filtered = [w for w in words if len(w) > 3 and w.lower() not in stopwords]
42
+ common = Counter(filtered).most_common(15)
43
+ key_terms = [w for w, _ in common]
44
 
45
+ return {
46
+ "summary": f"Таблица содержит около {len(words)} слов и {len(unique_words)} уникальных терминов.",
47
+ "materials": [], # if you want to extract material names, hook in regex or LLM here
48
+ "key_terms": key_terms
49
+ }
50
+
51
+ def chunk_table_document(doc, chunk_size=None, chunk_overlap=None, rows_per_chunk=4):
52
  if chunk_size is None:
53
  chunk_size = CHUNK_SIZE
54
  if chunk_overlap is None:
55
  chunk_overlap = CHUNK_OVERLAP
56
 
57
+ # Extract critical metadata from table before chunking
58
+ table_metadata = extract_table_metadata(doc.text)
59
  table_num = doc.metadata.get('table_number', 'unknown')
60
+ table_title = doc.metadata.get('table_title', 'unknown')
61
  doc_id = doc.metadata.get('document_id', 'unknown')
62
+ section = doc.metadata.get('section', 'unknown')
 
 
63
 
64
+ # Parse table structure
65
  lines = doc.text.strip().split('\n')
66
 
67
+ table_header_lines = []
68
+ data_rows = []
69
+ in_data = False
70
+
71
+ for line in lines:
72
+ if line.startswith('Данные таблицы:'):
73
+ in_data = True
74
+ table_header_lines.append(line)
75
+ elif in_data and line.startswith('Строка'):
76
+ data_rows.append(line)
77
+ elif not in_data:
78
+ table_header_lines.append(line)
79
+
80
+ table_header = '\n'.join(table_header_lines) + '\n'
81
+
82
+ if not data_rows:
83
+ log_message(f" ⚠️ Таблица {table_num}: нет строк данных, использую стандартное разбиение")
84
+ text_splitter = SentenceSplitter(
85
+ chunk_size=chunk_size,
86
+ chunk_overlap=chunk_overlap,
87
+ separator="\n"
88
+ )
89
+ text_chunks = text_splitter.split_text(doc.text)
90
+ log_message(f" 📊 Стандартное разбиение: {len(text_chunks)} чанков")
91
+ else:
92
+ log_message(f" 📋 Таблица {table_num}: найдено {len(data_rows)} строк данных")
93
 
94
+ header_size = len(table_header)
95
+ available_size = chunk_size - header_size - 300 # Reserve for enrichment
96
+
97
+ text_chunks = []
98
+ current_chunk_rows = []
99
+ current_size = 0
100
+
101
+ for row in data_rows:
102
+ row_size = len(row) + 1
103
 
104
+ # If single row exceeds available size, split it
105
+ if row_size > available_size:
106
+ log_message(f" ⚠️ Строка слишком длинная ({row_size} символов), разбиваем внутри строки")
107
+
108
+ # Flush current chunk if exists
109
+ if current_chunk_rows:
110
+ chunk_text = table_header + '\n'.join(current_chunk_rows)
111
+ text_chunks.append(chunk_text)
112
+ log_message(f" ✂️ Чанк создан: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
113
+ current_chunk_rows = []
114
+ current_size = 0
115
+
116
+ # Split the oversized row
117
+ text_splitter = SentenceSplitter(
118
+ chunk_size=available_size,
119
+ chunk_overlap=100,
120
+ separator=" | "
121
+ )
122
+ row_parts = text_splitter.split_text(row)
123
+ log_message(f" Строка разделена на {len(row_parts)} частей")
124
+
125
+ for part in row_parts:
126
+ chunk_text = table_header + part
127
+ text_chunks.append(chunk_text)
128
+ log_message(f" Под-чанк создан: {len(chunk_text)} символов")
129
+
130
+ continue
131
+
132
+ # Check if adding row would exceed rows_per_chunk OR size limit
133
+ if (len(current_chunk_rows) >= rows_per_chunk or
134
+ (current_size + row_size > available_size)) and current_chunk_rows:
135
+
136
+ chunk_text = table_header + '\n'.join(current_chunk_rows)
137
+ text_chunks.append(chunk_text)
138
+ log_message(f" ✂️ Чанк создан: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
139
+
140
+ # Overlap: keep last 1 row
141
+ overlap_count = min(1, len(current_chunk_rows))
142
+ current_chunk_rows = current_chunk_rows[-overlap_count:]
143
+ current_size = sum(len(r) + 1 for r in current_chunk_rows)
144
+
145
+ current_chunk_rows.append(row)
146
+ current_size += row_size
147
 
148
+ # Final chunk
149
+ if current_chunk_rows:
150
+ chunk_text = table_header + '\n'.join(current_chunk_rows)
151
+ text_chunks.append(chunk_text)
152
+ log_message(f" ✂️ Последний чанк: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
 
153
 
154
+ log_message(f" 📊 Таблица {table_num} разделена на {len(text_chunks)} чанков")
155
 
156
+ # Create enriched chunks (rest of the function remains the same)
157
  chunked_docs = []
158
+ materials = table_metadata.get("materials", [])
159
+ key_terms = table_metadata.get("key_terms", [])
160
+
161
  for i, chunk_text in enumerate(text_chunks):
162
  chunk_metadata = doc.metadata.copy()
163
  chunk_metadata.update({
 
165
  "total_chunks": len(text_chunks),
166
  "chunk_size": len(chunk_text),
167
  "is_chunked": True,
168
+ "materials": materials,
169
+ "key_terms": key_terms,
170
+ "table_summary": table_metadata.get("summary", "")
171
  })
172
 
173
+ materials_str = ', '.join(materials[:10]) if materials else 'нет'
174
+ terms_str = ', '.join(key_terms[:10]) if key_terms else 'нет'
175
+
176
+ enriched_text = f"""[Таблица {table_num}: {table_title}]
177
+ [Материалы в таблице: {materials_str}]
178
+ [Ключевые термины: {terms_str}]
179
+
180
+ {chunk_text}"""
181
+
182
  chunked_doc = Document(
183
+ text=enriched_text,
184
  metadata=chunk_metadata
185
  )
186
  chunked_docs.append(chunked_doc)
 
189
 
190
 
191
  def table_to_document(table_data, document_id=None):
 
192
  if not isinstance(table_data, dict):
193
+ log_message(f"⚠️ ПРОПУЩЕНА: table_data не является словарем")
194
  return []
195
 
196
+ doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
 
 
 
 
 
 
 
197
  table_num = table_data.get('table_number', 'Неизвестно')
198
  table_title = table_data.get('table_title', 'Неизвестно')
199
+ section = table_data.get('section', 'Неизвестно')
 
200
 
201
  table_rows = table_data.get('data', [])
202
+ if not table_rows or len(table_rows) == 0:
203
+ log_message(f"⚠️ ПРОПУЩЕНА: Таблица {table_num} из '{doc_id}' - нет данных в 'data'")
204
  return []
205
 
206
+ content = create_table_content(table_data)
207
  content_size = len(content)
208
+ row_count = len(table_rows)
209
 
210
  base_doc = Document(
211
  text=content,
212
  metadata={
213
  "type": "table",
214
  "table_number": table_num,
 
215
  "table_title": table_title,
216
  "document_id": doc_id,
217
  "section": section,
218
  "section_id": section,
219
+ "total_rows": row_count,
220
+ "content_size": content_size
 
 
221
  }
222
  )
223
 
224
  if content_size > CHUNK_SIZE:
225
+ log_message(f"📊 CHUNKING: Таблица {table_num} из '{doc_id}' | "
226
+ f"Размер: {content_size} > {CHUNK_SIZE} | Строк: {row_count}")
227
+ chunked_docs = chunk_table_document(base_doc)
228
+ log_message(f" ✂️ Разделена на {len(chunked_docs)} чанков")
229
+ for i, chunk_doc in enumerate(chunked_docs):
230
+ log_message(f" Чанк {i+1}: {chunk_doc.metadata['chunk_size']} символов")
231
+ return chunked_docs
232
  else:
233
+ log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
234
+ f"Размер: {content_size} символов | Строк: {row_count}")
235
  return [base_doc]
236
 
237
 
238
+ def load_table_data(repo_id, hf_token, table_data_dir):
239
+ log_message("=" * 60)
240
+ log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
241
+ log_message("=" * 60)
 
 
 
 
 
 
 
242
 
243
+ try:
244
+ files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
245
+ table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
246
+
247
+ log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
248
+
249
+ table_documents = []
250
+ stats = {
251
+ 'total_tables': 0,
252
+ 'total_size': 0,
253
+ 'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
  }
255
+
256
+ for file_path in table_files:
257
+ try:
258
+ local_path = hf_hub_download(
259
+ repo_id=repo_id,
260
+ filename=file_path,
261
+ local_dir='',
262
+ repo_type="dataset",
263
+ token=hf_token
264
+ )
265
+
266
+ log_message(f"\nОбработка файла: {file_path}")
267
+
268
+ with open(local_path, 'r', encoding='utf-8') as f:
269
+ table_data = json.load(f)
270
+
271
+ if isinstance(table_data, dict):
272
+ document_id = table_data.get('document', 'unknown')
273
+
274
+ if 'sheets' in table_data:
275
+ sorted_sheets = sorted(
276
+ table_data['sheets'],
277
+ key=lambda sheet: sheet.get('table_number', '') # or use 'table_number'
278
+ )
279
+
280
+ for sheet in sorted_sheets:
281
+ sheet['document'] = document_id
282
+ docs_list = table_to_document(sheet, document_id)
283
+ table_documents.extend(docs_list)
284
+
285
+ for doc in docs_list:
286
+ stats['total_tables'] += 1
287
+ size = doc.metadata.get('content_size', 0)
288
+ stats['total_size'] += size
289
+ stats['by_document'][document_id]['count'] += 1
290
+ stats['by_document'][document_id]['size'] += size
291
+ else:
292
+ docs_list = table_to_document(table_data, document_id)
293
+ table_documents.extend(docs_list)
294
+
295
+ for doc in docs_list:
296
+ stats['total_tables'] += 1
297
+ size = doc.metadata.get('content_size', 0)
298
+ stats['total_size'] += size
299
+ stats['by_document'][document_id]['count'] += 1
300
+ stats['by_document'][document_id]['size'] += size
301
+
302
+
303
+ except Exception as e:
304
+ log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
305
+ continue
306
+
307
+ # Log summary statistics
308
+ log_message("\n" + "=" * 60)
309
+ log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
310
+ log_message("=" * 60)
311
+ log_message(f"Всего таблиц добавлено: {stats['total_tables']}")
312
+ log_message(f"Общий размер: {stats['total_size']:,} символов")
313
+ log_message(f"Средний размер таблицы: {stats['total_size'] // stats['total_tables'] if stats['total_tables'] > 0 else 0:,} символов")
314
+
315
+ log_message("\nПо документам:")
316
+ for doc_id, doc_stats in sorted(stats['by_document'].items()):
317
+ log_message(f" • {doc_id}: {doc_stats['count']} таблиц, "
318
+ f"{doc_stats['size']:,} символов")
319
+
320
+ log_message("=" * 60)
321
+
322
+ return table_documents
323
+
324
+ except Exception as e:
325
+ log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
326
+ return []
utils.py CHANGED
@@ -4,20 +4,15 @@ from sentence_transformers import CrossEncoder
4
  from my_logging import log_message
5
 
6
  def get_llm_model(api_key, model_name="gemini-2.0-flash"):
7
- """Get LLM model"""
8
  return GoogleGenAI(model=model_name, api_key=api_key)
9
 
10
  def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
11
- """Get embedding model"""
12
  return HuggingFaceEmbedding(model_name=model_name)
13
 
14
  def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
15
- """Get reranker model"""
16
  return CrossEncoder(model_name)
17
 
18
-
19
  def format_sources(nodes):
20
- """Format retrieved sources for display"""
21
  sources = []
22
  for node in nodes:
23
  meta = node.metadata
@@ -37,21 +32,132 @@ def format_sources(nodes):
37
 
38
  return "\n".join(set(sources))
39
 
40
- import re
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
- def answer_question(question, query_engine, reranker):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  try:
44
  log_message(f"\n{'='*70}")
45
  log_message(f"QUERY: {question}")
46
 
47
  retrieved = query_engine.retrieve(question)
48
- log_message(f"RETRIEVED: {len(retrieved)} unique nodes")
49
-
50
- reranked = rerank_nodes(question, retrieved, reranker, top_k=20, min_score=-0.5)
51
- log_message(f"RERANKED: {len(reranked)} nodes")
52
 
 
 
 
 
 
 
 
 
 
53
 
54
- # Group by document and type
55
  doc_groups = {}
56
  for n in reranked:
57
  doc_id = n.metadata.get('document_id', 'unknown')
@@ -68,12 +174,10 @@ def answer_question(question, query_engine, reranker):
68
 
69
  log_message(f"Documents found: {list(doc_groups.keys())}")
70
 
71
- # Format context by document
72
  context_parts = []
73
  for doc_id, groups in doc_groups.items():
74
  doc_section = [f"=== ДОКУМЕНТ: {doc_id} ==="]
75
 
76
- # Tables first (most important for your queries)
77
  if groups['tables']:
78
  doc_section.append("\n--- ТАБЛИЦЫ ---")
79
  for n in groups['tables']:
@@ -81,13 +185,21 @@ def answer_question(question, query_engine, reranker):
81
  table_id = meta.get('table_identifier', meta.get('table_number', 'unknown'))
82
  title = meta.get('table_title', '')
83
  doc_section.append(f"\n[Таблица {table_id}] {title}")
84
- doc_section.append(n.text[:1500]) # Limit length
85
  log_message(f" Included table {table_id} from {doc_id}")
86
 
87
- # Then text
 
 
 
 
 
 
 
 
88
  if groups['text']:
89
  doc_section.append("\n--- ТЕКСТ ---")
90
- for n in groups['text'][:3]: # Limit text chunks
91
  doc_section.append(n.text[:800])
92
  log_message(f" Included text section from {doc_id}")
93
 
@@ -103,26 +215,35 @@ def answer_question(question, query_engine, reranker):
103
  from llama_index.core import Settings
104
  response = Settings.llm.complete(prompt)
105
 
106
- sources = format_sources(reranked)
107
- return response.text, sources
 
 
 
 
 
 
108
 
109
  except Exception as e:
110
  log_message(f"Error: {e}")
111
  import traceback
112
  log_message(traceback.format_exc())
113
- return f"Ошибка: {e}", ""
 
 
 
 
 
114
 
115
- def rerank_nodes(query, nodes, reranker, top_k=20, min_score=0.1): # Much lower threshold
116
- """Rerank with detailed score logging"""
117
  if not nodes or not reranker:
118
  log_message("WARNING: No nodes or reranker available")
119
  return nodes[:top_k]
120
 
121
- pairs = [[query, n.text[:500]] for n in nodes] # Limit text length for reranker
122
  scores = reranker.predict(pairs)
123
  scored = sorted(zip(nodes, scores), key=lambda x: x[1], reverse=True)
124
 
125
- # Detailed logging
126
  if scored:
127
  top_5_scores = [s for _, s in scored[:5]]
128
  bottom_5_scores = [s for _, s in scored[-5:]]
@@ -130,7 +251,6 @@ def rerank_nodes(query, nodes, reranker, top_k=20, min_score=0.1): # Much lower
130
  log_message(f"Top 5 scores: {top_5_scores}")
131
  log_message(f"Bottom 5 scores: {bottom_5_scores}")
132
 
133
- # Count how many pass threshold
134
  above_threshold = sum(1 for _, s in scored if s >= min_score)
135
  log_message(f"Nodes above threshold ({min_score}): {above_threshold}/{len(scored)}")
136
 
 
4
  from my_logging import log_message
5
 
6
def get_llm_model(api_key, model_name="gemini-2.0-flash"):
    """Build the Google GenAI LLM client used for answer generation.

    Args:
        api_key: Google API key for the Gemini endpoint.
        model_name: Gemini model identifier (defaults to gemini-2.0-flash).

    Returns:
        A configured GoogleGenAI client instance.
    """
    return GoogleGenAI(api_key=api_key, model=model_name)
8
 
9
def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
    """Build the HuggingFace embedding model used for vector indexing.

    Args:
        model_name: HF model id; default is a multilingual MiniLM encoder.

    Returns:
        A HuggingFaceEmbedding instance wrapping the named model.
    """
    return HuggingFaceEmbedding(model_name=model_name)
11
 
12
def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
    """Build the cross-encoder reranker used to rescore retrieved nodes.

    Args:
        model_name: sentence-transformers cross-encoder model id.

    Returns:
        A CrossEncoder instance for query/passage scoring.
    """
    return CrossEncoder(model_name)
14
 
 
15
  def format_sources(nodes):
 
16
  sources = []
17
  for node in nodes:
18
  meta = node.metadata
 
32
 
33
  return "\n".join(set(sources))
34
 
35
def create_chunks_info_for_display(nodes):
    """Summarize retrieved nodes as plain dicts for the UI chunk panel.

    Args:
        nodes: retrieved node objects exposing ``metadata`` (dict-like) and
            ``text`` (str).

    Returns:
        A list of dicts, one per node, with document/section metadata and the
        chunk text truncated to 500 characters.
    """
    def _summarize(node):
        meta = node.metadata
        return {
            'document_id': meta.get('document_id', 'unknown'),
            'section_path': meta.get('section_path', ''),
            'section_id': meta.get('section_id', 'unknown'),
            'section_text': meta.get('section_text', ''),
            'parent_section': meta.get('parent_section', ''),
            'parent_title': meta.get('parent_title', ''),
            'level': meta.get('level', ''),
            # Cap the preview so the UI panel stays compact.
            'chunk_text': node.text[:500],
            'type': meta.get('type', 'text'),
            'table_number': meta.get('table_number', ''),
            'image_number': meta.get('image_number', ''),
        }

    return [_summarize(node) for node in nodes]
54
+
55
+ def format_answer_html(answer_text, model_name):
56
+ html = f"""
57
+ <div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px;'>
58
+ <div style='margin-bottom: 10px;'>
59
+ <span style='background-color: #4a5568; padding: 5px 10px; border-radius: 5px; font-size: 12px;'>
60
+ Модель: {model_name}
61
+ </span>
62
+ </div>
63
+ <div style='line-height: 1.6;'>
64
+ {answer_text}
65
+ </div>
66
+ </div>
67
+ """
68
+ return html
69
+
70
+ def format_sources_html(sources_text):
71
+ if not sources_text or sources_text == "":
72
+ return "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Нет источников</div>"
73
+
74
+ sources_list = sources_text.strip().split('\n')
75
+ html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px;'>"
76
+ html += "<h4 style='color: white; margin-bottom: 15px;'>Использованные источники:</h4>"
77
+ html += "<div style='line-height: 2;'>"
78
+
79
+ for source in sources_list:
80
+ if source.strip():
81
+ html += f"<div style='padding: 5px 0; border-bottom: 1px solid #4a5568;'>{source}</div>"
82
+
83
+ html += "</div></div>"
84
+ return html
85
+
86
+ def format_chunks_html(chunks_info):
87
+ if not chunks_info:
88
+ return "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Нет данных о чанках</div>"
89
+
90
+ html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 500px; overflow-y: auto;'>"
91
+ html += f"<h4 style='color: white; margin-bottom: 15px;'>Найдено релевантных чанков: {len(chunks_info)}</h4>"
92
+
93
+ for i, chunk in enumerate(chunks_info):
94
+ bg_color = "#4a5568" if i % 2 == 0 else "#374151"
95
+
96
+ from app import get_section_display, get_formatted_content
97
+ section_display = get_section_display(chunk)
98
+ formatted_content = get_formatted_content(chunk)
99
+
100
+ html += f"""
101
+ <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #60a5fa;'>
102
+ <strong style='color: #93c5fd;'>Документ:</strong> <span style='color: white;'>{chunk['document_id']}</span><br>
103
+ <strong style='color: #93c5fd;'>Раздел:</strong> <span style='color: white;'>{section_display}</span><br>
104
+ <strong style='color: #93c5fd;'>Содержание:</strong><br>
105
+ <div style='background-color: #1f2937; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: #d1d5db; max-height: 200px; overflow-y: auto;'>
106
+ {formatted_content}
107
+ </div>
108
+ </div>
109
+ """
110
+
111
+ html += "</div>"
112
+ return html
113
 
114
+ def deduplicate_nodes(nodes):
115
+ """Deduplicate retrieved nodes based on unique identifiers"""
116
+ seen = set()
117
+ unique_nodes = []
118
+
119
+ for node in nodes:
120
+ # Create unique identifier from metadata
121
+ doc_id = node.metadata.get('document_id', '')
122
+ section_id = node.metadata.get('section_id', '')
123
+ chunk_id = node.metadata.get('chunk_id', 0)
124
+ node_type = node.metadata.get('type', 'text')
125
+
126
+ if node_type == 'table':
127
+ table_num = node.metadata.get('table_number', '')
128
+ identifier = f"{doc_id}|table|{table_num}|{chunk_id}"
129
+ elif node_type == 'image':
130
+ img_num = node.metadata.get('image_number', '')
131
+ identifier = f"{doc_id}|image|{img_num}"
132
+ else:
133
+ identifier = f"{doc_id}|{section_id}|{chunk_id}"
134
+
135
+ if identifier not in seen:
136
+ seen.add(identifier)
137
+ unique_nodes.append(node)
138
+
139
+ return unique_nodes
140
+
141
+
142
+ def answer_question(question, query_engine, reranker, model_name):
143
  try:
144
  log_message(f"\n{'='*70}")
145
  log_message(f"QUERY: {question}")
146
 
147
  retrieved = query_engine.retrieve(question)
148
+ total_retrieved = len(retrieved)
149
+ log_message(f"RETRIEVED: {total_retrieved} nodes (before deduplication)")
 
 
150
 
151
+ # Deduplicate
152
+ unique_retrieved = deduplicate_nodes(retrieved)
153
+ duplicates_removed = total_retrieved - len(unique_retrieved)
154
+ log_message(f"DEDUPLICATION: {duplicates_removed} duplicates removed")
155
+ log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
156
+
157
+ reranked = rerank_nodes(question, unique_retrieved, reranker, top_k=20, min_score=-0.5)
158
+ log_message(f"RERANKED: {len(reranked)} nodes (after scoring)")
159
+
160
 
 
161
  doc_groups = {}
162
  for n in reranked:
163
  doc_id = n.metadata.get('document_id', 'unknown')
 
174
 
175
  log_message(f"Documents found: {list(doc_groups.keys())}")
176
 
 
177
  context_parts = []
178
  for doc_id, groups in doc_groups.items():
179
  doc_section = [f"=== ДОКУМЕНТ: {doc_id} ==="]
180
 
 
181
  if groups['tables']:
182
  doc_section.append("\n--- ТАБЛИЦЫ ---")
183
  for n in groups['tables']:
 
185
  table_id = meta.get('table_identifier', meta.get('table_number', 'unknown'))
186
  title = meta.get('table_title', '')
187
  doc_section.append(f"\n[Таблица {table_id}] {title}")
188
+ doc_section.append(n.text[:1500])
189
  log_message(f" Included table {table_id} from {doc_id}")
190
 
191
+ if groups['images']:
192
+ doc_section.append("\n--- ИЗОБРАЖЕНИЯ ---")
193
+ for n in groups['images']:
194
+ meta = n.metadata
195
+ img_id = meta.get('image_number', 'unknown')
196
+ doc_section.append(f"\n[Рисунок {img_id}]")
197
+ doc_section.append(n.text[:1000])
198
+ log_message(f" Included image {img_id} from {doc_id}")
199
+
200
  if groups['text']:
201
  doc_section.append("\n--- ТЕКСТ ---")
202
+ for n in groups['text'][:3]:
203
  doc_section.append(n.text[:800])
204
  log_message(f" Included text section from {doc_id}")
205
 
 
215
  from llama_index.core import Settings
216
  response = Settings.llm.complete(prompt)
217
 
218
+ sources_text = format_sources(reranked)
219
+ chunks_info = create_chunks_info_for_display(reranked)
220
+
221
+ answer_html = format_answer_html(response.text, model_name)
222
+ sources_html = format_sources_html(sources_text)
223
+ chunks_html = format_chunks_html(chunks_info)
224
+
225
+ return answer_html, sources_html, chunks_html
226
 
227
  except Exception as e:
228
  log_message(f"Error: {e}")
229
  import traceback
230
  log_message(traceback.format_exc())
231
+
232
+ error_html = f"<div style='background-color: #2d3748; color: #ef4444; padding: 20px; border-radius: 10px;'>Ошибка: {str(e)}</div>"
233
+ sources_html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Источники недоступны из-за ошибки</div>"
234
+ chunks_html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Чанки недоступны из-за ошибки</div>"
235
+
236
+ return error_html, sources_html, chunks_html
237
 
238
+ def rerank_nodes(query, nodes, reranker, top_k=20, min_score=-0.5):
 
239
  if not nodes or not reranker:
240
  log_message("WARNING: No nodes or reranker available")
241
  return nodes[:top_k]
242
 
243
+ pairs = [[query, n.text[:500]] for n in nodes]
244
  scores = reranker.predict(pairs)
245
  scored = sorted(zip(nodes, scores), key=lambda x: x[1], reverse=True)
246
 
 
247
  if scored:
248
  top_5_scores = [s for _, s in scored[:5]]
249
  bottom_5_scores = [s for _, s in scored[-5:]]
 
251
  log_message(f"Top 5 scores: {top_5_scores}")
252
  log_message(f"Bottom 5 scores: {bottom_5_scores}")
253
 
 
254
  above_threshold = sum(1 for _, s in scored if s >= min_score)
255
  log_message(f"Nodes above threshold ({min_score}): {above_threshold}/{len(scored)}")
256