Commit f6a9f63
Parent: b91dfb0
Commit message: eski holat ("previous state")

Files changed:
- app.py: +320 -90
- documents_prep.py: +434 -717
- index_retriever.py: +65 -166
- table_prep.py: +244 -177
- utils.py: +145 -25

app.py CHANGED
@@ -1,86 +1,251 @@
Old version (lines marked "-" were removed by this commit; the page rendering preserves only fragments of the removed code, gaps are shown as "…"):

 import gradio as gr
 from llama_index.core import Settings
-from documents_prep import …
-from index_retriever import create_vector_index, create_query_engine
 from utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
 from my_logging import log_message
-from …
-…
-def …
-    …
-    log_message("="*60)
-    …
-        table_dir=TABLE_DATA_DIR,
-        image_dir=IMAGE_DATA_DIR
-    )
-
-    # Create index
-    vector_index = create_vector_index(documents)
-    query_engine = create_query_engine(vector_index)
-    …
-    return …
-
-def …
-    …
-    with gr.Row():
-        question = gr.Textbox(
-            label="Ваш вопрос",
-            placeholder="Введите вопрос...",
-            lines=3
-        )
-        ask_btn = gr.Button("Найти ответ", variant="primary")
-    …
                     "О чем этот рисунок: ГОСТ Р 50.04.07-2022 Приложение Л. Л.1.5 Рисунок Л.2",
                     "Л.9 Формула в ГОСТ Р 50.04.07 - 2022 что и о чем там?",
                     "Какой стандарт устанавливает порядок признания протоколов испытаний продукции в области использования атомной энергии?",

@@ -88,38 +253,103 @@ def create_interface():

                     "В каких случаях могут быть признаны протоколы испытаний, проведенные лабораториями?",
                     "В какой таблице можно найти информацию о методы исследований при аттестационных испытаниях технологии термической обработки заготовок из легированных сталей? Какой документ и какой раздел?"
                 ],
-…
             )
         )
-        # Event handlers
-        ask_btn.click(
-            fn=ask_question,
-            inputs=question,
-            outputs=[answer, sources]
-        )
-    …
 if __name__ == "__main__":
-    …
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=True
-    )
New version (lines marked "+" were added by this commit; unmarked lines are unchanged context):

 import gradio as gr
+import os
 from llama_index.core import Settings
+from documents_prep import load_json_documents, load_table_data, load_image_data, load_csv_chunks
 from utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
 from my_logging import log_message
+from index_retriever import create_vector_index, create_query_engine
+import sys
+from config import (
+    HF_REPO_ID, HF_TOKEN, DOWNLOAD_DIR, CHUNKS_FILENAME,
+    JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
+)

+def create_chunks_display_html(chunk_info):
+    if not chunk_info:
+        return "<div style='padding: 20px; text-align: center; color: black;'>Нет данных о чанках</div>"
+
+    html = "<div style='max-height: 500px; overflow-y: auto; padding: 10px; color: black;'>"
+    html += f"<h4 style='color: black;'>Найдено релевантных чанков: {len(chunk_info)}</h4>"
+
+    for i, chunk in enumerate(chunk_info):
+        bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
+
+        # Get section display info
+        section_display = get_section_display(chunk)
+        formatted_content = get_formatted_content(chunk)
+
+        html += f"""
+        <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
+            <strong style='color: black;'>Документ:</strong> <span style='color: black;'>{chunk['document_id']}</span><br>
+            <strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{section_display}</span><br>
+            <strong style='color: black;'>Содержание:</strong><br>
+            <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
+                {formatted_content}
+            </div>
+        </div>
+        """
+
+    html += "</div>"
+    return html
+
+def get_section_display(chunk):
+    section_path = chunk.get('section_path', '')
+    section_id = chunk.get('section_id', 'unknown')
+    doc_type = chunk.get('type', 'text')
+
+    if doc_type == 'table' and chunk.get('table_number'):
+        table_num = chunk.get('table_number')
+        if not str(table_num).startswith('№'):
+            table_num = f"№{table_num}"
+        return f"таблица {table_num}"
+
+    if doc_type == 'image' and chunk.get('image_number'):
+        image_num = chunk.get('image_number')
+        if not str(image_num).startswith('№'):
+            image_num = f"№{image_num}"
+        return f"рисунок {image_num}"
+
+    if section_path:
+        return section_path
+    elif section_id and section_id != 'unknown':
+        return section_id
+
+    return section_id
+
+def get_formatted_content(chunk):
+    document_id = chunk.get('document_id', 'unknown')
+    section_path = chunk.get('section_path', '')
+    section_id = chunk.get('section_id', 'unknown')
+    section_text = chunk.get('section_text', '')
+    parent_section = chunk.get('parent_section', '')
+    parent_title = chunk.get('parent_title', '')
+    level = chunk.get('level', '')
+    chunk_text = chunk.get('chunk_text', '')
+    doc_type = chunk.get('type', 'text')
+
+    # For text documents
+    if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section:
+        current_section = section_path if section_path else section_id
+        parent_info = f"{parent_section} ({parent_title})" if parent_title else parent_section
+        return f"В разделе {parent_info} в документе {document_id}, пункт {current_section}: {chunk_text}"
+    else:
+        current_section = section_path if section_path else section_id
+        clean_text = chunk_text
+        if section_text and chunk_text.startswith(section_text):
+            section_title = section_text
+        elif chunk_text.startswith(f"{current_section} "):
+            clean_text = chunk_text[len(f"{current_section} "):].strip()
+            section_title = section_text if section_text else f"{current_section} {clean_text.split('.')[0] if '.' in clean_text else clean_text[:50]}"
+        else:
+            section_title = section_text if section_text else current_section
+
+        return f"В разделе {current_section} в документе {document_id}, пункт {section_title}: {clean_text}"
+
+def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
+                      json_files_dir=None, table_data_dir=None, image_data_dir=None,
+                      use_json_instead_csv=False):
+    try:
+        from documents_prep import process_documents_with_chunking
+        log_message("Инициализация системы")
+        os.makedirs(download_dir, exist_ok=True)
+        from config import CHUNK_SIZE, CHUNK_OVERLAP
+        from llama_index.core.text_splitter import TokenTextSplitter
+
+        embed_model = get_embedding_model()
+        llm = get_llm_model(DEFAULT_MODEL)
+        reranker = get_reranker_model()
+
+        Settings.embed_model = embed_model
+        Settings.llm = llm
+        Settings.text_splitter = TokenTextSplitter(
+            chunk_size=CHUNK_SIZE,
+            chunk_overlap=CHUNK_OVERLAP,
+            separator=" ",
+            backup_separators=["\n", ".", "!", "?"]
+        )
+
+        log_message(f"Configured chunk size: {CHUNK_SIZE} tokens")
+        log_message(f"Configured chunk overlap: {CHUNK_OVERLAP} tokens")
+
+        all_documents = []
+        chunks_df = None
+        chunk_info = []
+
+        if use_json_instead_csv and json_files_dir:
+            log_message("Используем JSON файлы вместо CSV")
+            json_documents, json_chunk_info = load_json_documents(repo_id, hf_token, json_files_dir, download_dir)
+            all_documents.extend(json_documents)
+            chunk_info.extend(json_chunk_info)
+        else:
+            if chunks_filename:
+                log_message("Загружаем данные из CSV")
+                csv_documents, chunks_df = load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir)
+                all_documents.extend(csv_documents)
+
+        if table_data_dir:
+            log_message("Добавляю табличные данные")
+            table_documents = load_table_data(repo_id, hf_token, table_data_dir)
+            log_message(f"Загружено {len(table_documents)} табличных документов")
+
+            # Process table documents through chunking
+            chunked_table_docs, table_chunk_info = process_documents_with_chunking(table_documents)
+            all_documents.extend(chunked_table_docs)
+            chunk_info.extend(table_chunk_info)
+
+        if image_data_dir:
+            log_message("Добавляю данные изображений")
+            image_documents = load_image_data(repo_id, hf_token, image_data_dir)
+            log_message(f"Загружено {len(image_documents)} документов изображений")
+
+            # Process image documents through chunking
+            chunked_image_docs, image_chunk_info = process_documents_with_chunking(image_documents)
+            all_documents.extend(chunked_image_docs)
+            chunk_info.extend(image_chunk_info)
+
+        log_message(f"Всего документов после всей обработки: {len(all_documents)}")
+
+        vector_index = create_vector_index(all_documents)
+        query_engine = create_query_engine(vector_index)
+
+        log_message(f"Система успешно инициализирована")
+        return query_engine, chunks_df, reranker, vector_index, chunk_info
+
+    except Exception as e:
+        log_message(f"Ошибка инициализации: {str(e)}")
+        return None, None, None, None, []
+
+def switch_model(model_name, vector_index):
+    from llama_index.core import Settings
+    from index_retriever import create_query_engine
+
+    try:
+        log_message(f"Переключение на модель: {model_name}")
+
+        new_llm = get_llm_model(model_name)
+        Settings.llm = new_llm
+
+        if vector_index is not None:
+            new_query_engine = create_query_engine(vector_index)
+            log_message(f"Модель успешно переключена на: {model_name}")
+            return new_query_engine, f"✅ Модель переключена на: {model_name}"
+        else:
+            return None, "❌ Ошибка: система не инициализирована"
+
+    except Exception as e:
+        error_msg = f"Ошибка переключения модели: {str(e)}"
+        log_message(error_msg)
+        return None, f"❌ {error_msg}"
+
+def main_answer_question(question):
+    global query_engine, reranker, current_model, chunks_df
+    if not question.strip():
+        return ("<div style='color: black;'>Пожалуйста, введите вопрос</div>",
+                "<div style='color: black;'>Источники появятся после обработки запроса</div>",
+                "<div style='color: black;'>Чанки появятся после обработки запроса</div>")
+
+    try:
+        # Call the answer_question function which returns 3 values
+        answer_html, sources_html, chunks_html = answer_question(question, query_engine, reranker, current_model, chunks_df)
+        return answer_html, sources_html, chunks_html
+
+    except Exception as e:
+        log_message(f"Ошибка при ответе на вопрос: {str(e)}")
+        return (f"<div style='color: red;'>Ошибка: {str(e)}</div>",
+                "<div style='color: black;'>Источники недоступны из-за ошибки</div>",
+                "<div style='color: black;'>Чанки недоступны из-за ошибки</div>")


+def create_demo_interface(answer_question_func, switch_model_func, current_model, chunk_info=None):
+    with gr.Blocks(title="AIEXP - AI Expert для нормативной документации", theme=gr.themes.Soft()) as demo:
+
+        gr.Markdown("""
+        # AIEXP - Artificial Intelligence Expert
+
+        ## Инструмент для работы с нормативной документацией
+        """)
+
+        with gr.Tab("Поиск по нормативным документам"):
+            gr.Markdown("### Задайте вопрос по нормативной документации")
+
+            with gr.Row():
+                with gr.Column(scale=2):
+                    model_dropdown = gr.Dropdown(
+                        choices=list(AVAILABLE_MODELS.keys()),
+                        value=current_model,
+                        label="Выберите языковую модель",
+                        info="Выберите модель для генерации ответов"
+                    )
+                with gr.Column(scale=1):
+                    switch_btn = gr.Button("Переключить модель", variant="secondary")
+                    model_status = gr.Textbox(
+                        value=f"Текущая модель: {current_model}",
+                        label="Статус модели",
+                        interactive=False
+                    )
+
+            with gr.Row():
+                with gr.Column(scale=3):
+                    question_input = gr.Textbox(
+                        label="Ваш вопрос к базе знаний",
+                        placeholder="Введите вопрос по нормативным документам...",
+                        lines=3
+                    )
+                    ask_btn = gr.Button("Найти ответ", variant="primary", size="lg")
+
+            gr.Examples(
+                examples=[
                     "О чем этот рисунок: ГОСТ Р 50.04.07-2022 Приложение Л. Л.1.5 Рисунок Л.2",
                     "Л.9 Формула в ГОСТ Р 50.04.07 - 2022 что и о чем там?",
                     "Какой стандарт устанавливает порядок признания протоколов испытаний продукции в области использования атомной энергии?",
                     …
                     "В каких случаях могут быть признаны протоколы испытаний, проведенные лабораториями?",
                     "В какой таблице можно найти информацию о методы исследований при аттестационных испытаниях технологии термической обработки заготовок из легированных сталей? Какой документ и какой раздел?"
                 ],
+                inputs=question_input
+            )
+
+            with gr.Row():
+                with gr.Column(scale=2):
+                    answer_output = gr.HTML(
+                        label="",
+                        value=f"<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появится ответ на ваш вопрос...<br><small>Текущая модель: {current_model}</small></div>",
+                    )
+
+                with gr.Column(scale=1):
+                    sources_output = gr.HTML(
+                        label="",
+                        value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся релевантные чанки...</div>",
+                    )
+
+                with gr.Column(scale=1):
+                    chunks_output = gr.HTML(
+                        label="Релевантные чанки",
+                        value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся релевантные чанки...</div>",
+                    )
+
+            switch_btn.click(
+                fn=switch_model_func,
+                inputs=[model_dropdown],
+                outputs=[model_status]
+            )
+
+            ask_btn.click(
+                fn=answer_question_func,
+                inputs=[question_input],
+                outputs=[answer_output, sources_output, chunks_output]
             )
+
+            question_input.submit(
+                fn=answer_question_func,
+                inputs=[question_input],
+                outputs=[answer_output, sources_output, chunks_output]
             )
+    return demo


+query_engine = None
+chunks_df = None
+reranker = None
+vector_index = None
+current_model = DEFAULT_MODEL

+def main_answer_question(question):
+    global query_engine, reranker, current_model, chunks_df
+    answer_html, sources_html, chunks_html = answer_question(
+        question, query_engine, reranker, current_model, chunks_df
+    )
+    return answer_html, sources_html, chunks_html
+
+def main_switch_model(model_name):
+    global query_engine, vector_index, current_model
+
+    new_query_engine, status_message = switch_model(model_name, vector_index)
+    if new_query_engine:
+        query_engine = new_query_engine
+        current_model = model_name
+
+    return status_message
+
+def main():
+    global query_engine, chunks_df, reranker, vector_index, current_model
+
+    log_message("Запуск AIEXP - AI Expert для нормативной документации")
+
+    query_engine, chunks_df, reranker, vector_index, chunk_info = initialize_system(
+        repo_id=HF_REPO_ID,
+        hf_token=HF_TOKEN,
+        download_dir=DOWNLOAD_DIR,
+        json_files_dir=JSON_FILES_DIR,
+        table_data_dir=TABLE_DATA_DIR,
+        image_data_dir=IMAGE_DATA_DIR,
+        use_json_instead_csv=True,
+    )
+
+    if query_engine:
+        log_message("Запуск веб-интерфейса")
+        demo = create_demo_interface(
+            answer_question_func=main_answer_question,
+            switch_model_func=main_switch_model,
+            current_model=current_model,
+            chunk_info=chunk_info
+        )
+        demo.launch(
+            server_name="0.0.0.0",
+            server_port=7860,
+            share=True,
+            debug=False
+        )
+    else:
+        log_message("Невозможно запустить приложение из-за ошибки инициализации")
+        sys.exit(1)

 if __name__ == "__main__":
+    main()
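Both the new app.py and the new documents_prep.py read their settings from a config module that is not part of this commit. The sketch below is only an illustration of what such a module could look like: the names are taken from the imports above, but every value is a placeholder assumption, not something shown in the repository.

    # config.py — hypothetical sketch; names mirror the imports in app.py, values are placeholders
    import os

    HF_REPO_ID = "username/normative-docs-dataset"   # assumed HF dataset repo holding the source documents
    HF_TOKEN = os.environ.get("HF_TOKEN")            # assumed to come from the environment
    DOWNLOAD_DIR = "./downloads"
    CHUNKS_FILENAME = "chunks.csv"
    JSON_FILES_DIR = "json_files"
    TABLE_DATA_DIR = "table_data"
    IMAGE_DATA_DIR = "image_data"

    # The old documents_prep.py hard-coded CHUNK_SIZE = 1500 and CHUNK_OVERLAP = 128,
    # so values in that range are plausible defaults here.
    CHUNK_SIZE = 1500
    CHUNK_OVERLAP = 128

    DEFAULT_MODEL = "default-model-name"                             # assumed key of AVAILABLE_MODELS
    AVAILABLE_MODELS = {"default-model-name": "provider/model-id"}   # assumed mapping consumed by get_llm_model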
documents_prep.py CHANGED
@@ -3,769 +3,486 @@ import zipfile
Old version (lines marked "-" were removed by this commit; only fragments survive in this rendering, gaps are shown as "…"):

 import pandas as pd
 from huggingface_hub import hf_hub_download, list_repo_files
 from llama_index.core import Document
-from llama_index.core.text_splitter import SentenceSplitter
 from my_logging import log_message
-import …
-
-# Configuration
-CHUNK_SIZE = 1500
-CHUNK_OVERLAP = 128
-
-def …
     text_splitter = SentenceSplitter(
-        chunk_size=…
-        chunk_overlap=…
     )
-    …
-    # Log statistics
-    if chunked:
-        avg_size = sum(len(c.text) for c in chunked) / len(chunked)
-        min_size = min(len(c.text) for c in chunked)
-        max_size = max(len(c.text) for c in chunked)
-        log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
-        log_message(f"  Size stats: avg={avg_size:.0f}, min={min_size}, max={max_size} chars")
-
-    return chunked
-
-def should_keep_table_whole(doc_id):
-    """Check if document should be kept as single chunk"""
-    special_patterns = [
-        r'НП\s*068-05',
-        r'НП-068-05',
-        r'59023',
-        r'ГОСТ\s*Р?\s*59023'
-    ]
-
-    for pattern in special_patterns:
-        if re.search(pattern, doc_id, re.IGNORECASE):
-            return True
-    return False
-
-def chunk_table_by_rows(table_data, doc_id, rows_per_chunk=3, max_chars=2000):
-    headers = table_data.get('headers', [])
-    rows = table_data.get('data', [])
-    table_num = str(table_data.get('table_number', 'unknown')).strip()
-    table_title = table_data.get('table_title', '')
-    section = table_data.get('section', '')
-
-    # CHECK FOR SPECIAL FILES - NO CHUNKING
-    if should_keep_table_whole(doc_id):
-        log_message(f"  📊 FULL TABLE (special file): {doc_id} - {table_num}")
-        return create_full_table_chunk(table_data, doc_id)
-
-    # Section-aware identifier (keep your existing logic)
-    import re
-    if 'приложени' in section.lower():
-        appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
-        if appendix_match:
-            appendix_num = appendix_match.group(1).upper()
-            table_identifier = f"{table_num} Приложение {appendix_num}"
-        else:
-            table_identifier = table_num
-    else:
-        table_identifier = table_num
-
-    log_message(f"  📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
-
-    # Build base header (compact version)
-    base_header = f"ДОКУМЕНТ: {doc_id} | ТАБЛИЦА: {table_identifier}\n"
-    if table_title:
-        base_header += f"НАЗВАНИЕ: {table_title}\n"
-    base_header += f"{'='*60}\n"
-
-    if headers:
-        header_str = ' | '.join(str(h)[:30] for h in headers)  # Truncate long headers
-        base_header += f"ЗАГОЛОВКИ: {header_str}\n\n"
-
-    # Calculate available space
-    base_size = len(base_header)
-    footer_size = 100
-    available_space = max_chars - base_size - footer_size
-
-    chunks = []
-    current_batch = []
-    current_size = 0
-    chunk_num = 0
-
-    for i, row in enumerate(rows):
-        row_text = format_single_row(row, i + 1)
-        row_size = len(row_text)
-        …
-            base_header, current_batch, table_identifier,
-            doc_id, table_num, table_title, section,
-            len(rows), chunk_num, False
-        ))
-        chunk_num += 1
-        current_batch = []
-        current_size = 0
-        log_message(f"  ⚠ Row {i+1} too large ({row_size} chars), splitting...")
-        # Split the large row
-        split_chunks = _split_large_row(
-            row, i + 1, base_header, available_space,
-            table_identifier, doc_id, table_num, table_title,
-            section, len(rows), chunk_num
-        )
-        chunks.extend(split_chunks)
-        log_message(f"  → Created {len(split_chunks)} chunks from row {i+1}")
-        chunk_num += len(split_chunks)
-        continue
-
-        # Case 2: Adding this row would exceed limit - flush current batch
-        if current_size + row_size > available_space and current_batch:
-            chunks.append(_create_chunk(
-                base_header, current_batch, table_identifier,
-                doc_id, table_num, table_title, section,
-                len(rows), chunk_num, False
-            ))
-            chunk_num += 1
-            current_batch = []
-            current_size = 0
-
-        # Case 3: Add row to current batch
-        current_batch.append({'row': row, 'idx': i + 1, 'text': row_text})
-        log_message(f"  + Row {i+1} ({row_size} chars) added to chunk {chunk_num}")
-        current_size += row_size
-
-        # Flush if we hit target row count
-        if len(current_batch) >= rows_per_chunk:
-            chunks.append(_create_chunk(
-                base_header, current_batch, table_identifier,
-                doc_id, table_num, table_title, section,
-                len(rows), chunk_num, False
-            ))
-            chunk_num += 1
-            current_batch = []
-            current_size = 0
-
-    # Flush remaining rows
-    if current_batch:
-        chunks.append(_create_chunk(
-            base_header, current_batch, table_identifier,
-            doc_id, table_num, table_title, section,
-            len(rows), chunk_num, len(chunks) == 0
-        ))
-
-    log_message(f"  Created {len(chunks)} chunks from {len(rows)} rows")
-    return chunks
-
-def create_full_table_chunk(table_data, doc_id):
-    """Create a single chunk for entire table (no splitting)"""
-    headers = table_data.get('headers', [])
-    rows = table_data.get('data', [])
-    table_num = str(table_data.get('table_number', 'unknown')).strip()
-    table_title = table_data.get('table_title', '')
-    section = table_data.get('section', '')
-
-    # Section-aware identifier
-    import re
-    if 'приложени' in section.lower():
-        appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
-        if appendix_match:
-            appendix_num = appendix_match.group(1).upper()
-            table_identifier = f"{table_num} Приложение {appendix_num}"
-        else:
-            table_identifier = table_num
-    else:
-        table_identifier = table_num
-
-    # Build full content
-    content = f"ДОКУМЕНТ: {doc_id} | ТАБЛИЦА: {table_identifier}\n"
-    if table_title:
-        content += f"НАЗВАНИЕ: {table_title}\n"
-    content += f"РАЗДЕЛ: {section}\n"
-    content += f"{'='*60}\n"
-
-    if headers:
-        header_str = ' | '.join(str(h) for h in headers)
-        content += f"ЗАГОЛОВКИ: {header_str}\n\n"
-    …
-    for i, row in enumerate(rows, 1):
-        row_text = format_single_row(row, i)
-        if row_text:
-            content += row_text
-
-    content += f"\n[Полная таблица: {len(rows)} строк]\n"
-
-    # Embed metadata in text
-    content += f"\n\n--- МЕТАДАННЫЕ ---\n"
-    content += f"Документ: {doc_id}\n"
-    content += f"Таблица: {table_identifier}\n"
-    content += f"Название таблицы: {table_title}\n"
-    content += f"Раздел: {section}\n"
-    content += f"Всего строк: {len(rows)}\n"
-
-    metadata = {
-        'type': 'table',
-        'document_id': doc_id,
-        'table_number': table_num,
-        'table_identifier': table_identifier,
-        'table_title': table_title,
-        'section': section,
-        'chunk_id': 0,
-        'row_start': 0,
-        'row_end': len(rows),
-        'total_rows': len(rows),
-        'chunk_size': len(content),
-        'is_complete_table': True,
-        'chunking_strategy': 'full_table',
-        'rows_in_chunk': len(rows)
-    }
-
-    return [Document(text=content, metadata=metadata)]
-
-def …
-    …
-    for …
-        row_start = batch[0]['idx']
-        row_end = batch[-1]['idx']
-
-    # Add footer with row info
-    if not is_complete:
-        content += f"\n[Строки {row_start}-{row_end} из {total_rows}]"
-
-    # EMBED ALL METADATA IN TEXT for better retrieval
-    content += f"\n\n--- МЕТАДАННЫЕ ---\n"
-    content += f"Документ: {doc_id}\n"
-    content += f"Таблица: {table_identifier}\n"
-    content += f"Название таблицы: {table_title}\n"
-    content += f"Раздел: {section}\n"
-    content += f"Строки: {row_start}-{row_end} из {total_rows}\n"
-
-    metadata = {
-        'type': 'table',
-        'document_id': doc_id,
-        'table_number': table_num,
-        'table_identifier': table_identifier,
-        'table_title': table_title,
-        'section': section,
-        'chunk_id': chunk_num,
-        'row_start': row_start - 1,
-        'row_end': row_end,
-        'total_rows': total_rows,
-        'chunk_size': len(content),
-        'is_complete_table': is_complete,
-        'rows_in_chunk': len(batch)
-    }
-
-    return Document(text=content, metadata=metadata)
-
-def _split_large_row(row, row_idx, base_header, max_size,
-                     table_identifier, doc_id, table_num,
-                     table_title, section, total_rows, base_chunk_num):
-    """Split a single large row into multiple chunks"""
-    if isinstance(row, dict):
-        items = list(row.items())
-    else:
-        items = [(f"col_{i}", v) for i, v in enumerate(row)]
-
-    chunks = []
-    current_items = []
-    current_size = 0
-    part_num = 0
-
-    for key, value in items:
-        item_text = f"{key}: {value}\n"
-        item_size = len(item_text)
-        if …
-    …
-
-def _create_chunk_from_text(content, doc_id, table_num, table_identifier,
-                            table_title, section, row_start, row_end,
-                            total_rows, chunk_num):
-    """Helper for creating chunk from pre-built text"""
-    metadata = {
-        'type': 'table',
-        'document_id': doc_id,
-        'table_number': table_num,
-        'table_identifier': table_identifier,
-        'table_title': table_title,
-        'section': section,
-        'chunk_id': chunk_num,
-        'row_start': row_start - 1,
-        'row_end': row_end,
-        'total_rows': total_rows,
-        'chunk_size': len(content),
-        'is_complete_table': False
-    }
-
-    return …
-
-def format_single_row(row, idx):
-    """Format a single row"""
-    if isinstance(row, dict):
-        parts = [f"{k}: {v}" for k, v in row.items()
-                 if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
-        if parts:
-            return f"{idx}. {' | '.join(parts)}\n"
-    elif isinstance(row, list):
-        parts = [str(v) for v in row if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
-        if parts:
-            return f"{idx}. {' | '.join(parts)}\n"
-    return ""
-
-def load_table_documents(repo_id, hf_token, table_dir):
-    log_message("Loading tables...")
-
-    files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
-    table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
-
-    all_chunks = []
-    stats = {
-        'full_tables': 0,
-        'split_tables': 0,
-        'total_chunks': 0,
-        'full_table_sizes': [],
-        'split_chunk_sizes': []
-    }
-    …
-            filename=file_path,
-            repo_type="dataset",
-            token=hf_token
-        )
-    …
-        if chunks:
-            is_full = chunks[0].metadata.get('is_complete_table', False)
-            chunk_size = chunks[0].metadata.get('chunk_size', 0)
-        if …
-    …
-    return …
-
-def …
-    headers = table_data.get('headers', [])
-    rows = table_data.get('data', [])
-    table_num = str(table_data.get('table_number', 'unknown')).strip()
-    table_title = table_data.get('table_title', '')
-    section = table_data.get('section', '')
-
-    # Section-aware identifier
-    import re
-    if 'приложени' in section.lower():
-        appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
-        if appendix_match:
-            appendix_num = appendix_match.group(1).upper()
-            table_identifier = f"{table_num} Приложение {appendix_num}"
-        else:
-            table_identifier = table_num
-    else:
-        table_identifier = table_num
-
-    if not rows:
-        return []
-
-    log_message(f"  📊 Creating WHOLE table: {doc_id} - {table_identifier} ({len(rows)} rows)")
-
-    # Build complete table content
-    content = f"ДОКУМЕНТ: {doc_id} | ТАБЛИЦА: {table_identifier}\n"
-    if table_title:
-        content += f"НАЗВАНИЕ: {table_title}\n"
-    content += f"{'='*60}\n"
-
-    if headers:
-        header_str = ' | '.join(str(h) for h in headers)
-        content += f"ЗАГОЛОВКИ: {header_str}\n\n"
-
-    content += "ДАННЫЕ:\n"
-
-    # Add ALL rows
-    for i, row in enumerate(rows, 1):
-        row_text = format_single_row(row, i)
-        if row_text:
-            content += row_text
-    …
-
-def …
-    import os
-    …
-    zip_files = [f for f in files if f.startswith(json_dir) and f.endswith('.zip')]
-    …
     documents = []
-    stats = {'success': 0, 'failed': 0, 'empty': 0}
-    …
-            repo_id=repo_id,
-            filename=file_path,
-            repo_type="dataset",
-            token=hf_token
-        )
-
-        docs = extract_sections_from_json(local_path)
-        if docs:
-            documents.extend(docs)
-            stats['success'] += 1
-            log_message(f"  ✓ Extracted {len(docs)} sections")
-        else:
-            stats['empty'] += 1
-            log_message(f"  ⚠ No sections found")
-    …
-        stats['failed'] += 1
-        log_message(f"  ✗ Error: {e}")
-
-    for zip_path in zip_files:
-        try:
-            log_message(f"  Processing ZIP: {zip_path}")
-            local_zip = hf_hub_download(
-                repo_id=repo_id,
-                filename=zip_path,
-                repo_type="dataset",
-                token=hf_token
-            )
-            …
-                    # Try UTF-8 first (most common)
-                    try:
-                        text_content = file_content.decode('utf-8')
-                    except UnicodeDecodeError:
-                        try:
-                            text_content = file_content.decode('utf-8-sig')
-                        except UnicodeDecodeError:
-                            try:
-                                # Try UTF-16 (the issue you're seeing)
-                                text_content = file_content.decode('utf-16')
-                            except UnicodeDecodeError:
-                                try:
-                                    text_content = file_content.decode('windows-1251')
-                                except UnicodeDecodeError:
-                                    log_message(f"  ✗ Skipping: {json_file} (encoding failed)")
-                                    stats['failed'] += 1
-                                    continue
-
-                    # Validate JSON structure
-                    if not text_content.strip().startswith('{') and not text_content.strip().startswith('['):
-                        log_message(f"  ✗ Skipping: {json_file} (not valid JSON)")
-                        stats['failed'] += 1
-                        continue
-
-                    with tempfile.NamedTemporaryFile(mode='w', delete=False,
-                                                     suffix='.json', encoding='utf-8') as tmp:
-                        tmp.write(text_content)
-                        tmp_path = tmp.name
-
-                    docs = extract_sections_from_json(tmp_path)
-                    if docs:
-                        documents.extend(docs)
-                        stats['success'] += 1
-                        log_message(f"  ✓ {json_file}: {len(docs)} sections")
-                    else:
-                        stats['empty'] += 1
-                        log_message(f"  ⚠ {json_file}: No sections")
-
-                    os.unlink(tmp_path)
-
-                except json.JSONDecodeError as e:
-                    stats['failed'] += 1
-                    log_message(f"  ✗ {json_file}: Invalid JSON")
-                except Exception as e:
-                    stats['failed'] += 1
-                    log_message(f"  ✗ {json_file}: {str(e)[:100]}")
-
-        except Exception as e:
-            log_message(f"  ✗ Error with ZIP: {e}")
-    …
-    log_message(f"  Success: {stats['success']}")
-    log_message(f"  Empty: {stats['empty']}")
-    log_message(f"  Failed: {stats['failed']}")
-    log_message(f"  Total sections: {len(documents)}")
-    log_message(f"="*60)
     return documents
-
-def …
-    documents = []
     try:
-        …
-        for …
-        )
-        …
            metadata={
-               …
            }
-        )
-        …
-               'section_id': sub_sub.get('sub_subsection_id', '')
-           }
-        ))
-
    except Exception as e:
-        log_message(f"…
-    return documents

-def …
-    log_message("Loading images...")
-
-    files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
-    csv_files = [f for f in files if f.startswith(image_dir) and f.endswith('.csv')]
-    …
     )
-    …
-        content += f"Раздел: {row.get('Раздел документа', '')}\n"
-
-        chunk_size = len(content)
-
-        documents.append(Document(
-            text=content,
-            metadata={
-                'type': 'image',
-                'document_id': str(row.get('Обозначение документа', 'unknown')),
-                'image_number': str(row.get('№ Изображения', 'unknown')),
-                'section': str(row.get('Раздел документа', '')),
-                'chunk_size': chunk_size
-            }
-        ))
-    except Exception as e:
-        log_message(f"Error loading {file_path}: {e}")
-
-    if documents:
-        avg_size = sum(d.metadata['chunk_size'] for d in documents) / len(documents)
-        log_message(f"✓ Loaded {len(documents)} images (avg size: {avg_size:.0f} chars)")
-
-    return documents
-
-def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
-    """Main loader - combines all document types"""
-    log_message("="*60)
-    log_message("STARTING DOCUMENT LOADING")
-    log_message("="*60)
-
-    # Load text sections
-    text_docs = load_json_documents(repo_id, hf_token, json_dir)
-    text_chunks = chunk_text_documents(text_docs)
-
-    # Load tables (already chunked)
-    table_chunks = load_table_documents(repo_id, hf_token, table_dir)
-
-    # Load images (no chunking needed)
-    image_docs = load_image_documents(repo_id, hf_token, image_dir)
-
-    all_docs = text_chunks + table_chunks + image_docs
-
-    log_message("="*60)
-    log_message(f"TOTAL DOCUMENTS: {len(all_docs)}")
-    log_message(f"  Text chunks: {len(text_chunks)}")
-    log_message(f"  Table chunks: {len(table_chunks)}")
-    log_message(f"  Images: {len(image_docs)}")
-    log_message("="*60)
-
-    return all_docs
New version (lines marked "+" were added by this commit; unmarked lines are unchanged context; the listing breaks off inside load_json_documents):

 import pandas as pd
 from huggingface_hub import hf_hub_download, list_repo_files
 from llama_index.core import Document
 from my_logging import log_message
+from llama_index.core.text_splitter import SentenceSplitter
+from config import CHUNK_SIZE, CHUNK_OVERLAP
+from table_prep import table_to_document, load_table_data


+def chunk_document(doc, chunk_size=None, chunk_overlap=None):
+    if chunk_size is None:
+        chunk_size = CHUNK_SIZE
+    if chunk_overlap is None:
+        chunk_overlap = CHUNK_OVERLAP
     text_splitter = SentenceSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        separator=" "
     )

+    text_chunks = text_splitter.split_text(doc.text)
+
+    chunked_docs = []
+    for i, chunk_text in enumerate(text_chunks):
+        chunk_metadata = doc.metadata.copy()
+        chunk_metadata.update({
+            "chunk_id": i,
+            "total_chunks": len(text_chunks),
+            "chunk_size": len(chunk_text),
+            "original_doc_id": doc.id_ if hasattr(doc, 'id_') else None
+        })

+        chunked_doc = Document(
+            text=chunk_text,
+            metadata=chunk_metadata
+        )
+        chunked_docs.append(chunked_doc)

+    return chunked_docs

+def process_documents_with_chunking(documents):
+    all_chunked_docs = []
+    chunk_info = []
+    table_count = 0
+    table_chunks_count = 0
+    image_count = 0
+    image_chunks_count = 0
+    text_chunks_count = 0

+    for doc in documents:
+        doc_type = doc.metadata.get('type', 'text')
+        is_already_chunked = doc.metadata.get('is_chunked', False)

+        if doc_type == 'table':
+            if is_already_chunked:
+                table_chunks_count += 1
+                all_chunked_docs.append(doc)
+                chunk_info.append({
+                    'document_id': doc.metadata.get('document_id', 'unknown'),
+                    'section_id': doc.metadata.get('section_id', 'unknown'),
+                    'chunk_id': doc.metadata.get('chunk_id', 0),
+                    'total_chunks': doc.metadata.get('total_chunks', 1),
+                    'chunk_size': len(doc.text),
+                    'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
+                    'type': 'table',
+                    'table_number': doc.metadata.get('table_number', 'unknown')
+                })
+            else:
+                table_count += 1
+                all_chunked_docs.append(doc)
+                chunk_info.append({
+                    'document_id': doc.metadata.get('document_id', 'unknown'),
+                    'section_id': doc.metadata.get('section_id', 'unknown'),
+                    'chunk_id': 0,
+                    'chunk_size': len(doc.text),
+                    'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
+                    'type': 'table',
+                    'table_number': doc.metadata.get('table_number', 'unknown')
+                })

+        elif doc_type == 'image':
+            image_count += 1
+            doc_size = len(doc.text)
+            if doc_size > CHUNK_SIZE:
+                log_message(f"📷 CHUNKING: Изображение {doc.metadata.get('image_number', 'unknown')} | "
+                            f"Размер: {doc_size} > {CHUNK_SIZE}")
+                chunked_docs = chunk_document(doc)
+                image_chunks_count += len(chunked_docs)
+                all_chunked_docs.extend(chunked_docs)
+                log_message(f"  ✂️ Разделено на {len(chunked_docs)} чанков")
+
+                for i, chunk_doc in enumerate(chunked_docs):
+                    chunk_info.append({
+                        'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
+                        'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
+                        'chunk_id': i,
+                        'chunk_size': len(chunk_doc.text),
+                        'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
+                        'type': 'image',
+                        'image_number': chunk_doc.metadata.get('image_number', 'unknown')
+                    })
+            else:
+                all_chunked_docs.append(doc)
+                chunk_info.append({
+                    'document_id': doc.metadata.get('document_id', 'unknown'),
+                    'section_id': doc.metadata.get('section_id', 'unknown'),
+                    'chunk_id': 0,
+                    'chunk_size': doc_size,
+                    'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
+                    'type': 'image',
+                    'image_number': doc.metadata.get('image_number', 'unknown')
+                })

+        else:
+            doc_size = len(doc.text)
+            if doc_size > CHUNK_SIZE:
+                log_message(f"📝 CHUNKING: Текст из '{doc.metadata.get('document_id', 'unknown')}' | "
+                            f"Размер: {doc_size} > {CHUNK_SIZE}")
+                chunked_docs = chunk_document(doc)
+                text_chunks_count += len(chunked_docs)
+                all_chunked_docs.extend(chunked_docs)
+                log_message(f"  ✂️ Разделен на {len(chunked_docs)} чанков")
+
+                for i, chunk_doc in enumerate(chunked_docs):
+                    chunk_info.append({
+                        'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
+                        'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
+                        'chunk_id': i,
+                        'chunk_size': len(chunk_doc.text),
+                        'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
+                        'type': 'text'
+                    })
+            else:
+                all_chunked_docs.append(doc)
+                chunk_info.append({
+                    'document_id': doc.metadata.get('document_id', 'unknown'),
+                    'section_id': doc.metadata.get('section_id', 'unknown'),
+                    'chunk_id': 0,
+                    'chunk_size': doc_size,
+                    'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
+                    'type': 'text'
+                })

+    log_message(f"\n{'='*60}")
+    log_message(f"ИТОГО ОБРАБОТАНО ДОКУМЕНТОВ:")
+    log_message(f"  • Таблицы (целые): {table_count}")
+    log_message(f"  • Таблицы (чанки): {table_chunks_count}")
+    log_message(f"  • Изображения (целые): {image_count - (image_chunks_count > 0)}")
+    log_message(f"  • Изображения (чанки): {image_chunks_count}")
+    log_message(f"  • Текстовые чанки: {text_chunks_count}")
+    log_message(f"  • Всего документов: {len(all_chunked_docs)}")
+    log_message(f"{'='*60}\n")

+    return all_chunked_docs, chunk_info

+def extract_text_from_json(data, document_id, document_name):
+    documents = []

+    if 'sections' in data:
+        for section in data['sections']:
+            section_id = section.get('section_id', 'Unknown')
+            section_text = section.get('section_text', '')

+            section_path = f"{section_id}"
+            section_title = extract_section_title(section_text)

+            if section_text.strip():
+                doc = Document(
+                    text=section_text,
+                    metadata={
+                        "type": "text",
+                        "document_id": document_id,
+                        "document_name": document_name,
+                        "section_id": section_id,
+                        "section_text": section_title[:200],
+                        "section_path": section_path,
+                        "level": "section"
+                    }
+                )
+                documents.append(doc)

+            if 'subsections' in section:
+                for subsection in section['subsections']:
+                    subsection_id = subsection.get('subsection_id', 'Unknown')
+                    subsection_text = subsection.get('subsection_text', '')
+                    subsection_title = extract_section_title(subsection_text)
+                    subsection_path = f"{section_path}.{subsection_id}"

+                    if subsection_text.strip():
+                        doc = Document(
+                            text=subsection_text,
+                            metadata={
+                                "type": "text",
+                                "document_id": document_id,
+                                "document_name": document_name,
+                                "section_id": subsection_id,
+                                "section_text": subsection_title[:200],
+                                "section_path": subsection_path,
+                                "level": "subsection",
+                                "parent_section": section_id,
+                                "parent_title": section_title[:100]
+                            }
+                        )
+                        documents.append(doc)

+                    if 'sub_subsections' in subsection:
+                        for sub_subsection in subsection['sub_subsections']:
+                            sub_subsection_id = sub_subsection.get('sub_subsection_id', 'Unknown')
+                            sub_subsection_text = sub_subsection.get('sub_subsection_text', '')
+                            sub_subsection_title = extract_section_title(sub_subsection_text)
+                            sub_subsection_path = f"{subsection_path}.{sub_subsection_id}"
+
+                            if sub_subsection_text.strip():
+                                doc = Document(
+                                    text=sub_subsection_text,
+                                    metadata={
+                                        "type": "text",
+                                        "document_id": document_id,
+                                        "document_name": document_name,
+                                        "section_id": sub_subsection_id,
+                                        "section_text": sub_subsection_title[:200],
+                                        "section_path": sub_subsection_path,
+                                        "level": "sub_subsection",
+                                        "parent_section": subsection_id,
+                                        "parent_title": subsection_title[:100]
+                                    }
+                                )
+                                documents.append(doc)
+
+                                if 'sub_sub_subsections' in sub_subsection:
+                                    for sub_sub_subsection in sub_subsection['sub_sub_subsections']:
+                                        sub_sub_subsection_id = sub_sub_subsection.get('sub_sub_subsection_id', 'Unknown')
+                                        sub_sub_subsection_text = sub_sub_subsection.get('sub_sub_subsection_text', '')
+                                        sub_sub_subsection_title = extract_section_title(sub_sub_subsection_text)
+
+                                        if sub_sub_subsection_text.strip():
+                                            doc = Document(
+                                                text=sub_sub_subsection_text,
+                                                metadata={
+                                                    "type": "text",
+                                                    "document_id": document_id,
+                                                    "document_name": document_name,
+                                                    "section_id": sub_sub_subsection_id,
+                                                    "section_text": sub_sub_subsection_title[:200],
+                                                    "section_path": f"{sub_subsection_path}.{sub_sub_subsection_id}",
+                                                    "level": "sub_sub_subsection",
+                                                    "parent_section": sub_subsection_id,
+                                                    "parent_title": sub_subsection_title[:100]
+                                                }
+                                            )
+                                            documents.append(doc)

+    return documents

+def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
+    log_message("Начинаю загрузку JSON документов")

+    try:
+        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
+        zip_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.zip')]
+        json_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.json')]
+
+        log_message(f"Найдено {len(zip_files)} ZIP файлов и {len(json_files)} прямых JSON файлов")
+
+        all_documents = []
+
+        for zip_file_path in zip_files:
+            try:
+                log_message(f"Загружаю ZIP архив: {zip_file_path}")
+                local_zip_path = hf_hub_download(
+                    repo_id=repo_id,
+                    filename=zip_file_path,
+                    local_dir=download_dir,
+                    repo_type="dataset",
+                    token=hf_token
+                )
+
+                documents = extract_zip_and_process_json(local_zip_path)
+                all_documents.extend(documents)
+                log_message(f"Извлечено {len(documents)} документов из ZIP архива {zip_file_path}")
+
+            except Exception as e:
+                log_message(f"Ошибка обработки ZIP файла {zip_file_path}: {str(e)}")
+                continue
+
+        for file_path in json_files:
+            try:
+                log_message(f"Обрабатываю прямой JSON файл: {file_path}")
+                local_path = hf_hub_download(
+                    repo_id=repo_id,
+                    filename=file_path,
+                    local_dir=download_dir,
+                    repo_type="dataset",
+                    token=hf_token
+                )
+
+                with open(local_path, 'r', encoding='utf-8') as f:
+                    json_data = json.load(f)
+
+                document_metadata = json_data.get('document_metadata', {})
+                document_id = document_metadata.get('document_id', 'unknown')
+                document_name = document_metadata.get('document_name', 'unknown')
+
+                documents = extract_text_from_json(json_data, document_id, document_name)
+                all_documents.extend(documents)
+
+                log_message(f"Извлечено {len(documents)} документов из {file_path}")
+
+            except Exception as e:
+                log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
+                continue
+
+        log_message(f"Всего создано {len(all_documents)} исходных документов из JSON файлов")
|
| 316 |
+
|
| 317 |
+
# Process documents through chunking function
|
| 318 |
+
chunked_documents, chunk_info = process_documents_with_chunking(all_documents)
|
| 319 |
+
|
| 320 |
+
log_message(f"После chunking получено {len(chunked_documents)} чанков из JSON данных")
|
| 321 |
+
|
| 322 |
+
return chunked_documents, chunk_info
|
| 323 |
+
|
| 324 |
+
except Exception as e:
|
| 325 |
+
log_message(f"Ошибка загрузки JSON документов: {str(e)}")
|
| 326 |
+
return [], []
|
| 327 |
|
| 328 |
+
def extract_section_title(section_text):
|
| 329 |
+
if not section_text.strip():
|
| 330 |
+
return ""
|
|
|
|
| 331 |
|
| 332 |
+
lines = section_text.strip().split('\n')
|
| 333 |
+
first_line = lines[0].strip()
|
| 334 |
|
| 335 |
+
if len(first_line) < 200 and not first_line.endswith('.'):
|
| 336 |
+
return first_line
|
|
|
|
| 337 |
|
| 338 |
+
# Otherwise, extract first sentence
|
| 339 |
+
sentences = first_line.split('.')
|
| 340 |
+
if len(sentences) > 1:
|
| 341 |
+
return sentences[0].strip()
|
| 342 |
|
| 343 |
+
return first_line[:100] + "..." if len(first_line) > 100 else first_line
|
| 344 |
+
|
| 345 |
+
def extract_zip_and_process_json(zip_path):
|
| 346 |
documents = []
|
|
|
|
| 347 |
|
| 348 |
+
try:
|
| 349 |
+
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
| 350 |
+
zip_files = zip_ref.namelist()
|
| 351 |
+
json_files = [f for f in zip_files if f.endswith('.json') and not f.startswith('__MACOSX')]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 352 |
|
| 353 |
+
log_message(f"Найдено {len(json_files)} JSON файлов в архиве")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 354 |
|
| 355 |
+
for json_file in json_files:
|
| 356 |
+
try:
|
| 357 |
+
log_message(f"Обрабатываю файл из архива: {json_file}")
|
| 358 |
+
|
| 359 |
+
with zip_ref.open(json_file) as f:
|
| 360 |
+
json_data = json.load(f)
|
| 361 |
+
|
| 362 |
+
document_metadata = json_data.get('document_metadata', {})
|
| 363 |
+
document_id = document_metadata.get('document_id', 'unknown')
|
| 364 |
+
document_name = document_metadata.get('document_name', 'unknown')
|
| 365 |
+
|
| 366 |
+
docs = extract_text_from_json(json_data, document_id, document_name)
|
| 367 |
+
documents.extend(docs)
|
| 368 |
+
|
| 369 |
+
log_message(f"Извлечено {len(docs)} документов из {json_file}")
|
| 370 |
+
|
| 371 |
+
except Exception as e:
|
| 372 |
+
log_message(f"Ошибка обработки файла {json_file}: {str(e)}")
|
| 373 |
+
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
|
| 375 |
+
except Exception as e:
|
| 376 |
+
log_message(f"Ошибка извлечения ZIP архива {zip_path}: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
|
| 378 |
return documents
|
| 379 |
|
| 380 |
+
def load_image_data(repo_id, hf_token, image_data_dir):
|
| 381 |
+
log_message("Начинаю загрузку данных изображений")
|
|
|
|
| 382 |
|
| 383 |
+
image_files = []
|
| 384 |
try:
|
| 385 |
+
files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
|
| 386 |
+
for file in files:
|
| 387 |
+
if file.startswith(image_data_dir) and file.endswith('.csv'):
|
| 388 |
+
image_files.append(file)
|
| 389 |
|
| 390 |
+
log_message(f"Найдено {len(image_files)} CSV файлов с изображениями")
|
| 391 |
|
| 392 |
+
image_documents = []
|
| 393 |
+
for file_path in image_files:
|
| 394 |
+
try:
|
| 395 |
+
log_message(f"Обрабатываю файл изображений: {file_path}")
|
| 396 |
+
local_path = hf_hub_download(
|
| 397 |
+
repo_id=repo_id,
|
| 398 |
+
filename=file_path,
|
| 399 |
+
local_dir='',
|
| 400 |
+
repo_type="dataset",
|
| 401 |
+
token=hf_token
|
| 402 |
+
)
|
| 403 |
+
|
| 404 |
+
df = pd.read_csv(local_path)
|
| 405 |
+
log_message(f"Загружено {len(df)} записей изображений из файла {file_path}")
|
| 406 |
+
|
| 407 |
+
# Обработка с правильными названиями колонок
|
| 408 |
+
for _, row in df.iterrows():
|
| 409 |
+
section_value = row.get('Раздел документа', 'Неизвестно')
|
| 410 |
+
|
| 411 |
+
content = f"Изображение: {row.get('№ Изображения', 'Неизвестно')}\n"
|
| 412 |
+
content += f"Название: {row.get('Название изображения', 'Неизвестно')}\n"
|
| 413 |
+
content += f"Описание: {row.get('Описание изображение', 'Неизвестно')}\n" # Опечатка в названии колонки
|
| 414 |
+
content += f"Документ: {row.get('Обозначение документа', 'Неизвестно')}\n"
|
| 415 |
+
content += f"Раздел: {section_value}\n"
|
| 416 |
+
content += f"Файл: {row.get('Файл изображения', 'Неизвестно')}\n"
|
| 417 |
+
|
| 418 |
+
doc = Document(
|
| 419 |
+
text=content,
|
| 420 |
metadata={
|
| 421 |
+
"type": "image",
|
| 422 |
+
"image_number": str(row.get('№ Изображения', 'unknown')),
|
| 423 |
+
"image_title": str(row.get('Название изображения', 'unknown')),
|
| 424 |
+
"image_description": str(row.get('Описание изображение', 'unknown')),
|
| 425 |
+
"document_id": str(row.get('Обозначение документа', 'unknown')),
|
| 426 |
+
"file_path": str(row.get('Файл изображения', 'unknown')),
|
| 427 |
+
"section": str(section_value),
|
| 428 |
+
"section_id": str(section_value)
|
| 429 |
}
|
| 430 |
+
)
|
| 431 |
+
image_documents.append(doc)
|
| 432 |
+
|
| 433 |
+
except Exception as e:
|
| 434 |
+
log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
|
| 435 |
+
continue
|
| 436 |
+
|
| 437 |
+
log_message(f"Создано {len(image_documents)} документов из изображений")
|
| 438 |
+
return image_documents
|
| 439 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 440 |
except Exception as e:
|
| 441 |
+
log_message(f"Ошибка загрузки данных изображений: {str(e)}")
|
| 442 |
+
return []
|
|
|
|
| 443 |
|
| 444 |
|
| 445 |
+
def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
|
| 446 |
+
log_message("Загружаю данные чанков из CSV")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 447 |
|
| 448 |
+
try:
|
| 449 |
+
chunks_csv_path = hf_hub_download(
|
| 450 |
+
repo_id=repo_id,
|
| 451 |
+
filename=chunks_filename,
|
| 452 |
+
local_dir=download_dir,
|
| 453 |
+
repo_type="dataset",
|
| 454 |
+
token=hf_token
|
| 455 |
+
)
|
| 456 |
+
|
| 457 |
+
chunks_df = pd.read_csv(chunks_csv_path)
|
| 458 |
+
log_message(f"Загружено {len(chunks_df)} чанков из CSV")
|
| 459 |
+
|
| 460 |
+
text_column = None
|
| 461 |
+
for col in chunks_df.columns:
|
| 462 |
+
if 'text' in col.lower() or 'content' in col.lower() or 'chunk' in col.lower():
|
| 463 |
+
text_column = col
|
| 464 |
+
break
|
| 465 |
+
|
| 466 |
+
if text_column is None:
|
| 467 |
+
text_column = chunks_df.columns[0]
|
| 468 |
+
|
| 469 |
+
log_message(f"Использую колонку: {text_column}")
|
| 470 |
+
|
| 471 |
+
documents = []
|
| 472 |
+
for i, (_, row) in enumerate(chunks_df.iterrows()):
|
| 473 |
+
doc = Document(
|
| 474 |
+
text=str(row[text_column]),
|
| 475 |
+
metadata={
|
| 476 |
+
"chunk_id": row.get('chunk_id', i),
|
| 477 |
+
"document_id": row.get('document_id', 'unknown'),
|
| 478 |
+
"type": "text"
|
| 479 |
+
}
|
| 480 |
)
|
| 481 |
+
documents.append(doc)
|
| 482 |
+
|
| 483 |
+
log_message(f"Создано {len(documents)} текстовых документов из CSV")
|
| 484 |
+
return documents, chunks_df
|
| 485 |
+
|
| 486 |
+
except Exception as e:
|
| 487 |
+
log_message(f"Ошибка загрузки CSV данных: {str(e)}")
|
| 488 |
+
return [], None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
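For orientation, here is a minimal sketch (not part of the commit) of the hierarchical JSON layout that extract_text_from_json() above walks. The key names mirror the .get() calls in the added code; the concrete document values are invented for illustration.

# Hypothetical input shape for extract_text_from_json (illustration only)
sample = {
    "document_metadata": {
        "document_id": "ГОСТ Р 50.04.07-2022",          # example identifier
        "document_name": "Пример документа"              # invented value
    },
    "sections": [
        {
            "section_id": "5",
            "section_text": "Общие положения\nТекст раздела...",
            "subsections": [
                {
                    "subsection_id": "5.1",
                    "subsection_text": "Требования\nТекст подраздела...",
                    "sub_subsections": []
                }
            ]
        }
    ]
}

docs = extract_text_from_json(
    sample,
    sample["document_metadata"]["document_id"],
    sample["document_metadata"]["document_name"],
)
# Each returned Document carries "section_path" metadata such as "5" for the section
# and "5.5.1" for the nested subsection, which the retriever later exposes in sources.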
index_retriever.py
CHANGED
@@ -1,178 +1,77 @@
-from llama_index.core import VectorStoreIndex
+from llama_index.core import VectorStoreIndex, Settings
 from llama_index.core.query_engine import RetrieverQueryEngine
 from llama_index.core.retrievers import VectorIndexRetriever
+from llama_index.core.response_synthesizers import get_response_synthesizer, ResponseMode
+from llama_index.core.prompts import PromptTemplate
 from llama_index.retrievers.bm25 import BM25Retriever
 from llama_index.core.retrievers import QueryFusionRetriever
-from llama_index.core.response_synthesizers import get_response_synthesizer
 from my_logging import log_message
-
-import re
-
-import re
-from difflib import SequenceMatcher
-
+from config import CUSTOM_PROMPT, PROMPT_SIMPLE_POISK
 
 def create_vector_index(documents):
-    index = VectorStoreIndex.from_documents(documents)
-    log_message("✓ Index created")
-    return index
-
-def keyword_filter_nodes(query, nodes, min_keyword_matches=1):
-    """Return nodes that contain at least one keyword from the query."""
-    keywords = [w.lower() for w in query.split() if len(w) > 2]
-    filtered = []
-    for node in nodes:
-        text = node.text.lower()
-        if any(k in text for k in keywords):
-            filtered.append(node)
-    return filtered
-
-
-def normalize_doc_id(doc_id: str) -> str:
-    """Normalize document ID for consistent comparison."""
-    doc_id = doc_id.upper().strip()
-    doc_id = re.sub(r'[^\w\d\.]+', '', doc_id)  # remove spaces, dashes, etc.
-    doc_id = doc_id.replace("ГОСТР", "ГОСТ")
-    doc_id = doc_id.replace("GOSTR", "ГОСТ")
-    return doc_id
-
-def base_number(doc_id: str) -> str:
-    """Extract base numeric pattern (e.g., '59023.4' from 'ГОСТ Р 59023.4-2020')."""
-    m = re.search(r'(\d+(?:\.\d+)+)', doc_id)
-    return m.group(1) if m else ""
-
-def filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.5):
-    """Filter nodes by normalized document ID with fallback to fuzzy numeric match."""
-    if not doc_ids:
-        return nodes
-
-    filtered = []
-    doc_ids_norm = [normalize_doc_id(d) for d in doc_ids]
-    doc_ids_base = [base_number(d) for d in doc_ids_norm]
-
-    for node in nodes:
-        node_doc_id = normalize_doc_id(node.metadata.get('document_id', ''))
-        node_base = base_number(node_doc_id)
-
-        for q_doc, q_base in zip(doc_ids_norm, doc_ids_base):
-            # Strong match: same base number (e.g., 59023.4)
-            if q_base and node_base and q_base == node_base:
-                filtered.append(node)
-                break
-
-            # Medium match: similarity ratio > threshold
-            if SequenceMatcher(None, node_doc_id, q_doc).ratio() >= threshold:
-                filtered.append(node)
-                break
-
-            # Weak fallback: contains or partial substring
-            if q_base in node_doc_id or q_doc in node_doc_id:
-                filtered.append(node)
-                break
-
-    """Extract document IDs from query text with better pattern matching"""
-    patterns = [
-        r'ГОСТ\s*Р?\s*\d+(?:\.\d+)*(?:-\d{4})?',  # ГОСТ 59023.4, ГОСТ Р 50.05.01-2018
-        r'НП-\d+(?:-\d+)?',  # НП-104-18
-        r'МУ[_\s]\d+(?:\.\d+)+(?:\.\d+)*(?:-\d{4})?',  # МУ 1.2.3.07.0057-2018
-    ]
-
-    found_ids = []
-    for pattern in patterns:
-        matches = re.findall(pattern, query, re.IGNORECASE)
-        found_ids.extend(matches)
-
-    # Normalize spacing and preserve dots
-    normalized = [re.sub(r'\s+', ' ', id.strip().upper()) for id in found_ids]
-    return normalized
-
-def russian_tokenizer(text):
-    """Better tokenizer for Russian document IDs and technical terms"""
-    import re
-    return
+    log_message("Строю векторный индекс")
+    return VectorStoreIndex.from_documents(documents)
+
+def deduplicate_nodes(nodes):
+    """Deduplicate retrieved nodes based on unique identifiers"""
+    seen = set()
+    unique_nodes = []
+
+    for node in nodes:
+        # Create unique identifier from metadata
+        doc_id = node.metadata.get('document_id', '')
+        section_id = node.metadata.get('section_id', '')
+        chunk_id = node.metadata.get('chunk_id', 0)
+        node_type = node.metadata.get('type', 'text')
+
+        if node_type == 'table':
+            table_num = node.metadata.get('table_number', '')
+            identifier = f"{doc_id}|table|{table_num}|{chunk_id}"
+        elif node_type == 'image':
+            img_num = node.metadata.get('image_number', '')
+            identifier = f"{doc_id}|image|{img_num}"
+        else:
+            identifier = f"{doc_id}|{section_id}|{chunk_id}"
+
+        if identifier not in seen:
+            seen.add(identifier)
+            unique_nodes.append(node)
+
+    return unique_nodes
 
 def create_query_engine(vector_index):
-    seen_hashes = set()
-    unique_nodes = []
-    doc_type_counts = {'text': 0, 'table': 0, 'image': 0}
-
-    for node in nodes:
-        text_hash = hash(node.text[:500])
-
-        if text_hash not in seen_hashes:
-            seen_hashes.add(text_hash)
-            unique_nodes.append(node)
-
-            node_type = node.metadata.get('type', 'text')
-            doc_type_counts[node_type] = doc_type_counts.get(node_type, 0) + 1
-
-    log_message(f"After dedup: {len(unique_nodes)} unique nodes")
-    log_message(f"Types: text={doc_type_counts.get('text', 0)}, "
-                f"table={doc_type_counts.get('table', 0)}, "
-                f"image={doc_type_counts.get('image', 0)}")
-
-    # Log which documents we're returning
-    returned_docs = set(n.metadata.get('document_id', 'unknown') for n in unique_nodes[:50])
-    log_message(f"Returning nodes from: {sorted(returned_docs)}")
-
-    return unique_nodes[:50]
-
-    response_synthesizer = get_response_synthesizer()
-
-    query_engine = DeduplicatedQueryEngine(
-        retriever=hybrid_retriever,
-        response_synthesizer=response_synthesizer
-    )
-
-    log_message("✓ Query engine created with doc ID filtering")
-    return query_engine
+    try:
+        bm25_retriever = BM25Retriever.from_defaults(
+            docstore=vector_index.docstore,
+            similarity_top_k=20
+        )
+
+        vector_retriever = VectorIndexRetriever(
+            index=vector_index,
+            similarity_top_k=30,
+            similarity_cutoff=0.65
+        )
+
+        hybrid_retriever = QueryFusionRetriever(
+            [vector_retriever, bm25_retriever],
+            similarity_top_k=40,
+            num_queries=1
+        )
+
+        custom_prompt_template = PromptTemplate(PROMPT_SIMPLE_POISK)
+        response_synthesizer = get_response_synthesizer(
+            response_mode=ResponseMode.TREE_SUMMARIZE,
+            text_qa_template=custom_prompt_template
+        )
+
+        query_engine = RetrieverQueryEngine(
+            retriever=hybrid_retriever,
+            response_synthesizer=response_synthesizer
+        )
+
+        log_message("Query engine успешно создан")
+        return query_engine
+
+    except Exception as e:
+        log_message(f"Ошибка создания query engine: {str(e)}")
+        raise
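A usage sketch (an assumption, not part of the commit) of how these two helpers are typically wired together. It assumes `documents` is the list of llama_index Documents produced by the loaders in documents_prep.py and table_prep.py, and that the model helpers from utils.py are available.

# Minimal wiring sketch under those assumptions
from llama_index.core import Settings

Settings.llm = get_llm_model(api_key="...")        # helper from utils.py
Settings.embed_model = get_embedding_model()       # helper from utils.py

index = create_vector_index(documents)
query_engine = create_query_engine(index)

# Retrieval-only call, as utils.answer_question does before reranking:
nodes = query_engine.retrieve("порядок признания протоколов испытаний")

# Or end-to-end, letting the TREE_SUMMARIZE synthesizer compose the answer:
response = query_engine.query("порядок признания протоколов испытаний")
print(response)

The hybrid QueryFusionRetriever fuses BM25 keyword matches with dense vector hits, which is why exact document codes like "ГОСТ Р 50.04.07-2022" can still be found even when their embeddings are weak.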
table_prep.py
CHANGED
@@ -1,142 +1,163 @@
+from collections import defaultdict
+import json
+from huggingface_hub import hf_hub_download, list_repo_files
 from llama_index.core import Document
-from config import CHUNK_SIZE, CHUNK_OVERLAP
 from my_logging import log_message
 
-def normalize_table_number(table_num, section):
-    """Normalize table numbers for consistent retrieval"""
-    if not table_num or table_num == 'Неизвестно':
-        return 'Неизвестно'
-
-    # Clean up common prefixes
-    tn = str(table_num).replace('Таблица', '').replace('№', '').strip()
-
-    # Add section context for appendix tables
-    if section and ('Приложение' in str(section) or 'приложение' in str(section).lower()):
-        return f"№{tn} ({section})"
-
-    return f"№{tn}"
-
 def create_table_content(table_data):
-    doc_id = (
-        table_data.get('document_id') or
-        table_data.get('document') or
-        table_data.get('Обозначение документа') or
-        'Неизвестно'
-    )
+    """Create formatted content from table data"""
+    doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
     table_num = table_data.get('table_number', 'Неизвестно')
     table_title = table_data.get('table_title', 'Неизвестно')
-    section = (
-        table_data.get('section') or
-        table_data.get('Раздел документа') or
-        'Неизвестно'
-    )
-    sheet_name = table_data.get('sheet_name', '')
-
-    # Enhanced table number with appendix context
-    normalized_num = normalize_table_number(table_num, section)
-    if 'Приложени' in str(section):
-        # Extract appendix number
-        import re
-        appendix_match = re.search(r'Приложени[ея]\s*(\d+)', str(section))
-        if appendix_match:
-            appendix_num = appendix_match.group(1)
-            normalized_num = f"{normalized_num} Приложения {appendix_num}"
-
-    content = f"Документ: {doc_id}\n"
-    content += f"Раздел: {section}\n"
-    content += f"Таблица: {normalized_num}\n"
+    section = table_data.get('section', 'Неизвестно')
+
+    content = f"Таблица: {table_num}\n"
     content += f"Название: {table_title}\n"
-    content += f"\n"
+    content += f"Документ: {doc_id}\n"
+    content += f"Раздел: {section}\n"
 
     headers = table_data.get('headers', [])
     if headers:
-        content += f"Колонки: {header_str}\n\n"
+        content += f"\nЗаголовки: {' | '.join(headers)}\n"
 
-    # CRITICAL: Preserve searchable row identifiers
     if 'data' in table_data and isinstance(table_data['data'], list):
+        content += "\nДанные таблицы:\n"
         for row_idx, row in enumerate(table_data['data'], start=1):
             if isinstance(row, dict):
-                for k, v in row.items():
-                    if v and str(v).strip() and str(v) != 'nan':
-                        row_parts.append(f"{k}: {v}")
-
-                if row_parts:
-                    content += ' | '.join(row_parts) + "\n"
-            elif isinstance(row, list):
-                row_str = ' | '.join([str(v) for v in row if v and str(v).strip() and str(v) != 'nan'])
-                if row_str:
-                    content += row_str + "\n"
+                row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
+                content += f"Строка {row_idx}: {row_text}\n"
 
     return content
+
+from llama_index.core.text_splitter import SentenceSplitter
+from config import CHUNK_SIZE, CHUNK_OVERLAP
+
+def extract_table_metadata(table_text: str) -> dict:
+    words = table_text.split()
+    unique_words = set(words)
+
+    from collections import Counter
+    stopwords = {"и", "в", "на", "по", "с", "для", "из", "при", "а", "как", "или", "но", "к", "от"}
+    filtered = [w for w in words if len(w) > 3 and w.lower() not in stopwords]
+    common = Counter(filtered).most_common(15)
+    key_terms = [w for w, _ in common]
+
+    return {
+        "summary": f"Таблица содержит около {len(words)} слов и {len(unique_words)} уникальных терминов.",
+        "materials": [],  # if you want to extract material names, hook in regex or LLM here
+        "key_terms": key_terms
+    }
+
+def chunk_table_document(doc, chunk_size=None, chunk_overlap=None, rows_per_chunk=4):
     if chunk_size is None:
         chunk_size = CHUNK_SIZE
     if chunk_overlap is None:
         chunk_overlap = CHUNK_OVERLAP
 
+    # Extract critical metadata from table before chunking
+    table_metadata = extract_table_metadata(doc.text)
     table_num = doc.metadata.get('table_number', 'unknown')
+    table_title = doc.metadata.get('table_title', 'unknown')
     doc_id = doc.metadata.get('document_id', 'unknown')
-    full_table_id = f"{doc_id} | {section} | {table_num}"
+    section = doc.metadata.get('section', 'unknown')
 
+    # Parse table structure
     lines = doc.text.strip().split('\n')
 
-            text_chunks.append(chunk_text)
-
+    table_header_lines = []
+    data_rows = []
+    in_data = False
+
+    for line in lines:
+        if line.startswith('Данные таблицы:'):
+            in_data = True
+            table_header_lines.append(line)
+        elif in_data and line.startswith('Строка'):
+            data_rows.append(line)
+        elif not in_data:
+            table_header_lines.append(line)
+
+    table_header = '\n'.join(table_header_lines) + '\n'
+
+    if not data_rows:
+        log_message(f"  ⚠️ Таблица {table_num}: нет строк данных, использую стандартное разбиение")
+        text_splitter = SentenceSplitter(
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            separator="\n"
+        )
+        text_chunks = text_splitter.split_text(doc.text)
+        log_message(f"  📊 Стандартное разбиение: {len(text_chunks)} чанков")
+    else:
+        log_message(f"  📋 Таблица {table_num}: найдено {len(data_rows)} строк данных")
+
+        header_size = len(table_header)
+        available_size = chunk_size - header_size - 300  # Reserve for enrichment
+
+        text_chunks = []
+        current_chunk_rows = []
+        current_size = 0
+
+        for row in data_rows:
+            row_size = len(row) + 1
+
+            # If single row exceeds available size, split it
+            if row_size > available_size:
+                log_message(f"  ⚠️ Строка слишком длинная ({row_size} символов), разбиваем внутри строки")
+
+                # Flush current chunk if exists
+                if current_chunk_rows:
+                    chunk_text = table_header + '\n'.join(current_chunk_rows)
+                    text_chunks.append(chunk_text)
+                    log_message(f"  ✂️ Чанк создан: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
+                    current_chunk_rows = []
+                    current_size = 0
+
+                # Split the oversized row
+                text_splitter = SentenceSplitter(
+                    chunk_size=available_size,
+                    chunk_overlap=100,
+                    separator=" | "
+                )
+                row_parts = text_splitter.split_text(row)
+                log_message(f"    Строка разделена на {len(row_parts)} частей")
+
+                for part in row_parts:
+                    chunk_text = table_header + part
+                    text_chunks.append(chunk_text)
+                    log_message(f"    Под-чанк создан: {len(chunk_text)} символов")
+
+                continue
+
+            # Check if adding row would exceed rows_per_chunk OR size limit
+            if (len(current_chunk_rows) >= rows_per_chunk or
+                    (current_size + row_size > available_size)) and current_chunk_rows:
+
+                chunk_text = table_header + '\n'.join(current_chunk_rows)
+                text_chunks.append(chunk_text)
+                log_message(f"  ✂️ Чанк создан: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
+
+                # Overlap: keep last 1 row
+                overlap_count = min(1, len(current_chunk_rows))
+                current_chunk_rows = current_chunk_rows[-overlap_count:]
+                current_size = sum(len(r) + 1 for r in current_chunk_rows)
+
+            current_chunk_rows.append(row)
+            current_size += row_size
+
+        # Final chunk
+        if current_chunk_rows:
+            chunk_text = table_header + '\n'.join(current_chunk_rows)
+            text_chunks.append(chunk_text)
+            log_message(f"  ✂️ Последний чанк: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
 
+    log_message(f"  📊 Таблица {table_num} разделена на {len(text_chunks)} чанков")
 
+    # Create enriched chunks (rest of the function remains the same)
     chunked_docs = []
+    materials = table_metadata.get("materials", [])
+    key_terms = table_metadata.get("key_terms", [])
+
     for i, chunk_text in enumerate(text_chunks):
         chunk_metadata = doc.metadata.copy()
         chunk_metadata.update({
@@ -144,12 +165,22 @@ def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
             "total_chunks": len(text_chunks),
             "chunk_size": len(chunk_text),
             "is_chunked": True,
+            "materials": materials,
+            "key_terms": key_terms,
+            "table_summary": table_metadata.get("summary", "")
         })
 
+        materials_str = ', '.join(materials[:10]) if materials else 'нет'
+        terms_str = ', '.join(key_terms[:10]) if key_terms else 'нет'
+
+        enriched_text = f"""[Таблица {table_num}: {table_title}]
+[Материалы в таблице: {materials_str}]
+[Ключевые термины: {terms_str}]
+
+{chunk_text}"""
+
         chunked_doc = Document(
-            text=
+            text=enriched_text,
             metadata=chunk_metadata
         )
         chunked_docs.append(chunked_doc)
@@ -158,102 +189,138 @@
 
 
 def table_to_document(table_data, document_id=None):
-    """Convert table data to Document with complete metadata"""
     if not isinstance(table_data, dict):
+        log_message(f"⚠️ ПРОПУЩЕНА: table_data не является словарем")
         return []
 
-    sheet_doc_id = (
-        table_data.get('document_id') or
-        table_data.get('document') or
-        table_data.get('Обозначение документа')
-    )
-
-    doc_id = sheet_doc_id or document_id or 'Неизвестно'
-
+    doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
     table_num = table_data.get('table_number', 'Неизвестно')
     table_title = table_data.get('table_title', 'Неизвестно')
-    sheet_name = table_data.get('sheet_name', '')
+    section = table_data.get('section', 'Неизвестно')
 
     table_rows = table_data.get('data', [])
-    if not table_rows:
+    if not table_rows or len(table_rows) == 0:
+        log_message(f"⚠️ ПРОПУЩЕНА: Таблица {table_num} из '{doc_id}' - нет данных в 'data'")
         return []
 
+    content = create_table_content(table_data)
     content_size = len(content)
+    row_count = len(table_rows)
 
     base_doc = Document(
         text=content,
         metadata={
            "type": "table",
            "table_number": table_num,
-           "table_number_normalized": normalized_num,
            "table_title": table_title,
            "document_id": doc_id,
            "section": section,
            "section_id": section,
-           "content_size": content_size,
-           "full_table_id": f"{doc_id} | {section} | {normalized_num}"
+           "total_rows": row_count,
+           "content_size": content_size
        }
    )
 
    if content_size > CHUNK_SIZE:
+        log_message(f"📊 CHUNKING: Таблица {table_num} из '{doc_id}' | "
+                    f"Размер: {content_size} > {CHUNK_SIZE} | Строк: {row_count}")
+        chunked_docs = chunk_table_document(base_doc)
+        log_message(f"  ✂️ Разделена на {len(chunked_docs)} чанков")
+        for i, chunk_doc in enumerate(chunked_docs):
+            log_message(f"    Чанк {i+1}: {chunk_doc.metadata['chunk_size']} символов")
+        return chunked_docs
    else:
+        log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
+                    f"Размер: {content_size} символов | Строк: {row_count}")
        return [base_doc]
 
 
-    # FIXED: Extract sheet-level document_id first
-    sheet_doc_id = (
-        table_data.get('document_id') or
-        table_data.get('document') or
-        table_data.get('Обозначение документа')
-    )
-
-    content, normalized_num = create_table_content(table_data)
-    content_size = len(content)
-
-    base_doc = Document(
-        text=content,
-        metadata={
-            "type": "table",
-            "table_number": table_num,
-            "table_number_normalized": normalized_num,
-            "table_title": table_title,
-            "document_id": doc_id,
-            "section": section,
-            "section_id": section,
-            "total_rows": len(table_rows),
-            "content_size": content_size,
-            "full_table_id": f"{doc_id} | {section} | {normalized_num}"
        }
-
+def load_table_data(repo_id, hf_token, table_data_dir):
+    log_message("=" * 60)
+    log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
+    log_message("=" * 60)
+
+    try:
+        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
+        table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
+
+        log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
+
+        table_documents = []
+        stats = {
+            'total_tables': 0,
+            'total_size': 0,
+            'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
        }
+
+        for file_path in table_files:
+            try:
+                local_path = hf_hub_download(
+                    repo_id=repo_id,
+                    filename=file_path,
+                    local_dir='',
+                    repo_type="dataset",
+                    token=hf_token
+                )
+
+                log_message(f"\nОбработка файла: {file_path}")
+
+                with open(local_path, 'r', encoding='utf-8') as f:
+                    table_data = json.load(f)
+
+                if isinstance(table_data, dict):
+                    document_id = table_data.get('document', 'unknown')
+
+                    if 'sheets' in table_data:
+                        sorted_sheets = sorted(
+                            table_data['sheets'],
+                            key=lambda sheet: sheet.get('table_number', '')  # or use 'table_number'
+                        )
+
+                        for sheet in sorted_sheets:
+                            sheet['document'] = document_id
+                            docs_list = table_to_document(sheet, document_id)
+                            table_documents.extend(docs_list)
+
+                            for doc in docs_list:
+                                stats['total_tables'] += 1
+                                size = doc.metadata.get('content_size', 0)
+                                stats['total_size'] += size
+                                stats['by_document'][document_id]['count'] += 1
+                                stats['by_document'][document_id]['size'] += size
+                    else:
+                        docs_list = table_to_document(table_data, document_id)
+                        table_documents.extend(docs_list)
+
+                        for doc in docs_list:
+                            stats['total_tables'] += 1
+                            size = doc.metadata.get('content_size', 0)
+                            stats['total_size'] += size
+                            stats['by_document'][document_id]['count'] += 1
+                            stats['by_document'][document_id]['size'] += size
+
+            except Exception as e:
+                log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
+                continue
+
+        # Log summary statistics
+        log_message("\n" + "=" * 60)
+        log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
+        log_message("=" * 60)
+        log_message(f"Всего таблиц добавлено: {stats['total_tables']}")
+        log_message(f"Общий размер: {stats['total_size']:,} символов")
+        log_message(f"Средний размер таблицы: {stats['total_size'] // stats['total_tables'] if stats['total_tables'] > 0 else 0:,} символов")
+
+        log_message("\nПо документам:")
+        for doc_id, doc_stats in sorted(stats['by_document'].items()):
+            log_message(f"  • {doc_id}: {doc_stats['count']} таблиц, "
+                        f"{doc_stats['size']:,} символов")
+
+        log_message("=" * 60)
+
+        return table_documents
+
+    except Exception as e:
+        log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
+        return []
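To illustrate the row-based table chunking added above, here is a small sketch (not part of the commit; the table values are invented) of the dict shape that create_table_content() and table_to_document() consume.

# Hypothetical table_data in the format expected by table_to_document (illustration only)
table_data = {
    "document": "ГОСТ Р 50.04.07-2022",   # falls back to 'document' when 'document_id' is absent
    "table_number": "Л.1",                 # invented table identifier
    "table_title": "Методы исследований",
    "section": "Приложение Л",
    "headers": ["Параметр", "Метод"],
    "data": [
        {"Параметр": "Твердость", "Метод": "по ГОСТ 9013"},
        {"Параметр": "Ударная вязкость", "Метод": "по ГОСТ 9454"},
    ],
}

docs = table_to_document(table_data)
# A small table comes back as a single Document. A table whose rendered text exceeds
# CHUNK_SIZE is split by chunk_table_document(): each chunk repeats the header block
# ("Таблица / Название / Документ / Раздел / Заголовки") plus the enrichment lines,
# so every chunk stays searchable by document and table number on its own.
for d in docs:
    print(d.metadata["table_number"], d.metadata.get("chunk_id", 0), len(d.text))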
utils.py
CHANGED
@@ -4,20 +4,15 @@ from sentence_transformers import CrossEncoder
 from my_logging import log_message
 
 def get_llm_model(api_key, model_name="gemini-2.0-flash"):
-    """Get LLM model"""
     return GoogleGenAI(model=model_name, api_key=api_key)
 
 def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
-    """Get embedding model"""
     return HuggingFaceEmbedding(model_name=model_name)
 
 def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
-    """Get reranker model"""
     return CrossEncoder(model_name)
 
-
 def format_sources(nodes):
-    """Format retrieved sources for display"""
     sources = []
     for node in nodes:
         meta = node.metadata
@@ -37,21 +32,132 @@
 
     return "\n".join(set(sources))
 
+def create_chunks_info_for_display(nodes):
+    chunks_info = []
+    for node in nodes:
+        meta = node.metadata
+        chunk_info = {
+            'document_id': meta.get('document_id', 'unknown'),
+            'section_path': meta.get('section_path', ''),
+            'section_id': meta.get('section_id', 'unknown'),
+            'section_text': meta.get('section_text', ''),
+            'parent_section': meta.get('parent_section', ''),
+            'parent_title': meta.get('parent_title', ''),
+            'level': meta.get('level', ''),
+            'chunk_text': node.text[:500],
+            'type': meta.get('type', 'text'),
+            'table_number': meta.get('table_number', ''),
+            'image_number': meta.get('image_number', '')
+        }
+        chunks_info.append(chunk_info)
+    return chunks_info
+
+def format_answer_html(answer_text, model_name):
+    html = f"""
+    <div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px;'>
+        <div style='margin-bottom: 10px;'>
+            <span style='background-color: #4a5568; padding: 5px 10px; border-radius: 5px; font-size: 12px;'>
+                Модель: {model_name}
+            </span>
+        </div>
+        <div style='line-height: 1.6;'>
+            {answer_text}
+        </div>
+    </div>
+    """
+    return html
+
+def format_sources_html(sources_text):
+    if not sources_text or sources_text == "":
+        return "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Нет источников</div>"
+
+    sources_list = sources_text.strip().split('\n')
+    html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px;'>"
+    html += "<h4 style='color: white; margin-bottom: 15px;'>Использованные источники:</h4>"
+    html += "<div style='line-height: 2;'>"
+
+    for source in sources_list:
+        if source.strip():
+            html += f"<div style='padding: 5px 0; border-bottom: 1px solid #4a5568;'>{source}</div>"
+
+    html += "</div></div>"
+    return html
+
+def format_chunks_html(chunks_info):
+    if not chunks_info:
+        return "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Нет данных о чанках</div>"
+
+    html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 500px; overflow-y: auto;'>"
+    html += f"<h4 style='color: white; margin-bottom: 15px;'>Найдено релевантных чанков: {len(chunks_info)}</h4>"
+
+    for i, chunk in enumerate(chunks_info):
+        bg_color = "#4a5568" if i % 2 == 0 else "#374151"
+
+        from app import get_section_display, get_formatted_content
+        section_display = get_section_display(chunk)
+        formatted_content = get_formatted_content(chunk)
+
+        html += f"""
+        <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #60a5fa;'>
+            <strong style='color: #93c5fd;'>Документ:</strong> <span style='color: white;'>{chunk['document_id']}</span><br>
+            <strong style='color: #93c5fd;'>Раздел:</strong> <span style='color: white;'>{section_display}</span><br>
+            <strong style='color: #93c5fd;'>Содержание:</strong><br>
+            <div style='background-color: #1f2937; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: #d1d5db; max-height: 200px; overflow-y: auto;'>
+                {formatted_content}
+            </div>
+        </div>
+        """
+
+    html += "</div>"
+    return html
 
+def deduplicate_nodes(nodes):
+    """Deduplicate retrieved nodes based on unique identifiers"""
+    seen = set()
+    unique_nodes = []
+
+    for node in nodes:
+        # Create unique identifier from metadata
+        doc_id = node.metadata.get('document_id', '')
+        section_id = node.metadata.get('section_id', '')
+        chunk_id = node.metadata.get('chunk_id', 0)
+        node_type = node.metadata.get('type', 'text')
+
+        if node_type == 'table':
+            table_num = node.metadata.get('table_number', '')
+            identifier = f"{doc_id}|table|{table_num}|{chunk_id}"
+        elif node_type == 'image':
+            img_num = node.metadata.get('image_number', '')
+            identifier = f"{doc_id}|image|{img_num}"
+        else:
+            identifier = f"{doc_id}|{section_id}|{chunk_id}"
+
+        if identifier not in seen:
+            seen.add(identifier)
+            unique_nodes.append(node)
+
+    return unique_nodes
+
+
-def answer_question(question, query_engine, reranker):
+def answer_question(question, query_engine, reranker, model_name):
     try:
         log_message(f"\n{'='*70}")
         log_message(f"QUERY: {question}")
 
         retrieved = query_engine.retrieve(question)
-
-        reranked = rerank_nodes(question, retrieved, reranker, top_k=20, min_score=-0.5)
-        log_message(f"RERANKED: {len(reranked)} nodes")
+        total_retrieved = len(retrieved)
+        log_message(f"RETRIEVED: {total_retrieved} nodes (before deduplication)")
+
+        # Deduplicate
+        unique_retrieved = deduplicate_nodes(retrieved)
+        duplicates_removed = total_retrieved - len(unique_retrieved)
+        log_message(f"DEDUPLICATION: {duplicates_removed} duplicates removed")
+        log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
+
+        reranked = rerank_nodes(question, unique_retrieved, reranker, top_k=20, min_score=-0.5)
+        log_message(f"RERANKED: {len(reranked)} nodes (after scoring)")
 
-        # Group by document and type
         doc_groups = {}
         for n in reranked:
             doc_id = n.metadata.get('document_id', 'unknown')
@@ -68,12 +174,10 @@ def answer_question(question, query_engine, reranker):
 
         log_message(f"Documents found: {list(doc_groups.keys())}")
 
-        # Format context by document
         context_parts = []
         for doc_id, groups in doc_groups.items():
             doc_section = [f"=== ДОКУМЕНТ: {doc_id} ==="]
 
-            # Tables first (most important for your queries)
             if groups['tables']:
                 doc_section.append("\n--- ТАБЛИЦЫ ---")
                 for n in groups['tables']:
@@ -81,13 +185,21 @@
                     table_id = meta.get('table_identifier', meta.get('table_number', 'unknown'))
                     title = meta.get('table_title', '')
                     doc_section.append(f"\n[Таблица {table_id}] {title}")
                     doc_section.append(n.text[:1500])
                     log_message(f"  Included table {table_id} from {doc_id}")
 
+            if groups['images']:
+                doc_section.append("\n--- ИЗОБРАЖЕНИЯ ---")
+                for n in groups['images']:
+                    meta = n.metadata
+                    img_id = meta.get('image_number', 'unknown')
+                    doc_section.append(f"\n[Рисунок {img_id}]")
+                    doc_section.append(n.text[:1000])
+                    log_message(f"  Included image {img_id} from {doc_id}")
+
             if groups['text']:
                 doc_section.append("\n--- ТЕКСТ ---")
                 for n in groups['text'][:3]:
                     doc_section.append(n.text[:800])
                     log_message(f"  Included text section from {doc_id}")
 
@@ -103,26 +215,35 @@
         from llama_index.core import Settings
         response = Settings.llm.complete(prompt)
 
+        sources_text = format_sources(reranked)
+        chunks_info = create_chunks_info_for_display(reranked)
+
+        answer_html = format_answer_html(response.text, model_name)
+        sources_html = format_sources_html(sources_text)
+        chunks_html = format_chunks_html(chunks_info)
+
+        return answer_html, sources_html, chunks_html
 
     except Exception as e:
         log_message(f"Error: {e}")
         import traceback
         log_message(traceback.format_exc())
+
+        error_html = f"<div style='background-color: #2d3748; color: #ef4444; padding: 20px; border-radius: 10px;'>Ошибка: {str(e)}</div>"
+        sources_html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Источники недоступны из-за ошибки</div>"
+        chunks_html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Чанки недоступны из-за ошибки</div>"
+
+        return error_html, sources_html, chunks_html
 
-def rerank_nodes(query, nodes, reranker, top_k=20, min_score=0.1):  # Much lower
-    """Rerank with detailed score logging"""
+def rerank_nodes(query, nodes, reranker, top_k=20, min_score=-0.5):
     if not nodes or not reranker:
         log_message("WARNING: No nodes or reranker available")
         return nodes[:top_k]
 
     pairs = [[query, n.text[:500]] for n in nodes]
     scores = reranker.predict(pairs)
     scored = sorted(zip(nodes, scores), key=lambda x: x[1], reverse=True)
 
-    # Detailed logging
     if scored:
         top_5_scores = [s for _, s in scored[:5]]
         bottom_5_scores = [s for _, s in scored[-5:]]
@@ -130,7 +251,6 @@ def rerank_nodes(query, nodes, reranker, top_k=20, min_score=0.1): # Much lower
         log_message(f"Top 5 scores: {top_5_scores}")
         log_message(f"Bottom 5 scores: {bottom_5_scores}")
 
-        # Count how many pass threshold
         above_threshold = sum(1 for _, s in scored if s >= min_score)
         log_message(f"Nodes above threshold ({min_score}): {above_threshold}/{len(scored)}")
 
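As a standalone sketch of the reranking step that rerank_nodes() relies on (the texts below are invented; only the model name comes from get_reranker_model above): the cross-encoder scores each (query, passage) pair jointly, and the function keeps the top_k passages, reporting how many clear min_score.

# Illustration of the cross-encoder scoring used by rerank_nodes (assumed inputs)
from sentence_transformers import CrossEncoder

reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

query = "порядок признания протоколов испытаний"
passages = [
    "Протоколы испытаний признаются при условии аккредитации лаборатории...",
    "Термическая обработка заготовок из легированных сталей...",
]

# Same pair construction as in rerank_nodes: query paired with the first 500 characters
scores = reranker.predict([[query, p[:500]] for p in passages])
ranked = sorted(zip(passages, scores), key=lambda x: x[1], reverse=True)
for text, score in ranked:
    print(f"{score:+.3f}  {text[:60]}")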