Commit 9985d37 (parent: 6370d73): simplest version

Changed files:
- app.py +102 -328
- documents_prep.py +220 -540
- index_retriever.py +54 -113
- utils.py +81 -277
app.py
CHANGED
@@ -1,355 +1,129 @@
 import gradio as gr
-import os
 from llama_index.core import Settings
-from documents_prep import
-from utils import *
-from my_logging import log_message
 from index_retriever import create_vector_index, create_query_engine
-import
-from
-
-def
-        section_display = get_section_display(chunk)
-        formatted_content = get_formatted_content(chunk)
-
-        html += f"""
-        <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
-            <strong style='color: black;'>Документ:</strong> <span style='color: black;'>{chunk['document_id']}</span><br>
-            <strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{section_display}</span><br>
-            <strong style='color: black;'>Содержание:</strong><br>
-            <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
-                {formatted_content}
-            </div>
-        </div>
-        """
-
-    section_path = chunk.get('section_path', '')
-    section_id = chunk.get('section_id', 'unknown')
-    doc_type = chunk.get('type', 'text')
-
-            image_num = f"№{image_num}"
-        return f"рисунок {image_num}"
-
-    return section_id
-
-    return
-
-def
-    section_text = chunk.get('section_text', '')
-    parent_section = chunk.get('parent_section', '')
-    parent_title = chunk.get('parent_title', '')
-    level = chunk.get('level', '')
-    chunk_text = chunk.get('chunk_text', '')
-    doc_type = chunk.get('type', 'text')
-
-    current_section = section_path if section_path else section_id
-    clean_text = chunk_text
-    if section_text and chunk_text.startswith(section_text):
-        section_title = section_text
-    elif chunk_text.startswith(f"{current_section} "):
-        clean_text = chunk_text[len(f"{current_section} "):].strip()
-        section_title = section_text if section_text else f"{current_section} {clean_text.split('.')[0] if '.' in clean_text else clean_text[:50]}"
-    else:
-        section_title = section_text if section_text else current_section
-
-    return f"В разделе {current_section} в документе {document_id}, пункт {section_title}: {clean_text}"
-
-def
-    from config import CHUNK_SIZE, CHUNK_OVERLAP
-    from llama_index.core.text_splitter import TokenTextSplitter
-
-    embed_model = get_embedding_model()
-    llm = get_llm_model(DEFAULT_MODEL)
-    reranker = get_reranker_model()
-        chunk_size=CHUNK_SIZE,
-        chunk_overlap=CHUNK_OVERLAP,
-        separator=" ",
-        backup_separators=["\n", ".", "!", "?"]
-    )
-
-    log_message(f"Configured chunk overlap: {CHUNK_OVERLAP} tokens")
-
-    all_documents = []
-    chunks_df = None
-    chunk_info = []
-
-    if chunks_filename:
-        log_message("Загружаем данные из CSV")
-        csv_documents, chunks_df = load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir)
-        all_documents.extend(csv_documents)
-
-    log_message("Добавляю табличные данные")
-    table_documents = load_table_data(repo_id, hf_token, table_data_dir)
-    log_message(f"Загружено {len(table_documents)} табличных документов")
-
-    # Process table documents through chunking
-    chunked_table_docs, table_chunk_info = process_documents_with_chunking(table_documents)
-    all_documents.extend(chunked_table_docs)
-    chunk_info.extend(table_chunk_info)
-
-    chunk_info.extend(image_chunk_info)
-
-    from llama_index.core import Settings
-    from index_retriever import create_query_engine
-
-    try:
-        log_message(f"Переключение на модель: {model_name}")
-
-        new_llm = get_llm_model(model_name)
-        Settings.llm = new_llm
-
-        if vector_index is not None:
-            new_query_engine = create_query_engine(vector_index)
-            log_message(f"Модель успешно переключена на: {model_name}")
-            return new_query_engine, f"✅ Модель переключена на: {model_name}"
-        else:
-            return None, "❌ Ошибка: система не инициализирована"
-
-    except Exception as e:
-        error_msg = f"Ошибка переключения модели: {str(e)}"
-        log_message(error_msg)
-        return None, f"❌ {error_msg}"
-
-def main_answer_question(question):
-    global query_engine, reranker, current_model, chunks_df
-    if not question.strip():
-        return ("<div style='color: black;'>Пожалуйста, введите вопрос</div>",
-                "<div style='color: black;'>Источники появятся после обработки запроса</div>",
-                "<div style='color: black;'>Чанки появятся после обработки запроса</div>")
-
-    try:
-        # Call the answer_question function which returns 3 values
-        answer_html, sources_html, chunks_html = answer_question(question, query_engine, reranker, current_model, chunks_df)
-        return answer_html, sources_html, chunks_html
-
-    except Exception as e:
-        log_message(f"Ошибка при ответе на вопрос: {str(e)}")
-        return (f"<div style='color: red;'>Ошибка: {str(e)}</div>",
-                "<div style='color: black;'>Источники недоступны из-за ошибки</div>",
-                "<div style='color: black;'>Чанки недоступны из-за ошибки</div>")
-
-def create_demo_interface(answer_question_func, switch_model_func, current_model, chunk_info=None):
-    with gr.Blocks(title="AIEXP - AI Expert для нормативной документации", theme=gr.themes.Soft()) as demo:
-        gr.Markdown("""
-        # AIEXP - Artificial Intelligence Expert
-
-        ## Инструмент для работы с нормативной документацией
-        """)
-
-        with gr.Tab("Поиск по нормативным документам"):
-            gr.Markdown("### Задайте вопрос по нормативной документации")
-
-            with gr.Row():
-                with gr.Column(scale=2):
-                    model_dropdown = gr.Dropdown(
-                        choices=list(AVAILABLE_MODELS.keys()),
-                        value=current_model,
-                        label="Выберите языковую модель",
-                        info="Выберите модель для генерации ответов"
-                    )
-                with gr.Column(scale=1):
-                    switch_btn = gr.Button("Переключить модель", variant="secondary")
-                    model_status = gr.Textbox(
-                        value=f"Текущая модель: {current_model}",
-                        label="Статус модели",
-                        interactive=False
-                    )
-
-            with gr.Row():
-                with gr.Column(scale=3):
-                    question_input = gr.Textbox(
-                        label="Ваш вопрос к базе знаний",
-                        placeholder="Введите вопрос по нормативным документам...",
-                        lines=3
-                    )
-                    ask_btn = gr.Button("Найти ответ", variant="primary", size="lg")
-
-            gr.Examples(
-                examples=[
-                    "О чем этот рисунок: ГОСТ Р 50.04.07-2022 Приложение Л. Л.1.5 Рисунок Л.2",
-                    "Л.9 Формула в ГОСТ Р 50.04.07 - 2022 что и о чем там?",
-                    "Какой стандарт устанавливает порядок признания протоколов испытаний продукции в области использования атомной энергии?",
-                    "Кто несет ответственность за организацию и проведение признания протоколов испытаний продукции?",
-                    "В каких случаях могут быть признаны протоколы испытаний, проведенные лабораториями?",
-                    "В какой таблице можно найти информацию о методах исследований при аттестационных испытаниях технологии термической обработки заготовок из легированных сталей? Какой документ и какой раздел?"
-                ],
-                inputs=question_input
-            )
-
-            with gr.Row():
-                with gr.Column(scale=2):
-                    answer_output = gr.HTML(
-                        label="",
-                        value=f"<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появится ответ на ваш вопрос...<br><small>Текущая модель: {current_model}</small></div>",
-                    )
-
-                with gr.Column(scale=1):
-                    sources_output = gr.HTML(
-                        label="",
-                        value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся релевантные чанки...</div>",
-                    )
-
-                with gr.Column(scale=1):
-                    chunks_output = gr.HTML(
-                        label="Релевантные чанки",
-                        value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся релевантные чанки...</div>",
-                    )
-
-            switch_btn.click(
-                fn=switch_model_func,
-                inputs=[model_dropdown],
-                outputs=[model_status]
-            )
-
-            ask_btn.click(
-                fn=answer_question_func,
-                inputs=[question_input],
-                outputs=[answer_output, sources_output, chunks_output]
-            )
-
-            question_input.submit(
-                fn=answer_question_func,
-                inputs=[question_input],
-                outputs=[answer_output, sources_output, chunks_output]
-            )
     return demo
-
-query_engine = None
-chunks_df = None
-reranker = None
-vector_index = None
-current_model = DEFAULT_MODEL
-
-def main_answer_question(question):
-    global query_engine, reranker, current_model, chunks_df
-    answer_html, sources_html, chunks_html = answer_question(
-        question, query_engine, reranker, current_model, chunks_df
-    )
-    return answer_html, sources_html, chunks_html
-
-def main_switch_model(model_name):
-    global query_engine, vector_index, current_model
-
-    new_query_engine, status_message = switch_model(model_name, vector_index)
-    if new_query_engine:
-        query_engine = new_query_engine
-        current_model = model_name
-
-    return status_message
-
-def main():
-    global query_engine, chunks_df, reranker, vector_index, current_model
-
-    log_message("Запуск AIEXP - AI Expert для нормативной документации")
-
-    query_engine, chunks_df, reranker, vector_index, chunk_info = initialize_system(
-        repo_id=HF_REPO_ID,
-        hf_token=HF_TOKEN,
-        download_dir=DOWNLOAD_DIR,
-        json_files_dir=JSON_FILES_DIR,
-        table_data_dir=TABLE_DATA_DIR,
-        image_data_dir=IMAGE_DATA_DIR,
-        use_json_instead_csv=True,
-    )
-
-    if query_engine:
-        log_message("Запуск веб-интерфейса")
-        demo = create_demo_interface(
-            answer_question_func=main_answer_question,
-            switch_model_func=main_switch_model,
-            current_model=current_model,
-            chunk_info=chunk_info
-        )
-        demo.launch(
-            server_name="0.0.0.0",
-            server_port=7860,
-            share=True,
-            debug=False
-        )
-    else:
-        log_message("Невозможно запустить приложение из-за ошибки инициализации")
-        sys.exit(1)
-
 if __name__ == "__main__":
-
 import gradio as gr
 from llama_index.core import Settings
+from documents_prep import load_all_documents
 from index_retriever import create_vector_index, create_query_engine
+from utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
+from my_logging import log_message
+from config import *
+
+# Global state
+query_engine = None
+reranker = None
+
+def initialize_system():
+    """Initialize RAG system"""
+    global query_engine, reranker
+
+    log_message("="*60)
+    log_message("INITIALIZING SYSTEM")
+    log_message("="*60)
+
+    # Setup models
+    llm = get_llm_model(GOOGLE_API_KEY)
+    embed_model = get_embedding_model()
+    reranker = get_reranker_model()
+
+    Settings.llm = llm
+    Settings.embed_model = embed_model
+
+    log_message("✓ Models loaded")
+
+    # Load documents
+    documents = load_all_documents(
+        repo_id=HF_REPO_ID,
+        hf_token=HF_TOKEN,
+        json_dir=JSON_FILES_DIR,
+        table_dir=TABLE_DATA_DIR,
+        image_dir=IMAGE_DATA_DIR
+    )
+
+    # Create index
+    vector_index = create_vector_index(documents)
+    query_engine = create_query_engine(vector_index)
+
+    log_message("="*60)
+    log_message("SYSTEM READY")
+    log_message("="*60)
+
+    return "✅ System initialized"
+
+def ask_question(question):
+    """Handle question from UI"""
+    if not question.strip():
+        return "Пожалуйста, введите вопрос", ""
+
+    if query_engine is None:
+        return "❌ Система не инициализирована", ""
+
+    answer, sources = answer_question(question, query_engine, reranker)
+
+    return answer, sources
+
+def create_interface():
+    """Create Gradio UI"""
+    with gr.Blocks(title="AIEXP - RAG System", theme=gr.themes.Soft()) as demo:
+        gr.Markdown("""
+        # AIEXP - AI Expert для нормативной документации
+        ## Упрощенная версия RAG системы
+        """)
+
+        with gr.Row():
+            init_btn = gr.Button("Инициализировать систему", variant="primary")
+            status = gr.Textbox(label="Статус", value="Не инициализирована")
+
+        gr.Markdown("### Задайте вопрос")
+
+        with gr.Row():
+            question = gr.Textbox(
+                label="Ваш вопрос",
+                placeholder="Введите вопрос...",
+                lines=3
+            )
+
+        ask_btn = gr.Button("Найти ответ", variant="primary")
+
+        gr.Examples(
+            examples=[
+                "О чем таблица А.12 в ГОСТ Р 59023.4-2020?",
+                "Какая температура подогрева для стали 20 толщиной до 100 мм?",
+                "Что показано на рисунке Л.2 в ГОСТ Р 50.04.07-2022?"
+            ],
+            inputs=question
+        )
+
+        with gr.Row():
+            answer = gr.Textbox(
+                label="Ответ",
+                lines=10
+            )
+            sources = gr.Textbox(
+                label="Источники",
+                lines=10
+            )
+
+        # Event handlers
+        init_btn.click(
+            fn=initialize_system,
+            outputs=status
+        )
+
+        ask_btn.click(
+            fn=ask_question,
+            inputs=question,
+            outputs=[answer, sources]
+        )
+
+        question.submit(
+            fn=ask_question,
+            inputs=question,
+            outputs=[answer, sources]
+        )
+
     return demo

 if __name__ == "__main__":
+    demo = create_interface()
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=True
+    )
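Note on configuration: the new app.py pulls all settings in with `from config import *`, but config.py itself is not part of this diff. A minimal sketch of the names the new code relies on (GOOGLE_API_KEY, HF_TOKEN, HF_REPO_ID, JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR) might look like the following; every value here is a placeholder assumption, not the committed file:

    # config.py (hypothetical sketch; only names referenced by the new app.py)
    import os

    # Secrets presumably come from Space environment variables
    GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
    HF_TOKEN = os.environ.get("HF_TOKEN", "")

    # Dataset layout: placeholder values, the actual repo paths may differ
    HF_REPO_ID = "user/normative-docs"   # hypothetical dataset id
    JSON_FILES_DIR = "json_files"        # text-section JSON files
    TABLE_DATA_DIR = "table_data"        # table JSON files
    IMAGE_DATA_DIR = "image_data"        # image-description CSVs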
documents_prep.py
CHANGED
@@ -1,575 +1,255 @@
 import json
-import zipfile
 import pandas as pd
 from huggingface_hub import hf_hub_download, list_repo_files
 from llama_index.core import Document
-from my_logging import log_message
 from llama_index.core.text_splitter import SentenceSplitter
-from
-from table_prep import table_to_document
-
-def
-    """
-    Universal chunking for text and images.
-    Tables use their own row-block chunking.
-    """
-    if chunk_size is None:
-        chunk_size = CHUNK_SIZE
-    if chunk_overlap is None:
-        chunk_overlap = CHUNK_OVERLAP
-
-    # Use sentence-aware splitting
     text_splitter = SentenceSplitter(
-        chunk_size=
-        chunk_overlap=
-        separator=" "
     )
-
-        chunked_doc = Document(
-            text=chunk_text,
-            metadata=chunk_metadata
-        )
-        chunked_docs.append(chunked_doc)
-
-    return chunked_docs

-def
-    for doc in documents:
-        doc_type = doc.metadata.get('type', 'text')
-        is_already_chunked = doc.metadata.get('is_chunked', False)
-
-        # Tables: already chunked in table_prep.py if needed
-        if doc_type == 'table':
-            if is_already_chunked:
-                stats['table_chunks'] += 1
-            else:
-                stats['table_whole'] += 1
-            all_chunked_docs.append(doc)
-
-        # Images: chunk if too large
-        elif doc_type == 'image':
-            doc_size = len(doc.text)
-            if doc_size > CHUNK_SIZE:
-                log_message(f"📷 CHUNKING: Изображение {doc.metadata.get('image_number')} | {doc_size} > {CHUNK_SIZE}")
-                chunked_docs = chunk_document(doc)
-                stats['image_chunks'] += len(chunked_docs)
-                all_chunked_docs.extend(chunked_docs)
-            else:
-                stats['image_whole'] += 1
-                all_chunked_docs.append(doc)
-
-        # Text: chunk if too large
-        else:
-            doc_size = len(doc.text)
-            if doc_size > CHUNK_SIZE:
-                log_message(f"📝 CHUNKING: Текст '{doc.metadata.get('document_id')}' | {doc_size} > {CHUNK_SIZE}")
-                chunked_docs = chunk_document(doc)
-                stats['text_chunks'] += len(chunked_docs)
-                all_chunked_docs.extend(chunked_docs)
-            else:
-                all_chunked_docs.append(doc)
-
-    log_message(f"\n{'='*60}")
-    log_message(f"СТАТИСТИКА ОБРАБОТКИ:")
-    log_message(f"  • Таблицы (целые): {stats['table_whole']}")
-    log_message(f"  • Таблицы (чанки): {stats['table_chunks']}")
-    log_message(f"  • Изображения (целые): {stats['image_whole']}")
-    log_message(f"  • Изображения (чанки): {stats['image_chunks']}")
-    log_message(f"  • Текстовые чанки: {stats['text_chunks']}")
-    log_message(f"  • ВСЕГО: {len(all_chunked_docs)}")
-    log_message(f"{'='*60}\n")

-def
-        doc = Document(
-            text=section_text,
-            metadata={
-                "type": "text",
-                "document_id": document_id,
-                "document_name": document_name,
-                "section_id": section_id,
-                "section_text": section_title[:200],
-                "section_path": section_path,
-                "level": "section"
-            }
-        )
-        documents.append(doc)
-
-                "document_name": document_name,
-                "section_id": subsection_id,
-                "section_text": subsection_title[:200],
-                "section_path": subsection_path,
-                "level": "subsection",
-                "parent_section": section_id,
-                "parent_title": section_title[:100]
-            }
-        )
-        documents.append(doc)
-
-        if 'sub_subsections' in subsection:
-            for sub_subsection in subsection['sub_subsections']:
-                sub_subsection_id = sub_subsection.get('sub_subsection_id', 'Unknown')
-                sub_subsection_text = sub_subsection.get('sub_subsection_text', '')
-                sub_subsection_title = extract_section_title(sub_subsection_text)
-                sub_subsection_path = f"{subsection_path}.{sub_subsection_id}"
-
-                if sub_subsection_text.strip():
-                    doc = Document(
-                        text=sub_subsection_text,
-                        metadata={
-                            "type": "text",
-                            "document_id": document_id,
-                            "document_name": document_name,
-                            "section_id": sub_subsection_id,
-                            "section_text": sub_subsection_title[:200],
-                            "section_path": sub_subsection_path,
-                            "level": "sub_subsection",
-                            "parent_section": subsection_id,
-                            "parent_title": subsection_title[:100]
-                        }
-                    )
-                    documents.append(doc)
-
-                if 'sub_sub_subsections' in sub_subsection:
-                    for sub_sub_subsection in sub_subsection['sub_sub_subsections']:
-                        sub_sub_subsection_id = sub_sub_subsection.get('sub_sub_subsection_id', 'Unknown')
-                        sub_sub_subsection_text = sub_sub_subsection.get('sub_sub_subsection_text', '')
-                        sub_sub_subsection_title = extract_section_title(sub_sub_subsection_text)
-
-                        if sub_sub_subsection_text.strip():
-                            doc = Document(
-                                text=sub_sub_subsection_text,
-                                metadata={
-                                    "type": "text",
-                                    "document_id": document_id,
-                                    "document_name": document_name,
-                                    "section_id": sub_sub_subsection_id,
-                                    "section_text": sub_sub_subsection_title[:200],
-                                    "section_path": f"{sub_subsection_path}.{sub_sub_subsection_id}",
-                                    "level": "sub_sub_subsection",
-                                    "parent_section": sub_subsection_id,
-                                    "parent_title": sub_subsection_title[:100]
-                                }
-                            )
-                            documents.append(doc)
     return documents

-def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
-    log_message("Начинаю загрузку JSON документов")
-
-    try:
-        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
-        zip_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.zip')]
-        json_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.json')]
-
-        log_message(f"Найдено {len(zip_files)} ZIP файлов и {len(json_files)} прямых JSON файлов")
-
-        all_documents = []
-
-        for zip_file_path in zip_files:
-            try:
-                log_message(f"Загружаю ZIP архив: {zip_file_path}")
-                local_zip_path = hf_hub_download(
-                    repo_id=repo_id,
-                    filename=zip_file_path,
-                    local_dir=download_dir,
-                    repo_type="dataset",
-                    token=hf_token
-                )
-
-                documents = extract_zip_and_process_json(local_zip_path)
-                all_documents.extend(documents)
-                log_message(f"Извлечено {len(documents)} документов из ZIP архива {zip_file_path}")
-
-            except Exception as e:
-                log_message(f"Ошибка обработки ZIP файла {zip_file_path}: {str(e)}")
-                continue
-
-        for file_path in json_files:
-            try:
-                log_message(f"Обрабатываю прямой JSON файл: {file_path}")
-                local_path = hf_hub_download(
-                    repo_id=repo_id,
-                    filename=file_path,
-                    local_dir=download_dir,
-                    repo_type="dataset",
-                    token=hf_token
-                )
-
-                with open(local_path, 'r', encoding='utf-8') as f:
-                    json_data = json.load(f)
-
-                document_metadata = json_data.get('document_metadata', {})
-                document_id = document_metadata.get('document_id', 'unknown')
-                document_name = document_metadata.get('document_name', 'unknown')
-
-                documents = extract_text_from_json(json_data, document_id, document_name)
-                all_documents.extend(documents)
-
-                log_message(f"Извлечено {len(documents)} документов из {file_path}")
-
-            except Exception as e:
-                log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
-                continue
-
-        log_message(f"Всего создано {len(all_documents)} исходных документов из JSON файлов")
-
-        # Process documents through chunking function
-        chunked_documents, chunk_info = process_documents_with_chunking(all_documents)
-
-        log_message(f"После chunking получено {len(chunked_documents)} чанков из JSON данных")
-
-        return chunked_documents, chunk_info
-
-    except Exception as e:
-        log_message(f"Ошибка загрузки JSON документов: {str(e)}")
-        return [], []

-def
-    if len(sentences) > 1:
-        return sentences[0].strip()
-    return first_line[:100] + "..." if len(first_line) > 100 else first_line
-
-def extract_zip_and_process_json(zip_path):
     documents = []
-    for
-                continue
-
-    except Exception as e:
-        log_message(f"Ошибка извлечения ZIP архива {zip_path}: {str(e)}")
     return documents

-def load_image_data(repo_id, hf_token, image_data_dir):
-    log_message("Начинаю загрузку данных изображений")
-
-    image_files = []
-    try:
-        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
-        for file in files:
-            if file.startswith(image_data_dir) and file.endswith('.csv'):
-                image_files.append(file)
-
-        log_message(f"Найдено {len(image_files)} CSV файлов с изображениями")
-
-        image_documents = []
-        for file_path in image_files:
-            try:
-                log_message(f"Обрабатываю файл изображений: {file_path}")
-                local_path = hf_hub_download(
-                    repo_id=repo_id,
-                    filename=file_path,
-                    local_dir='',
-                    repo_type="dataset",
-                    token=hf_token
-                )
-
-                df = pd.read_csv(local_path)
-                log_message(f"Загружено {len(df)} записей изображений из файла {file_path}")
-
-                for _, row in df.iterrows():
-                    section_value = row.get('Раздел документа', 'Неизвестно')
-                    image_num = str(row.get('№ Изображения', 'Неизвестно'))
-                    image_title = str(row.get('Название изображения', 'Неизвестно'))
-                    image_desc = str(row.get('Описание изображение', 'Неизвестно'))
-                    doc_id = str(row.get('Обозначение документа', 'Неизвестно'))
-                    file_name = str(row.get('Файл изображения', 'Неизвестно'))
-
-                    # FIXED: Create structured, searchable content
-                    content = f"=== ИЗОБРАЖЕНИЕ ===\n"
-                    content += f"Документ: {doc_id}\n"
-                    content += f"Стандарт: {doc_id}\n"
-                    content += f"Раздел: {section_value}\n"
-                    content += f"Изображение: {image_num}\n"
-                    content += f"Название: {image_title}\n"
-                    content += f"Описание: {image_desc}\n"
-                    content += f"Файл: {file_name}\n"
-                    content += f"Уникальный ID: {doc_id} | {section_value} | {image_num}\n"
-                    content += f"===================\n\n"
-
-                    # Add contextual information for better retrieval
-                    content += f"Это изображение {image_num} из документа {doc_id}, "
-                    content += f"расположенное в разделе '{section_value}'. "
-                    content += f"{image_title}. {image_desc}"
-
-                    doc = Document(
-                        text=content,
-                        metadata={
-                            "type": "image",
-                            "image_number": image_num,
-                            "image_title": image_title,
-                            "image_description": image_desc,
-                            "document_id": doc_id,
-                            "file_path": file_name,
-                            "section": section_value,
-                            "section_id": section_value,
-                            "full_image_id": f"{doc_id} | {section_value} | {image_num}"
-                        }
-                    )
-                    image_documents.append(doc)
-
-            except Exception as e:
-                log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
-                continue
-
-        log_message(f"Создано {len(image_documents)} документов из изображений")
-        return image_documents
-
-    except Exception as e:
-        log_message(f"Ошибка загрузки данных изображений: {str(e)}")
-        return []

-def
-    """
-    log_message("="
-    log_message("
-    log_message("="
-
-    from collections import defaultdict
-
-    files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
-    table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
-
-    log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
-
-    table_documents = []
-    stats = {
-        'total_tables': 0,
-        'total_size': 0,
-        'by_document': defaultdict(lambda: {'count': 0, 'size': 0}),
-        'by_sheet': defaultdict(int)
-    }
-
-    for file_path in table_files:
-        try:
-            local_path = hf_hub_download(
-                repo_id=repo_id,
-                filename=file_path,
-                local_dir='',
-                repo_type="dataset",
-                token=hf_token
-            )
-
-            log_message(f"\n📂 Обработка файла: {file_path}")
-
-            with open(local_path, 'r', encoding='utf-8') as f:
-                table_data = json.load(f)
-
-            if isinstance(table_data, dict):
-                file_level_doc_id = (
-                    table_data.get('document_id') or
-                    table_data.get('document') or
-                    'unknown'
-                )
-
-                if 'sheets' in table_data:
-                    sorted_sheets = sorted(
-                        table_data['sheets'],
-                        key=lambda sheet: sheet.get('table_number', '')
-                    )
-
-                    log_message(f"  Найдено листов: {len(sorted_sheets)}")
-
-                    for sheet in sorted_sheets:
-                        # CRITICAL: sheet_name MUST be present
-                        if 'sheet_name' not in sheet:
-                            log_message(f"  ⚠️ Пропущен лист без sheet_name")
-                            continue
-
-                        sheet_name = sheet['sheet_name']
-                        sheet_doc_id = sheet.get('document_id', file_level_doc_id)
-
-                        log_message(f"  → Лист: {sheet_name} | doc_id: {sheet_doc_id}")
-
-                        # Pass complete sheet data to table_to_document
-                        docs_list = table_to_document(sheet, document_id=sheet_doc_id)
-                        table_documents.extend(docs_list)
-
-                        stats['by_sheet'][sheet_name] += len(docs_list)
-
-                        for doc in docs_list:
-                            stats['total_tables'] += 1
-                            size = doc.metadata.get('content_size', 0)
-                            stats['total_size'] += size
-                            stats['by_document'][sheet_doc_id]['count'] += 1
-                            stats['by_document'][sheet_doc_id]['size'] += size
-                else:
-                    # Single table (no sheets structure)
-                    docs_list = table_to_document(table_data, document_id=file_level_doc_id)
-                    table_documents.extend(docs_list)
-
-                    for doc in docs_list:
-                        stats['total_tables'] += 1
-                        size = doc.metadata.get('content_size', 0)
-                        stats['total_size'] += size
-                        stats['by_document'][file_level_doc_id]['count'] += 1
-                        stats['by_document'][file_level_doc_id]['size'] += size
-
-        except Exception as e:
-            log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
-            import traceback
-            log_message(f"Traceback: {traceback.format_exc()}")
-            continue
-
-    # Enhanced logging with sheet breakdown
-    log_message("\n" + "=" * 60)
-    log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
-    log_message("=" * 60)
-    log_message(f"Всего таблиц/чанков: {stats['total_tables']}")
-    log_message(f"Общий размер: {stats['total_size']:,} символов")
-    if stats['total_tables'] > 0:
-        log_message(f"Средний размер: {stats['total_size'] // stats['total_tables']:,} символов")
-
-    log_message("\nПо документам:")
-    for doc_id, doc_stats in sorted(stats['by_document'].items()):
-        log_message(f"  • {doc_id}: {doc_stats['count']} элементов, {doc_stats['size']:,} символов")
-
-    log_message("\nПо листам (топ-20):")
-    top_sheets = sorted(stats['by_sheet'].items(), key=lambda x: x[1], reverse=True)[:20]
-    for sheet_name, count in top_sheets:
-        log_message(f"  • {sheet_name}: {count} чанков")
-
-    log_message("=" * 60)
-
-    return table_documents
-
-    except Exception as e:
-        log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА: {str(e)}")
-        import traceback
-        log_message(f"Traceback: {traceback.format_exc()}")
-        return []

-def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
-    log_message("Загружаю данные чанков из CSV")
-            break
-
-    if text_column is None:
-        text_column = chunks_df.columns[0]
-
-    log_message(f"Использую колонку: {text_column}")
-
-    documents = []
-    for i, (_, row) in enumerate(chunks_df.iterrows()):
-        doc = Document(
-            text=str(row[text_column]),
-            metadata={
-                "chunk_id": row.get('chunk_id', i),
-                "document_id": row.get('document_id', 'unknown'),
-                "type": "text"
-            }
-        )
-        documents.append(doc)
-
-    log_message(f"Создано {len(documents)} текстовых документов из CSV")
-    return documents, chunks_df
-
-    except Exception as e:
-        log_message(f"Ошибка загрузки CSV данных: {str(e)}")
-        return [], None
 import json
 import pandas as pd
 from huggingface_hub import hf_hub_download, list_repo_files
 from llama_index.core import Document
 from llama_index.core.text_splitter import SentenceSplitter
+from my_logging import log_message
+
+# Configuration
+CHUNK_SIZE = 512
+CHUNK_OVERLAP = 50
+
+def chunk_text_documents(documents):
+    """Simple text chunking with sentence awareness"""
     text_splitter = SentenceSplitter(
+        chunk_size=CHUNK_SIZE,
+        chunk_overlap=CHUNK_OVERLAP
     )
+
+    chunked = []
+    for doc in documents:
+        chunks = text_splitter.get_nodes_from_documents([doc])
+        for i, chunk in enumerate(chunks):
+            chunk.metadata.update({
+                'chunk_id': i,
+                'total_chunks': len(chunks)
+            })
+            chunked.append(chunk)
+
+    log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
+    return chunked
+
+def chunk_table_by_rows(table_data, doc_id, max_rows=30):
+    """Split large tables into row blocks"""
+    headers = table_data.get('headers', [])
+    rows = table_data.get('data', [])
+    table_num = table_data.get('table_number', 'unknown')
+    table_title = table_data.get('table_title', '')
+    section = table_data.get('section', '')
+
+    if not rows:
+        return []
+
+    # Small table: keep whole
+    if len(rows) <= max_rows:
+        content = format_table_content(table_data, headers, rows)
+        return [Document(
+            text=content,
+            metadata={
+                'type': 'table',
+                'document_id': doc_id,
+                'table_number': table_num,
+                'table_title': table_title,
+                'section': section,
+                'total_rows': len(rows)
+            }
+        )]
+
+    # Large table: split by row blocks
+    chunks = []
+    for i in range(0, len(rows), max_rows):
+        chunk_rows = rows[i:i+max_rows]
+        content = format_table_content(table_data, headers, chunk_rows,
+                                       chunk_info=f"Rows {i+1}-{i+len(chunk_rows)}")
+
+        chunks.append(Document(
+            text=content,
+            metadata={
+                'type': 'table',
+                'document_id': doc_id,
+                'table_number': table_num,
+                'table_title': table_title,
+                'section': section,
+                'chunk_id': i // max_rows,
+                'row_start': i,
+                'row_end': i + len(chunk_rows),
+                'total_rows': len(rows)
+            }
+        ))
+
+    log_message(f"  📊 Table {table_num}: {len(rows)} rows → {len(chunks)} chunks")
+    return chunks

+def format_table_content(table_data, headers, rows, chunk_info=""):
+    """Format table for semantic search"""
+    doc_id = table_data.get('document_id', 'unknown')
+    table_num = table_data.get('table_number', 'unknown')
+    table_title = table_data.get('table_title', '')
+    section = table_data.get('section', '')
+
+    content = f"Документ: {doc_id}\n"
+    content += f"Таблица: {table_num}\n"
+    if table_title:
+        content += f"Название: {table_title}\n"
+    if section:
+        content += f"Раздел: {section}\n"
+    if chunk_info:
+        content += f"{chunk_info}\n"
+    content += f"\nКолонки: {' | '.join(str(h) for h in headers)}\n\n"
+
+    # Add rows
+    for row in rows:
+        if isinstance(row, dict):
+            parts = [f"{k}: {v}" for k, v in row.items()
+                     if v and str(v).strip() and str(v) != 'nan']
+            content += ' | '.join(parts) + "\n"
+        elif isinstance(row, list):
+            parts = [str(v) for v in row if v and str(v).strip() and str(v) != 'nan']
+            content += ' | '.join(parts) + "\n"
+
+    return content
+
+def load_json_documents(repo_id, hf_token, json_dir):
+    """Load text sections from JSON"""
+    log_message("Loading JSON documents...")
+
+    files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
+    json_files = [f for f in files if f.startswith(json_dir) and f.endswith('.json')]
+
+    documents = []
+    for file_path in json_files:
+        try:
+            local_path = hf_hub_download(
+                repo_id=repo_id,
+                filename=file_path,
+                repo_type="dataset",
+                token=hf_token
+            )
+
+            with open(local_path, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+
+            doc_id = data.get('document_metadata', {}).get('document_id', 'unknown')
+
+            # Extract sections
+            for section in data.get('sections', []):
+                if section.get('section_text', '').strip():
+                    documents.append(Document(
+                        text=section['section_text'],
+                        metadata={
+                            'type': 'text',
+                            'document_id': doc_id,
+                            'section_id': section.get('section_id', '')
+                        }
+                    ))
+        except Exception as e:
+            log_message(f"Error loading {file_path}: {e}")
+
+    log_message(f"✓ Loaded {len(documents)} text sections")
     return documents

+def load_table_documents(repo_id, hf_token, table_dir):
+    """Load and chunk tables"""
+    log_message("Loading tables...")
+
+    files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
+    table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
+
+    all_chunks = []
+    for file_path in table_files:
+        try:
+            local_path = hf_hub_download(
+                repo_id=repo_id,
+                filename=file_path,
+                repo_type="dataset",
+                token=hf_token
+            )
+
+            with open(local_path, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+
+            doc_id = data.get('document_id', 'unknown')
+
+            for sheet in data.get('sheets', []):
+                chunks = chunk_table_by_rows(sheet, doc_id)
+                all_chunks.extend(chunks)
+
+        except Exception as e:
+            log_message(f"Error loading {file_path}: {e}")
+
+    log_message(f"✓ Loaded {len(all_chunks)} table chunks")
+    return all_chunks
+
+def load_image_documents(repo_id, hf_token, image_dir):
+    """Load image descriptions"""
+    log_message("Loading images...")
+
+    files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
+    csv_files = [f for f in files if f.startswith(image_dir) and f.endswith('.csv')]
+
     documents = []
+    for file_path in csv_files:
+        try:
+            local_path = hf_hub_download(
+                repo_id=repo_id,
+                filename=file_path,
+                repo_type="dataset",
+                token=hf_token
+            )
+
+            df = pd.read_csv(local_path)
+
+            for _, row in df.iterrows():
+                content = f"Документ: {row.get('Обозначение документа', 'unknown')}\n"
+                content += f"Рисунок: {row.get('№ Изображения', 'unknown')}\n"
+                content += f"Название: {row.get('Название изображения', '')}\n"
+                content += f"Описание: {row.get('Описание изображение', '')}\n"
+                content += f"Раздел: {row.get('Раздел документа', '')}\n"
+
+                documents.append(Document(
+                    text=content,
+                    metadata={
+                        'type': 'image',
+                        'document_id': str(row.get('Обозначение документа', 'unknown')),
+                        'image_number': str(row.get('№ Изображения', 'unknown')),
+                        'section': str(row.get('Раздел документа', ''))
+                    }
+                ))
+        except Exception as e:
+            log_message(f"Error loading {file_path}: {e}")
+
+    log_message(f"✓ Loaded {len(documents)} images")
     return documents

+def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
+    """Main loader - combines all document types"""
+    log_message("="*60)
+    log_message("STARTING DOCUMENT LOADING")
+    log_message("="*60)
+
+    # Load text sections
+    text_docs = load_json_documents(repo_id, hf_token, json_dir)
+    text_chunks = chunk_text_documents(text_docs)
+
+    # Load tables (already chunked)
+    table_chunks = load_table_documents(repo_id, hf_token, table_dir)
+
+    # Load images (no chunking needed)
+    image_docs = load_image_documents(repo_id, hf_token, image_dir)
+
+    all_docs = text_chunks + table_chunks + image_docs
+
+    log_message("="*60)
+    log_message(f"TOTAL DOCUMENTS: {len(all_docs)}")
+    log_message(f"  Text chunks: {len(text_chunks)}")
+    log_message(f"  Table chunks: {len(table_chunks)}")
+    log_message(f"  Images: {len(image_docs)}")
+    log_message("="*60)
+
+    return all_docs
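The loaders above imply a specific layout for the dataset files, though the schemas themselves are not in this commit. From the fields the code reads, the inputs plausibly look like the following sketch; all concrete values are invented for illustration:

    # Hypothetical examples of the inputs documents_prep.py expects.

    # A sections JSON file (read by load_json_documents):
    sections_json = {
        "document_metadata": {"document_id": "ГОСТ Р 50.04.07-2022"},
        "sections": [
            {"section_id": "5.1", "section_text": "Текст раздела 5.1 ..."}
        ]
    }

    # A tables JSON file (read by load_table_documents and split by chunk_table_by_rows):
    tables_json = {
        "document_id": "ГОСТ Р 59023.4-2020",
        "sheets": [
            {
                "table_number": "А.12",
                "table_title": "Методы исследований",
                "section": "Приложение А",
                "headers": ["Параметр", "Значение"],
                "data": [["Температура", "250 °C"]]  # more than 30 rows would be split into row blocks
            }
        ]
    }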
index_retriever.py
CHANGED
@@ -1,123 +1,64 @@
-from llama_index.core import VectorStoreIndex
 from llama_index.core.query_engine import RetrieverQueryEngine
 from llama_index.core.retrievers import VectorIndexRetriever
-from llama_index.core.response_synthesizers import get_response_synthesizer, ResponseMode
-from llama_index.core.prompts import PromptTemplate
 from llama_index.retrievers.bm25 import BM25Retriever
 from llama_index.core.retrievers import QueryFusionRetriever
 from my_logging import log_message
-
 def create_vector_index(documents):

 def create_query_engine(vector_index):
-        return query_engine
-
-    except Exception as e:
-        log_message(f"Ошибка создания query engine: {str(e)}")
-        raise
-
-def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.45, diversity_penalty=0.2):
-    """Rerank with better handling of specific technical queries"""
-    if not nodes or not reranker:
-        return nodes[:top_k]
-
-        pairs = [[query, node.text] for node in nodes]
-        scores = reranker.predict(pairs)
-        scored_nodes = list(zip(nodes, scores))
-
-        scored_nodes.sort(key=lambda x: x[1], reverse=True)
-
-        # Lower threshold for technical queries
-        if min_score_threshold is not None:
-            scored_nodes = [(node, score) for node, score in scored_nodes
-                            if score >= min_score_threshold]
-            log_message(f"После фильтрации (порог {min_score_threshold}): {len(scored_nodes)} узлов")
-
-        if not scored_nodes:
-            log_message("⚠️ Нет узлов после фильтрации, снижаю порог до 0.3")
-            scored_nodes = list(zip(nodes, scores))
-            scored_nodes.sort(key=lambda x: x[1], reverse=True)
-            min_score_threshold = max(0.3, scored_nodes[0][1] * 0.5)
-            scored_nodes = [(node, score) for node, score in scored_nodes
-                            if score >= min_score_threshold]
-
-        selected_nodes = []
-        selected_docs = {}  # Track count per document
-        selected_tables = set()
-
-        for node, score in scored_nodes:
-            if len(selected_nodes) >= top_k:
-                break
-
-            metadata = node.metadata if hasattr(node, 'metadata') else {}
-            doc_id = metadata.get('document_id', 'unknown')
-            node_type = metadata.get('type', 'text')
-
-            # Track table uniqueness
-            if node_type == 'table':
-                table_id = metadata.get('full_table_id', '')
-                if table_id in selected_tables:
-                    continue  # Skip duplicate table chunks
-                selected_tables.add(table_id)
-
-            # Apply lighter diversity penalty
-            penalty = 0
-            doc_count = selected_docs.get(doc_id, 0)
-            if doc_count > 0:
-                penalty = min(diversity_penalty * doc_count, 0.5)
-
-            adjusted_score = score * (1 - penalty)
-
-            # Accept if competitive
-            if not selected_nodes or adjusted_score >= selected_nodes[0][1] * 0.5:
-                selected_nodes.append((node, score))
-                selected_docs[doc_id] = doc_count + 1
-
-        log_message(f"✓ Выбрано {len(selected_nodes)} узлов")
-        log_message(f"  Уникальных документов: {len(selected_docs)}")
-        log_message(f"  Уникальных таблиц: {len(selected_tables)}")
-
-        if selected_nodes:
-            log_message(f"  Score: {selected_nodes[0][1]:.3f} → {selected_nodes[-1][1]:.3f}")
-
-        return [node for node, score in selected_nodes]
-
-    except Exception as e:
-        log_message(f"❌ Ошибка переранжировки: {str(e)}")
-        return nodes[:top_k]
+from llama_index.core import VectorStoreIndex
 from llama_index.core.query_engine import RetrieverQueryEngine
 from llama_index.core.retrievers import VectorIndexRetriever
 from llama_index.retrievers.bm25 import BM25Retriever
 from llama_index.core.retrievers import QueryFusionRetriever
+from llama_index.core.response_synthesizers import get_response_synthesizer
 from my_logging import log_message
+
+SIMPLE_PROMPT = """Вы - эксперт по нормативной документации.
+
+Контекст:
+{context_str}
+
+Вопрос: {query_str}
+
+Инструкция:
+1. Отвечайте ТОЛЬКО на основе предоставленного контекста
+2. Цитируйте конкретные источники (документ, раздел, таблицу)
+3. Если информации недостаточно, четко укажите это
+4. Будьте точны и конкретны
+
+Ответ:"""
 
 def create_vector_index(documents):
+    """Create vector index from documents"""
+    log_message(f"Building vector index from {len(documents)} documents...")
+    index = VectorStoreIndex.from_documents(documents)
+    log_message("✓ Index created")
+    return index
 
 def create_query_engine(vector_index):
+    """Create hybrid retrieval engine"""
+    log_message("Creating query engine...")
+
+    # Vector retriever
+    vector_retriever = VectorIndexRetriever(
+        index=vector_index,
+        similarity_top_k=30
+    )
+
+    # BM25 retriever
+    bm25_retriever = BM25Retriever.from_defaults(
+        docstore=vector_index.docstore,
+        similarity_top_k=30
+    )
+
+    # Hybrid fusion
+    hybrid_retriever = QueryFusionRetriever(
+        [vector_retriever, bm25_retriever],
+        similarity_top_k=40,
+        num_queries=1
+    )
+
+    # Response synthesizer
+    response_synthesizer = get_response_synthesizer()
+
+    # Query engine
+    query_engine = RetrieverQueryEngine(
+        retriever=hybrid_retriever,
+        response_synthesizer=response_synthesizer
+    )
 
+    log_message("✓ Query engine created")
+    return query_engine
utils.py
CHANGED
@@ -1,309 +1,113 @@
-import logging
-import sys
 from llama_index.llms.google_genai import GoogleGenAI
-from llama_index.llms.openai import OpenAI
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
 from sentence_transformers import CrossEncoder
-from config import AVAILABLE_MODELS, DEFAULT_MODEL, GOOGLE_API_KEY
-import time
-from index_retriever import rerank_nodes
 from my_logging import log_message
-from config import PROMPT_SIMPLE_POISK
 
-def get_llm_model(model_name):
-    try:
-        model_config = AVAILABLE_MODELS.get(model_name)
-        if not model_config:
-            log_message(f"Модель {model_name} не найдена, использую модель по умолчанию")
-            model_config = AVAILABLE_MODELS[DEFAULT_MODEL]
-
-        if not model_config.get("api_key"):
-            raise Exception(f"API ключ не найден для модели {model_name}")
-
-        if model_config["provider"] == "google":
-            # Fix: Remove image_config parameter or set it properly
-            return GoogleGenAI(
-                model=model_config["model_name"],
-                api_key=model_config["api_key"],
-                # Don't pass image_config=None
-            )
-        elif model_config["provider"] == "openai":
-            return OpenAI(
-                model=model_config["model_name"],
-                api_key=model_config["api_key"]
-            )
-        else:
-            raise Exception(f"Неподдерживаемый провайдер: {model_config['provider']}")
-
-    except Exception as e:
-        log_message(f"Ошибка создания модели {model_name}: {str(e)}")
-        # Fix: Also apply to fallback model
-        return GoogleGenAI(
-            model="gemini-2.0-flash",
-            api_key=GOOGLE_API_KEY
-        )
 
 def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
     return HuggingFaceEmbedding(model_name=model_name)
 
 def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
     return CrossEncoder(model_name)
 
-def format_context_for_llm(nodes):
-    context_parts = []
-
     for node in nodes:
-        metadata = node.metadata if hasattr(node, 'metadata') else {}
-        doc_id = metadata.get('document_id', 'unknown')
-
-        section_info = ""
-
-        # Handle section information with proper hierarchy
-        if metadata.get('section_path'):
-            section_path = metadata['section_path']
-            section_text = metadata.get('section_text', '')
-            parent_section = metadata.get('parent_section', '')
-            parent_title = metadata.get('parent_title', '')
-            level = metadata.get('level', '')
-
-            if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
-                # For subsections: раздел X (Title), пункт X.X
-                if section_text:
-                    section_info = f"раздел {parent_section} ({parent_title}), пункт {section_path} ({section_text})"
-                else:
-                    section_info = f"раздел {parent_section} ({parent_title}), пункт {section_path}"
-            elif section_text:
-                # For main sections: раздел X (Title)
-                section_info = f"раздел {section_path} ({section_text})"
-            else:
-                section_info = f"раздел {section_path}"
-
-        elif metadata.get('section_id'):
-            section_id = metadata['section_id']
-            section_text = metadata.get('section_text', '')
-            level = metadata.get('level', '')
-            parent_section = metadata.get('parent_section', '')
-            parent_title = metadata.get('parent_title', '')
-
-            if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
-                if section_text:
-                    section_info = f"раздел {parent_section} ({parent_title}), пункт {section_id} ({section_text})"
-                else:
-                    section_info = f"раздел {parent_section} ({parent_title}), пункт {section_id}"
-            elif section_text:
-                section_info = f"раздел {section_id} ({section_text})"
-            else:
-                section_info = f"раздел {section_id}"
-
-        # Override with table/image info if applicable
-        if metadata.get('type') == 'table' and metadata.get('table_number'):
-            table_num = metadata['table_number']
-            if not str(table_num).startswith('№'):
-                table_num = f"№{table_num}"
-            table_title = metadata.get('table_title', '')
-            # Include section context for tables
-            base_section = ""
-            if metadata.get('section_path'):
-                base_section = f", раздел {metadata['section_path']}"
-            elif metadata.get('section_id'):
-                base_section = f", раздел {metadata['section_id']}"
-
-            if table_title:
-                section_info = f"Таблица {table_num} ({table_title}){base_section}"
-            else:
-                section_info = f"Таблица {table_num}{base_section}"
-
-        if metadata.get('type') == 'image' and metadata.get('image_number'):
-            image_num = metadata['image_number']
-            if not str(image_num).startswith('№'):
-                image_num = f"№{image_num}"
-            image_title = metadata.get('image_title', '')
-            # Include section context for images
-            base_section = ""
-            if metadata.get('section_path'):
-                base_section = f", раздел {metadata['section_path']}"
-            elif metadata.get('section_id'):
-                base_section = f", раздел {metadata['section_id']}"
-
-            if image_title:
-                section_info = f"Рисунок {image_num} ({image_title}){base_section}"
-            else:
-                section_info = f"Рисунок {image_num}{base_section}"
-
-        context_text = node.text if hasattr(node, 'text') else str(node)
 
-        if section_info:
-            formatted_context = f"[ИСТОЧНИК: {section_info}, документ {doc_id}]\n{context_text}\n"
-        else:
-            formatted_context = f"[ИСТОЧНИК: документ {doc_id}]\n{context_text}\n"
-
-        context_parts.append(formatted_context)
-
-    return "\n".join(context_parts)
-
-
-def generate_sources_html(nodes, chunks_df=None):
-    html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
-    html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
-
-    sources_by_doc = {}
-
-    for i, node in enumerate(nodes):
-        metadata = node.metadata if hasattr(node, 'metadata') else {}
-        doc_type = metadata.get('type', 'text')
-        doc_id = metadata.get('document_id', 'unknown')
-        section_id = metadata.get('section_id', '')
-        section_text = metadata.get('section_text', '')
-        section_path = metadata.get('section_path', '')
-
-        # Create a unique key for grouping
        if doc_type == 'table':
-            table_num = metadata.get('table_number', 'unknown')
-            key = f"{doc_id}_table_{table_num}"
        elif doc_type == 'image':
-            image_num = metadata.get('image_number', 'unknown')
-            key = f"{doc_id}_image_{image_num}"
        else:
-            section_key = section_path if section_path else section_id
-
-            key = f"{doc_id}_text_{section_key}"
-
-        if key not in sources_by_doc:
-            sources_by_doc[key] = {
-                'doc_id': doc_id,
-                'doc_type': doc_type,
-                'metadata': metadata,
-                'sections': set()
-            }
-
-        # Add section information
-        if section_path:
-            sources_by_doc[key]['sections'].add(f"пункт {section_path}")
-        elif section_id and section_id != 'unknown':
-            sources_by_doc[key]['sections'].add(f"пункт {section_id}")
 
-
-    for source_info in sources_by_doc.values():
-        metadata = source_info['metadata']
-        doc_type = source_info['doc_type']
-        doc_id = source_info['doc_id']
-
-        html += f"<div style='margin-bottom: 15px; padding: 15px; border: 1px solid #4a5568; border-radius: 8px; background-color: #1a202c;'>"
-
-        if doc_type == 'text':
-            html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
 
-        elif doc_type == 'table':
-            table_num = metadata.get('table_number', 'unknown')
-            table_title = metadata.get('table_title', '')
-            if table_num and table_num != 'unknown':
-                if not str(table_num).startswith('№'):
-                    table_num = f"№{table_num}"
-                html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица {table_num} - {doc_id}</h4>"
-                if table_title and table_title != 'unknown':
-                    html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{table_title}</p>"
-            else:
-                html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc_id}</h4>"
-
-        elif doc_type == 'image':
-            image_num = metadata.get('image_number', 'unknown')
-            image_title = metadata.get('image_title', '')
-            section = metadata.get('section', '')
-            if image_num and image_num != 'unknown':
-                if not str(image_num).startswith('№'):
-                    image_num = f"№{image_num}"
-                html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {image_num} - {doc_id}</h4>"
-                if image_title and image_title != 'unknown':
-                    html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{image_title}</p>"
-                if section and section != 'unknown':
-                    html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 12px;'>Раздел: {section}</p>"
-            else:
-                html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение - {doc_id}</h4>"
-
-        # Add file link if available
-        if chunks_df is not None and 'file_link' in chunks_df.columns and doc_type == 'text':
-            doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
-            if not doc_rows.empty:
-                file_link = doc_rows.iloc[0]['file_link']
-                html += f"<a href='{file_link}' target='_blank' style='color: #68d391; text-decoration: none; font-size: 14px; display: inline-block; margin-top: 10px;'>🔗 Ссылка на документ</a><br>"
-
-        html += "</div>"
-
-    html += "</div>"
-    return html
-def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
-    if query_engine is None:
-        return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
-
     try:
-        start_time = time.time()
 
-        retrieved_nodes = query_engine.retriever.retrieve(question)
 
-        reranked_nodes = rerank_nodes(
-            question,
-            retrieved_nodes,
-            reranker,
-            top_k=25,
-            min_score_threshold=0.5,
-            diversity_penalty=0.3
-        )
-
-        formatted_context = format_context_for_llm(reranked_nodes)
-
-        enhanced_question = f"""Контекст из базы данных:
-{formatted_context}
 
-Вопрос: {question}
 
-Ответь на вопрос, используя ТОЛЬКО информацию из контекста.
-Если информации недостаточно, четко укажи это. Цитируй конкретные источники."""
-
-        response = query_engine.query(enhanced_question)
 
-        end_time = time.time()
-        processing_time = end_time - start_time
 
-
-
-
-            <div style='margin-top: 15px; padding-top: 10px; border-top: 1px solid #4a5568; font-size: 14px; color: #a0aec0;'>
-                Время обработки: {processing_time:.2f} секунд
-            </div>
-        </div>"""
-
-        chunk_info = []
-        for node in reranked_nodes:
-            metadata = node.metadata if hasattr(node, 'metadata') else {}
-            chunk_info.append({
-                'document_id': metadata.get('document_id', 'unknown'),
-                'section_id': metadata.get('section_id', metadata.get('section', 'unknown')),
-                'section_path': metadata.get('section_path', ''),
-                'section_text': metadata.get('section_text', ''),
-                'level': metadata.get('level', ''),
-                'parent_section': metadata.get('parent_section', ''),
-                'parent_title': metadata.get('parent_title', ''),
-                'type': metadata.get('type', 'text'),
-                'table_number': metadata.get('table_number', ''),
-                'image_number': metadata.get('image_number', ''),
-                'chunk_size': len(node.text),
-                'chunk_text': node.text
-            })
-        from app import create_chunks_display_html
-        chunks_html = create_chunks_display_html(chunk_info)
 
-        sources_html = generate_sources_html(reranked_nodes, chunks_df)
-
-        return answer_html, sources_html, chunks_html
-
-    except Exception as e:
-        log_message(f"Ошибка: {str(e)}")
-        return f"Ошибка: {str(e)}", "", ""
 from llama_index.llms.google_genai import GoogleGenAI
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
 from sentence_transformers import CrossEncoder
 from my_logging import log_message
 
+def get_llm_model(api_key, model_name="gemini-2.0-flash"):
+    """Get LLM model"""
+    return GoogleGenAI(model=model_name, api_key=api_key)
 
 def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
+    """Get embedding model"""
     return HuggingFaceEmbedding(model_name=model_name)
 
 def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
+    """Get reranker model"""
    return CrossEncoder(model_name)
 
+def format_sources(nodes):
+    """Format retrieved sources for display"""
+    sources = []
     for node in nodes:
+        meta = node.metadata
+        doc_type = meta.get('type', 'text')
+        doc_id = meta.get('document_id', 'unknown')
 
        if doc_type == 'table':
+            table_num = meta.get('table_number', 'unknown')
+            title = meta.get('table_title', '')
+            sources.append(f"📊 {doc_id} - Таблица {table_num}: {title}")
        elif doc_type == 'image':
+            img_num = meta.get('image_number', 'unknown')
+            sources.append(f"🖼️ {doc_id} - Рисунок {img_num}")
        else:
+            section = meta.get('section_id', '')
+            sources.append(f"📄 {doc_id} - Раздел {section}")
 
+    return "\n".join(set(sources))
 
+def answer_question(question, query_engine, reranker):
+    """Answer question using RAG"""
    try:
+        log_message(f"Query: {question}")
 
+        # Retrieve
+        retrieved = query_engine.retriever.retrieve(question)
+        log_message(f"Retrieved {len(retrieved)} nodes")
 
+        # Rerank
+        reranked = rerank_nodes(question, retrieved, reranker, top_k=15)
+        log_message(f"Reranked to {len(reranked)} nodes")
 
+        # Format context
+        context = "\n\n".join([
+            f"[{n.metadata.get('document_id', 'unknown')}]\n{n.text}"
+            for n in reranked
+        ])
 
+        # Generate answer
+        prompt = f"""Контекст из базы данных:
+{context}
 
+Вопрос: {question}
 
+Ответь на вопрос используя ТОЛЬКО информацию из контекста. Цитируй источники."""
 
+        response = query_engine.query(prompt)
 
+        sources = format_sources(reranked)
 
+        return response.response, sources
 
+    except Exception as e:
+        log_message(f"Error: {e}")
+        return f"Ошибка: {e}", ""
 
+def rerank_nodes(query, nodes, reranker, top_k=15, min_score=0.5):
+    """Rerank nodes with diversity"""
+    if not nodes:
+        return []
+
+    # Score all nodes
+    pairs = [[query, n.text] for n in nodes]
+    scores = reranker.predict(pairs)
+
+    # Sort by score
+    scored = sorted(zip(nodes, scores), key=lambda x: x[1], reverse=True)
+
+    # Filter by threshold
+    filtered = [(n, s) for n, s in scored if s >= min_score]
+
+    if not filtered:
+        # Fallback: keep nodes scoring at least 60% of the best score
+        cutoff = max(scores) * 0.6
+        filtered = [(n, s) for n, s in scored if s >= cutoff]
+
+    # Diversity selection
+    selected = []
+    seen_docs = set()
+
+    for node, score in filtered:
+        if len(selected) >= top_k:
+            break
 
+        doc_id = node.metadata.get('document_id', 'unknown')
+
+        # Prioritize diverse documents
+        if doc_id not in seen_docs or len(selected) < 5:
+            selected.append(node)
+            seen_docs.add(doc_id)
 
+    log_message(f"Reranked: {len(filtered)} → {len(selected)} (from {len(seen_docs)} docs)")
 
+    return selected
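
The rerank step can be exercised on its own. A small standalone sketch (the sample texts and metadata are illustrative; in the app the nodes come from query_engine.retriever.retrieve()):

    # Hypothetical smoke test for rerank_nodes with hand-built nodes.
    from llama_index.core.schema import TextNode, NodeWithScore
    from utils import get_reranker_model, rerank_nodes

    reranker = get_reranker_model()  # cross-encoder/ms-marco-MiniLM-L-12-v2
    nodes = [
        NodeWithScore(node=TextNode(text="Маркировка наносится на потребительскую упаковку.",
                                    metadata={"document_id": "DOC-1"}), score=0.0),
        NodeWithScore(node=TextNode(text="Срок хранения продукта - не более 12 месяцев.",
                                    metadata={"document_id": "DOC-2"}), score=0.0),
    ]
    top = rerank_nodes("требования к маркировке", nodes, reranker, top_k=5)
    for n in top:
        print(n.metadata["document_id"], n.text[:40])

Note that min_score=0.5 is compared against whatever scale the cross-encoder returns (for the ms-marco models this is typically a raw logit rather than a probability), and the 60%-of-best fallback keeps the function from returning an empty list when no node clears the threshold.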