Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Sep 12, 2025

Commit

ba52088

1 Parent(s): a7e15db

complete new structure

Browse files

Files changed (5) hide show

app.py +133 -90
config.py +10 -0
documents_prep.py +332 -387
index_retriever.py +61 -192
utils.py +135 -0

app.py CHANGED Viewed

@@ -1,83 +1,88 @@
 import gradio as gr
 import os
 import sys
-import logging
-import config
-from documents_prep import DocumentsPreparation
-from index_retriever import IndexRetriever
-from chat_handler import ChatHandler
-REPO_ID = "MrSimple01/AIEXP_RAG_FILES"
-HF_TOKEN = os.getenv('HF_TOKEN')
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-doc_prep = None
-index_retriever = None
-chat_handler = None
-def log_message(message):
-    logger.info(message)
-    print(message, flush=True)
-    sys.stdout.flush()
-def initialize_system():
-    global doc_prep, index_retriever, chat_handler
     try:
-        log_message("Запуск инициализации системы AIEXP")
-        doc_prep = DocumentsPreparation(REPO_ID, HF_TOKEN)
-        index_retriever = IndexRetriever(config=config)
-        log_message("Подготовка документов")
-        all_documents = doc_prep.prepare_all_documents()
-        if not all_documents:
-            log_message("Не удалось загрузить документы")
-            return False
-        log_message("Инициализация моделей и индекса")
-        if not index_retriever.initialize_models(all_documents):
-            log_message("Не удалось инициализировать модели")
-            return False
-        chat_handler = ChatHandler(index_retriever)
-        log_message("Система успешно инициализирована")
-        return True
     except Exception as e:
-        log_message(f"Ошибка инициализации системы: {str(e)}")
-        return False
-def handle_question(question):
-    if chat_handler is None:
-        return "Система не инициализирована", ""
-    return chat_handler.answer_question(question)
-def handle_model_switch(model_name):
-    if index_retriever is None:
-        return "Система не инициализирована"
-    return index_retriever.switch_model(model_name)
-def get_current_model_status():
-    if index_retriever is None:
-        return "Система не инициализирована"
-    return f"Текущая модель: {index_retriever.get_current_model()}"
-def get_chat_history_html():
-    if chat_handler is None:
-        return "История недоступна"
-    return chat_handler.get_history_html()
-def clear_chat_history():
-    if chat_handler is not None:
-        chat_handler.clear_history()
-    return "История очищена"
-def create_demo_interface():
     with gr.Blocks(title="AIEXP - AI Expert для нормативной документации", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
@@ -92,15 +97,15 @@ def create_demo_interface():
             with gr.Row():
                 with gr.Column(scale=2):
                     model_dropdown = gr.Dropdown(
-                        choices=list(config.AVAILABLE_MODELS.keys()),
-                        value=config.DEFAULT_MODEL,
                         label="🤖 Выберите языковую модель",
                         info="Выберите модель для генерации ответов"
                     )
                 with gr.Column(scale=1):
                     switch_btn = gr.Button("🔄 Переключить модель", variant="secondary")
                     model_status = gr.Textbox(
-                        value=get_current_model_status(),
                         label="Статус модели",
                         interactive=False
                     )
@@ -129,7 +134,7 @@ def create_demo_interface():
                 with gr.Column(scale=2):
                     answer_output = gr.HTML(
                         label="",
-                        value=f"<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появится ответ на ваш вопрос...<br><small>Текущая модель: {config.DEFAULT_MODEL}</small></div>",
                     )
                 with gr.Column(scale=1):
@@ -137,33 +142,68 @@ def create_demo_interface():
                         label="",
                         value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся источники...</div>",
                     )
-        switch_btn.click(
-            fn=handle_model_switch,
-            inputs=[model_dropdown],
-            outputs=[model_status]
-        )
-        ask_btn.click(
-            fn=handle_question,
-            inputs=[question_input],
-            outputs=[answer_output, sources_output]
-        )
-        question_input.submit(
-            fn=handle_question,
-            inputs=[question_input],
-            outputs=[answer_output, sources_output]
-        )
     return demo
-if __name__ == "__main__":
     log_message("Запуск AIEXP - AI Expert для нормативной документации")
-    if initialize_system():
         log_message("Запуск веб-интерфейса")
-        demo = create_demo_interface()
         demo.launch(
             server_name="0.0.0.0",
             server_port=7860,
@@ -172,4 +212,7 @@ if __name__ == "__main__":
         )
     else:
         log_message("Невозможно запустить приложение из-за ошибки инициализации")
-        sys.exit(1)

 import gradio as gr
 import os
+from llama_index.core import Settings
+from documents_prep import load_json_documents, load_table_data, load_image_data, load_csv_chunks
+from utils import get_llm_model, get_embedding_model, get_reranker_model, log_message, answer_question
+from index_retriever import create_vector_index, create_query_engine
 import sys
+from config import (
+    HF_REPO_ID, HF_TOKEN, DOWNLOAD_DIR, CHUNKS_FILENAME,
+    JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
+)
+def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
+                     json_files_dir=None, table_data_dir=None, image_data_dir=None,
+                     use_json_instead_csv=False):
     try:
+        log_message("Инициализация системы")
+        os.makedirs(download_dir, exist_ok=True)
+        embed_model = get_embedding_model()
+        llm = get_llm_model(DEFAULT_MODEL)
+        reranker = get_reranker_model()
+        Settings.embed_model = embed_model
+        Settings.llm = llm
+        all_documents = []
+        chunks_df = None
+        if use_json_instead_csv and json_files_dir:
+            log_message("Используем JSON файлы вместо CSV")
+            json_documents = load_json_documents(repo_id, hf_token, json_files_dir, download_dir)
+            all_documents.extend(json_documents)
+        else:
+            if chunks_filename:
+                log_message("Загружаем данные из CSV")
+                csv_documents, chunks_df = load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir)
+                all_documents.extend(csv_documents)
+        if table_data_dir:
+            log_message("Добавляю табличные данные")
+            table_documents = load_table_data(repo_id, hf_token, table_data_dir)
+            all_documents.extend(table_documents)
+        if image_data_dir:
+            log_message("Добавляю данные изображений")
+            image_documents = load_image_data(repo_id, hf_token, image_data_dir)
+            all_documents.extend(image_documents)
+        log_message(f"Всего документов: {len(all_documents)}")
+        vector_index = create_vector_index(all_documents)
+        query_engine = create_query_engine(vector_index)
+        log_message(f"Система успешно инициализирована")
+        return query_engine, chunks_df, reranker, vector_index
     except Exception as e:
+        log_message(f"Ошибка инициализации: {str(e)}")
+        return None, None, None, None
+def switch_model(model_name, vector_index):
+    from llama_index.core import Settings
+    from index_retriever import create_query_engine
+    try:
+        log_message(f"Переключение на модель: {model_name}")
+        new_llm = get_llm_model(model_name)
+        Settings.llm = new_llm
+        if vector_index is not None:
+            new_query_engine = create_query_engine(vector_index)
+            log_message(f"Модель успешно переключена на: {model_name}")
+            return new_query_engine, f"✅ Модель переключена на: {model_name}"
+        else:
+            return None, "❌ Ошибка: система не инициализирована"
+    except Exception as e:
+        error_msg = f"Ошибка переключения модели: {str(e)}"
+        log_message(error_msg)
+        return None, f"❌ {error_msg}"
+def create_demo_interface(answer_question_func, switch_model_func, current_model):
     with gr.Blocks(title="AIEXP - AI Expert для нормативной документации", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
             with gr.Row():
                 with gr.Column(scale=2):
                     model_dropdown = gr.Dropdown(
+                        choices=list(AVAILABLE_MODELS.keys()),
+                        value=current_model,
                         label="🤖 Выберите языковую модель",
                         info="Выберите модель для генерации ответов"
                     )
                 with gr.Column(scale=1):
                     switch_btn = gr.Button("🔄 Переключить модель", variant="secondary")
                     model_status = gr.Textbox(
+                        value=f"Текущая модель: {current_model}",
                         label="Статус модели",
                         interactive=False
                     )
                 with gr.Column(scale=2):
                     answer_output = gr.HTML(
                         label="",
+                        value=f"<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появится ответ на ваш вопрос...<br><small>Текущая модель: {current_model}</small></div>",
                     )
                 with gr.Column(scale=1):
                         label="",
                         value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся источники...</div>",
                     )
+            switch_btn.click(
+                fn=switch_model_func,
+                inputs=[model_dropdown],
+                outputs=[model_status]
+            )
+            ask_btn.click(
+                fn=answer_question_func,
+                inputs=[question_input],
+                outputs=[answer_output, sources_output]
+            )
+            question_input.submit(
+                fn=answer_question_func,
+                inputs=[question_input],
+                outputs=[answer_output, sources_output]
+            )
     return demo
+query_engine = None
+chunks_df = None
+reranker = None
+vector_index = None
+current_model = DEFAULT_MODEL
+def main_answer_question(question):
+    global query_engine, reranker, current_model, chunks_df
+    return answer_question(question, query_engine, reranker, current_model, chunks_df)
+def main_switch_model(model_name):
+    global query_engine, vector_index, current_model
+    new_query_engine, status_message = switch_model(model_name, vector_index)
+    if new_query_engine:
+        query_engine = new_query_engine
+        current_model = model_name
+    return status_message
+def main():
+    global query_engine, chunks_df, reranker, vector_index, current_model
     log_message("Запуск AIEXP - AI Expert для нормативной документации")
+    query_engine, chunks_df, reranker, vector_index = initialize_system(
+        repo_id=HF_REPO_ID,
+        hf_token=HF_TOKEN,
+        download_dir=DOWNLOAD_DIR,
+        json_files_dir=JSON_FILES_DIR,
+        table_data_dir=TABLE_DATA_DIR,
+        image_data_dir=IMAGE_DATA_DIR,
+    )
+    if query_engine:
         log_message("Запуск веб-интерфейса")
+        demo = create_demo_interface(
+            answer_question_func=main_answer_question,
+            switch_model_func=main_switch_model,
+            current_model=current_model
+        )
         demo.launch(
             server_name="0.0.0.0",
             server_port=7860,
         )
     else:
         log_message("Невозможно запустить приложение из-за ошибки инициализации")
+        sys.exit(1)
+if __name__ == "__main__":
+    main()

config.py CHANGED Viewed

@@ -6,6 +6,16 @@ SIMILARITY_THRESHOLD = 0.7
 RAG_FILES_DIR = "rag_files"
 PROCESSED_DATA_FILE = "processed_chunks.csv"
 GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
 OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
 HF_REPO_ID = "MrSimple01/AIEXP_RAG_FILES"

 RAG_FILES_DIR = "rag_files"
 PROCESSED_DATA_FILE = "processed_chunks.csv"
+REPO_ID = "MrSimple01/AIEXP_RAG_FILES"
+faiss_index_filename = "cleaned_faiss_index.index"
+CHUNKS_FILENAME = "processed_chunks.csv"
+TABLE_DATA_DIR = "Табличные данные_JSON"
+IMAGE_DATA_DIR = "Изображения"
+DOWNLOAD_DIR = "rag_files"
+JSON_FILES_DIR ="JSON"
+HF_TOKEN = os.getenv('HF_TOKEN')
 GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
 OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
 HF_REPO_ID = "MrSimple01/AIEXP_RAG_FILES"

documents_prep.py CHANGED Viewed

@@ -1,431 +1,376 @@
 import json
-import pandas as pd
-import os
 import zipfile
 from huggingface_hub import hf_hub_download, list_repo_files
 from llama_index.core import Document
-import logging
-logger = logging.getLogger(__name__)
-def log_message(message):
-    logger.info(message)
-    print(message, flush=True)
-class DocumentsPreparation:
-    def __init__(self, repo_id, hf_token):
-        self.repo_id = repo_id
-        self.hf_token = hf_token
-        self.json_files_dir = "JSON"
-        self.table_data_dir = "Табличные данные_JSON"
-        self.image_data_dir = "Изображения"
-        self.download_dir = "rag_files"
-    def extract_text_from_json(self, data, document_id, document_name):
-        documents = []
-        if 'sections' in data:
-            for section in data['sections']:
-                section_id = section.get('section_id', 'Unknown')
-                section_text = section.get('section_text', '')
-                if section_text.strip():
-                    doc = Document(
-                        text=section_text,
-                        metadata={
-                            "type": "text",
-                            "document_id": document_id,
-                            "document_name": document_name,
-                            "section_id": section_id,
-                            "level": "section"
-                        }
-                    )
-                    documents.append(doc)
-                if 'subsections' in section:
-                    for subsection in section['subsections']:
-                        subsection_id = subsection.get('subsection_id', 'Unknown')
-                        subsection_text = subsection.get('subsection_text', '')
-                        if subsection_text.strip():
-                            doc = Document(
-                                text=subsection_text,
-                                metadata={
-                                    "type": "text",
-                                    "document_id": document_id,
-                                    "document_name": document_name,
-                                    "section_id": section_id,
-                                    "subsection_id": subsection_id,
-                                    "level": "subsection"
-                                }
-                            )
-                            documents.append(doc)
-                        if 'sub_subsections' in subsection:
-                            for sub_subsection in subsection['sub_subsections']:
-                                sub_subsection_id = sub_subsection.get('sub_subsection_id', 'Unknown')
-                                sub_subsection_text = sub_subsection.get('sub_subsection_text', '')
-                                if sub_subsection_text.strip():
-                                    doc = Document(
-                                        text=sub_subsection_text,
-                                        metadata={
-                                            "type": "text",
-                                            "document_id": document_id,
-                                            "document_name": document_name,
-                                            "section_id": section_id,
-                                            "subsection_id": subsection_id,
-                                            "sub_subsection_id": sub_subsection_id,
-                                            "level": "sub_subsection"
-                                        }
-                                    )
-                                    documents.append(doc)
-                                if 'sub_sub_subsections' in sub_subsection:
-                                    for sub_sub_subsection in sub_subsection['sub_sub_subsections']:
-                                        sub_sub_subsection_id = sub_sub_subsection.get('sub_sub_subsection_id', 'Unknown')
-                                        sub_sub_subsection_text = sub_sub_subsection.get('sub_sub_subsection_text', '')
-                                        if sub_sub_subsection_text.strip():
-                                            doc = Document(
-                                                text=sub_sub_subsection_text,
-                                                metadata={
-                                                    "type": "text",
-                                                    "document_id": document_id,
-                                                    "document_name": document_name,
-                                                    "section_id": section_id,
-                                                    "subsection_id": subsection_id,
-                                                    "sub_subsection_id": sub_subsection_id,
-                                                    "sub_sub_subsection_id": sub_sub_subsection_id,
-                                                    "level": "sub_sub_subsection"
-                                                }
-                                            )
-                                            documents.append(doc)
-        return documents
-    def extract_zip_and_process_json(self, zip_path):
-        """Extract ZIP file and process JSON files inside"""
-        documents = []
-        try:
-            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-                # Get list of files in ZIP
-                zip_files = zip_ref.namelist()
-                json_files = [f for f in zip_files if f.endswith('.json') and not f.startswith('__MACOSX')]
-                log_message(f"Найдено {len(json_files)} JSON файлов в архиве")
-                for json_file in json_files:
-                    try:
-                        log_message(f"Обрабатываю файл из архива: {json_file}")
-                        # Read JSON file from ZIP
-                        with zip_ref.open(json_file) as f:
-                            json_data = json.load(f)
-                        document_metadata = json_data.get('document_metadata', {})
-                        document_id = document_metadata.get('document_id', 'unknown')
-                        document_name = document_metadata.get('document_name', 'unknown')
-                        docs = self.extract_text_from_json(json_data, document_id, document_name)
-                        documents.extend(docs)
-                        log_message(f"Извлечено {len(docs)} документов из {json_file}")
-                    except Exception as e:
-                        log_message(f"Ошибка обработки файла {json_file}: {str(e)}")
-                        continue
-        except Exception as e:
-            log_message(f"Ошибка извлечения ZIP архива {zip_path}: {str(e)}")
-        return documents
-    def load_json_documents(self):
-        log_message("Начинаю загрузку JSON документов")
-        try:
-            files = list_repo_files(repo_id=self.repo_id, repo_type="dataset", token=self.hf_token)
-            zip_files = [f for f in files if f.startswith(self.json_files_dir) and f.endswith('.zip')]
-            json_files = [f for f in files if f.startswith(self.json_files_dir) and f.endswith('.json')]
-            log_message(f"Найдено {len(zip_files)} ZIP файлов и {len(json_files)} прямых JSON файлов")
-            all_documents = []
-            for zip_file_path in zip_files:
-                try:
-                    log_message(f"Загружаю ZIP архив: {zip_file_path}")
-                    local_zip_path = hf_hub_download(
-                        repo_id=self.repo_id,
-                        filename=zip_file_path,
-                        local_dir=self.download_dir,
-                        repo_type="dataset",
-                        token=self.hf_token
-                    )
-                    documents = self.extract_zip_and_process_json(local_zip_path)
-                    all_documents.extend(documents)
-                except Exception as e:
-                    log_message(f"Ошибка обработки ZIP файла {zip_file_path}: {str(e)}")
-                    continue
-            # Process direct JSON files (if any)
-            for file_path in json_files:
                 try:
-                    log_message(f"Обрабатываю прямой JSON файл: {file_path}")
-                    local_path = hf_hub_download(
-                        repo_id=self.repo_id,
-                        filename=file_path,
-                        local_dir=self.download_dir,
-                        repo_type="dataset",
-                        token=self.hf_token
-                    )
-                    with open(local_path, 'r', encoding='utf-8') as f:
                         json_data = json.load(f)
                     document_metadata = json_data.get('document_metadata', {})
                     document_id = document_metadata.get('document_id', 'unknown')
                     document_name = document_metadata.get('document_name', 'unknown')
-                    documents = self.extract_text_from_json(json_data, document_id, document_name)
-                    all_documents.extend(documents)
-                    log_message(f"Извлечено {len(documents)} документов из {file_path}")
                 except Exception as e:
-                    log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
                     continue
-            log_message(f"Всего создано {len(all_documents)} текстовых документов")
-            return all_documents
-        except Exception as e:
-            log_message(f"Ошибка загрузки JSON документов: {str(e)}")
-            return []
-    def table_to_document(self, table_data, document_id=None):
-        content = ""
-        if isinstance(table_data, dict):
-            doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))
-            table_num = table_data.get('table_number', 'Неизвестно')
-            table_title = table_data.get('table_title', 'Неизвестно')
-            section = table_data.get('section', 'Неизвестно')
-            content += f"Таблица: {table_num}\n"
-            content += f"Название: {table_title}\n"
-            content += f"Документ: {doc_id}\n"
-            content += f"Раздел: {section}\n"
-            if 'data' in table_data and isinstance(table_data['data'], list):
-                for row in table_data['data']:
-                    if isinstance(row, dict):
-                        row_text = " | ".join([f"{k}: {v}" for k, v in row.items()])
-                        content += f"{row_text}\n"
-        return Document(
-            text=content,
-            metadata={
-                "type": "table",
-                "table_number": table_data.get('table_number', 'unknown'),
-                "table_title": table_data.get('table_title', 'unknown'),
-                "document_id": doc_id or table_data.get('document_id', table_data.get('document', 'unknown')),
-                "section": table_data.get('section', 'unknown')
-            }
-        )
-    def extract_zip_and_process_tables(self, zip_path):
-        """Extract ZIP file and process table JSON files inside"""
-        documents = []
-        try:
-            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-                zip_files = zip_ref.namelist()
-                json_files = [f for f in zip_files if f.endswith('.json') and not f.startswith('__MACOSX')]
-                log_message(f"Найдено {len(json_files)} JSON файлов таблиц в архиве")
-                for json_file in json_files:
-                    try:
-                        log_message(f"Обрабатываю файл таблицы из архива: {json_file}")
-                        # Read JSON file from ZIP
-                        with zip_ref.open(json_file) as f:
-                            table_data = json.load(f)
-                        if isinstance(table_data, dict):
-                            document_id = table_data.get('document', 'unknown')
-                            if 'sheets' in table_data:
-                                for sheet in table_data['sheets']:
-                                    sheet['document'] = document_id
-                                    doc = self.table_to_document(sheet, document_id)
-                                    documents.append(doc)
-                            else:
-                                doc = self.table_to_document(table_data, document_id)
-                                documents.append(doc)
-                        elif isinstance(table_data, list):
-                            for table_json in table_data:
-                                doc = self.table_to_document(table_json)
-                                documents.append(doc)
-                    except Exception as e:
-                        log_message(f"Ошибка обработки файла таблицы {json_file}: {str(e)}")
-                        continue
-        except Exception as e:
-            log_message(f"Ошибка извлечения ZIP архива таблиц {zip_path}: {str(e)}")
-        return documents
-    def load_table_documents(self):
-        log_message("Начинаю загрузку табличных данных")
-        try:
-            files = list_repo_files(repo_id=self.repo_id, repo_type="dataset", token=self.hf_token)
-            # Look for ZIP files in the table directory
-            zip_files = [f for f in files if f.startswith(self.table_data_dir) and f.endswith('.zip')]
-            # Also look for direct JSON files (fallback)
-            table_files = [f for f in files if f.startswith(self.table_data_dir) and f.endswith('.json')]
-            log_message(f"Найдено {len(zip_files)} ZIP файлов с таблицами и {len(table_files)} прямых JSON файлов")
-            table_documents = []
-            # Process ZIP files first
-            for zip_file_path in zip_files:
-                try:
-                    log_message(f"Загружаю ZIP архив таблиц: {zip_file_path}")
-                    local_zip_path = hf_hub_download(
-                        repo_id=self.repo_id,
-                        filename=zip_file_path,
-                        local_dir=self.download_dir,
-                        repo_type="dataset",
-                        token=self.hf_token
-                    )
-                    documents = self.extract_zip_and_process_tables(local_zip_path)
-                    table_documents.extend(documents)
-                except Exception as e:
-                    log_message(f"Ошибка обработки ZIP файла таблиц {zip_file_path}: {str(e)}")
-                    continue
-            # Process direct JSON files (if any)
-            for file_path in table_files:
-                try:
-                    log_message(f"Обрабатываю прямой файл таблицы: {file_path}")
-                    local_path = hf_hub_download(
-                        repo_id=self.repo_id,
-                        filename=file_path,
-                        local_dir=self.download_dir,
-                        repo_type="dataset",
-                        token=self.hf_token
-                    )
-                    with open(local_path, 'r', encoding='utf-8') as f:
-                        table_data = json.load(f)
-                        if isinstance(table_data, dict):
-                            document_id = table_data.get('document', 'unknown')
-                            if 'sheets' in table_data:
-                                for sheet in table_data['sheets']:
-                                    sheet['document'] = document_id
-                                    doc = self.table_to_document(sheet, document_id)
-                                    table_documents.append(doc)
-                            else:
-                                doc = self.table_to_document(table_data, document_id)
-                                table_documents.append(doc)
-                        elif isinstance(table_data, list):
-                            for table_json in table_data:
-                                doc = self.table_to_document(table_json)
                                 table_documents.append(doc)
-                except Exception as e:
-                    log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
-                    continue
-            log_message(f"Создано {len(table_documents)} документов из таблиц")
-            return table_documents
-        except Exception as e:
-            log_message(f"Ошибка загрузки табличных данных: {str(e)}")
-            return []
-    def load_image_documents(self):
-        log_message("Начинаю загрузку данных изображений")
-        try:
-            files = list_repo_files(repo_id=self.repo_id, repo_type="dataset", token=self.hf_token)
-            image_files = [f for f in files if f.startswith(self.image_data_dir) and f.endswith('.csv')]
-            log_message(f"Найдено {len(image_files)} CSV файлов с изображениями")
-            image_documents = []
-            for file_path in image_files:
-                try:
-                    log_message(f"Обрабатываю файл изображений: {file_path}")
-                    local_path = hf_hub_download(
-                        repo_id=self.repo_id,
-                        filename=file_path,
-                        local_dir=self.download_dir,
-                        repo_type="dataset",
-                        token=self.hf_token
-                    )
-                    df = pd.read_csv(local_path)
-                    log_message(f"Загружено {len(df)} записей изображений из файла {file_path}")
-                    for _, row in df.iterrows():
-                        content = f"Изображение: {row.get('№ Изображения', 'Неизвестно')}\n"
-                        content += f"Название: {row.get('Название изображения', 'Неизвестно')}\n"
-                        content += f"Описание: {row.get('Описание изображение', 'Неизвестно')}\n"
-                        content += f"Документ: {row.get('Обозначение документа', 'Неизвестно')}\n"
-                        content += f"Раздел: {row.get('Раздел документа', 'Неизвестно')}\n"
-                        content += f"Файл: {row.get('Файл изображения', 'Неизвестно')}\n"
-                        doc = Document(
-                            text=content,
-                            metadata={
-                                "type": "image",
-                                "image_number": row.get('№ Изображения', 'unknown'),
-                                "document_id": row.get('Обозначение документа', 'unknown'),
-                                "file_path": row.get('Файл изображения', 'unknown'),
-                                "section": row.get('Раздел документа', 'unknown')
-                            }
-                        )
-                        image_documents.append(doc)
-                except Exception as e:
-                    log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
-                    continue
-            log_message(f"Создано {len(image_documents)} документов из изображений")
-            return image_documents
-        except Exception as e:
-            log_message(f"Ошибка загрузки данных изображений: {str(e)}")
-            return []
-    def prepare_all_documents(self):
-        log_message("Подготовка всех документов")
-        all_documents = []
-        json_documents = self.load_json_documents()
-        all_documents.extend(json_documents)
-        table_documents = self.load_table_documents()
-        all_documents.extend(table_documents)
-        image_documents = self.load_image_documents()
-        all_documents.extend(image_documents)
-        log_message(f"Всего подготовлено {len(all_documents)} документов")
-        return all_documents

 import json
 import zipfile
+import pandas as pd
 from huggingface_hub import hf_hub_download, list_repo_files
 from llama_index.core import Document
+from utils import log_message
+# Nesting levels of the source JSON, outermost first. For a level named X the
+# node keys are 'X_id' and 'X_text', and its children live under '<next level>s'.
+_TEXT_LEVELS = ("section", "subsection", "sub_subsection", "sub_sub_subsection")
+def _collect_level_documents(nodes, depth, base_metadata, ancestor_ids, documents):
+    """Append a Document for every non-empty node at ``depth``, then recurse into its children.
+
+    ``ancestor_ids`` holds the ``*_id`` metadata of the enclosing levels so each
+    Document records the full path down to its own level.
+    """
+    level = _TEXT_LEVELS[depth]
+    # A missing or null child list is treated as "no children".
+    for node in nodes or []:
+        ids = {**ancestor_ids, f"{level}_id": node.get(f"{level}_id", 'Unknown')}
+        # A JSON null text would make .strip() raise; treat it like a missing text.
+        text = node.get(f"{level}_text") or ''
+        if text.strip():
+            documents.append(Document(
+                text=text,
+                metadata={**base_metadata, **ids, "level": level}
+            ))
+        if depth + 1 < len(_TEXT_LEVELS):
+            children = node.get(f"{_TEXT_LEVELS[depth + 1]}s")
+            _collect_level_documents(children, depth + 1, base_metadata, ids, documents)
+def extract_text_from_json(data, document_id, document_name):
+    """Flatten the nested sections of one parsed JSON document into Documents.
+
+    Args:
+        data: parsed JSON; only its 'sections' entry is read.
+        document_id: stored in every Document's metadata as "document_id".
+        document_name: stored in every Document's metadata as "document_name".
+
+    Returns:
+        A list of Documents in depth-first order (a node precedes its children).
+        A node contributes a Document only when its text is non-blank; blank
+        nodes are still descended into. Metadata carries "type" == "text", the
+        ids of the node and all its ancestors, and "level" naming the depth.
+    """
+    documents = []
+    if 'sections' in data:
+        base_metadata = {
+            "type": "text",
+            "document_id": document_id,
+            "document_name": document_name
+        }
+        _collect_level_documents(data['sections'], 0, base_metadata, {}, documents)
+    return documents
+def extract_zip_and_process_json(zip_path):
+    """Read every JSON member of the ZIP archive at ``zip_path`` and convert it to Documents.
+
+    Members are read directly from the archive (nothing is extracted to disk).
+    Returns the list of Documents produced by extract_text_from_json for all
+    members that could be processed; failures are logged, never raised.
+    """
+    documents = []
+    try:
+        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+            zip_files = zip_ref.namelist()
+            # Entries whose path starts with '__MACOSX' are skipped.
+            json_files = [f for f in zip_files if f.endswith('.json') and not f.startswith('__MACOSX')]
+            log_message(f"Найдено {len(json_files)} JSON файлов в архиве")
+            for json_file in json_files:
                 try:
+                    log_message(f"Обрабатываю файл из архива: {json_file}")
+                    with zip_ref.open(json_file) as f:
                         json_data = json.load(f)
+                    # Identification comes from the optional 'document_metadata' object.
                     document_metadata = json_data.get('document_metadata', {})
                     document_id = document_metadata.get('document_id', 'unknown')
                     document_name = document_metadata.get('document_name', 'unknown')
+                    docs = extract_text_from_json(json_data, document_id, document_name)
+                    documents.extend(docs)
+                    log_message(f"Извлечено {len(docs)} документов из {json_file}")
                 except Exception as e:
+                    # One bad member must not abort the rest of the archive.
+                    log_message(f"Ошибка обработки файла {json_file}: {str(e)}")
                     continue
+    except Exception as e:
+        # Archive-level failure: log it and return whatever was collected so far.
+        log_message(f"Ошибка извлечения ZIP архива {zip_path}: {str(e)}")
+    return documents
+def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
+    """Download and parse all text documents stored under ``json_files_dir``.
+
+    Both ZIP archives of JSON files and stand-alone JSON files are handled;
+    archives are processed first. Each file is downloaded into
+    ``download_dir``. A file that fails is logged and skipped. Returns the
+    combined list of Documents, or an empty list when the repository listing
+    itself fails.
+    """
+    log_message("Начинаю загрузку JSON документов")
+    try:
+        repo_listing = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
+        in_json_dir = [name for name in repo_listing if name.startswith(json_files_dir)]
+        archives = [name for name in in_json_dir if name.endswith('.zip')]
+        plain_jsons = [name for name in in_json_dir if name.endswith('.json')]
+        log_message(f"Найдено {len(archives)} ZIP файлов и {len(plain_jsons)} прямых JSON файлов")
+        collected = []
+        # Pass 1: archives, each expanded by extract_zip_and_process_json.
+        for archive in archives:
+            try:
+                log_message(f"Загружаю ZIP архив: {archive}")
+                fetched_archive = hf_hub_download(
+                    repo_id=repo_id,
+                    filename=archive,
+                    local_dir=download_dir,
+                    repo_type="dataset",
+                    token=hf_token
+                )
+                collected.extend(extract_zip_and_process_json(fetched_archive))
+            except Exception as e:
+                log_message(f"Ошибка обработки ZIP файла {archive}: {e}")
+                continue
+        # Pass 2: JSON files stored directly in the repository.
+        for json_name in plain_jsons:
+            try:
+                log_message(f"Обрабатываю прямой JSON файл: {json_name}")
+                fetched_json = hf_hub_download(
+                    repo_id=repo_id,
+                    filename=json_name,
+                    local_dir=download_dir,
+                    repo_type="dataset",
+                    token=hf_token
+                )
+                with open(fetched_json, 'r', encoding='utf-8') as handle:
+                    payload = json.load(handle)
+                meta = payload.get('document_metadata', {})
+                extracted = extract_text_from_json(
+                    payload,
+                    meta.get('document_id', 'unknown'),
+                    meta.get('document_name', 'unknown')
+                )
+                collected.extend(extracted)
+                log_message(f"Извлечено {len(extracted)} документов из {json_name}")
+            except Exception as e:
+                log_message(f"Ошибка обработки файла {json_name}: {e}")
+                continue
+        log_message(f"Всего создано {len(collected)} текстовых документов")
+        return collected
+    except Exception as e:
+        log_message(f"Ошибка загрузки JSON документов: {e}")
+        return []
+def table_to_document(table_data, document_id=None):
+    """Render one table description as a text Document.
+
+    Args:
+        table_data: dict describing the table; the keys read are 'table_number',
+            'table_title', 'section', 'document_id' / 'document' and 'data'
+            (a list of row dicts).
+        document_id: overrides the document id found in ``table_data`` when truthy.
+
+    Returns:
+        A Document whose text is a four-line header followed by one
+        "key: value | key: value" line per dict row, with ``type == "table"``
+        metadata.
+
+    Raises:
+        TypeError: if ``table_data`` is not a dict. Previously this surfaced as
+            an AttributeError from inside the metadata construction.
+    """
+    if not isinstance(table_data, dict):
+        raise TypeError(f"table_data must be a dict, got {type(table_data).__name__}")
+    doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))
+    lines = [
+        f"Таблица: {table_data.get('table_number', 'Неизвестно')}",
+        f"Название: {table_data.get('table_title', 'Неизвестно')}",
+        f"Документ: {doc_id}",
+        f"Раздел: {table_data.get('section', 'Неизвестно')}",
+    ]
+    rows = table_data.get('data')
+    if isinstance(rows, list):
+        # Rows that are not dicts are ignored.
+        lines.extend(
+            " | ".join(f"{k}: {v}" for k, v in row.items())
+            for row in rows
+            if isinstance(row, dict)
+        )
+    return Document(
+        text="".join(f"{line}\n" for line in lines),
+        metadata={
+            "type": "table",
+            "table_number": table_data.get('table_number', 'unknown'),
+            "table_title": table_data.get('table_title', 'unknown'),
+            # Same value as shown in the text header; the former `doc_id or ...`
+            # fallback re-read the very keys doc_id was computed from.
+            "document_id": doc_id,
+            "section": table_data.get('section', 'unknown')
+        }
+    )
+def load_table_data(repo_id, hf_token, table_data_dir):
+    """Load table descriptions from the JSON files under ``table_data_dir`` as Documents.
+
+    Three file layouts are handled: a dict with a 'sheets' list (one Document
+    per sheet), a dict describing a single table, and a list of table dicts.
+    A file that fails is logged and skipped. Returns the list of Documents, or
+    an empty list when the repository listing fails.
+    """
+    log_message("Начинаю загрузку табличных данных")
+    table_files = []
+    try:
+        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
+        for file in files:
+            if file.startswith(table_data_dir) and file.endswith('.json'):
+                table_files.append(file)
+        log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
+        table_documents = []
+        for file_path in table_files:
+            try:
+                log_message(f"Обрабатываю файл: {file_path}")
+                # NOTE(review): local_dir is hard-coded to '' here, while
+                # load_json_documents and load_csv_chunks take a download_dir
+                # argument - confirm this difference is intended.
+                local_path = hf_hub_download(
+                    repo_id=repo_id,
+                    filename=file_path,
+                    local_dir='',
+                    repo_type="dataset",
+                    token=hf_token
+                )
+                with open(local_path, 'r', encoding='utf-8') as f:
+                    table_data = json.load(f)
+                    if isinstance(table_data, dict):
+                        document_id = table_data.get('document', 'unknown')
+                        if 'sheets' in table_data:
+                            for sheet in table_data['sheets']:
+                                # The file-level document id is copied onto each sheet.
+                                sheet['document'] = document_id
+                                doc = table_to_document(sheet, document_id)
                                 table_documents.append(doc)
+                        else:
+                            doc = table_to_document(table_data, document_id)
+                            table_documents.append(doc)
+                    elif isinstance(table_data, list):
+                        # NOTE(review): an exception raised for one element skips
+                        # the remaining tables of this file (handled by the
+                        # per-file except below).
+                        for table_json in table_data:
+                            doc = table_to_document(table_json)
+                            table_documents.append(doc)
+            except Exception as e:
+                log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
+                continue
+        log_message(f"Создано {len(table_documents)} документов из таблиц")
+        return table_documents
+    except Exception as e:
+        log_message(f"Ошибка загрузки табличных данных: {str(e)}")
+        return []
+def load_image_data(repo_id, hf_token, image_data_dir, download_dir=''):
+    """Load image descriptions from the CSV files under ``image_data_dir`` as Documents.
+
+    Args:
+        repo_id: Hugging Face dataset repository to read from.
+        hf_token: access token forwarded to the hub calls.
+        image_data_dir: repository path prefix selecting the CSV files.
+        download_dir: local directory handed to ``hf_hub_download``. The default
+            '' is the value that used to be hard-coded, so existing callers are
+            unaffected; pass the directory used by the other loaders to keep all
+            downloads together.
+
+    Returns:
+        One Document per CSV row with ``type == "image"`` metadata. A file that
+        fails is logged and skipped; an empty list is returned when the
+        repository listing fails.
+    """
+    log_message("Начинаю загрузку данных изображений")
+    try:
+        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
+        image_files = [f for f in files if f.startswith(image_data_dir) and f.endswith('.csv')]
+        log_message(f"Найдено {len(image_files)} CSV файлов с изображениями")
+        image_documents = []
+        for file_path in image_files:
+            try:
+                log_message(f"Обрабатываю файл изображений: {file_path}")
+                local_path = hf_hub_download(
+                    repo_id=repo_id,
+                    filename=file_path,
+                    local_dir=download_dir,
+                    repo_type="dataset",
+                    token=hf_token
+                )
+                df = pd.read_csv(local_path)
+                log_message(f"Загружено {len(df)} записей изображений из файла {file_path}")
+                for _, row in df.iterrows():
+                    # The column names are the Russian headers of the source CSV.
+                    content = f"Изображение: {row.get('№ Изображения', 'Неизвестно')}\n"
+                    content += f"Название: {row.get('Название изображения', 'Неизвестно')}\n"
+                    content += f"Описание: {row.get('Описание изображение', 'Неизвестно')}\n"
+                    content += f"Документ: {row.get('Обозначение документа', 'Неизвестно')}\n"
+                    content += f"Раздел: {row.get('Раздел документа', 'Неизвестно')}\n"
+                    content += f"Файл: {row.get('Файл изображения', 'Неизвестно')}\n"
+                    image_documents.append(Document(
+                        text=content,
+                        metadata={
+                            "type": "image",
+                            "image_number": row.get('№ Изображения', 'unknown'),
+                            "document_id": row.get('Обозначение документа', 'unknown'),
+                            "file_path": row.get('Файл изображения', 'unknown'),
+                            "section": row.get('Раздел документа', 'unknown')
+                        }
+                    ))
+            except Exception as e:
+                # One unreadable CSV must not abort the remaining files.
+                log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
+                continue
+        log_message(f"Создано {len(image_documents)} документов из изображений")
+        return image_documents
+    except Exception as e:
+        log_message(f"Ошибка загрузки данных изображений: {str(e)}")
+        return []
+def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
+    """Download the chunk CSV and turn each row into a text Document.
+
+    Args:
+        repo_id: Hugging Face dataset repository to read from.
+        hf_token: access token forwarded to ``hf_hub_download``.
+        chunks_filename: path of the CSV file inside the repository.
+        download_dir: local directory for the downloaded file.
+
+    Returns:
+        ``(documents, chunks_df)``: the Documents and the DataFrame they were
+        built from, or ``([], None)`` if anything fails (the error is logged).
+    """
+    log_message("Загружаю данные чанков из CSV")
+    try:
+        chunks_csv_path = hf_hub_download(
+            repo_id=repo_id,
+            filename=chunks_filename,
+            local_dir=download_dir,
+            repo_type="dataset",
+            token=hf_token
+        )
+        chunks_df = pd.read_csv(chunks_csv_path)
+        log_message(f"Загружено {len(chunks_df)} чанков из CSV")
+        # Columns this function itself reads as metadata. 'chunk_id' contains
+        # the substring 'chunk', so without this exclusion an id column placed
+        # before the real text column would be picked as the text.
+        metadata_columns = {'chunk_id', 'document_id'}
+        candidates = [
+            col for col in chunks_df.columns
+            if any(marker in str(col).lower() for marker in ('text', 'content', 'chunk'))
+        ]
+        preferred = [col for col in candidates if str(col).lower() not in metadata_columns]
+        if preferred:
+            text_column = preferred[0]
+        elif candidates:
+            # Nothing better is available: keep the previous first-match choice.
+            text_column = candidates[0]
+        else:
+            text_column = chunks_df.columns[0]
+        log_message(f"Использую колонку: {text_column}")
+        documents = []
+        for i, (_, row) in enumerate(chunks_df.iterrows()):
+            documents.append(Document(
+                text=str(row[text_column]),
+                metadata={
+                    # The row position stands in when the CSV has no chunk_id column.
+                    "chunk_id": row.get('chunk_id', i),
+                    "document_id": row.get('document_id', 'unknown'),
+                    "type": "text"
+                }
+            ))
+        log_message(f"Создано {len(documents)} текстовых документов из CSV")
+        return documents, chunks_df
+    except Exception as e:
+        log_message(f"Ошибка загрузки CSV данных: {str(e)}")
+        return [], None

index_retriever.py CHANGED Viewed

@@ -1,207 +1,76 @@
 from llama_index.core import VectorStoreIndex, Settings
-from llama_index.embeddings.huggingface import HuggingFaceEmbedding
-from llama_index.llms.google_genai import GoogleGenAI
-from llama_index.llms.openai import OpenAI
 from llama_index.core.query_engine import RetrieverQueryEngine
 from llama_index.core.retrievers import VectorIndexRetriever
 from llama_index.core.response_synthesizers import get_response_synthesizer, ResponseMode
 from llama_index.core.prompts import PromptTemplate
 from llama_index.retrievers.bm25 import BM25Retriever
 from llama_index.core.retrievers import QueryFusionRetriever
-from sentence_transformers import CrossEncoder
-import logging
-from config import *
-logger = logging.getLogger(__name__)
-def log_message(message):
-    logger.info(message)
-    print(message, flush=True)
-class IndexRetriever:
-    def __init__(self, config):
-        self.config = config
-        self.vector_index = None
-        self.query_engine = None
-        self.reranker = None
-        self.current_model = config.DEFAULT_MODEL
-    def get_llm_model(self, model_name):
-        try:
-            model_config = self.config.AVAILABLE_MODELS.get(model_name)
-            if not model_config:
-                log_message(f"Модель {model_name} не найдена, использую модель по умолчанию")
-                model_config = self.config.AVAILABLE_MODELS[self.config.DEFAULT_MODEL]
-            if not model_config.get("api_key"):
-                raise Exception(f"API ключ не найден для модели {model_name}")
-            if model_config["provider"] == "google":
-                return GoogleGenAI(
-                    model=model_config["model_name"],
-                    api_key=model_config["api_key"]
-                )
-            elif model_config["provider"] == "openai":
-                return OpenAI(
-                    model=model_config["model_name"],
-                    api_key=model_config["api_key"]
-                )
-            else:
-                raise Exception(f"Неподдерживаемый провайдер: {model_config['provider']}")
-        except Exception as e:
-            log_message(f"Ошибка создания модели {model_name}: {str(e)}")
-            return GoogleGenAI(model="gemini-2.0-flash", api_key=self.config.GOOGLE_API_KEY)
-    def initialize_models(self, documents):
-        try:
-            log_message("Инициализация моделей и индекса")
-            embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
-            llm = self.get_llm_model(self.current_model)
-            log_message("Инициализирую переранкер")
-            self.reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
-            Settings.embed_model = embed_model
-            Settings.llm = llm
-            log_message(f"Строю векторный индекс из {len(documents)} документов")
-            self.vector_index = VectorStoreIndex.from_documents(documents)
-            self.create_query_engine()
-            log_message(f"Модели и индекс успешно инициализированы с моделью: {self.current_model}")
-            return True
-        except Exception as e:
-            log_message(f"Ошибка инициализации моделей: {str(e)}")
-            return False
-    def create_query_engine(self):
-        try:
-            log_message(f"Применяется промпт: {self.config.PROMPT_SIMPLE_POISK[:100]}...")
-            bm25_retriever = BM25Retriever.from_defaults(
-                docstore=self.vector_index.docstore,
-                similarity_top_k=15
-            )
-            vector_retriever = VectorIndexRetriever(
-                index=self.vector_index,
-                similarity_top_k=20,
-                similarity_cutoff=0.5
-            )
-            hybrid_retriever = QueryFusionRetriever(
-                [vector_retriever, bm25_retriever],
-                similarity_top_k=30,
-                num_queries=1
-            )
-            custom_prompt_template = PromptTemplate(self.config.PROMPT_SIMPLE_POISK)
-            response_synthesizer = get_response_synthesizer(
-                response_mode=ResponseMode.TREE_SUMMARIZE,
-                text_qa_template=custom_prompt_template
-            )
-            self.query_engine = RetrieverQueryEngine(
-                retriever=hybrid_retriever,
-                response_synthesizer=response_synthesizer
-            )
-            log_message("Query engine успешно создан с кастомным промптом")
-        except Exception as e:
-            log_message(f"Ошибка создания query engine: {str(e)}")
-            raise
-    def query(self, question):
-        """Метод для выполнения запроса с применением промпта"""
-        if self.query_engine is None:
-            log_message("❌ Query engine не инициализирован")
-            return "❌ Система не инициализирована"
-        try:
-            log_message(f"Получен вопрос: {question}")
-            log_message(f"Используется модель: {self.current_model}")
-            log_message(f"Применяется промпт: {self.config.PROMPT_SIMPLE_POISK[:150]}...")
-            log_message(f"Обрабатываю запрос: {question}")
-            response = self.query_engine.query(question)
-            log_message(f"Ответ получен, длина: {len(str(response))}")
-            return str(response)
-        except Exception as e:
-            error_msg = f"Ошибка обработки запроса: {str(e)}"
-            log_message(error_msg)
-            return f"❌ {error_msg}"
-    def switch_model(self, model_name):
-        try:
-            log_message(f"Переключение на модель: {model_name}")
-            new_llm = self.get_llm_model(model_name)
-            Settings.llm = new_llm
-            if self.vector_index is not None:
-                self.create_query_engine()
-                self.current_model = model_name
-                log_message(f"Модель успешно переключена на: {model_name}")
-                return f"✅ Модель переключена на: {model_name}"
-            else:
-                return "❌ Ошибка: система не инициализирована"
-        except Exception as e:
-            error_msg = f"Ошибка переключения модели: {str(e)}"
-            log_message(error_msg)
-            return f"❌ {error_msg}"
-    def rerank_nodes(self, query, nodes, top_k=10):
-        if not nodes or not self.reranker:
-            return nodes[:top_k]
-        try:
-            log_message(f"Переранжирую {len(nodes)} узлов")
-            pairs = []
-            for node in nodes:
-                pairs.append([query, node.text])
-            scores = self.reranker.predict(pairs)
-            scored_nodes = list(zip(nodes, scores))
-            scored_nodes.sort(key=lambda x: x[1], reverse=True)
-            reranked_nodes = [node for node, score in scored_nodes[:top_k]]
-            log_message(f"Возвращаю топ-{len(reranked_nodes)} переранжированных узлов")
-            return reranked_nodes
-        except Exception as e:
-            log_message(f"Ошибка переранжировки: {str(e)}")
-            return nodes[:top_k]
-    def retrieve_nodes(self, question):
-        if self.query_engine is None:
-            return []
-        try:
-            log_message(f"Извлекаю релевантные узлы для вопроса: {question}")
-            retrieved_nodes = self.query_engine.retriever.retrieve(question)
-            log_message(f"Извлечено {len(retrieved_nodes)} узлов")
-            log_message("Применяю переранжировку")
-            reranked_nodes = self.rerank_nodes(question, retrieved_nodes, top_k=10)
-            return reranked_nodes
-        except Exception as e:
-            log_message(f"Ошибка извлечения узлов: {str(e)}")
-            return []
-    def get_current_model(self):
-        return self.current_model
-    def is_initialized(self):
-        return self.query_engine is not None

 from llama_index.core import VectorStoreIndex, Settings
 from llama_index.core.query_engine import RetrieverQueryEngine
 from llama_index.core.retrievers import VectorIndexRetriever
 from llama_index.core.response_synthesizers import get_response_synthesizer, ResponseMode
 from llama_index.core.prompts import PromptTemplate
 from llama_index.retrievers.bm25 import BM25Retriever
 from llama_index.core.retrievers import QueryFusionRetriever
+from utils import log_message
+from config import CUSTOM_PROMPT
+def create_vector_index(documents):
+    """Log the start of indexing and return a VectorStoreIndex built from ``documents``."""
+    log_message("Строю векторный индекс")
+    index = VectorStoreIndex.from_documents(documents)
+    return index
+def create_query_engine(vector_index):
+    """Build the hybrid (vector + BM25) query engine that answers with CUSTOM_PROMPT.
+
+    Args:
+        vector_index: index whose docstore feeds the BM25 retriever and which
+            itself backs the vector retriever.
+
+    Returns:
+        A RetrieverQueryEngine combining a QueryFusionRetriever with a
+        tree-summarize response synthesizer.
+
+    Raises:
+        Exception: any construction error is logged and re-raised unchanged.
+    """
+    try:
+        bm25_retriever = BM25Retriever.from_defaults(
+            docstore=vector_index.docstore,
+            similarity_top_k=15
+        )
+        # NOTE(review): similarity_cutoff is passed as an extra keyword here -
+        # confirm that VectorIndexRetriever actually applies it.
+        vector_retriever = VectorIndexRetriever(
+            index=vector_index,
+            similarity_top_k=20,
+            similarity_cutoff=0.5
+        )
+        hybrid_retriever = QueryFusionRetriever(
+            [vector_retriever, bm25_retriever],
+            similarity_top_k=30,
+            num_queries=1
+        )
+        custom_prompt_template = PromptTemplate(CUSTOM_PROMPT)
+        # In TREE_SUMMARIZE mode the synthesizer is built from summary_template;
+        # a prompt passed only as text_qa_template is not used. Supplying it
+        # under both names makes the custom prompt effective and keeps the call
+        # valid if the response mode is changed later.
+        response_synthesizer = get_response_synthesizer(
+            response_mode=ResponseMode.TREE_SUMMARIZE,
+            text_qa_template=custom_prompt_template,
+            summary_template=custom_prompt_template
+        )
+        query_engine = RetrieverQueryEngine(
+            retriever=hybrid_retriever,
+            response_synthesizer=response_synthesizer
+        )
+        log_message("Query engine успешно создан")
+        return query_engine
+    except Exception as e:
+        log_message(f"Ошибка создания query engine: {str(e)}")
+        raise
+def rerank_nodes(query, nodes, reranker, top_k=10):
+    """Reorder retrieved nodes by reranker score and keep the best ``top_k``.
+
+    Args:
+        query: the user question paired with every node text.
+        nodes: retrieved nodes exposing ``.text``; may be empty or None.
+        reranker: object with ``predict(pairs)`` returning one score per pair;
+            when falsy the nodes are returned in their original order.
+        top_k: maximum number of nodes to return.
+
+    Returns:
+        At most ``top_k`` nodes, highest score first. If scoring fails the
+        error is logged and the first ``top_k`` nodes are returned unranked.
+    """
+    if nodes is None:
+        # `not nodes` used to let None through to the slice below (TypeError).
+        return []
+    if not nodes or not reranker:
+        return nodes[:top_k]
+    try:
+        log_message(f"Переранжирую {len(nodes)} узлов")
+        pairs = [[query, node.text] for node in nodes]
+        scores = reranker.predict(pairs)
+        # sorted() is stable, so equal scores keep their retrieval order.
+        ranked = sorted(zip(nodes, scores), key=lambda item: item[1], reverse=True)
+        reranked_nodes = [node for node, _ in ranked[:top_k]]
+        log_message(f"Возвращаю топ-{len(reranked_nodes)} переранжированных узлов")
+        return reranked_nodes
+    except Exception as e:
+        log_message(f"Ошибка переранжировки: {str(e)}")
+        return nodes[:top_k]

utils.py ADDED Viewed

	@@ -0,0 +1,135 @@

+import logging
+import sys
+from llama_index.llms.google_genai import GoogleGenAI
+from llama_index.llms.openai import OpenAI
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from sentence_transformers import CrossEncoder
+from config import AVAILABLE_MODELS, DEFAULT_MODEL, GOOGLE_API_KEY
+import time
+from index_retriever import rerank_nodes
+from utils import log_message, generate_sources_html
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+def log_message(message):
+    """Send ``message`` to the module logger at INFO level and echo it to stdout, flushed."""
+    logger.info(message)
+    print(message, file=sys.stdout, flush=True)
+    sys.stdout.flush()
+def get_llm_model(model_name):
+    """Instantiate the LLM client configured under ``model_name`` in AVAILABLE_MODELS.
+
+    An unknown name falls back to the DEFAULT_MODEL entry. Any failure (missing
+    API key, unsupported provider, constructor error) is logged and answered
+    with a GoogleGenAI "gemini-2.0-flash" client using GOOGLE_API_KEY.
+    """
+    try:
+        settings = AVAILABLE_MODELS.get(model_name)
+        if not settings:
+            log_message(f"Модель {model_name} не найдена, использую модель по умолчанию")
+            settings = AVAILABLE_MODELS[DEFAULT_MODEL]
+        if not settings.get("api_key"):
+            raise Exception(f"API ключ не найден для модели {model_name}")
+        # Provider name -> client class; both take the same keyword arguments.
+        llm_classes = {"google": GoogleGenAI, "openai": OpenAI}
+        provider = settings["provider"]
+        if provider not in llm_classes:
+            raise Exception(f"Неподдерживаемый провайдер: {settings['provider']}")
+        return llm_classes[provider](
+            model=settings["model_name"],
+            api_key=settings["api_key"]
+        )
+    except Exception as e:
+        log_message(f"Ошибка создания модели {model_name}: {e}")
+        return GoogleGenAI(model="gemini-2.0-flash", api_key=GOOGLE_API_KEY)
+def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
+    """Return a HuggingFaceEmbedding for ``model_name``."""
+    embedding = HuggingFaceEmbedding(model_name=model_name)
+    return embedding
+def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
+    """Return a sentence-transformers CrossEncoder for ``model_name``."""
+    cross_encoder = CrossEncoder(model_name)
+    return cross_encoder
+def generate_sources_html(nodes, chunks_df=None):
+    """Render the source nodes of an answer as an HTML panel.
+
+    Args:
+        nodes: iterable of objects with an optional ``metadata`` dict; the keys
+            read are 'type', 'document_id', 'table_number', 'image_number' and
+            'section'.
+        chunks_df: optional DataFrame; when it has a 'file_link' column, text
+            sources get a link taken from the first row whose 'document_id'
+            matches.
+
+    Returns:
+        One HTML string: an outer panel holding one card per node.
+    """
+    # Local import: the values below come from data files and must not be able
+    # to inject markup into the page.
+    from html import escape
+    parts = [
+        "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>",
+        "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>",
+    ]
+    has_links = chunks_df is not None and 'file_link' in chunks_df.columns
+    for node in nodes:
+        metadata = node.metadata if hasattr(node, 'metadata') else {}
+        doc_type = metadata.get('type', 'text')
+        doc_id = metadata.get('document_id', 'unknown')
+        safe_doc_id = escape(str(doc_id))
+        parts.append("<div style='margin-bottom: 15px; padding: 15px; border: 1px solid #4a5568; border-radius: 8px; background-color: #1a202c;'>")
+        if doc_type == 'text':
+            parts.append(f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {safe_doc_id}</h4>")
+        elif doc_type == 'table':
+            table_num = metadata.get('table_number', 'unknown')
+            if table_num and table_num != 'unknown':
+                # Table numbers may be ints in the source JSON; str() mirrors
+                # the image branch and avoids AttributeError on .startswith.
+                table_num = str(table_num)
+                if not table_num.startswith('№'):
+                    table_num = f"№{table_num}"
+                parts.append(f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица {escape(table_num)} - {safe_doc_id}</h4>")
+            else:
+                parts.append(f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {safe_doc_id}</h4>")
+        elif doc_type == 'image':
+            image_num = metadata.get('image_number', 'unknown')
+            safe_section = escape(str(metadata.get('section', '')))
+            if image_num and image_num != 'unknown':
+                image_num = str(image_num)
+                if not image_num.startswith('№'):
+                    image_num = f"№{image_num}"
+                parts.append(f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {escape(image_num)} - {safe_doc_id} ({safe_section})</h4>")
+            else:
+                parts.append(f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение - {safe_doc_id} ({safe_section})</h4>")
+        if has_links and doc_type == 'text':
+            # Matching uses the raw id; only the emitted link is escaped.
+            doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
+            if not doc_rows.empty:
+                file_link = escape(str(doc_rows.iloc[0]['file_link']), quote=True)
+                parts.append(f"<a href='{file_link}' target='_blank' style='color: #68d391; text-decoration: none; font-size: 14px; display: inline-block; margin-top: 10px;'>🔗 Ссылка на документ</a><br>")
+        parts.append("</div>")
+    parts.append("</div>")
+    return "".join(parts)
+def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
+    """Answer ``question`` and return ``(answer_html, sources_html)``.
+
+    Args:
+        question: the user question.
+        query_engine: engine exposing ``retriever.retrieve`` and ``synthesize``;
+            None yields an "not initialised" error panel.
+        reranker: passed to rerank_nodes to pick the best retrieved nodes.
+        current_model: model name shown in the answer header and the log.
+        chunks_df: optional DataFrame forwarded to generate_sources_html.
+
+    Returns:
+        Two HTML strings: the answer panel (with processing time) and the
+        sources panel. On any error the first is an error panel and the second
+        is "".
+    """
+    if query_engine is None:
+        return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", ""
+    try:
+        # Imported here so the function stays importable when only the
+        # "not initialised" path above is exercised.
+        from llama_index.core import QueryBundle
+        log_message(f"Получен вопрос: {question}")
+        log_message(f"Используется модель: {current_model}")
+        start_time = time.time()
+        log_message("Извлекаю релевантные узлы")
+        retrieved_nodes = query_engine.retriever.retrieve(question)
+        log_message(f"Извлечено {len(retrieved_nodes)} узлов")
+        log_message("Применяю переранжировку")
+        reranked_nodes = rerank_nodes(question, retrieved_nodes, reranker, top_k=10)
+        log_message(f"Отправляю запрос в LLM с {len(reranked_nodes)} узлами")
+        # Synthesize from the reranked nodes. query_engine.query(question) would
+        # run retrieval a second time and ignore the reranking, so the answer
+        # would not be based on the sources shown to the user.
+        response = query_engine.synthesize(QueryBundle(question), reranked_nodes)
+        end_time = time.time()
+        processing_time = end_time - start_time
+        log_message(f"Обработка завершена за {processing_time:.2f} секунд")
+        sources_html = generate_sources_html(reranked_nodes, chunks_df)
+        answer_with_time = f"""<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; margin-bottom: 10px;'>
+        <h3 style='color: #63b3ed; margin-top: 0;'>Ответ (Модель: {current_model}):</h3>
+        <div style='line-height: 1.6; font-size: 16px;'>{response.response}</div>
+        <div style='margin-top: 15px; padding-top: 10px; border-top: 1px solid #4a5568; font-size: 14px; color: #a0aec0;'>
+        Время обработки: {processing_time:.2f} секунд
+        </div>
+        </div>"""
+        return answer_with_time, sources_html
+    except Exception as e:
+        log_message(f"Ошибка обработки вопроса: {str(e)}")
+        error_msg = f"<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Ошибка обработки вопроса: {str(e)}</div>"
+        return error_msg, ""