Spaces:

MrSimple01
/

RAG_AIEXP_1

Sleeping

App Files Files Community

MrSimple01 committed on Sep 16, 2025

Commit

43fc13e

verified ·

1 Parent(s): 7329ea6

adding a new window for chunks

Browse files

Files changed (1) hide show

app.py +273 -219

app.py CHANGED Viewed

@@ -1,220 +1,274 @@
-import gradio as gr
-import os
-from llama_index.core import Settings
-from documents_prep import load_json_documents, load_table_data, load_image_data, load_csv_chunks
-from utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
-from my_logging import log_message
-from index_retriever import create_vector_index, create_query_engine
-import sys
-from config import (
-    HF_REPO_ID, HF_TOKEN, DOWNLOAD_DIR, CHUNKS_FILENAME,
-    JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
-)
-def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
-                     json_files_dir=None, table_data_dir=None, image_data_dir=None,
-                     use_json_instead_csv=False):
-    try:
-        log_message("Инициализация системы")
-        os.makedirs(download_dir, exist_ok=True)
-        embed_model = get_embedding_model()
-        llm = get_llm_model(DEFAULT_MODEL)
-        reranker = get_reranker_model()
-        Settings.embed_model = embed_model
-        Settings.llm = llm
-        all_documents = []
-        chunks_df = None
-        if use_json_instead_csv and json_files_dir:
-            log_message("Используем JSON файлы вместо CSV")
-            json_documents = load_json_documents(repo_id, hf_token, json_files_dir, download_dir)
-            all_documents.extend(json_documents)
-        else:
-            if chunks_filename:
-                log_message("Загружаем данные из CSV")
-                csv_documents, chunks_df = load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir)
-                all_documents.extend(csv_documents)
-        if table_data_dir:
-            log_message("Добавляю табличные данные")
-            table_documents = load_table_data(repo_id, hf_token, table_data_dir)
-            all_documents.extend(table_documents)
-        if image_data_dir:
-            log_message("Добавляю данные изображений")
-            image_documents = load_image_data(repo_id, hf_token, image_data_dir)
-            all_documents.extend(image_documents)
-        log_message(f"Всего документов: {len(all_documents)}")
-        vector_index = create_vector_index(all_documents)
-        query_engine = create_query_engine(vector_index)
-        log_message(f"Система успешно инициализирована")
-        return query_engine, chunks_df, reranker, vector_index
-    except Exception as e:
-        log_message(f"Ошибка инициализации: {str(e)}")
-        return None, None, None, None
-def switch_model(model_name, vector_index):
-    from llama_index.core import Settings
-    from index_retriever import create_query_engine
-    try:
-        log_message(f"Переключение на модель: {model_name}")
-        new_llm = get_llm_model(model_name)
-        Settings.llm = new_llm
-        if vector_index is not None:
-            new_query_engine = create_query_engine(vector_index)
-            log_message(f"Модель успешно переключена на: {model_name}")
-            return new_query_engine, f"✅ Модель переключена на: {model_name}"
-        else:
-            return None, "❌ Ошибка: система не инициализирована"
-    except Exception as e:
-        error_msg = f"Ошибка переключения модели: {str(e)}"
-        log_message(error_msg)
-        return None, f"❌ {error_msg}"
-def create_demo_interface(answer_question_func, switch_model_func, current_model):
-    with gr.Blocks(title="AIEXP - AI Expert для нормативной документации", theme=gr.themes.Soft()) as demo:
-        gr.Markdown("""
-        # AIEXP - Artificial Intelligence Expert
-        ## Инструмент для работы с нормативной документацией
-        """)
-        with gr.Tab("🏠 Поиск по нормативным документам"):
-            gr.Markdown("### Задайте вопрос по нормативной документации")
-            with gr.Row():
-                with gr.Column(scale=2):
-                    model_dropdown = gr.Dropdown(
-                        choices=list(AVAILABLE_MODELS.keys()),
-                        value=current_model,
-                        label="🤖 Выберите языковую модель",
-                        info="Выберите модель для генерации ответов"
-                    )
-                with gr.Column(scale=1):
-                    switch_btn = gr.Button("🔄 Переключить модель", variant="secondary")
-                    model_status = gr.Textbox(
-                        value=f"Текущая модель: {current_model}",
-                        label="Статус модели",
-                        interactive=False
-                    )
-            with gr.Row():
-                with gr.Column(scale=3):
-                    question_input = gr.Textbox(
-                        label="Ваш вопрос к базе знаний",
-                        placeholder="Введите вопрос по нормативным документам...",
-                        lines=3
-                    )
-                    ask_btn = gr.Button("🔍 Найти ответ", variant="primary", size="lg")
-                    gr.Examples(
-                        examples=[
-                            "О чем этот рисунок: ГОСТ Р 50.04.07-2022 Приложение Л. Л.1.5 Рисунок Л.2",
-                            "Л.9 Формула в ГОСТ Р 50.04.07 - 2022 что и о чем там?",
-                            "Какой стандарт устанавливает порядок признания протоколов испытаний продукции в области использования атомной энергии?",
-                            "Кто несет ответственность за организацию и проведение признания протоколов испытаний продукции?",
-                            "В каких случаях могут быть признаны протоколы испытаний, проведенные лабораториями?",
-                        ],
-                        inputs=question_input
-                    )
-            with gr.Row():
-                with gr.Column(scale=2):
-                    answer_output = gr.HTML(
-                        label="",
-                        value=f"<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появится ответ на ваш вопрос...<br><small>Текущая модель: {current_model}</small></div>",
-                    )
-                with gr.Column(scale=1):
-                    sources_output = gr.HTML(
-                        label="",
-                        value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся источники...</div>",
-                    )
-            switch_btn.click(
-                fn=switch_model_func,
-                inputs=[model_dropdown],
-                outputs=[model_status]
-            )
-            ask_btn.click(
-                fn=answer_question_func,
-                inputs=[question_input],
-                outputs=[answer_output, sources_output]
-            )
-            question_input.submit(
-                fn=answer_question_func,
-                inputs=[question_input],
-                outputs=[answer_output, sources_output]
-            )
-    return demo
-query_engine = None
-chunks_df = None
-reranker = None
-vector_index = None
-current_model = DEFAULT_MODEL
-def main_answer_question(question):
-    global query_engine, reranker, current_model, chunks_df
-    return answer_question(question, query_engine, reranker, current_model, chunks_df)
-def main_switch_model(model_name):
-    global query_engine, vector_index, current_model
-    new_query_engine, status_message = switch_model(model_name, vector_index)
-    if new_query_engine:
-        query_engine = new_query_engine
-        current_model = model_name
-    return status_message
-def main():
-    global query_engine, chunks_df, reranker, vector_index, current_model
-    log_message("Запуск AIEXP - AI Expert для нормативной документации")
-    query_engine, chunks_df, reranker, vector_index = initialize_system(
-        repo_id=HF_REPO_ID,
-        hf_token=HF_TOKEN,
-        download_dir=DOWNLOAD_DIR,
-        json_files_dir=JSON_FILES_DIR,
-        table_data_dir=TABLE_DATA_DIR,
-        image_data_dir=IMAGE_DATA_DIR,
-        use_json_instead_csv=True,
-    )
-    if query_engine:
-        log_message("Запуск веб-интерфейса")
-        demo = create_demo_interface(
-            answer_question_func=main_answer_question,
-            switch_model_func=main_switch_model,
-            current_model=current_model
-        )
-        demo.launch(
-            server_name="0.0.0.0",
-            server_port=7860,
-            share=True,
-            debug=False
-        )
-    else:
-        log_message("Невозможно запустить приложение из-за ошибки инициализации")
-        sys.exit(1)
-if __name__ == "__main__":
     main()

+import gradio as gr
+import os
+from llama_index.core import Settings
+from documents_prep import load_json_documents, load_table_data, load_image_data, load_csv_chunks
+from utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
+from my_logging import log_message
+from index_retriever import create_vector_index, create_query_engine
+import sys
+from config import (
+    HF_REPO_ID, HF_TOKEN, DOWNLOAD_DIR, CHUNKS_FILENAME,
+    JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
+)
+def create_chunks_display_html(chunk_info):
+    """Render chunk records as a scrollable HTML panel.
+
+    Args:
+        chunk_info: Sized iterable of dicts. Each dict is read for the keys
+            'document_id', 'section_id' and 'chunk_text'. A missing
+            'document_id' or 'section_id' is shown as 'unknown'; a missing
+            'chunk_text' is shown as an empty body.
+
+    Returns:
+        An HTML string: a placeholder div when ``chunk_info`` is empty or
+        None, otherwise a header with the chunk count followed by one card
+        per chunk.
+    """
+    # Function-scope import so the file's import block does not need to change.
+    from html import escape
+    if not chunk_info:
+        return "<div style='padding: 20px; text-align: center; color: black;'>Нет данных о чанках</div>"
+    parts = [
+        "<div style='max-height: 500px; overflow-y: auto; padding: 10px; color: black;'>",
+        f"<h4 style='color: black;'>Найдено релевантных чанков: {len(chunk_info)}</h4>",
+    ]
+    for i, chunk in enumerate(chunk_info):
+        # Alternate the card background so adjacent chunks are easy to tell apart.
+        bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
+        # Chunk fields are document content, not markup: escape them so that
+        # characters such as '<' or '&' cannot break or inject into the page.
+        document_id = escape(str(chunk.get('document_id', 'unknown')))
+        section_id = escape(str(chunk.get('section_id', 'unknown')))
+        chunk_text = escape(str(chunk.get('chunk_text', '')))
+        parts.append(f"""
+        <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
+            <strong style='color: black;'>Документ:</strong> <span style='color: black;'>{document_id}</span><br>
+            <strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{section_id}</span><br>
+            <strong style='color: black;'>Содержание:</strong><br>
+            <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
+                {chunk_text}
+            </div>
+        </div>
+        """)
+    parts.append("</div>")
+    return "".join(parts)
+def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
+                     json_files_dir=None, table_data_dir=None, image_data_dir=None,
+                     use_json_instead_csv=False):
+    """Load models and documents, then build the vector index and query engine.
+
+    Text documents come from JSON files when ``use_json_instead_csv`` is set
+    and ``json_files_dir`` is given; otherwise from the CSV named by
+    ``chunks_filename`` (if any). Table and image documents are appended when
+    their directories are given. The embedding model and default LLM are
+    installed on the global llama_index ``Settings``.
+
+    Returns:
+        A 5-tuple ``(query_engine, chunks_df, reranker, vector_index,
+        chunk_info)``. ``chunks_df`` is only set on the CSV path and
+        ``chunk_info`` is only filled on the JSON path. If anything raises,
+        the error is logged and ``(None, None, None, None, [])`` is returned.
+    """
+    try:
+        log_message("Инициализация системы")
+        os.makedirs(download_dir, exist_ok=True)
+        embedder = get_embedding_model()
+        default_llm = get_llm_model(DEFAULT_MODEL)
+        reranker = get_reranker_model()
+        Settings.embed_model = embedder
+        Settings.llm = default_llm
+        documents = []
+        chunks_df = None
+        chunk_info = []
+        if use_json_instead_csv and json_files_dir:
+            log_message("Используем JSON файлы вместо CSV")
+            loaded = load_json_documents(repo_id, hf_token, json_files_dir, download_dir)
+            json_docs, json_chunks = loaded
+            documents.extend(json_docs)
+            chunk_info.extend(json_chunks)
+        elif chunks_filename:
+            log_message("Загружаем данные из CSV")
+            csv_docs, chunks_df = load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir)
+            documents.extend(csv_docs)
+        if table_data_dir:
+            log_message("Добавляю табличные данные")
+            documents.extend(load_table_data(repo_id, hf_token, table_data_dir))
+        if image_data_dir:
+            log_message("Добавляю данные изображений")
+            documents.extend(load_image_data(repo_id, hf_token, image_data_dir))
+        log_message(f"Всего документов: {len(documents)}")
+        vector_index = create_vector_index(documents)
+        query_engine = create_query_engine(vector_index)
+        log_message("Система успешно инициализирована")
+        return query_engine, chunks_df, reranker, vector_index, chunk_info
+    except Exception as exc:
+        # Top-level boundary: report the failure and let the caller decide to abort.
+        log_message(f"Ошибка инициализации: {exc!s}")
+        return None, None, None, None, []
+def switch_model(model_name, vector_index):
+    """Install ``model_name`` as the active LLM and rebuild the query engine.
+
+    The new LLM is assigned to the global llama_index ``Settings`` before
+    ``vector_index`` is checked, so it is set even when no index exists.
+
+    Returns:
+        ``(query_engine, status_message)``. ``query_engine`` is None when
+        ``vector_index`` is None or when any step raises; the message then
+        describes the failure.
+    """
+    from llama_index.core import Settings
+    from index_retriever import create_query_engine
+    try:
+        log_message(f"Переключение на модель: {model_name}")
+        Settings.llm = get_llm_model(model_name)
+        if vector_index is None:
+            return None, "❌ Ошибка: система не инициализирована"
+        engine = create_query_engine(vector_index)
+        log_message(f"Модель успешно переключена на: {model_name}")
+        return engine, f"✅ Модель переключена на: {model_name}"
+    except Exception as exc:
+        failure = f"Ошибка переключения модели: {exc!s}"
+        log_message(failure)
+        return None, f"❌ {failure}"
+def main_answer_question(question):
+    """Answer ``question`` with the module-level engine, guarding input and errors.
+
+    Returns:
+        A 3-tuple of HTML strings ``(answer, sources, chunks)``. A
+        whitespace-only question yields prompt messages; an exception from
+        ``answer_question`` is logged and yields error messages.
+
+    NOTE(review): a second module-level ``main_answer_question`` is defined
+    later in this file and rebinds the name, so this version is not the one
+    the UI ends up calling.
+    """
+    global query_engine, reranker, current_model, chunks_df
+    if not question.strip():
+        return ("<div style='color: black;'>Пожалуйста, введите вопрос</div>",
+                "<div style='color: black;'>Источники появятся после обработки запроса</div>",
+                "<div style='color: black;'>Чанки появятся после обработки запроса</div>")
+    try:
+        # Unpack inside the try so a wrong-sized result is reported like any other failure.
+        result = answer_question(question, query_engine, reranker, current_model, chunks_df)
+        answer_html, sources_html, chunks_html = result
+    except Exception as exc:
+        log_message(f"Ошибка при ответе на вопрос: {exc!s}")
+        return (f"<div style='color: red;'>Ошибка: {exc!s}</div>",
+                "<div style='color: black;'>Источники недоступны из-за ошибки</div>",
+                "<div style='color: black;'>Чанки недоступны из-за ошибки</div>")
+    return answer_html, sources_html, chunks_html
+def create_demo_interface(answer_question_func, switch_model_func, current_model, chunk_info=None):
+    """Build the Gradio Blocks UI for question answering.
+
+    Args:
+        answer_question_func: Callback taking the question text and returning
+            three HTML strings (answer, sources, chunks).
+        switch_model_func: Callback taking the selected model name and
+            returning a status string.
+        current_model: Model name preselected in the dropdown and shown in
+            the initial status and answer placeholders.
+        chunk_info: Accepted for caller compatibility; not used by the layout.
+
+    Returns:
+        The constructed ``gr.Blocks`` app (not yet launched).
+    """
+    with gr.Blocks(title="AIEXP - AI Expert для нормативной документации", theme=gr.themes.Soft()) as demo:
+        gr.Markdown("""
+        # AIEXP - Artificial Intelligence Expert
+        ## Инструмент для работы с нормативной документацией
+        """)
+        with gr.Tab("Поиск по нормативным документам"):
+            gr.Markdown("### Задайте вопрос по нормативной документации")
+            # Row 1: model selection and switch status.
+            with gr.Row():
+                with gr.Column(scale=2):
+                    model_dropdown = gr.Dropdown(
+                        choices=list(AVAILABLE_MODELS.keys()),
+                        value=current_model,
+                        label="Выберите языковую модель",
+                        info="Выберите модель для генерации ответов"
+                    )
+                with gr.Column(scale=1):
+                    switch_btn = gr.Button("Переключить модель", variant="secondary")
+                    model_status = gr.Textbox(
+                        value=f"Текущая модель: {current_model}",
+                        label="Статус модели",
+                        interactive=False
+                    )
+            # Row 2: question input with example queries.
+            with gr.Row():
+                with gr.Column(scale=3):
+                    question_input = gr.Textbox(
+                        label="Ваш вопрос к базе знаний",
+                        placeholder="Введите вопрос по нормативным документам...",
+                        lines=3
+                    )
+                    ask_btn = gr.Button("Найти ответ", variant="primary", size="lg")
+                    gr.Examples(
+                        examples=[
+                            "О чем этот рисунок: ГОСТ Р 50.04.07-2022 Приложение Л. Л.1.5 Рисунок Л.2",
+                            "Л.9 Формула в ГОСТ Р 50.04.07 - 2022 что и о чем там?",
+                            "Какой стандарт устанавливает порядок признания протоколов испытаний продукции в области использования атомной энергии?",
+                            "Кто несет ответственность за организацию и проведение признания протоколов испытаний продукции?",
+                            "В каких случаях могут быть признаны протоколы испытаний, проведенные лабораториями?",
+                        ],
+                        inputs=question_input
+                    )
+            # Row 3: answer, sources and retrieved chunks side by side.
+            with gr.Row():
+                with gr.Column(scale=2):
+                    answer_output = gr.HTML(
+                        label="",
+                        value=f"<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появится ответ на ваш вопрос...<br><small>Текущая модель: {current_model}</small></div>",
+                    )
+                with gr.Column(scale=1):
+                    # This column shows sources, so its placeholder must say so
+                    # rather than repeat the chunks placeholder.
+                    sources_output = gr.HTML(
+                        label="",
+                        value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся источники...</div>",
+                    )
+                with gr.Column(scale=1):
+                    chunks_output = gr.HTML(
+                        label="Релевантные чанки",
+                        value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся релевантные чанки...</div>",
+                    )
+            switch_btn.click(
+                fn=switch_model_func,
+                inputs=[model_dropdown],
+                outputs=[model_status]
+            )
+            # The button click and Enter in the textbox run the same handler.
+            ask_btn.click(
+                fn=answer_question_func,
+                inputs=[question_input],
+                outputs=[answer_output, sources_output, chunks_output]
+            )
+            question_input.submit(
+                fn=answer_question_func,
+                inputs=[question_input],
+                outputs=[answer_output, sources_output, chunks_output]
+            )
+    return demo
+# Module-level application state: main() fills these in and the main_* callbacks read them.
+query_engine = None
+chunks_df = None
+reranker = None
+vector_index = None
+# Name of the LLM currently in use; starts at the configured default and is
+# updated by main_switch_model after a successful switch.
+current_model = DEFAULT_MODEL
+def main_answer_question(question):
+    """Answer ``question`` using the module-level engine, reranker and model.
+
+    This is the last module-level definition of the name, so it is the one
+    handed to the UI. It therefore carries the input guard and error handling
+    itself instead of calling ``answer_question`` unprotected.
+
+    Args:
+        question: Question text from the UI; None or whitespace-only input is
+            treated as empty.
+
+    Returns:
+        A 3-tuple of HTML strings ``(answer, sources, chunks)``. Empty input
+        yields prompt messages; an exception from ``answer_question`` is
+        logged and yields error messages instead of propagating to the UI.
+    """
+    global query_engine, reranker, current_model, chunks_df
+    # Function-scope import so the file's import block does not need to change.
+    from html import escape
+    if not question or not question.strip():
+        return ("<div style='color: black;'>Пожалуйста, введите вопрос</div>",
+                "<div style='color: black;'>Источники появятся после обработки запроса</div>",
+                "<div style='color: black;'>Чанки появятся после обработки запроса</div>")
+    try:
+        answer_html, sources_html, chunks_html = answer_question(
+            question, query_engine, reranker, current_model, chunks_df
+        )
+    except Exception as e:
+        log_message(f"Ошибка при ответе на вопрос: {str(e)}")
+        # The exception text is arbitrary; escape it before embedding it in HTML.
+        return (f"<div style='color: red;'>Ошибка: {escape(str(e))}</div>",
+                "<div style='color: black;'>Источники недоступны из-за ошибки</div>",
+                "<div style='color: black;'>Чанки недоступны из-за ошибки</div>")
+    return answer_html, sources_html, chunks_html
+def main_switch_model(model_name):
+    """Switch the active LLM and return the status message for the UI.
+
+    The module-level ``query_engine`` and ``current_model`` are replaced only
+    when ``switch_model`` returns a truthy engine; otherwise they keep their
+    previous values.
+    """
+    global query_engine, vector_index, current_model
+    engine, status = switch_model(model_name, vector_index)
+    if not engine:
+        return status
+    query_engine = engine
+    current_model = model_name
+    return status
+def main():
+    """Initialize the system and launch the web UI, or exit with status 1.
+
+    Stores the initialization results in the module-level state, then builds
+    and launches the Gradio app on 0.0.0.0:7860. If no query engine was
+    created, logs the failure and calls ``sys.exit(1)``.
+    """
+    global query_engine, chunks_df, reranker, vector_index, current_model
+    log_message("Запуск AIEXP - AI Expert для нормативной документации")
+    startup = initialize_system(
+        repo_id=HF_REPO_ID,
+        hf_token=HF_TOKEN,
+        download_dir=DOWNLOAD_DIR,
+        json_files_dir=JSON_FILES_DIR,
+        table_data_dir=TABLE_DATA_DIR,
+        image_data_dir=IMAGE_DATA_DIR,
+        use_json_instead_csv=True,
+    )
+    query_engine, chunks_df, reranker, vector_index, chunk_info = startup
+    if not query_engine:
+        # Without a query engine there is nothing to serve.
+        log_message("Невозможно запустить приложение из-за ошибки инициализации")
+        sys.exit(1)
+    log_message("Запуск веб-интерфейса")
+    demo = create_demo_interface(
+        answer_question_func=main_answer_question,
+        switch_model_func=main_switch_model,
+        current_model=current_model,
+        chunk_info=chunk_info
+    )
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=True,
+        debug=False
+    )
+# Start the app only when this file is run as a script, not when it is imported.
+if __name__ == "__main__":
     main()