# NOTE: Hugging Face Spaces status banner captured during export (not part of
# the application): "Spaces: Sleeping / Sleeping".
import html
import os
import sys

import gradio as gr
from llama_index.core import Settings

from config import (
    HF_REPO_ID, HF_TOKEN, DOWNLOAD_DIR, CHUNKS_FILENAME,
    JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
)
from documents_prep import load_json_documents, load_table_data, load_image_data, load_csv_chunks
from index_retriever import create_vector_index, create_query_engine
from main_utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
from my_logging import log_message
def create_chunks_display_html(chunk_info):
    """Render retrieved chunks as a scrollable HTML list for the UI.

    Args:
        chunk_info: list of chunk metadata dicts (keys such as
            'document_id', 'section_path', 'chunk_text'); may be empty/None.

    Returns:
        An HTML string; a black-on-white placeholder message when there are
        no chunks to display.
    """
    if not chunk_info:
        return "<div style='padding: 20px; text-align: center; color: black;'>Нет данных о чанках</div>"
    # Build the document in a list and join once instead of repeated +=.
    parts = ["<div style='max-height: 500px; overflow-y: auto; padding: 10px; color: black;'>"]
    parts.append(f"<h4 style='color: black;'>Найдено релевантных чанков: {len(chunk_info)}</h4>")
    for i, chunk in enumerate(chunk_info):
        # Alternate row background for readability.
        bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
        # Escape document-derived text: raw '<', '>' or '&' in chunk content
        # would otherwise break (or inject markup into) the rendered HTML.
        document_id = html.escape(str(chunk['document_id']))
        section_display = html.escape(get_section_display(chunk))
        formatted_content = html.escape(get_formatted_content(chunk))
        parts.append(f"""
        <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
            <strong style='color: black;'>Документ:</strong> <span style='color: black;'>{document_id}</span><br>
            <strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{section_display}</span><br>
            <strong style='color: black;'>Содержание:</strong><br>
            <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
                {formatted_content}
            </div>
        </div>
        """)
    parts.append("</div>")
    return "".join(parts)
def get_section_display(chunk):
    """Return a human-readable location label for a chunk.

    Tables/images with a known number are labelled "таблица №N" /
    "рисунок №N" (a '№' prefix is added only if missing); otherwise the
    section path is used, falling back to the section id ('unknown' when
    absent).

    Args:
        chunk: chunk metadata dict (relevant keys: 'type', 'table_number',
            'image_number', 'section_path', 'section_id').

    Returns:
        Display string for the UI.
    """
    doc_type = chunk.get('type', 'text')
    if doc_type == 'table' and chunk.get('table_number'):
        num = chunk.get('table_number')
        if not str(num).startswith('№'):
            num = f"№{num}"
        return f"таблица {num}"
    if doc_type == 'image' and chunk.get('image_number'):
        num = chunk.get('image_number')
        if not str(num).startswith('№'):
            num = f"№{num}"
        return f"рисунок {num}"
    # The original elif on section_id was redundant: every fall-through path
    # returned section_id anyway, so the tail collapses to path-or-id.
    return chunk.get('section_path', '') or chunk.get('section_id', 'unknown')
def get_formatted_content(chunk):
    """Build the Russian "В разделе … в документе …, пункт …: …" summary line.

    Nested levels (subsection and deeper) reference the parent section in the
    prefix; otherwise the section's own path/id is used, and a leading
    "<section> " prefix is stripped from the chunk text when present.

    Args:
        chunk: chunk metadata dict ('document_id', 'section_path',
            'section_id', 'section_text', 'parent_section', 'parent_title',
            'level', 'chunk_text').

    Returns:
        A single formatted display string.
    """
    doc_id = chunk.get('document_id', 'unknown')
    chunk_text = chunk.get('chunk_text', '')
    section_text = chunk.get('section_text', '')
    current = chunk.get('section_path', '') or chunk.get('section_id', 'unknown')

    parent = chunk.get('parent_section', '')
    nested = chunk.get('level', '') in ('subsection', 'sub_subsection', 'sub_sub_subsection')
    if nested and parent:
        parent_title = chunk.get('parent_title', '')
        parent_info = f"{parent} ({parent_title})" if parent_title else parent
        return f"В разделе {parent_info} в документе {doc_id}, пункт {current}: {chunk_text}"

    body = chunk_text
    prefix = f"{current} "
    if section_text and chunk_text.startswith(section_text):
        # Chunk already begins with the section title — reuse it verbatim.
        title = section_text
    elif chunk_text.startswith(prefix):
        # Drop the duplicated "<section> " prefix from the body text.
        body = chunk_text[len(prefix):].strip()
        if section_text:
            title = section_text
        else:
            head = body.split('.')[0] if '.' in body else body[:50]
            title = f"{prefix}{head}"
    else:
        title = section_text or current
    return f"В разделе {current} в документе {doc_id}, пункт {title}: {body}"
def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
                      json_files_dir=None, table_data_dir=None, image_data_dir=None,
                      use_json_instead_csv=False):
    """Build the full RAG pipeline: models, documents, vector index, query engine.

    Args:
        repo_id: Hugging Face repo holding the source documents.
        hf_token: HF access token used for downloads.
        download_dir: local cache directory (created if missing).
        chunks_filename: CSV file with pre-chunked text (CSV mode only).
        json_files_dir: repo subdir with JSON documents (JSON mode only).
        table_data_dir: optional repo subdir with table documents.
        image_data_dir: optional repo subdir with image documents.
        use_json_instead_csv: when True and json_files_dir is set, load JSON
            documents; otherwise fall back to the CSV chunks file.

    Returns:
        (query_engine, chunks_df, reranker, vector_index, chunk_info).
        chunks_df is only populated in CSV mode. On any failure the error is
        logged and (None, None, None, None, []) is returned.
    """
    try:
        # Imported inside the function rather than at module top —
        # presumably to defer a heavy or circular import; confirm.
        from documents_prep import process_documents_with_chunking
        log_message("Инициализация системы")
        os.makedirs(download_dir, exist_ok=True)
        from config import CHUNK_SIZE, CHUNK_OVERLAP
        from llama_index.core.text_splitter import TokenTextSplitter
        # Load models and register them as llama-index process-wide globals.
        embed_model = get_embedding_model()
        llm = get_llm_model(DEFAULT_MODEL)
        reranker = get_reranker_model()
        Settings.embed_model = embed_model
        Settings.llm = llm
        Settings.text_splitter = TokenTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
            separator=" ",
            backup_separators=["\n", ".", "!", "?"]
        )
        log_message(f"Configured chunk size: {CHUNK_SIZE} tokens")
        log_message(f"Configured chunk overlap: {CHUNK_OVERLAP} tokens")
        all_documents = []   # every document fed into the vector index
        chunks_df = None     # populated only when loading from CSV
        chunk_info = []      # per-chunk metadata for the UI chunk display
        if use_json_instead_csv and json_files_dir:
            log_message("Используем JSON файлы вместо CSV")
            json_documents, json_chunk_info = load_json_documents(repo_id, hf_token, json_files_dir, download_dir)
            all_documents.extend(json_documents)
            chunk_info.extend(json_chunk_info)
        else:
            if chunks_filename:
                log_message("Загружаем данные из CSV")
                csv_documents, chunks_df = load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir)
                all_documents.extend(csv_documents)
        # Table and image documents are appended in either mode.
        if table_data_dir:
            log_message("Добавляю табличные данные")
            table_documents = load_table_data(repo_id, hf_token, table_data_dir)
            log_message(f"Загружено {len(table_documents)} табличных документов")
            # Process table documents through chunking
            chunked_table_docs, table_chunk_info = process_documents_with_chunking(table_documents)
            all_documents.extend(chunked_table_docs)
            chunk_info.extend(table_chunk_info)
        if image_data_dir:
            log_message("Добавляю данные изображений")
            image_documents = load_image_data(repo_id, hf_token, image_data_dir)
            log_message(f"Загружено {len(image_documents)} документов изображений")
            # Process image documents through chunking
            chunked_image_docs, image_chunk_info = process_documents_with_chunking(image_documents)
            all_documents.extend(chunked_image_docs)
            chunk_info.extend(image_chunk_info)
        log_message(f"Всего документов после всей обработки: {len(all_documents)}")
        vector_index = create_vector_index(all_documents)
        query_engine = create_query_engine(vector_index)
        log_message(f"Система успешно инициализирована")
        return query_engine, chunks_df, reranker, vector_index, chunk_info
    except Exception as e:
        # Broad catch is the app-level boundary: log and signal failure to
        # the caller via a tuple of Nones rather than crashing startup.
        log_message(f"Ошибка инициализации: {str(e)}")
        return None, None, None, None, []
def switch_model(model_name, vector_index):
    """Load *model_name* as the active LLM and rebuild the query engine.

    The new LLM is installed into llama-index ``Settings`` before the index
    check, so it takes effect even when no index exists yet (in which case
    an error status is returned instead of an engine).

    Args:
        model_name: key of the model to activate.
        vector_index: existing vector index, or None if not initialized.

    Returns:
        (query_engine_or_None, human-readable status message).
    """
    from llama_index.core import Settings
    from index_retriever import create_query_engine
    try:
        log_message(f"Переключение на модель: {model_name}")
        Settings.llm = get_llm_model(model_name)
        if vector_index is None:
            return None, "❌ Ошибка: система не инициализирована"
        engine = create_query_engine(vector_index)
        log_message(f"Модель успешно переключена на: {model_name}")
        return engine, f"✅ Модель переключена на: {model_name}"
    except Exception as e:
        error_msg = f"Ошибка переключения модели: {str(e)}"
        log_message(error_msg)
        return None, f"❌ {error_msg}"
def main_answer_question(question):
    """Gradio callback: answer *question* using the module-level RAG state.

    Returns a 3-tuple of HTML strings: (answer, sources, relevant chunks).
    Blank/whitespace questions and backend exceptions both yield placeholder
    HTML instead of raising.

    NOTE(review): this definition is shadowed by a second, simpler
    `main_answer_question` later in this file; only the later one is bound
    at runtime — confirm which version is intended.
    """
    global query_engine, reranker, current_model, chunks_df
    # Guard against empty input before touching the query engine.
    if not question.strip():
        return ("<div style='color: black;'>Пожалуйста, введите вопрос</div>",
                "<div style='color: black;'>Источники появятся после обработки запроса</div>",
                "<div style='color: black;'>Чанки появятся после обработки запроса</div>")
    try:
        # Call the answer_question function which returns 3 values
        answer_html, sources_html, chunks_html = answer_question(question, query_engine, reranker, current_model, chunks_df)
        return answer_html, sources_html, chunks_html
    except Exception as e:
        # App-level boundary: surface the error in the UI instead of crashing.
        log_message(f"Ошибка при ответе на вопрос: {str(e)}")
        return (f"<div style='color: red;'>Ошибка: {str(e)}</div>",
                "<div style='color: black;'>Источники недоступны из-за ошибки</div>",
                "<div style='color: black;'>Чанки недоступны из-за ошибки</div>")
def create_demo_interface(answer_question_func, switch_model_func, current_model, chunk_info=None):
    """Build the Gradio Blocks UI for the AIEXP question-answering app.

    Args:
        answer_question_func: callable(question) -> (answer, sources, chunks)
            HTML strings; wired to the ask button and Enter-submit.
        switch_model_func: callable(model_name) -> status string; wired to
            the model-switch button.
        current_model: name of the initially active model, shown in the UI.
        chunk_info: accepted for interface compatibility but not used inside
            this function — NOTE(review): confirm whether it should feed the
            chunks panel.

    Returns:
        The constructed (unlaunched) gr.Blocks demo.
    """
    with gr.Blocks(title="AIEXP - AI Expert для нормативной документации", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # AIEXP - Artificial Intelligence Expert
        ## Инструмент для работы с нормативной документацией
        """)
        with gr.Tab("Поиск по нормативным документам"):
            gr.Markdown("### Задайте вопрос по нормативной документации")
            # Row 1: model selection and switching.
            with gr.Row():
                with gr.Column(scale=2):
                    model_dropdown = gr.Dropdown(
                        choices=list(AVAILABLE_MODELS.keys()),
                        value=current_model,
                        label="Выберите языковую модель",
                        info="Выберите модель для генерации ответов"
                    )
                with gr.Column(scale=1):
                    switch_btn = gr.Button("Переключить модель", variant="secondary")
                    model_status = gr.Textbox(
                        value=f"Текущая модель: {current_model}",
                        label="Статус модели",
                        interactive=False
                    )
            # Row 2: question input with clickable examples.
            with gr.Row():
                with gr.Column(scale=3):
                    question_input = gr.Textbox(
                        label="Ваш вопрос к базе знаний",
                        placeholder="Введите вопрос по нормативным документам...",
                        lines=3
                    )
                    ask_btn = gr.Button("Найти ответ", variant="primary", size="lg")
                    gr.Examples(
                        examples=[
                            "О чем этот рисунок: ГОСТ Р 50.04.07-2022 Приложение Л. Л.1.5 Рисунок Л.2",
                            "Л.9 Формула в ГОСТ Р 50.04.07 - 2022 что и о чем там?",
                            "Какой стандарт устанавливает порядок признания протоколов испытаний продукции в области использования атомной энергии?",
                            "Кто несет ответственность за организацию и проведение признания протоколов испытаний продукции?",
                            "В каких случаях могут быть признаны протоколы испытаний, проведенные лабораториями?",
                            "В какой таблице можно найти информацию о методы исследований при аттестационных испытаниях технологии термической обработки заготовок из легированных сталей? Какой документ и какой раздел?"
                        ],
                        inputs=question_input
                    )
            # Row 3: answer plus two side panels of retrieved material.
            with gr.Row():
                with gr.Column(scale=2):
                    answer_output = gr.HTML(
                        label="",
                        value=f"<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появится ответ на ваш вопрос...<br><small>Текущая модель: {current_model}</small></div>",
                    )
                with gr.Column(scale=1):
                    sources_output = gr.HTML(
                        label="",
                        value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся релевантные чанки...</div>",
                    )
                with gr.Column(scale=1):
                    chunks_output = gr.HTML(
                        label="Релевантные чанки",
                        value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся релевантные чанки...</div>",
                    )
            # Event wiring: the switch button updates only the status box;
            # both the ask button and Enter in the textbox run the QA callback.
            switch_btn.click(
                fn=switch_model_func,
                inputs=[model_dropdown],
                outputs=[model_status]
            )
            ask_btn.click(
                fn=answer_question_func,
                inputs=[question_input],
                outputs=[answer_output, sources_output, chunks_output]
            )
            question_input.submit(
                fn=answer_question_func,
                inputs=[question_input],
                outputs=[answer_output, sources_output, chunks_output]
            )
    return demo
# Module-level application state, populated by main() and mutated by the
# Gradio callbacks.
query_engine = None            # active llama-index query engine
chunks_df = None               # DataFrame of CSV chunks (None in JSON mode)
reranker = None                # reranker model passed to answer_question
vector_index = None            # vector index, kept to rebuild the engine on model switch
current_model = DEFAULT_MODEL  # name of the currently selected LLM
def main_answer_question(question):
    """Gradio callback: answer *question* using the module-level RAG state.

    This is the definition actually bound at runtime (it shadows an earlier
    one in this file). The earlier version guarded against blank questions
    and caught backend errors; this rewrite restores both so a blank input
    or an exception in the pipeline yields placeholder HTML instead of
    crashing the callback.

    Args:
        question: user's question text (may be empty/whitespace or None).

    Returns:
        A 3-tuple of HTML strings: (answer, sources, relevant chunks).
    """
    global query_engine, reranker, current_model, chunks_df
    # Guard against empty input before touching the query engine.
    if not question or not question.strip():
        return ("<div style='color: black;'>Пожалуйста, введите вопрос</div>",
                "<div style='color: black;'>Источники появятся после обработки запроса</div>",
                "<div style='color: black;'>Чанки появятся после обработки запроса</div>")
    try:
        answer_html, sources_html, chunks_html = answer_question(
            question, query_engine, reranker, current_model, chunks_df
        )
        return answer_html, sources_html, chunks_html
    except Exception as e:
        # App-level boundary: surface the error in the UI instead of crashing.
        log_message(f"Ошибка при ответе на вопрос: {str(e)}")
        return (f"<div style='color: red;'>Ошибка: {str(e)}</div>",
                "<div style='color: black;'>Источники недоступны из-за ошибки</div>",
                "<div style='color: black;'>Чанки недоступны из-за ошибки</div>")
def main_switch_model(model_name):
    """Gradio callback: switch the active LLM, updating module state on success.

    Delegates to switch_model(); the global query engine and current model
    name are replaced only when a new engine was actually created.

    Args:
        model_name: key of the model to activate.

    Returns:
        The human-readable status message for the UI status box.
    """
    global query_engine, vector_index, current_model
    engine, status = switch_model(model_name, vector_index)
    if engine:
        query_engine = engine
        current_model = model_name
    return status
def main():
    """Entry point: initialize the RAG system, then launch the Gradio UI.

    Exits the process with status 1 when initialization fails (no query
    engine could be built).
    """
    global query_engine, chunks_df, reranker, vector_index, current_model
    log_message("Запуск AIEXP - AI Expert для нормативной документации")
    # JSON mode is hard-coded on here, so CHUNKS_FILENAME/CSV loading is
    # bypassed for this deployment.
    query_engine, chunks_df, reranker, vector_index, chunk_info = initialize_system(
        repo_id=HF_REPO_ID,
        hf_token=HF_TOKEN,
        download_dir=DOWNLOAD_DIR,
        json_files_dir=JSON_FILES_DIR,
        table_data_dir=TABLE_DATA_DIR,
        image_data_dir=IMAGE_DATA_DIR,
        use_json_instead_csv=True,
    )
    if query_engine:
        log_message("Запуск веб-интерфейса")
        demo = create_demo_interface(
            answer_question_func=main_answer_question,
            switch_model_func=main_switch_model,
            current_model=current_model,
            chunk_info=chunk_info
        )
        # 0.0.0.0:7860 makes the app reachable from outside a container
        # (e.g. HF Spaces); share=True also opens a public gradio tunnel.
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=True,
            debug=False
        )
    else:
        # initialize_system already logged the failure details.
        log_message("Невозможно запустить приложение из-за ошибки инициализации")
        sys.exit(1)

if __name__ == "__main__":
    main()