Commit 147e01b (parent: 9160af0) — new version of rag
app.py CHANGED
@@ -1,354 +1,392 @@
Removed (old version of app.py; lines the diff viewer left blank or cut off are omitted, and a few truncated lines are completed from the identical code this commit moves into app_1.py):

```diff
-import gradio as gr
 import os
 import sys

-    return html
-
-def get_section_display(chunk):
-    section_path = chunk.get('section_path', '')
-    section_id = chunk.get('section_id', 'unknown')
-    doc_type = chunk.get('type', 'text')
-
-    if doc_type == 'table' and chunk.get('table_number'):
-        table_num = chunk.get('table_number')
-        if not str(table_num).startswith('№'):
-            table_num = f"№{table_num}"
-        return f"таблица {table_num}"
-
-    if doc_type == 'image' and chunk.get('image_number'):
-        image_num = chunk.get('image_number')
-        if not str(image_num).startswith('№'):
-            image_num = f"№{image_num}"
-        return f"рисунок {image_num}"
-
-    if section_path:
-        return section_path
-    elif section_id and section_id != 'unknown':
-        return section_id
-
-    return section_id
-
-def get_formatted_content(chunk):
-    document_id = chunk.get('document_id', 'unknown')
-    section_path = chunk.get('section_path', '')
-    section_id = chunk.get('section_id', 'unknown')
-    section_text = chunk.get('section_text', '')
-    parent_section = chunk.get('parent_section', '')
-    parent_title = chunk.get('parent_title', '')
-    level = chunk.get('level', '')
-    chunk_text = chunk.get('chunk_text', '')
-    doc_type = chunk.get('type', 'text')
-
-    # For text documents
-    if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section:
-        current_section = section_path if section_path else section_id
-        parent_info = f"{parent_section} ({parent_title})" if parent_title else parent_section
-        return f"В разделе {parent_info} в документе {document_id}, пункт {current_section}: {chunk_text}"
-    else:
-        current_section = section_path if section_path else section_id
-        clean_text = chunk_text
-        if section_text and chunk_text.startswith(section_text):
-            section_title = section_text
-        elif chunk_text.startswith(f"{current_section} "):
-            clean_text = chunk_text[len(f"{current_section} "):].strip()
-            section_title = section_text if section_text else f"{current_section} {clean_text.split('.')[0] if '.' in clean_text else clean_text[:50]}"
-        else:
-            section_title = section_text if section_text else current_section
-
-        return f"В разделе {current_section} в документе {document_id}, пункт {section_title}: {clean_text}"
-
-def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
-                      json_files_dir=None, table_data_dir=None, image_data_dir=None,
-                      use_json_instead_csv=False):
-    try:
-        from documents_prep import process_documents_with_chunking
-        log_message("Инициализация системы")
-        os.makedirs(download_dir, exist_ok=True)
-        from config import CHUNK_SIZE, CHUNK_OVERLAP
-        from llama_index.core.text_splitter import TokenTextSplitter
-
-        embed_model = get_embedding_model()
-        llm = get_llm_model(DEFAULT_MODEL)
-        reranker = get_reranker_model()
-
-        Settings.embed_model = embed_model
-        Settings.llm = llm
-        Settings.text_splitter = TokenTextSplitter(
-            chunk_size=CHUNK_SIZE,
-            chunk_overlap=CHUNK_OVERLAP,
-            separator=" ",
-            backup_separators=["\n", ".", "!", "?"]
-        )
-
-        log_message(f"Configured chunk size: {CHUNK_SIZE} tokens")
-        log_message(f"Configured chunk overlap: {CHUNK_OVERLAP} tokens")
-
-        all_documents = []
-        chunks_df = None
-        chunk_info = []
-
-        if use_json_instead_csv and json_files_dir:
-            log_message("Используем JSON файлы вместо CSV")
-            json_documents, json_chunk_info = load_json_documents(repo_id, hf_token, json_files_dir, download_dir)
-            all_documents.extend(json_documents)
-            chunk_info.extend(json_chunk_info)
-        else:
-            if chunks_filename:
-                log_message("Загружаем данные из CSV")
-                csv_documents, chunks_df = load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir)
-                all_documents.extend(csv_documents)
-
-        if table_data_dir:
-            log_message("Добавляю табличные данные")
-            table_documents = load_table_data(repo_id, hf_token, table_data_dir)
-            log_message(f"Загружено {len(table_documents)} табличных документов")
-
-            # Process table documents through chunking
-            chunked_table_docs, table_chunk_info = process_documents_with_chunking(table_documents)
-            all_documents.extend(chunked_table_docs)
-            chunk_info.extend(table_chunk_info)
-
-        if image_data_dir:
-            log_message("Добавляю данные изображений")
-            image_documents = load_image_data(repo_id, hf_token, image_data_dir)
-            log_message(f"Загружено {len(image_documents)} документов изображений")
-
-            # Process image documents through chunking
-            chunked_image_docs, image_chunk_info = process_documents_with_chunking(image_documents)
-            all_documents.extend(chunked_image_docs)
-            chunk_info.extend(image_chunk_info)
-
-        return query_engine, chunks_df, reranker, vector_index, chunk_info
     except Exception as e:
-        return
-
-def switch_model(model_name, vector_index):
-    from llama_index.core import Settings
-    from index_retriever import create_query_engine
     try:
         new_llm = get_llm_model(model_name)
         Settings.llm = new_llm
-            log_message(f"Модель успешно переключена на: {model_name}")
-            return new_query_engine, f"✅ Модель переключена на: {model_name}"
-        else:
-            return None, "❌ Ошибка: система не инициализирована"
-
-    except Exception as e:
-        error_msg = f"Ошибка переключения модели: {str(e)}"
-        log_message(error_msg)
-        return None, f"❌ {error_msg}"
-
-def main_answer_question(question):
-    global query_engine, reranker, current_model, chunks_df
-    if not question.strip():
-        return ("<div style='color: black;'>Пожалуйста, введите вопрос</div>",
-                "<div style='color: black;'>Источники появятся после обработки запроса</div>",
-                "<div style='color: black;'>Чанки появятся после обработки запроса</div>")
-
-    try:
-        # Call the answer_question function which returns 3 values
-        answer_html, sources_html, chunks_html = answer_question(question, query_engine, reranker, current_model, chunks_df)
-        return answer_html, sources_html, chunks_html
     except Exception as e:
-        return (f"<div style='color: red;'>Ошибка: {str(e)}</div>",
-                "<div style='color: black;'>Источники недоступны из-за ошибки</div>",
-                "<div style='color: black;'>Чанки недоступны из-за ошибки</div>")
-
-def create_demo_interface(answer_question_func, switch_model_func, current_model, chunk_info=None):
-    with gr.Blocks(title="AIEXP - AI Expert для нормативной документации", theme=gr.themes.Soft()) as demo:
-        gr.Markdown("""
-        # AIEXP - Artificial Intelligence Expert
-
-        ## Инструмент для работы с нормативной документацией
-        """)
-
-        with gr.Tab("Поиск по нормативным документам"):
-            gr.Markdown("### Задайте вопрос по нормативной документации")
-
-            with gr.Row():
-                with gr.Column(scale=2):
-                    model_dropdown = gr.Dropdown(
-                        choices=list(AVAILABLE_MODELS.keys()),
-                        value=current_model,
-                        label="Выберите языковую модель",
-                        info="Выберите модель для генерации ответов"
-                    )
-                with gr.Column(scale=1):
-                    switch_btn = gr.Button("Переключить модель", variant="secondary")
-                    model_status = gr.Textbox(
-                        value=f"Текущая модель: {current_model}",
-                        label="Статус модели",
-                        interactive=False
-                    )
-
-            with gr.Row():
-                with gr.Column(scale=3):
-                    question_input = gr.Textbox(
-                        label="Ваш вопрос к базе знаний",
-                        placeholder="Введите вопрос по нормативным документам...",
-                        lines=3
-                    )
-                    ask_btn = gr.Button("Найти ответ", variant="primary", size="lg")
-
-            gr.Examples(
-                examples=[
-                    "О чем этот рисунок: ГОСТ Р 50.04.07-2022 Приложение Л. Л.1.5 Рисунок Л.2",
-                    "Л.9 Формула в ГОСТ Р 50.04.07 - 2022 что и о чем там?",
-                    "Какой стандарт устанавливает порядок признания протоколов испытаний продукции в области использования атомной энергии?",
-                    "Кто несет ответственность за организацию и проведение признания протоколов испытаний продукции?",
-                    "В каких случаях могут быть признаны протоколы испытаний, проведенные лабораториями?",
-                    "В какой таблице можно найти информацию о методы исследований при аттестационных испытаниях технологии термической обработки заготовок из легированных сталей? Какой документ и какой раздел?"
-                ],
-                inputs=question_input
-            )
-
-            with gr.Row():
-                with gr.Column(scale=2):
-                    answer_output = gr.HTML(
-                        label="",
-                        value=f"<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появится ответ на ваш вопрос...<br><small>Текущая модель: {current_model}</small></div>",
-                    )
-
-                with gr.Column(scale=1):
-                    sources_output = gr.HTML(
-                        label="",
-                        value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся релевантные чанки...</div>",
-                    )
-
-                with gr.Column(scale=1):
-                    chunks_output = gr.HTML(
-                        label="Релевантные чанки",
-                        value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся релевантные чанки...</div>",
-                    )
-
-            switch_btn.click(
-                fn=switch_model_func,
-                inputs=[model_dropdown],
-                outputs=[model_status]
-            )
-
-            ask_btn.click(
-                fn=answer_question_func,
-                inputs=[question_input],
-                outputs=[answer_output, sources_output, chunks_output]
-            )
-
-            question_input.submit(
-                fn=answer_question_func,
-                inputs=[question_input],
-                outputs=[answer_output, sources_output, chunks_output]
-            )
-        return demo
-
 query_engine = None
-chunks_df = None
-reranker = None
 vector_index = None
 current_model = DEFAULT_MODEL

 def main_answer_question(question):
-    global query_engine
-    answer_html, sources_html, chunks_html = answer_question(
-        question, query_engine, reranker, current_model, chunks_df
-    )
-    return answer_html, sources_html, chunks_html

 def main_switch_model(model_name):
-    global query_engine, vector_index, current_model
-    new_query_engine, status_message = switch_model(model_name, vector_index)
     if new_query_engine:
         query_engine = new_query_engine
         current_model = model_name
-    return

 def main():
-    global query_engine, chunks_df, reranker, vector_index, chunk_info
-
-    query_engine, chunks_df, reranker, vector_index, chunk_info = initialize_system(
-        repo_id=HF_REPO_ID,
-        hf_token=HF_TOKEN,
-        download_dir=DOWNLOAD_DIR,
-        json_files_dir=JSON_FILES_DIR,
-        table_data_dir=TABLE_DATA_DIR,
-        image_data_dir=IMAGE_DATA_DIR,
-        use_json_instead_csv=True,
-    )

     if query_engine:
-        demo = create_demo_interface(
-            answer_question_func=main_answer_question,
-            switch_model_func=main_switch_model,
-            current_model=current_model,
-            chunk_info=chunk_info
-        )
         demo.launch(
             server_name="0.0.0.0",
             server_port=7860,
-            share=True,
-            debug=False
         )
     else:
         sys.exit(1)

 if __name__ == "__main__":
```
Added (new version of app.py):

```diff
 import os
+import json
+import zipfile
+from typing import List, Dict, Any
+import pandas as pd
+from huggingface_hub import hf_hub_download, list_repo_files
+from llama_index.core import Document, VectorStoreIndex, KeywordTableIndex, Settings
+from llama_index.core.retrievers import VectorIndexRetriever, QueryFusionRetriever
+from llama_index.retrievers.bm25 import BM25Retriever
+from llama_index.core.query_engine import RetrieverQueryEngine
+from llama_index.core.response_synthesizers import get_response_synthesizer, ResponseMode
+from llama_index.core.text_splitter import SentenceSplitter
+from sentence_transformers import SentenceTransformer
+import gradio as gr
 import sys
+
+GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
+OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
+HF_REPO_ID = "MrSimple01/AIEXP_RAG_FILES"
+HF_TOKEN = os.getenv('HF_TOKEN')
+
+AVAILABLE_MODELS = {
+    "Gemini 2.5 Flash": {
+        "provider": "google",
+        "model_name": "gemini-2.5-flash",
+        "api_key": GOOGLE_API_KEY
+    },
+    "Gemini 2.5 Pro": {
+        "provider": "google",
+        "model_name": "gemini-2.5-pro",
+        "api_key": GOOGLE_API_KEY
+    },
+    "GPT-4o": {
+        "provider": "openai",
+        "model_name": "gpt-4o",
+        "api_key": OPENAI_API_KEY
+    },
+    "GPT-4o Mini": {
+        "provider": "openai",
+        "model_name": "gpt-4o-mini",
+        "api_key": OPENAI_API_KEY
+    },
+    "GPT-5": {
+        "provider": "openai",
+        "model_name": "gpt-5",
+        "api_key": OPENAI_API_KEY
+    }
+}
+
+DEFAULT_MODEL = "Gemini 2.5 Flash"
+DOWNLOAD_DIR = "rag_files"
+JSON_FILES_DIR = "JSON"
+TABLE_DATA_DIR = "Табличные данные_JSON"
+IMAGE_DATA_DIR = "Изображения"
+CHUNK_SIZE = 512
+CHUNK_OVERLAP = 50
+TABLE_MAX_ROWS_PER_CHUNK = 30
+
+os.makedirs(DOWNLOAD_DIR, exist_ok=True)
+
+def get_llm_model(model_name):
+    config = AVAILABLE_MODELS[model_name]
+    if config["provider"] == "google":
+        from llama_index.llms.gemini import Gemini
+        return Gemini(model=config["model_name"], api_key=config["api_key"])
+    else:
+        from llama_index.llms.openai import OpenAI
+        return OpenAI(model=config["model_name"], api_key=config["api_key"])
+
+def get_embedding_model():
+    from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+    return HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
+
+def list_zip_files_in_repo(repo_id: str) -> List[str]:
+    files = list_repo_files(repo_id, token=HF_TOKEN)
+    return [f for f in files if f.startswith(JSON_FILES_DIR) and f.endswith('.zip')]
+
+def download_file_from_hf(repo_id: str, path_in_repo: str, dest_dir: str) -> str:
+    local_path = hf_hub_download(repo_id=repo_id, filename=path_in_repo, repo_type="dataset", token=HF_TOKEN)
+    base = os.path.basename(local_path)
+    dst = os.path.join(dest_dir, base)
+    if local_path != dst:
+        try:
+            with open(local_path, 'rb') as r, open(dst, 'wb') as w:
+                w.write(r.read())
+        except Exception:
+            pass
+    return dst
+
+def read_jsons_from_zip(zip_path: str) -> List[Dict[str, Any]]:
+    docs = []
+    with zipfile.ZipFile(zip_path, 'r') as z:
+        for name in z.namelist():
+            if name.lower().endswith('.json'):
+                with z.open(name) as f:
+                    try:
+                        text = f.read().decode('utf-8')
+                        data = json.loads(text)
+                        docs.append(data)
+                    except Exception as e:
+                        print(f"Failed to load {name} in {zip_path}: {e}")
+    return docs
+
+def chunk_text_field(text: str, doc_meta: Dict[str, Any], splitter: SentenceSplitter) -> List[Document]:
+    nodes = splitter.split_text(text)
+    chunks = []
+    for i, node_text in enumerate(nodes):
+        md = dict(doc_meta)
+        md.update({
+            'chunk_id': f"{md.get('document_id','unknown')}_text_{i}",
+            'chunk_type': 'text'
+        })
+        chunks.append(Document(text=node_text, metadata=md))
+    return chunks
+
+def chunk_table(table: Dict[str, Any], table_meta: Dict[str, Any], max_rows: int = TABLE_MAX_ROWS_PER_CHUNK) -> List[Document]:
+    headers = table.get('headers') or []
+    rows = table.get('data') or []
+    if not rows:
+        text = table.get('table_description') or table.get('table_title') or ''
+        md = {**table_meta, 'chunk_type': 'table', 'chunk_id': f"{table_meta.get('document_id')}_table_single"}
+        return [Document(text=text, metadata=md)]
+
+    chunks = []
+    for i in range(0, len(rows), max_rows):
+        block = rows[i:i+max_rows]
+        lines = []
+        lines.append(f"Table {table_meta.get('table_number','?')} - {table_meta.get('table_title','')}")
+        lines.append(f"Headers: {headers}")
+        for r in block:
+            row_items = [f"{k}: {v}" for k, v in r.items()]
+            lines.append(" | ".join(row_items))
+        chunk_text = "\n".join(lines)
+        md = dict(table_meta)
+        md.update({'chunk_type': 'table', 'chunk_id': f"{table_meta.get('document_id')}_table_{i // max_rows}"})
+        chunks.append(Document(text=chunk_text, metadata=md))
+    return chunks
+
+def chunk_image(image_entry: Dict[str, Any], image_meta: Dict[str, Any]) -> Document:
+    txt = f"Image: {image_entry.get('Название изображения') or image_entry.get('title','')}. "
+    txt += f"Описание: {image_entry.get('Описание изображение') or image_entry.get('description','')}. "
+    txt += f"Файл: {image_entry.get('Файл изображения') or image_entry.get('file','')}."
+    md = dict(image_meta)
+    md.update({'chunk_type': 'image', 'chunk_id': f"{image_meta.get('document_id')}_image_{image_entry.get('№ Изображения','0')}"})
+    return Document(text=txt, metadata=md)
+
+def build_chunks_from_repo(repo_id: str) -> List[Document]:
+    zip_paths = list_zip_files_in_repo(repo_id)
+    print(f"Found {len(zip_paths)} zip files under {JSON_FILES_DIR} in repo {repo_id}")
+
+    splitter = SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
+    all_chunks = []
+
+    for remote_path in zip_paths:
+        print(f"Downloading {remote_path}...")
+        local_zip = download_file_from_hf(repo_id, remote_path, DOWNLOAD_DIR)
+        print(f"Parsing {local_zip}...")
+        json_docs = read_jsons_from_zip(local_zip)
+        for doc in json_docs:
+            doc_meta = doc.get('document_metadata', {})
+            doc_id = doc_meta.get('document_id') or doc_meta.get('document_name') or 'unknown_doc'
+            base_meta = {'document_id': doc_id, 'document_name': doc_meta.get('document_name','')}
+
+            for sec in doc.get('sections', []):
+                sec_meta = dict(base_meta)
+                sec_meta.update({'section_id': sec.get('section_id'), 'section_title': None})
+                text = sec.get('section_text') or sec.get('text') or ''
+                if text and text.strip():
+                    chunks = chunk_text_field(text, sec_meta, splitter)
+                    all_chunks.extend(chunks)
+
+            for sheet in doc.get('sheets', []) + doc.get('tables', []) if (doc.get('sheets') or doc.get('tables')) else []:
+                table_meta = dict(base_meta)
+                table_meta.update({
+                    'sheet_name': sheet.get('sheet_name') or sheet.get('table_title'),
+                    'section': sheet.get('section'),
+                    'table_number': sheet.get('table_number'),
+                    'table_title': sheet.get('table_title')
+                })
+                table_chunks = chunk_table(sheet, table_meta, max_rows=TABLE_MAX_ROWS_PER_CHUNK)
+                all_chunks.extend(table_chunks)
+
+            for img in doc.get('images', []) or doc.get('image_data', []) or doc.get('image_entries', []):
+                img_meta = dict(base_meta)
+                chunk = chunk_image(img, img_meta)
+                all_chunks.append(chunk)
+
+    print(f"Built total {len(all_chunks)} chunks")
+    return all_chunks
+
+def create_hybrid_index(documents):
+    print("Creating vector index...")
+    vector_index = VectorStoreIndex.from_documents(documents)

+    print("Creating keyword index...")
+    keyword_index = KeywordTableIndex.from_documents(documents)

+    return vector_index, keyword_index
+
+def create_fusion_retriever(vector_index, keyword_index, documents):
+    vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=5)
+
+    bm25_retriever = BM25Retriever.from_defaults(
+        docstore=vector_index.docstore,
+        similarity_top_k=5
+    )
+
+    fusion_retriever = QueryFusionRetriever(
+        [vector_retriever, bm25_retriever],
+        similarity_top_k=5,
+        num_queries=1,
+        mode="reciprocal_rerank",
+        use_async=False
+    )
+
+    return fusion_retriever
+
+def create_query_engine(vector_index, keyword_index, documents):
+    fusion_retriever = create_fusion_retriever(vector_index, keyword_index, documents)
+
+    response_synthesizer = get_response_synthesizer(
+        response_mode=ResponseMode.COMPACT,
+        use_async=False
+    )
+
+    query_engine = RetrieverQueryEngine(
+        retriever=fusion_retriever,
+        response_synthesizer=response_synthesizer
+    )

+    return query_engine

+def initialize_system():
+    print("Initializing system...")

+    embed_model = get_embedding_model()
+    llm = get_llm_model(DEFAULT_MODEL)

+    Settings.embed_model = embed_model
+    Settings.llm = llm
+    Settings.chunk_size = CHUNK_SIZE
+    Settings.chunk_overlap = CHUNK_OVERLAP

+    print("Loading documents...")
+    documents = build_chunks_from_repo(HF_REPO_ID)

+    print("Creating indices...")
+    vector_index, keyword_index = create_hybrid_index(documents)

+    print("Creating query engine...")
+    query_engine = create_query_engine(vector_index, keyword_index, documents)
+
+    print("System initialized successfully!")
+    return query_engine, vector_index, keyword_index, documents

+def answer_question(question, query_engine):
+    if not question.strip():
+        return "<div style='color: black;'>Please enter a question</div>"
+
+    try:
+        response = query_engine.query(question)

+        answer_html = f"""
+        <div style='background-color: #f8f9fa; padding: 20px; border-radius: 10px; color: black;'>
+            <h3 style='color: #007bff;'>Answer:</h3>
+            <p>{response.response}</p>
+        </div>
+        """

+        sources_html = "<div style='background-color: #e9ecef; padding: 15px; border-radius: 8px; color: black;'>"
+        sources_html += "<h4>Sources:</h4>"
+        for i, node in enumerate(response.source_nodes):
+            sources_html += f"""
+            <div style='margin: 10px 0; padding: 10px; background-color: white; border-left: 3px solid #007bff;'>
+                <strong>Document {i+1}:</strong> {node.metadata.get('document_id', 'unknown')}<br>
+                <strong>Score:</strong> {node.score:.3f}<br>
+                <strong>Text:</strong> {node.text[:200]}...
+            </div>
+            """
+        sources_html += "</div>"

+        return answer_html, sources_html

     except Exception as e:
+        error_html = f"<div style='color: red;'>Error: {str(e)}</div>"
+        return error_html, error_html

+def switch_model(model_name, vector_index, keyword_index, documents):
     try:
+        print(f"Switching to model: {model_name}")
         new_llm = get_llm_model(model_name)
         Settings.llm = new_llm

+        new_query_engine = create_query_engine(vector_index, keyword_index, documents)
+        return new_query_engine, f"✅ Model switched to: {model_name}"
     except Exception as e:
+        return None, f"❌ Error: {str(e)}"

 query_engine = None
 vector_index = None
+keyword_index = None
+documents = None
 current_model = DEFAULT_MODEL

 def main_answer_question(question):
+    global query_engine
+    return answer_question(question, query_engine)

 def main_switch_model(model_name):
+    global query_engine, vector_index, keyword_index, documents, current_model
+    new_query_engine, status = switch_model(model_name, vector_index, keyword_index, documents)
     if new_query_engine:
         query_engine = new_query_engine
         current_model = model_name
+    return status
+
+def create_interface():
+    with gr.Blocks(title="AIEXP - RAG System", theme=gr.themes.Soft()) as demo:
+        gr.Markdown("# AIEXP - AI Expert for Regulatory Documentation")
+
+        with gr.Row():
+            model_dropdown = gr.Dropdown(
+                choices=list(AVAILABLE_MODELS.keys()),
+                value=current_model,
+                label="Select Language Model"
+            )
+            switch_btn = gr.Button("Switch Model")
+            model_status = gr.Textbox(
+                value=f"Current model: {current_model}",
+                label="Model Status",
+                interactive=False
+            )
+
+        with gr.Row():
+            question_input = gr.Textbox(
+                label="Your Question",
+                placeholder="Ask a question about the documents...",
+                lines=3
+            )
+
+        ask_btn = gr.Button("Get Answer", variant="primary")
+
+        with gr.Row():
+            answer_output = gr.HTML(
+                label="Answer",
+                value="<div style='padding: 20px; text-align: center;'>Answer will appear here...</div>"
+            )
+            sources_output = gr.HTML(
+                label="Sources",
+                value="<div style='padding: 20px; text-align: center;'>Sources will appear here...</div>"
+            )
+
+        switch_btn.click(
+            fn=main_switch_model,
+            inputs=[model_dropdown],
+            outputs=[model_status]
+        )
+
+        ask_btn.click(
+            fn=main_answer_question,
+            inputs=[question_input],
+            outputs=[answer_output, sources_output]
+        )
+
+        question_input.submit(
+            fn=main_answer_question,
+            inputs=[question_input],
+            outputs=[answer_output, sources_output]
+        )

+    return demo

 def main():
+    global query_engine, vector_index, keyword_index, documents

+    print("Starting AIEXP - AI Expert for Regulatory Documentation")

+    query_engine, vector_index, keyword_index, documents = initialize_system()

     if query_engine:
+        print("Launching web interface...")
+        demo = create_interface()
         demo.launch(
             server_name="0.0.0.0",
             server_port=7860,
+            share=True
         )
     else:
+        print("Failed to initialize system")
         sys.exit(1)

 if __name__ == "__main__":
```
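Stripped of the app plumbing, the retrieval change in this commit is the fusion of a dense retriever with BM25. Below is a minimal sketch of that pattern using only APIs the new app.py already imports; the two sample documents are illustrative placeholders, and it assumes an embedding model has already been set on `Settings` (initialize_system sets a HuggingFace MiniLM model; otherwise llama-index falls back to OpenAI embeddings).

```python
# Minimal sketch of the hybrid retrieval introduced in this commit.
# Placeholder documents; configure Settings.embed_model first, as
# initialize_system() does.
from llama_index.core import Document, VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever, QueryFusionRetriever
from llama_index.retrievers.bm25 import BM25Retriever

docs = [
    Document(text="ГОСТ Р 50.04.07-2022 устанавливает порядок признания протоколов испытаний."),
    Document(text="Ответственность за организацию испытаний несет лаборатория."),
]

index = VectorStoreIndex.from_documents(docs)
vector_retriever = VectorIndexRetriever(index=index, similarity_top_k=5)
# BM25 reuses the docstore built for the vector index, as create_fusion_retriever does
bm25_retriever = BM25Retriever.from_defaults(docstore=index.docstore, similarity_top_k=5)

retriever = QueryFusionRetriever(
    [vector_retriever, bm25_retriever],
    similarity_top_k=5,
    num_queries=1,             # no LLM query rewriting, matching the commit
    mode="reciprocal_rerank",  # fuse the two ranked lists by reciprocal rank
    use_async=False,
)
for node in retriever.retrieve("Кто несет ответственность за испытания?"):
    print(f"{node.score:.3f}  {node.text[:60]}")
```

With `num_queries=1` the fusion retriever skips LLM-based query generation entirely, so only the embedding model is exercised at retrieval time; reciprocal-rank fusion then merges the dense and lexical result lists without needing comparable score scales.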
app_1.py CHANGED
@@ -1,501 +1,213 @@
Removed (old version of app_1.py; lines the diff viewer left blank or cut off are omitted, and later hunk headers are kept in place):

```diff
 import gradio as gr
-from huggingface_hub import hf_hub_download
-import faiss
-import pandas as pd
 import os
-from llama_index.core.retrievers import VectorIndexRetriever
-from llama_index.core.response_synthesizers import get_response_synthesizer, ResponseMode
-from llama_index.core.prompts import PromptTemplate
-import time
 import sys
-
-GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
-
-CUSTOM_PROMPT_NEW = """
-Вы являетесь высокоспециализированным Ассистентом для анализа документов (AIEXP). Ваша цель - предоставлять точные, корректные и контекстно релевантные ответы на основе анализа нормативной документации (НД). Все ваши ответы должны основываться исключительно на предоставленном контексте без использования внешних знаний или предположений.
-
-КРИТИЧЕСКИ ВАЖНО: ВСЕ ОТВЕТЫ ДОЛЖНЫ БЫТЬ ТОЛЬКО НА РУССКОМ ЯЗЫКЕ! НИКОГДА НЕ ОТВЕЧАЙТЕ НА АНГЛИЙСКОМ!
-
-История чата:
-{chat_history}
-
-ИНСТРУКЦИИ ПО ОБРАБОТКЕ КОНТЕКСТА:
-
-1. АНАЛИЗ ТАБЛИЧНЫХ ДАННЫХ:
-   - Если в контексте есть информация начинающаяся с "Таблица", внимательно изучите её содержимое
-   - Извлекайте данные из строк с заголовками и данными таблицы
-   - Указывайте номер и название таблицы при ответе
-   - Структурируйте ответ на основе табличных данных
-
-2. ОПРЕДЕЛЕНИЕ ТИПА ЗАДАЧИ:
-   Проанализируйте запрос пользователя и определите тип задачи:
-
-   1. КРАТКОЕ САММАРИ (ключевые слова: "кратко", "суммировать", "резюме", "основные моменты", "в двух словах"):
-      - Предоставьте структурированное резюме запрашиваемого раздела/пункта
-      - Выделите ключевые требования, процедуры или положения
-      - Используйте нумерованный список для лучшей читаемости
-      - Сохраняйте терминологию НД
-
-   2. ПОИСК ДОКУМЕНТА И ПУНКТА (ключевые слова: "найти", "где", "какой документ", "в каком разделе", "ссылка"):
-      - Укажите конкретный документ и его структурное расположение
-      - Предоставьте точные номера разделов/подразделов/пунктов
-      - Процитируйте релевантные фрагменты
-      - Если найдено несколько документов, перечислите все с указанием специфики каждого
-
-   3. ПРОВЕРКА КОРРЕКТНОСТИ (ключевые слова: "правильно ли", "соответствует ли", "проверить", "корректно", "нарушение"):
-      - Сопоставьте предоставленную информацию с требованиями НД
-      - Четко укажите: "СООТВЕТСТВУЕТ" или "НЕ СООТВЕТСТВУЕТ"
-      - Перечислите конкретные требования НД
-      - Укажите выявленные расхождения или подтвердите соответствие
-      - Процитируйте релевантные пункты НД
-
-   4. ПЛАН ДЕЙСТВИЙ (ключевые слова: "план", "алгоритм", "последовательность", "как действовать", "пошагово"):
-      - Создайте пронумерованный пошаговый план
-      - Каждый шаг должен содержать ссылку на соответствующий пункт НД
-      - Укажите необходимые документы или формы
-      - Добавьте временные рамки, если они указаны в НД
-      - Выделите критические требования или ограничения
-
-   5. УТОЧНЯЮЩИЕ ВОПРОСЫ (ключевые слова: "что это значит", "что означает", "объясните", "расскажите подробнее"):
-      - Используйте историю чата для понимания контекста
-      - Если вопрос относится к предыдущему обсуждению, опирайтесь на него
-      - Предоставьте подробное объяснение на основе НД
-      - Если контекст неясен, попросите уточнения
-
-ПРАВИЛА ФОРМИРОВАНИЯ ОТВЕТОВ:
-
-1. ОБЯЗАТЕЛЬНОЕ УКАЗАНИЕ ИСТОЧНИКОВ:
-   - Для каждого ответа указывайте: "Согласно [Название документа], раздел [X], пункт [X.X]: [Ваш ответ]"
-   - В конце ответа добавляйте: "Подробнее об этом можно узнать в документе [Название документа], раздел [X]."
-   - При отсутствии точного раздела: "Согласно документу [Название]: [Ваш ответ]"
-
-2. СТРОГОЕ СЛЕДОВАНИЕ КОНТЕКСТУ:
-   - Если информация не найдена: "Информация по вашему запросу не была найдена в нормативной документации."
-   - НЕ используйте английский язык ни при каких обстоятельствах
-   - Используйте историю чата для понимания контекста вопросов
-
-3. ИСПОЛЬЗОВАНИЕ ТЕРМИНОЛОГИИ НД:
-   - Применяйте официальную терминологию из документов
-   - Сохраняйте оригинальные формулировки ключевых требований
-   - При необходимости разъясняйте специальные термины на основе НД
-
-4. СТРУКТУРИРОВАНИЕ ОТВЕТОВ:
-   - Основной ответ на русском языке
-   - Указание источника
-   - Дополнительная информация о документе
-
-Контекст: {context_str}
-
-Вопрос: {query_str}
-
-Ответ (ТОЛЬКО НА РУССКОМ ЯЗЫКЕ):
-"""
-
-query_engine = None
-chunks_df = None
-chat_history = []
-
-def log_message(message):
-    print(message, flush=True)
-    sys.stdout.flush()
-
-def table_to_document(table_json):
-    document_id = table_json.get("document_id") or table_json.get("document", "unknown")
-
-    metadata = {
-        "document_id": document_id,
-        "section": table_json.get("section", ""),
-        "table_number": table_json.get("table_number", ""),
-        "table_title": table_json.get("table_title", ""),
-    }
-            row_str = " | ".join([f"{k}: {v}" for k,v in row.items()])
-            table_text += f"Строка {i+1}: {row_str}\n"
-            local_dir=download_dir,
-            repo_type="dataset",
-            token=HF_TOKEN
-        )
-        with open(local_path, 'r', encoding='utf-8') as f:
-            table_data = json.load(f)
-        log_message(f"📋 Структура JSON: {list(table_data.keys()) if isinstance(table_data, dict) else 'Список'}")
-        if isinstance(table_data, dict):
-            if 'sheets' in table_data:
-                for sheet in table_data['sheets']:
-                    doc = table_to_document(sheet)
-                    table_documents.append(doc)
-                    log_message(f"✅ Создан документ из таблицы: {sheet.get('table_number', 'unknown')}")
-            else:
-                doc = table_to_document(table_data)
-                table_documents.append(doc)
-                log_message(f"✅ Создан документ из JSON объекта")
-        elif isinstance(table_data, list):
-            for table_json in table_data:
-                doc = table_to_document(table_json)
-                table_documents.append(doc)
-                log_message(f"✅ Создан документ из элемента списка")
-    except Exception as e:
-        log_message(f"❌ Ошибка обработки файла {file_path}: {str(e)}")
-        continue
-    log_message(f"✅ Создано {len(table_documents)} документов из таблиц")
-    return table_documents
-    except Exception as e:
-        log_message(f"❌ Ошибка загрузки табличных данных: {str(e)}")
-        return []
-
-def improve_query_with_history(question, chat_history):
     try:
-        log_message(f"✨ Улучшенный запрос: {improved_query}")
     except Exception as e:
-        return
-
-def format_chat_history():
-    history_text = ""
-    for i, (user_msg, bot_msg) in enumerate(chat_history[-5:], 1):
-        history_text += f"Сообщение {i}:\nПользователь: {user_msg}\nАссистент: {bot_msg}\n\n"
-    return history_text
-
-def answer_question(question, history):
-    global query_engine, chunks_df, chat_history
-    if query_engine is None:
-        return history + [["", "❌ Система не инициализирована"]], ""
     try:
-        log_message(f"🔍 Получен вопрос: {question}")
-        log_message(f"📜 История чата: {len(chat_history)} сообщений")
-        # Improve the query using the chat history
-        improved_question = improve_query_with_history(question, chat_history)
-        log_message(f"🎯 Обработка улучшенного запроса: {improved_question}")
-        # Format the chat history for the prompt
-        chat_history_text = format_chat_history()
-        log_message(f"📝 Сформированная история для промпта: {len(chat_history_text)} символов")
-        log_message("🔎 Поиск релевантных чанков...")
-        retrieved_nodes = query_engine.retriever.retrieve(improved_question)
-        log_message(f"📊 Найдено {len(retrieved_nodes)} релевантных чанков")
-        # Log the retrieved chunks
-        for i, node in enumerate(retrieved_nodes[:3]):
-            log_message(f"📄 Чанк {i+1}: {node.text[:100]}...")
-            log_message(f"🏷️ Метаданные: {node.metadata}")
-        response = query_engine.query(query_with_context)
-        end_time = time.time()
-        processing_time = end_time - start_time
-        bot_response = response.response
-        log_message(f"✅ Получен ответ: {bot_response[:100]}...")
-        # Check whether the answer came back in English
-        if any(english_word in bot_response.lower() for english_word in ['i am sorry', 'i cannot', 'the query', 'this request']):
-            log_message("⚠️ Обнаружен ответ на английском языке, форсируем русский ответ")
-            # Force a Russian answer
-            russian_prompt = f"""
-            ВАЖНО: Отвечай ТОЛЬКО на русском языке!
-
-            Вопрос: {question}
-            История: {chat_history_text}
-            Контекст: {retrieved_nodes[0].text if retrieved_nodes else 'Нет контекста'}
-
-            Если информации недостаточно для ответа, скажи: "Недостаточно информации для ответа на ваш вопрос в предоставленной документации."
-
-            Ответ на русском языке:
-            """
-            from llama_index.llms.google_genai import GoogleGenAI
-            llm = GoogleGenAI(model="gemini-2.0-flash", api_key=GOOGLE_API_KEY)
-            bot_response = llm.complete(russian_prompt).text.strip()
-            log_message(f"🔄 Исправленный ответ на русском: {bot_response[:100]}...")
-        # Update the chat history
-        chat_history.append((question, bot_response))
-        if len(chat_history) > 10:
-            chat_history = chat_history[-10:]
-        log_message(f"💾 История чата обновлена. Всего сообщений: {len(chat_history)}")
-        sources_html = generate_sources_html(retrieved_nodes)
-        response_with_time = f"{bot_response}\n\n⏱️ Время обработки: {processing_time:.2f} сек"
-        history.append([question, response_with_time])
-        return history, sources_html
     except Exception as e:
-        return history, ""
-
-def initialize_models():
-    global query_engine, chunks_df
     try:
-        log_message("📥 Загрузка основных файлов...")
-        faiss_index_path = hf_hub_download(
-            repo_id=REPO_ID,
-            filename=faiss_index_filename,
-            local_dir=download_dir,
-            repo_type="dataset",
-            token=HF_TOKEN
-        )
-        chunks_csv_path = hf_hub_download(
-            repo_id=REPO_ID,
-            filename=chunks_filename,
-            local_dir=download_dir,
-            repo_type="dataset",
-            token=HF_TOKEN
-        )
-        log_message("📚 Загрузка индекса и данных...")
-        index_faiss = faiss.read_index(faiss_index_path)
-        chunks_df = pd.read_csv(chunks_csv_path)
-        log_message(f"📄 Загружено {len(chunks_df)} основных чанков")
-        log_message(f"📋 Колонки в chunks_df: {list(chunks_df.columns)}")
-        table_documents = download_table_data()
-        log_message("🤖 Настройка моделей...")
-        embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
-        llm = GoogleGenAI(model="gemini-2.0-flash", api_key=GOOGLE_API_KEY)
-        Settings.embed_model = embed_model
-        Settings.llm = llm
-        text_column = None
-        for col in chunks_df.columns:
-            if 'text' in col.lower() or 'content' in col.lower() or 'chunk' in col.lower():
-                text_column = col
-                break
-        if text_column is None:
-            text_column = chunks_df.columns[0]
-        log_message(f"📝 Используется колонка для текста: {text_column}")
-        documents = []
-        for i, (_, row) in enumerate(chunks_df.iterrows()):
-            doc = Document(
-                text=str(row[text_column]),
-                metadata={
-                    "chunk_id": row.get('chunk_id', i),
-                    "document_id": row.get('document_id', 'unknown')
-                }
-            )
-            documents.append(doc)
-        documents.extend(table_documents)
-        log_message(f"📋 Всего создано {len(documents)} документов ({len(chunks_df)} чанков + {len(table_documents)} таблиц)")
-        log_message("🔍 Построение векторного индекса...")
-        vector_index = VectorStoreIndex.from_documents(documents)
-        retriever = VectorIndexRetriever(
-            index=vector_index,
-            similarity_top_k=20,
-            similarity_cutoff=0.7
-        )
-        custom_prompt_template = PromptTemplate(CUSTOM_PROMPT_NEW)
-        response_synthesizer = get_response_synthesizer(
-            response_mode=ResponseMode.TREE_SUMMARIZE,
-            text_qa_template=custom_prompt_template
-        )
-        query_engine = RetrieverQueryEngine(
-            retriever=retriever,
-            response_synthesizer=response_synthesizer
-        )
-        log_message("✅ Система успешно инициализирована!")
-        return True
-    except Exception as e:
-        log_message(f"❌ Ошибка инициализации: {str(e)}")
-        return False
-
-        for node in nodes:
-            metadata = node.metadata if hasattr(node, 'metadata') else {}
-            doc_id = metadata.get('document_id', 'unknown')
-            if doc_id not in unique_docs:
-                unique_docs[doc_id] = []
-            unique_docs[doc_id].append(node)
-        for doc_id, doc_nodes in unique_docs.items():
-            if doc_id == 'unknown' or doc_id == 'Раздел документа':
-                continue
-            file_link = None
-            if chunks_df is not None and 'file_link' in chunks_df.columns:
-                doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
-                if not doc_rows.empty:
-                    file_link = doc_rows.iloc[0]['file_link']
-            html += f"<div style='margin-bottom: 15px; padding: 15px; border: 1px solid #4a5568; border-radius: 8px; background-color: #1a202c;'>"
-            html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
-            if file_link:
-                html += f"<a href='{file_link}' target='_blank' style='color: #68d391; text-decoration: none; font-size: 14px; display: inline-block; margin-bottom: 10px;'>🔗 Ссылка на документ</a><br>"
-            table_nodes = [node for node in doc_nodes if 'table_number' in node.metadata]
-            if table_nodes:
-                for node in table_nodes[:3]:
-                    metadata = node.metadata
-                    table_num = metadata.get('table_number', '')
-                    table_title = metadata.get('table_title', 'Без названия')
-                    if table_num and table_title != 'Без названия':
-                        html += f"<p style='font-size: 12px; color: #a0aec0; margin: 5px 0;'>📊 {table_num}: {table_title}</p>"
-            html += "</div>"
-        html += "</div>"
-        return html
-
-def clear_chat():
-    global chat_history
-    chat_history = []
-    log_message("🗑️ История чата очищена")
-    return [], ""
-
-def handle_submit(message, history):
-    if not message.strip():
-        return history, ""
-    updated_history, sources = answer_question(message, history)
-    return updated_history, sources
-
-def create_demo_interface():
 with gr.Blocks(title="AIEXP - AI Expert для нормативной документации", theme=gr.themes.Soft()) as demo:
 gr.Markdown("""
@@ -504,65 +216,131 @@ def create_demo_interface():
 ## Инструмент для работы с нормативной документацией
 """)
-        with gr.Tab("
 gr.Markdown("### Задайте вопрос по нормативной документации")
 with gr.Row():
 with gr.Column(scale=2):
 )
 gr.Examples(
 examples=[
 "Какой стандарт устанавливает порядок признания протоколов испытаний продукции в области использования атомной энергии?",
 "Кто несет ответственность за организацию и проведение признания протоколов испытаний продукции?",
-"В каких случаях могут быть признаны протоколы испытаний, проведенные
 ],
 )
 with gr.Column(scale=1):
 sources_output = gr.HTML(
-                        value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся
 )
-            lambda: "", None, msg
 )
 )
 return demo
@@ -570,5 +348,8 @@ if __name__ == "__main__":
 debug=False
 )
 else:
-    sys.exit(1)
```
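One detail of the deleted answer_question worth keeping in mind is its language guard: when the model's reply looks English, it re-asks with an explicit Russian-only prompt. Here is a condensed sketch of that fallback, with the completion call passed in as a parameter since the concrete client the old code used (GoogleGenAI with gemini-2.0-flash) is removed by this commit; the function and parameter names are illustrative, not part of the codebase.

```python
# Condensed sketch of the language-enforcement fallback the old app_1.py used.
# `complete` stands in for any LLM completion call returning a string
# (the old code called GoogleGenAI(model="gemini-2.0-flash", ...).complete).
ENGLISH_MARKERS = ['i am sorry', 'i cannot', 'the query', 'this request']

def force_russian(answer: str, question: str, context: str, complete) -> str:
    """Re-ask with an explicit Russian-only prompt when the reply looks English."""
    if not any(marker in answer.lower() for marker in ENGLISH_MARKERS):
        return answer  # already Russian (or at least not flagged), keep it
    retry_prompt = (
        "ВАЖНО: Отвечай ТОЛЬКО на русском языке!\n"
        f"Вопрос: {question}\n"
        f"Контекст: {context}\n"
        "Ответ на русском языке:"
    )
    return complete(retry_prompt).strip()
```

The marker list is a crude heuristic (it only catches stock English refusal phrasing), which is presumably why the new version drops it in favor of a simpler pipeline.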

        return f"В разделе {current_section} в документе {document_id}, пункт {section_title}: {clean_text}"
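
# Illustrative output only (hypothetical chunk values, not taken from the real data):
#   get_formatted_content({'document_id': 'ГОСТ Р 50.04.07-2022', 'level': 'subsection',
#                          'parent_section': '5', 'parent_title': 'Общие положения',
#                          'section_path': '5.1', 'chunk_text': 'Текст пункта...'})
# returns
#   "В разделе 5 (Общие положения) в документе ГОСТ Р 50.04.07-2022, пункт 5.1: Текст пункта..."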

def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
                      json_files_dir=None, table_data_dir=None, image_data_dir=None,
                      use_json_instead_csv=False):
    try:
        from documents_prep import process_documents_with_chunking
        log_message("Инициализация системы")
        os.makedirs(download_dir, exist_ok=True)
        from config import CHUNK_SIZE, CHUNK_OVERLAP
        from llama_index.core.text_splitter import TokenTextSplitter

        embed_model = get_embedding_model()
        llm = get_llm_model(DEFAULT_MODEL)
        reranker = get_reranker_model()

        Settings.embed_model = embed_model
        Settings.llm = llm
        Settings.text_splitter = TokenTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
            separator=" ",
            backup_separators=["\n", ".", "!", "?"]
        )

        log_message(f"Configured chunk size: {CHUNK_SIZE} tokens")
        log_message(f"Configured chunk overlap: {CHUNK_OVERLAP} tokens")

        all_documents = []
        chunks_df = None
        chunk_info = []

        if use_json_instead_csv and json_files_dir:
            log_message("Используем JSON файлы вместо CSV")
            json_documents, json_chunk_info = load_json_documents(repo_id, hf_token, json_files_dir, download_dir)
            all_documents.extend(json_documents)
            chunk_info.extend(json_chunk_info)
        else:
            if chunks_filename:
                log_message("Загружаем данные из CSV")
                csv_documents, chunks_df = load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir)
                all_documents.extend(csv_documents)

        if table_data_dir:
            log_message("Добавляю табличные данные")
            table_documents = load_table_data(repo_id, hf_token, table_data_dir)
            log_message(f"Загружено {len(table_documents)} табличных документов")

            # Process table documents through chunking
            chunked_table_docs, table_chunk_info = process_documents_with_chunking(table_documents)
            all_documents.extend(chunked_table_docs)
            chunk_info.extend(table_chunk_info)

        if image_data_dir:
            log_message("Добавляю данные изображений")
            image_documents = load_image_data(repo_id, hf_token, image_data_dir)
            log_message(f"Загружено {len(image_documents)} документов изображений")

            # Process image documents through chunking
            chunked_image_docs, image_chunk_info = process_documents_with_chunking(image_documents)
            all_documents.extend(chunked_image_docs)
            chunk_info.extend(image_chunk_info)

        log_message(f"Всего документов после всей обработки: {len(all_documents)}")

        vector_index = create_vector_index(all_documents)
        query_engine = create_query_engine(vector_index)

        log_message("Система успешно инициализирована")
        return query_engine, chunks_df, reranker, vector_index, chunk_info

    except Exception as e:
        log_message(f"Ошибка инициализации: {str(e)}")
        return None, None, None, None, []
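
# A minimal standalone sketch (hypothetical argument values; the real ones come from config.py):
#
#   engine, df, reranker, index, info = initialize_system(
#       repo_id="org/normative-docs",          # hypothetical HF dataset repo
#       hf_token=os.environ.get("HF_TOKEN"),
#       download_dir="./downloads",
#       json_files_dir="json_files",
#       table_data_dir="table_data",
#       image_data_dir="image_data",
#       use_json_instead_csv=True,
#   )
#
# On failure every element of the returned tuple is None (and chunk_info is []).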

def switch_model(model_name, vector_index):
    from llama_index.core import Settings
    from index_retriever import create_query_engine

    try:
        log_message(f"Переключение на модель: {model_name}")

        new_llm = get_llm_model(model_name)
        Settings.llm = new_llm

        if vector_index is not None:
            new_query_engine = create_query_engine(vector_index)
            log_message(f"Модель успешно переключена на: {model_name}")
            return new_query_engine, f"✅ Модель переключена на: {model_name}"
        else:
            return None, "❌ Ошибка: система не инициализирована"

    except Exception as e:
        error_msg = f"Ошибка переключения модели: {str(e)}"
        log_message(error_msg)
        return None, f"❌ {error_msg}"
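
# Usage sketch (the model key must be one of AVAILABLE_MODELS from config.py):
#
#   engine, status = switch_model(DEFAULT_MODEL, vector_index)
#
# On success `engine` is a fresh query engine over the same index; on any error
# `engine` is None and `status` carries the error text.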

def main_answer_question(question):
    global query_engine, reranker, current_model, chunks_df
    if not question.strip():
        return ("<div style='color: black;'>Пожалуйста, введите вопрос</div>",
                "<div style='color: black;'>Источники появятся после обработки запроса</div>",
                "<div style='color: black;'>Чанки появятся после обработки запроса</div>")

    try:
        # Call the answer_question function which returns 3 values
        answer_html, sources_html, chunks_html = answer_question(question, query_engine, reranker, current_model, chunks_df)
        return answer_html, sources_html, chunks_html

    except Exception as e:
        log_message(f"Ошибка при ответе на вопрос: {str(e)}")
        return (f"<div style='color: red;'>Ошибка: {str(e)}</div>",
                "<div style='color: black;'>Источники недоступны из-за ошибки</div>",
                "<div style='color: black;'>Чанки недоступны из-за ошибки</div>")
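
# Note: the three HTML strings returned above map 1:1 onto the Gradio outputs
# [answer_output, sources_output, chunks_output] wired up in create_demo_interface() below.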

def create_demo_interface(answer_question_func, switch_model_func, current_model, chunk_info=None):
    with gr.Blocks(title="AIEXP - AI Expert для нормативной документации", theme=gr.themes.Soft()) as demo:

        gr.Markdown("""
        # AIEXP - AI Expert для нормативной документации
        ## Инструмент для работы с нормативной документацией
        """)

        with gr.Tab("Поиск по нормативным документам"):
            gr.Markdown("### Задайте вопрос по нормативной документации")

            with gr.Row():
                with gr.Column(scale=2):
                    model_dropdown = gr.Dropdown(
                        choices=list(AVAILABLE_MODELS.keys()),
                        value=current_model,
                        label="Выберите языковую модель",
                        info="Выберите модель для генерации ответов"
                    )
                with gr.Column(scale=1):
                    switch_btn = gr.Button("Переключить модель", variant="secondary")
                    model_status = gr.Textbox(
                        value=f"Текущая модель: {current_model}",
                        label="Статус модели",
                        interactive=False
                    )

            with gr.Row():
                with gr.Column(scale=3):
                    question_input = gr.Textbox(
                        label="Ваш вопрос к базе знаний",
                        placeholder="Введите вопрос по нормативным документам...",
                        lines=3
                    )
                    ask_btn = gr.Button("Найти ответ", variant="primary", size="lg")

            gr.Examples(
                examples=[
                    "О чем этот рисунок: ГОСТ Р 50.04.07-2022 Приложение Л. Л.1.5 Рисунок Л.2",
                    "Л.9 Формула в ГОСТ Р 50.04.07-2022: что и о чем там?",
                    "Какой стандарт устанавливает порядок признания протоколов испытаний продукции в области использования атомной энергии?",
                    "Кто несет ответственность за организацию и проведение признания протоколов испытаний продукции?",
                    "В каких случаях могут быть признаны протоколы испытаний, проведенные лабораториями?",
                    "В какой таблице можно найти информацию о методах исследований при аттестационных испытаниях технологии термической обработки заготовок из легированных сталей? Какой документ и какой раздел?"
                ],
                inputs=question_input
            )

            with gr.Row():
                with gr.Column(scale=2):
                    answer_output = gr.HTML(
                        label="",
                        value=f"<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появится ответ на ваш вопрос...<br><small>Текущая модель: {current_model}</small></div>",
                    )

                with gr.Column(scale=1):
                    sources_output = gr.HTML(
                        label="",
                        value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся источники...</div>",
                    )

                with gr.Column(scale=1):
                    chunks_output = gr.HTML(
                        label="Релевантные чанки",
                        value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся релевантные чанки...</div>",
                    )

            switch_btn.click(
                fn=switch_model_func,
                inputs=[model_dropdown],
                outputs=[model_status]
            )

            ask_btn.click(
                fn=answer_question_func,
                inputs=[question_input],
                outputs=[answer_output, sources_output, chunks_output]
            )

            question_input.submit(
                fn=answer_question_func,
                inputs=[question_input],
                outputs=[answer_output, sources_output, chunks_output]
            )

    return demo
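
# Callback contracts assumed by the interface above: answer_question_func takes one
# question string and returns three HTML blocks; switch_model_func takes the dropdown
# value and returns a single status string.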

# Module-level state shared by the Gradio callbacks (main_answer_question above
# reads these at call time, so defining them after it is safe).
query_engine = None
chunks_df = None
reranker = None
vector_index = None
current_model = DEFAULT_MODEL

def main_switch_model(model_name):
    global query_engine, vector_index, current_model

    new_query_engine, status_message = switch_model(model_name, vector_index)
    if new_query_engine:
        query_engine = new_query_engine
        current_model = model_name

    return status_message

def main():
    global query_engine, chunks_df, reranker, vector_index, current_model

    log_message("Запуск AIEXP - AI Expert для нормативной документации")

    query_engine, chunks_df, reranker, vector_index, chunk_info = initialize_system(
        repo_id=HF_REPO_ID,
        hf_token=HF_TOKEN,
        download_dir=DOWNLOAD_DIR,
        json_files_dir=JSON_FILES_DIR,
        table_data_dir=TABLE_DATA_DIR,
        image_data_dir=IMAGE_DATA_DIR,
        use_json_instead_csv=True,
    )

    if query_engine:
        log_message("Запуск веб-интерфейса")
        demo = create_demo_interface(
            answer_question_func=main_answer_question,
            switch_model_func=main_switch_model,
            current_model=current_model,
            chunk_info=chunk_info
        )
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            debug=False
        )
    else:
        log_message("Невозможно запустить приложение из-за ошибки инициализации")
        sys.exit(1)

if __name__ == "__main__":
    main()
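
# Local run sketch (assumes HF_TOKEN is set in the environment and the dataset
# repo configured in config.py is reachable):
#   python app.py
# The Gradio UI then listens on http://0.0.0.0:7860.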