Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 19, 2025

Commit

ab99142

1 Parent(s): 5099a0a

Add a restart button and detailed logging; rename utils.py to main_utils.py

Browse files

Files changed (5) hide show

app.py +38 -116
app_1.py +1 -1
converters/converter.py +63 -3
index_retriever.py +2 -2
utils.py → main_utils.py +112 -0

app.py CHANGED Viewed

@@ -2,7 +2,7 @@ import gradio as gr
 import os
 from llama_index.core import Settings
 from documents_prep import load_json_documents, load_table_documents, load_image_documents
-from utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
 from my_logging import log_message
 from index_retriever import create_vector_index, create_query_engine
 import sys
@@ -11,115 +11,37 @@ from config import (
     JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
 )
 from converters.converter import process_uploaded_file, convert_single_excel_to_json, convert_single_excel_to_csv
-def merge_table_chunks(chunk_info):
-    merged = {}
-    for chunk in chunk_info:
-        doc_type = chunk.get('type', 'text')
-        doc_id = chunk.get('document_id', 'unknown')
-        if doc_type == 'table' or doc_type == 'table_row':
-            table_num = chunk.get('table_number', '')
-            key = f"{doc_id}_{table_num}"
-            if key not in merged:
-                merged[key] = {
-                    'document_id': doc_id,
-                    'type': 'table',
-                    'table_number': table_num,
-                    'section_id': chunk.get('section_id', 'unknown'),
-                    'chunk_text': chunk.get('chunk_text', '')
-                }
-            else:
-                merged[key]['chunk_text'] += '\n' + chunk.get('chunk_text', '')
-        else:
-            unique_key = f"{doc_id}_{chunk.get('section_id', '')}_{chunk.get('chunk_id', 0)}"
-            merged[unique_key] = chunk
-    return list(merged.values())
-def create_chunks_display_html(chunk_info):
-    if not chunk_info:
-        return "<div style='padding: 20px; text-align: center; color: black;'>Нет данных о чанках</div>"
-    merged_chunks = merge_table_chunks(chunk_info)
-    html = "<div style='max-height: 500px; overflow-y: auto; padding: 10px; color: black;'>"
-    html += f"<h4 style='color: black;'>Найдено релевантных чанков: {len(merged_chunks)}</h4>"
-    for i, chunk in enumerate(merged_chunks):
-        bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
-        section_display = get_section_display(chunk)
-        formatted_content = get_formatted_content(chunk)
-        html += f"""
-        <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
-            <strong style='color: black;'>Документ:</strong> <span style='color: black;'>{chunk['document_id']}</span><br>
-            <strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{section_display}</span><br>
-            <strong style='color: black;'>Содержание:</strong><br>
-            <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
-                {formatted_content}
-            </div>
-        </div>
-        """
-    html += "</div>"
-    return html
-def get_section_display(chunk):
-    section_path = chunk.get('section_path', '')
-    section_id = chunk.get('section_id', 'unknown')
-    doc_type = chunk.get('type', 'text')
-    if doc_type == 'table' and chunk.get('table_number'):
-        table_num = chunk.get('table_number')
-        if not str(table_num).startswith('№'):
-            table_num = f"№{table_num}"
-        return f"таблица {table_num}"
-    if doc_type == 'image' and chunk.get('image_number'):
-        image_num = chunk.get('image_number')
-        if not str(image_num).startswith('№'):
-            image_num = f"№{image_num}"
-        return f"рисунок {image_num}"
-    if section_path:
-        return section_path
-    elif section_id and section_id != 'unknown':
-        return section_id
-    return section_id
-def get_formatted_content(chunk):
-    document_id = chunk.get('document_id', 'unknown')
-    section_path = chunk.get('section_path', '')
-    section_id = chunk.get('section_id', 'unknown')
-    section_text = chunk.get('section_text', '')
-    parent_section = chunk.get('parent_section', '')
-    parent_title = chunk.get('parent_title', '')
-    level = chunk.get('level', '')
-    chunk_text = chunk.get('chunk_text', '')
-    doc_type = chunk.get('type', 'text')
-    # For text documents
-    if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section:
-        current_section = section_path if section_path else section_id
-        parent_info = f"{parent_section} ({parent_title})" if parent_title else parent_section
-        return f"В разделе {parent_info} в документе {document_id}, пункт {current_section}: {chunk_text}"
-    else:
-        current_section = section_path if section_path else section_id
-        clean_text = chunk_text
-        if section_text and chunk_text.startswith(section_text):
-            section_title = section_text
-        elif chunk_text.startswith(f"{current_section} "):
-            clean_text = chunk_text[len(f"{current_section} "):].strip()
-            section_title = section_text if section_text else f"{current_section} {clean_text.split('.')[0] if '.' in clean_text else clean_text[:50]}"
         else:
-            section_title = section_text if section_text else current_section
-        return f"В разделе {current_section} в документе {document_id}, пункт {section_title}: {clean_text}"
 def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
                      json_files_dir=None, table_data_dir=None, image_data_dir=None,
@@ -190,7 +112,7 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
                 'table_number': doc.metadata.get('table_number', ''),
                 'image_number': doc.metadata.get('image_number', ''),
                 'section': doc.metadata.get('section', ''),
-                'connection_type': doc.metadata.get('connection_type', '')  # ADD THIS
             })
         log_message(f"Система успешно инициализирована")
@@ -225,15 +147,15 @@ def switch_model(model_name, vector_index):
         return None, f"❌ {error_msg}"
 retrieval_params = {
-    'vector_top_k': 50,
-    'bm25_top_k': 50,
-    'similarity_cutoff': 0.55,
-    'hybrid_top_k': 100,
     'rerank_top_k': 20
 }
-def create_query_engine(vector_index, vector_top_k=50, bm25_top_k=50,
-                       similarity_cutoff=0.55, hybrid_top_k=100):
     try:
         from config import CUSTOM_PROMPT
         from index_retriever import create_query_engine as create_index_query_engine
@@ -424,7 +346,7 @@ def create_demo_interface(answer_question_func, switch_model_func, current_model
                     vector_top_k = gr.Slider(
                         minimum=10,
                         maximum=200,
-                        value=50,
                         step=10,
                         label="Vector Top K",
                         info="Количество результатов из векторного поиска"
@@ -434,7 +356,7 @@ def create_demo_interface(answer_question_func, switch_model_func, current_model
                     bm25_top_k = gr.Slider(
                         minimum=10,
                         maximum=200,
-                        value=50,
                         step=10,
                         label="BM25 Top K",
                         info="Количество результатов из BM25 поиска"
@@ -445,7 +367,7 @@ def create_demo_interface(answer_question_func, switch_model_func, current_model
                     similarity_cutoff = gr.Slider(
                         minimum=0.0,
                         maximum=1.0,
-                        value=0.55,
                         step=0.05,
                         label="Similarity Cutoff",
                         info="Минимальный порог схожести для векторного поиска"
@@ -455,7 +377,7 @@ def create_demo_interface(answer_question_func, switch_model_func, current_model
                     hybrid_top_k = gr.Slider(
                         minimum=10,
                         maximum=300,
-                        value=100,
                         step=10,
                         label="Hybrid Top K",
                         info="Количество результатов из гибридного поиска"
@@ -497,7 +419,7 @@ def create_demo_interface(answer_question_func, switch_model_func, current_model
             gr.Markdown("### Текущие параметры:")
             current_params_display = gr.Textbox(
-                value="Vector: 50 | BM25: 50 | Cutoff: 0.55 | Hybrid: 100 | Rerank: 20",
                 label="",
                 interactive=False,
                 lines=2

 import os
 from llama_index.core import Settings
 from documents_prep import load_json_documents, load_table_documents, load_image_documents
+from main_utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
 from my_logging import log_message
 from index_retriever import create_vector_index, create_query_engine
 import sys
     JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
 )
 from converters.converter import process_uploaded_file, convert_single_excel_to_json, convert_single_excel_to_csv
+from main_utils import *
def restart_system():
    """Re-run system initialization so newly uploaded documents are picked up.

    Calls ``initialize_system`` with the configured repository and data
    directories. The module-level ``query_engine``, ``chunks_df``,
    ``reranker`` and ``vector_index`` are replaced only when initialization
    yields a usable query engine, so a failed restart leaves the previously
    working state untouched.

    Returns:
        A ``(status_message, chunks_html)`` tuple of strings for the UI.
        Exceptions are logged and reported through the status message
        instead of being raised.
    """
    global query_engine, chunks_df, reranker, vector_index
    try:
        log_message("Начало перезапуска системы...")
        # Build into locals first: assigning straight to the globals would
        # clobber the live engine with a failed (falsy) result.
        new_engine, new_chunks_df, new_reranker, new_index, chunk_info = initialize_system(
            repo_id=HF_REPO_ID,
            hf_token=HF_TOKEN,
            download_dir=DOWNLOAD_DIR,
            json_files_dir=JSON_FILES_DIR,
            table_data_dir=TABLE_DATA_DIR,
            image_data_dir=IMAGE_DATA_DIR,
            use_json_instead_csv=True,
        )
        if not new_engine:
            return "❌ Ошибка при перезапуске системы", "<div style='color: red;'>Ошибка загрузки</div>"

        query_engine = new_engine
        chunks_df = new_chunks_df
        reranker = new_reranker
        vector_index = new_index
        log_message("Система успешно перезапущена")
        chunks_html = create_chunks_display_html(chunk_info)
        return "✅ Система успешно перезапущена! Новые документы загружены.", chunks_html
    except Exception as e:
        # UI boundary: report the failure to the user rather than crash the app.
        error_msg = f"Ошибка перезапуска: {str(e)}"
        log_message(error_msg)
        return f"❌ {error_msg}", "<div style='color: red;'>Ошибка</div>"
 def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
                      json_files_dir=None, table_data_dir=None, image_data_dir=None,
                 'table_number': doc.metadata.get('table_number', ''),
                 'image_number': doc.metadata.get('image_number', ''),
                 'section': doc.metadata.get('section', ''),
+                'connection_type': doc.metadata.get('connection_type', '')
             })
         log_message(f"Система успешно инициализирована")
         return None, f"❌ {error_msg}"
 retrieval_params = {
+    'vector_top_k': 70,
+    'bm25_top_k': 70,
+    'similarity_cutoff': 0.45,
+    'hybrid_top_k': 140,
     'rerank_top_k': 20
 }
+def create_query_engine(vector_index, vector_top_k=70, bm25_top_k=70,
+                       similarity_cutoff=0.45, hybrid_top_k=140):
     try:
         from config import CUSTOM_PROMPT
         from index_retriever import create_query_engine as create_index_query_engine
                     vector_top_k = gr.Slider(
                         minimum=10,
                         maximum=200,
+                        value=70,
                         step=10,
                         label="Vector Top K",
                         info="Количество результатов из векторного поиска"
                     bm25_top_k = gr.Slider(
                         minimum=10,
                         maximum=200,
+                        value=70,
                         step=10,
                         label="BM25 Top K",
                         info="Количество результатов из BM25 поиска"
                     similarity_cutoff = gr.Slider(
                         minimum=0.0,
                         maximum=1.0,
+                        value=0.45,
                         step=0.05,
                         label="Similarity Cutoff",
                         info="Минимальный порог схожести для векторного поиска"
                     hybrid_top_k = gr.Slider(
                         minimum=10,
                         maximum=300,
+                        value=140,
                         step=10,
                         label="Hybrid Top K",
                         info="Количество результатов из гибридного поиска"
             gr.Markdown("### Текущие параметры:")
             current_params_display = gr.Textbox(
+                value="Vector: 70 | BM25: 70 | Cutoff: 0.45 | Hybrid: 140 | Rerank: 20",
                 label="",
                 interactive=False,
                 lines=2

app_1.py CHANGED Viewed

@@ -2,7 +2,7 @@ import gradio as gr
 import os
 from llama_index.core import Settings
 from documents_prep import load_json_documents, load_table_data, load_image_data, load_csv_chunks
-from utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
 from my_logging import log_message
 from index_retriever import create_vector_index, create_query_engine
 import sys

 import os
 from llama_index.core import Settings
 from documents_prep import load_json_documents, load_table_data, load_image_data, load_csv_chunks
+from main_utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
 from my_logging import log_message
 from index_retriever import create_vector_index, create_query_engine
 import sys

converters/converter.py CHANGED Viewed

@@ -19,31 +19,66 @@ def process_uploaded_file(file, file_type):
             filename = os.path.basename(source_path)
             file_path = os.path.join(temp_dir, filename)
             if os.path.abspath(source_path) != os.path.abspath(file_path):
                 shutil.copy(source_path, file_path)
             else:
                 file_path = source_path
             if file_type == "Таблица":
                 target_dir = TABLE_DATA_DIR
                 if filename.endswith(('.xlsx', '.xls')):
                     json_path = convert_single_excel_to_json(file_path, temp_dir)
                     upload_file = json_path
                 else:
                     upload_file = file_path
             elif file_type == "Изображение (метаданные)":
                 target_dir = IMAGE_DATA_DIR
-                # Конвертируем Excel в CSV
                 if filename.endswith(('.xlsx', '.xls')):
                     csv_path = convert_single_excel_to_csv(file_path, temp_dir)
                     upload_file = csv_path
                 else:
                     upload_file = file_path
             else:  # JSON документ
                 target_dir = JSON_FILES_DIR
                 upload_file = file_path
             # Загружаем на HuggingFace
             api = HfApi()
             api.upload_file(
                 path_or_fileobj=upload_file,
@@ -54,7 +89,12 @@ def process_uploaded_file(file, file_type):
             )
             log_message(f"Файл {filename} успешно загружен в {target_dir}")
-            return f"✅ Файл успешно загружен и обработан: {os.path.basename(upload_file)}\n⚠️ Перезапустите систему для применения изменений"
     except Exception as e:
         error_msg = f"Ошибка обработки файла: {str(e)}"
@@ -71,12 +111,18 @@ def convert_single_excel_to_json(excel_path, output_dir):
         "sheets": []
     }
     for sheet_name, df in df_dict.items():
         if df.empty or "Номер таблицы" not in df.columns:
             continue
         df = df.dropna(how='all').fillna("")
         grouped = df.groupby("Номер таблицы")
         for table_number, group in grouped:
             group = group.reset_index(drop=True)
@@ -100,6 +146,10 @@ def convert_single_excel_to_json(excel_path, output_dir):
                 sheet_data["data"].append(row_dict)
             result["sheets"].append(sheet_data)
     json_filename = os.path.basename(excel_path).replace('.xlsx', '.json').replace('.xls', '.json')
     json_path = os.path.join(output_dir, json_filename)
@@ -107,12 +157,22 @@ def convert_single_excel_to_json(excel_path, output_dir):
     with open(json_path, 'w', encoding='utf-8') as f:
         json.dump(result, f, ensure_ascii=False, indent=2)
     return json_path
 def convert_single_excel_to_csv(excel_path, output_dir):
     """Конвертация одного Excel файла в CSV для изображений"""
     df = pd.read_excel(excel_path)
     csv_filename = os.path.basename(excel_path).replace('.xlsx', '.csv').replace('.xls', '.csv')
     csv_path = os.path.join(output_dir, csv_filename)
     df.to_csv(csv_path, index=False, encoding='utf-8')
-    return csv_path

             filename = os.path.basename(source_path)
             file_path = os.path.join(temp_dir, filename)
+            log_message(f"Начало обработки файла: {filename}")
+            log_message(f"Тип документа: {file_type}")
             if os.path.abspath(source_path) != os.path.abspath(file_path):
                 shutil.copy(source_path, file_path)
             else:
                 file_path = source_path
+            status_info = []
             if file_type == "Таблица":
                 target_dir = TABLE_DATA_DIR
                 if filename.endswith(('.xlsx', '.xls')):
                     json_path = convert_single_excel_to_json(file_path, temp_dir)
                     upload_file = json_path
+                    # Read processed data for statistics
+                    with open(json_path, 'r', encoding='utf-8') as f:
+                        data = json.load(f)
+                    status_info.append(f"📊 Обработано таблиц: {len(data['sheets'])}")
+                    status_info.append(f"📄 Листов в документе: {data['total_sheets']}")
                 else:
                     upload_file = file_path
+                    status_info.append(f"📄 Загружен файл: {filename}")
             elif file_type == "Изображение (метаданные)":
                 target_dir = IMAGE_DATA_DIR
                 if filename.endswith(('.xlsx', '.xls')):
                     csv_path = convert_single_excel_to_csv(file_path, temp_dir)
                     upload_file = csv_path
+                    # Read CSV for statistics
+                    df = pd.read_csv(csv_path)
+                    status_info.append(f"🖼️ Записей изображений: {len(df)}")
+                    status_info.append(f"📋 Колонок метаданных: {len(df.columns)}")
                 else:
                     upload_file = file_path
+                    # Try to read CSV for stats
+                    try:
+                        df = pd.read_csv(upload_file)
+                        status_info.append(f"🖼️ Записей изображений: {len(df)}")
+                    except:
+                        status_info.append(f"📄 Загружен файл: {filename}")
             else:  # JSON документ
                 target_dir = JSON_FILES_DIR
                 upload_file = file_path
+                # Try to read JSON for statistics
+                try:
+                    with open(upload_file, 'r', encoding='utf-8') as f:
+                        json_data = json.load(f)
+                    if isinstance(json_data, list):
+                        status_info.append(f"📝 Документов в JSON: {len(json_data)}")
+                    elif isinstance(json_data, dict):
+                        status_info.append(f"📝 JSON объект загружен")
+                except:
+                    status_info.append(f"📄 Загружен файл: {filename}")
             # Загружаем на HuggingFace
+            log_message(f"Загрузка на HuggingFace: {target_dir}/{os.path.basename(upload_file)}")
             api = HfApi()
             api.upload_file(
                 path_or_fileobj=upload_file,
             )
             log_message(f"Файл {filename} успешно загружен в {target_dir}")
+            result_message = f"✅ Файл успешно загружен и обработан: {os.path.basename(upload_file)}\n\n"
+            result_message += "\n".join(status_info)
+            result_message += "\n\n⚠️ Нажмите кнопку 'Перезапустить систему' для применения изменений"
+            return result_message
     except Exception as e:
         error_msg = f"Ошибка обработки файла: {str(e)}"
         "sheets": []
     }
+    log_message(f"Обработка файла: {os.path.basename(excel_path)}")
+    log_message(f"Найдено листов: {len(df_dict)}")
+    total_tables = 0
     for sheet_name, df in df_dict.items():
         if df.empty or "Номер таблицы" not in df.columns:
+            log_message(f"  Лист '{sheet_name}': пропущен (пустой или отсутствует колонка 'Номер таблицы')")
             continue
         df = df.dropna(how='all').fillna("")
         grouped = df.groupby("Номер таблицы")
+        sheet_tables = 0
         for table_number, group in grouped:
             group = group.reset_index(drop=True)
                 sheet_data["data"].append(row_dict)
             result["sheets"].append(sheet_data)
+            sheet_tables += 1
+        total_tables += sheet_tables
+        log_message(f"  Лист '{sheet_name}': обработано таблиц: {sheet_tables}")
     json_filename = os.path.basename(excel_path).replace('.xlsx', '.json').replace('.xls', '.json')
     json_path = os.path.join(output_dir, json_filename)
     with open(json_path, 'w', encoding='utf-8') as f:
         json.dump(result, f, ensure_ascii=False, indent=2)
+    log_message(f"Конвертация завершена. Всего таблиц обработано: {total_tables}")
+    log_message(f"Результат сохранен: {json_filename}")
     return json_path
 def convert_single_excel_to_csv(excel_path, output_dir):
     """Конвертация одного Excel файла в CSV для изображений"""
+    log_message(f"Конвертация Excel в CSV: {os.path.basename(excel_path)}")
     df = pd.read_excel(excel_path)
     csv_filename = os.path.basename(excel_path).replace('.xlsx', '.csv').replace('.xls', '.csv')
     csv_path = os.path.join(output_dir, csv_filename)
     df.to_csv(csv_path, index=False, encoding='utf-8')
+    log_message(f"  Строк обработано: {len(df)}")
+    log_message(f"  Колонок: {len(df.columns)}")
+    log_message(f"  Результат сохранен: {csv_filename}")
+    return csv_path

index_retriever.py CHANGED Viewed

@@ -49,8 +49,8 @@ def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
         log_message(f"Ошибка переранжировки: {str(e)}")
         return nodes[:top_k]
-def create_query_engine(vector_index, vector_top_k=50, bm25_top_k=50,
-                       similarity_cutoff=0.55, hybrid_top_k=100):
     try:
         from config import CUSTOM_PROMPT

         log_message(f"Ошибка переранжировки: {str(e)}")
         return nodes[:top_k]
+def create_query_engine(vector_index, vector_top_k=70, bm25_top_k=70,
+                       similarity_cutoff=0.45, hybrid_top_k=140):
     try:
         from config import CUSTOM_PROMPT

utils.py → main_utils.py RENAMED Viewed

@@ -210,6 +210,118 @@ def enhance_query_with_keywords(query):
     return f"{query}"
 def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
     normalized_question = normalize_text(question)
     normalized_question_2, query_changes, change_list = normalize_steel_designations(question)

     return f"{query}"
def merge_table_chunks(chunk_info):
    """Collapse table chunks of the same table into one entry each.

    Chunks whose ``type`` is ``'table'`` or ``'table_row'`` are grouped by
    ``(document_id, table_number)``; their ``chunk_text`` values are joined
    with newlines into a fresh dict, so input chunks are never mutated.
    All other chunks are passed through unchanged, de-duplicated by
    ``(document_id, section_id, chunk_id)`` with the last duplicate winning.

    Args:
        chunk_info: Iterable of chunk dicts.

    Returns:
        List of chunk dicts in first-seen order.
    """
    merged = {}
    for chunk in chunk_info:
        doc_type = chunk.get('type', 'text')
        doc_id = chunk.get('document_id', 'unknown')
        if doc_type in ('table', 'table_row'):
            table_num = chunk.get('table_number', '')
            # Tagged tuple keys cannot collide with text-chunk keys the way
            # underscore-joined strings could (e.g. table "1_0" vs section "1", chunk 0).
            key = ('table', str(doc_id), str(table_num))
            text = chunk.get('chunk_text', '') or ''
            entry = merged.get(key)
            if entry is None:
                merged[key] = {
                    'document_id': doc_id,
                    'type': 'table',
                    'table_number': table_num,
                    'section_id': chunk.get('section_id', 'unknown'),
                    'chunk_text': text,
                }
            else:
                entry['chunk_text'] += '\n' + text
        else:
            key = (
                'chunk',
                str(doc_id),
                str(chunk.get('section_id', '')),
                str(chunk.get('chunk_id', 0)),
            )
            merged[key] = chunk
    return list(merged.values())
def create_chunks_display_html(chunk_info):
    """Render retrieved chunks as a scrollable HTML fragment.

    Table chunks are first merged via ``merge_table_chunks``. Each remaining
    chunk becomes a card showing its document id, section label and
    formatted content. All chunk-derived text is HTML-escaped because it
    comes from document content and must not be interpreted as markup.

    Args:
        chunk_info: List of chunk dicts; may be empty or ``None``.

    Returns:
        HTML string; a placeholder message when there are no chunks.
    """
    # Imported locally so the block needs no change to the file's import section.
    from html import escape

    if not chunk_info:
        return "<div style='padding: 20px; text-align: center; color: black;'>Нет данных о чанках</div>"

    merged_chunks = merge_table_chunks(chunk_info)
    parts = [
        "<div style='max-height: 500px; overflow-y: auto; padding: 10px; color: black;'>",
        f"<h4 style='color: black;'>Найдено релевантных чанков: {len(merged_chunks)}</h4>",
    ]
    for i, chunk in enumerate(merged_chunks):
        bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
        # .get() mirrors the sibling helpers: pass-through chunks may lack the key.
        document_id = escape(str(chunk.get('document_id', 'unknown')))
        section_display = escape(str(get_section_display(chunk)))
        formatted_content = escape(str(get_formatted_content(chunk)))
        parts.append(f"""
        <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
            <strong style='color: black;'>Документ:</strong> <span style='color: black;'>{document_id}</span><br>
            <strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{section_display}</span><br>
            <strong style='color: black;'>Содержание:</strong><br>
            <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
                {formatted_content}
            </div>
        </div>
        """)
    parts.append("</div>")
    return "".join(parts)
def get_section_display(chunk):
    """Return the short label shown in the chunk card's section field.

    Tables and images with a number are labelled ``"таблица №N"`` /
    ``"рисунок №N"`` (the ``№`` prefix is added only when missing).
    Otherwise the section path is used, falling back to the section id
    (``'unknown'`` when the chunk has neither).

    Args:
        chunk: Chunk dict.

    Returns:
        The display label.
    """
    doc_type = chunk.get('type', 'text')
    numbered_kinds = (
        ('table', 'table_number', 'таблица'),
        ('image', 'image_number', 'рисунок'),
    )
    for kind, number_field, label in numbered_kinds:
        number = chunk.get(number_field)
        if doc_type == kind and number:
            number_text = str(number)
            if not number_text.startswith('№'):
                number_text = f"№{number_text}"
            return f"{label} {number_text}"

    # Section path wins when present; otherwise the id, whatever it is.
    return chunk.get('section_path', '') or chunk.get('section_id', 'unknown')
def get_formatted_content(chunk):
    """Build the one-line description of a chunk shown in the chunk card.

    Sub-level sections (``level`` is ``subsection``, ``sub_subsection`` or
    ``sub_sub_subsection``) that have a parent are described relative to
    that parent. Any other chunk is described by its own section; when the
    text starts with ``"<section> "`` that prefix is stripped, and a title
    is taken from ``section_text`` or derived from the first sentence of the
    text (or its first 50 characters when there is no period).

    Args:
        chunk: Chunk dict; a missing or ``None`` ``chunk_text`` is treated
            as an empty string.

    Returns:
        The formatted description string.
    """
    document_id = chunk.get('document_id', 'unknown')
    section_text = chunk.get('section_text', '')
    parent_section = chunk.get('parent_section', '')
    parent_title = chunk.get('parent_title', '')
    level = chunk.get('level', '')
    chunk_text = chunk.get('chunk_text', '') or ''
    current_section = chunk.get('section_path', '') or chunk.get('section_id', 'unknown')

    if level in ('subsection', 'sub_subsection', 'sub_sub_subsection') and parent_section:
        parent_info = f"{parent_section} ({parent_title})" if parent_title else parent_section
        return f"В разделе {parent_info} в документе {document_id}, пункт {current_section}: {chunk_text}"

    clean_text = chunk_text
    section_prefix = f"{current_section} "
    if section_text and chunk_text.startswith(section_text):
        section_title = section_text
    elif chunk_text.startswith(section_prefix):
        # Drop the leading section number so it is not repeated in the output.
        clean_text = chunk_text[len(section_prefix):].strip()
        if section_text:
            section_title = section_text
        else:
            headline = clean_text.split('.')[0] if '.' in clean_text else clean_text[:50]
            section_title = f"{current_section} {headline}"
    else:
        section_title = section_text if section_text else current_section
    return f"В разделе {current_section} в документе {document_id}, пункт {section_title}: {clean_text}"
 def answer_question(question, query_engine, reranker, current_model, chunks_df=None, rerank_top_k=20):
     normalized_question = normalize_text(question)
     normalized_question_2, query_changes, change_list = normalize_steel_designations(question)