Spaces:

MrSimple01
/

RAG_AIEXP_1

Sleeping

App Files Files Community

MrSimple07 committed on Sep 22, 2025

Commit

865746a

1 Parent(s): 1333a87

added new table prep process + some improvement in chunking

Browse files

Files changed (5) hide show

app.py +86 -5
config.py +1 -1
documents_prep.py +152 -116
table_prep.py +347 -0
utils.py +258 -14

app.py CHANGED Viewed

@@ -20,13 +20,18 @@ def create_chunks_display_html(chunk_info):
     for i, chunk in enumerate(chunk_info):
         bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
         html += f"""
         <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
             <strong style='color: black;'>Документ:</strong> <span style='color: black;'>{chunk['document_id']}</span><br>
-            <strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{chunk.get('section_id', 'unknown')}</span><br>
             <strong style='color: black;'>Содержание:</strong><br>
             <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
-                {chunk['chunk_text']}
             </div>
         </div>
         """
@@ -34,12 +39,68 @@ def create_chunks_display_html(chunk_info):
     html += "</div>"
     return html
 def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
                      json_files_dir=None, table_data_dir=None, image_data_dir=None,
                      use_json_instead_csv=False):
     try:
         log_message("Инициализация системы")
         os.makedirs(download_dir, exist_ok=True)
         embed_model = get_embedding_model()
         llm = get_llm_model(DEFAULT_MODEL)
@@ -47,7 +108,16 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
         Settings.embed_model = embed_model
         Settings.llm = llm
         all_documents = []
         chunks_df = None
         chunk_info = []
@@ -66,14 +136,24 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
         if table_data_dir:
             log_message("Добавляю табличные данные")
             table_documents = load_table_data(repo_id, hf_token, table_data_dir)
-            all_documents.extend(table_documents)
         if image_data_dir:
             log_message("Добавляю данные изображений")
             image_documents = load_image_data(repo_id, hf_token, image_data_dir)
-            all_documents.extend(image_documents)
-        log_message(f"Всего документов: {len(all_documents)}")
         vector_index = create_vector_index(all_documents)
         query_engine = create_query_engine(vector_index)
@@ -171,6 +251,7 @@ def create_demo_interface(answer_question_func, switch_model_func, current_model
                             "Какой стандарт устанавливает порядок признания протоколов испытаний продукции в области использования атомной энергии?",
                             "Кто несет ответственность за организацию и проведение признания протоколов испытаний продукции?",
                             "В каких случаях могут быть признаны протоколы испытаний, проведенные лабораториями?",
                         ],
                         inputs=question_input
                     )

     for i, chunk in enumerate(chunk_info):
         bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"
+        # Get section display info
+        section_display = get_section_display(chunk)
+        formatted_content = get_formatted_content(chunk)
         html += f"""
         <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
             <strong style='color: black;'>Документ:</strong> <span style='color: black;'>{chunk['document_id']}</span><br>
+            <strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{section_display}</span><br>
             <strong style='color: black;'>Содержание:</strong><br>
             <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
+                {formatted_content}
             </div>
         </div>
         """
     html += "</div>"
     return html
def get_section_display(chunk):
    """Return the label shown in the "Раздел" field for one chunk record.

    Tables and images are labelled by their number ("таблица №3",
    "рисунок №5"); a leading "№" is added only when the stored number does
    not already start with one. Every other chunk is labelled with its
    ``section_path`` or, failing that, its ``section_id``.

    Args:
        chunk: Mapping describing a chunk. Keys read here: ``type``,
            ``table_number``, ``image_number``, ``section_path``,
            ``section_id``.

    Returns:
        The display label; ``'unknown'`` when nothing usable is present.
    """
    # Per document type: which key holds the number and which noun precedes it.
    numbered_kinds = {
        'table': ('table_number', 'таблица'),
        'image': ('image_number', 'рисунок'),
    }
    doc_type = chunk.get('type', 'text')

    if doc_type in numbered_kinds:
        number_key, noun = numbered_kinds[doc_type]
        number = chunk.get(number_key)
        # 'unknown' is the placeholder stored when the real number is missing;
        # it is truthy, so it must be rejected explicitly or the label would
        # read "таблица №unknown".
        if number and str(number) != 'unknown':
            label = str(number)
            if not label.startswith('№'):
                label = f"№{label}"
            return f"{noun} {label}"

    # Prefer the full section path; otherwise fall back to the section id
    # (which itself defaults to 'unknown').
    return chunk.get('section_path', '') or chunk.get('section_id', 'unknown')
def get_formatted_content(chunk):
    """Build the sentence-style content line shown for one chunk record.

    Two layouts are produced:

    * Nested sections (``level`` is a subsection level and ``parent_section``
      is set): "В разделе <parent> в документе <doc>, пункт <section>: <text>".
    * Everything else: "В разделе <section> в документе <doc>, пункт
      <title>: <text>", where a leading "<section> " prefix is stripped from
      the text unless the text starts with ``section_text``.

    Args:
        chunk: Mapping describing a chunk. Keys read here: ``document_id``,
            ``section_path``, ``section_id``, ``section_text``,
            ``parent_section``, ``parent_title``, ``level``, ``chunk_text``
            and, as a fallback, ``chunk_preview``.

    Returns:
        The formatted string.
    """
    document_id = chunk.get('document_id', 'unknown')
    section_text = chunk.get('section_text', '')
    parent_section = chunk.get('parent_section', '')
    parent_title = chunk.get('parent_title', '')
    # Some chunk records carry only a 'chunk_preview' and no 'chunk_text';
    # fall back to the preview so such chunks do not render with empty content.
    chunk_text = chunk.get('chunk_text') or chunk.get('chunk_preview') or ''
    current_section = chunk.get('section_path', '') or chunk.get('section_id', 'unknown')

    nested_levels = ('subsection', 'sub_subsection', 'sub_sub_subsection')
    if chunk.get('level', '') in nested_levels and parent_section:
        parent_info = f"{parent_section} ({parent_title})" if parent_title else parent_section
        return f"В разделе {parent_info} в документе {document_id}, пункт {current_section}: {chunk_text}"

    clean_text = chunk_text
    section_title = section_text if section_text else current_section
    section_prefix = f"{current_section} "
    # When the text already opens with the section's own text it is shown
    # untouched; only otherwise is a leading "<section> " prefix removed.
    starts_with_title = bool(section_text) and chunk_text.startswith(section_text)
    if not starts_with_title and chunk_text.startswith(section_prefix):
        clean_text = chunk_text[len(section_prefix):].strip()
        if not section_text:
            # No stored title: use the first sentence, or the first 50
            # characters when the text contains no period.
            headline = clean_text.split('.')[0] if '.' in clean_text else clean_text[:50]
            section_title = f"{current_section} {headline}"

    return f"В разделе {current_section} в документе {document_id}, пункт {section_title}: {clean_text}"
 def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
                      json_files_dir=None, table_data_dir=None, image_data_dir=None,
                      use_json_instead_csv=False):
     try:
+        from documents_prep import process_documents_with_chunking
         log_message("Инициализация системы")
         os.makedirs(download_dir, exist_ok=True)
+        from config import CHUNK_SIZE, CHUNK_OVERLAP
+        from llama_index.core.text_splitter import TokenTextSplitter
         embed_model = get_embedding_model()
         llm = get_llm_model(DEFAULT_MODEL)
         Settings.embed_model = embed_model
         Settings.llm = llm
+        Settings.text_splitter = TokenTextSplitter(
+            chunk_size=CHUNK_SIZE,
+            chunk_overlap=CHUNK_OVERLAP,
+            separator=" ",
+            backup_separators=["\n", ".", "!", "?"]
+        )
+        log_message(f"Configured chunk size: {CHUNK_SIZE} tokens")
+        log_message(f"Configured chunk overlap: {CHUNK_OVERLAP} tokens")
         all_documents = []
         chunks_df = None
         chunk_info = []
         if table_data_dir:
             log_message("Добавляю табличные данные")
             table_documents = load_table_data(repo_id, hf_token, table_data_dir)
+            log_message(f"Загружено {len(table_documents)} табличных документов")
+            # Process table documents through chunking
+            chunked_table_docs, table_chunk_info = process_documents_with_chunking(table_documents)
+            all_documents.extend(chunked_table_docs)
+            chunk_info.extend(table_chunk_info)
         if image_data_dir:
             log_message("Добавляю данные изображений")
             image_documents = load_image_data(repo_id, hf_token, image_data_dir)
+            log_message(f"Загружено {len(image_documents)} документов изображений")
+            # Process image documents through chunking
+            chunked_image_docs, image_chunk_info = process_documents_with_chunking(image_documents)
+            all_documents.extend(chunked_image_docs)
+            chunk_info.extend(image_chunk_info)
+        log_message(f"Всего документов после всей обработки: {len(all_documents)}")
         vector_index = create_vector_index(all_documents)
         query_engine = create_query_engine(vector_index)
                             "Какой стандарт устанавливает порядок признания протоколов испытаний продукции в области использования атомной энергии?",
                             "Кто несет ответственность за организацию и проведение признания протоколов испытаний продукции?",
                             "В каких случаях могут быть признаны протоколы испытаний, проведенные лабораториями?",
+                            "В какой таблице можно найти информацию о методы исследований при аттестационных испытаниях технологии термической обработки заготовок из легированных сталей? Какой документ и какой раздел?"
                         ],
                         inputs=question_input
                     )

config.py CHANGED Viewed

@@ -52,7 +52,7 @@ AVAILABLE_MODELS = {
 DEFAULT_MODEL = "Gemini 2.5 Flash"
-CHUNK_SIZE = 2048
 CHUNK_OVERLAP = 256
 CUSTOM_PROMPT = """

 DEFAULT_MODEL = "Gemini 2.5 Flash"
+CHUNK_SIZE = 25000
 CHUNK_OVERLAP = 256
 CUSTOM_PROMPT = """

documents_prep.py CHANGED Viewed

@@ -6,9 +6,14 @@ from llama_index.core import Document
 from my_logging import log_message
 from llama_index.core.text_splitter import SentenceSplitter
 from config import CHUNK_SIZE, CHUNK_OVERLAP
-def chunk_document(doc, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
     text_splitter = SentenceSplitter(
         chunk_size=chunk_size,
         chunk_overlap=chunk_overlap,
@@ -35,33 +40,145 @@ def chunk_document(doc, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
     return chunked_docs
 def process_documents_with_chunking(documents):
     all_chunked_docs = []
     chunk_info = []
     for doc in documents:
-        if len(doc.text) > CHUNK_SIZE:
-            chunked_docs = chunk_document(doc)
-            all_chunked_docs.extend(chunked_docs)
-            for i, chunk_doc in enumerate(chunked_docs):
                 chunk_info.append({
-                    'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
-                    'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
-                    'chunk_id': i,
-                    'chunk_size': len(chunk_doc.text),
-                    'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text
                 })
-        else:
-            all_chunked_docs.append(doc)
-            chunk_info.append({
-                'document_id': doc.metadata.get('document_id', 'unknown'),
-                'section_id': doc.metadata.get('section_id', 'unknown'),
-                'chunk_id': 0,
-                'chunk_size': len(doc.text),
-                'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text
-            })
     return all_chunked_docs, chunk_info
@@ -189,6 +306,7 @@ def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
                 documents = extract_zip_and_process_json(local_zip_path)
                 all_documents.extend(documents)
             except Exception as e:
                 log_message(f"Ошибка обработки ZIP файла {zip_file_path}: {str(e)}")
@@ -221,17 +339,18 @@ def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
                 log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
                 continue
         chunked_documents, chunk_info = process_documents_with_chunking(all_documents)
-        log_message(f"Всего создано {len(all_documents)} исходных документов")
-        log_message(f"После chunking получено {len(chunked_documents)} чанков")
         return chunked_documents, chunk_info
     except Exception as e:
         log_message(f"Ошибка загрузки JSON документов: {str(e)}")
         return [], []
 def extract_section_title(section_text):
     if not section_text.strip():
@@ -285,92 +404,6 @@ def extract_zip_and_process_json(zip_path):
     return documents
-def table_to_document(table_data, document_id=None):
-    content = ""
-    if isinstance(table_data, dict):
-        doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))
-        table_num = table_data.get('table_number', 'Неизвестно')
-        table_title = table_data.get('table_title', 'Неизвестно')
-        section = table_data.get('section', 'Неизвестно')
-        content += f"Таблица: {table_num}\n"
-        content += f"Название: {table_title}\n"
-        content += f"Документ: {doc_id}\n"
-        content += f"Раздел: {section}\n"
-        if 'data' in table_data and isinstance(table_data['data'], list):
-            for row in table_data['data']:
-                if isinstance(row, dict):
-                    row_text = " | ".join([f"{k}: {v}" for k, v in row.items()])
-                    content += f"{row_text}\n"
-    return Document(
-        text=content,
-        metadata={
-            "type": "table",
-            "table_number": table_data.get('table_number', 'unknown'),
-            "table_title": table_data.get('table_title', 'unknown'),
-            "document_id": doc_id or table_data.get('document_id', table_data.get('document', 'unknown')),
-            "section": table_data.get('section', 'unknown'),
-            "section_id": table_data.get('section', 'unknown')
-        }
-    )
-def load_table_data(repo_id, hf_token, table_data_dir):
-    log_message("Начинаю загрузку табличных данных")
-    table_files = []
-    try:
-        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
-        for file in files:
-            if file.startswith(table_data_dir) and file.endswith('.json'):
-                table_files.append(file)
-        log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
-        table_documents = []
-        for file_path in table_files:
-            try:
-                log_message(f"Обрабатываю файл: {file_path}")
-                local_path = hf_hub_download(
-                    repo_id=repo_id,
-                    filename=file_path,
-                    local_dir='',
-                    repo_type="dataset",
-                    token=hf_token
-                )
-                with open(local_path, 'r', encoding='utf-8') as f:
-                    table_data = json.load(f)
-                    if isinstance(table_data, dict):
-                        document_id = table_data.get('document', 'unknown')
-                        if 'sheets' in table_data:
-                            for sheet in table_data['sheets']:
-                                sheet['document'] = document_id
-                                doc = table_to_document(sheet, document_id)
-                                table_documents.append(doc)
-                        else:
-                            doc = table_to_document(table_data, document_id)
-                            table_documents.append(doc)
-                    elif isinstance(table_data, list):
-                        for table_json in table_data:
-                            doc = table_to_document(table_json)
-                            table_documents.append(doc)
-            except Exception as e:
-                log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
-                continue
-        log_message(f"Создано {len(table_documents)} документов из таблиц")
-        return table_documents
-    except Exception as e:
-        log_message(f"Ошибка загрузки табличных данных: {str(e)}")
-        return []
 def load_image_data(repo_id, hf_token, image_data_dir):
     log_message("Начинаю загрузку данных изображений")
@@ -398,12 +431,13 @@ def load_image_data(repo_id, hf_token, image_data_dir):
                 df = pd.read_csv(local_path)
                 log_message(f"Загружено {len(df)} записей изображений из файла {file_path}")
                 for _, row in df.iterrows():
-                    section_value = row.get('Раздел документа', row.get('section', 'Неизвестно'))
                     content = f"Изображение: {row.get('№ Изображения', 'Неизвестно')}\n"
                     content += f"Название: {row.get('Название изображения', 'Неизвестно')}\n"
-                    content += f"Описание: {row.get('Описание изображение', 'Неизвестно')}\n"
                     content += f"Документ: {row.get('Обозначение документа', 'Неизвестно')}\n"
                     content += f"Раздел: {section_value}\n"
                     content += f"Файл: {row.get('Файл изображения', 'Неизвестно')}\n"
@@ -412,11 +446,13 @@ def load_image_data(repo_id, hf_token, image_data_dir):
                         text=content,
                         metadata={
                             "type": "image",
-                            "image_number": row.get('№ Изображения', 'unknown'),
-                            "document_id": row.get('Обозначение документа', 'unknown'),
-                            "file_path": row.get('Файл изображения', 'unknown'),
-                            "section": section_value,
-                            "section_id": section_value
                         }
                     )
                     image_documents.append(doc)

 from my_logging import log_message
 from llama_index.core.text_splitter import SentenceSplitter
 from config import CHUNK_SIZE, CHUNK_OVERLAP
+from table_prep import table_to_document, load_table_data
+def chunk_document(doc, chunk_size=None, chunk_overlap=None):
+    if chunk_size is None:
+        chunk_size = CHUNK_SIZE
+    if chunk_overlap is None:
+        chunk_overlap = CHUNK_OVERLAP
     text_splitter = SentenceSplitter(
         chunk_size=chunk_size,
         chunk_overlap=chunk_overlap,
     return chunked_docs
 def process_documents_with_chunking(documents):
     all_chunked_docs = []
     chunk_info = []
+    table_count = 0
+    image_count = 0
+    text_chunks_count = 0
+    large_tables_count = 0
+    large_images_count = 0
+    custom_processed_count = 0
     for doc in documents:
+        doc_type = doc.metadata.get('type', 'text')
+        if doc_type == 'table':
+            table_count += 1
+            doc_id = doc.metadata.get('document_id', 'unknown')
+            table_num = doc.metadata.get('table_number', 'unknown')
+            from table_prep import should_use_custom_processing
+            use_custom, doc_pattern, method_config = should_use_custom_processing(doc_id, table_num)
+            if use_custom:
+                custom_processed_count += 1
+                log_message(f"Table {table_num} in document {doc_id} was processed with custom method '{method_config.get('method')}', skipping standard chunking")
+                # Add the document as-is since it was already processed by custom method
+                all_chunked_docs.append(doc)
+                chunk_info.append({
+                    'document_id': doc_id,
+                    'section_id': doc.metadata.get('section_id', 'unknown'),
+                    'chunk_id': 0,
+                    'chunk_size': len(doc.text),
+                    'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
+                    'type': 'table',
+                    'table_number': table_num,
+                    'processing_method': method_config.get('method')
+                })
+                continue
+            # Standard processing for non-custom tables
+            doc_size = len(doc.text)
+            if doc_size > CHUNK_SIZE:
+                large_tables_count += 1
+                log_message(f"Large table found: {table_num} in document {doc_id}, size: {doc_size} characters")
+                # Chunk large tables
+                chunked_docs = chunk_document(doc)
+                all_chunked_docs.extend(chunked_docs)
+                for i, chunk_doc in enumerate(chunked_docs):
+                    chunk_info.append({
+                        'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
+                        'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
+                        'chunk_id': i,
+                        'chunk_size': len(chunk_doc.text),
+                        'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
+                        'type': 'table',
+                        'table_number': chunk_doc.metadata.get('table_number', 'unknown'),
+                        'processing_method': 'standard_chunked'
+                    })
+            else:
+                all_chunked_docs.append(doc)
+                chunk_info.append({
+                    'document_id': doc.metadata.get('document_id', 'unknown'),
+                    'section_id': doc.metadata.get('section_id', 'unknown'),
+                    'chunk_id': 0,
+                    'chunk_size': doc_size,
+                    'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
+                    'type': 'table',
+                    'table_number': doc.metadata.get('table_number', 'unknown'),
+                    'processing_method': 'standard'
+                })
+        elif doc_type == 'image':
+            image_count += 1
+            doc_size = len(doc.text)
+            if doc_size > CHUNK_SIZE:
+                large_images_count += 1
+                log_message(f"Large image description found: {doc.metadata.get('image_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_size} characters")
+                # Chunk large images
+                chunked_docs = chunk_document(doc)
+                all_chunked_docs.extend(chunked_docs)
+                for i, chunk_doc in enumerate(chunked_docs):
+                    chunk_info.append({
+                        'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
+                        'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
+                        'chunk_id': i,
+                        'chunk_size': len(chunk_doc.text),
+                        'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
+                        'type': 'image',
+                        'image_number': chunk_doc.metadata.get('image_number', 'unknown')
+                    })
+            else:
+                all_chunked_docs.append(doc)
                 chunk_info.append({
+                    'document_id': doc.metadata.get('document_id', 'unknown'),
+                    'section_id': doc.metadata.get('section_id', 'unknown'),
+                    'chunk_id': 0,
+                    'chunk_size': doc_size,
+                    'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
+                    'type': 'image',
+                    'image_number': doc.metadata.get('image_number', 'unknown')
                 })
+        else:  # text documents
+            doc_size = len(doc.text)
+            if doc_size > CHUNK_SIZE:
+                chunked_docs = chunk_document(doc)
+                all_chunked_docs.extend(chunked_docs)
+                text_chunks_count += len(chunked_docs)
+                for i, chunk_doc in enumerate(chunked_docs):
+                    chunk_info.append({
+                        'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
+                        'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
+                        'chunk_id': i,
+                        'chunk_size': len(chunk_doc.text),
+                        'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
+                        'type': 'text'
+                    })
+            else:
+                all_chunked_docs.append(doc)
+                chunk_info.append({
+                    'document_id': doc.metadata.get('document_id', 'unknown'),
+                    'section_id': doc.metadata.get('section_id', 'unknown'),
+                    'chunk_id': 0,
+                    'chunk_size': doc_size,
+                    'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
+                    'type': 'text'
+                })
+    log_message(f"=== PROCESSING STATISTICS ===")
+    log_message(f"Total tables processed: {table_count}")
+    log_message(f"Custom processed tables: {custom_processed_count}")
+    log_message(f"Large tables (>{CHUNK_SIZE} chars): {large_tables_count}")
+    log_message(f"Total images processed: {image_count}")
+    log_message(f"Large images (>{CHUNK_SIZE} chars): {large_images_count}")
+    log_message(f"Total text chunks created: {text_chunks_count}")
+    log_message(f"Total documents after processing: {len(all_chunked_docs)}")
     return all_chunked_docs, chunk_info
                 documents = extract_zip_and_process_json(local_zip_path)
                 all_documents.extend(documents)
+                log_message(f"Извлечено {len(documents)} документов из ZIP архива {zip_file_path}")
             except Exception as e:
                 log_message(f"Ошибка обработки ZIP файла {zip_file_path}: {str(e)}")
                 log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
                 continue
+        log_message(f"Всего создано {len(all_documents)} исходных документов из JSON файлов")
+        # Process documents through chunking function
         chunked_documents, chunk_info = process_documents_with_chunking(all_documents)
+        log_message(f"После chunking получено {len(chunked_documents)} чанков из JSON данных")
         return chunked_documents, chunk_info
     except Exception as e:
         log_message(f"Ошибка загрузки JSON документов: {str(e)}")
         return [], []
 def extract_section_title(section_text):
     if not section_text.strip():
     return documents
 def load_image_data(repo_id, hf_token, image_data_dir):
     log_message("Начинаю загрузку данных изображений")
                 df = pd.read_csv(local_path)
                 log_message(f"Загружено {len(df)} записей изображений из файла {file_path}")
+                # Обработка с правильными названиями колонок
                 for _, row in df.iterrows():
+                    section_value = row.get('Раздел документа', 'Неизвестно')
                     content = f"Изображение: {row.get('№ Изображения', 'Неизвестно')}\n"
                     content += f"Название: {row.get('Название изображения', 'Неизвестно')}\n"
+                    content += f"Описание: {row.get('Описание изображение', 'Неизвестно')}\n"  # Опечатка в названии колонки
                     content += f"Документ: {row.get('Обозначение документа', 'Неизвестно')}\n"
                     content += f"Раздел: {section_value}\n"
                     content += f"Файл: {row.get('Файл изображения', 'Неизвестно')}\n"
                         text=content,
                         metadata={
                             "type": "image",
+                            "image_number": str(row.get('№ Изображения', 'unknown')),
+                            "image_title": str(row.get('Название изображения', 'unknown')),
+                            "image_description": str(row.get('Описание изображение', 'unknown')),
+                            "document_id": str(row.get('Обозначение документа', 'unknown')),
+                            "file_path": str(row.get('Файл изображения', 'unknown')),
+                            "section": str(section_value),
+                            "section_id": str(section_value)
                         }
                     )
                     image_documents.append(doc)

table_prep.py ADDED Viewed

	@@ -0,0 +1,347 @@

+import os
+from collections import defaultdict
+import json
+import zipfile
+import pandas as pd
+from huggingface_hub import hf_hub_download, list_repo_files
+from llama_index.core import Document
+from my_logging import log_message
+CUSTOM_TABLE_CONFIGS = {
+    "ГОСТ Р 50.05.01-2018": {
+        "tables": {
+            "№3": {"method": "group_by_column", "group_column": "Класс герметичности и чувствительности"},
+            "№Б.1": {"method": "group_by_column", "group_column": "Класс чувствительности системы контроля"}
+        }
+    },
+    "ГОСТ Р 50.06.01-2017": {
+        "tables": {
+            "№ Б.2": {"method": "split_by_rows"}
+        }
+    },
+    "НП-104-18": {
+        "tables": {
+            "*": {"method": "group_entire_table"}  # All tables
+        }
+    },
+    "НП-068-05": {
+        "tables": {
+            "Таблица 1": {"method": "group_by_column", "group_column": "Рабочее давление среды, МПа"},
+            "Таблица 2": {"method": "group_by_column", "group_column": "Рабочее давление среды, МПа"},
+            "Таблица Приложения 1": {"method": "group_by_column", "group_column": "Тип"}
+        }
+    },
+    "ГОСТ Р 59023.1-2020": {
+        "tables": {
+            "№ 1": {"method": "split_by_rows"},
+            "№ 2": {"method": "split_by_rows"},
+            "№ 3": {"method": "split_by_rows"}
+        }
+    },
+    "НП-089-15": {
+        "tables": {
+            "-": {"method": "split_by_rows"}
+        }
+    },
+    "НП-105-18": {
+        "tables": {
+            "№ 4.8": {"method": "group_entire_table"}
+        }
+    },
+    "ГОСТ Р 50.05.23-2020": {
+        "tables": {
+            "№8": {"method": "group_entire_table"}
+        }
+    },
+    "ГОСТ Р 50.03.01-2017": {
+        "tables": {
+            "А.8": {"method": "group_entire_table"}
+        }
+    }
+}
def create_meta_info(document_name, section, table_number, table_title, extra_info=""):
    """Compose the one-line header that identifies a table chunk.

    The header always names the document, section and table number. The
    table title is appended only when it is non-empty after stripping
    whitespace, and ``extra_info`` only when it is truthy. Parts are joined
    with ", ".
    """
    parts = [
        f'Документ "{document_name}"',
        f'Раздел: {section}',
        f'Таблица: {table_number}',
    ]
    if table_title and table_title.strip():
        parts.append(f'Название: {table_title}')
    if extra_info:
        parts.append(f'{extra_info}')
    return ', '.join(parts)
def create_chunk_text(meta_info, headers, rows, add_row_numbers=False):
    """Render a table chunk as plain text.

    Output layout, one item per line:

    1. ``meta_info`` with trailing whitespace removed.
    2. "Заголовки: " followed by the headers joined with " | ".
    3. One line per row made of "header: value" pairs joined with " | ",
       in header order. Cells that are missing, ``None`` or empty are left
       out; numeric zero is kept because it is real table data.

    Args:
        meta_info: Header line describing the table.
        headers: Column names; also the keys looked up in each row.
        rows: Iterable of mappings from column name to cell value.
        add_row_numbers: When true, prefix each row line with
            "Строка <n>: " (numbering starts at 1).

    Returns:
        The lines joined with newlines.
    """
    chunk_lines = [meta_info.rstrip(), f"Заголовки: {' | '.join(headers)}"]

    for row_number, row in enumerate(rows, start=1):
        cells = []
        for header in headers:
            value = row.get(header, '')
            # A plain truthiness test would also discard 0 / 0.0, which are
            # valid cell values; only genuinely empty cells are skipped.
            is_number = isinstance(value, (int, float))
            if value is None or (not is_number and not value):
                continue
            cells.append(f"{header}: {value}")

        row_line = ' | '.join(cells)
        if add_row_numbers:
            row_line = f"Строка {row_number}: {row_line}"
        chunk_lines.append(row_line)

    return "\n".join(chunk_lines)
def group_by_column_method(table_data, document_name, group_column):
    """Build one Document per distinct value of ``group_column``.

    Rows from ``table_data["data"]`` are bucketed by their value in
    ``group_column`` (rows lacking the column go under "UNKNOWN"), keeping
    first-seen order. Each bucket is rendered with numbered rows and wrapped
    in a ``Document`` whose metadata records the table, the grouping column
    and value, and the bucket's row count.

    Returns:
        List of the created documents, one per group.
    """
    column_names = table_data.get("headers", [])
    section = table_data.get("section", "")
    table_number = table_data.get("table_number", "")
    table_title = table_data.get("table_title", "")

    # Bucket rows by the grouping value; a plain dict keeps insertion order.
    buckets = {}
    for record in table_data.get("data", []):
        buckets.setdefault(record.get(group_column, "UNKNOWN"), []).append(record)

    documents = []
    for group_value, group_rows in buckets.items():
        group_note = f'Группа по "{group_column}": {group_value}'
        meta_info = create_meta_info(document_name, section, table_number, table_title, group_note)
        chunk_text = create_chunk_text(meta_info, column_names, group_rows, add_row_numbers=True)
        metadata = {
            "type": "table",
            "table_number": table_number,
            "table_title": table_title,
            "document_id": document_name,
            "section": section,
            "section_id": section,
            "group_column": group_column,
            "group_value": group_value,
            "total_rows": len(group_rows),
            "processing_method": "group_by_column"
        }
        documents.append(Document(text=chunk_text, metadata=metadata))
        log_message(f"Created grouped chunk for {group_column}={group_value}, rows: {len(group_rows)}, length: {len(chunk_text)}")
    return documents
def split_by_rows_method(table_data, document_name):
    """Turn every table row into its own Document.

    Each chunk carries the table headers plus a single row; the 1-based row
    position is recorded both in the meta line and in the metadata.
    """
    headers = table_data.get("headers", [])
    rows = table_data.get("data", [])
    section = table_data.get("section", "")
    table_number = table_data.get("table_number", "")
    table_title = table_data.get("table_title", "")
    row_count = len(rows)

    documents = []
    for row_number, row in enumerate(rows, start=1):
        meta_info = create_meta_info(
            document_name, section, table_number, table_title, f'Строка: {row_number}'
        )
        metadata = {
            "type": "table",
            "table_number": table_number,
            "table_title": table_title,
            "document_id": document_name,
            "section": section,
            "section_id": section,
            "row_number": row_number,
            "total_rows": row_count,
            "processing_method": "split_by_rows",
        }
        documents.append(
            Document(text=create_chunk_text(meta_info, headers, [row]), metadata=metadata)
        )
    log_message(f"Split table {table_number} into {row_count} row chunks")
    return documents
def group_entire_table_method(table_data, document_name):
    """Emit the whole table as a single Document.

    Returns a one-element list so the result has the same shape as the other
    table-processing methods.
    """
    rows = table_data.get("data", [])
    section = table_data.get("section", "")
    table_number = table_data.get("table_number", "")
    table_title = table_data.get("table_title", "")

    chunk_text = create_chunk_text(
        create_meta_info(document_name, section, table_number, table_title),
        table_data.get("headers", []),
        rows,
    )
    metadata = {
        "type": "table",
        "table_number": table_number,
        "table_title": table_title,
        "document_id": document_name,
        "section": section,
        "section_id": section,
        "total_rows": len(rows),
        "processing_method": "group_entire_table",
    }
    whole_table = Document(text=chunk_text, metadata=metadata)
    log_message(f"Grouped entire table {table_number}, rows: {len(rows)}, length: {len(chunk_text)}")
    return [whole_table]
def should_use_custom_processing(document_id, table_number):
    """Look up a custom processing rule for a table.

    A rule applies when ``document_id`` starts with one of the keys of
    CUSTOM_TABLE_CONFIGS and that entry's "tables" mapping contains either the
    exact ``table_number`` or the wildcard "*". An exact entry takes
    precedence over the wildcard.

    Returns:
        ``(True, matched_prefix, method_config)`` for the first match,
        otherwise ``(False, None, None)``.
    """
    for prefix, doc_config in CUSTOM_TABLE_CONFIGS.items():
        if not document_id.startswith(prefix):
            continue
        per_table = doc_config.get("tables", {})
        if table_number not in per_table and "*" not in per_table:
            continue
        return True, prefix, per_table.get(table_number, per_table.get("*"))
    return False, None, None
def process_table_with_custom_method(table_data, document_name, method_config):
    """Dispatch a table to the processing method named in ``method_config``.

    Returns the list of Documents produced by the selected method, or None
    (after logging) when the method name is not recognised.
    """
    method = method_config.get("method")

    if method == "group_by_column":
        return group_by_column_method(
            table_data, document_name, method_config.get("group_column")
        )
    if method == "split_by_rows":
        return split_by_rows_method(table_data, document_name)
    if method == "group_entire_table":
        return group_entire_table_method(table_data, document_name)

    # Unrecognised name: signal the caller by returning None.
    log_message(f"Unknown custom method: {method}, falling back to default processing")
    return None
def table_to_document(table_data, document_id=None):
    """Convert one table description into a list of Documents.

    Non-dict input yields an empty list. If a custom rule matches the table
    and the custom method returns a non-empty result, that result is returned.
    Otherwise the default rendering is used: a header block followed, when
    ``table_data['data']`` is a list, by one "Строка N: ..." line per dict
    row (non-dict rows are skipped but still consume a row number).
    """
    if not isinstance(table_data, dict):
        return []

    doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))
    table_num = table_data.get('table_number', 'Неизвестно')

    use_custom, doc_pattern, method_config = should_use_custom_processing(doc_id, table_num)
    if use_custom:
        log_message(f"Using custom processing for table {table_num} in document {doc_id}")
        custom_docs = process_table_with_custom_method(table_data, doc_id, method_config)
        if custom_docs:
            return custom_docs

    # Default rendering: reached when no rule matched or the custom method
    # produced nothing.
    table_title = table_data.get('table_title', 'Неизвестно')
    section = table_data.get('section', 'Неизвестно')
    header_content = f"Таблица: {table_num}\nНазвание: {table_title}\nДокумент: {doc_id}\nРаздел: {section}\n"

    has_rows = 'data' in table_data and isinstance(table_data['data'], list)

    # Keys are inserted in the same order in both variants; "total_rows" is
    # present only when row data exists.
    metadata = {
        "type": "table",
        "table_number": table_num,
        "table_title": table_title,
        "document_id": doc_id,
        "section": section,
        "section_id": section,
    }
    if has_rows:
        data_rows = table_data['data']
        row_lines = []
        for row_number, row in enumerate(data_rows, start=1):
            if not isinstance(row, dict):
                continue
            cells = " | ".join([f"{k}: {v}" for k, v in row.items()])
            row_lines.append(f"Строка {row_number}: {cells}\n")
        text = header_content + "\nДанные таблицы:\n" + "".join(row_lines)
        metadata["total_rows"] = len(data_rows)
    else:
        text = header_content
    metadata["processing_method"] = "default"

    return [Document(text=text, metadata=metadata)]
def _append_table_documents(table, lookup_document_id, explicit_document_id, table_documents):
    """Convert one table dict and append the resulting Documents in place.

    ``lookup_document_id`` is used only for the custom-rule check and its log
    line; ``explicit_document_id`` is what gets forwarded to
    ``table_to_document`` (None lets that function derive the id itself).
    """
    table_num = table.get('table_number', 'Неизвестно')
    use_custom, _, _ = should_use_custom_processing(lookup_document_id, table_num)
    if use_custom:
        log_message(f"Skipping default processing for custom table {table_num} in {lookup_document_id}")
    table_documents.extend(table_to_document(table, explicit_document_id))


def _collect_table_documents(table_data, file_path, table_documents):
    """Walk the parsed JSON of one file and append Documents for every table.

    Handles three layouts: a dict with a "sheets" list, a dict that is itself
    a single table, and a list of table dicts. Entries that are not dicts are
    logged and skipped so one malformed entry does not discard the remaining
    tables of the file.
    """
    if isinstance(table_data, dict):
        document_id = table_data.get('document', 'unknown')
        if 'sheets' not in table_data:
            _append_table_documents(table_data, document_id, document_id, table_documents)
            return
        for sheet in table_data['sheets']:
            if not isinstance(sheet, dict):
                log_message(f"Skipping non-dict sheet entry in {file_path}")
                continue
            # Each sheet inherits the document name of its parent file.
            sheet['document'] = document_id
            _append_table_documents(sheet, document_id, document_id, table_documents)
    elif isinstance(table_data, list):
        for table_json in table_data:
            if not isinstance(table_json, dict):
                log_message(f"Skipping non-dict table entry in {file_path}")
                continue
            # No explicit id is forwarded here, so table_to_document resolves
            # it from the table's own 'document_id' / 'document' keys.
            _append_table_documents(
                table_json, table_json.get('document', 'unknown'), None, table_documents
            )


def load_table_data(repo_id, hf_token, table_data_dir):
    """Download table JSON files from a dataset repo and convert them to Documents.

    Every repo file whose path starts with ``table_data_dir`` and ends with
    ".json" is downloaded and parsed. A failure while handling one file is
    logged and the file is skipped; a failure while listing the repository is
    logged and an empty list is returned.

    Args:
        repo_id: Identifier of the dataset repository.
        hf_token: Access token passed to the hub calls.
        table_data_dir: Path prefix selecting the table files.

    Returns:
        List of Documents built from all successfully processed tables.
    """
    log_message("Начинаю загрузку табличных данных")
    try:
        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
        table_files = [
            name for name in files
            if name.startswith(table_data_dir) and name.endswith('.json')
        ]
        log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")

        table_documents = []
        for file_path in table_files:
            try:
                log_message(f"Обрабатываю файл: {file_path}")
                local_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=file_path,
                    local_dir='',
                    repo_type="dataset",
                    token=hf_token
                )
                # Parse first and release the file handle before the
                # (potentially long) document construction.
                with open(local_path, 'r', encoding='utf-8') as f:
                    table_data = json.load(f)
                _collect_table_documents(table_data, file_path, table_documents)
            except Exception as e:
                log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
                continue

        log_message(f"Создано {len(table_documents)} документов из таблиц")
        return table_documents
    except Exception as e:
        log_message(f"Ошибка загрузки табличных данных: {str(e)}")
        return []

utils.py CHANGED Viewed

@@ -10,6 +10,190 @@ from index_retriever import rerank_nodes
 from my_logging import log_message
 from config import PROMPT_SIMPLE_POISK
 def get_llm_model(model_name):
     try:
         model_config = AVAILABLE_MODELS.get(model_name)
@@ -99,36 +283,81 @@ def generate_sources_html(nodes, chunks_df=None):
     html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
     html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
     for i, node in enumerate(nodes):
         metadata = node.metadata if hasattr(node, 'metadata') else {}
         doc_type = metadata.get('type', 'text')
         doc_id = metadata.get('document_id', 'unknown')
-        section_id = metadata.get('section_id', '')
         html += f"<div style='margin-bottom: 15px; padding: 15px; border: 1px solid #4a5568; border-radius: 8px; background-color: #1a202c;'>"
         if doc_type == 'text':
             html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
-            html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📌 {section_id}</h4>"
-        elif doc_type == 'table':
             table_num = metadata.get('table_number', 'unknown')
             if table_num and table_num != 'unknown':
-                if not table_num.startswith('№'):
                     table_num = f"№{table_num}"
                 html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица {table_num} - {doc_id}</h4>"
             else:
                 html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc_id}</h4>"
         elif doc_type == 'image':
             image_num = metadata.get('image_number', 'unknown')
             section = metadata.get('section', '')
             if image_num and image_num != 'unknown':
                 if not str(image_num).startswith('№'):
                     image_num = f"№{image_num}"
-                html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {image_num} - {doc_id} ({section})</h4>"
             else:
-                html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение - {doc_id} ({section})</h4>"
         if chunks_df is not None and 'file_link' in chunks_df.columns and doc_type == 'text':
             doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
             if not doc_rows.empty:
@@ -146,20 +375,35 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
     try:
         log_message(f"Получен вопрос: {question}")
-        log_message(f"Используется модель: {current_model}")
         start_time = time.time()
-        log_message("Извлекаю релевантные узлы")
         retrieved_nodes = query_engine.retriever.retrieve(question)
         log_message(f"Извлечено {len(retrieved_nodes)} узлов")
-        for i in range(min(3, len(retrieved_nodes))):
-            log_message(f"Пример узла {i+1}: {retrieved_nodes[i].text[:200]}...")
-        log_message("Применяю переранжировку")
         reranked_nodes = rerank_nodes(question, retrieved_nodes, reranker, top_k=10)
         formatted_context = format_context_for_llm(reranked_nodes)
-        log_message(f"fорматированный контекст для LLM:\n{formatted_context[:500]}...")
         enhanced_question = f"""
 Контекст из базы данных:
@@ -167,10 +411,10 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
 Вопрос пользователя: {question}"""
-        log_message(f"Отправляю запрос в LLM с {len(reranked_nodes)} узлами")
-        log_message(f"Вопрос для LLM:\n{enhanced_question}...")
         response = query_engine.query(enhanced_question)
         end_time = time.time()
         processing_time = end_time - start_time

 from my_logging import log_message
 from config import PROMPT_SIMPLE_POISK
def get_llm_model(model_name):
    """Instantiate the LLM client configured under ``model_name``.

    An unknown name falls back to the DEFAULT_MODEL configuration. Any
    failure (missing API key, unsupported provider, missing config key) is
    logged and a GoogleGenAI "gemini-2.0-flash" client built with
    GOOGLE_API_KEY is returned instead.
    """
    try:
        config = AVAILABLE_MODELS.get(model_name)
        if not config:
            log_message(f"Модель {model_name} не найдена, использую модель по умолчанию")
            config = AVAILABLE_MODELS[DEFAULT_MODEL]

        if not config.get("api_key"):
            raise Exception(f"API ключ не найден для модели {model_name}")

        provider = config["provider"]
        if provider != "google" and provider != "openai":
            raise Exception(f"Неподдерживаемый провайдер: {provider}")

        llm_class = GoogleGenAI if provider == "google" else OpenAI
        return llm_class(
            model=config["model_name"],
            api_key=config["api_key"]
        )
    except Exception as e:
        log_message(f"Ошибка создания модели {model_name}: {str(e)}")
        return GoogleGenAI(model="gemini-2.0-flash", api_key=GOOGLE_API_KEY)
def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
    """Build a HuggingFaceEmbedding for ``model_name``."""
    embedder = HuggingFaceEmbedding(model_name=model_name)
    return embedder
def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
    """Build a CrossEncoder reranker for ``model_name``."""
    reranker = CrossEncoder(model_name)
    return reranker
def format_context_for_llm(nodes):
    """Render retrieved nodes as source-tagged text blocks for the prompt.

    Each node becomes "[ИСТОЧНИК: <label> документа <doc>]" followed by its
    text. The label is derived from the section reference (``section_path``
    if truthy, else ``section_id``) and is replaced by a table or image label
    when the node's type and number say so. Without any label the tag reads
    "[ИСТОЧНИК: документ <doc>]". Blocks are joined with a newline.
    """
    nested_levels = ('subsection', 'sub_subsection', 'sub_sub_subsection')
    blocks = []
    for node in nodes:
        meta = node.metadata if hasattr(node, 'metadata') else {}
        doc_id = meta.get('document_id', 'Неизвестный документ')

        label = ""
        section_ref = meta.get('section_path') or meta.get('section_id')
        if section_ref:
            title = meta.get('section_text', '')
            parent_section = meta.get('parent_section', '')
            parent_title = meta.get('parent_title', '')
            if meta.get('level', '') in nested_levels and parent_section and parent_title:
                # Nested items are shown together with their parent section.
                label = f"пункт {section_ref} в разделе {parent_section} ({parent_title})"
            elif title:
                label = f"пункт {section_ref} ({title})"
            else:
                label = f"пункт {section_ref}"

        # Table and image labels override any section label; the image check
        # runs last, as in the original ordering.
        for kind, number_key, word in (
            ('table', 'table_number', 'таблица'),
            ('image', 'image_number', 'рисунок'),
        ):
            if meta.get('type') == kind and meta.get(number_key):
                number = meta[number_key]
                if not str(number).startswith('№'):
                    number = f"№{number}"
                label = f"{word} {number}"

        body = node.text if hasattr(node, 'text') else str(node)
        if label:
            blocks.append(f"[ИСТОЧНИК: {label} документа {doc_id}]\n{body}\n")
        else:
            blocks.append(f"[ИСТОЧНИК: документ {doc_id}]\n{body}\n")
    return "\n".join(blocks)
def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
    """Answer a user question and build the HTML fragments shown in the UI.

    The question is sent to the retriever, the retrieved nodes are reranked
    (top 10), formatted into a context block and passed to the query engine
    together with the original question. Retrieved nodes, reranked nodes, the
    full context and the LLM answer are all written to the log.

    Args:
        question: The user's question text.
        query_engine: Engine exposing ``retriever.retrieve`` and ``query``;
            None means the system has not been initialised.
        reranker: Reranking model forwarded to ``rerank_nodes``.
        current_model: Model name shown in the answer header.
        chunks_df: Optional chunks table forwarded to ``generate_sources_html``.

    Returns:
        A 3-tuple ``(answer_html, sources_html, chunks_html)``. On failure or
        when the system is not initialised, the first element is an error
        panel and the other two are empty strings, so callers can always
        unpack three values.
    """
    if query_engine is None:
        not_ready = "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>"
        # Same arity as the success path (previously only two values).
        return not_ready, "", ""
    try:
        log_message(f"Получен вопрос: {question}")
        start_time = time.time()

        # Retrieval
        retrieved_nodes = query_engine.retriever.retrieve(question)
        log_message(f"Извлечено {len(retrieved_nodes)} узлов")

        # Detailed logging of the retrieved sources
        log_message("=== ДЕТАЛЬНАЯ ИНФОРМАЦИЯ О НАЙДЕННЫХ УЗЛАХ ===")
        for i, node in enumerate(retrieved_nodes):
            log_message(f"Узел {i+1}:")
            log_message(f"  Документ: {node.metadata.get('document_id', 'unknown')}")
            log_message(f"  Тип: {node.metadata.get('type', 'unknown')}")
            log_message(f"  Раздел: {node.metadata.get('section_id', 'unknown')}")
            log_message(f"  Текст (первые 400 символов): {node.text[:400]}...")
            log_message(f"  Метаданные: {node.metadata}")

        # Reranking
        reranked_nodes = rerank_nodes(question, retrieved_nodes, reranker, top_k=10)
        log_message("=== УЗЛЫ ПОСЛЕ ПЕРЕРАНЖИРОВКИ ===")
        for i, node in enumerate(reranked_nodes):
            log_message(f"Переранжированный узел {i+1}:")
            log_message(f"  Документ: {node.metadata.get('document_id', 'unknown')}")
            log_message(f"  Тип: {node.metadata.get('type', 'unknown')}")
            log_message(f"  Раздел: {node.metadata.get('section_id', 'unknown')}")
            log_message(f"  Полный текст: {node.text}")

        formatted_context = format_context_for_llm(reranked_nodes)
        log_message(f"ПОЛНЫЙ КОНТЕКСТ ДЛЯ LLM:\n{formatted_context}")
        enhanced_question = f"""
Контекст из базы данных:
{formatted_context}
Вопрос пользователя: {question}"""
        response = query_engine.query(enhanced_question)
        log_message(f"ОТВЕТ LLM: {response.response}")

        end_time = time.time()
        processing_time = end_time - start_time
        log_message(f"Обработка завершена за {processing_time:.2f} секунд")

        sources_html = generate_sources_html(reranked_nodes, chunks_df)
        answer_with_time = f"""<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; margin-bottom: 10px;'>
        <h3 style='color: #63b3ed; margin-top: 0;'>Ответ (Модель: {current_model}):</h3>
        <div style='line-height: 1.6; font-size: 16px;'>{response.response}</div>
        <div style='margin-top: 15px; padding-top: 10px; border-top: 1px solid #4a5568; font-size: 14px; color: #a0aec0;'>
        Время обработки: {processing_time:.2f} секунд
        </div>
        </div>"""

        chunk_info = []
        for node in reranked_nodes:
            metadata = node.metadata if hasattr(node, 'metadata') else {}
            chunk_info.append({
                'document_id': metadata.get('document_id', 'unknown'),
                'section_id': metadata.get('section_id', metadata.get('section', 'unknown')),
                'section_path': metadata.get('section_path', ''),
                'section_text': metadata.get('section_text', ''),
                'level': metadata.get('level', ''),
                'parent_section': metadata.get('parent_section', ''),
                'parent_title': metadata.get('parent_title', ''),
                'type': metadata.get('type', 'text'),
                'table_number': metadata.get('table_number', ''),
                'image_number': metadata.get('image_number', ''),
                'chunk_size': len(node.text),
                'chunk_text': node.text
            })
        # Imported at call time, as in the original code.
        # NOTE(review): presumably to avoid a circular import with app.py — confirm.
        from app import create_chunks_display_html
        chunks_html = create_chunks_display_html(chunk_info)
        return answer_with_time, sources_html, chunks_html
    except Exception as e:
        log_message(f"Ошибка обработки вопроса: {str(e)}")
        error_msg = f"<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Ошибка обработки вопроса: {str(e)}</div>"
        # Same arity as the success path (previously only two values).
        return error_msg, "", ""
 def get_llm_model(model_name):
     try:
         model_config = AVAILABLE_MODELS.get(model_name)
     html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
     html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
+    sources_by_doc = {}
     for i, node in enumerate(nodes):
         metadata = node.metadata if hasattr(node, 'metadata') else {}
         doc_type = metadata.get('type', 'text')
         doc_id = metadata.get('document_id', 'unknown')
+        section_id = metadata.get('section_id', '')
+        section_text = metadata.get('section_text', '')
+        section_path = metadata.get('section_path', '')
+        # Create a unique key for grouping
+        if doc_type == 'table':
+            table_num = metadata.get('table_number', 'unknown')
+            key = f"{doc_id}_table_{table_num}"
+        elif doc_type == 'image':
+            image_num = metadata.get('image_number', 'unknown')
+            key = f"{doc_id}_image_{image_num}"
+        else:
+            # For text documents, group by section path or section id
+            section_key = section_path if section_path else section_id
+            key = f"{doc_id}_text_{section_key}"
+        if key not in sources_by_doc:
+            sources_by_doc[key] = {
+                'doc_id': doc_id,
+                'doc_type': doc_type,
+                'metadata': metadata,
+                'sections': set()
+            }
+        # Add section information
+        if section_path:
+            sources_by_doc[key]['sections'].add(f"пункт {section_path}")
+        elif section_id and section_id != 'unknown':
+            sources_by_doc[key]['sections'].add(f"пункт {section_id}")
+    # Generate HTML for each unique source
+    for source_info in sources_by_doc.values():
+        metadata = source_info['metadata']
+        doc_type = source_info['doc_type']
+        doc_id = source_info['doc_id']
         html += f"<div style='margin-bottom: 15px; padding: 15px; border: 1px solid #4a5568; border-radius: 8px; background-color: #1a202c;'>"
         if doc_type == 'text':
             html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
+        elif doc_type == 'table' or doc_type == 'table_row':
             table_num = metadata.get('table_number', 'unknown')
+            table_title = metadata.get('table_title', '')
             if table_num and table_num != 'unknown':
+                if not str(table_num).startswith('№'):
                     table_num = f"№{table_num}"
                 html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица {table_num} - {doc_id}</h4>"
+                if table_title and table_title != 'unknown':
+                    html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{table_title}</p>"
             else:
                 html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc_id}</h4>"
         elif doc_type == 'image':
             image_num = metadata.get('image_number', 'unknown')
+            image_title = metadata.get('image_title', '')
             section = metadata.get('section', '')
             if image_num and image_num != 'unknown':
                 if not str(image_num).startswith('№'):
                     image_num = f"№{image_num}"
+                html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {image_num} - {doc_id}</h4>"
+                if image_title and image_title != 'unknown':
+                    html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{image_title}</p>"
+                if section and section != 'unknown':
+                    html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 12px;'>Раздел: {section}</p>"
             else:
+                html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение - {doc_id}</h4>"
+        # Add file link if available
         if chunks_df is not None and 'file_link' in chunks_df.columns and doc_type == 'text':
             doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
             if not doc_rows.empty:
     try:
         log_message(f"Получен вопрос: {question}")
         start_time = time.time()
+        # Извлечение узлов
         retrieved_nodes = query_engine.retriever.retrieve(question)
         log_message(f"Извлечено {len(retrieved_nodes)} узлов")
+        # ДЕТАЛЬНОЕ ЛОГИРОВАНИЕ ИСТОЧНИКОВ
+        log_message("=== ДЕТАЛЬНАЯ ИНФОРМАЦИЯ О НАЙДЕННЫХ УЗЛАХ ===")
+        for i, node in enumerate(retrieved_nodes):
+            log_message(f"Узел {i+1}:")
+            log_message(f"  Документ: {node.metadata.get('document_id', 'unknown')}")
+            log_message(f"  Тип: {node.metadata.get('type', 'unknown')}")
+            log_message(f"  Раздел: {node.metadata.get('section_id', 'unknown')}")
+            log_message(f"  Текст (первые 400 символов): {node.text[:400]}...")
+            log_message(f"  Метаданные: {node.metadata}")
+        # Переранжировка
         reranked_nodes = rerank_nodes(question, retrieved_nodes, reranker, top_k=10)
+        log_message("=== УЗЛЫ ПОСЛЕ ПЕРЕРАНЖИРОВКИ ===")
+        for i, node in enumerate(reranked_nodes):
+            log_message(f"Переранжированный узел {i+1}:")
+            log_message(f"  Документ: {node.metadata.get('document_id', 'unknown')}")
+            log_message(f"  Тип: {node.metadata.get('type', 'unknown')}")
+            log_message(f"  Раздел: {node.metadata.get('section_id', 'unknown')}")
+            log_message(f"  Полный текст: {node.text}")
         formatted_context = format_context_for_llm(reranked_nodes)
+        log_message(f"ПОЛНЫЙ КОНТЕКСТ ДЛЯ LLM:\n{formatted_context}")
         enhanced_question = f"""
 Контекст из базы данных:
 Вопрос пользователя: {question}"""
         response = query_engine.query(enhanced_question)
+        log_message(f"ОТВЕТ LLM: {response.response}")
         end_time = time.time()
         processing_time = end_time - start_time