Commit 33c996e · Parent: 5f6b6af
new api = retrieve chunks + some more text fixing
Files changed:
- app.py +49 -1
- config.py +5 -2
- documents_prep.py +3 -51
- table_prep.py +3 -12
- utils.py +139 -70
app.py
CHANGED
@@ -248,7 +248,49 @@ def main_answer_question(question):
                 "<div style='color: black;'>Источники недоступны из-за ошибки</div>",
                 "<div style='color: black;'>Чанки недоступны из-за ошибки</div>")
 
-
+def retrieve_chunks(question, top_k=20):
+    from index_retriever import rerank_nodes
+    global query_engine, reranker
+
+    if query_engine is None:
+        return "Система не инициализирована"
+
+    try:
+        retrieved_nodes = query_engine.retriever.retrieve(question)
+        log_message(f"Получено {len(retrieved_nodes)} узлов")
+
+        # Rerank nodes
+        reranked_nodes = rerank_nodes(
+            question,
+            retrieved_nodes,
+            reranker,
+            top_k=top_k,
+            min_score_threshold=0.5
+        )
+
+        chunks_data = []
+        for i, node in enumerate(reranked_nodes):
+            metadata = node.metadata if hasattr(node, 'metadata') else {}
+
+            chunk = {
+                'rank': i + 1,
+                'document_id': metadata.get('document_id', 'unknown'),
+                'section_id': metadata.get('section_id', ''),
+                'section_path': metadata.get('section_path', ''),
+                'section_text': metadata.get('section_text', ''),
+                'type': metadata.get('type', 'text'),
+                'table_number': metadata.get('table_number', ''),
+                'image_number': metadata.get('image_number', ''),
+                'text': node.text
+            }
+            chunks_data.append(chunk)
+
+        log_message(f"Возвращено {len(chunks_data)} чанков")
+        return chunks_data
+
+    except Exception as e:
+        log_message(f"Ошибка получения чанков: {str(e)}")
+        return f"Ошибка: {str(e)}"
 
 def create_demo_interface(answer_question_func, switch_model_func, current_model, chunk_info=None):
     with gr.Blocks(title="AIEXP - AI Expert для нормативной документации", theme=gr.themes.Soft()) as demo:
@@ -361,6 +403,9 @@ def main_switch_model(model_name):
 
     return status_message
 
+gr.api(retrieve_chunks, api_name="retrieve_chunks")
+
+
 def main():
     global query_engine, chunks_df, reranker, vector_index, current_model
     GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
@@ -387,6 +432,9 @@ def main():
         current_model=current_model,
         chunk_info=chunk_info
     )
+    demo.api = "retrieve_chunks"
+    demo.queue()
+
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
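The new endpoint can be exercised from outside the Space. Below is a minimal client sketch, assuming the app is reachable at http://localhost:7860 and that gr.api() exposes the function as "/retrieve_chunks"; both assumptions are illustrative, not confirmed by this commit.

# Hypothetical client call for the new retrieve_chunks endpoint.
from gradio_client import Client

client = Client("http://localhost:7860")  # assumed local URL
# Positional args mirror retrieve_chunks(question, top_k=20).
result = client.predict(
    "Требования к сварным соединениям",
    20,
    api_name="/retrieve_chunks",  # assumed endpoint name
)
# On success retrieve_chunks returns a list of dicts; on failure, a string.
if isinstance(result, list):
    for chunk in result[:3]:
        print(chunk["rank"], chunk["document_id"], chunk["text"][:80])
else:
    print(result)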
config.py
CHANGED
@@ -49,8 +49,11 @@ AVAILABLE_MODELS = {
 
 DEFAULT_MODEL = "Gemini 2.5 Flash"
 
-CHUNK_SIZE =
-CHUNK_OVERLAP =
+CHUNK_SIZE = 1500
+CHUNK_OVERLAP = 128
+
+MAX_CHARS_TABLE = 2500
+MAX_ROWS_TABLE = 10
 
 CUSTOM_PROMPT = """
 Вы являетесь высокоспециализированным Ассистентом для анализа нормативных документов (AIEXP). Ваша цель - предоставлять точные, корректные и контекстно релевантные ответы исключительно на основе предоставленного контекста из нормативной документации.
documents_prep.py
CHANGED
@@ -5,10 +5,7 @@ from huggingface_hub import hf_hub_download, list_repo_files
 from llama_index.core import Document
 from llama_index.core.text_splitter import SentenceSplitter
 from my_logging import log_message
-
-# Configuration
-CHUNK_SIZE = 1500
-CHUNK_OVERLAP = 128
+from config import CHUNK_SIZE, CHUNK_OVERLAP, MAX_CHARS_TABLE, MAX_ROWS_TABLE
 
 def chunk_text_documents(documents):
     text_splitter = SentenceSplitter(
@@ -38,8 +35,7 @@ def chunk_text_documents(documents):
     return chunked
 
 
-def chunk_table_by_content(table_data, doc_id, max_chars=2500, max_rows=10):
-    """Chunk tables by content size AND row count"""
+def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
     headers = table_data.get('headers', [])
     rows = table_data.get('data', [])
     table_num = table_data.get('table_number', 'unknown')
@@ -48,7 +44,6 @@ def chunk_table_by_content(table_data, doc_id, max_chars=2500, max_rows=10):
 
     table_num_clean = str(table_num).strip()
 
-    # Create section-aware identifier
     import re
     if 'приложени' in section.lower():
         appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
@@ -89,8 +84,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=2500, max_rows=10):
 
         log_message(f"  Single chunk: {len(content)} chars, {len(rows)} rows")
         return [Document(text=content, metadata=metadata)]
-
-    # Otherwise, chunk by BOTH content size AND row count
+
     chunks = []
     current_rows = []
     current_size = 0
@@ -100,7 +94,6 @@ def chunk_table_by_content(table_data, doc_id, max_chars=2500, max_rows=10):
         row_text = format_single_row(row, i + 1)
         row_size = len(row_text)
 
-        # Check BOTH limits: size AND row count
         should_split = (current_size + row_size > available_space or len(current_rows) >= max_rows) and current_rows
 
         if should_split:
@@ -203,43 +196,8 @@ def format_table_rows(rows):
 
 
 def format_table_footer(table_identifier, doc_id):
-    """Format table footer"""
     return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
 
-def load_table_documents(repo_id, hf_token, table_dir):
-    log_message("Loading tables...")
-
-    files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
-    table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
-
-    all_chunks = []
-    for file_path in table_files:
-        try:
-            local_path = hf_hub_download(
-                repo_id=repo_id,
-                filename=file_path,
-                repo_type="dataset",
-                token=hf_token
-            )
-
-            with open(local_path, 'r', encoding='utf-8') as f:
-                data = json.load(f)
-
-            file_doc_id = data.get('document_id', data.get('document', 'unknown'))
-
-            for sheet in data.get('sheets', []):
-                sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
-
-                chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=1000)
-                all_chunks.extend(chunks)
-
-        except Exception as e:
-            log_message(f"Error loading {file_path}: {e}")
-
-    log_message(f"✓ Loaded {len(all_chunks)} table chunks")
-    return all_chunks
-
-
 def load_json_documents(repo_id, hf_token, json_dir):
     import zipfile
     import tempfile
@@ -369,7 +327,6 @@ def load_json_documents(repo_id, hf_token, json_dir):
     return documents
 
 def extract_sections_from_json(json_path):
-    """Extract sections from a single JSON file"""
     documents = []
 
     try:
@@ -421,7 +378,6 @@ def extract_sections_from_json(json_path):
 
 
 def load_table_documents(repo_id, hf_token, table_dir):
-    """Load and chunk tables"""
     log_message("Loading tables...")
 
     files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
@@ -439,15 +395,11 @@ def load_table_documents(repo_id, hf_token, table_dir):
 
             with open(local_path, 'r', encoding='utf-8') as f:
                 data = json.load(f)
-
-            # Extract file-level document_id
             file_doc_id = data.get('document_id', data.get('document', 'unknown'))
 
             for sheet in data.get('sheets', []):
-                # Use sheet-level document_id if available, otherwise use file-level
                 sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
 
-                # CRITICAL: Pass document_id to chunk function
                 chunks = chunk_table_by_content(sheet, sheet_doc_id)
                 all_chunks.extend(chunks)
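The splitting rule in chunk_table_by_content starts a new chunk as soon as either budget is exhausted. A standalone sketch of just that condition (function and variable names are illustrative; the defaults mirror MAX_CHARS_TABLE = 2500 and MAX_ROWS_TABLE = 10 from config.py, and the real code additionally reserves header space via available_space):

# Illustrative dual-limit splitter: cut on character budget OR row budget.
def split_rows(row_texts, max_chars=2500, max_rows=10):
    chunks, current, size = [], [], 0
    for text in row_texts:
        # Same shape as the should_split check in chunk_table_by_content.
        if current and (size + len(text) > max_chars or len(current) >= max_rows):
            chunks.append(current)
            current, size = [], 0
        current.append(text)
        size += len(text)
    if current:
        chunks.append(current)
    return chunks

# 25 short rows hit the row budget first and split into groups of at most 10.
print([len(c) for c in split_rows([f"строка {i}" for i in range(25)])])  # [10, 10, 5]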
table_prep.py
CHANGED
@@ -3,12 +3,10 @@ import json
 from huggingface_hub import hf_hub_download, list_repo_files
 from llama_index.core import Document
 from my_logging import log_message
+from config import MAX_CHARS_TABLE, MAX_ROWS_TABLE
 
-MAX_ROWS_PER_CHUNK = 10
-MAX_CHUNK_SIZE = 4000
 
 def create_table_content(table_data):
-    """Create formatted content from table data"""
     doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
     table_num = table_data.get('table_number', 'Неизвестно')
     table_title = table_data.get('table_title', 'Неизвестно')
@@ -32,10 +30,9 @@ def create_table_content(table_data):
 
     return content
 
-def chunk_table_document(doc, max_chunk_size=MAX_CHUNK_SIZE, max_rows_per_chunk=MAX_ROWS_PER_CHUNK):
+def chunk_table_document(doc, max_chunk_size=MAX_CHARS_TABLE, max_rows_per_chunk=MAX_ROWS_TABLE):
     lines = doc.text.strip().split('\n')
 
-    # Separate header and data rows
     header_lines = []
     data_rows = []
     in_data = False
@@ -99,8 +96,6 @@ def chunk_table_document(doc, max_chunk_size=MAX_CHUNK_SIZE, max_rows_per_chunk=MAX_ROWS_PER_CHUNK):
 
 
 def table_to_document(table_data, document_id=None):
-    """Convert table data to Document, chunk if needed"""
-
     if not isinstance(table_data, dict):
         return []
 
@@ -146,11 +141,7 @@ def table_to_document(table_data, document_id=None):
     return [base_doc]
 
 
-def load_table_data(repo_id, hf_token, table_data_dir):
-    log_message("=" * 60)
-    log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
-    log_message("=" * 60)
-
+def load_table_data(repo_id, hf_token, table_data_dir):
     try:
         files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
         table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
utils.py
CHANGED
@@ -43,6 +43,99 @@ def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingua
 def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
     return CrossEncoder(model_name)
 
+def format_context_for_llm(nodes):
+    context_parts = []
+
+    for node in nodes:
+        metadata = node.metadata if hasattr(node, 'metadata') else {}
+        doc_id = metadata.get('document_id', 'Неизвестный документ')
+
+        section_info = ""
+
+        # Handle section information with proper hierarchy
+        if metadata.get('section_path'):
+            section_path = metadata['section_path']
+            section_text = metadata.get('section_text', '')
+            parent_section = metadata.get('parent_section', '')
+            parent_title = metadata.get('parent_title', '')
+            level = metadata.get('level', '')
+
+            if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
+                # For subsections: раздел X (Title), пункт X.X
+                if section_text:
+                    section_info = f"раздел {parent_section} ({parent_title}), пункт {section_path} ({section_text})"
+                else:
+                    section_info = f"раздел {parent_section} ({parent_title}), пункт {section_path}"
+            elif section_text:
+                # For main sections: раздел X (Title)
+                section_info = f"раздел {section_path} ({section_text})"
+            else:
+                section_info = f"раздел {section_path}"
+
+        elif metadata.get('section_id'):
+            section_id = metadata['section_id']
+            section_text = metadata.get('section_text', '')
+            level = metadata.get('level', '')
+            parent_section = metadata.get('parent_section', '')
+            parent_title = metadata.get('parent_title', '')
+
+            if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
+                if section_text:
+                    section_info = f"раздел {parent_section} ({parent_title}), пункт {section_id} ({section_text})"
+                else:
+                    section_info = f"раздел {parent_section} ({parent_title}), пункт {section_id}"
+            elif section_text:
+                section_info = f"раздел {section_id} ({section_text})"
+            else:
+                section_info = f"раздел {section_id}"
+
+        # Override with table/image info if applicable
+        if metadata.get('type') == 'table' and metadata.get('table_number'):
+            table_num = metadata['table_number']
+            if not str(table_num).startswith('№'):
+                table_num = f"№{table_num}"
+            table_title = metadata.get('table_title', '')
+            # Include section context for tables
+            base_section = ""
+            if metadata.get('section_path'):
+                base_section = f", раздел {metadata['section_path']}"
+            elif metadata.get('section_id'):
+                base_section = f", раздел {metadata['section_id']}"
+
+            if table_title:
+                section_info = f"Таблица {table_num} ({table_title}){base_section}"
+            else:
+                section_info = f"Таблица {table_num}{base_section}"
+
+        if metadata.get('type') == 'image' and metadata.get('image_number'):
+            image_num = metadata['image_number']
+            if not str(image_num).startswith('№'):
+                image_num = f"№{image_num}"
+            image_title = metadata.get('image_title', '')
+            # Include section context for images
+            base_section = ""
+            if metadata.get('section_path'):
+                base_section = f", раздел {metadata['section_path']}"
+            elif metadata.get('section_id'):
+                base_section = f", раздел {metadata['section_id']}"
+
+            if image_title:
+                section_info = f"Рисунок {image_num} ({image_title}){base_section}"
+            else:
+                section_info = f"Рисунок {image_num}{base_section}"
+
+        context_text = node.text if hasattr(node, 'text') else str(node)
+
+        if section_info:
+            formatted_context = f"[ИСТОЧНИК: {section_info}, документ {doc_id}]\n{context_text}\n"
+        else:
+            formatted_context = f"[ИСТОЧНИК: документ {doc_id}]\n{context_text}\n"
+
+        context_parts.append(formatted_context)
+
+    return "\n".join(context_parts)
+
+
 def generate_sources_html(nodes, chunks_df=None):
     html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
     html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
@@ -53,16 +146,19 @@ def generate_sources_html(nodes, chunks_df=None):
         metadata = node.metadata if hasattr(node, 'metadata') else {}
         doc_type = metadata.get('type', 'text')
         doc_id = metadata.get('document_id', 'unknown')
+        section_id = metadata.get('section_id', '')
+        section_text = metadata.get('section_text', '')
+        section_path = metadata.get('section_path', '')
 
-
+        # Create a unique key for grouping
+        if doc_type == 'table':
             table_num = metadata.get('table_number', 'unknown')
             key = f"{doc_id}_table_{table_num}"
         elif doc_type == 'image':
             image_num = metadata.get('image_number', 'unknown')
             key = f"{doc_id}_image_{image_num}"
         else:
-
-            section_id = metadata.get('section_id', '')
+            # For text documents, group by section path or section id
             section_key = section_path if section_path else section_id
             key = f"{doc_id}_text_{section_key}"
@@ -74,14 +170,13 @@ def generate_sources_html(nodes, chunks_df=None):
             'sections': set()
         }
 
-
-
-
-
-
-        elif section_id and section_id != 'unknown':
-            sources_by_doc[key]['sections'].add(f"пункт {section_id}")
+        # Add section information
+        if section_path:
+            sources_by_doc[key]['sections'].add(f"пункт {section_path}")
+        elif section_id and section_id != 'unknown':
+            sources_by_doc[key]['sections'].add(f"пункт {section_id}")
 
+    # Generate HTML for each unique source
     for source_info in sources_by_doc.values():
         metadata = source_info['metadata']
         doc_type = source_info['doc_type']
@@ -91,6 +186,7 @@ def generate_sources_html(nodes, chunks_df=None):
 
         if doc_type == 'text':
            html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
+
        elif doc_type == 'table' or doc_type == 'table_row':
            table_num = metadata.get('table_number', 'unknown')
            table_title = metadata.get('table_title', '')
@@ -102,16 +198,23 @@ def generate_sources_html(nodes, chunks_df=None):
                html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{table_title}</p>"
            else:
                html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc_id}</h4>"
+
        elif doc_type == 'image':
            image_num = metadata.get('image_number', 'unknown')
            image_title = metadata.get('image_title', '')
+            section = metadata.get('section', '')
            if image_num and image_num != 'unknown':
                if not str(image_num).startswith('№'):
                    image_num = f"№{image_num}"
                html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {image_num} - {doc_id}</h4>"
                if image_title and image_title != 'unknown':
                    html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{image_title}</p>"
+                if section and section != 'unknown':
+                    html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 12px;'>Раздел: {section}</p>"
+            else:
+                html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение - {doc_id}</h4>"
 
+        # Add file link if available
        if chunks_df is not None and 'file_link' in chunks_df.columns and doc_type == 'text':
            doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
            if not doc_rows.empty:
@@ -123,56 +226,6 @@ def generate_sources_html(nodes, chunks_df=None):
        html += "</div>"
    return html
 
-def deduplicate_nodes(nodes):
-    """Deduplicate retrieved nodes based on content and metadata"""
-    seen = set()
-    unique_nodes = []
-
-    for node in nodes:
-        doc_id = node.metadata.get('document_id', '')
-        node_type = node.metadata.get('type', 'text')
-
-        if node_type == 'table' or node_type == 'table_row':
-            table_num = node.metadata.get('table_number', '')
-            table_identifier = node.metadata.get('table_identifier', table_num)
-
-            # Use row range to distinguish table chunks
-            row_start = node.metadata.get('row_start', '')
-            row_end = node.metadata.get('row_end', '')
-            is_complete = node.metadata.get('is_complete_table', False)
-
-            if is_complete:
-                identifier = f"{doc_id}|table|{table_identifier}|complete"
-            elif row_start != '' and row_end != '':
-                identifier = f"{doc_id}|table|{table_identifier}|rows_{row_start}_{row_end}"
-            else:
-                # Fallback: use chunk_id if available
-                chunk_id = node.metadata.get('chunk_id', '')
-                if chunk_id != '':
-                    identifier = f"{doc_id}|table|{table_identifier}|chunk_{chunk_id}"
-                else:
-                    # Last resort: hash first 100 chars of content
-                    import hashlib
-                    content_hash = hashlib.md5(node.text[:100].encode()).hexdigest()[:8]
-                    identifier = f"{doc_id}|table|{table_identifier}|{content_hash}"
-
-        elif node_type == 'image':
-            img_num = node.metadata.get('image_number', '')
-            identifier = f"{doc_id}|image|{img_num}"
-
-        else:  # text
-            section_id = node.metadata.get('section_id', '')
-            chunk_id = node.metadata.get('chunk_id', 0)
-            # For text, section_id + chunk_id should be unique
-            identifier = f"{doc_id}|text|{section_id}|{chunk_id}"
-
-        if identifier not in seen:
-            seen.add(identifier)
-            unique_nodes.append(node)
-
-    return unique_nodes
-
-
 def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
    if query_engine is None:
        return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
@@ -180,20 +233,33 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
    try:
        start_time = time.time()
 
-
+        llm = get_llm_model(current_model)
+
+        # Direct retrieval without query expansion
        retrieved_nodes = query_engine.retriever.retrieve(question)
 
-        log_message(f"
+        log_message(f"Получено {len(retrieved_nodes)} узлов")
+
+        reranked_nodes = rerank_nodes(
+            question,
+            retrieved_nodes,
+            reranker,
+            top_k=40,
+            min_score_threshold=0.5,
+            diversity_penalty=0.3
+        )
 
-
-        unique_retrieved = deduplicate_nodes(retrieved_nodes)
-        log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
+        formatted_context = format_context_for_llm(reranked_nodes)
 
-
-
+        enhanced_question = f"""Контекст из базы данных:
+{formatted_context}
+
+Вопрос пользователя: {question}
+
+Инструкция: Ответь на вопрос, используя ТОЛЬКО информацию из контекста выше.
+Если информации недостаточно, четко укажи это. Цитируй конкретные источники."""
 
-
-        response = query_engine.query(question)
+        response = query_engine.query(enhanced_question)
 
        end_time = time.time()
        processing_time = end_time - start_time
@@ -215,9 +281,12 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
        metadata = node.metadata if hasattr(node, 'metadata') else {}
        chunk_info.append({
            'document_id': metadata.get('document_id', 'unknown'),
-            'section_id': metadata.get('section_id', 'unknown'),
+            'section_id': metadata.get('section_id', metadata.get('section', 'unknown')),
            'section_path': metadata.get('section_path', ''),
            'section_text': metadata.get('section_text', ''),
+            'level': metadata.get('level', ''),
+            'parent_section': metadata.get('parent_section', ''),
+            'parent_title': metadata.get('parent_title', ''),
            'type': metadata.get('type', 'text'),
            'table_number': metadata.get('table_number', ''),
            'image_number': metadata.get('image_number', ''),