Spaces:

MrSimple01
/

RAG_AIEXP_1

Sleeping

App Files Files Community

MrSimple07 committed on Oct 7, 2025

Commit

566457a

1 Parent(s): 09d215a

new rag with max chunk size + api for chunks

Browse files

Files changed (3) hide show

app.py +48 -0
table_prep.py +3 -12
utils.py +69 -140

app.py CHANGED Viewed

@@ -248,10 +248,52 @@ def main_answer_question(question):
                 "<div style='color: black;'>Источники недоступны из-за ошибки</div>",
                 "<div style='color: black;'>Чанки недоступны из-за ошибки</div>")
 def create_demo_interface(answer_question_func, switch_model_func, current_model, chunk_info=None):
     with gr.Blocks(title="AIEXP - AI Expert для нормативной документации", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
         # AIEXP - Artificial Intelligence Expert
@@ -361,6 +403,9 @@ def main_switch_model(model_name):
     return status_message
 def main():
     global query_engine, chunks_df, reranker, vector_index, current_model
     GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
@@ -387,6 +432,9 @@ def main():
             current_model=current_model,
             chunk_info=chunk_info
         )
         demo.launch(
             server_name="0.0.0.0",
             server_port=7860,

                 "<div style='color: black;'>Источники недоступны из-за ошибки</div>",
                 "<div style='color: black;'>Чанки недоступны из-за ошибки</div>")
def retrieve_chunks(question: str, top_k: int = 20) -> list:
    """Retrieve and rerank chunks for *question*, returned as plain dicts.

    Args:
        question: The query text passed to the retriever and the reranker.
        top_k: Forwarded to ``rerank_nodes`` as its ``top_k`` argument.

    Returns:
        A list of dicts, one per reranked node, with the keys ``rank``
        (1-based position), ``document_id``, ``section_id``,
        ``section_path``, ``section_text``, ``type``, ``table_number``,
        ``image_number`` and ``text``. An empty list is returned when the
        global ``query_engine`` is ``None`` or when any exception is raised
        during retrieval, reranking, or serialisation (the error is logged).
    """
    from index_retriever import rerank_nodes
    global query_engine, reranker

    # Nothing to search until the engine has been initialised.
    if query_engine is None:
        return []

    def _as_chunk(position, node):
        # Nodes without a ``metadata`` attribute fall back to the defaults below.
        meta = node.metadata if hasattr(node, 'metadata') else {}
        return {
            'rank': position,
            'document_id': meta.get('document_id', 'unknown'),
            'section_id': meta.get('section_id', ''),
            'section_path': meta.get('section_path', ''),
            'section_text': meta.get('section_text', ''),
            'type': meta.get('type', 'text'),
            'table_number': meta.get('table_number', ''),
            'image_number': meta.get('image_number', ''),
            'text': node.text
        }

    try:
        candidates = query_engine.retriever.retrieve(question)
        log_message(f"Получено {len(candidates)} узлов")

        ranked = rerank_nodes(
            question,
            candidates,
            reranker,
            top_k=top_k,
            min_score_threshold=0.5
        )

        chunks_data = [
            _as_chunk(position, node)
            for position, node in enumerate(ranked, start=1)
        ]
        log_message(f"Возвращено {len(chunks_data)} чанков")
        return chunks_data
    except Exception as e:
        # API boundary: report the failure and degrade to an empty result.
        log_message(f"Ошибка получения чанков: {str(e)}")
        return []
 def create_demo_interface(answer_question_func, switch_model_func, current_model, chunk_info=None):
     with gr.Blocks(title="AIEXP - AI Expert для нормативной документации", theme=gr.themes.Soft()) as demo:
+        gr.api(retrieve_chunks, api_name="retrieve_chunks")
         gr.Markdown("""
         # AIEXP - Artificial Intelligence Expert
     return status_message
 def main():
     global query_engine, chunks_df, reranker, vector_index, current_model
     GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
             current_model=current_model,
             chunk_info=chunk_info
         )
+        demo.api = "retrieve_chunks"
+        demo.queue()
         demo.launch(
             server_name="0.0.0.0",
             server_port=7860,

table_prep.py CHANGED Viewed

@@ -3,12 +3,10 @@ import json
 from huggingface_hub import hf_hub_download, list_repo_files
 from llama_index.core import Document
 from my_logging import log_message
-MAX_ROWS_PER_CHUNK = 10
-MAX_CHUNK_SIZE = 4000
 def create_table_content(table_data):
-    """Create formatted content from table data"""
     doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
     table_num = table_data.get('table_number', 'Неизвестно')
     table_title = table_data.get('table_title', 'Неизвестно')
@@ -32,10 +30,9 @@ def create_table_content(table_data):
     return content
-def chunk_table_document(doc, max_chunk_size=MAX_CHUNK_SIZE, max_rows_per_chunk=MAX_ROWS_PER_CHUNK):
     lines = doc.text.strip().split('\n')
-    # Separate header and data rows
     header_lines = []
     data_rows = []
     in_data = False
@@ -99,8 +96,6 @@ def chunk_table_document(doc, max_chunk_size=MAX_CHUNK_SIZE, max_rows_per_chunk=
 def table_to_document(table_data, document_id=None):
-    """Convert table data to Document, chunk if needed"""
     if not isinstance(table_data, dict):
         return []
@@ -146,11 +141,7 @@ def table_to_document(table_data, document_id=None):
     return [base_doc]
-def load_table_data(repo_id, hf_token, table_data_dir):
-    log_message("=" * 60)
-    log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
-    log_message("=" * 60)
     try:
         files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
         table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]

 from huggingface_hub import hf_hub_download, list_repo_files
 from llama_index.core import Document
 from my_logging import log_message
+from config import MAX_CHARS_TABLE, MAX_ROWS_TABLE
 def create_table_content(table_data):
     doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
     table_num = table_data.get('table_number', 'Неизвестно')
     table_title = table_data.get('table_title', 'Неизвестно')
     return content
+def chunk_table_document(doc, max_chunk_size=MAX_CHARS_TABLE, max_rows_per_chunk=MAX_ROWS_TABLE):
     lines = doc.text.strip().split('\n')
     header_lines = []
     data_rows = []
     in_data = False
 def table_to_document(table_data, document_id=None):
     if not isinstance(table_data, dict):
         return []
     return [base_doc]
+def load_table_data(repo_id, hf_token, table_data_dir):
     try:
         files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
         table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]

utils.py CHANGED Viewed

@@ -43,99 +43,6 @@ def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingua
 def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
     return CrossEncoder(model_name)
-def format_context_for_llm(nodes):
-    context_parts = []
-    for node in nodes:
-        metadata = node.metadata if hasattr(node, 'metadata') else {}
-        doc_id = metadata.get('document_id', 'Неизвестный документ')
-        section_info = ""
-        # Handle section information with proper hierarchy
-        if metadata.get('section_path'):
-            section_path = metadata['section_path']
-            section_text = metadata.get('section_text', '')
-            parent_section = metadata.get('parent_section', '')
-            parent_title = metadata.get('parent_title', '')
-            level = metadata.get('level', '')
-            if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
-                # For subsections: раздел X (Title), пункт X.X
-                if section_text:
-                    section_info = f"раздел {parent_section} ({parent_title}), пункт {section_path} ({section_text})"
-                else:
-                    section_info = f"раздел {parent_section} ({parent_title}), пункт {section_path}"
-            elif section_text:
-                # For main sections: раздел X (Title)
-                section_info = f"раздел {section_path} ({section_text})"
-            else:
-                section_info = f"раздел {section_path}"
-        elif metadata.get('section_id'):
-            section_id = metadata['section_id']
-            section_text = metadata.get('section_text', '')
-            level = metadata.get('level', '')
-            parent_section = metadata.get('parent_section', '')
-            parent_title = metadata.get('parent_title', '')
-            if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
-                if section_text:
-                    section_info = f"раздел {parent_section} ({parent_title}), пункт {section_id} ({section_text})"
-                else:
-                    section_info = f"раздел {parent_section} ({parent_title}), пункт {section_id}"
-            elif section_text:
-                section_info = f"раздел {section_id} ({section_text})"
-            else:
-                section_info = f"раздел {section_id}"
-        # Override with table/image info if applicable
-        if metadata.get('type') == 'table' and metadata.get('table_number'):
-            table_num = metadata['table_number']
-            if not str(table_num).startswith('№'):
-                table_num = f"№{table_num}"
-            table_title = metadata.get('table_title', '')
-            # Include section context for tables
-            base_section = ""
-            if metadata.get('section_path'):
-                base_section = f", раздел {metadata['section_path']}"
-            elif metadata.get('section_id'):
-                base_section = f", раздел {metadata['section_id']}"
-            if table_title:
-                section_info = f"Таблица {table_num} ({table_title}){base_section}"
-            else:
-                section_info = f"Таблица {table_num}{base_section}"
-        if metadata.get('type') == 'image' and metadata.get('image_number'):
-            image_num = metadata['image_number']
-            if not str(image_num).startswith('№'):
-                image_num = f"№{image_num}"
-            image_title = metadata.get('image_title', '')
-            # Include section context for images
-            base_section = ""
-            if metadata.get('section_path'):
-                base_section = f", раздел {metadata['section_path']}"
-            elif metadata.get('section_id'):
-                base_section = f", раздел {metadata['section_id']}"
-            if image_title:
-                section_info = f"Рисунок {image_num} ({image_title}){base_section}"
-            else:
-                section_info = f"Рисунок {image_num}{base_section}"
-        context_text = node.text if hasattr(node, 'text') else str(node)
-        if section_info:
-            formatted_context = f"[ИСТОЧНИК: {section_info}, документ {doc_id}]\n{context_text}\n"
-        else:
-            formatted_context = f"[ИСТОЧНИК: документ {doc_id}]\n{context_text}\n"
-        context_parts.append(formatted_context)
-    return "\n".join(context_parts)
 def generate_sources_html(nodes, chunks_df=None):
     html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
     html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
@@ -146,19 +53,16 @@ def generate_sources_html(nodes, chunks_df=None):
         metadata = node.metadata if hasattr(node, 'metadata') else {}
         doc_type = metadata.get('type', 'text')
         doc_id = metadata.get('document_id', 'unknown')
-        section_id = metadata.get('section_id', '')
-        section_text = metadata.get('section_text', '')
-        section_path = metadata.get('section_path', '')
-        # Create a unique key for grouping
-        if doc_type == 'table':
             table_num = metadata.get('table_number', 'unknown')
             key = f"{doc_id}_table_{table_num}"
         elif doc_type == 'image':
             image_num = metadata.get('image_number', 'unknown')
             key = f"{doc_id}_image_{image_num}"
         else:
-            # For text documents, group by section path or section id
             section_key = section_path if section_path else section_id
             key = f"{doc_id}_text_{section_key}"
@@ -170,13 +74,14 @@ def generate_sources_html(nodes, chunks_df=None):
                 'sections': set()
             }
-        # Add section information
-        if section_path:
-            sources_by_doc[key]['sections'].add(f"пункт {section_path}")
-        elif section_id and section_id != 'unknown':
-            sources_by_doc[key]['sections'].add(f"пункт {section_id}")
-    # Generate HTML for each unique source
     for source_info in sources_by_doc.values():
         metadata = source_info['metadata']
         doc_type = source_info['doc_type']
@@ -186,7 +91,6 @@ def generate_sources_html(nodes, chunks_df=None):
         if doc_type == 'text':
             html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
         elif doc_type == 'table' or doc_type == 'table_row':
             table_num = metadata.get('table_number', 'unknown')
             table_title = metadata.get('table_title', '')
@@ -198,23 +102,16 @@ def generate_sources_html(nodes, chunks_df=None):
                     html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{table_title}</p>"
             else:
                 html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc_id}</h4>"
         elif doc_type == 'image':
             image_num = metadata.get('image_number', 'unknown')
             image_title = metadata.get('image_title', '')
-            section = metadata.get('section', '')
             if image_num and image_num != 'unknown':
                 if not str(image_num).startswith('№'):
                     image_num = f"№{image_num}"
                 html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {image_num} - {doc_id}</h4>"
                 if image_title and image_title != 'unknown':
                     html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{image_title}</p>"
-                if section and section != 'unknown':
-                    html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 12px;'>Раздел: {section}</p>"
-            else:
-                html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение - {doc_id}</h4>"
-        # Add file link if available
         if chunks_df is not None and 'file_link' in chunks_df.columns and doc_type == 'text':
             doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
             if not doc_rows.empty:
@@ -225,40 +122,75 @@ def generate_sources_html(nodes, chunks_df=None):
     html += "</div>"
     return html
 def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
     if query_engine is None:
         return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
     try:
         start_time = time.time()
-        llm = get_llm_model(current_model)
-        # Direct retrieval without query expansion
         retrieved_nodes = query_engine.retriever.retrieve(question)
-        log_message(f"Получено {len(retrieved_nodes)} узлов")
-        reranked_nodes = rerank_nodes(
-            question,
-            retrieved_nodes,
-            reranker,
-            top_k=40,
-            min_score_threshold=0.5,
-            diversity_penalty=0.3
-        )
-        formatted_context = format_context_for_llm(reranked_nodes)
-        enhanced_question = f"""Контекст из базы данных:
-{formatted_context}
-Вопрос пользователя: {question}
-Инструкция: Ответь на вопрос, используя ТОЛЬКО информацию из контекста выше.
-Если информации недостаточно, четко укажи это. Цитируй конкретные источники."""
-        response = query_engine.query(enhanced_question)
         end_time = time.time()
         processing_time = end_time - start_time
@@ -280,12 +212,9 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
             metadata = node.metadata if hasattr(node, 'metadata') else {}
             chunk_info.append({
                 'document_id': metadata.get('document_id', 'unknown'),
-                'section_id': metadata.get('section_id', metadata.get('section', 'unknown')),
                 'section_path': metadata.get('section_path', ''),
                 'section_text': metadata.get('section_text', ''),
-                'level': metadata.get('level', ''),
-                'parent_section': metadata.get('parent_section', ''),
-                'parent_title': metadata.get('parent_title', ''),
                 'type': metadata.get('type', 'text'),
                 'table_number': metadata.get('table_number', ''),
                 'image_number': metadata.get('image_number', ''),

 def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
     """Construct and return a ``CrossEncoder`` for *model_name*."""
     cross_encoder = CrossEncoder(model_name)
     return cross_encoder
 def generate_sources_html(nodes, chunks_df=None):
     html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
     html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
         metadata = node.metadata if hasattr(node, 'metadata') else {}
         doc_type = metadata.get('type', 'text')
         doc_id = metadata.get('document_id', 'unknown')
+        if doc_type == 'table' or doc_type == 'table_row':
             table_num = metadata.get('table_number', 'unknown')
             key = f"{doc_id}_table_{table_num}"
         elif doc_type == 'image':
             image_num = metadata.get('image_number', 'unknown')
             key = f"{doc_id}_image_{image_num}"
         else:
+            section_path = metadata.get('section_path', '')
+            section_id = metadata.get('section_id', '')
             section_key = section_path if section_path else section_id
             key = f"{doc_id}_text_{section_key}"
                 'sections': set()
             }
+        if doc_type not in ['table', 'table_row', 'image']:
+            section_path = metadata.get('section_path', '')
+            section_id = metadata.get('section_id', '')
+            if section_path:
+                sources_by_doc[key]['sections'].add(f"пункт {section_path}")
+            elif section_id and section_id != 'unknown':
+                sources_by_doc[key]['sections'].add(f"пункт {section_id}")
     for source_info in sources_by_doc.values():
         metadata = source_info['metadata']
         doc_type = source_info['doc_type']
         if doc_type == 'text':
             html += f"<h4 style='margin: 0 0 10px 0; color: #63b3ed;'>📄 {doc_id}</h4>"
         elif doc_type == 'table' or doc_type == 'table_row':
             table_num = metadata.get('table_number', 'unknown')
             table_title = metadata.get('table_title', '')
                     html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{table_title}</p>"
             else:
                 html += f"<h4 style='margin: 0 0 10px 0; color: #68d391;'>📊 Таблица - {doc_id}</h4>"
         elif doc_type == 'image':
             image_num = metadata.get('image_number', 'unknown')
             image_title = metadata.get('image_title', '')
             if image_num and image_num != 'unknown':
                 if not str(image_num).startswith('№'):
                     image_num = f"№{image_num}"
                 html += f"<h4 style='margin: 0 0 10px 0; color: #fbb6ce;'>🖼️ Изображение {image_num} - {doc_id}</h4>"
                 if image_title and image_title != 'unknown':
                     html += f"<p style='margin: 5px 0; color: #a0aec0; font-size: 14px;'>{image_title}</p>"
         if chunks_df is not None and 'file_link' in chunks_df.columns and doc_type == 'text':
             doc_rows = chunks_df[chunks_df['document_id'] == doc_id]
             if not doc_rows.empty:
     html += "</div>"
     return html
def deduplicate_nodes(nodes):
    """Return *nodes* with duplicates removed, keeping first-seen order.

    Each node is reduced to an identity string built from its metadata, and
    only the first node carrying a given identity is kept:

    * tables (``type`` of ``'table'`` or ``'table_row'``): document id and
      table identifier, plus the first available of: the complete-table
      flag, the ``row_start``/``row_end`` range, ``chunk_id``, or a hash of
      the node text;
    * images: document id and ``image_number``;
    * anything else (text): document id, ``section_id`` and ``chunk_id``,
      or a hash of the node text when ``chunk_id`` is absent.

    Args:
        nodes: Iterable of objects exposing ``metadata`` (a dict) and
            ``text``. A node without ``metadata`` is treated as having empty
            metadata; a node without ``text`` is hashed via ``str(node)``.

    Returns:
        A new list containing the unique nodes.
    """
    # Local import: the file's top-level import block is not visible here.
    import hashlib

    def _content_hash(node):
        # Hash the WHOLE text. Hashing only a prefix would merge distinct
        # chunks that begin with the same text (e.g. a repeated table header).
        text = getattr(node, 'text', None)
        if text is None:
            text = str(node)
        return hashlib.md5(text.encode('utf-8')).hexdigest()

    def _present(value):
        # Metadata values count as missing when absent, None or ''.
        return value is not None and value != ''

    seen = set()
    unique_nodes = []

    for node in nodes:
        metadata = getattr(node, 'metadata', None) or {}
        doc_id = metadata.get('document_id', '')
        node_type = metadata.get('type', 'text')

        if node_type in ('table', 'table_row'):
            table_num = metadata.get('table_number', '')
            table_identifier = metadata.get('table_identifier', table_num)
            prefix = f"{doc_id}|table|{table_identifier}"

            # Row range distinguishes the chunks of one table.
            row_start = metadata.get('row_start', '')
            row_end = metadata.get('row_end', '')
            chunk_id = metadata.get('chunk_id', '')

            if metadata.get('is_complete_table', False):
                identifier = f"{prefix}|complete"
            elif row_start != '' and row_end != '':
                identifier = f"{prefix}|rows_{row_start}_{row_end}"
            elif _present(chunk_id):
                identifier = f"{prefix}|chunk_{chunk_id}"
            else:
                # Last resort: identity by content.
                identifier = f"{prefix}|{_content_hash(node)}"
        elif node_type == 'image':
            img_num = metadata.get('image_number', '')
            identifier = f"{doc_id}|image|{img_num}"
        else:  # text
            section_id = metadata.get('section_id', '')
            chunk_id = metadata.get('chunk_id')
            if _present(chunk_id):
                # section_id + chunk_id is expected to be unique for text.
                identifier = f"{doc_id}|text|{section_id}|{chunk_id}"
            else:
                # Without a chunk_id, a fixed default would make every chunk
                # of the section collide; fall back to content identity.
                identifier = f"{doc_id}|text|{section_id}|{_content_hash(node)}"

        if identifier not in seen:
            seen.add(identifier)
            unique_nodes.append(node)

    return unique_nodes
 def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
     if query_engine is None:
         return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
     try:
         start_time = time.time()
         retrieved_nodes = query_engine.retriever.retrieve(question)
+        log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
+        unique_retrieved = deduplicate_nodes(retrieved_nodes)
+        log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
+        # Simple reranking
+        reranked_nodes = rerank_nodes(question, unique_retrieved, reranker, top_k=20)
+        # Direct query without formatting
+        response = query_engine.query(question)
         end_time = time.time()
         processing_time = end_time - start_time
             metadata = node.metadata if hasattr(node, 'metadata') else {}
             chunk_info.append({
                 'document_id': metadata.get('document_id', 'unknown'),
+                'section_id': metadata.get('section_id', 'unknown'),
                 'section_path': metadata.get('section_path', ''),
                 'section_text': metadata.get('section_text', ''),
                 'type': metadata.get('type', 'text'),
                 'table_number': metadata.get('table_number', ''),
                 'image_number': metadata.get('image_number', ''),