MrSimple07 committed on
Commit
b01a551
·
1 Parent(s): c697463

added the load_table_data function

Browse files
Files changed (2) hide show
  1. index_retriever.py +49 -26
  2. table_prep.py +109 -66
index_retriever.py CHANGED
@@ -14,21 +14,21 @@ def create_vector_index(documents):
14
 
15
  def create_query_engine(vector_index):
16
  try:
17
- # FIXED: Increase retrieval numbers for tables
18
  bm25_retriever = BM25Retriever.from_defaults(
19
  docstore=vector_index.docstore,
20
- similarity_top_k=50 # Increased from 30
21
  )
22
 
23
  vector_retriever = VectorIndexRetriever(
24
  index=vector_index,
25
- similarity_top_k=50, # Increased from 30
26
- similarity_cutoff=0.55 # FIXED: Lowered from 0.65 to catch more tables
27
  )
28
 
29
  hybrid_retriever = QueryFusionRetriever(
30
  [vector_retriever, bm25_retriever],
31
- similarity_top_k=60, # Increased from 40
32
  num_queries=1
33
  )
34
 
@@ -51,7 +51,7 @@ def create_query_engine(vector_index):
51
  raise
52
 
53
 
54
- def rerank_nodes(query, nodes, reranker, top_k=30, min_score_threshold=0.45, diversity_penalty=0.2): # FIXED: Adjusted defaults
55
  if not nodes or not reranker:
56
  return nodes[:top_k]
57
 
@@ -64,24 +64,37 @@ def rerank_nodes(query, nodes, reranker, top_k=30, min_score_threshold=0.45, div
64
 
65
  scored_nodes.sort(key=lambda x: x[1], reverse=True)
66
 
67
- # FIXED: Lower threshold and add special handling for tables
 
 
 
 
 
 
 
 
 
 
 
 
68
  if min_score_threshold is not None:
69
- scored_nodes = [(node, score) for node, score in scored_nodes
70
- if score >= min_score_threshold]
71
- log_message(f"После фильтрации по порогу {min_score_threshold}: {len(scored_nodes)} узлов")
72
-
73
- if not scored_nodes:
74
- log_message("Нет узлов после фильтрации, снижаю порог")
75
- scored_nodes = list(zip(nodes, scores))
76
- scored_nodes.sort(key=lambda x: x[1], reverse=True)
77
- min_score_threshold = scored_nodes[0][1] * 0.5 # FIXED: Lower threshold
78
- scored_nodes = [(node, score) for node, score in scored_nodes
79
- if score >= min_score_threshold]
80
 
81
  selected_nodes = []
82
  selected_docs = set()
83
  selected_sections = set()
84
- selected_tables = set() # FIXED: Track tables separately
 
85
 
86
  for node, score in scored_nodes:
87
  if len(selected_nodes) >= top_k:
@@ -91,16 +104,26 @@ def rerank_nodes(query, nodes, reranker, top_k=30, min_score_threshold=0.45, div
91
  doc_id = metadata.get('document_id', 'unknown')
92
  node_type = metadata.get('type', 'text')
93
  section_key = f"{doc_id}_{metadata.get('section_path', metadata.get('section_id', ''))}"
94
- table_key = f"{doc_id}_{metadata.get('table_number', '')}" if node_type == 'table' else None
95
 
96
- # FIXED: Lower diversity penalty for tables
 
 
 
 
 
 
 
 
 
 
 
97
  penalty = 0
98
  if node_type == 'table':
99
- # Tables get less penalty - we want multiple tables from same document
100
  if table_key and table_key in selected_tables:
101
- penalty += diversity_penalty * 0.3
102
  else:
103
- penalty += diversity_penalty * 0.1 if doc_id in selected_docs else 0
104
  else:
105
  if doc_id in selected_docs:
106
  penalty += diversity_penalty * 0.5
@@ -109,8 +132,8 @@ def rerank_nodes(query, nodes, reranker, top_k=30, min_score_threshold=0.45, div
109
 
110
  adjusted_score = score * (1 - penalty)
111
 
112
- # FIXED: More lenient threshold for adding nodes
113
- if not selected_nodes or adjusted_score >= selected_nodes[0][1] * 0.5:
114
  selected_nodes.append((node, score))
115
  selected_docs.add(doc_id)
116
  selected_sections.add(section_key)
 
14
 
15
  def create_query_engine(vector_index):
16
  try:
17
+ # FIXED: Significantly increased retrieval for tables and lowered BM25 threshold
18
  bm25_retriever = BM25Retriever.from_defaults(
19
  docstore=vector_index.docstore,
20
+ similarity_top_k=80 # Increased from 50
21
  )
22
 
23
  vector_retriever = VectorIndexRetriever(
24
  index=vector_index,
25
+ similarity_top_k=80, # Increased from 50
26
+ similarity_cutoff=0.45 # FIXED: Lowered from 0.55 to catch more tables
27
  )
28
 
29
  hybrid_retriever = QueryFusionRetriever(
30
  [vector_retriever, bm25_retriever],
31
+ similarity_top_k=100, # Increased from 60 to ensure tables aren't filtered early
32
  num_queries=1
33
  )
34
 
 
51
  raise
52
 
53
 
54
+ def rerank_nodes(query, nodes, reranker, top_k=40, min_score_threshold=0.35, diversity_penalty=0.15): # FIXED: More lenient
55
  if not nodes or not reranker:
56
  return nodes[:top_k]
57
 
 
64
 
65
  scored_nodes.sort(key=lambda x: x[1], reverse=True)
66
 
67
+ # FIXED: Much lower threshold and special boost for tables
68
+ table_boost = 0.15 # Boost table scores
69
+ boosted_scored_nodes = []
70
+ for node, score in scored_nodes:
71
+ metadata = node.metadata if hasattr(node, 'metadata') else {}
72
+ if metadata.get('type') == 'table':
73
+ boosted_score = min(1.0, score * (1 + table_boost))
74
+ boosted_scored_nodes.append((node, boosted_score))
75
+ else:
76
+ boosted_scored_nodes.append((node, score))
77
+
78
+ boosted_scored_nodes.sort(key=lambda x: x[1], reverse=True)
79
+
80
  if min_score_threshold is not None:
81
+ filtered_nodes = [(node, score) for node, score in boosted_scored_nodes
82
+ if score >= min_score_threshold]
83
+ log_message(f"После фильтрации по порогу {min_score_threshold}: {len(filtered_nodes)} узлов")
84
+ if filtered_nodes:
85
+ scored_nodes = filtered_nodes
86
+ else:
87
+ # Fallback: take top nodes even if below threshold
88
+ log_message("⚠️ Нет узлов после фильтрации, беру топ-40 без порога")
89
+ scored_nodes = boosted_scored_nodes[:40]
90
+ else:
91
+ scored_nodes = boosted_scored_nodes
92
 
93
  selected_nodes = []
94
  selected_docs = set()
95
  selected_sections = set()
96
+ selected_tables = set()
97
+ selected_appendix_tables = set() # FIXED: Track appendix tables separately
98
 
99
  for node, score in scored_nodes:
100
  if len(selected_nodes) >= top_k:
 
104
  doc_id = metadata.get('document_id', 'unknown')
105
  node_type = metadata.get('type', 'text')
106
  section_key = f"{doc_id}_{metadata.get('section_path', metadata.get('section_id', ''))}"
 
107
 
108
+ # FIXED: Better table tracking with appendix awareness
109
+ if node_type == 'table':
110
+ table_num = metadata.get('table_number_clean', metadata.get('table_number', ''))
111
+ appendix_num = metadata.get('appendix_number')
112
+ if appendix_num:
113
+ table_key = f"{doc_id}_appendix_{appendix_num}_table_{table_num}"
114
+ else:
115
+ table_key = f"{doc_id}_table_{table_num}"
116
+ else:
117
+ table_key = None
118
+
119
+ # FIXED: Even lower diversity penalty for tables
120
  penalty = 0
121
  if node_type == 'table':
122
+ # Tables get minimal penalty - we want all relevant tables
123
  if table_key and table_key in selected_tables:
124
+ penalty += diversity_penalty * 0.2
125
  else:
126
+ penalty += diversity_penalty * 0.05 if doc_id in selected_docs else 0
127
  else:
128
  if doc_id in selected_docs:
129
  penalty += diversity_penalty * 0.5
 
132
 
133
  adjusted_score = score * (1 - penalty)
134
 
135
+ # FIXED: Very lenient threshold for adding nodes
136
+ if not selected_nodes or adjusted_score >= selected_nodes[0][1] * 0.3:
137
  selected_nodes.append((node, score))
138
  selected_docs.add(doc_id)
139
  selected_sections.add(section_key)
table_prep.py CHANGED
@@ -5,7 +5,6 @@ from my_logging import log_message
5
 
6
  def create_table_content(table_data):
7
  """Create formatted content from table data"""
8
- # FIXED: More robust field extraction
9
  doc_id = (
10
  table_data.get('document_id') or
11
  table_data.get('document') or
@@ -20,31 +19,134 @@ def create_table_content(table_data):
20
  'Неизвестно'
21
  )
22
 
23
- # FIXED: Add more context in content for better semantic search
 
 
 
24
  content = f"Документ: {doc_id}\n"
25
- content += f"Таблица: {table_num}\n"
 
 
26
  content += f"Название таблицы: {table_title}\n"
27
  content += f"Раздел документа: {section}\n"
28
- content += f"Стандарт/ГОСТ: {doc_id}\n" # Explicitly mention GOST for queries
 
 
 
 
29
 
30
  headers = table_data.get('headers', [])
31
  if headers:
32
- content += f"\nЗаголовки колонок: {' | '.join(str(h) for h in headers)}\n"
 
 
 
33
 
34
- # Data section
35
  if 'data' in table_data and isinstance(table_data['data'], list):
36
  content += "\nСодержимое таблицы:\n"
 
 
 
37
  for row_idx, row in enumerate(table_data['data'], start=1):
38
  if isinstance(row, dict):
39
  row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
40
  content += f"Строка {row_idx}: {row_text}\n"
 
 
41
  elif isinstance(row, list):
42
  row_text = " | ".join([str(v) for v in row if v])
43
  content += f"Строка {row_idx}: {row_text}\n"
 
 
 
 
 
44
 
45
  return content
46
 
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
49
  if chunk_size is None:
50
  chunk_size = CHUNK_SIZE
@@ -128,63 +230,4 @@ def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
128
  )
129
  chunked_docs.append(chunked_doc)
130
 
131
- return chunked_docs
132
-
133
-
134
- def table_to_document(table_data, document_id=None):
135
- """Convert table data to Document, with smart chunking if needed"""
136
- if not isinstance(table_data, dict):
137
- return []
138
-
139
- # FIXED: More robust document_id extraction with multiple fallbacks
140
- doc_id = (
141
- document_id or
142
- table_data.get('document_id') or
143
- table_data.get('document') or
144
- table_data.get('Обозначение документа') or
145
- 'Неизвестно'
146
- )
147
-
148
- table_num = table_data.get('table_number', 'Неизвестно')
149
- table_title = table_data.get('table_title', 'Неизвестно')
150
-
151
- # FIXED: More robust section extraction
152
- section = (
153
- table_data.get('section') or
154
- table_data.get('Раздел документа') or
155
- table_data.get('section_id') or
156
- 'Неизвестно'
157
- )
158
-
159
- table_rows = table_data.get('data', [])
160
- if not table_rows:
161
- log_message(f"⚠️ Таблица {table_num} пропущена: нет данных")
162
- return []
163
-
164
- content = create_table_content(table_data)
165
- content_size = len(content)
166
-
167
- # FIXED: Enhanced metadata with more searchable fields
168
- base_doc = Document(
169
- text=content,
170
- metadata={
171
- "type": "table",
172
- "table_number": str(table_num),
173
- "table_title": str(table_title),
174
- "document_id": str(doc_id),
175
- "section": str(section),
176
- "section_id": str(section),
177
- "total_rows": len(table_rows),
178
- "content_size": content_size,
179
- # FIXED: Add searchable composite field for better retrieval
180
- "search_key": f"{doc_id} {table_num} {table_title} {section}".lower()
181
- }
182
- )
183
-
184
- # Apply smart chunking if too large
185
- if content_size > CHUNK_SIZE:
186
- log_message(f"📊 CHUNKING: Таблица {table_num} | {content_size} > {CHUNK_SIZE}")
187
- return chunk_table_document(base_doc)
188
- else:
189
- log_message(f"✓ Таблица {table_num} добавлена целиком ({content_size} символов, doc_id={doc_id})")
190
- return [base_doc]
 
5
 
6
  def create_table_content(table_data):
7
  """Create formatted content from table data"""
 
8
  doc_id = (
9
  table_data.get('document_id') or
10
  table_data.get('document') or
 
19
  'Неизвестно'
20
  )
21
 
22
+ # FIXED: Normalize table number and create variations
23
+ table_num_clean = str(table_num).replace('№', '').replace('№', '').strip()
24
+
25
+ # FIXED: Enhanced content with multiple references for better matching
26
  content = f"Документ: {doc_id}\n"
27
+ content += f"ГОСТ/Стандарт: {doc_id}\n"
28
+ content += f"Таблица номер: {table_num}\n"
29
+ content += f"Таблица: {table_num_clean}\n"
30
  content += f"Название таблицы: {table_title}\n"
31
  content += f"Раздел документа: {section}\n"
32
+
33
+ # FIXED: Add explicit appendix reference if present
34
+ if 'приложени' in section.lower():
35
+ appendix_match = section.lower().split('приложени')[1].split()[0] if 'приложени' in section.lower() else ''
36
+ content += f"Таблица {table_num_clean} Приложения {appendix_match}\n"
37
 
38
  headers = table_data.get('headers', [])
39
  if headers:
40
+ # FIXED: Add headers as searchable keywords
41
+ headers_text = ' | '.join(str(h) for h in headers)
42
+ content += f"\nЗаголовки колонок: {headers_text}\n"
43
+ content += f"Параметры: {headers_text}\n" # Alternative keyword
44
 
45
+ # FIXED: Extract and emphasize key data values for better semantic search
46
  if 'data' in table_data and isinstance(table_data['data'], list):
47
  content += "\nСодержимое таблицы:\n"
48
+ # Extract unique values for search enhancement
49
+ all_values = set()
50
+
51
  for row_idx, row in enumerate(table_data['data'], start=1):
52
  if isinstance(row, dict):
53
  row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
54
  content += f"Строка {row_idx}: {row_text}\n"
55
+ # Collect values
56
+ all_values.update([str(v) for v in row.values() if v and str(v).strip()])
57
  elif isinstance(row, list):
58
  row_text = " | ".join([str(v) for v in row if v])
59
  content += f"Строка {row_idx}: {row_text}\n"
60
+ all_values.update([str(v) for v in row if v and str(v).strip()])
61
+
62
+ # FIXED: Add searchable keywords from data
63
+ if all_values:
64
+ content += f"\nКлючевые значения: {' '.join(list(all_values)[:50])}\n"
65
 
66
  return content
67
 
68
 
69
+ def table_to_document(table_data, document_id=None):
70
+ """Convert table data to Document, with smart chunking if needed"""
71
+ if not isinstance(table_data, dict):
72
+ return []
73
+
74
+ doc_id = (
75
+ document_id or
76
+ table_data.get('document_id') or
77
+ table_data.get('document') or
78
+ table_data.get('Обозначение документа') or
79
+ 'Неизвестно'
80
+ )
81
+
82
+ table_num = table_data.get('table_number', 'Неизвестно')
83
+ table_num_clean = str(table_num).replace('№', '').replace('№', '').strip()
84
+ table_title = table_data.get('table_title', 'Неизвестно')
85
+
86
+ section = (
87
+ table_data.get('section') or
88
+ table_data.get('Раздел документа') or
89
+ table_data.get('section_id') or
90
+ 'Неизвестно'
91
+ )
92
+
93
+ table_rows = table_data.get('data', [])
94
+ if not table_rows:
95
+ log_message(f"⚠️ Таблица {table_num} пропущена: нет данных")
96
+ return []
97
+
98
+ content = create_table_content(table_data)
99
+ content_size = len(content)
100
+
101
+ # FIXED: Extract appendix info for better identification
102
+ appendix_num = None
103
+ if 'приложени' in section.lower():
104
+ import re
105
+ match = re.search(r'приложени[ея]\s*(\d+)', section.lower())
106
+ if match:
107
+ appendix_num = match.group(1)
108
+
109
+ # FIXED: Create comprehensive search variations
110
+ search_variations = [
111
+ f"{doc_id} таблица {table_num_clean}",
112
+ f"{doc_id} {table_num}",
113
+ f"таблица {table_num_clean} {doc_id}",
114
+ table_title.lower(),
115
+ section.lower()
116
+ ]
117
+
118
+ if appendix_num:
119
+ search_variations.extend([
120
+ f"таблица {table_num_clean} приложения {appendix_num}",
121
+ f"приложение {appendix_num} таблица {table_num_clean}"
122
+ ])
123
+
124
+ base_doc = Document(
125
+ text=content,
126
+ metadata={
127
+ "type": "table",
128
+ "table_number": str(table_num),
129
+ "table_number_clean": str(table_num_clean), # FIXED: Add normalized version
130
+ "table_title": str(table_title),
131
+ "document_id": str(doc_id),
132
+ "section": str(section),
133
+ "section_id": str(section),
134
+ "appendix_number": str(appendix_num) if appendix_num else None, # FIXED: Add appendix tracking
135
+ "total_rows": len(table_rows),
136
+ "content_size": content_size,
137
+ "search_key": " | ".join(search_variations), # FIXED: Enhanced search key
138
+ "headers": " ".join(str(h) for h in table_data.get('headers', [])) # FIXED: Add headers as metadata
139
+ }
140
+ )
141
+
142
+ # Apply smart chunking if too large
143
+ if content_size > CHUNK_SIZE:
144
+ log_message(f"📊 CHUNKING: Таблица {table_num} | {content_size} > {CHUNK_SIZE}")
145
+ return chunk_table_document(base_doc)
146
+ else:
147
+ log_message(f"✓ Таблица {table_num} добавлена целиком ({content_size} символов, doc_id={doc_id})")
148
+ return [base_doc]
149
+
150
  def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
151
  if chunk_size is None:
152
  chunk_size = CHUNK_SIZE
 
230
  )
231
  chunked_docs.append(chunked_doc)
232
 
233
+ return chunked_docs