Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 8, 2025

Commit

05c597d

1 Parent(s): 7062aff

new debug functions + 2000 chunk size

Browse files

Files changed (3) hide show

config.py +1 -1
documents_prep.py +17 -19
index_retriever.py +50 -39

config.py CHANGED Viewed

@@ -52,7 +52,7 @@ DEFAULT_MODEL = "Gemini 2.5 Flash"
 CHUNK_SIZE = 1500
 CHUNK_OVERLAP = 128
-MAX_CHARS_TABLE = 10000
 MAX_ROWS_TABLE = 40
 CUSTOM_PROMPT = """

 CHUNK_SIZE = 1500
 CHUNK_OVERLAP = 128
+MAX_CHARS_TABLE = 2000
 MAX_ROWS_TABLE = 40
 CUSTOM_PROMPT = """

documents_prep.py CHANGED Viewed

@@ -174,33 +174,31 @@ def format_table_header(doc_id, table_identifier, table_num, table_title, sectio
     content += f"ТАБЛИЦА: {table_identifier}\n"
     # Extract and emphasize the connection type if present
     if table_title:
         content += f"НАЗВАНИЕ ТАБЛИЦЫ: {table_title}\n"
-        # Parse type from title (e.g., "С-25" from "Тип сварного соединения С-25")
         import re
-        type_match = re.search(r'[СУUTC]-?\d+(?:-\d+)?', table_title)
         if type_match:
-            connection_type = type_match.group(0)
             content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
-    if table_num and table_num != table_identifier:
-        content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
-    if section:
-        content += f"РАЗДЕЛ ДОКУМЕНТА: {section}\n"
-    content += f"\n{'='*70}\n"
-    # Add headers with better formatting
-    if headers:
-        content += "СТОЛБЦЫ ТАБЛИЦЫ:\n"
-        for i, h in enumerate(headers, 1):
-            content += f"  {i}. {h}\n"
-        content += "\n"
-    content += "ДАННЫЕ ТАБЛИЦЫ:\n"
-    return content
 def format_single_row(row, idx):

     content += f"ТАБЛИЦА: {table_identifier}\n"
     # Extract and emphasize the connection type if present
+    connection_type = ''
     if table_title:
         content += f"НАЗВАНИЕ ТАБЛИЦЫ: {table_title}\n"
+        # Parse type from title - ADD MORE VARIANTS
         import re
+        type_match = re.search(r'[СУUTC]-?\s*\d+(?:-\d+)?', table_title)
         if type_match:
+            connection_type = type_match.group(0).replace(' ', '')
+            # Normalize: always use С (Cyrillic)
+            connection_type = connection_type.replace('C', 'С').replace('c', 'С')
             content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
+            # ADD SEARCHABLE KEYWORDS
+            content += f"КЛЮЧЕВЫЕ СЛОВА: {connection_type} тип сварного соединения\n"
+    # Also check table_identifier for type
+    if not connection_type and table_identifier:
+        import re
+        type_match = re.search(r'[СУUTC]-?\s*\d+', table_identifier)
+        if type_match:
+            connection_type = type_match.group(0).replace(' ', '')
+            connection_type = connection_type.replace('C', 'С')
+            content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
+            content += f"КЛЮЧЕВЫЕ СЛОВА: {connection_type} тип сварного соединения\n"
 def format_single_row(row, idx):

index_retriever.py CHANGED Viewed

@@ -44,20 +44,66 @@ def create_query_engine(vector_index):
     try:
         from config import CUSTOM_PROMPT
         bm25_retriever = BM25Retriever.from_defaults(
             docstore=vector_index.docstore,
-            similarity_top_k=200
         )
         vector_retriever = VectorIndexRetriever(
             index=vector_index,
             similarity_top_k=50,
-            similarity_cutoff=0.35
         )
         hybrid_retriever = QueryFusionRetriever(
-            [vector_retriever, bm25_retriever],
-            similarity_top_k=150,
             num_queries=1
         )
@@ -73,42 +119,7 @@ def create_query_engine(vector_index):
         )
         log_message("Query engine успешно создан")
-        all_nodes = list(vector_index.docstore.docs.values())
-        c25_tables = []
-        for node_id, node in vector_index.docstore.docs.items():
-            metadata = node.metadata
-            text = node.get_content()
-            # Check if this is a С-25 table
-            if ('С-25' in text or 'C-25' in text or
-                'С-25' in str(metadata.get('table_title', '')) or
-                'С-25' in str(metadata.get('table_number', ''))):
-                c25_tables.append({
-                    'node_id': node_id,
-                    'doc_id': metadata.get('document_id'),
-                    'table_num': metadata.get('table_number'),
-                    'table_title': metadata.get('table_title', ''),
-                    'text_preview': text[:200]
-                })
-        log_message(f"\n{'='*70}")
-        log_message(f"DEBUG: Found {len(c25_tables)} С-25 tables in index:")
-        for t in c25_tables:
-            log_message(f"  • {t['doc_id']} - Table {t['table_num']}")
-            log_message(f"    Title: {t['table_title']}")
-            log_message(f"    Preview: {t['text_preview']}")
-        log_message(f"{'='*70}\n")
         return query_engine
     except Exception as e:
         log_message(f"Ошибка создания query engine: {str(e)}")
         raise

     try:
         from config import CUSTOM_PROMPT
+        # Preprocess query to expand table number patterns
+        class TableAwareRetriever:
+            def __init__(self, base_retriever):
+                self.base_retriever = base_retriever
+            def retrieve(self, query_str):
+                import re
+                # Expand queries with table numbers
+                queries = [query_str]
+                # Extract table numbers like С-25, C-25, С25
+                table_patterns = re.findall(r'[СCс]-?\s*\d+', query_str)
+                if table_patterns:
+                    for pattern in table_patterns:
+                        # Normalize: "С-25" -> ["С-25", "C-25", "С25", "C25"]
+                        normalized = pattern.upper().replace(' ', '')
+                        variants = [
+                            normalized,
+                            normalized.replace('С', 'C'),
+                            normalized.replace('-', ''),
+                            normalized.replace('С', 'C').replace('-', '')
+                        ]
+                        for variant in variants:
+                            queries.append(f"тип соединения {variant}")
+                            queries.append(f"таблица {variant}")
+                log_message(f"Searching with {len(queries)} query variants: {queries[:3]}...")
+                # Retrieve with all variants
+                all_nodes = []
+                seen_ids = set()
+                for q in queries:
+                    nodes = self.base_retriever.retrieve(q)
+                    for node in nodes:
+                        node_id = id(node)
+                        if node_id not in seen_ids:
+                            seen_ids.add(node_id)
+                            all_nodes.append(node)
+                return all_nodes
         bm25_retriever = BM25Retriever.from_defaults(
             docstore=vector_index.docstore,
+            similarity_top_k=100
         )
         vector_retriever = VectorIndexRetriever(
             index=vector_index,
             similarity_top_k=50,
+            similarity_cutoff=0.3  # Lower threshold
         )
+        # Wrap retrievers with table-aware logic
+        table_aware_bm25 = TableAwareRetriever(bm25_retriever)
         hybrid_retriever = QueryFusionRetriever(
+            [vector_retriever, table_aware_bm25],
+            similarity_top_k=200,  # Increase to capture more candidates
             num_queries=1
         )
         )
         log_message("Query engine успешно создан")
         return query_engine
     except Exception as e:
         log_message(f"Ошибка создания query engine: {str(e)}")
         raise