Spaces:
Sleeping
Sleeping
Commit
·
8d6a517
1
Parent(s):
6ff1953
Added debugging functions for the С-25 connection type
Browse files- documents_prep.py +38 -14
- utils.py +30 -0
documents_prep.py
CHANGED
|
@@ -34,6 +34,11 @@ def chunk_text_documents(documents):
|
|
| 34 |
|
| 35 |
return chunked
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
|
| 39 |
headers = table_data.get('headers', [])
|
|
@@ -41,6 +46,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 41 |
table_num = table_data.get('table_number', 'unknown')
|
| 42 |
table_title = table_data.get('table_title', '')
|
| 43 |
section = table_data.get('section', '')
|
|
|
|
| 44 |
|
| 45 |
table_num_clean = str(table_num).strip()
|
| 46 |
|
|
@@ -60,8 +66,13 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 60 |
|
| 61 |
log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
|
| 62 |
|
| 63 |
-
# Calculate base metadata size
|
| 64 |
base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
base_size = len(base_content)
|
| 66 |
available_space = max_chars - base_size - 200
|
| 67 |
|
|
@@ -79,7 +90,9 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 79 |
'section': section,
|
| 80 |
'total_rows': len(rows),
|
| 81 |
'chunk_size': len(content),
|
| 82 |
-
'is_complete_table': True
|
|
|
|
|
|
|
| 83 |
}
|
| 84 |
|
| 85 |
log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
|
|
@@ -155,27 +168,38 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 155 |
|
| 156 |
return chunks
|
| 157 |
|
| 158 |
-
|
| 159 |
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
if table_num:
|
| 164 |
-
content += f"ТИП: {table_num}\n"
|
| 165 |
|
|
|
|
| 166 |
if table_title:
|
| 167 |
-
content += f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
if section:
|
| 170 |
-
content += f"
|
| 171 |
|
| 172 |
-
content += f"{'='*70}\n"
|
| 173 |
|
|
|
|
| 174 |
if headers:
|
| 175 |
-
|
| 176 |
-
|
|
|
|
|
|
|
| 177 |
|
| 178 |
-
content += "
|
| 179 |
return content
|
| 180 |
|
| 181 |
|
|
|
|
| 34 |
|
| 35 |
return chunked
|
| 36 |
|
| 37 |
+
def extract_connection_type(text):
    """Extract a welded-connection type designation (e.g. "С-25", "У-14") from *text*.

    The pattern matches a Cyrillic С/У or Latin U/T/C prefix, an optional
    hyphen, digits, and an optional "-NN" suffix (e.g. "С-25-1").

    Args:
        text: String to scan; '' or None are tolerated.

    Returns:
        The first matching designation, or '' when none is found.
    """
    import re  # local import keeps the helper self-contained

    # Guard falsy input: re.search(None) would raise TypeError.
    if not text:
        return ''
    match = re.search(r'[СУUTC]-?\d+(?:-\d+)?', text)
    return match.group(0) if match else ''
|
| 42 |
|
| 43 |
def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
|
| 44 |
headers = table_data.get('headers', [])
|
|
|
|
| 46 |
table_num = table_data.get('table_number', 'unknown')
|
| 47 |
table_title = table_data.get('table_title', '')
|
| 48 |
section = table_data.get('section', '')
|
| 49 |
+
table_description = table_data.get('table_description', '') # NEW
|
| 50 |
|
| 51 |
table_num_clean = str(table_num).strip()
|
| 52 |
|
|
|
|
| 66 |
|
| 67 |
log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
|
| 68 |
|
| 69 |
+
# Calculate base metadata size - NOW INCLUDING DESCRIPTION
|
| 70 |
base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
|
| 71 |
+
|
| 72 |
+
# ADD DESCRIPTION HERE if it exists
|
| 73 |
+
if table_description:
|
| 74 |
+
base_content += f"ОПИСАНИЕ: {table_description}\n\n"
|
| 75 |
+
|
| 76 |
base_size = len(base_content)
|
| 77 |
available_space = max_chars - base_size - 200
|
| 78 |
|
|
|
|
| 90 |
'section': section,
|
| 91 |
'total_rows': len(rows),
|
| 92 |
'chunk_size': len(content),
|
| 93 |
+
'is_complete_table': True,
|
| 94 |
+
'connection_type': extract_connection_type(table_title) if table_title else '' # NEW
|
| 95 |
+
|
| 96 |
}
|
| 97 |
|
| 98 |
log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
|
|
|
|
| 168 |
|
| 169 |
return chunks
|
| 170 |
|
|
|
|
| 171 |
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
    """Build the metadata preamble that precedes a table's data rows.

    Emits document/table identification, the table title with the connection
    type parsed out of it (when present), the table number and section, a
    separator rule, numbered column headers, and finally the
    "ДАННЫЕ ТАБЛИЦЫ:" marker after which row data is appended by the caller.
    """
    import re

    parts = [
        f"ДОКУМЕНТ: {doc_id}\n",
        f"ТАБЛИЦА: {table_identifier}\n",
    ]

    if table_title:
        parts.append(f"НАЗВАНИЕ ТАБЛИЦЫ: {table_title}\n")
        # Surface the type designation (e.g. "С-25") on its own labelled
        # line so it is directly searchable/retrievable.
        hit = re.search(r'[СУUTC]-?\d+(?:-\d+)?', table_title)
        if hit:
            parts.append(f"ТИП СОЕДИНЕНИЯ: {hit.group(0)}\n")

    # Avoid repeating the identifier when the raw number equals it.
    if table_num and table_num != table_identifier:
        parts.append(f"НОМЕР ТАБЛИЦЫ: {table_num}\n")

    if section:
        parts.append(f"РАЗДЕЛ ДОКУМЕНТА: {section}\n")

    parts.append(f"\n{'='*70}\n")

    if headers:
        parts.append("СТОЛБЦЫ ТАБЛИЦЫ:\n")
        parts.extend(f" {i}. {h}\n" for i, h in enumerate(headers, 1))
        parts.append("\n")

    parts.append("ДАННЫЕ ТАБЛИЦЫ:\n")
    return "".join(parts)
|
| 204 |
|
| 205 |
|
utils.py
CHANGED
|
@@ -172,6 +172,28 @@ def deduplicate_nodes(nodes):
|
|
| 172 |
|
| 173 |
return unique_nodes
|
| 174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
|
| 176 |
def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
|
| 177 |
if query_engine is None:
|
|
@@ -186,6 +208,14 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
|
|
| 186 |
log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
|
| 187 |
|
| 188 |
unique_retrieved = deduplicate_nodes(retrieved_nodes)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
|
| 190 |
|
| 191 |
# Simple reranking
|
|
|
|
| 172 |
|
| 173 |
return unique_nodes
|
| 174 |
|
| 175 |
+
def debug_search_tables(vector_index, search_term="С-25"):
    """Log and return every table node whose content or title contains *search_term*.

    Scans all nodes in the index docstore, keeps only those whose metadata
    marks them as tables, and collects a small summary dict per match.
    Results are logged between '=' rules and also returned for inspection.
    """
    matching = []
    for node in vector_index.docstore.docs.values():
        if node.metadata.get('type') != 'table':
            continue
        title = node.metadata.get('table_title', '')
        # Match against either the rendered content or the title metadata.
        if search_term in node.get_content() or search_term in title:
            matching.append({
                'doc_id': node.metadata.get('document_id'),
                'table_num': node.metadata.get('table_number'),
                'title': title[:100],
            })

    log_message(f"\n{'='*60}")
    log_message(f"DEBUG: Found {len(matching)} tables containing '{search_term}'")
    for entry in matching:
        log_message(f" • {entry['doc_id']} - Table {entry['table_num']}: {entry['title']}")
    log_message(f"{'='*60}\n")

    return matching
|
| 197 |
|
| 198 |
def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
|
| 199 |
if query_engine is None:
|
|
|
|
| 208 |
log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
|
| 209 |
|
| 210 |
unique_retrieved = deduplicate_nodes(retrieved_nodes)
|
| 211 |
+
|
| 212 |
+
# DEBUG: Log what was retrieved
|
| 213 |
+
log_message(f"RETRIEVED: {len(unique_retrieved)} nodes")
|
| 214 |
+
for i, node in enumerate(unique_retrieved): # All debug
|
| 215 |
+
table_num = node.metadata.get('table_number', 'N/A')
|
| 216 |
+
table_title = node.metadata.get('table_title', 'N/A')
|
| 217 |
+
doc_id = node.metadata.get('document_id', 'N/A')
|
| 218 |
+
log_message(f" [{i+1}] {doc_id} - Table {table_num}: {table_title[:50]}")
|
| 219 |
log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
|
| 220 |
|
| 221 |
# Simple reranking
|