Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 5, 2025

Commit

123a5db

1 Parent(s): 83b921a

simplest version

Browse files

Files changed (2) hide show

documents_prep.py +80 -54
utils.py +75 -38

documents_prep.py CHANGED Viewed

@@ -38,42 +38,60 @@ def chunk_text_documents(documents):
     return chunked
-def chunk_table_by_rows(table_data, doc_id, max_rows=10):  # Reduced from 30
     headers = table_data.get('headers', [])
     rows = table_data.get('data', [])
     table_num = table_data.get('table_number', 'unknown')
     table_title = table_data.get('table_title', '')
     section = table_data.get('section', '')
-    # Keep original format
     table_num_clean = str(table_num).strip()
     if not rows:
         return []
-    # For small tables, keep as single chunk
     if len(rows) <= max_rows:
-        content = format_table_content(table_data, headers, rows)
         chunk_size = len(content)
-        log_message(f"  📊 Table {table_num_clean} ({doc_id}): {len(rows)} rows → 1 chunk ({chunk_size} chars)")
-        return [Document(
-            text=content,
-            metadata={
-                'type': 'table',
-                'document_id': doc_id,
-                'table_number': table_num_clean,
-                'table_title': table_title,
-                'section': section,
-                'total_rows': len(rows),
-                'chunk_size': chunk_size,
-                'is_complete_table': True
-            }
-        )]
-    # For large tables, chunk with overlap
     chunks = []
-    overlap = 3  # Reduced overlap
     chunk_num = 0
     for i in range(0, len(rows), max_rows - overlap):
@@ -85,71 +103,79 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=10):  # Reduced from 30
             table_data,
             headers,
             chunk_rows,
             chunk_info=chunk_info
         )
         chunk_size = len(content)
-        chunks.append(Document(
-            text=content,
-            metadata={
-                'type': 'table',
-                'document_id': doc_id,
-                'table_number': table_num_clean,
-                'table_title': table_title,
-                'section': section,
-                'chunk_id': chunk_num,
-                'row_start': i,
-                'row_end': i + len(chunk_rows),
-                'total_rows': len(rows),
-                'chunk_size': chunk_size,
-                'total_chunks': ((len(rows) - overlap) // (max_rows - overlap)) + 1,
-                'is_complete_table': False
-            }
-        ))
         chunk_num += 1
-    log_message(f"  📊 Table {table_num_clean} ({doc_id}): {len(rows)} rows → {len(chunks)} chunks")
-    for idx, chunk in enumerate(chunks):
-        log_message(f"    Chunk {idx+1}: rows {chunk.metadata['row_start']}-{chunk.metadata['row_end']} ({chunk.metadata['chunk_size']} chars)")
     return chunks
-def format_table_content(table_data, headers, rows, chunk_info=""):
     doc_id = table_data.get('document_id', table_data.get('document', 'unknown'))
     table_num = table_data.get('table_number', 'unknown')
     table_title = table_data.get('table_title', '')
     section = table_data.get('section', '')
-    table_num_clean = str(table_num).replace('№', '').strip()
     content = f"ДОКУМЕНТ: {doc_id}\n"
-    content += f"ТАБЛИЦА: {table_num_clean}\n"
-    content += f"НОМЕР ТАБЛИЦЫ: {table_num_clean}\n"
     if table_title:
         content += f"НАЗВАНИЕ: {table_title}\n"
     if section:
         content += f"РАЗДЕЛ: {section}\n"
     content += f"{'='*70}\n\n"
-    content += f"Это таблица {table_num_clean} из документа {doc_id}. "
-    content += f"Номер таблицы: {table_num_clean}. "
     content += f"Документ: {doc_id}. "
     if table_title:
         content += f"Название таблицы: {table_title}. "
         content += f"Таблица о: {table_title}. "
-    if section:
-        content += f"Раздел: {section}. "
-    content += f"Поиск: таблица {table_num_clean} {doc_id}. "
     if chunk_info:
         content += f"\n{chunk_info}\n"
-    content += f"\n\nСОДЕРЖИМОЕ ТАБЛИЦЫ {table_num_clean}:\n"
     content += f"="*70 + "\n\n"
     if headers:
@@ -169,7 +195,7 @@ def format_table_content(table_data, headers, rows, chunk_info=""):
                 content += f"{idx}. {' | '.join(parts)}\n"
     content += f"\n{'='*70}\n"
-    content += f"КОНЕЦ ТАБЛИЦЫ {table_num_clean} ИЗ {doc_id}\n"
     return content

     return chunked
+def chunk_table_by_rows(table_data, doc_id, max_rows=30):
     headers = table_data.get('headers', [])
     rows = table_data.get('data', [])
     table_num = table_data.get('table_number', 'unknown')
     table_title = table_data.get('table_title', '')
     section = table_data.get('section', '')
+    # Enhanced table identification
     table_num_clean = str(table_num).strip()
+    # Create unique table identifier with section context
+    if 'приложени' in section.lower():
+        # Extract appendix number
+        import re
+        appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
+        if appendix_match:
+            appendix_num = appendix_match.group(1).upper()
+            table_identifier = f"{table_num_clean} (Приложение {appendix_num})"
+        else:
+            table_identifier = f"{table_num_clean} ({section[:30]})"
+    else:
+        table_identifier = table_num_clean
     if not rows:
+        log_message(f"  ⚠️  Table {table_identifier} ({doc_id}): Empty table, skipping")
         return []
+    log_message(f"  📊 Processing Table {table_identifier} ({doc_id}): {len(rows)} rows, {len(headers)} columns")
+    # For small tables
     if len(rows) <= max_rows:
+        content = format_table_content(table_data, headers, rows, table_identifier)
         chunk_size = len(content)
+        metadata = {
+            'type': 'table',
+            'document_id': doc_id,
+            'table_number': table_num_clean,
+            'table_identifier': table_identifier,  # NEW: unique identifier
+            'table_title': table_title,
+            'section': section,
+            'total_rows': len(rows),
+            'chunk_size': chunk_size,
+            'is_complete_table': True
+        }
+        log_message(f"    ✓ Single chunk created:")
+        log_message(f"      Metadata: {metadata}")
+        return [Document(text=content, metadata=metadata)]
+    # For large tables with chunking
     chunks = []
+    overlap = 3
     chunk_num = 0
     for i in range(0, len(rows), max_rows - overlap):
             table_data,
             headers,
             chunk_rows,
+            table_identifier,
             chunk_info=chunk_info
         )
         chunk_size = len(content)
+        metadata = {
+            'type': 'table',
+            'document_id': doc_id,
+            'table_number': table_num_clean,
+            'table_identifier': table_identifier,  # NEW
+            'table_title': table_title,
+            'section': section,
+            'chunk_id': chunk_num,
+            'row_start': i,
+            'row_end': i + len(chunk_rows),
+            'total_rows': len(rows),
+            'chunk_size': chunk_size,
+            'total_chunks': ((len(rows) - overlap) // (max_rows - overlap)) + 1,
+            'is_complete_table': False
+        }
+        chunks.append(Document(text=content, metadata=metadata))
+        log_message(f"    Chunk {chunk_num+1} created:")
+        log_message(f"      Rows: {i}-{i+len(chunk_rows)}, Size: {chunk_size} chars")
+        log_message(f"      Metadata: {metadata}")
         chunk_num += 1
+    log_message(f"  ✓ Table {table_identifier} ({doc_id}): {len(rows)} rows → {len(chunks)} chunks")
     return chunks
+def format_table_content(table_data, headers, rows, table_identifier, chunk_info=""):
     doc_id = table_data.get('document_id', table_data.get('document', 'unknown'))
     table_num = table_data.get('table_number', 'unknown')
     table_title = table_data.get('table_title', '')
     section = table_data.get('section', '')
+    # Use enhanced identifier
     content = f"ДОКУМЕНТ: {doc_id}\n"
+    content += f"ТАБЛИЦА: {table_identifier}\n"
+    content += f"ПОЛНОЕ НАЗВАНИЕ: {table_identifier}\n"
+    content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
     if table_title:
         content += f"НАЗВАНИЕ: {table_title}\n"
     if section:
         content += f"РАЗДЕЛ: {section}\n"
     content += f"{'='*70}\n\n"
+    # Enhanced search keywords
+    content += f"Это таблица {table_identifier} из документа {doc_id}. "
+    content += f"Идентификатор таблицы: {table_identifier}. "
+    content += f"Номер: {table_num}. "
     content += f"Документ: {doc_id}. "
+    if section:
+        content += f"Находится в разделе: {section}. "
+        if 'приложени' in section.lower():
+            content += f"Таблица из приложения. "
     if table_title:
         content += f"Название таблицы: {table_title}. "
         content += f"Таблица о: {table_title}. "
+    content += f"Поиск: таблица {table_identifier} {doc_id}. "
     if chunk_info:
         content += f"\n{chunk_info}\n"
+    content += f"\n\nСОДЕРЖИМОЕ ТАБЛИЦЫ {table_identifier}:\n"
     content += f"="*70 + "\n\n"
     if headers:
                 content += f"{idx}. {' | '.join(parts)}\n"
     content += f"\n{'='*70}\n"
+    content += f"КОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
     return content

utils.py CHANGED Viewed

@@ -24,9 +24,15 @@ def format_sources(nodes):
         doc_id = meta.get('document_id', 'unknown')
         if doc_type == 'table':
-            table_num = meta.get('table_number', 'unknown')
             title = meta.get('table_title', '')
-            sources.append(f"📊 {doc_id} - Таблица {table_num}: {title}")
         elif doc_type == 'image':
             img_num = meta.get('image_number', 'unknown')
             sources.append(f"🖼️ {doc_id} - Рисунок {img_num}")
@@ -34,76 +40,107 @@ def format_sources(nodes):
             section = meta.get('section_id', '')
             sources.append(f"📄 {doc_id} - Раздел {section}")
-    return "\n".join(set(sources))
 def preprocess_query(question):
     import re
     question_lower = question.lower()
-    table_match = re.search(r'табли[цу]\w*\s+([а-яa-z0-9\.]+)', question_lower)
     doc_match = re.search(r'(гост|нп|му)[^\s]*\s*[рp№-]*\s*([0-9\.-]+)', question_lower)
     enhanced_query = question
-    if table_match:
-        table_num = table_match.group(1).upper()
-        enhanced_query += f" таблица номер {table_num}"
     if doc_match:
         doc_id = f"{doc_match.group(1).upper()} {doc_match.group(2)}"
         enhanced_query += f" документ {doc_id}"
     return enhanced_query
 def answer_question(question, query_engine, reranker):
     try:
         log_message(f"Query: {question}")
         enhanced_query = preprocess_query(question)
         if enhanced_query != question:
             log_message(f"Enhanced query: {enhanced_query}")
         retrieved = query_engine.retriever.retrieve(enhanced_query)
-        log_message(f"Retrieved {len(retrieved)} nodes")
-        doc_ids = [n.metadata.get('document_id', 'unknown') for n in retrieved]
-        table_nums = [n.metadata.get('table_number', '') for n in retrieved if n.metadata.get('type') == 'table']
-        log_message(f"Retrieved from documents: {set(doc_ids)}")
-        if table_nums:
-            log_message(f"Retrieved tables: {set(table_nums)}")
         reranked = rerank_nodes(question, retrieved, reranker, top_k=25)
-        log_message(f"Reranked to {len(reranked)} nodes")
-        doc_ids_reranked = [n.metadata.get('document_id', 'unknown') for n in reranked]
-        table_nums_reranked = [n.metadata.get('table_number', '') for n in reranked if n.metadata.get('type') == 'table']
-        log_message(f"After reranking - documents: {set(doc_ids_reranked)}")
-        if table_nums_reranked:
-            log_message(f"After reranking - tables: {set(table_nums_reranked)}")
-        context_parts = []
         for n in reranked:
-            meta = n.metadata
-            doc_id = meta.get('document_id', 'unknown')
-            doc_type = meta.get('type', 'text')
-            if doc_type == 'table':
-                table_num = meta.get('table_number', 'unknown')
-                title = meta.get('table_title', '')
-                source_label = f"[ТАБЛИЦА {table_num} - {doc_id}]"
-                if title:
-                    source_label += f" {title}"
-            elif doc_type == 'image':
-                img_num = meta.get('image_number', 'unknown')
-                source_label = f"[РИСУНОК {img_num} - {doc_id}]"
             else:
-                section = meta.get('section_id', '')
-                source_label = f"[{doc_id} - {section}]"
-            context_parts.append(f"{source_label}\n{n.text}")
-        context = "\n\n" + ("="*70 + "\n\n").join(context_parts)
         prompt = f"""Ты эксперт по технической документации.

         doc_id = meta.get('document_id', 'unknown')
         if doc_type == 'table':
+            table_id = meta.get('table_identifier', meta.get('table_number', 'unknown'))
             title = meta.get('table_title', '')
+            section = meta.get('section', '')
+            source = f"📊 {doc_id} - {table_id}"
+            if title:
+                source += f": {title}"
+            if section:
+                source += f" ({section})"
+            sources.append(source)
         elif doc_type == 'image':
             img_num = meta.get('image_number', 'unknown')
             sources.append(f"🖼️ {doc_id} - Рисунок {img_num}")
             section = meta.get('section_id', '')
             sources.append(f"📄 {doc_id} - Раздел {section}")
+    return "\n".join(sources)  # Don't use set() to preserve order
 def preprocess_query(question):
     """Append explicit table / appendix / document hints to a user question.

     The question is scanned case-insensitively for a table reference
     ("таблица 5", "таблицу № А.1", "таблице номер 3 из приложения 2") and
     for a document reference ("ГОСТ Р 50.05.01-2018", "НП-001-15", "МУ 1.2").
     Every reference found is appended to the original text as extra
     keywords; when nothing is found the question is returned unchanged.

     Args:
         question: Raw user question.

     Returns:
         The original question followed by zero or more hint phrases.
     """
     import re
     question_lower = question.lower()
     # A single pattern with an optional "№" / "номер" prefix.  Trying a
     # prefix-less pattern first would capture the word "номер" itself as
     # the table number for inputs such as "таблица номер 5".
     table_match = re.search(
         r'табли[цу]\w*\s+(?:№\s*|номер\w*\s+)?([а-яa-z0-9\.]+)'
         r'(?:\s+(?:из\s+)?приложени[яеий]\s+(\d+|[а-я]))?',
         question_lower,
     )
     # The word-ending part `[^\s]*?` is lazy: a greedy version swallows the
     # number in unspaced forms ("нп-001-15") and leaves only its last digit.
     doc_match = re.search(r'(гост|нп|му)[^\s]*?\s*[рp№-]*\s*([0-9\.-]+)', question_lower)
     enhanced_query = question
     if table_match:
         # Drop sentence punctuation glued to the number ("таблицу 5.").
         table_num = table_match.group(1).rstrip('.').upper()
         if table_num:
             enhanced_query += f" таблица номер {table_num}"
             # Add appendix context if mentioned
             if table_match.group(2):
                 appendix_num = table_match.group(2).upper()
                 enhanced_query += f" приложение {appendix_num}"
     if doc_match:
         doc_type = doc_match.group(1).upper()
         doc_num = doc_match.group(2).rstrip('.-')
         # A bare dash or dot ("гост - это ...") is not a document number.
         if any(ch.isdigit() for ch in doc_num):
             enhanced_query += f" документ {doc_type} {doc_num}"
             # Add variations for better matching
             enhanced_query += f" {doc_type}Р {doc_num}"
     return enhanced_query
 def answer_question(question, query_engine, reranker):
     try:
+        log_message(f"\n{'='*70}")
         log_message(f"Query: {question}")
+        log_message(f"{'='*70}")
         enhanced_query = preprocess_query(question)
         if enhanced_query != question:
             log_message(f"Enhanced query: {enhanced_query}")
         retrieved = query_engine.retriever.retrieve(enhanced_query)
+        log_message(f"\n📥 INITIAL RETRIEVAL: {len(retrieved)} nodes")
+        # Detailed logging
+        doc_ids = {}
+        for n in retrieved:
+            doc_id = n.metadata.get('document_id', 'unknown')
+            if doc_id not in doc_ids:
+                doc_ids[doc_id] = {'tables': [], 'text': 0, 'images': 0}
+            if n.metadata.get('type') == 'table':
+                table_id = n.metadata.get('table_identifier', n.metadata.get('table_number', ''))
+                doc_ids[doc_id]['tables'].append(table_id)
+            elif n.metadata.get('type') == 'image':
+                doc_ids[doc_id]['images'] += 1
+            else:
+                doc_ids[doc_id]['text'] += 1
+        for doc_id, counts in doc_ids.items():
+            log_message(f"  📄 {doc_id}:")
+            if counts['tables']:
+                log_message(f"    Tables: {', '.join(set(counts['tables']))}")
+            if counts['text']:
+                log_message(f"    Text chunks: {counts['text']}")
+            if counts['images']:
+                log_message(f"    Images: {counts['images']}")
         reranked = rerank_nodes(question, retrieved, reranker, top_k=25)
+        log_message(f"\n🔄 AFTER RERANKING: {len(reranked)} nodes")
+        # Detailed reranking results
+        doc_ids_reranked = {}
         for n in reranked:
+            doc_id = n.metadata.get('document_id', 'unknown')
+            if doc_id not in doc_ids_reranked:
+                doc_ids_reranked[doc_id] = {'tables': [], 'text': 0, 'images': 0}
+            if n.metadata.get('type') == 'table':
+                table_id = n.metadata.get('table_identifier', n.metadata.get('table_number', ''))
+                doc_ids_reranked[doc_id]['tables'].append(table_id)
+            elif n.metadata.get('type') == 'image':
+                doc_ids_reranked[doc_id]['images'] += 1
             else:
+                doc_ids_reranked[doc_id]['text'] += 1
+        for doc_id, counts in doc_ids_reranked.items():
+            log_message(f"  📄 {doc_id}:")
+            if counts['tables']:
+                log_message(f"    Tables: {', '.join(set(counts['tables']))}")
+            if counts['text']:
+                log_message(f"    Text chunks: {counts['text']}")
+            if counts['images']:
+                log_message(f"    Images: {counts['images']}")
+        context = "\n\n" + ("="*70 + "\n\n").join(doc_ids_reranked)
         prompt = f"""Ты эксперт по технической документации.