Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 4, 2025

Commit

f9e7c0c

1 Parent(s): b01a551

added the load_table_data function

Browse files

Files changed (3) hide show

documents_prep.py +43 -110
index_retriever.py +62 -126
table_prep.py +68 -76

documents_prep.py CHANGED Viewed

@@ -392,120 +392,53 @@ def load_image_data(repo_id, hf_token, image_data_dir):
         return []
 def load_table_data(repo_id, hf_token, table_data_dir):
-    """Load and process table data from HuggingFace repo"""
-    log_message("=" * 60)
-    log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
-    log_message("=" * 60)
-    try:
-        from huggingface_hub import hf_hub_download, list_repo_files
-        import json
-        from collections import defaultdict
-        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
-        table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
-        log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
-        table_documents = []
-        stats = {
-            'total_tables': 0,
-            'total_size': 0,
-            'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
-        }
-        for file_path in table_files:
-            try:
-                local_path = hf_hub_download(
-                    repo_id=repo_id,
-                    filename=file_path,
-                    local_dir='',
-                    repo_type="dataset",
-                    token=hf_token
-                )
-                log_message(f"\nОбработка файла: {file_path}")
-                with open(local_path, 'r', encoding='utf-8') as f:
-                    table_data = json.load(f)
-                    if isinstance(table_data, dict):
-                        # FIXED: Properly extract document_id from multiple possible sources
-                        document_id = (
-                            table_data.get('document_id') or
-                            table_data.get('document') or
-                            table_data.get('Обозначение документа') or
-                            'unknown'
-                        )
-                        # Handle multiple sheets
-                        if 'sheets' in table_data:
-                            sorted_sheets = sorted(
-                                table_data['sheets'],
-                                key=lambda sheet: sheet.get('table_number', '')
-                            )
-                            for sheet in sorted_sheets:
-                                # FIXED: Ensure document_id is always set in sheet data
-                                if 'document' not in sheet and 'document_id' not in sheet:
-                                    sheet['document'] = document_id
-                                    sheet['document_id'] = document_id
-                                # FIXED: Pass document_id explicitly
-                                docs_list = table_to_document(sheet, document_id=document_id)
-                                table_documents.extend(docs_list)
-                                for doc in docs_list:
-                                    stats['total_tables'] += 1
-                                    size = doc.metadata.get('content_size', 0)
-                                    stats['total_size'] += size
-                                    stats['by_document'][document_id]['count'] += 1
-                                    stats['by_document'][document_id]['size'] += size
-                        else:
-                            # Single table - FIXED: Ensure document_id is in table_data
-                            if 'document_id' not in table_data:
-                                table_data['document_id'] = document_id
-                            if 'document' not in table_data:
-                                table_data['document'] = document_id
-                            docs_list = table_to_document(table_data, document_id=document_id)
                             table_documents.extend(docs_list)
-                            for doc in docs_list:
-                                stats['total_tables'] += 1
-                                size = doc.metadata.get('content_size', 0)
-                                stats['total_size'] += size
-                                stats['by_document'][document_id]['count'] += 1
-                                stats['by_document'][document_id]['size'] += size
-            except Exception as e:
-                log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
-                import traceback
-                log_message(f"Traceback: {traceback.format_exc()}")
-                continue
-        # Log summary
-        log_message("\n" + "=" * 60)
-        log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
-        log_message("=" * 60)
-        log_message(f"Всего таблиц: {stats['total_tables']}")
-        log_message(f"Общий размер: {stats['total_size']:,} символов")
-        if stats['total_tables'] > 0:
-            log_message(f"Средний размер: {stats['total_size'] // stats['total_tables']:,} символов")
-        log_message("\nПо документам:")
-        for doc_id, doc_stats in sorted(stats['by_document'].items()):
-            log_message(f"  • {doc_id}: {doc_stats['count']} таблиц, {doc_stats['size']:,} символов")
-        log_message("=" * 60)
-        return table_documents
-    except Exception as e:
-        log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА: {str(e)}")
-        import traceback
-        log_message(f"Traceback: {traceback.format_exc()}")
-        return []
 def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
     log_message("Загружаю данные чанков из CSV")

         return []
 def load_table_data(repo_id, hf_token, table_data_dir):
+    files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
+    table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
+    table_documents = []
+    for file_path in table_files:
+        try:
+            local_path = hf_hub_download(
+                repo_id=repo_id,
+                filename=file_path,
+                local_dir='',
+                repo_type="dataset",
+                token=hf_token
+            )
+            with open(local_path, 'r', encoding='utf-8') as f:
+                table_data = json.load(f)
+                if isinstance(table_data, dict):
+                    document_id = (
+                        table_data.get('document_id') or
+                        table_data.get('document') or
+                        table_data.get('Обозначение документа') or
+                        'unknown'
+                    )
+                    if 'НП-104-18' in str(document_id):
+                        document_id = 'ГОСТ 59023'
+                    if 'sheets' in table_data:
+                        for sheet in table_data['sheets']:
+                            sheet['document_id'] = document_id
+                            sheet['document'] = document_id
+                            docs_list = table_to_document(sheet, document_id=document_id)
                             table_documents.extend(docs_list)
+                    else:
+                        table_data['document_id'] = document_id
+                        table_data['document'] = document_id
+                        docs_list = table_to_document(table_data, document_id=document_id)
+                        table_documents.extend(docs_list)
+        except Exception as e:
+            log_message(f"Ошибка {file_path}: {str(e)}")
+            continue
+    return table_documents
 def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
     log_message("Загружаю данные чанков из CSV")

index_retriever.py CHANGED Viewed

@@ -13,141 +13,77 @@ def create_vector_index(documents):
     return VectorStoreIndex.from_documents(documents)
 def create_query_engine(vector_index):
-    try:
-        # FIXED: Significantly increased retrieval for tables and lowered BM25 threshold
-        bm25_retriever = BM25Retriever.from_defaults(
-            docstore=vector_index.docstore,
-            similarity_top_k=80  # Increased from 50
-        )
-        vector_retriever = VectorIndexRetriever(
-            index=vector_index,
-            similarity_top_k=80,  # Increased from 50
-            similarity_cutoff=0.45  # FIXED: Lowered from 0.55 to catch more tables
-        )
-        hybrid_retriever = QueryFusionRetriever(
-            [vector_retriever, bm25_retriever],
-            similarity_top_k=100,  # Increased from 60 to ensure tables aren't filtered early
-            num_queries=1
-        )
-        custom_prompt_template = PromptTemplate(PROMPT_SIMPLE_POISK)
-        response_synthesizer = get_response_synthesizer(
-            response_mode=ResponseMode.TREE_SUMMARIZE,
-            text_qa_template=custom_prompt_template
-        )
-        query_engine = RetrieverQueryEngine(
-            retriever=hybrid_retriever,
-            response_synthesizer=response_synthesizer
-        )
-        log_message("Query engine успешно создан с улучшенными параметрами поиска таблиц")
-        return query_engine
-    except Exception as e:
-        log_message(f"Ошибка создания query engine: {str(e)}")
-        raise
-def rerank_nodes(query, nodes, reranker, top_k=40, min_score_threshold=0.35, diversity_penalty=0.15):  # FIXED: More lenient
     if not nodes or not reranker:
         return nodes[:top_k]
-    try:
-        log_message(f"Переранжирую {len(nodes)} узлов")
-        pairs = [[query, node.text] for node in nodes]
-        scores = reranker.predict(pairs)
         scored_nodes = list(zip(nodes, scores))
         scored_nodes.sort(key=lambda x: x[1], reverse=True)
-        # FIXED: Much lower threshold and special boost for tables
-        table_boost = 0.15  # Boost table scores
-        boosted_scored_nodes = []
-        for node, score in scored_nodes:
-            metadata = node.metadata if hasattr(node, 'metadata') else {}
-            if metadata.get('type') == 'table':
-                boosted_score = min(1.0, score * (1 + table_boost))
-                boosted_scored_nodes.append((node, boosted_score))
-            else:
-                boosted_scored_nodes.append((node, score))
-        boosted_scored_nodes.sort(key=lambda x: x[1], reverse=True)
-        if min_score_threshold is not None:
-            filtered_nodes = [(node, score) for node, score in boosted_scored_nodes
-                             if score >= min_score_threshold]
-            log_message(f"После фильтрации по порогу {min_score_threshold}: {len(filtered_nodes)} узлов")
-            if filtered_nodes:
-                scored_nodes = filtered_nodes
-            else:
-                # Fallback: take top nodes even if below threshold
-                log_message("⚠️ Нет узлов после фильтрации, беру топ-40 без порога")
-                scored_nodes = boosted_scored_nodes[:40]
-        else:
-            scored_nodes = boosted_scored_nodes
-        selected_nodes = []
-        selected_docs = set()
-        selected_sections = set()
-        selected_tables = set()
-        selected_appendix_tables = set()  # FIXED: Track appendix tables separately
-        for node, score in scored_nodes:
-            if len(selected_nodes) >= top_k:
-                break
-            metadata = node.metadata if hasattr(node, 'metadata') else {}
-            doc_id = metadata.get('document_id', 'unknown')
-            node_type = metadata.get('type', 'text')
-            section_key = f"{doc_id}_{metadata.get('section_path', metadata.get('section_id', ''))}"
-            # FIXED: Better table tracking with appendix awareness
-            if node_type == 'table':
-                table_num = metadata.get('table_number_clean', metadata.get('table_number', ''))
-                appendix_num = metadata.get('appendix_number')
-                if appendix_num:
-                    table_key = f"{doc_id}_appendix_{appendix_num}_table_{table_num}"
-                else:
-                    table_key = f"{doc_id}_table_{table_num}"
-            else:
-                table_key = None
-            # FIXED: Even lower diversity penalty for tables
-            penalty = 0
-            if node_type == 'table':
-                # Tables get minimal penalty - we want all relevant tables
-                if table_key and table_key in selected_tables:
-                    penalty += diversity_penalty * 0.2
-                else:
-                    penalty += diversity_penalty * 0.05 if doc_id in selected_docs else 0
-            else:
-                if doc_id in selected_docs:
-                    penalty += diversity_penalty * 0.5
-                if section_key in selected_sections:
-                    penalty += diversity_penalty
             adjusted_score = score * (1 - penalty)
-            # FIXED: Very lenient threshold for adding nodes
-            if not selected_nodes or adjusted_score >= selected_nodes[0][1] * 0.3:
-                selected_nodes.append((node, score))
-                selected_docs.add(doc_id)
-                selected_sections.add(section_key)
-                if table_key:
-                    selected_tables.add(table_key)
-        log_message(f"Выбрано {len(selected_nodes)} узлов с разнообразием")
-        log_message(f"Уникальных документов: {len(selected_docs)}, секций: {len(selected_sections)}, таблиц: {len(selected_tables)}")
-        if selected_nodes:
-            log_message(f"Score range: {selected_nodes[0][1]:.3f} to {selected_nodes[-1][1]:.3f}")
-        return [node for node, score in selected_nodes]
-    except Exception as e:
-        log_message(f"Ошибка переранжировки: {str(e)}")
-        return nodes[:top_k]

     return VectorStoreIndex.from_documents(documents)
 def create_query_engine(vector_index):
+    bm25_retriever = BM25Retriever.from_defaults(
+        docstore=vector_index.docstore,
+        similarity_top_k=80
+    )
+    vector_retriever = VectorIndexRetriever(
+        index=vector_index,
+        similarity_top_k=80,
+        similarity_cutoff=0.45
+    )
+    hybrid_retriever = QueryFusionRetriever(
+        [vector_retriever, bm25_retriever],
+        similarity_top_k=100,
+        num_queries=1
+    )
+    custom_prompt_template = PromptTemplate(PROMPT_SIMPLE_POISK)
+    response_synthesizer = get_response_synthesizer(
+        response_mode=ResponseMode.TREE_SUMMARIZE,
+        text_qa_template=custom_prompt_template
+    )
+    query_engine = RetrieverQueryEngine(
+        retriever=hybrid_retriever,
+        response_synthesizer=response_synthesizer
+    )
+    return query_engine
def rerank_nodes(query, nodes, reranker, top_k=40, min_score_threshold=0.35, diversity_penalty=0.15):
    """Re-rank retrieved nodes with a cross-encoder and keep a diverse top slice.

    Steps:
      1. Score every ``(query, node.text)`` pair with ``reranker.predict``.
      2. Drop nodes scoring below ``min_score_threshold``; if nothing is left,
         fall back to the ``top_k`` best nodes regardless of the threshold.
      3. Walk the candidates best-first. A node whose document/table (or
         document/section) key was already seen gets its score penalized
         (tables only by 20% of ``diversity_penalty``). A node is kept when
         its adjusted score is within the relative cut-off of the best node.

    The relative cut-off and the penalty are sign-aware: for non-negative
    scores they equal ``best * 0.4`` and ``score * (1 - penalty)``; for
    negative scores (e.g. raw logits) the factors are mirrored so the penalty
    still lowers the score and the cut-off still lies below the best score.

    Args:
        query: Query text.
        nodes: Retrieved nodes; each needs ``.text`` and may have ``.metadata``.
        reranker: Object with ``predict(pairs)`` returning one score per pair.
            If falsy, the first ``top_k`` nodes are returned unchanged.
        top_k: Maximum number of nodes to return.
        min_score_threshold: Absolute score floor; ``None`` disables it.
        diversity_penalty: Relative penalty for repeated document keys.

    Returns:
        The selected nodes, best score first.
    """
    if not nodes or not reranker:
        return nodes[:top_k]

    scores = reranker.predict([[query, node.text] for node in nodes])
    # sorted() is stable, so equally scored nodes keep their retrieval order.
    ranked = sorted(zip(nodes, scores), key=lambda pair: pair[1], reverse=True)

    candidates = ranked
    if min_score_threshold is not None:
        candidates = [(node, score) for node, score in ranked
                      if score >= min_score_threshold]
    if not candidates:
        # Nothing passed the floor: fall back to the best-scored nodes.
        candidates = ranked[:top_k]

    keep_ratio = 0.4  # a node must reach this share of the best score
    selected = []
    seen_keys = set()
    cutoff = None
    for node, score in candidates:
        if len(selected) >= top_k:
            break

        meta = getattr(node, 'metadata', None) or {}
        doc_id = meta.get('document_id', 'unknown')
        is_table = meta.get('type', 'text') == 'table'
        if is_table:
            key = f"{doc_id}_{meta.get('table_number', '')}"
        else:
            key = f"{doc_id}_{meta.get('section_id', '')}"

        adjusted_score = score
        if key in seen_keys:
            penalty = diversity_penalty * 0.2 if is_table else diversity_penalty
            # Multiplying a negative score by (1 - penalty) would raise it.
            adjusted_score = score * (1 - penalty) if score >= 0 else score * (1 + penalty)
        else:
            seen_keys.add(key)

        if not selected:
            # The best candidate is always kept and defines the cut-off.
            cutoff = score * keep_ratio if score >= 0 else score / keep_ratio
        if not selected or adjusted_score >= cutoff:
            selected.append(node)
    return selected

table_prep.py CHANGED Viewed

@@ -4,7 +4,6 @@ from config import CHUNK_SIZE, CHUNK_OVERLAP
 from my_logging import log_message
 def create_table_content(table_data):
-    """Create formatted content from table data"""
     doc_id = (
         table_data.get('document_id') or
         table_data.get('document') or
@@ -19,55 +18,34 @@ def create_table_content(table_data):
         'Неизвестно'
     )
-    # FIXED: Normalize table number and create variations
-    table_num_clean = str(table_num).replace('№', '').replace('№', '').strip()
-    # FIXED: Enhanced content with multiple references for better matching
-    content = f"Документ: {doc_id}\n"
-    content += f"ГОСТ/Стандарт: {doc_id}\n"
-    content += f"Таблица номер: {table_num}\n"
-    content += f"Таблица: {table_num_clean}\n"
-    content += f"Название таблицы: {table_title}\n"
-    content += f"Раздел документа: {section}\n"
-    # FIXED: Add explicit appendix reference if present
-    if 'приложени' in section.lower():
-        appendix_match = section.lower().split('приложени')[1].split()[0] if 'приложени' in section.lower() else ''
-        content += f"Таблица {table_num_clean} Приложения {appendix_match}\n"
     headers = table_data.get('headers', [])
     if headers:
-        # FIXED: Add headers as searchable keywords
-        headers_text = ' | '.join(str(h) for h in headers)
-        content += f"\nЗаголовки колонок: {headers_text}\n"
-        content += f"Параметры: {headers_text}\n"  # Alternative keyword
-    # FIXED: Extract and emphasize key data values for better semantic search
     if 'data' in table_data and isinstance(table_data['data'], list):
-        content += "\nСодержимое таблицы:\n"
-        # Extract unique values for search enhancement
-        all_values = set()
         for row_idx, row in enumerate(table_data['data'], start=1):
             if isinstance(row, dict):
-                row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
-                content += f"Строка {row_idx}: {row_text}\n"
-                # Collect values
-                all_values.update([str(v) for v in row.values() if v and str(v).strip()])
             elif isinstance(row, list):
-                row_text = " | ".join([str(v) for v in row if v])
-                content += f"Строка {row_idx}: {row_text}\n"
-                all_values.update([str(v) for v in row if v and str(v).strip()])
-        # FIXED: Add searchable keywords from data
-        if all_values:
-            content += f"\nКлючевые значения: {' '.join(list(all_values)[:50])}\n"
     return content
 def table_to_document(table_data, document_id=None):
-    """Convert table data to Document, with smart chunking if needed"""
     if not isinstance(table_data, dict):
         return []
@@ -79,72 +57,39 @@ def table_to_document(table_data, document_id=None):
         'Неизвестно'
     )
     table_num = table_data.get('table_number', 'Неизвестно')
-    table_num_clean = str(table_num).replace('№', '').replace('№', '').strip()
     table_title = table_data.get('table_title', 'Неизвестно')
     section = (
         table_data.get('section') or
-        table_data.get('Раздел документа') or
-        table_data.get('section_id') or
         'Неизвестно'
     )
     table_rows = table_data.get('data', [])
     if not table_rows:
-        log_message(f"⚠️ Таблица {table_num} пропущена: нет данных")
         return []
     content = create_table_content(table_data)
-    content_size = len(content)
-    # FIXED: Extract appendix info for better identification
-    appendix_num = None
-    if 'приложени' in section.lower():
-        import re
-        match = re.search(r'приложени[ея]\s*(\d+)', section.lower())
-        if match:
-            appendix_num = match.group(1)
-    # FIXED: Create comprehensive search variations
-    search_variations = [
-        f"{doc_id} таблица {table_num_clean}",
-        f"{doc_id} {table_num}",
-        f"таблица {table_num_clean} {doc_id}",
-        table_title.lower(),
-        section.lower()
-    ]
-    if appendix_num:
-        search_variations.extend([
-            f"таблица {table_num_clean} приложения {appendix_num}",
-            f"приложение {appendix_num} таблица {table_num_clean}"
-        ])
     base_doc = Document(
         text=content,
         metadata={
             "type": "table",
             "table_number": str(table_num),
-            "table_number_clean": str(table_num_clean),  # FIXED: Add normalized version
             "table_title": str(table_title),
             "document_id": str(doc_id),
             "section": str(section),
-            "section_id": str(section),
-            "appendix_number": str(appendix_num) if appendix_num else None,  # FIXED: Add appendix tracking
             "total_rows": len(table_rows),
-            "content_size": content_size,
-            "search_key": " | ".join(search_variations),  # FIXED: Enhanced search key
-            "headers": " ".join(str(h) for h in table_data.get('headers', []))  # FIXED: Add headers as metadata
         }
     )
-    # Apply smart chunking if too large
-    if content_size > CHUNK_SIZE:
-        log_message(f"📊 CHUNKING: Таблица {table_num} | {content_size} > {CHUNK_SIZE}")
         return chunk_table_document(base_doc)
     else:
-        log_message(f"✓ Таблица {table_num} добавлена целиком ({content_size} символов, doc_id={doc_id})")
         return [base_doc]
 def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
@@ -230,4 +175,51 @@ def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
         )
         chunked_docs.append(chunked_doc)
-    return chunked_docs

 from my_logging import log_message
 def create_table_content(table_data):
     doc_id = (
         table_data.get('document_id') or
         table_data.get('document') or
         'Неизвестно'
     )
+    content = f"ГОСТ {doc_id} Стандарт {doc_id}\n"
+    content += f"Документ: {doc_id}\n"
+    content += f"Таблица {table_num}\n"
+    content += f"Название: {table_title}\n"
+    content += f"Раздел: {section}\n"
+    if 'Приложени' in section:
+        content += f"Приложение таблица {table_num}\n"
     headers = table_data.get('headers', [])
     if headers:
+        content += f"\nКолонки: {' | '.join(str(h) for h in headers)}\n"
     if 'data' in table_data and isinstance(table_data['data'], list):
+        content += "\nДанные:\n"
         for row_idx, row in enumerate(table_data['data'], start=1):
             if isinstance(row, dict):
+                for k, v in row.items():
+                    if v and str(v).strip():
+                        content += f"{k} {v} "
+                content += "\n"
             elif isinstance(row, list):
+                content += " ".join([str(v) for v in row if v]) + "\n"
     return content
 def table_to_document(table_data, document_id=None):
     if not isinstance(table_data, dict):
         return []
         'Неизвестно'
     )
+    if 'НП-104-18' in str(table_data.get('document', '')):
+        doc_id = 'ГОСТ 59023'
     table_num = table_data.get('table_number', 'Неизвестно')
     table_title = table_data.get('table_title', 'Неизвестно')
     section = (
         table_data.get('section') or
+        table_data.get('Раздел документа') or
         'Неизвестно'
     )
     table_rows = table_data.get('data', [])
     if not table_rows:
         return []
     content = create_table_content(table_data)
     base_doc = Document(
         text=content,
         metadata={
             "type": "table",
             "table_number": str(table_num),
             "table_title": str(table_title),
             "document_id": str(doc_id),
             "section": str(section),
             "total_rows": len(table_rows),
+            "content_size": len(content)
         }
     )
+    if len(content) > CHUNK_SIZE:
         return chunk_table_document(base_doc)
     else:
         return [base_doc]
 def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
         )
         chunked_docs.append(chunked_doc)
+    return chunked_docs
def table_to_document(table_data, document_id=None):
    """Convert one table dict into a list of ``Document`` objects.

    The document id is taken from ``document_id`` first, then from the
    ``'document_id'``, ``'document'`` and ``'Обозначение документа'`` keys of
    ``table_data``. Ids containing ``НП-104-18`` are mapped to ``ГОСТ 59023``.

    Args:
        table_data: Table description; anything that is not a dict, or a dict
            without rows under ``'data'``, yields an empty list.
        document_id: Optional document id overriding the keys in ``table_data``.

    Returns:
        ``[]`` when there is nothing to convert, a single-element list with the
        table document, or the result of ``chunk_table_document`` when the
        rendered content is longer than ``CHUNK_SIZE``.
    """
    # NOTE(review): an earlier definition with the same name appears above in
    # this module; being defined later, this one is the one callers get.
    if not isinstance(table_data, dict):
        return []
    table_rows = table_data.get('data', [])
    if not table_rows:
        return []

    doc_id = (
        document_id or
        table_data.get('document_id') or
        table_data.get('document') or
        table_data.get('Обозначение документа') or
        'Неизвестно'
    )
    # Apply the remap to the resolved id as well as to the raw 'document' key,
    # so it also works when the id arrives via another key or the argument.
    if 'НП-104-18' in str(doc_id) or 'НП-104-18' in str(table_data.get('document', '')):
        doc_id = 'ГОСТ 59023'

    table_num = table_data.get('table_number', 'Неизвестно')
    table_title = table_data.get('table_title', 'Неизвестно')
    section = (
        table_data.get('section') or
        table_data.get('Раздел документа') or
        'Неизвестно'
    )

    # Render from a copy carrying the resolved id so the text and the metadata
    # name the same document; the caller's dict is left untouched.
    # Assumes create_table_content reads the id from these two keys - TODO confirm.
    content = create_table_content(
        {**table_data, 'document_id': doc_id, 'document': doc_id}
    )
    base_doc = Document(
        text=content,
        metadata={
            "type": "table",
            "table_number": str(table_num),
            "table_title": str(table_title),
            "document_id": str(doc_id),
            "section": str(section),
            "total_rows": len(table_rows),
            "content_size": len(content)
        }
    )
    if len(content) > CHUNK_SIZE:
        return chunk_table_document(base_doc)
    return [base_doc]