Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Sep 29, 2025

Commit

6562b97

1 Parent(s): 4ce52d0

fix returns a tuple (major, minor) instead of a float.

Browse files

Files changed (1) hide show

table_prep.py +86 -150

table_prep.py CHANGED Viewed

@@ -4,80 +4,6 @@ from huggingface_hub import hf_hub_download, list_repo_files
 from llama_index.core import Document
 from my_logging import log_message
-# Custom table configurations
-CUSTOM_TABLE_CONFIGS = {
-    "НП-104-18": {
-        "tables": {}  # Add specific tables here if needed
-    },
-    "НП-105-18": {
-        "tables": {
-            "№ 4.8": {"method": "group_entire_table"}
-        }
-    },
-    "ГОСТ Р 50.05.23-2020": {
-        "tables": {
-            "№8": {"method": "group_entire_table"}
-        }
-    },
-    "ГОСТ Р 50.03.01-2017": {
-        "tables": {
-            "А.8": {"method": "group_entire_table"}
-        }
-    }
-}
-def create_meta_info(document_name, section, table_number, table_title):
-    """Create metadata information for table"""
-    meta = f"Таблица: {table_number}\n"
-    meta += f"Название: {table_title}\n"
-    meta += f"Документ: {document_name}\n"
-    meta += f"Раздел: {section}\n"
-    return meta
-def create_chunk_text(meta_info, headers, rows):
-    """Create formatted text from table data"""
-    chunk_text = meta_info
-    if headers:
-        chunk_text += f"\nЗаголовки: {' | '.join(headers)}\n"
-    chunk_text += "\nДанные таблицы:\n"
-    for row_idx, row in enumerate(rows, start=1):
-        if isinstance(row, dict):
-            row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
-            chunk_text += f"Строка {row_idx}: {row_text}\n"
-    return chunk_text
-def group_entire_table_method(table_data, document_name):
-    """Group entire table as one chunk"""
-    headers = table_data.get("headers", [])
-    rows = table_data.get("data", [])
-    section = table_data.get("section", "")
-    table_number = table_data.get("table_number", "")
-    table_title = table_data.get("table_title", "")
-    meta_info = create_meta_info(document_name, section, table_number, table_title)
-    chunk_text = create_chunk_text(meta_info, headers, rows)
-    doc = Document(
-        text=chunk_text,
-        metadata={
-            "type": "table",
-            "table_number": table_number,
-            "table_title": table_title,
-            "document_id": document_name,
-            "section": section,
-            "section_id": section,
-            "total_rows": len(rows),
-            "processing_method": "group_entire_table",
-            "content_size": len(chunk_text)
-        }
-    )
-    log_message(f"✓ GROUPED ENTIRE TABLE: {table_number}, rows: {len(rows)}, size: {len(chunk_text)} символов")
-    return [doc]
 def create_table_content(table_data):
     """Create formatted content from table data"""
     doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
@@ -104,66 +30,41 @@ def create_table_content(table_data):
     return content
 def table_to_document(table_data, document_id=None):
-    """Convert table data to a single Document with custom processing support"""
     if not isinstance(table_data, dict):
         return []
     doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))
     table_num = table_data.get('table_number', 'Неизвестно')
     table_title = table_data.get('table_title', 'Неизвестно')
     section = table_data.get('section', 'Неизвестно')
-    # Check for custom processing
-    if doc_id in CUSTOM_TABLE_CONFIGS:
-        doc_config = CUSTOM_TABLE_CONFIGS[doc_id]
-        if table_num in doc_config.get("tables", {}):
-            method = doc_config["tables"][table_num].get("method")
-            if method == "group_entire_table":
-                return group_entire_table_method(table_data, doc_id)
-    # Default processing
     content = create_table_content(table_data)
     content_size = len(content)
     row_count = len(table_data.get('data', [])) if 'data' in table_data else 0
     log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
                 f"Размер: {content_size} символов | Строк: {row_count}")
     return [Document(
         text=content,
-        metadata={
-            "type": "table",
-            "table_number": table_num,
-            "table_title": table_title,
-            "document_id": doc_id,
-            "section": section,
-            "section_id": section,
-            "total_rows": row_count,
-            "content_size": content_size
-        }
     )]
-def extract_table_number(table_number_str):
-    """Extract numeric value from table number for sorting"""
-    import re
-    if not table_number_str:
-        return 0
-    # Remove "№" and whitespace
-    cleaned = str(table_number_str).replace('№', '').strip()
-    # Try to extract the numeric part (handles formats like "9.1", "9.30", "А.8")
-    match = re.search(r'(\d+)\.?(\d*)', cleaned)
-    if match:
-        major = int(match.group(1))
-        minor = int(match.group(2)) if match.group(2) else 0
-        # Create sortable number: major * 1000 + minor
-        # This ensures 9.2 comes before 9.30
-        return major * 1000 + minor
-    # If no numbers found, try alphabetic sorting
-    return hash(cleaned)
 def load_table_data(repo_id, hf_token, table_data_dir):
     log_message("=" * 60)
     log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
@@ -179,7 +80,7 @@ def load_table_data(repo_id, hf_token, table_data_dir):
         stats = {
             'total_tables': 0,
             'total_size': 0,
-            'by_document': defaultdict(lambda: {'count': 0, 'size': 0, 'tables': []})
         }
         for file_path in table_files:
@@ -201,11 +102,9 @@ def load_table_data(repo_id, hf_token, table_data_dir):
                         document_id = table_data.get('document', 'unknown')
                         if 'sheets' in table_data:
-                            # Sort sheets by table_number
                             sorted_sheets = sorted(
                                 table_data['sheets'],
-                                key=lambda x: extract_table_number(x.get('table_number', ''))
-                            )
                             for sheet in sorted_sheets:
                                 sheet['document'] = document_id
@@ -213,46 +112,22 @@ def load_table_data(repo_id, hf_token, table_data_dir):
                                 table_documents.extend(docs_list)
                                 for doc in docs_list:
-                                    table_num = doc.metadata.get('table_number', '')
                                     stats['total_tables'] += 1
                                     size = doc.metadata.get('content_size', 0)
                                     stats['total_size'] += size
                                     stats['by_document'][document_id]['count'] += 1
                                     stats['by_document'][document_id]['size'] += size
-                                    stats['by_document'][document_id]['tables'].append(table_num)
                         else:
                             docs_list = table_to_document(table_data, document_id)
                             table_documents.extend(docs_list)
                             for doc in docs_list:
-                                table_num = doc.metadata.get('table_number', '')
                                 stats['total_tables'] += 1
                                 size = doc.metadata.get('content_size', 0)
                                 stats['total_size'] += size
                                 stats['by_document'][document_id]['count'] += 1
                                 stats['by_document'][document_id]['size'] += size
-                                stats['by_document'][document_id]['tables'].append(table_num)
-                    elif isinstance(table_data, list):
-                        # Sort list by table_number
-                        sorted_tables = sorted(
-                            table_data,
-                            key=lambda x: extract_table_number(x.get('table_number', ''))
-                        )
-                        for table_json in sorted_tables:
-                            docs_list = table_to_document(table_json)
-                            table_documents.extend(docs_list)
-                            for doc in docs_list:
-                                doc_id = doc.metadata.get('document_id', 'unknown')
-                                table_num = doc.metadata.get('table_number', '')
-                                stats['total_tables'] += 1
-                                size = doc.metadata.get('content_size', 0)
-                                stats['total_size'] += size
-                                stats['by_document'][doc_id]['count'] += 1
-                                stats['by_document'][doc_id]['size'] += size
-                                stats['by_document'][doc_id]['tables'].append(table_num)
             except Exception as e:
                 log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
@@ -270,8 +145,6 @@ def load_table_data(repo_id, hf_token, table_data_dir):
         for doc_id, doc_stats in sorted(stats['by_document'].items()):
             log_message(f"  • {doc_id}: {doc_stats['count']} таблиц, "
                        f"{doc_stats['size']:,} символов")
-            log_message(f"    Таблицы: {', '.join(doc_stats['tables'][:10])}"
-                       f"{'...' if len(doc_stats['tables']) > 10 else ''}")
         log_message("=" * 60)
@@ -279,4 +152,67 @@ def load_table_data(repo_id, hf_token, table_data_dir):
     except Exception as e:
         log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
-        return []

 from llama_index.core import Document
 from my_logging import log_message
 def create_table_content(table_data):
     """Create formatted content from table data"""
     doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
     return content
 def table_to_document(table_data, document_id=None):
+    """Convert table data to a single Document"""
     if not isinstance(table_data, dict):
         return []
     doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))
     table_num = table_data.get('table_number', 'Неизвестно')
     table_title = table_data.get('table_title', 'Неизвестно')
     section = table_data.get('section', 'Неизвестно')
+    sheet_name = table_data.get('sheet_name', None)  # <-- add this
     content = create_table_content(table_data)
     content_size = len(content)
     row_count = len(table_data.get('data', [])) if 'data' in table_data else 0
     log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
                 f"Размер: {content_size} символов | Строк: {row_count}")
+    metadata = {
+        "type": "table",
+        "table_number": table_num,
+        "table_title": table_title,
+        "document_id": doc_id,
+        "section": section,
+        "section_id": section,
+        "total_rows": row_count,
+        "content_size": content_size
+    }
+    if sheet_name:
+        metadata["sheet_name"] = sheet_name  # <-- add this
     return [Document(
         text=content,
+        metadata=metadata
     )]
 def load_table_data(repo_id, hf_token, table_data_dir):
     log_message("=" * 60)
     log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
         stats = {
             'total_tables': 0,
             'total_size': 0,
+            'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
         }
         for file_path in table_files:
                         document_id = table_data.get('document', 'unknown')
                         if 'sheets' in table_data:
                             sorted_sheets = sorted(
                                 table_data['sheets'],
+                                )
                             for sheet in sorted_sheets:
                                 sheet['document'] = document_id
                                 table_documents.extend(docs_list)
                                 for doc in docs_list:
                                     stats['total_tables'] += 1
                                     size = doc.metadata.get('content_size', 0)
                                     stats['total_size'] += size
                                     stats['by_document'][document_id]['count'] += 1
                                     stats['by_document'][document_id]['size'] += size
                         else:
                             docs_list = table_to_document(table_data, document_id)
                             table_documents.extend(docs_list)
                             for doc in docs_list:
                                 stats['total_tables'] += 1
                                 size = doc.metadata.get('content_size', 0)
                                 stats['total_size'] += size
                                 stats['by_document'][document_id]['count'] += 1
                                 stats['by_document'][document_id]['size'] += size
             except Exception as e:
                 log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
         for doc_id, doc_stats in sorted(stats['by_document'].items()):
             log_message(f"  • {doc_id}: {doc_stats['count']} таблиц, "
                        f"{doc_stats['size']:,} символов")
         log_message("=" * 60)
     except Exception as e:
         log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
+        return []
# Per-document overrides: document id -> {"tables": {table number -> options}}.
# Every table listed here is marked for the "group_entire_table" method.
CUSTOM_TABLE_CONFIGS = {
    document: {"tables": {table: {"method": "group_entire_table"}}}
    for document, table in (
        ("НП-105-18", "№ 4.8"),
        ("ГОСТ Р 50.05.23-2020", "№8"),
        ("ГОСТ Р 50.03.01-2017", "А.8"),
    )
}
def create_meta_info(document_name, section, table_number, table_title):
    """Return the four-line text header that identifies a table.

    The lines appear in the order document, section, table number and
    table title; each one is terminated by a newline.
    """
    labelled_fields = (
        ("Документ", document_name),
        ("Раздел", section),
        ("Таблица", table_number),
        ("Название таблицы", table_title),
    )
    return "".join(f"{label}: {value}\n" for label, value in labelled_fields)
def create_chunk_text(meta_info, headers, rows):
    """Render table metadata, headers and rows as one text chunk.

    Args:
        meta_info: Text placed after the ``Meta:`` label.
        headers: Iterable of column headers (any type, converted with
            ``str``); ``None`` is treated as no headers.
        rows: Iterable of rows; ``None`` is treated as no rows. A dict row
            is rendered as ``key: value`` pairs, a list/tuple row as its
            cells, and anything else as ``str(row)``.

    Returns:
        The chunk text: a ``Meta:`` line, a ``Headers:`` line and a
        ``Rows:`` section with one line per row.
    """
    header_line = ", ".join(map(str, headers or []))

    row_lines = []
    for row in rows or []:
        if isinstance(row, dict):
            # Iterating a dict yields only its keys, so join the items
            # explicitly to keep the cell values in the chunk.
            row_lines.append("; ".join(f"{key}: {value}" for key, value in row.items()))
        elif isinstance(row, (list, tuple)):
            row_lines.append("; ".join(map(str, row)))
        else:
            # A scalar (e.g. a plain string) is one cell; do not split it
            # into characters.
            row_lines.append(str(row))

    return f"Meta: {meta_info}\nHeaders: {header_line}\nRows:\n" + "\n".join(row_lines)
def group_entire_table_method(table_data, document_name):
    """Build a single ``Document`` that holds the whole table as one chunk.

    Args:
        table_data: Table record (dict). Keys read here: ``headers``,
            ``data``, ``section``, ``table_number``, ``table_title`` and
            ``sheet_name``.
        document_name: Document identifier; used in the chunk text and
            stored in the metadata as ``document_id``.

    Returns:
        A one-element list containing the ``Document``.
    """
    # Missing or null headers/data are treated as empty so the text builder
    # always receives iterables.
    headers = table_data.get("headers") or []
    rows = table_data.get("data") or []
    section = table_data.get("section", "")
    table_number = table_data.get("table_number", "")
    table_title = table_data.get("table_title", "")
    sheet_name = table_data.get("sheet_name")

    meta_info = create_meta_info(document_name, section, table_number, table_title)
    chunk_text = create_chunk_text(meta_info, headers, rows)

    metadata = {
        "type": "table",
        "table_number": table_number,
        "table_title": table_title,
        "document_id": document_name,
        "section": section,
        "section_id": section,
        "total_rows": len(rows),
        "processing_method": "group_entire_table",
        # load_table_data sums ``content_size`` per document; without it a
        # grouped table would be counted as 0 characters.
        "content_size": len(chunk_text),
    }
    # The sheet name is optional, so it is recorded only when present.
    if sheet_name:
        metadata["sheet_name"] = sheet_name

    doc = Document(text=chunk_text, metadata=metadata)
    log_message(f"Grouped entire table {table_number}, rows: {len(rows)}, length: {len(chunk_text)}")
    return [doc]