MrSimple07 commited on
Commit
4ce52d0
·
1 Parent(s): 1b689ce

Fix: return a tuple (major, minor) instead of a float.

Browse files
Files changed (1) hide show
  1. table_prep.py +116 -28
table_prep.py CHANGED
@@ -4,6 +4,80 @@ from huggingface_hub import hf_hub_download, list_repo_files
4
  from llama_index.core import Document
5
  from my_logging import log_message
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  def create_table_content(table_data):
8
  """Create formatted content from table data"""
9
  doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
@@ -30,7 +104,7 @@ def create_table_content(table_data):
30
  return content
31
 
32
  def table_to_document(table_data, document_id=None):
33
- """Convert table data to a single Document"""
34
  if not isinstance(table_data, dict):
35
  return []
36
 
@@ -39,10 +113,18 @@ def table_to_document(table_data, document_id=None):
39
  table_title = table_data.get('table_title', 'Неизвестно')
40
  section = table_data.get('section', 'Неизвестно')
41
 
 
 
 
 
 
 
 
 
 
42
  content = create_table_content(table_data)
43
  content_size = len(content)
44
 
45
- # Log table addition
46
  row_count = len(table_data.get('data', [])) if 'data' in table_data else 0
47
  log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
48
  f"Размер: {content_size} символов | Строк: {row_count}")
@@ -61,6 +143,27 @@ def table_to_document(table_data, document_id=None):
61
  }
62
  )]
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  def load_table_data(repo_id, hf_token, table_data_dir):
65
  log_message("=" * 60)
66
  log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
@@ -76,7 +179,7 @@ def load_table_data(repo_id, hf_token, table_data_dir):
76
  stats = {
77
  'total_tables': 0,
78
  'total_size': 0,
79
- 'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
80
  }
81
 
82
  for file_path in table_files:
@@ -98,7 +201,7 @@ def load_table_data(repo_id, hf_token, table_data_dir):
98
  document_id = table_data.get('document', 'unknown')
99
 
100
  if 'sheets' in table_data:
101
- # Sort sheets by table_number to ensure correct order
102
  sorted_sheets = sorted(
103
  table_data['sheets'],
104
  key=lambda x: extract_table_number(x.get('table_number', ''))
@@ -110,21 +213,25 @@ def load_table_data(repo_id, hf_token, table_data_dir):
110
  table_documents.extend(docs_list)
111
 
112
  for doc in docs_list:
 
113
  stats['total_tables'] += 1
114
  size = doc.metadata.get('content_size', 0)
115
  stats['total_size'] += size
116
  stats['by_document'][document_id]['count'] += 1
117
  stats['by_document'][document_id]['size'] += size
 
118
  else:
119
  docs_list = table_to_document(table_data, document_id)
120
  table_documents.extend(docs_list)
121
 
122
  for doc in docs_list:
 
123
  stats['total_tables'] += 1
124
  size = doc.metadata.get('content_size', 0)
125
  stats['total_size'] += size
126
  stats['by_document'][document_id]['count'] += 1
127
  stats['by_document'][document_id]['size'] += size
 
128
 
129
  elif isinstance(table_data, list):
130
  # Sort list by table_number
@@ -139,11 +246,13 @@ def load_table_data(repo_id, hf_token, table_data_dir):
139
 
140
  for doc in docs_list:
141
  doc_id = doc.metadata.get('document_id', 'unknown')
 
142
  stats['total_tables'] += 1
143
  size = doc.metadata.get('content_size', 0)
144
  stats['total_size'] += size
145
  stats['by_document'][doc_id]['count'] += 1
146
  stats['by_document'][doc_id]['size'] += size
 
147
 
148
  except Exception as e:
149
  log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
@@ -161,6 +270,8 @@ def load_table_data(repo_id, hf_token, table_data_dir):
161
  for doc_id, doc_stats in sorted(stats['by_document'].items()):
162
  log_message(f" • {doc_id}: {doc_stats['count']} таблиц, "
163
  f"{doc_stats['size']:,} символов")
 
 
164
 
165
  log_message("=" * 60)
166
 
@@ -168,27 +279,4 @@ def load_table_data(repo_id, hf_token, table_data_dir):
168
 
169
  except Exception as e:
170
  log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
171
- return []
172
-
173
-
174
def extract_table_number(table_number_str):
    """Extract a sortable (major, minor) tuple from a table-number string.

    Returns (0, 0) for empty or unparseable input so sorting never raises.
    Tuples sort lexicographically, so (9, 2) correctly precedes (9, 30).
    """
    import re
    if not table_number_str:
        return (0, 0)

    # Remove "№" and every other non-digit/non-dot character
    cleaned = re.sub(r'[^0-9.]', '', str(table_number_str))

    try:
        # Split by dot to handle hierarchical numbering like "9.1"
        parts = cleaned.split('.')
        if len(parts) == 2:
            # Return tuple for proper sorting: (major, minor)
            return (int(parts[0]), int(parts[1]))
        elif len(parts) == 1 and parts[0]:
            return (int(parts[0]), 0)
        else:
            # Deeper hierarchies ("9.1.2") or all-dots input fall through here
            return (0, 0)
    except (ValueError, IndexError):
        return (0, 0)
 
4
  from llama_index.core import Document
5
  from my_logging import log_message
6
 
7
# Custom table configurations.
# Maps a document ID -> {"tables": {<table_number>: {"method": <name>}}}.
# table_to_document consults this mapping: when a table's number is listed
# with method "group_entire_table", the whole table is emitted as a single
# Document (group_entire_table_method) instead of the default processing.
CUSTOM_TABLE_CONFIGS = {
    "НП-104-18": {
        "tables": {}  # Add specific tables here if needed
    },
    "НП-105-18": {
        "tables": {
            "№ 4.8": {"method": "group_entire_table"}
        }
    },
    "ГОСТ Р 50.05.23-2020": {
        "tables": {
            "№8": {"method": "group_entire_table"}
        }
    },
    "ГОСТ Р 50.03.01-2017": {
        "tables": {
            "А.8": {"method": "group_entire_table"}
        }
    }
}
28
+
29
def create_meta_info(document_name, section, table_number, table_title):
    """Create metadata information for table.

    Returns a newline-terminated header block listing the table number,
    title, source document and section.
    """
    fields = [
        f"Таблица: {table_number}",
        f"Название: {table_title}",
        f"Документ: {document_name}",
        f"Раздел: {section}",
    ]
    return "".join(field + "\n" for field in fields)
36
+
37
def create_chunk_text(meta_info, headers, rows):
    """Create formatted text from table data.

    Appends an optional header line and one numbered line per dict row
    (falsy cell values are dropped; non-dict rows are skipped but still
    consume a row number).
    """
    parts = [meta_info]

    if headers:
        parts.append(f"\nЗаголовки: {' | '.join(headers)}\n")

    parts.append("\nДанные таблицы:\n")
    for row_no, row in enumerate(rows, start=1):
        if not isinstance(row, dict):
            continue
        cells = " | ".join(f"{key}: {val}" for key, val in row.items() if val)
        parts.append(f"Строка {row_no}: {cells}\n")

    return "".join(parts)
51
+
52
def group_entire_table_method(table_data, document_name):
    """Group entire table as one chunk.

    Builds a single Document covering every row of the table, tagged with
    processing_method "group_entire_table", and logs the addition.
    """
    rows = table_data.get("data", [])
    section = table_data.get("section", "")
    table_number = table_data.get("table_number", "")
    table_title = table_data.get("table_title", "")

    chunk_text = create_chunk_text(
        create_meta_info(document_name, section, table_number, table_title),
        table_data.get("headers", []),
        rows,
    )

    metadata = {
        "type": "table",
        "table_number": table_number,
        "table_title": table_title,
        "document_id": document_name,
        "section": section,
        "section_id": section,
        "total_rows": len(rows),
        "processing_method": "group_entire_table",
        "content_size": len(chunk_text),
    }

    log_message(f"✓ GROUPED ENTIRE TABLE: {table_number}, rows: {len(rows)}, size: {len(chunk_text)} символов")
    return [Document(text=chunk_text, metadata=metadata)]
80
+
81
  def create_table_content(table_data):
82
  """Create formatted content from table data"""
83
  doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
 
104
  return content
105
 
106
  def table_to_document(table_data, document_id=None):
107
+ """Convert table data to a single Document with custom processing support"""
108
  if not isinstance(table_data, dict):
109
  return []
110
 
 
113
  table_title = table_data.get('table_title', 'Неизвестно')
114
  section = table_data.get('section', 'Неизвестно')
115
 
116
+ # Check for custom processing
117
+ if doc_id in CUSTOM_TABLE_CONFIGS:
118
+ doc_config = CUSTOM_TABLE_CONFIGS[doc_id]
119
+ if table_num in doc_config.get("tables", {}):
120
+ method = doc_config["tables"][table_num].get("method")
121
+ if method == "group_entire_table":
122
+ return group_entire_table_method(table_data, doc_id)
123
+
124
+ # Default processing
125
  content = create_table_content(table_data)
126
  content_size = len(content)
127
 
 
128
  row_count = len(table_data.get('data', [])) if 'data' in table_data else 0
129
  log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
130
  f"Размер: {content_size} символов | Строк: {row_count}")
 
143
  }
144
  )]
145
 
146
def extract_table_number(table_number_str):
    """Convert a table-number string into a deterministic, sortable integer.

    "9.2" -> 9002, "9.30" -> 9030 (so 9.2 sorts before 9.30),
    "№ 4.8" -> 4008, "А.8" -> 8000 (first digit run wins).
    Empty/None input returns 0.  Input with no digits at all gets a
    deterministic fallback key derived from its characters.
    """
    import re

    if not table_number_str:
        return 0

    # Remove "№" and surrounding whitespace
    cleaned = str(table_number_str).replace('№', '').strip()

    # Extract the numeric part (handles formats like "9.1", "9.30", "А.8")
    match = re.search(r'(\d+)\.?(\d*)', cleaned)
    if match:
        major = int(match.group(1))
        minor = int(match.group(2)) if match.group(2) else 0
        # Sortable key: major * 1000 + minor, so 9.2 comes before 9.30.
        # NOTE(review): assumes minor < 1000 — holds for table numbering.
        return major * 1000 + minor

    # No digits at all: derive a stable key from the characters themselves.
    # The previous fallback used hash(), whose value for strings is
    # randomized per process (PYTHONHASHSEED), so sort order was not
    # reproducible between runs.
    return sum(ord(ch) for ch in cleaned)
166
+
167
  def load_table_data(repo_id, hf_token, table_data_dir):
168
  log_message("=" * 60)
169
  log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
 
179
  stats = {
180
  'total_tables': 0,
181
  'total_size': 0,
182
+ 'by_document': defaultdict(lambda: {'count': 0, 'size': 0, 'tables': []})
183
  }
184
 
185
  for file_path in table_files:
 
201
  document_id = table_data.get('document', 'unknown')
202
 
203
  if 'sheets' in table_data:
204
+ # Sort sheets by table_number
205
  sorted_sheets = sorted(
206
  table_data['sheets'],
207
  key=lambda x: extract_table_number(x.get('table_number', ''))
 
213
  table_documents.extend(docs_list)
214
 
215
  for doc in docs_list:
216
+ table_num = doc.metadata.get('table_number', '')
217
  stats['total_tables'] += 1
218
  size = doc.metadata.get('content_size', 0)
219
  stats['total_size'] += size
220
  stats['by_document'][document_id]['count'] += 1
221
  stats['by_document'][document_id]['size'] += size
222
+ stats['by_document'][document_id]['tables'].append(table_num)
223
  else:
224
  docs_list = table_to_document(table_data, document_id)
225
  table_documents.extend(docs_list)
226
 
227
  for doc in docs_list:
228
+ table_num = doc.metadata.get('table_number', '')
229
  stats['total_tables'] += 1
230
  size = doc.metadata.get('content_size', 0)
231
  stats['total_size'] += size
232
  stats['by_document'][document_id]['count'] += 1
233
  stats['by_document'][document_id]['size'] += size
234
+ stats['by_document'][document_id]['tables'].append(table_num)
235
 
236
  elif isinstance(table_data, list):
237
  # Sort list by table_number
 
246
 
247
  for doc in docs_list:
248
  doc_id = doc.metadata.get('document_id', 'unknown')
249
+ table_num = doc.metadata.get('table_number', '')
250
  stats['total_tables'] += 1
251
  size = doc.metadata.get('content_size', 0)
252
  stats['total_size'] += size
253
  stats['by_document'][doc_id]['count'] += 1
254
  stats['by_document'][doc_id]['size'] += size
255
+ stats['by_document'][doc_id]['tables'].append(table_num)
256
 
257
  except Exception as e:
258
  log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
 
270
  for doc_id, doc_stats in sorted(stats['by_document'].items()):
271
  log_message(f" • {doc_id}: {doc_stats['count']} таблиц, "
272
  f"{doc_stats['size']:,} символов")
273
+ log_message(f" Таблицы: {', '.join(doc_stats['tables'][:10])}"
274
+ f"{'...' if len(doc_stats['tables']) > 10 else ''}")
275
 
276
  log_message("=" * 60)
277
 
 
279
 
280
  except Exception as e:
281
  log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
282
+ return []