Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Community

MrSimple07 committed on Sep 29, 2025

Commit

0067c9d

1 Parent(s): 7dcc6c5

Removed all custom table configurations (CUSTOM_TABLE_CONFIGS)

Browse files

Files changed (2) hide show

documents_prep.py +18 -38
table_prep.py +81 -179

documents_prep.py CHANGED Viewed

@@ -44,7 +44,6 @@ def process_documents_with_chunking(documents):
     all_chunked_docs = []
     chunk_info = []
     table_count = 0
-    table_added_count = 0
     image_count = 0
     text_chunks_count = 0
@@ -52,44 +51,19 @@ def process_documents_with_chunking(documents):
         doc_type = doc.metadata.get('type', 'text')
         if doc_type == 'table':
             table_count += 1
-            doc_size = len(doc.text)
-            # Log table size
-            table_num = doc.metadata.get('table_number', 'unknown')
-            doc_id = doc.metadata.get('document_id', 'unknown')
-            log_message(f"Таблица {table_num} в документе {doc_id}: размер {doc_size} символов")
-            # Always add table regardless of size or custom config
-            if doc_size > CHUNK_SIZE:
-                chunked_docs = chunk_document(doc)
-                all_chunked_docs.extend(chunked_docs)
-                table_added_count += len(chunked_docs)
-                for i, chunk_doc in enumerate(chunked_docs):
-                    chunk_info.append({
-                        'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
-                        'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
-                        'chunk_id': i,
-                        'chunk_size': len(chunk_doc.text),
-                        'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
-                        'type': 'table',
-                        'table_number': chunk_doc.metadata.get('table_number', 'unknown'),
-                        'processing_method': 'chunked'
-                    })
-            else:
-                all_chunked_docs.append(doc)
-                table_added_count += 1
-                chunk_info.append({
-                    'document_id': doc.metadata.get('document_id', 'unknown'),
-                    'section_id': doc.metadata.get('section_id', 'unknown'),
-                    'chunk_id': 0,
-                    'chunk_size': doc_size,
-                    'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
-                    'type': 'table',
-                    'table_number': doc.metadata.get('table_number', 'unknown'),
-                    'processing_method': 'standard'
-                })
         elif doc_type == 'image':
             image_count += 1
@@ -147,7 +121,13 @@ def process_documents_with_chunking(documents):
                     'type': 'text'
                 })
-    log_message(f"Таблицы: всего {table_count}, добавлено {table_added_count}, Изображения: {image_count}, Текстовые чанки: {text_chunks_count}, Итого: {len(all_chunked_docs)}")
     return all_chunked_docs, chunk_info

     all_chunked_docs = []
     chunk_info = []
     table_count = 0
     image_count = 0
     text_chunks_count = 0
         doc_type = doc.metadata.get('type', 'text')
         if doc_type == 'table':
+            # Add tables as-is, no chunking
             table_count += 1
+            all_chunked_docs.append(doc)
+            chunk_info.append({
+                'document_id': doc.metadata.get('document_id', 'unknown'),
+                'section_id': doc.metadata.get('section_id', 'unknown'),
+                'chunk_id': 0,
+                'chunk_size': len(doc.text),
+                'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
+                'type': 'table',
+                'table_number': doc.metadata.get('table_number', 'unknown')
+            })
         elif doc_type == 'image':
             image_count += 1
                     'type': 'text'
                 })
+    log_message(f"\n{'='*60}")
+    log_message(f"ИТОГО ОБРАБОТАНО ДОКУМЕНТОВ:")
+    log_message(f"  • Таблицы: {table_count} (добавлены целиком)")
+    log_message(f"  • Изображения: {image_count}")
+    log_message(f"  • Текстовые чанки: {text_chunks_count}")
+    log_message(f"  • Всего документов: {len(all_chunked_docs)}")
+    log_message(f"{'='*60}\n")
     return all_chunked_docs, chunk_info

table_prep.py CHANGED Viewed

@@ -4,156 +4,33 @@ from huggingface_hub import hf_hub_download, list_repo_files
 from llama_index.core import Document
 from my_logging import log_message
-CUSTOM_TABLE_CONFIGS = {
-    "ГОСТ Р 50.05.01-2018": {
-        "№3": {"method": "group_by_column", "group_column": "Класс герметичности и чувствительности"},
-        "№Б.1": {"method": "group_by_column", "group_column": "Класс чувствительности системы контроля"}
-    },
-    "ГОСТ Р 50.06.01-2017": {"№ Б.2": {"method": "split_by_rows"}},
-    "НП-104-18": {"*": {"method": "group_entire_table"}},
-    "НП-068-05": {
-        "Таблица 1": {"method": "group_by_column", "group_column": "Рабочее давление среды, МПа"},
-        "Таблица 2": {"method": "group_by_column", "group_column": "Рабочее давление среды, МПа"},
-        "Таблица Приложения 1": {"method": "group_by_column", "group_column": "Тип"}
-    },
-    "ГОСТ Р 59023.1-2020": {
-        "№ 1": {"method": "split_by_rows"},
-        "№ 2": {"method": "split_by_rows"},
-        "№ 3": {"method": "split_by_rows"}
-    },
-    "НП-089-15": {"-": {"method": "split_by_rows"}},
-    "НП-105-18": {"№ 4.8": {"method": "group_entire_table"}},
-    "ГОСТ Р 50.05.23-2020": {"№8": {"method": "group_entire_table"}},
-    "ГОСТ Р 50.03.01-2017": {"А.8": {"method": "group_entire_table"}}
-}
-def create_meta_info(document_name, section, table_number, table_title, extra_info=""):
-    base_info = f'Документ "{document_name}", Раздел: {section}, Таблица: {table_number}'
-    if table_title and table_title.strip():
-        base_info += f', Название: {table_title}'
-    if extra_info:
-        base_info += f', {extra_info}'
-    return base_info
-def create_chunk_text(meta_info, headers, rows, add_row_numbers=False):
-    chunk_lines = [meta_info.rstrip()]
-    chunk_lines.append("Заголовки: " + " | ".join(headers))
-    for i, row in enumerate(rows, start=1):
-        row_parts = [f"{h}: {row.get(h, '')}" for h in headers if row.get(h, '')]
-        if add_row_numbers:
-            chunk_lines.append(f"Строка {i}: {' | '.join(row_parts)}")
-        else:
-            chunk_lines.append(' | '.join(row_parts))
-    return "\n".join(chunk_lines)
-def get_custom_config(document_id, table_number):
-    for doc_pattern, tables_config in CUSTOM_TABLE_CONFIGS.items():
-        if document_id.startswith(doc_pattern):
-            return tables_config.get(table_number, tables_config.get("*"))
-    return None
-def group_by_column_method(table_data, document_name, group_column):
-    documents = []
-    headers = table_data.get("headers", [])
-    rows = table_data.get("data", [])
-    section = table_data.get("section", "")
-    table_number = table_data.get("table_number", "")
-    table_title = table_data.get("table_title", "")
-    grouped = defaultdict(list)
-    for row in rows:
-        grouped[row.get(group_column, "UNKNOWN")].append(row)
-    for group_value, group_rows in grouped.items():
-        meta_info = create_meta_info(document_name, section, table_number, table_title,
-                                   f'Группа по "{group_column}": {group_value}')
-        chunk_text = create_chunk_text(meta_info, headers, group_rows, add_row_numbers=True)
-        documents.append(Document(
-            text=chunk_text,
-            metadata={
-                "type": "table",
-                "table_number": table_number,
-                "table_title": table_title,
-                "document_id": document_name,
-                "section": section,
-                "section_id": section,
-                "group_column": group_column,
-                "group_value": group_value,
-                "total_rows": len(group_rows),
-                "processing_method": "group_by_column"
-            }
-        ))
-    return documents
-def split_by_rows_method(table_data, document_name):
-    documents = []
-    headers = table_data.get("headers", [])
-    rows = table_data.get("data", [])
-    section = table_data.get("section", "")
-    table_number = table_data.get("table_number", "")
-    table_title = table_data.get("table_title", "")
-    for i, row in enumerate(rows, start=1):
-        meta_info = create_meta_info(document_name, section, table_number, table_title, f'Строка: {i}')
-        chunk_text = create_chunk_text(meta_info, headers, [row])
-        documents.append(Document(
-            text=chunk_text,
-            metadata={
-                "type": "table",
-                "table_number": table_number,
-                "table_title": table_title,
-                "document_id": document_name,
-                "section": section,
-                "section_id": section,
-                "row_number": i,
-                "total_rows": len(rows),
-                "processing_method": "split_by_rows"
-            }
-        ))
-    return documents
-def group_entire_table_method(table_data, document_name):
-    headers = table_data.get("headers", [])
-    rows = table_data.get("data", [])
-    section = table_data.get("section", "")
-    table_number = table_data.get("table_number", "")
-    table_title = table_data.get("table_title", "")
-    meta_info = create_meta_info(document_name, section, table_number, table_title)
-    chunk_text = create_chunk_text(meta_info, headers, rows)
-    return [Document(
-        text=chunk_text,
-        metadata={
-            "type": "table",
-            "table_number": table_number,
-            "table_title": table_title,
-            "document_id": document_name,
-            "section": section,
-            "section_id": section,
-            "total_rows": len(rows),
-            "processing_method": "group_entire_table"
-        }
-    )]
-def process_table(table_data, document_name, method_config):
-    method = method_config.get("method")
-    if method == "group_by_column":
-        return group_by_column_method(table_data, document_name, method_config.get("group_column"))
-    elif method == "split_by_rows":
-        return split_by_rows_method(table_data, document_name)
-    elif method == "group_entire_table":
-        return group_entire_table_method(table_data, document_name)
-    return None
 def table_to_document(table_data, document_id=None):
     if not isinstance(table_data, dict):
         return []
@@ -162,39 +39,16 @@ def table_to_document(table_data, document_id=None):
     table_title = table_data.get('table_title', 'Неизвестно')
     section = table_data.get('section', 'Неизвестно')
-    method_config = get_custom_config(doc_id, table_num)
-    if method_config:
-        log_message(f"✓ Таблица {table_num} '{table_title}' в документе {doc_id}: метод {method_config['method']}")
-        custom_docs = process_table(table_data, doc_id, method_config)
-        if custom_docs:
-            return custom_docs
-    header_content = f"Таблица: {table_num}\nНазвание: {table_title}\nДокумент: {doc_id}\nРаздел: {section}\n"
-    if 'data' in table_data and isinstance(table_data['data'], list):
-        table_content = header_content + "\nДанные таблицы:\n"
-        for row_idx, row in enumerate(table_data['data']):
-            if isinstance(row, dict):
-                row_text = " | ".join([f"{k}: {v}" for k, v in row.items()])
-                table_content += f"Строка {row_idx + 1}: {row_text}\n"
-        return [Document(
-            text=table_content,
-            metadata={
-                "type": "table",
-                "table_number": table_num,
-                "table_title": table_title,
-                "document_id": doc_id,
-                "section": section,
-                "section_id": section,
-                "total_rows": len(table_data['data']),
-                "processing_method": "default"
-            }
-        )]
     return [Document(
-        text=header_content,
         metadata={
             "type": "table",
             "table_number": table_num,
@@ -202,12 +56,15 @@ def table_to_document(table_data, document_id=None):
             "document_id": doc_id,
             "section": section,
             "section_id": section,
-            "processing_method": "default"
         }
     )]
 def load_table_data(repo_id, hf_token, table_data_dir):
-    log_message("Загрузка табличных данных")
     try:
         files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
@@ -216,6 +73,12 @@ def load_table_data(repo_id, hf_token, table_data_dir):
         log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
         table_documents = []
         for file_path in table_files:
             try:
                 local_path = hf_hub_download(
@@ -226,6 +89,8 @@ def load_table_data(repo_id, hf_token, table_data_dir):
                     token=hf_token
                 )
                 with open(local_path, 'r', encoding='utf-8') as f:
                     table_data = json.load(f)
@@ -237,21 +102,58 @@ def load_table_data(repo_id, hf_token, table_data_dir):
                                 sheet['document'] = document_id
                                 docs_list = table_to_document(sheet, document_id)
                                 table_documents.extend(docs_list)
                         else:
                             docs_list = table_to_document(table_data, document_id)
                             table_documents.extend(docs_list)
                     elif isinstance(table_data, list):
                         for table_json in table_data:
                             docs_list = table_to_document(table_json)
                             table_documents.extend(docs_list)
             except Exception as e:
-                log_message(f"Ошибка файла {file_path}: {str(e)}")
                 continue
-        log_message(f"✓✓✓ ИТОГО создано {len(table_documents)} табличных документов (до chunking)")
         return table_documents
     except Exception as e:
-        log_message(f"Ошибка загрузки табличных данных: {str(e)}")
         return []

 from llama_index.core import Document
 from my_logging import log_message
+def create_table_content(table_data):
+    """Create formatted content from table data"""
+    doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
+    table_num = table_data.get('table_number', 'Неизвестно')
+    table_title = table_data.get('table_title', 'Неизвестно')
+    section = table_data.get('section', 'Неизвестно')
+    content = f"Таблица: {table_num}\n"
+    content += f"Название: {table_title}\n"
+    content += f"Документ: {doc_id}\n"
+    content += f"Раздел: {section}\n"
+    headers = table_data.get('headers', [])
+    if headers:
+        content += f"\nЗаголовки: {' | '.join(headers)}\n"
+    if 'data' in table_data and isinstance(table_data['data'], list):
+        content += "\nДанные таблицы:\n"
+        for row_idx, row in enumerate(table_data['data'], start=1):
+            if isinstance(row, dict):
+                row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
+                content += f"Строка {row_idx}: {row_text}\n"
+    return content
 def table_to_document(table_data, document_id=None):
+    """Convert table data to a single Document"""
     if not isinstance(table_data, dict):
         return []
     table_title = table_data.get('table_title', 'Неизвестно')
     section = table_data.get('section', 'Неизвестно')
+    content = create_table_content(table_data)
+    content_size = len(content)
+    # Log table addition
+    row_count = len(table_data.get('data', [])) if 'data' in table_data else 0
+    log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
+                f"Размер: {content_size} символов | Строк: {row_count}")
     return [Document(
+        text=content,
         metadata={
             "type": "table",
             "table_number": table_num,
             "document_id": doc_id,
             "section": section,
             "section_id": section,
+            "total_rows": row_count,
+            "content_size": content_size
         }
     )]
 def load_table_data(repo_id, hf_token, table_data_dir):
+    log_message("=" * 60)
+    log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
+    log_message("=" * 60)
     try:
         files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
         log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
         table_documents = []
+        stats = {
+            'total_tables': 0,
+            'total_size': 0,
+            'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
+        }
         for file_path in table_files:
             try:
                 local_path = hf_hub_download(
                     token=hf_token
                 )
+                log_message(f"\nОбработка файла: {file_path}")
                 with open(local_path, 'r', encoding='utf-8') as f:
                     table_data = json.load(f)
                                 sheet['document'] = document_id
                                 docs_list = table_to_document(sheet, document_id)
                                 table_documents.extend(docs_list)
+                                for doc in docs_list:
+                                    stats['total_tables'] += 1
+                                    size = doc.metadata.get('content_size', 0)
+                                    stats['total_size'] += size
+                                    stats['by_document'][document_id]['count'] += 1
+                                    stats['by_document'][document_id]['size'] += size
                         else:
                             docs_list = table_to_document(table_data, document_id)
                             table_documents.extend(docs_list)
+                            for doc in docs_list:
+                                stats['total_tables'] += 1
+                                size = doc.metadata.get('content_size', 0)
+                                stats['total_size'] += size
+                                stats['by_document'][document_id]['count'] += 1
+                                stats['by_document'][document_id]['size'] += size
                     elif isinstance(table_data, list):
                         for table_json in table_data:
                             docs_list = table_to_document(table_json)
                             table_documents.extend(docs_list)
+                            for doc in docs_list:
+                                doc_id = doc.metadata.get('document_id', 'unknown')
+                                stats['total_tables'] += 1
+                                size = doc.metadata.get('content_size', 0)
+                                stats['total_size'] += size
+                                stats['by_document'][doc_id]['count'] += 1
+                                stats['by_document'][doc_id]['size'] += size
             except Exception as e:
+                log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
                 continue
+        # Log summary statistics
+        log_message("\n" + "=" * 60)
+        log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
+        log_message("=" * 60)
+        log_message(f"Всего таблиц добавлено: {stats['total_tables']}")
+        log_message(f"Общий размер: {stats['total_size']:,} символов")
+        log_message(f"Средний размер таблицы: {stats['total_size'] // stats['total_tables'] if stats['total_tables'] > 0 else 0:,} символов")
+        log_message("\nПо документам:")
+        for doc_id, doc_stats in sorted(stats['by_document'].items()):
+            log_message(f"  • {doc_id}: {doc_stats['count']} таблиц, "
+                       f"{doc_stats['size']:,} символов")
+        log_message("=" * 60)
         return table_documents
     except Exception as e:
+        log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
         return []