Commit 5884230
Parent(s): bf0077f

table processing + new version of np104

Files changed:
- .gitattributes +1 -0
- documents_prep.py +1 -102
- new_xlsx.py/new_xlsx.py +82 -0
- table_prep.py +325 -0
- tempCodeRunnerFile.py +2 -0
- Табличные данные/НП-104-18_ГОСТ 59023.xlsx +3 -0
- Табличные данные_JSON/НП-104-18_ГОСТ 59023.json +2 -2

.gitattributes CHANGED

@@ -43,3 +43,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.pdf filter=lfs diff=lfs merge=lfs -text
 =======
 >>>>>>> b38db646fba42cf62de437de07713765675b4628
+*.xlsx filter=lfs diff=lfs merge=lfs -text

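Note: the "=======" and ">>>>>>> b38db646..." context lines above appear to be unresolved merge-conflict markers that were already committed to .gitattributes in an earlier merge; this commit leaves them in place and only appends the *.xlsx LFS rule.
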
documents_prep.py CHANGED

@@ -6,6 +6,7 @@ from llama_index.core import Document
 from my_logging import log_message
 from llama_index.core.text_splitter import SentenceSplitter
 from config import CHUNK_SIZE, CHUNK_OVERLAP
+from table_prep import table_to_document, load_table_data
 
 
 def chunk_document(doc, chunk_size=None, chunk_overlap=None):
@@ -378,108 +379,6 @@ def extract_zip_and_process_json(zip_path):
 
     return documents
 
-def table_to_document(table_data, document_id=None):
-    documents = []
-
-    if isinstance(table_data, dict):
-        doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))
-        table_num = table_data.get('table_number', 'Неизвестно')
-        table_title = table_data.get('table_title', 'Неизвестно')
-        section = table_data.get('section', 'Неизвестно')
-
-        header_content = f"Таблица: {table_num}\nНазвание: {table_title}\nДокумент: {doc_id}\nРаздел: {section}\n"
-
-        if 'data' in table_data and isinstance(table_data['data'], list):
-            table_content = header_content + "\nДанные таблицы:\n"
-            for row_idx, row in enumerate(table_data['data']):
-                if isinstance(row, dict):
-                    row_text = " | ".join([f"{k}: {v}" for k, v in row.items()])
-                    table_content += f"Строка {row_idx + 1}: {row_text}\n"
-
-            doc = Document(
-                text=table_content,
-                metadata={
-                    "type": "table",
-                    "table_number": table_num,
-                    "table_title": table_title,
-                    "document_id": doc_id,
-                    "section": section,
-                    "section_id": section,
-                    "total_rows": len(table_data['data'])
-                }
-            )
-            documents.append(doc)
-        else:
-            doc = Document(
-                text=header_content,
-                metadata={
-                    "type": "table",
-                    "table_number": table_num,
-                    "table_title": table_title,
-                    "document_id": doc_id,
-                    "section": section,
-                    "section_id": section
-                }
-            )
-            documents.append(doc)
-
-    return documents
-
-def load_table_data(repo_id, hf_token, table_data_dir):
-    log_message("Начинаю загрузку табличных данных")
-
-    table_files = []
-    try:
-        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
-        for file in files:
-            if file.startswith(table_data_dir) and file.endswith('.json'):
-                table_files.append(file)
-
-        log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
-
-        table_documents = []
-        for file_path in table_files:
-            try:
-                log_message(f"Обрабатываю файл: {file_path}")
-                local_path = hf_hub_download(
-                    repo_id=repo_id,
-                    filename=file_path,
-                    local_dir='',
-                    repo_type="dataset",
-                    token=hf_token
-                )
-
-                with open(local_path, 'r', encoding='utf-8') as f:
-                    table_data = json.load(f)
-
-                if isinstance(table_data, dict):
-                    document_id = table_data.get('document', 'unknown')
-
-                    if 'sheets' in table_data:
-                        for sheet in table_data['sheets']:
-                            sheet['document'] = document_id
-                            # table_to_document now returns a list
-                            docs_list = table_to_document(sheet, document_id)
-                            table_documents.extend(docs_list)  # extend instead of append
-                    else:
-                        docs_list = table_to_document(table_data, document_id)
-                        table_documents.extend(docs_list)  # extend instead of append
-                elif isinstance(table_data, list):
-                    for table_json in table_data:
-                        docs_list = table_to_document(table_json)
-                        table_documents.extend(docs_list)  # extend instead of append
-
-            except Exception as e:
-                log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
-                continue
-
-        log_message(f"Создано {len(table_documents)} документов из таблиц")
-        return table_documents
-
-    except Exception as e:
-        log_message(f"Ошибка загрузки табличных данных: {str(e)}")
-        return []
-
 def load_image_data(repo_id, hf_token, image_data_dir):
     log_message("Начинаю загрузку данных изображений")
 

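After this refactor the table helpers live in table_prep.py, and documents_prep.py re-exports them through the import added above, so existing call sites keep working. A minimal sketch of such a call site (repo_id and hf_token below are placeholders, not values from this repo):

from documents_prep import load_table_data

# Placeholder arguments; the real values come from the Space's configuration.
docs = load_table_data(
    repo_id="user/dataset",              # hypothetical HF dataset repo
    hf_token="hf_...",                   # hypothetical access token
    table_data_dir="Табличные данные_JSON",
)
print(f"Loaded {len(docs)} table documents")
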
new_xlsx.py/new_xlsx.py ADDED

@@ -0,0 +1,82 @@
+import pandas as pd
+import json
+import os
+
+def excel_to_json():
+    input_dir = "Табличные данные"
+    output_dir = "Табличные данные_JSON_2"
+
+    os.makedirs(output_dir, exist_ok=True)
+
+    excel_files = [f for f in os.listdir(input_dir) if f.endswith(('.xlsx', '.xls'))]
+    print(f"Found {len(excel_files)} Excel files")
+
+    successful, failed = 0, 0
+
+    for file in excel_files:
+        try:
+            file_path = os.path.join(input_dir, file)
+            all_sheets = pd.read_excel(file_path, sheet_name=None)
+
+            result = {
+                "document": file,
+                "total_sheets": len(all_sheets),
+                "sheets": []
+            }
+
+            for sheet_name, df in all_sheets.items():
+                if df.empty:
+                    continue
+
+                df = df.dropna(how='all').fillna("")
+
+                # check that the required column is present
+                if "Номер таблицы" not in df.columns:
+                    continue
+
+                # group rows by table number
+                grouped = df.groupby("Номер таблицы")
+
+                for table_number, group in grouped:
+                    group = group.reset_index(drop=True)
+
+                    sheet_data = {
+                        "sheet_name": sheet_name,
+                        "document_id": str(group.iloc[0].get("Обозначение документа", "")),
+                        "section": str(group.iloc[0].get("Раздел документа", "")),
+                        "table_number": str(table_number),
+                        "table_title": str(group.iloc[0].get("Название таблицы", "")),
+                        "table_description": str(group.iloc[0].get("Примечание", "")),
+                        "headers": [col for col in df.columns if col not in ["Обозначение документа", "Раздел документа", "Номер таблицы", "Название таблицы", "Примечание"]],
+                        "data": []
+                    }
+
+                    # append the data rows
+                    for _, row in group.iterrows():
+                        row_dict = {}
+                        for col in sheet_data["headers"]:
+                            row_dict[col] = str(row[col]) if pd.notna(row[col]) else ""
+                        sheet_data["data"].append(row_dict)
+
+                    result["sheets"].append(sheet_data)
+
+            json_filename = file.replace('.xlsx', '.json').replace('.xls', '.json')
+            json_path = os.path.join(output_dir, json_filename)
+
+            with open(json_path, 'w', encoding='utf-8') as f:
+                json.dump(result, f, ensure_ascii=False, indent=2)
+
+            print(f"✓ Converted: {file} -> {json_filename}")
+            successful += 1
+
+        except Exception as e:
+            print(f"✗ Failed: {file} - {str(e)}")
+            failed += 1
+
+    print(f"\nResults:")
+    print(f"Successfully converted: {successful} files")
+    print(f"Failed: {failed} files")
+    print(f"JSON files saved to: {output_dir}")
+
+if __name__ == "__main__":
+    excel_to_json()

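For reference, a sheet containing one table comes out of excel_to_json() shaped roughly like this; the schema keys come from the code above, while all field values here are illustrative:

example = {
    "document": "НП-104-18_ГОСТ 59023.xlsx",
    "total_sheets": 1,
    "sheets": [
        {
            "sheet_name": "Лист1",                 # illustrative sheet name
            "document_id": "НП-104-18",            # from column "Обозначение документа"
            "section": "5",                        # from column "Раздел документа"
            "table_number": "Таблица 1",           # groupby key "Номер таблицы"
            "table_title": "...",                  # from column "Название таблицы"
            "table_description": "",               # from column "Примечание"
            "headers": ["Параметр", "Значение"],   # every remaining column
            "data": [{"Параметр": "...", "Значение": "..."}],
        }
    ],
}
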
table_prep.py ADDED

@@ -0,0 +1,325 @@
+import os
+from collections import defaultdict
+import json
+import zipfile
+import pandas as pd
+from huggingface_hub import hf_hub_download, list_repo_files
+from llama_index.core import Document
+from my_logging import log_message
+
+
+# Add this configuration at the top of your documents_prep file
+CUSTOM_TABLE_CONFIGS = {
+    "ГОСТ Р 50.05.01-2018": {
+        "tables": {
+            "№3": {"method": "group_by_column", "group_column": "Класс герметичности и чувствительности"},
+            "№Б.1": {"method": "group_by_column", "group_column": "Класс чувствительности системы контроля"}
+        }
+    },
+    "ГОСТ Р 50.06.01-2017": {
+        "tables": {
+            "№ Б.2": {"method": "split_by_rows"}
+        }
+    },
+    "ГОСТ Р 59023.2-2020": {
+        "tables": {
+            "*": {"method": "group_entire_table"}  # All tables
+        }
+    },
+    "НП-068-05": {
+        "tables": {
+            "Таблица 1": {"method": "group_by_column", "group_column": "Рабочее давление среды, МПа"},
+            "Таблица 2": {"method": "group_by_column", "group_column": "Рабочее давление среды, МПа"},
+            "Таблица Приложения 1": {"method": "group_by_column", "group_column": "Тип"}
+        }
+    },
+    "ГОСТ Р 59023.1-2020": {
+        "tables": {
+            "№ 1": {"method": "split_by_rows"},
+            "№ 2": {"method": "split_by_rows"},
+            "№ 3": {"method": "split_by_rows"}
+        }
+    }
+}
+
+def create_meta_info(document_name, section, table_number, table_title, extra_info=""):
+    """Create standard meta information string"""
+    base_info = f'Документ "{document_name}", Раздел: {section}, Номер таблицы: {table_number}, Название таблицы: {table_title}'
+    if extra_info:
+        base_info += f', {extra_info}'
+    return base_info + '\n'
+
+def create_chunk_text(meta_info, headers, rows, add_row_numbers=False):
+    """Create chunk text with headers and rows"""
+    header_line = " | ".join(headers)
+    chunk_lines = [meta_info + "Заголовки: " + header_line]
+
+    for i, row in enumerate(rows, start=1):
+        row_text = " | ".join([f"{h}: {row.get(h, '')}" for h in headers])
+        if add_row_numbers:
+            chunk_lines.append(f"Строка {i}: {row_text}")
+        else:
+            chunk_lines.append(row_text)
+
+    return "\n".join(chunk_lines)
+
+def group_by_column_method(table_data, document_name, group_column):
+    """Group rows by specified column value"""
+    documents = []
+    headers = table_data.get("headers", [])
+    rows = table_data.get("data", [])
+    section = table_data.get("section", "")
+    table_number = table_data.get("table_number", "")
+    table_title = table_data.get("table_title", "")
+
+    grouped = defaultdict(list)
+    for row in rows:
+        key = row.get(group_column, "UNKNOWN")
+        grouped[key].append(row)
+
+    for group_value, group_rows in grouped.items():
+        meta_info = create_meta_info(document_name, section, table_number, table_title,
+                                     f'Группа по "{group_column}": {group_value}')
+
+        chunk_text = create_chunk_text(meta_info, headers, group_rows, add_row_numbers=True)
+
+        doc = Document(
+            text=chunk_text,
+            metadata={
+                "type": "table",
+                "table_number": table_number,
+                "table_title": table_title,
+                "document_id": document_name,
+                "section": section,
+                "section_id": section,
+                "group_column": group_column,
+                "group_value": group_value,
+                "total_rows": len(group_rows),
+                "processing_method": "group_by_column"
+            }
+        )
+        documents.append(doc)
+        log_message(f"Created grouped chunk for {group_column}={group_value}, rows: {len(group_rows)}, length: {len(chunk_text)}")
+
+    return documents
+
+def split_by_rows_method(table_data, document_name):
+    """Split table into individual row chunks"""
+    documents = []
+    headers = table_data.get("headers", [])
+    rows = table_data.get("data", [])
+    section = table_data.get("section", "")
+    table_number = table_data.get("table_number", "")
+    table_title = table_data.get("table_title", "")
+
+    for i, row in enumerate(rows, start=1):
+        meta_info = create_meta_info(document_name, section, table_number, table_title, f'Строка: {i}')
+
+        chunk_text = create_chunk_text(meta_info, headers, [row])
+
+        doc = Document(
+            text=chunk_text,
+            metadata={
+                "type": "table",
+                "table_number": table_number,
+                "table_title": table_title,
+                "document_id": document_name,
+                "section": section,
+                "section_id": section,
+                "row_number": i,
+                "total_rows": len(rows),
+                "processing_method": "split_by_rows"
+            }
+        )
+        documents.append(doc)
+
+    log_message(f"Split table {table_number} into {len(rows)} row chunks")
+    return documents
+
+def group_entire_table_method(table_data, document_name):
+    """Group entire table as one chunk"""
+    headers = table_data.get("headers", [])
+    rows = table_data.get("data", [])
+    section = table_data.get("section", "")
+    table_number = table_data.get("table_number", "")
+    table_title = table_data.get("table_title", "")
+
+    meta_info = create_meta_info(document_name, section, table_number, table_title)
+    chunk_text = create_chunk_text(meta_info, headers, rows)
+
+    doc = Document(
+        text=chunk_text,
+        metadata={
+            "type": "table",
+            "table_number": table_number,
+            "table_title": table_title,
+            "document_id": document_name,
+            "section": section,
+            "section_id": section,
+            "total_rows": len(rows),
+            "processing_method": "group_entire_table"
+        }
+    )
+
+    log_message(f"Grouped entire table {table_number}, rows: {len(rows)}, length: {len(chunk_text)}")
+    return [doc]
+
+def should_use_custom_processing(document_id, table_number):
+    """Check if table should use custom processing"""
+    for doc_pattern, config in CUSTOM_TABLE_CONFIGS.items():
+        if document_id.startswith(doc_pattern):
+            tables_config = config.get("tables", {})
+            # Check for exact match or wildcard
+            if table_number in tables_config or "*" in tables_config:
+                return True, doc_pattern, tables_config.get(table_number, tables_config.get("*"))
+    return False, None, None
+
+def process_table_with_custom_method(table_data, document_name, method_config):
+    """Process table using custom method"""
+    method = method_config.get("method")
+
+    if method == "group_by_column":
+        group_column = method_config.get("group_column")
+        return group_by_column_method(table_data, document_name, group_column)
+    elif method == "split_by_rows":
+        return split_by_rows_method(table_data, document_name)
+    elif method == "group_entire_table":
+        return group_entire_table_method(table_data, document_name)
+    else:
+        log_message(f"Unknown custom method: {method}, falling back to default processing")
+        return None
+
+def table_to_document(table_data, document_id=None):
+    if isinstance(table_data, dict):
+        doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))
+        table_num = table_data.get('table_number', 'Неизвестно')
+
+        # Check if this table should use custom processing
+        use_custom, doc_pattern, method_config = should_use_custom_processing(doc_id, table_num)
+
+        if use_custom:
+            log_message(f"Using custom processing for table {table_num} in document {doc_id}")
+            custom_docs = process_table_with_custom_method(table_data, doc_id, method_config)
+            if custom_docs:
+                # Return custom processed documents and skip default processing
+                return custom_docs
+
+        # Default processing for tables not in custom config
+        table_title = table_data.get('table_title', 'Неизвестно')
+        section = table_data.get('section', 'Неизвестно')
+
+        header_content = f"Таблица: {table_num}\nНазвание: {table_title}\nДокумент: {doc_id}\nРаздел: {section}\n"
+
+        if 'data' in table_data and isinstance(table_data['data'], list):
+            table_content = header_content + "\nДанные таблицы:\n"
+            for row_idx, row in enumerate(table_data['data']):
+                if isinstance(row, dict):
+                    row_text = " | ".join([f"{k}: {v}" for k, v in row.items()])
+                    table_content += f"Строка {row_idx + 1}: {row_text}\n"
+
+            doc = Document(
+                text=table_content,
+                metadata={
+                    "type": "table",
+                    "table_number": table_num,
+                    "table_title": table_title,
+                    "document_id": doc_id,
+                    "section": section,
+                    "section_id": section,
+                    "total_rows": len(table_data['data']),
+                    "processing_method": "default"
+                }
+            )
+            return [doc]
+        else:
+            doc = Document(
+                text=header_content,
+                metadata={
+                    "type": "table",
+                    "table_number": table_num,
+                    "table_title": table_title,
+                    "document_id": doc_id,
+                    "section": section,
+                    "section_id": section,
+                    "processing_method": "default"
+                }
+            )
+            return [doc]
+
+    return []
+
+def load_table_data(repo_id, hf_token, table_data_dir):
+    """Modified function with custom table processing integration"""
+    log_message("Начинаю загрузку табличных данных")
+
+    table_files = []
+    try:
+        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
+        for file in files:
+            if file.startswith(table_data_dir) and file.endswith('.json'):
+                table_files.append(file)
+
+        log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
+
+        table_documents = []
+        for file_path in table_files:
+            try:
+                log_message(f"Обрабатываю файл: {file_path}")
+                local_path = hf_hub_download(
+                    repo_id=repo_id,
+                    filename=file_path,
+                    local_dir='',
+                    repo_type="dataset",
+                    token=hf_token
+                )
+
+                with open(local_path, 'r', encoding='utf-8') as f:
+                    table_data = json.load(f)
+
+                if isinstance(table_data, dict):
+                    document_id = table_data.get('document', 'unknown')
+
+                    if 'sheets' in table_data:
+                        for sheet in table_data['sheets']:
+                            sheet['document'] = document_id
+                            # Check if this table uses custom processing
+                            table_num = sheet.get('table_number', 'Неизвестно')
+                            use_custom, _, _ = should_use_custom_processing(document_id, table_num)
+
+                            if use_custom:
+                                log_message(f"Skipping default processing for custom table {table_num} in {document_id}")
+
+                            docs_list = table_to_document(sheet, document_id)
+                            table_documents.extend(docs_list)
+                    else:
+                        # Check if this table uses custom processing
+                        table_num = table_data.get('table_number', 'Неизвестно')
+                        use_custom, _, _ = should_use_custom_processing(document_id, table_num)
+
+                        if use_custom:
+                            log_message(f"Skipping default processing for custom table {table_num} in {document_id}")
+
+                        docs_list = table_to_document(table_data, document_id)
+                        table_documents.extend(docs_list)
+                elif isinstance(table_data, list):
+                    for table_json in table_data:
+                        document_id = table_json.get('document', 'unknown')
+                        table_num = table_json.get('table_number', 'Неизвестно')
+                        use_custom, _, _ = should_use_custom_processing(document_id, table_num)
+
+                        if use_custom:
+                            log_message(f"Skipping default processing for custom table {table_num} in {document_id}")
+
+                        docs_list = table_to_document(table_json)
+                        table_documents.extend(docs_list)
+
+            except Exception as e:
+                log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
+                continue
+
+        log_message(f"Создано {len(table_documents)} документов из таблиц")
+        return table_documents
+
+    except Exception as e:
+        log_message(f"Ошибка загрузки табличных данных: {str(e)}")
+        return []

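A small sketch of how the custom-config dispatch in table_prep.py behaves end to end, assuming my_logging is importable in the running environment; the table payload below is hypothetical, but "№ Б.2" matches the split_by_rows entry for ГОСТ Р 50.06.01-2017 in CUSTOM_TABLE_CONFIGS:

from table_prep import table_to_document

# Hypothetical sheet dict in the shape produced by new_xlsx.py.
sheet = {
    "document_id": "ГОСТ Р 50.06.01-2017",
    "table_number": "№ Б.2",
    "table_title": "Пример",
    "section": "Приложение Б",
    "headers": ["Материал", "Толщина, мм"],
    "data": [
        {"Материал": "Сталь", "Толщина, мм": "10"},
        {"Материал": "Титан", "Толщина, мм": "12"},
    ],
}

# should_use_custom_processing() matches "№ Б.2" -> {"method": "split_by_rows"},
# so split_by_rows_method() emits one Document per row.
docs = table_to_document(sheet)
assert len(docs) == 2
assert docs[0].metadata["processing_method"] == "split_by_rows"

Tables not listed in CUSTOM_TABLE_CONFIGS fall through to the default single-chunk path with processing_method="default".
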
tempCodeRunnerFile.py ADDED

@@ -0,0 +1,2 @@
+print(f"\nSuccessfully processed {len(results)} tables in {json_file}.")
+else:

Табличные данные/НП-104-18_ГОСТ 59023.xlsx ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ea4dc2f6b1cad2637b7147e050418dc6b9e2d81bcaeb091c4e6f490f6c9ceca
+size 292360

Табличные данные_JSON/НП-104-18_ГОСТ 59023.json CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:4b64b00d8e90a82ba5f6a0ff8589b9e9b29b568d28e7d10a743d4a5534d3c655
+size 3316944
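
The previous pointer in Табличные данные_JSON/НП-104-18_ГОСТ 59023.json had empty oid and size fields (a broken Git LFS pointer); this commit replaces it with a valid pointer to the regenerated ~3.3 MB JSON.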