Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Sep 19, 2025

Commit

9ad6501

1 Parent(s): 433ff61

max size 25000 + improved table prep

Browse files

Files changed (4) hide show

config.py +1 -1
documents_prep.py +31 -6
table_prep.py +6 -4
Табличные данные/НП-104-18_ГОСТ 59023.xlsx +0 -3

config.py CHANGED Viewed

@@ -50,7 +50,7 @@ AVAILABLE_MODELS = {
 DEFAULT_MODEL = "Gemini 2.5 Flash"
-CHUNK_SIZE = 8192
 CHUNK_OVERLAP = 256
 CUSTOM_PROMPT = """

 DEFAULT_MODEL = "Gemini 2.5 Flash"
+CHUNK_SIZE = 25000
 CHUNK_OVERLAP = 256
 CUSTOM_PROMPT = """

documents_prep.py CHANGED Viewed

@@ -40,7 +40,6 @@ def chunk_document(doc, chunk_size=None, chunk_overlap=None):
     return chunked_docs
 def process_documents_with_chunking(documents):
     all_chunked_docs = []
     chunk_info = []
@@ -49,16 +48,40 @@ def process_documents_with_chunking(documents):
     text_chunks_count = 0
     large_tables_count = 0
     large_images_count = 0
     for doc in documents:
         doc_type = doc.metadata.get('type', 'text')
         if doc_type == 'table':
             table_count += 1
             doc_size = len(doc.text)
             if doc_size > CHUNK_SIZE:
                 large_tables_count += 1
-                log_message(f"Large table found: {doc.metadata.get('table_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_size} characters")
                 # Chunk large tables
                 chunked_docs = chunk_document(doc)
@@ -72,7 +95,8 @@ def process_documents_with_chunking(documents):
                         'chunk_size': len(chunk_doc.text),
                         'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
                         'type': 'table',
-                        'table_number': chunk_doc.metadata.get('table_number', 'unknown')
                     })
             else:
                 all_chunked_docs.append(doc)
@@ -83,7 +107,8 @@ def process_documents_with_chunking(documents):
                     'chunk_size': doc_size,
                     'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
                     'type': 'table',
-                    'table_number': doc.metadata.get('table_number', 'unknown')
                 })
         elif doc_type == 'image':
@@ -145,9 +170,10 @@ def process_documents_with_chunking(documents):
                     'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
                     'type': 'text'
                 })
     log_message(f"=== PROCESSING STATISTICS ===")
     log_message(f"Total tables processed: {table_count}")
     log_message(f"Large tables (>{CHUNK_SIZE} chars): {large_tables_count}")
     log_message(f"Total images processed: {image_count}")
     log_message(f"Large images (>{CHUNK_SIZE} chars): {large_images_count}")
@@ -156,7 +182,6 @@ def process_documents_with_chunking(documents):
     return all_chunked_docs, chunk_info
 def extract_text_from_json(data, document_id, document_name):
     documents = []

     return chunked_docs
 def process_documents_with_chunking(documents):
     all_chunked_docs = []
     chunk_info = []
     text_chunks_count = 0
     large_tables_count = 0
     large_images_count = 0
+    custom_processed_count = 0
     for doc in documents:
         doc_type = doc.metadata.get('type', 'text')
         if doc_type == 'table':
             table_count += 1
+            doc_id = doc.metadata.get('document_id', 'unknown')
+            table_num = doc.metadata.get('table_number', 'unknown')
+            from table_prep import should_use_custom_processing
+            use_custom, doc_pattern, method_config = should_use_custom_processing(doc_id, table_num)
+            if use_custom:
+                custom_processed_count += 1
+                log_message(f"Table {table_num} in document {doc_id} was processed with custom method '{method_config.get('method')}', skipping standard chunking")
+                # Add the document as-is since it was already processed by custom method
+                all_chunked_docs.append(doc)
+                chunk_info.append({
+                    'document_id': doc_id,
+                    'section_id': doc.metadata.get('section_id', 'unknown'),
+                    'chunk_id': 0,
+                    'chunk_size': len(doc.text),
+                    'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
+                    'type': 'table',
+                    'table_number': table_num,
+                    'processing_method': method_config.get('method')
+                })
+                continue
+            # Standard processing for non-custom tables
             doc_size = len(doc.text)
             if doc_size > CHUNK_SIZE:
                 large_tables_count += 1
+                log_message(f"Large table found: {table_num} in document {doc_id}, size: {doc_size} characters")
                 # Chunk large tables
                 chunked_docs = chunk_document(doc)
                         'chunk_size': len(chunk_doc.text),
                         'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
                         'type': 'table',
+                        'table_number': chunk_doc.metadata.get('table_number', 'unknown'),
+                        'processing_method': 'standard_chunked'
                     })
             else:
                 all_chunked_docs.append(doc)
                     'chunk_size': doc_size,
                     'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
                     'type': 'table',
+                    'table_number': doc.metadata.get('table_number', 'unknown'),
+                    'processing_method': 'standard'
                 })
         elif doc_type == 'image':
                     'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
                     'type': 'text'
                 })
     log_message(f"=== PROCESSING STATISTICS ===")
     log_message(f"Total tables processed: {table_count}")
+    log_message(f"Custom processed tables: {custom_processed_count}")
     log_message(f"Large tables (>{CHUNK_SIZE} chars): {large_tables_count}")
     log_message(f"Total images processed: {image_count}")
     log_message(f"Large images (>{CHUNK_SIZE} chars): {large_images_count}")
     return all_chunked_docs, chunk_info
 def extract_text_from_json(data, document_id, document_name):
     documents = []

table_prep.py CHANGED Viewed

@@ -7,8 +7,6 @@ from huggingface_hub import hf_hub_download, list_repo_files
 from llama_index.core import Document
 from my_logging import log_message
-# Add this configuration at the top of your documents_prep file
 CUSTOM_TABLE_CONFIGS = {
     "ГОСТ Р 50.05.01-2018": {
         "tables": {
@@ -21,7 +19,7 @@ CUSTOM_TABLE_CONFIGS = {
             "№ Б.2": {"method": "split_by_rows"}
         }
     },
-    "ГОСТ Р 59023.2-2020": {
         "tables": {
             "*": {"method": "group_entire_table"}  # All tables
         }
@@ -39,6 +37,11 @@ CUSTOM_TABLE_CONFIGS = {
             "№ 2": {"method": "split_by_rows"},
             "№ 3": {"method": "split_by_rows"}
         }
     }
 }
@@ -169,7 +172,6 @@ def should_use_custom_processing(document_id, table_number):
     for doc_pattern, config in CUSTOM_TABLE_CONFIGS.items():
         if document_id.startswith(doc_pattern):
             tables_config = config.get("tables", {})
-            # Check for exact match or wildcard
             if table_number in tables_config or "*" in tables_config:
                 return True, doc_pattern, tables_config.get(table_number, tables_config.get("*"))
     return False, None, None

 from llama_index.core import Document
 from my_logging import log_message
 CUSTOM_TABLE_CONFIGS = {
     "ГОСТ Р 50.05.01-2018": {
         "tables": {
             "№ Б.2": {"method": "split_by_rows"}
         }
     },
+    "НП-104-18": {
         "tables": {
             "*": {"method": "group_entire_table"}  # All tables
         }
             "№ 2": {"method": "split_by_rows"},
             "№ 3": {"method": "split_by_rows"}
         }
+    },
+    "НП-089-15": {  # New addition
+        "tables": {
+            "-": {"method": "split_by_rows"}
+        }
     }
 }
     for doc_pattern, config in CUSTOM_TABLE_CONFIGS.items():
         if document_id.startswith(doc_pattern):
             tables_config = config.get("tables", {})
             if table_number in tables_config or "*" in tables_config:
                 return True, doc_pattern, tables_config.get(table_number, tables_config.get("*"))
     return False, None, None

Табличные данные/НП-104-18_ГОСТ 59023.xlsx DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4ea4dc2f6b1cad2637b7147e050418dc6b9e2d81bcaeb091c4e6f490f6c9ceca
-size 292360