Spaces:
Sleeping
Sleeping
Commit
·
9ad6501
1
Parent(s):
433ff61
max size 25000 + improved table prep
Browse files
- config.py +1 -1
- documents_prep.py +31 -6
- table_prep.py +6 -4
- Табличные данные/НП-104-18_ГОСТ 59023.xlsx +0 -3
config.py
CHANGED
|
@@ -50,7 +50,7 @@ AVAILABLE_MODELS = {
|
|
| 50 |
|
| 51 |
DEFAULT_MODEL = "Gemini 2.5 Flash"
|
| 52 |
|
| 53 |
-
CHUNK_SIZE =
|
| 54 |
CHUNK_OVERLAP = 256
|
| 55 |
|
| 56 |
CUSTOM_PROMPT = """
|
|
|
|
| 50 |
|
| 51 |
DEFAULT_MODEL = "Gemini 2.5 Flash"
|
| 52 |
|
| 53 |
+
CHUNK_SIZE = 25000
|
| 54 |
CHUNK_OVERLAP = 256
|
| 55 |
|
| 56 |
CUSTOM_PROMPT = """
|
documents_prep.py
CHANGED
|
@@ -40,7 +40,6 @@ def chunk_document(doc, chunk_size=None, chunk_overlap=None):
|
|
| 40 |
|
| 41 |
return chunked_docs
|
| 42 |
|
| 43 |
-
|
| 44 |
def process_documents_with_chunking(documents):
|
| 45 |
all_chunked_docs = []
|
| 46 |
chunk_info = []
|
|
@@ -49,16 +48,40 @@ def process_documents_with_chunking(documents):
|
|
| 49 |
text_chunks_count = 0
|
| 50 |
large_tables_count = 0
|
| 51 |
large_images_count = 0
|
|
|
|
| 52 |
|
| 53 |
for doc in documents:
|
| 54 |
doc_type = doc.metadata.get('type', 'text')
|
| 55 |
|
| 56 |
if doc_type == 'table':
|
| 57 |
table_count += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
doc_size = len(doc.text)
|
| 59 |
if doc_size > CHUNK_SIZE:
|
| 60 |
large_tables_count += 1
|
| 61 |
-
log_message(f"Large table found: {
|
| 62 |
|
| 63 |
# Chunk large tables
|
| 64 |
chunked_docs = chunk_document(doc)
|
|
@@ -72,7 +95,8 @@ def process_documents_with_chunking(documents):
|
|
| 72 |
'chunk_size': len(chunk_doc.text),
|
| 73 |
'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
|
| 74 |
'type': 'table',
|
| 75 |
-
'table_number': chunk_doc.metadata.get('table_number', 'unknown')
|
|
|
|
| 76 |
})
|
| 77 |
else:
|
| 78 |
all_chunked_docs.append(doc)
|
|
@@ -83,7 +107,8 @@ def process_documents_with_chunking(documents):
|
|
| 83 |
'chunk_size': doc_size,
|
| 84 |
'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
|
| 85 |
'type': 'table',
|
| 86 |
-
'table_number': doc.metadata.get('table_number', 'unknown')
|
|
|
|
| 87 |
})
|
| 88 |
|
| 89 |
elif doc_type == 'image':
|
|
@@ -145,9 +170,10 @@ def process_documents_with_chunking(documents):
|
|
| 145 |
'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
|
| 146 |
'type': 'text'
|
| 147 |
})
|
| 148 |
-
|
| 149 |
log_message(f"=== PROCESSING STATISTICS ===")
|
| 150 |
log_message(f"Total tables processed: {table_count}")
|
|
|
|
| 151 |
log_message(f"Large tables (>{CHUNK_SIZE} chars): {large_tables_count}")
|
| 152 |
log_message(f"Total images processed: {image_count}")
|
| 153 |
log_message(f"Large images (>{CHUNK_SIZE} chars): {large_images_count}")
|
|
@@ -156,7 +182,6 @@ def process_documents_with_chunking(documents):
|
|
| 156 |
|
| 157 |
return all_chunked_docs, chunk_info
|
| 158 |
|
| 159 |
-
|
| 160 |
def extract_text_from_json(data, document_id, document_name):
|
| 161 |
documents = []
|
| 162 |
|
|
|
|
| 40 |
|
| 41 |
return chunked_docs
|
| 42 |
|
|
|
|
| 43 |
def process_documents_with_chunking(documents):
|
| 44 |
all_chunked_docs = []
|
| 45 |
chunk_info = []
|
|
|
|
| 48 |
text_chunks_count = 0
|
| 49 |
large_tables_count = 0
|
| 50 |
large_images_count = 0
|
| 51 |
+
custom_processed_count = 0
|
| 52 |
|
| 53 |
for doc in documents:
|
| 54 |
doc_type = doc.metadata.get('type', 'text')
|
| 55 |
|
| 56 |
if doc_type == 'table':
|
| 57 |
table_count += 1
|
| 58 |
+
doc_id = doc.metadata.get('document_id', 'unknown')
|
| 59 |
+
table_num = doc.metadata.get('table_number', 'unknown')
|
| 60 |
+
from table_prep import should_use_custom_processing
|
| 61 |
+
use_custom, doc_pattern, method_config = should_use_custom_processing(doc_id, table_num)
|
| 62 |
+
|
| 63 |
+
if use_custom:
|
| 64 |
+
custom_processed_count += 1
|
| 65 |
+
log_message(f"Table {table_num} in document {doc_id} was processed with custom method '{method_config.get('method')}', skipping standard chunking")
|
| 66 |
+
# Add the document as-is since it was already processed by custom method
|
| 67 |
+
all_chunked_docs.append(doc)
|
| 68 |
+
chunk_info.append({
|
| 69 |
+
'document_id': doc_id,
|
| 70 |
+
'section_id': doc.metadata.get('section_id', 'unknown'),
|
| 71 |
+
'chunk_id': 0,
|
| 72 |
+
'chunk_size': len(doc.text),
|
| 73 |
+
'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
|
| 74 |
+
'type': 'table',
|
| 75 |
+
'table_number': table_num,
|
| 76 |
+
'processing_method': method_config.get('method')
|
| 77 |
+
})
|
| 78 |
+
continue
|
| 79 |
+
|
| 80 |
+
# Standard processing for non-custom tables
|
| 81 |
doc_size = len(doc.text)
|
| 82 |
if doc_size > CHUNK_SIZE:
|
| 83 |
large_tables_count += 1
|
| 84 |
+
log_message(f"Large table found: {table_num} in document {doc_id}, size: {doc_size} characters")
|
| 85 |
|
| 86 |
# Chunk large tables
|
| 87 |
chunked_docs = chunk_document(doc)
|
|
|
|
| 95 |
'chunk_size': len(chunk_doc.text),
|
| 96 |
'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
|
| 97 |
'type': 'table',
|
| 98 |
+
'table_number': chunk_doc.metadata.get('table_number', 'unknown'),
|
| 99 |
+
'processing_method': 'standard_chunked'
|
| 100 |
})
|
| 101 |
else:
|
| 102 |
all_chunked_docs.append(doc)
|
|
|
|
| 107 |
'chunk_size': doc_size,
|
| 108 |
'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
|
| 109 |
'type': 'table',
|
| 110 |
+
'table_number': doc.metadata.get('table_number', 'unknown'),
|
| 111 |
+
'processing_method': 'standard'
|
| 112 |
})
|
| 113 |
|
| 114 |
elif doc_type == 'image':
|
|
|
|
| 170 |
'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
|
| 171 |
'type': 'text'
|
| 172 |
})
|
| 173 |
+
|
| 174 |
log_message(f"=== PROCESSING STATISTICS ===")
|
| 175 |
log_message(f"Total tables processed: {table_count}")
|
| 176 |
+
log_message(f"Custom processed tables: {custom_processed_count}")
|
| 177 |
log_message(f"Large tables (>{CHUNK_SIZE} chars): {large_tables_count}")
|
| 178 |
log_message(f"Total images processed: {image_count}")
|
| 179 |
log_message(f"Large images (>{CHUNK_SIZE} chars): {large_images_count}")
|
|
|
|
| 182 |
|
| 183 |
return all_chunked_docs, chunk_info
|
| 184 |
|
|
|
|
| 185 |
def extract_text_from_json(data, document_id, document_name):
|
| 186 |
documents = []
|
| 187 |
|
table_prep.py
CHANGED
|
@@ -7,8 +7,6 @@ from huggingface_hub import hf_hub_download, list_repo_files
|
|
| 7 |
from llama_index.core import Document
|
| 8 |
from my_logging import log_message
|
| 9 |
|
| 10 |
-
|
| 11 |
-
# Add this configuration at the top of your documents_prep file
|
| 12 |
CUSTOM_TABLE_CONFIGS = {
|
| 13 |
"ГОСТ Р 50.05.01-2018": {
|
| 14 |
"tables": {
|
|
@@ -21,7 +19,7 @@ CUSTOM_TABLE_CONFIGS = {
|
|
| 21 |
"№ Б.2": {"method": "split_by_rows"}
|
| 22 |
}
|
| 23 |
},
|
| 24 |
-
"
|
| 25 |
"tables": {
|
| 26 |
"*": {"method": "group_entire_table"} # All tables
|
| 27 |
}
|
|
@@ -39,6 +37,11 @@ CUSTOM_TABLE_CONFIGS = {
|
|
| 39 |
"№ 2": {"method": "split_by_rows"},
|
| 40 |
"№ 3": {"method": "split_by_rows"}
|
| 41 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
}
|
| 43 |
}
|
| 44 |
|
|
@@ -169,7 +172,6 @@ def should_use_custom_processing(document_id, table_number):
|
|
| 169 |
for doc_pattern, config in CUSTOM_TABLE_CONFIGS.items():
|
| 170 |
if document_id.startswith(doc_pattern):
|
| 171 |
tables_config = config.get("tables", {})
|
| 172 |
-
# Check for exact match or wildcard
|
| 173 |
if table_number in tables_config or "*" in tables_config:
|
| 174 |
return True, doc_pattern, tables_config.get(table_number, tables_config.get("*"))
|
| 175 |
return False, None, None
|
|
|
|
| 7 |
from llama_index.core import Document
|
| 8 |
from my_logging import log_message
|
| 9 |
|
|
|
|
|
|
|
| 10 |
CUSTOM_TABLE_CONFIGS = {
|
| 11 |
"ГОСТ Р 50.05.01-2018": {
|
| 12 |
"tables": {
|
|
|
|
| 19 |
"№ Б.2": {"method": "split_by_rows"}
|
| 20 |
}
|
| 21 |
},
|
| 22 |
+
"НП-104-18": {
|
| 23 |
"tables": {
|
| 24 |
"*": {"method": "group_entire_table"} # All tables
|
| 25 |
}
|
|
|
|
| 37 |
"№ 2": {"method": "split_by_rows"},
|
| 38 |
"№ 3": {"method": "split_by_rows"}
|
| 39 |
}
|
| 40 |
+
},
|
| 41 |
+
"НП-089-15": { # New addition
|
| 42 |
+
"tables": {
|
| 43 |
+
"-": {"method": "split_by_rows"}
|
| 44 |
+
}
|
| 45 |
}
|
| 46 |
}
|
| 47 |
|
|
|
|
| 172 |
for doc_pattern, config in CUSTOM_TABLE_CONFIGS.items():
|
| 173 |
if document_id.startswith(doc_pattern):
|
| 174 |
tables_config = config.get("tables", {})
|
|
|
|
| 175 |
if table_number in tables_config or "*" in tables_config:
|
| 176 |
return True, doc_pattern, tables_config.get(table_number, tables_config.get("*"))
|
| 177 |
return False, None, None
|
Табличные данные/НП-104-18_ГОСТ 59023.xlsx
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:4ea4dc2f6b1cad2637b7147e050418dc6b9e2d81bcaeb091c4e6f490f6c9ceca
|
| 3 |
-
size 292360
|
|
|
|
|
|
|
|
|
|
|
|