MrSimple07 committed on
Commit
9ad6501
·
1 Parent(s): 433ff61

max size 25000 + improved table prep

Browse files
config.py CHANGED
@@ -50,7 +50,7 @@ AVAILABLE_MODELS = {
50
 
51
  DEFAULT_MODEL = "Gemini 2.5 Flash"
52
 
53
- CHUNK_SIZE = 8192
54
  CHUNK_OVERLAP = 256
55
 
56
  CUSTOM_PROMPT = """
 
50
 
51
  DEFAULT_MODEL = "Gemini 2.5 Flash"
52
 
53
+ CHUNK_SIZE = 25000
54
  CHUNK_OVERLAP = 256
55
 
56
  CUSTOM_PROMPT = """
documents_prep.py CHANGED
@@ -40,7 +40,6 @@ def chunk_document(doc, chunk_size=None, chunk_overlap=None):
40
 
41
  return chunked_docs
42
 
43
-
44
  def process_documents_with_chunking(documents):
45
  all_chunked_docs = []
46
  chunk_info = []
@@ -49,16 +48,40 @@ def process_documents_with_chunking(documents):
49
  text_chunks_count = 0
50
  large_tables_count = 0
51
  large_images_count = 0
 
52
 
53
  for doc in documents:
54
  doc_type = doc.metadata.get('type', 'text')
55
 
56
  if doc_type == 'table':
57
  table_count += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  doc_size = len(doc.text)
59
  if doc_size > CHUNK_SIZE:
60
  large_tables_count += 1
61
- log_message(f"Large table found: {doc.metadata.get('table_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_size} characters")
62
 
63
  # Chunk large tables
64
  chunked_docs = chunk_document(doc)
@@ -72,7 +95,8 @@ def process_documents_with_chunking(documents):
72
  'chunk_size': len(chunk_doc.text),
73
  'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
74
  'type': 'table',
75
- 'table_number': chunk_doc.metadata.get('table_number', 'unknown')
 
76
  })
77
  else:
78
  all_chunked_docs.append(doc)
@@ -83,7 +107,8 @@ def process_documents_with_chunking(documents):
83
  'chunk_size': doc_size,
84
  'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
85
  'type': 'table',
86
- 'table_number': doc.metadata.get('table_number', 'unknown')
 
87
  })
88
 
89
  elif doc_type == 'image':
@@ -145,9 +170,10 @@ def process_documents_with_chunking(documents):
145
  'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
146
  'type': 'text'
147
  })
148
-
149
  log_message(f"=== PROCESSING STATISTICS ===")
150
  log_message(f"Total tables processed: {table_count}")
 
151
  log_message(f"Large tables (>{CHUNK_SIZE} chars): {large_tables_count}")
152
  log_message(f"Total images processed: {image_count}")
153
  log_message(f"Large images (>{CHUNK_SIZE} chars): {large_images_count}")
@@ -156,7 +182,6 @@ def process_documents_with_chunking(documents):
156
 
157
  return all_chunked_docs, chunk_info
158
 
159
-
160
  def extract_text_from_json(data, document_id, document_name):
161
  documents = []
162
 
 
40
 
41
  return chunked_docs
42
 
 
43
  def process_documents_with_chunking(documents):
44
  all_chunked_docs = []
45
  chunk_info = []
 
48
  text_chunks_count = 0
49
  large_tables_count = 0
50
  large_images_count = 0
51
+ custom_processed_count = 0
52
 
53
  for doc in documents:
54
  doc_type = doc.metadata.get('type', 'text')
55
 
56
  if doc_type == 'table':
57
  table_count += 1
58
+ doc_id = doc.metadata.get('document_id', 'unknown')
59
+ table_num = doc.metadata.get('table_number', 'unknown')
60
+ from table_prep import should_use_custom_processing
61
+ use_custom, doc_pattern, method_config = should_use_custom_processing(doc_id, table_num)
62
+
63
+ if use_custom:
64
+ custom_processed_count += 1
65
+ log_message(f"Table {table_num} in document {doc_id} was processed with custom method '{method_config.get('method')}', skipping standard chunking")
66
+ # Add the document as-is since it was already processed by custom method
67
+ all_chunked_docs.append(doc)
68
+ chunk_info.append({
69
+ 'document_id': doc_id,
70
+ 'section_id': doc.metadata.get('section_id', 'unknown'),
71
+ 'chunk_id': 0,
72
+ 'chunk_size': len(doc.text),
73
+ 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
74
+ 'type': 'table',
75
+ 'table_number': table_num,
76
+ 'processing_method': method_config.get('method')
77
+ })
78
+ continue
79
+
80
+ # Standard processing for non-custom tables
81
  doc_size = len(doc.text)
82
  if doc_size > CHUNK_SIZE:
83
  large_tables_count += 1
84
+ log_message(f"Large table found: {table_num} in document {doc_id}, size: {doc_size} characters")
85
 
86
  # Chunk large tables
87
  chunked_docs = chunk_document(doc)
 
95
  'chunk_size': len(chunk_doc.text),
96
  'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
97
  'type': 'table',
98
+ 'table_number': chunk_doc.metadata.get('table_number', 'unknown'),
99
+ 'processing_method': 'standard_chunked'
100
  })
101
  else:
102
  all_chunked_docs.append(doc)
 
107
  'chunk_size': doc_size,
108
  'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
109
  'type': 'table',
110
+ 'table_number': doc.metadata.get('table_number', 'unknown'),
111
+ 'processing_method': 'standard'
112
  })
113
 
114
  elif doc_type == 'image':
 
170
  'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
171
  'type': 'text'
172
  })
173
+
174
  log_message(f"=== PROCESSING STATISTICS ===")
175
  log_message(f"Total tables processed: {table_count}")
176
+ log_message(f"Custom processed tables: {custom_processed_count}")
177
  log_message(f"Large tables (>{CHUNK_SIZE} chars): {large_tables_count}")
178
  log_message(f"Total images processed: {image_count}")
179
  log_message(f"Large images (>{CHUNK_SIZE} chars): {large_images_count}")
 
182
 
183
  return all_chunked_docs, chunk_info
184
 
 
185
  def extract_text_from_json(data, document_id, document_name):
186
  documents = []
187
 
table_prep.py CHANGED
@@ -7,8 +7,6 @@ from huggingface_hub import hf_hub_download, list_repo_files
7
  from llama_index.core import Document
8
  from my_logging import log_message
9
 
10
-
11
- # Add this configuration at the top of your documents_prep file
12
  CUSTOM_TABLE_CONFIGS = {
13
  "ГОСТ Р 50.05.01-2018": {
14
  "tables": {
@@ -21,7 +19,7 @@ CUSTOM_TABLE_CONFIGS = {
21
  "№ Б.2": {"method": "split_by_rows"}
22
  }
23
  },
24
- "ГОСТ Р 59023.2-2020": {
25
  "tables": {
26
  "*": {"method": "group_entire_table"} # All tables
27
  }
@@ -39,6 +37,11 @@ CUSTOM_TABLE_CONFIGS = {
39
  "№ 2": {"method": "split_by_rows"},
40
  "№ 3": {"method": "split_by_rows"}
41
  }
 
 
 
 
 
42
  }
43
  }
44
 
@@ -169,7 +172,6 @@ def should_use_custom_processing(document_id, table_number):
169
  for doc_pattern, config in CUSTOM_TABLE_CONFIGS.items():
170
  if document_id.startswith(doc_pattern):
171
  tables_config = config.get("tables", {})
172
- # Check for exact match or wildcard
173
  if table_number in tables_config or "*" in tables_config:
174
  return True, doc_pattern, tables_config.get(table_number, tables_config.get("*"))
175
  return False, None, None
 
7
  from llama_index.core import Document
8
  from my_logging import log_message
9
 
 
 
10
  CUSTOM_TABLE_CONFIGS = {
11
  "ГОСТ Р 50.05.01-2018": {
12
  "tables": {
 
19
  "№ Б.2": {"method": "split_by_rows"}
20
  }
21
  },
22
+ "НП-104-18": {
23
  "tables": {
24
  "*": {"method": "group_entire_table"} # All tables
25
  }
 
37
  "№ 2": {"method": "split_by_rows"},
38
  "№ 3": {"method": "split_by_rows"}
39
  }
40
+ },
41
+ "НП-089-15": { # New addition
42
+ "tables": {
43
+ "-": {"method": "split_by_rows"}
44
+ }
45
  }
46
  }
47
 
 
172
  for doc_pattern, config in CUSTOM_TABLE_CONFIGS.items():
173
  if document_id.startswith(doc_pattern):
174
  tables_config = config.get("tables", {})
 
175
  if table_number in tables_config or "*" in tables_config:
176
  return True, doc_pattern, tables_config.get(table_number, tables_config.get("*"))
177
  return False, None, None
Табличные данные/НП-104-18_ГОСТ 59023.xlsx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4ea4dc2f6b1cad2637b7147e050418dc6b9e2d81bcaeb091c4e6f490f6c9ceca
3
- size 292360