MrSimple07 committed on
Commit
736465e
·
1 Parent(s): 9ad6501

Added 2 new custom tables + fixed metadata repetition

Browse files
Files changed (1) hide show
  1. table_prep.py +37 -17
table_prep.py CHANGED
@@ -19,7 +19,7 @@ CUSTOM_TABLE_CONFIGS = {
19
  "№ Б.2": {"method": "split_by_rows"}
20
  }
21
  },
22
- "НП-104-18": {
23
  "tables": {
24
  "*": {"method": "group_entire_table"} # All tables
25
  }
@@ -38,34 +38,57 @@ CUSTOM_TABLE_CONFIGS = {
38
  "№ 3": {"method": "split_by_rows"}
39
  }
40
  },
41
- "НП-089-15": { # New addition
42
  "tables": {
43
- "-": {"method": "split_by_rows"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  }
45
  }
46
  }
47
 
48
  def create_meta_info(document_name, section, table_number, table_title, extra_info=""):
49
- """Create standard meta information string"""
50
- base_info = f'Документ "{document_name}", Раздел: {section}, Номер таблицы: {table_number}, Название таблицы: {table_title}'
 
51
  if extra_info:
52
  base_info += f', {extra_info}'
53
- return base_info + '\n'
54
 
55
  def create_chunk_text(meta_info, headers, rows, add_row_numbers=False):
56
- """Create chunk text with headers and rows"""
 
 
57
  header_line = " | ".join(headers)
58
- chunk_lines = [meta_info + "Заголовки: " + header_line]
59
 
 
60
  for i, row in enumerate(rows, start=1):
61
- row_text = " | ".join([f"{h}: {row.get(h, '')}" for h in headers])
 
 
 
 
 
62
  if add_row_numbers:
63
- chunk_lines.append(f"Строка {i}: {row_text}")
64
  else:
65
- chunk_lines.append(row_text)
66
 
67
  return "\n".join(chunk_lines)
68
-
69
  def group_by_column_method(table_data, document_name, group_column):
70
  """Group rows by specified column value"""
71
  documents = []
@@ -194,19 +217,16 @@ def process_table_with_custom_method(table_data, document_name, method_config):
194
  def table_to_document(table_data, document_id=None):
195
  if isinstance(table_data, dict):
196
  doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))
197
- table_num = table_data.get('table_number', 'Неизвестно')
198
-
199
- # Check if this table should use custom processing
200
  use_custom, doc_pattern, method_config = should_use_custom_processing(doc_id, table_num)
201
 
202
  if use_custom:
203
  log_message(f"Using custom processing for table {table_num} in document {doc_id}")
204
  custom_docs = process_table_with_custom_method(table_data, doc_id, method_config)
205
  if custom_docs:
206
- # Return custom processed documents and skip default processing
207
  return custom_docs
208
 
209
- # Default processing for tables not in custom config
210
  table_title = table_data.get('table_title', 'Неизвестно')
211
  section = table_data.get('section', 'Неизвестно')
212
 
 
19
  "№ Б.2": {"method": "split_by_rows"}
20
  }
21
  },
22
+ "НП-104-18": {
23
  "tables": {
24
  "*": {"method": "group_entire_table"} # All tables
25
  }
 
38
  "№ 3": {"method": "split_by_rows"}
39
  }
40
  },
41
+ "НП-089-15": {
42
  "tables": {
43
+ "-": {"method": "split_by_rows"}
44
+ }
45
+ },
46
+ "НП-105-18": { # New addition for problematic table
47
+ "tables": {
48
+ "№ 4.8": {"method": "group_entire_table"}
49
+ }
50
+ },
51
+ "ГОСТ Р 50.05.23-2020": { # New addition for problematic table
52
+ "tables": {
53
+ "№8": {"method": "group_entire_table"}
54
+ }
55
+ },
56
+ "ГОСТ Р 50.03.01-2017": { # New addition for А.8 table
57
+ "tables": {
58
+ "А.8": {"method": "group_entire_table"}
59
  }
60
  }
61
  }
62
 
def create_meta_info(document_name, section, table_number, table_title, extra_info=""):
    """Build the one-line metadata prefix placed at the top of a table chunk.

    Args:
        document_name: Name of the source document (rendered in quotes).
        section: Section of the document the table belongs to.
        table_number: Table identifier as it should appear in the text.
        table_title: Table title; omitted from the result when it is empty,
            ``None`` or whitespace-only.
        extra_info: Optional trailing fragment appended after a comma.

    Returns:
        The metadata string, without a trailing newline.
    """
    base_info = f'Документ "{document_name}", Раздел: {section}, Таблица: {table_number}'
    # Coerce to str before stripping so a non-string title (e.g. a number)
    # does not raise AttributeError on .strip().
    if table_title and str(table_title).strip():
        base_info += f', Название: {table_title}'
    if extra_info:
        base_info += f', {extra_info}'
    return base_info
70
 
def create_chunk_text(meta_info, headers, rows, add_row_numbers=False):
    """Render a table chunk as text: metadata line, header line, then rows.

    Args:
        meta_info: Metadata string for the chunk; trailing whitespace
            (including a trailing newline) is stripped.
        headers: Column names, in output order.
        rows: Iterable of mappings from header to cell value.
        add_row_numbers: When true, prefix every row line with
            ``Строка <n>: `` (1-based).

    Returns:
        The lines joined with ``\n``. Each row is rendered as
        ``header: value`` pairs separated by `` | ``; cells that are missing,
        ``None`` or an empty string are left out.
    """
    chunk_lines = [meta_info.rstrip()]

    # Headers are emitted once per chunk, not once per row.
    header_line = " | ".join(map(str, headers))
    chunk_lines.append(f"Заголовки: {header_line}")

    for i, row in enumerate(rows, start=1):
        row_parts = []
        for h in headers:
            value = row.get(h, '')
            # Skip only genuinely empty cells; a plain truthiness test would
            # also drop legitimate values such as 0 or False.
            if value is None or value == '':
                continue
            row_parts.append(f"{h}: {value}")

        row_text = ' | '.join(row_parts)
        if add_row_numbers:
            chunk_lines.append(f"Строка {i}: {row_text}")
        else:
            chunk_lines.append(row_text)

    return "\n".join(chunk_lines)
 
92
  def group_by_column_method(table_data, document_name, group_column):
93
  """Group rows by specified column value"""
94
  documents = []
 
217
  def table_to_document(table_data, document_id=None):
218
  if isinstance(table_data, dict):
219
  doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))
220
+ table_num = table_data.get('table_number', 'Неизвестно')
 
 
221
  use_custom, doc_pattern, method_config = should_use_custom_processing(doc_id, table_num)
222
 
223
  if use_custom:
224
  log_message(f"Using custom processing for table {table_num} in document {doc_id}")
225
  custom_docs = process_table_with_custom_method(table_data, doc_id, method_config)
226
  if custom_docs:
 
227
  return custom_docs
228
 
229
+ # DEFAULT PROCESSING (only if NOT using custom)
230
  table_title = table_data.get('table_title', 'Неизвестно')
231
  section = table_data.get('section', 'Неизвестно')
232