MrSimple01 committed on
Commit
77e588d
·
verified ·
1 Parent(s): b03f5ea

Update documents_prep.py

Browse files
Files changed (1) hide show
  1. documents_prep.py +27 -11
documents_prep.py CHANGED
@@ -34,6 +34,20 @@ def chunk_text_documents(documents):
34
 
35
  return chunked
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
39
  headers = table_data.get('headers', [])
@@ -43,6 +57,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
43
  section = table_data.get('section', '')
44
 
45
  table_num_clean = str(table_num).strip()
 
46
 
47
  import re
48
  if 'приложени' in section.lower():
@@ -60,8 +75,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
60
 
61
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
62
 
63
- # Calculate base metadata size
64
- base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
65
  base_size = len(base_content)
66
  available_space = max_chars - base_size - 200
67
 
@@ -74,8 +89,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
74
  'type': 'table',
75
  'document_id': doc_id,
76
  'table_number': table_num_clean,
77
- 'table_identifier': table_identifier,
78
- 'table_title': table_title,
79
  'section': section,
80
  'total_rows': len(rows),
81
  'chunk_size': len(content),
@@ -105,8 +120,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
105
  'type': 'table',
106
  'document_id': doc_id,
107
  'table_number': table_num_clean,
108
- 'table_identifier': table_identifier,
109
- 'table_title': table_title,
110
  'section': section,
111
  'chunk_id': chunk_num,
112
  'row_start': current_rows[0]['_idx'] - 1,
@@ -139,8 +154,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
139
  'type': 'table',
140
  'document_id': doc_id,
141
  'table_number': table_num_clean,
142
- 'table_identifier': table_identifier,
143
- 'table_title': table_title,
144
  'section': section,
145
  'chunk_id': chunk_num,
146
  'row_start': current_rows[0]['_idx'] - 1,
@@ -156,15 +171,16 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
156
  return chunks
157
 
158
 
 
159
  def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
160
- content = f"ТАБЛИЦА {table_identifier} из документа {doc_id}\n"
161
 
162
  # Add table type/number prominently for matching
163
  if table_num:
164
- content += f"ТИП: {table_num}\n"
165
 
166
  if table_title:
167
- content += f"НАЗВАНИЕ: {table_title}\n"
168
 
169
  if section:
170
  content += f"РАЗДЕЛ: {section}\n"
 
34
 
35
  return chunked
36
 
37
def normalize_text(text):
    """Normalize welding-type designators to use the Cyrillic letter ES.

    Designators such as "C-25" are sometimes typed with the Latin capital
    letter C (U+0043), which looks identical to the Cyrillic capital letter
    ES (U+0421) but compares unequal. This function rewrites a Latin C that
    starts a word and is followed by a digit, or by a hyphen and a digit,
    to U+0421, so that both spellings produce the same string.

    The hyphen and the digits are preserved: "C-25" becomes
    "<U+0421>-25" and "C25" becomes "<U+0421>25".

    Args:
        text: The string to normalize. Falsy values (None, "") are
            returned unchanged.

    Returns:
        The normalized string, or ``text`` itself when it is falsy.
    """
    if not text:
        return text

    # Local import mirrors the rest of this module, which imports `re`
    # inside the functions that use it.
    import re

    # Homoglyphs are written as escapes so the direction of the mapping is
    # unambiguous when reading the source.
    latin_c = '\u0043'
    cyrillic_es = '\u0421'

    # NOTE(review): the previous implementation replaced the two-character
    # sequence "<letter>-" with a single letter, which silently dropped the
    # hyphen, and then ran a substitution that replaced a letter with
    # itself. The documented intent ("C-25 -> <U+0421>-25") is implemented
    # here instead; confirm that query-side normalization agrees.
    #
    # The lookahead keeps the hyphen/digit in place and restricts the
    # rewrite to designator-like tokens, so ordinary Latin words that
    # start with "C" are left alone.
    return re.sub(r'\b' + latin_c + r'(?=-?\d)', cyrillic_es, text)
51
 
52
  def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
53
  headers = table_data.get('headers', [])
 
57
  section = table_data.get('section', '')
58
 
59
  table_num_clean = str(table_num).strip()
60
+ table_title_normalized = normalize_text(str(table_title)) # NORMALIZE TITLE
61
 
62
  import re
63
  if 'приложени' in section.lower():
 
75
 
76
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
77
 
78
+ # Calculate base metadata size with NORMALIZED title
79
+ base_content = format_table_header(doc_id, table_identifier, table_num, table_title_normalized, section, headers)
80
  base_size = len(base_content)
81
  available_space = max_chars - base_size - 200
82
 
 
89
  'type': 'table',
90
  'document_id': doc_id,
91
  'table_number': table_num_clean,
92
+ 'table_identifier': normalize_text(table_identifier), # NORMALIZE identifier
93
+ 'table_title': table_title_normalized, # NORMALIZED
94
  'section': section,
95
  'total_rows': len(rows),
96
  'chunk_size': len(content),
 
120
  'type': 'table',
121
  'document_id': doc_id,
122
  'table_number': table_num_clean,
123
+ 'table_identifier': normalize_text(table_identifier), # NORMALIZE
124
+ 'table_title': table_title_normalized, # NORMALIZED
125
  'section': section,
126
  'chunk_id': chunk_num,
127
  'row_start': current_rows[0]['_idx'] - 1,
 
154
  'type': 'table',
155
  'document_id': doc_id,
156
  'table_number': table_num_clean,
157
+ 'table_identifier': normalize_text(table_identifier), # NORMALIZE
158
+ 'table_title': table_title_normalized, # NORMALIZED
159
  'section': section,
160
  'chunk_id': chunk_num,
161
  'row_start': current_rows[0]['_idx'] - 1,
 
171
  return chunks
172
 
173
 
174
+ # MODIFIED: Update format_table_header function
175
  def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
176
+ content = f"ТАБЛИЦА {normalize_text(table_identifier)} из документа {doc_id}\n"
177
 
178
  # Add table type/number prominently for matching
179
  if table_num:
180
+ content += f"ТИП: {normalize_text(table_num)}\n"
181
 
182
  if table_title:
183
+ content += f"НАЗВАНИЕ: {normalize_text(table_title)}\n"
184
 
185
  if section:
186
  content += f"РАЗДЕЛ: {section}\n"