MrSimple01 committed on
Commit
57493a9
·
verified ·
1 Parent(s): 33d47b8

Update documents_prep.py

Browse files
Files changed (1) hide show
  1. documents_prep.py +25 -15
documents_prep.py CHANGED
@@ -26,18 +26,19 @@ def normalize_steel_designations(text):
26
  """
27
  Convert Latin letters to Cyrillic in steel designations.
28
  Only applies to specific patterns to avoid changing legitimate Latin text.
29
- Returns: (normalized_text, changes_count)
30
  """
31
  if not text:
32
- return text, 0
33
 
34
  import re
35
 
36
  changes_count = 0
 
37
 
38
  def replace_in_steel_grade(match):
39
  """Replace Latin with Cyrillic only in steel grade context"""
40
- nonlocal changes_count
41
  grade = match.group(0)
42
  original_grade = grade
43
 
@@ -58,24 +59,24 @@ def normalize_steel_designations(text):
58
 
59
  if grade != original_grade:
60
  changes_count += 1
 
61
 
62
  return grade
63
 
64
- # Pattern for steel grades: digits followed by letters and more digits/letters
65
- # Examples: 08X18H10T, 12X18H10T, 20X13, etc.
66
  text = re.sub(r'\b\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
67
  replace_in_steel_grade, text)
68
 
69
- # Pattern 2: Welding wire designations like CB-08X19H10, CB-10XH25T
70
  text = re.sub(r'\b[CS]B-\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
71
  replace_in_steel_grade, text)
72
 
73
- # Pattern 3: Welding consumables like C-25, C-26 (but be careful not to change section refs)
74
- # Only replace if followed by dash and digits
75
  text = re.sub(r'\b[C]-\d{1,2}\b',
76
- lambda m: m.group(0).replace('C', 'С'), text)
 
77
 
78
- return text, changes_count
79
 
80
 
81
 
@@ -136,8 +137,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
136
  sheet_name = table_data.get('sheet_name', '')
137
 
138
  # Apply steel designation normalization to title and section
139
- table_title, title_changes = normalize_steel_designations(str(table_title))
140
- section, section_changes = normalize_steel_designations(section)
141
 
142
  table_num_clean = str(table_num).strip()
143
 
@@ -178,28 +179,37 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
178
  normalized_rows = []
179
  total_row_changes = 0
180
  rows_with_changes = 0
 
181
 
182
  for row in rows:
183
  if isinstance(row, dict):
184
  normalized_row = {}
185
  row_had_changes = False
186
  for k, v in row.items():
187
- normalized_val, changes = normalize_steel_designations(str(v))
188
  normalized_row[k] = normalized_val
189
  if changes > 0:
190
  total_row_changes += changes
191
  row_had_changes = True
 
192
  if row_had_changes:
193
  rows_with_changes += 1
194
  normalized_rows.append(normalized_row)
195
  else:
196
  normalized_rows.append(row)
197
 
198
- # Log normalization stats for this table
199
  if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
200
  log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
201
  f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
202
-
 
 
 
 
 
 
 
203
  # Continue with rest of existing logic using normalized_rows...
204
  # Calculate base metadata size
205
  base_content = format_table_header(doc_id, table_identifier, table_num,
 
26
  """
27
  Convert Latin letters to Cyrillic in steel designations.
28
  Only applies to specific patterns to avoid changing legitimate Latin text.
29
+ Returns: (normalized_text, changes_count, changes_list)
30
  """
31
  if not text:
32
+ return text, 0, []
33
 
34
  import re
35
 
36
  changes_count = 0
37
+ changes_list = [] # NEW: Track what changed
38
 
39
  def replace_in_steel_grade(match):
40
  """Replace Latin with Cyrillic only in steel grade context"""
41
+ nonlocal changes_count, changes_list
42
  grade = match.group(0)
43
  original_grade = grade
44
 
 
59
 
60
  if grade != original_grade:
61
  changes_count += 1
62
+ changes_list.append(f"{original_grade} → {grade}") # NEW: Record change
63
 
64
  return grade
65
 
66
+ # Pattern for steel grades
 
67
  text = re.sub(r'\b\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
68
  replace_in_steel_grade, text)
69
 
70
+ # Pattern 2: Welding wire designations
71
  text = re.sub(r'\b[CS]B-\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
72
  replace_in_steel_grade, text)
73
 
74
+ # Pattern 3: Welding consumables
 
75
  text = re.sub(r'\b[C]-\d{1,2}\b',
76
+ lambda m: (changes_list.append(f"{m.group(0)} → {m.group(0).replace('C', 'С')}") or changes_count.__add__(1), m.group(0).replace('C', 'С'))[1] if m.group(0) != m.group(0).replace('C', 'С') else m.group(0),
77
+ text)
78
 
79
+ return text, changes_count, changes_list
80
 
81
 
82
 
 
137
  sheet_name = table_data.get('sheet_name', '')
138
 
139
  # Apply steel designation normalization to title and section
140
+ table_title, title_changes, title_list = normalize_steel_designations(str(table_title))
141
+ section, section_changes, section_list = normalize_steel_designations(section)
142
 
143
  table_num_clean = str(table_num).strip()
144
 
 
179
  normalized_rows = []
180
  total_row_changes = 0
181
  rows_with_changes = 0
182
+ all_row_changes = [] # NEW
183
 
184
  for row in rows:
185
  if isinstance(row, dict):
186
  normalized_row = {}
187
  row_had_changes = False
188
  for k, v in row.items():
189
+ normalized_val, changes, change_list = normalize_steel_designations(str(v))
190
  normalized_row[k] = normalized_val
191
  if changes > 0:
192
  total_row_changes += changes
193
  row_had_changes = True
194
+ all_row_changes.extend(change_list) # NEW
195
  if row_had_changes:
196
  rows_with_changes += 1
197
  normalized_rows.append(normalized_row)
198
  else:
199
  normalized_rows.append(row)
200
 
201
+ # Log normalization stats with examples
202
  if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
203
  log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
204
  f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
205
+
206
+ # NEW: Show examples of what changed
207
+ if title_list:
208
+ log_message(f" Title changes: {', '.join(title_list[:3])}")
209
+ if section_list:
210
+ log_message(f" Section changes: {', '.join(section_list[:3])}")
211
+ if all_row_changes:
212
+ log_message(f" Row examples: {', '.join(all_row_changes[:5])}")
213
  # Continue with rest of existing logic using normalized_rows...
214
  # Calculate base metadata size
215
  base_content = format_table_header(doc_id, table_identifier, table_num,