MrSimple07 committed on
Commit
5263b61
·
1 Parent(s): 9ce9909

Added new loggers for normalization

Browse files
Files changed (1) hide show
  1. documents_prep.py +39 -40
documents_prep.py CHANGED
@@ -25,57 +25,49 @@ def normalize_text(text):
25
  def normalize_steel_designations(text):
26
  """
27
  Convert Latin letters to Cyrillic in steel designations.
28
- Only applies to specific patterns to avoid changing legitimate Latin text.
29
- Returns: (normalized_text, changes_count)
30
  """
31
  if not text:
32
- return text, 0
33
-
34
  import re
35
-
36
  changes_count = 0
37
-
 
38
  def replace_in_steel_grade(match):
39
- """Replace Latin with Cyrillic only in steel grade context"""
40
- nonlocal changes_count
41
  grade = match.group(0)
42
  original_grade = grade
43
-
44
- # Mapping of Latin to Cyrillic for steel designations
45
  replacements = {
46
- 'X': 'Х', # Latin X -> Cyrillic Х (Kha)
47
- 'H': 'Н', # Latin H -> Cyrillic Н (En)
48
- 'T': 'Т', # Latin T -> Cyrillic Т (Te)
49
- 'C': 'С', # Latin C -> Cyrillic С (Es)
50
- 'B': 'В', # Latin B -> Cyrillic В (Ve)
51
- 'K': 'К', # Latin K -> Cyrillic К (Ka)
52
- 'M': 'М', # Latin M -> Cyrillic М (Em)
53
- 'A': 'А', # Latin A -> Cyrillic А (A)
54
- 'P': 'Р', # Latin P -> Cyrillic Р (Er)
55
  }
56
  for latin, cyrillic in replacements.items():
57
  grade = grade.replace(latin, cyrillic)
58
-
59
  if grade != original_grade:
60
  changes_count += 1
61
-
 
62
  return grade
63
-
64
- # Pattern for steel grades: digits followed by letters and more digits/letters
65
- # Examples: 08X18H10T, 12X18H10T, 20X13, etc.
66
- text = re.sub(r'\b\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
67
  replace_in_steel_grade, text)
68
-
69
- # Pattern 2: Welding wire designations like CB-08X19H10, CB-10XH25T
70
  text = re.sub(r'\b[CS]B-\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
71
  replace_in_steel_grade, text)
72
-
73
- # Pattern 3: Welding consumables like C-25, C-26 (but be careful not to change section refs)
74
- # Only replace if followed by dash and digits
75
- text = re.sub(r'\b[C]-\d{1,2}\b',
76
- lambda m: m.group(0).replace('C', 'С'), text)
77
-
78
- return text, changes_count
79
 
80
 
81
 
@@ -174,32 +166,39 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
174
 
175
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
176
 
177
- # Normalize all row content (including steel designations)
178
  normalized_rows = []
179
  total_row_changes = 0
180
  rows_with_changes = 0
181
-
 
182
  for row in rows:
183
  if isinstance(row, dict):
184
  normalized_row = {}
185
  row_had_changes = False
 
186
  for k, v in row.items():
187
- normalized_val, changes = normalize_steel_designations(str(v))
188
  normalized_row[k] = normalized_val
189
  if changes > 0:
190
  total_row_changes += changes
191
  row_had_changes = True
 
192
  if row_had_changes:
193
  rows_with_changes += 1
 
194
  normalized_rows.append(normalized_row)
195
  else:
196
  normalized_rows.append(row)
197
-
198
  # Log normalization stats for this table
199
  if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
200
  log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
201
- f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
202
-
 
 
 
 
203
  # Continue with rest of existing logic using normalized_rows...
204
  # Calculate base metadata size
205
  base_content = format_table_header(doc_id, table_identifier, table_num,
 
25
  def normalize_steel_designations(text):
26
  """
27
  Convert Latin letters to Cyrillic in steel designations.
28
+ Returns: (normalized_text, changes_count, normalized_words)
 
29
  """
30
  if not text:
31
+ return text, 0, []
32
+
33
  import re
34
+
35
  changes_count = 0
36
+ normalized_words = []
37
+
38
  def replace_in_steel_grade(match):
39
+ nonlocal changes_count, normalized_words
 
40
  grade = match.group(0)
41
  original_grade = grade
42
+
 
43
  replacements = {
44
+ 'X': 'Х',
45
+ 'H': 'Н',
46
+ 'T': 'Т',
47
+ 'C': 'С',
48
+ 'B': 'В',
49
+ 'K': 'К',
50
+ 'M': 'М',
51
+ 'A': 'А',
52
+ 'P': 'Р',
53
  }
54
  for latin, cyrillic in replacements.items():
55
  grade = grade.replace(latin, cyrillic)
56
+
57
  if grade != original_grade:
58
  changes_count += 1
59
+ normalized_words.append((original_grade, grade))
60
+
61
  return grade
62
+
63
+ text = re.sub(r'\b\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
 
 
64
  replace_in_steel_grade, text)
 
 
65
  text = re.sub(r'\b[CS]B-\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
66
  replace_in_steel_grade, text)
67
+ text = re.sub(r'\b[C]-\d{1,2}\b',
68
+ lambda m: (normalized_words.append((m.group(0), m.group(0).replace('C', 'С'))) or m.group(0).replace('C', 'С')), text)
69
+
70
+ return text, changes_count, normalized_words
 
 
 
71
 
72
 
73
 
 
166
 
167
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
168
 
 
169
  normalized_rows = []
170
  total_row_changes = 0
171
  rows_with_changes = 0
172
+ all_normalized_words = []
173
+
174
  for row in rows:
175
  if isinstance(row, dict):
176
  normalized_row = {}
177
  row_had_changes = False
178
+ row_words = []
179
  for k, v in row.items():
180
+ normalized_val, changes, norm_words = normalize_steel_designations(str(v))
181
  normalized_row[k] = normalized_val
182
  if changes > 0:
183
  total_row_changes += changes
184
  row_had_changes = True
185
+ row_words.extend(norm_words)
186
  if row_had_changes:
187
  rows_with_changes += 1
188
+ all_normalized_words.extend(row_words)
189
  normalized_rows.append(normalized_row)
190
  else:
191
  normalized_rows.append(row)
192
+
193
  # Log normalization stats for this table
194
  if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
195
  log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
196
+ f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
197
+ if all_normalized_words:
198
+ log_message(" Normalized words:")
199
+ for orig, norm in all_normalized_words:
200
+ log_message(f" {orig} → {norm}")
201
+
202
  # Continue with rest of existing logic using normalized_rows...
203
  # Calculate base metadata size
204
  base_content = format_table_header(doc_id, table_identifier, table_num,