MrSimple07 committed on
Commit
bd2b030
·
1 Parent(s): 5263b61

Added new loggers for steel designation normalization

Browse files
Files changed (1) hide show
  1. documents_prep.py +51 -40
documents_prep.py CHANGED
@@ -25,49 +25,58 @@ def normalize_text(text):
25
  def normalize_steel_designations(text):
26
  """
27
  Convert Latin letters to Cyrillic in steel designations.
28
- Returns: (normalized_text, changes_count, normalized_words)
 
29
  """
30
  if not text:
31
  return text, 0, []
32
-
33
  import re
34
-
35
  changes_count = 0
36
- normalized_words = []
37
-
38
  def replace_in_steel_grade(match):
39
- nonlocal changes_count, normalized_words
 
40
  grade = match.group(0)
41
  original_grade = grade
42
-
 
43
  replacements = {
44
- 'X': 'Х',
45
- 'H': 'Н',
46
- 'T': 'Т',
47
- 'C': 'С',
48
- 'B': 'В',
49
- 'K': 'К',
50
- 'M': 'М',
51
- 'A': 'А',
52
- 'P': 'Р',
53
  }
54
  for latin, cyrillic in replacements.items():
55
  grade = grade.replace(latin, cyrillic)
56
-
57
  if grade != original_grade:
58
  changes_count += 1
59
- normalized_words.append((original_grade, grade))
60
-
61
  return grade
62
-
63
- text = re.sub(r'\b\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
 
64
  replace_in_steel_grade, text)
 
 
65
  text = re.sub(r'\b[CS]B-\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
66
  replace_in_steel_grade, text)
67
- text = re.sub(r'\b[C]-\d{1,2}\b',
68
- lambda m: (normalized_words.append((m.group(0), m.group(0).replace('C', 'С'))) or m.group(0).replace('C', 'С')), text)
69
-
70
- return text, changes_count, normalized_words
 
 
 
71
 
72
 
73
 
@@ -128,8 +137,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
128
  sheet_name = table_data.get('sheet_name', '')
129
 
130
  # Apply steel designation normalization to title and section
131
- table_title, title_changes = normalize_steel_designations(str(table_title))
132
- section, section_changes = normalize_steel_designations(section)
133
 
134
  table_num_clean = str(table_num).strip()
135
 
@@ -166,39 +175,41 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
166
 
167
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
168
 
 
169
  normalized_rows = []
170
  total_row_changes = 0
171
  rows_with_changes = 0
172
- all_normalized_words = []
173
-
174
  for row in rows:
175
  if isinstance(row, dict):
176
  normalized_row = {}
177
  row_had_changes = False
178
- row_words = []
179
  for k, v in row.items():
180
- normalized_val, changes, norm_words = normalize_steel_designations(str(v))
181
  normalized_row[k] = normalized_val
182
  if changes > 0:
183
  total_row_changes += changes
184
  row_had_changes = True
185
- row_words.extend(norm_words)
186
  if row_had_changes:
187
  rows_with_changes += 1
188
- all_normalized_words.extend(row_words)
189
  normalized_rows.append(normalized_row)
190
  else:
191
  normalized_rows.append(row)
192
-
193
- # Log normalization stats for this table
194
  if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
195
  log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
196
- f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
197
- if all_normalized_words:
198
- log_message(" Normalized words:")
199
- for orig, norm in all_normalized_words:
200
- log_message(f" {orig} → {norm}")
201
 
 
 
 
 
 
 
 
202
  # Continue with rest of existing logic using normalized_rows...
203
  # Calculate base metadata size
204
  base_content = format_table_header(doc_id, table_identifier, table_num,
 
25
  def normalize_steel_designations(text):
26
  """
27
  Convert Latin letters to Cyrillic in steel designations.
28
+ Only applies to specific patterns to avoid changing legitimate Latin text.
29
+ Returns: (normalized_text, changes_count, changes_list)
30
  """
31
  if not text:
32
  return text, 0, []
33
+
34
  import re
35
+
36
  changes_count = 0
37
+ changes_list = [] # NEW: Track what changed
38
+
39
  def replace_in_steel_grade(match):
40
+ """Replace Latin with Cyrillic only in steel grade context"""
41
+ nonlocal changes_count, changes_list
42
  grade = match.group(0)
43
  original_grade = grade
44
+
45
+ # Mapping of Latin to Cyrillic for steel designations
46
  replacements = {
47
+ 'X': 'Х', # Latin X -> Cyrillic Х (Kha)
48
+ 'H': 'Н', # Latin H -> Cyrillic Н (En)
49
+ 'T': 'Т', # Latin T -> Cyrillic Т (Te)
50
+ 'C': 'С', # Latin C -> Cyrillic С (Es)
51
+ 'B': 'В', # Latin B -> Cyrillic В (Ve)
52
+ 'K': 'К', # Latin K -> Cyrillic К (Ka)
53
+ 'M': 'М', # Latin M -> Cyrillic М (Em)
54
+ 'A': 'А', # Latin A -> Cyrillic А (A)
55
+ 'P': 'Р', # Latin P -> Cyrillic Р (Er)
56
  }
57
  for latin, cyrillic in replacements.items():
58
  grade = grade.replace(latin, cyrillic)
59
+
60
  if grade != original_grade:
61
  changes_count += 1
62
+ changes_list.append(f"{original_grade} → {grade}") # NEW: Record change
63
+
64
  return grade
65
+
66
+ # Pattern for steel grades
67
+ text = re.sub(r'\b\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
68
  replace_in_steel_grade, text)
69
+
70
+ # Pattern 2: Welding wire designations
71
  text = re.sub(r'\b[CS]B-\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
72
  replace_in_steel_grade, text)
73
+
74
+ # Pattern 3: Welding consumables
75
+ text = re.sub(r'\b[C]-\d{1,2}\b',
76
+ lambda m: (changes_list.append(f"{m.group(0)} → {m.group(0).replace('C', 'С')}") or changes_count.__add__(1), m.group(0).replace('C', 'С'))[1] if m.group(0) != m.group(0).replace('C', 'С') else m.group(0),
77
+ text)
78
+
79
+ return text, changes_count, changes_list
80
 
81
 
82
 
 
137
  sheet_name = table_data.get('sheet_name', '')
138
 
139
  # Apply steel designation normalization to title and section
140
+ table_title, title_changes, title_list = normalize_steel_designations(str(table_title))
141
+ section, section_changes, section_list = normalize_steel_designations(section)
142
 
143
  table_num_clean = str(table_num).strip()
144
 
 
175
 
176
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
177
 
178
+ # Normalize all row content (including steel designations)
179
  normalized_rows = []
180
  total_row_changes = 0
181
  rows_with_changes = 0
182
+ all_row_changes = [] # NEW
183
+
184
  for row in rows:
185
  if isinstance(row, dict):
186
  normalized_row = {}
187
  row_had_changes = False
 
188
  for k, v in row.items():
189
+ normalized_val, changes, change_list = normalize_steel_designations(str(v))
190
  normalized_row[k] = normalized_val
191
  if changes > 0:
192
  total_row_changes += changes
193
  row_had_changes = True
194
+ all_row_changes.extend(change_list) # NEW
195
  if row_had_changes:
196
  rows_with_changes += 1
 
197
  normalized_rows.append(normalized_row)
198
  else:
199
  normalized_rows.append(row)
200
+
201
+ # Log normalization stats with examples
202
  if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
203
  log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
204
+ f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
 
 
 
 
205
 
206
+ # NEW: Show examples of what changed
207
+ if title_list:
208
+ log_message(f" Title changes: {', '.join(title_list[:3])}")
209
+ if section_list:
210
+ log_message(f" Section changes: {', '.join(section_list[:3])}")
211
+ if all_row_changes:
212
+ log_message(f" Row examples: {', '.join(all_row_changes[:5])}")
213
  # Continue with rest of existing logic using normalized_rows...
214
  # Calculate base metadata size
215
  base_content = format_table_header(doc_id, table_identifier, table_num,