MrSimple07 committed on
Commit
035fbdc
·
1 Parent(s): 0f89c6b

Added new loggers for normalizations

Browse files
Files changed (1) hide show
  1. documents_prep.py +35 -39
documents_prep.py CHANGED
@@ -24,60 +24,56 @@ def normalize_text(text):
24
 
25
  def normalize_steel_designations(text):
26
  """
27
- Convert Latin letters to Cyrillic in steel designations.
28
- Only applies to specific patterns to avoid changing legitimate Latin text.
29
  Returns: (normalized_text, changes_count, changes_list)
30
  """
31
  if not text:
32
  return text, 0, []
33
-
34
  import re
35
-
36
  changes_count = 0
37
- changes_list = [] # NEW: Track what changed
38
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  def replace_in_steel_grade(match):
40
- """Replace Latin with Cyrillic only in steel grade context"""
41
  nonlocal changes_count, changes_list
42
  grade = match.group(0)
43
  original_grade = grade
44
-
45
- # Mapping of Latin to Cyrillic for steel designations
46
- replacements = {
47
- 'X': 'Х', # Latin X -> Cyrillic Х (Kha)
48
- 'H': 'Н', # Latin H -> Cyrillic Н (En)
49
- 'T': 'Т', # Latin T -> Cyrillic Т (Te)
50
- 'C': 'С', # Latin C -> Cyrillic С (Es)
51
- 'B': 'В', # Latin B -> Cyrillic В (Ve)
52
- 'K': 'К', # Latin K -> Cyrillic К (Ka)
53
- 'M': 'М', # Latin M -> Cyrillic М (Em)
54
- 'A': 'А', # Latin A -> Cyrillic А (A)
55
- 'P': 'Р', # Latin P -> Cyrillic Р (Er)
56
- }
57
  for latin, cyrillic in replacements.items():
58
- grade = grade.replace(latin, cyrillic)
59
-
60
  if grade != original_grade:
61
  changes_count += 1
62
- changes_list.append(f"{original_grade} → {grade}") # NEW: Record change
63
-
64
  return grade
65
-
66
- # Pattern for steel grades
67
- text = re.sub(r'\b\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
68
- replace_in_steel_grade, text)
69
-
70
- # Pattern 2: Welding wire designations
71
- text = re.sub(r'\b[CS]B-\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
72
- replace_in_steel_grade, text)
73
-
74
- # Pattern 3: Welding consumables
75
- text = re.sub(r'\b[C]-\d{1,2}\b',
76
- lambda m: (changes_list.append(f"{m.group(0)} → {m.group(0).replace('C', 'С')}") or changes_count.__add__(1), m.group(0).replace('C', 'С'))[1] if m.group(0) != m.group(0).replace('C', 'С') else m.group(0),
77
- text)
78
-
79
- return text, changes_count, changes_list
80
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
 
83
  def chunk_text_documents(documents):
 
24
 
25
  def normalize_steel_designations(text):
26
  """
27
+ Convert Latin letters to Cyrillic in steel designations, including mixed patterns.
 
28
  Returns: (normalized_text, changes_count, changes_list)
29
  """
30
  if not text:
31
  return text, 0, []
32
+
33
  import re
34
+
35
  changes_count = 0
36
+ changes_list = []
37
+
38
+ # Mapping of Latin to Cyrillic for steel designations
39
+ replacements = {
40
+ 'X': 'Х', # Latin X -> Cyrillic Х
41
+ 'H': 'Н', # Latin H -> Cyrillic Н
42
+ 'T': 'Т', # Latin T -> Cyrillic Т
43
+ 'C': 'С', # Latin C -> Cyrillic С
44
+ 'B': 'В', # Latin B -> Cyrillic В
45
+ 'K': 'К', # Latin K -> Cyrillic К
46
+ 'M': 'М', # Latin M -> Cyrillic М
47
+ 'A': 'А', # Latin A -> Cyrillic А
48
+ 'P': 'Р', # Latin P -> Cyrillic Р
49
+ }
50
+
51
+ # Regex for steel grades: digits + letters (Latin or Cyrillic), possibly mixed
52
+ pattern = r'\b\d{1,3}[XHTCBKMAPХНТСКМАР]{1,6}\d{0,2}[XHTCBKMAPХНТСКМАР]{0,6}\b'
53
+
54
  def replace_in_steel_grade(match):
 
55
  nonlocal changes_count, changes_list
56
  grade = match.group(0)
57
  original_grade = grade
58
+ # Replace only Latin letters, leave Cyrillic as is
 
 
 
 
 
 
 
 
 
 
 
 
59
  for latin, cyrillic in replacements.items():
60
+ grade = re.sub(latin, cyrillic, grade)
 
61
  if grade != original_grade:
62
  changes_count += 1
63
+ changes_list.append(f"{original_grade} → {grade}")
 
64
  return grade
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
+ text = re.sub(pattern, replace_in_steel_grade, text)
67
+
68
+ # Pattern for welding wire designations (CB-..., СВ-..., etc.)
69
+ wire_pattern = r'\b[CSС]B-\d{1,3}[XHTCBKMAPХНТСКМАР]{1,6}\d{0,2}[XHTCBKMAPХНТСКМАР]{0,6}\b'
70
+ text = re.sub(wire_pattern, replace_in_steel_grade, text)
71
+
72
+ # Pattern for welding consumables (C-..., С-...)
73
+ consumable_pattern = r'\b[CSС]-\d{1,2}\b'
74
+ text = re.sub(consumable_pattern, replace_in_steel_grade, text)
75
+
76
+ return text, changes_count, changes_list
77
 
78
 
79
  def chunk_text_documents(documents):