MrSimple07 committed on
Commit
8362ae9
·
1 Parent(s): 035fbdc

Added new loggers for normalizations

Browse files
Files changed (1) hide show
  1. documents_prep.py +25 -31
documents_prep.py CHANGED
@@ -22,58 +22,52 @@ def normalize_text(text):
22
 
23
  return text
24
 
 
 
25
  def normalize_steel_designations(text):
26
  """
27
- Convert Latin letters to Cyrillic in steel designations, including mixed patterns.
 
28
  Returns: (normalized_text, changes_count, changes_list)
29
  """
30
  if not text:
31
  return text, 0, []
32
 
33
- import re
34
-
35
  changes_count = 0
36
  changes_list = []
37
 
38
  # Mapping of Latin to Cyrillic for steel designations
39
  replacements = {
40
- 'X': 'Х', # Latin X -> Cyrillic Х
41
- 'H': 'Н', # Latin H -> Cyrillic Н
42
- 'T': 'Т', # Latin T -> Cyrillic Т
43
- 'C': 'С', # Latin C -> Cyrillic С
44
- 'B': 'В', # Latin B -> Cyrillic В
45
- 'K': 'К', # Latin K -> Cyrillic К
46
- 'M': 'М', # Latin M -> Cyrillic М
47
- 'A': 'А', # Latin A -> Cyrillic А
48
- 'P': 'Р', # Latin P -> Cyrillic Р
49
  }
50
 
51
- # Regex for steel grades: digits + letters (Latin or Cyrillic), possibly mixed
52
- pattern = r'\b\d{1,3}[XHTCBKMAPХНТСКМАР]{1,6}\d{0,2}[XHTCBKMAPХНТСКМАР]{0,6}\b'
 
 
 
53
 
54
  def replace_in_steel_grade(match):
55
  nonlocal changes_count, changes_list
56
- grade = match.group(0)
57
- original_grade = grade
58
- # Replace only Latin letters, leave Cyrillic as is
59
- for latin, cyrillic in replacements.items():
60
- grade = re.sub(latin, cyrillic, grade)
61
- if grade != original_grade:
62
  changes_count += 1
63
- changes_list.append(f"{original_grade} → {grade}")
64
- return grade
65
-
66
- text = re.sub(pattern, replace_in_steel_grade, text)
67
 
68
- # Pattern for welding wire designations (CB-..., СВ-..., etc.)
69
- wire_pattern = r'\b[CSС]B-\d{1,3}[XHTCBKMAPХНТСКМАР]{1,6}\d{0,2}[XHTCBKMAPХНТСКМАР]{0,6}\b'
70
- text = re.sub(wire_pattern, replace_in_steel_grade, text)
71
 
72
- # Pattern for welding consumables (C-..., С-...)
73
- consumable_pattern = r'\b[CSС]-\d{1,2}\b'
74
- text = re.sub(consumable_pattern, replace_in_steel_grade, text)
75
 
76
- return text, changes_count, changes_list
77
 
78
 
79
  def chunk_text_documents(documents):
 
22
 
23
  return text
24
 
25
+ import re
26
+
27
  def normalize_steel_designations(text):
28
  """
29
+ Normalize steel designations by converting Latin letters to Cyrillic.
30
+ Handles patterns like 08X18H10T, 12X18H9, 10H17N13M2T, etc.
31
  Returns: (normalized_text, changes_count, changes_list)
32
  """
33
  if not text:
34
  return text, 0, []
35
 
 
 
36
  changes_count = 0
37
  changes_list = []
38
 
39
  # Mapping of Latin to Cyrillic for steel designations
40
  replacements = {
41
+ 'X': 'Х',
42
+ 'H': 'Н',
43
+ 'T': 'Т',
44
+ 'C': 'С',
45
+ 'B': 'В',
46
+ 'K': 'К',
47
+ 'M': 'М',
48
+ 'A': 'А',
49
+ 'P': 'Р',
50
  }
51
 
52
+ # Regex to match steel designations like 08X18H10T, 10H17N13M2T, etc.
53
+ # Explanation:
54
+ # \b\d{1,3} — starts with 1–3 digits
55
+ # (?:[A-ZА-Я]\d*)+ — then one or more groups of a letter + optional digits
56
+ pattern = r'\b\d{1,3}(?:[A-ZА-Я]\d*)+\b'
57
 
58
  def replace_in_steel_grade(match):
59
  nonlocal changes_count, changes_list
60
+ original = match.group(0)
61
+ converted = ''.join(replacements.get(ch, ch) for ch in original)
62
+ if converted != original:
 
 
 
63
  changes_count += 1
64
+ changes_list.append(f"{original} → {converted}")
65
+ return converted
 
 
66
 
67
+ normalized_text = re.sub(pattern, replace_in_steel_grade, text)
 
 
68
 
69
+ return normalized_text, changes_count, changes_list
 
 
70
 
 
71
 
72
 
73
  def chunk_text_documents(documents):