MrSimple01 committed on
Commit
2409907
·
verified ·
1 Parent(s): 2f54f79

Update documents_prep.py

Browse files
Files changed (1) hide show
  1. documents_prep.py +34 -45
documents_prep.py CHANGED
@@ -22,61 +22,50 @@ def normalize_text(text):
22
 
23
  return text
24
 
 
 
25
  def normalize_steel_designations(text):
26
  """
27
- Convert Latin letters to Cyrillic in steel designations.
28
- Only applies to specific patterns to avoid changing legitimate Latin text.
29
  Returns: (normalized_text, changes_count, changes_list)
30
  """
31
  if not text:
32
  return text, 0, []
33
-
34
- import re
35
-
36
  changes_count = 0
37
- changes_list = [] # NEW: Track what changed
38
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  def replace_in_steel_grade(match):
40
- """Replace Latin with Cyrillic only in steel grade context"""
41
  nonlocal changes_count, changes_list
42
- grade = match.group(0)
43
- original_grade = grade
44
-
45
- # Mapping of Latin to Cyrillic for steel designations
46
- replacements = {
47
- 'X': 'Х', # Latin X -> Cyrillic Х (Kha)
48
- 'H': 'Н', # Latin H -> Cyrillic Н (En)
49
- 'T': 'Т', # Latin T -> Cyrillic Т (Te)
50
- 'C': 'С', # Latin C -> Cyrillic С (Es)
51
- 'B': 'В', # Latin B -> Cyrillic В (Ve)
52
- 'K': 'К', # Latin K -> Cyrillic К (Ka)
53
- 'M': 'М', # Latin M -> Cyrillic М (Em)
54
- 'A': 'А', # Latin A -> Cyrillic А (A)
55
- 'P': 'Р', # Latin P -> Cyrillic Р (Er)
56
- }
57
- for latin, cyrillic in replacements.items():
58
- grade = grade.replace(latin, cyrillic)
59
-
60
- if grade != original_grade:
61
  changes_count += 1
62
- changes_list.append(f"{original_grade} → {grade}") # NEW: Record change
63
-
64
- return grade
65
-
66
- # Pattern for steel grades
67
- text = re.sub(r'\b\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
68
- replace_in_steel_grade, text)
69
-
70
- # Pattern 2: Welding wire designations
71
- text = re.sub(r'\b[CS]B-\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
72
- replace_in_steel_grade, text)
73
-
74
- # Pattern 3: Welding consumables
75
- text = re.sub(r'\b[C]-\d{1,2}\b',
76
- lambda m: (changes_list.append(f"{m.group(0)} → {m.group(0).replace('C', 'С')}") or changes_count.__add__(1), m.group(0).replace('C', 'С'))[1] if m.group(0) != m.group(0).replace('C', 'С') else m.group(0),
77
- text)
78
-
79
- return text, changes_count, changes_list
80
 
81
 
82
 
 
22
 
23
  return text
24
 
25
+ import re
26
+
27
  def normalize_steel_designations(text):
28
  """
29
+ Normalize steel designations by converting Latin letters to Cyrillic.
30
+ Handles patterns like 08X18H10T, 12X18H9, 10H17N13M2T, etc.
31
  Returns: (normalized_text, changes_count, changes_list)
32
  """
33
  if not text:
34
  return text, 0, []
35
+
 
 
36
  changes_count = 0
37
+ changes_list = []
38
+
39
+ # Mapping of Latin to Cyrillic for steel designations
40
+ replacements = {
41
+ 'X': 'Х',
42
+ 'H': 'Н',
43
+ 'T': 'Т',
44
+ 'C': 'С',
45
+ 'B': 'В',
46
+ 'K': 'К',
47
+ 'M': 'М',
48
+ 'A': 'А',
49
+ 'P': 'Р',
50
+ }
51
+
52
+ # Regex to match steel designations like 08X18H10T, 10H17N13M2T, etc.
53
+ # \b\d{1,3} — starts with 1–3 digits
54
+ # (?:[A-ZА-Я]\d*)+ — then one or more groups of a letter + optional digits
55
+ pattern = r'\b\d{1,3}(?:[A-ZА-Я]\d*)+\b'
56
+
57
  def replace_in_steel_grade(match):
 
58
  nonlocal changes_count, changes_list
59
+ original = match.group(0)
60
+ converted = ''.join(replacements.get(ch, ch) for ch in original)
61
+ if converted != original:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  changes_count += 1
63
+ changes_list.append(f"{original} → {converted}")
64
+ return converted
65
+
66
+ normalized_text = re.sub(pattern, replace_in_steel_grade, text)
67
+
68
+ return normalized_text, changes_count, changes_list
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
 
71