Spaces:
Paused
Paused
Update documents_prep.py
Browse files- documents_prep.py +34 -45
documents_prep.py
CHANGED
|
@@ -22,61 +22,50 @@ def normalize_text(text):
|
|
| 22 |
|
| 23 |
return text
|
| 24 |
|
|
|
|
|
|
|
| 25 |
def normalize_steel_designations(text):
|
| 26 |
"""
|
| 27 |
-
|
| 28 |
-
|
| 29 |
Returns: (normalized_text, changes_count, changes_list)
|
| 30 |
"""
|
| 31 |
if not text:
|
| 32 |
return text, 0, []
|
| 33 |
-
|
| 34 |
-
import re
|
| 35 |
-
|
| 36 |
changes_count = 0
|
| 37 |
-
changes_list = []
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
def replace_in_steel_grade(match):
|
| 40 |
-
"""Replace Latin with Cyrillic only in steel grade context"""
|
| 41 |
nonlocal changes_count, changes_list
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
# Mapping of Latin to Cyrillic for steel designations
|
| 46 |
-
replacements = {
|
| 47 |
-
'X': 'Х', # Latin X -> Cyrillic Х (Kha)
|
| 48 |
-
'H': 'Н', # Latin H -> Cyrillic Н (En)
|
| 49 |
-
'T': 'Т', # Latin T -> Cyrillic Т (Te)
|
| 50 |
-
'C': 'С', # Latin C -> Cyrillic С (Es)
|
| 51 |
-
'B': 'В', # Latin B -> Cyrillic В (Ve)
|
| 52 |
-
'K': 'К', # Latin K -> Cyrillic К (Ka)
|
| 53 |
-
'M': 'М', # Latin M -> Cyrillic М (Em)
|
| 54 |
-
'A': 'А', # Latin A -> Cyrillic А (A)
|
| 55 |
-
'P': 'Р', # Latin P -> Cyrillic Р (Er)
|
| 56 |
-
}
|
| 57 |
-
for latin, cyrillic in replacements.items():
|
| 58 |
-
grade = grade.replace(latin, cyrillic)
|
| 59 |
-
|
| 60 |
-
if grade != original_grade:
|
| 61 |
changes_count += 1
|
| 62 |
-
changes_list.append(f"{
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
replace_in_steel_grade, text)
|
| 69 |
-
|
| 70 |
-
# Pattern 2: Welding wire designations
|
| 71 |
-
text = re.sub(r'\b[CS]B-\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
|
| 72 |
-
replace_in_steel_grade, text)
|
| 73 |
-
|
| 74 |
-
# Pattern 3: Welding consumables
|
| 75 |
-
text = re.sub(r'\b[C]-\d{1,2}\b',
|
| 76 |
-
lambda m: (changes_list.append(f"{m.group(0)} → {m.group(0).replace('C', 'С')}") or changes_count.__add__(1), m.group(0).replace('C', 'С'))[1] if m.group(0) != m.group(0).replace('C', 'С') else m.group(0),
|
| 77 |
-
text)
|
| 78 |
-
|
| 79 |
-
return text, changes_count, changes_list
|
| 80 |
|
| 81 |
|
| 82 |
|
|
|
|
| 22 |
|
| 23 |
return text
|
| 24 |
|
| 25 |
+
import re
|
| 26 |
+
|
| 27 |
def normalize_steel_designations(text):
    """
    Replace Latin look-alike letters with Cyrillic ones in steel designations.

    A designation is any token made of 1-3 leading digits followed by one or
    more groups of "one uppercase Latin/Cyrillic letter + optional digits",
    e.g. ``08X18H10T`` or ``12X18H9``. Inside such a token the Latin letters
    X, H, T, C, B, K, M, A, P are swapped for their visually identical
    Cyrillic counterparts.

    Caveats:
      * Latin letters without a Cyrillic look-alike in the table (``N``,
        ``V``, ...) are left as they are, so a token such as ``10H17N13M2T``
        is only partially converted.
      * The pattern is purely structural: any digit-led token such as
        ``10A`` matches and is converted, not only real steel grades.

    Args:
        text: Input string. Falsy values (``None``, ``""``) are returned
            unchanged.

    Returns:
        (normalized_text, changes_count, changes_list), where changes_list
        holds one ``"<original> → <converted>"`` entry per modified token and
        changes_count is its length.
    """
    if not text:
        return text, 0, []

    # Homoglyph table. The Cyrillic side is written as explicit code points
    # because the glyphs are indistinguishable from Latin ones in most fonts.
    latin_to_cyrillic = str.maketrans({
        'X': '\u0425',  # Cyrillic Kha
        'H': '\u041d',  # Cyrillic En
        'T': '\u0422',  # Cyrillic Te
        'C': '\u0421',  # Cyrillic Es
        'B': '\u0412',  # Cyrillic Ve
        'K': '\u041a',  # Cyrillic Ka
        'M': '\u041c',  # Cyrillic Em
        'A': '\u0410',  # Cyrillic A
        'P': '\u0420',  # Cyrillic Er
    })

    # \b\d{1,3}                    - starts with 1-3 digits at a word boundary
    # (?:[A-Z\u0410-\u042F]\d*)+   - one or more "letter + optional digits"
    #                                groups (Latin A-Z or Cyrillic U+0410-U+042F)
    grade_pattern = r'\b\d{1,3}(?:[A-Z\u0410-\u042F]\d*)+\b'

    changes_list = []

    def replace_in_steel_grade(match):
        """Convert one matched token and log it if anything changed."""
        original = match.group(0)
        converted = original.translate(latin_to_cyrillic)
        if converted != original:
            changes_list.append(f"{original} → {converted}")
        # Always return the token: re.sub uses this value as the replacement.
        return converted

    normalized_text = re.sub(grade_pattern, replace_in_steel_grade, text)

    # Exactly one log entry is added per modified token, so the count is the
    # length of the log.
    return normalized_text, len(changes_list), changes_list
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
|
| 71 |
|