Spaces:
Sleeping
Sleeping
Commit
·
035fbdc
1
Parent(s):
0f89c6b
Added new loggers for normalizations
Browse files- documents_prep.py +35 -39
documents_prep.py
CHANGED
|
@@ -24,60 +24,56 @@ def normalize_text(text):
|
|
| 24 |
|
| 25 |
def normalize_steel_designations(text):
|
| 26 |
"""
|
| 27 |
-
Convert Latin letters to Cyrillic in steel designations.
|
| 28 |
-
Only applies to specific patterns to avoid changing legitimate Latin text.
|
| 29 |
Returns: (normalized_text, changes_count, changes_list)
|
| 30 |
"""
|
| 31 |
if not text:
|
| 32 |
return text, 0, []
|
| 33 |
-
|
| 34 |
import re
|
| 35 |
-
|
| 36 |
changes_count = 0
|
| 37 |
-
changes_list = []
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
def replace_in_steel_grade(match):
|
| 40 |
-
"""Replace Latin with Cyrillic only in steel grade context"""
|
| 41 |
nonlocal changes_count, changes_list
|
| 42 |
grade = match.group(0)
|
| 43 |
original_grade = grade
|
| 44 |
-
|
| 45 |
-
# Mapping of Latin to Cyrillic for steel designations
|
| 46 |
-
replacements = {
|
| 47 |
-
'X': 'Х', # Latin X -> Cyrillic Х (Kha)
|
| 48 |
-
'H': 'Н', # Latin H -> Cyrillic Н (En)
|
| 49 |
-
'T': 'Т', # Latin T -> Cyrillic Т (Te)
|
| 50 |
-
'C': 'С', # Latin C -> Cyrillic С (Es)
|
| 51 |
-
'B': 'В', # Latin B -> Cyrillic В (Ve)
|
| 52 |
-
'K': 'К', # Latin K -> Cyrillic К (Ka)
|
| 53 |
-
'M': 'М', # Latin M -> Cyrillic М (Em)
|
| 54 |
-
'A': 'А', # Latin A -> Cyrillic А (A)
|
| 55 |
-
'P': 'Р', # Latin P -> Cyrillic Р (Er)
|
| 56 |
-
}
|
| 57 |
for latin, cyrillic in replacements.items():
|
| 58 |
-
grade =
|
| 59 |
-
|
| 60 |
if grade != original_grade:
|
| 61 |
changes_count += 1
|
| 62 |
-
changes_list.append(f"{original_grade} → {grade}")
|
| 63 |
-
|
| 64 |
return grade
|
| 65 |
-
|
| 66 |
-
# Pattern for steel grades
|
| 67 |
-
text = re.sub(r'\b\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
|
| 68 |
-
replace_in_steel_grade, text)
|
| 69 |
-
|
| 70 |
-
# Pattern 2: Welding wire designations
|
| 71 |
-
text = re.sub(r'\b[CS]B-\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
|
| 72 |
-
replace_in_steel_grade, text)
|
| 73 |
-
|
| 74 |
-
# Pattern 3: Welding consumables
|
| 75 |
-
text = re.sub(r'\b[C]-\d{1,2}\b',
|
| 76 |
-
lambda m: (changes_list.append(f"{m.group(0)} → {m.group(0).replace('C', 'С')}") or changes_count.__add__(1), m.group(0).replace('C', 'С'))[1] if m.group(0) != m.group(0).replace('C', 'С') else m.group(0),
|
| 77 |
-
text)
|
| 78 |
-
|
| 79 |
-
return text, changes_count, changes_list
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
|
| 83 |
def chunk_text_documents(documents):
|
|
|
|
| 24 |
|
| 25 |
def normalize_steel_designations(text):
    """
    Convert Latin look-alike letters to Cyrillic in steel designations.

    Three kinds of tokens are normalized, in this order:
      1. steel grades: 1-3 digits, 1-6 letters, then optionally 0-2 digits
         and 0-6 more letters (e.g. "40X", "5XB2C");
      2. welding wire designations: a two-letter "CB-" style prefix followed
         by a steel grade;
      3. welding consumables: "C-" followed by 1-2 digits.

    Tokens may be written in Latin, Cyrillic or a mix of both; only the Latin
    letters listed in the mapping below are replaced, everything else in the
    token is kept as is.

    Args:
        text: Input string. A falsy value (None or "") is returned unchanged.

    Returns: (normalized_text, changes_count, changes_list)
        changes_list holds one "original → normalized" entry per token that
        was actually modified; changes_count is its length.
    """
    if not text:
        return text, 0, []

    import re

    # Latin -> Cyrillic homoglyphs. The Cyrillic side is spelled with escapes
    # because the two alphabets are visually indistinguishable in most fonts.
    replacements = {
        'X': '\u0425',  # Latin X -> Cyrillic Kha
        'H': '\u041d',  # Latin H -> Cyrillic En
        'T': '\u0422',  # Latin T -> Cyrillic Te
        'C': '\u0421',  # Latin C -> Cyrillic Es
        'B': '\u0412',  # Latin B -> Cyrillic Ve
        'K': '\u041a',  # Latin K -> Cyrillic Ka
        'M': '\u041c',  # Latin M -> Cyrillic Em
        'A': '\u0410',  # Latin A -> Cyrillic A
        'P': '\u0420',  # Latin P -> Cyrillic Er
    }
    to_cyrillic = str.maketrans(replacements)

    # Build the letter class from the mapping itself so that every Latin
    # letter AND every Cyrillic counterpart is accepted. A hand-written class
    # that misses one Cyrillic letter (e.g. Ve) silently stops matching mixed
    # grades and grades already normalized by an earlier pass.
    letters = "[" + "".join(replacements) + "".join(replacements.values()) + "]"

    # NOTE(review): grades with three or more letter groups (e.g. 12X18H10T)
    # do not match this shape; extend it deliberately if they must be covered.
    grade_body = r"\d{1,3}" + letters + r"{1,6}\d{0,2}" + letters + r"{0,6}"

    patterns = (
        # Steel grades.
        r"\b" + grade_body + r"\b",
        # Welding wire: first letter Latin C/S or Cyrillic Es, second letter
        # Latin B or Cyrillic Ve, then "-" and a grade.
        r"\b[CS\u0421][B\u0412]-" + grade_body + r"\b",
        # Welding consumables: Latin C/S or Cyrillic Es, "-", 1-2 digits.
        r"\b[CS\u0421]-\d{1,2}\b",
    )

    changes_list = []

    def replace_in_steel_grade(match):
        # Translate the whole token in one pass and log it only if it changed.
        original_grade = match.group(0)
        grade = original_grade.translate(to_cyrillic)
        if grade != original_grade:
            changes_list.append(f"{original_grade} → {grade}")
        return grade

    # Order matters: the wire pass runs on text whose grade part has already
    # been converted to Cyrillic by the first pass.
    for regex in patterns:
        text = re.sub(regex, replace_in_steel_grade, text)

    return text, len(changes_list), changes_list
|
| 77 |
|
| 78 |
|
| 79 |
def chunk_text_documents(documents):
|