Spaces:
Sleeping
Sleeping
Commit
·
8362ae9
1
Parent(s):
035fbdc
Added new loggers for normalizations
Browse files- documents_prep.py +25 -31
documents_prep.py
CHANGED
|
@@ -22,58 +22,52 @@ def normalize_text(text):
|
|
| 22 |
|
| 23 |
return text
|
| 24 |
|
|
|
|
|
|
|
| 25 |
def normalize_steel_designations(text):
|
| 26 |
"""
|
| 27 |
-
|
|
|
|
| 28 |
Returns: (normalized_text, changes_count, changes_list)
|
| 29 |
"""
|
| 30 |
if not text:
|
| 31 |
return text, 0, []
|
| 32 |
|
| 33 |
-
import re
|
| 34 |
-
|
| 35 |
changes_count = 0
|
| 36 |
changes_list = []
|
| 37 |
|
| 38 |
# Mapping of Latin to Cyrillic for steel designations
|
| 39 |
replacements = {
|
| 40 |
-
'X': 'Х',
|
| 41 |
-
'H': 'Н',
|
| 42 |
-
'T': 'Т',
|
| 43 |
-
'C': 'С',
|
| 44 |
-
'B': 'В',
|
| 45 |
-
'K': 'К',
|
| 46 |
-
'M': 'М',
|
| 47 |
-
'A': 'А',
|
| 48 |
-
'P': 'Р',
|
| 49 |
}
|
| 50 |
|
| 51 |
-
# Regex
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
def replace_in_steel_grade(match):
|
| 55 |
nonlocal changes_count, changes_list
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
for latin, cyrillic in replacements.items():
|
| 60 |
-
grade = re.sub(latin, cyrillic, grade)
|
| 61 |
-
if grade != original_grade:
|
| 62 |
changes_count += 1
|
| 63 |
-
changes_list.append(f"{
|
| 64 |
-
return
|
| 65 |
-
|
| 66 |
-
text = re.sub(pattern, replace_in_steel_grade, text)
|
| 67 |
|
| 68 |
-
|
| 69 |
-
wire_pattern = r'\b[CSС]B-\d{1,3}[XHTCBKMAPХНТСКМАР]{1,6}\d{0,2}[XHTCBKMAPХНТСКМАР]{0,6}\b'
|
| 70 |
-
text = re.sub(wire_pattern, replace_in_steel_grade, text)
|
| 71 |
|
| 72 |
-
|
| 73 |
-
consumable_pattern = r'\b[CSС]-\d{1,2}\b'
|
| 74 |
-
text = re.sub(consumable_pattern, replace_in_steel_grade, text)
|
| 75 |
|
| 76 |
-
return text, changes_count, changes_list
|
| 77 |
|
| 78 |
|
| 79 |
def chunk_text_documents(documents):
|
|
|
|
| 22 |
|
| 23 |
return text
|
| 24 |
|
| 25 |
+
import re
|
| 26 |
+
|
| 27 |
def normalize_steel_designations(text):
    """Normalize steel designations by converting Latin look-alikes to Cyrillic.

    A token is treated as a steel designation when it starts with 1-3 digits
    followed by one or more groups of an uppercase letter plus optional
    digits, e.g. 08X18H10T, 12X18H9, 10X17H13M2T. Every Latin letter of such
    a token that has an entry in the look-alike mapping is replaced by its
    Cyrillic counterpart.

    Tokens that contain an uppercase Latin letter with no entry in the
    mapping (for example "5GB" or "1ST") are left untouched: converting only
    some of their letters would produce a mixed-script token.

    Args:
        text: String to normalize. Falsy values (None, "") are returned as is.

    Returns:
        (normalized_text, changes_count, changes_list), where changes_list
        contains one "original → converted" entry per changed token and
        changes_count is the number of such entries.
    """
    if not text:
        return text, 0, []

    # Mapping of Latin to Cyrillic for steel designations
    replacements = {
        'X': 'Х',
        'H': 'Н',
        'T': 'Т',
        'C': 'С',
        'B': 'В',
        'K': 'К',
        'M': 'М',
        'A': 'А',
        'P': 'Р',
    }
    translation = str.maketrans(replacements)

    # Regex to match steel designations like 08X18H10T, 10X17H13M2T, etc.
    #   \b\d{1,3}            - starts with 1-3 digits
    #   (?:[A-ZА-Я]\d*)+     - then one or more groups of a letter + optional digits
    pattern = r'\b\d{1,3}(?:[A-ZА-Я]\d*)+\b'

    changes_list = []

    def replace_in_steel_grade(match):
        original = match.group(0)
        # A Latin letter without a Cyrillic look-alike means the token cannot
        # be fully normalized; leave it alone instead of half-converting it.
        if any('A' <= ch <= 'Z' and ch not in replacements for ch in original):
            return original
        converted = original.translate(translation)
        if converted != original:
            changes_list.append(f"{original} → {converted}")
        return converted

    normalized_text = re.sub(pattern, replace_in_steel_grade, text)

    return normalized_text, len(changes_list), changes_list
|
|
|
|
|
|
|
| 70 |
|
|
|
|
| 71 |
|
| 72 |
|
| 73 |
def chunk_text_documents(documents):
|