Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 15, 2025

Commit

8362ae9

1 Parent(s): 035fbdc

Added new loggers for normalizations

Browse files

Files changed (1) hide show

documents_prep.py +25 -31

documents_prep.py CHANGED Viewed

@@ -22,58 +22,52 @@ def normalize_text(text):
     return text
 def normalize_steel_designations(text):
     """
-    Convert Latin letters to Cyrillic in steel designations, including mixed patterns.
     Returns: (normalized_text, changes_count, changes_list)
     """
     if not text:
         return text, 0, []
-    import re
     changes_count = 0
     changes_list = []
     # Mapping of Latin to Cyrillic for steel designations
     replacements = {
-        'X': 'Х',  # Latin X -> Cyrillic Х
-        'H': 'Н',  # Latin H -> Cyrillic Н
-        'T': 'Т',  # Latin T -> Cyrillic Т
-        'C': 'С',  # Latin C -> Cyrillic С
-        'B': 'В',  # Latin B -> Cyrillic В
-        'K': 'К',  # Latin K -> Cyrillic К
-        'M': 'М',  # Latin M -> Cyrillic М
-        'A': 'А',  # Latin A -> Cyrillic А
-        'P': 'Р',  # Latin P -> Cyrillic Р
     }
-    # Regex for steel grades: digits + letters (Latin or Cyrillic), possibly mixed
-    pattern = r'\b\d{1,3}[XHTCBKMAPХНТСКМАР]{1,6}\d{0,2}[XHTCBKMAPХНТСКМАР]{0,6}\b'
     def replace_in_steel_grade(match):
         nonlocal changes_count, changes_list
-        grade = match.group(0)
-        original_grade = grade
-        # Replace only Latin letters, leave Cyrillic as is
-        for latin, cyrillic in replacements.items():
-            grade = re.sub(latin, cyrillic, grade)
-        if grade != original_grade:
             changes_count += 1
-            changes_list.append(f"{original_grade} → {grade}")
-        return grade
-    text = re.sub(pattern, replace_in_steel_grade, text)
-    # Pattern for welding wire designations (CB-..., СВ-..., etc.)
-    wire_pattern = r'\b[CSС]B-\d{1,3}[XHTCBKMAPХНТСКМАР]{1,6}\d{0,2}[XHTCBKMAPХНТСКМАР]{0,6}\b'
-    text = re.sub(wire_pattern, replace_in_steel_grade, text)
-    # Pattern for welding consumables (C-..., С-...)
-    consumable_pattern = r'\b[CSС]-\d{1,2}\b'
-    text = re.sub(consumable_pattern, replace_in_steel_grade, text)
-    return text, changes_count, changes_list
 def chunk_text_documents(documents):

     return text
+import re
 def normalize_steel_designations(text):
     """
+    Normalize steel designations by converting Latin letters to Cyrillic.
+    Handles patterns like 08X18H10T, 12X18H9, 10H17N13M2T, etc.
     Returns: (normalized_text, changes_count, changes_list)
     """
     if not text:
         return text, 0, []
     changes_count = 0
     changes_list = []
     # Mapping of Latin to Cyrillic for steel designations
     replacements = {
+        'X': 'Х',
+        'H': 'Н',
+        'T': 'Т',
+        'C': 'С',
+        'B': 'В',
+        'K': 'К',
+        'M': 'М',
+        'A': 'А',
+        'P': 'Р',
     }
+    # Regex to match steel designations like 08X18H10T, 10H17N13M2T, etc.
+    # Explanation:
+    # \b\d{1,3} — starts with 1–3 digits
+    # (?:[A-ZА-Я]\d*)+ — then one or more groups of a letter + optional digits
+    pattern = r'\b\d{1,3}(?:[A-ZА-Я]\d*)+\b'
     def replace_in_steel_grade(match):
         nonlocal changes_count, changes_list
+        original = match.group(0)
+        converted = ''.join(replacements.get(ch, ch) for ch in original)
+        if converted != original:
             changes_count += 1
+            changes_list.append(f"{original} → {converted}")
+        return converted
+    normalized_text = re.sub(pattern, replace_in_steel_grade, text)
+    return normalized_text, changes_count, changes_list
 def chunk_text_documents(documents):