Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 15, 2025

Commit

035fbdc

1 Parent(s): 0f89c6b

Added new loggers for normalizations

Browse files

Files changed (1) hide show

documents_prep.py +35 -39

documents_prep.py CHANGED Viewed

@@ -24,60 +24,56 @@ def normalize_text(text):
 def normalize_steel_designations(text):
     """
     Convert Latin look-alike letters to Cyrillic in steel designations.

     Only tokens matching one of three word-delimited patterns are touched,
     so ordinary Latin text is left alone:
       * welding wire designations: C or S, then B, a hyphen and a grade
         (e.g. CB-08X);
       * steel grades: 1-3 digits, 1-4 mapped Latin letters, up to 2 digits
         and up to 4 more mapped Latin letters (e.g. 08X13);
       * welding consumables: C, a hyphen and 1-2 digits (e.g. C-12).

     Returns: (normalized_text, changes_count, changes_list)
         normalized_text -- ``text`` with the designations rewritten; a falsy
             ``text`` (None or "") is returned unchanged.
         changes_count -- number of matched tokens that were actually altered.
         changes_list -- one "original → normalized" string per altered token.
     """
     if not text:
         return text, 0, []
     import re
     changes_count = 0
     changes_list = []
     # Mapping of Latin to Cyrillic for steel designations
     replacements = {
         'X': 'Х',  # Latin X -> Cyrillic Х (Kha)
         'H': 'Н',  # Latin H -> Cyrillic Н (En)
         'T': 'Т',  # Latin T -> Cyrillic Т (Te)
         'C': 'С',  # Latin C -> Cyrillic С (Es)
         'B': 'В',  # Latin B -> Cyrillic В (Ve)
         'K': 'К',  # Latin K -> Cyrillic К (Ka)
         'M': 'М',  # Latin M -> Cyrillic М (Em)
         'A': 'А',  # Latin A -> Cyrillic А (A)
         'P': 'Р',  # Latin P -> Cyrillic Р (Er)
     }
     # One translation table replaces the chain of str.replace calls.
     to_cyrillic = str.maketrans(replacements)
     def replace_in_steel_grade(match):
         """Rewrite one matched designation and record the change, if any."""
         nonlocal changes_count
         original_grade = match.group(0)
         grade = original_grade.translate(to_cyrillic)
         if grade != original_grade:
             changes_count += 1
             changes_list.append(f"{original_grade} → {grade}")
         return grade
     grade_body = r'\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}'
     # Welding wire designations must be handled BEFORE the generic grade
     # pass: the wire pattern only accepts Latin letters, so once the grade
     # pass has rewritten the suffix to Cyrillic the prefix could never match.
     text = re.sub(r'\b[CS]B-' + grade_body + r'\b',
                   replace_in_steel_grade, text)
     # Plain steel grades
     text = re.sub(r'\b' + grade_body + r'\b',
                   replace_in_steel_grade, text)
     # Welding consumables; routed through the shared callback so the change
     # is counted (int.__add__ in a lambda returns a new int and was discarded).
     text = re.sub(r'\b[C]-\d{1,2}\b',
                   replace_in_steel_grade, text)
     return text, changes_count, changes_list
 def chunk_text_documents(documents):

 def normalize_steel_designations(text):
     """
     Convert Latin look-alike letters to Cyrillic in steel designations.

     Three kinds of token are recognised, each delimited by word boundaries:
       * welding wire designations: C or S, then B (Latin or Cyrillic), a
         hyphen and a steel grade (e.g. CB-08X);
       * steel grades: 1-3 digits followed by alternating letter/digit groups
         (e.g. 12X18H10T), written in Latin, Cyrillic or a mix of both;
       * welding consumables: C or S, a hyphen and 1-2 digits (e.g. C-12).

     Only the Latin letters listed in ``replacements`` are changed. Cyrillic
     letters, digits, hyphens and Latin ``S`` (which has no mapping) are kept,
     and text outside the recognised tokens is never modified.

     Returns: (normalized_text, changes_count, changes_list)
         normalized_text -- ``text`` with the designations rewritten; a falsy
             ``text`` (None or "") is returned unchanged.
         changes_count -- number of designations that were actually altered.
         changes_list -- one "original → normalized" string per altered
             designation, in order of appearance.
     """
     if not text:
         return text, 0, []
     import re
     # Mapping of Latin to Cyrillic for steel designations
     replacements = {
         'X': 'Х',  # Latin X -> Cyrillic Х
         'H': 'Н',  # Latin H -> Cyrillic Н
         'T': 'Т',  # Latin T -> Cyrillic Т
         'C': 'С',  # Latin C -> Cyrillic С
         'B': 'В',  # Latin B -> Cyrillic В
         'K': 'К',  # Latin K -> Cyrillic К
         'M': 'М',  # Latin M -> Cyrillic М
         'A': 'А',  # Latin A -> Cyrillic А
         'P': 'Р',  # Latin P -> Cyrillic Р
     }
     to_cyrillic = str.maketrans(replacements)
     # Build the character classes from the mapping itself so the Latin and
     # Cyrillic alphabets cannot drift apart (a hand-typed class is easy to
     # get wrong because the glyphs are visually identical).
     letters = ''.join(replacements) + ''.join(replacements.values())
     cyr_es = replacements['C']
     cyr_ve = replacements['B']
     # Grade body: digits, letters, then any number of digit+letter groups and
     # optional trailing digits. Every repeated group needs at least one digit
     # and one letter, so the repetition cannot match empty.
     grade = (rf'\d{{1,3}}[{letters}]{{1,12}}'
              rf'(?:\d{{1,2}}[{letters}]{{1,6}})*\d{{0,2}}')
     wire_prefix = rf'[CS{cyr_es}][B{cyr_ve}]-'
     consumable = rf'[CS{cyr_es}]-\d{{1,2}}'
     # A single pass with the wire alternative first: a wire designation is
     # rewritten, and reported, once as a whole instead of once for its grade
     # part and again for its prefix.
     pattern = re.compile(rf'\b(?:{wire_prefix}{grade}|{grade}|{consumable})\b')
     changes_list = []
     def replace_designation(match):
         """Rewrite one matched designation and record the change, if any."""
         original = match.group(0)
         # translate() touches only the mapped Latin letters
         normalized = original.translate(to_cyrillic)
         if normalized != original:
             changes_list.append(f"{original} → {normalized}")
         return normalized
     text = pattern.sub(replace_designation, text)
     # Every recorded entry corresponds to exactly one altered designation.
     return text, len(changes_list), changes_list
 def chunk_text_documents(documents):