Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 15, 2025

Commit

5263b61

1 Parent(s): 9ce9909

Added new loggers for normalizations

Browse files

Files changed (1) hide show

documents_prep.py +39 -40

documents_prep.py CHANGED Viewed

@@ -25,57 +25,49 @@ def normalize_text(text):
 def normalize_steel_designations(text):
     """
     Convert Latin letters to Cyrillic in steel designations.
-    Only applies to specific patterns to avoid changing legitimate Latin text.
-    Returns: (normalized_text, changes_count)
     """
     if not text:
-        return text, 0
     import re
     changes_count = 0
     def replace_in_steel_grade(match):
-        """Replace Latin with Cyrillic only in steel grade context"""
-        nonlocal changes_count
         grade = match.group(0)
         original_grade = grade
-        # Mapping of Latin to Cyrillic for steel designations
         replacements = {
-            'X': 'Х',  # Latin X -> Cyrillic Х (Kha)
-            'H': 'Н',  # Latin H -> Cyrillic Н (En)
-            'T': 'Т',  # Latin T -> Cyrillic Т (Te)
-            'C': 'С',  # Latin C -> Cyrillic С (Es)
-            'B': 'В',  # Latin B -> Cyrillic В (Ve)
-            'K': 'К',  # Latin K -> Cyrillic К (Ka)
-            'M': 'М',  # Latin M -> Cyrillic М (Em)
-            'A': 'А',  # Latin A -> Cyrillic А (A)
-            'P': 'Р',  # Latin P -> Cyrillic Р (Er)
         }
         for latin, cyrillic in replacements.items():
             grade = grade.replace(latin, cyrillic)
         if grade != original_grade:
             changes_count += 1
         return grade
-    # Pattern for steel grades: digits followed by letters and more digits/letters
-    # Examples: 08X18H10T, 12X18H10T, 20X13, etc.
-    text = re.sub(r'\b\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
                   replace_in_steel_grade, text)
-    # Pattern 2: Welding wire designations like CB-08X19H10, CB-10XH25T
     text = re.sub(r'\b[CS]B-\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
                   replace_in_steel_grade, text)
-    # Pattern 3: Welding consumables like C-25, C-26 (but be careful not to change section refs)
-    # Only replace if followed by dash and digits
-    text = re.sub(r'\b[C]-\d{1,2}\b',
-                  lambda m: m.group(0).replace('C', 'С'), text)
-    return text, changes_count
@@ -174,32 +166,39 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
     log_message(f"  📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
-    # Normalize all row content (including steel designations)
     normalized_rows = []
     total_row_changes = 0
     rows_with_changes = 0
     for row in rows:
         if isinstance(row, dict):
             normalized_row = {}
             row_had_changes = False
             for k, v in row.items():
-                normalized_val, changes = normalize_steel_designations(str(v))
                 normalized_row[k] = normalized_val
                 if changes > 0:
                     total_row_changes += changes
                     row_had_changes = True
             if row_had_changes:
                 rows_with_changes += 1
             normalized_rows.append(normalized_row)
         else:
             normalized_rows.append(row)
     # Log normalization stats for this table
     if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
         log_message(f"    Steel normalization: title={title_changes}, section={section_changes}, "
-                   f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
     # Continue with rest of existing logic using normalized_rows...
     # Calculate base metadata size
     base_content = format_table_header(doc_id, table_identifier, table_num,

 def normalize_steel_designations(text):
     """
     Convert Latin look-alike letters to Cyrillic in steel designations.

     Only tokens that look like steel grades are touched, so ordinary Latin
     text is left alone. Three shapes are recognised:

     * bare grades: 1-3 digits followed by one to five (letters, optional
       digits) groups, e.g. 20X13, 08X18H10T, 10X17H13M2T;
     * the same grades with a welding-wire prefix, e.g. CB-08X19H10;
     * short consumable codes of the form C-<1-2 digits>, e.g. C-25.

     Args:
         text: Input string. Falsy values (None, "") are returned unchanged.

     Returns: (normalized_text, changes_count, normalized_words)
         normalized_text  - text with matching designations converted;
         changes_count    - number of designations that were rewritten
                            (always equal to len(normalized_words));
         normalized_words - list of (original, normalized) pairs in the
                            order they were rewritten.
     """
     if not text:
         return text, 0, []
     import re

     # Homoglyph targets are written as escapes because the Cyrillic letters
     # are visually identical to the Latin keys in most fonts.
     replacements = {
         'X': '\u0425',  # Cyrillic Kha
         'H': '\u041d',  # Cyrillic En
         'T': '\u0422',  # Cyrillic Te
         'C': '\u0421',  # Cyrillic Es
         'B': '\u0412',  # Cyrillic Ve
         'K': '\u041a',  # Cyrillic Ka
         'M': '\u041c',  # Cyrillic Em
         'A': '\u0410',  # Cyrillic A
         'P': '\u0420',  # Cyrillic Er
     }
     translation = str.maketrans(replacements)
     normalized_words = []

     def replace_in_steel_grade(match):
         original_grade = match.group(0)
         grade = original_grade.translate(translation)
         if grade != original_grade:
             normalized_words.append((original_grade, grade))
         return grade

     # The optional prefix is matched in the same pass as the grade itself:
     # if the bare grade were converted first, its letters would already be
     # Cyrillic and a separate prefixed pattern could never match.
     # The (letters, digits) group repeats so multi-element grades such as
     # 08X18H10T match; the repeat is bounded to keep backtracking cheap.
     # NOTE(review): 'S' has no Cyrillic mapping, so an "SB-" prefix keeps its
     # Latin S - confirm whether that is intended.
     text = re.sub(
         r'\b(?:[CS]B-)?\d{1,3}(?:[XHTCBKMAP]{1,4}\d{0,2}){1,5}\b',
         replace_in_steel_grade,
         text,
     )
     # Consumable codes like C-25; routed through the same callback so they
     # are counted and reported like every other rewrite.
     text = re.sub(r'\bC-\d{1,2}\b', replace_in_steel_grade, text)
     return text, len(normalized_words), normalized_words
     log_message(f"  📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
     normalized_rows = []
     total_row_changes = 0
     rows_with_changes = 0
+    all_normalized_words = []
     for row in rows:
         if isinstance(row, dict):
             normalized_row = {}
             row_had_changes = False
+            row_words = []
             for k, v in row.items():
+                normalized_val, changes, norm_words = normalize_steel_designations(str(v))
                 normalized_row[k] = normalized_val
                 if changes > 0:
                     total_row_changes += changes
                     row_had_changes = True
+                    row_words.extend(norm_words)
             if row_had_changes:
                 rows_with_changes += 1
+                all_normalized_words.extend(row_words)
             normalized_rows.append(normalized_row)
         else:
             normalized_rows.append(row)
     # Log normalization stats for this table
     if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
         log_message(f"    Steel normalization: title={title_changes}, section={section_changes}, "
+                    f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
+        if all_normalized_words:
+            log_message("    Normalized words:")
+            for orig, norm in all_normalized_words:
+                log_message(f"      {orig} → {norm}")
     # Continue with rest of existing logic using normalized_rows...
     # Calculate base metadata size
     base_content = format_table_header(doc_id, table_identifier, table_num,