Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 15, 2025

Commit

bd2b030

1 Parent(s): 5263b61

Added new loggers for normalizations

Browse files

Files changed (1) hide show

documents_prep.py +51 -40

documents_prep.py CHANGED Viewed

@@ -25,49 +25,58 @@ def normalize_text(text):
 def normalize_steel_designations(text):
     """
     Convert Latin letters to Cyrillic in steel designations.
-    Returns: (normalized_text, changes_count, normalized_words)
     """
     if not text:
         return text, 0, []
     import re
     changes_count = 0
-    normalized_words = []
     def replace_in_steel_grade(match):
-        nonlocal changes_count, normalized_words
         grade = match.group(0)
         original_grade = grade
         replacements = {
-            'X': 'Х',
-            'H': 'Н',
-            'T': 'Т',
-            'C': 'С',
-            'B': 'В',
-            'K': 'К',
-            'M': 'М',
-            'A': 'А',
-            'P': 'Р',
         }
         for latin, cyrillic in replacements.items():
             grade = grade.replace(latin, cyrillic)
         if grade != original_grade:
             changes_count += 1
-            normalized_words.append((original_grade, grade))
         return grade
-    text = re.sub(r'\b\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
                   replace_in_steel_grade, text)
     text = re.sub(r'\b[CS]B-\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
                   replace_in_steel_grade, text)
-    text = re.sub(r'\b[C]-\d{1,2}\b',
-                  lambda m: (normalized_words.append((m.group(0), m.group(0).replace('C', 'С'))) or m.group(0).replace('C', 'С')), text)
-    return text, changes_count, normalized_words
@@ -128,8 +137,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
     sheet_name = table_data.get('sheet_name', '')
     # Apply steel designation normalization to title and section
-    table_title, title_changes = normalize_steel_designations(str(table_title))
-    section, section_changes = normalize_steel_designations(section)
     table_num_clean = str(table_num).strip()
@@ -166,39 +175,41 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
     log_message(f"  📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
     normalized_rows = []
     total_row_changes = 0
     rows_with_changes = 0
-    all_normalized_words = []
     for row in rows:
         if isinstance(row, dict):
             normalized_row = {}
             row_had_changes = False
-            row_words = []
             for k, v in row.items():
-                normalized_val, changes, norm_words = normalize_steel_designations(str(v))
                 normalized_row[k] = normalized_val
                 if changes > 0:
                     total_row_changes += changes
                     row_had_changes = True
-                    row_words.extend(norm_words)
             if row_had_changes:
                 rows_with_changes += 1
-                all_normalized_words.extend(row_words)
             normalized_rows.append(normalized_row)
         else:
             normalized_rows.append(row)
-    # Log normalization stats for this table
     if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
         log_message(f"    Steel normalization: title={title_changes}, section={section_changes}, "
-                    f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
-        if all_normalized_words:
-            log_message("    Normalized words:")
-            for orig, norm in all_normalized_words:
-                log_message(f"      {orig} → {norm}")
     # Continue with rest of existing logic using normalized_rows...
     # Calculate base metadata size
     base_content = format_table_header(doc_id, table_identifier, table_num,

 def normalize_steel_designations(text):
     """
     Convert Latin look-alike letters to Cyrillic in steel designations.

     Only text matching one of three designation patterns is touched, so
     ordinary Latin words are left alone:
       1. welding wire designations with a CB-/SB- prefix (e.g. "CB-08X"),
       2. steel grades that start with digits (e.g. "40X", "20X13"),
       3. welding consumables of the form "C-NN" (e.g. "C-08").

     Args:
         text: The string to normalize. Falsy input (None, "") is returned
             unchanged.

     Returns:
         A tuple (normalized_text, changes_count, changes_list) where
         changes_list holds one "original → normalized" entry per altered
         designation and changes_count equals len(changes_list).
     """
     if not text:
         return text, 0, []
     import re
     # Latin capitals that are visually identical to Cyrillic capitals.
     latin_to_cyrillic = str.maketrans({
         'X': 'Х',  # Latin X -> Cyrillic Х (Kha)
         'H': 'Н',  # Latin H -> Cyrillic Н (En)
         'T': 'Т',  # Latin T -> Cyrillic Т (Te)
         'C': 'С',  # Latin C -> Cyrillic С (Es)
         'B': 'В',  # Latin B -> Cyrillic В (Ve)
         'K': 'К',  # Latin K -> Cyrillic К (Ka)
         'M': 'М',  # Latin M -> Cyrillic М (Em)
         'A': 'А',  # Latin A -> Cyrillic А (A)
         'P': 'Р',  # Latin P -> Cyrillic Р (Er)
     })
     changes_list = []
     def replace_in_steel_grade(match):
         """Return the matched designation with look-alikes converted."""
         original = match.group(0)
         converted = original.translate(latin_to_cyrillic)
         if converted != original:
             changes_list.append(f"{original} → {converted}")
         return converted
     grade_tail = r'\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b'
     # The prefixed welding-wire pattern must run BEFORE the generic grade
     # pattern: the generic pattern also matches the part after "CB-", and
     # once that part is Cyrillic the prefixed pattern can no longer match,
     # which would leave the "CB" prefix in Latin.
     text = re.sub(r'\b[CS]B-' + grade_tail, replace_in_steel_grade, text)
     # Steel grades that start with digits.
     text = re.sub(r'\b' + grade_tail, replace_in_steel_grade, text)
     # Welding consumables; routed through the same callback so these
     # replacements are counted as well as listed.
     text = re.sub(r'\b[C]-\d{1,2}\b', replace_in_steel_grade, text)
     # Deriving the count from the list keeps the two from drifting apart.
     return text, len(changes_list), changes_list
     sheet_name = table_data.get('sheet_name', '')
     # Apply steel designation normalization to title and section
+    table_title, title_changes, title_list = normalize_steel_designations(str(table_title))
+    section, section_changes, section_list = normalize_steel_designations(section)
     table_num_clean = str(table_num).strip()
     log_message(f"  📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
+    # Normalize all row content (including steel designations)
     normalized_rows = []
     total_row_changes = 0
     rows_with_changes = 0
+    all_row_changes = []  # NEW
     for row in rows:
         if isinstance(row, dict):
             normalized_row = {}
             row_had_changes = False
             for k, v in row.items():
+                normalized_val, changes, change_list = normalize_steel_designations(str(v))
                 normalized_row[k] = normalized_val
                 if changes > 0:
                     total_row_changes += changes
                     row_had_changes = True
+                    all_row_changes.extend(change_list)  # NEW
             if row_had_changes:
                 rows_with_changes += 1
             normalized_rows.append(normalized_row)
         else:
             normalized_rows.append(row)
+    # Log normalization stats with examples
     if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
         log_message(f"    Steel normalization: title={title_changes}, section={section_changes}, "
+                   f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
+        # NEW: Show examples of what changed
+        if title_list:
+            log_message(f"      Title changes: {', '.join(title_list[:3])}")
+        if section_list:
+            log_message(f"      Section changes: {', '.join(section_list[:3])}")
+        if all_row_changes:
+            log_message(f"      Row examples: {', '.join(all_row_changes[:5])}")
     # Continue with rest of existing logic using normalized_rows...
     # Calculate base metadata size
     base_content = format_table_header(doc_id, table_identifier, table_num,