Spaces:

MrSimple01
/

RAG_AIEXP_001

Paused

App Files Files Community

MrSimple01 committed on Oct 15, 2025

Commit

57493a9

verified ·

1 Parent(s): 33d47b8

Update documents_prep.py

Browse files

Files changed (1) hide show

documents_prep.py +25 -15

documents_prep.py CHANGED Viewed

@@ -26,18 +26,19 @@ def normalize_steel_designations(text):
     """
     Convert Latin letters to Cyrillic in steel designations.
     Only applies to specific patterns to avoid changing legitimate Latin text.
-    Returns: (normalized_text, changes_count)
     """
     if not text:
-        return text, 0
     import re
     changes_count = 0
     def replace_in_steel_grade(match):
         """Replace Latin with Cyrillic only in steel grade context"""
-        nonlocal changes_count
         grade = match.group(0)
         original_grade = grade
@@ -58,24 +59,24 @@ def normalize_steel_designations(text):
         if grade != original_grade:
             changes_count += 1
         return grade
-    # Pattern for steel grades: digits followed by letters and more digits/letters
-    # Examples: 08X18H10T, 12X18H10T, 20X13, etc.
     text = re.sub(r'\b\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
                   replace_in_steel_grade, text)
-    # Pattern 2: Welding wire designations like CB-08X19H10, CB-10XH25T
     text = re.sub(r'\b[CS]B-\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
                   replace_in_steel_grade, text)
-    # Pattern 3: Welding consumables like C-25, C-26 (but be careful not to change section refs)
-    # Only replace if followed by dash and digits
     text = re.sub(r'\b[C]-\d{1,2}\b',
-                  lambda m: m.group(0).replace('C', 'С'), text)
-    return text, changes_count
@@ -136,8 +137,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
     sheet_name = table_data.get('sheet_name', '')
     # Apply steel designation normalization to title and section
-    table_title, title_changes = normalize_steel_designations(str(table_title))
-    section, section_changes = normalize_steel_designations(section)
     table_num_clean = str(table_num).strip()
@@ -178,28 +179,37 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
     normalized_rows = []
     total_row_changes = 0
     rows_with_changes = 0
     for row in rows:
         if isinstance(row, dict):
             normalized_row = {}
             row_had_changes = False
             for k, v in row.items():
-                normalized_val, changes = normalize_steel_designations(str(v))
                 normalized_row[k] = normalized_val
                 if changes > 0:
                     total_row_changes += changes
                     row_had_changes = True
             if row_had_changes:
                 rows_with_changes += 1
             normalized_rows.append(normalized_row)
         else:
             normalized_rows.append(row)
-    # Log normalization stats for this table
     if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
         log_message(f"    Steel normalization: title={title_changes}, section={section_changes}, "
                    f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
     # Continue with rest of existing logic using normalized_rows...
     # Calculate base metadata size
     base_content = format_table_header(doc_id, table_identifier, table_num,

     """
     Convert Latin letters to Cyrillic in steel designations.
     Only applies to specific patterns to avoid changing legitimate Latin text.
+    Returns: (normalized_text, changes_count, changes_list)
     """
     if not text:
+        return text, 0, []
     import re
     changes_count = 0
+    changes_list = []  # NEW: Track what changed
     def replace_in_steel_grade(match):
         """Replace Latin with Cyrillic only in steel grade context"""
+        nonlocal changes_count, changes_list
         grade = match.group(0)
         original_grade = grade
         if grade != original_grade:
             changes_count += 1
+            changes_list.append(f"{original_grade} → {grade}")  # NEW: Record change
         return grade
+    # Pattern for steel grades
     text = re.sub(r'\b\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
                   replace_in_steel_grade, text)
+    # Pattern 2: Welding wire designations
     text = re.sub(r'\b[CS]B-\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
                   replace_in_steel_grade, text)
+    # Pattern 3: Welding consumables
     text = re.sub(r'\b[C]-\d{1,2}\b',
+                  lambda m: (changes_list.append(f"{m.group(0)} → {m.group(0).replace('C', 'С')}") or changes_count.__add__(1), m.group(0).replace('C', 'С'))[1] if m.group(0) != m.group(0).replace('C', 'С') else m.group(0),
+                  text)
+    return text, changes_count, changes_list
     sheet_name = table_data.get('sheet_name', '')
     # Apply steel designation normalization to title and section
+    table_title, title_changes, title_list = normalize_steel_designations(str(table_title))
+    section, section_changes, section_list = normalize_steel_designations(section)
     table_num_clean = str(table_num).strip()
     normalized_rows = []
     total_row_changes = 0
     rows_with_changes = 0
+    all_row_changes = []  # NEW
     for row in rows:
         if isinstance(row, dict):
             normalized_row = {}
             row_had_changes = False
             for k, v in row.items():
+                normalized_val, changes, change_list = normalize_steel_designations(str(v))
                 normalized_row[k] = normalized_val
                 if changes > 0:
                     total_row_changes += changes
                     row_had_changes = True
+                    all_row_changes.extend(change_list)  # NEW
             if row_had_changes:
                 rows_with_changes += 1
             normalized_rows.append(normalized_row)
         else:
             normalized_rows.append(row)
+    # Log normalization stats with examples
     if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
         log_message(f"    Steel normalization: title={title_changes}, section={section_changes}, "
                    f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
+        # NEW: Show examples of what changed
+        if title_list:
+            log_message(f"      Title changes: {', '.join(title_list[:3])}")
+        if section_list:
+            log_message(f"      Section changes: {', '.join(section_list[:3])}")
+        if all_row_changes:
+            log_message(f"      Row examples: {', '.join(all_row_changes[:5])}")
     # Continue with rest of existing logic using normalized_rows...
     # Calculate base metadata size
     base_content = format_table_header(doc_id, table_identifier, table_num,