Spaces:
Paused
Paused
Update documents_prep.py
Browse files- documents_prep.py +25 -15
documents_prep.py
CHANGED
|
@@ -26,18 +26,19 @@ def normalize_steel_designations(text):
|
|
| 26 |
"""
|
| 27 |
Convert Latin letters to Cyrillic in steel designations.
|
| 28 |
Only applies to specific patterns to avoid changing legitimate Latin text.
|
| 29 |
-
Returns: (normalized_text, changes_count)
|
| 30 |
"""
|
| 31 |
if not text:
|
| 32 |
-
return text, 0
|
| 33 |
|
| 34 |
import re
|
| 35 |
|
| 36 |
changes_count = 0
|
|
|
|
| 37 |
|
| 38 |
def replace_in_steel_grade(match):
|
| 39 |
"""Replace Latin with Cyrillic only in steel grade context"""
|
| 40 |
-
nonlocal changes_count
|
| 41 |
grade = match.group(0)
|
| 42 |
original_grade = grade
|
| 43 |
|
|
@@ -58,24 +59,24 @@ def normalize_steel_designations(text):
|
|
| 58 |
|
| 59 |
if grade != original_grade:
|
| 60 |
changes_count += 1
|
|
|
|
| 61 |
|
| 62 |
return grade
|
| 63 |
|
| 64 |
-
# Pattern for steel grades
|
| 65 |
-
# Examples: 08X18H10T, 12X18H10T, 20X13, etc.
|
| 66 |
text = re.sub(r'\b\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
|
| 67 |
replace_in_steel_grade, text)
|
| 68 |
|
| 69 |
-
# Pattern 2: Welding wire designations
|
| 70 |
text = re.sub(r'\b[CS]B-\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
|
| 71 |
replace_in_steel_grade, text)
|
| 72 |
|
| 73 |
-
# Pattern 3: Welding consumables
|
| 74 |
-
# Only replace if followed by dash and digits
|
| 75 |
text = re.sub(r'\b[C]-\d{1,2}\b',
|
| 76 |
-
lambda m: m.group(0).replace('C', 'С'),
|
|
|
|
| 77 |
|
| 78 |
-
return text, changes_count
|
| 79 |
|
| 80 |
|
| 81 |
|
|
@@ -136,8 +137,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 136 |
sheet_name = table_data.get('sheet_name', '')
|
| 137 |
|
| 138 |
# Apply steel designation normalization to title and section
|
| 139 |
-
table_title, title_changes = normalize_steel_designations(str(table_title))
|
| 140 |
-
section, section_changes = normalize_steel_designations(section)
|
| 141 |
|
| 142 |
table_num_clean = str(table_num).strip()
|
| 143 |
|
|
@@ -178,28 +179,37 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 178 |
normalized_rows = []
|
| 179 |
total_row_changes = 0
|
| 180 |
rows_with_changes = 0
|
|
|
|
| 181 |
|
| 182 |
for row in rows:
|
| 183 |
if isinstance(row, dict):
|
| 184 |
normalized_row = {}
|
| 185 |
row_had_changes = False
|
| 186 |
for k, v in row.items():
|
| 187 |
-
normalized_val, changes = normalize_steel_designations(str(v))
|
| 188 |
normalized_row[k] = normalized_val
|
| 189 |
if changes > 0:
|
| 190 |
total_row_changes += changes
|
| 191 |
row_had_changes = True
|
|
|
|
| 192 |
if row_had_changes:
|
| 193 |
rows_with_changes += 1
|
| 194 |
normalized_rows.append(normalized_row)
|
| 195 |
else:
|
| 196 |
normalized_rows.append(row)
|
| 197 |
|
| 198 |
-
# Log normalization stats
|
| 199 |
if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
|
| 200 |
log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
|
| 201 |
f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
|
| 202 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
# Continue with rest of existing logic using normalized_rows...
|
| 204 |
# Calculate base metadata size
|
| 205 |
base_content = format_table_header(doc_id, table_identifier, table_num,
|
|
|
|
| 26 |
"""
|
| 27 |
Convert Latin letters to Cyrillic in steel designations.
|
| 28 |
Only applies to specific patterns to avoid changing legitimate Latin text.
|
| 29 |
+
Returns: (normalized_text, changes_count, changes_list), where changes_list holds one 'original → normalized' string per recorded replacement.
|
| 30 |
"""
|
| 31 |
if not text:
|
| 32 |
+
return text, 0, []
|
| 33 |
|
| 34 |
import re
|
| 35 |
|
| 36 |
changes_count = 0
|
| 37 |
+
changes_list = [] # NEW: Track what changed
|
| 38 |
|
| 39 |
def replace_in_steel_grade(match):
|
| 40 |
"""Replace Latin with Cyrillic only in steel grade context"""
|
| 41 |
+
nonlocal changes_count, changes_list
|
| 42 |
grade = match.group(0)
|
| 43 |
original_grade = grade
|
| 44 |
|
|
|
|
| 59 |
|
| 60 |
if grade != original_grade:
|
| 61 |
changes_count += 1
|
| 62 |
+
changes_list.append(f"{original_grade} → {grade}") # NEW: Record change
|
| 63 |
|
| 64 |
return grade
|
| 65 |
|
| 66 |
+
# Pattern for steel grades
|
|
|
|
| 67 |
text = re.sub(r'\b\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
|
| 68 |
replace_in_steel_grade, text)
|
| 69 |
|
| 70 |
+
# Pattern 2: Welding wire designations
|
| 71 |
text = re.sub(r'\b[CS]B-\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
|
| 72 |
replace_in_steel_grade, text)
|
| 73 |
|
| 74 |
+
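# Pattern 3: Welding consumables (Latin "C" followed by a dash and 1-2 digits)
# NOTE(review): the lambda below calls changes_count.__add__(1), which returns a new int and discards it, so these replacements are appended to changes_list but never added to changes_count.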
|
|
|
|
| 75 |
text = re.sub(r'\b[C]-\d{1,2}\b',
|
| 76 |
+
lambda m: (changes_list.append(f"{m.group(0)} → {m.group(0).replace('C', 'С')}") or changes_count.__add__(1), m.group(0).replace('C', 'С'))[1] if m.group(0) != m.group(0).replace('C', 'С') else m.group(0),
|
| 77 |
+
text)
|
| 78 |
|
| 79 |
+
return text, changes_count, changes_list
|
| 80 |
|
| 81 |
|
| 82 |
|
|
|
|
| 137 |
sheet_name = table_data.get('sheet_name', '')
|
| 138 |
|
| 139 |
# Apply steel designation normalization to title and section
|
| 140 |
+
table_title, title_changes, title_list = normalize_steel_designations(str(table_title))
|
| 141 |
+
section, section_changes, section_list = normalize_steel_designations(section)
|
| 142 |
|
| 143 |
table_num_clean = str(table_num).strip()
|
| 144 |
|
|
|
|
| 179 |
normalized_rows = []
|
| 180 |
total_row_changes = 0
|
| 181 |
rows_with_changes = 0
|
| 182 |
+
all_row_changes = [] # NEW
|
| 183 |
|
| 184 |
for row in rows:
|
| 185 |
if isinstance(row, dict):
|
| 186 |
normalized_row = {}
|
| 187 |
row_had_changes = False
|
| 188 |
for k, v in row.items():
|
| 189 |
+
normalized_val, changes, change_list = normalize_steel_designations(str(v))
|
| 190 |
normalized_row[k] = normalized_val
|
| 191 |
if changes > 0:
|
| 192 |
total_row_changes += changes
|
| 193 |
row_had_changes = True
|
| 194 |
+
all_row_changes.extend(change_list) # NEW
|
| 195 |
if row_had_changes:
|
| 196 |
rows_with_changes += 1
|
| 197 |
normalized_rows.append(normalized_row)
|
| 198 |
else:
|
| 199 |
normalized_rows.append(row)
|
| 200 |
|
| 201 |
+
# Log normalization stats with examples
|
| 202 |
if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
|
| 203 |
log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
|
| 204 |
f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
|
| 205 |
+
|
| 206 |
+
# Show examples of what changed: up to 3 title changes, 3 section changes and 5 row changes
|
| 207 |
+
if title_list:
|
| 208 |
+
log_message(f" Title changes: {', '.join(title_list[:3])}")
|
| 209 |
+
if section_list:
|
| 210 |
+
log_message(f" Section changes: {', '.join(section_list[:3])}")
|
| 211 |
+
if all_row_changes:
|
| 212 |
+
log_message(f" Row examples: {', '.join(all_row_changes[:5])}")
|
| 213 |
# Continue with rest of existing logic using normalized_rows...
|
| 214 |
# Calculate base metadata size
|
| 215 |
base_content = format_table_header(doc_id, table_identifier, table_num,
|