Spaces:
Sleeping
Sleeping
Commit
·
bd2b030
1
Parent(s):
5263b61
Added new loggers for normalizations
Browse files- documents_prep.py +51 -40
documents_prep.py
CHANGED
|
@@ -25,49 +25,58 @@ def normalize_text(text):
|
|
| 25 |
def normalize_steel_designations(text):
|
| 26 |
"""
|
| 27 |
Convert Latin letters to Cyrillic in steel designations.
|
| 28 |
-
|
|
|
|
| 29 |
"""
|
| 30 |
if not text:
|
| 31 |
return text, 0, []
|
| 32 |
-
|
| 33 |
import re
|
| 34 |
-
|
| 35 |
changes_count = 0
|
| 36 |
-
|
| 37 |
-
|
| 38 |
def replace_in_steel_grade(match):
|
| 39 |
-
|
|
|
|
| 40 |
grade = match.group(0)
|
| 41 |
original_grade = grade
|
| 42 |
-
|
|
|
|
| 43 |
replacements = {
|
| 44 |
-
'X': 'Х',
|
| 45 |
-
'H': 'Н',
|
| 46 |
-
'T': 'Т',
|
| 47 |
-
'C': 'С',
|
| 48 |
-
'B': 'В',
|
| 49 |
-
'K': 'К',
|
| 50 |
-
'M': 'М',
|
| 51 |
-
'A': 'А',
|
| 52 |
-
'P': 'Р',
|
| 53 |
}
|
| 54 |
for latin, cyrillic in replacements.items():
|
| 55 |
grade = grade.replace(latin, cyrillic)
|
| 56 |
-
|
| 57 |
if grade != original_grade:
|
| 58 |
changes_count += 1
|
| 59 |
-
|
| 60 |
-
|
| 61 |
return grade
|
| 62 |
-
|
| 63 |
-
|
|
|
|
| 64 |
replace_in_steel_grade, text)
|
|
|
|
|
|
|
| 65 |
text = re.sub(r'\b[CS]B-\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
|
| 66 |
replace_in_steel_grade, text)
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
|
| 73 |
|
|
@@ -128,8 +137,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 128 |
sheet_name = table_data.get('sheet_name', '')
|
| 129 |
|
| 130 |
# Apply steel designation normalization to title and section
|
| 131 |
-
table_title, title_changes = normalize_steel_designations(str(table_title))
|
| 132 |
-
section, section_changes = normalize_steel_designations(section)
|
| 133 |
|
| 134 |
table_num_clean = str(table_num).strip()
|
| 135 |
|
|
@@ -166,39 +175,41 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 166 |
|
| 167 |
log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
|
| 168 |
|
|
|
|
| 169 |
normalized_rows = []
|
| 170 |
total_row_changes = 0
|
| 171 |
rows_with_changes = 0
|
| 172 |
-
|
| 173 |
-
|
| 174 |
for row in rows:
|
| 175 |
if isinstance(row, dict):
|
| 176 |
normalized_row = {}
|
| 177 |
row_had_changes = False
|
| 178 |
-
row_words = []
|
| 179 |
for k, v in row.items():
|
| 180 |
-
normalized_val, changes,
|
| 181 |
normalized_row[k] = normalized_val
|
| 182 |
if changes > 0:
|
| 183 |
total_row_changes += changes
|
| 184 |
row_had_changes = True
|
| 185 |
-
|
| 186 |
if row_had_changes:
|
| 187 |
rows_with_changes += 1
|
| 188 |
-
all_normalized_words.extend(row_words)
|
| 189 |
normalized_rows.append(normalized_row)
|
| 190 |
else:
|
| 191 |
normalized_rows.append(row)
|
| 192 |
-
|
| 193 |
-
# Log normalization stats
|
| 194 |
if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
|
| 195 |
log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
|
| 196 |
-
|
| 197 |
-
if all_normalized_words:
|
| 198 |
-
log_message(" Normalized words:")
|
| 199 |
-
for orig, norm in all_normalized_words:
|
| 200 |
-
log_message(f" {orig} → {norm}")
|
| 201 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
# Continue with rest of existing logic using normalized_rows...
|
| 203 |
# Calculate base metadata size
|
| 204 |
base_content = format_table_header(doc_id, table_identifier, table_num,
|
|
|
|
| 25 |
def normalize_steel_designations(text):
    """
    Convert Latin look-alike letters to Cyrillic in steel designations.

    Only text matching one of three designation patterns is touched, so
    ordinary Latin words are left alone:

    1. Welding wire designations: a ``CB-`` or ``SB-`` prefix followed by
       a grade of the shape described in (2).
    2. Bare steel grades: 1-3 digits, 1-4 letters, then optionally 0-2
       digits and 0-4 more letters.
    3. Welding consumables: ``C-`` followed by 1-2 digits.

    Args:
        text: String to normalize. Falsy values (``None``, ``""``) are
            returned unchanged.

    Returns:
        Tuple ``(normalized_text, changes_count, changes_list)``.
        ``changes_list`` holds one ``"original → normalized"`` entry per
        match that was actually altered, and ``changes_count`` is the
        length of that list.
    """
    if not text:
        return text, 0, []

    import re

    # Latin capitals that are visually identical to a Cyrillic capital.
    latin_to_cyrillic = str.maketrans({
        'X': '\u0425',  # Latin X -> Cyrillic Kha
        'H': '\u041d',  # Latin H -> Cyrillic En
        'T': '\u0422',  # Latin T -> Cyrillic Te
        'C': '\u0421',  # Latin C -> Cyrillic Es
        'B': '\u0412',  # Latin B -> Cyrillic Ve
        'K': '\u041a',  # Latin K -> Cyrillic Ka
        'M': '\u041c',  # Latin M -> Cyrillic Em
        'A': '\u0410',  # Latin A -> Cyrillic A
        'P': '\u0420',  # Latin P -> Cyrillic Er
    })

    changes_list = []

    def to_cyrillic(match):
        """Translate one matched designation and record it if it changed."""
        original = match.group(0)
        converted = original.translate(latin_to_cyrillic)
        if converted != original:
            changes_list.append(f"{original} → {converted}")
        return converted

    # Welding wire designations. This must run BEFORE the bare-grade
    # pattern: that pattern also matches the grade part after the hyphen
    # and turns it Cyrillic, after which this Latin-only pattern could
    # never match and the prefix would stay Latin.
    # NOTE(review): 'S' is accepted in the prefix but has no Cyrillic
    # mapping, so an "SB-" prefix keeps its Latin S — confirm intended.
    text = re.sub(r'\b[CS]B-\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
                  to_cyrillic, text)

    # Bare steel grades.
    text = re.sub(r'\b\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
                  to_cyrillic, text)

    # Welding consumables. Uses the same callback so these replacements
    # are counted as well as logged.
    text = re.sub(r'\b[C]-\d{1,2}\b', to_cyrillic, text)

    # Deriving the count from the list keeps the two values in sync.
    return text, len(changes_list), changes_list
|
| 80 |
|
| 81 |
|
| 82 |
|
|
|
|
| 137 |
sheet_name = table_data.get('sheet_name', '')
|
| 138 |
|
| 139 |
# Apply steel designation normalization to title and section
|
| 140 |
+
table_title, title_changes, title_list = normalize_steel_designations(str(table_title))
|
| 141 |
+
section, section_changes, section_list = normalize_steel_designations(section)
|
| 142 |
|
| 143 |
table_num_clean = str(table_num).strip()
|
| 144 |
|
|
|
|
| 175 |
|
| 176 |
log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
|
| 177 |
|
| 178 |
+
# Normalize all row content (including steel designations)
|
| 179 |
normalized_rows = []
|
| 180 |
total_row_changes = 0
|
| 181 |
rows_with_changes = 0
|
| 182 |
+
all_row_changes = [] # NEW
|
| 183 |
+
|
| 184 |
for row in rows:
|
| 185 |
if isinstance(row, dict):
|
| 186 |
normalized_row = {}
|
| 187 |
row_had_changes = False
|
|
|
|
| 188 |
for k, v in row.items():
|
| 189 |
+
normalized_val, changes, change_list = normalize_steel_designations(str(v))
|
| 190 |
normalized_row[k] = normalized_val
|
| 191 |
if changes > 0:
|
| 192 |
total_row_changes += changes
|
| 193 |
row_had_changes = True
|
| 194 |
+
all_row_changes.extend(change_list) # NEW
|
| 195 |
if row_had_changes:
|
| 196 |
rows_with_changes += 1
|
|
|
|
| 197 |
normalized_rows.append(normalized_row)
|
| 198 |
else:
|
| 199 |
normalized_rows.append(row)
|
| 200 |
+
|
| 201 |
+
# Log normalization stats with examples
|
| 202 |
if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
|
| 203 |
log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
|
| 204 |
+
f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
|
| 206 |
+
# NEW: Show examples of what changed
|
| 207 |
+
if title_list:
|
| 208 |
+
log_message(f" Title changes: {', '.join(title_list[:3])}")
|
| 209 |
+
if section_list:
|
| 210 |
+
log_message(f" Section changes: {', '.join(section_list[:3])}")
|
| 211 |
+
if all_row_changes:
|
| 212 |
+
log_message(f" Row examples: {', '.join(all_row_changes[:5])}")
|
| 213 |
# Continue with rest of existing logic using normalized_rows...
|
| 214 |
# Calculate base metadata size
|
| 215 |
base_content = format_table_header(doc_id, table_identifier, table_num,
|