Spaces:
Sleeping
Sleeping
Commit
·
5263b61
1
Parent(s):
9ce9909
added new loggers for normalizations
Browse files- documents_prep.py +39 -40
documents_prep.py
CHANGED
|
@@ -25,57 +25,49 @@ def normalize_text(text):
|
|
| 25 |
def normalize_steel_designations(text):
|
| 26 |
"""
|
| 27 |
Convert Latin letters to Cyrillic in steel designations.
|
| 28 |
-
|
| 29 |
-
Returns: (normalized_text, changes_count)
|
| 30 |
"""
|
| 31 |
if not text:
|
| 32 |
-
return text, 0
|
| 33 |
-
|
| 34 |
import re
|
| 35 |
-
|
| 36 |
changes_count = 0
|
| 37 |
-
|
|
|
|
| 38 |
def replace_in_steel_grade(match):
|
| 39 |
-
|
| 40 |
-
nonlocal changes_count
|
| 41 |
grade = match.group(0)
|
| 42 |
original_grade = grade
|
| 43 |
-
|
| 44 |
-
# Mapping of Latin to Cyrillic for steel designations
|
| 45 |
replacements = {
|
| 46 |
-
'X': 'Х',
|
| 47 |
-
'H': 'Н',
|
| 48 |
-
'T': 'Т',
|
| 49 |
-
'C': 'С',
|
| 50 |
-
'B': 'В',
|
| 51 |
-
'K': 'К',
|
| 52 |
-
'M': 'М',
|
| 53 |
-
'A': 'А',
|
| 54 |
-
'P': 'Р',
|
| 55 |
}
|
| 56 |
for latin, cyrillic in replacements.items():
|
| 57 |
grade = grade.replace(latin, cyrillic)
|
| 58 |
-
|
| 59 |
if grade != original_grade:
|
| 60 |
changes_count += 1
|
| 61 |
-
|
|
|
|
| 62 |
return grade
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
# Examples: 08X18H10T, 12X18H10T, 20X13, etc.
|
| 66 |
-
text = re.sub(r'\b\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
|
| 67 |
replace_in_steel_grade, text)
|
| 68 |
-
|
| 69 |
-
# Pattern 2: Welding wire designations like CB-08X19H10, CB-10XH25T
|
| 70 |
text = re.sub(r'\b[CS]B-\d{1,3}[XHTCBKMAP]{1,4}\d{0,2}[XHTCBKMAP]{0,4}\b',
|
| 71 |
replace_in_steel_grade, text)
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
text
|
| 76 |
-
lambda m: m.group(0).replace('C', 'С'), text)
|
| 77 |
-
|
| 78 |
-
return text, changes_count
|
| 79 |
|
| 80 |
|
| 81 |
|
|
@@ -174,32 +166,39 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 174 |
|
| 175 |
log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
|
| 176 |
|
| 177 |
-
# Normalize all row content (including steel designations)
|
| 178 |
normalized_rows = []
|
| 179 |
total_row_changes = 0
|
| 180 |
rows_with_changes = 0
|
| 181 |
-
|
|
|
|
| 182 |
for row in rows:
|
| 183 |
if isinstance(row, dict):
|
| 184 |
normalized_row = {}
|
| 185 |
row_had_changes = False
|
|
|
|
| 186 |
for k, v in row.items():
|
| 187 |
-
normalized_val, changes = normalize_steel_designations(str(v))
|
| 188 |
normalized_row[k] = normalized_val
|
| 189 |
if changes > 0:
|
| 190 |
total_row_changes += changes
|
| 191 |
row_had_changes = True
|
|
|
|
| 192 |
if row_had_changes:
|
| 193 |
rows_with_changes += 1
|
|
|
|
| 194 |
normalized_rows.append(normalized_row)
|
| 195 |
else:
|
| 196 |
normalized_rows.append(row)
|
| 197 |
-
|
| 198 |
# Log normalization stats for this table
|
| 199 |
if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
|
| 200 |
log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
|
| 201 |
-
|
| 202 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
# Continue with rest of existing logic using normalized_rows...
|
| 204 |
# Calculate base metadata size
|
| 205 |
base_content = format_table_header(doc_id, table_identifier, table_num,
|
|
|
|
| 25 |
def normalize_steel_designations(text):
    """
    Convert Latin look-alike letters to Cyrillic in steel designations.

    Three kinds of tokens are rewritten, in this order:
      1. welding wire designations, e.g. CB-08X19H10, CB-10XH25T
      2. steel grades, e.g. 08X18H10T, 12X18H10T, 20X13
      3. short codes of the form C-<1-2 digits>, e.g. C-8

    Returns: (normalized_text, changes_count, normalized_words)
        normalized_text  - text with every matched token rewritten
        changes_count    - number of tokens that were actually changed
        normalized_words - list of (original, normalized) pairs, one entry
                           per changed token (len == changes_count)
    """
    if not text:
        return text, 0, []

    import re

    changes_count = 0
    normalized_words = []

    # Latin capitals that look identical to the Cyrillic letters used in
    # steel grade designations.
    latin_to_cyrillic = str.maketrans({
        'X': 'Х',
        'H': 'Н',
        'T': 'Т',
        'C': 'С',
        'B': 'В',
        'K': 'К',
        'M': 'М',
        'A': 'А',
        'P': 'Р',
    })

    def replace_in_steel_grade(match):
        """Transliterate one matched token and record it if it changed."""
        nonlocal changes_count
        original_grade = match.group(0)
        grade = original_grade.translate(latin_to_cyrillic)
        if grade != original_grade:
            changes_count += 1
            normalized_words.append((original_grade, grade))
        return grade

    # Grade body: leading digits, letters, then any number of
    # <digits><letters> groups and an optional trailing number. Every group
    # needs both digits and letters, so the pattern cannot backtrack
    # ambiguously. This accepts everything the old fixed-shape pattern did
    # and also grades with several numeric groups such as 08X18H10T.
    grade_body = r'\d{1,3}[XHTCBKMAP]+(?:\d{1,2}[XHTCBKMAP]+)*\d{0,2}'

    patterns = (
        # The wire pattern must run BEFORE the plain grade pattern: once the
        # grade part has been converted to Cyrillic the prefixed pattern can
        # no longer match and the "CB-" prefix would stay Latin.
        # NOTE(review): 'S' has no Cyrillic mapping, so "SB-" becomes "SВ-";
        # confirm whether "SB-" should be normalized to "СВ-" as well.
        r'\b[CS]B-' + grade_body + r'\b',
        r'\b' + grade_body + r'\b',
        r'\bC-\d{1,2}\b',
    )
    for pattern in patterns:
        text = re.sub(pattern, replace_in_steel_grade, text)

    return text, changes_count, normalized_words
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
|
| 73 |
|
|
|
|
| 166 |
|
| 167 |
log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
|
| 168 |
|
|
|
|
| 169 |
normalized_rows = []
|
| 170 |
total_row_changes = 0
|
| 171 |
rows_with_changes = 0
|
| 172 |
+
all_normalized_words = []
|
| 173 |
+
|
| 174 |
for row in rows:
|
| 175 |
if isinstance(row, dict):
|
| 176 |
normalized_row = {}
|
| 177 |
row_had_changes = False
|
| 178 |
+
row_words = []
|
| 179 |
for k, v in row.items():
|
| 180 |
+
normalized_val, changes, norm_words = normalize_steel_designations(str(v))
|
| 181 |
normalized_row[k] = normalized_val
|
| 182 |
if changes > 0:
|
| 183 |
total_row_changes += changes
|
| 184 |
row_had_changes = True
|
| 185 |
+
row_words.extend(norm_words)
|
| 186 |
if row_had_changes:
|
| 187 |
rows_with_changes += 1
|
| 188 |
+
all_normalized_words.extend(row_words)
|
| 189 |
normalized_rows.append(normalized_row)
|
| 190 |
else:
|
| 191 |
normalized_rows.append(row)
|
| 192 |
+
|
| 193 |
# Log normalization stats for this table
|
| 194 |
if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
|
| 195 |
log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
|
| 196 |
+
f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
|
| 197 |
+
if all_normalized_words:
|
| 198 |
+
log_message(" Normalized words:")
|
| 199 |
+
for orig, norm in all_normalized_words:
|
| 200 |
+
log_message(f" {orig} → {norm}")
|
| 201 |
+
|
| 202 |
# Continue with rest of existing logic using normalized_rows...
|
| 203 |
# Calculate base metadata size
|
| 204 |
base_content = format_table_header(doc_id, table_identifier, table_num,
|