Spaces:
Sleeping
Sleeping
Commit
·
15ae02f
1
Parent(s):
4c96122
from cyr to latin + 2000,30
Browse files- config.py +1 -1
- documents_prep.py +23 -16
config.py
CHANGED
|
@@ -51,7 +51,7 @@ DEFAULT_MODEL = "Gemini 2.5 Flash"
|
|
| 51 |
CHUNK_SIZE = 1500
|
| 52 |
CHUNK_OVERLAP = 128
|
| 53 |
|
| 54 |
-
MAX_CHARS_TABLE =
|
| 55 |
MAX_ROWS_TABLE = 30
|
| 56 |
|
| 57 |
CUSTOM_PROMPT = """
|
|
|
|
| 51 |
CHUNK_SIZE = 1500
|
| 52 |
CHUNK_OVERLAP = 128
|
| 53 |
|
| 54 |
+
MAX_CHARS_TABLE = 2000
|
| 55 |
MAX_ROWS_TABLE = 30
|
| 56 |
|
| 57 |
CUSTOM_PROMPT = """
|
documents_prep.py
CHANGED
|
@@ -26,33 +26,38 @@ import re
|
|
| 26 |
|
| 27 |
def normalize_steel_designations(text):
|
| 28 |
"""
|
| 29 |
-
Normalize steel designations by converting
|
| 30 |
-
|
|
|
|
| 31 |
Returns: (normalized_text, changes_count, changes_list)
|
| 32 |
"""
|
| 33 |
if not text:
|
| 34 |
return text, 0, []
|
| 35 |
|
|
|
|
|
|
|
| 36 |
changes_count = 0
|
| 37 |
changes_list = []
|
| 38 |
|
| 39 |
-
# Mapping of
|
| 40 |
replacements = {
|
| 41 |
-
'
|
| 42 |
-
'
|
| 43 |
-
'
|
| 44 |
-
'
|
| 45 |
-
'
|
| 46 |
-
'
|
| 47 |
-
'
|
| 48 |
-
'
|
| 49 |
-
'
|
| 50 |
}
|
| 51 |
|
| 52 |
-
#
|
| 53 |
-
#
|
| 54 |
-
|
| 55 |
-
|
|
|
|
|
|
|
| 56 |
|
| 57 |
def replace_in_steel_grade(match):
|
| 58 |
nonlocal changes_count, changes_list
|
|
@@ -63,7 +68,9 @@ def normalize_steel_designations(text):
|
|
| 63 |
changes_list.append(f"{original} → {converted}")
|
| 64 |
return converted
|
| 65 |
|
|
|
|
| 66 |
normalized_text = re.sub(pattern, replace_in_steel_grade, text)
|
|
|
|
| 67 |
|
| 68 |
return normalized_text, changes_count, changes_list
|
| 69 |
|
|
|
|
| 26 |
|
| 27 |
def normalize_steel_designations(text):
|
| 28 |
"""
|
| 29 |
+
Normalize steel designations by converting Cyrillic letters to Latin.
|
| 30 |
+
This improves search/retrieval since embedding models work better with Latin.
|
| 31 |
+
Handles patterns like 08Х18Н10Т → 08X18H10T
|
| 32 |
Returns: (normalized_text, changes_count, changes_list)
|
| 33 |
"""
|
| 34 |
if not text:
|
| 35 |
return text, 0, []
|
| 36 |
|
| 37 |
+
import re
|
| 38 |
+
|
| 39 |
changes_count = 0
|
| 40 |
changes_list = []
|
| 41 |
|
| 42 |
+
# Mapping of Cyrillic to Latin for steel designations
|
| 43 |
replacements = {
|
| 44 |
+
'Х': 'X', # Cyrillic Kha → Latin X
|
| 45 |
+
'Н': 'H', # Cyrillic En → Latin H
|
| 46 |
+
'Т': 'T', # Cyrillic Te → Latin T
|
| 47 |
+
'С': 'C', # Cyrillic Es → Latin C
|
| 48 |
+
'В': 'B', # Cyrillic Ve → Latin B
|
| 49 |
+
'К': 'K', # Cyrillic Ka → Latin K
|
| 50 |
+
'М': 'M', # Cyrillic Em → Latin M
|
| 51 |
+
'А': 'A', # Cyrillic A → Latin A
|
| 52 |
+
'Р': 'P', # Cyrillic Er → Latin P
|
| 53 |
}
|
| 54 |
|
| 55 |
+
# Pattern: starts with digits, then letters+digits (steel grade pattern)
|
| 56 |
+
# Examples: 08Х18Н10Т, 12Х18Н9, 10Х17Н13М2Т, СВ-08Х19Н10
|
| 57 |
+
pattern = r'\b\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
|
| 58 |
+
|
| 59 |
+
# Also match welding wire patterns like СВ-08Х19Н10
|
| 60 |
+
pattern_wire = r'\b[СC][ВB]-\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
|
| 61 |
|
| 62 |
def replace_in_steel_grade(match):
|
| 63 |
nonlocal changes_count, changes_list
|
|
|
|
| 68 |
changes_list.append(f"{original} → {converted}")
|
| 69 |
return converted
|
| 70 |
|
| 71 |
+
# Apply both patterns
|
| 72 |
normalized_text = re.sub(pattern, replace_in_steel_grade, text)
|
| 73 |
+
normalized_text = re.sub(pattern_wire, replace_in_steel_grade, normalized_text)
|
| 74 |
|
| 75 |
return normalized_text, changes_count, changes_list
|
| 76 |
|