Spaces:
Sleeping
Sleeping
Commit
·
3dcab53
1
Parent(s):
6db5f4f
Added the new LLM query expansion + table limits 4000/30 (MAX_CHARS_TABLE / MAX_ROWS_TABLE) + Latin-to-Cyrillic normalization
Browse files
- config.py +1 -1
- documents_prep.py +21 -15
config.py
CHANGED
|
@@ -51,7 +51,7 @@ DEFAULT_MODEL = "Gemini 2.5 Flash"
|
|
| 51 |
CHUNK_SIZE = 1500
|
| 52 |
CHUNK_OVERLAP = 128
|
| 53 |
|
| 54 |
-
MAX_CHARS_TABLE =
|
| 55 |
MAX_ROWS_TABLE = 30
|
| 56 |
|
| 57 |
|
|
|
|
| 51 |
CHUNK_SIZE = 1500
|
| 52 |
CHUNK_OVERLAP = 128
|
| 53 |
|
| 54 |
+
MAX_CHARS_TABLE = 4000
|
| 55 |
MAX_ROWS_TABLE = 30
|
| 56 |
|
| 57 |
|
documents_prep.py
CHANGED
|
@@ -25,6 +25,12 @@ def normalize_text(text):
|
|
| 25 |
import re
|
| 26 |
|
| 27 |
def normalize_steel_designations(text):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
if not text:
|
| 29 |
return text, 0, []
|
| 30 |
|
|
@@ -33,25 +39,24 @@ def normalize_steel_designations(text):
|
|
| 33 |
changes_count = 0
|
| 34 |
changes_list = []
|
| 35 |
|
| 36 |
-
# Mapping of
|
| 37 |
replacements = {
|
| 38 |
-
'
|
| 39 |
-
'
|
| 40 |
-
'
|
| 41 |
-
'
|
| 42 |
-
'
|
| 43 |
-
'
|
| 44 |
-
'
|
| 45 |
-
'
|
| 46 |
-
'
|
| 47 |
}
|
| 48 |
|
| 49 |
-
# Pattern
|
| 50 |
-
# Examples: 08Х18Н10Т, 12Х18Н9, 10Х17Н13М2Т, СВ-08Х19Н10
|
| 51 |
pattern = r'\b\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
|
| 52 |
-
|
| 53 |
-
#
|
| 54 |
-
pattern_wire = r'\b[С
|
| 55 |
|
| 56 |
def replace_in_steel_grade(match):
|
| 57 |
nonlocal changes_count, changes_list
|
|
@@ -70,6 +75,7 @@ def normalize_steel_designations(text):
|
|
| 70 |
|
| 71 |
|
| 72 |
|
|
|
|
| 73 |
def chunk_text_documents(documents):
|
| 74 |
text_splitter = SentenceSplitter(
|
| 75 |
chunk_size=CHUNK_SIZE,
|
|
|
|
| 25 |
import re
|
| 26 |
|
| 27 |
def normalize_steel_designations(text):
|
| 28 |
+
"""
|
| 29 |
+
Normalize steel designations by converting Latin letters to Cyrillic.
|
| 30 |
+
Handles patterns like 08X18H10T → 08Х18Н10Т.
|
| 31 |
+
Useful when aligning with Russian technical documentation.
|
| 32 |
+
Returns: (normalized_text, changes_count, changes_list)
|
| 33 |
+
"""
|
| 34 |
if not text:
|
| 35 |
return text, 0, []
|
| 36 |
|
|
|
|
| 39 |
changes_count = 0
|
| 40 |
changes_list = []
|
| 41 |
|
| 42 |
+
# Mapping of Latin to Cyrillic for steel designations
|
| 43 |
replacements = {
|
| 44 |
+
'X': 'Х', # Latin X → Cyrillic Х
|
| 45 |
+
'H': 'Н', # Latin H → Cyrillic Н
|
| 46 |
+
'T': 'Т', # Latin T → Cyrillic Т
|
| 47 |
+
'C': 'С', # Latin C → Cyrillic С
|
| 48 |
+
'B': 'В', # Latin B → Cyrillic В
|
| 49 |
+
'K': 'К', # Latin K → Cyrillic К
|
| 50 |
+
'M': 'М', # Latin M → Cyrillic М
|
| 51 |
+
'A': 'А', # Latin A → Cyrillic А
|
| 52 |
+
'P': 'Р', # Latin P → Cyrillic Р
|
| 53 |
}
|
| 54 |
|
| 55 |
+
# Pattern for steel grades (digits + letters)
|
|
|
|
| 56 |
pattern = r'\b\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
|
| 57 |
+
|
| 58 |
+
# Pattern for welding wire designations (e.g. CB-08X19H10)
|
| 59 |
+
pattern_wire = r'\b[CSС][BVВ]-\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
|
| 60 |
|
| 61 |
def replace_in_steel_grade(match):
|
| 62 |
nonlocal changes_count, changes_list
|
|
|
|
| 75 |
|
| 76 |
|
| 77 |
|
| 78 |
+
|
| 79 |
def chunk_text_documents(documents):
|
| 80 |
text_splitter = SentenceSplitter(
|
| 81 |
chunk_size=CHUNK_SIZE,
|