MrSimple07 committed on
Commit
15ae02f
·
1 Parent(s): 4c96122

Normalize steel designations from Cyrillic to Latin + MAX_CHARS_TABLE = 2000, MAX_ROWS_TABLE = 30

Browse files
Files changed (2) hide show
  1. config.py +1 -1
  2. documents_prep.py +23 -16
config.py CHANGED
@@ -51,7 +51,7 @@ DEFAULT_MODEL = "Gemini 2.5 Flash"
51
  CHUNK_SIZE = 1500
52
  CHUNK_OVERLAP = 128
53
 
54
- MAX_CHARS_TABLE = 3000
55
  MAX_ROWS_TABLE = 30
56
 
57
  CUSTOM_PROMPT = """
 
51
  CHUNK_SIZE = 1500
52
  CHUNK_OVERLAP = 128
53
 
54
+ MAX_CHARS_TABLE = 2000
55
  MAX_ROWS_TABLE = 30
56
 
57
  CUSTOM_PROMPT = """
documents_prep.py CHANGED
@@ -26,33 +26,38 @@ import re
26
 
27
  def normalize_steel_designations(text):
28
  """
29
- Normalize steel designations by converting Latin letters to Cyrillic.
30
- Handles patterns like 08X18H10T, 12X18H9, 10H17N13M2T, etc.
 
31
  Returns: (normalized_text, changes_count, changes_list)
32
  """
33
  if not text:
34
  return text, 0, []
35
 
 
 
36
  changes_count = 0
37
  changes_list = []
38
 
39
- # Mapping of Latin to Cyrillic for steel designations
40
  replacements = {
41
- 'X': 'Х',
42
- 'H': 'Н',
43
- 'T': 'Т',
44
- 'C': 'С',
45
- 'B': 'В',
46
- 'K': 'К',
47
- 'M': 'М',
48
- 'A': 'А',
49
- 'P': 'Р',
50
  }
51
 
52
- # Regex to match steel designations like 08X18H10T, 10H17N13M2T, etc.
53
- # \b\d{1,3} starts with 1–3 digits
54
- # (?:[A-ZА-Я]\d*)+ — then one or more groups of a letter + optional digits
55
- pattern = r'\b\d{1,3}(?:[A-ZА-Я]\d*)+\b'
 
 
56
 
57
  def replace_in_steel_grade(match):
58
  nonlocal changes_count, changes_list
@@ -63,7 +68,9 @@ def normalize_steel_designations(text):
63
  changes_list.append(f"{original} → {converted}")
64
  return converted
65
 
 
66
  normalized_text = re.sub(pattern, replace_in_steel_grade, text)
 
67
 
68
  return normalized_text, changes_count, changes_list
69
 
 
26
 
27
  def normalize_steel_designations(text):
28
  """
29
+ Normalize steel designations by converting Cyrillic letters to Latin.
30
+ This improves search/retrieval since embedding models work better with Latin.
31
+ Handles patterns like 08Х18Н10Т → 08X18H10T
32
  Returns: (normalized_text, changes_count, changes_list)
33
  """
34
  if not text:
35
  return text, 0, []
36
 
37
+ import re
38
+
39
  changes_count = 0
40
  changes_list = []
41
 
42
+ # Mapping of Cyrillic to Latin for steel designations
43
  replacements = {
44
+ 'Х': 'X', # Cyrillic Kha → Latin X
45
+ 'Н': 'H', # Cyrillic En → Latin H
46
+ 'Т': 'T', # Cyrillic Te → Latin T
47
+ 'С': 'C', # Cyrillic Es → Latin C
48
+ 'В': 'B', # Cyrillic Ve → Latin B
49
+ 'К': 'K', # Cyrillic Ka → Latin K
50
+ 'М': 'M', # Cyrillic Em → Latin M
51
+ 'А': 'A', # Cyrillic A → Latin A
52
+ 'Р': 'P', # Cyrillic Er → Latin P
53
  }
54
 
55
+ # Pattern: starts with digits, then letters+digits (steel grade pattern)
56
+ # Examples: 08Х18Н10Т, 12Х18Н9, 10Н17Н13М2Т, СВ-08Х19Н10
57
+ pattern = r'\b\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
58
+
59
+ # Also match welding wire patterns like СВ-08Х19Н10
60
+ pattern_wire = r'\b[СC][ВB]-\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
61
 
62
  def replace_in_steel_grade(match):
63
  nonlocal changes_count, changes_list
 
68
  changes_list.append(f"{original} → {converted}")
69
  return converted
70
 
71
+ # Apply both patterns
72
  normalized_text = re.sub(pattern, replace_in_steel_grade, text)
73
+ normalized_text = re.sub(pattern_wire, replace_in_steel_grade, normalized_text)
74
 
75
  return normalized_text, changes_count, changes_list
76