MrSimple07 committed on
Commit
3dcab53
·
1 Parent(s): 6db5f4f

Added the new LLM query expanding + 4000,30 + Latin to Cyrillic

Browse files
Files changed (2) hide show
  1. config.py +1 -1
  2. documents_prep.py +21 -15
config.py CHANGED
@@ -51,7 +51,7 @@ DEFAULT_MODEL = "Gemini 2.5 Flash"
51
  CHUNK_SIZE = 1500
52
  CHUNK_OVERLAP = 128
53
 
54
- MAX_CHARS_TABLE = 2000
55
  MAX_ROWS_TABLE = 30
56
 
57
 
 
51
  CHUNK_SIZE = 1500
52
  CHUNK_OVERLAP = 128
53
 
54
+ MAX_CHARS_TABLE = 4000
55
  MAX_ROWS_TABLE = 30
56
 
57
 
documents_prep.py CHANGED
@@ -25,6 +25,12 @@ def normalize_text(text):
25
  import re
26
 
27
  def normalize_steel_designations(text):
 
 
 
 
 
 
28
  if not text:
29
  return text, 0, []
30
 
@@ -33,25 +39,24 @@ def normalize_steel_designations(text):
33
  changes_count = 0
34
  changes_list = []
35
 
36
- # Mapping of Cyrillic to Latin for steel designations
37
  replacements = {
38
- 'Х': 'X', # Cyrillic Kha → Latin X
39
- 'Н': 'H', # Cyrillic En → Latin H
40
- 'Т': 'T', # Cyrillic Te → Latin T
41
- 'С': 'C', # Cyrillic Es → Latin C
42
- 'В': 'B', # Cyrillic Ve → Latin B
43
- 'К': 'K', # Cyrillic Ka → Latin K
44
- 'М': 'M', # Cyrillic Em → Latin M
45
- 'А': 'A', # Cyrillic A → Latin A
46
- 'Р': 'P', # Cyrillic Er → Latin P
47
  }
48
 
49
- # Pattern: starts with digits, then letters+digits (steel grade pattern)
50
- # Examples: 08Х18Н10Т, 12Х18Н9, 10Н17Н13М2Т, СВ-08Х19Н10
51
  pattern = r'\b\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
52
-
53
- # Also match welding wire patterns like СВ-08Х19Н10
54
- pattern_wire = r'\b[СC][ВB]-\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
55
 
56
  def replace_in_steel_grade(match):
57
  nonlocal changes_count, changes_list
@@ -70,6 +75,7 @@ def normalize_steel_designations(text):
70
 
71
 
72
 
 
73
  def chunk_text_documents(documents):
74
  text_splitter = SentenceSplitter(
75
  chunk_size=CHUNK_SIZE,
 
25
  import re
26
 
27
  def normalize_steel_designations(text):
28
+ """
29
+ Normalize steel designations by converting Latin letters to Cyrillic.
30
+ Handles patterns like 08X18H10T → 08Х18Н10Т.
31
+ Useful when aligning with Russian technical documentation.
32
+ Returns: (normalized_text, changes_count, changes_list)
33
+ """
34
  if not text:
35
  return text, 0, []
36
 
 
39
  changes_count = 0
40
  changes_list = []
41
 
42
+ # Mapping of Latin to Cyrillic for steel designations
43
  replacements = {
44
+ 'X': 'Х', # Latin X → Cyrillic Х
45
+ 'H': 'Н', # Latin H → Cyrillic Н
46
+ 'T': 'Т', # Latin T → Cyrillic Т
47
+ 'C': 'С', # Latin C → Cyrillic С
48
+ 'B': 'В', # Latin B → Cyrillic В
49
+ 'K': 'К', # Latin K → Cyrillic К
50
+ 'M': 'М', # Latin M → Cyrillic М
51
+ 'A': 'А', # Latin A → Cyrillic А
52
+ 'P': 'Р', # Latin P → Cyrillic Р
53
  }
54
 
55
+ # Pattern for steel grades (digits + letters)
 
56
  pattern = r'\b\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
57
+
58
+ # Pattern for welding wire designations (e.g. CB-08X19H10)
59
+ pattern_wire = r'\b[CSС][BVВ]-\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
60
 
61
  def replace_in_steel_grade(match):
62
  nonlocal changes_count, changes_list
 
75
 
76
 
77
 
78
+
79
  def chunk_text_documents(documents):
80
  text_splitter = SentenceSplitter(
81
  chunk_size=CHUNK_SIZE,