Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

MrSimple07 committed on Oct 15, 2025

Commit

3dcab53

1 Parent(s): 6db5f4f

Added the new LLM query expansion + table limits (MAX_CHARS_TABLE = 4000, MAX_ROWS_TABLE = 30) + Latin-to-Cyrillic normalization of steel designations

Files changed (2) hide show

config.py CHANGED Viewed

@@ -51,7 +51,7 @@ DEFAULT_MODEL = "Gemini 2.5 Flash"
 CHUNK_SIZE = 1500
 CHUNK_OVERLAP = 128
-MAX_CHARS_TABLE = 2000
 MAX_ROWS_TABLE = 30

 CHUNK_SIZE = 1500
 CHUNK_OVERLAP = 128
+MAX_CHARS_TABLE = 4000
 MAX_ROWS_TABLE = 30

documents_prep.py CHANGED Viewed

@@ -25,6 +25,12 @@ def normalize_text(text):
 import re
 def normalize_steel_designations(text):
     if not text:
         return text, 0, []
@@ -33,25 +39,24 @@ def normalize_steel_designations(text):
     changes_count = 0
     changes_list = []
-    # Mapping of Cyrillic to Latin for steel designations
     replacements = {
-        'Х': 'X',  # Cyrillic Kha → Latin X
-        'Н': 'H',  # Cyrillic En → Latin H
-        'Т': 'T',  # Cyrillic Te → Latin T
-        'С': 'C',  # Cyrillic Es → Latin C
-        'В': 'B',  # Cyrillic Ve → Latin B
-        'К': 'K',  # Cyrillic Ka → Latin K
-        'М': 'M',  # Cyrillic Em → Latin M
-        'А': 'A',  # Cyrillic A → Latin A
-        'Р': 'P',  # Cyrillic Er → Latin P
     }
-    # Pattern: starts with digits, then letters+digits (steel grade pattern)
-    # Examples: 08Х18Н10Т, 12Х18Н9, 10Н17Н13М2Т, СВ-08Х19Н10
     pattern = r'\b\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
-    # Also match welding wire patterns like СВ-08Х19Н10
-    pattern_wire = r'\b[СC][ВB]-\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
     def replace_in_steel_grade(match):
         nonlocal changes_count, changes_list
@@ -70,6 +75,7 @@ def normalize_steel_designations(text):
 def chunk_text_documents(documents):
     text_splitter = SentenceSplitter(
         chunk_size=CHUNK_SIZE,

 import re
 def normalize_steel_designations(text):
+    """
+    Normalize steel designations by converting Latin letters to Cyrillic.
+    Handles patterns like 08X18H10T → 08Х18Н10Т.
+    Useful when aligning with Russian technical documentation.
+    Returns: (normalized_text, changes_count, changes_list)
+    """
     if not text:
         return text, 0, []
     changes_count = 0
     changes_list = []
+    # Mapping of Latin to Cyrillic for steel designations
     replacements = {
+        'X': 'Х',  # Latin X → Cyrillic Х
+        'H': 'Н',  # Latin H → Cyrillic Н
+        'T': 'Т',  # Latin T → Cyrillic Т
+        'C': 'С',  # Latin C → Cyrillic С
+        'B': 'В',  # Latin B → Cyrillic В
+        'K': 'К',  # Latin K → Cyrillic К
+        'M': 'М',  # Latin M → Cyrillic М
+        'A': 'А',  # Latin A → Cyrillic А
+        'P': 'Р',  # Latin P → Cyrillic Р
     }
+    # Pattern for steel grades (digits + letters)
     pattern = r'\b\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
+    # Pattern for welding wire designations (e.g. CB-08X19H10)
+    pattern_wire = r'\b[CSС][BVВ]-\d{1,3}(?:[A-ZА-ЯЁ]\d*)+\b'
     def replace_in_steel_grade(match):
         nonlocal changes_count, changes_list
 def chunk_text_documents(documents):
     text_splitter = SentenceSplitter(
         chunk_size=CHUNK_SIZE,