MrSimple01 commited on
Commit
450cf87
·
verified ·
1 Parent(s): 4849803

Update documents_prep.py

Browse files
Files changed (1) hide show
  1. documents_prep.py +161 -55
documents_prep.py CHANGED
@@ -7,6 +7,72 @@ from llama_index.core.text_splitter import SentenceSplitter
7
  from my_logging import log_message
8
  from config import CHUNK_SIZE, CHUNK_OVERLAP, MAX_CHARS_TABLE, MAX_ROWS_TABLE
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  def chunk_text_documents(documents):
11
  text_splitter = SentenceSplitter(
12
  chunk_size=CHUNK_SIZE,
@@ -17,10 +83,13 @@ def chunk_text_documents(documents):
17
  for doc in documents:
18
  chunks = text_splitter.get_nodes_from_documents([doc])
19
  for i, chunk in enumerate(chunks):
 
 
 
20
  chunk.metadata.update({
21
  'chunk_id': i,
22
  'total_chunks': len(chunks),
23
- 'chunk_size': len(chunk.text) # Add chunk size
24
  })
25
  chunked.append(chunk)
26
 
@@ -34,20 +103,6 @@ def chunk_text_documents(documents):
34
 
35
  return chunked
36
 
37
- def normalize_text(text):
38
- if not text:
39
- return text
40
-
41
- # Replace Cyrillic 'C' with Latin 'С' (U+0421)
42
- # This is for welding types like C-25 -> С-25
43
- text = text.replace('С-', 'C')
44
-
45
- # Also handle cases like "Type C" or variations
46
- import re
47
- # Match "C" followed by digit or space in context of welding types
48
- text = re.sub(r'\bС(\d)', r'С\1', text)
49
-
50
- return text
51
 
52
  def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
53
  headers = table_data.get('headers', [])
@@ -55,80 +110,124 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
55
  table_num = table_data.get('table_number', 'unknown')
56
  table_title = table_data.get('table_title', '')
57
  section = table_data.get('section', '')
 
58
 
 
 
 
 
59
  table_num_clean = str(table_num).strip()
60
- table_title_normalized = normalize_text(str(table_title)) # NORMALIZE TITLE
 
 
61
 
62
  import re
63
- if 'приложени' in section.lower():
64
- appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
65
- if appendix_match:
66
- appendix_num = appendix_match.group(1).upper()
67
- table_identifier = f"{table_num_clean} Приложение {appendix_num}"
 
 
 
 
 
68
  else:
69
- table_identifier = table_num_clean
 
 
 
 
70
  else:
71
- table_identifier = table_num_clean
 
 
 
 
 
 
 
 
72
 
73
  if not rows:
74
  return []
75
 
76
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
77
 
 
 
 
 
 
 
 
 
 
78
  # Calculate base metadata size with NORMALIZED title
79
- base_content = format_table_header(doc_id, table_identifier, table_num, table_title_normalized, section, headers)
 
 
80
  base_size = len(base_content)
81
  available_space = max_chars - base_size - 200
82
 
83
  # If entire table fits, return as one chunk
84
- full_rows_content = format_table_rows([{**row, '_idx': i+1} for i, row in enumerate(rows)])
85
- if base_size + len(full_rows_content) <= max_chars and len(rows) <= max_rows:
 
 
86
  content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
87
 
88
  metadata = {
89
  'type': 'table',
90
  'document_id': doc_id,
91
- 'table_number': table_num_clean,
92
- 'table_identifier': normalize_text(table_identifier), # NORMALIZE identifier
93
- 'table_title': table_title_normalized, # NORMALIZED
94
  'section': section,
95
- 'total_rows': len(rows),
 
96
  'chunk_size': len(content),
97
- 'is_complete_table': True
 
 
98
  }
99
 
100
- log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
101
  return [Document(text=content, metadata=metadata)]
102
 
 
103
  chunks = []
104
  current_rows = []
105
  current_size = 0
106
  chunk_num = 0
107
 
108
- for i, row in enumerate(rows):
109
  row_text = format_single_row(row, i + 1)
110
  row_size = len(row_text)
111
 
112
- should_split = (current_size + row_size > available_space or len(current_rows) >= max_rows) and current_rows
 
113
 
114
  if should_split:
115
  content = base_content + format_table_rows(current_rows)
116
- content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
117
  content += format_table_footer(table_identifier, doc_id)
118
 
119
  metadata = {
120
  'type': 'table',
121
  'document_id': doc_id,
122
- 'table_number': table_num_clean,
123
- 'table_identifier': normalize_text(table_identifier), # NORMALIZE
124
- 'table_title': table_title_normalized, # NORMALIZED
125
  'section': section,
 
126
  'chunk_id': chunk_num,
127
  'row_start': current_rows[0]['_idx'] - 1,
128
  'row_end': current_rows[-1]['_idx'],
129
- 'total_rows': len(rows),
130
  'chunk_size': len(content),
131
- 'is_complete_table': False
 
132
  }
133
 
134
  chunks.append(Document(text=content, metadata=metadata))
@@ -138,31 +237,32 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
138
  current_rows = []
139
  current_size = 0
140
 
141
- # Add row with index
142
  row_copy = row.copy() if isinstance(row, dict) else {'data': row}
143
  row_copy['_idx'] = i + 1
144
  current_rows.append(row_copy)
145
  current_size += row_size
146
 
147
- # Add final chunk
148
  if current_rows:
149
  content = base_content + format_table_rows(current_rows)
150
- content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
151
  content += format_table_footer(table_identifier, doc_id)
152
 
153
  metadata = {
154
  'type': 'table',
155
  'document_id': doc_id,
156
- 'table_number': table_num_clean,
157
- 'table_identifier': normalize_text(table_identifier), # NORMALIZE
158
- 'table_title': table_title_normalized, # NORMALIZED
159
  'section': section,
 
160
  'chunk_id': chunk_num,
161
  'row_start': current_rows[0]['_idx'] - 1,
162
  'row_end': current_rows[-1]['_idx'],
163
- 'total_rows': len(rows),
164
  'chunk_size': len(content),
165
- 'is_complete_table': False
 
166
  }
167
 
168
  chunks.append(Document(text=content, metadata=metadata))
@@ -171,13 +271,15 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
171
  return chunks
172
 
173
 
174
- # MODIFIED: Update format_table_header function
175
- def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
176
  content = f"ТАБЛИЦА {normalize_text(table_identifier)} из документа {doc_id}\n"
177
 
178
- # Add table type/number prominently for matching
179
- if table_num:
180
- content += f"ТИП: {normalize_text(table_num)}\n"
 
 
 
181
 
182
  if table_title:
183
  content += f"НАЗВАНИЕ: {normalize_text(table_title)}\n"
@@ -185,16 +287,20 @@ def format_table_header(doc_id, table_identifier, table_num, table_title, sectio
185
  if section:
186
  content += f"РАЗДЕЛ: {section}\n"
187
 
 
 
 
188
  content += f"{'='*70}\n"
189
 
190
  if headers:
191
- header_str = ' | '.join(str(h) for h in headers)
 
 
192
  content += f"ЗАГОЛОВКИ: {header_str}\n\n"
193
 
194
  content += "ДАННЫЕ:\n"
195
  return content
196
 
197
-
198
  def format_single_row(row, idx):
199
  """Format a single row"""
200
  if isinstance(row, dict):
 
7
  from my_logging import log_message
8
  from config import CHUNK_SIZE, CHUNK_OVERLAP, MAX_CHARS_TABLE, MAX_ROWS_TABLE
9
 
10
def normalize_text(text):
    """Normalize Latin look-alike letters to Cyrillic in welding-type labels.

    Designations typed with a Latin 'C' (e.g. "C-25" or "C25") are converted
    to the Cyrillic 'С' (U+0421) form ("С-25", "С25") so matching against
    Russian documents is consistent.

    Args:
        text: Input string; falsy values (None, '') are returned unchanged.

    Returns:
        The normalized string, or the falsy input as-is.
    """
    if not text:
        return text

    # Latin 'C' followed by a dash -> Cyrillic 'С', KEEPING the dash.
    # (The previous code went the wrong direction — Cyrillic->Latin — and
    # dropped the dash entirely.)
    text = text.replace('C-', 'С-')

    import re
    # Latin 'C' directly followed by a digit (e.g. "C25") -> Cyrillic 'С'.
    # (The previous pattern used Cyrillic 'С' on both sides: a no-op.)
    text = re.sub(r'\bC(\d)', r'С\1', text)

    return text
24
+
25
def normalize_steel_designations(text):
    """
    Convert Latin look-alike letters to Cyrillic inside steel designations.
    Only applies to specific patterns to avoid changing legitimate Latin text.

    Examples: 08X18H10T -> 08Х18Н10Т, CB-08X19H10 -> СВ-08Х19Н10, C-25 -> С-25.

    Args:
        text: Input string; falsy values are returned unchanged.

    Returns:
        The string with designations transliterated to Cyrillic.
    """
    if not text:
        return text

    import re

    # Latin -> Cyrillic mapping, applied only inside matched designations.
    lat_to_cyr = str.maketrans({
        'X': 'Х',  # Latin X -> Cyrillic Kha
        'H': 'Н',  # Latin H -> Cyrillic En
        'T': 'Т',  # Latin T -> Cyrillic Te
        'C': 'С',  # Latin C -> Cyrillic Es
        'B': 'В',  # Latin B -> Cyrillic Ve
        'K': 'К',  # Latin K -> Cyrillic Ka
        'M': 'М',  # Latin M -> Cyrillic Em
        'A': 'А',  # Latin A -> Cyrillic A
        'P': 'Р',  # Latin P -> Cyrillic Er
    })

    def replace_in_steel_grade(match):
        """Transliterate Latin letters only within the matched designation."""
        return match.group(0).translate(lat_to_cyr)

    # Core of a grade: leading digits, then any number of letter/digit
    # alternations, e.g. 08X18H10T, 12X18H10T, 20X13. The repeated group is
    # required: a fixed two-group pattern cannot match 08X18H10T (three
    # digit/letter alternations) — the function's own documented example.
    grade_core = r'\d{1,3}[XHTCBKMAP]{1,4}(?:\d{1,2}[XHTCBKMAP]{0,4})*'

    # Welding wire designations like CB-08X19H10 must run BEFORE the bare
    # grade pattern: \b matches after '-', so the grade pattern would convert
    # the tail first and leave the 'CB-' prefix in Latin.
    text = re.sub(r'\b[CS]B-' + grade_core + r'\b', replace_in_steel_grade, text)

    # Bare steel grades: 08X18H10T, 12X18H10T, 20X13, etc.
    text = re.sub(r'\b' + grade_core + r'\b', replace_in_steel_grade, text)

    # Welding consumables like C-25, C-26 — only when followed by a dash and
    # digits, so section references are not touched.
    text = re.sub(r'\bC-\d{1,2}\b',
                  lambda m: m.group(0).replace('C', 'С'), text)

    return text
73
+
74
+
75
+
76
  def chunk_text_documents(documents):
77
  text_splitter = SentenceSplitter(
78
  chunk_size=CHUNK_SIZE,
 
83
  for doc in documents:
84
  chunks = text_splitter.get_nodes_from_documents([doc])
85
  for i, chunk in enumerate(chunks):
86
+ # Normalize steel designations in the chunk text
87
+ chunk.text = normalize_steel_designations(chunk.text)
88
+
89
  chunk.metadata.update({
90
  'chunk_id': i,
91
  'total_chunks': len(chunks),
92
+ 'chunk_size': len(chunk.text)
93
  })
94
  chunked.append(chunk)
95
 
 
103
 
104
  return chunked
105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
  def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
108
  headers = table_data.get('headers', [])
 
110
  table_num = table_data.get('table_number', 'unknown')
111
  table_title = table_data.get('table_title', '')
112
  section = table_data.get('section', '')
113
+ sheet_name = table_data.get('sheet_name', '')
114
 
115
+ # Apply steel designation normalization to title and section
116
+ table_title = normalize_steel_designations(str(table_title))
117
+ section = normalize_steel_designations(section)
118
+
119
  table_num_clean = str(table_num).strip()
120
+ table_title_normalized = normalize_text(str(table_title))
121
+
122
+ import re
123
 
124
  import re
125
+
126
+ if table_num_clean in ['-', '', 'unknown', 'nan']:
127
+ if 'приложени' in sheet_name.lower() or 'приложени' in section.lower():
128
+ appendix_match = re.search(r'приложени[еия]\s*[№]?\s*(\d+)',
129
+ (sheet_name + ' ' + section).lower())
130
+ if appendix_match:
131
+ appendix_num = appendix_match.group(1)
132
+ table_identifier = f"Приложение {appendix_num}"
133
+ else:
134
+ table_identifier = "Приложение"
135
  else:
136
+ if table_title:
137
+ first_words = ' '.join(table_title.split()[:5])
138
+ table_identifier = f"{first_words}"
139
+ else:
140
+ table_identifier = section.split(',')[0] if section else "БезНомера"
141
  else:
142
+ if 'приложени' in section.lower():
143
+ appendix_match = re.search(r'приложени[еия]\s*[№]?\s*(\d+)', section.lower())
144
+ if appendix_match:
145
+ appendix_num = appendix_match.group(1)
146
+ table_identifier = f"{table_num_clean} Приложение {appendix_num}"
147
+ else:
148
+ table_identifier = table_num_clean
149
+ else:
150
+ table_identifier = table_num_clean
151
 
152
  if not rows:
153
  return []
154
 
155
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
156
 
157
+ # Normalize all row content (including steel designations)
158
+ normalized_rows = []
159
+ for row in rows:
160
+ if isinstance(row, dict):
161
+ normalized_row = {k: normalize_steel_designations(str(v)) for k, v in row.items()}
162
+ normalized_rows.append(normalized_row)
163
+ else:
164
+ normalized_rows.append(row)
165
+
166
  # Calculate base metadata size with NORMALIZED title
167
+ base_content = format_table_header(doc_id, table_identifier, table_num,
168
+ table_title_normalized, section, headers,
169
+ sheet_name) # Pass sheet_name
170
  base_size = len(base_content)
171
  available_space = max_chars - base_size - 200
172
 
173
  # If entire table fits, return as one chunk
174
+ full_rows_content = format_table_rows([{**row, '_idx': i+1}
175
+ for i, row in enumerate(normalized_rows)])
176
+
177
+ if base_size + len(full_rows_content) <= max_chars and len(normalized_rows) <= max_rows:
178
  content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
179
 
180
  metadata = {
181
  'type': 'table',
182
  'document_id': doc_id,
183
+ 'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
184
+ 'table_identifier': normalize_text(table_identifier),
185
+ 'table_title': table_title_normalized,
186
  'section': section,
187
+ 'sheet_name': sheet_name, # ADD THIS
188
+ 'total_rows': len(normalized_rows),
189
  'chunk_size': len(content),
190
+ 'is_complete_table': True,
191
+ # ADD SEARCHABLE KEYWORDS
192
+ 'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
193
  }
194
 
195
+ log_message(f" Single chunk: {len(content)} chars, {len(normalized_rows)} rows")
196
  return [Document(text=content, metadata=metadata)]
197
 
198
+ # Chunking logic continues with normalized_rows instead of rows...
199
  chunks = []
200
  current_rows = []
201
  current_size = 0
202
  chunk_num = 0
203
 
204
+ for i, row in enumerate(normalized_rows):
205
  row_text = format_single_row(row, i + 1)
206
  row_size = len(row_text)
207
 
208
+ should_split = (current_size + row_size > available_space or
209
+ len(current_rows) >= max_rows) and current_rows
210
 
211
  if should_split:
212
  content = base_content + format_table_rows(current_rows)
213
+ content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(normalized_rows)}\n"
214
  content += format_table_footer(table_identifier, doc_id)
215
 
216
  metadata = {
217
  'type': 'table',
218
  'document_id': doc_id,
219
+ 'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
220
+ 'table_identifier': normalize_text(table_identifier),
221
+ 'table_title': table_title_normalized,
222
  'section': section,
223
+ 'sheet_name': sheet_name,
224
  'chunk_id': chunk_num,
225
  'row_start': current_rows[0]['_idx'] - 1,
226
  'row_end': current_rows[-1]['_idx'],
227
+ 'total_rows': len(normalized_rows),
228
  'chunk_size': len(content),
229
+ 'is_complete_table': False,
230
+ 'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
231
  }
232
 
233
  chunks.append(Document(text=content, metadata=metadata))
 
237
  current_rows = []
238
  current_size = 0
239
 
 
240
  row_copy = row.copy() if isinstance(row, dict) else {'data': row}
241
  row_copy['_idx'] = i + 1
242
  current_rows.append(row_copy)
243
  current_size += row_size
244
 
245
+ # Final chunk
246
  if current_rows:
247
  content = base_content + format_table_rows(current_rows)
248
+ content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(normalized_rows)}\n"
249
  content += format_table_footer(table_identifier, doc_id)
250
 
251
  metadata = {
252
  'type': 'table',
253
  'document_id': doc_id,
254
+ 'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
255
+ 'table_identifier': normalize_text(table_identifier),
256
+ 'table_title': table_title_normalized,
257
  'section': section,
258
+ 'sheet_name': sheet_name,
259
  'chunk_id': chunk_num,
260
  'row_start': current_rows[0]['_idx'] - 1,
261
  'row_end': current_rows[-1]['_idx'],
262
+ 'total_rows': len(normalized_rows),
263
  'chunk_size': len(content),
264
+ 'is_complete_table': False,
265
+ 'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
266
  }
267
 
268
  chunks.append(Document(text=content, metadata=metadata))
 
271
  return chunks
272
 
273
 
274
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers, sheet_name=''):
    """Build the searchable header text that precedes a table's data rows.

    Emits every available identifier (number, sheet, title, section) plus a
    keyword line so retrieval can match the chunk on any of them, then the
    normalized column headers and the 'ДАННЫЕ:' marker.
    """
    parts = [f"ТАБЛИЦА {normalize_text(table_identifier)} из документа {doc_id}\n"]

    # Only emit a real table number — '-' / 'unknown' are placeholders.
    if table_num and table_num not in ['-', 'unknown']:
        parts.append(f"НОМЕР ТАБЛИЦЫ: {normalize_text(table_num)}\n")

    if sheet_name:
        parts.append(f"ЛИСТ: {sheet_name}\n")

    if table_title:
        parts.append(f"НАЗВАНИЕ: {normalize_text(table_title)}\n")

    if section:
        parts.append(f"РАЗДЕЛ: {section}\n")

    # Extra keyword line improves recall during retrieval.
    parts.append(f"КЛЮЧЕВЫЕ СЛОВА: материалы стали марки стандарты {doc_id}\n")
    parts.append(f"{'='*70}\n")

    if headers:
        # Normalize headers too before joining them into one line.
        header_str = ' | '.join(normalize_text(str(h)) for h in headers)
        parts.append(f"ЗАГОЛОВКИ: {header_str}\n\n")

    parts.append("ДАННЫЕ:\n")
    return ''.join(parts)
303
 
 
304
  def format_single_row(row, idx):
305
  """Format a single row"""
306
  if isinstance(row, dict):