Spaces:

MrSimple01
/

RAG_AIEXP_001

Paused

App Files Files Community

MrSimple01 committed on Oct 14, 2025

Commit

77e588d

verified ·

1 Parent(s): b03f5ea

Update documents_prep.py

Browse files

Files changed (1) hide show

documents_prep.py +27 -11

documents_prep.py CHANGED Viewed

@@ -34,6 +34,20 @@ def chunk_text_documents(documents):
     return chunked
 def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
     headers = table_data.get('headers', [])
@@ -43,6 +57,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
     section = table_data.get('section', '')
     table_num_clean = str(table_num).strip()
     import re
     if 'приложени' in section.lower():
@@ -60,8 +75,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
     log_message(f"  📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
-    # Calculate base metadata size
-    base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
     base_size = len(base_content)
     available_space = max_chars - base_size - 200
@@ -74,8 +89,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
             'type': 'table',
             'document_id': doc_id,
             'table_number': table_num_clean,
-            'table_identifier': table_identifier,
-            'table_title': table_title,
             'section': section,
             'total_rows': len(rows),
             'chunk_size': len(content),
@@ -105,8 +120,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
                 'type': 'table',
                 'document_id': doc_id,
                 'table_number': table_num_clean,
-                'table_identifier': table_identifier,
-                'table_title': table_title,
                 'section': section,
                 'chunk_id': chunk_num,
                 'row_start': current_rows[0]['_idx'] - 1,
@@ -139,8 +154,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
             'type': 'table',
             'document_id': doc_id,
             'table_number': table_num_clean,
-            'table_identifier': table_identifier,
-            'table_title': table_title,
             'section': section,
             'chunk_id': chunk_num,
             'row_start': current_rows[0]['_idx'] - 1,
@@ -156,15 +171,16 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
     return chunks
 def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
-    content = f"ТАБЛИЦА {table_identifier} из документа {doc_id}\n"
     # Add table type/number prominently for matching
     if table_num:
-        content += f"ТИП: {table_num}\n"
     if table_title:
-        content += f"НАЗВАНИЕ: {table_title}\n"
     if section:
         content += f"РАЗДЕЛ: {section}\n"

     return chunked
def normalize_text(text):
    """Normalize welding-type designations to the Cyrillic letter "С" (U+0421).

    A Latin capital "C" (U+0043) that starts a token and is followed by a
    digit, optionally after a hyphen, is replaced with the visually identical
    Cyrillic capital "С" (U+0421). This makes designations typed in either
    alphabet compare equal, e.g. Latin "C-25" -> Cyrillic "С-25" and
    Latin "C25" -> Cyrillic "С25".

    Only the letter is swapped; the hyphen and the digits are preserved.
    Text that already uses the Cyrillic letter is returned unchanged, so the
    function is safe to apply more than once to the same value.

    Args:
        text: The value to normalize. Falsy values (``None``, ``""``) and
            non-string values are returned as-is.

    Returns:
        The normalized string, or ``text`` itself when it is falsy or not a
        string.
    """
    import re

    # Pass through anything we cannot (or need not) normalize instead of
    # raising: callers format the result into f-strings, where a non-string
    # value is still acceptable.
    if not text or not isinstance(text, str):
        return text

    # \b keeps us from touching a "C" in the middle of a word (e.g. "AC-25");
    # the lookahead requires a digit (optionally after "-") so a standalone
    # Latin "C" in ordinary text is left alone. The lookahead consumes
    # nothing, so the hyphen and digits stay exactly as written.
    return re.sub(r'\bC(?=-?\d)', '\u0421', text)
 def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
     headers = table_data.get('headers', [])
     section = table_data.get('section', '')
     table_num_clean = str(table_num).strip()
+    table_title_normalized = normalize_text(str(table_title))  # NORMALIZE TITLE
     import re
     if 'приложени' in section.lower():
     log_message(f"  📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
+    # Calculate base metadata size with NORMALIZED title
+    base_content = format_table_header(doc_id, table_identifier, table_num, table_title_normalized, section, headers)
     base_size = len(base_content)
     available_space = max_chars - base_size - 200
             'type': 'table',
             'document_id': doc_id,
             'table_number': table_num_clean,
+            'table_identifier': normalize_text(table_identifier),  # NORMALIZE identifier
+            'table_title': table_title_normalized,  # NORMALIZED
             'section': section,
             'total_rows': len(rows),
             'chunk_size': len(content),
                 'type': 'table',
                 'document_id': doc_id,
                 'table_number': table_num_clean,
+                'table_identifier': normalize_text(table_identifier),  # NORMALIZE
+                'table_title': table_title_normalized,  # NORMALIZED
                 'section': section,
                 'chunk_id': chunk_num,
                 'row_start': current_rows[0]['_idx'] - 1,
             'type': 'table',
             'document_id': doc_id,
             'table_number': table_num_clean,
+            'table_identifier': normalize_text(table_identifier),  # NORMALIZE
+            'table_title': table_title_normalized,  # NORMALIZED
             'section': section,
             'chunk_id': chunk_num,
             'row_start': current_rows[0]['_idx'] - 1,
     return chunks
+# MODIFIED: Update format_table_header function
 def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
+    content = f"ТАБЛИЦА {normalize_text(table_identifier)} из документа {doc_id}\n"
     # Add table type/number prominently for matching
     if table_num:
+        content += f"ТИП: {normalize_text(table_num)}\n"
     if table_title:
+        content += f"НАЗВАНИЕ: {normalize_text(table_title)}\n"
     if section:
         content += f"РАЗДЕЛ: {section}\n"