Spaces:

MrSimple01
/

RAG_AIEXP_001

Paused

App Files Community

MrSimple01 committed on Oct 15, 2025

Commit

abb0a7b

verified ·

1 Parent(s): 73cef4b

Update documents_prep.py

Browse files

Files changed (1) hide show

documents_prep.py +73 -27

documents_prep.py CHANGED Viewed

@@ -26,19 +26,21 @@ def normalize_steel_designations(text):
     """
     Convert Latin letters to Cyrillic in steel designations.
     Only applies to specific patterns to avoid changing legitimate Latin text.
     """
     if not text:
-        return text
     import re
-    # Pattern 1: Steel grades like 08X18H10T, 12X18H10T, etc.
-    # Format: digits + Latin letters (no spaces typically)
-    # Common steel designation pattern: [\d]+[XHTKBMCAP]+[\d]*[XHTKBMCAP]*
     def replace_in_steel_grade(match):
         """Replace Latin with Cyrillic only in steel grade context"""
         grade = match.group(0)
         # Mapping of Latin to Cyrillic for steel designations
         replacements = {
             'X': 'Х',  # Latin X -> Cyrillic Х (Kha)
@@ -53,6 +55,10 @@ def normalize_steel_designations(text):
         }
         for latin, cyrillic in replacements.items():
             grade = grade.replace(latin, cyrillic)
         return grade
     # Pattern for steel grades: digits followed by letters and more digits/letters
@@ -69,7 +75,7 @@ def normalize_steel_designations(text):
     text = re.sub(r'\b[C]-\d{1,2}\b',
                   lambda m: m.group(0).replace('C', 'С'), text)
-    return text
@@ -79,12 +85,23 @@ def chunk_text_documents(documents):
         chunk_overlap=CHUNK_OVERLAP
     )
     chunked = []
     for doc in documents:
         chunks = text_splitter.get_nodes_from_documents([doc])
         for i, chunk in enumerate(chunks):
             # Normalize steel designations in the chunk text
-            chunk.text = normalize_steel_designations(chunk.text)
             chunk.metadata.update({
                 'chunk_id': i,
@@ -100,6 +117,12 @@ def chunk_text_documents(documents):
         max_size = max(len(c.text) for c in chunked)
         log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
         log_message(f"  Size stats: avg={avg_size:.0f}, min={min_size}, max={max_size} chars")
     return chunked
@@ -113,13 +136,10 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
     sheet_name = table_data.get('sheet_name', '')
     # Apply steel designation normalization to title and section
-    table_title = normalize_steel_designations(str(table_title))
-    section = normalize_steel_designations(section)
     table_num_clean = str(table_num).strip()
-    table_title_normalized = normalize_text(str(table_title))
-    import re
     import re
@@ -156,17 +176,35 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
     # Normalize all row content (including steel designations)
     normalized_rows = []
     for row in rows:
         if isinstance(row, dict):
-            normalized_row = {k: normalize_steel_designations(str(v)) for k, v in row.items()}
             normalized_rows.append(normalized_row)
         else:
             normalized_rows.append(row)
-    # Calculate base metadata size with NORMALIZED title
     base_content = format_table_header(doc_id, table_identifier, table_num,
-                                       table_title_normalized, section, headers,
-                                       sheet_name)  # Pass sheet_name
     base_size = len(base_content)
     available_space = max_chars - base_size - 200
@@ -181,21 +219,20 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
             'type': 'table',
             'document_id': doc_id,
             'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
-            'table_identifier': normalize_text(table_identifier),
-            'table_title': table_title_normalized,
             'section': section,
-            'sheet_name': sheet_name,  # ADD THIS
             'total_rows': len(normalized_rows),
             'chunk_size': len(content),
             'is_complete_table': True,
-            # ADD SEARCHABLE KEYWORDS
             'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
         }
         log_message(f"    Single chunk: {len(content)} chars, {len(normalized_rows)} rows")
         return [Document(text=content, metadata=metadata)]
-    # Chunking logic continues with normalized_rows instead of rows...
     chunks = []
     current_rows = []
     current_size = 0
@@ -217,8 +254,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
                 'type': 'table',
                 'document_id': doc_id,
                 'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
-                'table_identifier': normalize_text(table_identifier),
-                'table_title': table_title_normalized,
                 'section': section,
                 'sheet_name': sheet_name,
                 'chunk_id': chunk_num,
@@ -252,8 +289,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
             'type': 'table',
             'document_id': doc_id,
             'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
-            'table_identifier': normalize_text(table_identifier),
-            'table_title': table_title_normalized,
             'section': section,
             'sheet_name': sheet_name,
             'chunk_id': chunk_num,
@@ -271,6 +308,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
     return chunks
 def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers, sheet_name=''):
     content = f"ТАБЛИЦА {normalize_text(table_identifier)} из документа {doc_id}\n"
@@ -510,11 +548,15 @@ def extract_sections_from_json(json_path):
 def load_table_documents(repo_id, hf_token, table_dir):
     log_message("Loading tables...")
     files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
     table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
     all_chunks = []
     for file_path in table_files:
         try:
             local_path = hf_hub_download(
@@ -531,15 +573,19 @@ def load_table_documents(repo_id, hf_token, table_dir):
             for sheet in data.get('sheets', []):
                 sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
-                # Use the consistent MAX_CHARS_TABLE from config
-                chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
                 all_chunks.extend(chunks)
         except Exception as e:
             log_message(f"Error loading {file_path}: {e}")
-    log_message(f"✓ Loaded {len(all_chunks)} table chunks")
     return all_chunks

     """
     Convert Latin letters to Cyrillic in steel designations.
     Only applies to specific patterns to avoid changing legitimate Latin text.
+    Returns: (normalized_text, changes_count)
     """
     if not text:
+        return text, 0
     import re
+    changes_count = 0
     def replace_in_steel_grade(match):
         """Replace Latin with Cyrillic only in steel grade context"""
+        nonlocal changes_count
         grade = match.group(0)
+        original_grade = grade
         # Mapping of Latin to Cyrillic for steel designations
         replacements = {
             'X': 'Х',  # Latin X -> Cyrillic Х (Kha)
         }
         for latin, cyrillic in replacements.items():
             grade = grade.replace(latin, cyrillic)
+        if grade != original_grade:
+            changes_count += 1
         return grade
     # Pattern for steel grades: digits followed by letters and more digits/letters
     text = re.sub(r'\b[C]-\d{1,2}\b',
                   lambda m: m.group(0).replace('C', 'С'), text)
+    return text, changes_count
         chunk_overlap=CHUNK_OVERLAP
     )
+    log_message("="*60)
+    log_message("NORMALIZING STEEL DESIGNATIONS IN TEXT CHUNKS")
+    total_normalizations = 0
+    chunks_with_changes = 0
     chunked = []
     for doc in documents:
         chunks = text_splitter.get_nodes_from_documents([doc])
         for i, chunk in enumerate(chunks):
             # Normalize steel designations in the chunk text
+            original_text = chunk.text
+            chunk.text, changes = normalize_steel_designations(chunk.text)
+            if changes > 0:
+                chunks_with_changes += 1
+                total_normalizations += changes
             chunk.metadata.update({
                 'chunk_id': i,
         max_size = max(len(c.text) for c in chunked)
         log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
         log_message(f"  Size stats: avg={avg_size:.0f}, min={min_size}, max={max_size} chars")
+        log_message(f"  Steel designation normalization:")
+        log_message(f"    - Chunks with changes: {chunks_with_changes}/{len(chunked)}")
+        log_message(f"    - Total steel grades normalized: {total_normalizations}")
+        log_message(f"    - Avg per affected chunk: {total_normalizations/chunks_with_changes:.1f}" if chunks_with_changes > 0 else "    - No normalizations needed")
+    log_message("="*60)
     return chunked
     sheet_name = table_data.get('sheet_name', '')
     # Apply steel designation normalization to title and section
+    table_title, title_changes = normalize_steel_designations(str(table_title))
+    section, section_changes = normalize_steel_designations(section)
     table_num_clean = str(table_num).strip()
     import re
     # Normalize all row content (including steel designations)
     normalized_rows = []
+    total_row_changes = 0
+    rows_with_changes = 0
     for row in rows:
         if isinstance(row, dict):
+            normalized_row = {}
+            row_had_changes = False
+            for k, v in row.items():
+                normalized_val, changes = normalize_steel_designations(str(v))
+                normalized_row[k] = normalized_val
+                if changes > 0:
+                    total_row_changes += changes
+                    row_had_changes = True
+            if row_had_changes:
+                rows_with_changes += 1
             normalized_rows.append(normalized_row)
         else:
             normalized_rows.append(row)
+    # Log normalization stats for this table
+    if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
+        log_message(f"    Steel normalization: title={title_changes}, section={section_changes}, "
+                   f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")
+    # Continue with rest of existing logic using normalized_rows...
+    # Calculate base metadata size
     base_content = format_table_header(doc_id, table_identifier, table_num,
+                                       table_title, section, headers,
+                                       sheet_name)
     base_size = len(base_content)
     available_space = max_chars - base_size - 200
             'type': 'table',
             'document_id': doc_id,
             'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
+            'table_identifier': table_identifier,
+            'table_title': table_title,
             'section': section,
+            'sheet_name': sheet_name,
             'total_rows': len(normalized_rows),
             'chunk_size': len(content),
             'is_complete_table': True,
             'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
         }
         log_message(f"    Single chunk: {len(content)} chars, {len(normalized_rows)} rows")
         return [Document(text=content, metadata=metadata)]
+    # Chunking logic continues...
     chunks = []
     current_rows = []
     current_size = 0
                 'type': 'table',
                 'document_id': doc_id,
                 'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
+                'table_identifier': table_identifier,
+                'table_title': table_title,
                 'section': section,
                 'sheet_name': sheet_name,
                 'chunk_id': chunk_num,
             'type': 'table',
             'document_id': doc_id,
             'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
+            'table_identifier': table_identifier,
+            'table_title': table_title,
             'section': section,
             'sheet_name': sheet_name,
             'chunk_id': chunk_num,
     return chunks
 def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers, sheet_name=''):
     content = f"ТАБЛИЦА {normalize_text(table_identifier)} из документа {doc_id}\n"
 def load_table_documents(repo_id, hf_token, table_dir):
     log_message("Loading tables...")
+    log_message("="*60)
+    log_message("NORMALIZING STEEL DESIGNATIONS IN TABLES")
     files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
     table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
     all_chunks = []
+    tables_processed = 0
     for file_path in table_files:
         try:
             local_path = hf_hub_download(
             for sheet in data.get('sheets', []):
                 sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
+                tables_processed += 1
+                chunks = chunk_table_by_content(sheet, sheet_doc_id,
+                                               max_chars=MAX_CHARS_TABLE,
+                                               max_rows=MAX_ROWS_TABLE)
                 all_chunks.extend(chunks)
         except Exception as e:
             log_message(f"Error loading {file_path}: {e}")
+    log_message(f"✓ Loaded {len(all_chunks)} table chunks from {tables_processed} tables")
+    log_message("="*60)
     return all_chunks