MrSimple01 committed on
Commit
a2e9ee2
·
verified ·
1 Parent(s): fbc8fb0

Update documents_prep.py

Browse files
Files changed (1) hide show
  1. documents_prep.py +663 -512
documents_prep.py CHANGED
@@ -1,513 +1,664 @@
1
- import json
2
- import zipfile
3
- import pandas as pd
4
- from huggingface_hub import hf_hub_download, list_repo_files
5
- from llama_index.core import Document
6
- from llama_index.core.text_splitter import SentenceSplitter
7
- from my_logging import log_message
8
- from config import CHUNK_SIZE, CHUNK_OVERLAP, MAX_CHARS_TABLE, MAX_ROWS_TABLE
9
-
10
- def chunk_text_documents(documents):
11
- text_splitter = SentenceSplitter(
12
- chunk_size=CHUNK_SIZE,
13
- chunk_overlap=CHUNK_OVERLAP
14
- )
15
-
16
- chunked = []
17
- for doc in documents:
18
- chunks = text_splitter.get_nodes_from_documents([doc])
19
- for i, chunk in enumerate(chunks):
20
- chunk.metadata.update({
21
- 'chunk_id': i,
22
- 'total_chunks': len(chunks),
23
- 'chunk_size': len(chunk.text) # Add chunk size
24
- })
25
- chunked.append(chunk)
26
-
27
- # Log statistics
28
- if chunked:
29
- avg_size = sum(len(c.text) for c in chunked) / len(chunked)
30
- min_size = min(len(c.text) for c in chunked)
31
- max_size = max(len(c.text) for c in chunked)
32
- log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
33
- log_message(f" Size stats: avg={avg_size:.0f}, min={min_size}, max={max_size} chars")
34
-
35
- return chunked
36
-
37
- def normalize_text(text):
38
- if not text:
39
- return text
40
-
41
- # Replace Cyrillic 'C' with Latin 'С' (U+0421)
42
- # This is for welding types like C-25 -> С-25
43
- text = text.replace('С-', 'C')
44
-
45
- # Also handle cases like "Type C" or variations
46
- import re
47
- # Match "C" followed by digit or space in context of welding types
48
- text = re.sub(r'\bС(\d)', r'С\1', text)
49
-
50
- return text
51
-
52
- def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
53
- headers = table_data.get('headers', [])
54
- rows = table_data.get('data', [])
55
- table_num = table_data.get('table_number', 'unknown')
56
- table_title = table_data.get('table_title', '')
57
- section = table_data.get('section', '')
58
-
59
- table_num_clean = str(table_num).strip()
60
- table_title_normalized = normalize_text(str(table_title)) # NORMALIZE TITLE
61
-
62
- import re
63
- if 'приложени' in section.lower():
64
- appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
65
- if appendix_match:
66
- appendix_num = appendix_match.group(1).upper()
67
- table_identifier = f"{table_num_clean} Приложение {appendix_num}"
68
- else:
69
- table_identifier = table_num_clean
70
- else:
71
- table_identifier = table_num_clean
72
-
73
- if not rows:
74
- return []
75
-
76
- log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
77
-
78
- # Calculate base metadata size with NORMALIZED title
79
- base_content = format_table_header(doc_id, table_identifier, table_num, table_title_normalized, section, headers)
80
- base_size = len(base_content)
81
- available_space = max_chars - base_size - 200
82
-
83
- # If entire table fits, return as one chunk
84
- full_rows_content = format_table_rows([{**row, '_idx': i+1} for i, row in enumerate(rows)])
85
- if base_size + len(full_rows_content) <= max_chars and len(rows) <= max_rows:
86
- content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
87
-
88
- metadata = {
89
- 'type': 'table',
90
- 'document_id': doc_id,
91
- 'table_number': table_num_clean,
92
- 'table_identifier': normalize_text(table_identifier), # NORMALIZE identifier
93
- 'table_title': table_title_normalized, # NORMALIZED
94
- 'section': section,
95
- 'total_rows': len(rows),
96
- 'chunk_size': len(content),
97
- 'is_complete_table': True
98
- }
99
-
100
- log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
101
- return [Document(text=content, metadata=metadata)]
102
-
103
- chunks = []
104
- current_rows = []
105
- current_size = 0
106
- chunk_num = 0
107
-
108
- for i, row in enumerate(rows):
109
- row_text = format_single_row(row, i + 1)
110
- row_size = len(row_text)
111
-
112
- should_split = (current_size + row_size > available_space or len(current_rows) >= max_rows) and current_rows
113
-
114
- if should_split:
115
- content = base_content + format_table_rows(current_rows)
116
- content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
117
- content += format_table_footer(table_identifier, doc_id)
118
-
119
- metadata = {
120
- 'type': 'table',
121
- 'document_id': doc_id,
122
- 'table_number': table_num_clean,
123
- 'table_identifier': normalize_text(table_identifier), # NORMALIZE
124
- 'table_title': table_title_normalized, # NORMALIZED
125
- 'section': section,
126
- 'chunk_id': chunk_num,
127
- 'row_start': current_rows[0]['_idx'] - 1,
128
- 'row_end': current_rows[-1]['_idx'],
129
- 'total_rows': len(rows),
130
- 'chunk_size': len(content),
131
- 'is_complete_table': False
132
- }
133
-
134
- chunks.append(Document(text=content, metadata=metadata))
135
- log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, {len(current_rows)} rows")
136
-
137
- chunk_num += 1
138
- current_rows = []
139
- current_size = 0
140
-
141
- # Add row with index
142
- row_copy = row.copy() if isinstance(row, dict) else {'data': row}
143
- row_copy['_idx'] = i + 1
144
- current_rows.append(row_copy)
145
- current_size += row_size
146
-
147
- # Add final chunk
148
- if current_rows:
149
- content = base_content + format_table_rows(current_rows)
150
- content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
151
- content += format_table_footer(table_identifier, doc_id)
152
-
153
- metadata = {
154
- 'type': 'table',
155
- 'document_id': doc_id,
156
- 'table_number': table_num_clean,
157
- 'table_identifier': normalize_text(table_identifier), # NORMALIZE
158
- 'table_title': table_title_normalized, # NORMALIZED
159
- 'section': section,
160
- 'chunk_id': chunk_num,
161
- 'row_start': current_rows[0]['_idx'] - 1,
162
- 'row_end': current_rows[-1]['_idx'],
163
- 'total_rows': len(rows),
164
- 'chunk_size': len(content),
165
- 'is_complete_table': False
166
- }
167
-
168
- chunks.append(Document(text=content, metadata=metadata))
169
- log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, {len(current_rows)} rows")
170
-
171
- return chunks
172
-
173
-
174
- # MODIFIED: Update format_table_header function
175
- def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
176
- content = f"ТАБЛИЦА {normalize_text(table_identifier)} из документа {doc_id}\n"
177
-
178
- # Add table type/number prominently for matching
179
- if table_num:
180
- content += f"ТИП: {normalize_text(table_num)}\n"
181
-
182
- if table_title:
183
- content += f"НАЗВАНИЕ: {normalize_text(table_title)}\n"
184
-
185
- if section:
186
- content += f"РАЗДЕЛ: {section}\n"
187
-
188
- content += f"{'='*70}\n"
189
-
190
- if headers:
191
- header_str = ' | '.join(str(h) for h in headers)
192
- content += f"ЗАГОЛОВКИ: {header_str}\n\n"
193
-
194
- content += "ДАННЫЕ:\n"
195
- return content
196
-
197
-
198
- def format_single_row(row, idx):
199
- """Format a single row"""
200
- if isinstance(row, dict):
201
- parts = [f"{k}: {v}" for k, v in row.items()
202
- if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
203
- if parts:
204
- return f"{idx}. {' | '.join(parts)}\n"
205
- elif isinstance(row, list):
206
- parts = [str(v) for v in row if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
207
- if parts:
208
- return f"{idx}. {' | '.join(parts)}\n"
209
- return ""
210
-
211
-
212
- def format_table_rows(rows):
213
- """Format multiple rows"""
214
- content = ""
215
- for row in rows:
216
- idx = row.get('_idx', 0)
217
- content += format_single_row(row, idx)
218
- return content
219
-
220
-
221
- def format_table_footer(table_identifier, doc_id):
222
- """Format table footer"""
223
- return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
224
-
225
- def load_json_documents(repo_id, hf_token, json_dir):
226
- import zipfile
227
- import tempfile
228
- import os
229
-
230
- log_message("Loading JSON documents...")
231
-
232
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
233
- json_files = [f for f in files if f.startswith(json_dir) and f.endswith('.json')]
234
- zip_files = [f for f in files if f.startswith(json_dir) and f.endswith('.zip')]
235
-
236
- log_message(f"Found {len(json_files)} JSON files and {len(zip_files)} ZIP files")
237
-
238
- documents = []
239
- stats = {'success': 0, 'failed': 0, 'empty': 0}
240
-
241
- for file_path in json_files:
242
- try:
243
- log_message(f" Loading: {file_path}")
244
- local_path = hf_hub_download(
245
- repo_id=repo_id,
246
- filename=file_path,
247
- repo_type="dataset",
248
- token=hf_token
249
- )
250
-
251
- docs = extract_sections_from_json(local_path)
252
- if docs:
253
- documents.extend(docs)
254
- stats['success'] += 1
255
- log_message(f" ✓ Extracted {len(docs)} sections")
256
- else:
257
- stats['empty'] += 1
258
- log_message(f" ⚠ No sections found")
259
-
260
- except Exception as e:
261
- stats['failed'] += 1
262
- log_message(f" ✗ Error: {e}")
263
-
264
- for zip_path in zip_files:
265
- try:
266
- log_message(f" Processing ZIP: {zip_path}")
267
- local_zip = hf_hub_download(
268
- repo_id=repo_id,
269
- filename=zip_path,
270
- repo_type="dataset",
271
- token=hf_token
272
- )
273
-
274
- with zipfile.ZipFile(local_zip, 'r') as zf:
275
- json_files_in_zip = [f for f in zf.namelist()
276
- if f.endswith('.json')
277
- and not f.startswith('__MACOSX')
278
- and not f.startswith('.')
279
- and not '._' in f]
280
-
281
- log_message(f" Found {len(json_files_in_zip)} JSON files in ZIP")
282
-
283
- for json_file in json_files_in_zip:
284
- try:
285
- file_content = zf.read(json_file)
286
-
287
- # Skip if file is too small
288
- if len(file_content) < 10:
289
- log_message(f" ✗ Skipping: {json_file} (file too small)")
290
- stats['failed'] += 1
291
- continue
292
-
293
- # Try UTF-8 first (most common)
294
- try:
295
- text_content = file_content.decode('utf-8')
296
- except UnicodeDecodeError:
297
- try:
298
- text_content = file_content.decode('utf-8-sig')
299
- except UnicodeDecodeError:
300
- try:
301
- # Try UTF-16 (the issue you're seeing)
302
- text_content = file_content.decode('utf-16')
303
- except UnicodeDecodeError:
304
- try:
305
- text_content = file_content.decode('windows-1251')
306
- except UnicodeDecodeError:
307
- log_message(f" ✗ Skipping: {json_file} (encoding failed)")
308
- stats['failed'] += 1
309
- continue
310
-
311
- # Validate JSON structure
312
- if not text_content.strip().startswith('{') and not text_content.strip().startswith('['):
313
- log_message(f" ✗ Skipping: {json_file} (not valid JSON)")
314
- stats['failed'] += 1
315
- continue
316
-
317
- with tempfile.NamedTemporaryFile(mode='w', delete=False,
318
- suffix='.json', encoding='utf-8') as tmp:
319
- tmp.write(text_content)
320
- tmp_path = tmp.name
321
-
322
- docs = extract_sections_from_json(tmp_path)
323
- if docs:
324
- documents.extend(docs)
325
- stats['success'] += 1
326
- log_message(f" ✓ {json_file}: {len(docs)} sections")
327
- else:
328
- stats['empty'] += 1
329
- log_message(f" ⚠ {json_file}: No sections")
330
-
331
- os.unlink(tmp_path)
332
-
333
- except json.JSONDecodeError as e:
334
- stats['failed'] += 1
335
- log_message(f" ✗ {json_file}: Invalid JSON")
336
- except Exception as e:
337
- stats['failed'] += 1
338
- log_message(f" ✗ {json_file}: {str(e)[:100]}")
339
-
340
- except Exception as e:
341
- log_message(f" ✗ Error with ZIP: {e}")
342
-
343
- log_message(f"="*60)
344
- log_message(f"JSON Loading Stats:")
345
- log_message(f" Success: {stats['success']}")
346
- log_message(f" Empty: {stats['empty']}")
347
- log_message(f" Failed: {stats['failed']}")
348
- log_message(f" Total sections: {len(documents)}")
349
- log_message(f"="*60)
350
-
351
- return documents
352
-
353
- def extract_sections_from_json(json_path):
354
- """Extract sections from a single JSON file"""
355
- documents = []
356
-
357
- try:
358
- with open(json_path, 'r', encoding='utf-8') as f:
359
- data = json.load(f)
360
-
361
- doc_id = data.get('document_metadata', {}).get('document_id', 'unknown')
362
-
363
- # Extract all section levels
364
- for section in data.get('sections', []):
365
- if section.get('section_text', '').strip():
366
- documents.append(Document(
367
- text=section['section_text'],
368
- metadata={
369
- 'type': 'text',
370
- 'document_id': doc_id,
371
- 'section_id': section.get('section_id', '')
372
- }
373
- ))
374
-
375
- # Subsections
376
- for subsection in section.get('subsections', []):
377
- if subsection.get('subsection_text', '').strip():
378
- documents.append(Document(
379
- text=subsection['subsection_text'],
380
- metadata={
381
- 'type': 'text',
382
- 'document_id': doc_id,
383
- 'section_id': subsection.get('subsection_id', '')
384
- }
385
- ))
386
-
387
- # Sub-subsections
388
- for sub_sub in subsection.get('sub_subsections', []):
389
- if sub_sub.get('sub_subsection_text', '').strip():
390
- documents.append(Document(
391
- text=sub_sub['sub_subsection_text'],
392
- metadata={
393
- 'type': 'text',
394
- 'document_id': doc_id,
395
- 'section_id': sub_sub.get('sub_subsection_id', '')
396
- }
397
- ))
398
-
399
- except Exception as e:
400
- log_message(f"Error extracting from {json_path}: {e}")
401
-
402
- return documents
403
-
404
-
405
- def load_table_documents(repo_id, hf_token, table_dir):
406
- log_message("Loading tables...")
407
-
408
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
409
- table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
410
-
411
- all_chunks = []
412
- for file_path in table_files:
413
- try:
414
- local_path = hf_hub_download(
415
- repo_id=repo_id,
416
- filename=file_path,
417
- repo_type="dataset",
418
- token=hf_token
419
- )
420
-
421
- with open(local_path, 'r', encoding='utf-8') as f:
422
- data = json.load(f)
423
-
424
- file_doc_id = data.get('document_id', data.get('document', 'unknown'))
425
-
426
- for sheet in data.get('sheets', []):
427
- sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
428
-
429
- # Use the consistent MAX_CHARS_TABLE from config
430
- chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
431
- all_chunks.extend(chunks)
432
-
433
- except Exception as e:
434
- log_message(f"Error loading {file_path}: {e}")
435
-
436
- log_message(f"✓ Loaded {len(all_chunks)} table chunks")
437
- return all_chunks
438
-
439
-
440
- def load_image_documents(repo_id, hf_token, image_dir):
441
- """Load image descriptions"""
442
- log_message("Loading images...")
443
-
444
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
445
- csv_files = [f for f in files if f.startswith(image_dir) and f.endswith('.csv')]
446
-
447
- documents = []
448
- for file_path in csv_files:
449
- try:
450
- local_path = hf_hub_download(
451
- repo_id=repo_id,
452
- filename=file_path,
453
- repo_type="dataset",
454
- token=hf_token
455
- )
456
-
457
- df = pd.read_csv(local_path)
458
-
459
- for _, row in df.iterrows():
460
- content = f"Документ: {row.get('Обозначение документа', 'unknown')}\n"
461
- content += f"Рисунок: {row.get('№ Изображения', 'unknown')}\n"
462
- content += f"Название: {row.get('Название изображения', '')}\n"
463
- content += f"Описание: {row.get('Описание изображение', '')}\n"
464
- content += f"Раздел: {row.get('Раздел документа', '')}\n"
465
-
466
- chunk_size = len(content)
467
-
468
- documents.append(Document(
469
- text=content,
470
- metadata={
471
- 'type': 'image',
472
- 'document_id': str(row.get('Обозначение документа', 'unknown')),
473
- 'image_number': str(row.get('№ Изображения', 'unknown')),
474
- 'section': str(row.get('Раздел документа', '')),
475
- 'chunk_size': chunk_size
476
- }
477
- ))
478
- except Exception as e:
479
- log_message(f"Error loading {file_path}: {e}")
480
-
481
- if documents:
482
- avg_size = sum(d.metadata['chunk_size'] for d in documents) / len(documents)
483
- log_message(f"✓ Loaded {len(documents)} images (avg size: {avg_size:.0f} chars)")
484
-
485
- return documents
486
-
487
-
488
- def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
489
- """Main loader - combines all document types"""
490
- log_message("="*60)
491
- log_message("STARTING DOCUMENT LOADING")
492
- log_message("="*60)
493
-
494
- # Load text sections
495
- text_docs = load_json_documents(repo_id, hf_token, json_dir)
496
- text_chunks = chunk_text_documents(text_docs)
497
-
498
- # Load tables (already chunked)
499
- table_chunks = load_table_documents(repo_id, hf_token, table_dir)
500
-
501
- # Load images (no chunking needed)
502
- image_docs = load_image_documents(repo_id, hf_token, image_dir)
503
-
504
- all_docs = text_chunks + table_chunks + image_docs
505
-
506
- log_message("="*60)
507
- log_message(f"TOTAL DOCUMENTS: {len(all_docs)}")
508
- log_message(f" Text chunks: {len(text_chunks)}")
509
- log_message(f" Table chunks: {len(table_chunks)}")
510
- log_message(f" Images: {len(image_docs)}")
511
- log_message("="*60)
512
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
513
  return all_docs
 
1
+ import json
2
+ import zipfile
3
+ import pandas as pd
4
+ from huggingface_hub import hf_hub_download, list_repo_files
5
+ from llama_index.core import Document
6
+ from llama_index.core.text_splitter import SentenceSplitter
7
+ from my_logging import log_message
8
+ from config import CHUNK_SIZE, CHUNK_OVERLAP, MAX_CHARS_TABLE, MAX_ROWS_TABLE
9
+
10
def normalize_text(text):
    """Normalize Latin letters in welding-type designations to Cyrillic.

    Welding types such as "C-25" appear with a Latin "C" in some sources but
    with Cyrillic "С" (U+0421) elsewhere; retrieval needs one canonical form,
    and the rest of this module (see steel-grade normalization) canonicalizes
    toward Cyrillic.

    Args:
        text: String to normalize. Falsy values are returned unchanged.

    Returns:
        The string with Latin "C" converted to Cyrillic "С" where it prefixes
        a hyphenated or numeric type code (e.g. "C-25" -> "С-25").
    """
    if not text:
        return text

    # Latin "C-" -> Cyrillic "С-".  The previous code did the reverse
    # (Cyrillic -> Latin) AND dropped the hyphen ("С-25" became "C25"),
    # contradicting the documented intent "C-25 -> С-25".
    text = text.replace('C-', '\u0421-')

    import re
    # Latin "C" immediately followed by a digit (e.g. "C25") -> Cyrillic.
    # The previous pattern matched Cyrillic "С" and replaced it with itself,
    # making it a no-op.
    text = re.sub(r'\bC(\d)', '\u0421\\1', text)

    return text
24
+
25
+ import re
26
+
27
def normalize_steel_designations(text):
    """Normalize steel-grade designations by converting Latin letters to Cyrillic.

    Handles patterns like 08X18H10T, 12X18H9, 10H17N13M2T, etc., where OCR or
    transliteration produced Latin look-alike letters inside a Russian steel
    grade (e.g. Latin "X" for Cyrillic "Х").

    Args:
        text: Input string. Falsy values are returned as ``(text, 0, [])``.

    Returns:
        Tuple ``(normalized_text, changes_count, changes_list)`` where
        ``changes_list`` holds "original converted" pairs for logging.
    """
    if not text:
        return text, 0, []

    changes_count = 0
    changes_list = []

    # Latin -> Cyrillic look-alike mapping for characters occurring in Russian
    # steel grades.  'N' is included because grades such as 10H17N13M2T
    # (10Х17Н13М2Т) transliterate Cyrillic "Н" as either "H" or "N"; the
    # original map omitted 'N' and produced mixed-script output despite the
    # docstring's own example.
    replacements = {
        'X': 'Х',
        'H': 'Н',
        'N': 'Н',
        'T': 'Т',
        'C': 'С',
        'B': 'В',
        'K': 'К',
        'M': 'М',
        'A': 'А',
        'P': 'Р',
    }

    # Grade shape: 1-3 leading digits, then one or more letter(+digits) groups,
    # e.g. 08X18H10T.  \b anchors prevent matching inside longer words.
    pattern = r'\b\d{1,3}(?:[A-ZА-Я]\d*)+\b'

    def replace_in_steel_grade(match):
        # Convert each character of the matched grade; record a change entry
        # only when the conversion actually altered the text.
        nonlocal changes_count, changes_list
        original = match.group(0)
        converted = ''.join(replacements.get(ch, ch) for ch in original)
        if converted != original:
            changes_count += 1
            changes_list.append(f"{original} {converted}")
        return converted

    normalized_text = re.sub(pattern, replace_in_steel_grade, text)

    return normalized_text, changes_count, changes_list
69
+
70
+
71
+
72
def chunk_text_documents(documents):
    """Split text documents into sentence-based chunks and normalize steel grades.

    Each chunk's text is passed through normalize_steel_designations() so that
    Latin look-alike letters in steel grades become Cyrillic before indexing.
    Chunk metadata gains 'chunk_id', 'total_chunks' and 'chunk_size'.

    Args:
        documents: Iterable of llama_index Document objects.

    Returns:
        List of chunk nodes with normalized text and updated metadata.
    """
    text_splitter = SentenceSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )

    log_message("="*60)
    log_message("NORMALIZING STEEL DESIGNATIONS IN TEXT CHUNKS")

    total_normalizations = 0
    chunks_with_changes = 0

    chunked = []
    for doc in documents:
        chunks = text_splitter.get_nodes_from_documents([doc])
        for i, chunk in enumerate(chunks):
            # Normalize steel designations in the chunk text
            # NOTE(review): original_text is captured but never used; presumably
            # kept for debugging — confirm before removing.
            original_text = chunk.text
            chunk.text, changes, change_list = normalize_steel_designations(chunk.text)  # FIX: 3 values

            if changes > 0:
                chunks_with_changes += 1
                total_normalizations += changes

            # chunk_size is recomputed AFTER normalization so it reflects the
            # stored text.
            chunk.metadata.update({
                'chunk_id': i,
                'total_chunks': len(chunks),
                'chunk_size': len(chunk.text)
            })
            chunked.append(chunk)

    # Log statistics
    if chunked:
        avg_size = sum(len(c.text) for c in chunked) / len(chunked)
        min_size = min(len(c.text) for c in chunked)
        max_size = max(len(c.text) for c in chunked)
        log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
        log_message(f" Size stats: avg={avg_size:.0f}, min={min_size}, max={max_size} chars")
        log_message(f" Steel designation normalization:")
        log_message(f" - Chunks with changes: {chunks_with_changes}/{len(chunked)}")
        log_message(f" - Total steel grades normalized: {total_normalizations}")
        # Conditional expression guards against division by zero when no chunk changed.
        log_message(f" - Avg per affected chunk: {total_normalizations/chunks_with_changes:.1f}" if chunks_with_changes > 0 else " - No normalizations needed")

    log_message("="*60)

    return chunked
118
+
119
+
120
def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
    """Convert one table (a 'sheet' dict) into retrieval-ready Document chunks.

    Steel-grade designations in the title, section and every cell are
    normalized to Cyrillic first.  A table identifier is derived from the
    table number, or (when the number is missing) from the appendix name,
    title or section.  If the whole table fits in max_chars/max_rows it
    becomes one chunk; otherwise rows are split across chunks, each chunk
    repeating the table header and footer for standalone context.

    Args:
        table_data: Dict with optional 'headers', 'data', 'table_number',
            'table_title', 'section' and 'sheet_name' keys.
        doc_id: Identifier of the source document.
        max_chars: Per-chunk character budget.
        max_rows: Per-chunk row budget.

    Returns:
        List of llama_index Document objects; [] when the table has no rows.
    """
    headers = table_data.get('headers', [])
    rows = table_data.get('data', [])
    table_num = table_data.get('table_number', 'unknown')
    table_title = table_data.get('table_title', '')
    section = table_data.get('section', '')
    sheet_name = table_data.get('sheet_name', '')

    # Apply steel designation normalization to title and section
    # NOTE(review): section is passed without str() — assumes the JSON always
    # stores it as a string; confirm against the extraction pipeline.
    table_title, title_changes, title_list = normalize_steel_designations(str(table_title))
    section, section_changes, section_list = normalize_steel_designations(section)

    table_num_clean = str(table_num).strip()

    import re

    # Derive a human-meaningful identifier when the table number is absent:
    # prefer "Приложение N", then the first words of the title, then the
    # section prefix.
    if table_num_clean in ['-', '', 'unknown', 'nan']:
        if 'приложени' in sheet_name.lower() or 'приложени' in section.lower():
            appendix_match = re.search(r'приложени[еия]\s*[№]?\s*(\d+)',
                                       (sheet_name + ' ' + section).lower())
            if appendix_match:
                appendix_num = appendix_match.group(1)
                table_identifier = f"Приложение {appendix_num}"
            else:
                table_identifier = "Приложение"
        else:
            if table_title:
                first_words = ' '.join(table_title.split()[:5])
                table_identifier = f"{first_words}"
            else:
                table_identifier = section.split(',')[0] if section else "БезНомера"
    else:
        if 'приложени' in section.lower():
            appendix_match = re.search(r'приложени[еия]\s*[№]?\s*(\d+)', section.lower())
            if appendix_match:
                appendix_num = appendix_match.group(1)
                table_identifier = f"{table_num_clean} Приложение {appendix_num}"
            else:
                table_identifier = table_num_clean
        else:
            table_identifier = table_num_clean

    if not rows:
        return []

    log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")

    # Normalize all row content (including steel designations)
    normalized_rows = []
    total_row_changes = 0
    rows_with_changes = 0
    all_row_changes = []  # NEW

    for row in rows:
        if isinstance(row, dict):
            normalized_row = {}
            row_had_changes = False
            for k, v in row.items():
                normalized_val, changes, change_list = normalize_steel_designations(str(v))
                normalized_row[k] = normalized_val
                if changes > 0:
                    total_row_changes += changes
                    row_had_changes = True
                    all_row_changes.extend(change_list)  # NEW
            if row_had_changes:
                rows_with_changes += 1
            normalized_rows.append(normalized_row)
        else:
            # Non-dict rows (e.g. plain lists) pass through unnormalized.
            normalized_rows.append(row)

    # Log normalization stats with examples
    if total_row_changes > 0 or title_changes > 0 or section_changes > 0:
        log_message(f" Steel normalization: title={title_changes}, section={section_changes}, "
                    f"rows={rows_with_changes}/{len(rows)} ({total_row_changes} total)")

        # NEW: Show examples of what changed
        if title_list:
            log_message(f" Title changes: {', '.join(title_list[:3])}")
        if section_list:
            log_message(f" Section changes: {', '.join(section_list[:3])}")
        if all_row_changes:
            log_message(f" Row examples: {', '.join(all_row_changes[:5])}")
    # Continue with rest of existing logic using normalized_rows...
    # Calculate base metadata size
    base_content = format_table_header(doc_id, table_identifier, table_num,
                                       table_title, section, headers,
                                       sheet_name)
    base_size = len(base_content)
    # Reserve 200 chars of headroom for the "Строки X-Y из N" line and footer.
    available_space = max_chars - base_size - 200

    # If entire table fits, return as one chunk
    full_rows_content = format_table_rows([{**row, '_idx': i+1}
                                           for i, row in enumerate(normalized_rows)])

    if base_size + len(full_rows_content) <= max_chars and len(normalized_rows) <= max_rows:
        content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)

        metadata = {
            'type': 'table',
            'document_id': doc_id,
            'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
            'table_identifier': table_identifier,
            'table_title': table_title,
            'section': section,
            'sheet_name': sheet_name,
            'total_rows': len(normalized_rows),
            'chunk_size': len(content),
            'is_complete_table': True,
            'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
        }

        log_message(f" Single chunk: {len(content)} chars, {len(normalized_rows)} rows")
        return [Document(text=content, metadata=metadata)]

    # Chunking logic continues...
    chunks = []
    current_rows = []
    current_size = 0
    chunk_num = 0

    for i, row in enumerate(normalized_rows):
        row_text = format_single_row(row, i + 1)
        row_size = len(row_text)

        # Flush the current chunk BEFORE adding this row if either budget
        # would be exceeded (and we have at least one buffered row).
        should_split = (current_size + row_size > available_space or
                        len(current_rows) >= max_rows) and current_rows

        if should_split:
            content = base_content + format_table_rows(current_rows)
            content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(normalized_rows)}\n"
            content += format_table_footer(table_identifier, doc_id)

            metadata = {
                'type': 'table',
                'document_id': doc_id,
                'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
                'table_identifier': table_identifier,
                'table_title': table_title,
                'section': section,
                'sheet_name': sheet_name,
                'chunk_id': chunk_num,
                'row_start': current_rows[0]['_idx'] - 1,
                'row_end': current_rows[-1]['_idx'],
                'total_rows': len(normalized_rows),
                'chunk_size': len(content),
                'is_complete_table': False,
                'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
            }

            chunks.append(Document(text=content, metadata=metadata))
            log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, {len(current_rows)} rows")

            chunk_num += 1
            current_rows = []
            current_size = 0

        # '_idx' is the 1-based row number within the full table; it is also
        # rendered into the chunk text by format_single_row.
        row_copy = row.copy() if isinstance(row, dict) else {'data': row}
        row_copy['_idx'] = i + 1
        current_rows.append(row_copy)
        current_size += row_size

    # Final chunk
    if current_rows:
        content = base_content + format_table_rows(current_rows)
        content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(normalized_rows)}\n"
        content += format_table_footer(table_identifier, doc_id)

        metadata = {
            'type': 'table',
            'document_id': doc_id,
            'table_number': table_num_clean if table_num_clean not in ['-', 'unknown'] else table_identifier,
            'table_identifier': table_identifier,
            'table_title': table_title,
            'section': section,
            'sheet_name': sheet_name,
            'chunk_id': chunk_num,
            'row_start': current_rows[0]['_idx'] - 1,
            'row_end': current_rows[-1]['_idx'],
            'total_rows': len(normalized_rows),
            'chunk_size': len(content),
            'is_complete_table': False,
            'keywords': f"{doc_id} {table_identifier} {table_title} {section} сталь материал"
        }

        chunks.append(Document(text=content, metadata=metadata))
        log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, {len(current_rows)} rows")

    return chunks
308
+
309
+
310
+
311
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers, sheet_name=''):
    """Build the header text that precedes a table's data rows.

    Emits the table identifier plus several searchable fields (number, sheet,
    title, section, keyword line), a separator, and the normalized column
    headers, ending with the "ДАННЫЕ:" marker.
    """
    lines = [f"ТАБЛИЦА {normalize_text(table_identifier)} из документа {doc_id}\n"]

    # Multiple searchable identifiers improve retrieval recall.
    if table_num and table_num not in ['-', 'unknown']:
        lines.append(f"НОМЕР ТАБЛИЦЫ: {normalize_text(table_num)}\n")

    if sheet_name:
        lines.append(f"ЛИСТ: {sheet_name}\n")

    if table_title:
        lines.append(f"НАЗВАНИЕ: {normalize_text(table_title)}\n")

    if section:
        lines.append(f"РАЗДЕЛ: {section}\n")

    # Fixed keyword line plus document id for better matching.
    lines.append(f"КЛЮЧЕВЫЕ СЛОВА: материалы стали марки стандарты {doc_id}\n")
    lines.append(f"{'='*70}\n")

    if headers:
        # Column headers are normalized the same way as the other fields.
        joined = ' | '.join(normalize_text(str(col)) for col in headers)
        lines.append(f"ЗАГОЛОВКИ: {joined}\n\n")

    lines.append("ДАННЫЕ:\n")
    return ''.join(lines)
340
+
341
def format_single_row(row, idx):
    """Render one table row as a numbered ' | '-joined line, or '' if empty.

    Dict rows render as "key: value" cells, list rows as plain values; cells
    that are falsy, whitespace-only, or read 'nan'/'none' are dropped.  Any
    other row type yields the empty string.
    """
    def _keep(value):
        return bool(value) and bool(str(value).strip()) \
            and str(value).lower() not in ('nan', 'none', '')

    if isinstance(row, dict):
        cells = [f"{key}: {val}" for key, val in row.items() if _keep(val)]
    elif isinstance(row, list):
        cells = [str(val) for val in row if _keep(val)]
    else:
        return ""

    return f"{idx}. {' | '.join(cells)}\n" if cells else ""
353
+
354
+
355
def format_table_rows(rows):
    """Concatenate the rendered line for every row, keyed by its '_idx'.

    Each row dict is expected to carry a '_idx' entry (1-based row number);
    rows missing it render with index 0.
    """
    rendered = [format_single_row(entry, entry.get('_idx', 0)) for entry in rows]
    return ''.join(rendered)
362
+
363
+
364
def format_table_footer(table_identifier, doc_id):
    """Return the closing delimiter that marks the end of a rendered table."""
    separator = '=' * 70
    return f"\n{separator}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
367
+
368
def load_json_documents(repo_id, hf_token, json_dir):
    """Download and parse JSON section files from a HuggingFace dataset repo.

    Handles both loose ``.json`` files and ``.zip`` archives located under
    ``json_dir``.  Zip members are decoded with a cascade of encodings
    (utf-8, utf-8-sig, utf-16, windows-1251), sanity-checked, written to a
    temp file and parsed via extract_sections_from_json.  All failures are
    logged and counted rather than raised.

    Args:
        repo_id: HuggingFace dataset repository id.
        hf_token: Access token (needed for private repos).
        json_dir: Path prefix inside the repo to scan.

    Returns:
        list of Document objects, one per non-empty section found.
    """
    import zipfile
    import tempfile
    import os

    def _decode(raw):
        # Encodings actually seen in this corpus, most common first.
        # Returns None when none of them fit (caller logs and skips).
        for encoding in ('utf-8', 'utf-8-sig', 'utf-16', 'windows-1251'):
            try:
                return raw.decode(encoding)
            except UnicodeDecodeError:
                continue
        return None

    log_message("Loading JSON documents...")

    files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
    json_files = [f for f in files if f.startswith(json_dir) and f.endswith('.json')]
    zip_files = [f for f in files if f.startswith(json_dir) and f.endswith('.zip')]

    log_message(f"Found {len(json_files)} JSON files and {len(zip_files)} ZIP files")

    documents = []
    stats = {'success': 0, 'failed': 0, 'empty': 0}

    # Loose JSON files: download and parse directly.
    for file_path in json_files:
        try:
            log_message(f" Loading: {file_path}")
            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=file_path,
                repo_type="dataset",
                token=hf_token
            )

            docs = extract_sections_from_json(local_path)
            if docs:
                documents.extend(docs)
                stats['success'] += 1
                log_message(f" ✓ Extracted {len(docs)} sections")
            else:
                stats['empty'] += 1
                log_message(f" ⚠ No sections found")

        except Exception as e:
            stats['failed'] += 1
            log_message(f" ✗ Error: {e}")

    # ZIP archives: decode each member in memory, stage to a temp file.
    for zip_path in zip_files:
        try:
            log_message(f" Processing ZIP: {zip_path}")
            local_zip = hf_hub_download(
                repo_id=repo_id,
                filename=zip_path,
                repo_type="dataset",
                token=hf_token
            )

            with zipfile.ZipFile(local_zip, 'r') as zf:
                # Skip macOS resource-fork junk (__MACOSX, ._*, dotfiles).
                json_files_in_zip = [f for f in zf.namelist()
                                     if f.endswith('.json')
                                     and not f.startswith('__MACOSX')
                                     and not f.startswith('.')
                                     and '._' not in f]

                log_message(f" Found {len(json_files_in_zip)} JSON files in ZIP")

                for json_file in json_files_in_zip:
                    try:
                        file_content = zf.read(json_file)

                        # Anything shorter than ~"{}" plus a BOM is junk.
                        if len(file_content) < 10:
                            log_message(f" ✗ Skipping: {json_file} (file too small)")
                            stats['failed'] += 1
                            continue

                        text_content = _decode(file_content)
                        if text_content is None:
                            log_message(f" ✗ Skipping: {json_file} (encoding failed)")
                            stats['failed'] += 1
                            continue

                        # Cheap structural check before staging a temp file.
                        if not text_content.strip().startswith(('{', '[')):
                            log_message(f" ✗ Skipping: {json_file} (not valid JSON)")
                            stats['failed'] += 1
                            continue

                        with tempfile.NamedTemporaryFile(mode='w', delete=False,
                                                         suffix='.json', encoding='utf-8') as tmp:
                            tmp.write(text_content)
                            tmp_path = tmp.name

                        try:
                            docs = extract_sections_from_json(tmp_path)
                        finally:
                            # Previously leaked the temp file when extraction raised.
                            os.unlink(tmp_path)

                        if docs:
                            documents.extend(docs)
                            stats['success'] += 1
                            log_message(f" ✓ {json_file}: {len(docs)} sections")
                        else:
                            stats['empty'] += 1
                            log_message(f" ⚠ {json_file}: No sections")

                    except json.JSONDecodeError:
                        stats['failed'] += 1
                        log_message(f" ✗ {json_file}: Invalid JSON")
                    except Exception as e:
                        stats['failed'] += 1
                        log_message(f" ✗ {json_file}: {str(e)[:100]}")

        except Exception as e:
            log_message(f" ✗ Error with ZIP: {e}")

    log_message(f"="*60)
    log_message(f"JSON Loading Stats:")
    log_message(f" Success: {stats['success']}")
    log_message(f" Empty: {stats['empty']}")
    log_message(f" Failed: {stats['failed']}")
    log_message(f" Total sections: {len(documents)}")
    log_message(f"="*60)

    return documents
495
+
496
def extract_sections_from_json(json_path):
    """Extract text sections from a single JSON document file.

    Walks the three-level hierarchy (sections -> subsections ->
    sub_subsections) and wraps every non-blank ``*_text`` field in a
    Document carrying the document id and the level's own id.  The
    Document-building logic, previously triplicated verbatim for each
    level, lives in one local helper.

    Args:
        json_path: Path to a UTF-8 JSON file.

    Returns:
        list of Document; empty on parse/IO errors (logged, not raised).
    """
    documents = []

    def _add(text, doc_id, section_id):
        # One Document per non-blank text node; the metadata shape is
        # identical across all three hierarchy levels.
        if text.strip():
            documents.append(Document(
                text=text,
                metadata={
                    'type': 'text',
                    'document_id': doc_id,
                    'section_id': section_id
                }
            ))

    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        doc_id = data.get('document_metadata', {}).get('document_id', 'unknown')

        for section in data.get('sections', []):
            _add(section.get('section_text', ''), doc_id,
                 section.get('section_id', ''))

            for subsection in section.get('subsections', []):
                _add(subsection.get('subsection_text', ''), doc_id,
                     subsection.get('subsection_id', ''))

                for sub_sub in subsection.get('sub_subsections', []):
                    _add(sub_sub.get('sub_subsection_text', ''), doc_id,
                         sub_sub.get('sub_subsection_id', ''))

    except Exception as e:
        log_message(f"Error extracting from {json_path}: {e}")

    return documents
546
+
547
+
548
def load_table_documents(repo_id, hf_token, table_dir):
    """Download table JSON files from the repo and chunk every sheet.

    Each sheet inherits its document id from the sheet itself when
    present, falling back to the enclosing file's id (which itself
    falls back to 'unknown').

    Args:
        repo_id: HuggingFace dataset repository id.
        hf_token: Access token.
        table_dir: Path prefix inside the repo holding table JSON files.

    Returns:
        list of table chunks produced by chunk_table_by_content.
    """
    log_message("Loading tables...")
    log_message("="*60)
    log_message("NORMALIZING STEEL DESIGNATIONS IN TABLES")

    repo_files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
    table_files = [name for name in repo_files
                   if name.startswith(table_dir) and name.endswith('.json')]

    all_chunks = []
    tables_processed = 0

    for file_path in table_files:
        try:
            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=file_path,
                repo_type="dataset",
                token=hf_token
            )

            with open(local_path, 'r', encoding='utf-8') as fh:
                payload = json.load(fh)

            # document_id -> document -> 'unknown' fallback chain.
            file_doc_id = payload.get('document_id', payload.get('document', 'unknown'))

            for sheet in payload.get('sheets', []):
                sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
                tables_processed += 1
                all_chunks.extend(
                    chunk_table_by_content(sheet, sheet_doc_id,
                                           max_chars=MAX_CHARS_TABLE,
                                           max_rows=MAX_ROWS_TABLE)
                )

        except Exception as e:
            log_message(f"Error loading {file_path}: {e}")

    log_message(f"✓ Loaded {len(all_chunks)} table chunks from {tables_processed} tables")
    log_message("="*60)

    return all_chunks
589
+
590
+
591
def load_image_documents(repo_id, hf_token, image_dir):
    """Load image-description CSV files and wrap each row in a Document.

    Args:
        repo_id: HuggingFace dataset repository id.
        hf_token: Access token.
        image_dir: Path prefix inside the repo holding the CSV files.

    Returns:
        list of Document, one per CSV row, with image metadata attached.
    """
    log_message("Loading images...")

    repo_files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
    csv_files = [name for name in repo_files
                 if name.startswith(image_dir) and name.endswith('.csv')]

    documents = []
    for file_path in csv_files:
        try:
            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=file_path,
                repo_type="dataset",
                token=hf_token
            )

            df = pd.read_csv(local_path)

            for _, row in df.iterrows():
                # NOTE(review): column names (incl. 'Описание изображение')
                # must match the CSV headers exactly — do not "fix" them.
                lines = [
                    f"Документ: {row.get('Обозначение документа', 'unknown')}",
                    f"Рисунок: {row.get('№ Изображения', 'unknown')}",
                    f"Название: {row.get('Название изображения', '')}",
                    f"Описание: {row.get('Описание изображение', '')}",
                    f"Раздел: {row.get('Раздел документа', '')}",
                ]
                content = "\n".join(lines) + "\n"

                documents.append(Document(
                    text=content,
                    metadata={
                        'type': 'image',
                        'document_id': str(row.get('Обозначение документа', 'unknown')),
                        'image_number': str(row.get('№ Изображения', 'unknown')),
                        'section': str(row.get('Раздел документа', '')),
                        'chunk_size': len(content)
                    }
                ))
        except Exception as e:
            log_message(f"Error loading {file_path}: {e}")

    if documents:
        avg_size = sum(d.metadata['chunk_size'] for d in documents) / len(documents)
        log_message(f"✓ Loaded {len(documents)} images (avg size: {avg_size:.0f} chars)")

    return documents
637
+
638
+
639
def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
    """Top-level loader combining all three document sources.

    Loads text sections (then chunks them), pre-chunked table data, and
    image descriptions, and concatenates everything into one list.

    Args:
        repo_id: HuggingFace dataset repository id.
        hf_token: Access token.
        json_dir: Repo prefix for section JSON files / archives.
        table_dir: Repo prefix for table JSON files.
        image_dir: Repo prefix for image-description CSVs.

    Returns:
        list of all documents/chunks, in text -> table -> image order.
    """
    banner = "=" * 60
    log_message(banner)
    log_message("STARTING DOCUMENT LOADING")
    log_message(banner)

    # Text sections need splitting; tables arrive pre-chunked and images
    # are one Document per row already.
    text_chunks = chunk_text_documents(load_json_documents(repo_id, hf_token, json_dir))
    table_chunks = load_table_documents(repo_id, hf_token, table_dir)
    image_docs = load_image_documents(repo_id, hf_token, image_dir)

    all_docs = text_chunks + table_chunks + image_docs

    log_message(banner)
    log_message(f"TOTAL DOCUMENTS: {len(all_docs)}")
    log_message(f" Text chunks: {len(text_chunks)}")
    log_message(f" Table chunks: {len(table_chunks)}")
    log_message(f" Images: {len(image_docs)}")
    log_message(banner)

    return all_docs