MrSimple01 committed on
Commit
051e774
·
verified ·
1 Parent(s): 0bc2e08

Update documents_prep.py

Browse files
Files changed (1) hide show
  1. documents_prep.py +496 -573
documents_prep.py CHANGED
@@ -1,574 +1,497 @@
1
- import json
2
- import zipfile
3
- import pandas as pd
4
- from huggingface_hub import hf_hub_download, list_repo_files
5
- from llama_index.core import Document
6
- from llama_index.core.text_splitter import SentenceSplitter
7
- from my_logging import log_message
8
- from config import CHUNK_SIZE, CHUNK_OVERLAP, MAX_CHARS_TABLE, MAX_ROWS_TABLE
9
-
10
- def chunk_text_documents(documents):
11
- text_splitter = SentenceSplitter(
12
- chunk_size=CHUNK_SIZE,
13
- chunk_overlap=CHUNK_OVERLAP
14
- )
15
-
16
- chunked = []
17
- for doc in documents:
18
- chunks = text_splitter.get_nodes_from_documents([doc])
19
- for i, chunk in enumerate(chunks):
20
- chunk.metadata.update({
21
- 'chunk_id': i,
22
- 'total_chunks': len(chunks),
23
- 'chunk_size': len(chunk.text) # Add chunk size
24
- })
25
- chunked.append(chunk)
26
-
27
- # Log statistics
28
- if chunked:
29
- avg_size = sum(len(c.text) for c in chunked) / len(chunked)
30
- min_size = min(len(c.text) for c in chunked)
31
- max_size = max(len(c.text) for c in chunked)
32
- log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
33
- log_message(f" Size stats: avg={avg_size:.0f}, min={min_size}, max={max_size} chars")
34
-
35
- return chunked
36
-
37
def normalize_connection_type(s):
    """Normalize a connection-type code for consistent matching.

    Maps look-alike Cyrillic letters (С, У, Т and lowercase) to their Latin
    counterparts and strips all hyphens, so 'С-25-1' and 'C25-1' both
    normalize to 'C251'.
    """
    # str.translate performs all six letter substitutions in one pass.
    # The original also had 'С-' -> 'C-' style replacements, but those were
    # dead code: the Cyrillic letters are already gone by that point.
    s = s.translate(str.maketrans('СсУуТт', 'CcUuTt'))
    # Remove all hyphens for consistent tokenization
    return s.replace('-', '')
48
-
49
def extract_connection_type(text):
    """Return the first connection-type code found in *text*, normalized.

    Matches codes like 'C-25', 'С25' or 'C-25-1' (Latin or Cyrillic leading
    letter, optional hyphens); returns '' when no code is present.
    """
    import re
    match = re.search(r'[СCс]-?\d+(?:-\d+)*', text)
    return normalize_connection_type(match.group(0)) if match else ''
57
-
58
- def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
59
- headers = table_data.get('headers', [])
60
- rows = table_data.get('data', [])
61
- table_num = table_data.get('table_number', 'unknown')
62
- table_title = table_data.get('table_title', '')
63
- section = table_data.get('section', '')
64
- table_description = table_data.get('table_description', '')
65
-
66
- table_num_clean = str(table_num).strip()
67
-
68
- import re
69
- if 'приложени' in section.lower():
70
- appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
71
- if appendix_match:
72
- appendix_num = appendix_match.group(1).upper()
73
- table_identifier = f"{table_num_clean} Приложение {appendix_num}"
74
- else:
75
- table_identifier = table_num_clean
76
- else:
77
- table_identifier = table_num_clean
78
-
79
- if not rows:
80
- return []
81
-
82
- log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
83
-
84
- # Calculate base metadata size - NOW INCLUDING DESCRIPTION
85
- base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
86
-
87
- # ADD DESCRIPTION HERE if it exists
88
- if table_description:
89
- base_content += f"ОПИСАНИЕ: {table_description}\n\n"
90
-
91
- base_size = len(base_content)
92
- available_space = max_chars - base_size - 200
93
-
94
- # If entire table fits, return as one chunk
95
- full_rows_content = format_table_rows([{**row, '_idx': i+1} for i, row in enumerate(rows)])
96
- if base_size + len(full_rows_content) <= max_chars and len(rows) <= max_rows:
97
- content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
98
-
99
- metadata = {
100
- 'type': 'table',
101
- 'document_id': doc_id,
102
- 'table_number': table_num_clean,
103
- 'table_identifier': table_identifier,
104
- 'table_title': table_title,
105
- 'section': section,
106
- 'total_rows': len(rows),
107
- 'chunk_size': len(content),
108
- 'is_complete_table': True,
109
- 'connection_type': extract_connection_type(table_title) if table_title else '' # NEW
110
-
111
- }
112
-
113
- log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
114
- return [Document(text=content, metadata=metadata)]
115
-
116
- chunks = []
117
- current_rows = []
118
- current_size = 0
119
- chunk_num = 0
120
-
121
- for i, row in enumerate(rows):
122
- row_text = format_single_row(row, i + 1)
123
- row_size = len(row_text)
124
-
125
- should_split = (current_size + row_size > available_space or len(current_rows) >= max_rows) and current_rows
126
-
127
- if should_split:
128
- content = base_content + format_table_rows(current_rows)
129
- content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
130
- content += format_table_footer(table_identifier, doc_id)
131
-
132
- metadata = {
133
- 'type': 'table',
134
- 'document_id': doc_id,
135
- 'table_number': table_num_clean,
136
- 'table_identifier': table_identifier,
137
- 'table_title': table_title,
138
- 'section': section,
139
- 'chunk_id': chunk_num,
140
- 'row_start': current_rows[0]['_idx'] - 1,
141
- 'row_end': current_rows[-1]['_idx'],
142
- 'total_rows': len(rows),
143
- 'chunk_size': len(content),
144
- 'is_complete_table': False,
145
- 'connection_type': extract_connection_type(table_title) if table_title else '' # NEW
146
- }
147
-
148
- chunks.append(Document(text=content, metadata=metadata))
149
- log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, {len(current_rows)} rows")
150
-
151
- chunk_num += 1
152
- current_rows = []
153
- current_size = 0
154
-
155
- # Add row with index
156
- row_copy = row.copy() if isinstance(row, dict) else {'data': row}
157
- row_copy['_idx'] = i + 1
158
- current_rows.append(row_copy)
159
- current_size += row_size
160
-
161
- # Add final chunk
162
- if current_rows:
163
- content = base_content + format_table_rows(current_rows)
164
- content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
165
- content += format_table_footer(table_identifier, doc_id)
166
-
167
- metadata = {
168
- 'type': 'table',
169
- 'document_id': doc_id,
170
- 'table_number': table_num_clean,
171
- 'table_identifier': table_identifier,
172
- 'table_title': table_title,
173
- 'section': section,
174
- 'chunk_id': chunk_num,
175
- 'row_start': current_rows[0]['_idx'] - 1,
176
- 'row_end': current_rows[-1]['_idx'],
177
- 'total_rows': len(rows),
178
- 'chunk_size': len(content),
179
- 'is_complete_table': False
180
- }
181
-
182
- chunks.append(Document(text=content, metadata=metadata))
183
- log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, {len(current_rows)} rows")
184
-
185
- return chunks
186
-
187
- def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
188
- content = f"ДОКУМЕНТ: {doc_id}\n"
189
- content += f"ТАБЛИЦА: {table_identifier}\n"
190
-
191
- if table_title:
192
- # Normalize the title text itself for better searchability
193
- normalized_title = normalize_connection_type(table_title)
194
- content += f"НАЗВАНИЕ ТАБЛИЦЫ: {normalized_title}\n"
195
-
196
- # Extract and store the normalized connection type
197
- connection_type = extract_connection_type(table_title)
198
- if connection_type:
199
- content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
200
-
201
- if table_num and table_num != table_identifier:
202
- content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
203
-
204
- if section:
205
- content += f"РАЗДЕЛ ДОКУМЕНТА: {section}\n"
206
-
207
- content += f"\n{'='*70}\n"
208
-
209
- if headers:
210
- content += "СТОЛБЦЫ ТАБЛИЦЫ:\n"
211
- for i, h in enumerate(headers, 1):
212
- # NORMALIZE HEADERS TOO
213
- normalized_header = normalize_connection_type(h)
214
- content += f" {i}. {normalized_header}\n"
215
- content += "\n"
216
-
217
- content += "ДАННЫЕ ТАБЛИЦЫ:\n"
218
- return content
219
-
220
-
221
- def format_single_row(row, idx):
222
- """Format a single row with normalization"""
223
- if isinstance(row, dict):
224
- # NORMALIZE VALUES IN ROWS
225
- parts = []
226
- for k, v in row.items():
227
- if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']:
228
- normalized_v = normalize_connection_type(str(v))
229
- parts.append(f"{k}: {normalized_v}")
230
- if parts:
231
- return f"{idx}. {' | '.join(parts)}\n"
232
- elif isinstance(row, list):
233
- # NORMALIZE LIST VALUES
234
- parts = []
235
- for v in row:
236
- if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']:
237
- normalized_v = normalize_connection_type(str(v))
238
- parts.append(normalized_v)
239
- if parts:
240
- return f"{idx}. {' | '.join(parts)}\n"
241
- return ""
242
-
243
- def format_table_rows(rows):
244
- """Format multiple rows"""
245
- content = ""
246
- for row in rows:
247
- idx = row.get('_idx', 0)
248
- content += format_single_row(row, idx)
249
- return content
250
-
251
-
252
- def format_table_footer(table_identifier, doc_id):
253
- """Format table footer"""
254
- return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
255
-
256
- def load_json_documents(repo_id, hf_token, json_dir):
257
- import zipfile
258
- import tempfile
259
- import os
260
-
261
- log_message("Loading JSON documents...")
262
-
263
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
264
- json_files = [f for f in files if f.startswith(json_dir) and f.endswith('.json')]
265
- zip_files = [f for f in files if f.startswith(json_dir) and f.endswith('.zip')]
266
-
267
- log_message(f"Found {len(json_files)} JSON files and {len(zip_files)} ZIP files")
268
-
269
- documents = []
270
- stats = {'success': 0, 'failed': 0, 'empty': 0}
271
-
272
- for file_path in json_files:
273
- try:
274
- log_message(f" Loading: {file_path}")
275
- local_path = hf_hub_download(
276
- repo_id=repo_id,
277
- filename=file_path,
278
- repo_type="dataset",
279
- token=hf_token
280
- )
281
-
282
- docs = extract_sections_from_json(local_path)
283
- if docs:
284
- documents.extend(docs)
285
- stats['success'] += 1
286
- log_message(f" ✓ Extracted {len(docs)} sections")
287
- else:
288
- stats['empty'] += 1
289
- log_message(f" ⚠ No sections found")
290
-
291
- except Exception as e:
292
- stats['failed'] += 1
293
- log_message(f" ✗ Error: {e}")
294
-
295
- for zip_path in zip_files:
296
- try:
297
- log_message(f" Processing ZIP: {zip_path}")
298
- local_zip = hf_hub_download(
299
- repo_id=repo_id,
300
- filename=zip_path,
301
- repo_type="dataset",
302
- token=hf_token
303
- )
304
-
305
- with zipfile.ZipFile(local_zip, 'r') as zf:
306
- json_files_in_zip = [f for f in zf.namelist()
307
- if f.endswith('.json')
308
- and not f.startswith('__MACOSX')
309
- and not f.startswith('.')
310
- and not '._' in f]
311
-
312
- log_message(f" Found {len(json_files_in_zip)} JSON files in ZIP")
313
-
314
- for json_file in json_files_in_zip:
315
- try:
316
- file_content = zf.read(json_file)
317
-
318
- # Skip if file is too small
319
- if len(file_content) < 10:
320
- log_message(f" ✗ Skipping: {json_file} (file too small)")
321
- stats['failed'] += 1
322
- continue
323
-
324
- # Try UTF-8 first (most common)
325
- try:
326
- text_content = file_content.decode('utf-8')
327
- except UnicodeDecodeError:
328
- try:
329
- text_content = file_content.decode('utf-8-sig')
330
- except UnicodeDecodeError:
331
- try:
332
- # Try UTF-16 (the issue you're seeing)
333
- text_content = file_content.decode('utf-16')
334
- except UnicodeDecodeError:
335
- try:
336
- text_content = file_content.decode('windows-1251')
337
- except UnicodeDecodeError:
338
- log_message(f" Skipping: {json_file} (encoding failed)")
339
- stats['failed'] += 1
340
- continue
341
-
342
- # Validate JSON structure
343
- if not text_content.strip().startswith('{') and not text_content.strip().startswith('['):
344
- log_message(f" ✗ Skipping: {json_file} (not valid JSON)")
345
- stats['failed'] += 1
346
- continue
347
-
348
- with tempfile.NamedTemporaryFile(mode='w', delete=False,
349
- suffix='.json', encoding='utf-8') as tmp:
350
- tmp.write(text_content)
351
- tmp_path = tmp.name
352
-
353
- docs = extract_sections_from_json(tmp_path)
354
- if docs:
355
- documents.extend(docs)
356
- stats['success'] += 1
357
- log_message(f" ✓ {json_file}: {len(docs)} sections")
358
- else:
359
- stats['empty'] += 1
360
- log_message(f" ⚠ {json_file}: No sections")
361
-
362
- os.unlink(tmp_path)
363
-
364
- except json.JSONDecodeError as e:
365
- stats['failed'] += 1
366
- log_message(f" ✗ {json_file}: Invalid JSON")
367
- except Exception as e:
368
- stats['failed'] += 1
369
- log_message(f" ✗ {json_file}: {str(e)[:100]}")
370
-
371
- except Exception as e:
372
- log_message(f" ✗ Error with ZIP: {e}")
373
-
374
- log_message(f"="*60)
375
- log_message(f"JSON Loading Stats:")
376
- log_message(f" Success: {stats['success']}")
377
- log_message(f" Empty: {stats['empty']}")
378
- log_message(f" Failed: {stats['failed']}")
379
- log_message(f" Total sections: {len(documents)}")
380
- log_message(f"="*60)
381
-
382
- return documents
383
-
384
- def extract_sections_from_json(json_path):
385
- """Extract sections from a single JSON file"""
386
- documents = []
387
-
388
- try:
389
- with open(json_path, 'r', encoding='utf-8') as f:
390
- data = json.load(f)
391
-
392
- doc_id = data.get('document_metadata', {}).get('document_id', 'unknown')
393
-
394
- # Extract all section levels
395
- for section in data.get('sections', []):
396
- if section.get('section_text', '').strip():
397
- documents.append(Document(
398
- text=section['section_text'],
399
- metadata={
400
- 'type': 'text',
401
- 'document_id': doc_id,
402
- 'section_id': section.get('section_id', '')
403
- }
404
- ))
405
-
406
- # Subsections
407
- for subsection in section.get('subsections', []):
408
- if subsection.get('subsection_text', '').strip():
409
- documents.append(Document(
410
- text=subsection['subsection_text'],
411
- metadata={
412
- 'type': 'text',
413
- 'document_id': doc_id,
414
- 'section_id': subsection.get('subsection_id', '')
415
- }
416
- ))
417
-
418
- # Sub-subsections
419
- for sub_sub in subsection.get('sub_subsections', []):
420
- if sub_sub.get('sub_subsection_text', '').strip():
421
- documents.append(Document(
422
- text=sub_sub['sub_subsection_text'],
423
- metadata={
424
- 'type': 'text',
425
- 'document_id': doc_id,
426
- 'section_id': sub_sub.get('sub_subsection_id', '')
427
- }
428
- ))
429
-
430
- except Exception as e:
431
- log_message(f"Error extracting from {json_path}: {e}")
432
-
433
- return documents
434
-
435
-
436
- def load_table_documents(repo_id, hf_token, table_dir):
437
- log_message("Loading tables...")
438
-
439
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
440
- table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
441
-
442
- all_chunks = []
443
- connection_type_sources = {} # Track which table each type comes from
444
-
445
- for file_path in table_files:
446
- try:
447
- local_path = hf_hub_download(
448
- repo_id=repo_id,
449
- filename=file_path,
450
- repo_type="dataset",
451
- token=hf_token
452
- )
453
-
454
- with open(local_path, 'r', encoding='utf-8') as f:
455
- data = json.load(f)
456
-
457
- file_doc_id = data.get('document_id', data.get('document', 'unknown'))
458
-
459
- for sheet in data.get('sheets', []):
460
- sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
461
- table_num = sheet.get('table_number', 'unknown')
462
- table_title = sheet.get('table_title', '')
463
-
464
- chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
465
- all_chunks.extend(chunks)
466
-
467
- # Track connection type source
468
- conn_type = extract_connection_type(table_title)
469
- if conn_type:
470
- if conn_type not in connection_type_sources:
471
- connection_type_sources[conn_type] = []
472
- connection_type_sources[conn_type].append(f"{sheet_doc_id} Table {table_num}")
473
-
474
- except Exception as e:
475
- log_message(f"Error loading {file_path}: {e}")
476
-
477
- log_message(f"✓ Loaded {len(all_chunks)} table chunks")
478
-
479
- log_message("="*60)
480
- log_message("CONNECTION TYPES AND THEIR SOURCES:")
481
- for conn_type in sorted(connection_type_sources.keys()):
482
- sources = connection_type_sources[conn_type]
483
- log_message(f" {conn_type}: {len(sources)} tables")
484
- for src in sources:
485
- log_message(f" - {src}")
486
- log_message("="*60)
487
-
488
- return all_chunks
489
-
490
- def load_image_documents(repo_id, hf_token, image_dir):
491
- """Load image descriptions"""
492
- log_message("Loading images...")
493
-
494
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
495
- csv_files = [f for f in files if f.startswith(image_dir) and f.endswith('.csv')]
496
-
497
- documents = []
498
- for file_path in csv_files:
499
- try:
500
- local_path = hf_hub_download(
501
- repo_id=repo_id,
502
- filename=file_path,
503
- repo_type="dataset",
504
- token=hf_token
505
- )
506
-
507
- df = pd.read_csv(local_path)
508
-
509
- for _, row in df.iterrows():
510
- content = f"Документ: {row.get('Обозначение документа', 'unknown')}\n"
511
- content += f"Рисунок: {row.get('№ Изображения', 'unknown')}\n"
512
- content += f"Название: {row.get('Название изображения', '')}\n"
513
- content += f"Описание: {row.get('Описание изображение', '')}\n"
514
- content += f"Раздел: {row.get('Раздел документа', '')}\n"
515
-
516
- chunk_size = len(content)
517
-
518
- documents.append(Document(
519
- text=content,
520
- metadata={
521
- 'type': 'image',
522
- 'document_id': str(row.get('Обозначение документа', 'unknown')),
523
- 'image_number': str(row.get('№ Изображения', 'unknown')),
524
- 'section': str(row.get('Раздел документа', '')),
525
- 'chunk_size': chunk_size
526
- }
527
- ))
528
- except Exception as e:
529
- log_message(f"Error loading {file_path}: {e}")
530
-
531
- if documents:
532
- avg_size = sum(d.metadata['chunk_size'] for d in documents) / len(documents)
533
- log_message(f"✓ Loaded {len(documents)} images (avg size: {avg_size:.0f} chars)")
534
-
535
- return documents
536
-
537
- def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
538
- log_message("="*60)
539
- log_message("STARTING DOCUMENT LOADING")
540
- log_message("="*60)
541
-
542
- # Load text sections
543
- text_docs = load_json_documents(repo_id, hf_token, json_dir)
544
- text_chunks = chunk_text_documents(text_docs)
545
-
546
- # Load tables (already chunked)
547
- table_chunks = load_table_documents(repo_id, hf_token, table_dir)
548
-
549
- # NEW: Analyze connection types in tables
550
- connection_types = {}
551
- for chunk in table_chunks:
552
- conn_type = chunk.metadata.get('connection_type', '')
553
- if conn_type:
554
- connection_types[conn_type] = connection_types.get(conn_type, 0) + 1
555
-
556
- log_message("="*60)
557
- log_message("CONNECTION TYPES FOUND IN TABLES:")
558
- for conn_type, count in sorted(connection_types.items()):
559
- log_message(f" {conn_type}: {count} chunks")
560
- log_message("="*60)
561
-
562
- # Load images (no chunking needed)
563
- image_docs = load_image_documents(repo_id, hf_token, image_dir)
564
-
565
- all_docs = text_chunks + table_chunks + image_docs
566
-
567
- log_message("="*60)
568
- log_message(f"TOTAL DOCUMENTS: {len(all_docs)}")
569
- log_message(f" Text chunks: {len(text_chunks)}")
570
- log_message(f" Table chunks: {len(table_chunks)}")
571
- log_message(f" Images: {len(image_docs)}")
572
- log_message("="*60)
573
-
574
  return all_docs
 
1
+ import json
2
+ import zipfile
3
+ import pandas as pd
4
+ from huggingface_hub import hf_hub_download, list_repo_files
5
+ from llama_index.core import Document
6
+ from llama_index.core.text_splitter import SentenceSplitter
7
+ from my_logging import log_message
8
+ from config import CHUNK_SIZE, CHUNK_OVERLAP, MAX_CHARS_TABLE, MAX_ROWS_TABLE
9
+
10
def chunk_text_documents(documents):
    """Split text documents into overlapping sentence-based chunks.

    Every resulting node gets chunk_id / total_chunks / chunk_size metadata;
    aggregate size statistics are logged once all documents are processed.
    """
    splitter = SentenceSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )

    chunked = []
    for document in documents:
        nodes = splitter.get_nodes_from_documents([document])
        total = len(nodes)
        for index, node in enumerate(nodes):
            node.metadata.update({
                'chunk_id': index,
                'total_chunks': total,
                'chunk_size': len(node.text)  # recorded for the stats below
            })
            chunked.append(node)

    # Log statistics
    if chunked:
        sizes = [len(node.text) for node in chunked]
        avg_size = sum(sizes) / len(sizes)
        min_size = min(sizes)
        max_size = max(sizes)
        log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
        log_message(f" Size stats: avg={avg_size:.0f}, min={min_size}, max={max_size} chars")

    return chunked
36
+
37
+
38
def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
    """Split one table (a 'sheet' dict) into one or more Document chunks.

    Returns a single chunk when the whole table fits within *max_chars* and
    *max_rows*; otherwise several partial chunks, each repeating the table
    header so every chunk is self-describing.  Returns [] for empty tables.
    """
    import re

    headers = table_data.get('headers', [])
    rows = table_data.get('data', [])
    table_num = table_data.get('table_number', 'unknown')
    table_title = table_data.get('table_title', '')
    section = table_data.get('section', '')

    table_num_clean = str(table_num).strip()

    # Tables living inside an appendix get the appendix number/letter folded
    # into their identifier so identifiers stay unique across the document.
    table_identifier = table_num_clean
    if 'приложени' in section.lower():
        appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
        if appendix_match:
            appendix_num = appendix_match.group(1).upper()
            table_identifier = f"{table_num_clean} Приложение {appendix_num}"

    if not rows:
        return []

    log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")

    # Calculate base metadata size; reserve headroom for footer/row-range text.
    base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
    base_size = len(base_content)
    available_space = max_chars - base_size - 200

    def _indexed(row, idx):
        # Normalize a row to a dict carrying its 1-based source index.
        row_copy = row.copy() if isinstance(row, dict) else {'data': row}
        row_copy['_idx'] = idx
        return row_copy

    # BUGFIX: the original built {**row, '_idx': i+1} here, which raised
    # TypeError for list rows even though the splitting loop below supports
    # them; _indexed() handles both shapes uniformly.
    all_indexed = [_indexed(row, i + 1) for i, row in enumerate(rows)]

    def _metadata(extra):
        # Metadata fields shared by complete and partial chunks.
        meta = {
            'type': 'table',
            'document_id': doc_id,
            'table_number': table_num_clean,
            'table_identifier': table_identifier,
            'table_title': table_title,
            'section': section,
            'total_rows': len(rows),
        }
        meta.update(extra)
        return meta

    # If entire table fits, return as one chunk
    full_rows_content = format_table_rows(all_indexed)
    if base_size + len(full_rows_content) <= max_chars and len(rows) <= max_rows:
        content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
        metadata = _metadata({'chunk_size': len(content), 'is_complete_table': True})
        log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
        return [Document(text=content, metadata=metadata)]

    chunks = []

    def _emit(chunk_rows, chunk_num):
        # Emit one partial chunk covering chunk_rows (already indexed).
        content = base_content + format_table_rows(chunk_rows)
        content += f"\n\nСтроки {chunk_rows[0]['_idx']}-{chunk_rows[-1]['_idx']} из {len(rows)}\n"
        content += format_table_footer(table_identifier, doc_id)
        metadata = _metadata({
            'chunk_id': chunk_num,
            'row_start': chunk_rows[0]['_idx'] - 1,
            'row_end': chunk_rows[-1]['_idx'],
            'chunk_size': len(content),
            'is_complete_table': False,
        })
        chunks.append(Document(text=content, metadata=metadata))
        log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, {len(chunk_rows)} rows")

    current_rows = []
    current_size = 0
    chunk_num = 0

    for i, row in enumerate(rows):
        # Size estimate uses the raw row, as in the original accounting.
        row_size = len(format_single_row(row, i + 1))

        should_split = (current_size + row_size > available_space or len(current_rows) >= max_rows) and current_rows
        if should_split:
            _emit(current_rows, chunk_num)
            chunk_num += 1
            current_rows = []
            current_size = 0

        current_rows.append(all_indexed[i])
        current_size += row_size

    # Add final chunk
    if current_rows:
        _emit(current_rows, chunk_num)

    return chunks
157
+
158
+
159
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
    """Build the descriptive header text that precedes every table chunk."""
    lines = [f"ТАБЛИЦА {table_identifier} из документа {doc_id}\n"]

    # The table type/number goes in early and prominently for matching.
    if table_num:
        lines.append(f"ТИП: {table_num}\n")

    if table_title:
        lines.append(f"НАЗВАНИЕ: {table_title}\n")

    if section:
        lines.append(f"РАЗДЕЛ: {section}\n")

    lines.append('=' * 70 + "\n")

    if headers:
        joined = ' | '.join(str(h) for h in headers)
        lines.append(f"ЗАГОЛОВКИ: {joined}\n\n")

    lines.append("ДАННЫЕ:\n")
    return ''.join(lines)
180
+
181
+
182
def format_single_row(row, idx):
    """Format one table row as a numbered ' | '-separated line.

    Accepts dict or list rows; any other type (or a row with no meaningful
    cells) yields ''.  Cells holding None, blanks, or 'nan'/'none'
    placeholders are dropped.
    """
    def _keep(value):
        # BUGFIX: the original tested `if v and ...`, which silently dropped
        # legitimate numeric zeros (0, 0.0) from the rendered table data.
        if value is None or value is False:
            return False
        text = str(value).strip()
        return bool(text) and text.lower() not in ('nan', 'none')

    if isinstance(row, dict):
        # Skip internal bookkeeping keys (e.g. '_idx') so they never leak
        # into the chunk text.
        parts = [f"{k}: {v}" for k, v in row.items()
                 if not str(k).startswith('_') and _keep(v)]
        if parts:
            return f"{idx}. {' | '.join(parts)}\n"
    elif isinstance(row, list):
        parts = [str(v) for v in row if _keep(v)]
        if parts:
            return f"{idx}. {' | '.join(parts)}\n"
    return ""
194
+
195
+
196
def format_table_rows(rows):
    """Render a list of indexed rows (each carrying an '_idx' key) as text."""
    return ''.join(format_single_row(row, row.get('_idx', 0)) for row in rows)
203
+
204
+
205
def format_table_footer(table_identifier, doc_id):
    """Return the closing delimiter line appended after a table chunk."""
    separator = '=' * 70
    return f"\n{separator}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
208
+
209
def load_json_documents(repo_id, hf_token, json_dir):
    """Download JSON section files (standalone and inside ZIP archives) from
    a Hugging Face dataset repo and extract their sections as Documents.

    Returns the flat list of extracted Documents.  Per-file outcomes are
    tallied into success/empty/failed stats and logged at the end.
    """
    import tempfile
    import os

    log_message("Loading JSON documents...")

    files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
    json_files = [f for f in files if f.startswith(json_dir) and f.endswith('.json')]
    zip_files = [f for f in files if f.startswith(json_dir) and f.endswith('.zip')]

    log_message(f"Found {len(json_files)} JSON files and {len(zip_files)} ZIP files")

    documents = []
    stats = {'success': 0, 'failed': 0, 'empty': 0}

    for file_path in json_files:
        try:
            log_message(f" Loading: {file_path}")
            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=file_path,
                repo_type="dataset",
                token=hf_token
            )

            docs = extract_sections_from_json(local_path)
            if docs:
                documents.extend(docs)
                stats['success'] += 1
                log_message(f" ✓ Extracted {len(docs)} sections")
            else:
                stats['empty'] += 1
                log_message(f" ⚠ No sections found")

        except Exception as e:
            stats['failed'] += 1
            log_message(f" Error: {e}")

    for zip_path in zip_files:
        try:
            log_message(f" Processing ZIP: {zip_path}")
            local_zip = hf_hub_download(
                repo_id=repo_id,
                filename=zip_path,
                repo_type="dataset",
                token=hf_token
            )

            with zipfile.ZipFile(local_zip, 'r') as zf:
                # Skip macOS resource-fork entries and hidden files.
                json_files_in_zip = [f for f in zf.namelist()
                                     if f.endswith('.json')
                                     and not f.startswith('__MACOSX')
                                     and not f.startswith('.')
                                     and '._' not in f]

                log_message(f" Found {len(json_files_in_zip)} JSON files in ZIP")

                for json_file in json_files_in_zip:
                    try:
                        file_content = zf.read(json_file)

                        # Skip if file is too small to be real JSON
                        if len(file_content) < 10:
                            log_message(f" ✗ Skipping: {json_file} (file too small)")
                            stats['failed'] += 1
                            continue

                        # Cleanup: the original used four nested try/except
                        # blocks; a flat loop over candidate encodings is
                        # equivalent and far easier to extend.
                        text_content = None
                        for encoding in ('utf-8', 'utf-8-sig', 'utf-16', 'windows-1251'):
                            try:
                                text_content = file_content.decode(encoding)
                                break
                            except UnicodeDecodeError:
                                continue
                        if text_content is None:
                            log_message(f" ✗ Skipping: {json_file} (encoding failed)")
                            stats['failed'] += 1
                            continue

                        # Cheap structural sanity check before parsing.
                        stripped = text_content.strip()
                        if not stripped.startswith('{') and not stripped.startswith('['):
                            log_message(f" Skipping: {json_file} (not valid JSON)")
                            stats['failed'] += 1
                            continue

                        with tempfile.NamedTemporaryFile(mode='w', delete=False,
                                                         suffix='.json', encoding='utf-8') as tmp:
                            tmp.write(text_content)
                            tmp_path = tmp.name

                        # BUGFIX: unlink in `finally` so the temp file is not
                        # leaked when extraction raises.
                        try:
                            docs = extract_sections_from_json(tmp_path)
                        finally:
                            os.unlink(tmp_path)

                        if docs:
                            documents.extend(docs)
                            stats['success'] += 1
                            log_message(f" ✓ {json_file}: {len(docs)} sections")
                        else:
                            stats['empty'] += 1
                            log_message(f" ⚠ {json_file}: No sections")

                    except json.JSONDecodeError:
                        stats['failed'] += 1
                        log_message(f" ✗ {json_file}: Invalid JSON")
                    except Exception as e:
                        stats['failed'] += 1
                        log_message(f" ✗ {json_file}: {str(e)[:100]}")

        except Exception as e:
            log_message(f" ✗ Error with ZIP: {e}")

    log_message(f"="*60)
    log_message(f"JSON Loading Stats:")
    log_message(f" Success: {stats['success']}")
    log_message(f" Empty: {stats['empty']}")
    log_message(f" Failed: {stats['failed']}")
    log_message(f" Total sections: {len(documents)}")
    log_message(f"="*60)

    return documents
336
+
337
def extract_sections_from_json(json_path):
    """Extract section/subsection/sub-subsection texts from one JSON file.

    Every non-blank text body becomes a Document carrying type,
    document_id and section_id metadata.  Any error is logged and the
    (possibly partial) list collected so far is returned.
    """
    documents = []

    def _add(text, doc_id, section_id):
        # One Document per non-blank section body.
        documents.append(Document(
            text=text,
            metadata={
                'type': 'text',
                'document_id': doc_id,
                'section_id': section_id
            }
        ))

    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        doc_id = data.get('document_metadata', {}).get('document_id', 'unknown')

        # Walk the three fixed nesting levels; the shared _add helper
        # replaces the three copies of the append block in the original.
        for section in data.get('sections', []):
            if section.get('section_text', '').strip():
                _add(section['section_text'], doc_id, section.get('section_id', ''))

            for subsection in section.get('subsections', []):
                if subsection.get('subsection_text', '').strip():
                    _add(subsection['subsection_text'], doc_id, subsection.get('subsection_id', ''))

                for sub_sub in subsection.get('sub_subsections', []):
                    if sub_sub.get('sub_subsection_text', '').strip():
                        _add(sub_sub['sub_subsection_text'], doc_id, sub_sub.get('sub_subsection_id', ''))

    except Exception as e:
        log_message(f"Error extracting from {json_path}: {e}")

    return documents
387
+
388
+
389
def load_table_documents(repo_id, hf_token, table_dir):
    """Download table JSON files from the HF dataset repo and chunk them.

    Args:
        repo_id: HuggingFace dataset repository id.
        hf_token: Access token for the repository.
        table_dir: Repo-relative prefix under which table ``.json``
            files live.

    Returns:
        list: Table chunks produced by ``chunk_table_by_content`` for
        every sheet of every table file; files that fail to download or
        parse are logged and skipped.
    """
    log_message("Loading tables...")

    repo_files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
    json_tables = [
        name for name in repo_files
        if name.startswith(table_dir) and name.endswith('.json')
    ]

    collected = []
    for file_path in json_tables:
        try:
            cached = hf_hub_download(
                repo_id=repo_id,
                filename=file_path,
                repo_type="dataset",
                token=hf_token
            )

            with open(cached, 'r', encoding='utf-8') as fh:
                payload = json.load(fh)

            # Document id may live under either key at file level.
            fallback_doc_id = payload.get('document_id',
                                          payload.get('document', 'unknown'))

            for sheet in payload.get('sheets', []):
                # Sheet-level id wins; fall back to the file-level one.
                sheet_id = sheet.get('document_id',
                                     sheet.get('document', fallback_doc_id))
                # Use the consistent MAX_CHARS_TABLE from config
                collected.extend(
                    chunk_table_by_content(
                        sheet, sheet_id,
                        max_chars=MAX_CHARS_TABLE,
                        max_rows=MAX_ROWS_TABLE
                    )
                )

        except Exception as e:
            log_message(f"Error loading {file_path}: {e}")

    log_message(f"✓ Loaded {len(collected)} table chunks")
    return collected
422
+
423
+
424
def load_image_documents(repo_id, hf_token, image_dir):
    """Load image-description CSVs from the HF dataset repo as Documents.

    Args:
        repo_id: HuggingFace dataset repository id.
        hf_token: Access token for the repository.
        image_dir: Repo-relative prefix under which the ``.csv`` files live.

    Returns:
        list[Document]: One document per CSV row, with the formatted
        description text and image metadata; failing files are logged
        and skipped.
    """
    log_message("Loading images...")

    repo_files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
    description_csvs = [
        name for name in repo_files
        if name.startswith(image_dir) and name.endswith('.csv')
    ]

    documents = []
    for file_path in description_csvs:
        try:
            cached = hf_hub_download(
                repo_id=repo_id,
                filename=file_path,
                repo_type="dataset",
                token=hf_token
            )

            frame = pd.read_csv(cached)

            for _, row in frame.iterrows():
                # NOTE: column names (including 'Описание изображение')
                # must match the CSV headers exactly.
                lines = [
                    f"Документ: {row.get('Обозначение документа', 'unknown')}",
                    f"Рисунок: {row.get('№ Изображения', 'unknown')}",
                    f"Название: {row.get('Название изображения', '')}",
                    f"Описание: {row.get('Описание изображение', '')}",
                    f"Раздел: {row.get('Раздел документа', '')}",
                ]
                content = "\n".join(lines) + "\n"

                documents.append(Document(
                    text=content,
                    metadata={
                        'type': 'image',
                        'document_id': str(row.get('Обозначение документа', 'unknown')),
                        'image_number': str(row.get('№ Изображения', 'unknown')),
                        'section': str(row.get('Раздел документа', '')),
                        'chunk_size': len(content)
                    }
                ))
        except Exception as e:
            log_message(f"Error loading {file_path}: {e}")

    if documents:
        avg_size = sum(d.metadata['chunk_size'] for d in documents) / len(documents)
        log_message(f"✓ Loaded {len(documents)} images (avg size: {avg_size:.0f} chars)")

    return documents
470
+
471
+
472
def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
    """Main loader - combines all document types.

    Loads and chunks text sections, pre-chunked tables and image
    descriptions, then concatenates them into one list.

    Args:
        repo_id: HuggingFace dataset repository id.
        hf_token: Access token for the repository.
        json_dir: Repo prefix for section JSON archives.
        table_dir: Repo prefix for table JSON files.
        image_dir: Repo prefix for image-description CSVs.

    Returns:
        list: All chunks/documents in order: text, tables, images.
    """
    divider = "=" * 60
    log_message(divider)
    log_message("STARTING DOCUMENT LOADING")
    log_message(divider)

    # Text sections need chunking; tables arrive pre-chunked and
    # images are small enough to stay whole.
    text_chunks = chunk_text_documents(load_json_documents(repo_id, hf_token, json_dir))
    table_chunks = load_table_documents(repo_id, hf_token, table_dir)
    image_docs = load_image_documents(repo_id, hf_token, image_dir)

    all_docs = text_chunks + table_chunks + image_docs

    log_message(divider)
    log_message(f"TOTAL DOCUMENTS: {len(all_docs)}")
    log_message(f" Text chunks: {len(text_chunks)}")
    log_message(f" Table chunks: {len(table_chunks)}")
    log_message(f" Images: {len(image_docs)}")
    log_message(divider)

    return all_docs