Commit 9160af0
Parent(s): 4775037
new documents_prep

Files changed:
- app.py +6 -1
- documents_prep.py +512 -332
app.py
CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 import os
 from llama_index.core import Settings
-from documents_prep import load_json_documents, load_table_data, load_image_data
+from documents_prep import load_json_documents, load_table_data, load_image_data, load_csv_chunks
 from utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
 from my_logging import log_message
 from index_retriever import create_vector_index, create_query_engine
@@ -127,6 +127,11 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
         json_documents, json_chunk_info = load_json_documents(repo_id, hf_token, json_files_dir, download_dir)
         all_documents.extend(json_documents)
         chunk_info.extend(json_chunk_info)
+    else:
+        if chunks_filename:
+            log_message("Загружаем данные из CSV")
+            csv_documents, chunks_df = load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir)
+            all_documents.extend(csv_documents)
 
     if table_data_dir:
         log_message("Добавляю табличные данные")
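The net effect of this hunk: when json_files_dir is set, initialize_system keeps building its document list from the JSON pipeline; otherwise it now falls back to a pre-chunked CSV through the newly imported load_csv_chunks. A rough sketch of that flow outside the Gradio app, assuming only the signatures visible in this commit (the repo id, token, directory and file names below are placeholders, not values from the commit):

from documents_prep import load_json_documents, load_table_data, load_image_data, load_csv_chunks

repo_id, hf_token, download_dir = "org/dataset", "hf_xxx", "./downloads"  # placeholder values
json_files_dir, chunks_filename = None, "chunks.csv"                      # no JSON dir -> CSV fallback

all_documents, chunk_info = [], []

if json_files_dir:
    docs, info = load_json_documents(repo_id, hf_token, json_files_dir, download_dir)
    all_documents.extend(docs)
    chunk_info.extend(info)
elif chunks_filename:
    csv_documents, chunks_df = load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir)
    all_documents.extend(csv_documents)

# Tables and image metadata are appended afterwards, as in app.py
all_documents.extend(load_table_data(repo_id, hf_token, "table_data"))    # placeholder directory
all_documents.extend(load_image_data(repo_id, hf_token, "image_data"))    # placeholder directory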
documents_prep.py
CHANGED
@@ -1,7 +1,7 @@
 import json
 import zipfile
 import pandas as pd
-from collections import Counter
 from huggingface_hub import hf_hub_download, list_repo_files
 from llama_index.core import Document
 from llama_index.core.text_splitter import SentenceSplitter
@@ -10,26 +10,25 @@ from config import CHUNK_SIZE, CHUNK_OVERLAP
 # ============================================================================
-# TEXT CHUNKING
 # ============================================================================
 
 def chunk_text_document(doc):
-    """Split text document into …
         chunk_size=CHUNK_SIZE,
         chunk_overlap=CHUNK_OVERLAP,
         separator=" "
     )
 
-    log_message(f" ✂️ Text split into {len(chunks)} chunks")
     chunked_docs = []
         chunk_metadata = doc.metadata.copy()
         chunk_metadata.update({
             "chunk_id": i,
-            "total_chunks": len( …
             "chunk_size": len(chunk_text)
         })
@@ -39,226 +38,265 @@ def chunk_text_document(doc):
 
 # ============================================================================
-# TABLE
 # ============================================================================
 
-def …
-    """ …
     table_num = doc.metadata.get('table_number', 'unknown')
     table_title = doc.metadata.get('table_title', 'unknown')
 
     lines = doc.text.strip().split('\n')
 
-    # Separate header
     data_rows = []
 
     for line in lines:
-        if 'Данные таблицы:' …
-        elif …
             data_rows.append(line)
-        elif not …
 
-    table_header = '\n'.join( …
 
     if not data_rows:
-        log_message(f" ⚠️ …
         return chunk_text_document(doc)
 
-    log_message(f" …
 
-    # …
     header_size = len(table_header)
-    available_size = CHUNK_SIZE - header_size - …
 
-    current_rows = []
     current_size = 0
 
     for row in data_rows:
-        row_size = len(row) + 1
 
-            chunk_text = table_header + '\n'.join( …
 
             # Keep last 2 rows for overlap
-            current_size = sum(len(r) + 1 for r in …
 
         current_size += row_size
 
-    # …
-    if …
-        chunk_text = table_header + '\n'.join( …
 
-    log_message(f" …
 
-    # Create …
     chunked_docs = []
         chunk_metadata = doc.metadata.copy()
         chunk_metadata.update({
             "chunk_id": i,
-            "total_chunks": len( …
             "chunk_size": len(chunk_text),
-            "is_chunked": True
         })
 
 
     return chunked_docs
 
 
-    doc_id = table_data.get('document_id' …
     table_num = table_data.get('table_number', 'Неизвестно')
     table_title = table_data.get('table_title', 'Неизвестно')
     section = table_data.get('section', 'Неизвестно')
 
 
-def …
-    """Load …
     log_message("=" * 60)
-    log_message(" …
     log_message("=" * 60)
 
-    )
 
-        log_message(f"\n📄 Processing: {file_path}")
 
-        with open(local_path, 'r', encoding='utf-8') as f:
-            data = json.load(f)
 
-        document_id = data.get('document', 'unknown')
 
-        # Process each table/sheet
-        sheets = data.get('sheets', [data]) if 'sheets' in data else [data]
 
-        for sheet in sorted(sheets, key=lambda x: x.get('table_number', '')):
-            # Skip empty tables
-            if not sheet.get('data'):
-                log_message(f" ⚠️ Skipping empty table {sheet.get('table_number')}")
-                continue
 
-            # Create table text
-            table_text = load_table_data(sheet)
-            table_size = len(table_text)
-            table_num = sheet.get('table_number', 'unknown')
 
-            # Create base document
-            doc = Document(
-                text=table_text,
-                metadata={
-                    "type": "table",
-                    "table_number": table_num,
-                    "table_title": sheet.get('table_title', 'unknown'),
-                    "document_id": document_id,
-                    "section": sheet.get('section', 'unknown'),
-                    "section_id": sheet.get('section', 'unknown'),
-                    "total_rows": len(sheet.get('data', [])),
-                    "content_size": table_size
-                }
             )
 
-            if table_size > CHUNK_SIZE:
-                log_message(f" 📊 Table {table_num}: {table_size} chars > {CHUNK_SIZE}, chunking...")
-                docs = chunk_table_document(doc)
-                stats[document_id]['chunked'] += 1
-            else:
-                log_message(f" ✓ Table {table_num}: {table_size} chars, keeping whole")
-                docs = [doc]
 
-    total_chunked = sum(s['chunked'] for s in stats.values())
-    log_message(f"Total table chunks: {total_tables}")
-    log_message(f"Large tables chunked: {total_chunked}")
 
-    for doc_id, doc_stats in sorted(stats.items()):
-        log_message(f" • {doc_id}: {doc_stats['count']} chunks, "
-                    f"{doc_stats['chunked']} tables split")
-    log_message("=" * 60)
 
-    return table_documents
 
 
 # ============================================================================
-# TEXT
 # ============================================================================
 
-def extract_section_title( …
-    """Extract …
-    if not …
         return ""
 
-    first_line = …
 
-    # If short and doesn't end with period, use as-is
     if len(first_line) < 200 and not first_line.endswith('.'):
         return first_line
 
-    # Otherwise extract first sentence
     sentences = first_line.split('.')
     if len(sentences) > 1:
         return sentences[0].strip()
@@ -266,8 +304,8 @@ def extract_section_title(text):
     return first_line[:100] + "..." if len(first_line) > 100 else first_line
 
 
-def …
-    """ …
     documents = []
 
     if 'sections' not in data:
@@ -278,6 +316,7 @@ def extract_sections_from_json(data, document_id, document_name):
             section_text = section.get('section_text', '')
 
             if section_text.strip():
                 doc = Document(
                     text=section_text,
                     metadata={
@@ -285,48 +324,32 @@ def extract_sections_from_json(data, document_id, document_name):
                         "document_id": document_id,
                         "document_name": document_name,
                         "section_id": section_id,
-                        " …
                         "level": "section"
                     }
                 )
                 documents.append(doc)
 
             # Process subsections recursively
-            if subsection_text.strip():
-                doc = Document(
-                    text=subsection_text,
-                    metadata={
-                        "type": "text",
-                        "document_id": document_id,
-                        "document_name": document_name,
-                        "section_id": subsection_id,
-                        "section_title": extract_section_title(subsection_text)[:200],
-                        "level": "subsection",
-                        "parent_section": section_id
-                    }
-                )
-                documents.append(doc)
 
-                # Process sub-subsections
-                for sub_subsection in subsection.get('sub_subsections', []):
-                    sub_subsection_id = sub_subsection.get('sub_subsection_id', 'Unknown')
-                    sub_subsection_text = sub_subsection.get('sub_subsection_text', '')
 
-                    if …
                     doc = Document(
-                        text= …
                         metadata={
                             "type": "text",
                             "document_id": document_id,
                             "document_name": document_name,
-                            "section_id": …
-                            " …
-                            " …
-                            " …
                         }
                     )
                     documents.append(doc)
@@ -335,159 +358,316 @@
 
 
 def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
-    """Load …
     log_message("=" * 60)
-    log_message(" …
     log_message("=" * 60)
 
-    with zipfile.ZipFile(local_zip, 'r') as zip_ref:
-        json_in_zip = [f for f in zip_ref.namelist()
-                       if f.endswith('.json') and not f.startswith('__MACOSX')]
 
-            local_path = hf_hub_download(
-                repo_id=repo_id,
-                filename=json_path,
-                local_dir=download_dir,
-                repo_type="dataset",
-                token=hf_token
-            )
 
-            with open(local_path, 'r', encoding='utf-8') as f:
-                data = json.load(f)
 
-            metadata = data.get('document_metadata', {})
-            doc_id = metadata.get('document_id', 'unknown')
-            doc_name = metadata.get('document_name', 'unknown')
 
-            docs = extract_sections_from_json(data, doc_id, doc_name)
-            all_documents.extend(docs)
-            log_message(f" ✓ Extracted {len(docs)} sections")
 
 
 # ============================================================================
-# IMAGE DATA
 # ============================================================================
 
 def load_image_data(repo_id, hf_token, image_data_dir):
     """Load image metadata from CSV files"""
     log_message("=" * 60)
-    log_message(" …
     log_message("=" * 60)
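The rewritten documents_prep.py follows. Its main behavioural change is table handling: chunk_table_by_rows keeps the table header in every chunk, carries the last two data rows over as overlap, and prefixes each chunk with key terms produced by extract_table_metadata. A toy illustration of that header-preserving, two-row-overlap grouping (not the committed code; the size budget below is made up for brevity):

# Minimal sketch of the row-grouping idea used by chunk_table_by_rows below.
header = "Таблица: 1\nДанные таблицы:\n"
rows = [f"Строка {i}: значение {i}" for i in range(1, 7)]
limit = 80  # stand-in for CHUNK_SIZE minus header and enrichment overhead

chunks, current = [], []
for row in rows:
    if sum(len(r) + 1 for r in current) + len(row) > limit and current:
        chunks.append(header + "\n".join(current))
        current = current[-2:]            # keep last 2 rows for overlap
    current.append(row)
if current:
    chunks.append(header + "\n".join(current))

# Each chunk repeats the header, and consecutive chunks share two rows.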
| 1 |
import json
|
| 2 |
import zipfile
|
| 3 |
import pandas as pd
|
| 4 |
+
from collections import Counter
|
| 5 |
from huggingface_hub import hf_hub_download, list_repo_files
|
| 6 |
from llama_index.core import Document
|
| 7 |
from llama_index.core.text_splitter import SentenceSplitter
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
# ============================================================================
|
| 13 |
+
# TEXT CHUNKING
|
| 14 |
# ============================================================================
|
| 15 |
|
| 16 |
def chunk_text_document(doc):
|
| 17 |
+
"""Split text document into chunks using sentence splitter"""
|
| 18 |
+
text_splitter = SentenceSplitter(
|
| 19 |
chunk_size=CHUNK_SIZE,
|
| 20 |
chunk_overlap=CHUNK_OVERLAP,
|
| 21 |
separator=" "
|
| 22 |
)
|
| 23 |
|
| 24 |
+
text_chunks = text_splitter.split_text(doc.text)
|
|
|
|
|
|
|
| 25 |
chunked_docs = []
|
| 26 |
+
|
| 27 |
+
for i, chunk_text in enumerate(text_chunks):
|
| 28 |
chunk_metadata = doc.metadata.copy()
|
| 29 |
chunk_metadata.update({
|
| 30 |
"chunk_id": i,
|
| 31 |
+
"total_chunks": len(text_chunks),
|
| 32 |
"chunk_size": len(chunk_text)
|
| 33 |
})
|
| 34 |
|
|
|
|
| 38 |
|
| 39 |
|
| 40 |
# ============================================================================
|
| 41 |
+
# TABLE PROCESSING
|
| 42 |
# ============================================================================
|
| 43 |
|
| 44 |
+
def extract_table_metadata(table_text):
|
| 45 |
+
"""Extract key terms from table for enrichment"""
|
| 46 |
+
words = table_text.split()
|
| 47 |
+
|
| 48 |
+
# Filter stopwords and short words
|
| 49 |
+
stopwords = {"и", "в", "на", "по", "с", "для", "из", "при", "а", "как", "или", "но", "к", "от"}
|
| 50 |
+
filtered = [w for w in words if len(w) > 3 and w.lower() not in stopwords]
|
| 51 |
+
|
| 52 |
+
# Get top 15 most common terms
|
| 53 |
+
common = Counter(filtered).most_common(15)
|
| 54 |
+
key_terms = [w for w, _ in common]
|
| 55 |
+
|
| 56 |
+
return {
|
| 57 |
+
"summary": f"Таблица содержит {len(words)} слов",
|
| 58 |
+
"key_terms": key_terms
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def create_table_content(table_data):
|
| 63 |
+
"""Format table data as text"""
|
| 64 |
+
doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
|
| 65 |
+
table_num = table_data.get('table_number', 'Неизвестно')
|
| 66 |
+
table_title = table_data.get('table_title', 'Неизвестно')
|
| 67 |
+
section = table_data.get('section', 'Неизвестно')
|
| 68 |
+
|
| 69 |
+
content = f"Таблица: {table_num}\n"
|
| 70 |
+
content += f"Название: {table_title}\n"
|
| 71 |
+
content += f"Документ: {doc_id}\n"
|
| 72 |
+
content += f"Раздел: {section}\n"
|
| 73 |
+
|
| 74 |
+
# Add headers
|
| 75 |
+
headers = table_data.get('headers', [])
|
| 76 |
+
if headers:
|
| 77 |
+
content += f"\nЗаголовки: {' | '.join(headers)}\n"
|
| 78 |
+
|
| 79 |
+
# Add data rows
|
| 80 |
+
if 'data' in table_data and isinstance(table_data['data'], list):
|
| 81 |
+
content += "\nДанные таблицы:\n"
|
| 82 |
+
for row_idx, row in enumerate(table_data['data'], start=1):
|
| 83 |
+
if isinstance(row, dict):
|
| 84 |
+
row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
|
| 85 |
+
content += f"Строка {row_idx}: {row_text}\n"
|
| 86 |
+
|
| 87 |
+
return content
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def chunk_table_by_rows(doc):
|
| 91 |
+
"""Split large table into chunks by rows, preserving headers"""
|
| 92 |
+
# Extract metadata
|
| 93 |
+
table_metadata = extract_table_metadata(doc.text)
|
| 94 |
table_num = doc.metadata.get('table_number', 'unknown')
|
| 95 |
table_title = doc.metadata.get('table_title', 'unknown')
|
| 96 |
|
| 97 |
+
# Parse table structure
|
| 98 |
lines = doc.text.strip().split('\n')
|
| 99 |
|
| 100 |
+
# Separate header and data rows
|
| 101 |
+
table_header_lines = []
|
| 102 |
data_rows = []
|
| 103 |
+
in_data = False
|
| 104 |
|
| 105 |
for line in lines:
|
| 106 |
+
if line.startswith('Данные таблицы:'):
|
| 107 |
+
in_data = True
|
| 108 |
+
table_header_lines.append(line)
|
| 109 |
+
elif in_data and line.startswith('Строка'):
|
| 110 |
data_rows.append(line)
|
| 111 |
+
elif not in_data:
|
| 112 |
+
table_header_lines.append(line)
|
| 113 |
|
| 114 |
+
table_header = '\n'.join(table_header_lines) + '\n'
|
| 115 |
|
| 116 |
+
# If no rows, use standard text splitting
|
| 117 |
if not data_rows:
|
| 118 |
+
log_message(f" ⚠️ Таблица {table_num}: нет строк данных, использую стандартное разбиение")
|
| 119 |
return chunk_text_document(doc)
|
| 120 |
|
| 121 |
+
log_message(f" 📋 Таблица {table_num}: найдено {len(data_rows)} строк данных")
|
| 122 |
|
| 123 |
+
# Row-based chunking
|
| 124 |
header_size = len(table_header)
|
| 125 |
+
available_size = CHUNK_SIZE - header_size - 300 # Reserve space for enrichment
|
| 126 |
|
| 127 |
+
text_chunks = []
|
| 128 |
+
current_chunk_rows = []
|
|
|
|
| 129 |
current_size = 0
|
| 130 |
|
| 131 |
for row in data_rows:
|
| 132 |
+
row_size = len(row) + 1
|
| 133 |
|
| 134 |
+
# If adding this row exceeds limit, create chunk
|
| 135 |
+
if current_size + row_size > available_size and current_chunk_rows:
|
| 136 |
+
chunk_text = table_header + '\n'.join(current_chunk_rows)
|
| 137 |
+
text_chunks.append(chunk_text)
|
| 138 |
+
log_message(f" ✂️ Создан чанк: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
|
| 139 |
|
| 140 |
# Keep last 2 rows for overlap
|
| 141 |
+
overlap_count = min(2, len(current_chunk_rows))
|
| 142 |
+
current_chunk_rows = current_chunk_rows[-overlap_count:]
|
| 143 |
+
current_size = sum(len(r) + 1 for r in current_chunk_rows)
|
| 144 |
|
| 145 |
+
current_chunk_rows.append(row)
|
| 146 |
current_size += row_size
|
| 147 |
|
| 148 |
+
# Final chunk
|
| 149 |
+
if current_chunk_rows:
|
| 150 |
+
chunk_text = table_header + '\n'.join(current_chunk_rows)
|
| 151 |
+
text_chunks.append(chunk_text)
|
| 152 |
+
log_message(f" ✂️ Последний чанк: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
|
| 153 |
|
| 154 |
+
log_message(f" 📊 Таблица {table_num} разделена на {len(text_chunks)} чанков")
|
| 155 |
|
| 156 |
+
# Create enriched chunks with metadata
|
| 157 |
chunked_docs = []
|
| 158 |
+
key_terms = table_metadata.get("key_terms", [])
|
| 159 |
+
|
| 160 |
+
for i, chunk_text in enumerate(text_chunks):
|
| 161 |
chunk_metadata = doc.metadata.copy()
|
| 162 |
chunk_metadata.update({
|
| 163 |
"chunk_id": i,
|
| 164 |
+
"total_chunks": len(text_chunks),
|
| 165 |
"chunk_size": len(chunk_text),
|
| 166 |
+
"is_chunked": True,
|
| 167 |
+
"key_terms": key_terms
|
| 168 |
})
|
| 169 |
|
| 170 |
+
# Add enrichment prefix
|
| 171 |
+
terms_str = ', '.join(key_terms[:10]) if key_terms else 'нет'
|
| 172 |
+
enriched_text = f"""[Таблица {table_num}: {table_title}]
|
| 173 |
+
[Ключевые термины: {terms_str}]
|
| 174 |
+
|
| 175 |
+
{chunk_text}"""
|
| 176 |
+
|
| 177 |
+
chunked_docs.append(Document(text=enriched_text, metadata=chunk_metadata))
|
| 178 |
|
| 179 |
return chunked_docs
|
| 180 |
|
| 181 |
|
| 182 |
+
def table_to_document(table_data, document_id=None):
|
| 183 |
+
"""Convert table data to Document, chunking if needed"""
|
| 184 |
+
if not isinstance(table_data, dict):
|
| 185 |
+
log_message(f"⚠️ ПРОПУЩЕНА: table_data не является словарем")
|
| 186 |
+
return []
|
| 187 |
+
|
| 188 |
+
doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
|
| 189 |
table_num = table_data.get('table_number', 'Неизвестно')
|
| 190 |
table_title = table_data.get('table_title', 'Неизвестно')
|
| 191 |
section = table_data.get('section', 'Неизвестно')
|
| 192 |
|
| 193 |
+
table_rows = table_data.get('data', [])
|
| 194 |
+
if not table_rows:
|
| 195 |
+
log_message(f"⚠️ ПРОПУЩЕНА: Таблица {table_num} - нет данных")
|
| 196 |
+
return []
|
| 197 |
+
|
| 198 |
+
content = create_table_content(table_data)
|
| 199 |
+
content_size = len(content)
|
| 200 |
+
|
| 201 |
+
base_doc = Document(
|
| 202 |
+
text=content,
|
| 203 |
+
metadata={
|
| 204 |
+
"type": "table",
|
| 205 |
+
"table_number": table_num,
|
| 206 |
+
"table_title": table_title,
|
| 207 |
+
"document_id": doc_id,
|
| 208 |
+
"section": section,
|
| 209 |
+
"section_id": section,
|
| 210 |
+
"total_rows": len(table_rows),
|
| 211 |
+
"content_size": content_size
|
| 212 |
+
}
|
| 213 |
+
)
|
| 214 |
|
| 215 |
+
# Chunk if needed
|
| 216 |
+
if content_size > CHUNK_SIZE:
|
| 217 |
+
log_message(f"📊 CHUNKING: Таблица {table_num} | Размер: {content_size} > {CHUNK_SIZE}")
|
| 218 |
+
return chunk_table_by_rows(base_doc)
|
| 219 |
+
else:
|
| 220 |
+
log_message(f"✓ Таблица {table_num} | Размер: {content_size} символов | Строк: {len(table_rows)}")
|
| 221 |
+
return [base_doc]
|
| 222 |
|
| 223 |
|
| 224 |
+
def load_table_data(repo_id, hf_token, table_data_dir):
|
| 225 |
+
"""Load all table data from HuggingFace repo"""
|
| 226 |
log_message("=" * 60)
|
| 227 |
+
log_message("ЗАГРУЗКА ТАБЛИЧНЫХ ДАННЫХ")
|
| 228 |
log_message("=" * 60)
|
| 229 |
|
| 230 |
+
try:
|
| 231 |
+
files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
|
| 232 |
+
table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
|
| 233 |
+
|
| 234 |
+
log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
|
| 235 |
+
|
| 236 |
+
table_documents = []
|
| 237 |
+
|
| 238 |
+
for file_path in table_files:
|
| 239 |
+
try:
|
| 240 |
+
local_path = hf_hub_download(
|
| 241 |
+
repo_id=repo_id,
|
| 242 |
+
filename=file_path,
|
| 243 |
+
local_dir='',
|
| 244 |
+
repo_type="dataset",
|
| 245 |
+
token=hf_token
|
|
|
|
|
| 246 |
)
|
| 247 |
|
| 248 |
+
log_message(f"\nОбработка файла: {file_path}")
|
|
|
|
|
| 249 |
|
| 250 |
+
with open(local_path, 'r', encoding='utf-8') as f:
|
| 251 |
+
table_data = json.load(f)
|
| 252 |
+
|
| 253 |
+
if isinstance(table_data, dict):
|
| 254 |
+
document_id = table_data.get('document', 'unknown')
|
| 255 |
+
|
| 256 |
+
# Process sheets if present
|
| 257 |
+
if 'sheets' in table_data:
|
| 258 |
+
sorted_sheets = sorted(
|
| 259 |
+
table_data['sheets'],
|
| 260 |
+
key=lambda sheet: sheet.get('table_number', '')
|
| 261 |
+
)
|
| 262 |
+
|
| 263 |
+
for sheet in sorted_sheets:
|
| 264 |
+
sheet['document'] = document_id
|
| 265 |
+
docs_list = table_to_document(sheet, document_id)
|
| 266 |
+
table_documents.extend(docs_list)
|
| 267 |
+
else:
|
| 268 |
+
docs_list = table_to_document(table_data, document_id)
|
| 269 |
+
table_documents.extend(docs_list)
|
| 270 |
+
|
| 271 |
+
except Exception as e:
|
| 272 |
+
log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
|
| 273 |
+
continue
|
| 274 |
|
| 275 |
+
log_message(f"\n{'='*60}")
|
| 276 |
+
log_message(f"Загружено {len(table_documents)} табличных документов")
|
| 277 |
+
log_message("=" * 60)
|
| 278 |
+
|
| 279 |
+
return table_documents
|
| 280 |
+
|
| 281 |
+
except Exception as e:
|
| 282 |
+
log_message(f"❌ ОШИБКА загрузки таблиц: {str(e)}")
|
| 283 |
+
return []
|
|
|
|
|
| 284 |
|
| 285 |
|
| 286 |
# ============================================================================
|
| 287 |
+
# JSON TEXT DOCUMENTS
|
| 288 |
# ============================================================================
|
| 289 |
|
| 290 |
+
def extract_section_title(section_text):
|
| 291 |
+
"""Extract clean title from section text"""
|
| 292 |
+
if not section_text.strip():
|
| 293 |
return ""
|
| 294 |
|
| 295 |
+
first_line = section_text.strip().split('\n')[0].strip()
|
| 296 |
|
|
|
|
| 297 |
if len(first_line) < 200 and not first_line.endswith('.'):
|
| 298 |
return first_line
|
| 299 |
|
|
|
|
| 300 |
sentences = first_line.split('.')
|
| 301 |
if len(sentences) > 1:
|
| 302 |
return sentences[0].strip()
|
|
|
|
| 304 |
return first_line[:100] + "..." if len(first_line) > 100 else first_line
|
| 305 |
|
| 306 |
|
| 307 |
+
def extract_text_from_json(data, document_id, document_name):
|
| 308 |
+
"""Extract text documents from JSON structure"""
|
| 309 |
documents = []
|
| 310 |
|
| 311 |
if 'sections' not in data:
|
|
|
|
| 316 |
section_text = section.get('section_text', '')
|
| 317 |
|
| 318 |
if section_text.strip():
|
| 319 |
+
section_title = extract_section_title(section_text)
|
| 320 |
doc = Document(
|
| 321 |
text=section_text,
|
| 322 |
metadata={
|
|
|
|
| 324 |
"document_id": document_id,
|
| 325 |
"document_name": document_name,
|
| 326 |
"section_id": section_id,
|
| 327 |
+
"section_text": section_title[:200],
|
| 328 |
+
"section_path": section_id,
|
| 329 |
"level": "section"
|
| 330 |
}
|
| 331 |
)
|
| 332 |
documents.append(doc)
|
| 333 |
|
| 334 |
# Process subsections recursively
|
| 335 |
+
if 'subsections' in section:
|
| 336 |
+
for subsection in section['subsections']:
|
| 337 |
+
subsection_id = subsection.get('subsection_id', 'Unknown')
|
| 338 |
+
subsection_text = subsection.get('subsection_text', '')
|
|
|
|
|
| 339 |
|
| 340 |
+
if subsection_text.strip():
|
| 341 |
+
subsection_title = extract_section_title(subsection_text)
|
| 342 |
doc = Document(
|
| 343 |
+
text=subsection_text,
|
| 344 |
metadata={
|
| 345 |
"type": "text",
|
| 346 |
"document_id": document_id,
|
| 347 |
"document_name": document_name,
|
| 348 |
+
"section_id": subsection_id,
|
| 349 |
+
"section_text": subsection_title[:200],
|
| 350 |
+
"section_path": f"{section_id}.{subsection_id}",
|
| 351 |
+
"level": "subsection",
|
| 352 |
+
"parent_section": section_id
|
| 353 |
}
|
| 354 |
)
|
| 355 |
documents.append(doc)
|
|
|
|
| 358 |
|
| 359 |
|
| 360 |
def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
|
| 361 |
+
"""Load JSON documents from HuggingFace repo"""
|
| 362 |
log_message("=" * 60)
|
| 363 |
+
log_message("ЗАГРУЗКА JSON ДОКУМЕНТОВ")
|
| 364 |
log_message("=" * 60)
|
| 365 |
|
| 366 |
+
try:
|
| 367 |
+
files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
|
| 368 |
+
zip_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.zip')]
|
| 369 |
+
json_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.json')]
|
| 370 |
+
|
| 371 |
+
log_message(f"Найдено {len(zip_files)} ZIP файлов и {len(json_files)} JSON файлов")
|
| 372 |
+
|
| 373 |
+
all_documents = []
|
| 374 |
+
|
| 375 |
+
# Process ZIP files
|
| 376 |
+
for zip_file_path in zip_files:
|
| 377 |
+
try:
|
| 378 |
+
log_message(f"Загружаю ZIP: {zip_file_path}")
|
| 379 |
+
local_zip_path = hf_hub_download(
|
| 380 |
+
repo_id=repo_id,
|
| 381 |
+
filename=zip_file_path,
|
| 382 |
+
local_dir=download_dir,
|
| 383 |
+
repo_type="dataset",
|
| 384 |
+
token=hf_token
|
| 385 |
+
)
|
|
|
|
|
| 386 |
|
| 387 |
+
with zipfile.ZipFile(local_zip_path, 'r') as zip_ref:
|
| 388 |
+
json_files_in_zip = [f for f in zip_ref.namelist()
|
| 389 |
+
if f.endswith('.json') and not f.startswith('__MACOSX')]
|
| 390 |
|
| 391 |
+
for json_file in json_files_in_zip:
|
| 392 |
+
with zip_ref.open(json_file) as f:
|
| 393 |
+
json_data = json.load(f)
|
| 394 |
+
|
| 395 |
+
metadata = json_data.get('document_metadata', {})
|
| 396 |
+
doc_id = metadata.get('document_id', 'unknown')
|
| 397 |
+
doc_name = metadata.get('document_name', 'unknown')
|
| 398 |
+
|
| 399 |
+
docs = extract_text_from_json(json_data, doc_id, doc_name)
|
| 400 |
+
all_documents.extend(docs)
|
| 401 |
+
|
| 402 |
+
log_message(f"Извлечено документов из ZIP: {len(all_documents)}")
|
| 403 |
+
|
| 404 |
+
except Exception as e:
|
| 405 |
+
log_message(f"❌ ОШИБКА ZIP {zip_file_path}: {str(e)}")
|
| 406 |
+
continue
|
|
|
|
|
| 407 |
|
| 408 |
+
# Process direct JSON files
|
| 409 |
+
for file_path in json_files:
|
| 410 |
+
try:
|
| 411 |
+
local_path = hf_hub_download(
|
| 412 |
+
repo_id=repo_id,
|
| 413 |
+
filename=file_path,
|
| 414 |
+
local_dir=download_dir,
|
| 415 |
+
repo_type="dataset",
|
| 416 |
+
token=hf_token
|
| 417 |
+
)
|
| 418 |
+
|
| 419 |
+
with open(local_path, 'r', encoding='utf-8') as f:
|
| 420 |
+
json_data = json.load(f)
|
| 421 |
+
|
| 422 |
+
metadata = json_data.get('document_metadata', {})
|
| 423 |
+
doc_id = metadata.get('document_id', 'unknown')
|
| 424 |
+
doc_name = metadata.get('document_name', 'unknown')
|
| 425 |
+
|
| 426 |
+
docs = extract_text_from_json(json_data, doc_id, doc_name)
|
| 427 |
+
all_documents.extend(docs)
|
| 428 |
+
|
| 429 |
+
except Exception as e:
|
| 430 |
+
log_message(f"❌ ОШИБКА JSON {file_path}: {str(e)}")
|
| 431 |
+
continue
|
| 432 |
+
|
| 433 |
+
log_message(f"Всего загружено {len(all_documents)} текстовых документов")
|
| 434 |
+
|
| 435 |
+
# Chunk all documents
|
| 436 |
+
chunked_documents, chunk_info = process_documents_with_chunking(all_documents)
|
| 437 |
+
|
| 438 |
+
log_message(f"После chunking: {len(chunked_documents)} чанков")
|
| 439 |
+
log_message("=" * 60)
|
| 440 |
+
|
| 441 |
+
return chunked_documents, chunk_info
|
| 442 |
+
|
| 443 |
+
except Exception as e:
|
| 444 |
+
log_message(f"❌ ОШИБКА загрузки JSON: {str(e)}")
|
| 445 |
+
return [], []
|
| 446 |
|
| 447 |
|
| 448 |
# ============================================================================
|
| 449 |
+
# IMAGE DATA
|
| 450 |
# ============================================================================
|
| 451 |
|
| 452 |
def load_image_data(repo_id, hf_token, image_data_dir):
|
| 453 |
"""Load image metadata from CSV files"""
|
| 454 |
log_message("=" * 60)
|
| 455 |
+
log_message("ЗАГРУЗКА ДАННЫХ ИЗОБРАЖЕНИЙ")
|
| 456 |
log_message("=" * 60)
|
| 457 |
|
| 458 |
+
try:
|
| 459 |
+
files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
|
| 460 |
+
image_files = [f for f in files if f.startswith(image_data_dir) and f.endswith('.csv')]
|
| 461 |
+
|
| 462 |
+
log_message(f"Найдено {len(image_files)} CSV файлов с изображениями")
|
| 463 |
+
|
| 464 |
+
image_documents = []
|
| 465 |
+
|
| 466 |
+
for file_path in image_files:
|
| 467 |
+
try:
|
| 468 |
+
local_path = hf_hub_download(
|
| 469 |
+
repo_id=repo_id,
|
| 470 |
+
filename=file_path,
|
| 471 |
+
local_dir='',
|
| 472 |
+
repo_type="dataset",
|
| 473 |
+
token=hf_token
|
| 474 |
+
)
|
| 475 |
+
|
| 476 |
+
df = pd.read_csv(local_path)
|
| 477 |
+
log_message(f"Загружено {len(df)} изображений из {file_path}")
|
| 478 |
+
|
| 479 |
+
for _, row in df.iterrows():
|
| 480 |
+
content = f"Изображение: {row.get('№ Изображения', 'Неизвестно')}\n"
|
| 481 |
+
content += f"Название: {row.get('Название изображения', 'Неизвестно')}\n"
|
| 482 |
+
content += f"Описание: {row.get('Описание изображение', 'Неизвестно')}\n"
|
| 483 |
+
content += f"Документ: {row.get('Обозначение документа', 'Неизвестно')}\n"
|
| 484 |
+
content += f"Раздел: {row.get('Раздел документа', 'Неизвестно')}\n"
|
| 485 |
+
|
| 486 |
+
doc = Document(
|
| 487 |
+
text=content,
|
| 488 |
+
metadata={
|
| 489 |
+
"type": "image",
|
| 490 |
+
"image_number": str(row.get('№ Изображения', 'unknown')),
|
| 491 |
+
"image_title": str(row.get('Название изображения', 'unknown')),
|
| 492 |
+
"document_id": str(row.get('Обозначение документа', 'unknown')),
|
| 493 |
+
"section": str(row.get('Раздел документа', 'unknown'))
|
| 494 |
+
}
|
| 495 |
+
)
|
| 496 |
+
image_documents.append(doc)
|
| 497 |
+
|
| 498 |
+
except Exception as e:
|
| 499 |
+
log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
|
| 500 |
+
continue
|
| 501 |
+
|
| 502 |
+
log_message(f"Загружено {len(image_documents)} документов изображений")
|
| 503 |
+
log_message("=" * 60)
|
| 504 |
+
|
| 505 |
+
return image_documents
|
| 506 |
+
|
| 507 |
+
except Exception as e:
|
| 508 |
+
log_message(f"❌ ОШИБКА загрузки изображений: {str(e)}")
|
| 509 |
+
return []
|
| 510 |
+
|
| 511 |
+
|
| 512 |
+
# ============================================================================
|
| 513 |
+
# DOCUMENT PROCESSING WITH CHUNKING
|
| 514 |
+
# ============================================================================
|
| 515 |
+
|
| 516 |
+
def process_documents_with_chunking(documents):
|
| 517 |
+
"""Process all documents and chunk if needed"""
|
| 518 |
+
all_chunked_docs = []
|
| 519 |
+
chunk_info = []
|
| 520 |
+
|
| 521 |
+
stats = {
|
| 522 |
+
'text_chunks': 0,
|
| 523 |
+
'table_whole': 0,
|
| 524 |
+
'table_chunks': 0,
|
| 525 |
+
'image_whole': 0,
|
| 526 |
+
'image_chunks': 0
|
| 527 |
+
}
|
| 528 |
+
|
| 529 |
+
for doc in documents:
|
| 530 |
+
doc_type = doc.metadata.get('type', 'text')
|
| 531 |
+
is_already_chunked = doc.metadata.get('is_chunked', False)
|
| 532 |
+
doc_size = len(doc.text)
|
| 533 |
+
|
| 534 |
+
# Tables - already chunked or whole
|
| 535 |
+
if doc_type == 'table':
|
| 536 |
+
if is_already_chunked:
|
| 537 |
+
stats['table_chunks'] += 1
|
| 538 |
+
else:
|
| 539 |
+
stats['table_whole'] += 1
|
| 540 |
|
| 541 |
+
all_chunked_docs.append(doc)
|
| 542 |
+
chunk_info.append({
|
| 543 |
+
'document_id': doc.metadata.get('document_id', 'unknown'),
|
| 544 |
+
'section_id': doc.metadata.get('section_id', 'unknown'),
|
| 545 |
+
'chunk_id': doc.metadata.get('chunk_id', 0),
|
| 546 |
+
'total_chunks': doc.metadata.get('total_chunks', 1),
|
| 547 |
+
'chunk_size': doc_size,
|
| 548 |
+
'chunk_preview': doc.text[:200] + "..." if doc_size > 200 else doc.text,
|
| 549 |
+
'type': 'table',
|
| 550 |
+
'table_number': doc.metadata.get('table_number', 'unknown')
|
| 551 |
+
})
|
| 552 |
+
|
| 553 |
+
# Images - chunk if too large
|
| 554 |
+
elif doc_type == 'image':
|
| 555 |
+
if doc_size > CHUNK_SIZE:
|
| 556 |
+
log_message(f"📷 CHUNKING: Изображение {doc.metadata.get('image_number')} | Размер: {doc_size}")
|
| 557 |
+
chunked_docs = chunk_text_document(doc)
|
| 558 |
+
stats['image_chunks'] += len(chunked_docs)
|
| 559 |
+
all_chunked_docs.extend(chunked_docs)
|
| 560 |
|
| 561 |
+
for i, chunk_doc in enumerate(chunked_docs):
|
| 562 |
+
chunk_info.append({
|
| 563 |
+
'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
|
| 564 |
+
'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
|
| 565 |
+
'chunk_id': i,
|
| 566 |
+
'chunk_size': len(chunk_doc.text),
|
| 567 |
+
'chunk_preview': chunk_doc.text[:200] + "...",
|
| 568 |
+
'type': 'image',
|
| 569 |
+
'image_number': chunk_doc.metadata.get('image_number', 'unknown')
|
| 570 |
+
})
|
| 571 |
+
else:
|
| 572 |
+
stats['image_whole'] += 1
|
| 573 |
+
all_chunked_docs.append(doc)
|
| 574 |
+
chunk_info.append({
|
| 575 |
+
'document_id': doc.metadata.get('document_id', 'unknown'),
|
| 576 |
+
'section_id': doc.metadata.get('section_id', 'unknown'),
|
| 577 |
+
'chunk_id': 0,
|
| 578 |
+
'chunk_size': doc_size,
|
| 579 |
+
'chunk_preview': doc.text[:200] + "...",
|
| 580 |
+
'type': 'image',
|
| 581 |
+
'image_number': doc.metadata.get('image_number', 'unknown')
|
| 582 |
+
})
|
| 583 |
|
| 584 |
+
# Text - chunk if too large
|
| 585 |
+
else:
|
| 586 |
+
if doc_size > CHUNK_SIZE:
|
| 587 |
+
log_message(f"📝 CHUNKING: Текст '{doc.metadata.get('document_id')}' | Размер: {doc_size}")
|
| 588 |
+
chunked_docs = chunk_text_document(doc)
|
| 589 |
+
stats['text_chunks'] += len(chunked_docs)
|
| 590 |
+
all_chunked_docs.extend(chunked_docs)
|
| 591 |
+
|
| 592 |
+
for i, chunk_doc in enumerate(chunked_docs):
|
| 593 |
+
chunk_info.append({
|
| 594 |
+
'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
|
| 595 |
+
'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
|
| 596 |
+
'chunk_id': i,
|
| 597 |
+
'chunk_size': len(chunk_doc.text),
|
| 598 |
+
'chunk_preview': chunk_doc.text[:200] + "...",
|
| 599 |
+
'type': 'text'
|
| 600 |
+
})
|
| 601 |
+
else:
|
| 602 |
+
all_chunked_docs.append(doc)
|
| 603 |
+
chunk_info.append({
|
| 604 |
+
'document_id': doc.metadata.get('document_id', 'unknown'),
|
| 605 |
+
'section_id': doc.metadata.get('section_id', 'unknown'),
|
| 606 |
+
'chunk_id': 0,
|
| 607 |
+
'chunk_size': doc_size,
|
| 608 |
+
'chunk_preview': doc.text[:200] + "...",
|
| 609 |
+
'type': 'text'
|
| 610 |
+
})
|
| 611 |
+
|
| 612 |
+
# Log summary
|
| 613 |
+
log_message(f"\n{'='*60}")
|
| 614 |
+
log_message("ИТОГОВАЯ СТАТИСТИКА:")
|
| 615 |
+
log_message(f" • Текстовые чанки: {stats['text_chunks']}")
|
| 616 |
+
log_message(f" • Таблицы (целые): {stats['table_whole']}")
|
| 617 |
+
log_message(f" • Таблицы (чанки): {stats['table_chunks']}")
|
| 618 |
+
log_message(f" • Изображения (целые): {stats['image_whole']}")
|
| 619 |
+
log_message(f" • Изображения (чанки): {stats['image_chunks']}")
|
| 620 |
+
log_message(f" • ВСЕГО ДОКУМЕНТОВ: {len(all_chunked_docs)}")
|
| 621 |
+
log_message(f"{'='*60}\n")
|
| 622 |
+
|
| 623 |
+
return all_chunked_docs, chunk_info
|
| 624 |
+
|
| 625 |
+
|
| 626 |
+
# ============================================================================
|
| 627 |
+
# CSV CHUNKS (Legacy support)
|
| 628 |
+
# ============================================================================
|
| 629 |
+
|
| 630 |
+
def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
|
| 631 |
+
"""Load pre-chunked data from CSV (legacy support)"""
|
| 632 |
+
log_message("Загрузка данных из CSV")
|
| 633 |
+
|
| 634 |
+
try:
|
| 635 |
+
chunks_csv_path = hf_hub_download(
|
| 636 |
+
repo_id=repo_id,
|
| 637 |
+
filename=chunks_filename,
|
| 638 |
+
local_dir=download_dir,
|
| 639 |
+
repo_type="dataset",
|
| 640 |
+
token=hf_token
|
| 641 |
+
)
|
| 642 |
+
|
| 643 |
+
chunks_df = pd.read_csv(chunks_csv_path)
|
| 644 |
+
log_message(f"Загружено {len(chunks_df)} чанков из CSV")
|
| 645 |
+
|
| 646 |
+
# Find text column
|
| 647 |
+
text_column = None
|
| 648 |
+
for col in chunks_df.columns:
|
| 649 |
+
if any(keyword in col.lower() for keyword in ['text', 'content', 'chunk']):
|
| 650 |
+
text_column = col
|
| 651 |
+
break
|
| 652 |
+
|
| 653 |
+
if text_column is None:
|
| 654 |
+
text_column = chunks_df.columns[0]
|
| 655 |
+
|
| 656 |
+
documents = []
|
| 657 |
+
for i, (_, row) in enumerate(chunks_df.iterrows()):
|
| 658 |
+
doc = Document(
|
| 659 |
+
text=str(row[text_column]),
|
| 660 |
+
metadata={
|
| 661 |
+
"chunk_id": row.get('chunk_id', i),
|
| 662 |
+
"document_id": row.get('document_id', 'unknown'),
|
| 663 |
+
"type": "text"
|
| 664 |
+
}
|
| 665 |
+
)
|
| 666 |
+
documents.append(doc)
|
| 667 |
+
|
| 668 |
+
log_message(f"Создано {len(documents)} документов из CSV")
|
| 669 |
+
return documents, chunks_df
|
| 670 |
+
|
| 671 |
+
except Exception as e:
|
| 672 |
+
log_message(f"❌ ОШИБКА загрузки CSV: {str(e)}")
|
| 673 |
+
return [], None
|