Commit a33029f · Parent(s): 5fc122f
new documents_prep
documents_prep.py CHANGED (+442 -247)

@@ -1,298 +1,493 @@
Old version (lines 1-298; removed lines are marked with "-", bare "-" stands for removed lines whose content is not shown):

 import json
 import zipfile
-import os
 import pandas as pd
 from huggingface_hub import hf_hub_download, list_repo_files
 from llama_index.core import Document
 from llama_index.core.text_splitter import SentenceSplitter
 from my_logging import log_message
 from config import CHUNK_SIZE, CHUNK_OVERLAP

-
         chunk_size=CHUNK_SIZE,
-        chunk_overlap=CHUNK_OVERLAP
     )

-
-        metadata = doc.metadata.copy()
-        chunk_info.append({
-            'document_id': metadata.get('document_id', 'unknown'),
-            'section_id': metadata.get('section_id', 'unknown'),
-            'section_path': metadata.get('section_path', ''),
-            'section_text': metadata.get('section_text', ''),
-            'parent_section': metadata.get('parent_section', ''),
-            'parent_title': metadata.get('parent_title', ''),
-            'level': metadata.get('level', ''),
-            'chunk_text': chunk.text,
-            'type': metadata.get('type', 'text'),
-            'table_number': metadata.get('table_number', ''),
-            'image_number': metadata.get('image_number', '')
-        })
-
-    log_message(f"Создано {len(chunked_docs)} чанков")
-    return chunked_docs, chunk_info

-
-    chunk_info = []

-
         repo_id=repo_id,
-        filename=
-        local_dir=
         repo_type="dataset",
         token=hf_token
     )

-
-        except Exception as e:
-            log_message(f"Ошибка обработки {json_file}: {str(e)}")
-
-    log_message(f"Всего загружено {len(documents)} текстовых документов")
-
-    if documents:
-        chunked_docs, chunk_info = process_documents_with_chunking(documents)
-        return chunked_docs, chunk_info
-
-    return [], []

-
-def chunk_large_table(table_text, table_number, table_title, doc_id, max_tokens=1500):
-    chunks = []

-
-            chunk_data = data_lines[i:i+chunk_size]
-            chunk_text = f"Таблица {table_number} - {table_title}\n"
-            chunk_text += '\n'.join(header_lines) + '\n'
-            chunk_text += '\n'.join(chunk_data)
-            chunks.append(chunk_text)

-
-    log_message(f"Загрузка табличных данных из {table_data_dir}")

     documents = []

-
     )

-
-                for row in table_data['data']:
-                    if isinstance(row, list):
-                        table_text += " | ".join(str(cell) for cell in row) + "\n"
-                    elif isinstance(row, dict):
-                        table_text += " | ".join(f"{k}: {v}" for k, v in row.items()) + "\n"
-
-                chunks = chunk_large_table(table_text, table_number, table_title, doc_id)
-
-                for idx, chunk_text in enumerate(chunks):
-                    metadata = {
-                        'document_id': doc_id,
-                        'table_number': table_number,
-                        'table_title': table_title,
-                        'type': 'table',
-                        'chunk_index': idx,
-                        'section_id': f"table_{table_number}",
-                        'section_path': f"Таблица {table_number}"
-                    }
-
-                    doc = Document(text=chunk_text, metadata=metadata)
                     documents.append(doc)
-
-                log_message(f"Ошибка обработки таблицы {table_file}: {str(e)}")
-
-        log_message(f"Загружено {len(documents)} табличных документов")
-        return documents
-
-    except Exception as e:
-        log_message(f"Ошибка загрузки таблиц: {str(e)}")
-        return []

-
-            with open(file_path, 'r', encoding='utf-8') as f:
-                image_data = json.load(f)
-
-            doc_id = image_data.get('document_id', '')
-            image_number = image_data.get('image_number', '')
-            image_title = image_data.get('image_title', '')
-            image_description = image_data.get('image_description', '')
-
-            text = f"Рисунок {image_number} - {image_title}\n"
-            if image_description:
-                text += f"Описание: {image_description}"
-
-            metadata = {
-                'document_id': doc_id,
-                'image_number': image_number,
-                'image_title': image_title,
-                'type': 'image',
-                'section_id': f"image_{image_number}",
-                'section_path': f"Рисунок {image_number}"
-            }
-
-            doc = Document(text=text, metadata=metadata)
-            documents.append(doc)

-
-            if not text:
-                continue

-
-                'section_id': row.get('section_id', ''),
-                'section_path': row.get('section_path', ''),
-                'type': 'text'
-            }

-
New version (lines 1-493):

import json
import zipfile
import pandas as pd
from collections import Counter, defaultdict
from huggingface_hub import hf_hub_download, list_repo_files
from llama_index.core import Document
from llama_index.core.text_splitter import SentenceSplitter
from my_logging import log_message
from config import CHUNK_SIZE, CHUNK_OVERLAP


# ============================================================================
# TEXT CHUNKING - For regular text sections
# ============================================================================

def chunk_text_document(doc):
    """Split text document into semantic chunks"""
    splitter = SentenceSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        separator=" "
    )

    chunks = splitter.split_text(doc.text)
    log_message(f"  ✂️ Text split into {len(chunks)} chunks")

    chunked_docs = []
    for i, chunk_text in enumerate(chunks):
        chunk_metadata = doc.metadata.copy()
        chunk_metadata.update({
            "chunk_id": i,
            "total_chunks": len(chunks),
            "chunk_size": len(chunk_text)
        })

        chunked_docs.append(Document(text=chunk_text, metadata=chunk_metadata))

    return chunked_docs
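For illustration only (not part of this commit): a minimal sketch of calling chunk_text_document on a single section, assuming the imports and config constants above are in scope; the document and section ids are hypothetical.

section = Document(
    text="Настоящий раздел устанавливает требования ... " * 200,  # any text longer than CHUNK_SIZE
    metadata={"type": "text", "document_id": "DOC-001", "section_id": "4.1"},  # hypothetical ids
)

chunks = chunk_text_document(section)
# every chunk keeps the original metadata plus chunk_id / total_chunks / chunk_size
print(len(chunks), chunks[0].metadata["chunk_id"], chunks[0].metadata["total_chunks"])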

# ============================================================================
# TABLE CHUNKING - Row-based splitting with headers preserved
# ============================================================================

def chunk_table_document(doc):
    """Split large tables by rows while keeping headers in each chunk"""
    table_num = doc.metadata.get('table_number', 'unknown')
    table_title = doc.metadata.get('table_title', 'unknown')

    lines = doc.text.strip().split('\n')

    # Separate header info from data rows
    header_lines = []
    data_rows = []
    found_data = False

    for line in lines:
        if 'Данные таблицы:' in line:
            found_data = True
            header_lines.append(line)
        elif found_data and line.startswith('Строка'):
            data_rows.append(line)
        elif not found_data:
            header_lines.append(line)

    table_header = '\n'.join(header_lines) + '\n'

    if not data_rows:
        log_message(f"  ⚠️ Table {table_num}: no data rows found, using standard split")
        return chunk_text_document(doc)

    log_message(f"  📊 Table {table_num}: found {len(data_rows)} data rows")

    # Calculate space available for rows
    header_size = len(table_header)
    available_size = CHUNK_SIZE - header_size - 100  # Reserve 100 chars

    # Split rows into chunks
    chunks = []
    current_rows = []
    current_size = 0

    for row in data_rows:
        row_size = len(row) + 1  # +1 for newline

        if current_size + row_size > available_size and current_rows:
            # Save current chunk
            chunk_text = table_header + '\n'.join(current_rows)
            chunks.append(chunk_text)

            # Keep last 2 rows for overlap
            overlap_rows = min(2, len(current_rows))
            current_rows = current_rows[-overlap_rows:]
            current_size = sum(len(r) + 1 for r in current_rows)

        current_rows.append(row)
        current_size += row_size

    # Add final chunk
    if current_rows:
        chunk_text = table_header + '\n'.join(current_rows)
        chunks.append(chunk_text)

    log_message(f"  ✂️ Table split into {len(chunks)} chunks")

    # Create documents with metadata
    chunked_docs = []
    for i, chunk_text in enumerate(chunks):
        chunk_metadata = doc.metadata.copy()
        chunk_metadata.update({
            "chunk_id": i,
            "total_chunks": len(chunks),
            "chunk_size": len(chunk_text),
            "is_chunked": True
        })

        chunked_docs.append(Document(text=chunk_text, metadata=chunk_metadata))

    return chunked_docs


# ============================================================================
# TABLE DATA LOADING
# ============================================================================

def create_table_text(table_data):
    """Format table data as readable text"""
    doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
    table_num = table_data.get('table_number', 'Неизвестно')
    table_title = table_data.get('table_title', 'Неизвестно')
    section = table_data.get('section', 'Неизвестно')

    text = f"Таблица: {table_num}\n"
    text += f"Название: {table_title}\n"
    text += f"Документ: {doc_id}\n"
    text += f"Раздел: {section}\n"

    headers = table_data.get('headers', [])
    if headers:
        text += f"\nЗаголовки: {' | '.join(headers)}\n"

    if 'data' in table_data and table_data['data']:
        text += "\nДанные таблицы:\n"
        for row_idx, row in enumerate(table_data['data'], start=1):
            if isinstance(row, dict):
                row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
                text += f"Строка {row_idx}: {row_text}\n"

    return text


def load_tables_from_json(repo_id, hf_token, table_data_dir):
    """Load and process all tables from JSON files"""
    log_message("=" * 60)
    log_message("LOADING TABLE DATA")
    log_message("=" * 60)

    files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
    table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]

    log_message(f"Found {len(table_files)} JSON table files")

    table_documents = []
    stats = defaultdict(lambda: {'count': 0, 'total_size': 0, 'chunked': 0})

    for file_path in table_files:
        try:
            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=file_path,
                local_dir='',
                repo_type="dataset",
                token=hf_token
            )

            log_message(f"\n📄 Processing: {file_path}")

            with open(local_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            document_id = data.get('document', 'unknown')

            # Process each table/sheet
            sheets = data.get('sheets', [data]) if 'sheets' in data else [data]

            for sheet in sorted(sheets, key=lambda x: x.get('table_number', '')):
                # Skip empty tables
                if not sheet.get('data'):
                    log_message(f"  ⚠️ Skipping empty table {sheet.get('table_number')}")
                    continue

                # Create table text
                table_text = create_table_text(sheet)
                table_size = len(table_text)
                table_num = sheet.get('table_number', 'unknown')

                # Create base document
                doc = Document(
                    text=table_text,
                    metadata={
                        "type": "table",
                        "table_number": table_num,
                        "table_title": sheet.get('table_title', 'unknown'),
                        "document_id": document_id,
                        "section": sheet.get('section', 'unknown'),
                        "section_id": sheet.get('section', 'unknown'),
                        "total_rows": len(sheet.get('data', [])),
                        "content_size": table_size
                    }
                )

                # Chunk if necessary
                if table_size > CHUNK_SIZE:
                    log_message(f"  📊 Table {table_num}: {table_size} chars > {CHUNK_SIZE}, chunking...")
                    docs = chunk_table_document(doc)
                    stats[document_id]['chunked'] += 1
                else:
                    log_message(f"  ✓ Table {table_num}: {table_size} chars, keeping whole")
                    docs = [doc]

                table_documents.extend(docs)
                stats[document_id]['count'] += len(docs)
                stats[document_id]['total_size'] += table_size

        except Exception as e:
            log_message(f"  ❌ ERROR: {str(e)}")
            continue

    # Summary
    log_message("\n" + "=" * 60)
    log_message("TABLE STATISTICS")
    log_message("=" * 60)
    total_tables = sum(s['count'] for s in stats.values())
    total_chunked = sum(s['chunked'] for s in stats.values())
    log_message(f"Total table chunks: {total_tables}")
    log_message(f"Large tables chunked: {total_chunked}")

    for doc_id, doc_stats in sorted(stats.items()):
        log_message(f"  • {doc_id}: {doc_stats['count']} chunks, "
                    f"{doc_stats['chunked']} tables split")
    log_message("=" * 60)

    return table_documents


# ============================================================================
# TEXT SECTIONS LOADING
# ============================================================================

def extract_section_title(text):
    """Extract first line or sentence as title"""
    if not text.strip():
        return ""

    first_line = text.strip().split('\n')[0].strip()

    # If short and doesn't end with period, use as-is
    if len(first_line) < 200 and not first_line.endswith('.'):
        return first_line

    # Otherwise extract first sentence
    sentences = first_line.split('.')
    if len(sentences) > 1:
        return sentences[0].strip()

    return first_line[:100] + "..." if len(first_line) > 100 else first_line


def extract_sections_from_json(data, document_id, document_name):
    """Recursively extract all sections from JSON structure"""
    documents = []

    if 'sections' not in data:
        return documents

    for section in data['sections']:
        section_id = section.get('section_id', 'Unknown')
        section_text = section.get('section_text', '')

        if section_text.strip():
            doc = Document(
                text=section_text,
                metadata={
                    "type": "text",
                    "document_id": document_id,
                    "document_name": document_name,
                    "section_id": section_id,
                    "section_title": extract_section_title(section_text)[:200],
                    "level": "section"
                }
            )
            documents.append(doc)

        # Process subsections recursively
        for subsection in section.get('subsections', []):
            subsection_id = subsection.get('subsection_id', 'Unknown')
            subsection_text = subsection.get('subsection_text', '')

            if subsection_text.strip():
                doc = Document(
                    text=subsection_text,
                    metadata={
                        "type": "text",
                        "document_id": document_id,
                        "document_name": document_name,
                        "section_id": subsection_id,
                        "section_title": extract_section_title(subsection_text)[:200],
                        "level": "subsection",
                        "parent_section": section_id
                    }
                )
                documents.append(doc)

            # Process sub-subsections
            for sub_subsection in subsection.get('sub_subsections', []):
                sub_subsection_id = sub_subsection.get('sub_subsection_id', 'Unknown')
                sub_subsection_text = sub_subsection.get('sub_subsection_text', '')

                if sub_subsection_text.strip():
                    doc = Document(
                        text=sub_subsection_text,
                        metadata={
                            "type": "text",
                            "document_id": document_id,
                            "document_name": document_name,
                            "section_id": sub_subsection_id,
                            "section_title": extract_section_title(sub_subsection_text)[:200],
                            "level": "sub_subsection",
                            "parent_section": subsection_id
                        }
                    )
                    documents.append(doc)

    return documents
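For illustration only (not part of this commit): the nested JSON shape that extract_sections_from_json walks, and what it returns. The key names (document_metadata, sections, subsections, sub_subsections and their *_id / *_text fields) are the ones the functions above and below actually read; all field values here are hypothetical.

sample = {
    "document_metadata": {"document_id": "DOC-001", "document_name": "Example document"},  # read by load_json_documents below
    "sections": [
        {
            "section_id": "1",
            "section_text": "Общие положения ...",
            "subsections": [
                {
                    "subsection_id": "1.1",
                    "subsection_text": "Область применения ...",
                    "sub_subsections": [
                        {"sub_subsection_id": "1.1.1", "sub_subsection_text": "Термины и определения ..."}
                    ]
                }
            ]
        }
    ]
}

docs = extract_sections_from_json(sample, "DOC-001", "Example document")
# -> three Documents (section 1, subsection 1.1, sub-subsection 1.1.1), each carrying
#    type/level/section_id/section_title and, for nested levels, parent_section metadata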

def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
    """Load text sections from JSON files and ZIP archives"""
    log_message("=" * 60)
    log_message("LOADING TEXT DOCUMENTS")
    log_message("=" * 60)

    files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
    zip_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.zip')]
    json_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.json')]

    log_message(f"Found {len(zip_files)} ZIP files and {len(json_files)} JSON files")

    all_documents = []

    # Process ZIP files
    for zip_path in zip_files:
        try:
            log_message(f"\n📦 Processing ZIP: {zip_path}")
            local_zip = hf_hub_download(
                repo_id=repo_id,
                filename=zip_path,
                local_dir=download_dir,
                repo_type="dataset",
                token=hf_token
            )

            with zipfile.ZipFile(local_zip, 'r') as zip_ref:
                json_in_zip = [f for f in zip_ref.namelist()
                               if f.endswith('.json') and not f.startswith('__MACOSX')]

                for json_file in json_in_zip:
                    with zip_ref.open(json_file) as f:
                        data = json.load(f)

                    metadata = data.get('document_metadata', {})
                    doc_id = metadata.get('document_id', 'unknown')
                    doc_name = metadata.get('document_name', 'unknown')

                    docs = extract_sections_from_json(data, doc_id, doc_name)
                    all_documents.extend(docs)
                    log_message(f"  ✓ {json_file}: {len(docs)} sections")

        except Exception as e:
            log_message(f"  ❌ ERROR: {str(e)}")
            continue

    # Process direct JSON files
    for json_path in json_files:
        try:
            log_message(f"\n📄 Processing JSON: {json_path}")
            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=json_path,
                local_dir=download_dir,
                repo_type="dataset",
                token=hf_token
            )

            with open(local_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            metadata = data.get('document_metadata', {})
            doc_id = metadata.get('document_id', 'unknown')
            doc_name = metadata.get('document_name', 'unknown')

            docs = extract_sections_from_json(data, doc_id, doc_name)
            all_documents.extend(docs)
            log_message(f"  ✓ Extracted {len(docs)} sections")

        except Exception as e:
            log_message(f"  ❌ ERROR: {str(e)}")
            continue

    log_message(f"\n✓ Total text sections: {len(all_documents)}")

    # Apply chunking
    chunked_docs = []
    chunked_count = 0

    for doc in all_documents:
        if len(doc.text) > CHUNK_SIZE:
            log_message(f"  ✂️ Chunking section '{doc.metadata.get('section_id')}' "
                        f"({len(doc.text)} chars)")
            chunks = chunk_text_document(doc)
            chunked_docs.extend(chunks)
            chunked_count += 1
        else:
            chunked_docs.append(doc)

    log_message(f"\n✓ After chunking: {len(chunked_docs)} total chunks")
    log_message(f"✓ Sections chunked: {chunked_count}")
    log_message("=" * 60)

    return chunked_docs


# ============================================================================
# IMAGE DATA LOADING
# ============================================================================

def load_image_documents(repo_id, hf_token, image_data_dir):
    """Load image metadata from CSV files"""
    log_message("=" * 60)
    log_message("LOADING IMAGE METADATA")
    log_message("=" * 60)

    files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
    image_files = [f for f in files if f.startswith(image_data_dir) and f.endswith('.csv')]

    log_message(f"Found {len(image_files)} CSV image files")

    image_docs = []

    for csv_path in image_files:
        try:
            log_message(f"\n📷 Processing: {csv_path}")
            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=csv_path,
                local_dir='',
                repo_type="dataset",
                token=hf_token
            )

            df = pd.read_csv(local_path)
            log_message(f"  ✓ Loaded {len(df)} image records")

            for _, row in df.iterrows():
                text = f"Изображение: {row.get('№ Изображения', 'Неизвестно')}\n"
                text += f"Название: {row.get('Название изображения', 'Неизвестно')}\n"
                text += f"Описание: {row.get('Описание изображение', 'Неизвестно')}\n"
                text += f"Документ: {row.get('Обозначение документа', 'Неизвестно')}\n"
                text += f"Раздел: {row.get('Раздел документа', 'Неизвестно')}\n"
                text += f"Файл: {row.get('Файл изображения', 'Неизвестно')}\n"

                doc = Document(
                    text=text,
                    metadata={
                        "type": "image",
                        "image_number": str(row.get('№ Изображения', 'unknown')),
                        "image_title": str(row.get('Название изображения', 'unknown')),
                        "image_description": str(row.get('Описание изображение', 'unknown')),
                        "document_id": str(row.get('Обозначение документа', 'unknown')),
                        "file_path": str(row.get('Файл изображения', 'unknown')),
                        "section": str(row.get('Раздел документа', 'Неизвестно'))
                    }
                )
                image_docs.append(doc)

        except Exception as e:
            log_message(f"  ❌ ERROR: {str(e)}")
            continue

    log_message(f"\n✓ Total image documents: {len(image_docs)}")
    log_message("=" * 60)

    return image_docs
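For illustration only (not part of this commit): one way the three loaders defined above might be combined by calling code. The repository id, directory names, and token handling are placeholders, not values taken from this commit.

import os

repo_id = "user/dataset"                # hypothetical dataset repo
hf_token = os.environ.get("HF_TOKEN")   # hypothetical token source

text_docs = load_json_documents(repo_id, hf_token, "json_files", "downloads")
table_docs = load_tables_from_json(repo_id, hf_token, "table_data")
image_docs = load_image_documents(repo_id, hf_token, "image_data")

all_docs = text_docs + table_docs + image_docs
log_message(f"Prepared {len(all_docs)} documents for indexing")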