Commit f85ad1c
Parent: 822ef8c
new way of chunking

Files changed:
- config.py (+1 −1)
- documents_prep.py (+127 −68)
- table_prep.py (+106 −60)
config.py
CHANGED
@@ -50,7 +50,7 @@ AVAILABLE_MODELS = {
 
 DEFAULT_MODEL = "Gemini 2.5 Flash"
 
-CHUNK_SIZE =
+CHUNK_SIZE = 2000
 CHUNK_OVERLAP = 256
 
 CUSTOM_PROMPT = """
documents_prep.py
CHANGED
@@ -14,147 +14,206 @@ def chunk_document(doc, chunk_size=None, chunk_overlap=None):
         chunk_size = CHUNK_SIZE
     if chunk_overlap is None:
         chunk_overlap = CHUNK_OVERLAP
-    text_splitter = SentenceSplitter(
-        chunk_size=chunk_size,
-        chunk_overlap=chunk_overlap,
-        separator=" "
-    )
 
+    text = doc.text
+
+    # Try to split by double newlines (paragraphs) first
+    paragraphs = text.split('\n\n')
+
+    chunks = []
+    current_chunk = ""
+
+    for para in paragraphs:
+        para = para.strip()
+        if not para:
+            continue
+
+        # If adding this paragraph exceeds limit, save current chunk
+        if len(current_chunk) + len(para) + 2 > chunk_size and current_chunk:
+            chunks.append(current_chunk.strip())
+            # Add overlap from end of previous chunk
+            overlap_text = current_chunk[-chunk_overlap:] if len(current_chunk) > chunk_overlap else current_chunk
+            current_chunk = overlap_text + "\n\n" + para
+        else:
+            if current_chunk:
+                current_chunk += "\n\n" + para
+            else:
+                current_chunk = para
+
+    # Add last chunk
+    if current_chunk:
+        chunks.append(current_chunk.strip())
+
+    # If a single paragraph is too large, fall back to sentence splitting
+    final_chunks = []
+    for chunk_text in chunks:
+        if len(chunk_text) > chunk_size:
+            splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+            final_chunks.extend(splitter.split_text(chunk_text))
+        else:
+            final_chunks.append(chunk_text)
+
+    log_message(f"   ✂️ Текст разбит на {len(final_chunks)} семантических чанков")
+
+    # Create documents
     chunked_docs = []
-    for i, chunk_text in enumerate(
+    for i, chunk_text in enumerate(final_chunks):
         chunk_metadata = doc.metadata.copy()
         chunk_metadata.update({
             "chunk_id": i,
-            "total_chunks": len(
+            "total_chunks": len(final_chunks),
             "chunk_size": len(chunk_text),
-            "
+            "is_chunked": True
         })
-
-        chunked_doc = Document(
-            text=chunk_text,
-            metadata=chunk_metadata
-        )
-        chunked_docs.append(chunked_doc)
+        chunked_docs.append(Document(text=chunk_text, metadata=chunk_metadata))
 
     return chunked_docs
 
 def process_documents_with_chunking(documents):
+    log_message("\n" + "="*60)
+    log_message("🔄 НАЧАЛО ПРОЦЕССА ЧАНКИНГА")
+    log_message("="*60)
+
     all_chunked_docs = []
     chunk_info = []
-    table_count = 0
-    table_chunks_count = 0
-    image_count = 0
-    image_chunks_count = 0
-    text_chunks_count = 0
 
-    for doc in documents:
+    # Counters
+    table_whole_count = 0    # Whole tables (no chunking needed)
+    table_chunked_count = 0  # Tables that were ALREADY chunked
+    image_whole_count = 0    # Whole images
+    image_chunked_count = 0  # Images split into chunks
+    text_whole_count = 0     # Whole text documents
+    text_chunked_count = 0   # Text documents split into chunks
+
+    for idx, doc in enumerate(documents):
         doc_type = doc.metadata.get('type', 'text')
         is_already_chunked = doc.metadata.get('is_chunked', False)
+        doc_size = len(doc.text)
+
+        log_message(f"\n📄 Документ {idx+1}/{len(documents)} | "
+                    f"Тип: {doc_type} | "
+                    f"Размер: {doc_size} | "
+                    f"Уже разбит: {is_already_chunked}")
 
         if doc_type == 'table':
             if is_already_chunked:
+                # Table already chunked in table_prep.py
+                table_chunked_count += 1
                 all_chunked_docs.append(doc)
-                chunk_info.append({
-                    'document_id': doc.metadata.get('document_id', 'unknown'),
-                    'section_id': doc.metadata.get('section_id', 'unknown'),
-                    'chunk_id': doc.metadata.get('chunk_id', 0),
-                    'total_chunks': doc.metadata.get('total_chunks', 1),
-                    'chunk_size': len(doc.text),
-                    'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
-                    'type': 'table',
-                    'table_number': doc.metadata.get('table_number', 'unknown')
-                })
+                log_message(f"   ✓ Таблица (чанк {doc.metadata.get('chunk_id', 0) + 1}/"
+                            f"{doc.metadata.get('total_chunks', 1)}) добавлена без изменений")
             else:
+                # Whole table
+                table_whole_count += 1
                 all_chunked_docs.append(doc)
+                log_message(f"   ✓ Целая таблица добавлена | "
+                            f"Номер: {doc.metadata.get('table_number', 'unknown')}")
+
+            chunk_info.append({
+                'document_id': doc.metadata.get('document_id', 'unknown'),
+                'section_id': doc.metadata.get('section_id', 'unknown'),
+                'chunk_id': doc.metadata.get('chunk_id', 0),
+                'total_chunks': doc.metadata.get('total_chunks', 1),
+                'chunk_size': doc_size,
+                'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
+                'type': 'table',
+                'table_number': doc.metadata.get('table_number', 'unknown'),
+                'is_chunked': is_already_chunked
+            })
 
         elif doc_type == 'image':
-            image_count += 1
-            doc_size = len(doc.text)
             if doc_size > CHUNK_SIZE:
-                log_message(f"📷
-                            f"Размер: {doc_size} > {CHUNK_SIZE}")
+                log_message(f"   📷 Изображение требует чанкинга | Размер: {doc_size} > {CHUNK_SIZE}")
                 chunked_docs = chunk_document(doc)
+                image_chunked_count += len(chunked_docs)
                 all_chunked_docs.extend(chunked_docs)
-                log_message(f"   ✂️ Разделено на {len(chunked_docs)} чанков")
 
-                for
+                for chunk_doc in chunked_docs:
                     chunk_info.append({
                         'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
                         'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
-                        'chunk_id':
+                        'chunk_id': chunk_doc.metadata.get('chunk_id', 0),
+                        'total_chunks': chunk_doc.metadata.get('total_chunks', 1),
                         'chunk_size': len(chunk_doc.text),
                        'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
                         'type': 'image',
-                        'image_number': chunk_doc.metadata.get('image_number', 'unknown')
+                        'image_number': chunk_doc.metadata.get('image_number', 'unknown'),
+                        'is_chunked': True
                     })
             else:
+                image_whole_count += 1
                 all_chunked_docs.append(doc)
+                log_message(f"   ✓ Целое изображение добавлено | Размер: {doc_size}")
+
                 chunk_info.append({
                     'document_id': doc.metadata.get('document_id', 'unknown'),
                     'section_id': doc.metadata.get('section_id', 'unknown'),
                     'chunk_id': 0,
+                    'total_chunks': 1,
                     'chunk_size': doc_size,
                     'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
                     'type': 'image',
-                    'image_number': doc.metadata.get('image_number', 'unknown')
+                    'image_number': doc.metadata.get('image_number', 'unknown'),
+                    'is_chunked': False
                 })
 
-        else:
-            doc_size = len(doc.text)
+        else:  # text
            if doc_size > CHUNK_SIZE:
-                log_message(f"📝
+                log_message(f"   📝 Текст требует чанкинга | "
+                            f"Документ: {doc.metadata.get('document_id', 'unknown')} | "
+                            f"Раздел: {doc.metadata.get('section_id', 'unknown')} | "
                             f"Размер: {doc_size} > {CHUNK_SIZE}")
+
                 chunked_docs = chunk_document(doc)
+                text_chunked_count += len(chunked_docs)
                 all_chunked_docs.extend(chunked_docs)
-                log_message(f"   ✂️ Разделен на {len(chunked_docs)} чанков")
 
-                for
+                for chunk_doc in chunked_docs:
                     chunk_info.append({
                         'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
                         'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
-                        'chunk_id':
+                        'chunk_id': chunk_doc.metadata.get('chunk_id', 0),
+                        'total_chunks': chunk_doc.metadata.get('total_chunks', 1),
                         'chunk_size': len(chunk_doc.text),
                         'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
-                        'type': 'text'
+                        'type': 'text',
+                        'is_chunked': True
                     })
            else:
+                text_whole_count += 1
                 all_chunked_docs.append(doc)
+                log_message(f"   ✓ Целый текстовый документ добавлен | Размер: {doc_size}")
+
                 chunk_info.append({
                     'document_id': doc.metadata.get('document_id', 'unknown'),
                     'section_id': doc.metadata.get('section_id', 'unknown'),
                     'chunk_id': 0,
+                    'total_chunks': 1,
                     'chunk_size': doc_size,
                     'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
-                    'type': 'text'
+                    'type': 'text',
+                    'is_chunked': False
                 })
 
     log_message(f"\n{'='*60}")
-    log_message(f"
-    log_message(f"
-    log_message(f"
-    log_message(f"
-    log_message(f"
-    log_message(f"
-    log_message(f"
+    log_message(f"📊 ИТОГОВАЯ СТАТИСТИКА ЧАНКИНГА:")
+    log_message(f"{'='*60}")
+    log_message(f"  ТАБЛИЦЫ:")
+    log_message(f"    • Целые (не нуждались в чанкинге): {table_whole_count}")
+    log_message(f"    • Чанки (разбиты в table_prep.py): {table_chunked_count}")
+    log_message(f"  ИЗОБРАЖЕНИЯ:")
+    log_message(f"    • Целые: {image_whole_count}")
+    log_message(f"    • Чанки: {image_chunked_count}")
+    log_message(f"  ТЕКСТ:")
+    log_message(f"    • Целые документы: {text_whole_count}")
+    log_message(f"    • Чанки: {text_chunked_count}")
+    log_message(f"  {'─'*58}")
+    log_message(f"  ВСЕГО ДОКУМЕНТОВ В ИНДЕКСЕ: {len(all_chunked_docs)}")
     log_message(f"{'='*60}\n")
 
     return all_chunked_docs, chunk_info
 
+
 def extract_text_from_json(data, document_id, document_name):
     documents = []
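For readers skimming the diff, the core of the new chunk_document strategy is a greedy paragraph merge with a tail overlap. Below is a minimal standalone sketch of just that loop: it mirrors the logic in the diff but drops llama_index, log_message, and Document metadata, and the sample text and sizes are invented for illustration.

def merge_paragraphs(text, chunk_size=200, chunk_overlap=50):
    """Greedy paragraph merge with tail overlap, mirroring the diff's merge loop."""
    chunks, current = [], ""
    for para in (p.strip() for p in text.split("\n\n")):
        if not para:
            continue
        # +2 accounts for the "\n\n" separator that would join the parts
        if current and len(current) + len(para) + 2 > chunk_size:
            chunks.append(current.strip())
            # Seed the next chunk with the tail of the previous one
            tail = current[-chunk_overlap:] if len(current) > chunk_overlap else current
            current = tail + "\n\n" + para
        else:
            current = current + "\n\n" + para if current else para
    if current:
        chunks.append(current.strip())
    return chunks

sample = "\n\n".join(f"Paragraph {i}. " + "x" * 80 for i in range(5))
for i, c in enumerate(merge_paragraphs(sample)):
    print(i, len(c))

One consequence of this design: consecutive chunks share up to chunk_overlap characters, so total output length slightly exceeds the input, and any single paragraph still longer than chunk_size is handed to SentenceSplitter as the fallback.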
table_prep.py
CHANGED
@@ -32,39 +32,80 @@ def create_table_content(table_data):
 from llama_index.core.text_splitter import SentenceSplitter
 from config import CHUNK_SIZE, CHUNK_OVERLAP
 
-def
+def create_table_chunks_with_headers(table_data, rows_per_chunk=10):
+    """
+    Intelligently chunk tables by preserving headers and grouping rows
+    """
+    doc_id = table_data.get('document_id') or table_data.get('document', 'Неизвестно')
+    table_num = table_data.get('table_number', 'Неизвестно')
+    table_title = table_data.get('table_title', 'Неизвестно')
+    section = table_data.get('section', 'Неизвестно')
+    headers = table_data.get('headers', [])
+    table_rows = table_data.get('data', [])
+
+    if not table_rows:
+        return []
+
+    # Create header string that will be included in EVERY chunk
+    header_context = f"Таблица {table_num}: {table_title}\n"
+    header_context += f"Документ: {doc_id}\n"
+    header_context += f"Раздел: {section}\n"
+    if headers:
+        header_context += f"Заголовки: {' | '.join(headers)}\n"
+    header_context += f"Всего строк в таблице: {len(table_rows)}\n\n"
+
+    # Calculate optimal rows per chunk based on content size
+    avg_row_size = sum(len(str(row)) for row in table_rows[:5]) / min(5, len(table_rows))
+    max_chunk_size = CHUNK_SIZE - len(header_context) - 500  # Safety margin
+    optimal_rows = max(5, int(max_chunk_size / avg_row_size))
+
+    log_message(f"   📐 Средний размер строки: {avg_row_size:.0f} символов")
+    log_message(f"   📊 Оптимальное кол-во строк на чанк: {optimal_rows}")
+
+    chunks = []
+    total_rows = len(table_rows)
+
+    for i in range(0, total_rows, optimal_rows):
+        chunk_rows = table_rows[i:i + optimal_rows]
+
+        # Build chunk content
+        chunk_content = header_context
+        chunk_content += f"[Строки {i+1}-{min(i+optimal_rows, total_rows)} из {total_rows}]\n"
+        chunk_content += "Данные:\n"
+
+        for row_idx, row in enumerate(chunk_rows, start=i+1):
+            if isinstance(row, dict):
+                row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
+                chunk_content += f"Строка {row_idx}: {row_text}\n"
+
+        chunk_metadata = {
+            "type": "table",
+            "table_number": table_num,
+            "table_title": table_title,
+            "document_id": doc_id,
+            "section": section,
+            "section_id": section,
+            "headers": headers,
+            "chunk_id": i // optimal_rows,
+            "total_chunks": (total_rows + optimal_rows - 1) // optimal_rows,
+            "row_range": f"{i+1}-{min(i+optimal_rows, total_rows)}",
+            "total_table_rows": total_rows,
             "is_chunked": True
-        }
+        }
 
-    )
-
-    return
+        doc = Document(text=chunk_content, metadata=chunk_metadata)
+        chunks.append(doc)
+
+        log_message(f"   Чанк {len(chunks)}: строки {i+1}-{min(i+optimal_rows, total_rows)} | "
+                    f"{len(chunk_content)} символов")
+
+    return chunks
+
 
 def table_to_document(table_data, document_id=None):
+    """
+    Convert table to Document(s) with intelligent chunking
+    """
     if not isinstance(table_data, dict):
         log_message(f"⚠️ ПРОПУЩЕНА: table_data не является словарем")
         return []
@@ -75,41 +116,46 @@ def table_to_document(table_data, document_id=None):
     section = table_data.get('section', 'Неизвестно')
 
     table_rows = table_data.get('data', [])
-    if not table_rows
-        log_message(f"⚠️ ПРОПУЩЕНА: Таблица {table_num} из '{doc_id}' - нет данных
+    if not table_rows:
+        log_message(f"⚠️ ПРОПУЩЕНА: Таблица {table_num} из '{doc_id}' - нет данных")
         return []
 
+    log_message(f"\n📊 Обработка таблицы {table_num} из документа '{doc_id}'")
+    log_message(f"   Название: {table_title}")
+    log_message(f"   Раздел: {section}")
+    log_message(f"   Строк данных: {len(table_rows)}")
+
+    # Estimate if table needs chunking
+    sample_content = create_table_content(table_data)
+    estimated_size = len(sample_content)
+
+    log_message(f"   Оценочный размер: {estimated_size} символов")
+
+    # Threshold: if table is small enough, keep it whole
+    if estimated_size <= CHUNK_SIZE * 0.8:  # 80% of limit for safety
+        log_message(f"   ✅ Таблица достаточно мала, хранится целиком")
+        doc = Document(
+            text=sample_content,
+            metadata={
+                "type": "table",
+                "table_number": table_num,
+                "table_title": table_title,
+                "document_id": doc_id,
+                "section": section,
+                "section_id": section,
+                "headers": table_data.get('headers', []),
+                "total_rows": len(table_rows),
+                "content_size": estimated_size,
+                "is_chunked": False
+            }
+        )
+        return [doc]
     else:
-        log_message(f"
+        log_message(f"   ⚠️ Таблица слишком большая ({estimated_size} > {CHUNK_SIZE})")
+        log_message(f"   🔄 Применяется умный чанкинг с сохранением заголовков...")
+        chunks = create_table_chunks_with_headers(table_data)
+        log_message(f"   ✅ Таблица разбита на {len(chunks)} чанков с сохранением структуры")
+        return chunks
 
 def load_table_data(repo_id, hf_token, table_data_dir):
     log_message("=" * 60)
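For reference, a hypothetical usage sketch of the new table path. The key names in table_data are the ones the diff reads (document_id, table_number, table_title, section, headers, and data as a list of row dicts); the sample values, the row count, and the import line are invented assumptions about the surrounding project.

# Hypothetical input; key names come from the diff, values are made up.
table_data = {
    "document_id": "doc-001",
    "table_number": 3,
    "table_title": "Допустимые отклонения",
    "section": "5.2",
    "headers": ["Параметр", "Значение"],
    "data": [{"Параметр": f"param_{i}", "Значение": str(i)} for i in range(300)],
}

from table_prep import table_to_document  # assumes the module and its deps are importable

docs = table_to_document(table_data)
# A small table comes back as one Document with is_chunked=False; a large one
# (like this ~300-row sample, which should exceed CHUNK_SIZE) as several Documents
# whose text repeats the header context and whose metadata carries chunk_id,
# total_chunks, and row_range.
for d in docs:
    print(d.metadata.get("chunk_id"), d.metadata.get("row_range"), len(d.text))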