Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 2, 2025

Commit

822ef8c

1 Parent(s): c0bcb11

Added the new chunking and logging

Browse files

Files changed (2) hide show

documents_prep.py +92 -47
table_prep.py +61 -15

documents_prep.py CHANGED Viewed

@@ -44,68 +44,113 @@ def process_documents_with_chunking(documents):
     all_chunked_docs = []
     chunk_info = []
     table_count = 0
     image_count = 0
     text_chunks_count = 0
     for doc in documents:
         doc_type = doc.metadata.get('type', 'text')
-        doc_size = len(doc.text)
-        # Apply chunking to ALL documents if they exceed CHUNK_SIZE
-        if doc_size > CHUNK_SIZE:
-            chunked_docs = chunk_document(doc)
-            all_chunked_docs.extend(chunked_docs)
-            if doc_type == 'table':
-                table_count += len(chunked_docs)
-            elif doc_type == 'image':
-                image_count += len(chunked_docs)
             else:
-                text_chunks_count += len(chunked_docs)
-            for i, chunk_doc in enumerate(chunked_docs):
                 chunk_info.append({
-                    'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
-                    'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
-                    'chunk_id': i,
-                    'chunk_size': len(chunk_doc.text),
-                    'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
-                    'type': doc_type,
-                    'table_number': chunk_doc.metadata.get('table_number', 'unknown') if doc_type == 'table' else None,
-                    'table_title': chunk_doc.metadata.get('table_title', '') if doc_type == 'table' else None,
-                    'image_number': chunk_doc.metadata.get('image_number', 'unknown') if doc_type == 'image' else None,
-                    'image_title': chunk_doc.metadata.get('image_title', '') if doc_type == 'image' else None
                 })
-        else:
-            # Document is small enough, add as-is
-            all_chunked_docs.append(doc)
-            if doc_type == 'table':
-                table_count += 1
-            elif doc_type == 'image':
-                image_count += 1
             else:
-                text_chunks_count += 1
-            chunk_info.append({
-                'document_id': doc.metadata.get('document_id', 'unknown'),
-                'section_id': doc.metadata.get('section_id', 'unknown'),
-                'chunk_id': 0,
-                'chunk_size': doc_size,
-                'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
-                'type': doc_type,
-                'table_number': doc.metadata.get('table_number', 'unknown') if doc_type == 'table' else None,
-                'table_title': doc.metadata.get('table_title', '') if doc_type == 'table' else None,
-                'image_number': doc.metadata.get('image_number', 'unknown') if doc_type == 'image' else None,
-                'image_title': doc.metadata.get('image_title', '') if doc_type == 'image' else None
-            })
     log_message(f"\n{'='*60}")
     log_message(f"ИТОГО ОБРАБОТАНО ДОКУМЕНТОВ:")
-    log_message(f"  • Таблицы (чанки): {table_count}")
-    log_message(f"  • Изображения (чанки): {image_count}")
     log_message(f"  • Текстовые чанки: {text_chunks_count}")
-    log_message(f"  • Всего чанков: {len(all_chunked_docs)}")
     log_message(f"{'='*60}\n")
     return all_chunked_docs, chunk_info

     all_chunked_docs = []
     chunk_info = []
     table_count = 0
+    table_chunks_count = 0
     image_count = 0
+    image_chunks_count = 0
     text_chunks_count = 0
     for doc in documents:
         doc_type = doc.metadata.get('type', 'text')
+        is_already_chunked = doc.metadata.get('is_chunked', False)
+        if doc_type == 'table':
+            if is_already_chunked:
+                table_chunks_count += 1
+                all_chunked_docs.append(doc)
+                chunk_info.append({
+                    'document_id': doc.metadata.get('document_id', 'unknown'),
+                    'section_id': doc.metadata.get('section_id', 'unknown'),
+                    'chunk_id': doc.metadata.get('chunk_id', 0),
+                    'total_chunks': doc.metadata.get('total_chunks', 1),
+                    'chunk_size': len(doc.text),
+                    'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
+                    'type': 'table',
+                    'table_number': doc.metadata.get('table_number', 'unknown')
+                })
             else:
+                table_count += 1
+                all_chunked_docs.append(doc)
                 chunk_info.append({
+                    'document_id': doc.metadata.get('document_id', 'unknown'),
+                    'section_id': doc.metadata.get('section_id', 'unknown'),
+                    'chunk_id': 0,
+                    'chunk_size': len(doc.text),
+                    'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
+                    'type': 'table',
+                    'table_number': doc.metadata.get('table_number', 'unknown')
                 })
+        elif doc_type == 'image':
+            image_count += 1
+            doc_size = len(doc.text)
+            if doc_size > CHUNK_SIZE:
+                log_message(f"📷 CHUNKING: Изображение {doc.metadata.get('image_number', 'unknown')} | "
+                           f"Размер: {doc_size} > {CHUNK_SIZE}")
+                chunked_docs = chunk_document(doc)
+                image_chunks_count += len(chunked_docs)
+                all_chunked_docs.extend(chunked_docs)
+                log_message(f"  ✂️ Разделено на {len(chunked_docs)} чанков")
+                for i, chunk_doc in enumerate(chunked_docs):
+                    chunk_info.append({
+                        'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
+                        'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
+                        'chunk_id': i,
+                        'chunk_size': len(chunk_doc.text),
+                        'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
+                        'type': 'image',
+                        'image_number': chunk_doc.metadata.get('image_number', 'unknown')
+                    })
             else:
+                all_chunked_docs.append(doc)
+                chunk_info.append({
+                    'document_id': doc.metadata.get('document_id', 'unknown'),
+                    'section_id': doc.metadata.get('section_id', 'unknown'),
+                    'chunk_id': 0,
+                    'chunk_size': doc_size,
+                    'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
+                    'type': 'image',
+                    'image_number': doc.metadata.get('image_number', 'unknown')
+                })
+        else:
+            doc_size = len(doc.text)
+            if doc_size > CHUNK_SIZE:
+                log_message(f"📝 CHUNKING: Текст из '{doc.metadata.get('document_id', 'unknown')}' | "
+                           f"Размер: {doc_size} > {CHUNK_SIZE}")
+                chunked_docs = chunk_document(doc)
+                text_chunks_count += len(chunked_docs)
+                all_chunked_docs.extend(chunked_docs)
+                log_message(f"  ✂️ Разделен на {len(chunked_docs)} чанков")
+                for i, chunk_doc in enumerate(chunked_docs):
+                    chunk_info.append({
+                        'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
+                        'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
+                        'chunk_id': i,
+                        'chunk_size': len(chunk_doc.text),
+                        'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
+                        'type': 'text'
+                    })
+            else:
+                all_chunked_docs.append(doc)
+                chunk_info.append({
+                    'document_id': doc.metadata.get('document_id', 'unknown'),
+                    'section_id': doc.metadata.get('section_id', 'unknown'),
+                    'chunk_id': 0,
+                    'chunk_size': doc_size,
+                    'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
+                    'type': 'text'
+                })
     log_message(f"\n{'='*60}")
     log_message(f"ИТОГО ОБРАБОТАНО ДОКУМЕНТОВ:")
+    log_message(f"  • Таблицы (целые): {table_count}")
+    log_message(f"  • Таблицы (чанки): {table_chunks_count}")
+    log_message(f"  • Изображения (целые): {image_count - (image_chunks_count > 0)}")
+    log_message(f"  • Изображения (чанки): {image_chunks_count}")
     log_message(f"  • Текстовые чанки: {text_chunks_count}")
+    log_message(f"  • Всего документов: {len(all_chunked_docs)}")
     log_message(f"{'='*60}\n")
     return all_chunked_docs, chunk_info

table_prep.py CHANGED Viewed

@@ -29,26 +29,61 @@ def create_table_content(table_data):
     return content
 def table_to_document(table_data, document_id=None):
-    """Convert table data to a single Document with rich metadata"""
     if not isinstance(table_data, dict):
         return []
-    doc_id = document_id or table_data.get('document_id', table_data.get('document', 'Неизвестно'))
     table_num = table_data.get('table_number', 'Неизвестно')
     table_title = table_data.get('table_title', 'Неизвестно')
     section = table_data.get('section', 'Неизвестно')
     content = create_table_content(table_data)
     content_size = len(content)
-    # Log table addition
-    row_count = len(table_data.get('data', [])) if 'data' in table_data else 0
-    log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
-                f"Размер: {content_size} символов | Строк: {row_count}")
-    # Store all table metadata including headers for preservation during chunking
-    return [Document(
         text=content,
         metadata={
             "type": "table",
@@ -57,14 +92,25 @@ def table_to_document(table_data, document_id=None):
             "document_id": doc_id,
             "section": section,
             "section_id": section,
-            "section_path": section,  # Add for consistency with text chunks
             "total_rows": row_count,
-            "content_size": content_size,
-            "headers": table_data.get('headers', []),  # Preserve headers
-            "original_table_data": True  # Mark as original table
         }
-    )]
 def load_table_data(repo_id, hf_token, table_data_dir):
     log_message("=" * 60)
     log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")

     return content
+from llama_index.core.text_splitter import SentenceSplitter
+from config import CHUNK_SIZE, CHUNK_OVERLAP
def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
    """Split a table document into smaller documents that share its metadata.

    Args:
        doc: Object exposing ``text`` and a ``metadata`` mapping.
        chunk_size: Chunk size handed to ``SentenceSplitter``; falls back to
            ``CHUNK_SIZE`` when ``None``.
        chunk_overlap: Overlap handed to ``SentenceSplitter``; falls back to
            ``CHUNK_OVERLAP`` when ``None``.

    Returns:
        A non-empty list of ``Document`` objects. Each one carries a copy of
        ``doc.metadata`` extended with ``chunk_id``, ``total_chunks``,
        ``chunk_size`` (character length of the chunk text) and
        ``is_chunked=True``.
    """
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    if chunk_overlap is None:
        chunk_overlap = CHUNK_OVERLAP

    # NOTE(review): the splitter's chunk_size is presumably measured in
    # tokens, whereas the "chunk_size" metadata below is in characters --
    # confirm against the SentenceSplitter documentation.
    text_splitter = SentenceSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separator="\n"
    )
    text_chunks = text_splitter.split_text(doc.text)
    if not text_chunks:
        # The splitter produced nothing; keep the table as a single chunk
        # instead of silently dropping it from the result.
        text_chunks = [doc.text]

    # Loop-invariant, so compute it once rather than per chunk.
    total_chunks = len(text_chunks)
    return [
        Document(
            text=chunk_text,
            metadata={
                **doc.metadata,
                "chunk_id": i,
                "total_chunks": total_chunks,
                "chunk_size": len(chunk_text),
                "is_chunked": True,
            },
        )
        for i, chunk_text in enumerate(text_chunks)
    ]
 def table_to_document(table_data, document_id=None):
     if not isinstance(table_data, dict):
+        log_message(f"⚠️ ПРОПУЩЕНА: table_data не является словарем")
         return []
+    doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
     table_num = table_data.get('table_number', 'Неизвестно')
     table_title = table_data.get('table_title', 'Неизвестно')
     section = table_data.get('section', 'Неизвестно')
+    table_rows = table_data.get('data', [])
+    if not table_rows or len(table_rows) == 0:
+        log_message(f"⚠️ ПРОПУЩЕНА: Таблица {table_num} из '{doc_id}' - нет данных в 'data'")
+        return []
     content = create_table_content(table_data)
     content_size = len(content)
+    row_count = len(table_rows)
+    base_doc = Document(
         text=content,
         metadata={
             "type": "table",
             "document_id": doc_id,
             "section": section,
             "section_id": section,
             "total_rows": row_count,
+            "content_size": content_size
         }
+    )
+    if content_size > CHUNK_SIZE:
+        log_message(f"📊 CHUNKING: Таблица {table_num} из '{doc_id}' | "
+                   f"Размер: {content_size} > {CHUNK_SIZE} | Строк: {row_count}")
+        chunked_docs = chunk_table_document(base_doc)
+        log_message(f"  ✂️ Разделена на {len(chunked_docs)} чанков")
+        for i, chunk_doc in enumerate(chunked_docs):
+            log_message(f"    Чанк {i+1}: {chunk_doc.metadata['chunk_size']} символов")
+        return chunked_docs
+    else:
+        log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
+                   f"Размер: {content_size} символов | Строк: {row_count}")
+        return [base_doc]
 def load_table_data(repo_id, hf_token, table_data_dir):
     log_message("=" * 60)
     log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")